linux/kernel/sched/fair.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   4 *
   5 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   6 *
   7 *  Interactivity improvements by Mike Galbraith
   8 *  (C) 2007 Mike Galbraith <efault@gmx.de>
   9 *
  10 *  Various enhancements by Dmitry Adamushko.
  11 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  12 *
  13 *  Group scheduling enhancements by Srivatsa Vaddagiri
  14 *  Copyright IBM Corporation, 2007
  15 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  16 *
  17 *  Scaled math optimizations by Thomas Gleixner
  18 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  19 *
  20 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  21 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  22 */
  23#include "sched.h"
  24
  25/*
  26 * Targeted preemption latency for CPU-bound tasks:
  27 *
  28 * NOTE: this latency value is not the same as the concept of
  29 * 'timeslice length' - timeslices in CFS are of variable length
  30 * and have no persistent notion like in traditional, time-slice
  31 * based scheduling concepts.
  32 *
  33 * (to see the precise effective timeslice length of your workload,
  34 *  run vmstat and monitor the context-switches (cs) field)
  35 *
  36 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  37 */
  38unsigned int sysctl_sched_latency                       = 6000000ULL;
  39static unsigned int normalized_sysctl_sched_latency     = 6000000ULL;
  40
  41/*
  42 * The initial- and re-scaling of tunables is configurable
  43 *
  44 * Options are:
  45 *
  46 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
   47 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
   48 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
   49 *
   50 * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
  51 */
  52unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
  53
  54/*
  55 * Minimal preemption granularity for CPU-bound tasks:
  56 *
  57 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  58 */
  59unsigned int sysctl_sched_min_granularity                       = 750000ULL;
  60static unsigned int normalized_sysctl_sched_min_granularity     = 750000ULL;
  61
  62/*
  63 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  64 */
  65static unsigned int sched_nr_latency = 8;
  66
  67/*
   68 * If set, the child runs first after a fork. If set to 0 (default) then
   69 * the parent will (try to) run first.
  70 */
  71unsigned int sysctl_sched_child_runs_first __read_mostly;
  72
  73/*
  74 * SCHED_OTHER wake-up granularity.
  75 *
  76 * This option delays the preemption effects of decoupled workloads
  77 * and reduces their over-scheduling. Synchronous workloads will still
  78 * have immediate wakeup/sleep latencies.
  79 *
  80 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  81 */
  82unsigned int sysctl_sched_wakeup_granularity                    = 1000000UL;
  83static unsigned int normalized_sysctl_sched_wakeup_granularity  = 1000000UL;
  84
  85const_debug unsigned int sysctl_sched_migration_cost    = 500000UL;
  86
  87int sched_thermal_decay_shift;
  88static int __init setup_sched_thermal_decay_shift(char *str)
  89{
  90        int _shift = 0;
  91
  92        if (kstrtoint(str, 0, &_shift))
  93                pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
  94
  95        sched_thermal_decay_shift = clamp(_shift, 0, 10);
  96        return 1;
  97}
  98__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
  99
 100#ifdef CONFIG_SMP
 101/*
 102 * For asym packing, by default the lower numbered CPU has higher priority.
 103 */
 104int __weak arch_asym_cpu_priority(int cpu)
 105{
 106        return -cpu;
 107}
 108
 109/*
 110 * The margin used when comparing utilization with CPU capacity.
 111 *
 112 * (default: ~20%)
 113 */
 114#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
 115
 116/*
  117 * The margin used when comparing CPU capacities:
  118 * is 'cap1' noticeably greater than 'cap2'?
 119 *
 120 * (default: ~5%)
 121 */
 122#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
 123#endif
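
/*
 * Rough worked example of the two margins above (illustrative only, assuming
 * a full CPU capacity of 1024):
 *
 *   fits_capacity(800, 1024):  800 * 1280 = 1024000 < 1024 * 1024 = 1048576,
 *   so a utilization of 800 still fits a 1024-capacity CPU (below ~80%).
 *
 *   capacity_greater(1024, 950): 1024 * 1024 = 1048576 > 950 * 1078 = 1024100,
 *   so 1024 is considered noticeably (more than ~5%) greater than 950.
 */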
 124
 125#ifdef CONFIG_CFS_BANDWIDTH
 126/*
 127 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 128 * each time a cfs_rq requests quota.
 129 *
 130 * Note: in the case that the slice exceeds the runtime remaining (either due
 131 * to consumption or the quota being specified to be smaller than the slice)
 132 * we will always only issue the remaining available time.
 133 *
 134 * (default: 5 msec, units: microseconds)
 135 */
 136unsigned int sysctl_sched_cfs_bandwidth_slice           = 5000UL;
 137#endif
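
/*
 * Worked example (illustrative, using the defaults above): with a group quota
 * of 20ms per 100ms period and a 5ms slice, a cfs_rq pulls runtime from the
 * global pool in up to four 5ms chunks per period; once only 3ms of quota
 * remains, the request is trimmed to those 3ms.
 */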
 138
 139static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 140{
 141        lw->weight += inc;
 142        lw->inv_weight = 0;
 143}
 144
 145static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 146{
 147        lw->weight -= dec;
 148        lw->inv_weight = 0;
 149}
 150
 151static inline void update_load_set(struct load_weight *lw, unsigned long w)
 152{
 153        lw->weight = w;
 154        lw->inv_weight = 0;
 155}
 156
 157/*
 158 * Increase the granularity value when there are more CPUs,
 159 * because with more CPUs the 'effective latency' as visible
 160 * to users decreases. But the relationship is not linear,
 161 * so pick a second-best guess by going with the log2 of the
 162 * number of CPUs.
 163 *
 164 * This idea comes from the SD scheduler of Con Kolivas:
 165 */
 166static unsigned int get_update_sysctl_factor(void)
 167{
 168        unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 169        unsigned int factor;
 170
 171        switch (sysctl_sched_tunable_scaling) {
 172        case SCHED_TUNABLESCALING_NONE:
 173                factor = 1;
 174                break;
 175        case SCHED_TUNABLESCALING_LINEAR:
 176                factor = cpus;
 177                break;
 178        case SCHED_TUNABLESCALING_LOG:
 179        default:
 180                factor = 1 + ilog2(cpus);
 181                break;
 182        }
 183
 184        return factor;
 185}
 186
 187static void update_sysctl(void)
 188{
 189        unsigned int factor = get_update_sysctl_factor();
 190
 191#define SET_SYSCTL(name) \
 192        (sysctl_##name = (factor) * normalized_sysctl_##name)
 193        SET_SYSCTL(sched_min_granularity);
 194        SET_SYSCTL(sched_latency);
 195        SET_SYSCTL(sched_wakeup_granularity);
 196#undef SET_SYSCTL
 197}
 198
 199void __init sched_init_granularity(void)
 200{
 201        update_sysctl();
 202}
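
/*
 * Minimal user-space sketch of the scaling policy above (illustrative only;
 * the helper name is made up). With the default SCHED_TUNABLESCALING_LOG
 * policy, 16 online CPUs clamp to 8, giving factor = 1 + ilog2(8) = 4, so the
 * normalized 6ms latency scales to 24ms, the 0.75ms minimum granularity to
 * 3ms and the 1ms wakeup granularity to 4ms.
 */
static unsigned int example_tunable_factor(unsigned int online_cpus)
{
	unsigned int cpus = online_cpus < 8 ? online_cpus : 8;
	unsigned int log2 = 0;

	while (cpus >>= 1)	/* integer log2, as ilog2() computes it */
		log2++;

	return 1 + log2;	/* SCHED_TUNABLESCALING_LOG */
}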
 203
 204#define WMULT_CONST     (~0U)
 205#define WMULT_SHIFT     32
 206
 207static void __update_inv_weight(struct load_weight *lw)
 208{
 209        unsigned long w;
 210
 211        if (likely(lw->inv_weight))
 212                return;
 213
 214        w = scale_load_down(lw->weight);
 215
 216        if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 217                lw->inv_weight = 1;
 218        else if (unlikely(!w))
 219                lw->inv_weight = WMULT_CONST;
 220        else
 221                lw->inv_weight = WMULT_CONST / w;
 222}
 223
 224/*
 225 * delta_exec * weight / lw.weight
 226 *   OR
 227 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 228 *
 229 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 230 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 231 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 232 *
  233 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 234 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 235 */
 236static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 237{
 238        u64 fact = scale_load_down(weight);
 239        u32 fact_hi = (u32)(fact >> 32);
 240        int shift = WMULT_SHIFT;
 241        int fs;
 242
 243        __update_inv_weight(lw);
 244
 245        if (unlikely(fact_hi)) {
 246                fs = fls(fact_hi);
 247                shift -= fs;
 248                fact >>= fs;
 249        }
 250
 251        fact = mul_u32_u32(fact, lw->inv_weight);
 252
 253        fact_hi = (u32)(fact >> 32);
 254        if (fact_hi) {
 255                fs = fls(fact_hi);
 256                shift -= fs;
 257                fact >>= fs;
 258        }
 259
 260        return mul_u64_u32_shr(delta_exec, fact, shift);
 261}
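
/*
 * Worked example of the fixed-point math above (illustrative only): for a
 * NICE_0 entity (scaled weight 1024) on a cfs_rq whose scaled weight is 3072,
 * inv_weight = WMULT_CONST / 3072 = 1398101, so
 *
 *   __calc_delta(3000000, NICE_0_LOAD, lw)
 *     ~= (3000000 * 1024 * 1398101) >> 32
 *     ~= 999999 ns
 *
 * i.e. roughly delta_exec * 1024 / 3072, one third of the 3ms delta, with the
 * tiny error coming from rounding inv_weight down.
 */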
 262
 263
 264const struct sched_class fair_sched_class;
 265
 266/**************************************************************
 267 * CFS operations on generic schedulable entities:
 268 */
 269
 270#ifdef CONFIG_FAIR_GROUP_SCHED
 271
 272/* Walk up scheduling entities hierarchy */
 273#define for_each_sched_entity(se) \
 274                for (; se; se = se->parent)
 275
 276static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 277{
 278        if (!path)
 279                return;
 280
 281        if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
 282                autogroup_path(cfs_rq->tg, path, len);
 283        else if (cfs_rq && cfs_rq->tg->css.cgroup)
 284                cgroup_path(cfs_rq->tg->css.cgroup, path, len);
 285        else
 286                strlcpy(path, "(null)", len);
 287}
 288
 289static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 290{
 291        struct rq *rq = rq_of(cfs_rq);
 292        int cpu = cpu_of(rq);
 293
 294        if (cfs_rq->on_list)
 295                return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
 296
 297        cfs_rq->on_list = 1;
 298
 299        /*
 300         * Ensure we either appear before our parent (if already
 301         * enqueued) or force our parent to appear after us when it is
 302         * enqueued. The fact that we always enqueue bottom-up
 303         * reduces this to two cases and a special case for the root
 304         * cfs_rq. Furthermore, it also means that we will always reset
 305         * tmp_alone_branch either when the branch is connected
 306         * to a tree or when we reach the top of the tree
 307         */
 308        if (cfs_rq->tg->parent &&
 309            cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
 310                /*
 311                 * If parent is already on the list, we add the child
  312                 * just before. Thanks to the circular linked property of
  313                 * the list, this means putting the child at the tail
  314                 * of the list that starts at the parent.
 315                 */
 316                list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 317                        &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
 318                /*
 319                 * The branch is now connected to its tree so we can
 320                 * reset tmp_alone_branch to the beginning of the
 321                 * list.
 322                 */
 323                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 324                return true;
 325        }
 326
 327        if (!cfs_rq->tg->parent) {
 328                /*
 329                 * cfs rq without parent should be put
 330                 * at the tail of the list.
 331                 */
 332                list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 333                        &rq->leaf_cfs_rq_list);
 334                /*
  335                 * We have reached the top of a tree so we can reset
 336                 * tmp_alone_branch to the beginning of the list.
 337                 */
 338                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 339                return true;
 340        }
 341
 342        /*
  343         * The parent has not been added yet, so we want to
  344         * make sure that it will be put after us.
  345         * tmp_alone_branch points to the beginning of the branch
  346         * where we will add the parent.
 347         */
 348        list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
 349        /*
  350         * Update tmp_alone_branch to point to the new beginning
  351         * of the branch.
 352         */
 353        rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
 354        return false;
 355}
 356
 357static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 358{
 359        if (cfs_rq->on_list) {
 360                struct rq *rq = rq_of(cfs_rq);
 361
 362                /*
 363                 * With cfs_rq being unthrottled/throttled during an enqueue,
  364                 * it can happen that tmp_alone_branch points to a leaf that
  365                 * we finally want to delete. In this case, tmp_alone_branch moves
  366                 * to the previous element, but it will point to rq->leaf_cfs_rq_list
 367                 * at the end of the enqueue.
 368                 */
 369                if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
 370                        rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
 371
 372                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 373                cfs_rq->on_list = 0;
 374        }
 375}
 376
 377static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 378{
 379        SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
 380}
 381
  382/* Iterate through all leaf cfs_rq's on a runqueue */
 383#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                      \
 384        list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
 385                                 leaf_cfs_rq_list)
 386
 387/* Do the two (enqueued) entities belong to the same group ? */
 388static inline struct cfs_rq *
 389is_same_group(struct sched_entity *se, struct sched_entity *pse)
 390{
 391        if (se->cfs_rq == pse->cfs_rq)
 392                return se->cfs_rq;
 393
 394        return NULL;
 395}
 396
 397static inline struct sched_entity *parent_entity(struct sched_entity *se)
 398{
 399        return se->parent;
 400}
 401
 402static void
 403find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 404{
 405        int se_depth, pse_depth;
 406
 407        /*
  408         * A preemption test can be made between sibling entities that are in
  409         * the same cfs_rq, i.e. that have a common parent. Walk up the
  410         * hierarchy of both tasks until we find their ancestors that are
  411         * siblings with a common parent.
 412         */
 413
 414        /* First walk up until both entities are at same depth */
 415        se_depth = (*se)->depth;
 416        pse_depth = (*pse)->depth;
 417
 418        while (se_depth > pse_depth) {
 419                se_depth--;
 420                *se = parent_entity(*se);
 421        }
 422
 423        while (pse_depth > se_depth) {
 424                pse_depth--;
 425                *pse = parent_entity(*pse);
 426        }
 427
 428        while (!is_same_group(*se, *pse)) {
 429                *se = parent_entity(*se);
 430                *pse = parent_entity(*pse);
 431        }
 432}
 433
 434#else   /* !CONFIG_FAIR_GROUP_SCHED */
 435
 436#define for_each_sched_entity(se) \
 437                for (; se; se = NULL)
 438
 439static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 440{
 441        if (path)
 442                strlcpy(path, "(null)", len);
 443}
 444
 445static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 446{
 447        return true;
 448}
 449
 450static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 451{
 452}
 453
 454static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 455{
 456}
 457
 458#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)      \
 459                for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 460
 461static inline struct sched_entity *parent_entity(struct sched_entity *se)
 462{
 463        return NULL;
 464}
 465
 466static inline void
 467find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 468{
 469}
 470
 471#endif  /* CONFIG_FAIR_GROUP_SCHED */
 472
 473static __always_inline
 474void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 475
 476/**************************************************************
 477 * Scheduling class tree data structure manipulation methods:
 478 */
 479
 480static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 481{
 482        s64 delta = (s64)(vruntime - max_vruntime);
 483        if (delta > 0)
 484                max_vruntime = vruntime;
 485
 486        return max_vruntime;
 487}
 488
 489static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 490{
 491        s64 delta = (s64)(vruntime - min_vruntime);
 492        if (delta < 0)
 493                min_vruntime = vruntime;
 494
 495        return min_vruntime;
 496}
 497
 498static inline bool entity_before(struct sched_entity *a,
 499                                struct sched_entity *b)
 500{
 501        return (s64)(a->vruntime - b->vruntime) < 0;
 502}
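
/*
 * The signed-delta comparisons above stay correct even after the u64
 * vruntime clock wraps around. Minimal stand-alone sketch (illustrative only;
 * the helper name is made up):
 */
static inline int example_vruntime_before(unsigned long long a,
					  unsigned long long b)
{
	/*
	 * With a = ULLONG_MAX - 10 and b = 5 (b has wrapped past zero),
	 * the naive a < b is false, but (long long)(a - b) == -16 < 0
	 * correctly reports that a is "before" b.
	 */
	return (long long)(a - b) < 0;
}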
 503
 504#define __node_2_se(node) \
 505        rb_entry((node), struct sched_entity, run_node)
 506
 507static void update_min_vruntime(struct cfs_rq *cfs_rq)
 508{
 509        struct sched_entity *curr = cfs_rq->curr;
 510        struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
 511
 512        u64 vruntime = cfs_rq->min_vruntime;
 513
 514        if (curr) {
 515                if (curr->on_rq)
 516                        vruntime = curr->vruntime;
 517                else
 518                        curr = NULL;
 519        }
 520
 521        if (leftmost) { /* non-empty tree */
 522                struct sched_entity *se = __node_2_se(leftmost);
 523
 524                if (!curr)
 525                        vruntime = se->vruntime;
 526                else
 527                        vruntime = min_vruntime(vruntime, se->vruntime);
 528        }
 529
 530        /* ensure we never gain time by being placed backwards. */
 531        cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 532#ifndef CONFIG_64BIT
 533        smp_wmb();
 534        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 535#endif
 536}
 537
 538static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
 539{
 540        return entity_before(__node_2_se(a), __node_2_se(b));
 541}
 542
 543/*
 544 * Enqueue an entity into the rb-tree:
 545 */
 546static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 547{
 548        rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
 549}
 550
 551static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 552{
 553        rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
 554}
 555
 556struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 557{
 558        struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
 559
 560        if (!left)
 561                return NULL;
 562
 563        return __node_2_se(left);
 564}
 565
 566static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 567{
 568        struct rb_node *next = rb_next(&se->run_node);
 569
 570        if (!next)
 571                return NULL;
 572
 573        return __node_2_se(next);
 574}
 575
 576#ifdef CONFIG_SCHED_DEBUG
 577struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 578{
 579        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
 580
 581        if (!last)
 582                return NULL;
 583
 584        return __node_2_se(last);
 585}
 586
 587/**************************************************************
 588 * Scheduling class statistics methods:
 589 */
 590
 591int sched_update_scaling(void)
 592{
 593        unsigned int factor = get_update_sysctl_factor();
 594
 595        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 596                                        sysctl_sched_min_granularity);
 597
 598#define WRT_SYSCTL(name) \
 599        (normalized_sysctl_##name = sysctl_##name / (factor))
 600        WRT_SYSCTL(sched_min_granularity);
 601        WRT_SYSCTL(sched_latency);
 602        WRT_SYSCTL(sched_wakeup_granularity);
 603#undef WRT_SYSCTL
 604
 605        return 0;
 606}
 607#endif
 608
 609/*
 610 * delta /= w
 611 */
 612static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 613{
 614        if (unlikely(se->load.weight != NICE_0_LOAD))
 615                delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 616
 617        return delta;
 618}
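
/*
 * Worked example (illustrative): a nice -5 task has load weight 3121, so its
 * vruntime advances by delta * 1024 / 3121, roughly a third of wall time,
 * while a nice +5 task (weight 335) accumulates vruntime about three times
 * faster than wall time. NICE_0 tasks take the early return above and advance
 * vruntime at exactly the wall-clock rate.
 */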
 619
 620/*
 621 * The idea is to set a period in which each task runs once.
 622 *
  623 * When there are too many tasks (more than sched_nr_latency) we have to
  624 * stretch this period because otherwise the slices get too small.
 625 *
 626 * p = (nr <= nl) ? l : l*nr/nl
 627 */
 628static u64 __sched_period(unsigned long nr_running)
 629{
 630        if (unlikely(nr_running > sched_nr_latency))
 631                return nr_running * sysctl_sched_min_granularity;
 632        else
 633                return sysctl_sched_latency;
 634}
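
/*
 * Worked example (illustrative, with the unscaled defaults above): with up to
 * sched_nr_latency = 8 runnable tasks the period stays at sysctl_sched_latency
 * = 6ms; with 16 runnable tasks it stretches to 16 * 0.75ms = 12ms so no slice
 * drops below the minimum granularity.
 */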
 635
 636/*
 637 * We calculate the wall-time slice from the period by taking a part
 638 * proportional to the weight.
 639 *
 640 * s = p*P[w/rw]
 641 */
 642static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 643{
 644        unsigned int nr_running = cfs_rq->nr_running;
 645        u64 slice;
 646
 647        if (sched_feat(ALT_PERIOD))
 648                nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
 649
 650        slice = __sched_period(nr_running + !se->on_rq);
 651
 652        for_each_sched_entity(se) {
 653                struct load_weight *load;
 654                struct load_weight lw;
 655
 656                cfs_rq = cfs_rq_of(se);
 657                load = &cfs_rq->load;
 658
 659                if (unlikely(!se->on_rq)) {
 660                        lw = cfs_rq->load;
 661
 662                        update_load_add(&lw, se->load.weight);
 663                        load = &lw;
 664                }
 665                slice = __calc_delta(slice, se->load.weight, load);
 666        }
 667
 668        if (sched_feat(BASE_SLICE))
 669                slice = max(slice, (u64)sysctl_sched_min_granularity);
 670
 671        return slice;
 672}
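
/*
 * Minimal stand-alone sketch of the proportional split above (illustrative
 * only; the helper name is made up and a plain division replaces
 * __calc_delta()). Two runnable tasks at nice 0 (weight 1024) and nice 5
 * (weight 335) share a 6ms period as roughly 4.52ms and 1.48ms respectively.
 */
static unsigned long long example_slice_ns(unsigned long long period_ns,
					   unsigned long weight,
					   unsigned long rq_weight)
{
	/* s = p * w / rw */
	return period_ns * weight / rq_weight;
}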
 673
 674/*
 675 * We calculate the vruntime slice of a to-be-inserted task.
 676 *
 677 * vs = s/w
 678 */
 679static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 680{
 681        return calc_delta_fair(sched_slice(cfs_rq, se), se);
 682}
 683
 684#include "pelt.h"
 685#ifdef CONFIG_SMP
 686
 687static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 688static unsigned long task_h_load(struct task_struct *p);
 689static unsigned long capacity_of(int cpu);
 690
  691/* Give a new sched_entity initial runnable values so it is seen as a heavy load while still young */
 692void init_entity_runnable_average(struct sched_entity *se)
 693{
 694        struct sched_avg *sa = &se->avg;
 695
 696        memset(sa, 0, sizeof(*sa));
 697
 698        /*
 699         * Tasks are initialized with full load to be seen as heavy tasks until
 700         * they get a chance to stabilize to their real load level.
 701         * Group entities are initialized with zero load to reflect the fact that
 702         * nothing has been attached to the task group yet.
 703         */
 704        if (entity_is_task(se))
 705                sa->load_avg = scale_load_down(se->load.weight);
 706
  707        /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
 708}
 709
 710static void attach_entity_cfs_rq(struct sched_entity *se);
 711
 712/*
 713 * With new tasks being created, their initial util_avgs are extrapolated
 714 * based on the cfs_rq's current util_avg:
 715 *
 716 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 717 *
 718 * However, in many cases, the above util_avg does not give a desired
 719 * value. Moreover, the sum of the util_avgs may be divergent, such
 720 * as when the series is a harmonic series.
 721 *
 722 * To solve this problem, we also cap the util_avg of successive tasks to
 723 * only 1/2 of the left utilization budget:
 724 *
 725 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
 726 *
 727 * where n denotes the nth task and cpu_scale the CPU capacity.
 728 *
 729 * For example, for a CPU with 1024 of capacity, a simplest series from
 730 * the beginning would be like:
 731 *
 732 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 733 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 734 *
 735 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 736 * if util_avg > util_avg_cap.
 737 */
 738void post_init_entity_util_avg(struct task_struct *p)
 739{
 740        struct sched_entity *se = &p->se;
 741        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 742        struct sched_avg *sa = &se->avg;
 743        long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
 744        long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
 745
 746        if (cap > 0) {
 747                if (cfs_rq->avg.util_avg != 0) {
 748                        sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
 749                        sa->util_avg /= (cfs_rq->avg.load_avg + 1);
 750
 751                        if (sa->util_avg > cap)
 752                                sa->util_avg = cap;
 753                } else {
 754                        sa->util_avg = cap;
 755                }
 756        }
 757
 758        sa->runnable_avg = sa->util_avg;
 759
 760        if (p->sched_class != &fair_sched_class) {
 761                /*
 762                 * For !fair tasks do:
 763                 *
 764                update_cfs_rq_load_avg(now, cfs_rq);
 765                attach_entity_load_avg(cfs_rq, se);
 766                switched_from_fair(rq, p);
 767                 *
 768                 * such that the next switched_to_fair() has the
 769                 * expected state.
 770                 */
 771                se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
 772                return;
 773        }
 774
 775        attach_entity_cfs_rq(se);
 776}
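
/*
 * Worked example of the cap above (illustrative): on a 1024-capacity CPU
 * whose cfs_rq already has util_avg = 600, a new task starts with at most
 * (1024 - 600) / 2 = 212 of utilization, whatever the load-weight-based
 * extrapolation would have produced.
 */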
 777
 778#else /* !CONFIG_SMP */
 779void init_entity_runnable_average(struct sched_entity *se)
 780{
 781}
 782void post_init_entity_util_avg(struct task_struct *p)
 783{
 784}
 785static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 786{
 787}
 788#endif /* CONFIG_SMP */
 789
 790/*
 791 * Update the current task's runtime statistics.
 792 */
 793static void update_curr(struct cfs_rq *cfs_rq)
 794{
 795        struct sched_entity *curr = cfs_rq->curr;
 796        u64 now = rq_clock_task(rq_of(cfs_rq));
 797        u64 delta_exec;
 798
 799        if (unlikely(!curr))
 800                return;
 801
 802        delta_exec = now - curr->exec_start;
 803        if (unlikely((s64)delta_exec <= 0))
 804                return;
 805
 806        curr->exec_start = now;
 807
 808        schedstat_set(curr->statistics.exec_max,
 809                      max(delta_exec, curr->statistics.exec_max));
 810
 811        curr->sum_exec_runtime += delta_exec;
 812        schedstat_add(cfs_rq->exec_clock, delta_exec);
 813
 814        curr->vruntime += calc_delta_fair(delta_exec, curr);
 815        update_min_vruntime(cfs_rq);
 816
 817        if (entity_is_task(curr)) {
 818                struct task_struct *curtask = task_of(curr);
 819
 820                trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 821                cgroup_account_cputime(curtask, delta_exec);
 822                account_group_exec_runtime(curtask, delta_exec);
 823        }
 824
 825        account_cfs_rq_runtime(cfs_rq, delta_exec);
 826}
 827
 828static void update_curr_fair(struct rq *rq)
 829{
 830        update_curr(cfs_rq_of(&rq->curr->se));
 831}
 832
 833static inline void
 834update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 835{
 836        u64 wait_start, prev_wait_start;
 837
 838        if (!schedstat_enabled())
 839                return;
 840
 841        wait_start = rq_clock(rq_of(cfs_rq));
 842        prev_wait_start = schedstat_val(se->statistics.wait_start);
 843
 844        if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
 845            likely(wait_start > prev_wait_start))
 846                wait_start -= prev_wait_start;
 847
 848        __schedstat_set(se->statistics.wait_start, wait_start);
 849}
 850
 851static inline void
 852update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 853{
 854        struct task_struct *p;
 855        u64 delta;
 856
 857        if (!schedstat_enabled())
 858                return;
 859
 860        /*
  861         * When sched_schedstat changes from 0 to 1, some sched entities
  862         * may already be on the runqueue with se->statistics.wait_start
  863         * still 0, which would make the computed delta wrong. We need to
  864         * avoid this scenario.
 865         */
 866        if (unlikely(!schedstat_val(se->statistics.wait_start)))
 867                return;
 868
 869        delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
 870
 871        if (entity_is_task(se)) {
 872                p = task_of(se);
 873                if (task_on_rq_migrating(p)) {
 874                        /*
 875                         * Preserve migrating task's wait time so wait_start
 876                         * time stamp can be adjusted to accumulate wait time
 877                         * prior to migration.
 878                         */
 879                        __schedstat_set(se->statistics.wait_start, delta);
 880                        return;
 881                }
 882                trace_sched_stat_wait(p, delta);
 883        }
 884
 885        __schedstat_set(se->statistics.wait_max,
 886                      max(schedstat_val(se->statistics.wait_max), delta));
 887        __schedstat_inc(se->statistics.wait_count);
 888        __schedstat_add(se->statistics.wait_sum, delta);
 889        __schedstat_set(se->statistics.wait_start, 0);
 890}
 891
 892static inline void
 893update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 894{
 895        struct task_struct *tsk = NULL;
 896        u64 sleep_start, block_start;
 897
 898        if (!schedstat_enabled())
 899                return;
 900
 901        sleep_start = schedstat_val(se->statistics.sleep_start);
 902        block_start = schedstat_val(se->statistics.block_start);
 903
 904        if (entity_is_task(se))
 905                tsk = task_of(se);
 906
 907        if (sleep_start) {
 908                u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
 909
 910                if ((s64)delta < 0)
 911                        delta = 0;
 912
 913                if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
 914                        __schedstat_set(se->statistics.sleep_max, delta);
 915
 916                __schedstat_set(se->statistics.sleep_start, 0);
 917                __schedstat_add(se->statistics.sum_sleep_runtime, delta);
 918
 919                if (tsk) {
 920                        account_scheduler_latency(tsk, delta >> 10, 1);
 921                        trace_sched_stat_sleep(tsk, delta);
 922                }
 923        }
 924        if (block_start) {
 925                u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
 926
 927                if ((s64)delta < 0)
 928                        delta = 0;
 929
 930                if (unlikely(delta > schedstat_val(se->statistics.block_max)))
 931                        __schedstat_set(se->statistics.block_max, delta);
 932
 933                __schedstat_set(se->statistics.block_start, 0);
 934                __schedstat_add(se->statistics.sum_sleep_runtime, delta);
 935
 936                if (tsk) {
 937                        if (tsk->in_iowait) {
 938                                __schedstat_add(se->statistics.iowait_sum, delta);
 939                                __schedstat_inc(se->statistics.iowait_count);
 940                                trace_sched_stat_iowait(tsk, delta);
 941                        }
 942
 943                        trace_sched_stat_blocked(tsk, delta);
 944
 945                        /*
 946                         * Blocking time is in units of nanosecs, so shift by
 947                         * 20 to get a milliseconds-range estimation of the
 948                         * amount of time that the task spent sleeping:
 949                         */
 950                        if (unlikely(prof_on == SLEEP_PROFILING)) {
 951                                profile_hits(SLEEP_PROFILING,
 952                                                (void *)get_wchan(tsk),
 953                                                delta >> 20);
 954                        }
 955                        account_scheduler_latency(tsk, delta >> 10, 0);
 956                }
 957        }
 958}
 959
 960/*
 961 * Task is being enqueued - update stats:
 962 */
 963static inline void
 964update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 965{
 966        if (!schedstat_enabled())
 967                return;
 968
 969        /*
 970         * Are we enqueueing a waiting task? (for current tasks
 971         * a dequeue/enqueue event is a NOP)
 972         */
 973        if (se != cfs_rq->curr)
 974                update_stats_wait_start(cfs_rq, se);
 975
 976        if (flags & ENQUEUE_WAKEUP)
 977                update_stats_enqueue_sleeper(cfs_rq, se);
 978}
 979
 980static inline void
 981update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 982{
 983
 984        if (!schedstat_enabled())
 985                return;
 986
 987        /*
 988         * Mark the end of the wait period if dequeueing a
 989         * waiting task:
 990         */
 991        if (se != cfs_rq->curr)
 992                update_stats_wait_end(cfs_rq, se);
 993
 994        if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
 995                struct task_struct *tsk = task_of(se);
 996                unsigned int state;
 997
 998                /* XXX racy against TTWU */
 999                state = READ_ONCE(tsk->__state);
1000                if (state & TASK_INTERRUPTIBLE)
1001                        __schedstat_set(se->statistics.sleep_start,
1002                                      rq_clock(rq_of(cfs_rq)));
1003                if (state & TASK_UNINTERRUPTIBLE)
1004                        __schedstat_set(se->statistics.block_start,
1005                                      rq_clock(rq_of(cfs_rq)));
1006        }
1007}
1008
1009/*
1010 * We are picking a new current task - update its stats:
1011 */
1012static inline void
1013update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1014{
1015        /*
1016         * We are starting a new run period:
1017         */
1018        se->exec_start = rq_clock_task(rq_of(cfs_rq));
1019}
1020
1021/**************************************************
1022 * Scheduling class queueing methods:
1023 */
1024
1025#ifdef CONFIG_NUMA_BALANCING
1026/*
1027 * Approximate time to scan a full NUMA task in ms. The task scan period is
 1028 * calculated based on the task's virtual memory size and
1029 * numa_balancing_scan_size.
1030 */
1031unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1032unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1033
1034/* Portion of address space to scan in MB */
1035unsigned int sysctl_numa_balancing_scan_size = 256;
1036
1037/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1038unsigned int sysctl_numa_balancing_scan_delay = 1000;
1039
1040struct numa_group {
1041        refcount_t refcount;
1042
1043        spinlock_t lock; /* nr_tasks, tasks */
1044        int nr_tasks;
1045        pid_t gid;
1046        int active_nodes;
1047
1048        struct rcu_head rcu;
1049        unsigned long total_faults;
1050        unsigned long max_faults_cpu;
1051        /*
1052         * Faults_cpu is used to decide whether memory should move
1053         * towards the CPU. As a consequence, these stats are weighted
1054         * more by CPU use than by memory faults.
1055         */
1056        unsigned long *faults_cpu;
1057        unsigned long faults[];
1058};
1059
1060/*
1061 * For functions that can be called in multiple contexts that permit reading
1062 * ->numa_group (see struct task_struct for locking rules).
1063 */
1064static struct numa_group *deref_task_numa_group(struct task_struct *p)
1065{
1066        return rcu_dereference_check(p->numa_group, p == current ||
1067                (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1068}
1069
1070static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1071{
1072        return rcu_dereference_protected(p->numa_group, p == current);
1073}
1074
1075static inline unsigned long group_faults_priv(struct numa_group *ng);
1076static inline unsigned long group_faults_shared(struct numa_group *ng);
1077
1078static unsigned int task_nr_scan_windows(struct task_struct *p)
1079{
1080        unsigned long rss = 0;
1081        unsigned long nr_scan_pages;
1082
1083        /*
 1084         * Calculations are based on RSS, as non-present and empty pages are
 1085         * skipped by the PTE scanner and NUMA hinting faults should be
 1086         * trapped based on resident pages.
1087         */
1088        nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1089        rss = get_mm_rss(p->mm);
1090        if (!rss)
1091                rss = nr_scan_pages;
1092
1093        rss = round_up(rss, nr_scan_pages);
1094        return rss / nr_scan_pages;
1095}
1096
1097/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1098#define MAX_SCAN_WINDOW 2560
1099
1100static unsigned int task_scan_min(struct task_struct *p)
1101{
1102        unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1103        unsigned int scan, floor;
1104        unsigned int windows = 1;
1105
1106        if (scan_size < MAX_SCAN_WINDOW)
1107                windows = MAX_SCAN_WINDOW / scan_size;
1108        floor = 1000 / windows;
1109
1110        scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1111        return max_t(unsigned int, floor, scan);
1112}
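
/*
 * Worked example (illustrative, with the defaults above): a 256MB scan size
 * allows at most MAX_SCAN_WINDOW / 256 = 10 windows per second, giving a
 * 1000 / 10 = 100ms floor. A task with a 1GB RSS needs 1024 / 256 = 4 windows
 * to cover its address space, so task_scan_min() returns
 * max(100, 1000 / 4) = 250ms between scans.
 */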
1113
1114static unsigned int task_scan_start(struct task_struct *p)
1115{
1116        unsigned long smin = task_scan_min(p);
1117        unsigned long period = smin;
1118        struct numa_group *ng;
1119
1120        /* Scale the maximum scan period with the amount of shared memory. */
1121        rcu_read_lock();
1122        ng = rcu_dereference(p->numa_group);
1123        if (ng) {
1124                unsigned long shared = group_faults_shared(ng);
1125                unsigned long private = group_faults_priv(ng);
1126
1127                period *= refcount_read(&ng->refcount);
1128                period *= shared + 1;
1129                period /= private + shared + 1;
1130        }
1131        rcu_read_unlock();
1132
1133        return max(smin, period);
1134}
1135
1136static unsigned int task_scan_max(struct task_struct *p)
1137{
1138        unsigned long smin = task_scan_min(p);
1139        unsigned long smax;
1140        struct numa_group *ng;
1141
1142        /* Watch for min being lower than max due to floor calculations */
1143        smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1144
1145        /* Scale the maximum scan period with the amount of shared memory. */
1146        ng = deref_curr_numa_group(p);
1147        if (ng) {
1148                unsigned long shared = group_faults_shared(ng);
1149                unsigned long private = group_faults_priv(ng);
1150                unsigned long period = smax;
1151
1152                period *= refcount_read(&ng->refcount);
1153                period *= shared + 1;
1154                period /= private + shared + 1;
1155
1156                smax = max(smax, period);
1157        }
1158
1159        return max(smin, smax);
1160}
1161
1162static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1163{
1164        rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1165        rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1166}
1167
1168static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1169{
1170        rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1171        rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1172}
1173
1174/* Shared or private faults. */
1175#define NR_NUMA_HINT_FAULT_TYPES 2
1176
1177/* Memory and CPU locality */
1178#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1179
1180/* Averaged statistics, and temporary buffers. */
1181#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1182
1183pid_t task_numa_group_id(struct task_struct *p)
1184{
1185        struct numa_group *ng;
1186        pid_t gid = 0;
1187
1188        rcu_read_lock();
1189        ng = rcu_dereference(p->numa_group);
1190        if (ng)
1191                gid = ng->gid;
1192        rcu_read_unlock();
1193
1194        return gid;
1195}
1196
1197/*
1198 * The averaged statistics, shared & private, memory & CPU,
1199 * occupy the first half of the array. The second half of the
1200 * array is for current counters, which are averaged into the
1201 * first set by task_numa_placement.
1202 */
1203static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1204{
1205        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1206}
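
/*
 * Worked example (illustrative, assuming NUMA_MEM == 0): on a two-node system
 * (nr_node_ids == 2) the per-task faults array holds
 * NR_NUMA_HINT_FAULT_BUCKETS * 2 = 16 counters, and
 * task_faults_idx(NUMA_MEM, 1, 0) = 2 * (0 * 2 + 1) + 0 = 2 selects the
 * averaged shared memory-fault counter for node 1.
 */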
1207
1208static inline unsigned long task_faults(struct task_struct *p, int nid)
1209{
1210        if (!p->numa_faults)
1211                return 0;
1212
1213        return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1214                p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1215}
1216
1217static inline unsigned long group_faults(struct task_struct *p, int nid)
1218{
1219        struct numa_group *ng = deref_task_numa_group(p);
1220
1221        if (!ng)
1222                return 0;
1223
1224        return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1225                ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1226}
1227
1228static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1229{
1230        return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1231                group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1232}
1233
1234static inline unsigned long group_faults_priv(struct numa_group *ng)
1235{
1236        unsigned long faults = 0;
1237        int node;
1238
1239        for_each_online_node(node) {
1240                faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1241        }
1242
1243        return faults;
1244}
1245
1246static inline unsigned long group_faults_shared(struct numa_group *ng)
1247{
1248        unsigned long faults = 0;
1249        int node;
1250
1251        for_each_online_node(node) {
1252                faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1253        }
1254
1255        return faults;
1256}
1257
1258/*
1259 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1260 * considered part of a numa group's pseudo-interleaving set. Migrations
1261 * between these nodes are slowed down, to allow things to settle down.
1262 */
1263#define ACTIVE_NODE_FRACTION 3
1264
1265static bool numa_is_active_node(int nid, struct numa_group *ng)
1266{
1267        return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1268}
1269
1270/* Handle placement on systems where not all nodes are directly connected. */
1271static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1272                                        int maxdist, bool task)
1273{
1274        unsigned long score = 0;
1275        int node;
1276
1277        /*
1278         * All nodes are directly connected, and the same distance
1279         * from each other. No need for fancy placement algorithms.
1280         */
1281        if (sched_numa_topology_type == NUMA_DIRECT)
1282                return 0;
1283
1284        /*
1285         * This code is called for each node, introducing N^2 complexity,
1286         * which should be ok given the number of nodes rarely exceeds 8.
1287         */
1288        for_each_online_node(node) {
1289                unsigned long faults;
1290                int dist = node_distance(nid, node);
1291
1292                /*
1293                 * The furthest away nodes in the system are not interesting
1294                 * for placement; nid was already counted.
1295                 */
1296                if (dist == sched_max_numa_distance || node == nid)
1297                        continue;
1298
1299                /*
1300                 * On systems with a backplane NUMA topology, compare groups
1301                 * of nodes, and move tasks towards the group with the most
1302                 * memory accesses. When comparing two nodes at distance
1303                 * "hoplimit", only nodes closer by than "hoplimit" are part
1304                 * of each group. Skip other nodes.
1305                 */
1306                if (sched_numa_topology_type == NUMA_BACKPLANE &&
1307                                        dist >= maxdist)
1308                        continue;
1309
1310                /* Add up the faults from nearby nodes. */
1311                if (task)
1312                        faults = task_faults(p, node);
1313                else
1314                        faults = group_faults(p, node);
1315
1316                /*
1317                 * On systems with a glueless mesh NUMA topology, there are
1318                 * no fixed "groups of nodes". Instead, nodes that are not
1319                 * directly connected bounce traffic through intermediate
1320                 * nodes; a numa_group can occupy any set of nodes.
1321                 * The further away a node is, the less the faults count.
1322                 * This seems to result in good task placement.
1323                 */
1324                if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1325                        faults *= (sched_max_numa_distance - dist);
1326                        faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1327                }
1328
1329                score += faults;
1330        }
1331
1332        return score;
1333}
1334
1335/*
1336 * These return the fraction of accesses done by a particular task, or
1337 * task group, on a particular numa node.  The group weight is given a
1338 * larger multiplier, in order to group tasks together that are almost
1339 * evenly spread out between numa nodes.
1340 */
1341static inline unsigned long task_weight(struct task_struct *p, int nid,
1342                                        int dist)
1343{
1344        unsigned long faults, total_faults;
1345
1346        if (!p->numa_faults)
1347                return 0;
1348
1349        total_faults = p->total_numa_faults;
1350
1351        if (!total_faults)
1352                return 0;
1353
1354        faults = task_faults(p, nid);
1355        faults += score_nearby_nodes(p, nid, dist, true);
1356
1357        return 1000 * faults / total_faults;
1358}
1359
1360static inline unsigned long group_weight(struct task_struct *p, int nid,
1361                                         int dist)
1362{
1363        struct numa_group *ng = deref_task_numa_group(p);
1364        unsigned long faults, total_faults;
1365
1366        if (!ng)
1367                return 0;
1368
1369        total_faults = ng->total_faults;
1370
1371        if (!total_faults)
1372                return 0;
1373
1374        faults = group_faults(p, nid);
1375        faults += score_nearby_nodes(p, nid, dist, false);
1376
1377        return 1000 * faults / total_faults;
1378}
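
/*
 * Worked example (illustrative): a task whose faults on node 2 (plus the
 * nearby-node score) amount to 300 out of a total of 1000 recorded faults
 * gets task_weight(p, 2, dist) = 1000 * 300 / 1000 = 300, i.e. the weights
 * are per-mille fractions of the task's (or group's) total fault activity.
 */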
1379
1380bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1381                                int src_nid, int dst_cpu)
1382{
1383        struct numa_group *ng = deref_curr_numa_group(p);
1384        int dst_nid = cpu_to_node(dst_cpu);
1385        int last_cpupid, this_cpupid;
1386
1387        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1388        last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1389
1390        /*
1391         * Allow first faults or private faults to migrate immediately early in
1392         * the lifetime of a task. The magic number 4 is based on waiting for
1393         * two full passes of the "multi-stage node selection" test that is
1394         * executed below.
1395         */
1396        if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1397            (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1398                return true;
1399
1400        /*
1401         * Multi-stage node selection is used in conjunction with a periodic
1402         * migration fault to build a temporal task<->page relation. By using
1403         * a two-stage filter we remove short/unlikely relations.
1404         *
1405         * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1406         * a task's usage of a particular page (n_p) per total usage of this
1407         * page (n_t) (in a given time-span) to a probability.
1408         *
1409         * Our periodic faults will sample this probability and getting the
1410         * same result twice in a row, given these samples are fully
1411         * independent, is then given by P(n)^2, provided our sample period
1412         * is sufficiently short compared to the usage pattern.
1413         *
 1414         * This quadratic squishes small probabilities, making it less likely we
1415         * act on an unlikely task<->page relation.
1416         */
1417        if (!cpupid_pid_unset(last_cpupid) &&
1418                                cpupid_to_nid(last_cpupid) != dst_nid)
1419                return false;
1420
1421        /* Always allow migrate on private faults */
1422        if (cpupid_match_pid(p, last_cpupid))
1423                return true;
1424
1425        /* A shared fault, but p->numa_group has not been set up yet. */
1426        if (!ng)
1427                return true;
1428
1429        /*
1430         * Destination node is much more heavily used than the source
1431         * node? Allow migration.
1432         */
1433        if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1434                                        ACTIVE_NODE_FRACTION)
1435                return true;
1436
1437        /*
1438         * Distribute memory according to CPU & memory use on each node,
1439         * with 3/4 hysteresis to avoid unnecessary memory migrations:
1440         *
1441         * faults_cpu(dst)   3   faults_cpu(src)
1442         * --------------- * - > ---------------
1443         * faults_mem(dst)   4   faults_mem(src)
1444         */
1445        return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1446               group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1447}
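
/*
 * Worked example of the 3/4 hysteresis above (illustrative): with
 * faults_cpu(dst) = 40, faults_mem(dst) = 80, faults_cpu(src) = 30 and
 * faults_mem(src) = 80, the check 40 * 80 * 3 = 9600 > 30 * 80 * 4 = 9600
 * fails, so the page stays put: the destination's CPU/memory fault ratio has
 * to beat the source's by more than a third before migration is allowed.
 */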
1448
1449/*
1450 * 'numa_type' describes the node at the moment of load balancing.
1451 */
1452enum numa_type {
1453        /* The node has spare capacity that can be used to run more tasks.  */
1454        node_has_spare = 0,
1455        /*
1456         * The node is fully used and the tasks don't compete for more CPU
1457         * cycles. Nevertheless, some tasks might wait before running.
1458         */
1459        node_fully_busy,
1460        /*
1461         * The node is overloaded and can't provide expected CPU cycles to all
1462         * tasks.
1463         */
1464        node_overloaded
1465};
1466
1467/* Cached statistics for all CPUs within a node */
1468struct numa_stats {
1469        unsigned long load;
1470        unsigned long runnable;
1471        unsigned long util;
1472        /* Total compute capacity of CPUs on a node */
1473        unsigned long compute_capacity;
1474        unsigned int nr_running;
1475        unsigned int weight;
1476        enum numa_type node_type;
1477        int idle_cpu;
1478};
1479
1480static inline bool is_core_idle(int cpu)
1481{
1482#ifdef CONFIG_SCHED_SMT
1483        int sibling;
1484
1485        for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1486                if (cpu == sibling)
1487                        continue;
1488
 1489                if (!idle_cpu(sibling))
1490                        return false;
1491        }
1492#endif
1493
1494        return true;
1495}
1496
1497struct task_numa_env {
1498        struct task_struct *p;
1499
1500        int src_cpu, src_nid;
1501        int dst_cpu, dst_nid;
1502
1503        struct numa_stats src_stats, dst_stats;
1504
1505        int imbalance_pct;
1506        int dist;
1507
1508        struct task_struct *best_task;
1509        long best_imp;
1510        int best_cpu;
1511};
1512
1513static unsigned long cpu_load(struct rq *rq);
1514static unsigned long cpu_runnable(struct rq *rq);
1515static unsigned long cpu_util(int cpu);
1516static inline long adjust_numa_imbalance(int imbalance,
1517                                        int dst_running, int dst_weight);
1518
1519static inline enum
1520numa_type numa_classify(unsigned int imbalance_pct,
1521                         struct numa_stats *ns)
1522{
1523        if ((ns->nr_running > ns->weight) &&
1524            (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1525             ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1526                return node_overloaded;
1527
1528        if ((ns->nr_running < ns->weight) ||
1529            (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1530             ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1531                return node_has_spare;
1532
1533        return node_fully_busy;
1534}
1535
1536#ifdef CONFIG_SCHED_SMT
1537/* Forward declarations of select_idle_sibling helpers */
1538static inline bool test_idle_cores(int cpu, bool def);
1539static inline int numa_idle_core(int idle_core, int cpu)
1540{
1541        if (!static_branch_likely(&sched_smt_present) ||
1542            idle_core >= 0 || !test_idle_cores(cpu, false))
1543                return idle_core;
1544
1545        /*
1546         * Prefer cores instead of packing HT siblings
1547         * and triggering future load balancing.
1548         */
1549        if (is_core_idle(cpu))
1550                idle_core = cpu;
1551
1552        return idle_core;
1553}
1554#else
1555static inline int numa_idle_core(int idle_core, int cpu)
1556{
1557        return idle_core;
1558}
1559#endif
1560
1561/*
1562 * Gather all necessary information to make NUMA balancing placement
1563 * decisions that are compatible with standard load balancer. This
1564 * borrows code and logic from update_sg_lb_stats but sharing a
1565 * common implementation is impractical.
1566 */
1567static void update_numa_stats(struct task_numa_env *env,
1568                              struct numa_stats *ns, int nid,
1569                              bool find_idle)
1570{
1571        int cpu, idle_core = -1;
1572
1573        memset(ns, 0, sizeof(*ns));
1574        ns->idle_cpu = -1;
1575
1576        rcu_read_lock();
1577        for_each_cpu(cpu, cpumask_of_node(nid)) {
1578                struct rq *rq = cpu_rq(cpu);
1579
1580                ns->load += cpu_load(rq);
1581                ns->runnable += cpu_runnable(rq);
1582                ns->util += cpu_util(cpu);
1583                ns->nr_running += rq->cfs.h_nr_running;
1584                ns->compute_capacity += capacity_of(cpu);
1585
1586                if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1587                        if (READ_ONCE(rq->numa_migrate_on) ||
1588                            !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1589                                continue;
1590
1591                        if (ns->idle_cpu == -1)
1592                                ns->idle_cpu = cpu;
1593
1594                        idle_core = numa_idle_core(idle_core, cpu);
1595                }
1596        }
1597        rcu_read_unlock();
1598
1599        ns->weight = cpumask_weight(cpumask_of_node(nid));
1600
1601        ns->node_type = numa_classify(env->imbalance_pct, ns);
1602
1603        if (idle_core >= 0)
1604                ns->idle_cpu = idle_core;
1605}
1606
1607static void task_numa_assign(struct task_numa_env *env,
1608                             struct task_struct *p, long imp)
1609{
1610        struct rq *rq = cpu_rq(env->dst_cpu);
1611
1612        /* Check if the run-queue is part of an active NUMA balance. */
1613        if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1614                int cpu;
1615                int start = env->dst_cpu;
1616
1617                /* Find alternative idle CPU. */
1618                for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1619                        if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1620                            !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1621                                continue;
1622                        }
1623
1624                        env->dst_cpu = cpu;
1625                        rq = cpu_rq(env->dst_cpu);
1626                        if (!xchg(&rq->numa_migrate_on, 1))
1627                                goto assign;
1628                }
1629
1630                /* Failed to find an alternative idle CPU */
1631                return;
1632        }
1633
1634assign:
1635        /*
1636         * Clear the previous best_cpu/rq numa-migrate flag, since the task
1637         * has now found a better CPU to move/swap to.
1638         */
1639        if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
1640                rq = cpu_rq(env->best_cpu);
1641                WRITE_ONCE(rq->numa_migrate_on, 0);
1642        }
1643
1644        if (env->best_task)
1645                put_task_struct(env->best_task);
1646        if (p)
1647                get_task_struct(p);
1648
1649        env->best_task = p;
1650        env->best_imp = imp;
1651        env->best_cpu = env->dst_cpu;
1652}
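
/*
 * The rq->numa_migrate_on flag above is claimed with xchg(): a zero return
 * means the destination rq was free and this path now owns it; a non-zero
 * return means another NUMA balance already holds it. The claim is released
 * with WRITE_ONCE(..., 0) either here, once a better CPU is found, or at the
 * end of task_numa_migrate(). A minimal sketch of that idiom (editor's
 * illustration; the helper names are hypothetical and not part of fair.c):
 */
#if 0
static bool try_claim_numa_migrate(struct rq *rq)
{
        /* xchg() returns the old value: 0 means the claim was ours to take. */
        return !xchg(&rq->numa_migrate_on, 1);
}

static void release_numa_migrate(struct rq *rq)
{
        WRITE_ONCE(rq->numa_migrate_on, 0);
}
#endif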
1653
1654static bool load_too_imbalanced(long src_load, long dst_load,
1655                                struct task_numa_env *env)
1656{
1657        long imb, old_imb;
1658        long orig_src_load, orig_dst_load;
1659        long src_capacity, dst_capacity;
1660
1661        /*
1662         * The load is corrected for the CPU capacity available on each node.
1663         *
1664         * src_load        dst_load
1665         * ------------ vs ---------
1666         * src_capacity    dst_capacity
1667         */
1668        src_capacity = env->src_stats.compute_capacity;
1669        dst_capacity = env->dst_stats.compute_capacity;
1670
1671        imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1672
1673        orig_src_load = env->src_stats.load;
1674        orig_dst_load = env->dst_stats.load;
1675
1676        old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1677
1678        /* Would this change make things worse? */
1679        return (imb > old_imb);
1680}
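
/*
 * Worked example (editor's illustration, values assumed): comparing the
 * ratios src_load/src_capacity and dst_load/dst_capacity is done by
 * cross-multiplying so no division is needed. With src_capacity = 2048,
 * dst_capacity = 1024, and original loads of 1200 (src) and 300 (dst),
 * old_imb = |300 * 2048 - 1200 * 1024| = 614400. Moving a task with
 * h_load 200 gives src_load = 1000 and dst_load = 500, so
 * imb = |500 * 2048 - 1000 * 1024| = 0: both nodes end up at ~49% of
 * capacity, imb <= old_imb, and the move is not considered too imbalanced.
 */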
1681
1682/*
1683 * Maximum NUMA importance can be 1998 (2*999);
1684 * SMALLIMP @ 30 would be close to 1998/64.
1685 * Used to deter task migration.
1686 */
1687#define SMALLIMP        30
1688
1689/*
1690 * This checks if the overall compute and NUMA accesses of the system would
1691 * be improved if the source task were migrated to the target dst_cpu, taking
1692 * into account that it might be best if the task running on the dst_cpu were
1693 * exchanged with the source task.
1694 */
1695static bool task_numa_compare(struct task_numa_env *env,
1696                              long taskimp, long groupimp, bool maymove)
1697{
1698        struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1699        struct rq *dst_rq = cpu_rq(env->dst_cpu);
1700        long imp = p_ng ? groupimp : taskimp;
1701        struct task_struct *cur;
1702        long src_load, dst_load;
1703        int dist = env->dist;
1704        long moveimp = imp;
1705        long load;
1706        bool stopsearch = false;
1707
1708        if (READ_ONCE(dst_rq->numa_migrate_on))
1709                return false;
1710
1711        rcu_read_lock();
1712        cur = rcu_dereference(dst_rq->curr);
1713        if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1714                cur = NULL;
1715
1716        /*
1717         * Because we have preemption enabled we can get migrated around and
1718         * end up trying to select ourselves (current == env->p) as a swap candidate.
1719         */
1720        if (cur == env->p) {
1721                stopsearch = true;
1722                goto unlock;
1723        }
1724
1725        if (!cur) {
1726                if (maymove && moveimp >= env->best_imp)
1727                        goto assign;
1728                else
1729                        goto unlock;
1730        }
1731
1732        /* Skip this swap candidate if it cannot be moved to the source CPU. */
1733        if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1734                goto unlock;
1735
1736        /*
1737         * Skip this swap candidate if it is not moving to its preferred
1738         * node and the best task is.
1739         */
1740        if (env->best_task &&
1741            env->best_task->numa_preferred_nid == env->src_nid &&
1742            cur->numa_preferred_nid != env->src_nid) {
1743                goto unlock;
1744        }
1745
1746        /*
1747         * "imp" is the fault differential for the source task between the
1748         * source and destination node. Calculate the total differential for
1749         * the source task and potential destination task. The more negative
1750         * the value is, the more remote accesses that would be expected to
1751         * be incurred if the tasks were swapped.
1752         *
1753         * If dst and source tasks are in the same NUMA group, or not
1754         * in any group then look only at task weights.
1755         */
1756        cur_ng = rcu_dereference(cur->numa_group);
1757        if (cur_ng == p_ng) {
1758                imp = taskimp + task_weight(cur, env->src_nid, dist) -
1759                      task_weight(cur, env->dst_nid, dist);
1760                /*
1761                 * Add some hysteresis to prevent swapping the
1762                 * tasks within a group over tiny differences.
1763                 */
1764                if (cur_ng)
1765                        imp -= imp / 16;
1766        } else {
1767                /*
1768                 * Compare the group weights. If a task is all by itself
1769                 * (not part of a group), use the task weight instead.
1770                 */
1771                if (cur_ng && p_ng)
1772                        imp += group_weight(cur, env->src_nid, dist) -
1773                               group_weight(cur, env->dst_nid, dist);
1774                else
1775                        imp += task_weight(cur, env->src_nid, dist) -
1776                               task_weight(cur, env->dst_nid, dist);
1777        }
1778
1779        /* Discourage picking a task already on its preferred node */
1780        if (cur->numa_preferred_nid == env->dst_nid)
1781                imp -= imp / 16;
1782
1783        /*
1784         * Encourage picking a task that moves to its preferred node.
1785         * This potentially makes imp larger than its maximum of
1786         * 1998 (see SMALLIMP and task_weight for why) but in this
1787         * case, it does not matter.
1788         */
1789        if (cur->numa_preferred_nid == env->src_nid)
1790                imp += imp / 8;
1791
1792        if (maymove && moveimp > imp && moveimp > env->best_imp) {
1793                imp = moveimp;
1794                cur = NULL;
1795                goto assign;
1796        }
1797
1798        /*
1799         * Prefer swapping with a task moving to its preferred node over a
1800         * task that is not.
1801         */
1802        if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1803            env->best_task->numa_preferred_nid != env->src_nid) {
1804                goto assign;
1805        }
1806
1807        /*
1808         * If the NUMA importance is less than SMALLIMP,
1809         * task migration might only result in ping pong
1810         * of tasks and also hurt performance due to cache
1811         * misses.
1812         */
1813        if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1814                goto unlock;
1815
1816        /*
1817         * In the overloaded case, try and keep the load balanced.
1818         */
1819        load = task_h_load(env->p) - task_h_load(cur);
1820        if (!load)
1821                goto assign;
1822
1823        dst_load = env->dst_stats.load + load;
1824        src_load = env->src_stats.load - load;
1825
1826        if (load_too_imbalanced(src_load, dst_load, env))
1827                goto unlock;
1828
1829assign:
1830        /* Evaluate an idle CPU for a task numa move. */
1831        if (!cur) {
1832                int cpu = env->dst_stats.idle_cpu;
1833
1834                /* Nothing cached so current CPU went idle since the search. */
1835                if (cpu < 0)
1836                        cpu = env->dst_cpu;
1837
1838                /*
1839                 * If the CPU is no longer truly idle and the previous best CPU
1840                 * is, keep using it.
1841                 */
1842                if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1843                    idle_cpu(env->best_cpu)) {
1844                        cpu = env->best_cpu;
1845                }
1846
1847                env->dst_cpu = cpu;
1848        }
1849
1850        task_numa_assign(env, cur, imp);
1851
1852        /*
1853         * If a move to idle is allowed because there is capacity or load
1854         * balance improves then stop the search. While a better swap
1855         * candidate may exist, a search is not free.
1856         */
1857        if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1858                stopsearch = true;
1859
1860        /*
1861         * If a swap candidate must be identified and the current best task
1862         * moves to its preferred node, then stop the search.
1863         */
1864        if (!maymove && env->best_task &&
1865            env->best_task->numa_preferred_nid == env->src_nid) {
1866                stopsearch = true;
1867        }
1868unlock:
1869        rcu_read_unlock();
1870
1871        return stopsearch;
1872}
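
/*
 * Worked example (editor's illustration, values assumed) for the swap
 * scoring above: suppose taskimp = 400 and cur shares p's numa_group, with
 * task_weight(cur) of 300 on the source node and 500 on the destination.
 * Then imp = 400 + 300 - 500 = 200, the in-group hysteresis drops it to
 * 200 - 200/16 = 188, and if cur prefers the source node the bonus raises
 * it to 188 + 188/8 = 211. The swap is only recorded if imp is at least
 * SMALLIMP (30) and beats the current best_imp by more than SMALLIMP/2,
 * and if the resulting load difference passes load_too_imbalanced().
 */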
1873
1874static void task_numa_find_cpu(struct task_numa_env *env,
1875                                long taskimp, long groupimp)
1876{
1877        bool maymove = false;
1878        int cpu;
1879
1880        /*
1881         * If dst node has spare capacity, then check if there is an
1882         * imbalance that would be overruled by the load balancer.
1883         */
1884        if (env->dst_stats.node_type == node_has_spare) {
1885                unsigned int imbalance;
1886                int src_running, dst_running;
1887
1888                /*
1889                 * Would movement cause an imbalance? Note that if src has
1890                 * more running tasks then the imbalance is ignored, as the
1891                 * move improves the imbalance from the perspective of the
1892                 * CPU load balancer.
1893                 */
1894                src_running = env->src_stats.nr_running - 1;
1895                dst_running = env->dst_stats.nr_running + 1;
1896                imbalance = max(0, dst_running - src_running);
1897                imbalance = adjust_numa_imbalance(imbalance, dst_running,
1898                                                        env->dst_stats.weight);
1899
1900                /* Use idle CPU if there is no imbalance */
1901                if (!imbalance) {
1902                        maymove = true;
1903                        if (env->dst_stats.idle_cpu >= 0) {
1904                                env->dst_cpu = env->dst_stats.idle_cpu;
1905                                task_numa_assign(env, NULL, 0);
1906                                return;
1907                        }
1908                }
1909        } else {
1910                long src_load, dst_load, load;
1911                /*
1912                 * If the improvement from simply moving env->p is better
1913                 * than swapping tasks around, check if a move is possible.
1914                 */
1915                load = task_h_load(env->p);
1916                dst_load = env->dst_stats.load + load;
1917                src_load = env->src_stats.load - load;
1918                maymove = !load_too_imbalanced(src_load, dst_load, env);
1919        }
1920
1921        for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1922                /* Skip this CPU if the source task cannot migrate */
1923                if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1924                        continue;
1925
1926                env->dst_cpu = cpu;
1927                if (task_numa_compare(env, taskimp, groupimp, maymove))
1928                        break;
1929        }
1930}
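
/*
 * Worked example (editor's illustration, values assumed): if the source
 * node is running 4 tasks and the destination 2, a move would leave
 * src_running = 3 and dst_running = 3, so imbalance = max(0, 3 - 3) = 0.
 * With no imbalance, maymove is set and, if update_numa_stats() cached an
 * idle destination CPU, the task is placed there immediately without
 * scanning the node for swap candidates.
 */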
1931
1932static int task_numa_migrate(struct task_struct *p)
1933{
1934        struct task_numa_env env = {
1935                .p = p,
1936
1937                .src_cpu = task_cpu(p),
1938                .src_nid = task_node(p),
1939
1940                .imbalance_pct = 112,
1941
1942                .best_task = NULL,
1943                .best_imp = 0,
1944                .best_cpu = -1,
1945        };
1946        unsigned long taskweight, groupweight;
1947        struct sched_domain *sd;
1948        long taskimp, groupimp;
1949        struct numa_group *ng;
1950        struct rq *best_rq;
1951        int nid, ret, dist;
1952
1953        /*
1954         * Pick the lowest SD_NUMA domain, as that would have the smallest
1955         * imbalance and would be the first to start moving tasks about.
1956         *
1957         * And we want to avoid any moving of tasks about, as that would create
1958         * random movement of tasks -- counter to the numa conditions we're
1959         * trying to satisfy here.
1960         */
1961        rcu_read_lock();
1962        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1963        if (sd)
1964                env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1965        rcu_read_unlock();
1966
1967        /*
1968         * Cpusets can break the scheduler domain tree into smaller
1969         * balance domains, some of which do not cross NUMA boundaries.
1970         * Tasks that are "trapped" in such domains cannot be migrated
1971         * elsewhere, so there is no point in (re)trying.
1972         */
1973        if (unlikely(!sd)) {
1974                sched_setnuma(p, task_node(p));
1975                return -EINVAL;
1976        }
1977
1978        env.dst_nid = p->numa_preferred_nid;
1979        dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1980        taskweight = task_weight(p, env.src_nid, dist);
1981        groupweight = group_weight(p, env.src_nid, dist);
1982        update_numa_stats(&env, &env.src_stats, env.src_nid, false);
1983        taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1984        groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1985        update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
1986
1987        /* Try to find a spot on the preferred nid. */
1988        task_numa_find_cpu(&env, taskimp, groupimp);
1989
1990        /*
1991         * Look at other nodes in these cases:
1992         * - there is no space available on the preferred_nid
1993         * - the task is part of a numa_group that is interleaved across
1994         *   multiple NUMA nodes; in order to better consolidate the group,
1995         *   we need to check other locations.
1996         */
1997        ng = deref_curr_numa_group(p);
1998        if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
1999                for_each_online_node(nid) {
2000                        if (nid == env.src_nid || nid == p->numa_preferred_nid)
2001                                continue;
2002
2003                        dist = node_distance(env.src_nid, env.dst_nid);
2004                        if (sched_numa_topology_type == NUMA_BACKPLANE &&
2005                                                dist != env.dist) {
2006                                taskweight = task_weight(p, env.src_nid, dist);
2007                                groupweight = group_weight(p, env.src_nid, dist);
2008                        }
2009
2010                        /* Only consider nodes where both the task and the group benefit */
2011                        taskimp = task_weight(p, nid, dist) - taskweight;
2012                        groupimp = group_weight(p, nid, dist) - groupweight;
2013                        if (taskimp < 0 && groupimp < 0)
2014                                continue;
2015
2016                        env.dist = dist;
2017                        env.dst_nid = nid;
2018                        update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2019                        task_numa_find_cpu(&env, taskimp, groupimp);
2020                }
2021        }
2022
2023        /*
2024         * If the task is part of a workload that spans multiple NUMA nodes,
2025         * and is migrating into one of the workload's active nodes, remember
2026         * this node as the task's preferred numa node, so the workload can
2027         * settle down.
2028         * A task that migrated to a second choice node will be better off
2029         * trying for a better one later. Do not set the preferred node here.
2030         */
2031        if (ng) {
2032                if (env.best_cpu == -1)
2033                        nid = env.src_nid;
2034                else
2035                        nid = cpu_to_node(env.best_cpu);
2036
2037                if (nid != p->numa_preferred_nid)
2038                        sched_setnuma(p, nid);
2039        }
2040
2041        /* No better CPU than the current one was found. */
2042        if (env.best_cpu == -1) {
2043                trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2044                return -EAGAIN;
2045        }
2046
2047        best_rq = cpu_rq(env.best_cpu);
2048        if (env.best_task == NULL) {
2049                ret = migrate_task_to(p, env.best_cpu);
2050                WRITE_ONCE(best_rq->numa_migrate_on, 0);
2051                if (ret != 0)
2052                        trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2053                return ret;
2054        }
2055
2056        ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2057        WRITE_ONCE(best_rq->numa_migrate_on, 0);
2058
2059        if (ret != 0)
2060                trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2061        put_task_struct(env.best_task);
2062        return ret;
2063}
2064
2065/* Attempt to migrate a task to a CPU on the preferred node. */
2066static void numa_migrate_preferred(struct task_struct *p)
2067{
2068        unsigned long interval = HZ;
2069
2070        /* This task has no NUMA fault statistics yet */
2071        if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2072                return;
2073
2074        /* Periodically retry migrating the task to the preferred node */
2075        interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2076        p->numa_migrate_retry = jiffies + interval;
2077
2078        /* Success if task is already running on preferred CPU */
2079        if (task_node(p) == p->numa_preferred_nid)
2080                return;
2081
2082        /* Otherwise, try migrate to a CPU on the preferred node */
2083        task_numa_migrate(p);
2084}
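
/*
 * Worked example (editor's illustration, assuming HZ = 1000): with a
 * numa_scan_period of 1000ms, msecs_to_jiffies(1000) / 16 = 62 jiffies,
 * so a failed migration is retried roughly every 62ms; the HZ cap keeps
 * the retry interval at one second at most.
 */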
2085
2086/*
2087 * Find out how many nodes the workload is actively running on. Do this by
2088 * tracking the nodes from which NUMA hinting faults are triggered. This can
2089 * be different from the set of nodes where the workload's memory is currently
2090 * located.
2091 */
2092static void numa_group_count_active_nodes(struct numa_group *numa_group)
2093{
2094        unsigned long faults, max_faults = 0;
2095        int nid, active_nodes = 0;
2096
2097        for_each_online_node(nid) {
2098                faults = group_faults_cpu(numa_group, nid);
2099                if (faults > max_faults)
2100                        max_faults = faults;
2101        }
2102
2103        for_each_online_node(nid) {
2104                faults = group_faults_cpu(numa_group, nid);
2105                if (faults * ACTIVE_NODE_FRACTION > max_faults)
2106                        active_nodes++;
2107        }
2108
2109        numa_group->max_faults_cpu = max_faults;
2110        numa_group->active_nodes = active_nodes;
2111}
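
/*
 * Worked example (editor's illustration; ACTIVE_NODE_FRACTION is defined
 * earlier in this file, 3 at the time of writing): if the busiest node saw
 * 900 CPU faults, a node with 400 faults counts as active (400 * 3 > 900)
 * while a node with 250 does not (750 < 900). The busiest node itself
 * always qualifies.
 */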
2112
2113/*
2114 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2115 * increments. The more local the fault statistics are, the higher the scan
2116 * period will be for the next scan window. If local/(local+remote) ratio is
2117 * below NUMA_PERIOD_THRESHOLD (where the ratio ranges over 1..NUMA_PERIOD_SLOTS)
2118 * the scan period will decrease. Aim for 70% local accesses.
2119 */
2120#define NUMA_PERIOD_SLOTS 10
2121#define NUMA_PERIOD_THRESHOLD 7
2122
2123/*
2124 * Increase the scan period (slow down scanning) if the majority of
2125 * our memory is already on our local node, or if the majority of
2126 * the page accesses are shared with other processes.
2127 * Otherwise, decrease the scan period.
2128 */
2129static void update_task_scan_period(struct task_struct *p,
2130                        unsigned long shared, unsigned long private)
2131{
2132        unsigned int period_slot;
2133        int lr_ratio, ps_ratio;
2134        int diff;
2135
2136        unsigned long remote = p->numa_faults_locality[0];
2137        unsigned long local = p->numa_faults_locality[1];
2138
2139        /*
2140         * If there were no recorded hinting faults then either the task is
2141         * completely idle or all activity is in areas that are not of interest
2142         * to automatic numa balancing. Related to that, if there were failed
2143         * migrations then it implies we are migrating too quickly or the local
2144         * node is overloaded. In either case, scan slower.
2145         */
2146        if (local + shared == 0 || p->numa_faults_locality[2]) {
2147                p->numa_scan_period = min(p->numa_scan_period_max,
2148                        p->numa_scan_period << 1);
2149
2150                p->mm->numa_next_scan = jiffies +
2151                        msecs_to_jiffies(p->numa_scan_period);
2152
2153                return;
2154        }
2155
2156        /*
2157         * Prepare to scale scan period relative to the current period.
2158         *       == NUMA_PERIOD_THRESHOLD scan period stays the same
2159         *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2160         *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2161         */
2162        period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2163        lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2164        ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2165
2166        if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2167                /*
2168                 * Most memory accesses are local. There is no need to
2169                 * do fast NUMA scanning, since memory is already local.
2170                 */
2171                int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2172                if (!slot)
2173                        slot = 1;
2174                diff = slot * period_slot;
2175        } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2176                /*
2177                 * Most memory accesses are shared with other tasks.
2178                 * There is no point in continuing fast NUMA scanning,
2179                 * since other tasks may just move the memory elsewhere.
2180                 */
2181                int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2182                if (!slot)
2183                        slot = 1;
2184                diff = slot * period_slot;
2185        } else {
2186                /*
2187                 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2188                 * yet they are not on the local NUMA node. Speed up
2189                 * NUMA scanning to get the memory moved over.
2190                 */
2191                int ratio = max(lr_ratio, ps_ratio);
2192                diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2193        }
2194
2195        p->numa_scan_period = clamp(p->numa_scan_period + diff,
2196                        task_scan_min(p), task_scan_max(p));
2197        memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2198}
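
/*
 * Worked example (editor's illustration, values assumed): with a current
 * numa_scan_period of 1000ms, period_slot = DIV_ROUND_UP(1000, 10) = 100.
 * For local = 80, remote = 20, private = 80, shared = 20: ps_ratio = 8,
 * which is >= NUMA_PERIOD_THRESHOLD, so slot = 1 and the period grows to
 * 1100ms (scan slower). For local = 40, remote = 60, private = 50,
 * shared = 50: lr_ratio = 4 and ps_ratio = 5, so diff = -(7 - 5) * 100 =
 * -200 and the period shrinks to 800ms (scan faster), clamped to
 * [task_scan_min(), task_scan_max()].
 */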
2199
2200/*
2201 * Get the fraction of time the task has been running since the last
2202 * NUMA placement cycle. The scheduler keeps similar statistics, but
2203 * decays those on a 32ms period, which is orders of magnitude off
2204 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2205 * stats only if the task is so new there are no NUMA statistics yet.
2206 */
2207static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2208{
2209        u64 runtime, delta, now;
2210        /* Use the start of this time slice to avoid calculations. */
2211        now = p->se.exec_start;
2212        runtime = p->se.sum_exec_runtime;
2213
2214        if (p->last_task_numa_placement) {
2215                delta = runtime - p->last_sum_exec_runtime;
2216                *period = now - p->last_task_numa_placement;
2217
2218                /* Avoid time going backwards, prevent potential divide error: */
2219                if (unlikely((s64)*period < 0))
2220                        *period = 0;
2221        } else {
2222                delta = p->se.avg.load_sum;
2223                *period = LOAD_AVG_MAX;
2224        }
2225
2226        p->last_sum_exec_runtime = runtime;
2227        p->last_task_numa_placement = now;
2228
2229        return delta;
2230}
2231
2232/*
2233 * Determine the preferred nid for a task in a numa_group. This needs to
2234 * be done in a way that produces consistent results with group_weight,
2235 * otherwise workloads might not converge.
2236 */
2237static int preferred_group_nid(struct task_struct *p, int nid)
2238{
2239        nodemask_t nodes;
2240        int dist;
2241
2242        /* Direct connections between all NUMA nodes. */
2243        if (sched_numa_topology_type == NUMA_DIRECT)
2244                return nid;
2245
2246        /*
2247         * On a system with glueless mesh NUMA topology, group_weight
2248         * scores nodes according to the number of NUMA hinting faults on
2249         * both the node itself, and on nearby nodes.
2250         */
2251        if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2252                unsigned long score, max_score = 0;
2253                int node, max_node = nid;
2254
2255                dist = sched_max_numa_distance;
2256
2257                for_each_online_node(node) {
2258                        score = group_weight(p, node, dist);
2259                        if (score > max_score) {
2260                                max_score = score;
2261                                max_node = node;
2262                        }
2263                }
2264                return max_node;
2265        }
2266
2267        /*
2268         * Finding the preferred nid in a system with NUMA backplane
2269         * interconnect topology is more involved. The goal is to locate
2270         * tasks from numa_groups near each other in the system, and
2271         * untangle workloads from different sides of the system. This requires
2272         * searching down the hierarchy of node groups, recursively searching
2273         * inside the highest scoring group of nodes. The nodemask tricks
2274         * keep the complexity of the search down.
2275         */
2276        nodes = node_online_map;
2277        for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2278                unsigned long max_faults = 0;
2279                nodemask_t max_group = NODE_MASK_NONE;
2280                int a, b;
2281
2282                /* Are there nodes at this distance from each other? */
2283                if (!find_numa_distance(dist))
2284                        continue;
2285
2286                for_each_node_mask(a, nodes) {
2287                        unsigned long faults = 0;
2288                        nodemask_t this_group;
2289                        nodes_clear(this_group);
2290
2291                        /* Sum group's NUMA faults; includes a==b case. */
2292                        for_each_node_mask(b, nodes) {
2293                                if (node_distance(a, b) < dist) {
2294                                        faults += group_faults(p, b);
2295                                        node_set(b, this_group);
2296                                        node_clear(b, nodes);
2297                                }
2298                        }
2299
2300                        /* Remember the top group. */
2301                        if (faults > max_faults) {
2302                                max_faults = faults;
2303                                max_group = this_group;
2304                                /*
2305                                 * subtle: at the smallest distance there is
2306                                 * just one node left in each "group", the
2307                                 * winner is the preferred nid.
2308                                 */
2309                                nid = a;
2310                        }
2311                }
2312                /* Next round, evaluate the nodes within max_group. */
2313                if (!max_faults)
2314                        break;
2315                nodes = max_group;
2316        }
2317        return nid;
2318}
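
/*
 * Worked example (editor's illustration, values assumed) for the backplane
 * search above: take four nodes where the distance is 20 within a pair
 * ({0,1} and {2,3}) and 40 across pairs, with sched_max_numa_distance = 40.
 * At dist = 40 each node is grouped with the peers closer than 40, i.e. the
 * pairs; if {0,1} has 600 group faults and {2,3} has 300, nodes is narrowed
 * to {0,1}. At dist = 20 each "group" is a single node, so with 350 faults
 * on node 0 and 250 on node 1 the function returns nid = 0.
 */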
2319
2320static void task_numa_placement(struct task_struct *p)
2321{
2322        int seq, nid, max_nid = NUMA_NO_NODE;
2323        unsigned long max_faults = 0;
2324        unsigned long fault_types[2] = { 0, 0 };
2325        unsigned long total_faults;
2326        u64 runtime, period;
2327        spinlock_t *group_lock = NULL;
2328        struct numa_group *ng;
2329
2330        /*
2331         * The p->mm->numa_scan_seq field gets updated without
2332         * exclusive access. Use READ_ONCE() here to ensure
2333         * that the field is read in a single access:
2334         */
2335        seq = READ_ONCE(p->mm->numa_scan_seq);
2336        if (p->numa_scan_seq == seq)
2337                return;
2338        p->numa_scan_seq = seq;
2339        p->numa_scan_period_max = task_scan_max(p);
2340
2341        total_faults = p->numa_faults_locality[0] +
2342                       p->numa_faults_locality[1];
2343        runtime = numa_get_avg_runtime(p, &period);
2344
2345        /* If the task is part of a group prevent parallel updates to group stats */
2346        ng = deref_curr_numa_group(p);
2347        if (ng) {
2348                group_lock = &ng->lock;
2349                spin_lock_irq(group_lock);
2350        }
2351
2352        /* Find the node with the highest number of faults */
2353        for_each_online_node(nid) {
2354                /* Keep track of the offsets in numa_faults array */
2355                int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2356                unsigned long faults = 0, group_faults = 0;
2357                int priv;
2358
2359                for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2360                        long diff, f_diff, f_weight;
2361
2362                        mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2363                        membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2364                        cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2365                        cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2366
2367                        /* Decay existing window, copy faults since last scan */
2368                        diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2369                        fault_types[priv] += p->numa_faults[membuf_idx];
2370                        p->numa_faults[membuf_idx] = 0;
2371
2372                        /*
2373                         * Normalize the faults_from, so all tasks in a group
2374                         * count according to CPU use, instead of by the raw
2375                         * number of faults. Tasks with little runtime have
2376                         * little overall impact on throughput, and thus their
2377                         * faults are less important.
2378                         */
2379                        f_weight = div64_u64(runtime << 16, period + 1);
2380                        f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2381                                   (total_faults + 1);
2382                        f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2383                        p->numa_faults[cpubuf_idx] = 0;
2384
2385                        p->numa_faults[mem_idx] += diff;
2386                        p->numa_faults[cpu_idx] += f_diff;
2387                        faults += p->numa_faults[mem_idx];
2388                        p->total_numa_faults += diff;
2389                        if (ng) {
2390                                /*
2391                                 * safe because we can only change our own group
2392                                 *
2393                                 * mem_idx represents the offset for a given
2394                                 * nid and priv in a specific region because it
2395                                 * is at the beginning of the numa_faults array.
2396                                 */
2397                                ng->faults[mem_idx] += diff;
2398                                ng->faults_cpu[mem_idx] += f_diff;
2399                                ng->total_faults += diff;
2400                                group_faults += ng->faults[mem_idx];
2401                        }
2402                }
2403
2404                if (!ng) {
2405                        if (faults > max_faults) {
2406                                max_faults = faults;
2407                                max_nid = nid;
2408                        }
2409                } else if (group_faults > max_faults) {
2410                        max_faults = group_faults;
2411                        max_nid = nid;
2412                }
2413        }
2414
2415        if (ng) {
2416                numa_group_count_active_nodes(ng);
2417                spin_unlock_irq(group_lock);
2418                max_nid = preferred_group_nid(p, max_nid);
2419        }
2420
2421        if (max_faults) {
2422                /* Set the new preferred node */
2423                if (max_nid != p->numa_preferred_nid)
2424                        sched_setnuma(p, max_nid);
2425        }
2426
2427        update_task_scan_period(p, fault_types[0], fault_types[1]);
2428}
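
/*
 * Worked example (editor's illustration, values assumed) for the decay in
 * the loop above: if numa_faults[mem_idx] held 100 and 40 new faults were
 * buffered in numa_faults[membuf_idx], then diff = 40 - 100/2 = -10 and the
 * stored value becomes 90, i.e. half the old window plus the new one. The
 * CPU-fault side is decayed the same way but weighted in 16.16 fixed point:
 * a task that ran for half of the placement period gets
 * f_weight ~= 65536 / 2 = 32768 before being scaled by its share of this
 * window's faults, so tasks with little runtime contribute little to the
 * group's CPU-fault statistics.
 */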
2429
2430static inline int get_numa_group(struct numa_group *grp)
2431{
2432        return refcount_inc_not_zero(&grp->refcount);
2433}
2434
2435static inline void put_numa_group(struct numa_group *grp)
2436{
2437        if (refcount_dec_and_test(&grp->refcount))
2438                kfree_rcu(grp, rcu);
2439}
2440
2441static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2442                        int *priv)
2443{
2444        struct numa_group *grp, *my_grp;
2445        struct task_struct *tsk;
2446        bool join = false;
2447        int cpu = cpupid_to_cpu(cpupid);
2448        int i;
2449
2450        if (unlikely(!deref_curr_numa_group(p))) {
2451                unsigned int size = sizeof(struct numa_group) +
2452                                    4*nr_node_ids*sizeof(unsigned long);
2453
2454                grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2455                if (!grp)
2456                        return;
2457
2458                refcount_set(&grp->refcount, 1);
2459                grp->active_nodes = 1;
2460                grp->max_faults_cpu = 0;
2461                spin_lock_init(&grp->lock);
2462                grp->gid = p->pid;
2463                /* Second half of the array tracks nids where faults happen */
2464                grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2465                                                nr_node_ids;
2466
2467                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2468                        grp->faults[i] = p->numa_faults[i];
2469
2470                grp->total_faults = p->total_numa_faults;
2471
2472                grp->nr_tasks++;
2473                rcu_assign_pointer(p->numa_group, grp);
2474        }
2475
2476        rcu_read_lock();
2477        tsk = READ_ONCE(cpu_rq(cpu)->curr);
2478
2479        if (!cpupid_match_pid(tsk, cpupid))
2480                goto no_join;
2481
2482        grp = rcu_dereference(tsk->numa_group);
2483        if (!grp)
2484                goto no_join;
2485
2486        my_grp = deref_curr_numa_group(p);
2487        if (grp == my_grp)
2488                goto no_join;
2489
2490        /*
2491         * Only join the other group if it's bigger; if we're the bigger group,
2492         * the other task will join us.
2493         */
2494        if (my_grp->nr_tasks > grp->nr_tasks)
2495                goto no_join;
2496
2497        /*
2498         * Tie-break on the grp address.
2499         */
2500        if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2501                goto no_join;
2502
2503        /* Always join threads in the same process. */
2504        if (tsk->mm == current->mm)
2505                join = true;
2506
2507        /* Simple filter to avoid false positives due to PID collisions */
2508        if (flags & TNF_SHARED)
2509                join = true;
2510
2511        /* Update priv based on whether false sharing was detected */
2512        *priv = !join;
2513
2514        if (join && !get_numa_group(grp))
2515                goto no_join;
2516
2517        rcu_read_unlock();
2518
2519        if (!join)
2520                return;
2521
2522        BUG_ON(irqs_disabled());
2523        double_lock_irq(&my_grp->lock, &grp->lock);
2524
2525        for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2526                my_grp->faults[i] -= p->numa_faults[i];
2527                grp->faults[i] += p->numa_faults[i];
2528        }
2529        my_grp->total_faults -= p->total_numa_faults;
2530        grp->total_faults += p->total_numa_faults;
2531
2532        my_grp->nr_tasks--;
2533        grp->nr_tasks++;
2534
2535        spin_unlock(&my_grp->lock);
2536        spin_unlock_irq(&grp->lock);
2537
2538        rcu_assign_pointer(p->numa_group, grp);
2539
2540        put_numa_group(my_grp);
2541        return;
2542
2543no_join:
2544        rcu_read_unlock();
2545        return;
2546}
2547
2548/*
2549 * Get rid of NUMA statistics associated with a task (either current or dead).
2550 * If @final is set, the task is dead and has reached refcount zero, so we can
2551 * safely free all relevant data structures. Otherwise, there might be
2552 * concurrent reads from places like load balancing and procfs, and we should
2553 * reset the data back to default state without freeing ->numa_faults.
2554 */
2555void task_numa_free(struct task_struct *p, bool final)
2556{
2557        /* safe: p either is current or is being freed by current */
2558        struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2559        unsigned long *numa_faults = p->numa_faults;
2560        unsigned long flags;
2561        int i;
2562
2563        if (!numa_faults)
2564                return;
2565
2566        if (grp) {
2567                spin_lock_irqsave(&grp->lock, flags);
2568                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2569                        grp->faults[i] -= p->numa_faults[i];
2570                grp->total_faults -= p->total_numa_faults;
2571
2572                grp->nr_tasks--;
2573                spin_unlock_irqrestore(&grp->lock, flags);
2574                RCU_INIT_POINTER(p->numa_group, NULL);
2575                put_numa_group(grp);
2576        }
2577
2578        if (final) {
2579                p->numa_faults = NULL;
2580                kfree(numa_faults);
2581        } else {
2582                p->total_numa_faults = 0;
2583                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2584                        numa_faults[i] = 0;
2585        }
2586}
2587
2588/*
2589 * Got a PROT_NONE fault for a page on @node.
2590 */
2591void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2592{
2593        struct task_struct *p = current;
2594        bool migrated = flags & TNF_MIGRATED;
2595        int cpu_node = task_node(current);
2596        int local = !!(flags & TNF_FAULT_LOCAL);
2597        struct numa_group *ng;
2598        int priv;
2599
2600        if (!static_branch_likely(&sched_numa_balancing))
2601                return;
2602
2603        /* for example, ksmd faulting in a user's mm */
2604        if (!p->mm)
2605                return;
2606
2607        /* Allocate buffer to track faults on a per-node basis */
2608        if (unlikely(!p->numa_faults)) {
2609                int size = sizeof(*p->numa_faults) *
2610                           NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2611
2612                p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2613                if (!p->numa_faults)
2614                        return;
2615
2616                p->total_numa_faults = 0;
2617                memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2618        }
2619
2620        /*
2621         * First accesses are treated as private, otherwise consider accesses
2622         * to be private if the accessing pid has not changed
2623         */
2624        if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2625                priv = 1;
2626        } else {
2627                priv = cpupid_match_pid(p, last_cpupid);
2628                if (!priv && !(flags & TNF_NO_GROUP))
2629                        task_numa_group(p, last_cpupid, flags, &priv);
2630        }
2631
2632        /*
2633         * If a workload spans multiple NUMA nodes, a shared fault that
2634         * occurs wholly within the set of nodes that the workload is
2635         * actively using should be counted as local. This allows the
2636         * scan rate to slow down when a workload has settled down.
2637         */
2638        ng = deref_curr_numa_group(p);
2639        if (!priv && !local && ng && ng->active_nodes > 1 &&
2640                                numa_is_active_node(cpu_node, ng) &&
2641                                numa_is_active_node(mem_node, ng))
2642                local = 1;
2643
2644        /*
2645         * Retry to migrate task to preferred node periodically, in case it
2646         * previously failed, or the scheduler moved us.
2647         */
2648        if (time_after(jiffies, p->numa_migrate_retry)) {
2649                task_numa_placement(p);
2650                numa_migrate_preferred(p);
2651        }
2652
2653        if (migrated)
2654                p->numa_pages_migrated += pages;
2655        if (flags & TNF_MIGRATE_FAIL)
2656                p->numa_faults_locality[2] += pages;
2657
2658        p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2659        p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2660        p->numa_faults_locality[local] += pages;
2661}
2662
2663static void reset_ptenuma_scan(struct task_struct *p)
2664{
2665        /*
2666         * We only did a read acquisition of the mmap sem, so
2667         * p->mm->numa_scan_seq is written to without exclusive access
2668         * and the update is not guaranteed to be atomic. That's not
2669         * much of an issue though, since this is just used for
2670         * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2671         * expensive, to avoid any form of compiler optimizations:
2672         */
2673        WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2674        p->mm->numa_scan_offset = 0;
2675}
2676
2677/*
2678 * The expensive part of numa migration is done from task_work context.
2679 * Triggered from task_tick_numa().
2680 */
2681static void task_numa_work(struct callback_head *work)
2682{
2683        unsigned long migrate, next_scan, now = jiffies;
2684        struct task_struct *p = current;
2685        struct mm_struct *mm = p->mm;
2686        u64 runtime = p->se.sum_exec_runtime;
2687        struct vm_area_struct *vma;
2688        unsigned long start, end;
2689        unsigned long nr_pte_updates = 0;
2690        long pages, virtpages;
2691
2692        SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2693
2694        work->next = work;
2695        /*
2696         * Who cares about NUMA placement when they're dying.
2697         *
2698         * NOTE: make sure not to dereference p->mm before this check,
2699         * exit_task_work() happens _after_ exit_mm() so we could be called
2700         * without p->mm even though we still had it when we enqueued this
2701         * work.
2702         */
2703        if (p->flags & PF_EXITING)
2704                return;
2705
2706        if (!mm->numa_next_scan) {
2707                mm->numa_next_scan = now +
2708                        msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2709        }
2710
2711        /*
2712         * Enforce maximal scan/migration frequency..
2713         */
2714        migrate = mm->numa_next_scan;
2715        if (time_before(now, migrate))
2716                return;
2717
2718        if (p->numa_scan_period == 0) {
2719                p->numa_scan_period_max = task_scan_max(p);
2720                p->numa_scan_period = task_scan_start(p);
2721        }
2722
2723        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2724        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2725                return;
2726
2727        /*
2728         * Delay this task enough that another task of this mm will likely win
2729         * the next time around.
2730         */
2731        p->node_stamp += 2 * TICK_NSEC;
2732
2733        start = mm->numa_scan_offset;
2734        pages = sysctl_numa_balancing_scan_size;
2735        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2736        virtpages = pages * 8;     /* Scan up to this much virtual space */
2737        if (!pages)
2738                return;
2739
2740
2741        if (!mmap_read_trylock(mm))
2742                return;
2743        vma = find_vma(mm, start);
2744        if (!vma) {
2745                reset_ptenuma_scan(p);
2746                start = 0;
2747                vma = mm->mmap;
2748        }
2749        for (; vma; vma = vma->vm_next) {
2750                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2751                        is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2752                        continue;
2753                }
2754
2755                /*
2756                 * Shared library pages mapped by multiple processes are not
2757                 * migrated as it is expected they are cache replicated. Avoid
2758                 * hinting faults in read-only file-backed mappings or the vdso
2759                 * as migrating the pages will be of marginal benefit.
2760                 */
2761                if (!vma->vm_mm ||
2762                    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2763                        continue;
2764
2765                /*
2766                 * Skip inaccessible VMAs to avoid any confusion between
2767                 * PROT_NONE and NUMA hinting ptes
2768                 */
2769                if (!vma_is_accessible(vma))
2770                        continue;
2771
2772                do {
2773                        start = max(start, vma->vm_start);
2774                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2775                        end = min(end, vma->vm_end);
2776                        nr_pte_updates = change_prot_numa(vma, start, end);
2777
2778                        /*
2779                         * Try to scan sysctl_numa_balancing_scan_size worth of
2780                         * hpages that have at least one present PTE that
2781                         * is not already pte-numa. If the VMA contains
2782                         * areas that are unused or already full of prot_numa
2783                         * PTEs, scan up to virtpages, to skip through those
2784                         * areas faster.
2785                         */
2786                        if (nr_pte_updates)
2787                                pages -= (end - start) >> PAGE_SHIFT;
2788                        virtpages -= (end - start) >> PAGE_SHIFT;
2789
2790                        start = end;
2791                        if (pages <= 0 || virtpages <= 0)
2792                                goto out;
2793
2794                        cond_resched();
2795                } while (end != vma->vm_end);
2796        }
2797
2798out:
2799        /*
2800         * It is possible to reach the end of the VMA list but the last few
2801         * VMAs are not guaranteed to be vma_migratable. If they are not, we
2802         * would find the !migratable VMA on the next scan but not reset the
2803         * scanner to the start so check it now.
2804         */
2805        if (vma)
2806                mm->numa_scan_offset = start;
2807        else
2808                reset_ptenuma_scan(p);
2809        mmap_read_unlock(mm);
2810
2811        /*
2812         * Make sure tasks use at least 32x as much time to run other code
2813         * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2814         * Usually update_task_scan_period slows down scanning enough; on an
2815         * overloaded system we need to limit overhead on a per task basis.
2816         */
2817        if (unlikely(p->se.sum_exec_runtime != runtime)) {
2818                u64 diff = p->se.sum_exec_runtime - runtime;
2819                p->node_stamp += 32 * diff;
2820        }
2821}
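
/*
 * Scan-window arithmetic (editor's illustration, assuming the default
 * sysctl_numa_balancing_scan_size of 256MB and 4K pages): pages becomes
 * 256 << 8 = 65536 pages per scan pass, and virtpages = 8 * 65536 = 524288
 * pages, i.e. up to 2GB of virtual address space may be walked to find that
 * many present, not-yet-prot_numa PTEs. The final node_stamp += 32 * diff
 * charges 32x the time actually spent here, which together with the scan
 * itself bounds the overhead to about 1/33, the ~3% mentioned above.
 */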
2822
2823void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2824{
2825        int mm_users = 0;
2826        struct mm_struct *mm = p->mm;
2827
2828        if (mm) {
2829                mm_users = atomic_read(&mm->mm_users);
2830                if (mm_users == 1) {
2831                        mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2832                        mm->numa_scan_seq = 0;
2833                }
2834        }
2835        p->node_stamp                   = 0;
2836        p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
2837        p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
2838        /* Protect against double add, see task_tick_numa and task_numa_work */
2839        p->numa_work.next               = &p->numa_work;
2840        p->numa_faults                  = NULL;
2841        RCU_INIT_POINTER(p->numa_group, NULL);
2842        p->last_task_numa_placement     = 0;
2843        p->last_sum_exec_runtime        = 0;
2844
2845        init_task_work(&p->numa_work, task_numa_work);
2846
2847        /* New address space, reset the preferred nid */
2848        if (!(clone_flags & CLONE_VM)) {
2849                p->numa_preferred_nid = NUMA_NO_NODE;
2850                return;
2851        }
2852
2853        /*
2854         * New thread, keep existing numa_preferred_nid which should be copied
2855         * already by arch_dup_task_struct but stagger when scans start.
2856         */
2857        if (mm) {
2858                unsigned int delay;
2859
2860                delay = min_t(unsigned int, task_scan_max(current),
2861                        current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2862                delay += 2 * TICK_NSEC;
2863                p->node_stamp = delay;
2864        }
2865}
2866
2867/*
2868 * Drive the periodic memory faults..
2869 */
2870static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2871{
2872        struct callback_head *work = &curr->numa_work;
2873        u64 period, now;
2874
2875        /*
2876         * We don't care about NUMA placement if we don't have memory.
2877         */
2878        if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2879                return;
2880
2881        /*
2882         * Using runtime rather than walltime has the dual advantage that
2883         * we (mostly) drive the selection from busy threads and that the
2884         * task needs to have done some actual work before we bother with
2885         * NUMA placement.
2886         */
2887        now = curr->se.sum_exec_runtime;
2888        period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2889
2890        if (now > curr->node_stamp + period) {
2891                if (!curr->node_stamp)
2892                        curr->numa_scan_period = task_scan_start(curr);
2893                curr->node_stamp += period;
2894
2895                if (!time_before(jiffies, curr->mm->numa_next_scan))
2896                        task_work_add(curr, work, TWA_RESUME);
2897        }
2898}
2899
2900static void update_scan_period(struct task_struct *p, int new_cpu)
2901{
2902        int src_nid = cpu_to_node(task_cpu(p));
2903        int dst_nid = cpu_to_node(new_cpu);
2904
2905        if (!static_branch_likely(&sched_numa_balancing))
2906                return;
2907
2908        if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2909                return;
2910
2911        if (src_nid == dst_nid)
2912                return;
2913
2914        /*
2915         * Allow resets if faults have been trapped before one scan
2916         * has completed. This is most likely due to a new task that
2917         * is pulled cross-node due to wakeups or load balancing.
2918         */
2919        if (p->numa_scan_seq) {
2920                /*
2921                 * Avoid scan adjustments if moving to the preferred
2922                 * node or if the task was not previously running on
2923                 * the preferred node.
2924                 */
2925                if (dst_nid == p->numa_preferred_nid ||
2926                    (p->numa_preferred_nid != NUMA_NO_NODE &&
2927                        src_nid != p->numa_preferred_nid))
2928                        return;
2929        }
2930
2931        p->numa_scan_period = task_scan_start(p);
2932}
2933
2934#else
2935static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2936{
2937}
2938
2939static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2940{
2941}
2942
2943static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2944{
2945}
2946
2947static inline void update_scan_period(struct task_struct *p, int new_cpu)
2948{
2949}
2950
2951#endif /* CONFIG_NUMA_BALANCING */
2952
2953static void
2954account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2955{
2956        update_load_add(&cfs_rq->load, se->load.weight);
2957#ifdef CONFIG_SMP
2958        if (entity_is_task(se)) {
2959                struct rq *rq = rq_of(cfs_rq);
2960
2961                account_numa_enqueue(rq, task_of(se));
2962                list_add(&se->group_node, &rq->cfs_tasks);
2963        }
2964#endif
2965        cfs_rq->nr_running++;
2966}
2967
2968static void
2969account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2970{
2971        update_load_sub(&cfs_rq->load, se->load.weight);
2972#ifdef CONFIG_SMP
2973        if (entity_is_task(se)) {
2974                account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2975                list_del_init(&se->group_node);
2976        }
2977#endif
2978        cfs_rq->nr_running--;
2979}
2980
2981/*
2982 * Signed add and clamp on underflow.
2983 *
2984 * Explicitly do a load-store to ensure the intermediate value never hits
2985 * memory. This allows lockless observations without ever seeing the negative
2986 * values.
2987 */
2988#define add_positive(_ptr, _val) do {                           \
2989        typeof(_ptr) ptr = (_ptr);                              \
2990        typeof(_val) val = (_val);                              \
2991        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
2992                                                                \
2993        res = var + val;                                        \
2994                                                                \
2995        if (val < 0 && res > var)                               \
2996                res = 0;                                        \
2997                                                                \
2998        WRITE_ONCE(*ptr, res);                                  \
2999} while (0)
3000
3001/*
3002 * Unsigned subtract and clamp on underflow.
3003 *
3004 * Explicitly do a load-store to ensure the intermediate value never hits
3005 * memory. This allows lockless observations without ever seeing the negative
3006 * values.
3007 */
3008#define sub_positive(_ptr, _val) do {                           \
3009        typeof(_ptr) ptr = (_ptr);                              \
3010        typeof(*ptr) val = (_val);                              \
3011        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
3012        res = var - val;                                        \
3013        if (res > var)                                          \
3014                res = 0;                                        \
3015        WRITE_ONCE(*ptr, res);                                  \
3016} while (0)
3017
3018/*
3019 * Remove and clamp on negative, from a local variable.
3020 *
3021 * A variant of sub_positive(), which does not use explicit load-store
3022 * and is thus optimized for local variable updates.
3023 */
3024#define lsub_positive(_ptr, _val) do {                          \
3025        typeof(_ptr) ptr = (_ptr);                              \
3026        *ptr -= min_t(typeof(*ptr), *ptr, _val);                \
3027} while (0)
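
    /*
     * Illustrative example (not part of the original code): with an unsigned
     * field such as cfs_rq->avg.load_avg holding 100, sub_positive(&x, 150)
     * computes 100 - 150, detects the wrap-around (res > var) and stores 0
     * instead of a huge unsigned value. The READ_ONCE()/WRITE_ONCE() pair
     * guarantees that a lockless reader only ever observes either the old
     * value (100) or the clamped result (0), never a wrapped intermediate.
     * lsub_positive() gives the same clamping for a local variable, where no
     * concurrent reader exists and the load-store dance is unnecessary.
     */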
3028
3029#ifdef CONFIG_SMP
3030static inline void
3031enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3032{
3033        cfs_rq->avg.load_avg += se->avg.load_avg;
3034        cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3035}
3036
3037static inline void
3038dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3039{
3040        u32 divider = get_pelt_divider(&se->avg);
3041        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3042        cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3043}
3044#else
3045static inline void
3046enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3047static inline void
3048dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3049#endif
3050
3051static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3052                            unsigned long weight)
3053{
3054        if (se->on_rq) {
3055                /* commit outstanding execution time */
3056                if (cfs_rq->curr == se)
3057                        update_curr(cfs_rq);
3058                update_load_sub(&cfs_rq->load, se->load.weight);
3059        }
3060        dequeue_load_avg(cfs_rq, se);
3061
3062        update_load_set(&se->load, weight);
3063
3064#ifdef CONFIG_SMP
3065        do {
3066                u32 divider = get_pelt_divider(&se->avg);
3067
3068                se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3069        } while (0);
3070#endif
3071
3072        enqueue_load_avg(cfs_rq, se);
3073        if (se->on_rq)
3074                update_load_add(&cfs_rq->load, se->load.weight);
3075
3076}
3077
3078void reweight_task(struct task_struct *p, int prio)
3079{
3080        struct sched_entity *se = &p->se;
3081        struct cfs_rq *cfs_rq = cfs_rq_of(se);
3082        struct load_weight *load = &se->load;
3083        unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3084
3085        reweight_entity(cfs_rq, se, weight);
3086        load->inv_weight = sched_prio_to_wmult[prio];
3087}
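
    /*
     * Illustrative note (not from the original source): reweight_task() is
     * handed an index into sched_prio_to_weight[], where nice 0 maps to 1024,
     * adjacent nice levels differ by roughly 1.25x, and the extremes are
     * 88761 (nice -20) and 15 (nice 19). On 64-bit, scale_load() additionally
     * shifts the weight up by SCHED_FIXEDPOINT_SHIFT for extra fixed-point
     * resolution; se_weight()/scale_load_down() undo that shift wherever PELT
     * needs the unscaled value.
     */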
3088
3089#ifdef CONFIG_FAIR_GROUP_SCHED
3090#ifdef CONFIG_SMP
3091/*
3092 * All this does is approximate the hierarchical proportion which includes that
3093 * global sum we all love to hate.
3094 *
3095 * That is, the weight of a group entity is the proportional share of the
3096 * group weight based on the group runqueue weights. That is:
3097 *
3098 *                     tg->weight * grq->load.weight
3099 *   ge->load.weight = -----------------------------               (1)
3100 *                       \Sum grq->load.weight
3101 *
3102 * Now, because that sum is prohibitively expensive to compute (been
3103 * there, done that) we approximate it with this average stuff. The average
3104 * moves slower and therefore the approximation is cheaper and more stable.
3105 *
3106 * So instead of the above, we substitute:
3107 *
3108 *   grq->load.weight -> grq->avg.load_avg                         (2)
3109 *
3110 * which yields the following:
3111 *
3112 *                     tg->weight * grq->avg.load_avg
3113 *   ge->load.weight = ------------------------------              (3)
3114 *                             tg->load_avg
3115 *
3116 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3117 *
3118 * That is shares_avg, and it is right (given the approximation (2)).
3119 *
3120 * The problem with it is that because the average is slow -- it was designed
3121 * to be exactly that of course -- this leads to transients in boundary
3122 * conditions. Specifically, the case where the group was idle and we start
3123 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
3124 * yielding bad latency etc..
3125 *
3126 * Now, in that special case (1) reduces to:
3127 *
3128 *                     tg->weight * grq->load.weight
3129 *   ge->load.weight = ----------------------------- = tg->weight   (4)
3130 *                         grq->load.weight
3131 *
3132 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3133 *
3134 * So what we do is modify our approximation (3) to approach (4) in the (near)
3135 * UP case, like:
3136 *
3137 *   ge->load.weight =
3138 *
3139 *              tg->weight * grq->load.weight
3140 *     ---------------------------------------------------         (5)
3141 *     tg->load_avg - grq->avg.load_avg + grq->load.weight
3142 *
3143 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3144 * we need to use grq->avg.load_avg as its lower bound, which then gives:
3145 *
3146 *
3147 *                     tg->weight * grq->load.weight
3148 *   ge->load.weight = -----------------------------               (6)
3149 *                             tg_load_avg'
3150 *
3151 * Where:
3152 *
3153 *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3154 *                  max(grq->load.weight, grq->avg.load_avg)
3155 *
3156 * And that is shares_weight and is icky. In the (near) UP case it approaches
3157 * (4) while in the normal case it approaches (3). It consistently
3158 * overestimates the ge->load.weight and therefore:
3159 *
3160 *   \Sum ge->load.weight >= tg->weight
3161 *
3162 * hence icky!
3163 */
3164static long calc_group_shares(struct cfs_rq *cfs_rq)
3165{
3166        long tg_weight, tg_shares, load, shares;
3167        struct task_group *tg = cfs_rq->tg;
3168
3169        tg_shares = READ_ONCE(tg->shares);
3170
3171        load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3172
3173        tg_weight = atomic_long_read(&tg->load_avg);
3174
3175        /* Ensure tg_weight >= load */
3176        tg_weight -= cfs_rq->tg_load_avg_contrib;
3177        tg_weight += load;
3178
3179        shares = (tg_shares * load);
3180        if (tg_weight)
3181                shares /= tg_weight;
3182
3183        /*
3184         * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3185         * of a group with small tg->shares value. It is a floor value which is
3186         * assigned as a minimum load.weight to the sched_entity representing
3187         * the group on a CPU.
3188         *
3189         * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3190         * on an 8-core system with 8 tasks each runnable on one CPU shares has
3191         * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3192         * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3193         * instead of 0.
3194         */
3195        return clamp_t(long, shares, MIN_SHARES, tg_shares);
3196}
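
    /*
     * Purely illustrative walk-through of the above with made-up numbers:
     * suppose tg->shares = 1024, tg->load_avg = 2048, and this CPU's cfs_rq
     * has tg_load_avg_contrib = 512, avg.load_avg = 512 and a (scaled down)
     * load.weight of 1024 because a task just woke up there. Then:
     *
     *   load      = max(1024, 512)     = 1024
     *   tg_weight = 2048 - 512 + 1024  = 2560
     *   shares    = 1024 * 1024 / 2560 = 409
     *
     * which is then clamped into [MIN_SHARES, 1024]. Using load.weight rather
     * than the slow-moving load_avg lets the group entity's weight react
     * immediately to the wakeup, as described in (5)/(6) above.
     */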
3197#endif /* CONFIG_SMP */
3198
3199static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3200
3201/*
3202 * Recomputes the group entity based on the current state of its group
3203 * runqueue.
3204 */
3205static void update_cfs_group(struct sched_entity *se)
3206{
3207        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3208        long shares;
3209
3210        if (!gcfs_rq)
3211                return;
3212
3213        if (throttled_hierarchy(gcfs_rq))
3214                return;
3215
3216#ifndef CONFIG_SMP
3217        shares = READ_ONCE(gcfs_rq->tg->shares);
3218
3219        if (likely(se->load.weight == shares))
3220                return;
3221#else
3222        shares   = calc_group_shares(gcfs_rq);
3223#endif
3224
3225        reweight_entity(cfs_rq_of(se), se, shares);
3226}
3227
3228#else /* CONFIG_FAIR_GROUP_SCHED */
3229static inline void update_cfs_group(struct sched_entity *se)
3230{
3231}
3232#endif /* CONFIG_FAIR_GROUP_SCHED */
3233
3234static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3235{
3236        struct rq *rq = rq_of(cfs_rq);
3237
3238        if (&rq->cfs == cfs_rq) {
3239                /*
3240                 * There are a few boundary cases this might miss but it should
3241                 * get called often enough that that should (hopefully) not be
3242                 * a real problem.
3243                 *
3244                 * It will not get called when we go idle, because the idle
3245                 * thread is a different class (!fair), nor will the utilization
3246                 * number include things like RT tasks.
3247                 *
3248                 * As is, the util number is not freq-invariant (we'd have to
3249                 * implement arch_scale_freq_capacity() for that).
3250                 *
3251                 * See cpu_util().
3252                 */
3253                cpufreq_update_util(rq, flags);
3254        }
3255}
3256
3257#ifdef CONFIG_SMP
3258#ifdef CONFIG_FAIR_GROUP_SCHED
3259/*
3260 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
3261 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
3262 * bottom-up, we only have to test whether the cfs_rq before us on the list
3263 * is our child.
3264 * If cfs_rq is not on the list, test whether a child needs to be added to
3265 * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
3266 */
3267static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
3268{
3269        struct cfs_rq *prev_cfs_rq;
3270        struct list_head *prev;
3271
3272        if (cfs_rq->on_list) {
3273                prev = cfs_rq->leaf_cfs_rq_list.prev;
3274        } else {
3275                struct rq *rq = rq_of(cfs_rq);
3276
3277                prev = rq->tmp_alone_branch;
3278        }
3279
3280        prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
3281
3282        return (prev_cfs_rq->tg->parent == cfs_rq->tg);
3283}
3284
3285static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
3286{
3287        if (cfs_rq->load.weight)
3288                return false;
3289
3290        if (cfs_rq->avg.load_sum)
3291                return false;
3292
3293        if (cfs_rq->avg.util_sum)
3294                return false;
3295
3296        if (cfs_rq->avg.runnable_sum)
3297                return false;
3298
3299        if (child_cfs_rq_on_list(cfs_rq))
3300                return false;
3301
3302        /*
3303         * _avg must be zero when _sum is zero, since _avg = _sum / divider.
3304         * Make sure that rounding and/or propagation of PELT values never
3305         * break this.
3306         */
3307        SCHED_WARN_ON(cfs_rq->avg.load_avg ||
3308                      cfs_rq->avg.util_avg ||
3309                      cfs_rq->avg.runnable_avg);
3310
3311        return true;
3312}
3313
3314/**
3315 * update_tg_load_avg - update the tg's load avg
3316 * @cfs_rq: the cfs_rq whose avg changed
3317 *
3318 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3319 * However, because tg->load_avg is a global value there are performance
3320 * considerations.
3321 *
3322 * In order to avoid having to look at the other cfs_rq's, we use a
3323 * differential update where we store the last value we propagated. This in
3324 * turn allows skipping updates if the differential is 'small'.
3325 *
3326 * Updating tg's load_avg is necessary before update_cfs_share().
3327 */
3328static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
3329{
3330        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3331
3332        /*
3333         * No need to update load_avg for root_task_group as it is not used.
3334         */
3335        if (cfs_rq->tg == &root_task_group)
3336                return;
3337
3338        if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3339                atomic_long_add(delta, &cfs_rq->tg->load_avg);
3340                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3341        }
3342}
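
    /*
     * Illustrative example (hypothetical numbers): if tg_load_avg_contrib is
     * currently 640, updates are skipped while the cfs_rq's load_avg stays
     * within 640/64 = 10 of that value. Once it has drifted to, say, 656, the
     * delta of 16 exceeds the threshold: 16 is added to the global
     * tg->load_avg and tg_load_avg_contrib is refreshed to 656. This bounds
     * the staleness of each cfs_rq's contribution to roughly 1/64 while
     * avoiding a contended atomic on every PELT update.
     */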
3343
3344/*
3345 * Called within set_task_rq() right before setting a task's CPU. The
3346 * caller only guarantees p->pi_lock is held; no other assumptions,
3347 * including the state of rq->lock, should be made.
3348 */
3349void set_task_rq_fair(struct sched_entity *se,
3350                      struct cfs_rq *prev, struct cfs_rq *next)
3351{
3352        u64 p_last_update_time;
3353        u64 n_last_update_time;
3354
3355        if (!sched_feat(ATTACH_AGE_LOAD))
3356                return;
3357
3358        /*
3359         * We are supposed to age the task's load to "current" time before it
3360         * moves to the new CPU/cfs_rq, but it is hard to obtain the current
3361         * time here, so simply throw away the out-of-date time. This leaves
3362         * the wakee task less decayed, but giving the wakee a bit more load
3363         * is not a bad trade-off.
3364         */
3365        if (!(se->avg.last_update_time && prev))
3366                return;
3367
3368#ifndef CONFIG_64BIT
3369        {
3370                u64 p_last_update_time_copy;
3371                u64 n_last_update_time_copy;
3372
3373                do {
3374                        p_last_update_time_copy = prev->load_last_update_time_copy;
3375                        n_last_update_time_copy = next->load_last_update_time_copy;
3376
3377                        smp_rmb();
3378
3379                        p_last_update_time = prev->avg.last_update_time;
3380                        n_last_update_time = next->avg.last_update_time;
3381
3382                } while (p_last_update_time != p_last_update_time_copy ||
3383                         n_last_update_time != n_last_update_time_copy);
3384        }
3385#else
3386        p_last_update_time = prev->avg.last_update_time;
3387        n_last_update_time = next->avg.last_update_time;
3388#endif
3389        __update_load_avg_blocked_se(p_last_update_time, se);
3390        se->avg.last_update_time = n_last_update_time;
3391}
3392
3393
3394/*
3395 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3396 * propagate its contribution. The key to this propagation is the invariant
3397 * that for each group:
3398 *
3399 *   ge->avg == grq->avg                                                (1)
3400 *
3401 * _IFF_ we look at the pure running and runnable sums. Because they
3402 * represent the very same entity, just at different points in the hierarchy.
3403 *
3404 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3405 * and simply copies the running/runnable sum over (but still wrong, because
3406 * the group entity and group rq do not have their PELT windows aligned).
3407 *
3408 * However, update_tg_cfs_load() is more complex. So we have:
3409 *
3410 *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg          (2)
3411 *
3412 * And since, like util, the runnable part should be directly transferable,
3413 * the following would _appear_ to be the straightforward approach:
3414 *
3415 *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg       (3)
3416 *
3417 * And per (1) we have:
3418 *
3419 *   ge->avg.runnable_avg == grq->avg.runnable_avg
3420 *
3421 * Which gives:
3422 *
3423 *                      ge->load.weight * grq->avg.load_avg
3424 *   ge->avg.load_avg = -----------------------------------             (4)
3425 *                               grq->load.weight
3426 *
3427 * Except that is wrong!
3428 *
3429 * Because while for entities historical weight is not important and we
3430 * really only care about our future and therefore can consider a pure
3431 * runnable sum, runqueues can NOT do this.
3432 *
3433 * We specifically want runqueues to have a load_avg that includes
3434 * historical weights. Those represent the blocked load, the load we expect
3435 * to (shortly) return to us. This only works by keeping the weights as
3436 * integral part of the sum. We therefore cannot decompose as per (3).
3437 *
3438 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3439 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3440 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3441 * runnable section of these tasks overlap (or not). If they were to perfectly
3442 * align the rq as a whole would be runnable 2/3 of the time. If however we
3443 * always have at least 1 runnable task, the rq as a whole is always runnable.
3444 *
3445 * So we'll have to approximate.. :/
3446 *
3447 * Given the constraint:
3448 *
3449 *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
3450 *
3451 * We can construct a rule that adds runnable to a rq by assuming minimal
3452 * overlap.
3453 *
3454 * On removal, we'll assume each task is equally runnable; which yields:
3455 *
3456 *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
3457 *
3458 * XXX: only do this for the part of runnable > running ?
3459 *
3460 */
3461
3462static inline void
3463update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3464{
3465        long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3466        u32 divider;
3467
3468        /* Nothing to update */
3469        if (!delta)
3470                return;
3471
3472        /*
3473         * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3474         * See ___update_load_avg() for details.
3475         */
3476        divider = get_pelt_divider(&cfs_rq->avg);
3477
3478        /* Set new sched_entity's utilization */
3479        se->avg.util_avg = gcfs_rq->avg.util_avg;
3480        se->avg.util_sum = se->avg.util_avg * divider;
3481
3482        /* Update parent cfs_rq utilization */
3483        add_positive(&cfs_rq->avg.util_avg, delta);
3484        cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3485}
3486
3487static inline void
3488update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3489{
3490        long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3491        u32 divider;
3492
3493        /* Nothing to update */
3494        if (!delta)
3495                return;
3496
3497        /*
3498         * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3499         * See ___update_load_avg() for details.
3500         */
3501        divider = get_pelt_divider(&cfs_rq->avg);
3502
3503        /* Set new sched_entity's runnable */
3504        se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3505        se->avg.runnable_sum = se->avg.runnable_avg * divider;
3506
3507        /* Update parent cfs_rq runnable */
3508        add_positive(&cfs_rq->avg.runnable_avg, delta);
3509        cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3510}
3511
3512static inline void
3513update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3514{
3515        long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3516        unsigned long load_avg;
3517        u64 load_sum = 0;
3518        u32 divider;
3519
3520        if (!runnable_sum)
3521                return;
3522
3523        gcfs_rq->prop_runnable_sum = 0;
3524
3525        /*
3526         * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3527         * See ___update_load_avg() for details.
3528         */
3529        divider = get_pelt_divider(&cfs_rq->avg);
3530
3531        if (runnable_sum >= 0) {
3532                /*
3533                 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3534                 * the CPU is saturated running == runnable.
3535                 */
3536                runnable_sum += se->avg.load_sum;
3537                runnable_sum = min_t(long, runnable_sum, divider);
3538        } else {
3539                /*
3540                 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3541                 * assuming all tasks are equally runnable.
3542                 */
3543                if (scale_load_down(gcfs_rq->load.weight)) {
3544                        load_sum = div_s64(gcfs_rq->avg.load_sum,
3545                                scale_load_down(gcfs_rq->load.weight));
3546                }
3547
3548                /* But make sure to not inflate se's runnable */
3549                runnable_sum = min(se->avg.load_sum, load_sum);
3550        }
3551
3552        /*
3553         * runnable_sum can't be lower than running_sum
3554         * Rescale running sum to be in the same range as runnable sum
3555         * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
3556         * runnable_sum is in [0 : LOAD_AVG_MAX]
3557         */
3558        running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3559        runnable_sum = max(runnable_sum, running_sum);
3560
3561        load_sum = (s64)se_weight(se) * runnable_sum;
3562        load_avg = div_s64(load_sum, divider);
3563
3564        se->avg.load_sum = runnable_sum;
3565
3566        delta = load_avg - se->avg.load_avg;
3567        if (!delta)
3568                return;
3569
3570        se->avg.load_avg = load_avg;
3571
3572        add_positive(&cfs_rq->avg.load_avg, delta);
3573        cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3574}
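
    /*
     * Purely illustrative numbers for the two branches above: on the addition
     * side, a child contributing prop_runnable_sum = 20000 on top of a group
     * entity with load_sum = 35000 would give 55000 under the minimal-overlap
     * assumption, which is then clipped to the PELT divider (bounded by
     * LOAD_AVG_MAX), since a saturated CPU cannot be more than 100% runnable.
     * On the removal side, the group rq's load_sum is divided by its (scaled
     * down) load.weight, i.e. every task is assumed to have been equally
     * runnable, and the result is further capped by se->avg.load_sum.
     */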
3575
3576static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3577{
3578        cfs_rq->propagate = 1;
3579        cfs_rq->prop_runnable_sum += runnable_sum;
3580}
3581
3582/* Update task and its cfs_rq load average */
3583static inline int propagate_entity_load_avg(struct sched_entity *se)
3584{
3585        struct cfs_rq *cfs_rq, *gcfs_rq;
3586
3587        if (entity_is_task(se))
3588                return 0;
3589
3590        gcfs_rq = group_cfs_rq(se);
3591        if (!gcfs_rq->propagate)
3592                return 0;
3593
3594        gcfs_rq->propagate = 0;
3595
3596        cfs_rq = cfs_rq_of(se);
3597
3598        add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3599
3600        update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3601        update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3602        update_tg_cfs_load(cfs_rq, se, gcfs_rq);
3603
3604        trace_pelt_cfs_tp(cfs_rq);
3605        trace_pelt_se_tp(se);
3606
3607        return 1;
3608}
3609
3610/*
3611 * Check if we need to update the load and the utilization of a blocked
3612 * group_entity:
3613 */
3614static inline bool skip_blocked_update(struct sched_entity *se)
3615{
3616        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3617
3618        /*
3619         * If the sched_entity still has non-zero load or utilization, we have
3620         * to decay it:
3621         */
3622        if (se->avg.load_avg || se->avg.util_avg)
3623                return false;
3624
3625        /*
3626         * If there is a pending propagation, we have to update the load and
3627         * the utilization of the sched_entity:
3628         */
3629        if (gcfs_rq->propagate)
3630                return false;
3631
3632        /*
3633         * Otherwise, the load and the utilization of the sched_entity are
3634         * already zero and there is no pending propagation, so it will be a
3635         * waste of time to try to decay it:
3636         */
3637        return true;
3638}
3639
3640#else /* CONFIG_FAIR_GROUP_SCHED */
3641
3642static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
3643
3644static inline int propagate_entity_load_avg(struct sched_entity *se)
3645{
3646        return 0;
3647}
3648
3649static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3650
3651#endif /* CONFIG_FAIR_GROUP_SCHED */
3652
3653/**
3654 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3655 * @now: current time, as per cfs_rq_clock_pelt()
3656 * @cfs_rq: cfs_rq to update
3657 *
3658 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3659 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3660 * post_init_entity_util_avg().
3661 *
3662 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3663 *
3664 * Returns true if the load decayed or we removed load.
3665 *
3666 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3667 * call update_tg_load_avg() when this function returns true.
3668 */
3669static inline int
3670update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3671{
3672        unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
3673        struct sched_avg *sa = &cfs_rq->avg;
3674        int decayed = 0;
3675
3676        if (cfs_rq->removed.nr) {
3677                unsigned long r;
3678                u32 divider = get_pelt_divider(&cfs_rq->avg);
3679
3680                raw_spin_lock(&cfs_rq->removed.lock);
3681                swap(cfs_rq->removed.util_avg, removed_util);
3682                swap(cfs_rq->removed.load_avg, removed_load);
3683                swap(cfs_rq->removed.runnable_avg, removed_runnable);
3684                cfs_rq->removed.nr = 0;
3685                raw_spin_unlock(&cfs_rq->removed.lock);
3686
3687                r = removed_load;
3688                sub_positive(&sa->load_avg, r);
3689                sa->load_sum = sa->load_avg * divider;
3690
3691                r = removed_util;
3692                sub_positive(&sa->util_avg, r);
3693                sa->util_sum = sa->util_avg * divider;
3694
3695                r = removed_runnable;
3696                sub_positive(&sa->runnable_avg, r);
3697                sa->runnable_sum = sa->runnable_avg * divider;
3698
3699                /*
3700                 * removed_runnable is the unweighted version of removed_load so we
3701                 * can use it to estimate removed_load_sum.
3702                 */
3703                add_tg_cfs_propagate(cfs_rq,
3704                        -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
3705
3706                decayed = 1;
3707        }
3708
3709        decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3710
3711#ifndef CONFIG_64BIT
3712        smp_wmb();
3713        cfs_rq->load_last_update_time_copy = sa->last_update_time;
3714#endif
3715
3716        return decayed;
3717}
3718
3719/**
3720 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3721 * @cfs_rq: cfs_rq to attach to
3722 * @se: sched_entity to attach
3723 *
3724 * Must call update_cfs_rq_load_avg() before this, since we rely on
3725 * cfs_rq->avg.last_update_time being current.
3726 */
3727static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3728{
3729        /*
3730         * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3731         * See ___update_load_avg() for details.
3732         */
3733        u32 divider = get_pelt_divider(&cfs_rq->avg);
3734
3735        /*
3736         * When we attach the @se to the @cfs_rq, we must align the decay
3737         * window because without that, really weird and wonderful things can
3738         * happen.
3739         *
3740         * XXX illustrate
3741         */
3742        se->avg.last_update_time = cfs_rq->avg.last_update_time;
3743        se->avg.period_contrib = cfs_rq->avg.period_contrib;
3744
3745        /*
3746         * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3747         * period_contrib. This isn't strictly correct, but since we're
3748         * entirely outside of the PELT hierarchy, nobody cares if we truncate
3749         * _sum a little.
3750         */
3751        se->avg.util_sum = se->avg.util_avg * divider;
3752
3753        se->avg.runnable_sum = se->avg.runnable_avg * divider;
3754
3755        se->avg.load_sum = divider;
3756        if (se_weight(se)) {
3757                se->avg.load_sum =
3758                        div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3759        }
3760
3761        enqueue_load_avg(cfs_rq, se);
3762        cfs_rq->avg.util_avg += se->avg.util_avg;
3763        cfs_rq->avg.util_sum += se->avg.util_sum;
3764        cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3765        cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
3766
3767        add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3768
3769        cfs_rq_util_change(cfs_rq, 0);
3770
3771        trace_pelt_cfs_tp(cfs_rq);
3772}
3773
3774/**
3775 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3776 * @cfs_rq: cfs_rq to detach from
3777 * @se: sched_entity to detach
3778 *
3779 * Must call update_cfs_rq_load_avg() before this, since we rely on
3780 * cfs_rq->avg.last_update_time being current.
3781 */
3782static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3783{
3784        /*
3785         * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3786         * See ___update_load_avg() for details.
3787         */
3788        u32 divider = get_pelt_divider(&cfs_rq->avg);
3789
3790        dequeue_load_avg(cfs_rq, se);
3791        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3792        cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3793        sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3794        cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3795
3796        add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3797
3798        cfs_rq_util_change(cfs_rq, 0);
3799
3800        trace_pelt_cfs_tp(cfs_rq);
3801}
3802
3803/*
3804 * Optional action to be done while updating the load average
3805 */
3806#define UPDATE_TG       0x1
3807#define SKIP_AGE_LOAD   0x2
3808#define DO_ATTACH       0x4
3809
3810/* Update task and its cfs_rq load average */
3811static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3812{
3813        u64 now = cfs_rq_clock_pelt(cfs_rq);
3814        int decayed;
3815
3816        /*
3817         * Track task load average for carrying it to new CPU after migrated, and
3818         * track group sched_entity load average for task_h_load calc in migration
3819         */
3820        if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3821                __update_load_avg_se(now, cfs_rq, se);
3822
3823        decayed  = update_cfs_rq_load_avg(now, cfs_rq);
3824        decayed |= propagate_entity_load_avg(se);
3825
3826        if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3827
3828                /*
3829                 * DO_ATTACH means we're here from enqueue_entity().
3830                 * !last_update_time means we've passed through
3831                 * migrate_task_rq_fair() indicating we migrated.
3832                 *
3833                 * IOW we're enqueueing a task on a new CPU.
3834                 */
3835                attach_entity_load_avg(cfs_rq, se);
3836                update_tg_load_avg(cfs_rq);
3837
3838        } else if (decayed) {
3839                cfs_rq_util_change(cfs_rq, 0);
3840
3841                if (flags & UPDATE_TG)
3842                        update_tg_load_avg(cfs_rq);
3843        }
3844}
3845
3846#ifndef CONFIG_64BIT
3847static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3848{
3849        u64 last_update_time_copy;
3850        u64 last_update_time;
3851
3852        do {
3853                last_update_time_copy = cfs_rq->load_last_update_time_copy;
3854                smp_rmb();
3855                last_update_time = cfs_rq->avg.last_update_time;
3856        } while (last_update_time != last_update_time_copy);
3857
3858        return last_update_time;
3859}
3860#else
3861static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3862{
3863        return cfs_rq->avg.last_update_time;
3864}
3865#endif
3866
3867/*
3868 * Synchronize entity load avg of dequeued entity without locking
3869 * the previous rq.
3870 */
3871static void sync_entity_load_avg(struct sched_entity *se)
3872{
3873        struct cfs_rq *cfs_rq = cfs_rq_of(se);
3874        u64 last_update_time;
3875
3876        last_update_time = cfs_rq_last_update_time(cfs_rq);
3877        __update_load_avg_blocked_se(last_update_time, se);
3878}
3879
3880/*
3881 * Task first catches up with cfs_rq, and then subtracts
3882 * itself from the cfs_rq (task must be off the queue now).
3883 */
3884static void remove_entity_load_avg(struct sched_entity *se)
3885{
3886        struct cfs_rq *cfs_rq = cfs_rq_of(se);
3887        unsigned long flags;
3888
3889        /*
3890         * tasks cannot exit without having gone through wake_up_new_task() ->
3891         * post_init_entity_util_avg() which will have added things to the
3892         * cfs_rq, so we can remove unconditionally.
3893         */
3894
3895        sync_entity_load_avg(se);
3896
3897        raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3898        ++cfs_rq->removed.nr;
3899        cfs_rq->removed.util_avg        += se->avg.util_avg;
3900        cfs_rq->removed.load_avg        += se->avg.load_avg;
3901        cfs_rq->removed.runnable_avg    += se->avg.runnable_avg;
3902        raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3903}
3904
3905static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
3906{
3907        return cfs_rq->avg.runnable_avg;
3908}
3909
3910static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3911{
3912        return cfs_rq->avg.load_avg;
3913}
3914
3915static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
3916
3917static inline unsigned long task_util(struct task_struct *p)
3918{
3919        return READ_ONCE(p->se.avg.util_avg);
3920}
3921
3922static inline unsigned long _task_util_est(struct task_struct *p)
3923{
3924        struct util_est ue = READ_ONCE(p->se.avg.util_est);
3925
3926        return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
3927}
3928
3929static inline unsigned long task_util_est(struct task_struct *p)
3930{
3931        return max(task_util(p), _task_util_est(p));
3932}
3933
3934#ifdef CONFIG_UCLAMP_TASK
3935static inline unsigned long uclamp_task_util(struct task_struct *p)
3936{
3937        return clamp(task_util_est(p),
3938                     uclamp_eff_value(p, UCLAMP_MIN),
3939                     uclamp_eff_value(p, UCLAMP_MAX));
3940}
3941#else
3942static inline unsigned long uclamp_task_util(struct task_struct *p)
3943{
3944        return task_util_est(p);
3945}
3946#endif
3947
3948static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3949                                    struct task_struct *p)
3950{
3951        unsigned int enqueued;
3952
3953        if (!sched_feat(UTIL_EST))
3954                return;
3955
3956        /* Update root cfs_rq's estimated utilization */
3957        enqueued  = cfs_rq->avg.util_est.enqueued;
3958        enqueued += _task_util_est(p);
3959        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3960
3961        trace_sched_util_est_cfs_tp(cfs_rq);
3962}
3963
3964static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3965                                    struct task_struct *p)
3966{
3967        unsigned int enqueued;
3968
3969        if (!sched_feat(UTIL_EST))
3970                return;
3971
3972        /* Update root cfs_rq's estimated utilization */
3973        enqueued  = cfs_rq->avg.util_est.enqueued;
3974        enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3975        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3976
3977        trace_sched_util_est_cfs_tp(cfs_rq);
3978}
3979
3980#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
3981
3982/*
3983 * Check if a (signed) value is within a specified (unsigned) margin,
3984 * based on the observation that:
3985 *
3986 *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3987 *
3988 * NOTE: this only works when value + margin < INT_MAX.
3989 */
3990static inline bool within_margin(int value, int margin)
3991{
3992        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3993}
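
    /*
     * Illustrative check of the trick above with margin = 10: value = 5 gives
     * (unsigned)(5 + 9) = 14 < 19, so |5| < 10 holds; value = -8 gives
     * (unsigned)(-8 + 9) = 1 < 19, also true; value = 10 gives 19 < 19,
     * false; and value = -10 gives (unsigned)(-1) = UINT_MAX, which is not
     * < 19, also false. The unsigned wrap is exactly what turns the two-sided
     * comparison into a single one.
     */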
3994
3995static inline void util_est_update(struct cfs_rq *cfs_rq,
3996                                   struct task_struct *p,
3997                                   bool task_sleep)
3998{
3999        long last_ewma_diff, last_enqueued_diff;
4000        struct util_est ue;
4001
4002        if (!sched_feat(UTIL_EST))
4003                return;
4004
4005        /*
4006         * Skip update of task's estimated utilization when the task has not
4007         * yet completed an activation, e.g. being migrated.
4008         */
4009        if (!task_sleep)
4010                return;
4011
4012        /*
4013         * If the PELT values haven't changed since enqueue time,
4014         * skip the util_est update.
4015         */
4016        ue = p->se.avg.util_est;
4017        if (ue.enqueued & UTIL_AVG_UNCHANGED)
4018                return;
4019
4020        last_enqueued_diff = ue.enqueued;
4021
4022        /*
4023         * Reset EWMA on utilization increases, the moving average is used only
4024         * to smooth utilization decreases.
4025         */
4026        ue.enqueued = task_util(p);
4027        if (sched_feat(UTIL_EST_FASTUP)) {
4028                if (ue.ewma < ue.enqueued) {
4029                        ue.ewma = ue.enqueued;
4030                        goto done;
4031                }
4032        }
4033
4034        /*
4035         * Skip update of task's estimated utilization when its members are
4036         * already within ~1% of its last activation value.
4037         */
4038        last_ewma_diff = ue.enqueued - ue.ewma;
4039        last_enqueued_diff -= ue.enqueued;
4040        if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4041                if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4042                        goto done;
4043
4044                return;
4045        }
4046
4047        /*
4048         * To avoid overestimation of actual task utilization, skip updates if
4049         * we cannot guarantee there is idle time on this CPU.
4050         */
4051        if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
4052                return;
4053
4054        /*
4055         * Update Task's estimated utilization
4056         *
4057         * When *p completes an activation we can consolidate another sample
4058         * of the task size. This is done by storing the current PELT value
4059         * as ue.enqueued and by using this value to update the Exponential
4060         * Weighted Moving Average (EWMA):
4061         *
4062         *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
4063         *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
4064         *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
4065         *          = w * (      last_ewma_diff            ) +     ewma(t-1)
4066         *          = w * (last_ewma_diff  +  ewma(t-1) / w)
4067         *
4068         * Where 'w' is the weight of new samples, which is configured to be
4069         * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4070         */
4071        ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4072        ue.ewma  += last_ewma_diff;
4073        ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4074done:
4075        ue.enqueued |= UTIL_AVG_UNCHANGED;
4076        WRITE_ONCE(p->se.avg.util_est, ue);
4077
4078        trace_sched_util_est_se_tp(&p->se);
4079}
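
    /*
     * Illustrative EWMA step (hypothetical numbers): with the implemented
     * weight of w = 1/4, a previous ewma of 200 and a new sample
     * task_util(p) = 100 give last_ewma_diff = -100 and
     *
     *   ewma = ((200 << 2) - 100) >> 2 = 700 >> 2 = 175
     *
     * which matches 0.25 * 100 + 0.75 * 200. Utilization increases never go
     * through this path when UTIL_EST_FASTUP is set; they reset the ewma to
     * the new sample directly, so only decreases are smoothed.
     */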
4080
4081static inline int task_fits_capacity(struct task_struct *p, long capacity)
4082{
4083        return fits_capacity(uclamp_task_util(p), capacity);
4084}
4085
4086static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
4087{
4088        if (!static_branch_unlikely(&sched_asym_cpucapacity))
4089                return;
4090
4091        if (!p || p->nr_cpus_allowed == 1) {
4092                rq->misfit_task_load = 0;
4093                return;
4094        }
4095
4096        if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4097                rq->misfit_task_load = 0;
4098                return;
4099        }
4100
4101        /*
4102         * Make sure that misfit_task_load will not be zero even if
4103         * task_h_load() returns 0.
4104         */
4105        rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
4106}
4107
4108#else /* CONFIG_SMP */
4109
4110static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4111{
4112        return true;
4113}
4114
4115#define UPDATE_TG       0x0
4116#define SKIP_AGE_LOAD   0x0
4117#define DO_ATTACH       0x0
4118
4119static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
4120{
4121        cfs_rq_util_change(cfs_rq, 0);
4122}
4123
4124static inline void remove_entity_load_avg(struct sched_entity *se) {}
4125
4126static inline void
4127attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4128static inline void
4129detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4130
4131static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
4132{
4133        return 0;
4134}
4135
4136static inline void
4137util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4138
4139static inline void
4140util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4141
4142static inline void
4143util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4144                bool task_sleep) {}
4145static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
4146
4147#endif /* CONFIG_SMP */
4148
4149static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4150{
4151#ifdef CONFIG_SCHED_DEBUG
4152        s64 d = se->vruntime - cfs_rq->min_vruntime;
4153
4154        if (d < 0)
4155                d = -d;
4156
4157        if (d > 3*sysctl_sched_latency)
4158                schedstat_inc(cfs_rq->nr_spread_over);
4159#endif
4160}
4161
4162static void
4163place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4164{
4165        u64 vruntime = cfs_rq->min_vruntime;
4166
4167        /*
4168         * The 'current' period is already promised to the current tasks;
4169         * however, the extra weight of the new task will slow them down a
4170         * little, so place the new task so that it fits in the slot that
4171         * stays open at the end.
4172         */
4173        if (initial && sched_feat(START_DEBIT))
4174                vruntime += sched_vslice(cfs_rq, se);
4175
4176        /* sleeps up to a single latency don't count. */
4177        if (!initial) {
4178                unsigned long thresh = sysctl_sched_latency;
4179
4180                /*
4181                 * Halve their sleep time's effect, to allow
4182                 * for a gentler effect of sleepers:
4183                 */
4184                if (sched_feat(GENTLE_FAIR_SLEEPERS))
4185                        thresh >>= 1;
4186
4187                vruntime -= thresh;
4188        }
4189
4190        /* ensure we never gain time by being placed backwards. */
4191        se->vruntime = max_vruntime(se->vruntime, vruntime);
4192}
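
    /*
     * Illustrative placement (hypothetical numbers): with min_vruntime at
     * 10,000,000 ns, a freshly forked task with START_DEBIT starts one vslice
     * beyond that, so it cannot immediately preempt the tasks that were
     * promised the current period. A task waking from sleep is instead placed
     * up to one sysctl_sched_latency (half of it with GENTLE_FAIR_SLEEPERS)
     * before min_vruntime, giving it a modest wakeup bonus; max_vruntime()
     * then makes sure a short sleep can never move its vruntime backwards.
     */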
4193
4194static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4195
4196static inline void check_schedstat_required(void)
4197{
4198#ifdef CONFIG_SCHEDSTATS
4199        if (schedstat_enabled())
4200                return;
4201
4202        /* Force schedstat enabled if a dependent tracepoint is active */
4203        if (trace_sched_stat_wait_enabled()    ||
4204                        trace_sched_stat_sleep_enabled()   ||
4205                        trace_sched_stat_iowait_enabled()  ||
4206                        trace_sched_stat_blocked_enabled() ||
4207                        trace_sched_stat_runtime_enabled())  {
4208                printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
4209                             "stat_blocked and stat_runtime require the "
4210                             "kernel parameter schedstats=enable or "
4211                             "kernel.sched_schedstats=1\n");
4212        }
4213#endif
4214}
4215
4216static inline bool cfs_bandwidth_used(void);
4217
4218/*
4219 * MIGRATION
4220 *
4221 *      dequeue
4222 *        update_curr()
4223 *          update_min_vruntime()
4224 *        vruntime -= min_vruntime
4225 *
4226 *      enqueue
4227 *        update_curr()
4228 *          update_min_vruntime()
4229 *        vruntime += min_vruntime
4230 *
4231 * this way the vruntime transition between RQs is done when both
4232 * min_vruntime are up-to-date.
4233 *
4234 * WAKEUP (remote)
4235 *
4236 *      ->migrate_task_rq_fair() (p->state == TASK_WAKING)
4237 *        vruntime -= min_vruntime
4238 *
4239 *      enqueue
4240 *        update_curr()
4241 *          update_min_vruntime()
4242 *        vruntime += min_vruntime
4243 *
4244 * this way we tolerate a possibly stale min_vruntime on the originating
4245 * CPU while still using an up-to-date min_vruntime on the destination CPU.
4246 */
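
    /*
     * Illustrative example (hypothetical numbers): a task at vruntime
     * 1,005,000 ns dequeued from a runqueue whose min_vruntime is
     * 1,000,000 ns carries only its 5,000 ns of relative lag across the
     * migration. When it is enqueued on a runqueue whose min_vruntime is
     * 3,000,000 ns it becomes 3,005,000 ns, i.e. it keeps its position
     * relative to its new peers instead of importing an absolute value from
     * an unrelated clock.
     */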
4247
4248static void
4249enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4250{
4251        bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4252        bool curr = cfs_rq->curr == se;
4253
4254        /*
4255         * If we're the current task, we must renormalise before calling
4256         * update_curr().
4257         */
4258        if (renorm && curr)
4259                se->vruntime += cfs_rq->min_vruntime;
4260
4261        update_curr(cfs_rq);
4262
4263        /*
4264         * Otherwise, renormalise after, such that we're placed at the current
4265         * moment in time, instead of some random moment in the past. Being
4266         * placed in the past could significantly boost this task to the
4267         * fairness detriment of existing tasks.
4268         */
4269        if (renorm && !curr)
4270                se->vruntime += cfs_rq->min_vruntime;
4271
4272        /*
4273         * When enqueuing a sched_entity, we must:
4274         *   - Update loads to have both entity and cfs_rq synced with now.
4275         *   - Add its load to cfs_rq->runnable_avg
4276         *   - For group_entity, update its weight to reflect the new share of
4277         *     its group cfs_rq
4278         *   - Add its new weight to cfs_rq->load.weight
4279         */
4280        update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4281        se_update_runnable(se);
4282        update_cfs_group(se);
4283        account_entity_enqueue(cfs_rq, se);
4284
4285        if (flags & ENQUEUE_WAKEUP)
4286                place_entity(cfs_rq, se, 0);
4287
4288        check_schedstat_required();
4289        update_stats_enqueue(cfs_rq, se, flags);
4290        check_spread(cfs_rq, se);
4291        if (!curr)
4292                __enqueue_entity(cfs_rq, se);
4293        se->on_rq = 1;
4294
4295        /*
4296         * When bandwidth control is enabled, this cfs_rq might have been
4297         * removed from the leaf list because a parent was throttled, even
4298         * though cfs_rq->nr_running > 1. Try to add it unconditionally.
4299         */
4300        if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
4301                list_add_leaf_cfs_rq(cfs_rq);
4302
4303        if (cfs_rq->nr_running == 1)
4304                check_enqueue_throttle(cfs_rq);
4305}
4306
4307static void __clear_buddies_last(struct sched_entity *se)
4308{
4309        for_each_sched_entity(se) {
4310                struct cfs_rq *cfs_rq = cfs_rq_of(se);
4311                if (cfs_rq->last != se)
4312                        break;
4313
4314                cfs_rq->last = NULL;
4315        }
4316}
4317
4318static void __clear_buddies_next(struct sched_entity *se)
4319{
4320        for_each_sched_entity(se) {
4321                struct cfs_rq *cfs_rq = cfs_rq_of(se);
4322                if (cfs_rq->next != se)
4323                        break;
4324
4325                cfs_rq->next = NULL;
4326        }
4327}
4328
4329static void __clear_buddies_skip(struct sched_entity *se)
4330{
4331        for_each_sched_entity(se) {
4332                struct cfs_rq *cfs_rq = cfs_rq_of(se);
4333                if (cfs_rq->skip != se)
4334                        break;
4335
4336                cfs_rq->skip = NULL;
4337        }
4338}
4339
4340static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4341{
4342        if (cfs_rq->last == se)
4343                __clear_buddies_last(se);
4344
4345        if (cfs_rq->next == se)
4346                __clear_buddies_next(se);
4347
4348        if (cfs_rq->skip == se)
4349                __clear_buddies_skip(se);
4350}
4351
4352static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4353
4354static void
4355dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4356{
4357        /*
4358         * Update run-time statistics of the 'current'.
4359         */
4360        update_curr(cfs_rq);
4361
4362        /*
4363         * When dequeuing a sched_entity, we must:
4364         *   - Update loads to have both entity and cfs_rq synced with now.
4365         *   - Subtract its load from the cfs_rq->runnable_avg.
4366         *   - Subtract its previous weight from cfs_rq->load.weight.
4367         *   - For group entity, update its weight to reflect the new share
4368         *     of its group cfs_rq.
4369         */
4370        update_load_avg(cfs_rq, se, UPDATE_TG);
4371        se_update_runnable(se);
4372
4373        update_stats_dequeue(cfs_rq, se, flags);
4374
4375        clear_buddies(cfs_rq, se);
4376
4377        if (se != cfs_rq->curr)
4378                __dequeue_entity(cfs_rq, se);
4379        se->on_rq = 0;
4380        account_entity_dequeue(cfs_rq, se);
4381
4382        /*
4383         * Normalize after update_curr(); which will also have moved
4384         * min_vruntime if @se is the one holding it back. But before doing
4385         * update_min_vruntime() again, which will discount @se's position and
4386         * can move min_vruntime forward still more.
4387         */
4388        if (!(flags & DEQUEUE_SLEEP))
4389                se->vruntime -= cfs_rq->min_vruntime;
4390
4391        /* return excess runtime on last dequeue */
4392        return_cfs_rq_runtime(cfs_rq);
4393
4394        update_cfs_group(se);
4395
4396        /*
4397         * Now advance min_vruntime if @se was the entity holding it back,
4398         * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
4399         * put back on, and if we advance min_vruntime, we'll be placed back
4400         * further than we started -- ie. we'll be penalized.
4401         */
4402        if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4403                update_min_vruntime(cfs_rq);
4404}
4405
4406/*
4407 * Preempt the current task with a newly woken task if needed:
4408 */
4409static void
4410check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4411{
4412        unsigned long ideal_runtime, delta_exec;
4413        struct sched_entity *se;
4414        s64 delta;
4415
4416        ideal_runtime = sched_slice(cfs_rq, curr);
4417        delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4418        if (delta_exec > ideal_runtime) {
4419                resched_curr(rq_of(cfs_rq));
4420                /*
4421                 * The current task ran long enough, ensure it doesn't get
4422                 * re-elected due to buddy favours.
4423                 */
4424                clear_buddies(cfs_rq, curr);
4425                return;
4426        }
4427
4428        /*
4429         * Ensure that a task that missed wakeup preemption by a
4430         * narrow margin doesn't have to wait for a full slice.
4431         * This also mitigates buddy induced latencies under load.
4432         */
4433        if (delta_exec < sysctl_sched_min_granularity)
4434                return;
4435
4436        se = __pick_first_entity(cfs_rq);
4437        delta = curr->vruntime - se->vruntime;
4438
4439        if (delta < 0)
4440                return;
4441
4442        if (delta > ideal_runtime)
4443                resched_curr(rq_of(cfs_rq));
4444}
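
    /*
     * Illustrative tick decisions (hypothetical numbers): with an ideal slice
     * of 4 ms, a current task that has already run 5 ms is rescheduled
     * outright and loses its buddy status. A task that has run for less than
     * sysctl_sched_min_granularity is left alone regardless of its vruntime.
     * In between, it is rescheduled only when its vruntime leads the leftmost
     * entity's by more than the slice, which keeps a task that narrowly
     * missed wakeup preemption from having to wait out a full slice.
     */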
4445
4446static void
4447set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4448{
4449        clear_buddies(cfs_rq, se);
4450
4451        /* 'current' is not kept within the tree. */
4452        if (se->on_rq) {
4453                /*
4454                 * Any task has to be enqueued before it gets to execute on
4455                 * a CPU. So account for the time it spent waiting on the
4456                 * runqueue.
4457                 */
4458                update_stats_wait_end(cfs_rq, se);
4459                __dequeue_entity(cfs_rq, se);
4460                update_load_avg(cfs_rq, se, UPDATE_TG);
4461        }
4462
4463        update_stats_curr_start(cfs_rq, se);
4464        cfs_rq->curr = se;
4465
4466        /*
4467         * Track our maximum slice length, if the CPU's load is at
4468         * least twice that of our own weight (i.e. don't track it
4469         * when there are only lesser-weight tasks around):
4470         */
4471        if (schedstat_enabled() &&
4472            rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4473                schedstat_set(se->statistics.slice_max,
4474                        max((u64)schedstat_val(se->statistics.slice_max),
4475                            se->sum_exec_runtime - se->prev_sum_exec_runtime));
4476        }
4477
4478        se->prev_sum_exec_runtime = se->sum_exec_runtime;
4479}
4480
4481static int
4482wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4483
4484/*
4485 * Pick the next process, keeping these things in mind, in this order:
4486 * 1) keep things fair between processes/task groups
4487 * 2) pick the "next" process, since someone really wants that to run
4488 * 3) pick the "last" process, for cache locality
4489 * 4) do not run the "skip" process, if something else is available
4490 */
4491static struct sched_entity *
4492pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4493{
4494        struct sched_entity *left = __pick_first_entity(cfs_rq);
4495        struct sched_entity *se;
4496
4497        /*
4498         * If curr is set we have to see if its left of the leftmost entity
4499         * still in the tree, provided there was anything in the tree at all.
4500         */
4501        if (!left || (curr && entity_before(curr, left)))
4502                left = curr;
4503
4504        se = left; /* ideally we run the leftmost entity */
4505
4506        /*
4507         * Avoid running the skip buddy, if running something else can
4508         * be done without getting too unfair.
4509         */
4510        if (cfs_rq->skip && cfs_rq->skip == se) {
4511                struct sched_entity *second;
4512
4513                if (se == curr) {
4514                        second = __pick_first_entity(cfs_rq);
4515                } else {
4516                        second = __pick_next_entity(se);
4517                        if (!second || (curr && entity_before(curr, second)))
4518                                second = curr;
4519                }
4520
4521                if (second && wakeup_preempt_entity(second, left) < 1)
4522                        se = second;
4523        }
4524
4525        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4526                /*
4527                 * Someone really wants this to run. If it's not unfair, run it.
4528                 */
4529                se = cfs_rq->next;
4530        } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4531                /*
4532                 * Prefer last buddy, try to return the CPU to a preempted task.
4533                 */
4534                se = cfs_rq->last;
4535        }
4536
4537        return se;
4538}
4539
4540static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4541
4542static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4543{
4544        /*
4545         * If still on the runqueue then deactivate_task()
4546         * was not called and update_curr() has to be done:
4547         */
4548        if (prev->on_rq)
4549                update_curr(cfs_rq);
4550
4551        /* throttle cfs_rqs exceeding runtime */
4552        check_cfs_rq_runtime(cfs_rq);
4553
4554        check_spread(cfs_rq, prev);
4555
4556        if (prev->on_rq) {
4557                update_stats_wait_start(cfs_rq, prev);
4558                /* Put 'current' back into the tree. */
4559                __enqueue_entity(cfs_rq, prev);
4560                /* in !on_rq case, update occurred at dequeue */
4561                update_load_avg(cfs_rq, prev, 0);
4562        }
4563        cfs_rq->curr = NULL;
4564}
4565
4566static void
4567entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4568{
4569        /*
4570         * Update run-time statistics of the 'current'.
4571         */
4572        update_curr(cfs_rq);
4573
4574        /*
4575         * Ensure that runnable average is periodically updated.
4576         */
4577        update_load_avg(cfs_rq, curr, UPDATE_TG);
4578        update_cfs_group(curr);
4579
4580#ifdef CONFIG_SCHED_HRTICK
4581        /*
4582         * queued ticks are scheduled to match the slice, so don't bother
4583         * validating it and just reschedule.
4584         */
4585        if (queued) {
4586                resched_curr(rq_of(cfs_rq));
4587                return;
4588        }
4589        /*
4590         * don't let the period tick interfere with the hrtick preemption
4591         */
4592        if (!sched_feat(DOUBLE_TICK) &&
4593                        hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4594                return;
4595#endif
4596
4597        if (cfs_rq->nr_running > 1)
4598                check_preempt_tick(cfs_rq, curr);
4599}
4600
4601
4602/**************************************************
4603 * CFS bandwidth control machinery
4604 */
4605
4606#ifdef CONFIG_CFS_BANDWIDTH
4607
4608#ifdef CONFIG_JUMP_LABEL
4609static struct static_key __cfs_bandwidth_used;
4610
4611static inline bool cfs_bandwidth_used(void)
4612{
4613        return static_key_false(&__cfs_bandwidth_used);
4614}
4615
4616void cfs_bandwidth_usage_inc(void)
4617{
4618        static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4619}
4620
4621void cfs_bandwidth_usage_dec(void)
4622{
4623        static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4624}