linux/kernel/sched_fair.c
<<
>>
Prefs
   1/*
   2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3 *
   4 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5 *
   6 *  Interactivity improvements by Mike Galbraith
   7 *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8 *
   9 *  Various enhancements by Dmitry Adamushko.
  10 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11 *
  12 *  Group scheduling enhancements by Srivatsa Vaddagiri
  13 *  Copyright IBM Corporation, 2007
  14 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15 *
  16 *  Scaled math optimizations by Thomas Gleixner
  17 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18 *
  19 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  21 */
  22
  23#include <linux/latencytop.h>
  24
  25/*
  26 * Targeted preemption latency for CPU-bound tasks:
  27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
  28 *
  29 * NOTE: this latency value is not the same as the concept of
  30 * 'timeslice length' - timeslices in CFS are of variable length
  31 * and have no persistent notion like in traditional, time-slice
  32 * based scheduling concepts.
  33 *
  34 * (to see the precise effective timeslice length of your workload,
  35 *  run vmstat and monitor the context-switches (cs) field)
  36 */
  37unsigned int sysctl_sched_latency = 20000000ULL;
  38
  39/*
  40 * Minimal preemption granularity for CPU-bound tasks:
  41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
  42 */
  43unsigned int sysctl_sched_min_granularity = 4000000ULL;
  44
  45/*
  46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  47 */
  48static unsigned int sched_nr_latency = 5;
  49
  50/*
  51 * After fork, child runs first. (default) If set to 0 then
  52 * parent will (try to) run first.
  53 */
  54const_debug unsigned int sysctl_sched_child_runs_first = 1;
  55
  56/*
  57 * sys_sched_yield() compat mode
  58 *
  59 * This option switches the agressive yield implementation of the
  60 * old scheduler back on.
  61 */
  62unsigned int __read_mostly sysctl_sched_compat_yield;
  63
  64/*
  65 * SCHED_OTHER wake-up granularity.
  66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
  67 *
  68 * This option delays the preemption effects of decoupled workloads
  69 * and reduces their over-scheduling. Synchronous workloads will still
  70 * have immediate wakeup/sleep latencies.
  71 */
  72unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
  73
  74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  75
  76/**************************************************************
  77 * CFS operations on generic schedulable entities:
  78 */
  79
  80static inline struct task_struct *task_of(struct sched_entity *se)
  81{
  82        return container_of(se, struct task_struct, se);
  83}
  84
  85#ifdef CONFIG_FAIR_GROUP_SCHED
  86
  87/* cpu runqueue to which this cfs_rq is attached */
  88static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  89{
  90        return cfs_rq->rq;
  91}
  92
  93/* An entity is a task if it doesn't "own" a runqueue */
  94#define entity_is_task(se)      (!se->my_q)
  95
  96/* Walk up scheduling entities hierarchy */
  97#define for_each_sched_entity(se) \
  98                for (; se; se = se->parent)
  99
 100static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 101{
 102        return p->se.cfs_rq;
 103}
 104
 105/* runqueue on which this entity is (to be) queued */
 106static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 107{
 108        return se->cfs_rq;
 109}
 110
 111/* runqueue "owned" by this group */
 112static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 113{
 114        return grp->my_q;
 115}
 116
 117/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 118 * another cpu ('this_cpu')
 119 */
 120static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 121{
 122        return cfs_rq->tg->cfs_rq[this_cpu];
 123}
 124
 125/* Iterate thr' all leaf cfs_rq's on a runqueue */
 126#define for_each_leaf_cfs_rq(rq, cfs_rq) \
 127        list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 128
 129/* Do the two (enqueued) entities belong to the same group ? */
 130static inline int
 131is_same_group(struct sched_entity *se, struct sched_entity *pse)
 132{
 133        if (se->cfs_rq == pse->cfs_rq)
 134                return 1;
 135
 136        return 0;
 137}
 138
 139static inline struct sched_entity *parent_entity(struct sched_entity *se)
 140{
 141        return se->parent;
 142}
 143
 144#else   /* CONFIG_FAIR_GROUP_SCHED */
 145
 146static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 147{
 148        return container_of(cfs_rq, struct rq, cfs);
 149}
 150
 151#define entity_is_task(se)      1
 152
 153#define for_each_sched_entity(se) \
 154                for (; se; se = NULL)
 155
 156static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 157{
 158        return &task_rq(p)->cfs;
 159}
 160
 161static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 162{
 163        struct task_struct *p = task_of(se);
 164        struct rq *rq = task_rq(p);
 165
 166        return &rq->cfs;
 167}
 168
 169/* runqueue "owned" by this group */
 170static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 171{
 172        return NULL;
 173}
 174
 175static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 176{
 177        return &cpu_rq(this_cpu)->cfs;
 178}
 179
 180#define for_each_leaf_cfs_rq(rq, cfs_rq) \
 181                for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 182
 183static inline int
 184is_same_group(struct sched_entity *se, struct sched_entity *pse)
 185{
 186        return 1;
 187}
 188
 189static inline struct sched_entity *parent_entity(struct sched_entity *se)
 190{
 191        return NULL;
 192}
 193
 194#endif  /* CONFIG_FAIR_GROUP_SCHED */
 195
 196
 197/**************************************************************
 198 * Scheduling class tree data structure manipulation methods:
 199 */
 200
 201static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
 202{
 203        s64 delta = (s64)(vruntime - min_vruntime);
 204        if (delta > 0)
 205                min_vruntime = vruntime;
 206
 207        return min_vruntime;
 208}
 209
 210static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 211{
 212        s64 delta = (s64)(vruntime - min_vruntime);
 213        if (delta < 0)
 214                min_vruntime = vruntime;
 215
 216        return min_vruntime;
 217}
 218
 219static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 220{
 221        return se->vruntime - cfs_rq->min_vruntime;
 222}
 223
 224/*
 225 * Enqueue an entity into the rb-tree:
 226 */
 227static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 228{
 229        struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 230        struct rb_node *parent = NULL;
 231        struct sched_entity *entry;
 232        s64 key = entity_key(cfs_rq, se);
 233        int leftmost = 1;
 234
 235        /*
 236         * Find the right place in the rbtree:
 237         */
 238        while (*link) {
 239                parent = *link;
 240                entry = rb_entry(parent, struct sched_entity, run_node);
 241                /*
 242                 * We dont care about collisions. Nodes with
 243                 * the same key stay together.
 244                 */
 245                if (key < entity_key(cfs_rq, entry)) {
 246                        link = &parent->rb_left;
 247                } else {
 248                        link = &parent->rb_right;
 249                        leftmost = 0;
 250                }
 251        }
 252
 253        /*
 254         * Maintain a cache of leftmost tree entries (it is frequently
 255         * used):
 256         */
 257        if (leftmost) {
 258                cfs_rq->rb_leftmost = &se->run_node;
 259                /*
 260                 * maintain cfs_rq->min_vruntime to be a monotonic increasing
 261                 * value tracking the leftmost vruntime in the tree.
 262                 */
 263                cfs_rq->min_vruntime =
 264                        max_vruntime(cfs_rq->min_vruntime, se->vruntime);
 265        }
 266
 267        rb_link_node(&se->run_node, parent, link);
 268        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 269}
 270
 271static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 272{
 273        if (cfs_rq->rb_leftmost == &se->run_node) {
 274                struct rb_node *next_node;
 275                struct sched_entity *next;
 276
 277                next_node = rb_next(&se->run_node);
 278                cfs_rq->rb_leftmost = next_node;
 279
 280                if (next_node) {
 281                        next = rb_entry(next_node,
 282                                        struct sched_entity, run_node);
 283                        cfs_rq->min_vruntime =
 284                                max_vruntime(cfs_rq->min_vruntime,
 285                                             next->vruntime);
 286                }
 287        }
 288
 289        if (cfs_rq->next == se)
 290                cfs_rq->next = NULL;
 291
 292        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 293}
 294
 295static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
 296{
 297        return cfs_rq->rb_leftmost;
 298}
 299
 300static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 301{
 302        return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
 303}
 304
 305static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 306{
 307        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 308
 309        if (!last)
 310                return NULL;
 311
 312        return rb_entry(last, struct sched_entity, run_node);
 313}
 314
 315/**************************************************************
 316 * Scheduling class statistics methods:
 317 */
 318
 319#ifdef CONFIG_SCHED_DEBUG
 320int sched_nr_latency_handler(struct ctl_table *table, int write,
 321                struct file *filp, void __user *buffer, size_t *lenp,
 322                loff_t *ppos)
 323{
 324        int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
 325
 326        if (ret || !write)
 327                return ret;
 328
 329        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 330                                        sysctl_sched_min_granularity);
 331
 332        return 0;
 333}
 334#endif
 335
 336/*
 337 * The idea is to set a period in which each task runs once.
 338 *
 339 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
 340 * this period because otherwise the slices get too small.
 341 *
 342 * p = (nr <= nl) ? l : l*nr/nl
 343 */
 344static u64 __sched_period(unsigned long nr_running)
 345{
 346        u64 period = sysctl_sched_latency;
 347        unsigned long nr_latency = sched_nr_latency;
 348
 349        if (unlikely(nr_running > nr_latency)) {
 350                period = sysctl_sched_min_granularity;
 351                period *= nr_running;
 352        }
 353
 354        return period;
 355}
 356
 357/*
 358 * We calculate the wall-time slice from the period by taking a part
 359 * proportional to the weight.
 360 *
 361 * s = p*w/rw
 362 */
 363static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 364{
 365        u64 slice = __sched_period(cfs_rq->nr_running);
 366
 367        for_each_sched_entity(se) {
 368                cfs_rq = cfs_rq_of(se);
 369
 370                slice *= se->load.weight;
 371                do_div(slice, cfs_rq->load.weight);
 372        }
 373
 374
 375        return slice;
 376}
 377
 378/*
 379 * We calculate the vruntime slice of a to be inserted task
 380 *
 381 * vs = s/w = p/rw
 382 */
 383static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 384{
 385        unsigned long nr_running = cfs_rq->nr_running;
 386        unsigned long weight;
 387        u64 vslice;
 388
 389        if (!se->on_rq)
 390                nr_running++;
 391
 392        vslice = __sched_period(nr_running);
 393
 394        for_each_sched_entity(se) {
 395                cfs_rq = cfs_rq_of(se);
 396
 397                weight = cfs_rq->load.weight;
 398                if (!se->on_rq)
 399                        weight += se->load.weight;
 400
 401                vslice *= NICE_0_LOAD;
 402                do_div(vslice, weight);
 403        }
 404
 405        return vslice;
 406}
 407
 408/*
 409 * Update the current task's runtime statistics. Skip current tasks that
 410 * are not in our scheduling class.
 411 */
 412static inline void
 413__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 414              unsigned long delta_exec)
 415{
 416        unsigned long delta_exec_weighted;
 417
 418        schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
 419
 420        curr->sum_exec_runtime += delta_exec;
 421        schedstat_add(cfs_rq, exec_clock, delta_exec);
 422        delta_exec_weighted = delta_exec;
 423        if (unlikely(curr->load.weight != NICE_0_LOAD)) {
 424                delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
 425                                                        &curr->load);
 426        }
 427        curr->vruntime += delta_exec_weighted;
 428}
 429
 430static void update_curr(struct cfs_rq *cfs_rq)
 431{
 432        struct sched_entity *curr = cfs_rq->curr;
 433        u64 now = rq_of(cfs_rq)->clock;
 434        unsigned long delta_exec;
 435
 436        if (unlikely(!curr))
 437                return;
 438
 439        /*
 440         * Get the amount of time the current task was running
 441         * since the last time we changed load (this cannot
 442         * overflow on 32 bits):
 443         */
 444        delta_exec = (unsigned long)(now - curr->exec_start);
 445
 446        __update_curr(cfs_rq, curr, delta_exec);
 447        curr->exec_start = now;
 448
 449        if (entity_is_task(curr)) {
 450                struct task_struct *curtask = task_of(curr);
 451
 452                cpuacct_charge(curtask, delta_exec);
 453        }
 454}
 455
 456static inline void
 457update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 458{
 459        schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
 460}
 461
 462/*
 463 * Task is being enqueued - update stats:
 464 */
 465static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 466{
 467        /*
 468         * Are we enqueueing a waiting task? (for current tasks
 469         * a dequeue/enqueue event is a NOP)
 470         */
 471        if (se != cfs_rq->curr)
 472                update_stats_wait_start(cfs_rq, se);
 473}
 474
 475static void
 476update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 477{
 478        schedstat_set(se->wait_max, max(se->wait_max,
 479                        rq_of(cfs_rq)->clock - se->wait_start));
 480        schedstat_set(se->wait_count, se->wait_count + 1);
 481        schedstat_set(se->wait_sum, se->wait_sum +
 482                        rq_of(cfs_rq)->clock - se->wait_start);
 483        schedstat_set(se->wait_start, 0);
 484}
 485
 486static inline void
 487update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 488{
 489        /*
 490         * Mark the end of the wait period if dequeueing a
 491         * waiting task:
 492         */
 493        if (se != cfs_rq->curr)
 494                update_stats_wait_end(cfs_rq, se);
 495}
 496
 497/*
 498 * We are picking a new current task - update its stats:
 499 */
 500static inline void
 501update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 502{
 503        /*
 504         * We are starting a new run period:
 505         */
 506        se->exec_start = rq_of(cfs_rq)->clock;
 507}
 508
 509/**************************************************
 510 * Scheduling class queueing methods:
 511 */
 512
 513static void
 514account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 515{
 516        update_load_add(&cfs_rq->load, se->load.weight);
 517        cfs_rq->nr_running++;
 518        se->on_rq = 1;
 519        list_add(&se->group_node, &cfs_rq->tasks);
 520}
 521
 522static void
 523account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 524{
 525        update_load_sub(&cfs_rq->load, se->load.weight);
 526        cfs_rq->nr_running--;
 527        se->on_rq = 0;
 528        list_del_init(&se->group_node);
 529}
 530
 531static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 532{
 533#ifdef CONFIG_SCHEDSTATS
 534        if (se->sleep_start) {
 535                u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
 536                struct task_struct *tsk = task_of(se);
 537
 538                if ((s64)delta < 0)
 539                        delta = 0;
 540
 541                if (unlikely(delta > se->sleep_max))
 542                        se->sleep_max = delta;
 543
 544                se->sleep_start = 0;
 545                se->sum_sleep_runtime += delta;
 546
 547                account_scheduler_latency(tsk, delta >> 10, 1);
 548        }
 549        if (se->block_start) {
 550                u64 delta = rq_of(cfs_rq)->clock - se->block_start;
 551                struct task_struct *tsk = task_of(se);
 552
 553                if ((s64)delta < 0)
 554                        delta = 0;
 555
 556                if (unlikely(delta > se->block_max))
 557                        se->block_max = delta;
 558
 559                se->block_start = 0;
 560                se->sum_sleep_runtime += delta;
 561
 562                /*
 563                 * Blocking time is in units of nanosecs, so shift by 20 to
 564                 * get a milliseconds-range estimation of the amount of
 565                 * time that the task spent sleeping:
 566                 */
 567                if (unlikely(prof_on == SLEEP_PROFILING)) {
 568
 569                        profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
 570                                     delta >> 20);
 571                }
 572                account_scheduler_latency(tsk, delta >> 10, 0);
 573        }
 574#endif
 575}
 576
 577static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 578{
 579#ifdef CONFIG_SCHED_DEBUG
 580        s64 d = se->vruntime - cfs_rq->min_vruntime;
 581
 582        if (d < 0)
 583                d = -d;
 584
 585        if (d > 3*sysctl_sched_latency)
 586                schedstat_inc(cfs_rq, nr_spread_over);
 587#endif
 588}
 589
 590static void
 591place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 592{
 593        u64 vruntime;
 594
 595        if (first_fair(cfs_rq)) {
 596                vruntime = min_vruntime(cfs_rq->min_vruntime,
 597                                __pick_next_entity(cfs_rq)->vruntime);
 598        } else
 599                vruntime = cfs_rq->min_vruntime;
 600
 601        /*
 602         * The 'current' period is already promised to the current tasks,
 603         * however the extra weight of the new task will slow them down a
 604         * little, place the new task so that it fits in the slot that
 605         * stays open at the end.
 606         */
 607        if (initial && sched_feat(START_DEBIT))
 608                vruntime += sched_vslice_add(cfs_rq, se);
 609
 610        if (!initial) {
 611                /* sleeps upto a single latency don't count. */
 612                if (sched_feat(NEW_FAIR_SLEEPERS))
 613                        vruntime -= sysctl_sched_latency;
 614
 615                /* ensure we never gain time by being placed backwards. */
 616                vruntime = max_vruntime(se->vruntime, vruntime);
 617        }
 618
 619        se->vruntime = vruntime;
 620}
 621
 622static void
 623enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 624{
 625        /*
 626         * Update run-time statistics of the 'current'.
 627         */
 628        update_curr(cfs_rq);
 629        account_entity_enqueue(cfs_rq, se);
 630
 631        if (wakeup) {
 632                place_entity(cfs_rq, se, 0);
 633                enqueue_sleeper(cfs_rq, se);
 634        }
 635
 636        update_stats_enqueue(cfs_rq, se);
 637        check_spread(cfs_rq, se);
 638        if (se != cfs_rq->curr)
 639                __enqueue_entity(cfs_rq, se);
 640}
 641
 642static void update_avg(u64 *avg, u64 sample)
 643{
 644        s64 diff = sample - *avg;
 645        *avg += diff >> 3;
 646}
 647
 648static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
 649{
 650        if (!se->last_wakeup)
 651                return;
 652
 653        update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
 654        se->last_wakeup = 0;
 655}
 656
 657static void
 658dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 659{
 660        /*
 661         * Update run-time statistics of the 'current'.
 662         */
 663        update_curr(cfs_rq);
 664
 665        update_stats_dequeue(cfs_rq, se);
 666        if (sleep) {
 667                update_avg_stats(cfs_rq, se);
 668#ifdef CONFIG_SCHEDSTATS
 669                if (entity_is_task(se)) {
 670                        struct task_struct *tsk = task_of(se);
 671
 672                        if (tsk->state & TASK_INTERRUPTIBLE)
 673                                se->sleep_start = rq_of(cfs_rq)->clock;
 674                        if (tsk->state & TASK_UNINTERRUPTIBLE)
 675                                se->block_start = rq_of(cfs_rq)->clock;
 676                }
 677#endif
 678        }
 679
 680        if (se != cfs_rq->curr)
 681                __dequeue_entity(cfs_rq, se);
 682        account_entity_dequeue(cfs_rq, se);
 683}
 684
 685/*
 686 * Preempt the current task with a newly woken task if needed:
 687 */
 688static void
 689check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 690{
 691        unsigned long ideal_runtime, delta_exec;
 692
 693        ideal_runtime = sched_slice(cfs_rq, curr);
 694        delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 695        if (delta_exec > ideal_runtime)
 696                resched_task(rq_of(cfs_rq)->curr);
 697}
 698
 699static void
 700set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 701{
 702        /* 'current' is not kept within the tree. */
 703        if (se->on_rq) {
 704                /*
 705                 * Any task has to be enqueued before it get to execute on
 706                 * a CPU. So account for the time it spent waiting on the
 707                 * runqueue.
 708                 */
 709                update_stats_wait_end(cfs_rq, se);
 710                __dequeue_entity(cfs_rq, se);
 711        }
 712
 713        update_stats_curr_start(cfs_rq, se);
 714        cfs_rq->curr = se;
 715#ifdef CONFIG_SCHEDSTATS
 716        /*
 717         * Track our maximum slice length, if the CPU's load is at
 718         * least twice that of our own weight (i.e. dont track it
 719         * when there are only lesser-weight tasks around):
 720         */
 721        if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
 722                se->slice_max = max(se->slice_max,
 723                        se->sum_exec_runtime - se->prev_sum_exec_runtime);
 724        }
 725#endif
 726        se->prev_sum_exec_runtime = se->sum_exec_runtime;
 727}
 728
 729static int
 730wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 731
 732static struct sched_entity *
 733pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 734{
 735        if (!cfs_rq->next)
 736                return se;
 737
 738        if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
 739                return se;
 740
 741        return cfs_rq->next;
 742}
 743
 744static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 745{
 746        struct sched_entity *se = NULL;
 747
 748        if (first_fair(cfs_rq)) {
 749                se = __pick_next_entity(cfs_rq);
 750                se = pick_next(cfs_rq, se);
 751                set_next_entity(cfs_rq, se);
 752        }
 753
 754        return se;
 755}
 756
 757static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 758{
 759        /*
 760         * If still on the runqueue then deactivate_task()
 761         * was not called and update_curr() has to be done:
 762         */
 763        if (prev->on_rq)
 764                update_curr(cfs_rq);
 765
 766        check_spread(cfs_rq, prev);
 767        if (prev->on_rq) {
 768                update_stats_wait_start(cfs_rq, prev);
 769                /* Put 'current' back into the tree. */
 770                __enqueue_entity(cfs_rq, prev);
 771        }
 772        cfs_rq->curr = NULL;
 773}
 774
 775static void
 776entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 777{
 778        /*
 779         * Update run-time statistics of the 'current'.
 780         */
 781        update_curr(cfs_rq);
 782
 783#ifdef CONFIG_SCHED_HRTICK
 784        /*
 785         * queued ticks are scheduled to match the slice, so don't bother
 786         * validating it and just reschedule.
 787         */
 788        if (queued) {
 789                resched_task(rq_of(cfs_rq)->curr);
 790                return;
 791        }
 792        /*
 793         * don't let the period tick interfere with the hrtick preemption
 794         */
 795        if (!sched_feat(DOUBLE_TICK) &&
 796                        hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
 797                return;
 798#endif
 799
 800        if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
 801                check_preempt_tick(cfs_rq, curr);
 802}
 803
 804/**************************************************
 805 * CFS operations on tasks:
 806 */
 807
 808#ifdef CONFIG_SCHED_HRTICK
 809static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 810{
 811        int requeue = rq->curr == p;
 812        struct sched_entity *se = &p->se;
 813        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 814
 815        WARN_ON(task_rq(p) != rq);
 816
 817        if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
 818                u64 slice = sched_slice(cfs_rq, se);
 819                u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
 820                s64 delta = slice - ran;
 821
 822                if (delta < 0) {
 823                        if (rq->curr == p)
 824                                resched_task(p);
 825                        return;
 826                }
 827
 828                /*
 829                 * Don't schedule slices shorter than 10000ns, that just
 830                 * doesn't make sense. Rely on vruntime for fairness.
 831                 */
 832                if (!requeue)
 833                        delta = max(10000LL, delta);
 834
 835                hrtick_start(rq, delta, requeue);
 836        }
 837}
 838#else
 839static inline void
 840hrtick_start_fair(struct rq *rq, struct task_struct *p)
 841{
 842}
 843#endif
 844
 845/*
 846 * The enqueue_task method is called before nr_running is
 847 * increased. Here we update the fair scheduling stats and
 848 * then put the task into the rbtree:
 849 */
 850static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 851{
 852        struct cfs_rq *cfs_rq;
 853        struct sched_entity *se = &p->se;
 854
 855        for_each_sched_entity(se) {
 856                if (se->on_rq)
 857                        break;
 858                cfs_rq = cfs_rq_of(se);
 859                enqueue_entity(cfs_rq, se, wakeup);
 860                wakeup = 1;
 861        }
 862
 863        hrtick_start_fair(rq, rq->curr);
 864}
 865
 866/*
 867 * The dequeue_task method is called before nr_running is
 868 * decreased. We remove the task from the rbtree and
 869 * update the fair scheduling stats:
 870 */
 871static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 872{
 873        struct cfs_rq *cfs_rq;
 874        struct sched_entity *se = &p->se;
 875
 876        for_each_sched_entity(se) {
 877                cfs_rq = cfs_rq_of(se);
 878                dequeue_entity(cfs_rq, se, sleep);
 879                /* Don't dequeue parent if it has other entities besides us */
 880                if (cfs_rq->load.weight)
 881                        break;
 882                sleep = 1;
 883        }
 884
 885        hrtick_start_fair(rq, rq->curr);
 886}
 887
 888/*
 889 * sched_yield() support is very simple - we dequeue and enqueue.
 890 *
 891 * If compat_yield is turned on then we requeue to the end of the tree.
 892 */
 893static void yield_task_fair(struct rq *rq)
 894{
 895        struct task_struct *curr = rq->curr;
 896        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 897        struct sched_entity *rightmost, *se = &curr->se;
 898
 899        /*
 900         * Are we the only task in the tree?
 901         */
 902        if (unlikely(cfs_rq->nr_running == 1))
 903                return;
 904
 905        if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
 906                update_rq_clock(rq);
 907                /*
 908                 * Update run-time statistics of the 'current'.
 909                 */
 910                update_curr(cfs_rq);
 911
 912                return;
 913        }
 914        /*
 915         * Find the rightmost entry in the rbtree:
 916         */
 917        rightmost = __pick_last_entity(cfs_rq);
 918        /*
 919         * Already in the rightmost position?
 920         */
 921        if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
 922                return;
 923
 924        /*
 925         * Minimally necessary key value to be last in the tree:
 926         * Upon rescheduling, sched_class::put_prev_task() will place
 927         * 'current' within the tree based on its new key value.
 928         */
 929        se->vruntime = rightmost->vruntime + 1;
 930}
 931
 932/*
 933 * wake_idle() will wake a task on an idle cpu if task->cpu is
 934 * not idle and an idle cpu is available.  The span of cpus to
 935 * search starts with cpus closest then further out as needed,
 936 * so we always favor a closer, idle cpu.
 937 *
 938 * Returns the CPU we should wake onto.
 939 */
 940#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
 941static int wake_idle(int cpu, struct task_struct *p)
 942{
 943        cpumask_t tmp;
 944        struct sched_domain *sd;
 945        int i;
 946
 947        /*
 948         * If it is idle, then it is the best cpu to run this task.
 949         *
 950         * This cpu is also the best, if it has more than one task already.
 951         * Siblings must be also busy(in most cases) as they didn't already
 952         * pickup the extra load from this cpu and hence we need not check
 953         * sibling runqueue info. This will avoid the checks and cache miss
 954         * penalities associated with that.
 955         */
 956        if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
 957                return cpu;
 958
 959        for_each_domain(cpu, sd) {
 960                if ((sd->flags & SD_WAKE_IDLE)
 961                    || ((sd->flags & SD_WAKE_IDLE_FAR)
 962                        && !task_hot(p, task_rq(p)->clock, sd))) {
 963                        cpus_and(tmp, sd->span, p->cpus_allowed);
 964                        for_each_cpu_mask(i, tmp) {
 965                                if (idle_cpu(i)) {
 966                                        if (i != task_cpu(p)) {
 967                                                schedstat_inc(p,
 968                                                       se.nr_wakeups_idle);
 969                                        }
 970                                        return i;
 971                                }
 972                        }
 973                } else {
 974                        break;
 975                }
 976        }
 977        return cpu;
 978}
 979#else
 980static inline int wake_idle(int cpu, struct task_struct *p)
 981{
 982        return cpu;
 983}
 984#endif
 985
 986#ifdef CONFIG_SMP
 987
 988static const struct sched_class fair_sched_class;
 989
 990static int
 991wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 992            struct task_struct *p, int prev_cpu, int this_cpu, int sync,
 993            int idx, unsigned long load, unsigned long this_load,
 994            unsigned int imbalance)
 995{
 996        struct task_struct *curr = this_rq->curr;
 997        unsigned long tl = this_load;
 998        unsigned long tl_per_task;
 999        int balanced;
1000
1001        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1002                return 0;
1003
1004        /*
1005         * If sync wakeup then subtract the (maximum possible)
1006         * effect of the currently running task from the load
1007         * of the current CPU:
1008         */
1009        if (sync)
1010                tl -= current->se.load.weight;
1011
1012        balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
1013
1014        /*
1015         * If the currently running task will sleep within
1016         * a reasonable amount of time then attract this newly
1017         * woken task:
1018         */
1019        if (sync && balanced && curr->sched_class == &fair_sched_class) {
1020                if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1021                                p->se.avg_overlap < sysctl_sched_migration_cost)
1022                        return 1;
1023        }
1024
1025        schedstat_inc(p, se.nr_wakeups_affine_attempts);
1026        tl_per_task = cpu_avg_load_per_task(this_cpu);
1027
1028        if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
1029                        balanced) {
1030                /*
1031                 * This domain has SD_WAKE_AFFINE and
1032                 * p is cache cold in this domain, and
1033                 * there is no bad imbalance.
1034                 */
1035                schedstat_inc(this_sd, ttwu_move_affine);
1036                schedstat_inc(p, se.nr_wakeups_affine);
1037
1038                return 1;
1039        }
1040        return 0;
1041}
1042
1043static int select_task_rq_fair(struct task_struct *p, int sync)
1044{
1045        struct sched_domain *sd, *this_sd = NULL;
1046        int prev_cpu, this_cpu, new_cpu;
1047        unsigned long load, this_load;
1048        struct rq *rq, *this_rq;
1049        unsigned int imbalance;
1050        int idx;
1051
1052        prev_cpu        = task_cpu(p);
1053        rq              = task_rq(p);
1054        this_cpu        = smp_processor_id();
1055        this_rq         = cpu_rq(this_cpu);
1056        new_cpu         = prev_cpu;
1057
1058        /*
1059         * 'this_sd' is the first domain that both
1060         * this_cpu and prev_cpu are present in:
1061         */
1062        for_each_domain(this_cpu, sd) {
1063                if (cpu_isset(prev_cpu, sd->span)) {
1064                        this_sd = sd;
1065                        break;
1066                }
1067        }
1068
1069        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1070                goto out;
1071
1072        /*
1073         * Check for affine wakeup and passive balancing possibilities.
1074         */
1075        if (!this_sd)
1076                goto out;
1077
1078        idx = this_sd->wake_idx;
1079
1080        imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1081
1082        load = source_load(prev_cpu, idx);
1083        this_load = target_load(this_cpu, idx);
1084
1085        if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
1086                                     load, this_load, imbalance))
1087                return this_cpu;
1088
1089        if (prev_cpu == this_cpu)
1090                goto out;
1091
1092        /*
1093         * Start passive balancing when half the imbalance_pct
1094         * limit is reached.
1095         */
1096        if (this_sd->flags & SD_WAKE_BALANCE) {
1097                if (imbalance*this_load <= 100*load) {
1098                        schedstat_inc(this_sd, ttwu_move_balance);
1099                        schedstat_inc(p, se.nr_wakeups_passive);
1100                        return this_cpu;
1101                }
1102        }
1103
1104out:
1105        return wake_idle(new_cpu, p);
1106}
1107#endif /* CONFIG_SMP */
1108
1109static unsigned long wakeup_gran(struct sched_entity *se)
1110{
1111        unsigned long gran = sysctl_sched_wakeup_granularity;
1112
1113        /*
1114         * More easily preempt - nice tasks, while not making
1115         * it harder for + nice tasks.
1116         */
1117        if (unlikely(se->load.weight > NICE_0_LOAD))
1118                gran = calc_delta_fair(gran, &se->load);
1119
1120        return gran;
1121}
1122
1123/*
1124 * Should 'se' preempt 'curr'.
1125 *
1126 *             |s1
1127 *        |s2
1128 *   |s3
1129 *         g
1130 *      |<--->|c
1131 *
1132 *  w(c, s1) = -1
1133 *  w(c, s2) =  0
1134 *  w(c, s3) =  1
1135 *
1136 */
1137static int
1138wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1139{
1140        s64 gran, vdiff = curr->vruntime - se->vruntime;
1141
1142        if (vdiff < 0)
1143                return -1;
1144
1145        gran = wakeup_gran(curr);
1146        if (vdiff > gran)
1147                return 1;
1148
1149        return 0;
1150}
1151
1152/* return depth at which a sched entity is present in the hierarchy */
1153static inline int depth_se(struct sched_entity *se)
1154{
1155        int depth = 0;
1156
1157        for_each_sched_entity(se)
1158                depth++;
1159
1160        return depth;
1161}
1162
1163/*
1164 * Preempt the current task with a newly woken task if needed:
1165 */
1166static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1167{
1168        struct task_struct *curr = rq->curr;
1169        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1170        struct sched_entity *se = &curr->se, *pse = &p->se;
1171        int se_depth, pse_depth;
1172
1173        if (unlikely(rt_prio(p->prio))) {
1174                update_rq_clock(rq);
1175                update_curr(cfs_rq);
1176                resched_task(curr);
1177                return;
1178        }
1179
1180        se->last_wakeup = se->sum_exec_runtime;
1181        if (unlikely(se == pse))
1182                return;
1183
1184        cfs_rq_of(pse)->next = pse;
1185
1186        /*
1187         * Batch tasks do not preempt (their preemption is driven by
1188         * the tick):
1189         */
1190        if (unlikely(p->policy == SCHED_BATCH))
1191                return;
1192
1193        if (!sched_feat(WAKEUP_PREEMPT))
1194                return;
1195
1196        /*
1197         * preemption test can be made between sibling entities who are in the
1198         * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
1199         * both tasks until we find their ancestors who are siblings of common
1200         * parent.
1201         */
1202
1203        /* First walk up until both entities are at same depth */
1204        se_depth = depth_se(se);
1205        pse_depth = depth_se(pse);
1206
1207        while (se_depth > pse_depth) {
1208                se_depth--;
1209                se = parent_entity(se);
1210        }
1211
1212        while (pse_depth > se_depth) {
1213                pse_depth--;
1214                pse = parent_entity(pse);
1215        }
1216
1217        while (!is_same_group(se, pse)) {
1218                se = parent_entity(se);
1219                pse = parent_entity(pse);
1220        }
1221
1222        if (wakeup_preempt_entity(se, pse) == 1)
1223                resched_task(curr);
1224}
1225
1226static struct task_struct *pick_next_task_fair(struct rq *rq)
1227{
1228        struct task_struct *p;
1229        struct cfs_rq *cfs_rq = &rq->cfs;
1230        struct sched_entity *se;
1231
1232        if (unlikely(!cfs_rq->nr_running))
1233                return NULL;
1234
1235        do {
1236                se = pick_next_entity(cfs_rq);
1237                cfs_rq = group_cfs_rq(se);
1238        } while (cfs_rq);
1239
1240        p = task_of(se);
1241        hrtick_start_fair(rq, p);
1242
1243        return p;
1244}
1245
1246/*
1247 * Account for a descheduled task:
1248 */
1249static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1250{
1251        struct sched_entity *se = &prev->se;
1252        struct cfs_rq *cfs_rq;
1253
1254        for_each_sched_entity(se) {
1255                cfs_rq = cfs_rq_of(se);
1256                put_prev_entity(cfs_rq, se);
1257        }
1258}
1259
1260#ifdef CONFIG_SMP
1261/**************************************************
1262 * Fair scheduling class load-balancing methods:
1263 */
1264
1265/*
1266 * Load-balancing iterator. Note: while the runqueue stays locked
1267 * during the whole iteration, the current task might be
1268 * dequeued so the iterator has to be dequeue-safe. Here we
1269 * achieve that by always pre-iterating before returning
1270 * the current task:
1271 */
1272static struct task_struct *
1273__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1274{
1275        struct task_struct *p = NULL;
1276        struct sched_entity *se;
1277
1278        if (next == &cfs_rq->tasks)
1279                return NULL;
1280
1281        /* Skip over entities that are not tasks */
1282        do {
1283                se = list_entry(next, struct sched_entity, group_node);
1284                next = next->next;
1285        } while (next != &cfs_rq->tasks && !entity_is_task(se));
1286
1287        if (next == &cfs_rq->tasks)
1288                return NULL;
1289
1290        cfs_rq->balance_iterator = next;
1291
1292        if (entity_is_task(se))
1293                p = task_of(se);
1294
1295        return p;
1296}
1297
1298static struct task_struct *load_balance_start_fair(void *arg)
1299{
1300        struct cfs_rq *cfs_rq = arg;
1301
1302        return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
1303}
1304
1305static struct task_struct *load_balance_next_fair(void *arg)
1306{
1307        struct cfs_rq *cfs_rq = arg;
1308
1309        return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
1310}
1311
1312#ifdef CONFIG_FAIR_GROUP_SCHED
1313static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
1314{
1315        struct sched_entity *curr;
1316        struct task_struct *p;
1317
1318        if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1319                return MAX_PRIO;
1320
1321        curr = cfs_rq->curr;
1322        if (!curr)
1323                curr = __pick_next_entity(cfs_rq);
1324
1325        p = task_of(curr);
1326
1327        return p->prio;
1328}
1329#endif
1330
1331static unsigned long
1332load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1333                  unsigned long max_load_move,
1334                  struct sched_domain *sd, enum cpu_idle_type idle,
1335                  int *all_pinned, int *this_best_prio)
1336{
1337        struct cfs_rq *busy_cfs_rq;
1338        long rem_load_move = max_load_move;
1339        struct rq_iterator cfs_rq_iterator;
1340
1341        cfs_rq_iterator.start = load_balance_start_fair;
1342        cfs_rq_iterator.next = load_balance_next_fair;
1343
1344        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1345#ifdef CONFIG_FAIR_GROUP_SCHED
1346                struct cfs_rq *this_cfs_rq;
1347                long imbalance;
1348                unsigned long maxload;
1349
1350                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
1351
1352                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
1353                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
1354                if (imbalance <= 0)
1355                        continue;
1356
1357                /* Don't pull more than imbalance/2 */
1358                imbalance /= 2;
1359                maxload = min(rem_load_move, imbalance);
1360
1361                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
1362#else
1363# define maxload rem_load_move
1364#endif
1365                /*
1366                 * pass busy_cfs_rq argument into
1367                 * load_balance_[start|next]_fair iterators
1368                 */
1369                cfs_rq_iterator.arg = busy_cfs_rq;
1370                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1371                                               maxload, sd, idle, all_pinned,
1372                                               this_best_prio,
1373                                               &cfs_rq_iterator);
1374
1375                if (rem_load_move <= 0)
1376                        break;
1377        }
1378
1379        return max_load_move - rem_load_move;
1380}
1381
1382static int
1383move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1384                   struct sched_domain *sd, enum cpu_idle_type idle)
1385{
1386        struct cfs_rq *busy_cfs_rq;
1387        struct rq_iterator cfs_rq_iterator;
1388
1389        cfs_rq_iterator.start = load_balance_start_fair;
1390        cfs_rq_iterator.next = load_balance_next_fair;
1391
1392        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1393                /*
1394                 * pass busy_cfs_rq argument into
1395                 * load_balance_[start|next]_fair iterators
1396                 */
1397                cfs_rq_iterator.arg = busy_cfs_rq;
1398                if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
1399                                       &cfs_rq_iterator))
1400                    return 1;
1401        }
1402
1403        return 0;
1404}
1405#endif
1406
1407/*
1408 * scheduler tick hitting a task of our scheduling class:
1409 */
1410static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1411{
1412        struct cfs_rq *cfs_rq;
1413        struct sched_entity *se = &curr->se;
1414
1415        for_each_sched_entity(se) {
1416                cfs_rq = cfs_rq_of(se);
1417                entity_tick(cfs_rq, se, queued);
1418        }
1419}
1420
1421#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1422
1423/*
1424 * Share the fairness runtime between parent and child, thus the
1425 * total amount of pressure for CPU stays equal - new tasks
1426 * get a chance to run but frequent forkers are not allowed to
1427 * monopolize the CPU. Note: the parent runqueue is locked,
1428 * the child is not running yet.
1429 */
1430static void task_new_fair(struct rq *rq, struct task_struct *p)
1431{
1432        struct cfs_rq *cfs_rq = task_cfs_rq(p);
1433        struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1434        int this_cpu = smp_processor_id();
1435
1436        sched_info_queued(p);
1437
1438        update_curr(cfs_rq);
1439        place_entity(cfs_rq, se, 1);
1440
1441        /* 'curr' will be NULL if the child belongs to a different group */
1442        if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1443                        curr && curr->vruntime < se->vruntime) {
1444                /*
1445                 * Upon rescheduling, sched_class::put_prev_task() will place
1446                 * 'current' within the tree based on its new key value.
1447                 */
1448                swap(curr->vruntime, se->vruntime);
1449        }
1450
1451        enqueue_task_fair(rq, p, 0);
1452        resched_task(rq->curr);
1453}
1454
1455/*
1456 * Priority of the task has changed. Check to see if we preempt
1457 * the current task.
1458 */
1459static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1460                              int oldprio, int running)
1461{
1462        /*
1463         * Reschedule if we are currently running on this runqueue and
1464         * our priority decreased, or if we are not currently running on
1465         * this runqueue and our priority is higher than the current's
1466         */
1467        if (running) {
1468                if (p->prio > oldprio)
1469                        resched_task(rq->curr);
1470        } else
1471                check_preempt_curr(rq, p);
1472}
1473
1474/*
1475 * We switched to the sched_fair class.
1476 */
1477static void switched_to_fair(struct rq *rq, struct task_struct *p,
1478                             int running)
1479{
1480        /*
1481         * We were most likely switched from sched_rt, so
1482         * kick off the schedule if running, otherwise just see
1483         * if we can still preempt the current task.
1484         */
1485        if (running)
1486                resched_task(rq->curr);
1487        else
1488                check_preempt_curr(rq, p);
1489}
1490
1491/* Account for a task changing its policy or group.
1492 *
1493 * This routine is mostly called to set cfs_rq->curr field when a task
1494 * migrates between groups/classes.
1495 */
1496static void set_curr_task_fair(struct rq *rq)
1497{
1498        struct sched_entity *se = &rq->curr->se;
1499
1500        for_each_sched_entity(se)
1501                set_next_entity(cfs_rq_of(se), se);
1502}
1503
1504#ifdef CONFIG_FAIR_GROUP_SCHED
1505static void moved_group_fair(struct task_struct *p)
1506{
1507        struct cfs_rq *cfs_rq = task_cfs_rq(p);
1508
1509        update_curr(cfs_rq);
1510        place_entity(cfs_rq, &p->se, 1);
1511}
1512#endif
1513
1514/*
1515 * All the scheduling class methods:
1516 */
1517static const struct sched_class fair_sched_class = {
1518        .next                   = &idle_sched_class,
1519        .enqueue_task           = enqueue_task_fair,
1520        .dequeue_task           = dequeue_task_fair,
1521        .yield_task             = yield_task_fair,
1522#ifdef CONFIG_SMP
1523        .select_task_rq         = select_task_rq_fair,
1524#endif /* CONFIG_SMP */
1525
1526        .check_preempt_curr     = check_preempt_wakeup,
1527
1528        .pick_next_task         = pick_next_task_fair,
1529        .put_prev_task          = put_prev_task_fair,
1530
1531#ifdef CONFIG_SMP
1532        .load_balance           = load_balance_fair,
1533        .move_one_task          = move_one_task_fair,
1534#endif
1535
1536        .set_curr_task          = set_curr_task_fair,
1537        .task_tick              = task_tick_fair,
1538        .task_new               = task_new_fair,
1539
1540        .prio_changed           = prio_changed_fair,
1541        .switched_to            = switched_to_fair,
1542
1543#ifdef CONFIG_FAIR_GROUP_SCHED
1544        .moved_group            = moved_group_fair,
1545#endif
1546};
1547
1548#ifdef CONFIG_SCHED_DEBUG
1549static void print_cfs_stats(struct seq_file *m, int cpu)
1550{
1551        struct cfs_rq *cfs_rq;
1552
1553        rcu_read_lock();
1554        for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1555                print_cfs_rq(m, cpu, cfs_rq);
1556        rcu_read_unlock();
1557}
1558#endif
1559
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.