linux/kernel/sched_rt.c
   1/*
   2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   3 * policies)
   4 */
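/*
 * Illustrative only: tasks normally enter this class from userspace via
 * sched_setscheduler() with one of the two policies, roughly as sketched
 * below (error handling and capability checks omitted):
 *
 *	struct sched_param param = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(0, SCHED_FIFO, &param) == -1)
 *		perror("sched_setscheduler");
 */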
   5
   6#ifdef CONFIG_RT_GROUP_SCHED
   7
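/*
 * With group scheduling a sched_rt_entity either represents a single task
 * or a whole child runqueue (rt_se->my_q); only the former can be mapped
 * back to a task_struct.
 */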
   8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
   9
  10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  11{
  12#ifdef CONFIG_SCHED_DEBUG
  13        WARN_ON_ONCE(!rt_entity_is_task(rt_se));
  14#endif
  15        return container_of(rt_se, struct task_struct, rt);
  16}
  17
  18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  19{
  20        return rt_rq->rq;
  21}
  22
  23static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  24{
  25        return rt_se->rt_rq;
  26}
  27
  28#else /* CONFIG_RT_GROUP_SCHED */
  29
  30#define rt_entity_is_task(rt_se) (1)
  31
  32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  33{
  34        return container_of(rt_se, struct task_struct, rt);
  35}
  36
  37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  38{
  39        return container_of(rt_rq, struct rq, rt);
  40}
  41
  42static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  43{
  44        struct task_struct *p = rt_task_of(rt_se);
  45        struct rq *rq = task_rq(p);
  46
  47        return &rq->rt;
  48}
  49
  50#endif /* CONFIG_RT_GROUP_SCHED */
  51
  52#ifdef CONFIG_SMP
  53
  54static inline int rt_overloaded(struct rq *rq)
  55{
  56        return atomic_read(&rq->rd->rto_count);
  57}
  58
  59static inline void rt_set_overload(struct rq *rq)
  60{
  61        if (!rq->online)
  62                return;
  63
  64        cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
  65        /*
  66         * Make sure the mask is visible before we set
  67         * the overload count. That is checked to determine
  68         * if we should look at the mask. It would be a shame
  69         * if we looked at the mask, but the mask was not
  70         * updated yet.
  71         */
  72        wmb();
  73        atomic_inc(&rq->rd->rto_count);
  74}
  75
  76static inline void rt_clear_overload(struct rq *rq)
  77{
  78        if (!rq->online)
  79                return;
  80
  81        /* the order here really doesn't matter */
  82        atomic_dec(&rq->rd->rto_count);
  83        cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
  84}
  85
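/*
 * A runqueue is considered "overloaded" when it holds more than one
 * runnable RT task and at least one of them is allowed to run elsewhere.
 * The state is published in the root domain (rto_mask/rto_count) so other
 * CPUs know where pulling might be worthwhile.
 */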
  86static void update_rt_migration(struct rt_rq *rt_rq)
  87{
  88        if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
  89                if (!rt_rq->overloaded) {
  90                        rt_set_overload(rq_of_rt_rq(rt_rq));
  91                        rt_rq->overloaded = 1;
  92                }
  93        } else if (rt_rq->overloaded) {
  94                rt_clear_overload(rq_of_rt_rq(rt_rq));
  95                rt_rq->overloaded = 0;
  96        }
  97}
  98
  99static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 100{
 101        if (!rt_entity_is_task(rt_se))
 102                return;
 103
 104        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 105
 106        rt_rq->rt_nr_total++;
 107        if (rt_se->nr_cpus_allowed > 1)
 108                rt_rq->rt_nr_migratory++;
 109
 110        update_rt_migration(rt_rq);
 111}
 112
 113static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 114{
 115        if (!rt_entity_is_task(rt_se))
 116                return;
 117
 118        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 119
 120        rt_rq->rt_nr_total--;
 121        if (rt_se->nr_cpus_allowed > 1)
 122                rt_rq->rt_nr_migratory--;
 123
 124        update_rt_migration(rt_rq);
 125}
 126
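/*
 * Pushable tasks live on a priority-sorted plist per runqueue. Deleting
 * the node before re-initialising and re-adding it keeps the list sorted
 * even when the task's priority has changed since it was last queued.
 */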
 127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 128{
 129        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 130        plist_node_init(&p->pushable_tasks, p->prio);
 131        plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 132}
 133
 134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 135{
 136        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 137}
 138
 139static inline int has_pushable_tasks(struct rq *rq)
 140{
 141        return !plist_head_empty(&rq->rt.pushable_tasks);
 142}
 143
 144#else
 145
 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 147{
 148}
 149
 150static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 151{
 152}
 153
 154static inline
 155void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 156{
 157}
 158
 159static inline
 160void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 161{
 162}
 163
 164#endif /* CONFIG_SMP */
 165
 166static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 167{
 168        return !list_empty(&rt_se->run_list);
 169}
 170
 171#ifdef CONFIG_RT_GROUP_SCHED
 172
 173static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 174{
 175        if (!rt_rq->tg)
 176                return RUNTIME_INF;
 177
 178        return rt_rq->rt_runtime;
 179}
 180
 181static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 182{
 183        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 184}
 185
 186typedef struct task_group *rt_rq_iter_t;
 187
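/*
 * Advance to the next task group on the RCU-protected task_groups list,
 * skipping autogroups; NULL is returned once the list head is reached
 * again.
 */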
 188static inline struct task_group *next_task_group(struct task_group *tg)
 189{
 190        do {
 191                tg = list_entry_rcu(tg->list.next,
 192                        typeof(struct task_group), list);
 193        } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 194
 195        if (&tg->list == &task_groups)
 196                tg = NULL;
 197
 198        return tg;
 199}
 200
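/*
 * Iterate over every rt_rq of a CPU, one per task group. @iter walks the
 * task_groups list and the loop ends when next_task_group() runs out of
 * groups.
 */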
 201#define for_each_rt_rq(rt_rq, iter, rq)                                 \
 202        for (iter = container_of(&task_groups, typeof(*iter), list);    \
 203                (iter = next_task_group(iter)) &&                       \
 204                (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 205
 206static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 207{
 208        list_add_rcu(&rt_rq->leaf_rt_rq_list,
 209                        &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
 210}
 211
 212static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
 213{
 214        list_del_rcu(&rt_rq->leaf_rt_rq_list);
 215}
 216
 217#define for_each_leaf_rt_rq(rt_rq, rq) \
 218        list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 219
 220#define for_each_sched_rt_entity(rt_se) \
 221        for (; rt_se; rt_se = rt_se->parent)
 222
 223static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 224{
 225        return rt_se->my_q;
 226}
 227
 228static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
 229static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 230
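/*
 * (Re)enqueue a group's own sched_rt_entity on its parent, e.g. after the
 * group has been unthrottled, and preempt the running task if the group
 * now carries a higher priority.
 */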
 231static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 232{
 233        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 234        struct sched_rt_entity *rt_se;
 235
 236        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 237
 238        rt_se = rt_rq->tg->rt_se[cpu];
 239
 240        if (rt_rq->rt_nr_running) {
 241                if (rt_se && !on_rt_rq(rt_se))
 242                        enqueue_rt_entity(rt_se, false);
 243                if (rt_rq->highest_prio.curr < curr->prio)
 244                        resched_task(curr);
 245        }
 246}
 247
 248static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 249{
 250        struct sched_rt_entity *rt_se;
 251        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 252
 253        rt_se = rt_rq->tg->rt_se[cpu];
 254
 255        if (rt_se && on_rt_rq(rt_se))
 256                dequeue_rt_entity(rt_se);
 257}
 258
 259static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 260{
 261        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 262}
 263
 264static int rt_se_boosted(struct sched_rt_entity *rt_se)
 265{
 266        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 267        struct task_struct *p;
 268
 269        if (rt_rq)
 270                return !!rt_rq->rt_nr_boosted;
 271
 272        p = rt_task_of(rt_se);
 273        return p->prio != p->normal_prio;
 274}
 275
 276#ifdef CONFIG_SMP
 277static inline const struct cpumask *sched_rt_period_mask(void)
 278{
 279        return cpu_rq(smp_processor_id())->rd->span;
 280}
 281#else
 282static inline const struct cpumask *sched_rt_period_mask(void)
 283{
 284        return cpu_online_mask;
 285}
 286#endif
 287
 288static inline
 289struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 290{
 291        return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 292}
 293
 294static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 295{
 296        return &rt_rq->tg->rt_bandwidth;
 297}
 298
 299#else /* !CONFIG_RT_GROUP_SCHED */
 300
 301static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 302{
 303        return rt_rq->rt_runtime;
 304}
 305
 306static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 307{
 308        return ktime_to_ns(def_rt_bandwidth.rt_period);
 309}
 310
 311typedef struct rt_rq *rt_rq_iter_t;
 312
 313#define for_each_rt_rq(rt_rq, iter, rq) \
 314        for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 315
 316static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 317{
 318}
 319
 320static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
 321{
 322}
 323
 324#define for_each_leaf_rt_rq(rt_rq, rq) \
 325        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 326
 327#define for_each_sched_rt_entity(rt_se) \
 328        for (; rt_se; rt_se = NULL)
 329
 330static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 331{
 332        return NULL;
 333}
 334
 335static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 336{
 337        if (rt_rq->rt_nr_running)
 338                resched_task(rq_of_rt_rq(rt_rq)->curr);
 339}
 340
 341static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 342{
 343}
 344
 345static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 346{
 347        return rt_rq->rt_throttled;
 348}
 349
 350static inline const struct cpumask *sched_rt_period_mask(void)
 351{
 352        return cpu_online_mask;
 353}
 354
 355static inline
 356struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 357{
 358        return &cpu_rq(cpu)->rt;
 359}
 360
 361static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 362{
 363        return &def_rt_bandwidth;
 364}
 365
 366#endif /* CONFIG_RT_GROUP_SCHED */
 367
 368#ifdef CONFIG_SMP
 369/*
 370 * We ran out of runtime, see if we can borrow some from our neighbours.
 371 */
 372static int do_balance_runtime(struct rt_rq *rt_rq)
 373{
 374        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 375        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 376        int i, weight, more = 0;
 377        u64 rt_period;
 378
 379        weight = cpumask_weight(rd->span);
 380
 381        raw_spin_lock(&rt_b->rt_runtime_lock);
 382        rt_period = ktime_to_ns(rt_b->rt_period);
 383        for_each_cpu(i, rd->span) {
 384                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 385                s64 diff;
 386
 387                if (iter == rt_rq)
 388                        continue;
 389
 390                raw_spin_lock(&iter->rt_runtime_lock);
 391                /*
 392                 * Either all rqs have inf runtime and there's nothing to steal
 393                 * or __disable_runtime() below sets a specific rq to inf to
  394                 * indicate it's been disabled and disallow stealing.
 395                 */
 396                if (iter->rt_runtime == RUNTIME_INF)
 397                        goto next;
 398
 399                /*
 400                 * From runqueues with spare time, take 1/n part of their
 401                 * spare time, but no more than our period.
 402                 */
 403                diff = iter->rt_runtime - iter->rt_time;
 404                if (diff > 0) {
 405                        diff = div_u64((u64)diff, weight);
 406                        if (rt_rq->rt_runtime + diff > rt_period)
 407                                diff = rt_period - rt_rq->rt_runtime;
 408                        iter->rt_runtime -= diff;
 409                        rt_rq->rt_runtime += diff;
 410                        more = 1;
 411                        if (rt_rq->rt_runtime == rt_period) {
 412                                raw_spin_unlock(&iter->rt_runtime_lock);
 413                                break;
 414                        }
 415                }
 416next:
 417                raw_spin_unlock(&iter->rt_runtime_lock);
 418        }
 419        raw_spin_unlock(&rt_b->rt_runtime_lock);
 420
 421        return more;
 422}
 423
 424/*
  425 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 426 */
 427static void __disable_runtime(struct rq *rq)
 428{
 429        struct root_domain *rd = rq->rd;
 430        rt_rq_iter_t iter;
 431        struct rt_rq *rt_rq;
 432
 433        if (unlikely(!scheduler_running))
 434                return;
 435
 436        for_each_rt_rq(rt_rq, iter, rq) {
 437                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 438                s64 want;
 439                int i;
 440
 441                raw_spin_lock(&rt_b->rt_runtime_lock);
 442                raw_spin_lock(&rt_rq->rt_runtime_lock);
 443                /*
 444                 * Either we're all inf and nobody needs to borrow, or we're
 445                 * already disabled and thus have nothing to do, or we have
 446                 * exactly the right amount of runtime to take out.
 447                 */
 448                if (rt_rq->rt_runtime == RUNTIME_INF ||
 449                                rt_rq->rt_runtime == rt_b->rt_runtime)
 450                        goto balanced;
 451                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 452
 453                /*
 454                 * Calculate the difference between what we started out with
  455                 * and what we currently have, that's the amount of runtime
  456                 * we lent and now have to reclaim.
 457                 */
 458                want = rt_b->rt_runtime - rt_rq->rt_runtime;
 459
 460                /*
 461                 * Greedy reclaim, take back as much as we can.
 462                 */
 463                for_each_cpu(i, rd->span) {
 464                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 465                        s64 diff;
 466
 467                        /*
 468                         * Can't reclaim from ourselves or disabled runqueues.
 469                         */
 470                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 471                                continue;
 472
 473                        raw_spin_lock(&iter->rt_runtime_lock);
 474                        if (want > 0) {
 475                                diff = min_t(s64, iter->rt_runtime, want);
 476                                iter->rt_runtime -= diff;
 477                                want -= diff;
 478                        } else {
 479                                iter->rt_runtime -= want;
 480                                want -= want;
 481                        }
 482                        raw_spin_unlock(&iter->rt_runtime_lock);
 483
 484                        if (!want)
 485                                break;
 486                }
 487
 488                raw_spin_lock(&rt_rq->rt_runtime_lock);
 489                /*
 490                 * We cannot be left wanting - that would mean some runtime
 491                 * leaked out of the system.
 492                 */
 493                BUG_ON(want);
 494balanced:
 495                /*
 496                 * Disable all the borrow logic by pretending we have inf
 497                 * runtime - in which case borrowing doesn't make sense.
 498                 */
 499                rt_rq->rt_runtime = RUNTIME_INF;
 500                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 501                raw_spin_unlock(&rt_b->rt_runtime_lock);
 502        }
 503}
 504
 505static void disable_runtime(struct rq *rq)
 506{
 507        unsigned long flags;
 508
 509        raw_spin_lock_irqsave(&rq->lock, flags);
 510        __disable_runtime(rq);
 511        raw_spin_unlock_irqrestore(&rq->lock, flags);
 512}
 513
 514static void __enable_runtime(struct rq *rq)
 515{
 516        rt_rq_iter_t iter;
 517        struct rt_rq *rt_rq;
 518
 519        if (unlikely(!scheduler_running))
 520                return;
 521
 522        /*
 523         * Reset each runqueue's bandwidth settings
 524         */
 525        for_each_rt_rq(rt_rq, iter, rq) {
 526                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 527
 528                raw_spin_lock(&rt_b->rt_runtime_lock);
 529                raw_spin_lock(&rt_rq->rt_runtime_lock);
 530                rt_rq->rt_runtime = rt_b->rt_runtime;
 531                rt_rq->rt_time = 0;
 532                rt_rq->rt_throttled = 0;
 533                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 534                raw_spin_unlock(&rt_b->rt_runtime_lock);
 535        }
 536}
 537
 538static void enable_runtime(struct rq *rq)
 539{
 540        unsigned long flags;
 541
 542        raw_spin_lock_irqsave(&rq->lock, flags);
 543        __enable_runtime(rq);
 544        raw_spin_unlock_irqrestore(&rq->lock, flags);
 545}
 546
 547static int balance_runtime(struct rt_rq *rt_rq)
 548{
 549        int more = 0;
 550
 551        if (rt_rq->rt_time > rt_rq->rt_runtime) {
 552                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 553                more = do_balance_runtime(rt_rq);
 554                raw_spin_lock(&rt_rq->rt_runtime_lock);
 555        }
 556
 557        return more;
 558}
 559#else /* !CONFIG_SMP */
 560static inline int balance_runtime(struct rt_rq *rt_rq)
 561{
 562        return 0;
 563}
 564#endif /* CONFIG_SMP */
 565
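/*
 * Runs from the rt_bandwidth period timer: refill rt_time on each rt_rq in
 * the period mask, unthrottle runqueues that fit within their runtime
 * again, and return whether everything is idle so the timer can stop.
 */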
 566static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 567{
 568        int i, idle = 1;
 569        const struct cpumask *span;
 570
 571        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 572                return 1;
 573
 574        span = sched_rt_period_mask();
 575        for_each_cpu(i, span) {
 576                int enqueue = 0;
 577                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 578                struct rq *rq = rq_of_rt_rq(rt_rq);
 579
 580                raw_spin_lock(&rq->lock);
 581                if (rt_rq->rt_time) {
 582                        u64 runtime;
 583
 584                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 585                        if (rt_rq->rt_throttled)
 586                                balance_runtime(rt_rq);
 587                        runtime = rt_rq->rt_runtime;
 588                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 589                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 590                                rt_rq->rt_throttled = 0;
 591                                enqueue = 1;
 592
 593                                /*
 594                                 * Force a clock update if the CPU was idle,
 595                                 * lest wakeup -> unthrottle time accumulate.
 596                                 */
 597                                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 598                                        rq->skip_clock_update = -1;
 599                        }
 600                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
 601                                idle = 0;
 602                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 603                } else if (rt_rq->rt_nr_running) {
 604                        idle = 0;
 605                        if (!rt_rq_throttled(rt_rq))
 606                                enqueue = 1;
 607                }
 608
 609                if (enqueue)
 610                        sched_rt_rq_enqueue(rt_rq);
 611                raw_spin_unlock(&rq->lock);
 612        }
 613
 614        return idle;
 615}
 616
 617static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 618{
 619#ifdef CONFIG_RT_GROUP_SCHED
 620        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 621
 622        if (rt_rq)
 623                return rt_rq->highest_prio.curr;
 624#endif
 625
 626        return rt_task_of(rt_se)->prio;
 627}
 628
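/*
 * Throttling in numbers, assuming the usual defaults of
 * sched_rt_runtime_us = 950000 and sched_rt_period_us = 1000000: once an
 * rt_rq accumulates more than 950ms of rt_time inside a 1s period it is
 * throttled and dequeued, and stays that way until the period timer
 * replenishes its runtime (or balance_runtime() borrows some from a
 * neighbour).
 */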
 629static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 630{
 631        u64 runtime = sched_rt_runtime(rt_rq);
 632
 633        if (rt_rq->rt_throttled)
 634                return rt_rq_throttled(rt_rq);
 635
 636        if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
 637                return 0;
 638
 639        balance_runtime(rt_rq);
 640        runtime = sched_rt_runtime(rt_rq);
 641        if (runtime == RUNTIME_INF)
 642                return 0;
 643
 644        if (rt_rq->rt_time > runtime) {
 645                rt_rq->rt_throttled = 1;
 646                if (rt_rq_throttled(rt_rq)) {
 647                        sched_rt_rq_dequeue(rt_rq);
 648                        return 1;
 649                }
 650        }
 651
 652        return 0;
 653}
 654
 655/*
 656 * Update the current task's runtime statistics. Skip current tasks that
 657 * are not in our scheduling class.
 658 */
 659static void update_curr_rt(struct rq *rq)
 660{
 661        struct task_struct *curr = rq->curr;
 662        struct sched_rt_entity *rt_se = &curr->rt;
 663        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 664        u64 delta_exec;
 665
 666        if (curr->sched_class != &rt_sched_class)
 667                return;
 668
 669        delta_exec = rq->clock_task - curr->se.exec_start;
 670        if (unlikely((s64)delta_exec < 0))
 671                delta_exec = 0;
 672
 673        schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
 674
 675        curr->se.sum_exec_runtime += delta_exec;
 676        account_group_exec_runtime(curr, delta_exec);
 677
 678        curr->se.exec_start = rq->clock_task;
 679        cpuacct_charge(curr, delta_exec);
 680
 681        sched_rt_avg_update(rq, delta_exec);
 682
 683        if (!rt_bandwidth_enabled())
 684                return;
 685
 686        for_each_sched_rt_entity(rt_se) {
 687                rt_rq = rt_rq_of_se(rt_se);
 688
 689                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 690                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 691                        rt_rq->rt_time += delta_exec;
 692                        if (sched_rt_runtime_exceeded(rt_rq))
 693                                resched_task(curr);
 694                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 695                }
 696        }
 697}
 698
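/*
 * Keep the two highest priorities of an rt_rq cached (highest_prio.curr
 * and .next). The .next value feeds the pull logic, letting remote CPUs
 * decide without taking our lock whether we have anything worth pulling.
 */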
 699#if defined CONFIG_SMP
 700
 701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
 702
 703static inline int next_prio(struct rq *rq)
 704{
 705        struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
 706
 707        if (next && rt_prio(next->prio))
 708                return next->prio;
 709        else
 710                return MAX_RT_PRIO;
 711}
 712
 713static void
 714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 715{
 716        struct rq *rq = rq_of_rt_rq(rt_rq);
 717
 718        if (prio < prev_prio) {
 719
 720                /*
 721                 * If the new task is higher in priority than anything on the
 722                 * run-queue, we know that the previous high becomes our
 723                 * next-highest.
 724                 */
 725                rt_rq->highest_prio.next = prev_prio;
 726
 727                if (rq->online)
 728                        cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 729
 730        } else if (prio == rt_rq->highest_prio.curr)
 731                /*
 732                 * If the next task is equal in priority to the highest on
 733                 * the run-queue, then we implicitly know that the next highest
 734                 * task cannot be any lower than current
 735                 */
 736                rt_rq->highest_prio.next = prio;
 737        else if (prio < rt_rq->highest_prio.next)
 738                /*
 739                 * Otherwise, we need to recompute next-highest
 740                 */
 741                rt_rq->highest_prio.next = next_prio(rq);
 742}
 743
 744static void
 745dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 746{
 747        struct rq *rq = rq_of_rt_rq(rt_rq);
 748
 749        if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
 750                rt_rq->highest_prio.next = next_prio(rq);
 751
 752        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 753                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 754}
 755
 756#else /* CONFIG_SMP */
 757
 758static inline
 759void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 760static inline
 761void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 762
 763#endif /* CONFIG_SMP */
 764
 765#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 766static void
 767inc_rt_prio(struct rt_rq *rt_rq, int prio)
 768{
 769        int prev_prio = rt_rq->highest_prio.curr;
 770
 771        if (prio < prev_prio)
 772                rt_rq->highest_prio.curr = prio;
 773
 774        inc_rt_prio_smp(rt_rq, prio, prev_prio);
 775}
 776
 777static void
 778dec_rt_prio(struct rt_rq *rt_rq, int prio)
 779{
 780        int prev_prio = rt_rq->highest_prio.curr;
 781
 782        if (rt_rq->rt_nr_running) {
 783
 784                WARN_ON(prio < prev_prio);
 785
 786                /*
 787                 * This may have been our highest task, and therefore
 788                 * we may have some recomputation to do
 789                 */
 790                if (prio == prev_prio) {
 791                        struct rt_prio_array *array = &rt_rq->active;
 792
 793                        rt_rq->highest_prio.curr =
 794                                sched_find_first_bit(array->bitmap);
 795                }
 796
 797        } else
 798                rt_rq->highest_prio.curr = MAX_RT_PRIO;
 799
 800        dec_rt_prio_smp(rt_rq, prio, prev_prio);
 801}
 802
 803#else
 804
 805static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
 806static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
 807
 808#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 809
 810#ifdef CONFIG_RT_GROUP_SCHED
 811
 812static void
 813inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 814{
 815        if (rt_se_boosted(rt_se))
 816                rt_rq->rt_nr_boosted++;
 817
 818        if (rt_rq->tg)
 819                start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
 820}
 821
 822static void
 823dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 824{
 825        if (rt_se_boosted(rt_se))
 826                rt_rq->rt_nr_boosted--;
 827
 828        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
 829}
 830
 831#else /* CONFIG_RT_GROUP_SCHED */
 832
 833static void
 834inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 835{
 836        start_rt_bandwidth(&def_rt_bandwidth);
 837}
 838
 839static inline
 840void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 841
 842#endif /* CONFIG_RT_GROUP_SCHED */
 843
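/*
 * Accounting shared by every enqueue/dequeue: nr_running, the cached
 * highest priority, migration/overload state and group bandwidth.
 */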
 844static inline
 845void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 846{
 847        int prio = rt_se_prio(rt_se);
 848
 849        WARN_ON(!rt_prio(prio));
 850        rt_rq->rt_nr_running++;
 851
 852        inc_rt_prio(rt_rq, prio);
 853        inc_rt_migration(rt_se, rt_rq);
 854        inc_rt_group(rt_se, rt_rq);
 855}
 856
 857static inline
 858void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 859{
 860        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 861        WARN_ON(!rt_rq->rt_nr_running);
 862        rt_rq->rt_nr_running--;
 863
 864        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
 865        dec_rt_migration(rt_se, rt_rq);
 866        dec_rt_group(rt_se, rt_rq);
 867}
 868
 869static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 870{
 871        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 872        struct rt_prio_array *array = &rt_rq->active;
 873        struct rt_rq *group_rq = group_rt_rq(rt_se);
 874        struct list_head *queue = array->queue + rt_se_prio(rt_se);
 875
 876        /*
  877         * Don't enqueue the group if it's throttled, or when empty.
  878         * The latter is a consequence of the former when a child group
  879         * gets throttled and the current group doesn't have any other
 880         * active members.
 881         */
 882        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 883                return;
 884
 885        if (!rt_rq->rt_nr_running)
 886                list_add_leaf_rt_rq(rt_rq);
 887
 888        if (head)
 889                list_add(&rt_se->run_list, queue);
 890        else
 891                list_add_tail(&rt_se->run_list, queue);
 892        __set_bit(rt_se_prio(rt_se), array->bitmap);
 893
 894        inc_rt_tasks(rt_se, rt_rq);
 895}
 896
 897static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 898{
 899        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 900        struct rt_prio_array *array = &rt_rq->active;
 901
 902        list_del_init(&rt_se->run_list);
 903        if (list_empty(array->queue + rt_se_prio(rt_se)))
 904                __clear_bit(rt_se_prio(rt_se), array->bitmap);
 905
 906        dec_rt_tasks(rt_se, rt_rq);
 907        if (!rt_rq->rt_nr_running)
 908                list_del_leaf_rt_rq(rt_rq);
 909}
 910
 911/*
 912 * Because the prio of an upper entry depends on the lower
  913 * entries, we must remove entries top-down.
 914 */
 915static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 916{
 917        struct sched_rt_entity *back = NULL;
 918
 919        for_each_sched_rt_entity(rt_se) {
 920                rt_se->back = back;
 921                back = rt_se;
 922        }
 923
 924        for (rt_se = back; rt_se; rt_se = rt_se->back) {
 925                if (on_rt_rq(rt_se))
 926                        __dequeue_rt_entity(rt_se);
 927        }
 928}
 929
 930static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 931{
 932        dequeue_rt_stack(rt_se);
 933        for_each_sched_rt_entity(rt_se)
 934                __enqueue_rt_entity(rt_se, head);
 935}
 936
 937static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 938{
 939        dequeue_rt_stack(rt_se);
 940
 941        for_each_sched_rt_entity(rt_se) {
 942                struct rt_rq *rt_rq = group_rt_rq(rt_se);
 943
 944                if (rt_rq && rt_rq->rt_nr_running)
 945                        __enqueue_rt_entity(rt_se, false);
 946        }
 947}
 948
 949/*
 950 * Adding/removing a task to/from a priority array:
 951 */
 952static void
 953enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 954{
 955        struct sched_rt_entity *rt_se = &p->rt;
 956
 957        if (flags & ENQUEUE_WAKEUP)
 958                rt_se->timeout = 0;
 959
 960        enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 961
 962        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 963                enqueue_pushable_task(rq, p);
 964}
 965
 966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 967{
 968        struct sched_rt_entity *rt_se = &p->rt;
 969
 970        update_curr_rt(rq);
 971        dequeue_rt_entity(rt_se);
 972
 973        dequeue_pushable_task(rq, p);
 974}
 975
 976/*
 977 * Put task to the end of the run list without the overhead of dequeue
 978 * followed by enqueue.
 979 */
 980static void
 981requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 982{
 983        if (on_rt_rq(rt_se)) {
 984                struct rt_prio_array *array = &rt_rq->active;
 985                struct list_head *queue = array->queue + rt_se_prio(rt_se);
 986
 987                if (head)
 988                        list_move(&rt_se->run_list, queue);
 989                else
 990                        list_move_tail(&rt_se->run_list, queue);
 991        }
 992}
 993
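/*
 * Requeue every entity in the task's hierarchy; used by yield_task_rt()
 * and by SCHED_RR timeslice expiry below.
 */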
 994static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 995{
 996        struct sched_rt_entity *rt_se = &p->rt;
 997        struct rt_rq *rt_rq;
 998
 999        for_each_sched_rt_entity(rt_se) {
1000                rt_rq = rt_rq_of_se(rt_se);
1001                requeue_rt_entity(rt_rq, rt_se, head);
1002        }
1003}
1004
1005static void yield_task_rt(struct rq *rq)
1006{
1007        requeue_task_rt(rq, rq->curr, 0);
1008}
1009
1010#ifdef CONFIG_SMP
1011static int find_lowest_rq(struct task_struct *task);
1012
1013static int
1014select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1015{
1016        struct task_struct *curr;
1017        struct rq *rq;
1018        int cpu;
1019
1020        if (sd_flag != SD_BALANCE_WAKE)
1021                return smp_processor_id();
1022
1023        cpu = task_cpu(p);
1024        rq = cpu_rq(cpu);
1025
1026        rcu_read_lock();
1027        curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1028
1029        /*
1030         * If the current task on @p's runqueue is an RT task, then
1031         * try to see if we can wake this RT task up on another
1032         * runqueue. Otherwise simply start this RT task
1033         * on its current runqueue.
1034         *
1035         * We want to avoid overloading runqueues. If the woken
1036         * task is a higher priority, then it will stay on this CPU
1037         * and the lower prio task should be moved to another CPU.
1038         * Even though this will probably make the lower prio task
1039         * lose its cache, we do not want to bounce a higher task
1040         * around just because it gave up its CPU, perhaps for a
1041         * lock?
1042         *
1043         * For equal prio tasks, we just let the scheduler sort it out.
1044         *
1045         * Otherwise, just let it ride on the affined RQ and the
1046         * post-schedule router will push the preempted task away
1047         *
1048         * This test is optimistic, if we get it wrong the load-balancer
1049         * will have to sort it out.
1050         */
1051        if (curr && unlikely(rt_task(curr)) &&
1052            (curr->rt.nr_cpus_allowed < 2 ||
1053             curr->prio <= p->prio) &&
1054            (p->rt.nr_cpus_allowed > 1)) {
1055                int target = find_lowest_rq(p);
1056
1057                if (target != -1)
1058                        cpu = target;
1059        }
1060        rcu_read_unlock();
1061
1062        return cpu;
1063}
1064
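/*
 * A task woke up with the same priority as current: if current could run
 * on some other CPU but the waking task cannot, requeue the waker at the
 * head and reschedule so the push logic gets a chance to move current
 * away.
 */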
1065static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1066{
1067        if (rq->curr->rt.nr_cpus_allowed == 1)
1068                return;
1069
1070        if (p->rt.nr_cpus_allowed != 1
1071            && cpupri_find(&rq->rd->cpupri, p, NULL))
1072                return;
1073
1074        if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1075                return;
1076
1077        /*
 1078         * There appear to be other cpus that can accept
 1079         * current and none to run 'p', so let's reschedule
1080         * to try and push current away:
1081         */
1082        requeue_task_rt(rq, p, 1);
1083        resched_task(rq->curr);
1084}
1085
1086#endif /* CONFIG_SMP */
1087
1088/*
1089 * Preempt the current task with a newly woken task if needed:
1090 */
1091static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1092{
1093        if (p->prio < rq->curr->prio) {
1094                resched_task(rq->curr);
1095                return;
1096        }
1097
1098#ifdef CONFIG_SMP
1099        /*
1100         * If:
1101         *
1102         * - the newly woken task is of equal priority to the current task
1103         * - the newly woken task is non-migratable while current is migratable
1104         * - current will be preempted on the next reschedule
1105         *
1106         * we should check to see if current can readily move to a different
1107         * cpu.  If so, we will reschedule to allow the push logic to try
1108         * to move current somewhere else, making room for our non-migratable
1109         * task.
1110         */
1111        if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1112                check_preempt_equal_prio(rq, p);
1113#endif
1114}
1115
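/*
 * O(1) pick: find the first set bit in the priority bitmap and take the
 * first entity queued at that priority.
 */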
1116static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1117                                                   struct rt_rq *rt_rq)
1118{
1119        struct rt_prio_array *array = &rt_rq->active;
1120        struct sched_rt_entity *next = NULL;
1121        struct list_head *queue;
1122        int idx;
1123
1124        idx = sched_find_first_bit(array->bitmap);
1125        BUG_ON(idx >= MAX_RT_PRIO);
1126
1127        queue = array->queue + idx;
1128        next = list_entry(queue->next, struct sched_rt_entity, run_list);
1129
1130        return next;
1131}
1132
1133static struct task_struct *_pick_next_task_rt(struct rq *rq)
1134{
1135        struct sched_rt_entity *rt_se;
1136        struct task_struct *p;
1137        struct rt_rq *rt_rq;
1138
1139        rt_rq = &rq->rt;
1140
1141        if (!rt_rq->rt_nr_running)
1142                return NULL;
1143
1144        if (rt_rq_throttled(rt_rq))
1145                return NULL;
1146
1147        do {
1148                rt_se = pick_next_rt_entity(rq, rt_rq);
1149                BUG_ON(!rt_se);
1150                rt_rq = group_rt_rq(rt_se);
1151        } while (rt_rq);
1152
1153        p = rt_task_of(rt_se);
1154        p->se.exec_start = rq->clock_task;
1155
1156        return p;
1157}
1158
1159static struct task_struct *pick_next_task_rt(struct rq *rq)
1160{
1161        struct task_struct *p = _pick_next_task_rt(rq);
1162
1163        /* The running task is never eligible for pushing */
1164        if (p)
1165                dequeue_pushable_task(rq, p);
1166
1167#ifdef CONFIG_SMP
1168        /*
1169         * We detect this state here so that we can avoid taking the RQ
1170         * lock again later if there is no need to push
1171         */
1172        rq->post_schedule = has_pushable_tasks(rq);
1173#endif
1174
1175        return p;
1176}
1177
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{
1180        update_curr_rt(rq);
1181        p->se.exec_start = 0;
1182
1183        /*
1184         * The previous task needs to be made eligible for pushing
1185         * if it is still active
1186         */
1187        if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1188                enqueue_pushable_task(rq, p);
1189}
1190
1191#ifdef CONFIG_SMP
1192
1193/* Only try algorithms three times */
1194#define RT_MAX_TRIES 3
1195
1196static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1197
1198static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1199{
1200        if (!task_running(rq, p) &&
1201            (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1202            (p->rt.nr_cpus_allowed > 1))
1203                return 1;
1204        return 0;
1205}
1206
1207/* Return the second highest RT task, NULL otherwise */
1208static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1209{
1210        struct task_struct *next = NULL;
1211        struct sched_rt_entity *rt_se;
1212        struct rt_prio_array *array;
1213        struct rt_rq *rt_rq;
1214        int idx;
1215
1216        for_each_leaf_rt_rq(rt_rq, rq) {
1217                array = &rt_rq->active;
1218                idx = sched_find_first_bit(array->bitmap);
1219next_idx:
1220                if (idx >= MAX_RT_PRIO)
1221                        continue;
1222                if (next && next->prio < idx)
1223                        continue;
1224                list_for_each_entry(rt_se, array->queue + idx, run_list) {
1225                        struct task_struct *p;
1226
1227                        if (!rt_entity_is_task(rt_se))
1228                                continue;
1229
1230                        p = rt_task_of(rt_se);
1231                        if (pick_rt_task(rq, p, cpu)) {
1232                                next = p;
1233                                break;
1234                        }
1235                }
1236                if (!next) {
1237                        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1238                        goto next_idx;
1239                }
1240        }
1241
1242        return next;
1243}
1244
1245static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1246
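/*
 * Ask cpupri for the set of CPUs running something lower-priority than
 * @task, then prefer, in order: the task's previous CPU, a CPU close to it
 * in the sched-domain hierarchy, and finally any CPU in the mask.
 */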
1247static int find_lowest_rq(struct task_struct *task)
1248{
1249        struct sched_domain *sd;
1250        struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1251        int this_cpu = smp_processor_id();
1252        int cpu      = task_cpu(task);
1253
1254        /* Make sure the mask is initialized first */
1255        if (unlikely(!lowest_mask))
1256                return -1;
1257
1258        if (task->rt.nr_cpus_allowed == 1)
1259                return -1; /* No other targets possible */
1260
1261        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1262                return -1; /* No targets found */
1263
1264        /*
1265         * At this point we have built a mask of cpus representing the
1266         * lowest priority tasks in the system.  Now we want to elect
1267         * the best one based on our affinity and topology.
1268         *
1269         * We prioritize the last cpu that the task executed on since
1270         * it is most likely cache-hot in that location.
1271         */
1272        if (cpumask_test_cpu(cpu, lowest_mask))
1273                return cpu;
1274
1275        /*
1276         * Otherwise, we consult the sched_domains span maps to figure
1277         * out which cpu is logically closest to our hot cache data.
1278         */
1279        if (!cpumask_test_cpu(this_cpu, lowest_mask))
1280                this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1281
1282        rcu_read_lock();
1283        for_each_domain(cpu, sd) {
1284                if (sd->flags & SD_WAKE_AFFINE) {
1285                        int best_cpu;
1286
1287                        /*
1288                         * "this_cpu" is cheaper to preempt than a
1289                         * remote processor.
1290                         */
1291                        if (this_cpu != -1 &&
1292                            cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1293                                rcu_read_unlock();
1294                                return this_cpu;
1295                        }
1296
1297                        best_cpu = cpumask_first_and(lowest_mask,
1298                                                     sched_domain_span(sd));
1299                        if (best_cpu < nr_cpu_ids) {
1300                                rcu_read_unlock();
1301                                return best_cpu;
1302                        }
1303                }
1304        }
1305        rcu_read_unlock();
1306
1307        /*
1308         * And finally, if there were no matches within the domains
1309         * just give the caller *something* to work with from the compatible
1310         * locations.
1311         */
1312        if (this_cpu != -1)
1313                return this_cpu;
1314
1315        cpu = cpumask_any(lowest_mask);
1316        if (cpu < nr_cpu_ids)
1317                return cpu;
1318        return -1;
1319}
1320
1321/* Will lock the rq it finds */
1322static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1323{
1324        struct rq *lowest_rq = NULL;
1325        int tries;
1326        int cpu;
1327
1328        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1329                cpu = find_lowest_rq(task);
1330
1331                if ((cpu == -1) || (cpu == rq->cpu))
1332                        break;
1333
1334                lowest_rq = cpu_rq(cpu);
1335
1336                /* if the prio of this runqueue changed, try again */
1337                if (double_lock_balance(rq, lowest_rq)) {
1338                        /*
1339                         * We had to unlock the run queue. In
1340                         * the mean time, task could have
1341                         * migrated already or had its affinity changed.
1342                         * Also make sure that it wasn't scheduled on its rq.
1343                         */
1344                        if (unlikely(task_rq(task) != rq ||
1345                                     !cpumask_test_cpu(lowest_rq->cpu,
1346                                                       &task->cpus_allowed) ||
1347                                     task_running(rq, task) ||
1348                                     !task->on_rq)) {
1349
1350                                raw_spin_unlock(&lowest_rq->lock);
1351                                lowest_rq = NULL;
1352                                break;
1353                        }
1354                }
1355
1356                /* If this rq is still suitable use it. */
1357                if (lowest_rq->rt.highest_prio.curr > task->prio)
1358                        break;
1359
1360                /* try again */
1361                double_unlock_balance(rq, lowest_rq);
1362                lowest_rq = NULL;
1363        }
1364
1365        return lowest_rq;
1366}
1367
1368static struct task_struct *pick_next_pushable_task(struct rq *rq)
1369{
1370        struct task_struct *p;
1371
1372        if (!has_pushable_tasks(rq))
1373                return NULL;
1374
1375        p = plist_first_entry(&rq->rt.pushable_tasks,
1376                              struct task_struct, pushable_tasks);
1377
1378        BUG_ON(rq->cpu != task_cpu(p));
1379        BUG_ON(task_current(rq, p));
1380        BUG_ON(p->rt.nr_cpus_allowed <= 1);
1381
1382        BUG_ON(!p->on_rq);
1383        BUG_ON(!rt_task(p));
1384
1385        return p;
1386}
1387
1388/*
 1389 * If the current CPU has more than one RT task, see if the non-running
 1390 * task can migrate over to a CPU that is running a task of lesser
 1391 * priority.
1392 */
1393static int push_rt_task(struct rq *rq)
1394{
1395        struct task_struct *next_task;
1396        struct rq *lowest_rq;
1397
1398        if (!rq->rt.overloaded)
1399                return 0;
1400
1401        next_task = pick_next_pushable_task(rq);
1402        if (!next_task)
1403                return 0;
1404
1405retry:
1406        if (unlikely(next_task == rq->curr)) {
1407                WARN_ON(1);
1408                return 0;
1409        }
1410
1411        /*
 1412         * It's possible that the next_task slipped in with a
 1413         * higher priority than current. If that's the case,
1414         * just reschedule current.
1415         */
1416        if (unlikely(next_task->prio < rq->curr->prio)) {
1417                resched_task(rq->curr);
1418                return 0;
1419        }
1420
1421        /* We might release rq lock */
1422        get_task_struct(next_task);
1423
1424        /* find_lock_lowest_rq locks the rq if found */
1425        lowest_rq = find_lock_lowest_rq(next_task, rq);
1426        if (!lowest_rq) {
1427                struct task_struct *task;
1428                /*
 1429                 * find_lock_lowest_rq releases rq->lock,
1430                 * so it is possible that next_task has migrated.
1431                 *
1432                 * We need to make sure that the task is still on the same
1433                 * run-queue and is also still the next task eligible for
1434                 * pushing.
1435                 */
1436                task = pick_next_pushable_task(rq);
1437                if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438                        /*
1439                         * If we get here, the task hasn't moved at all, but
1440                         * it has failed to push.  We will not try again,
1441                         * since the other cpus will pull from us when they
1442                         * are ready.
1443                         */
1444                        dequeue_pushable_task(rq, next_task);
1445                        goto out;
1446                }
1447
1448                if (!task)
1449                        /* No more tasks, just exit */
1450                        goto out;
1451
1452                /*
1453                 * Something has shifted, try again.
1454                 */
1455                put_task_struct(next_task);
1456                next_task = task;
1457                goto retry;
1458        }
1459
1460        deactivate_task(rq, next_task, 0);
1461        set_task_cpu(next_task, lowest_rq->cpu);
1462        activate_task(lowest_rq, next_task, 0);
1463
1464        resched_task(lowest_rq->curr);
1465
1466        double_unlock_balance(rq, lowest_rq);
1467
1468out:
1469        put_task_struct(next_task);
1470
1471        return 1;
1472}
1473
1474static void push_rt_tasks(struct rq *rq)
1475{
1476        /* push_rt_task will return true if it moved an RT */
1477        while (push_rt_task(rq))
1478                ;
1479}
1480
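/*
 * Scan every overloaded CPU in our root domain and pull over any queued
 * (not running) RT task that would preempt what this CPU is about to run.
 * Returns 1 if at least one task was pulled.
 */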
1481static int pull_rt_task(struct rq *this_rq)
1482{
1483        int this_cpu = this_rq->cpu, ret = 0, cpu;
1484        struct task_struct *p;
1485        struct rq *src_rq;
1486
1487        if (likely(!rt_overloaded(this_rq)))
1488                return 0;
1489
1490        for_each_cpu(cpu, this_rq->rd->rto_mask) {
1491                if (this_cpu == cpu)
1492                        continue;
1493
1494                src_rq = cpu_rq(cpu);
1495
1496                /*
1497                 * Don't bother taking the src_rq->lock if the next highest
1498                 * task is known to be lower-priority than our current task.
1499                 * This may look racy, but if this value is about to go
1500                 * logically higher, the src_rq will push this task away.
 1501                 * And if it's going logically lower, we do not care.
1502                 */
1503                if (src_rq->rt.highest_prio.next >=
1504                    this_rq->rt.highest_prio.curr)
1505                        continue;
1506
1507                /*
1508                 * We can potentially drop this_rq's lock in
1509                 * double_lock_balance, and another CPU could
1510                 * alter this_rq
1511                 */
1512                double_lock_balance(this_rq, src_rq);
1513
1514                /*
1515                 * Are there still pullable RT tasks?
1516                 */
1517                if (src_rq->rt.rt_nr_running <= 1)
1518                        goto skip;
1519
1520                p = pick_next_highest_task_rt(src_rq, this_cpu);
1521
1522                /*
1523                 * Do we have an RT task that preempts
1524                 * the to-be-scheduled task?
1525                 */
1526                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1527                        WARN_ON(p == src_rq->curr);
1528                        WARN_ON(!p->on_rq);
1529
1530                        /*
1531                         * There's a chance that p is higher in priority
1532                         * than what's currently running on its cpu.
 1533                         * This is just that p is waking up and hasn't
 1534                         * had a chance to schedule. We only pull
 1535                         * p if it is lower in priority than the
 1536                         * current task on the run queue.
1537                         */
1538                        if (p->prio < src_rq->curr->prio)
1539                                goto skip;
1540
1541                        ret = 1;
1542
1543                        deactivate_task(src_rq, p, 0);
1544                        set_task_cpu(p, this_cpu);
1545                        activate_task(this_rq, p, 0);
1546                        /*
1547                         * We continue with the search, just in
1548                         * case there's an even higher prio task
1549                         * in another runqueue. (low likelihood
1550                         * but possible)
1551                         */
1552                }
1553skip:
1554                double_unlock_balance(this_rq, src_rq);
1555        }
1556
1557        return ret;
1558}
1559
1560static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1561{
1562        /* Try to pull RT tasks here if we lower this rq's prio */
1563        if (rq->rt.highest_prio.curr > prev->prio)
1564                pull_rt_task(rq);
1565}
1566
1567static void post_schedule_rt(struct rq *rq)
1568{
1569        push_rt_tasks(rq);
1570}
1571
1572/*
1573 * If we are not running and we are not going to reschedule soon, we should
1574 * try to push tasks away now
1575 */
1576static void task_woken_rt(struct rq *rq, struct task_struct *p)
1577{
1578        if (!task_running(rq, p) &&
1579            !test_tsk_need_resched(rq->curr) &&
1580            has_pushable_tasks(rq) &&
1581            p->rt.nr_cpus_allowed > 1 &&
1582            rt_task(rq->curr) &&
1583            (rq->curr->rt.nr_cpus_allowed < 2 ||
1584             rq->curr->prio <= p->prio))
1585                push_rt_tasks(rq);
1586}
1587
1588static void set_cpus_allowed_rt(struct task_struct *p,
1589                                const struct cpumask *new_mask)
1590{
1591        int weight = cpumask_weight(new_mask);
1592
1593        BUG_ON(!rt_task(p));
1594
1595        /*
1596         * Update the migration status of the RQ if we have an RT task
1597         * which is running AND changing its weight value.
1598         */
1599        if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1600                struct rq *rq = task_rq(p);
1601
1602                if (!task_current(rq, p)) {
1603                        /*
1604                         * Make sure we dequeue this task from the pushable list
1605                         * before going further.  It will either remain off of
1606                         * the list because we are no longer pushable, or it
1607                         * will be requeued.
1608                         */
1609                        if (p->rt.nr_cpus_allowed > 1)
1610                                dequeue_pushable_task(rq, p);
1611
1612                        /*
1613                         * Requeue if our weight is changing and still > 1
1614                         */
1615                        if (weight > 1)
1616                                enqueue_pushable_task(rq, p);
1617
1618                }
1619
1620                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1621                        rq->rt.rt_nr_migratory++;
1622                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1623                        BUG_ON(!rq->rt.rt_nr_migratory);
1624                        rq->rt.rt_nr_migratory--;
1625                }
1626
1627                update_rt_migration(&rq->rt);
1628        }
1629
1630        cpumask_copy(&p->cpus_allowed, new_mask);
1631        p->rt.nr_cpus_allowed = weight;
1632}
1633
1634/* Assumes rq->lock is held */
1635static void rq_online_rt(struct rq *rq)
1636{
1637        if (rq->rt.overloaded)
1638                rt_set_overload(rq);
1639
1640        __enable_runtime(rq);
1641
1642        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1643}
1644
1645/* Assumes rq->lock is held */
1646static void rq_offline_rt(struct rq *rq)
1647{
1648        if (rq->rt.overloaded)
1649                rt_clear_overload(rq);
1650
1651        __disable_runtime(rq);
1652
1653        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1654}
1655
1656/*
 1657 * When switching away from the rt queue, we bring ourselves to a position
 1658 * where we might want to pull RT tasks from other runqueues.
1659 */
1660static void switched_from_rt(struct rq *rq, struct task_struct *p)
1661{
1662        /*
1663         * If there are other RT tasks then we will reschedule
1664         * and the scheduling of the other RT tasks will handle
1665         * the balancing. But if we are the last RT task
1666         * we may need to handle the pulling of RT tasks
1667         * now.
1668         */
1669        if (p->on_rq && !rq->rt.rt_nr_running)
1670                pull_rt_task(rq);
1671}
1672
1673static inline void init_sched_rt_class(void)
1674{
1675        unsigned int i;
1676
1677        for_each_possible_cpu(i)
1678                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1679                                        GFP_KERNEL, cpu_to_node(i));
1680}
1681#endif /* CONFIG_SMP */
1682
1683/*
1684 * When switching a task to RT, we may overload the runqueue
1685 * with RT tasks. In this case we try to push them off to
1686 * other runqueues.
1687 */
1688static void switched_to_rt(struct rq *rq, struct task_struct *p)
1689{
1690        int check_resched = 1;
1691
1692        /*
1693         * If we are already running, then there's nothing
1694         * that needs to be done. But if we are not running
1695         * we may need to preempt the current running task.
1696         * If that current running task is also an RT task
1697         * then see if we can move to another run queue.
1698         */
1699        if (p->on_rq && rq->curr != p) {
1700#ifdef CONFIG_SMP
1701                if (rq->rt.overloaded && push_rt_task(rq) &&
1702                    /* Don't resched if we changed runqueues */
1703                    rq != task_rq(p))
1704                        check_resched = 0;
1705#endif /* CONFIG_SMP */
1706                if (check_resched && p->prio < rq->curr->prio)
1707                        resched_task(rq->curr);
1708        }
1709}
1710
1711/*
1712 * Priority of the task has changed. This may cause
1713 * us to initiate a push or pull.
1714 */
1715static void
1716prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1717{
1718        if (!p->on_rq)
1719                return;
1720
1721        if (rq->curr == p) {
1722#ifdef CONFIG_SMP
1723                /*
1724                 * If our priority decreases while running, we
1725                 * may need to pull tasks to this runqueue.
1726                 */
1727                if (oldprio < p->prio)
1728                        pull_rt_task(rq);
1729                /*
1730                 * If there's a higher priority task waiting to run
1731                 * then reschedule. Note, the above pull_rt_task
1732                 * can release the rq lock and p could migrate.
1733                 * Only reschedule if p is still on the same runqueue.
1734                 */
1735                if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1736                        resched_task(p);
1737#else
1738                /* For UP simply resched on drop of prio */
1739                if (oldprio < p->prio)
1740                        resched_task(p);
1741#endif /* CONFIG_SMP */
1742        } else {
1743                /*
1744                 * This task is not running, but if it is
1745                 * greater than the current running task
1746                 * then reschedule.
1747                 */
1748                if (p->prio < rq->curr->prio)
1749                        resched_task(rq->curr);
1750        }
1751}
1752
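/*
 * Enforce RLIMIT_RTTIME: rt.timeout counts ticks of execution since the
 * task last woke up (it is reset in enqueue_task_rt()); once it exceeds
 * the soft limit, sched_exp is armed so the POSIX CPU timer code notices
 * the overrun on its next pass.
 */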
1753static void watchdog(struct rq *rq, struct task_struct *p)
1754{
1755        unsigned long soft, hard;
1756
1757        /* max may change after cur was read, this will be fixed next tick */
1758        soft = task_rlimit(p, RLIMIT_RTTIME);
1759        hard = task_rlimit_max(p, RLIMIT_RTTIME);
1760
1761        if (soft != RLIM_INFINITY) {
1762                unsigned long next;
1763
1764                p->rt.timeout++;
1765                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1766                if (p->rt.timeout > next)
1767                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1768        }
1769}
1770
1771static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1772{
1773        update_curr_rt(rq);
1774
1775        watchdog(rq, p);
1776
1777        /*
1778         * RR tasks need a special form of timeslice management.
1779         * FIFO tasks have no timeslices.
1780         */
1781        if (p->policy != SCHED_RR)
1782                return;
1783
1784        if (--p->rt.time_slice)
1785                return;
1786
1787        p->rt.time_slice = DEF_TIMESLICE;
1788
1789        /*
 1790         * Requeue to the end of the queue if we are not the only element
1791         * on the queue:
1792         */
1793        if (p->rt.run_list.prev != p->rt.run_list.next) {
1794                requeue_task_rt(rq, p, 0);
1795                set_tsk_need_resched(p);
1796        }
1797}
1798
1799static void set_curr_task_rt(struct rq *rq)
1800{
1801        struct task_struct *p = rq->curr;
1802
1803        p->se.exec_start = rq->clock_task;
1804
1805        /* The running task is never eligible for pushing */
1806        dequeue_pushable_task(rq, p);
1807}
1808
1809static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1810{
1811        /*
1812         * Time slice is 0 for SCHED_FIFO tasks
1813         */
1814        if (task->policy == SCHED_RR)
1815                return DEF_TIMESLICE;
1816        else
1817                return 0;
1818}
1819
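/*
 * Hook everything above into the scheduler core. The .next pointer chains
 * this class to fair_sched_class, which the core falls back to when no RT
 * task is runnable.
 */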
1820static const struct sched_class rt_sched_class = {
1821        .next                   = &fair_sched_class,
1822        .enqueue_task           = enqueue_task_rt,
1823        .dequeue_task           = dequeue_task_rt,
1824        .yield_task             = yield_task_rt,
1825
1826        .check_preempt_curr     = check_preempt_curr_rt,
1827
1828        .pick_next_task         = pick_next_task_rt,
1829        .put_prev_task          = put_prev_task_rt,
1830
1831#ifdef CONFIG_SMP
1832        .select_task_rq         = select_task_rq_rt,
1833
1834        .set_cpus_allowed       = set_cpus_allowed_rt,
1835        .rq_online              = rq_online_rt,
1836        .rq_offline             = rq_offline_rt,
1837        .pre_schedule           = pre_schedule_rt,
1838        .post_schedule          = post_schedule_rt,
1839        .task_woken             = task_woken_rt,
1840        .switched_from          = switched_from_rt,
1841#endif
1842
1843        .set_curr_task          = set_curr_task_rt,
1844        .task_tick              = task_tick_rt,
1845
1846        .get_rr_interval        = get_rr_interval_rt,
1847
1848        .prio_changed           = prio_changed_rt,
1849        .switched_to            = switched_to_rt,
1850};
1851
1852#ifdef CONFIG_SCHED_DEBUG
1853extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1854
1855static void print_rt_stats(struct seq_file *m, int cpu)
1856{
1857        rt_rq_iter_t iter;
1858        struct rt_rq *rt_rq;
1859
1860        rcu_read_lock();
1861        for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1862                print_rt_rq(m, cpu, rt_rq);
1863        rcu_read_unlock();
1864}
1865#endif /* CONFIG_SCHED_DEBUG */
1866
1867