linux/kernel/sched_rt.c
   1/*
   2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   3 * policies)
   4 */
   5
   6#ifdef CONFIG_RT_GROUP_SCHED
   7
   8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
   9
  10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  11{
  12#ifdef CONFIG_SCHED_DEBUG
  13        WARN_ON_ONCE(!rt_entity_is_task(rt_se));
  14#endif
  15        return container_of(rt_se, struct task_struct, rt);
  16}
  17
  18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  19{
  20        return rt_rq->rq;
  21}
  22
  23static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  24{
  25        return rt_se->rt_rq;
  26}
  27
  28#else /* CONFIG_RT_GROUP_SCHED */
  29
  30#define rt_entity_is_task(rt_se) (1)
  31
  32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  33{
  34        return container_of(rt_se, struct task_struct, rt);
  35}
  36
  37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  38{
  39        return container_of(rt_rq, struct rq, rt);
  40}
  41
  42static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  43{
  44        struct task_struct *p = rt_task_of(rt_se);
  45        struct rq *rq = task_rq(p);
  46
  47        return &rq->rt;
  48}
  49
  50#endif /* CONFIG_RT_GROUP_SCHED */
  51
  52#ifdef CONFIG_SMP
  53
  54static inline int rt_overloaded(struct rq *rq)
  55{
  56        return atomic_read(&rq->rd->rto_count);
  57}
  58
  59static inline void rt_set_overload(struct rq *rq)
  60{
  61        if (!rq->online)
  62                return;
  63
  64        cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
  65        /*
  66         * Make sure the mask is visible before we set
  67         * the overload count. That is checked to determine
  68         * if we should look at the mask. It would be a shame
  69         * if we looked at the mask, but the mask was not
  70         * updated yet.
  71         */
  72        wmb();
  73        atomic_inc(&rq->rd->rto_count);
  74}
  75
  76static inline void rt_clear_overload(struct rq *rq)
  77{
  78        if (!rq->online)
  79                return;
  80
  81        /* the order here really doesn't matter */
  82        atomic_dec(&rq->rd->rto_count);
  83        cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
  84}
  85
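     /*
      * Keep the runqueue's RT-overload state (and the root domain's
      * rto_mask) in sync: an rq counts as overloaded once it has more than
      * one RT task queued and at least one of them is allowed to migrate.
      */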
  86static void update_rt_migration(struct rt_rq *rt_rq)
  87{
  88        if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
  89                if (!rt_rq->overloaded) {
  90                        rt_set_overload(rq_of_rt_rq(rt_rq));
  91                        rt_rq->overloaded = 1;
  92                }
  93        } else if (rt_rq->overloaded) {
  94                rt_clear_overload(rq_of_rt_rq(rt_rq));
  95                rt_rq->overloaded = 0;
  96        }
  97}
  98
  99static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 100{
 101        if (!rt_entity_is_task(rt_se))
 102                return;
 103
 104        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 105
 106        rt_rq->rt_nr_total++;
 107        if (rt_se->nr_cpus_allowed > 1)
 108                rt_rq->rt_nr_migratory++;
 109
 110        update_rt_migration(rt_rq);
 111}
 112
 113static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 114{
 115        if (!rt_entity_is_task(rt_se))
 116                return;
 117
 118        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 119
 120        rt_rq->rt_nr_total--;
 121        if (rt_se->nr_cpus_allowed > 1)
 122                rt_rq->rt_nr_migratory--;
 123
 124        update_rt_migration(rt_rq);
 125}
 126
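     /*
      * Maintain the priority-sorted plist of tasks that could be pushed to
      * another CPU.  Deleting and re-adding the node updates its position
      * when the task's priority has changed.
      */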
 127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 128{
 129        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 130        plist_node_init(&p->pushable_tasks, p->prio);
 131        plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 132}
 133
 134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 135{
 136        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 137}
 138
 139static inline int has_pushable_tasks(struct rq *rq)
 140{
 141        return !plist_head_empty(&rq->rt.pushable_tasks);
 142}
 143
 144#else
 145
 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 147{
 148}
 149
 150static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 151{
 152}
 153
 154static inline
 155void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 156{
 157}
 158
 159static inline
 160void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 161{
 162}
 163
 164#endif /* CONFIG_SMP */
 165
 166static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 167{
 168        return !list_empty(&rt_se->run_list);
 169}
 170
 171#ifdef CONFIG_RT_GROUP_SCHED
 172
 173static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 174{
 175        if (!rt_rq->tg)
 176                return RUNTIME_INF;
 177
 178        return rt_rq->rt_runtime;
 179}
 180
 181static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 182{
 183        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 184}
 185
 186#define for_each_leaf_rt_rq(rt_rq, rq) \
 187        list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 188
 189#define for_each_sched_rt_entity(rt_se) \
 190        for (; rt_se; rt_se = rt_se->parent)
 191
 192static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 193{
 194        return rt_se->my_q;
 195}
 196
 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 199
 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 201{
 202        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 203        struct sched_rt_entity *rt_se;
 204
 205        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 206
 207        rt_se = rt_rq->tg->rt_se[cpu];
 208
 209        if (rt_rq->rt_nr_running) {
 210                if (rt_se && !on_rt_rq(rt_se))
 211                        enqueue_rt_entity(rt_se, false);
 212                if (rt_rq->highest_prio.curr < curr->prio)
 213                        resched_task(curr);
 214        }
 215}
 216
 217static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 218{
 219        struct sched_rt_entity *rt_se;
 220        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 221
 222        rt_se = rt_rq->tg->rt_se[cpu];
 223
 224        if (rt_se && on_rt_rq(rt_se))
 225                dequeue_rt_entity(rt_se);
 226}
 227
 228static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 229{
 230        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 231}
 232
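     /*
      * An entity is "boosted" while priority inheritance is in effect: a
      * group entity is boosted when it has boosted children, a task when
      * its effective prio differs from its normal_prio.
      */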
 233static int rt_se_boosted(struct sched_rt_entity *rt_se)
 234{
 235        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 236        struct task_struct *p;
 237
 238        if (rt_rq)
 239                return !!rt_rq->rt_nr_boosted;
 240
 241        p = rt_task_of(rt_se);
 242        return p->prio != p->normal_prio;
 243}
 244
 245#ifdef CONFIG_SMP
 246static inline const struct cpumask *sched_rt_period_mask(void)
 247{
 248        return cpu_rq(smp_processor_id())->rd->span;
 249}
 250#else
 251static inline const struct cpumask *sched_rt_period_mask(void)
 252{
 253        return cpu_online_mask;
 254}
 255#endif
 256
 257static inline
 258struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 259{
 260        return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 261}
 262
 263static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 264{
 265        return &rt_rq->tg->rt_bandwidth;
 266}
 267
 268#else /* !CONFIG_RT_GROUP_SCHED */
 269
 270static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 271{
 272        return rt_rq->rt_runtime;
 273}
 274
 275static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 276{
 277        return ktime_to_ns(def_rt_bandwidth.rt_period);
 278}
 279
 280#define for_each_leaf_rt_rq(rt_rq, rq) \
 281        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 282
 283#define for_each_sched_rt_entity(rt_se) \
 284        for (; rt_se; rt_se = NULL)
 285
 286static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 287{
 288        return NULL;
 289}
 290
 291static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 292{
 293        if (rt_rq->rt_nr_running)
 294                resched_task(rq_of_rt_rq(rt_rq)->curr);
 295}
 296
 297static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 298{
 299}
 300
 301static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 302{
 303        return rt_rq->rt_throttled;
 304}
 305
 306static inline const struct cpumask *sched_rt_period_mask(void)
 307{
 308        return cpu_online_mask;
 309}
 310
 311static inline
 312struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 313{
 314        return &cpu_rq(cpu)->rt;
 315}
 316
 317static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 318{
 319        return &def_rt_bandwidth;
 320}
 321
 322#endif /* CONFIG_RT_GROUP_SCHED */
 323
 324#ifdef CONFIG_SMP
 325/*
 326 * We ran out of runtime, see if we can borrow some from our neighbours.
 327 */
 328static int do_balance_runtime(struct rt_rq *rt_rq)
 329{
 330        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 331        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 332        int i, weight, more = 0;
 333        u64 rt_period;
 334
 335        weight = cpumask_weight(rd->span);
 336
 337        raw_spin_lock(&rt_b->rt_runtime_lock);
 338        rt_period = ktime_to_ns(rt_b->rt_period);
 339        for_each_cpu(i, rd->span) {
 340                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 341                s64 diff;
 342
 343                if (iter == rt_rq)
 344                        continue;
 345
 346                raw_spin_lock(&iter->rt_runtime_lock);
 347                /*
 348                 * Either all rqs have inf runtime and there's nothing to steal
 349                 * or __disable_runtime() below sets a specific rq to inf to
  350                 * indicate it's been disabled and disallow stealing.
 351                 */
 352                if (iter->rt_runtime == RUNTIME_INF)
 353                        goto next;
 354
 355                /*
 356                 * From runqueues with spare time, take 1/n part of their
 357                 * spare time, but no more than our period.
 358                 */
 359                diff = iter->rt_runtime - iter->rt_time;
 360                if (diff > 0) {
 361                        diff = div_u64((u64)diff, weight);
 362                        if (rt_rq->rt_runtime + diff > rt_period)
 363                                diff = rt_period - rt_rq->rt_runtime;
 364                        iter->rt_runtime -= diff;
 365                        rt_rq->rt_runtime += diff;
 366                        more = 1;
 367                        if (rt_rq->rt_runtime == rt_period) {
 368                                raw_spin_unlock(&iter->rt_runtime_lock);
 369                                break;
 370                        }
 371                }
 372next:
 373                raw_spin_unlock(&iter->rt_runtime_lock);
 374        }
 375        raw_spin_unlock(&rt_b->rt_runtime_lock);
 376
 377        return more;
 378}
 379
 380/*
  381 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 382 */
 383static void __disable_runtime(struct rq *rq)
 384{
 385        struct root_domain *rd = rq->rd;
 386        struct rt_rq *rt_rq;
 387
 388        if (unlikely(!scheduler_running))
 389                return;
 390
 391        for_each_leaf_rt_rq(rt_rq, rq) {
 392                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 393                s64 want;
 394                int i;
 395
 396                raw_spin_lock(&rt_b->rt_runtime_lock);
 397                raw_spin_lock(&rt_rq->rt_runtime_lock);
 398                /*
 399                 * Either we're all inf and nobody needs to borrow, or we're
 400                 * already disabled and thus have nothing to do, or we have
 401                 * exactly the right amount of runtime to take out.
 402                 */
 403                if (rt_rq->rt_runtime == RUNTIME_INF ||
 404                                rt_rq->rt_runtime == rt_b->rt_runtime)
 405                        goto balanced;
 406                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 407
 408                /*
 409                 * Calculate the difference between what we started out with
  410                 * and what we currently have; that's the amount of runtime
  411                 * we lent out and now have to reclaim.
 412                 */
 413                want = rt_b->rt_runtime - rt_rq->rt_runtime;
 414
 415                /*
 416                 * Greedy reclaim, take back as much as we can.
 417                 */
 418                for_each_cpu(i, rd->span) {
 419                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 420                        s64 diff;
 421
 422                        /*
 423                         * Can't reclaim from ourselves or disabled runqueues.
 424                         */
 425                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 426                                continue;
 427
 428                        raw_spin_lock(&iter->rt_runtime_lock);
 429                        if (want > 0) {
 430                                diff = min_t(s64, iter->rt_runtime, want);
 431                                iter->rt_runtime -= diff;
 432                                want -= diff;
 433                        } else {
 434                                iter->rt_runtime -= want;
 435                                want -= want;
 436                        }
 437                        raw_spin_unlock(&iter->rt_runtime_lock);
 438
 439                        if (!want)
 440                                break;
 441                }
 442
 443                raw_spin_lock(&rt_rq->rt_runtime_lock);
 444                /*
 445                 * We cannot be left wanting - that would mean some runtime
 446                 * leaked out of the system.
 447                 */
 448                BUG_ON(want);
 449balanced:
 450                /*
 451                 * Disable all the borrow logic by pretending we have inf
 452                 * runtime - in which case borrowing doesn't make sense.
 453                 */
 454                rt_rq->rt_runtime = RUNTIME_INF;
 455                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 456                raw_spin_unlock(&rt_b->rt_runtime_lock);
 457        }
 458}
 459
 460static void disable_runtime(struct rq *rq)
 461{
 462        unsigned long flags;
 463
 464        raw_spin_lock_irqsave(&rq->lock, flags);
 465        __disable_runtime(rq);
 466        raw_spin_unlock_irqrestore(&rq->lock, flags);
 467}
 468
 469static void __enable_runtime(struct rq *rq)
 470{
 471        struct rt_rq *rt_rq;
 472
 473        if (unlikely(!scheduler_running))
 474                return;
 475
 476        /*
 477         * Reset each runqueue's bandwidth settings
 478         */
 479        for_each_leaf_rt_rq(rt_rq, rq) {
 480                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 481
 482                raw_spin_lock(&rt_b->rt_runtime_lock);
 483                raw_spin_lock(&rt_rq->rt_runtime_lock);
 484                rt_rq->rt_runtime = rt_b->rt_runtime;
 485                rt_rq->rt_time = 0;
 486                rt_rq->rt_throttled = 0;
 487                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 488                raw_spin_unlock(&rt_b->rt_runtime_lock);
 489        }
 490}
 491
 492static void enable_runtime(struct rq *rq)
 493{
 494        unsigned long flags;
 495
 496        raw_spin_lock_irqsave(&rq->lock, flags);
 497        __enable_runtime(rq);
 498        raw_spin_unlock_irqrestore(&rq->lock, flags);
 499}
 500
 501static int balance_runtime(struct rt_rq *rt_rq)
 502{
 503        int more = 0;
 504
 505        if (rt_rq->rt_time > rt_rq->rt_runtime) {
 506                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 507                more = do_balance_runtime(rt_rq);
 508                raw_spin_lock(&rt_rq->rt_runtime_lock);
 509        }
 510
 511        return more;
 512}
 513#else /* !CONFIG_SMP */
 514static inline int balance_runtime(struct rt_rq *rt_rq)
 515{
 516        return 0;
 517}
 518#endif /* CONFIG_SMP */
 519
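     /*
      * Periodic runtime refresh, driven by the rt_bandwidth timer: for each
      * rt_rq in the period mask, pay back up to 'overrun' periods worth of
      * consumed rt_time and unthrottle queues that fit their budget again.
      * Returns 1 if every queue was idle.
      */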
 520static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 521{
 522        int i, idle = 1;
 523        const struct cpumask *span;
 524
 525        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 526                return 1;
 527
 528        span = sched_rt_period_mask();
 529        for_each_cpu(i, span) {
 530                int enqueue = 0;
 531                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 532                struct rq *rq = rq_of_rt_rq(rt_rq);
 533
 534                raw_spin_lock(&rq->lock);
 535                if (rt_rq->rt_time) {
 536                        u64 runtime;
 537
 538                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 539                        if (rt_rq->rt_throttled)
 540                                balance_runtime(rt_rq);
 541                        runtime = rt_rq->rt_runtime;
 542                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 543                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 544                                rt_rq->rt_throttled = 0;
 545                                enqueue = 1;
 546                        }
 547                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
 548                                idle = 0;
 549                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 550                } else if (rt_rq->rt_nr_running) {
 551                        idle = 0;
 552                        if (!rt_rq_throttled(rt_rq))
 553                                enqueue = 1;
 554                }
 555
 556                if (enqueue)
 557                        sched_rt_rq_enqueue(rt_rq);
 558                raw_spin_unlock(&rq->lock);
 559        }
 560
 561        return idle;
 562}
 563
 564static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 565{
 566#ifdef CONFIG_RT_GROUP_SCHED
 567        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 568
 569        if (rt_rq)
 570                return rt_rq->highest_prio.curr;
 571#endif
 572
 573        return rt_task_of(rt_se)->prio;
 574}
 575
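     /*
      * Has rt_rq used up its runtime for the current period?  Try to borrow
      * runtime from other CPUs first; if we are still over budget, throttle
      * and dequeue the queue and return 1.
      */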
 576static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 577{
 578        u64 runtime = sched_rt_runtime(rt_rq);
 579
 580        if (rt_rq->rt_throttled)
 581                return rt_rq_throttled(rt_rq);
 582
 583        if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
 584                return 0;
 585
 586        balance_runtime(rt_rq);
 587        runtime = sched_rt_runtime(rt_rq);
 588        if (runtime == RUNTIME_INF)
 589                return 0;
 590
 591        if (rt_rq->rt_time > runtime) {
 592                rt_rq->rt_throttled = 1;
 593                if (rt_rq_throttled(rt_rq)) {
 594                        sched_rt_rq_dequeue(rt_rq);
 595                        return 1;
 596                }
 597        }
 598
 599        return 0;
 600}
 601
 602/*
 603 * Update the current task's runtime statistics. Skip current tasks that
 604 * are not in our scheduling class.
 605 */
 606static void update_curr_rt(struct rq *rq)
 607{
 608        struct task_struct *curr = rq->curr;
 609        struct sched_rt_entity *rt_se = &curr->rt;
 610        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 611        u64 delta_exec;
 612
 613        if (curr->sched_class != &rt_sched_class)
 614                return;
 615
 616        delta_exec = rq->clock_task - curr->se.exec_start;
 617        if (unlikely((s64)delta_exec < 0))
 618                delta_exec = 0;
 619
 620        schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
 621
 622        curr->se.sum_exec_runtime += delta_exec;
 623        account_group_exec_runtime(curr, delta_exec);
 624
 625        curr->se.exec_start = rq->clock_task;
 626        cpuacct_charge(curr, delta_exec);
 627
 628        sched_rt_avg_update(rq, delta_exec);
 629
 630        if (!rt_bandwidth_enabled())
 631                return;
 632
 633        for_each_sched_rt_entity(rt_se) {
 634                rt_rq = rt_rq_of_se(rt_se);
 635
 636                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 637                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 638                        rt_rq->rt_time += delta_exec;
 639                        if (sched_rt_runtime_exceeded(rt_rq))
 640                                resched_task(curr);
 641                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 642                }
 643        }
 644}
 645
 646#if defined CONFIG_SMP
 647
 648static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
 649
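     /*
      * Priority of the next-highest runnable RT task that could be pushed
      * away from this rq, or MAX_RT_PRIO if there is none.
      */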
 650static inline int next_prio(struct rq *rq)
 651{
 652        struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
 653
 654        if (next && rt_prio(next->prio))
 655                return next->prio;
 656        else
 657                return MAX_RT_PRIO;
 658}
 659
 660static void
 661inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 662{
 663        struct rq *rq = rq_of_rt_rq(rt_rq);
 664
 665        if (prio < prev_prio) {
 666
 667                /*
 668                 * If the new task is higher in priority than anything on the
 669                 * run-queue, we know that the previous high becomes our
 670                 * next-highest.
 671                 */
 672                rt_rq->highest_prio.next = prev_prio;
 673
 674                if (rq->online)
 675                        cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 676
 677        } else if (prio == rt_rq->highest_prio.curr)
 678                /*
 679                 * If the next task is equal in priority to the highest on
 680                 * the run-queue, then we implicitly know that the next highest
 681                 * task cannot be any lower than current
 682                 */
 683                rt_rq->highest_prio.next = prio;
 684        else if (prio < rt_rq->highest_prio.next)
 685                /*
 686                 * Otherwise, we need to recompute next-highest
 687                 */
 688                rt_rq->highest_prio.next = next_prio(rq);
 689}
 690
 691static void
 692dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 693{
 694        struct rq *rq = rq_of_rt_rq(rt_rq);
 695
 696        if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
 697                rt_rq->highest_prio.next = next_prio(rq);
 698
 699        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 700                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 701}
 702
 703#else /* CONFIG_SMP */
 704
 705static inline
 706void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 707static inline
 708void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 709
 710#endif /* CONFIG_SMP */
 711
 712#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 713static void
 714inc_rt_prio(struct rt_rq *rt_rq, int prio)
 715{
 716        int prev_prio = rt_rq->highest_prio.curr;
 717
 718        if (prio < prev_prio)
 719                rt_rq->highest_prio.curr = prio;
 720
 721        inc_rt_prio_smp(rt_rq, prio, prev_prio);
 722}
 723
 724static void
 725dec_rt_prio(struct rt_rq *rt_rq, int prio)
 726{
 727        int prev_prio = rt_rq->highest_prio.curr;
 728
 729        if (rt_rq->rt_nr_running) {
 730
 731                WARN_ON(prio < prev_prio);
 732
 733                /*
 734                 * This may have been our highest task, and therefore
 735                 * we may have some recomputation to do
 736                 */
 737                if (prio == prev_prio) {
 738                        struct rt_prio_array *array = &rt_rq->active;
 739
 740                        rt_rq->highest_prio.curr =
 741                                sched_find_first_bit(array->bitmap);
 742                }
 743
 744        } else
 745                rt_rq->highest_prio.curr = MAX_RT_PRIO;
 746
 747        dec_rt_prio_smp(rt_rq, prio, prev_prio);
 748}
 749
 750#else
 751
 752static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
 753static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
 754
 755#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 756
 757#ifdef CONFIG_RT_GROUP_SCHED
 758
 759static void
 760inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 761{
 762        if (rt_se_boosted(rt_se))
 763                rt_rq->rt_nr_boosted++;
 764
 765        if (rt_rq->tg)
 766                start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
 767}
 768
 769static void
 770dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 771{
 772        if (rt_se_boosted(rt_se))
 773                rt_rq->rt_nr_boosted--;
 774
 775        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
 776}
 777
 778#else /* CONFIG_RT_GROUP_SCHED */
 779
 780static void
 781inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 782{
 783        start_rt_bandwidth(&def_rt_bandwidth);
 784}
 785
 786static inline
 787void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 788
 789#endif /* CONFIG_RT_GROUP_SCHED */
 790
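     /*
      * Accounting for adding/removing an entity to/from an rt_rq: keep
      * rt_nr_running, the highest-prio tracking, the migration counts and
      * the group/bandwidth state up to date.
      */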
 791static inline
 792void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 793{
 794        int prio = rt_se_prio(rt_se);
 795
 796        WARN_ON(!rt_prio(prio));
 797        rt_rq->rt_nr_running++;
 798
 799        inc_rt_prio(rt_rq, prio);
 800        inc_rt_migration(rt_se, rt_rq);
 801        inc_rt_group(rt_se, rt_rq);
 802}
 803
 804static inline
 805void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 806{
 807        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 808        WARN_ON(!rt_rq->rt_nr_running);
 809        rt_rq->rt_nr_running--;
 810
 811        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
 812        dec_rt_migration(rt_se, rt_rq);
 813        dec_rt_group(rt_se, rt_rq);
 814}
 815
 816static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 817{
 818        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 819        struct rt_prio_array *array = &rt_rq->active;
 820        struct rt_rq *group_rq = group_rt_rq(rt_se);
 821        struct list_head *queue = array->queue + rt_se_prio(rt_se);
 822
 823        /*
  824         * Don't enqueue the group if it's throttled, or when empty.
  825         * The latter is a consequence of the former when a child group
  826         * gets throttled and the current group doesn't have any other
 827         * active members.
 828         */
 829        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 830                return;
 831
 832        if (head)
 833                list_add(&rt_se->run_list, queue);
 834        else
 835                list_add_tail(&rt_se->run_list, queue);
 836        __set_bit(rt_se_prio(rt_se), array->bitmap);
 837
 838        inc_rt_tasks(rt_se, rt_rq);
 839}
 840
 841static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 842{
 843        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 844        struct rt_prio_array *array = &rt_rq->active;
 845
 846        list_del_init(&rt_se->run_list);
 847        if (list_empty(array->queue + rt_se_prio(rt_se)))
 848                __clear_bit(rt_se_prio(rt_se), array->bitmap);
 849
 850        dec_rt_tasks(rt_se, rt_rq);
 851}
 852
 853/*
 854 * Because the prio of an upper entry depends on the lower
  855 * entries, we must remove entries top-down.
 856 */
 857static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 858{
 859        struct sched_rt_entity *back = NULL;
 860
 861        for_each_sched_rt_entity(rt_se) {
 862                rt_se->back = back;
 863                back = rt_se;
 864        }
 865
 866        for (rt_se = back; rt_se; rt_se = rt_se->back) {
 867                if (on_rt_rq(rt_se))
 868                        __dequeue_rt_entity(rt_se);
 869        }
 870}
 871
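     /*
      * Enqueue rt_se and all of its parent entities.  The whole stack is
      * dequeued first since the priority of a parent depends on its
      * children.
      */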
 872static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 873{
 874        dequeue_rt_stack(rt_se);
 875        for_each_sched_rt_entity(rt_se)
 876                __enqueue_rt_entity(rt_se, head);
 877}
 878
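     /*
      * Dequeue rt_se and its parents; parent group entities that still have
      * runnable children are put back on their queues.
      */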
 879static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 880{
 881        dequeue_rt_stack(rt_se);
 882
 883        for_each_sched_rt_entity(rt_se) {
 884                struct rt_rq *rt_rq = group_rt_rq(rt_se);
 885
 886                if (rt_rq && rt_rq->rt_nr_running)
 887                        __enqueue_rt_entity(rt_se, false);
 888        }
 889}
 890
 891/*
 892 * Adding/removing a task to/from a priority array:
 893 */
 894static void
 895enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 896{
 897        struct sched_rt_entity *rt_se = &p->rt;
 898
 899        if (flags & ENQUEUE_WAKEUP)
 900                rt_se->timeout = 0;
 901
 902        enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 903
 904        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 905                enqueue_pushable_task(rq, p);
 906}
 907
 908static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 909{
 910        struct sched_rt_entity *rt_se = &p->rt;
 911
 912        update_curr_rt(rq);
 913        dequeue_rt_entity(rt_se);
 914
 915        dequeue_pushable_task(rq, p);
 916}
 917
 918/*
  919 * Put the task at the end of the run list without the overhead of dequeue
 920 * followed by enqueue.
 921 */
 922static void
 923requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 924{
 925        if (on_rt_rq(rt_se)) {
 926                struct rt_prio_array *array = &rt_rq->active;
 927                struct list_head *queue = array->queue + rt_se_prio(rt_se);
 928
 929                if (head)
 930                        list_move(&rt_se->run_list, queue);
 931                else
 932                        list_move_tail(&rt_se->run_list, queue);
 933        }
 934}
 935
 936static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 937{
 938        struct sched_rt_entity *rt_se = &p->rt;
 939        struct rt_rq *rt_rq;
 940
 941        for_each_sched_rt_entity(rt_se) {
 942                rt_rq = rt_rq_of_se(rt_se);
 943                requeue_rt_entity(rt_rq, rt_se, head);
 944        }
 945}
 946
 947static void yield_task_rt(struct rq *rq)
 948{
 949        requeue_task_rt(rq, rq->curr, 0);
 950}
 951
 952#ifdef CONFIG_SMP
 953static int find_lowest_rq(struct task_struct *task);
 954
 955static int
 956select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 957{
 958        if (sd_flag != SD_BALANCE_WAKE)
 959                return smp_processor_id();
 960
 961        /*
 962         * If the current task is an RT task, then
 963         * try to see if we can wake this RT task up on another
 964         * runqueue. Otherwise simply start this RT task
 965         * on its current runqueue.
 966         *
 967         * We want to avoid overloading runqueues. If the woken
 968         * task is a higher priority, then it will stay on this CPU
 969         * and the lower prio task should be moved to another CPU.
 970         * Even though this will probably make the lower prio task
 971         * lose its cache, we do not want to bounce a higher task
 972         * around just because it gave up its CPU, perhaps for a
 973         * lock?
 974         *
 975         * For equal prio tasks, we just let the scheduler sort it out.
 976         */
 977        if (unlikely(rt_task(rq->curr)) &&
 978            (rq->curr->rt.nr_cpus_allowed < 2 ||
 979             rq->curr->prio < p->prio) &&
 980            (p->rt.nr_cpus_allowed > 1)) {
 981                int cpu = find_lowest_rq(p);
 982
 983                return (cpu == -1) ? task_cpu(p) : cpu;
 984        }
 985
 986        /*
 987         * Otherwise, just let it ride on the affined RQ and the
 988         * post-schedule router will push the preempted task away
 989         */
 990        return task_cpu(p);
 991}
 992
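     /*
      * p woke up with the same priority as current.  If current could run
      * on some other CPU while p cannot, requeue p at the head and
      * reschedule so the push logic gets a chance to move current away.
      */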
 993static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 994{
 995        if (rq->curr->rt.nr_cpus_allowed == 1)
 996                return;
 997
 998        if (p->rt.nr_cpus_allowed != 1
 999            && cpupri_find(&rq->rd->cpupri, p, NULL))
1000                return;
1001
1002        if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1003                return;
1004
1005        /*
 1006         * There appear to be other cpus that can accept
 1007         * current and none to run 'p', so let's reschedule
1008         * to try and push current away:
1009         */
1010        requeue_task_rt(rq, p, 1);
1011        resched_task(rq->curr);
1012}
1013
1014#endif /* CONFIG_SMP */
1015
1016/*
1017 * Preempt the current task with a newly woken task if needed:
1018 */
1019static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1020{
1021        if (p->prio < rq->curr->prio) {
1022                resched_task(rq->curr);
1023                return;
1024        }
1025
1026#ifdef CONFIG_SMP
1027        /*
1028         * If:
1029         *
1030         * - the newly woken task is of equal priority to the current task
1031         * - the newly woken task is non-migratable while current is migratable
1032         * - current will be preempted on the next reschedule
1033         *
1034         * we should check to see if current can readily move to a different
1035         * cpu.  If so, we will reschedule to allow the push logic to try
1036         * to move current somewhere else, making room for our non-migratable
1037         * task.
1038         */
1039        if (p->prio == rq->curr->prio && !need_resched())
1040                check_preempt_equal_prio(rq, p);
1041#endif
1042}
1043
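     /* Pick the first entity on the highest-priority non-empty queue. */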
1044static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1045                                                   struct rt_rq *rt_rq)
1046{
1047        struct rt_prio_array *array = &rt_rq->active;
1048        struct sched_rt_entity *next = NULL;
1049        struct list_head *queue;
1050        int idx;
1051
1052        idx = sched_find_first_bit(array->bitmap);
1053        BUG_ON(idx >= MAX_RT_PRIO);
1054
1055        queue = array->queue + idx;
1056        next = list_entry(queue->next, struct sched_rt_entity, run_list);
1057
1058        return next;
1059}
1060
1061static struct task_struct *_pick_next_task_rt(struct rq *rq)
1062{
1063        struct sched_rt_entity *rt_se;
1064        struct task_struct *p;
1065        struct rt_rq *rt_rq;
1066
1067        rt_rq = &rq->rt;
1068
1069        if (unlikely(!rt_rq->rt_nr_running))
1070                return NULL;
1071
1072        if (rt_rq_throttled(rt_rq))
1073                return NULL;
1074
1075        do {
1076                rt_se = pick_next_rt_entity(rq, rt_rq);
1077                BUG_ON(!rt_se);
1078                rt_rq = group_rt_rq(rt_se);
1079        } while (rt_rq);
1080
1081        p = rt_task_of(rt_se);
1082        p->se.exec_start = rq->clock_task;
1083
1084        return p;
1085}
1086
1087static struct task_struct *pick_next_task_rt(struct rq *rq)
1088{
1089        struct task_struct *p = _pick_next_task_rt(rq);
1090
1091        /* The running task is never eligible for pushing */
1092        if (p)
1093                dequeue_pushable_task(rq, p);
1094
1095#ifdef CONFIG_SMP
1096        /*
1097         * We detect this state here so that we can avoid taking the RQ
1098         * lock again later if there is no need to push
1099         */
1100        rq->post_schedule = has_pushable_tasks(rq);
1101#endif
1102
1103        return p;
1104}
1105
1106static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1107{
1108        update_curr_rt(rq);
1109        p->se.exec_start = 0;
1110
1111        /*
1112         * The previous task needs to be made eligible for pushing
1113         * if it is still active
1114         */
1115        if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
1116                enqueue_pushable_task(rq, p);
1117}
1118
1119#ifdef CONFIG_SMP
1120
1121/* Only try algorithms three times */
1122#define RT_MAX_TRIES 3
1123
1124static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1125
1126static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1127{
1128        if (!task_running(rq, p) &&
1129            (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1130            (p->rt.nr_cpus_allowed > 1))
1131                return 1;
1132        return 0;
1133}
1134
1135/* Return the second highest RT task, NULL otherwise */
1136static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1137{
1138        struct task_struct *next = NULL;
1139        struct sched_rt_entity *rt_se;
1140        struct rt_prio_array *array;
1141        struct rt_rq *rt_rq;
1142        int idx;
1143
1144        for_each_leaf_rt_rq(rt_rq, rq) {
1145                array = &rt_rq->active;
1146                idx = sched_find_first_bit(array->bitmap);
1147next_idx:
1148                if (idx >= MAX_RT_PRIO)
1149                        continue;
1150                if (next && next->prio < idx)
1151                        continue;
1152                list_for_each_entry(rt_se, array->queue + idx, run_list) {
1153                        struct task_struct *p;
1154
1155                        if (!rt_entity_is_task(rt_se))
1156                                continue;
1157
1158                        p = rt_task_of(rt_se);
1159                        if (pick_rt_task(rq, p, cpu)) {
1160                                next = p;
1161                                break;
1162                        }
1163                }
1164                if (!next) {
1165                        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1166                        goto next_idx;
1167                }
1168        }
1169
1170        return next;
1171}
1172
1173static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1174
1175static int find_lowest_rq(struct task_struct *task)
1176{
1177        struct sched_domain *sd;
1178        struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1179        int this_cpu = smp_processor_id();
1180        int cpu      = task_cpu(task);
1181
1182        if (task->rt.nr_cpus_allowed == 1)
1183                return -1; /* No other targets possible */
1184
1185        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1186                return -1; /* No targets found */
1187
1188        /*
1189         * At this point we have built a mask of cpus representing the
1190         * lowest priority tasks in the system.  Now we want to elect
1191         * the best one based on our affinity and topology.
1192         *
1193         * We prioritize the last cpu that the task executed on since
1194         * it is most likely cache-hot in that location.
1195         */
1196        if (cpumask_test_cpu(cpu, lowest_mask))
1197                return cpu;
1198
1199        /*
1200         * Otherwise, we consult the sched_domains span maps to figure
1201         * out which cpu is logically closest to our hot cache data.
1202         */
1203        if (!cpumask_test_cpu(this_cpu, lowest_mask))
1204                this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1205
1206        for_each_domain(cpu, sd) {
1207                if (sd->flags & SD_WAKE_AFFINE) {
1208                        int best_cpu;
1209
1210                        /*
1211                         * "this_cpu" is cheaper to preempt than a
1212                         * remote processor.
1213                         */
1214                        if (this_cpu != -1 &&
1215                            cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
1216                                return this_cpu;
1217
1218                        best_cpu = cpumask_first_and(lowest_mask,
1219                                                     sched_domain_span(sd));
1220                        if (best_cpu < nr_cpu_ids)
1221                                return best_cpu;
1222                }
1223        }
1224
1225        /*
1226         * And finally, if there were no matches within the domains
1227         * just give the caller *something* to work with from the compatible
1228         * locations.
1229         */
1230        if (this_cpu != -1)
1231                return this_cpu;
1232
1233        cpu = cpumask_any(lowest_mask);
1234        if (cpu < nr_cpu_ids)
1235                return cpu;
1236        return -1;
1237}
1238
1239/* Will lock the rq it finds */
1240static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1241{
1242        struct rq *lowest_rq = NULL;
1243        int tries;
1244        int cpu;
1245
1246        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1247                cpu = find_lowest_rq(task);
1248
1249                if ((cpu == -1) || (cpu == rq->cpu))
1250                        break;
1251
1252                lowest_rq = cpu_rq(cpu);
1253
1254                /* if the prio of this runqueue changed, try again */
1255                if (double_lock_balance(rq, lowest_rq)) {
1256                        /*
1257                         * We had to unlock the run queue. In
 1258                         * the meantime, the task could have
1259                         * migrated already or had its affinity changed.
1260                         * Also make sure that it wasn't scheduled on its rq.
1261                         */
1262                        if (unlikely(task_rq(task) != rq ||
1263                                     !cpumask_test_cpu(lowest_rq->cpu,
1264                                                       &task->cpus_allowed) ||
1265                                     task_running(rq, task) ||
1266                                     !task->se.on_rq)) {
1267
1268                                raw_spin_unlock(&lowest_rq->lock);
1269                                lowest_rq = NULL;
1270                                break;
1271                        }
1272                }
1273
1274                /* If this rq is still suitable use it. */
1275                if (lowest_rq->rt.highest_prio.curr > task->prio)
1276                        break;
1277
1278                /* try again */
1279                double_unlock_balance(rq, lowest_rq);
1280                lowest_rq = NULL;
1281        }
1282
1283        return lowest_rq;
1284}
1285
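     /* Return the highest-priority task on the pushable list, or NULL. */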
1286static struct task_struct *pick_next_pushable_task(struct rq *rq)
1287{
1288        struct task_struct *p;
1289
1290        if (!has_pushable_tasks(rq))
1291                return NULL;
1292
1293        p = plist_first_entry(&rq->rt.pushable_tasks,
1294                              struct task_struct, pushable_tasks);
1295
1296        BUG_ON(rq->cpu != task_cpu(p));
1297        BUG_ON(task_current(rq, p));
1298        BUG_ON(p->rt.nr_cpus_allowed <= 1);
1299
1300        BUG_ON(!p->se.on_rq);
1301        BUG_ON(!rt_task(p));
1302
1303        return p;
1304}
1305
1306/*
 1307 * If the current CPU has more than one RT task, see if the non-running
 1308 * task can migrate over to a CPU that is running a task
1309 * of lesser priority.
1310 */
1311static int push_rt_task(struct rq *rq)
1312{
1313        struct task_struct *next_task;
1314        struct rq *lowest_rq;
1315
1316        if (!rq->rt.overloaded)
1317                return 0;
1318
1319        next_task = pick_next_pushable_task(rq);
1320        if (!next_task)
1321                return 0;
1322
1323retry:
1324        if (unlikely(next_task == rq->curr)) {
1325                WARN_ON(1);
1326                return 0;
1327        }
1328
1329        /*
 1330         * It's possible that the next_task slipped in with a
 1331         * higher priority than current. If that's the case,
1332         * just reschedule current.
1333         */
1334        if (unlikely(next_task->prio < rq->curr->prio)) {
1335                resched_task(rq->curr);
1336                return 0;
1337        }
1338
1339        /* We might release rq lock */
1340        get_task_struct(next_task);
1341
1342        /* find_lock_lowest_rq locks the rq if found */
1343        lowest_rq = find_lock_lowest_rq(next_task, rq);
1344        if (!lowest_rq) {
1345                struct task_struct *task;
1346                /*
 1347                 * find_lock_lowest_rq releases rq->lock,
1348                 * so it is possible that next_task has migrated.
1349                 *
1350                 * We need to make sure that the task is still on the same
1351                 * run-queue and is also still the next task eligible for
1352                 * pushing.
1353                 */
1354                task = pick_next_pushable_task(rq);
1355                if (task_cpu(next_task) == rq->cpu && task == next_task) {
1356                        /*
 1357                         * If we get here, the task hasn't moved at all, but
1358                         * it has failed to push.  We will not try again,
1359                         * since the other cpus will pull from us when they
1360                         * are ready.
1361                         */
1362                        dequeue_pushable_task(rq, next_task);
1363                        goto out;
1364                }
1365
1366                if (!task)
1367                        /* No more tasks, just exit */
1368                        goto out;
1369
1370                /*
1371                 * Something has shifted, try again.
1372                 */
1373                put_task_struct(next_task);
1374                next_task = task;
1375                goto retry;
1376        }
1377
1378        deactivate_task(rq, next_task, 0);
1379        set_task_cpu(next_task, lowest_rq->cpu);
1380        activate_task(lowest_rq, next_task, 0);
1381
1382        resched_task(lowest_rq->curr);
1383
1384        double_unlock_balance(rq, lowest_rq);
1385
1386out:
1387        put_task_struct(next_task);
1388
1389        return 1;
1390}
1391
1392static void push_rt_tasks(struct rq *rq)
1393{
 1394        /* push_rt_task will return true if it moved an RT task */
1395        while (push_rt_task(rq))
1396                ;
1397}
1398
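     /*
      * Scan the CPUs marked RT-overloaded in the root domain and pull over
      * any task that would preempt what is about to run here.  Returns 1 if
      * something was pulled.
      */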
1399static int pull_rt_task(struct rq *this_rq)
1400{
1401        int this_cpu = this_rq->cpu, ret = 0, cpu;
1402        struct task_struct *p;
1403        struct rq *src_rq;
1404
1405        if (likely(!rt_overloaded(this_rq)))
1406                return 0;
1407
1408        for_each_cpu(cpu, this_rq->rd->rto_mask) {
1409                if (this_cpu == cpu)
1410                        continue;
1411
1412                src_rq = cpu_rq(cpu);
1413
1414                /*
1415                 * Don't bother taking the src_rq->lock if the next highest
1416                 * task is known to be lower-priority than our current task.
1417                 * This may look racy, but if this value is about to go
1418                 * logically higher, the src_rq will push this task away.
 1419                 * And if it's going logically lower, we do not care.
1420                 */
1421                if (src_rq->rt.highest_prio.next >=
1422                    this_rq->rt.highest_prio.curr)
1423                        continue;
1424
1425                /*
1426                 * We can potentially drop this_rq's lock in
1427                 * double_lock_balance, and another CPU could
1428                 * alter this_rq
1429                 */
1430                double_lock_balance(this_rq, src_rq);
1431
1432                /*
1433                 * Are there still pullable RT tasks?
1434                 */
1435                if (src_rq->rt.rt_nr_running <= 1)
1436                        goto skip;
1437
1438                p = pick_next_highest_task_rt(src_rq, this_cpu);
1439
1440                /*
1441                 * Do we have an RT task that preempts
1442                 * the to-be-scheduled task?
1443                 */
1444                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1445                        WARN_ON(p == src_rq->curr);
1446                        WARN_ON(!p->se.on_rq);
1447
1448                        /*
1449                         * There's a chance that p is higher in priority
1450                         * than what's currently running on its cpu.
 1451                         * This is just that p is waking up and hasn't
 1452                         * had a chance to schedule. We only pull
 1453                         * p if it is lower in priority than the
 1454                         * current task on the run queue.
1455                         */
1456                        if (p->prio < src_rq->curr->prio)
1457                                goto skip;
1458
1459                        ret = 1;
1460
1461                        deactivate_task(src_rq, p, 0);
1462                        set_task_cpu(p, this_cpu);
1463                        activate_task(this_rq, p, 0);
1464                        /*
1465                         * We continue with the search, just in
1466                         * case there's an even higher prio task
 1467                         * in another runqueue. (low likelihood
1468                         * but possible)
1469                         */
1470                }
1471skip:
1472                double_unlock_balance(this_rq, src_rq);
1473        }
1474
1475        return ret;
1476}
1477
1478static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1479{
1480        /* Try to pull RT tasks here if we lower this rq's prio */
1481        if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
1482                pull_rt_task(rq);
1483}
1484
1485static void post_schedule_rt(struct rq *rq)
1486{
1487        push_rt_tasks(rq);
1488}
1489
1490/*
1491 * If we are not running and we are not going to reschedule soon, we should
1492 * try to push tasks away now
1493 */
1494static void task_woken_rt(struct rq *rq, struct task_struct *p)
1495{
1496        if (!task_running(rq, p) &&
1497            !test_tsk_need_resched(rq->curr) &&
1498            has_pushable_tasks(rq) &&
1499            p->rt.nr_cpus_allowed > 1 &&
1500            rt_task(rq->curr) &&
1501            (rq->curr->rt.nr_cpus_allowed < 2 ||
1502             rq->curr->prio < p->prio))
1503                push_rt_tasks(rq);
1504}
1505
1506static void set_cpus_allowed_rt(struct task_struct *p,
1507                                const struct cpumask *new_mask)
1508{
1509        int weight = cpumask_weight(new_mask);
1510
1511        BUG_ON(!rt_task(p));
1512
1513        /*
1514         * Update the migration status of the RQ if we have an RT task
1515         * which is running AND changing its weight value.
1516         */
1517        if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1518                struct rq *rq = task_rq(p);
1519
1520                if (!task_current(rq, p)) {
1521                        /*
1522                         * Make sure we dequeue this task from the pushable list
1523                         * before going further.  It will either remain off of
1524                         * the list because we are no longer pushable, or it
1525                         * will be requeued.
1526                         */
1527                        if (p->rt.nr_cpus_allowed > 1)
1528                                dequeue_pushable_task(rq, p);
1529
1530                        /*
1531                         * Requeue if our weight is changing and still > 1
1532                         */
1533                        if (weight > 1)
1534                                enqueue_pushable_task(rq, p);
1535
1536                }
1537
1538                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1539                        rq->rt.rt_nr_migratory++;
1540                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1541                        BUG_ON(!rq->rt.rt_nr_migratory);
1542                        rq->rt.rt_nr_migratory--;
1543                }
1544
1545                update_rt_migration(&rq->rt);
1546        }
1547
1548        cpumask_copy(&p->cpus_allowed, new_mask);
1549        p->rt.nr_cpus_allowed = weight;
1550}
1551
1552/* Assumes rq->lock is held */
1553static void rq_online_rt(struct rq *rq)
1554{
1555        if (rq->rt.overloaded)
1556                rt_set_overload(rq);
1557
1558        __enable_runtime(rq);
1559
1560        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1561}
1562
1563/* Assumes rq->lock is held */
1564static void rq_offline_rt(struct rq *rq)
1565{
1566        if (rq->rt.overloaded)
1567                rt_clear_overload(rq);
1568
1569        __disable_runtime(rq);
1570
1571        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1572}
1573
1574/*
 1575 * When switching away from the rt queue, we bring ourselves to a position
 1576 * where we might want to pull RT tasks from other runqueues.
1577 */
1578static void switched_from_rt(struct rq *rq, struct task_struct *p,
1579                           int running)
1580{
1581        /*
1582         * If there are other RT tasks then we will reschedule
1583         * and the scheduling of the other RT tasks will handle
1584         * the balancing. But if we are the last RT task
1585         * we may need to handle the pulling of RT tasks
1586         * now.
1587         */
1588        if (!rq->rt.rt_nr_running)
1589                pull_rt_task(rq);
1590}
1591
1592static inline void init_sched_rt_class(void)
1593{
1594        unsigned int i;
1595
1596        for_each_possible_cpu(i)
1597                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1598                                        GFP_KERNEL, cpu_to_node(i));
1599}
1600#endif /* CONFIG_SMP */
1601
1602/*
1603 * When switching a task to RT, we may overload the runqueue
1604 * with RT tasks. In this case we try to push them off to
1605 * other runqueues.
1606 */
1607static void switched_to_rt(struct rq *rq, struct task_struct *p,
1608                           int running)
1609{
1610        int check_resched = 1;
1611
1612        /*
1613         * If we are already running, then there's nothing
1614         * that needs to be done. But if we are not running
1615         * we may need to preempt the current running task.
1616         * If that current running task is also an RT task
1617         * then see if we can move to another run queue.
1618         */
1619        if (!running) {
1620#ifdef CONFIG_SMP
1621                if (rq->rt.overloaded && push_rt_task(rq) &&
1622                    /* Don't resched if we changed runqueues */
1623                    rq != task_rq(p))
1624                        check_resched = 0;
1625#endif /* CONFIG_SMP */
1626                if (check_resched && p->prio < rq->curr->prio)
1627                        resched_task(rq->curr);
1628        }
1629}
1630
1631/*
1632 * Priority of the task has changed. This may cause
1633 * us to initiate a push or pull.
1634 */
1635static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1636                            int oldprio, int running)
1637{
1638        if (running) {
1639#ifdef CONFIG_SMP
1640                /*
1641                 * If our priority decreases while running, we
1642                 * may need to pull tasks to this runqueue.
1643                 */
1644                if (oldprio < p->prio)
1645                        pull_rt_task(rq);
1646                /*
1647                 * If there's a higher priority task waiting to run
1648                 * then reschedule. Note, the above pull_rt_task
1649                 * can release the rq lock and p could migrate.
1650                 * Only reschedule if p is still on the same runqueue.
1651                 */
1652                if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1653                        resched_task(p);
1654#else
1655                /* For UP simply resched on drop of prio */
1656                if (oldprio < p->prio)
1657                        resched_task(p);
1658#endif /* CONFIG_SMP */
1659        } else {
1660                /*
 1661                 * This task is not running, but if its
 1662                 * priority is higher than the current running task's,
 1663                 * then reschedule.
1664                 */
1665                if (p->prio < rq->curr->prio)
1666                        resched_task(rq->curr);
1667        }
1668}
1669
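     /*
      * RLIMIT_RTTIME enforcement: count the ticks this task has been
      * running and, once the soft limit is exceeded, set the cputime expiry
      * so the cpu-timer code can act on the overrun.
      */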
1670static void watchdog(struct rq *rq, struct task_struct *p)
1671{
1672        unsigned long soft, hard;
1673
1674        /* max may change after cur was read, this will be fixed next tick */
1675        soft = task_rlimit(p, RLIMIT_RTTIME);
1676        hard = task_rlimit_max(p, RLIMIT_RTTIME);
1677
1678        if (soft != RLIM_INFINITY) {
1679                unsigned long next;
1680
1681                p->rt.timeout++;
1682                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1683                if (p->rt.timeout > next)
1684                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1685        }
1686}
1687
1688static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1689{
1690        update_curr_rt(rq);
1691
1692        watchdog(rq, p);
1693
1694        /*
1695         * RR tasks need a special form of timeslice management.
1696         * FIFO tasks have no timeslices.
1697         */
1698        if (p->policy != SCHED_RR)
1699                return;
1700
1701        if (--p->rt.time_slice)
1702                return;
1703
1704        p->rt.time_slice = DEF_TIMESLICE;
1705
1706        /*
1707         * Requeue to the end of queue if we are not the only element
1708         * on the queue:
1709         */
1710        if (p->rt.run_list.prev != p->rt.run_list.next) {
1711                requeue_task_rt(rq, p, 0);
1712                set_tsk_need_resched(p);
1713        }
1714}
1715
1716static void set_curr_task_rt(struct rq *rq)
1717{
1718        struct task_struct *p = rq->curr;
1719
1720        p->se.exec_start = rq->clock_task;
1721
1722        /* The running task is never eligible for pushing */
1723        dequeue_pushable_task(rq, p);
1724}
1725
1726static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1727{
1728        /*
1729         * Time slice is 0 for SCHED_FIFO tasks
1730         */
1731        if (task->policy == SCHED_RR)
1732                return DEF_TIMESLICE;
1733        else
1734                return 0;
1735}
1736
1737static const struct sched_class rt_sched_class = {
1738        .next                   = &fair_sched_class,
1739        .enqueue_task           = enqueue_task_rt,
1740        .dequeue_task           = dequeue_task_rt,
1741        .yield_task             = yield_task_rt,
1742
1743        .check_preempt_curr     = check_preempt_curr_rt,
1744
1745        .pick_next_task         = pick_next_task_rt,
1746        .put_prev_task          = put_prev_task_rt,
1747
1748#ifdef CONFIG_SMP
1749        .select_task_rq         = select_task_rq_rt,
1750
1751        .set_cpus_allowed       = set_cpus_allowed_rt,
1752        .rq_online              = rq_online_rt,
1753        .rq_offline             = rq_offline_rt,
1754        .pre_schedule           = pre_schedule_rt,
1755        .post_schedule          = post_schedule_rt,
1756        .task_woken             = task_woken_rt,
1757        .switched_from          = switched_from_rt,
1758#endif
1759
1760        .set_curr_task          = set_curr_task_rt,
1761        .task_tick              = task_tick_rt,
1762
1763        .get_rr_interval        = get_rr_interval_rt,
1764
1765        .prio_changed           = prio_changed_rt,
1766        .switched_to            = switched_to_rt,
1767};
1768
1769#ifdef CONFIG_SCHED_DEBUG
1770extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1771
1772static void print_rt_stats(struct seq_file *m, int cpu)
1773{
1774        struct rt_rq *rt_rq;
1775
1776        rcu_read_lock();
1777        for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1778                print_rt_rq(m, cpu, rt_rq);
1779        rcu_read_unlock();
1780}
1781#endif /* CONFIG_SCHED_DEBUG */
1782
1783