linux/kernel/sched_rt.c
   1/*
   2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   3 * policies)
   4 */
   5
   6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
   7{
   8        return container_of(rt_se, struct task_struct, rt);
   9}
  10
  11#ifdef CONFIG_RT_GROUP_SCHED
  12
  13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
  14
  15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  16{
  17        return rt_rq->rq;
  18}
  19
  20static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  21{
  22        return rt_se->rt_rq;
  23}
  24
  25#else /* CONFIG_RT_GROUP_SCHED */
  26
  27#define rt_entity_is_task(rt_se) (1)
  28
  29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  30{
  31        return container_of(rt_rq, struct rq, rt);
  32}
  33
  34static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  35{
  36        struct task_struct *p = rt_task_of(rt_se);
  37        struct rq *rq = task_rq(p);
  38
  39        return &rq->rt;
  40}
  41
  42#endif /* CONFIG_RT_GROUP_SCHED */
  43
  44#ifdef CONFIG_SMP
  45
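     /*
      * RT overload tracking: each root domain keeps a count (rd->rto_count)
      * and a cpumask (rd->rto_mask) of runqueues that have RT tasks which
      * could be pushed elsewhere.  Pullers check the count first and only
      * then consult the mask.
      */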
  46static inline int rt_overloaded(struct rq *rq)
  47{
  48        return atomic_read(&rq->rd->rto_count);
  49}
  50
  51static inline void rt_set_overload(struct rq *rq)
  52{
  53        if (!rq->online)
  54                return;
  55
  56        cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
  57        /*
  58         * Make sure the mask is visible before we set
  59         * the overload count. That is checked to determine
  60         * if we should look at the mask. It would be a shame
  61         * if we looked at the mask, but the mask was not
  62         * updated yet.
  63         */
  64        wmb();
  65        atomic_inc(&rq->rd->rto_count);
  66}
  67
  68static inline void rt_clear_overload(struct rq *rq)
  69{
  70        if (!rq->online)
  71                return;
  72
  73        /* the order here really doesn't matter */
  74        atomic_dec(&rq->rd->rto_count);
  75        cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
  76}
  77
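     /*
      * A runqueue counts as RT-overloaded when it has more than one runnable
      * RT task and at least one of them may run on another CPU.  Keep the
      * per-rq 'overloaded' flag and the root-domain overload state in sync.
      */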
  78static void update_rt_migration(struct rt_rq *rt_rq)
  79{
  80        if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
  81                if (!rt_rq->overloaded) {
  82                        rt_set_overload(rq_of_rt_rq(rt_rq));
  83                        rt_rq->overloaded = 1;
  84                }
  85        } else if (rt_rq->overloaded) {
  86                rt_clear_overload(rq_of_rt_rq(rt_rq));
  87                rt_rq->overloaded = 0;
  88        }
  89}
  90
  91static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  92{
  93        if (!rt_entity_is_task(rt_se))
  94                return;
  95
  96        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
  97
  98        rt_rq->rt_nr_total++;
  99        if (rt_se->nr_cpus_allowed > 1)
 100                rt_rq->rt_nr_migratory++;
 101
 102        update_rt_migration(rt_rq);
 103}
 104
 105static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 106{
 107        if (!rt_entity_is_task(rt_se))
 108                return;
 109
 110        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 111
 112        rt_rq->rt_nr_total--;
 113        if (rt_se->nr_cpus_allowed > 1)
 114                rt_rq->rt_nr_migratory--;
 115
 116        update_rt_migration(rt_rq);
 117}
 118
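     /*
      * rq->rt.pushable_tasks is a priority-sorted plist of queued RT tasks
      * that are allowed to run on more than one CPU; the push logic picks
      * its candidates from this list.  Re-adding an already queued task
      * simply re-sorts it under its current prio.
      */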
 119static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 120{
 121        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 122        plist_node_init(&p->pushable_tasks, p->prio);
 123        plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 124}
 125
 126static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 127{
 128        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 129}
 130
 131#else
 132
 133static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 134{
 135}
 136
 137static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 138{
 139}
 140
 141static inline
 142void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 143{
 144}
 145
 146static inline
 147void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 148{
 149}
 150
 151#endif /* CONFIG_SMP */
 152
 153static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 154{
 155        return !list_empty(&rt_se->run_list);
 156}
 157
 158#ifdef CONFIG_RT_GROUP_SCHED
 159
 160static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 161{
 162        if (!rt_rq->tg)
 163                return RUNTIME_INF;
 164
 165        return rt_rq->rt_runtime;
 166}
 167
 168static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 169{
 170        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 171}
 172
 173#define for_each_leaf_rt_rq(rt_rq, rq) \
 174        list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 175
 176#define for_each_sched_rt_entity(rt_se) \
 177        for (; rt_se; rt_se = rt_se->parent)
 178
 179static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 180{
 181        return rt_se->my_q;
 182}
 183
 184static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
 185static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 186
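     /*
      * Throttle helpers for the group case: sched_rt_rq_enqueue() re-inserts
      * a group's entity once it has runnable tasks again (and reschedules if
      * the group beats the current task), sched_rt_rq_dequeue() removes the
      * entity when the group gets throttled.
      */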
 187static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 188{
 189        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 190        struct sched_rt_entity *rt_se = rt_rq->rt_se;
 191
 192        if (rt_rq->rt_nr_running) {
 193                if (rt_se && !on_rt_rq(rt_se))
 194                        enqueue_rt_entity(rt_se);
 195                if (rt_rq->highest_prio.curr < curr->prio)
 196                        resched_task(curr);
 197        }
 198}
 199
 200static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 201{
 202        struct sched_rt_entity *rt_se = rt_rq->rt_se;
 203
 204        if (rt_se && on_rt_rq(rt_se))
 205                dequeue_rt_entity(rt_se);
 206}
 207
 208static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 209{
 210        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 211}
 212
 213static int rt_se_boosted(struct sched_rt_entity *rt_se)
 214{
 215        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 216        struct task_struct *p;
 217
 218        if (rt_rq)
 219                return !!rt_rq->rt_nr_boosted;
 220
 221        p = rt_task_of(rt_se);
 222        return p->prio != p->normal_prio;
 223}
 224
 225#ifdef CONFIG_SMP
 226static inline const struct cpumask *sched_rt_period_mask(void)
 227{
 228        return cpu_rq(smp_processor_id())->rd->span;
 229}
 230#else
 231static inline const struct cpumask *sched_rt_period_mask(void)
 232{
 233        return cpu_online_mask;
 234}
 235#endif
 236
 237static inline
 238struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 239{
 240        return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 241}
 242
 243static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 244{
 245        return &rt_rq->tg->rt_bandwidth;
 246}
 247
 248#else /* !CONFIG_RT_GROUP_SCHED */
 249
 250static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 251{
 252        return rt_rq->rt_runtime;
 253}
 254
 255static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 256{
 257        return ktime_to_ns(def_rt_bandwidth.rt_period);
 258}
 259
 260#define for_each_leaf_rt_rq(rt_rq, rq) \
 261        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 262
 263#define for_each_sched_rt_entity(rt_se) \
 264        for (; rt_se; rt_se = NULL)
 265
 266static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 267{
 268        return NULL;
 269}
 270
 271static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 272{
 273        if (rt_rq->rt_nr_running)
 274                resched_task(rq_of_rt_rq(rt_rq)->curr);
 275}
 276
 277static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 278{
 279}
 280
 281static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 282{
 283        return rt_rq->rt_throttled;
 284}
 285
 286static inline const struct cpumask *sched_rt_period_mask(void)
 287{
 288        return cpu_online_mask;
 289}
 290
 291static inline
 292struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 293{
 294        return &cpu_rq(cpu)->rt;
 295}
 296
 297static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 298{
 299        return &def_rt_bandwidth;
 300}
 301
 302#endif /* CONFIG_RT_GROUP_SCHED */
 303
 304#ifdef CONFIG_SMP
 305/*
 306 * We ran out of runtime, see if we can borrow some from our neighbours.
 307 */
 308static int do_balance_runtime(struct rt_rq *rt_rq)
 309{
 310        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 311        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 312        int i, weight, more = 0;
 313        u64 rt_period;
 314
 315        weight = cpumask_weight(rd->span);
 316
 317        spin_lock(&rt_b->rt_runtime_lock);
 318        rt_period = ktime_to_ns(rt_b->rt_period);
 319        for_each_cpu(i, rd->span) {
 320                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 321                s64 diff;
 322
 323                if (iter == rt_rq)
 324                        continue;
 325
 326                spin_lock(&iter->rt_runtime_lock);
 327                /*
 328                 * Either all rqs have inf runtime and there's nothing to steal
 329                 * or __disable_runtime() below sets a specific rq to inf to
 330                 * indicate it's been disabled and disallow stealing.
 331                 */
 332                if (iter->rt_runtime == RUNTIME_INF)
 333                        goto next;
 334
 335                /*
 336                 * From runqueues with spare time, take 1/n part of their
 337                 * spare time, but no more than our period.
 338                 */
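                    /*
                     * Example (hypothetical numbers): in a 4-CPU root domain
                     * (weight == 4), an iter with 3ms of unused runtime
                     * donates at most 3ms/4, clamped so that our own runtime
                     * never exceeds rt_period.
                     */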
 339                diff = iter->rt_runtime - iter->rt_time;
 340                if (diff > 0) {
 341                        diff = div_u64((u64)diff, weight);
 342                        if (rt_rq->rt_runtime + diff > rt_period)
 343                                diff = rt_period - rt_rq->rt_runtime;
 344                        iter->rt_runtime -= diff;
 345                        rt_rq->rt_runtime += diff;
 346                        more = 1;
 347                        if (rt_rq->rt_runtime == rt_period) {
 348                                spin_unlock(&iter->rt_runtime_lock);
 349                                break;
 350                        }
 351                }
 352next:
 353                spin_unlock(&iter->rt_runtime_lock);
 354        }
 355        spin_unlock(&rt_b->rt_runtime_lock);
 356
 357        return more;
 358}
 359
 360/*
 361 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 362 */
 363static void __disable_runtime(struct rq *rq)
 364{
 365        struct root_domain *rd = rq->rd;
 366        struct rt_rq *rt_rq;
 367
 368        if (unlikely(!scheduler_running))
 369                return;
 370
 371        for_each_leaf_rt_rq(rt_rq, rq) {
 372                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 373                s64 want;
 374                int i;
 375
 376                spin_lock(&rt_b->rt_runtime_lock);
 377                spin_lock(&rt_rq->rt_runtime_lock);
 378                /*
 379                 * Either we're all inf and nobody needs to borrow, or we're
 380                 * already disabled and thus have nothing to do, or we have
 381                 * exactly the right amount of runtime to take out.
 382                 */
 383                if (rt_rq->rt_runtime == RUNTIME_INF ||
 384                                rt_rq->rt_runtime == rt_b->rt_runtime)
 385                        goto balanced;
 386                spin_unlock(&rt_rq->rt_runtime_lock);
 387
 388                /*
 389                 * Calculate the difference between what we started out with
 390                 * and what we currently have; that's the amount of runtime
 391                 * we lent out and now have to reclaim.
 392                 */
 393                want = rt_b->rt_runtime - rt_rq->rt_runtime;
 394
 395                /*
 396                 * Greedy reclaim, take back as much as we can.
 397                 */
 398                for_each_cpu(i, rd->span) {
 399                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 400                        s64 diff;
 401
 402                        /*
 403                         * Can't reclaim from ourselves or disabled runqueues.
 404                         */
 405                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 406                                continue;
 407
 408                        spin_lock(&iter->rt_runtime_lock);
 409                        if (want > 0) {
 410                                diff = min_t(s64, iter->rt_runtime, want);
 411                                iter->rt_runtime -= diff;
 412                                want -= diff;
 413                        } else {
 414                                iter->rt_runtime -= want;
 415                                want -= want;
 416                        }
 417                        spin_unlock(&iter->rt_runtime_lock);
 418
 419                        if (!want)
 420                                break;
 421                }
 422
 423                spin_lock(&rt_rq->rt_runtime_lock);
 424                /*
 425                 * We cannot be left wanting - that would mean some runtime
 426                 * leaked out of the system.
 427                 */
 428                BUG_ON(want);
 429balanced:
 430                /*
 431                 * Disable all the borrow logic by pretending we have inf
 432                 * runtime - in which case borrowing doesn't make sense.
 433                 */
 434                rt_rq->rt_runtime = RUNTIME_INF;
 435                spin_unlock(&rt_rq->rt_runtime_lock);
 436                spin_unlock(&rt_b->rt_runtime_lock);
 437        }
 438}
 439
 440static void disable_runtime(struct rq *rq)
 441{
 442        unsigned long flags;
 443
 444        spin_lock_irqsave(&rq->lock, flags);
 445        __disable_runtime(rq);
 446        spin_unlock_irqrestore(&rq->lock, flags);
 447}
 448
 449static void __enable_runtime(struct rq *rq)
 450{
 451        struct rt_rq *rt_rq;
 452
 453        if (unlikely(!scheduler_running))
 454                return;
 455
 456        /*
 457         * Reset each runqueue's bandwidth settings
 458         */
 459        for_each_leaf_rt_rq(rt_rq, rq) {
 460                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 461
 462                spin_lock(&rt_b->rt_runtime_lock);
 463                spin_lock(&rt_rq->rt_runtime_lock);
 464                rt_rq->rt_runtime = rt_b->rt_runtime;
 465                rt_rq->rt_time = 0;
 466                rt_rq->rt_throttled = 0;
 467                spin_unlock(&rt_rq->rt_runtime_lock);
 468                spin_unlock(&rt_b->rt_runtime_lock);
 469        }
 470}
 471
 472static void enable_runtime(struct rq *rq)
 473{
 474        unsigned long flags;
 475
 476        spin_lock_irqsave(&rq->lock, flags);
 477        __enable_runtime(rq);
 478        spin_unlock_irqrestore(&rq->lock, flags);
 479}
 480
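     /*
      * Called with rt_rq->rt_runtime_lock held; it is dropped across
      * do_balance_runtime(), which takes the bandwidth lock and the other
      * runqueues' runtime locks itself.
      */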
 481static int balance_runtime(struct rt_rq *rt_rq)
 482{
 483        int more = 0;
 484
 485        if (rt_rq->rt_time > rt_rq->rt_runtime) {
 486                spin_unlock(&rt_rq->rt_runtime_lock);
 487                more = do_balance_runtime(rt_rq);
 488                spin_lock(&rt_rq->rt_runtime_lock);
 489        }
 490
 491        return more;
 492}
 493#else /* !CONFIG_SMP */
 494static inline int balance_runtime(struct rt_rq *rt_rq)
 495{
 496        return 0;
 497}
 498#endif /* CONFIG_SMP */
 499
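     /*
      * Runs from the rt_bandwidth period timer: for every rt_rq in the
      * period mask, refund up to 'overrun' periods worth of consumed
      * runtime and unthrottle (re-enqueue) runqueues that are back under
      * budget.  Returns 1 when everything is idle so the caller can let
      * the timer stop.
      */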
 500static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 501{
 502        int i, idle = 1;
 503        const struct cpumask *span;
 504
 505        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 506                return 1;
 507
 508        span = sched_rt_period_mask();
 509        for_each_cpu(i, span) {
 510                int enqueue = 0;
 511                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 512                struct rq *rq = rq_of_rt_rq(rt_rq);
 513
 514                spin_lock(&rq->lock);
 515                if (rt_rq->rt_time) {
 516                        u64 runtime;
 517
 518                        spin_lock(&rt_rq->rt_runtime_lock);
 519                        if (rt_rq->rt_throttled)
 520                                balance_runtime(rt_rq);
 521                        runtime = rt_rq->rt_runtime;
 522                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 523                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 524                                rt_rq->rt_throttled = 0;
 525                                enqueue = 1;
 526                        }
 527                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
 528                                idle = 0;
 529                        spin_unlock(&rt_rq->rt_runtime_lock);
 530                } else if (rt_rq->rt_nr_running)
 531                        idle = 0;
 532
 533                if (enqueue)
 534                        sched_rt_rq_enqueue(rt_rq);
 535                spin_unlock(&rq->lock);
 536        }
 537
 538        return idle;
 539}
 540
 541static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 542{
 543#ifdef CONFIG_RT_GROUP_SCHED
 544        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 545
 546        if (rt_rq)
 547                return rt_rq->highest_prio.curr;
 548#endif
 549
 550        return rt_task_of(rt_se)->prio;
 551}
 552
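     /*
      * Check whether this rt_rq has used up its runtime budget; if so, try
      * to borrow from other CPUs first and only then throttle it, i.e.
      * dequeue its entity until the period timer replenishes runtime.
      */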
 553static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 554{
 555        u64 runtime = sched_rt_runtime(rt_rq);
 556
 557        if (rt_rq->rt_throttled)
 558                return rt_rq_throttled(rt_rq);
 559
 560        if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
 561                return 0;
 562
 563        balance_runtime(rt_rq);
 564        runtime = sched_rt_runtime(rt_rq);
 565        if (runtime == RUNTIME_INF)
 566                return 0;
 567
 568        if (rt_rq->rt_time > runtime) {
 569                rt_rq->rt_throttled = 1;
 570                if (rt_rq_throttled(rt_rq)) {
 571                        sched_rt_rq_dequeue(rt_rq);
 572                        return 1;
 573                }
 574        }
 575
 576        return 0;
 577}
 578
 579/*
 580 * Update the current task's runtime statistics. Skip current tasks that
 581 * are not in our scheduling class.
 582 */
 583static void update_curr_rt(struct rq *rq)
 584{
 585        struct task_struct *curr = rq->curr;
 586        struct sched_rt_entity *rt_se = &curr->rt;
 587        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 588        u64 delta_exec;
 589
 590        if (!task_has_rt_policy(curr))
 591                return;
 592
 593        delta_exec = rq->clock - curr->se.exec_start;
 594        if (unlikely((s64)delta_exec < 0))
 595                delta_exec = 0;
 596
 597        schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
 598
 599        curr->se.sum_exec_runtime += delta_exec;
 600        account_group_exec_runtime(curr, delta_exec);
 601
 602        curr->se.exec_start = rq->clock;
 603        cpuacct_charge(curr, delta_exec);
 604
 605        if (!rt_bandwidth_enabled())
 606                return;
 607
 608        for_each_sched_rt_entity(rt_se) {
 609                rt_rq = rt_rq_of_se(rt_se);
 610
 611                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 612                        spin_lock(&rt_rq->rt_runtime_lock);
 613                        rt_rq->rt_time += delta_exec;
 614                        if (sched_rt_runtime_exceeded(rt_rq))
 615                                resched_task(curr);
 616                        spin_unlock(&rt_rq->rt_runtime_lock);
 617                }
 618        }
 619}
 620
 621#if defined CONFIG_SMP
 622
 623static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
 624
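     /*
      * Priority of the next-highest RT task on this rq that could be pushed
      * away (the running task is excluded), or MAX_RT_PRIO if there is none.
      */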
 625static inline int next_prio(struct rq *rq)
 626{
 627        struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
 628
 629        if (next && rt_prio(next->prio))
 630                return next->prio;
 631        else
 632                return MAX_RT_PRIO;
 633}
 634
 635static void
 636inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 637{
 638        struct rq *rq = rq_of_rt_rq(rt_rq);
 639
 640        if (prio < prev_prio) {
 641
 642                /*
 643                 * If the new task is higher in priority than anything on the
 644                 * run-queue, we know that the previous high becomes our
 645                 * next-highest.
 646                 */
 647                rt_rq->highest_prio.next = prev_prio;
 648
 649                if (rq->online)
 650                        cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 651
 652        } else if (prio == rt_rq->highest_prio.curr)
 653                /*
 654                 * If the next task is equal in priority to the highest on
 655                 * the run-queue, then we implicitly know that the next highest
 656                 * task cannot be any lower than current
 657                 */
 658                rt_rq->highest_prio.next = prio;
 659        else if (prio < rt_rq->highest_prio.next)
 660                /*
 661                 * Otherwise, we need to recompute next-highest
 662                 */
 663                rt_rq->highest_prio.next = next_prio(rq);
 664}
 665
 666static void
 667dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 668{
 669        struct rq *rq = rq_of_rt_rq(rt_rq);
 670
 671        if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
 672                rt_rq->highest_prio.next = next_prio(rq);
 673
 674        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 675                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 676}
 677
 678#else /* CONFIG_SMP */
 679
 680static inline
 681void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 682static inline
 683void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 684
 685#endif /* CONFIG_SMP */
 686
 687#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
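     /*
      * Maintain the cached rt_rq->highest_prio.curr value as entities are
      * added and removed, so pick and preemption checks do not have to scan
      * the priority bitmap every time.
      */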
 688static void
 689inc_rt_prio(struct rt_rq *rt_rq, int prio)
 690{
 691        int prev_prio = rt_rq->highest_prio.curr;
 692
 693        if (prio < prev_prio)
 694                rt_rq->highest_prio.curr = prio;
 695
 696        inc_rt_prio_smp(rt_rq, prio, prev_prio);
 697}
 698
 699static void
 700dec_rt_prio(struct rt_rq *rt_rq, int prio)
 701{
 702        int prev_prio = rt_rq->highest_prio.curr;
 703
 704        if (rt_rq->rt_nr_running) {
 705
 706                WARN_ON(prio < prev_prio);
 707
 708                /*
 709                 * This may have been our highest task, and therefore
 710                 * we may have some recomputation to do
 711                 */
 712                if (prio == prev_prio) {
 713                        struct rt_prio_array *array = &rt_rq->active;
 714
 715                        rt_rq->highest_prio.curr =
 716                                sched_find_first_bit(array->bitmap);
 717                }
 718
 719        } else
 720                rt_rq->highest_prio.curr = MAX_RT_PRIO;
 721
 722        dec_rt_prio_smp(rt_rq, prio, prev_prio);
 723}
 724
 725#else
 726
 727static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
 728static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
 729
 730#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 731
 732#ifdef CONFIG_RT_GROUP_SCHED
 733
 734static void
 735inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 736{
 737        if (rt_se_boosted(rt_se))
 738                rt_rq->rt_nr_boosted++;
 739
 740        if (rt_rq->tg)
 741                start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
 742}
 743
 744static void
 745dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 746{
 747        if (rt_se_boosted(rt_se))
 748                rt_rq->rt_nr_boosted--;
 749
 750        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
 751}
 752
 753#else /* CONFIG_RT_GROUP_SCHED */
 754
 755static void
 756inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 757{
 758        start_rt_bandwidth(&def_rt_bandwidth);
 759}
 760
 761static inline
 762void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 763
 764#endif /* CONFIG_RT_GROUP_SCHED */
 765
 766static inline
 767void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 768{
 769        int prio = rt_se_prio(rt_se);
 770
 771        WARN_ON(!rt_prio(prio));
 772        rt_rq->rt_nr_running++;
 773
 774        inc_rt_prio(rt_rq, prio);
 775        inc_rt_migration(rt_se, rt_rq);
 776        inc_rt_group(rt_se, rt_rq);
 777}
 778
 779static inline
 780void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 781{
 782        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 783        WARN_ON(!rt_rq->rt_nr_running);
 784        rt_rq->rt_nr_running--;
 785
 786        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
 787        dec_rt_migration(rt_se, rt_rq);
 788        dec_rt_group(rt_se, rt_rq);
 789}
 790
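     /*
      * The active array is an O(1) priority queue: a bitmap of non-empty
      * priorities plus one FIFO list per priority level.
      */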
 791static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
 792{
 793        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 794        struct rt_prio_array *array = &rt_rq->active;
 795        struct rt_rq *group_rq = group_rt_rq(rt_se);
 796        struct list_head *queue = array->queue + rt_se_prio(rt_se);
 797
 798        /*
 799         * Don't enqueue the group if it's throttled, or when it is empty.
 800         * The latter is a consequence of the former when a child group
 801         * gets throttled and the current group doesn't have any other
 802         * active members.
 803         */
 804        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 805                return;
 806
 807        list_add_tail(&rt_se->run_list, queue);
 808        __set_bit(rt_se_prio(rt_se), array->bitmap);
 809
 810        inc_rt_tasks(rt_se, rt_rq);
 811}
 812
 813static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 814{
 815        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 816        struct rt_prio_array *array = &rt_rq->active;
 817
 818        list_del_init(&rt_se->run_list);
 819        if (list_empty(array->queue + rt_se_prio(rt_se)))
 820                __clear_bit(rt_se_prio(rt_se), array->bitmap);
 821
 822        dec_rt_tasks(rt_se, rt_rq);
 823}
 824
 825/*
 826 * Because the prio of an upper entry depends on the lower
 827 * entries, we must remove entries top-down.
 828 */
 829static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 830{
 831        struct sched_rt_entity *back = NULL;
 832
 833        for_each_sched_rt_entity(rt_se) {
 834                rt_se->back = back;
 835                back = rt_se;
 836        }
 837
 838        for (rt_se = back; rt_se; rt_se = rt_se->back) {
 839                if (on_rt_rq(rt_se))
 840                        __dequeue_rt_entity(rt_se);
 841        }
 842}
 843
 844static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 845{
 846        dequeue_rt_stack(rt_se);
 847        for_each_sched_rt_entity(rt_se)
 848                __enqueue_rt_entity(rt_se);
 849}
 850
 851static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 852{
 853        dequeue_rt_stack(rt_se);
 854
 855        for_each_sched_rt_entity(rt_se) {
 856                struct rt_rq *rt_rq = group_rt_rq(rt_se);
 857
 858                if (rt_rq && rt_rq->rt_nr_running)
 859                        __enqueue_rt_entity(rt_se);
 860        }
 861}
 862
 863/*
 864 * Adding/removing a task to/from a priority array:
 865 */
 866static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 867{
 868        struct sched_rt_entity *rt_se = &p->rt;
 869
 870        if (wakeup)
 871                rt_se->timeout = 0;
 872
 873        enqueue_rt_entity(rt_se);
 874
 875        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 876                enqueue_pushable_task(rq, p);
 877
 878        inc_cpu_load(rq, p->se.load.weight);
 879}
 880
 881static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 882{
 883        struct sched_rt_entity *rt_se = &p->rt;
 884
 885        update_curr_rt(rq);
 886        dequeue_rt_entity(rt_se);
 887
 888        dequeue_pushable_task(rq, p);
 889
 890        dec_cpu_load(rq, p->se.load.weight);
 891}
 892
 893/*
 894 * Requeue the task at the head or tail of its run list without the
 895 * overhead of a dequeue followed by an enqueue.
 896 */
 897static void
 898requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 899{
 900        if (on_rt_rq(rt_se)) {
 901                struct rt_prio_array *array = &rt_rq->active;
 902                struct list_head *queue = array->queue + rt_se_prio(rt_se);
 903
 904                if (head)
 905                        list_move(&rt_se->run_list, queue);
 906                else
 907                        list_move_tail(&rt_se->run_list, queue);
 908        }
 909}
 910
 911static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 912{
 913        struct sched_rt_entity *rt_se = &p->rt;
 914        struct rt_rq *rt_rq;
 915
 916        for_each_sched_rt_entity(rt_se) {
 917                rt_rq = rt_rq_of_se(rt_se);
 918                requeue_rt_entity(rt_rq, rt_se, head);
 919        }
 920}
 921
 922static void yield_task_rt(struct rq *rq)
 923{
 924        requeue_task_rt(rq, rq->curr, 0);
 925}
 926
 927#ifdef CONFIG_SMP
 928static int find_lowest_rq(struct task_struct *task);
 929
 930static int select_task_rq_rt(struct task_struct *p, int sync)
 931{
 932        struct rq *rq = task_rq(p);
 933
 934        /*
 935         * If the current task is an RT task, then
 936         * try to see if we can wake this RT task up on another
 937         * runqueue. Otherwise simply start this RT task
 938         * on its current runqueue.
 939         *
 940         * We want to avoid overloading runqueues, even if
 941         * the RT task is of higher priority than the current RT task.
 942         * RT tasks behave differently than other tasks: if
 943         * one gets preempted, we try to push it off to another queue.
 944         * So trying to keep a preempting RT task on the same
 945         * cache-hot CPU will force the running RT task onto
 946         * a cold CPU. We would waste all the cache for the lower-prio
 947         * RT task in the hope of saving some for an RT task
 948         * that is just being woken and will probably have a
 949         * cold cache anyway.
 950         */
 951        if (unlikely(rt_task(rq->curr)) &&
 952            (p->rt.nr_cpus_allowed > 1)) {
 953                int cpu = find_lowest_rq(p);
 954
 955                return (cpu == -1) ? task_cpu(p) : cpu;
 956        }
 957
 958        /*
 959         * Otherwise, just let it ride on the affined RQ and the
 960         * post-schedule router will push the preempted task away
 961         */
 962        return task_cpu(p);
 963}
 964
 965static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 966{
 967        if (rq->curr->rt.nr_cpus_allowed == 1)
 968                return;
 969
 970        if (p->rt.nr_cpus_allowed != 1
 971            && cpupri_find(&rq->rd->cpupri, p, NULL))
 972                return;
 973
 974        if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
 975                return;
 976
 977        /*
 978         * There appear to be other cpus that can accept
 979         * current and none to run 'p', so let's reschedule
 980         * to try and push current away:
 981         */
 982        requeue_task_rt(rq, p, 1);
 983        resched_task(rq->curr);
 984}
 985
 986#endif /* CONFIG_SMP */
 987
 988/*
 989 * Preempt the current task with a newly woken task if needed:
 990 */
 991static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
 992{
 993        if (p->prio < rq->curr->prio) {
 994                resched_task(rq->curr);
 995                return;
 996        }
 997
 998#ifdef CONFIG_SMP
 999        /*
1000         * If:
1001         *
1002         * - the newly woken task is of equal priority to the current task
1003         * - the newly woken task is non-migratable while current is migratable
1004         * - current will be preempted on the next reschedule
1005         *
1006         * we should check to see if current can readily move to a different
1007         * cpu.  If so, we will reschedule to allow the push logic to try
1008         * to move current somewhere else, making room for our non-migratable
1009         * task.
1010         */
1011        if (p->prio == rq->curr->prio && !need_resched())
1012                check_preempt_equal_prio(rq, p);
1013#endif
1014}
1015
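     /*
      * Pick the first entity on the highest-priority non-empty queue; the
      * bitmap search makes this O(1).
      */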
1016static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1017                                                   struct rt_rq *rt_rq)
1018{
1019        struct rt_prio_array *array = &rt_rq->active;
1020        struct sched_rt_entity *next = NULL;
1021        struct list_head *queue;
1022        int idx;
1023
1024        idx = sched_find_first_bit(array->bitmap);
1025        BUG_ON(idx >= MAX_RT_PRIO);
1026
1027        queue = array->queue + idx;
1028        next = list_entry(queue->next, struct sched_rt_entity, run_list);
1029
1030        return next;
1031}
1032
1033static struct task_struct *_pick_next_task_rt(struct rq *rq)
1034{
1035        struct sched_rt_entity *rt_se;
1036        struct task_struct *p;
1037        struct rt_rq *rt_rq;
1038
1039        rt_rq = &rq->rt;
1040
1041        if (unlikely(!rt_rq->rt_nr_running))
1042                return NULL;
1043
1044        if (rt_rq_throttled(rt_rq))
1045                return NULL;
1046
1047        do {
1048                rt_se = pick_next_rt_entity(rq, rt_rq);
1049                BUG_ON(!rt_se);
1050                rt_rq = group_rt_rq(rt_se);
1051        } while (rt_rq);
1052
1053        p = rt_task_of(rt_se);
1054        p->se.exec_start = rq->clock;
1055
1056        return p;
1057}
1058
1059static struct task_struct *pick_next_task_rt(struct rq *rq)
1060{
1061        struct task_struct *p = _pick_next_task_rt(rq);
1062
1063        /* The running task is never eligible for pushing */
1064        if (p)
1065                dequeue_pushable_task(rq, p);
1066
1067        return p;
1068}
1069
1070static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1071{
1072        update_curr_rt(rq);
1073        p->se.exec_start = 0;
1074
1075        /*
1076         * The previous task needs to be made eligible for pushing
1077         * if it is still active
1078         */
1079        if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
1080                enqueue_pushable_task(rq, p);
1081}
1082
1083#ifdef CONFIG_SMP
1084
1085/* Only try algorithms three times */
1086#define RT_MAX_TRIES 3
1087
1088static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1089
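     /*
      * A task is a candidate for migration if it is not currently running,
      * is allowed on the target cpu (cpu < 0 means "any"), and can run on
      * more than one cpu.
      */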
1090static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1091{
1092        if (!task_running(rq, p) &&
1093            (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1094            (p->rt.nr_cpus_allowed > 1))
1095                return 1;
1096        return 0;
1097}
1098
 1099/* Return the highest-prio RT task that is not running and can run on 'cpu', NULL otherwise */
1100static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1101{
1102        struct task_struct *next = NULL;
1103        struct sched_rt_entity *rt_se;
1104        struct rt_prio_array *array;
1105        struct rt_rq *rt_rq;
1106        int idx;
1107
1108        for_each_leaf_rt_rq(rt_rq, rq) {
1109                array = &rt_rq->active;
1110                idx = sched_find_first_bit(array->bitmap);
1111 next_idx:
1112                if (idx >= MAX_RT_PRIO)
1113                        continue;
1114                if (next && next->prio < idx)
1115                        continue;
1116                list_for_each_entry(rt_se, array->queue + idx, run_list) {
1117                        struct task_struct *p = rt_task_of(rt_se);
1118                        if (pick_rt_task(rq, p, cpu)) {
1119                                next = p;
1120                                break;
1121                        }
1122                }
1123                if (!next) {
1124                        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1125                        goto next_idx;
1126                }
1127        }
1128
1129        return next;
1130}
1131
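     /* Per-cpu scratch mask for find_lowest_rq(), allocated in init_sched_rt_class(). */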
1132static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1133
1134static inline int pick_optimal_cpu(int this_cpu,
1135                                   const struct cpumask *mask)
1136{
1137        int first;
1138
1139        /* "this_cpu" is cheaper to preempt than a remote processor */
1140        if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1141                return this_cpu;
1142
1143        first = cpumask_first(mask);
1144        if (first < nr_cpu_ids)
1145                return first;
1146
1147        return -1;
1148}
1149
1150static int find_lowest_rq(struct task_struct *task)
1151{
1152        struct sched_domain *sd;
1153        struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1154        int this_cpu = smp_processor_id();
1155        int cpu      = task_cpu(task);
1156        cpumask_var_t domain_mask;
1157
1158        if (task->rt.nr_cpus_allowed == 1)
1159                return -1; /* No other targets possible */
1160
1161        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1162                return -1; /* No targets found */
1163
1164        /*
1165         * Only consider CPUs that are usable for migration.
1166         * I guess we might want to change cpupri_find() to ignore those
1167         * in the first place.
1168         */
1169        cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1170
1171        /*
1172         * At this point we have built a mask of cpus representing the
1173         * lowest priority tasks in the system.  Now we want to elect
1174         * the best one based on our affinity and topology.
1175         *
1176         * We prioritize the last cpu that the task executed on since
1177         * it is most likely cache-hot in that location.
1178         */
1179        if (cpumask_test_cpu(cpu, lowest_mask))
1180                return cpu;
1181
1182        /*
1183         * Otherwise, we consult the sched_domains span maps to figure
1184         * out which cpu is logically closest to our hot cache data.
1185         */
1186        if (this_cpu == cpu)
1187                this_cpu = -1; /* Skip this_cpu opt if the same */
1188
1189        if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1190                for_each_domain(cpu, sd) {
1191                        if (sd->flags & SD_WAKE_AFFINE) {
1192                                int best_cpu;
1193
1194                                cpumask_and(domain_mask,
1195                                            sched_domain_span(sd),
1196                                            lowest_mask);
1197
1198                                best_cpu = pick_optimal_cpu(this_cpu,
1199                                                            domain_mask);
1200
1201                                if (best_cpu != -1) {
1202                                        free_cpumask_var(domain_mask);
1203                                        return best_cpu;
1204                                }
1205                        }
1206                }
1207                free_cpumask_var(domain_mask);
1208        }
1209
1210        /*
1211         * And finally, if there were no matches within the domains
1212         * just give the caller *something* to work with from the compatible
1213         * locations.
1214         */
1215        return pick_optimal_cpu(this_cpu, lowest_mask);
1216}
1217
1218/* Will lock the rq it finds */
1219static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1220{
1221        struct rq *lowest_rq = NULL;
1222        int tries;
1223        int cpu;
1224
1225        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1226                cpu = find_lowest_rq(task);
1227
1228                if ((cpu == -1) || (cpu == rq->cpu))
1229                        break;
1230
1231                lowest_rq = cpu_rq(cpu);
1232
1233                /* if the prio of this runqueue changed, try again */
1234                if (double_lock_balance(rq, lowest_rq)) {
1235                        /*
1236                         * We had to unlock the run queue. In
 1237                         * the meantime, the task could have
1238                         * migrated already or had its affinity changed.
1239                         * Also make sure that it wasn't scheduled on its rq.
1240                         */
1241                        if (unlikely(task_rq(task) != rq ||
1242                                     !cpumask_test_cpu(lowest_rq->cpu,
1243                                                       &task->cpus_allowed) ||
1244                                     task_running(rq, task) ||
1245                                     !task->se.on_rq)) {
1246
1247                                spin_unlock(&lowest_rq->lock);
1248                                lowest_rq = NULL;
1249                                break;
1250                        }
1251                }
1252
1253                /* If this rq is still suitable use it. */
1254                if (lowest_rq->rt.highest_prio.curr > task->prio)
1255                        break;
1256
1257                /* try again */
1258                double_unlock_balance(rq, lowest_rq);
1259                lowest_rq = NULL;
1260        }
1261
1262        return lowest_rq;
1263}
1264
1265static inline int has_pushable_tasks(struct rq *rq)
1266{
1267        return !plist_head_empty(&rq->rt.pushable_tasks);
1268}
1269
1270static struct task_struct *pick_next_pushable_task(struct rq *rq)
1271{
1272        struct task_struct *p;
1273
1274        if (!has_pushable_tasks(rq))
1275                return NULL;
1276
1277        p = plist_first_entry(&rq->rt.pushable_tasks,
1278                              struct task_struct, pushable_tasks);
1279
1280        BUG_ON(rq->cpu != task_cpu(p));
1281        BUG_ON(task_current(rq, p));
1282        BUG_ON(p->rt.nr_cpus_allowed <= 1);
1283
1284        BUG_ON(!p->se.on_rq);
1285        BUG_ON(!rt_task(p));
1286
1287        return p;
1288}
1289
1290/*
 1291 * If the current CPU has more than one RT task, see if the non-
1292 * running task can migrate over to a CPU that is running a task
1293 * of lesser priority.
1294 */
1295static int push_rt_task(struct rq *rq)
1296{
1297        struct task_struct *next_task;
1298        struct rq *lowest_rq;
1299
1300        if (!rq->rt.overloaded)
1301                return 0;
1302
1303        next_task = pick_next_pushable_task(rq);
1304        if (!next_task)
1305                return 0;
1306
1307 retry:
1308        if (unlikely(next_task == rq->curr)) {
1309                WARN_ON(1);
1310                return 0;
1311        }
1312
1313        /*
 1314         * It's possible that the next_task slipped in at a
 1315         * higher priority than current. If that's the case,
1316         * just reschedule current.
1317         */
1318        if (unlikely(next_task->prio < rq->curr->prio)) {
1319                resched_task(rq->curr);
1320                return 0;
1321        }
1322
1323        /* We might release rq lock */
1324        get_task_struct(next_task);
1325
1326        /* find_lock_lowest_rq locks the rq if found */
1327        lowest_rq = find_lock_lowest_rq(next_task, rq);
1328        if (!lowest_rq) {
1329                struct task_struct *task;
1330                /*
 1331                 * find_lock_lowest_rq releases rq->lock
1332                 * so it is possible that next_task has migrated.
1333                 *
1334                 * We need to make sure that the task is still on the same
1335                 * run-queue and is also still the next task eligible for
1336                 * pushing.
1337                 */
1338                task = pick_next_pushable_task(rq);
1339                if (task_cpu(next_task) == rq->cpu && task == next_task) {
1340                        /*
 1341                         * If we get here, the task hasn't moved at all, but
1342                         * it has failed to push.  We will not try again,
1343                         * since the other cpus will pull from us when they
1344                         * are ready.
1345                         */
1346                        dequeue_pushable_task(rq, next_task);
1347                        goto out;
1348                }
1349
1350                if (!task)
1351                        /* No more tasks, just exit */
1352                        goto out;
1353
1354                /*
1355                 * Something has shifted, try again.
1356                 */
1357                put_task_struct(next_task);
1358                next_task = task;
1359                goto retry;
1360        }
1361
1362        deactivate_task(rq, next_task, 0);
1363        set_task_cpu(next_task, lowest_rq->cpu);
1364        activate_task(lowest_rq, next_task, 0);
1365
1366        resched_task(lowest_rq->curr);
1367
1368        double_unlock_balance(rq, lowest_rq);
1369
1370out:
1371        put_task_struct(next_task);
1372
1373        return 1;
1374}
1375
1376static void push_rt_tasks(struct rq *rq)
1377{
 1378        /* push_rt_task() returns true if it moved an RT task */
1379        while (push_rt_task(rq))
1380                ;
1381}
1382
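     /*
      * Scan the runqueues flagged in the root domain's rto_mask and pull
      * over any RT task that would preempt what this rq is about to run,
      * i.e. one with higher priority than our own highest queued RT task.
      */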
1383static int pull_rt_task(struct rq *this_rq)
1384{
1385        int this_cpu = this_rq->cpu, ret = 0, cpu;
1386        struct task_struct *p;
1387        struct rq *src_rq;
1388
1389        if (likely(!rt_overloaded(this_rq)))
1390                return 0;
1391
1392        for_each_cpu(cpu, this_rq->rd->rto_mask) {
1393                if (this_cpu == cpu)
1394                        continue;
1395
1396                src_rq = cpu_rq(cpu);
1397
1398                /*
1399                 * Don't bother taking the src_rq->lock if the next highest
1400                 * task is known to be lower-priority than our current task.
1401                 * This may look racy, but if this value is about to go
1402                 * logically higher, the src_rq will push this task away.
 1403                 * And if it's going logically lower, we do not care.
1404                 */
1405                if (src_rq->rt.highest_prio.next >=
1406                    this_rq->rt.highest_prio.curr)
1407                        continue;
1408
1409                /*
1410                 * We can potentially drop this_rq's lock in
1411                 * double_lock_balance, and another CPU could
1412                 * alter this_rq
1413                 */
1414                double_lock_balance(this_rq, src_rq);
1415
1416                /*
1417                 * Are there still pullable RT tasks?
1418                 */
1419                if (src_rq->rt.rt_nr_running <= 1)
1420                        goto skip;
1421
1422                p = pick_next_highest_task_rt(src_rq, this_cpu);
1423
1424                /*
1425                 * Do we have an RT task that preempts
1426                 * the to-be-scheduled task?
1427                 */
1428                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1429                        WARN_ON(p == src_rq->curr);
1430                        WARN_ON(!p->se.on_rq);
1431
1432                        /*
1433                         * There's a chance that p is higher in priority
1434                         * than what's currently running on its cpu.
 1435                         * This is just that p is waking up and hasn't
1436                         * had a chance to schedule. We only pull
1437                         * p if it is lower in priority than the
1438                         * current task on the run queue
1439                         */
1440                        if (p->prio < src_rq->curr->prio)
1441                                goto skip;
1442
1443                        ret = 1;
1444
1445                        deactivate_task(src_rq, p, 0);
1446                        set_task_cpu(p, this_cpu);
1447                        activate_task(this_rq, p, 0);
1448                        /*
1449                         * We continue with the search, just in
1450                         * case there's an even higher prio task
 1451                         * in another runqueue. (low likelihood
1452                         * but possible)
1453                         */
1454                }
1455 skip:
1456                double_unlock_balance(this_rq, src_rq);
1457        }
1458
1459        return ret;
1460}
1461
1462static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1463{
1464        /* Try to pull RT tasks here if we lower this rq's prio */
1465        if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
1466                pull_rt_task(rq);
1467}
1468
1469/*
1470 * assumes rq->lock is held
1471 */
1472static int needs_post_schedule_rt(struct rq *rq)
1473{
1474        return has_pushable_tasks(rq);
1475}
1476
1477static void post_schedule_rt(struct rq *rq)
1478{
1479        /*
1480         * This is only called if needs_post_schedule_rt() indicates that
1481         * we need to push tasks away
1482         */
1483        spin_lock_irq(&rq->lock);
1484        push_rt_tasks(rq);
1485        spin_unlock_irq(&rq->lock);
1486}
1487
1488/*
1489 * If we are not running and we are not going to reschedule soon, we should
1490 * try to push tasks away now
1491 */
1492static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1493{
1494        if (!task_running(rq, p) &&
1495            !test_tsk_need_resched(rq->curr) &&
1496            has_pushable_tasks(rq) &&
1497            p->rt.nr_cpus_allowed > 1)
1498                push_rt_tasks(rq);
1499}
1500
1501static unsigned long
1502load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1503                unsigned long max_load_move,
1504                struct sched_domain *sd, enum cpu_idle_type idle,
1505                int *all_pinned, int *this_best_prio)
1506{
1507        /* don't touch RT tasks */
1508        return 0;
1509}
1510
1511static int
1512move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1513                 struct sched_domain *sd, enum cpu_idle_type idle)
1514{
1515        /* don't touch RT tasks */
1516        return 0;
1517}
1518
1519static void set_cpus_allowed_rt(struct task_struct *p,
1520                                const struct cpumask *new_mask)
1521{
1522        int weight = cpumask_weight(new_mask);
1523
1524        BUG_ON(!rt_task(p));
1525
1526        /*
1527         * Update the migration status of the RQ if we have an RT task
1528         * which is running AND changing its weight value.
1529         */
1530        if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1531                struct rq *rq = task_rq(p);
1532
1533                if (!task_current(rq, p)) {
1534                        /*
1535                         * Make sure we dequeue this task from the pushable list
1536                         * before going further.  It will either remain off of
1537                         * the list because we are no longer pushable, or it
1538                         * will be requeued.
1539                         */
1540                        if (p->rt.nr_cpus_allowed > 1)
1541                                dequeue_pushable_task(rq, p);
1542
1543                        /*
1544                         * Requeue if our weight is changing and still > 1
1545                         */
1546                        if (weight > 1)
1547                                enqueue_pushable_task(rq, p);
1548
1549                }
1550
1551                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1552                        rq->rt.rt_nr_migratory++;
1553                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1554                        BUG_ON(!rq->rt.rt_nr_migratory);
1555                        rq->rt.rt_nr_migratory--;
1556                }
1557
1558                update_rt_migration(&rq->rt);
1559        }
1560
1561        cpumask_copy(&p->cpus_allowed, new_mask);
1562        p->rt.nr_cpus_allowed = weight;
1563}
1564
1565/* Assumes rq->lock is held */
1566static void rq_online_rt(struct rq *rq)
1567{
1568        if (rq->rt.overloaded)
1569                rt_set_overload(rq);
1570
1571        __enable_runtime(rq);
1572
1573        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1574}
1575
1576/* Assumes rq->lock is held */
1577static void rq_offline_rt(struct rq *rq)
1578{
1579        if (rq->rt.overloaded)
1580                rt_clear_overload(rq);
1581
1582        __disable_runtime(rq);
1583
1584        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1585}
1586
1587/*
 1588 * When switching away from the rt queue, we put ourselves in a position
 1589 * where we might want to pull RT tasks from other runqueues.
1590 */
1591static void switched_from_rt(struct rq *rq, struct task_struct *p,
1592                           int running)
1593{
1594        /*
1595         * If there are other RT tasks then we will reschedule
1596         * and the scheduling of the other RT tasks will handle
1597         * the balancing. But if we are the last RT task
1598         * we may need to handle the pulling of RT tasks
1599         * now.
1600         */
1601        if (!rq->rt.rt_nr_running)
1602                pull_rt_task(rq);
1603}
1604
1605static inline void init_sched_rt_class(void)
1606{
1607        unsigned int i;
1608
1609        for_each_possible_cpu(i)
1610                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1611                                        GFP_KERNEL, cpu_to_node(i));
1612}
1613#endif /* CONFIG_SMP */
1614
1615/*
1616 * When switching a task to RT, we may overload the runqueue
1617 * with RT tasks. In this case we try to push them off to
1618 * other runqueues.
1619 */
1620static void switched_to_rt(struct rq *rq, struct task_struct *p,
1621                           int running)
1622{
1623        int check_resched = 1;
1624
1625        /*
1626         * If we are already running, then there's nothing
1627         * that needs to be done. But if we are not running
1628         * we may need to preempt the current running task.
1629         * If that current running task is also an RT task
1630         * then see if we can move to another run queue.
1631         */
1632        if (!running) {
1633#ifdef CONFIG_SMP
1634                if (rq->rt.overloaded && push_rt_task(rq) &&
1635                    /* Don't resched if we changed runqueues */
1636                    rq != task_rq(p))
1637                        check_resched = 0;
1638#endif /* CONFIG_SMP */
1639                if (check_resched && p->prio < rq->curr->prio)
1640                        resched_task(rq->curr);
1641        }
1642}
1643
1644/*
1645 * Priority of the task has changed. This may cause
1646 * us to initiate a push or pull.
1647 */
1648static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1649                            int oldprio, int running)
1650{
1651        if (running) {
1652#ifdef CONFIG_SMP
1653                /*
1654                 * If our priority decreases while running, we
1655                 * may need to pull tasks to this runqueue.
1656                 */
1657                if (oldprio < p->prio)
1658                        pull_rt_task(rq);
1659                /*
1660                 * If there's a higher priority task waiting to run
1661                 * then reschedule. Note, the above pull_rt_task
1662                 * can release the rq lock and p could migrate.
1663                 * Only reschedule if p is still on the same runqueue.
1664                 */
1665                if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1666                        resched_task(p);
1667#else
1668                /* For UP simply resched on drop of prio */
1669                if (oldprio < p->prio)
1670                        resched_task(p);
1671#endif /* CONFIG_SMP */
1672        } else {
1673                /*
1674                 * This task is not running, but if it is
1675                 * greater than the current running task
1676                 * then reschedule.
1677                 */
1678                if (p->prio < rq->curr->prio)
1679                        resched_task(rq->curr);
1680        }
1681}
1682
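     /*
      * Enforce RLIMIT_RTTIME: rt.timeout counts the ticks this task has run
      * since it last woke up (it is reset in enqueue_task_rt()); once the
      * soft limit is exceeded, prime the posix cpu-timer expiry so that
      * path can signal the task.
      */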
1683static void watchdog(struct rq *rq, struct task_struct *p)
1684{
1685        unsigned long soft, hard;
1686
1687        if (!p->signal)
1688                return;
1689
1690        soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1691        hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1692
1693        if (soft != RLIM_INFINITY) {
1694                unsigned long next;
1695
1696                p->rt.timeout++;
1697                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1698                if (p->rt.timeout > next)
1699                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1700        }
1701}
1702
1703static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1704{
1705        update_curr_rt(rq);
1706
1707        watchdog(rq, p);
1708
1709        /*
1710         * RR tasks need a special form of timeslice management.
1711         * FIFO tasks have no timeslices.
1712         */
1713        if (p->policy != SCHED_RR)
1714                return;
1715
1716        if (--p->rt.time_slice)
1717                return;
1718
1719        p->rt.time_slice = DEF_TIMESLICE;
1720
1721        /*
 1722         * Requeue to the end of the queue if we are not the only element
1723         * on the queue:
1724         */
1725        if (p->rt.run_list.prev != p->rt.run_list.next) {
1726                requeue_task_rt(rq, p, 0);
1727                set_tsk_need_resched(p);
1728        }
1729}
1730
1731static void set_curr_task_rt(struct rq *rq)
1732{
1733        struct task_struct *p = rq->curr;
1734
1735        p->se.exec_start = rq->clock;
1736
1737        /* The running task is never eligible for pushing */
1738        dequeue_pushable_task(rq, p);
1739}
1740
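     /*
      * The RT scheduling class.  Classes are chained via ->next; RT sits
      * above the fair class, so runnable RT tasks are picked first.
      */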
1741static const struct sched_class rt_sched_class = {
1742        .next                   = &fair_sched_class,
1743        .enqueue_task           = enqueue_task_rt,
1744        .dequeue_task           = dequeue_task_rt,
1745        .yield_task             = yield_task_rt,
1746
1747        .check_preempt_curr     = check_preempt_curr_rt,
1748
1749        .pick_next_task         = pick_next_task_rt,
1750        .put_prev_task          = put_prev_task_rt,
1751
1752#ifdef CONFIG_SMP
1753        .select_task_rq         = select_task_rq_rt,
1754
1755        .load_balance           = load_balance_rt,
1756        .move_one_task          = move_one_task_rt,
1757        .set_cpus_allowed       = set_cpus_allowed_rt,
1758        .rq_online              = rq_online_rt,
1759        .rq_offline             = rq_offline_rt,
1760        .pre_schedule           = pre_schedule_rt,
1761        .needs_post_schedule    = needs_post_schedule_rt,
1762        .post_schedule          = post_schedule_rt,
1763        .task_wake_up           = task_wake_up_rt,
1764        .switched_from          = switched_from_rt,
1765#endif
1766
1767        .set_curr_task          = set_curr_task_rt,
1768        .task_tick              = task_tick_rt,
1769
1770        .prio_changed           = prio_changed_rt,
1771        .switched_to            = switched_to_rt,
1772};
1773
1774#ifdef CONFIG_SCHED_DEBUG
1775extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1776
1777static void print_rt_stats(struct seq_file *m, int cpu)
1778{
1779        struct rt_rq *rt_rq;
1780
1781        rcu_read_lock();
1782        for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1783                print_rt_rq(m, cpu, rt_rq);
1784        rcu_read_unlock();
1785}
1786#endif /* CONFIG_SCHED_DEBUG */
1787
1788