linux/kernel/sched_rt.c
   1/*
   2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   3 * policies)
   4 */
   5
   6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
   7{
   8        return container_of(rt_se, struct task_struct, rt);
   9}
  10
  11#ifdef CONFIG_RT_GROUP_SCHED
  12
  13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  14{
  15        return rt_rq->rq;
  16}
  17
  18static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  19{
  20        return rt_se->rt_rq;
  21}
  22
  23#else /* CONFIG_RT_GROUP_SCHED */
  24
  25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  26{
  27        return container_of(rt_rq, struct rq, rt);
  28}
  29
  30static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  31{
  32        struct task_struct *p = rt_task_of(rt_se);
  33        struct rq *rq = task_rq(p);
  34
  35        return &rq->rt;
  36}
  37
  38#endif /* CONFIG_RT_GROUP_SCHED */
  39
  40#ifdef CONFIG_SMP
  41
  42static inline int rt_overloaded(struct rq *rq)
  43{
  44        return atomic_read(&rq->rd->rto_count);
  45}
  46
  47static inline void rt_set_overload(struct rq *rq)
  48{
  49        if (!rq->online)
  50                return;
  51
  52        cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
  53        /*
  54         * Make sure the mask is visible before we set
  55         * the overload count. That is checked to determine
  56         * if we should look at the mask. It would be a shame
  57         * if we looked at the mask, but the mask was not
  58         * updated yet.
  59         */
  60        wmb();
  61        atomic_inc(&rq->rd->rto_count);
  62}
  63
  64static inline void rt_clear_overload(struct rq *rq)
  65{
  66        if (!rq->online)
  67                return;
  68
  69        /* the order here really doesn't matter */
  70        atomic_dec(&rq->rd->rto_count);
  71        cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
  72}
  73
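     /*
      * A runqueue counts as RT-overloaded when it has more than one
      * runnable RT task and at least one of them is allowed to migrate.
      * The root domain's rto_mask/rto_count advertise this so other CPUs
      * know where pull_rt_task() may find work.
      */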
  74static void update_rt_migration(struct rt_rq *rt_rq)
  75{
  76        if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
  77                if (!rt_rq->overloaded) {
  78                        rt_set_overload(rq_of_rt_rq(rt_rq));
  79                        rt_rq->overloaded = 1;
  80                }
  81        } else if (rt_rq->overloaded) {
  82                rt_clear_overload(rq_of_rt_rq(rt_rq));
  83                rt_rq->overloaded = 0;
  84        }
  85}
  86
  87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  88{
  89        if (rt_se->nr_cpus_allowed > 1)
  90                rt_rq->rt_nr_migratory++;
  91
  92        update_rt_migration(rt_rq);
  93}
  94
  95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  96{
  97        if (rt_se->nr_cpus_allowed > 1)
  98                rt_rq->rt_nr_migratory--;
  99
 100        update_rt_migration(rt_rq);
 101}
 102
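     /*
      * The pushable_tasks plist keeps this runqueue's migratable RT tasks
      * ordered by priority.  Deleting and re-adding the node, freshly
      * initialised with the task's current prio, is how a task is
      * (re)positioned in the list.
      */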
 103static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 104{
 105        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 106        plist_node_init(&p->pushable_tasks, p->prio);
 107        plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 108}
 109
 110static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 111{
 112        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 113}
 114
 115#else
 116
 117static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 118{
 119}
 120
 121static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 122{
 123}
 124
 125static inline
 126void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 127{
 128}
 129
 130static inline
 131void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 132{
 133}
 134
 135#endif /* CONFIG_SMP */
 136
 137static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 138{
 139        return !list_empty(&rt_se->run_list);
 140}
 141
 142#ifdef CONFIG_RT_GROUP_SCHED
 143
 144static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 145{
 146        if (!rt_rq->tg)
 147                return RUNTIME_INF;
 148
 149        return rt_rq->rt_runtime;
 150}
 151
 152static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 153{
 154        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 155}
 156
 157#define for_each_leaf_rt_rq(rt_rq, rq) \
 158        list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 159
 160#define for_each_sched_rt_entity(rt_se) \
 161        for (; rt_se; rt_se = rt_se->parent)
 162
 163static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 164{
 165        return rt_se->my_q;
 166}
 167
 168static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
 169static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 170
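     /*
      * Throttling helpers: sched_rt_rq_enqueue() puts a group's entity
      * back on its parent runqueue (e.g. once its bandwidth has been
      * replenished), rescheduling if the group now holds the highest
      * priority; sched_rt_rq_dequeue() takes it off when the group is
      * throttled.
      */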
 171static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 172{
 173        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 174        struct sched_rt_entity *rt_se = rt_rq->rt_se;
 175
 176        if (rt_rq->rt_nr_running) {
 177                if (rt_se && !on_rt_rq(rt_se))
 178                        enqueue_rt_entity(rt_se);
 179                if (rt_rq->highest_prio.curr < curr->prio)
 180                        resched_task(curr);
 181        }
 182}
 183
 184static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 185{
 186        struct sched_rt_entity *rt_se = rt_rq->rt_se;
 187
 188        if (rt_se && on_rt_rq(rt_se))
 189                dequeue_rt_entity(rt_se);
 190}
 191
 192static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 193{
 194        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 195}
 196
 197static int rt_se_boosted(struct sched_rt_entity *rt_se)
 198{
 199        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 200        struct task_struct *p;
 201
 202        if (rt_rq)
 203                return !!rt_rq->rt_nr_boosted;
 204
 205        p = rt_task_of(rt_se);
 206        return p->prio != p->normal_prio;
 207}
 208
 209#ifdef CONFIG_SMP
 210static inline const struct cpumask *sched_rt_period_mask(void)
 211{
 212        return cpu_rq(smp_processor_id())->rd->span;
 213}
 214#else
 215static inline const struct cpumask *sched_rt_period_mask(void)
 216{
 217        return cpu_online_mask;
 218}
 219#endif
 220
 221static inline
 222struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 223{
 224        return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 225}
 226
 227static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 228{
 229        return &rt_rq->tg->rt_bandwidth;
 230}
 231
 232#else /* !CONFIG_RT_GROUP_SCHED */
 233
 234static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 235{
 236        return rt_rq->rt_runtime;
 237}
 238
 239static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 240{
 241        return ktime_to_ns(def_rt_bandwidth.rt_period);
 242}
 243
 244#define for_each_leaf_rt_rq(rt_rq, rq) \
 245        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 246
 247#define for_each_sched_rt_entity(rt_se) \
 248        for (; rt_se; rt_se = NULL)
 249
 250static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 251{
 252        return NULL;
 253}
 254
 255static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 256{
 257        if (rt_rq->rt_nr_running)
 258                resched_task(rq_of_rt_rq(rt_rq)->curr);
 259}
 260
 261static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 262{
 263}
 264
 265static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 266{
 267        return rt_rq->rt_throttled;
 268}
 269
 270static inline const struct cpumask *sched_rt_period_mask(void)
 271{
 272        return cpu_online_mask;
 273}
 274
 275static inline
 276struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 277{
 278        return &cpu_rq(cpu)->rt;
 279}
 280
 281static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 282{
 283        return &def_rt_bandwidth;
 284}
 285
 286#endif /* CONFIG_RT_GROUP_SCHED */
 287
 288#ifdef CONFIG_SMP
 289/*
  290 * We ran out of runtime; see if we can borrow some from our neighbours.
 291 */
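     /*
      * Each neighbour with spare time gives up at most 1/weight of that
      * spare time, where weight is the number of CPUs in the root domain,
      * and the borrower is never topped up beyond one full period.
      *
      * Illustrative numbers (not from the source): in a 4-CPU root domain,
      * a neighbour whose rt_runtime exceeds its rt_time by 400ms has 400ms
      * spare, so at most 400ms / 4 = 100ms is taken from it.
      */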
 292static int do_balance_runtime(struct rt_rq *rt_rq)
 293{
 294        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 295        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 296        int i, weight, more = 0;
 297        u64 rt_period;
 298
 299        weight = cpumask_weight(rd->span);
 300
 301        spin_lock(&rt_b->rt_runtime_lock);
 302        rt_period = ktime_to_ns(rt_b->rt_period);
 303        for_each_cpu(i, rd->span) {
 304                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 305                s64 diff;
 306
 307                if (iter == rt_rq)
 308                        continue;
 309
 310                spin_lock(&iter->rt_runtime_lock);
 311                /*
 312                 * Either all rqs have inf runtime and there's nothing to steal
 313                 * or __disable_runtime() below sets a specific rq to inf to
  314                 * indicate it has been disabled and disallow stealing.
 315                 */
 316                if (iter->rt_runtime == RUNTIME_INF)
 317                        goto next;
 318
 319                /*
 320                 * From runqueues with spare time, take 1/n part of their
 321                 * spare time, but no more than our period.
 322                 */
 323                diff = iter->rt_runtime - iter->rt_time;
 324                if (diff > 0) {
 325                        diff = div_u64((u64)diff, weight);
 326                        if (rt_rq->rt_runtime + diff > rt_period)
 327                                diff = rt_period - rt_rq->rt_runtime;
 328                        iter->rt_runtime -= diff;
 329                        rt_rq->rt_runtime += diff;
 330                        more = 1;
 331                        if (rt_rq->rt_runtime == rt_period) {
 332                                spin_unlock(&iter->rt_runtime_lock);
 333                                break;
 334                        }
 335                }
 336next:
 337                spin_unlock(&iter->rt_runtime_lock);
 338        }
 339        spin_unlock(&rt_b->rt_runtime_lock);
 340
 341        return more;
 342}
 343
 344/*
  345 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 346 */
 347static void __disable_runtime(struct rq *rq)
 348{
 349        struct root_domain *rd = rq->rd;
 350        struct rt_rq *rt_rq;
 351
 352        if (unlikely(!scheduler_running))
 353                return;
 354
 355        for_each_leaf_rt_rq(rt_rq, rq) {
 356                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 357                s64 want;
 358                int i;
 359
 360                spin_lock(&rt_b->rt_runtime_lock);
 361                spin_lock(&rt_rq->rt_runtime_lock);
 362                /*
 363                 * Either we're all inf and nobody needs to borrow, or we're
 364                 * already disabled and thus have nothing to do, or we have
 365                 * exactly the right amount of runtime to take out.
 366                 */
 367                if (rt_rq->rt_runtime == RUNTIME_INF ||
 368                                rt_rq->rt_runtime == rt_b->rt_runtime)
 369                        goto balanced;
 370                spin_unlock(&rt_rq->rt_runtime_lock);
 371
 372                /*
 373                 * Calculate the difference between what we started out with
  374                 * and what we currently have; that is the amount of runtime
  375                 * we lent out and now have to reclaim.
 376                 */
 377                want = rt_b->rt_runtime - rt_rq->rt_runtime;
 378
 379                /*
 380                 * Greedy reclaim, take back as much as we can.
 381                 */
 382                for_each_cpu(i, rd->span) {
 383                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 384                        s64 diff;
 385
 386                        /*
 387                         * Can't reclaim from ourselves or disabled runqueues.
 388                         */
 389                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 390                                continue;
 391
 392                        spin_lock(&iter->rt_runtime_lock);
 393                        if (want > 0) {
 394                                diff = min_t(s64, iter->rt_runtime, want);
 395                                iter->rt_runtime -= diff;
 396                                want -= diff;
 397                        } else {
 398                                iter->rt_runtime -= want;
 399                                want -= want;
 400                        }
 401                        spin_unlock(&iter->rt_runtime_lock);
 402
 403                        if (!want)
 404                                break;
 405                }
 406
 407                spin_lock(&rt_rq->rt_runtime_lock);
 408                /*
 409                 * We cannot be left wanting - that would mean some runtime
 410                 * leaked out of the system.
 411                 */
 412                BUG_ON(want);
 413balanced:
 414                /*
 415                 * Disable all the borrow logic by pretending we have inf
 416                 * runtime - in which case borrowing doesn't make sense.
 417                 */
 418                rt_rq->rt_runtime = RUNTIME_INF;
 419                spin_unlock(&rt_rq->rt_runtime_lock);
 420                spin_unlock(&rt_b->rt_runtime_lock);
 421        }
 422}
 423
 424static void disable_runtime(struct rq *rq)
 425{
 426        unsigned long flags;
 427
 428        spin_lock_irqsave(&rq->lock, flags);
 429        __disable_runtime(rq);
 430        spin_unlock_irqrestore(&rq->lock, flags);
 431}
 432
 433static void __enable_runtime(struct rq *rq)
 434{
 435        struct rt_rq *rt_rq;
 436
 437        if (unlikely(!scheduler_running))
 438                return;
 439
 440        /*
 441         * Reset each runqueue's bandwidth settings
 442         */
 443        for_each_leaf_rt_rq(rt_rq, rq) {
 444                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 445
 446                spin_lock(&rt_b->rt_runtime_lock);
 447                spin_lock(&rt_rq->rt_runtime_lock);
 448                rt_rq->rt_runtime = rt_b->rt_runtime;
 449                rt_rq->rt_time = 0;
 450                rt_rq->rt_throttled = 0;
 451                spin_unlock(&rt_rq->rt_runtime_lock);
 452                spin_unlock(&rt_b->rt_runtime_lock);
 453        }
 454}
 455
 456static void enable_runtime(struct rq *rq)
 457{
 458        unsigned long flags;
 459
 460        spin_lock_irqsave(&rq->lock, flags);
 461        __enable_runtime(rq);
 462        spin_unlock_irqrestore(&rq->lock, flags);
 463}
 464
 465static int balance_runtime(struct rt_rq *rt_rq)
 466{
 467        int more = 0;
 468
 469        if (rt_rq->rt_time > rt_rq->rt_runtime) {
 470                spin_unlock(&rt_rq->rt_runtime_lock);
 471                more = do_balance_runtime(rt_rq);
 472                spin_lock(&rt_rq->rt_runtime_lock);
 473        }
 474
 475        return more;
 476}
 477#else /* !CONFIG_SMP */
 478static inline int balance_runtime(struct rt_rq *rt_rq)
 479{
 480        return 0;
 481}
 482#endif /* CONFIG_SMP */
 483
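     /*
      * Called from the rt_bandwidth period timer.  For every rt_rq in the
      * period mask this refreshes the accumulated rt_time (paying off up
      * to 'overrun' periods worth of runtime), unthrottles runqueues that
      * are back under their runtime and re-enqueues them.  Returns 1 when
      * nothing is left to do, so the caller can let the timer go idle.
      */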
 484static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 485{
 486        int i, idle = 1;
 487        const struct cpumask *span;
 488
 489        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 490                return 1;
 491
 492        span = sched_rt_period_mask();
 493        for_each_cpu(i, span) {
 494                int enqueue = 0;
 495                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 496                struct rq *rq = rq_of_rt_rq(rt_rq);
 497
 498                spin_lock(&rq->lock);
 499                if (rt_rq->rt_time) {
 500                        u64 runtime;
 501
 502                        spin_lock(&rt_rq->rt_runtime_lock);
 503                        if (rt_rq->rt_throttled)
 504                                balance_runtime(rt_rq);
 505                        runtime = rt_rq->rt_runtime;
 506                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 507                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 508                                rt_rq->rt_throttled = 0;
 509                                enqueue = 1;
 510                        }
 511                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
 512                                idle = 0;
 513                        spin_unlock(&rt_rq->rt_runtime_lock);
 514                } else if (rt_rq->rt_nr_running)
 515                        idle = 0;
 516
 517                if (enqueue)
 518                        sched_rt_rq_enqueue(rt_rq);
 519                spin_unlock(&rq->lock);
 520        }
 521
 522        return idle;
 523}
 524
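     /*
      * Effective priority of an entity: for a group entity this is the
      * priority of the highest-priority task queued below it; for a task
      * entity it is simply the task's prio.
      */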
 525static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 526{
 527#ifdef CONFIG_RT_GROUP_SCHED
 528        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 529
 530        if (rt_rq)
 531                return rt_rq->highest_prio.curr;
 532#endif
 533
 534        return rt_task_of(rt_se)->prio;
 535}
 536
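     /*
      * Bandwidth enforcement, called with rt_rq->rt_runtime_lock held from
      * update_curr_rt().  Tries to borrow runtime from other CPUs before
      * giving up; returns 1 if the runqueue has to be throttled (it has
      * just been dequeued), in which case the caller reschedules.
      */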
 537static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 538{
 539        u64 runtime = sched_rt_runtime(rt_rq);
 540
 541        if (rt_rq->rt_throttled)
 542                return rt_rq_throttled(rt_rq);
 543
 544        if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
 545                return 0;
 546
 547        balance_runtime(rt_rq);
 548        runtime = sched_rt_runtime(rt_rq);
 549        if (runtime == RUNTIME_INF)
 550                return 0;
 551
 552        if (rt_rq->rt_time > runtime) {
 553                rt_rq->rt_throttled = 1;
 554                if (rt_rq_throttled(rt_rq)) {
 555                        sched_rt_rq_dequeue(rt_rq);
 556                        return 1;
 557                }
 558        }
 559
 560        return 0;
 561}
 562
 563/*
 564 * Update the current task's runtime statistics. Skip current tasks that
 565 * are not in our scheduling class.
 566 */
 567static void update_curr_rt(struct rq *rq)
 568{
 569        struct task_struct *curr = rq->curr;
 570        struct sched_rt_entity *rt_se = &curr->rt;
 571        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 572        u64 delta_exec;
 573
 574        if (!task_has_rt_policy(curr))
 575                return;
 576
 577        delta_exec = rq->clock - curr->se.exec_start;
 578        if (unlikely((s64)delta_exec < 0))
 579                delta_exec = 0;
 580
 581        schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
 582
 583        curr->se.sum_exec_runtime += delta_exec;
 584        account_group_exec_runtime(curr, delta_exec);
 585
 586        curr->se.exec_start = rq->clock;
 587        cpuacct_charge(curr, delta_exec);
 588
 589        if (!rt_bandwidth_enabled())
 590                return;
 591
 592        for_each_sched_rt_entity(rt_se) {
 593                rt_rq = rt_rq_of_se(rt_se);
 594
 595                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 596                        spin_lock(&rt_rq->rt_runtime_lock);
 597                        rt_rq->rt_time += delta_exec;
 598                        if (sched_rt_runtime_exceeded(rt_rq))
 599                                resched_task(curr);
 600                        spin_unlock(&rt_rq->rt_runtime_lock);
 601                }
 602        }
 603}
 604
 605#if defined CONFIG_SMP
 606
 607static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
 608
 609static inline int next_prio(struct rq *rq)
 610{
 611        struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
 612
 613        if (next && rt_prio(next->prio))
 614                return next->prio;
 615        else
 616                return MAX_RT_PRIO;
 617}
 618
 619static void
 620inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 621{
 622        struct rq *rq = rq_of_rt_rq(rt_rq);
 623
 624        if (prio < prev_prio) {
 625
 626                /*
 627                 * If the new task is higher in priority than anything on the
 628                 * run-queue, we know that the previous high becomes our
 629                 * next-highest.
 630                 */
 631                rt_rq->highest_prio.next = prev_prio;
 632
 633                if (rq->online)
 634                        cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 635
 636        } else if (prio == rt_rq->highest_prio.curr)
 637                /*
 638                 * If the next task is equal in priority to the highest on
 639                 * the run-queue, then we implicitly know that the next highest
 640                 * task cannot be any lower than current
 641                 */
 642                rt_rq->highest_prio.next = prio;
 643        else if (prio < rt_rq->highest_prio.next)
 644                /*
 645                 * Otherwise, we need to recompute next-highest
 646                 */
 647                rt_rq->highest_prio.next = next_prio(rq);
 648}
 649
 650static void
 651dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 652{
 653        struct rq *rq = rq_of_rt_rq(rt_rq);
 654
 655        if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
 656                rt_rq->highest_prio.next = next_prio(rq);
 657
 658        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 659                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 660}
 661
 662#else /* CONFIG_SMP */
 663
 664static inline
 665void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 666static inline
 667void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 668
 669#endif /* CONFIG_SMP */
 670
 671#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 672static void
 673inc_rt_prio(struct rt_rq *rt_rq, int prio)
 674{
 675        int prev_prio = rt_rq->highest_prio.curr;
 676
 677        if (prio < prev_prio)
 678                rt_rq->highest_prio.curr = prio;
 679
 680        inc_rt_prio_smp(rt_rq, prio, prev_prio);
 681}
 682
 683static void
 684dec_rt_prio(struct rt_rq *rt_rq, int prio)
 685{
 686        int prev_prio = rt_rq->highest_prio.curr;
 687
 688        if (rt_rq->rt_nr_running) {
 689
 690                WARN_ON(prio < prev_prio);
 691
 692                /*
 693                 * This may have been our highest task, and therefore
 694                 * we may have some recomputation to do
 695                 */
 696                if (prio == prev_prio) {
 697                        struct rt_prio_array *array = &rt_rq->active;
 698
 699                        rt_rq->highest_prio.curr =
 700                                sched_find_first_bit(array->bitmap);
 701                }
 702
 703        } else
 704                rt_rq->highest_prio.curr = MAX_RT_PRIO;
 705
 706        dec_rt_prio_smp(rt_rq, prio, prev_prio);
 707}
 708
 709#else
 710
 711static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
 712static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
 713
 714#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 715
 716#ifdef CONFIG_RT_GROUP_SCHED
 717
 718static void
 719inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 720{
 721        if (rt_se_boosted(rt_se))
 722                rt_rq->rt_nr_boosted++;
 723
 724        if (rt_rq->tg)
 725                start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
 726}
 727
 728static void
 729dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 730{
 731        if (rt_se_boosted(rt_se))
 732                rt_rq->rt_nr_boosted--;
 733
 734        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
 735}
 736
 737#else /* CONFIG_RT_GROUP_SCHED */
 738
 739static void
 740inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 741{
 742        start_rt_bandwidth(&def_rt_bandwidth);
 743}
 744
 745static inline
 746void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 747
 748#endif /* CONFIG_RT_GROUP_SCHED */
 749
 750static inline
 751void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 752{
 753        int prio = rt_se_prio(rt_se);
 754
 755        WARN_ON(!rt_prio(prio));
 756        rt_rq->rt_nr_running++;
 757
 758        inc_rt_prio(rt_rq, prio);
 759        inc_rt_migration(rt_se, rt_rq);
 760        inc_rt_group(rt_se, rt_rq);
 761}
 762
 763static inline
 764void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 765{
 766        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 767        WARN_ON(!rt_rq->rt_nr_running);
 768        rt_rq->rt_nr_running--;
 769
 770        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
 771        dec_rt_migration(rt_se, rt_rq);
 772        dec_rt_group(rt_se, rt_rq);
 773}
 774
 775static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
 776{
 777        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 778        struct rt_prio_array *array = &rt_rq->active;
 779        struct rt_rq *group_rq = group_rt_rq(rt_se);
 780        struct list_head *queue = array->queue + rt_se_prio(rt_se);
 781
 782        /*
  783         * Don't enqueue the group if it is throttled, or when it is empty.
  784         * The latter is a consequence of the former when a child group
  785         * gets throttled and the current group doesn't have any other
 786         * active members.
 787         */
 788        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 789                return;
 790
 791        list_add_tail(&rt_se->run_list, queue);
 792        __set_bit(rt_se_prio(rt_se), array->bitmap);
 793
 794        inc_rt_tasks(rt_se, rt_rq);
 795}
 796
 797static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 798{
 799        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 800        struct rt_prio_array *array = &rt_rq->active;
 801
 802        list_del_init(&rt_se->run_list);
 803        if (list_empty(array->queue + rt_se_prio(rt_se)))
 804                __clear_bit(rt_se_prio(rt_se), array->bitmap);
 805
 806        dec_rt_tasks(rt_se, rt_rq);
 807}
 808
 809/*
 810 * Because the prio of an upper entry depends on the lower
  811 * entries, we must remove entries top-down.
 812 */
 813static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 814{
 815        struct sched_rt_entity *back = NULL;
 816
 817        for_each_sched_rt_entity(rt_se) {
 818                rt_se->back = back;
 819                back = rt_se;
 820        }
 821
 822        for (rt_se = back; rt_se; rt_se = rt_se->back) {
 823                if (on_rt_rq(rt_se))
 824                        __dequeue_rt_entity(rt_se);
 825        }
 826}
 827
 828static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 829{
 830        dequeue_rt_stack(rt_se);
 831        for_each_sched_rt_entity(rt_se)
 832                __enqueue_rt_entity(rt_se);
 833}
 834
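     /*
      * Dequeue the whole stack top-down, then re-enqueue any group
      * entities on the path that still have runnable children: only the
      * entity actually being removed (and any now-empty ancestors) stay
      * off the runqueue.
      */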
 835static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 836{
 837        dequeue_rt_stack(rt_se);
 838
 839        for_each_sched_rt_entity(rt_se) {
 840                struct rt_rq *rt_rq = group_rt_rq(rt_se);
 841
 842                if (rt_rq && rt_rq->rt_nr_running)
 843                        __enqueue_rt_entity(rt_se);
 844        }
 845}
 846
 847/*
 848 * Adding/removing a task to/from a priority array:
 849 */
 850static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 851{
 852        struct sched_rt_entity *rt_se = &p->rt;
 853
 854        if (wakeup)
 855                rt_se->timeout = 0;
 856
 857        enqueue_rt_entity(rt_se);
 858
 859        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 860                enqueue_pushable_task(rq, p);
 861
 862        inc_cpu_load(rq, p->se.load.weight);
 863}
 864
 865static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 866{
 867        struct sched_rt_entity *rt_se = &p->rt;
 868
 869        update_curr_rt(rq);
 870        dequeue_rt_entity(rt_se);
 871
 872        dequeue_pushable_task(rq, p);
 873
 874        dec_cpu_load(rq, p->se.load.weight);
 875}
 876
 877/*
  878 * Put the task at the end of the run list without the overhead of a
  879 * dequeue followed by an enqueue.
 880 */
 881static void
 882requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 883{
 884        if (on_rt_rq(rt_se)) {
 885                struct rt_prio_array *array = &rt_rq->active;
 886                struct list_head *queue = array->queue + rt_se_prio(rt_se);
 887
 888                if (head)
 889                        list_move(&rt_se->run_list, queue);
 890                else
 891                        list_move_tail(&rt_se->run_list, queue);
 892        }
 893}
 894
 895static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 896{
 897        struct sched_rt_entity *rt_se = &p->rt;
 898        struct rt_rq *rt_rq;
 899
 900        for_each_sched_rt_entity(rt_se) {
 901                rt_rq = rt_rq_of_se(rt_se);
 902                requeue_rt_entity(rt_rq, rt_se, head);
 903        }
 904}
 905
 906static void yield_task_rt(struct rq *rq)
 907{
 908        requeue_task_rt(rq, rq->curr, 0);
 909}
 910
 911#ifdef CONFIG_SMP
 912static int find_lowest_rq(struct task_struct *task);
 913
 914static int select_task_rq_rt(struct task_struct *p, int sync)
 915{
 916        struct rq *rq = task_rq(p);
 917
 918        /*
 919         * If the current task is an RT task, then
 920         * try to see if we can wake this RT task up on another
 921         * runqueue. Otherwise simply start this RT task
 922         * on its current runqueue.
 923         *
  924         * We want to avoid overloading runqueues, even if the woken
  925         * RT task is of higher priority than the current RT task.
  926         * RT tasks behave differently than other tasks: if one gets
  927         * preempted, we try to push it off to another queue.
  928         * So trying to keep a preempting RT task on the same
  929         * cache-hot CPU would force the running RT task onto a
  930         * cold CPU. We would waste all of the cache of the lower
  931         * priority RT task in the hope of saving some of it for an
  932         * RT task that is just being woken and probably has a
  933         * cold cache anyway.
 934         */
 935        if (unlikely(rt_task(rq->curr)) &&
 936            (p->rt.nr_cpus_allowed > 1)) {
 937                int cpu = find_lowest_rq(p);
 938
 939                return (cpu == -1) ? task_cpu(p) : cpu;
 940        }
 941
 942        /*
 943         * Otherwise, just let it ride on the affined RQ and the
 944         * post-schedule router will push the preempted task away
 945         */
 946        return task_cpu(p);
 947}
 948
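     /*
      * An equal-priority task has woken up here.  If 'p' has nowhere else
      * to go (single-CPU affinity, or cpupri finds no lower-priority CPU
      * for it) but current could run elsewhere, requeue 'p' at the head
      * and reschedule so the push logic can move current away.
      */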
 949static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 950{
 951        if (rq->curr->rt.nr_cpus_allowed == 1)
 952                return;
 953
 954        if (p->rt.nr_cpus_allowed != 1
 955            && cpupri_find(&rq->rd->cpupri, p, NULL))
 956                return;
 957
 958        if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
 959                return;
 960
 961        /*
  962         * There appear to be other CPUs that can accept
  963         * current and none to run 'p', so let's reschedule
 964         * to try and push current away:
 965         */
 966        requeue_task_rt(rq, p, 1);
 967        resched_task(rq->curr);
 968}
 969
 970#endif /* CONFIG_SMP */
 971
 972/*
 973 * Preempt the current task with a newly woken task if needed:
 974 */
 975static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
 976{
 977        if (p->prio < rq->curr->prio) {
 978                resched_task(rq->curr);
 979                return;
 980        }
 981
 982#ifdef CONFIG_SMP
 983        /*
 984         * If:
 985         *
 986         * - the newly woken task is of equal priority to the current task
 987         * - the newly woken task is non-migratable while current is migratable
 988         * - current will be preempted on the next reschedule
 989         *
 990         * we should check to see if current can readily move to a different
 991         * cpu.  If so, we will reschedule to allow the push logic to try
 992         * to move current somewhere else, making room for our non-migratable
 993         * task.
 994         */
 995        if (p->prio == rq->curr->prio && !need_resched())
 996                check_preempt_equal_prio(rq, p);
 997#endif
 998}
 999
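     /*
      * Pick the first entity on the highest-priority non-empty queue, as
      * found via the priority bitmap.
      */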
1000static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1001                                                   struct rt_rq *rt_rq)
1002{
1003        struct rt_prio_array *array = &rt_rq->active;
1004        struct sched_rt_entity *next = NULL;
1005        struct list_head *queue;
1006        int idx;
1007
1008        idx = sched_find_first_bit(array->bitmap);
1009        BUG_ON(idx >= MAX_RT_PRIO);
1010
1011        queue = array->queue + idx;
1012        next = list_entry(queue->next, struct sched_rt_entity, run_list);
1013
1014        return next;
1015}
1016
1017static struct task_struct *_pick_next_task_rt(struct rq *rq)
1018{
1019        struct sched_rt_entity *rt_se;
1020        struct task_struct *p;
1021        struct rt_rq *rt_rq;
1022
1023        rt_rq = &rq->rt;
1024
1025        if (unlikely(!rt_rq->rt_nr_running))
1026                return NULL;
1027
1028        if (rt_rq_throttled(rt_rq))
1029                return NULL;
1030
1031        do {
1032                rt_se = pick_next_rt_entity(rq, rt_rq);
1033                BUG_ON(!rt_se);
1034                rt_rq = group_rt_rq(rt_se);
1035        } while (rt_rq);
1036
1037        p = rt_task_of(rt_se);
1038        p->se.exec_start = rq->clock;
1039
1040        return p;
1041}
1042
1043static struct task_struct *pick_next_task_rt(struct rq *rq)
1044{
1045        struct task_struct *p = _pick_next_task_rt(rq);
1046
1047        /* The running task is never eligible for pushing */
1048        if (p)
1049                dequeue_pushable_task(rq, p);
1050
1051        return p;
1052}
1053
1054static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1055{
1056        update_curr_rt(rq);
1057        p->se.exec_start = 0;
1058
1059        /*
1060         * The previous task needs to be made eligible for pushing
1061         * if it is still active
1062         */
1063        if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
1064                enqueue_pushable_task(rq, p);
1065}
1066
1067#ifdef CONFIG_SMP
1068
1069/* Only try algorithms three times */
1070#define RT_MAX_TRIES 3
1071
1072static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1073
1074static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1075{
1076        if (!task_running(rq, p) &&
1077            (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1078            (p->rt.nr_cpus_allowed > 1))
1079                return 1;
1080        return 0;
1081}
1082
1083/* Return the second highest RT task, NULL otherwise */
1084static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1085{
1086        struct task_struct *next = NULL;
1087        struct sched_rt_entity *rt_se;
1088        struct rt_prio_array *array;
1089        struct rt_rq *rt_rq;
1090        int idx;
1091
1092        for_each_leaf_rt_rq(rt_rq, rq) {
1093                array = &rt_rq->active;
1094                idx = sched_find_first_bit(array->bitmap);
1095 next_idx:
1096                if (idx >= MAX_RT_PRIO)
1097                        continue;
1098                if (next && next->prio < idx)
1099                        continue;
1100                list_for_each_entry(rt_se, array->queue + idx, run_list) {
1101                        struct task_struct *p = rt_task_of(rt_se);
1102                        if (pick_rt_task(rq, p, cpu)) {
1103                                next = p;
1104                                break;
1105                        }
1106                }
1107                if (!next) {
1108                        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1109                        goto next_idx;
1110                }
1111        }
1112
1113        return next;
1114}
1115
1116static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1117
1118static inline int pick_optimal_cpu(int this_cpu,
1119                                   const struct cpumask *mask)
1120{
1121        int first;
1122
1123        /* "this_cpu" is cheaper to preempt than a remote processor */
1124        if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
1125                return this_cpu;
1126
1127        first = cpumask_first(mask);
1128        if (first < nr_cpu_ids)
1129                return first;
1130
1131        return -1;
1132}
1133
1134static int find_lowest_rq(struct task_struct *task)
1135{
1136        struct sched_domain *sd;
1137        struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1138        int this_cpu = smp_processor_id();
1139        int cpu      = task_cpu(task);
1140        cpumask_var_t domain_mask;
1141
1142        if (task->rt.nr_cpus_allowed == 1)
1143                return -1; /* No other targets possible */
1144
1145        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1146                return -1; /* No targets found */
1147
1148        /*
1149         * Only consider CPUs that are usable for migration.
1150         * I guess we might want to change cpupri_find() to ignore those
1151         * in the first place.
1152         */
1153        cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1154
1155        /*
1156         * At this point we have built a mask of cpus representing the
1157         * lowest priority tasks in the system.  Now we want to elect
1158         * the best one based on our affinity and topology.
1159         *
1160         * We prioritize the last cpu that the task executed on since
1161         * it is most likely cache-hot in that location.
1162         */
1163        if (cpumask_test_cpu(cpu, lowest_mask))
1164                return cpu;
1165
1166        /*
1167         * Otherwise, we consult the sched_domains span maps to figure
1168         * out which cpu is logically closest to our hot cache data.
1169         */
1170        if (this_cpu == cpu)
1171                this_cpu = -1; /* Skip this_cpu opt if the same */
1172
1173        if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1174                for_each_domain(cpu, sd) {
1175                        if (sd->flags & SD_WAKE_AFFINE) {
1176                                int best_cpu;
1177
1178                                cpumask_and(domain_mask,
1179                                            sched_domain_span(sd),
1180                                            lowest_mask);
1181
1182                                best_cpu = pick_optimal_cpu(this_cpu,
1183                                                            domain_mask);
1184
1185                                if (best_cpu != -1) {
1186                                        free_cpumask_var(domain_mask);
1187                                        return best_cpu;
1188                                }
1189                        }
1190                }
1191                free_cpumask_var(domain_mask);
1192        }
1193
1194        /*
1195         * And finally, if there were no matches within the domains
1196         * just give the caller *something* to work with from the compatible
1197         * locations.
1198         */
1199        return pick_optimal_cpu(this_cpu, lowest_mask);
1200}
1201
1202/* Will lock the rq it finds */
1203static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1204{
1205        struct rq *lowest_rq = NULL;
1206        int tries;
1207        int cpu;
1208
1209        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1210                cpu = find_lowest_rq(task);
1211
1212                if ((cpu == -1) || (cpu == rq->cpu))
1213                        break;
1214
1215                lowest_rq = cpu_rq(cpu);
1216
1217                /* if the prio of this runqueue changed, try again */
1218                if (double_lock_balance(rq, lowest_rq)) {
1219                        /*
1220                         * We had to unlock the run queue. In
 1221                         * the meantime, the task could have
1222                         * migrated already or had its affinity changed.
1223                         * Also make sure that it wasn't scheduled on its rq.
1224                         */
1225                        if (unlikely(task_rq(task) != rq ||
1226                                     !cpumask_test_cpu(lowest_rq->cpu,
1227                                                       &task->cpus_allowed) ||
1228                                     task_running(rq, task) ||
1229                                     !task->se.on_rq)) {
1230
1231                                spin_unlock(&lowest_rq->lock);
1232                                lowest_rq = NULL;
1233                                break;
1234                        }
1235                }
1236
1237                /* If this rq is still suitable use it. */
1238                if (lowest_rq->rt.highest_prio.curr > task->prio)
1239                        break;
1240
1241                /* try again */
1242                double_unlock_balance(rq, lowest_rq);
1243                lowest_rq = NULL;
1244        }
1245
1246        return lowest_rq;
1247}
1248
1249static inline int has_pushable_tasks(struct rq *rq)
1250{
1251        return !plist_head_empty(&rq->rt.pushable_tasks);
1252}
1253
1254static struct task_struct *pick_next_pushable_task(struct rq *rq)
1255{
1256        struct task_struct *p;
1257
1258        if (!has_pushable_tasks(rq))
1259                return NULL;
1260
1261        p = plist_first_entry(&rq->rt.pushable_tasks,
1262                              struct task_struct, pushable_tasks);
1263
1264        BUG_ON(rq->cpu != task_cpu(p));
1265        BUG_ON(task_current(rq, p));
1266        BUG_ON(p->rt.nr_cpus_allowed <= 1);
1267
1268        BUG_ON(!p->se.on_rq);
1269        BUG_ON(!rt_task(p));
1270
1271        return p;
1272}
1273
1274/*
1275 * If the current CPU has more than one RT task, see if the non
1276 * running task can migrate over to a CPU that is running a task
1277 * of lesser priority.
1278 */
1279static int push_rt_task(struct rq *rq)
1280{
1281        struct task_struct *next_task;
1282        struct rq *lowest_rq;
1283
1284        if (!rq->rt.overloaded)
1285                return 0;
1286
1287        next_task = pick_next_pushable_task(rq);
1288        if (!next_task)
1289                return 0;
1290
1291 retry:
1292        if (unlikely(next_task == rq->curr)) {
1293                WARN_ON(1);
1294                return 0;
1295        }
1296
1297        /*
 1298         * It's possible that the next_task slipped in with a
 1299         * higher priority than current. If that's the case
1300         * just reschedule current.
1301         */
1302        if (unlikely(next_task->prio < rq->curr->prio)) {
1303                resched_task(rq->curr);
1304                return 0;
1305        }
1306
1307        /* We might release rq lock */
1308        get_task_struct(next_task);
1309
1310        /* find_lock_lowest_rq locks the rq if found */
1311        lowest_rq = find_lock_lowest_rq(next_task, rq);
1312        if (!lowest_rq) {
1313                struct task_struct *task;
1314                /*
 1315                 * find_lock_lowest_rq() releases rq->lock
1316                 * so it is possible that next_task has migrated.
1317                 *
1318                 * We need to make sure that the task is still on the same
1319                 * run-queue and is also still the next task eligible for
1320                 * pushing.
1321                 */
1322                task = pick_next_pushable_task(rq);
1323                if (task_cpu(next_task) == rq->cpu && task == next_task) {
1324                        /*
 1325                         * If we get here, the task hasn't moved at all, but
 1326                         * it has failed to push.  We will not try again,
 1327                         * since the other CPUs will pull from us when they
1328                         * are ready.
1329                         */
1330                        dequeue_pushable_task(rq, next_task);
1331                        goto out;
1332                }
1333
1334                if (!task)
1335                        /* No more tasks, just exit */
1336                        goto out;
1337
1338                /*
1339                 * Something has shifted, try again.
1340                 */
1341                put_task_struct(next_task);
1342                next_task = task;
1343                goto retry;
1344        }
1345
1346        deactivate_task(rq, next_task, 0);
1347        set_task_cpu(next_task, lowest_rq->cpu);
1348        activate_task(lowest_rq, next_task, 0);
1349
1350        resched_task(lowest_rq->curr);
1351
1352        double_unlock_balance(rq, lowest_rq);
1353
1354out:
1355        put_task_struct(next_task);
1356
1357        return 1;
1358}
1359
1360static void push_rt_tasks(struct rq *rq)
1361{
1362        /* push_rt_task will return true if it moved an RT */
1363        while (push_rt_task(rq))
1364                ;
1365}
1366
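     /*
      * Scan the CPUs marked RT-overloaded in the root domain and pull over
      * the highest-priority pushable task that this runqueue could run
      * ahead of whatever it is about to schedule.  Returns 1 if at least
      * one task was pulled.
      */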
1367static int pull_rt_task(struct rq *this_rq)
1368{
1369        int this_cpu = this_rq->cpu, ret = 0, cpu;
1370        struct task_struct *p;
1371        struct rq *src_rq;
1372
1373        if (likely(!rt_overloaded(this_rq)))
1374                return 0;
1375
1376        for_each_cpu(cpu, this_rq->rd->rto_mask) {
1377                if (this_cpu == cpu)
1378                        continue;
1379
1380                src_rq = cpu_rq(cpu);
1381
1382                /*
1383                 * Don't bother taking the src_rq->lock if the next highest
1384                 * task is known to be lower-priority than our current task.
1385                 * This may look racy, but if this value is about to go
1386                 * logically higher, the src_rq will push this task away.
 1387                 * And if it is going logically lower, we do not care.
1388                 */
1389                if (src_rq->rt.highest_prio.next >=
1390                    this_rq->rt.highest_prio.curr)
1391                        continue;
1392
1393                /*
1394                 * We can potentially drop this_rq's lock in
1395                 * double_lock_balance, and another CPU could
1396                 * alter this_rq
1397                 */
1398                double_lock_balance(this_rq, src_rq);
1399
1400                /*
1401                 * Are there still pullable RT tasks?
1402                 */
1403                if (src_rq->rt.rt_nr_running <= 1)
1404                        goto skip;
1405
1406                p = pick_next_highest_task_rt(src_rq, this_cpu);
1407
1408                /*
1409                 * Do we have an RT task that preempts
1410                 * the to-be-scheduled task?
1411                 */
1412                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1413                        WARN_ON(p == src_rq->curr);
1414                        WARN_ON(!p->se.on_rq);
1415
1416                        /*
1417                         * There's a chance that p is higher in priority
1418                         * than what's currently running on its cpu.
 1419                         * This happens when p is just waking up and hasn't
 1420                         * had a chance to schedule yet. We only pull
 1421                         * p if it is lower in priority than the
 1422                         * current task on the run queue.
1423                         */
1424                        if (p->prio < src_rq->curr->prio)
1425                                goto skip;
1426
1427                        ret = 1;
1428
1429                        deactivate_task(src_rq, p, 0);
1430                        set_task_cpu(p, this_cpu);
1431                        activate_task(this_rq, p, 0);
1432                        /*
1433                         * We continue with the search, just in
1434                         * case there's an even higher prio task
 1435                         * in another runqueue. (low likelihood
1436                         * but possible)
1437                         */
1438                }
1439 skip:
1440                double_unlock_balance(this_rq, src_rq);
1441        }
1442
1443        return ret;
1444}
1445
1446static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1447{
1448        /* Try to pull RT tasks here if we lower this rq's prio */
1449        if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
1450                pull_rt_task(rq);
1451}
1452
1453/*
1454 * assumes rq->lock is held
1455 */
1456static int needs_post_schedule_rt(struct rq *rq)
1457{
1458        return has_pushable_tasks(rq);
1459}
1460
1461static void post_schedule_rt(struct rq *rq)
1462{
1463        /*
1464         * This is only called if needs_post_schedule_rt() indicates that
1465         * we need to push tasks away
1466         */
1467        spin_lock_irq(&rq->lock);
1468        push_rt_tasks(rq);
1469        spin_unlock_irq(&rq->lock);
1470}
1471
1472/*
1473 * If we are not running and we are not going to reschedule soon, we should
1474 * try to push tasks away now
1475 */
1476static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1477{
1478        if (!task_running(rq, p) &&
1479            !test_tsk_need_resched(rq->curr) &&
1480            has_pushable_tasks(rq) &&
1481            p->rt.nr_cpus_allowed > 1)
1482                push_rt_tasks(rq);
1483}
1484
1485static unsigned long
1486load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1487                unsigned long max_load_move,
1488                struct sched_domain *sd, enum cpu_idle_type idle,
1489                int *all_pinned, int *this_best_prio)
1490{
1491        /* don't touch RT tasks */
1492        return 0;
1493}
1494
1495static int
1496move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1497                 struct sched_domain *sd, enum cpu_idle_type idle)
1498{
1499        /* don't touch RT tasks */
1500        return 0;
1501}
1502
1503static void set_cpus_allowed_rt(struct task_struct *p,
1504                                const struct cpumask *new_mask)
1505{
1506        int weight = cpumask_weight(new_mask);
1507
1508        BUG_ON(!rt_task(p));
1509
1510        /*
1511         * Update the migration status of the RQ if we have an RT task
1512         * which is running AND changing its weight value.
1513         */
1514        if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1515                struct rq *rq = task_rq(p);
1516
1517                if (!task_current(rq, p)) {
1518                        /*
1519                         * Make sure we dequeue this task from the pushable list
1520                         * before going further.  It will either remain off of
1521                         * the list because we are no longer pushable, or it
1522                         * will be requeued.
1523                         */
1524                        if (p->rt.nr_cpus_allowed > 1)
1525                                dequeue_pushable_task(rq, p);
1526
1527                        /*
1528                         * Requeue if our weight is changing and still > 1
1529                         */
1530                        if (weight > 1)
1531                                enqueue_pushable_task(rq, p);
1532
1533                }
1534
1535                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1536                        rq->rt.rt_nr_migratory++;
1537                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1538                        BUG_ON(!rq->rt.rt_nr_migratory);
1539                        rq->rt.rt_nr_migratory--;
1540                }
1541
1542                update_rt_migration(&rq->rt);
1543        }
1544
1545        cpumask_copy(&p->cpus_allowed, new_mask);
1546        p->rt.nr_cpus_allowed = weight;
1547}
1548
1549/* Assumes rq->lock is held */
1550static void rq_online_rt(struct rq *rq)
1551{
1552        if (rq->rt.overloaded)
1553                rt_set_overload(rq);
1554
1555        __enable_runtime(rq);
1556
1557        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1558}
1559
1560/* Assumes rq->lock is held */
1561static void rq_offline_rt(struct rq *rq)
1562{
1563        if (rq->rt.overloaded)
1564                rt_clear_overload(rq);
1565
1566        __disable_runtime(rq);
1567
1568        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1569}
1570
1571/*
 1572 * When switching away from the RT queue, we bring ourselves to a
 1573 * position where we might want to pull RT tasks from other runqueues.
1574 */
1575static void switched_from_rt(struct rq *rq, struct task_struct *p,
1576                           int running)
1577{
1578        /*
1579         * If there are other RT tasks then we will reschedule
1580         * and the scheduling of the other RT tasks will handle
1581         * the balancing. But if we are the last RT task
1582         * we may need to handle the pulling of RT tasks
1583         * now.
1584         */
1585        if (!rq->rt.rt_nr_running)
1586                pull_rt_task(rq);
1587}
1588
1589static inline void init_sched_rt_class(void)
1590{
1591        unsigned int i;
1592
1593        for_each_possible_cpu(i)
1594                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1595                                        GFP_KERNEL, cpu_to_node(i));
1596}
1597#endif /* CONFIG_SMP */
1598
1599/*
1600 * When switching a task to RT, we may overload the runqueue
1601 * with RT tasks. In this case we try to push them off to
1602 * other runqueues.
1603 */
1604static void switched_to_rt(struct rq *rq, struct task_struct *p,
1605                           int running)
1606{
1607        int check_resched = 1;
1608
1609        /*
1610         * If we are already running, then there's nothing
1611         * that needs to be done. But if we are not running
1612         * we may need to preempt the current running task.
1613         * If that current running task is also an RT task
1614         * then see if we can move to another run queue.
1615         */
1616        if (!running) {
1617#ifdef CONFIG_SMP
1618                if (rq->rt.overloaded && push_rt_task(rq) &&
1619                    /* Don't resched if we changed runqueues */
1620                    rq != task_rq(p))
1621                        check_resched = 0;
1622#endif /* CONFIG_SMP */
1623                if (check_resched && p->prio < rq->curr->prio)
1624                        resched_task(rq->curr);
1625        }
1626}
1627
1628/*
1629 * Priority of the task has changed. This may cause
1630 * us to initiate a push or pull.
1631 */
1632static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1633                            int oldprio, int running)
1634{
1635        if (running) {
1636#ifdef CONFIG_SMP
1637                /*
1638                 * If our priority decreases while running, we
1639                 * may need to pull tasks to this runqueue.
1640                 */
1641                if (oldprio < p->prio)
1642                        pull_rt_task(rq);
1643                /*
1644                 * If there's a higher priority task waiting to run
1645                 * then reschedule. Note, the above pull_rt_task
1646                 * can release the rq lock and p could migrate.
1647                 * Only reschedule if p is still on the same runqueue.
1648                 */
1649                if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1650                        resched_task(p);
1651#else
1652                /* For UP simply resched on drop of prio */
1653                if (oldprio < p->prio)
1654                        resched_task(p);
1655#endif /* CONFIG_SMP */
1656        } else {
1657                /*
 1658                 * This task is not running, but if its priority
 1659                 * is higher than that of the currently running
 1660                 * task, then reschedule.
1661                 */
1662                if (p->prio < rq->curr->prio)
1663                        resched_task(rq->curr);
1664        }
1665}
1666
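     /*
      * RLIMIT_RTTIME watchdog: p->rt.timeout counts scheduler ticks spent
      * as an RT task (it is cleared again on wakeup, see enqueue_task_rt).
      * The rlimit is given in microseconds, so convert it to ticks here;
      * once the soft limit is exceeded, arm the task's CPU-timer expiry so
      * the posix-cpu-timers path notices it and delivers the watchdog
      * signals.
      */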
1667static void watchdog(struct rq *rq, struct task_struct *p)
1668{
1669        unsigned long soft, hard;
1670
1671        if (!p->signal)
1672                return;
1673
1674        soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1675        hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1676
1677        if (soft != RLIM_INFINITY) {
1678                unsigned long next;
1679
1680                p->rt.timeout++;
1681                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1682                if (p->rt.timeout > next)
1683                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1684        }
1685}
1686
1687static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1688{
1689        update_curr_rt(rq);
1690
1691        watchdog(rq, p);
1692
1693        /*
1694         * RR tasks need a special form of timeslice management.
1695         * FIFO tasks have no timeslices.
1696         */
1697        if (p->policy != SCHED_RR)
1698                return;
1699
1700        if (--p->rt.time_slice)
1701                return;
1702
1703        p->rt.time_slice = DEF_TIMESLICE;
1704
1705        /*
1706         * Requeue to the end of queue if we are not the only element
1707         * on the queue:
1708         */
1709        if (p->rt.run_list.prev != p->rt.run_list.next) {
1710                requeue_task_rt(rq, p, 0);
1711                set_tsk_need_resched(p);
1712        }
1713}
1714
1715static void set_curr_task_rt(struct rq *rq)
1716{
1717        struct task_struct *p = rq->curr;
1718
1719        p->se.exec_start = rq->clock;
1720
1721        /* The running task is never eligible for pushing */
1722        dequeue_pushable_task(rq, p);
1723}
1724
1725static const struct sched_class rt_sched_class = {
1726        .next                   = &fair_sched_class,
1727        .enqueue_task           = enqueue_task_rt,
1728        .dequeue_task           = dequeue_task_rt,
1729        .yield_task             = yield_task_rt,
1730
1731        .check_preempt_curr     = check_preempt_curr_rt,
1732
1733        .pick_next_task         = pick_next_task_rt,
1734        .put_prev_task          = put_prev_task_rt,
1735
1736#ifdef CONFIG_SMP
1737        .select_task_rq         = select_task_rq_rt,
1738
1739        .load_balance           = load_balance_rt,
1740        .move_one_task          = move_one_task_rt,
1741        .set_cpus_allowed       = set_cpus_allowed_rt,
1742        .rq_online              = rq_online_rt,
1743        .rq_offline             = rq_offline_rt,
1744        .pre_schedule           = pre_schedule_rt,
1745        .needs_post_schedule    = needs_post_schedule_rt,
1746        .post_schedule          = post_schedule_rt,
1747        .task_wake_up           = task_wake_up_rt,
1748        .switched_from          = switched_from_rt,
1749#endif
1750
1751        .set_curr_task          = set_curr_task_rt,
1752        .task_tick              = task_tick_rt,
1753
1754        .prio_changed           = prio_changed_rt,
1755        .switched_to            = switched_to_rt,
1756};
1757
1758#ifdef CONFIG_SCHED_DEBUG
1759extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1760
1761static void print_rt_stats(struct seq_file *m, int cpu)
1762{
1763        struct rt_rq *rt_rq;
1764
1765        rcu_read_lock();
1766        for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1767                print_rt_rq(m, cpu, rt_rq);
1768        rcu_read_unlock();
1769}
1770#endif /* CONFIG_SCHED_DEBUG */
1771
1772