linux/kernel/sched_rt.c
<<
>>
Prefs
   1/*
   2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   3 * policies)
   4 */
   5
   6#ifdef CONFIG_RT_GROUP_SCHED
   7
   8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
   9
  10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  11{
  12#ifdef CONFIG_SCHED_DEBUG
  13        WARN_ON_ONCE(!rt_entity_is_task(rt_se));
  14#endif
  15        return container_of(rt_se, struct task_struct, rt);
  16}
  17
  18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  19{
  20        return rt_rq->rq;
  21}
  22
  23static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  24{
  25        return rt_se->rt_rq;
  26}
  27
  28#else /* CONFIG_RT_GROUP_SCHED */
  29
  30#define rt_entity_is_task(rt_se) (1)
  31
  32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  33{
  34        return container_of(rt_se, struct task_struct, rt);
  35}
  36
  37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  38{
  39        return container_of(rt_rq, struct rq, rt);
  40}
  41
  42static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  43{
  44        struct task_struct *p = rt_task_of(rt_se);
  45        struct rq *rq = task_rq(p);
  46
  47        return &rq->rt;
  48}
  49
  50#endif /* CONFIG_RT_GROUP_SCHED */
  51
  52#ifdef CONFIG_SMP
  53
  54static inline int rt_overloaded(struct rq *rq)
  55{
  56        return atomic_read(&rq->rd->rto_count);
  57}
  58
  59static inline void rt_set_overload(struct rq *rq)
  60{
  61        if (!rq->online)
  62                return;
  63
  64        cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
  65        /*
  66         * Make sure the mask is visible before we set
  67         * the overload count. That is checked to determine
  68         * if we should look at the mask. It would be a shame
  69         * if we looked at the mask, but the mask was not
  70         * updated yet.
  71         */
  72        wmb();
  73        atomic_inc(&rq->rd->rto_count);
  74}
  75
  76static inline void rt_clear_overload(struct rq *rq)
  77{
  78        if (!rq->online)
  79                return;
  80
  81        /* the order here really doesn't matter */
  82        atomic_dec(&rq->rd->rto_count);
  83        cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
  84}
  85
  86static void update_rt_migration(struct rt_rq *rt_rq)
  87{
  88        if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
  89                if (!rt_rq->overloaded) {
  90                        rt_set_overload(rq_of_rt_rq(rt_rq));
  91                        rt_rq->overloaded = 1;
  92                }
  93        } else if (rt_rq->overloaded) {
  94                rt_clear_overload(rq_of_rt_rq(rt_rq));
  95                rt_rq->overloaded = 0;
  96        }
  97}
  98
  99static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 100{
 101        if (!rt_entity_is_task(rt_se))
 102                return;
 103
 104        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 105
 106        rt_rq->rt_nr_total++;
 107        if (rt_se->nr_cpus_allowed > 1)
 108                rt_rq->rt_nr_migratory++;
 109
 110        update_rt_migration(rt_rq);
 111}
 112
 113static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 114{
 115        if (!rt_entity_is_task(rt_se))
 116                return;
 117
 118        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 119
 120        rt_rq->rt_nr_total--;
 121        if (rt_se->nr_cpus_allowed > 1)
 122                rt_rq->rt_nr_migratory--;
 123
 124        update_rt_migration(rt_rq);
 125}
 126
 127static inline int has_pushable_tasks(struct rq *rq)
 128{
 129        return !plist_head_empty(&rq->rt.pushable_tasks);
 130}
 131
 132static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 133{
 134        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 135        plist_node_init(&p->pushable_tasks, p->prio);
 136        plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 137
 138        /* Update the highest prio pushable task */
 139        if (p->prio < rq->rt.highest_prio.next)
 140                rq->rt.highest_prio.next = p->prio;
 141}
 142
 143static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 144{
 145        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 146
 147        /* Update the new highest prio pushable task */
 148        if (has_pushable_tasks(rq)) {
 149                p = plist_first_entry(&rq->rt.pushable_tasks,
 150                                      struct task_struct, pushable_tasks);
 151                rq->rt.highest_prio.next = p->prio;
 152        } else
 153                rq->rt.highest_prio.next = MAX_RT_PRIO;
 154}
 155
 156#else
 157
 158static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 159{
 160}
 161
 162static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 163{
 164}
 165
 166static inline
 167void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 168{
 169}
 170
 171static inline
 172void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 173{
 174}
 175
 176#endif /* CONFIG_SMP */
 177
 178static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 179{
 180        return !list_empty(&rt_se->run_list);
 181}
 182
 183#ifdef CONFIG_RT_GROUP_SCHED
 184
 185static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 186{
 187        if (!rt_rq->tg)
 188                return RUNTIME_INF;
 189
 190        return rt_rq->rt_runtime;
 191}
 192
 193static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 194{
 195        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 196}
 197
 198typedef struct task_group *rt_rq_iter_t;
 199
 200static inline struct task_group *next_task_group(struct task_group *tg)
 201{
 202        do {
 203                tg = list_entry_rcu(tg->list.next,
 204                        typeof(struct task_group), list);
 205        } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 206
 207        if (&tg->list == &task_groups)
 208                tg = NULL;
 209
 210        return tg;
 211}
 212
 213#define for_each_rt_rq(rt_rq, iter, rq)                                 \
 214        for (iter = container_of(&task_groups, typeof(*iter), list);    \
 215                (iter = next_task_group(iter)) &&                       \
 216                (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 217
 218static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 219{
 220        list_add_rcu(&rt_rq->leaf_rt_rq_list,
 221                        &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
 222}
 223
 224static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
 225{
 226        list_del_rcu(&rt_rq->leaf_rt_rq_list);
 227}
 228
 229#define for_each_leaf_rt_rq(rt_rq, rq) \
 230        list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 231
 232#define for_each_sched_rt_entity(rt_se) \
 233        for (; rt_se; rt_se = rt_se->parent)
 234
 235static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 236{
 237        return rt_se->my_q;
 238}
 239
 240static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
 241static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 242
 243static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 244{
 245        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 246        struct sched_rt_entity *rt_se;
 247
 248        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 249
 250        rt_se = rt_rq->tg->rt_se[cpu];
 251
 252        if (rt_rq->rt_nr_running) {
 253                if (rt_se && !on_rt_rq(rt_se))
 254                        enqueue_rt_entity(rt_se, false);
 255                if (rt_rq->highest_prio.curr < curr->prio)
 256                        resched_task(curr);
 257        }
 258}
 259
 260static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 261{
 262        struct sched_rt_entity *rt_se;
 263        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 264
 265        rt_se = rt_rq->tg->rt_se[cpu];
 266
 267        if (rt_se && on_rt_rq(rt_se))
 268                dequeue_rt_entity(rt_se);
 269}
 270
 271static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 272{
 273        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 274}
 275
 276static int rt_se_boosted(struct sched_rt_entity *rt_se)
 277{
 278        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 279        struct task_struct *p;
 280
 281        if (rt_rq)
 282                return !!rt_rq->rt_nr_boosted;
 283
 284        p = rt_task_of(rt_se);
 285        return p->prio != p->normal_prio;
 286}
 287
 288#ifdef CONFIG_SMP
 289static inline const struct cpumask *sched_rt_period_mask(void)
 290{
 291        return cpu_rq(smp_processor_id())->rd->span;
 292}
 293#else
 294static inline const struct cpumask *sched_rt_period_mask(void)
 295{
 296        return cpu_online_mask;
 297}
 298#endif
 299
 300static inline
 301struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 302{
 303        return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 304}
 305
 306static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 307{
 308        return &rt_rq->tg->rt_bandwidth;
 309}
 310
 311#else /* !CONFIG_RT_GROUP_SCHED */
 312
 313static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 314{
 315        return rt_rq->rt_runtime;
 316}
 317
 318static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 319{
 320        return ktime_to_ns(def_rt_bandwidth.rt_period);
 321}
 322
 323typedef struct rt_rq *rt_rq_iter_t;
 324
 325#define for_each_rt_rq(rt_rq, iter, rq) \
 326        for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 327
 328static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 329{
 330}
 331
 332static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
 333{
 334}
 335
 336#define for_each_leaf_rt_rq(rt_rq, rq) \
 337        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 338
 339#define for_each_sched_rt_entity(rt_se) \
 340        for (; rt_se; rt_se = NULL)
 341
 342static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 343{
 344        return NULL;
 345}
 346
 347static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 348{
 349        if (rt_rq->rt_nr_running)
 350                resched_task(rq_of_rt_rq(rt_rq)->curr);
 351}
 352
 353static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 354{
 355}
 356
 357static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 358{
 359        return rt_rq->rt_throttled;
 360}
 361
 362static inline const struct cpumask *sched_rt_period_mask(void)
 363{
 364        return cpu_online_mask;
 365}
 366
 367static inline
 368struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 369{
 370        return &cpu_rq(cpu)->rt;
 371}
 372
 373static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 374{
 375        return &def_rt_bandwidth;
 376}
 377
 378#endif /* CONFIG_RT_GROUP_SCHED */
 379
 380#ifdef CONFIG_SMP
 381/*
 382 * We ran out of runtime, see if we can borrow some from our neighbours.
 383 */
 384static int do_balance_runtime(struct rt_rq *rt_rq)
 385{
 386        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 387        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 388        int i, weight, more = 0;
 389        u64 rt_period;
 390
 391        weight = cpumask_weight(rd->span);
 392
 393        raw_spin_lock(&rt_b->rt_runtime_lock);
 394        rt_period = ktime_to_ns(rt_b->rt_period);
 395        for_each_cpu(i, rd->span) {
 396                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 397                s64 diff;
 398
 399                if (iter == rt_rq)
 400                        continue;
 401
 402                raw_spin_lock(&iter->rt_runtime_lock);
 403                /*
 404                 * Either all rqs have inf runtime and there's nothing to steal
 405                 * or __disable_runtime() below sets a specific rq to inf to
 406                 * indicate its been disabled and disalow stealing.
 407                 */
 408                if (iter->rt_runtime == RUNTIME_INF)
 409                        goto next;
 410
 411                /*
 412                 * From runqueues with spare time, take 1/n part of their
 413                 * spare time, but no more than our period.
 414                 */
 415                diff = iter->rt_runtime - iter->rt_time;
 416                if (diff > 0) {
 417                        diff = div_u64((u64)diff, weight);
 418                        if (rt_rq->rt_runtime + diff > rt_period)
 419                                diff = rt_period - rt_rq->rt_runtime;
 420                        iter->rt_runtime -= diff;
 421                        rt_rq->rt_runtime += diff;
 422                        more = 1;
 423                        if (rt_rq->rt_runtime == rt_period) {
 424                                raw_spin_unlock(&iter->rt_runtime_lock);
 425                                break;
 426                        }
 427                }
 428next:
 429                raw_spin_unlock(&iter->rt_runtime_lock);
 430        }
 431        raw_spin_unlock(&rt_b->rt_runtime_lock);
 432
 433        return more;
 434}
 435
 436/*
 437 * Ensure this RQ takes back all the runtime it lend to its neighbours.
 438 */
 439static void __disable_runtime(struct rq *rq)
 440{
 441        struct root_domain *rd = rq->rd;
 442        rt_rq_iter_t iter;
 443        struct rt_rq *rt_rq;
 444
 445        if (unlikely(!scheduler_running))
 446                return;
 447
 448        for_each_rt_rq(rt_rq, iter, rq) {
 449                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 450                s64 want;
 451                int i;
 452
 453                raw_spin_lock(&rt_b->rt_runtime_lock);
 454                raw_spin_lock(&rt_rq->rt_runtime_lock);
 455                /*
 456                 * Either we're all inf and nobody needs to borrow, or we're
 457                 * already disabled and thus have nothing to do, or we have
 458                 * exactly the right amount of runtime to take out.
 459                 */
 460                if (rt_rq->rt_runtime == RUNTIME_INF ||
 461                                rt_rq->rt_runtime == rt_b->rt_runtime)
 462                        goto balanced;
 463                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 464
 465                /*
 466                 * Calculate the difference between what we started out with
 467                 * and what we current have, that's the amount of runtime
 468                 * we lend and now have to reclaim.
 469                 */
 470                want = rt_b->rt_runtime - rt_rq->rt_runtime;
 471
 472                /*
 473                 * Greedy reclaim, take back as much as we can.
 474                 */
 475                for_each_cpu(i, rd->span) {
 476                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 477                        s64 diff;
 478
 479                        /*
 480                         * Can't reclaim from ourselves or disabled runqueues.
 481                         */
 482                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 483                                continue;
 484
 485                        raw_spin_lock(&iter->rt_runtime_lock);
 486                        if (want > 0) {
 487                                diff = min_t(s64, iter->rt_runtime, want);
 488                                iter->rt_runtime -= diff;
 489                                want -= diff;
 490                        } else {
 491                                iter->rt_runtime -= want;
 492                                want -= want;
 493                        }
 494                        raw_spin_unlock(&iter->rt_runtime_lock);
 495
 496                        if (!want)
 497                                break;
 498                }
 499
 500                raw_spin_lock(&rt_rq->rt_runtime_lock);
 501                /*
 502                 * We cannot be left wanting - that would mean some runtime
 503                 * leaked out of the system.
 504                 */
 505                BUG_ON(want);
 506balanced:
 507                /*
 508                 * Disable all the borrow logic by pretending we have inf
 509                 * runtime - in which case borrowing doesn't make sense.
 510                 */
 511                rt_rq->rt_runtime = RUNTIME_INF;
 512                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 513                raw_spin_unlock(&rt_b->rt_runtime_lock);
 514        }
 515}
 516
 517static void disable_runtime(struct rq *rq)
 518{
 519        unsigned long flags;
 520
 521        raw_spin_lock_irqsave(&rq->lock, flags);
 522        __disable_runtime(rq);
 523        raw_spin_unlock_irqrestore(&rq->lock, flags);
 524}
 525
 526static void __enable_runtime(struct rq *rq)
 527{
 528        rt_rq_iter_t iter;
 529        struct rt_rq *rt_rq;
 530
 531        if (unlikely(!scheduler_running))
 532                return;
 533
 534        /*
 535         * Reset each runqueue's bandwidth settings
 536         */
 537        for_each_rt_rq(rt_rq, iter, rq) {
 538                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 539
 540                raw_spin_lock(&rt_b->rt_runtime_lock);
 541                raw_spin_lock(&rt_rq->rt_runtime_lock);
 542                rt_rq->rt_runtime = rt_b->rt_runtime;
 543                rt_rq->rt_time = 0;
 544                rt_rq->rt_throttled = 0;
 545                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 546                raw_spin_unlock(&rt_b->rt_runtime_lock);
 547        }
 548}
 549
 550static void enable_runtime(struct rq *rq)
 551{
 552        unsigned long flags;
 553
 554        raw_spin_lock_irqsave(&rq->lock, flags);
 555        __enable_runtime(rq);
 556        raw_spin_unlock_irqrestore(&rq->lock, flags);
 557}
 558
 559static int balance_runtime(struct rt_rq *rt_rq)
 560{
 561        int more = 0;
 562
 563        if (!sched_feat(RT_RUNTIME_SHARE))
 564                return more;
 565
 566        if (rt_rq->rt_time > rt_rq->rt_runtime) {
 567                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 568                more = do_balance_runtime(rt_rq);
 569                raw_spin_lock(&rt_rq->rt_runtime_lock);
 570        }
 571
 572        return more;
 573}
 574#else /* !CONFIG_SMP */
 575static inline int balance_runtime(struct rt_rq *rt_rq)
 576{
 577        return 0;
 578}
 579#endif /* CONFIG_SMP */
 580
 581static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 582{
 583        int i, idle = 1;
 584        const struct cpumask *span;
 585
 586        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 587                return 1;
 588
 589        span = sched_rt_period_mask();
 590        for_each_cpu(i, span) {
 591                int enqueue = 0;
 592                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 593                struct rq *rq = rq_of_rt_rq(rt_rq);
 594
 595                raw_spin_lock(&rq->lock);
 596                if (rt_rq->rt_time) {
 597                        u64 runtime;
 598
 599                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 600                        if (rt_rq->rt_throttled)
 601                                balance_runtime(rt_rq);
 602                        runtime = rt_rq->rt_runtime;
 603                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 604                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 605                                rt_rq->rt_throttled = 0;
 606                                enqueue = 1;
 607
 608                                /*
 609                                 * Force a clock update if the CPU was idle,
 610                                 * lest wakeup -> unthrottle time accumulate.
 611                                 */
 612                                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 613                                        rq->skip_clock_update = -1;
 614                        }
 615                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
 616                                idle = 0;
 617                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 618                } else if (rt_rq->rt_nr_running) {
 619                        idle = 0;
 620                        if (!rt_rq_throttled(rt_rq))
 621                                enqueue = 1;
 622                }
 623
 624                if (enqueue)
 625                        sched_rt_rq_enqueue(rt_rq);
 626                raw_spin_unlock(&rq->lock);
 627        }
 628
 629        return idle;
 630}
 631
 632static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 633{
 634#ifdef CONFIG_RT_GROUP_SCHED
 635        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 636
 637        if (rt_rq)
 638                return rt_rq->highest_prio.curr;
 639#endif
 640
 641        return rt_task_of(rt_se)->prio;
 642}
 643
 644static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 645{
 646        u64 runtime = sched_rt_runtime(rt_rq);
 647
 648        if (rt_rq->rt_throttled)
 649                return rt_rq_throttled(rt_rq);
 650
 651        if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
 652                return 0;
 653
 654        balance_runtime(rt_rq);
 655        runtime = sched_rt_runtime(rt_rq);
 656        if (runtime == RUNTIME_INF)
 657                return 0;
 658
 659        if (rt_rq->rt_time > runtime) {
 660                rt_rq->rt_throttled = 1;
 661                printk_once(KERN_WARNING "sched: RT throttling activated\n");
 662                if (rt_rq_throttled(rt_rq)) {
 663                        sched_rt_rq_dequeue(rt_rq);
 664                        return 1;
 665                }
 666        }
 667
 668        return 0;
 669}
 670
 671/*
 672 * Update the current task's runtime statistics. Skip current tasks that
 673 * are not in our scheduling class.
 674 */
 675static void update_curr_rt(struct rq *rq)
 676{
 677        struct task_struct *curr = rq->curr;
 678        struct sched_rt_entity *rt_se = &curr->rt;
 679        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 680        u64 delta_exec;
 681
 682        if (curr->sched_class != &rt_sched_class)
 683                return;
 684
 685        delta_exec = rq->clock_task - curr->se.exec_start;
 686        if (unlikely((s64)delta_exec < 0))
 687                delta_exec = 0;
 688
 689        schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
 690
 691        curr->se.sum_exec_runtime += delta_exec;
 692        account_group_exec_runtime(curr, delta_exec);
 693
 694        curr->se.exec_start = rq->clock_task;
 695        cpuacct_charge(curr, delta_exec);
 696
 697        sched_rt_avg_update(rq, delta_exec);
 698
 699        if (!rt_bandwidth_enabled())
 700                return;
 701
 702        for_each_sched_rt_entity(rt_se) {
 703                rt_rq = rt_rq_of_se(rt_se);
 704
 705                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 706                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 707                        rt_rq->rt_time += delta_exec;
 708                        if (sched_rt_runtime_exceeded(rt_rq))
 709                                resched_task(curr);
 710                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 711                }
 712        }
 713}
 714
 715#if defined CONFIG_SMP
 716
 717static void
 718inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 719{
 720        struct rq *rq = rq_of_rt_rq(rt_rq);
 721
 722        if (rq->online && prio < prev_prio)
 723                cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 724}
 725
 726static void
 727dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 728{
 729        struct rq *rq = rq_of_rt_rq(rt_rq);
 730
 731        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 732                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 733}
 734
 735#else /* CONFIG_SMP */
 736
 737static inline
 738void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 739static inline
 740void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 741
 742#endif /* CONFIG_SMP */
 743
 744#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 745static void
 746inc_rt_prio(struct rt_rq *rt_rq, int prio)
 747{
 748        int prev_prio = rt_rq->highest_prio.curr;
 749
 750        if (prio < prev_prio)
 751                rt_rq->highest_prio.curr = prio;
 752
 753        inc_rt_prio_smp(rt_rq, prio, prev_prio);
 754}
 755
 756static void
 757dec_rt_prio(struct rt_rq *rt_rq, int prio)
 758{
 759        int prev_prio = rt_rq->highest_prio.curr;
 760
 761        if (rt_rq->rt_nr_running) {
 762
 763                WARN_ON(prio < prev_prio);
 764
 765                /*
 766                 * This may have been our highest task, and therefore
 767                 * we may have some recomputation to do
 768                 */
 769                if (prio == prev_prio) {
 770                        struct rt_prio_array *array = &rt_rq->active;
 771
 772                        rt_rq->highest_prio.curr =
 773                                sched_find_first_bit(array->bitmap);
 774                }
 775
 776        } else
 777                rt_rq->highest_prio.curr = MAX_RT_PRIO;
 778
 779        dec_rt_prio_smp(rt_rq, prio, prev_prio);
 780}
 781
 782#else
 783
 784static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
 785static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
 786
 787#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 788
 789#ifdef CONFIG_RT_GROUP_SCHED
 790
 791static void
 792inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 793{
 794        if (rt_se_boosted(rt_se))
 795                rt_rq->rt_nr_boosted++;
 796
 797        if (rt_rq->tg)
 798                start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
 799}
 800
 801static void
 802dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 803{
 804        if (rt_se_boosted(rt_se))
 805                rt_rq->rt_nr_boosted--;
 806
 807        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
 808}
 809
 810#else /* CONFIG_RT_GROUP_SCHED */
 811
 812static void
 813inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 814{
 815        start_rt_bandwidth(&def_rt_bandwidth);
 816}
 817
 818static inline
 819void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 820
 821#endif /* CONFIG_RT_GROUP_SCHED */
 822
 823static inline
 824void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 825{
 826        int prio = rt_se_prio(rt_se);
 827
 828        WARN_ON(!rt_prio(prio));
 829        rt_rq->rt_nr_running++;
 830
 831        inc_rt_prio(rt_rq, prio);
 832        inc_rt_migration(rt_se, rt_rq);
 833        inc_rt_group(rt_se, rt_rq);
 834}
 835
 836static inline
 837void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 838{
 839        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 840        WARN_ON(!rt_rq->rt_nr_running);
 841        rt_rq->rt_nr_running--;
 842
 843        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
 844        dec_rt_migration(rt_se, rt_rq);
 845        dec_rt_group(rt_se, rt_rq);
 846}
 847
 848static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 849{
 850        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 851        struct rt_prio_array *array = &rt_rq->active;
 852        struct rt_rq *group_rq = group_rt_rq(rt_se);
 853        struct list_head *queue = array->queue + rt_se_prio(rt_se);
 854
 855        /*
 856         * Don't enqueue the group if its throttled, or when empty.
 857         * The latter is a consequence of the former when a child group
 858         * get throttled and the current group doesn't have any other
 859         * active members.
 860         */
 861        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 862                return;
 863
 864        if (!rt_rq->rt_nr_running)
 865                list_add_leaf_rt_rq(rt_rq);
 866
 867        if (head)
 868                list_add(&rt_se->run_list, queue);
 869        else
 870                list_add_tail(&rt_se->run_list, queue);
 871        __set_bit(rt_se_prio(rt_se), array->bitmap);
 872
 873        inc_rt_tasks(rt_se, rt_rq);
 874}
 875
 876static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 877{
 878        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 879        struct rt_prio_array *array = &rt_rq->active;
 880
 881        list_del_init(&rt_se->run_list);
 882        if (list_empty(array->queue + rt_se_prio(rt_se)))
 883                __clear_bit(rt_se_prio(rt_se), array->bitmap);
 884
 885        dec_rt_tasks(rt_se, rt_rq);
 886        if (!rt_rq->rt_nr_running)
 887                list_del_leaf_rt_rq(rt_rq);
 888}
 889
 890/*
 891 * Because the prio of an upper entry depends on the lower
 892 * entries, we must remove entries top - down.
 893 */
 894static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 895{
 896        struct sched_rt_entity *back = NULL;
 897
 898        for_each_sched_rt_entity(rt_se) {
 899                rt_se->back = back;
 900                back = rt_se;
 901        }
 902
 903        for (rt_se = back; rt_se; rt_se = rt_se->back) {
 904                if (on_rt_rq(rt_se))
 905                        __dequeue_rt_entity(rt_se);
 906        }
 907}
 908
 909static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 910{
 911        dequeue_rt_stack(rt_se);
 912        for_each_sched_rt_entity(rt_se)
 913                __enqueue_rt_entity(rt_se, head);
 914}
 915
 916static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 917{
 918        dequeue_rt_stack(rt_se);
 919
 920        for_each_sched_rt_entity(rt_se) {
 921                struct rt_rq *rt_rq = group_rt_rq(rt_se);
 922
 923                if (rt_rq && rt_rq->rt_nr_running)
 924                        __enqueue_rt_entity(rt_se, false);
 925        }
 926}
 927
 928/*
 929 * Adding/removing a task to/from a priority array:
 930 */
 931static void
 932enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 933{
 934        struct sched_rt_entity *rt_se = &p->rt;
 935
 936        if (flags & ENQUEUE_WAKEUP)
 937                rt_se->timeout = 0;
 938
 939        enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 940
 941        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 942                enqueue_pushable_task(rq, p);
 943
 944        inc_nr_running(rq);
 945}
 946
 947static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 948{
 949        struct sched_rt_entity *rt_se = &p->rt;
 950
 951        update_curr_rt(rq);
 952        dequeue_rt_entity(rt_se);
 953
 954        dequeue_pushable_task(rq, p);
 955
 956        dec_nr_running(rq);
 957}
 958
 959/*
 960 * Put task to the end of the run list without the overhead of dequeue
 961 * followed by enqueue.
 962 */
 963static void
 964requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 965{
 966        if (on_rt_rq(rt_se)) {
 967                struct rt_prio_array *array = &rt_rq->active;
 968                struct list_head *queue = array->queue + rt_se_prio(rt_se);
 969
 970                if (head)
 971                        list_move(&rt_se->run_list, queue);
 972                else
 973                        list_move_tail(&rt_se->run_list, queue);
 974        }
 975}
 976
 977static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 978{
 979        struct sched_rt_entity *rt_se = &p->rt;
 980        struct rt_rq *rt_rq;
 981
 982        for_each_sched_rt_entity(rt_se) {
 983                rt_rq = rt_rq_of_se(rt_se);
 984                requeue_rt_entity(rt_rq, rt_se, head);
 985        }
 986}
 987
 988static void yield_task_rt(struct rq *rq)
 989{
 990        requeue_task_rt(rq, rq->curr, 0);
 991}
 992
 993#ifdef CONFIG_SMP
 994static int find_lowest_rq(struct task_struct *task);
 995
 996static int
 997select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 998{
 999        struct task_struct *curr;
1000        struct rq *rq;
1001        int cpu;
1002
1003        cpu = task_cpu(p);
1004
1005        /* For anything but wake ups, just return the task_cpu */
1006        if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1007                goto out;
1008
1009        rq = cpu_rq(cpu);
1010
1011        rcu_read_lock();
1012        curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1013
1014        /*
1015         * If the current task on @p's runqueue is an RT task, then
1016         * try to see if we can wake this RT task up on another
1017         * runqueue. Otherwise simply start this RT task
1018         * on its current runqueue.
1019         *
1020         * We want to avoid overloading runqueues. If the woken
1021         * task is a higher priority, then it will stay on this CPU
1022         * and the lower prio task should be moved to another CPU.
1023         * Even though this will probably make the lower prio task
1024         * lose its cache, we do not want to bounce a higher task
1025         * around just because it gave up its CPU, perhaps for a
1026         * lock?
1027         *
1028         * For equal prio tasks, we just let the scheduler sort it out.
1029         *
1030         * Otherwise, just let it ride on the affined RQ and the
1031         * post-schedule router will push the preempted task away
1032         *
1033         * This test is optimistic, if we get it wrong the load-balancer
1034         * will have to sort it out.
1035         */
1036        if (curr && unlikely(rt_task(curr)) &&
1037            (curr->rt.nr_cpus_allowed < 2 ||
1038             curr->prio <= p->prio) &&
1039            (p->rt.nr_cpus_allowed > 1)) {
1040                int target = find_lowest_rq(p);
1041
1042                if (target != -1)
1043                        cpu = target;
1044        }
1045        rcu_read_unlock();
1046
1047out:
1048        return cpu;
1049}
1050
1051static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1052{
1053        if (rq->curr->rt.nr_cpus_allowed == 1)
1054                return;
1055
1056        if (p->rt.nr_cpus_allowed != 1
1057            && cpupri_find(&rq->rd->cpupri, p, NULL))
1058                return;
1059
1060        if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1061                return;
1062
1063        /*
1064         * There appears to be other cpus that can accept
1065         * current and none to run 'p', so lets reschedule
1066         * to try and push current away:
1067         */
1068        requeue_task_rt(rq, p, 1);
1069        resched_task(rq->curr);
1070}
1071
1072#endif /* CONFIG_SMP */
1073
1074/*
1075 * Preempt the current task with a newly woken task if needed:
1076 */
1077static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1078{
1079        if (p->prio < rq->curr->prio) {
1080                resched_task(rq->curr);
1081                return;
1082        }
1083
1084#ifdef CONFIG_SMP
1085        /*
1086         * If:
1087         *
1088         * - the newly woken task is of equal priority to the current task
1089         * - the newly woken task is non-migratable while current is migratable
1090         * - current will be preempted on the next reschedule
1091         *
1092         * we should check to see if current can readily move to a different
1093         * cpu.  If so, we will reschedule to allow the push logic to try
1094         * to move current somewhere else, making room for our non-migratable
1095         * task.
1096         */
1097        if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1098                check_preempt_equal_prio(rq, p);
1099#endif
1100}
1101
1102static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1103                                                   struct rt_rq *rt_rq)
1104{
1105        struct rt_prio_array *array = &rt_rq->active;
1106        struct sched_rt_entity *next = NULL;
1107        struct list_head *queue;
1108        int idx;
1109
1110        idx = sched_find_first_bit(array->bitmap);
1111        BUG_ON(idx >= MAX_RT_PRIO);
1112
1113        queue = array->queue + idx;
1114        next = list_entry(queue->next, struct sched_rt_entity, run_list);
1115
1116        return next;
1117}
1118
1119static struct task_struct *_pick_next_task_rt(struct rq *rq)
1120{
1121        struct sched_rt_entity *rt_se;
1122        struct task_struct *p;
1123        struct rt_rq *rt_rq;
1124
1125        rt_rq = &rq->rt;
1126
1127        if (!rt_rq->rt_nr_running)
1128                return NULL;
1129
1130        if (rt_rq_throttled(rt_rq))
1131                return NULL;
1132
1133        do {
1134                rt_se = pick_next_rt_entity(rq, rt_rq);
1135                BUG_ON(!rt_se);
1136                rt_rq = group_rt_rq(rt_se);
1137        } while (rt_rq);
1138
1139        p = rt_task_of(rt_se);
1140        p->se.exec_start = rq->clock_task;
1141
1142        return p;
1143}
1144
1145static struct task_struct *pick_next_task_rt(struct rq *rq)
1146{
1147        struct task_struct *p = _pick_next_task_rt(rq);
1148
1149        /* The running task is never eligible for pushing */
1150        if (p)
1151                dequeue_pushable_task(rq, p);
1152
1153#ifdef CONFIG_SMP
1154        /*
1155         * We detect this state here so that we can avoid taking the RQ
1156         * lock again later if there is no need to push
1157         */
1158        rq->post_schedule = has_pushable_tasks(rq);
1159#endif
1160
1161        return p;
1162}
1163
1164static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1165{
1166        update_curr_rt(rq);
1167
1168        /*
1169         * The previous task needs to be made eligible for pushing
1170         * if it is still active
1171         */
1172        if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1173                enqueue_pushable_task(rq, p);
1174}
1175
1176#ifdef CONFIG_SMP
1177
1178/* Only try algorithms three times */
1179#define RT_MAX_TRIES 3
1180
1181static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1182
1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1184{
1185        if (!task_running(rq, p) &&
1186            (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1187            (p->rt.nr_cpus_allowed > 1))
1188                return 1;
1189        return 0;
1190}
1191
1192/* Return the second highest RT task, NULL otherwise */
1193static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1194{
1195        struct task_struct *next = NULL;
1196        struct sched_rt_entity *rt_se;
1197        struct rt_prio_array *array;
1198        struct rt_rq *rt_rq;
1199        int idx;
1200
1201        for_each_leaf_rt_rq(rt_rq, rq) {
1202                array = &rt_rq->active;
1203                idx = sched_find_first_bit(array->bitmap);
1204next_idx:
1205                if (idx >= MAX_RT_PRIO)
1206                        continue;
1207                if (next && next->prio < idx)
1208                        continue;
1209                list_for_each_entry(rt_se, array->queue + idx, run_list) {
1210                        struct task_struct *p;
1211
1212                        if (!rt_entity_is_task(rt_se))
1213                                continue;
1214
1215                        p = rt_task_of(rt_se);
1216                        if (pick_rt_task(rq, p, cpu)) {
1217                                next = p;
1218                                break;
1219                        }
1220                }
1221                if (!next) {
1222                        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1223                        goto next_idx;
1224                }
1225        }
1226
1227        return next;
1228}
1229
1230static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1231
1232static int find_lowest_rq(struct task_struct *task)
1233{
1234        struct sched_domain *sd;
1235        struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1236        int this_cpu = smp_processor_id();
1237        int cpu      = task_cpu(task);
1238
1239        /* Make sure the mask is initialized first */
1240        if (unlikely(!lowest_mask))
1241                return -1;
1242
1243        if (task->rt.nr_cpus_allowed == 1)
1244                return -1; /* No other targets possible */
1245
1246        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1247                return -1; /* No targets found */
1248
1249        /*
1250         * At this point we have built a mask of cpus representing the
1251         * lowest priority tasks in the system.  Now we want to elect
1252         * the best one based on our affinity and topology.
1253         *
1254         * We prioritize the last cpu that the task executed on since
1255         * it is most likely cache-hot in that location.
1256         */
1257        if (cpumask_test_cpu(cpu, lowest_mask))
1258                return cpu;
1259
1260        /*
1261         * Otherwise, we consult the sched_domains span maps to figure
1262         * out which cpu is logically closest to our hot cache data.
1263         */
1264        if (!cpumask_test_cpu(this_cpu, lowest_mask))
1265                this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1266
1267        rcu_read_lock();
1268        for_each_domain(cpu, sd) {
1269                if (sd->flags & SD_WAKE_AFFINE) {
1270                        int best_cpu;
1271
1272                        /*
1273                         * "this_cpu" is cheaper to preempt than a
1274                         * remote processor.
1275                         */
1276                        if (this_cpu != -1 &&
1277                            cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1278                                rcu_read_unlock();
1279                                return this_cpu;
1280                        }
1281
1282                        best_cpu = cpumask_first_and(lowest_mask,
1283                                                     sched_domain_span(sd));
1284                        if (best_cpu < nr_cpu_ids) {
1285                                rcu_read_unlock();
1286                                return best_cpu;
1287                        }
1288                }
1289        }
1290        rcu_read_unlock();
1291
1292        /*
1293         * And finally, if there were no matches within the domains
1294         * just give the caller *something* to work with from the compatible
1295         * locations.
1296         */
1297        if (this_cpu != -1)
1298                return this_cpu;
1299
1300        cpu = cpumask_any(lowest_mask);
1301        if (cpu < nr_cpu_ids)
1302                return cpu;
1303        return -1;
1304}
1305
1306/* Will lock the rq it finds */
1307static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1308{
1309        struct rq *lowest_rq = NULL;
1310        int tries;
1311        int cpu;
1312
1313        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1314                cpu = find_lowest_rq(task);
1315
1316                if ((cpu == -1) || (cpu == rq->cpu))
1317                        break;
1318
1319                lowest_rq = cpu_rq(cpu);
1320
1321                /* if the prio of this runqueue changed, try again */
1322                if (double_lock_balance(rq, lowest_rq)) {
1323                        /*
1324                         * We had to unlock the run queue. In
1325                         * the mean time, task could have
1326                         * migrated already or had its affinity changed.
1327                         * Also make sure that it wasn't scheduled on its rq.
1328                         */
1329                        if (unlikely(task_rq(task) != rq ||
1330                                     !cpumask_test_cpu(lowest_rq->cpu,
1331                                                       tsk_cpus_allowed(task)) ||
1332                                     task_running(rq, task) ||
1333                                     !task->on_rq)) {
1334
1335                                raw_spin_unlock(&lowest_rq->lock);
1336                                lowest_rq = NULL;
1337                                break;
1338                        }
1339                }
1340
1341                /* If this rq is still suitable use it. */
1342                if (lowest_rq->rt.highest_prio.curr > task->prio)
1343                        break;
1344
1345                /* try again */
1346                double_unlock_balance(rq, lowest_rq);
1347                lowest_rq = NULL;
1348        }
1349
1350        return lowest_rq;
1351}
1352
1353static struct task_struct *pick_next_pushable_task(struct rq *rq)
1354{
1355        struct task_struct *p;
1356
1357        if (!has_pushable_tasks(rq))
1358                return NULL;
1359
1360        p = plist_first_entry(&rq->rt.pushable_tasks,
1361                              struct task_struct, pushable_tasks);
1362
1363        BUG_ON(rq->cpu != task_cpu(p));
1364        BUG_ON(task_current(rq, p));
1365        BUG_ON(p->rt.nr_cpus_allowed <= 1);
1366
1367        BUG_ON(!p->on_rq);
1368        BUG_ON(!rt_task(p));
1369
1370        return p;
1371}
1372
1373/*
1374 * If the current CPU has more than one RT task, see if the non
1375 * running task can migrate over to a CPU that is running a task
1376 * of lesser priority.
1377 */
1378static int push_rt_task(struct rq *rq)
1379{
1380        struct task_struct *next_task;
1381        struct rq *lowest_rq;
1382        int ret = 0;
1383
1384        if (!rq->rt.overloaded)
1385                return 0;
1386
1387        next_task = pick_next_pushable_task(rq);
1388        if (!next_task)
1389                return 0;
1390
1391retry:
1392        if (unlikely(next_task == rq->curr)) {
1393                WARN_ON(1);
1394                return 0;
1395        }
1396
1397        /*
1398         * It's possible that the next_task slipped in of
1399         * higher priority than current. If that's the case
1400         * just reschedule current.
1401         */
1402        if (unlikely(next_task->prio < rq->curr->prio)) {
1403                resched_task(rq->curr);
1404                return 0;
1405        }
1406
1407        /* We might release rq lock */
1408        get_task_struct(next_task);
1409
1410        /* find_lock_lowest_rq locks the rq if found */
1411        lowest_rq = find_lock_lowest_rq(next_task, rq);
1412        if (!lowest_rq) {
1413                struct task_struct *task;
1414                /*
1415                 * find_lock_lowest_rq releases rq->lock
1416                 * so it is possible that next_task has migrated.
1417                 *
1418                 * We need to make sure that the task is still on the same
1419                 * run-queue and is also still the next task eligible for
1420                 * pushing.
1421                 */
1422                task = pick_next_pushable_task(rq);
1423                if (task_cpu(next_task) == rq->cpu && task == next_task) {
1424                        /*
1425                         * The task hasn't migrated, and is still the next
1426                         * eligible task, but we failed to find a run-queue
1427                         * to push it to.  Do not retry in this case, since
1428                         * other cpus will pull from us when ready.
1429                         */
1430                        goto out;
1431                }
1432
1433                if (!task)
1434                        /* No more tasks, just exit */
1435                        goto out;
1436
1437                /*
1438                 * Something has shifted, try again.
1439                 */
1440                put_task_struct(next_task);
1441                next_task = task;
1442                goto retry;
1443        }
1444
1445        deactivate_task(rq, next_task, 0);
1446        set_task_cpu(next_task, lowest_rq->cpu);
1447        activate_task(lowest_rq, next_task, 0);
1448        ret = 1;
1449
1450        resched_task(lowest_rq->curr);
1451
1452        double_unlock_balance(rq, lowest_rq);
1453
1454out:
1455        put_task_struct(next_task);
1456
1457        return ret;
1458}
1459
1460static void push_rt_tasks(struct rq *rq)
1461{
1462        /* push_rt_task will return true if it moved an RT */
1463        while (push_rt_task(rq))
1464                ;
1465}
1466
1467static int pull_rt_task(struct rq *this_rq)
1468{
1469        int this_cpu = this_rq->cpu, ret = 0, cpu;
1470        struct task_struct *p;
1471        struct rq *src_rq;
1472
1473        if (likely(!rt_overloaded(this_rq)))
1474                return 0;
1475
1476        for_each_cpu(cpu, this_rq->rd->rto_mask) {
1477                if (this_cpu == cpu)
1478                        continue;
1479
1480                src_rq = cpu_rq(cpu);
1481
1482                /*
1483                 * Don't bother taking the src_rq->lock if the next highest
1484                 * task is known to be lower-priority than our current task.
1485                 * This may look racy, but if this value is about to go
1486                 * logically higher, the src_rq will push this task away.
1487                 * And if its going logically lower, we do not care
1488                 */
1489                if (src_rq->rt.highest_prio.next >=
1490                    this_rq->rt.highest_prio.curr)
1491                        continue;
1492
1493                /*
1494                 * We can potentially drop this_rq's lock in
1495                 * double_lock_balance, and another CPU could
1496                 * alter this_rq
1497                 */
1498                double_lock_balance(this_rq, src_rq);
1499
1500                /*
1501                 * Are there still pullable RT tasks?
1502                 */
1503                if (src_rq->rt.rt_nr_running <= 1)
1504                        goto skip;
1505
1506                p = pick_next_highest_task_rt(src_rq, this_cpu);
1507
1508                /*
1509                 * Do we have an RT task that preempts
1510                 * the to-be-scheduled task?
1511                 */
1512                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1513                        WARN_ON(p == src_rq->curr);
1514                        WARN_ON(!p->on_rq);
1515
1516                        /*
1517                         * There's a chance that p is higher in priority
1518                         * than what's currently running on its cpu.
1519                         * This is just that p is wakeing up and hasn't
1520                         * had a chance to schedule. We only pull
1521                         * p if it is lower in priority than the
1522                         * current task on the run queue
1523                         */
1524                        if (p->prio < src_rq->curr->prio)
1525                                goto skip;
1526
1527                        ret = 1;
1528
1529                        deactivate_task(src_rq, p, 0);
1530                        set_task_cpu(p, this_cpu);
1531                        activate_task(this_rq, p, 0);
1532                        /*
1533                         * We continue with the search, just in
1534                         * case there's an even higher prio task
1535                         * in another runqueue. (low likelihood
1536                         * but possible)
1537                         */
1538                }
1539skip:
1540                double_unlock_balance(this_rq, src_rq);
1541        }
1542
1543        return ret;
1544}
1545
1546static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1547{
1548        /* Try to pull RT tasks here if we lower this rq's prio */
1549        if (rq->rt.highest_prio.curr > prev->prio)
1550                pull_rt_task(rq);
1551}
1552
1553static void post_schedule_rt(struct rq *rq)
1554{
1555        push_rt_tasks(rq);
1556}
1557
1558/*
1559 * If we are not running and we are not going to reschedule soon, we should
1560 * try to push tasks away now
1561 */
1562static void task_woken_rt(struct rq *rq, struct task_struct *p)
1563{
1564        if (!task_running(rq, p) &&
1565            !test_tsk_need_resched(rq->curr) &&
1566            has_pushable_tasks(rq) &&
1567            p->rt.nr_cpus_allowed > 1 &&
1568            rt_task(rq->curr) &&
1569            (rq->curr->rt.nr_cpus_allowed < 2 ||
1570             rq->curr->prio <= p->prio))
1571                push_rt_tasks(rq);
1572}
1573
1574static void set_cpus_allowed_rt(struct task_struct *p,
1575                                const struct cpumask *new_mask)
1576{
1577        int weight = cpumask_weight(new_mask);
1578
1579        BUG_ON(!rt_task(p));
1580
1581        /*
1582         * Update the migration status of the RQ if we have an RT task
1583         * which is running AND changing its weight value.
1584         */
1585        if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1586                struct rq *rq = task_rq(p);
1587
1588                if (!task_current(rq, p)) {
1589                        /*
1590                         * Make sure we dequeue this task from the pushable list
1591                         * before going further.  It will either remain off of
1592                         * the list because we are no longer pushable, or it
1593                         * will be requeued.
1594                         */
1595                        if (p->rt.nr_cpus_allowed > 1)
1596                                dequeue_pushable_task(rq, p);
1597
1598                        /*
1599                         * Requeue if our weight is changing and still > 1
1600                         */
1601                        if (weight > 1)
1602                                enqueue_pushable_task(rq, p);
1603
1604                }
1605
1606                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1607                        rq->rt.rt_nr_migratory++;
1608                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1609                        BUG_ON(!rq->rt.rt_nr_migratory);
1610                        rq->rt.rt_nr_migratory--;
1611                }
1612
1613                update_rt_migration(&rq->rt);
1614        }
1615}
1616
1617/* Assumes rq->lock is held */
1618static void rq_online_rt(struct rq *rq)
1619{
1620        if (rq->rt.overloaded)
1621                rt_set_overload(rq);
1622
1623        __enable_runtime(rq);
1624
1625        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1626}
1627
1628/* Assumes rq->lock is held */
1629static void rq_offline_rt(struct rq *rq)
1630{
1631        if (rq->rt.overloaded)
1632                rt_clear_overload(rq);
1633
1634        __disable_runtime(rq);
1635
1636        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1637}
1638
1639/*
1640 * When switch from the rt queue, we bring ourselves to a position
1641 * that we might want to pull RT tasks from other runqueues.
1642 */
1643static void switched_from_rt(struct rq *rq, struct task_struct *p)
1644{
1645        /*
1646         * If there are other RT tasks then we will reschedule
1647         * and the scheduling of the other RT tasks will handle
1648         * the balancing. But if we are the last RT task
1649         * we may need to handle the pulling of RT tasks
1650         * now.
1651         */
1652        if (p->on_rq && !rq->rt.rt_nr_running)
1653                pull_rt_task(rq);
1654}
1655
1656static inline void init_sched_rt_class(void)
1657{
1658        unsigned int i;
1659
1660        for_each_possible_cpu(i)
1661                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1662                                        GFP_KERNEL, cpu_to_node(i));
1663}
1664#endif /* CONFIG_SMP */
1665
1666/*
1667 * When switching a task to RT, we may overload the runqueue
1668 * with RT tasks. In this case we try to push them off to
1669 * other runqueues.
1670 */
1671static void switched_to_rt(struct rq *rq, struct task_struct *p)
1672{
1673        int check_resched = 1;
1674
1675        /*
1676         * If we are already running, then there's nothing
1677         * that needs to be done. But if we are not running
1678         * we may need to preempt the current running task.
1679         * If that current running task is also an RT task
1680         * then see if we can move to another run queue.
1681         */
1682        if (p->on_rq && rq->curr != p) {
1683#ifdef CONFIG_SMP
1684                if (rq->rt.overloaded && push_rt_task(rq) &&
1685                    /* Don't resched if we changed runqueues */
1686                    rq != task_rq(p))
1687                        check_resched = 0;
1688#endif /* CONFIG_SMP */
1689                if (check_resched && p->prio < rq->curr->prio)
1690                        resched_task(rq->curr);
1691        }
1692}
1693
1694/*
1695 * Priority of the task has changed. This may cause
1696 * us to initiate a push or pull.
1697 */
1698static void
1699prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1700{
1701        if (!p->on_rq)
1702                return;
1703
1704        if (rq->curr == p) {
1705#ifdef CONFIG_SMP
1706                /*
1707                 * If our priority decreases while running, we
1708                 * may need to pull tasks to this runqueue.
1709                 */
1710                if (oldprio < p->prio)
1711                        pull_rt_task(rq);
1712                /*
1713                 * If there's a higher priority task waiting to run
1714                 * then reschedule. Note, the above pull_rt_task
1715                 * can release the rq lock and p could migrate.
1716                 * Only reschedule if p is still on the same runqueue.
1717                 */
1718                if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1719                        resched_task(p);
1720#else
1721                /* For UP simply resched on drop of prio */
1722                if (oldprio < p->prio)
1723                        resched_task(p);
1724#endif /* CONFIG_SMP */
1725        } else {
1726                /*
1727                 * This task is not running, but if it is
1728                 * greater than the current running task
1729                 * then reschedule.
1730                 */
1731                if (p->prio < rq->curr->prio)
1732                        resched_task(rq->curr);
1733        }
1734}
1735
1736static void watchdog(struct rq *rq, struct task_struct *p)
1737{
1738        unsigned long soft, hard;
1739
1740        /* max may change after cur was read, this will be fixed next tick */
1741        soft = task_rlimit(p, RLIMIT_RTTIME);
1742        hard = task_rlimit_max(p, RLIMIT_RTTIME);
1743
1744        if (soft != RLIM_INFINITY) {
1745                unsigned long next;
1746
1747                p->rt.timeout++;
1748                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1749                if (p->rt.timeout > next)
1750                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1751        }
1752}
1753
1754static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1755{
1756        update_curr_rt(rq);
1757
1758        watchdog(rq, p);
1759
1760        /*
1761         * RR tasks need a special form of timeslice management.
1762         * FIFO tasks have no timeslices.
1763         */
1764        if (p->policy != SCHED_RR)
1765                return;
1766
1767        if (--p->rt.time_slice)
1768                return;
1769
1770        p->rt.time_slice = DEF_TIMESLICE;
1771
1772        /*
1773         * Requeue to the end of queue if we are not the only element
1774         * on the queue:
1775         */
1776        if (p->rt.run_list.prev != p->rt.run_list.next) {
1777                requeue_task_rt(rq, p, 0);
1778                set_tsk_need_resched(p);
1779        }
1780}
1781
1782static void set_curr_task_rt(struct rq *rq)
1783{
1784        struct task_struct *p = rq->curr;
1785
1786        p->se.exec_start = rq->clock_task;
1787
1788        /* The running task is never eligible for pushing */
1789        dequeue_pushable_task(rq, p);
1790}
1791
1792static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1793{
1794        /*
1795         * Time slice is 0 for SCHED_FIFO tasks
1796         */
1797        if (task->policy == SCHED_RR)
1798                return DEF_TIMESLICE;
1799        else
1800                return 0;
1801}
1802
1803static const struct sched_class rt_sched_class = {
1804        .next                   = &fair_sched_class,
1805        .enqueue_task           = enqueue_task_rt,
1806        .dequeue_task           = dequeue_task_rt,
1807        .yield_task             = yield_task_rt,
1808
1809        .check_preempt_curr     = check_preempt_curr_rt,
1810
1811        .pick_next_task         = pick_next_task_rt,
1812        .put_prev_task          = put_prev_task_rt,
1813
1814#ifdef CONFIG_SMP
1815        .select_task_rq         = select_task_rq_rt,
1816
1817        .set_cpus_allowed       = set_cpus_allowed_rt,
1818        .rq_online              = rq_online_rt,
1819        .rq_offline             = rq_offline_rt,
1820        .pre_schedule           = pre_schedule_rt,
1821        .post_schedule          = post_schedule_rt,
1822        .task_woken             = task_woken_rt,
1823        .switched_from          = switched_from_rt,
1824#endif
1825
1826        .set_curr_task          = set_curr_task_rt,
1827        .task_tick              = task_tick_rt,
1828
1829        .get_rr_interval        = get_rr_interval_rt,
1830
1831        .prio_changed           = prio_changed_rt,
1832        .switched_to            = switched_to_rt,
1833};
1834
1835#ifdef CONFIG_SCHED_DEBUG
1836extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1837
1838static void print_rt_stats(struct seq_file *m, int cpu)
1839{
1840        rt_rq_iter_t iter;
1841        struct rt_rq *rt_rq;
1842
1843        rcu_read_lock();
1844        for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1845                print_rt_rq(m, cpu, rt_rq);
1846        rcu_read_unlock();
1847}
1848#endif /* CONFIG_SCHED_DEBUG */
1849
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.