linux/kernel/sched_rt.c
<<
>>
Prefs
   1/*
   2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   3 * policies)
   4 */
   5
   6#ifdef CONFIG_SMP
   7
   8static inline int rt_overloaded(struct rq *rq)
   9{
  10        return atomic_read(&rq->rd->rto_count);
  11}
  12
  13static inline void rt_set_overload(struct rq *rq)
  14{
  15        cpu_set(rq->cpu, rq->rd->rto_mask);
  16        /*
  17         * Make sure the mask is visible before we set
  18         * the overload count. That is checked to determine
  19         * if we should look at the mask. It would be a shame
  20         * if we looked at the mask, but the mask was not
  21         * updated yet.
  22         */
  23        wmb();
  24        atomic_inc(&rq->rd->rto_count);
  25}
  26
  27static inline void rt_clear_overload(struct rq *rq)
  28{
  29        /* the order here really doesn't matter */
  30        atomic_dec(&rq->rd->rto_count);
  31        cpu_clear(rq->cpu, rq->rd->rto_mask);
  32}
  33
  34static void update_rt_migration(struct rq *rq)
  35{
  36        if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
  37                if (!rq->rt.overloaded) {
  38                        rt_set_overload(rq);
  39                        rq->rt.overloaded = 1;
  40                }
  41        } else if (rq->rt.overloaded) {
  42                rt_clear_overload(rq);
  43                rq->rt.overloaded = 0;
  44        }
  45}
  46#endif /* CONFIG_SMP */
  47
  48static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  49{
  50        return container_of(rt_se, struct task_struct, rt);
  51}
  52
  53static inline int on_rt_rq(struct sched_rt_entity *rt_se)
  54{
  55        return !list_empty(&rt_se->run_list);
  56}
  57
  58#ifdef CONFIG_RT_GROUP_SCHED
  59
  60static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
  61{
  62        if (!rt_rq->tg)
  63                return RUNTIME_INF;
  64
  65        return rt_rq->tg->rt_runtime;
  66}
  67
  68#define for_each_leaf_rt_rq(rt_rq, rq) \
  69        list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
  70
  71static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  72{
  73        return rt_rq->rq;
  74}
  75
  76static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  77{
  78        return rt_se->rt_rq;
  79}
  80
  81#define for_each_sched_rt_entity(rt_se) \
  82        for (; rt_se; rt_se = rt_se->parent)
  83
  84static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
  85{
  86        return rt_se->my_q;
  87}
  88
  89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
  90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
  91
  92static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
  93{
  94        struct sched_rt_entity *rt_se = rt_rq->rt_se;
  95
  96        if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
  97                struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
  98
  99                enqueue_rt_entity(rt_se);
 100                if (rt_rq->highest_prio < curr->prio)
 101                        resched_task(curr);
 102        }
 103}
 104
 105static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 106{
 107        struct sched_rt_entity *rt_se = rt_rq->rt_se;
 108
 109        if (rt_se && on_rt_rq(rt_se))
 110                dequeue_rt_entity(rt_se);
 111}
 112
 113static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 114{
 115        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 116}
 117
 118static int rt_se_boosted(struct sched_rt_entity *rt_se)
 119{
 120        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 121        struct task_struct *p;
 122
 123        if (rt_rq)
 124                return !!rt_rq->rt_nr_boosted;
 125
 126        p = rt_task_of(rt_se);
 127        return p->prio != p->normal_prio;
 128}
 129
 130#else
 131
 132static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 133{
 134        if (sysctl_sched_rt_runtime == -1)
 135                return RUNTIME_INF;
 136
 137        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 138}
 139
 140#define for_each_leaf_rt_rq(rt_rq, rq) \
 141        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 142
 143static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 144{
 145        return container_of(rt_rq, struct rq, rt);
 146}
 147
 148static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 149{
 150        struct task_struct *p = rt_task_of(rt_se);
 151        struct rq *rq = task_rq(p);
 152
 153        return &rq->rt;
 154}
 155
 156#define for_each_sched_rt_entity(rt_se) \
 157        for (; rt_se; rt_se = NULL)
 158
 159static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 160{
 161        return NULL;
 162}
 163
 164static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 165{
 166}
 167
 168static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 169{
 170}
 171
 172static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 173{
 174        return rt_rq->rt_throttled;
 175}
 176#endif
 177
 178static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 179{
 180#ifdef CONFIG_RT_GROUP_SCHED
 181        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 182
 183        if (rt_rq)
 184                return rt_rq->highest_prio;
 185#endif
 186
 187        return rt_task_of(rt_se)->prio;
 188}
 189
 190static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 191{
 192        u64 runtime = sched_rt_runtime(rt_rq);
 193
 194        if (runtime == RUNTIME_INF)
 195                return 0;
 196
 197        if (rt_rq->rt_throttled)
 198                return rt_rq_throttled(rt_rq);
 199
 200        if (rt_rq->rt_time > runtime) {
 201                struct rq *rq = rq_of_rt_rq(rt_rq);
 202
 203                rq->rt_throttled = 1;
 204                rt_rq->rt_throttled = 1;
 205
 206                if (rt_rq_throttled(rt_rq)) {
 207                        sched_rt_rq_dequeue(rt_rq);
 208                        return 1;
 209                }
 210        }
 211
 212        return 0;
 213}
 214
 215static void update_sched_rt_period(struct rq *rq)
 216{
 217        struct rt_rq *rt_rq;
 218        u64 period;
 219
 220        while (rq->clock > rq->rt_period_expire) {
 221                period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 222                rq->rt_period_expire += period;
 223
 224                for_each_leaf_rt_rq(rt_rq, rq) {
 225                        u64 runtime = sched_rt_runtime(rt_rq);
 226
 227                        rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
 228                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 229                                rt_rq->rt_throttled = 0;
 230                                sched_rt_rq_enqueue(rt_rq);
 231                        }
 232                }
 233
 234                rq->rt_throttled = 0;
 235        }
 236}
 237
 238/*
 239 * Update the current task's runtime statistics. Skip current tasks that
 240 * are not in our scheduling class.
 241 */
 242static void update_curr_rt(struct rq *rq)
 243{
 244        struct task_struct *curr = rq->curr;
 245        struct sched_rt_entity *rt_se = &curr->rt;
 246        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 247        u64 delta_exec;
 248
 249        if (!task_has_rt_policy(curr))
 250                return;
 251
 252        delta_exec = rq->clock - curr->se.exec_start;
 253        if (unlikely((s64)delta_exec < 0))
 254                delta_exec = 0;
 255
 256        schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
 257
 258        curr->se.sum_exec_runtime += delta_exec;
 259        curr->se.exec_start = rq->clock;
 260        cpuacct_charge(curr, delta_exec);
 261
 262        rt_rq->rt_time += delta_exec;
 263        if (sched_rt_runtime_exceeded(rt_rq))
 264                resched_task(curr);
 265}
 266
 267static inline
 268void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 269{
 270        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 271        rt_rq->rt_nr_running++;
 272#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 273        if (rt_se_prio(rt_se) < rt_rq->highest_prio)
 274                rt_rq->highest_prio = rt_se_prio(rt_se);
 275#endif
 276#ifdef CONFIG_SMP
 277        if (rt_se->nr_cpus_allowed > 1) {
 278                struct rq *rq = rq_of_rt_rq(rt_rq);
 279                rq->rt.rt_nr_migratory++;
 280        }
 281
 282        update_rt_migration(rq_of_rt_rq(rt_rq));
 283#endif
 284#ifdef CONFIG_RT_GROUP_SCHED
 285        if (rt_se_boosted(rt_se))
 286                rt_rq->rt_nr_boosted++;
 287#endif
 288}
 289
 290static inline
 291void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 292{
 293        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 294        WARN_ON(!rt_rq->rt_nr_running);
 295        rt_rq->rt_nr_running--;
 296#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 297        if (rt_rq->rt_nr_running) {
 298                struct rt_prio_array *array;
 299
 300                WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
 301                if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
 302                        /* recalculate */
 303                        array = &rt_rq->active;
 304                        rt_rq->highest_prio =
 305                                sched_find_first_bit(array->bitmap);
 306                } /* otherwise leave rq->highest prio alone */
 307        } else
 308                rt_rq->highest_prio = MAX_RT_PRIO;
 309#endif
 310#ifdef CONFIG_SMP
 311        if (rt_se->nr_cpus_allowed > 1) {
 312                struct rq *rq = rq_of_rt_rq(rt_rq);
 313                rq->rt.rt_nr_migratory--;
 314        }
 315
 316        update_rt_migration(rq_of_rt_rq(rt_rq));
 317#endif /* CONFIG_SMP */
 318#ifdef CONFIG_RT_GROUP_SCHED
 319        if (rt_se_boosted(rt_se))
 320                rt_rq->rt_nr_boosted--;
 321
 322        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
 323#endif
 324}
 325
 326static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 327{
 328        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 329        struct rt_prio_array *array = &rt_rq->active;
 330        struct rt_rq *group_rq = group_rt_rq(rt_se);
 331
 332        if (group_rq && rt_rq_throttled(group_rq))
 333                return;
 334
 335        list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
 336        __set_bit(rt_se_prio(rt_se), array->bitmap);
 337
 338        inc_rt_tasks(rt_se, rt_rq);
 339}
 340
 341static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 342{
 343        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 344        struct rt_prio_array *array = &rt_rq->active;
 345
 346        list_del_init(&rt_se->run_list);
 347        if (list_empty(array->queue + rt_se_prio(rt_se)))
 348                __clear_bit(rt_se_prio(rt_se), array->bitmap);
 349
 350        dec_rt_tasks(rt_se, rt_rq);
 351}
 352
 353/*
 354 * Because the prio of an upper entry depends on the lower
 355 * entries, we must remove entries top - down.
 356 *
 357 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
 358 *      doesn't matter much for now, as h=2 for GROUP_SCHED.
 359 */
 360static void dequeue_rt_stack(struct task_struct *p)
 361{
 362        struct sched_rt_entity *rt_se, *top_se;
 363
 364        /*
 365         * dequeue all, top - down.
 366         */
 367        do {
 368                rt_se = &p->rt;
 369                top_se = NULL;
 370                for_each_sched_rt_entity(rt_se) {
 371                        if (on_rt_rq(rt_se))
 372                                top_se = rt_se;
 373                }
 374                if (top_se)
 375                        dequeue_rt_entity(top_se);
 376        } while (top_se);
 377}
 378
 379/*
 380 * Adding/removing a task to/from a priority array:
 381 */
 382static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 383{
 384        struct sched_rt_entity *rt_se = &p->rt;
 385
 386        if (wakeup)
 387                rt_se->timeout = 0;
 388
 389        dequeue_rt_stack(p);
 390
 391        /*
 392         * enqueue everybody, bottom - up.
 393         */
 394        for_each_sched_rt_entity(rt_se)
 395                enqueue_rt_entity(rt_se);
 396}
 397
 398static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 399{
 400        struct sched_rt_entity *rt_se = &p->rt;
 401        struct rt_rq *rt_rq;
 402
 403        update_curr_rt(rq);
 404
 405        dequeue_rt_stack(p);
 406
 407        /*
 408         * re-enqueue all non-empty rt_rq entities.
 409         */
 410        for_each_sched_rt_entity(rt_se) {
 411                rt_rq = group_rt_rq(rt_se);
 412                if (rt_rq && rt_rq->rt_nr_running)
 413                        enqueue_rt_entity(rt_se);
 414        }
 415}
 416
 417/*
 418 * Put task to the end of the run list without the overhead of dequeue
 419 * followed by enqueue.
 420 */
 421static
 422void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 423{
 424        struct rt_prio_array *array = &rt_rq->active;
 425
 426        list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
 427}
 428
 429static void requeue_task_rt(struct rq *rq, struct task_struct *p)
 430{
 431        struct sched_rt_entity *rt_se = &p->rt;
 432        struct rt_rq *rt_rq;
 433
 434        for_each_sched_rt_entity(rt_se) {
 435                rt_rq = rt_rq_of_se(rt_se);
 436                requeue_rt_entity(rt_rq, rt_se);
 437        }
 438}
 439
 440static void yield_task_rt(struct rq *rq)
 441{
 442        requeue_task_rt(rq, rq->curr);
 443}
 444
 445#ifdef CONFIG_SMP
 446static int find_lowest_rq(struct task_struct *task);
 447
 448static int select_task_rq_rt(struct task_struct *p, int sync)
 449{
 450        struct rq *rq = task_rq(p);
 451
 452        /*
 453         * If the current task is an RT task, then
 454         * try to see if we can wake this RT task up on another
 455         * runqueue. Otherwise simply start this RT task
 456         * on its current runqueue.
 457         *
 458         * We want to avoid overloading runqueues. Even if
 459         * the RT task is of higher priority than the current RT task.
 460         * RT tasks behave differently than other tasks. If
 461         * one gets preempted, we try to push it off to another queue.
 462         * So trying to keep a preempting RT task on the same
 463         * cache hot CPU will force the running RT task to
 464         * a cold CPU. So we waste all the cache for the lower
 465         * RT task in hopes of saving some of a RT task
 466         * that is just being woken and probably will have
 467         * cold cache anyway.
 468         */
 469        if (unlikely(rt_task(rq->curr)) &&
 470            (p->rt.nr_cpus_allowed > 1)) {
 471                int cpu = find_lowest_rq(p);
 472
 473                return (cpu == -1) ? task_cpu(p) : cpu;
 474        }
 475
 476        /*
 477         * Otherwise, just let it ride on the affined RQ and the
 478         * post-schedule router will push the preempted task away
 479         */
 480        return task_cpu(p);
 481}
 482#endif /* CONFIG_SMP */
 483
 484/*
 485 * Preempt the current task with a newly woken task if needed:
 486 */
 487static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 488{
 489        if (p->prio < rq->curr->prio)
 490                resched_task(rq->curr);
 491}
 492
 493static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 494                                                   struct rt_rq *rt_rq)
 495{
 496        struct rt_prio_array *array = &rt_rq->active;
 497        struct sched_rt_entity *next = NULL;
 498        struct list_head *queue;
 499        int idx;
 500
 501        idx = sched_find_first_bit(array->bitmap);
 502        BUG_ON(idx >= MAX_RT_PRIO);
 503
 504        queue = array->queue + idx;
 505        next = list_entry(queue->next, struct sched_rt_entity, run_list);
 506
 507        return next;
 508}
 509
 510static struct task_struct *pick_next_task_rt(struct rq *rq)
 511{
 512        struct sched_rt_entity *rt_se;
 513        struct task_struct *p;
 514        struct rt_rq *rt_rq;
 515
 516        rt_rq = &rq->rt;
 517
 518        if (unlikely(!rt_rq->rt_nr_running))
 519                return NULL;
 520
 521        if (rt_rq_throttled(rt_rq))
 522                return NULL;
 523
 524        do {
 525                rt_se = pick_next_rt_entity(rq, rt_rq);
 526                BUG_ON(!rt_se);
 527                rt_rq = group_rt_rq(rt_se);
 528        } while (rt_rq);
 529
 530        p = rt_task_of(rt_se);
 531        p->se.exec_start = rq->clock;
 532        return p;
 533}
 534
 535static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 536{
 537        update_curr_rt(rq);
 538        p->se.exec_start = 0;
 539}
 540
 541#ifdef CONFIG_SMP
 542
 543/* Only try algorithms three times */
 544#define RT_MAX_TRIES 3
 545
 546static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 547static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 548
 549static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 550{
 551        if (!task_running(rq, p) &&
 552            (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
 553            (p->rt.nr_cpus_allowed > 1))
 554                return 1;
 555        return 0;
 556}
 557
 558/* Return the second highest RT task, NULL otherwise */
 559static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 560{
 561        struct task_struct *next = NULL;
 562        struct sched_rt_entity *rt_se;
 563        struct rt_prio_array *array;
 564        struct rt_rq *rt_rq;
 565        int idx;
 566
 567        for_each_leaf_rt_rq(rt_rq, rq) {
 568                array = &rt_rq->active;
 569                idx = sched_find_first_bit(array->bitmap);
 570 next_idx:
 571                if (idx >= MAX_RT_PRIO)
 572                        continue;
 573                if (next && next->prio < idx)
 574                        continue;
 575                list_for_each_entry(rt_se, array->queue + idx, run_list) {
 576                        struct task_struct *p = rt_task_of(rt_se);
 577                        if (pick_rt_task(rq, p, cpu)) {
 578                                next = p;
 579                                break;
 580                        }
 581                }
 582                if (!next) {
 583                        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
 584                        goto next_idx;
 585                }
 586        }
 587
 588        return next;
 589}
 590
 591static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
 592
 593static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 594{
 595        int       lowest_prio = -1;
 596        int       lowest_cpu  = -1;
 597        int       count       = 0;
 598        int       cpu;
 599
 600        cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
 601
 602        /*
 603         * Scan each rq for the lowest prio.
 604         */
 605        for_each_cpu_mask(cpu, *lowest_mask) {
 606                struct rq *rq = cpu_rq(cpu);
 607
 608                /* We look for lowest RT prio or non-rt CPU */
 609                if (rq->rt.highest_prio >= MAX_RT_PRIO) {
 610                        /*
 611                         * if we already found a low RT queue
 612                         * and now we found this non-rt queue
 613                         * clear the mask and set our bit.
 614                         * Otherwise just return the queue as is
 615                         * and the count==1 will cause the algorithm
 616                         * to use the first bit found.
 617                         */
 618                        if (lowest_cpu != -1) {
 619                                cpus_clear(*lowest_mask);
 620                                cpu_set(rq->cpu, *lowest_mask);
 621                        }
 622                        return 1;
 623                }
 624
 625                /* no locking for now */
 626                if ((rq->rt.highest_prio > task->prio)
 627                    && (rq->rt.highest_prio >= lowest_prio)) {
 628                        if (rq->rt.highest_prio > lowest_prio) {
 629                                /* new low - clear old data */
 630                                lowest_prio = rq->rt.highest_prio;
 631                                lowest_cpu = cpu;
 632                                count = 0;
 633                        }
 634                        count++;
 635                } else
 636                        cpu_clear(cpu, *lowest_mask);
 637        }
 638
 639        /*
 640         * Clear out all the set bits that represent
 641         * runqueues that were of higher prio than
 642         * the lowest_prio.
 643         */
 644        if (lowest_cpu > 0) {
 645                /*
 646                 * Perhaps we could add another cpumask op to
 647                 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
 648                 * Then that could be optimized to use memset and such.
 649                 */
 650                for_each_cpu_mask(cpu, *lowest_mask) {
 651                        if (cpu >= lowest_cpu)
 652                                break;
 653                        cpu_clear(cpu, *lowest_mask);
 654                }
 655        }
 656
 657        return count;
 658}
 659
 660static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 661{
 662        int first;
 663
 664        /* "this_cpu" is cheaper to preempt than a remote processor */
 665        if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
 666                return this_cpu;
 667
 668        first = first_cpu(*mask);
 669        if (first != NR_CPUS)
 670                return first;
 671
 672        return -1;
 673}
 674
 675static int find_lowest_rq(struct task_struct *task)
 676{
 677        struct sched_domain *sd;
 678        cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
 679        int this_cpu = smp_processor_id();
 680        int cpu      = task_cpu(task);
 681        int count    = find_lowest_cpus(task, lowest_mask);
 682
 683        if (!count)
 684                return -1; /* No targets found */
 685
 686        /*
 687         * There is no sense in performing an optimal search if only one
 688         * target is found.
 689         */
 690        if (count == 1)
 691                return first_cpu(*lowest_mask);
 692
 693        /*
 694         * At this point we have built a mask of cpus representing the
 695         * lowest priority tasks in the system.  Now we want to elect
 696         * the best one based on our affinity and topology.
 697         *
 698         * We prioritize the last cpu that the task executed on since
 699         * it is most likely cache-hot in that location.
 700         */
 701        if (cpu_isset(cpu, *lowest_mask))
 702                return cpu;
 703
 704        /*
 705         * Otherwise, we consult the sched_domains span maps to figure
 706         * out which cpu is logically closest to our hot cache data.
 707         */
 708        if (this_cpu == cpu)
 709                this_cpu = -1; /* Skip this_cpu opt if the same */
 710
 711        for_each_domain(cpu, sd) {
 712                if (sd->flags & SD_WAKE_AFFINE) {
 713                        cpumask_t domain_mask;
 714                        int       best_cpu;
 715
 716                        cpus_and(domain_mask, sd->span, *lowest_mask);
 717
 718                        best_cpu = pick_optimal_cpu(this_cpu,
 719                                                    &domain_mask);
 720                        if (best_cpu != -1)
 721                                return best_cpu;
 722                }
 723        }
 724
 725        /*
 726         * And finally, if there were no matches within the domains
 727         * just give the caller *something* to work with from the compatible
 728         * locations.
 729         */
 730        return pick_optimal_cpu(this_cpu, lowest_mask);
 731}
 732
 733/* Will lock the rq it finds */
 734static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 735{
 736        struct rq *lowest_rq = NULL;
 737        int tries;
 738        int cpu;
 739
 740        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
 741                cpu = find_lowest_rq(task);
 742
 743                if ((cpu == -1) || (cpu == rq->cpu))
 744                        break;
 745
 746                lowest_rq = cpu_rq(cpu);
 747
 748                /* if the prio of this runqueue changed, try again */
 749                if (double_lock_balance(rq, lowest_rq)) {
 750                        /*
 751                         * We had to unlock the run queue. In
 752                         * the mean time, task could have
 753                         * migrated already or had its affinity changed.
 754                         * Also make sure that it wasn't scheduled on its rq.
 755                         */
 756                        if (unlikely(task_rq(task) != rq ||
 757                                     !cpu_isset(lowest_rq->cpu,
 758                                                task->cpus_allowed) ||
 759                                     task_running(rq, task) ||
 760                                     !task->se.on_rq)) {
 761
 762                                spin_unlock(&lowest_rq->lock);
 763                                lowest_rq = NULL;
 764                                break;
 765                        }
 766                }
 767
 768                /* If this rq is still suitable use it. */
 769                if (lowest_rq->rt.highest_prio > task->prio)
 770                        break;
 771
 772                /* try again */
 773                spin_unlock(&lowest_rq->lock);
 774                lowest_rq = NULL;
 775        }
 776
 777        return lowest_rq;
 778}
 779
 780/*
 781 * If the current CPU has more than one RT task, see if the non
 782 * running task can migrate over to a CPU that is running a task
 783 * of lesser priority.
 784 */
 785static int push_rt_task(struct rq *rq)
 786{
 787        struct task_struct *next_task;
 788        struct rq *lowest_rq;
 789        int ret = 0;
 790        int paranoid = RT_MAX_TRIES;
 791
 792        if (!rq->rt.overloaded)
 793                return 0;
 794
 795        next_task = pick_next_highest_task_rt(rq, -1);
 796        if (!next_task)
 797                return 0;
 798
 799 retry:
 800        if (unlikely(next_task == rq->curr)) {
 801                WARN_ON(1);
 802                return 0;
 803        }
 804
 805        /*
 806         * It's possible that the next_task slipped in of
 807         * higher priority than current. If that's the case
 808         * just reschedule current.
 809         */
 810        if (unlikely(next_task->prio < rq->curr->prio)) {
 811                resched_task(rq->curr);
 812                return 0;
 813        }
 814
 815        /* We might release rq lock */
 816        get_task_struct(next_task);
 817
 818        /* find_lock_lowest_rq locks the rq if found */
 819        lowest_rq = find_lock_lowest_rq(next_task, rq);
 820        if (!lowest_rq) {
 821                struct task_struct *task;
 822                /*
 823                 * find lock_lowest_rq releases rq->lock
 824                 * so it is possible that next_task has changed.
 825                 * If it has, then try again.
 826                 */
 827                task = pick_next_highest_task_rt(rq, -1);
 828                if (unlikely(task != next_task) && task && paranoid--) {
 829                        put_task_struct(next_task);
 830                        next_task = task;
 831                        goto retry;
 832                }
 833                goto out;
 834        }
 835
 836        deactivate_task(rq, next_task, 0);
 837        set_task_cpu(next_task, lowest_rq->cpu);
 838        activate_task(lowest_rq, next_task, 0);
 839
 840        resched_task(lowest_rq->curr);
 841
 842        spin_unlock(&lowest_rq->lock);
 843
 844        ret = 1;
 845out:
 846        put_task_struct(next_task);
 847
 848        return ret;
 849}
 850
 851/*
 852 * TODO: Currently we just use the second highest prio task on
 853 *       the queue, and stop when it can't migrate (or there's
 854 *       no more RT tasks).  There may be a case where a lower
 855 *       priority RT task has a different affinity than the
 856 *       higher RT task. In this case the lower RT task could
 857 *       possibly be able to migrate where as the higher priority
 858 *       RT task could not.  We currently ignore this issue.
 859 *       Enhancements are welcome!
 860 */
 861static void push_rt_tasks(struct rq *rq)
 862{
 863        /* push_rt_task will return true if it moved an RT */
 864        while (push_rt_task(rq))
 865                ;
 866}
 867
 868static int pull_rt_task(struct rq *this_rq)
 869{
 870        int this_cpu = this_rq->cpu, ret = 0, cpu;
 871        struct task_struct *p, *next;
 872        struct rq *src_rq;
 873
 874        if (likely(!rt_overloaded(this_rq)))
 875                return 0;
 876
 877        next = pick_next_task_rt(this_rq);
 878
 879        for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
 880                if (this_cpu == cpu)
 881                        continue;
 882
 883                src_rq = cpu_rq(cpu);
 884                /*
 885                 * We can potentially drop this_rq's lock in
 886                 * double_lock_balance, and another CPU could
 887                 * steal our next task - hence we must cause
 888                 * the caller to recalculate the next task
 889                 * in that case:
 890                 */
 891                if (double_lock_balance(this_rq, src_rq)) {
 892                        struct task_struct *old_next = next;
 893
 894                        next = pick_next_task_rt(this_rq);
 895                        if (next != old_next)
 896                                ret = 1;
 897                }
 898
 899                /*
 900                 * Are there still pullable RT tasks?
 901                 */
 902                if (src_rq->rt.rt_nr_running <= 1)
 903                        goto skip;
 904
 905                p = pick_next_highest_task_rt(src_rq, this_cpu);
 906
 907                /*
 908                 * Do we have an RT task that preempts
 909                 * the to-be-scheduled task?
 910                 */
 911                if (p && (!next || (p->prio < next->prio))) {
 912                        WARN_ON(p == src_rq->curr);
 913                        WARN_ON(!p->se.on_rq);
 914
 915                        /*
 916                         * There's a chance that p is higher in priority
 917                         * than what's currently running on its cpu.
 918                         * This is just that p is wakeing up and hasn't
 919                         * had a chance to schedule. We only pull
 920                         * p if it is lower in priority than the
 921                         * current task on the run queue or
 922                         * this_rq next task is lower in prio than
 923                         * the current task on that rq.
 924                         */
 925                        if (p->prio < src_rq->curr->prio ||
 926                            (next && next->prio < src_rq->curr->prio))
 927                                goto skip;
 928
 929                        ret = 1;
 930
 931                        deactivate_task(src_rq, p, 0);
 932                        set_task_cpu(p, this_cpu);
 933                        activate_task(this_rq, p, 0);
 934                        /*
 935                         * We continue with the search, just in
 936                         * case there's an even higher prio task
 937                         * in another runqueue. (low likelyhood
 938                         * but possible)
 939                         *
 940                         * Update next so that we won't pick a task
 941                         * on another cpu with a priority lower (or equal)
 942                         * than the one we just picked.
 943                         */
 944                        next = p;
 945
 946                }
 947 skip:
 948                spin_unlock(&src_rq->lock);
 949        }
 950
 951        return ret;
 952}
 953
 954static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 955{
 956        /* Try to pull RT tasks here if we lower this rq's prio */
 957        if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
 958                pull_rt_task(rq);
 959}
 960
 961static void post_schedule_rt(struct rq *rq)
 962{
 963        /*
 964         * If we have more than one rt_task queued, then
 965         * see if we can push the other rt_tasks off to other CPUS.
 966         * Note we may release the rq lock, and since
 967         * the lock was owned by prev, we need to release it
 968         * first via finish_lock_switch and then reaquire it here.
 969         */
 970        if (unlikely(rq->rt.overloaded)) {
 971                spin_lock_irq(&rq->lock);
 972                push_rt_tasks(rq);
 973                spin_unlock_irq(&rq->lock);
 974        }
 975}
 976
 977
 978static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 979{
 980        if (!task_running(rq, p) &&
 981            (p->prio >= rq->rt.highest_prio) &&
 982            rq->rt.overloaded)
 983                push_rt_tasks(rq);
 984}
 985
 986static unsigned long
 987load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 988                unsigned long max_load_move,
 989                struct sched_domain *sd, enum cpu_idle_type idle,
 990                int *all_pinned, int *this_best_prio)
 991{
 992        /* don't touch RT tasks */
 993        return 0;
 994}
 995
 996static int
 997move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 998                 struct sched_domain *sd, enum cpu_idle_type idle)
 999{
1000        /* don't touch RT tasks */
1001        return 0;
1002}
1003
1004static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
1005{
1006        int weight = cpus_weight(*new_mask);
1007
1008        BUG_ON(!rt_task(p));
1009
1010        /*
1011         * Update the migration status of the RQ if we have an RT task
1012         * which is running AND changing its weight value.
1013         */
1014        if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1015                struct rq *rq = task_rq(p);
1016
1017                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1018                        rq->rt.rt_nr_migratory++;
1019                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1020                        BUG_ON(!rq->rt.rt_nr_migratory);
1021                        rq->rt.rt_nr_migratory--;
1022                }
1023
1024                update_rt_migration(rq);
1025        }
1026
1027        p->cpus_allowed    = *new_mask;
1028        p->rt.nr_cpus_allowed = weight;
1029}
1030
1031/* Assumes rq->lock is held */
1032static void join_domain_rt(struct rq *rq)
1033{
1034        if (rq->rt.overloaded)
1035                rt_set_overload(rq);
1036}
1037
1038/* Assumes rq->lock is held */
1039static void leave_domain_rt(struct rq *rq)
1040{
1041        if (rq->rt.overloaded)
1042                rt_clear_overload(rq);
1043}
1044
1045/*
1046 * When switch from the rt queue, we bring ourselves to a position
1047 * that we might want to pull RT tasks from other runqueues.
1048 */
1049static void switched_from_rt(struct rq *rq, struct task_struct *p,
1050                           int running)
1051{
1052        /*
1053         * If there are other RT tasks then we will reschedule
1054         * and the scheduling of the other RT tasks will handle
1055         * the balancing. But if we are the last RT task
1056         * we may need to handle the pulling of RT tasks
1057         * now.
1058         */
1059        if (!rq->rt.rt_nr_running)
1060                pull_rt_task(rq);
1061}
1062#endif /* CONFIG_SMP */
1063
1064/*
1065 * When switching a task to RT, we may overload the runqueue
1066 * with RT tasks. In this case we try to push them off to
1067 * other runqueues.
1068 */
1069static void switched_to_rt(struct rq *rq, struct task_struct *p,
1070                           int running)
1071{
1072        int check_resched = 1;
1073
1074        /*
1075         * If we are already running, then there's nothing
1076         * that needs to be done. But if we are not running
1077         * we may need to preempt the current running task.
1078         * If that current running task is also an RT task
1079         * then see if we can move to another run queue.
1080         */
1081        if (!running) {
1082#ifdef CONFIG_SMP
1083                if (rq->rt.overloaded && push_rt_task(rq) &&
1084                    /* Don't resched if we changed runqueues */
1085                    rq != task_rq(p))
1086                        check_resched = 0;
1087#endif /* CONFIG_SMP */
1088                if (check_resched && p->prio < rq->curr->prio)
1089                        resched_task(rq->curr);
1090        }
1091}
1092
1093/*
1094 * Priority of the task has changed. This may cause
1095 * us to initiate a push or pull.
1096 */
1097static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1098                            int oldprio, int running)
1099{
1100        if (running) {
1101#ifdef CONFIG_SMP
1102                /*
1103                 * If our priority decreases while running, we
1104                 * may need to pull tasks to this runqueue.
1105                 */
1106                if (oldprio < p->prio)
1107                        pull_rt_task(rq);
1108                /*
1109                 * If there's a higher priority task waiting to run
1110                 * then reschedule. Note, the above pull_rt_task
1111                 * can release the rq lock and p could migrate.
1112                 * Only reschedule if p is still on the same runqueue.
1113                 */
1114                if (p->prio > rq->rt.highest_prio && rq->curr == p)
1115                        resched_task(p);
1116#else
1117                /* For UP simply resched on drop of prio */
1118                if (oldprio < p->prio)
1119                        resched_task(p);
1120#endif /* CONFIG_SMP */
1121        } else {
1122                /*
1123                 * This task is not running, but if it is
1124                 * greater than the current running task
1125                 * then reschedule.
1126                 */
1127                if (p->prio < rq->curr->prio)
1128                        resched_task(rq->curr);
1129        }
1130}
1131
1132static void watchdog(struct rq *rq, struct task_struct *p)
1133{
1134        unsigned long soft, hard;
1135
1136        if (!p->signal)
1137                return;
1138
1139        soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1140        hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1141
1142        if (soft != RLIM_INFINITY) {
1143                unsigned long next;
1144
1145                p->rt.timeout++;
1146                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1147                if (p->rt.timeout > next)
1148                        p->it_sched_expires = p->se.sum_exec_runtime;
1149        }
1150}
1151
1152static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1153{
1154        update_curr_rt(rq);
1155
1156        watchdog(rq, p);
1157
1158        /*
1159         * RR tasks need a special form of timeslice management.
1160         * FIFO tasks have no timeslices.
1161         */
1162        if (p->policy != SCHED_RR)
1163                return;
1164
1165        if (--p->rt.time_slice)
1166                return;
1167
1168        p->rt.time_slice = DEF_TIMESLICE;
1169
1170        /*
1171         * Requeue to the end of queue if we are not the only element
1172         * on the queue:
1173         */
1174        if (p->rt.run_list.prev != p->rt.run_list.next) {
1175                requeue_task_rt(rq, p);
1176                set_tsk_need_resched(p);
1177        }
1178}
1179
1180static void set_curr_task_rt(struct rq *rq)
1181{
1182        struct task_struct *p = rq->curr;
1183
1184        p->se.exec_start = rq->clock;
1185}
1186
1187const struct sched_class rt_sched_class = {
1188        .next                   = &fair_sched_class,
1189        .enqueue_task           = enqueue_task_rt,
1190        .dequeue_task           = dequeue_task_rt,
1191        .yield_task             = yield_task_rt,
1192#ifdef CONFIG_SMP
1193        .select_task_rq         = select_task_rq_rt,
1194#endif /* CONFIG_SMP */
1195
1196        .check_preempt_curr     = check_preempt_curr_rt,
1197
1198        .pick_next_task         = pick_next_task_rt,
1199        .put_prev_task          = put_prev_task_rt,
1200
1201#ifdef CONFIG_SMP
1202        .load_balance           = load_balance_rt,
1203        .move_one_task          = move_one_task_rt,
1204        .set_cpus_allowed       = set_cpus_allowed_rt,
1205        .join_domain            = join_domain_rt,
1206        .leave_domain           = leave_domain_rt,
1207        .pre_schedule           = pre_schedule_rt,
1208        .post_schedule          = post_schedule_rt,
1209        .task_wake_up           = task_wake_up_rt,
1210        .switched_from          = switched_from_rt,
1211#endif
1212
1213        .set_curr_task          = set_curr_task_rt,
1214        .task_tick              = task_tick_rt,
1215
1216        .prio_changed           = prio_changed_rt,
1217        .switched_to            = switched_to_rt,
1218};
1219
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.