linux/kernel/rcutree_plugin.h
   1/*
   2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
   3 * Internal non-public definitions that provide either classic
   4 * or preemptible semantics.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  19 *
  20 * Copyright Red Hat, 2009
  21 * Copyright IBM Corporation, 2009
  22 *
  23 * Author: Ingo Molnar <mingo@elte.hu>
  24 *         Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  25 */
  26
  27#include <linux/delay.h>
  28#include <linux/stop_machine.h>
  29
  30/*
  31 * Check the RCU kernel configuration parameters and print informative
  32 * messages about anything out of the ordinary.  If you like #ifdef, you
  33 * will love this function.
  34 */
  35static void __init rcu_bootup_announce_oddness(void)
  36{
  37#ifdef CONFIG_RCU_TRACE
  38        printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
  39#endif
  40#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
  41        printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
  42               CONFIG_RCU_FANOUT);
  43#endif
  44#ifdef CONFIG_RCU_FANOUT_EXACT
  45        printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
  46#endif
  47#ifdef CONFIG_RCU_FAST_NO_HZ
  48        printk(KERN_INFO
  49               "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
  50#endif
  51#ifdef CONFIG_PROVE_RCU
  52        printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
  53#endif
  54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
  55        printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
  56#endif
  57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
  58        printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
  59#endif
  60#if NUM_RCU_LVL_4 != 0
  61        printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
  62#endif
  63}
  64
  65#ifdef CONFIG_TREE_PREEMPT_RCU
  66
  67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
  68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
  69static struct rcu_state *rcu_state = &rcu_preempt_state;
  70
  71static void rcu_read_unlock_special(struct task_struct *t);
  72static int rcu_preempted_readers_exp(struct rcu_node *rnp);
  73
  74/*
  75 * Tell them what RCU they are running.
  76 */
  77static void __init rcu_bootup_announce(void)
  78{
  79        printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
  80        rcu_bootup_announce_oddness();
  81}
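/*
 * Editor's illustration (not part of the original file): assuming a 64-bit
 * build with CONFIG_TREE_PREEMPT_RCU=y, CONFIG_RCU_TRACE=y, and a
 * non-default CONFIG_RCU_FANOUT=32, the two functions above would emit
 * boot messages along the lines of:
 *
 *      Preemptible hierarchical RCU implementation.
 *              RCU debugfs-based tracing is enabled.
 *              CONFIG_RCU_FANOUT set to non-default value of 32
 */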
  82
  83/*
  84 * Return the number of RCU-preempt batches processed thus far
  85 * for debug and statistics.
  86 */
  87long rcu_batches_completed_preempt(void)
  88{
  89        return rcu_preempt_state.completed;
  90}
  91EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
  92
  93/*
  94 * Return the number of RCU batches processed thus far for debug & stats.
  95 */
  96long rcu_batches_completed(void)
  97{
  98        return rcu_batches_completed_preempt();
  99}
 100EXPORT_SYMBOL_GPL(rcu_batches_completed);
 101
 102/*
 103 * Force a quiescent state for preemptible RCU.
 104 */
 105void rcu_force_quiescent_state(void)
 106{
 107        force_quiescent_state(&rcu_preempt_state, 0);
 108}
 109EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 110
 111/*
  112 * Record a preemptible-RCU quiescent state for the specified CPU.  Note
  113 * that this means only that the current grace period need not wait on
  114 * the task now running on this CPU; any number of tasks might still be
  115 * blocked while in an RCU read-side critical section.
 116 *
 117 * Unlike the other rcu_*_qs() functions, callers to this function
 118 * must disable irqs in order to protect the assignment to
 119 * ->rcu_read_unlock_special.
 120 */
 121static void rcu_preempt_qs(int cpu)
 122{
 123        struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
 124
 125        rdp->passed_quiesc_completed = rdp->gpnum - 1;
 126        barrier();
 127        rdp->passed_quiesc = 1;
 128        current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 129}
 130
 131/*
 132 * We have entered the scheduler, and the current task might soon be
 133 * context-switched away from.  If this task is in an RCU read-side
 134 * critical section, we will no longer be able to rely on the CPU to
 135 * record that fact, so we enqueue the task on the blkd_tasks list.
 136 * The task will dequeue itself when it exits the outermost enclosing
 137 * RCU read-side critical section.  Therefore, the current grace period
 138 * cannot be permitted to complete until the blkd_tasks list entries
 139 * predating the current grace period drain, in other words, until
 140 * rnp->gp_tasks becomes NULL.
 141 *
 142 * Caller must disable preemption.
 143 */
 144static void rcu_preempt_note_context_switch(int cpu)
 145{
 146        struct task_struct *t = current;
 147        unsigned long flags;
 148        struct rcu_data *rdp;
 149        struct rcu_node *rnp;
 150
 151        if (t->rcu_read_lock_nesting > 0 &&
 152            (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
 153
 154                /* Possibly blocking in an RCU read-side critical section. */
 155                rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
 156                rnp = rdp->mynode;
 157                raw_spin_lock_irqsave(&rnp->lock, flags);
 158                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
 159                t->rcu_blocked_node = rnp;
 160
 161                /*
 162                 * If this CPU has already checked in, then this task
 163                 * will hold up the next grace period rather than the
 164                 * current grace period.  Queue the task accordingly.
 165                 * If the task is queued for the current grace period
 166                 * (i.e., this CPU has not yet passed through a quiescent
 167                 * state for the current grace period), then as long
 168                 * as that task remains queued, the current grace period
 169                 * cannot end.  Note that there is some uncertainty as
 170                 * to exactly when the current grace period started.
 171                 * We take a conservative approach, which can result
 172                 * in unnecessarily waiting on tasks that started very
 173                 * slightly after the current grace period began.  C'est
 174                 * la vie!!!
 175                 *
 176                 * But first, note that the current CPU must still be
 177                 * on line!
 178                 */
 179                WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
 180                WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
 181                if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
 182                        list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
 183                        rnp->gp_tasks = &t->rcu_node_entry;
 184#ifdef CONFIG_RCU_BOOST
 185                        if (rnp->boost_tasks != NULL)
 186                                rnp->boost_tasks = rnp->gp_tasks;
 187#endif /* #ifdef CONFIG_RCU_BOOST */
 188                } else {
 189                        list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
 190                        if (rnp->qsmask & rdp->grpmask)
 191                                rnp->gp_tasks = &t->rcu_node_entry;
 192                }
 193                raw_spin_unlock_irqrestore(&rnp->lock, flags);
 194        } else if (t->rcu_read_lock_nesting < 0 &&
 195                   t->rcu_read_unlock_special) {
 196
 197                /*
 198                 * Complete exit from RCU read-side critical section on
 199                 * behalf of preempted instance of __rcu_read_unlock().
 200                 */
 201                rcu_read_unlock_special(t);
 202        }
 203
 204        /*
 205         * Either we were not in an RCU read-side critical section to
 206         * begin with, or we have now recorded that critical section
 207         * globally.  Either way, we can now note a quiescent state
 208         * for this CPU.  Again, if we were in an RCU read-side critical
 209         * section, and if that critical section was blocking the current
 210         * grace period, then the fact that the task has been enqueued
 211         * means that we continue to block the current grace period.
 212         */
 213        local_irq_save(flags);
 214        rcu_preempt_qs(cpu);
 215        local_irq_restore(flags);
 216}
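/*
 * Editor's illustration (not part of the original file): the resulting
 * layout of a leaf rcu_node's ->blkd_tasks list.  Tasks blocking only the
 * next grace period sit between the list head and ->gp_tasks, while the
 * entries from ->gp_tasks to the tail block the current grace period and
 * must drain before it can end:
 *
 *      blkd_tasks -> [T8] -> [T7] -> [T6] -> [T5]
 *                             ^
 *                             +-- gp_tasks (T7, T6, and T5 block the
 *                                 current grace period)
 */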
 217
 218/*
 219 * Tree-preemptible RCU implementation for rcu_read_lock().
 220 * Just increment ->rcu_read_lock_nesting, shared state will be updated
 221 * if we block.
 222 */
 223void __rcu_read_lock(void)
 224{
 225        current->rcu_read_lock_nesting++;
 226        barrier();  /* needed if we ever invoke rcu_read_lock in rcutree.c */
 227}
 228EXPORT_SYMBOL_GPL(__rcu_read_lock);
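/*
 * Editor's sketch (hypothetical caller, not part of the original file):
 * a typical read-side critical section, which lands in __rcu_read_lock()
 * above and __rcu_read_unlock() further below.  "struct foo" and "foo_gp"
 * are made up for illustration.
 */
struct foo {
        int a;
};
static struct foo __rcu *foo_gp;

static int foo_read_a(void)
{
        struct foo *p;
        int ret = 0;

        rcu_read_lock();                /* maps to __rcu_read_lock() */
        p = rcu_dereference(foo_gp);    /* fetch the RCU-protected pointer */
        if (p != NULL)
                ret = p->a;
        rcu_read_unlock();              /* maps to __rcu_read_unlock() */
        return ret;
}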
 229
 230/*
 231 * Check for preempted RCU readers blocking the current grace period
 232 * for the specified rcu_node structure.  If the caller needs a reliable
 233 * answer, it must hold the rcu_node's ->lock.
 234 */
 235static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 236{
 237        return rnp->gp_tasks != NULL;
 238}
 239
 240/*
 241 * Record a quiescent state for all tasks that were previously queued
 242 * on the specified rcu_node structure and that were blocking the current
 243 * RCU grace period.  The caller must hold the specified rnp->lock with
 244 * irqs disabled, and this lock is released upon return, but irqs remain
 245 * disabled.
 246 */
 247static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 248        __releases(rnp->lock)
 249{
 250        unsigned long mask;
 251        struct rcu_node *rnp_p;
 252
 253        if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 254                raw_spin_unlock_irqrestore(&rnp->lock, flags);
 255                return;  /* Still need more quiescent states! */
 256        }
 257
 258        rnp_p = rnp->parent;
 259        if (rnp_p == NULL) {
 260                /*
 261                 * Either there is only one rcu_node in the tree,
 262                 * or tasks were kicked up to root rcu_node due to
 263                 * CPUs going offline.
 264                 */
 265                rcu_report_qs_rsp(&rcu_preempt_state, flags);
 266                return;
 267        }
 268
 269        /* Report up the rest of the hierarchy. */
 270        mask = rnp->grpmask;
 271        raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
 272        raw_spin_lock(&rnp_p->lock);    /* irqs already disabled. */
 273        rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
 274}
 275
 276/*
  277 * Advance a ->blkd_tasks-list pointer to the next entry, returning
  278 * NULL instead if we are at the end of the list.
 279 */
 280static struct list_head *rcu_next_node_entry(struct task_struct *t,
 281                                             struct rcu_node *rnp)
 282{
 283        struct list_head *np;
 284
 285        np = t->rcu_node_entry.next;
 286        if (np == &rnp->blkd_tasks)
 287                np = NULL;
 288        return np;
 289}
 290
 291/*
  292 * Handle special cases during rcu_read_unlock(), such as the need to
  293 * notify the RCU core or the fact that the task blocked while in the
  294 * RCU read-side critical section.
 295 */
 296static noinline void rcu_read_unlock_special(struct task_struct *t)
 297{
 298        int empty;
 299        int empty_exp;
 300        unsigned long flags;
 301        struct list_head *np;
 302        struct rcu_node *rnp;
 303        int special;
 304
 305        /* NMI handlers cannot block and cannot safely manipulate state. */
 306        if (in_nmi())
 307                return;
 308
 309        local_irq_save(flags);
 310
 311        /*
 312         * If RCU core is waiting for this CPU to exit critical section,
 313         * let it know that we have done so.
 314         */
 315        special = t->rcu_read_unlock_special;
 316        if (special & RCU_READ_UNLOCK_NEED_QS) {
 317                rcu_preempt_qs(smp_processor_id());
 318        }
 319
 320        /* Hardware IRQ handlers cannot block. */
 321        if (in_irq() || in_serving_softirq()) {
 322                local_irq_restore(flags);
 323                return;
 324        }
 325
 326        /* Clean up if blocked during RCU read-side critical section. */
 327        if (special & RCU_READ_UNLOCK_BLOCKED) {
 328                t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
 329
 330                /*
 331                 * Remove this task from the list it blocked on.  The
 332                 * task can migrate while we acquire the lock, but at
  333 * most one time.  So at most two passes through the loop.
 334                 */
 335                for (;;) {
 336                        rnp = t->rcu_blocked_node;
 337                        raw_spin_lock(&rnp->lock);  /* irqs already disabled. */
 338                        if (rnp == t->rcu_blocked_node)
 339                                break;
 340                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 341                }
 342                empty = !rcu_preempt_blocked_readers_cgp(rnp);
 343                empty_exp = !rcu_preempted_readers_exp(rnp);
 344                smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 345                np = rcu_next_node_entry(t, rnp);
 346                list_del_init(&t->rcu_node_entry);
 347                if (&t->rcu_node_entry == rnp->gp_tasks)
 348                        rnp->gp_tasks = np;
 349                if (&t->rcu_node_entry == rnp->exp_tasks)
 350                        rnp->exp_tasks = np;
 351#ifdef CONFIG_RCU_BOOST
 352                if (&t->rcu_node_entry == rnp->boost_tasks)
 353                        rnp->boost_tasks = np;
 354                /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
 355                if (t->rcu_boosted) {
 356                        special |= RCU_READ_UNLOCK_BOOSTED;
 357                        t->rcu_boosted = 0;
 358                }
 359#endif /* #ifdef CONFIG_RCU_BOOST */
 360                t->rcu_blocked_node = NULL;
 361
 362                /*
 363                 * If this was the last task on the current list, and if
 364                 * we aren't waiting on any CPUs, report the quiescent state.
 365                 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
 366                 */
 367                if (empty)
 368                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 369                else
 370                        rcu_report_unblock_qs_rnp(rnp, flags);
 371
 372#ifdef CONFIG_RCU_BOOST
 373                /* Unboost if we were boosted. */
 374                if (special & RCU_READ_UNLOCK_BOOSTED) {
 375                        rt_mutex_unlock(t->rcu_boost_mutex);
 376                        t->rcu_boost_mutex = NULL;
 377                }
 378#endif /* #ifdef CONFIG_RCU_BOOST */
 379
 380                /*
 381                 * If this was the last task on the expedited lists,
 382                 * then we need to report up the rcu_node hierarchy.
 383                 */
 384                if (!empty_exp && !rcu_preempted_readers_exp(rnp))
 385                        rcu_report_exp_rnp(&rcu_preempt_state, rnp);
 386        } else {
 387                local_irq_restore(flags);
 388        }
 389}
 390
 391/*
 392 * Tree-preemptible RCU implementation for rcu_read_unlock().
 393 * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
 394 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
 395 * invoke rcu_read_unlock_special() to clean up after a context switch
 396 * in an RCU read-side critical section and other special cases.
 397 */
 398void __rcu_read_unlock(void)
 399{
 400        struct task_struct *t = current;
 401
 402        barrier();  /* needed if we ever invoke rcu_read_unlock in rcutree.c */
 403        if (t->rcu_read_lock_nesting != 1)
 404                --t->rcu_read_lock_nesting;
 405        else {
 406                t->rcu_read_lock_nesting = INT_MIN;
 407                barrier();  /* assign before ->rcu_read_unlock_special load */
 408                if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
 409                        rcu_read_unlock_special(t);
 410                barrier();  /* ->rcu_read_unlock_special load before assign */
 411                t->rcu_read_lock_nesting = 0;
 412        }
 413#ifdef CONFIG_PROVE_LOCKING
 414        {
 415                int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
 416
 417                WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
 418        }
 419#endif /* #ifdef CONFIG_PROVE_LOCKING */
 420}
 421EXPORT_SYMBOL_GPL(__rcu_read_unlock);
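/*
 * Editor's illustration (not part of the original file): the values taken
 * by ->rcu_read_lock_nesting for a reader that was preempted within its
 * outermost (non-nested) critical section:
 *
 *      rcu_read_lock();        // nesting: 0 -> 1
 *        ...                   // preempted; RCU_READ_UNLOCK_BLOCKED set
 *      rcu_read_unlock();      // nesting: 1 -> INT_MIN while
 *                              // rcu_read_unlock_special() runs, then
 *                              // INT_MIN -> 0 once cleanup completes
 */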
 422
 423#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
 424
 425/*
 426 * Dump detailed information for all tasks blocking the current RCU
 427 * grace period on the specified rcu_node structure.
 428 */
 429static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 430{
 431        unsigned long flags;
 432        struct task_struct *t;
 433
 434        if (!rcu_preempt_blocked_readers_cgp(rnp))
 435                return;
 436        raw_spin_lock_irqsave(&rnp->lock, flags);
 437        t = list_entry(rnp->gp_tasks,
 438                       struct task_struct, rcu_node_entry);
 439        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
 440                sched_show_task(t);
 441        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 442}
 443
 444/*
 445 * Dump detailed information for all tasks blocking the current RCU
 446 * grace period.
 447 */
 448static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 449{
 450        struct rcu_node *rnp = rcu_get_root(rsp);
 451
 452        rcu_print_detail_task_stall_rnp(rnp);
 453        rcu_for_each_leaf_node(rsp, rnp)
 454                rcu_print_detail_task_stall_rnp(rnp);
 455}
 456
 457#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
 458
 459static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 460{
 461}
 462
 463#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
 464
 465/*
 466 * Scan the current list of tasks blocked within RCU read-side critical
 467 * sections, printing out the tid of each.
 468 */
 469static void rcu_print_task_stall(struct rcu_node *rnp)
 470{
 471        struct task_struct *t;
 472
 473        if (!rcu_preempt_blocked_readers_cgp(rnp))
 474                return;
 475        t = list_entry(rnp->gp_tasks,
 476                       struct task_struct, rcu_node_entry);
 477        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
 478                printk(" P%d", t->pid);
 479}
 480
 481/*
 482 * Suppress preemptible RCU's CPU stall warnings by pushing the
 483 * time of the next stall-warning message comfortably far into the
 484 * future.
 485 */
 486static void rcu_preempt_stall_reset(void)
 487{
 488        rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
 489}
 490
 491/*
 492 * Check that the list of blocked tasks for the newly completed grace
 493 * period is in fact empty.  It is a serious bug to complete a grace
 494 * period that still has RCU readers blocked!  This function must be
 495 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
 496 * must be held by the caller.
 497 *
 498 * Also, if there are blocked tasks on the list, they automatically
 499 * block the newly created grace period, so set up ->gp_tasks accordingly.
 500 */
 501static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 502{
 503        WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
 504        if (!list_empty(&rnp->blkd_tasks))
 505                rnp->gp_tasks = rnp->blkd_tasks.next;
 506        WARN_ON_ONCE(rnp->qsmask);
 507}
 508
 509#ifdef CONFIG_HOTPLUG_CPU
 510
 511/*
 512 * Handle tasklist migration for case in which all CPUs covered by the
 513 * specified rcu_node have gone offline.  Move them up to the root
 514 * rcu_node.  The reason for not just moving them to the immediate
 515 * parent is to remove the need for rcu_read_unlock_special() to
 516 * make more than two attempts to acquire the target rcu_node's lock.
  518 * Returns a non-zero value if tasks were blocking the current grace
  519 * period on the specified rcu_node structure; the value is a
  520 * combination of RCU_OFL_TASKS_NORM_GP and RCU_OFL_TASKS_EXP_GP,
  521 * indicating whether the normal and/or expedited grace period was
  522 * blocked.  Returns zero otherwise.
 522 *
 523 * The caller must hold rnp->lock with irqs disabled.
 524 */
 525static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 526                                     struct rcu_node *rnp,
 527                                     struct rcu_data *rdp)
 528{
 529        struct list_head *lp;
 530        struct list_head *lp_root;
 531        int retval = 0;
 532        struct rcu_node *rnp_root = rcu_get_root(rsp);
 533        struct task_struct *t;
 534
 535        if (rnp == rnp_root) {
 536                WARN_ONCE(1, "Last CPU thought to be offlined?");
 537                return 0;  /* Shouldn't happen: at least one CPU online. */
 538        }
 539
 540        /* If we are on an internal node, complain bitterly. */
 541        WARN_ON_ONCE(rnp != rdp->mynode);
 542
 543        /*
 544         * Move tasks up to root rcu_node.  Don't try to get fancy for
 545         * this corner-case operation -- just put this node's tasks
 546         * at the head of the root node's list, and update the root node's
 547         * ->gp_tasks and ->exp_tasks pointers to those of this node's,
 548         * if non-NULL.  This might result in waiting for more tasks than
 549         * absolutely necessary, but this is a good performance/complexity
 550         * tradeoff.
 551         */
 552        if (rcu_preempt_blocked_readers_cgp(rnp))
 553                retval |= RCU_OFL_TASKS_NORM_GP;
 554        if (rcu_preempted_readers_exp(rnp))
 555                retval |= RCU_OFL_TASKS_EXP_GP;
 556        lp = &rnp->blkd_tasks;
 557        lp_root = &rnp_root->blkd_tasks;
 558        while (!list_empty(lp)) {
 559                t = list_entry(lp->next, typeof(*t), rcu_node_entry);
 560                raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
 561                list_del(&t->rcu_node_entry);
 562                t->rcu_blocked_node = rnp_root;
 563                list_add(&t->rcu_node_entry, lp_root);
 564                if (&t->rcu_node_entry == rnp->gp_tasks)
 565                        rnp_root->gp_tasks = rnp->gp_tasks;
 566                if (&t->rcu_node_entry == rnp->exp_tasks)
 567                        rnp_root->exp_tasks = rnp->exp_tasks;
 568#ifdef CONFIG_RCU_BOOST
 569                if (&t->rcu_node_entry == rnp->boost_tasks)
 570                        rnp_root->boost_tasks = rnp->boost_tasks;
 571#endif /* #ifdef CONFIG_RCU_BOOST */
 572                raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
 573        }
 574
 575#ifdef CONFIG_RCU_BOOST
 576        /* In case root is being boosted and leaf is not. */
 577        raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
 578        if (rnp_root->boost_tasks != NULL &&
 579            rnp_root->boost_tasks != rnp_root->gp_tasks)
 580                rnp_root->boost_tasks = rnp_root->gp_tasks;
 581        raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
 582#endif /* #ifdef CONFIG_RCU_BOOST */
 583
 584        rnp->gp_tasks = NULL;
 585        rnp->exp_tasks = NULL;
 586        return retval;
 587}
 588
 589/*
 590 * Do CPU-offline processing for preemptible RCU.
 591 */
 592static void rcu_preempt_offline_cpu(int cpu)
 593{
 594        __rcu_offline_cpu(cpu, &rcu_preempt_state);
 595}
 596
 597#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 598
 599/*
 600 * Check for a quiescent state from the current CPU.  When a task blocks,
 601 * the task is recorded in the corresponding CPU's rcu_node structure,
 602 * which is checked elsewhere.
 603 *
 604 * Caller must disable hard irqs.
 605 */
 606static void rcu_preempt_check_callbacks(int cpu)
 607{
 608        struct task_struct *t = current;
 609
 610        if (t->rcu_read_lock_nesting == 0) {
 611                rcu_preempt_qs(cpu);
 612                return;
 613        }
 614        if (t->rcu_read_lock_nesting > 0 &&
 615            per_cpu(rcu_preempt_data, cpu).qs_pending)
 616                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
 617}
 618
 619/*
 620 * Process callbacks for preemptible RCU.
 621 */
 622static void rcu_preempt_process_callbacks(void)
 623{
 624        __rcu_process_callbacks(&rcu_preempt_state,
 625                                &__get_cpu_var(rcu_preempt_data));
 626}
 627
 628#ifdef CONFIG_RCU_BOOST
 629
 630static void rcu_preempt_do_callbacks(void)
 631{
 632        rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
 633}
 634
 635#endif /* #ifdef CONFIG_RCU_BOOST */
 636
 637/*
 638 * Queue a preemptible-RCU callback for invocation after a grace period.
 639 */
 640void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 641{
 642        __call_rcu(head, func, &rcu_preempt_state);
 643}
 644EXPORT_SYMBOL_GPL(call_rcu);
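/*
 * Editor's sketch (hypothetical caller, not part of the original file):
 * deferring the kfree() of an RCU-protected structure until after a grace
 * period by embedding an rcu_head and handing a reclaim callback to
 * call_rcu().  "struct bar" and its list are made up for illustration.
 */
struct bar {
        struct list_head list;
        int data;
        struct rcu_head rcu;
};

static void bar_reclaim(struct rcu_head *rcu)
{
        struct bar *bp = container_of(rcu, struct bar, rcu);

        kfree(bp);      /* runs only after pre-existing readers finish */
}

static void bar_remove(struct bar *bp)
{
        list_del_rcu(&bp->list);        /* unlink under the update-side lock */
        call_rcu(&bp->rcu, bar_reclaim);
}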
 645
 646/**
 647 * synchronize_rcu - wait until a grace period has elapsed.
 648 *
 649 * Control will return to the caller some time after a full grace
 650 * period has elapsed, in other words after all currently executing RCU
 651 * read-side critical sections have completed.  Note, however, that
 652 * upon return from synchronize_rcu(), the caller might well be executing
 653 * concurrently with new RCU read-side critical sections that began while
 654 * synchronize_rcu() was waiting.  RCU read-side critical sections are
 655 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
 656 */
 657void synchronize_rcu(void)
 658{
 659        struct rcu_synchronize rcu;
 660
 661        if (!rcu_scheduler_active)
 662                return;
 663
 664        init_rcu_head_on_stack(&rcu.head);
 665        init_completion(&rcu.completion);
 666        /* Will wake me after RCU finished. */
 667        call_rcu(&rcu.head, wakeme_after_rcu);
 668        /* Wait for it. */
 669        wait_for_completion(&rcu.completion);
 670        destroy_rcu_head_on_stack(&rcu.head);
 671}
 672EXPORT_SYMBOL_GPL(synchronize_rcu);
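/*
 * Editor's sketch (hypothetical caller, not part of the original file):
 * the synchronous remove/wait/free pattern built on synchronize_rcu(),
 * reusing the made-up "struct bar" from the call_rcu() sketch above.
 */
static void bar_remove_sync(struct bar *bp)
{
        list_del_rcu(&bp->list);        /* unlink under the update-side lock */
        synchronize_rcu();              /* wait for pre-existing readers */
        kfree(bp);                      /* no reader can still reference bp */
}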
 673
 674static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
 675static long sync_rcu_preempt_exp_count;
 676static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
 677
 678/*
 679 * Return non-zero if there are any tasks in RCU read-side critical
 680 * sections blocking the current preemptible-RCU expedited grace period.
 681 * If there is no preemptible-RCU expedited grace period currently in
 682 * progress, returns zero unconditionally.
 683 */
 684static int rcu_preempted_readers_exp(struct rcu_node *rnp)
 685{
 686        return rnp->exp_tasks != NULL;
 687}
 688
 689/*
  690 * Return non-zero if there is no RCU expedited grace period in progress
 691 * for the specified rcu_node structure, in other words, if all CPUs and
 692 * tasks covered by the specified rcu_node structure have done their bit
 693 * for the current expedited grace period.  Works only for preemptible
  694 * RCU -- other RCU implementations use other means.
 695 *
 696 * Caller must hold sync_rcu_preempt_exp_mutex.
 697 */
 698static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 699{
 700        return !rcu_preempted_readers_exp(rnp) &&
 701               ACCESS_ONCE(rnp->expmask) == 0;
 702}
 703
 704/*
 705 * Report the exit from RCU read-side critical section for the last task
 706 * that queued itself during or before the current expedited preemptible-RCU
 707 * grace period.  This event is reported either to the rcu_node structure on
 708 * which the task was queued or to one of that rcu_node structure's ancestors,
 709 * recursively up the tree.  (Calm down, calm down, we do the recursion
 710 * iteratively!)
 711 *
 712 * Caller must hold sync_rcu_preempt_exp_mutex.
 713 */
 714static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
 715{
 716        unsigned long flags;
 717        unsigned long mask;
 718
 719        raw_spin_lock_irqsave(&rnp->lock, flags);
 720        for (;;) {
 721                if (!sync_rcu_preempt_exp_done(rnp)) {
 722                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 723                        break;
 724                }
 725                if (rnp->parent == NULL) {
 726                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 727                        wake_up(&sync_rcu_preempt_exp_wq);
 728                        break;
 729                }
 730                mask = rnp->grpmask;
 731                raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
 732                rnp = rnp->parent;
 733                raw_spin_lock(&rnp->lock); /* irqs already disabled */
 734                rnp->expmask &= ~mask;
 735        }
 736}
 737
 738/*
 739 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
 740 * grace period for the specified rcu_node structure.  If there are no such
 741 * tasks, report it up the rcu_node hierarchy.
 742 *
 743 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
 744 */
 745static void
 746sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
 747{
 748        unsigned long flags;
 749        int must_wait = 0;
 750
 751        raw_spin_lock_irqsave(&rnp->lock, flags);
 752        if (list_empty(&rnp->blkd_tasks))
 753                raw_spin_unlock_irqrestore(&rnp->lock, flags);
 754        else {
 755                rnp->exp_tasks = rnp->blkd_tasks.next;
 756                rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
 757                must_wait = 1;
 758        }
 759        if (!must_wait)
 760                rcu_report_exp_rnp(rsp, rnp);
 761}
 762
 763/*
 764 * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
 765 * is to invoke synchronize_sched_expedited() to push all the tasks to
 766 * the ->blkd_tasks lists and wait for this list to drain.
 767 */
 768void synchronize_rcu_expedited(void)
 769{
 770        unsigned long flags;
 771        struct rcu_node *rnp;
 772        struct rcu_state *rsp = &rcu_preempt_state;
 773        long snap;
 774        int trycount = 0;
 775
 776        smp_mb(); /* Caller's modifications seen first by other CPUs. */
 777        snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
 778        smp_mb(); /* Above access cannot bleed into critical section. */
 779
 780        /*
 781         * Acquire lock, falling back to synchronize_rcu() if too many
 782         * lock-acquisition failures.  Of course, if someone does the
 783         * expedited grace period for us, just leave.
 784         */
 785        while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
 786                if (trycount++ < 10)
 787                        udelay(trycount * num_online_cpus());
 788                else {
 789                        synchronize_rcu();
 790                        return;
 791                }
 792                if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
 793                        goto mb_ret; /* Others did our work for us. */
 794        }
 795        if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
 796                goto unlock_mb_ret; /* Others did our work for us. */
 797
  798        /* Force all RCU readers onto ->blkd_tasks lists. */
 799        synchronize_sched_expedited();
 800
 801        raw_spin_lock_irqsave(&rsp->onofflock, flags);
 802
 803        /* Initialize ->expmask for all non-leaf rcu_node structures. */
 804        rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
 805                raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 806                rnp->expmask = rnp->qsmaskinit;
 807                raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 808        }
 809
 810        /* Snapshot current state of ->blkd_tasks lists. */
 811        rcu_for_each_leaf_node(rsp, rnp)
 812                sync_rcu_preempt_exp_init(rsp, rnp);
 813        if (NUM_RCU_NODES > 1)
 814                sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
 815
 816        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 817
 818        /* Wait for snapshotted ->blkd_tasks lists to drain. */
 819        rnp = rcu_get_root(rsp);
 820        wait_event(sync_rcu_preempt_exp_wq,
 821                   sync_rcu_preempt_exp_done(rnp));
 822
 823        /* Clean up and exit. */
 824        smp_mb(); /* ensure expedited GP seen before counter increment. */
 825        ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
 826unlock_mb_ret:
 827        mutex_unlock(&sync_rcu_preempt_exp_mutex);
 828mb_ret:
 829        smp_mb(); /* ensure subsequent action seen after grace period. */
 830}
 831EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
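/*
 * Editor's note (not part of the original file): synchronize_rcu_expedited()
 * trades CPU overhead (the synchronize_sched_expedited() call and the mutex
 * retries above) for lower grace-period latency, so it is best reserved for
 * latency-critical, infrequently executed update paths, for example:
 *
 *      list_del_rcu(&bp->list);
 *      synchronize_rcu_expedited();    // expedited wait for readers
 *      kfree(bp);
 */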
 832
 833/*
 834 * Check to see if there is any immediate preemptible-RCU-related work
 835 * to be done.
 836 */
 837static int rcu_preempt_pending(int cpu)
 838{
 839        return __rcu_pending(&rcu_preempt_state,
 840                             &per_cpu(rcu_preempt_data, cpu));
 841}
 842
 843/*
 844 * Does preemptible RCU need the CPU to stay out of dynticks mode?
 845 */
 846static int rcu_preempt_needs_cpu(int cpu)
 847{
 848        return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
 849}
 850
 851/**
 852 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
 853 */
 854void rcu_barrier(void)
 855{
 856        _rcu_barrier(&rcu_preempt_state, call_rcu);
 857}
 858EXPORT_SYMBOL_GPL(rcu_barrier);
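/*
 * Editor's sketch (hypothetical module exit, not part of the original file):
 * a module that posts callbacks with call_rcu() must invoke rcu_barrier()
 * before unloading so that no callback runs after the module text is freed.
 */
static void __exit bar_module_exit(void)
{
        /* ...stop posting new call_rcu() callbacks... */
        rcu_barrier();          /* wait for all in-flight callbacks to run */
        /* ...now safe to release remaining module resources... */
}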
 859
 860/*
 861 * Initialize preemptible RCU's per-CPU data.
 862 */
 863static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 864{
 865        rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
 866}
 867
 868/*
  869 * Move preemptible RCU's callbacks from a dying CPU to another online CPU.
 870 */
 871static void rcu_preempt_send_cbs_to_online(void)
 872{
 873        rcu_send_cbs_to_online(&rcu_preempt_state);
 874}
 875
 876/*
 877 * Initialize preemptible RCU's state structures.
 878 */
 879static void __init __rcu_init_preempt(void)
 880{
 881        rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
 882}
 883
 884/*
 885 * Check for a task exiting while in a preemptible-RCU read-side
  886 * critical section; clean up if so.  No need to issue warnings,
 887 * as debug_check_no_locks_held() already does this if lockdep
 888 * is enabled.
 889 */
 890void exit_rcu(void)
 891{
 892        struct task_struct *t = current;
 893
 894        if (t->rcu_read_lock_nesting == 0)
 895                return;
 896        t->rcu_read_lock_nesting = 1;
 897        __rcu_read_unlock();
 898}
 899
 900#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 901
 902static struct rcu_state *rcu_state = &rcu_sched_state;
 903
 904/*
 905 * Tell them what RCU they are running.
 906 */
 907static void __init rcu_bootup_announce(void)
 908{
 909        printk(KERN_INFO "Hierarchical RCU implementation.\n");
 910        rcu_bootup_announce_oddness();
 911}
 912
 913/*
 914 * Return the number of RCU batches processed thus far for debug & stats.
 915 */
 916long rcu_batches_completed(void)
 917{
 918        return rcu_batches_completed_sched();
 919}
 920EXPORT_SYMBOL_GPL(rcu_batches_completed);
 921
 922/*
 923 * Force a quiescent state for RCU, which, because there is no preemptible
 924 * RCU, becomes the same as rcu-sched.
 925 */
 926void rcu_force_quiescent_state(void)
 927{
 928        rcu_sched_force_quiescent_state();
 929}
 930EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 931
 932/*
 933 * Because preemptible RCU does not exist, we never have to check for
 934 * CPUs being in quiescent states.
 935 */
 936static void rcu_preempt_note_context_switch(int cpu)
 937{
 938}
 939
 940/*
 941 * Because preemptible RCU does not exist, there are never any preempted
 942 * RCU readers.
 943 */
 944static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 945{
 946        return 0;
 947}
 948
 949#ifdef CONFIG_HOTPLUG_CPU
 950
 951/* Because preemptible RCU does not exist, no quieting of tasks. */
 952static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 953{
 954        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 955}
 956
 957#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 958
 959/*
 960 * Because preemptible RCU does not exist, we never have to check for
 961 * tasks blocked within RCU read-side critical sections.
 962 */
 963static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 964{
 965}
 966
 967/*
 968 * Because preemptible RCU does not exist, we never have to check for
 969 * tasks blocked within RCU read-side critical sections.
 970 */
 971static void rcu_print_task_stall(struct rcu_node *rnp)
 972{
 973}
 974
 975/*
 976 * Because preemptible RCU does not exist, there is no need to suppress
 977 * its CPU stall warnings.
 978 */
 979static void rcu_preempt_stall_reset(void)
 980{
 981}
 982
 983/*
 984 * Because there is no preemptible RCU, there can be no readers blocked,
 985 * so there is no need to check for blocked tasks.  So check only for
 986 * bogus qsmask values.
 987 */
 988static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 989{
 990        WARN_ON_ONCE(rnp->qsmask);
 991}
 992
 993#ifdef CONFIG_HOTPLUG_CPU
 994
 995/*
 996 * Because preemptible RCU does not exist, it never needs to migrate
 997 * tasks that were blocked within RCU read-side critical sections, and
 998 * such non-existent tasks cannot possibly have been blocking the current
 999 * grace period.
1000 */
1001static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1002                                     struct rcu_node *rnp,
1003                                     struct rcu_data *rdp)
1004{
1005        return 0;
1006}
1007
1008/*
1009 * Because preemptible RCU does not exist, it never needs CPU-offline
1010 * processing.
1011 */
1012static void rcu_preempt_offline_cpu(int cpu)
1013{
1014}
1015
1016#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1017
1018/*
1019 * Because preemptible RCU does not exist, it never has any callbacks
1020 * to check.
1021 */
1022static void rcu_preempt_check_callbacks(int cpu)
1023{
1024}
1025
1026/*
1027 * Because preemptible RCU does not exist, it never has any callbacks
1028 * to process.
1029 */
1030static void rcu_preempt_process_callbacks(void)
1031{
1032}
1033
1034/*
1035 * Wait for an rcu-preempt grace period, but make it happen quickly.
1036 * But because preemptible RCU does not exist, map to rcu-sched.
1037 */
1038void synchronize_rcu_expedited(void)
1039{
1040        synchronize_sched_expedited();
1041}
1042EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1043
1044#ifdef CONFIG_HOTPLUG_CPU
1045
1046/*
1047 * Because preemptible RCU does not exist, there is never any need to
1048 * report on tasks preempted in RCU read-side critical sections during
1049 * expedited RCU grace periods.
1050 */
1051static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
1052{
1053        return;
1054}
1055
1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1057
1058/*
1059 * Because preemptible RCU does not exist, it never has any work to do.
1060 */
1061static int rcu_preempt_pending(int cpu)
1062{
1063        return 0;
1064}
1065
1066/*
1067 * Because preemptible RCU does not exist, it never needs any CPU.
1068 */
1069static int rcu_preempt_needs_cpu(int cpu)
1070{
1071        return 0;
1072}
1073
1074/*
1075 * Because preemptible RCU does not exist, rcu_barrier() is just
1076 * another name for rcu_barrier_sched().
1077 */
1078void rcu_barrier(void)
1079{
1080        rcu_barrier_sched();
1081}
1082EXPORT_SYMBOL_GPL(rcu_barrier);
1083
1084/*
1085 * Because preemptible RCU does not exist, there is no per-CPU
1086 * data to initialize.
1087 */
1088static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1089{
1090}
1091
1092/*
1093 * Because there is no preemptible RCU, there are no callbacks to move.
1094 */
1095static void rcu_preempt_send_cbs_to_online(void)
1096{
1097}
1098
1099/*
1100 * Because preemptible RCU does not exist, it need not be initialized.
1101 */
1102static void __init __rcu_init_preempt(void)
1103{
1104}
1105
1106#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1107
1108#ifdef CONFIG_RCU_BOOST
1109
1110#include "rtmutex_common.h"
1111
1112#ifdef CONFIG_RCU_TRACE
1113
1114static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1115{
1116        if (list_empty(&rnp->blkd_tasks))
1117                rnp->n_balk_blkd_tasks++;
1118        else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1119                rnp->n_balk_exp_gp_tasks++;
1120        else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1121                rnp->n_balk_boost_tasks++;
1122        else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1123                rnp->n_balk_notblocked++;
1124        else if (rnp->gp_tasks != NULL &&
1125                 ULONG_CMP_LT(jiffies, rnp->boost_time))
1126                rnp->n_balk_notyet++;
1127        else
1128                rnp->n_balk_nos++;
1129}
1130
1131#else /* #ifdef CONFIG_RCU_TRACE */
1132
1133static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1134{
1135}
1136
1137#endif /* #else #ifdef CONFIG_RCU_TRACE */
1138
1139/*
1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1141 * or ->boost_tasks, advancing the pointer to the next task in the
1142 * ->blkd_tasks list.
1143 *
1144 * Note that irqs must be enabled: boosting the task can block.
1145 * Returns 1 if there are more tasks needing to be boosted.
1146 */
1147static int rcu_boost(struct rcu_node *rnp)
1148{
1149        unsigned long flags;
1150        struct rt_mutex mtx;
1151        struct task_struct *t;
1152        struct list_head *tb;
1153
1154        if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1155                return 0;  /* Nothing left to boost. */
1156
1157        raw_spin_lock_irqsave(&rnp->lock, flags);
1158
1159        /*
1160         * Recheck under the lock: all tasks in need of boosting
1161         * might exit their RCU read-side critical sections on their own.
1162         */
1163        if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1164                raw_spin_unlock_irqrestore(&rnp->lock, flags);
1165                return 0;
1166        }
1167
1168        /*
1169         * Preferentially boost tasks blocking expedited grace periods.
1170         * This cannot starve the normal grace periods because a second
1171         * expedited grace period must boost all blocked tasks, including
1172         * those blocking the pre-existing normal grace period.
1173         */
1174        if (rnp->exp_tasks != NULL) {
1175                tb = rnp->exp_tasks;
1176                rnp->n_exp_boosts++;
1177        } else {
1178                tb = rnp->boost_tasks;
1179                rnp->n_normal_boosts++;
1180        }
1181        rnp->n_tasks_boosted++;
1182
1183        /*
1184         * We boost task t by manufacturing an rt_mutex that appears to
1185         * be held by task t.  We leave a pointer to that rt_mutex where
1186         * task t can find it, and task t will release the mutex when it
1187         * exits its outermost RCU read-side critical section.  Then
1188         * simply acquiring this artificial rt_mutex will boost task
1189         * t's priority.  (Thanks to tglx for suggesting this approach!)
1190         *
1191         * Note that task t must acquire rnp->lock to remove itself from
1192         * the ->blkd_tasks list, which it will do from exit() if from
1193         * nowhere else.  We therefore are guaranteed that task t will
1194         * stay around at least until we drop rnp->lock.  Note that
1195         * rnp->lock also resolves races between our priority boosting
1196         * and task t's exiting its outermost RCU read-side critical
1197         * section.
1198         */
1199        t = container_of(tb, struct task_struct, rcu_node_entry);
1200        rt_mutex_init_proxy_locked(&mtx, t);
1201        t->rcu_boost_mutex = &mtx;
1202        t->rcu_boosted = 1;
1203        raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204        rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
1205        rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
1206
1207        return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1208}
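/*
 * Editor's illustration (not part of the original file): the proxy-lock
 * handshake performed above, from the viewpoints of the booster kthread
 * and the boosted reader t:
 *
 *      booster kthread                         task t (blocked reader)
 *      rt_mutex_init_proxy_locked(&mtx, t)     ...still in critical section
 *      t->rcu_boost_mutex = &mtx
 *      rt_mutex_lock(&mtx) --PI-boosts t-->    runs at booster's priority
 *                                              rcu_read_unlock()
 *                                                rcu_read_unlock_special()
 *                                                  rt_mutex_unlock(&mtx)
 *      rt_mutex_lock() returns
 *      rt_mutex_unlock(&mtx)
 */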
1209
1210/*
1211 * Timer handler to initiate waking up of boost kthreads that
1212 * have yielded the CPU due to excessive numbers of tasks to
1213 * boost.  We wake up the per-rcu_node kthread, which in turn
1214 * will wake up the booster kthread.
1215 */
1216static void rcu_boost_kthread_timer(unsigned long arg)
1217{
1218        invoke_rcu_node_kthread((struct rcu_node *)arg);
1219}
1220
1221/*
1222 * Priority-boosting kthread.  One per leaf rcu_node and one for the
1223 * root rcu_node.
1224 */
1225static int rcu_boost_kthread(void *arg)
1226{
1227        struct rcu_node *rnp = (struct rcu_node *)arg;
1228        int spincnt = 0;
1229        int more2boost;
1230
1231        for (;;) {
1232                rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1233                rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1234                rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235                more2boost = rcu_boost(rnp);
1236                if (more2boost)
1237                        spincnt++;
1238                else
1239                        spincnt = 0;
1240                if (spincnt > 10) {
1241                        rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1242                        spincnt = 0;
1243                }
1244        }
1245        /* NOTREACHED */
1246        return 0;
1247}
1248
1249/*
1250 * Check to see if it is time to start boosting RCU readers that are
1251 * blocking the current grace period, and, if so, tell the per-rcu_node
1252 * kthread to start boosting them.  If there is an expedited grace
1253 * period in progress, it is always time to boost.
1254 *
1255 * The caller must hold rnp->lock, which this function releases,
1256 * but irqs remain disabled.  The ->boost_kthread_task is immortal,
1257 * so we don't need to worry about it going away.
1258 */
1259static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1260{
1261        struct task_struct *t;
1262
1263        if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1264                rnp->n_balk_exp_gp_tasks++;
1265                raw_spin_unlock_irqrestore(&rnp->lock, flags);
1266                return;
1267        }
1268        if (rnp->exp_tasks != NULL ||
1269            (rnp->gp_tasks != NULL &&
1270             rnp->boost_tasks == NULL &&
1271             rnp->qsmask == 0 &&
1272             ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1273                if (rnp->exp_tasks == NULL)
1274                        rnp->boost_tasks = rnp->gp_tasks;
1275                raw_spin_unlock_irqrestore(&rnp->lock, flags);
1276                t = rnp->boost_kthread_task;
1277                if (t != NULL)
1278                        wake_up_process(t);
1279        } else {
1280                rcu_initiate_boost_trace(rnp);
1281                raw_spin_unlock_irqrestore(&rnp->lock, flags);
1282        }
1283}
1284
1285/*
1286 * Wake up the per-CPU kthread to invoke RCU callbacks.
1287 */
1288static void invoke_rcu_callbacks_kthread(void)
1289{
1290        unsigned long flags;
1291
1292        local_irq_save(flags);
1293        __this_cpu_write(rcu_cpu_has_work, 1);
1294        if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1295                local_irq_restore(flags);
1296                return;
1297        }
1298        wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299        local_irq_restore(flags);
1300}
1301
1302/*
1303 * Set the affinity of the boost kthread.  The CPU-hotplug locks are
1304 * held, so no one should be messing with the existence of the boost
1305 * kthread.
1306 */
1307static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1308                                          cpumask_var_t cm)
1309{
1310        struct task_struct *t;
1311
1312        t = rnp->boost_kthread_task;
1313        if (t != NULL)
1314                set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1315}
1316
1317#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1318
1319/*
1320 * Do priority-boost accounting for the start of a new grace period.
1321 */
1322static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1323{
1324        rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1325}
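/*
 * Editor's example (assumed values, not part of the original file): with
 * CONFIG_RCU_BOOST_DELAY=500 and HZ=1000, RCU_BOOST_DELAY_JIFFIES is
 * DIV_ROUND_UP(500 * 1000, 1000) = 500 jiffies, so boosting of blocked
 * readers is not attempted until roughly half a second into the grace
 * period.
 */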
1326
1327/*
1328 * Create an RCU-boost kthread for the specified node if one does not
1329 * already exist.  We only create this kthread for preemptible RCU.
1330 * Returns zero if all is well, a negated errno otherwise.
1331 */
1332static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1333                                                 struct rcu_node *rnp,
1334                                                 int rnp_index)
1335{
1336        unsigned long flags;
1337        struct sched_param sp;
1338        struct task_struct *t;
1339
1340        if (&rcu_preempt_state != rsp)
1341                return 0;
1342        rsp->boost = 1;
1343        if (rnp->boost_kthread_task != NULL)
1344                return 0;
1345        t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346                           "rcub%d", rnp_index);
1347        if (IS_ERR(t))
1348                return PTR_ERR(t);
1349        raw_spin_lock_irqsave(&rnp->lock, flags);
1350        rnp->boost_kthread_task = t;
1351        raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352        sp.sched_priority = RCU_KTHREAD_PRIO;
1353        sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354        wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355        return 0;
1356}
1357
1358#ifdef CONFIG_HOTPLUG_CPU
1359
1360/*
 1361 * Stop RCU's per-CPU kthread when its CPU goes offline.
1362 */
1363static void rcu_stop_cpu_kthread(int cpu)
1364{
1365        struct task_struct *t;
1366
1367        /* Stop the CPU's kthread. */
1368        t = per_cpu(rcu_cpu_kthread_task, cpu);
1369        if (t != NULL) {
1370                per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1371                kthread_stop(t);
1372        }
1373}
1374
1375#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1376
1377static void rcu_kthread_do_work(void)
1378{
1379        rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
1380        rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1381        rcu_preempt_do_callbacks();
1382}
1383
1384/*
1385 * Wake up the specified per-rcu_node-structure kthread.
1386 * Because the per-rcu_node kthreads are immortal, we don't need
1387 * to do anything to keep them alive.
1388 */
1389static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1390{
1391        struct task_struct *t;
1392
1393        t = rnp->node_kthread_task;
1394        if (t != NULL)
1395                wake_up_process(t);
1396}
1397
1398/*
1399 * Set the specified CPU's kthread to run RT or not, as specified by
1400 * the to_rt argument.  The CPU-hotplug locks are held, so the task
1401 * is not going away.
1402 */
1403static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1404{
1405        int policy;
1406        struct sched_param sp;
1407        struct task_struct *t;
1408
1409        t = per_cpu(rcu_cpu_kthread_task, cpu);
1410        if (t == NULL)
1411                return;
1412        if (to_rt) {
1413                policy = SCHED_FIFO;
1414                sp.sched_priority = RCU_KTHREAD_PRIO;
1415        } else {
1416                policy = SCHED_NORMAL;
1417                sp.sched_priority = 0;
1418        }
1419        sched_setscheduler_nocheck(t, policy, &sp);
1420}
1421
1422/*
1423 * Timer handler to initiate the waking up of per-CPU kthreads that
1424 * have yielded the CPU due to excess numbers of RCU callbacks.
1425 * We wake up the per-rcu_node kthread, which in turn will wake up
1426 * the booster kthread.
1427 */
1428static void rcu_cpu_kthread_timer(unsigned long arg)
1429{
1430        struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1431        struct rcu_node *rnp = rdp->mynode;
1432
1433        atomic_or(rdp->grpmask, &rnp->wakemask);
1434        invoke_rcu_node_kthread(rnp);
1435}
1436
1437/*
1438 * Drop to non-real-time priority and yield, but only after posting a
1439 * timer that will cause us to regain our real-time priority if we
1440 * remain preempted.  Either way, we restore our real-time priority
1441 * before returning.
1442 */
1443static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{
1445        struct sched_param sp;
1446        struct timer_list yield_timer;
1447
1448        setup_timer_on_stack(&yield_timer, f, arg);
1449        mod_timer(&yield_timer, jiffies + 2);
1450        sp.sched_priority = 0;
1451        sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452        set_user_nice(current, 19);
1453        schedule();
1454        sp.sched_priority = RCU_KTHREAD_PRIO;
1455        sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456        del_timer(&yield_timer);
1457}
1458
1459/*
1460 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1461 * This can happen while the corresponding CPU is either coming online
1462 * or going offline.  We cannot wait until the CPU is fully online
1463 * before starting the kthread, because the various notifier functions
1464 * can wait for RCU grace periods.  So we park rcu_cpu_kthread() until
1465 * the corresponding CPU is online.
1466 *
1467 * Return 1 if the kthread needs to stop, 0 otherwise.
1468 *
1469 * Caller must disable bh.  This function can momentarily enable it.
1470 */
1471static int rcu_cpu_kthread_should_stop(int cpu)
1472{
1473        while (cpu_is_offline(cpu) ||
1474               !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1475               smp_processor_id() != cpu) {
1476                if (kthread_should_stop())
1477                        return 1;
1478                per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1479                per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1480                local_bh_enable();
1481                schedule_timeout_uninterruptible(1);
1482                if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1483                        set_cpus_allowed_ptr(current, cpumask_of(cpu));
1484                local_bh_disable();
1485        }
1486        per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1487        return 0;
1488}
1489
1490/*
1491 * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
1492 * earlier RCU softirq.
1493 */
1494static int rcu_cpu_kthread(void *arg)
1495{
1496        int cpu = (int)(long)arg;
1497        unsigned long flags;
1498        int spincnt = 0;
1499        unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1500        char work;
1501        char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502
1503        for (;;) {
1504                *statusp = RCU_KTHREAD_WAITING;
1505                rcu_wait(*workp != 0 || kthread_should_stop());
1506                local_bh_disable();
1507                if (rcu_cpu_kthread_should_stop(cpu)) {
1508                        local_bh_enable();
1509                        break;
1510                }
1511                *statusp = RCU_KTHREAD_RUNNING;
1512                per_cpu(rcu_cpu_kthread_loops, cpu)++;
1513                local_irq_save(flags);
1514                work = *workp;
1515                *workp = 0;
1516                local_irq_restore(flags);
1517                if (work)
1518                        rcu_kthread_do_work();
1519                local_bh_enable();
1520                if (*workp != 0)
1521                        spincnt++;
1522                else
1523                        spincnt = 0;
1524                if (spincnt > 10) {
1525                        *statusp = RCU_KTHREAD_YIELDING;
1526                        rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1527                        spincnt = 0;
1528                }
1529        }
1530        *statusp = RCU_KTHREAD_STOPPED;
1531        return 0;
1532}
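/*
 * The spincnt heuristic above is a starvation guard: if new work has
 * already arrived at the end of more than ten consecutive passes
 * through the loop, the kthread assumes it is monopolizing the CPU and
 * briefly drops to SCHED_NORMAL via rcu_yield() before continuing.
 */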
1533
1534/*
1535 * Spawn a per-CPU kthread, setting up affinity and priority.
1536 * Because the CPU hotplug lock is held, no other CPU will be attempting
1537 * to manipulate rcu_cpu_kthread_task.  There might be another CPU
1538 * attempting to access it during boot, but the locking in kthread_bind()
1539 * will enforce sufficient ordering.
1540 *
1541 * Please note that we cannot simply refuse to wake up the per-CPU
1542 * kthread: kthreads are created in TASK_UNINTERRUPTIBLE state, so a
1543 * kthread left unwoken can result in softlockup complaints if it
1544 * remains idle for more than a couple of minutes.
1545 *
1546 * However, please note also that we cannot bind the per-CPU kthread to its
1547 * CPU until that CPU is fully online.  We also cannot wait until the
1548 * CPU is fully online before we create its per-CPU kthread, as this would
1549 * deadlock the system when CPU notifiers tried waiting for grace
1550 * periods.  So we bind the per-CPU kthread to its CPU only if the CPU
1551 * is online.  If its CPU is not yet fully online, then the code in
1552 * rcu_cpu_kthread() will wait until it is fully online, and then do
1553 * the binding.
1554 */
1555static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1556{
1557        struct sched_param sp;
1558        struct task_struct *t;
1559
1560        if (!rcu_scheduler_fully_active ||
1561            per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562                return 0;
1563        t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1564        if (IS_ERR(t))
1565                return PTR_ERR(t);
1566        if (cpu_online(cpu))
1567                kthread_bind(t, cpu);
1568        per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1569        WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1570        sp.sched_priority = RCU_KTHREAD_PRIO;
1571        sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1572        per_cpu(rcu_cpu_kthread_task, cpu) = t;
1573        wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1574        return 0;
1575}
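/*
 * Note the interplay with rcu_cpu_kthread_should_stop(): if the CPU is
 * not yet online at this point, the kthread is left unbound here, and
 * the set_cpus_allowed_ptr() call in rcu_cpu_kthread_should_stop()
 * migrates it onto its CPU once that CPU comes fully online.
 */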
1576
1577/*
1578 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1579 * kthreads when needed.  We ignore requests to wake up kthreads
1580 * for offline CPUs, which is OK because force_quiescent_state()
1581 * takes care of this case.
1582 */
1583static int rcu_node_kthread(void *arg)
1584{
1585        int cpu;
1586        unsigned long flags;
1587        unsigned long mask;
1588        struct rcu_node *rnp = (struct rcu_node *)arg;
1589        struct sched_param sp;
1590        struct task_struct *t;
1591
1592        for (;;) {
1593                rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1594                rcu_wait(atomic_read(&rnp->wakemask) != 0);
1595                rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1596                raw_spin_lock_irqsave(&rnp->lock, flags);
1597                mask = atomic_xchg(&rnp->wakemask, 0);
1598                rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1599                for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1600                        if ((mask & 0x1) == 0)
1601                                continue;
1602                        preempt_disable();
1603                        t = per_cpu(rcu_cpu_kthread_task, cpu);
1604                        if (!cpu_online(cpu) || t == NULL) {
1605                                preempt_enable();
1606                                continue;
1607                        }
1608                        per_cpu(rcu_cpu_has_work, cpu) = 1;
1609                        sp.sched_priority = RCU_KTHREAD_PRIO;
1610                        sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1611                        preempt_enable();
1612                }
1613        }
1614        /* NOTREACHED */
1615        rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1616        return 0;
1617}
1618
1619/*
1620 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1621 * served by the rcu_node in question.  The CPU hotplug lock is still
1622 * held, so the value of rnp->qsmaskinit will be stable.
1623 *
1624 * We don't include outgoingcpu in the affinity set; callers pass -1 if
1625 * there is no outgoing CPU.  If no CPUs are left in the affinity set,
1626 * this function allows the kthread to execute on any CPU.
1627 */
1628static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1629{
1630        cpumask_var_t cm;
1631        int cpu;
1632        unsigned long mask = rnp->qsmaskinit;
1633
1634        if (rnp->node_kthread_task == NULL)
1635                return;
1636        if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1637                return;
1638        cpumask_clear(cm);
1639        for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1640                if ((mask & 0x1) && cpu != outgoingcpu)
1641                        cpumask_set_cpu(cpu, cm);
1642        if (cpumask_weight(cm) == 0) {
1643                cpumask_setall(cm);
1644                for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1645                        cpumask_clear_cpu(cpu, cm);
1646                WARN_ON_ONCE(cpumask_weight(cm) == 0);
1647        }
1648        set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1649        rcu_boost_kthread_setaffinity(rnp, cm);
1650        free_cpumask_var(cm);
1651}
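/*
 * For example, if the outgoing CPU is the only CPU remaining in
 * rnp->qsmaskinit, cm would otherwise end up empty; the fallback above
 * instead sets every CPU *outside* the node's grplo..grphi range,
 * letting the scheduler run the kthreads anywhere else in the system.
 */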
1652
1653/*
1654 * Spawn a per-rcu_node kthread, setting priority and affinity.
1655 * Called during boot before online/offline can happen, or, if
1656 * during runtime, with the main CPU-hotplug locks held.  So only
1657 * one of these can be executing at a time.
1658 */
1659static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1660                                                struct rcu_node *rnp)
1661{
1662        unsigned long flags;
1663        int rnp_index = rnp - &rsp->node[0];
1664        struct sched_param sp;
1665        struct task_struct *t;
1666
1667        if (!rcu_scheduler_fully_active ||
1668            rnp->qsmaskinit == 0)
1669                return 0;
1670        if (rnp->node_kthread_task == NULL) {
1671                t = kthread_create(rcu_node_kthread, (void *)rnp,
1672                                   "rcun%d", rnp_index);
1673                if (IS_ERR(t))
1674                        return PTR_ERR(t);
1675                raw_spin_lock_irqsave(&rnp->lock, flags);
1676                rnp->node_kthread_task = t;
1677                raw_spin_unlock_irqrestore(&rnp->lock, flags);
1678                sp.sched_priority = 99;
1679                sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1680                wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1681        }
1682        return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1683}
1684
1685/*
1686 * Spawn all kthreads -- called as soon as the scheduler is running.
1687 */
1688static int __init rcu_spawn_kthreads(void)
1689{
1690        int cpu;
1691        struct rcu_node *rnp;
1692
1693        rcu_scheduler_fully_active = 1;
1694        for_each_possible_cpu(cpu) {
1695                per_cpu(rcu_cpu_has_work, cpu) = 0;
1696                if (cpu_online(cpu))
1697                        (void)rcu_spawn_one_cpu_kthread(cpu);
1698        }
1699        rnp = rcu_get_root(rcu_state);
1700        (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1701        if (NUM_RCU_NODES > 1) {
1702                rcu_for_each_leaf_node(rcu_state, rnp)
1703                        (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1704        }
1705        return 0;
1706}
1707early_initcall(rcu_spawn_kthreads);
1708
1709static void __cpuinit rcu_prepare_kthreads(int cpu)
1710{
1711        struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1712        struct rcu_node *rnp = rdp->mynode;
1713
1714        /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1715        if (rcu_scheduler_fully_active) {
1716                (void)rcu_spawn_one_cpu_kthread(cpu);
1717                if (rnp->node_kthread_task == NULL)
1718                        (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1719        }
1720}
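/*
 * rcu_spawn_kthreads() above covers the CPUs that are already online
 * when the early_initcall runs; rcu_prepare_kthreads() is called from
 * the CPU-hotplug preparation code in rcutree.c so that CPUs brought
 * online later get their per-CPU and leaf rcu_node kthreads as well.
 */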
1721
1722#else /* #ifdef CONFIG_RCU_BOOST */
1723
1724static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1725{
1726        raw_spin_unlock_irqrestore(&rnp->lock, flags);
1727}
1728
1729static void invoke_rcu_callbacks_kthread(void)
1730{
1731        WARN_ON_ONCE(1);
1732}
1733
1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1735{
1736}
1737
1738#ifdef CONFIG_HOTPLUG_CPU
1739
1740static void rcu_stop_cpu_kthread(int cpu)
1741{
1742}
1743
1744#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1745
1746static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1747{
1748}
1749
1750static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1751{
1752}
1753
1754static int __init rcu_scheduler_really_started(void)
1755{
1756        rcu_scheduler_fully_active = 1;
1757        return 0;
1758}
1759early_initcall(rcu_scheduler_really_started);
1760
1761static void __cpuinit rcu_prepare_kthreads(int cpu)
1762{
1763}
1764
1765#endif /* #else #ifdef CONFIG_RCU_BOOST */
1766
1767#ifndef CONFIG_SMP
1768
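/*
 * On a !SMP build there is little to expedite: rcu-sched read-side
 * critical sections run with preemption disabled, and a caller that is
 * allowed to block here cannot itself be within one, so on the sole
 * CPU any pre-existing reader has already completed.  A cond_resched()
 * is all that remains to be done.
 */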
1769void synchronize_sched_expedited(void)
1770{
1771        cond_resched();
1772}
1773EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1774
1775#else /* #ifndef CONFIG_SMP */
1776
1777static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1778static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1779
1780static int synchronize_sched_expedited_cpu_stop(void *data)
1781{
1782        /*
1783         * There must be a full memory barrier on each affected CPU
1784         * between the time that try_stop_cpus() is called and the
1785         * time that it returns.
1786         *
1787         * In the current initial implementation of cpu_stop, the
1788         * above condition is already met when the control reaches
1789         * this point and the following smp_mb() is not strictly
1790         * necessary.  Do smp_mb() anyway for documentation and
1791         * robustness against future implementation changes.
1792         */
1793        smp_mb(); /* See above comment block. */
1794        return 0;
1795}
1796
1797/*
1798 * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
1799 * approach to force the grace period to end quickly.  This consumes
1800 * significant time on all CPUs and is thus not recommended for any
1801 * sort of common-case code.
1802 *
1803 * Note that it is illegal to call this function while holding any
1804 * lock that is acquired by a CPU-hotplug notifier.  Failing to
1805 * observe this restriction will result in deadlock.
1806 *
1807 * This implementation can be thought of as an application of ticket
1808 * locking to RCU, with sync_sched_expedited_started and
1809 * sync_sched_expedited_done taking on the roles of the halves
1810 * of the ticket-lock word.  Each task atomically increments
1811 * sync_sched_expedited_started upon entry, snapshotting the old value,
1812 * then attempts to stop all the CPUs.  If this succeeds, then each
1813 * CPU will have executed a context switch, resulting in an RCU-sched
1814 * grace period.  We are then done, so we use atomic_cmpxchg() to
1815 * update sync_sched_expedited_done to match our snapshot -- but
1816 * only if someone else has not already advanced past our snapshot.
1817 *
1818 * On the other hand, if try_stop_cpus() fails, we check the value
1819 * of sync_sched_expedited_done.  If it has advanced past our
1820 * initial snapshot, then someone else must have forced a grace period
1821 * some time after we took our snapshot.  In this case, our work is
1822 * done for us, and we can simply return.  Otherwise, we try again,
1823 * but keep our initial snapshot for purposes of checking for someone
1824 * doing our work for us.
1825 *
1826 * If we fail too many times in a row, we fall back to synchronize_sched().
1827 */
1828void synchronize_sched_expedited(void)
1829{
1830        int firstsnap, s, snap, trycount = 0;
1831
1832        /* Note that atomic_inc_return() implies full memory barrier. */
1833        firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1834        get_online_cpus();
1835
1836        /*
1837         * Each pass through the following loop attempts to force a
1838         * context switch on each CPU.
1839         */
1840        while (try_stop_cpus(cpu_online_mask,
1841                             synchronize_sched_expedited_cpu_stop,
1842                             NULL) == -EAGAIN) {
1843                put_online_cpus();
1844
1845                /* No joy, try again later.  Or just synchronize_sched(). */
1846                if (trycount++ < 10)
1847                        udelay(trycount * num_online_cpus());
1848                else {
1849                        synchronize_sched();
1850                        return;
1851                }
1852
1853                /* Check to see if someone else did our work for us. */
1854                s = atomic_read(&sync_sched_expedited_done);
1855                if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1856                        smp_mb(); /* ensure test happens before caller kfree */
1857                        return;
1858                }
1859
1860                /*
1861                 * Refetching sync_sched_expedited_started allows later
1862                 * callers to piggyback on our grace period.  We subtract
1863                 * 1 to get the same token that the last incrementer got.
1864                 * We retry after they started, so our grace period works
1865                 * for them, and they started after our first try, so their
1866                 * grace period works for us.
1867                 */
1868                get_online_cpus();
1869                snap = atomic_read(&sync_sched_expedited_started) - 1;
1870                smp_mb(); /* ensure read is before try_stop_cpus(). */
1871        }
1872
1873        /*
1874         * Everyone up to our most recent fetch is covered by our grace
1875         * period.  Update the counter, but only if our work is still
1876         * relevant -- which it won't be if someone who started later
1877         * than we did beat us to the punch.
1878         */
1879        do {
1880                s = atomic_read(&sync_sched_expedited_done);
1881                if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1882                        smp_mb(); /* ensure test happens before caller kfree */
1883                        break;
1884                }
1885        } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1886
1887        put_online_cpus();
1888}
1889EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
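/*
 * A rough worked example of the ticketing above: task A enters and gets
 * firstsnap = 1, but its try_stop_cpus() fails.  Meanwhile task B
 * enters with snapshot 2, stops the CPUs, and advances
 * sync_sched_expedited_done to 2.  On its next pass, A observes
 * done (2) >= firstsnap (1): the context switches that B forced all
 * happened after A began waiting, so B's expedited grace period serves
 * A as well, and A returns without stopping the CPUs itself.
 */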
1890
1891#endif /* #else #ifndef CONFIG_SMP */
1892
1893#if !defined(CONFIG_RCU_FAST_NO_HZ)
1894
1895/*
1896 * Check to see if any future RCU-related work will need to be done
1897 * by the current CPU, even if none need be done immediately, returning
1898 * 1 if so.  This function is part of the RCU implementation; it is -not-
1899 * an exported member of the RCU API.
1900 *
1901 * Because we have preemptible RCU, just check whether this CPU needs
1902 * any flavor of RCU.  Do not chew up lots of CPU cycles with preemption
1903 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1904 */
1905int rcu_needs_cpu(int cpu)
1906{
1907        return rcu_needs_cpu_quick_check(cpu);
1908}
1909
1910/*
1911 * Check to see if we need to continue a callback-flush operation to
1912 * allow the last CPU to enter dyntick-idle mode.  But fast dyntick-idle
1913 * entry is not configured, so we never need to.
1914 */
1915static void rcu_needs_cpu_flush(void)
1916{
1917}
1918
1919#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1920
1921#define RCU_NEEDS_CPU_FLUSHES 5
1922static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1923static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1924
1925/*
1926 * Check to see if any future RCU-related work will need to be done
1927 * by the current CPU, even if none need be done immediately, returning
1928 * 1 if so.  This function is part of the RCU implementation; it is -not-
1929 * an exported member of the RCU API.
1930 *
1931 * Because we are not supporting preemptible RCU, attempt to accelerate
1932 * any current grace periods so that RCU no longer needs this CPU, but
1933 * only if all other CPUs are already in dynticks-idle mode.  This will
1934 * allow the CPU cores to be powered down immediately, as opposed to after
1935 * waiting many milliseconds for grace periods to elapse.
1936 *
1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1938 * disabled, we do one pass of force_quiescent_state(), then use
1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1940 * later.  The per-CPU rcu_dyntick_drain variable controls the sequencing.
1941 */
1942int rcu_needs_cpu(int cpu)
1943{
1944        int c = 0;
1945        int snap;
1946        int thatcpu;
1947
1948        /* Check for being in the holdoff period. */
1949        if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1950                return rcu_needs_cpu_quick_check(cpu);
1951
1952        /* Don't bother unless we are the last non-dyntick-idle CPU. */
1953        for_each_online_cpu(thatcpu) {
1954                if (thatcpu == cpu)
1955                        continue;
1956                snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1957                                                     thatcpu).dynticks);
1958                smp_mb(); /* Order sampling of snap with end of grace period. */
1959                if ((snap & 0x1) != 0) {
1960                        per_cpu(rcu_dyntick_drain, cpu) = 0;
1961                        per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1962                        return rcu_needs_cpu_quick_check(cpu);
1963                }
1964        }
1965
1966        /* Check and update the rcu_dyntick_drain sequencing. */
1967        if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1968                /* First time through, initialize the counter. */
1969                per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1970        } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1971                /* We have hit the limit, so time to give up. */
1972                per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1973                return rcu_needs_cpu_quick_check(cpu);
1974        }
1975
1976        /* Do one step pushing remaining RCU callbacks through. */
1977        if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1978                rcu_sched_qs(cpu);
1979                force_quiescent_state(&rcu_sched_state, 0);
1980                c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1981        }
1982        if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1983                rcu_bh_qs(cpu);
1984                force_quiescent_state(&rcu_bh_state, 0);
1985                c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1986        }
1987
1988        /* If RCU callbacks are still pending, RCU still needs this CPU. */
1989        if (c)
1990                invoke_rcu_core();
1991        return c;
1992}
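/*
 * For example, on an otherwise-idle system the first call here sets
 * rcu_dyntick_drain to RCU_NEEDS_CPU_FLUSHES (5) and does one push of
 * the remaining callbacks; each later call from the dyntick-idle entry
 * path decrements the counter and pushes again.  If callbacks are still
 * pending after those five attempts, the CPU records jiffies in
 * rcu_dyntick_holdoff and defers to rcu_needs_cpu_quick_check() for the
 * rest of that jiffy.
 */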
1993
1994/*
1995 * Check to see if we need to continue a callback-flush operation to
1996 * allow the last CPU to enter dyntick-idle mode.
1997 */
1998static void rcu_needs_cpu_flush(void)
1999{
2000        int cpu = smp_processor_id();
2001        unsigned long flags;
2002
2003        if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
2004                return;
2005        local_irq_save(flags);
2006        (void)rcu_needs_cpu(cpu);
2007        local_irq_restore(flags);
2008}
2009
2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2011