linux/kernel/rcutree_plugin.h
   1/*
   2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
   3 * Internal non-public definitions that provide either classic
   4 * or preemptible semantics.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  19 *
  20 * Copyright Red Hat, 2009
  21 * Copyright IBM Corporation, 2009
  22 *
  23 * Author: Ingo Molnar <mingo@elte.hu>
  24 *         Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  25 */
  26
  27#include <linux/delay.h>
  28#include <linux/oom.h>
  29#include <linux/smpboot.h>
  30
  31#define RCU_KTHREAD_PRIO 1
  32
  33#ifdef CONFIG_RCU_BOOST
  34#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
  35#else
  36#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
  37#endif
  38
  39/*
  40 * Check the RCU kernel configuration parameters and print informative
  41 * messages about anything out of the ordinary.  If you like #ifdef, you
  42 * will love this function.
  43 */
  44static void __init rcu_bootup_announce_oddness(void)
  45{
  46#ifdef CONFIG_RCU_TRACE
  47        printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
  48#endif
  49#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
  50        printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
  51               CONFIG_RCU_FANOUT);
  52#endif
  53#ifdef CONFIG_RCU_FANOUT_EXACT
  54        printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
  55#endif
  56#ifdef CONFIG_RCU_FAST_NO_HZ
  57        printk(KERN_INFO
  58               "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
  59#endif
  60#ifdef CONFIG_PROVE_RCU
  61        printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
  62#endif
  63#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
  64        printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
  65#endif
  66#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
  67        printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n");
  68#endif
  69#if defined(CONFIG_RCU_CPU_STALL_INFO)
  70        printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
  71#endif
  72#if NUM_RCU_LVL_4 != 0
  73        printk(KERN_INFO "\tFour-level hierarchy is enabled.\n");
  74#endif
  75        if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
  76                printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
  77        if (nr_cpu_ids != NR_CPUS)
  78                printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
  79}
  80
  81#ifdef CONFIG_TREE_PREEMPT_RCU
  82
  83struct rcu_state rcu_preempt_state =
  84        RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
  85DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
  86static struct rcu_state *rcu_state = &rcu_preempt_state;
  87
  88static int rcu_preempted_readers_exp(struct rcu_node *rnp);
  89
  90/*
  91 * Tell them what RCU they are running.
  92 */
  93static void __init rcu_bootup_announce(void)
  94{
  95        printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
  96        rcu_bootup_announce_oddness();
  97}
  98
  99/*
 100 * Return the number of RCU-preempt batches processed thus far
 101 * for debug and statistics.
 102 */
 103long rcu_batches_completed_preempt(void)
 104{
 105        return rcu_preempt_state.completed;
 106}
 107EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
 108
 109/*
 110 * Return the number of RCU batches processed thus far for debug & stats.
 111 */
 112long rcu_batches_completed(void)
 113{
 114        return rcu_batches_completed_preempt();
 115}
 116EXPORT_SYMBOL_GPL(rcu_batches_completed);
 117
 118/*
 119 * Force a quiescent state for preemptible RCU.
 120 */
 121void rcu_force_quiescent_state(void)
 122{
 123        force_quiescent_state(&rcu_preempt_state);
 124}
 125EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 126
 127/*
  128 * Record a preemptible-RCU quiescent state for the specified CPU.  Note
  129 * that this by itself does not mean that the current grace period can
  130 * end:  there might be any number of tasks still blocked while in an
  131 * RCU read-side critical section.
 132 *
  133 * Unlike the other rcu_*_qs() functions, callers of this function
 134 * must disable irqs in order to protect the assignment to
 135 * ->rcu_read_unlock_special.
 136 */
 137static void rcu_preempt_qs(int cpu)
 138{
 139        struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
 140
 141        if (rdp->passed_quiesce == 0)
 142                trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
 143        rdp->passed_quiesce = 1;
 144        current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 145}
 146
 147/*
 148 * We have entered the scheduler, and the current task might soon be
 149 * context-switched away from.  If this task is in an RCU read-side
 150 * critical section, we will no longer be able to rely on the CPU to
 151 * record that fact, so we enqueue the task on the blkd_tasks list.
 152 * The task will dequeue itself when it exits the outermost enclosing
 153 * RCU read-side critical section.  Therefore, the current grace period
 154 * cannot be permitted to complete until the blkd_tasks list entries
 155 * predating the current grace period drain, in other words, until
 156 * rnp->gp_tasks becomes NULL.
 157 *
 158 * Caller must disable preemption.
 159 */
 160static void rcu_preempt_note_context_switch(int cpu)
 161{
 162        struct task_struct *t = current;
 163        unsigned long flags;
 164        struct rcu_data *rdp;
 165        struct rcu_node *rnp;
 166
 167        if (t->rcu_read_lock_nesting > 0 &&
 168            (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
 169
 170                /* Possibly blocking in an RCU read-side critical section. */
 171                rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
 172                rnp = rdp->mynode;
 173                raw_spin_lock_irqsave(&rnp->lock, flags);
 174                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
 175                t->rcu_blocked_node = rnp;
 176
 177                /*
 178                 * If this CPU has already checked in, then this task
 179                 * will hold up the next grace period rather than the
 180                 * current grace period.  Queue the task accordingly.
 181                 * If the task is queued for the current grace period
 182                 * (i.e., this CPU has not yet passed through a quiescent
 183                 * state for the current grace period), then as long
 184                 * as that task remains queued, the current grace period
 185                 * cannot end.  Note that there is some uncertainty as
 186                 * to exactly when the current grace period started.
 187                 * We take a conservative approach, which can result
 188                 * in unnecessarily waiting on tasks that started very
 189                 * slightly after the current grace period began.  C'est
 190                 * la vie!!!
 191                 *
 192                 * But first, note that the current CPU must still be
 193                 * on line!
 194                 */
 195                WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
 196                WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
 197                if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
 198                        list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
 199                        rnp->gp_tasks = &t->rcu_node_entry;
 200#ifdef CONFIG_RCU_BOOST
 201                        if (rnp->boost_tasks != NULL)
 202                                rnp->boost_tasks = rnp->gp_tasks;
 203#endif /* #ifdef CONFIG_RCU_BOOST */
 204                } else {
 205                        list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
 206                        if (rnp->qsmask & rdp->grpmask)
 207                                rnp->gp_tasks = &t->rcu_node_entry;
 208                }
 209                trace_rcu_preempt_task(rdp->rsp->name,
 210                                       t->pid,
 211                                       (rnp->qsmask & rdp->grpmask)
 212                                       ? rnp->gpnum
 213                                       : rnp->gpnum + 1);
 214                raw_spin_unlock_irqrestore(&rnp->lock, flags);
 215        } else if (t->rcu_read_lock_nesting < 0 &&
 216                   t->rcu_read_unlock_special) {
 217
 218                /*
 219                 * Complete exit from RCU read-side critical section on
 220                 * behalf of preempted instance of __rcu_read_unlock().
 221                 */
 222                rcu_read_unlock_special(t);
 223        }
 224
 225        /*
 226         * Either we were not in an RCU read-side critical section to
 227         * begin with, or we have now recorded that critical section
 228         * globally.  Either way, we can now note a quiescent state
 229         * for this CPU.  Again, if we were in an RCU read-side critical
 230         * section, and if that critical section was blocking the current
 231         * grace period, then the fact that the task has been enqueued
 232         * means that we continue to block the current grace period.
 233         */
 234        local_irq_save(flags);
 235        rcu_preempt_qs(cpu);
 236        local_irq_restore(flags);
 237}
 238
 239/*
 240 * Check for preempted RCU readers blocking the current grace period
 241 * for the specified rcu_node structure.  If the caller needs a reliable
 242 * answer, it must hold the rcu_node's ->lock.
 243 */
 244static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 245{
 246        return rnp->gp_tasks != NULL;
 247}
 248
 249/*
 250 * Record a quiescent state for all tasks that were previously queued
 251 * on the specified rcu_node structure and that were blocking the current
 252 * RCU grace period.  The caller must hold the specified rnp->lock with
 253 * irqs disabled, and this lock is released upon return, but irqs remain
 254 * disabled.
 255 */
 256static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 257        __releases(rnp->lock)
 258{
 259        unsigned long mask;
 260        struct rcu_node *rnp_p;
 261
 262        if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 263                raw_spin_unlock_irqrestore(&rnp->lock, flags);
 264                return;  /* Still need more quiescent states! */
 265        }
 266
 267        rnp_p = rnp->parent;
 268        if (rnp_p == NULL) {
 269                /*
 270                 * Either there is only one rcu_node in the tree,
 271                 * or tasks were kicked up to root rcu_node due to
 272                 * CPUs going offline.
 273                 */
 274                rcu_report_qs_rsp(&rcu_preempt_state, flags);
 275                return;
 276        }
 277
 278        /* Report up the rest of the hierarchy. */
 279        mask = rnp->grpmask;
 280        raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
 281        raw_spin_lock(&rnp_p->lock);    /* irqs already disabled. */
 282        rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
 283}
 284
 285/*
  286 * Advance a ->blkd_tasks-list pointer to the next entry, returning
  287 * NULL instead if the end of the list has been reached.
 288 */
 289static struct list_head *rcu_next_node_entry(struct task_struct *t,
 290                                             struct rcu_node *rnp)
 291{
 292        struct list_head *np;
 293
 294        np = t->rcu_node_entry.next;
 295        if (np == &rnp->blkd_tasks)
 296                np = NULL;
 297        return np;
 298}
 299
 300/*
 301 * Handle special cases during rcu_read_unlock(), such as needing to
 302 * notify RCU core processing or task having blocked during the RCU
 303 * read-side critical section.
 304 */
 305void rcu_read_unlock_special(struct task_struct *t)
 306{
 307        int empty;
 308        int empty_exp;
 309        int empty_exp_now;
 310        unsigned long flags;
 311        struct list_head *np;
 312#ifdef CONFIG_RCU_BOOST
 313        struct rt_mutex *rbmp = NULL;
 314#endif /* #ifdef CONFIG_RCU_BOOST */
 315        struct rcu_node *rnp;
 316        int special;
 317
 318        /* NMI handlers cannot block and cannot safely manipulate state. */
 319        if (in_nmi())
 320                return;
 321
 322        local_irq_save(flags);
 323
 324        /*
 325         * If RCU core is waiting for this CPU to exit critical section,
 326         * let it know that we have done so.
 327         */
 328        special = t->rcu_read_unlock_special;
 329        if (special & RCU_READ_UNLOCK_NEED_QS) {
 330                rcu_preempt_qs(smp_processor_id());
 331        }
 332
 333        /* Hardware IRQ handlers cannot block. */
 334        if (in_irq() || in_serving_softirq()) {
 335                local_irq_restore(flags);
 336                return;
 337        }
 338
 339        /* Clean up if blocked during RCU read-side critical section. */
 340        if (special & RCU_READ_UNLOCK_BLOCKED) {
 341                t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
 342
 343                /*
 344                 * Remove this task from the list it blocked on.  The
 345                 * task can migrate while we acquire the lock, but at
  346                 * most one time, so at most two passes through the loop.
 347                 */
 348                for (;;) {
 349                        rnp = t->rcu_blocked_node;
 350                        raw_spin_lock(&rnp->lock);  /* irqs already disabled. */
 351                        if (rnp == t->rcu_blocked_node)
 352                                break;
 353                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 354                }
 355                empty = !rcu_preempt_blocked_readers_cgp(rnp);
 356                empty_exp = !rcu_preempted_readers_exp(rnp);
 357                smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 358                np = rcu_next_node_entry(t, rnp);
 359                list_del_init(&t->rcu_node_entry);
 360                t->rcu_blocked_node = NULL;
 361                trace_rcu_unlock_preempted_task("rcu_preempt",
 362                                                rnp->gpnum, t->pid);
 363                if (&t->rcu_node_entry == rnp->gp_tasks)
 364                        rnp->gp_tasks = np;
 365                if (&t->rcu_node_entry == rnp->exp_tasks)
 366                        rnp->exp_tasks = np;
 367#ifdef CONFIG_RCU_BOOST
 368                if (&t->rcu_node_entry == rnp->boost_tasks)
 369                        rnp->boost_tasks = np;
 370                /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
 371                if (t->rcu_boost_mutex) {
 372                        rbmp = t->rcu_boost_mutex;
 373                        t->rcu_boost_mutex = NULL;
 374                }
 375#endif /* #ifdef CONFIG_RCU_BOOST */
 376
 377                /*
 378                 * If this was the last task on the current list, and if
 379                 * we aren't waiting on any CPUs, report the quiescent state.
 380                 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
 381                 * so we must take a snapshot of the expedited state.
 382                 */
 383                empty_exp_now = !rcu_preempted_readers_exp(rnp);
 384                if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
 385                        trace_rcu_quiescent_state_report("preempt_rcu",
 386                                                         rnp->gpnum,
 387                                                         0, rnp->qsmask,
 388                                                         rnp->level,
 389                                                         rnp->grplo,
 390                                                         rnp->grphi,
 391                                                         !!rnp->gp_tasks);
 392                        rcu_report_unblock_qs_rnp(rnp, flags);
 393                } else {
 394                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 395                }
 396
 397#ifdef CONFIG_RCU_BOOST
 398                /* Unboost if we were boosted. */
 399                if (rbmp)
 400                        rt_mutex_unlock(rbmp);
 401#endif /* #ifdef CONFIG_RCU_BOOST */
 402
 403                /*
 404                 * If this was the last task on the expedited lists,
 405                 * then we need to report up the rcu_node hierarchy.
 406                 */
 407                if (!empty_exp && empty_exp_now)
 408                        rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
 409        } else {
 410                local_irq_restore(flags);
 411        }
 412}
 413
 414#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
 415
 416/*
 417 * Dump detailed information for all tasks blocking the current RCU
 418 * grace period on the specified rcu_node structure.
 419 */
 420static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 421{
 422        unsigned long flags;
 423        struct task_struct *t;
 424
 425        raw_spin_lock_irqsave(&rnp->lock, flags);
 426        if (!rcu_preempt_blocked_readers_cgp(rnp)) {
 427                raw_spin_unlock_irqrestore(&rnp->lock, flags);
 428                return;
 429        }
 430        t = list_entry(rnp->gp_tasks,
 431                       struct task_struct, rcu_node_entry);
 432        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
 433                sched_show_task(t);
 434        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 435}
 436
 437/*
 438 * Dump detailed information for all tasks blocking the current RCU
 439 * grace period.
 440 */
 441static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 442{
 443        struct rcu_node *rnp = rcu_get_root(rsp);
 444
 445        rcu_print_detail_task_stall_rnp(rnp);
 446        rcu_for_each_leaf_node(rsp, rnp)
 447                rcu_print_detail_task_stall_rnp(rnp);
 448}
 449
 450#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
 451
 452static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 453{
 454}
 455
 456#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
 457
 458#ifdef CONFIG_RCU_CPU_STALL_INFO
 459
 460static void rcu_print_task_stall_begin(struct rcu_node *rnp)
 461{
 462        printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
 463               rnp->level, rnp->grplo, rnp->grphi);
 464}
 465
 466static void rcu_print_task_stall_end(void)
 467{
 468        printk(KERN_CONT "\n");
 469}
 470
 471#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
 472
 473static void rcu_print_task_stall_begin(struct rcu_node *rnp)
 474{
 475}
 476
 477static void rcu_print_task_stall_end(void)
 478{
 479}
 480
 481#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
 482
 483/*
 484 * Scan the current list of tasks blocked within RCU read-side critical
 485 * sections, printing out the tid of each.
 486 */
 487static int rcu_print_task_stall(struct rcu_node *rnp)
 488{
 489        struct task_struct *t;
 490        int ndetected = 0;
 491
 492        if (!rcu_preempt_blocked_readers_cgp(rnp))
 493                return 0;
 494        rcu_print_task_stall_begin(rnp);
 495        t = list_entry(rnp->gp_tasks,
 496                       struct task_struct, rcu_node_entry);
 497        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
 498                printk(KERN_CONT " P%d", t->pid);
 499                ndetected++;
 500        }
 501        rcu_print_task_stall_end();
 502        return ndetected;
 503}
 504
 505/*
 506 * Check that the list of blocked tasks for the newly completed grace
 507 * period is in fact empty.  It is a serious bug to complete a grace
 508 * period that still has RCU readers blocked!  This function must be
 509 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
 510 * must be held by the caller.
 511 *
 512 * Also, if there are blocked tasks on the list, they automatically
 513 * block the newly created grace period, so set up ->gp_tasks accordingly.
 514 */
 515static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 516{
 517        WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
 518        if (!list_empty(&rnp->blkd_tasks))
 519                rnp->gp_tasks = rnp->blkd_tasks.next;
 520        WARN_ON_ONCE(rnp->qsmask);
 521}
 522
 523#ifdef CONFIG_HOTPLUG_CPU
 524
 525/*
 526 * Handle tasklist migration for case in which all CPUs covered by the
 527 * specified rcu_node have gone offline.  Move them up to the root
 528 * rcu_node.  The reason for not just moving them to the immediate
 529 * parent is to remove the need for rcu_read_unlock_special() to
 530 * make more than two attempts to acquire the target rcu_node's lock.
 531 * Returns true if there were tasks blocking the current RCU grace
 532 * period.
 533 *
 534 * Returns 1 if there was previously a task blocking the current grace
 535 * period on the specified rcu_node structure.
 536 *
 537 * The caller must hold rnp->lock with irqs disabled.
 538 */
 539static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 540                                     struct rcu_node *rnp,
 541                                     struct rcu_data *rdp)
 542{
 543        struct list_head *lp;
 544        struct list_head *lp_root;
 545        int retval = 0;
 546        struct rcu_node *rnp_root = rcu_get_root(rsp);
 547        struct task_struct *t;
 548
 549        if (rnp == rnp_root) {
 550                WARN_ONCE(1, "Last CPU thought to be offlined?");
 551                return 0;  /* Shouldn't happen: at least one CPU online. */
 552        }
 553
 554        /* If we are on an internal node, complain bitterly. */
 555        WARN_ON_ONCE(rnp != rdp->mynode);
 556
 557        /*
 558         * Move tasks up to root rcu_node.  Don't try to get fancy for
 559         * this corner-case operation -- just put this node's tasks
 560         * at the head of the root node's list, and update the root node's
 561         * ->gp_tasks and ->exp_tasks pointers to those of this node's,
 562         * if non-NULL.  This might result in waiting for more tasks than
 563         * absolutely necessary, but this is a good performance/complexity
 564         * tradeoff.
 565         */
 566        if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
 567                retval |= RCU_OFL_TASKS_NORM_GP;
 568        if (rcu_preempted_readers_exp(rnp))
 569                retval |= RCU_OFL_TASKS_EXP_GP;
 570        lp = &rnp->blkd_tasks;
 571        lp_root = &rnp_root->blkd_tasks;
 572        while (!list_empty(lp)) {
 573                t = list_entry(lp->next, typeof(*t), rcu_node_entry);
 574                raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
 575                list_del(&t->rcu_node_entry);
 576                t->rcu_blocked_node = rnp_root;
 577                list_add(&t->rcu_node_entry, lp_root);
 578                if (&t->rcu_node_entry == rnp->gp_tasks)
 579                        rnp_root->gp_tasks = rnp->gp_tasks;
 580                if (&t->rcu_node_entry == rnp->exp_tasks)
 581                        rnp_root->exp_tasks = rnp->exp_tasks;
 582#ifdef CONFIG_RCU_BOOST
 583                if (&t->rcu_node_entry == rnp->boost_tasks)
 584                        rnp_root->boost_tasks = rnp->boost_tasks;
 585#endif /* #ifdef CONFIG_RCU_BOOST */
 586                raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
 587        }
 588
 589        rnp->gp_tasks = NULL;
 590        rnp->exp_tasks = NULL;
 591#ifdef CONFIG_RCU_BOOST
 592        rnp->boost_tasks = NULL;
 593        /*
  594         * In case the root is being boosted and the leaf was not, make
  595         * sure that the tasks blocking the current grace period are
  596         * boosted in this case as well.
 597         */
 598        raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
 599        if (rnp_root->boost_tasks != NULL &&
 600            rnp_root->boost_tasks != rnp_root->gp_tasks &&
 601            rnp_root->boost_tasks != rnp_root->exp_tasks)
 602                rnp_root->boost_tasks = rnp_root->gp_tasks;
 603        raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
 604#endif /* #ifdef CONFIG_RCU_BOOST */
 605
 606        return retval;
 607}
 608
 609#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 610
 611/*
 612 * Check for a quiescent state from the current CPU.  When a task blocks,
 613 * the task is recorded in the corresponding CPU's rcu_node structure,
 614 * which is checked elsewhere.
 615 *
 616 * Caller must disable hard irqs.
 617 */
 618static void rcu_preempt_check_callbacks(int cpu)
 619{
 620        struct task_struct *t = current;
 621
 622        if (t->rcu_read_lock_nesting == 0) {
 623                rcu_preempt_qs(cpu);
 624                return;
 625        }
 626        if (t->rcu_read_lock_nesting > 0 &&
 627            per_cpu(rcu_preempt_data, cpu).qs_pending)
 628                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
 629}
 630
 631#ifdef CONFIG_RCU_BOOST
 632
 633static void rcu_preempt_do_callbacks(void)
 634{
 635        rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
 636}
 637
 638#endif /* #ifdef CONFIG_RCU_BOOST */
 639
 640/*
 641 * Queue a preemptible-RCU callback for invocation after a grace period.
 642 */
 643void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 644{
 645        __call_rcu(head, func, &rcu_preempt_state, 0);
 646}
 647EXPORT_SYMBOL_GPL(call_rcu);
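/*
 * Example (illustrative sketch only, not part of the RCU implementation):
 * a typical call_rcu() user.  "struct foo" and its callers are hypothetical;
 * kfree() needs <linux/slab.h> and list_del_rcu() needs <linux/rculist.h>.
 */
#if 0
struct foo {
	struct list_head list;
	int key;
	struct rcu_head rcu;
};

static void foo_reclaim(struct rcu_head *rcu)
{
	struct foo *fp = container_of(rcu, struct foo, rcu);

	kfree(fp);	/* Safe: all pre-existing readers have finished. */
}

/* Caller holds the update-side lock protecting the list. */
static void foo_del(struct foo *fp)
{
	list_del_rcu(&fp->list);		/* Unlink from the RCU-protected list. */
	call_rcu(&fp->rcu, foo_reclaim);	/* Defer the free past a grace period. */
}
#endif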
 648
 649/*
 650 * Queue an RCU callback for lazy invocation after a grace period.
 651 * This will likely be later named something like "call_rcu_lazy()",
 652 * but this change will require some way of tagging the lazy RCU
 653 * callbacks in the list of pending callbacks.  Until then, this
 654 * function may only be called from __kfree_rcu().
 655 */
 656void kfree_call_rcu(struct rcu_head *head,
 657                    void (*func)(struct rcu_head *rcu))
 658{
 659        __call_rcu(head, func, &rcu_preempt_state, 1);
 660}
 661EXPORT_SYMBOL_GPL(kfree_call_rcu);
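/*
 * Example (illustrative sketch only): callers do not invoke kfree_call_rcu()
 * directly.  They use the kfree_rcu() wrapper from <linux/rcupdate.h>,
 * which records the offset of the rcu_head within the (hypothetical)
 * structure and reaches this function via __kfree_rcu().
 */
#if 0
struct foo {
	int key;
	struct rcu_head rcu;
};

static void foo_release(struct foo *fp)
{
	kfree_rcu(fp, rcu);	/* Frees fp once a grace period has elapsed. */
}
#endif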
 662
 663/**
 664 * synchronize_rcu - wait until a grace period has elapsed.
 665 *
 666 * Control will return to the caller some time after a full grace
 667 * period has elapsed, in other words after all currently executing RCU
 668 * read-side critical sections have completed.  Note, however, that
 669 * upon return from synchronize_rcu(), the caller might well be executing
 670 * concurrently with new RCU read-side critical sections that began while
 671 * synchronize_rcu() was waiting.  RCU read-side critical sections are
 672 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
 673 */
 674void synchronize_rcu(void)
 675{
 676        rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
 677                           !lock_is_held(&rcu_lock_map) &&
 678                           !lock_is_held(&rcu_sched_lock_map),
 679                           "Illegal synchronize_rcu() in RCU read-side critical section");
 680        if (!rcu_scheduler_active)
 681                return;
 682        wait_rcu_gp(call_rcu);
 683}
 684EXPORT_SYMBOL_GPL(synchronize_rcu);
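/*
 * Example (illustrative sketch only): the classic synchronous update
 * pattern that synchronize_rcu() supports.  "gp", "gp_lock", and
 * "struct foo" are hypothetical; kfree() needs <linux/slab.h> and
 * DEFINE_SPINLOCK() needs <linux/spinlock.h>.
 */
#if 0
struct foo {
	int key;
};

static struct foo __rcu *gp;		/* Updates serialized by gp_lock. */
static DEFINE_SPINLOCK(gp_lock);

static int reader(void)
{
	struct foo *p;
	int val = -1;

	rcu_read_lock();
	p = rcu_dereference(gp);	/* Fetch the current version. */
	if (p)
		val = p->key;
	rcu_read_unlock();
	return val;
}

static void updater(struct foo *newp)
{
	struct foo *oldp;

	spin_lock(&gp_lock);
	oldp = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
	rcu_assign_pointer(gp, newp);	/* Publish the new version. */
	spin_unlock(&gp_lock);
	synchronize_rcu();		/* Wait for pre-existing readers... */
	kfree(oldp);			/* ...then the old version may go. */
}
#endif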
 685
 686static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
 687static unsigned long sync_rcu_preempt_exp_count;
 688static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
 689
 690/*
 691 * Return non-zero if there are any tasks in RCU read-side critical
 692 * sections blocking the current preemptible-RCU expedited grace period.
 693 * If there is no preemptible-RCU expedited grace period currently in
 694 * progress, returns zero unconditionally.
 695 */
 696static int rcu_preempted_readers_exp(struct rcu_node *rnp)
 697{
 698        return rnp->exp_tasks != NULL;
 699}
 700
 701/*
  702 * Return non-zero if there is no RCU expedited grace period in progress
 703 * for the specified rcu_node structure, in other words, if all CPUs and
 704 * tasks covered by the specified rcu_node structure have done their bit
 705 * for the current expedited grace period.  Works only for preemptible
  706 * RCU -- other RCU implementations use other means.
 707 *
 708 * Caller must hold sync_rcu_preempt_exp_mutex.
 709 */
 710static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 711{
 712        return !rcu_preempted_readers_exp(rnp) &&
 713               ACCESS_ONCE(rnp->expmask) == 0;
 714}
 715
 716/*
 717 * Report the exit from RCU read-side critical section for the last task
 718 * that queued itself during or before the current expedited preemptible-RCU
 719 * grace period.  This event is reported either to the rcu_node structure on
 720 * which the task was queued or to one of that rcu_node structure's ancestors,
 721 * recursively up the tree.  (Calm down, calm down, we do the recursion
 722 * iteratively!)
 723 *
 724 * Most callers will set the "wake" flag, but the task initiating the
 725 * expedited grace period need not wake itself.
 726 *
 727 * Caller must hold sync_rcu_preempt_exp_mutex.
 728 */
 729static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 730                               bool wake)
 731{
 732        unsigned long flags;
 733        unsigned long mask;
 734
 735        raw_spin_lock_irqsave(&rnp->lock, flags);
 736        for (;;) {
 737                if (!sync_rcu_preempt_exp_done(rnp)) {
 738                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 739                        break;
 740                }
 741                if (rnp->parent == NULL) {
 742                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 743                        if (wake)
 744                                wake_up(&sync_rcu_preempt_exp_wq);
 745                        break;
 746                }
 747                mask = rnp->grpmask;
 748                raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
 749                rnp = rnp->parent;
 750                raw_spin_lock(&rnp->lock); /* irqs already disabled */
 751                rnp->expmask &= ~mask;
 752        }
 753}
 754
 755/*
 756 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
 757 * grace period for the specified rcu_node structure.  If there are no such
 758 * tasks, report it up the rcu_node hierarchy.
 759 *
 760 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
 761 */
 762static void
 763sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
 764{
 765        unsigned long flags;
 766        int must_wait = 0;
 767
 768        raw_spin_lock_irqsave(&rnp->lock, flags);
 769        if (list_empty(&rnp->blkd_tasks)) {
 770                raw_spin_unlock_irqrestore(&rnp->lock, flags);
 771        } else {
 772                rnp->exp_tasks = rnp->blkd_tasks.next;
 773                rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
 774                must_wait = 1;
 775        }
 776        if (!must_wait)
 777                rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
 778}
 779
 780/**
 781 * synchronize_rcu_expedited - Brute-force RCU grace period
 782 *
 783 * Wait for an RCU-preempt grace period, but expedite it.  The basic
 784 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
 785 * the ->blkd_tasks lists and wait for this list to drain.  This consumes
 786 * significant time on all CPUs and is unfriendly to real-time workloads,
  787 * and is thus not recommended for any sort of common-case code.
  788 * In fact, if you are using synchronize_rcu_expedited() in a loop,
  789 * please restructure your code to batch your updates, and then use a
 790 * single synchronize_rcu() instead.
 791 *
 792 * Note that it is illegal to call this function while holding any lock
 793 * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
 794 * to call this function from a CPU-hotplug notifier.  Failing to observe
  795 * these restrictions will result in deadlock.
 796 */
 797void synchronize_rcu_expedited(void)
 798{
 799        unsigned long flags;
 800        struct rcu_node *rnp;
 801        struct rcu_state *rsp = &rcu_preempt_state;
 802        unsigned long snap;
 803        int trycount = 0;
 804
 805        smp_mb(); /* Caller's modifications seen first by other CPUs. */
 806        snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
 807        smp_mb(); /* Above access cannot bleed into critical section. */
 808
 809        /*
 810         * Block CPU-hotplug operations.  This means that any CPU-hotplug
 811         * operation that finds an rcu_node structure with tasks in the
 812         * process of being boosted will know that all tasks blocking
 813         * this expedited grace period will already be in the process of
 814         * being boosted.  This simplifies the process of moving tasks
 815         * from leaf to root rcu_node structures.
 816         */
 817        get_online_cpus();
 818
 819        /*
 820         * Acquire lock, falling back to synchronize_rcu() if too many
 821         * lock-acquisition failures.  Of course, if someone does the
 822         * expedited grace period for us, just leave.
 823         */
 824        while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
 825                if (ULONG_CMP_LT(snap,
 826                    ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
 827                        put_online_cpus();
 828                        goto mb_ret; /* Others did our work for us. */
 829                }
 830                if (trycount++ < 10) {
 831                        udelay(trycount * num_online_cpus());
 832                } else {
 833                        put_online_cpus();
 834                        synchronize_rcu();
 835                        return;
 836                }
 837        }
 838        if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
 839                put_online_cpus();
 840                goto unlock_mb_ret; /* Others did our work for us. */
 841        }
 842
  843        /* Force all RCU readers onto ->blkd_tasks lists. */
 844        synchronize_sched_expedited();
 845
 846        /* Initialize ->expmask for all non-leaf rcu_node structures. */
 847        rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
 848                raw_spin_lock_irqsave(&rnp->lock, flags);
 849                rnp->expmask = rnp->qsmaskinit;
 850                raw_spin_unlock_irqrestore(&rnp->lock, flags);
 851        }
 852
 853        /* Snapshot current state of ->blkd_tasks lists. */
 854        rcu_for_each_leaf_node(rsp, rnp)
 855                sync_rcu_preempt_exp_init(rsp, rnp);
 856        if (NUM_RCU_NODES > 1)
 857                sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
 858
 859        put_online_cpus();
 860
 861        /* Wait for snapshotted ->blkd_tasks lists to drain. */
 862        rnp = rcu_get_root(rsp);
 863        wait_event(sync_rcu_preempt_exp_wq,
 864                   sync_rcu_preempt_exp_done(rnp));
 865
 866        /* Clean up and exit. */
 867        smp_mb(); /* ensure expedited GP seen before counter increment. */
 868        ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
 869unlock_mb_ret:
 870        mutex_unlock(&sync_rcu_preempt_exp_mutex);
 871mb_ret:
 872        smp_mb(); /* ensure subsequent action seen after grace period. */
 873}
 874EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
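/*
 * Example (illustrative sketch only): as the comment above suggests, batch
 * several updates under one synchronize_rcu() rather than invoking
 * synchronize_rcu_expedited() once per update in a loop.  This reuses the
 * hypothetical "struct foo" from the call_rcu() example earlier.
 */
#if 0
static void foo_del_pair(struct foo *a, struct foo *b)
{
	list_del_rcu(&a->list);		/* Unlink both victims first... */
	list_del_rcu(&b->list);
	synchronize_rcu();		/* ...so one grace period covers both, */
	kfree(a);			/* rather than one expedited GP apiece. */
	kfree(b);
}
#endif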
 875
 876/**
 877 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
 878 */
 879void rcu_barrier(void)
 880{
 881        _rcu_barrier(&rcu_preempt_state);
 882}
 883EXPORT_SYMBOL_GPL(rcu_barrier);
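/*
 * Example (illustrative sketch only): rcu_barrier() in a module-exit path.
 * A module that posts callbacks with call_rcu() must wait for all of them
 * to be invoked before its callback code can be unloaded.  "foo_del_all"
 * is a hypothetical helper that queues such callbacks.
 */
#if 0
static void __exit foo_exit(void)
{
	foo_del_all();		/* Queues call_rcu() callbacks for each entry. */
	rcu_barrier();		/* Wait until every queued callback has run. */
	/* Only now is it safe to let this module's code go away. */
}
module_exit(foo_exit);
#endif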
 884
 885/*
 886 * Initialize preemptible RCU's state structures.
 887 */
 888static void __init __rcu_init_preempt(void)
 889{
 890        rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
 891}
 892
 893#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 894
 895static struct rcu_state *rcu_state = &rcu_sched_state;
 896
 897/*
 898 * Tell them what RCU they are running.
 899 */
 900static void __init rcu_bootup_announce(void)
 901{
 902        printk(KERN_INFO "Hierarchical RCU implementation.\n");
 903        rcu_bootup_announce_oddness();
 904}
 905
 906/*
 907 * Return the number of RCU batches processed thus far for debug & stats.
 908 */
 909long rcu_batches_completed(void)
 910{
 911        return rcu_batches_completed_sched();
 912}
 913EXPORT_SYMBOL_GPL(rcu_batches_completed);
 914
 915/*
 916 * Force a quiescent state for RCU, which, because there is no preemptible
 917 * RCU, becomes the same as rcu-sched.
 918 */
 919void rcu_force_quiescent_state(void)
 920{
 921        rcu_sched_force_quiescent_state();
 922}
 923EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 924
 925/*
 926 * Because preemptible RCU does not exist, we never have to check for
 927 * CPUs being in quiescent states.
 928 */
 929static void rcu_preempt_note_context_switch(int cpu)
 930{
 931}
 932
 933/*
 934 * Because preemptible RCU does not exist, there are never any preempted
 935 * RCU readers.
 936 */
 937static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 938{
 939        return 0;
 940}
 941
 942#ifdef CONFIG_HOTPLUG_CPU
 943
 944/* Because preemptible RCU does not exist, no quieting of tasks. */
 945static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 946{
 947        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 948}
 949
 950#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 951
 952/*
 953 * Because preemptible RCU does not exist, we never have to check for
 954 * tasks blocked within RCU read-side critical sections.
 955 */
 956static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 957{
 958}
 959
 960/*
 961 * Because preemptible RCU does not exist, we never have to check for
 962 * tasks blocked within RCU read-side critical sections.
 963 */
 964static int rcu_print_task_stall(struct rcu_node *rnp)
 965{
 966        return 0;
 967}
 968
 969/*
 970 * Because there is no preemptible RCU, there can be no readers blocked,
 971 * so there is no need to check for blocked tasks.  So check only for
 972 * bogus qsmask values.
 973 */
 974static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 975{
 976        WARN_ON_ONCE(rnp->qsmask);
 977}
 978
 979#ifdef CONFIG_HOTPLUG_CPU
 980
 981/*
 982 * Because preemptible RCU does not exist, it never needs to migrate
 983 * tasks that were blocked within RCU read-side critical sections, and
 984 * such non-existent tasks cannot possibly have been blocking the current
 985 * grace period.
 986 */
 987static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 988                                     struct rcu_node *rnp,
 989                                     struct rcu_data *rdp)
 990{
 991        return 0;
 992}
 993
 994#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 995
 996/*
 997 * Because preemptible RCU does not exist, it never has any callbacks
 998 * to check.
 999 */
1000static void rcu_preempt_check_callbacks(int cpu)
1001{
1002}
1003
1004/*
1005 * Queue an RCU callback for lazy invocation after a grace period.
1006 * This will likely be later named something like "call_rcu_lazy()",
1007 * but this change will require some way of tagging the lazy RCU
1008 * callbacks in the list of pending callbacks.  Until then, this
1009 * function may only be called from __kfree_rcu().
1010 *
1011 * Because there is no preemptible RCU, we use RCU-sched instead.
1012 */
1013void kfree_call_rcu(struct rcu_head *head,
1014                    void (*func)(struct rcu_head *rcu))
1015{
1016        __call_rcu(head, func, &rcu_sched_state, 1);
1017}
1018EXPORT_SYMBOL_GPL(kfree_call_rcu);
1019
1020/*
1021 * Wait for an rcu-preempt grace period, but make it happen quickly.
1022 * But because preemptible RCU does not exist, map to rcu-sched.
1023 */
1024void synchronize_rcu_expedited(void)
1025{
1026        synchronize_sched_expedited();
1027}
1028EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1029
1030#ifdef CONFIG_HOTPLUG_CPU
1031
1032/*
1033 * Because preemptible RCU does not exist, there is never any need to
1034 * report on tasks preempted in RCU read-side critical sections during
1035 * expedited RCU grace periods.
1036 */
1037static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1038                               bool wake)
1039{
1040}
1041
1042#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1043
1044/*
1045 * Because preemptible RCU does not exist, rcu_barrier() is just
1046 * another name for rcu_barrier_sched().
1047 */
1048void rcu_barrier(void)
1049{
1050        rcu_barrier_sched();
1051}
1052EXPORT_SYMBOL_GPL(rcu_barrier);
1053
1054/*
1055 * Because preemptible RCU does not exist, it need not be initialized.
1056 */
1057static void __init __rcu_init_preempt(void)
1058{
1059}
1060
1061#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1062
1063#ifdef CONFIG_RCU_BOOST
1064
1065#include "rtmutex_common.h"
1066
1067#ifdef CONFIG_RCU_TRACE
1068
1069static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1070{
1071        if (list_empty(&rnp->blkd_tasks))
1072                rnp->n_balk_blkd_tasks++;
1073        else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1074                rnp->n_balk_exp_gp_tasks++;
1075        else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1076                rnp->n_balk_boost_tasks++;
1077        else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1078                rnp->n_balk_notblocked++;
1079        else if (rnp->gp_tasks != NULL &&
1080                 ULONG_CMP_LT(jiffies, rnp->boost_time))
1081                rnp->n_balk_notyet++;
1082        else
1083                rnp->n_balk_nos++;
1084}
1085
1086#else /* #ifdef CONFIG_RCU_TRACE */
1087
1088static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1089{
1090}
1091
1092#endif /* #else #ifdef CONFIG_RCU_TRACE */
1093
1094static void rcu_wake_cond(struct task_struct *t, int status)
1095{
1096        /*
1097         * If the thread is yielding, only wake it when this
 1098         * is invoked from idle.
1099         */
1100        if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
1101                wake_up_process(t);
1102}
1103
1104/*
1105 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1106 * or ->boost_tasks, advancing the pointer to the next task in the
1107 * ->blkd_tasks list.
1108 *
1109 * Note that irqs must be enabled: boosting the task can block.
1110 * Returns 1 if there are more tasks needing to be boosted.
1111 */
1112static int rcu_boost(struct rcu_node *rnp)
1113{
1114        unsigned long flags;
1115        struct rt_mutex mtx;
1116        struct task_struct *t;
1117        struct list_head *tb;
1118
1119        if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1120                return 0;  /* Nothing left to boost. */
1121
1122        raw_spin_lock_irqsave(&rnp->lock, flags);
1123
1124        /*
1125         * Recheck under the lock: all tasks in need of boosting
1126         * might exit their RCU read-side critical sections on their own.
1127         */
1128        if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1129                raw_spin_unlock_irqrestore(&rnp->lock, flags);
1130                return 0;
1131        }
1132
1133        /*
1134         * Preferentially boost tasks blocking expedited grace periods.
1135         * This cannot starve the normal grace periods because a second
1136         * expedited grace period must boost all blocked tasks, including
1137         * those blocking the pre-existing normal grace period.
1138         */
1139        if (rnp->exp_tasks != NULL) {
1140                tb = rnp->exp_tasks;
1141                rnp->n_exp_boosts++;
1142        } else {
1143                tb = rnp->boost_tasks;
1144                rnp->n_normal_boosts++;
1145        }
1146        rnp->n_tasks_boosted++;
1147
1148        /*
1149         * We boost task t by manufacturing an rt_mutex that appears to
1150         * be held by task t.  We leave a pointer to that rt_mutex where
1151         * task t can find it, and task t will release the mutex when it
1152         * exits its outermost RCU read-side critical section.  Then
1153         * simply acquiring this artificial rt_mutex will boost task
1154         * t's priority.  (Thanks to tglx for suggesting this approach!)
1155         *
1156         * Note that task t must acquire rnp->lock to remove itself from
1157         * the ->blkd_tasks list, which it will do from exit() if from
1158         * nowhere else.  We therefore are guaranteed that task t will
1159         * stay around at least until we drop rnp->lock.  Note that
1160         * rnp->lock also resolves races between our priority boosting
1161         * and task t's exiting its outermost RCU read-side critical
1162         * section.
1163         */
1164        t = container_of(tb, struct task_struct, rcu_node_entry);
1165        rt_mutex_init_proxy_locked(&mtx, t);
1166        t->rcu_boost_mutex = &mtx;
1167        raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168        rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
1169        rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
1170
1171        return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1172               ACCESS_ONCE(rnp->boost_tasks) != NULL;
1173}
1174
1175/*
1176 * Priority-boosting kthread.  One per leaf rcu_node and one for the
1177 * root rcu_node.
1178 */
1179static int rcu_boost_kthread(void *arg)
1180{
1181        struct rcu_node *rnp = (struct rcu_node *)arg;
1182        int spincnt = 0;
1183        int more2boost;
1184
1185        trace_rcu_utilization("Start boost kthread@init");
1186        for (;;) {
1187                rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1188                trace_rcu_utilization("End boost kthread@rcu_wait");
1189                rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1190                trace_rcu_utilization("Start boost kthread@rcu_wait");
1191                rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1192                more2boost = rcu_boost(rnp);
1193                if (more2boost)
1194                        spincnt++;
1195                else
1196                        spincnt = 0;
1197                if (spincnt > 10) {
1198                        rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1199                        trace_rcu_utilization("End boost kthread@rcu_yield");
1200                        schedule_timeout_interruptible(2);
1201                        trace_rcu_utilization("Start boost kthread@rcu_yield");
1202                        spincnt = 0;
1203                }
1204        }
1205        /* NOTREACHED */
1206        trace_rcu_utilization("End boost kthread@notreached");
1207        return 0;
1208}
1209
1210/*
1211 * Check to see if it is time to start boosting RCU readers that are
1212 * blocking the current grace period, and, if so, tell the per-rcu_node
1213 * kthread to start boosting them.  If there is an expedited grace
1214 * period in progress, it is always time to boost.
1215 *
1216 * The caller must hold rnp->lock, which this function releases.
1217 * The ->boost_kthread_task is immortal, so we don't need to worry
1218 * about it going away.
1219 */
1220static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1221{
1222        struct task_struct *t;
1223
1224        if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1225                rnp->n_balk_exp_gp_tasks++;
1226                raw_spin_unlock_irqrestore(&rnp->lock, flags);
1227                return;
1228        }
1229        if (rnp->exp_tasks != NULL ||
1230            (rnp->gp_tasks != NULL &&
1231             rnp->boost_tasks == NULL &&
1232             rnp->qsmask == 0 &&
1233             ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1234                if (rnp->exp_tasks == NULL)
1235                        rnp->boost_tasks = rnp->gp_tasks;
1236                raw_spin_unlock_irqrestore(&rnp->lock, flags);
1237                t = rnp->boost_kthread_task;
1238                if (t)
1239                        rcu_wake_cond(t, rnp->boost_kthread_status);
1240        } else {
1241                rcu_initiate_boost_trace(rnp);
1242                raw_spin_unlock_irqrestore(&rnp->lock, flags);
1243        }
1244}
1245
1246/*
1247 * Wake up the per-CPU kthread to invoke RCU callbacks.
1248 */
1249static void invoke_rcu_callbacks_kthread(void)
1250{
1251        unsigned long flags;
1252
1253        local_irq_save(flags);
1254        __this_cpu_write(rcu_cpu_has_work, 1);
1255        if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1256            current != __this_cpu_read(rcu_cpu_kthread_task)) {
1257                rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
1258                              __this_cpu_read(rcu_cpu_kthread_status));
1259        }
1260        local_irq_restore(flags);
1261}
1262
1263/*
1264 * Is the current CPU running the RCU-callbacks kthread?
1265 * Caller must have preemption disabled.
1266 */
1267static bool rcu_is_callbacks_kthread(void)
1268{
1269        return __get_cpu_var(rcu_cpu_kthread_task) == current;
1270}
1271
1272#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
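/*
 * For example, with HZ=1000 and the Kconfig default CONFIG_RCU_BOOST_DELAY
 * of 500 milliseconds (values assumed here purely for illustration), this
 * works out to DIV_ROUND_UP(500 * 1000, 1000) = 500 jiffies, so boosting
 * begins roughly half a second after the grace period starts.
 */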
1273
1274/*
1275 * Do priority-boost accounting for the start of a new grace period.
1276 */
1277static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1278{
1279        rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1280}
1281
1282/*
1283 * Create an RCU-boost kthread for the specified node if one does not
1284 * already exist.  We only create this kthread for preemptible RCU.
1285 * Returns zero if all is well, a negated errno otherwise.
1286 */
1287static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1288                                                 struct rcu_node *rnp)
1289{
1290        int rnp_index = rnp - &rsp->node[0];
1291        unsigned long flags;
1292        struct sched_param sp;
1293        struct task_struct *t;
1294
1295        if (&rcu_preempt_state != rsp)
1296                return 0;
1297
1298        if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
1299                return 0;
1300
1301        rsp->boost = 1;
1302        if (rnp->boost_kthread_task != NULL)
1303                return 0;
1304        t = kthread_create(rcu_boost_kthread, (void *)rnp,
1305                           "rcub/%d", rnp_index);
1306        if (IS_ERR(t))
1307                return PTR_ERR(t);
1308        raw_spin_lock_irqsave(&rnp->lock, flags);
1309        rnp->boost_kthread_task = t;
1310        raw_spin_unlock_irqrestore(&rnp->lock, flags);
1311        sp.sched_priority = RCU_BOOST_PRIO;
1312        sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1313        wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1314        return 0;
1315}
1316
1317static void rcu_kthread_do_work(void)
1318{
1319        rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
1320        rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1321        rcu_preempt_do_callbacks();
1322}
1323
1324static void rcu_cpu_kthread_setup(unsigned int cpu)
1325{
1326        struct sched_param sp;
1327
1328        sp.sched_priority = RCU_KTHREAD_PRIO;
1329        sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1330}
1331
1332static void rcu_cpu_kthread_park(unsigned int cpu)
1333{
1334        per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1335}
1336
1337static int rcu_cpu_kthread_should_run(unsigned int cpu)
1338{
1339        return __get_cpu_var(rcu_cpu_has_work);
1340}
1341
1342/*
1343 * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
1344 * RCU softirq used in flavors and configurations of RCU that do not
1345 * support RCU priority boosting.
1346 */
1347static void rcu_cpu_kthread(unsigned int cpu)
1348{
1349        unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
1350        char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
1351        int spincnt;
1352
1353        for (spincnt = 0; spincnt < 10; spincnt++) {
1354                trace_rcu_utilization("Start CPU kthread@rcu_wait");
1355                local_bh_disable();
1356                *statusp = RCU_KTHREAD_RUNNING;
1357                this_cpu_inc(rcu_cpu_kthread_loops);
1358                local_irq_disable();
1359                work = *workp;
1360                *workp = 0;
1361                local_irq_enable();
1362                if (work)
1363                        rcu_kthread_do_work();
1364                local_bh_enable();
1365                if (*workp == 0) {
1366                        trace_rcu_utilization("End CPU kthread@rcu_wait");
1367                        *statusp = RCU_KTHREAD_WAITING;
1368                        return;
1369                }
1370        }
1371        *statusp = RCU_KTHREAD_YIELDING;
1372        trace_rcu_utilization("Start CPU kthread@rcu_yield");
1373        schedule_timeout_interruptible(2);
1374        trace_rcu_utilization("End CPU kthread@rcu_yield");
1375        *statusp = RCU_KTHREAD_WAITING;
1376}
1377
1378/*
1379 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1380 * served by the rcu_node in question.  The CPU hotplug lock is still
1381 * held, so the value of rnp->qsmaskinit will be stable.
1382 *
 1383 * We don't include outgoingcpu in the affinity set; use -1 if there is
1384 * no outgoing CPU.  If there are no CPUs left in the affinity set,
1385 * this function allows the kthread to execute on any CPU.
1386 */
1387static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1388{
1389        struct task_struct *t = rnp->boost_kthread_task;
1390        unsigned long mask = rnp->qsmaskinit;
1391        cpumask_var_t cm;
1392        int cpu;
1393
1394        if (!t)
1395                return;
1396        if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1397                return;
1398        for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1399                if ((mask & 0x1) && cpu != outgoingcpu)
1400                        cpumask_set_cpu(cpu, cm);
1401        if (cpumask_weight(cm) == 0) {
1402                cpumask_setall(cm);
1403                for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1404                        cpumask_clear_cpu(cpu, cm);
1405                WARN_ON_ONCE(cpumask_weight(cm) == 0);
1406        }
1407        set_cpus_allowed_ptr(t, cm);
1408        free_cpumask_var(cm);
1409}
1410
1411static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1412        .store                  = &rcu_cpu_kthread_task,
1413        .thread_should_run      = rcu_cpu_kthread_should_run,
1414        .thread_fn              = rcu_cpu_kthread,
1415        .thread_comm            = "rcuc/%u",
1416        .setup                  = rcu_cpu_kthread_setup,
1417        .park                   = rcu_cpu_kthread_park,
1418};
1419
1420/*
1421 * Spawn all kthreads -- called as soon as the scheduler is running.
1422 */
1423static int __init rcu_spawn_kthreads(void)
1424{
1425        struct rcu_node *rnp;
1426        int cpu;
1427
1428        rcu_scheduler_fully_active = 1;
1429        for_each_possible_cpu(cpu)
1430                per_cpu(rcu_cpu_has_work, cpu) = 0;
1431        BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1432        rnp = rcu_get_root(rcu_state);
1433        (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1434        if (NUM_RCU_NODES > 1) {
1435                rcu_for_each_leaf_node(rcu_state, rnp)
1436                        (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1437        }
1438        return 0;
1439}
1440early_initcall(rcu_spawn_kthreads);
1441
1442static void __cpuinit rcu_prepare_kthreads(int cpu)
1443{
1444        struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1445        struct rcu_node *rnp = rdp->mynode;
1446
1447        /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1448        if (rcu_scheduler_fully_active)
1449                (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1450}
1451
1452#else /* #ifdef CONFIG_RCU_BOOST */
1453
1454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1455{
1456        raw_spin_unlock_irqrestore(&rnp->lock, flags);
1457}
1458
1459static void invoke_rcu_callbacks_kthread(void)
1460{
1461        WARN_ON_ONCE(1);
1462}
1463
1464static bool rcu_is_callbacks_kthread(void)
1465{
1466        return false;
1467}
1468
1469static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1470{
1471}
1472
1473static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1474{
1475}
1476
1477static int __init rcu_scheduler_really_started(void)
1478{
1479        rcu_scheduler_fully_active = 1;
1480        return 0;
1481}
1482early_initcall(rcu_scheduler_really_started);
1483
1484static void __cpuinit rcu_prepare_kthreads(int cpu)
1485{
1486}
1487
1488#endif /* #else #ifdef CONFIG_RCU_BOOST */
1489
1490#if !defined(CONFIG_RCU_FAST_NO_HZ)
1491
1492/*
1493 * Check to see if any future RCU-related work will need to be done
1494 * by the current CPU, even if none need be done immediately, returning
1495 * 1 if so.  This function is part of the RCU implementation; it is -not-
1496 * an exported member of the RCU API.
1497 *
1498 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1499 * any flavor of RCU.
1500 */
1501int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1502{
1503        *delta_jiffies = ULONG_MAX;
1504        return rcu_cpu_has_callbacks(cpu);
1505}
1506
1507/*
1508 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1509 */
1510static void rcu_prepare_for_idle_init(int cpu)
1511{
1512}
1513
1514/*
1515 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1516 * after it.
1517 */
1518static void rcu_cleanup_after_idle(int cpu)
1519{
1520}
1521
1522/*
1523 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1524 * is nothing.
1525 */
1526static void rcu_prepare_for_idle(int cpu)
1527{
1528}
1529
1530/*
1531 * Don't bother keeping a running count of the number of RCU callbacks
1532 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1533 */
1534static void rcu_idle_count_callbacks_posted(void)
1535{
1536}
1537
1538#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1539
1540/*
1541 * This code is invoked when a CPU goes idle, at which point we want
1542 * to have the CPU do everything required for RCU so that it can enter
1543 * the energy-efficient dyntick-idle mode.  This is handled by a
1544 * state machine implemented by rcu_prepare_for_idle() below.
1545 *
1546 * The following four preprocessor symbols control this state machine:
1547 *
1548 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
1549 *      to satisfy RCU.  Beyond this point, it is better to incur a periodic
1550 *      scheduling-clock interrupt than to loop through the state machine
1551 *      at full power.
1552 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
1553 *      optional if RCU does not need anything immediately from this
1554 *      CPU, even if this CPU still has RCU callbacks queued.  The first
1555 *      few passes through the state machine are mandatory: we need to give
1556 *      the state machine a chance to communicate a quiescent state
1557 *      to the RCU core.
1558 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1559 *      to sleep in dyntick-idle mode with RCU callbacks pending.  This
1560 *      is sized to be roughly one RCU grace period.  Those energy-efficiency
1561 *      benchmarkers who might otherwise be tempted to set this to a large
1562 *      number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
1563 *      system.  And if you are -that- concerned about energy efficiency,
1564 *      just power the system down and be done with it!
1565 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1566 *      permitted to sleep in dyntick-idle mode with only lazy RCU
1567 *      callbacks pending.  Setting this too high can OOM your system.
1568 *
1569 * The values below work well in practice.  If future workloads require
1570 * adjustment, they can be converted into kernel config parameters, though
1571 * making the state machine smarter might be a better option.
1572 */
1573#define RCU_IDLE_FLUSHES 5              /* Number of dyntick-idle tries. */
1574#define RCU_IDLE_OPT_FLUSHES 3          /* Optional dyntick-idle tries. */
1575#define RCU_IDLE_GP_DELAY 4             /* Roughly one grace period. */
1576#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
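    /*
     * Rough timescale sketch (illustrative, assuming HZ = 1000):
     * RCU_IDLE_GP_DELAY is 4 jiffies, about 4 milliseconds, while
     * RCU_IDLE_LAZY_GP_DELAY is 6 * HZ jiffies, about six seconds.
     */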
1577
1578extern int tick_nohz_enabled;
1579
1580/*
1581 * Does the specified flavor of RCU have non-lazy callbacks pending on
1582 * the specified CPU?  Both RCU flavor and CPU are specified by the
1583 * rcu_data structure.
1584 */
1585static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
1586{
1587        return rdp->qlen != rdp->qlen_lazy;
1588}
1589
1590#ifdef CONFIG_TREE_PREEMPT_RCU
1591
1592/*
1593 * Are there non-lazy RCU-preempt callbacks?  (There cannot be if there
1594 * is no RCU-preempt in the kernel.)
1595 */
1596static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
1597{
1598        struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
1599
1600        return __rcu_cpu_has_nonlazy_callbacks(rdp);
1601}
1602
1603#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1604
1605static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
1606{
1607        return false;
1608}
1609
1610#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
1611
1612/*
1613 * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
1614 */
1615static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
1616{
1617        return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1618               __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1619               rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1620}
1621
1622/*
1623 * Allow the CPU to enter dyntick-idle mode if any of: (1) There are no
1624 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1625 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1626 * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed
1627 * to enter dyntick-idle mode, we refuse to try to enter it.  After all,
1628 * it is better to incur scheduling-clock interrupts than to spin
1629 * continuously for the same time duration!
1630 *
1631 * The delta_jiffies argument is used to store the time when RCU is
1632 * going to need the CPU again if it still has callbacks.  The reason
1633 * for this is that rcu_prepare_for_idle() might need to post a timer,
1634 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1635 * the wakeup time for this CPU.  This means that RCU's timer can be
1636 * delayed until the wakeup time, which defeats the purpose of posting
1637 * a timer.
1638 */
1639int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1640{
1641        struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1642
1643        /* Flag a new idle sojourn to the idle-entry state machine. */
1644        rdtp->idle_first_pass = 1;
1645        /* If no callbacks, RCU doesn't need the CPU. */
1646        if (!rcu_cpu_has_callbacks(cpu)) {
1647                *delta_jiffies = ULONG_MAX;
1648                return 0;
1649        }
1650        if (rdtp->dyntick_holdoff == jiffies) {
1651                /* RCU recently tried and failed, so don't try again. */
1652                *delta_jiffies = 1;
1653                return 1;
1654        }
1655        /* Set up for the possibility that RCU will post a timer. */
1656        if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1657                *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
1658                                          RCU_IDLE_GP_DELAY) - jiffies;
1659        } else {
1660                *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
1661                *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1662        }
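            /*
             * Worked example (illustrative values): with jiffies = 1001 and
             * RCU_IDLE_GP_DELAY = 4, the non-lazy case computes
             * round_up(1005, 4) - 1001 = 1008 - 1001 = 7 jiffies.
             */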
1663        return 0;
1664}
1665
1666/*
1667 * Handler for smp_call_function_single().  The only point of this
1668 * handler is to wake the CPU up, so the handler does only tracing.
1669 */
1670void rcu_idle_demigrate(void *unused)
1671{
1672        trace_rcu_prep_idle("Demigrate");
1673}
1674
1675/*
1676 * Timer handler used to force CPU to start pushing its remaining RCU
1677 * callbacks in the case where it entered dyntick-idle mode with callbacks
1678 * pending.  The handler doesn't really need to do anything because the
1679 * real work is done upon re-entry to idle, or by the next scheduling-clock
1680 * interrupt should idle not be re-entered.
1681 *
1682 * One special case: the timer gets migrated without awakening the CPU
1683 * on which the timer was scheduled.  In this case, we must wake up
1684 * that CPU.  We do so with smp_call_function_single().
1685 */
1686static void rcu_idle_gp_timer_func(unsigned long cpu_in)
1687{
1688        int cpu = (int)cpu_in;
1689
1690        trace_rcu_prep_idle("Timer");
1691        if (cpu != smp_processor_id())
1692                smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1693        else
1694                WARN_ON_ONCE(1); /* Getting here can hang the system... */
1695}
1696
1697/*
1698 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
1699 */
1700static void rcu_prepare_for_idle_init(int cpu)
1701{
1702        struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1703
1704        rdtp->dyntick_holdoff = jiffies - 1;
1705        setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1706        rdtp->idle_gp_timer_expires = jiffies - 1;
1707        rdtp->idle_first_pass = 1;
1708}
1709
1710/*
1711 * Clean up for exit from idle.  Because we are exiting from idle, there
1712 * is no longer any point to ->idle_gp_timer, so cancel it.  This will
1713 * do nothing if this timer is not active, so just cancel it unconditionally.
1714 */
1715static void rcu_cleanup_after_idle(int cpu)
1716{
1717        struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1718
1719        del_timer(&rdtp->idle_gp_timer);
1720        trace_rcu_prep_idle("Cleanup after idle");
1721        rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
1722}
1723
1724/*
1725 * Check to see if any RCU-related work can be done by the current CPU,
1726 * and if so, schedule a softirq to get it done.  This function is part
1727 * of the RCU implementation; it is -not- an exported member of the RCU API.
1728 *
1729 * The idea is for the current CPU to clear out all work required by the
1730 * RCU core for the current grace period, so that this CPU can be permitted
1731 * to enter dyntick-idle mode.  In some cases, it will need to be awakened
1732 * at the end of the grace period by whatever CPU ends the grace period.
1733 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
1734 * number of wakeups by a modest integer factor.
1735 *
1736 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1737 * disabled, we do one pass of force_quiescent_state(), then do an
1738 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1739 * later.  The ->dyntick_drain field controls the sequencing.
1740 *
1741 * The caller must have disabled interrupts.
1742 */
1743static void rcu_prepare_for_idle(int cpu)
1744{
1745        struct timer_list *tp;
1746        struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1747        int tne;
1748
1749        /* Handle nohz enablement switches conservatively. */
1750        tne = ACCESS_ONCE(tick_nohz_enabled);
1751        if (tne != rdtp->tick_nohz_enabled_snap) {
1752                if (rcu_cpu_has_callbacks(cpu))
1753                        invoke_rcu_core(); /* force nohz to see update. */
1754                rdtp->tick_nohz_enabled_snap = tne;
1755                return;
1756        }
1757        if (!tne)
1758                return;
1759
1760        /* Adaptive-tick mode, where usermode execution is idle to RCU. */
1761        if (!is_idle_task(current)) {
1762                rdtp->dyntick_holdoff = jiffies - 1;
1763                if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1764                        trace_rcu_prep_idle("User dyntick with callbacks");
1765                        rdtp->idle_gp_timer_expires =
1766                                round_up(jiffies + RCU_IDLE_GP_DELAY,
1767                                         RCU_IDLE_GP_DELAY);
1768                } else if (rcu_cpu_has_callbacks(cpu)) {
1769                        rdtp->idle_gp_timer_expires =
1770                                round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1771                        trace_rcu_prep_idle("User dyntick with lazy callbacks");
1772                } else {
1773                        return;
1774                }
1775                tp = &rdtp->idle_gp_timer;
1776                mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1777                return;
1778        }
1779
1780        /*
1781         * If this is an idle re-entry, for example, due to use of
1782         * RCU_NONIDLE() or the new idle-loop tracing API within the idle
1783         * loop, then don't take any state-machine actions, unless the
1784         * momentary exit from idle queued additional non-lazy callbacks.
1785         * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1786         * pending.
1787         */
1788        if (!rdtp->idle_first_pass &&
1789            (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
1790                if (rcu_cpu_has_callbacks(cpu)) {
1791                        tp = &rdtp->idle_gp_timer;
1792                        mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1793                }
1794                return;
1795        }
1796        rdtp->idle_first_pass = 0;
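            /* Make the snapshot stale so a quick idle re-entry reruns the machine. */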
1797        rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1798
1799        /*
1800         * If there are no callbacks on this CPU, enter dyntick-idle mode.
1801         * Also reset state to avoid prejudicing later attempts.
1802         */
1803        if (!rcu_cpu_has_callbacks(cpu)) {
1804                rdtp->dyntick_holdoff = jiffies - 1;
1805                rdtp->dyntick_drain = 0;
1806                trace_rcu_prep_idle("No callbacks");
1807                return;
1808        }
1809
1810        /*
1811         * If in holdoff mode, just return.  We will presumably have
1812         * refrained from disabling the scheduling-clock tick.
1813         */
1814        if (rdtp->dyntick_holdoff == jiffies) {
1815                trace_rcu_prep_idle("In holdoff");
1816                return;
1817        }
1818
1819        /* Check and update the ->dyntick_drain sequencing. */
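            /*
             * Sketch of the sequencing: the first pass arms ->dyntick_drain
             * with RCU_IDLE_FLUSHES; once it decays to RCU_IDLE_OPT_FLUSHES
             * or below with nothing immediately pending, we try dyntick-idle
             * despite queued callbacks; if it reaches zero, we give up and
             * hold off for the rest of this jiffy.
             */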
1820        if (rdtp->dyntick_drain <= 0) {
1821                /* First time through, initialize the counter. */
1822                rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
1823        } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
1824                   !rcu_pending(cpu) &&
1825                   !local_softirq_pending()) {
1826                /* Can we go dyntick-idle despite still having callbacks? */
1827                rdtp->dyntick_drain = 0;
1828                rdtp->dyntick_holdoff = jiffies;
1829                if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1830                        trace_rcu_prep_idle("Dyntick with callbacks");
1831                        rdtp->idle_gp_timer_expires =
1832                                round_up(jiffies + RCU_IDLE_GP_DELAY,
1833                                         RCU_IDLE_GP_DELAY);
1834                } else {
1835                        rdtp->idle_gp_timer_expires =
1836                                round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1837                        trace_rcu_prep_idle("Dyntick with lazy callbacks");
1838                }
1839                tp = &rdtp->idle_gp_timer;
1840                mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1841                rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1842                return; /* Nothing more to do immediately. */
1843        } else if (--(rdtp->dyntick_drain) <= 0) {
1844                /* We have hit the limit, so time to give up. */
1845                rdtp->dyntick_holdoff = jiffies;
1846                trace_rcu_prep_idle("Begin holdoff");
1847                invoke_rcu_core();  /* Force the CPU out of dyntick-idle. */
1848                return;
1849        }
1850
1851        /*
1852         * Do one step of pushing the remaining RCU callbacks through
1853         * the RCU core state machine.
1854         */
1855#ifdef CONFIG_TREE_PREEMPT_RCU
1856        if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1857                rcu_preempt_qs(cpu);
1858                force_quiescent_state(&rcu_preempt_state);
1859        }
1860#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1861        if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1862                rcu_sched_qs(cpu);
1863                force_quiescent_state(&rcu_sched_state);
1864        }
1865        if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1866                rcu_bh_qs(cpu);
1867                force_quiescent_state(&rcu_bh_state);
1868        }
1869
1870        /*
1871         * If RCU callbacks are still pending, RCU still needs this CPU.
1872         * So try forcing the callbacks through the grace period.
1873         */
1874        if (rcu_cpu_has_callbacks(cpu)) {
1875                trace_rcu_prep_idle("More callbacks");
1876                invoke_rcu_core();
1877        } else {
1878                trace_rcu_prep_idle("Callbacks drained");
1879        }
1880}
1881
1882/*
1883 * Keep a running count of the number of non-lazy callbacks posted
1884 * on this CPU.  This running counter (which is never decremented) allows
1885 * rcu_prepare_for_idle() to detect when something out of the idle loop
1886 * posts a callback, even if an equal number of callbacks are invoked.
1887 * Of course, callbacks should only be posted from within a trace event
1888 * designed to be called from idle or from within RCU_NONIDLE().
1889 */
1890static void rcu_idle_count_callbacks_posted(void)
1891{
1892        __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
1893}
1894
1895/*
1896 * Data for flushing lazy RCU callbacks at OOM time.
1897 */
1898static atomic_t oom_callback_count;
1899static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1900
1901/*
1902 * RCU OOM callback -- decrement the outstanding count and deliver the
1903 * wake-up if we are the last one.
1904 */
1905static void rcu_oom_callback(struct rcu_head *rhp)
1906{
1907        if (atomic_dec_and_test(&oom_callback_count))
1908                wake_up(&oom_callback_wq);
1909}
1910
1911/*
1912 * Post an rcu_oom_notify callback on the current CPU if it has at
1913 * least one lazy callback.  This will unnecessarily post callbacks
1914 * to CPUs that already have a non-lazy callback at the end of their
1915 * callback list, but this is an infrequent operation, so accept some
1916 * extra overhead to keep things simple.
1917 */
1918static void rcu_oom_notify_cpu(void *unused)
1919{
1920        struct rcu_state *rsp;
1921        struct rcu_data *rdp;
1922
1923        for_each_rcu_flavor(rsp) {
1924                rdp = __this_cpu_ptr(rsp->rda);
1925                if (rdp->qlen_lazy != 0) {
1926                        atomic_inc(&oom_callback_count);
1927                        rsp->call(&rdp->oom_head, rcu_oom_callback);
1928                }
1929        }
1930}
1931
1932/*
1933 * If low on memory, ensure that each CPU has a non-lazy callback.
1934 * This will wake up CPUs that have only lazy callbacks, in turn
1935 * ensuring that they free up the corresponding memory in a timely manner.
1936 * Because an uncertain amount of memory will be freed in some uncertain
1937 * timeframe, we do not claim to have freed anything.
1938 */
1939static int rcu_oom_notify(struct notifier_block *self,
1940                          unsigned long notused, void *nfreed)
1941{
1942        int cpu;
1943
1944        /* Wait for callbacks from earlier instance to complete. */
1945        wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1946
1947        /*
1948         * Prevent premature wakeup: ensure that all increments happen
1949         * before there is a chance of the counter reaching zero.
1950         */
1951        atomic_set(&oom_callback_count, 1);
1952
1953        get_online_cpus();
1954        for_each_online_cpu(cpu) {
1955                smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1956                cond_resched();
1957        }
1958        put_online_cpus();
1959
1960        /* Unconditionally decrement: no need to wake ourselves up. */
1961        atomic_dec(&oom_callback_count);
1962
1963        return NOTIFY_OK;
1964}
1965
1966static struct notifier_block rcu_oom_nb = {
1967        .notifier_call = rcu_oom_notify
1968};
1969
1970static int __init rcu_register_oom_notifier(void)
1971{
1972        register_oom_notifier(&rcu_oom_nb);
1973        return 0;
1974}
1975early_initcall(rcu_register_oom_notifier);
1976
1977#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1978
1979#ifdef CONFIG_RCU_CPU_STALL_INFO
1980
1981#ifdef CONFIG_RCU_FAST_NO_HZ
1982
1983static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1984{
1985        struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1986        struct timer_list *tltp = &rdtp->idle_gp_timer;
1987        char c;
1988
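            /* 'H' flags a CPU currently in dyntick-idle holdoff, '.' otherwise. */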
1989        c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
1990        if (timer_pending(tltp))
1991                sprintf(cp, "drain=%d %c timer=%lu",
1992                        rdtp->dyntick_drain, c, tltp->expires - jiffies);
1993        else
1994                sprintf(cp, "drain=%d %c timer not pending",
1995                        rdtp->dyntick_drain, c);
1996}
1997
1998#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
1999
2000static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2001{
2002        *cp = '\0';
2003}
2004
2005#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
2006
2007/* Initiate the stall-info list. */
2008static void print_cpu_stall_info_begin(void)
2009{
2010        printk(KERN_CONT "\n");
2011}
2012
2013/*
2014 * Print out diagnostic information for the specified stalled CPU.
2015 *
2016 * If the specified CPU is aware of the current RCU grace period
2017 * (flavor specified by rsp), then print the number of scheduling
2018 * clock interrupts the CPU has taken during the time that it has
2019 * been aware.  Otherwise, print the number of RCU grace periods
2020 * that this CPU is ignorant of, for example, "1" if the CPU was
2021 * aware of the previous grace period.
2022 *
2023 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
2024 */
2025static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2026{
2027        char fast_no_hz[72];
2028        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2029        struct rcu_dynticks *rdtp = rdp->dynticks;
2030        char *ticks_title;
2031        unsigned long ticks_value;
2032
2033        if (rsp->gpnum == rdp->gpnum) {
2034                ticks_title = "ticks this GP";
2035                ticks_value = rdp->ticks_this_gp;
2036        } else {
2037                ticks_title = "GPs behind";
2038                ticks_value = rsp->gpnum - rdp->gpnum;
2039        }
2040        print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
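            /*
             * The "idle=" field prints the low 12 bits of ->dynticks,
             * then ->dynticks_nesting and ->dynticks_nmi_nesting.
             */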
2041        printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
2042               cpu, ticks_value, ticks_title,
2043               atomic_read(&rdtp->dynticks) & 0xfff,
2044               rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
2045               fast_no_hz);
2046}
2047
2048/* Terminate the stall-info list. */
2049static void print_cpu_stall_info_end(void)
2050{
2051        printk(KERN_ERR "\t");
2052}
2053
2054/* Zero ->ticks_this_gp for all flavors of RCU. */
2055static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2056{
2057        rdp->ticks_this_gp = 0;
2058}
2059
2060/* Increment ->ticks_this_gp for all flavors of RCU. */
2061static void increment_cpu_stall_ticks(void)
2062{
2063        struct rcu_state *rsp;
2064
2065        for_each_rcu_flavor(rsp)
2066                __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
2067}
2068
2069#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
2070
2071static void print_cpu_stall_info_begin(void)
2072{
2073        printk(KERN_CONT " {");
2074}
2075
2076static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2077{
2078        printk(KERN_CONT " %d", cpu);
2079}
2080
2081static void print_cpu_stall_info_end(void)
2082{
2083        printk(KERN_CONT "} ");
2084}
2085
2086static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2087{
2088}
2089
2090static void increment_cpu_stall_ticks(void)
2091{
2092}
2093
2094#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2095