linux/kernel/rcutree.c
   1/*
   2 * Read-Copy Update mechanism for mutual exclusion
   3 *
   4 * This program is free software; you can redistribute it and/or modify
   5 * it under the terms of the GNU General Public License as published by
   6 * the Free Software Foundation; either version 2 of the License, or
   7 * (at your option) any later version.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17 *
  18 * Copyright IBM Corporation, 2008
  19 *
  20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  21 *          Manfred Spraul <manfred@colorfullife.com>
  22 *          Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
  23 *
  24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
  26 *
  27 * For detailed explanation of Read-Copy Update mechanism see -
  28 *      Documentation/RCU
  29 */
  30#include <linux/types.h>
  31#include <linux/kernel.h>
  32#include <linux/init.h>
  33#include <linux/spinlock.h>
  34#include <linux/smp.h>
  35#include <linux/rcupdate.h>
  36#include <linux/interrupt.h>
  37#include <linux/sched.h>
  38#include <asm/atomic.h>
  39#include <linux/bitops.h>
  40#include <linux/module.h>
  41#include <linux/completion.h>
  42#include <linux/moduleparam.h>
  43#include <linux/percpu.h>
  44#include <linux/notifier.h>
  45#include <linux/cpu.h>
  46#include <linux/mutex.h>
  47#include <linux/time.h>
  48
  49#ifdef CONFIG_DEBUG_LOCK_ALLOC
  50static struct lock_class_key rcu_lock_key;
  51struct lockdep_map rcu_lock_map =
  52        STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
  53EXPORT_SYMBOL_GPL(rcu_lock_map);
  54#endif
  55
  56/* Data structures. */
  57
  58#define RCU_STATE_INITIALIZER(name) { \
  59        .level = { &name.node[0] }, \
  60        .levelcnt = { \
  61                NUM_RCU_LVL_0,  /* root of hierarchy. */ \
  62                NUM_RCU_LVL_1, \
  63                NUM_RCU_LVL_2, \
  64                NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
  65        }, \
  66        .signaled = RCU_SIGNAL_INIT, \
  67        .gpnum = -300, \
  68        .completed = -300, \
  69        .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
  70        .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
  71        .n_force_qs = 0, \
  72        .n_force_qs_ngp = 0, \
  73}
  74
  75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
  76DEFINE_PER_CPU(struct rcu_data, rcu_data);
  77
  78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
  79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
  80
  81/*
  82 * Increment the quiescent state counter.
   83 * The counter is somewhat degenerate: we do not need to know how many
   84 * quiescent states have passed, just whether there has been at least
   85 * one since the start of the grace period.  Thus it is really a flag.
  86 */
  87void rcu_qsctr_inc(int cpu)
  88{
  89        struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
  90        rdp->passed_quiesc = 1;
  91        rdp->passed_quiesc_completed = rdp->completed;
  92}
  93
  94void rcu_bh_qsctr_inc(int cpu)
  95{
  96        struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
  97        rdp->passed_quiesc = 1;
  98        rdp->passed_quiesc_completed = rdp->completed;
  99}
 100
 101#ifdef CONFIG_NO_HZ
 102DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 103        .dynticks_nesting = 1,
 104        .dynticks = 1,
 105};
 106#endif /* #ifdef CONFIG_NO_HZ */
 107
 108static int blimit = 10;         /* Maximum callbacks per softirq. */
 109static int qhimark = 10000;     /* If this many pending, ignore blimit. */
 110static int qlowmark = 100;      /* Once only this many pending, use blimit. */
 111
 112static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
 113
 114/*
 115 * Return the number of RCU batches processed thus far for debug & stats.
 116 */
 117long rcu_batches_completed(void)
 118{
 119        return rcu_state.completed;
 120}
 121EXPORT_SYMBOL_GPL(rcu_batches_completed);
 122
 123/*
 124 * Return the number of RCU BH batches processed thus far for debug & stats.
 125 */
 126long rcu_batches_completed_bh(void)
 127{
 128        return rcu_bh_state.completed;
 129}
 130EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 131
 132/*
 133 * Does the CPU have callbacks ready to be invoked?
 134 */
 135static int
 136cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
 137{
 138        return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
 139}
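
/*
 * Illustrative sketch of the per-CPU callback list tested above; cb1..cb4
 * are hypothetical callbacks.  ->nxtlist is a single linked list that the
 * ->nxttail[] pointers partition into four segments, which (judging from
 * the code in this file) are, in order: ready to invoke (up to
 * RCU_DONE_TAIL), waiting for the current grace period (up to
 * RCU_WAIT_TAIL), assigned to the next grace period (up to
 * RCU_NEXT_READY_TAIL), and newly queued (up to RCU_NEXT_TAIL):
 *
 *	rdp->nxtlist                      == &cb1
 *	rdp->nxttail[RCU_DONE_TAIL]       == &cb2.next  (cb1, cb2 ready)
 *	rdp->nxttail[RCU_WAIT_TAIL]       == &cb3.next  (cb3 waits on this GP)
 *	rdp->nxttail[RCU_NEXT_READY_TAIL] == &cb4.next  (cb4 waits on next GP)
 *	rdp->nxttail[RCU_NEXT_TAIL]       == &cb4.next  (nothing newer queued)
 *
 * With an empty list every tail pointer equals &rdp->nxtlist, so the test
 * in cpu_has_callbacks_ready_to_invoke() evaluates to zero.
 */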
 140
 141/*
  142 * Does the current CPU require a not-yet-started grace period?
 143 */
 144static int
 145cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 146{
 147        /* ACCESS_ONCE() because we are accessing outside of lock. */
 148        return *rdp->nxttail[RCU_DONE_TAIL] &&
 149               ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
 150}
 151
 152/*
 153 * Return the root node of the specified rcu_state structure.
 154 */
 155static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 156{
 157        return &rsp->node[0];
 158}
 159
 160#ifdef CONFIG_SMP
 161
 162/*
 163 * If the specified CPU is offline, tell the caller that it is in
 164 * a quiescent state.  Otherwise, whack it with a reschedule IPI.
 165 * Grace periods can end up waiting on an offline CPU when that
 166 * CPU is in the process of coming online -- it will be added to the
 167 * rcu_node bitmasks before it actually makes it online.  The same thing
  168 * can happen while a CPU is in the process of going offline.  Because this
 169 * race is quite rare, we check for it after detecting that the grace
 170 * period has been delayed rather than checking each and every CPU
 171 * each and every time we start a new grace period.
 172 */
 173static int rcu_implicit_offline_qs(struct rcu_data *rdp)
 174{
 175        /*
 176         * If the CPU is offline, it is in a quiescent state.  We can
 177         * trust its state not to change because interrupts are disabled.
 178         */
 179        if (cpu_is_offline(rdp->cpu)) {
 180                rdp->offline_fqs++;
 181                return 1;
 182        }
 183
 184        /* The CPU is online, so send it a reschedule IPI. */
 185        if (rdp->cpu != smp_processor_id())
 186                smp_send_reschedule(rdp->cpu);
 187        else
 188                set_need_resched();
 189        rdp->resched_ipi++;
 190        return 0;
 191}
 192
 193#endif /* #ifdef CONFIG_SMP */
 194
 195#ifdef CONFIG_NO_HZ
 196static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
 197
 198/**
 199 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
 200 *
 201 * Enter nohz mode, in other words, -leave- the mode in which RCU
 202 * read-side critical sections can occur.  (Though RCU read-side
 203 * critical sections can occur in irq handlers in nohz mode, a possibility
 204 * handled by rcu_irq_enter() and rcu_irq_exit()).
 205 */
 206void rcu_enter_nohz(void)
 207{
 208        unsigned long flags;
 209        struct rcu_dynticks *rdtp;
 210
 211        smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 212        local_irq_save(flags);
 213        rdtp = &__get_cpu_var(rcu_dynticks);
 214        rdtp->dynticks++;
 215        rdtp->dynticks_nesting--;
 216        WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
 217        local_irq_restore(flags);
 218}
 219
 220/*
 221 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
 222 *
 223 * Exit nohz mode, in other words, -enter- the mode in which RCU
 224 * read-side critical sections normally occur.
 225 */
 226void rcu_exit_nohz(void)
 227{
 228        unsigned long flags;
 229        struct rcu_dynticks *rdtp;
 230
 231        local_irq_save(flags);
 232        rdtp = &__get_cpu_var(rcu_dynticks);
 233        rdtp->dynticks++;
 234        rdtp->dynticks_nesting++;
 235        WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
 236        local_irq_restore(flags);
 237        smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 238}
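
/*
 * Illustrative sketch of the counter protocol used above; the sequence
 * below is hypothetical.  ->dynticks is incremented on every transition
 * into or out of dynticks-idle mode, so its low-order bit encodes the
 * current state: odd means the CPU may be running RCU read-side critical
 * sections, even means it is idle as far as RCU is concerned:
 *
 *	boot:                   dynticks == 1  (odd: RCU must watch)
 *	rcu_enter_nohz():       dynticks == 2  (even: dynticks idle)
 *	rcu_irq_enter():        dynticks == 3  (odd: irq may use RCU)
 *	rcu_irq_exit():         dynticks == 4  (even: idle again)
 *	rcu_exit_nohz():        dynticks == 5  (odd: back to normal)
 *
 * dyntick_save_progress_counter() and rcu_implicit_dynticks_qs() below
 * rely on exactly this parity, plus the fact that any change in the
 * counter since a snapshot implies at least one pass through idle.
 */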
 239
 240/**
 241 * rcu_nmi_enter - inform RCU of entry to NMI context
 242 *
 243 * If the CPU was idle with dynamic ticks active, and there is no
 244 * irq handler running, this updates rdtp->dynticks_nmi to let the
 245 * RCU grace-period handling know that the CPU is active.
 246 */
 247void rcu_nmi_enter(void)
 248{
 249        struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
 250
 251        if (rdtp->dynticks & 0x1)
 252                return;
 253        rdtp->dynticks_nmi++;
 254        WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
 255        smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 256}
 257
 258/**
 259 * rcu_nmi_exit - inform RCU of exit from NMI context
 260 *
 261 * If the CPU was idle with dynamic ticks active, and there is no
 262 * irq handler running, this updates rdtp->dynticks_nmi to let the
 263 * RCU grace-period handling know that the CPU is no longer active.
 264 */
 265void rcu_nmi_exit(void)
 266{
 267        struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
 268
 269        if (rdtp->dynticks & 0x1)
 270                return;
 271        smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 272        rdtp->dynticks_nmi++;
 273        WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
 274}
 275
 276/**
 277 * rcu_irq_enter - inform RCU of entry to hard irq context
 278 *
 279 * If the CPU was idle with dynamic ticks active, this updates the
 280 * rdtp->dynticks to let the RCU handling know that the CPU is active.
 281 */
 282void rcu_irq_enter(void)
 283{
 284        struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
 285
 286        if (rdtp->dynticks_nesting++)
 287                return;
 288        rdtp->dynticks++;
 289        WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
 290        smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 291}
 292
 293/**
 294 * rcu_irq_exit - inform RCU of exit from hard irq context
 295 *
  296 * If the CPU was idle with dynamic ticks active, update rdtp->dynticks
  297 * to let the RCU handling be aware that the CPU is going back to idle
  298 * with no ticks.
 299 */
 300void rcu_irq_exit(void)
 301{
 302        struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
 303
 304        if (--rdtp->dynticks_nesting)
 305                return;
 306        smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 307        rdtp->dynticks++;
 308        WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
 309
 310        /* If the interrupt queued a callback, get out of dyntick mode. */
 311        if (__get_cpu_var(rcu_data).nxtlist ||
 312            __get_cpu_var(rcu_bh_data).nxtlist)
 313                set_need_resched();
 314}
 315
 316/*
 317 * Record the specified "completed" value, which is later used to validate
 318 * dynticks counter manipulations.  Specify "rsp->completed - 1" to
 319 * unconditionally invalidate any future dynticks manipulations (which is
 320 * useful at the beginning of a grace period).
 321 */
 322static void dyntick_record_completed(struct rcu_state *rsp, long comp)
 323{
 324        rsp->dynticks_completed = comp;
 325}
 326
 327#ifdef CONFIG_SMP
 328
 329/*
 330 * Recall the previously recorded value of the completion for dynticks.
 331 */
 332static long dyntick_recall_completed(struct rcu_state *rsp)
 333{
 334        return rsp->dynticks_completed;
 335}
 336
 337/*
 338 * Snapshot the specified CPU's dynticks counter so that we can later
 339 * credit them with an implicit quiescent state.  Return 1 if this CPU
 340 * is already in a quiescent state courtesy of dynticks idle mode.
 341 */
 342static int dyntick_save_progress_counter(struct rcu_data *rdp)
 343{
 344        int ret;
 345        int snap;
 346        int snap_nmi;
 347
 348        snap = rdp->dynticks->dynticks;
 349        snap_nmi = rdp->dynticks->dynticks_nmi;
 350        smp_mb();       /* Order sampling of snap with end of grace period. */
 351        rdp->dynticks_snap = snap;
 352        rdp->dynticks_nmi_snap = snap_nmi;
 353        ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
 354        if (ret)
 355                rdp->dynticks_fqs++;
 356        return ret;
 357}
 358
 359/*
 360 * Return true if the specified CPU has passed through a quiescent
  361 * state by virtue of being in or having passed through a dynticks
 362 * idle state since the last call to dyntick_save_progress_counter()
 363 * for this same CPU.
 364 */
 365static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 366{
 367        long curr;
 368        long curr_nmi;
 369        long snap;
 370        long snap_nmi;
 371
 372        curr = rdp->dynticks->dynticks;
 373        snap = rdp->dynticks_snap;
 374        curr_nmi = rdp->dynticks->dynticks_nmi;
 375        snap_nmi = rdp->dynticks_nmi_snap;
 376        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 377
 378        /*
 379         * If the CPU passed through or entered a dynticks idle phase with
 380         * no active irq/NMI handlers, then we can safely pretend that the CPU
 381         * already acknowledged the request to pass through a quiescent
 382         * state.  Either way, that CPU cannot possibly be in an RCU
 383         * read-side critical section that started before the beginning
 384         * of the current RCU grace period.
 385         */
 386        if ((curr != snap || (curr & 0x1) == 0) &&
 387            (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
 388                rdp->dynticks_fqs++;
 389                return 1;
 390        }
 391
 392        /* Go check for the CPU being offline. */
 393        return rcu_implicit_offline_qs(rdp);
 394}
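
/*
 * Hypothetical worked example for the test above: suppose
 * dyntick_save_progress_counter() sampled snap == 7 and snap_nmi == 4
 * for some holdout CPU.  If this function later sees curr == 7 and
 * curr_nmi == 4, neither counter has moved and curr is odd, so the
 * condition fails and we fall through to the offline check.  If instead
 * curr == 9, the CPU passed through dynticks idle at least once since
 * the snapshot (and curr_nmi is even, so it is not in an NMI taken from
 * idle), so it is credited with an implicit quiescent state even though
 * it never explicitly reported one.
 */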
 395
 396#endif /* #ifdef CONFIG_SMP */
 397
 398#else /* #ifdef CONFIG_NO_HZ */
 399
 400static void dyntick_record_completed(struct rcu_state *rsp, long comp)
 401{
 402}
 403
 404#ifdef CONFIG_SMP
 405
 406/*
 407 * If there are no dynticks, then the only way that a CPU can passively
 408 * be in a quiescent state is to be offline.  Unlike dynticks idle, which
 409 * is a point in time during the prior (already finished) grace period,
  410 * an offline CPU is always in a quiescent state, so its quiescent state
  411 * can be applied unconditionally.  Just return the current value of completed.
 412 */
 413static long dyntick_recall_completed(struct rcu_state *rsp)
 414{
 415        return rsp->completed;
 416}
 417
 418static int dyntick_save_progress_counter(struct rcu_data *rdp)
 419{
 420        return 0;
 421}
 422
 423static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 424{
 425        return rcu_implicit_offline_qs(rdp);
 426}
 427
 428#endif /* #ifdef CONFIG_SMP */
 429
 430#endif /* #else #ifdef CONFIG_NO_HZ */
 431
 432#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 433
 434static void record_gp_stall_check_time(struct rcu_state *rsp)
 435{
 436        rsp->gp_start = jiffies;
 437        rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
 438}
 439
 440static void print_other_cpu_stall(struct rcu_state *rsp)
 441{
 442        int cpu;
 443        long delta;
 444        unsigned long flags;
 445        struct rcu_node *rnp = rcu_get_root(rsp);
 446        struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
 447        struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
 448
 449        /* Only let one CPU complain about others per time interval. */
 450
 451        spin_lock_irqsave(&rnp->lock, flags);
 452        delta = jiffies - rsp->jiffies_stall;
 453        if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
 454                spin_unlock_irqrestore(&rnp->lock, flags);
 455                return;
 456        }
 457        rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
 458        spin_unlock_irqrestore(&rnp->lock, flags);
 459
 460        /* OK, time to rat on our buddy... */
 461
 462        printk(KERN_ERR "INFO: RCU detected CPU stalls:");
 463        for (; rnp_cur < rnp_end; rnp_cur++) {
 464                if (rnp_cur->qsmask == 0)
 465                        continue;
 466                for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
 467                        if (rnp_cur->qsmask & (1UL << cpu))
 468                                printk(" %d", rnp_cur->grplo + cpu);
 469        }
 470        printk(" (detected by %d, t=%ld jiffies)\n",
 471               smp_processor_id(), (long)(jiffies - rsp->gp_start));
 472        force_quiescent_state(rsp, 0);  /* Kick them all. */
 473}
 474
 475static void print_cpu_stall(struct rcu_state *rsp)
 476{
 477        unsigned long flags;
 478        struct rcu_node *rnp = rcu_get_root(rsp);
 479
 480        printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
 481                        smp_processor_id(), jiffies - rsp->gp_start);
 482        dump_stack();
 483        spin_lock_irqsave(&rnp->lock, flags);
 484        if ((long)(jiffies - rsp->jiffies_stall) >= 0)
 485                rsp->jiffies_stall =
 486                        jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
 487        spin_unlock_irqrestore(&rnp->lock, flags);
 488        set_need_resched();  /* kick ourselves to get things going. */
 489}
 490
 491static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 492{
 493        long delta;
 494        struct rcu_node *rnp;
 495
 496        delta = jiffies - rsp->jiffies_stall;
 497        rnp = rdp->mynode;
 498        if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
 499
 500                /* We haven't checked in, so go dump stack. */
 501                print_cpu_stall(rsp);
 502
 503        } else if (rsp->gpnum != rsp->completed &&
 504                   delta >= RCU_STALL_RAT_DELAY) {
 505
 506                /* They had two time units to dump stack, so complain. */
 507                print_other_cpu_stall(rsp);
 508        }
 509}
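
/*
 * In short: record_gp_stall_check_time() arms the detector at the start
 * of each grace period by setting ->jiffies_stall a fixed interval
 * (RCU_SECONDS_TILL_STALL_CHECK) into the future.  Once that time passes,
 * a CPU whose own qsmask bit is still set reports itself via
 * print_cpu_stall() and dumps its stack; if the grace period is still
 * incomplete RCU_STALL_RAT_DELAY jiffies later, some other CPU names the
 * holdouts via print_other_cpu_stall() and kicks them with
 * force_quiescent_state().
 */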
 510
 511#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 512
 513static void record_gp_stall_check_time(struct rcu_state *rsp)
 514{
 515}
 516
 517static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 518{
 519}
 520
 521#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 522
 523/*
 524 * Update CPU-local rcu_data state to record the newly noticed grace period.
 525 * This is used both when we started the grace period and when we notice
 526 * that someone else started the grace period.
 527 */
 528static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
 529{
 530        rdp->qs_pending = 1;
 531        rdp->passed_quiesc = 0;
 532        rdp->gpnum = rsp->gpnum;
 533}
 534
 535/*
  536 * Did someone else start a new RCU grace period since we last
 537 * checked?  Update local state appropriately if so.  Must be called
 538 * on the CPU corresponding to rdp.
 539 */
 540static int
 541check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
 542{
 543        unsigned long flags;
 544        int ret = 0;
 545
 546        local_irq_save(flags);
 547        if (rdp->gpnum != rsp->gpnum) {
 548                note_new_gpnum(rsp, rdp);
 549                ret = 1;
 550        }
 551        local_irq_restore(flags);
 552        return ret;
 553}
 554
 555/*
 556 * Start a new RCU grace period if warranted, re-initializing the hierarchy
 557 * in preparation for detecting the next grace period.  The caller must hold
 558 * the root node's ->lock, which is released before return.  Hard irqs must
 559 * be disabled.
 560 */
 561static void
 562rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 563        __releases(rcu_get_root(rsp)->lock)
 564{
 565        struct rcu_data *rdp = rsp->rda[smp_processor_id()];
 566        struct rcu_node *rnp = rcu_get_root(rsp);
 567        struct rcu_node *rnp_cur;
 568        struct rcu_node *rnp_end;
 569
 570        if (!cpu_needs_another_gp(rsp, rdp)) {
 571                spin_unlock_irqrestore(&rnp->lock, flags);
 572                return;
 573        }
 574
 575        /* Advance to a new grace period and initialize state. */
 576        rsp->gpnum++;
 577        rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
 578        rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
 579        record_gp_stall_check_time(rsp);
 580        dyntick_record_completed(rsp, rsp->completed - 1);
 581        note_new_gpnum(rsp, rdp);
 582
 583        /*
 584         * Because we are first, we know that all our callbacks will
 585         * be covered by this upcoming grace period, even the ones
 586         * that were registered arbitrarily recently.
 587         */
 588        rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 589        rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 590
 591        /* Special-case the common single-level case. */
 592        if (NUM_RCU_NODES == 1) {
 593                rnp->qsmask = rnp->qsmaskinit;
 594                rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
 595                spin_unlock_irqrestore(&rnp->lock, flags);
 596                return;
 597        }
 598
 599        spin_unlock(&rnp->lock);  /* leave irqs disabled. */
 600
 601
 602        /* Exclude any concurrent CPU-hotplug operations. */
 603        spin_lock(&rsp->onofflock);  /* irqs already disabled. */
 604
 605        /*
 606         * Set the quiescent-state-needed bits in all the non-leaf RCU
 607         * nodes for all currently online CPUs.  This operation relies
 608         * on the layout of the hierarchy within the rsp->node[] array.
 609         * Note that other CPUs will access only the leaves of the
 610         * hierarchy, which still indicate that no grace period is in
 611         * progress.  In addition, we have excluded CPU-hotplug operations.
 612         *
 613         * We therefore do not need to hold any locks.  Any required
 614         * memory barriers will be supplied by the locks guarding the
 615         * leaf rcu_nodes in the hierarchy.
 616         */
 617
 618        rnp_end = rsp->level[NUM_RCU_LVLS - 1];
 619        for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
 620                rnp_cur->qsmask = rnp_cur->qsmaskinit;
 621
 622        /*
 623         * Now set up the leaf nodes.  Here we must be careful.  First,
 624         * we need to hold the lock in order to exclude other CPUs, which
 625         * might be contending for the leaf nodes' locks.  Second, as
 626         * soon as we initialize a given leaf node, its CPUs might run
 627         * up the rest of the hierarchy.  We must therefore acquire locks
 628         * for each node that we touch during this stage.  (But we still
 629         * are excluding CPU-hotplug operations.)
 630         *
 631         * Note that the grace period cannot complete until we finish
 632         * the initialization process, as there will be at least one
 633         * qsmask bit set in the root node until that time, namely the
 634         * one corresponding to this CPU.
 635         */
 636        rnp_end = &rsp->node[NUM_RCU_NODES];
 637        rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
 638        for (; rnp_cur < rnp_end; rnp_cur++) {
 639                spin_lock(&rnp_cur->lock);      /* irqs already disabled. */
 640                rnp_cur->qsmask = rnp_cur->qsmaskinit;
 641                spin_unlock(&rnp_cur->lock);    /* irqs already disabled. */
 642        }
 643
 644        rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
 645        spin_unlock_irqrestore(&rsp->onofflock, flags);
 646}
 647
 648/*
 649 * Advance this CPU's callbacks, but only if the current grace period
 650 * has ended.  This may be called only from the CPU to whom the rdp
 651 * belongs.
 652 */
 653static void
 654rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
 655{
 656        long completed_snap;
 657        unsigned long flags;
 658
 659        local_irq_save(flags);
 660        completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
 661
 662        /* Did another grace period end? */
 663        if (rdp->completed != completed_snap) {
 664
 665                /* Advance callbacks.  No harm if list empty. */
 666                rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
 667                rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
 668                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 669
 670                /* Remember that we saw this grace-period completion. */
 671                rdp->completed = completed_snap;
 672        }
 673        local_irq_restore(flags);
 674}
 675
 676/*
 677 * Similar to cpu_quiet(), for which it is a helper function.  Allows
 678 * a group of CPUs to be quieted at one go, though all the CPUs in the
 679 * group must be represented by the same leaf rcu_node structure.
 680 * That structure's lock must be held upon entry, and it is released
 681 * before return.
 682 */
 683static void
 684cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
 685              unsigned long flags)
 686        __releases(rnp->lock)
 687{
 688        /* Walk up the rcu_node hierarchy. */
 689        for (;;) {
 690                if (!(rnp->qsmask & mask)) {
 691
 692                        /* Our bit has already been cleared, so done. */
 693                        spin_unlock_irqrestore(&rnp->lock, flags);
 694                        return;
 695                }
 696                rnp->qsmask &= ~mask;
 697                if (rnp->qsmask != 0) {
 698
 699                        /* Other bits still set at this level, so done. */
 700                        spin_unlock_irqrestore(&rnp->lock, flags);
 701                        return;
 702                }
 703                mask = rnp->grpmask;
 704                if (rnp->parent == NULL) {
 705
 706                        /* No more levels.  Exit loop holding root lock. */
 707
 708                        break;
 709                }
 710                spin_unlock_irqrestore(&rnp->lock, flags);
 711                rnp = rnp->parent;
 712                spin_lock_irqsave(&rnp->lock, flags);
 713        }
 714
 715        /*
 716         * Get here if we are the last CPU to pass through a quiescent
 717         * state for this grace period.  Clean up and let rcu_start_gp()
 718         * start up the next grace period if one is needed.  Note that
 719         * we still hold rnp->lock, as required by rcu_start_gp(), which
 720         * will release it.
 721         */
 722        rsp->completed = rsp->gpnum;
 723        rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
 724        rcu_start_gp(rsp, flags);  /* releases rnp->lock. */
 725}
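
/*
 * Hypothetical example: consider a two-level hierarchy in which leaf
 * node A covers CPUs 0-63 and the root covers all leaves.  When CPU 5
 * reports its quiescent state, cpu_quiet_msk() clears bit 5 in
 * A->qsmask.  If other bits in A remain set, the walk stops there.  If
 * CPU 5 was A's last holdout, the walk moves up with mask == A->grpmask,
 * clears A's bit in the root's qsmask, and, if that was the root's last
 * bit, marks the grace period completed and calls rcu_start_gp() with
 * the root lock still held.
 */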
 726
 727/*
 728 * Record a quiescent state for the specified CPU, which must either be
 729 * the current CPU or an offline CPU.  The lastcomp argument is used to
 730 * make sure we are still in the grace period of interest.  We don't want
 731 * to end the current grace period based on quiescent states detected in
 732 * an earlier grace period!
 733 */
 734static void
 735cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
 736{
 737        unsigned long flags;
 738        unsigned long mask;
 739        struct rcu_node *rnp;
 740
 741        rnp = rdp->mynode;
 742        spin_lock_irqsave(&rnp->lock, flags);
 743        if (lastcomp != ACCESS_ONCE(rsp->completed)) {
 744
 745                /*
 746                 * Someone beat us to it for this grace period, so leave.
 747                 * The race with GP start is resolved by the fact that we
 748                 * hold the leaf rcu_node lock, so that the per-CPU bits
 749                 * cannot yet be initialized -- so we would simply find our
 750                 * CPU's bit already cleared in cpu_quiet_msk() if this race
 751                 * occurred.
 752                 */
 753                rdp->passed_quiesc = 0; /* try again later! */
 754                spin_unlock_irqrestore(&rnp->lock, flags);
 755                return;
 756        }
 757        mask = rdp->grpmask;
 758        if ((rnp->qsmask & mask) == 0) {
 759                spin_unlock_irqrestore(&rnp->lock, flags);
 760        } else {
 761                rdp->qs_pending = 0;
 762
 763                /*
  764                 * This GP can't end until this CPU checks in, so all of our
 765                 * callbacks can be processed during the next GP.
 766                 */
 767                rdp = rsp->rda[smp_processor_id()];
 768                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 769
 770                cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
 771        }
 772}
 773
 774/*
 775 * Check to see if there is a new grace period of which this CPU
 776 * is not yet aware, and if so, set up local rcu_data state for it.
 777 * Otherwise, see if this CPU has just passed through its first
 778 * quiescent state for this grace period, and record that fact if so.
 779 */
 780static void
 781rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 782{
 783        /* If there is now a new grace period, record and return. */
 784        if (check_for_new_grace_period(rsp, rdp))
 785                return;
 786
 787        /*
 788         * Does this CPU still need to do its part for current grace period?
 789         * If no, return and let the other CPUs do their part as well.
 790         */
 791        if (!rdp->qs_pending)
 792                return;
 793
 794        /*
 795         * Was there a quiescent state since the beginning of the grace
 796         * period? If no, then exit and wait for the next call.
 797         */
 798        if (!rdp->passed_quiesc)
 799                return;
 800
 801        /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
 802        cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
 803}
 804
 805#ifdef CONFIG_HOTPLUG_CPU
 806
 807/*
 808 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
 809 * and move all callbacks from the outgoing CPU to the current one.
 810 */
 811static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 812{
 813        int i;
 814        unsigned long flags;
 815        long lastcomp;
 816        unsigned long mask;
 817        struct rcu_data *rdp = rsp->rda[cpu];
 818        struct rcu_data *rdp_me;
 819        struct rcu_node *rnp;
 820
 821        /* Exclude any attempts to start a new grace period. */
 822        spin_lock_irqsave(&rsp->onofflock, flags);
 823
 824        /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
 825        rnp = rdp->mynode;
 826        mask = rdp->grpmask;    /* rnp->grplo is constant. */
 827        do {
 828                spin_lock(&rnp->lock);          /* irqs already disabled. */
 829                rnp->qsmaskinit &= ~mask;
 830                if (rnp->qsmaskinit != 0) {
 831                        spin_unlock(&rnp->lock); /* irqs already disabled. */
 832                        break;
 833                }
 834                mask = rnp->grpmask;
 835                spin_unlock(&rnp->lock);        /* irqs already disabled. */
 836                rnp = rnp->parent;
 837        } while (rnp != NULL);
 838        lastcomp = rsp->completed;
 839
 840        spin_unlock(&rsp->onofflock);           /* irqs remain disabled. */
 841
 842        /* Being offline is a quiescent state, so go record it. */
 843        cpu_quiet(cpu, rsp, rdp, lastcomp);
 844
 845        /*
 846         * Move callbacks from the outgoing CPU to the running CPU.
  847         * Note that the outgoing CPU is now quiescent, so it is now
  848         * (uncharacteristically) safe to access its rcu_data structure.
 849         * Note also that we must carefully retain the order of the
 850         * outgoing CPU's callbacks in order for rcu_barrier() to work
 851         * correctly.  Finally, note that we start all the callbacks
 852         * afresh, even those that have passed through a grace period
 853         * and are therefore ready to invoke.  The theory is that hotplug
 854         * events are rare, and that if they are frequent enough to
 855         * indefinitely delay callbacks, you have far worse things to
 856         * be worrying about.
 857         */
 858        rdp_me = rsp->rda[smp_processor_id()];
 859        if (rdp->nxtlist != NULL) {
 860                *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
 861                rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 862                rdp->nxtlist = NULL;
 863                for (i = 0; i < RCU_NEXT_SIZE; i++)
 864                        rdp->nxttail[i] = &rdp->nxtlist;
 865                rdp_me->qlen += rdp->qlen;
 866                rdp->qlen = 0;
 867        }
 868        local_irq_restore(flags);
 869}
 870
 871/*
 872 * Remove the specified CPU from the RCU hierarchy and move any pending
 873 * callbacks that it might have to the current CPU.  This code assumes
 874 * that at least one CPU in the system will remain running at all times.
 875 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
 876 */
 877static void rcu_offline_cpu(int cpu)
 878{
 879        __rcu_offline_cpu(cpu, &rcu_state);
 880        __rcu_offline_cpu(cpu, &rcu_bh_state);
 881}
 882
 883#else /* #ifdef CONFIG_HOTPLUG_CPU */
 884
 885static void rcu_offline_cpu(int cpu)
 886{
 887}
 888
 889#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 890
 891/*
 892 * Invoke any RCU callbacks that have made it to the end of their grace
  893 * period.  Throttle as specified by rdp->blimit.
 894 */
 895static void rcu_do_batch(struct rcu_data *rdp)
 896{
 897        unsigned long flags;
 898        struct rcu_head *next, *list, **tail;
 899        int count;
 900
  901        /* If no callbacks are ready, just return. */
 902        if (!cpu_has_callbacks_ready_to_invoke(rdp))
 903                return;
 904
 905        /*
 906         * Extract the list of ready callbacks, disabling to prevent
 907         * races with call_rcu() from interrupt handlers.
 908         */
 909        local_irq_save(flags);
 910        list = rdp->nxtlist;
 911        rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
 912        *rdp->nxttail[RCU_DONE_TAIL] = NULL;
 913        tail = rdp->nxttail[RCU_DONE_TAIL];
 914        for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
 915                if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
 916                        rdp->nxttail[count] = &rdp->nxtlist;
 917        local_irq_restore(flags);
 918
 919        /* Invoke callbacks. */
 920        count = 0;
 921        while (list) {
 922                next = list->next;
 923                prefetch(next);
 924                list->func(list);
 925                list = next;
 926                if (++count >= rdp->blimit)
 927                        break;
 928        }
 929
 930        local_irq_save(flags);
 931
 932        /* Update count, and requeue any remaining callbacks. */
 933        rdp->qlen -= count;
 934        if (list != NULL) {
 935                *tail = rdp->nxtlist;
 936                rdp->nxtlist = list;
 937                for (count = 0; count < RCU_NEXT_SIZE; count++)
 938                        if (&rdp->nxtlist == rdp->nxttail[count])
 939                                rdp->nxttail[count] = tail;
 940                        else
 941                                break;
 942        }
 943
 944        /* Reinstate batch limit if we have worked down the excess. */
 945        if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
 946                rdp->blimit = blimit;
 947
 948        local_irq_restore(flags);
 949
 950        /* Re-raise the RCU softirq if there are callbacks remaining. */
 951        if (cpu_has_callbacks_ready_to_invoke(rdp))
 952                raise_softirq(RCU_SOFTIRQ);
 953}
 954
 955/*
 956 * Check to see if this CPU is in a non-context-switch quiescent state
 957 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
 958 * Also schedule the RCU softirq handler.
 959 *
 960 * This function must be called with hardirqs disabled.  It is normally
 961 * invoked from the scheduling-clock interrupt.  If rcu_pending returns
 962 * false, there is no point in invoking rcu_check_callbacks().
 963 */
 964void rcu_check_callbacks(int cpu, int user)
 965{
 966        if (user ||
 967            (idle_cpu(cpu) && rcu_scheduler_active &&
 968             !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 969
 970                /*
 971                 * Get here if this CPU took its interrupt from user
 972                 * mode or from the idle loop, and if this is not a
 973                 * nested interrupt.  In this case, the CPU is in
 974                 * a quiescent state, so count it.
 975                 *
 976                 * No memory barrier is required here because both
 977                 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
 978                 * only CPU-local variables that other CPUs neither
 979                 * access nor modify, at least not while the corresponding
 980                 * CPU is online.
 981                 */
 982
 983                rcu_qsctr_inc(cpu);
 984                rcu_bh_qsctr_inc(cpu);
 985
 986        } else if (!in_softirq()) {
 987
 988                /*
 989                 * Get here if this CPU did not take its interrupt from
 990                 * softirq, in other words, if it is not interrupting
 991                 * a rcu_bh read-side critical section.  This is an _bh
 992                 * critical section, so count it.
 993                 */
 994
 995                rcu_bh_qsctr_inc(cpu);
 996        }
 997        raise_softirq(RCU_SOFTIRQ);
 998}
 999
1000#ifdef CONFIG_SMP
1001
1002/*
1003 * Scan the leaf rcu_node structures, processing dyntick state for any that
1004 * have not yet encountered a quiescent state, using the function specified.
1005 * Returns 1 if the current grace period ends while scanning (possibly
1006 * because we made it end).
1007 */
1008static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1009                               int (*f)(struct rcu_data *))
1010{
1011        unsigned long bit;
1012        int cpu;
1013        unsigned long flags;
1014        unsigned long mask;
1015        struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
1016        struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
1017
1018        for (; rnp_cur < rnp_end; rnp_cur++) {
1019                mask = 0;
1020                spin_lock_irqsave(&rnp_cur->lock, flags);
1021                if (rsp->completed != lastcomp) {
1022                        spin_unlock_irqrestore(&rnp_cur->lock, flags);
1023                        return 1;
1024                }
1025                if (rnp_cur->qsmask == 0) {
1026                        spin_unlock_irqrestore(&rnp_cur->lock, flags);
1027                        continue;
1028                }
1029                cpu = rnp_cur->grplo;
1030                bit = 1;
1031                for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
1032                        if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1033                                mask |= bit;
1034                }
1035                if (mask != 0 && rsp->completed == lastcomp) {
1036
1037                        /* cpu_quiet_msk() releases rnp_cur->lock. */
1038                        cpu_quiet_msk(mask, rsp, rnp_cur, flags);
1039                        continue;
1040                }
1041                spin_unlock_irqrestore(&rnp_cur->lock, flags);
1042        }
1043        return 0;
1044}
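
/*
 * Summary of the ->signaled state machine driven below (the value of
 * RCU_SIGNAL_INIT is a header detail not shown in this file):
 * rcu_start_gp() holds ->signaled at RCU_GP_INIT while the hierarchy is
 * being initialized and then sets it to RCU_SIGNAL_INIT.  When that
 * equals RCU_SAVE_DYNTICK, the first forcing pass only snapshots each
 * holdout CPU's dynticks counters and advances to RCU_FORCE_QS; later
 * passes compare against those snapshots via rcu_implicit_dynticks_qs()
 * and fall back to reschedule IPIs for CPUs that are neither idle nor
 * offline.
 */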
1045
1046/*
1047 * Force quiescent states on reluctant CPUs, and also detect which
1048 * CPUs are in dyntick-idle mode.
1049 */
1050static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1051{
1052        unsigned long flags;
1053        long lastcomp;
1054        struct rcu_node *rnp = rcu_get_root(rsp);
1055        u8 signaled;
1056
1057        if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
1058                return;  /* No grace period in progress, nothing to force. */
1059        if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1060                rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
1061                return; /* Someone else is already on the job. */
1062        }
1063        if (relaxed &&
1064            (long)(rsp->jiffies_force_qs - jiffies) >= 0)
1065                goto unlock_ret; /* no emergency and done recently. */
1066        rsp->n_force_qs++;
1067        spin_lock(&rnp->lock);
1068        lastcomp = rsp->completed;
1069        signaled = rsp->signaled;
1070        rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1071        if (lastcomp == rsp->gpnum) {
1072                rsp->n_force_qs_ngp++;
1073                spin_unlock(&rnp->lock);
1074                goto unlock_ret;  /* no GP in progress, time updated. */
1075        }
1076        spin_unlock(&rnp->lock);
1077        switch (signaled) {
1078        case RCU_GP_INIT:
1079
1080                break; /* grace period still initializing, ignore. */
1081
1082        case RCU_SAVE_DYNTICK:
1083
1084                if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1085                        break; /* So gcc recognizes the dead code. */
1086
1087                /* Record dyntick-idle state. */
1088                if (rcu_process_dyntick(rsp, lastcomp,
1089                                        dyntick_save_progress_counter))
1090                        goto unlock_ret;
1091
1092                /* Update state, record completion counter. */
1093                spin_lock(&rnp->lock);
1094                if (lastcomp == rsp->completed) {
1095                        rsp->signaled = RCU_FORCE_QS;
1096                        dyntick_record_completed(rsp, lastcomp);
1097                }
1098                spin_unlock(&rnp->lock);
1099                break;
1100
1101        case RCU_FORCE_QS:
1102
 1103                /* Check dyntick-idle state, send IPI to laggards. */
1104                if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
1105                                        rcu_implicit_dynticks_qs))
1106                        goto unlock_ret;
1107
1108                /* Leave state in case more forcing is required. */
1109
1110                break;
1111        }
1112unlock_ret:
1113        spin_unlock_irqrestore(&rsp->fqslock, flags);
1114}
1115
1116#else /* #ifdef CONFIG_SMP */
1117
1118static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1119{
1120        set_need_resched();
1121}
1122
1123#endif /* #else #ifdef CONFIG_SMP */
1124
1125/*
1126 * This does the RCU processing work from softirq context for the
1127 * specified rcu_state and rcu_data structures.  This may be called
1128 * only from the CPU to whom the rdp belongs.
1129 */
1130static void
1131__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1132{
1133        unsigned long flags;
1134
1135        /*
1136         * If an RCU GP has gone long enough, go check for dyntick
1137         * idle CPUs and, if needed, send resched IPIs.
1138         */
1139        if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1140                force_quiescent_state(rsp, 1);
1141
1142        /*
1143         * Advance callbacks in response to end of earlier grace
1144         * period that some other CPU ended.
1145         */
1146        rcu_process_gp_end(rsp, rdp);
1147
1148        /* Update RCU state based on any recent quiescent states. */
1149        rcu_check_quiescent_state(rsp, rdp);
1150
1151        /* Does this CPU require a not-yet-started grace period? */
1152        if (cpu_needs_another_gp(rsp, rdp)) {
1153                spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1154                rcu_start_gp(rsp, flags);  /* releases above lock */
1155        }
1156
1157        /* If there are callbacks ready, invoke them. */
1158        rcu_do_batch(rdp);
1159}
1160
1161/*
1162 * Do softirq processing for the current CPU.
1163 */
1164static void rcu_process_callbacks(struct softirq_action *unused)
1165{
1166        /*
1167         * Memory references from any prior RCU read-side critical sections
1168         * executed by the interrupted code must be seen before any RCU
1169         * grace-period manipulations below.
1170         */
1171        smp_mb(); /* See above block comment. */
1172
1173        __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
1174        __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1175
1176        /*
1177         * Memory references from any later RCU read-side critical sections
1178         * executed by the interrupted code must be seen after any RCU
1179         * grace-period manipulations above.
1180         */
1181        smp_mb(); /* See above block comment. */
1182}
1183
1184static void
1185__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1186           struct rcu_state *rsp)
1187{
1188        unsigned long flags;
1189        struct rcu_data *rdp;
1190
1191        head->func = func;
1192        head->next = NULL;
1193
1194        smp_mb(); /* Ensure RCU update seen before callback registry. */
1195
1196        /*
1197         * Opportunistically note grace-period endings and beginnings.
1198         * Note that we might see a beginning right after we see an
1199         * end, but never vice versa, since this CPU has to pass through
1200         * a quiescent state betweentimes.
1201         */
1202        local_irq_save(flags);
1203        rdp = rsp->rda[smp_processor_id()];
1204        rcu_process_gp_end(rsp, rdp);
1205        check_for_new_grace_period(rsp, rdp);
1206
1207        /* Add the callback to our list. */
1208        *rdp->nxttail[RCU_NEXT_TAIL] = head;
1209        rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1210
1211        /* Start a new grace period if one not already started. */
1212        if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
1213                unsigned long nestflag;
1214                struct rcu_node *rnp_root = rcu_get_root(rsp);
1215
1216                spin_lock_irqsave(&rnp_root->lock, nestflag);
1217                rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
1218        }
1219
1220        /* Force the grace period if too many callbacks or too long waiting. */
1221        if (unlikely(++rdp->qlen > qhimark)) {
1222                rdp->blimit = LONG_MAX;
1223                force_quiescent_state(rsp, 0);
1224        } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1225                force_quiescent_state(rsp, 1);
1226        local_irq_restore(flags);
1227}
1228
1229/*
1230 * Queue an RCU callback for invocation after a grace period.
1231 */
1232void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1233{
1234        __call_rcu(head, func, &rcu_state);
1235}
1236EXPORT_SYMBOL_GPL(call_rcu);
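
/*
 * Illustrative usage sketch: struct gc_entry, gc_free_old() and
 * replace_entry() are hypothetical; only call_rcu() itself comes from
 * this file.  A typical updater embeds a struct rcu_head in the
 * protected object and frees the old version from the callback once a
 * grace period has elapsed:
 *
 *	struct gc_entry {
 *		int key;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void gc_free_old(struct rcu_head *head)
 *	{
 *		struct gc_entry *e = container_of(head, struct gc_entry, rcu);
 *
 *		kfree(e);
 *	}
 *
 *	old = replace_entry(new);
 *	call_rcu(&old->rcu, gc_free_old);  (old is freed after a grace period)
 */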
1237
1238/*
 1239 * Queue an RCU callback for invocation after a quicker grace period.
1240 */
1241void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1242{
1243        __call_rcu(head, func, &rcu_bh_state);
1244}
1245EXPORT_SYMBOL_GPL(call_rcu_bh);
1246
1247/*
1248 * Check to see if there is any immediate RCU-related work to be done
1249 * by the current CPU, for the specified type of RCU, returning 1 if so.
1250 * The checks are in order of increasing expense: checks that can be
1251 * carried out against CPU-local state are performed first.  However,
1252 * we must check for CPU stalls first, else we might not get a chance.
1253 */
1254static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1255{
1256        rdp->n_rcu_pending++;
1257
1258        /* Check for CPU stalls, if enabled. */
1259        check_cpu_stall(rsp, rdp);
1260
1261        /* Is the RCU core waiting for a quiescent state from this CPU? */
1262        if (rdp->qs_pending)
1263                return 1;
1264
1265        /* Does this CPU have callbacks ready to invoke? */
1266        if (cpu_has_callbacks_ready_to_invoke(rdp))
1267                return 1;
1268
1269        /* Has RCU gone idle with this CPU needing another grace period? */
1270        if (cpu_needs_another_gp(rsp, rdp))
1271                return 1;
1272
1273        /* Has another RCU grace period completed?  */
1274        if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
1275                return 1;
1276
1277        /* Has a new RCU grace period started? */
1278        if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
1279                return 1;
1280
1281        /* Has an RCU GP gone long enough to send resched IPIs &c? */
1282        if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1283            ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0))
1284                return 1;
1285
1286        /* nothing to do */
1287        return 0;
1288}
1289
1290/*
1291 * Check to see if there is any immediate RCU-related work to be done
1292 * by the current CPU, returning 1 if so.  This function is part of the
1293 * RCU implementation; it is -not- an exported member of the RCU API.
1294 */
1295int rcu_pending(int cpu)
1296{
1297        return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
1298               __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
1299}
1300
1301/*
1302 * Check to see if any future RCU-related work will need to be done
1303 * by the current CPU, even if none need be done immediately, returning
1304 * 1 if so.  This function is part of the RCU implementation; it is -not-
1305 * an exported member of the RCU API.
1306 */
1307int rcu_needs_cpu(int cpu)
1308{
1309        /* RCU callbacks either ready or pending? */
1310        return per_cpu(rcu_data, cpu).nxtlist ||
1311               per_cpu(rcu_bh_data, cpu).nxtlist;
1312}
1313
1314/*
1315 * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
1316 * approach so that we don't have to worry about how long the CPU has
1317 * been gone, or whether it ever was online previously.  We do trust the
1318 * ->mynode field, as it is constant for a given struct rcu_data and
1319 * initialized during early boot.
1320 *
1321 * Note that only one online or offline event can be happening at a given
1322 * time.  Note also that we can accept some slop in the rsp->completed
1323 * access due to the fact that this CPU cannot possibly have any RCU
1324 * callbacks in flight yet.
1325 */
1326static void __cpuinit
1327rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1328{
1329        unsigned long flags;
1330        int i;
1331        long lastcomp;
1332        unsigned long mask;
1333        struct rcu_data *rdp = rsp->rda[cpu];
1334        struct rcu_node *rnp = rcu_get_root(rsp);
1335
1336        /* Set up local state, ensuring consistent view of global state. */
1337        spin_lock_irqsave(&rnp->lock, flags);
1338        lastcomp = rsp->completed;
1339        rdp->completed = lastcomp;
1340        rdp->gpnum = lastcomp;
1341        rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
1342        rdp->qs_pending = 1;     /*  so set up to respond to current GP. */
1343        rdp->beenonline = 1;     /* We have now been online. */
1344        rdp->passed_quiesc_completed = lastcomp - 1;
1345        rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1346        rdp->nxtlist = NULL;
1347        for (i = 0; i < RCU_NEXT_SIZE; i++)
1348                rdp->nxttail[i] = &rdp->nxtlist;
1349        rdp->qlen = 0;
1350        rdp->blimit = blimit;
1351#ifdef CONFIG_NO_HZ
1352        rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1353#endif /* #ifdef CONFIG_NO_HZ */
1354        rdp->cpu = cpu;
1355        spin_unlock(&rnp->lock);                /* irqs remain disabled. */
1356
1357        /*
1358         * A new grace period might start here.  If so, we won't be part
1359         * of it, but that is OK, as we are currently in a quiescent state.
1360         */
1361
1362        /* Exclude any attempts to start a new GP on large systems. */
1363        spin_lock(&rsp->onofflock);             /* irqs already disabled. */
1364
1365        /* Add CPU to rcu_node bitmasks. */
1366        rnp = rdp->mynode;
1367        mask = rdp->grpmask;
1368        do {
1369                /* Exclude any attempts to start a new GP on small systems. */
1370                spin_lock(&rnp->lock);  /* irqs already disabled. */
1371                rnp->qsmaskinit |= mask;
1372                mask = rnp->grpmask;
1373                spin_unlock(&rnp->lock); /* irqs already disabled. */
1374                rnp = rnp->parent;
1375        } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1376
1377        spin_unlock(&rsp->onofflock);           /* irqs remain disabled. */
1378
1379        /*
1380         * A new grace period might start here.  If so, we will be part of
1381         * it, and its gpnum will be greater than ours, so we will
1382         * participate.  It is also possible for the gpnum to have been
1383         * incremented before this function was called, and the bitmasks
1384         * to not be filled out until now, in which case we will also
1385         * participate due to our gpnum being behind.
1386         */
1387
1388        /* Since it is coming online, the CPU is in a quiescent state. */
1389        cpu_quiet(cpu, rsp, rdp, lastcomp);
1390        local_irq_restore(flags);
1391}
1392
1393static void __cpuinit rcu_online_cpu(int cpu)
1394{
1395        rcu_init_percpu_data(cpu, &rcu_state);
1396        rcu_init_percpu_data(cpu, &rcu_bh_state);
1397        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1398}
1399
1400/*
 1401 * Handle CPU online/offline notification events.
1402 */
1403static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1404                                unsigned long action, void *hcpu)
1405{
1406        long cpu = (long)hcpu;
1407
1408        switch (action) {
1409        case CPU_UP_PREPARE:
1410        case CPU_UP_PREPARE_FROZEN:
1411                rcu_online_cpu(cpu);
1412                break;
1413        case CPU_DEAD:
1414        case CPU_DEAD_FROZEN:
1415        case CPU_UP_CANCELED:
1416        case CPU_UP_CANCELED_FROZEN:
1417                rcu_offline_cpu(cpu);
1418                break;
1419        default:
1420                break;
1421        }
1422        return NOTIFY_OK;
1423}
1424
1425/*
1426 * Compute the per-level fanout, either using the exact fanout specified
1427 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1428 */
1429#ifdef CONFIG_RCU_FANOUT_EXACT
1430static void __init rcu_init_levelspread(struct rcu_state *rsp)
1431{
1432        int i;
1433
1434        for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
1435                rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1436}
1437#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1438static void __init rcu_init_levelspread(struct rcu_state *rsp)
1439{
1440        int ccur;
1441        int cprv;
1442        int i;
1443
1444        cprv = NR_CPUS;
1445        for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1446                ccur = rsp->levelcnt[i];
1447                rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
1448                cprv = ccur;
1449        }
1450}
1451#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
1452
1453/*
1454 * Helper function for rcu_init() that initializes one rcu_state structure.
1455 */
1456static void __init rcu_init_one(struct rcu_state *rsp)
1457{
1458        int cpustride = 1;
1459        int i;
1460        int j;
1461        struct rcu_node *rnp;
1462
1463        /* Initialize the level-tracking arrays. */
1464
1465        for (i = 1; i < NUM_RCU_LVLS; i++)
1466                rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
1467        rcu_init_levelspread(rsp);
1468
1469        /* Initialize the elements themselves, starting from the leaves. */
1470
1471        for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1472                cpustride *= rsp->levelspread[i];
1473                rnp = rsp->level[i];
1474                for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1475                        spin_lock_init(&rnp->lock);
1476                        rnp->qsmask = 0;
1477                        rnp->qsmaskinit = 0;
1478                        rnp->grplo = j * cpustride;
1479                        rnp->grphi = (j + 1) * cpustride - 1;
1480                        if (rnp->grphi >= NR_CPUS)
1481                                rnp->grphi = NR_CPUS - 1;
1482                        if (i == 0) {
1483                                rnp->grpnum = 0;
1484                                rnp->grpmask = 0;
1485                                rnp->parent = NULL;
1486                        } else {
1487                                rnp->grpnum = j % rsp->levelspread[i - 1];
1488                                rnp->grpmask = 1UL << rnp->grpnum;
1489                                rnp->parent = rsp->level[i - 1] +
1490                                              j / rsp->levelspread[i - 1];
1491                        }
1492                        rnp->level = i;
1493                }
1494        }
1495}
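
/*
 * Hypothetical worked example: assume NR_CPUS == 128 and a fanout of 64,
 * and assume the header works out to levelcnt == { 1, 2 } for that
 * configuration (the header is not shown here).  The tree-balancing
 * variant of rcu_init_levelspread() then computes levelspread == { 2, 64 },
 * and the loops above lay the tree out as:
 *
 *	level 1 (leaves): node[1] covers CPUs 0-63, node[2] covers 64-127
 *	level 0 (root):   node[0] covers CPUs 0-127, with both leaves as
 *	                  children
 *
 * Each leaf's ->grpmask selects one bit in the root's qsmask, just as
 * each CPU's ->grpmask selects one bit in its leaf's qsmask.
 */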
1496
1497/*
1498 * Helper macro for __rcu_init().  To be used nowhere else!
1499 * Assigns leaf node pointers into each CPU's rcu_data structure.
1500 */
1501#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
1502do { \
1503        rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1504        j = 0; \
1505        for_each_possible_cpu(i) { \
1506                if (i > rnp[j].grphi) \
1507                        j++; \
1508                per_cpu(rcu_data, i).mynode = &rnp[j]; \
1509                (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1510        } \
1511} while (0)
1512
1513static struct notifier_block __cpuinitdata rcu_nb = {
1514        .notifier_call  = rcu_cpu_notify,
1515};
1516
1517void __init __rcu_init(void)
1518{
1519        int i;                  /* All used by RCU_DATA_PTR_INIT(). */
1520        int j;
1521        struct rcu_node *rnp;
1522
1523        printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
1524#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1525        printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1526#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1527        rcu_init_one(&rcu_state);
1528        RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
1529        rcu_init_one(&rcu_bh_state);
1530        RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
1531
1532        for_each_online_cpu(i)
1533                rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1534        /* Register notifier for non-boot CPUs */
1535        register_cpu_notifier(&rcu_nb);
1536        printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1537}
1538
1539module_param(blimit, int, 0);
1540module_param(qhimark, int, 0);
1541module_param(qlowmark, int, 0);
1542