linux/kernel/rcupreempt.c
   1/*
   2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
   3 *
   4 * This program is free software; you can redistribute it and/or modify
   5 * it under the terms of the GNU General Public License as published by
   6 * the Free Software Foundation; either version 2 of the License, or
   7 * (at your option) any later version.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17 *
  18 * Copyright IBM Corporation, 2006
  19 *
  20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
  21 *              With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
  22 *              for pushing me away from locks and towards counters, and
  23 *              to Suparna Bhattacharya for pushing me completely away
  24 *              from atomic instructions on the read side.
  25 *
  26 *  - Added handling of Dynamic Ticks
   27 *      Copyright 2007 - Paul E. McKenney <paulmck@us.ibm.com>
  28 *                     - Steven Rostedt <srostedt@redhat.com>
  29 *
  30 * Papers:  http://www.rdrop.com/users/paulmck/RCU
  31 *
  32 * Design Document: http://lwn.net/Articles/253651/
  33 *
  34 * For detailed explanation of Read-Copy Update mechanism see -
  35 *              Documentation/RCU/ *.txt
  36 *
  37 */
  38#include <linux/types.h>
  39#include <linux/kernel.h>
  40#include <linux/init.h>
  41#include <linux/spinlock.h>
  42#include <linux/smp.h>
  43#include <linux/rcupdate.h>
  44#include <linux/interrupt.h>
  45#include <linux/sched.h>
  46#include <asm/atomic.h>
  47#include <linux/bitops.h>
  48#include <linux/module.h>
  49#include <linux/kthread.h>
  50#include <linux/completion.h>
  51#include <linux/moduleparam.h>
  52#include <linux/percpu.h>
  53#include <linux/notifier.h>
  54#include <linux/cpu.h>
  55#include <linux/random.h>
  56#include <linux/delay.h>
  57#include <linux/byteorder/swabb.h>
  58#include <linux/cpumask.h>
  59#include <linux/rcupreempt_trace.h>
  60
  61/*
  62 * Macro that prevents the compiler from reordering accesses, but does
  63 * absolutely -nothing- to prevent CPUs from reordering.  This is used
  64 * only to mediate communication between mainline code and hardware
  65 * interrupt and NMI handlers.
  66 */
  67#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
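/*
 * A minimal usage sketch (the "done" flag below is hypothetical, shown
 * only to illustrate the intended use): force the compiler to reload a
 * variable that an interrupt or NMI handler might change, rather than
 * caching it in a register across iterations.
 *
 *	while (!ACCESS_ONCE(done))
 *		cpu_relax();
 */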
  68
  69/*
  70 * PREEMPT_RCU data structures.
  71 */
  72
  73/*
  74 * GP_STAGES specifies the number of times the state machine has
   75 * to go through all the rcu_try_flip_states (see below)
  76 * in a single Grace Period.
  77 *
  78 * GP in GP_STAGES stands for Grace Period ;)
  79 */
  80#define GP_STAGES    2
  81struct rcu_data {
  82        spinlock_t      lock;           /* Protect rcu_data fields. */
  83        long            completed;      /* Number of last completed batch. */
  84        int             waitlistcount;
  85        struct rcu_head *nextlist;
  86        struct rcu_head **nexttail;
  87        struct rcu_head *waitlist[GP_STAGES];
  88        struct rcu_head **waittail[GP_STAGES];
  89        struct rcu_head *donelist;      /* from waitlist & waitschedlist */
  90        struct rcu_head **donetail;
  91        long rcu_flipctr[2];
  92        struct rcu_head *nextschedlist;
  93        struct rcu_head **nextschedtail;
  94        struct rcu_head *waitschedlist;
  95        struct rcu_head **waitschedtail;
  96        int rcu_sched_sleeping;
  97#ifdef CONFIG_RCU_TRACE
  98        struct rcupreempt_trace trace;
  99#endif /* #ifdef CONFIG_RCU_TRACE */
 100};
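/*
 * Callbacks posted on a CPU flow through the lists above in order:
 * call_rcu() appends to ->nextlist, each counter flip observed by
 * __rcu_advance_callbacks() moves entries one stage along the
 * ->waitlist[] pipeline, and entries leaving the last stage land on
 * ->donelist, from which they are invoked in RCU_SOFTIRQ context.
 * The ->nextschedlist/->waitschedlist pair plays the analogous role
 * for call_rcu_sched(), advanced by the rcu_sched kthread.
 */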
 101
 102/*
 103 * States for rcu_try_flip() and friends.
 104 */
 105
 106enum rcu_try_flip_states {
 107
 108        /*
  109         * Stay here if nothing is happening. Flip the counter if something
  110         * starts happening. Denoted by "I".
 111         */
 112        rcu_try_flip_idle_state,
 113
 114        /*
 115         * Wait here for all CPUs to notice that the counter has flipped. This
 116         * prevents the old set of counters from ever being incremented once
 117         * we leave this state, which in turn is necessary because we cannot
 118         * test any individual counter for zero -- we can only check the sum.
 119         * Denoted by "A".
 120         */
 121        rcu_try_flip_waitack_state,
 122
 123        /*
 124         * Wait here for the sum of the old per-CPU counters to reach zero.
 125         * Denoted by "Z".
 126         */
 127        rcu_try_flip_waitzero_state,
 128
 129        /*
 130         * Wait here for each of the other CPUs to execute a memory barrier.
 131         * This is necessary to ensure that these other CPUs really have
 132         * completed executing their RCU read-side critical sections, despite
 133         * their CPUs wildly reordering memory. Denoted by "M".
 134         */
 135        rcu_try_flip_waitmb_state,
 136};
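/*
 * Putting the states above together, the grace-period state machine
 * cycles as follows:
 *
 *	idle (I) -> waitack (A) -> waitzero (Z) -> waitmb (M) -> idle ...
 *
 * and, with GP_STAGES == 2, a callback must wait through two such
 * trips (two counter flips) before its grace period is complete.
 */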
 137
 138/*
 139 * States for rcu_ctrlblk.rcu_sched_sleep.
 140 */
 141
 142enum rcu_sched_sleep_states {
 143        rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP.  */
 144        rcu_sched_sleep_prep,   /* Thinking of sleeping, rechecking. */
 145        rcu_sched_sleeping,     /* Sleeping, awaken if GP needed. */
 146};
 147
 148struct rcu_ctrlblk {
 149        spinlock_t      fliplock;       /* Protect state-machine transitions. */
 150        long            completed;      /* Number of last completed batch. */
 151        enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
 152                                                        the rcu state machine */
 153        spinlock_t      schedlock;      /* Protect rcu_sched sleep state. */
 154        enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
 155        wait_queue_head_t sched_wq;     /* Place for rcu_sched to sleep. */
 156};
 157
 158static DEFINE_PER_CPU(struct rcu_data, rcu_data);
 159static struct rcu_ctrlblk rcu_ctrlblk = {
 160        .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
 161        .completed = 0,
 162        .rcu_try_flip_state = rcu_try_flip_idle_state,
 163        .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
 164        .sched_sleep = rcu_sched_not_sleeping,
 165        .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
 166};
 167
 168static struct task_struct *rcu_sched_grace_period_task;
 169
 170#ifdef CONFIG_RCU_TRACE
 171static char *rcu_try_flip_state_names[] =
 172        { "idle", "waitack", "waitzero", "waitmb" };
 173#endif /* #ifdef CONFIG_RCU_TRACE */
 174
 175static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
 176
 177/*
 178 * Enum and per-CPU flag to determine when each CPU has seen
 179 * the most recent counter flip.
 180 */
 181
 182enum rcu_flip_flag_values {
 183        rcu_flip_seen,          /* Steady/initial state, last flip seen. */
 184                                /* Only GP detector can update. */
 185        rcu_flipped             /* Flip just completed, need confirmation. */
 186                                /* Only corresponding CPU can update. */
 187};
 188static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
 189                                                                = rcu_flip_seen;
 190
 191/*
 192 * Enum and per-CPU flag to determine when each CPU has executed the
 193 * needed memory barrier to fence in memory references from its last RCU
 194 * read-side critical section in the just-completed grace period.
 195 */
 196
 197enum rcu_mb_flag_values {
 198        rcu_mb_done,            /* Steady/initial state, no mb()s required. */
 199                                /* Only GP detector can update. */
 200        rcu_mb_needed           /* Flip just completed, need an mb(). */
 201                                /* Only corresponding CPU can update. */
 202};
 203static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
 204                                                                = rcu_mb_done;
 205
 206/*
 207 * RCU_DATA_ME: find the current CPU's rcu_data structure.
 208 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
 209 */
 210#define RCU_DATA_ME()           (&__get_cpu_var(rcu_data))
 211#define RCU_DATA_CPU(cpu)       (&per_cpu(rcu_data, cpu))
 212
 213/*
 214 * Helper macro for tracing when the appropriate rcu_data is not
 215 * cached in a local variable, but where the CPU number is so cached.
 216 */
 217#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
 218
 219/*
 220 * Helper macro for tracing when the appropriate rcu_data is not
 221 * cached in a local variable.
 222 */
 223#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
 224
 225/*
 226 * Helper macro for tracing when the appropriate rcu_data is pointed
 227 * to by a local variable.
 228 */
 229#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
 230
 231#define RCU_SCHED_BATCH_TIME (HZ / 50)
 232
 233/*
 234 * Return the number of RCU batches processed thus far.  Useful
 235 * for debug and statistics.
 236 */
 237long rcu_batches_completed(void)
 238{
 239        return rcu_ctrlblk.completed;
 240}
 241EXPORT_SYMBOL_GPL(rcu_batches_completed);
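/*
 * A minimal sketch of the debug/statistics use mentioned above (the
 * do_some_work() call is hypothetical): snapshot the count, do some
 * work, then compare to see whether a batch completed in between.
 *
 *	long snap = rcu_batches_completed();
 *
 *	do_some_work();
 *	if (rcu_batches_completed() != snap)
 *		printk(KERN_DEBUG "at least one RCU batch completed\n");
 */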
 242
 243void __rcu_read_lock(void)
 244{
 245        int idx;
 246        struct task_struct *t = current;
 247        int nesting;
 248
 249        nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
 250        if (nesting != 0) {
 251
 252                /* An earlier rcu_read_lock() covers us, just count it. */
 253
 254                t->rcu_read_lock_nesting = nesting + 1;
 255
 256        } else {
 257                unsigned long flags;
 258
 259                /*
 260                 * We disable interrupts for the following reasons:
 261                 * - If we get scheduling clock interrupt here, and we
 262                 *   end up acking the counter flip, it's like a promise
 263                 *   that we will never increment the old counter again.
 264                 *   Thus we will break that promise if that
 265                 *   scheduling clock interrupt happens between the time
 266                 *   we pick the .completed field and the time that we
 267                 *   increment our counter.
 268                 *
 269                 * - We don't want to be preempted out here.
 270                 *
 271                 * NMIs can still occur, of course, and might themselves
 272                 * contain rcu_read_lock().
 273                 */
 274
 275                local_irq_save(flags);
 276
 277                /*
 278                 * Outermost nesting of rcu_read_lock(), so increment
 279                 * the current counter for the current CPU.  Use volatile
 280                 * casts to prevent the compiler from reordering.
 281                 */
 282
 283                idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
 284                ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
 285
 286                /*
 287                 * Now that the per-CPU counter has been incremented, we
 288                 * are protected from races with rcu_read_lock() invoked
 289                 * from NMI handlers on this CPU.  We can therefore safely
 290                 * increment the nesting counter, relieving further NMIs
 291                 * of the need to increment the per-CPU counter.
 292                 */
 293
 294                ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
 295
 296                /*
  297                 * Now that we have prevented any NMIs from storing
  298                 * to ->rcu_flipctr_idx, we can safely use it to
 299                 * remember which counter to decrement in the matching
 300                 * rcu_read_unlock().
 301                 */
 302
 303                ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
 304                local_irq_restore(flags);
 305        }
 306}
 307EXPORT_SYMBOL_GPL(__rcu_read_lock);
 308
 309void __rcu_read_unlock(void)
 310{
 311        int idx;
 312        struct task_struct *t = current;
 313        int nesting;
 314
 315        nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
 316        if (nesting > 1) {
 317
 318                /*
 319                 * We are still protected by the enclosing rcu_read_lock(),
 320                 * so simply decrement the counter.
 321                 */
 322
 323                t->rcu_read_lock_nesting = nesting - 1;
 324
 325        } else {
 326                unsigned long flags;
 327
 328                /*
 329                 * Disable local interrupts to prevent the grace-period
 330                 * detection state machine from seeing us half-done.
 331                 * NMIs can still occur, of course, and might themselves
 332                 * contain rcu_read_lock() and rcu_read_unlock().
 333                 */
 334
 335                local_irq_save(flags);
 336
 337                /*
 338                 * Outermost nesting of rcu_read_unlock(), so we must
 339                 * decrement the current counter for the current CPU.
 340                 * This must be done carefully, because NMIs can
 341                 * occur at any point in this code, and any rcu_read_lock()
 342                 * and rcu_read_unlock() pairs in the NMI handlers
 343                 * must interact non-destructively with this code.
 344                 * Lots of volatile casts, and -very- careful ordering.
 345                 *
 346                 * Changes to this code, including this one, must be
 347                 * inspected, validated, and tested extremely carefully!!!
 348                 */
 349
 350                /*
 351                 * First, pick up the index.
 352                 */
 353
 354                idx = ACCESS_ONCE(t->rcu_flipctr_idx);
 355
 356                /*
 357                 * Now that we have fetched the counter index, it is
 358                 * safe to decrement the per-task RCU nesting counter.
 359                 * After this, any interrupts or NMIs will increment and
 360                 * decrement the per-CPU counters.
 361                 */
 362                ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
 363
  364                /*
  365                 * Now that the nesting count has been decremented, NMIs that
  366                 * occur after this statement will route their rcu_read_lock()
  367                 * calls through the outermost path of __rcu_read_lock(), and
  368                 * will thus start incrementing the per-CPU counter on their
  369                 * own.  They will also clobber ->rcu_flipctr_idx, but that
  370                 * is OK, since we have already fetched it.
  371                 */
 372
 373                ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
 374                local_irq_restore(flags);
 375        }
 376}
 377EXPORT_SYMBOL_GPL(__rcu_read_unlock);
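/*
 * A minimal read-side usage sketch (struct foo, gbl_foo, and
 * do_something_with() are hypothetical): callers normally reach the
 * two primitives above through rcu_read_lock() and rcu_read_unlock().
 *
 *	struct foo *p;
 *
 *	rcu_read_lock();
 *	p = rcu_dereference(gbl_foo);
 *	if (p != NULL)
 *		do_something_with(p->a);
 *	rcu_read_unlock();
 */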
 378
 379/*
 380 * If a global counter flip has occurred since the last time that we
 381 * advanced callbacks, advance them.  Hardware interrupts must be
 382 * disabled when calling this function.
 383 */
 384static void __rcu_advance_callbacks(struct rcu_data *rdp)
 385{
 386        int cpu;
 387        int i;
 388        int wlc = 0;
 389
 390        if (rdp->completed != rcu_ctrlblk.completed) {
 391                if (rdp->waitlist[GP_STAGES - 1] != NULL) {
 392                        *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
 393                        rdp->donetail = rdp->waittail[GP_STAGES - 1];
 394                        RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
 395                }
 396                for (i = GP_STAGES - 2; i >= 0; i--) {
 397                        if (rdp->waitlist[i] != NULL) {
 398                                rdp->waitlist[i + 1] = rdp->waitlist[i];
 399                                rdp->waittail[i + 1] = rdp->waittail[i];
 400                                wlc++;
 401                        } else {
 402                                rdp->waitlist[i + 1] = NULL;
 403                                rdp->waittail[i + 1] =
 404                                        &rdp->waitlist[i + 1];
 405                        }
 406                }
 407                if (rdp->nextlist != NULL) {
 408                        rdp->waitlist[0] = rdp->nextlist;
 409                        rdp->waittail[0] = rdp->nexttail;
 410                        wlc++;
 411                        rdp->nextlist = NULL;
 412                        rdp->nexttail = &rdp->nextlist;
 413                        RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
 414                } else {
 415                        rdp->waitlist[0] = NULL;
 416                        rdp->waittail[0] = &rdp->waitlist[0];
 417                }
 418                rdp->waitlistcount = wlc;
 419                rdp->completed = rcu_ctrlblk.completed;
 420        }
 421
 422        /*
 423         * Check to see if this CPU needs to report that it has seen
 424         * the most recent counter flip, thereby declaring that all
 425         * subsequent rcu_read_lock() invocations will respect this flip.
 426         */
 427
 428        cpu = raw_smp_processor_id();
 429        if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
 430                smp_mb();  /* Subsequent counter accesses must see new value */
 431                per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
 432                smp_mb();  /* Subsequent RCU read-side critical sections */
 433                           /*  seen -after- acknowledgement. */
 434        }
 435}
 436
 437DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
 438        .dynticks = 1,
 439};
 440
 441#ifdef CONFIG_NO_HZ
 442static DEFINE_PER_CPU(int, rcu_update_flag);
 443
 444/**
 445 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
 446 *
 447 * If the CPU was idle with dynamic ticks active, this updates the
 448 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
 449 * CPU is active.
 450 */
 451void rcu_irq_enter(void)
 452{
 453        int cpu = smp_processor_id();
 454        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 455
 456        if (per_cpu(rcu_update_flag, cpu))
 457                per_cpu(rcu_update_flag, cpu)++;
 458
 459        /*
 460         * Only update if we are coming from a stopped ticks mode
 461         * (rcu_dyntick_sched.dynticks is even).
 462         */
 463        if (!in_interrupt() &&
 464            (rdssp->dynticks & 0x1) == 0) {
 465                /*
 466                 * The following might seem like we could have a race
 467                 * with NMI/SMIs. But this really isn't a problem.
 468                 * Here we do a read/modify/write, and the race happens
 469                 * when an NMI/SMI comes in after the read and before
 470                 * the write. But NMI/SMIs will increment this counter
 471                 * twice before returning, so the zero bit will not
 472                 * be corrupted by the NMI/SMI which is the most important
 473                 * part.
 474                 *
  475                 * The worst that can happen is that we bring the counter
  476                 * back to the position it was in during the NMI/SMI.
  477                 * But the zero bit would be set, so the rest of the
  478                 * counter would again be ignored.
  479                 *
  480                 * On return from the IRQ, the zero bit may be 0 and the
  481                 * counter may hold the same value as it did on return from
  482                 * the NMI/SMI.  If the state machine were unlucky enough to
  483                 * see that, it still doesn't matter, since all
 484                 * RCU read-side critical sections on this CPU would
 485                 * have already completed.
 486                 */
 487                rdssp->dynticks++;
 488                /*
 489                 * The following memory barrier ensures that any
 490                 * rcu_read_lock() primitives in the irq handler
 491                 * are seen by other CPUs to follow the above
 492                 * increment to rcu_dyntick_sched.dynticks. This is
 493                 * required in order for other CPUs to correctly
 494                 * determine when it is safe to advance the RCU
 495                 * grace-period state machine.
 496                 */
 497                smp_mb(); /* see above block comment. */
 498                /*
 499                 * Since we can't determine the dynamic tick mode from
 500                 * the rcu_dyntick_sched.dynticks after this routine,
 501                 * we use a second flag to acknowledge that we came
 502                 * from an idle state with ticks stopped.
 503                 */
 504                per_cpu(rcu_update_flag, cpu)++;
 505                /*
 506                 * If we take an NMI/SMI now, they will also increment
 507                 * the rcu_update_flag, and will not update the
 508                 * rcu_dyntick_sched.dynticks on exit. That is for
 509                 * this IRQ to do.
 510                 */
 511        }
 512}
 513
 514/**
 515 * rcu_irq_exit - Called from exiting Hard irq context.
 516 *
  517 * If the CPU was idle with dynamic ticks active, update
  518 * rcu_dyntick_sched.dynticks to let the RCU handling know
  519 * that the CPU is going back to idle with no ticks.
 520 */
 521void rcu_irq_exit(void)
 522{
 523        int cpu = smp_processor_id();
 524        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 525
 526        /*
 527         * rcu_update_flag is set if we interrupted the CPU
 528         * when it was idle with ticks stopped.
 529         * Once this occurs, we keep track of interrupt nesting
  530         * because an NMI/SMI could also come in, and we still
 531         * only want the IRQ that started the increment of the
 532         * rcu_dyntick_sched.dynticks to be the one that modifies
 533         * it on exit.
 534         */
 535        if (per_cpu(rcu_update_flag, cpu)) {
 536                if (--per_cpu(rcu_update_flag, cpu))
 537                        return;
 538
 539                /* This must match the interrupt nesting */
 540                WARN_ON(in_interrupt());
 541
 542                /*
 543                 * If an NMI/SMI happens now we are still
 544                 * protected by the rcu_dyntick_sched.dynticks being odd.
 545                 */
 546
 547                /*
 548                 * The following memory barrier ensures that any
 549                 * rcu_read_unlock() primitives in the irq handler
  550                 * are seen by other CPUs to precede the following
 551                 * increment to rcu_dyntick_sched.dynticks. This
 552                 * is required in order for other CPUs to determine
 553                 * when it is safe to advance the RCU grace-period
 554                 * state machine.
 555                 */
 556                smp_mb(); /* see above block comment. */
 557                rdssp->dynticks++;
 558                WARN_ON(rdssp->dynticks & 0x1);
 559        }
 560}
 561
 562static void dyntick_save_progress_counter(int cpu)
 563{
 564        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 565
 566        rdssp->dynticks_snap = rdssp->dynticks;
 567}
 568
 569static inline int
 570rcu_try_flip_waitack_needed(int cpu)
 571{
 572        long curr;
 573        long snap;
 574        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 575
 576        curr = rdssp->dynticks;
 577        snap = rdssp->dynticks_snap;
 578        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 579
 580        /*
 581         * If the CPU remained in dynticks mode for the entire time
 582         * and didn't take any interrupts, NMIs, SMIs, or whatever,
 583         * then it cannot be in the middle of an rcu_read_lock(), so
 584         * the next rcu_read_lock() it executes must use the new value
 585         * of the counter.  So we can safely pretend that this CPU
 586         * already acknowledged the counter.
 587         */
 588
 589        if ((curr == snap) && ((curr & 0x1) == 0))
 590                return 0;
 591
 592        /*
 593         * If the CPU passed through or entered a dynticks idle phase with
 594         * no active irq handlers, then, as above, we can safely pretend
 595         * that this CPU already acknowledged the counter.
 596         */
 597
 598        if ((curr - snap) > 2 || (curr & 0x1) == 0)
 599                return 0;
 600
 601        /* We need this CPU to explicitly acknowledge the counter flip. */
 602
 603        return 1;
 604}
 605
 606static inline int
 607rcu_try_flip_waitmb_needed(int cpu)
 608{
 609        long curr;
 610        long snap;
 611        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 612
 613        curr = rdssp->dynticks;
 614        snap = rdssp->dynticks_snap;
 615        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 616
 617        /*
 618         * If the CPU remained in dynticks mode for the entire time
 619         * and didn't take any interrupts, NMIs, SMIs, or whatever,
 620         * then it cannot have executed an RCU read-side critical section
 621         * during that time, so there is no need for it to execute a
 622         * memory barrier.
 623         */
 624
 625        if ((curr == snap) && ((curr & 0x1) == 0))
 626                return 0;
 627
 628        /*
 629         * If the CPU either entered or exited an outermost interrupt,
 630         * SMI, NMI, or whatever handler, then we know that it executed
 631         * a memory barrier when doing so.  So we don't need another one.
 632         */
 633        if (curr != snap)
 634                return 0;
 635
 636        /* We need the CPU to execute a memory barrier. */
 637
 638        return 1;
 639}
 640
 641static void dyntick_save_progress_counter_sched(int cpu)
 642{
 643        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 644
 645        rdssp->sched_dynticks_snap = rdssp->dynticks;
 646}
 647
 648static int rcu_qsctr_inc_needed_dyntick(int cpu)
 649{
 650        long curr;
 651        long snap;
 652        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 653
 654        curr = rdssp->dynticks;
 655        snap = rdssp->sched_dynticks_snap;
 656        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 657
 658        /*
 659         * If the CPU remained in dynticks mode for the entire time
 660         * and didn't take any interrupts, NMIs, SMIs, or whatever,
 661         * then it cannot be in the middle of an rcu_read_lock(), so
 662         * the next rcu_read_lock() it executes must use the new value
 663         * of the counter.  Therefore, this CPU has been in a quiescent
 664         * state the entire time, and we don't need to wait for it.
 665         */
 666
 667        if ((curr == snap) && ((curr & 0x1) == 0))
 668                return 0;
 669
 670        /*
 671         * If the CPU passed through or entered a dynticks idle phase with
 672         * no active irq handlers, then, as above, this CPU has already
 673         * passed through a quiescent state.
 674         */
 675
 676        if ((curr - snap) > 2 || (snap & 0x1) == 0)
 677                return 0;
 678
 679        /* We need this CPU to go through a quiescent state. */
 680
 681        return 1;
 682}
 683
 684#else /* !CONFIG_NO_HZ */
 685
 686# define dyntick_save_progress_counter(cpu)             do { } while (0)
 687# define rcu_try_flip_waitack_needed(cpu)               (1)
 688# define rcu_try_flip_waitmb_needed(cpu)                (1)
 689
 690# define dyntick_save_progress_counter_sched(cpu)       do { } while (0)
 691# define rcu_qsctr_inc_needed_dyntick(cpu)              (1)
 692
 693#endif /* CONFIG_NO_HZ */
 694
 695static void save_qsctr_sched(int cpu)
 696{
 697        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 698
 699        rdssp->sched_qs_snap = rdssp->sched_qs;
 700}
 701
 702static inline int rcu_qsctr_inc_needed(int cpu)
 703{
 704        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 705
 706        /*
 707         * If there has been a quiescent state, no more need to wait
 708         * on this CPU.
 709         */
 710
 711        if (rdssp->sched_qs != rdssp->sched_qs_snap) {
 712                smp_mb(); /* force ordering with cpu entering schedule(). */
 713                return 0;
 714        }
 715
 716        /* We need this CPU to go through a quiescent state. */
 717
 718        return 1;
 719}
 720
 721/*
 722 * Get here when RCU is idle.  Decide whether we need to
 723 * move out of idle state, and return non-zero if so.
 724 * "Straightforward" approach for the moment, might later
 725 * use callback-list lengths, grace-period duration, or
 726 * some such to determine when to exit idle state.
 727 * Might also need a pre-idle test that does not acquire
 728 * the lock, but let's get the simple case working first...
 729 */
 730
 731static int
 732rcu_try_flip_idle(void)
 733{
 734        int cpu;
 735
 736        RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
 737        if (!rcu_pending(smp_processor_id())) {
 738                RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
 739                return 0;
 740        }
 741
 742        /*
 743         * Do the flip.
 744         */
 745
 746        RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
 747        rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */
 748
 749        /*
 750         * Need a memory barrier so that other CPUs see the new
 751         * counter value before they see the subsequent change of all
 752         * the rcu_flip_flag instances to rcu_flipped.
 753         */
 754
 755        smp_mb();       /* see above block comment. */
 756
 757        /* Now ask each CPU for acknowledgement of the flip. */
 758
 759        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
 760                per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 761                dyntick_save_progress_counter(cpu);
 762        }
 763
 764        return 1;
 765}
 766
 767/*
 768 * Wait for CPUs to acknowledge the flip.
 769 */
 770
 771static int
 772rcu_try_flip_waitack(void)
 773{
 774        int cpu;
 775
 776        RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
 777        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 778                if (rcu_try_flip_waitack_needed(cpu) &&
 779                    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 780                        RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
 781                        return 0;
 782                }
 783
 784        /*
 785         * Make sure our checks above don't bleed into subsequent
 786         * waiting for the sum of the counters to reach zero.
 787         */
 788
 789        smp_mb();       /* see above block comment. */
 790        RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
 791        return 1;
 792}
 793
 794/*
 795 * Wait for collective ``last'' counter to reach zero,
 796 * then tell all CPUs to do an end-of-grace-period memory barrier.
 797 */
 798
 799static int
 800rcu_try_flip_waitzero(void)
 801{
 802        int cpu;
 803        int lastidx = !(rcu_ctrlblk.completed & 0x1);
 804        int sum = 0;
 805
 806        /* Check to see if the sum of the "last" counters is zero. */
 807
 808        RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
 809        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 810                sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 811        if (sum != 0) {
 812                RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
 813                return 0;
 814        }
 815
 816        /*
 817         * This ensures that the other CPUs see the call for
  818         * memory barriers -after- the sum of the counters has been
  819         * observed to reach zero here.
 820         */
 821        smp_mb();  /*  ^^^^^^^^^^^^ */
 822
 823        /* Call for a memory barrier from each CPU. */
 824        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
 825                per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 826                dyntick_save_progress_counter(cpu);
 827        }
 828
 829        RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
 830        return 1;
 831}
 832
 833/*
 834 * Wait for all CPUs to do their end-of-grace-period memory barrier.
  835 * Return 1 once all CPUs have done so.
 836 */
 837
 838static int
 839rcu_try_flip_waitmb(void)
 840{
 841        int cpu;
 842
 843        RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
 844        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 845                if (rcu_try_flip_waitmb_needed(cpu) &&
 846                    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 847                        RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
 848                        return 0;
 849                }
 850
 851        smp_mb(); /* Ensure that the above checks precede any following flip. */
 852        RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
 853        return 1;
 854}
 855
 856/*
 857 * Attempt a single flip of the counters.  Remember, a single flip does
 858 * -not- constitute a grace period.  Instead, the interval between
 859 * at least GP_STAGES consecutive flips is a grace period.
 860 *
 861 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
 862 * on a large SMP, they might want to use a hierarchical organization of
 863 * the per-CPU-counter pairs.
 864 */
 865static void rcu_try_flip(void)
 866{
 867        unsigned long flags;
 868
 869        RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
 870        if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
 871                RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
 872                return;
 873        }
 874
 875        /*
 876         * Take the next transition(s) through the RCU grace-period
 877         * flip-counter state machine.
 878         */
 879
 880        switch (rcu_ctrlblk.rcu_try_flip_state) {
 881        case rcu_try_flip_idle_state:
 882                if (rcu_try_flip_idle())
 883                        rcu_ctrlblk.rcu_try_flip_state =
 884                                rcu_try_flip_waitack_state;
 885                break;
 886        case rcu_try_flip_waitack_state:
 887                if (rcu_try_flip_waitack())
 888                        rcu_ctrlblk.rcu_try_flip_state =
 889                                rcu_try_flip_waitzero_state;
 890                break;
 891        case rcu_try_flip_waitzero_state:
 892                if (rcu_try_flip_waitzero())
 893                        rcu_ctrlblk.rcu_try_flip_state =
 894                                rcu_try_flip_waitmb_state;
 895                break;
 896        case rcu_try_flip_waitmb_state:
 897                if (rcu_try_flip_waitmb())
 898                        rcu_ctrlblk.rcu_try_flip_state =
 899                                rcu_try_flip_idle_state;
 900        }
 901        spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 902}
 903
 904/*
 905 * Check to see if this CPU needs to do a memory barrier in order to
 906 * ensure that any prior RCU read-side critical sections have committed
 907 * their counter manipulations and critical-section memory references
 908 * before declaring the grace period to be completed.
 909 */
 910static void rcu_check_mb(int cpu)
 911{
 912        if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
 913                smp_mb();  /* Ensure RCU read-side accesses are visible. */
 914                per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
 915        }
 916}
 917
 918void rcu_check_callbacks(int cpu, int user)
 919{
 920        unsigned long flags;
 921        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 922
 923        /*
 924         * If this CPU took its interrupt from user mode or from the
 925         * idle loop, and this is not a nested interrupt, then
  926         * this CPU has to have exited all prior preempt-disable
 927         * sections of code.  So increment the counter to note this.
 928         *
 929         * The memory barrier is needed to handle the case where
 930         * writes from a preempt-disable section of code get reordered
 931         * into schedule() by this CPU's write buffer.  So the memory
 932         * barrier makes sure that the rcu_qsctr_inc() is seen by other
 933         * CPUs to happen after any such write.
 934         */
 935
 936        if (user ||
 937            (idle_cpu(cpu) && !in_softirq() &&
 938             hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 939                smp_mb();       /* Guard against aggressive schedule(). */
 940                rcu_qsctr_inc(cpu);
 941        }
 942
 943        rcu_check_mb(cpu);
 944        if (rcu_ctrlblk.completed == rdp->completed)
 945                rcu_try_flip();
 946        spin_lock_irqsave(&rdp->lock, flags);
 947        RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
 948        __rcu_advance_callbacks(rdp);
 949        if (rdp->donelist == NULL) {
 950                spin_unlock_irqrestore(&rdp->lock, flags);
 951        } else {
 952                spin_unlock_irqrestore(&rdp->lock, flags);
 953                raise_softirq(RCU_SOFTIRQ);
 954        }
 955}
 956
 957/*
 958 * Needed by dynticks, to make sure all RCU processing has finished
 959 * when we go idle:
 960 */
 961void rcu_advance_callbacks(int cpu, int user)
 962{
 963        unsigned long flags;
 964        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 965
 966        if (rcu_ctrlblk.completed == rdp->completed) {
 967                rcu_try_flip();
 968                if (rcu_ctrlblk.completed == rdp->completed)
 969                        return;
 970        }
 971        spin_lock_irqsave(&rdp->lock, flags);
 972        RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
 973        __rcu_advance_callbacks(rdp);
 974        spin_unlock_irqrestore(&rdp->lock, flags);
 975}
 976
 977#ifdef CONFIG_HOTPLUG_CPU
 978#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
 979                *dsttail = srclist; \
 980                if (srclist != NULL) { \
 981                        dsttail = srctail; \
 982                        srclist = NULL; \
 983                        srctail = &srclist;\
 984                } \
 985        } while (0)
 986
 987void rcu_offline_cpu(int cpu)
 988{
 989        int i;
 990        struct rcu_head *list = NULL;
 991        unsigned long flags;
 992        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 993        struct rcu_head *schedlist = NULL;
 994        struct rcu_head **schedtail = &schedlist;
 995        struct rcu_head **tail = &list;
 996
 997        /*
 998         * Remove all callbacks from the newly dead CPU, retaining order.
  999         * Otherwise rcu_barrier() will fail.
1000         */
1001
1002        spin_lock_irqsave(&rdp->lock, flags);
1003        rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1004        for (i = GP_STAGES - 1; i >= 0; i--)
1005                rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1006                                                list, tail);
1007        rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1008        rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1009                                schedlist, schedtail);
1010        rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1011                                schedlist, schedtail);
1012        rdp->rcu_sched_sleeping = 0;
1013        spin_unlock_irqrestore(&rdp->lock, flags);
1014        rdp->waitlistcount = 0;
1015
1016        /* Disengage the newly dead CPU from the grace-period computation. */
1017
1018        spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1019        rcu_check_mb(cpu);
1020        if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1021                smp_mb();  /* Subsequent counter accesses must see new value */
1022                per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1023                smp_mb();  /* Subsequent RCU read-side critical sections */
1024                           /*  seen -after- acknowledgement. */
1025        }
1026
1027        RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1028        RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1029
1030        RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1031        RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1032
1033        cpu_clear(cpu, rcu_cpu_online_map);
1034
1035        spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1036
1037        /*
1038         * Place the removed callbacks on the current CPU's queue.
1039         * Make them all start a new grace period: simple approach,
1040         * in theory could starve a given set of callbacks, but
1041         * you would need to be doing some serious CPU hotplugging
1042         * to make this happen.  If this becomes a problem, adding
1043         * a synchronize_rcu() to the hotplug path would be a simple
1044         * fix.
1045         */
1046
1047        local_irq_save(flags);  /* disable preempt till we know what lock. */
1048        rdp = RCU_DATA_ME();
1049        spin_lock(&rdp->lock);
1050        *rdp->nexttail = list;
1051        if (list)
1052                rdp->nexttail = tail;
1053        *rdp->nextschedtail = schedlist;
1054        if (schedlist)
1055                rdp->nextschedtail = schedtail;
1056        spin_unlock_irqrestore(&rdp->lock, flags);
1057}
1058
1059#else /* #ifdef CONFIG_HOTPLUG_CPU */
1060
1061void rcu_offline_cpu(int cpu)
1062{
1063}
1064
1065#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1066
1067void __cpuinit rcu_online_cpu(int cpu)
1068{
1069        unsigned long flags;
1070        struct rcu_data *rdp;
1071
1072        spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1073        cpu_set(cpu, rcu_cpu_online_map);
1074        spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1075
1076        /*
1077         * The rcu_sched grace-period processing might have bypassed
1078         * this CPU, given that it was not in the rcu_cpu_online_map
1079         * when the grace-period scan started.  This means that the
1080         * grace-period task might sleep.  So make sure that if this
1081         * should happen, the first callback posted to this CPU will
1082         * wake up the grace-period task if need be.
1083         */
1084
1085        rdp = RCU_DATA_CPU(cpu);
1086        spin_lock_irqsave(&rdp->lock, flags);
1087        rdp->rcu_sched_sleeping = 1;
1088        spin_unlock_irqrestore(&rdp->lock, flags);
1089}
1090
1091static void rcu_process_callbacks(struct softirq_action *unused)
1092{
1093        unsigned long flags;
1094        struct rcu_head *next, *list;
1095        struct rcu_data *rdp;
1096
1097        local_irq_save(flags);
1098        rdp = RCU_DATA_ME();
1099        spin_lock(&rdp->lock);
1100        list = rdp->donelist;
1101        if (list == NULL) {
1102                spin_unlock_irqrestore(&rdp->lock, flags);
1103                return;
1104        }
1105        rdp->donelist = NULL;
1106        rdp->donetail = &rdp->donelist;
1107        RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1108        spin_unlock_irqrestore(&rdp->lock, flags);
1109        while (list) {
1110                next = list->next;
1111                list->func(list);
1112                list = next;
1113                RCU_TRACE_ME(rcupreempt_trace_invoke);
1114        }
1115}
1116
1117void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1118{
1119        unsigned long flags;
1120        struct rcu_data *rdp;
1121
1122        head->func = func;
1123        head->next = NULL;
1124        local_irq_save(flags);
1125        rdp = RCU_DATA_ME();
1126        spin_lock(&rdp->lock);
1127        __rcu_advance_callbacks(rdp);
1128        *rdp->nexttail = head;
1129        rdp->nexttail = &head->next;
1130        RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1131        spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133EXPORT_SYMBOL_GPL(call_rcu);
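/*
 * A minimal update-side usage sketch (struct foo, gbl_foo, and
 * foo_reclaim() are hypothetical): the callback is not invoked until
 * a full grace period has elapsed, so any reader still referencing
 * the old element has finished by the time it is freed.
 *
 *	static void foo_reclaim(struct rcu_head *rp)
 *	{
 *		struct foo *fp = container_of(rp, struct foo, rcu);
 *
 *		kfree(fp);
 *	}
 *
 *	old = gbl_foo;
 *	rcu_assign_pointer(gbl_foo, new);
 *	call_rcu(&old->rcu, foo_reclaim);
 */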
1134
1135void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1136{
1137        unsigned long flags;
1138        struct rcu_data *rdp;
1139        int wake_gp = 0;
1140
1141        head->func = func;
1142        head->next = NULL;
1143        local_irq_save(flags);
1144        rdp = RCU_DATA_ME();
1145        spin_lock(&rdp->lock);
1146        *rdp->nextschedtail = head;
1147        rdp->nextschedtail = &head->next;
1148        if (rdp->rcu_sched_sleeping) {
1149
1150                /* Grace-period processing might be sleeping... */
1151
1152                rdp->rcu_sched_sleeping = 0;
1153                wake_gp = 1;
1154        }
1155        spin_unlock_irqrestore(&rdp->lock, flags);
1156        if (wake_gp) {
1157
1158                /* Wake up grace-period processing, unless someone beat us. */
1159
1160                spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1161                if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1162                        wake_gp = 0;
1163                rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1164                spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1165                if (wake_gp)
1166                        wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1167        }
1168}
1169EXPORT_SYMBOL_GPL(call_rcu_sched);
1170
1171/*
1172 * Wait until all currently running preempt_disable() code segments
1173 * (including hardware-irq-disable segments) complete.  Note that
1174 * in -rt this does -not- necessarily result in all currently executing
1175 * interrupt -handlers- having completed.
1176 */
1177synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
1178EXPORT_SYMBOL_GPL(__synchronize_sched);
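/*
 * A minimal usage sketch (the list membership and struct foo are
 * hypothetical): unlink an element that readers traverse under
 * preempt_disable(), wait for all such readers to finish, and only
 * then free the element.
 *
 *	list_del_rcu(&p->list);
 *	__synchronize_sched();
 *	kfree(p);
 */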
1179
1180/*
1181 * kthread function that manages call_rcu_sched grace periods.
1182 */
1183static int rcu_sched_grace_period(void *arg)
1184{
1185        int couldsleep;         /* might sleep after current pass. */
1186        int couldsleepnext = 0; /* might sleep after next pass. */
1187        int cpu;
1188        unsigned long flags;
1189        struct rcu_data *rdp;
1190        int ret;
1191
1192        /*
1193         * Each pass through the following loop handles one
1194         * rcu_sched grace period cycle.
1195         */
1196        do {
1197                /* Save each CPU's current state. */
1198
1199                for_each_online_cpu(cpu) {
1200                        dyntick_save_progress_counter_sched(cpu);
1201                        save_qsctr_sched(cpu);
1202                }
1203
1204                /*
1205                 * Sleep for about an RCU grace-period's worth to
1206                 * allow better batching and to consume less CPU.
1207                 */
1208                schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1209
1210                /*
1211                 * If there was nothing to do last time, prepare to
1212                 * sleep at the end of the current grace period cycle.
1213                 */
1214                couldsleep = couldsleepnext;
1215                couldsleepnext = 1;
1216                if (couldsleep) {
1217                        spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1218                        rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1219                        spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1220                }
1221
1222                /*
1223                 * Wait on each CPU in turn to have either visited
1224                 * a quiescent state or been in dynticks-idle mode.
1225                 */
1226                for_each_online_cpu(cpu) {
1227                        while (rcu_qsctr_inc_needed(cpu) &&
1228                               rcu_qsctr_inc_needed_dyntick(cpu)) {
1229                                /* resched_cpu(cpu); @@@ */
1230                                schedule_timeout_interruptible(1);
1231                        }
1232                }
1233
1234                /* Advance callbacks for each CPU.  */
1235
1236                for_each_online_cpu(cpu) {
1237
1238                        rdp = RCU_DATA_CPU(cpu);
1239                        spin_lock_irqsave(&rdp->lock, flags);
1240
1241                        /*
1242                         * We are running on this CPU irq-disabled, so no
1243                         * CPU can go offline until we re-enable irqs.
1244                         * The current CPU might have already gone
 1245                         * offline (between the for_each_online_cpu and
1246                         * the spin_lock_irqsave), but in that case all its
1247                         * callback lists will be empty, so no harm done.
1248                         *
1249                         * Advance the callbacks!  We share normal RCU's
1250                         * donelist, since callbacks are invoked the
1251                         * same way in either case.
1252                         */
1253                        if (rdp->waitschedlist != NULL) {
1254                                *rdp->donetail = rdp->waitschedlist;
1255                                rdp->donetail = rdp->waitschedtail;
1256
1257                                /*
1258                                 * Next rcu_check_callbacks() will
1259                                 * do the required raise_softirq().
1260                                 */
1261                        }
1262                        if (rdp->nextschedlist != NULL) {
1263                                rdp->waitschedlist = rdp->nextschedlist;
1264                                rdp->waitschedtail = rdp->nextschedtail;
1265                                couldsleep = 0;
1266                                couldsleepnext = 0;
1267                        } else {
1268                                rdp->waitschedlist = NULL;
1269                                rdp->waitschedtail = &rdp->waitschedlist;
1270                        }
1271                        rdp->nextschedlist = NULL;
1272                        rdp->nextschedtail = &rdp->nextschedlist;
1273
1274                        /* Mark sleep intention. */
1275
1276                        rdp->rcu_sched_sleeping = couldsleep;
1277
1278                        spin_unlock_irqrestore(&rdp->lock, flags);
1279                }
1280
1281                /* If we saw callbacks on the last scan, go deal with them. */
1282
1283                if (!couldsleep)
1284                        continue;
1285
1286                /* Attempt to block... */
1287
1288                spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1289                if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1290
1291                        /*
1292                         * Someone posted a callback after we scanned.
1293                         * Go take care of it.
1294                         */
1295                        spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1296                        couldsleepnext = 0;
1297                        continue;
1298                }
1299
1300                /* Block until the next person posts a callback. */
1301
1302                rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1303                spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1304                ret = 0;
1305                __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1306                        rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1307                        ret);
1308
1309                /*
1310                 * Signals would prevent us from sleeping, and we cannot
1311                 * do much with them in any case.  So flush them.
1312                 */
1313                if (ret)
1314                        flush_signals(current);
1315                couldsleepnext = 0;
1316
1317        } while (!kthread_should_stop());
1318
1319        return (0);
1320}
1321
1322/*
1323 * Check to see if any future RCU-related work will need to be done
1324 * by the current CPU, even if none need be done immediately, returning
1325 * 1 if so.  Assumes that notifiers would take care of handling any
1326 * outstanding requests from the RCU core.
1327 *
1328 * This function is part of the RCU implementation; it is -not-
1329 * an exported member of the RCU API.
1330 */
1331int rcu_needs_cpu(int cpu)
1332{
1333        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1334
1335        return (rdp->donelist != NULL ||
1336                !!rdp->waitlistcount ||
1337                rdp->nextlist != NULL ||
1338                rdp->nextschedlist != NULL ||
1339                rdp->waitschedlist != NULL);
1340}
1341
1342int rcu_pending(int cpu)
1343{
1344        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1345
1346        /* The CPU has at least one callback queued somewhere. */
1347
1348        if (rdp->donelist != NULL ||
1349            !!rdp->waitlistcount ||
1350            rdp->nextlist != NULL ||
1351            rdp->nextschedlist != NULL ||
1352            rdp->waitschedlist != NULL)
1353                return 1;
1354
1355        /* The RCU core needs an acknowledgement from this CPU. */
1356
1357        if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1358            (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1359                return 1;
1360
1361        /* This CPU has fallen behind the global grace-period number. */
1362
1363        if (rdp->completed != rcu_ctrlblk.completed)
1364                return 1;
1365
1366        /* Nothing needed from this CPU. */
1367
1368        return 0;
1369}
1370
1371static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1372                                unsigned long action, void *hcpu)
1373{
1374        long cpu = (long)hcpu;
1375
1376        switch (action) {
1377        case CPU_UP_PREPARE:
1378        case CPU_UP_PREPARE_FROZEN:
1379                rcu_online_cpu(cpu);
1380                break;
1381        case CPU_UP_CANCELED:
1382        case CPU_UP_CANCELED_FROZEN:
1383        case CPU_DEAD:
1384        case CPU_DEAD_FROZEN:
1385                rcu_offline_cpu(cpu);
1386                break;
1387        default:
1388                break;
1389        }
1390        return NOTIFY_OK;
1391}
1392
1393static struct notifier_block __cpuinitdata rcu_nb = {
1394        .notifier_call = rcu_cpu_notify,
1395};
1396
1397void __init __rcu_init(void)
1398{
1399        int cpu;
1400        int i;
1401        struct rcu_data *rdp;
1402
1403        printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1404        for_each_possible_cpu(cpu) {
1405                rdp = RCU_DATA_CPU(cpu);
1406                spin_lock_init(&rdp->lock);
1407                rdp->completed = 0;
1408                rdp->waitlistcount = 0;
1409                rdp->nextlist = NULL;
1410                rdp->nexttail = &rdp->nextlist;
1411                for (i = 0; i < GP_STAGES; i++) {
1412                        rdp->waitlist[i] = NULL;
1413                        rdp->waittail[i] = &rdp->waitlist[i];
1414                }
1415                rdp->donelist = NULL;
1416                rdp->donetail = &rdp->donelist;
1417                rdp->rcu_flipctr[0] = 0;
1418                rdp->rcu_flipctr[1] = 0;
1419                rdp->nextschedlist = NULL;
1420                rdp->nextschedtail = &rdp->nextschedlist;
1421                rdp->waitschedlist = NULL;
1422                rdp->waitschedtail = &rdp->waitschedlist;
1423                rdp->rcu_sched_sleeping = 0;
1424        }
1425        register_cpu_notifier(&rcu_nb);
1426
1427        /*
1428         * We don't need protection against CPU-Hotplug here
1429         * since
1430         * a) If a CPU comes online while we are iterating over the
1431         *    cpu_online_map below, we would only end up making a
1432         *    duplicate call to rcu_online_cpu() which sets the corresponding
1433         *    CPU's mask in the rcu_cpu_online_map.
1434         *
1435         * b) A CPU cannot go offline at this point in time since the user
1436         *    does not have access to the sysfs interface, nor do we
1437         *    suspend the system.
1438         */
1439        for_each_online_cpu(cpu)
1440                rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1441
1442        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1443}
1444
1445/*
1446 * Late-boot-time RCU initialization that must wait until after scheduler
1447 * has been initialized.
1448 */
1449void __init rcu_init_sched(void)
1450{
1451        rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1452                                                  NULL,
1453                                                  "rcu_sched_grace_period");
1454        WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1455}
1456
1457#ifdef CONFIG_RCU_TRACE
1458long *rcupreempt_flipctr(int cpu)
1459{
1460        return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1461}
1462EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1463
1464int rcupreempt_flip_flag(int cpu)
1465{
1466        return per_cpu(rcu_flip_flag, cpu);
1467}
1468EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1469
1470int rcupreempt_mb_flag(int cpu)
1471{
1472        return per_cpu(rcu_mb_flag, cpu);
1473}
1474EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1475
1476char *rcupreempt_try_flip_state_name(void)
1477{
1478        return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1479}
1480EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1481
1482struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1483{
1484        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1485
1486        return &rdp->trace;
1487}
1488EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1489
 1490#endif /* #ifdef CONFIG_RCU_TRACE */
1491