linux/kernel/context_tracking.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Context tracking: Probe on high level context boundaries such as kernel,
 * userspace, guest or idle.
 *
 * This is used by RCU to remove its dependency on the timer tick while a CPU
 * runs in idle, userspace or guest mode.
 *
 * User/guest tracking started by Frederic Weisbecker:
 *
 * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker
 *
 * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
 * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
 *
 * RCU extended quiescent state bits imported from kernel/rcu/tree.c
 * where the relevant authorship may be found.
 */

#include <linux/context_tracking.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <trace/events/rcu.h>


DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
        .dynticks_nesting = 1,
        .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
#endif
        .state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
};
EXPORT_SYMBOL_GPL(context_tracking);
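
/*
 * Rough sketch of the ->state layout, for illustration only (the
 * authoritative definitions live in include/linux/context_tracking_state.h):
 * the low bits hold the current CONTEXT_* value, while the bits from
 * RCU_DYNTICKS_IDX upward form a counter whose lowest bit tells whether RCU
 * is watching this CPU.  Entering an extended quiescent state adds
 * RCU_DYNTICKS_IDX + CONTEXT_<mode>, which flips the "watching" bit off and
 * records the mode; leaving it adds RCU_DYNTICKS_IDX - CONTEXT_<mode>, which
 * flips the bit back on and restores CONTEXT_KERNEL in the low bits.
 */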

#ifdef CONFIG_CONTEXT_TRACKING_IDLE
#define TPS(x)  tracepoint_string(x)

/* Record the current task on dyntick-idle entry. */
static __always_inline void rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
        WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}

/* Record no current task on dyntick-idle exit. */
static __always_inline void rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
        WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}

/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
static __always_inline void rcu_dynticks_task_trace_enter(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
        if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
                current->trc_reader_special.b.need_mb = true;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
static __always_inline void rcu_dynticks_task_trace_exit(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
        if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
                current->trc_reader_special.b.need_mb = false;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/*
 * Record entry into an extended quiescent state.  This is only to be
 * called when not already in an extended quiescent state, that is,
 * RCU is watching prior to the call to this function and is no longer
 * watching upon return.
 */
static noinstr void ct_kernel_exit_state(int offset)
{
        int seq;

        /*
         * CPUs seeing atomic_add_return() must see prior RCU read-side
         * critical sections, and we also must force ordering with the
         * next idle sojourn.
         */
        rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
        seq = ct_state_inc(offset);
        // RCU is no longer watching.  Better be in extended quiescent state!
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
}

/*
 * Record exit from an extended quiescent state.  This is only to be
 * called from an extended quiescent state, that is, RCU is not watching
 * prior to the call to this function and is watching upon return.
 */
static noinstr void ct_kernel_enter_state(int offset)
{
        int seq;

        /*
         * CPUs seeing atomic_add_return() must see prior idle sojourns,
         * and we also must force ordering with the next RCU read-side
         * critical section.
         */
        seq = ct_state_inc(offset);
        // RCU is now watching.  Better not be in an extended quiescent state!
        rcu_dynticks_task_trace_exit();  // After ->dynticks update!
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
}

/*
 * Enter an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 *
 * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
 * the possibility of usermode upcalls having messed up our count
 * of interrupt nesting level during the prior busy period.
 */
static void noinstr ct_kernel_exit(bool user, int offset)
{
        struct context_tracking *ct = this_cpu_ptr(&context_tracking);

        WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
        WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
                     ct_dynticks_nesting() == 0);
        if (ct_dynticks_nesting() != 1) {
                // RCU will still be watching, so just do accounting and leave.
                ct->dynticks_nesting--;
                return;
        }

        instrumentation_begin();
        lockdep_assert_irqs_disabled();
        trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
        rcu_preempt_deferred_qs(current);

        // instrumentation for the noinstr ct_kernel_exit_state()
        instrument_atomic_write(&ct->state, sizeof(ct->state));

        instrumentation_end();
        WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
        // RCU is watching here ...
        ct_kernel_exit_state(offset);
        // ... but is no longer watching here.
        rcu_dynticks_task_enter();
}

/*
 * Exit an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 *
 * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
 * allow for the possibility of usermode upcalls messing up our count of
 * interrupt nesting level during the busy period that is just now starting.
 */
static void noinstr ct_kernel_enter(bool user, int offset)
{
        struct context_tracking *ct = this_cpu_ptr(&context_tracking);
        long oldval;

        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
        oldval = ct_dynticks_nesting();
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
        if (oldval) {
                // RCU was already watching, so just do accounting and leave.
                ct->dynticks_nesting++;
                return;
        }
        rcu_dynticks_task_exit();
        // RCU is not watching here ...
        ct_kernel_enter_state(offset);
        // ... but is watching here.
        instrumentation_begin();

        // instrumentation for the noinstr ct_kernel_enter_state()
        instrument_atomic_write(&ct->state, sizeof(ct->state));

        trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
        WRITE_ONCE(ct->dynticks_nesting, 1);
        WARN_ON_ONCE(ct_dynticks_nmi_nesting());
        WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
        instrumentation_end();
}
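
/*
 * Illustrative walk-through of the process-level nesting above (not part of
 * the original logic): a CPU going idle from a task context normally has
 * ->dynticks_nesting == 1, so ct_kernel_exit() drops it to 0 and enters the
 * extended quiescent state; the matching ct_kernel_enter() finds 0, leaves
 * the EQS and writes the field back to 1.  At any other nesting value only
 * the counter is adjusted and the RCU state is left alone, which is the
 * "just do accounting and leave" path in both functions.
 */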

/**
 * ct_nmi_exit - inform RCU of exit from NMI context
 *
 * If we are returning from the outermost NMI handler that interrupted an
 * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
 * to let the RCU grace-period handling know that the CPU is back to
 * being RCU-idle.
 *
 * If you add or remove a call to ct_nmi_exit(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_nmi_exit(void)
{
        struct context_tracking *ct = this_cpu_ptr(&context_tracking);

        instrumentation_begin();
        /*
         * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
         * (We are exiting an NMI handler, so RCU better be paying attention
         * to us!)
         */
        WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
        WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());

        /*
         * If the nesting level is not 1, the CPU wasn't RCU-idle, so
         * leave it in non-RCU-idle state.
         */
        if (ct_dynticks_nmi_nesting() != 1) {
                trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
                                  ct_dynticks());
                WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
                           ct_dynticks_nmi_nesting() - 2);
                instrumentation_end();
                return;
        }

        /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
        trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
        WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */

        // instrumentation for the noinstr ct_kernel_exit_state()
        instrument_atomic_write(&ct->state, sizeof(ct->state));
        instrumentation_end();

        // RCU is watching here ...
        ct_kernel_exit_state(RCU_DYNTICKS_IDX);
        // ... but is no longer watching here.

        if (!in_nmi())
                rcu_dynticks_task_enter();
}

/**
 * ct_nmi_enter - inform RCU of entry to NMI context
 *
 * If the CPU was idle from RCU's viewpoint, update ct->state and
 * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
 * that the CPU is active.  This implementation permits nested NMIs, as
 * long as the nesting level does not overflow an int.  (You will probably
 * run out of stack space first.)
 *
 * If you add or remove a call to ct_nmi_enter(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_nmi_enter(void)
{
        long incby = 2;
        struct context_tracking *ct = this_cpu_ptr(&context_tracking);

        /* Complain about underflow. */
        WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);

        /*
         * If idle from RCU viewpoint, atomically increment ->dynticks
         * to mark non-idle and increment ->dynticks_nmi_nesting by one.
         * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
         * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
         * to be in the outermost NMI handler that interrupted an RCU-idle
         * period (observation due to Andy Lutomirski).
         */
        if (rcu_dynticks_curr_cpu_in_eqs()) {

                if (!in_nmi())
                        rcu_dynticks_task_exit();

                // RCU is not watching here ...
                ct_kernel_enter_state(RCU_DYNTICKS_IDX);
                // ... but is watching here.

                instrumentation_begin();
                // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
                instrument_atomic_read(&ct->state, sizeof(ct->state));
                // instrumentation for the noinstr ct_kernel_enter_state()
                instrument_atomic_write(&ct->state, sizeof(ct->state));

                incby = 1;
        } else if (!in_nmi()) {
                instrumentation_begin();
                rcu_irq_enter_check_tick();
        } else  {
                instrumentation_begin();
        }

        trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
                          ct_dynticks_nmi_nesting(),
                          ct_dynticks_nmi_nesting() + incby, ct_dynticks());
        instrumentation_end();
        WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
                   ct_dynticks_nmi_nesting() + incby);
        barrier();
}
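
/*
 * Worked example of the nesting arithmetic above, for illustration only:
 * an NMI that interrupts an RCU-idle CPU takes the incby = 1 path, so
 * ->dynticks_nmi_nesting goes 0 -> 1; a nested NMI on top of it adds 2,
 * giving 3.  On the way out, ct_nmi_exit() subtracts 2 whenever the value
 * is not 1 (3 -> 1), and only the outermost exit (value == 1) re-enters the
 * extended quiescent state and writes the field back to 0.  This is why a
 * value of exactly 1 identifies the outermost handler that interrupted an
 * RCU-idle period.
 */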

/**
 * ct_idle_enter - inform RCU that current CPU is entering idle
 *
 * Enter idle mode, in other words, -leave- the mode in which RCU
 * read-side critical sections can occur.  (Though RCU read-side
 * critical sections can occur in irq handlers in idle, a possibility
 * handled by irq_enter() and irq_exit().)
 *
 * If you add or remove a call to ct_idle_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_idle_enter(void)
{
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
        ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
}
EXPORT_SYMBOL_GPL(ct_idle_enter);

/**
 * ct_idle_exit - inform RCU that current CPU is leaving idle
 *
 * Exit idle mode, in other words, -enter- the mode in which RCU
 * read-side critical sections can occur.
 *
 * If you add or remove a call to ct_idle_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_idle_exit(void)
{
        unsigned long flags;

        raw_local_irq_save(flags);
        ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
        raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ct_idle_exit);
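
/*
 * Sketch of the expected pairing, for illustration only (the real call
 * sites live in the idle loop / cpuidle code and differ per architecture):
 *
 *      raw_local_irq_disable();
 *      ct_idle_enter();        // RCU stops watching this CPU
 *      low_power_wait();       // hypothetical idle primitive
 *      ct_idle_exit();         // RCU is watching again
 *      raw_local_irq_enable();
 *
 * ct_idle_enter() must be called with interrupts disabled (see the
 * WARN_ON_ONCE() above); ct_idle_exit() saves and restores the interrupt
 * state itself.
 */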

/**
 * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
 *
 * Enter an interrupt handler, which might possibly result in exiting
 * idle mode, in other words, entering the mode in which read-side critical
 * sections can occur.  The caller must have disabled interrupts.
 *
 * Note that the Linux kernel is fully capable of entering an interrupt
 * handler that it never exits, for example when doing upcalls to user mode!
 * This code assumes that the idle loop never does upcalls to user mode.
 * If your architecture's idle loop does do upcalls to user mode (or does
 * anything else that results in unbalanced calls to the irq_enter() and
 * irq_exit() functions), RCU will give you what you deserve, good and hard.
 * But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 *
 * If you add or remove a call to ct_irq_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
noinstr void ct_irq_enter(void)
{
        lockdep_assert_irqs_disabled();
        ct_nmi_enter();
}

/**
 * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
 *
 * Exit from an interrupt handler, which might possibly result in entering
 * idle mode, in other words, leaving the mode in which read-side critical
 * sections can occur.  The caller must have disabled interrupts.
 *
 * This code assumes that the idle loop never does anything that might
 * result in unbalanced calls to irq_enter() and irq_exit().  If your
 * architecture's idle loop violates this assumption, RCU will give you what
 * you deserve, good and hard.  But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 *
 * If you add or remove a call to ct_irq_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
noinstr void ct_irq_exit(void)
{
        lockdep_assert_irqs_disabled();
        ct_nmi_exit();
}

/*
 * Wrapper for ct_irq_enter() where interrupts are enabled.
 *
 * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void ct_irq_enter_irqson(void)
{
        unsigned long flags;

        local_irq_save(flags);
        ct_irq_enter();
        local_irq_restore(flags);
}
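
/*
 * Illustration only: this wrapper and ct_irq_exit_irqson() below exist for
 * callers that cannot guarantee the IRQs-disabled context required by
 * ct_irq_enter()/ct_irq_exit(), e.g. (hypothetical caller):
 *
 *      ct_irq_enter_irqson();
 *      // code that needs RCU to be watching although it runs from a
 *      // context that RCU would otherwise treat as idle
 *      ct_irq_exit_irqson();
 *
 * They merely bracket ct_irq_enter()/ct_irq_exit() with
 * local_irq_save()/local_irq_restore(), so unlike the functions above they
 * are not noinstr.
 */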

/*
 * Wrapper for ct_irq_exit() where interrupts are enabled.
 *
 * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void ct_irq_exit_irqson(void)
{
        unsigned long flags;

        local_irq_save(flags);
        ct_irq_exit();
        local_irq_restore(flags);
}
#else
static __always_inline void ct_kernel_exit(bool user, int offset) { }
static __always_inline void ct_kernel_enter(bool user, int offset) { }
#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */

#ifdef CONFIG_CONTEXT_TRACKING_USER

#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>

DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);

static noinstr bool context_tracking_recursion_enter(void)
{
        int recursion;

        recursion = __this_cpu_inc_return(context_tracking.recursion);
        if (recursion == 1)
                return true;

        WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
        __this_cpu_dec(context_tracking.recursion);

        return false;
}

static __always_inline void context_tracking_recursion_exit(void)
{
        __this_cpu_dec(context_tracking.recursion);
}

/**
 * __ct_user_enter - Inform the context tracking that the CPU is going
 *                   to enter user or guest space mode.
 *
 * This function must be called right before we switch from the kernel
 * to user or guest space, when it's guaranteed the remaining kernel
 * instructions to execute won't use any RCU read side critical section
 * because this function sets RCU in extended quiescent state.
 */
void noinstr __ct_user_enter(enum ctx_state state)
{
        struct context_tracking *ct = this_cpu_ptr(&context_tracking);
        lockdep_assert_irqs_disabled();

        /* Kernel threads aren't supposed to go to userspace */
        WARN_ON_ONCE(!current->mm);

        if (!context_tracking_recursion_enter())
                return;

        if (__ct_state() != state) {
                if (ct->active) {
                        /*
                         * At this stage, only low level arch entry code remains and
                         * then we'll run in userspace. We can assume there won't be
                         * any RCU read-side critical section until the next call to
                         * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
                         * on the tick.
                         */
                        if (state == CONTEXT_USER) {
                                instrumentation_begin();
                                trace_user_enter(0);
                                vtime_user_enter(current);
                                instrumentation_end();
                        }
                        /*
                         * Unless the architecture uses the generic entry code, we may
                         * be past the last rescheduling opportunity in the entry code.
                         * Trigger a self IPI that will fire and reschedule once we
                         * resume in user/guest mode.
                         */
                        rcu_irq_work_resched();

                        /*
                         * Enter RCU idle mode right before resuming userspace.  No use of RCU
                         * is permitted between this call and rcu_eqs_exit(). This way the
                         * CPU doesn't need to maintain the tick for RCU maintenance purposes
                         * when the CPU runs in userspace.
                         */
                        ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);

                        /*
                         * Special case if we only track user <-> kernel transitions for tickless
                         * cputime accounting but we don't support RCU extended quiescent state.
                         * In this case we don't care about any concurrency/ordering.
                         */
                        if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
                                atomic_set(&ct->state, state);
                } else {
                        /*
                         * Even if context tracking is disabled on this CPU, because it's outside
                         * the full dynticks mask for example, we still have to keep track of the
                         * context transitions and states to prevent inconsistency with those of
                         * other CPUs.
                         * If a task triggers an exception in userspace, sleeps in the exception
                         * handler and then migrates to another CPU, that new CPU must know where
                         * the exception returns to by the time we call exception_exit().
                         * This information can only be provided by the previous CPU when it called
                         * exception_enter().
                         * OTOH we can spare the calls to vtime and RCU when context_tracking.active
                         * is false because we know that CPU is not tickless.
                         */
                        if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
                                /* Tracking for vtime only, no concurrent RCU EQS accounting */
                                atomic_set(&ct->state, state);
                        } else {
                                /*
                                 * Tracking for vtime and RCU EQS. Make sure we don't race
                                 * with NMIs. OTOH we don't care about ordering here since
                                 * RCU only requires RCU_DYNTICKS_IDX increments to be fully
                                 * ordered.
                                 */
                                atomic_add(state, &ct->state);
                        }
                }
        }
        context_tracking_recursion_exit();
}
EXPORT_SYMBOL_GPL(__ct_user_enter);
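
/*
 * Sketch of how arch code is expected to reach this function, for
 * illustration only (the preferred wrappers user_enter_irqoff() and
 * user_exit_irqoff() live in linux/context_tracking.h and end up here):
 *
 *      // about to return to userspace, IRQs disabled:
 *      user_enter_irqoff();            // -> __ct_user_enter(CONTEXT_USER)
 *      ...
 *      // first thing after re-entering the kernel, IRQs disabled:
 *      user_exit_irqoff();             // -> __ct_user_exit(CONTEXT_USER)
 *
 * Guest transitions take the same path with CONTEXT_GUEST, via
 * context_tracking_guest_enter()/context_tracking_guest_exit().
 */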

/*
 * OBSOLETE:
 * This function should be noinstr but the below local_irq_restore() is
 * unsafe because it involves illegal RCU uses through tracing and lockdep.
 * This is unlikely to be fixed as this function is obsolete. The preferred
 * way is to call __context_tracking_enter() through user_enter_irqoff()
 * or context_tracking_guest_enter(). It should be the arch entry code's
 * responsibility to call into context tracking with IRQs disabled.
 */
void ct_user_enter(enum ctx_state state)
{
        unsigned long flags;

        /*
         * Some contexts may involve an exception occurring in an irq,
         * leading to that nesting:
         * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit()
         * This would mess up the dyntick_nesting count though. And rcu_irq_*()
         * helpers are enough to protect RCU uses inside the exception. So
         * just return immediately if we detect we are in an IRQ.
         */
        if (in_interrupt())
                return;

        local_irq_save(flags);
        __ct_user_enter(state);
        local_irq_restore(flags);
}
NOKPROBE_SYMBOL(ct_user_enter);
EXPORT_SYMBOL_GPL(ct_user_enter);

/**
 * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
 *                         archs that didn't manage to check the context tracking
 *                         static key from low level code.
 *
 * This OBSOLETE function should be noinstr but it unsafely calls
 * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
 * This is unlikely to be fixed as this function is obsolete. The preferred
 * way is to call user_enter_irqoff(). It should be the arch entry code's
 * responsibility to call into context tracking with IRQs disabled.
 */
void user_enter_callable(void)
{
        user_enter();
}
NOKPROBE_SYMBOL(user_enter_callable);

/**
 * __ct_user_exit - Inform the context tracking that the CPU is
 *                  exiting user or guest mode and entering the kernel.
 *
 * This function must be called after we entered the kernel from user or
 * guest space, before any use of an RCU read-side critical section. This
 * potentially includes any high level kernel code like syscalls, exceptions,
 * signal handling, etc...
 *
 * This call supports re-entrancy. This way it can be called from any exception
 * handler without needing to know if we came from userspace or not.
 */
void noinstr __ct_user_exit(enum ctx_state state)
{
        struct context_tracking *ct = this_cpu_ptr(&context_tracking);

        if (!context_tracking_recursion_enter())
                return;

        if (__ct_state() == state) {
                if (ct->active) {
                        /*
                         * Exit RCU idle mode while entering the kernel because it can
                         * run an RCU read-side critical section anytime.
                         */
                        ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
                        if (state == CONTEXT_USER) {
                                instrumentation_begin();
                                vtime_user_exit(current);
                                trace_user_exit(0);
                                instrumentation_end();
                        }

                        /*
                         * Special case if we only track user <-> kernel transitions for tickless
                         * cputime accounting but we don't support RCU extended quiescent state.
                         * In this case we don't care about any concurrency/ordering.
                         */
                        if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
                                atomic_set(&ct->state, CONTEXT_KERNEL);

                } else {
                        if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
                                /* Tracking for vtime only, no concurrent RCU EQS accounting */
                                atomic_set(&ct->state, CONTEXT_KERNEL);
                        } else {
                                /*
                                 * Tracking for vtime and RCU EQS. Make sure we don't race
                                 * with NMIs. OTOH we don't care about ordering here since
                                 * RCU only requires RCU_DYNTICKS_IDX increments to be fully
                                 * ordered.
                                 */
                                atomic_sub(state, &ct->state);
                        }
                }
        }
        context_tracking_recursion_exit();
}
EXPORT_SYMBOL_GPL(__ct_user_exit);

/*
 * OBSOLETE:
 * This function should be noinstr but the below local_irq_save() is
 * unsafe because it involves illegal RCU uses through tracing and lockdep.
 * This is unlikely to be fixed as this function is obsolete. The preferred
 * way is to call __context_tracking_exit() through user_exit_irqoff()
 * or context_tracking_guest_exit(). It should be the arch entry code's
 * responsibility to call into context tracking with IRQs disabled.
 */
void ct_user_exit(enum ctx_state state)
{
        unsigned long flags;

        if (in_interrupt())
                return;

        local_irq_save(flags);
        __ct_user_exit(state);
        local_irq_restore(flags);
}
NOKPROBE_SYMBOL(ct_user_exit);
EXPORT_SYMBOL_GPL(ct_user_exit);

/**
 * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
 *                        archs that didn't manage to check the context tracking
 *                        static key from low level code.
 *
 * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
 * involving illegal RCU uses through tracing and lockdep. This is unlikely
 * to be fixed as this function is obsolete. The preferred way is to call
 * user_exit_irqoff(). It should be the arch entry code's responsibility to
 * call into context tracking with IRQs disabled.
 */
void user_exit_callable(void)
{
        user_exit();
}
NOKPROBE_SYMBOL(user_exit_callable);

void __init ct_cpu_track_user(int cpu)
{
        static __initdata bool initialized = false;

        if (!per_cpu(context_tracking.active, cpu)) {
                per_cpu(context_tracking.active, cpu) = true;
                static_branch_inc(&context_tracking_key);
        }

        if (initialized)
                return;

#ifdef CONFIG_HAVE_TIF_NOHZ
        /*
         * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork.
         * This assumes that init is the only task at this early boot stage.
         */
        set_tsk_thread_flag(&init_task, TIF_NOHZ);
#endif
        WARN_ON_ONCE(!tasklist_empty());

        initialized = true;
}

#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
void __init context_tracking_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                ct_cpu_track_user(cpu);
}
#endif

#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */