linux/drivers/cpuidle/cpuidle-pseries.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  cpuidle-pseries - idle state cpuidle driver.
 *  Adapted from drivers/idle/intel_idle.c and
 *  drivers/acpi/processor_idle.c
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/cpuidle.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

#include <asm/paca.h>
#include <asm/reg.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/runlatch.h>
#include <asm/idle.h>
#include <asm/plpar_wrappers.h>
#include <asm/rtas.h>

static struct cpuidle_driver pseries_idle_driver = {
        .name             = "pseries_idle",
        .owner            = THIS_MODULE,
};

static int max_idle_state __read_mostly;
static struct cpuidle_state *cpuidle_state_table __read_mostly;
static u64 snooze_timeout __read_mostly;
static bool snooze_timeout_en __read_mostly;

static int snooze_loop(struct cpuidle_device *dev,
                        struct cpuidle_driver *drv,
                        int index)
{
        u64 snooze_exit_time;

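        /*
         * Advertise that this CPU is polling on need_resched(): the
         * scheduler can then wake us by setting TIF_NEED_RESCHED
         * without having to send a resched IPI.
         */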
        set_thread_flag(TIF_POLLING_NRFLAG);

        pseries_idle_prolog();
        local_irq_enable();
        snooze_exit_time = get_tb() + snooze_timeout;

        while (!need_resched()) {
                HMT_low();
                HMT_very_low();
                if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
                        /*
                         * Task has not woken up but we are exiting the polling
                         * loop anyway. Require a barrier after polling is
                         * cleared to order subsequent test of need_resched().
                         */
                        clear_thread_flag(TIF_POLLING_NRFLAG);
                        smp_mb();
                        break;
                }
        }

        HMT_medium();
        clear_thread_flag(TIF_POLLING_NRFLAG);

        local_irq_disable();

        pseries_idle_epilog();

        return index;
}

static void check_and_cede_processor(void)
{
        /*
         * Ensure our interrupt state is properly tracked. This also
         * checks whether an interrupt occurred while interrupts were
         * soft-disabled; if so, we refrain from ceding.
         */
        if (prep_irq_for_idle()) {
                cede_processor();
#ifdef CONFIG_TRACE_IRQFLAGS
                /* Ensure that H_CEDE returns with IRQs on */
                if (WARN_ON(!(mfmsr() & MSR_EE)))
                        __hard_irq_enable();
#endif
        }
}

/*
 * XCEDE: Extended CEDE states discovered through the
 *        "ibm,get-system-parameter" RTAS call with the token
 *        CEDE_LATENCY_TOKEN
 */

/*
 * Section 7.3.16, System Parameters Option, of PAPR version 2.8.1 has a
 * table with all the parameters to ibm,get-system-parameter.
 * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency
 * Settings Information.
 */
#define CEDE_LATENCY_TOKEN      45

/*
 * If the platform supports the cede latency settings information system
 * parameter it must provide the following information in the NULL terminated
 * parameter string:
 *
 * a. The first byte is the length "N" of each cede latency setting record
 *    minus one (zero indicates a length of 1 byte).
 *
 * b. For each supported cede latency setting, a cede latency setting record
 *    consisting of the first "N" bytes as per the following table.
 *
 *    -----------------------------
 *    | Field           | Field   |
 *    | Name            | Length  |
 *    -----------------------------
 *    | Cede Latency    | 1 Byte  |
 *    | Specifier Value |         |
 *    -----------------------------
 *    | Maximum wakeup  |         |
 *    | latency in      | 8 Bytes |
 *    | tb-ticks        |         |
 *    -----------------------------
 *    | Responsive to   |         |
 *    | external        | 1 Byte  |
 *    | interrupts      |         |
 *    -----------------------------
 *
 * This version has a cede latency record size of 10 bytes.
 *
 * The structure xcede_latency_payload represents a) and b), with
 * xcede_latency_record representing the table in b).
 *
 * xcede_latency_parameter is what gets returned by the
 * ibm,get-system-parameter RTAS call when made with
 * CEDE_LATENCY_TOKEN.
 *
 * These structures are only used to represent the data obtained by the RTAS
 * call. The data is big-endian.
 */
struct xcede_latency_record {
        u8      hint;
        __be64  latency_ticks;
        u8      wake_on_irqs;
} __packed;

// Make space for 16 records, which "should be enough".
struct xcede_latency_payload {
        u8     record_size;
        struct xcede_latency_record records[16];
} __packed;

struct xcede_latency_parameter {
        __be16  payload_size;
        struct xcede_latency_payload payload;
        u8 null_char;
} __packed;
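
/*
 * For illustration only, a hypothetical CEDE_LATENCY_TOKEN buffer
 * advertising a single 10-byte record (all values below are made up;
 * everything is big-endian):
 *
 *   00 0c                    payload_size   = 12 (1 + 10 + 1 bytes)
 *   09                       record_size    = N - 1 = 9, i.e. 10-byte records
 *   01                       records[0].hint          = 1
 *   00 00 00 00 00 00 27 10  records[0].latency_ticks = 0x2710 tb-ticks
 *   01                       records[0].wake_on_irqs  = 1
 *   00                       trailing NULL byte
 *
 * parse_cede_parameters() below would compute one record from this:
 * (payload_size - 2) / (record_size + 1) = 10 / 10 = 1.
 */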

static unsigned int nr_xcede_records;
static struct xcede_latency_parameter xcede_latency_parameter __initdata;

static int __init parse_cede_parameters(void)
{
        struct xcede_latency_payload *payload;
        u32 total_xcede_records_size;
        u8 xcede_record_size;
        u16 payload_size;
        int ret, i;

        ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
                        NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
                        sizeof(xcede_latency_parameter));
        if (ret) {
                pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
                return ret;
        }

        payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
        payload = &xcede_latency_parameter.payload;

        xcede_record_size = payload->record_size + 1;

        if (xcede_record_size != sizeof(struct xcede_latency_record)) {
                pr_err("xcede: Expected record-size %zu. Observed size %u.\n",
                       sizeof(struct xcede_latency_record), xcede_record_size);
                return -EINVAL;
        }

        pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);

        /*
         * Since payload_size accounts for the one-byte record_size field
         * and the trailing NULL byte, the remaining (payload_size - 2)
         * bytes correspond to the array of cede latency setting records.
         * E.g. a payload_size of 12 with 10-byte records yields
         * (12 - 2) / 10 = 1 record.
         */
        total_xcede_records_size = payload_size - 2;
        nr_xcede_records = total_xcede_records_size / xcede_record_size;

        for (i = 0; i < nr_xcede_records; i++) {
                struct xcede_latency_record *record = &payload->records[i];
                u64 latency_ticks = be64_to_cpu(record->latency_ticks);
                u8 wake_on_irqs = record->wake_on_irqs;
                u8 hint = record->hint;

                pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
                        i, hint, latency_ticks, wake_on_irqs);
        }

        return 0;
}

#define NR_DEDICATED_STATES     2 /* snooze, CEDE */
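/*
 * Per-state cede latency specifier handed to the hypervisor for the
 * dedicated states below. Entries are zero-initialized, i.e. plain
 * CEDE(0), and nothing in this file sets them to anything else.
 */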
static u8 cede_latency_hint[NR_DEDICATED_STATES];

static int dedicated_cede_loop(struct cpuidle_device *dev,
                                struct cpuidle_driver *drv,
                                int index)
{
        u8 old_latency_hint;

        pseries_idle_prolog();
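
        /*
         * Signal our willingness to donate this dedicated CPU's idle
         * cycles to other partitions, and pass the hypervisor the cede
         * latency specifier for this state.
         */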
        get_lppaca()->donate_dedicated_cpu = 1;
        old_latency_hint = get_lppaca()->cede_latency_hint;
        get_lppaca()->cede_latency_hint = cede_latency_hint[index];

        HMT_medium();
        check_and_cede_processor();

        local_irq_disable();
        get_lppaca()->donate_dedicated_cpu = 0;
        get_lppaca()->cede_latency_hint = old_latency_hint;

        pseries_idle_epilog();

        return index;
}

static int shared_cede_loop(struct cpuidle_device *dev,
                        struct cpuidle_driver *drv,
                        int index)
{
        pseries_idle_prolog();

        /*
         * Yield the processor to the hypervisor. We return if an
         * external interrupt occurs (external interrupts are driven
         * before we get back here) or if another processor prods us.
         * When we return here, external interrupts are enabled.
         */
        check_and_cede_processor();

        local_irq_disable();
        pseries_idle_epilog();

        return index;
}

/*
 * States for dedicated partition case. exit_latency and
 * target_residency are in microseconds, as defined by cpuidle.
 */
static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
        { /* Snooze */
                .name = "snooze",
                .desc = "snooze",
                .exit_latency = 0,
                .target_residency = 0,
                .enter = &snooze_loop },
        { /* CEDE */
                .name = "CEDE",
                .desc = "CEDE",
                .exit_latency = 10,
                .target_residency = 100,
                .enter = &dedicated_cede_loop },
};

/*
 * States for shared partition case.
 */
static struct cpuidle_state shared_states[] = {
        { /* Snooze */
                .name = "snooze",
                .desc = "snooze",
                .exit_latency = 0,
                .target_residency = 0,
                .enter = &snooze_loop },
        { /* Shared Cede */
                .name = "Shared Cede",
                .desc = "Shared Cede",
                .exit_latency = 10,
                .target_residency = 100,
                .enter = &shared_cede_loop },
};

static int pseries_cpuidle_cpu_online(unsigned int cpu)
{
        struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);

        if (dev && cpuidle_get_driver()) {
                cpuidle_pause_and_lock();
                cpuidle_enable_device(dev);
                cpuidle_resume_and_unlock();
        }
        return 0;
}

static int pseries_cpuidle_cpu_dead(unsigned int cpu)
{
        struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);

        if (dev && cpuidle_get_driver()) {
                cpuidle_pause_and_lock();
                cpuidle_disable_device(dev);
                cpuidle_resume_and_unlock();
        }
        return 0;
}

/*
 * pseries_cpuidle_driver_init(): populate the driver's state table
 * from cpuidle_state_table, skipping any state without an enter()
 * callback.
 */
static int pseries_cpuidle_driver_init(void)
{
        int idle_state;
        struct cpuidle_driver *drv = &pseries_idle_driver;

        drv->state_count = 0;

        for (idle_state = 0; idle_state < max_idle_state; ++idle_state) {
                /* Is the state not enabled? */
                if (cpuidle_state_table[idle_state].enter == NULL)
                        continue;

                drv->states[drv->state_count] = /* structure copy */
                        cpuidle_state_table[idle_state];

                drv->state_count += 1;
        }

        return 0;
}

static void __init fixup_cede0_latency(void)
{
        struct xcede_latency_payload *payload;
        u64 min_latency_us;
        int i;

        min_latency_us = dedicated_states[1].exit_latency; // CEDE latency

        if (parse_cede_parameters())
                return;

        pr_info("cpuidle: Skipping the %u Extended CEDE idle states\n",
                nr_xcede_records);

        payload = &xcede_latency_parameter.payload;
        for (i = 0; i < nr_xcede_records; i++) {
                struct xcede_latency_record *record = &payload->records[i];
                u64 latency_tb = be64_to_cpu(record->latency_ticks);
                u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);

                if (latency_us == 0) {
                        pr_warn("cpuidle: xcede record %d has an unrealistic latency of 0us.\n", i);
                        continue;
                }

                if (latency_us < min_latency_us)
                        min_latency_us = latency_us;
        }

        /*
         * By default, we assume that CEDE(0) has exit latency 10us,
         * since there is no way for us to query this from the platform.
         *
         * However, if the wakeup latency of an Extended CEDE state is
         * smaller than 10us, then we can be sure that CEDE(0) requires
         * no more than that. For example, if the smallest reported
         * XCEDE wakeup latency is 3us, CEDE(0) is fixed up below to an
         * exit latency of 2us and a target residency of 20us.
         *
         * Perform the fix-up.
         */
        if (min_latency_us < dedicated_states[1].exit_latency) {
                /*
                 * We set a minimum of 1us wakeup latency for cede0 to
                 * distinguish it from snooze.
                 */
                u64 cede0_latency = 1;

                if (min_latency_us > cede0_latency)
                        cede0_latency = min_latency_us - 1;

                dedicated_states[1].exit_latency = cede0_latency;
                dedicated_states[1].target_residency = 10 * cede0_latency;
                pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
                        cede0_latency);
        }
}

/*
 * pseries_idle_probe()
 * Choose state table for shared versus dedicated partition
 */
static int pseries_idle_probe(void)
{
        if (cpuidle_disable != IDLE_NO_OVERRIDE)
                return -ENODEV;

        if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
                /*
                 * Use local_paca instead of get_lppaca() here, since
                 * preemption is not disabled. That is fine: lppaca_ptr
                 * does not need to be the one for the current CPU,
                 * because the shared-processor property is the same on
                 * every CPU.
                 */
                if (lppaca_shared_proc(local_paca->lppaca_ptr)) {
                        cpuidle_state_table = shared_states;
                        max_idle_state = ARRAY_SIZE(shared_states);
                } else {
                        fixup_cede0_latency();
                        cpuidle_state_table = dedicated_states;
                        max_idle_state = NR_DEDICATED_STATES;
                }
        } else {
                return -ENODEV;
        }

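        /*
         * Poll in snooze for at most the target residency of the next
         * deeper state, converted from microseconds to timebase ticks.
         */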
        if (max_idle_state > 1) {
                snooze_timeout_en = true;
                snooze_timeout = cpuidle_state_table[1].target_residency *
                                 tb_ticks_per_usec;
        }
        return 0;
}

static int __init pseries_processor_idle_init(void)
{
        int retval;

        retval = pseries_idle_probe();
        if (retval)
                return retval;

        pseries_cpuidle_driver_init();
        retval = cpuidle_register(&pseries_idle_driver, NULL);
        if (retval) {
                printk(KERN_DEBUG "Registration of pseries driver failed.\n");
                return retval;
        }

        retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                           "cpuidle/pseries:online",
                                           pseries_cpuidle_cpu_online, NULL);
        WARN_ON(retval < 0);
        retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
                                           "cpuidle/pseries:DEAD", NULL,
                                           pseries_cpuidle_cpu_dead);
        WARN_ON(retval < 0);
        printk(KERN_DEBUG "pseries_idle_driver registered\n");
        return 0;
}

device_initcall(pseries_processor_idle_init);