linux/drivers/cpufreq/intel_pstate.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * intel_pstate.c: Native P state management for Intel processors
   4 *
   5 * (C) Copyright 2012 Intel Corporation
   6 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com>
   7 */
   8
   9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  10
  11#include <linux/kernel.h>
  12#include <linux/kernel_stat.h>
  13#include <linux/module.h>
  14#include <linux/ktime.h>
  15#include <linux/hrtimer.h>
  16#include <linux/tick.h>
  17#include <linux/slab.h>
  18#include <linux/sched/cpufreq.h>
  19#include <linux/list.h>
  20#include <linux/cpu.h>
  21#include <linux/cpufreq.h>
  22#include <linux/sysfs.h>
  23#include <linux/types.h>
  24#include <linux/fs.h>
  25#include <linux/acpi.h>
  26#include <linux/vmalloc.h>
  27#include <linux/pm_qos.h>
  28#include <trace/events/power.h>
  29
  30#include <asm/div64.h>
  31#include <asm/msr.h>
  32#include <asm/cpu_device_id.h>
  33#include <asm/cpufeature.h>
  34#include <asm/intel-family.h>
  35
  36#define INTEL_PSTATE_SAMPLING_INTERVAL  (10 * NSEC_PER_MSEC)
  37
  38#define INTEL_CPUFREQ_TRANSITION_LATENCY        20000
  39#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP      5000
  40#define INTEL_CPUFREQ_TRANSITION_DELAY          500
  41
  42#ifdef CONFIG_ACPI
  43#include <acpi/processor.h>
  44#include <acpi/cppc_acpi.h>
  45#endif
  46
  47#define FRAC_BITS 8
  48#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
  49#define fp_toint(X) ((X) >> FRAC_BITS)
  50
  51#define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3))
  52
  53#define EXT_BITS 6
  54#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
  55#define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
  56#define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS)
  57
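     /*
      * Fixed-point helpers: plain fixed-point values carry FRAC_BITS (8)
      * fractional bits, while the "ext" variants carry EXT_FRAC_BITS (14)
      * bits to give extra precision for intermediate arithmetic.
      */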
  58static inline int32_t mul_fp(int32_t x, int32_t y)
  59{
  60        return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
  61}
  62
  63static inline int32_t div_fp(s64 x, s64 y)
  64{
  65        return div64_s64((int64_t)x << FRAC_BITS, y);
  66}
  67
  68static inline int ceiling_fp(int32_t x)
  69{
  70        int mask, ret;
  71
  72        ret = fp_toint(x);
  73        mask = (1 << FRAC_BITS) - 1;
  74        if (x & mask)
  75                ret += 1;
  76        return ret;
  77}
  78
  79static inline u64 mul_ext_fp(u64 x, u64 y)
  80{
  81        return (x * y) >> EXT_FRAC_BITS;
  82}
  83
  84static inline u64 div_ext_fp(u64 x, u64 y)
  85{
  86        return div64_u64(x << EXT_FRAC_BITS, y);
  87}
  88
  89/**
  90 * struct sample -      Store performance sample
   91 * @core_avg_perf:      Ratio of APERF/MPERF which is the actual average
   92 *                      performance during the last sample period
   93 * @busy_scaled:        Scaled busy value which is used to calculate the next
   94 *                      P state. This can be different from core_avg_perf
   95 *                      to account for CPU idle periods
  96 * @aperf:              Difference of actual performance frequency clock count
  97 *                      read from APERF MSR between last and current sample
  98 * @mperf:              Difference of maximum performance frequency clock count
  99 *                      read from MPERF MSR between last and current sample
 100 * @tsc:                Difference of time stamp counter between last and
 101 *                      current sample
 102 * @time:               Current time from scheduler
 103 *
 104 * This structure is used in the cpudata structure to store performance sample
 105 * data for choosing next P State.
 106 */
 107struct sample {
 108        int32_t core_avg_perf;
 109        int32_t busy_scaled;
 110        u64 aperf;
 111        u64 mperf;
 112        u64 tsc;
 113        u64 time;
 114};
 115
 116/**
 117 * struct pstate_data - Store P state data
 118 * @current_pstate:     Current requested P state
 119 * @min_pstate:         Min P state possible for this platform
 120 * @max_pstate:         Max P state possible for this platform
  121 * @max_pstate_physical: Maximum physical P state for a processor.
  122 *                      This can be higher than max_pstate, which can
  123 *                      be limited by platform thermal design power limits
  124 * @scaling:            Scaling factor to convert P states to cpufreq
  125 *                      frequency units
 126 * @turbo_pstate:       Max Turbo P state possible for this platform
 127 * @max_freq:           @max_pstate frequency in cpufreq units
 128 * @turbo_freq:         @turbo_pstate frequency in cpufreq units
 129 *
 130 * Stores the per cpu model P state limits and current P state.
 131 */
 132struct pstate_data {
 133        int     current_pstate;
 134        int     min_pstate;
 135        int     max_pstate;
 136        int     max_pstate_physical;
 137        int     scaling;
 138        int     turbo_pstate;
 139        unsigned int max_freq;
 140        unsigned int turbo_freq;
 141};
 142
 143/**
 144 * struct vid_data -    Stores voltage information data
 145 * @min:                VID data for this platform corresponding to
 146 *                      the lowest P state
 147 * @max:                VID data corresponding to the highest P State.
 148 * @turbo:              VID data for turbo P state
 149 * @ratio:              Ratio of (vid max - vid min) /
 150 *                      (max P state - Min P State)
 151 *
  152 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling).
  153 * This data is used on Atom platforms, where in addition to the target P state,
  154 * the voltage data needs to be specified to select the next P state.
 155 */
 156struct vid_data {
 157        int min;
 158        int max;
 159        int turbo;
 160        int32_t ratio;
 161};
 162
 163/**
 164 * struct global_params - Global parameters, mostly tunable via sysfs.
 165 * @no_turbo:           Whether or not to use turbo P-states.
 166 * @turbo_disabled:     Whether or not turbo P-states are available at all,
 167 *                      based on the MSR_IA32_MISC_ENABLE value and whether or
 168 *                      not the maximum reported turbo P-state is different from
 169 *                      the maximum reported non-turbo one.
 170 * @turbo_disabled_mf:  The @turbo_disabled value reflected by cpuinfo.max_freq.
 171 * @min_perf_pct:       Minimum capacity limit in percent of the maximum turbo
 172 *                      P-state capacity.
 173 * @max_perf_pct:       Maximum capacity limit in percent of the maximum turbo
 174 *                      P-state capacity.
 175 */
 176struct global_params {
 177        bool no_turbo;
 178        bool turbo_disabled;
 179        bool turbo_disabled_mf;
 180        int max_perf_pct;
 181        int min_perf_pct;
 182};
 183
 184/**
 185 * struct cpudata -     Per CPU instance data storage
 186 * @cpu:                CPU number for this instance data
 187 * @policy:             CPUFreq policy value
 188 * @update_util:        CPUFreq utility callback information
 189 * @update_util_set:    CPUFreq utility callback is set
 190 * @iowait_boost:       iowait-related boost fraction
 191 * @last_update:        Time of the last update.
 192 * @pstate:             Stores P state limits for this CPU
 193 * @vid:                Stores VID limits for this CPU
 194 * @last_sample_time:   Last Sample time
 195 * @aperf_mperf_shift:  APERF vs MPERF counting frequency difference
 196 * @prev_aperf:         Last APERF value read from APERF MSR
 197 * @prev_mperf:         Last MPERF value read from MPERF MSR
 198 * @prev_tsc:           Last timestamp counter (TSC) value
  199 * @prev_cummulative_iowait: IO wait time difference between the last and
  200 *                      current sample
 201 * @sample:             Storage for storing last Sample data
 202 * @min_perf_ratio:     Minimum capacity in terms of PERF or HWP ratios
 203 * @max_perf_ratio:     Maximum capacity in terms of PERF or HWP ratios
 204 * @acpi_perf_data:     Stores ACPI perf information read from _PSS
 205 * @valid_pss_table:    Set to true for valid ACPI _PSS entries found
 206 * @epp_powersave:      Last saved HWP energy performance preference
 207 *                      (EPP) or energy performance bias (EPB),
  208 *                      when the policy is switched to performance
 209 * @epp_policy:         Last saved policy used to set EPP/EPB
 210 * @epp_default:        Power on default HWP energy performance
 211 *                      preference/bias
  212 * @epp_cached:         Cached HWP energy-performance preference value
 213 * @hwp_req_cached:     Cached value of the last HWP Request MSR
 214 * @hwp_cap_cached:     Cached value of the last HWP Capabilities MSR
 215 * @last_io_update:     Last time when IO wake flag was set
 216 * @sched_flags:        Store scheduler flags for possible cross CPU update
 217 * @hwp_boost_min:      Last HWP boosted min performance
 218 * @suspended:          Whether or not the driver has been suspended.
 219 *
 220 * This structure stores per CPU instance data for all CPUs.
 221 */
 222struct cpudata {
 223        int cpu;
 224
 225        unsigned int policy;
 226        struct update_util_data update_util;
 227        bool   update_util_set;
 228
 229        struct pstate_data pstate;
 230        struct vid_data vid;
 231
 232        u64     last_update;
 233        u64     last_sample_time;
 234        u64     aperf_mperf_shift;
 235        u64     prev_aperf;
 236        u64     prev_mperf;
 237        u64     prev_tsc;
 238        u64     prev_cummulative_iowait;
 239        struct sample sample;
 240        int32_t min_perf_ratio;
 241        int32_t max_perf_ratio;
 242#ifdef CONFIG_ACPI
 243        struct acpi_processor_performance acpi_perf_data;
 244        bool valid_pss_table;
 245#endif
 246        unsigned int iowait_boost;
 247        s16 epp_powersave;
 248        s16 epp_policy;
 249        s16 epp_default;
 250        s16 epp_cached;
 251        u64 hwp_req_cached;
 252        u64 hwp_cap_cached;
 253        u64 last_io_update;
 254        unsigned int sched_flags;
 255        u32 hwp_boost_min;
 256        bool suspended;
 257};
 258
 259static struct cpudata **all_cpu_data;
 260
 261/**
 262 * struct pstate_funcs - Per CPU model specific callbacks
 263 * @get_max:            Callback to get maximum non turbo effective P state
 264 * @get_max_physical:   Callback to get maximum non turbo physical P state
 265 * @get_min:            Callback to get minimum P state
 266 * @get_turbo:          Callback to get turbo P state
 267 * @get_scaling:        Callback to get frequency scaling factor
 268 * @get_aperf_mperf_shift: Callback to get the APERF vs MPERF frequency difference
 269 * @get_val:            Callback to convert P state to actual MSR write value
 270 * @get_vid:            Callback to get VID data for Atom platforms
 271 *
  272 * Core and Atom CPU models have different ways to get P state limits. This
  273 * structure is used to store those callbacks.
 274 */
 275struct pstate_funcs {
 276        int (*get_max)(void);
 277        int (*get_max_physical)(void);
 278        int (*get_min)(void);
 279        int (*get_turbo)(void);
 280        int (*get_scaling)(void);
 281        int (*get_aperf_mperf_shift)(void);
 282        u64 (*get_val)(struct cpudata*, int pstate);
 283        void (*get_vid)(struct cpudata *);
 284};
 285
 286static struct pstate_funcs pstate_funcs __read_mostly;
 287
 288static int hwp_active __read_mostly;
 289static int hwp_mode_bdw __read_mostly;
 290static bool per_cpu_limits __read_mostly;
 291static bool hwp_boost __read_mostly;
 292
 293static struct cpufreq_driver *intel_pstate_driver __read_mostly;
 294
 295#ifdef CONFIG_ACPI
 296static bool acpi_ppc;
 297#endif
 298
 299static struct global_params global;
 300
 301static DEFINE_MUTEX(intel_pstate_driver_lock);
 302static DEFINE_MUTEX(intel_pstate_limits_lock);
 303
 304#ifdef CONFIG_ACPI
 305
 306static bool intel_pstate_acpi_pm_profile_server(void)
 307{
 308        if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
 309            acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
 310                return true;
 311
 312        return false;
 313}
 314
 315static bool intel_pstate_get_ppc_enable_status(void)
 316{
 317        if (intel_pstate_acpi_pm_profile_server())
 318                return true;
 319
 320        return acpi_ppc;
 321}
 322
 323#ifdef CONFIG_ACPI_CPPC_LIB
 324
 325/* The work item is needed to avoid CPU hotplug locking issues */
 326static void intel_pstste_sched_itmt_work_fn(struct work_struct *work)
 327{
 328        sched_set_itmt_support();
 329}
 330
 331static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn);
 332
 333static void intel_pstate_set_itmt_prio(int cpu)
 334{
 335        struct cppc_perf_caps cppc_perf;
 336        static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
 337        int ret;
 338
 339        ret = cppc_get_perf_caps(cpu, &cppc_perf);
 340        if (ret)
 341                return;
 342
 343        /*
 344         * The priorities can be set regardless of whether or not
 345         * sched_set_itmt_support(true) has been called and it is valid to
 346         * update them at any time after it has been called.
 347         */
 348        sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
 349
 350        if (max_highest_perf <= min_highest_perf) {
 351                if (cppc_perf.highest_perf > max_highest_perf)
 352                        max_highest_perf = cppc_perf.highest_perf;
 353
 354                if (cppc_perf.highest_perf < min_highest_perf)
 355                        min_highest_perf = cppc_perf.highest_perf;
 356
 357                if (max_highest_perf > min_highest_perf) {
 358                        /*
 359                         * This code can be run during CPU online under the
 360                         * CPU hotplug locks, so sched_set_itmt_support()
 361                         * cannot be called from here.  Queue up a work item
 362                         * to invoke it.
 363                         */
 364                        schedule_work(&sched_itmt_work);
 365                }
 366        }
 367}
 368
 369static int intel_pstate_get_cppc_guranteed(int cpu)
 370{
 371        struct cppc_perf_caps cppc_perf;
 372        int ret;
 373
 374        ret = cppc_get_perf_caps(cpu, &cppc_perf);
 375        if (ret)
 376                return ret;
 377
 378        if (cppc_perf.guaranteed_perf)
 379                return cppc_perf.guaranteed_perf;
 380
 381        return cppc_perf.nominal_perf;
 382}
 383
 384#else /* CONFIG_ACPI_CPPC_LIB */
 385static void intel_pstate_set_itmt_prio(int cpu)
 386{
 387}
 388#endif /* CONFIG_ACPI_CPPC_LIB */
 389
 390static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 391{
 392        struct cpudata *cpu;
 393        int ret;
 394        int i;
 395
 396        if (hwp_active) {
 397                intel_pstate_set_itmt_prio(policy->cpu);
 398                return;
 399        }
 400
 401        if (!intel_pstate_get_ppc_enable_status())
 402                return;
 403
 404        cpu = all_cpu_data[policy->cpu];
 405
 406        ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
 407                                                  policy->cpu);
 408        if (ret)
 409                return;
 410
 411        /*
 412         * Check if the control value in _PSS is for PERF_CTL MSR, which should
 413         * guarantee that the states returned by it map to the states in our
 414         * list directly.
 415         */
 416        if (cpu->acpi_perf_data.control_register.space_id !=
 417                                                ACPI_ADR_SPACE_FIXED_HARDWARE)
 418                goto err;
 419
  420        /*
  421         * If there is only one entry in _PSS, simply ignore _PSS and continue
  422         * as usual without taking _PSS into account.
  423         */
 424        if (cpu->acpi_perf_data.state_count < 2)
 425                goto err;
 426
 427        pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
 428        for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
 429                pr_debug("     %cP%d: %u MHz, %u mW, 0x%x\n",
 430                         (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
 431                         (u32) cpu->acpi_perf_data.states[i].core_frequency,
 432                         (u32) cpu->acpi_perf_data.states[i].power,
 433                         (u32) cpu->acpi_perf_data.states[i].control);
 434        }
 435
  436        /*
  437         * The _PSS table doesn't contain the whole turbo frequency range.
  438         * It just contains +1 MHz above the max non-turbo frequency,
  439         * with a control value corresponding to the max turbo ratio. But
  440         * when cpufreq set_policy is called, it will be called with this
  441         * max frequency, which will cause reduced performance, as this
  442         * driver uses the real max turbo frequency as the max frequency.
  443         * So correct this frequency in the _PSS table to the real max
  444         * turbo frequency based on the turbo state.  Also convert it to
  445         * MHz, as _PSS frequencies are in MHz.
  446         */
 447        if (!global.turbo_disabled)
 448                cpu->acpi_perf_data.states[0].core_frequency =
 449                                        policy->cpuinfo.max_freq / 1000;
 450        cpu->valid_pss_table = true;
 451        pr_debug("_PPC limits will be enforced\n");
 452
 453        return;
 454
 455 err:
 456        cpu->valid_pss_table = false;
 457        acpi_processor_unregister_performance(policy->cpu);
 458}
 459
 460static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 461{
 462        struct cpudata *cpu;
 463
 464        cpu = all_cpu_data[policy->cpu];
 465        if (!cpu->valid_pss_table)
 466                return;
 467
 468        acpi_processor_unregister_performance(policy->cpu);
 469}
 470#else /* CONFIG_ACPI */
 471static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 472{
 473}
 474
 475static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 476{
 477}
 478
 479static inline bool intel_pstate_acpi_pm_profile_server(void)
 480{
 481        return false;
 482}
 483#endif /* CONFIG_ACPI */
 484
 485#ifndef CONFIG_ACPI_CPPC_LIB
 486static int intel_pstate_get_cppc_guranteed(int cpu)
 487{
 488        return -ENOTSUPP;
 489}
 490#endif /* CONFIG_ACPI_CPPC_LIB */
 491
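     /*
      * Turbo is treated as unavailable when either the turbo-disable bit is
      * set in MSR_IA32_MISC_ENABLE or the maximum non-turbo P-state equals
      * the turbo P-state.
      */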
 492static inline void update_turbo_state(void)
 493{
 494        u64 misc_en;
 495        struct cpudata *cpu;
 496
 497        cpu = all_cpu_data[0];
 498        rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
 499        global.turbo_disabled =
 500                (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
 501                 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
 502}
 503
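     /*
      * Lowest value accepted for min_perf_pct: the minimum P-state expressed
      * as a percentage of the turbo P-state.
      */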
 504static int min_perf_pct_min(void)
 505{
 506        struct cpudata *cpu = all_cpu_data[0];
 507        int turbo_pstate = cpu->pstate.turbo_pstate;
 508
 509        return turbo_pstate ?
 510                (cpu->pstate.min_pstate * 100 / turbo_pstate) : 0;
 511}
 512
 513static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
 514{
 515        u64 epb;
 516        int ret;
 517
 518        if (!boot_cpu_has(X86_FEATURE_EPB))
 519                return -ENXIO;
 520
 521        ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
 522        if (ret)
 523                return (s16)ret;
 524
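             /* The EPB hint is in bits 3:0 of MSR_IA32_ENERGY_PERF_BIAS. */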
 525        return (s16)(epb & 0x0f);
 526}
 527
 528static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
 529{
 530        s16 epp;
 531
 532        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
  533                /*
  534                 * When hwp_req_data is 0, it means that the caller didn't read
  535                 * MSR_HWP_REQUEST, so read it here to get the EPP.
  536                 */
 537                if (!hwp_req_data) {
 538                        epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST,
 539                                            &hwp_req_data);
 540                        if (epp)
 541                                return epp;
 542                }
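                     /* The EPP value is in bits 31:24 of MSR_HWP_REQUEST. */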
 543                epp = (hwp_req_data >> 24) & 0xff;
 544        } else {
 545                /* When there is no EPP present, HWP uses EPB settings */
 546                epp = intel_pstate_get_epb(cpu_data);
 547        }
 548
 549        return epp;
 550}
 551
 552static int intel_pstate_set_epb(int cpu, s16 pref)
 553{
 554        u64 epb;
 555        int ret;
 556
 557        if (!boot_cpu_has(X86_FEATURE_EPB))
 558                return -ENXIO;
 559
 560        ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
 561        if (ret)
 562                return ret;
 563
 564        epb = (epb & ~0x0f) | pref;
 565        wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
 566
 567        return 0;
 568}
 569
 570/*
 571 * EPP/EPB display strings corresponding to EPP index in the
 572 * energy_perf_strings[]
 573 *      index           String
 574 *-------------------------------------
 575 *      0               default
 576 *      1               performance
 577 *      2               balance_performance
 578 *      3               balance_power
 579 *      4               power
 580 */
 581static const char * const energy_perf_strings[] = {
 582        "default",
 583        "performance",
 584        "balance_performance",
 585        "balance_power",
 586        "power",
 587        NULL
 588};
 589static const unsigned int epp_values[] = {
 590        HWP_EPP_PERFORMANCE,
 591        HWP_EPP_BALANCE_PERFORMANCE,
 592        HWP_EPP_BALANCE_POWERSAVE,
 593        HWP_EPP_POWERSAVE
 594};
 595
 596static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp)
 597{
 598        s16 epp;
 599        int index = -EINVAL;
 600
 601        *raw_epp = 0;
 602        epp = intel_pstate_get_epp(cpu_data, 0);
 603        if (epp < 0)
 604                return epp;
 605
 606        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 607                if (epp == HWP_EPP_PERFORMANCE)
 608                        return 1;
 609                if (epp == HWP_EPP_BALANCE_PERFORMANCE)
 610                        return 2;
 611                if (epp == HWP_EPP_BALANCE_POWERSAVE)
 612                        return 3;
 613                if (epp == HWP_EPP_POWERSAVE)
 614                        return 4;
 615                *raw_epp = epp;
 616                return 0;
 617        } else if (boot_cpu_has(X86_FEATURE_EPB)) {
 618                /*
 619                 * Range:
 620                 *      0x00-0x03       :       Performance
 621                 *      0x04-0x07       :       Balance performance
 622                 *      0x08-0x0B       :       Balance power
 623                 *      0x0C-0x0F       :       Power
  624                 * The EPB is a 4-bit value, but our ranges restrict the
  625                 * values which can be set. Effectively, only the top two
  626                 * bits are used here.
 627                 */
 628                index = (epp >> 2) + 1;
 629        }
 630
 631        return index;
 632}
 633
 634static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp)
 635{
 636        int ret;
 637
 638        /*
 639         * Use the cached HWP Request MSR value, because in the active mode the
 640         * register itself may be updated by intel_pstate_hwp_boost_up() or
 641         * intel_pstate_hwp_boost_down() at any time.
 642         */
 643        u64 value = READ_ONCE(cpu->hwp_req_cached);
 644
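             /* Replace the EPP field (bits 31:24) of the cached request. */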
 645        value &= ~GENMASK_ULL(31, 24);
 646        value |= (u64)epp << 24;
 647        /*
 648         * The only other updater of hwp_req_cached in the active mode,
 649         * intel_pstate_hwp_set(), is called under the same lock as this
 650         * function, so it cannot run in parallel with the update below.
 651         */
 652        WRITE_ONCE(cpu->hwp_req_cached, value);
 653        ret = wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
 654        if (!ret)
 655                cpu->epp_cached = epp;
 656
 657        return ret;
 658}
 659
 660static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
 661                                              int pref_index, bool use_raw,
 662                                              u32 raw_epp)
 663{
 664        int epp = -EINVAL;
 665        int ret;
 666
 667        if (!pref_index)
 668                epp = cpu_data->epp_default;
 669
 670        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 671                if (use_raw)
 672                        epp = raw_epp;
 673                else if (epp == -EINVAL)
 674                        epp = epp_values[pref_index - 1];
 675
 676                /*
 677                 * To avoid confusion, refuse to set EPP to any values different
 678                 * from 0 (performance) if the current policy is "performance",
 679                 * because those values would be overridden.
 680                 */
 681                if (epp > 0 && cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
 682                        return -EBUSY;
 683
 684                ret = intel_pstate_set_epp(cpu_data, epp);
 685        } else {
 686                if (epp == -EINVAL)
 687                        epp = (pref_index - 1) << 2;
 688                ret = intel_pstate_set_epb(cpu_data->cpu, epp);
 689        }
 690
 691        return ret;
 692}
 693
 694static ssize_t show_energy_performance_available_preferences(
 695                                struct cpufreq_policy *policy, char *buf)
 696{
 697        int i = 0;
 698        int ret = 0;
 699
 700        while (energy_perf_strings[i] != NULL)
 701                ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
 702
 703        ret += sprintf(&buf[ret], "\n");
 704
 705        return ret;
 706}
 707
 708cpufreq_freq_attr_ro(energy_performance_available_preferences);
 709
 710static struct cpufreq_driver intel_pstate;
 711
 712static ssize_t store_energy_performance_preference(
 713                struct cpufreq_policy *policy, const char *buf, size_t count)
 714{
 715        struct cpudata *cpu = all_cpu_data[policy->cpu];
 716        char str_preference[21];
 717        bool raw = false;
 718        ssize_t ret;
 719        u32 epp = 0;
 720
 721        ret = sscanf(buf, "%20s", str_preference);
 722        if (ret != 1)
 723                return -EINVAL;
 724
 725        ret = match_string(energy_perf_strings, -1, str_preference);
 726        if (ret < 0) {
 727                if (!boot_cpu_has(X86_FEATURE_HWP_EPP))
 728                        return ret;
 729
 730                ret = kstrtouint(buf, 10, &epp);
 731                if (ret)
 732                        return ret;
 733
 734                if (epp > 255)
 735                        return -EINVAL;
 736
 737                raw = true;
 738        }
 739
 740        /*
 741         * This function runs with the policy R/W semaphore held, which
 742         * guarantees that the driver pointer will not change while it is
 743         * running.
 744         */
 745        if (!intel_pstate_driver)
 746                return -EAGAIN;
 747
 748        mutex_lock(&intel_pstate_limits_lock);
 749
 750        if (intel_pstate_driver == &intel_pstate) {
 751                ret = intel_pstate_set_energy_pref_index(cpu, ret, raw, epp);
 752        } else {
 753                /*
 754                 * In the passive mode the governor needs to be stopped on the
 755                 * target CPU before the EPP update and restarted after it,
 756                 * which is super-heavy-weight, so make sure it is worth doing
 757                 * upfront.
 758                 */
 759                if (!raw)
 760                        epp = ret ? epp_values[ret - 1] : cpu->epp_default;
 761
 762                if (cpu->epp_cached != epp) {
 763                        int err;
 764
 765                        cpufreq_stop_governor(policy);
 766                        ret = intel_pstate_set_epp(cpu, epp);
 767                        err = cpufreq_start_governor(policy);
 768                        if (!ret)
 769                                ret = err;
 770                }
 771        }
 772
 773        mutex_unlock(&intel_pstate_limits_lock);
 774
 775        return ret ?: count;
 776}
 777
 778static ssize_t show_energy_performance_preference(
 779                                struct cpufreq_policy *policy, char *buf)
 780{
 781        struct cpudata *cpu_data = all_cpu_data[policy->cpu];
 782        int preference, raw_epp;
 783
 784        preference = intel_pstate_get_energy_pref_index(cpu_data, &raw_epp);
 785        if (preference < 0)
 786                return preference;
 787
 788        if (raw_epp)
 789                return  sprintf(buf, "%d\n", raw_epp);
 790        else
 791                return  sprintf(buf, "%s\n", energy_perf_strings[preference]);
 792}
 793
 794cpufreq_freq_attr_rw(energy_performance_preference);
 795
 796static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf)
 797{
 798        struct cpudata *cpu;
 799        u64 cap;
 800        int ratio;
 801
 802        ratio = intel_pstate_get_cppc_guranteed(policy->cpu);
 803        if (ratio <= 0) {
 804                rdmsrl_on_cpu(policy->cpu, MSR_HWP_CAPABILITIES, &cap);
 805                ratio = HWP_GUARANTEED_PERF(cap);
 806        }
 807
 808        cpu = all_cpu_data[policy->cpu];
 809
 810        return sprintf(buf, "%d\n", ratio * cpu->pstate.scaling);
 811}
 812
 813cpufreq_freq_attr_ro(base_frequency);
 814
 815static struct freq_attr *hwp_cpufreq_attrs[] = {
 816        &energy_performance_preference,
 817        &energy_performance_available_preferences,
 818        &base_frequency,
 819        NULL,
 820};
 821
 822static void __intel_pstate_get_hwp_cap(struct cpudata *cpu)
 823{
 824        u64 cap;
 825
 826        rdmsrl_on_cpu(cpu->cpu, MSR_HWP_CAPABILITIES, &cap);
 827        WRITE_ONCE(cpu->hwp_cap_cached, cap);
 828        cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(cap);
 829        cpu->pstate.turbo_pstate = HWP_HIGHEST_PERF(cap);
 830}
 831
 832static void intel_pstate_get_hwp_cap(struct cpudata *cpu)
 833{
 834        __intel_pstate_get_hwp_cap(cpu);
 835        cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
 836        cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
 837}
 838
 839static void intel_pstate_hwp_set(unsigned int cpu)
 840{
 841        struct cpudata *cpu_data = all_cpu_data[cpu];
 842        int max, min;
 843        u64 value;
 844        s16 epp;
 845
 846        max = cpu_data->max_perf_ratio;
 847        min = cpu_data->min_perf_ratio;
 848
 849        if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
 850                min = max;
 851
 852        rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
 853
 854        value &= ~HWP_MIN_PERF(~0L);
 855        value |= HWP_MIN_PERF(min);
 856
 857        value &= ~HWP_MAX_PERF(~0L);
 858        value |= HWP_MAX_PERF(max);
 859
 860        if (cpu_data->epp_policy == cpu_data->policy)
 861                goto skip_epp;
 862
 863        cpu_data->epp_policy = cpu_data->policy;
 864
 865        if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
 866                epp = intel_pstate_get_epp(cpu_data, value);
 867                cpu_data->epp_powersave = epp;
  868                /* If the EPP read failed, don't try to write it */
 869                if (epp < 0)
 870                        goto skip_epp;
 871
 872                epp = 0;
 873        } else {
  874                /* Skip setting EPP when the saved value is invalid */
 875                if (cpu_data->epp_powersave < 0)
 876                        goto skip_epp;
 877
  878                /*
  879                 * There is no need to restore EPP when it is not zero.
  880                 * That means one of the following:
  881                 *  - the policy is not changed
  882                 *  - the user has changed it manually
  883                 *  - there was an error reading the EPB
  884                 */
 885                epp = intel_pstate_get_epp(cpu_data, value);
 886                if (epp)
 887                        goto skip_epp;
 888
 889                epp = cpu_data->epp_powersave;
 890        }
 891        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 892                value &= ~GENMASK_ULL(31, 24);
 893                value |= (u64)epp << 24;
 894        } else {
 895                intel_pstate_set_epb(cpu, epp);
 896        }
 897skip_epp:
 898        WRITE_ONCE(cpu_data->hwp_req_cached, value);
 899        wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
 900}
 901
 902static void intel_pstate_hwp_offline(struct cpudata *cpu)
 903{
 904        u64 value = READ_ONCE(cpu->hwp_req_cached);
 905        int min_perf;
 906
 907        if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 908                /*
 909                 * In case the EPP has been set to "performance" by the
 910                 * active mode "performance" scaling algorithm, replace that
 911                 * temporary value with the cached EPP one.
 912                 */
 913                value &= ~GENMASK_ULL(31, 24);
 914                value |= HWP_ENERGY_PERF_PREFERENCE(cpu->epp_cached);
 915                WRITE_ONCE(cpu->hwp_req_cached, value);
 916        }
 917
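             /* Clear the min, max, desired and EPP fields (bits 31:0). */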
 918        value &= ~GENMASK_ULL(31, 0);
 919        min_perf = HWP_LOWEST_PERF(READ_ONCE(cpu->hwp_cap_cached));
 920
 921        /* Set hwp_max = hwp_min */
 922        value |= HWP_MAX_PERF(min_perf);
 923        value |= HWP_MIN_PERF(min_perf);
 924
 925        /* Set EPP to min */
 926        if (boot_cpu_has(X86_FEATURE_HWP_EPP))
 927                value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE);
 928
 929        wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
 930}
 931
 932#define POWER_CTL_EE_ENABLE     1
 933#define POWER_CTL_EE_DISABLE    2
 934
 935static int power_ctl_ee_state;
 936
 937static void set_power_ctl_ee_state(bool input)
 938{
 939        u64 power_ctl;
 940
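             /*
              * The EE bit in MSR_IA32_POWER_CTL is a disable bit: clear it to
              * enable the energy-efficiency optimization.
              */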
 941        mutex_lock(&intel_pstate_driver_lock);
 942        rdmsrl(MSR_IA32_POWER_CTL, power_ctl);
 943        if (input) {
 944                power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE);
 945                power_ctl_ee_state = POWER_CTL_EE_ENABLE;
 946        } else {
 947                power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE);
 948                power_ctl_ee_state = POWER_CTL_EE_DISABLE;
 949        }
 950        wrmsrl(MSR_IA32_POWER_CTL, power_ctl);
 951        mutex_unlock(&intel_pstate_driver_lock);
 952}
 953
 954static void intel_pstate_hwp_enable(struct cpudata *cpudata);
 955
 956static void intel_pstate_hwp_reenable(struct cpudata *cpu)
 957{
 958        intel_pstate_hwp_enable(cpu);
 959        wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, READ_ONCE(cpu->hwp_req_cached));
 960}
 961
 962static int intel_pstate_suspend(struct cpufreq_policy *policy)
 963{
 964        struct cpudata *cpu = all_cpu_data[policy->cpu];
 965
 966        pr_debug("CPU %d suspending\n", cpu->cpu);
 967
 968        cpu->suspended = true;
 969
 970        return 0;
 971}
 972
 973static int intel_pstate_resume(struct cpufreq_policy *policy)
 974{
 975        struct cpudata *cpu = all_cpu_data[policy->cpu];
 976
 977        pr_debug("CPU %d resuming\n", cpu->cpu);
 978
  979        /* Only restore if the system default has been changed */
 980        if (power_ctl_ee_state == POWER_CTL_EE_ENABLE)
 981                set_power_ctl_ee_state(true);
 982        else if (power_ctl_ee_state == POWER_CTL_EE_DISABLE)
 983                set_power_ctl_ee_state(false);
 984
 985        if (cpu->suspended && hwp_active) {
 986                mutex_lock(&intel_pstate_limits_lock);
 987
 988                /* Re-enable HWP, because "online" has not done that. */
 989                intel_pstate_hwp_reenable(cpu);
 990
 991                mutex_unlock(&intel_pstate_limits_lock);
 992        }
 993
 994        cpu->suspended = false;
 995
 996        return 0;
 997}
 998
 999static void intel_pstate_update_policies(void)
1000{
1001        int cpu;
1002
1003        for_each_possible_cpu(cpu)
1004                cpufreq_update_policy(cpu);
1005}
1006
1007static void intel_pstate_update_max_freq(unsigned int cpu)
1008{
1009        struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
1010        struct cpudata *cpudata;
1011
1012        if (!policy)
1013                return;
1014
1015        cpudata = all_cpu_data[cpu];
1016        policy->cpuinfo.max_freq = global.turbo_disabled_mf ?
1017                        cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
1018
1019        refresh_frequency_limits(policy);
1020
1021        cpufreq_cpu_release(policy);
1022}
1023
1024static void intel_pstate_update_limits(unsigned int cpu)
1025{
1026        mutex_lock(&intel_pstate_driver_lock);
1027
1028        update_turbo_state();
1029        /*
1030         * If turbo has been turned on or off globally, policy limits for
1031         * all CPUs need to be updated to reflect that.
1032         */
1033        if (global.turbo_disabled_mf != global.turbo_disabled) {
1034                global.turbo_disabled_mf = global.turbo_disabled;
1035                arch_set_max_freq_ratio(global.turbo_disabled);
1036                for_each_possible_cpu(cpu)
1037                        intel_pstate_update_max_freq(cpu);
1038        } else {
1039                cpufreq_update_policy(cpu);
1040        }
1041
1042        mutex_unlock(&intel_pstate_driver_lock);
1043}
1044
1045/************************** sysfs begin ************************/
1046#define show_one(file_name, object)                                     \
1047        static ssize_t show_##file_name                                 \
1048        (struct kobject *kobj, struct kobj_attribute *attr, char *buf)  \
1049        {                                                               \
1050                return sprintf(buf, "%u\n", global.object);             \
1051        }
1052
1053static ssize_t intel_pstate_show_status(char *buf);
1054static int intel_pstate_update_status(const char *buf, size_t size);
1055
1056static ssize_t show_status(struct kobject *kobj,
1057                           struct kobj_attribute *attr, char *buf)
1058{
1059        ssize_t ret;
1060
1061        mutex_lock(&intel_pstate_driver_lock);
1062        ret = intel_pstate_show_status(buf);
1063        mutex_unlock(&intel_pstate_driver_lock);
1064
1065        return ret;
1066}
1067
1068static ssize_t store_status(struct kobject *a, struct kobj_attribute *b,
1069                            const char *buf, size_t count)
1070{
1071        char *p = memchr(buf, '\n', count);
1072        int ret;
1073
1074        mutex_lock(&intel_pstate_driver_lock);
1075        ret = intel_pstate_update_status(buf, p ? p - buf : count);
1076        mutex_unlock(&intel_pstate_driver_lock);
1077
1078        return ret < 0 ? ret : count;
1079}
1080
1081static ssize_t show_turbo_pct(struct kobject *kobj,
1082                                struct kobj_attribute *attr, char *buf)
1083{
1084        struct cpudata *cpu;
1085        int total, no_turbo, turbo_pct;
1086        uint32_t turbo_fp;
1087
1088        mutex_lock(&intel_pstate_driver_lock);
1089
1090        if (!intel_pstate_driver) {
1091                mutex_unlock(&intel_pstate_driver_lock);
1092                return -EAGAIN;
1093        }
1094
1095        cpu = all_cpu_data[0];
1096
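             /* Percentage of the P-state range that is turbo-only. */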
1097        total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
1098        no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
1099        turbo_fp = div_fp(no_turbo, total);
1100        turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
1101
1102        mutex_unlock(&intel_pstate_driver_lock);
1103
1104        return sprintf(buf, "%u\n", turbo_pct);
1105}
1106
1107static ssize_t show_num_pstates(struct kobject *kobj,
1108                                struct kobj_attribute *attr, char *buf)
1109{
1110        struct cpudata *cpu;
1111        int total;
1112
1113        mutex_lock(&intel_pstate_driver_lock);
1114
1115        if (!intel_pstate_driver) {
1116                mutex_unlock(&intel_pstate_driver_lock);
1117                return -EAGAIN;
1118        }
1119
1120        cpu = all_cpu_data[0];
1121        total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
1122
1123        mutex_unlock(&intel_pstate_driver_lock);
1124
1125        return sprintf(buf, "%u\n", total);
1126}
1127
1128static ssize_t show_no_turbo(struct kobject *kobj,
1129                             struct kobj_attribute *attr, char *buf)
1130{
1131        ssize_t ret;
1132
1133        mutex_lock(&intel_pstate_driver_lock);
1134
1135        if (!intel_pstate_driver) {
1136                mutex_unlock(&intel_pstate_driver_lock);
1137                return -EAGAIN;
1138        }
1139
1140        update_turbo_state();
1141        if (global.turbo_disabled)
1142                ret = sprintf(buf, "%u\n", global.turbo_disabled);
1143        else
1144                ret = sprintf(buf, "%u\n", global.no_turbo);
1145
1146        mutex_unlock(&intel_pstate_driver_lock);
1147
1148        return ret;
1149}
1150
1151static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b,
1152                              const char *buf, size_t count)
1153{
1154        unsigned int input;
1155        int ret;
1156
1157        ret = sscanf(buf, "%u", &input);
1158        if (ret != 1)
1159                return -EINVAL;
1160
1161        mutex_lock(&intel_pstate_driver_lock);
1162
1163        if (!intel_pstate_driver) {
1164                mutex_unlock(&intel_pstate_driver_lock);
1165                return -EAGAIN;
1166        }
1167
1168        mutex_lock(&intel_pstate_limits_lock);
1169
1170        update_turbo_state();
1171        if (global.turbo_disabled) {
1172                pr_notice_once("Turbo disabled by BIOS or unavailable on processor\n");
1173                mutex_unlock(&intel_pstate_limits_lock);
1174                mutex_unlock(&intel_pstate_driver_lock);
1175                return -EPERM;
1176        }
1177
1178        global.no_turbo = clamp_t(int, input, 0, 1);
1179
1180        if (global.no_turbo) {
1181                struct cpudata *cpu = all_cpu_data[0];
1182                int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate;
1183
1184                /* Squash the global minimum into the permitted range. */
1185                if (global.min_perf_pct > pct)
1186                        global.min_perf_pct = pct;
1187        }
1188
1189        mutex_unlock(&intel_pstate_limits_lock);
1190
1191        intel_pstate_update_policies();
1192
1193        mutex_unlock(&intel_pstate_driver_lock);
1194
1195        return count;
1196}
1197
1198static void update_qos_request(enum freq_qos_req_type type)
1199{
1200        struct freq_qos_request *req;
1201        struct cpufreq_policy *policy;
1202        int i;
1203
1204        for_each_possible_cpu(i) {
1205                struct cpudata *cpu = all_cpu_data[i];
1206                unsigned int freq, perf_pct;
1207
1208                policy = cpufreq_cpu_get(i);
1209                if (!policy)
1210                        continue;
1211
1212                req = policy->driver_data;
1213                cpufreq_cpu_put(policy);
1214
1215                if (!req)
1216                        continue;
1217
1218                if (hwp_active)
1219                        intel_pstate_get_hwp_cap(cpu);
1220
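                     /* driver_data points to a pair of requests: [0] = MIN, [1] = MAX. */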
1221                if (type == FREQ_QOS_MIN) {
1222                        perf_pct = global.min_perf_pct;
1223                } else {
1224                        req++;
1225                        perf_pct = global.max_perf_pct;
1226                }
1227
1228                freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * perf_pct, 100);
1229
1230                if (freq_qos_update_request(req, freq) < 0)
1231                        pr_warn("Failed to update freq constraint: CPU%d\n", i);
1232        }
1233}
1234
1235static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b,
1236                                  const char *buf, size_t count)
1237{
1238        unsigned int input;
1239        int ret;
1240
1241        ret = sscanf(buf, "%u", &input);
1242        if (ret != 1)
1243                return -EINVAL;
1244
1245        mutex_lock(&intel_pstate_driver_lock);
1246
1247        if (!intel_pstate_driver) {
1248                mutex_unlock(&intel_pstate_driver_lock);
1249                return -EAGAIN;
1250        }
1251
1252        mutex_lock(&intel_pstate_limits_lock);
1253
1254        global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100);
1255
1256        mutex_unlock(&intel_pstate_limits_lock);
1257
1258        if (intel_pstate_driver == &intel_pstate)
1259                intel_pstate_update_policies();
1260        else
1261                update_qos_request(FREQ_QOS_MAX);
1262
1263        mutex_unlock(&intel_pstate_driver_lock);
1264
1265        return count;
1266}
1267
1268static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b,
1269                                  const char *buf, size_t count)
1270{
1271        unsigned int input;
1272        int ret;
1273
1274        ret = sscanf(buf, "%u", &input);
1275        if (ret != 1)
1276                return -EINVAL;
1277
1278        mutex_lock(&intel_pstate_driver_lock);
1279
1280        if (!intel_pstate_driver) {
1281                mutex_unlock(&intel_pstate_driver_lock);
1282                return -EAGAIN;
1283        }
1284
1285        mutex_lock(&intel_pstate_limits_lock);
1286
1287        global.min_perf_pct = clamp_t(int, input,
1288                                      min_perf_pct_min(), global.max_perf_pct);
1289
1290        mutex_unlock(&intel_pstate_limits_lock);
1291
1292        if (intel_pstate_driver == &intel_pstate)
1293                intel_pstate_update_policies();
1294        else
1295                update_qos_request(FREQ_QOS_MIN);
1296
1297        mutex_unlock(&intel_pstate_driver_lock);
1298
1299        return count;
1300}
1301
1302static ssize_t show_hwp_dynamic_boost(struct kobject *kobj,
1303                                struct kobj_attribute *attr, char *buf)
1304{
1305        return sprintf(buf, "%u\n", hwp_boost);
1306}
1307
1308static ssize_t store_hwp_dynamic_boost(struct kobject *a,
1309                                       struct kobj_attribute *b,
1310                                       const char *buf, size_t count)
1311{
1312        unsigned int input;
1313        int ret;
1314
1315        ret = kstrtouint(buf, 10, &input);
1316        if (ret)
1317                return ret;
1318
1319        mutex_lock(&intel_pstate_driver_lock);
1320        hwp_boost = !!input;
1321        intel_pstate_update_policies();
1322        mutex_unlock(&intel_pstate_driver_lock);
1323
1324        return count;
1325}
1326
1327static ssize_t show_energy_efficiency(struct kobject *kobj, struct kobj_attribute *attr,
1328                                      char *buf)
1329{
1330        u64 power_ctl;
1331        int enable;
1332
1333        rdmsrl(MSR_IA32_POWER_CTL, power_ctl);
1334        enable = !!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE));
1335        return sprintf(buf, "%d\n", !enable);
1336}
1337
1338static ssize_t store_energy_efficiency(struct kobject *a, struct kobj_attribute *b,
1339                                       const char *buf, size_t count)
1340{
1341        bool input;
1342        int ret;
1343
1344        ret = kstrtobool(buf, &input);
1345        if (ret)
1346                return ret;
1347
1348        set_power_ctl_ee_state(input);
1349
1350        return count;
1351}
1352
1353show_one(max_perf_pct, max_perf_pct);
1354show_one(min_perf_pct, min_perf_pct);
1355
1356define_one_global_rw(status);
1357define_one_global_rw(no_turbo);
1358define_one_global_rw(max_perf_pct);
1359define_one_global_rw(min_perf_pct);
1360define_one_global_ro(turbo_pct);
1361define_one_global_ro(num_pstates);
1362define_one_global_rw(hwp_dynamic_boost);
1363define_one_global_rw(energy_efficiency);
1364
1365static struct attribute *intel_pstate_attributes[] = {
1366        &status.attr,
1367        &no_turbo.attr,
1368        &turbo_pct.attr,
1369        &num_pstates.attr,
1370        NULL
1371};
1372
1373static const struct attribute_group intel_pstate_attr_group = {
1374        .attrs = intel_pstate_attributes,
1375};
1376
1377static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[];
1378
1379static struct kobject *intel_pstate_kobject;
1380
1381static void __init intel_pstate_sysfs_expose_params(void)
1382{
1383        int rc;
1384
1385        intel_pstate_kobject = kobject_create_and_add("intel_pstate",
1386                                                &cpu_subsys.dev_root->kobj);
1387        if (WARN_ON(!intel_pstate_kobject))
1388                return;
1389
1390        rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
1391        if (WARN_ON(rc))
1392                return;
1393
 1394        /*
 1395         * If per-CPU limits are enforced, there are no global limits, so
 1396         * return without creating the max/min_perf_pct attributes.
 1397         */
1398        if (per_cpu_limits)
1399                return;
1400
1401        rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr);
1402        WARN_ON(rc);
1403
1404        rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr);
1405        WARN_ON(rc);
1406
1407        if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) {
1408                rc = sysfs_create_file(intel_pstate_kobject, &energy_efficiency.attr);
1409                WARN_ON(rc);
1410        }
1411}
1412
1413static void __init intel_pstate_sysfs_remove(void)
1414{
1415        if (!intel_pstate_kobject)
1416                return;
1417
1418        sysfs_remove_group(intel_pstate_kobject, &intel_pstate_attr_group);
1419
1420        if (!per_cpu_limits) {
1421                sysfs_remove_file(intel_pstate_kobject, &max_perf_pct.attr);
1422                sysfs_remove_file(intel_pstate_kobject, &min_perf_pct.attr);
1423
1424                if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids))
1425                        sysfs_remove_file(intel_pstate_kobject, &energy_efficiency.attr);
1426        }
1427
1428        kobject_put(intel_pstate_kobject);
1429}
1430
1431static void intel_pstate_sysfs_expose_hwp_dynamic_boost(void)
1432{
1433        int rc;
1434
1435        if (!hwp_active)
1436                return;
1437
1438        rc = sysfs_create_file(intel_pstate_kobject, &hwp_dynamic_boost.attr);
1439        WARN_ON_ONCE(rc);
1440}
1441
1442static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void)
1443{
1444        if (!hwp_active)
1445                return;
1446
1447        sysfs_remove_file(intel_pstate_kobject, &hwp_dynamic_boost.attr);
1448}
1449
1450/************************** sysfs end ************************/
1451
1452static void intel_pstate_hwp_enable(struct cpudata *cpudata)
1453{
 1454        /* First disable HWP notification interrupts, as we don't process them */
1455        if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
1456                wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
1457
1458        wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
1459        if (cpudata->epp_default == -EINVAL)
1460                cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
1461}
1462
1463static int atom_get_min_pstate(void)
1464{
1465        u64 value;
1466
1467        rdmsrl(MSR_ATOM_CORE_RATIOS, value);
1468        return (value >> 8) & 0x7F;
1469}
1470
1471static int atom_get_max_pstate(void)
1472{
1473        u64 value;
1474
1475        rdmsrl(MSR_ATOM_CORE_RATIOS, value);
1476        return (value >> 16) & 0x7F;
1477}
1478
1479static int atom_get_turbo_pstate(void)
1480{
1481        u64 value;
1482
1483        rdmsrl(MSR_ATOM_CORE_TURBO_RATIOS, value);
1484        return value & 0x7F;
1485}
1486
1487static u64 atom_get_val(struct cpudata *cpudata, int pstate)
1488{
1489        u64 val;
1490        int32_t vid_fp;
1491        u32 vid;
1492
1493        val = (u64)pstate << 8;
1494        if (global.no_turbo && !global.turbo_disabled)
1495                val |= (u64)1 << 32;
1496
1497        vid_fp = cpudata->vid.min + mul_fp(
1498                int_tofp(pstate - cpudata->pstate.min_pstate),
1499                cpudata->vid.ratio);
1500
1501        vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
1502        vid = ceiling_fp(vid_fp);
1503
1504        if (pstate > cpudata->pstate.max_pstate)
1505                vid = cpudata->vid.turbo;
1506
1507        return val | vid;
1508}
1509
1510static int silvermont_get_scaling(void)
1511{
1512        u64 value;
1513        int i;
1514        /* Defined in Table 35-6 from SDM (Sept 2015) */
1515        static int silvermont_freq_table[] = {
1516                83300, 100000, 133300, 116700, 80000};
1517
1518        rdmsrl(MSR_FSB_FREQ, value);
1519        i = value & 0x7;
1520        WARN_ON(i > 4);
1521
1522        return silvermont_freq_table[i];
1523}
1524
1525static int airmont_get_scaling(void)
1526{
1527        u64 value;
1528        int i;
1529        /* Defined in Table 35-10 from SDM (Sept 2015) */
1530        static int airmont_freq_table[] = {
1531                83300, 100000, 133300, 116700, 80000,
1532                93300, 90000, 88900, 87500};
1533
1534        rdmsrl(MSR_FSB_FREQ, value);
1535        i = value & 0xF;
1536        WARN_ON(i > 8);
1537
1538        return airmont_freq_table[i];
1539}
1540
1541static void atom_get_vid(struct cpudata *cpudata)
1542{
1543        u64 value;
1544
1545        rdmsrl(MSR_ATOM_CORE_VIDS, value);
1546        cpudata->vid.min = int_tofp((value >> 8) & 0x7f);
1547        cpudata->vid.max = int_tofp((value >> 16) & 0x7f);
1548        cpudata->vid.ratio = div_fp(
1549                cpudata->vid.max - cpudata->vid.min,
1550                int_tofp(cpudata->pstate.max_pstate -
1551                        cpudata->pstate.min_pstate));
1552
1553        rdmsrl(MSR_ATOM_CORE_TURBO_VIDS, value);
1554        cpudata->vid.turbo = value & 0x7f;
1555}
1556
1557static int core_get_min_pstate(void)
1558{
1559        u64 value;
1560
1561        rdmsrl(MSR_PLATFORM_INFO, value);
1562        return (value >> 40) & 0xFF;
1563}
1564
1565static int core_get_max_pstate_physical(void)
1566{
1567        u64 value;
1568
1569        rdmsrl(MSR_PLATFORM_INFO, value);
1570        return (value >> 8) & 0xFF;
1571}
1572
1573static int core_get_tdp_ratio(u64 plat_info)
1574{
 1575        /* Check how many TDP levels are present */
1576        if (plat_info & 0x600000000) {
1577                u64 tdp_ctrl;
1578                u64 tdp_ratio;
1579                int tdp_msr;
1580                int err;
1581
1582                /* Get the TDP level (0, 1, 2) to get ratios */
1583                err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl);
1584                if (err)
1585                        return err;
1586
 1587                /* The TDP MSRs are contiguous, starting at 0x648 */
1588                tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x03);
1589                err = rdmsrl_safe(tdp_msr, &tdp_ratio);
1590                if (err)
1591                        return err;
1592
1593                /* For level 1 and 2, bits[23:16] contain the ratio */
1594                if (tdp_ctrl & 0x03)
1595                        tdp_ratio >>= 16;
1596
1597                tdp_ratio &= 0xff; /* ratios are only 8 bits long */
1598                pr_debug("tdp_ratio %x\n", (int)tdp_ratio);
1599
1600                return (int)tdp_ratio;
1601        }
1602
1603        return -ENXIO;
1604}
1605
1606static int core_get_max_pstate(void)
1607{
1608        u64 tar;
1609        u64 plat_info;
1610        int max_pstate;
1611        int tdp_ratio;
1612        int err;
1613
1614        rdmsrl(MSR_PLATFORM_INFO, plat_info);
1615        max_pstate = (plat_info >> 8) & 0xFF;
1616
1617        tdp_ratio = core_get_tdp_ratio(plat_info);
1618        if (tdp_ratio <= 0)
1619                return max_pstate;
1620
1621        if (hwp_active) {
1622                /* Turbo activation ratio is not used on HWP platforms */
1623                return tdp_ratio;
1624        }
1625
1626        err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar);
1627        if (!err) {
1628                int tar_levels;
1629
1630                /* Do some sanity checking for safety */
1631                tar_levels = tar & 0xff;
1632                if (tdp_ratio - 1 == tar_levels) {
1633                        max_pstate = tar_levels;
1634                        pr_debug("max_pstate=TAC %x\n", max_pstate);
1635                }
1636        }
1637
1638        return max_pstate;
1639}
1640
1641static int core_get_turbo_pstate(void)
1642{
1643        u64 value;
1644        int nont, ret;
1645
1646        rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
1647        nont = core_get_max_pstate();
1648        ret = (value) & 255;
1649        if (ret <= nont)
1650                ret = nont;
1651        return ret;
1652}
1653
1654static inline int core_get_scaling(void)
1655{
1656        return 100000;
1657}
1658
1659static u64 core_get_val(struct cpudata *cpudata, int pstate)
1660{
1661        u64 val;
1662
1663        val = (u64)pstate << 8;
1664        if (global.no_turbo && !global.turbo_disabled)
1665                val |= (u64)1 << 32;
1666
1667        return val;
1668}
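/*
 * For illustration (hypothetical value): a requested P-state of 0x20 becomes
 * 0x2000 in the value built above, since the target ratio sits in bits 15:8
 * of MSR_IA32_PERF_CTL; when turbo is disabled through the driver but not by
 * the platform, the turbo-disengage bit (bit 32) is set on top of that.
 */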
1669
1670static int knl_get_aperf_mperf_shift(void)
1671{
1672        return 10;
1673}
1674
1675static int knl_get_turbo_pstate(void)
1676{
1677        u64 value;
1678        int nont, ret;
1679
1680        rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
1681        nont = core_get_max_pstate();
1682        ret = (((value) >> 8) & 0xFF);
1683        if (ret <= nont)
1684                ret = nont;
1685        return ret;
1686}
1687
1688static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
1689{
1690        trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
1691        cpu->pstate.current_pstate = pstate;
1692        /*
1693         * Generally, there is no guarantee that this code will always run on
1694         * the CPU being updated, so force the register update to run on the
1695         * right CPU.
1696         */
1697        wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
1698                      pstate_funcs.get_val(cpu, pstate));
1699}
1700
1701static void intel_pstate_set_min_pstate(struct cpudata *cpu)
1702{
1703        intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
1704}
1705
1706static void intel_pstate_max_within_limits(struct cpudata *cpu)
1707{
1708        int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio);
1709
1710        update_turbo_state();
1711        intel_pstate_set_pstate(cpu, pstate);
1712}
1713
1714static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
1715{
1716        cpu->pstate.min_pstate = pstate_funcs.get_min();
1717        cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical();
1718        cpu->pstate.scaling = pstate_funcs.get_scaling();
1719
1720        if (hwp_active && !hwp_mode_bdw) {
1721                __intel_pstate_get_hwp_cap(cpu);
1722        } else {
1723                cpu->pstate.max_pstate = pstate_funcs.get_max();
1724                cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
1725        }
1726
1727        cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
1728        cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
1729
1730        if (pstate_funcs.get_aperf_mperf_shift)
1731                cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift();
1732
1733        if (pstate_funcs.get_vid)
1734                pstate_funcs.get_vid(cpu);
1735
1736        intel_pstate_set_min_pstate(cpu);
1737}
1738
1739/*
1740 * A long hold time keeps the high perf limits in place for a long time,
1741 * which negatively impacts perf/watt for some workloads,
1742 * like SPECpower. 3 ms is based on experiments with some
1743 * workloads.
1744 */
1745static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC;
1746
1747static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu)
1748{
1749        u64 hwp_req = READ_ONCE(cpu->hwp_req_cached);
1750        u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
1751        u32 max_limit = (hwp_req & 0xff00) >> 8;
1752        u32 min_limit = (hwp_req & 0xff);
1753        u32 boost_level1;
1754
1755        /*
1756         * Cases to consider (User changes via sysfs or boot time):
1757         * If P0 (turbo max) = P1 (guaranteed max) = min:
1758         *      No boost, return.
1759         * If P0 (turbo max) > P1 (guaranteed max) = min:
1760         *     Should result in a one-level boost only, to P0.
1761         * If P0 (turbo max) = P1 (guaranteed max) > min:
1762         *     Should result in a two-level boost:
1763         *         (min + P1)/2 and P1.
1764         * If P0 (turbo max) > P1 (guaranteed max) > min:
1765         *     Should result in a three-level boost:
1766         *        (min + P1)/2, P1 and P0.
1767         */
1768
1769        /* If max and min are equal or already at max, nothing to boost */
1770        if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit)
1771                return;
1772
1773        if (!cpu->hwp_boost_min)
1774                cpu->hwp_boost_min = min_limit;
1775
1776        /* Level at the halfway mark between min and guaranteed */
1777        boost_level1 = (HWP_GUARANTEED_PERF(hwp_cap) + min_limit) >> 1;
1778
1779        if (cpu->hwp_boost_min < boost_level1)
1780                cpu->hwp_boost_min = boost_level1;
1781        else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(hwp_cap))
1782                cpu->hwp_boost_min = HWP_GUARANTEED_PERF(hwp_cap);
1783        else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(hwp_cap) &&
1784                 max_limit != HWP_GUARANTEED_PERF(hwp_cap))
1785                cpu->hwp_boost_min = max_limit;
1786        else
1787                return;
1788
1789        hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min;
1790        wrmsrl(MSR_HWP_REQUEST, hwp_req);
1791        cpu->last_update = cpu->sample.time;
1792}
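/*
 * A worked example with made-up limits: for min = 8, guaranteed (P1) = 20 and
 * max (P0) = 28, the first call above initializes hwp_boost_min to 8 and
 * immediately raises it to 14 ((8 + 20) / 2); the next calls raise it to 20
 * (P1) and then to 28 (the max limit), after which further calls return
 * without rewriting MSR_HWP_REQUEST.
 */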
1793
1794static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu)
1795{
1796        if (cpu->hwp_boost_min) {
1797                bool expired;
1798
1799                /* Check if we have been idle for the hold time before boosting down */
1800                expired = time_after64(cpu->sample.time, cpu->last_update +
1801                                       hwp_boost_hold_time_ns);
1802                if (expired) {
1803                        wrmsrl(MSR_HWP_REQUEST, cpu->hwp_req_cached);
1804                        cpu->hwp_boost_min = 0;
1805                }
1806        }
1807        cpu->last_update = cpu->sample.time;
1808}
1809
1810static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu,
1811                                                      u64 time)
1812{
1813        cpu->sample.time = time;
1814
1815        if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
1816                bool do_io = false;
1817
1818                cpu->sched_flags = 0;
1819                /*
1820                 * Set the iowait_boost flag and update the time. Since the
1821                 * IOWAIT flag is set all the time, we cannot conclude from a
1822                 * single occurrence that IO-bound activity is scheduled on
1823                 * this CPU. Only if we receive at least two of them in two
1824                 * consecutive ticks do we treat the CPU as a boost candidate.
1825                 */
1826                if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC))
1827                        do_io = true;
1828
1829                cpu->last_io_update = time;
1830
1831                if (do_io)
1832                        intel_pstate_hwp_boost_up(cpu);
1833
1834        } else {
1835                intel_pstate_hwp_boost_down(cpu);
1836        }
1837}
1838
1839static inline void intel_pstate_update_util_hwp(struct update_util_data *data,
1840                                                u64 time, unsigned int flags)
1841{
1842        struct cpudata *cpu = container_of(data, struct cpudata, update_util);
1843
1844        cpu->sched_flags |= flags;
1845
1846        if (smp_processor_id() == cpu->cpu)
1847                intel_pstate_update_util_hwp_local(cpu, time);
1848}
1849
1850static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu)
1851{
1852        struct sample *sample = &cpu->sample;
1853
1854        sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf);
1855}
1856
1857static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
1858{
1859        u64 aperf, mperf;
1860        unsigned long flags;
1861        u64 tsc;
1862
1863        local_irq_save(flags);
1864        rdmsrl(MSR_IA32_APERF, aperf);
1865        rdmsrl(MSR_IA32_MPERF, mperf);
1866        tsc = rdtsc();
1867        if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) {
1868                local_irq_restore(flags);
1869                return false;
1870        }
1871        local_irq_restore(flags);
1872
1873        cpu->last_sample_time = cpu->sample.time;
1874        cpu->sample.time = time;
1875        cpu->sample.aperf = aperf;
1876        cpu->sample.mperf = mperf;
1877        cpu->sample.tsc =  tsc;
1878        cpu->sample.aperf -= cpu->prev_aperf;
1879        cpu->sample.mperf -= cpu->prev_mperf;
1880        cpu->sample.tsc -= cpu->prev_tsc;
1881
1882        cpu->prev_aperf = aperf;
1883        cpu->prev_mperf = mperf;
1884        cpu->prev_tsc = tsc;
1885        /*
1886         * First time this function is invoked in a given cycle, all of the
1887         * previous sample data fields are equal to zero or stale and they must
1888         * be populated with meaningful numbers for things to work, so assume
1889         * that sample.time will always be reset before setting the utilization
1890         * update hook and make the caller skip the sample then.
1891         */
1892        if (cpu->last_sample_time) {
1893                intel_pstate_calc_avg_perf(cpu);
1894                return true;
1895        }
1896        return false;
1897}
1898
1899static inline int32_t get_avg_frequency(struct cpudata *cpu)
1900{
1901        return mul_ext_fp(cpu->sample.core_avg_perf, cpu_khz);
1902}
1903
1904static inline int32_t get_avg_pstate(struct cpudata *cpu)
1905{
1906        return mul_ext_fp(cpu->pstate.max_pstate_physical,
1907                          cpu->sample.core_avg_perf);
1908}
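/*
 * For illustration (made-up deltas): a sample with an APERF delta of
 * 1,500,000 and an MPERF delta of 1,000,000 yields core_avg_perf = 1.5 in
 * the extended fixed-point format, so get_avg_frequency() reports
 * 1.5 * cpu_khz and get_avg_pstate() reports 1.5 * max_pstate_physical.
 */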
1909
1910static inline int32_t get_target_pstate(struct cpudata *cpu)
1911{
1912        struct sample *sample = &cpu->sample;
1913        int32_t busy_frac;
1914        int target, avg_pstate;
1915
1916        busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
1917                           sample->tsc);
1918
1919        if (busy_frac < cpu->iowait_boost)
1920                busy_frac = cpu->iowait_boost;
1921
1922        sample->busy_scaled = busy_frac * 100;
1923
1924        target = global.no_turbo || global.turbo_disabled ?
1925                        cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
1926        target += target >> 2;
1927        target = mul_fp(target, busy_frac);
1928        if (target < cpu->pstate.min_pstate)
1929                target = cpu->pstate.min_pstate;
1930
1931        /*
1932         * If the average P-state during the previous cycle was higher than the
1933         * current target, add 50% of the difference to the target to reduce
1934         * possible performance oscillations and offset possible performance
1935         * loss related to moving the workload from one CPU to another within
1936         * a package/module.
1937         */
1938        avg_pstate = get_avg_pstate(cpu);
1939        if (avg_pstate > target)
1940                target += (avg_pstate - target) >> 1;
1941
1942        return target;
1943}
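/*
 * A worked example with made-up numbers: with turbo available and
 * turbo_pstate = 32, a busy fraction of 0.5 (the MPERF delta is half of the
 * TSC delta, with no APERF/MPERF shift) gives a proportional target of
 * (32 + 32/4) * 0.5 = 20. If the previous cycle averaged P-state 24, half
 * of the difference is added and 22 is returned, to be clamped to the
 * current limits by intel_pstate_prepare_request().
 */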
1944
1945static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
1946{
1947        int min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio);
1948        int max_pstate = max(min_pstate, cpu->max_perf_ratio);
1949
1950        return clamp_t(int, pstate, min_pstate, max_pstate);
1951}
1952
1953static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
1954{
1955        if (pstate == cpu->pstate.current_pstate)
1956                return;
1957
1958        cpu->pstate.current_pstate = pstate;
1959        wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
1960}
1961
1962static void intel_pstate_adjust_pstate(struct cpudata *cpu)
1963{
1964        int from = cpu->pstate.current_pstate;
1965        struct sample *sample;
1966        int target_pstate;
1967
1968        update_turbo_state();
1969
1970        target_pstate = get_target_pstate(cpu);
1971        target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
1972        trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu);
1973        intel_pstate_update_pstate(cpu, target_pstate);
1974
1975        sample = &cpu->sample;
1976        trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf),
1977                fp_toint(sample->busy_scaled),
1978                from,
1979                cpu->pstate.current_pstate,
1980                sample->mperf,
1981                sample->aperf,
1982                sample->tsc,
1983                get_avg_frequency(cpu),
1984                fp_toint(cpu->iowait_boost * 100));
1985}
1986
1987static void intel_pstate_update_util(struct update_util_data *data, u64 time,
1988                                     unsigned int flags)
1989{
1990        struct cpudata *cpu = container_of(data, struct cpudata, update_util);
1991        u64 delta_ns;
1992
1993        /* Don't allow remote callbacks */
1994        if (smp_processor_id() != cpu->cpu)
1995                return;
1996
1997        delta_ns = time - cpu->last_update;
1998        if (flags & SCHED_CPUFREQ_IOWAIT) {
1999                /* Start over if the CPU may have been idle. */
2000                if (delta_ns > TICK_NSEC) {
2001                        cpu->iowait_boost = ONE_EIGHTH_FP;
2002                } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) {
2003                        cpu->iowait_boost <<= 1;
2004                        if (cpu->iowait_boost > int_tofp(1))
2005                                cpu->iowait_boost = int_tofp(1);
2006                } else {
2007                        cpu->iowait_boost = ONE_EIGHTH_FP;
2008                }
2009        } else if (cpu->iowait_boost) {
2010                /* Clear iowait_boost if the CPU may have been idle. */
2011                if (delta_ns > TICK_NSEC)
2012                        cpu->iowait_boost = 0;
2013                else
2014                        cpu->iowait_boost >>= 1;
2015        }
2016        cpu->last_update = time;
2017        delta_ns = time - cpu->sample.time;
2018        if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
2019                return;
2020
2021        if (intel_pstate_sample(cpu, time))
2022                intel_pstate_adjust_pstate(cpu);
2023}
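/*
 * For illustration: the iowait boost above ramps 1/8 -> 1/4 -> 1/2 -> 1 of
 * full scale while SCHED_CPUFREQ_IOWAIT keeps arriving within a tick, and
 * decays by halving (or is cleared after an idle gap longer than a tick)
 * once the flag stops coming. The boosted fraction then acts as a floor for
 * busy_frac in get_target_pstate().
 */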
2024
2025static struct pstate_funcs core_funcs = {
2026        .get_max = core_get_max_pstate,
2027        .get_max_physical = core_get_max_pstate_physical,
2028        .get_min = core_get_min_pstate,
2029        .get_turbo = core_get_turbo_pstate,
2030        .get_scaling = core_get_scaling,
2031        .get_val = core_get_val,
2032};
2033
2034static const struct pstate_funcs silvermont_funcs = {
2035        .get_max = atom_get_max_pstate,
2036        .get_max_physical = atom_get_max_pstate,
2037        .get_min = atom_get_min_pstate,
2038        .get_turbo = atom_get_turbo_pstate,
2039        .get_val = atom_get_val,
2040        .get_scaling = silvermont_get_scaling,
2041        .get_vid = atom_get_vid,
2042};
2043
2044static const struct pstate_funcs airmont_funcs = {
2045        .get_max = atom_get_max_pstate,
2046        .get_max_physical = atom_get_max_pstate,
2047        .get_min = atom_get_min_pstate,
2048        .get_turbo = atom_get_turbo_pstate,
2049        .get_val = atom_get_val,
2050        .get_scaling = airmont_get_scaling,
2051        .get_vid = atom_get_vid,
2052};
2053
2054static const struct pstate_funcs knl_funcs = {
2055        .get_max = core_get_max_pstate,
2056        .get_max_physical = core_get_max_pstate_physical,
2057        .get_min = core_get_min_pstate,
2058        .get_turbo = knl_get_turbo_pstate,
2059        .get_aperf_mperf_shift = knl_get_aperf_mperf_shift,
2060        .get_scaling = core_get_scaling,
2061        .get_val = core_get_val,
2062};
2063
2064#define X86_MATCH(model, policy)                                         \
2065        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
2066                                           X86_FEATURE_APERFMPERF, &policy)
2067
2068static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
2069        X86_MATCH(SANDYBRIDGE,          core_funcs),
2070        X86_MATCH(SANDYBRIDGE_X,        core_funcs),
2071        X86_MATCH(ATOM_SILVERMONT,      silvermont_funcs),
2072        X86_MATCH(IVYBRIDGE,            core_funcs),
2073        X86_MATCH(HASWELL,              core_funcs),
2074        X86_MATCH(BROADWELL,            core_funcs),
2075        X86_MATCH(IVYBRIDGE_X,          core_funcs),
2076        X86_MATCH(HASWELL_X,            core_funcs),
2077        X86_MATCH(HASWELL_L,            core_funcs),
2078        X86_MATCH(HASWELL_G,            core_funcs),
2079        X86_MATCH(BROADWELL_G,          core_funcs),
2080        X86_MATCH(ATOM_AIRMONT,         airmont_funcs),
2081        X86_MATCH(SKYLAKE_L,            core_funcs),
2082        X86_MATCH(BROADWELL_X,          core_funcs),
2083        X86_MATCH(SKYLAKE,              core_funcs),
2084        X86_MATCH(BROADWELL_D,          core_funcs),
2085        X86_MATCH(XEON_PHI_KNL,         knl_funcs),
2086        X86_MATCH(XEON_PHI_KNM,         knl_funcs),
2087        X86_MATCH(ATOM_GOLDMONT,        core_funcs),
2088        X86_MATCH(ATOM_GOLDMONT_PLUS,   core_funcs),
2089        X86_MATCH(SKYLAKE_X,            core_funcs),
2090        {}
2091};
2092MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
2093
2094static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
2095        X86_MATCH(BROADWELL_D,          core_funcs),
2096        X86_MATCH(BROADWELL_X,          core_funcs),
2097        X86_MATCH(SKYLAKE_X,            core_funcs),
2098        {}
2099};
2100
2101static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = {
2102        X86_MATCH(KABYLAKE,             core_funcs),
2103        {}
2104};
2105
2106static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = {
2107        X86_MATCH(SKYLAKE_X,            core_funcs),
2108        X86_MATCH(SKYLAKE,              core_funcs),
2109        {}
2110};
2111
2112static int intel_pstate_init_cpu(unsigned int cpunum)
2113{
2114        struct cpudata *cpu;
2115
2116        cpu = all_cpu_data[cpunum];
2117
2118        if (!cpu) {
2119                cpu = kzalloc(sizeof(*cpu), GFP_KERNEL);
2120                if (!cpu)
2121                        return -ENOMEM;
2122
2123                all_cpu_data[cpunum] = cpu;
2124
2125                cpu->cpu = cpunum;
2126
2127                cpu->epp_default = -EINVAL;
2128
2129                if (hwp_active) {
2130                        const struct x86_cpu_id *id;
2131
2132                        intel_pstate_hwp_enable(cpu);
2133
2134                        id = x86_match_cpu(intel_pstate_hwp_boost_ids);
2135                        if (id && intel_pstate_acpi_pm_profile_server())
2136                                hwp_boost = true;
2137                }
2138        } else if (hwp_active) {
2139                /*
2140                 * Re-enable HWP in case this happens after a resume from ACPI
2141                 * S3 if the CPU was offline during the whole system/resume
2142                 * S3 if the CPU was offline during the whole suspend/resume
2143                 */
2144                intel_pstate_hwp_reenable(cpu);
2145        }
2146
2147        cpu->epp_powersave = -EINVAL;
2148        cpu->epp_policy = 0;
2149
2150        intel_pstate_get_cpu_pstates(cpu);
2151
2152        pr_debug("controlling: cpu %d\n", cpunum);
2153
2154        return 0;
2155}
2156
2157static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
2158{
2159        struct cpudata *cpu = all_cpu_data[cpu_num];
2160
2161        if (hwp_active && !hwp_boost)
2162                return;
2163
2164        if (cpu->update_util_set)
2165                return;
2166
2167        /* Prevent intel_pstate_update_util() from using stale data. */
2168        cpu->sample.time = 0;
2169        cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
2170                                     (hwp_active ?
2171                                      intel_pstate_update_util_hwp :
2172                                      intel_pstate_update_util));
2173        cpu->update_util_set = true;
2174}
2175
2176static void intel_pstate_clear_update_util_hook(unsigned int cpu)
2177{
2178        struct cpudata *cpu_data = all_cpu_data[cpu];
2179
2180        if (!cpu_data->update_util_set)
2181                return;
2182
2183        cpufreq_remove_update_util_hook(cpu);
2184        cpu_data->update_util_set = false;
2185        synchronize_rcu();
2186}
2187
2188static int intel_pstate_get_max_freq(struct cpudata *cpu)
2189{
2190        return global.turbo_disabled || global.no_turbo ?
2191                        cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2192}
2193
2194static void intel_pstate_update_perf_limits(struct cpudata *cpu,
2195                                            unsigned int policy_min,
2196                                            unsigned int policy_max)
2197{
2198        int scaling = cpu->pstate.scaling;
2199        int32_t max_policy_perf, min_policy_perf;
2200
2201        /*
2202         * HWP needs some special consideration, because HWP_REQUEST uses
2203         * abstract values to represent performance rather than pure ratios.
2204         */
2205        if (hwp_active)
2206                intel_pstate_get_hwp_cap(cpu);
2207
2208        max_policy_perf = policy_max / scaling;
2209        if (policy_max == policy_min) {
2210                min_policy_perf = max_policy_perf;
2211        } else {
2212                min_policy_perf = policy_min / scaling;
2213                min_policy_perf = clamp_t(int32_t, min_policy_perf,
2214                                          0, max_policy_perf);
2215        }
2216
2217        pr_debug("cpu:%d min_policy_perf:%d max_policy_perf:%d\n",
2218                 cpu->cpu, min_policy_perf, max_policy_perf);
2219
2220        /* Normalize user input to [min_perf, max_perf] */
2221        if (per_cpu_limits) {
2222                cpu->min_perf_ratio = min_policy_perf;
2223                cpu->max_perf_ratio = max_policy_perf;
2224        } else {
2225                int turbo_max = cpu->pstate.turbo_pstate;
2226                int32_t global_min, global_max;
2227
2228                /* Global limits are in percent of the maximum turbo P-state. */
2229                global_max = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100);
2230                global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
2231                global_min = clamp_t(int32_t, global_min, 0, global_max);
2232
2233                pr_debug("cpu:%d global_min:%d global_max:%d\n", cpu->cpu,
2234                         global_min, global_max);
2235
2236                cpu->min_perf_ratio = max(min_policy_perf, global_min);
2237                cpu->min_perf_ratio = min(cpu->min_perf_ratio, max_policy_perf);
2238                cpu->max_perf_ratio = min(max_policy_perf, global_max);
2239                cpu->max_perf_ratio = max(min_policy_perf, cpu->max_perf_ratio);
2240
2241                /* Make sure min_perf <= max_perf */
2242                cpu->min_perf_ratio = min(cpu->min_perf_ratio,
2243                                          cpu->max_perf_ratio);
2244
2245        }
2246        pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", cpu->cpu,
2247                 cpu->max_perf_ratio,
2248                 cpu->min_perf_ratio);
2249}
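/*
 * A worked example with made-up numbers, per-CPU limits disabled: for a
 * turbo P-state of 40 and global limits min_perf_pct = 25, max_perf_pct = 75,
 * global_min = 10 and global_max = 30. A policy spanning 800 MHz to 4 GHz
 * with scaling = 100000 gives min_policy_perf = 8 and max_policy_perf = 40,
 * so the effective range ends up as min_perf_ratio = 10 and
 * max_perf_ratio = 30.
 */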
2250
2251static int intel_pstate_set_policy(struct cpufreq_policy *policy)
2252{
2253        struct cpudata *cpu;
2254
2255        if (!policy->cpuinfo.max_freq)
2256                return -ENODEV;
2257
2258        pr_debug("set_policy cpuinfo.max %u policy->max %u\n",
2259                 policy->cpuinfo.max_freq, policy->max);
2260
2261        cpu = all_cpu_data[policy->cpu];
2262        cpu->policy = policy->policy;
2263
2264        mutex_lock(&intel_pstate_limits_lock);
2265
2266        intel_pstate_update_perf_limits(cpu, policy->min, policy->max);
2267
2268        if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
2269                /*
2270                 * NOHZ_FULL CPUs need this as the governor callback may not
2271                 * be invoked on them.
2272                 */
2273                intel_pstate_clear_update_util_hook(policy->cpu);
2274                intel_pstate_max_within_limits(cpu);
2275        } else {
2276                intel_pstate_set_update_util_hook(policy->cpu);
2277        }
2278
2279        if (hwp_active) {
2280                /*
2281                 * If hwp_boost was active before and has been turned off
2282                 * dynamically since then, the update util hook needs to be
2283                 * cleared here.
2284                 */
2285                if (!hwp_boost)
2286                        intel_pstate_clear_update_util_hook(policy->cpu);
2287                intel_pstate_hwp_set(policy->cpu);
2288        }
2289
2290        mutex_unlock(&intel_pstate_limits_lock);
2291
2292        return 0;
2293}
2294
2295static void intel_pstate_adjust_policy_max(struct cpudata *cpu,
2296                                           struct cpufreq_policy_data *policy)
2297{
2298        if (!hwp_active &&
2299            cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
2300            policy->max < policy->cpuinfo.max_freq &&
2301            policy->max > cpu->pstate.max_freq) {
2302                pr_debug("policy->max > max non turbo frequency\n");
2303                policy->max = policy->cpuinfo.max_freq;
2304        }
2305}
2306
2307static void intel_pstate_verify_cpu_policy(struct cpudata *cpu,
2308                                           struct cpufreq_policy_data *policy)
2309{
2310        int max_freq;
2311
2312        update_turbo_state();
2313        if (hwp_active) {
2314                intel_pstate_get_hwp_cap(cpu);
2315                max_freq = global.no_turbo || global.turbo_disabled ?
2316                                cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2317        } else {
2318                max_freq = intel_pstate_get_max_freq(cpu);
2319        }
2320        cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, max_freq);
2321
2322        intel_pstate_adjust_policy_max(cpu, policy);
2323}
2324
2325static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy)
2326{
2327        intel_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy);
2328
2329        return 0;
2330}
2331
2332static int intel_pstate_cpu_offline(struct cpufreq_policy *policy)
2333{
2334        struct cpudata *cpu = all_cpu_data[policy->cpu];
2335
2336        pr_debug("CPU %d going offline\n", cpu->cpu);
2337
2338        if (cpu->suspended)
2339                return 0;
2340
2341        /*
2342         * If the CPU is an SMT thread and it goes offline with the performance
2343         * settings different from the minimum, it will prevent its sibling
2344         * from getting to lower performance levels, so force the minimum
2345         * performance on CPU offline to prevent that from happening.
2346         */
2347        if (hwp_active)
2348                intel_pstate_hwp_offline(cpu);
2349        else
2350                intel_pstate_set_min_pstate(cpu);
2351
2352        intel_pstate_exit_perf_limits(policy);
2353
2354        return 0;
2355}
2356
2357static int intel_pstate_cpu_online(struct cpufreq_policy *policy)
2358{
2359        struct cpudata *cpu = all_cpu_data[policy->cpu];
2360
2361        pr_debug("CPU %d going online\n", cpu->cpu);
2362
2363        intel_pstate_init_acpi_perf_limits(policy);
2364
2365        if (hwp_active) {
2366                /*
2367                 * Re-enable HWP and clear the "suspended" flag to let "resume"
2368                 * know that it need not do that.
2369                 */
2370                intel_pstate_hwp_reenable(cpu);
2371                cpu->suspended = false;
2372        }
2373
2374        return 0;
2375}
2376
2377static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
2378{
2379        pr_debug("CPU %d stopping\n", policy->cpu);
2380
2381        intel_pstate_clear_update_util_hook(policy->cpu);
2382}
2383
2384static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
2385{
2386        pr_debug("CPU %d exiting\n", policy->cpu);
2387
2388        policy->fast_switch_possible = false;
2389
2390        return 0;
2391}
2392
2393static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
2394{
2395        struct cpudata *cpu;
2396        int rc;
2397
2398        rc = intel_pstate_init_cpu(policy->cpu);
2399        if (rc)
2400                return rc;
2401
2402        cpu = all_cpu_data[policy->cpu];
2403
2404        cpu->max_perf_ratio = 0xFF;
2405        cpu->min_perf_ratio = 0;
2406
2407        /* cpuinfo and default policy values */
2408        policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
2409        update_turbo_state();
2410        global.turbo_disabled_mf = global.turbo_disabled;
2411        policy->cpuinfo.max_freq = global.turbo_disabled ?
2412                        cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2413
2414        policy->min = policy->cpuinfo.min_freq;
2415        policy->max = policy->cpuinfo.max_freq;
2416
2417        intel_pstate_init_acpi_perf_limits(policy);
2418
2419        policy->fast_switch_possible = true;
2420
2421        return 0;
2422}
2423
2424static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
2425{
2426        int ret = __intel_pstate_cpu_init(policy);
2427
2428        if (ret)
2429                return ret;
2430
2431        /*
2432         * Set the policy to powersave to provide a valid fallback value in case
2433         * the default cpufreq governor is neither powersave nor performance.
2434         */
2435        policy->policy = CPUFREQ_POLICY_POWERSAVE;
2436
2437        if (hwp_active) {
2438                struct cpudata *cpu = all_cpu_data[policy->cpu];
2439
2440                cpu->epp_cached = intel_pstate_get_epp(cpu, 0);
2441        }
2442
2443        return 0;
2444}
2445
2446static struct cpufreq_driver intel_pstate = {
2447        .flags          = CPUFREQ_CONST_LOOPS,
2448        .verify         = intel_pstate_verify_policy,
2449        .setpolicy      = intel_pstate_set_policy,
2450        .suspend        = intel_pstate_suspend,
2451        .resume         = intel_pstate_resume,
2452        .init           = intel_pstate_cpu_init,
2453        .exit           = intel_pstate_cpu_exit,
2454        .stop_cpu       = intel_pstate_stop_cpu,
2455        .offline        = intel_pstate_cpu_offline,
2456        .online         = intel_pstate_cpu_online,
2457        .update_limits  = intel_pstate_update_limits,
2458        .name           = "intel_pstate",
2459};
2460
2461static int intel_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
2462{
2463        struct cpudata *cpu = all_cpu_data[policy->cpu];
2464
2465        intel_pstate_verify_cpu_policy(cpu, policy);
2466        intel_pstate_update_perf_limits(cpu, policy->min, policy->max);
2467
2468        return 0;
2469}
2470
2471/* Use of trace in passive mode:
2472 *
2473 * In passive mode the trace core_busy field (also known as the
2474 * performance field, and labelled as such on the graphs; also known as
2475 * core_avg_perf) is not needed and so is re-assigned to indicate if the
2476 * driver call was via the normal or fast switch path. Various graphs
2477 * output from the intel_pstate_tracer.py utility that include core_busy
2478 * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%,
2479 * so we use 10 to indicate the normal path through the driver, and
2480 * 90 to indicate the fast switch path through the driver.
2481 * The scaled_busy field is not used, and is set to 0.
2482 */
2483
2484#define INTEL_PSTATE_TRACE_TARGET 10
2485#define INTEL_PSTATE_TRACE_FAST_SWITCH 90
2486
2487static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
2488{
2489        struct sample *sample;
2490
2491        if (!trace_pstate_sample_enabled())
2492                return;
2493
2494        if (!intel_pstate_sample(cpu, ktime_get()))
2495                return;
2496
2497        sample = &cpu->sample;
2498        trace_pstate_sample(trace_type,
2499                0,
2500                old_pstate,
2501                cpu->pstate.current_pstate,
2502                sample->mperf,
2503                sample->aperf,
2504                sample->tsc,
2505                get_avg_frequency(cpu),
2506                fp_toint(cpu->iowait_boost * 100));
2507}
2508
2509static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max,
2510                                     u32 desired, bool fast_switch)
2511{
2512        u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev;
2513
2514        value &= ~HWP_MIN_PERF(~0L);
2515        value |= HWP_MIN_PERF(min);
2516
2517        value &= ~HWP_MAX_PERF(~0L);
2518        value |= HWP_MAX_PERF(max);
2519
2520        value &= ~HWP_DESIRED_PERF(~0L);
2521        value |= HWP_DESIRED_PERF(desired);
2522
2523        if (value == prev)
2524                return;
2525
2526        WRITE_ONCE(cpu->hwp_req_cached, value);
2527        if (fast_switch)
2528                wrmsrl(MSR_HWP_REQUEST, value);
2529        else
2530                wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
2531}
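/*
 * A plain-integer sketch of the field packing done above (the helper below
 * is hypothetical, not part of the driver): HWP_MIN_PERF(), HWP_MAX_PERF()
 * and HWP_DESIRED_PERF() place their values in bits 7:0, 15:8 and 23:16 of
 * MSR_HWP_REQUEST respectively, leaving the higher fields such as EPP
 * untouched.
 */
static inline unsigned long long example_pack_hwp_request(unsigned long long old,
							   unsigned int min,
							   unsigned int max,
							   unsigned int desired)
{
	unsigned long long value = old & ~0xffffffULL;	/* clear bits 23:0 */

	return value | min | (max << 8) | (desired << 16);
}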
2532
2533static void intel_cpufreq_perf_ctl_update(struct cpudata *cpu,
2534                                          u32 target_pstate, bool fast_switch)
2535{
2536        if (fast_switch)
2537                wrmsrl(MSR_IA32_PERF_CTL,
2538                       pstate_funcs.get_val(cpu, target_pstate));
2539        else
2540                wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
2541                              pstate_funcs.get_val(cpu, target_pstate));
2542}
2543
2544static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy,
2545                                       int target_pstate, bool fast_switch)
2546{
2547        struct cpudata *cpu = all_cpu_data[policy->cpu];
2548        int old_pstate = cpu->pstate.current_pstate;
2549
2550        target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
2551        if (hwp_active) {
2552                int max_pstate = policy->strict_target ?
2553                                        target_pstate : cpu->max_perf_ratio;
2554
2555                intel_cpufreq_hwp_update(cpu, target_pstate, max_pstate, 0,
2556                                         fast_switch);
2557        } else if (target_pstate != old_pstate) {
2558                intel_cpufreq_perf_ctl_update(cpu, target_pstate, fast_switch);
2559        }
2560
2561        cpu->pstate.current_pstate = target_pstate;
2562
2563        intel_cpufreq_trace(cpu, fast_switch ? INTEL_PSTATE_TRACE_FAST_SWITCH :
2564                            INTEL_PSTATE_TRACE_TARGET, old_pstate);
2565
2566        return target_pstate;
2567}
2568
2569static int intel_cpufreq_target(struct cpufreq_policy *policy,
2570                                unsigned int target_freq,
2571                                unsigned int relation)
2572{
2573        struct cpudata *cpu = all_cpu_data[policy->cpu];
2574        struct cpufreq_freqs freqs;
2575        int target_pstate;
2576
2577        update_turbo_state();
2578
2579        freqs.old = policy->cur;
2580        freqs.new = target_freq;
2581
2582        cpufreq_freq_transition_begin(policy, &freqs);
2583
2584        switch (relation) {
2585        case CPUFREQ_RELATION_L:
2586                target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
2587                break;
2588        case CPUFREQ_RELATION_H:
2589                target_pstate = freqs.new / cpu->pstate.scaling;
2590                break;
2591        default:
2592                target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
2593                break;
2594        }
2595
2596        target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, false);
2597
2598        freqs.new = target_pstate * cpu->pstate.scaling;
2599
2600        cpufreq_freq_transition_end(policy, &freqs, false);
2601
2602        return 0;
2603}
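/*
 * For illustration (hypothetical request): with scaling = 100000 kHz per
 * P-state, a request for 2,050,000 kHz maps to P-state 21 under
 * CPUFREQ_RELATION_L (round up, so the result is at least the requested
 * frequency), 20 under CPUFREQ_RELATION_H (round down) and 21 with the
 * default closest-match rounding.
 */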
2604
2605static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
2606                                              unsigned int target_freq)
2607{
2608        struct cpudata *cpu = all_cpu_data[policy->cpu];
2609        int target_pstate;
2610
2611        update_turbo_state();
2612
2613        target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
2614
2615        target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, true);
2616
2617        return target_pstate * cpu->pstate.scaling;
2618}
2619
2620static void intel_cpufreq_adjust_perf(unsigned int cpunum,
2621                                      unsigned long min_perf,
2622                                      unsigned long target_perf,
2623                                      unsigned long capacity)
2624{
2625        struct cpudata *cpu = all_cpu_data[cpunum];
2626        u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
2627        int old_pstate = cpu->pstate.current_pstate;
2628        int cap_pstate, min_pstate, max_pstate, target_pstate;
2629
2630        update_turbo_state();
2631        cap_pstate = global.turbo_disabled ? HWP_GUARANTEED_PERF(hwp_cap) :
2632                                             HWP_HIGHEST_PERF(hwp_cap);
2633
2634        /* Optimization: Avoid unnecessary divisions. */
2635
2636        target_pstate = cap_pstate;
2637        if (target_perf < capacity)
2638                target_pstate = DIV_ROUND_UP(cap_pstate * target_perf, capacity);
2639
2640        min_pstate = cap_pstate;
2641        if (min_perf < capacity)
2642                min_pstate = DIV_ROUND_UP(cap_pstate * min_perf, capacity);
2643
2644        if (min_pstate < cpu->pstate.min_pstate)
2645                min_pstate = cpu->pstate.min_pstate;
2646
2647        if (min_pstate < cpu->min_perf_ratio)
2648                min_pstate = cpu->min_perf_ratio;
2649
2650        max_pstate = min(cap_pstate, cpu->max_perf_ratio);
2651        if (max_pstate < min_pstate)
2652                max_pstate = min_pstate;
2653
2654        target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate);
2655
2656        intel_cpufreq_hwp_update(cpu, min_pstate, max_pstate, target_pstate, true);
2657
2658        cpu->pstate.current_pstate = target_pstate;
2659        intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
2660}
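/*
 * A worked example with made-up scheduler values: with turbo available and
 * HWP_HIGHEST_PERF(hwp_cap) = 40, a request of target_perf = 512 and
 * min_perf = 256 out of capacity = 1024 maps to a desired P-state of
 * DIV_ROUND_UP(40 * 512, 1024) = 20 and a minimum of 10, both still subject
 * to pstate.min_pstate and the user's min/max_perf_ratio clamps before the
 * MSR update above.
 */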
2661
2662static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
2663{
2664        struct freq_qos_request *req;
2665        struct cpudata *cpu;
2666        struct device *dev;
2667        int ret, freq;
2668
2669        dev = get_cpu_device(policy->cpu);
2670        if (!dev)
2671                return -ENODEV;
2672
2673        ret = __intel_pstate_cpu_init(policy);
2674        if (ret)
2675                return ret;
2676
2677        policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
2678        /* This reflects the intel_pstate_get_cpu_pstates() setting. */
2679        policy->cur = policy->cpuinfo.min_freq;
2680
2681        req = kcalloc(2, sizeof(*req), GFP_KERNEL);
2682        if (!req) {
2683                ret = -ENOMEM;
2684                goto pstate_exit;
2685        }
2686
2687        cpu = all_cpu_data[policy->cpu];
2688
2689        if (hwp_active) {
2690                u64 value;
2691
2692                policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP;
2693
2694                intel_pstate_get_hwp_cap(cpu);
2695
2696                rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value);
2697                WRITE_ONCE(cpu->hwp_req_cached, value);
2698
2699                cpu->epp_cached = intel_pstate_get_epp(cpu, value);
2700        } else {
2701                policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
2702        }
2703
2704        freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.min_perf_pct, 100);
2705
2706        ret = freq_qos_add_request(&policy->constraints, req, FREQ_QOS_MIN,
2707                                   freq);
2708        if (ret < 0) {
2709                dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
2710                goto free_req;
2711        }
2712
2713        freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.max_perf_pct, 100);
2714
2715        ret = freq_qos_add_request(&policy->constraints, req + 1, FREQ_QOS_MAX,
2716                                   freq);
2717        if (ret < 0) {
2718                dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
2719                goto remove_min_req;
2720        }
2721
2722        policy->driver_data = req;
2723
2724        return 0;
2725
2726remove_min_req:
2727        freq_qos_remove_request(req);
2728free_req:
2729        kfree(req);
2730pstate_exit:
2731        intel_pstate_exit_perf_limits(policy);
2732
2733        return ret;
2734}
2735
2736static int intel_cpufreq_cpu_exit(struct cpufreq_policy *policy)
2737{
2738        struct freq_qos_request *req;
2739
2740        req = policy->driver_data;
2741
2742        freq_qos_remove_request(req + 1);
2743        freq_qos_remove_request(req);
2744        kfree(req);
2745
2746        return intel_pstate_cpu_exit(policy);
2747}
2748
2749static struct cpufreq_driver intel_cpufreq = {
2750        .flags          = CPUFREQ_CONST_LOOPS,
2751        .verify         = intel_cpufreq_verify_policy,
2752        .target         = intel_cpufreq_target,
2753        .fast_switch    = intel_cpufreq_fast_switch,
2754        .init           = intel_cpufreq_cpu_init,
2755        .exit           = intel_cpufreq_cpu_exit,
2756        .offline        = intel_pstate_cpu_offline,
2757        .online         = intel_pstate_cpu_online,
2758        .suspend        = intel_pstate_suspend,
2759        .resume         = intel_pstate_resume,
2760        .update_limits  = intel_pstate_update_limits,
2761        .name           = "intel_cpufreq",
2762};
2763
2764static struct cpufreq_driver *default_driver;
2765
2766static void intel_pstate_driver_cleanup(void)
2767{
2768        unsigned int cpu;
2769
2770        get_online_cpus();
2771        for_each_online_cpu(cpu) {
2772                if (all_cpu_data[cpu]) {
2773                        if (intel_pstate_driver == &intel_pstate)
2774                                intel_pstate_clear_update_util_hook(cpu);
2775
2776                        kfree(all_cpu_data[cpu]);
2777                        all_cpu_data[cpu] = NULL;
2778                }
2779        }
2780        put_online_cpus();
2781
2782        intel_pstate_driver = NULL;
2783}
2784
2785static int intel_pstate_register_driver(struct cpufreq_driver *driver)
2786{
2787        int ret;
2788
2789        if (driver == &intel_pstate)
2790                intel_pstate_sysfs_expose_hwp_dynamic_boost();
2791
2792        memset(&global, 0, sizeof(global));
2793        global.max_perf_pct = 100;
2794
2795        intel_pstate_driver = driver;
2796        ret = cpufreq_register_driver(intel_pstate_driver);
2797        if (ret) {
2798                intel_pstate_driver_cleanup();
2799                return ret;
2800        }
2801
2802        global.min_perf_pct = min_perf_pct_min();
2803
2804        return 0;
2805}
2806
2807static ssize_t intel_pstate_show_status(char *buf)
2808{
2809        if (!intel_pstate_driver)
2810                return sprintf(buf, "off\n");
2811
2812        return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ?
2813                                        "active" : "passive");
2814}
2815
2816static int intel_pstate_update_status(const char *buf, size_t size)
2817{
2818        if (size == 3 && !strncmp(buf, "off", size)) {
2819                if (!intel_pstate_driver)
2820                        return -EINVAL;
2821
2822                if (hwp_active)
2823                        return -EBUSY;
2824
2825                cpufreq_unregister_driver(intel_pstate_driver);
2826                intel_pstate_driver_cleanup();
2827                return 0;
2828        }
2829
2830        if (size == 6 && !strncmp(buf, "active", size)) {
2831                if (intel_pstate_driver) {
2832                        if (intel_pstate_driver == &intel_pstate)
2833                                return 0;
2834
2835                        cpufreq_unregister_driver(intel_pstate_driver);
2836                }
2837
2838                return intel_pstate_register_driver(&intel_pstate);
2839        }
2840
2841        if (size == 7 && !strncmp(buf, "passive", size)) {
2842                if (intel_pstate_driver) {
2843                        if (intel_pstate_driver == &intel_cpufreq)
2844                                return 0;
2845
2846                        cpufreq_unregister_driver(intel_pstate_driver);
2847                        intel_pstate_sysfs_hide_hwp_dynamic_boost();
2848                }
2849
2850                return intel_pstate_register_driver(&intel_cpufreq);
2851        }
2852
2853        return -EINVAL;
2854}
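/*
 * Usage note: the strings matched above ("off", "active", "passive") are the
 * values written to the driver's status attribute, typically exposed as
 * /sys/devices/system/cpu/intel_pstate/status, to switch between the
 * intel_pstate and intel_cpufreq drivers defined earlier; "off" is rejected
 * with -EBUSY while HWP is active.
 */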
2855
2856static int no_load __initdata;
2857static int no_hwp __initdata;
2858static int hwp_only __initdata;
2859static unsigned int force_load __initdata;
2860
2861static int __init intel_pstate_msrs_not_valid(void)
2862{
2863        if (!pstate_funcs.get_max() ||
2864            !pstate_funcs.get_min() ||
2865            !pstate_funcs.get_turbo())
2866                return -ENODEV;
2867
2868        return 0;
2869}
2870
2871static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
2872{
2873        pstate_funcs.get_max   = funcs->get_max;
2874        pstate_funcs.get_max_physical = funcs->get_max_physical;
2875        pstate_funcs.get_min   = funcs->get_min;
2876        pstate_funcs.get_turbo = funcs->get_turbo;
2877        pstate_funcs.get_scaling = funcs->get_scaling;
2878        pstate_funcs.get_val   = funcs->get_val;
2879        pstate_funcs.get_vid   = funcs->get_vid;
2880        pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift;
2881}
2882
2883#ifdef CONFIG_ACPI
2884
2885static bool __init intel_pstate_no_acpi_pss(void)
2886{
2887        int i;
2888
2889        for_each_possible_cpu(i) {
2890                acpi_status status;
2891                union acpi_object *pss;
2892                struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
2893                struct acpi_processor *pr = per_cpu(processors, i);
2894
2895                if (!pr)
2896                        continue;
2897
2898                status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
2899                if (ACPI_FAILURE(status))
2900                        continue;
2901
2902                pss = buffer.pointer;
2903                if (pss && pss->type == ACPI_TYPE_PACKAGE) {
2904                        kfree(pss);
2905                        return false;
2906                }
2907
2908                kfree(pss);
2909        }
2910
2911        pr_debug("ACPI _PSS not found\n");
2912        return true;
2913}
2914
2915static bool __init intel_pstate_no_acpi_pcch(void)
2916{
2917        acpi_status status;
2918        acpi_handle handle;
2919
2920        status = acpi_get_handle(NULL, "\\_SB", &handle);
2921        if (ACPI_FAILURE(status))
2922                goto not_found;
2923
2924        if (acpi_has_method(handle, "PCCH"))
2925                return false;
2926
2927not_found:
2928        pr_debug("ACPI PCCH not found\n");
2929        return true;
2930}
2931
2932static bool __init intel_pstate_has_acpi_ppc(void)
2933{
2934        int i;
2935
2936        for_each_possible_cpu(i) {
2937                struct acpi_processor *pr = per_cpu(processors, i);
2938
2939                if (!pr)
2940                        continue;
2941                if (acpi_has_method(pr->handle, "_PPC"))
2942                        return true;
2943        }
2944        pr_debug("ACPI _PPC not found\n");
2945        return false;
2946}
2947
2948enum {
2949        PSS,
2950        PPC,
2951};
2952
2953/* Hardware vendor-specific info that has its own power management modes */
2954static struct acpi_platform_list plat_info[] __initdata = {
2955        {"HP    ", "ProLiant", 0, ACPI_SIG_FADT, all_versions, NULL, PSS},
2956        {"ORACLE", "X4-2    ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2957        {"ORACLE", "X4-2L   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2958        {"ORACLE", "X4-2B   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2959        {"ORACLE", "X3-2    ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2960        {"ORACLE", "X3-2L   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2961        {"ORACLE", "X3-2B   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2962        {"ORACLE", "X4470M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2963        {"ORACLE", "X4270M3 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2964        {"ORACLE", "X4270M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2965        {"ORACLE", "X4170M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2966        {"ORACLE", "X4170 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2967        {"ORACLE", "X4275 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2968        {"ORACLE", "X6-2    ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2969        {"ORACLE", "Sudbury ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
2970        { } /* End */
2971};
2972
2973#define BITMASK_OOB     (BIT(8) | BIT(18))
2974
2975static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
2976{
2977        const struct x86_cpu_id *id;
2978        u64 misc_pwr;
2979        int idx;
2980
2981        id = x86_match_cpu(intel_pstate_cpu_oob_ids);
2982        if (id) {
2983                rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
2984                if (misc_pwr & BITMASK_OOB) {
2985                        pr_debug("Bit 8 or 18 in the MISC_PWR_MGMT MSR set\n");
2986                        pr_debug("P states are controlled in Out of Band mode by the firmware/hardware\n");
2987                        return true;
2988                }
2989        }
2990
2991        idx = acpi_match_platform_list(plat_info);
2992        if (idx < 0)
2993                return false;
2994
2995        switch (plat_info[idx].data) {
2996        case PSS:
2997                if (!intel_pstate_no_acpi_pss())
2998                        return false;
2999
3000                return intel_pstate_no_acpi_pcch();
3001        case PPC:
3002                return intel_pstate_has_acpi_ppc() && !force_load;
3003        }
3004
3005        return false;
3006}
3007
3008static void intel_pstate_request_control_from_smm(void)
3009{
3010        /*
3011         * It may be unsafe to request P-states control from SMM if _PPC support
3012         * has not been enabled.
3013         */
3014        if (acpi_ppc)
3015                acpi_processor_pstate_control();
3016}
3017#else /* CONFIG_ACPI not enabled */
3018static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
3019static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
3020static inline void intel_pstate_request_control_from_smm(void) {}
3021#endif /* CONFIG_ACPI */
3022
3023#define INTEL_PSTATE_HWP_BROADWELL      0x01
3024
3025#define X86_MATCH_HWP(model, hwp_mode)                                  \
3026        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
3027                                           X86_FEATURE_HWP, hwp_mode)
3028
3029static const struct x86_cpu_id hwp_support_ids[] __initconst = {
3030        X86_MATCH_HWP(BROADWELL_X,      INTEL_PSTATE_HWP_BROADWELL),
3031        X86_MATCH_HWP(BROADWELL_D,      INTEL_PSTATE_HWP_BROADWELL),
3032        X86_MATCH_HWP(ANY,              0),
3033        {}
3034};
3035
3036static bool intel_pstate_hwp_is_enabled(void)
3037{
3038        u64 value;
3039
3040        rdmsrl(MSR_PM_ENABLE, value);
3041        return !!(value & 0x1);
3042}
3043
3044static int __init intel_pstate_init(void)
3045{
3046        const struct x86_cpu_id *id;
3047        int rc;
3048
3049        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
3050                return -ENODEV;
3051
3052        if (no_load)
3053                return -ENODEV;
3054
3055        id = x86_match_cpu(hwp_support_ids);
3056        if (id) {
3057                copy_cpu_funcs(&core_funcs);
3058                /*
3059                 * Avoid enabling HWP for processors without EPP support,
3060                 * because that means an incomplete HWP implementation, which
3061                 * is a corner case and supporting it is generally problematic.
3062                 *
3063                 * If HWP is enabled already, though, there is no choice but to
3064                 * deal with it.
3065                 */
3066                if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) ||
3067                    intel_pstate_hwp_is_enabled()) {
3068                        hwp_active++;
3069                        hwp_mode_bdw = id->driver_data;
3070                        intel_pstate.attr = hwp_cpufreq_attrs;
3071                        intel_cpufreq.attr = hwp_cpufreq_attrs;
3072                        intel_cpufreq.flags |= CPUFREQ_NEED_UPDATE_LIMITS;
3073                        intel_cpufreq.adjust_perf = intel_cpufreq_adjust_perf;
3074                        if (!default_driver)
3075                                default_driver = &intel_pstate;
3076
3077                        goto hwp_cpu_matched;
3078                }
3079        } else {
3080                id = x86_match_cpu(intel_pstate_cpu_ids);
3081                if (!id) {
3082                        pr_info("CPU model not supported\n");
3083                        return -ENODEV;
3084                }
3085
3086                copy_cpu_funcs((struct pstate_funcs *)id->driver_data);
3087        }
3088
3089        if (intel_pstate_msrs_not_valid()) {
3090                pr_info("Invalid MSRs\n");
3091                return -ENODEV;
3092        }
3093        /* Without HWP start in the passive mode. */
3094        if (!default_driver)
3095                default_driver = &intel_cpufreq;
3096
3097hwp_cpu_matched:
3098        /*
3099         * The Intel pstate driver will be ignored if the platform
3100         * firmware has its own power management modes.
3101         */
3102        if (intel_pstate_platform_pwr_mgmt_exists()) {
3103                pr_info("P-states controlled by the platform\n");
3104                return -ENODEV;
3105        }
3106
3107        if (!hwp_active && hwp_only)
3108                return -ENOTSUPP;
3109
3110        pr_info("Intel P-state driver initializing\n");
3111
3112        all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
3113        if (!all_cpu_data)
3114                return -ENOMEM;
3115
3116        intel_pstate_request_control_from_smm();
3117
3118        intel_pstate_sysfs_expose_params();
3119
3120        mutex_lock(&intel_pstate_driver_lock);
3121        rc = intel_pstate_register_driver(default_driver);
3122        mutex_unlock(&intel_pstate_driver_lock);
3123        if (rc) {
3124                intel_pstate_sysfs_remove();
3125                return rc;
3126        }
3127
3128        if (hwp_active) {
3129                const struct x86_cpu_id *id;
3130
3131                id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids);
3132                if (id) {
3133                        set_power_ctl_ee_state(false);
3134                        pr_info("Disabling energy efficiency optimization\n");
3135                }
3136
3137                pr_info("HWP enabled\n");
3138        }
3139
3140        return 0;
3141}
3142device_initcall(intel_pstate_init);
3143
3144static int __init intel_pstate_setup(char *str)
3145{
3146        if (!str)
3147                return -EINVAL;
3148
3149        if (!strcmp(str, "disable"))
3150                no_load = 1;
3151        else if (!strcmp(str, "active"))
3152                default_driver = &intel_pstate;
3153        else if (!strcmp(str, "passive"))
3154                default_driver = &intel_cpufreq;
3155
3156        if (!strcmp(str, "no_hwp")) {
3157                pr_info("HWP disabled\n");
3158                no_hwp = 1;
3159        }
3160        if (!strcmp(str, "force"))
3161                force_load = 1;
3162        if (!strcmp(str, "hwp_only"))
3163                hwp_only = 1;
3164        if (!strcmp(str, "per_cpu_perf_limits"))
3165                per_cpu_limits = true;
3166
3167#ifdef CONFIG_ACPI
3168        if (!strcmp(str, "support_acpi_ppc"))
3169                acpi_ppc = true;
3170#endif
3171
3172        return 0;
3173}
3174early_param("intel_pstate", intel_pstate_setup);
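/*
 * Usage note: the keywords handled above are passed on the kernel command
 * line as, for example, "intel_pstate=disable", "intel_pstate=passive" or
 * "intel_pstate=no_hwp"; the handler does exact string matches, so each
 * "intel_pstate=" instance carries a single keyword.
 */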
3175
3176MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
3177MODULE_DESCRIPTION("'intel_pstate' - P-state driver for Intel Core processors");
3178MODULE_LICENSE("GPL");
3179