linux/arch/x86/events/rapl.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support Intel/AMD RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * AMD RAPL interface for Fam17h is described in the public PPR:
 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the energy consumption free running
 * counters (pp0, pkg, dram, gpu, psys).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *        event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *        event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *        event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  gpu counter: consumption of the builtin-gpu domain (client only)
 *        event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * psys counter: consumption of the builtin-psys domain (client only)
 *        event: rapl_energy_psys
 *    perf code: 0x5
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must convert the counts to Joules, e.g. via
 * ldexp(raw_count, -32), and then divide by the measurement duration
 * to obtain Watts.
 */
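
/*
 * Example (a hypothetical user-space sketch, not part of this driver):
 * converting a raw count read from one of these events:
 *
 *	double joules = ldexp((double)raw_count, -32);
 *	double watts  = joules / elapsed_seconds;
 *
 * e.g. raw_count == 0x100000000 (2^32) corresponds to exactly 1 Joule.
 * Tooling such as perf applies the sysfs "scale" attribute for you:
 *
 *	perf stat -a -e power/energy-pkg/ -- sleep 1
 */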

#define pr_fmt(fmt) "RAPL PMU: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/nospec.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "perf_event.h"
#include "probe.h"

MODULE_LICENSE("GPL");

/*
 * RAPL energy status counters
 */
enum perf_rapl_events {
        PERF_RAPL_PP0 = 0,              /* all cores */
        PERF_RAPL_PKG,                  /* entire package */
        PERF_RAPL_RAM,                  /* DRAM */
        PERF_RAPL_PP1,                  /* gpu */
        PERF_RAPL_PSYS,                 /* psys */

        PERF_RAPL_MAX,
        NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};

static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
        "pp0-core",
        "package",
        "dram",
        "pp1-gpu",
        "psys",
};

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK 0xFFULL
#define RAPL_CNTR_WIDTH 32

#define RAPL_EVENT_ATTR_STR(_name, v, str)                                      \
static struct perf_pmu_events_attr event_attr_##v = {                           \
        .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
        .id             = 0,                                                    \
        .event_str      = str,                                                  \
};
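
/*
 * For example, RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02")
 * below defines event_attr_rapl_pkg, whose read-only sysfs file
 * "energy-pkg" reports the string "event=0x02".
 */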

struct rapl_pmu {
        raw_spinlock_t          lock;
        int                     n_active;
        int                     cpu;
        struct list_head        active_list;
        struct pmu              *pmu;
        ktime_t                 timer_interval;
        struct hrtimer          hrtimer;
};

struct rapl_pmus {
        struct pmu              pmu;
        unsigned int            maxdie;
        struct rapl_pmu         *pmus[];
};

enum rapl_unit_quirk {
        RAPL_UNIT_QUIRK_NONE,
        RAPL_UNIT_QUIRK_INTEL_HSW,
        RAPL_UNIT_QUIRK_INTEL_SPR,
};

struct rapl_model {
        struct perf_msr *rapl_msrs;
        unsigned long   events;
        unsigned int    msr_power_unit;
        enum rapl_unit_quirk    unit_quirk;
};

/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;

static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
        unsigned int dieid = topology_logical_die_id(cpu);

        /*
         * The unsigned check also catches the '-1' return value for
         * non-existent mappings in the topology map.
         */
        return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
}

static inline u64 rapl_read_counter(struct perf_event *event)
{
        u64 raw;

        rdmsrl(event->hw.event_base, raw);
        return raw;
}

static inline u64 rapl_scale(u64 v, int cfg)
{
        if (cfg > NR_RAPL_DOMAINS) {
                pr_warn("Invalid domain %d, failed to scale data\n", cfg);
                return v;
        }
        /*
         * scale delta to smallest unit (1/2^32)
         * users must then scale back: count * 1/(2^32) to get Joules
         * or use ldexp(count, -32).
         * Watts = Joules/Time delta
         */
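        /*
         * Worked example: with rapl_hw_unit == 16 (1/2^16 J per tick, the
         * SandyBridge default), a delta of one tick is shifted left by
         * 32 - 16 = 16, i.e. 65536 units of 2^-32 J == exactly 2^-16 J.
         */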
        return v << (32 - rapl_hw_unit[cfg - 1]);
}

static u64 rapl_event_update(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        u64 prev_raw_count, new_raw_count;
        s64 delta, sdelta;
        int shift = RAPL_CNTR_WIDTH;

again:
        prev_raw_count = local64_read(&hwc->prev_count);
        rdmsrl(event->hw.event_base, new_raw_count);

        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
                            new_raw_count) != prev_raw_count) {
                cpu_relax();
                goto again;
        }

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (event-)time and add that to the generic event.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count.
         */
        delta = (new_raw_count << shift) - (prev_raw_count << shift);
        delta >>= shift;
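
        /*
         * Example: after a wrap of the 32-bit counter, prev = 0xffffffff
         * and new = 0x1 give ((0x1 << 32) - (0xffffffff << 32)) >> 32 = 2:
         * the shift pair discards the bits above RAPL_CNTR_WIDTH so the
         * subtraction wraps correctly.
         */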

        sdelta = rapl_scale(delta, event->hw.config);

        local64_add(sdelta, &event->count);

        return new_raw_count;
}

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
        hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
                      HRTIMER_MODE_REL_PINNED);
}

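/*
 * The energy counters are only 32 bits wide, so with fine-grained units
 * they can overflow within minutes under load. The per-die hrtimer polls
 * every active event at rapl_timer_ms intervals (sized against a 200W
 * reference in rapl_check_hw_unit()) so that no wraparound is missed.
 */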
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
        struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
        struct perf_event *event;
        unsigned long flags;

        if (!pmu->n_active)
                return HRTIMER_NORESTART;

        raw_spin_lock_irqsave(&pmu->lock, flags);

        list_for_each_entry(event, &pmu->active_list, active_entry)
                rapl_event_update(event);

        raw_spin_unlock_irqrestore(&pmu->lock, flags);

        hrtimer_forward_now(hrtimer, pmu->timer_interval);

        return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
        struct hrtimer *hr = &pmu->hrtimer;

        hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
                                   struct perf_event *event)
{
        if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
                return;

        event->hw.state = 0;

        list_add_tail(&event->active_entry, &pmu->active_list);

        local64_set(&event->hw.prev_count, rapl_read_counter(event));

        pmu->n_active++;
        if (pmu->n_active == 1)
                rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = event->pmu_private;
        unsigned long flags;

        raw_spin_lock_irqsave(&pmu->lock, flags);
        __rapl_pmu_event_start(pmu, event);
        raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = event->pmu_private;
        struct hw_perf_event *hwc = &event->hw;
        unsigned long flags;

        raw_spin_lock_irqsave(&pmu->lock, flags);

        /* mark event as deactivated and stopped */
        if (!(hwc->state & PERF_HES_STOPPED)) {
                WARN_ON_ONCE(pmu->n_active <= 0);
                pmu->n_active--;
                if (pmu->n_active == 0)
                        hrtimer_cancel(&pmu->hrtimer);

                list_del(&event->active_entry);

                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
                hwc->state |= PERF_HES_STOPPED;
        }

        /* check if update of sw counter is necessary */
        if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
                /*
                 * Drain the remaining delta count out of an event
                 * that we are disabling:
                 */
                rapl_event_update(event);
                hwc->state |= PERF_HES_UPTODATE;
        }

        raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = event->pmu_private;
        struct hw_perf_event *hwc = &event->hw;
        unsigned long flags;

        raw_spin_lock_irqsave(&pmu->lock, flags);

        hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

        if (mode & PERF_EF_START)
                __rapl_pmu_event_start(pmu, event);

        raw_spin_unlock_irqrestore(&pmu->lock, flags);

        return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
        rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
        u64 cfg = event->attr.config & RAPL_EVENT_MASK;
        int bit, ret = 0;
        struct rapl_pmu *pmu;

        /* only look at RAPL events */
        if (event->attr.type != rapl_pmus->pmu.type)
                return -ENOENT;

        /* check only supported bits are set */
        if (event->attr.config & ~RAPL_EVENT_MASK)
                return -EINVAL;

        if (event->cpu < 0)
                return -EINVAL;

        event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;

        if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
                return -EINVAL;

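        /* event codes 0x1..0x5 map to domain bits 0..4 of rapl_cntr_mask */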
        cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
        bit = cfg - 1;

        /* check event supported */
        if (!(rapl_cntr_mask & (1 << bit)))
                return -EINVAL;

        /* unsupported modes and filters */
        if (event->attr.sample_period) /* no sampling */
                return -EINVAL;

        /* must be done before validate_group */
        pmu = cpu_to_rapl_pmu(event->cpu);
        if (!pmu)
                return -EINVAL;
        event->cpu = pmu->cpu;
        event->pmu_private = pmu;
        event->hw.event_base = rapl_msrs[bit].msr;
        event->hw.config = cfg;
        event->hw.idx = bit;

        return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
        rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
        &dev_attr_cpumask.attr,
        NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
        .attrs = rapl_pmu_attrs,
};

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");

/*
 * We compute in units of 2^-32 Joules (~0.23 nJ) regardless of the
 * unit reported by the MSR.
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");

/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
        NULL,
};

static struct attribute_group rapl_pmu_events_group = {
        .name = "events",
        .attrs = attrs_empty,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
        &format_attr_event.attr,
        NULL,
};

static struct attribute_group rapl_pmu_format_group = {
        .name = "format",
        .attrs = rapl_formats_attr,
};

static const struct attribute_group *rapl_attr_groups[] = {
        &rapl_pmu_attr_group,
        &rapl_pmu_format_group,
        &rapl_pmu_events_group,
        NULL,
};

static struct attribute *rapl_events_cores[] = {
        EVENT_PTR(rapl_cores),
        EVENT_PTR(rapl_cores_unit),
        EVENT_PTR(rapl_cores_scale),
        NULL,
};

static struct attribute_group rapl_events_cores_group = {
        .name  = "events",
        .attrs = rapl_events_cores,
};

static struct attribute *rapl_events_pkg[] = {
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_pkg_scale),
        NULL,
};

static struct attribute_group rapl_events_pkg_group = {
        .name  = "events",
        .attrs = rapl_events_pkg,
};

static struct attribute *rapl_events_ram[] = {
        EVENT_PTR(rapl_ram),
        EVENT_PTR(rapl_ram_unit),
        EVENT_PTR(rapl_ram_scale),
        NULL,
};

static struct attribute_group rapl_events_ram_group = {
        .name  = "events",
        .attrs = rapl_events_ram,
};

static struct attribute *rapl_events_gpu[] = {
        EVENT_PTR(rapl_gpu),
        EVENT_PTR(rapl_gpu_unit),
        EVENT_PTR(rapl_gpu_scale),
        NULL,
};

static struct attribute_group rapl_events_gpu_group = {
        .name  = "events",
        .attrs = rapl_events_gpu,
};

static struct attribute *rapl_events_psys[] = {
        EVENT_PTR(rapl_psys),
        EVENT_PTR(rapl_psys_unit),
        EVENT_PTR(rapl_psys_scale),
        NULL,
};

static struct attribute_group rapl_events_psys_group = {
        .name  = "events",
        .attrs = rapl_events_psys,
};

static bool test_msr(int idx, void *data)
{
        return test_bit(idx, (unsigned long *) data);
}

/* Only the lower 32 bits of the MSR represent the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF

static struct perf_msr intel_rapl_msrs[] = {
        [PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};

static struct perf_msr intel_rapl_spr_msrs[] = {
        [PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true, RAPL_MSR_MASK },
};

/*
 * Force to PERF_RAPL_MAX size due to:
 * - perf_msr_probe(PERF_RAPL_MAX)
 * - we want to use the same event codes on both architectures
 */
static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = {
        [PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr },
};

static int rapl_cpu_offline(unsigned int cpu)
{
        struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
        int target;

        /* Check if exiting cpu is used for collecting rapl events */
        if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
                return 0;

        pmu->cpu = -1;
        /* Find a new cpu to collect rapl events */
        target = cpumask_any_but(topology_die_cpumask(cpu), cpu);

        /* Migrate rapl events to the new target */
        if (target < nr_cpu_ids) {
                cpumask_set_cpu(target, &rapl_cpu_mask);
                pmu->cpu = target;
                perf_pmu_migrate_context(pmu->pmu, cpu, target);
        }
        return 0;
}

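/*
 * One CPU per die is designated to collect RAPL events (advertised via
 * the "cpumask" sysfs attribute); the first CPU of a die to come online
 * allocates the per-die PMU and takes on that role.
 */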
static int rapl_cpu_online(unsigned int cpu)
{
        struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
        int target;

        if (!pmu) {
                pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
                if (!pmu)
                        return -ENOMEM;

                raw_spin_lock_init(&pmu->lock);
                INIT_LIST_HEAD(&pmu->active_list);
                pmu->pmu = &rapl_pmus->pmu;
                pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
                rapl_hrtimer_init(pmu);

                rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
        }

        /*
         * Check if there is an online cpu in the package which collects rapl
         * events already.
         */
        target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
        if (target < nr_cpu_ids)
                return 0;

        cpumask_set_cpu(cpu, &rapl_cpu_mask);
        pmu->cpu = cpu;
        return 0;
}

static int rapl_check_hw_unit(struct rapl_model *rm)
{
        u64 msr_rapl_power_unit_bits;
        int i;

        /* protect rdmsrl() to handle virtualization */
        if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
                return -1;
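
        /* the energy status unit is in bits 12:8 of the power unit MSR */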
        for (i = 0; i < NR_RAPL_DOMAINS; i++)
                rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

        switch (rm->unit_quirk) {
        /*
         * DRAM domain on HSW server and KNL has a fixed energy unit which
         * can differ from the unit in the power unit MSR. See "Intel Xeon
         * Processor E5-1600 and E5-2600 v3 Product Families, Datasheet
         * volume 2 of 2, September 2014, Reference Number: 330784-001".
         */
        case RAPL_UNIT_QUIRK_INTEL_HSW:
                rapl_hw_unit[PERF_RAPL_RAM] = 16;
                break;
        /*
         * SPR shares the same DRAM domain energy unit as HSW, plus it
         * also has a fixed energy unit for the Psys domain.
         */
        case RAPL_UNIT_QUIRK_INTEL_SPR:
                rapl_hw_unit[PERF_RAPL_RAM] = 16;
                rapl_hw_unit[PERF_RAPL_PSYS] = 0;
                break;
        default:
                break;
        }

        /*
         * Calculate the timer rate:
         * Use a reference of 200W for scaling the timeout to avoid counter
         * overflows. 200W = 200 Joules/sec.
         * Divide the interval by 2 to avoid lockstep (2 * 100).
         * If the hw unit is 32, then we use the default of 2ms (1/200/2 s).
         */
        rapl_timer_ms = 2;
        if (rapl_hw_unit[0] < 32) {
                rapl_timer_ms = (1000 / (2 * 100));
                rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
        }
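
        /*
         * e.g. with the common hw unit of 16: 5ms * 2^(32 - 16 - 1) =
         * 163840 ms. The 2^32-tick counter spans 2^16 J, ~328 s at 200 W,
         * and we poll at half that interval.
         */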
        return 0;
}

static void __init rapl_advertise(void)
{
        int i;

        pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
                hweight32(rapl_cntr_mask), rapl_timer_ms);

        for (i = 0; i < NR_RAPL_DOMAINS; i++) {
                if (rapl_cntr_mask & (1 << i)) {
                        pr_info("hw unit of domain %s 2^-%d Joules\n",
                                rapl_domain_names[i], rapl_hw_unit[i]);
                }
        }
}

static void cleanup_rapl_pmus(void)
{
        int i;

        for (i = 0; i < rapl_pmus->maxdie; i++)
                kfree(rapl_pmus->pmus[i]);
        kfree(rapl_pmus);
}

static const struct attribute_group *rapl_attr_update[] = {
        &rapl_events_cores_group,
        &rapl_events_pkg_group,
        &rapl_events_ram_group,
        &rapl_events_gpu_group,
        &rapl_events_psys_group,
        NULL,
};

static int __init init_rapl_pmus(void)
{
        int maxdie = topology_max_packages() * topology_max_die_per_package();
        size_t size;

        size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
        rapl_pmus = kzalloc(size, GFP_KERNEL);
        if (!rapl_pmus)
                return -ENOMEM;

        rapl_pmus->maxdie               = maxdie;
        rapl_pmus->pmu.attr_groups      = rapl_attr_groups;
        rapl_pmus->pmu.attr_update      = rapl_attr_update;
        rapl_pmus->pmu.task_ctx_nr      = perf_invalid_context;
        rapl_pmus->pmu.event_init       = rapl_pmu_event_init;
        rapl_pmus->pmu.add              = rapl_pmu_event_add;
        rapl_pmus->pmu.del              = rapl_pmu_event_del;
        rapl_pmus->pmu.start            = rapl_pmu_event_start;
        rapl_pmus->pmu.stop             = rapl_pmu_event_stop;
        rapl_pmus->pmu.read             = rapl_pmu_event_read;
        rapl_pmus->pmu.module           = THIS_MODULE;
        rapl_pmus->pmu.capabilities     = PERF_PMU_CAP_NO_EXCLUDE;
        return 0;
}

static struct rapl_model model_snb = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_PP1),
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_snbep = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM),
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsw = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM) |
                          BIT(PERF_RAPL_PP1),
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsx = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM),
        .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_HSW,
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_knl = {
        .events         = BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM),
        .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_HSW,
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_skl = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM) |
                          BIT(PERF_RAPL_PP1) |
                          BIT(PERF_RAPL_PSYS),
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_spr = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM) |
                          BIT(PERF_RAPL_PSYS),
        .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_SPR,
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_spr_msrs,
};

static struct rapl_model model_amd_hygon = {
        .events         = BIT(PERF_RAPL_PKG),
        .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
        .rapl_msrs      = amd_rapl_msrs,
};

static const struct x86_cpu_id rapl_model_match[] __initconst = {
        X86_MATCH_FEATURE(X86_FEATURE_RAPL,             &model_amd_hygon),
        X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,         &model_snb),
        X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,       &model_snbep),
        X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,           &model_snb),
        X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,         &model_snbep),
        X86_MATCH_INTEL_FAM6_MODEL(HASWELL,             &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,           &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,           &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,           &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,           &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,         &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,         &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,         &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,        &model_knl),
        X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,        &model_knl),
        X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,           &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,             &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,           &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,          &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,            &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,        &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,       &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,     &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,  &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,           &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,             &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,           &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,           &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,         &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,           &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,           &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,         &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,    &model_spr),
        {},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);

static int __init rapl_pmu_init(void)
{
        const struct x86_cpu_id *id;
        struct rapl_model *rm;
        int ret;

        id = x86_match_cpu(rapl_model_match);
        if (!id)
                return -ENODEV;

        rm = (struct rapl_model *) id->driver_data;

        rapl_msrs = rm->rapl_msrs;

        rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
                                        false, (void *) &rm->events);

        ret = rapl_check_hw_unit(rm);
        if (ret)
                return ret;

        ret = init_rapl_pmus();
        if (ret)
                return ret;

        /*
         * Install callbacks. Core will call them for each online cpu.
         */
        ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
                                "perf/x86/rapl:online",
                                rapl_cpu_online, rapl_cpu_offline);
        if (ret)
                goto out;

        ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
        if (ret)
                goto out1;

        rapl_advertise();
        return 0;

out1:
        cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
        pr_warn("Initialization failed (%d), disabled\n", ret);
        cleanup_rapl_pmus();
        return ret;
}
module_init(rapl_pmu_init);

static void __exit intel_rapl_exit(void)
{
        cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
        perf_pmu_unregister(&rapl_pmus->pmu);
        cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);