linux/arch/x86/events/amd/ibs.c
/*
 * Performance events - AMD IBS
 *
 *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pci.h>
#include <linux/ptrace.h>
#include <linux/syscore_ops.h>
#include <linux/sched/clock.h>

#include <asm/apic.h>

#include "../perf_event.h"

static u32 ibs_caps;

#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)

#include <linux/kprobes.h>
#include <linux/hardirq.h>

#include <asm/nmi.h>

#define IBS_FETCH_CONFIG_MASK   (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK      IBS_OP_MAX_CNT


/*
 * IBS states:
 *
 * ENABLED; tracks the pmu::add(), pmu::del() state. When set, the counter is
 * taken and any further add()s must fail.
 *
 * STARTED/STOPPING/STOPPED; deal with the pmu::start(), pmu::stop() state but
 * are complicated by the fact that the IBS hardware can send late NMIs (i.e.
 * after we've cleared the EN bit).
 *
 * In order to consume these late NMIs we have the STOPPED state; any NMI that
 * happens after we've cleared the EN state will clear this bit and report the
 * NMI handled (this is fundamentally racy in the face of multiple NMI sources,
 * someone else can consume our bit and our NMI will go unhandled).
 *
 * And since we cannot set/clear this separate bit together with the EN bit,
 * there are races; if we cleared STARTED early, an NMI could land in
 * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
 * could happen if the period is small enough), and consume our STOPPED bit
 * and trigger streams of unhandled NMIs.
 *
 * If, however, we clear STARTED late, an NMI can hit between clearing the
 * EN bit and clearing STARTED, still see STARTED set and process the event.
 * If this event has the VALID bit clear, we bail properly, but this
 * is not a given. With VALID set we can end up calling pmu::stop() again
 * (the throttle logic) and trigger the WARNs in there.
 *
 * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
 * nesting, and clear STARTED late, so that we have a well defined state over
 * the clearing of the EN bit.
 *
 * XXX: we could probably be using !atomic bitops for all this.
 */

enum ibs_states {
        IBS_ENABLED     = 0,
        IBS_STARTED     = 1,
        IBS_STOPPING    = 2,
        IBS_STOPPED     = 3,

        IBS_MAX_STATES,
};

struct cpu_perf_ibs {
        struct perf_event       *event;
        unsigned long           state[BITS_TO_LONGS(IBS_MAX_STATES)];
};

struct perf_ibs {
        struct pmu                      pmu;
        unsigned int                    msr;
        u64                             config_mask;
        u64                             cnt_mask;
        u64                             enable_mask;
        u64                             valid_mask;
        u64                             max_period;
        unsigned long                   offset_mask[1];
        int                             offset_max;
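        /* Erratum workarounds, set in perf_event_ibs_init() based on CPU family/model: */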
        unsigned int                    fetch_count_reset_broken : 1;
        unsigned int                    fetch_ignore_if_zero_rip : 1;
        struct cpu_perf_ibs __percpu    *pcpu;

        struct attribute                **format_attrs;
        struct attribute_group          format_group;
        const struct attribute_group    *attr_groups[2];

        u64                             (*get_count)(u64 config);
};

struct perf_ibs_data {
        u32             size;
        union {
                u32     data[0];        /* data buffer starts here */
                u32     caps;
        };
        u64             regs[MSR_AMD64_IBS_REG_COUNT_MAX];
};

static int
perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
{
        s64 left = local64_read(&hwc->period_left);
        s64 period = hwc->sample_period;
        int overflow = 0;

        /*
         * If we are way outside a reasonable range then just skip forward:
         */
        if (unlikely(left <= -period)) {
                left = period;
                local64_set(&hwc->period_left, left);
                hwc->last_period = period;
                overflow = 1;
        }

        if (unlikely(left < (s64)min)) {
                left += period;
                local64_set(&hwc->period_left, left);
                hwc->last_period = period;
                overflow = 1;
        }

        /*
         * If the hw period that triggers the sw overflow is too short
         * we might hit the irq handler. This biases the results.
         * Thus we shorten the next-to-last period and set the last
         * period to the max period.
         */
        if (left > max) {
                left -= max;
                if (left > max)
                        left = max;
                else if (left < min)
                        left = min;
        }

        *hw_period = (u64)left;

        return overflow;
}

static int
perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
{
        struct hw_perf_event *hwc = &event->hw;
        int shift = 64 - width;
        u64 prev_raw_count;
        u64 delta;

        /*
         * Careful: an NMI might modify the previous event value.
         *
         * Our tactic to handle this is to first atomically read and
         * exchange a new raw count - then add that new-prev delta
         * count to the generic event atomically:
         */
        prev_raw_count = local64_read(&hwc->prev_count);
        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
                                        new_raw_count) != prev_raw_count)
                return 0;

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (event-)time and add that to the generic event.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count.
         */
        delta = (new_raw_count << shift) - (prev_raw_count << shift);
        delta >>= shift;

        local64_add(delta, &event->count);
        local64_sub(delta, &hwc->period_left);

        return 1;
}

static struct perf_ibs perf_ibs_fetch;
static struct perf_ibs perf_ibs_op;

static struct perf_ibs *get_ibs_pmu(int type)
{
        if (perf_ibs_fetch.pmu.type == type)
                return &perf_ibs_fetch;
        if (perf_ibs_op.pmu.type == type)
                return &perf_ibs_op;
        return NULL;
}

/*
 * Use IBS for precise event sampling:
 *
 *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
 *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
 *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
 *
 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
 * MSRC001_1033) is used to select either cycle or micro-ops counting
 * mode.
 *
 * The rip of IBS samples has skid 0. Thus, IBS supports precise
 * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
 * rip is invalid when IBS was not able to record the rip correctly.
 * We then clear PERF_EFLAGS_EXACT and take the rip from pt_regs.
 */
static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
{
        switch (event->attr.precise_ip) {
        case 0:
                return -ENOENT;
        case 1:
        case 2:
                break;
        default:
                return -EOPNOTSUPP;
        }

        switch (event->attr.type) {
        case PERF_TYPE_HARDWARE:
                switch (event->attr.config) {
                case PERF_COUNT_HW_CPU_CYCLES:
                        *config = 0;
                        return 0;
                }
                break;
        case PERF_TYPE_RAW:
                switch (event->attr.config) {
                case 0x0076:
                        *config = 0;
                        return 0;
                case 0x00C1:
                        *config = IBS_OP_CNT_CTL;
                        return 0;
                }
                break;
        default:
                return -ENOENT;
        }

        return -EOPNOTSUPP;
}

static int perf_ibs_init(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        struct perf_ibs *perf_ibs;
        u64 max_cnt, config;
        int ret;

        perf_ibs = get_ibs_pmu(event->attr.type);
        if (perf_ibs) {
                config = event->attr.config;
        } else {
                perf_ibs = &perf_ibs_op;
                ret = perf_ibs_precise_event(event, &config);
                if (ret)
                        return ret;
        }

        if (event->pmu != &perf_ibs->pmu)
                return -ENOENT;

        if (config & ~perf_ibs->config_mask)
                return -EINVAL;

        if (hwc->sample_period) {
                if (config & perf_ibs->cnt_mask)
                        /* raw max_cnt may not be set */
                        return -EINVAL;
                if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
                        /*
                         * The lower 4 bits cannot be set in the IBS max
                         * count, but we allow them here in case the sample
                         * period gets adjusted later to set a frequency.
                         */
                        return -EINVAL;
                hwc->sample_period &= ~0x0FULL;
                if (!hwc->sample_period)
                        hwc->sample_period = 0x10;
        } else {
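                /*
                 * No sample period given: derive it from the MaxCnt field of
                 * the raw config. The hardware counts in units of 16, hence
                 * the shift by 4.
                 */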
                max_cnt = config & perf_ibs->cnt_mask;
                config &= ~perf_ibs->cnt_mask;
                event->attr.sample_period = max_cnt << 4;
                hwc->sample_period = event->attr.sample_period;
        }

        if (!hwc->sample_period)
                return -EINVAL;

        /*
         * If we modify hwc->sample_period, we also need to update
         * hwc->last_period and hwc->period_left.
         */
        hwc->last_period = hwc->sample_period;
        local64_set(&hwc->period_left, hwc->sample_period);

        hwc->config_base = perf_ibs->msr;
        hwc->config = config;

        return 0;
}

static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
                               struct hw_perf_event *hwc, u64 *period)
{
        int overflow;

        /* ignore lower 4 bits in min count: */
        overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
        local64_set(&hwc->prev_count, 0);

        return overflow;
}

static u64 get_ibs_fetch_count(u64 config)
{
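        /*
         * The IBS_FETCH_CNT field occupies bits 31:16 of the fetch control
         * MSR; shifting the masked value right by 12 extracts it and scales
         * it by 16, matching the units of max_period (IBS_FETCH_MAX_CNT << 4).
         */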
        return (config & IBS_FETCH_CNT) >> 12;
}

static u64 get_ibs_op_count(u64 config)
{
        u64 count = 0;

        /*
         * If the internal 27-bit counter rolled over, the count is MaxCnt
         * and the lower 7 bits of CurCnt are randomized.
         * Otherwise CurCnt has the full 27-bit current counter value.
         */
        if (config & IBS_OP_VAL) {
                count = (config & IBS_OP_MAX_CNT) << 4;
                if (ibs_caps & IBS_CAPS_OPCNTEXT)
                        count += config & IBS_OP_MAX_CNT_EXT_MASK;
        } else if (ibs_caps & IBS_CAPS_RDWROPCNT) {
                count = (config & IBS_OP_CUR_CNT) >> 32;
        }

        return count;
}

static void
perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
                      u64 *config)
{
        u64 count = perf_ibs->get_count(*config);

        /*
         * Set width to 64 since we do not overflow on max width but
         * instead on max count. In perf_ibs_set_period() we clear
         * prev count manually on overflow.
         */
        while (!perf_event_try_update(event, count, 64)) {
                rdmsrl(event->hw.config_base, *config);
                count = perf_ibs->get_count(*config);
        }
}

static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
                                         struct hw_perf_event *hwc, u64 config)
{
        u64 tmp = hwc->config | config;

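        /*
         * On CPUs with the broken fetch-count reset (see
         * perf_event_ibs_init()), writing the MSR does not clear the fetch
         * count; force a 0->1 transition of the enable bit instead.
         */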
        if (perf_ibs->fetch_count_reset_broken)
                wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask);

        wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask);
}

/*
 * Erratum #420 Instruction-Based Sampling Engine May Generate
 * Interrupt that Cannot Be Cleared:
 *
 * Must clear counter mask first, then clear the enable bit. See
 * Revision Guide for AMD Family 10h Processors, Publication #41322.
 */
static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
                                          struct hw_perf_event *hwc, u64 config)
{
        config &= ~perf_ibs->cnt_mask;
        if (boot_cpu_data.x86 == 0x10)
                wrmsrl(hwc->config_base, config);
        config &= ~perf_ibs->enable_mask;
        wrmsrl(hwc->config_base, config);
}

/*
 * We cannot restore the ibs pmu state, so we always need to update
 * the event while stopping it and then reset the state when starting
 * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags
 * in perf_ibs_start()/perf_ibs_stop() and instead always do it.
 */
static void perf_ibs_start(struct perf_event *event, int flags)
{
        struct hw_perf_event *hwc = &event->hw;
        struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
        u64 period, config = 0;

        if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
                return;

        WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
        hwc->state = 0;

        perf_ibs_set_period(perf_ibs, hwc, &period);
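        /*
         * The MaxCnt field is programmed in units of 16: period bits 19:4 go
         * into the low 16 bits of the control MSR (period >> 4), and with
         * OPCNTEXT the extended period bits 26:20 are passed through
         * unshifted.
         */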
        if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
                config |= period & IBS_OP_MAX_CNT_EXT_MASK;
                period &= ~IBS_OP_MAX_CNT_EXT_MASK;
        }
        config |= period >> 4;

        /*
         * Set STARTED before enabling the hardware, such that a subsequent NMI
         * must observe it.
         */
        set_bit(IBS_STARTED,    pcpu->state);
        clear_bit(IBS_STOPPING, pcpu->state);
        perf_ibs_enable_event(perf_ibs, hwc, config);

        perf_event_update_userpage(event);
}

static void perf_ibs_stop(struct perf_event *event, int flags)
{
        struct hw_perf_event *hwc = &event->hw;
        struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
        u64 config;
        int stopping;

        if (test_and_set_bit(IBS_STOPPING, pcpu->state))
                return;

        stopping = test_bit(IBS_STARTED, pcpu->state);

        if (!stopping && (hwc->state & PERF_HES_UPTODATE))
                return;

        rdmsrl(hwc->config_base, config);

        if (stopping) {
                /*
                 * Set STOPPED before disabling the hardware, such that it
                 * must be visible to NMIs the moment we clear the EN bit,
                 * at which point we can generate an !VALID sample which
                 * we need to consume.
                 */
                set_bit(IBS_STOPPED, pcpu->state);
                perf_ibs_disable_event(perf_ibs, hwc, config);
                /*
                 * Clear STARTED after disabling the hardware; if it were
                 * cleared before, an NMI hitting after the clear but before
                 * the EN bit is cleared might be treated as a spurious NMI
                 * and go unhandled.
                 *
                 * Clearing it after, however, creates the problem of the NMI
                 * handler seeing STARTED but not having a valid sample.
                 */
                clear_bit(IBS_STARTED, pcpu->state);
                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
                hwc->state |= PERF_HES_STOPPED;
        }

        if (hwc->state & PERF_HES_UPTODATE)
                return;

        /*
         * Clear the valid bit so that rollovers are not counted on update;
         * rollovers are only accounted in the irq handler.
         */
        config &= ~perf_ibs->valid_mask;

        perf_ibs_event_update(perf_ibs, event, &config);
        hwc->state |= PERF_HES_UPTODATE;
}

static int perf_ibs_add(struct perf_event *event, int flags)
{
        struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);

        if (test_and_set_bit(IBS_ENABLED, pcpu->state))
                return -ENOSPC;

        event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

        pcpu->event = event;

        if (flags & PERF_EF_START)
                perf_ibs_start(event, PERF_EF_RELOAD);

        return 0;
}

static void perf_ibs_del(struct perf_event *event, int flags)
{
        struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);

        if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
                return;

        perf_ibs_stop(event, PERF_EF_UPDATE);

        pcpu->event = NULL;

        perf_event_update_userpage(event);
}

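/* Counts are updated in perf_ibs_stop() and in the NMI handler; nothing to do here. */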
static void perf_ibs_read(struct perf_event *event) { }

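/*
 * sysfs format attributes: rand_en maps to IbsRandEn (bit 57 of the fetch
 * control), cnt_ctl to IbsOpCntCtl (bit 19 of the op control).
 */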
PMU_FORMAT_ATTR(rand_en,        "config:57");
PMU_FORMAT_ATTR(cnt_ctl,        "config:19");

static struct attribute *ibs_fetch_format_attrs[] = {
        &format_attr_rand_en.attr,
        NULL,
};

static struct attribute *ibs_op_format_attrs[] = {
        NULL,   /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
        NULL,
};

static struct perf_ibs perf_ibs_fetch = {
        .pmu = {
                .task_ctx_nr    = perf_invalid_context,

                .event_init     = perf_ibs_init,
                .add            = perf_ibs_add,
                .del            = perf_ibs_del,
                .start          = perf_ibs_start,
                .stop           = perf_ibs_stop,
                .read           = perf_ibs_read,
                .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
        },
        .msr                    = MSR_AMD64_IBSFETCHCTL,
        .config_mask            = IBS_FETCH_CONFIG_MASK,
        .cnt_mask               = IBS_FETCH_MAX_CNT,
        .enable_mask            = IBS_FETCH_ENABLE,
        .valid_mask             = IBS_FETCH_VAL,
        .max_period             = IBS_FETCH_MAX_CNT << 4,
        .offset_mask            = { MSR_AMD64_IBSFETCH_REG_MASK },
        .offset_max             = MSR_AMD64_IBSFETCH_REG_COUNT,
        .format_attrs           = ibs_fetch_format_attrs,

        .get_count              = get_ibs_fetch_count,
};

static struct perf_ibs perf_ibs_op = {
        .pmu = {
                .task_ctx_nr    = perf_invalid_context,

                .event_init     = perf_ibs_init,
                .add            = perf_ibs_add,
                .del            = perf_ibs_del,
                .start          = perf_ibs_start,
                .stop           = perf_ibs_stop,
                .read           = perf_ibs_read,
                .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
        },
        .msr                    = MSR_AMD64_IBSOPCTL,
        .config_mask            = IBS_OP_CONFIG_MASK,
        .cnt_mask               = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
                                  IBS_OP_CUR_CNT_RAND,
        .enable_mask            = IBS_OP_ENABLE,
        .valid_mask             = IBS_OP_VAL,
        .max_period             = IBS_OP_MAX_CNT << 4,
        .offset_mask            = { MSR_AMD64_IBSOP_REG_MASK },
        .offset_max             = MSR_AMD64_IBSOP_REG_COUNT,
        .format_attrs           = ibs_op_format_attrs,

        .get_count              = get_ibs_op_count,
};

static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
        struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
        struct perf_event *event = pcpu->event;
        struct hw_perf_event *hwc;
        struct perf_sample_data data;
        struct perf_raw_record raw;
        struct pt_regs regs;
        struct perf_ibs_data ibs_data;
        int offset, size, check_rip, offset_max, throttle = 0;
        unsigned int msr;
        u64 *buf, *config, period, new_config = 0;

        if (!test_bit(IBS_STARTED, pcpu->state)) {
fail:
                /*
                 * Catch spurious interrupts after stopping IBS: After
                 * disabling IBS there could still be incoming NMIs
                 * with samples that even have the valid bit cleared.
                 * Mark all these NMIs as handled.
                 */
                if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
                        return 1;

                return 0;
        }

        if (WARN_ON_ONCE(!event))
                goto fail;

        hwc = &event->hw;
        msr = hwc->config_base;
        buf = ibs_data.regs;
        rdmsrl(msr, *buf);
        if (!(*buf++ & perf_ibs->valid_mask))
                goto fail;

        config = &ibs_data.regs[0];
        perf_ibs_event_update(perf_ibs, event, config);
        perf_sample_data_init(&data, 0, hwc->last_period);
        if (!perf_ibs_set_period(perf_ibs, hwc, &period))
                goto out;       /* no sw counter overflow */

        ibs_data.caps = ibs_caps;
        size = 1;
        offset = 1;
        check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
        if (event->attr.sample_type & PERF_SAMPLE_RAW)
                offset_max = perf_ibs->offset_max;
        else if (check_rip)
                offset_max = 3;
        else
                offset_max = 1;
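        /*
         * Gather the remaining sample registers; offset_mask marks which MSR
         * offsets exist for this PMU. offset_max = 3 above ensures regs[2]
         * (IbsOpData) is read, which carries the IBS_RIP_INVALID bit checked
         * below.
         */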
        do {
                rdmsrl(msr + offset, *buf++);
                size++;
                offset = find_next_bit(perf_ibs->offset_mask,
                                       perf_ibs->offset_max,
                                       offset + 1);
        } while (offset < offset_max);
        /*
         * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
         * depending on their availability. They can't be added to
         * offset_max as their MSRs are staggered.
         */
        if (event->attr.sample_type & PERF_SAMPLE_RAW) {
                if (perf_ibs == &perf_ibs_op) {
                        if (ibs_caps & IBS_CAPS_BRNTRGT) {
                                rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
                                size++;
                        }
                        if (ibs_caps & IBS_CAPS_OPDATA4) {
                                rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
                                size++;
                        }
                }
                if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
                        rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++);
                        size++;
                }
        }
        ibs_data.size = sizeof(u64) * size;

        regs = *iregs;
        if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
                regs.flags &= ~PERF_EFLAGS_EXACT;
        } else {
                /* Workaround for erratum #1197 */
                if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
                        goto out;

                set_linear_ip(&regs, ibs_data.regs[1]);
                regs.flags |= PERF_EFLAGS_EXACT;
        }

        if (event->attr.sample_type & PERF_SAMPLE_RAW) {
                raw = (struct perf_raw_record){
                        .frag = {
                                .size = sizeof(u32) + ibs_data.size,
                                .data = ibs_data.data,
                        },
                };
                data.raw = &raw;
        }

        throttle = perf_event_overflow(event, &data, &regs);
out:
        if (throttle) {
                perf_ibs_stop(event, 0);
        } else {
                if (perf_ibs == &perf_ibs_op) {
                        if (ibs_caps & IBS_CAPS_OPCNTEXT) {
                                new_config = period & IBS_OP_MAX_CNT_EXT_MASK;
                                period &= ~IBS_OP_MAX_CNT_EXT_MASK;
                        }
                        if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL))
                                new_config |= *config & IBS_OP_CUR_CNT_RAND;
                }
                new_config |= period >> 4;

                perf_ibs_enable_event(perf_ibs, hwc, new_config);
        }

        perf_event_update_userpage(event);

        return 1;
}

static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
        u64 stamp = sched_clock();
        int handled = 0;

        handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
        handled += perf_ibs_handle_irq(&perf_ibs_op, regs);

        if (handled)
                inc_irq_stat(apic_perf_irqs);

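        /* Report the time spent in the handler so perf can throttle the sampling rate. */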
        perf_sample_event_took(sched_clock() - stamp);

        return handled;
}
NOKPROBE_SYMBOL(perf_ibs_nmi_handler);

static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
        struct cpu_perf_ibs __percpu *pcpu;
        int ret;

        pcpu = alloc_percpu(struct cpu_perf_ibs);
        if (!pcpu)
                return -ENOMEM;

        perf_ibs->pcpu = pcpu;

        /* register attributes */
        if (perf_ibs->format_attrs[0]) {
                memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
                perf_ibs->format_group.name     = "format";
                perf_ibs->format_group.attrs    = perf_ibs->format_attrs;

                memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
                perf_ibs->attr_groups[0]        = &perf_ibs->format_group;
                perf_ibs->pmu.attr_groups       = perf_ibs->attr_groups;
        }

        ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
        if (ret) {
                perf_ibs->pcpu = NULL;
                free_percpu(pcpu);
        }

        return ret;
}

static __init void perf_event_ibs_init(void)
{
        struct attribute **attr = ibs_op_format_attrs;

        /*
         * Some chips fail to reset the fetch count when it is written; instead
         * they need a 0-1 transition of IbsFetchEn.
         */
        if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
                perf_ibs_fetch.fetch_count_reset_broken = 1;

        if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
                perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;

        perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");

        if (ibs_caps & IBS_CAPS_OPCNT) {
                perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
                *attr++ = &format_attr_cnt_ctl.attr;
        }

        if (ibs_caps & IBS_CAPS_OPCNTEXT) {
                perf_ibs_op.max_period  |= IBS_OP_MAX_CNT_EXT_MASK;
                perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK;
                perf_ibs_op.cnt_mask    |= IBS_OP_MAX_CNT_EXT_MASK;
        }

        perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");

        register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
        pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
}

#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */

static __init void perf_event_ibs_init(void) { }

#endif

/* IBS - apic initialization, for perf and oprofile */

static __init u32 __get_ibs_caps(void)
{
        u32 caps;
        unsigned int max_level;

        if (!boot_cpu_has(X86_FEATURE_IBS))
                return 0;

        /* check IBS cpuid feature flags */
        max_level = cpuid_eax(0x80000000);
        if (max_level < IBS_CPUID_FEATURES)
                return IBS_CAPS_DEFAULT;

        caps = cpuid_eax(IBS_CPUID_FEATURES);
        if (!(caps & IBS_CAPS_AVAIL))
                /* cpuid flags not valid */
                return IBS_CAPS_DEFAULT;

        return caps;
}

u32 get_ibs_caps(void)
{
        return ibs_caps;
}

EXPORT_SYMBOL(get_ibs_caps);

static inline int get_eilvt(int offset)
{
        return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
}

static inline int put_eilvt(int offset)
{
        return !setup_APIC_eilvt(offset, 0, 0, 1);
}

/*
 * Check and reserve APIC extended interrupt LVT offset for IBS if available.
 */
static inline int ibs_eilvt_valid(void)
{
        int offset;
        u64 val;
        int valid = 0;

        preempt_disable();

        rdmsrl(MSR_AMD64_IBSCTL, val);
        offset = val & IBSCTL_LVT_OFFSET_MASK;

        if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
                pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
                       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
                goto out;
        }

        if (!get_eilvt(offset)) {
                pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
                       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
                goto out;
        }

        valid = 1;
out:
        preempt_enable();

        return valid;
}

static int setup_ibs_ctl(int ibs_eilvt_off)
{
        struct pci_dev *cpu_cfg;
        int nodes;
        u32 value = 0;

        nodes = 0;
        cpu_cfg = NULL;
        do {
                cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
                                         PCI_DEVICE_ID_AMD_10H_NB_MISC,
                                         cpu_cfg);
                if (!cpu_cfg)
                        break;
                ++nodes;
                pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
                                       | IBSCTL_LVT_OFFSET_VALID);
                pci_read_config_dword(cpu_cfg, IBSCTL, &value);
                if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
                        pci_dev_put(cpu_cfg);
                        pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
                                 value);
                        return -EINVAL;
                }
        } while (1);

        if (!nodes) {
                pr_debug("No CPU node configured for IBS\n");
                return -ENODEV;
        }

        return 0;
}

/*
 * This runs only on the current cpu. We try to find an LVT offset and
 * set up the local APIC. For this we must disable preemption. On
 * success we initialize all nodes with this offset. This then updates
 * the offset in the per-node IBS_CTL MSR. The per-core APIC setup of
 * the IBS interrupt vector is handled by perf_ibs_cpu_notifier, which
 * uses the new offset.
 */
static void force_ibs_eilvt_setup(void)
{
        int offset;
        int ret;

        preempt_disable();
        /* find the next free available EILVT entry, skip offset 0 */
        for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
                if (get_eilvt(offset))
                        break;
        }
        preempt_enable();

        if (offset == APIC_EILVT_NR_MAX) {
                pr_debug("No EILVT entry available\n");
                return;
        }

        ret = setup_ibs_ctl(offset);
        if (ret)
                goto out;

        if (!ibs_eilvt_valid())
                goto out;

        pr_info("LVT offset %d assigned\n", offset);

        return;
out:
        preempt_disable();
        put_eilvt(offset);
        preempt_enable();
        return;
}

static void ibs_eilvt_setup(void)
{
        /*
         * Force LVT offset assignment for family 10h: The offsets are
         * not assigned by the BIOS for this family, so the OS is
         * responsible for doing it. If the OS assignment fails, we fall
         * back to the BIOS settings and try to use those.
         */
        if (boot_cpu_data.x86 == 0x10)
                force_ibs_eilvt_setup();
}

static inline int get_ibs_lvt_offset(void)
{
        u64 val;

        rdmsrl(MSR_AMD64_IBSCTL, val);
        if (!(val & IBSCTL_LVT_OFFSET_VALID))
                return -EINVAL;

        return val & IBSCTL_LVT_OFFSET_MASK;
}

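/* Program this CPU's extended LVT entry so that IBS interrupts are delivered as NMIs. */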
static void setup_APIC_ibs(void)
{
        int offset;

        offset = get_ibs_lvt_offset();
        if (offset < 0)
                goto failed;

        if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
                return;
failed:
        pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
                smp_processor_id());
}

static void clear_APIC_ibs(void)
{
        int offset;

        offset = get_ibs_lvt_offset();
        if (offset >= 0)
                setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
}

static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
{
        setup_APIC_ibs();
        return 0;
}

#ifdef CONFIG_PM

static int perf_ibs_suspend(void)
{
        clear_APIC_ibs();
        return 0;
}

static void perf_ibs_resume(void)
{
        ibs_eilvt_setup();
        setup_APIC_ibs();
}

static struct syscore_ops perf_ibs_syscore_ops = {
        .resume         = perf_ibs_resume,
        .suspend        = perf_ibs_suspend,
};

static void perf_ibs_pm_init(void)
{
        register_syscore_ops(&perf_ibs_syscore_ops);
}

#else

static inline void perf_ibs_pm_init(void) { }

#endif

static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu)
{
        clear_APIC_ibs();
        return 0;
}

static __init int amd_ibs_init(void)
{
        u32 caps;

        caps = __get_ibs_caps();
        if (!caps)
                return -ENODEV; /* ibs not supported by the cpu */

        ibs_eilvt_setup();

        if (!ibs_eilvt_valid())
                return -EINVAL;

        perf_ibs_pm_init();

        ibs_caps = caps;
        /* make ibs_caps visible to other cpus: */
        smp_mb();
        /*
         * x86_pmu_amd_ibs_starting_cpu will be called from core on
         * all online cpus.
         */
        cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
                          "perf/x86/amd/ibs:starting",
                          x86_pmu_amd_ibs_starting_cpu,
                          x86_pmu_amd_ibs_dying_cpu);

        perf_event_ibs_init();

        return 0;
}

/* Since we need the pci subsystem to init ibs we can't do this earlier: */
device_initcall(amd_ibs_init);