linux/arch/x86/events/intel/ds.c
   1// SPDX-License-Identifier: GPL-2.0
   2#include <linux/bitops.h>
   3#include <linux/types.h>
   4#include <linux/slab.h>
   5
   6#include <asm/cpu_entry_area.h>
   7#include <asm/perf_event.h>
   8#include <asm/tlbflush.h>
   9#include <asm/insn.h>
  10#include <asm/io.h>
  11
  12#include "../perf_event.h"
  13
  14/* Waste a full page so it can be mapped into the cpu_entry_area */
  15DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
  16
  17/* The size of a BTS record in bytes: */
  18#define BTS_RECORD_SIZE         24
  19
  20#define PEBS_FIXUP_SIZE         PAGE_SIZE
  21
  22/*
  23 * pebs_record_32 for p4 and core not supported
  24
  25struct pebs_record_32 {
  26        u32 flags, ip;
  27        u32 ax, bx, cx, dx;
  28        u32 si, di, bp, sp;
  29};
  30
  31 */
  32
  33union intel_x86_pebs_dse {
  34        u64 val;
  35        struct {
  36                unsigned int ld_dse:4;
  37                unsigned int ld_stlb_miss:1;
  38                unsigned int ld_locked:1;
  39                unsigned int ld_data_blk:1;
  40                unsigned int ld_addr_blk:1;
  41                unsigned int ld_reserved:24;
  42        };
  43        struct {
  44                unsigned int st_l1d_hit:1;
  45                unsigned int st_reserved1:3;
  46                unsigned int st_stlb_miss:1;
  47                unsigned int st_locked:1;
  48                unsigned int st_reserved2:26;
  49        };
  50        struct {
  51                unsigned int st_lat_dse:4;
  52                unsigned int st_lat_stlb_miss:1;
  53                unsigned int st_lat_locked:1;
  54                unsigned int ld_reserved3:26;
  55        };
  56};
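
/*
 * Illustrative sketch (not part of the upstream file): decoding a raw PEBS
 * data-source value through the union above. The value 0x31 is a made-up
 * example: bits 0-3 (ld_dse) = 0x1, bit 4 (ld_stlb_miss) = 1 and
 * bit 5 (ld_locked) = 1, i.e. a locked load that missed the STLB.
 */
static inline bool pebs_dse_decode_example(void)
{
        union intel_x86_pebs_dse dse;

        dse.val = 0x31;
        /* ld_dse indexes pebs_data_source[]; the flag bits qualify it. */
        return dse.ld_dse == 0x1 && dse.ld_stlb_miss && dse.ld_locked;
}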
  57
  58
  59/*
  60 * Map PEBS Load Latency Data Source encodings to generic
  61 * memory data source information
  62 */
  63#define P(a, b) PERF_MEM_S(a, b)
  64#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
  65#define LEVEL(x) P(LVLNUM, x)
  66#define REM P(REMOTE, REMOTE)
  67#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
  68
  69/* Version for Sandy Bridge and later */
  70static u64 pebs_data_source[] = {
  71        P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
  72        OP_LH | P(LVL, L1)  | LEVEL(L1) | P(SNOOP, NONE),  /* 0x01: L1 local */
  73        OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
  74        OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, NONE),  /* 0x03: L2 hit */
  75        OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, NONE),  /* 0x04: L3 hit */
  76        OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, MISS),  /* 0x05: L3 hit, snoop miss */
  77        OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, HIT),   /* 0x06: L3 hit, snoop hit */
  78        OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, HITM),  /* 0x07: L3 hit, snoop hitm */
  79        OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
  80        OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
  81        OP_LH | P(LVL, LOC_RAM)  | LEVEL(RAM) | P(SNOOP, HIT),       /* 0x0a: L3 miss, shared */
  82        OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
  83        OP_LH | P(LVL, LOC_RAM)  | LEVEL(RAM) | SNOOP_NONE_MISS,     /* 0x0c: L3 miss, excl */
  84        OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
  85        OP_LH | P(LVL, IO)  | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
  86        OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
  87};
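
/*
 * For reference, one worked expansion of the composition above: entry 0x01
 * (L1 local hit) is OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), which
 * with PERF_MEM_S(a, s) = PERF_MEM_##a##_##s << PERF_MEM_##a##_SHIFT becomes
 *
 *   (PERF_MEM_OP_LOAD    << PERF_MEM_OP_SHIFT)     |
 *   (PERF_MEM_LVL_HIT    << PERF_MEM_LVL_SHIFT)    |
 *   (PERF_MEM_LVL_L1     << PERF_MEM_LVL_SHIFT)    |
 *   (PERF_MEM_LVLNUM_L1  << PERF_MEM_LVLNUM_SHIFT) |
 *   (PERF_MEM_SNOOP_NONE << PERF_MEM_SNOOP_SHIFT)
 *
 * i.e. the 4-bit hardware encoding is translated into the generic
 * perf_mem_data_src bit layout consumed by the perf tooling.
 */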
  88
  89/* Patch up minor differences in the bits */
  90void __init intel_pmu_pebs_data_source_nhm(void)
  91{
  92        pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
  93        pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
  94        pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
  95}
  96
  97static void __init __intel_pmu_pebs_data_source_skl(bool pmem, u64 *data_source)
  98{
  99        u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
 100
 101        data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
 102        data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
 103        data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
 104        data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
 105        data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
 106}
 107
 108void __init intel_pmu_pebs_data_source_skl(bool pmem)
 109{
 110        __intel_pmu_pebs_data_source_skl(pmem, pebs_data_source);
 111}
 112
 113static void __init __intel_pmu_pebs_data_source_grt(u64 *data_source)
 114{
 115        data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
 116        data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
 117        data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
 118}
 119
 120void __init intel_pmu_pebs_data_source_grt(void)
 121{
 122        __intel_pmu_pebs_data_source_grt(pebs_data_source);
 123}
 124
 125void __init intel_pmu_pebs_data_source_adl(void)
 126{
 127        u64 *data_source;
 128
 129        data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
 130        memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
 131        __intel_pmu_pebs_data_source_skl(false, data_source);
 132
 133        data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
 134        memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
 135        __intel_pmu_pebs_data_source_grt(data_source);
 136}
 137
 138static u64 precise_store_data(u64 status)
 139{
 140        union intel_x86_pebs_dse dse;
 141        u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
 142
 143        dse.val = status;
 144
 145        /*
 146         * bit 4: TLB access
 147         * 1 = store missed 2nd level TLB
 148         *
 149         * so it either hit the page walker or required OS assistance;
 150         * otherwise it hit the 2nd level TLB
 151         */
 152        if (dse.st_stlb_miss)
 153                val |= P(TLB, MISS);
 154        else
 155                val |= P(TLB, HIT);
 156
 157        /*
 158         * bit 0: hit L1 data cache
 159         * if not set, then all we know is that
 160         * it missed L1D
 161         */
 162        if (dse.st_l1d_hit)
 163                val |= P(LVL, HIT);
 164        else
 165                val |= P(LVL, MISS);
 166
 167        /*
 168         * bit 5: Locked prefix
 169         */
 170        if (dse.st_locked)
 171                val |= P(LOCK, LOCKED);
 172
 173        return val;
 174}
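
/*
 * Illustrative sketch (not upstream): a store-facility status of 0x1
 * (st_l1d_hit = 1, st_stlb_miss = 0, st_locked = 0) decodes to a store
 * that hit both L1D and the 2nd level TLB, with no lock prefix.
 */
static inline u64 precise_store_data_example(void)
{
        /*
         * == P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(LVL, HIT) |
         *    P(TLB, L2) | P(TLB, HIT)
         */
        return precise_store_data(0x1);
}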
 175
 176static u64 precise_datala_hsw(struct perf_event *event, u64 status)
 177{
 178        union perf_mem_data_src dse;
 179
 180        dse.val = PERF_MEM_NA;
 181
 182        if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
 183                dse.mem_op = PERF_MEM_OP_STORE;
 184        else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
 185                dse.mem_op = PERF_MEM_OP_LOAD;
 186
 187        /*
 188         * L1 info only valid for following events:
 189         *
 190         * MEM_UOPS_RETIRED.STLB_MISS_STORES
 191         * MEM_UOPS_RETIRED.LOCK_STORES
 192         * MEM_UOPS_RETIRED.SPLIT_STORES
 193         * MEM_UOPS_RETIRED.ALL_STORES
 194         */
 195        if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
 196                if (status & 1)
 197                        dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
 198                else
 199                        dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
 200        }
 201        return dse.val;
 202}
 203
 204static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock)
 205{
 206        /*
 207         * TLB access
 208         * 0 = did not miss 2nd level TLB
 209         * 1 = missed 2nd level TLB
 210         */
 211        if (tlb)
 212                *val |= P(TLB, MISS) | P(TLB, L2);
 213        else
 214                *val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
 215
 216        /* locked prefix */
 217        if (lock)
 218                *val |= P(LOCK, LOCKED);
 219}
 220
 221/* Retrieve the latency data for e-core of ADL */
 222u64 adl_latency_data_small(struct perf_event *event, u64 status)
 223{
 224        union intel_x86_pebs_dse dse;
 225        u64 val;
 226
 227        WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big);
 228
 229        dse.val = status;
 230
 231        val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
 232
 233        /*
 234         * For the atom core on ADL,
 235         * bit 4: lock, bit 5: TLB access.
 236         */
 237        pebs_set_tlb_lock(&val, dse.ld_locked, dse.ld_stlb_miss);
 238
 239        if (dse.ld_data_blk)
 240                val |= P(BLK, DATA);
 241        else
 242                val |= P(BLK, NA);
 243
 244        return val;
 245}
 246
 247static u64 load_latency_data(struct perf_event *event, u64 status)
 248{
 249        union intel_x86_pebs_dse dse;
 250        u64 val;
 251
 252        dse.val = status;
 253
 254        /*
 255         * use the mapping table for bit 0-3
 256         */
 257        val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
 258
 259        /*
 260         * Nehalem models do not support TLB or lock info
 261         */
 262        if (x86_pmu.pebs_no_tlb) {
 263                val |= P(TLB, NA) | P(LOCK, NA);
 264                return val;
 265        }
 266
 267        pebs_set_tlb_lock(&val, dse.ld_stlb_miss, dse.ld_locked);
 268
 269        /*
 270         * Ice Lake and earlier models do not support block infos.
 271         */
 272        if (!x86_pmu.pebs_block) {
 273                val |= P(BLK, NA);
 274                return val;
 275        }
 276        /*
 277         * bit 6: load was blocked since its data could not be forwarded
 278         *        from a preceding store
 279         */
 280        if (dse.ld_data_blk)
 281                val |= P(BLK, DATA);
 282
 283        /*
 284         * bit 7: load was blocked due to potential address conflict with
 285         *        a preceding store
 286         */
 287        if (dse.ld_addr_blk)
 288                val |= P(BLK, ADDR);
 289
 290        if (!dse.ld_data_blk && !dse.ld_addr_blk)
 291                val |= P(BLK, NA);
 292
 293        return val;
 294}
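
/*
 * Worked example (hypothetical status value): on a part with TLB/lock info
 * (!x86_pmu.pebs_no_tlb) and block info (x86_pmu.pebs_block), a dse value
 * of 0x52 decodes as
 *   bits 0-3 = 0x2 -> pebs_data_source[0x02], an LFB hit,
 *   bit 4    = 1   -> STLB miss      -> P(TLB, MISS) | P(TLB, L2),
 *   bit 5    = 0   -> not locked     -> no P(LOCK, LOCKED),
 *   bit 6    = 1   -> data blocked   -> P(BLK, DATA),
 *   bit 7    = 0   -> no address-conflict block.
 */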
 295
 296static u64 store_latency_data(struct perf_event *event, u64 status)
 297{
 298        union intel_x86_pebs_dse dse;
 299        union perf_mem_data_src src;
 300        u64 val;
 301
 302        dse.val = status;
 303
 304        /*
 305         * use the mapping table for bit 0-3
 306         */
 307        val = hybrid_var(event->pmu, pebs_data_source)[dse.st_lat_dse];
 308
 309        pebs_set_tlb_lock(&val, dse.st_lat_stlb_miss, dse.st_lat_locked);
 310
 311        val |= P(BLK, NA);
 312
 313        /*
 314         * the pebs_data_source table is only for loads
 315         * so override the mem_op to say STORE instead
 316         */
 317        src.val = val;
 318        src.mem_op = P(OP, STORE);
 319
 320        return src.val;
 321}
 322
 323struct pebs_record_core {
 324        u64 flags, ip;
 325        u64 ax, bx, cx, dx;
 326        u64 si, di, bp, sp;
 327        u64 r8,  r9,  r10, r11;
 328        u64 r12, r13, r14, r15;
 329};
 330
 331struct pebs_record_nhm {
 332        u64 flags, ip;
 333        u64 ax, bx, cx, dx;
 334        u64 si, di, bp, sp;
 335        u64 r8,  r9,  r10, r11;
 336        u64 r12, r13, r14, r15;
 337        u64 status, dla, dse, lat;
 338};
 339
 340/*
 341 * Same as pebs_record_nhm, with two additional fields.
 342 */
 343struct pebs_record_hsw {
 344        u64 flags, ip;
 345        u64 ax, bx, cx, dx;
 346        u64 si, di, bp, sp;
 347        u64 r8,  r9,  r10, r11;
 348        u64 r12, r13, r14, r15;
 349        u64 status, dla, dse, lat;
 350        u64 real_ip, tsx_tuning;
 351};
 352
 353union hsw_tsx_tuning {
 354        struct {
 355                u32 cycles_last_block     : 32,
 356                    hle_abort             : 1,
 357                    rtm_abort             : 1,
 358                    instruction_abort     : 1,
 359                    non_instruction_abort : 1,
 360                    retry                 : 1,
 361                    data_conflict         : 1,
 362                    capacity_writes       : 1,
 363                    capacity_reads        : 1;
 364        };
 365        u64         value;
 366};
 367
 368#define PEBS_HSW_TSX_FLAGS      0xff00000000ULL
 369
 370/* Same as HSW, plus TSC */
 371
 372struct pebs_record_skl {
 373        u64 flags, ip;
 374        u64 ax, bx, cx, dx;
 375        u64 si, di, bp, sp;
 376        u64 r8,  r9,  r10, r11;
 377        u64 r12, r13, r14, r15;
 378        u64 status, dla, dse, lat;
 379        u64 real_ip, tsx_tuning;
 380        u64 tsc;
 381};
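
/*
 * For reference, the layouts above are all-u64 and therefore unpadded:
 * sizeof(struct pebs_record_core) == 144, pebs_record_nhm == 176,
 * pebs_record_hsw == 192 and pebs_record_skl == 200 bytes; the size in
 * effect for a given PEBS format is carried in x86_pmu.pebs_record_size.
 */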
 382
 383void init_debug_store_on_cpu(int cpu)
 384{
 385        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 386
 387        if (!ds)
 388                return;
 389
 390        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
 391                     (u32)((u64)(unsigned long)ds),
 392                     (u32)((u64)(unsigned long)ds >> 32));
 393}
 394
 395void fini_debug_store_on_cpu(int cpu)
 396{
 397        if (!per_cpu(cpu_hw_events, cpu).ds)
 398                return;
 399
 400        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 401}
 402
 403static DEFINE_PER_CPU(void *, insn_buffer);
 404
 405static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 406{
 407        unsigned long start = (unsigned long)cea;
 408        phys_addr_t pa;
 409        size_t msz = 0;
 410
 411        pa = virt_to_phys(addr);
 412
 413        preempt_disable();
 414        for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
 415                cea_set_pte(cea, pa, prot);
 416
 417        /*
 418         * This is a cross-CPU update of the cpu_entry_area, we must shoot down
 419         * all TLB entries for it.
 420         */
 421        flush_tlb_kernel_range(start, start + size);
 422        preempt_enable();
 423}
 424
 425static void ds_clear_cea(void *cea, size_t size)
 426{
 427        unsigned long start = (unsigned long)cea;
 428        size_t msz = 0;
 429
 430        preempt_disable();
 431        for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
 432                cea_set_pte(cea, 0, PAGE_NONE);
 433
 434        flush_tlb_kernel_range(start, start + size);
 435        preempt_enable();
 436}
 437
 438static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
 439{
 440        unsigned int order = get_order(size);
 441        int node = cpu_to_node(cpu);
 442        struct page *page;
 443
 444        page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
 445        return page ? page_address(page) : NULL;
 446}
 447
 448static void dsfree_pages(const void *buffer, size_t size)
 449{
 450        if (buffer)
 451                free_pages((unsigned long)buffer, get_order(size));
 452}
 453
 454static int alloc_pebs_buffer(int cpu)
 455{
 456        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
 457        struct debug_store *ds = hwev->ds;
 458        size_t bsiz = x86_pmu.pebs_buffer_size;
 459        int max, node = cpu_to_node(cpu);
 460        void *buffer, *insn_buff, *cea;
 461
 462        if (!x86_pmu.pebs)
 463                return 0;
 464
 465        buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
 466        if (unlikely(!buffer))
 467                return -ENOMEM;
 468
 469        /*
 470         * HSW+ already provides us the eventing ip; no need to allocate this
 471         * buffer then.
 472         */
 473        if (x86_pmu.intel_cap.pebs_format < 2) {
 474                insn_buff = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 475                if (!insn_buff) {
 476                        dsfree_pages(buffer, bsiz);
 477                        return -ENOMEM;
 478                }
 479                per_cpu(insn_buffer, cpu) = insn_buff;
 480        }
 481        hwev->ds_pebs_vaddr = buffer;
 482        /* Update the cpu entry area mapping */
 483        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
 484        ds->pebs_buffer_base = (unsigned long) cea;
 485        ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
 486        ds->pebs_index = ds->pebs_buffer_base;
 487        max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
 488        ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
 489        return 0;
 490}
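
/*
 * Worked example (assuming the default 64 KiB PEBS buffer and the 192-byte
 * pebs_record_hsw format): max = 192 * (65536 / 192) = 192 * 341 = 65472,
 * so pebs_absolute_maximum lands 64 bytes short of the end of the buffer;
 * the rounding simply discards the tail that cannot hold a whole record.
 */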
 491
 492static void release_pebs_buffer(int cpu)
 493{
 494        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
 495        void *cea;
 496
 497        if (!x86_pmu.pebs)
 498                return;
 499
 500        kfree(per_cpu(insn_buffer, cpu));
 501        per_cpu(insn_buffer, cpu) = NULL;
 502
 503        /* Clear the fixmap */
 504        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
 505        ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
 506        dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
 507        hwev->ds_pebs_vaddr = NULL;
 508}
 509
 510static int alloc_bts_buffer(int cpu)
 511{
 512        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
 513        struct debug_store *ds = hwev->ds;
 514        void *buffer, *cea;
 515        int max;
 516
 517        if (!x86_pmu.bts)
 518                return 0;
 519
 520        buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
 521        if (unlikely(!buffer)) {
 522                WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 523                return -ENOMEM;
 524        }
 525        hwev->ds_bts_vaddr = buffer;
 526        /* Update the fixmap */
 527        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
 528        ds->bts_buffer_base = (unsigned long) cea;
 529        ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
 530        ds->bts_index = ds->bts_buffer_base;
 531        max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
 532        ds->bts_absolute_maximum = ds->bts_buffer_base +
 533                                        max * BTS_RECORD_SIZE;
 534        ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
 535                                        (max / 16) * BTS_RECORD_SIZE;
 536        return 0;
 537}
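
/*
 * Worked example (assuming a 64 KiB BTS buffer): max = 65536 / 24 = 2730
 * records, so bts_absolute_maximum sits at base + 2730 * 24 = base + 65520
 * and the interrupt threshold (2730 / 16) * 24 = 4080 bytes below it; the
 * PMI therefore fires when roughly 15/16 of the buffer is full, which is
 * where the "n <= 2560" figure in intel_pmu_drain_bts_buffer() comes from.
 */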
 538
 539static void release_bts_buffer(int cpu)
 540{
 541        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
 542        void *cea;
 543
 544        if (!x86_pmu.bts)
 545                return;
 546
 547        /* Clear the fixmap */
 548        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
 549        ds_clear_cea(cea, BTS_BUFFER_SIZE);
 550        dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
 551        hwev->ds_bts_vaddr = NULL;
 552}
 553
 554static int alloc_ds_buffer(int cpu)
 555{
 556        struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
 557
 558        memset(ds, 0, sizeof(*ds));
 559        per_cpu(cpu_hw_events, cpu).ds = ds;
 560        return 0;
 561}
 562
 563static void release_ds_buffer(int cpu)
 564{
 565        per_cpu(cpu_hw_events, cpu).ds = NULL;
 566}
 567
 568void release_ds_buffers(void)
 569{
 570        int cpu;
 571
 572        if (!x86_pmu.bts && !x86_pmu.pebs)
 573                return;
 574
 575        for_each_possible_cpu(cpu)
 576                release_ds_buffer(cpu);
 577
 578        for_each_possible_cpu(cpu) {
 579                /*
 580                 * Again, ignore errors from offline CPUs; they will no longer
 581                 * observe cpu_hw_events.ds and so will not program the DS_AREA
 582                 * when they come up.
 583                 */
 584                fini_debug_store_on_cpu(cpu);
 585        }
 586
 587        for_each_possible_cpu(cpu) {
 588                release_pebs_buffer(cpu);
 589                release_bts_buffer(cpu);
 590        }
 591}
 592
 593void reserve_ds_buffers(void)
 594{
 595        int bts_err = 0, pebs_err = 0;
 596        int cpu;
 597
 598        x86_pmu.bts_active = 0;
 599        x86_pmu.pebs_active = 0;
 600
 601        if (!x86_pmu.bts && !x86_pmu.pebs)
 602                return;
 603
 604        if (!x86_pmu.bts)
 605                bts_err = 1;
 606
 607        if (!x86_pmu.pebs)
 608                pebs_err = 1;
 609
 610        for_each_possible_cpu(cpu) {
 611                if (alloc_ds_buffer(cpu)) {
 612                        bts_err = 1;
 613                        pebs_err = 1;
 614                }
 615
 616                if (!bts_err && alloc_bts_buffer(cpu))
 617                        bts_err = 1;
 618
 619                if (!pebs_err && alloc_pebs_buffer(cpu))
 620                        pebs_err = 1;
 621
 622                if (bts_err && pebs_err)
 623                        break;
 624        }
 625
 626        if (bts_err) {
 627                for_each_possible_cpu(cpu)
 628                        release_bts_buffer(cpu);
 629        }
 630
 631        if (pebs_err) {
 632                for_each_possible_cpu(cpu)
 633                        release_pebs_buffer(cpu);
 634        }
 635
 636        if (bts_err && pebs_err) {
 637                for_each_possible_cpu(cpu)
 638                        release_ds_buffer(cpu);
 639        } else {
 640                if (x86_pmu.bts && !bts_err)
 641                        x86_pmu.bts_active = 1;
 642
 643                if (x86_pmu.pebs && !pebs_err)
 644                        x86_pmu.pebs_active = 1;
 645
 646                for_each_possible_cpu(cpu) {
 647                        /*
 648                         * Ignore wrmsr_on_cpu() errors for offline CPUs; they
 649                         * will get this call through intel_pmu_cpu_starting().
 650                         */
 651                        init_debug_store_on_cpu(cpu);
 652                }
 653        }
 654}
 655
 656/*
 657 * BTS
 658 */
 659
 660struct event_constraint bts_constraint =
 661        EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
 662
 663void intel_pmu_enable_bts(u64 config)
 664{
 665        unsigned long debugctlmsr;
 666
 667        debugctlmsr = get_debugctlmsr();
 668
 669        debugctlmsr |= DEBUGCTLMSR_TR;
 670        debugctlmsr |= DEBUGCTLMSR_BTS;
 671        if (config & ARCH_PERFMON_EVENTSEL_INT)
 672                debugctlmsr |= DEBUGCTLMSR_BTINT;
 673
 674        if (!(config & ARCH_PERFMON_EVENTSEL_OS))
 675                debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
 676
 677        if (!(config & ARCH_PERFMON_EVENTSEL_USR))
 678                debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
 679
 680        update_debugctlmsr(debugctlmsr);
 681}
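
/*
 * Illustrative sketch (not upstream): the DEBUGCTLMSR bits the function
 * above sets for a user-space-only BTS event with interrupts enabled,
 * i.e. a config that has ARCH_PERFMON_EVENTSEL_INT and
 * ARCH_PERFMON_EVENTSEL_USR set but not ARCH_PERFMON_EVENTSEL_OS.
 */
static inline unsigned long bts_debugctl_bits_example(void)
{
        return DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
               DEBUGCTLMSR_BTS_OFF_OS;
}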
 682
 683void intel_pmu_disable_bts(void)
 684{
 685        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 686        unsigned long debugctlmsr;
 687
 688        if (!cpuc->ds)
 689                return;
 690
 691        debugctlmsr = get_debugctlmsr();
 692
 693        debugctlmsr &=
 694                ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
 695                  DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
 696
 697        update_debugctlmsr(debugctlmsr);
 698}
 699
 700int intel_pmu_drain_bts_buffer(void)
 701{
 702        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 703        struct debug_store *ds = cpuc->ds;
 704        struct bts_record {
 705                u64     from;
 706                u64     to;
 707                u64     flags;
 708        };
 709        struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
 710        struct bts_record *at, *base, *top;
 711        struct perf_output_handle handle;
 712        struct perf_event_header header;
 713        struct perf_sample_data data;
 714        unsigned long skip = 0;
 715        struct pt_regs regs;
 716
 717        if (!event)
 718                return 0;
 719
 720        if (!x86_pmu.bts_active)
 721                return 0;
 722
 723        base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
 724        top  = (struct bts_record *)(unsigned long)ds->bts_index;
 725
 726        if (top <= base)
 727                return 0;
 728
 729        memset(&regs, 0, sizeof(regs));
 730
 731        ds->bts_index = ds->bts_buffer_base;
 732
 733        perf_sample_data_init(&data, 0, event->hw.last_period);
 734
 735        /*
 736         * BTS leaks kernel addresses in branches across the cpl boundary,
 737         * such as traps or system calls, so unless the user is asking for
 738         * kernel tracing (and right now it's not possible), we'd need to
 739         * filter them out. But first we need to count how many of those we
 740         * have in the current batch. This is an extra O(n) pass, however,
 741         * it's much faster than the other one especially considering that
 742         * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
 743         * alloc_bts_buffer()).
 744         */
 745        for (at = base; at < top; at++) {
 746                /*
 747                 * Note that right now *this* BTS code only works if
 748                 * attr::exclude_kernel is set, but let's keep this extra
 749                 * check here in case that changes.
 750                 */
 751                if (event->attr.exclude_kernel &&
 752                    (kernel_ip(at->from) || kernel_ip(at->to)))
 753                        skip++;
 754        }
 755
 756        /*
 757         * Prepare a generic sample, i.e. fill in the invariant fields.
 758         * We will overwrite the from and to address before we output
 759         * the sample.
 760         */
 761        rcu_read_lock();
 762        perf_prepare_sample(&header, &data, event, &regs);
 763
 764        if (perf_output_begin(&handle, &data, event,
 765                              header.size * (top - base - skip)))
 766                goto unlock;
 767
 768        for (at = base; at < top; at++) {
 769                /* Filter out any records that contain kernel addresses. */
 770                if (event->attr.exclude_kernel &&
 771                    (kernel_ip(at->from) || kernel_ip(at->to)))
 772                        continue;
 773
 774                data.ip         = at->from;
 775                data.addr       = at->to;
 776
 777                perf_output_sample(&handle, &header, &data, event);
 778        }
 779
 780        perf_output_end(&handle);
 781
 782        /* There's new data available. */
 783        event->hw.interrupts++;
 784        event->pending_kill = POLL_IN;
 785unlock:
 786        rcu_read_unlock();
 787        return 1;
 788}
 789
 790static inline void intel_pmu_drain_pebs_buffer(void)
 791{
 792        struct perf_sample_data data;
 793
 794        x86_pmu.drain_pebs(NULL, &data);
 795}
 796
 797/*
 798 * PEBS
 799 */
 800struct event_constraint intel_core2_pebs_event_constraints[] = {
 801        INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
 802        INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
 803        INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
 804        INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
 805        INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
 806        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 807        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
 808        EVENT_CONSTRAINT_END
 809};
 810
 811struct event_constraint intel_atom_pebs_event_constraints[] = {
 812        INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
 813        INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
 814        INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
 815        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 816        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
 817        /* Allow all events as PEBS with no flags */
 818        INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
 819        EVENT_CONSTRAINT_END
 820};
 821
 822struct event_constraint intel_slm_pebs_event_constraints[] = {
 823        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 824        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x1),
 825        /* Allow all events as PEBS with no flags */
 826        INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
 827        EVENT_CONSTRAINT_END
 828};
 829
 830struct event_constraint intel_glm_pebs_event_constraints[] = {
 831        /* Allow all events as PEBS with no flags */
 832        INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
 833        EVENT_CONSTRAINT_END
 834};
 835
 836struct event_constraint intel_grt_pebs_event_constraints[] = {
 837        /* Allow all events as PEBS with no flags */
 838        INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0x3),
 839        INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf),
 840        EVENT_CONSTRAINT_END
 841};
 842
 843struct event_constraint intel_nehalem_pebs_event_constraints[] = {
 844        INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
 845        INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
 846        INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
 847        INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INST_RETIRED.ANY */
 848        INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
 849        INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
 850        INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
 851        INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
 852        INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
 853        INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
 854        INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
 855        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 856        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
 857        EVENT_CONSTRAINT_END
 858};
 859
 860struct event_constraint intel_westmere_pebs_event_constraints[] = {
 861        INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
 862        INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
 863        INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
 864        INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INSTR_RETIRED.* */
 865        INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
 866        INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
 867        INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
 868        INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
 869        INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
 870        INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
 871        INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
 872        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 873        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
 874        EVENT_CONSTRAINT_END
 875};
 876
 877struct event_constraint intel_snb_pebs_event_constraints[] = {
 878        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
 879        INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
 880        INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
 881        /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 882        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
 883        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
 884        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
 885        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
 886        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
 887        /* Allow all events as PEBS with no flags */
 888        INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 889        EVENT_CONSTRAINT_END
 890};
 891
 892struct event_constraint intel_ivb_pebs_event_constraints[] = {
 893        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
 894        INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
 895        INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
 896        /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 897        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
 898        /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
 899        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
 900        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
 901        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
 902        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
 903        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
 904        /* Allow all events as PEBS with no flags */
 905        INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 906        EVENT_CONSTRAINT_END
 907};
 908
 909struct event_constraint intel_hsw_pebs_event_constraints[] = {
 910        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
 911        INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
 912        /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 913        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
 914        /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
 915        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
 916        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
 917        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
 918        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
 919        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
 920        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
 921        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
 922        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
 923        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
 924        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
 925        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
 926        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
 927        /* Allow all events as PEBS with no flags */
 928        INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 929        EVENT_CONSTRAINT_END
 930};
 931
 932struct event_constraint intel_bdw_pebs_event_constraints[] = {
 933        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
 934        INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
 935        /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 936        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
 937        /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
 938        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
 939        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
 940        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
 941        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
 942        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
 943        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
 944        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
 945        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
 946        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
 947        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
 948        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
 949        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
 950        /* Allow all events as PEBS with no flags */
 951        INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 952        EVENT_CONSTRAINT_END
 953};
 954
 955
 956struct event_constraint intel_skl_pebs_event_constraints[] = {
 957        INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),      /* INST_RETIRED.PREC_DIST */
 958        /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
 959        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
 960        /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
 961        INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
 962        INTEL_PLD_CONSTRAINT(0x1cd, 0xf),                     /* MEM_TRANS_RETIRED.* */
 963        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
 964        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
 965        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
 966        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
 967        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
 968        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
 969        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
 970        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
 971        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
 972        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
 973        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
 974        /* Allow all events as PEBS with no flags */
 975        INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 976        EVENT_CONSTRAINT_END
 977};
 978
 979struct event_constraint intel_icl_pebs_event_constraints[] = {
 980        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x100000000ULL),  /* old INST_RETIRED.PREC_DIST */
 981        INTEL_FLAGS_UEVENT_CONSTRAINT(0x0100, 0x100000000ULL),  /* INST_RETIRED.PREC_DIST */
 982        INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),  /* SLOTS */
 983
 984        INTEL_PLD_CONSTRAINT(0x1cd, 0xff),                      /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 985        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_LOADS */
 986        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_STORES */
 987        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf),   /* MEM_INST_RETIRED.LOCK_LOADS */
 988        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_LOADS */
 989        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_STORES */
 990        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf),   /* MEM_INST_RETIRED.ALL_LOADS */
 991        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf),   /* MEM_INST_RETIRED.ALL_STORES */
 992
 993        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */
 994
 995        INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),                /* MEM_INST_RETIRED.* */
 996
 997        /*
 998         * Everything else is handled by PMU_FL_PEBS_ALL, because we
 999         * need the full constraints from the main table.
1000         */
1001
1002        EVENT_CONSTRAINT_END
1003};
1004
1005struct event_constraint intel_spr_pebs_event_constraints[] = {
1006        INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL),   /* INST_RETIRED.PREC_DIST */
1007        INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
1008
1009        INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
1010        INTEL_PLD_CONSTRAINT(0x1cd, 0xfe),
1011        INTEL_PSD_CONSTRAINT(0x2cd, 0x1),
1012        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_LOADS */
1013        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_STORES */
1014        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf),   /* MEM_INST_RETIRED.LOCK_LOADS */
1015        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_LOADS */
1016        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_STORES */
1017        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf),   /* MEM_INST_RETIRED.ALL_LOADS */
1018        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf),   /* MEM_INST_RETIRED.ALL_STORES */
1019
1020        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
1021
1022        INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),
1023
1024        /*
1025         * Everything else is handled by PMU_FL_PEBS_ALL, because we
1026         * need the full constraints from the main table.
1027         */
1028
1029        EVENT_CONSTRAINT_END
1030};
1031
1032struct event_constraint *intel_pebs_constraints(struct perf_event *event)
1033{
1034        struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints);
1035        struct event_constraint *c;
1036
1037        if (!event->attr.precise_ip)
1038                return NULL;
1039
1040        if (pebs_constraints) {
1041                for_each_event_constraint(c, pebs_constraints) {
1042                        if (constraint_match(c, event->hw.config)) {
1043                                event->hw.flags |= c->flags;
1044                                return c;
1045                        }
1046                }
1047        }
1048
1049        /*
1050         * Extended PEBS support
1051         * Makes the PEBS code search the normal constraints.
1052         */
1053        if (x86_pmu.flags & PMU_FL_PEBS_ALL)
1054                return NULL;
1055
1056        return &emptyconstraint;
1057}
1058
1059/*
1060 * We need the sched_task callback even for per-cpu events when we use
1061 * the large interrupt threshold, such that we can provide PID and TID
1062 * to PEBS samples.
1063 */
1064static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
1065{
1066        if (cpuc->n_pebs == cpuc->n_pebs_via_pt)
1067                return false;
1068
1069        return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
1070}
1071
1072void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
1073{
1074        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1075
1076        if (!sched_in && pebs_needs_sched_cb(cpuc))
1077                intel_pmu_drain_pebs_buffer();
1078}
1079
1080static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
1081{
1082        struct debug_store *ds = cpuc->ds;
1083        int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
1084        int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
1085        u64 threshold;
1086        int reserved;
1087
1088        if (cpuc->n_pebs_via_pt)
1089                return;
1090
1091        if (x86_pmu.flags & PMU_FL_PEBS_ALL)
1092                reserved = max_pebs_events + num_counters_fixed;
1093        else
1094                reserved = max_pebs_events;
1095
1096        if (cpuc->n_pebs == cpuc->n_large_pebs) {
1097                threshold = ds->pebs_absolute_maximum -
1098                        reserved * cpuc->pebs_record_size;
1099        } else {
1100                threshold = ds->pebs_buffer_base + cpuc->pebs_record_size;
1101        }
1102
1103        ds->pebs_interrupt_threshold = threshold;
1104}
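
/*
 * Worked example (hypothetical counter counts): with PMU_FL_PEBS_ALL, 8 GP
 * and 4 fixed counters, reserved = 12. If every PEBS event allows large
 * PEBS (n_pebs == n_large_pebs), the threshold is placed 12 records below
 * pebs_absolute_maximum, so after the threshold PMI there is still room
 * for one record per possible PEBS counter; otherwise it sits one record
 * above pebs_buffer_base and every record raises a PMI.
 */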
1105
1106static void adaptive_pebs_record_size_update(void)
1107{
1108        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1109        u64 pebs_data_cfg = cpuc->pebs_data_cfg;
1110        int sz = sizeof(struct pebs_basic);
1111
1112        if (pebs_data_cfg & PEBS_DATACFG_MEMINFO)
1113                sz += sizeof(struct pebs_meminfo);
1114        if (pebs_data_cfg & PEBS_DATACFG_GP)
1115                sz += sizeof(struct pebs_gprs);
1116        if (pebs_data_cfg & PEBS_DATACFG_XMMS)
1117                sz += sizeof(struct pebs_xmm);
1118        if (pebs_data_cfg & PEBS_DATACFG_LBRS)
1119                sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry);
1120
1121        cpuc->pebs_record_size = sz;
1122}
1123
1124#define PERF_PEBS_MEMINFO_TYPE  (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC |   \
1125                                PERF_SAMPLE_PHYS_ADDR |                      \
1126                                PERF_SAMPLE_WEIGHT_TYPE |                    \
1127                                PERF_SAMPLE_TRANSACTION |                    \
1128                                PERF_SAMPLE_DATA_PAGE_SIZE)
1129
1130static u64 pebs_update_adaptive_cfg(struct perf_event *event)
1131{
1132        struct perf_event_attr *attr = &event->attr;
1133        u64 sample_type = attr->sample_type;
1134        u64 pebs_data_cfg = 0;
1135        bool gprs, tsx_weight;
1136
1137        if (!(sample_type & ~(PERF_SAMPLE_IP|PERF_SAMPLE_TIME)) &&
1138            attr->precise_ip > 1)
1139                return pebs_data_cfg;
1140
1141        if (sample_type & PERF_PEBS_MEMINFO_TYPE)
1142                pebs_data_cfg |= PEBS_DATACFG_MEMINFO;
1143
1144        /*
1145         * We need GPRs when:
1146         * + user requested them
1147         * + precise_ip < 2, for the non-eventing IP
1148         * + For RTM TSX weight we need GPRs for the abort code.
1149         */
1150        gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
1151               (attr->sample_regs_intr & PEBS_GP_REGS);
1152
1153        tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
1154                     ((attr->config & INTEL_ARCH_EVENT_MASK) ==
1155                      x86_pmu.rtm_abort_event);
1156
1157        if (gprs || (attr->precise_ip < 2) || tsx_weight)
1158                pebs_data_cfg |= PEBS_DATACFG_GP;
1159
1160        if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
1161            (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK))
1162                pebs_data_cfg |= PEBS_DATACFG_XMMS;
1163
1164        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
1165                /*
1166                 * For now always log all LBRs. Could configure this
1167                 * later.
1168                 */
1169                pebs_data_cfg |= PEBS_DATACFG_LBRS |
1170                        ((x86_pmu.lbr_nr-1) << PEBS_DATACFG_LBR_SHIFT);
1171        }
1172
1173        return pebs_data_cfg;
1174}
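
/*
 * Worked example (hypothetical event): sample_type = PERF_SAMPLE_IP |
 * PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC with precise_ip = 3 and no
 * sample_regs_intr yields PEBS_DATACFG_MEMINFO alone: the memory info
 * group covers ADDR and DATA_SRC, precise_ip >= 2 means the eventing IP
 * from the basic group is sufficient, and neither GPRs, XMMs nor LBRs
 * are requested.
 */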
1175
1176static void
1177pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
1178                  struct perf_event *event, bool add)
1179{
1180        struct pmu *pmu = event->ctx->pmu;
1181        /*
1182         * Make sure we get updated with the first PEBS
1183         * event. It will also trigger during removal, but
1184         * that does not hurt:
1185         */
1186        bool update = cpuc->n_pebs == 1;
1187
1188        if (needed_cb != pebs_needs_sched_cb(cpuc)) {
1189                if (!needed_cb)
1190                        perf_sched_cb_inc(pmu);
1191                else
1192                        perf_sched_cb_dec(pmu);
1193
1194                update = true;
1195        }
1196
1197        /*
1198         * The PEBS record doesn't shrink on pmu::del(). Doing so would require
1199         * iterating all remaining PEBS events to reconstruct the config.
1200         */
1201        if (x86_pmu.intel_cap.pebs_baseline && add) {
1202                u64 pebs_data_cfg;
1203
1204                /* Clear pebs_data_cfg and pebs_record_size for first PEBS. */
1205                if (cpuc->n_pebs == 1) {
1206                        cpuc->pebs_data_cfg = 0;
1207                        cpuc->pebs_record_size = sizeof(struct pebs_basic);
1208                }
1209
1210                pebs_data_cfg = pebs_update_adaptive_cfg(event);
1211
1212                /* Update pebs_record_size if new event requires more data. */
1213                if (pebs_data_cfg & ~cpuc->pebs_data_cfg) {
1214                        cpuc->pebs_data_cfg |= pebs_data_cfg;
1215                        adaptive_pebs_record_size_update();
1216                        update = true;
1217                }
1218        }
1219
1220        if (update)
1221                pebs_update_threshold(cpuc);
1222}
1223
1224void intel_pmu_pebs_add(struct perf_event *event)
1225{
1226        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1227        struct hw_perf_event *hwc = &event->hw;
1228        bool needed_cb = pebs_needs_sched_cb(cpuc);
1229
1230        cpuc->n_pebs++;
1231        if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
1232                cpuc->n_large_pebs++;
1233        if (hwc->flags & PERF_X86_EVENT_PEBS_VIA_PT)
1234                cpuc->n_pebs_via_pt++;
1235
1236        pebs_update_state(needed_cb, cpuc, event, true);
1237}
1238
1239static void intel_pmu_pebs_via_pt_disable(struct perf_event *event)
1240{
1241        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1242
1243        if (!is_pebs_pt(event))
1244                return;
1245
1246        if (!(cpuc->pebs_enabled & ~PEBS_VIA_PT_MASK))
1247                cpuc->pebs_enabled &= ~PEBS_VIA_PT_MASK;
1248}
1249
1250static void intel_pmu_pebs_via_pt_enable(struct perf_event *event)
1251{
1252        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1253        struct hw_perf_event *hwc = &event->hw;
1254        struct debug_store *ds = cpuc->ds;
1255        u64 value = ds->pebs_event_reset[hwc->idx];
1256        u32 base = MSR_RELOAD_PMC0;
1257        unsigned int idx = hwc->idx;
1258
1259        if (!is_pebs_pt(event))
1260                return;
1261
1262        if (!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
1263                cpuc->pebs_enabled |= PEBS_PMI_AFTER_EACH_RECORD;
1264
1265        cpuc->pebs_enabled |= PEBS_OUTPUT_PT;
1266
1267        if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
1268                base = MSR_RELOAD_FIXED_CTR0;
1269                idx = hwc->idx - INTEL_PMC_IDX_FIXED;
1270                if (x86_pmu.intel_cap.pebs_format < 5)
1271                        value = ds->pebs_event_reset[MAX_PEBS_EVENTS_FMT4 + idx];
1272                else
1273                        value = ds->pebs_event_reset[MAX_PEBS_EVENTS + idx];
1274        }
1275        wrmsrl(base + idx, value);
1276}
1277
1278void intel_pmu_pebs_enable(struct perf_event *event)
1279{
1280        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1281        struct hw_perf_event *hwc = &event->hw;
1282        struct debug_store *ds = cpuc->ds;
1283        unsigned int idx = hwc->idx;
1284
1285        hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
1286
1287        cpuc->pebs_enabled |= 1ULL << hwc->idx;
1288
1289        if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5))
1290                cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
1291        else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
1292                cpuc->pebs_enabled |= 1ULL << 63;
1293
1294        if (x86_pmu.intel_cap.pebs_baseline) {
1295                hwc->config |= ICL_EVENTSEL_ADAPTIVE;
1296                if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) {
1297                        wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg);
1298                        cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg;
1299                }
1300        }
1301
1302        if (idx >= INTEL_PMC_IDX_FIXED) {
1303                if (x86_pmu.intel_cap.pebs_format < 5)
1304                        idx = MAX_PEBS_EVENTS_FMT4 + (idx - INTEL_PMC_IDX_FIXED);
1305                else
1306                        idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED);
1307        }
1308
1309        /*
1310         * Use auto-reload if possible to save an MSR write in the PMI. This must
1311         * be done in pmu::start(), because PERF_EVENT_IOC_PERIOD can change the period.
1312         */
1313        if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
1314                ds->pebs_event_reset[idx] =
1315                        (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
1316        } else {
1317                ds->pebs_event_reset[idx] = 0;
1318        }
1319
1320        intel_pmu_pebs_via_pt_enable(event);
1321}
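
/*
 * Illustrative sketch (not upstream): the MSR_IA32_PEBS_ENABLE bits set up
 * above for a load-latency event on general-purpose counter 2 on a pre-v5
 * PMU: bit 2 enables PEBS for the counter, bit 34 (2 + 32) enables the
 * load-latency facility for it. A precise-store event would set bit 63
 * instead of the high per-counter bit.
 */
static inline u64 pebs_enable_bits_example(void)
{
        return (1ULL << 2) | (1ULL << (2 + 32));
}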
1322
1323void intel_pmu_pebs_del(struct perf_event *event)
1324{
1325        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1326        struct hw_perf_event *hwc = &event->hw;
1327        bool needed_cb = pebs_needs_sched_cb(cpuc);
1328
1329        cpuc->n_pebs--;
1330        if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
1331                cpuc->n_large_pebs--;
1332        if (hwc->flags & PERF_X86_EVENT_PEBS_VIA_PT)
1333                cpuc->n_pebs_via_pt--;
1334
1335        pebs_update_state(needed_cb, cpuc, event, false);
1336}
1337
1338void intel_pmu_pebs_disable(struct perf_event *event)
1339{
1340        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1341        struct hw_perf_event *hwc = &event->hw;
1342
1343        if (cpuc->n_pebs == cpuc->n_large_pebs &&
1344            cpuc->n_pebs != cpuc->n_pebs_via_pt)
1345                intel_pmu_drain_pebs_buffer();
1346
1347        cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
1348
1349        if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) &&
1350            (x86_pmu.version < 5))
1351                cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
1352        else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
1353                cpuc->pebs_enabled &= ~(1ULL << 63);
1354
1355        intel_pmu_pebs_via_pt_disable(event);
1356
1357        if (cpuc->enabled)
1358                wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
1359
1360        hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
1361}
1362
1363void intel_pmu_pebs_enable_all(void)
1364{
1365        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1366
1367        if (cpuc->pebs_enabled)
1368                wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
1369}
1370
1371void intel_pmu_pebs_disable_all(void)
1372{
1373        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1374
1375        if (cpuc->pebs_enabled)
1376                __intel_pmu_pebs_disable_all();
1377}
1378
1379static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
1380{
1381        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1382        unsigned long from = cpuc->lbr_entries[0].from;
1383        unsigned long old_to, to = cpuc->lbr_entries[0].to;
1384        unsigned long ip = regs->ip;
1385        int is_64bit = 0;
1386        void *kaddr;
1387        int size;
1388
1389        /*
1390         * We don't need to fix up if the PEBS assist is fault-like
1391         */
1392        if (!x86_pmu.intel_cap.pebs_trap)
1393                return 1;
1394
1395        /*
1396         * No LBR entry, no basic block, no rewinding
1397         */
1398        if (!cpuc->lbr_stack.nr || !from || !to)
1399                return 0;
1400
1401        /*
1402         * Basic blocks should never cross user/kernel boundaries
1403         */
1404        if (kernel_ip(ip) != kernel_ip(to))
1405                return 0;
1406
1407        /*
1408         * unsigned math, either ip is before the start (impossible) or
1409         * the basic block is larger than 1 page (sanity)
1410         */
1411        if ((ip - to) > PEBS_FIXUP_SIZE)
1412                return 0;
1413
1414        /*
1415         * We sampled a branch insn, rewind using the LBR stack
1416         */
1417        if (ip == to) {
1418                set_linear_ip(regs, from);
1419                return 1;
1420        }
1421
1422        size = ip - to;
1423        if (!kernel_ip(ip)) {
1424                int bytes;
1425                u8 *buf = this_cpu_read(insn_buffer);
1426
1427                /* 'size' must fit our buffer, see above */
1428                bytes = copy_from_user_nmi(buf, (void __user *)to, size);
1429                if (bytes != 0)
1430                        return 0;
1431
1432                kaddr = buf;
1433        } else {
1434                kaddr = (void *)to;
1435        }
1436
1437        do {
1438                struct insn insn;
1439
1440                old_to = to;
1441
1442#ifdef CONFIG_X86_64
1443                is_64bit = kernel_ip(to) || any_64bit_mode(regs);
1444#endif
1445                insn_init(&insn, kaddr, size, is_64bit);
1446
1447                /*
1448                 * Make sure there was not a problem decoding the instruction.
1449                 * This is doubly important because we have an infinite loop if
1450                 * insn.length=0.
1451                 */
1452                if (insn_get_length(&insn))
1453                        break;
1454
1455                to += insn.length;
1456                kaddr += insn.length;
1457                size -= insn.length;
1458        } while (to < ip);
1459
1460        if (to == ip) {
1461                set_linear_ip(regs, old_to);
1462                return 1;
1463        }
1464
1465        /*
1466         * Even though we decoded the basic block, the instruction stream
1467         * never matched the given IP, either the TO or the IP got corrupted.
1468         */
1469        return 0;
1470}
1471
1472static inline u64 intel_get_tsx_weight(u64 tsx_tuning)
1473{
1474        if (tsx_tuning) {
1475                union hsw_tsx_tuning tsx = { .value = tsx_tuning };
1476                return tsx.cycles_last_block;
1477        }
1478        return 0;
1479}
1480
1481static inline u64 intel_get_tsx_transaction(u64 tsx_tuning, u64 ax)
1482{
1483        u64 txn = (tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
1484
1485        /* For RTM XABORTs also log the abort code from AX */
1486        if ((txn & PERF_TXN_TRANSACTION) && (ax & 1))
1487                txn |= ((ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
1488        return txn;
1489}
1490
1491static inline u64 get_pebs_status(void *n)
1492{
1493        if (x86_pmu.intel_cap.pebs_format < 4)
1494                return ((struct pebs_record_nhm *)n)->status;
1495        return ((struct pebs_basic *)n)->applicable_counters;
1496}
1497
1498#define PERF_X86_EVENT_PEBS_HSW_PREC \
1499                (PERF_X86_EVENT_PEBS_ST_HSW | \
1500                 PERF_X86_EVENT_PEBS_LD_HSW | \
1501                 PERF_X86_EVENT_PEBS_NA_HSW)
1502
1503static u64 get_data_src(struct perf_event *event, u64 aux)
1504{
1505        u64 val = PERF_MEM_NA;
1506        int fl = event->hw.flags;
1507        bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
1508
1509        if (fl & PERF_X86_EVENT_PEBS_LDLAT)
1510                val = load_latency_data(event, aux);
1511        else if (fl & PERF_X86_EVENT_PEBS_STLAT)
1512                val = store_latency_data(event, aux);
1513        else if (fl & PERF_X86_EVENT_PEBS_LAT_HYBRID)
1514                val = x86_pmu.pebs_latency_data(event, aux);
1515        else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
1516                val = precise_datala_hsw(event, aux);
1517        else if (fst)
1518                val = precise_store_data(aux);
1519        return val;
1520}
1521
1522#define PERF_SAMPLE_ADDR_TYPE   (PERF_SAMPLE_ADDR |             \
1523                                 PERF_SAMPLE_PHYS_ADDR |        \
1524                                 PERF_SAMPLE_DATA_PAGE_SIZE)
1525
1526static void setup_pebs_fixed_sample_data(struct perf_event *event,
1527                                   struct pt_regs *iregs, void *__pebs,
1528                                   struct perf_sample_data *data,
1529                                   struct pt_regs *regs)
1530{
1531        /*
1532         * We cast to the biggest pebs_record but are careful not to
1533         * unconditionally access the 'extra' entries.
1534         */
1535        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1536        struct pebs_record_skl *pebs = __pebs;
1537        u64 sample_type;
1538        int fll;
1539
1540        if (pebs == NULL)
1541                return;
1542
1543        sample_type = event->attr.sample_type;
1544        fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
1545
1546        perf_sample_data_init(data, 0, event->hw.last_period);
1547
1548        data->period = event->hw.last_period;
1549
1550        /*
1551         * Use latency for weight (only available with PEBS-LL)
1552         */
1553        if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE))
1554                data->weight.full = pebs->lat;
1555
1556        /*
1557         * data.data_src encodes the data source
1558         */
1559        if (sample_type & PERF_SAMPLE_DATA_SRC)
1560                data->data_src.val = get_data_src(event, pebs->dse);
1561
1562        /*
1563         * We must however always use iregs for the unwinder to stay sane; the
1564         * record BP,SP,IP can point into thin air when the record is from a
1565         * previous PMI context or an (I)RET happened between the record and
1566         * PMI.
1567         */
1568        if (sample_type & PERF_SAMPLE_CALLCHAIN)
1569                data->callchain = perf_callchain(event, iregs);
1570
1571        /*
1572         * We use the interrupt regs as a base because the PEBS record does not
1573         * contain a full regs set, specifically it seems to lack segment
1574         * descriptors, which get used by things like user_mode().
1575         *
1576         * In the simple case fix up only the IP for PERF_SAMPLE_IP.
1577         */
1578        *regs = *iregs;
1579
1580        /*
1581         * Initialize regs->flags from the PEBS record and clear the
1582         * exact bit (which reuses x86 EFLAGS reserved bit 3), i.e.
1583         * do not rely on it being zero:
1584         */
1585        regs->flags = pebs->flags & ~PERF_EFLAGS_EXACT;
1586
1587        if (sample_type & PERF_SAMPLE_REGS_INTR) {
1588                regs->ax = pebs->ax;
1589                regs->bx = pebs->bx;
1590                regs->cx = pebs->cx;
1591                regs->dx = pebs->dx;
1592                regs->si = pebs->si;
1593                regs->di = pebs->di;
1594
1595                regs->bp = pebs->bp;
1596                regs->sp = pebs->sp;
1597
1598#ifndef CONFIG_X86_32
1599                regs->r8 = pebs->r8;
1600                regs->r9 = pebs->r9;
1601                regs->r10 = pebs->r10;
1602                regs->r11 = pebs->r11;
1603                regs->r12 = pebs->r12;
1604                regs->r13 = pebs->r13;
1605                regs->r14 = pebs->r14;
1606                regs->r15 = pebs->r15;
1607#endif
1608        }
1609
1610        if (event->attr.precise_ip > 1) {
1611                /*
1612                 * Haswell and later processors have an 'eventing IP'
1613                 * (real IP) which fixes the off-by-1 skid in hardware.
1614                 * Use it when precise_ip >= 2:
1615                 */
1616                if (x86_pmu.intel_cap.pebs_format >= 2) {
1617                        set_linear_ip(regs, pebs->real_ip);
1618                        regs->flags |= PERF_EFLAGS_EXACT;
1619                } else {
1620                        /* Otherwise, use PEBS off-by-1 IP: */
1621                        set_linear_ip(regs, pebs->ip);
1622
1623                        /*
1624                         * With precise_ip >= 2, try to fix up the off-by-1 IP
1625                         * using the LBR. If successful, the fixup function
1626                         * corrects regs->ip and calls set_linear_ip() on regs:
1627                         */
1628                        if (intel_pmu_pebs_fixup_ip(regs))
1629                                regs->flags |= PERF_EFLAGS_EXACT;
1630                }
1631        } else {
1632                /*
1633                 * When precise_ip == 1, return the PEBS off-by-1 IP,
1634                 * no fixup attempted:
1635                 */
1636                set_linear_ip(regs, pebs->ip);
1637        }
1638
1639
1640        if ((sample_type & PERF_SAMPLE_ADDR_TYPE) &&
1641            x86_pmu.intel_cap.pebs_format >= 1)
1642                data->addr = pebs->dla;
1643
1644        if (x86_pmu.intel_cap.pebs_format >= 2) {
1645                /* Only set the TSX weight when no memory weight. */
1646                if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll)
1647                        data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning);
1648
1649                if (sample_type & PERF_SAMPLE_TRANSACTION)
1650                        data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
1651                                                              pebs->ax);
1652        }
1653
1654        /*
1655         * PEBS v3 and later supply an accurate timestamp, so use it
1656         * for the sample time.
1657         *
1658         * We can only do this for the default trace clock.
1659         */
1660        if (x86_pmu.intel_cap.pebs_format >= 3 &&
1661                event->attr.use_clockid == 0)
1662                data->time = native_sched_clock_from_tsc(pebs->tsc);
1663
1664        if (has_branch_stack(event))
1665                data->br_stack = &cpuc->lbr_stack;
1666}
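
    /*
     * Summary of how precise_ip maps onto the IP reported by the fixed PEBS
     * path above (a reading of the code, not an additional contract):
     *
     *   precise_ip == 1: the off-by-1 PEBS IP (pebs->ip) is used as-is.
     *   precise_ip >= 2: on PEBS fmt2+ the eventing IP (pebs->real_ip) is
     *                    used and PERF_EFLAGS_EXACT is set; on older
     *                    formats an LBR-based fixup of pebs->ip is
     *                    attempted and EXACT is set only if it succeeds.
     *
     * A minimal, hypothetical user-space sketch that asks for the exact IP
     * (field names are from the perf_event_open(2) ABI, not this file):
     *
     *   struct perf_event_attr attr = {
     *           .type          = PERF_TYPE_HARDWARE,
     *           .config        = PERF_COUNT_HW_CPU_CYCLES,
     *           .size          = sizeof(attr),
     *           .sample_period = 100000,
     *           .sample_type   = PERF_SAMPLE_IP,
     *           .precise_ip    = 2,     // request eventing IP / fixup
     *   };
     */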
1667
1668static void adaptive_pebs_save_regs(struct pt_regs *regs,
1669                                    struct pebs_gprs *gprs)
1670{
1671        regs->ax = gprs->ax;
1672        regs->bx = gprs->bx;
1673        regs->cx = gprs->cx;
1674        regs->dx = gprs->dx;
1675        regs->si = gprs->si;
1676        regs->di = gprs->di;
1677        regs->bp = gprs->bp;
1678        regs->sp = gprs->sp;
1679#ifndef CONFIG_X86_32
1680        regs->r8 = gprs->r8;
1681        regs->r9 = gprs->r9;
1682        regs->r10 = gprs->r10;
1683        regs->r11 = gprs->r11;
1684        regs->r12 = gprs->r12;
1685        regs->r13 = gprs->r13;
1686        regs->r14 = gprs->r14;
1687        regs->r15 = gprs->r15;
1688#endif
1689}
1690
1691#define PEBS_LATENCY_MASK                       0xffff
1692#define PEBS_CACHE_LATENCY_OFFSET               32
1693
1694/*
1695 * With adaptive PEBS the record layout depends on which fields are configured.
1696 */
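
    /*
     * Rough layout sketch, derived from the parsing order below: a
     * pebs_basic header is always present, optionally followed, in this
     * order, by a pebs_meminfo block (PEBS_DATACFG_MEMINFO), a pebs_gprs
     * block (PEBS_DATACFG_GP), a pebs_xmm block (PEBS_DATACFG_XMMS) and an
     * array of lbr_entry records (PEBS_DATACFG_LBRS, with the entry count
     * encoded in format_size via PEBS_DATACFG_LBR_SHIFT, plus one).
     */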
1697
1698static void setup_pebs_adaptive_sample_data(struct perf_event *event,
1699                                            struct pt_regs *iregs, void *__pebs,
1700                                            struct perf_sample_data *data,
1701                                            struct pt_regs *regs)
1702{
1703        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1704        struct pebs_basic *basic = __pebs;
1705        void *next_record = basic + 1;
1706        u64 sample_type;
1707        u64 format_size;
1708        struct pebs_meminfo *meminfo = NULL;
1709        struct pebs_gprs *gprs = NULL;
1710        struct x86_perf_regs *perf_regs;
1711
1712        if (basic == NULL)
1713                return;
1714
1715        perf_regs = container_of(regs, struct x86_perf_regs, regs);
1716        perf_regs->xmm_regs = NULL;
1717
1718        sample_type = event->attr.sample_type;
1719        format_size = basic->format_size;
1720        perf_sample_data_init(data, 0, event->hw.last_period);
1721        data->period = event->hw.last_period;
1722
1723        if (event->attr.use_clockid == 0)
1724                data->time = native_sched_clock_from_tsc(basic->tsc);
1725
1726        /*
1727         * We must however always use iregs for the unwinder to stay sane; the
1728         * record BP,SP,IP can point into thin air when the record is from a
1729         * previous PMI context or an (I)RET happened between the record and
1730         * PMI.
1731         */
1732        if (sample_type & PERF_SAMPLE_CALLCHAIN)
1733                data->callchain = perf_callchain(event, iregs);
1734
1735        *regs = *iregs;
1736        /* The ip in basic is EventingIP */
1737        set_linear_ip(regs, basic->ip);
1738        regs->flags = PERF_EFLAGS_EXACT;
1739
1740        /*
1741         * The MEMINFO block precedes the GP block in the record, but
1742         * PERF_SAMPLE_TRANSACTION needs gprs->ax.
1743         * Save the pointer here and process it after GP has been parsed.
1744         */
1745        if (format_size & PEBS_DATACFG_MEMINFO) {
1746                meminfo = next_record;
1747                next_record = meminfo + 1;
1748        }
1749
1750        if (format_size & PEBS_DATACFG_GP) {
1751                gprs = next_record;
1752                next_record = gprs + 1;
1753
1754                if (event->attr.precise_ip < 2) {
1755                        set_linear_ip(regs, gprs->ip);
1756                        regs->flags &= ~PERF_EFLAGS_EXACT;
1757                }
1758
1759                if (sample_type & PERF_SAMPLE_REGS_INTR)
1760                        adaptive_pebs_save_regs(regs, gprs);
1761        }
1762
1763        if (format_size & PEBS_DATACFG_MEMINFO) {
1764                if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
1765                        u64 weight = meminfo->latency;
1766
1767                        if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
1768                                data->weight.var2_w = weight & PEBS_LATENCY_MASK;
1769                                weight >>= PEBS_CACHE_LATENCY_OFFSET;
1770                        }
1771
1772                        /*
1773                         * Although meminfo::latency is defined as a u64,
1774                         * only the lower 32 bits contain valid data
1775                         * in practice on Ice Lake and earlier platforms.
1776                         */
1777                        if (sample_type & PERF_SAMPLE_WEIGHT) {
1778                                data->weight.full = weight ?:
1779                                        intel_get_tsx_weight(meminfo->tsx_tuning);
1780                        } else {
1781                                data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
1782                                        intel_get_tsx_weight(meminfo->tsx_tuning);
1783                        }
1784                }
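                    /*
                     * Illustrative made-up value: meminfo->latency ==
                     * 0x0000002d00000011 with PMU_FL_INSTR_LATENCY set gives
                     * var2_w = 0x11 (17 cycles of instruction latency) and a
                     * remaining weight of 0x2d (45 cycles of cache latency)
                     * after the shift by PEBS_CACHE_LATENCY_OFFSET.
                     */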
1785
1786                if (sample_type & PERF_SAMPLE_DATA_SRC)
1787                        data->data_src.val = get_data_src(event, meminfo->aux);
1788
1789                if (sample_type & PERF_SAMPLE_ADDR_TYPE)
1790                        data->addr = meminfo->address;
1791
1792                if (sample_type & PERF_SAMPLE_TRANSACTION)
1793                        data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning,
1794                                                          gprs ? gprs->ax : 0);
1795        }
1796
1797        if (format_size & PEBS_DATACFG_XMMS) {
1798                struct pebs_xmm *xmm = next_record;
1799
1800                next_record = xmm + 1;
1801                perf_regs->xmm_regs = xmm->xmm;
1802        }
1803
1804        if (format_size & PEBS_DATACFG_LBRS) {
1805                struct lbr_entry *lbr = next_record;
1806                int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
1807                                        & 0xff) + 1;
1808                next_record = next_record + num_lbr * sizeof(struct lbr_entry);
1809
1810                if (has_branch_stack(event)) {
1811                        intel_pmu_store_pebs_lbrs(lbr);
1812                        data->br_stack = &cpuc->lbr_stack;
1813                }
1814        }
1815
1816        WARN_ONCE(next_record != __pebs + (format_size >> 48),
1817                        "PEBS record size %llu, expected %llu, config %llx\n",
1818                        format_size >> 48,
1819                        (u64)(next_record - __pebs),
1820                        basic->format_size);
1821}
1822
1823static inline void *
1824get_next_pebs_record_by_bit(void *base, void *top, int bit)
1825{
1826        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1827        void *at;
1828        u64 pebs_status;
1829
1830        /*
1831         * fmt0 does not have a status bitfield (it does not use the
1832         * pebs_record_nhm format)
1833         */
1834        if (x86_pmu.intel_cap.pebs_format < 1)
1835                return base;
1836
1837        if (base == NULL)
1838                return NULL;
1839
1840        for (at = base; at < top; at += cpuc->pebs_record_size) {
1841                unsigned long status = get_pebs_status(at);
1842
1843                if (test_bit(bit, (unsigned long *)&status)) {
1844                        /* PEBS v3 has accurate status bits */
1845                        if (x86_pmu.intel_cap.pebs_format >= 3)
1846                                return at;
1847
1848                        if (status == (1 << bit))
1849                                return at;
1850
1851                        /* clear non-PEBS bit and re-check */
1852                        pebs_status = status & cpuc->pebs_enabled;
1853                        pebs_status &= PEBS_COUNTER_MASK;
1854                        if (pebs_status == (1 << bit))
1855                                return at;
1856                }
1857        }
1858        return NULL;
1859}
1860
1861void intel_pmu_auto_reload_read(struct perf_event *event)
1862{
1863        WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
1864
1865        perf_pmu_disable(event->pmu);
1866        intel_pmu_drain_pebs_buffer();
1867        perf_pmu_enable(event->pmu);
1868}
1869
1870/*
1871 * Special variant of intel_pmu_save_and_restart() for auto-reload.
1872 */
1873static int
1874intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
1875{
1876        struct hw_perf_event *hwc = &event->hw;
1877        int shift = 64 - x86_pmu.cntval_bits;
1878        u64 period = hwc->sample_period;
1879        u64 prev_raw_count, new_raw_count;
1880        s64 new, old;
1881
1882        WARN_ON(!period);
1883
1884        /*
1885         * drain_pebs() only happens when the PMU is disabled.
1886         */
1887        WARN_ON(this_cpu_read(cpu_hw_events.enabled));
1888
1889        prev_raw_count = local64_read(&hwc->prev_count);
1890        rdpmcl(hwc->event_base_rdpmc, new_raw_count);
1891        local64_set(&hwc->prev_count, new_raw_count);
1892
1893        /*
1894         * Since the counter increments a negative counter value and
1895         * overflows on the sign switch, giving the interval:
1896         *
1897         *   [-period, 0]
1898         *
1899         * the difference between two consecutive reads is:
1900         *
1901         *   A) value2 - value1;
1902         *      when no overflows have happened in between,
1903         *
1904         *   B) (0 - value1) + (value2 - (-period));
1905         *      when one overflow happened in between,
1906         *
1907         *   C) (0 - value1) + (n - 1) * (period) + (value2 - (-period));
1908         *      when @n overflows happened in between.
1909         *
1910         * Here A) is the obvious difference; B) extends it across one
1911         * overflow, where the first term runs to the top of the old
1912         * interval and the second term starts from the bottom of the
1913         * next interval; and C) extends it to multiple overflows, where
1914         * the middle term accounts for the whole intervals in between.
1915         *
1916         * An equivalent of C, by reduction, is:
1917         *
1918         *   value2 - value1 + n * period
1919         */
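            /*
             * Worked example with made-up numbers: period = 100, so the
             * counter runs from -100 up to 0.  If value1 = -30, two
             * overflows (n = 2) happen, and value2 = -80, the events seen
             * are 30 (to the first overflow) + 100 (one full interval) +
             * 20 (into the current interval) = 150, which matches
             * value2 - value1 + n * period = -80 - (-30) + 2 * 100 = 150.
             */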
1920        new = ((s64)(new_raw_count << shift) >> shift);
1921        old = ((s64)(prev_raw_count << shift) >> shift);
1922        local64_add(new - old + count * period, &event->count);
1923
1924        local64_set(&hwc->period_left, -new);
1925
1926        perf_event_update_userpage(event);
1927
1928        return 0;
1929}
1930
1931static __always_inline void
1932__intel_pmu_pebs_event(struct perf_event *event,
1933                       struct pt_regs *iregs,
1934                       struct perf_sample_data *data,
1935                       void *base, void *top,
1936                       int bit, int count,
1937                       void (*setup_sample)(struct perf_event *,
1938                                            struct pt_regs *,
1939                                            void *,
1940                                            struct perf_sample_data *,
1941                                            struct pt_regs *))
1942{
1943        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1944        struct hw_perf_event *hwc = &event->hw;
1945        struct x86_perf_regs perf_regs;
1946        struct pt_regs *regs = &perf_regs.regs;
1947        void *at = get_next_pebs_record_by_bit(base, top, bit);
1948        static struct pt_regs dummy_iregs;
1949
1950        if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
1951                /*
1952                 * Auto-reload is currently only enabled in fixed-period
1953                 * mode.  The reload value is always hwc->sample_period.
1954                 * This may need to change if auto-reload is ever enabled
1955                 * in frequency mode.
1956                 */
1957                intel_pmu_save_and_restart_reload(event, count);
1958        } else if (!intel_pmu_save_and_restart(event))
1959                return;
1960
1961        if (!iregs)
1962                iregs = &dummy_iregs;
1963
1964        while (count > 1) {
1965                setup_sample(event, iregs, at, data, regs);
1966                perf_event_output(event, data, regs);
1967                at += cpuc->pebs_record_size;
1968                at = get_next_pebs_record_by_bit(at, top, bit);
1969                count--;
1970        }
1971
1972        setup_sample(event, iregs, at, data, regs);
1973        if (iregs == &dummy_iregs) {
1974                /*
1975                 * The PEBS records may be drained in a non-overflow context,
1976                 * e.g. large PEBS + context switch. Treat the last record
1977                 * the same as the other PEBS records and do not invoke the
1978                 * generic overflow handler.
1979                 */
1980                perf_event_output(event, data, regs);
1981        } else {
1982                /*
1983                 * All but the last records are processed.
1984                 * The last one is left to be able to call the overflow handler.
1985                 */
1986                if (perf_event_overflow(event, data, regs))
1987                        x86_pmu_stop(event, 0);
1988        }
1989}
1990
1991static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
1992{
1993        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1994        struct debug_store *ds = cpuc->ds;
1995        struct perf_event *event = cpuc->events[0]; /* PMC0 only */
1996        struct pebs_record_core *at, *top;
1997        int n;
1998
1999        if (!x86_pmu.pebs_active)
2000                return;
2001
2002        at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
2003        top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
2004
2005        /*
2006         * Whatever else happens, drain the thing
2007         */
2008        ds->pebs_index = ds->pebs_buffer_base;
2009
2010        if (!test_bit(0, cpuc->active_mask))
2011                return;
2012
2013        WARN_ON_ONCE(!event);
2014
2015        if (!event->attr.precise_ip)
2016                return;
2017
2018        n = top - at;
2019        if (n <= 0) {
2020                if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
2021                        intel_pmu_save_and_restart_reload(event, 0);
2022                return;
2023        }
2024
2025        __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
2026                               setup_pebs_fixed_sample_data);
2027}
2028
2029static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
2030{
2031        struct perf_event *event;
2032        int bit;
2033
2034        /*
2035         * drain_pebs() can be called twice in a short period for an
2036         * auto-reload event via pmu::read(), with no overflows having
2037         * happened in between.
2038         * intel_pmu_save_and_restart_reload() still needs to be called
2039         * to update event->count in this case.
2040         */
2041        for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) {
2042                event = cpuc->events[bit];
2043                if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
2044                        intel_pmu_save_and_restart_reload(event, 0);
2045        }
2046}
2047
2048static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_data *data)
2049{
2050        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2051        struct debug_store *ds = cpuc->ds;
2052        struct perf_event *event;
2053        void *base, *at, *top;
2054        short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
2055        short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
2056        int bit, i, size;
2057        u64 mask;
2058
2059        if (!x86_pmu.pebs_active)
2060                return;
2061
2062        base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
2063        top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
2064
2065        ds->pebs_index = ds->pebs_buffer_base;
2066
2067        mask = (1ULL << x86_pmu.max_pebs_events) - 1;
2068        size = x86_pmu.max_pebs_events;
2069        if (x86_pmu.flags & PMU_FL_PEBS_ALL) {
2070                mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED;
2071                size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed;
2072        }
2073
2074        if (unlikely(base >= top)) {
2075                intel_pmu_pebs_event_update_no_drain(cpuc, size);
2076                return;
2077        }
2078
2079        for (at = base; at < top; at += x86_pmu.pebs_record_size) {
2080                struct pebs_record_nhm *p = at;
2081                u64 pebs_status;
2082
2083                pebs_status = p->status & cpuc->pebs_enabled;
2084                pebs_status &= mask;
2085
2086                /* PEBS v3 has more accurate status bits */
2087                if (x86_pmu.intel_cap.pebs_format >= 3) {
2088                        for_each_set_bit(bit, (unsigned long *)&pebs_status, size)
2089                                counts[bit]++;
2090
2091                        continue;
2092                }
2093
2094                /*
2095                 * On some CPUs the PEBS status can be zero when PEBS is
2096                 * racing with clearing of GLOBAL_STATUS.
2097                 *
2098                 * Normally we would drop that record, but in the
2099                 * case when there is only a single active PEBS event
2100                 * we can assume it's for that event.
2101                 */
2102                if (!pebs_status && cpuc->pebs_enabled &&
2103                        !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
2104                        pebs_status = p->status = cpuc->pebs_enabled;
2105
2106                bit = find_first_bit((unsigned long *)&pebs_status,
2107                                        x86_pmu.max_pebs_events);
2108                if (bit >= x86_pmu.max_pebs_events)
2109                        continue;
2110
2111                /*
2112                 * The PEBS hardware does not deal well with events that
2113                 * happen close to each other and set multiple status bits,
2114                 * but this should be rare.
2115                 *
2116                 * If these events include one PEBS and multiple non-PEBS
2117                 * events, the PEBS record is not affected and is handled
2118                 * normally (slow path).
2119                 *
2120                 * If these events include two or more PEBS events, their
2121                 * records can be collapsed into a single one, and it is
2122                 * not possible to reconstruct all the events that caused
2123                 * the PEBS record. This is called a collision.
2124                 * When a collision happens, the record is dropped.
2125                 */
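                    /*
                     * Hypothetical example: if counters 0 and 2 both claim
                     * this record, pebs_status == 0b101 != (1ULL << bit),
                     * so the record is counted as an error for both
                     * counters and then dropped.
                     */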
2126                if (pebs_status != (1ULL << bit)) {
2127                        for_each_set_bit(i, (unsigned long *)&pebs_status, size)
2128                                error[i]++;
2129                        continue;
2130                }
2131
2132                counts[bit]++;
2133        }
2134
2135        for_each_set_bit(bit, (unsigned long *)&mask, size) {
2136                if ((counts[bit] == 0) && (error[bit] == 0))
2137                        continue;
2138
2139                event = cpuc->events[bit];
2140                if (WARN_ON_ONCE(!event))
2141                        continue;
2142
2143                if (WARN_ON_ONCE(!event->attr.precise_ip))
2144                        continue;
2145
2146                /* log the number of dropped samples */
2147                if (error[bit]) {
2148                        perf_log_lost_samples(event, error[bit]);
2149
2150                        if (iregs && perf_event_account_interrupt(event))
2151                                x86_pmu_stop(event, 0);
2152                }
2153
2154                if (counts[bit]) {
2155                        __intel_pmu_pebs_event(event, iregs, data, base,
2156                                               top, bit, counts[bit],
2157                                               setup_pebs_fixed_sample_data);
2158                }
2159        }
2160}
2161
2162static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
2163{
2164        short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
2165        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2166        int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
2167        int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
2168        struct debug_store *ds = cpuc->ds;
2169        struct perf_event *event;
2170        void *base, *at, *top;
2171        int bit, size;
2172        u64 mask;
2173
2174        if (!x86_pmu.pebs_active)
2175                return;
2176
2177        base = (struct pebs_basic *)(unsigned long)ds->pebs_buffer_base;
2178        top = (struct pebs_basic *)(unsigned long)ds->pebs_index;
2179
2180        ds->pebs_index = ds->pebs_buffer_base;
2181
2182        mask = ((1ULL << max_pebs_events) - 1) |
2183               (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
2184        size = INTEL_PMC_IDX_FIXED + num_counters_fixed;
2185
2186        if (unlikely(base >= top)) {
2187                intel_pmu_pebs_event_update_no_drain(cpuc, size);
2188                return;
2189        }
2190
2191        for (at = base; at < top; at += cpuc->pebs_record_size) {
2192                u64 pebs_status;
2193
2194                pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
2195                pebs_status &= mask;
2196
2197                for_each_set_bit(bit, (unsigned long *)&pebs_status, size)
2198                        counts[bit]++;
2199        }
2200
2201        for_each_set_bit(bit, (unsigned long *)&mask, size) {
2202                if (counts[bit] == 0)
2203                        continue;
2204
2205                event = cpuc->events[bit];
2206                if (WARN_ON_ONCE(!event))
2207                        continue;
2208
2209                if (WARN_ON_ONCE(!event->attr.precise_ip))
2210                        continue;
2211
2212                __intel_pmu_pebs_event(event, iregs, data, base,
2213                                       top, bit, counts[bit],
2214                                       setup_pebs_adaptive_sample_data);
2215        }
2216}
2217
2218/*
2219 * BTS, PEBS probe and setup
2220 */
2221
2222void __init intel_ds_init(void)
2223{
2224        /*
2225         * No support for 32-bit formats
2226         */
2227        if (!boot_cpu_has(X86_FEATURE_DTES64))
2228                return;
2229
2230        x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
2231        x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
2232        x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
2233        if (x86_pmu.version <= 4)
2234                x86_pmu.pebs_no_isolation = 1;
2235
2236        if (x86_pmu.pebs) {
2237                char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
2238                char *pebs_qual = "";
2239                int format = x86_pmu.intel_cap.pebs_format;
2240
2241                if (format < 4)
2242                        x86_pmu.intel_cap.pebs_baseline = 0;
2243
2244                switch (format) {
2245                case 0:
2246                        pr_cont("PEBS fmt0%c, ", pebs_type);
2247                        x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
2248                        /*
2249                         * Using >PAGE_SIZE buffers makes the WRMSR to
2250                         * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
2251                         * mysteriously hang on Core2.
2252                         *
2253                         * As a workaround, we don't do this.
2254                         */
2255                        x86_pmu.pebs_buffer_size = PAGE_SIZE;
2256                        x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
2257                        break;
2258
2259                case 1:
2260                        pr_cont("PEBS fmt1%c, ", pebs_type);
2261                        x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
2262                        x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
2263                        break;
2264
2265                case 2:
2266                        pr_cont("PEBS fmt2%c, ", pebs_type);
2267                        x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
2268                        x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
2269                        break;
2270
2271                case 3:
2272                        pr_cont("PEBS fmt3%c, ", pebs_type);
2273                        x86_pmu.pebs_record_size =
2274                                                sizeof(struct pebs_record_skl);
2275                        x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
2276                        x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
2277                        break;
2278
2279                case 4:
2280                case 5:
2281                        x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl;
2282                        x86_pmu.pebs_record_size = sizeof(struct pebs_basic);
2283                        if (x86_pmu.intel_cap.pebs_baseline) {
2284                                x86_pmu.large_pebs_flags |=
2285                                        PERF_SAMPLE_BRANCH_STACK |
2286                                        PERF_SAMPLE_TIME;
2287                                x86_pmu.flags |= PMU_FL_PEBS_ALL;
2288                                x86_pmu.pebs_capable = ~0ULL;
2289                                pebs_qual = "-baseline";
2290                                x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
2291                        } else {
2292                                /* Only basic record supported */
2293                                x86_pmu.large_pebs_flags &=
2294                                        ~(PERF_SAMPLE_ADDR |
2295                                          PERF_SAMPLE_TIME |
2296                                          PERF_SAMPLE_DATA_SRC |
2297                                          PERF_SAMPLE_TRANSACTION |
2298                                          PERF_SAMPLE_REGS_USER |
2299                                          PERF_SAMPLE_REGS_INTR);
2300                        }
2301                        pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual);
2302
2303                        if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) {
2304                                pr_cont("PEBS-via-PT, ");
2305                                x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
2306                        }
2307
2308                        break;
2309
2310                default:
2311                        pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
2312                        x86_pmu.pebs = 0;
2313                }
2314        }
2315}
2316
2317void perf_restore_debug_store(void)
2318{
2319        struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
2320
2321        if (!x86_pmu.bts && !x86_pmu.pebs)
2322                return;
2323
2324        wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
2325}
2326