linux/drivers/edac/mce_amd.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <linux/module.h>
   3#include <linux/slab.h>
   4
   5#include <asm/cpu.h>
   6
   7#include "mce_amd.h"
   8
   9static struct amd_decoder_ops fam_ops;
  10
  11static u8 xec_mask       = 0xf;
  12
  13static void (*decode_dram_ecc)(int node_id, struct mce *m);
  14
/*
 * Install @f as the DRAM ECC decoder callback.
 *
 * The pointer is stored in decode_dram_ecc, which decode_mc4_mce() invokes
 * with the node id and the MCE record when it sees a DRAM ECC error.
 * A previously installed callback is overwritten without any check.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
        decode_dram_ecc = f;
}
  19EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
  20
  21void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
  22{
  23        if (decode_dram_ecc) {
  24                WARN_ON(decode_dram_ecc != f);
  25
  26                decode_dram_ecc = NULL;
  27        }
  28}
  29EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
  30
  31/*
  32 * string representation for the different MCA reported error types, see F3x48
  33 * or MSR0000_0411.
  34 */
  35
/* Names of the transaction type (TT) field values; "RESV" marks a reserved encoding. */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
  38
/* Names of the cache level (LL) field values; "RESV" marks a reserved encoding. */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
  41
/*
 * Names of the memory transaction type (RRRR) field values. Only nine
 * entries are provided.
 */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};
  46
/* Names of the participating processor (PP) field values; non-static because it is exported. */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
  49EXPORT_SYMBOL_GPL(pp_msgs);
  50
/* Names of the two request timeout states. */
static const char * const to_msgs[] = { "no timeout", "timed out" };
  53
/* Names of the memory-or-I/O (II) field values; "RESV" marks a reserved encoding. */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
  56
/* Names of the internal error type (UU) field values; only one entry is not "RESV". */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
  59
/*
 * MC1 error descriptions consumed by f15h_mc1_mce().
 *
 * The table is packed, so the index is not always the extended error code:
 *   xec 0x0  - 0xa  -> index xec
 *   xec 0xd         -> index xec - 2
 *   xec 0x10 - 0x15 -> index xec - 4
 * Entries for xec 0x11 - 0x15 are printed as "Decoder <entry> parity error".
 */
static const char * const f15h_mc1_mce_desc[] = {
        "UC during a demand linefill from L2",
        "Parity error during data load from IC",
        "Parity error for IC valid bit",
        "Main tag parity error",
        "Parity error in prediction queue",
        "PFB data/address parity error",
        "Parity error in the branch status reg",
        "PFB promotion address error",
        "Tag error during probe/victimization",
        "Parity error for IC probe tag valid bit",
        "PFB non-cacheable bit parity error",
        "PFB valid bit parity error",                   /* xec = 0xd */
        "Microcode Patch Buffer",                       /* xec = 0x10 */
        "uop queue",
        "insn buffer",
        "predecode buffer",
        "fetch address FIFO",
        "dispatch uop queue"
};
  80
/*
 * MC2 memory error descriptions consumed by f15h_mc2_mce().
 *
 * The table is packed:
 *   xec 0x4  - 0xc  -> index xec - 0x4
 *   xec 0x10 - 0x14 -> index xec - 0x7
 */
static const char * const f15h_mc2_mce_desc[] = {
        "Fill ECC error on data fills",                 /* xec = 0x4 */
        "Fill parity error on insn fills",
        "Prefetcher request FIFO parity error",
        "PRQ address parity error",
        "PRQ data parity error",
        "WCC Tag ECC error",
        "WCC Data ECC error",
        "WCB Data parity error",
        "VB Data ECC or parity error",
        "L2 Tag ECC error",                             /* xec = 0x10 */
        "Hard L2 Tag ECC error",
        "Multiple hits on L2 tag",
        "XAB parity error",
        "PRB address parity error"
};
  97
/*
 * MC4 error descriptions consumed by decode_mc4_mce().
 *
 * Extended error codes 0x0 - 0xe index the table directly; codes
 * 0x1c - 0x1f are looked up at xec - 13, i.e. the last four entries.
 * Entries 0x0 and 0x8 carry the same DRAM ECC text.
 */
static const char * const mc4_mce_desc[] = {
        "DRAM ECC error detected on the NB",
        "CRC error detected on HT link",
        "Link-defined sync error packets detected on HT link",
        "HT Master abort",
        "HT Target abort",
        "Invalid GART PTE entry during GART table walk",
        "Unsupported atomic RMW received from an IO link",
        "Watchdog timeout due to lack of progress",
        "DRAM ECC error detected on the NB",
        "SVM DMA Exclusion Vector error",
        "HT data error detected on link",
        "Protocol error (link, L3, probe filter)",
        "NB internal arrays parity error",
        "DRAM addr/ctl signals parity error",
        "IO link transmission error",
        "L3 data cache ECC error",                      /* xec = 0x1c */
        "L3 cache tag error",
        "L3 LRU parity bits error",
        "ECC Error in the Probe Filter directory"
};
 119
/*
 * MC5 error descriptions. The consumer is not in this part of the file;
 * presumably indexed by extended error code -- confirm against the decoder.
 */
static const char * const mc5_mce_desc[] = {
        "CPU Watchdog timer expire",
        "Wakeup array dest tag",
        "AG payload array",
        "EX payload array",
        "IDRF array",
        "Retire dispatch queue",
        "Mapper checkpoint array",
        "Physical register file EX0 port",
        "Physical register file EX1 port",
        "Physical register file AG0 port",
        "Physical register file AG1 port",
        "Flag register file",
        "DE error occurred",
        "Retire status queue"
};
 136
/*
 * MC6 error descriptions. The consumer is not in this part of the file;
 * presumably indexed by extended error code -- confirm against the decoder.
 */
static const char * const mc6_mce_desc[] = {
        "Hardware Assertion",
        "Free List",
        "Physical Register File",
        "Retire Queue",
        "Scheduler table",
        "Status Register File",
};
 145
 146/* Scalable MCA error strings */
/* Error descriptions registered under the SMCA_LS slot of smca_mce_descs[]. */
static const char * const smca_ls_mce_desc[] = {
        "Load queue parity error",
        "Store queue parity error",
        "Miss address buffer payload parity error",
        "Level 1 TLB parity error",
        "DC Tag error type 5",
        "DC Tag error type 6",
        "DC Tag error type 1",
        "Internal error type 1",
        "Internal error type 2",
        "System Read Data Error Thread 0",
        "System Read Data Error Thread 1",
        "DC Tag error type 2",
        "DC Data error type 1 and poison consumption",
        "DC Data error type 2",
        "DC Data error type 3",
        "DC Tag error type 4",
        "Level 2 TLB parity error",
        "PDC parity error",
        "DC Tag error type 3",
        "DC Tag error type 5",
        "L2 Fill Data error",
};
 170
/* Error descriptions registered under the SMCA_LS_V2 slot of smca_mce_descs[]. */
static const char * const smca_ls2_mce_desc[] = {
        "An ECC error was detected on a data cache read by a probe or victimization",
        "An ECC error or L2 poison was detected on a data cache read by a load",
        "An ECC error was detected on a data cache read-modify-write by a store",
        "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
        "An ECC error or poison bit mismatch was detected on a tag read by a load",
        "An ECC error or poison bit mismatch was detected on a tag read by a store",
        "An ECC error was detected on an EMEM read by a load",
        "An ECC error was detected on an EMEM read-modify-write by a store",
        "A parity error was detected in an L1 TLB entry by any access",
        "A parity error was detected in an L2 TLB entry by any access",
        "A parity error was detected in a PWC entry by any access",
        "A parity error was detected in an STQ entry by any access",
        "A parity error was detected in an LDQ entry by any access",
        "A parity error was detected in a MAB entry by any access",
        "A parity error was detected in an SCB entry state field by any access",
        "A parity error was detected in an SCB entry address field by any access",
        "A parity error was detected in an SCB entry data field by any access",
        "A parity error was detected in a WCB entry by any access",
        "A poisoned line was detected in an SCB entry by any access",
        "A SystemReadDataError error was reported on read data returned from L2 for a load",
        "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
        "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
        "A hardware assertion error was reported",
        "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
};
 197
/* Error descriptions registered under the SMCA_IF slot of smca_mce_descs[]. */
static const char * const smca_if_mce_desc[] = {
        "Op Cache Microtag Probe Port Parity Error",
        "IC Microtag or Full Tag Multi-hit Error",
        "IC Full Tag Parity Error",
        "IC Data Array Parity Error",
        "Decoupling Queue PhysAddr Parity Error",
        "L0 ITLB Parity Error",
        "L1 ITLB Parity Error",
        "L2 ITLB Parity Error",
        "BPQ Thread 0 Snoop Parity Error",
        "BPQ Thread 1 Snoop Parity Error",
        "L1 BTB Multi-Match Error",
        "L2 BTB Multi-Match Error",
        "L2 Cache Response Poison Error",
        "System Read Data Error",
        "Hardware Assertion Error",
        "L1-TLB Multi-Hit",
        "L2-TLB Multi-Hit",
        "BSR Parity Error",
        "CT MCE",
};
 219
/* Error descriptions registered under the SMCA_L2_CACHE slot of smca_mce_descs[]. */
static const char * const smca_l2_mce_desc[] = {
        "L2M Tag Multiple-Way-Hit error",
        "L2M Tag or State Array ECC Error",
        "L2M Data Array ECC Error",
        "Hardware Assert Error",
};
 226
/* Error descriptions registered under the SMCA_DE slot of smca_mce_descs[]. */
static const char * const smca_de_mce_desc[] = {
        "Micro-op cache tag parity error",
        "Micro-op cache data parity error",
        "Instruction buffer parity error",
        "Micro-op queue parity error",
        "Instruction dispatch queue parity error",
        "Fetch address FIFO parity error",
        "Patch RAM data parity error",
        "Patch RAM sequencer parity error",
        "Micro-op buffer parity error",
        "Hardware Assertion MCA Error",
};
 239
/* Error descriptions registered under the SMCA_EX slot of smca_mce_descs[]. */
static const char * const smca_ex_mce_desc[] = {
        "Watchdog Timeout error",
        "Physical register file parity error",
        "Flag register file parity error",
        "Immediate displacement register file parity error",
        "Address generator payload parity error",
        "EX payload parity error",
        "Checkpoint queue parity error",
        "Retire dispatch queue parity error",
        "Retire status queue parity error",
        "Scheduling queue parity error",
        "Branch buffer queue parity error",
        "Hardware Assertion error",
        "Spec Map parity error",
        "Retire Map parity error",
};
 256
/* Error descriptions registered under the SMCA_FP slot of smca_mce_descs[]. */
static const char * const smca_fp_mce_desc[] = {
        "Physical register file (PRF) parity error",
        "Freelist (FL) parity error",
        "Schedule queue parity error",
        "NSQ parity error",
        "Retire queue (RQ) parity error",
        "Status register file (SRF) parity error",
        "Hardware assertion",
};
 266
/* Error descriptions registered under the SMCA_L3_CACHE slot of smca_mce_descs[]. */
static const char * const smca_l3_mce_desc[] = {
        "Shadow Tag Macro ECC Error",
        "Shadow Tag Macro Multi-way-hit Error",
        "L3M Tag ECC Error",
        "L3M Tag Multi-way-hit Error",
        "L3M Data ECC Error",
        "SDP Parity Error or SystemReadDataError from XI",
        "L3 Victim Queue Parity Error",
        "L3 Hardware Assertion",
};
 277
/* Error descriptions registered under the SMCA_CS slot of smca_mce_descs[]. */
static const char * const smca_cs_mce_desc[] = {
        "Illegal Request",
        "Address Violation",
        "Security Violation",
        "Illegal Response",
        "Unexpected Response",
        "Request or Probe Parity Error",
        "Read Response Parity Error",
        "Atomic Request Parity Error",
        "Probe Filter ECC Error",
};
 289
/*
 * Error descriptions registered under the SMCA_CS_V2 slot of
 * smca_mce_descs[]. The first eight entries match smca_cs_mce_desc[].
 */
static const char * const smca_cs2_mce_desc[] = {
        "Illegal Request",
        "Address Violation",
        "Security Violation",
        "Illegal Response",
        "Unexpected Response",
        "Request or Probe Parity Error",
        "Read Response Parity Error",
        "Atomic Request Parity Error",
        "SDP read response had no match in the CS queue",
        "Probe Filter Protocol Error",
        "Probe Filter ECC Error",
        "SDP read response had an unexpected RETRY error",
        "Counter overflow error",
        "Counter underflow error",
};
 306
/* Error descriptions registered under the SMCA_PIE slot of smca_mce_descs[]. */
static const char * const smca_pie_mce_desc[] = {
        "Hardware Assert",
        "Register security violation",
        "Link Error",
        "Poison data consumption",
        "A deferred error was detected in the DF"
};
 314
/* Error descriptions registered under the SMCA_UMC slot of smca_mce_descs[]. */
static const char * const smca_umc_mce_desc[] = {
        "DRAM ECC error",
        "Data poison error",
        "SDP parity error",
        "Advanced peripheral bus error",
        "Address/Command parity error",
        "Write data CRC error",
        "DCQ SRAM ECC error",
        "AES SRAM ECC error",
};
 325
/* Single error description registered under the SMCA_PB slot of smca_mce_descs[]. */
static const char * const smca_pb_mce_desc[] = {
        "An ECC error in the Parameter Block RAM array",
};
 329
/* Single error description registered under the SMCA_PSP slot of smca_mce_descs[]. */
static const char * const smca_psp_mce_desc[] = {
        "An ECC or parity error in a PSP RAM instance",
};
 333
/* Error descriptions registered under the SMCA_PSP_V2 slot of smca_mce_descs[]. */
static const char * const smca_psp2_mce_desc[] = {
        "High SRAM ECC or parity error",
        "Low SRAM ECC or parity error",
        "Instruction Cache Bank 0 ECC or parity error",
        "Instruction Cache Bank 1 ECC or parity error",
        "Instruction Tag Ram 0 parity error",
        "Instruction Tag Ram 1 parity error",
        "Data Cache Bank 0 ECC or parity error",
        "Data Cache Bank 1 ECC or parity error",
        "Data Cache Bank 2 ECC or parity error",
        "Data Cache Bank 3 ECC or parity error",
        "Data Tag Bank 0 parity error",
        "Data Tag Bank 1 parity error",
        "Data Tag Bank 2 parity error",
        "Data Tag Bank 3 parity error",
        "Dirty Data Ram parity error",
        "TLB Bank 0 parity error",
        "TLB Bank 1 parity error",
        "System Hub Read Buffer ECC or parity error",
};
 354
/* Single error description registered under the SMCA_SMU slot of smca_mce_descs[]. */
static const char * const smca_smu_mce_desc[] = {
        "An ECC or parity error in an SMU RAM instance",
};
 358
/* Error descriptions registered under the SMCA_SMU_V2 slot of smca_mce_descs[]. */
static const char * const smca_smu2_mce_desc[] = {
        "High SRAM ECC or parity error",
        "Low SRAM ECC or parity error",
        "Data Cache Bank A ECC or parity error",
        "Data Cache Bank B ECC or parity error",
        "Data Tag Cache Bank A ECC or parity error",
        "Data Tag Cache Bank B ECC or parity error",
        "Instruction Cache Bank A ECC or parity error",
        "Instruction Cache Bank B ECC or parity error",
        "Instruction Tag Cache Bank A ECC or parity error",
        "Instruction Tag Cache Bank B ECC or parity error",
        "System Hub Read Buffer ECC or parity error",
        "PHY RAM ECC error",
};
 373
/*
 * Error descriptions registered under the SMCA_MP5 slot of
 * smca_mce_descs[]. The entries match the first ten of smca_smu2_mce_desc[].
 */
static const char * const smca_mp5_mce_desc[] = {
        "High SRAM ECC or parity error",
        "Low SRAM ECC or parity error",
        "Data Cache Bank A ECC or parity error",
        "Data Cache Bank B ECC or parity error",
        "Data Tag Cache Bank A ECC or parity error",
        "Data Tag Cache Bank B ECC or parity error",
        "Instruction Cache Bank A ECC or parity error",
        "Instruction Cache Bank B ECC or parity error",
        "Instruction Tag Cache Bank A ECC or parity error",
        "Instruction Tag Cache Bank B ECC or parity error",
};
 386
/* Error descriptions registered under the SMCA_NBIO slot of smca_mce_descs[]. */
static const char * const smca_nbio_mce_desc[] = {
        "ECC or Parity error",
        "PCIE error",
        "SDP ErrEvent error",
        "SDP Egress Poison Error",
        "IOHC Internal Poison Error",
};
 394
/* Error descriptions registered under the SMCA_PCIE slot of smca_mce_descs[]. */
static const char * const smca_pcie_mce_desc[] = {
        "CCIX PER Message logging",
        "CCIX Read Response with Status: Non-Data Error",
        "CCIX Write Response with Status: Non-Data Error",
        "CCIX Read Response with Status: Data Error",
        "CCIX Non-okay write response with data error",
};
 402
/* One table of error description strings together with its length. */
struct smca_mce_desc {
        const char * const *descs;      /* array of description strings */
        unsigned int num_descs;         /* number of entries in @descs */
};
 407
/*
 * Description tables keyed by SMCA_* index. Each slot pairs a string array
 * with its ARRAY_SIZE() so a consumer can bounds-check a lookup. Any index
 * not listed in this designated initializer is zero-filled (NULL descs,
 * num_descs == 0).
 */
static struct smca_mce_desc smca_mce_descs[] = {
        [SMCA_LS]       = { smca_ls_mce_desc,   ARRAY_SIZE(smca_ls_mce_desc)    },
        [SMCA_LS_V2]    = { smca_ls2_mce_desc,  ARRAY_SIZE(smca_ls2_mce_desc)   },
        [SMCA_IF]       = { smca_if_mce_desc,   ARRAY_SIZE(smca_if_mce_desc)    },
        [SMCA_L2_CACHE] = { smca_l2_mce_desc,   ARRAY_SIZE(smca_l2_mce_desc)    },
        [SMCA_DE]       = { smca_de_mce_desc,   ARRAY_SIZE(smca_de_mce_desc)    },
        [SMCA_EX]       = { smca_ex_mce_desc,   ARRAY_SIZE(smca_ex_mce_desc)    },
        [SMCA_FP]       = { smca_fp_mce_desc,   ARRAY_SIZE(smca_fp_mce_desc)    },
        [SMCA_L3_CACHE] = { smca_l3_mce_desc,   ARRAY_SIZE(smca_l3_mce_desc)    },
        [SMCA_CS]       = { smca_cs_mce_desc,   ARRAY_SIZE(smca_cs_mce_desc)    },
        [SMCA_CS_V2]    = { smca_cs2_mce_desc,  ARRAY_SIZE(smca_cs2_mce_desc)   },
        [SMCA_PIE]      = { smca_pie_mce_desc,  ARRAY_SIZE(smca_pie_mce_desc)   },
        [SMCA_UMC]      = { smca_umc_mce_desc,  ARRAY_SIZE(smca_umc_mce_desc)   },
        [SMCA_PB]       = { smca_pb_mce_desc,   ARRAY_SIZE(smca_pb_mce_desc)    },
        [SMCA_PSP]      = { smca_psp_mce_desc,  ARRAY_SIZE(smca_psp_mce_desc)   },
        [SMCA_PSP_V2]   = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)  },
        [SMCA_SMU]      = { smca_smu_mce_desc,  ARRAY_SIZE(smca_smu_mce_desc)   },
        [SMCA_SMU_V2]   = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)  },
        [SMCA_MP5]      = { smca_mp5_mce_desc,  ARRAY_SIZE(smca_mp5_mce_desc)   },
        [SMCA_NBIO]     = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)  },
        [SMCA_PCIE]     = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)  },
};
 430
 431static bool f12h_mc0_mce(u16 ec, u8 xec)
 432{
 433        bool ret = false;
 434
 435        if (MEM_ERROR(ec)) {
 436                u8 ll = LL(ec);
 437                ret = true;
 438
 439                if (ll == LL_L2)
 440                        pr_cont("during L1 linefill from L2.\n");
 441                else if (ll == LL_L1)
 442                        pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
 443                else
 444                        ret = false;
 445        }
 446        return ret;
 447}
 448
 449static bool f10h_mc0_mce(u16 ec, u8 xec)
 450{
 451        if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
 452                pr_cont("during data scrub.\n");
 453                return true;
 454        }
 455        return f12h_mc0_mce(ec, xec);
 456}
 457
 458static bool k8_mc0_mce(u16 ec, u8 xec)
 459{
 460        if (BUS_ERROR(ec)) {
 461                pr_cont("during system linefill.\n");
 462                return true;
 463        }
 464
 465        return f10h_mc0_mce(ec, xec);
 466}
 467
 468static bool cat_mc0_mce(u16 ec, u8 xec)
 469{
 470        u8 r4    = R4(ec);
 471        bool ret = true;
 472
 473        if (MEM_ERROR(ec)) {
 474
 475                if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
 476                        return false;
 477
 478                switch (r4) {
 479                case R4_DRD:
 480                case R4_DWR:
 481                        pr_cont("Data/Tag parity error due to %s.\n",
 482                                (r4 == R4_DRD ? "load/hw prf" : "store"));
 483                        break;
 484                case R4_EVICT:
 485                        pr_cont("Copyback parity error on a tag miss.\n");
 486                        break;
 487                case R4_SNOOP:
 488                        pr_cont("Tag parity error during snoop.\n");
 489                        break;
 490                default:
 491                        ret = false;
 492                }
 493        } else if (BUS_ERROR(ec)) {
 494
 495                if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
 496                        return false;
 497
 498                pr_cont("System read data error on a ");
 499
 500                switch (r4) {
 501                case R4_RD:
 502                        pr_cont("TLB reload.\n");
 503                        break;
 504                case R4_DWR:
 505                        pr_cont("store.\n");
 506                        break;
 507                case R4_DRD:
 508                        pr_cont("load.\n");
 509                        break;
 510                default:
 511                        ret = false;
 512                }
 513        } else {
 514                ret = false;
 515        }
 516
 517        return ret;
 518}
 519
 520static bool f15h_mc0_mce(u16 ec, u8 xec)
 521{
 522        bool ret = true;
 523
 524        if (MEM_ERROR(ec)) {
 525
 526                switch (xec) {
 527                case 0x0:
 528                        pr_cont("Data Array access error.\n");
 529                        break;
 530
 531                case 0x1:
 532                        pr_cont("UC error during a linefill from L2/NB.\n");
 533                        break;
 534
 535                case 0x2:
 536                case 0x11:
 537                        pr_cont("STQ access error.\n");
 538                        break;
 539
 540                case 0x3:
 541                        pr_cont("SCB access error.\n");
 542                        break;
 543
 544                case 0x10:
 545                        pr_cont("Tag error.\n");
 546                        break;
 547
 548                case 0x12:
 549                        pr_cont("LDQ access error.\n");
 550                        break;
 551
 552                default:
 553                        ret = false;
 554                }
 555        } else if (BUS_ERROR(ec)) {
 556
 557                if (!xec)
 558                        pr_cont("System Read Data Error.\n");
 559                else
 560                        pr_cont(" Internal error condition type %d.\n", xec);
 561        } else if (INT_ERROR(ec)) {
 562                if (xec <= 0x1f)
 563                        pr_cont("Hardware Assert.\n");
 564                else
 565                        ret = false;
 566
 567        } else
 568                ret = false;
 569
 570        return ret;
 571}
 572
 573static void decode_mc0_mce(struct mce *m)
 574{
 575        u16 ec = EC(m->status);
 576        u8 xec = XEC(m->status, xec_mask);
 577
 578        pr_emerg(HW_ERR "MC0 Error: ");
 579
 580        /* TLB error signatures are the same across families */
 581        if (TLB_ERROR(ec)) {
 582                if (TT(ec) == TT_DATA) {
 583                        pr_cont("%s TLB %s.\n", LL_MSG(ec),
 584                                ((xec == 2) ? "locked miss"
 585                                            : (xec ? "multimatch" : "parity")));
 586                        return;
 587                }
 588        } else if (fam_ops.mc0_mce(ec, xec))
 589                ;
 590        else
 591                pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
 592}
 593
 594static bool k8_mc1_mce(u16 ec, u8 xec)
 595{
 596        u8 ll    = LL(ec);
 597        bool ret = true;
 598
 599        if (!MEM_ERROR(ec))
 600                return false;
 601
 602        if (ll == 0x2)
 603                pr_cont("during a linefill from L2.\n");
 604        else if (ll == 0x1) {
 605                switch (R4(ec)) {
 606                case R4_IRD:
 607                        pr_cont("Parity error during data load.\n");
 608                        break;
 609
 610                case R4_EVICT:
 611                        pr_cont("Copyback Parity/Victim error.\n");
 612                        break;
 613
 614                case R4_SNOOP:
 615                        pr_cont("Tag Snoop error.\n");
 616                        break;
 617
 618                default:
 619                        ret = false;
 620                        break;
 621                }
 622        } else
 623                ret = false;
 624
 625        return ret;
 626}
 627
 628static bool cat_mc1_mce(u16 ec, u8 xec)
 629{
 630        u8 r4    = R4(ec);
 631        bool ret = true;
 632
 633        if (!MEM_ERROR(ec))
 634                return false;
 635
 636        if (TT(ec) != TT_INSTR)
 637                return false;
 638
 639        if (r4 == R4_IRD)
 640                pr_cont("Data/tag array parity error for a tag hit.\n");
 641        else if (r4 == R4_SNOOP)
 642                pr_cont("Tag error during snoop/victimization.\n");
 643        else if (xec == 0x0)
 644                pr_cont("Tag parity error from victim castout.\n");
 645        else if (xec == 0x2)
 646                pr_cont("Microcode patch RAM parity error.\n");
 647        else
 648                ret = false;
 649
 650        return ret;
 651}
 652
 653static bool f15h_mc1_mce(u16 ec, u8 xec)
 654{
 655        bool ret = true;
 656
 657        if (!MEM_ERROR(ec))
 658                return false;
 659
 660        switch (xec) {
 661        case 0x0 ... 0xa:
 662                pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
 663                break;
 664
 665        case 0xd:
 666                pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
 667                break;
 668
 669        case 0x10:
 670                pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
 671                break;
 672
 673        case 0x11 ... 0x15:
 674                pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
 675                break;
 676
 677        default:
 678                ret = false;
 679        }
 680        return ret;
 681}
 682
 683static void decode_mc1_mce(struct mce *m)
 684{
 685        u16 ec = EC(m->status);
 686        u8 xec = XEC(m->status, xec_mask);
 687
 688        pr_emerg(HW_ERR "MC1 Error: ");
 689
 690        if (TLB_ERROR(ec))
 691                pr_cont("%s TLB %s.\n", LL_MSG(ec),
 692                        (xec ? "multimatch" : "parity error"));
 693        else if (BUS_ERROR(ec)) {
 694                bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
 695
 696                pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
 697        } else if (INT_ERROR(ec)) {
 698                if (xec <= 0x3f)
 699                        pr_cont("Hardware Assert.\n");
 700                else
 701                        goto wrong_mc1_mce;
 702        } else if (fam_ops.mc1_mce(ec, xec))
 703                ;
 704        else
 705                goto wrong_mc1_mce;
 706
 707        return;
 708
 709wrong_mc1_mce:
 710        pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
 711}
 712
 713static bool k8_mc2_mce(u16 ec, u8 xec)
 714{
 715        bool ret = true;
 716
 717        if (xec == 0x1)
 718                pr_cont(" in the write data buffers.\n");
 719        else if (xec == 0x3)
 720                pr_cont(" in the victim data buffers.\n");
 721        else if (xec == 0x2 && MEM_ERROR(ec))
 722                pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
 723        else if (xec == 0x0) {
 724                if (TLB_ERROR(ec))
 725                        pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
 726                                TT_MSG(ec));
 727                else if (BUS_ERROR(ec))
 728                        pr_cont(": %s/ECC error in data read from NB: %s.\n",
 729                                R4_MSG(ec), PP_MSG(ec));
 730                else if (MEM_ERROR(ec)) {
 731                        u8 r4 = R4(ec);
 732
 733                        if (r4 >= 0x7)
 734                                pr_cont(": %s error during data copyback.\n",
 735                                        R4_MSG(ec));
 736                        else if (r4 <= 0x1)
 737                                pr_cont(": %s parity/ECC error during data "
 738                                        "access from L2.\n", R4_MSG(ec));
 739                        else
 740                                ret = false;
 741                } else
 742                        ret = false;
 743        } else
 744                ret = false;
 745
 746        return ret;
 747}
 748
 749static bool f15h_mc2_mce(u16 ec, u8 xec)
 750{
 751        bool ret = true;
 752
 753        if (TLB_ERROR(ec)) {
 754                if (xec == 0x0)
 755                        pr_cont("Data parity TLB read error.\n");
 756                else if (xec == 0x1)
 757                        pr_cont("Poison data provided for TLB fill.\n");
 758                else
 759                        ret = false;
 760        } else if (BUS_ERROR(ec)) {
 761                if (xec > 2)
 762                        ret = false;
 763
 764                pr_cont("Error during attempted NB data read.\n");
 765        } else if (MEM_ERROR(ec)) {
 766                switch (xec) {
 767                case 0x4 ... 0xc:
 768                        pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
 769                        break;
 770
 771                case 0x10 ... 0x14:
 772                        pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
 773                        break;
 774
 775                default:
 776                        ret = false;
 777                }
 778        } else if (INT_ERROR(ec)) {
 779                if (xec <= 0x3f)
 780                        pr_cont("Hardware Assert.\n");
 781                else
 782                        ret = false;
 783        }
 784
 785        return ret;
 786}
 787
 788static bool f16h_mc2_mce(u16 ec, u8 xec)
 789{
 790        u8 r4 = R4(ec);
 791
 792        if (!MEM_ERROR(ec))
 793                return false;
 794
 795        switch (xec) {
 796        case 0x04 ... 0x05:
 797                pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
 798                break;
 799
 800        case 0x09 ... 0x0b:
 801        case 0x0d ... 0x0f:
 802                pr_cont("ECC error in L2 tag (%s).\n",
 803                        ((r4 == R4_GEN)   ? "BankReq" :
 804                        ((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
 805                break;
 806
 807        case 0x10 ... 0x19:
 808        case 0x1b:
 809                pr_cont("ECC error in L2 data array (%s).\n",
 810                        (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
 811                        ((r4 == R4_GEN)   ? "Attr" :
 812                        ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
 813                break;
 814
 815        case 0x1c ... 0x1d:
 816        case 0x1f:
 817                pr_cont("Parity error in L2 attribute bits (%s).\n",
 818                        ((r4 == R4_RD)  ? "Hit"  :
 819                        ((r4 == R4_GEN) ? "Attr" : "Fill")));
 820                break;
 821
 822        default:
 823                return false;
 824        }
 825
 826        return true;
 827}
 828
 829static void decode_mc2_mce(struct mce *m)
 830{
 831        u16 ec = EC(m->status);
 832        u8 xec = XEC(m->status, xec_mask);
 833
 834        pr_emerg(HW_ERR "MC2 Error: ");
 835
 836        if (!fam_ops.mc2_mce(ec, xec))
 837                pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
 838}
 839
 840static void decode_mc3_mce(struct mce *m)
 841{
 842        u16 ec = EC(m->status);
 843        u8 xec = XEC(m->status, xec_mask);
 844
 845        if (boot_cpu_data.x86 >= 0x14) {
 846                pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
 847                         " please report on LKML.\n");
 848                return;
 849        }
 850
 851        pr_emerg(HW_ERR "MC3 Error");
 852
 853        if (xec == 0x0) {
 854                u8 r4 = R4(ec);
 855
 856                if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
 857                        goto wrong_mc3_mce;
 858
 859                pr_cont(" during %s.\n", R4_MSG(ec));
 860        } else
 861                goto wrong_mc3_mce;
 862
 863        return;
 864
 865 wrong_mc3_mce:
 866        pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
 867}
 868
 869static void decode_mc4_mce(struct mce *m)
 870{
 871        unsigned int fam = x86_family(m->cpuid);
 872        int node_id = topology_die_id(m->extcpu);
 873        u16 ec = EC(m->status);
 874        u8 xec = XEC(m->status, 0x1f);
 875        u8 offset = 0;
 876
 877        pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
 878
 879        switch (xec) {
 880        case 0x0 ... 0xe:
 881
 882                /* special handling for DRAM ECCs */
 883                if (xec == 0x0 || xec == 0x8) {
 884                        /* no ECCs on F11h */
 885                        if (fam == 0x11)
 886                                goto wrong_mc4_mce;
 887
 888                        pr_cont("%s.\n", mc4_mce_desc[xec]);
 889
 890                        if (decode_dram_ecc)
 891                                decode_dram_ecc(node_id, m);
 892                        return;
 893                }
 894                break;
 895
 896        case 0xf:
 897                if (TLB_ERROR(ec))
 898                        pr_cont("GART Table Walk data error.\n");
 899                else if (BUS_ERROR(ec))
 900                        pr_cont("DMA Exclusion Vector Table Walk error.\n");
 901                else
 902                        goto wrong_mc4_mce;
 903                return;
 904
 905        case 0x19:
 906                if (fam == 0x15 || fam == 0x16)
 907                        pr_cont("Compute Unit Data Error.\n");
 908                else
 909                        goto wrong_mc4_mce;
 910                return;
 911
 912        case 0x1c ... 0x1f:
 913                offset = 13;
 914                break;
 915
 916        default:
 917                goto wrong_mc4_mce;
 918        }
 919
 920        pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
 921        return;
 922
 923 wrong_mc4_mce:
 924        pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
 925}
 926
 927static void decode_mc5_mce(struct mce *m)
 928{
 929        unsigned int fam = x86_family(m->cpuid);
 930        u16 ec = EC(m->status);
 931        u8 xec = XEC(m->status, xec_mask);
 932
 933        if (fam == 0xf || fam == 0x11)
 934                goto wrong_mc5_mce;
 935
 936        pr_emerg(HW_ERR "MC5 Error: ");
 937
 938        if (INT_ERROR(ec)) {
 939                if (xec <= 0x1f) {
 940                        pr_cont("Hardware Assert.\n");
 941                        return;
 942                } else
 943                        goto wrong_mc5_mce;
 944        }
 945
 946        if (xec == 0x0 || xec == 0xc)
 947                pr_cont("%s.\n", mc5_mce_desc[xec]);
 948        else if (xec <= 0xd)
 949                pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
 950        else
 951                goto wrong_mc5_mce;
 952
 953        return;
 954
 955 wrong_mc5_mce:
 956        pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
 957}
 958
 959static void decode_mc6_mce(struct mce *m)
 960{
 961        u8 xec = XEC(m->status, xec_mask);
 962
 963        pr_emerg(HW_ERR "MC6 Error: ");
 964
 965        if (xec > 0x5)
 966                goto wrong_mc6_mce;
 967
 968        pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
 969        return;
 970
 971 wrong_mc6_mce:
 972        pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 973}
 974
 975/* Decode errors according to Scalable MCA specification */
 976static void decode_smca_error(struct mce *m)
 977{
 978        struct smca_hwid *hwid;
 979        enum smca_bank_types bank_type;
 980        const char *ip_name;
 981        u8 xec = XEC(m->status, xec_mask);
 982
 983        if (m->bank >= ARRAY_SIZE(smca_banks))
 984                return;
 985
 986        hwid = smca_banks[m->bank].hwid;
 987        if (!hwid)
 988                return;
 989
 990        bank_type = hwid->bank_type;
 991
 992        if (bank_type == SMCA_RESERVED) {
 993                pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
 994                return;
 995        }
 996
 997        ip_name = smca_get_long_name(bank_type);
 998
 999        pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
1000
1001        /* Only print the decode of valid error codes */
1002        if (xec < smca_mce_descs[bank_type].num_descs)
1003                pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
1004
1005        if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
1006                decode_dram_ecc(topology_die_id(m->extcpu), m);
1007}
1008
1009static inline void amd_decode_err_code(u16 ec)
1010{
1011        if (INT_ERROR(ec)) {
1012                pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
1013                return;
1014        }
1015
1016        pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
1017
1018        if (BUS_ERROR(ec))
1019                pr_cont(", mem/io: %s", II_MSG(ec));
1020        else
1021                pr_cont(", tx: %s", TT_MSG(ec));
1022
1023        if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
1024                pr_cont(", mem-tx: %s", R4_MSG(ec));
1025
1026                if (BUS_ERROR(ec))
1027                        pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
1028        }
1029
1030        pr_cont("\n");
1031}
1032
1033static const char *decode_error_status(struct mce *m)
1034{
1035        if (m->status & MCI_STATUS_UC) {
1036                if (m->status & MCI_STATUS_PCC)
1037                        return "System Fatal error.";
1038                if (m->mcgstatus & MCG_STATUS_RIPV)
1039                        return "Uncorrected, software restartable error.";
1040                return "Uncorrected, software containable error.";
1041        }
1042
1043        if (m->status & MCI_STATUS_DEFERRED)
1044                return "Deferred error, no action required.";
1045
1046        return "Corrected error, no action required.";
1047}
1048
1049static int
1050amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1051{
1052        struct mce *m = (struct mce *)data;
1053        unsigned int fam = x86_family(m->cpuid);
1054        int ecc;
1055
1056        if (m->kflags & MCE_HANDLED_CEC)
1057                return NOTIFY_DONE;
1058
1059        pr_emerg(HW_ERR "%s\n", decode_error_status(m));
1060
1061        pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
1062                m->extcpu,
1063                fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
1064                m->bank,
1065                ((m->status & MCI_STATUS_OVER)  ? "Over"  : "-"),
1066                ((m->status & MCI_STATUS_UC)    ? "UE"    :
1067                 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
1068                ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
1069                ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
1070                ((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"));
1071
1072        if (boot_cpu_has(X86_FEATURE_SMCA)) {
1073                u32 low, high;
1074                u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1075
1076                if (!rdmsr_safe(addr, &low, &high) &&
1077                    (low & MCI_CONFIG_MCAX))
1078                        pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1079
1080                pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
1081        }
1082
1083        /* do the two bits[14:13] together */
1084        ecc = (m->status >> 45) & 0x3;
1085        if (ecc)
1086                pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1087
1088        if (fam >= 0x15) {
1089                pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
1090
1091                /* F15h, bank4, bit 43 is part of McaStatSubCache. */
1092                if (fam != 0x15 || m->bank != 4)
1093                        pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
1094        }
1095
1096        if (fam >= 0x17)
1097                pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
1098
1099        pr_cont("]: 0x%016llx\n", m->status);
1100
1101        if (m->status & MCI_STATUS_ADDRV)
1102                pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
1103
1104        if (m->ppin)
1105                pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);
1106
1107        if (boot_cpu_has(X86_FEATURE_SMCA)) {
1108                pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
1109
1110                if (m->status & MCI_STATUS_SYNDV)
1111                        pr_cont(", Syndrome: 0x%016llx", m->synd);
1112
1113                pr_cont("\n");
1114
1115                decode_smca_error(m);
1116                goto err_code;
1117        }
1118
1119        if (m->tsc)
1120                pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1121
1122        /* Doesn't matter which member to test. */
1123        if (!fam_ops.mc0_mce)
1124                goto err_code;
1125
1126        switch (m->bank) {
1127        case 0:
1128                decode_mc0_mce(m);
1129                break;
1130
1131        case 1:
1132                decode_mc1_mce(m);
1133                break;
1134
1135        case 2:
1136                decode_mc2_mce(m);
1137                break;
1138
1139        case 3:
1140                decode_mc3_mce(m);
1141                break;
1142
1143        case 4:
1144                decode_mc4_mce(m);
1145                break;
1146
1147        case 5:
1148                decode_mc5_mce(m);
1149                break;
1150
1151        case 6:
1152                decode_mc6_mce(m);
1153                break;
1154
1155        default:
1156                break;
1157        }
1158
1159 err_code:
1160        amd_decode_err_code(m->status & 0xffff);
1161
1162        m->kflags |= MCE_HANDLED_EDAC;
1163        return NOTIFY_OK;
1164}
1165
1166static struct notifier_block amd_mce_dec_nb = {
1167        .notifier_call  = amd_decode_mce,
1168        .priority       = MCE_PRIO_EDAC,
1169};
1170
1171static int __init mce_amd_init(void)
1172{
1173        struct cpuinfo_x86 *c = &boot_cpu_data;
1174
1175        if (c->x86_vendor != X86_VENDOR_AMD &&
1176            c->x86_vendor != X86_VENDOR_HYGON)
1177                return -ENODEV;
1178
1179        if (boot_cpu_has(X86_FEATURE_SMCA)) {
1180                xec_mask = 0x3f;
1181                goto out;
1182        }
1183
1184        switch (c->x86) {
1185        case 0xf:
1186                fam_ops.mc0_mce = k8_mc0_mce;
1187                fam_ops.mc1_mce = k8_mc1_mce;
1188                fam_ops.mc2_mce = k8_mc2_mce;
1189                break;
1190
1191        case 0x10:
1192                fam_ops.mc0_mce = f10h_mc0_mce;
1193                fam_ops.mc1_mce = k8_mc1_mce;
1194                fam_ops.mc2_mce = k8_mc2_mce;
1195                break;
1196
1197        case 0x11:
1198                fam_ops.mc0_mce = k8_mc0_mce;
1199                fam_ops.mc1_mce = k8_mc1_mce;
1200                fam_ops.mc2_mce = k8_mc2_mce;
1201                break;
1202
1203        case 0x12:
1204                fam_ops.mc0_mce = f12h_mc0_mce;
1205                fam_ops.mc1_mce = k8_mc1_mce;
1206                fam_ops.mc2_mce = k8_mc2_mce;
1207                break;
1208
1209        case 0x14:
1210                fam_ops.mc0_mce = cat_mc0_mce;
1211                fam_ops.mc1_mce = cat_mc1_mce;
1212                fam_ops.mc2_mce = k8_mc2_mce;
1213                break;
1214
1215        case 0x15:
1216                xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1217
1218                fam_ops.mc0_mce = f15h_mc0_mce;
1219                fam_ops.mc1_mce = f15h_mc1_mce;
1220                fam_ops.mc2_mce = f15h_mc2_mce;
1221                break;
1222
1223        case 0x16:
1224                xec_mask = 0x1f;
1225                fam_ops.mc0_mce = cat_mc0_mce;
1226                fam_ops.mc1_mce = cat_mc1_mce;
1227                fam_ops.mc2_mce = f16h_mc2_mce;
1228                break;
1229
1230        case 0x17:
1231        case 0x18:
1232                pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
1233                return -EINVAL;
1234
1235        default:
1236                printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1237                return -EINVAL;
1238        }
1239
1240out:
1241        pr_info("MCE: In-kernel MCE decoding enabled.\n");
1242
1243        mce_register_decode_chain(&amd_mce_dec_nb);
1244
1245        return 0;
1246}
1247early_initcall(mce_amd_init);
1248
#ifdef MODULE
/* Module unload: take the decoder notifier off the MCE decode chain. */
static void __exit mce_amd_exit(void)
{
        mce_unregister_decode_chain(&amd_mce_dec_nb);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS("edac-mce-amd");
MODULE_DESCRIPTION("AMD MCE decoder");
#endif
1260