From be0aec23bf4624fd55650629fe8df20483487049 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:18 +0100 Subject: [PATCH] x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors For Scalable MCA enabled processors, errors are listed per IP block. And since it is not required for an IP to map to a particular bank, we need to use HWID and McaType values from the MCx_IPID register to figure out which IP a given bank represents. We also have a new bit (TCC) in the MCx_STATUS register to indicate Task context is corrupt. Add logic here to decode errors from all known IP blocks for Fam17h Model 00-0fh and to print TCC errors. [ Minor fixups. ] Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 59 +++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 29 +++ drivers/edac/mce_amd.c | 335 ++++++++++++++++++++++++++- 3 files changed, 420 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 80ba0a8d6d06..9c467fe00551 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,18 @@ /* AMD-specific bits */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ + +/* + * McaX field if set indicates a given bank supports MCA extensions: + * - Deferred error interrupt type is specifiable by bank. + * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers, + * But should not be used to determine MSR numbers. + * - TCC bit is present in MCx_STATUS. + */ +#define MCI_CONFIG_MCAX 0x1 +#define MCI_IPID_MCATYPE 0xFFFF0000 +#define MCI_IPID_HWID 0xFFF /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is @@ -93,7 +105,9 @@ /* AMD Scalable MCA */ #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) /* * This structure contains all data related to the MCE log. Also @@ -291,4 +305,49 @@ struct cper_sec_mem_err; extern void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err); +/* + * Enumerate new IP types and HWID values in AMD processors which support + * Scalable MCA. + */ +#ifdef CONFIG_X86_MCE_AMD +enum amd_ip_types { + SMCA_F17H_CORE = 0, /* Core errors */ + SMCA_DF, /* Data Fabric */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_SMU, /* System Management Unit */ + N_AMD_IP_TYPES +}; + +struct amd_hwid { + const char *name; + unsigned int hwid; +}; + +extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; + +enum amd_core_mca_blocks { + SMCA_LS = 0, /* Load Store */ + SMCA_IF, /* Instruction Fetch */ + SMCA_L2_CACHE, /* L2 cache */ + SMCA_DE, /* Decoder unit */ + RES, /* Reserved */ + SMCA_EX, /* Execution unit */ + SMCA_FP, /* Floating Point */ + SMCA_L3_CACHE, /* L3 cache */ + N_CORE_MCA_BLOCKS +}; + +extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; + +enum amd_df_mca_blocks { + SMCA_CS = 0, /* Coherent Slave */ + SMCA_PIE, /* Power management, Interrupts, etc */ + N_DF_BLOCKS +}; + +extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; +#endif + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 88de27bd5797..ee487a93ebe7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -71,6 +71,35 @@ static const char * const th_names[] = { "execution_unit", }; +/* Define HWID to IP type mappings for Scalable MCA */ +struct amd_hwid amd_hwids[] = { + [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, + [SMCA_DF] = { "data_fabric", 0x2E }, + [SMCA_UMC] = { "umc", 0x96 }, + [SMCA_PB] = { "param_block", 0x5 }, + [SMCA_PSP] = { "psp", 0xFF }, + [SMCA_SMU] = { "smu", 0x1 }, +}; +EXPORT_SYMBOL_GPL(amd_hwids); + +const char * const amd_core_mcablock_names[] = { + [SMCA_LS] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [RES] = "", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", +}; +EXPORT_SYMBOL_GPL(amd_core_mcablock_names); + +const char * const amd_df_mcablock_names[] = { + [SMCA_CS] = "coherent_slave", + [SMCA_PIE] = "pie", +}; +EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e3a945ce374b..49768c08ac07 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = { "Status Register File", }; +/* Scalable MCA error strings */ +static const char * const f17h_ls_mce_desc[] = { + "Load queue parity", + "Store queue parity", + "Miss address buffer payload parity", + "L1 TLB parity", + "", /* reserved */ + "DC tag error type 6", + "DC tag error type 1", + "Internal error type 1", + "Internal error type 2", + "Sys Read data error thread 0", + "Sys read data error thread 1", + "DC tag error type 2", + "DC data error type 1 (poison comsumption)", + "DC data error type 2", + "DC data error type 3", + "DC tag error type 4", + "L2 TLB parity", + "PDC parity error", + "DC tag error type 3", + "DC tag error type 5", + "L2 fill data error", +}; + +static const char * const f17h_if_mce_desc[] = { + "microtag probe port parity error", + "IC microtag or full tag multi-hit error", + "IC full tag parity", + "IC data array parity", + "Decoupling queue phys addr parity error", + "L0 ITLB parity error", + "L1 ITLB parity error", + "L2 ITLB parity error", + "BPQ snoop parity on Thread 0", + "BPQ snoop parity on Thread 1", + "L1 BTB multi-match error", + "L2 BTB multi-match error", +}; + +static const char * const f17h_l2_mce_desc[] = { + "L2M tag multi-way-hit error", + "L2M tag ECC error", + "L2M data ECC error", + "HW assert", +}; + +static const char * const f17h_de_mce_desc[] = { + "uop cache tag parity error", + "uop cache data parity error", + "Insn buffer parity error", + "Insn dispatch queue parity error", + "Fetch address FIFO parity", + "Patch RAM data parity", + "Patch RAM sequencer parity", + "uop buffer parity" +}; + +static const char * const f17h_ex_mce_desc[] = { + "Watchdog timeout error", + "Phy register file parity", + "Flag register file parity", + "Immediate displacement register file parity", + "Address generator payload parity", + "EX payload parity", + "Checkpoint queue parity", + "Retire dispatch queue parity", +}; + +static const char * const f17h_fp_mce_desc[] = { + "Physical register file parity", + "Freelist parity error", + "Schedule queue parity", + "NSQ parity error", + "Retire queue parity", + "Status register file parity", +}; + +static const char * const f17h_l3_mce_desc[] = { + "Shadow tag macro ECC error", + "Shadow tag macro multi-way-hit error", + "L3M tag ECC error", + "L3M tag multi-way-hit error", + "L3M data ECC error", + "XI parity, L3 fill done channel error", + "L3 victim queue parity", + "L3 HW assert", +}; + +static const char * const f17h_cs_mce_desc[] = { + "Illegal request from transport layer", + "Address violation", + "Security violation", + "Illegal response from transport layer", + "Unexpected response", + "Parity error on incoming request or probe response data", + "Parity error on incoming read response data", + "Atomic request parity", + "ECC error on probe filter access", +}; + +static const char * const f17h_pie_mce_desc[] = { + "HW assert", + "Internal PIE register security violation", + "Error on GMI link", + "Poison data written to internal PIE register", +}; + +static const char * const f17h_umc_mce_desc[] = { + "DRAM ECC error", + "Data poison error on DRAM", + "SDP parity error", + "Advanced peripheral bus error", + "Command/address parity error", + "Write data CRC error", +}; + +static const char * const f17h_pb_mce_desc[] = { + "Parameter Block RAM ECC error", +}; + +static const char * const f17h_psp_mce_desc[] = { + "PSP RAM ECC or parity error", +}; + +static const char * const f17h_smu_mce_desc[] = { + "SMU RAM ECC or parity error", +}; + static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } +static void decode_f17h_core_errors(const char *ip_name, u8 xec, + unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "%s Error: ", ip_name); + + switch (mca_type) { + case SMCA_LS: + error_desc_array = f17h_ls_mce_desc; + len = ARRAY_SIZE(f17h_ls_mce_desc) - 1; + + if (xec == 0x4) { + pr_cont("Unrecognized LS MCA error code.\n"); + return; + } + break; + + case SMCA_IF: + error_desc_array = f17h_if_mce_desc; + len = ARRAY_SIZE(f17h_if_mce_desc) - 1; + break; + + case SMCA_L2_CACHE: + error_desc_array = f17h_l2_mce_desc; + len = ARRAY_SIZE(f17h_l2_mce_desc) - 1; + break; + + case SMCA_DE: + error_desc_array = f17h_de_mce_desc; + len = ARRAY_SIZE(f17h_de_mce_desc) - 1; + break; + + case SMCA_EX: + error_desc_array = f17h_ex_mce_desc; + len = ARRAY_SIZE(f17h_ex_mce_desc) - 1; + break; + + case SMCA_FP: + error_desc_array = f17h_fp_mce_desc; + len = ARRAY_SIZE(f17h_fp_mce_desc) - 1; + break; + + case SMCA_L3_CACHE: + error_desc_array = f17h_l3_mce_desc; + len = ARRAY_SIZE(f17h_l3_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA core error info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_core_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +static void decode_df_errors(u8 xec, unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "Data Fabric Error: "); + + switch (mca_type) { + case SMCA_CS: + error_desc_array = f17h_cs_mce_desc; + len = ARRAY_SIZE(f17h_cs_mce_desc) - 1; + break; + + case SMCA_PIE: + error_desc_array = f17h_pie_mce_desc; + len = ARRAY_SIZE(f17h_pie_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA Data Fabric info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_df_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +/* Decode errors according to Scalable MCA specification */ +static void decode_smca_errors(struct mce *m) +{ + u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank); + unsigned int hwid, mca_type, i; + u8 xec = XEC(m->status, xec_mask); + const char * const *error_desc_array; + const char *ip_name; + u32 low, high; + size_t len; + + if (rdmsr_safe(addr, &low, &high)) { + pr_emerg("Invalid IP block specified, error information is unreliable.\n"); + return; + } + + hwid = high & MCI_IPID_HWID; + mca_type = (high & MCI_IPID_MCATYPE) >> 16; + + pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low); + + /* + * Based on hwid and mca_type values, decode errors from respective IPs. + * Note: mca_type values make sense only in the context of an hwid. + */ + for (i = 0; i < ARRAY_SIZE(amd_hwids); i++) + if (amd_hwids[i].hwid == hwid) + break; + + switch (i) { + case SMCA_F17H_CORE: + ip_name = (mca_type == SMCA_L3_CACHE) ? + "L3 Cache" : "F17h Core"; + return decode_f17h_core_errors(ip_name, xec, mca_type); + break; + + case SMCA_DF: + return decode_df_errors(xec, mca_type); + break; + + case SMCA_UMC: + error_desc_array = f17h_umc_mce_desc; + len = ARRAY_SIZE(f17h_umc_mce_desc) - 1; + break; + + case SMCA_PB: + error_desc_array = f17h_pb_mce_desc; + len = ARRAY_SIZE(f17h_pb_mce_desc) - 1; + break; + + case SMCA_PSP: + error_desc_array = f17h_psp_mce_desc; + len = ARRAY_SIZE(f17h_psp_mce_desc) - 1; + break; + + case SMCA_SMU: + error_desc_array = f17h_smu_mce_desc; + len = ARRAY_SIZE(f17h_smu_mce_desc) - 1; + break; + + default: + pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid); + return; + } + + ip_name = amd_hwids[i].name; + pr_emerg(HW_ERR "%s Error: ", ip_name); + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", ip_name); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + static inline void amd_decode_err_code(u16 ec) { if (INT_ERROR(ec)) { @@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; + u32 ebx = cpuid_ebx(0x80000007); if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 == 0x15 || c->x86 == 0x16) + if (c->x86 >= 0x15) pr_cont("|%s|%s", ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); + if (!!(ebx & BIT(3))) { + u32 low, high; + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + + if (!rdmsr_safe(addr, &low, &high) && + (low & MCI_CONFIG_MCAX)) + pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + } + /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; if (ecc) @@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); + if (!!(ebx & BIT(3))) { + decode_smca_errors(m); + goto err_code; + } + if (!fam_ops) goto err_code; @@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + u32 ebx; if (c->x86_vendor != X86_VENDOR_AMD) return -ENODEV; @@ -888,10 +1204,18 @@ static int __init mce_amd_init(void) fam_ops->mc2_mce = f16h_mc2_mce; break; + case 0x17: + ebx = cpuid_ebx(0x80000007); + xec_mask = 0x3f; + if (!(ebx & BIT(3))) { + printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n"); + goto err_out; + } + break; + default: printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); - kfree(fam_ops); - fam_ops = NULL; + goto err_out; } pr_info("MCE: In-kernel MCE decoding enabled.\n"); @@ -899,6 +1223,11 @@ static int __init mce_amd_init(void) mce_register_decode_chain(&amd_mce_dec_nb); return 0; + +err_out: + kfree(fam_ops); + fam_ops = NULL; + return -EINVAL; } early_initcall(mce_amd_init);