From 85e6084e0b436cabe9c909e679937998ffbf9c9d Mon Sep 17 00:00:00 2001 From: Luca Stefani Date: Wed, 5 Aug 2020 11:57:08 +0200 Subject: [PATCH 01/20] RAS/CEC: Fix cec_init() prototype late_initcall() expects a function that returns an integer. Update the function signature to match. [ bp: Massage commit message into proper sentences. ] Fixes: 9554bfe403bd ("x86/mce: Convert the CEC to use the MCE notifier") Signed-off-by: Luca Stefani Signed-off-by: Borislav Petkov Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lkml.kernel.org/r/20200805095708.83939-1-luca.stefani.ge1@gmail.com --- drivers/ras/cec.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index 569d9ad2c594..6939aa5b3dc7 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -553,20 +553,20 @@ static struct notifier_block cec_nb = { .priority = MCE_PRIO_CEC, }; -static void __init cec_init(void) +static int __init cec_init(void) { if (ce_arr.disabled) - return; + return -ENODEV; ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL); if (!ce_arr.array) { pr_err("Error allocating CE array page!\n"); - return; + return -ENOMEM; } if (create_debugfs_nodes()) { free_page((unsigned long)ce_arr.array); - return; + return -ENOMEM; } INIT_DELAYED_WORK(&cec_work, cec_work_fn); @@ -575,6 +575,7 @@ static void __init cec_init(void) mce_register_decode_chain(&cec_nb); pr_info("Correctable Errors collector initialized.\n"); + return 0; } late_initcall(cec_init); From 368d1887200d68075c064a62a9aa191168cf1eed Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 20 Jul 2020 14:53:53 +0000 Subject: [PATCH 02/20] x86/MCE/AMD, EDAC/mce_amd: Remove struct smca_hwid.xec_bitmap The Extended Error Code Bitmap (xec_bitmap) for a Scalable MCA bank type was intended to be used by the kernel to filter out invalid error codes on a system. However, this is unnecessary after a few product releases because the hardware will only report valid error codes. Thus, there's no need for it with future systems. Remove the xec_bitmap field and all references to it. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200720145353.43924-1-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 1 - arch/x86/kernel/cpu/mce/amd.c | 44 +++++++++++++++++------------------ drivers/edac/mce_amd.c | 4 +--- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index cf503824529c..6adced6e7dd3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -328,7 +328,6 @@ enum smca_bank_types { struct smca_hwid { unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ u32 hwid_mcatype; /* (hwid,mcatype) tuple */ - u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */ u8 count; /* Number of instances. */ }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 99be063fcb1b..0c6b02dd744c 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -132,49 +132,49 @@ static enum smca_bank_types smca_get_bank_type(unsigned int bank) } static struct smca_hwid smca_hwid_mcatypes[] = { - /* { bank_type, hwid_mcatype, xec_bitmap } */ + /* { bank_type, hwid_mcatype } */ /* Reserved type */ - { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) }, /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF }, - { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10), 0xFFFFFF }, - { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, - { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, - { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) }, + { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) }, /* HWID 0xB0 MCATYPE 0x4 is Reserved */ - { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0xFFF }, - { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, - { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) }, /* Data Fabric MCA types */ - { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, - { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F }, - { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF }, + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, /* Unified Memory Controller MCA type */ - { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF }, + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, /* Parameter Block MCA type */ - { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, /* Platform Security Processor MCA type */ - { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, - { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF }, + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) }, /* System Management Unit MCA type */ - { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, - { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF }, + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) }, /* Microprocessor 5 Unit MCA type */ - { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF }, + { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, /* Northbridge IO Unit MCA type */ - { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F }, + { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, /* PCI Express Unit MCA type */ - { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F }, + { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, }; struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 325aedf46ff2..d4168c46739c 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -990,10 +990,8 @@ static void decode_smca_error(struct mce *m) pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec); /* Only print the decode of valid error codes */ - if (xec < smca_mce_descs[bank_type].num_descs && - (hwid->xec_bitmap & BIT_ULL(xec))) { + if (xec < smca_mce_descs[bank_type].num_descs) pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]); - } if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) decode_dram_ecc(cpu_to_node(m->extcpu), m); From 1e36d9c6886849c6f3d3c836370563e6bc1a6ddd Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 24 Aug 2020 15:12:37 -0700 Subject: [PATCH 03/20] x86/mce: Delay clearing IA32_MCG_STATUS to the end of do_machine_check() A long time ago, Linux cleared IA32_MCG_STATUS at the very end of machine check processing. Then, some fancy recovery and IST manipulation was added in: d4812e169de4 ("x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks") and clearing IA32_MCG_STATUS was pulled earlier in the function. Next change moved the actual recovery out of do_machine_check() and just used task_work_add() to schedule it later (before returning to the user): 5567d11c21a1 ("x86/mce: Send #MC singal from task work") Most recently the fancy IST footwork was removed as no longer needed: b052df3da821 ("x86/entry: Get rid of ist_begin/end_non_atomic()") At this point there is no reason remaining to clear IA32_MCG_STATUS early. It can move back to the very end of the function. Also move sync_core(). The comments for this function say that it should only be called when instructions have been changed/re-mapped. Recovery for an instruction fetch may change the physical address. But that doesn't happen until the scheduled work runs (which could be on another CPU). [ bp: Massage commit message. ] Reported-by: Gabriele Paoloni Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200824221237.5397-1-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f43a78bde670..0ba24dfffdb2 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1190,6 +1190,7 @@ static void kill_me_maybe(struct callback_head *cb) if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + sync_core(); return; } @@ -1330,12 +1331,8 @@ noinstr void do_machine_check(struct pt_regs *regs) if (worst > 0) irq_work_queue(&mce_irq_work); - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - - sync_core(); - if (worst != MCE_AR_SEVERITY && !kill_it) - return; + goto out; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { @@ -1364,6 +1361,8 @@ noinstr void do_machine_check(struct pt_regs *regs) mce_panic("Failed kernel mode recovery", &m, msg); } } +out: + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); From a0bc32b3cacf194dc479b342f006203fd1e1941a Mon Sep 17 00:00:00 2001 From: Akshay Gupta Date: Fri, 28 Aug 2020 19:24:12 +0000 Subject: [PATCH 04/20] x86/mce: Increase maximum number of banks to 64 ...because future AMD systems will support up to 64 MCA banks per CPU. MAX_NR_BANKS is used to allocate a number of data structures, and it is used as a ceiling for values read from MCG_CAP[Count]. Therefore, this change will have no functional effect on existing systems with 32 or fewer MCA banks per CPU. However, this will increase the size of the following structures: Global bitmaps: - core.c / mce_banks_ce_disabled - core.c / all_banks - core.c / valid_banks - core.c / toclear - Total: 32 new bits * 4 bitmaps = 16 new bytes Per-CPU bitmaps: - core.c / mce_poll_banks - intel.c / mce_banks_owned - Total: 32 new bits * 2 bitmaps = 8 new bytes The bitmaps are arrays of longs. So this change will only affect 32-bit execution, since there will be one additional long used. There will be no additional memory use on 64-bit execution, because the size of long is 64 bits. Global structs: - amd.c / struct smca_bank smca_banks[]: 16 bytes per bank - core.c / struct mce_bank_dev mce_bank_devs[]: 56 bytes per bank - Total: 32 new banks * (16 + 56) bytes = 2304 new bytes Per-CPU structs: - core.c / struct mce_bank mce_banks_array[]: 16 bytes per bank - Total: 32 new banks * 16 bytes = 512 new bytes 32-bit Total global size increase: 2320 bytes Total per-CPU size increase: 520 bytes 64-bit Total global size increase: 2304 bytes Total per-CPU size increase: 512 bytes This additional memory should still fit within the existing .data section of the kernel binary. However, in the case where it doesn't fit, an additional page (4kB) of memory will be added to the binary to accommodate the extra data which will be the maximum size increase of vmlinux. Signed-off-by: Akshay Gupta [ Adjust commit message and code comment. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200828192412.320052-1-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6adced6e7dd3..109af5c7f515 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -200,12 +200,8 @@ void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct device *, mce_device); -/* - * Maximum banks number. - * This is the limit of the current register layout on - * Intel CPUs. - */ -#define MAX_NR_BANKS 32 +/* Maximum number of MCA banks per CPU. */ +#define MAX_NR_BANKS 64 #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); From e2def7d49d0812ea40a224161b2001b2e815dce2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 6 Sep 2020 23:02:52 +0200 Subject: [PATCH 05/20] x86/mce: Make mce_rdmsrl() panic on an inaccessible MSR If an exception needs to be handled while reading an MSR - which is in most of the cases caused by a #GP on a non-existent MSR - then this is most likely the incarnation of a BIOS or a hardware bug. Such bug violates the architectural guarantee that MCA banks are present with all MSRs belonging to them. The proper fix belongs in the hardware/firmware - not in the kernel. Handling an #MC exception which is raised while an NMI is being handled would cause the nasty NMI nesting issue because of the shortcoming of IRET of reenabling NMIs when executed. And the machine is in an #MC context already so be at its side. Tracing MSR accesses while in #MC is another no-no due to tracing being inherently a bad idea in atomic context: vmlinux.o: warning: objtool: do_machine_check()+0x4a: call to mce_rdmsrl() leaves .noinstr.text section so remove all that "additional" functionality from mce_rdmsrl() and provide it with a special exception handler which panics the machine when that MSR is not accessible. The exception handler prints a human-readable message explaining what the panic reason is but, what is more, it panics while in the #GP handler and latter won't have executed an IRET, thus opening the NMI nesting issue in the case when the #MC has happened while handling an NMI. (#MC itself won't be reenabled until MCG_STATUS hasn't been cleared). Suggested-by: Andy Lutomirski Suggested-by: Peter Zijlstra [ Add missing prototypes for ex_handler_* ] Reported-by: kernel test robot Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200906212130.GA28456@zn.tnic --- arch/x86/kernel/cpu/mce/core.c | 72 +++++++++++++++++++++++++----- arch/x86/kernel/cpu/mce/internal.h | 10 +++++ 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 0ba24dfffdb2..a697baee9aa0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -373,10 +373,28 @@ static int msr_to_offset(u32 msr) return -1; } +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; +} + /* MSR access wrappers used for error injection */ static u64 mce_rdmsrl(u32 msr) { - u64 v; + DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); @@ -386,21 +404,43 @@ static u64 mce_rdmsrl(u32 msr) return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); } - if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); - /* - * Return zero in case the access faulted. This should - * not happen normally but can happen if the CPU does - * something weird, or if the code is buggy. - */ - v = 0; - } + /* + * RDMSR on MCA MSRs should not fault. If they do, this is very much an + * architectural violation and needs to be reported to hw vendor. Panic + * the box to not allow any further progress. + */ + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + : EAX_EDX_RET(val, low, high) : "c" (msr)); - return v; + + return EAX_EDX_VAL(val, low, high); +} + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; } static void mce_wrmsrl(u32 msr, u64 v) { + u32 low, high; + if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); @@ -408,7 +448,15 @@ static void mce_wrmsrl(u32 msr, u64 v) *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; return; } - wrmsrl(msr, v); + + low = (u32)v; + high = (u32)(v >> 32); + + /* See comment in mce_rdmsrl() */ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + : : "c" (msr), "a"(low), "d" (high) : "memory"); } /* diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 6473070b5da4..b122610e9046 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -185,4 +185,14 @@ extern bool amd_filter_mce(struct mce *m); static inline bool amd_filter_mce(struct mce *m) { return false; }; #endif +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + #endif /* __X86_MCE_INTERNAL_H__ */ From 13c877f4b48b943105ad9e13ba2c7a093fb694e8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 8 Sep 2020 10:55:12 -0700 Subject: [PATCH 06/20] x86/mce: Stop mce_reign() from re-computing severity for every CPU Back in commit: 20d51a426fe9 ("x86/mce: Reuse one of the u16 padding fields in 'struct mce'") a field was added to "struct mce" to save the computed error severity. Make use of this in mce_reign() to avoid re-computing the severity for every CPU. In the case where the machine panics, one call to mce_severity() is still needed in order to provide the correct message giving the reason for the panic. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200908175519.14223-2-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index a697baee9aa0..5b1d5f33d60b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -920,7 +920,6 @@ static void mce_reign(void) struct mce *m = NULL; int global_worst = 0; char *msg = NULL; - char *nmsg = NULL; /* * This CPU is the Monarch and the other CPUs have run @@ -928,12 +927,10 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), - mca_cfg.tolerant, - &nmsg, true); - if (severity > global_worst) { - msg = nmsg; - global_worst = severity; + struct mce *mtmp = &per_cpu(mces_seen, cpu); + + if (mtmp->severity > global_worst) { + global_worst = mtmp->severity; m = &per_cpu(mces_seen, cpu); } } @@ -943,8 +940,11 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + /* call mce_severity() to get "msg" for panic */ + mce_severity(m, mca_cfg.tolerant, &msg, true); mce_panic("Fatal machine check", m, msg); + } /* * For UC somewhere we let the CPU who detects it handle it. From dc0592b73715c8e84ad8ebbc50c6057d5e203aac Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 3 Sep 2020 18:45:31 -0500 Subject: [PATCH 07/20] x86/mce/dev-mcelog: Do not update kflags on AMD systems The mcelog utility is not commonly used on AMD systems. Therefore, errors logged only by the dev_mce_log() notifier will be missed. This may occur if the EDAC modules are not loaded, in which case it's preferable to print the error record by the default notifier. However, the mce->kflags set by dev_mce_log() notifier makes the default notifier skip over the errors assuming they are processed by dev_mce_log(). Do not update kflags in the dev_mce_log() notifier on AMD systems. Signed-off-by: Smita Koralahalli Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200903234531.162484-3-Smita.KoralahalliChannabasappa@amd.com --- arch/x86/kernel/cpu/mce/dev-mcelog.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 03e51053592a..100fbeebdc72 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -67,7 +67,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, unlock: mutex_unlock(&mce_chrdev_read_mutex); - mce->kflags |= MCE_HANDLED_MCELOG; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + mce->kflags |= MCE_HANDLED_MCELOG; + return NOTIFY_OK; } From e100777016fdf6ec3a9d7c1773b15a2b5eca6c55 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 14 Sep 2020 19:21:28 +0200 Subject: [PATCH 08/20] x86/mce: Annotate mce_rd/wrmsrl() with noinstr They do get called from the #MC handler which is already marked "noinstr". Commit e2def7d49d08 ("x86/mce: Make mce_rdmsrl() panic on an inaccessible MSR") already got rid of the instrumentation in the MSR accessors, fix the annotation now too, in order to get rid of: vmlinux.o: warning: objtool: do_machine_check()+0x4a: call to mce_rdmsrl() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200915194020.28807-1-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 5b1d5f33d60b..11b6697be010 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -392,16 +392,25 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, } /* MSR access wrappers used for error injection */ -static u64 mce_rdmsrl(u32 msr) +static noinstr u64 mce_rdmsrl(u32 msr) { DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + u64 ret; + instrumentation_begin(); + + offset = msr_to_offset(msr); if (offset < 0) - return 0; - return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); + ret = 0; + else + ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); + + instrumentation_end(); + + return ret; } /* @@ -437,15 +446,21 @@ __visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, return true; } -static void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrl(u32 msr, u64 v) { u32 low, high; if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + instrumentation_begin(); + + offset = msr_to_offset(msr); if (offset >= 0) *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + + instrumentation_end(); + return; } From 4bd442e9a8388e8ec4ba7cf23a4774989d93b78e Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Sat, 19 Sep 2020 09:22:52 +0800 Subject: [PATCH 09/20] RAS/CEC: Convert to DEFINE_SHOW_ATTRIBUTE() Use the DEFINE_SHOW_ATTRIBUTE() macro and simplify the code. Signed-off-by: Qinglang Miao Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200919012252.171437-1-miaoqinglang@huawei.com --- drivers/ras/cec.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index 6939aa5b3dc7..ddecf25b5dd4 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -435,7 +435,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "% static const char * const bins[] = { "00", "01", "10", "11" }; -static int array_dump(struct seq_file *m, void *v) +static int array_show(struct seq_file *m, void *v) { struct ce_array *ca = &ce_arr; int i; @@ -467,18 +467,7 @@ static int array_dump(struct seq_file *m, void *v) return 0; } -static int array_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, array_dump, NULL); -} - -static const struct file_operations array_ops = { - .owner = THIS_MODULE, - .open = array_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(array); static int __init create_debugfs_nodes(void) { @@ -513,7 +502,7 @@ static int __init create_debugfs_nodes(void) goto err; } - array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops); + array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_fops); if (!array) { pr_warn("Error creating array debugfs node!\n"); goto err; From fd258dc4442c5c1c069c6b5b42bfe7d10cddda95 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Sep 2020 19:13:12 -0700 Subject: [PATCH 10/20] x86/mce: Add Skylake quirk for patrol scrub reported errors The patrol scrubber in Skylake and Cascade Lake systems can be configured to report uncorrected errors using a special signature in the machine check bank and to signal using CMCI instead of machine check. Update the severity calculation mechanism to allow specifying the model, minimum stepping and range of machine check bank numbers. Add a new rule to detect the special signature (on model 0x55, stepping >=4 in any of the memory controller banks). [ bp: Rewrite it. aegl: Productize it. ] Suggested-by: Youquan Song Signed-off-by: Borislav Petkov Co-developed-by: Tony Luck Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200930021313.31810-2-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/severity.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index e1da619add19..567ce09a0286 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -9,9 +9,11 @@ #include #include #include -#include #include +#include +#include + #include "internal.h" /* @@ -40,9 +42,14 @@ static struct severity { unsigned char context; unsigned char excp; unsigned char covered; + unsigned char cpu_model; + unsigned char cpu_minstepping; + unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h +#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -97,7 +104,6 @@ static struct severity { KEEP, "Corrected error", NOSER, BITCLR(MCI_STATUS_UC) ), - /* * known AO MCACODs reported via MCE or CMC: * @@ -113,6 +119,18 @@ static struct severity { AO, "Action optional: last level cache writeback error", SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) ), + /* + * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured + * to report uncorrected errors using CMCI with a special signature. + * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported + * in one of the memory controller banks. + * Set severity to "AO" for same action as normal patrol scrub error. + */ + MCESEV( + AO, "Uncorrected Patrol Scrub Error", + SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), + MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + ), /* ignore OVER for UCNA */ MCESEV( @@ -324,6 +342,12 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e continue; if (s->excp && excp != s->excp) continue; + if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + continue; + if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) + continue; + if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi)) + continue; if (msg) *msg = s->msg; s->covered = 1; From ed9705e4ad1c19ae51ed0cb4c112f9eb6dfc69fc Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 29 Sep 2020 19:13:13 -0700 Subject: [PATCH 11/20] x86/mce: Drop AMD-specific "DEFERRED" case from Intel severity rule list Way back in v3.19 Intel and AMD shared the same machine check severity grading code. So it made sense to add a case for AMD DEFERRED errors in commit e3480271f592 ("x86, mce, severity: Extend the the mce_severity mechanism to handle UCNA/DEFERRED error") But later in v4.2 AMD switched to a separate grading function in commit bf80bbd7dcf5 ("x86/mce: Add an AMD severities-grading function") Belatedly drop the DEFERRED case from the Intel rule list. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200930021313.31810-3-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/severity.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 567ce09a0286..e0722461bb57 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -96,10 +96,6 @@ static struct severity { PANIC, "In kernel and no restart IP", EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) ), - MCESEV( - DEFERRED, "Deferred error", - NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) - ), MCESEV( KEEP, "Corrected error", NOSER, BITCLR(MCI_STATUS_UC) From ec6347bb43395cb92126788a1a5b25302543f815 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 5 Oct 2020 20:40:16 -0700 Subject: [PATCH 12/20] x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}() In reaction to a proposal to introduce a memcpy_mcsafe_fast() implementation Linus points out that memcpy_mcsafe() is poorly named relative to communicating the scope of the interface. Specifically what addresses are valid to pass as source, destination, and what faults / exceptions are handled. Of particular concern is that even though x86 might be able to handle the semantics of copy_mc_to_user() with its common copy_user_generic() implementation other archs likely need / want an explicit path for this case: On Fri, May 1, 2020 at 11:28 AM Linus Torvalds wrote: > > On Thu, Apr 30, 2020 at 6:21 PM Dan Williams wrote: > > > > However now I see that copy_user_generic() works for the wrong reason. > > It works because the exception on the source address due to poison > > looks no different than a write fault on the user address to the > > caller, it's still just a short copy. So it makes copy_to_user() work > > for the wrong reason relative to the name. > > Right. > > And it won't work that way on other architectures. On x86, we have a > generic function that can take faults on either side, and we use it > for both cases (and for the "in_user" case too), but that's an > artifact of the architecture oddity. > > In fact, it's probably wrong even on x86 - because it can hide bugs - > but writing those things is painful enough that everybody prefers > having just one function. Replace a single top-level memcpy_mcsafe() with either copy_mc_to_user(), or copy_mc_to_kernel(). Introduce an x86 copy_mc_fragile() name as the rename for the low-level x86 implementation formerly named memcpy_mcsafe(). It is used as the slow / careful backend that is supplanted by a fast copy_mc_generic() in a follow-on patch. One side-effect of this reorganization is that separating copy_mc_64.S to its own file means that perf no longer needs to track dependencies for its memcpy_64.S benchmarks. [ bp: Massage a bit. ] Signed-off-by: Dan Williams Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Acked-by: Michael Ellerman Cc: Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/string.h | 2 - arch/powerpc/include/asm/uaccess.h | 40 ++++-- arch/powerpc/lib/Makefile | 2 +- .../lib/{memcpy_mcsafe_64.S => copy_mc_64.S} | 4 +- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.debug | 2 +- arch/x86/include/asm/copy_mc_test.h | 75 +++++++++++ arch/x86/include/asm/mce.h | 9 ++ arch/x86/include/asm/mcsafe_test.h | 75 ----------- arch/x86/include/asm/string_64.h | 32 ----- arch/x86/include/asm/uaccess.h | 9 ++ arch/x86/include/asm/uaccess_64.h | 20 --- arch/x86/kernel/cpu/mce/core.c | 8 +- arch/x86/kernel/quirks.c | 10 +- arch/x86/lib/Makefile | 1 + arch/x86/lib/copy_mc.c | 82 +++++++++++ arch/x86/lib/copy_mc_64.S | 127 ++++++++++++++++++ arch/x86/lib/memcpy_64.S | 115 ---------------- arch/x86/lib/usercopy_64.c | 21 --- drivers/md/dm-writecache.c | 15 ++- drivers/nvdimm/claim.c | 2 +- drivers/nvdimm/pmem.c | 6 +- include/linux/string.h | 9 +- include/linux/uaccess.h | 13 ++ include/linux/uio.h | 10 +- lib/Kconfig | 7 +- lib/iov_iter.c | 48 ++++--- tools/arch/x86/include/asm/mcsafe_test.h | 13 -- tools/arch/x86/lib/memcpy_64.S | 115 ---------------- tools/objtool/check.c | 4 +- tools/perf/bench/Build | 1 - tools/perf/bench/mem-memcpy-x86-64-lib.c | 24 ---- tools/testing/nvdimm/test/nfit.c | 49 +++---- .../selftests/powerpc/copyloops/.gitignore | 2 +- .../selftests/powerpc/copyloops/Makefile | 6 +- .../selftests/powerpc/copyloops/copy_mc_64.S | 1 + .../powerpc/copyloops/memcpy_mcsafe_64.S | 1 - 38 files changed, 433 insertions(+), 531 deletions(-) rename arch/powerpc/lib/{memcpy_mcsafe_64.S => copy_mc_64.S} (98%) create mode 100644 arch/x86/include/asm/copy_mc_test.h delete mode 100644 arch/x86/include/asm/mcsafe_test.h create mode 100644 arch/x86/lib/copy_mc.c create mode 100644 arch/x86/lib/copy_mc_64.S delete mode 100644 tools/arch/x86/include/asm/mcsafe_test.h delete mode 100644 tools/perf/bench/mem-memcpy-x86-64-lib.c create mode 120000 tools/testing/selftests/powerpc/copyloops/copy_mc_64.S delete mode 120000 tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1f48bbfb3ce9..76473eda3426 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -136,7 +136,7 @@ config PPC select ARCH_HAS_STRICT_KERNEL_RWX if (PPC32 && !HIBERNATION) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE - select ARCH_HAS_UACCESS_MCSAFE if PPC64 + select ARCH_HAS_COPY_MC if PPC64 select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index 283552cd0e58..2aa0e31e6884 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -53,9 +53,7 @@ void *__memmove(void *to, const void *from, __kernel_size_t n); #ifndef CONFIG_KASAN #define __HAVE_ARCH_MEMSET32 #define __HAVE_ARCH_MEMSET64 -#define __HAVE_ARCH_MEMCPY_MCSAFE -extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz); extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t); extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t); diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 00699903f1ef..20a35373cafc 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -435,6 +435,32 @@ do { \ extern unsigned long __copy_tofrom_user(void __user *to, const void __user *from, unsigned long size); +#ifdef CONFIG_ARCH_HAS_COPY_MC +unsigned long __must_check +copy_mc_generic(void *to, const void *from, unsigned long size); + +static inline unsigned long __must_check +copy_mc_to_kernel(void *to, const void *from, unsigned long size) +{ + return copy_mc_generic(to, from, size); +} +#define copy_mc_to_kernel copy_mc_to_kernel + +static inline unsigned long __must_check +copy_mc_to_user(void __user *to, const void *from, unsigned long n) +{ + if (likely(check_copy_size(from, n, true))) { + if (access_ok(to, n)) { + allow_write_to_user(to, n); + n = copy_mc_generic((void *)to, from, n); + prevent_write_to_user(to, n); + } + } + + return n; +} +#endif + #ifdef __powerpc64__ static inline unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) @@ -523,20 +549,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) return ret; } -static __always_inline unsigned long __must_check -copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n) -{ - if (likely(check_copy_size(from, n, true))) { - if (access_ok(to, n)) { - allow_write_to_user(to, n); - n = memcpy_mcsafe((void *)to, from, n); - prevent_write_to_user(to, n); - } - } - - return n; -} - unsigned long __arch_clear_user(void __user *addr, unsigned long size); static inline unsigned long clear_user(void __user *addr, unsigned long size) diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index d66a645503eb..69a91b571845 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ memcpy_power7.o obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ - memcpy_64.o memcpy_mcsafe_64.o + memcpy_64.o copy_mc_64.o ifndef CONFIG_PPC_QUEUED_SPINLOCKS obj64-$(CONFIG_SMP) += locks.o diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S b/arch/powerpc/lib/copy_mc_64.S similarity index 98% rename from arch/powerpc/lib/memcpy_mcsafe_64.S rename to arch/powerpc/lib/copy_mc_64.S index cb882d9a6d8a..88d46c471493 100644 --- a/arch/powerpc/lib/memcpy_mcsafe_64.S +++ b/arch/powerpc/lib/copy_mc_64.S @@ -50,7 +50,7 @@ err3; stb r0,0(r3) blr -_GLOBAL(memcpy_mcsafe) +_GLOBAL(copy_mc_generic) mr r7,r5 cmpldi r5,16 blt .Lshort_copy @@ -239,4 +239,4 @@ err1; stb r0,0(r3) 15: li r3,0 blr -EXPORT_SYMBOL_GPL(memcpy_mcsafe); +EXPORT_SYMBOL_GPL(copy_mc_generic); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7101ac64bb20..e876b3a087f9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -75,7 +75,7 @@ config X86 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 - select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE + select ARCH_HAS_COPY_MC if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ee1d3c5834c6..27b5e2bc6a01 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -62,7 +62,7 @@ config EARLY_PRINTK_USB_XDBC You should normally say N here, unless you want to debug early crashes or need a very simple printk logging facility. -config MCSAFE_TEST +config COPY_MC_TEST def_bool n config EFI_PGT_DUMP diff --git a/arch/x86/include/asm/copy_mc_test.h b/arch/x86/include/asm/copy_mc_test.h new file mode 100644 index 000000000000..e4991ba96726 --- /dev/null +++ b/arch/x86/include/asm/copy_mc_test.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _COPY_MC_TEST_H_ +#define _COPY_MC_TEST_H_ + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_COPY_MC_TEST +extern unsigned long copy_mc_test_src; +extern unsigned long copy_mc_test_dst; + +static inline void copy_mc_inject_src(void *addr) +{ + if (addr) + copy_mc_test_src = (unsigned long) addr; + else + copy_mc_test_src = ~0UL; +} + +static inline void copy_mc_inject_dst(void *addr) +{ + if (addr) + copy_mc_test_dst = (unsigned long) addr; + else + copy_mc_test_dst = ~0UL; +} +#else /* CONFIG_COPY_MC_TEST */ +static inline void copy_mc_inject_src(void *addr) +{ +} + +static inline void copy_mc_inject_dst(void *addr) +{ +} +#endif /* CONFIG_COPY_MC_TEST */ + +#else /* __ASSEMBLY__ */ +#include + +#ifdef CONFIG_COPY_MC_TEST +.macro COPY_MC_TEST_CTL + .pushsection .data + .align 8 + .globl copy_mc_test_src + copy_mc_test_src: + .quad 0 + EXPORT_SYMBOL_GPL(copy_mc_test_src) + .globl copy_mc_test_dst + copy_mc_test_dst: + .quad 0 + EXPORT_SYMBOL_GPL(copy_mc_test_dst) + .popsection +.endm + +.macro COPY_MC_TEST_SRC reg count target + leaq \count(\reg), %r9 + cmp copy_mc_test_src, %r9 + ja \target +.endm + +.macro COPY_MC_TEST_DST reg count target + leaq \count(\reg), %r9 + cmp copy_mc_test_dst, %r9 + ja \target +.endm +#else +.macro COPY_MC_TEST_CTL +.endm + +.macro COPY_MC_TEST_SRC reg count target +.endm + +.macro COPY_MC_TEST_DST reg count target +.endm +#endif /* CONFIG_COPY_MC_TEST */ +#endif /* __ASSEMBLY__ */ +#endif /* _COPY_MC_TEST_H_ */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 109af5c7f515..ba2062d6df92 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -174,6 +174,15 @@ extern void mce_unregister_decode_chain(struct notifier_block *nb); extern int mce_p5_enabled; +#ifdef CONFIG_ARCH_HAS_COPY_MC +extern void enable_copy_mc_fragile(void); +unsigned long __must_check copy_mc_fragile(void *dst, const void *src, unsigned cnt); +#else +static inline void enable_copy_mc_fragile(void) +{ +} +#endif + #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/mcsafe_test.h b/arch/x86/include/asm/mcsafe_test.h deleted file mode 100644 index eb59804b6201..000000000000 --- a/arch/x86/include/asm/mcsafe_test.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _MCSAFE_TEST_H_ -#define _MCSAFE_TEST_H_ - -#ifndef __ASSEMBLY__ -#ifdef CONFIG_MCSAFE_TEST -extern unsigned long mcsafe_test_src; -extern unsigned long mcsafe_test_dst; - -static inline void mcsafe_inject_src(void *addr) -{ - if (addr) - mcsafe_test_src = (unsigned long) addr; - else - mcsafe_test_src = ~0UL; -} - -static inline void mcsafe_inject_dst(void *addr) -{ - if (addr) - mcsafe_test_dst = (unsigned long) addr; - else - mcsafe_test_dst = ~0UL; -} -#else /* CONFIG_MCSAFE_TEST */ -static inline void mcsafe_inject_src(void *addr) -{ -} - -static inline void mcsafe_inject_dst(void *addr) -{ -} -#endif /* CONFIG_MCSAFE_TEST */ - -#else /* __ASSEMBLY__ */ -#include - -#ifdef CONFIG_MCSAFE_TEST -.macro MCSAFE_TEST_CTL - .pushsection .data - .align 8 - .globl mcsafe_test_src - mcsafe_test_src: - .quad 0 - EXPORT_SYMBOL_GPL(mcsafe_test_src) - .globl mcsafe_test_dst - mcsafe_test_dst: - .quad 0 - EXPORT_SYMBOL_GPL(mcsafe_test_dst) - .popsection -.endm - -.macro MCSAFE_TEST_SRC reg count target - leaq \count(\reg), %r9 - cmp mcsafe_test_src, %r9 - ja \target -.endm - -.macro MCSAFE_TEST_DST reg count target - leaq \count(\reg), %r9 - cmp mcsafe_test_dst, %r9 - ja \target -.endm -#else -.macro MCSAFE_TEST_CTL -.endm - -.macro MCSAFE_TEST_SRC reg count target -.endm - -.macro MCSAFE_TEST_DST reg count target -.endm -#endif /* CONFIG_MCSAFE_TEST */ -#endif /* __ASSEMBLY__ */ -#endif /* _MCSAFE_TEST_H_ */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 75314c3dbe47..6e450827f677 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -82,38 +82,6 @@ int strcmp(const char *cs, const char *ct); #endif -#define __HAVE_ARCH_MEMCPY_MCSAFE 1 -__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src, - size_t cnt); -DECLARE_STATIC_KEY_FALSE(mcsafe_key); - -/** - * memcpy_mcsafe - copy memory with indication if a machine check happened - * - * @dst: destination address - * @src: source address - * @cnt: number of bytes to copy - * - * Low level memory copy function that catches machine checks - * We only call into the "safe" function on systems that can - * actually do machine check recovery. Everyone else can just - * use memcpy(). - * - * Return 0 for success, or number of bytes not copied if there was an - * exception. - */ -static __always_inline __must_check unsigned long -memcpy_mcsafe(void *dst, const void *src, size_t cnt) -{ -#ifdef CONFIG_X86_MCE - if (static_branch_unlikely(&mcsafe_key)) - return __memcpy_mcsafe(dst, src, cnt); - else -#endif - memcpy(dst, src, cnt); - return 0; -} - #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 void __memcpy_flushcache(void *dst, const void *src, size_t cnt); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ecefaffd15d4..eff7fb847149 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -455,6 +455,15 @@ extern __must_check long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); unsigned long __must_check __clear_user(void __user *mem, unsigned long len); +#ifdef CONFIG_ARCH_HAS_COPY_MC +unsigned long __must_check +copy_mc_to_kernel(void *to, const void *from, unsigned len); +#define copy_mc_to_kernel copy_mc_to_kernel + +unsigned long __must_check +copy_mc_to_user(void *to, const void *from, unsigned len); +#endif + /* * movsl can be slow when source and dest are not both 8-byte aligned */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index bc10e3dc64fe..e7265a552f4f 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -46,22 +46,6 @@ copy_user_generic(void *to, const void *from, unsigned len) return ret; } -static __always_inline __must_check unsigned long -copy_to_user_mcsafe(void *to, const void *from, unsigned len) -{ - unsigned long ret; - - __uaccess_begin(); - /* - * Note, __memcpy_mcsafe() is explicitly used since it can - * handle exceptions / faults. memcpy_mcsafe() may fall back to - * memcpy() which lacks this handling. - */ - ret = __memcpy_mcsafe(to, from, len); - __uaccess_end(); - return ret; -} - static __always_inline __must_check unsigned long raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { @@ -102,8 +86,4 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) kasan_check_write(dst, size); return __copy_user_flushcache(dst, src, size); } - -unsigned long -mcsafe_handle_tail(char *to, char *from, unsigned len); - #endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 11b6697be010..b5b70f4b351d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -2124,7 +2123,7 @@ void mce_disable_bank(int bank) and older. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold - * mce=recovery force enable memcpy_mcsafe() + * mce=recovery force enable copy_mc_fragile() */ static int __init mcheck_enable(char *str) { @@ -2732,13 +2731,10 @@ static void __init mcheck_debugfs_init(void) static void __init mcheck_debugfs_init(void) { } #endif -DEFINE_STATIC_KEY_FALSE(mcsafe_key); -EXPORT_SYMBOL_GPL(mcsafe_key); - static int __init mcheck_late_init(void) { if (mca_cfg.recovery) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); mcheck_debugfs_init(); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 1b10717c9321..6d0df6a58873 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -8,6 +8,7 @@ #include #include +#include #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) @@ -624,10 +625,6 @@ static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) -#include -#include - /* Ivy Bridge, Haswell, Broadwell */ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) { @@ -636,7 +633,7 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) pci_read_config_dword(pdev, 0x84, &capid0); if (capid0 & 0x10) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); } /* Skylake */ @@ -653,7 +650,7 @@ static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) * enabled, so memory machine check recovery is also enabled. */ if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0)) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); @@ -661,7 +658,6 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif -#endif bool x86_apple_machine; EXPORT_SYMBOL(x86_apple_machine); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index d46fff11f06f..f7d23c9c22b2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc.o copy_mc_64.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c new file mode 100644 index 000000000000..2633635530b7 --- /dev/null +++ b/arch/x86/lib/copy_mc.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ + +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_X86_MCE +/* + * See COPY_MC_TEST for self-test of the copy_mc_fragile() + * implementation. + */ +static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); + +void enable_copy_mc_fragile(void) +{ + static_branch_inc(©_mc_fragile_key); +} +#define copy_mc_fragile_enabled (static_branch_unlikely(©_mc_fragile_key)) + +/* + * Similar to copy_user_handle_tail, probe for the write fault point, or + * source exception point. + */ +__visible notrace unsigned long +copy_mc_fragile_handle_tail(char *to, char *from, unsigned len) +{ + for (; len; --len, to++, from++) + if (copy_mc_fragile(to, from, 1)) + break; + return len; +} +#else +/* + * No point in doing careful copying, or consulting a static key when + * there is no #MC handler in the CONFIG_X86_MCE=n case. + */ +void enable_copy_mc_fragile(void) +{ +} +#define copy_mc_fragile_enabled (0) +#endif + +/** + * copy_mc_to_kernel - memory copy that handles source exceptions + * + * @dst: destination address + * @src: source address + * @len: number of bytes to copy + * + * Call into the 'fragile' version on systems that have trouble + * actually do machine check recovery. Everyone else can just + * use memcpy(). + * + * Return 0 for success, or number of bytes not copied if there was an + * exception. + */ +unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) +{ + if (copy_mc_fragile_enabled) + return copy_mc_fragile(dst, src, len); + memcpy(dst, src, len); + return 0; +} +EXPORT_SYMBOL_GPL(copy_mc_to_kernel); + +unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned len) +{ + unsigned long ret; + + if (!copy_mc_fragile_enabled) + return copy_user_generic(dst, src, len); + + __uaccess_begin(); + ret = copy_mc_fragile(dst, src, len); + __uaccess_end(); + return ret; +} diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S new file mode 100644 index 000000000000..c3b613c4544a --- /dev/null +++ b/arch/x86/lib/copy_mc_64.S @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ + +#include +#include +#include +#include + +#ifndef CONFIG_UML + +#ifdef CONFIG_X86_MCE +COPY_MC_TEST_CTL + +/* + * copy_mc_fragile - copy memory with indication if an exception / fault happened + * + * The 'fragile' version is opted into by platform quirks and takes + * pains to avoid unrecoverable corner cases like 'fast-string' + * instruction sequences, and consuming poison across a cacheline + * boundary. The non-fragile version is equivalent to memcpy() + * regardless of CPU machine-check-recovery capability. + */ +SYM_FUNC_START(copy_mc_fragile) + cmpl $8, %edx + /* Less than 8 bytes? Go to byte copy loop */ + jb .L_no_whole_words + + /* Check for bad alignment of source */ + testl $7, %esi + /* Already aligned */ + jz .L_8byte_aligned + + /* Copy one byte at a time until source is 8-byte aligned */ + movl %esi, %ecx + andl $7, %ecx + subl $8, %ecx + negl %ecx + subl %ecx, %edx +.L_read_leading_bytes: + movb (%rsi), %al + COPY_MC_TEST_SRC %rsi 1 .E_leading_bytes + COPY_MC_TEST_DST %rdi 1 .E_leading_bytes +.L_write_leading_bytes: + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_read_leading_bytes + +.L_8byte_aligned: + movl %edx, %ecx + andl $7, %edx + shrl $3, %ecx + jz .L_no_whole_words + +.L_read_words: + movq (%rsi), %r8 + COPY_MC_TEST_SRC %rsi 8 .E_read_words + COPY_MC_TEST_DST %rdi 8 .E_write_words +.L_write_words: + movq %r8, (%rdi) + addq $8, %rsi + addq $8, %rdi + decl %ecx + jnz .L_read_words + + /* Any trailing bytes? */ +.L_no_whole_words: + andl %edx, %edx + jz .L_done_memcpy_trap + + /* Copy trailing bytes */ + movl %edx, %ecx +.L_read_trailing_bytes: + movb (%rsi), %al + COPY_MC_TEST_SRC %rsi 1 .E_trailing_bytes + COPY_MC_TEST_DST %rdi 1 .E_trailing_bytes +.L_write_trailing_bytes: + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_read_trailing_bytes + + /* Copy successful. Return zero */ +.L_done_memcpy_trap: + xorl %eax, %eax +.L_done: + ret +SYM_FUNC_END(copy_mc_fragile) +EXPORT_SYMBOL_GPL(copy_mc_fragile) + + .section .fixup, "ax" + /* + * Return number of bytes not copied for any failure. Note that + * there is no "tail" handling since the source buffer is 8-byte + * aligned and poison is cacheline aligned. + */ +.E_read_words: + shll $3, %ecx +.E_leading_bytes: + addl %edx, %ecx +.E_trailing_bytes: + mov %ecx, %eax + jmp .L_done + + /* + * For write fault handling, given the destination is unaligned, + * we handle faults on multi-byte writes with a byte-by-byte + * copy up to the write-protected page. + */ +.E_write_words: + shll $3, %ecx + addl %edx, %ecx + movl %ecx, %edx + jmp copy_mc_fragile_handle_tail + + .previous + + _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) + _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) + _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) + _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) + _ASM_EXTABLE(.L_write_words, .E_write_words) + _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) +#endif /* CONFIG_X86_MCE */ +#endif /* !CONFIG_UML */ diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index bbcc05bcefad..037faac46b0c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,7 +4,6 @@ #include #include #include -#include #include #include @@ -187,117 +186,3 @@ SYM_FUNC_START_LOCAL(memcpy_orig) SYM_FUNC_END(memcpy_orig) .popsection - -#ifndef CONFIG_UML - -MCSAFE_TEST_CTL - -/* - * __memcpy_mcsafe - memory copy with machine check exception handling - * Note that we only catch machine checks when reading the source addresses. - * Writes to target are posted and don't generate machine checks. - */ -SYM_FUNC_START(__memcpy_mcsafe) - cmpl $8, %edx - /* Less than 8 bytes? Go to byte copy loop */ - jb .L_no_whole_words - - /* Check for bad alignment of source */ - testl $7, %esi - /* Already aligned */ - jz .L_8byte_aligned - - /* Copy one byte at a time until source is 8-byte aligned */ - movl %esi, %ecx - andl $7, %ecx - subl $8, %ecx - negl %ecx - subl %ecx, %edx -.L_read_leading_bytes: - movb (%rsi), %al - MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes - MCSAFE_TEST_DST %rdi 1 .E_leading_bytes -.L_write_leading_bytes: - movb %al, (%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_read_leading_bytes - -.L_8byte_aligned: - movl %edx, %ecx - andl $7, %edx - shrl $3, %ecx - jz .L_no_whole_words - -.L_read_words: - movq (%rsi), %r8 - MCSAFE_TEST_SRC %rsi 8 .E_read_words - MCSAFE_TEST_DST %rdi 8 .E_write_words -.L_write_words: - movq %r8, (%rdi) - addq $8, %rsi - addq $8, %rdi - decl %ecx - jnz .L_read_words - - /* Any trailing bytes? */ -.L_no_whole_words: - andl %edx, %edx - jz .L_done_memcpy_trap - - /* Copy trailing bytes */ - movl %edx, %ecx -.L_read_trailing_bytes: - movb (%rsi), %al - MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes - MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes -.L_write_trailing_bytes: - movb %al, (%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_read_trailing_bytes - - /* Copy successful. Return zero */ -.L_done_memcpy_trap: - xorl %eax, %eax -.L_done: - ret -SYM_FUNC_END(__memcpy_mcsafe) -EXPORT_SYMBOL_GPL(__memcpy_mcsafe) - - .section .fixup, "ax" - /* - * Return number of bytes not copied for any failure. Note that - * there is no "tail" handling since the source buffer is 8-byte - * aligned and poison is cacheline aligned. - */ -.E_read_words: - shll $3, %ecx -.E_leading_bytes: - addl %edx, %ecx -.E_trailing_bytes: - mov %ecx, %eax - jmp .L_done - - /* - * For write fault handling, given the destination is unaligned, - * we handle faults on multi-byte writes with a byte-by-byte - * copy up to the write-protected page. - */ -.E_write_words: - shll $3, %ecx - addl %edx, %ecx - movl %ecx, %edx - jmp mcsafe_handle_tail - - .previous - - _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) - _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) - _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE(.L_write_words, .E_write_words) - _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) -#endif diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index b0dfac3d3df7..5f1d4a9ebd5a 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -56,27 +56,6 @@ unsigned long clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(clear_user); -/* - * Similar to copy_user_handle_tail, probe for the write fault point, - * but reuse __memcpy_mcsafe in case a new read error is encountered. - * clac() is handled in _copy_to_iter_mcsafe(). - */ -__visible notrace unsigned long -mcsafe_handle_tail(char *to, char *from, unsigned len) -{ - for (; len; --len, to++, from++) { - /* - * Call the assembly routine back directly since - * memcpy_mcsafe() may silently fallback to memcpy. - */ - unsigned long rem = __memcpy_mcsafe(to, from, 1); - - if (rem) - break; - } - return len; -} - #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /** * clean_cache_range - write back a cache range with CLWB diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 86dbe0c8b45c..9fc18fadacc6 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -49,7 +49,7 @@ do { \ #define pmem_assign(dest, src) ((dest) = (src)) #endif -#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM) +#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM) #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS #endif @@ -984,7 +984,8 @@ static void writecache_resume(struct dm_target *ti) } wc->freelist_size = 0; - r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t)); + r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count, + sizeof(uint64_t)); if (r) { writecache_error(wc, r, "hardware memory error when reading superblock: %d", r); sb_seq_count = cpu_to_le64(0); @@ -1000,7 +1001,8 @@ static void writecache_resume(struct dm_target *ti) e->seq_count = -1; continue; } - r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry)); + r = copy_mc_to_kernel(&wme, memory_entry(wc, e), + sizeof(struct wc_memory_entry)); if (r) { writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", (unsigned long)b, r); @@ -1198,7 +1200,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data if (rw == READ) { int r; - r = memcpy_mcsafe(buf, data, size); + r = copy_mc_to_kernel(buf, data, size); flush_dcache_page(bio_page(bio)); if (unlikely(r)) { writecache_error(wc, r, "hardware memory error when reading data: %d", r); @@ -2341,7 +2343,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) } } - r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock)); + r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock)); if (r) { ti->error = "Hardware memory error when reading superblock"; goto bad; @@ -2352,7 +2354,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Unable to initialize device"; goto bad; } - r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock)); + r = copy_mc_to_kernel(&s, sb(wc), + sizeof(struct wc_memory_superblock)); if (r) { ti->error = "Hardware memory error when reading superblock"; goto bad; diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 45964acba944..22d865ba6353 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -268,7 +268,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, if (rw == READ) { if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) return -EIO; - if (memcpy_mcsafe(buf, nsio->addr + offset, size) != 0) + if (copy_mc_to_kernel(buf, nsio->addr + offset, size) != 0) return -EIO; return 0; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index fab29b514372..5c6939e004e2 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -125,7 +125,7 @@ static blk_status_t read_pmem(struct page *page, unsigned int off, while (len) { mem = kmap_atomic(page); chunk = min_t(unsigned int, len, PAGE_SIZE - off); - rem = memcpy_mcsafe(mem + off, pmem_addr, chunk); + rem = copy_mc_to_kernel(mem + off, pmem_addr, chunk); kunmap_atomic(mem); if (rem) return BLK_STS_IOERR; @@ -304,7 +304,7 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev, /* * Use the 'no check' versions of copy_from_iter_flushcache() and - * copy_to_iter_mcsafe() to bypass HARDENED_USERCOPY overhead. Bounds + * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds * checking, both file offset and device offset, is handled by * dax_iomap_actor() */ @@ -317,7 +317,7 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { - return _copy_to_iter_mcsafe(addr, bytes, i); + return _copy_mc_to_iter(addr, bytes, i); } static const struct dax_operations pmem_dax_ops = { diff --git a/include/linux/string.h b/include/linux/string.h index 9b7a0632e87a..b1f3894a0a3e 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -161,20 +161,13 @@ extern int bcmp(const void *,const void *,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif -#ifndef __HAVE_ARCH_MEMCPY_MCSAFE -static inline __must_check unsigned long memcpy_mcsafe(void *dst, - const void *src, size_t cnt) -{ - memcpy(dst, src, cnt); - return 0; -} -#endif #ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); } #endif + void *memchr_inv(const void *s, int c, size_t n); char *strreplace(char *s, char old, char new); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 94b285411659..1ae36bc8db35 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -179,6 +179,19 @@ copy_in_user(void __user *to, const void __user *from, unsigned long n) } #endif +#ifndef copy_mc_to_kernel +/* + * Without arch opt-in this generic copy_mc_to_kernel() will not handle + * #MC (or arch equivalent) during source read. + */ +static inline unsigned long __must_check +copy_mc_to_kernel(void *dst, const void *src, size_t cnt) +{ + memcpy(dst, src, cnt); + return 0; +} +#endif + static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; diff --git a/include/linux/uio.h b/include/linux/uio.h index 3835a8a8e9ea..f14410c678bd 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -185,10 +185,10 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif -#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE -size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i); +#ifdef CONFIG_ARCH_HAS_COPY_MC +size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #else -#define _copy_to_iter_mcsafe _copy_to_iter +#define _copy_mc_to_iter _copy_to_iter #endif static __always_inline __must_check @@ -201,12 +201,12 @@ size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) } static __always_inline __must_check -size_t copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i) +size_t copy_mc_to_iter(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, true))) return 0; else - return _copy_to_iter_mcsafe(addr, bytes, i); + return _copy_mc_to_iter(addr, bytes, i); } size_t iov_iter_zero(size_t bytes, struct iov_iter *); diff --git a/lib/Kconfig b/lib/Kconfig index b4b98a03ff98..b46a9fd122c8 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -635,7 +635,12 @@ config UACCESS_MEMCPY config ARCH_HAS_UACCESS_FLUSHCACHE bool -config ARCH_HAS_UACCESS_MCSAFE +# arch has a concept of a recoverable synchronous exception due to a +# memory-read error like x86 machine-check or ARM data-abort, and +# implements copy_mc_to_{user,kernel} to abort and report +# 'bytes-transferred' if that exception fires when accessing the source +# buffer. +config ARCH_HAS_COPY_MC bool # Temporary. Goes away when all archs are cleaned up diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 5e40786c8f12..d13304a034f5 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -637,30 +637,30 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(_copy_to_iter); -#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE -static int copyout_mcsafe(void __user *to, const void *from, size_t n) +#ifdef CONFIG_ARCH_HAS_COPY_MC +static int copyout_mc(void __user *to, const void *from, size_t n) { if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); - n = copy_to_user_mcsafe((__force void *) to, from, n); + n = copy_mc_to_user((__force void *) to, from, n); } return n; } -static unsigned long memcpy_mcsafe_to_page(struct page *page, size_t offset, +static unsigned long copy_mc_to_page(struct page *page, size_t offset, const char *from, size_t len) { unsigned long ret; char *to; to = kmap_atomic(page); - ret = memcpy_mcsafe(to + offset, from, len); + ret = copy_mc_to_kernel(to + offset, from, len); kunmap_atomic(to); return ret; } -static size_t copy_pipe_to_iter_mcsafe(const void *addr, size_t bytes, +static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { struct pipe_inode_info *pipe = i->pipe; @@ -678,7 +678,7 @@ static size_t copy_pipe_to_iter_mcsafe(const void *addr, size_t bytes, size_t chunk = min_t(size_t, n, PAGE_SIZE - off); unsigned long rem; - rem = memcpy_mcsafe_to_page(pipe->bufs[i_head & p_mask].page, + rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk); i->head = i_head; i->iov_offset = off + chunk - rem; @@ -695,18 +695,17 @@ static size_t copy_pipe_to_iter_mcsafe(const void *addr, size_t bytes, } /** - * _copy_to_iter_mcsafe - copy to user with source-read error exception handling + * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @iter: destination iterator * - * The pmem driver arranges for filesystem-dax to use this facility via - * dax_copy_to_iter() for protecting read/write to persistent memory. - * Unless / until an architecture can guarantee identical performance - * between _copy_to_iter_mcsafe() and _copy_to_iter() it would be a - * performance regression to switch more users to the mcsafe version. + * The pmem driver deploys this for the dax operation + * (dax_copy_to_iter()) for dax reads (bypass page-cache and the + * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes + * successfully copied. * - * Otherwise, the main differences between this and typical _copy_to_iter(). + * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. Re-triggering machine @@ -717,23 +716,22 @@ static size_t copy_pipe_to_iter_mcsafe(const void *addr, size_t bytes, * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies. * Compare to copy_to_iter() where only ITER_IOVEC attempts might return * a short copy. - * - * See MCSAFE_TEST for self-test. */ -size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i) +size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; unsigned long rem, curr_addr, s_addr = (unsigned long) addr; if (unlikely(iov_iter_is_pipe(i))) - return copy_pipe_to_iter_mcsafe(addr, bytes, i); + return copy_mc_pipe_to_iter(addr, bytes, i); if (iter_is_iovec(i)) might_fault(); iterate_and_advance(i, bytes, v, - copyout_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), + copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len, + v.iov_len), ({ - rem = memcpy_mcsafe_to_page(v.bv_page, v.bv_offset, - (from += v.bv_len) - v.bv_len, v.bv_len); + rem = copy_mc_to_page(v.bv_page, v.bv_offset, + (from += v.bv_len) - v.bv_len, v.bv_len); if (rem) { curr_addr = (unsigned long) from; bytes = curr_addr - s_addr - rem; @@ -741,8 +739,8 @@ size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i) } }), ({ - rem = memcpy_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len, - v.iov_len); + rem = copy_mc_to_kernel(v.iov_base, (from += v.iov_len) + - v.iov_len, v.iov_len); if (rem) { curr_addr = (unsigned long) from; bytes = curr_addr - s_addr - rem; @@ -753,8 +751,8 @@ size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i) return bytes; } -EXPORT_SYMBOL_GPL(_copy_to_iter_mcsafe); -#endif /* CONFIG_ARCH_HAS_UACCESS_MCSAFE */ +EXPORT_SYMBOL_GPL(_copy_mc_to_iter); +#endif /* CONFIG_ARCH_HAS_COPY_MC */ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { diff --git a/tools/arch/x86/include/asm/mcsafe_test.h b/tools/arch/x86/include/asm/mcsafe_test.h deleted file mode 100644 index 2ccd588fbad4..000000000000 --- a/tools/arch/x86/include/asm/mcsafe_test.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _MCSAFE_TEST_H_ -#define _MCSAFE_TEST_H_ - -.macro MCSAFE_TEST_CTL -.endm - -.macro MCSAFE_TEST_SRC reg count target -.endm - -.macro MCSAFE_TEST_DST reg count target -.endm -#endif /* _MCSAFE_TEST_H_ */ diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S index 45f8e1b02241..0b5b8ae56bd9 100644 --- a/tools/arch/x86/lib/memcpy_64.S +++ b/tools/arch/x86/lib/memcpy_64.S @@ -4,7 +4,6 @@ #include #include #include -#include #include #include @@ -187,117 +186,3 @@ SYM_FUNC_START(memcpy_orig) SYM_FUNC_END(memcpy_orig) .popsection - -#ifndef CONFIG_UML - -MCSAFE_TEST_CTL - -/* - * __memcpy_mcsafe - memory copy with machine check exception handling - * Note that we only catch machine checks when reading the source addresses. - * Writes to target are posted and don't generate machine checks. - */ -SYM_FUNC_START(__memcpy_mcsafe) - cmpl $8, %edx - /* Less than 8 bytes? Go to byte copy loop */ - jb .L_no_whole_words - - /* Check for bad alignment of source */ - testl $7, %esi - /* Already aligned */ - jz .L_8byte_aligned - - /* Copy one byte at a time until source is 8-byte aligned */ - movl %esi, %ecx - andl $7, %ecx - subl $8, %ecx - negl %ecx - subl %ecx, %edx -.L_read_leading_bytes: - movb (%rsi), %al - MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes - MCSAFE_TEST_DST %rdi 1 .E_leading_bytes -.L_write_leading_bytes: - movb %al, (%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_read_leading_bytes - -.L_8byte_aligned: - movl %edx, %ecx - andl $7, %edx - shrl $3, %ecx - jz .L_no_whole_words - -.L_read_words: - movq (%rsi), %r8 - MCSAFE_TEST_SRC %rsi 8 .E_read_words - MCSAFE_TEST_DST %rdi 8 .E_write_words -.L_write_words: - movq %r8, (%rdi) - addq $8, %rsi - addq $8, %rdi - decl %ecx - jnz .L_read_words - - /* Any trailing bytes? */ -.L_no_whole_words: - andl %edx, %edx - jz .L_done_memcpy_trap - - /* Copy trailing bytes */ - movl %edx, %ecx -.L_read_trailing_bytes: - movb (%rsi), %al - MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes - MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes -.L_write_trailing_bytes: - movb %al, (%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_read_trailing_bytes - - /* Copy successful. Return zero */ -.L_done_memcpy_trap: - xorl %eax, %eax -.L_done: - ret -SYM_FUNC_END(__memcpy_mcsafe) -EXPORT_SYMBOL_GPL(__memcpy_mcsafe) - - .section .fixup, "ax" - /* - * Return number of bytes not copied for any failure. Note that - * there is no "tail" handling since the source buffer is 8-byte - * aligned and poison is cacheline aligned. - */ -.E_read_words: - shll $3, %ecx -.E_leading_bytes: - addl %edx, %ecx -.E_trailing_bytes: - mov %ecx, %eax - jmp .L_done - - /* - * For write fault handling, given the destination is unaligned, - * we handle faults on multi-byte writes with a byte-by-byte - * copy up to the write-protected page. - */ -.E_write_words: - shll $3, %ecx - addl %edx, %ecx - movl %ecx, %edx - jmp mcsafe_handle_tail - - .previous - - _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) - _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) - _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE(.L_write_words, .E_write_words) - _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) -#endif diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e034a8f24f46..893f021fec63 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -548,8 +548,8 @@ static const char *uaccess_safe_builtin[] = { "__ubsan_handle_shift_out_of_bounds", /* misc */ "csum_partial_copy_generic", - "__memcpy_mcsafe", - "mcsafe_handle_tail", + "copy_mc_fragile", + "copy_mc_fragile_handle_tail", "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ NULL }; diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index dd68a40a790c..878db6a59a41 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -13,7 +13,6 @@ perf-y += synthesize.o perf-y += kallsyms-parse.o perf-y += find-bit-bench.o -perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o diff --git a/tools/perf/bench/mem-memcpy-x86-64-lib.c b/tools/perf/bench/mem-memcpy-x86-64-lib.c deleted file mode 100644 index 4130734dde84..000000000000 --- a/tools/perf/bench/mem-memcpy-x86-64-lib.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * From code in arch/x86/lib/usercopy_64.c, copied to keep tools/ copy - * of the kernel's arch/x86/lib/memcpy_64.s used in 'perf bench mem memcpy' - * happy. - */ -#include - -unsigned long __memcpy_mcsafe(void *dst, const void *src, size_t cnt); -unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len); - -unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len) -{ - for (; len; --len, to++, from++) { - /* - * Call the assembly routine back directly since - * memcpy_mcsafe() may silently fallback to memcpy. - */ - unsigned long rem = __memcpy_mcsafe(to, from, 1); - - if (rem) - break; - } - return len; -} diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index a1a5dc645b40..2ac0fff6dad8 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -23,7 +23,8 @@ #include "nfit_test.h" #include "../watermark.h" -#include +#include +#include /* * Generate an NFIT table to describe the following topology: @@ -3283,7 +3284,7 @@ static struct platform_driver nfit_test_driver = { .id_table = nfit_test_id, }; -static char mcsafe_buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); +static char copy_mc_buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); enum INJECT { INJECT_NONE, @@ -3291,7 +3292,7 @@ enum INJECT { INJECT_DST, }; -static void mcsafe_test_init(char *dst, char *src, size_t size) +static void copy_mc_test_init(char *dst, char *src, size_t size) { size_t i; @@ -3300,7 +3301,7 @@ static void mcsafe_test_init(char *dst, char *src, size_t size) src[i] = (char) i; } -static bool mcsafe_test_validate(unsigned char *dst, unsigned char *src, +static bool copy_mc_test_validate(unsigned char *dst, unsigned char *src, size_t size, unsigned long rem) { size_t i; @@ -3321,12 +3322,12 @@ static bool mcsafe_test_validate(unsigned char *dst, unsigned char *src, return true; } -void mcsafe_test(void) +void copy_mc_test(void) { char *inject_desc[] = { "none", "source", "destination" }; enum INJECT inj; - if (IS_ENABLED(CONFIG_MCSAFE_TEST)) { + if (IS_ENABLED(CONFIG_COPY_MC_TEST)) { pr_info("%s: run...\n", __func__); } else { pr_info("%s: disabled, skip.\n", __func__); @@ -3344,31 +3345,31 @@ void mcsafe_test(void) switch (inj) { case INJECT_NONE: - mcsafe_inject_src(NULL); - mcsafe_inject_dst(NULL); - dst = &mcsafe_buf[2048]; - src = &mcsafe_buf[1024 - i]; + copy_mc_inject_src(NULL); + copy_mc_inject_dst(NULL); + dst = ©_mc_buf[2048]; + src = ©_mc_buf[1024 - i]; expect = 0; break; case INJECT_SRC: - mcsafe_inject_src(&mcsafe_buf[1024]); - mcsafe_inject_dst(NULL); - dst = &mcsafe_buf[2048]; - src = &mcsafe_buf[1024 - i]; + copy_mc_inject_src(©_mc_buf[1024]); + copy_mc_inject_dst(NULL); + dst = ©_mc_buf[2048]; + src = ©_mc_buf[1024 - i]; expect = 512 - i; break; case INJECT_DST: - mcsafe_inject_src(NULL); - mcsafe_inject_dst(&mcsafe_buf[2048]); - dst = &mcsafe_buf[2048 - i]; - src = &mcsafe_buf[1024]; + copy_mc_inject_src(NULL); + copy_mc_inject_dst(©_mc_buf[2048]); + dst = ©_mc_buf[2048 - i]; + src = ©_mc_buf[1024]; expect = 512 - i; break; } - mcsafe_test_init(dst, src, 512); - rem = __memcpy_mcsafe(dst, src, 512); - valid = mcsafe_test_validate(dst, src, 512, expect); + copy_mc_test_init(dst, src, 512); + rem = copy_mc_fragile(dst, src, 512); + valid = copy_mc_test_validate(dst, src, 512, expect); if (rem == expect && valid) continue; pr_info("%s: copy(%#lx, %#lx, %d) off: %d rem: %ld %s expect: %ld\n", @@ -3380,8 +3381,8 @@ void mcsafe_test(void) } } - mcsafe_inject_src(NULL); - mcsafe_inject_dst(NULL); + copy_mc_inject_src(NULL); + copy_mc_inject_dst(NULL); } static __init int nfit_test_init(void) @@ -3392,7 +3393,7 @@ static __init int nfit_test_init(void) libnvdimm_test(); acpi_nfit_test(); device_dax_test(); - mcsafe_test(); + copy_mc_test(); dax_pmem_test(); dax_pmem_core_test(); #ifdef CONFIG_DEV_DAX_PMEM_COMPAT diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore index ddaf140b8255..994b11af765c 100644 --- a/tools/testing/selftests/powerpc/copyloops/.gitignore +++ b/tools/testing/selftests/powerpc/copyloops/.gitignore @@ -12,4 +12,4 @@ memcpy_p7_t1 copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2 -memcpy_mcsafe_64 +copy_mc_64 diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 0917983a1c78..3095b1f1c02b 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -12,7 +12,7 @@ ASFLAGS = $(CFLAGS) -Wa,-mpower4 TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ copyuser_p7_t0 copyuser_p7_t1 \ memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \ - memcpy_p7_t0 memcpy_p7_t1 memcpy_mcsafe_64 \ + memcpy_p7_t0 memcpy_p7_t1 copy_mc_64 \ copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2 EXTRA_SOURCES := validate.c ../harness.c stubs.S @@ -45,9 +45,9 @@ $(OUTPUT)/memcpy_p7_t%: memcpy_power7.S $(EXTRA_SOURCES) -D SELFTEST_CASE=$(subst memcpy_p7_t,,$(notdir $@)) \ -o $@ $^ -$(OUTPUT)/memcpy_mcsafe_64: memcpy_mcsafe_64.S $(EXTRA_SOURCES) +$(OUTPUT)/copy_mc_64: copy_mc_64.S $(EXTRA_SOURCES) $(CC) $(CPPFLAGS) $(CFLAGS) \ - -D COPY_LOOP=test_memcpy_mcsafe \ + -D COPY_LOOP=test_copy_mc_generic \ -o $@ $^ $(OUTPUT)/copyuser_64_exc_t%: copyuser_64.S exc_validate.c ../harness.c \ diff --git a/tools/testing/selftests/powerpc/copyloops/copy_mc_64.S b/tools/testing/selftests/powerpc/copyloops/copy_mc_64.S new file mode 120000 index 000000000000..dcbe06d500fb --- /dev/null +++ b/tools/testing/selftests/powerpc/copyloops/copy_mc_64.S @@ -0,0 +1 @@ +../../../../../arch/powerpc/lib/copy_mc_64.S \ No newline at end of file diff --git a/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S b/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S deleted file mode 120000 index f0feef3062f6..000000000000 --- a/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S +++ /dev/null @@ -1 +0,0 @@ -../../../../../arch/powerpc/lib/memcpy_mcsafe_64.S \ No newline at end of file From 5da8e4a658109e3b7e1f45ae672b7c06ac3e7158 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 5 Oct 2020 20:40:25 -0700 Subject: [PATCH 13/20] x86/copy_mc: Introduce copy_mc_enhanced_fast_string() The motivations to go rework memcpy_mcsafe() are that the benefit of doing slow and careful copies is obviated on newer CPUs, and that the current opt-in list of CPUs to instrument recovery is broken relative to those CPUs. There is no need to keep an opt-in list up to date on an ongoing basis if pmem/dax operations are instrumented for recovery by default. With recovery enabled by default the old "mcsafe_key" opt-in to careful copying can be made a "fragile" opt-out. Where the "fragile" list takes steps to not consume poison across cachelines. The discussion with Linus made clear that the current "_mcsafe" suffix was imprecise to a fault. The operations that are needed by pmem/dax are to copy from a source address that might throw #MC to a destination that may write-fault, if it is a user page. So copy_to_user_mcsafe() becomes copy_mc_to_user() to indicate the separate precautions taken on source and destination. copy_mc_to_kernel() is introduced as a non-SMAP version that does not expect write-faults on the destination, but is still prepared to abort with an error code upon taking #MC. The original copy_mc_fragile() implementation had negative performance implications since it did not use the fast-string instruction sequence to perform copies. For this reason copy_mc_to_kernel() fell back to plain memcpy() to preserve performance on platforms that did not indicate the capability to recover from machine check exceptions. However, that capability detection was not architectural and now that some platforms can recover from fast-string consumption of memory errors the memcpy() fallback now causes these more capable platforms to fail. Introduce copy_mc_enhanced_fast_string() as the fast default implementation of copy_mc_to_kernel() and finalize the transition of copy_mc_fragile() to be a platform quirk to indicate 'copy-carefully'. With this in place, copy_mc_to_kernel() is fast and recovery-ready by default regardless of hardware capability. Thanks to Vivek for identifying that copy_user_generic() is not suitable as the copy_mc_to_user() backend since the #MC handler explicitly checks ex_has_fault_handler(). Thanks to the 0day robot for catching a performance bug in the x86/copy_mc_to_user implementation. [ bp: Add the "why" for this change from the 0/2th message, massage. ] Fixes: 92b0729c34ca ("x86/mm, x86/mce: Add memcpy_mcsafe()") Reported-by: Erwin Tsaur Reported-by: 0day robot Signed-off-by: Dan Williams Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Tested-by: Erwin Tsaur Cc: Link: https://lkml.kernel.org/r/160195562556.2163339.18063423034951948973.stgit@dwillia2-desk3.amr.corp.intel.com --- arch/x86/lib/copy_mc.c | 32 +++++++++++++++++++++++--------- arch/x86/lib/copy_mc_64.S | 36 ++++++++++++++++++++++++++++++++++++ tools/objtool/check.c | 1 + 3 files changed, 60 insertions(+), 9 deletions(-) diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c index 2633635530b7..c13e8c9ee926 100644 --- a/arch/x86/lib/copy_mc.c +++ b/arch/x86/lib/copy_mc.c @@ -45,6 +45,8 @@ void enable_copy_mc_fragile(void) #define copy_mc_fragile_enabled (0) #endif +unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len); + /** * copy_mc_to_kernel - memory copy that handles source exceptions * @@ -52,9 +54,11 @@ void enable_copy_mc_fragile(void) * @src: source address * @len: number of bytes to copy * - * Call into the 'fragile' version on systems that have trouble - * actually do machine check recovery. Everyone else can just - * use memcpy(). + * Call into the 'fragile' version on systems that benefit from avoiding + * corner case poison consumption scenarios, For example, accessing + * poison across 2 cachelines with a single instruction. Almost all + * other uses case can use copy_mc_enhanced_fast_string() for a fast + * recoverable copy, or fallback to plain memcpy. * * Return 0 for success, or number of bytes not copied if there was an * exception. @@ -63,6 +67,8 @@ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigne { if (copy_mc_fragile_enabled) return copy_mc_fragile(dst, src, len); + if (static_cpu_has(X86_FEATURE_ERMS)) + return copy_mc_enhanced_fast_string(dst, src, len); memcpy(dst, src, len); return 0; } @@ -72,11 +78,19 @@ unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned { unsigned long ret; - if (!copy_mc_fragile_enabled) - return copy_user_generic(dst, src, len); + if (copy_mc_fragile_enabled) { + __uaccess_begin(); + ret = copy_mc_fragile(dst, src, len); + __uaccess_end(); + return ret; + } - __uaccess_begin(); - ret = copy_mc_fragile(dst, src, len); - __uaccess_end(); - return ret; + if (static_cpu_has(X86_FEATURE_ERMS)) { + __uaccess_begin(); + ret = copy_mc_enhanced_fast_string(dst, src, len); + __uaccess_end(); + return ret; + } + + return copy_user_generic(dst, src, len); } diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S index c3b613c4544a..892d8915f609 100644 --- a/arch/x86/lib/copy_mc_64.S +++ b/arch/x86/lib/copy_mc_64.S @@ -124,4 +124,40 @@ EXPORT_SYMBOL_GPL(copy_mc_fragile) _ASM_EXTABLE(.L_write_words, .E_write_words) _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) #endif /* CONFIG_X86_MCE */ + +/* + * copy_mc_enhanced_fast_string - memory copy with exception handling + * + * Fast string copy + fault / exception handling. If the CPU does + * support machine check exception recovery, but does not support + * recovering from fast-string exceptions then this CPU needs to be + * added to the copy_mc_fragile_key set of quirks. Otherwise, absent any + * machine check recovery support this version should be no slower than + * standard memcpy. + */ +SYM_FUNC_START(copy_mc_enhanced_fast_string) + movq %rdi, %rax + movq %rdx, %rcx +.L_copy: + rep movsb + /* Copy successful. Return zero */ + xorl %eax, %eax + ret +SYM_FUNC_END(copy_mc_enhanced_fast_string) + + .section .fixup, "ax" +.E_copy: + /* + * On fault %rcx is updated such that the copy instruction could + * optionally be restarted at the fault position, i.e. it + * contains 'bytes remaining'. A non-zero return indicates error + * to copy_mc_generic() users, or indicate short transfers to + * user-copy routines. + */ + movq %rcx, %rax + ret + + .previous + + _ASM_EXTABLE_FAULT(.L_copy, .E_copy) #endif /* !CONFIG_UML */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 893f021fec63..b3e4efcf7ca6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -550,6 +550,7 @@ static const char *uaccess_safe_builtin[] = { "csum_partial_copy_generic", "copy_mc_fragile", "copy_mc_fragile_handle_tail", + "copy_mc_enhanced_fast_string", "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ NULL }; From 41ce0564bfe2e129d56730418d8c0a9f9f2d31b5 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Tue, 6 Oct 2020 14:09:05 -0700 Subject: [PATCH 14/20] x86/mce: Pass pointer to saved pt_regs to severity calculation routines New recovery features require additional information about processor state when a machine check occurred. Pass pt_regs down to the routines that need it. No functional change. Signed-off-by: Youquan Song Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201006210910.21062-2-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 14 +++++++------- arch/x86/kernel/cpu/mce/internal.h | 3 ++- arch/x86/kernel/cpu/mce/severity.c | 14 ++++++++------ 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b5b70f4b351d..2d6caf09e8e6 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -807,7 +807,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) goto clear_it; mce_read_aux(&m, i); - m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. @@ -856,7 +856,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, quirk_no_way_out(i, m, regs); m->bank = i; - if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { mce_read_aux(m, i); *msg = tmp; return 1; @@ -956,7 +956,7 @@ static void mce_reign(void) */ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { /* call mce_severity() to get "msg" for panic */ - mce_severity(m, mca_cfg.tolerant, &msg, true); + mce_severity(m, NULL, mca_cfg.tolerant, &msg, true); mce_panic("Fatal machine check", m, msg); } @@ -1167,7 +1167,7 @@ static noinstr bool mce_check_crashing_cpu(void) return false; } -static void __mc_scan_banks(struct mce *m, struct mce *final, +static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, unsigned long *toclear, unsigned long *valid_banks, int no_way_out, int *worst) { @@ -1202,7 +1202,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, /* Set taint even when machine check was not enabled. */ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(m, cfg->tolerant, NULL, true); + severity = mce_severity(m, regs, cfg->tolerant, NULL, true); /* * When machine check was for corrected/deferred handler don't @@ -1354,7 +1354,7 @@ noinstr void do_machine_check(struct pt_regs *regs) order = mce_start(&no_way_out); } - __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); + __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1376,7 +1376,7 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { - mce_severity(&m, cfg->tolerant, &msg, true); + mce_severity(&m, regs, cfg->tolerant, &msg, true); mce_panic("Local fatal machine check!", &m, msg); } } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b122610e9046..88dcc79cfb07 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -38,7 +38,8 @@ int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); -extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); +extern int (*mce_severity)(struct mce *a, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index e0722461bb57..0b072dc231ad 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -223,7 +223,7 @@ static struct severity { * distinguish an exception taken in user from from one * taken in the kernel. */ -static int error_context(struct mce *m) +static int error_context(struct mce *m, struct pt_regs *regs) { if ((m->cs & 3) == 3) return IN_USER; @@ -267,9 +267,10 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) * See AMD Error Scope Hierarchy table in a newer BKDG. For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" */ -static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) +static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, + char **msg, bool is_excp) { - enum context ctx = error_context(m); + enum context ctx = error_context(m, regs); /* Processor Context Corrupt, no need to fumble too much, die! */ if (m->status & MCI_STATUS_PCC) @@ -319,10 +320,11 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc return MCE_KEEP_SEVERITY; } -static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) +static int mce_severity_intel(struct mce *m, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); - enum context ctx = error_context(m); + enum context ctx = error_context(m, regs); struct severity *s; for (s = severities;; s++) { @@ -356,7 +358,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e } /* Default to mce_severity_intel */ -int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = +int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) = mce_severity_intel; void __init mcheck_vendor_init_severity(void) From a05d54c41ecfa1a322b229b4e5ce50c157284f74 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 6 Oct 2020 14:09:06 -0700 Subject: [PATCH 15/20] x86/mce: Provide method to find out the type of an exception handler Avoid a proliferation of ex_has_*_handler() functions by having just one function that returns the type of the handler (if any). Drop the __visible attribute for this function. It is not called from assembler so the attribute is not necessary. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201006210910.21062-3-tony.luck@intel.com --- arch/x86/include/asm/extable.h | 9 ++++++++- arch/x86/kernel/cpu/mce/severity.c | 5 ++++- arch/x86/mm/extable.c | 12 ++++++++---- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index d8c2198d543b..1f0cbc52937c 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -29,10 +29,17 @@ struct pt_regs; (b)->handler = (tmp).handler - (delta); \ } while (0) +enum handler_type { + EX_HANDLER_NONE, + EX_HANDLER_FAULT, + EX_HANDLER_UACCESS, + EX_HANDLER_OTHER +}; + extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); extern int fixup_bug(struct pt_regs *regs, int trapnr); -extern bool ex_has_fault_handler(unsigned long ip); +extern enum handler_type ex_get_fault_handler_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); #endif diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 0b072dc231ad..c6494e6579c1 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -225,10 +225,13 @@ static struct severity { */ static int error_context(struct mce *m, struct pt_regs *regs) { + enum handler_type t; + if ((m->cs & 3) == 3) return IN_USER; - if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) { + t = ex_get_fault_handler_type(m->ip); + if (mc_recoverable(m->mcgstatus) && t == EX_HANDLER_FAULT) { m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 1d6cb07f4f86..de43525df69b 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -125,17 +125,21 @@ __visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_clear_fs); -__visible bool ex_has_fault_handler(unsigned long ip) +enum handler_type ex_get_fault_handler_type(unsigned long ip) { const struct exception_table_entry *e; ex_handler_t handler; e = search_exception_tables(ip); if (!e) - return false; + return EX_HANDLER_NONE; handler = ex_fixup_handler(e); - - return handler == ex_handler_fault; + if (handler == ex_handler_fault) + return EX_HANDLER_FAULT; + else if (handler == ex_handler_uaccess) + return EX_HANDLER_UACCESS; + else + return EX_HANDLER_OTHER; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, From 278b917f8cb9b02923c15249f9d1a5769d2c1976 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Tue, 6 Oct 2020 14:09:07 -0700 Subject: [PATCH 16/20] x86/mce: Add _ASM_EXTABLE_CPY for copy user access _ASM_EXTABLE_UA is a general exception entry to record the exception fixup for all exception spots between kernel and user space access. To enable recovery from machine checks while coping data from user addresses it is necessary to be able to distinguish the places that are looping copying data from those that copy a single byte/word/etc. Add a new macro _ASM_EXTABLE_CPY and use it in place of _ASM_EXTABLE_UA in the copy functions. Record the exception reason number to regs->ax at ex_handler_uaccess which is used to check MCE triggered. The new fixup routine ex_handler_copy() is almost an exact copy of ex_handler_uaccess() The difference is that it sets regs->ax to the trap number. Following patches use this to avoid trying to copy remaining bytes from the tail of the copy and possibly hitting the poison again. New mce.kflags bit MCE_IN_KERNEL_COPYIN will be used by mce_severity() calculation to indicate that a machine check is recoverable because the kernel was copying from user space. Signed-off-by: Youquan Song Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201006210910.21062-4-tony.luck@intel.com --- arch/x86/include/asm/asm.h | 6 +++ arch/x86/include/asm/mce.h | 15 ++++++ arch/x86/lib/copy_user_64.S | 96 ++++++++++++++++++------------------- arch/x86/mm/extable.c | 14 +++++- 4 files changed, 82 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 5c15f95b1ba7..0359cbbd0f50 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -135,6 +135,9 @@ # define _ASM_EXTABLE_UA(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) +# define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + # define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) @@ -160,6 +163,9 @@ # define _ASM_EXTABLE_UA(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) +# define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + # define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ba2062d6df92..a0f147893a04 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -136,8 +136,23 @@ #define MCE_HANDLED_NFIT BIT_ULL(3) #define MCE_HANDLED_EDAC BIT_ULL(4) #define MCE_HANDLED_MCELOG BIT_ULL(5) + +/* + * Indicates an MCE which has happened in kernel space but from + * which the kernel can recover simply by executing fixup_exception() + * so that an error is returned to the caller of the function that + * hit the machine check. + */ #define MCE_IN_KERNEL_RECOV BIT_ULL(6) +/* + * Indicates an MCE that happened in kernel space while copying data + * from user. In this case fixup_exception() gets the kernel to the + * error exit for the copy function. Machine check handler can then + * treat it like a fault taken in user mode. + */ +#define MCE_IN_KERNEL_COPYIN BIT_ULL(7) + /* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 816f128a6d52..5b68e945bf65 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -36,8 +36,8 @@ jmp .Lcopy_user_handle_tail .previous - _ASM_EXTABLE_UA(100b, 103b) - _ASM_EXTABLE_UA(101b, 103b) + _ASM_EXTABLE_CPY(100b, 103b) + _ASM_EXTABLE_CPY(101b, 103b) .endm /* @@ -116,26 +116,26 @@ SYM_FUNC_START(copy_user_generic_unrolled) 60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */ .previous - _ASM_EXTABLE_UA(1b, 30b) - _ASM_EXTABLE_UA(2b, 30b) - _ASM_EXTABLE_UA(3b, 30b) - _ASM_EXTABLE_UA(4b, 30b) - _ASM_EXTABLE_UA(5b, 30b) - _ASM_EXTABLE_UA(6b, 30b) - _ASM_EXTABLE_UA(7b, 30b) - _ASM_EXTABLE_UA(8b, 30b) - _ASM_EXTABLE_UA(9b, 30b) - _ASM_EXTABLE_UA(10b, 30b) - _ASM_EXTABLE_UA(11b, 30b) - _ASM_EXTABLE_UA(12b, 30b) - _ASM_EXTABLE_UA(13b, 30b) - _ASM_EXTABLE_UA(14b, 30b) - _ASM_EXTABLE_UA(15b, 30b) - _ASM_EXTABLE_UA(16b, 30b) - _ASM_EXTABLE_UA(18b, 40b) - _ASM_EXTABLE_UA(19b, 40b) - _ASM_EXTABLE_UA(21b, 50b) - _ASM_EXTABLE_UA(22b, 50b) + _ASM_EXTABLE_CPY(1b, 30b) + _ASM_EXTABLE_CPY(2b, 30b) + _ASM_EXTABLE_CPY(3b, 30b) + _ASM_EXTABLE_CPY(4b, 30b) + _ASM_EXTABLE_CPY(5b, 30b) + _ASM_EXTABLE_CPY(6b, 30b) + _ASM_EXTABLE_CPY(7b, 30b) + _ASM_EXTABLE_CPY(8b, 30b) + _ASM_EXTABLE_CPY(9b, 30b) + _ASM_EXTABLE_CPY(10b, 30b) + _ASM_EXTABLE_CPY(11b, 30b) + _ASM_EXTABLE_CPY(12b, 30b) + _ASM_EXTABLE_CPY(13b, 30b) + _ASM_EXTABLE_CPY(14b, 30b) + _ASM_EXTABLE_CPY(15b, 30b) + _ASM_EXTABLE_CPY(16b, 30b) + _ASM_EXTABLE_CPY(18b, 40b) + _ASM_EXTABLE_CPY(19b, 40b) + _ASM_EXTABLE_CPY(21b, 50b) + _ASM_EXTABLE_CPY(22b, 50b) SYM_FUNC_END(copy_user_generic_unrolled) EXPORT_SYMBOL(copy_user_generic_unrolled) @@ -180,8 +180,8 @@ SYM_FUNC_START(copy_user_generic_string) jmp .Lcopy_user_handle_tail .previous - _ASM_EXTABLE_UA(1b, 11b) - _ASM_EXTABLE_UA(3b, 12b) + _ASM_EXTABLE_CPY(1b, 11b) + _ASM_EXTABLE_CPY(3b, 12b) SYM_FUNC_END(copy_user_generic_string) EXPORT_SYMBOL(copy_user_generic_string) @@ -213,7 +213,7 @@ SYM_FUNC_START(copy_user_enhanced_fast_string) jmp .Lcopy_user_handle_tail .previous - _ASM_EXTABLE_UA(1b, 12b) + _ASM_EXTABLE_CPY(1b, 12b) SYM_FUNC_END(copy_user_enhanced_fast_string) EXPORT_SYMBOL(copy_user_enhanced_fast_string) @@ -237,7 +237,7 @@ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) ASM_CLAC ret - _ASM_EXTABLE_UA(1b, 2b) + _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) /* @@ -366,27 +366,27 @@ SYM_FUNC_START(__copy_user_nocache) jmp .Lcopy_user_handle_tail .previous - _ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(2b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(3b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(4b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(5b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(6b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(7b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(8b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(9b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(10b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(11b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(12b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(13b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(14b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(15b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(16b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_UA(20b, .L_fixup_8b_copy) - _ASM_EXTABLE_UA(21b, .L_fixup_8b_copy) - _ASM_EXTABLE_UA(30b, .L_fixup_4b_copy) - _ASM_EXTABLE_UA(31b, .L_fixup_4b_copy) - _ASM_EXTABLE_UA(40b, .L_fixup_1b_copy) - _ASM_EXTABLE_UA(41b, .L_fixup_1b_copy) + _ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy) + _ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy) + _ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy) + _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy) + _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy) + _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) SYM_FUNC_END(__copy_user_nocache) EXPORT_SYMBOL(__copy_user_nocache) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index de43525df69b..5829457f7ca3 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -80,6 +80,18 @@ __visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_uaccess); +__visible bool ex_handler_copy(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); + regs->ip = ex_fixup_addr(fixup); + regs->ax = trapnr; + return true; +} +EXPORT_SYMBOL(ex_handler_copy); + __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long error_code, @@ -136,7 +148,7 @@ enum handler_type ex_get_fault_handler_type(unsigned long ip) handler = ex_fixup_handler(e); if (handler == ex_handler_fault) return EX_HANDLER_FAULT; - else if (handler == ex_handler_uaccess) + else if (handler == ex_handler_uaccess || handler == ex_handler_copy) return EX_HANDLER_UACCESS; else return EX_HANDLER_OTHER; From a2f73400e4dfd13f673c6e1b4b98d180fd1e47b3 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 6 Oct 2020 14:09:08 -0700 Subject: [PATCH 17/20] x86/mce: Avoid tail copy when machine check terminated a copy from user In the page fault case it is ok to see if a few more unaligned bytes can be copied from the source address. Worst case is that the page fault will be triggered again. Machine checks are more serious. Just give up at the point where the main copy loop triggered the #MC and return from the copy code as if the copy succeeded. The machine check handler will use task_work_add() to make sure that the task is sent a SIGBUS. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201006210910.21062-5-tony.luck@intel.com --- arch/x86/lib/copy_user_64.S | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 5b68e945bf65..77b9b2a3b5c8 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -15,6 +15,7 @@ #include #include #include +#include .macro ALIGN_DESTINATION /* check for bad alignment of destination */ @@ -221,6 +222,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Try to copy last bytes and clear the rest if needed. * Since protection fault in copy_from/to_user is not a normal situation, * it is not necessary to optimize tail handling. + * Don't try to copy the tail if machine check happened * * Input: * rdi destination @@ -232,11 +234,24 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) movl %edx,%ecx + cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ + je 3f 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret + /* + * Return zero to pretend that this copy succeeded. This + * is counter-intuitive, but needed to prevent the code + * in lib/iov_iter.c from retrying and running back into + * the poison cache line again. The machine check handler + * will ensure that a SIGBUS is sent to the task. + */ +3: xorl %eax,%eax + ASM_CLAC + ret + _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) From c0ab7ffce275d3f83bd253c70889c28821d4a41d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 6 Oct 2020 14:09:09 -0700 Subject: [PATCH 18/20] x86/mce: Recover from poison found while copying from user space Existing kernel code can only recover from a machine check on code that is tagged in the exception table with a fault handling recovery path. Add two new fields in the task structure to pass information from machine check handler to the "task_work" that is queued to run before the task returns to user mode: + mce_vaddr: will be initialized to the user virtual address of the fault in the case where the fault occurred in the kernel copying data from a user address. This is so that kill_me_maybe() can provide that information to the user SIGBUS handler. + mce_kflags: copy of the struct mce.kflags needed by kill_me_maybe() to determine if mce_vaddr is applicable to this error. Add code to recover from a machine check while copying data from user space to the kernel. Action for this case is the same as if the user touched the poison directly; unmap the page and send a SIGBUS to the task. Use a new helper function to share common code between the "fault in user mode" case and the "fault while copying from user" case. New code paths will be activated by the next patch which sets MCE_IN_KERNEL_COPYIN. Suggested-by: Borislav Petkov Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201006210910.21062-6-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 27 ++++++++++++++++++++------- include/linux/sched.h | 2 ++ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2d6caf09e8e6..5c423c4bab68 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1260,6 +1260,21 @@ static void kill_me_maybe(struct callback_head *cb) kill_me_now(cb); } +static void queue_task_work(struct mce *m, int kill_it) +{ + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + + if (kill_it) + current->mce_kill_me.func = kill_me_now; + else + current->mce_kill_me.func = kill_me_maybe; + + task_work_add(current, ¤t->mce_kill_me, true); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1401,13 +1416,8 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - current->mce_addr = m.addr; - current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV); - current->mce_whole_page = whole_page(&m); - current->mce_kill_me.func = kill_me_maybe; - if (kill_it) - current->mce_kill_me.func = kill_me_now; - task_work_add(current, ¤t->mce_kill_me, true); + queue_task_work(&m, kill_it); + } else { /* * Handle an MCE which has happened in kernel space but from @@ -1422,6 +1432,9 @@ noinstr void do_machine_check(struct pt_regs *regs) if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } + + if (m.kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&m, kill_it); } out: mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); diff --git a/include/linux/sched.h b/include/linux/sched.h index 93ecd930efd3..2cbba3e2b150 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1308,6 +1308,8 @@ struct task_struct { #endif #ifdef CONFIG_X86_MCE + void __user *mce_vaddr; + __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, From 300638101329e8f1569115f3d7197ef5ef754a3a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 6 Oct 2020 14:09:10 -0700 Subject: [PATCH 19/20] x86/mce: Decode a kernel instruction to determine if it is copying from user All instructions copying data between kernel and user memory are tagged with either _ASM_EXTABLE_UA or _ASM_EXTABLE_CPY entries in the exception table. ex_fault_handler_type() returns EX_HANDLER_UACCESS for both of these. Recovery is only possible when the machine check was triggered on a read from user memory. In this case the same strategy for recovery applies as if the user had made the access in ring3. If the fault was in kernel memory while copying to user there is no current recovery plan. For MOV and MOVZ instructions a full decode of the instruction is done to find the source address. For MOVS instructions the source address is in the %rsi register. The function fault_in_kernel_space() determines whether the source address is kernel or user, upgrade it from "static" so it can be used here. Co-developed-by: Youquan Song Signed-off-by: Youquan Song Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201006210910.21062-7-tony.luck@intel.com --- arch/x86/include/asm/traps.h | 2 ++ arch/x86/kernel/cpu/mce/core.c | 11 +++++-- arch/x86/kernel/cpu/mce/severity.c | 53 +++++++++++++++++++++++++++++- arch/x86/mm/fault.c | 2 +- 4 files changed, 63 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 714b1a30e7b0..df0b7bfc1234 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -35,6 +35,8 @@ extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); +bool fault_in_kernel_space(unsigned long address); + #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(const char *message, struct pt_regs *regs, diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 5c423c4bab68..3d6e1bfa4f9d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1250,14 +1250,19 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && + !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; } - pr_err("Memory error not recovered"); - kill_me_now(cb); + if (p->mce_vaddr != (void __user *)-1l) { + force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); + } else { + pr_err("Memory error not recovered"); + kill_me_now(cb); + } } static void queue_task_work(struct mce *m, int kill_it) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index c6494e6579c1..83df991314c5 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -13,6 +13,9 @@ #include #include +#include +#include +#include #include "internal.h" @@ -212,6 +215,47 @@ static struct severity { #define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) +static bool is_copy_from_user(struct pt_regs *regs) +{ + u8 insn_buf[MAX_INSN_SIZE]; + struct insn insn; + unsigned long addr; + + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + return false; + + kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); + insn_get_opcode(&insn); + if (!insn.opcode.got) + return false; + + switch (insn.opcode.value) { + /* MOV mem,reg */ + case 0x8A: case 0x8B: + /* MOVZ mem,reg */ + case 0xB60F: case 0xB70F: + insn_get_modrm(&insn); + insn_get_sib(&insn); + if (!insn.modrm.got || !insn.sib.got) + return false; + addr = (unsigned long)insn_get_addr_ref(&insn, regs); + break; + /* REP MOVS */ + case 0xA4: case 0xA5: + addr = regs->si; + break; + default: + return false; + } + + if (fault_in_kernel_space(addr)) + return false; + + current->mce_vaddr = (void __user *)addr; + + return true; +} + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -229,12 +273,19 @@ static int error_context(struct mce *m, struct pt_regs *regs) if ((m->cs & 3) == 3) return IN_USER; + if (!mc_recoverable(m->mcgstatus)) + return IN_KERNEL; t = ex_get_fault_handler_type(m->ip); - if (mc_recoverable(m->mcgstatus) && t == EX_HANDLER_FAULT) { + if (t == EX_HANDLER_FAULT) { m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; } + if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { + m->kflags |= MCE_IN_KERNEL_RECOV; + m->kflags |= MCE_IN_KERNEL_COPYIN; + return IN_KERNEL_RECOV; + } return IN_KERNEL; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 35f1498e9832..88ae443e4e5f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1081,7 +1081,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) return 0; } -static int fault_in_kernel_space(unsigned long address) +bool fault_in_kernel_space(unsigned long address) { /* * On 64-bit systems, the vsyscall page is at an address above From b3149ffcdb31a8eb854cc442a389ae0b539bf28a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 7 Oct 2020 18:55:35 +0200 Subject: [PATCH 20/20] x86/mce: Allow for copy_mc_fragile symbol checksum to be generated Add asm/mce.h to asm/asm-prototypes.h so that that asm symbol's checksum can be generated in order to support CONFIG_MODVERSIONS with it and fix: WARNING: modpost: EXPORT symbol "copy_mc_fragile" [vmlinux] version \ generation failed, symbol will not be versioned. For reference see: 4efca4ed05cb ("kbuild: modversions for EXPORT_SYMBOL() for asm") 334bb7738764 ("x86/kbuild: enable modversions for symbols exported from asm") Fixes: ec6347bb4339 ("x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()") Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201007111447.GA23257@zn.tnic --- arch/x86/include/asm/asm-prototypes.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 5a42f9206138..51e2bf27cc9b 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -5,6 +5,7 @@ #include #include #include +#include #include