From b1d405751cd5792856b1b8333aafaca6bf09ccbb Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:23 +0200 Subject: [PATCH 001/127] KVM: x86: Switch KVM guest to using interrupts for page ready APF delivery KVM now supports using interrupt for 'page ready' APF event delivery and legacy mechanism was deprecated. Switch KVM guests to the new one. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-9-vkuznets@redhat.com> [Use HYPERVISOR_CALLBACK_VECTOR instead of a separate vector. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/Kconfig | 1 + arch/x86/entry/entry_32.S | 5 ++++ arch/x86/entry/entry_64.S | 5 ++++ arch/x86/include/asm/kvm_para.h | 7 +++++ arch/x86/kernel/kvm.c | 50 ++++++++++++++++++++++----------- 5 files changed, 52 insertions(+), 16 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 67f6a40b5e93..31d0ca8de02f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -801,6 +801,7 @@ config KVM_GUEST depends on PARAVIRT select PARAVIRT_CLOCK select ARCH_CPUIDLE_HALTPOLL + select X86_HV_CALLBACK_VECTOR default y ---help--- This option enables various optimizations for running under the KVM diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a5eed844e948..e0d1938c0415 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1475,6 +1475,11 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, #endif /* CONFIG_HYPERV */ +#ifdef CONFIG_KVM_GUEST +BUILD_INTERRUPT3(kvm_async_pf_vector, HYPERVISOR_CALLBACK_VECTOR, + kvm_async_pf_intr) +#endif + SYM_CODE_START(page_fault) ASM_CLAC pushl $do_page_fault diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index eead1e2bebd5..cd8af69dd9ff 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1190,6 +1190,11 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ acrn_hv_callback_vector acrn_hv_vector_handler #endif +#ifdef CONFIG_KVM_GUEST +apicinterrupt3 
HYPERVISOR_CALLBACK_VECTOR \ + kvm_async_pf_vector kvm_async_pf_intr +#endif + idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET idtentry int3 do_int3 has_error_code=0 create_gap=1 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 57fd1966c4ea..bbc43e5411d9 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -4,6 +4,7 @@ #include #include +#include #include extern void kvmclock_init(void); @@ -104,6 +105,12 @@ static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) return false; } +extern __visible void kvm_async_pf_vector(void); +#ifdef CONFIG_TRACING +#define trace_kvm_async_pf_vector kvm_async_pf_vector +#endif +__visible void __irq_entry kvm_async_pf_intr(struct pt_regs *regs); + #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init kvm_spinlock_init(void); #else /* !CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7e6403a8d861..3a0115e8d880 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -233,15 +233,10 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_apf_flags); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - u32 reason = kvm_read_and_reset_apf_flags(); + u32 flags = kvm_read_and_reset_apf_flags(); - switch (reason) { - case KVM_PV_REASON_PAGE_NOT_PRESENT: - case KVM_PV_REASON_PAGE_READY: - break; - default: + if (!flags) return false; - } /* * If the host managed to inject an async #PF into an interrupt @@ -251,20 +246,39 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) if (unlikely(!(regs->flags & X86_EFLAGS_IF))) panic("Host injected async #PF in interrupt disabled region\n"); - if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) { + if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { if (unlikely(!(user_mode(regs)))) panic("Host injected async #PF in kernel mode\n"); /* Page is swapped out by the 
host. */ kvm_async_pf_task_wait_schedule(token); - } else { - rcu_irq_enter(); - kvm_async_pf_task_wake(token); - rcu_irq_exit(); + return true; } + + WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags); return true; } NOKPROBE_SYMBOL(__kvm_handle_async_pf); +__visible void __irq_entry kvm_async_pf_intr(struct pt_regs *regs) +{ + u32 token; + + entering_ack_irq(); + + inc_irq_stat(irq_hv_callback_count); + + if (__this_cpu_read(apf_reason.enabled)) { + token = __this_cpu_read(apf_reason.token); + rcu_irq_enter(); + kvm_async_pf_task_wake(token); + rcu_irq_exit(); + __this_cpu_write(apf_reason.token, 0); + wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + } + + exiting_irq(); +} + static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; @@ -308,17 +322,19 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa; + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { + u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); - pa |= KVM_ASYNC_PF_ENABLED; + pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); @@ -643,8 +659,10 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, kvm_async_pf_vector); + } #ifdef CONFIG_SMP 
smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; From a23816f3cdcbffe5dc6e8c331914b3f51b87c2f3 Mon Sep 17 00:00:00 2001 From: Collin Walling Date: Mon, 22 Jun 2020 11:46:35 -0400 Subject: [PATCH 002/127] s390/setup: diag 318: refactor struct The diag 318 struct introduced in include/asm/diag.h can be reused in KVM, so let's condense the version code fields in the diag318_info struct for easier usage and simplify it until we can determine how the data should be formatted. Signed-off-by: Collin Walling Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Reviewed-by: Thomas Huth Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20200622154636.5499-2-walling@linux.ibm.com Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/diag.h | 6 ++---- arch/s390/kernel/setup.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 0036eab14391..ca8f85b53a90 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -298,10 +298,8 @@ struct diag26c_mac_resp { union diag318_info { unsigned long val; struct { - unsigned int cpnc : 8; - unsigned int cpvc_linux : 24; - unsigned char cpvc_distro[3]; - unsigned char zero; + unsigned long cpnc : 8; + unsigned long cpvc : 56; }; }; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 5853c9872dfe..878cacfc9c3e 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -1021,8 +1021,7 @@ static void __init setup_control_program_code(void) { union diag318_info diag318_info = { .cpnc = CPNC_LINUX, - .cpvc_linux = 0, - .cpvc_distro = {0}, + .cpvc = 0, }; if (!sclp.has_diag318) From 23a60f834406c8e3805328b630d09d5546b460c1 Mon Sep 17 00:00:00 2001 From: Collin Walling Date: Mon, 22 Jun 2020 11:46:36 -0400 Subject: [PATCH 003/127] s390/kvm: diagnose 0x318 sync and reset DIAGNOSE 0x318 (diag318) sets information regarding the environment the VM is running in (Linux, z/VM, etc) and is 
observed via firmware/service events. This is a privileged s390x instruction that must be intercepted by SIE. Userspace handles the instruction as well as migration. Data is communicated via VCPU register synchronization. The Control Program Name Code (CPNC) is stored in the SIE block. The CPNC along with the Control Program Version Code (CPVC) are stored in the kvm_vcpu_arch struct. This data is reset on load normal and clear resets. Signed-off-by: Collin Walling Reviewed-by: Janosch Frank Acked-by: Cornelia Huck Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20200622154636.5499-3-walling@linux.ibm.com [borntraeger@de.ibm.com: fix sync_reg position] Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 4 +++- arch/s390/include/uapi/asm/kvm.h | 7 +++++-- arch/s390/kvm/kvm-s390.c | 11 ++++++++++- arch/s390/kvm/vsie.c | 1 + include/uapi/linux/kvm.h | 1 + 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index cee3cb6455a2..371ec6beb618 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -260,7 +260,8 @@ struct kvm_s390_sie_block { __u32 scaol; /* 0x0064 */ __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ - __u8 reserved6a[2]; /* 0x006a */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ __u32 todpr; /* 0x006c */ #define GISA_FORMAT1 0x00000001 __u32 gd; /* 0x0070 */ @@ -745,6 +746,7 @@ struct kvm_vcpu_arch { bool gs_enabled; bool skey_enabled; struct kvm_s390_pv_vcpu pv; + union diag318_info diag318_info; }; struct kvm_vm_stat { diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 436ec7636927..7a6b14874d65 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -231,11 +231,13 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) #define KVM_SYNC_ETOKEN (1UL << 11) +#define KVM_SYNC_DIAG318 (1UL 
<< 12) #define KVM_SYNC_S390_VALID_FIELDS \ (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ - KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \ + KVM_SYNC_DIAG318) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 @@ -264,7 +266,8 @@ struct kvm_sync_regs { __u8 reserved2 : 7; __u8 padding1[51]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ - __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ + __u64 diag318; /* diagnose 0x318 info */ + __u8 padding2[184]; /* sdnx needs to be 256byte aligned */ union { __u8 sdnx[SDNXL]; /* state description annex */ struct { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d47c19718615..08e6cf6cb454 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -545,6 +545,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_AIS_MIGRATION: case KVM_CAP_S390_VCPU_RESETS: case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_S390_DIAG318: r = 1; break; case KVM_CAP_S390_HPAGE_1M: @@ -3267,7 +3268,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) KVM_SYNC_ACRS | KVM_SYNC_CRS | KVM_SYNC_ARCH0 | - KVM_SYNC_PFAULT; + KVM_SYNC_PFAULT | + KVM_SYNC_DIAG318; kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; @@ -3562,6 +3564,7 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->pp = 0; vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->todpr = 0; + vcpu->arch.sie_block->cpnc = 0; } } @@ -3579,6 +3582,7 @@ static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) regs->etoken = 0; regs->etoken_extension = 0; + regs->diag318 = 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ 
-4196,6 +4200,10 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) { + vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318; + vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc; + } /* * If userspace sets the riccb (e.g. after migration) to a valid state, * we should enable RI here instead of doing the lazy enablement. @@ -4297,6 +4305,7 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; + kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; if (MACHINE_HAS_GS) { __ctl_set_bit(2, 4); if (vcpu->arch.gs_enabled) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 9e9056cebfcf..4f3cbf6003a9 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -548,6 +548,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; scb_s->hpid = HPID_VSIE; + scb_s->cpnc = scb_o->cpnc; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4fdf30316582..35cdb4307904 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1031,6 +1031,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 +#define KVM_CAP_S390_DIAG318 184 #ifdef KVM_CAP_IRQ_ROUTING From 0ed076c7ba3d676535b01a5f848746674bdfa351 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 15 Jun 2020 09:26:36 +0100 Subject: [PATCH 004/127] KVM: MIPS: fix spelling mistake "Exteneded" -> "Extended" There is a spelling mistake in a couple of kvm_err messages. Fix them. 
Signed-off-by: Colin Ian King Message-Id: <20200615082636.7004-1-colin.king@canonical.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 5ae82d925197..d3d322f70fe0 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1861,7 +1861,7 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, vcpu->arch.gprs[rt], *(u64 *)data); break; default: - kvm_err("Godson Exteneded GS-Store not yet supported (inst=0x%08x)\n", + kvm_err("Godson Extended GS-Store not yet supported (inst=0x%08x)\n", inst.word); break; } @@ -2103,7 +2103,7 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, vcpu->mmio_needed = 30; /* signed */ break; default: - kvm_err("Godson Exteneded GS-Load for float not yet supported (inst=0x%08x)\n", + kvm_err("Godson Extended GS-Load for float not yet supported (inst=0x%08x)\n", inst.word); break; } From 9ce372b33a2ebbd0b965148879ae169a0015d3f3 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 7 May 2020 16:36:02 +0200 Subject: [PATCH 005/127] KVM: x86: drop KVM_PV_REASON_PAGE_READY case from kvm_handle_page_fault() KVM guest code in Linux enables APF only when KVM_FEATURE_ASYNC_PF_INT is supported; this means we will never see KVM_PV_REASON_PAGE_READY when handling a page fault vmexit in KVM. While at it, make sure we only follow the genuine page fault path when the APF reason is zero. If we happen to see something else, this means that the underlying hypervisor is misbehaving. Leave a WARN_ONCE() to catch that. 
Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6d6a0ae7800c..3ca70554d5f1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4156,6 +4156,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { int r = 1; + u32 flags = vcpu->arch.apf.host_apf_flags; #ifndef CONFIG_X86_64 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ @@ -4164,28 +4165,22 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, #endif vcpu->arch.l1tf_flush_l1d = true; - switch (vcpu->arch.apf.host_apf_flags) { - default: + if (!flags) { trace_kvm_page_fault(fault_address, error_code); if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: + } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - vcpu->arch.apf.host_apf_flags = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; + } else { + WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); } + return r; } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); From e8c22266e68f0db2a7e11b0a9f29fd88ec0cfd4a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 15 Jun 2020 14:13:34 +0200 Subject: [PATCH 006/127] KVM: async_pf: change kvm_setup_async_pf()/kvm_arch_setup_async_pf() return type to bool Unlike normal 'int' functions returning '0' on success, kvm_setup_async_pf()/ kvm_arch_setup_async_pf() return '1' when a job to handle page fault asynchronously was scheduled and '0' otherwise. 
To avoid the confusion change return type to 'bool'. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20200615121334.91300-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 20 +++++++++----------- arch/x86/kvm/mmu/mmu.c | 4 ++-- include/linux/kvm_host.h | 4 ++-- virt/kvm/async_pf.c | 16 ++++++++++------ 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d47c19718615..7fd4fdb165fc 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3954,33 +3954,31 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) return true; } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) { hva_t hva; struct kvm_arch_async_pf arch; - int rc; if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) - return 0; + return false; if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) != vcpu->arch.pfault_compare) - return 0; + return false; if (psw_extint_disabled(vcpu)) - return 0; + return false; if (kvm_s390_vcpu_has_irq(vcpu, 0)) - return 0; + return false; if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) - return 0; + return false; if (!vcpu->arch.gmap->pfault_enabled) - return 0; + return false; hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr)); hva += current->thread.gmap_addr & ~PAGE_MASK; if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8)) - return 0; + return false; - rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); - return rc; + return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); } static int vcpu_pre_run(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3ca70554d5f1..a1850120ede0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4045,8 +4045,8 @@ 
static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) { struct kvm_arch_async_pf arch; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 62ec926c78a0..9edc6fc71a89 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -211,8 +211,8 @@ struct kvm_async_pf { void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - unsigned long hva, struct kvm_arch_async_pf *arch); +bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 45799606bb3e..390f758d5a27 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -156,17 +156,21 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) } } -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - unsigned long hva, struct kvm_arch_async_pf *arch) +/* + * Try to schedule a job to handle page fault asynchronously. Returns 'true' on + * success, 'false' on failure (page fault has to be handled synchronously). 
+ */ +bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch) { struct kvm_async_pf *work; if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) - return 0; + return false; /* Arch specific code should not do async PF in this case */ if (unlikely(kvm_is_error_hva(hva))) - return 0; + return false; /* * do alloc nowait since if we are going to sleep anyway we @@ -174,7 +178,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, */ work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN); if (!work) - return 0; + return false; work->wakeup_all = false; work->vcpu = vcpu; @@ -193,7 +197,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, schedule_work(&work->work); - return 1; + return true; } int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) From f25a9dec2da3b303e4c62cf9fa67e836866198b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jun 2020 14:58:30 -0700 Subject: [PATCH 007/127] KVM: x86/mmu: Drop kvm_arch_write_log_dirty() wrapper Drop kvm_arch_write_log_dirty() in favor of invoking .write_log_dirty() directly from FNAME(update_accessed_dirty_bits). "kvm_arch" is usually used for x86 functions that are invoked from generic KVM, and implies that there are external callers, neither of which is true. Remove the check for a non-NULL kvm_x86_ops hook as the call is wrapped in PTTYPE_EPT and is unconditionally set by VMX. 
Signed-off-by: Sean Christopherson Message-Id: <20200622215832.22090-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 1 - arch/x86/kvm/mmu/mmu.c | 15 --------------- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 444bb9c54548..81cafc937cfb 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -222,7 +222,6 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a1850120ede0..03ce2cad04f7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1738,21 +1738,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -/** - * kvm_arch_write_log_dirty - emulate dirty page logging - * @vcpu: Guest mode vcpu - * - * Emulate arch specific page modification logging for the - * nested hypervisor - */ -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa) -{ - if (kvm_x86_ops.write_log_dirty) - return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa); - - return 0; -} - bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index bd70ece1ef8b..6886be325e1d 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -260,7 +260,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == 
PTTYPE_EPT - if (kvm_arch_write_log_dirty(vcpu, addr)) + if (kvm_x86_ops.write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; From 2f1d48aae2961b68922a653587f359c245569ccb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jun 2020 14:58:31 -0700 Subject: [PATCH 008/127] KVM: nVMX: WARN if PML emulation helper is invoked outside of nested guest WARN if vmx_write_pml_buffer() is called outside of guest mode instead of silently ignoring the condition. The only caller is nested EPT's ept_update_accessed_dirty_bits(), which should only be reachable when L2 is active. Signed-off-by: Sean Christopherson Message-Id: <20200622215832.22090-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 45 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 13745f2a5ecd..2c9594898fbc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7485,33 +7485,34 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t dst; - if (is_guest_mode(vcpu)) { - WARN_ON_ONCE(vmx->nested.pml_full); + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) + return 0; - /* - * Check if PML is enabled for the nested guest. - * Whether eptp bit 6 is set is already checked - * as part of A/D emulation. - */ - vmcs12 = get_vmcs12(vcpu); - if (!nested_cpu_has_pml(vmcs12)) - return 0; + if (WARN_ON_ONCE(vmx->nested.pml_full)) + return 1; - if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { - vmx->nested.pml_full = true; - return 1; - } + /* + * Check if PML is enabled for the nested guest. Whether eptp bit 6 is + * set is already checked as part of A/D emulation. 
+ */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; - gpa &= ~0xFFFull; - dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; - - if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, - offset_in_page(dst), sizeof(gpa))) - return 0; - - vmcs12->guest_pml_index--; + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; } + gpa &= ~0xFFFull; + dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; + + if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, + offset_in_page(dst), sizeof(gpa))) + return 0; + + vmcs12->guest_pml_index--; + return 0; } From 02f5fb2e69f653ef09490d081ef65296a0cbf114 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jun 2020 14:58:32 -0700 Subject: [PATCH 009/127] KVM: x86/mmu: Make .write_log_dirty a nested operation Move .write_log_dirty() into kvm_x86_nested_ops to help differentiate it from the non-nested dirty log hooks. And because it's a nested-only operation. 
Signed-off-by: Sean Christopherson Message-Id: <20200622215832.22090-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- arch/x86/kvm/vmx/nested.c | 38 +++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 38 --------------------------------- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index be5363b21540..89e1a66ebee9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1220,7 +1220,6 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); - int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; @@ -1281,6 +1280,7 @@ struct kvm_x86_nested_ops { struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 6886be325e1d..7e370d8bd576 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -260,7 +260,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == PTTYPE_EPT - if (kvm_x86_ops.write_log_dirty(vcpu, addr)) + if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b26655104d4a..aeaac9febca4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3205,6 +3205,43 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu) return true; } +static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t dst; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) + return 0; + + if (WARN_ON_ONCE(vmx->nested.pml_full)) + return 1; + + /* + * Check if PML is enabled for the nested guest. Whether eptp bit 6 is + * set is already checked as part of A/D emulation. + */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; + } + + gpa &= ~0xFFFull; + dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; + + if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, + offset_in_page(dst), sizeof(gpa))) + return 0; + + vmcs12->guest_pml_index--; + + return 0; +} + /* * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are @@ -6503,6 +6540,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, .get_vmcs12_pages = nested_get_vmcs12_pages, + .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2c9594898fbc..8411118e51a2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7479,43 +7479,6 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } -static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - struct vmcs12 *vmcs12; - struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t dst; - - if (WARN_ON_ONCE(!is_guest_mode(vcpu))) - return 0; - - if (WARN_ON_ONCE(vmx->nested.pml_full)) - return 1; - - /* - * Check if PML is enabled for the nested guest. Whether eptp bit 6 is - * set is already checked as part of A/D emulation. 
- */ - vmcs12 = get_vmcs12(vcpu); - if (!nested_cpu_has_pml(vmcs12)) - return 0; - - if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { - vmx->nested.pml_full = true; - return 1; - } - - gpa &= ~0xFFFull; - dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; - - if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, - offset_in_page(dst), sizeof(gpa))) - return 0; - - vmcs12->guest_pml_index--; - - return 0; -} - static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t offset, unsigned long mask) @@ -7944,7 +7907,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, - .write_log_dirty = vmx_write_pml_buffer, .pre_block = vmx_pre_block, .post_block = vmx_post_block, From 6abe9c1386e5c86f360e4e8fde8eec95eee77aa3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 22 Jun 2020 18:04:41 -0400 Subject: [PATCH 010/127] KVM: X86: Move ignore_msrs handling upper the stack MSR accesses can be one of: (1) KVM internal access, (2) userspace access (e.g., via KVM_SET_MSRS ioctl), (3) guest access. The ignore_msrs was previously handled by kvm_get_msr_common() and kvm_set_msr_common(), which is the bottom of the msr access stack. This works in most cases; however, it can dump unwanted warning messages to dmesg even when KVM gets/sets the MSRs internally by calling __kvm_set_msr() or __kvm_get_msr() (e.g. kvm_cpuid()). Ideally we only want to trap cases (2) or (3), but not (1) above. To achieve this, move the ignore_msrs handling up the stack to the callers of __kvm_get_msr() and __kvm_set_msr(). To identify the "msr missing" event, a new return value (KVM_MSR_RET_INVALID==2) is used. 
Signed-off-by: Peter Xu Message-Id: <20200622220442.21998-2-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 80 +++++++++++++++++++++++++++++++--------------- arch/x86/kvm/x86.h | 2 ++ 2 files changed, 56 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88c593f83b28..e7c1567b9501 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -243,6 +243,29 @@ static struct kmem_cache *x86_fpu_cache; static struct kmem_cache *x86_emulator_cache; +/* + * When called, it means the previous get/set msr reached an invalid msr. + * Return 0 if we want to ignore/silent this failed msr access, or 1 if we want + * to fail the caller. + */ +static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, + u64 data, bool write) +{ + const char *op = write ? "wrmsr" : "rdmsr"; + + if (ignore_msrs) { + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n", + op, msr, data); + /* Mask the error */ + return 0; + } else { + vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n", + op, msr, data); + return 1; + } +} + static struct kmem_cache *kvm_alloc_emulator_cache(void) { unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); @@ -1516,6 +1539,17 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, return kvm_x86_ops.set_msr(vcpu, &msr); } +static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 data, bool host_initiated) +{ + int ret = __kvm_set_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) + ret = kvm_msr_ignored_check(vcpu, index, data, true); + + return ret; +} + /* * Read the MSR specified by @index into @data. Select MSR specific fault * checks are bypassed if @host_initiated is %true. 
@@ -1537,15 +1571,29 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return ret; } +static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 *data, bool host_initiated) +{ + int ret = __kvm_get_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) { + /* Unconditionally clear *data for simplicity */ + *data = 0; + ret = kvm_msr_ignored_check(vcpu, index, 0, false); + } + + return ret; +} + int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - return __kvm_get_msr(vcpu, index, data, false); + return kvm_get_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_get_msr); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) { - return __kvm_set_msr(vcpu, index, data, false); + return kvm_set_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_set_msr); @@ -1665,12 +1713,12 @@ EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); */ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return __kvm_get_msr(vcpu, index, data, true); + return kvm_get_msr_ignored_check(vcpu, index, data, true); } static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return __kvm_set_msr(vcpu, index, *data, true); + return kvm_set_msr_ignored_check(vcpu, index, *data, true); } #ifdef CONFIG_X86_64 @@ -3066,17 +3114,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return xen_hvm_config(vcpu, data); if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (!ignore_msrs) { - vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); - return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, - "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); - break; - } + return KVM_MSR_RET_INVALID; } return 0; } @@ -3331,17 +3369,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return 
kvm_pmu_get_msr(vcpu, msr_info); - if (!ignore_msrs) { - vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", - msr_info->index); - return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", - msr_info->index); - msr_info->data = 0; - } - break; + return KVM_MSR_RET_INVALID; } return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6eb62e97e59f..8d42dd0cf81e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -366,4 +366,6 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); +#define KVM_MSR_RET_INVALID 2 + #endif From 12bc2132b15e0a969b3f455d90a5f215ef239eff Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 22 Jun 2020 18:04:42 -0400 Subject: [PATCH 011/127] KVM: X86: Do the same ignore_msrs check for feature msrs Logically the ignore_msrs and report_ignored_msrs should also apply to feature MSRs. Add them in. Signed-off-by: Peter Xu Message-Id: <20200622220442.21998-3-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 10 ++++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c0da4dd78ac5..70a824806274 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2359,7 +2359,7 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; break; default: - return 1; + return KVM_MSR_RET_INVALID; } return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8411118e51a2..608e992e8db9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1815,7 +1815,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) msr->data = vmx_get_perf_capabilities(); return 0; default: - return 1; + return KVM_MSR_RET_INVALID; } } diff --git a/arch/x86/kvm/x86.c 
b/arch/x86/kvm/x86.c index e7c1567b9501..ec9aba133c1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1412,8 +1412,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - if (kvm_x86_ops.get_msr_feature(msr)) - return 1; + return kvm_x86_ops.get_msr_feature(msr); } return 0; } @@ -1425,6 +1424,13 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) msr.index = index; r = kvm_get_msr_feature(&msr); + + if (r == KVM_MSR_RET_INVALID) { + /* Unconditionally clear the output for simplicity */ + *data = 0; + r = kvm_msr_ignored_check(vcpu, index, 0, false); + } + if (r) return r; From f5f6145e41d39c7fd04a17c3b2596c7abe933f10 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Fri, 22 May 2020 18:19:51 -0400 Subject: [PATCH 012/127] KVM: x86: Move the check for upper 32 reserved bits of DR6 to separate function Signed-off-by: Krish Sadhukhan Message-Id: <20200522221954.32131-2-krish.sadhukhan@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec9aba133c1c..82f457f0e7e0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1133,7 +1133,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 4: /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) + if (!kvm_dr6_valid(val)) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); break; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8d42dd0cf81e..31928bf18ba5 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -360,6 +360,11 @@ static inline bool kvm_dr7_valid(u64 data) /* Bits [63:32] are reserved */ return !(data >> 32); } +static inline bool kvm_dr6_valid(u64 data) +{ + /* Bits [63:32] are reserved */ + return !(data >> 32); +} void kvm_load_guest_xsave_state(struct kvm_vcpu 
*vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); From 1aef8161b38a531895a8bffad0e9fb1445ca91f7 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Fri, 22 May 2020 18:19:52 -0400 Subject: [PATCH 013/127] KVM: nSVM: Check that DR6[63:32] and DR7[63:32] are not set on vmrun of nested guests According to section "Canonicalization and Consistency Checks" in APM vol. 2 the following guest state is illegal: "DR6[63:32] are not zero." "DR7[63:32] are not zero." "Any MBZ bit of EFER is set." Signed-off-by: Krish Sadhukhan Message-Id: <20200522221954.32131-3-krish.sadhukhan@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 6bceafb19108..e4ef980981af 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -231,6 +231,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) (vmcb->save.cr0 & X86_CR0_NW)) return false; + if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7)) + return false; + return nested_vmcb_check_controls(&vmcb->control); } From 78824fabc72e5e37d51e6e567fde70a4fc41a6d7 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 25 May 2020 23:22:06 -0700 Subject: [PATCH 014/127] KVM: SVM: fix sev_pin_memory()'s use of get_user_pages_fast() There are two problems in sev_pin_memory(): 1) The return value of get_user_pages_fast() is stored in an unsigned long, although the declared return value is of type int. This will not cause any symptoms, but it is misleading. Fix this by changing the type of npinned to "int". 2) The number of pages passed into get_user_pages_fast() is stored in an unsigned long, even though get_user_pages_fast() accepts an int. This means that it is possible to silently overflow the number of pages. Fix this by adding a WARN_ON_ONCE() and an early error return. The npages variable is left as an unsigned long for convenience in checking for overflow.
Fixes: 89c505809052 ("KVM: SVM: Add support for KVM_SEV_LAUNCH_UPDATE_DATA command") Cc: Ingo Molnar Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Cc: H. Peter Anvin Cc: x86@kernel.org Cc: kvm@vger.kernel.org Signed-off-by: John Hubbard Message-Id: <20200526062207.1360225-2-jhubbard@nvidia.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5573a97f1520..ceeee4bb6150 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -313,7 +313,8 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, int write) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unsigned long npages, npinned, size; + unsigned long npages, size; + int npinned; unsigned long locked, lock_limit; struct page **pages; unsigned long first, last; @@ -333,6 +334,9 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; } + if (WARN_ON_ONCE(npages > INT_MAX)) + return NULL; + /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) From dc42c8ae0a7762378102dd043779d19331804cce Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 25 May 2020 23:22:07 -0700 Subject: [PATCH 015/127] KVM: SVM: convert get_user_pages() --> pin_user_pages() This code was using get_user_pages*(), in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. 
[1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Cc: Ingo Molnar Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Cc: H. Peter Anvin Cc: x86@kernel.org Cc: kvm@vger.kernel.org Signed-off-by: John Hubbard Message-Id: <20200526062207.1360225-3-jhubbard@nvidia.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ceeee4bb6150..a893624b9275 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -348,7 +348,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); goto err; @@ -361,7 +361,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, err: if (npinned > 0) - release_pages(pages, npinned); + unpin_user_pages(pages, npinned); kvfree(pages); return NULL; @@ -372,7 +372,7 @@ static void sev_unpin_memory(struct kvm *kvm, struct page **pages, { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - release_pages(pages, npages); + unpin_user_pages(pages, npages); kvfree(pages); sev->pages_locked -= npages; } From a8d908b5873cad212b0f74569f5a23b804e694ce Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Jun 2020 05:12:24 -0400 Subject: [PATCH 016/127] KVM: x86: report sev_pin_memory errors with PTR_ERR Callers of sev_pin_memory() treat NULL differently: sev_launch_secret()/svm_register_enc_region() return -ENOMEM sev_dbg_crypt() returns -EFAULT. 
Switching to ERR_PTR() preserves the error and enables cleaner reporting of different kinds of failures. Suggested-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a893624b9275..2b4916ffa906 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -320,7 +320,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long first, last; if (ulen == 0 || uaddr + ulen < uaddr) - return NULL; + return ERR_PTR(-EINVAL); /* Calculate number of pages. */ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT; @@ -331,11 +331,11 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit); - return NULL; + return ERR_PTR(-ENOMEM); } if (WARN_ON_ONCE(npages > INT_MAX)) - return NULL; + return ERR_PTR(-EINVAL); /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); @@ -345,7 +345,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, pages = kmalloc(size, GFP_KERNEL_ACCOUNT); if (!pages) - return NULL; + return ERR_PTR(-ENOMEM); /* Pin the user virtual address. */ npinned = pin_user_pages_fast(uaddr, npages, write ? 
FOLL_WRITE : 0, pages); @@ -360,11 +360,13 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return pages; err: - if (npinned > 0) + if (npinned > 0) { unpin_user_pages(pages, npinned); + npinned = -ENOMEM; + } kvfree(pages); - return NULL; + return ERR_PTR(npinned); } static void sev_unpin_memory(struct kvm *kvm, struct page **pages, @@ -864,8 +866,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EFAULT; pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); - if (!pages) - return -ENOMEM; + if (IS_ERR(pages)) + return PTR_ERR(pages); /* * The secret must be copied into contiguous memory region, lets verify @@ -991,8 +993,8 @@ int svm_register_enc_region(struct kvm *kvm, return -ENOMEM; region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1); - if (!region->pages) { - ret = -ENOMEM; + if (IS_ERR(region->pages)) { + ret = PTR_ERR(region->pages); goto e_free; } From 73cd6e5f7f0b8c07330c48bfec4aaa84a6921cf4 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jun 2020 16:56:18 -0700 Subject: [PATCH 017/127] kvm: svm: Prefer vcpu->cpu to raw_smp_processor_id() The current logical processor id is cached in vcpu->cpu. Use it instead of raw_smp_processor_id() when a kvm_vcpu struct is available.
Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Message-Id: <20200603235623.245638-2-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 70a824806274..fd4a9188902c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2992,21 +2992,18 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) static void reload_tss(struct kvm_vcpu *vcpu) { - int cpu = raw_smp_processor_id(); + struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); sd->tss_desc->type = 9; /* available 32/64-bit TSS */ load_TR_desc(); } static void pre_svm_run(struct vcpu_svm *svm) { - int cpu = raw_smp_processor_id(); - - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu); if (sev_guest(svm->vcpu.kvm)) - return pre_sev_run(svm, cpu); + return pre_sev_run(svm, svm->vcpu.cpu); /* FIXME: handle wraparound of asid_generation */ if (svm->asid_generation != sd->asid_generation) From 242636343c246e338b8ea317e32dbf4ed47edc65 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jun 2020 16:56:19 -0700 Subject: [PATCH 018/127] kvm: svm: Always set svm->last_cpu on VMRUN Previously, this field was only set when using SEV. Set it for all vCPU configurations, so that it can be communicated to userspace for diagnosing potential hardware errors. 
Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20200603235623.245638-3-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 1 - arch/x86/kvm/svm/svm.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 2b4916ffa906..a8444c74430e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1189,7 +1189,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) svm->last_cpu == cpu) return; - svm->last_cpu = cpu; sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; mark_dirty(svm->vmcb, VMCB_ASID); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index fd4a9188902c..24b7f321874f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3396,6 +3396,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + svm->last_cpu = vcpu->cpu; __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); #ifdef CONFIG_X86_64 From 80a1684c0161088710398d84a7cdd683c5d88228 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jun 2020 16:56:20 -0700 Subject: [PATCH 019/127] kvm: vmx: Add last_cpu to struct vcpu_vmx As we already do in svm, record the last logical processor on which a vCPU has run, so that it can be communicated to userspace for potential hardware errors. 
Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20200603235623.245638-4-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/vmx/vmx.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 608e992e8db9..4d8f12c0a5c6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6734,6 +6734,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.cr2 != read_cr2()) write_cr2(vcpu->arch.cr2); + vmx->last_cpu = vcpu->cpu; vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, vmx->loaded_vmcs->launched); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 639798e4a6ca..f8f9e214d285 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -300,6 +300,9 @@ struct vcpu_vmx { u64 ept_pointer; struct pt_desc pt_desc; + + /* which host CPU was used for running this vcpu */ + unsigned int last_cpu; }; enum ept_pointers_status { From 1aa561b1a4c0ae2a9a9b9c21a84b5ca66b4775d8 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jun 2020 16:56:21 -0700 Subject: [PATCH 020/127] kvm: x86: Add "last CPU" to some KVM_EXIT information More often than not, a failed VM-entry in an x86 production environment is induced by a defective CPU. To help identify the bad hardware, include the id of the last logical CPU to run a vCPU in the information provided to userspace on a KVM exit for failed VM-entry or for KVM internal errors not associated with emulation. The presence of this additional information is indicated by a new capability, KVM_CAP_LAST_CPU. 
Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20200603235623.245638-5-jmattson@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 1 + arch/x86/kvm/svm/svm.c | 4 +++- arch/x86/kvm/vmx/vmx.c | 10 ++++++++-- arch/x86/kvm/x86.c | 1 + include/uapi/linux/kvm.h | 2 ++ 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 426f94582b7a..1cfe79b932d6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4794,6 +4794,7 @@ hardware_exit_reason. /* KVM_EXIT_FAIL_ENTRY */ struct { __u64 hardware_entry_failure_reason; + __u32 cpu; /* if KVM_CAP_LAST_CPU */ } fail_entry; If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 24b7f321874f..8ecd46f2cb1e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2947,6 +2947,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; + kvm_run->fail_entry.cpu = svm->last_cpu; dump_vmcb(vcpu); return 0; } @@ -2970,8 +2971,9 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; + vcpu->run->internal.data[1] = svm->last_cpu; return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4d8f12c0a5c6..b52bcebfa094 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4781,10 +4781,11 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 3; + vcpu->run->internal.ndata = 4; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; vcpu->run->internal.data[2] = error_code; + vcpu->run->internal.data[3] = vmx->last_cpu; return 0; } @@ -6006,6 +6007,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; + vcpu->run->fail_entry.cpu = vmx->last_cpu; return 0; } @@ -6014,6 +6016,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + vcpu->run->fail_entry.cpu = vmx->last_cpu; return 0; } @@ -6040,6 +6043,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->internal.data[3] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); } + vcpu->run->internal.data[vcpu->run->internal.ndata++] = + vmx->last_cpu; return 0; } @@ -6095,8 +6100,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[1] = vmx->last_cpu; return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 82f457f0e7e0..1a0fad1018f9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3510,6 +3510,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_LAST_CPU: r = 1; break; case KVM_CAP_SYNC_REGS: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4fdf30316582..ff9b335620d0 100644 
--- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -289,6 +289,7 @@ struct kvm_run { /* KVM_EXIT_FAIL_ENTRY */ struct { __u64 hardware_entry_failure_reason; + __u32 cpu; } fail_entry; /* KVM_EXIT_EXCEPTION */ struct { @@ -1031,6 +1032,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 +#define KVM_CAP_LAST_CPU 184 #ifdef KVM_CAP_IRQ_ROUTING From 8a14fe4f0c54f27c89389d13c4a1e467a88c35ea Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jun 2020 16:56:22 -0700 Subject: [PATCH 021/127] kvm: x86: Move last_cpu into kvm_vcpu_arch as last_vmentry_cpu Both the vcpu_vmx structure and the vcpu_svm structure have a 'last_cpu' field. Move the common field into the kvm_vcpu_arch structure. For clarity, rename it to 'last_vmentry_cpu.' Suggested-by: Sean Christopherson Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20200603235623.245638-6-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.c | 6 +++--- arch/x86/kvm/svm/svm.h | 3 --- arch/x86/kvm/vmx/vmx.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.h | 3 --- 6 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 89e1a66ebee9..2357763bf7f2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -827,6 +827,9 @@ struct kvm_vcpu_arch { /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ bool l1tf_flush_l1d; + /* Host CPU on which VM-entry was most recently attempted */ + unsigned int last_vmentry_cpu; + /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a8444c74430e..e09aef93e92b 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1186,7 +1186,7 @@ void pre_sev_run(struct vcpu_svm *svm, 
int cpu) * 2) or this VMCB was executed on different host CPU in previous VMRUNs. */ if (sd->sev_vmcbs[asid] == svm->vmcb && - svm->last_cpu == cpu) + svm->vcpu.arch.last_vmentry_cpu == cpu) return; sd->sev_vmcbs[asid] = svm->vmcb; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8ecd46f2cb1e..c55ebf76ec6d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2947,7 +2947,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; - kvm_run->fail_entry.cpu = svm->last_cpu; + kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; dump_vmcb(vcpu); return 0; } @@ -2973,7 +2973,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; - vcpu->run->internal.data[1] = svm->last_cpu; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } @@ -3398,7 +3398,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - svm->last_cpu = vcpu->cpu; + vcpu->arch.last_vmentry_cpu = vcpu->cpu; __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6ac4c00a5d82..613356f85da6 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -158,9 +158,6 @@ struct vcpu_svm { */ struct list_head ir_list; spinlock_t ir_list_lock; - - /* which host CPU was used for running this vcpu */ - unsigned int last_cpu; }; struct svm_cpu_data { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b52bcebfa094..d9ee31b0679b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4785,7 +4785,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) 
vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; vcpu->run->internal.data[2] = error_code; - vcpu->run->internal.data[3] = vmx->last_cpu; + vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6007,7 +6007,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; - vcpu->run->fail_entry.cpu = vmx->last_cpu; + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6016,7 +6016,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); - vcpu->run->fail_entry.cpu = vmx->last_cpu; + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6044,7 +6044,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vmcs_read64(GUEST_PHYSICAL_ADDRESS); } vcpu->run->internal.data[vcpu->run->internal.ndata++] = - vmx->last_cpu; + vcpu->arch.last_vmentry_cpu; return 0; } @@ -6102,7 +6102,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_reason; - vcpu->run->internal.data[1] = vmx->last_cpu; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6740,7 +6740,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.cr2 != read_cr2()) write_cr2(vcpu->arch.cr2); - vmx->last_cpu = vcpu->cpu; + vcpu->arch.last_vmentry_cpu = vcpu->cpu; vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, vmx->loaded_vmcs->launched); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f8f9e214d285..639798e4a6ca 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -300,9 +300,6 @@ struct 
vcpu_vmx { u64 ept_pointer; struct pt_desc pt_desc; - - /* which host CPU was used for running this vcpu */ - unsigned int last_cpu; }; enum ept_pointers_status { From c967118ddb21191178c0e0080fdc41f5d85ca1d1 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jun 2020 16:56:23 -0700 Subject: [PATCH 022/127] kvm: x86: Set last_vmentry_cpu in vcpu_enter_guest Since this field is now in kvm_vcpu_arch, clean things up a little by setting it in vendor-agnostic code: vcpu_enter_guest. Note that it must be set after the call to kvm_x86_ops.run(), since it can't be updated before pre_sev_run(). Suggested-by: Sean Christopherson Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20200603235623.245638-7-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 1 - arch/x86/kvm/vmx/vmx.c | 1 - arch/x86/kvm/x86.c | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c55ebf76ec6d..38104f47cd25 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3398,7 +3398,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - vcpu->arch.last_vmentry_cpu = vcpu->cpu; __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d9ee31b0679b..1de5dac952b6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6740,7 +6740,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.cr2 != read_cr2()) write_cr2(vcpu->arch.cr2); - vcpu->arch.last_vmentry_cpu = vcpu->cpu; vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, vmx->loaded_vmcs->launched); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1a0fad1018f9..bd8690ca7b6b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8583,6 +8583,7 @@ static int 
vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); + vcpu->arch.last_vmentry_cpu = vcpu->cpu; vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; From b2656e4d8b29a25ea26ae92d14694904274e63b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 8 Jun 2020 18:56:07 -0700 Subject: [PATCH 023/127] KVM: nVMX: Wrap VM-Fail valid path in generic VM-Fail helper Add nested_vmx_fail() to wrap VM-Fail paths that _may_ result in VM-Fail Valid to make it clear at the call sites that the Valid flavor isn't guaranteed. Suggested-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Message-Id: <20200609015607.6994-1-sean.j.christopherson@intel.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 77 ++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index aeaac9febca4..7693d41a2446 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -171,15 +171,6 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) static int nested_vmx_failValid(struct kvm_vcpu *vcpu, u32 vm_instruction_error) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * failValid writes the error number to the current VMCS, which - * can't be done if there isn't a current VMCS. 
- */ - if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) - return nested_vmx_failInvalid(vcpu); - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF | X86_EFLAGS_OF)) @@ -192,6 +183,20 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu, return kvm_skip_emulated_instruction(vcpu); } +static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * failValid writes the error number to the current VMCS, which + * can't be done if there isn't a current VMCS. + */ + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + return nested_vmx_failInvalid(vcpu); + + return nested_vmx_failValid(vcpu, vm_instruction_error); +} + static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ @@ -3493,19 +3498,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * when using the merged vmcs02. */ if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); if (vmcs12->launch_state == launch) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); if (nested_vmx_check_controls(vcpu, vmcs12)) - return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); if (nested_vmx_check_host_state(vcpu, vmcs12)) - return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); /* * We're finally done with prerequisite checking, and can start with @@ -3554,7 +3558,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (status == NVMX_VMENTRY_VMEXIT) return 1; WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); - return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* @@ -4497,7 +4501,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, * flag and the VM-instruction error field of the VMCS * accordingly, and skip the emulated instruction. */ - (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); /* * Restore L1's host state to KVM's software model. 
We're here @@ -4797,8 +4801,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) } if (vmx->nested.vmxon) - return nested_vmx_failValid(vcpu, - VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { @@ -4889,12 +4892,10 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return r; if (!page_address_valid(vcpu, vmptr)) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_INVALID_ADDRESS); + return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); + return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); /* * When Enlightened VMEntry is enabled on the calling CPU we treat @@ -4964,8 +4965,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) offset = vmcs_field_to_offset(field); if (offset < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); @@ -5068,8 +5068,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) offset = vmcs_field_to_offset(field); if (offset < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* * If the vCPU supports "VMWRITE to any supported field in the @@ -5077,8 +5076,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ if (vmcs_field_readonly(field) && !nested_cpu_has_vmwrite_any_field(vcpu)) - return nested_vmx_failValid(vcpu, - VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); + return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); /* * Ensure vmcs12 is up-to-date before any VMWRITE that dirties @@ -5153,12 +5151,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return r; if 
(!page_address_valid(vcpu, vmptr)) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INVALID_ADDRESS); + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_VMXON_POINTER); + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ if (vmx->nested.hv_evmcs) @@ -5175,7 +5171,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * given physical address won't match the required * VMCS12_REVISION identifier. */ - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5185,7 +5181,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) (new_vmcs12->hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kvm_vcpu_unmap(vcpu, &map, false); - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5270,8 +5266,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) @@ -5292,7 +5287,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_CONTEXT: if (!nested_vmx_check_eptp(vcpu, operand.eptp)) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); roots_to_free = 0; @@ -5352,7 +5347,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 
/* according to the intel vmx instruction reference, the memory @@ -5366,7 +5361,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return vmx_handle_memory_failure(vcpu, r, &e); if (operand.vpid >> 16) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid02 = nested_get_vpid02(vcpu); @@ -5374,14 +5369,14 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: if (!operand.vpid || is_noncanonical_address(operand.gla, vcpu)) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_vcpu_addr(vpid02, operand.gla); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: if (!operand.vpid) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_context(vpid02); break; From 7693b3eb537947ecffe302732f98e6f561befb70 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 25 Jun 2020 10:03:22 +0200 Subject: [PATCH 024/127] KVM: SVM: Rename struct nested_state to svm_nested_state Renaming is only needed in the svm.h header file. No functional changes. 
Signed-off-by: Joerg Roedel Message-Id: <20200625080325.28439-2-joro@8bytes.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 613356f85da6..e4251c5f80eb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -81,7 +81,7 @@ struct kvm_svm { struct kvm_vcpu; -struct nested_state { +struct svm_nested_state { struct vmcb *hsave; u64 hsave_msr; u64 vm_cr_msr; @@ -133,7 +133,7 @@ struct vcpu_svm { ulong nmi_iret_rip; - struct nested_state nested; + struct svm_nested_state nested; bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; From 06e7852c0ffb30bb7cac1686db2f5d6458039b44 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 25 Jun 2020 10:03:23 +0200 Subject: [PATCH 025/127] KVM: SVM: Add vmcb_ prefix to mark_*() functions Make it more clear what data structure these functions operate on. No functional changes. Signed-off-by: Joerg Roedel Message-Id: <20200625080325.28439-3-joro@8bytes.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 2 +- arch/x86/kvm/svm/nested.c | 6 +++--- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.c | 44 +++++++++++++++++++-------------------- arch/x86/kvm/svm/svm.h | 8 +++---- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index e80daa98682f..ac830cd50830 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -665,7 +665,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } else { vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; } - mark_dirty(vmcb, VMCB_AVIC); + vmcb_mark_dirty(vmcb, VMCB_AVIC); svm_set_pi_irte_mode(vcpu, activated); } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e4ef980981af..426a7ec2525f 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -106,7 +106,7 @@ void recalc_intercepts(struct vcpu_svm *svm) { struct 
vmcb_control_area *c, *h, *g; - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); if (!is_guest_mode(&svm->vcpu)) return; @@ -378,7 +378,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) */ recalc_intercepts(svm); - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); } void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, @@ -601,7 +601,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code, nested_vmcb->control.exit_info_1, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e09aef93e92b..f7f1f4ecf08e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1191,5 +1191,5 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; - mark_dirty(svm->vmcb, VMCB_ASID); + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 38104f47cd25..9eaa3247dcbe 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -282,7 +282,7 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) } svm->vmcb->save.efer = efer | EFER_SVME; - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); } static int is_external_interrupt(u32 info) @@ -713,7 +713,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) pause_filter_count_max); if (control->pause_filter_count != old) { - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); trace_kvm_ple_window_update(vcpu->vcpu_id, control->pause_filter_count, old); } @@ -731,7 +731,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) pause_filter_count_shrink, pause_filter_count); if (control->pause_filter_count != old) { - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + 
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); trace_kvm_ple_window_update(vcpu->vcpu_id, control->pause_filter_count, old); } @@ -966,7 +966,7 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) svm->vmcb->control.tsc_offset = offset + g_tsc_offset; - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); return svm->vmcb->control.tsc_offset; } @@ -1123,7 +1123,7 @@ static void init_vmcb(struct vcpu_svm *svm) clr_exception_intercept(svm, UD_VECTOR); } - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -1257,7 +1257,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { svm->asid_generation = 0; - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); } #ifdef CONFIG_X86_64 @@ -1367,7 +1367,7 @@ static void svm_set_vintr(struct vcpu_svm *svm) control->int_ctl &= ~V_INTR_PRIO_MASK; control->int_ctl |= V_IRQ_MASK | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); - mark_dirty(svm->vmcb, VMCB_INTR); + vmcb_mark_dirty(svm->vmcb, VMCB_INTR); } static void svm_clear_vintr(struct vcpu_svm *svm) @@ -1385,7 +1385,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; } - mark_dirty(svm->vmcb, VMCB_INTR); + vmcb_mark_dirty(svm->vmcb, VMCB_INTR); } static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) @@ -1503,7 +1503,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) svm->vmcb->save.idtr.limit = dt->size; svm->vmcb->save.idtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); + vmcb_mark_dirty(svm->vmcb, VMCB_DT); } static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) @@ -1520,7 +1520,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) svm->vmcb->save.gdtr.limit = dt->size; svm->vmcb->save.gdtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); + vmcb_mark_dirty(svm->vmcb, VMCB_DT); } static void 
update_cr0_intercept(struct vcpu_svm *svm) @@ -1531,7 +1531,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm) *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) | (gcr0 & SVM_CR0_SELECTIVE_MASK); - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); if (gcr0 == *hcr0) { clr_cr_intercept(svm, INTERCEPT_CR0_READ); @@ -1572,7 +1572,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); } @@ -1592,7 +1592,7 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) cr4 |= X86_CR4_PAE; cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); + vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); return 0; } @@ -1624,7 +1624,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, /* This is symmetric with svm_get_segment() */ svm->vmcb->save.cpl = (var->dpl & 3); - mark_dirty(svm->vmcb, VMCB_SEG); + vmcb_mark_dirty(svm->vmcb, VMCB_SEG); } static void update_bp_intercept(struct kvm_vcpu *vcpu) @@ -1651,7 +1651,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid_generation = sd->asid_generation; svm->vmcb->control.asid = sd->next_asid++; - mark_dirty(svm->vmcb, VMCB_ASID); + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) @@ -1660,7 +1660,7 @@ static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) if (unlikely(value != vmcb->save.dr6)) { vmcb->save.dr6 = value; - mark_dirty(vmcb, VMCB_DR); + vmcb_mark_dirty(vmcb, VMCB_DR); } } @@ -1687,7 +1687,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.dr7 = value; - mark_dirty(svm->vmcb, VMCB_DR); + vmcb_mark_dirty(svm->vmcb, VMCB_DR); } static int 
pf_interception(struct vcpu_svm *svm) @@ -2512,7 +2512,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) return 1; vcpu->arch.pat = data; svm->vmcb->save.g_pat = data; - mark_dirty(svm->vmcb, VMCB_NPT); + vmcb_mark_dirty(svm->vmcb, VMCB_NPT); break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && @@ -2617,7 +2617,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) return 1; svm->vmcb->save.dbgctl = data; - mark_dirty(svm->vmcb, VMCB_LBR); + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) svm_enable_lbrv(svm); else @@ -3476,7 +3476,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); - mark_all_clean(svm->vmcb); + vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; } @@ -3488,7 +3488,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) cr3 = __sme_set(root); if (npt_enabled) { svm->vmcb->control.nested_cr3 = cr3; - mark_dirty(svm->vmcb, VMCB_NPT); + vmcb_mark_dirty(svm->vmcb, VMCB_NPT); /* Loading L2's CR3 is handled by enter_svm_guest_mode. 
*/ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) @@ -3497,7 +3497,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) } svm->vmcb->save.cr3 = cr3; - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); } static int is_disabled(void) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e4251c5f80eb..0b1c10f79762 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -185,18 +185,18 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) return container_of(kvm, struct kvm_svm, kvm); } -static inline void mark_all_dirty(struct vmcb *vmcb) +static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; } -static inline void mark_all_clean(struct vmcb *vmcb) +static inline void vmcb_mark_all_clean(struct vmcb *vmcb) { vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) & ~VMCB_ALWAYS_DIRTY_MASK; } -static inline void mark_dirty(struct vmcb *vmcb, int bit) +static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) { vmcb->control.clean &= ~(1 << bit); } @@ -417,7 +417,7 @@ extern int avic; static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; - mark_dirty(svm->vmcb, VMCB_AVIC); + vmcb_mark_dirty(svm->vmcb, VMCB_AVIC); } static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) From a284ba56a0a4b5a84733a19934196c19277b1b07 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 25 Jun 2020 10:03:24 +0200 Subject: [PATCH 026/127] KVM: SVM: Add svm_ prefix to set/clr/is_intercept() Make clear the symbols belong to the SVM code when they are built-in. No functional changes. 
Signed-off-by: Joerg Roedel Message-Id: <20200625080325.28439-4-joro@8bytes.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.c | 88 +++++++++++++++++++-------------------- arch/x86/kvm/svm/svm.h | 6 +-- 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 426a7ec2525f..385461496cf5 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -261,7 +261,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) /* Only a few fields of int_ctl are written by the processor. */ mask = V_IRQ_MASK | V_TPR_MASK; if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) && - is_intercept(svm, INTERCEPT_VINTR)) { + svm_is_intercept(svm, INTERCEPT_VINTR)) { /* * In order to request an interrupt window, L0 is usurping * svm->vmcb->control.int_ctl and possibly setting V_IRQ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9eaa3247dcbe..b934d51f7912 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1002,38 +1002,38 @@ static void init_vmcb(struct vcpu_svm *svm) if (enable_vmware_backdoor) set_exception_intercept(svm, GP_VECTOR); - set_intercept(svm, INTERCEPT_INTR); - set_intercept(svm, INTERCEPT_NMI); - set_intercept(svm, INTERCEPT_SMI); - set_intercept(svm, INTERCEPT_SELECTIVE_CR0); - set_intercept(svm, INTERCEPT_RDPMC); - set_intercept(svm, INTERCEPT_CPUID); - set_intercept(svm, INTERCEPT_INVD); - set_intercept(svm, INTERCEPT_INVLPG); - set_intercept(svm, INTERCEPT_INVLPGA); - set_intercept(svm, INTERCEPT_IOIO_PROT); - set_intercept(svm, INTERCEPT_MSR_PROT); - set_intercept(svm, INTERCEPT_TASK_SWITCH); - set_intercept(svm, INTERCEPT_SHUTDOWN); - set_intercept(svm, INTERCEPT_VMRUN); - set_intercept(svm, INTERCEPT_VMMCALL); - set_intercept(svm, INTERCEPT_VMLOAD); - set_intercept(svm, INTERCEPT_VMSAVE); - set_intercept(svm, INTERCEPT_STGI); - set_intercept(svm, INTERCEPT_CLGI); - set_intercept(svm, INTERCEPT_SKINIT); - 
set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_XSETBV); - set_intercept(svm, INTERCEPT_RDPRU); - set_intercept(svm, INTERCEPT_RSM); + svm_set_intercept(svm, INTERCEPT_INTR); + svm_set_intercept(svm, INTERCEPT_NMI); + svm_set_intercept(svm, INTERCEPT_SMI); + svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0); + svm_set_intercept(svm, INTERCEPT_RDPMC); + svm_set_intercept(svm, INTERCEPT_CPUID); + svm_set_intercept(svm, INTERCEPT_INVD); + svm_set_intercept(svm, INTERCEPT_INVLPG); + svm_set_intercept(svm, INTERCEPT_INVLPGA); + svm_set_intercept(svm, INTERCEPT_IOIO_PROT); + svm_set_intercept(svm, INTERCEPT_MSR_PROT); + svm_set_intercept(svm, INTERCEPT_TASK_SWITCH); + svm_set_intercept(svm, INTERCEPT_SHUTDOWN); + svm_set_intercept(svm, INTERCEPT_VMRUN); + svm_set_intercept(svm, INTERCEPT_VMMCALL); + svm_set_intercept(svm, INTERCEPT_VMLOAD); + svm_set_intercept(svm, INTERCEPT_VMSAVE); + svm_set_intercept(svm, INTERCEPT_STGI); + svm_set_intercept(svm, INTERCEPT_CLGI); + svm_set_intercept(svm, INTERCEPT_SKINIT); + svm_set_intercept(svm, INTERCEPT_WBINVD); + svm_set_intercept(svm, INTERCEPT_XSETBV); + svm_set_intercept(svm, INTERCEPT_RDPRU); + svm_set_intercept(svm, INTERCEPT_RSM); if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); + svm_set_intercept(svm, INTERCEPT_MONITOR); + svm_set_intercept(svm, INTERCEPT_MWAIT); } if (!kvm_hlt_in_guest(svm->vcpu.kvm)) - set_intercept(svm, INTERCEPT_HLT); + svm_set_intercept(svm, INTERCEPT_HLT); control->iopm_base_pa = __sme_set(iopm_base); control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); @@ -1077,7 +1077,7 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; - clr_intercept(svm, INTERCEPT_INVLPG); + svm_clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, 
INTERCEPT_CR3_WRITE); @@ -1094,9 +1094,9 @@ static void init_vmcb(struct vcpu_svm *svm) control->pause_filter_count = pause_filter_count; if (pause_filter_thresh) control->pause_filter_thresh = pause_filter_thresh; - set_intercept(svm, INTERCEPT_PAUSE); + svm_set_intercept(svm, INTERCEPT_PAUSE); } else { - clr_intercept(svm, INTERCEPT_PAUSE); + svm_clr_intercept(svm, INTERCEPT_PAUSE); } if (kvm_vcpu_apicv_active(&svm->vcpu)) @@ -1107,14 +1107,14 @@ static void init_vmcb(struct vcpu_svm *svm) * in VMCB and clear intercepts to avoid #VMEXIT. */ if (vls) { - clr_intercept(svm, INTERCEPT_VMLOAD); - clr_intercept(svm, INTERCEPT_VMSAVE); + svm_clr_intercept(svm, INTERCEPT_VMLOAD); + svm_clr_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } if (vgif) { - clr_intercept(svm, INTERCEPT_STGI); - clr_intercept(svm, INTERCEPT_CLGI); + svm_clr_intercept(svm, INTERCEPT_STGI); + svm_clr_intercept(svm, INTERCEPT_CLGI); svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } @@ -1356,7 +1356,7 @@ static void svm_set_vintr(struct vcpu_svm *svm) /* The following fields are ignored when AVIC is enabled */ WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); - set_intercept(svm, INTERCEPT_VINTR); + svm_set_intercept(svm, INTERCEPT_VINTR); /* * This is just a dummy VINTR to actually cause a vmexit to happen. @@ -1373,7 +1373,7 @@ static void svm_set_vintr(struct vcpu_svm *svm) static void svm_clear_vintr(struct vcpu_svm *svm) { const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK; - clr_intercept(svm, INTERCEPT_VINTR); + svm_clr_intercept(svm, INTERCEPT_VINTR); /* Drop int_ctl fields related to VINTR injection. */ svm->vmcb->control.int_ctl &= mask; @@ -2000,8 +2000,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) * again while processing KVM_REQ_EVENT if needed. 
*/ if (vgif_enabled(svm)) - clr_intercept(svm, INTERCEPT_STGI); - if (is_intercept(svm, INTERCEPT_VINTR)) + svm_clr_intercept(svm, INTERCEPT_STGI); + if (svm_is_intercept(svm, INTERCEPT_VINTR)) svm_clear_vintr(svm); enable_gif(svm); @@ -2162,7 +2162,7 @@ static int cpuid_interception(struct vcpu_svm *svm) static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; - clr_intercept(svm, INTERCEPT_IRET); + svm_clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); @@ -3018,7 +3018,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - set_intercept(svm, INTERCEPT_IRET); + svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -3095,10 +3095,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) if (masked) { svm->vcpu.arch.hflags |= HF_NMI_MASK; - set_intercept(svm, INTERCEPT_IRET); + svm_set_intercept(svm, INTERCEPT_IRET); } else { svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - clr_intercept(svm, INTERCEPT_IRET); + svm_clr_intercept(svm, INTERCEPT_IRET); } } @@ -3178,7 +3178,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) if (!gif_set(svm)) { if (vgif_enabled(svm)) - set_intercept(svm, INTERCEPT_STGI); + svm_set_intercept(svm, INTERCEPT_STGI); return; /* STGI will cause a vm exit */ } @@ -3862,7 +3862,7 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) if (!gif_set(svm)) { if (vgif_enabled(svm)) - set_intercept(svm, INTERCEPT_STGI); + svm_set_intercept(svm, INTERCEPT_STGI); /* STGI will cause a vm exit */ } else { /* We must be in SMM; RSM will cause a vmexit anyway. 
*/ diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 0b1c10f79762..6b9f72463629 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -290,7 +290,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } -static inline void set_intercept(struct vcpu_svm *svm, int bit) +static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); @@ -299,7 +299,7 @@ static inline void set_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } -static inline void clr_intercept(struct vcpu_svm *svm, int bit) +static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); @@ -308,7 +308,7 @@ static inline void clr_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } -static inline bool is_intercept(struct vcpu_svm *svm, int bit) +static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) { return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; } From 01c3b2b5cdae39af8dfcf6e40fdf484ae0e812c5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 25 Jun 2020 10:03:25 +0200 Subject: [PATCH 027/127] KVM: SVM: Rename svm_nested_virtualize_tpr() to nested_svm_virtualize_tpr() Match the naming with other nested svm functions. No functional changes. 
Signed-off-by: Joerg Roedel Message-Id: <20200625080325.28439-5-joro@8bytes.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 6 +++--- arch/x86/kvm/svm/svm.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b934d51f7912..74096aa72ad9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3039,7 +3039,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm_nested_virtualize_tpr(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); @@ -3233,7 +3233,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm_nested_virtualize_tpr(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { @@ -3247,7 +3247,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (svm_nested_virtualize_tpr(vcpu) || + if (nested_svm_virtualize_tpr(vcpu) || kvm_vcpu_apicv_active(vcpu)) return; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6b9f72463629..71b1dda947e6 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -362,7 +362,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ -static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) +static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); From 4cb5b77eecb0ccb2fa66f33ac6936680bbc41552 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 23 Jun 2020 20:34:39 +0200 Subject: [PATCH 028/127] KVM: x86: Use VMCALL and VMMCALL mnemonics in kvm_para.h Current minimum required version of binutils is 2.23, which supports VMCALL and VMMCALL instruction 
mnemonics. Replace the byte-wise specification of VMCALL and VMMCALL with these proper mnemonics. Signed-off-by: Uros Bizjak CC: Paolo Bonzini Message-Id: <20200623183439.5526-1-ubizjak@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_para.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index ceb599c8cf79..338119852512 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -19,7 +19,7 @@ static inline bool kvm_check_and_clear_guest_paused(void) #endif /* CONFIG_KVM_GUEST */ #define KVM_HYPERCALL \ - ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) + ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL) /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the From ac101b7cb17d4a5df1ab735420d0ee3593465dcf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 23 Jun 2020 12:40:26 -0700 Subject: [PATCH 029/127] KVM: x86/mmu: Avoid multiple hash lookups in kvm_get_mmu_page() Refactor for_each_valid_sp() to take the list of shadow pages instead of retrieving it from a gfn to avoid doing the gfn->list hash and lookup multiple times during kvm_get_mmu_page(). 
Cc: Peter Feiner Cc: Jon Cargille Cc: Jim Mattson Signed-off-by: Sean Christopherson Message-Id: <20200623194027.23135-2-sean.j.christopherson@intel.com> Reviewed-By: Jon Cargille Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 03ce2cad04f7..713371316848 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2243,15 +2243,14 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); - -#define for_each_valid_sp(_kvm, _sp, _gfn) \ - hlist_for_each_entry(_sp, \ - &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ +#define for_each_valid_sp(_kvm, _sp, _list) \ + hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_valid_sp(_kvm, _sp, _gfn) \ + for_each_valid_sp(_kvm, _sp, \ + &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else static inline bool is_ept_sp(struct kvm_mmu_page *sp) @@ -2462,6 +2461,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned int access) { union kvm_mmu_page_role role; + struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; bool need_sync = false; @@ -2481,7 +2481,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_valid_sp(vcpu->kvm, sp, gfn) { + + sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + for_each_valid_sp(vcpu->kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2518,8 +2520,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = 
role; - hlist_add_head(&sp->hash_link, - &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); + hlist_add_head(&sp->hash_link, sp_list); if (!direct) { /* * we should do write protection before syncing pages From fb58a9c345f645f1774dcf6a36fda169253008ae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 23 Jun 2020 12:40:27 -0700 Subject: [PATCH 030/127] KVM: x86/mmu: Optimize MMU page cache lookup for fully direct MMUs Skip the unsync checks and the write flooding clearing for fully direct MMUs, which are guaranteed to not have unsync'd or indirect pages (write flooding detection only applies to indirect pages). For TDP, this avoids unnecessary memory reads and writes, and for the write flooding count will also avoid dirtying a cache line (unsync_child_bitmap itself consumes a cache line, i.e. write_flooding_count is guaranteed to be in a different cache line than parent_ptes). Signed-off-by: Sean Christopherson Message-Id: <20200623194027.23135-3-sean.j.christopherson@intel.com> Reviewed-By: Jon Cargille Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 713371316848..1b2988a9f0c6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2460,6 +2460,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int direct, unsigned int access) { + bool direct_mmu = vcpu->arch.mmu->direct_map; union kvm_mmu_page_role role; struct hlist_head *sp_list; unsigned quadrant; @@ -2475,8 +2476,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.gpte_is_8_bytes = true; role.access = access; - if (!vcpu->arch.mmu->direct_map - && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { + if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * 
level)) - 1; role.quadrant = quadrant; @@ -2495,6 +2495,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; + if (direct_mmu) + goto trace_get_page; + if (sp->unsync) { /* The page is good, but __kvm_sync_page might still end * up zapping it. If so, break in order to rebuild it. @@ -2510,6 +2513,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); __clear_sp_write_flooding_count(sp); + +trace_get_page: trace_kvm_mmu_get_page(sp, false); goto out; } From f95eec9bed76d42194c23153cb1cc8f186bf91cb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 23 Jun 2020 12:35:39 -0700 Subject: [PATCH 031/127] KVM: x86/mmu: Don't put invalid SPs back on the list of active pages Delete a shadow page from the invalidation list instead of throwing it back on the list of active pages when it's a root shadow page with active users. Invalid active root pages will be explicitly freed by mmu_free_root_page() when the root_count hits zero, i.e. they don't need to be put on the active list to avoid leakage. Use sp->role.invalid to detect that a shadow page has already been zapped, i.e. is not on a list. WARN if an invalid page is encountered when zapping pages, as it should now be impossible. 
Signed-off-by: Sean Christopherson Message-Id: <20200623193542.7554-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1b2988a9f0c6..14c16773e830 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2748,10 +2748,23 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, if (!sp->root_count) { /* Count self */ (*nr_zapped)++; - list_move(&sp->link, invalid_list); + + /* + * Already invalid pages (previously active roots) are not on + * the active page list. See list_del() in the "else" case of + * !sp->root_count. + */ + if (sp->role.invalid) + list_add(&sp->link, invalid_list); + else + list_move(&sp->link, invalid_list); kvm_mod_used_mmu_pages(kvm, -1); } else { - list_move(&sp->link, &kvm->arch.active_mmu_pages); + /* + * Remove the active root from the active page list, the root + * will be explicitly freed when the root_count hits zero. + */ + list_del(&sp->link); /* * Obsolete pages cannot be used on any vCPUs, see the comment @@ -5718,12 +5731,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) break; /* - * Skip invalid pages with a non-zero root count, zapping pages - * with a non-zero root count will never succeed, i.e. the page - * will get thrown back on active_mmu_pages and we'll get stuck - * in an infinite loop. + * Invalid pages should never land back on the list of active + * pages. Skip the bogus page, otherwise we'll get stuck in an + * infinite loop if the page gets put back on the list (again). 
*/ - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; /* @@ -6001,7 +6013,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; From 6b82ef2c9cf18a48726e4bb359aa9014632f6466 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 23 Jun 2020 12:35:40 -0700 Subject: [PATCH 032/127] KVM: x86/mmu: Batch zap MMU pages when recycling oldest pages Collect MMU pages for zapping in a loop when making MMU pages available, and skip over active roots when doing so as zapping an active root can never immediately free up a page. Batching the zapping avoids multiple remote TLB flushes and remedies the issue where the loop would bail early if an active root was encountered. Signed-off-by: Sean Christopherson Message-Id: <20200623193542.7554-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 52 +++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 14c16773e830..86abe2dc2413 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2829,20 +2829,51 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); } -static int make_mmu_pages_available(struct kvm_vcpu *vcpu) +static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, + unsigned long nr_to_zap) { + unsigned long total_zapped = 0; + struct kvm_mmu_page *sp, *tmp; LIST_HEAD(invalid_list); + bool unstable; + int nr_zapped; - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + if (list_empty(&kvm->arch.active_mmu_pages)) return 0; - while (kvm_mmu_available_pages(vcpu->kvm) < 
KVM_REFILL_PAGES) { - if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) +restart: + list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) { + /* + * Don't zap active root pages, the page itself can't be freed + * and zapping it will just force vCPUs to realloc and reload. + */ + if (sp->root_count) + continue; + + unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, + &nr_zapped); + total_zapped += nr_zapped; + if (total_zapped >= nr_to_zap) break; - ++vcpu->kvm->stat.mmu_recycled; + if (unstable) + goto restart; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + kvm->stat.mmu_recycled += total_zapped; + return total_zapped; +} + +static int make_mmu_pages_available(struct kvm_vcpu *vcpu) +{ + unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); + + if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) + return 0; + + kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; @@ -2855,17 +2886,12 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { - LIST_HEAD(invalid_list); - spin_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - /* Need to free some mmu pages to achieve the goal. 
*/ - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) - if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - break; + kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - + goal_nr_mmu_pages); - kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } From ebdb292dac7993425c8e31e2c21c9978e914a676 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 23 Jun 2020 12:35:41 -0700 Subject: [PATCH 033/127] KVM: x86/mmu: Batch zap MMU pages when shrinking the slab Use the recently introduced kvm_mmu_zap_oldest_mmu_pages() to batch zap MMU pages when shrinking a slab. This fixes a long standing issue where KVM's shrinker implementation is completely ineffective due to zapping only a single page. E.g. without batch zapping, forcing a scan via drop_caches basically has no impact on a VM with ~2k shadow pages. With batch zapping, the number of shadow pages can be reduced to a few hundred pages in one or two runs of drop_caches. Note, if the default batch size (currently 128) is problematic, e.g. zapping 128 pages holds mmu_lock for too long, KVM can bound the batch size by setting @batch in mmu_shrinker. 
Signed-off-by: Sean Christopherson Message-Id: <20200623193542.7554-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 86abe2dc2413..8083ec32a0dd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2816,19 +2816,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } } -static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, - struct list_head *invalid_list) -{ - struct kvm_mmu_page *sp; - - if (list_empty(&kvm->arch.active_mmu_pages)) - return false; - - sp = list_last_entry(&kvm->arch.active_mmu_pages, - struct kvm_mmu_page, link); - return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); -} - static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, unsigned long nr_to_zap) { @@ -6116,9 +6103,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) goto unlock; } - if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - freed++; - kvm_mmu_commit_zap_page(kvm, &invalid_list); + freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: spin_unlock(&kvm->mmu_lock); From 7bd7ded642978feeec8f2b9a2b05d8e1daeff758 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 23 Jun 2020 12:35:42 -0700 Subject: [PATCH 034/127] KVM: x86/mmu: Exit to userspace on make_mmu_pages_available() error Propagate any error returned by make_mmu_pages_available() out to userspace instead of resuming the guest if the error occurs while handling a page fault. Now that zapping the oldest MMU pages skips active roots, i.e. fails if and only if there are no zappable pages, there is no chance for a false positive, i.e. no chance of returning a spurious error to userspace. 
Signed-off-by: Sean Christopherson Message-Id: <20200623193542.7554-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 ++- arch/x86/kvm/mmu/paging_tmpl.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8083ec32a0dd..53d6bd07f9e9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4148,7 +4148,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, prefault, is_tdp && lpage_disallowed); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7e370d8bd576..4ec044af36e8 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -866,7 +866,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, map_writable, prefault, lpage_disallowed); From 33e3042dac6bcc33b80835f7d7b502b1d74c457c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jun 2020 13:20:29 -0700 Subject: [PATCH 035/127] KVM: x86/mmu: Move mmu_audit.c and mmutrace.h into the mmu/ sub-directory Move mmu_audit.c and mmutrace.h under mmu/ where they belong. 
Signed-off-by: Sean Christopherson Message-Id: <20200622202034.15093-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/{ => mmu}/mmu_audit.c | 0 arch/x86/kvm/{ => mmu}/mmutrace.h | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename arch/x86/kvm/{ => mmu}/mmu_audit.c (100%) rename arch/x86/kvm/{ => mmu}/mmutrace.h (99%) diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c similarity index 100% rename from arch/x86/kvm/mmu_audit.c rename to arch/x86/kvm/mmu/mmu_audit.c diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h similarity index 99% rename from arch/x86/kvm/mmutrace.h rename to arch/x86/kvm/mmu/mmutrace.h index ffcd96fc02d0..9d15bc0c535b 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -387,7 +387,7 @@ TRACE_EVENT( #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_PATH mmu #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE mmutrace From afe8d7e611c4d59c0be8d67883a1a0e68df23425 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jun 2020 13:20:30 -0700 Subject: [PATCH 036/127] KVM: x86/mmu: Move kvm_mmu_available_pages() into mmu.c Move kvm_mmu_available_pages() from mmu.h to mmu.c, it has a single caller and has no business being exposed via mmu.h. 
Signed-off-by: Sean Christopherson Message-Id: <20200622202034.15093-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 9 --------- arch/x86/kvm/mmu/mmu.c | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 81cafc937cfb..2b1e7cf7dbf6 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -64,15 +64,6 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); -static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) -{ - if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) - return kvm->arch.n_max_mmu_pages - - kvm->arch.n_used_mmu_pages; - - return 0; -} - static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE)) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 53d6bd07f9e9..cafada59d3d5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2853,6 +2853,15 @@ static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, return total_zapped; } +static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) +{ + if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; + + return 0; +} + static int make_mmu_pages_available(struct kvm_vcpu *vcpu) { unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); From 6ca9a6f3adef955e004123069e15ecffa462e823 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jun 2020 13:20:31 -0700 Subject: [PATCH 037/127] KVM: x86/mmu: Add MMU-internal header Add mmu/mmu_internal.h to hold declarations and definitions that need to be shared between various mmu/ files, but should not be used by anything outside of the MMU. Begin populating mmu_internal.h with declarations of the helpers used by page_track.c. 
Signed-off-by: Sean Christopherson Message-Id: <20200622202034.15093-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 5 +---- arch/x86/kvm/mmu/mmu.c | 1 + arch/x86/kvm/mmu/mmu_internal.h | 10 ++++++++++ arch/x86/kvm/mmu/page_track.c | 2 +- 4 files changed, 13 insertions(+), 5 deletions(-) create mode 100644 arch/x86/kvm/mmu/mmu_internal.h diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2b1e7cf7dbf6..434acfcbf710 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -209,10 +209,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index cafada59d3d5..4dd5ca7a7ea7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -18,6 +18,7 @@ #include "irq.h" #include "ioapic.h" #include "mmu.h" +#include "mmu_internal.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h new file mode 100644 index 000000000000..d7938c37c7de --- /dev/null +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_MMU_INTERNAL_H +#define __KVM_X86_MMU_INTERNAL_H + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); + +#endif /* 
__KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index a7bcde34d1f2..a84a141a2ad2 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -16,7 +16,7 @@ #include -#include "mmu.h" +#include "mmu_internal.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { From 985ab2780164698ec6e7d73fad523d50449261dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jun 2020 13:20:32 -0700 Subject: [PATCH 038/127] KVM: x86/mmu: Make kvm_mmu_page definition and accessor internal-only Make 'struct kvm_mmu_page' MMU-only, nothing outside of the MMU should be poking into the gory details of shadow pages. Signed-off-by: Sean Christopherson Message-Id: <20200622202034.15093-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 46 ++----------------------------- arch/x86/kvm/mmu/mmu_internal.h | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2357763bf7f2..97cb005c7aa7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -322,43 +322,6 @@ struct kvm_rmap_head { unsigned long val; }; -struct kvm_mmu_page { - struct list_head link; - struct hlist_node hash_link; - struct list_head lpage_disallowed_link; - - bool unsync; - u8 mmu_valid_gen; - bool mmio_cached; - bool lpage_disallowed; /* Can't be replaced by an equiv large page */ - - /* - * The following two entries are used to key the shadow page in the - * hash table. 
- */ - union kvm_mmu_page_role role; - gfn_t gfn; - - u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; - int root_count; /* Currently serving as active root */ - unsigned int unsync_children; - struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - DECLARE_BITMAP(unsync_child_bitmap, 512); - -#ifdef CONFIG_X86_32 - /* - * Used out of the mmu-lock to avoid reading spte values while an - * update is in progress; see the comments in __get_spte_lockless(). - */ - int clear_spte_count; -#endif - - /* Number of writes since the last time traversal visited this page. */ - atomic_t write_flooding_count; -}; - struct kvm_pio_request { unsigned long linear_rip; unsigned long count; @@ -384,6 +347,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +struct kvm_mmu_page; + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the @@ -1560,13 +1525,6 @@ static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, return gpa; } -static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) -{ - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - - return (struct kvm_mmu_page *)page_private(page); -} - static inline u16 kvm_read_ldt(void) { u16 ldt; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index d7938c37c7de..8afa60f0a1a5 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -2,6 +2,54 @@ #ifndef __KVM_X86_MMU_INTERNAL_H #define __KVM_X86_MMU_INTERNAL_H +#include + +#include + +struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; + struct list_head lpage_disallowed_link; + + bool unsync; + u8 mmu_valid_gen; + bool mmio_cached; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + + /* + * The following two entries are used to key the shadow page in the + * hash table. 
+ */ + union kvm_mmu_page_role role; + gfn_t gfn; + + u64 *spt; + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; + int root_count; /* Currently serving as active root */ + unsigned int unsync_children; + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ + DECLARE_BITMAP(unsync_child_bitmap, 512); + +#ifdef CONFIG_X86_32 + /* + * Used out of the mmu-lock to avoid reading spte values while an + * update is in progress; see the comments in __get_spte_lockless(). + */ + int clear_spte_count; +#endif + + /* Number of writes since the last time traversal visited this page. */ + atomic_t write_flooding_count; +}; + +static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) +{ + struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, From 573546820b792ef620acbfaa16bdf24ffbb1007b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jun 2020 13:20:33 -0700 Subject: [PATCH 039/127] KVM: x86/mmu: Add sptep_to_sp() helper to wrap shadow page lookup Introduce sptep_to_sp() to reduce the boilerplate code needed to get the shadow page associated with a spte pointer, and to improve readability as it's not immediately obvious that "page_header" is a KVM-specific accessor for retrieving a shadow page. 
Signed-off-by: Sean Christopherson Message-Id: <20200622202034.15093-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 28 +++++++++++++--------------- arch/x86/kvm/mmu/mmu_audit.c | 6 +++--- arch/x86/kvm/mmu/mmu_internal.h | 5 +++++ arch/x86/kvm/mmu/paging_tmpl.h | 4 ++-- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4dd5ca7a7ea7..1eebef8317b3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -677,7 +677,7 @@ union split_spte { static void count_spte_clear(u64 *sptep, u64 spte) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (is_shadow_present_pte(spte)) return; @@ -761,7 +761,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) */ static u64 __get_spte_lockless(u64 *sptep) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); union split_spte spte, *orig = (union split_spte *)sptep; int count; @@ -1427,7 +1427,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmap_head); @@ -1439,7 +1439,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn_t gfn; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmap_head = gfn_to_rmap(kvm, gfn, sp); __pte_list_remove(spte, rmap_head); @@ -1531,7 +1531,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) { if (is_large_pte(*sptep)) { - WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K); + WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, 
sptep); --kvm->stat.lpages; return true; @@ -1543,7 +1543,7 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); @@ -2002,7 +2002,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); @@ -2124,7 +2124,7 @@ static void mark_unsync(u64 *spte) struct kvm_mmu_page *sp; unsigned int index; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); index = spte - sp->spt; if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; @@ -2449,9 +2449,7 @@ static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) static void clear_sp_write_flooding_count(u64 *spte) { - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - __clear_sp_write_flooding_count(sp); + __clear_sp_write_flooding_count(sptep_to_sp(spte)); } static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, @@ -3026,7 +3024,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp_ad_disabled(sp)) spte |= SPTE_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) @@ -3239,7 +3237,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); /* * Without accessed bits, there's no way to distinguish between @@ -3547,7 +3545,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!is_shadow_present_pte(spte)) break; - sp = 
page_header(__pa(iterator.sptep)); + sp = sptep_to_sp(iterator.sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -5926,7 +5924,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); pfn = spte_to_pfn(*sptep); /* diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index 9d2844f87f6d..6ba703d3497f 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) kvm_pfn_t pfn; hpa_t hpa; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp->unsync) { if (level != PG_LEVEL_4K) { @@ -132,7 +132,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_memory_slot *slot; gfn_t gfn; - rev_sp = page_header(__pa(sptep)); + rev_sp = sptep_to_sp(sptep); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); @@ -165,7 +165,7 @@ static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 8afa60f0a1a5..6371bf1d0b1c 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -50,6 +50,11 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } +static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) +{ + return page_header(__pa(sptep)); +} + void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void 
kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4ec044af36e8..0981b84c95e4 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -596,7 +596,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, u64 *spte; int i; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp->role.level > PG_LEVEL_4K) return; @@ -916,7 +916,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) level = iterator.level; sptep = iterator.sptep; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (is_last_spte(*sptep, level)) { pt_element_t gpte; gpa_t pte_gpa; From e47c4aee5bde03e7018f4fde45ba21028a8f8438 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jun 2020 13:20:34 -0700 Subject: [PATCH 040/127] KVM: x86/mmu: Rename page_header() to to_shadow_page() Rename KVM's accessor for retrieving a 'struct kvm_mmu_page' from the associated host physical address to better convey what the function is doing. 
Signed-off-by: Sean Christopherson Message-Id: <20200622202034.15093-7-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++---------- arch/x86/kvm/mmu/mmu_audit.c | 6 +++--- arch/x86/kvm/mmu/mmu_internal.h | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1eebef8317b3..0011b2c97f65 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2193,7 +2193,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = page_header(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2647,7 +2647,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2669,7 +2669,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_large_pte(pte)) --kvm->stat.lpages; } else { - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } return true; @@ -3127,7 +3127,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *child; u64 pte = *sptep; - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3632,7 +3632,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); --sp->root_count; if (!sp->root_count && sp->role.invalid) 
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); @@ -3862,7 +3862,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root_hpa; - sp = page_header(root); + sp = to_shadow_page(root); /* * Even if another CPU was marking the SP as unsync-ed @@ -3896,7 +3896,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); + sp = to_shadow_page(root); mmu_sync_children(vcpu, sp); } } @@ -4248,8 +4248,8 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && page_header(root->hpa) && - role.word == page_header(root->hpa)->role.word; + VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && + role.word == to_shadow_page(root->hpa)->role.word; } /* @@ -4334,7 +4334,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa)); + __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index 6ba703d3497f..c8d51a37e2ce 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -45,7 +45,7 @@ static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, !is_last_spte(ent[i], level)) { struct kvm_mmu_page *child; - child = page_header(ent[i] & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK); __mmu_spte_walk(vcpu, child, fn, level - 1); } } @@ -62,7 +62,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root_hpa; - sp = 
page_header(root); + sp = to_shadow_page(root); __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level); return; } @@ -72,7 +72,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); + sp = to_shadow_page(root); __mmu_spte_walk(vcpu, sp, fn, 2); } } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 6371bf1d0b1c..3acf3b8eb469 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -43,7 +43,7 @@ struct kvm_mmu_page { atomic_t write_flooding_count; }; -static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) +static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -52,7 +52,7 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) { - return page_header(__pa(sptep)); + return to_shadow_page(__pa(sptep)); } void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); From de585020daf4e28a308549ed1457a51b3cc5ac92 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 23 Oct 2019 19:16:20 +0800 Subject: [PATCH 041/127] Revert "KVM: X86: Fix setup the virt_spin_lock_key before static key get initialized" This reverts commit 34226b6b70980a8f81fff3c09a2c889f77edeeff. Commit 8990cac6e5ea ("x86/jump_label: Initialize static branching early") adds jump_label_init() call in setup_arch() to make static keys initialized early, so we could use the original simpler code again. 
The similar change for XEN is in commit 090d54bcbc54 ("Revert "x86/paravirt: Set up the virt_spin_lock_key after static keys get initialized"") Signed-off-by: Zhenzhong Duan Reviewed-by: Vitaly Kuznetsov Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 681bc4090e91..e4208ce11689 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -570,13 +570,6 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } -static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) -{ - native_smp_prepare_cpus(max_cpus); - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - static_branch_disable(&virt_spin_lock_key); -} - static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -671,7 +664,6 @@ static void __init kvm_guest_init(void) } #ifdef CONFIG_SMP - smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; @@ -879,8 +871,10 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { + static_branch_disable(&virt_spin_lock_key); return; + } /* Don't use the pvqspinlock code if there is only 1 vCPU. 
*/ if (num_possible_cpus() == 1) From 5aefd786fba7336c1ecc608e2a126e4f45b180d7 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 23 Oct 2019 19:16:21 +0800 Subject: [PATCH 042/127] x86/kvm: Change print code to use pr_*() format pr_*() is preferred than printk(KERN_* ...), after change all the print in arch/x86/kernel/kvm.c will have "kvm-guest: xxx" style. No functional change. Signed-off-by: Zhenzhong Duan Reviewed-by: Vitaly Kuznetsov Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e4208ce11689..4ef21a87f1d3 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -7,6 +7,8 @@ * Authors: Anthony Liguori */ +#define pr_fmt(fmt) "kvm-guest: " fmt + #include #include #include @@ -306,8 +308,8 @@ static void kvm_register_steal_time(void) return; wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); - pr_info("kvm-stealtime: cpu %d, msr %llx\n", - cpu, (unsigned long long) slow_virt_to_phys(st)); + pr_info("stealtime: cpu %d, msr %llx\n", cpu, + (unsigned long long) slow_virt_to_phys(st)); } static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; @@ -512,7 +514,8 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); min = max = apic_id; ipi_bitmap = 0; } @@ -522,7 +525,8 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) if (ipi_bitmap) { ret = 
kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); } local_irq_restore(flags); @@ -552,7 +556,7 @@ static void kvm_setup_pv_ipi(void) { apic->send_IPI_mask = kvm_send_ipi_mask; apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; - pr_info("KVM setup pv IPIs\n"); + pr_info("setup PV IPIs\n"); } static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) @@ -667,11 +671,11 @@ static void __init kvm_guest_init(void) smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; - pr_info("KVM setup pv sched yield\n"); + pr_info("setup PV sched yield\n"); } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) - pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); + pr_err("failed to install cpu hotplug callbacks\n"); #else sev_map_percpu_data(); kvm_guest_cpu_init(); @@ -910,8 +914,8 @@ static void kvm_enable_host_haltpoll(void *i) void arch_haltpoll_enable(unsigned int cpu) { if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { - pr_err_once("kvm: host does not support poll control\n"); - pr_err_once("kvm: host upgrade recommended\n"); + pr_err_once("host does not support poll control\n"); + pr_err_once("host upgrade recommended\n"); return; } From 05eee619ed61c8cd89633954d38c4e5653086845 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 23 Oct 2019 19:16:22 +0800 Subject: [PATCH 043/127] x86/kvm: Add "nopvspin" parameter to disable PV spinlocks There are cases where a guest tries to switch spinlocks to bare metal behavior (e.g. by setting "xen_nopvspin" on XEN platform and "hv_nopvspin" on HYPER_V). 
That feature is missing on KVM, so add a new parameter "nopvspin" to disable PV spinlocks for KVM guest. The new 'nopvspin' parameter will also replace Xen and Hyper-V specific parameters in future patches. Define variable nopvspin as global because it will be used in future patches as above. Signed-off-by: Zhenzhong Duan Reviewed-by: Vitaly Kuznetsov Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Paolo Bonzini --- .../admin-guide/kernel-parameters.txt | 5 +++ arch/x86/include/asm/qspinlock.h | 1 + arch/x86/kernel/kvm.c | 43 +++++++++++++++---- kernel/locking/qspinlock.c | 7 +++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..6a8934ffdaf6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5739,6 +5739,11 @@ as generic guest with no PV drivers. Currently support XEN HVM, KVM, HYPER_V and VMWARE guest. + nopvspin [X86,KVM] + Disables the qspinlock slow path using PV optimizations + which allow the hypervisor to 'idle' the guest on lock + contention.
+ xirc2ps_cs= [NET,PCMCIA] Format: ,,,,,[,[,[,]]] diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 444d6fd9a6d8..d86ab942219c 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -32,6 +32,7 @@ extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __pv_init_lock_hash(void); extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); +extern bool nopvspin; #define queued_spin_unlock queued_spin_unlock /** diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4ef21a87f1d3..d9995931ea18 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -871,18 +871,36 @@ asm( */ void __init kvm_spinlock_init(void) { - /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) - return; - - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { - static_branch_disable(&virt_spin_lock_key); + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); return; } - /* Don't use the pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1) - return; + /* + * Disable PV spinlocks and use native qspinlock when dedicated pCPUs + * are available. 
+ */ + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { + pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); + goto out; + } + + if (num_possible_cpus() == 1) { + pr_info("PV spinlocks disabled, single CPU\n"); + goto out; + } + + if (nopvspin) { + pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); + goto out; + } + + pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; @@ -895,6 +913,13 @@ void __init kvm_spinlock_init(void) pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } + /* + * When PV spinlock is enabled which is preferred over + * virt_spin_lock(), virt_spin_lock_key's value is meaningless. + * Just disable it anyway. + */ +out: + static_branch_disable(&virt_spin_lock_key); } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b9515fcc9b29..cbff6ba53d56 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -581,4 +581,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" +bool nopvspin __initdata; +static __init int parse_nopvspin(char *arg) +{ + nopvspin = true; + return 0; +} +early_param("nopvspin", parse_nopvspin); #endif From 9a3c05e658d4d31b38ef03fe5c17bc2039402ff7 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 23 Oct 2019 19:16:23 +0800 Subject: [PATCH 044/127] xen: Mark "xen_nopvspin" parameter obsolete Map "xen_nopvspin" to "nopvspin", fix stale description of "xen_nopvspin" as we use qspinlock now. Signed-off-by: Zhenzhong Duan Reviewed-by: Boris Ostrovsky Cc: Jonathan Corbet Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Signed-off-by: Paolo Bonzini --- Documentation/admin-guide/kernel-parameters.txt | 7 ++++--- arch/x86/xen/spinlock.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6a8934ffdaf6..4740d0d9aaa3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5712,8 +5712,9 @@ panic() code such as dumping handler. xen_nopvspin [X86,XEN] - Disables the ticketlock slowpath using Xen PV - optimizations. + Disables the qspinlock slowpath using Xen PV optimizations. + This parameter is obsoleted by "nopvspin" parameter, which + has equivalent effect for XEN platform. xen_nopv [X86] Disables the PV optimizations forcing the HVM guest to @@ -5739,7 +5740,7 @@ as generic guest with no PV drivers. Currently support XEN HVM, KVM, HYPER_V and VMWARE guest. - nopvspin [X86,KVM] + nopvspin [X86,XEN,KVM] Disables the qspinlock slow path using PV optimizations which allow the hypervisor to 'idle' the guest on lock contention. diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 6deb49094c60..799f4eba0a62 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -114,9 +114,8 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); */ void __init xen_init_spinlocks(void) { - /* Don't need to use pvqspinlock code if there is only 1 vCPU. 
*/ - if (num_possible_cpus() == 1) + if (num_possible_cpus() == 1 || nopvspin) xen_pvspin = false; if (!xen_pvspin) { @@ -137,6 +136,7 @@ void __init xen_init_spinlocks(void) static __init int xen_parse_nopvspin(char *arg) { + pr_notice("\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n"); xen_pvspin = false; return 0; } From d42e3fae6faedacb2a7b4c984417ed0d9f540401 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 7 Jul 2020 15:36:30 -0700 Subject: [PATCH 045/127] kvm: x86: Read PDPTEs on CR0.CD and CR0.NW changes According to the SDM, when PAE paging would be in use following a MOV-to-CR0 that modifies any of CR0.CD, CR0.NW, or CR0.PG, then the PDPTEs are loaded from the address in CR3. Previously, kvm only loaded the PDPTEs when PAE paging would be in use following a MOV-to-CR0 that modified CR0.PG. Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20200707223630.336700-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd8690ca7b6b..1153ce7d118b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -798,6 +798,7 @@ EXPORT_SYMBOL_GPL(pdptrs_changed); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -815,9 +816,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) return 1; - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { + if (cr0 & X86_CR0_PG) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.efer & EFER_LME)) { + if (!is_paging(vcpu) && (vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) @@ -827,8 +828,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } else #endif - if 
(is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) + if (is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) return 1; } From b899c13277a918d268055ade3e4fd8289314ee84 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 8 Jul 2020 00:39:55 +0000 Subject: [PATCH 046/127] KVM: x86: Create mask for guest CR4 reserved bits in kvm_update_cpuid() Instead of creating the mask for guest CR4 reserved bits in kvm_valid_cr4(), do it in kvm_update_cpuid() so that it can be reused instead of creating it each time kvm_valid_cr4() is called. Suggested-by: Paolo Bonzini Signed-off-by: Krish Sadhukhan Message-Id: <1594168797-29444-2-git-send-email-krish.sadhukhan@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/x86.c | 23 +---------------------- arch/x86/kvm/x86.h | 21 +++++++++++++++++++++ 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 97cb005c7aa7..281be772e9a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -545,6 +545,7 @@ struct kvm_vcpu_arch { unsigned long cr3; unsigned long cr4; unsigned long cr4_guest_owned_bits; + unsigned long cr4_guest_rsvd_bits; unsigned long cr8; u32 host_pkru; u32 pkru; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8a294f9747aa..5bec182aa648 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -128,6 +128,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); kvm_pmu_refresh(vcpu); + vcpu->arch.cr4_guest_rsvd_bits = + __cr4_reserved_bits(guest_cpuid_has, vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1153ce7d118b..549b3f7228ac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -955,33 +955,12 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } 
EXPORT_SYMBOL_GPL(kvm_set_xcr); -#define __cr4_reserved_bits(__cpu_has, __c) \ -({ \ - u64 __reserved_bits = CR4_RESERVED_BITS; \ - \ - if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ - __reserved_bits |= X86_CR4_OSXSAVE; \ - if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ - __reserved_bits |= X86_CR4_SMEP; \ - if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ - __reserved_bits |= X86_CR4_SMAP; \ - if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ - __reserved_bits |= X86_CR4_FSGSBASE; \ - if (!__cpu_has(__c, X86_FEATURE_PKU)) \ - __reserved_bits |= X86_CR4_PKE; \ - if (!__cpu_has(__c, X86_FEATURE_LA57)) \ - __reserved_bits |= X86_CR4_LA57; \ - if (!__cpu_has(__c, X86_FEATURE_UMIP)) \ - __reserved_bits |= X86_CR4_UMIP; \ - __reserved_bits; \ -}) - static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return -EINVAL; - if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu)) + if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return -EINVAL; return 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 31928bf18ba5..15276ed224d5 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -373,4 +373,25 @@ bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); #define KVM_MSR_RET_INVALID 2 +#define __cr4_reserved_bits(__cpu_has, __c) \ +({ \ + u64 __reserved_bits = CR4_RESERVED_BITS; \ + \ + if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ + __reserved_bits |= X86_CR4_OSXSAVE; \ + if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ + __reserved_bits |= X86_CR4_SMEP; \ + if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ + __reserved_bits |= X86_CR4_SMAP; \ + if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ + __reserved_bits |= X86_CR4_FSGSBASE; \ + if (!__cpu_has(__c, X86_FEATURE_PKU)) \ + __reserved_bits |= X86_CR4_PKE; \ + if (!__cpu_has(__c, X86_FEATURE_LA57)) \ + __reserved_bits |= X86_CR4_LA57; \ + if (!__cpu_has(__c, X86_FEATURE_UMIP)) \ + __reserved_bits |= X86_CR4_UMIP; \ + __reserved_bits; \ +}) + #endif From 53efe527ca4a4432d17c693efde6eec56fb43ebb Mon Sep 17 00:00:00 2001 From: 
Paolo Bonzini Date: Wed, 8 Jul 2020 07:02:50 -0400 Subject: [PATCH 047/127] KVM: x86: Make CR4.VMXE reserved for the guest CR4.VMXE is reserved unless the VMX CPUID bit is set. On Intel, it is also tested by vmx_set_cr4, but AMD relies on kvm_valid_cr4, so fix it. Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 15276ed224d5..10441fbb4073 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -391,6 +391,8 @@ bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); __reserved_bits |= X86_CR4_LA57; \ if (!__cpu_has(__c, X86_FEATURE_UMIP)) \ __reserved_bits |= X86_CR4_UMIP; \ + if (!__cpu_has(__c, X86_FEATURE_VMX)) \ + __reserved_bits |= X86_CR4_VMXE; \ __reserved_bits; \ }) From 761e4169346553c180bbd4a383aedd72f905bc9a Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 8 Jul 2020 00:39:56 +0000 Subject: [PATCH 048/127] KVM: nSVM: Check that MBZ bits in CR3 and CR4 are not set on vmrun of nested guests According to section "Canonicalization and Consistency Checks" in APM vol. 2 the following guest state is illegal: "Any MBZ bit of CR3 is set." "Any MBZ bit of CR4 is set." 
Suggested-by: Paolo Bonzini Signed-off-by: Krish Sadhukhan Message-Id: <1594168797-29444-3-git-send-email-krish.sadhukhan@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 26 ++++++++++++++++++++++++-- arch/x86/kvm/svm/svm.h | 5 ++++- arch/x86/kvm/x86.c | 3 ++- arch/x86/kvm/x86.h | 1 + 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 385461496cf5..402ea5b412f0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -222,8 +222,9 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vmcb *vmcb) +static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb) { + bool nested_vmcb_lma; if ((vmcb->save.efer & EFER_SVME) == 0) return false; @@ -234,6 +235,27 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7)) return false; + nested_vmcb_lma = + (vmcb->save.efer & EFER_LME) && + (vmcb->save.cr0 & X86_CR0_PG); + + if (!nested_vmcb_lma) { + if (vmcb->save.cr4 & X86_CR4_PAE) { + if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) + return false; + } else { + if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) + return false; + } + } else { + if (!(vmcb->save.cr4 & X86_CR4_PAE) || + !(vmcb->save.cr0 & X86_CR0_PE) || + (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK)) + return false; + } + if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4)) + return false; + return nested_vmcb_check_controls(&vmcb->control); } @@ -419,7 +441,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb = map.hva; - if (!nested_vmcb_checks(nested_vmcb)) { + if (!nested_vmcb_checks(svm, nested_vmcb)) { nested_vmcb->control.exit_code = SVM_EXIT_ERR; nested_vmcb->control.exit_code_hi = 0; nested_vmcb->control.exit_info_1 = 0; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 71b1dda947e6..121b198b51e9 100644 ---
a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -343,7 +343,10 @@ static inline bool gif_set(struct vcpu_svm *svm) } /* svm.c */ -#define MSR_INVALID 0xffffffffU +#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U +#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U +#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U +#define MSR_INVALID 0xffffffffU u32 svm_msrpm_offset(u32 msr); void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 549b3f7228ac..475456a14d76 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -955,7 +955,7 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return -EINVAL; @@ -965,6 +965,7 @@ static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 0; } +EXPORT_SYMBOL_GPL(kvm_valid_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 10441fbb4073..224670d7c245 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -369,6 +369,7 @@ static inline bool kvm_dr6_valid(u64 data) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); +int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); #define KVM_MSR_RET_INVALID 2 From 2e8cd7a3b82874e9cb3e42d916dda05cd790cc1a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 24 Jun 2020 09:59:28 +0800 Subject: [PATCH 049/127] kvm: x86: limit the maximum number of vPMU fixed counters to 3 Some new Intel platforms (such as TGL) already have the fourth fixed counter TOPDOWN.SLOTS, but it has not been fully enabled on KVM and the host. 
Therefore, we limit edx.split.num_counters_fixed to 3, so that it does not break the kvm-unit-tests PMU test case and bad-handled userspace. Signed-off-by: Like Xu Message-Id: <20200624015928.118614-1-like.xu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/pmu.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5bec182aa648..6f2b6e9c9ae0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -606,7 +606,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) eax.split.bit_width = cap.bit_width_gp; eax.split.mask_length = cap.events_mask_len; - edx.split.num_counters_fixed = cap.num_counters_fixed; + edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS); edx.split.bit_width_fixed = cap.bit_width_fixed; edx.split.reserved = 0; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ab85eed8a6cc..067fef51760c 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -15,6 +15,8 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 +#define MAX_FIXED_COUNTERS 3 + struct kvm_event_hw_type_mapping { u8 eventsel; u8 unit_mask; From 18964092825aef6faaadf51fbc5fd166168ddf61 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 8 Jul 2020 14:50:47 +0800 Subject: [PATCH 050/127] KVM: X86: Reset vcpu->arch.cpuid_nent to 0 if SET_CPUID* fails Current implementation keeps userspace input of CPUID configuration and cpuid->nent even if kvm_update_cpuid() fails. Reset vcpu->arch.cpuid_nent to 0 for the case of failure as a simple fix. Besides, update the doc to explicitly state that if IOCTL SET_CPUID* fails, KVM gives no guarantee that previous valid CPUID configuration is kept.
Signed-off-by: Xiaoyao Li Message-Id: <20200708065054.19713-2-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 4 ++++ arch/x86/kvm/cpuid.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1cfe79b932d6..644e5326aa50 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -669,6 +669,10 @@ MSRs that have been set successfully. Defines the vcpu responses to the cpuid instruction. Applications should use the KVM_SET_CPUID2 ioctl if available. +Note, when this IOCTL fails, KVM gives no guarantees that previous valid CPUID +configuration (if there is) is not corrupted. Userspace can get a copy of the +resulting CPUID configuration through KVM_GET_CPUID2 in case. + :: struct kvm_cpuid_entry { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6f2b6e9c9ae0..b7bbed0de636 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -209,6 +209,8 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, kvm_apic_set_version(vcpu); kvm_x86_ops.cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); + if (r) + vcpu->arch.cpuid_nent = 0; kvfree(cpuid_entries); out: @@ -232,6 +234,8 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, kvm_apic_set_version(vcpu); kvm_x86_ops.cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); + if (r) + vcpu->arch.cpuid_nent = 0; out: return r; } From 0d3b2ba16ba68612142399e1801e6aff3f184d3e Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 8 Jul 2020 14:50:48 +0800 Subject: [PATCH 051/127] KVM: X86: Go on updating other CPUID leaves when leaf 1 is absent As handling of bits out of leaf 1 added over time, kvm_update_cpuid() should not return directly if leaf 1 is absent, but should go on updating other CPUID leaves. Keep the update of apic->lapic_timer.timer_mode_mask in a separate wrapper, to minimize churn for code since it will be moved out of this function in a future patch.
Signed-off-by: Xiaoyao Li Message-Id: <20200708065054.19713-3-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b7bbed0de636..f6f760d4800d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -60,18 +60,17 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; best = kvm_find_cpuid_entry(vcpu, 1, 0); - if (!best) - return 0; - - /* Update OSXSAVE bit */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) - cpuid_entry_change(best, X86_FEATURE_OSXSAVE, + if (best) { + /* Update OSXSAVE bit */ + if (boot_cpu_has(X86_FEATURE_XSAVE)) + cpuid_entry_change(best, X86_FEATURE_OSXSAVE, kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)); - cpuid_entry_change(best, X86_FEATURE_APIC, + cpuid_entry_change(best, X86_FEATURE_APIC, vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); + } - if (apic) { + if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; else From 565b7820738a1d764879e1b1b85f485aa028287a Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 8 Jul 2020 14:50:53 +0800 Subject: [PATCH 052/127] KVM: lapic: Use guest_cpuid_has() in kvm_apic_set_version() Only code cleanup and no functional change. 
Signed-off-by: Xiaoyao Li Message-Id: <20200708065054.19713-8-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5bf72fc86a8e..e5dbb7ebae78 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -354,7 +354,6 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; if (!lapic_in_kernel(vcpu)) @@ -367,8 +366,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) * version first and level-triggered interrupts never get EOIed in * IOAPIC. */ - feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); - if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) && + if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) && !ioapic_in_kernel(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); From 36f37648ca982915918bea001a40172c5b0cb233 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 8 Jul 2020 14:50:54 +0800 Subject: [PATCH 053/127] KVM: X86: Move kvm_apic_set_version() to kvm_update_cpuid() There are no dependencies between kvm_apic_set_version() and kvm_update_cpuid() because kvm_apic_set_version() queries X2APIC CPUID bit, which is not touched/changed by kvm_update_cpuid(). Obviously, kvm_apic_set_version() belongs to the category of updating vcpu model.
Signed-off-by: Xiaoyao Li Message-Id: <20200708065054.19713-9-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f6f760d4800d..eebd66f86abe 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -75,6 +75,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 3 << 17; else apic->lapic_timer.timer_mode_mask = 1 << 17; + + kvm_apic_set_version(vcpu); } best = kvm_find_cpuid_entry(vcpu, 7, 0); @@ -205,7 +207,6 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, } vcpu->arch.cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); - kvm_apic_set_version(vcpu); kvm_x86_ops.cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); if (r) @@ -230,7 +231,6 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; - kvm_apic_set_version(vcpu); kvm_x86_ops.cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); if (r) From a76733a9875238e271a3fd5778d5596248904f07 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Thu, 9 Jul 2020 12:34:22 +0800 Subject: [PATCH 054/127] KVM: x86: Introduce kvm_check_cpuid() Use kvm_check_cpuid() to validate whether userspace provides legal CPUID settings, and call it before KVM takes any action to update CPUID or update vcpu states based on the given CPUID settings.
Signed-off-by: Xiaoyao Li Message-Id: <20200709043426.92712-2-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 55 ++++++++++++++++++++++++++++---------------- arch/x86/kvm/cpuid.h | 2 +- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index eebd66f86abe..1a053022a961 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -54,7 +54,26 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit -int kvm_update_cpuid(struct kvm_vcpu *vcpu) +static int kvm_check_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + /* + * The existing code assumes virtual address is 48-bit or 57-bit in the + * canonical address checks; exit if it is ever changed. + */ + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) { + int vaddr_bits = (best->eax & 0xff00) >> 8; + + if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0) + return -EINVAL; + } + + return 0; +} + +void kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct kvm_lapic *apic = vcpu->arch.apic; @@ -98,18 +117,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - /* - * The existing code assumes virtual address is 48-bit or 57-bit in the - * canonical address checks; exit if it is ever changed. 
- */ - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best) { - int vaddr_bits = (best->eax & 0xff00) >> 8; - - if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0) - return -EINVAL; - } - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) @@ -131,7 +138,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); - return 0; } static int is_efer_nx(void) @@ -206,11 +212,16 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_entries[i].padding[2] = 0; } vcpu->arch.cpuid_nent = cpuid->nent; + r = kvm_check_cpuid(vcpu); + if (r) { + vcpu->arch.cpuid_nent = 0; + kvfree(cpuid_entries); + goto out; + } + cpuid_fix_nx_cap(vcpu); kvm_x86_ops.cpuid_update(vcpu); - r = kvm_update_cpuid(vcpu); - if (r) - vcpu->arch.cpuid_nent = 0; + kvm_update_cpuid(vcpu); kvfree(cpuid_entries); out: @@ -231,10 +242,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; - kvm_x86_ops.cpuid_update(vcpu); - r = kvm_update_cpuid(vcpu); - if (r) + r = kvm_check_cpuid(vcpu); + if (r) { vcpu->arch.cpuid_nent = 0; + goto out; + } + + kvm_x86_ops.cpuid_update(vcpu); + kvm_update_cpuid(vcpu); out: return r; } diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 05434cd9342f..f136de1debad 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -9,7 +9,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); -int kvm_update_cpuid(struct kvm_vcpu *vcpu); +void kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, From aedbaf4f6afdcf9da0f48f97d7e9d62f4d591e19 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Thu, 9 
Jul 2020 12:34:23 +0800 Subject: [PATCH 055/127] KVM: x86: Extract kvm_update_cpuid_runtime() from kvm_update_cpuid() Besides being called in kvm_vcpu_ioctl_set_cpuid*(), kvm_update_cpuid() is also called from 5 other places in x86.c and 1 other place in lapic.c. All 6 of those places only need the part that updates guest CPUIDs (OSXSAVE, OSPKE, APIC, KVM_FEATURE_PV_UNHALT, ...) based on the runtime vcpu state, so extract that part as a separate kvm_update_cpuid_runtime(). Signed-off-by: Xiaoyao Li Message-Id: <20200709043426.92712-3-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 44 +++++++++++++++++++++++++++----------------- arch/x86/kvm/cpuid.h | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/x86.c | 10 +++++----- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1a053022a961..0ed3b343c44e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -73,10 +73,9 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu) return 0; } -void kvm_update_cpuid(struct kvm_vcpu *vcpu) +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - struct kvm_lapic *apic = vcpu->arch.apic; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (best) { @@ -89,28 +88,14 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); } - if (best && apic) { - if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) - apic->lapic_timer.timer_mode_mask = 3 << 17; - else - apic->lapic_timer.timer_mode_mask = 1 << 17; - - kvm_apic_set_version(vcpu); - } - best = kvm_find_cpuid_entry(vcpu, 7, 0); if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) cpuid_entry_change(best, X86_FEATURE_OSPKE, kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); best = kvm_find_cpuid_entry(vcpu, 0xD, 0); - if (!best) { - vcpu->arch.guest_supported_xcr0 = 0; - } else { - vcpu->arch.guest_supported_xcr0 = - (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || @@ -129,6 +114,29 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } +} + +static void kvm_update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (best && apic) { + if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + + kvm_apic_set_version(vcpu); + } + + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + if (!best) + vcpu->arch.guest_supported_xcr0 = 0; + else + vcpu->arch.guest_supported_xcr0 = + (best->eax | ((u64)best->edx << 32)) & supported_xcr0; /* Note, maxphyaddr must be updated before tdp_level. */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); @@ -221,6 +229,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid_fix_nx_cap(vcpu); kvm_x86_ops.cpuid_update(vcpu); + kvm_update_cpuid_runtime(vcpu); kvm_update_cpuid(vcpu); kvfree(cpuid_entries); @@ -249,6 +258,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, } kvm_x86_ops.cpuid_update(vcpu); + kvm_update_cpuid_runtime(vcpu); kvm_update_cpuid(vcpu); out: return r; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f136de1debad..3a923ae15f2f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -9,7 +9,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); -void kvm_update_cpuid(struct kvm_vcpu *vcpu); +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e5dbb7ebae78..47801a44cfa6 100644 --- 
a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2230,7 +2230,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); if (!apic) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 475456a14d76..c432a445cbbe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -940,7 +940,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); return 0; } @@ -1004,7 +1004,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_mmu_reset_context(vcpu); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); return 0; } @@ -2916,7 +2916,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); } else { vcpu->arch.ia32_misc_enable_msr = data; } @@ -8170,7 +8170,7 @@ static void enter_smm(struct kvm_vcpu *vcpu) kvm_x86_ops.set_efer(vcpu, 0); #endif - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); kvm_mmu_reset_context(vcpu); } @@ -9194,7 +9194,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) (X86_CR4_OSXSAVE | X86_CR4_PKE)); kvm_x86_ops.set_cr4(vcpu, sregs->cr4); if (cpuid_update_needed) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { From 346ce3591db29ffd14fdbadc71e3b5eee4069f7b Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Thu, 9 Jul 2020 12:34:24 +0800 Subject: [PATCH 056/127] KVM: x86: Rename kvm_update_cpuid() to kvm_vcpu_after_set_cpuid() Now there is no updating CPUID bits behavior in kvm_update_cpuid(), 
rename it to kvm_vcpu_after_set_cpuid(). Signed-off-by: Xiaoyao Li Message-Id: <20200709043426.92712-4-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0ed3b343c44e..b602c0c9078e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -116,7 +116,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) } } -static void kvm_update_cpuid(struct kvm_vcpu *vcpu) +static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; @@ -230,7 +230,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid_fix_nx_cap(vcpu); kvm_x86_ops.cpuid_update(vcpu); kvm_update_cpuid_runtime(vcpu); - kvm_update_cpuid(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); kvfree(cpuid_entries); out: @@ -259,7 +259,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, kvm_x86_ops.cpuid_update(vcpu); kvm_update_cpuid_runtime(vcpu); - kvm_update_cpuid(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); out: return r; } From 7c1b761be029da401571e13e68fe509772309b4f Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Thu, 9 Jul 2020 12:34:25 +0800 Subject: [PATCH 057/127] KVM: x86: Rename cpuid_update() callback to vcpu_after_set_cpuid() The name of the cpuid_update() callback is misleading: it is not about updating the CPUID settings of the vcpu, but about updating the configuration of the vcpu based on the CPUIDs. So rename it to vcpu_after_set_cpuid().
Signed-off-by: Xiaoyao Li Message-Id: <20200709043426.92712-5-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/cpuid.c | 4 ++-- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/vmx/nested.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 4 ++-- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 281be772e9a7..86c719d2b755 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1052,7 +1052,7 @@ struct kvm_x86_ops { void (*hardware_unsetup)(void); bool (*cpu_has_accelerated_tpr)(void); bool (*has_emulated_msr)(u32 index); - void (*cpuid_update)(struct kvm_vcpu *vcpu); + void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); unsigned int vm_size; int (*vm_init)(struct kvm *kvm); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b602c0c9078e..832a24c1334e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -228,7 +228,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, } cpuid_fix_nx_cap(vcpu); - kvm_x86_ops.cpuid_update(vcpu); + kvm_x86_ops.vcpu_after_set_cpuid(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); @@ -257,7 +257,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; } - kvm_x86_ops.cpuid_update(vcpu); + kvm_x86_ops.vcpu_after_set_cpuid(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); out: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 74096aa72ad9..472544cf1de2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3550,7 +3550,7 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } -static void svm_cpuid_update(struct kvm_vcpu *vcpu) +static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4050,7 +4050,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_exit_info = svm_get_exit_info, - 
.cpuid_update = svm_cpuid_update, + .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, .has_wbinvd_exit = svm_has_wbinvd_exit, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7693d41a2446..e4080ab2df21 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6354,7 +6354,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) /* * secondary cpu-based controls. Do not include those that - * depend on CPUID bits, they are added later by vmx_cpuid_update. + * depend on CPUID bits, they are added later by + * vmx_vcpu_after_set_cpuid. */ if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1de5dac952b6..c6e96e2ef4d6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7236,7 +7236,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } -static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7894,7 +7894,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_exit_info = vmx_get_exit_info, - .cpuid_update = vmx_cpuid_update, + .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, From 5668821aefcbd8511740301de8bc9153c103a43a Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Thu, 9 Jul 2020 12:34:26 +0800 Subject: [PATCH 058/127] KVM: x86: Move kvm_x86_ops.vcpu_after_set_cpuid() into kvm_vcpu_after_set_cpuid() kvm_x86_ops.vcpu_after_set_cpuid() is used to update vmx/svm specific vcpu settings based on updated CPUID settings. So it's supposed to be called after CPUIDs are updated, i.e., kvm_update_cpuid_runtime(). 
Currently, kvm_update_cpuid_runtime() only updates CPUID bits of OSXSAVE, APIC, OSPKE, MWAIT, KVM_FEATURE_PV_UNHALT and CPUID(0xD,0).ebx and CPUID(0xD, 1).ebx. None of them is consumed by vmx/svm's vcpu_after_set_cpuid(). So there is no dependency between them. Moving kvm_x86_ops.vcpu_after_set_cpuid() into kvm_vcpu_after_set_cpuid() is obviously more reasonable. Signed-off-by: Xiaoyao Li Message-Id: <20200709043426.92712-6-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 832a24c1334e..edbed4f522f2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -121,6 +121,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; + kvm_x86_ops.vcpu_after_set_cpuid(vcpu); + best = kvm_find_cpuid_entry(vcpu, 1, 0); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) @@ -228,7 +230,6 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, } cpuid_fix_nx_cap(vcpu); - kvm_x86_ops.vcpu_after_set_cpuid(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); @@ -257,7 +258,6 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; } - kvm_x86_ops.vcpu_after_set_cpuid(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); out: From 995decb6c43e1d6e9d6a7d590471f2eea74600f4 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 8 Jul 2020 16:00:23 +0200 Subject: [PATCH 059/127] KVM: x86: take as_id into account when checking PGD OVMF booted guest running on shadow pages crashes on TRIPLE FAULT after enabling paging from SMM. The crash is triggered from mmu_check_root() and is caused by kvm_is_visible_gfn() searching through memslots with as_id = 0 while vCPU may be in a different context (address space). Introduce kvm_vcpu_is_visible_gfn() and use it from mmu_check_root().
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200708140023.1476020-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0011b2c97f65..231beb6d9cf7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3693,7 +3693,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { int ret = 0; - if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9edc6fc71a89..87140e79648b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -774,6 +774,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); +bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0a68c9d3d3ab..b528a59b0a84 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1626,6 +1626,14 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); +bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + + return kvm_is_visible_memslot(memslot); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); + unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { struct vm_area_struct *vma; From 632a4cf57fd1bfae24f05c71398d367ca51940f1 Mon Sep 17 00:00:00 2001 From: Like Xu 
Date: Wed, 8 Jul 2020 15:44:09 +0800 Subject: [PATCH 060/127] KVM/x86: pmu: Fix #GP condition check for RDPMC emulation In guest protected mode, if the current privilege level is not 0 and the PCE flag in the CR4 register is cleared, we will inject a #GP for RDPMC usage. Signed-off-by: Like Xu Message-Id: <20200708074409.39028-1-like.xu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b86346903f2e..67741d2a0308 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -372,6 +372,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (!pmc) return 1; + if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) && + (kvm_x86_ops.get_cpl(vcpu) != 0) && + (kvm_read_cr0(vcpu) & X86_CR0_PE)) + return 1; + *data = pmc_read_counter(pmc) & mask; return 0; } From 841c2be09fe4f495fe5224952a419bd8c7e5b455 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 8 Jul 2020 14:57:31 +0300 Subject: [PATCH 061/127] kvm: x86: replace kvm_spec_ctrl_test_value with runtime test on the host To avoid complex and in some cases incorrect logic in kvm_spec_ctrl_test_value, just try the guest's given value on the host processor instead, and if it doesn't #GP, allow the guest to set it. One such case is when host CPU supports STIBP mitigation but doesn't support IBRS (as is the case with some Zen2 AMD cpus), and in this case we were giving guest #GP when it tried to use STIBP. The reason why we can do the host test is that IA32_SPEC_CTRL msr is passed to the guest, after the guest sets it to a non-zero value for the first time (due to performance reasons), and as a result of this, it is pointless to emulate #GP condition on this first access, in a different way than what the host CPU does. This is based on a patch from Sean Christopherson, who suggested this idea.
Fixes: 6441fa6178f5 ("KVM: x86: avoid incorrect writes to host MSR_IA32_SPEC_CTRL") Cc: stable@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20200708115731.180097-1-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 38 +++++++++++++++++++++----------------- arch/x86/kvm/x86.h | 2 +- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 472544cf1de2..9b59e63567bb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2522,7 +2522,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; - if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) + if (kvm_spec_ctrl_test_value(data)) return 1; svm->spec_ctrl = data; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c6e96e2ef4d6..0d526b32f041 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2062,7 +2062,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; - if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) + if (kvm_spec_ctrl_test_value(data)) return 1; vmx->spec_ctrl = data; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c432a445cbbe..7f32169e8449 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10693,28 +10693,32 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); -u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu) + +int kvm_spec_ctrl_test_value(u64 value) { - uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD; + /* + * test that setting IA32_SPEC_CTRL to given value + * is allowed by the host processor + */ - /* The STIBP bit doesn't fault even if it's not advertised */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, 
X86_FEATURE_AMD_IBRS)) - bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) && - !boot_cpu_has(X86_FEATURE_AMD_IBRS)) - bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + u64 saved_value; + unsigned long flags; + int ret = 0; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) - bits &= ~SPEC_CTRL_SSBD; - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && - !boot_cpu_has(X86_FEATURE_AMD_SSBD)) - bits &= ~SPEC_CTRL_SSBD; + local_irq_save(flags); - return bits; + if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + ret = 1; + else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + ret = 1; + else + wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + + local_irq_restore(flags); + + return ret; } -EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits); +EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 224670d7c245..3308c3ccc0fd 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -368,7 +368,7 @@ static inline bool kvm_dr6_valid(u64 data) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); -u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); +int kvm_spec_ctrl_test_value(u64 value); int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); From 87fa7f3e98a1310ef1ac1900e7ee7f9610a038bc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:51:54 +0200 Subject: [PATCH 062/127] x86/kvm: Move context tracking where it belongs Context tracking for KVM happens way too early in the vcpu_run() code. Anything after guest_enter_irqoff() and before guest_exit_irqoff() cannot use RCU and should also be not instrumented. The current way of doing this covers way too much code. Move it closer to the actual vmenter/exit code. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Paolo Bonzini Message-Id: <20200708195321.724574345@linutronix.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 16 ++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 10 ++++++++++ arch/x86/kvm/x86.c | 2 -- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9b59e63567bb..0227c4cbe642 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3398,6 +3398,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + /* + * Tell context tracking that this CPU is about to enter guest + * mode. This has to be after x86_spec_ctrl_set_guest() because + * that can take locks (lockdep needs RCU) and calls into world and + * some more. + */ + guest_enter_irqoff(); + __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); #ifdef CONFIG_X86_64 @@ -3408,6 +3416,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(gs, svm->host.gs); #endif #endif + /* + * Tell context tracking that this CPU is back. + * + * This needs to be done before the below as native_read_msr() + * contains a tracepoint and x86_spec_ctrl_restore_host() calls + * into world and some more. + */ + guest_exit_irqoff(); /* * We do not use IBRS in the kernel. If this vCPU has used the diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0d526b32f041..4b4796d5123d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6731,6 +6731,11 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); + /* + * Tell context tracking that this CPU is about to enter guest mode. 
+ */ + guest_enter_irqoff(); + /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); @@ -6745,6 +6750,11 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.cr2 = read_cr2(); + /* + * Tell context tracking that this CPU is back. + */ + guest_exit_irqoff(); + /* * We do not use IBRS in the kernel. If this vCPU has used the * SPEC_CTRL MSR it may have left it on; save the value and diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7f32169e8449..d7d82b3c0e4c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8522,7 +8522,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - guest_enter_irqoff(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -8585,7 +8584,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_disable(); kvm_after_interrupt(vcpu); - guest_exit_irqoff(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; if (delta != S64_MIN) { From 0642391e2139a2c1b8a33f3fd816488d3f371d90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:51:55 +0200 Subject: [PATCH 063/127] x86/kvm/vmx: Add hardirq tracing to guest enter/exit Entering guest mode is more or less the same as returning to user space. From an instrumentation point of view both leave kernel mode and the transition to guest or user mode reenables interrupts on the host. In user mode an interrupt is served directly and in guest mode it causes a VM exit which then handles or reinjects the interrupt. The transition from guest mode or user mode to kernel mode disables interrupts, which needs to be recorded in instrumentation to set the correct state again. This is important for e.g. latency analysis because otherwise the execution time in guest or user mode would be wrongly accounted as interrupt disabled and could trigger false positives. 
Add hardirq tracing to guest enter/exit functions in the same way as it is done in the user mode enter/exit code, respecting the RCU requirements. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Paolo Bonzini Message-Id: <20200708195321.822002354@linutronix.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4b4796d5123d..421cff3aa5b5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6732,9 +6732,21 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); /* - * Tell context tracking that this CPU is about to enter guest mode. + * VMENTER enables interrupts (host state), but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about + * it. This is the same logic as for exit_to_user_mode(). + * + * This ensures that e.g. latency analysis on the host observes + * guest mode as interrupt enabled. + * + * guest_enter_irqoff() informs context tracking about the + * transition to guest mode and if enabled adjusts RCU state + * accordingly. */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); guest_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6751,9 +6763,20 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.cr2 = read_cr2(); /* - * Tell context tracking that this CPU is back. + * VMEXIT disables interrupts (host state), but tracing and lockdep + * have them in state 'on' as recorded before entering guest mode. + * Same as enter_from_user_mode(). + * + * guest_exit_irqoff() restores host context and reinstates RCU if + * enabled and required. 
+ * + * This needs to be done before the below as native_read_msr() + * contains a tracepoint and x86_spec_ctrl_restore_host() calls + * into world and some more. */ + lockdep_hardirqs_off(CALLER_ADDR0); guest_exit_irqoff(); + trace_hardirqs_off_finish(); /* * We do not use IBRS in the kernel. If this vCPU has used the From 9fc975e9efd03e57c9599e0fc07c8b264ad8d5b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:51:56 +0200 Subject: [PATCH 064/127] x86/kvm/svm: Add hardirq tracing on guest enter/exit Entering guest mode is more or less the same as returning to user space. From an instrumentation point of view both leave kernel mode and the transition to guest or user mode reenables interrupts on the host. In user mode an interrupt is served directly and in guest mode it causes a VM exit which then handles or reinjects the interrupt. The transition from guest mode or user mode to kernel mode disables interrupts, which needs to be recorded in instrumentation to set the correct state again. This is important for e.g. latency analysis because otherwise the execution time in guest or user mode would be wrongly accounted as interrupt disabled and could trigger false positives. Add hardirq tracing to guest enter/exit functions in the same way as it is done in the user mode enter/exit code, respecting the RCU requirements. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Paolo Bonzini Message-Id: <20200708195321.934715094@linutronix.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0227c4cbe642..23bac92f5b27 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3399,12 +3399,21 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); /* - * Tell context tracking that this CPU is about to enter guest - * mode. This has to be after x86_spec_ctrl_set_guest() because - * that can take locks (lockdep needs RCU) and calls into world and - * some more. + * VMENTER enables interrupts (host state), but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about + * it. This is the same logic as for exit_to_user_mode(). + * + * This ensures that e.g. latency analysis on the host observes + * guest mode as interrupt enabled. + * + * guest_enter_irqoff() informs context tracking about the + * transition to guest mode and if enabled adjusts RCU state + * accordingly. */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); guest_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); @@ -3416,14 +3425,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(gs, svm->host.gs); #endif #endif + /* - * Tell context tracking that this CPU is back. + * VMEXIT disables interrupts (host state), but tracing and lockdep + * have them in state 'on' as recorded before entering guest mode. + * Same as enter_from_user_mode(). + * + * guest_exit_irqoff() restores host context and reinstates RCU if + * enabled and required. 
* * This needs to be done before the below as native_read_msr() * contains a tracepoint and x86_spec_ctrl_restore_host() calls * into world and some more. */ + lockdep_hardirqs_off(CALLER_ADDR0); guest_exit_irqoff(); + trace_hardirqs_off_finish(); /* * We do not use IBRS in the kernel. If this vCPU has used the From 3ebccdf373c21d8697782b7e8d5af0adc9c26e04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:51:57 +0200 Subject: [PATCH 065/127] x86/kvm/vmx: Move guest enter/exit into .noinstr.text Move the functions which are inside the RCU off region into the non-instrumentable text section. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Paolo Bonzini Message-Id: <20200708195322.037311579@linutronix.de> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hardirq.h | 4 +- arch/x86/include/asm/kvm_host.h | 8 +++ arch/x86/kvm/vmx/ops.h | 4 ++ arch/x86/kvm/vmx/vmenter.S | 5 +- arch/x86/kvm/vmx/vmx.c | 111 ++++++++++++++++++-------------- arch/x86/kvm/x86.c | 2 +- 6 files changed, 81 insertions(+), 53 deletions(-) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 07533795b8d2..275e7fd20310 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -67,12 +67,12 @@ static inline void kvm_set_cpu_l1tf_flush_l1d(void) __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); } -static inline void kvm_clear_cpu_l1tf_flush_l1d(void) +static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0); } -static inline bool kvm_get_cpu_l1tf_flush_l1d(void) +static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void) { return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 86c719d2b755..3d7d818a282c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1598,7 +1598,15 @@ asmlinkage 
void kvm_spurious_fault(void); insn "\n\t" \ "jmp 668f \n\t" \ "667: \n\t" \ + "1: \n\t" \ + ".pushsection .discard.instr_begin \n\t" \ + ".long 1b - . \n\t" \ + ".popsection \n\t" \ "call kvm_spurious_fault \n\t" \ + "1: \n\t" \ + ".pushsection .discard.instr_end \n\t" \ + ".long 1b - . \n\t" \ + ".popsection \n\t" \ "668: \n\t" \ _ASM_EXTABLE(666b, 667b) diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 5f1ac002b4b6..692b0c31c9c8 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -146,7 +146,9 @@ do { \ : : op1 : "cc" : error, fault); \ return; \ error: \ + instrumentation_begin(); \ insn##_error(error_args); \ + instrumentation_end(); \ return; \ fault: \ kvm_spurious_fault(); \ @@ -161,7 +163,9 @@ do { \ : : op1, op2 : "cc" : error, fault); \ return; \ error: \ + instrumentation_begin(); \ insn##_error(error_args); \ + instrumentation_end(); \ return; \ fault: \ kvm_spurious_fault(); \ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e0a182cb3cdd..799db084a336 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -27,7 +27,7 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif - .text +.section .noinstr.text, "ax" /** * vmx_vmenter - VM-Enter the current loaded VMCS @@ -234,6 +234,9 @@ SYM_FUNC_START(__vmx_vcpu_run) jmp 1b SYM_FUNC_END(__vmx_vcpu_run) + +.section .text, "ax" + /** * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 421cff3aa5b5..e71a3d982781 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6116,7 +6116,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * information but as all relevant affected CPUs have 32KiB L1D cache size * there is no point in doing so. 
*/ -static void vmx_l1d_flush(struct kvm_vcpu *vcpu) +static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) { int size = PAGE_SIZE << L1D_CACHE_ORDER; @@ -6149,7 +6149,7 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu) vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } @@ -6635,7 +6635,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) } } -void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) +void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) { if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { vmx->loaded_vmcs->host_state.rsp = host_rsp; @@ -6657,6 +6657,63 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); +static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, + struct vcpu_vmx *vmx) +{ + /* + * VMENTER enables interrupts (host state), but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about + * it. This is the same logic as for exit_to_user_mode(). + * + * This ensures that e.g. latency analysis on the host observes + * guest mode as interrupt enabled. + * + * guest_enter_irqoff() informs context tracking about the + * transition to guest mode and if enabled adjusts RCU state + * accordingly. 
+ */ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); + + /* L1D Flush includes CPU buffer clear to mitigate MDS */ + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + else if (static_branch_unlikely(&mds_user_clear)) + mds_clear_cpu_buffers(); + + if (vcpu->arch.cr2 != read_cr2()) + write_cr2(vcpu->arch.cr2); + + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + vmx->loaded_vmcs->launched); + + vcpu->arch.cr2 = read_cr2(); + + /* + * VMEXIT disables interrupts (host state), but tracing and lockdep + * have them in state 'on' as recorded before entering guest mode. + * Same as enter_from_user_mode(). + * + * guest_exit_irqoff() restores host context and reinstates RCU if + * enabled and required. + * + * This needs to be done before the below as native_read_msr() + * contains a tracepoint and x86_spec_ctrl_restore_host() calls + * into world and some more. + */ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { fastpath_t exit_fastpath; @@ -6731,52 +6788,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - /* - * VMENTER enables interrupts (host state), but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about - * it. This is the same logic as for exit_to_user_mode(). - * - * This ensures that e.g. latency analysis on the host observes - * guest mode as interrupt enabled. - * - * guest_enter_irqoff() informs context tracking about the - * transition to guest mode and if enabled adjusts RCU state - * accordingly. 
- */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - guest_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); - - /* L1D Flush includes CPU buffer clear to mitigate MDS */ - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mds_user_clear)) - mds_clear_cpu_buffers(); - - if (vcpu->arch.cr2 != read_cr2()) - write_cr2(vcpu->arch.cr2); - - vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - vmx->loaded_vmcs->launched); - - vcpu->arch.cr2 = read_cr2(); - - /* - * VMEXIT disables interrupts (host state), but tracing and lockdep - * have them in state 'on' as recorded before entering guest mode. - * Same as enter_from_user_mode(). - * - * guest_exit_irqoff() restores host context and reinstates RCU if - * enabled and required. - * - * This needs to be done before the below as native_read_msr() - * contains a tracepoint and x86_spec_ctrl_restore_host() calls - * into world and some more. - */ - lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); - trace_hardirqs_off_finish(); + /* The actual VMENTER/EXIT is in the .noinstr.text section. */ + vmx_vcpu_enter_exit(vcpu, vmx); /* * We do not use IBRS in the kernel. If this vCPU has used the diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d7d82b3c0e4c..e27d3db7e43f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -402,7 +402,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible void kvm_spurious_fault(void) +asmlinkage __visible noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. 
*/ BUG_ON(!kvm_rebooting); From 135961e0a7d555fc8f1d7c89ad44a94dffa5dcd8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:51:58 +0200 Subject: [PATCH 066/127] x86/kvm/svm: Move guest enter/exit into .noinstr.text Move the functions which are inside the RCU off region into the non-instrumentable text section. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Paolo Bonzini Message-Id: <20200708195322.144607767@linutronix.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 98 +++++++++++++++++++++----------------- arch/x86/kvm/svm/vmenter.S | 2 +- 2 files changed, 56 insertions(+), 44 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 23bac92f5b27..71500e865c94 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3343,6 +3343,60 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); +static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, + struct vcpu_svm *svm) +{ + /* + * VMENTER enables interrupts (host state), but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about + * it. This is the same logic as for exit_to_user_mode(). + * + * This ensures that e.g. latency analysis on the host observes + * guest mode as interrupt enabled. + * + * guest_enter_irqoff() informs context tracking about the + * transition to guest mode and if enabled adjusts RCU state + * accordingly. 
+ */ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); + + __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); + +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, svm->host.gs_base); +#else + loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS + loadsegment(gs, svm->host.gs); +#endif +#endif + + /* + * VMEXIT disables interrupts (host state), but tracing and lockdep + * have them in state 'on' as recorded before entering guest mode. + * Same as enter_from_user_mode(). + * + * guest_exit_irqoff() restores host context and reinstates RCU if + * enabled and required. + * + * This needs to be done before the below as native_read_msr() + * contains a tracepoint and x86_spec_ctrl_restore_host() calls + * into world and some more. + */ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { fastpath_t exit_fastpath; @@ -3398,49 +3452,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - /* - * VMENTER enables interrupts (host state), but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about - * it. This is the same logic as for exit_to_user_mode(). - * - * This ensures that e.g. latency analysis on the host observes - * guest mode as interrupt enabled. - * - * guest_enter_irqoff() informs context tracking about the - * transition to guest mode and if enabled adjusts RCU state - * accordingly. 
- */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - guest_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); - - __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); - -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif - - /* - * VMEXIT disables interrupts (host state), but tracing and lockdep - * have them in state 'on' as recorded before entering guest mode. - * Same as enter_from_user_mode(). - * - * guest_exit_irqoff() restores host context and reinstates RCU if - * enabled and required. - * - * This needs to be done before the below as native_read_msr() - * contains a tracepoint and x86_spec_ctrl_restore_host() calls - * into world and some more. - */ - lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); - trace_hardirqs_off_finish(); + svm_vcpu_enter_exit(vcpu, svm); /* * We do not use IBRS in the kernel. If this vCPU has used the diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index bf944334003a..1ec1ac40e328 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -27,7 +27,7 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif - .text +.section .noinstr.text, "ax" /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode From c3f08ed15047002677b8b1f9f5bc29ced3f87a6f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:51:59 +0200 Subject: [PATCH 067/127] x86/kvm/svm: Use uninstrumented wrmsrl() to restore GS On guest exit MSR_GS_BASE contains whatever the guest wrote to it and the first action after returning from the ASM code is to set it to the host kernel value. This uses wrmsrl() which is interesting at least. wrmsrl() is either using native_write_msr() or the paravirt variant. The XEN_PV code is uninteresting as nested SVM in a XEN_PV guest does not work. 
But native_write_msr() can be placed out of line by the compiler especially when paravirtualization is enabled in the kernel configuration. The function is marked notrace, but still can be probed if CONFIG_KPROBE_EVENTS_ON_NOTRACE is enabled. That would be a fatal problem as kprobe events use per-CPU variables which are GS based and would be accessed with the guest GS. Depending on the GS value this would either explode in colorful ways or lead to completely undebuggable data corruption. Aside from that native_write_msr() contains a tracepoint which objtool complains about as it is invoked from the noinstr section. As this cannot run inside a XEN_PV guest there is no point in using wrmsrl(). Use native_wrmsrl() instead which is just a plain native WRMSR without tracing or anything else attached. Signed-off-by: Thomas Gleixner Acked-by: Juergen Gross Message-Id: <20200708195322.244847377@linutronix.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 71500e865c94..535ad311ad02 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3369,7 +3369,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); + native_wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else loadsegment(fs, svm->host.fs); #ifndef CONFIG_X86_32_LAZY_GS From 2245d39886fad920f4ea3583e2fb29bf01fc01d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:52:00 +0200 Subject: [PATCH 068/127] x86/kvm/vmx: Use native read/write_cr2() read/write_cr2() go through the paravirt XXL indirection, but nested VMX in a XEN_PV guest is not supported. Use the native variants.
Signed-off-by: Thomas Gleixner Message-Id: <20200708195322.344731916@linutronix.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e71a3d982781..2b41d987b101 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6686,13 +6686,13 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, else if (static_branch_unlikely(&mds_user_clear)) mds_clear_cpu_buffers(); - if (vcpu->arch.cr2 != read_cr2()) - write_cr2(vcpu->arch.cr2); + if (vcpu->arch.cr2 != native_read_cr2()) + native_write_cr2(vcpu->arch.cr2); vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, vmx->loaded_vmcs->launched); - vcpu->arch.cr2 = read_cr2(); + vcpu->arch.cr2 = native_read_cr2(); /* * VMEXIT disables interrupts (host state), but tracing and lockdep From 5962bfb748f8ad90e002411ea4056a15c63b5151 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:25 -0700 Subject: [PATCH 069/127] KVM: x86/mmu: Track the associated kmem_cache in the MMU caches Track the kmem_cache used for non-page KVM MMU memory caches instead of passing in the associated kmem_cache when filling the cache. This will allow consolidating code and other cleanups. No functional change intended. 
Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 24 +++++++++++------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3d7d818a282c..6b4efe1c15ad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -251,6 +251,7 @@ struct kvm_kernel_irq_routing_entry; */ struct kvm_mmu_memory_cache { int nobjs; + struct kmem_cache *kmem_cache; void *objects[KVM_NR_MEM_OBJS]; }; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 231beb6d9cf7..417539066093 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1061,15 +1061,14 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min) { void *obj; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT); + obj = kmem_cache_zalloc(cache->kmem_cache, GFP_KERNEL_ACCOUNT); if (!obj) return cache->nobjs >= min ? 
0 : -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -1082,11 +1081,10 @@ static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) return cache->nobjs; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, - struct kmem_cache *cache) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) - kmem_cache_free(cache, mc->objects[--mc->nobjs]); + kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -1116,25 +1114,22 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) int r; r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); + 8 + PTE_PREFETCH_NUM); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); if (r) goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); + r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 4); out: return r; } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) @@ -5705,6 +5700,9 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) uint i; int ret; + vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; + vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; + vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; From 45177cccd9c9a8c540ceb04c4faf451ba54df85e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:26 -0700 Subject: [PATCH 070/127] KVM: 
x86/mmu: Consolidate "page" variant of memory cache helpers Drop the "page" variants of the topup/free memory cache helpers, using the existence of an associated kmem_cache to select the correct alloc or free routine. No functional change intended. Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 417539066093..317a2c551198 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1068,7 +1068,10 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min) if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(cache->kmem_cache, GFP_KERNEL_ACCOUNT); + if (cache->kmem_cache) + obj = kmem_cache_zalloc(cache->kmem_cache, GFP_KERNEL_ACCOUNT); + else + obj = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); if (!obj) return cache->nobjs >= min ? 0 : -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -1083,30 +1086,12 @@ static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { - while (mc->nobjs) - kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - void *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); - if (!page) - return cache->nobjs >= min ? 
0 : -ENOMEM; - cache->objects[cache->nobjs++] = page; + while (mc->nobjs) { + if (mc->kmem_cache) + kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); + else + free_page((unsigned long)mc->objects[--mc->nobjs]); } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) @@ -1117,7 +1102,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) 8 + PTE_PREFETCH_NUM); if (r) goto out; - r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, 8); if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 4); @@ -1128,7 +1113,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); - mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } From 356ec69adfc8c8c82fdc0dd7dc81408f075c784a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:27 -0700 Subject: [PATCH 071/127] KVM: x86/mmu: Use consistent "mc" name for kvm_mmu_memory_cache locals Use "mc" for local variables to shorten line lengths and provide consistent names, which will be especially helpful when some of the helpers are moved to common KVM code in future patches. No functional change intended. 
Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 317a2c551198..26f0c82bf801 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1061,27 +1061,27 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min) +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) { void *obj; - if (cache->nobjs >= min) + if (mc->nobjs >= min) return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - if (cache->kmem_cache) - obj = kmem_cache_zalloc(cache->kmem_cache, GFP_KERNEL_ACCOUNT); + while (mc->nobjs < ARRAY_SIZE(mc->objects)) { + if (mc->kmem_cache) + obj = kmem_cache_zalloc(mc->kmem_cache, GFP_KERNEL_ACCOUNT); else obj = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); if (!obj) - return cache->nobjs >= min ? 0 : -ENOMEM; - cache->objects[cache->nobjs++] = obj; + return mc->nobjs >= min ? 
0 : -ENOMEM; + mc->objects[mc->nobjs++] = obj; } return 0; } -static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) +static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *mc) { - return cache->nobjs; + return mc->nobjs; } static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) @@ -1396,10 +1396,10 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, static bool rmap_can_add(struct kvm_vcpu *vcpu) { - struct kvm_mmu_memory_cache *cache; + struct kvm_mmu_memory_cache *mc; - cache = &vcpu->arch.mmu_pte_list_desc_cache; - return mmu_memory_cache_free_objects(cache); + mc = &vcpu->arch.mmu_pte_list_desc_cache; + return mmu_memory_cache_free_objects(mc); } static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) From 284aa868688ac87d0eac7792b22b9c05f7a3cc45 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:28 -0700 Subject: [PATCH 072/127] KVM: x86/mmu: Remove superfluous gotos from mmu_topup_memory_caches() Return errors directly from mmu_topup_memory_caches() instead of branching to a label that does the same. No functional change intended. 
Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 26f0c82bf801..7fae1c395cbd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1101,13 +1101,11 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); if (r) - goto out; + return r; r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, 8); if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 4); -out: - return r; + return r; + return mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 4); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) From 53a3f4877152fe1c1d6c499a49bf573b60f5dc41 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:29 -0700 Subject: [PATCH 073/127] KVM: x86/mmu: Try to avoid crashing KVM if a MMU memory cache is empty Attempt to allocate a new object instead of crashing KVM (and likely the kernel) if a memory cache is unexpectedly empty. Use GFP_ATOMIC for the allocation as the caches are used while holding mmu_lock. The immediate BUG_ON() makes the code unnecessarily explosive and led to confusing minimums being used in the past, e.g. allocating 4 objects where 1 would suffice. 
Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7fae1c395cbd..a8eddb83c6f3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1061,6 +1061,15 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } +static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, + gfp_t gfp_flags) +{ + if (mc->kmem_cache) + return kmem_cache_zalloc(mc->kmem_cache, gfp_flags); + else + return (void *)__get_free_page(gfp_flags); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) { void *obj; @@ -1068,10 +1077,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) if (mc->nobjs >= min) return 0; while (mc->nobjs < ARRAY_SIZE(mc->objects)) { - if (mc->kmem_cache) - obj = kmem_cache_zalloc(mc->kmem_cache, GFP_KERNEL_ACCOUNT); - else - obj = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT); if (!obj) return mc->nobjs >= min ? 
0 : -ENOMEM; mc->objects[mc->nobjs++] = obj; @@ -1119,8 +1125,11 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; + if (WARN_ON(!mc->nobjs)) + p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); + else + p = mc->objects[--mc->nobjs]; + BUG_ON(!p); return p; } From 832914452a9638b713a3ea9a490cbc18f3b164f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:30 -0700 Subject: [PATCH 074/127] KVM: x86/mmu: Move fast_page_fault() call above mmu_topup_memory_caches() Avoid refilling the memory caches and potentially slow reclaim/swap when handling a fast page fault, which does not need to allocate any new objects. Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-7-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a8eddb83c6f3..d851f8cf2bcc 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4120,6 +4120,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; + if (fast_page_fault(vcpu, gpa, error_code)) + return RET_PF_RETRY; + r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -4127,9 +4130,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (lpage_disallowed) max_level = PG_LEVEL_4K; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); From f3747a5a9e5a26934da8c956ceef42639cbc623c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:31 -0700 Subject: [PATCH 075/127] KVM: x86/mmu: Topup memory caches after walking GVA->GPA Topup memory caches after walking the GVA->GPA translation during 
a shadow page fault; there is no need to ensure the caches are full when walking the GVA. As of commit f5a1e9f89504f ("KVM: MMU: remove call to kvm_mmu_pte_write from walk_addr"), the FNAME(walk_addr) flow no longer adds rmaps via kvm_mmu_pte_write(). This avoids allocating memory in the case that the GVA is unmapped in the guest, and also provides a paper trail of why/when the memory caches need to be filled. Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-8-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0981b84c95e4..e3e9f9227372 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -789,10 +789,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - /* * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. @@ -820,6 +816,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, return RET_PF_EMULATE; } + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, From 531281ad98ba7e681e0f8fa3c6d216032a08a123 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:32 -0700 Subject: [PATCH 076/127] KVM: x86/mmu: Clean up the gorilla math in mmu_topup_memory_caches() Clean up the minimums in mmu_topup_memory_caches() to document the driving mechanisms behind the minimums. Now that encountering an empty cache is unlikely to trigger BUG_ON(), it is less dangerous to be more precise when defining the minimums.
For rmaps, the logic is 1 parent PTE per level, plus a single rmap, and prefetched rmaps. The extra objects in the current '8 + PREFETCH' minimum came about due to an abundance of paranoia in commit c41ef344de212 ("KVM: MMU: increase per-vcpu rmap cache alloc size"), i.e. it could have increased the minimum to 2 rmaps. Furthermore, the unexpected extra rmap case was killed off entirely by commits f759e2b4c728c ("KVM: MMU: avoid pte_list_desc running out in kvm_mmu_pte_write") and f5a1e9f89504f ("KVM: MMU: remove call to kvm_mmu_pte_write from walk_addr"). For the so-called page cache, replace '8' with 2*PT64_ROOT_MAX_LEVEL. The 2x multiplier is needed because the cache is used for both shadow pages and gfn arrays for indirect MMUs. And finally, for page headers, replace '4' with PT64_ROOT_MAX_LEVEL. Note, KVM now supports 5-level paging, i.e. the old minimums that used a baseline derived from 4-level paging were technically wrong. But, KVM always allocates roots in a separate flow, e.g. it's impossible in the current implementation to actually need 5 new shadow pages in a single flow. Use PT64_ROOT_MAX_LEVEL unmodified instead of subtracting 1, as the direct usage is likely more intuitive to uninformed readers, and the inflated minimum is unlikely to affect functionality in practice. Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-9-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d851f8cf2bcc..be6ca862531f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1104,14 +1104,17 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) { int r; + /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps.
*/ r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - 8 + PTE_PREFETCH_NUM); + 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) return r; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, 8); + r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, + 2 * PT64_ROOT_MAX_LEVEL); if (r) return r; - return mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 4); + return mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + PT64_ROOT_MAX_LEVEL); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) From 171a90d70f25c2501162aef5f42eba5fc91c054a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:33 -0700 Subject: [PATCH 077/127] KVM: x86/mmu: Separate the memory caches for shadow pages and gfn arrays Use separate caches for allocating shadow pages versus gfn arrays. This sets the stage for specifying __GFP_ZERO when allocating shadow pages without incurring extra cost for gfn arrays. No functional change intended. Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-10-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/mmu/mmu.c | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6b4efe1c15ad..01f5c1508851 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -602,7 +602,8 @@ struct kvm_vcpu_arch { struct kvm_mmu *walk_mmu; struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; - struct kvm_mmu_memory_cache mmu_page_cache; + struct kvm_mmu_memory_cache mmu_shadow_page_cache; + struct kvm_mmu_memory_cache mmu_gfn_array_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index be6ca862531f..a8fc17ac7ed0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1109,8 +1109,12 @@ static int 
mmu_topup_memory_caches(struct kvm_vcpu *vcpu) 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) return r; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, - 2 * PT64_ROOT_MAX_LEVEL); + r = mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, + PT64_ROOT_MAX_LEVEL); + if (r) + return r; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + PT64_ROOT_MAX_LEVEL); if (r) return r; return mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, @@ -1120,7 +1124,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } @@ -2082,9 +2087,9 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct struct kvm_mmu_page *sp; sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); /* From 5f6078f9f1e32b395a78c8d0c0f6598004c668de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:34 -0700 Subject: [PATCH 078/127] KVM: x86/mmu: Make __GFP_ZERO a property of the memory cache Add a gfp_zero flag to 'struct kvm_mmu_memory_cache' and use it to control __GFP_ZERO instead of hardcoding a call to kmem_cache_zalloc(). A future patch needs such a flag for the __get_free_page() path, as gfn arrays do not need/want the allocator to zero the memory. 
Convert the kmem_cache paths to __GFP_ZERO now so as to avoid a weird and inconsistent API in the future. No functional change intended. Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-11-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 01f5c1508851..9d41eb5a8453 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -251,6 +251,7 @@ struct kvm_kernel_irq_routing_entry; */ struct kvm_mmu_memory_cache { int nobjs; + gfp_t gfp_zero; struct kmem_cache *kmem_cache; void *objects[KVM_NR_MEM_OBJS]; }; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a8fc17ac7ed0..6e57044ae848 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1064,8 +1064,10 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, gfp_t gfp_flags) { + gfp_flags |= mc->gfp_zero; + if (mc->kmem_cache) - return kmem_cache_zalloc(mc->kmem_cache, gfp_flags); + return kmem_cache_alloc(mc->kmem_cache, gfp_flags); else return (void *)__get_free_page(gfp_flags); } @@ -5701,7 +5703,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) int ret; vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; + vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; + vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; From 9688088378d41d7918ee50fc89b6f884212aaa22 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:35 -0700 Subject: [PATCH 079/127] KVM: x86/mmu: Zero allocate shadow pages (outside of mmu_lock) Set 
__GFP_ZERO for the shadow page memory cache and drop the explicit clear_page() from kvm_mmu_get_page(). This moves the cost of zeroing a page to the allocation time of the physical page, i.e. when topping up the memory caches, and thus avoids having to zero out an entire page while holding mmu_lock. Cc: Peter Feiner Cc: Peter Shier Cc: Junaid Shahid Cc: Jim Mattson Suggested-by: Ben Gardon Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-12-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6e57044ae848..a9fb427107b2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2535,7 +2535,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PG_LEVEL_4K && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); @@ -5708,6 +5707,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; From 378f5cd64aefd7e8527d40cc69d3aea275521fbc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:36 -0700 Subject: [PATCH 080/127] KVM: x86/mmu: Skip filling the gfn cache for guaranteed direct MMU topups Don't bother filling the gfn array cache when the caller is a fully direct MMU, i.e. won't need a gfn array for shadow pages. 
Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-13-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 18 ++++++++++-------- arch/x86/kvm/mmu/paging_tmpl.h | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a9fb427107b2..3d477a305d28 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1102,7 +1102,7 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) } } -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) { int r; @@ -1115,10 +1115,12 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) PT64_ROOT_MAX_LEVEL); if (r) return r; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, - PT64_ROOT_MAX_LEVEL); - if (r) - return r; + if (maybe_indirect) { + r = mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + PT64_ROOT_MAX_LEVEL); + if (r) + return r; + } return mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, PT64_ROOT_MAX_LEVEL); } @@ -4132,7 +4134,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; - r = mmu_topup_memory_caches(vcpu); + r = mmu_topup_memory_caches(vcpu, false); if (r) return r; @@ -5168,7 +5170,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - r = mmu_topup_memory_caches(vcpu); + r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); if (r) goto out; r = mmu_alloc_roots(vcpu); @@ -5362,7 +5364,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * or not since pte prefetch is skiped if it does not have * enough objects in the cache. 
*/ - mmu_topup_memory_caches(vcpu); + mmu_topup_memory_caches(vcpu, true); spin_lock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index e3e9f9227372..0172a949f6a7 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -816,7 +816,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, return RET_PF_EMULATE; } - r = mmu_topup_memory_caches(vcpu); + r = mmu_topup_memory_caches(vcpu, true); if (r) return r; @@ -904,7 +904,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) * No need to check return value here, rmap_can_add() can * help us to skip pte prefetch later. */ - mmu_topup_memory_caches(vcpu); + mmu_topup_memory_caches(vcpu, true); if (!VALID_PAGE(root_hpa)) { WARN_ON(1); From 94ce87ef8177e436c5ee46b211c4e0cf490ae389 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:37 -0700 Subject: [PATCH 081/127] KVM: x86/mmu: Prepend "kvm_" to memory cache helpers that will be global Rename the memory helpers that will soon be moved to common code and be made globally available via linux/kvm_host.h. "mmu" alone is not a sufficient namespace for globally available KVM symbols. Opportunistically add "nr_" in mmu_memory_cache_free_objects() to make it clear the function returns the number of free objects, as opposed to freeing existing objects.
Suggested-by: Christoffer Dall Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-14-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3d477a305d28..942b6a90cb17 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1072,7 +1072,7 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, return (void *)__get_free_page(gfp_flags); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) +static int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) { void *obj; @@ -1087,12 +1087,12 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) return 0; } -static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *mc) +static int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) { return mc->nobjs; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +static void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) { if (mc->kmem_cache) @@ -1107,33 +1107,33 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) int r; /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. 
*/ - r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, + 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) return r; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, - PT64_ROOT_MAX_LEVEL); + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, + PT64_ROOT_MAX_LEVEL); if (r) return r; if (maybe_indirect) { - r = mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, - PT64_ROOT_MAX_LEVEL); + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + PT64_ROOT_MAX_LEVEL); if (r) return r; } - return mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - PT64_ROOT_MAX_LEVEL); + return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + PT64_ROOT_MAX_LEVEL); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) +static void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; @@ -1147,7 +1147,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); + return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1418,7 +1418,7 @@ static bool 
rmap_can_add(struct kvm_vcpu *vcpu) struct kvm_mmu_memory_cache *mc; mc = &vcpu->arch.mmu_pte_list_desc_cache; - return mmu_memory_cache_free_objects(mc); + return kvm_mmu_memory_cache_nr_free_objects(mc); } static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -2090,10 +2090,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); + sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); /* From 2aa9c199cf8151c190c7e7ca3ddfcfbb2d85ac36 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:38 -0700 Subject: [PATCH 082/127] KVM: Move x86's version of struct kvm_mmu_memory_cache to common code Move x86's 'struct kvm_mmu_memory_cache' to common code in anticipation of moving the entire x86 implementation code to common KVM and reusing it for arm64 and MIPS. Add a new architecture specific asm/kvm_types.h to control the existence and parameters of the struct. The new header is needed to avoid a chicken-and-egg problem with asm/kvm_host.h as all architectures define instances of the struct in their vCPU structs. Add an asm-generic version of kvm_types.h to avoid having empty files on PPC and s390 in the long term, and for arm64 and mips in the short term. 
Suggested-by: Christoffer Dall Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-15-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/Kbuild | 1 + arch/mips/include/asm/Kbuild | 1 + arch/powerpc/include/asm/Kbuild | 1 + arch/s390/include/asm/Kbuild | 1 + arch/x86/include/asm/kvm_host.h | 13 ------------- arch/x86/include/asm/kvm_types.h | 7 +++++++ include/asm-generic/kvm_types.h | 5 +++++ include/linux/kvm_types.h | 19 +++++++++++++++++++ 8 files changed, 35 insertions(+), 13 deletions(-) create mode 100644 arch/x86/include/asm/kvm_types.h create mode 100644 include/asm-generic/kvm_types.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index ff9cbb631212..35a68155cd0e 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += early_ioremap.h +generic-y += kvm_types.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += qrwlock.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 8643d313890e..397e6d24d2ab 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -5,6 +5,7 @@ generated-y += syscall_table_64_n32.h generated-y += syscall_table_64_n64.h generated-y += syscall_table_64_o32.h generic-y += export.h +generic-y += kvm_types.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += parport.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index dadbcf3a0b1e..2d444d09b553 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -4,6 +4,7 @@ generated-y += syscall_table_64.h generated-y += syscall_table_c32.h generated-y += syscall_table_spu.h generic-y += export.h +generic-y += kvm_types.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += vtime.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 
83f6e85de7bc..319efa0e6d02 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -6,5 +6,6 @@ generated-y += unistd_nr.h generic-y += asm-offsets.h generic-y += export.h +generic-y += kvm_types.h generic-y += local64.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9d41eb5a8453..5aaef036627f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -193,8 +193,6 @@ struct x86_exception; enum x86_intercept; enum x86_intercept_stage; -#define KVM_NR_MEM_OBJS 40 - #define KVM_NR_DB_REGS 4 #define DR6_BD (1 << 13) @@ -245,17 +243,6 @@ enum x86_intercept_stage; struct kvm_kernel_irq_routing_entry; -/* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. - */ -struct kvm_mmu_memory_cache { - int nobjs; - gfp_t gfp_zero; - struct kmem_cache *kmem_cache; - void *objects[KVM_NR_MEM_OBJS]; -}; - /* * the pages used as guest page table on soft mmu are tracked by * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h new file mode 100644 index 000000000000..08f1b57d3b62 --- /dev/null +++ b/arch/x86/include/asm/kvm_types.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_KVM_TYPES_H +#define _ASM_X86_KVM_TYPES_H + +#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 + +#endif /* _ASM_X86_KVM_TYPES_H */ diff --git a/include/asm-generic/kvm_types.h b/include/asm-generic/kvm_types.h new file mode 100644 index 000000000000..2a82daf110f1 --- /dev/null +++ b/include/asm-generic/kvm_types.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_KVM_TYPES_H +#define _ASM_GENERIC_KVM_TYPES_H + +#endif diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 68e84cf42a3f..a7580f69dda0 100644 --- a/include/linux/kvm_types.h +++ 
b/include/linux/kvm_types.h @@ -20,6 +20,8 @@ enum kvm_mr_change; #include +#include + /* * Address types: * @@ -58,4 +60,21 @@ struct gfn_to_pfn_cache { bool dirty; }; +#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE +/* + * Memory caches are used to preallocate memory ahead of various MMU flows, + * e.g. page fault handlers. Gracefully handling allocation failures deep in + * MMU flows is problematic, as is triggering reclaim, I/O, etc... while + * holding MMU locks. Note, these caches act more like prefetch buffers than + * classical caches, i.e. objects are not returned to the cache on being freed. + */ +struct kvm_mmu_memory_cache { + int nobjs; + gfp_t gfp_zero; + struct kmem_cache *kmem_cache; + void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE]; +}; +#endif + + #endif /* __KVM_TYPES_H__ */ From 6926f95accee3f8ceb5f69dbecd880687028ae70 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:39 -0700 Subject: [PATCH 083/127] KVM: Move x86's MMU memory cache helpers to common KVM code Move x86's memory cache helpers to common KVM code so that they can be reused by arm64 and MIPS in future patches. 
Suggested-by: Christoffer Dall Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-16-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 53 -------------------------------------- include/linux/kvm_host.h | 7 +++++ virt/kvm/kvm_main.c | 55 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 53 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 942b6a90cb17..fa506aaaf019 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1061,47 +1061,6 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } -static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, - gfp_t gfp_flags) -{ - gfp_flags |= mc->gfp_zero; - - if (mc->kmem_cache) - return kmem_cache_alloc(mc->kmem_cache, gfp_flags); - else - return (void *)__get_free_page(gfp_flags); -} - -static int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) -{ - void *obj; - - if (mc->nobjs >= min) - return 0; - while (mc->nobjs < ARRAY_SIZE(mc->objects)) { - obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT); - if (!obj) - return mc->nobjs >= min ? 
0 : -ENOMEM; - mc->objects[mc->nobjs++] = obj; - } - return 0; -} - -static int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) -{ - return mc->nobjs; -} - -static void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) { - if (mc->kmem_cache) - kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); - else - free_page((unsigned long)mc->objects[--mc->nobjs]); - } -} - static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) { int r; @@ -1133,18 +1092,6 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } -static void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - if (WARN_ON(!mc->nobjs)) - p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); - else - p = mc->objects[--mc->nobjs]; - BUG_ON(!p); - return p; -} - static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 87140e79648b..989afcbe642f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -817,6 +817,13 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); +#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE +int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min); +int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc); +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); +void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); +#endif + bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b528a59b0a84..2c2c0254c2d8 100644 --- 
a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -340,6 +340,61 @@ void kvm_reload_remote_mmus(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } +#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE +static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, + gfp_t gfp_flags) +{ + gfp_flags |= mc->gfp_zero; + + if (mc->kmem_cache) + return kmem_cache_alloc(mc->kmem_cache, gfp_flags); + else + return (void *)__get_free_page(gfp_flags); +} + +int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) +{ + void *obj; + + if (mc->nobjs >= min) + return 0; + while (mc->nobjs < ARRAY_SIZE(mc->objects)) { + obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT); + if (!obj) + return mc->nobjs >= min ? 0 : -ENOMEM; + mc->objects[mc->nobjs++] = obj; + } + return 0; +} + +int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) +{ + return mc->nobjs; +} + +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) { + if (mc->kmem_cache) + kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); + else + free_page((unsigned long)mc->objects[--mc->nobjs]); + } +} + +void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) +{ + void *p; + + if (WARN_ON(!mc->nobjs)) + p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); + else + p = mc->objects[--mc->nobjs]; + BUG_ON(!p); + return p; +} +#endif + static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { mutex_init(&vcpu->mutex); From 61ffb3a50c4402e17ead1257914ee98fd8c6f2f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:40 -0700 Subject: [PATCH 084/127] KVM: arm64: Drop @max param from mmu_topup_memory_cache() Replace the @max param in mmu_topup_memory_cache() and instead use ARRAY_SIZE() to terminate the loop to fill the cache. This removes a BUG_ON() and sets the stage for moving arm64 to the common memory cache implementation. No functional change intended. 
Tested-by: Marc Zyngier Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-17-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/mmu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 8c0035cab6b6..f78aa3e269e9 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -124,15 +124,13 @@ static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) put_page(virt_to_page(pudp)); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - int min, int max) +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min) { void *page; - BUG_ON(max > KVM_NR_MEM_OBJS); if (cache->nobjs >= min) return 0; - while (cache->nobjs < max) { + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { page = (void *)__get_free_page(GFP_PGTABLE_USER); if (!page) return -ENOMEM; @@ -1481,8 +1479,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, pte = kvm_s2pte_mkwrite(pte); ret = mmu_topup_memory_cache(&cache, - kvm_mmu_cache_min_pages(kvm), - KVM_NR_MEM_OBJS); + kvm_mmu_cache_min_pages(kvm)); if (ret) goto out; spin_lock(&kvm->mmu_lock); @@ -1882,8 +1879,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmap_read_unlock(current->mm); /* We need minimum second+third level pages */ - ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), - KVM_NR_MEM_OBJS); + ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm)); if (ret) return ret; From e539451b7e7a6cb0248b1e1b1baf8ed40ce16bac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:41 -0700 Subject: [PATCH 085/127] KVM: arm64: Use common code's approach for __GFP_ZERO with memory caches Add a "gfp_zero" member to arm64's 'struct kvm_mmu_memory_cache' to make the struct and its usage compatible with the common 'struct kvm_mmu_memory_cache' in linux/kvm_host.h. 
This will minimize code churn when arm64 moves to the common implementation in a future patch, at the cost of temporarily having somewhat silly code. Note, GFP_PGTABLE_USER is equivalent to GFP_KERNEL_ACCOUNT | GFP_ZERO: #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) | -> #define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) == GFP_KERNEL | __GFP_ACCOUNT | __GFP_ZERO versus #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) with __GFP_ZERO explicitly OR'd in == GFP_KERNEL | __GFP_ACCOUNT | __GFP_ZERO No functional change intended. Tested-by: Marc Zyngier Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-18-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 2 ++ arch/arm64/kvm/mmu.c | 5 +++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c3e6fcc664b1..335170b59899 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -105,6 +105,7 @@ struct kvm_arch { */ struct kvm_mmu_memory_cache { int nobjs; + gfp_t gfp_zero; void *objects[KVM_NR_MEM_OBJS]; }; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 90cb90561446..1016635b3782 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -270,6 +270,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.target = -1; bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; + /* Set up the timer */ kvm_timer_vcpu_init(vcpu); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index f78aa3e269e9..5220623a4efb 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -131,7 +131,8 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min) if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_PGTABLE_USER); + 
page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | + cache->gfp_zero); if (!page) return -ENOMEM; cache->objects[cache->nobjs++] = page; @@ -1467,7 +1468,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t addr, end; int ret = 0; unsigned long pfn; - struct kvm_mmu_memory_cache cache = { 0, }; + struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, }; end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; pfn = __phys_to_pfn(pa); From c1a33aebe91d49c958df1648b2a84db96c403075 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:42 -0700 Subject: [PATCH 086/127] KVM: arm64: Use common KVM implementation of MMU memory caches Move to the common MMU memory cache implementation now that the common code and arm64's existing code are semantically compatible. No functional change intended. Cc: Marc Zyngier Suggested-by: Christoffer Dall Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-19-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/Kbuild | 1 - arch/arm64/include/asm/kvm_host.h | 12 ------- arch/arm64/include/asm/kvm_types.h | 8 +++++ arch/arm64/kvm/mmu.c | 53 +++++++----------------------- 4 files changed, 19 insertions(+), 55 deletions(-) create mode 100644 arch/arm64/include/asm/kvm_types.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 35a68155cd0e..ff9cbb631212 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += early_ioremap.h -generic-y += kvm_types.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += qrwlock.h diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 335170b59899..23d1f41548f5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -97,18 +97,6 @@ struct kvm_arch { bool return_nisv_io_abort_to_user; }; -#define KVM_NR_MEM_OBJS 40 - 
-/* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. - */ -struct kvm_mmu_memory_cache { - int nobjs; - gfp_t gfp_zero; - void *objects[KVM_NR_MEM_OBJS]; -}; - struct kvm_vcpu_fault_info { u32 esr_el2; /* Hyp Syndrom Register */ u64 far_el2; /* Hyp Fault Address Register */ diff --git a/arch/arm64/include/asm/kvm_types.h b/arch/arm64/include/asm/kvm_types.h new file mode 100644 index 000000000000..9a126b9e2d7c --- /dev/null +++ b/arch/arm64/include/asm/kvm_types.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_ARM64_KVM_TYPES_H +#define _ASM_ARM64_KVM_TYPES_H + +#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 + +#endif /* _ASM_ARM64_KVM_TYPES_H */ + diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 5220623a4efb..ba66e9a9bd3c 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -124,37 +124,6 @@ static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) put_page(virt_to_page(pudp)); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min) -{ - void *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | - cache->gfp_zero); - if (!page) - return -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc || !mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; -} - static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL); @@ -1131,7 +1100,7 @@ static p4d_t *stage2_get_p4d(struct kvm *kvm, struct kvm_mmu_memory_cache *cache if (stage2_pgd_none(kvm, 
*pgd)) { if (!cache) return NULL; - p4d = mmu_memory_cache_alloc(cache); + p4d = kvm_mmu_memory_cache_alloc(cache); stage2_pgd_populate(kvm, pgd, p4d); get_page(virt_to_page(pgd)); } @@ -1149,7 +1118,7 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache if (stage2_p4d_none(kvm, *p4d)) { if (!cache) return NULL; - pud = mmu_memory_cache_alloc(cache); + pud = kvm_mmu_memory_cache_alloc(cache); stage2_p4d_populate(kvm, p4d, pud); get_page(virt_to_page(p4d)); } @@ -1170,7 +1139,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache if (stage2_pud_none(kvm, *pud)) { if (!cache) return NULL; - pmd = mmu_memory_cache_alloc(cache); + pmd = kvm_mmu_memory_cache_alloc(cache); stage2_pud_populate(kvm, pud, pmd); get_page(virt_to_page(pud)); } @@ -1376,7 +1345,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, if (stage2_pud_none(kvm, *pud)) { if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ - pmd = mmu_memory_cache_alloc(cache); + pmd = kvm_mmu_memory_cache_alloc(cache); stage2_pud_populate(kvm, pud, pmd); get_page(virt_to_page(pud)); } @@ -1401,7 +1370,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, if (pmd_none(*pmd)) { if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ - pte = mmu_memory_cache_alloc(cache); + pte = kvm_mmu_memory_cache_alloc(cache); kvm_pmd_populate(pmd, pte); get_page(virt_to_page(pmd)); } @@ -1468,7 +1437,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t addr, end; int ret = 0; unsigned long pfn; - struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, }; + struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, }; end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; pfn = __phys_to_pfn(pa); @@ -1479,8 +1448,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (writable) pte = kvm_s2pte_mkwrite(pte); - ret = mmu_topup_memory_cache(&cache, - 
kvm_mmu_cache_min_pages(kvm)); + ret = kvm_mmu_topup_memory_cache(&cache, + kvm_mmu_cache_min_pages(kvm)); if (ret) goto out; spin_lock(&kvm->mmu_lock); @@ -1494,7 +1463,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, } out: - mmu_free_memory_cache(&cache); + kvm_mmu_free_memory_cache(&cache); return ret; } @@ -1880,7 +1849,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmap_read_unlock(current->mm); /* We need minimum second+third level pages */ - ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm)); + ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm)); if (ret) return ret; @@ -2303,7 +2272,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } phys_addr_t kvm_mmu_get_httbr(void) From 220a7060d58f4ea7e72c04618b2774228a37468c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:43 -0700 Subject: [PATCH 087/127] KVM: MIPS: Drop @max param from mmu_topup_memory_cache() Replace the @max param in mmu_topup_memory_cache() and instead use ARRAY_SIZE() to terminate the loop to fill the cache. This removes a BUG_ON() and sets the stage for moving MIPS to the common memory cache implementation. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-20-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mmu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 49bd160f4d85..d1f148db2449 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -25,15 +25,13 @@ #define KVM_MMU_CACHE_MIN_PAGES 2 #endif -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - int min, int max) +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min) { void *page; - BUG_ON(max > KVM_NR_MEM_OBJS); if (cache->nobjs >= min) return 0; - while (cache->nobjs < max) { + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { page = (void *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; @@ -711,8 +709,7 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, goto out; /* We need a minimum of cached pages ready for page table creation */ - err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, - KVM_NR_MEM_OBJS); + err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); if (err) goto out; @@ -796,8 +793,7 @@ static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu, int ret; /* We need a minimum of cached pages ready for page table creation */ - ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, - KVM_NR_MEM_OBJS); + ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); if (ret) return NULL; From 0cdc739b664f429ff1a582e1578421db934b6314 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:44 -0700 Subject: [PATCH 088/127] KVM: MIPS: Account pages used for GPA page tables Use GFP_KERNEL_ACCOUNT instead of GFP_KERNEL when allocating pages for the GPA page tables.
The primary motivation for accounting the allocations is to align with the common KVM memory cache helpers in preparation for moving to the common implementation in a future patch. The actual accounting is a bonus side effect. Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-21-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index d1f148db2449..9d3c8c025624 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -32,7 +32,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min) if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL); + page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); if (!page) return -ENOMEM; cache->objects[cache->nobjs++] = page; From 380f3a8b6360442508321026f9dc7f8507a7f526 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:35:45 -0700 Subject: [PATCH 089/127] KVM: MIPS: Use common KVM implementation of MMU memory caches Move to the common MMU memory cache implementation now that the common code and MIPS's existing code are semantically compatible. No functional change intended. 
Suggested-by: Christoffer Dall Signed-off-by: Sean Christopherson Message-Id: <20200703023545.8771-22-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/Kbuild | 1 - arch/mips/include/asm/kvm_host.h | 11 --------- arch/mips/include/asm/kvm_types.h | 7 ++++++ arch/mips/kvm/mmu.c | 40 ++++--------------------------- 4 files changed, 12 insertions(+), 47 deletions(-) create mode 100644 arch/mips/include/asm/kvm_types.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 397e6d24d2ab..8643d313890e 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -5,7 +5,6 @@ generated-y += syscall_table_64_n32.h generated-y += syscall_table_64_n64.h generated-y += syscall_table_64_o32.h generic-y += export.h -generic-y += kvm_types.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += parport.h diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 363e7a89d173..f49617175f60 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -335,17 +335,6 @@ struct kvm_mips_tlb { long tlb_lo[2]; }; -#define KVM_NR_MEM_OBJS 4 - -/* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. 
- */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - #define KVM_MIPS_AUX_FPU 0x1 #define KVM_MIPS_AUX_MSA 0x2 diff --git a/arch/mips/include/asm/kvm_types.h b/arch/mips/include/asm/kvm_types.h new file mode 100644 index 000000000000..213754d9ef6b --- /dev/null +++ b/arch/mips/include/asm/kvm_types.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_MIPS_KVM_TYPES_H +#define _ASM_MIPS_KVM_TYPES_H + +#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 4 + +#endif /* _ASM_MIPS_KVM_TYPES_H */ diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 9d3c8c025624..87fa8d8a1031 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -25,39 +25,9 @@ #define KVM_MMU_CACHE_MIN_PAGES 2 #endif -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min) -{ - void *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); - if (!page) - return -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc || !mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; -} - void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } /** @@ -151,7 +121,7 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, if (!cache) return NULL; - new_pmd = mmu_memory_cache_alloc(cache); + new_pmd = kvm_mmu_memory_cache_alloc(cache); pmd_init((unsigned long)new_pmd, (unsigned long)invalid_pte_table); pud_populate(NULL, pud, new_pmd); @@ -162,7 +132,7 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, if 
(!cache) return NULL; - new_pte = mmu_memory_cache_alloc(cache); + new_pte = kvm_mmu_memory_cache_alloc(cache); clear_page(new_pte); pmd_populate_kernel(NULL, pmd, new_pte); } @@ -709,7 +679,7 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, goto out; /* We need a minimum of cached pages ready for page table creation */ - err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); + err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); if (err) goto out; @@ -793,7 +763,7 @@ static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu, int ret; /* We need a minimum of cached pages ready for page table creation */ - ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); + ret = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); if (ret) return NULL; From 01edc5e76ecfecf9a79eec2658f6146ef47bc816 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 10 Jul 2020 14:30:17 +0800 Subject: [PATCH 090/127] MIPS: KVM: Limit Trap-and-Emulate to MIPS32R2 only After tons of fixes to get Trap-and-Emulate build on Loongson64, I've got panic on host machine when trying to run a VM. I found that it can never work on 64bit systems. Reviewing the code, it looks like R6 can't be supported by TE as well.
Signed-off-by: Jiaxun Yang Message-Id: <20200710063047.154611-3-jiaxun.yang@flygoat.com> Signed-off-by: Paolo Bonzini --- arch/mips/Kconfig | 1 + arch/mips/kvm/Kconfig | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6fee1a133e9d..2efc34ed94eb 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2202,6 +2202,7 @@ endchoice config KVM_GUEST bool "KVM Guest Kernel" + depends on CPU_MIPS32_R2 depends on BROKEN_ON_SMP help Select this option if building a guest kernel for KVM (Trap & Emulate) diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 2bf02d849a3a..032b3fca6cbb 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -37,10 +37,11 @@ choice config KVM_MIPS_TE bool "Trap & Emulate" + depends on CPU_MIPS32_R2 help Use trap and emulate to virtualize 32-bit guests in user mode. This does not require any special hardware Virtualization support beyond - standard MIPS32/64 r2 or later, but it does require the guest kernel + standard MIPS32 r2 or later, but it does require the guest kernel to be configured with CONFIG_KVM_GUEST=y so that it resides in the user address segment. From 6627a72c46352220bcd71a1cffeb61e5f57c65a5 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 10 Jul 2020 14:30:18 +0800 Subject: [PATCH 091/127] MIPS: KVM: Remove outdated README This file was created long ago and information inside is obviously outdated. 
Signed-off-by: Jiaxun Yang Message-Id: <20200710063047.154611-4-jiaxun.yang@flygoat.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/00README.txt | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 arch/mips/kvm/00README.txt diff --git a/arch/mips/kvm/00README.txt b/arch/mips/kvm/00README.txt deleted file mode 100644 index 51617e481aa3..000000000000 --- a/arch/mips/kvm/00README.txt +++ /dev/null @@ -1,31 +0,0 @@ -KVM/MIPS Trap & Emulate Release Notes -===================================== - -(1) KVM/MIPS should support MIPS32R2 and beyond. It has been tested on the following platforms: - Malta Board with FPGA based 34K - Sigma Designs TangoX board with a 24K based 8654 SoC. - Malta Board with 74K @ 1GHz - -(2) Both Guest kernel and Guest Userspace execute in UM. - Guest User address space: 0x00000000 -> 0x40000000 - Guest Kernel Unmapped: 0x40000000 -> 0x60000000 - Guest Kernel Mapped: 0x60000000 -> 0x80000000 - - Guest Usermode virtual memory is limited to 1GB. - -(2) 16K Page Sizes: Both Host Kernel and Guest Kernel should have the same page size, currently at least 16K. - Note that due to cache aliasing issues, 4K page sizes are NOT supported. - -(3) No HugeTLB Support - Both the host kernel and Guest kernel should have the page size set to 16K. - This will be implemented in a future release. - -(4) KVM/MIPS does not have support for SMP Guests - Linux-3.7-rc2 based SMP guest hangs due to the following code sequence in the generated TLB handlers: - LL/TLBP/SC. Since the TLBP instruction causes a trap the reservation gets cleared - when we ERET back to the guest. This causes the guest to hang in an infinite loop. - This will be fixed in a future release. - -(5) Use Host FPU - Currently KVM/MIPS emulates a 24K CPU without a FPU. 
- This will be fixed in a future release From 2f0a83bece1e758386e83f01e86379d2e83040ef Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 23 Jun 2020 21:14:14 +0800 Subject: [PATCH 092/127] KVM: s390: clean up redundant 'kvm_run' parameters In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu' structure. For historical reasons, many kvm-related function parameters retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This patch does a unified cleanup of these remaining redundant parameters. Signed-off-by: Tianjia Zhang Reviewed-by: Vitaly Kuznetsov Message-Id: <20200623131418.31473-2-tianjia.zhang@linux.alibaba.com> Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7fd4fdb165fc..cab3c0141098 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4173,8 +4173,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return rc; } -static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void sync_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; struct runtime_instr_cb *riccb; struct gs_cb *gscb; @@ -4240,8 +4241,10 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* SIE will load etoken directly from SDNX and therefore kvm_run */ } -static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void sync_regs(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { @@ -4270,7 +4273,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Sync fmt2 only data */ if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) { - sync_regs_fmt2(vcpu, kvm_run); + sync_regs_fmt2(vcpu); } else { /* * In several places we 
have to modify our internal view to @@ -4289,8 +4292,10 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->kvm_dirty_regs = 0; } -static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void store_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; @@ -4310,8 +4315,10 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* SIE will save etoken directly into SDNX and therefore kvm_run */ } -static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void store_regs(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); @@ -4330,7 +4337,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) - store_regs_fmt2(vcpu, kvm_run); + store_regs_fmt2(vcpu); } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) @@ -4368,7 +4375,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - sync_regs(vcpu, kvm_run); + sync_regs(vcpu); enable_cpu_timer_accounting(vcpu); might_fault(); @@ -4390,7 +4397,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } disable_cpu_timer_accounting(vcpu); - store_regs(vcpu, kvm_run); + store_regs(vcpu); kvm_sigset_deactivate(vcpu); From 74cc7e0c35c1e4d7ccad27bc31c526ea0916696a Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 23 Jun 2020 21:14:15 +0800 Subject: [PATCH 093/127] KVM: arm64: clean up redundant 'kvm_run' parameters In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu' structure. 
For historical reasons, many kvm-related function parameters retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This patch does a unified cleanup of these remaining redundant parameters. Signed-off-by: Tianjia Zhang Reviewed-by: Vitaly Kuznetsov Message-Id: <20200623131418.31473-3-tianjia.zhang@linux.alibaba.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_coproc.h | 12 +++++----- arch/arm64/include/asm/kvm_host.h | 11 ++++----- arch/arm64/include/asm/kvm_mmu.h | 2 +- arch/arm64/kvm/arm.c | 6 ++--- arch/arm64/kvm/handle_exit.c | 36 ++++++++++++++--------------- arch/arm64/kvm/mmio.c | 11 +++++---- arch/arm64/kvm/mmu.c | 5 ++-- arch/arm64/kvm/sys_regs.c | 13 +++++------ 8 files changed, 46 insertions(+), 50 deletions(-) diff --git a/arch/arm64/include/asm/kvm_coproc.h b/arch/arm64/include/asm/kvm_coproc.h index 0185ee8b8b5e..454373704b8a 100644 --- a/arch/arm64/include/asm/kvm_coproc.h +++ b/arch/arm64/include/asm/kvm_coproc.h @@ -27,12 +27,12 @@ struct kvm_sys_reg_target_table { void kvm_register_target_sys_reg_table(unsigned int target, struct kvm_sys_reg_target_table *table); -int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu); +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu); +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu); +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu); +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu); +int kvm_handle_sys_reg(struct kvm_vcpu *vcpu); #define kvm_coproc_table_init kvm_sys_reg_table_init void kvm_sys_reg_table_init(void); diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h index 23d1f41548f5..ad337d3162fe 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -470,18 +470,15 @@ u64 __kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); -int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index); -void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index); +int handle_exit(struct kvm_vcpu *vcpu, int exception_index); +void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa); +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu); +int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); int kvm_perf_init(void); int kvm_perf_teardown(void); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b12bfc1f051a..40be8f6c7351 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -139,7 +139,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); -int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1016635b3782..73e12869afe3 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -660,7 +660,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) return ret; if (run->exit_reason == KVM_EXIT_MMIO) { - ret = kvm_handle_mmio_return(vcpu, run); + 
ret = kvm_handle_mmio_return(vcpu); if (ret) return ret; } @@ -812,11 +812,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* Exit types that need handling before we can be preempted */ - handle_exit_early(vcpu, run, ret); + handle_exit_early(vcpu, ret); preempt_enable(); - ret = handle_exit(vcpu, run, ret); + ret = handle_exit(vcpu, ret); } /* Tell userspace about in-kernel device output levels */ diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 5a02d4c90559..1df3beafd73f 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -25,7 +25,7 @@ #define CREATE_TRACE_POINTS #include "trace_handle_exit.h" -typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); +typedef int (*exit_handle_fn)(struct kvm_vcpu *); static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u32 esr) { @@ -33,7 +33,7 @@ static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u32 esr) kvm_inject_vabt(vcpu); } -static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_hvc(struct kvm_vcpu *vcpu) { int ret; @@ -50,7 +50,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) return ret; } -static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_smc(struct kvm_vcpu *vcpu) { /* * "If an SMC instruction executed at Non-secure EL1 is @@ -69,7 +69,7 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) * Guest access to FP/ASIMD registers are routed to this handler only * when the system doesn't support FP/ASIMD. */ -static int handle_no_fpsimd(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_no_fpsimd(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); return 1; @@ -87,7 +87,7 @@ static int handle_no_fpsimd(struct kvm_vcpu *vcpu, struct kvm_run *run) * world-switches and schedule other host processes until there is an * incoming IRQ or FIQ to the VM. 
*/ -static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_wfx(struct kvm_vcpu *vcpu) { if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); @@ -109,16 +109,16 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) * kvm_handle_guest_debug - handle a debug exception instruction * * @vcpu: the vcpu pointer - * @run: access to the kvm_run structure for results * * We route all debug exceptions through the same handler. If both the * guest and host are using the same debug facilities it will be up to * userspace to re-inject the correct exception for guest delivery. * - * @return: 0 (while setting run->exit_reason), -1 for error + * @return: 0 (while setting vcpu->run->exit_reason), -1 for error */ -static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; u32 hsr = kvm_vcpu_get_hsr(vcpu); int ret = 0; @@ -144,7 +144,7 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) return ret; } -static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) { u32 hsr = kvm_vcpu_get_hsr(vcpu); @@ -155,7 +155,7 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } -static int handle_sve(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_sve(struct kvm_vcpu *vcpu) { /* Until SVE is supported for guests: */ kvm_inject_undefined(vcpu); @@ -167,7 +167,7 @@ static int handle_sve(struct kvm_vcpu *vcpu, struct kvm_run *run) * a NOP). If we get here, it is that we didn't fixup ptrauth on exit, and all * that we can do is give the guest an UNDEF. 
*/ -static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); return 1; @@ -212,7 +212,7 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) * KVM_EXIT_DEBUG, otherwise userspace needs to complete its * emulation first. */ -static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_trap_exceptions(struct kvm_vcpu *vcpu) { int handled; @@ -227,7 +227,7 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) exit_handle_fn exit_handler; exit_handler = kvm_get_exit_handler(vcpu); - handled = exit_handler(vcpu, run); + handled = exit_handler(vcpu); } return handled; @@ -237,9 +237,10 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on * proper exit to userspace. */ -int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index) +int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { + struct kvm_run *run = vcpu->run; + if (ARM_SERROR_PENDING(exception_index)) { u8 hsr_ec = ESR_ELx_EC(kvm_vcpu_get_hsr(vcpu)); @@ -265,7 +266,7 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case ARM_EXCEPTION_EL1_SERROR: return 1; case ARM_EXCEPTION_TRAP: - return handle_trap_exceptions(vcpu, run); + return handle_trap_exceptions(vcpu); case ARM_EXCEPTION_HYP_GONE: /* * EL2 has been reset to the hyp-stub. 
This happens when a guest @@ -289,8 +290,7 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, } /* For exit types that need handling before we can be preempted */ -void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index) +void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) { if (ARM_SERROR_PENDING(exception_index)) { if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) { diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 4e0366759726..158fbe682611 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -77,9 +77,8 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) * or in-kernel IO emulation * * @vcpu: The VCPU pointer - * @run: The VCPU run struct containing the mmio data */ -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) { unsigned long data; unsigned int len; @@ -92,6 +91,8 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->mmio_needed = 0; if (!kvm_vcpu_dabt_iswrite(vcpu)) { + struct kvm_run *run = vcpu->run; + len = kvm_vcpu_dabt_get_as(vcpu); data = kvm_mmio_read_buf(run->mmio.data, len); @@ -119,9 +120,9 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) return 0; } -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa) +int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { + struct kvm_run *run = vcpu->run; unsigned long data; unsigned long rt; int ret; @@ -188,7 +189,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, if (!is_write) memcpy(run->mmio.data, data_buf, len); vcpu->stat.mmio_exit_kernel++; - kvm_handle_mmio_return(vcpu, run); + kvm_handle_mmio_return(vcpu); return 1; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index ba66e9a9bd3c..838aad520f1c 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2015,7 +2015,6 @@ static void 
handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) /** * kvm_handle_guest_abort - handles all 2nd stage aborts * @vcpu: the VCPU pointer - * @run: the kvm_run structure * * Any abort that gets to the host is almost guaranteed to be caused by a * missing second stage translation table entry, which can mean that either the @@ -2024,7 +2023,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) * space. The distinction is based on the IPA causing the fault and whether this * memory region has been registered as standard RAM by user space. */ -int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { unsigned long fault_status; phys_addr_t fault_ipa; @@ -2103,7 +2102,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) * of the page size. */ fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, run, fault_ipa); + ret = io_mem_abort(vcpu, fault_ipa); goto out_unlock; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index baf5ce9225ce..c7a856913de8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2156,7 +2156,7 @@ static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } -int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); return 1; @@ -2335,7 +2335,7 @@ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, return 1; } -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu) { const struct sys_reg_desc *target_specific; size_t num; @@ -2346,7 +2346,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) target_specific, num); } -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) 
+int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) { const struct sys_reg_desc *target_specific; size_t num; @@ -2357,14 +2357,14 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) target_specific, num); } -int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu) { return kvm_handle_cp_64(vcpu, cp14_64_regs, ARRAY_SIZE(cp14_64_regs), NULL, 0); } -int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu) { return kvm_handle_cp_32(vcpu, cp14_regs, ARRAY_SIZE(cp14_regs), @@ -2416,9 +2416,8 @@ static void reset_sys_reg_descs(struct kvm_vcpu *vcpu, /** * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access * @vcpu: The VCPU pointer - * @run: The kvm_run struct */ -int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) { struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_hsr(vcpu); From c34b26b98caca48ec9ee981d4a89ac4f73376a3a Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 23 Jun 2020 21:14:17 +0800 Subject: [PATCH 094/127] KVM: MIPS: clean up redundant 'kvm_run' parameters In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu' structure. For historical reasons, many kvm-related function parameters retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This patch does a unified cleanup of these remaining redundant parameters. 
Signed-off-by: Tianjia Zhang Reviewed-by: Huacai Chen Message-Id: <20200623131418.31473-5-tianjia.zhang@linux.alibaba.com> Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 28 +------- arch/mips/kvm/emulate.c | 59 ++++++---------- arch/mips/kvm/mips.c | 11 ++- arch/mips/kvm/trap_emul.c | 114 ++++++++++++++----------------- arch/mips/kvm/vz.c | 26 +++---- 5 files changed, 87 insertions(+), 151 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f49617175f60..d35eaed1668f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -843,8 +843,8 @@ struct kvm_mips_callbacks { const struct kvm_one_reg *reg, s64 v); int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu); - int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); - void (*vcpu_reenter)(struct kvm_run *run, struct kvm_vcpu *vcpu); + int (*vcpu_run)(struct kvm_vcpu *vcpu); + void (*vcpu_reenter)(struct kvm_vcpu *vcpu); }; extern struct kvm_mips_callbacks *kvm_mips_callbacks; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); @@ -899,7 +899,6 @@ extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu, bool write_fault); @@ -1010,83 +1009,67 @@ static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *vcpu) extern enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, u32 *opc, - struct 
kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, - struct kvm_run *run); +extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu); u32 kvm_mips_read_count(struct kvm_vcpu *vcpu); void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count); @@ -1115,26 +1098,21 @@ static inline void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu) {} enum emulation_result kvm_mips_check_privilege(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, 
struct kvm_vcpu *vcpu); enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu); enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu); enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu); /* COP0 */ diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index d3d322f70fe0..3221193c7371 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1262,7 +1262,6 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu) enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -1597,12 +1596,12 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; enum emulation_result er; u32 rt; + struct kvm_run *run = vcpu->run; void *data = run->mmio.data; unsigned int imme; unsigned long curr_pc; @@ -1894,9 +1893,9 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, } enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, - u32 cause, struct kvm_run *run, - struct kvm_vcpu *vcpu) + u32 cause, struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int r; enum emulation_result er; unsigned long curr_pc; @@ -2136,7 +2135,6 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), unsigned long curr_pc, unsigned long addr, - struct kvm_run *run, struct kvm_vcpu *vcpu, u32 cause) { @@ -2164,13 +2162,13 @@ static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), /* no matching 
guest TLB */ vcpu->arch.host_cp0_badvaddr = addr; vcpu->arch.pc = curr_pc; - kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu); + kvm_mips_emulate_tlbmiss_ld(cause, NULL, vcpu); return EMULATE_EXCEPT; case KVM_MIPS_TLBINV: /* invalid matching guest TLB */ vcpu->arch.host_cp0_badvaddr = addr; vcpu->arch.pc = curr_pc; - kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu); + kvm_mips_emulate_tlbinv_ld(cause, NULL, vcpu); return EMULATE_EXCEPT; default: break; @@ -2180,7 +2178,6 @@ static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; @@ -2270,7 +2267,7 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, * guest's behalf. */ er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, - curr_pc, va, run, vcpu, cause); + curr_pc, va, vcpu, cause); if (er != EMULATE_DONE) goto done; #ifdef CONFIG_KVM_MIPS_DYN_TRANS @@ -2283,11 +2280,11 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, } else if (op_inst == Hit_Invalidate_I) { /* Perform the icache synchronisation on the guest's behalf */ er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, - curr_pc, va, run, vcpu, cause); + curr_pc, va, vcpu, cause); if (er != EMULATE_DONE) goto done; er = kvm_mips_guest_cache_op(protected_flush_icache_line, - curr_pc, va, run, vcpu, cause); + curr_pc, va, vcpu, cause); if (er != EMULATE_DONE) goto done; @@ -2313,7 +2310,6 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, } enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { union mips_instruction inst; @@ -2329,14 +2325,14 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, switch (inst.r_format.opcode) { case cop0_op: - er = kvm_mips_emulate_CP0(inst, opc, cause, 
run, vcpu); + er = kvm_mips_emulate_CP0(inst, opc, cause, vcpu); break; #ifndef CONFIG_CPU_MIPSR6 case cache_op: ++vcpu->stat.cache_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); - er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu); + er = kvm_mips_emulate_cache(inst, opc, cause, vcpu); break; #else case spec3_op: @@ -2344,7 +2340,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, case cache6_op: ++vcpu->stat.cache_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); - er = kvm_mips_emulate_cache(inst, opc, cause, run, + er = kvm_mips_emulate_cache(inst, opc, cause, vcpu); break; default: @@ -2384,7 +2380,6 @@ long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu) enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2419,7 +2414,6 @@ enum emulation_result kvm_mips_emulate_syscall(u32 cause, enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2463,7 +2457,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2505,7 +2498,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2547,7 +2539,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2588,7 +2579,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, u32 *opc, - struct 
kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2628,7 +2618,6 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2657,7 +2646,6 @@ enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2692,7 +2680,6 @@ enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2727,7 +2714,6 @@ enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2762,7 +2748,6 @@ enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2797,7 +2782,6 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2832,7 +2816,6 @@ enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2866,7 +2849,6 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, } enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = 
vcpu->arch.cop0; @@ -2955,12 +2937,12 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, * branch target), and pass the RI exception to the guest OS. */ vcpu->arch.pc = curr_pc; - return kvm_mips_emulate_ri_exc(cause, opc, run, vcpu); + return kvm_mips_emulate_ri_exc(cause, opc, vcpu); } -enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, - struct kvm_run *run) +enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; unsigned long *gpr = &vcpu->arch.gprs[vcpu->arch.io_gpr]; enum emulation_result er = EMULATE_DONE; @@ -3103,7 +3085,6 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, static enum emulation_result kvm_mips_emulate_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; @@ -3141,7 +3122,6 @@ static enum emulation_result kvm_mips_emulate_exc(u32 cause, enum emulation_result kvm_mips_check_privilege(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; @@ -3223,7 +3203,7 @@ enum emulation_result kvm_mips_check_privilege(u32 cause, } if (er == EMULATE_PRIV_FAIL) - kvm_mips_emulate_exc(cause, opc, run, vcpu); + kvm_mips_emulate_exc(cause, opc, vcpu); return er; } @@ -3237,7 +3217,6 @@ enum emulation_result kvm_mips_check_privilege(u32 cause, */ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu, bool write_fault) { @@ -3261,9 +3240,9 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, KVM_ENTRYHI_ASID)); if (index < 0) { if (exccode == EXCCODE_TLBL) { - er = kvm_mips_emulate_tlbmiss_ld(cause, opc, run, vcpu); + er = kvm_mips_emulate_tlbmiss_ld(cause, opc, vcpu); } else if (exccode == EXCCODE_TLBS) { - er = kvm_mips_emulate_tlbmiss_st(cause, opc, run, vcpu); + er = kvm_mips_emulate_tlbmiss_st(cause, opc, vcpu); } else { kvm_err("%s: invalid exc code: %d\n", 
__func__, exccode); @@ -3278,10 +3257,10 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, */ if (!TLB_IS_VALID(*tlb, va)) { if (exccode == EXCCODE_TLBL) { - er = kvm_mips_emulate_tlbinv_ld(cause, opc, run, + er = kvm_mips_emulate_tlbinv_ld(cause, opc, vcpu); } else if (exccode == EXCCODE_TLBS) { - er = kvm_mips_emulate_tlbinv_st(cause, opc, run, + er = kvm_mips_emulate_tlbinv_st(cause, opc, vcpu); } else { kvm_err("%s: invalid exc code: %d\n", __func__, diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 666d3350b4ac..7de85d2253ff 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -450,7 +450,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; int r = -EINTR; vcpu_load(vcpu); @@ -459,11 +458,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) - kvm_mips_complete_mmio_load(vcpu, run); + kvm_mips_complete_mmio_load(vcpu); vcpu->mmio_needed = 0; } - if (run->immediate_exit) + if (vcpu->run->immediate_exit) goto out; lose_fpu(1); @@ -480,7 +479,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - r = kvm_mips_callbacks->vcpu_run(run, vcpu); + r = kvm_mips_callbacks->vcpu_run(vcpu); trace_kvm_out(vcpu); guest_exit_irqoff(); @@ -1236,7 +1235,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) * end up causing an exception to be delivered to the Guest * Kernel */ - er = kvm_mips_check_privilege(cause, opc, run, vcpu); + er = kvm_mips_check_privilege(cause, opc, vcpu); if (er == EMULATE_PRIV_FAIL) { goto skip_emul; } else if (er == EMULATE_FAIL) { @@ -1385,7 +1384,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - kvm_mips_callbacks->vcpu_reenter(run, vcpu); + kvm_mips_callbacks->vcpu_reenter(vcpu); /* * If FPU / MSA are enabled (i.e. 
the guest's FPU / MSA context diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 34ad0b46e610..f8cba51e1054 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -67,7 +67,6 @@ static int kvm_trap_emul_no_handler(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; @@ -81,14 +80,14 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) * Unusable/no FPU in guest: * deliver guest COP1 Unusable Exception */ - er = kvm_mips_emulate_fpu_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_fpu_exc(cause, opc, vcpu); } else { /* Restore FPU state */ kvm_own_fpu(vcpu); er = EMULATE_DONE; } } else { - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); + er = kvm_mips_emulate_inst(cause, opc, vcpu); } switch (er) { @@ -97,12 +96,12 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) break; case EMULATE_FAIL: - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; break; case EMULATE_WAIT: - run->exit_reason = KVM_EXIT_INTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; ret = RESUME_HOST; break; @@ -116,8 +115,7 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) return ret; } -static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu) +static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_vcpu *vcpu) { enum emulation_result er; union mips_instruction inst; @@ -125,7 +123,7 @@ static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run, /* A code fetch fault doesn't count as an MMIO */ if (kvm_is_ifetch_fault(&vcpu->arch)) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return 
RESUME_HOST; } @@ -134,23 +132,22 @@ static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run, opc += 1; err = kvm_get_badinstr(opc, vcpu, &inst.word); if (err) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } /* Emulate the load */ - er = kvm_mips_emulate_load(inst, cause, run, vcpu); + er = kvm_mips_emulate_load(inst, cause, vcpu); if (er == EMULATE_FAIL) { kvm_err("Emulate load from MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; } else { - run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->exit_reason = KVM_EXIT_MMIO; } return RESUME_HOST; } -static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu) +static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_vcpu *vcpu) { enum emulation_result er; union mips_instruction inst; @@ -161,34 +158,33 @@ static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run, opc += 1; err = kvm_get_badinstr(opc, vcpu, &inst.word); if (err) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } /* Emulate the store */ - er = kvm_mips_emulate_store(inst, cause, run, vcpu); + er = kvm_mips_emulate_store(inst, cause, vcpu); if (er == EMULATE_FAIL) { kvm_err("Emulate store to MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; } else { - run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->exit_reason = KVM_EXIT_MMIO; } return RESUME_HOST; } -static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run, +static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_vcpu *vcpu, bool store) { if (store) - return kvm_mips_bad_store(cause, opc, run, vcpu); + return kvm_mips_bad_store(cause, opc, vcpu); else - return kvm_mips_bad_load(cause, opc, run, vcpu); + return kvm_mips_bad_load(cause, 
opc, vcpu); } static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; @@ -212,12 +208,12 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) * They would indicate stale host TLB entries. */ if (unlikely(index < 0)) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } tlb = vcpu->arch.guest_tlb + index; if (unlikely(!TLB_IS_VALID(*tlb, badvaddr))) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } @@ -226,23 +222,23 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) * exception. Relay that on to the guest so it can handle it. */ if (!TLB_IS_DIRTY(*tlb, badvaddr)) { - kvm_mips_emulate_tlbmod(cause, opc, run, vcpu); + kvm_mips_emulate_tlbmod(cause, opc, vcpu); return RESUME_GUEST; } if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, badvaddr, true)) /* Not writable, needs handling as MMIO */ - return kvm_mips_bad_store(cause, opc, run, vcpu); + return kvm_mips_bad_store(cause, opc, vcpu); return RESUME_GUEST; } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, true) < 0) /* Not writable, needs handling as MMIO */ - return kvm_mips_bad_store(cause, opc, run, vcpu); + return kvm_mips_bad_store(cause, opc, vcpu); return RESUME_GUEST; } else { /* host kernel addresses are all handled as MMIO */ - return kvm_mips_bad_store(cause, opc, run, vcpu); + return kvm_mips_bad_store(cause, opc, vcpu); } } @@ -276,7 +272,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * into the shadow host TLB */ - er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu, store); + er = kvm_mips_handle_tlbmiss(cause, opc, 
vcpu, store); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { @@ -289,14 +285,14 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * not expect to ever get them */ if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, store) < 0) - ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); + ret = kvm_mips_bad_access(cause, opc, vcpu, store); } else if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { /* * With EVA we may get a TLB exception instead of an address * error when the guest performs MMIO to KSeg1 addresses. */ - ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); + ret = kvm_mips_bad_access(cause, opc, vcpu, store); } else { kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", store ? "ST" : "LD", cause, opc, badvaddr); @@ -320,7 +316,6 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; @@ -328,11 +323,11 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { - ret = kvm_mips_bad_store(cause, opc, run, vcpu); + ret = kvm_mips_bad_store(cause, opc, vcpu); } else { kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -340,18 +335,17 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = 
vcpu->arch.host_cp0_cause; int ret = RESUME_GUEST; if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) { - ret = kvm_mips_bad_load(cause, opc, run, vcpu); + ret = kvm_mips_bad_load(cause, opc, vcpu); } else { kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -359,17 +353,16 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_emulate_syscall(cause, opc, run, vcpu); + er = kvm_mips_emulate_syscall(cause, opc, vcpu); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -377,17 +370,16 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_handle_ri(cause, opc, run, vcpu); + er = kvm_mips_handle_ri(cause, opc, vcpu); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -395,17 +387,16 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; 
int ret = RESUME_GUEST; - er = kvm_mips_emulate_bp_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_bp_exc(cause, opc, vcpu); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -413,17 +404,16 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *)vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_emulate_trap_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_trap_exc(cause, opc, vcpu); if (er == EMULATE_DONE) { ret = RESUME_GUEST; } else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -431,17 +421,16 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *)vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_emulate_msafpe_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_msafpe_exc(cause, opc, vcpu); if (er == EMULATE_DONE) { ret = RESUME_GUEST; } else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -449,17 +438,16 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *)vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_emulate_fpe_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_fpe_exc(cause, 
opc, vcpu); if (er == EMULATE_DONE) { ret = RESUME_GUEST; } else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -474,7 +462,6 @@ static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; @@ -486,10 +473,10 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) * No MSA in guest, or FPU enabled and not in FR=1 mode, * guest reserved instruction exception */ - er = kvm_mips_emulate_ri_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_ri_exc(cause, opc, vcpu); } else if (!(kvm_read_c0_guest_config5(cop0) & MIPS_CONF5_MSAEN)) { /* MSA disabled by guest, guest MSA disabled exception */ - er = kvm_mips_emulate_msadis_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_msadis_exc(cause, opc, vcpu); } else { /* Restore MSA/FPU state */ kvm_own_msa(vcpu); @@ -502,7 +489,7 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) break; case EMULATE_FAIL: - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; break; @@ -1184,8 +1171,7 @@ void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } -static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, - struct kvm_vcpu *vcpu) +static void kvm_trap_emul_vcpu_reenter(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; @@ -1228,7 +1214,7 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, check_mmu_context(mm); } -static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +static int kvm_trap_emul_vcpu_run(struct kvm_vcpu *vcpu) { int cpu 
= smp_processor_id(); int r; @@ -1237,7 +1223,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_mips_deliver_interrupts(vcpu, kvm_read_c0_guest_cause(vcpu->arch.cop0)); - kvm_trap_emul_vcpu_reenter(run, vcpu); + kvm_trap_emul_vcpu_reenter(vcpu); /* * We use user accessors to access guest memory, but we don't want to @@ -1255,7 +1241,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) */ kvm_mips_suspend_mm(cpu); - r = vcpu->arch.vcpu_run(run, vcpu); + r = vcpu->arch.vcpu_run(vcpu->run, vcpu); /* We may have migrated while handling guest exits */ cpu = smp_processor_id(); diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index d9c462c14163..9e58c479ee20 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -874,7 +874,6 @@ static void kvm_write_maari(struct kvm_vcpu *vcpu, unsigned long val) static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -1074,7 +1073,6 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; @@ -1217,7 +1215,6 @@ static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc, { enum emulation_result er = EMULATE_DONE; struct kvm_vcpu_arch *arch = &vcpu->arch; - struct kvm_run *run = vcpu->run; union mips_instruction inst; int rd, rt, sel; int err; @@ -1233,12 +1230,12 @@ static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc, switch (inst.r_format.opcode) { case cop0_op: - er = kvm_vz_gpsi_cop0(inst, opc, cause, run, vcpu); + er = kvm_vz_gpsi_cop0(inst, opc, cause, vcpu); break; #ifndef CONFIG_CPU_MIPSR6 case cache_op: trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); - er = kvm_vz_gpsi_cache(inst, opc, 
cause, run, vcpu); + er = kvm_vz_gpsi_cache(inst, opc, cause, vcpu); break; #endif #ifdef CONFIG_CPU_LOONGSON64 @@ -1251,7 +1248,7 @@ static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc, #ifdef CONFIG_CPU_MIPSR6 case cache6_op: trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); - er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu); + er = kvm_vz_gpsi_cache(inst, opc, cause, vcpu); break; #endif case rdhwr_op: @@ -1553,7 +1550,6 @@ static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu) */ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_FAIL; int ret = RESUME_GUEST; @@ -1581,7 +1577,7 @@ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) break; case EMULATE_FAIL: - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; break; @@ -1600,8 +1596,6 @@ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) */ static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; - /* * If MSA not present or not exposed to guest or FR=0, the MSA operation * should have been treated as a reserved instruction! 
@@ -1612,7 +1606,7 @@ static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu) (read_gc0_status() & (ST0_CU1 | ST0_FR)) == ST0_CU1 || !(read_gc0_config5() & MIPS_CONF5_MSAEN) || vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } @@ -1648,7 +1642,7 @@ static int kvm_trap_vz_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) } /* Treat as MMIO */ - er = kvm_mips_emulate_load(inst, cause, run, vcpu); + er = kvm_mips_emulate_load(inst, cause, vcpu); if (er == EMULATE_FAIL) { kvm_err("Guest Emulate Load from MMIO space failed: PC: %p, BadVaddr: %#lx\n", opc, badvaddr); @@ -1695,7 +1689,7 @@ static int kvm_trap_vz_handle_tlb_st_miss(struct kvm_vcpu *vcpu) } /* Treat as MMIO */ - er = kvm_mips_emulate_store(inst, cause, run, vcpu); + er = kvm_mips_emulate_store(inst, cause, vcpu); if (er == EMULATE_FAIL) { kvm_err("Guest Emulate Store to MMIO space failed: PC: %p, BadVaddr: %#lx\n", opc, badvaddr); @@ -3242,7 +3236,7 @@ static void kvm_vz_flush_shadow_memslot(struct kvm *kvm, kvm_vz_flush_shadow_all(kvm); } -static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) +static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); int preserve_guest_tlb; @@ -3258,7 +3252,7 @@ static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_vz_vcpu_load_wired(vcpu); } -static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); int r; @@ -3271,7 +3265,7 @@ static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_vz_vcpu_load_tlb(vcpu, cpu); kvm_vz_vcpu_load_wired(vcpu); - r = vcpu->arch.vcpu_run(run, vcpu); + r = vcpu->arch.vcpu_run(vcpu->run, vcpu); kvm_vz_vcpu_save_wired(vcpu); From d574c539c3c4d8a3f9e07c8a0785954d22a63dcb Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 10 Jul 2020 
17:25:59 +0200 Subject: [PATCH 095/127] KVM: x86: move MSR_IA32_PERF_CAPABILITIES emulation to common x86 code state_test/smm_test selftests are failing on AMD with: "Unexpected result from KVM_GET_MSRS, r: 51 (failed MSR was 0x345)" MSR_IA32_PERF_CAPABILITIES is an emulated MSR on Intel but it is not known to AMD code, we can move the emulation to common x86 code. For AMD, we basically just allow the host to read and write zero to the MSR. Fixes: 27461da31089 ("KVM: x86/pmu: Support full width counting") Suggested-by: Jim Mattson Suggested-by: Paolo Bonzini Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710152559.1645827-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/pmu_intel.c | 17 ----------------- arch/x86/kvm/x86.c | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 535ad311ad02..13f923c69475 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2358,6 +2358,8 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; break; + case MSR_IA32_PERF_CAPABILITIES: + return 0; default: return KVM_MSR_RET_INVALID; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index bdcce65c7a1d..a886a47daebd 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -180,9 +180,6 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: ret = pmu->version > 1; break; - case MSR_IA32_PERF_CAPABILITIES: - ret = 1; - break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -224,12 +221,6 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = pmu->global_ovf_ctrl; return 0; - case MSR_IA32_PERF_CAPABILITIES: - 
if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - return 1; - msr_info->data = vcpu->arch.perf_capabilities; - return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -289,14 +280,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; - case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ? - (data & ~vmx_get_perf_capabilities()) : data) - return 1; - vcpu->arch.perf_capabilities = data; - return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e27d3db7e43f..69a2e9c981e9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2857,6 +2857,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.arch_capabilities = data; break; + case MSR_IA32_PERF_CAPABILITIES: { + struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; + + if (!msr_info->host_initiated) + return 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent)) + return 1; + if (data & ~msr_ent.data) + return 1; + + vcpu->arch.perf_capabilities = data; + + return 0; + } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -3197,6 +3211,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.arch_capabilities; break; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return 1; + msr_info->data = vcpu->arch.perf_capabilities; + break; case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; From 0f04a2ac4fe96bbf05b7ac7d3e94598db550d6b8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 10 Jul 2020 16:11:49 +0200 Subject: [PATCH 
096/127] KVM: nSVM: split kvm_init_shadow_npt_mmu() from kvm_init_shadow_mmu() As a preparatory change for moving kvm_mmu_new_pgd() from nested_prepare_vmcb_save() to nested_svm_init_mmu_context() split kvm_init_shadow_npt_mmu() from kvm_init_shadow_mmu(). This also makes the code look more like nVMX (kvm_init_shadow_ept_mmu()). No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710141157.1640173-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 3 ++- arch/x86/kvm/mmu/mmu.c | 31 ++++++++++++++++++++++++------- arch/x86/kvm/svm/nested.c | 3 ++- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 434acfcbf710..75125ae57e50 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -57,7 +57,8 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer); +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, + gpa_t nested_cr3); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fa506aaaf019..f6e032c000ac 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4918,14 +4918,10 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) return role; } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, + u32 efer, union kvm_mmu_role new_role) { struct kvm_mmu *context = vcpu->arch.mmu; - union kvm_mmu_role new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, false); - - if (new_role.as_u64 == context->mmu_role.as_u64) - return; if (!(cr0 & X86_CR0_PG)) 
nonpaging_init_context(vcpu, context); @@ -4939,7 +4935,28 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) context->mmu_role.as_u64 = new_role.as_u64; reset_shadow_zero_bits_mask(vcpu, context); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +{ + struct kvm_mmu *context = vcpu->arch.mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_mmu_root_page_role(vcpu, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, cr0, cr4, efer, new_role); +} + +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, + gpa_t nested_cr3) +{ + struct kvm_mmu *context = vcpu->arch.mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_mmu_root_page_role(vcpu, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, cr0, cr4, efer, new_role); +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); static union kvm_mmu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 402ea5b412f0..19e1e99a7458 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -87,7 +87,8 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer); + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer, + svm->nested.ctl.nested_cr3); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; From 8c008659aa43be97c60c1633074b8f52f9f4445c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 10 Jul 2020 16:11:50 +0200 Subject: [PATCH 097/127] KVM: MMU: stop dereferencing vcpu->arch.mmu to get the context 
for MMU init kvm_init_shadow_mmu() was actually the only function that could be called with different vcpu->arch.mmu values. Now that kvm_init_shadow_npt_mmu() is separated from kvm_init_shadow_mmu(), we always know the MMU context we need to use and there is no need to dereference vcpu->arch.mmu pointer. Based on a patch by Vitaly Kuznetsov . Signed-off-by: Paolo Bonzini Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710141157.1640173-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f6e032c000ac..78c88e8aecfa 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4850,7 +4850,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = kvm_calc_tdp_mmu_root_page_role(vcpu, false); @@ -4918,11 +4918,10 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) return role; } -static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, - u32 efer, union kvm_mmu_role new_role) +static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + u32 cr0, u32 cr4, u32 efer, + union kvm_mmu_role new_role) { - struct kvm_mmu *context = vcpu->arch.mmu; - if (!(cr0 & X86_CR0_PG)) nonpaging_init_context(vcpu, context); else if (efer & EFER_LMA) @@ -4938,23 +4937,23 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = kvm_calc_shadow_mmu_root_page_role(vcpu, false); if (new_role.as_u64 != context->mmu_role.as_u64) 
- shadow_mmu_init_context(vcpu, cr0, cr4, efer, new_role); + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); } void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, gpa_t nested_cr3) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.guest_mmu; union kvm_mmu_role new_role = kvm_calc_shadow_mmu_root_page_role(vcpu, false); if (new_role.as_u64 != context->mmu_role.as_u64) - shadow_mmu_init_context(vcpu, cr0, cr4, efer, new_role); + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); @@ -4990,7 +4989,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_mmu_role new_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, @@ -5024,7 +5023,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; kvm_init_shadow_mmu(vcpu, kvm_read_cr0_bits(vcpu, X86_CR0_PG), From ebdb3dba7b900a6b280d70f08befa0b96e0f806e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 10 Jul 2020 16:11:51 +0200 Subject: [PATCH 098/127] KVM: nSVM: reset nested_run_pending upon nested_svm_vmrun_msrpm() failure WARN_ON_ONCE(svm->nested.nested_run_pending) in nested_svm_vmexit() will fire if nested_run_pending remains '1' but it doesn't really need to, we are already failing and not going to run nested guest. 
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710141157.1640173-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 19e1e99a7458..bd0df1b706ad 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -493,6 +493,8 @@ int nested_svm_vmrun(struct vcpu_svm *svm) enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb); if (!nested_svm_vmrun_msrpm(svm)) { + svm->nested.nested_run_pending = 0; + svm->vmcb->control.exit_code = SVM_EXIT_ERR; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = 0; From 59cd9bc5b03f0bacce9506b068fec538aa9969a7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 10 Jul 2020 16:11:52 +0200 Subject: [PATCH 099/127] KVM: nSVM: prepare to handle errors from enter_svm_guest_mode() Some operations in enter_svm_guest_mode() may fail, e.g. currently we suppress kvm_set_cr3() return value. Prepare the code to propagate errors. No functional change intended.
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710141157.1640173-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 26 ++++++++++++++++---------- arch/x86/kvm/svm/svm.c | 6 ++++-- arch/x86/kvm/svm/svm.h | 4 ++-- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bd0df1b706ad..bca98651fa07 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -404,7 +404,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) vmcb_mark_all_dirty(svm->vmcb); } -void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb) { svm->nested.vmcb = vmcb_gpa; @@ -413,6 +413,8 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, nested_prepare_vmcb_control(svm); svm_set_gif(svm, true); + + return 0; } int nested_svm_vmrun(struct vcpu_svm *svm) @@ -490,18 +492,22 @@ int nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(&hsave->control, &vmcb->control); svm->nested.nested_run_pending = 1; - enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb); - if (!nested_svm_vmrun_msrpm(svm)) { - svm->nested.nested_run_pending = 0; + if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb)) + goto out_exit_err; - svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; + if (nested_svm_vmrun_msrpm(svm)) + goto out; - nested_svm_vmexit(svm); - } +out_exit_err: + svm->nested.nested_run_pending = 0; + + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); out: kvm_vcpu_unmap(&svm->vcpu, &map, true); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 13f923c69475..41f791e2a013 100644 --- a/arch/x86/kvm/svm/svm.c +++ 
b/arch/x86/kvm/svm/svm.c @@ -3889,6 +3889,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) struct kvm_host_map map; u64 guest; u64 vmcb; + int ret = 0; guest = GET_SMSTATE(u64, smstate, 0x7ed8); vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); @@ -3897,10 +3898,11 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) return 1; nested_vmcb = map.hva; - enter_svm_guest_mode(svm, vmcb, nested_vmcb); + ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb); kvm_vcpu_unmap(&svm->vcpu, &map, true); } - return 0; + + return ret; } static void enable_smi_window(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 121b198b51e9..a798e1731709 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -387,8 +387,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI)); } -void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb); +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb); void svm_leave_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); From 62156f6cd15ab27cf19a97161b5f1820951a36b1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 10 Jul 2020 16:11:53 +0200 Subject: [PATCH 100/127] KVM: nSVM: introduce nested_svm_load_cr3()/nested_npt_enabled() As a preparatory change for implementing nSVM-specific PGD switch (following nVMX' nested_vmx_load_cr3()), introduce nested_svm_load_cr3() instead of relying on kvm_set_cr3(). No functional change intended. 
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710141157.1640173-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bca98651fa07..47069fe3aae5 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -336,6 +336,21 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, nested_vmcb->control.exit_int_info = exit_int_info; } +static inline bool nested_npt_enabled(struct vcpu_svm *svm) +{ + return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; +} + +/* + * Load guest's cr3 at nested entry. @nested_npt is true if we are + * emulating VM-Entry into a guest with NPT enabled. + */ +static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, + bool nested_npt) +{ + return kvm_set_cr3(vcpu, cr3); +} + static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) { /* Load the nested guest state */ @@ -349,7 +364,8 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + (void)nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3, + nested_npt_enabled(svm)); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); @@ -368,7 +384,8 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v static void nested_prepare_vmcb_control(struct vcpu_svm *svm) { const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; - if (svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) + + if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(&svm->vcpu); /* Guest paging mode is active - reset mmu */ From 
bf7dea425327c5da12f540a1595f22770597e496 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 10 Jul 2020 16:11:54 +0200 Subject: [PATCH 101/127] KVM: nSVM: move kvm_set_cr3() after nested_svm_uninit_mmu_context() kvm_mmu_new_pgd() refers to arch.mmu and at this point it still references arch.guest_mmu while arch.root_mmu is expected. Note, the change is effectively a nop: when !npt_enabled, nested_svm_uninit_mmu_context() does nothing (as we don't do nested_svm_init_mmu_context()) and with npt_enabled we don't do kvm_set_cr3(). However, it will matter when we move the call to kvm_mmu_new_pgd into nested_svm_load_cr3(). No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710141157.1640173-7-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 47069fe3aae5..381e94d15a34 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -636,12 +636,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, hsave->save.efer); svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); svm_set_cr4(&svm->vcpu, hsave->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = hsave->save.cr3; - svm->vcpu.arch.cr3 = hsave->save.cr3; - } else { - (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); - } kvm_rax_write(&svm->vcpu, hsave->save.rax); kvm_rsp_write(&svm->vcpu, hsave->save.rsp); kvm_rip_write(&svm->vcpu, hsave->save.rip); @@ -661,6 +655,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_vcpu_unmap(&svm->vcpu, &map, true); nested_svm_uninit_mmu_context(&svm->vcpu); + + if (npt_enabled) { + svm->vmcb->save.cr3 = hsave->save.cr3; + svm->vcpu.arch.cr3 = hsave->save.cr3; + } else { + (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + } + kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); From a506fdd22342606d22645a6bf90a2d848e92e5d7 Mon Sep 17 
00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 10 Jul 2020 16:11:55 +0200 Subject: [PATCH 102/127] KVM: nSVM: implement nested_svm_load_cr3() and use it for host->guest switch Undesired triple fault gets injected to L1 guest on SVM when L2 is launched with certain CR3 values. #TF is raised by mmu_check_root() check in fast_pgd_switch() and the root cause is that when kvm_set_cr3() is called from nested_prepare_vmcb_save() with NPT enabled CR3 points to a nGPA so we can't check it with kvm_is_visible_gfn(). Using generic kvm_set_cr3() when switching to nested guest is not a great idea as we'll have to distinguish between 'real' CR3s and 'nested' CR3s to e.g. not call kvm_mmu_new_pgd() with nGPA. Following nVMX implement nested-specific nested_svm_load_cr3() doing the job. To support the change, nested_svm_load_cr3() needs to be re-ordered with nested_svm_init_mmu_context(). Note: the current implementation is sub-optimal as we always do TLB flush/MMU sync but this is still an improvement as we at least stop doing kvm_mmu_reset_context(). 
Fixes: 7c390d350f8b ("kvm: x86: Add fast CR3 switch code path") Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710141157.1640173-8-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 ++ arch/x86/kvm/svm/nested.c | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 78c88e8aecfa..61c35fec5219 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4952,6 +4952,8 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, union kvm_mmu_role new_role = kvm_calc_shadow_mmu_root_page_role(vcpu, false); + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); + if (new_role.as_u64 != context->mmu_role.as_u64) shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 381e94d15a34..f7d5bafd3b14 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -348,7 +348,28 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt) { - return kvm_set_cr3(vcpu, cr3); + if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)) + return -EINVAL; + + if (!nested_npt && is_pae_paging(vcpu) && + (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + return -EINVAL; + } + + /* + * TODO: optimize unconditional TLB flush/MMU sync here and in + * kvm_init_shadow_npt_mmu(). 
+ */ + if (!nested_npt) + kvm_mmu_new_pgd(vcpu, cr3, false, false); + + vcpu->arch.cr3 = cr3; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + + kvm_init_mmu(vcpu, false); + + return 0; } static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) @@ -364,9 +385,6 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - (void)nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3, - nested_npt_enabled(svm)); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); @@ -388,11 +406,6 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(&svm->vcpu); - /* Guest paging mode is active - reset mmu */ - kvm_mmu_reset_context(&svm->vcpu); - - svm_flush_tlb(&svm->vcpu); - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; @@ -424,11 +437,18 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb) { + int ret; + svm->nested.vmcb = vmcb_gpa; load_nested_vmcb_control(svm, &nested_vmcb->control); nested_prepare_vmcb_save(svm, nested_vmcb); nested_prepare_vmcb_control(svm); + ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3, + nested_npt_enabled(svm)); + if (ret) + return ret; + svm_set_gif(svm, true); return 0; From d82aaef9c88aa27bce63751d6d6329920b1fe8da Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 10 Jul 2020 16:11:56 +0200 Subject: [PATCH 103/127] KVM: nSVM: use nested_svm_load_cr3() on guest->host switch Make nSVM code resemble nVMX where nested_vmx_load_cr3() is used on both guest->host and host->guest 
transitions. Also, we can now eliminate unconditional kvm_mmu_reset_context() and speed things up. Note, nVMX has two different paths: load_vmcs12_host_state() and nested_vmx_restore_host_state() and the latter is used to restore from 'partial' switch to L2, it always uses kvm_mmu_reset_context(). nSVM doesn't have this yet. Also, nested_svm_vmexit()'s return value is almost always ignored nowadays. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710141157.1640173-9-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f7d5bafd3b14..7b331e3da3eb 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -342,8 +342,8 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) } /* - * Load guest's cr3 at nested entry. @nested_npt is true if we are - * emulating VM-Entry into a guest with NPT enabled. + * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true + * if we are emulating VM-Entry into a guest with NPT enabled. 
*/ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt) @@ -676,15 +676,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_uninit_mmu_context(&svm->vcpu); - if (npt_enabled) { - svm->vmcb->save.cr3 = hsave->save.cr3; - svm->vcpu.arch.cr3 = hsave->save.cr3; - } else { - (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); - } + rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false); + if (rc) + return 1; - kvm_mmu_reset_context(&svm->vcpu); - kvm_mmu_load(&svm->vcpu); + if (npt_enabled) + svm->vmcb->save.cr3 = hsave->save.cr3; /* * Drop what we picked up for L2 via svm_complete_interrupts() so it From fe9304d31831c9ee3943f9a6f72b7cea16d4bc11 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 10 Jul 2020 16:11:57 +0200 Subject: [PATCH 104/127] KVM: x86: drop superfluous mmu_check_root() from fast_pgd_switch() The mmu_check_root() check in fast_pgd_switch() seems to be superfluous: when GPA is outside of the visible range cached_root_available() will fail for non-direct roots (as we can't have a matching one on the list) and we don't seem to care for direct ones. Also, raising #TF immediately when a non-existent GFN is written to CR3 doesn't seem to match architectural behavior. Drop the check. 
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200710141157.1640173-10-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 61c35fec5219..613c33149428 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4243,8 +4243,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && mmu->root_level >= PT64_ROOT_4LEVEL) - return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) && - cached_root_available(vcpu, new_pgd, new_role); + return cached_root_available(vcpu, new_pgd, new_role); return false; } From 897861479c0640ed93ec82db78f8d839df32c4ac Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Fri, 10 Jul 2020 17:48:03 +0200 Subject: [PATCH 105/127] KVM: x86: Add helper functions for illegal GPA checking and page fault injection This patch adds two helper functions that will be used to support virtualizing MAXPHYADDR in both kvm-intel.ko and kvm.ko. kvm_fixup_and_inject_pf_error() injects a page fault for a user-specified GVA, while kvm_mmu_is_illegal_gpa() checks whether a GPA exceeds vCPU address limits. 
Signed-off-by: Mohammed Gamal Signed-off-by: Paolo Bonzini Message-Id: <20200710154811.418214-2-mgamal@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 6 ++++++ arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ arch/x86/kvm/x86.h | 1 + 3 files changed, 28 insertions(+) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 75125ae57e50..9f6554613bab 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -4,6 +4,7 @@ #include #include "kvm_cache_regs.h" +#include "cpuid.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -150,6 +151,11 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } +static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); +} + /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 69a2e9c981e9..0867a626b226 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10738,6 +10738,27 @@ int kvm_spec_ctrl_test_value(u64 value) } EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) +{ + struct x86_exception fault; + + if (!(error_code & PFERR_PRESENT_MASK) || + vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, &fault) != UNMAPPED_GVA) { + /* + * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page + * tables probably do not match the TLB. Just proceed + * with the error code that the processor gave. 
+ */ + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = error_code; + fault.nested_page_fault = false; + fault.address = gva; + } + vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); +} +EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 3308c3ccc0fd..995ab696dcf0 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -272,6 +272,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); From cd313569f5817782033b179f5dd81c2b611bbd18 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Fri, 10 Jul 2020 17:48:04 +0200 Subject: [PATCH 106/127] KVM: x86: mmu: Move translate_gpa() to mmu.c Also no point of it being inline since it's always called through function pointers. So remove that. 
Signed-off-by: Mohammed Gamal Signed-off-by: Paolo Bonzini Message-Id: <20200710154811.418214-3-mgamal@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ------ arch/x86/kvm/mmu/mmu.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5aaef036627f..733bbfa7001e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1510,12 +1510,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); -static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, - struct x86_exception *exception) -{ - return gpa; -} - static inline u16 kvm_read_ldt(void) { u16 ldt; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 613c33149428..01e7277acb48 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -516,6 +516,12 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) +{ + return gpa; +} + /* * Sets the shadow PTE masks used by the MMU. * From ec7771ab471ba6a945350353617e2e3385d0e013 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Fri, 10 Jul 2020 17:48:05 +0200 Subject: [PATCH 107/127] KVM: x86: mmu: Add guest physical address check in translate_gpa() Intel processors of various generations have supported 36, 39, 46 or 52 bits for physical addresses. Until IceLake introduced MAXPHYADDR==52, running on a machine with higher MAXPHYADDR than the guest more or less worked, because software that relied on reserved address bits (like KVM) generally used bit 51 as a marker and therefore the page faults were generated anyway. 
Unfortunately this is not true anymore if the host MAXPHYADDR is 52, and this can cause problems when migrating from a MAXPHYADDR<52 machine to one with MAXPHYADDR==52. Typically, the latter are machines that support 5-level page tables, so they can be identified easily from the LA57 CPUID bit. When that happens, the guest might have a physical address with reserved bits set, but the host won't see that and trap it. Hence, we need to check page faults' physical addresses against the guest's maximum physical memory and if it's exceeded, we need to add the PFERR_RSVD_MASK bits to the page fault error code. This patch does this for the MMU's page walks. The next patches will ensure that the correct exception and error code is produced whenever no host-reserved bits are set in page table entries. Signed-off-by: Mohammed Gamal Signed-off-by: Paolo Bonzini Message-Id: <20200710154811.418214-4-mgamal@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 01e7277acb48..77810ce66bdb 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -519,6 +519,12 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { + /* Check if guest physical address doesn't exceed guest maximum */ + if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + exception->error_code |= PFERR_RSVD_MASK; + return UNMAPPED_GVA; + } + return gpa; } From 6986982fef86ae71a27d6309739a0c38b562a5c0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 10 Jul 2020 17:48:06 +0200 Subject: [PATCH 108/127] KVM: x86: rename update_bp_intercept to update_exception_bitmap We would like to introduce a callback to update the #PF intercept when CPUID changes. Just reuse update_bp_intercept since VMX is already using update_exception_bitmap instead of a bespoke function. 
While at it, remove an unnecessary assignment in the SVM version, which is already done in the caller (kvm_arch_vcpu_ioctl_set_guest_debug) and has nothing to do with the exception bitmap. Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 7 +++---- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 733bbfa7001e..1df95f10c903 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1057,7 +1057,7 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*update_bp_intercept)(struct kvm_vcpu *vcpu); + void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 41f791e2a013..2371b1e40f39 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1627,7 +1627,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, vmcb_mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_bp_intercept(struct kvm_vcpu *vcpu) +static void update_exception_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1636,8 +1636,7 @@ static void update_bp_intercept(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); - } else - vcpu->guest_debug = 0; + } } static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) @@ -4037,7 +4036,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_blocking = svm_vcpu_blocking, .vcpu_unblocking = svm_vcpu_unblocking, - .update_bp_intercept = update_bp_intercept, + 
.update_exception_bitmap = update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, .get_msr = svm_get_msr, .set_msr = svm_set_msr, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2b41d987b101..8fe2999fcb1e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7875,7 +7875,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .update_bp_intercept = update_exception_bitmap, + .update_exception_bitmap = update_exception_bitmap, .get_msr_feature = vmx_get_msr_feature, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0867a626b226..35abe69aad28 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9316,7 +9316,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops.update_bp_intercept(vcpu); + kvm_x86_ops.update_exception_bitmap(vcpu); r = 0; From 32de2b5ee378ae590c90135e4d7f5c9149a04990 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 10 Jul 2020 17:48:07 +0200 Subject: [PATCH 109/127] KVM: x86: update exception bitmap on CPUID changes Allow vendor code to observe changes to MAXPHYADDR and start/stop intercepting page faults. 
Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index edbed4f522f2..7d92854082a1 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -148,6 +148,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); + kvm_x86_ops.update_exception_bitmap(vcpu); } static int is_efer_nx(void) From a0c134347baf56d0422d3658346a546891d3a98d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 10 Jul 2020 17:48:08 +0200 Subject: [PATCH 110/127] KVM: VMX: introduce vmx_need_pf_intercept Signed-off-by: Paolo Bonzini Message-Id: <20200710154811.418214-7-mgamal@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 28 +++++++++++++++++----------- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 5 +++++ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e4080ab2df21..4d561edf6f9c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2438,22 +2438,28 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 + * doesn't care about page faults then we should set all of these to + * L1's desires. 
However, if L0 does care about (some) page faults, it + * is not easy (if at all possible?) to merge L0 and L1's desires, we + * simply ask to exit on each and every L2 page fault. This is done by + * setting MASK=MATCH=0 and (see below) EB.PF=1. * Note that below we don't need special code to set EB.PF beyond the * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when * !enable_ept, EB.PF is 1, so the "or" will always be 1. */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); + if (vmx_need_pf_intercept(&vmx->vcpu)) { + /* + * TODO: if both L0 and L1 need the same MASK and MATCH, + * go ahead and use it? + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); + } else { + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); + } if (cpu_has_vmx_apicv()) { vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8fe2999fcb1e..6c51438c7567 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -780,7 +780,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; - if (enable_ept) + if (!vmx_need_pf_intercept(vcpu)) eb &= ~(1u << PF_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 639798e4a6ca..b0e5e210f1c1 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -550,6 +550,11 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; } +static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) +{ + 
return !enable_ept; +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ From 1dbf5d68af6ffdb1c8b0d38ee565768e22e2c3ab Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Fri, 10 Jul 2020 17:48:09 +0200 Subject: [PATCH 111/127] KVM: VMX: Add guest physical address check in EPT violation and misconfig Check guest physical address against its maximum, which depends on the guest MAXPHYADDR. If the guest's physical address exceeds the maximum (i.e. has reserved bits set), inject a guest page fault with PFERR_RSVD_MASK set. This has to be done both in the EPT violation and page fault paths, as there are complications in both cases with respect to the computation of the correct error code. For EPT violations, unfortunately the only possibility is to emulate, because the access type in the exit qualification might refer to an access to a paging structure, rather than to the access performed by the program. Trapping page faults instead is needed in order to correct the error code, but the access type can be obtained from the original error code and passed to gva_to_gpa. The corrections required in the error code are subtle. For example, imagine that a PTE for a supervisor page has a reserved bit set. On a supervisor-mode access, the EPT violation path would trigger. However, on a user-mode access, the processor will not notice the reserved bit and not include PFERR_RSVD_MASK in the error code. 
Co-developed-by: Mohammed Gamal Signed-off-by: Paolo Bonzini Message-Id: <20200710154811.418214-8-mgamal@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 24 +++++++++++++++++++++--- arch/x86/kvm/vmx/vmx.h | 3 ++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6c51438c7567..5518f75c9b19 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4791,9 +4791,15 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_page_fault(intr_info)) { cr2 = vmx_get_exit_qual(vcpu); - /* EPT won't cause page fault directly */ - WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept); - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); + if (enable_ept && !vcpu->arch.apf.host_apf_flags) { + /* + * EPT will cause page fault only if we need to + * detect illegal GPAs. + */ + kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); + return 1; + } else + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -5309,6 +5315,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; vcpu->arch.exit_qualification = exit_qualification; + + /* + * Check that the GPA doesn't exceed physical memory limits, as that is + * a guest page fault. We have to emulate the instruction here, because + * if the illegal address is that of a paging structure, then + * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we + * would also use advanced VM-exit information for EPT violations to + * reconstruct the page fault error code. 
+ */ + if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa))) + return kvm_emulate_instruction(vcpu, 0); + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index b0e5e210f1c1..0d06951e607c 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -11,6 +11,7 @@ #include "kvm_cache_regs.h" #include "ops.h" #include "vmcs.h" +#include "cpuid.h" extern const u32 vmx_msr_index[]; @@ -552,7 +553,7 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) { - return !enable_ept; + return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; } void dump_vmcs(void); From 8c4182bd27cafc2b0f94564485c89ed366c1572c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 10 Jul 2020 17:48:10 +0200 Subject: [PATCH 112/127] KVM: VMX: optimize #PF injection when MAXPHYADDR does not match Ignore non-present page faults, since those cannot have reserved bits set. When running access.flat with "-cpu Haswell,phys-bits=36", the number of trapped page faults goes down from 8872644 to 3978948. Signed-off-by: Paolo Bonzini Message-Id: <20200710154811.418214-9-mgamal@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5518f75c9b19..962a78c7dde5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4355,6 +4355,16 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + + /* + * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched + * between guest and host. In that case we only care about present + * faults. 
+ */ + if (enable_ept) { + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) From 3edd68399dc155b80335244c8c2673eaa652931a Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Fri, 10 Jul 2020 17:48:11 +0200 Subject: [PATCH 113/127] KVM: x86: Add a capability for GUEST_MAXPHYADDR < HOST_MAXPHYADDR support This patch adds a new capability KVM_CAP_SMALLER_MAXPHYADDR which allows userspace to query if the underlying architecture would support GUEST_MAXPHYADDR < HOST_MAXPHYADDR and hence act accordingly (e.g. qemu can decide if it should warn for -cpu ..,phys-bits=X) The complications in this patch are due to unexpected (but documented) behaviour we see with NPF vmexit handling in AMD processor. If SVM is modified to add guest physical address checks in the NPF and guest #PF paths, we see the following error multiple times in the 'access' test in kvm-unit-tests: test pte.p pte.36 pde.p: FAIL: pte 2000021 expected 2000001 Dump mapping: address: 0x123400000000 ------L4: 24c3027 ------L3: 24c4027 ------L2: 24c5021 ------L1: 1002000021 This is because the PTE's accessed bit is set by the CPU hardware before the NPF vmexit. This is handled completely by hardware and cannot be fixed in software. Therefore, availability of the new capability depends on a boolean variable allow_smaller_maxphyaddr which is set individually by VMX and SVM init routines. On VMX it's always set to true, on SVM it's only set to true when NPT is not enabled. 
CC: Tom Lendacky CC: Babu Moger Signed-off-by: Mohammed Gamal Message-Id: <20200710154811.418214-10-mgamal@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 15 +++++++++++++++ arch/x86/kvm/vmx/vmx.c | 7 +++++++ arch/x86/kvm/x86.c | 6 ++++++ include/uapi/linux/kvm.h | 2 ++ 5 files changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1df95f10c903..1bab87a444d7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1263,7 +1263,7 @@ struct kvm_arch_async_pf { }; extern u64 __read_mostly host_efer; - +extern bool __read_mostly allow_smaller_maxphyaddr; extern struct kvm_x86_ops kvm_x86_ops; #define __KVM_HAVE_ARCH_VM_ALLOC diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2371b1e40f39..783330d0e7b8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -924,6 +924,21 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. 
+ * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + return 0; err: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 962a78c7dde5..1bb59ae5016d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8309,6 +8309,13 @@ static int __init vmx_init(void) #endif vmx_check_vmcs12_offsets(); + /* + * Intel processors don't have problems with + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable + * it for VMX by default + */ + allow_smaller_maxphyaddr = true; + return 0; } module_init(vmx_init); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35abe69aad28..95ef62922869 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -187,6 +187,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs; u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); +bool __read_mostly allow_smaller_maxphyaddr; +EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); + static u64 __read_mostly host_xss; u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); @@ -3574,6 +3577,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; + case KVM_CAP_SMALLER_MAXPHYADDR: + r = (int) allow_smaller_maxphyaddr; + break; default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ff9b335620d0..2c73dcfb3dbb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1033,6 +1033,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 #define KVM_CAP_LAST_CPU 184 +#define KVM_CAP_SMALLER_MAXPHYADDR 185 + #ifdef KVM_CAP_IRQ_ROUTING From e8af9e9f457bc457c67538ee488580de5fde1f17 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 10 Jul 
2020 16:51:01 -0400 Subject: [PATCH 114/127] KVM: nSVM: remove nonsensical EXITINFO1 adjustment on nested NPF The "if" that drops the present bit from the page structure faults makes no sense. It was added by yours truly in order to be bug-compatible with pre-existing code and in order to make the tests pass; however, the tests are wrong. The behavior after this patch matches bare metal. Reported-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 7b331e3da3eb..61378a3c2ce4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -48,13 +48,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; svm->vmcb->control.exit_info_1 |= fault->error_code; - /* - * The present bit is always zero for page structure faults on real - * hardware. - */ - if (svm->vmcb->control.exit_info_1 & (2ULL << 32)) - svm->vmcb->control.exit_info_1 &= ~1; - nested_svm_vmexit(svm); } From 033555f6eb60787bd40e34d7abeacaebdcd4c54e Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 19 Jul 2020 18:23:27 +0800 Subject: [PATCH 115/127] MIPS: KVM: Fix build error caused by 'kvm_run' cleanup Commit c34b26b98caca48ec9ee9 ("KVM: MIPS: clean up redundant 'kvm_run' parameters") removed the 'kvm_run' parameter in kvm_mips_complete_mmio_load(), but forgot to update all callers. 
Fixes: c34b26b98caca48ec9ee9 ("KVM: MIPS: clean up redundant 'kvm_run' parameters") Reported-by: kernel test robot Cc: Tianjia Zhang Signed-off-by: Huacai Chen Message-Id: <1595154207-9787-1-git-send-email-chenhc@lemote.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 3221193c7371..8018e92ffd4b 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -2123,7 +2123,7 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, run->mmio.phys_addr, run->mmio.len, run->mmio.data); if (!r) { - kvm_mips_complete_mmio_load(vcpu, run); + kvm_mips_complete_mmio_load(vcpu); vcpu->mmio_needed = 0; return EMULATE_DONE; } From 9c2475f3e46a1de22bcae3b2c98c398937261c8a Mon Sep 17 00:00:00 2001 From: Haiwei Li Date: Tue, 21 Jul 2020 16:23:54 +0800 Subject: [PATCH 116/127] KVM: Using macros instead of magic values Instead of using magic values, use macros. 
Signed-off-by: Haiwei Li Message-Id: <4c072161-80dd-b7ed-7adb-02acccaa0701@gmail.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 47801a44cfa6..d5fb2ea2fadb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2083,7 +2083,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_SELF_IPI: if (apic_x2apic_mode(apic)) { - kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); + kvm_lapic_reg_write(apic, APIC_ICR, + APIC_DEST_SELF | (val & APIC_VECTOR_MASK)); } else ret = 1; break; From 096586fda522957881379cfe7870a373ab783c87 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:14 -0700 Subject: [PATCH 117/127] KVM: nSVM: Correctly set the shadow NPT root level in its MMU role Move the initialization of shadow NPT MMU's shadow_root_level into kvm_init_shadow_npt_mmu() and explicitly set the level in the shadow NPT MMU's role to be the TDP level. This ensures the role and MMU levels are synchronized and also initialized before __kvm_mmu_new_pgd(), which consumes the level when attempting a fast PGD switch. 
Cc: Vitaly Kuznetsov Fixes: 9fa72119b24db ("kvm: x86: Introduce kvm_mmu_calc_root_page_role()") Fixes: a506fdd223426 ("KVM: nSVM: implement nested_svm_load_cr3() and use it for host->guest switch") Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-2-sean.j.christopherson@intel.com> Reviewed-by: Vitaly Kuznetsov Tested-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 +++ arch/x86/kvm/svm/nested.c | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 77810ce66bdb..678b6209dad5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4963,6 +4963,9 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, union kvm_mmu_role new_role = kvm_calc_shadow_mmu_root_page_role(vcpu, false); + new_role.base.level = vcpu->arch.tdp_level; + context->shadow_root_level = new_role.base.level; + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); if (new_role.as_u64 != context->mmu_role.as_u64) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 61378a3c2ce4..fb68467e6049 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -85,7 +85,6 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu->shadow_root_level = vcpu->arch.tdp_level; reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } From f291a358e0d88e3b20431266d8f78fc5eda1aec7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:16 -0700 Subject: [PATCH 118/127] KVM: VMX: Drop a duplicate declaration of construct_eptp() Remove an extra declaration of construct_eptp() from vmx.h. 
Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 0d06951e607c..0e8d25b0cec3 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -537,8 +537,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow) GFP_KERNEL_ACCOUNT); } -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); - static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) { vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; From 59505b55aa0957bcad84e74bb80153d5c77916f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:15 -0700 Subject: [PATCH 119/127] KVM: x86/mmu: Add separate helper for shadow NPT root page role calc Refactor the shadow NPT role calculation into a separate helper to better differentiate it from the non-nested shadow MMU, e.g. the NPT variant is never direct and derives its root level from the TDP level. 
Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-3-sean.j.christopherson@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 678b6209dad5..0fb033ce6cc5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4908,7 +4908,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } static union kvm_mmu_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); @@ -4916,9 +4916,19 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) !is_write_protection(vcpu); role.base.smap_andnot_wp = role.ext.cr4_smap && !is_write_protection(vcpu); - role.base.direct = !is_paging(vcpu); role.base.gpte_is_8_bytes = !!is_pae(vcpu); + return role; +} + +static union kvm_mmu_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, base_only); + + role.base.direct = !is_paging(vcpu); + if (!is_long_mode(vcpu)) role.base.level = PT32E_ROOT_LEVEL; else if (is_la57_mode(vcpu)) @@ -4956,14 +4966,24 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efe shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); } +static union kvm_mmu_role +kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, false); + + role.base.direct = false; + role.base.level = vcpu->arch.tdp_level; + + return role; +} + void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, gpa_t nested_cr3) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; - union kvm_mmu_role 
new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, false); + union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); - new_role.base.level = vcpu->arch.tdp_level; context->shadow_root_level = new_role.base.level; __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); From 812f8058369f81be2d7e13a78e398d9425e830d8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:17 -0700 Subject: [PATCH 120/127] KVM: VMX: Make vmx_load_mmu_pgd() static Make vmx_load_mmu_pgd() static as it is no longer invoked directly by nested VMX (or any code for that matter). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1bb59ae5016d..791baa73e578 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3092,7 +3092,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) return eptp; } -void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) +static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 0e8d25b0cec3..3c55433ac1b2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -338,7 +338,6 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); -void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment 
*var, int seg); From 2a40b9001ec210e51a2bc8647629d7906779fb0b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:18 -0700 Subject: [PATCH 121/127] KVM: x86: Pull the PGD's level from the MMU instead of recalculating it Use the shadow_root_level from the current MMU as the root level for the PGD, i.e. for VMX's EPTP. This eliminates the weird dependency between VMX and the MMU where both must independently calculate the same root level for things to work correctly. Temporarily keep VMX's calculation of the level and use it to WARN if the incoming level diverges. Opportunistically refactor kvm_mmu_load_pgd() to avoid indentation hell, and rename a 'cr3' param in the load_mmu_pgd prototype that managed to survive the cr3 purge. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/mmu.h | 10 +++++++--- arch/x86/kvm/svm/svm.c | 3 ++- arch/x86/kvm/vmx/nested.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++++------ arch/x86/kvm/vmx/vmx.h | 3 ++- 6 files changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1bab87a444d7..ce60f4c38843 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1136,7 +1136,8 @@ struct kvm_x86_ops { int (*get_tdp_level)(struct kvm_vcpu *vcpu); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd, + int pgd_level); bool (*has_wbinvd_exit)(void); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 9f6554613bab..5efc6081ca13 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -90,9 +90,13 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) static inline void 
kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) { - if (VALID_PAGE(vcpu->arch.mmu->root_hpa)) - kvm_x86_ops.load_mmu_pgd(vcpu, vcpu->arch.mmu->root_hpa | - kvm_get_active_pcid(vcpu)); + u64 root_hpa = vcpu->arch.mmu->root_hpa; + + if (!VALID_PAGE(root_hpa)) + return; + + kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu), + vcpu->arch.mmu->shadow_root_level); } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 783330d0e7b8..c70d7dd33306 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3541,7 +3541,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) return exit_fastpath; } -static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, + int root_level) { struct vcpu_svm *svm = to_svm(vcpu); unsigned long cr3; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4d561edf6f9c..e405e754b592 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2162,7 +2162,8 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) * consistency checks. */ if (enable_ept && nested_early_check) - vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); + vmcs_write64(EPT_POINTER, + construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 791baa73e578..244053cff0a3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2933,14 +2933,16 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { - u64 root_hpa = vcpu->arch.mmu->root_hpa; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 root_hpa = mmu->root_hpa; /* No flush required if the current context is invalid. 
*/ if (!VALID_PAGE(root_hpa)) return; if (enable_ept) - ept_sync_context(construct_eptp(vcpu, root_hpa)); + ept_sync_context(construct_eptp(vcpu, root_hpa, + mmu->shadow_root_level)); else if (!is_guest_mode(vcpu)) vpid_sync_context(to_vmx(vcpu)->vpid); else @@ -3078,11 +3080,12 @@ static int get_ept_level(struct kvm_vcpu *vcpu) return vmx_get_tdp_level(vcpu); } -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, + int root_level) { u64 eptp = VMX_EPTP_MT_WB; - eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) @@ -3092,7 +3095,8 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) return eptp; } -static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) +static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, + int pgd_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3100,7 +3104,9 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) u64 eptp; if (enable_ept) { - eptp = construct_eptp(vcpu, pgd); + WARN_ON(pgd_level != get_ept_level(vcpu)); + + eptp = construct_eptp(vcpu, pgd, pgd_level); vmcs_write64(EPT_POINTER, eptp); if (kvm_x86_ops.tlb_remote_flush) { diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 3c55433ac1b2..26175a4759fa 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -341,7 +341,8 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, + int root_level); void 
update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); From f83a4a6932f002701db19f968938ada1289f5e3c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:19 -0700 Subject: [PATCH 122/127] KVM: VMX: Remove temporary WARN on expected vs. actual EPTP level mismatch Remove the WARN in vmx_load_mmu_pgd() that was temporarily added to aid bisection/debug in the event the current MMU's shadow root level didn't match VMX's computed EPTP level. Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-7-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 244053cff0a3..da75878171ce 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3072,14 +3072,6 @@ static int vmx_get_tdp_level(struct kvm_vcpu *vcpu) return 4; } -static int get_ept_level(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) - return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); - - return vmx_get_tdp_level(vcpu); -} - u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, int root_level) { @@ -3104,8 +3096,6 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, u64 eptp; if (enable_ept) { - WARN_ON(pgd_level != get_ept_level(vcpu)); - eptp = construct_eptp(vcpu, pgd, pgd_level); vmcs_write64(EPT_POINTER, eptp); From d468d94b7bafa7a2dd9bc72e5f7868469be3f7c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:20 -0700 Subject: [PATCH 123/127] KVM: x86: Dynamically calculate TDP level from max level and MAXPHYADDR Calculate the desired TDP level on the fly using the max TDP level and MAXPHYADDR instead of doing the same when CPUID is updated.
This avoids the hidden dependency on cpuid_maxphyaddr() in vmx_get_tdp_level() and also standardizes the "use 5-level paging iff MAXPHYADDR > 48" behavior across x86. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-8-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/cpuid.c | 2 -- arch/x86/kvm/mmu/mmu.c | 17 +++++++++++++---- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 6 +++--- arch/x86/kvm/x86.c | 2 +- 6 files changed, 21 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ce60f4c38843..ffd45b68e1d4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -639,7 +639,7 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; int maxphyaddr; - int tdp_level; + int max_tdp_level; /* emulate context */ @@ -1133,7 +1133,7 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - int (*get_tdp_level)(struct kvm_vcpu *vcpu); + int (*get_max_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7d92854082a1..fa873e3e6e90 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -140,9 +140,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - /* Note, maxphyaddr must be updated before tdp_level. 
*/ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); kvm_mmu_reset_context(vcpu); kvm_pmu_refresh(vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0fb033ce6cc5..559b4b92b5e2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4846,13 +4846,22 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, return role; } +static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) +{ + /* Use 5-level TDP if and only if it's useful/necessary. */ + if (vcpu->arch.max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + return 4; + + return vcpu->arch.max_tdp_level; +} + static union kvm_mmu_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = vcpu->arch.tdp_level; + role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -4873,7 +4882,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->sync_page = nonpaging_sync_page; context->invlpg = NULL; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = vcpu->arch.tdp_level; + context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); context->direct_map = true; context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; @@ -4973,7 +4982,7 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) kvm_calc_shadow_root_page_role_common(vcpu, false); role.base.direct = false; - role.base.level = vcpu->arch.tdp_level; + role.base.level = kvm_mmu_get_tdp_level(vcpu); return role; } @@ -5683,7 +5692,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. 
*/ - if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL) + if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c70d7dd33306..c94faca46e76 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -254,7 +254,7 @@ static inline void invlpga(unsigned long addr, u32 asid) asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr)); } -static int get_npt_level(struct kvm_vcpu *vcpu) +static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 return PT64_ROOT_4LEVEL; @@ -4109,7 +4109,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_tss_addr = svm_set_tss_addr, .set_identity_map_addr = svm_set_identity_map_addr, - .get_tdp_level = get_npt_level, + .get_max_tdp_level = get_max_npt_level, .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index da75878171ce..c0b1c7bd1248 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3065,9 +3065,9 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } -static int vmx_get_tdp_level(struct kvm_vcpu *vcpu) +static int vmx_get_max_tdp_level(void) { - if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) + if (cpu_has_vmx_ept_5levels()) return 5; return 4; } @@ -7959,7 +7959,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, - .get_tdp_level = vmx_get_tdp_level, + .get_max_tdp_level = vmx_get_max_tdp_level, .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 95ef62922869..41f43bb716c1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9520,7 +9520,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); 
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); + vcpu->arch.max_tdp_level = kvm_x86_ops.get_max_tdp_level(); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; From 1d92d2e8e70697e92bdc1f5ff5c80216fbcd1774 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:21 -0700 Subject: [PATCH 124/127] KVM: x86/mmu: Rename max_page_level to max_huge_page_level Rename max_page_level to explicitly call out that it tracks the max huge page level so as to avoid confusion when a future patch moves the max TDP level, i.e. max root level, into the MMU and kvm_configure_mmu(). Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-9-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 559b4b92b5e2..c867b35759ab 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -92,7 +92,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; -static int max_page_level __read_mostly; +static int max_huge_page_level __read_mostly; enum { AUDIT_PRE_PAGE_FAULT, @@ -3256,7 +3256,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - max_level = min(max_level, max_page_level); + max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) @@ -5580,23 +5580,23 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_huge_page_level) { tdp_enabled = enable_tdp; /* - * max_page_level reflects the capabilities of KVM's MMU irrespective + * 
max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when * the kernel is not. But, KVM never creates a page size greater than * what is used by the kernel for any given HVA, i.e. the kernel's * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). */ if (tdp_enabled) - max_page_level = tdp_page_level; + max_huge_page_level = tdp_huge_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) - max_page_level = PG_LEVEL_1G; + max_huge_page_level = PG_LEVEL_1G; else - max_page_level = PG_LEVEL_2M; + max_huge_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); From 83013059bdc5486dcbcd33b5c4abf8431766e06c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:22 -0700 Subject: [PATCH 125/127] KVM: x86: Specify max TDP level via kvm_configure_mmu() Capture the max TDP level during kvm_configure_mmu() instead of using a kvm_x86_ops hook to do it at every vCPU creation. 
Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-10-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/mmu/mmu.c | 9 ++++++--- arch/x86/kvm/svm/svm.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 3 +-- arch/x86/kvm/x86.c | 1 - 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ffd45b68e1d4..5ab3af7275d8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1133,7 +1133,6 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - int (*get_max_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd, @@ -1509,7 +1508,8 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, bool skip_mmu_sync); -void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level); static inline u16 kvm_read_ldt(void) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c867b35759ab..862bf418214e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -93,6 +93,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); bool tdp_enabled = false; static int max_huge_page_level __read_mostly; +static int max_tdp_level __read_mostly; enum { AUDIT_PRE_PAGE_FAULT, @@ -4849,10 +4850,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { /* Use 5-level TDP if and only if it's useful/necessary. 
*/ - if (vcpu->arch.max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; - return vcpu->arch.max_tdp_level; + return max_tdp_level; } static union kvm_mmu_role @@ -5580,9 +5581,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_configure_mmu(bool enable_tdp, int tdp_huge_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level) { tdp_enabled = enable_tdp; + max_tdp_level = tdp_max_root_level; /* * max_huge_page_level reflects KVM's MMU capabilities irrespective diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c94faca46e76..5f47b44c5c32 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -885,7 +885,7 @@ static __init int svm_hardware_setup(void) if (npt_enabled && !npt) npt_enabled = false; - kvm_configure_mmu(npt_enabled, PG_LEVEL_1G); + kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? 
"en" : "dis"); if (nrips) { @@ -4109,7 +4109,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_tss_addr = svm_set_tss_addr, .set_identity_map_addr = svm_set_identity_map_addr, - .get_max_tdp_level = get_max_npt_level, .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c0b1c7bd1248..a70d8f6d8aba 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7959,7 +7959,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, - .get_max_tdp_level = vmx_get_max_tdp_level, .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, @@ -8110,7 +8109,7 @@ static __init int hardware_setup(void) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; - kvm_configure_mmu(enable_ept, ept_lpage_level); + kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41f43bb716c1..dc4370394ab8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9520,7 +9520,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.max_tdp_level = kvm_x86_ops.get_max_tdp_level(); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; From a445fc457d2886a1264ec09c34f4000d1b30784d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 31 Jul 2020 11:12:20 +0800 Subject: [PATCH 126/127] KVM: LAPIC: Set the TDCR settable bits It is a little different between Intel and AMD, Intel's bit 2 is 0 and AMD is reserved. On bare-metal, Intel will refuse to set APIC_TDCR once bits except 0, 1, 3 are setting, however, AMD will accept bits 0, 1, 3 and ignore other bits setting as patch does. Before the patch, we can get back anything what we set to the APIC_TDCR, this patch improves it. 
Signed-off-by: Wanpeng Li Message-Id: <1596165141-28874-2-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d5fb2ea2fadb..bd16e3100932 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2066,7 +2066,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: { uint32_t old_divisor = apic->divide_count; - kvm_lapic_set_reg(apic, APIC_TDCR, val); + kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb); update_divide_count(apic); if (apic->divide_count != old_divisor && apic->lapic_timer.period) { From ff2bd9ff115218c144441e1df0370d9083b82866 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 14 Jul 2020 17:23:51 +0300 Subject: [PATCH 127/127] KVM: SVM: Fix sev_pin_memory() error handling The sev_pin_memory() function was modified to return error pointers instead of NULL but there are two problems. The first problem is that if "npages" is zero then it still returns NULL. Secondly, several of the callers were not updated to check for error pointers instead of NULL. Either one of these issues will lead to an Oops. 
Fixes: a8d908b5873c ("KVM: x86: report sev_pin_memory errors with PTR_ERR") Signed-off-by: Dan Carpenter Message-Id: <20200714142351.GA315374@mwanda> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f7f1f4ecf08e..402dc4234e39 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -318,6 +318,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long locked, lock_limit; struct page **pages; unsigned long first, last; + int ret; if (ulen == 0 || uaddr + ulen < uaddr) return ERR_PTR(-EINVAL); @@ -351,6 +352,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); + ret = -ENOMEM; goto err; } @@ -360,13 +362,11 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return pages; err: - if (npinned > 0) { + if (npinned > 0) unpin_user_pages(pages, npinned); - npinned = -ENOMEM; - } kvfree(pages); - return ERR_PTR(npinned); + return ERR_PTR(ret); } static void sev_unpin_memory(struct kvm *kvm, struct page **pages, @@ -440,8 +440,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Lock the user memory. 
*/ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); - if (!inpages) { - ret = -ENOMEM; + if (IS_ERR(inpages)) { + ret = PTR_ERR(inpages); goto e_free; } @@ -795,13 +795,13 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) /* lock userspace source and destination page */ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); - if (!src_p) - return -EFAULT; + if (IS_ERR(src_p)) + return PTR_ERR(src_p); dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); - if (!dst_p) { + if (IS_ERR(dst_p)) { sev_unpin_memory(kvm, src_p, n); - return -EFAULT; + return PTR_ERR(dst_p); } /*