KVM: arm/arm64: Fix handling of stage2 huge mappings

We rely on the mmu_notifier callbacks to handle the split/merge
of huge pages, and thus we are guaranteed that, while creating a
block mapping, either the entire block is unmapped at stage2 or it
is missing permission.

However, we miss the case where a block mapping is split for dirty
logging and could later be made a block mapping again if dirty
logging is cancelled. This not only creates inconsistent TLB entries
for the pages in the block, but also leaks the table pages at the
PMD level.

Handle this corner case for the huge mappings at stage2 by
unmapping the non-huge mapping for the block. This could potentially
release the upper level table. So we need to restart the table walk
once we unmap the range.

Fixes: ad361f093c ("KVM: ARM: Support hugetlbfs backed huge pages")
Reported-by: Zheng Xiang <zhengxiang9@huawei.com>
Cc: Zheng Xiang <zhengxiang9@huawei.com>
Cc: Zenghui Yu <yuzenghui@huawei.com>
Cc: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
This commit is contained in:
Suzuki K Poulose 2019-03-20 14:57:19 +00:00 committed by Marc Zyngier
parent a80868f398
commit 3c3736cd32
2 changed files with 45 additions and 16 deletions

View File

@ -75,6 +75,8 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm)
#define S2_PMD_MASK PMD_MASK #define S2_PMD_MASK PMD_MASK
#define S2_PMD_SIZE PMD_SIZE #define S2_PMD_SIZE PMD_SIZE
#define S2_PUD_MASK PUD_MASK
#define S2_PUD_SIZE PUD_SIZE
static inline bool kvm_stage2_has_pmd(struct kvm *kvm) static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
{ {

View File

@ -1067,25 +1067,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
{ {
pmd_t *pmd, old_pmd; pmd_t *pmd, old_pmd;
retry:
pmd = stage2_get_pmd(kvm, cache, addr); pmd = stage2_get_pmd(kvm, cache, addr);
VM_BUG_ON(!pmd); VM_BUG_ON(!pmd);
old_pmd = *pmd; old_pmd = *pmd;
/*
* Multiple vcpus faulting on the same PMD entry, can
* lead to them sequentially updating the PMD with the
* same value. Following the break-before-make
* (pmd_clear() followed by tlb_flush()) process can
* hinder forward progress due to refaults generated
* on missing translations.
*
* Skip updating the page table if the entry is
* unchanged.
*/
if (pmd_val(old_pmd) == pmd_val(*new_pmd))
return 0;
if (pmd_present(old_pmd)) { if (pmd_present(old_pmd)) {
/* /*
* Multiple vcpus faulting on the same PMD entry, can * If we already have PTE level mapping for this block,
* lead to them sequentially updating the PMD with the * we must unmap it to avoid inconsistent TLB state and
* same value. Following the break-before-make * leaking the table page. We could end up in this situation
* (pmd_clear() followed by tlb_flush()) process can * if the memory slot was marked for dirty logging and was
* hinder forward progress due to refaults generated * reverted, leaving PTE level mappings for the pages accessed
* on missing translations. * during the period. So, unmap the PTE level mapping for this
* block and retry, as we could have released the upper level
* table in the process.
* *
* Skip updating the page table if the entry is * Normal THP split/merge follows mmu_notifier callbacks and do
* unchanged. * get handled accordingly.
*/ */
if (pmd_val(old_pmd) == pmd_val(*new_pmd)) if (!pmd_thp_or_huge(old_pmd)) {
return 0; unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
goto retry;
}
/* /*
* Mapping in huge pages should only happen through a * Mapping in huge pages should only happen through a
* fault. If a page is merged into a transparent huge * fault. If a page is merged into a transparent huge
@ -1097,8 +1115,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
* should become splitting first, unmapped, merged, * should become splitting first, unmapped, merged,
* and mapped back in on-demand. * and mapped back in on-demand.
*/ */
VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
pmd_clear(pmd); pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr); kvm_tlb_flush_vmid_ipa(kvm, addr);
} else { } else {
@ -1114,6 +1131,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
{ {
pud_t *pudp, old_pud; pud_t *pudp, old_pud;
retry:
pudp = stage2_get_pud(kvm, cache, addr); pudp = stage2_get_pud(kvm, cache, addr);
VM_BUG_ON(!pudp); VM_BUG_ON(!pudp);
@ -1121,14 +1139,23 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
/* /*
* A large number of vcpus faulting on the same stage 2 entry, * A large number of vcpus faulting on the same stage 2 entry,
* can lead to a refault due to the * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
* stage2_pud_clear()/tlb_flush(). Skip updating the page * Skip updating the page tables if there is no change.
* tables if there is no change.
*/ */
if (pud_val(old_pud) == pud_val(*new_pudp)) if (pud_val(old_pud) == pud_val(*new_pudp))
return 0; return 0;
if (stage2_pud_present(kvm, old_pud)) { if (stage2_pud_present(kvm, old_pud)) {
/*
* If we already have table level mapping for this block, unmap
* the range for this block and retry.
*/
if (!stage2_pud_huge(kvm, old_pud)) {
unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
goto retry;
}
WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
stage2_pud_clear(kvm, pudp); stage2_pud_clear(kvm, pudp);
kvm_tlb_flush_vmid_ipa(kvm, addr); kvm_tlb_flush_vmid_ipa(kvm, addr);
} else { } else {