From c844c4ad37ba222ab37c64cfbe4c5eb1ec73c844 Mon Sep 17 00:00:00 2001
From: deraadt <deraadt@openbsd.org>
Date: Tue, 21 Aug 2018 19:04:38 +0000
Subject: [PATCH] Perform mitigations for Intel L1TF screwup.

There are three options: (1) Future cpus which don't have the bug,
(2) cpu's with microcode containing a L1D flush operation, (3) stuffing
the L1D cache with fresh data and expiring old content.  This stuffing
loop is complicated and interesting, no details on the mitigation have
been released by Intel so Mike and I studied other systems for
inspiration.  Replacement algorithm for the L1D is described in the
tlbleed paper.  We use a 64K PA-linear region filled with trapsleds
(in case there is L1D->L1I data movement).  The TLBs covering the
region are loaded first, because TLB loading apparently flows through
the D cache.  Before performing vmlaunch or vmresume, the cachelines
covering the guest registers are also flushed.

with mlarkin, additional testing by pd, handy comments from the
kettenis and guenther peanuts
---
 sys/arch/amd64/amd64/identcpu.c     | 24 +++++++++-
 sys/arch/amd64/amd64/vmm.c          | 33 +++++++++++---
 sys/arch/amd64/amd64/vmm_support.S  | 71 ++++++++++++++++++++++++++++-
 sys/arch/amd64/include/cpu.h        |  3 +-
 sys/arch/amd64/include/specialreg.h |  5 +-
 sys/arch/amd64/include/vmmvar.h     |  4 +-
 6 files changed, 127 insertions(+), 13 deletions(-)

diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c
index cd433d578ba..184379b8a51 100644
--- a/sys/arch/amd64/amd64/identcpu.c
+++ b/sys/arch/amd64/amd64/identcpu.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: identcpu.c,v 1.106 2018/08/15 02:07:35 jsg Exp $ */
+/* $OpenBSD: identcpu.c,v 1.107 2018/08/21 19:04:38 deraadt Exp $ */
 /* $NetBSD: identcpu.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */
 
 /*
@@ -1011,5 +1011,27 @@ cpu_check_vmm_cap(struct cpu_info *ci)
 		if (cap & AMD_SVM_NESTED_PAGING_CAP)
 			ci->ci_vmm_flags |= CI_VMM_RVI;
 	}
+
+	/*
+	 * Check "L1 flush on VM entry" (Intel L1TF vuln) semantics
+	 */
+	if (!strcmp(cpu_vendor, "GenuineIntel")) {
+		if (ci->ci_feature_sefflags_edx & SEFF0EDX_L1DF)
+			ci->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr = 1;
+		else
+			ci->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr = 0;
+
+		/*
+		 * Certain CPUs may have the vulnerability remedied in
+		 * hardware, check for that and override the setting
+		 * calculated above.
+		 */
+		if (ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP) {
+			msr = rdmsr(MSR_ARCH_CAPABILITIES);
+			if (msr & ARCH_CAPABILITIES_SKIP_L1DFL_VMENTRY)
+				ci->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr =
+				    VMX_SKIP_L1D_FLUSH;
+		}
+	}
 }
 #endif /* NVMM > 0 */
diff --git a/sys/arch/amd64/amd64/vmm.c b/sys/arch/amd64/amd64/vmm.c
index 4abe6d584ae..34c5651a021 100644
--- a/sys/arch/amd64/amd64/vmm.c
+++ b/sys/arch/amd64/amd64/vmm.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmm.c,v 1.218 2018/07/27 21:11:31 kettenis Exp $ */
+/* $OpenBSD: vmm.c,v 1.219 2018/08/21 19:04:38 deraadt Exp $ */
 /*
  * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
  *
@@ -42,6 +42,8 @@
 
 /* #define VMM_DEBUG */
 
+void *l1tf_flush_region;
+
 #ifdef VMM_DEBUG
 #define DPRINTF(x...)	do { printf(x); } while(0)
 #else
@@ -372,22 +374,38 @@ vmm_attach(struct device *parent, struct device *self, void *aux)
 	rw_init(&sc->vm_lock, "vmlistlock");
 
 	if (sc->nr_ept_cpus) {
-		printf(": VMX/EPT\n");
+		printf(": VMX/EPT");
 		sc->mode = VMM_MODE_EPT;
 	} else if (sc->nr_vmx_cpus) {
-		printf(": VMX\n");
+		printf(": VMX");
 		sc->mode = VMM_MODE_VMX;
 	} else if (sc->nr_rvi_cpus) {
-		printf(": SVM/RVI\n");
+		printf(": SVM/RVI");
 		sc->mode = VMM_MODE_RVI;
 	} else if (sc->nr_svm_cpus) {
-		printf(": SVM\n");
+		printf(": SVM");
 		sc->mode = VMM_MODE_SVM;
 	} else {
-		printf(": unknown\n");
+		printf(": unknown");
 		sc->mode = VMM_MODE_UNKNOWN;
 	}
 
+	if (sc->mode == VMM_MODE_EPT || sc->mode == VMM_MODE_VMX) {
+		if (!(curcpu()->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr)) {
+			l1tf_flush_region = km_alloc(VMX_L1D_FLUSH_SIZE,
+			    &kv_any, &vmm_kp_contig, &kd_waitok);
+			if (!l1tf_flush_region) {
+				printf(" (failing, no memory)");
+				sc->mode = VMM_MODE_UNKNOWN;
+			} else {
+				printf(" (using slow L1TF mitigation)");
+				memset(l1tf_flush_region, 0xcc,
+				    VMX_L1D_FLUSH_SIZE);
+			}
+		}
+	}
+	printf("\n");
+
 	if (sc->mode == VMM_MODE_SVM || sc->mode == VMM_MODE_RVI) {
 		sc->max_vpid = curcpu()->ci_vmm_cap.vcc_svm.svm_max_asid;
 	} else {
@@ -4108,7 +4126,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *vrp)
 
 		KERNEL_UNLOCK();
 		ret = vmx_enter_guest(&vcpu->vc_control_pa,
-		    &vcpu->vc_gueststate, resume);
+		    &vcpu->vc_gueststate, resume,
+		    curcpu()->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr);
 
 		/*
 		 * On exit, interrupts are disabled, and we are running with
diff --git a/sys/arch/amd64/amd64/vmm_support.S b/sys/arch/amd64/amd64/vmm_support.S
index 8053e841f06..872951bcc20 100644
--- a/sys/arch/amd64/amd64/vmm_support.S
+++ b/sys/arch/amd64/amd64/vmm_support.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmm_support.S,v 1.12 2018/07/24 02:42:25 guenther Exp $ */
+/* $OpenBSD: vmm_support.S,v 1.13 2018/08/21 19:04:38 deraadt Exp $ */
 /*
  * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
  *
@@ -16,6 +16,7 @@
  */
 
 #include "assym.h"
+#include <machine/param.h>
 #include <machine/asm.h>
 #include <machine/psl.h>
 #include <machine/specialreg.h>
@@ -163,6 +164,7 @@ _C_LABEL(invept):
 _C_LABEL(vmx_enter_guest):
 	RETGUARD_SETUP(vmx_enter_guest, r11)
 	movq	%rdx, %r8	/* resume flag */
+	movq	%rcx, %r9	/* L1DF MSR support */
 	testq	%r8, %r8
 	jnz	skip_init
 
@@ -249,6 +251,62 @@ skip_init:
 	movq	%rsp, %rax
 	vmwrite	%rax, %rdi	/* Host RSP */
 
+	/*
+	 * Intel L1TF vulnerability fix
+	 *
+	 * Certain Intel CPUs are broken and allow guest VMs to bypass
+	 * EPT entirely as their address harvesting logic treats guest
+	 * PTEs as host physical addresses. Flush L1 Dcache to prevent
+	 * information leakage by command MSR or manually reading a
+	 * bunch of junk in order to fill sizeof(L1 Dcache)*2.
+	 *
+	 * %r9 (inherited from parameter 4 in %rcx earlier)
+	 * determines the flushing requirements
+	 *  0 - use manual "junk read" flush
+	 *  1 - use MSR command
+	 *  2 (VMX_SKIP_L1D_FLUSH) - no flush required on this CPU
+	 */
+	cmpq	$VMX_SKIP_L1D_FLUSH, %r9
+	je	done_flush
+
+	testq	%r9, %r9
+	jz	no_l1df_msr
+
+	/* CPU has command MSR */
+	movq	$MSR_FLUSH_CMD, %rcx
+	xorq	%rdx, %rdx
+	movq	$FLUSH_CMD_L1D_FLUSH, %rax
+	wrmsr
+	jmp	done_flush
+
+no_l1df_msr:
+	xorq	%r9, %r9
+l1df_tlb_loop:
+	/* XXX get the right L1 size from cpuid */
+	cmpq	$VMX_L1D_FLUSH_SIZE, %r9
+	je	l1df_tlb_done
+	movb	l1tf_flush_region(%r9), %al
+	addq	$PAGE_SIZE, %r9
+	jmp	l1df_tlb_loop
+
+l1df_tlb_done:
+	/*
+	 * Serialize: ensure previous TLB loads don't pull PTDs
+	 * or other PA-containing data into the L1D.
+	 */
+	xorq	%rax, %rax
+	cpuid
+
+	xorq	%r9, %r9
+l1df_load_cache:
+	movb	l1tf_flush_region(%r9), %al
+	/* XXX get the right cacheline size from cpuid */
+	addq	$0x40, %r9
+	cmpq	$VMX_L1D_FLUSH_SIZE, %r9
+	jne	l1df_load_cache
+	lfence
+
+done_flush:
 	testq	%r8, %r8
 	jnz	do_resume
 
@@ -262,6 +320,10 @@ skip_init:
 	movq	0x50(%rsi), %r11
 	movq	0x48(%rsi), %r10
 	movq	0x40(%rsi), %r9
+	movq	%rsi, %r8
+	/* XXX get the right cacheline size from cpuid */
+	addq	$0x40, %r8
+	clflush	(%r8)
 	movq	0x38(%rsi), %r8
 	movq	0x30(%rsi), %rbp
 	movq	0x28(%rsi), %rdi
@@ -269,6 +331,7 @@ skip_init:
 	movq	0x18(%rsi), %rcx
 	movq	0x10(%rsi), %rbx
 	movq	0x08(%rsi), %rax
+	clflush	(%rsi)
 	movq	0x00(%rsi), %rsi
 
 	vmlaunch
@@ -284,6 +347,10 @@ do_resume:
 	movq	0x50(%rsi), %r11
 	movq	0x48(%rsi), %r10
 	movq	0x40(%rsi), %r9
+	movq	%rsi, %r8
+	/* XXX get the right cacheline size from cpuid */
+	addq	$0x40, %r8
+	clflush	(%r8)
 	movq	0x38(%rsi), %r8
 	movq	0x30(%rsi), %rbp
 	movq	0x28(%rsi), %rdi
@@ -291,7 +358,9 @@ do_resume:
 	movq	0x18(%rsi), %rcx
 	movq	0x10(%rsi), %rbx
 	movq	0x08(%rsi), %rax
+	clflush	(%rsi)
 	movq	0x00(%rsi), %rsi
+
 	vmresume
 fail_launch_or_resume:
 	RET_STACK_REFILL_WITH_RCX
diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h
index d345dd9ca64..7b4ea642b28 100644
--- a/sys/arch/amd64/include/cpu.h
+++ b/sys/arch/amd64/include/cpu.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.h,v 1.126 2018/07/11 20:07:55 guenther Exp $ */
+/* $OpenBSD: cpu.h,v 1.127 2018/08/21 19:04:40 deraadt Exp $ */
 /* $NetBSD: cpu.h,v 1.1 2003/04/26 18:39:39 fvdl Exp $ */
 
 /*-
@@ -71,6 +71,7 @@ struct vmx {
 	uint32_t	vmx_msr_table_size;
 	uint32_t	vmx_cr3_tgt_count;
 	uint64_t	vmx_vm_func;
+	uint8_t		vmx_has_l1_flush_msr;
 };
 
 /*
diff --git a/sys/arch/amd64/include/specialreg.h b/sys/arch/amd64/include/specialreg.h
index 5457379dc99..7cbea305382 100644
--- a/sys/arch/amd64/include/specialreg.h
+++ b/sys/arch/amd64/include/specialreg.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: specialreg.h,v 1.78 2018/08/15 02:07:35 jsg Exp $ */
+/* $OpenBSD: specialreg.h,v 1.79 2018/08/21 19:04:40 deraadt Exp $ */
 /* $NetBSD: specialreg.h,v 1.1 2003/04/26 18:39:48 fvdl Exp $ */
 /* $NetBSD: x86/specialreg.h,v 1.2 2003/04/25 21:54:30 fvdl Exp $ */
 
@@ -1235,6 +1235,9 @@
 #define IA32_VMX_MSR_LIST_SIZE_MASK	(7ULL << 25)
 #define IA32_VMX_CR3_TGT_SIZE_MASK	(0x1FFULL << 16)
 
+#define VMX_SKIP_L1D_FLUSH	2
+#define VMX_L1D_FLUSH_SIZE	(64 * 1024)
+
 /*
  * SVM
  */
diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h
index 70299c1e11c..fcbbb987cd6 100644
--- a/sys/arch/amd64/include/vmmvar.h
+++ b/sys/arch/amd64/include/vmmvar.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmmvar.h,v 1.57 2018/07/12 15:13:33 mlarkin Exp $ */
+/* $OpenBSD: vmmvar.h,v 1.58 2018/08/21 19:04:40 deraadt Exp $ */
 /*
  * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
  *
@@ -900,7 +900,7 @@ int vmwrite(uint64_t, uint64_t);
 int	vmread(uint64_t, uint64_t *);
 void	invvpid(uint64_t, struct vmx_invvpid_descriptor *);
 void	invept(uint64_t, struct vmx_invept_descriptor *);
-int	vmx_enter_guest(uint64_t *, struct vcpu_gueststate *, int);
+int	vmx_enter_guest(uint64_t *, struct vcpu_gueststate *, int, uint8_t);
 int	svm_enter_guest(uint64_t, struct vcpu_gueststate *,
 	    struct region_descriptor *);
 void	start_vmm_on_cpu(struct cpu_info *);
-- 
2.20.1
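
Editor's note: for readers who want to study the manual flush path in isolation, below is
a minimal userland C sketch of the same two-pass scheme vmx_enter_guest uses when no flush
command MSR is available: touch one byte per page so the TLB entries covering the region
are loaded up front, serialize with cpuid, then read one byte per cacheline to displace the
whole L1D, finishing with lfence. The names FLUSH_REGION_SIZE, PAGE_SZ, CACHELINE and
l1d_fill_flush are illustrative only, and a malloc'd buffer merely stands in for the
kernel's physically contiguous l1tf_flush_region; run in userland this demonstrates the
access pattern, not the mitigation itself.

/*
 * Userland sketch of the two-pass "junk read" L1D flush.
 * Names here are illustrative, not kernel API.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define FLUSH_REGION_SIZE	(64 * 1024)	/* VMX_L1D_FLUSH_SIZE: twice a 32K L1D */
#define PAGE_SZ			4096
#define CACHELINE		64

static volatile uint8_t sink;	/* keep the loads from being optimized away */

static void
l1d_fill_flush(const uint8_t *region)
{
	size_t off;
	uint32_t a = 0, b, c, d;

	/*
	 * Pass 1: touch one byte per page so the TLBs covering the
	 * region are loaded first; the page walks themselves can pull
	 * PA-bearing data into the L1D, which pass 2 then displaces.
	 */
	for (off = 0; off < FLUSH_REGION_SIZE; off += PAGE_SZ)
		sink = region[off];

	/* Serialize between the two passes, as the assembly does with cpuid. */
	__asm__ volatile("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d) :: "memory");

	/*
	 * Pass 2: read one byte per cacheline across the whole region,
	 * displacing the previous contents of the L1D.
	 */
	for (off = 0; off < FLUSH_REGION_SIZE; off += CACHELINE)
		sink = region[off];

	__asm__ volatile("lfence" ::: "memory");
}

int
main(void)
{
	uint8_t *region;

	/* Stand-in for the kernel's physically contiguous flush region. */
	if ((region = malloc(FLUSH_REGION_SIZE)) == NULL)
		return (1);
	memset(region, 0xcc, FLUSH_REGION_SIZE);	/* trapsled fill, as in vmm_attach */
	l1d_fill_flush(region);
	free(region);
	return (0);
}

The 64K region size mirrors VMX_L1D_FLUSH_SIZE from the patch, i.e. twice a 32K L1D,
matching the "fill sizeof(L1 Dcache)*2" note in the assembly comment.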