From c844c4ad37ba222ab37c64cfbe4c5eb1ec73c844 Mon Sep 17 00:00:00 2001
From: deraadt <deraadt@openbsd.org>
Date: Tue, 21 Aug 2018 19:04:38 +0000
Subject: [PATCH] Perform mitigations for Intel L1TF screwup.

There are three options: (1) Future cpus which don't have the bug,
(2) cpu's with microcode containing a L1D flush operation, (3) stuffing
the L1D cache with fresh data and expiring old content.  This stuffing
loop is complicated and interesting, no details on the mitigation have
been released by Intel so Mike and I studied other systems for
inspiration.  Replacement algorithm for the L1D is described in the
tlbleed paper.  We use a 64K PA-linear region filled with trapsleds
(in case there is L1D->L1I data movement).  The TLBs covering the
region are loaded first, because TLB loading apparently flows through
the D cache.  Before performing vmlaunch or vmresume, the cachelines
covering the guest registers are also flushed.

with mlarkin, additional testing by pd, handy comments from the
kettenis and guenther peanuts
---
 sys/arch/amd64/amd64/identcpu.c     | 24 +++++++++-
 sys/arch/amd64/amd64/vmm.c          | 33 +++++++++++---
 sys/arch/amd64/amd64/vmm_support.S  | 71 ++++++++++++++++++++++++++++-
 sys/arch/amd64/include/cpu.h        |  3 +-
 sys/arch/amd64/include/specialreg.h |  5 +-
 sys/arch/amd64/include/vmmvar.h     |  4 +-
 6 files changed, 127 insertions(+), 13 deletions(-)

diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c
index cd433d578ba..184379b8a51 100644
--- a/sys/arch/amd64/amd64/identcpu.c
+++ b/sys/arch/amd64/amd64/identcpu.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: identcpu.c,v 1.106 2018/08/15 02:07:35 jsg Exp $ */
+/* $OpenBSD: identcpu.c,v 1.107 2018/08/21 19:04:38 deraadt Exp $ */
 /* $NetBSD: identcpu.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */
 
 /*
@@ -1011,5 +1011,27 @@ cpu_check_vmm_cap(struct cpu_info *ci)
 		if (cap & AMD_SVM_NESTED_PAGING_CAP)
 			ci->ci_vmm_flags |= CI_VMM_RVI;
 	}
+
+	/*
+	 * Check "L1 flush on VM entry" (Intel L1TF vuln) semantics
+	 */
+	if (!strcmp(cpu_vendor, "GenuineIntel")) {
+		if (ci->ci_feature_sefflags_edx & SEFF0EDX_L1DF)
+			ci->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr = 1;
+		else
+			ci->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr = 0;
+
+		/*
+		 * Certain CPUs may have the vulnerability remedied in
+		 * hardware, check for that and override the setting
+		 * calculated above.
+		 */
+		if (ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP) {
+			msr = rdmsr(MSR_ARCH_CAPABILITIES);
+			if (msr & ARCH_CAPABILITIES_SKIP_L1DFL_VMENTRY)
+				ci->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr =
+				    VMX_SKIP_L1D_FLUSH;
+		}
+	}
 }
 #endif /* NVMM > 0 */
diff --git a/sys/arch/amd64/amd64/vmm.c b/sys/arch/amd64/amd64/vmm.c
index 4abe6d584ae..34c5651a021 100644
--- a/sys/arch/amd64/amd64/vmm.c
+++ b/sys/arch/amd64/amd64/vmm.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmm.c,v 1.218 2018/07/27 21:11:31 kettenis Exp $ */
+/* $OpenBSD: vmm.c,v 1.219 2018/08/21 19:04:38 deraadt Exp $ */
 /*
  * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
  *
@@ -42,6 +42,8 @@
 
 /* #define VMM_DEBUG */
 
+void *l1tf_flush_region;
+
 #ifdef VMM_DEBUG
 #define DPRINTF(x...)	do { printf(x); } while(0)
 #else
@@ -372,22 +374,38 @@ vmm_attach(struct device *parent, struct device *self, void *aux)
 	rw_init(&sc->vm_lock, "vmlistlock");
 
 	if (sc->nr_ept_cpus) {
-		printf(": VMX/EPT\n");
+		printf(": VMX/EPT");
 		sc->mode = VMM_MODE_EPT;
 	} else if (sc->nr_vmx_cpus) {
-		printf(": VMX\n");
+		printf(": VMX");
 		sc->mode = VMM_MODE_VMX;
 	} else if (sc->nr_rvi_cpus) {
-		printf(": SVM/RVI\n");
+		printf(": SVM/RVI");
 		sc->mode = VMM_MODE_RVI;
 	} else if (sc->nr_svm_cpus) {
-		printf(": SVM\n");
+		printf(": SVM");
 		sc->mode = VMM_MODE_SVM;
 	} else {
-		printf(": unknown\n");
+		printf(": unknown");
 		sc->mode = VMM_MODE_UNKNOWN;
 	}
 
+	if (sc->mode == VMM_MODE_EPT || sc->mode == VMM_MODE_VMX) {
+		if (!(curcpu()->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr)) {
+			l1tf_flush_region = km_alloc(VMX_L1D_FLUSH_SIZE,
+			    &kv_any, &vmm_kp_contig, &kd_waitok);
+			if (!l1tf_flush_region) {
+				printf(" (failing, no memory)");
+				sc->mode = VMM_MODE_UNKNOWN;
+			} else {
+				printf(" (using slow L1TF mitigation)");
+				memset(l1tf_flush_region, 0xcc,
+				    VMX_L1D_FLUSH_SIZE);
+			}
+		}
+	}
+	printf("\n");
+
 	if (sc->mode == VMM_MODE_SVM || sc->mode == VMM_MODE_RVI) {
 		sc->max_vpid = curcpu()->ci_vmm_cap.vcc_svm.svm_max_asid;
 	} else {
@@ -4108,7 +4126,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *vrp)
 
 		KERNEL_UNLOCK();
 		ret = vmx_enter_guest(&vcpu->vc_control_pa,
-		    &vcpu->vc_gueststate, resume);
+		    &vcpu->vc_gueststate, resume,
+		    curcpu()->ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr);
 
 		/*
 		 * On exit, interrupts are disabled, and we are running with
diff --git a/sys/arch/amd64/amd64/vmm_support.S b/sys/arch/amd64/amd64/vmm_support.S
index 8053e841f06..872951bcc20 100644
--- a/sys/arch/amd64/amd64/vmm_support.S
+++ b/sys/arch/amd64/amd64/vmm_support.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmm_support.S,v 1.12 2018/07/24 02:42:25 guenther Exp $ */
+/* $OpenBSD: vmm_support.S,v 1.13 2018/08/21 19:04:38 deraadt Exp $ */
 /*
  * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
  *
@@ -16,6 +16,7 @@
  */
 
 #include "assym.h"
+#include <machine/param.h>
 #include <machine/asm.h>
 #include <machine/psl.h>
 #include <machine/specialreg.h>
@@ -163,6 +164,7 @@ _C_LABEL(invept):
 _C_LABEL(vmx_enter_guest):
 	RETGUARD_SETUP(vmx_enter_guest, r11)
 	movq	%rdx, %r8	/* resume flag */
+	movq	%rcx, %r9	/* L1DF MSR support */
 	testq	%r8, %r8
 	jnz	skip_init
 
@@ -249,6 +251,62 @@ skip_init:
 	movq	%rsp, %rax
 	vmwrite	%rax, %rdi	/* Host RSP */
 
+	/*
+	 * Intel L1TF vulnerability fix
+	 *
+	 * Certain Intel CPUs are broken and allow guest VMs to bypass
+	 * EPT entirely as their address harvesting logic treats guest
+	 * PTEs as host physical addresses. Flush L1 Dcache to prevent
+	 * information leakage by command MSR or manually reading a
+	 * bunch of junk in order to fill sizeof(L1 Dcache)*2.
+	 *
+	 * %r9 (inherited from parameter 4 in %rcx earlier)
+	 * determines the flushing requirements
+	 *  0 - use manual "junk read" flush
+	 *  1 - use MSR command
+	 *  2 (VMX_SKIP_L1D_FLUSH) - no flush required on this CPU
+	 */
+	cmpq	$VMX_SKIP_L1D_FLUSH, %r9
+	je	done_flush
+
+	testq	%r9, %r9
+	jz	no_l1df_msr
+
+	/* CPU has command MSR */
+	movq	$MSR_FLUSH_CMD, %rcx
+	xorq	%rdx, %rdx
+	movq	$FLUSH_CMD_L1D_FLUSH, %rax
+	wrmsr
+	jmp	done_flush
+
+no_l1df_msr:
+	xorq	%r9, %r9
+l1df_tlb_loop:
+	/* XXX get the right L1 size from cpuid */
+	cmpq	$VMX_L1D_FLUSH_SIZE, %r9
+	je	l1df_tlb_done
+	movb	l1tf_flush_region(%r9), %al
+	addq	$PAGE_SIZE, %r9
+	jmp	l1df_tlb_loop
+
+l1df_tlb_done:
+	/*
+	 * Serialize: ensure previous TLB loads don't pull PTDs
+	 * or other PA-containing data into the L1D.
+	 */
+	xorq	%rax, %rax
+	cpuid
+
+	xorq	%r9, %r9
+l1df_load_cache:
+	movb	l1tf_flush_region(%r9), %al
+	/* XXX get the right cacheline size from cpuid */
+	addq	$0x40, %r9
+	cmpq	$VMX_L1D_FLUSH_SIZE, %r9
+	jne	l1df_load_cache
+	lfence
+
+done_flush:
 	testq	%r8, %r8
 	jnz	do_resume
 
@@ -262,6 +320,10 @@ skip_init:
 	movq	0x50(%rsi), %r11
 	movq	0x48(%rsi), %r10
 	movq	0x40(%rsi), %r9
+	movq	%rsi, %r8
+	/* XXX get the right cacheline size from cpuid */
+	addq	$0x40, %r8
+	clflush	(%r8)
 	movq	0x38(%rsi), %r8
 	movq	0x30(%rsi), %rbp
 	movq	0x28(%rsi), %rdi
@@ -269,6 +331,7 @@ skip_init:
 	movq	0x18(%rsi), %rcx
 	movq	0x10(%rsi), %rbx
 	movq	0x08(%rsi), %rax
+	clflush	(%rsi)
 	movq	0x00(%rsi), %rsi
 
 	vmlaunch
@@ -284,6 +347,10 @@ do_resume:
 	movq	0x50(%rsi), %r11
 	movq	0x48(%rsi), %r10
 	movq	0x40(%rsi), %r9
+	movq	%rsi, %r8
+	/* XXX get the right cacheline size from cpuid */
+	addq	$0x40, %r8
+	clflush	(%r8)
 	movq	0x38(%rsi), %r8
 	movq	0x30(%rsi), %rbp
 	movq	0x28(%rsi), %rdi
@@ -291,7 +358,9 @@ do_resume:
 	movq	0x18(%rsi), %rcx
 	movq	0x10(%rsi), %rbx
 	movq	0x08(%rsi), %rax
+	clflush	(%rsi)
 	movq	0x00(%rsi), %rsi
+
 	vmresume
 fail_launch_or_resume:
 	RET_STACK_REFILL_WITH_RCX
diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h
index d345dd9ca64..7b4ea642b28 100644
--- a/sys/arch/amd64/include/cpu.h
+++ b/sys/arch/amd64/include/cpu.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.h,v 1.126 2018/07/11 20:07:55 guenther Exp $ */
+/* $OpenBSD: cpu.h,v 1.127 2018/08/21 19:04:40 deraadt Exp $ */
 /* $NetBSD: cpu.h,v 1.1 2003/04/26 18:39:39 fvdl Exp $ */
 
 /*-
@@ -71,6 +71,7 @@ struct vmx {
 	uint32_t	vmx_msr_table_size;
 	uint32_t	vmx_cr3_tgt_count;
 	uint64_t	vmx_vm_func;
+	uint8_t		vmx_has_l1_flush_msr;
 };
 
 /*
diff --git a/sys/arch/amd64/include/specialreg.h b/sys/arch/amd64/include/specialreg.h
index 5457379dc99..7cbea305382 100644
--- a/sys/arch/amd64/include/specialreg.h
+++ b/sys/arch/amd64/include/specialreg.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: specialreg.h,v 1.78 2018/08/15 02:07:35 jsg Exp $ */
+/* $OpenBSD: specialreg.h,v 1.79 2018/08/21 19:04:40 deraadt Exp $ */
 /* $NetBSD: specialreg.h,v 1.1 2003/04/26 18:39:48 fvdl Exp $ */
 /* $NetBSD: x86/specialreg.h,v 1.2 2003/04/25 21:54:30 fvdl Exp $ */
 
@@ -1235,6 +1235,9 @@
 #define IA32_VMX_MSR_LIST_SIZE_MASK	(7ULL << 25)
 #define IA32_VMX_CR3_TGT_SIZE_MASK	(0x1FFULL << 16)
 
+#define VMX_SKIP_L1D_FLUSH	2
+#define VMX_L1D_FLUSH_SIZE	(64 * 1024)
+
 /*
  * SVM
  */
diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h
index 70299c1e11c..fcbbb987cd6 100644
--- a/sys/arch/amd64/include/vmmvar.h
+++ b/sys/arch/amd64/include/vmmvar.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmmvar.h,v 1.57 2018/07/12 15:13:33 mlarkin Exp $ */
+/* $OpenBSD: vmmvar.h,v 1.58 2018/08/21 19:04:40 deraadt Exp $ */
 /*
  * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
  *
@@ -900,7 +900,7 @@ int vmwrite(uint64_t, uint64_t);
 int	vmread(uint64_t, uint64_t *);
 void	invvpid(uint64_t, struct vmx_invvpid_descriptor *);
 void	invept(uint64_t, struct vmx_invept_descriptor *);
-int	vmx_enter_guest(uint64_t *, struct vcpu_gueststate *, int);
+int	vmx_enter_guest(uint64_t *, struct vcpu_gueststate *, int, uint8_t);
 int	svm_enter_guest(uint64_t, struct vcpu_gueststate *,
 	    struct region_descriptor *);
 void	start_vmm_on_cpu(struct cpu_info *);
-- 
2.20.1
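
Editor's note: for readers who want to study the manual flush path in isolation, below is
a minimal userland C sketch of the same two-pass scheme vmx_enter_guest uses when no flush
command MSR is available: touch one byte per page so the TLB entries covering the region
are loaded up front, serialize with cpuid, then read one byte per cacheline to displace the
whole L1D, finishing with lfence. The names FLUSH_REGION_SIZE, PAGE_SZ, CACHELINE and
l1d_fill_flush are illustrative only, and a malloc'd buffer merely stands in for the
kernel's physically contiguous l1tf_flush_region; run in userland this demonstrates the
access pattern, not the mitigation itself.

/*
 * Userland sketch of the two-pass "junk read" L1D flush.
 * Names here are illustrative, not kernel API.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define FLUSH_REGION_SIZE	(64 * 1024)	/* VMX_L1D_FLUSH_SIZE: twice a 32K L1D */
#define PAGE_SZ			4096
#define CACHELINE		64

static volatile uint8_t sink;	/* keep the loads from being optimized away */

static void
l1d_fill_flush(const uint8_t *region)
{
	size_t off;
	uint32_t a = 0, b, c, d;

	/*
	 * Pass 1: touch one byte per page so the TLBs covering the
	 * region are loaded first; the page walks themselves can pull
	 * PA-bearing data into the L1D, which pass 2 then displaces.
	 */
	for (off = 0; off < FLUSH_REGION_SIZE; off += PAGE_SZ)
		sink = region[off];

	/* Serialize between the two passes, as the assembly does with cpuid. */
	__asm__ volatile("cpuid" : "+a"(a), "=b"(b), "=c"(c), "=d"(d) :: "memory");

	/*
	 * Pass 2: read one byte per cacheline across the whole region,
	 * displacing the previous contents of the L1D.
	 */
	for (off = 0; off < FLUSH_REGION_SIZE; off += CACHELINE)
		sink = region[off];

	__asm__ volatile("lfence" ::: "memory");
}

int
main(void)
{
	uint8_t *region;

	/* Stand-in for the kernel's physically contiguous flush region. */
	if ((region = malloc(FLUSH_REGION_SIZE)) == NULL)
		return (1);
	memset(region, 0xcc, FLUSH_REGION_SIZE);	/* trapsled fill, as in vmm_attach */
	l1d_fill_flush(region);
	free(region);
	return (0);
}

The 64K region size mirrors VMX_L1D_FLUSH_SIZE from the patch, i.e. twice a 32K L1D,
matching the "fill sizeof(L1 Dcache)*2" note in the assembly comment.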