untrusted comment: signature from openbsd 6.2 base secret key
RWRVWzAMgtyg7rvxuplc/tv0WeP9WjrnZZl9FiKkuS5ma3iKXBpQdO3Ekovh3YHSivUuV1G6ugrfiGTKfWXrQdNdq1wHHFs4qgc=
OpenBSD 6.2 errata 009, March 1st, 2018:
Intel CPUs contain a speculative execution flaw called Meltdown which
allows userspace programs to read kernel memory. A complex workaround
(running userspace on separate page tables that lack almost all kernel
mappings) solves the problem.
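
For background, the illustrative userland sketch below (not part of the
signed patch; the kernel address is an arbitrary sample) demonstrates the
permission check that Meltdown bypasses speculatively: an architectural
load from a supervisor-only address simply faults.

    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>

    static sigjmp_buf env;

    static void
    segv_handler(int sig)
    {
            siglongjmp(env, 1);
    }

    int
    main(void)
    {
            /* hypothetical kernel VA; any supervisor-only page behaves alike */
            volatile char *kva = (volatile char *)0xffff800000000000UL;

            signal(SIGSEGV, segv_handler);
            if (sigsetjmp(env, 1) == 0)
                    printf("read 0x%x\n", *kva);    /* faults architecturally */
            else
                    printf("SIGSEGV: page permissions enforced\n");
            /*
             * Meltdown: the same load, when only executed speculatively,
             * can leave the loaded byte observable via cache timing.
             */
            return 0;
    }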
Apply by doing:
    signify -Vep /etc/signify/openbsd-62-base.pub -x 009_meltdown.patch.sig \
        -m - | (cd /usr/src && patch -p0)

And then rebuild and install a new kernel:
    KK=`sysctl -n kern.osversion | cut -d# -f1`
    cd /usr/src/sys/arch/`machine`/compile/$KK
    make obj
    make config
    make
    make install
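
Once the new kernel is running, affected Intel CPUs report the
workaround by appending ",MELTDOWN" to their feature line in dmesg
(see the identcpu.c change below), which can be checked with e.g.:

    dmesg | grep MELTDOWN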
Index: sys/arch/amd64/amd64/cpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.107
diff -u -p -r1.107 cpu.c
--- sys/arch/amd64/amd64/cpu.c 25 Aug 2017 19:28:48 -0000 1.107
+++ sys/arch/amd64/amd64/cpu.c 21 Feb 2018 21:41:06 -0000
@@ -81,7 +81,7 @@
#include <uvm/uvm_extern.h>
#include <machine/codepatch.h>
-#include <machine/cpu.h>
+#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/pmap.h>
@@ -117,6 +117,14 @@
#include <machine/hibernate.h>
#endif /* HIBERNATE */
+/* #define CPU_DEBUG */
+
+#ifdef CPU_DEBUG
+#define DPRINTF(x...) do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* CPU_DEBUG */
+
int cpu_match(struct device *, void *, void *);
void cpu_attach(struct device *, struct device *, void *);
int cpu_activate(struct device *, int);
@@ -173,7 +181,7 @@ struct cfdriver cpu_cd = {
* CPU, on uniprocessors). The CPU info list is initialized to
* point at it.
*/
-struct cpu_info cpu_info_primary = { 0, &cpu_info_primary };
+struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } };
struct cpu_info *cpu_info_list = &cpu_info_primary;
@@ -339,8 +347,15 @@ cpu_attach(struct device *parent, struct
* structure, otherwise use the primary's.
*/
if (caa->cpu_role == CPU_ROLE_AP) {
- ci = malloc(sizeof(*ci), M_DEVBUF, M_WAITOK|M_ZERO);
+ struct cpu_info_full *cif;
+
+ cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok);
+ ci = &cif->cif_cpu;
#if defined(MULTIPROCESSOR)
+ ci->ci_tss = &cif->cif_tss;
+ ci->ci_gdt = (void *)(ci->ci_tss + 1);
+ memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
+ cpu_enter_pages(cif);
if (cpu_info[cpunum] != NULL)
panic("cpu at apic id %d already attached?", cpunum);
cpu_info[cpunum] = ci;
@@ -446,7 +461,6 @@ cpu_attach(struct device *parent, struct
#if defined(MULTIPROCESSOR)
cpu_intr_init(ci);
- gdt_alloc_cpu(ci);
sched_init_cpu(ci);
cpu_start_secondary(ci);
ncpus++;
@@ -931,4 +945,63 @@ cpu_activate(struct device *self, int ac
}
return (0);
+}
+
+/*
+ * cpu_enter_pages
+ *
+ * Requests mapping of various special pages required in the Intel Meltdown
+ * case (to be entered into the U-K page table):
+ *
+ * 1 tss+gdt page for each CPU
+ * 1 trampoline stack page for each CPU
+ *
+ * The cpu_info_full struct for each CPU straddles these pages. The offset into
+ * 'cif' is calculated below, for each page. For more information, consult
+ * the definition of struct cpu_info_full in cpu_full.h
+ *
+ * On CPUs unaffected by Meltdown, this function still configures 'cif' but
+ * the calls to pmap_enter_special become no-ops.
+ *
+ * Parameters:
+ * cif : the cpu_info_full structure describing a CPU whose pages are to be
+ * entered into the special meltdown U-K page table.
+ */
+void
+cpu_enter_pages(struct cpu_info_full *cif)
+{
+ vaddr_t va;
+ paddr_t pa;
+
+ /* The TSS+GDT need to be readable */
+ va = (vaddr_t)cif;
+ pmap_extract(pmap_kernel(), va, &pa);
+ pmap_enter_special(va, pa, PROT_READ);
+ DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__,
+ (uint64_t)va, (uint64_t)pa);
+
+ /* The trampoline stack page needs to be read/write */
+ va = (vaddr_t)&cif->cif_tramp_stack;
+ pmap_extract(pmap_kernel(), va, &pa);
+ pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
+ DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__,
+ (uint64_t)va, (uint64_t)pa);
+
+ cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16;
+ DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n" ,__func__,
+ (uint64_t)cif->cif_tss.tss_rsp0);
+ cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 -
+ sizeof(struct iretq_frame);
+
+#define SETUP_IST_SPECIAL_STACK(ist, cif, member) do { \
+ (cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member + \
+ sizeof((cif)->member) - 16; \
+ (cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \
+} while (0)
+
+ SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack);
+ SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack);
+
+ /* an empty iomap, by setting its offset to the TSS limit */
+ cif->cif_tss.tss_iobase = sizeof(cif->cif_tss);
}
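
As a cross-check of the rsp0/intr_rsp arithmetic above, here is a
standalone sketch (illustrative only, mock types; struct iretq_frame's
five slots are defined for real in frame.h below): the CPU-pushed iretq
frame always ends 16 bytes below the top of the trampoline stack page.

    #include <stdint.h>
    #include <stdio.h>

    struct mock_iretq_frame {               /* stands in for struct iretq_frame */
            int64_t rip, cs, rflags, rsp, ss;
    };

    int
    main(void)
    {
            uint64_t stack[512];            /* stands in for cif_tramp_stack */
            uint64_t va = (uintptr_t)stack;
            uint64_t rsp0 = va + sizeof(stack) - 16;        /* tss_rsp0 */
            uint64_t intr_rsp = rsp0 - sizeof(struct mock_iretq_frame);

            printf("top=0x%llx tss_rsp0=0x%llx ci_intr_rsp=0x%llx\n",
                (unsigned long long)(va + sizeof(stack)),
                (unsigned long long)rsp0, (unsigned long long)intr_rsp);
            return 0;
    }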
Index: sys/arch/amd64/amd64/gdt.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/gdt.c,v
retrieving revision 1.24
diff -u -p -r1.24 gdt.c
--- sys/arch/amd64/amd64/gdt.c 24 May 2015 01:01:49 -0000 1.24
+++ sys/arch/amd64/amd64/gdt.c 21 Feb 2018 21:33:03 -0000
@@ -45,33 +45,6 @@
#include <machine/pcb.h>
/*
- * Allocate shadow GDT for a slave cpu.
- */
-void
-gdt_alloc_cpu(struct cpu_info *ci)
-{
- struct vm_page *pg;
- vaddr_t va;
-
- ci->ci_gdt = (char *)uvm_km_valloc(kernel_map,
- GDT_SIZE + sizeof(*ci->ci_tss));
- ci->ci_tss = (void *)(ci->ci_gdt + GDT_SIZE);
- uvm_map_pageable(kernel_map, (vaddr_t)ci->ci_gdt,
- (vaddr_t)ci->ci_gdt + GDT_SIZE, FALSE, FALSE);
- for (va = (vaddr_t)ci->ci_gdt;
- va < (vaddr_t)ci->ci_gdt + GDT_SIZE + sizeof(*ci->ci_tss);
- va += PAGE_SIZE) {
- pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
- if (pg == NULL)
- panic("gdt_init: no pages");
- pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE);
- }
- memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
- bzero(ci->ci_tss, sizeof(*ci->ci_tss));
-}
-
-
-/*
* Load appropriate gdt descriptor; we better be running on *ci
* (for the most part, this is how a cpu knows who it is).
*/
Index: sys/arch/amd64/amd64/genassym.cf
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/genassym.cf,v
retrieving revision 1.31
diff -u -p -r1.31 genassym.cf
--- sys/arch/amd64/amd64/genassym.cf 18 May 2015 19:59:27 -0000 1.31
+++ sys/arch/amd64/amd64/genassym.cf 21 Feb 2018 21:33:03 -0000
@@ -78,6 +78,15 @@ member tf_ss
define FRAMESIZE sizeof(struct trapframe)
+struct iretq_frame
+member IRETQ_CS iretq_cs
+member IRETQ_RIP iretq_rip
+member IRETQ_RFLAGS iretq_rflags
+member IRETQ_RSP iretq_rsp
+member IRETQ_SS iretq_ss
+
+define IRETQ_SIZE sizeof(struct iretq_frame)
+
struct pcb
member pcb_cr3
member pcb_rsp
@@ -91,6 +100,8 @@ member pcb_cr0
struct pmap
member pm_cpus
+member pm_pdirpa
+member pm_pdirpa_intel
struct x86_64_tss
member tss_rsp0
@@ -115,6 +126,10 @@ endif
member CPU_INFO_GDT ci_gdt
member CPU_INFO_TSS ci_tss
member CPU_INFO_FLAGS ci_flags
+member CPU_INFO_KERN_CR3 ci_kern_cr3
+member CPU_INFO_USER_CR3 ci_user_cr3
+member CPU_INFO_KERN_RSP ci_kern_rsp
+member CPU_INFO_INTR_RSP ci_intr_rsp
export CPUF_USERSEGS_BIT
Index: sys/arch/amd64/amd64/identcpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v
retrieving revision 1.87
diff -u -p -r1.87 identcpu.c
--- sys/arch/amd64/amd64/identcpu.c 20 Jun 2017 05:34:41 -0000 1.87
+++ sys/arch/amd64/amd64/identcpu.c 21 Feb 2018 21:33:03 -0000
@@ -204,6 +204,10 @@ const struct {
{ SEFF0ECX_AVX512VBMI, "AVX512VBMI" },
{ SEFF0ECX_UMIP, "UMIP" },
{ SEFF0ECX_PKU, "PKU" },
+}, cpu_seff0_edxfeatures[] = {
+ { SEFF0EDX_IBRS, "IBRS,IBPB" },
+ { SEFF0EDX_STIBP, "STIBP" },
+ /* SEFF0EDX_ARCH_CAP (not printed) */
}, cpu_tpm_eaxfeatures[] = {
{ TPM_SENSOR, "SENSOR" },
{ TPM_ARAT, "ARAT" },
@@ -211,6 +215,8 @@ const struct {
{ CPUIDEAX_VERID, "PERF" },
}, cpu_cpuid_apmi_edx[] = {
{ CPUIDEDX_ITSC, "ITSC" },
+}, cpu_amdspec_ebxfeatures[] = {
+ { CPUIDEBX_IBPB, "IBPB" },
};
int
@@ -489,6 +495,7 @@ identifycpu(struct cpu_info *ci)
int i;
char *brandstr_from, *brandstr_to;
int skipspace;
+ extern uint32_t cpu_meltdown;
CPUID(1, ci->ci_signature, val, dummy, ci->ci_feature_flags);
CPUID(0x80000000, ci->ci_pnfeatset, dummy, dummy, dummy);
@@ -607,7 +614,7 @@ identifycpu(struct cpu_info *ci)
if (cpuid_level >= 0x07) {
/* "Structured Extended Feature Flags" */
CPUID_LEAF(0x7, 0, dummy, ci->ci_feature_sefflags_ebx,
- ci->ci_feature_sefflags_ecx, dummy);
+ ci->ci_feature_sefflags_ecx, ci->ci_feature_sefflags_edx);
for (i = 0; i < nitems(cpu_seff0_ebxfeatures); i++)
if (ci->ci_feature_sefflags_ebx &
cpu_seff0_ebxfeatures[i].bit)
@@ -616,6 +623,10 @@ identifycpu(struct cpu_info *ci)
if (ci->ci_feature_sefflags_ecx &
cpu_seff0_ecxfeatures[i].bit)
printf(",%s", cpu_seff0_ecxfeatures[i].str);
+ for (i = 0; i < nitems(cpu_seff0_edxfeatures); i++)
+ if (ci->ci_feature_sefflags_edx &
+ cpu_seff0_edxfeatures[i].bit)
+ printf(",%s", cpu_seff0_edxfeatures[i].str);
}
if (!strcmp(cpu_vendor, "GenuineIntel") && cpuid_level >= 0x06) {
@@ -628,6 +639,22 @@ identifycpu(struct cpu_info *ci)
if (ci->ci_family >= 0x12)
ci->ci_feature_tpmflags |= TPM_ARAT;
}
+
+ /* AMD speculation control features */
+ if (!strcmp(cpu_vendor, "AuthenticAMD")) {
+ if (ci->ci_pnfeatset >= 0x80000008) {
+ CPUID(0x80000008, dummy, ci->ci_feature_amdspec_ebx,
+ dummy, dummy);
+ for (i = 0; i < nitems(cpu_amdspec_ebxfeatures); i++)
+ if (ci->ci_feature_amdspec_ebx &
+ cpu_amdspec_ebxfeatures[i].bit)
+ printf(",%s",
+ cpu_amdspec_ebxfeatures[i].str);
+ }
+ }
+
+ if (cpu_meltdown)
+ printf(",MELTDOWN");
printf("\n");
Index: sys/arch/amd64/amd64/lapic.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.48
diff -u -p -r1.48 lapic.c
--- sys/arch/amd64/amd64/lapic.c 24 Jul 2017 15:31:14 -0000 1.48
+++ sys/arch/amd64/amd64/lapic.c 21 Feb 2018 21:41:06 -0000
@@ -62,6 +62,14 @@
#include <machine/i82093var.h>
#endif
+/* #define LAPIC_DEBUG */
+
+#ifdef LAPIC_DEBUG
+#define DPRINTF(x...) do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* LAPIC_DEBUG */
+
struct evcount clk_count;
#ifdef MULTIPROCESSOR
struct evcount ipi_count;
@@ -204,6 +212,7 @@ lapic_map(paddr_t lapic_base)
codepatch_call(CPTAG_EOI, &x2apic_eoi);
lapic_writereg(LAPIC_TPRI, s);
+ va = (vaddr_t)&local_apic;
} else {
/*
* Map local apic. If we have a local apic, it's safe to
@@ -222,6 +231,17 @@ lapic_map(paddr_t lapic_base)
lapic_tpr = s;
}
+
+ /*
+ * Enter the LAPIC MMIO page in the U-K page table for handling
+ * Meltdown (needed in the interrupt stub to acknowledge the
+ * incoming interrupt). On CPUs unaffected by Meltdown,
+ * pmap_enter_special is a no-op.
+ * XXX - need to map this PG_N
+ */
+ pmap_enter_special(va, lapic_base, PROT_READ | PROT_WRITE);
+ DPRINTF("%s: entered lapic page va 0x%llx pa 0x%llx\n", __func__,
+ (uint64_t)va, (uint64_t)lapic_base);
enable_intr();
}
Index: sys/arch/amd64/amd64/locore.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/locore.S,v
retrieving revision 1.89
diff -u -p -r1.89 locore.S
--- sys/arch/amd64/amd64/locore.S 4 Oct 2017 02:10:33 -0000 1.89
+++ sys/arch/amd64/amd64/locore.S 21 Feb 2018 21:33:03 -0000
@@ -113,6 +113,7 @@
#include <sys/syscall.h>
#include <machine/param.h>
+#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
@@ -176,6 +177,8 @@ _C_LABEL(lapic_isr):
.globl _C_LABEL(biosbasemem),_C_LABEL(biosextmem)
.globl _C_LABEL(bootapiver)
.globl _C_LABEL(pg_nx)
+ .globl _C_LABEL(pg_g_kern)
+ .globl _C_LABEL(cpu_meltdown)
_C_LABEL(cpu_id): .long 0 # saved from `cpuid' instruction
_C_LABEL(cpu_feature): .long 0 # feature flags from 'cpuid'
# instruction
@@ -208,6 +211,10 @@ _C_LABEL(biosextmem): .long 0 # extended
_C_LABEL(biosextmem): .long REALEXTMEM
#endif
_C_LABEL(pg_nx): .quad 0 # NX PTE bit (if CPU supports)
+_C_LABEL(pg_g_kern): .quad 0 # 0x100 if global pages should be used
+ # in kernel mappings, 0 otherwise (for
+ # insecure CPUs)
+_C_LABEL(cpu_meltdown): .long 0 # 1 if this CPU has Meltdown
#define _RELOC(x) ((x) - KERNBASE)
#define RELOC(x) _RELOC(_C_LABEL(x))
@@ -233,7 +240,7 @@ gdt64_end:
/*****************************************************************************/
/*
- * Signal trampoline; copied to top of user stack.
+ * Signal trampoline; copied to a page mapped into userspace.
* gdb's backtrace logic matches against the instructions in this.
*/
.section .rodata
@@ -370,11 +377,15 @@ switch_exited:
btrl $CPUF_USERSEGS_BIT, CPUVAR(FLAGS)
jnc restore_saved
- /* set %ds, %es, and %fs to expected value to prevent info leak */
+ /* set %ds, %es, %fs, and %gs to expected value to prevent info leak */
movw $(GSEL(GUDATA_SEL, SEL_UPL)),%ax
movw %ax,%ds
movw %ax,%es
movw %ax,%fs
+ cli /* block interrupts when on user GS.base */
+ swapgs /* switch from kernel to user GS.base */
+ movw %ax,%gs /* set %gs to UDATA and GS.base to 0 */
+ swapgs /* back to kernel GS.base */
restore_saved:
/*
@@ -394,20 +405,34 @@ restore_saved:
movq PCB_RSP(%r13),%rsp
movq PCB_RBP(%r13),%rbp
- movq CPUVAR(TSS),%rcx
- movq PCB_KSTACK(%r13),%rdx
- movq %rdx,TSS_RSP0(%rcx)
-
movq PCB_CR3(%r13),%rax
- movq %rax,%cr3
+ movq %rax,%cr3 /* %rax used below too */
/* Don't bother with the rest if switching to a system process. */
testl $P_SYSTEM,P_FLAG(%r12)
jnz switch_restored
+ /* record the bits needed for future U-->K transition */
+ movq PCB_KSTACK(%r13),%rdx
+ subq $FRAMESIZE,%rdx
+ movq %rdx,CPUVAR(KERN_RSP)
+ movq PCB_PMAP(%r13),%rcx
+
+ /*
+ * Meltdown: iff we're doing separate U+K and U-K page tables,
+ * then record them in cpu_info for easy access in syscall and
+ * interrupt trampolines. XXX code patch this
+ */
+
+ movq PM_PDIRPA_INTEL(%rcx),%rdx
+ testq %rdx,%rdx
+ jz 0f /* yay, no intel suckiness */
+ movq %rax,CPUVAR(KERN_CR3)
+ movq %rdx,CPUVAR(USER_CR3)
+0:
+
/* set the new pmap's bit for the cpu */
movl CPUVAR(CPUID),%edi
- movq PCB_PMAP(%r13),%rcx
lock
btsq %rdi,PM_CPUS(%rcx)
#ifdef DIAGNOSTIC
@@ -496,8 +521,7 @@ IDTVEC(syscall32)
sysret /* go away please */
/*
- * syscall insn entry. This currently isn't much faster, but
- * it can be made faster in the future.
+ * syscall insn entry.
*/
IDTVEC(syscall)
/*
@@ -507,13 +531,20 @@ IDTVEC(syscall)
* the user-space value.
* First order of business is to swap to the kernel gs.base so that
* we can access our struct cpu_info and use the scratch space there
- * to switch to our kernel stack. Once that's in place we can
+ * to switch to the kernel page tables (thank you, Intel), then
+ * switch to our kernel stack. Once that's in place we can
* unblock interrupts and save the rest of the syscall frame.
*/
swapgs
movq %r15,CPUVAR(SCRATCH)
- movq CPUVAR(CURPCB),%r15
- movq PCB_KSTACK(%r15),%r15
+ movq CPUVAR(KERN_CR3),%r15
+ testq %r15,%r15
+ jz Xsyscall_untramp
+ movq %r15,%cr3
+ jmp Xsyscall_untramp
+
+NENTRY(Xsyscall_untramp)
+ movq CPUVAR(KERN_RSP),%r15
xchgq %r15,%rsp
sti
@@ -524,12 +555,11 @@ IDTVEC(syscall)
* ss:rsp, etc, so that all GP registers can be
* saved. Then, fill in the rest.
*/
- pushq $(GSEL(GUDATA_SEL, SEL_UPL))
- pushq %r15
- subq $(TF_RSP-TF_TRAPNO),%rsp
+ movq $(GSEL(GUDATA_SEL, SEL_UPL)),TF_SS(%rsp)
+ movq %r15,TF_RSP(%rsp)
movq CPUVAR(SCRATCH),%r15
- subq $32,%rsp
- INTR_SAVE_GPRS
+ INTR_SAVE_MOST_GPRS_NO_ADJ
+ movq %rcx,TF_RCX(%rsp)
movq %r11, TF_RFLAGS(%rsp) /* old rflags from syscall insn */
movq $(GSEL(GUCODE_SEL, SEL_UPL)), TF_CS(%rsp)
movq %rcx,TF_RIP(%rsp)
@@ -574,16 +604,45 @@ IDTVEC(syscall)
movq TF_RBP(%rsp),%rbp
movq TF_RBX(%rsp),%rbx
- INTR_RESTORE_SELECTORS
+ /* Restore FS.base if it's not already in the CPU */
+ btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS)
+ jc 99f
+ movq CPUVAR(CURPCB),%rdx
+ movq PCB_FSBASE(%rdx),%rax
+ movq %rax,%rdx
+ shrq $32,%rdx
+ movl $MSR_FSBASE,%ecx
+ wrmsr
+99:
+ /*
+ * We need to finish reading from the trapframe, then switch
+ * to the user page tables, swapgs, and return. We need
+ * to get the final value for the register that was used
+ * for the mov to %cr3 from somewhere accessible on the
+ * user page tables, so save it in CPUVAR(SCRATCH) across
+ * the switch.
+ */
movq TF_RDX(%rsp),%rdx
movq TF_RAX(%rsp),%rax
+ movq %rax,CPUVAR(SCRATCH)
+ movq CPUVAR(USER_CR3),%rax
movq TF_RIP(%rsp),%rcx
movq TF_RFLAGS(%rsp),%r11
movq TF_RSP(%rsp),%rsp
+ testq %rax,%rax
+ jz 1f
+ jmp syscall_trampback
+
+KUENTRY(syscall_trampback)
+ movq %rax,%cr3
+1: movq CPUVAR(SCRATCH),%rax
+ swapgs
sysretq
+ .text
+
#ifdef DIAGNOSTIC
.Lsyscall_spl_not_lowered:
movabsq $spl_lowered, %rdi
@@ -620,6 +679,12 @@ NENTRY(proc_trampoline)
* Return via iretq, for real interrupts and signal returns
*/
NENTRY(intr_fast_exit)
+#ifdef DIAGNOSTIC
+ pushfq
+ popq %rdx
+ testq $PSL_I,%rdx
+ jnz .Lintr_exit_not_blocked
+#endif /* DIAGNOSTIC */
movq TF_RDI(%rsp),%rdi
movq TF_RSI(%rsp),%rsi
movq TF_R8(%rsp),%r8
@@ -633,11 +698,68 @@ NENTRY(intr_fast_exit)
movq TF_RBX(%rsp),%rbx
testq $SEL_RPL,TF_CS(%rsp)
- je 5f
+ je intr_exit_recurse /* returning back to kernel? */
- INTR_RESTORE_SELECTORS
+ /* returning to userspace. XXX fix up iret frame here */
-5: movq TF_RDX(%rsp),%rdx
+ /* restore FS.base if it's not already in the CPU */
+ btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS)
+ jc 99f
+ movq CPUVAR(CURPCB),%rdx /* for below */
+ movq PCB_FSBASE(%rdx),%rax
+ movq %rax,%rdx
+ shrq $32,%rdx
+ movl $MSR_FSBASE,%ecx
+ wrmsr
+99:
+ /*
+ * Returning to userspace. We need to do things in this order:
+ * - update the iret frame from the trapframe
+ * - finish reading from the trapframe
+ * - switch to the trampoline stack
+ * - jump to the .kutext segment
+ * - switch to the user page tables
+ * - swapgs
+ * - iretq
+ * To get the final value for the register that was used
+ * for the mov to %cr3, we need access to somewhere accessible
+ * on the user page tables, so we save it in CPUVAR(SCRATCH)
+ * across the switch.
+ */
+ /* update iret frame */
+ movq CPUVAR(INTR_RSP),%rdx
+ movq $(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx)
+ movq TF_RIP(%rsp),%rax
+ movq %rax,IRETQ_RIP(%rdx)
+ movq TF_RFLAGS(%rsp),%rax
+ movq %rax,IRETQ_RFLAGS(%rdx)
+ movq TF_RSP(%rsp),%rax
+ movq %rax,IRETQ_RSP(%rdx)
+ movq $(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx)
+ /* finish with the trap frame */
+ movq TF_RAX(%rsp),%rax
+ movq %rax,CPUVAR(SCRATCH)
+ movq TF_RCX(%rsp),%rcx
+ movq TF_R11(%rsp),%r11
+ /* switch to the trampoline stack */
+ xchgq %rdx,%rsp
+ movq TF_RDX(%rdx),%rdx
+ movq CPUVAR(USER_CR3),%rax
+ testq %rax,%rax
+ jz 1f
+ jmp iretq_tramp
+
+KUENTRY(iretq_tramp)
+ movq %rax,%cr3
+1: movq CPUVAR(SCRATCH),%rax
+ swapgs
+
+ .globl _C_LABEL(doreti_iret)
+_C_LABEL(doreti_iret):
+ iretq
+
+NENTRY(intr_exit_recurse)
+ movq TF_RDX(%rsp),%rdx
movq TF_RCX(%rsp),%rcx
movq TF_R11(%rsp),%r11
movq TF_RAX(%rsp),%rax
@@ -655,9 +777,6 @@ NENTRY(intr_fast_exit)
#endif /* !defined(GPROF) && defined(DDBPROF) */
addq $TF_RIP,%rsp
-
- .globl _C_LABEL(doreti_iret)
-_C_LABEL(doreti_iret):
iretq
@@ -690,6 +809,33 @@ _C_LABEL(doreti_iret):
addq $TF_RIP,%rsp
iretq
#endif /* !defined(GPROF) && defined(DDBPROF) */
+ .text
+
+#ifdef DIAGNOSTIC
+.Lintr_exit_not_blocked:
+ xchgw %bx, %bx
+ movl warn_once(%rip),%edi
+ testl %edi,%edi
+ jnz 1f
+ incl %edi
+ movl %edi,warn_once(%rip)
+ leaq .Lnot_blocked(%rip),%rdi
+ call _C_LABEL(printf)
+#ifdef DDB
+ int $3
+#endif /* DDB */
+1: cli
+ jmp intr_fast_exit
+
+ .data
+.global warn_once
+warn_once:
+ .long 0
+ .section .rodata
+.Lnot_blocked:
+ .asciz "WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n"
+ .text
+#endif
ENTRY(xrstor_user)
movq %rsi, %rdx
Index: sys/arch/amd64/amd64/locore0.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/locore0.S,v
retrieving revision 1.2
diff -u -p -r1.2 locore0.S
--- sys/arch/amd64/amd64/locore0.S 6 Jul 2017 06:17:04 -0000 1.2
+++ sys/arch/amd64/amd64/locore0.S 21 Feb 2018 21:33:03 -0000
@@ -216,6 +216,48 @@ bi_size_ok:
movl %ecx,8(%ebp)
movl $0, 12(%ebp)
+ /*
+ * Determine if CPU has meltdown. Certain Intel CPUs do not properly
+ * respect page permissions when speculatively loading data into
+ * the cache ("Meltdown" CVE). These CPUs must utilize a secondary
+ * sanitized page table lacking kernel mappings when executing user
+ * processes, and may not use PG_G global PTEs for kernel VAs.
+ */
+ movl $0x1, RELOC(cpu_meltdown) /* assume insecure at first */
+ movl $0x0, RELOC(pg_g_kern)
+
+ cmpl $0x756e6547, %ebx # "Genu"
+ jne .Lcpu_secure
+ cmpl $0x6c65746e, %ecx # "ntel"
+ jne .Lcpu_secure
+ cmpl $0x49656e69, %edx # "ineI"
+ jne .Lcpu_secure
+
+ /*
+ * Intel CPU, now check if IA32_ARCH_CAPABILITIES is supported and
+ * if it says this CPU is safe.
+ */
+ movl $0x0, %eax
+ cpuid
+ cmpl $0x7, %eax
+ jl .Lcpu_check_finished
+
+ movl $0x7, %eax
+ cpuid
+ testl $SEFF0EDX_ARCH_CAP, %edx
+ jz .Lcpu_check_finished
+
+ /* IA32_ARCH_CAPABILITIES MSR available, use it to check CPU security */
+ movl $MSR_ARCH_CAPABILITIES, %ecx
+ rdmsr
+ testl $ARCH_CAPABILITIES_RDCL_NO, %eax
+ jz .Lcpu_check_finished
+
+.Lcpu_secure:
+ movl $0x0, RELOC(cpu_meltdown)
+ movl $PG_G, RELOC(pg_g_kern)
+
+.Lcpu_check_finished:
movl $1,%eax
cpuid
movl %eax,RELOC(cpu_id)
@@ -482,7 +524,8 @@ map_tables:
leal (PROC0_DMP2_OFF)(%esi), %ebx
xorl %eax, %eax
movl $(NDML2_ENTRIES * NPDPG), %ecx
-1: orl $(PG_V|PG_KW|PG_PS|PG_G), %eax
+1: orl $(PG_V|PG_KW|PG_PS), %eax
+ orl RELOC(pg_g_kern), %eax
cmpl $__kernel_base_phys, %eax
jl store_pte
cmpl $__kernel_end_phys, %eax
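
For readability, the same detection logic sketched in C (assumes a
compiler shipping <cpuid.h>; reading the IA32_ARCH_CAPABILITIES MSR
requires ring 0, so that final step is only noted in a comment -- the
boot code above performs it with rdmsr):

    #include <cpuid.h>
    #include <stdio.h>

    int
    main(void)
    {
            unsigned int max, eax, ebx, ecx, edx;

            __cpuid(0, max, ebx, ecx, edx);
            if (ebx != 0x756e6547 || edx != 0x49656e69 || ecx != 0x6c65746e) {
                    printf("not GenuineIntel: cpu_meltdown=0, pg_g_kern=PG_G\n");
                    return 0;
            }
            if (max < 7) {
                    printf("no CPUID leaf 7: assume Meltdown\n");
                    return 0;
            }
            __cpuid_count(7, 0, eax, ebx, ecx, edx);
            if (!(edx & (1U << 29))) {      /* SEFF0EDX_ARCH_CAP */
                    printf("no IA32_ARCH_CAPABILITIES MSR: assume Meltdown\n");
                    return 0;
            }
            /* ring 0 only: if rdmsr(0x10a) has RDCL_NO (bit 0), CPU is safe */
            printf("consult MSR 0x10a (IA32_ARCH_CAPABILITIES) in ring 0\n");
            return 0;
    }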
Index: sys/arch/amd64/amd64/machdep.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/machdep.c,v
retrieving revision 1.231
diff -u -p -r1.231 machdep.c
--- sys/arch/amd64/amd64/machdep.c 12 Jul 2017 06:26:32 -0000 1.231
+++ sys/arch/amd64/amd64/machdep.c 21 Feb 2018 21:41:06 -0000
@@ -95,7 +95,7 @@
#include <sys/sysctl.h>
-#include <machine/cpu.h>
+#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/pio.h>
#include <machine/psl.h>
@@ -148,6 +148,14 @@ extern int db_console;
#include <dev/ic/pckbcvar.h>
#endif
+/* #define MACHDEP_DEBUG */
+
+#ifdef MACHDEP_DEBUG
+#define DPRINTF(x...) do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* MACHDEP_DEBUG */
+
/* the following is used externally (sysctl_hw) */
char machine[] = MACHINE;
@@ -263,6 +271,7 @@ void cpu_init_extents(void);
void map_tramps(void);
void init_x86_64(paddr_t);
void (*cpuresetfn)(void);
+void enter_shared_special_pages(void);
#ifdef APERTURE
int allowaperture = 0;
@@ -315,6 +324,66 @@ cpu_startup(void)
/* Safe for i/o port / memory space allocation to use malloc now. */
x86_bus_space_mallocok();
+
+ /* enter the IDT and trampoline code in the u-k maps */
+ enter_shared_special_pages();
+
+ /* initialize CPU0's TSS and GDT and put them in the u-k maps */
+ cpu_enter_pages(&cpu_info_full_primary);
+}
+
+/*
+ * enter_shared_special_pages
+ *
+ * Requests mapping of various special pages required in the Intel Meltdown
+ * case (to be entered into the U-K page table):
+ *
+ * 1 IDT page
+ * A varying number of pages covering the U-K ".kutext" section. This section
+ * contains code needed during trampoline operation.
+ * A varying number of pages covering the U-K ".kudata" section. This section
+ * contains data accessed by the trampoline, before switching to U+K
+ * (for example, various shared global variables used by IPIs, etc.)
+ *
+ * The linker script places the required symbols in the sections above.
+ *
+ * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
+ * become no-ops.
+ */
+void
+enter_shared_special_pages(void)
+{
+ extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
+ extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
+ vaddr_t va;
+ paddr_t pa;
+
+ /* idt */
+ pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
+ DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
+ (uint64_t)idt_vaddr, (uint64_t)idt_paddr);
+
+ /* .kutext section */
+ va = (vaddr_t)__kutext_start;
+ pa = (paddr_t)__kernel_kutext_phys;
+ while (va < (vaddr_t)__kutext_end) {
+ pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
+ DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
+ __func__, (uint64_t)va, (uint64_t)pa);
+ va += PAGE_SIZE;
+ pa += PAGE_SIZE;
+ }
+
+ /* .kudata section */
+ va = (vaddr_t)__kudata_start;
+ pa = (paddr_t)__kernel_kudata_phys;
+ while (va < (vaddr_t)__kudata_end) {
+ pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
+ DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
+ __func__, (uint64_t)va, (uint64_t)pa);
+ va += PAGE_SIZE;
+ pa += PAGE_SIZE;
+ }
}
/*
@@ -331,12 +400,6 @@ x86_64_proc0_tss_ldt_init(void)
pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;
- /* an empty iomap, by setting its offset to the TSS limit */
- cpu_info_primary.ci_tss->tss_iobase = sizeof(struct x86_64_tss);
- cpu_info_primary.ci_tss->tss_rsp0 = pcb->pcb_kstack;
- cpu_info_primary.ci_tss->tss_ist[0] =
- (u_int64_t)proc0.p_addr + PAGE_SIZE - 16;
-
ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
lldt(0);
}
@@ -348,15 +411,11 @@ x86_64_proc0_tss_ldt_init(void)
#ifdef MULTIPROCESSOR
void
x86_64_init_pcb_tss_ldt(struct cpu_info *ci)
-{
+{
struct pcb *pcb = ci->ci_idle_pcb;
- ci->ci_tss->tss_iobase = sizeof(*ci->ci_tss);
- ci->ci_tss->tss_rsp0 = pcb->pcb_kstack;
- ci->ci_tss->tss_ist[0] = pcb->pcb_kstack - USPACE + PAGE_SIZE;
-
pcb->pcb_cr0 = rcr0();
-}
+}
#endif /* MULTIPROCESSOR */
bios_diskinfo_t *
@@ -1000,25 +1059,27 @@ dumpsys(void)
/*
* Force the userspace FS.base to be reloaded from the PCB on return from
- * the kernel, and reset most the segment registers (%ds, %es, and %fs)
+ * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
* to their expected userspace value.
*/
void
reset_segs(void)
{
/*
- * Segment registers (%ds, %es, %fs, %gs) aren't in the trapframe.
- * %gs is reset on return to userspace to avoid having to deal with
- * swapgs; others are reset on context switch and here. This
- * operates like the cpu_switchto() sequence: if we haven't reset
- * %[def]s already, do so now.
- */
+ * This operates like the cpu_switchto() sequence: if we
+ * haven't reset %[defg]s already, do so now.
+ */
if (curcpu()->ci_flags & CPUF_USERSEGS) {
curcpu()->ci_flags &= ~CPUF_USERSEGS;
__asm volatile(
"movw %%ax,%%ds\n\t"
"movw %%ax,%%es\n\t"
- "movw %%ax,%%fs" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
+ "movw %%ax,%%fs\n\t"
+ "cli\n\t" /* block intr when on user GS.base */
+ "swapgs\n\t" /* swap from kernel to user GS.base */
+ "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
+ "swapgs\n\t" /* back to kernel GS.base */
+ "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
}
}
@@ -1544,8 +1605,6 @@ init_x86_64(paddr_t first_avail)
pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);
- pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE,
- PROT_READ | PROT_WRITE);
#if defined(MULTIPROCESSOR) || \
(NACPI > 0 && !defined(SMALL_KERNEL))
@@ -1553,7 +1612,7 @@ init_x86_64(paddr_t first_avail)
#endif
idt = (struct gate_descriptor *)idt_vaddr;
- cpu_info_primary.ci_tss = (void *)(idt + NIDT);
+ cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
cpu_info_primary.ci_gdt = (void *)(cpu_info_primary.ci_tss + 1);
/* make gdt gates and memory segments */
@@ -1578,9 +1637,10 @@ init_x86_64(paddr_t first_avail)
/* exceptions */
for (x = 0; x < 32; x++) {
- ist = (x == 8) ? 1 : 0;
+ /* trap2 == NMI, trap8 == double fault */
+ ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
- (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
+ (x == 3) ? SEL_UPL : SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
idt_allocmap[x] = 1;
}
Index: sys/arch/amd64/amd64/pmap.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v
retrieving revision 1.105
diff -u -p -r1.105 pmap.c
--- sys/arch/amd64/amd64/pmap.c 24 Jul 2017 15:31:14 -0000 1.105
+++ sys/arch/amd64/amd64/pmap.c 22 Feb 2018 20:55:33 -0000
@@ -126,6 +126,15 @@
#include "acpi.h"
+/* #define PMAP_DEBUG */
+
+#ifdef PMAP_DEBUG
+#define DPRINTF(x...) do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* PMAP_DEBUG */
+
+
/*
* general info:
*
@@ -262,6 +271,7 @@ TAILQ_HEAD(pg_to_free, vm_page);
struct pool pmap_pdp_pool;
void pmap_pdp_ctor(pd_entry_t *);
+void pmap_pdp_ctor_intel(pd_entry_t *);
extern vaddr_t msgbuf_vaddr;
extern paddr_t msgbuf_paddr;
@@ -275,6 +285,8 @@ extern vaddr_t lo32_paddr;
vaddr_t virtual_avail;
extern int end;
+extern uint32_t cpu_meltdown;
+
/*
* local prototypes
*/
@@ -316,7 +328,6 @@ void pmap_tlb_shootwait(void);
#define pmap_tlb_shootwait()
#endif
-
/*
* p m a p i n l i n e h e l p e r f u n c t i o n s
*/
@@ -330,7 +341,8 @@ static __inline boolean_t
pmap_is_curpmap(struct pmap *pmap)
{
return((pmap == pmap_kernel()) ||
- (pmap->pm_pdirpa == (paddr_t) rcr3()));
+ (pmap->pm_pdirpa == (paddr_t) rcr3()) ||
+ (pmap->pm_pdirpa_intel == (paddr_t) rcr3()));
}
/*
@@ -491,7 +503,6 @@ pmap_find_pte_direct(struct pmap *pm, va
return (0);
}
-
/*
* p m a p k e n t e r f u n c t i o n s
*
@@ -520,7 +531,7 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, v
/* special 1:1 mappings in the first 2MB must not be global */
if (va >= (vaddr_t)NBPD_L2)
- npte |= PG_G;
+ npte |= pg_g_kern;
if (!(prot & PROT_EXEC))
npte |= pg_nx;
@@ -593,12 +604,12 @@ pmap_kremove(vaddr_t sva, vsize_t len)
paddr_t
pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
{
- vaddr_t kva, kva_end, kva_start = VM_MIN_KERNEL_ADDRESS;
+ vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS;
struct pmap *kpm;
int i;
- unsigned long p1i;
long ndmpdp;
paddr_t dmpd, dmpdp;
+ vaddr_t kva, kva_end;
/*
* define the boundaries of the managed kernel virtual address
@@ -654,9 +665,14 @@ pmap_bootstrap(paddr_t first_avail, padd
curpcb->pcb_pmap = kpm; /* proc0's pcb */
/*
- * enable global TLB entries.
+ * Add PG_G attribute to already mapped kernel pages. pg_g_kern
+ * is calculated in locore0.S and may be set to:
+ *
+ * 0 if this CPU does not safely support global pages in the kernel
+ * (Intel/Meltdown)
+ * PG_G if this CPU does safely support global pages in the kernel
+ * (AMD)
*/
- /* add PG_G attribute to already mapped kernel pages */
#if KERNBASE == VM_MIN_KERNEL_ADDRESS
for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
#else
@@ -664,9 +680,9 @@ pmap_bootstrap(paddr_t first_avail, padd
for (kva = KERNBASE; kva < kva_end ;
#endif
kva += PAGE_SIZE) {
- p1i = pl1_i(kva);
+ unsigned long p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
- PTE_BASE[p1i] |= PG_G;
+ PTE_BASE[p1i] |= pg_g_kern;
}
/*
@@ -691,7 +707,7 @@ pmap_bootstrap(paddr_t first_avail, padd
va = PMAP_DIRECT_MAP(pdp);
*((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT);
- *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | PG_G | PG_U |
+ *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U |
PG_M | pg_nx;
}
@@ -737,7 +753,7 @@ pmap_bootstrap(paddr_t first_avail, padd
LIST_INIT(&pmaps);
/*
- * initialize the pmap pool.
+ * initialize the pmap pools.
*/
pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_NONE, 0,
@@ -753,6 +769,9 @@ pmap_bootstrap(paddr_t first_avail, padd
pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_NONE, PR_WAITOK,
"pdppl", NULL);
+ kpm->pm_pdir_intel = 0;
+ kpm->pm_pdirpa_intel = 0;
+
/*
* ensure the TLB is sync'd with reality by flushing it...
*/
@@ -905,13 +924,21 @@ pmap_free_ptp(struct pmap *pmap, struct
unsigned long index;
int level;
vaddr_t invaladdr;
- pd_entry_t opde;
+ pd_entry_t opde, *mdpml4es;
level = 1;
do {
pmap_freepage(pmap, ptp, level, pagelist);
index = pl_i(va, level + 1);
opde = pmap_pte_set(&pdes[level - 1][index], 0);
+ if (level == 3 && pmap->pm_pdir_intel) {
+ /* Zap special meltdown PML4e */
+ mdpml4es = (pd_entry_t *)pmap->pm_pdir_intel;
+ opde = pmap_pte_set(&mdpml4es[index], 0);
+ DPRINTF("%s: cleared meltdown PML4e @ index %lu "
+ "(va range start 0x%llx)\n", __func__, index,
+ (uint64_t)(index << L4_SHIFT));
+ }
invaladdr = level == 1 ? (vaddr_t)ptes :
(vaddr_t)pdes[level - 2];
pmap_tlb_shootpage(curpcb->pcb_pmap,
@@ -945,7 +972,7 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t
struct vm_page *ptp, *pptp;
int i;
unsigned long index;
- pd_entry_t *pva;
+ pd_entry_t *pva, *pva_intel;
paddr_t ppa, pa;
struct uvm_object *obj;
@@ -984,6 +1011,20 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t
pmap->pm_ptphint[i - 2] = ptp;
pa = VM_PAGE_TO_PHYS(ptp);
pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V);
+
+ /*
+ * Meltdown Special case - if we are adding a new PML4e for
+ * usermode addresses, just copy the PML4e to the U-K page
+ * table.
+ */
+ if (pmap->pm_pdir_intel && i == 4 && va < VM_MAXUSER_ADDRESS) {
+ pva_intel = (pd_entry_t *)pmap->pm_pdir_intel;
+ pva_intel[index] = pva[index];
+ DPRINTF("%s: copying usermode PML4e (content=0x%llx) "
+ "from 0x%llx -> 0x%llx\n", __func__, pva[index],
+ (uint64_t)&pva[index], (uint64_t)&pva_intel[index]);
+ }
+
pmap->pm_stats.resident_count++;
/*
* If we're not in the top level, increase the
@@ -1059,6 +1100,15 @@ pmap_pdp_ctor(pd_entry_t *pdir)
#endif
}
+void
+pmap_pdp_ctor_intel(pd_entry_t *pdir)
+{
+ struct pmap *kpm = pmap_kernel();
+
+ /* Copy PML4es from pmap_kernel's U-K view */
+ memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE);
+}
+
/*
* pmap_create: create a pmap
*
@@ -1099,6 +1149,22 @@ pmap_create(void)
pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME;
+ /*
+ * Intel CPUs need a special page table to be used during usermode
+ * execution, one that lacks all kernel mappings.
+ */
+ if (cpu_meltdown) {
+ pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK);
+ pmap_pdp_ctor_intel(pmap->pm_pdir_intel);
+ if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
+ &pmap->pm_pdirpa_intel))
+ panic("%s: unknown PA mapping for meltdown PML4\n",
+ __func__);
+ } else {
+ pmap->pm_pdir_intel = 0;
+ pmap->pm_pdirpa_intel = 0;
+ }
+
LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
return (pmap);
}
@@ -1156,6 +1222,9 @@ pmap_destroy(struct pmap *pmap)
/* XXX: need to flush it out of other processor's space? */
pool_put(&pmap_pdp_pool, pmap->pm_pdir);
+ if (pmap->pm_pdir_intel)
+ pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
+
pool_put(&pmap_pmap_pool, pmap);
}
@@ -1970,6 +2039,132 @@ pmap_collect(struct pmap *pmap)
* defined as macro in pmap.h
*/
+void
+pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot)
+{
+ uint64_t l4idx, l3idx, l2idx, l1idx;
+ pd_entry_t *pd, *ptp;
+ paddr_t npa;
+ struct pmap *pmap = pmap_kernel();
+ pt_entry_t *ptes;
+ int level, offs;
+
+ /* If CPU is secure, no need to do anything */
+ if (!cpu_meltdown)
+ return;
+
+ /* Must be kernel VA */
+ if (va < VM_MIN_KERNEL_ADDRESS)
+ panic("%s: invalid special mapping va 0x%lx requested",
+ __func__, va);
+
+ if (!pmap->pm_pdir_intel)
+ pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool,
+ PR_WAITOK | PR_ZERO);
+
+ l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */
+ l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
+ l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */
+ l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */
+
+ DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld "
+ "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va,
+ (uint64_t)pa, l4idx, l3idx, l2idx, l1idx);
+
+ /* Start at PML4 / top level */
+ pd = (pd_entry_t *)pmap->pm_pdir_intel;
+
+ if (!pd)
+ panic("%s: PML4 not initialized for pmap @ %p\n", __func__,
+ pmap);
+
+ /* npa = physaddr of PDPT */
+ npa = pd[l4idx] & PMAP_PA_MASK;
+
+ /* Valid PML4e for the 512GB region containing va? */
+ if (!npa) {
+ /* No valid PML4E - allocate PDPT page and set PML4E */
+
+ ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
+
+ if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
+ panic("%s: can't locate PDPT page\n", __func__);
+
+ pd[l4idx] = (npa | PG_u | PG_RW | PG_V);
+
+ DPRINTF("%s: allocated new PDPT page at phys 0x%llx, "
+ "setting PML4e[%lld] = 0x%llx\n", __func__,
+ (uint64_t)npa, l4idx, pd[l4idx]);
+ }
+
+ pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
+ if (!pd)
+ panic("%s: can't locate PDPT @ pa=0x%llx\n", __func__,
+ (uint64_t)npa);
+
+ /* npa = physaddr of PD page */
+ npa = pd[l3idx] & PMAP_PA_MASK;
+
+ /* Valid PDPTe for the 1GB region containing va? */
+ if (!npa) {
+ /* No valid PDPTe - allocate PD page and set PDPTe */
+
+ ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
+
+ if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
+ panic("%s: can't locate PD page\n", __func__);
+
+ pd[l3idx] = (npa | PG_u | PG_RW | PG_V);
+
+ DPRINTF("%s: allocated new PD page at phys 0x%llx, "
+ "setting PDPTe[%lld] = 0x%llx\n", __func__,
+ (uint64_t)npa, l3idx, pd[l3idx]);
+ }
+
+ pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
+ if (!pd)
+ panic("%s: can't locate PD page @ pa=0x%llx\n", __func__,
+ (uint64_t)npa);
+
+ /* npa = physaddr of PT page */
+ npa = pd[l2idx] & PMAP_PA_MASK;
+
+ /* Valid PDE for the 2MB region containing va? */
+ if (!npa) {
+ /* No valid PDE - allocate PT page and set PDE */
+
+ ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
+
+ if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
+ panic("%s: can't locate PT page\n", __func__);
+
+ pd[l2idx] = (npa | PG_u | PG_RW | PG_V);
+
+ DPRINTF("%s: allocated new PT page at phys 0x%llx, "
+ "setting PDE[%lld] = 0x%llx\n", __func__,
+ (uint64_t)npa, l2idx, pd[l2idx]);
+ }
+
+ pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
+ if (!pd)
+ panic("%s: can't locate PT page @ pa=0x%llx\n", __func__,
+ (uint64_t)npa);
+
+ DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot "
+ "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd,
+ (uint64_t)prot, (uint64_t)pd[l1idx]);
+
+ pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_G | PG_W;
+ DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]);
+
+ /* now set the PG_G flag on the corresponding U+K entry */
+ level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
+ if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs])))
+ ptes[offs] |= PG_G;
+ else
+ DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
+}
+
/*
* pmap_enter: enter a mapping into a pmap
*
@@ -2166,7 +2361,7 @@ enter_now:
else if (va < VM_MAX_ADDRESS)
npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
if (pmap == pmap_kernel())
- npte |= PG_G;
+ npte |= pg_g_kern;
ptes[pl1_i(va)] = npte; /* zap! */
@@ -2450,10 +2645,10 @@ pmap_convert(struct pmap *pmap, int mode
* release the lock if we get an interrupt in a bad moment.
*/
-volatile long tlb_shoot_wait;
+volatile long tlb_shoot_wait __attribute__((section(".kudata")));
-volatile vaddr_t tlb_shoot_addr1;
-volatile vaddr_t tlb_shoot_addr2;
+volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
+volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
void
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
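
To make the page-table walk in pmap_enter_special() above concrete, a
worked example of the four index extractions using the standard amd64
9/9/9/9+12 split (the shift/mask constants stand in for L4_SHIFT/L4_MASK
and friends; the VA is an arbitrary sample):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint64_t va = 0xffff800000201000ULL;    /* sample kernel VA */
            uint64_t l4 = (va >> 39) & 0x1ff;       /* PML4E idx */
            uint64_t l3 = (va >> 30) & 0x1ff;       /* PDPTE idx */
            uint64_t l2 = (va >> 21) & 0x1ff;       /* PDE idx */
            uint64_t l1 = (va >> 12) & 0x1ff;       /* PTE idx */

            /* prints: l4=256 l3=0 l2=1 l1=1 */
            printf("l4=%llu l3=%llu l2=%llu l1=%llu\n",
                (unsigned long long)l4, (unsigned long long)l3,
                (unsigned long long)l2, (unsigned long long)l1);
            return 0;
    }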
Index: sys/arch/amd64/amd64/spl.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/spl.S,v
retrieving revision 1.11
diff -u -p -r1.11 spl.S
--- sys/arch/amd64/amd64/spl.S 20 May 2016 14:37:53 -0000 1.11
+++ sys/arch/amd64/amd64/spl.S 21 Feb 2018 21:33:03 -0000
@@ -114,7 +114,7 @@ _C_LABEL(splx):
* a lower-prio one first, which needs to take the kernel lock -->
* the sending CPU will never see that CPU accept the IPI
*/
-IDTVEC(spllower)
+KIDTVEC(spllower)
_PROF_PROLOGUE
pushq %rbx
pushq %r13
@@ -143,7 +143,7 @@ IDTVEC(spllower)
* ebx - cpl to restore
* r13 - address to resume loop at
*/
-IDTVEC(doreti)
+KIDTVEC(doreti)
popq %rbx # get previous priority
decl CPUVAR(IDEPTH)
leaq 1f(%rip),%r13
@@ -168,4 +168,8 @@ IDTVEC(doreti)
call _C_LABEL(ast)
cli
jmp 5b
-3: INTRFASTEXIT
+3:
+#ifdef DIAGNOSTIC
+ movl $254,%esi
+#endif /* DIAGNOSTIC */
+ INTRFASTEXIT
Index: sys/arch/amd64/amd64/trap.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/trap.c,v
retrieving revision 1.61
diff -u -p -r1.61 trap.c
--- sys/arch/amd64/amd64/trap.c 4 Oct 2017 02:10:33 -0000 1.61
+++ sys/arch/amd64/amd64/trap.c 21 Feb 2018 21:33:03 -0000
@@ -214,6 +214,18 @@ trap(struct trapframe *frame)
frame->tf_rip = (u_int64_t)xrstor_resume;
return;
}
+
+ /*
+ * Check for failure during return to user mode.
+ * We do this by looking at the address of the
+ * instruction that faulted.
+ */
+ if (frame->tf_rip == (u_int64_t)doreti_iret) {
+ frame->tf_rip = (u_int64_t)resume_iret;
+ return;
+ }
+ /* FALLTHROUGH */
+
case T_SEGNPFLT:
case T_ALIGNFLT:
case T_TSSFLT:
@@ -225,16 +237,6 @@ copyfault:
frame->tf_rip = (u_int64_t)pcb->pcb_onfault;
return;
}
-
- /*
- * Check for failure during return to user mode.
- * We do this by looking at the address of the
- * instruction that faulted.
- */
- if (frame->tf_rip == (u_int64_t)doreti_iret) {
- frame->tf_rip = (u_int64_t)resume_iret;
- return;
- }
goto we_re_toast;
case T_PROTFLT|T_USER: /* protection fault */
@@ -457,8 +459,12 @@ out:
static void
frame_dump(struct trapframe *tf)
{
- printf("rip %p rsp %p rfl %p\n",
- (void *)tf->tf_rip, (void *)tf->tf_rsp, (void *)tf->tf_rflags);
+ printf("rip %p cs 0x%x rfl %p rsp %p ss 0x%x\n",
+ (void *)tf->tf_rip, (unsigned)tf->tf_cs & 0xffff,
+ (void *)tf->tf_rflags,
+ (void *)tf->tf_rsp, (unsigned)tf->tf_ss & 0xffff);
+ printf("err 0x%llx trapno 0x%llx\n",
+ tf->tf_err, tf->tf_trapno);
printf("rdi %p rsi %p rdx %p\n",
(void *)tf->tf_rdi, (void *)tf->tf_rsi, (void *)tf->tf_rdx);
printf("rcx %p r8 %p r9 %p\n",
Index: sys/arch/amd64/amd64/vector.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/vector.S,v
retrieving revision 1.51
diff -u -p -r1.51 vector.S
--- sys/arch/amd64/amd64/vector.S 4 Oct 2017 02:10:33 -0000 1.51
+++ sys/arch/amd64/amd64/vector.S 28 Feb 2018 17:13:02 -0000
@@ -104,36 +104,97 @@
#define TRAP(a) pushq $(a) ; jmp _C_LABEL(alltraps)
#define ZTRAP(a) pushq $0 ; TRAP(a)
- .text
IDTVEC(trap00)
ZTRAP(T_DIVIDE)
IDTVEC(trap01)
ZTRAP(T_TRCTRAP)
+
+/*
+ * NMIs can happen at any time, so there's no simple way to tell
+ * which GS.base is in place at the time of the interrupt. Instead,
+ * borrow a couple ideas from FreeBSD and put the CPU's kernel
+ * GS.base in the memory right above the stack, storing the current
+ * one in a pair of callee-saved registers (%r12/13). We save the
+ * current %cr3 in a callee-saved register too (%r15).
+ * Note: we don't unblock interrupts because a nested normal interrupt
+ * would also reenable NMIs.
+ */
IDTVEC(trap02)
- ZTRAP(T_NMI)
+ pushq $0
+ pushq $T_NMI
+calltrap_specstk: # special stack path
+ INTR_REENTRY
+ movl $MSR_GSBASE,%ecx # save current GS.base...
+ rdmsr
+ movq %rax,%r12 # ...in %r12 and %r13
+ movq %rdx,%r13
+ movq FRAMESIZE(%rsp),%rax # get kernel GS.base
+ movq %rax,%rdx
+ shrq $32,%rdx
+ wrmsr # switch to it
+ movq %cr3,%r15 # save current %cr3 in %r15
+ movq CPUVAR(KERN_CR3),%rax # switch to kernel page tables
+ testq %rax,%rax
+ jz INTRENTRY_LABEL(calltrap_specstk)
+ movq %rax,%cr3
+ jmp INTRENTRY_LABEL(calltrap_specstk)
+ .text
+ .globl INTRENTRY_LABEL(calltrap_specstk)
+INTRENTRY_LABEL(calltrap_specstk):
+ cld
+ SMAP_CLAC
+ movq %rsp,%rdi
+ call trap
+ movl $MSR_GSBASE,%ecx # restore GS.base
+ movq %r12,%rax
+ movq %r13,%rdx
+ wrmsr
+ popq %rdi
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %r8
+ popq %r9
+ popq %r10
+ popq %r11
+ popq %r12
+ popq %r13
+ popq %r14
+ jmp calltrap_specstk_tramp
+KUENTRY(calltrap_specstk_tramp)
+ movq %r15,%cr3 # restore %cr3
+ popq %r15
+ popq %rbp
+ popq %rbx
+ popq %rax
+ addq $48,%rsp # ignored TF_[DEFG]S
+ iretq
+
IDTVEC(trap03)
ZTRAP(T_BPTFLT)
IDTVEC(trap04)
- ZTRAP(T_OFLOW)
+ ZTRAP(T_OFLOW) # impossible: INTO instruction invalid in amd64
IDTVEC(trap05)
- ZTRAP(T_BOUND)
+ ZTRAP(T_BOUND) # impossible: BOUND instruction invalid in amd64
IDTVEC(trap06)
ZTRAP(T_PRIVINFLT)
IDTVEC(trap07)
pushq $0 # dummy error code
pushq $T_DNA
- INTRENTRY
+ INTRENTRY(trap07)
sti
cld
SMAP_CLAC
movq CPUVAR(SELF),%rdi
movq %rsp, %rsi
call _C_LABEL(fpudna)
+ cli
INTRFASTEXIT
IDTVEC(trap08)
- TRAP(T_DOUBLEFLT)
+ pushq $T_DOUBLEFLT
+ jmp calltrap_specstk
IDTVEC(trap09)
- ZTRAP(T_FPOPFLT)
+ ZTRAP(T_FPOPFLT) # impossible: not generated on amd64
IDTVEC(trap0a)
TRAP(T_TSSFLT)
IDTVEC(trap0b)
@@ -149,30 +210,49 @@ IDTVEC(trap0c)
* so that we can do the necessary swapgs in that case.
*/
IDTVEC(trap0d)
- subq $TF_ERR,%rsp
- movl $T_PROTFLT,TF_TRAPNO(%rsp)
- movq %rdi,TF_RDI(%rsp)
- leaq _C_LABEL(doreti_iret)(%rip),%rdi
- cmpq %rdi,TF_RIP(%rsp)
+ pushq %rcx
+ leaq _C_LABEL(doreti_iret)(%rip),%rcx
+ cmpq %rcx,16(%rsp) /* over %rcx and err to %rip */
+ popq %rcx
je 1f
- testq $SEL_RPL,TF_CS(%rsp)
- jz 2f
+ testq $SEL_RPL,16(%rsp) /* over err and %rip to %cs */
+ je INTRENTRY_LABEL(trap0d)
1: swapgs
-2: movq %r15,TF_R15(%rsp)
- movq %r14,TF_R14(%rsp)
- movq %r13,TF_R13(%rsp)
- movq %r12,TF_R12(%rsp)
- movq %r11,TF_R11(%rsp)
- movq %r10,TF_R10(%rsp)
- movq %r9,TF_R9(%rsp)
- movq %r8,TF_R8(%rsp)
- /*movq %rdi,TF_RDI(%rsp) done above */
- movq %rsi,TF_RSI(%rsp)
- movq %rbp,TF_RBP(%rsp)
- movq %rbx,TF_RBX(%rsp)
- movq %rdx,TF_RDX(%rsp)
+ movq %rax,CPUVAR(SCRATCH)
+ movq CPUVAR(KERN_CR3),%rax
+ testq %rax,%rax
+ jz 98f
+ movq %rax,%cr3
+ jmp 98f
+ .text
+ .globl INTRENTRY_LABEL(trap0d)
+INTRENTRY_LABEL(trap0d): /* from kernel */
+ pushq $T_PROTFLT
+ subq $152,%rsp
movq %rcx,TF_RCX(%rsp)
- movq %rax,TF_RAX(%rsp)
+ jmp 99f
+98: /* from userspace */
+ movq CPUVAR(KERN_RSP),%rax
+ xchgq %rax,%rsp
+ movq %rcx,TF_RCX(%rsp)
+ /* set trapno in the trap frame */
+ movq $T_PROTFLT,TF_TRAPNO(%rsp)
+ /* copy err and iretq frame to the trap frame */
+ movq 0(%rax),%rcx
+ movq %rcx,TF_ERR(%rsp)
+ add $8,%rax
+ movq IRETQ_RIP(%rax),%rcx
+ movq %rcx,TF_RIP(%rsp)
+ movq IRETQ_CS(%rax),%rcx
+ movq %rcx,TF_CS(%rsp)
+ movq IRETQ_RFLAGS(%rax),%rcx
+ movq %rcx,TF_RFLAGS(%rsp)
+ movq IRETQ_RSP(%rax),%rcx
+ movq %rcx,TF_RSP(%rsp)
+ movq IRETQ_SS(%rax),%rcx
+ movq %rcx,TF_SS(%rsp)
+ movq CPUVAR(SCRATCH),%rax
+99: INTR_SAVE_MOST_GPRS_NO_ADJ
sti
jmp calltrap
@@ -204,7 +284,9 @@ IDTVEC(trap1f)
/* 20 - 31 reserved for future exp */
ZTRAP(T_RESERVED)
-IDTVEC(exceptions)
+ .section .rodata
+ .globl Xexceptions
+Xexceptions:
.quad _C_LABEL(Xtrap00), _C_LABEL(Xtrap01)
.quad _C_LABEL(Xtrap02), _C_LABEL(Xtrap03)
.quad _C_LABEL(Xtrap04), _C_LABEL(Xtrap05)
@@ -232,19 +314,44 @@ IDTVEC(exceptions)
* protection fault. This will cause the process to get a SIGBUS.
*/
NENTRY(resume_iret)
- pushq $0
- pushq $T_PROTFLT
- subq $32,%rsp
- INTR_SAVE_GPRS
+ movq %rax,CPUVAR(SCRATCH)
+ movq CPUVAR(KERN_CR3),%rax
+ testq %rax,%rax
+ jz INTRENTRY_LABEL(iret)
+ movq %rax,%cr3
+ jmp INTRENTRY_LABEL(iret)
+ .text
+ .globl INTRENTRY_LABEL(iret)
+INTRENTRY_LABEL(iret): /* from kernel */
+ movq CPUVAR(KERN_RSP),%rax
+ xchgq %rax,%rsp
+ movq %rcx,TF_RCX(%rsp)
+ /* set trapno+err in the trap frame */
+ movq $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq $0,TF_ERR(%rsp)
+ /* copy iretq frame to the trap frame */
+ movq IRETQ_RIP(%rax),%rcx
+ movq %rcx,TF_RIP(%rsp)
+ movq IRETQ_CS(%rax),%rcx
+ movq %rcx,TF_CS(%rsp)
+ movq IRETQ_RFLAGS(%rax),%rcx
+ movq %rcx,TF_RFLAGS(%rsp)
+ movq IRETQ_RSP(%rax),%rcx
+ movq %rcx,TF_RSP(%rsp)
+ movq IRETQ_SS(%rax),%rcx
+ movq %rcx,TF_SS(%rsp)
+ movq CPUVAR(SCRATCH),%rax
+ INTR_SAVE_MOST_GPRS_NO_ADJ
sti
jmp calltrap
+
/*
* All traps go through here. Call the generic trap handler, and
* check for ASTs afterwards.
*/
-NENTRY(alltraps)
- INTRENTRY
+KUENTRY(alltraps)
+ INTRENTRY(alltraps)
sti
calltrap:
cld
@@ -329,6 +436,7 @@ spl_lowered:
/* XXX See comment in locore.s */
#define XINTR(name,num) Xintr_##name##num
+ KUTEXT
.globl _C_LABEL(x2apic_eoi)
_C_LABEL(x2apic_eoi):
pushq %rax
@@ -345,23 +453,23 @@ _C_LABEL(x2apic_eoi):
#if NLAPIC > 0
#ifdef MULTIPROCESSOR
-IDTVEC(recurse_lapic_ipi)
+KIDTVEC(recurse_lapic_ipi)
INTR_RECURSE_HWFRAME
- pushq $0
+ pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTR_REENTRY
jmp 1f
IDTVEC(intr_lapic_ipi)
- pushq $0
+ pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTRENTRY(intr_lapic_ipi)
CODEPATCH_START
movl $0,_C_LABEL(local_apic)+LAPIC_EOI
CODEPATCH_END(CPTAG_EOI)
movl CPUVAR(ILEVEL),%ebx
cmpl $IPL_IPI,%ebx
jae 2f
-IDTVEC(resume_lapic_ipi)
+KIDTVEC(resume_lapic_ipi)
1:
incl CPUVAR(IDEPTH)
movl $IPL_IPI,CPUVAR(ILEVEL)
@@ -425,27 +533,27 @@ IDTVEC(ipi_invlrange)
iretq
#endif /* MULTIPROCESSOR */
-
+
/*
* Interrupt from the local APIC timer.
*/
-IDTVEC(recurse_lapic_ltimer)
+KIDTVEC(recurse_lapic_ltimer)
INTR_RECURSE_HWFRAME
- pushq $0
+ pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTR_REENTRY
jmp 1f
IDTVEC(intr_lapic_ltimer)
- pushq $0
+ pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTRENTRY(intr_lapic_ltimer)
CODEPATCH_START
movl $0,_C_LABEL(local_apic)+LAPIC_EOI
CODEPATCH_END(CPTAG_EOI)
movl CPUVAR(ILEVEL),%ebx
cmpl $IPL_CLOCK,%ebx
jae 2f
-IDTVEC(resume_lapic_ltimer)
+KIDTVEC(resume_lapic_ltimer)
1:
incl CPUVAR(IDEPTH)
movl $IPL_CLOCK,CPUVAR(ILEVEL)
@@ -466,21 +574,21 @@ IDTVEC(resume_lapic_ltimer)
* Xen event channel upcall interrupt handler.
* Only used when the hypervisor supports direct vector callbacks.
*/
-IDTVEC(recurse_xen_upcall)
+KIDTVEC(recurse_xen_upcall)
INTR_RECURSE_HWFRAME
pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTR_REENTRY
jmp 1f
IDTVEC(intr_xen_upcall)
pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTRENTRY(intr_xen_upcall)
call _C_LABEL(xen_intr_ack)
movl CPUVAR(ILEVEL),%ebx
cmpl $IPL_NET,%ebx
jae 2f
-IDTVEC(resume_xen_upcall)
+KIDTVEC(resume_xen_upcall)
1:
incl CPUVAR(IDEPTH)
movl $IPL_NET,CPUVAR(ILEVEL)
@@ -502,20 +610,20 @@ IDTVEC(resume_xen_upcall)
* Hyperv event channel upcall interrupt handler.
* Only used when the hypervisor supports direct vector callbacks.
*/
-IDTVEC(recurse_hyperv_upcall)
+KIDTVEC(recurse_hyperv_upcall)
INTR_RECURSE_HWFRAME
pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTR_REENTRY
jmp 1f
IDTVEC(intr_hyperv_upcall)
pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTRENTRY(intr_hyperv_upcall)
movl CPUVAR(ILEVEL),%ebx
cmpl $IPL_NET,%ebx
jae 2f
-IDTVEC(resume_hyperv_upcall)
+KIDTVEC(resume_hyperv_upcall)
1:
incl CPUVAR(IDEPTH)
movl $IPL_NET,CPUVAR(ILEVEL)
@@ -542,11 +650,11 @@ IDTVEC(resume_hyperv_upcall)
*/
#define INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
-IDTVEC(recurse_##name##num) ;\
+KIDTVEC(recurse_##name##num) ;\
INTR_RECURSE_HWFRAME ;\
subq $16,%rsp /* space for __if_{trapno,err} */;\
- INTRENTRY ;\
-IDTVEC(resume_##name##num) \
+ INTR_REENTRY ;\
+KIDTVEC(resume_##name##num) \
movq $IREENT_MAGIC,TF_ERR(%rsp) ;\
movl %ebx,%r13d ;\
movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\
@@ -555,7 +663,7 @@ IDTVEC(resume_##name##num) \
IDTVEC(intr_##name##num) ;\
pushq $0 /* dummy error code */ ;\
subq $8,%rsp /* unused __if_trapno */ ;\
- INTRENTRY ;\
+ INTRENTRY(intr_##name##num) ;\
movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\
mask(num) /* mask it in hardware */ ;\
early_ack(num) /* and allow other intrs */ ;\
@@ -1094,8 +1202,7 @@ _C_LABEL(ioapic_level_stubs):
/*
* Soft interrupt handlers
*/
- .text
-IDTVEC(softtty)
+KIDTVEC(softtty)
movl $IPL_SOFTTTY, CPUVAR(ILEVEL)
sti
incl CPUVAR(IDEPTH)
@@ -1104,7 +1211,7 @@ IDTVEC(softtty)
decl CPUVAR(IDEPTH)
jmp *%r13
-IDTVEC(softnet)
+KIDTVEC(softnet)
movl $IPL_SOFTNET, CPUVAR(ILEVEL)
sti
incl CPUVAR(IDEPTH)
@@ -1113,7 +1220,7 @@ IDTVEC(softnet)
decl CPUVAR(IDEPTH)
jmp *%r13
-IDTVEC(softclock)
+KIDTVEC(softclock)
movl $IPL_SOFTCLOCK, CPUVAR(ILEVEL)
sti
incl CPUVAR(IDEPTH)
Index: sys/arch/amd64/conf/ld.script
===================================================================
RCS file: /cvs/src/sys/arch/amd64/conf/ld.script,v
retrieving revision 1.7
diff -u -p -r1.7 ld.script
--- sys/arch/amd64/conf/ld.script 6 Jul 2017 06:21:56 -0000 1.7
+++ sys/arch/amd64/conf/ld.script 21 Feb 2018 21:33:03 -0000
@@ -52,6 +52,15 @@ SECTIONS
*(.text .text.*)
} :text =0xcccccccc
+ . = ALIGN(__ALIGN_SIZE);
+ __kernel_kutext_phys = (. - __kernel_virt_base) + 0x1000000;
+ .kutext : AT (__kernel_kutext_phys)
+ {
+ __kutext_start = ABSOLUTE(.);
+ *(.kutext)
+ __kutext_end = ABSOLUTE(.);
+ } :text =0xcccccccc
+
PROVIDE (etext = .);
_etext = .;
@@ -84,6 +93,17 @@ SECTIONS
__data_start = ABSOLUTE(.);
*(.data .data.*)
} :data =0xcccccccc
+ . = ALIGN(0x1000);
+
+ . = ALIGN(__ALIGN_SIZE);
+ __kernel_kudata_phys = (. - __kernel_virt_base) + 0x1000000;
+ .kudata : AT (__kernel_kudata_phys)
+ {
+ __kudata_start = ABSOLUTE(.);
+ *(.kudata)
+ __kudata_end = ABSOLUTE(.);
+ } :data =0xcccccccc
+
. = ALIGN(0x1000);
PROVIDE (edata = .);
_edata = .;
Index: sys/arch/amd64/include/asm.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/asm.h,v
retrieving revision 1.8
diff -u -p -r1.8 asm.h
--- sys/arch/amd64/include/asm.h 29 Jun 2017 17:36:16 -0000 1.8
+++ sys/arch/amd64/include/asm.h 21 Feb 2018 21:33:03 -0000
@@ -68,14 +68,19 @@
.text; _ALIGN_TEXT; .globl x; .type x,@function; x:
#ifdef _KERNEL
+#define KUTEXT .section .kutext, "ax"
+/*#define KUTEXT .text */
+
/* XXX Can't use __CONCAT() here, as it would be evaluated incorrectly. */
-#ifdef __STDC__
#define IDTVEC(name) \
- .text; ALIGN_TEXT; .globl X ## name; .type X ## name,@function; X ## name:
-#else
-#define IDTVEC(name) \
- .text; ALIGN_TEXT; .globl X/**/name; .type X/**/name,@function; X/**/name:
-#endif /* __STDC__ */
+ KUTEXT; ALIGN_TEXT; \
+ .globl X ## name; .type X ## name,@function; X ## name:
+#define KIDTVEC(name) \
+ .text; ALIGN_TEXT; \
+ .globl X ## name; .type X ## name,@function; X ## name:
+#define KUENTRY(x) \
+ KUTEXT; _ALIGN_TEXT; .globl x; .type x,@function; x:
+
#endif /* _KERNEL */
#ifdef __STDC__
Index: sys/arch/amd64/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
retrieving revision 1.114
diff -u -p -r1.114 cpu.h
--- sys/arch/amd64/include/cpu.h 11 Aug 2017 20:19:14 -0000 1.114
+++ sys/arch/amd64/include/cpu.h 21 Feb 2018 21:33:03 -0000
@@ -43,7 +43,7 @@
*/
#ifdef _KERNEL
#include <machine/frame.h>
-#include <machine/segments.h>
+#include <machine/segments.h> /* USERMODE */
#include <machine/cacheinfo.h>
#include <machine/intrdefs.h>
#endif /* _KERNEL */
@@ -89,6 +89,17 @@ union vmm_cpu_cap {
struct x86_64_tss;
struct cpu_info {
+ /*
+ * The beginning of this structure is mapped in the userspace "u-k"
+ * page tables, so that these first couple members can be accessed
+ * from the trampoline code. The ci_PAGEALIGN member defines where
+ * the part that is *not* visible begins, so don't put anything
+ * above it that must be kept hidden from userspace!
+ */
+ u_int64_t ci_kern_cr3; /* U+K page table */
+ u_int64_t ci_scratch; /* for U<-->K transition */
+
+#define ci_PAGEALIGN ci_dev
struct device *ci_dev;
struct cpu_info *ci_self;
struct schedstate_percpu ci_schedstate; /* scheduler state */
@@ -100,7 +111,9 @@ struct cpu_info {
u_int ci_acpi_proc_id;
u_int32_t ci_randseed;
- u_int64_t ci_scratch;
+ u_int64_t ci_kern_rsp; /* kernel-only stack */
+ u_int64_t ci_intr_rsp; /* U<-->K trampoline stack */
+ u_int64_t ci_user_cr3; /* U-K page table */
struct proc *ci_fpcurproc;
struct proc *ci_fpsaveproc;
@@ -127,6 +140,8 @@ struct cpu_info {
u_int32_t ci_feature_eflags;
u_int32_t ci_feature_sefflags_ebx;
u_int32_t ci_feature_sefflags_ecx;
+ u_int32_t ci_feature_sefflags_edx;
+ u_int32_t ci_feature_amdspec_ebx;
u_int32_t ci_feature_tpmflags;
u_int32_t ci_pnfeatset;
u_int32_t ci_efeature_eax;
@@ -215,7 +230,10 @@ struct cpu_info {
#define PROC_PC(p) ((p)->p_md.md_regs->tf_rip)
#define PROC_STACK(p) ((p)->p_md.md_regs->tf_rsp)
-extern struct cpu_info cpu_info_primary;
+struct cpu_info_full;
+extern struct cpu_info_full cpu_info_full_primary;
+#define cpu_info_primary (*(struct cpu_info *)((char *)&cpu_info_full_primary + 4096*2 - offsetof(struct cpu_info, ci_PAGEALIGN)))
+
extern struct cpu_info *cpu_info_list;
#define CPU_INFO_ITERATOR int
@@ -240,7 +258,8 @@ extern void need_resched(struct cpu_info
#define CPU_START_CLEANUP(_ci) ((_ci)->ci_func->cleanup(_ci))
#define curcpu() ({struct cpu_info *__ci; \
- asm volatile("movq %%gs:8,%0" : "=r" (__ci)); \
+ asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) \
+ :"n" (offsetof(struct cpu_info, ci_self))); \
__ci;})
#define cpu_number() (curcpu()->ci_cpuid)
@@ -261,8 +280,6 @@ void cpu_unidle(struct cpu_info *);
#define MAXCPUS 1
#ifdef _KERNEL
-extern struct cpu_info cpu_info_primary;
-
#define curcpu() (&cpu_info_primary)
#define cpu_kick(ci)
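
The cpu_info_primary macro above leans on the fixed layout that the new
cpu_info_full.h (next) asserts at compile time: ci_PAGEALIGN lands
exactly two pages into the struct. A standalone sketch of the same
arithmetic with mock types (illustrative only):

    #include <stddef.h>
    #include <stdio.h>

    #define PG 4096

    struct mock_cpu_info {
            unsigned long ci_kern_cr3, ci_scratch;  /* u-k visible head */
            void *ci_dev;                           /* = ci_PAGEALIGN */
            void *ci_self;
    };

    struct mock_cpu_info_full {
            char cif_ro[PG];        /* TSS + GDT page */
            char cif_stacks[PG - offsetof(struct mock_cpu_info, ci_dev)];
            struct mock_cpu_info cif_cpu;   /* head hangs into page two */
    };

    int
    main(void)
    {
            static struct mock_cpu_info_full f;
            struct mock_cpu_info *ci = (struct mock_cpu_info *)
                ((char *)&f + PG * 2 - offsetof(struct mock_cpu_info, ci_dev));

            printf("ci == &f.cif_cpu? %s\n", ci == &f.cif_cpu ? "yes" : "no");
            return 0;
    }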
Index: sys/arch/amd64/include/cpu_full.h
===================================================================
RCS file: sys/arch/amd64/include/cpu_full.h
diff -N sys/arch/amd64/include/cpu_full.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ sys/arch/amd64/include/cpu_full.h 22 Feb 2018 20:30:15 -0000
@@ -0,0 +1,66 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) Philip Guenther <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _MACHINE_CPU_FULL_H_
+#define _MACHINE_CPU_FULL_H_
+
+#include <sys/param.h> /* offsetof, PAGE_SIZE */
+#include <machine/segments.h>
+#include <machine/tss.h>
+
+/*
+ * The layout of the full per-CPU information, including TSS, GDT,
+ * trampoline stacks, and cpu_info described in <machine/cpu.h>
+ */
+struct cpu_info_full {
+ /* page mapped kRO in u-k */
+ union {
+ struct x86_64_tss u_tss; /* followed by gdt */
+ char u_align[PAGE_SIZE];
+ } cif_RO;
+#define cif_tss cif_RO.u_tss
+
+ /* start of page mapped kRW in u-k */
+ uint64_t cif_tramp_stack[(PAGE_SIZE / 4
+ - offsetof(struct cpu_info, ci_PAGEALIGN)) / sizeof(uint64_t)];
+ uint64_t cif_dblflt_stack[(PAGE_SIZE / 4) / sizeof(uint64_t)];
+ uint64_t cif_nmi_stack[(2 * PAGE_SIZE / 4) / sizeof(uint64_t)];
+
+ /*
+ * Beginning of this hangs over into the kRW page; rest is
+ * unmapped in u-k
+ */
+ struct cpu_info cif_cpu;
+} __aligned(PAGE_SIZE);
+
+/* tss, align shim, and gdt must fit in a page */
+CTASSERT(_ALIGN(sizeof(struct x86_64_tss)) +
+ sizeof(struct mem_segment_descriptor) * (NGDT_MEM + 2*NGDT_SYS)
+ < PAGE_SIZE);
+
+/* verify expected alignment */
+CTASSERT(offsetof(struct cpu_info_full, cif_cpu.ci_PAGEALIGN) % PAGE_SIZE == 0);
+
+/* verify total size is multiple of page size */
+CTASSERT(sizeof(struct cpu_info_full) % PAGE_SIZE == 0);
+
+extern struct cpu_info_full cpu_info_full_primary;
+
+/* Now make sure the cpu_info_primary macro is correct */
+CTASSERT(&cpu_info_primary - &cpu_info_full_primary.cif_cpu == 0);
+
+#endif /* _MACHINE_CPU_FULL_H_ */
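
The CTASSERTs pin down the layout this header aims for. Assuming PAGE_SIZE ==
4096 and no padding before ci_PAGEALIGN, the structure lays out as:

	0x0000	cif_RO: the TSS immediately followed by the GDT; rest of
		the page unused (mapped read-only in the u-k table)
	0x1000	cif_tramp_stack (a quarter page, minus the size of the
		visible head of cif_cpu), cif_dblflt_stack (a quarter
		page), cif_nmi_stack (a half page), then ci_kern_cr3 and
		ci_scratch, the u-k-visible head of cif_cpu (mapped
		read-write in the u-k table)
	0x2000	ci_PAGEALIGN (ci_dev) onward: the rest of cif_cpu, never
		mapped in the u-k table

Only the first two pages are ever entered into the u-k table; everything
from ci_PAGEALIGN on stays invisible to userspace.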
Index: sys/arch/amd64/include/cpufunc.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpufunc.h,v
retrieving revision 1.20
diff -u -p -r1.20 cpufunc.h
--- sys/arch/amd64/include/cpufunc.h 8 Aug 2017 15:53:55 -0000 1.20
+++ sys/arch/amd64/include/cpufunc.h 21 Feb 2018 21:33:03 -0000
@@ -315,6 +315,9 @@ breakpoint(void)
void amd64_errata(struct cpu_info *);
+struct cpu_info_full;
+void cpu_enter_pages(struct cpu_info_full *);
+
#endif /* _KERNEL */
#endif /* !_MACHINE_CPUFUNC_H_ */
Index: sys/arch/amd64/include/frame.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/frame.h,v
retrieving revision 1.6
diff -u -p -r1.6 frame.h
--- sys/arch/amd64/include/frame.h 26 Feb 2016 09:29:20 -0000 1.6
+++ sys/arch/amd64/include/frame.h 21 Feb 2018 21:33:03 -0000
@@ -147,6 +147,20 @@ struct intrframe {
int64_t if_ss;
};
+
+/*
+ * The trampoline frame used on the kernel stack page that stays present,
+ * but kernel-only, in the page tables used while in userspace.  This is
+ * the minimum state needed for an iretq.
+ */
+struct iretq_frame {
+ int64_t iretq_rip;
+ int64_t iretq_cs;
+ int64_t iretq_rflags;
+ int64_t iretq_rsp;
+ int64_t iretq_ss;
+};
+
/*
* Stack frame inside cpu_switch()
*/
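
Since iretq pops five consecutive 8-byte slots in a fixed order (RIP, CS,
RFLAGS, RSP, SS), the struct must match that order exactly, with no padding.
A standalone compile-time check, as a sketch:

	#include <stddef.h>
	#include <stdint.h>

	struct iretq_frame {
		int64_t	iretq_rip;
		int64_t	iretq_cs;
		int64_t	iretq_rflags;
		int64_t	iretq_rsp;
		int64_t	iretq_ss;
	};

	_Static_assert(offsetof(struct iretq_frame, iretq_ss) == 4 * 8,
	    "iretq consumes five consecutive 8-byte slots");
	_Static_assert(sizeof(struct iretq_frame) == 5 * 8, "no padding");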
Index: sys/arch/amd64/include/frameasm.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/frameasm.h,v
retrieving revision 1.10
diff -u -p -r1.10 frameasm.h
--- sys/arch/amd64/include/frameasm.h 4 Sep 2016 09:22:28 -0000 1.10
+++ sys/arch/amd64/include/frameasm.h 21 Feb 2018 21:33:03 -0000
@@ -13,7 +13,10 @@
* These are used on interrupt or trap entry or exit.
*/
#define INTR_SAVE_GPRS \
- subq $120,%rsp ; \
+ subq $120,%rsp ; \
+ INTR_SAVE_MOST_GPRS_NO_ADJ ; \
+ movq %rcx,TF_RCX(%rsp)
+#define INTR_SAVE_MOST_GPRS_NO_ADJ \
movq %r15,TF_R15(%rsp) ; \
movq %r14,TF_R14(%rsp) ; \
movq %r13,TF_R13(%rsp) ; \
@@ -27,15 +30,54 @@
movq %rbp,TF_RBP(%rsp) ; \
movq %rbx,TF_RBX(%rsp) ; \
movq %rdx,TF_RDX(%rsp) ; \
- movq %rcx,TF_RCX(%rsp) ; \
movq %rax,TF_RAX(%rsp)
-#define INTRENTRY \
- subq $32,%rsp ; \
- testq $SEL_RPL,56(%rsp) ; \
- je 98f ; \
+/* For real interrupt code paths, where we can come from userspace */
+#define INTRENTRY_LABEL(label) X##label##_untramp
+#define INTRENTRY(label) \
+ testq $SEL_RPL,24(%rsp) ; \
+ je INTRENTRY_LABEL(label) ; \
swapgs ; \
-98: INTR_SAVE_GPRS
+ movq %rax,CPUVAR(SCRATCH) ; \
+ movq CPUVAR(KERN_CR3),%rax ; \
+ testq %rax,%rax ; \
+ jz 98f ; \
+ movq %rax,%cr3 ; \
+ jmp 98f ; \
+ .text ; \
+ .global INTRENTRY_LABEL(label) ; \
+INTRENTRY_LABEL(label): /* from kernel */ \
+ subq $152,%rsp ; \
+ movq %rcx,TF_RCX(%rsp) ; \
+ jmp 99f ; \
+98: /* from userspace */ \
+ movq CPUVAR(KERN_RSP),%rax ; \
+ xchgq %rax,%rsp ; \
+ movq %rcx,TF_RCX(%rsp) ; \
+ /* copy trapno+err to the trap frame */ \
+ movq 0(%rax),%rcx ; \
+ movq %rcx,TF_TRAPNO(%rsp) ; \
+ movq 8(%rax),%rcx ; \
+ movq %rcx,TF_ERR(%rsp) ; \
+ addq $16,%rax ; \
+ /* copy iretq frame to the trap frame */ \
+ movq IRETQ_RIP(%rax),%rcx ; \
+ movq %rcx,TF_RIP(%rsp) ; \
+ movq IRETQ_CS(%rax),%rcx ; \
+ movq %rcx,TF_CS(%rsp) ; \
+ movq IRETQ_RFLAGS(%rax),%rcx ; \
+ movq %rcx,TF_RFLAGS(%rsp) ; \
+ movq IRETQ_RSP(%rax),%rcx ; \
+ movq %rcx,TF_RSP(%rsp) ; \
+ movq IRETQ_SS(%rax),%rcx ; \
+ movq %rcx,TF_SS(%rsp) ; \
+ movq CPUVAR(SCRATCH),%rax ; \
+99: INTR_SAVE_MOST_GPRS_NO_ADJ
+
+/* For faking up an interrupt frame when we're already in the kernel */
+#define INTR_REENTRY \
+ subq $32,%rsp ; \
+ INTR_SAVE_GPRS
#define INTRFASTEXIT \
jmp intr_fast_exit
@@ -49,26 +91,6 @@
movl %cs,%r11d ; \
pushq %r11 ; \
pushq %r13 ;
-
-/*
- * Restore FS.base if it's not already in the CPU, and do the cli/swapgs.
- * Uses %rax, %rcx, and %rdx
- */
-#define INTR_RESTORE_SELECTORS \
- btsl $CPUF_USERSEGS_BIT, CPUVAR(FLAGS) ; \
- jc 99f ; \
- movq CPUVAR(CURPCB),%rdx /* for below */ ; \
- movq PCB_FSBASE(%rdx),%rax ; \
- cmpq $0,%rax ; \
- je 99f /* setting %fs has zeroed FS.base */ ; \
- movq %rax,%rdx ; \
- shrq $32,%rdx ; \
- movl $MSR_FSBASE,%ecx ; \
- wrmsr ; \
-99: movw $(GSEL(GUDATA_SEL, SEL_UPL)),%ax ; \
- cli ; \
- swapgs ; \
- movw %ax,%gs
#define INTR_FAKE_TRAP 0xbadabada
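
To summarize the new entry path: INTRENTRY tests the RPL bits of the saved
CS to see whether the trap came from userspace. If so, it does swapgs,
stashes %rax in the per-CPU scratch slot, switches %cr3 to the full U+K
table (skipped when CPUVAR(KERN_CR3) is zero, i.e. on CPUs that do not need
the separation), swaps onto the kernel-only stack, and copies the trapno/err
words plus the hardware iretq frame from the trampoline stack into the real
trap frame before saving the remaining registers. A userland model of just
the copy step (a sketch, not the entry code; the values are invented):

	#include <stdint.h>
	#include <stdio.h>

	struct trapframe {		/* only the fields the copy touches */
		int64_t	tf_trapno, tf_err;
		int64_t	tf_rip, tf_cs, tf_rflags, tf_rsp, tf_ss;
	};

	int
	main(void)
	{
		/*
		 * Trampoline stack as left by the vector stub and the CPU:
		 * trapno, err, then the hardware iretq frame.
		 */
		int64_t tramp[7] = { 14, 0, 0x401000, 0x2b, 0x202,
		    0x7f7fffffe000, 0x23 };
		struct trapframe tf;

		tf.tf_trapno = tramp[0];	/* movq 0(%rax),%rcx ... */
		tf.tf_err    = tramp[1];	/* movq 8(%rax),%rcx ... */
		tf.tf_rip    = tramp[2];	/* IRETQ_RIP */
		tf.tf_cs     = tramp[3];	/* IRETQ_CS */
		tf.tf_rflags = tramp[4];	/* IRETQ_RFLAGS */
		tf.tf_rsp    = tramp[5];	/* IRETQ_RSP */
		tf.tf_ss     = tramp[6];	/* IRETQ_SS */

		printf("trap %lld, rip %#llx, cs %#llx\n",
		    (long long)tf.tf_trapno, (long long)tf.tf_rip,
		    (long long)tf.tf_cs);
		return 0;
	}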
Index: sys/arch/amd64/include/gdt.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/gdt.h,v
retrieving revision 1.5
diff -u -p -r1.5 gdt.h
--- sys/arch/amd64/include/gdt.h 13 Nov 2010 04:16:42 -0000 1.5
+++ sys/arch/amd64/include/gdt.h 21 Feb 2018 21:33:03 -0000
@@ -31,4 +31,3 @@
*/
void gdt_init_cpu(struct cpu_info *);
-void gdt_alloc_cpu(struct cpu_info *);
Index: sys/arch/amd64/include/pmap.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/pmap.h,v
retrieving revision 1.62
diff -u -p -r1.62 pmap.h
--- sys/arch/amd64/include/pmap.h 8 Feb 2016 18:23:04 -0000 1.62
+++ sys/arch/amd64/include/pmap.h 21 Feb 2018 21:33:03 -0000
@@ -283,8 +283,19 @@ struct pmap {
struct mutex pm_mtx;
struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1 */
LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */
- pd_entry_t *pm_pdir; /* VA of PD (lck by object lock) */
- paddr_t pm_pdirpa; /* PA of PD (read-only after create) */
+ /*
+ * pm_pdir : VA of page table to be used when executing in
+ * privileged mode
+ * pm_pdirpa : PA of page table to be used when executing in
+ * privileged mode
+ * pm_pdir_intel : VA of special page table to be used when executing
+ * on an Intel CPU in usermode (no kernel mappings)
+ * pm_pdirpa_intel : PA of special page table to be used when executing
+ * on an Intel CPU in usermode (no kernel mappings)
+ */
+ pd_entry_t *pm_pdir, *pm_pdir_intel;
+ paddr_t pm_pdirpa, pm_pdirpa_intel;
+
struct vm_page *pm_ptphint[PTP_LEVELS-1];
/* pointer to a PTP in our pmap */
struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */
@@ -378,6 +389,7 @@ paddr_t pmap_prealloc_lowmem_ptps(paddr_
void pagezero(vaddr_t);
int pmap_convert(struct pmap *, int);
+void pmap_enter_special(vaddr_t, paddr_t, vm_prot_t);
/*
* functions for flushing the cache for vaddrs and pages.
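
Each pmap thus carries two page-table roots: the normal one with full kernel
mappings, and the "Intel" one exposing only userspace plus the trampoline
pages. A sketch of the selection this enables (helper and field names here
are illustrative, not the patch's):

	#include <stdint.h>

	struct two_roots {
		uint64_t	kern_cr3;	/* like pm_pdirpa: full U+K */
		uint64_t	user_cr3;	/* like pm_pdirpa_intel: U-K */
	};

	/* returning to userspace: drop the kernel mappings if needed */
	uint64_t
	cr3_for_userspace(const struct two_roots *r, int needs_separation)
	{
		return needs_separation ? r->user_cr3 : r->kern_cr3;
	}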
Index: sys/arch/amd64/include/pte.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/pte.h,v
retrieving revision 1.13
diff -u -p -r1.13 pte.h
--- sys/arch/amd64/include/pte.h 9 Nov 2015 00:49:33 -0000 1.13
+++ sys/arch/amd64/include/pte.h 21 Feb 2018 20:14:19 -0000
@@ -158,6 +158,7 @@ typedef u_int64_t pt_entry_t; /* PTE */
#ifdef _KERNEL
extern pt_entry_t pg_nx; /* NX pte bit */
+extern pt_entry_t pg_g_kern; /* PG_G if glbl mappings can be used in kern */
#endif /* _KERNEL */
#endif /* _MACHINE_PTE_H_ */
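
pg_g_kern exists because PG_G (global) TLB entries survive the %cr3 reloads
the trampoline now performs, which would defeat the unmapping; on CPUs using
the separation, kernel mappings must therefore not be global. A sketch of
the presumed initialization (the deciding flag is an assumption, not the
patch's):

	#include <stdint.h>

	#define PG_G	0x100ULL	/* x86 global-page PTE bit */

	typedef uint64_t pt_entry_t;
	pt_entry_t pg_g_kern;

	/*
	 * Global kernel mappings are only safe when no U-K separation
	 * hides the kernel behind a second page table.
	 */
	void
	pg_g_kern_init(int separation_in_use)
	{
		pg_g_kern = separation_in_use ? 0 : PG_G;
	}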
Index: sys/arch/amd64/include/specialreg.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/specialreg.h,v
retrieving revision 1.61
diff -u -p -r1.61 specialreg.h
--- sys/arch/amd64/include/specialreg.h 12 Aug 2017 19:53:37 -0000 1.61
+++ sys/arch/amd64/include/specialreg.h 22 Feb 2018 20:30:15 -0000
@@ -214,6 +214,10 @@
#define SEFF0ECX_AVX512VBMI 0x00000002 /* AVX-512 vector bit inst */
#define SEFF0ECX_UMIP 0x00000004 /* UMIP support */
#define SEFF0ECX_PKU 0x00000008 /* Page prot keys for user mode */
+/* SEFF EDX bits */
+#define SEFF0EDX_IBRS 0x04000000 /* IBRS / IBPB Speculation Control */
+#define SEFF0EDX_STIBP 0x08000000 /* STIBP Speculation Control */
+#define SEFF0EDX_ARCH_CAP 0x20000000 /* Has IA32_ARCH_CAPABILITIES MSR */
/*
* Thermal and Power Management (CPUID function 0x6) EAX bits
@@ -285,9 +289,13 @@
* "Advanced Power Management Information" bits (CPUID function 0x80000007):
* EDX bits.
*/
-
#define CPUIDEDX_ITSC (1 << 8) /* Invariant TSC */
+/*
+ * AMD CPUID function 0x80000008 EBX bits
+ */
+#define CPUIDEBX_IBPB (1ULL << 12) /* Speculation Control IBPB */
+
#define CPUID2FAMILY(cpuid) (((cpuid) >> 8) & 15)
#define CPUID2MODEL(cpuid) (((cpuid) >> 4) & 15)
#define CPUID2STEPPING(cpuid) ((cpuid) & 15)
@@ -319,6 +327,11 @@
#define MSR_EBC_FREQUENCY_ID 0x02c /* Pentium 4 only */
#define MSR_TEST_CTL 0x033
#define MSR_IA32_FEATURE_CONTROL 0x03a
+#define MSR_SPEC_CTRL 0x048 /* Speculation Control IBRS / STIBP */
+#define SPEC_CTRL_IBRS (1ULL << 0)
+#define SPEC_CTRL_STIBP (1ULL << 1)
+#define MSR_PRED_CMD 0x049 /* Speculation Control IBPB */
+#define PRED_CMD_IBPB (1ULL << 0)
#define MSR_BIOS_UPDT_TRIG 0x079
#define MSR_BBL_CR_D0 0x088 /* PII+ only */
#define MSR_BBL_CR_D1 0x089 /* PII+ only */
@@ -331,6 +344,8 @@
#define MTRRcap_FIXED 0x100 /* bit 8 - fixed MTRRs supported */
#define MTRRcap_WC 0x400 /* bit 10 - WC type supported */
#define MTRRcap_SMRR 0x800 /* bit 11 - SMM range reg supported */
+#define MSR_ARCH_CAPABILITIES 0x10a
+#define ARCH_CAPABILITIES_RDCL_NO (1 << 0) /* Meltdown safe */
#define MSR_BBL_CR_ADDR 0x116 /* PII+ only */
#define MSR_BBL_CR_DECC 0x118 /* PII+ only */
#define MSR_BBL_CR_CTL 0x119 /* PII+ only */
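
These additions let the kernel ask the hardware whether the mitigation is
needed at all: a CPU that advertises the IA32_ARCH_CAPABILITIES MSR via
CPUID leaf 7 EDX and sets RDCL_NO in that MSR is documented as not
vulnerable to Meltdown. A sketch of that check (the cpuid/rdmsr wrappers
here are assumed, not the patch's):

	#include <stdint.h>

	#define SEFF0EDX_ARCH_CAP		0x20000000 /* CPUID.7.0:EDX */
	#define MSR_ARCH_CAPABILITIES		0x10a
	#define ARCH_CAPABILITIES_RDCL_NO	(1 << 0)   /* Meltdown safe */

	uint32_t cpuid_seff_edx(void);	/* assumed: returns CPUID.7.0:EDX */
	uint64_t rdmsr_wrap(uint32_t);	/* assumed: reads the given MSR */

	int
	cpu_needs_meltdown_workaround(int is_intel)
	{
		if (!is_intel)
			return 0;	/* the workaround targets Intel CPUs */
		if ((cpuid_seff_edx() & SEFF0EDX_ARCH_CAP) &&
		    (rdmsr_wrap(MSR_ARCH_CAPABILITIES) &
		    ARCH_CAPABILITIES_RDCL_NO))
			return 0;	/* hardware attests: not vulnerable */
		return 1;
	}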
Index: distrib/sets/lists/comp/md.amd64
===================================================================
RCS file: /cvs/src/distrib/sets/lists/comp/md.amd64,v
retrieving revision 1.108
diff -u -p -r1.108 md.amd64
--- distrib/sets/lists/comp/md.amd64 20 Aug 2017 14:53:38 -0000 1.108
+++ distrib/sets/lists/comp/md.amd64 26 Feb 2018 13:03:54 -0000
@@ -16,6 +16,7 @@
./usr/include/amd64/codepatch.h
./usr/include/amd64/conf.h
./usr/include/amd64/cpu.h
+./usr/include/amd64/cpu_full.h
./usr/include/amd64/cpufunc.h
./usr/include/amd64/cpuvar.h
./usr/include/amd64/db_machdep.h