untrusted comment: signature from openbsd 6.2 base secret key
RWRVWzAMgtyg7rvxuplc/tv0WeP9WjrnZZl9FiKkuS5ma3iKXBpQdO3Ekovh3YHSivUuV1G6ugrfiGTKfWXrQdNdq1wHHFs4qgc=

OpenBSD 6.2 errata 009, March 1st, 2018:

Intel CPUs contain a speculative execution flaw called Meltdown which
allows userspace programs to access kernel memory.  A complex workaround
solves the problem.
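
In outline, the workaround keeps two page tables per process on affected
CPUs: the normal U+K table containing both user and kernel mappings, and
a stripped U-K table containing only user mappings plus a handful of
trampoline pages.  A minimal C sketch of the switching policy, using the
cpu_info fields this patch adds (ci_kern_cr3/ci_user_cr3) and lcr3()
from <machine/cpufunc.h>; this is illustrative only, the real switches
happen in the assembly trampolines in the locore.S and vector.S hunks
below:

    /*
     * Illustrative sketch, not part of the patch.  On CPUs without
     * Meltdown both fields stay 0 and no page-table switch happens.
     */
    static inline void
    meltdown_enter_kernel(struct cpu_info *ci)
    {
            if (ci->ci_kern_cr3 != 0)
                    lcr3(ci->ci_kern_cr3);  /* U-K -> U+K page tables */
    }

    static inline void
    meltdown_return_to_user(struct cpu_info *ci)
    {
            if (ci->ci_user_cr3 != 0)
                    lcr3(ci->ci_user_cr3);  /* U+K -> U-K page tables */
    }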

Apply by doing:
   signify -Vep /etc/signify/openbsd-62-base.pub -x 009_meltdown.patch.sig \
       -m - | (cd /usr/src && patch -p0)

And then rebuild and install a new kernel:
   KK=`sysctl -n kern.osversion | cut -d# -f1`
   cd /usr/src/sys/arch/`machine`/compile/$KK
   make obj
   make config
   make
   make install
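
After rebooting into the new kernel, the patched identifycpu() (see the
identcpu.c hunk below) appends a MELTDOWN flag to the CPU feature line
on affected processors, which can be used to confirm the workaround is
active:
   dmesg | grep MELTDOWN
On an affected Intel CPU this prints the cpu0 feature line ending in
",MELTDOWN"; on unaffected CPUs it prints nothing.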

Index: sys/arch/amd64/amd64/cpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.107
diff -u -p -r1.107 cpu.c
--- sys/arch/amd64/amd64/cpu.c  25 Aug 2017 19:28:48 -0000      1.107
+++ sys/arch/amd64/amd64/cpu.c  21 Feb 2018 21:41:06 -0000
@@ -81,7 +81,7 @@
#include <uvm/uvm_extern.h>

#include <machine/codepatch.h>
-#include <machine/cpu.h>
+#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/pmap.h>
@@ -117,6 +117,14 @@
#include <machine/hibernate.h>
#endif /* HIBERNATE */

+/* #define CPU_DEBUG */
+
+#ifdef CPU_DEBUG
+#define DPRINTF(x...)  do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* CPU_DEBUG */
+
int     cpu_match(struct device *, void *, void *);
void    cpu_attach(struct device *, struct device *, void *);
int     cpu_activate(struct device *, int);
@@ -173,7 +181,7 @@ struct cfdriver cpu_cd = {
 * CPU, on uniprocessors).  The CPU info list is initialized to
 * point at it.
 */
-struct cpu_info cpu_info_primary = { 0, &cpu_info_primary };
+struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } };

struct cpu_info *cpu_info_list = &cpu_info_primary;

@@ -339,8 +347,15 @@ cpu_attach(struct device *parent, struct
        * structure, otherwise use the primary's.
        */
       if (caa->cpu_role == CPU_ROLE_AP) {
-               ci = malloc(sizeof(*ci), M_DEVBUF, M_WAITOK|M_ZERO);
+               struct cpu_info_full *cif;
+
+               cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok);
+               ci = &cif->cif_cpu;
#if defined(MULTIPROCESSOR)
+               ci->ci_tss = &cif->cif_tss;
+               ci->ci_gdt = (void *)(ci->ci_tss + 1);
+               memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
+               cpu_enter_pages(cif);
               if (cpu_info[cpunum] != NULL)
                       panic("cpu at apic id %d already attached?", cpunum);
               cpu_info[cpunum] = ci;
@@ -446,7 +461,6 @@ cpu_attach(struct device *parent, struct

#if defined(MULTIPROCESSOR)
               cpu_intr_init(ci);
-               gdt_alloc_cpu(ci);
               sched_init_cpu(ci);
               cpu_start_secondary(ci);
               ncpus++;
@@ -931,4 +945,63 @@ cpu_activate(struct device *self, int ac
       }

       return (0);
+}
+
+/*
+ * cpu_enter_pages
+ *
+ * Requests mapping of various special pages required in the Intel Meltdown
+ * case (to be entered into the U-K page table):
+ *
+ *  1 tss+gdt page for each CPU
+ *  1 trampoline stack page for each CPU
+ *
+ * The cpu_info_full struct for each CPU straddles these pages. The offset into
+ * 'cif' is calculated below, for each page. For more information, consult
+ * the definition of struct cpu_info_full in cpu_full.h
+ *
+ * On CPUs unaffected by Meltdown, this function still configures 'cif' but
+ * the calls to pmap_enter_special become no-ops.
+ *
+ * Parameters:
+ *  cif : the cpu_info_full structure describing a CPU whose pages are to be
+ *    entered into the special meltdown U-K page table.
+ */
+void
+cpu_enter_pages(struct cpu_info_full *cif)
+{
+       vaddr_t va;
+       paddr_t pa;
+
+       /* The TSS+GDT need to be readable */
+       va = (vaddr_t)cif;
+       pmap_extract(pmap_kernel(), va, &pa);
+       pmap_enter_special(va, pa, PROT_READ);
+       DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__,
+          (uint64_t)va, (uint64_t)pa);
+
+       /* The trampoline stack page needs to be read/write */
+       va = (vaddr_t)&cif->cif_tramp_stack;
+       pmap_extract(pmap_kernel(), va, &pa);
+       pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
+       DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__,
+          (uint64_t)va, (uint64_t)pa);
+
+       cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16;
+       DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n" ,__func__,
+           (uint64_t)cif->cif_tss.tss_rsp0);
+       cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 -
+           sizeof(struct iretq_frame);
+
+#define        SETUP_IST_SPECIAL_STACK(ist, cif, member) do {                  \
+       (cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member +       \
+           sizeof((cif)->member) - 16;                                 \
+       (cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \
+} while (0)
+
+       SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack);
+       SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack);
+
+       /* an empty iomap, by setting its offset to the TSS limit */
+       cif->cif_tss.tss_iobase = sizeof(cif->cif_tss);
}
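
For clarity, this is what the SETUP_IST_SPECIAL_STACK() macro added
above expands to in the NMI case (hand-expanded here for illustration):

    /* SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack) becomes: */
    cif->cif_tss.tss_ist[1] = (vaddr_t)&cif->cif_nmi_stack +
        sizeof(cif->cif_nmi_stack) - 16;
    cif->cif_nmi_stack[nitems(cif->cif_nmi_stack) - 2] =
        (int64_t)&cif->cif_cpu;

tss_ist[1] is hardware IST2, which the machdep.c hunk below assigns to
the NMI gate.  The second statement stashes the CPU's cpu_info pointer
(its kernel GS.base value) in the word just above the stack top, where
the NMI entry code in vector.S retrieves it as FRAMESIZE(%rsp).
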
Index: sys/arch/amd64/amd64/gdt.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/gdt.c,v
retrieving revision 1.24
diff -u -p -r1.24 gdt.c
--- sys/arch/amd64/amd64/gdt.c  24 May 2015 01:01:49 -0000      1.24
+++ sys/arch/amd64/amd64/gdt.c  21 Feb 2018 21:33:03 -0000
@@ -45,33 +45,6 @@
#include <machine/pcb.h>

/*
- * Allocate shadow GDT for a slave cpu.
- */
-void
-gdt_alloc_cpu(struct cpu_info *ci)
-{
-       struct vm_page *pg;
-       vaddr_t va;
-
-       ci->ci_gdt = (char *)uvm_km_valloc(kernel_map,
-           GDT_SIZE + sizeof(*ci->ci_tss));
-       ci->ci_tss = (void *)(ci->ci_gdt + GDT_SIZE);
-       uvm_map_pageable(kernel_map, (vaddr_t)ci->ci_gdt,
-            (vaddr_t)ci->ci_gdt + GDT_SIZE, FALSE, FALSE);
-       for (va = (vaddr_t)ci->ci_gdt;
-           va < (vaddr_t)ci->ci_gdt + GDT_SIZE + sizeof(*ci->ci_tss);
-           va += PAGE_SIZE) {
-               pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
-               if (pg == NULL)
-                       panic("gdt_init: no pages");
-               pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE);
-       }
-       memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
-       bzero(ci->ci_tss, sizeof(*ci->ci_tss));
-}
-
-
-/*
 * Load appropriate gdt descriptor; we better be running on *ci
 * (for the most part, this is how a cpu knows who it is).
 */
Index: sys/arch/amd64/amd64/genassym.cf
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/genassym.cf,v
retrieving revision 1.31
diff -u -p -r1.31 genassym.cf
--- sys/arch/amd64/amd64/genassym.cf    18 May 2015 19:59:27 -0000      1.31
+++ sys/arch/amd64/amd64/genassym.cf    21 Feb 2018 21:33:03 -0000
@@ -78,6 +78,15 @@ member       tf_ss

define  FRAMESIZE               sizeof(struct trapframe)

+struct iretq_frame
+member IRETQ_CS        iretq_cs
+member IRETQ_RIP       iretq_rip
+member IRETQ_RFLAGS    iretq_rflags
+member IRETQ_RSP       iretq_rsp
+member IRETQ_SS        iretq_ss
+
+define IRETQ_SIZE              sizeof(struct iretq_frame)
+
struct pcb
member pcb_cr3
member pcb_rsp
@@ -91,6 +100,8 @@ member       pcb_cr0

struct pmap
member pm_cpus
+member pm_pdirpa
+member pm_pdirpa_intel

struct x86_64_tss
member tss_rsp0
@@ -115,6 +126,10 @@ endif
member CPU_INFO_GDT            ci_gdt
member CPU_INFO_TSS            ci_tss
member CPU_INFO_FLAGS          ci_flags
+member CPU_INFO_KERN_CR3       ci_kern_cr3
+member CPU_INFO_USER_CR3       ci_user_cr3
+member CPU_INFO_KERN_RSP       ci_kern_rsp
+member CPU_INFO_INTR_RSP       ci_intr_rsp

export CPUF_USERSEGS_BIT

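The iretq_frame members exported above describe the hardware iretq
frame.  The struct itself is added by this patch in a header not
included in this excerpt; given the architectural frame layout it
presumably reads:

    struct iretq_frame {
            int64_t iretq_rip;
            int64_t iretq_cs;
            int64_t iretq_rflags;
            int64_t iretq_rsp;
            int64_t iretq_ss;
    };

This is the frame built on the per-CPU trampoline stack on the way back
to userspace; cpu_enter_pages() above sets ci_intr_rsp exactly one
iretq_frame below the trampoline stack top.
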
Index: sys/arch/amd64/amd64/identcpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v
retrieving revision 1.87
diff -u -p -r1.87 identcpu.c
--- sys/arch/amd64/amd64/identcpu.c     20 Jun 2017 05:34:41 -0000      1.87
+++ sys/arch/amd64/amd64/identcpu.c     21 Feb 2018 21:33:03 -0000
@@ -204,6 +204,10 @@ const struct {
       { SEFF0ECX_AVX512VBMI,  "AVX512VBMI" },
       { SEFF0ECX_UMIP,        "UMIP" },
       { SEFF0ECX_PKU,         "PKU" },
+}, cpu_seff0_edxfeatures[] = {
+       { SEFF0EDX_IBRS,        "IBRS,IBPB" },
+       { SEFF0EDX_STIBP,       "STIBP" },
+        /* SEFF0EDX_ARCH_CAP (not printed) */
}, cpu_tpm_eaxfeatures[] = {
       { TPM_SENSOR,           "SENSOR" },
       { TPM_ARAT,             "ARAT" },
@@ -211,6 +215,8 @@ const struct {
       { CPUIDEAX_VERID,       "PERF" },
}, cpu_cpuid_apmi_edx[] = {
       { CPUIDEDX_ITSC,        "ITSC" },
+}, cpu_amdspec_ebxfeatures[] = {
+       { CPUIDEBX_IBPB,        "IBPB" },
};

int
@@ -489,6 +495,7 @@ identifycpu(struct cpu_info *ci)
       int i;
       char *brandstr_from, *brandstr_to;
       int skipspace;
+       extern uint32_t cpu_meltdown;

       CPUID(1, ci->ci_signature, val, dummy, ci->ci_feature_flags);
       CPUID(0x80000000, ci->ci_pnfeatset, dummy, dummy, dummy);
@@ -607,7 +614,7 @@ identifycpu(struct cpu_info *ci)
       if (cpuid_level >= 0x07) {
               /* "Structured Extended Feature Flags" */
               CPUID_LEAF(0x7, 0, dummy, ci->ci_feature_sefflags_ebx,
-                   ci->ci_feature_sefflags_ecx, dummy);
+                   ci->ci_feature_sefflags_ecx, ci->ci_feature_sefflags_edx);
               for (i = 0; i < nitems(cpu_seff0_ebxfeatures); i++)
                       if (ci->ci_feature_sefflags_ebx &
                           cpu_seff0_ebxfeatures[i].bit)
@@ -616,6 +623,10 @@ identifycpu(struct cpu_info *ci)
                       if (ci->ci_feature_sefflags_ecx &
                           cpu_seff0_ecxfeatures[i].bit)
                               printf(",%s", cpu_seff0_ecxfeatures[i].str);
+               for (i = 0; i < nitems(cpu_seff0_edxfeatures); i++)
+                       if (ci->ci_feature_sefflags_edx &
+                           cpu_seff0_edxfeatures[i].bit)
+                               printf(",%s", cpu_seff0_edxfeatures[i].str);
       }

       if (!strcmp(cpu_vendor, "GenuineIntel") && cpuid_level >= 0x06) {
@@ -628,6 +639,22 @@ identifycpu(struct cpu_info *ci)
               if (ci->ci_family >= 0x12)
                       ci->ci_feature_tpmflags |= TPM_ARAT;
       }
+
+       /* AMD speculation control features */
+       if (!strcmp(cpu_vendor, "AuthenticAMD")) {
+               if (ci->ci_pnfeatset >= 0x80000008) {
+                       CPUID(0x80000008, dummy, ci->ci_feature_amdspec_ebx,
+                           dummy, dummy);
+                       for (i = 0; i < nitems(cpu_amdspec_ebxfeatures); i++)
+                               if (ci->ci_feature_amdspec_ebx &
+                                   cpu_amdspec_ebxfeatures[i].bit)
+                                       printf(",%s",
+                                           cpu_amdspec_ebxfeatures[i].str);
+               }
+       }
+
+       if (cpu_meltdown)
+               printf(",MELTDOWN");

       printf("\n");

Index: sys/arch/amd64/amd64/lapic.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.48
diff -u -p -r1.48 lapic.c
--- sys/arch/amd64/amd64/lapic.c        24 Jul 2017 15:31:14 -0000      1.48
+++ sys/arch/amd64/amd64/lapic.c        21 Feb 2018 21:41:06 -0000
@@ -62,6 +62,14 @@
#include <machine/i82093var.h>
#endif

+/* #define LAPIC_DEBUG */
+
+#ifdef LAPIC_DEBUG
+#define DPRINTF(x...)  do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* LAPIC_DEBUG */
+
struct evcount clk_count;
#ifdef MULTIPROCESSOR
struct evcount ipi_count;
@@ -204,6 +212,7 @@ lapic_map(paddr_t lapic_base)
               codepatch_call(CPTAG_EOI, &x2apic_eoi);

               lapic_writereg(LAPIC_TPRI, s);
+               va = (vaddr_t)&local_apic;
       } else {
               /*
                * Map local apic.  If we have a local apic, it's safe to
@@ -222,6 +231,17 @@ lapic_map(paddr_t lapic_base)

               lapic_tpr = s;
       }
+
+       /*
+        * Enter the LAPIC MMIO page in the U-K page table for handling
+        * Meltdown (needed in the interrupt stub to acknowledge the
+        * incoming interrupt). On CPUs unaffected by Meltdown,
+        * pmap_enter_special is a no-op.
+        * XXX - need to map this PG_N
+        */
+       pmap_enter_special(va, lapic_base, PROT_READ | PROT_WRITE);
+       DPRINTF("%s: entered lapic page va 0x%llx pa 0x%llx\n", __func__,
+           (uint64_t)va, (uint64_t)lapic_base);

       enable_intr();
}
Index: sys/arch/amd64/amd64/locore.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/locore.S,v
retrieving revision 1.89
diff -u -p -r1.89 locore.S
--- sys/arch/amd64/amd64/locore.S       4 Oct 2017 02:10:33 -0000       1.89
+++ sys/arch/amd64/amd64/locore.S       21 Feb 2018 21:33:03 -0000
@@ -113,6 +113,7 @@
#include <sys/syscall.h>

#include <machine/param.h>
+#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
@@ -176,6 +177,8 @@ _C_LABEL(lapic_isr):
       .globl  _C_LABEL(biosbasemem),_C_LABEL(biosextmem)
       .globl  _C_LABEL(bootapiver)
       .globl  _C_LABEL(pg_nx)
+       .globl  _C_LABEL(pg_g_kern)
+       .globl  _C_LABEL(cpu_meltdown)
_C_LABEL(cpu_id):      .long   0       # saved from `cpuid' instruction
_C_LABEL(cpu_feature): .long   0       # feature flags from 'cpuid'
                                       #   instruction
@@ -208,6 +211,10 @@ _C_LABEL(biosextmem):      .long   0       # extended
_C_LABEL(biosextmem):  .long   REALEXTMEM
#endif
_C_LABEL(pg_nx):       .quad   0       # NX PTE bit (if CPU supports)
+_C_LABEL(pg_g_kern):   .quad   0       # 0x100 if global pages should be used
+                                       # in kernel mappings, 0 otherwise (for
+                                       # insecure CPUs)
+_C_LABEL(cpu_meltdown):        .long   0       # 1 if this CPU has Meltdown

#define        _RELOC(x)       ((x) - KERNBASE)
#define        RELOC(x)        _RELOC(_C_LABEL(x))
@@ -233,7 +240,7 @@ gdt64_end:
/*****************************************************************************/

/*
- * Signal trampoline; copied to top of user stack.
+ * Signal trampoline; copied to a page mapped into userspace.
 * gdb's backtrace logic matches against the instructions in this.
 */
       .section .rodata
@@ -370,11 +377,15 @@ switch_exited:
       btrl    $CPUF_USERSEGS_BIT, CPUVAR(FLAGS)
       jnc     restore_saved

-       /* set %ds, %es, and %fs to expected value to prevent info leak */
+       /* set %ds, %es, %fs, and %gs to expected value to prevent info leak */
       movw    $(GSEL(GUDATA_SEL, SEL_UPL)),%ax
       movw    %ax,%ds
       movw    %ax,%es
       movw    %ax,%fs
+       cli                     /* block interrupts when on user GS.base */
+       swapgs                  /* switch from kernel to user GS.base */
+       movw    %ax,%gs         /* set %gs to UDATA and GS.base to 0 */
+       swapgs                  /* back to kernel GS.base */

restore_saved:
       /*
@@ -394,20 +405,34 @@ restore_saved:
       movq    PCB_RSP(%r13),%rsp
       movq    PCB_RBP(%r13),%rbp

-       movq    CPUVAR(TSS),%rcx
-       movq    PCB_KSTACK(%r13),%rdx
-       movq    %rdx,TSS_RSP0(%rcx)
-
       movq    PCB_CR3(%r13),%rax
-       movq    %rax,%cr3
+       movq    %rax,%cr3                       /* %rax used below too */

       /* Don't bother with the rest if switching to a system process. */
       testl   $P_SYSTEM,P_FLAG(%r12)
       jnz     switch_restored

+       /* record the bits needed for future U-->K transition */
+       movq    PCB_KSTACK(%r13),%rdx
+       subq    $FRAMESIZE,%rdx
+       movq    %rdx,CPUVAR(KERN_RSP)
+       movq    PCB_PMAP(%r13),%rcx
+
+       /*
+        * Meltdown: iff we're doing separate U+K and U-K page tables,
+        * then record them in cpu_info for easy access in syscall and
+        * interrupt trampolines.  XXX code patch this
+        */
+
+       movq    PM_PDIRPA_INTEL(%rcx),%rdx
+       testq   %rdx,%rdx
+       jz      0f                      /* yay, no intel suckiness */
+       movq    %rax,CPUVAR(KERN_CR3)
+       movq    %rdx,CPUVAR(USER_CR3)
+0:
+
       /* set the new pmap's bit for the cpu */
       movl    CPUVAR(CPUID),%edi
-       movq    PCB_PMAP(%r13),%rcx
       lock
       btsq    %rdi,PM_CPUS(%rcx)
#ifdef DIAGNOSTIC
@@ -496,8 +521,7 @@ IDTVEC(syscall32)
       sysret          /* go away please */

/*
- * syscall insn entry. This currently isn't much faster, but
- * it can be made faster in the future.
+ * syscall insn entry.
 */
IDTVEC(syscall)
       /*
@@ -507,13 +531,20 @@ IDTVEC(syscall)
        * the user-space value.
        * First order of business is to swap to the kernel gs.base so that
        * we can access our struct cpu_info and use the scratch space there
-        * to switch to our kernel stack.  Once that's in place we can
+        * to switch to the kernel page tables (thank you, Intel), then
+        * switch to our kernel stack.  Once that's in place we can
        * unblock interrupts and save the rest of the syscall frame.
        */
       swapgs
       movq    %r15,CPUVAR(SCRATCH)
-       movq    CPUVAR(CURPCB),%r15
-       movq    PCB_KSTACK(%r15),%r15
+       movq    CPUVAR(KERN_CR3),%r15
+       testq   %r15,%r15
+       jz      Xsyscall_untramp
+       movq    %r15,%cr3
+       jmp     Xsyscall_untramp
+
+NENTRY(Xsyscall_untramp)
+       movq    CPUVAR(KERN_RSP),%r15
       xchgq   %r15,%rsp
       sti

@@ -524,12 +555,11 @@ IDTVEC(syscall)
        * ss:rsp, etc, so that all GP registers can be
        * saved. Then, fill in the rest.
        */
-       pushq   $(GSEL(GUDATA_SEL, SEL_UPL))
-       pushq   %r15
-       subq    $(TF_RSP-TF_TRAPNO),%rsp
+       movq    $(GSEL(GUDATA_SEL, SEL_UPL)),TF_SS(%rsp)
+       movq    %r15,TF_RSP(%rsp)
       movq    CPUVAR(SCRATCH),%r15
-       subq    $32,%rsp
-       INTR_SAVE_GPRS
+       INTR_SAVE_MOST_GPRS_NO_ADJ
+       movq    %rcx,TF_RCX(%rsp)
       movq    %r11, TF_RFLAGS(%rsp)   /* old rflags from syscall insn */
       movq    $(GSEL(GUCODE_SEL, SEL_UPL)), TF_CS(%rsp)
       movq    %rcx,TF_RIP(%rsp)
@@ -574,16 +604,45 @@ IDTVEC(syscall)
       movq    TF_RBP(%rsp),%rbp
       movq    TF_RBX(%rsp),%rbx

-       INTR_RESTORE_SELECTORS
+       /* Restore FS.base if it's not already in the CPU */
+       btsl    $CPUF_USERSEGS_BIT,CPUVAR(FLAGS)
+       jc      99f
+       movq    CPUVAR(CURPCB),%rdx
+       movq    PCB_FSBASE(%rdx),%rax
+       movq    %rax,%rdx
+       shrq    $32,%rdx
+       movl    $MSR_FSBASE,%ecx
+       wrmsr
+99:

+       /*
+        * We need to finish reading from the trapframe, then switch
+        * to the user page tables, swapgs, and return.  We need
+        * to get the final value for the register that was used
+        * for the mov to %cr3 from somewhere accessible on the
+        * user page tables, so save it in CPUVAR(SCRATCH) across
+        * the switch.
+        */
       movq    TF_RDX(%rsp),%rdx
       movq    TF_RAX(%rsp),%rax
+       movq    %rax,CPUVAR(SCRATCH)
+       movq    CPUVAR(USER_CR3),%rax

       movq    TF_RIP(%rsp),%rcx
       movq    TF_RFLAGS(%rsp),%r11
       movq    TF_RSP(%rsp),%rsp
+       testq   %rax,%rax
+       jz      1f
+       jmp     syscall_trampback
+
+KUENTRY(syscall_trampback)
+       movq    %rax,%cr3
+1:     movq    CPUVAR(SCRATCH),%rax
+       swapgs
       sysretq

+       .text
+
#ifdef DIAGNOSTIC
.Lsyscall_spl_not_lowered:
       movabsq $spl_lowered, %rdi
@@ -620,6 +679,12 @@ NENTRY(proc_trampoline)
 * Return via iretq, for real interrupts and signal returns
 */
NENTRY(intr_fast_exit)
+#ifdef DIAGNOSTIC
+       pushfq
+       popq    %rdx
+       testq   $PSL_I,%rdx
+       jnz     .Lintr_exit_not_blocked
+#endif /* DIAGNOSTIC */
       movq    TF_RDI(%rsp),%rdi
       movq    TF_RSI(%rsp),%rsi
       movq    TF_R8(%rsp),%r8
@@ -633,11 +698,68 @@ NENTRY(intr_fast_exit)
       movq    TF_RBX(%rsp),%rbx

       testq   $SEL_RPL,TF_CS(%rsp)
-       je      5f
+       je      intr_exit_recurse               /* returning back to kernel? */

-       INTR_RESTORE_SELECTORS
+       /* returning to userspace.  XXX fix up iret frame here */

-5:     movq    TF_RDX(%rsp),%rdx
+       /* restore FS.base if it's not already in the CPU */
+       btsl    $CPUF_USERSEGS_BIT,CPUVAR(FLAGS)
+       jc      99f
+       movq    CPUVAR(CURPCB),%rdx             /* for below */
+       movq    PCB_FSBASE(%rdx),%rax
+       movq    %rax,%rdx
+       shrq    $32,%rdx
+       movl    $MSR_FSBASE,%ecx
+       wrmsr
+99:
+       /*
+        * Returning to userspace.  We need to do things in this order:
+        *  - update the iret frame from the trapframe
+        *  - finish reading from the trapframe
+        *  - switch to the trampoline stack
+        *  - jump to the .kutext segment
+        *  - switch to the user page tables
+        *  - swapgs
+        *  - iretq
+        * To get the final value for the register that was used
+        * for the mov to %cr3, it must be read from somewhere that
+        * remains mapped in the user page tables, so we save it in
+        * CPUVAR(SCRATCH) across the switch.
+        */
+       /* update iret frame */
+       movq    CPUVAR(INTR_RSP),%rdx
+       movq    $(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx)
+       movq    TF_RIP(%rsp),%rax
+       movq    %rax,IRETQ_RIP(%rdx)
+       movq    TF_RFLAGS(%rsp),%rax
+       movq    %rax,IRETQ_RFLAGS(%rdx)
+       movq    TF_RSP(%rsp),%rax
+       movq    %rax,IRETQ_RSP(%rdx)
+       movq    $(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx)
+       /* finish with the trap frame */
+       movq    TF_RAX(%rsp),%rax
+       movq    %rax,CPUVAR(SCRATCH)
+       movq    TF_RCX(%rsp),%rcx
+       movq    TF_R11(%rsp),%r11
+       /* switch to the trampoline stack */
+       xchgq   %rdx,%rsp
+       movq    TF_RDX(%rdx),%rdx
+       movq    CPUVAR(USER_CR3),%rax
+       testq   %rax,%rax
+       jz      1f
+       jmp     iretq_tramp
+
+KUENTRY(iretq_tramp)
+       movq    %rax,%cr3
+1:     movq    CPUVAR(SCRATCH),%rax
+       swapgs
+
+       .globl  _C_LABEL(doreti_iret)
+_C_LABEL(doreti_iret):
+       iretq
+
+NENTRY(intr_exit_recurse)
+       movq    TF_RDX(%rsp),%rdx
       movq    TF_RCX(%rsp),%rcx
       movq    TF_R11(%rsp),%r11
       movq    TF_RAX(%rsp),%rax
@@ -655,9 +777,6 @@ NENTRY(intr_fast_exit)
#endif /* !defined(GPROF) && defined(DDBPROF) */

       addq    $TF_RIP,%rsp
-
-       .globl  _C_LABEL(doreti_iret)
-_C_LABEL(doreti_iret):
       iretq


@@ -690,6 +809,33 @@ _C_LABEL(doreti_iret):
       addq    $TF_RIP,%rsp
       iretq
#endif /* !defined(GPROF) && defined(DDBPROF) */
+       .text
+
+#ifdef DIAGNOSTIC
+.Lintr_exit_not_blocked:
+       xchgw   %bx, %bx
+       movl    warn_once(%rip),%edi
+       testl   %edi,%edi
+       jnz     1f
+       incl    %edi
+       movl    %edi,warn_once(%rip)
+       leaq    .Lnot_blocked(%rip),%rdi
+       call    _C_LABEL(printf)
+#ifdef DDB
+       int     $3
+#endif /* DDB */
+1:     cli
+       jmp     intr_fast_exit
+
+       .data
+.global warn_once
+warn_once:
+       .long   0
+       .section .rodata
+.Lnot_blocked:
+       .asciz  "WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n"
+       .text
+#endif

ENTRY(xrstor_user)
       movq    %rsi, %rdx
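
The KUENTRY() entry points introduced above (syscall_trampback,
iretq_tramp) execute while the U-K page table may be active, so they
must live in pages mapped into both page tables.  KUENTRY() evidently
places them in the .kutext section, which enter_shared_special_pages()
(machdep.c hunk below) enters into the U-K table.  A rough sketch of
the macros, whose real definitions live in an asm.h change not shown
in this excerpt:

    /* hypothetical sketch of the section-placement macros */
    #define KUTEXT          .section .kutext, "ax"  /* user+kernel text */
    #define KUENTRY(x)      KUTEXT; .align 16; .globl x; \
                            .type x,@function; x:

KIDTVEC(), seen in the spl.S and vector.S hunks below, is the
counterpart for entry points only ever reached from kernel context,
which can stay in the regular .text section.
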
Index: sys/arch/amd64/amd64/locore0.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/locore0.S,v
retrieving revision 1.2
diff -u -p -r1.2 locore0.S
--- sys/arch/amd64/amd64/locore0.S      6 Jul 2017 06:17:04 -0000       1.2
+++ sys/arch/amd64/amd64/locore0.S      21 Feb 2018 21:33:03 -0000
@@ -216,6 +216,48 @@ bi_size_ok:
       movl    %ecx,8(%ebp)
       movl    $0,  12(%ebp)

+       /*
+        * Determine if CPU has meltdown. Certain Intel CPUs do not properly
+        * respect page permissions when speculatively loading data into
+        * the cache ("Meltdown" CVE). These CPUs must utilize a secondary
+        * sanitized page table lacking kernel mappings when executing user
+        * processes, and may not use PG_G global PTEs for kernel VAs.
+        */
+       movl    $0x1, RELOC(cpu_meltdown)       /* assume insecure at first */
+       movl    $0x0, RELOC(pg_g_kern)
+
+       cmpl    $0x756e6547, %ebx       # "Genu"
+       jne     .Lcpu_secure
+       cmpl    $0x6c65746e, %ecx       # "ntel"
+       jne     .Lcpu_secure
+       cmpl    $0x49656e69, %edx       # "ineI"
+       jne     .Lcpu_secure
+
+       /*
+        * Intel CPU, now check if IA32_ARCH_CAPABILITIES is supported and
+        * if it says this CPU is safe.
+        */
+       movl    $0x0,   %eax
+       cpuid
+       cmpl    $0x7,   %eax
+       jl      .Lcpu_check_finished
+
+       movl    $0x7,   %eax
+       cpuid
+       testl   $SEFF0EDX_ARCH_CAP, %edx
+       jz      .Lcpu_check_finished
+
+       /* IA32_ARCH_CAPABILITIES MSR available, use it to check CPU security */
+       movl    $MSR_ARCH_CAPABILITIES, %ecx
+       rdmsr
+       testl   $ARCH_CAPABILITIES_RDCL_NO, %eax
+       jz      .Lcpu_check_finished
+
+.Lcpu_secure:
+       movl    $0x0, RELOC(cpu_meltdown)
+       movl    $PG_G, RELOC(pg_g_kern)
+
+.Lcpu_check_finished:
       movl    $1,%eax
       cpuid
       movl    %eax,RELOC(cpu_id)
@@ -482,7 +524,8 @@ map_tables:
       leal    (PROC0_DMP2_OFF)(%esi), %ebx
       xorl    %eax, %eax
       movl    $(NDML2_ENTRIES * NPDPG), %ecx
-1:     orl     $(PG_V|PG_KW|PG_PS|PG_G), %eax
+1:     orl     $(PG_V|PG_KW|PG_PS), %eax
+       orl     RELOC(pg_g_kern), %eax
       cmpl    $__kernel_base_phys, %eax
       jl      store_pte
       cmpl    $__kernel_end_phys, %eax
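
The detection code above runs in early boot assembly, before any C
environment exists.  Rendered as C, the policy is roughly the following
(sketch only; cpu_is_genuine_intel(), cpuid() and rdmsr() are stand-ins
for the inline vendor-string compares and raw instructions used in
locore0.S):

    int
    cpu_has_meltdown_sketch(void)
    {
            uint32_t eax, ebx, ecx, edx;

            if (!cpu_is_genuine_intel())
                    return 0;       /* non-Intel CPUs treated as safe */
            cpuid(0, &eax, &ebx, &ecx, &edx);
            if (eax < 7)
                    return 1;       /* no SEFF leaf: assume vulnerable */
            cpuid(7, &eax, &ebx, &ecx, &edx);
            if ((edx & SEFF0EDX_ARCH_CAP) == 0)
                    return 1;       /* no IA32_ARCH_CAPABILITIES MSR */
            if (rdmsr(MSR_ARCH_CAPABILITIES) & ARCH_CAPABILITIES_RDCL_NO)
                    return 0;       /* CPU declares Meltdown immunity */
            return 1;
    }

On affected CPUs pg_g_kern stays 0, so kernel TLB entries are never
marked global and are flushed by the CR3 switches the trampolines
perform.
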
Index: sys/arch/amd64/amd64/machdep.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/machdep.c,v
retrieving revision 1.231
diff -u -p -r1.231 machdep.c
--- sys/arch/amd64/amd64/machdep.c      12 Jul 2017 06:26:32 -0000      1.231
+++ sys/arch/amd64/amd64/machdep.c      21 Feb 2018 21:41:06 -0000
@@ -95,7 +95,7 @@

#include <sys/sysctl.h>

-#include <machine/cpu.h>
+#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/pio.h>
#include <machine/psl.h>
@@ -148,6 +148,14 @@ extern int db_console;
#include <dev/ic/pckbcvar.h>
#endif

+/* #define MACHDEP_DEBUG */
+
+#ifdef MACHDEP_DEBUG
+#define DPRINTF(x...)  do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* MACHDEP_DEBUG */
+
/* the following is used externally (sysctl_hw) */
char machine[] = MACHINE;

@@ -263,6 +271,7 @@ void        cpu_init_extents(void);
void   map_tramps(void);
void   init_x86_64(paddr_t);
void   (*cpuresetfn)(void);
+void   enter_shared_special_pages(void);

#ifdef APERTURE
int allowaperture = 0;
@@ -315,6 +324,66 @@ cpu_startup(void)

       /* Safe for i/o port / memory space allocation to use malloc now. */
       x86_bus_space_mallocok();
+
+       /* enter the IDT and trampoline code in the u-k maps */
+       enter_shared_special_pages();
+
+       /* initialize CPU0's TSS and GDT and put them in the u-k maps */
+       cpu_enter_pages(&cpu_info_full_primary);
+}
+
+/*
+ * enter_shared_special_pages
+ *
+ * Requests mapping of various special pages required in the Intel Meltdown
+ * case (to be entered into the U-K page table):
+ *
+ *  1 IDT page
+ *  A variable number of pages covering the U-K ".kutext" section. This
+ *   section contains code needed during trampoline operation.
+ *  A variable number of pages covering the U-K ".kudata" section. This
+ *   section contains data accessed by the trampoline, before switching
+ *   to U+K (for example, various shared global variables used by IPIs, etc.)
+ *
+ * The linker script places the required symbols in the sections above.
+ *
+ * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
+ * become no-ops.
+ */
+void
+enter_shared_special_pages(void)
+{
+       extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
+       extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
+       vaddr_t va;
+       paddr_t pa;
+
+       /* idt */
+       pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
+       DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
+           (uint64_t)idt_vaddr, (uint64_t)idt_paddr);
+
+       /* .kutext section */
+       va = (vaddr_t)__kutext_start;
+       pa = (paddr_t)__kernel_kutext_phys;
+       while (va < (vaddr_t)__kutext_end) {
+               pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
+               DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
+                   __func__, (uint64_t)va, (uint64_t)pa);
+               va += PAGE_SIZE;
+               pa += PAGE_SIZE;
+       }
+
+       /* .kudata section */
+       va = (vaddr_t)__kudata_start;
+       pa = (paddr_t)__kernel_kudata_phys;
+       while (va < (vaddr_t)__kudata_end) {
+               pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
+               DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
+                   __func__, (uint64_t)va, (uint64_t)pa);
+               va += PAGE_SIZE;
+               pa += PAGE_SIZE;
+       }
}

/*
@@ -331,12 +400,6 @@ x86_64_proc0_tss_ldt_init(void)
       pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
       proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;

-       /* an empty iomap, by setting its offset to the TSS limit */
-       cpu_info_primary.ci_tss->tss_iobase = sizeof(struct x86_64_tss);
-       cpu_info_primary.ci_tss->tss_rsp0 = pcb->pcb_kstack;
-       cpu_info_primary.ci_tss->tss_ist[0] =
-           (u_int64_t)proc0.p_addr + PAGE_SIZE - 16;
-
       ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
       lldt(0);
}
@@ -348,15 +411,11 @@ x86_64_proc0_tss_ldt_init(void)
#ifdef MULTIPROCESSOR
void
x86_64_init_pcb_tss_ldt(struct cpu_info *ci)
-{
+{
       struct pcb *pcb = ci->ci_idle_pcb;

-       ci->ci_tss->tss_iobase = sizeof(*ci->ci_tss);
-       ci->ci_tss->tss_rsp0 = pcb->pcb_kstack;
-       ci->ci_tss->tss_ist[0] = pcb->pcb_kstack - USPACE + PAGE_SIZE;
-
       pcb->pcb_cr0 = rcr0();
-}
+}
#endif /* MULTIPROCESSOR */

bios_diskinfo_t *
@@ -1000,25 +1059,27 @@ dumpsys(void)

/*
 * Force the userspace FS.base to be reloaded from the PCB on return from
- * the kernel, and reset most the segment registers (%ds, %es, and %fs)
+ * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
 * to their expected userspace value.
 */
void
reset_segs(void)
{
       /*
-        * Segment registers (%ds, %es, %fs, %gs) aren't in the trapframe.
-        * %gs is reset on return to userspace to avoid having to deal with
-        * swapgs; others are reset on context switch and here.  This
-        * operates like the cpu_switchto() sequence: if we haven't reset
-        * %[def]s already, do so now.
-        */
+        * This operates like the cpu_switchto() sequence: if we
+        * haven't reset %[defg]s already, do so now.
+       */
       if (curcpu()->ci_flags & CPUF_USERSEGS) {
               curcpu()->ci_flags &= ~CPUF_USERSEGS;
               __asm volatile(
                   "movw %%ax,%%ds\n\t"
                   "movw %%ax,%%es\n\t"
-                   "movw %%ax,%%fs" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
+                   "movw %%ax,%%fs\n\t"
+                   "cli\n\t"           /* block intr when on user GS.base */
+                   "swapgs\n\t"        /* swap from kernel to user GS.base */
+                   "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
+                   "swapgs\n\t"        /* back to kernel GS.base */
+                   "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
       }
}

@@ -1544,8 +1605,6 @@ init_x86_64(paddr_t first_avail)
       pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);

       pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);
-       pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE,
-           PROT_READ | PROT_WRITE);

#if defined(MULTIPROCESSOR) || \
    (NACPI > 0 && !defined(SMALL_KERNEL))
@@ -1553,7 +1612,7 @@ init_x86_64(paddr_t first_avail)
#endif

       idt = (struct gate_descriptor *)idt_vaddr;
-       cpu_info_primary.ci_tss = (void *)(idt + NIDT);
+       cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
       cpu_info_primary.ci_gdt = (void *)(cpu_info_primary.ci_tss + 1);

       /* make gdt gates and memory segments */
@@ -1578,9 +1637,10 @@ init_x86_64(paddr_t first_avail)

       /* exceptions */
       for (x = 0; x < 32; x++) {
-               ist = (x == 8) ? 1 : 0;
+               /* trap2 == NMI, trap8 == double fault */
+               ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
               setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
-                   (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
+                   (x == 3) ? SEL_UPL : SEL_KPL,
                   GSEL(GCODE_SEL, SEL_KPL));
               idt_allocmap[x] = 1;
       }
Index: sys/arch/amd64/amd64/pmap.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v
retrieving revision 1.105
diff -u -p -r1.105 pmap.c
--- sys/arch/amd64/amd64/pmap.c 24 Jul 2017 15:31:14 -0000      1.105
+++ sys/arch/amd64/amd64/pmap.c 22 Feb 2018 20:55:33 -0000
@@ -126,6 +126,15 @@

#include "acpi.h"

+/* #define PMAP_DEBUG */
+
+#ifdef PMAP_DEBUG
+#define DPRINTF(x...)   do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* PMAP_DEBUG */
+
+
/*
 * general info:
 *
@@ -262,6 +271,7 @@ TAILQ_HEAD(pg_to_free, vm_page);

struct pool pmap_pdp_pool;
void pmap_pdp_ctor(pd_entry_t *);
+void pmap_pdp_ctor_intel(pd_entry_t *);

extern vaddr_t msgbuf_vaddr;
extern paddr_t msgbuf_paddr;
@@ -275,6 +285,8 @@ extern vaddr_t lo32_paddr;
vaddr_t virtual_avail;
extern int end;

+extern uint32_t cpu_meltdown;
+
/*
 * local prototypes
 */
@@ -316,7 +328,6 @@ void pmap_tlb_shootwait(void);
#define        pmap_tlb_shootwait()
#endif

-
/*
 * p m a p   i n l i n e   h e l p e r   f u n c t i o n s
 */
@@ -330,7 +341,8 @@ static __inline boolean_t
pmap_is_curpmap(struct pmap *pmap)
{
       return((pmap == pmap_kernel()) ||
-              (pmap->pm_pdirpa == (paddr_t) rcr3()));
+              (pmap->pm_pdirpa == (paddr_t) rcr3()) ||
+              (pmap->pm_pdirpa_intel == (paddr_t) rcr3()));
}

/*
@@ -491,7 +503,6 @@ pmap_find_pte_direct(struct pmap *pm, va
       return (0);
}

-
/*
 * p m a p   k e n t e r   f u n c t i o n s
 *
@@ -520,7 +531,7 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, v

       /* special 1:1 mappings in the first 2MB must not be global */
       if (va >= (vaddr_t)NBPD_L2)
-               npte |= PG_G;
+               npte |= pg_g_kern;

       if (!(prot & PROT_EXEC))
               npte |= pg_nx;
@@ -593,12 +604,12 @@ pmap_kremove(vaddr_t sva, vsize_t len)
paddr_t
pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
{
-       vaddr_t kva, kva_end, kva_start = VM_MIN_KERNEL_ADDRESS;
+       vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS;
       struct pmap *kpm;
       int i;
-       unsigned long p1i;
       long ndmpdp;
       paddr_t dmpd, dmpdp;
+       vaddr_t kva, kva_end;

       /*
        * define the boundaries of the managed kernel virtual address
@@ -654,9 +665,14 @@ pmap_bootstrap(paddr_t first_avail, padd
       curpcb->pcb_pmap = kpm; /* proc0's pcb */

       /*
-        * enable global TLB entries.
+        * Add PG_G attribute to already mapped kernel pages. pg_g_kern
+        * is calculated in locore0.S and may be set to:
+        *
+        * 0 if this CPU does not safely support global pages in the kernel
+        *  (Intel/Meltdown)
+        * PG_G if this CPU does safely support global pages in the kernel
+        *  (AMD)
        */
-       /* add PG_G attribute to already mapped kernel pages */
#if KERNBASE == VM_MIN_KERNEL_ADDRESS
       for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
#else
@@ -664,9 +680,9 @@ pmap_bootstrap(paddr_t first_avail, padd
       for (kva = KERNBASE; kva < kva_end ;
#endif
            kva += PAGE_SIZE) {
-               p1i = pl1_i(kva);
+               unsigned long p1i = pl1_i(kva);
               if (pmap_valid_entry(PTE_BASE[p1i]))
-                       PTE_BASE[p1i] |= PG_G;
+                       PTE_BASE[p1i] |= pg_g_kern;
       }

       /*
@@ -691,7 +707,7 @@ pmap_bootstrap(paddr_t first_avail, padd
               va = PMAP_DIRECT_MAP(pdp);

               *((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT);
-               *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | PG_G | PG_U |
+               *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U |
                   PG_M | pg_nx;
       }

@@ -737,7 +753,7 @@ pmap_bootstrap(paddr_t first_avail, padd
       LIST_INIT(&pmaps);

       /*
-        * initialize the pmap pool.
+        * initialize the pmap pools.
        */

       pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_NONE, 0,
@@ -753,6 +769,9 @@ pmap_bootstrap(paddr_t first_avail, padd
       pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_NONE, PR_WAITOK,
           "pdppl", NULL);

+       kpm->pm_pdir_intel = 0;
+       kpm->pm_pdirpa_intel = 0;
+
       /*
        * ensure the TLB is sync'd with reality by flushing it...
        */
@@ -905,13 +924,21 @@ pmap_free_ptp(struct pmap *pmap, struct
       unsigned long index;
       int level;
       vaddr_t invaladdr;
-       pd_entry_t opde;
+       pd_entry_t opde, *mdpml4es;

       level = 1;
       do {
               pmap_freepage(pmap, ptp, level, pagelist);
               index = pl_i(va, level + 1);
               opde = pmap_pte_set(&pdes[level - 1][index], 0);
+               if (level == 3 && pmap->pm_pdir_intel) {
+                       /* Zap special meltdown PML4e */
+                       mdpml4es = (pd_entry_t *)pmap->pm_pdir_intel;
+                       opde = pmap_pte_set(&mdpml4es[index], 0);
+                       DPRINTF("%s: cleared meltdown PML4e @ index %lu "
+                           "(va range start 0x%llx)\n", __func__, index,
+                           (uint64_t)(index << L4_SHIFT));
+               }
               invaladdr = level == 1 ? (vaddr_t)ptes :
                   (vaddr_t)pdes[level - 2];
               pmap_tlb_shootpage(curpcb->pcb_pmap,
@@ -945,7 +972,7 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t
       struct vm_page *ptp, *pptp;
       int i;
       unsigned long index;
-       pd_entry_t *pva;
+       pd_entry_t *pva, *pva_intel;
       paddr_t ppa, pa;
       struct uvm_object *obj;

@@ -984,6 +1011,20 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t
               pmap->pm_ptphint[i - 2] = ptp;
               pa = VM_PAGE_TO_PHYS(ptp);
               pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V);
+
+               /*
+                * Meltdown Special case - if we are adding a new PML4e for
+                * usermode addresses, just copy the PML4e to the U-K page
+                * table.
+                */
+               if (pmap->pm_pdir_intel && i == 4 && va < VM_MAXUSER_ADDRESS) {
+                       pva_intel = (pd_entry_t *)pmap->pm_pdir_intel;
+                       pva_intel[index] = pva[index];
+                       DPRINTF("%s: copying usermode PML4e (content=0x%llx) "
+                           "from 0x%llx -> 0x%llx\n", __func__, pva[index],
+                           (uint64_t)&pva[index], (uint64_t)&pva_intel[index]);
+               }
+
               pmap->pm_stats.resident_count++;
               /*
                * If we're not in the top level, increase the
@@ -1059,6 +1100,15 @@ pmap_pdp_ctor(pd_entry_t *pdir)
#endif
}

+void
+pmap_pdp_ctor_intel(pd_entry_t *pdir)
+{
+       struct pmap *kpm = pmap_kernel();
+
+       /* Copy PML4es from pmap_kernel's U-K view */
+       memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE);
+}
+
/*
 * pmap_create: create a pmap
 *
@@ -1099,6 +1149,22 @@ pmap_create(void)

       pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME;

+       /*
+        * Intel CPUs need a special page table to be used during usermode
+        * execution, one that lacks all kernel mappings.
+        */
+       if (cpu_meltdown) {
+               pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK);
+               pmap_pdp_ctor_intel(pmap->pm_pdir_intel);
+               if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
+                   &pmap->pm_pdirpa_intel))
+                       panic("%s: unknown PA mapping for meltdown PML4\n",
+                           __func__);
+       } else {
+               pmap->pm_pdir_intel = 0;
+               pmap->pm_pdirpa_intel = 0;
+       }
+
       LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
       return (pmap);
}
@@ -1156,6 +1222,9 @@ pmap_destroy(struct pmap *pmap)
       /* XXX: need to flush it out of other processor's space? */
       pool_put(&pmap_pdp_pool, pmap->pm_pdir);

+       if (pmap->pm_pdir_intel)
+               pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
+
       pool_put(&pmap_pmap_pool, pmap);
}

@@ -1970,6 +2039,132 @@ pmap_collect(struct pmap *pmap)
 * defined as macro in pmap.h
 */

+void
+pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot)
+{
+       uint64_t l4idx, l3idx, l2idx, l1idx;
+       pd_entry_t *pd, *ptp;
+       paddr_t npa;
+       struct pmap *pmap = pmap_kernel();
+       pt_entry_t *ptes;
+       int level, offs;
+
+       /* If CPU is secure, no need to do anything */
+       if (!cpu_meltdown)
+               return;
+
+       /* Must be kernel VA */
+       if (va < VM_MIN_KERNEL_ADDRESS)
+               panic("%s: invalid special mapping va 0x%lx requested",
+                   __func__, va);
+
+       if (!pmap->pm_pdir_intel)
+               pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool,
+                   PR_WAITOK | PR_ZERO);
+
+       l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */
+       l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
+       l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */
+       l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */
+
+       DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld "
+           "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va,
+           (uint64_t)pa, l4idx, l3idx, l2idx, l1idx);
+
+       /* Start at PML4 / top level */
+       pd = (pd_entry_t *)pmap->pm_pdir_intel;
+
+       if (!pd)
+               panic("%s: PML4 not initialized for pmap @ %p\n", __func__,
+                   pmap);
+
+       /* npa = physaddr of PDPT */
+       npa = pd[l4idx] & PMAP_PA_MASK;
+
+       /* Valid PML4e for the 512GB region containing va? */
+       if (!npa) {
+               /* No valid PML4E - allocate PDPT page and set PML4E */
+
+               ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
+
+               if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
+                       panic("%s: can't locate PDPT page\n", __func__);
+
+               pd[l4idx] = (npa | PG_u | PG_RW | PG_V);
+
+               DPRINTF("%s: allocated new PDPT page at phys 0x%llx, "
+                   "setting PML4e[%lld] = 0x%llx\n", __func__,
+                   (uint64_t)npa, l4idx, pd[l4idx]);
+       }
+
+       pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
+       if (!pd)
+               panic("%s: can't locate PDPT @ pa=0x%llx\n", __func__,
+                   (uint64_t)npa);
+
+       /* npa = physaddr of PD page */
+       npa = pd[l3idx] & PMAP_PA_MASK;
+
+       /* Valid PDPTe for the 1GB region containing va? */
+       if (!npa) {
+               /* No valid PDPTe - allocate PD page and set PDPTe */
+
+               ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
+
+               if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
+                       panic("%s: can't locate PD page\n", __func__);
+
+               pd[l3idx] = (npa | PG_u | PG_RW | PG_V);
+
+               DPRINTF("%s: allocated new PD page at phys 0x%llx, "
+                   "setting PDPTe[%lld] = 0x%llx\n", __func__,
+                   (uint64_t)npa, l3idx, pd[l3idx]);
+       }
+
+       pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
+       if (!pd)
+               panic("%s: can't locate PD page @ pa=0x%llx\n", __func__,
+                   (uint64_t)npa);
+
+       /* npa = physaddr of PT page */
+       npa = pd[l2idx] & PMAP_PA_MASK;
+
+       /* Valid PDE for the 2MB region containing va? */
+       if (!npa) {
+               /* No valid PDE - allocate PT page and set PDE */
+
+               ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
+
+               if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
+                       panic("%s: can't locate PT page\n", __func__);
+
+               pd[l2idx] = (npa | PG_u | PG_RW | PG_V);
+
+               DPRINTF("%s: allocated new PT page at phys 0x%llx, "
+                   "setting PDE[%lld] = 0x%llx\n", __func__,
+                   (uint64_t)npa, l2idx, pd[l2idx]);
+       }
+
+       pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
+       if (!pd)
+               panic("%s: can't locate PT page @ pa=0x%llx\n", __func__,
+                   (uint64_t)npa);
+
+       DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot "
+           "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd,
+           (uint64_t)prot, (uint64_t)pd[l1idx]);
+
+       pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_G | PG_W;
+       DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]);
+
+       /* now set the PG_G flag on the corresponding U+K entry */
+       level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
+       if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs])))
+               ptes[offs] |= PG_G;
+       else
+               DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
+}
+
/*
 * pmap_enter: enter a mapping into a pmap
 *
@@ -2166,7 +2361,7 @@ enter_now:
       else if (va < VM_MAX_ADDRESS)
               npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
       if (pmap == pmap_kernel())
-               npte |= PG_G;
+               npte |= pg_g_kern;

       ptes[pl1_i(va)] = npte;         /* zap! */

@@ -2450,10 +2645,10 @@ pmap_convert(struct pmap *pmap, int mode
 * release the lock if we get an interrupt in a bad moment.
 */

-volatile long tlb_shoot_wait;
+volatile long tlb_shoot_wait __attribute__((section(".kudata")));

-volatile vaddr_t tlb_shoot_addr1;
-volatile vaddr_t tlb_shoot_addr2;
+volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
+volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));

void
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
Index: sys/arch/amd64/amd64/spl.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/spl.S,v
retrieving revision 1.11
diff -u -p -r1.11 spl.S
--- sys/arch/amd64/amd64/spl.S  20 May 2016 14:37:53 -0000      1.11
+++ sys/arch/amd64/amd64/spl.S  21 Feb 2018 21:33:03 -0000
@@ -114,7 +114,7 @@ _C_LABEL(splx):
 * a lower-prio one first, which needs to take the kernel lock -->
 * the sending CPU will never see the that CPU accept the IPI
 */
-IDTVEC(spllower)
+KIDTVEC(spllower)
       _PROF_PROLOGUE
       pushq   %rbx
       pushq   %r13
@@ -143,7 +143,7 @@ IDTVEC(spllower)
 *   ebx - cpl to restore
 *   r13 - address to resume loop at
 */
-IDTVEC(doreti)
+KIDTVEC(doreti)
       popq    %rbx                    # get previous priority
       decl    CPUVAR(IDEPTH)
       leaq    1f(%rip),%r13
@@ -168,4 +168,8 @@ IDTVEC(doreti)
       call    _C_LABEL(ast)
       cli
       jmp     5b
-3:     INTRFASTEXIT
+3:
+#ifdef DIAGNOSTIC
+       movl    $254,%esi
+#endif /* DIAGNOSTIC */
+       INTRFASTEXIT
Index: sys/arch/amd64/amd64/trap.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/trap.c,v
retrieving revision 1.61
diff -u -p -r1.61 trap.c
--- sys/arch/amd64/amd64/trap.c 4 Oct 2017 02:10:33 -0000       1.61
+++ sys/arch/amd64/amd64/trap.c 21 Feb 2018 21:33:03 -0000
@@ -214,6 +214,18 @@ trap(struct trapframe *frame)
                       frame->tf_rip = (u_int64_t)xrstor_resume;
                       return;
               }
+
+               /*
+                * Check for failure during return to user mode.
+                * We do this by looking at the address of the
+                * instruction that faulted.
+                */
+               if (frame->tf_rip == (u_int64_t)doreti_iret) {
+                       frame->tf_rip = (u_int64_t)resume_iret;
+                       return;
+               }
+               /* FALLTHROUGH */
+
       case T_SEGNPFLT:
       case T_ALIGNFLT:
       case T_TSSFLT:
@@ -225,16 +237,6 @@ copyfault:
                       frame->tf_rip = (u_int64_t)pcb->pcb_onfault;
                       return;
               }
-
-               /*
-                * Check for failure during return to user mode.
-                * We do this by looking at the address of the
-                * instruction that faulted.
-                */
-               if (frame->tf_rip == (u_int64_t)doreti_iret) {
-                       frame->tf_rip = (u_int64_t)resume_iret;
-                       return;
-               }
               goto we_re_toast;

       case T_PROTFLT|T_USER:          /* protection fault */
@@ -457,8 +459,12 @@ out:
static void
frame_dump(struct trapframe *tf)
{
-       printf("rip %p  rsp %p  rfl %p\n",
-           (void *)tf->tf_rip, (void *)tf->tf_rsp, (void *)tf->tf_rflags);
+       printf("rip %p  cs 0x%x  rfl %p  rsp %p  ss 0x%x\n",
+           (void *)tf->tf_rip, (unsigned)tf->tf_cs & 0xffff,
+           (void *)tf->tf_rflags,
+           (void *)tf->tf_rsp, (unsigned)tf->tf_ss & 0xffff);
+       printf("err 0x%llx  trapno 0x%llx\n",
+           tf->tf_err, tf->tf_trapno);
       printf("rdi %p  rsi %p  rdx %p\n",
           (void *)tf->tf_rdi, (void *)tf->tf_rsi, (void *)tf->tf_rdx);
       printf("rcx %p  r8  %p  r9  %p\n",
Index: sys/arch/amd64/amd64/vector.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/vector.S,v
retrieving revision 1.51
diff -u -p -r1.51 vector.S
--- sys/arch/amd64/amd64/vector.S       4 Oct 2017 02:10:33 -0000       1.51
+++ sys/arch/amd64/amd64/vector.S       28 Feb 2018 17:13:02 -0000
@@ -104,36 +104,97 @@
#define        TRAP(a)         pushq $(a) ; jmp _C_LABEL(alltraps)
#define        ZTRAP(a)        pushq $0 ; TRAP(a)

-       .text
IDTVEC(trap00)
       ZTRAP(T_DIVIDE)
IDTVEC(trap01)
       ZTRAP(T_TRCTRAP)
+
+/*
+ * NMIs can happen at any time, so there's no simple way to tell
+ * which GS.base is in place at the time of the interrupt.  Instead,
+ * borrow a couple ideas from FreeBSD and put the CPU's kernel
+ * GS.base in the memory right above the stack, storing the current
+ * one in a pair of callee-saved registers (%r12/13).  We save the
+ * current %cr3 in a callee-saved register too (%r15).
+ * Note: we don't unblock interrupts because a nested normal interrupt
+ * would also reenable NMIs.
+ */
IDTVEC(trap02)
-       ZTRAP(T_NMI)
+       pushq   $0
+       pushq   $T_NMI
+calltrap_specstk:                      # special stack path
+       INTR_REENTRY
+       movl    $MSR_GSBASE,%ecx        # save current GS.base...
+       rdmsr
+       movq    %rax,%r12               # ...in %r12 and %r13
+       movq    %rdx,%r13
+       movq    FRAMESIZE(%rsp),%rax    # get kernel GS.base
+       movq    %rax,%rdx
+       shrq    $32,%rdx
+       wrmsr                           # switch to it
+       movq    %cr3,%r15               # save current %cr3 in %r15
+       movq    CPUVAR(KERN_CR3),%rax   # switch to kernel page tables
+       testq   %rax,%rax
+       jz      INTRENTRY_LABEL(calltrap_specstk)
+       movq    %rax,%cr3
+       jmp     INTRENTRY_LABEL(calltrap_specstk)
+       .text
+       .globl  INTRENTRY_LABEL(calltrap_specstk)
+INTRENTRY_LABEL(calltrap_specstk):
+       cld
+       SMAP_CLAC
+       movq    %rsp,%rdi
+       call    trap
+       movl    $MSR_GSBASE,%ecx        # restore GS.base
+       movq    %r12,%rax
+       movq    %r13,%rdx
+       wrmsr
+       popq    %rdi
+       popq    %rsi
+       popq    %rdx
+       popq    %rcx
+       popq    %r8
+       popq    %r9
+       popq    %r10
+       popq    %r11
+       popq    %r12
+       popq    %r13
+       popq    %r14
+       jmp     calltrap_specstk_tramp
+KUENTRY(calltrap_specstk_tramp)
+       movq    %r15,%cr3               # restore %cr3
+       popq    %r15
+       popq    %rbp
+       popq    %rbx
+       popq    %rax
+       addq    $48,%rsp                # ignored TF_[DEFG]S
+       iretq
+
IDTVEC(trap03)
       ZTRAP(T_BPTFLT)
IDTVEC(trap04)
-       ZTRAP(T_OFLOW)
+       ZTRAP(T_OFLOW)  # impossible: INTO instruction invalid in amd64
IDTVEC(trap05)
-       ZTRAP(T_BOUND)
+       ZTRAP(T_BOUND)  # impossible: BOUND instruction invalid in amd64
IDTVEC(trap06)
       ZTRAP(T_PRIVINFLT)
IDTVEC(trap07)
       pushq   $0                      # dummy error code
       pushq   $T_DNA
-       INTRENTRY
+       INTRENTRY(trap07)
       sti
       cld
       SMAP_CLAC
       movq    CPUVAR(SELF),%rdi
       movq    %rsp, %rsi
       call    _C_LABEL(fpudna)
+       cli
       INTRFASTEXIT
IDTVEC(trap08)
-       TRAP(T_DOUBLEFLT)
+       pushq   $T_DOUBLEFLT
+       jmp     calltrap_specstk
IDTVEC(trap09)
-       ZTRAP(T_FPOPFLT)
+       ZTRAP(T_FPOPFLT)        # impossible: not generated on amd64
IDTVEC(trap0a)
       TRAP(T_TSSFLT)
IDTVEC(trap0b)
@@ -149,30 +210,49 @@ IDTVEC(trap0c)
        * so that we can do the necessary swapgs in that case.
        */
IDTVEC(trap0d)
-       subq    $TF_ERR,%rsp
-       movl    $T_PROTFLT,TF_TRAPNO(%rsp)
-       movq    %rdi,TF_RDI(%rsp)
-       leaq    _C_LABEL(doreti_iret)(%rip),%rdi
-       cmpq    %rdi,TF_RIP(%rsp)
+       pushq   %rcx
+       leaq    _C_LABEL(doreti_iret)(%rip),%rcx
+       cmpq    %rcx,16(%rsp)           /* over %rcx and err to %rip */
+       popq    %rcx
       je      1f
-       testq   $SEL_RPL,TF_CS(%rsp)
-       jz      2f
+       testq   $SEL_RPL,16(%rsp)       /* over err and %rip to %cs */
+       je      INTRENTRY_LABEL(trap0d)
1:     swapgs
-2:     movq    %r15,TF_R15(%rsp)
-       movq    %r14,TF_R14(%rsp)
-       movq    %r13,TF_R13(%rsp)
-       movq    %r12,TF_R12(%rsp)
-       movq    %r11,TF_R11(%rsp)
-       movq    %r10,TF_R10(%rsp)
-       movq    %r9,TF_R9(%rsp)
-       movq    %r8,TF_R8(%rsp)
-       /*movq  %rdi,TF_RDI(%rsp)       done above */
-       movq    %rsi,TF_RSI(%rsp)
-       movq    %rbp,TF_RBP(%rsp)
-       movq    %rbx,TF_RBX(%rsp)
-       movq    %rdx,TF_RDX(%rsp)
+       movq    %rax,CPUVAR(SCRATCH)
+       movq    CPUVAR(KERN_CR3),%rax
+       testq   %rax,%rax
+       jz      98f
+       movq    %rax,%cr3
+       jmp     98f
+       .text
+       .globl  INTRENTRY_LABEL(trap0d)
+INTRENTRY_LABEL(trap0d):       /* from kernel */
+       pushq   $T_PROTFLT
+       subq    $152,%rsp
       movq    %rcx,TF_RCX(%rsp)
-       movq    %rax,TF_RAX(%rsp)
+       jmp     99f
+98:    /* from userspace */
+       movq    CPUVAR(KERN_RSP),%rax
+       xchgq   %rax,%rsp
+       movq    %rcx,TF_RCX(%rsp)
+       /* set trapno in the trap frame */
+       movq    $T_PROTFLT,TF_TRAPNO(%rsp)
+       /* copy err and iretq frame to the trap frame */
+       movq    0(%rax),%rcx
+       movq    %rcx,TF_ERR(%rsp)
+       add     $8,%rax
+       movq    IRETQ_RIP(%rax),%rcx
+       movq    %rcx,TF_RIP(%rsp)
+       movq    IRETQ_CS(%rax),%rcx
+       movq    %rcx,TF_CS(%rsp)
+       movq    IRETQ_RFLAGS(%rax),%rcx
+       movq    %rcx,TF_RFLAGS(%rsp)
+       movq    IRETQ_RSP(%rax),%rcx
+       movq    %rcx,TF_RSP(%rsp)
+       movq    IRETQ_SS(%rax),%rcx
+       movq    %rcx,TF_SS(%rsp)
+       movq    CPUVAR(SCRATCH),%rax
+99:    INTR_SAVE_MOST_GPRS_NO_ADJ
       sti
       jmp     calltrap

@@ -204,7 +284,9 @@ IDTVEC(trap1f)
       /* 20 - 31 reserved for future exp */
       ZTRAP(T_RESERVED)

-IDTVEC(exceptions)
+       .section .rodata
+       .globl  Xexceptions
+Xexceptions:
       .quad   _C_LABEL(Xtrap00), _C_LABEL(Xtrap01)
       .quad   _C_LABEL(Xtrap02), _C_LABEL(Xtrap03)
       .quad   _C_LABEL(Xtrap04), _C_LABEL(Xtrap05)
@@ -232,19 +314,44 @@ IDTVEC(exceptions)
 * protection fault.  This will cause the process to get a SIGBUS.
 */
NENTRY(resume_iret)
-       pushq   $0
-       pushq   $T_PROTFLT
-       subq    $32,%rsp
-       INTR_SAVE_GPRS
+       movq    %rax,CPUVAR(SCRATCH)
+       movq    CPUVAR(KERN_CR3),%rax
+       testq   %rax,%rax
+       jz      INTRENTRY_LABEL(iret)
+       movq    %rax,%cr3
+       jmp     INTRENTRY_LABEL(iret)
+       .text
+       .globl  INTRENTRY_LABEL(iret)
+INTRENTRY_LABEL(iret): /* from kernel */
+       movq    CPUVAR(KERN_RSP),%rax
+       xchgq   %rax,%rsp
+       movq    %rcx,TF_RCX(%rsp)
+       /* set trapno+err in the trap frame */
+       movq    $T_PROTFLT,TF_TRAPNO(%rsp)
+       movq    $0,TF_ERR(%rsp)
+       /* copy iretq frame to the trap frame */
+       movq    IRETQ_RIP(%rax),%rcx
+       movq    %rcx,TF_RIP(%rsp)
+       movq    IRETQ_CS(%rax),%rcx
+       movq    %rcx,TF_CS(%rsp)
+       movq    IRETQ_RFLAGS(%rax),%rcx
+       movq    %rcx,TF_RFLAGS(%rsp)
+       movq    IRETQ_RSP(%rax),%rcx
+       movq    %rcx,TF_RSP(%rsp)
+       movq    IRETQ_SS(%rax),%rcx
+       movq    %rcx,TF_SS(%rsp)
+       movq    CPUVAR(SCRATCH),%rax
+       INTR_SAVE_MOST_GPRS_NO_ADJ
       sti
       jmp     calltrap

+
/*
 * All traps go through here. Call the generic trap handler, and
 * check for ASTs afterwards.
 */
-NENTRY(alltraps)
-       INTRENTRY
+KUENTRY(alltraps)
+       INTRENTRY(alltraps)
       sti
calltrap:
       cld
@@ -329,6 +436,7 @@ spl_lowered:
/* XXX See comment in locore.s */
#define        XINTR(name,num)         Xintr_##name##num

+       KUTEXT
       .globl _C_LABEL(x2apic_eoi)
_C_LABEL(x2apic_eoi):
       pushq   %rax
@@ -345,23 +453,23 @@ _C_LABEL(x2apic_eoi):

#if NLAPIC > 0
#ifdef MULTIPROCESSOR
-IDTVEC(recurse_lapic_ipi)
+KIDTVEC(recurse_lapic_ipi)
       INTR_RECURSE_HWFRAME
-       pushq   $0
+       pushq   $0
       subq    $8,%rsp                 /* unused __if_trapno */
-       INTRENTRY
+       INTR_REENTRY
       jmp     1f
IDTVEC(intr_lapic_ipi)
-       pushq   $0
+       pushq   $0
       subq    $8,%rsp                 /* unused __if_trapno */
-       INTRENTRY
+       INTRENTRY(intr_lapic_ipi)
       CODEPATCH_START
       movl    $0,_C_LABEL(local_apic)+LAPIC_EOI
       CODEPATCH_END(CPTAG_EOI)
       movl    CPUVAR(ILEVEL),%ebx
       cmpl    $IPL_IPI,%ebx
       jae     2f
-IDTVEC(resume_lapic_ipi)
+KIDTVEC(resume_lapic_ipi)
1:
       incl    CPUVAR(IDEPTH)
       movl    $IPL_IPI,CPUVAR(ILEVEL)
@@ -425,27 +533,27 @@ IDTVEC(ipi_invlrange)
       iretq

#endif /* MULTIPROCESSOR */
-
+
       /*
        * Interrupt from the local APIC timer.
        */
-IDTVEC(recurse_lapic_ltimer)
+KIDTVEC(recurse_lapic_ltimer)
       INTR_RECURSE_HWFRAME
-       pushq   $0
+       pushq   $0
       subq    $8,%rsp                 /* unused __if_trapno */
-       INTRENTRY
+       INTR_REENTRY
       jmp     1f
IDTVEC(intr_lapic_ltimer)
-       pushq   $0
+       pushq   $0
       subq    $8,%rsp                 /* unused __if_trapno */
-       INTRENTRY
+       INTRENTRY(intr_lapic_ltimer)
       CODEPATCH_START
       movl    $0,_C_LABEL(local_apic)+LAPIC_EOI
       CODEPATCH_END(CPTAG_EOI)
       movl    CPUVAR(ILEVEL),%ebx
       cmpl    $IPL_CLOCK,%ebx
       jae     2f
-IDTVEC(resume_lapic_ltimer)
+KIDTVEC(resume_lapic_ltimer)
1:
       incl    CPUVAR(IDEPTH)
       movl    $IPL_CLOCK,CPUVAR(ILEVEL)
@@ -466,21 +574,21 @@ IDTVEC(resume_lapic_ltimer)
 * Xen event channel upcall interrupt handler.
 * Only used when the hypervisor supports direct vector callbacks.
 */
-IDTVEC(recurse_xen_upcall)
+KIDTVEC(recurse_xen_upcall)
       INTR_RECURSE_HWFRAME
       pushq   $0
       subq    $8,%rsp                 /* unused __if_trapno */
-       INTRENTRY
+       INTR_REENTRY
       jmp     1f
IDTVEC(intr_xen_upcall)
       pushq   $0
       subq    $8,%rsp                 /* unused __if_trapno */
-       INTRENTRY
+       INTRENTRY(intr_xen_upcall)
       call    _C_LABEL(xen_intr_ack)
       movl    CPUVAR(ILEVEL),%ebx
       cmpl    $IPL_NET,%ebx
       jae     2f
-IDTVEC(resume_xen_upcall)
+KIDTVEC(resume_xen_upcall)
1:
       incl    CPUVAR(IDEPTH)
       movl    $IPL_NET,CPUVAR(ILEVEL)
@@ -502,20 +610,20 @@ IDTVEC(resume_xen_upcall)
 * Hyperv event channel upcall interrupt handler.
 * Only used when the hypervisor supports direct vector callbacks.
 */
-IDTVEC(recurse_hyperv_upcall)
+KIDTVEC(recurse_hyperv_upcall)
       INTR_RECURSE_HWFRAME
       pushq   $0
       subq    $8,%rsp                 /* unused __if_trapno */
-       INTRENTRY
+       INTR_REENTRY
       jmp     1f
IDTVEC(intr_hyperv_upcall)
       pushq   $0
       subq    $8,%rsp                 /* unused __if_trapno */
-       INTRENTRY
+       INTRENTRY(intr_hyperv_upcall)
       movl    CPUVAR(ILEVEL),%ebx
       cmpl    $IPL_NET,%ebx
       jae     2f
-IDTVEC(resume_hyperv_upcall)
+KIDTVEC(resume_hyperv_upcall)
1:
       incl    CPUVAR(IDEPTH)
       movl    $IPL_NET,CPUVAR(ILEVEL)
@@ -542,11 +650,11 @@ IDTVEC(resume_hyperv_upcall)
 */

#define        INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
-IDTVEC(recurse_##name##num)                                            ;\
+KIDTVEC(recurse_##name##num)                                           ;\
       INTR_RECURSE_HWFRAME                                            ;\
       subq    $16,%rsp                /* space for __if_{trapno,err} */;\
-       INTRENTRY                                                       ;\
-IDTVEC(resume_##name##num)                                             \
+       INTR_REENTRY                                                    ;\
+KIDTVEC(resume_##name##num)                                            \
       movq    $IREENT_MAGIC,TF_ERR(%rsp)                              ;\
       movl    %ebx,%r13d                                              ;\
       movq    CPUVAR(ISOURCES) + (num) * 8, %r14                      ;\
@@ -555,7 +663,7 @@ IDTVEC(resume_##name##num)                                          \
IDTVEC(intr_##name##num)                                               ;\
       pushq   $0                      /* dummy error code */          ;\
       subq    $8,%rsp                 /* unused __if_trapno */        ;\
-       INTRENTRY                                                       ;\
+       INTRENTRY(intr_##name##num)                                     ;\
       movq    CPUVAR(ISOURCES) + (num) * 8, %r14                      ;\
       mask(num)                       /* mask it in hardware */       ;\
       early_ack(num)                  /* and allow other intrs */     ;\
@@ -1094,8 +1202,7 @@ _C_LABEL(ioapic_level_stubs):
/*
 * Soft interrupt handlers
 */
-       .text
-IDTVEC(softtty)
+KIDTVEC(softtty)
       movl    $IPL_SOFTTTY, CPUVAR(ILEVEL)
       sti
       incl    CPUVAR(IDEPTH)
@@ -1104,7 +1211,7 @@ IDTVEC(softtty)
       decl    CPUVAR(IDEPTH)
       jmp     *%r13

-IDTVEC(softnet)
+KIDTVEC(softnet)
       movl    $IPL_SOFTNET, CPUVAR(ILEVEL)
       sti
       incl    CPUVAR(IDEPTH)
@@ -1113,7 +1220,7 @@ IDTVEC(softnet)
       decl    CPUVAR(IDEPTH)
       jmp     *%r13

-IDTVEC(softclock)
+KIDTVEC(softclock)
       movl    $IPL_SOFTCLOCK, CPUVAR(ILEVEL)
       sti
       incl    CPUVAR(IDEPTH)
Index: sys/arch/amd64/conf/ld.script
===================================================================
RCS file: /cvs/src/sys/arch/amd64/conf/ld.script,v
retrieving revision 1.7
diff -u -p -r1.7 ld.script
--- sys/arch/amd64/conf/ld.script       6 Jul 2017 06:21:56 -0000       1.7
+++ sys/arch/amd64/conf/ld.script       21 Feb 2018 21:33:03 -0000
@@ -52,6 +52,15 @@ SECTIONS
               *(.text .text.*)
       } :text =0xcccccccc

+       . = ALIGN(__ALIGN_SIZE);
+       __kernel_kutext_phys = (. - __kernel_virt_base) + 0x1000000;
+       .kutext : AT (__kernel_kutext_phys)
+       {
+               __kutext_start = ABSOLUTE(.);
+               *(.kutext)
+               __kutext_end = ABSOLUTE(.);
+       } :text =0xcccccccc
+
       PROVIDE (etext = .);
       _etext = .;

@@ -84,6 +93,17 @@ SECTIONS
               __data_start = ABSOLUTE(.);
               *(.data .data.*)
       } :data =0xcccccccc
+       . = ALIGN(0x1000);
+
+       . = ALIGN(__ALIGN_SIZE);
+       __kernel_kudata_phys = (. - __kernel_virt_base) + 0x1000000;
+       .kudata : AT (__kernel_kudata_phys)
+       {
+               __kudata_start = ABSOLUTE(.);
+               *(.kudata)
+               __kudata_end = ABSOLUTE(.);
+       } :data =0xcccccccc
+
       . = ALIGN(0x1000);
       PROVIDE (edata = .);
       _edata = .;
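
The new .kutext and .kudata sections collect the code and data that must stay
mapped in the stripped-down U-K page tables: the trap and interrupt stubs land
in .kutext via the IDTVEC/KUENTRY macros below, and per-CPU trampoline data in
.kudata.  Elsewhere in the full patch the pmap layer walks these ranges and
enters them into the special page table; a minimal sketch of that idea, using
the linker-provided symbols and the pmap_enter_special() declared later in
this patch (the loop itself is hypothetical, not the patch's actual code):

    /* Sketch: enter the kutext range into the U-K page table. */
    #include <sys/param.h>
    #include <uvm/uvm_extern.h>

    extern char __kutext_start[], __kutext_end[];

    void
    map_kutext_sketch(void)
    {
            vaddr_t va;
            paddr_t pa;

            for (va = (vaddr_t)__kutext_start; va < (vaddr_t)__kutext_end;
                va += PAGE_SIZE) {
                    if (pmap_extract(pmap_kernel(), va, &pa))
                            pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
            }
    }

The AT(__kernel_kutext_phys) clauses pin the sections' physical load
addresses, keeping them consistent with the kernel's load base at 0x1000000.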
Index: sys/arch/amd64/include/asm.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/asm.h,v
retrieving revision 1.8
diff -u -p -r1.8 asm.h
--- sys/arch/amd64/include/asm.h        29 Jun 2017 17:36:16 -0000      1.8
+++ sys/arch/amd64/include/asm.h        21 Feb 2018 21:33:03 -0000
@@ -68,14 +68,19 @@
       .text; _ALIGN_TEXT; .globl x; .type x,@function; x:

#ifdef _KERNEL
+#define        KUTEXT  .section .kutext, "ax"
+/*#define      KUTEXT  .text */
+
/* XXX Can't use __CONCAT() here, as it would be evaluated incorrectly. */
-#ifdef __STDC__
#define        IDTVEC(name) \
-       .text; ALIGN_TEXT; .globl X ## name; .type X ## name,@function; X ## name:
-#else
-#define        IDTVEC(name) \
-       .text; ALIGN_TEXT; .globl X/**/name; .type X/**/name,@function; X/**/name:
-#endif /* __STDC__ */
+       KUTEXT; ALIGN_TEXT; \
+       .globl X ## name; .type X ## name,@function; X ## name:
+#define        KIDTVEC(name) \
+       .text; ALIGN_TEXT; \
+       .globl X ## name; .type X ## name,@function; X ## name:
+#define KUENTRY(x) \
+       KUTEXT; _ALIGN_TEXT; .globl x; .type x,@function; x:
+
#endif /* _KERNEL */

#ifdef __STDC__
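
This macro split is what feeds the ld.script change above: IDTVEC and KUENTRY
now emit into .kutext, which remains mapped in the U-K page tables so the CPU
can vector there while a userspace CR3 is still live; KIDTVEC keeps entry
points that are only ever reached from kernel context (the recurse/resume and
soft interrupt stubs) in plain .text.  A hypothetical boot-time sanity check
built on the linker symbols illustrates the invariant:

    /* Hypothetical check: a hardware-vectored IDT handler must lie in
     * .kutext, or entry from userspace would hit an unmapped page. */
    #include <stdint.h>

    extern char __kutext_start[], __kutext_end[];

    static int
    handler_in_kutext(uintptr_t handler)
    {
            return handler >= (uintptr_t)__kutext_start &&
                handler < (uintptr_t)__kutext_end;
    }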
Index: sys/arch/amd64/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
retrieving revision 1.114
diff -u -p -r1.114 cpu.h
--- sys/arch/amd64/include/cpu.h        11 Aug 2017 20:19:14 -0000      1.114
+++ sys/arch/amd64/include/cpu.h        21 Feb 2018 21:33:03 -0000
@@ -43,7 +43,7 @@
 */
#ifdef _KERNEL
#include <machine/frame.h>
-#include <machine/segments.h>
+#include <machine/segments.h>          /* USERMODE */
#include <machine/cacheinfo.h>
#include <machine/intrdefs.h>
#endif /* _KERNEL */
@@ -89,6 +89,17 @@ union vmm_cpu_cap {

struct x86_64_tss;
struct cpu_info {
+       /*
+        * The beginning of this structure is mapped in the userspace "u-k"
+        * page tables, so that these first couple of members can be accessed
+        * from the trampoline code.  The ci_PAGEALIGN member defines where
+        * the part that is *not* visible begins, so don't put anything
+        * above it that must be kept hidden from userspace!
+        */
+       u_int64_t       ci_kern_cr3;    /* U+K page table */
+       u_int64_t       ci_scratch;     /* for U<-->K transition */
+
+#define ci_PAGEALIGN   ci_dev
       struct device *ci_dev;
       struct cpu_info *ci_self;
       struct schedstate_percpu ci_schedstate; /* scheduler state */
@@ -100,7 +111,9 @@ struct cpu_info {
       u_int ci_acpi_proc_id;
       u_int32_t ci_randseed;

-       u_int64_t ci_scratch;
+       u_int64_t ci_kern_rsp;  /* kernel-only stack */
+       u_int64_t ci_intr_rsp;  /* U<-->K trampoline stack */
+       u_int64_t ci_user_cr3;  /* U-K page table */

       struct proc *ci_fpcurproc;
       struct proc *ci_fpsaveproc;
@@ -127,6 +140,8 @@ struct cpu_info {
       u_int32_t       ci_feature_eflags;
       u_int32_t       ci_feature_sefflags_ebx;
       u_int32_t       ci_feature_sefflags_ecx;
+       u_int32_t       ci_feature_sefflags_edx;
+       u_int32_t       ci_feature_amdspec_ebx;
       u_int32_t       ci_feature_tpmflags;
       u_int32_t       ci_pnfeatset;
       u_int32_t       ci_efeature_eax;
@@ -215,7 +230,10 @@ struct cpu_info {
#define PROC_PC(p)     ((p)->p_md.md_regs->tf_rip)
#define PROC_STACK(p)  ((p)->p_md.md_regs->tf_rsp)

-extern struct cpu_info cpu_info_primary;
+struct cpu_info_full;
+extern struct cpu_info_full cpu_info_full_primary;
+#define cpu_info_primary (*(struct cpu_info *)((char *)&cpu_info_full_primary + 4096*2 - offsetof(struct cpu_info, ci_PAGEALIGN)))
+
extern struct cpu_info *cpu_info_list;

#define CPU_INFO_ITERATOR              int
@@ -240,7 +258,8 @@ extern void need_resched(struct cpu_info
#define CPU_START_CLEANUP(_ci) ((_ci)->ci_func->cleanup(_ci))

#define curcpu()       ({struct cpu_info *__ci;                  \
-                       asm volatile("movq %%gs:8,%0" : "=r" (__ci)); \
+                       asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) \
+                               :"n" (offsetof(struct cpu_info, ci_self))); \
                       __ci;})
#define cpu_number()   (curcpu()->ci_cpuid)

@@ -261,8 +280,6 @@ void cpu_unidle(struct cpu_info *);
#define MAXCPUS                1

#ifdef _KERNEL
-extern struct cpu_info cpu_info_primary;
-
#define curcpu()               (&cpu_info_primary)

#define cpu_kick(ci)
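
ci_kern_cr3 and ci_scratch sit at the very top of struct cpu_info because the
trampoline must reach them through the %gs segment base before it has a usable
kernel stack or the full kernel mappings; only this head of the structure is
visible in the U-K tables.  A sketch of what CPUVAR(KERN_CR3) amounts to
(illustrative; the kernel uses the CPUVAR() macro, not this helper):

    /* Sketch: read ci_kern_cr3 %gs-relative; it is at offset 0. */
    #include <stdint.h>

    static inline uint64_t
    read_kern_cr3_sketch(void)
    {
            uint64_t v;

            __asm volatile("movq %%gs:0,%0" : "=r" (v));
            return v;
    }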
Index: sys/arch/amd64/include/cpu_full.h
===================================================================
RCS file: sys/arch/amd64/include/cpu_full.h
diff -N sys/arch/amd64/include/cpu_full.h
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ sys/arch/amd64/include/cpu_full.h   22 Feb 2018 20:30:15 -0000
@@ -0,0 +1,66 @@
+/*     $OpenBSD$       */
+/*
+ * Copyright (c) Philip Guenther <guenther@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _MACHINE_CPU_FULL_H_
+#define _MACHINE_CPU_FULL_H_
+
+#include <sys/param.h>                 /* offsetof, PAGE_SIZE */
+#include <machine/segments.h>
+#include <machine/tss.h>
+
+/*
+ * The layout of the full per-CPU information, including TSS, GDT,
+ * trampoline stacks, and cpu_info described in <machine/cpu.h>
+ */
+struct cpu_info_full {
+       /* page mapped kRO in u-k */
+       union {
+               struct x86_64_tss       u_tss; /* followed by gdt */
+               char                    u_align[PAGE_SIZE];
+       } cif_RO;
+#define cif_tss        cif_RO.u_tss
+
+       /* start of page mapped kRW in u-k */
+       uint64_t cif_tramp_stack[(PAGE_SIZE / 4
+               - offsetof(struct cpu_info, ci_PAGEALIGN)) / sizeof(uint64_t)];
+       uint64_t cif_dblflt_stack[(PAGE_SIZE / 4) / sizeof(uint64_t)];
+       uint64_t cif_nmi_stack[(2 * PAGE_SIZE / 4) / sizeof(uint64_t)];
+
+       /*
+        * Beginning of this hangs over into the kRW page; rest is
+        * unmapped in u-k
+        */
+       struct cpu_info cif_cpu;
+} __aligned(PAGE_SIZE);
+
+/* tss, align shim, and gdt must fit in a page */
+CTASSERT(_ALIGN(sizeof(struct x86_64_tss)) +
+        sizeof(struct mem_segment_descriptor) * (NGDT_MEM + 2*NGDT_SYS)
+        < PAGE_SIZE);
+
+/* verify expected alignment */
+CTASSERT(offsetof(struct cpu_info_full, cif_cpu.ci_PAGEALIGN) % PAGE_SIZE == 0);
+
+/* verify total size is multiple of page size */
+CTASSERT(sizeof(struct cpu_info_full) % PAGE_SIZE == 0);
+
+extern struct cpu_info_full cpu_info_full_primary;
+
+/* Now make sure the cpu_info_primary macro is correct */
+CTASSERT(&cpu_info_primary - &cpu_info_full_primary.cif_cpu == 0);
+
+#endif /* _MACHINE_CPU_FULL_H_ */
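
The CTASSERTs pin down the page arithmetic.  Worked out with PAGE_SIZE = 4096
and offsetof(struct cpu_info, ci_PAGEALIGN) = 16 (only ci_kern_cr3 and
ci_scratch precede it), the kRW page fills exactly:

    cif_tramp_stack       4096/4 - 16  = 1008 bytes
    cif_dblflt_stack      4096/4       = 1024 bytes
    cif_nmi_stack         2*4096/4     = 2048 bytes
    head of cif_cpu, up to ci_PAGEALIGN =  16 bytes
                                       ------------
                                         4096 bytes

so ci_PAGEALIGN lands exactly at the start of the third page, and the
cpu_info_primary macro in cpu.h recovers &cif_cpu as
(char *)&cpu_info_full_primary + 4096*2 - 16.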
Index: sys/arch/amd64/include/cpufunc.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpufunc.h,v
retrieving revision 1.20
diff -u -p -r1.20 cpufunc.h
--- sys/arch/amd64/include/cpufunc.h    8 Aug 2017 15:53:55 -0000       1.20
+++ sys/arch/amd64/include/cpufunc.h    21 Feb 2018 21:33:03 -0000
@@ -315,6 +315,9 @@ breakpoint(void)

void amd64_errata(struct cpu_info *);

+struct cpu_info_full;
+void cpu_enter_pages(struct cpu_info_full *);
+
#endif /* _KERNEL */

#endif /* !_MACHINE_CPUFUNC_H_ */
Index: sys/arch/amd64/include/frame.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/frame.h,v
retrieving revision 1.6
diff -u -p -r1.6 frame.h
--- sys/arch/amd64/include/frame.h      26 Feb 2016 09:29:20 -0000      1.6
+++ sys/arch/amd64/include/frame.h      21 Feb 2018 21:33:03 -0000
@@ -147,6 +147,20 @@ struct intrframe {
       int64_t if_ss;
};

+
+/*
+ * The trampoline frame, used on the kernel stack page that remains
+ * mapped (kernel-only) in the page tables that are active while in
+ * userspace.  This is the minimum needed for an iretq.
+ */
+struct iretq_frame {
+       int64_t iretq_rip;
+       int64_t iretq_cs;
+       int64_t iretq_rflags;
+       int64_t iretq_rsp;
+       int64_t iretq_ss;
+};
+
/*
 * Stack frame inside cpu_switch()
 */
Index: sys/arch/amd64/include/frameasm.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/frameasm.h,v
retrieving revision 1.10
diff -u -p -r1.10 frameasm.h
--- sys/arch/amd64/include/frameasm.h   4 Sep 2016 09:22:28 -0000       1.10
+++ sys/arch/amd64/include/frameasm.h   21 Feb 2018 21:33:03 -0000
@@ -13,7 +13,10 @@
 * These are used on interrupt or trap entry or exit.
 */
#define INTR_SAVE_GPRS \
-       subq    $120,%rsp       ; \
+       subq    $120,%rsp               ; \
+       INTR_SAVE_MOST_GPRS_NO_ADJ      ; \
+       movq    %rcx,TF_RCX(%rsp)
+#define INTR_SAVE_MOST_GPRS_NO_ADJ \
       movq    %r15,TF_R15(%rsp)       ; \
       movq    %r14,TF_R14(%rsp)       ; \
       movq    %r13,TF_R13(%rsp)       ; \
@@ -27,15 +30,54 @@
       movq    %rbp,TF_RBP(%rsp)       ; \
       movq    %rbx,TF_RBX(%rsp)       ; \
       movq    %rdx,TF_RDX(%rsp)       ; \
-       movq    %rcx,TF_RCX(%rsp)       ; \
       movq    %rax,TF_RAX(%rsp)

-#define        INTRENTRY \
-       subq    $32,%rsp                ; \
-       testq   $SEL_RPL,56(%rsp)       ; \
-       je      98f                     ; \
+/* For real interrupt code paths, where we can come from userspace */
+#define INTRENTRY_LABEL(label) X##label##_untramp
+#define        INTRENTRY(label) \
+       testq   $SEL_RPL,24(%rsp)       ; \
+       je      INTRENTRY_LABEL(label)  ; \
       swapgs                          ; \
-98:    INTR_SAVE_GPRS
+       movq    %rax,CPUVAR(SCRATCH)    ; \
+       movq    CPUVAR(KERN_CR3),%rax   ; \
+       testq   %rax,%rax               ; \
+       jz      98f                     ; \
+       movq    %rax,%cr3               ; \
+       jmp     98f                     ; \
+       .text                           ; \
+       .global INTRENTRY_LABEL(label)  ; \
+INTRENTRY_LABEL(label):        /* from kernel */ \
+       subq    $152,%rsp               ; \
+       movq    %rcx,TF_RCX(%rsp)       ; \
+       jmp     99f                     ; \
+98:    /* from userspace */              \
+       movq    CPUVAR(KERN_RSP),%rax   ; \
+       xchgq   %rax,%rsp               ; \
+       movq    %rcx,TF_RCX(%rsp)       ; \
+       /* copy trapno+err to the trap frame */ \
+       movq    0(%rax),%rcx            ; \
+       movq    %rcx,TF_TRAPNO(%rsp)    ; \
+       movq    8(%rax),%rcx            ; \
+       movq    %rcx,TF_ERR(%rsp)       ; \
+       addq    $16,%rax                ; \
+       /* copy iretq frame to the trap frame */ \
+       movq    IRETQ_RIP(%rax),%rcx    ; \
+       movq    %rcx,TF_RIP(%rsp)       ; \
+       movq    IRETQ_CS(%rax),%rcx     ; \
+       movq    %rcx,TF_CS(%rsp)        ; \
+       movq    IRETQ_RFLAGS(%rax),%rcx ; \
+       movq    %rcx,TF_RFLAGS(%rsp)    ; \
+       movq    IRETQ_RSP(%rax),%rcx    ; \
+       movq    %rcx,TF_RSP(%rsp)       ; \
+       movq    IRETQ_SS(%rax),%rcx     ; \
+       movq    %rcx,TF_SS(%rsp)        ; \
+       movq    CPUVAR(SCRATCH),%rax    ; \
+99:    INTR_SAVE_MOST_GPRS_NO_ADJ
+
+/* For faking up an interrupt frame when we're already in the kernel */
+#define        INTR_REENTRY \
+       subq    $32,%rsp                ; \
+       INTR_SAVE_GPRS

#define INTRFASTEXIT \
       jmp     intr_fast_exit
@@ -49,26 +91,6 @@
       movl    %cs,%r11d               ; \
       pushq   %r11                    ; \
       pushq   %r13                    ;
-
-/*
- * Restore FS.base if it's not already in the CPU, and do the cli/swapgs.
- * Uses %rax, %rcx, and %rdx
- */
-#define INTR_RESTORE_SELECTORS                                         \
-       btsl    $CPUF_USERSEGS_BIT, CPUVAR(FLAGS)                       ; \
-       jc      99f                                                     ; \
-       movq    CPUVAR(CURPCB),%rdx     /* for below */                 ; \
-       movq    PCB_FSBASE(%rdx),%rax                                   ; \
-       cmpq    $0,%rax                                                 ; \
-       je      99f             /* setting %fs has zeroed FS.base */    ; \
-       movq    %rax,%rdx                                               ; \
-       shrq    $32,%rdx                                                ; \
-       movl    $MSR_FSBASE,%ecx                                        ; \
-       wrmsr                                                           ; \
-99:    movw    $(GSEL(GUDATA_SEL, SEL_UPL)),%ax                        ; \
-       cli                                                             ; \
-       swapgs                                                          ; \
-       movw    %ax,%gs

#define        INTR_FAKE_TRAP  0xbadabada
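
INTRENTRY is for vectors the hardware can deliver from userspace and so must
do the swapgs/CR3/stack pivot; INTR_REENTRY merely fakes up a frame for
interrupts recursed from kernel context, where none of that applies.  The
removed INTR_RESTORE_SELECTORS logic moves into the rewritten exit path
(intr_fast_exit, modified elsewhere in this errata), which has to mirror the
entry in reverse.  Roughly (a sketch with hypothetical names, not the patch's
code):

    /* Sketch of the exit half, mirroring INTRENTRY in reverse. */
    struct trapframe;

    void
    trap_exit_sketch(struct trapframe *tf, int to_user)
    {
            (void)tf;
            if (to_user) {
                    /* build an iretq frame on the U-K trampoline stack
                     * (ci_intr_rsp), restore the GPRs, swapgs, load
                     * ci_user_cr3 so only the U-K table is live, iretq */
            } else {
                    /* restore GPRs and iretq in place; the kernel CR3
                     * and gs base stay as they are */
            }
    }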

Index: sys/arch/amd64/include/gdt.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/gdt.h,v
retrieving revision 1.5
diff -u -p -r1.5 gdt.h
--- sys/arch/amd64/include/gdt.h        13 Nov 2010 04:16:42 -0000      1.5
+++ sys/arch/amd64/include/gdt.h        21 Feb 2018 21:33:03 -0000
@@ -31,4 +31,3 @@
 */

void gdt_init_cpu(struct cpu_info *);
-void gdt_alloc_cpu(struct cpu_info *);
Index: sys/arch/amd64/include/pmap.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/pmap.h,v
retrieving revision 1.62
diff -u -p -r1.62 pmap.h
--- sys/arch/amd64/include/pmap.h       8 Feb 2016 18:23:04 -0000       1.62
+++ sys/arch/amd64/include/pmap.h       21 Feb 2018 21:33:03 -0000
@@ -283,8 +283,19 @@ struct pmap {
       struct mutex pm_mtx;
       struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */
       LIST_ENTRY(pmap) pm_list;       /* list (lck by pm_list lock) */
-       pd_entry_t *pm_pdir;            /* VA of PD (lck by object lock) */
-       paddr_t pm_pdirpa;              /* PA of PD (read-only after create) */
+       /*
+        * pm_pdir         : VA of page table to be used when executing in
+        *                   privileged mode
+        * pm_pdirpa       : PA of page table to be used when executing in
+        *                   privileged mode
+        * pm_pdir_intel   : VA of special page table to be used when executing
+        *                   on an Intel CPU in usermode (no kernel mappings)
+        * pm_pdirpa_intel : PA of special page table to be used when executing
+        *                   on an Intel CPU in usermode (no kernel mappings)
+        */
+       pd_entry_t *pm_pdir, *pm_pdir_intel;
+       paddr_t pm_pdirpa, pm_pdirpa_intel;
+
       struct vm_page *pm_ptphint[PTP_LEVELS-1];
                                       /* pointer to a PTP in our pmap */
       struct pmap_statistics pm_stats;  /* pmap stats (lck by object lock) */
@@ -378,6 +389,7 @@ paddr_t     pmap_prealloc_lowmem_ptps(paddr_
void   pagezero(vaddr_t);

int    pmap_convert(struct pmap *, int);
+void   pmap_enter_special(vaddr_t, paddr_t, vm_prot_t);

/*
 * functions for flushing the cache for vaddrs and pages.
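
Each pmap on an affected Intel CPU now carries two page tables: the normal
U+K table and the "intel" U-K table holding only userspace plus the
trampoline pages.  At context switch the two CR3 values are published in the
per-CPU slots the trampoline consults; conceptually (a hypothetical helper,
not the patch's code):

    /* Sketch: publish the new process's page tables for the trampoline. */
    void
    switch_cr3_sketch(struct cpu_info *ci, struct pmap *pm)
    {
            if (pm->pm_pdirpa_intel != 0) {         /* workaround active */
                    ci->ci_kern_cr3 = pm->pm_pdirpa;        /* U+K */
                    ci->ci_user_cr3 = pm->pm_pdirpa_intel;  /* U-K */
            } else {
                    ci->ci_kern_cr3 = 0;    /* INTRENTRY: skip CR3 load */
            }
            lcr3(pm->pm_pdirpa);
    }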
Index: sys/arch/amd64/include/pte.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/pte.h,v
retrieving revision 1.13
diff -u -p -r1.13 pte.h
--- sys/arch/amd64/include/pte.h        9 Nov 2015 00:49:33 -0000       1.13
+++ sys/arch/amd64/include/pte.h        21 Feb 2018 20:14:19 -0000
@@ -158,6 +158,7 @@ typedef u_int64_t pt_entry_t;               /* PTE */

#ifdef _KERNEL
extern pt_entry_t pg_nx;       /* NX pte bit */
+extern pt_entry_t pg_g_kern;   /* PG_G if glbl mappings can be used in kern */
#endif /* _KERNEL */

#endif /* _MACHINE_PTE_H_ */
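
pg_g_kern lets the pmap keep OR-ing a "global" bit into kernel PTEs
unconditionally: it holds PG_G where kernel mappings may legitimately be
global (CPUs without Meltdown), and 0 on affected Intel CPUs, where kernel
TLB entries must not survive the per-trap CR3 reloads.  Plausibly it is set
once at boot, along the lines of (sketch, not the patch's code):

    /* Sketch: decide once whether kernel pages may carry PG_G. */
    pt_entry_t pg_g_kern = 0;

    void
    pg_g_kern_init_sketch(int cpu_meltdown_vulnerable)
    {
            if (!cpu_meltdown_vulnerable)
                    pg_g_kern = PG_G;       /* no U-K/U+K switching */
    }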
Index: sys/arch/amd64/include/specialreg.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/specialreg.h,v
retrieving revision 1.61
diff -u -p -r1.61 specialreg.h
--- sys/arch/amd64/include/specialreg.h 12 Aug 2017 19:53:37 -0000      1.61
+++ sys/arch/amd64/include/specialreg.h 22 Feb 2018 20:30:15 -0000
@@ -214,6 +214,10 @@
#define SEFF0ECX_AVX512VBMI    0x00000002 /* AVX-512 vector bit inst */
#define SEFF0ECX_UMIP          0x00000004 /* UMIP support */
#define SEFF0ECX_PKU           0x00000008 /* Page prot keys for user mode */
+/* SEFF EDX bits */
+#define SEFF0EDX_IBRS          0x04000000 /* IBRS / IBPB Speculation Control */
+#define SEFF0EDX_STIBP         0x08000000 /* STIBP Speculation Control */
+#define SEFF0EDX_ARCH_CAP      0x20000000 /* Has IA32_ARCH_CAPABILITIES MSR */

/*
 * Thermal and Power Management (CPUID function 0x6) EAX bits
@@ -285,9 +289,13 @@
 * "Advanced Power Management Information" bits (CPUID function 0x80000007):
 * EDX bits.
 */
-
#define CPUIDEDX_ITSC          (1 << 8)        /* Invariant TSC */

+/*
+ * AMD CPUID function 0x80000008 EBX bits
+ */
+#define CPUIDEBX_IBPB          (1ULL << 12)    /* Speculation Control IBPB */
+
#define        CPUID2FAMILY(cpuid)     (((cpuid) >> 8) & 15)
#define        CPUID2MODEL(cpuid)      (((cpuid) >> 4) & 15)
#define        CPUID2STEPPING(cpuid)   ((cpuid) & 15)
@@ -319,6 +327,11 @@
#define MSR_EBC_FREQUENCY_ID    0x02c   /* Pentium 4 only */
#define        MSR_TEST_CTL            0x033
#define MSR_IA32_FEATURE_CONTROL 0x03a
+#define MSR_SPEC_CTRL          0x048   /* Speculation Control IBRS / STIBP */
+#define SPEC_CTRL_IBRS         (1ULL << 0)
+#define SPEC_CTRL_STIBP                (1ULL << 1)
+#define MSR_PRED_CMD           0x049   /* Speculation Control IBPB */
+#define PRED_CMD_IBPB          (1ULL << 0)
#define MSR_BIOS_UPDT_TRIG     0x079
#define        MSR_BBL_CR_D0           0x088   /* PII+ only */
#define        MSR_BBL_CR_D1           0x089   /* PII+ only */
@@ -331,6 +344,8 @@
#define MTRRcap_FIXED          0x100   /* bit 8 - fixed MTRRs supported */
#define MTRRcap_WC             0x400   /* bit 10 - WC type supported */
#define MTRRcap_SMRR           0x800   /* bit 11 - SMM range reg supported */
+#define MSR_ARCH_CAPABILITIES  0x10a
+#define ARCH_CAPABILITIES_RDCL_NO      (1 << 0)        /* Meltdown safe */
#define        MSR_BBL_CR_ADDR         0x116   /* PII+ only */
#define        MSR_BBL_CR_DECC         0x118   /* PII+ only */
#define        MSR_BBL_CR_CTL          0x119   /* PII+ only */
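
These additions are consumed during CPU identification: the SEFF0EDX_* bits
come from CPUID leaf 7 EDX, CPUIDEBX_IBPB from AMD's CPUID 0x80000008 EBX,
and ARCH_CAPABILITIES_RDCL_NO from the IA32_ARCH_CAPABILITIES MSR, which
marks a CPU as not Meltdown-vulnerable.  A sketch of the RDCL_NO test
(illustrative only; the patch's identify code is the real thing):

    /* Sketch: does the CPU report itself Meltdown-safe? */
    #include <stdint.h>

    static inline uint64_t
    rdmsr_sketch(uint32_t msr)
    {
            uint32_t lo, hi;

            __asm volatile("rdmsr" : "=a" (lo), "=d" (hi) : "c" (msr));
            return ((uint64_t)hi << 32) | lo;
    }

    int
    cpu_meltdown_safe_sketch(uint32_t seff_edx)
    {
            return (seff_edx & SEFF0EDX_ARCH_CAP) &&
                (rdmsr_sketch(MSR_ARCH_CAPABILITIES) &
                 ARCH_CAPABILITIES_RDCL_NO);
    }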
Index: distrib/sets/lists/comp/md.amd64
===================================================================
RCS file: /cvs/src/distrib/sets/lists/comp/md.amd64,v
retrieving revision 1.108
diff -u -p -r1.108 md.amd64
--- distrib/sets/lists/comp/md.amd64    20 Aug 2017 14:53:38 -0000      1.108
+++ distrib/sets/lists/comp/md.amd64    26 Feb 2018 13:03:54 -0000
@@ -16,6 +16,7 @@
./usr/include/amd64/codepatch.h
./usr/include/amd64/conf.h
./usr/include/amd64/cpu.h
+./usr/include/amd64/cpu_full.h
./usr/include/amd64/cpufunc.h
./usr/include/amd64/cpuvar.h
./usr/include/amd64/db_machdep.h