1 /*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD$ 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/types.h> 33 #include <sys/errno.h> 34 #include <sys/systm.h> 35 #include <sys/malloc.h> 36 #include <sys/smp.h> 37 38 #include <vm/vm.h> 39 #include <vm/pmap.h> 40 41 #include <machine/param.h> 42 #include <machine/cpufunc.h> 43 #include <machine/pmap.h> 44 #include <machine/vmparam.h> 45 46 #include <machine/vmm.h> 47 #include "vmx_cpufunc.h" 48 #include "vmx_msr.h" 49 #include "vmx.h" 50 #include "ept.h" 51 52 #define EPT_PWL4(cap) ((cap) & (1UL << 6)) 53 #define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) 54 #define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ 55 #define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ 56 #define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) 57 #define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) 58 59 #define INVVPID_ALL_TYPES_MASK 0xF0000000000UL 60 #define INVVPID_ALL_TYPES_SUPPORTED(cap) \ 61 (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) 62 63 #define INVEPT_ALL_TYPES_MASK 0x6000000UL 64 #define INVEPT_ALL_TYPES_SUPPORTED(cap) \ 65 (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) 66 67 #define EPT_PG_RD (1 << 0) 68 #define EPT_PG_WR (1 << 1) 69 #define EPT_PG_EX (1 << 2) 70 #define EPT_PG_MEMORY_TYPE(x) ((x) << 3) 71 #define EPT_PG_IGNORE_PAT (1 << 6) 72 #define EPT_PG_SUPERPAGE (1 << 7) 73 74 #define EPT_ADDR_MASK ((uint64_t)-1 << 12) 75 76 MALLOC_DECLARE(M_VMX); 77 78 static uint64_t page_sizes_mask; 79 80 /* 81 * Set this to 1 to have the EPT tables respect the guest PAT settings 82 */ 83 static int ept_pat_passthru; 84 85 int 86 ept_init(void) 87 { 88 int page_shift; 89 uint64_t cap; 90 91 cap = rdmsr(MSR_VMX_EPT_VPID_CAP); 92 93 /* 94 * Verify that: 95 * - page walk length is 4 steps 96 * - extended page tables can be laid out in write-back memory 97 * - invvpid instruction with all possible types is supported 98 * - invept instruction with all possible types is supported 99 */ 100 if (!EPT_PWL4(cap) || 101 !EPT_MEMORY_TYPE_WB(cap) || 102 !INVVPID_SUPPORTED(cap) || 103 !INVVPID_ALL_TYPES_SUPPORTED(cap) || 104 !INVEPT_SUPPORTED(cap) || 105 !INVEPT_ALL_TYPES_SUPPORTED(cap)) 106 return (EINVAL); 107 108 /* Set bits in 'page_sizes_mask' for each valid page size */ 109 page_shift = PAGE_SHIFT; 110 page_sizes_mask = 1UL << page_shift; /* 4KB page */ 111 112 page_shift += 9; 113 if (EPT_PDE_SUPERPAGE(cap)) 114 page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ 115 116 page_shift += 9; 117 if (EPT_PDPTE_SUPERPAGE(cap)) 118 page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ 119 120 return (0); 121 } 122 123 #if 0 124 static void 125 ept_dump(uint64_t *ptp, int nlevels) 126 { 127 int i, t, tabs; 128 uint64_t *ptpnext, ptpval; 129 130 if (--nlevels < 0) 131 return; 132 133 tabs = 3 - nlevels; 134 for (t = 0; t < tabs; t++) 135 printf("\t"); 136 printf("PTP = %p\n", ptp); 137 138 for (i = 0; i < 512; i++) { 139 ptpval = ptp[i]; 140 141 if (ptpval == 0) 142 continue; 143 144 for (t = 0; t < tabs; t++) 145 printf("\t"); 146 printf("%3d 0x%016lx\n", i, ptpval); 147 148 if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) { 149 ptpnext = (uint64_t *) 150 PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); 151 ept_dump(ptpnext, nlevels); 152 } 153 } 154 } 155 #endif 156 157 static size_t 158 ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, 159 vm_memattr_t attr, vm_prot_t prot, boolean_t spok) 160 { 161 int spshift, ptpshift, ptpindex, nlevels; 162 163 /* 164 * Compute the size of the mapping that we can accomodate. 165 * 166 * This is based on three factors: 167 * - super page sizes supported by the processor 168 * - alignment of the region starting at 'gpa' and 'hpa' 169 * - length of the region 'len' 170 */ 171 spshift = PAGE_SHIFT; 172 if (spok) 173 spshift += (EPT_PWLEVELS - 1) * 9; 174 while (spshift >= PAGE_SHIFT) { 175 uint64_t spsize = 1UL << spshift; 176 if ((page_sizes_mask & spsize) != 0 && 177 (gpa & (spsize - 1)) == 0 && 178 (hpa & (spsize - 1)) == 0 && 179 length >= spsize) { 180 break; 181 } 182 spshift -= 9; 183 } 184 185 if (spshift < PAGE_SHIFT) { 186 panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " 187 "length 0x%016lx, page_sizes_mask 0x%016lx", 188 gpa, hpa, length, page_sizes_mask); 189 } 190 191 nlevels = EPT_PWLEVELS; 192 while (--nlevels >= 0) { 193 ptpshift = PAGE_SHIFT + nlevels * 9; 194 ptpindex = (gpa >> ptpshift) & 0x1FF; 195 196 /* We have reached the leaf mapping */ 197 if (spshift >= ptpshift) 198 break; 199 200 /* 201 * We are working on a non-leaf page table page. 202 * 203 * Create the next level page table page if necessary and point 204 * to it from the current page table. 205 */ 206 if (ptp[ptpindex] == 0) { 207 void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); 208 ptp[ptpindex] = vtophys(nlp); 209 ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; 210 } 211 212 /* Work our way down to the next level page table page */ 213 ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); 214 } 215 216 if ((gpa & ((1UL << ptpshift) - 1)) != 0) { 217 panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " 218 "mismatch\n", gpa, ptpshift); 219 } 220 221 if (prot != VM_PROT_NONE) { 222 /* Do the mapping */ 223 ptp[ptpindex] = hpa; 224 225 /* Apply the access controls */ 226 if (prot & VM_PROT_READ) 227 ptp[ptpindex] |= EPT_PG_RD; 228 if (prot & VM_PROT_WRITE) 229 ptp[ptpindex] |= EPT_PG_WR; 230 if (prot & VM_PROT_EXECUTE) 231 ptp[ptpindex] |= EPT_PG_EX; 232 233 /* 234 * By default the PAT type is ignored - this appears to 235 * be how other hypervisors handle EPT. Allow this to be 236 * overridden. 237 */ 238 ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); 239 if (!ept_pat_passthru) 240 ptp[ptpindex] |= EPT_PG_IGNORE_PAT; 241 242 if (nlevels > 0) 243 ptp[ptpindex] |= EPT_PG_SUPERPAGE; 244 } else { 245 /* Remove the mapping */ 246 ptp[ptpindex] = 0; 247 } 248 249 return (1UL << ptpshift); 250 } 251 252 static vm_paddr_t 253 ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa) 254 { 255 int nlevels, ptpshift, ptpindex; 256 uint64_t ptpval, hpabase, pgmask; 257 258 nlevels = EPT_PWLEVELS; 259 while (--nlevels >= 0) { 260 ptpshift = PAGE_SHIFT + nlevels * 9; 261 ptpindex = (gpa >> ptpshift) & 0x1FF; 262 263 ptpval = ptp[ptpindex]; 264 265 /* Cannot make progress beyond this point */ 266 if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0) 267 break; 268 269 if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) { 270 pgmask = (1UL << ptpshift) - 1; 271 hpabase = ptpval & ~pgmask; 272 return (hpabase | (gpa & pgmask)); 273 } 274 275 /* Work our way down to the next level page table page */ 276 ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); 277 } 278 279 return ((vm_paddr_t)-1); 280 } 281 282 static void 283 ept_free_pt_entry(pt_entry_t pte) 284 { 285 if (pte == 0) 286 return; 287 288 /* sanity check */ 289 if ((pte & EPT_PG_SUPERPAGE) != 0) 290 panic("ept_free_pt_entry: pte cannot have superpage bit"); 291 292 return; 293 } 294 295 static void 296 ept_free_pd_entry(pd_entry_t pde) 297 { 298 pt_entry_t *pt; 299 int i; 300 301 if (pde == 0) 302 return; 303 304 if ((pde & EPT_PG_SUPERPAGE) == 0) { 305 pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); 306 for (i = 0; i < NPTEPG; i++) 307 ept_free_pt_entry(pt[i]); 308 free(pt, M_VMX); /* free the page table page */ 309 } 310 } 311 312 static void 313 ept_free_pdp_entry(pdp_entry_t pdpe) 314 { 315 pd_entry_t *pd; 316 int i; 317 318 if (pdpe == 0) 319 return; 320 321 if ((pdpe & EPT_PG_SUPERPAGE) == 0) { 322 pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); 323 for (i = 0; i < NPDEPG; i++) 324 ept_free_pd_entry(pd[i]); 325 free(pd, M_VMX); /* free the page directory page */ 326 } 327 } 328 329 static void 330 ept_free_pml4_entry(pml4_entry_t pml4e) 331 { 332 pdp_entry_t *pdp; 333 int i; 334 335 if (pml4e == 0) 336 return; 337 338 if ((pml4e & EPT_PG_SUPERPAGE) == 0) { 339 pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); 340 for (i = 0; i < NPDPEPG; i++) 341 ept_free_pdp_entry(pdp[i]); 342 free(pdp, M_VMX); /* free the page directory ptr page */ 343 } 344 } 345 346 void 347 ept_vmcleanup(struct vmx *vmx) 348 { 349 int i; 350 351 for (i = 0; i < NPML4EPG; i++) 352 ept_free_pml4_entry(vmx->pml4ept[i]); 353 } 354 355 int 356 ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, 357 vm_memattr_t attr, int prot, boolean_t spok) 358 { 359 size_t n; 360 struct vmx *vmx = arg; 361 362 while (len > 0) { 363 n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, 364 prot, spok); 365 len -= n; 366 gpa += n; 367 hpa += n; 368 } 369 370 return (0); 371 } 372 373 vm_paddr_t 374 ept_vmmmap_get(void *arg, vm_paddr_t gpa) 375 { 376 vm_paddr_t hpa; 377 struct vmx *vmx; 378 379 vmx = arg; 380 hpa = ept_lookup_mapping(vmx->pml4ept, gpa); 381 return (hpa); 382 } 383 384 static void 385 invept_single_context(void *arg) 386 { 387 struct invept_desc desc = *(struct invept_desc *)arg; 388 389 invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); 390 } 391 392 void 393 ept_invalidate_mappings(u_long pml4ept) 394 { 395 struct invept_desc invept_desc = { 0 }; 396 397 invept_desc.eptp = EPTP(pml4ept); 398 399 smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); 400 } 401