/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/param.h>
#include <machine/cpufunc.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "vmx_msr.h"
#include "vmx.h"
#include "ept.h"

#define	EPT_PWL4(cap)			((cap) & (1UL << 6))
#define	EPT_MEMORY_TYPE_WB(cap)		((cap) & (1UL << 14))
#define	EPT_PDE_SUPERPAGE(cap)		((cap) & (1UL << 16))	/* 2MB pages */
#define	EPT_PDPTE_SUPERPAGE(cap)	((cap) & (1UL << 17))	/* 1GB pages */
#define	INVVPID_SUPPORTED(cap)		((cap) & (1UL << 32))
#define	INVEPT_SUPPORTED(cap)		((cap) & (1UL << 20))

#define	INVVPID_ALL_TYPES_MASK		0xF0000000000UL
#define	INVVPID_ALL_TYPES_SUPPORTED(cap)	\
	(((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)

#define	INVEPT_ALL_TYPES_MASK		0x6000000UL
#define	INVEPT_ALL_TYPES_SUPPORTED(cap)		\
	(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
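
/*
 * In the IA32_VMX_EPT_VPID_CAP MSR, INVEPT_ALL_TYPES_MASK covers bits
 * 25-26 (single-context and all-context INVEPT) and
 * INVVPID_ALL_TYPES_MASK covers bits 40-43 (the four INVVPID types).
 */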

#define	EPT_PG_RD			(1 << 0)
#define	EPT_PG_WR			(1 << 1)
#define	EPT_PG_EX			(1 << 2)
#define	EPT_PG_MEMORY_TYPE(x)		((x) << 3)
#define	EPT_PG_IGNORE_PAT		(1 << 6)
#define	EPT_PG_SUPERPAGE		(1 << 7)

#define	EPT_ADDR_MASK			((uint64_t)-1 << 12)

MALLOC_DECLARE(M_VMX);

static uint64_t page_sizes_mask;

/*
 * Set this to 1 to have the EPT tables respect the guest PAT settings
 */
static int ept_pat_passthru;

int
ept_init(void)
{
	int page_shift;
	uint64_t cap;

	cap = rdmsr(MSR_VMX_EPT_VPID_CAP);

	/*
	 * Verify that:
	 * - page walk length is 4 steps
	 * - extended page tables can be laid out in write-back memory
	 * - invvpid instruction with all possible types is supported
	 * - invept instruction with all possible types is supported
	 */
	if (!EPT_PWL4(cap) ||
	    !EPT_MEMORY_TYPE_WB(cap) ||
	    !INVVPID_SUPPORTED(cap) ||
	    !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
	    !INVEPT_SUPPORTED(cap) ||
	    !INVEPT_ALL_TYPES_SUPPORTED(cap))
		return (EINVAL);

	/* Set bits in 'page_sizes_mask' for each valid page size */
	page_shift = PAGE_SHIFT;
	page_sizes_mask = 1UL << page_shift;		/* 4KB page */

	page_shift += 9;
	if (EPT_PDE_SUPERPAGE(cap))
		page_sizes_mask |= 1UL << page_shift;	/* 2MB superpage */

	page_shift += 9;
	if (EPT_PDPTE_SUPERPAGE(cap))
		page_sizes_mask |= 1UL << page_shift;	/* 1GB superpage */

	return (0);
}
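
/*
 * Illustrative sketch (hypothetical, not part of the driver): on a
 * processor that reports 2MB but not 1GB EPT superpages, ept_init()
 * leaves exactly the 4KB and 2MB bits set in 'page_sizes_mask'.
 */
#if 0
static void
ept_init_example(void)
{
	uint64_t mask;

	mask = 1UL << 12;	/* 4KB pages are always valid */
	mask |= 1UL << 21;	/* EPT_PDE_SUPERPAGE (bit 16) reported */
				/* EPT_PDPTE_SUPERPAGE (bit 17) not reported */
	KASSERT(mask == 0x201000UL, ("unexpected page_sizes_mask"));
}
#endif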

#if 0
static void
ept_dump(uint64_t *ptp, int nlevels)
{
	int i, t, tabs;
	uint64_t *ptpnext, ptpval;

	if (--nlevels < 0)
		return;

	tabs = 3 - nlevels;
	for (t = 0; t < tabs; t++)
		printf("\t");
	printf("PTP = %p\n", ptp);

	for (i = 0; i < 512; i++) {
		ptpval = ptp[i];

		if (ptpval == 0)
			continue;

		for (t = 0; t < tabs; t++)
			printf("\t");
		printf("%3d 0x%016lx\n", i, ptpval);

		if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
			ptpnext = (uint64_t *)
				  PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
			ept_dump(ptpnext, nlevels);
		}
	}
}
#endif

static size_t
ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
		   vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
{
	int spshift, ptpshift, ptpindex, nlevels;

	/*
	 * Compute the size of the mapping that we can accommodate.
	 *
	 * This is based on three factors:
	 * - super page sizes supported by the processor
	 * - alignment of the region starting at 'gpa' and 'hpa'
	 * - length of the region 'length'
	 */
	spshift = PAGE_SHIFT;
	if (spok)
		spshift += (EPT_PWLEVELS - 1) * 9;
	while (spshift >= PAGE_SHIFT) {
		uint64_t spsize = 1UL << spshift;
		if ((page_sizes_mask & spsize) != 0 &&
		    (gpa & (spsize - 1)) == 0 &&
		    (hpa & (spsize - 1)) == 0 &&
		    length >= spsize) {
			break;
		}
		spshift -= 9;
	}

	if (spshift < PAGE_SHIFT) {
		panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
		      "length 0x%016lx, page_sizes_mask 0x%016lx",
		      gpa, hpa, length, page_sizes_mask);
	}

	nlevels = EPT_PWLEVELS;
	while (--nlevels >= 0) {
		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;

		/* We have reached the leaf mapping */
		if (spshift >= ptpshift)
			break;

		/*
		 * We are working on a non-leaf page table page.
		 *
		 * Create the next level page table page if necessary and point
		 * to it from the current page table.
		 */
		if (ptp[ptpindex] == 0) {
			void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
			ptp[ptpindex] = vtophys(nlp);
			ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
		}

		/* Work our way down to the next level page table page */
		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
	}

	if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
		panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
		      "mismatch\n", gpa, ptpshift);
	}

	if (prot != VM_PROT_NONE) {
		/* Do the mapping */
		ptp[ptpindex] = hpa;

		/* Apply the access controls */
		if (prot & VM_PROT_READ)
			ptp[ptpindex] |= EPT_PG_RD;
		if (prot & VM_PROT_WRITE)
			ptp[ptpindex] |= EPT_PG_WR;
		if (prot & VM_PROT_EXECUTE)
			ptp[ptpindex] |= EPT_PG_EX;

		/*
		 * By default the PAT type is ignored - this appears to
		 * be how other hypervisors handle EPT. Allow this to be
		 * overridden.
		 */
		ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
		if (!ept_pat_passthru)
			ptp[ptpindex] |= EPT_PG_IGNORE_PAT;

		if (nlevels > 0)
			ptp[ptpindex] |= EPT_PG_SUPERPAGE;
	} else {
		/* Remove the mapping */
		ptp[ptpindex] = 0;
	}

	return (1UL << ptpshift);
}
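
/*
 * Worked example with hypothetical inputs: gpa == hpa == 0x200000,
 * length == 4MB, and 2MB superpages enabled in 'page_sizes_mask'.
 * The sizing loop starts at spshift 39 (512GB, never in the mask),
 * rejects spshift 30 because 0x200000 is not 1GB-aligned, and settles
 * on spshift 21: both addresses are 2MB-aligned and length >= 2MB.
 * The page walk then stops at ptpshift 21, a single 2MB leaf is
 * installed with EPT_PG_SUPERPAGE set, and 1UL << 21 is returned so
 * that ept_vmmmap_set() can loop over the remaining 2MB.
 */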

static vm_paddr_t
ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
{
	int nlevels, ptpshift, ptpindex;
	uint64_t ptpval, hpabase, pgmask;

	nlevels = EPT_PWLEVELS;
	while (--nlevels >= 0) {
		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;

		ptpval = ptp[ptpindex];

		/* Cannot make progress beyond this point */
		if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
			break;

		if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
			pgmask = (1UL << ptpshift) - 1;
			hpabase = ptpval & ~pgmask;
			return (hpabase | (gpa & pgmask));
		}

		/* Work our way down to the next level page table page */
		ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
	}

	return ((vm_paddr_t)-1);
}
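
/*
 * Worked example (hypothetical gpa): a lookup of gpa 0x40201000 uses
 * PML4 index 0 (gpa >> 39), PDPT index 1 (gpa >> 30), PD index 1
 * (gpa >> 21) and PT index 1 (gpa >> 12), each masked with 0x1FF.
 * Had the PD entry carried EPT_PG_SUPERPAGE, the walk would have
 * stopped there, returning the 2MB frame base or'ed with the low
 * 21 bits of the gpa.
 */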

static void
ept_free_pt_entry(pt_entry_t pte)
{
	if (pte == 0)
		return;

	/* sanity check */
	if ((pte & EPT_PG_SUPERPAGE) != 0)
		panic("ept_free_pt_entry: pte cannot have superpage bit");

	return;
}

static void
ept_free_pd_entry(pd_entry_t pde)
{
	pt_entry_t	*pt;
	int		i;

	if (pde == 0)
		return;

	if ((pde & EPT_PG_SUPERPAGE) == 0) {
		pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
		for (i = 0; i < NPTEPG; i++)
			ept_free_pt_entry(pt[i]);
		free(pt, M_VMX);	/* free the page table page */
	}
}

static void
ept_free_pdp_entry(pdp_entry_t pdpe)
{
	pd_entry_t	*pd;
	int		i;

	if (pdpe == 0)
		return;

	if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
		pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
		for (i = 0; i < NPDEPG; i++)
			ept_free_pd_entry(pd[i]);
		free(pd, M_VMX);	/* free the page directory page */
	}
}

static void
ept_free_pml4_entry(pml4_entry_t pml4e)
{
	pdp_entry_t	*pdp;
	int		i;

	if (pml4e == 0)
		return;

	if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
		for (i = 0; i < NPDPEPG; i++)
			ept_free_pdp_entry(pdp[i]);
		free(pdp, M_VMX);	/* free the page directory ptr page */
	}
}

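/*
 * Tear down the EPT tree: ept_vmcleanup() walks the PML4, each
 * non-superpage entry recurses one level deeper via the ept_free_*
 * helpers above, and the page backing each table is freed after its
 * entries have been processed.
 */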
void
ept_vmcleanup(struct vmx *vmx)
{
	int i;

	for (i = 0; i < NPML4EPG; i++)
		ept_free_pml4_entry(vmx->pml4ept[i]);
}

int
ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
		vm_memattr_t attr, int prot, boolean_t spok)
{
	size_t n;
	struct vmx *vmx = arg;

	while (len > 0) {
		n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
				       prot, spok);
		len -= n;
		gpa += n;
		hpa += n;
	}

	return (0);
}
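
/*
 * Usage sketch (hypothetical values; nothing in this file makes this
 * call): map 4MB of guest-physical space starting at 2MB with full
 * permissions, write-back memory attribute and superpages allowed.
 */
#if 0
static int
ept_map_example(struct vmx *vmx, vm_paddr_t hpa_base)
{
	/* 'hpa_base' is assumed to be a 2MB-aligned host physical address */
	return (ept_vmmmap_set(vmx, 0x200000, hpa_base, 4UL * 1024 * 1024,
	    VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, TRUE));
}
#endif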

vm_paddr_t
ept_vmmmap_get(void *arg, vm_paddr_t gpa)
{
	vm_paddr_t hpa;
	struct vmx *vmx;

	vmx = arg;
	hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
	return (hpa);
}

static void
invept_single_context(void *arg)
{
	struct invept_desc desc = *(struct invept_desc *)arg;

	invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
}

void
ept_invalidate_mappings(u_long pml4ept)
{
	struct invept_desc invept_desc = { 0 };

	invept_desc.eptp = EPTP(pml4ept);

	smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}
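
/*
 * Translations derived from EPT are cached per logical CPU, so a
 * single-context INVEPT has to execute on every host CPU; the
 * smp_rendezvous() above runs invept_single_context() on all CPUs,
 * passing the descriptor as the argument.  A caller is expected to
 * pass the physical address of the PML4 table, e.g. (hypothetical
 * call) ept_invalidate_mappings(vtophys(vmx->pml4ept)).
 */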
401