xref: /freebsd/sys/amd64/amd64/pmap.c (revision 7502c1f270827434bb9661cbb4b9652fdb836521)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1991 Regents of the University of California.
5  * All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  * Copyright (c) 2003 Peter Wemm
11  * All rights reserved.
12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13  * All rights reserved.
14  *
15  * This code is derived from software contributed to Berkeley by
16  * the Systems Programming Group of the University of Utah Computer
17  * Science Department and William Jolitz of UUNET Technologies Inc.
18  *
19  * Redistribution and use in source and binary forms, with or without
20  * modification, are permitted provided that the following conditions
21  * are met:
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  * 3. All advertising materials mentioning features or use of this software
28  *    must display the following acknowledgement:
29  *	This product includes software developed by the University of
30  *	California, Berkeley and its contributors.
31  * 4. Neither the name of the University nor the names of its contributors
32  *    may be used to endorse or promote products derived from this software
33  *    without specific prior written permission.
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
36  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
38  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
39  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
40  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
41  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
42  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
43  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
44  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
45  * SUCH DAMAGE.
46  */
47 /*-
48  * Copyright (c) 2003 Networks Associates Technology, Inc.
49  * Copyright (c) 2014-2020 The FreeBSD Foundation
50  * All rights reserved.
51  *
52  * This software was developed for the FreeBSD Project by Jake Burkholder,
53  * Safeport Network Services, and Network Associates Laboratories, the
54  * Security Research Division of Network Associates, Inc. under
55  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
56  * CHATS research program.
57  *
58  * Portions of this software were developed by
59  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
60  * the FreeBSD Foundation.
61  *
62  * Redistribution and use in source and binary forms, with or without
63  * modification, are permitted provided that the following conditions
64  * are met:
65  * 1. Redistributions of source code must retain the above copyright
66  *    notice, this list of conditions and the following disclaimer.
67  * 2. Redistributions in binary form must reproduce the above copyright
68  *    notice, this list of conditions and the following disclaimer in the
69  *    documentation and/or other materials provided with the distribution.
70  *
71  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81  * SUCH DAMAGE.
82  */
83 
84 #define	AMD64_NPT_AWARE
85 
86 #include <sys/cdefs.h>
87 /*
88  *	Manages physical address maps.
89  *
90  *	Since the information managed by this module is
91  *	also stored by the logical address mapping module,
92  *	this module may throw away valid virtual-to-physical
93  *	mappings at almost any time.  However, invalidations
94  *	of virtual-to-physical mappings must be done as
95  *	requested.
96  *
97  *	In order to cope with hardware architectures which
98  *	make virtual-to-physical map invalidates expensive,
99  * this module may delay invalidation or reduced-protection
100  *	operations until such time as they are actually
101  *	necessary.  This module is given full information as
102  *	to which processors are currently using which maps,
103  *	and to when physical maps must be made correct.
104  */
105 
106 #include "opt_ddb.h"
107 #include "opt_pmap.h"
108 #include "opt_vm.h"
109 
110 #include <sys/param.h>
111 #include <sys/asan.h>
112 #include <sys/bitstring.h>
113 #include <sys/bus.h>
114 #include <sys/systm.h>
115 #include <sys/counter.h>
116 #include <sys/kernel.h>
117 #include <sys/ktr.h>
118 #include <sys/lock.h>
119 #include <sys/malloc.h>
120 #include <sys/mman.h>
121 #include <sys/msan.h>
122 #include <sys/mutex.h>
123 #include <sys/proc.h>
124 #include <sys/rangeset.h>
125 #include <sys/rwlock.h>
126 #include <sys/sbuf.h>
127 #include <sys/smr.h>
128 #include <sys/sx.h>
129 #include <sys/turnstile.h>
130 #include <sys/vmem.h>
131 #include <sys/vmmeter.h>
132 #include <sys/sched.h>
133 #include <sys/sysctl.h>
134 #include <sys/smp.h>
135 #ifdef DDB
136 #include <sys/kdb.h>
137 #include <ddb/ddb.h>
138 #endif
139 
140 #include <vm/vm.h>
141 #include <vm/vm_param.h>
142 #include <vm/vm_kern.h>
143 #include <vm/vm_page.h>
144 #include <vm/vm_map.h>
145 #include <vm/vm_object.h>
146 #include <vm/vm_extern.h>
147 #include <vm/vm_pageout.h>
148 #include <vm/vm_pager.h>
149 #include <vm/vm_phys.h>
150 #include <vm/vm_radix.h>
151 #include <vm/vm_reserv.h>
152 #include <vm/vm_dumpset.h>
153 #include <vm/uma.h>
154 
155 #include <machine/asan.h>
156 #include <machine/intr_machdep.h>
157 #include <x86/apicvar.h>
158 #include <x86/ifunc.h>
159 #include <machine/cpu.h>
160 #include <machine/cputypes.h>
161 #include <machine/md_var.h>
162 #include <machine/msan.h>
163 #include <machine/pcb.h>
164 #include <machine/specialreg.h>
165 #include <machine/smp.h>
166 #include <machine/sysarch.h>
167 #include <machine/tss.h>
168 
169 #ifdef NUMA
170 #define	PMAP_MEMDOM	MAXMEMDOM
171 #else
172 #define	PMAP_MEMDOM	1
173 #endif
174 
175 static __inline bool
176 pmap_type_guest(pmap_t pmap)
177 {
178 
179 	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
180 }
181 
182 static __inline bool
183 pmap_emulate_ad_bits(pmap_t pmap)
184 {
185 
186 	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
187 }
188 
189 static __inline pt_entry_t
190 pmap_valid_bit(pmap_t pmap)
191 {
192 	pt_entry_t mask;
193 
194 	switch (pmap->pm_type) {
195 	case PT_X86:
196 	case PT_RVI:
197 		mask = X86_PG_V;
198 		break;
199 	case PT_EPT:
200 		if (pmap_emulate_ad_bits(pmap))
201 			mask = EPT_PG_EMUL_V;
202 		else
203 			mask = EPT_PG_READ;
204 		break;
205 	default:
206 		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
207 	}
208 
209 	return (mask);
210 }
211 
212 static __inline pt_entry_t
213 pmap_rw_bit(pmap_t pmap)
214 {
215 	pt_entry_t mask;
216 
217 	switch (pmap->pm_type) {
218 	case PT_X86:
219 	case PT_RVI:
220 		mask = X86_PG_RW;
221 		break;
222 	case PT_EPT:
223 		if (pmap_emulate_ad_bits(pmap))
224 			mask = EPT_PG_EMUL_RW;
225 		else
226 			mask = EPT_PG_WRITE;
227 		break;
228 	default:
229 		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
230 	}
231 
232 	return (mask);
233 }
234 
235 static pt_entry_t pg_g;
236 
237 static __inline pt_entry_t
238 pmap_global_bit(pmap_t pmap)
239 {
240 	pt_entry_t mask;
241 
242 	switch (pmap->pm_type) {
243 	case PT_X86:
244 		mask = pg_g;
245 		break;
246 	case PT_RVI:
247 	case PT_EPT:
248 		mask = 0;
249 		break;
250 	default:
251 		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
252 	}
253 
254 	return (mask);
255 }
256 
257 static __inline pt_entry_t
258 pmap_accessed_bit(pmap_t pmap)
259 {
260 	pt_entry_t mask;
261 
262 	switch (pmap->pm_type) {
263 	case PT_X86:
264 	case PT_RVI:
265 		mask = X86_PG_A;
266 		break;
267 	case PT_EPT:
268 		if (pmap_emulate_ad_bits(pmap))
269 			mask = EPT_PG_READ;
270 		else
271 			mask = EPT_PG_A;
272 		break;
273 	default:
274 		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
275 	}
276 
277 	return (mask);
278 }
279 
280 static __inline pt_entry_t
281 pmap_modified_bit(pmap_t pmap)
282 {
283 	pt_entry_t mask;
284 
285 	switch (pmap->pm_type) {
286 	case PT_X86:
287 	case PT_RVI:
288 		mask = X86_PG_M;
289 		break;
290 	case PT_EPT:
291 		if (pmap_emulate_ad_bits(pmap))
292 			mask = EPT_PG_WRITE;
293 		else
294 			mask = EPT_PG_M;
295 		break;
296 	default:
297 		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
298 	}
299 
300 	return (mask);
301 }
302 
303 static __inline pt_entry_t
304 pmap_pku_mask_bit(pmap_t pmap)
305 {
306 
307 	return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
308 }
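/*
 * Illustrative use of the bit-selector helpers above (a sketch of the
 * pattern used throughout this file, not additional functionality):
 *
 *	pt_entry_t PG_V = pmap_valid_bit(pmap);
 *	pt_entry_t PG_RW = pmap_rw_bit(pmap);
 *	if ((*pte & (PG_V | PG_RW)) == (PG_V | PG_RW))
 *		... the mapping is valid and writeable ...
 *
 * so that one body of code handles native x86, RVI, and EPT formats.
 */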
309 
310 static __inline bool
311 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
312 {
313 
314 	if (!pmap_emulate_ad_bits(pmap))
315 		return (true);
316 
317 	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
318 
319 	/*
320 	 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
321 	 * so we don't allow the referenced (aka EPT_PG_READ) bit to be cleared
322 	 * if the EPT_PG_WRITE bit is set.
323 	 */
324 	if ((pte & EPT_PG_WRITE) != 0)
325 		return (false);
326 
327 	/*
328 	 * XWR = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set.
329 	 */
330 	if ((pte & EPT_PG_EXECUTE) == 0 ||
331 	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
332 		return (true);
333 	else
334 		return (false);
335 }
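/*
 * Summary of safe_to_clear_referenced() for A/D-emulating EPT pmaps
 * (an illustrative restatement of the checks above):
 *
 *	EPT_PG_WRITE set                          -> unsafe (XWR 010/110)
 *	EPT_PG_EXECUTE clear                      -> safe   (XWR 000)
 *	EPT_PG_EXECUTE set, exec-only supported   -> safe   (XWR 100)
 *	EPT_PG_EXECUTE set, exec-only unsupported -> unsafe
 */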
336 
337 #ifdef PV_STATS
338 #define PV_STAT(x)	do { x ; } while (0)
339 #else
340 #define PV_STAT(x)	do { } while (0)
341 #endif
342 
343 #ifdef NUMA
344 #define	pa_index(pa)	({					\
345 	KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end,	\
346 	    ("address %lx beyond the last segment", (pa)));	\
347 	(pa) >> PDRSHIFT;					\
348 })
349 #define	pa_to_pmdp(pa)	(&pv_table[pa_index(pa)])
350 #define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
351 #define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
352 	struct rwlock *_lock;					\
353 	if (__predict_false((pa) > pmap_last_pa))		\
354 		_lock = &pv_dummy_large.pv_lock;		\
355 	else							\
356 		_lock = &(pa_to_pmdp(pa)->pv_lock);		\
357 	_lock;							\
358 })
359 #else
360 #define	pa_index(pa)	((pa) >> PDRSHIFT)
361 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
362 
363 #define	NPV_LIST_LOCKS	MAXCPU
364 
365 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
366 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
367 #endif
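/*
 * Note (illustrative): pa_index() shifts by PDRSHIFT, so PV list locks
 * are handed out at 2MB granularity; every 4KB page backed by the same
 * 2MB frame hashes to the same lock, keeping a superpage's PV metadata
 * under a single lock.
 */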
368 
369 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
370 	struct rwlock **_lockp = (lockp);		\
371 	struct rwlock *_new_lock;			\
372 							\
373 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
374 	if (_new_lock != *_lockp) {			\
375 		if (*_lockp != NULL)			\
376 			rw_wunlock(*_lockp);		\
377 		*_lockp = _new_lock;			\
378 		rw_wlock(*_lockp);			\
379 	}						\
380 } while (0)
381 
382 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
383 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
384 
385 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
386 	struct rwlock **_lockp = (lockp);		\
387 							\
388 	if (*_lockp != NULL) {				\
389 		rw_wunlock(*_lockp);			\
390 		*_lockp = NULL;				\
391 	}						\
392 } while (0)
393 
394 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
395 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
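/*
 * Typical hand-off pattern for the macros above (an illustrative
 * sketch): a caller iterates over pages with a single
 * "struct rwlock *lock = NULL" cursor, calls
 * CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m) as it moves from page to
 * page so that at most one PV list lock is held at a time, and ends
 * with RELEASE_PV_LIST_LOCK(&lock).
 */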
396 
397 /*
398  * Statically allocate kernel pmap memory.  However, memory for
399  * pm_pcids is obtained after the dynamic allocator is operational.
400  * Initialize it with a non-canonical pointer to catch early accesses
401  * regardless of the active mapping.
402  */
403 struct pmap kernel_pmap_store = {
404 	.pm_pcidp = (void *)0xdeadbeefdeadbeef,
405 };
406 
407 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
408 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
409 
410 int nkpt;
411 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
412     "Number of kernel page table pages allocated on bootup");
413 
414 static int ndmpdp;
415 vm_paddr_t dmaplimit;
416 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS_LA48;
417 pt_entry_t pg_nx;
418 
419 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
420     "VM/pmap parameters");
421 
422 static int __read_frequently pg_ps_enabled = 1;
423 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
424     &pg_ps_enabled, 0, "Are large page mappings enabled?");
425 
426 int __read_frequently la57 = 0;
427 SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
428     &la57, 0,
429     "5-level paging for host is enabled");
430 
431 /*
432  * The default value is needed in order to preserve compatibility with
433  * some userspace programs that put tags into sign-extended bits.
434  */
435 int prefer_uva_la48 = 1;
436 SYSCTL_INT(_vm_pmap, OID_AUTO, prefer_uva_la48, CTLFLAG_RDTUN,
437     &prefer_uva_la48, 0,
438     "Userspace maps are limited to LA48 unless otherwise configured");
439 
440 static bool
441 pmap_is_la57(pmap_t pmap)
442 {
443 	if (pmap->pm_type == PT_X86)
444 		return (la57);
445 	return (false);		/* XXXKIB handle EPT */
446 }
447 
448 #define	PAT_INDEX_SIZE	8
449 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
450 
451 static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
452 static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
453 static u_int64_t	KPDPphys;	/* phys addr of kernel level 3 */
454 u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
455 u_int64_t		KPML5phys;	/* phys addr of kernel level 5,
456 					   if supported */
457 
458 #ifdef KASAN
459 static uint64_t		KASANPDPphys;
460 #endif
461 #ifdef KMSAN
462 static uint64_t		KMSANSHADPDPphys;
463 static uint64_t		KMSANORIGPDPphys;
464 
465 /*
466  * To support systems with large amounts of memory, it is necessary to extend
467  * the maximum size of the direct map.  This could eat into the space reserved
468  * for the shadow map.
469  */
470 _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow");
471 #endif
472 
473 static pml4_entry_t	*kernel_pml4;
474 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
475 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
476 static u_int64_t	DMPML4phys;	/* ... level 4, for la57 */
477 static int		ndmpdpphys;	/* number of DMPDPphys pages */
478 
479 vm_paddr_t		kernphys;	/* phys addr of start of bootstrap data */
480 vm_paddr_t		KERNend;	/* and the end */
481 
482 struct kva_layout_s	kva_layout = {
483 	.kva_min =	KV4ADDR(PML4PML4I, 0, 0, 0),
484 	.kva_max =	KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
485 			    NPDEPG - 1, NPTEPG - 1),
486 	.dmap_low =	KV4ADDR(DMPML4I, 0, 0, 0),
487 	.dmap_high =	KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0),
488 	.lm_low =	KV4ADDR(LMSPML4I, 0, 0, 0),
489 	.lm_high =	KV4ADDR(LMEPML4I + 1, 0, 0, 0),
490 	.km_low =	KV4ADDR(KPML4BASE, 0, 0, 0),
491 	.km_high =	KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
492 			    NPDEPG - 1, NPTEPG - 1),
493 	.rec_pt =	KV4ADDR(PML4PML4I, 0, 0, 0),
494 	.kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
495 	.kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
496 	.kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
497 	.kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
498 			    0, 0, 0),
499 	.kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
500 	.kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
501 			    0, 0, 0),
502 };
503 
504 struct kva_layout_s	kva_layout_la57 = {
505 	.kva_min =	KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0),	/* == rec_pt */
506 	.kva_max =	KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
507 			    NPDEPG - 1, NPTEPG - 1),
508 	.dmap_low =	KV5ADDR(DMPML5I, 0, 0, 0, 0),
509 	.dmap_high =	KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0),
510 	.lm_low =	KV5ADDR(LMSPML5I, 0, 0, 0, 0),
511 	.lm_high =	KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0),
512 	.km_low =	KV4ADDR(KPML4BASE, 0, 0, 0),
513 	.km_high =	KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
514 			    NPDEPG - 1, NPTEPG - 1),
515 	.rec_pt =	KV5ADDR(PML5PML5I, 0, 0, 0, 0),
516 	.kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
517 	.kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
518 	.kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
519 	.kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
520 			    0, 0, 0),
521 	.kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
522 	.kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
523 			    0, 0, 0),
524 };
525 
526 /*
527  * pmap_mapdev() support prior to pmap initialization (i.e., the console)
528  */
529 #define	PMAP_PREINIT_MAPPING_COUNT	8
530 static struct pmap_preinit_mapping {
531 	vm_paddr_t	pa;
532 	vm_offset_t	va;
533 	vm_size_t	sz;
534 	int		mode;
535 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
536 static int pmap_initialized;
537 
538 /*
539  * Data for the pv entry allocation mechanism.
540  * Updates to pv_invl_gen are protected by the pv list lock but reads are not.
541  */
542 #ifdef NUMA
543 static __inline int
544 pc_to_domain(struct pv_chunk *pc)
545 {
546 
547 	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
548 }
549 #else
550 static __inline int
551 pc_to_domain(struct pv_chunk *pc __unused)
552 {
553 
554 	return (0);
555 }
556 #endif
557 
558 struct pv_chunks_list {
559 	struct mtx pvc_lock;
560 	TAILQ_HEAD(pch, pv_chunk) pvc_list;
561 	int active_reclaims;
562 } __aligned(CACHE_LINE_SIZE);
563 
564 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
565 
566 #ifdef	NUMA
567 struct pmap_large_md_page {
568 	struct rwlock   pv_lock;
569 	struct md_page  pv_page;
570 	u_long pv_invl_gen;
571 };
572 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
573 #define pv_dummy pv_dummy_large.pv_page
574 __read_mostly static struct pmap_large_md_page *pv_table;
575 __read_mostly vm_paddr_t pmap_last_pa;
576 #else
577 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
578 static u_long pv_invl_gen[NPV_LIST_LOCKS];
579 static struct md_page *pv_table;
580 static struct md_page pv_dummy;
581 #endif
582 
583 /*
584  * All those kernel PT submaps that BSD is so fond of
585  */
586 pt_entry_t *CMAP1 = NULL;
587 caddr_t CADDR1 = 0;
588 static vm_offset_t qframe = 0;
589 static struct mtx qframe_mtx;
590 
591 static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
592 
593 static vmem_t *large_vmem;
594 static u_int lm_ents;
595 #define	PMAP_ADDRESS_IN_LARGEMAP(va)	((va) >= kva_layout.lm_low && \
596 	(va) < kva_layout.lm_high)
597 
598 int pmap_pcid_enabled = 1;
599 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
600     &pmap_pcid_enabled, 0, "Is TLB Context ID enabled?");
601 int invpcid_works = 0;
602 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
603     "Is the invpcid instruction available ?");
604 int invlpgb_works;
605 SYSCTL_INT(_vm_pmap, OID_AUTO, invlpgb_works, CTLFLAG_RD, &invlpgb_works, 0,
606     "Is the invlpgb instruction available?");
607 int invlpgb_maxcnt;
608 int pmap_pcid_invlpg_workaround = 0;
609 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround,
610     CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
611     &pmap_pcid_invlpg_workaround, 0,
612     "Enable small core PCID/INVLPG workaround");
613 int pmap_pcid_invlpg_workaround_uena = 1;
614 
615 int __read_frequently pti = 0;
616 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
617     &pti, 0,
618     "Page Table Isolation enabled");
619 static vm_object_t pti_obj;
620 static pml4_entry_t *pti_pml4;
621 static vm_pindex_t pti_pg_idx;
622 static bool pti_finalized;
623 
624 static int pmap_growkernel_panic = 0;
625 SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN,
626     &pmap_growkernel_panic, 0,
627     "panic on failure to allocate kernel page table page");
628 
629 struct pmap_pkru_range {
630 	struct rs_el	pkru_rs_el;
631 	u_int		pkru_keyidx;
632 	int		pkru_flags;
633 };
634 
635 static uma_zone_t pmap_pkru_ranges_zone;
636 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
637     pt_entry_t *pte);
638 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
639 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
640 static void *pkru_dup_range(void *ctx, void *data);
641 static void pkru_free_range(void *ctx, void *node);
642 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
643 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
644 static void pmap_pkru_deassign_all(pmap_t pmap);
645 
646 static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt);
647 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD,
648     &pcid_save_cnt, "Count of saved TLB context on switch");
649 
650 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
651     LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
652 static struct mtx invl_gen_mtx;
653 /* Fake lock object to satisfy turnstiles interface. */
654 static struct lock_object invl_gen_ts = {
655 	.lo_name = "invlts",
656 };
657 static struct pmap_invl_gen pmap_invl_gen_head = {
658 	.gen = 1,
659 	.next = NULL,
660 };
661 static u_long pmap_invl_gen = 1;
662 static int pmap_invl_waiters;
663 static struct callout pmap_invl_callout;
664 static bool pmap_invl_callout_inited;
665 
666 #define	PMAP_ASSERT_NOT_IN_DI() \
667     KASSERT(pmap_not_in_di(), ("DI already started"))
668 
669 static bool
670 pmap_di_locked(void)
671 {
672 	int tun;
673 
674 	if ((cpu_feature2 & CPUID2_CX16) == 0)
675 		return (true);
676 	tun = 0;
677 	TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun);
678 	return (tun != 0);
679 }
680 
681 static int
682 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS)
683 {
684 	int locked;
685 
686 	locked = pmap_di_locked();
687 	return (sysctl_handle_int(oidp, &locked, 0, req));
688 }
689 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN |
690     CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "",
691     "Locked delayed invalidation");
692 
693 static bool pmap_not_in_di_l(void);
694 static bool pmap_not_in_di_u(void);
695 DEFINE_IFUNC(, bool, pmap_not_in_di, (void))
696 {
697 
698 	return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u);
699 }
700 
701 static bool
702 pmap_not_in_di_l(void)
703 {
704 	struct pmap_invl_gen *invl_gen;
705 
706 	invl_gen = &curthread->td_md.md_invl_gen;
707 	return (invl_gen->gen == 0);
708 }
709 
710 static void
711 pmap_thread_init_invl_gen_l(struct thread *td)
712 {
713 	struct pmap_invl_gen *invl_gen;
714 
715 	invl_gen = &td->td_md.md_invl_gen;
716 	invl_gen->gen = 0;
717 }
718 
719 static void
720 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen)
721 {
722 	struct turnstile *ts;
723 
724 	ts = turnstile_trywait(&invl_gen_ts);
725 	if (*m_gen > atomic_load_long(invl_gen))
726 		turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
727 	else
728 		turnstile_cancel(ts);
729 }
730 
731 static void
732 pmap_delayed_invl_finish_unblock(u_long new_gen)
733 {
734 	struct turnstile *ts;
735 
736 	turnstile_chain_lock(&invl_gen_ts);
737 	ts = turnstile_lookup(&invl_gen_ts);
738 	if (new_gen != 0)
739 		pmap_invl_gen = new_gen;
740 	if (ts != NULL) {
741 		turnstile_broadcast(ts, TS_SHARED_QUEUE);
742 		turnstile_unpend(ts);
743 	}
744 	turnstile_chain_unlock(&invl_gen_ts);
745 }
746 
747 /*
748  * Start a new Delayed Invalidation (DI) block of code, executed by
749  * the current thread.  Within a DI block, the current thread may
750  * destroy both the page table and PV list entries for a mapping and
751  * then release the corresponding PV list lock before ensuring that
752  * the mapping is flushed from the TLBs of any processors with the
753  * pmap active.
754  */
755 static void
756 pmap_delayed_invl_start_l(void)
757 {
758 	struct pmap_invl_gen *invl_gen;
759 	u_long currgen;
760 
761 	invl_gen = &curthread->td_md.md_invl_gen;
762 	PMAP_ASSERT_NOT_IN_DI();
763 	mtx_lock(&invl_gen_mtx);
764 	if (LIST_EMPTY(&pmap_invl_gen_tracker))
765 		currgen = pmap_invl_gen;
766 	else
767 		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
768 	invl_gen->gen = currgen + 1;
769 	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
770 	mtx_unlock(&invl_gen_mtx);
771 }
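/*
 * Illustrative shape of a DI block (a sketch only; real callers such
 * as pmap_remove() are more involved):
 *
 *	pmap_delayed_invl_start();
 *	... take the PV list lock; destroy PTEs and PV entries ...
 *	pmap_delayed_invl_page(m);	(mark m as covered by this DI)
 *	... drop the PV list lock ...
 *	pmap_invalidate_page(pmap, va);	(flush the TLBs)
 *	pmap_delayed_invl_finish();
 */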
772 
773 /*
774  * Finish the DI block, previously started by the current thread.  All
775  * required TLB flushes for the pages marked by
776  * pmap_delayed_invl_page() must be finished before this function is
777  * called.
778  *
779  * This function works by bumping the global DI generation number to
780  * the generation number of the current thread's DI, unless there is a
781  * pending DI that started earlier.  In the latter case, bumping the
782  * global DI generation number would incorrectly signal that the
783  * earlier DI had finished.  Instead, this function bumps the earlier
784  * DI's generation number to match the generation number of the
785  * current thread's DI.
786  */
787 static void
788 pmap_delayed_invl_finish_l(void)
789 {
790 	struct pmap_invl_gen *invl_gen, *next;
791 
792 	invl_gen = &curthread->td_md.md_invl_gen;
793 	KASSERT(invl_gen->gen != 0, ("missed invl_start"));
794 	mtx_lock(&invl_gen_mtx);
795 	next = LIST_NEXT(invl_gen, link);
796 	if (next == NULL)
797 		pmap_delayed_invl_finish_unblock(invl_gen->gen);
798 	else
799 		next->gen = invl_gen->gen;
800 	LIST_REMOVE(invl_gen, link);
801 	mtx_unlock(&invl_gen_mtx);
802 	invl_gen->gen = 0;
803 }
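/*
 * Worked example for the bookkeeping above (illustrative): thread A
 * starts a DI block and gets gen 5, then thread B starts one and gets
 * gen 6.  If B finishes first, publishing 6 globally would wrongly
 * signal that A's gen-5 DI had completed, so B instead stores 6 into
 * A's tracker entry; when A later finishes, it publishes 6 itself.
 */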
804 
805 static bool
806 pmap_not_in_di_u(void)
807 {
808 	struct pmap_invl_gen *invl_gen;
809 
810 	invl_gen = &curthread->td_md.md_invl_gen;
811 	return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
812 }
813 
814 static void
815 pmap_thread_init_invl_gen_u(struct thread *td)
816 {
817 	struct pmap_invl_gen *invl_gen;
818 
819 	invl_gen = &td->td_md.md_invl_gen;
820 	invl_gen->gen = 0;
821 	invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
822 }
823 
824 static bool
825 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
826 {
827 	uint64_t new_high, new_low, old_high, old_low;
828 	char res;
829 
830 	old_low = new_low = 0;
831 	old_high = new_high = (uintptr_t)0;
832 
833 	__asm volatile("lock;cmpxchg16b\t%1"
834 	    : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
835 	    : "b"(new_low), "c" (new_high)
836 	    : "memory", "cc");
837 	if (res == 0) {
838 		if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
839 			return (false);
840 		out->gen = old_low;
841 		out->next = (void *)old_high;
842 	} else {
843 		out->gen = new_low;
844 		out->next = (void *)new_high;
845 	}
846 	return (true);
847 }
848 
849 static bool
850 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
851     struct pmap_invl_gen *new_val)
852 {
853 	uint64_t new_high, new_low, old_high, old_low;
854 	char res;
855 
856 	new_low = new_val->gen;
857 	new_high = (uintptr_t)new_val->next;
858 	old_low = old_val->gen;
859 	old_high = (uintptr_t)old_val->next;
860 
861 	__asm volatile("lock;cmpxchg16b\t%1"
862 	    : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
863 	    : "b"(new_low), "c" (new_high)
864 	    : "memory", "cc");
865 	return (res);
866 }
867 
868 static COUNTER_U64_DEFINE_EARLY(pv_page_count);
869 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD,
870     &pv_page_count, "Current number of allocated pv pages");
871 
872 static COUNTER_U64_DEFINE_EARLY(user_pt_page_count);
873 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD,
874     &user_pt_page_count,
875     "Current number of allocated page table pages for userspace");
876 
877 static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count);
878 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD,
879     &kernel_pt_page_count,
880     "Current number of allocated page table pages for the kernel");
881 
882 #ifdef PV_STATS
883 
884 static COUNTER_U64_DEFINE_EARLY(invl_start_restart);
885 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart,
886     CTLFLAG_RD, &invl_start_restart,
887     "Number of delayed TLB invalidation request restarts");
888 
889 static COUNTER_U64_DEFINE_EARLY(invl_finish_restart);
890 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
891     &invl_finish_restart,
892     "Number of delayed TLB invalidation completion restarts");
893 
894 static int invl_max_qlen;
895 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
896     &invl_max_qlen, 0,
897     "Maximum delayed TLB invalidation request queue length");
898 #endif
899 
900 #define di_delay	locks_delay
901 
902 static void
903 pmap_delayed_invl_start_u(void)
904 {
905 	struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
906 	struct thread *td;
907 	struct lock_delay_arg lda;
908 	uintptr_t prevl;
909 	u_char pri;
910 #ifdef PV_STATS
911 	int i, ii;
912 #endif
913 
914 	td = curthread;
915 	invl_gen = &td->td_md.md_invl_gen;
916 	PMAP_ASSERT_NOT_IN_DI();
917 	lock_delay_arg_init(&lda, &di_delay);
918 	invl_gen->saved_pri = 0;
919 	pri = td->td_base_pri;
920 	if (pri > PVM) {
921 		thread_lock(td);
922 		pri = td->td_base_pri;
923 		if (pri > PVM) {
924 			invl_gen->saved_pri = pri;
925 			sched_prio(td, PVM);
926 		}
927 		thread_unlock(td);
928 	}
929 again:
930 	PV_STAT(i = 0);
931 	for (p = &pmap_invl_gen_head;; p = prev.next) {
932 		PV_STAT(i++);
933 		prevl = (uintptr_t)atomic_load_ptr(&p->next);
934 		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
935 			PV_STAT(counter_u64_add(invl_start_restart, 1));
936 			lock_delay(&lda);
937 			goto again;
938 		}
939 		if (prevl == 0)
940 			break;
941 		prev.next = (void *)prevl;
942 	}
943 #ifdef PV_STATS
944 	if ((ii = invl_max_qlen) < i)
945 		atomic_cmpset_int(&invl_max_qlen, ii, i);
946 #endif
947 
948 	if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
949 		PV_STAT(counter_u64_add(invl_start_restart, 1));
950 		lock_delay(&lda);
951 		goto again;
952 	}
953 
954 	new_prev.gen = prev.gen;
955 	new_prev.next = invl_gen;
956 	invl_gen->gen = prev.gen + 1;
957 
958 	/* Formal fence between store to invl->gen and updating *p. */
959 	atomic_thread_fence_rel();
960 
961 	/*
962 	 * After inserting an invl_gen element with invalid bit set,
963 	 * this thread blocks any other thread trying to enter the
964 	 * delayed invalidation block.  Do not allow to remove us from
965 	 * the CPU, because it causes starvation for other threads.
966 	 */
967 	critical_enter();
968 
969 	/*
970 	 * ABA for *p is not possible here, since p->gen can only
971 	 * increase.  So if the *p thread finished its di, then
972 	 * started a new one and got inserted into the list at the
973 	 * same place, its gen will appear greater than the previously
974 	 * read gen.
975 	 */
976 	if (!pmap_di_store_invl(p, &prev, &new_prev)) {
977 		critical_exit();
978 		PV_STAT(counter_u64_add(invl_start_restart, 1));
979 		lock_delay(&lda);
980 		goto again;
981 	}
982 
983 	/*
984 	 * Here we clear PMAP_INVL_GEN_NEXT_INVALID in
985 	 * invl_gen->next, allowing other threads to iterate past us.
986 	 * pmap_di_store_invl() provides fence between the generation
987 	 * write and the update of next.
988 	 */
989 	invl_gen->next = NULL;
990 	critical_exit();
991 }
992 
993 static bool
994 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen,
995     struct pmap_invl_gen *p)
996 {
997 	struct pmap_invl_gen prev, new_prev;
998 	u_long mygen;
999 
1000 	/*
1001 	 * Load invl_gen->gen after setting invl_gen->next
1002 	 * PMAP_INVL_GEN_NEXT_INVALID.  This prevents larger
1003 	 * generations from propagating to our invl_gen->gen.  The lock
1004 	 * prefix in atomic_set_ptr() serves as a seq_cst fence.
1005 	 */
1006 	mygen = atomic_load_long(&invl_gen->gen);
1007 
1008 	if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
1009 		return (false);
1010 
1011 	KASSERT(prev.gen < mygen,
1012 	    ("invalid di gen sequence %lu %lu", prev.gen, mygen));
1013 	new_prev.gen = mygen;
1014 	new_prev.next = (void *)((uintptr_t)invl_gen->next &
1015 	    ~PMAP_INVL_GEN_NEXT_INVALID);
1016 
1017 	/* Formal fence between load of prev and storing update to it. */
1018 	atomic_thread_fence_rel();
1019 
1020 	return (pmap_di_store_invl(p, &prev, &new_prev));
1021 }
1022 
1023 static void
1024 pmap_delayed_invl_finish_u(void)
1025 {
1026 	struct pmap_invl_gen *invl_gen, *p;
1027 	struct thread *td;
1028 	struct lock_delay_arg lda;
1029 	uintptr_t prevl;
1030 
1031 	td = curthread;
1032 	invl_gen = &td->td_md.md_invl_gen;
1033 	KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
1034 	KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
1035 	    ("missed invl_start: INVALID"));
1036 	lock_delay_arg_init(&lda, &di_delay);
1037 
1038 again:
1039 	for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
1040 		prevl = (uintptr_t)atomic_load_ptr(&p->next);
1041 		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
1042 			PV_STAT(counter_u64_add(invl_finish_restart, 1));
1043 			lock_delay(&lda);
1044 			goto again;
1045 		}
1046 		if ((void *)prevl == invl_gen)
1047 			break;
1048 	}
1049 
1050 	/*
1051 	 * It is legitimate to not find ourself on the list if a
1052 	 * thread before us finished its DI and started it again.
1053 	 */
1054 	if (__predict_false(p == NULL)) {
1055 		PV_STAT(counter_u64_add(invl_finish_restart, 1));
1056 		lock_delay(&lda);
1057 		goto again;
1058 	}
1059 
1060 	critical_enter();
1061 	atomic_set_ptr((uintptr_t *)&invl_gen->next,
1062 	    PMAP_INVL_GEN_NEXT_INVALID);
1063 	if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) {
1064 		atomic_clear_ptr((uintptr_t *)&invl_gen->next,
1065 		    PMAP_INVL_GEN_NEXT_INVALID);
1066 		critical_exit();
1067 		PV_STAT(counter_u64_add(invl_finish_restart, 1));
1068 		lock_delay(&lda);
1069 		goto again;
1070 	}
1071 	critical_exit();
1072 	if (atomic_load_int(&pmap_invl_waiters) > 0)
1073 		pmap_delayed_invl_finish_unblock(0);
1074 	if (invl_gen->saved_pri != 0) {
1075 		thread_lock(td);
1076 		sched_prio(td, invl_gen->saved_pri);
1077 		thread_unlock(td);
1078 	}
1079 }
1080 
1081 #ifdef DDB
1082 DB_SHOW_COMMAND(di_queue, pmap_di_queue)
1083 {
1084 	struct pmap_invl_gen *p, *pn;
1085 	struct thread *td;
1086 	uintptr_t nextl;
1087 	bool first;
1088 
1089 	for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
1090 	    first = false) {
1091 		nextl = (uintptr_t)atomic_load_ptr(&p->next);
1092 		pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
1093 		td = first ? NULL : __containerof(p, struct thread,
1094 		    td_md.md_invl_gen);
1095 		db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
1096 		    (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
1097 		    td != NULL ? td->td_tid : -1);
1098 	}
1099 }
1100 #endif
1101 
1102 #ifdef PV_STATS
1103 static COUNTER_U64_DEFINE_EARLY(invl_wait);
1104 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait,
1105     CTLFLAG_RD, &invl_wait,
1106     "Number of times DI invalidation blocked pmap_remove_all/write");
1107 
1108 static COUNTER_U64_DEFINE_EARLY(invl_wait_slow);
1109 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD,
1110      &invl_wait_slow, "Number of slow invalidation waits for lockless DI");
1111 
1112 #endif
1113 
1114 #ifdef NUMA
1115 static u_long *
1116 pmap_delayed_invl_genp(vm_page_t m)
1117 {
1118 	vm_paddr_t pa;
1119 	u_long *gen;
1120 
1121 	pa = VM_PAGE_TO_PHYS(m);
1122 	if (__predict_false((pa) > pmap_last_pa))
1123 		gen = &pv_dummy_large.pv_invl_gen;
1124 	else
1125 		gen = &(pa_to_pmdp(pa)->pv_invl_gen);
1126 
1127 	return (gen);
1128 }
1129 #else
1130 static u_long *
1131 pmap_delayed_invl_genp(vm_page_t m)
1132 {
1133 
1134 	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
1135 }
1136 #endif
1137 
1138 static void
1139 pmap_delayed_invl_callout_func(void *arg __unused)
1140 {
1141 
1142 	if (atomic_load_int(&pmap_invl_waiters) == 0)
1143 		return;
1144 	pmap_delayed_invl_finish_unblock(0);
1145 }
1146 
1147 static void
1148 pmap_delayed_invl_callout_init(void *arg __unused)
1149 {
1150 
1151 	if (pmap_di_locked())
1152 		return;
1153 	callout_init(&pmap_invl_callout, 1);
1154 	pmap_invl_callout_inited = true;
1155 }
1156 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY,
1157     pmap_delayed_invl_callout_init, NULL);
1158 
1159 /*
1160  * Ensure that all currently executing DI blocks that need to flush
1161  * the TLB for the given page m have actually flushed the TLB by the
1162  * time this function returns.  If the page m has an empty PV list and we call
1163  * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
1164  * valid mapping for the page m in either its page table or TLB.
1165  *
1166  * This function works by blocking until the global DI generation
1167  * number catches up with the generation number associated with the
1168  * given page m and its PV list.  Since this function's callers
1169  * typically own an object lock and sometimes own a page lock, it
1170  * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
1171  * processor.
1172  */
1173 static void
1174 pmap_delayed_invl_wait_l(vm_page_t m)
1175 {
1176 	u_long *m_gen;
1177 #ifdef PV_STATS
1178 	bool accounted = false;
1179 #endif
1180 
1181 	m_gen = pmap_delayed_invl_genp(m);
1182 	while (*m_gen > pmap_invl_gen) {
1183 #ifdef PV_STATS
1184 		if (!accounted) {
1185 			counter_u64_add(invl_wait, 1);
1186 			accounted = true;
1187 		}
1188 #endif
1189 		pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen);
1190 	}
1191 }
1192 
1193 static void
1194 pmap_delayed_invl_wait_u(vm_page_t m)
1195 {
1196 	u_long *m_gen;
1197 	struct lock_delay_arg lda;
1198 	bool fast;
1199 
1200 	fast = true;
1201 	m_gen = pmap_delayed_invl_genp(m);
1202 	lock_delay_arg_init(&lda, &di_delay);
1203 	while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
1204 		if (fast || !pmap_invl_callout_inited) {
1205 			PV_STAT(counter_u64_add(invl_wait, 1));
1206 			lock_delay(&lda);
1207 			fast = false;
1208 		} else {
1209 			/*
1210 			 * The page's invalidation generation number
1211 			 * is still below the current thread's number.
1212 			 * Prepare to block so that we do not waste
1213 			 * CPU cycles or worse, suffer livelock.
1214 			 *
1215 			 * Since it is impossible to block without
1216 			 * racing with pmap_delayed_invl_finish_u(),
1217 			 * prepare for the race by incrementing
1218 			 * pmap_invl_waiters and arming a 1-tick
1219 			 * callout which will unblock us if we lose
1220 			 * the race.
1221 			 */
1222 			atomic_add_int(&pmap_invl_waiters, 1);
1223 
1224 			/*
1225 			 * Re-check the current thread's invalidation
1226 			 * generation after incrementing
1227 			 * pmap_invl_waiters, so that there is no race
1228 			 * with pmap_delayed_invl_finish_u() setting
1229 			 * the page generation and checking
1230 			 * pmap_invl_waiters.  The only race allowed
1231 			 * is for a missed unblock, which is handled
1232 			 * by the callout.
1233 			 */
1234 			if (*m_gen >
1235 			    atomic_load_long(&pmap_invl_gen_head.gen)) {
1236 				callout_reset(&pmap_invl_callout, 1,
1237 				    pmap_delayed_invl_callout_func, NULL);
1238 				PV_STAT(counter_u64_add(invl_wait_slow, 1));
1239 				pmap_delayed_invl_wait_block(m_gen,
1240 				    &pmap_invl_gen_head.gen);
1241 			}
1242 			atomic_add_int(&pmap_invl_waiters, -1);
1243 		}
1244 	}
1245 }
1246 
1247 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *))
1248 {
1249 
1250 	return (pmap_di_locked() ? pmap_thread_init_invl_gen_l :
1251 	    pmap_thread_init_invl_gen_u);
1252 }
1253 
1254 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void))
1255 {
1256 
1257 	return (pmap_di_locked() ? pmap_delayed_invl_start_l :
1258 	    pmap_delayed_invl_start_u);
1259 }
1260 
1261 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void))
1262 {
1263 
1264 	return (pmap_di_locked() ? pmap_delayed_invl_finish_l :
1265 	    pmap_delayed_invl_finish_u);
1266 }
1267 
1268 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t))
1269 {
1270 
1271 	return (pmap_di_locked() ? pmap_delayed_invl_wait_l :
1272 	    pmap_delayed_invl_wait_u);
1273 }
1274 
1275 /*
1276  * Mark the page m's PV list as participating in the current thread's
1277  * DI block.  Any threads concurrently using m's PV list to remove or
1278  * restrict all mappings to m will wait for the current thread's DI
1279  * block to complete before proceeding.
1280  *
1281  * The function works by setting the DI generation number for m's PV
1282  * list to at least the DI generation number of the current thread.
1283  * This forces a caller of pmap_delayed_invl_wait() to block until
1284 	 * the current thread calls pmap_delayed_invl_finish().
1285  */
1286 static void
1287 pmap_delayed_invl_page(vm_page_t m)
1288 {
1289 	u_long gen, *m_gen;
1290 
1291 	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
1292 	gen = curthread->td_md.md_invl_gen.gen;
1293 	if (gen == 0)
1294 		return;
1295 	m_gen = pmap_delayed_invl_genp(m);
1296 	if (*m_gen < gen)
1297 		*m_gen = gen;
1298 }
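/*
 * Illustrative interaction (a sketch): once pmap_delayed_invl_page(m)
 * has raised m's generation to this thread's DI generation, a
 * concurrent pmap_delayed_invl_wait(m) blocks until this thread's
 * pmap_delayed_invl_finish() advances the global generation past that
 * value, i.e. until the required TLB flushes have been issued.
 */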
1299 
1300 /*
1301  * Crashdump maps.
1302  */
1303 static caddr_t crashdumpmap;
1304 
1305 /*
1306  * Internal flags for pmap_enter()'s helper functions.
1307  */
1308 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
1309 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
1310 
1311 /*
1312  * Internal flags for pmap_mapdev_internal() and
1313  * pmap_change_props_locked().
1314  */
1315 #define	MAPDEV_FLUSHCACHE	0x00000001	/* Flush cache after mapping. */
1316 #define	MAPDEV_SETATTR		0x00000002	/* Modify existing attrs. */
1317 #define	MAPDEV_ASSERTVALID	0x00000004	/* Assert mapping validity. */
1318 
1319 TAILQ_HEAD(pv_chunklist, pv_chunk);
1320 
1321 static void	free_pv_chunk(struct pv_chunk *pc);
1322 static void	free_pv_chunk_batch(struct pv_chunklist *batch);
1323 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
1324 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
1325 static int	popcnt_pc_map_pq(uint64_t *map);
1326 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
1327 static void	reserve_pv_entries(pmap_t pmap, int needed,
1328 		    struct rwlock **lockp);
1329 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1330 		    struct rwlock **lockp);
1331 static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
1332 		    u_int flags, struct rwlock **lockp);
1333 #if VM_NRESERVLEVEL > 0
1334 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1335 		    struct rwlock **lockp);
1336 #endif
1337 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
1338 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
1339 		    vm_offset_t va);
1340 
1341 static void	pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
1342 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
1343     vm_prot_t prot, int mode, int flags);
1344 static bool	pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
1345 static bool	pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
1346     vm_offset_t va, struct rwlock **lockp);
1347 static bool	pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde,
1348     vm_offset_t va, struct rwlock **lockp, vm_page_t mpte);
1349 static bool	pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
1350     vm_offset_t va, vm_page_t m);
1351 static int	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
1352 		    vm_prot_t prot, struct rwlock **lockp);
1353 static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
1354 		    u_int flags, vm_page_t m, struct rwlock **lockp);
1355 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
1356     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
1357 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
1358 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
1359     bool allpte_PG_A_set);
1360 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
1361     vm_offset_t eva);
1362 static void pmap_invalidate_cache_range_all(vm_offset_t sva,
1363     vm_offset_t eva);
1364 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
1365 		    pd_entry_t pde);
1366 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
1367 static vm_page_t pmap_large_map_getptp_unlocked(void);
1368 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
1369 #if VM_NRESERVLEVEL > 0
1370 static bool pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
1371     vm_page_t mpte, struct rwlock **lockp);
1372 #endif
1373 static bool pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
1374     vm_prot_t prot);
1375 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask);
1376 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
1377     bool exec);
1378 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
1379 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
1380 static void pmap_pti_wire_pte(void *pte);
1381 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
1382     bool demote_kpde, struct spglist *free, struct rwlock **lockp);
1383 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
1384     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
1385 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
1386 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
1387     struct spglist *free);
1388 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1389 		    pd_entry_t *pde, struct spglist *free,
1390 		    struct rwlock **lockp);
1391 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
1392     vm_page_t m, struct rwlock **lockp);
1393 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
1394     pd_entry_t newpde);
1395 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
1396 
1397 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
1398 		struct rwlock **lockp);
1399 static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex,
1400 		struct rwlock **lockp, vm_offset_t va);
1401 static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex,
1402 		struct rwlock **lockp, vm_offset_t va);
1403 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
1404 		struct rwlock **lockp);
1405 
1406 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
1407     struct spglist *free);
1408 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
1409 
1410 static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int);
1411 static void pmap_free_pt_page(pmap_t, vm_page_t, bool);
1412 
1413 /********************/
1414 /* Inline functions */
1415 /********************/
1416 
1417 /*
1418  * Return non-clipped indexes for a given VA; these are the page
1419  * table page indexes at the corresponding levels.
1420  */
1421 static __inline vm_pindex_t
1422 pmap_pde_pindex(vm_offset_t va)
1423 {
1424 	return (va >> PDRSHIFT);
1425 }
1426 
1427 static __inline vm_pindex_t
1428 pmap_pdpe_pindex(vm_offset_t va)
1429 {
1430 	return (NUPDE + (va >> PDPSHIFT));
1431 }
1432 
1433 static __inline vm_pindex_t
1434 pmap_pml4e_pindex(vm_offset_t va)
1435 {
1436 	return (NUPDE + NUPDPE + (va >> PML4SHIFT));
1437 }
1438 
1439 static __inline vm_pindex_t
1440 pmap_pml5e_pindex(vm_offset_t va)
1441 {
1442 	return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
1443 }
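/*
 * Illustrative layout of the pindex space used above (assuming the
 * standard amd64 constants): indexes [0, NUPDE) name page table pages,
 * [NUPDE, NUPDE + NUPDPE) name page directory pages, and so on.  For
 * example, pmap_pde_pindex(2 * NBPDR) == 2 while pmap_pdpe_pindex(0)
 * == NUPDE.
 */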
1444 
1445 static __inline pml4_entry_t *
1446 pmap_pml5e(pmap_t pmap, vm_offset_t va)
1447 {
1448 
1449 	MPASS(pmap_is_la57(pmap));
1450 	return (&pmap->pm_pmltop[pmap_pml5e_index(va)]);
1451 }
1452 
1453 static __inline pml4_entry_t *
1454 pmap_pml5e_u(pmap_t pmap, vm_offset_t va)
1455 {
1456 
1457 	MPASS(pmap_is_la57(pmap));
1458 	return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]);
1459 }
1460 
1461 static __inline pml4_entry_t *
1462 pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va)
1463 {
1464 	pml4_entry_t *pml4e;
1465 
1466 	/* XXX MPASS(pmap_is_la57(pmap)); */
1467 	pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
1468 	return (&pml4e[pmap_pml4e_index(va)]);
1469 }
1470 
1471 /* Return a pointer to the PML4 slot that corresponds to a VA */
1472 static __inline pml4_entry_t *
1473 pmap_pml4e(pmap_t pmap, vm_offset_t va)
1474 {
1475 	pml5_entry_t *pml5e;
1476 	pml4_entry_t *pml4e;
1477 	pt_entry_t PG_V;
1478 
1479 	if (pmap_is_la57(pmap)) {
1480 		pml5e = pmap_pml5e(pmap, va);
1481 		PG_V = pmap_valid_bit(pmap);
1482 		if ((*pml5e & PG_V) == 0)
1483 			return (NULL);
1484 		pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
1485 	} else {
1486 		pml4e = pmap->pm_pmltop;
1487 	}
1488 	return (&pml4e[pmap_pml4e_index(va)]);
1489 }
1490 
1491 static __inline pml4_entry_t *
1492 pmap_pml4e_u(pmap_t pmap, vm_offset_t va)
1493 {
1494 	MPASS(!pmap_is_la57(pmap));
1495 	return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]);
1496 }
1497 
1498 /* Return a pointer to the PDP slot that corresponds to a VA */
1499 static __inline pdp_entry_t *
1500 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
1501 {
1502 	pdp_entry_t *pdpe;
1503 
1504 	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
1505 	return (&pdpe[pmap_pdpe_index(va)]);
1506 }
1507 
1508 /* Return a pointer to the PDP slot that corresponds to a VA */
1509 static __inline pdp_entry_t *
1510 pmap_pdpe(pmap_t pmap, vm_offset_t va)
1511 {
1512 	pml4_entry_t *pml4e;
1513 	pt_entry_t PG_V;
1514 
1515 	PG_V = pmap_valid_bit(pmap);
1516 	pml4e = pmap_pml4e(pmap, va);
1517 	if (pml4e == NULL || (*pml4e & PG_V) == 0)
1518 		return (NULL);
1519 	return (pmap_pml4e_to_pdpe(pml4e, va));
1520 }
1521 
1522 /* Return a pointer to the PD slot that corresponds to a VA */
1523 static __inline pd_entry_t *
1524 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
1525 {
1526 	pd_entry_t *pde;
1527 
1528 	KASSERT((*pdpe & PG_PS) == 0,
1529 	    ("%s: pdpe %#lx is a leaf", __func__, *pdpe));
1530 	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
1531 	return (&pde[pmap_pde_index(va)]);
1532 }
1533 
1534 /* Return a pointer to the PD slot that corresponds to a VA */
1535 static __inline pd_entry_t *
1536 pmap_pde(pmap_t pmap, vm_offset_t va)
1537 {
1538 	pdp_entry_t *pdpe;
1539 	pt_entry_t PG_V;
1540 
1541 	PG_V = pmap_valid_bit(pmap);
1542 	pdpe = pmap_pdpe(pmap, va);
1543 	if (pdpe == NULL || (*pdpe & PG_V) == 0)
1544 		return (NULL);
1545 	KASSERT((*pdpe & PG_PS) == 0,
1546 	    ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va));
1547 	return (pmap_pdpe_to_pde(pdpe, va));
1548 }
1549 
1550 /* Return a pointer to the PT slot that corresponds to a VA */
1551 static __inline pt_entry_t *
1552 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
1553 {
1554 	pt_entry_t *pte;
1555 
1556 	KASSERT((*pde & PG_PS) == 0,
1557 	    ("%s: pde %#lx is a leaf", __func__, *pde));
1558 	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
1559 	return (&pte[pmap_pte_index(va)]);
1560 }
1561 
1562 /* Return a pointer to the PT slot that corresponds to a VA */
1563 static __inline pt_entry_t *
1564 pmap_pte(pmap_t pmap, vm_offset_t va)
1565 {
1566 	pd_entry_t *pde;
1567 	pt_entry_t PG_V;
1568 
1569 	PG_V = pmap_valid_bit(pmap);
1570 	pde = pmap_pde(pmap, va);
1571 	if (pde == NULL || (*pde & PG_V) == 0)
1572 		return (NULL);
1573 	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
1574 		return ((pt_entry_t *)pde);
1575 	return (pmap_pde_to_pte(pde, va));
1576 }
1577 
1578 static __inline void
1579 pmap_resident_count_adj(pmap_t pmap, int count)
1580 {
1581 
1582 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1583 	KASSERT(pmap->pm_stats.resident_count + count >= 0,
1584 	    ("pmap %p resident count underflow %ld %d", pmap,
1585 	    pmap->pm_stats.resident_count, count));
1586 	pmap->pm_stats.resident_count += count;
1587 }
1588 
1589 static __inline void
1590 pmap_pt_page_count_pinit(pmap_t pmap, int count)
1591 {
1592 	KASSERT(pmap->pm_stats.resident_count + count >= 0,
1593 	    ("pmap %p resident count underflow %ld %d", pmap,
1594 	    pmap->pm_stats.resident_count, count));
1595 	pmap->pm_stats.resident_count += count;
1596 }
1597 
1598 static __inline void
1599 pmap_pt_page_count_adj(pmap_t pmap, int count)
1600 {
1601 	if (pmap == kernel_pmap)
1602 		counter_u64_add(kernel_pt_page_count, count);
1603 	else {
1604 		if (pmap != NULL)
1605 			pmap_resident_count_adj(pmap, count);
1606 		counter_u64_add(user_pt_page_count, count);
1607 	}
1608 }
1609 
1610 pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
1611     NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3;
1612 vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap;
1613 
1614 pt_entry_t *
1615 vtopte(vm_offset_t va)
1616 {
1617 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
1618 
1619 	return ((pt_entry_t *)(PTmap + ((va >> (PAGE_SHIFT - 3)) & vtoptem)));
1620 }
1621 
1622 pd_entry_t vtopdem __read_mostly = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
1623     NPML4EPGSHIFT)) - 1) << 3;
1624 vm_offset_t PDmap __read_mostly = (vm_offset_t)P4Dmap;
1625 
1626 static __inline pd_entry_t *
1627 vtopde(vm_offset_t va)
1628 {
1629 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
1630 
1631 	return ((pt_entry_t *)(PDmap + ((va >> (PDRSHIFT - 3)) & vtopdem)));
1632 }
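/*
 * Sketch of the recursive-mapping arithmetic behind vtopte()/vtopde()
 * above (LA48 case): shifting va right by PAGE_SHIFT - 3 (resp.
 * PDRSHIFT - 3) yields the byte offset of the corresponding PTE (resp.
 * PDE) inside the recursive PTmap/PDmap window, while vtoptem and
 * vtopdem mask away the canonical-address sign-extension bits.
 */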
1633 
1634 static u_int64_t
1635 allocpages(vm_paddr_t *firstaddr, int n)
1636 {
1637 	u_int64_t ret;
1638 
1639 	ret = *firstaddr;
1640 	bzero((void *)ret, n * PAGE_SIZE);
1641 	*firstaddr += n * PAGE_SIZE;
1642 	return (ret);
1643 }
1644 
1645 CTASSERT(powerof2(NDMPML4E));
1646 
1647 /* number of kernel PDP slots */
1648 #define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)
1649 
1650 static void
1651 nkpt_init(vm_paddr_t addr)
1652 {
1653 	int pt_pages;
1654 
1655 #ifdef NKPT
1656 	pt_pages = NKPT;
1657 #else
1658 	pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */
1659 	pt_pages += NKPDPE(pt_pages);
1660 
1661 	/*
1662 	 * Add some slop beyond the bare minimum required for bootstrapping
1663 	 * the kernel.
1664 	 *
1665 	 * This is quite important when allocating KVA for kernel modules.
1666 	 * The modules are required to be linked in the negative 2GB of
1667 	 * the address space.  If we run out of KVA in this region then
1668 	 * pmap_growkernel() will need to allocate page table pages to map
1669 	 * the entire 512GB of KVA space which is an unnecessary tax on
1670 	 * physical memory.
1671 	 *
1672 	 * Secondly, device memory mapped as part of setting up the low-
1673 	 * level console(s) is taken from KVA, starting at virtual_avail.
1674 	 * This is because cninit() is called after pmap_bootstrap() but
1675 	 * before vm_mem_init() and pmap_init(). 20MB for a frame buffer
1676 	 * is not uncommon.
1677 	 */
1678 	pt_pages += 32;		/* 64MB additional slop. */
1679 #endif
1680 	nkpt = pt_pages;
1681 }
1682 
1683 /*
1684  * Returns the proper write/execute permission for a physical page that is
1685  * part of the initial boot allocations.
1686  *
1687  * If the page has kernel text, it is marked as read-only. If the page has
1688  * kernel read-only data, it is marked as read-only/not-executable. If the
1689  * page has only read-write data, it is marked as read-write/not-executable.
1690  * If the page is below/above the kernel range, it is marked as read-write/not-executable.
1691  *
1692  * This function operates on 2M pages, since we map the kernel space that
1693  * way.
1694  */
1695 static inline pt_entry_t
1696 bootaddr_rwx(vm_paddr_t pa)
1697 {
1698 	/*
1699 	 * The kernel is loaded at a 2MB-aligned address, and memory below that
1700 	 * need not be executable.  The .bss section is padded to a 2MB
1701 	 * boundary, so memory following the kernel need not be executable
1702 	 * either.  Preloaded kernel modules have their mapping permissions
1703 	 * fixed up by the linker.
1704 	 */
1705 	if (pa < trunc_2mpage(kernphys + btext - KERNSTART) ||
1706 	    pa >= trunc_2mpage(kernphys + _end - KERNSTART))
1707 		return (X86_PG_RW | pg_nx);
1708 
1709 	/*
1710 	 * The linker should ensure that the read-only and read-write
1711 	 * portions don't share the same 2M page, so this shouldn't
1712 	 * impact read-only data. However, in any case, any page with
1713 	 * read-write data needs to be read-write.
1714 	 */
1715 	if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART))
1716 		return (X86_PG_RW | pg_nx);
1717 
1718 	/*
1719 	 * Mark any 2M page containing kernel text as read-only. Mark
1720 	 * other pages with read-only data as read-only and not executable.
1721 	 * (It is likely a small portion of the read-only data section will
1722 	 * be marked as read-only, but executable. This should be acceptable
1723 	 * since the read-only protection will keep the data from changing.)
1724 	 * Note that fixups to the .text section will still work until we
1725 	 * set CR0.WP.
1726 	 */
1727 	if (pa < round_2mpage(kernphys + etext - KERNSTART))
1728 		return (0);
1729 	return (pg_nx);
1730 }
1731 
1732 extern const char la57_trampoline[];
1733 
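/*
 * Switch the BSP to 5-level (LA57) paging when the CPU supports it and the
 * vm.pmap.la57 tunable allows it.  A single PML5 page is allocated; its
 * first and last slots are pointed at the loader-provided PML4 so that
 * both the low (identity-mapped) and high (kernel) portions of the address
 * space remain valid while the trampoline reloads %cr3.
 */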
1734 static void
1735 pmap_bootstrap_la57(vm_paddr_t *firstaddr)
1736 {
1737 	void (*la57_tramp)(uint64_t pml5);
1738 	pml5_entry_t *pt;
1739 
1740 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0)
1741 		return;
1742 	la57 = 1;
1743 	TUNABLE_INT_FETCH("vm.pmap.la57", &la57);
1744 	if (!la57)
1745 		return;
1746 
1747 	KPML5phys = allocpages(firstaddr, 1);
1748 	KPML4phys = rcr3() & 0xfffff000; /* pml4 from loader must be < 4G */
1749 
1750 	pt = (pml5_entry_t *)KPML5phys;
1751 	pt[0] = KPML4phys | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
1752 	pt[NPML4EPG - 1] = KPML4phys | X86_PG_V | X86_PG_RW | X86_PG_A |
1753 	    X86_PG_M;
1754 
1755 	la57_tramp = (void (*)(uint64_t))((uintptr_t)la57_trampoline -
1756 	    KERNSTART + amd64_loadaddr());
1757 	printf("Calling la57 trampoline at %p, KPML5phys %#lx ...",
1758 	    la57_tramp, KPML5phys);
1759 	la57_tramp(KPML5phys);
1760 	printf(" alive in la57 mode\n");
1761 }
1762 
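/*
 * Build the initial kernel page tables that the system runs on once
 * pmap_bootstrap() switches away from the loader's tables: the direct map
 * (1GB pages where the CPU supports them, 2MB pages otherwise), the 2MB
 * mappings of the kernel itself with permissions from bootaddr_rwx(), the
 * initial KVA page table pages sized by nkpt_init(), the KASAN/KMSAN
 * shadow map skeletons when compiled in, and the recursive slot that backs
 * PTmap/PDmap.
 */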
1763 static void
1764 create_pagetables(vm_paddr_t *firstaddr)
1765 {
1766 	pd_entry_t *pd_p;
1767 	pdp_entry_t *pdp_p;
1768 	pml4_entry_t *p4_p, *p4d_p;
1769 	pml5_entry_t *p5_p;
1770 	uint64_t DMPDkernphys;
1771 	vm_paddr_t pax;
1772 #ifdef KASAN
1773 	pt_entry_t *pt_p;
1774 	uint64_t KASANPDphys, KASANPTphys, KASANphys;
1775 	vm_offset_t kasankernbase;
1776 	int kasankpdpi, kasankpdi, nkasanpte;
1777 #endif
1778 	int i, j, ndm1g, nkpdpe, nkdmpde, ndmpml4phys;
1779 
1780 	TSENTER();
1781 	/* Allocate page table pages for the direct map */
1782 	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
1783 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
1784 		ndmpdp = 4;
1785 	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
1786 	if (la57) {
1787 		ndmpml4phys = howmany(ndmpdpphys, NPML4EPG);
1788 		if (ndmpml4phys > NDMPML5E) {
1789 			printf("NDMPML5E limits system to %ld GB\n",
1790 			    (u_long)NDMPML5E * NBPML5 / 1024 / 1024 / 1024);
1791 			Maxmem = atop(NDMPML5E * NBPML5);
1792 			ndmpml4phys = NDMPML5E;
1793 			ndmpdpphys = ndmpml4phys * NPML4EPG;
1794 			ndmpdp = ndmpdpphys * NPDEPG;
1795 		}
1796 		DMPML4phys = allocpages(firstaddr, ndmpml4phys);
1797 	} else {
1798 		if (ndmpdpphys > NDMPML4E) {
1799 			/*
1800 			 * Each NDMPML4E allows 512 GB, so limit to
1801 			 * that, and then readjust ndmpdp and
1802 			 * ndmpdpphys.
1803 			 */
1804 			printf("NDMPML4E limits system to %d GB\n",
1805 			    NDMPML4E * 512);
1806 			Maxmem = atop(NDMPML4E * NBPML4);
1807 			ndmpdpphys = NDMPML4E;
1808 			ndmpdp = NDMPML4E * NPDEPG;
1809 		}
1810 	}
1811 	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
1812 	ndm1g = 0;
1813 	if ((amd_feature & AMDID_PAGE1GB) != 0) {
1814 		/*
1815 		 * Calculate the number of 1G pages that will fully fit in
1816 		 * Maxmem.
1817 		 */
1818 		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
1819 
1820 		/*
1821 		 * Allocate 2M pages for the kernel. These will be used in
1822 		 * place of the one or more 1G pages from ndm1g that maps
1823 		 * kernel memory into DMAP.
1824 		 */
1825 		nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
1826 		    kernphys - rounddown2(kernphys, NBPDP), NBPDP);
1827 		DMPDkernphys = allocpages(firstaddr, nkdmpde);
1828 	}
1829 	if (ndm1g < ndmpdp)
1830 		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
1831 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
1832 
1833 	/* Allocate pages. */
1834 	if (la57) {
1835 		KPML5phys = allocpages(firstaddr, 1);
1836 		p5_p = (pml5_entry_t *)KPML5phys;
1837 	}
1838 	KPML4phys = allocpages(firstaddr, 1);
1839 	p4_p = (pml4_entry_t *)KPML4phys;
1840 
1841 	KPDPphys = allocpages(firstaddr, NKPML4E);
1842 #ifdef KASAN
1843 	KASANPDPphys = allocpages(firstaddr, NKASANPML4E);
1844 	KASANPDphys = allocpages(firstaddr, 1);
1845 #endif
1846 #ifdef KMSAN
1847 	/*
1848 	 * The KMSAN shadow maps are initially left unpopulated, since there is
1849 	 * no need to shadow memory above KERNBASE.
1850 	 */
1851 	KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E);
1852 	KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E);
1853 #endif
1854 
1855 	/*
1856 	 * Allocate the initial number of kernel page table pages required to
1857 	 * bootstrap.  We defer this until after all memory-size dependent
1858 	 * allocations are done (e.g. direct map), so that we don't have to
1859 	 * build in too much slop in our estimate.
1860 	 *
1861 	 * Note that when NKPML4E > 1, we have an empty page underneath
1862 	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
1863 	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
1864 	 */
1865 	nkpt_init(*firstaddr);
1866 	nkpdpe = NKPDPE(nkpt);
1867 
1868 	KPTphys = allocpages(firstaddr, nkpt);
1869 	KPDphys = allocpages(firstaddr, nkpdpe);
1870 
1871 #ifdef KASAN
1872 	nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE);
1873 	KASANPTphys = allocpages(firstaddr, nkasanpte);
1874 	KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG);
1875 #endif
1876 
1877 	/*
1878 	 * Connect the zero-filled PT pages to their PD entries.  This
1879 	 * implicitly maps the PT pages at their correct locations within
1880 	 * the PTmap.
1881 	 */
1882 	pd_p = (pd_entry_t *)KPDphys;
1883 	for (i = 0; i < nkpt; i++)
1884 		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1885 
1886 	/*
1887 	 * Map from start of the kernel in physical memory (staging
1888 	 * area) to the end of loader preallocated memory using 2MB
1889 	 * pages.  This replaces some of the PD entries created above.
1890 	 * For compatibility, identity map 2M at the start.
1891 	 */
1892 	pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
1893 	    X86_PG_RW | pg_nx;
1894 	for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
1895 		/* Preset PG_M and PG_A because demotion expects it. */
1896 		pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
1897 		    X86_PG_A | bootaddr_rwx(pax);
1898 	}
1899 
1900 	/*
1901 	 * Because we map the physical blocks in 2M pages, adjust firstaddr
1902 	 * to record the physical blocks we've actually mapped into kernel
1903 	 * virtual address space.
1904 	 */
1905 	if (*firstaddr < round_2mpage(KERNend))
1906 		*firstaddr = round_2mpage(KERNend);
1907 
1908 	/* And connect up the PD to the PDP (leaving room for L4 pages) */
1909 	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
1910 	for (i = 0; i < nkpdpe; i++)
1911 		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1912 
1913 #ifdef KASAN
1914 	kasankernbase = kasan_md_addr_to_shad(KERNBASE);
1915 	kasankpdpi = pmap_pdpe_index(kasankernbase);
1916 	kasankpdi = pmap_pde_index(kasankernbase);
1917 
1918 	pdp_p = (pdp_entry_t *)KASANPDPphys;
1919 	pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx);
1920 
1921 	pd_p = (pd_entry_t *)KASANPDphys;
1922 	for (i = 0; i < nkasanpte; i++)
1923 		pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW |
1924 		    X86_PG_V | pg_nx;
1925 
1926 	pt_p = (pt_entry_t *)KASANPTphys;
1927 	for (i = 0; i < nkasanpte * NPTEPG; i++)
1928 		pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
1929 		    X86_PG_M | X86_PG_A | pg_nx;
1930 #endif
1931 
1932 	/*
1933 	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
1934 	 * the end of physical memory is not aligned to a 1GB page boundary,
1935 	 * then the residual physical memory is mapped with 2MB pages.  Later,
1936 	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
1937 	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
1938 	 * that are partially used.
1939 	 */
1940 	pd_p = (pd_entry_t *)DMPDphys;
1941 	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
1942 		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
1943 		/* Preset PG_M and PG_A because demotion expects it. */
1944 		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1945 		    X86_PG_M | X86_PG_A | pg_nx;
1946 	}
1947 	pdp_p = (pdp_entry_t *)DMPDPphys;
1948 	for (i = 0; i < ndm1g; i++) {
1949 		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
1950 		/* Preset PG_M and PG_A because demotion expects it. */
1951 		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1952 		    X86_PG_M | X86_PG_A | pg_nx;
1953 	}
1954 	for (j = 0; i < ndmpdp; i++, j++) {
1955 		pdp_p[i] = DMPDphys + ptoa(j);
1956 		pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx;
1957 	}
1958 
1959 	/*
1960 	 * Connect the Direct Map slots up to the PML4.
1961 	 * pml5 entries for DMAP are handled below in global pml5 loop.
1962 	 */
1963 	p4d_p = la57 ? (pml4_entry_t *)DMPML4phys : &p4_p[DMPML4I];
1964 	for (i = 0; i < ndmpdpphys; i++) {
1965 		p4d_p[i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
1966 		    pg_nx;
1967 	}
1968 
1969 	/*
1970 	 * Instead of using a 1G page for the memory containing the kernel,
1971 	 * use 2M pages with read-only and no-execute permissions.  (If using 1G
1972 	 * pages, this will partially overwrite the PDPEs above.)
1973 	 */
1974 	if (ndm1g > 0) {
1975 		pd_p = (pd_entry_t *)DMPDkernphys;
1976 		for (i = 0, pax = rounddown2(kernphys, NBPDP);
1977 		    i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
1978 			pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
1979 			    X86_PG_A | pg_nx | bootaddr_rwx(pax);
1980 		}
1981 		j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
1982 		for (i = 0; i < nkdmpde; i++) {
1983 			pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
1984 			    X86_PG_RW | X86_PG_V | pg_nx;
1985 		}
1986 	}
1987 
1988 #ifdef KASAN
1989 	/* Connect the KASAN shadow map slots up to the PML4. */
1990 	for (i = 0; i < NKASANPML4E; i++) {
1991 		p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i);
1992 		p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
1993 	}
1994 #endif
1995 
1996 #ifdef KMSAN
1997 	/* Connect the KMSAN shadow map slots up to the PML4. */
1998 	for (i = 0; i < NKMSANSHADPML4E; i++) {
1999 		p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i);
2000 		p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
2001 	}
2002 
2003 	/* Connect the KMSAN origin map slots up to the PML4. */
2004 	for (i = 0; i < NKMSANORIGPML4E; i++) {
2005 		p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i);
2006 		p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
2007 	}
2008 #endif
2009 
2010 	/* Connect the KVA slots up to the PML4 */
2011 	for (i = 0; i < NKPML4E; i++) {
2012 		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
2013 		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
2014 	}
2015 
2016 	if (la57) {
2017 		/* XXXKIB bootstrap KPML5phys page is lost */
2018 		for (i = 0; i < NPML5EPG; i++) {
2019 			if (i == PML5PML5I) {
2020 				/*
2021 				 * Recursively map PML5 to itself in
2022 				 * order to get PTmap and PDmap.
2023 				 */
2024 				p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A |
2025 				    X86_PG_M | X86_PG_V | pg_nx;
2026 			} else if (i >= DMPML5I && i < DMPML5I + ndmpml4phys) {
2027 				/* Connect DMAP pml4 pages to PML5. */
2028 				p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) |
2029 				    X86_PG_RW | X86_PG_V | pg_nx;
2030 			} else if (i == pmap_pml5e_index(UPT_MAX_ADDRESS)) {
2031 				p5_p[i] = KPML4phys | X86_PG_RW | X86_PG_A |
2032 				    X86_PG_M | X86_PG_V;
2033 			} else {
2034 				p5_p[i] = 0;
2035 			}
2036 		}
2037 	} else {
2038 		/* Recursively map PML4 to itself in order to get PTmap */
2039 		p4_p[PML4PML4I] = KPML4phys;
2040 		p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
2041 	}
2042 	TSEXIT();
2043 }
2044 
2045 /*
2046  *	Bootstrap the system enough to run with virtual memory.
2047  *
2048  *	On amd64 this is called after mapping has already been enabled
2049  *	and just syncs the pmap module with what has already been done.
2050  *	[We can't call it easily with mapping off since the kernel is not
2051  *	mapped with PA == VA, hence we would have to relocate every address
2052  *	from the linked base (virtual) address "KERNBASE" to the actual
2053  *	(physical) address starting relative to 0]
2054  */
2055 void
2056 pmap_bootstrap(vm_paddr_t *firstaddr)
2057 {
2058 	vm_offset_t va;
2059 	pt_entry_t *pte, *pcpu_pte;
2060 	struct region_descriptor r_gdt;
2061 	uint64_t cr4, pcpu0_phys;
2062 	u_long res;
2063 	int i;
2064 
2065 	TSENTER();
2066 	KERNend = *firstaddr;
2067 	res = atop(KERNend - (vm_paddr_t)kernphys);
2068 
2069 	if (!pti)
2070 		pg_g = X86_PG_G;
2071 
2072 	/*
2073 	 * Create an initial set of page tables to run the kernel in.
2074 	 */
2075 	pmap_bootstrap_la57(firstaddr);
2076 	create_pagetables(firstaddr);
2077 
2078 	pcpu0_phys = allocpages(firstaddr, 1);
2079 
2080 	/*
2081 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
2082 	 * preallocated kernel page table pages so that vm_page structures
2083 	 * representing these pages will be created.  The vm_page structures
2084 	 * are required for promotion of the corresponding kernel virtual
2085 	 * addresses to superpage mappings.
2086 	 */
2087 	vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt));
2088 
2089 	/*
2090 	 * Account for the virtual addresses mapped by create_pagetables().
2091 	 */
2092 	virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
2093 	    (vm_paddr_t)kernphys);
2094 	virtual_end = kva_layout.km_high;
2095 
2096 	/*
2097 	 * Enable PG_G global pages, then switch to the kernel page
2098 	 * table from the bootstrap page table.  After the switch, it
2099 	 * is possible to enable SMEP and SMAP since PG_U bits are
2100 	 * correct now.
2101 	 */
2102 	cr4 = rcr4();
2103 	cr4 |= CR4_PGE;
2104 	load_cr4(cr4);
2105 	load_cr3(la57 ? KPML5phys : KPML4phys);
2106 	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
2107 		cr4 |= CR4_SMEP;
2108 	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
2109 		cr4 |= CR4_SMAP;
2110 	load_cr4(cr4);
2111 
2112 	/*
2113 	 * Initialize the kernel pmap (which is statically allocated).
2114 	 * Count bootstrap data as being resident in case any of this data is
2115 	 * later unmapped (using pmap_remove()) and freed.
2116 	 *
2117 	 * DMAP_TO_PHYS()/PHYS_TO_DMAP() are functional only after
2118 	 * kva_layout is fixed.
2119 	 */
2120 	PMAP_LOCK_INIT(kernel_pmap);
2121 	if (la57) {
2122 		kva_layout = kva_layout_la57;
2123 		vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
2124 		    NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
2125 		PTmap = (vm_offset_t)P5Tmap;
2126 		vtopdem = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
2127 		    NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
2128 		PDmap = (vm_offset_t)P5Dmap;
2129 		kernel_pmap->pm_pmltop = (void *)PHYS_TO_DMAP(KPML5phys);
2130 		kernel_pmap->pm_cr3 = KPML5phys;
2131 		pmap_pt_page_count_adj(kernel_pmap, 1);	/* top-level page */
2132 	} else {
2133 		kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2134 		kernel_pmap->pm_pmltop = kernel_pml4;
2135 		kernel_pmap->pm_cr3 = KPML4phys;
2136 	}
2137 	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
2138 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
2139 	kernel_pmap->pm_stats.resident_count = res;
2140 	vm_radix_init(&kernel_pmap->pm_root);
2141 	kernel_pmap->pm_flags = pmap_flags;
2142 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
2143 		rangeset_init(&kernel_pmap->pm_pkru, pkru_dup_range,
2144 		    pkru_free_range, kernel_pmap, M_NOWAIT);
2145 	}
2146 
2147 	/*
2148 	 * The kernel pmap is always active on all CPUs.  Once CPUs are
2149 	 * enumerated, the mask will be set equal to all_cpus.
2150 	 */
2151 	CPU_FILL(&kernel_pmap->pm_active);
2152 
2153 	/*
2154 	 * Initialize the TLB invalidations generation number lock.
2155 	 */
2156 	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
2157 
2158 	/*
2159 	 * Reserve some special page table entries/VA space for temporary
2160 	 * mapping of pages.
2161 	 */
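	/*
	 * SYSMAP(c, p, v, n) assigns the next 'n' pages of KVA to 'v' (cast
	 * to type 'c') and the address of the first corresponding PTE to
	 * 'p', advancing the local 'va' and 'pte' cursors.
	 */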
2162 #define	SYSMAP(c, p, v, n)	\
2163 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
2164 
2165 	va = virtual_avail;
2166 	pte = vtopte(va);
2167 
2168 	/*
2169 	 * Crashdump maps.  The first page is reused as CMAP1 for the
2170 	 * memory test.
2171 	 */
2172 	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
2173 	CADDR1 = crashdumpmap;
2174 
2175 	SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
2176 	virtual_avail = va;
2177 
2178 	/*
2179 	 * Map the BSP PCPU now, the rest of the PCPUs are mapped by
2180 	 * amd64_mp_alloc_pcpu()/start_all_aps() when we know the
2181 	 * number of CPUs and NUMA affinity.
2182 	 */
2183 	pcpu_pte[0] = pcpu0_phys | X86_PG_V | X86_PG_RW | pg_g | pg_nx |
2184 	    X86_PG_M | X86_PG_A;
2185 	for (i = 1; i < MAXCPU; i++)
2186 		pcpu_pte[i] = 0;
2187 
2188 	/*
2189 	 * Re-initialize PCPU area for BSP after switching.
2190 	 * Make hardware use gdt and common_tss from the new PCPU.
2191 	 * Also clears the usage of temporary gdt during switch to
2192 	 * LA57 paging.
2193 	 */
2194 	STAILQ_INIT(&cpuhead);
2195 	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
2196 	pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu));
2197 	amd64_bsp_pcpu_init1(&__pcpu[0]);
2198 	amd64_bsp_ist_init(&__pcpu[0]);
2199 	__pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
2200 	    IOPERM_BITMAP_SIZE;
2201 	memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT *
2202 	    sizeof(struct user_segment_descriptor));
2203 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss;
2204 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
2205 	    (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
2206 	r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
2207 	r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
2208 	lgdt(&r_gdt);
2209 	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
2210 	ltr(GSEL(GPROC0_SEL, SEL_KPL));
2211 	__pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic;
2212 	__pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id;
2213 
2214 	/*
2215 	 * Initialize the PAT MSR.
2216 	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
2217 	 * side-effect, invalidates stale PG_G TLB entries that might
2218 	 * have been created in our pre-boot environment.
2219 	 */
2220 	pmap_init_pat();
2221 
2222 	/* Initialize TLB Context Id. */
2223 	if (pmap_pcid_enabled) {
2224 		kernel_pmap->pm_pcidp = (void *)(uintptr_t)
2225 		    offsetof(struct pcpu, pc_kpmap_store);
2226 
2227 		PCPU_SET(kpmap_store.pm_pcid, PMAP_PCID_KERN);
2228 		PCPU_SET(kpmap_store.pm_gen, 1);
2229 
2230 		/*
2231 		 * PMAP_PCID_KERN + 1 is used for initialization of
2232 		 * proc0 pmap.  The pmap's pcid state might be used by
2233 		 * EFIRT entry before first context switch, so it
2234 		 * needs to be valid.
2235 		 */
2236 		PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
2237 		PCPU_SET(pcid_gen, 1);
2238 
2239 		/*
2240 		 * pcpu area for APs is zeroed during AP startup.
2241 		 * pc_pcid_next and pc_pcid_gen are initialized by AP
2242 		 * during pcpu setup.
2243 		 */
2244 		load_cr4(rcr4() | CR4_PCIDE);
2245 	}
2246 	TSEXIT();
2247 }
2248 
2249 /*
2250  * Setup the PAT MSR.
2251  */
2252 void
2253 pmap_init_pat(void)
2254 {
2255 	uint64_t pat_msr;
2256 	u_long cr0, cr4;
2257 	int i;
2258 
2259 	/* Bail if this CPU doesn't implement PAT. */
2260 	if ((cpu_feature & CPUID_PAT) == 0)
2261 		panic("no PAT??");
2262 
2263 	/* Set default PAT index table. */
2264 	for (i = 0; i < PAT_INDEX_SIZE; i++)
2265 		pat_index[i] = -1;
2266 	pat_index[PAT_WRITE_BACK] = 0;
2267 	pat_index[PAT_WRITE_THROUGH] = 1;
2268 	pat_index[PAT_UNCACHEABLE] = 3;
2269 	pat_index[PAT_WRITE_COMBINING] = 6;
2270 	pat_index[PAT_WRITE_PROTECTED] = 5;
2271 	pat_index[PAT_UNCACHED] = 2;
2272 
2273 	/*
2274 	 * Initialize default PAT entries.
2275 	 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
2276 	 * Program 5 and 6 as WP and WC.
2277 	 *
2278 	 * Leave 4 and 7 as WB and UC.  Note that a recursive page table
2279 	 * mapping for a 2M page uses a PAT value with bit 3 set due
2280 	 * to its overload with PG_PS.
2281 	 */
2282 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
2283 	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
2284 	    PAT_VALUE(2, PAT_UNCACHED) |
2285 	    PAT_VALUE(3, PAT_UNCACHEABLE) |
2286 	    PAT_VALUE(4, PAT_WRITE_BACK) |
2287 	    PAT_VALUE(5, PAT_WRITE_PROTECTED) |
2288 	    PAT_VALUE(6, PAT_WRITE_COMBINING) |
2289 	    PAT_VALUE(7, PAT_UNCACHEABLE);
2290 
2291 	/* Disable PGE. */
2292 	cr4 = rcr4();
2293 	load_cr4(cr4 & ~CR4_PGE);
2294 
2295 	/* Disable caches (CD = 1, NW = 0). */
2296 	cr0 = rcr0();
2297 	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
2298 
2299 	/* Flushes caches and TLBs. */
2300 	wbinvd();
2301 	invltlb();
2302 
2303 	/* Update PAT and index table. */
2304 	wrmsr(MSR_PAT, pat_msr);
2305 
2306 	/* Flush caches and TLBs again. */
2307 	wbinvd();
2308 	invltlb();
2309 
2310 	/* Restore caches and PGE. */
2311 	load_cr0(cr0);
2312 	load_cr4(cr4);
2313 }
2314 
2315 vm_page_t
2316 pmap_page_alloc_below_4g(bool zeroed)
2317 {
2318 	return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0),
2319 	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT));
2320 }
2321 
2322 /*
2323  *	Initialize a vm_page's machine-dependent fields.
2324  */
2325 void
2326 pmap_page_init(vm_page_t m)
2327 {
2328 
2329 	TAILQ_INIT(&m->md.pv_list);
2330 	m->md.pat_mode = PAT_WRITE_BACK;
2331 }
2332 
2333 static int pmap_allow_2m_x_ept;
2334 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
2335     &pmap_allow_2m_x_ept, 0,
2336     "Allow executable superpage mappings in EPT");
2337 
2338 void
2339 pmap_allow_2m_x_ept_recalculate(void)
2340 {
2341 	/*
2342 	 * SKL002, SKL012S.  Since the EPT format is only used by
2343 	 * Intel CPUs, the vendor check is merely a formality.
2344 	 */
2345 	if (!(cpu_vendor_id != CPU_VENDOR_INTEL ||
2346 	    (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 ||
2347 	    (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
2348 	    (CPUID_TO_MODEL(cpu_id) == 0x26 ||	/* Atoms */
2349 	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
2350 	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
2351 	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
2352 	    CPUID_TO_MODEL(cpu_id) == 0x37 ||
2353 	    CPUID_TO_MODEL(cpu_id) == 0x86 ||
2354 	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
2355 	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
2356 	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
2357 	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
2358 	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
2359 	    CPUID_TO_MODEL(cpu_id) == 0x5c ||
2360 	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
2361 	    CPUID_TO_MODEL(cpu_id) == 0x5f ||
2362 	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
2363 	    CPUID_TO_MODEL(cpu_id) == 0x7a ||
2364 	    CPUID_TO_MODEL(cpu_id) == 0x57 ||	/* Knights */
2365 	    CPUID_TO_MODEL(cpu_id) == 0x85))))
2366 		pmap_allow_2m_x_ept = 1;
2367 #ifndef BURN_BRIDGES
2368 	TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept);
2369 #endif
2370 	TUNABLE_INT_FETCH("vm.pmap.allow_2m_x_ept", &pmap_allow_2m_x_ept);
2371 }
2372 
2373 static bool
2374 pmap_allow_2m_x_page(pmap_t pmap, bool executable)
2375 {
2376 
2377 	return (pmap->pm_type != PT_EPT || !executable ||
2378 	    !pmap_allow_2m_x_ept);
2379 }
2380 
2381 #ifdef NUMA
2382 static void
2383 pmap_init_pv_table(void)
2384 {
2385 	struct pmap_large_md_page *pvd;
2386 	vm_size_t s;
2387 	long start, end, highest, pv_npg;
2388 	int domain, i, j, pages;
2389 
2390 	/*
2391 	 * For correctness we depend on the size being evenly divisible into a
2392 	 * page. As a tradeoff between performance and total memory use, the
2393 	 * entry is 64 bytes (aka one cacheline) in size. Not being smaller
2394 	 * avoids false-sharing, but not being 128 bytes potentially allows for
2395 	 * avoidable traffic due to adjacent cacheline prefetcher.
2396 	 *
2397 	 * Assert the size so that accidental changes fail to compile.
2398 	 */
2399 	CTASSERT((sizeof(*pvd) == 64));
2400 
2401 	/*
2402 	 * Calculate the size of the array.
2403 	 */
2404 	pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end;
2405 	pv_npg = howmany(pmap_last_pa, NBPDR);
2406 	s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page);
2407 	s = round_page(s);
2408 	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
2409 	if (pv_table == NULL)
2410 		panic("%s: kva_alloc failed\n", __func__);
2411 
2412 	/*
2413 	 * Iterate physical segments to allocate space for respective pages.
2414 	 */
2415 	highest = -1;
2416 	s = 0;
2417 	for (i = 0; i < vm_phys_nsegs; i++) {
2418 		end = vm_phys_segs[i].end / NBPDR;
2419 		domain = vm_phys_segs[i].domain;
2420 
2421 		if (highest >= end)
2422 			continue;
2423 
2424 		start = highest + 1;
2425 		pvd = &pv_table[start];
2426 
2427 		pages = end - start + 1;
2428 		s = round_page(pages * sizeof(*pvd));
2429 		highest = start + (s / sizeof(*pvd)) - 1;
2430 
2431 		for (j = 0; j < s; j += PAGE_SIZE) {
2432 			vm_page_t m = vm_page_alloc_noobj_domain(domain, 0);
2433 			if (m == NULL)
2434 				panic("failed to allocate PV table page");
2435 			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
2436 		}
2437 
2438 		for (j = 0; j < s / sizeof(*pvd); j++) {
2439 			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
2440 			TAILQ_INIT(&pvd->pv_page.pv_list);
2441 			pvd->pv_page.pv_gen = 0;
2442 			pvd->pv_page.pat_mode = 0;
2443 			pvd->pv_invl_gen = 0;
2444 			pvd++;
2445 		}
2446 	}
2447 	pvd = &pv_dummy_large;
2448 	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
2449 	TAILQ_INIT(&pvd->pv_page.pv_list);
2450 	pvd->pv_page.pv_gen = 0;
2451 	pvd->pv_page.pat_mode = 0;
2452 	pvd->pv_invl_gen = 0;
2453 }
2454 #else
2455 static void
2456 pmap_init_pv_table(void)
2457 {
2458 	vm_size_t s;
2459 	long i, pv_npg;
2460 
2461 	/*
2462 	 * Initialize the pool of pv list locks.
2463 	 */
2464 	for (i = 0; i < NPV_LIST_LOCKS; i++)
2465 		rw_init(&pv_list_locks[i], "pmap pv list");
2466 
2467 	/*
2468 	 * Calculate the size of the pv head table for superpages.
2469 	 */
2470 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
2471 
2472 	/*
2473 	 * Allocate memory for the pv head table for superpages.
2474 	 */
2475 	s = (vm_size_t)pv_npg * sizeof(struct md_page);
2476 	s = round_page(s);
2477 	pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
2478 	for (i = 0; i < pv_npg; i++)
2479 		TAILQ_INIT(&pv_table[i].pv_list);
2480 	TAILQ_INIT(&pv_dummy.pv_list);
2481 }
2482 #endif
2483 
2484 /*
2485  *	Initialize the pmap module.
2486  *
2487  *	Called by vm_mem_init(), to initialize any structures that the pmap
2488  *	system needs to map virtual memory.
2489  */
2490 void
2491 pmap_init(void)
2492 {
2493 	struct pmap_preinit_mapping *ppim;
2494 	vm_page_t m, mpte;
2495 	pml4_entry_t *pml4e;
2496 	unsigned long lm_max;
2497 	int error, i, ret, skz63;
2498 
2499 	/* L1TF, reserve page @0 unconditionally */
2500 	vm_page_blacklist_add(0, bootverbose);
2501 
2502 	/* Detect bare-metal Skylake Server and Skylake-X. */
2503 	if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
2504 	    CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
2505 		/*
2506 		 * Skylake-X errata SKZ63. Processor May Hang When
2507 		 * Executing Code In an HLE Transaction Region between
2508 		 * 40000000H and 403FFFFFH.
2509 		 *
2510 		 * Mark the pages in the range as preallocated.  It
2511 		 * seems to be impossible to distinguish between
2512 		 * Skylake Server and Skylake X.
2513 		 */
2514 		skz63 = 1;
2515 		TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
2516 		if (skz63 != 0) {
2517 			if (bootverbose)
2518 				printf("SKZ63: skipping 4M RAM starting "
2519 				    "at physical 1G\n");
2520 			for (i = 0; i < atop(0x400000); i++) {
2521 				ret = vm_page_blacklist_add(0x40000000 +
2522 				    ptoa(i), false);
2523 				if (!ret && bootverbose)
2524 					printf("page at %#x already used\n",
2525 					    0x40000000 + ptoa(i));
2526 			}
2527 		}
2528 	}
2529 
2530 	/* IFU */
2531 	pmap_allow_2m_x_ept_recalculate();
2532 
2533 	/*
2534 	 * Initialize the vm page array entries for the kernel pmap's
2535 	 * page table pages.
2536 	 */
2537 	PMAP_LOCK(kernel_pmap);
2538 	for (i = 0; i < nkpt; i++) {
2539 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
2540 		KASSERT(mpte >= vm_page_array &&
2541 		    mpte < &vm_page_array[vm_page_array_size],
2542 		    ("pmap_init: page table page is out of range"));
2543 		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
2544 		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
2545 		mpte->ref_count = 1;
2546 
2547 		/*
2548 		 * Collect the page table pages that were replaced by a 2MB
2549 		 * page in create_pagetables().  They are zero filled.
2550 		 */
2551 		if ((i == 0 ||
2552 		    kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
2553 		    pmap_insert_pt_page(kernel_pmap, mpte, false, false))
2554 			panic("pmap_init: pmap_insert_pt_page failed");
2555 	}
2556 	PMAP_UNLOCK(kernel_pmap);
2557 	vm_wire_add(nkpt);
2558 
2559 	/*
2560 	 * If the kernel is running on a virtual machine, then it must assume
2561 	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
2562 	 * be prepared for the hypervisor changing the vendor and family that
2563 	 * are reported by CPUID.  Consequently, the workaround for AMD Family
2564 	 * 10h Erratum 383 is enabled if the processor's feature set does not
2565 	 * include at least one feature that is only supported by older Intel
2566 	 * or newer AMD processors.
2567 	 */
2568 	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
2569 	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
2570 	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
2571 	    AMDID2_FMA4)) == 0)
2572 		workaround_erratum383 = 1;
2573 
2574 	/*
2575 	 * Are large page mappings enabled?
2576 	 */
2577 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
2578 	if (pg_ps_enabled) {
2579 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
2580 		    ("pmap_init: can't assign to pagesizes[1]"));
2581 		pagesizes[1] = NBPDR;
2582 		if ((amd_feature & AMDID_PAGE1GB) != 0) {
2583 			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
2584 			    ("pmap_init: can't assign to pagesizes[2]"));
2585 			pagesizes[2] = NBPDP;
2586 		}
2587 	}
2588 
2589 	/*
2590 	 * Initialize pv chunk lists.
2591 	 */
2592 	for (i = 0; i < PMAP_MEMDOM; i++) {
2593 		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF);
2594 		TAILQ_INIT(&pv_chunks[i].pvc_list);
2595 	}
2596 	pmap_init_pv_table();
2597 
2598 	pmap_initialized = 1;
2599 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
2600 		ppim = pmap_preinit_mapping + i;
2601 		if (ppim->va == 0)
2602 			continue;
2603 		/* Make the direct map consistent */
2604 		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
2605 			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
2606 			    ppim->sz, ppim->mode);
2607 		}
2608 		if (!bootverbose)
2609 			continue;
2610 		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
2611 		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
2612 	}
2613 
2614 	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
2615 	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
2616 	    (vmem_addr_t *)&qframe);
2617 	if (error != 0)
2618 		panic("qframe allocation failed");
2619 
2620 	lm_ents = 8;
2621 	TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
2622 	lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4;
2623 	if (lm_ents > lm_max) {
2624 		printf(
2625 	    "pmap: shrinking large map from requested %d slots to %ld slots\n",
2626 		    lm_ents, lm_max);
2627 		lm_ents = lm_max;
2628 	}
2629 #ifdef KMSAN
2630 	if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) {
2631 		printf(
2632 	    "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
2633 		    lm_ents, KMSANORIGPML4I - LMSPML4I);
2634 		lm_ents = KMSANORIGPML4I - LMSPML4I;
2635 	}
2636 #endif
2637 	if (bootverbose)
2638 		printf("pmap: large map %u PML4 slots (%lu GB)\n",
2639 		    lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
2640 	if (lm_ents != 0) {
2641 		large_vmem = vmem_create("large", kva_layout.lm_low,
2642 		    (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
2643 		if (large_vmem == NULL) {
2644 			printf("pmap: cannot create large map\n");
2645 			lm_ents = 0;
2646 		}
2647 		if (la57) {
2648 			for (i = 0; i < howmany((vm_offset_t)NBPML4 *
2649 			    lm_ents, NBPML5); i++) {
2650 				m = pmap_large_map_getptp_unlocked();
2651 				kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V |
2652 				    X86_PG_RW | X86_PG_A | X86_PG_M |
2653 				    pg_nx | VM_PAGE_TO_PHYS(m);
2654 			}
2655 		}
2656 		for (i = 0; i < lm_ents; i++) {
2657 			m = pmap_large_map_getptp_unlocked();
2658 			pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low +
2659 			    (u_long)i * NBPML4);
2660 			*pml4e = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M |
2661 			    pg_nx | VM_PAGE_TO_PHYS(m);
2662 		}
2663 	}
2664 }
2665 
2666 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries,
2667     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0,
2668     "Maximum number of PML4 entries for use by large map (tunable).  "
2669     "Each entry corresponds to 512GB of address space.");
2670 
2671 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
2672     "2MB page mapping counters");
2673 
2674 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions);
2675 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions,
2676     CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions");
2677 
2678 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings);
2679 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
2680     &pmap_pde_mappings, "2MB page mappings");
2681 
2682 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures);
2683 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
2684     &pmap_pde_p_failures, "2MB page promotion failures");
2685 
2686 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions);
2687 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
2688     &pmap_pde_promotions, "2MB page promotions");
2689 
2690 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
2691     "1GB page mapping counters");
2692 
2693 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions);
2694 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
2695     &pmap_pdpe_demotions, "1GB page demotions");
2696 
2697 /***************************************************
2698  * Low level helper routines.....
2699  ***************************************************/
2700 
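/*
 * In a 4KB PTE the PAT index bit is bit 7 (X86_PG_PTE_PAT), but in a 2MB
 * or 1GB leaf entry bit 7 is PG_PS, so the PAT bit moves to bit 12
 * (X86_PG_PDE_PAT).  pmap_swap_pat() relocates the bit when a mapping
 * changes page size.  EPT encodes memory types the same way at every
 * level, so no adjustment is needed there.
 */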
2701 static pt_entry_t
2702 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
2703 {
2704 	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
2705 
2706 	switch (pmap->pm_type) {
2707 	case PT_X86:
2708 	case PT_RVI:
2709 		/* Verify that both PAT bits are not set at the same time */
2710 		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
2711 		    ("Invalid PAT bits in entry %#lx", entry));
2712 
2713 		/* Swap the PAT bits if one of them is set */
2714 		if ((entry & x86_pat_bits) != 0)
2715 			entry ^= x86_pat_bits;
2716 		break;
2717 	case PT_EPT:
2718 		/*
2719 		 * Nothing to do - the memory attributes are represented
2720 		 * the same way for regular pages and superpages.
2721 		 */
2722 		break;
2723 	default:
2724 		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
2725 	}
2726 
2727 	return (entry);
2728 }
2729 
2730 bool
2731 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
2732 {
2733 
2734 	return (mode >= 0 && mode < PAT_INDEX_SIZE &&
2735 	    pat_index[(int)mode] >= 0);
2736 }
2737 
2738 /*
2739  * Determine the appropriate bits to set in a PTE or PDE for a specified
2740  * caching mode.
2741  */
2742 int
2743 pmap_cache_bits(pmap_t pmap, int mode, bool is_pde)
2744 {
2745 	int cache_bits, pat_flag, pat_idx;
2746 
2747 	if (!pmap_is_valid_memattr(pmap, mode))
2748 		panic("Unknown caching mode %d\n", mode);
2749 
2750 	switch (pmap->pm_type) {
2751 	case PT_X86:
2752 	case PT_RVI:
2753 		/* The PAT bit is different for PTE's and PDE's. */
2754 		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
2755 
2756 		/* Map the caching mode to a PAT index. */
2757 		pat_idx = pat_index[mode];
2758 
2759 		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
2760 		cache_bits = 0;
2761 		if (pat_idx & 0x4)
2762 			cache_bits |= pat_flag;
2763 		if (pat_idx & 0x2)
2764 			cache_bits |= PG_NC_PCD;
2765 		if (pat_idx & 0x1)
2766 			cache_bits |= PG_NC_PWT;
2767 		break;
2768 
2769 	case PT_EPT:
2770 		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
2771 		break;
2772 
2773 	default:
2774 		panic("unsupported pmap type %d", pmap->pm_type);
2775 	}
2776 
2777 	return (cache_bits);
2778 }
2779 
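/*
 * Return the mask of all PTE/PDE bits that encode the caching attribute
 * for the given pmap type, so that callers can clear the old attribute
 * before installing a new one.
 */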
2780 static int
2781 pmap_cache_mask(pmap_t pmap, bool is_pde)
2782 {
2783 	int mask;
2784 
2785 	switch (pmap->pm_type) {
2786 	case PT_X86:
2787 	case PT_RVI:
2788 		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
2789 		break;
2790 	case PT_EPT:
2791 		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
2792 		break;
2793 	default:
2794 		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
2795 	}
2796 
2797 	return (mask);
2798 }
2799 
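/*
 * Inverse of pmap_cache_bits(): recover the PAT index encoded in an
 * existing PTE or PDE.  Indices 4 and 7 are programmed as aliases of 0
 * and 3 by pmap_init_pat(), so they are folded back here.
 */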
2800 static int
2801 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
2802 {
2803 	int pat_flag, pat_idx;
2804 
2805 	pat_idx = 0;
2806 	switch (pmap->pm_type) {
2807 	case PT_X86:
2808 	case PT_RVI:
2809 		/* The PAT bit is different for PTE's and PDE's. */
2810 		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
2811 
2812 		if ((pte & pat_flag) != 0)
2813 			pat_idx |= 0x4;
2814 		if ((pte & PG_NC_PCD) != 0)
2815 			pat_idx |= 0x2;
2816 		if ((pte & PG_NC_PWT) != 0)
2817 			pat_idx |= 0x1;
2818 		break;
2819 	case PT_EPT:
2820 		if ((pte & EPT_PG_IGNORE_PAT) != 0)
2821 			panic("EPT PTE %#lx has no PAT memory type", pte);
2822 		pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3;
2823 		break;
2824 	}
2825 
2826 	/* See pmap_init_pat(). */
2827 	if (pat_idx == 4)
2828 		pat_idx = 0;
2829 	if (pat_idx == 7)
2830 		pat_idx = 3;
2831 
2832 	return (pat_idx);
2833 }
2834 
2835 bool
2836 pmap_ps_enabled(pmap_t pmap)
2837 {
2838 
2839 	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
2840 }
2841 
2842 static void
2843 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
2844 {
2845 
2846 	switch (pmap->pm_type) {
2847 	case PT_X86:
2848 		break;
2849 	case PT_RVI:
2850 	case PT_EPT:
2851 		/*
2852 		 * XXX
2853 		 * This is a little bogus since the generation number is
2854 		 * supposed to be bumped up when a region of the address
2855 		 * space is invalidated in the page tables.
2856 		 *
2857 		 * In this case the old PDE entry is valid but yet we want
2858 		 * to make sure that any mappings using the old entry are
2859 		 * invalidated in the TLB.
2860 		 *
2861 		 * The reason this works as expected is because we rendezvous
2862 		 * "all" host cpus and force any vcpu context to exit as a
2863 		 * side-effect.
2864 		 */
2865 		atomic_add_long(&pmap->pm_eptgen, 1);
2866 		break;
2867 	default:
2868 		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
2869 	}
2870 	pde_store(pde, newpde);
2871 }
2872 
2873 /*
2874  * After changing the page size for the specified virtual address in the page
2875  * table, flush the corresponding entries from the processor's TLB.  Only the
2876  * calling processor's TLB is affected.
2877  *
2878  * The calling thread must be pinned to a processor.
2879  */
2880 static void
2881 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
2882 {
2883 	pt_entry_t PG_G;
2884 
2885 	if (pmap_type_guest(pmap))
2886 		return;
2887 
2888 	KASSERT(pmap->pm_type == PT_X86,
2889 	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
2890 
2891 	PG_G = pmap_global_bit(pmap);
2892 
2893 	if ((newpde & PG_PS) == 0)
2894 		/* Demotion: flush a specific 2MB page mapping. */
2895 		pmap_invlpg(pmap, va);
2896 	else if ((newpde & PG_G) == 0)
2897 		/*
2898 		 * Promotion: flush every 4KB page mapping from the TLB
2899 		 * because there are too many to flush individually.
2900 		 */
2901 		invltlb();
2902 	else {
2903 		/*
2904 		 * Promotion: flush every 4KB page mapping from the TLB,
2905 		 * including any global (PG_G) mappings.
2906 		 */
2907 		invltlb_glob();
2908 	}
2909 }
2910 
2911 /*
2912  * The amd64 pmap uses different approaches to TLB invalidation
2913  * depending on the kernel configuration, available hardware features,
2914  * and known hardware errata.  The kernel configuration option that
2915  * has the greatest operational impact on TLB invalidation is PTI,
2916  * which is enabled automatically on affected Intel CPUs.  The most
2917  * impactful hardware features are first PCID, and then INVPCID
2918  * instruction presence.  PCID usage is quite different for PTI
2919  * vs. non-PTI.
2920  *
2921  * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate
2922  *   the Meltdown bug in some Intel CPUs.  Under PTI, each user address
2923  *   space is served by two page tables, user and kernel.  The user
2924  *   page table only maps user space and a kernel trampoline.  The
2925  *   kernel trampoline includes the entirety of the kernel text but
2926  *   only the kernel data that is needed to switch from user to kernel
2927  *   mode.  The kernel page table maps the user and kernel address
2928  *   spaces in their entirety.  It is identical to the per-process
2929  *   page table used in non-PTI mode.
2930  *
2931  *   User page tables are only used when the CPU is in user mode.
2932  *   Consequently, some TLB invalidations can be postponed until the
2933  *   switch from kernel to user mode.  In contrast, the user
2934  *   space part of the kernel page table is used for copyout(9), so
2935  *   TLB invalidations on this page table cannot be similarly postponed.
2936  *
2937  *   The existence of a user mode page table for the given pmap is
2938  *   indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in
2939  *   which case pm_ucr3 contains the %cr3 register value for the user
2940  *   mode page table's root.
2941  *
2942  * * The pm_active bitmask indicates which CPUs currently have the
2943  *   pmap active.  A CPU's bit is set on context switch to the pmap, and
2944  *   cleared on switching off this CPU.  For the kernel page table,
2945  *   the pm_active field is immutable and contains all CPUs.  The
2946  *   kernel page table is always logically active on every processor,
2947  *   but not necessarily in use by the hardware, e.g., in PTI mode.
2948  *
2949  *   When requesting invalidation of virtual addresses with
2950  *   pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to
2951  *   all CPUs recorded as active in pm_active.  Updates to and reads
2952  *   from pm_active are not synchronized, and so they may race with
2953  *   each other.  Shootdown handlers are prepared to handle the race.
2954  *
2955  * * PCID is an optional feature of the long mode x86 MMU where TLB
2956  *   entries are tagged with the 'Process ID' of the address space
2957  *   they belong to.  This feature provides a limited namespace for
2958  *   process identifiers, 12 bits, supporting 4095 simultaneous IDs
2959  *   total.
2960  *
2961  *   Allocation of a PCID to a pmap is done by an algorithm described
2962  *   in section 15.12, "Other TLB Consistency Algorithms", of
2963  *   Vahalia's book "Unix Internals".  A PCID cannot be allocated for
2964  *   the whole lifetime of a pmap in pmap_pinit() due to the limited
2965  *   namespace.  Instead, a per-CPU, per-pmap PCID is assigned when
2966  *   the CPU is about to start caching TLB entries from a pmap,
2967  *   i.e., on the context switch that activates the pmap on the CPU.
2968  *
2969  *   The PCID allocator maintains a per-CPU, per-pmap generation
2970  *   count, pm_gen, which is incremented each time a new PCID is
2971  *   allocated.  On TLB invalidation, the generation counters for the
2972  *   pmap are zeroed, which signals the context switch code that the
2973  *   previously allocated PCID is no longer valid.  Effectively,
2974  *   zeroing any of these counters triggers a TLB shootdown for the
2975  *   given CPU/address space, due to the allocation of a new PCID.
2976  *
2977  *   Zeroing can be performed remotely.  Consequently, if a pmap is
2978  *   inactive on a CPU, then a TLB shootdown for that pmap and CPU can
2979  *   be initiated by an ordinary memory access to reset the target
2980  *   CPU's generation count within the pmap.  The CPU initiating the
2981  *   TLB shootdown does not need to send an IPI to the target CPU.
2982  *
2983  * * PTI + PCID.  The available PCIDs are divided into two sets: PCIDs
2984  *   for complete (kernel) page tables, and PCIDs for user mode page
2985  *   tables.  A user PCID value is obtained from the kernel PCID value
2986  *   by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT).
2987  *
2988  *   User space page tables are activated on return to user mode, by
2989  *   loading pm_ucr3 into %cr3.  If the PCPU(ucr3_load_mask) requests
2990  *   clearing bit 63 of the loaded ucr3, this effectively causes
2991  *   complete invalidation of the user mode TLB entries for the
2992  *   current pmap, in which case local invalidations of individual
2993  *   pages in the user page table are skipped.
2994  *
2995  * * Local invalidation, all modes.  If the requested invalidation is
2996  *   for a specific address or the total invalidation of a currently
2997  *   active pmap, then the TLB is flushed using INVLPG for a kernel
2998  *   page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for
2999  *   user space page tables.
3000  *
3001  *   If the INVPCID instruction is available, it is used to flush user
3002  *   entries from the kernel page table.
3003  *
3004  *   When PCID is enabled, the INVLPG instruction invalidates all TLB
3005  *   entries for the given page that either match the current PCID or
3006  *   are global. Since TLB entries for the same page under different
3007  *   PCIDs are unaffected, kernel pages which reside in all address
3008  *   spaces could be problematic.  We avoid the problem by creating
3009  *   all kernel PTEs with the global flag (PG_G) set, when PTI is
3010  *   disabled.
3011  *
3012  * * mode: PTI disabled, PCID present.  The kernel reserves PCID 0 for its
3013  *   address space, all other 4095 PCIDs are used for user mode spaces
3014  *   as described above.  A context switch allocates a new PCID if
3015  *   the recorded PCID is zero or the recorded generation does not match
3016  *   the CPU's generation, effectively flushing the TLB for this address space.
3017  *   Total remote invalidation is performed by zeroing pm_gen for all CPUs.
3018  *	local user page: INVLPG
3019  *	local kernel page: INVLPG
3020  *	local user total: INVPCID(CTX)
3021  *	local kernel total: INVPCID(CTXGLOB) or invltlb_glob()
3022  *	remote user page, inactive pmap: zero pm_gen
3023  *	remote user page, active pmap: zero pm_gen + IPI:INVLPG
3024  *	(Both actions are required to handle the aforementioned pm_active races.)
3025  *	remote kernel page: IPI:INVLPG
3026  *	remote user total, inactive pmap: zero pm_gen
3027  *	remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or
3028  *          reload %cr3)
3029  *	(See note above about pm_active races.)
3030  *	remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob())
3031  *
3032  * PTI enabled, PCID present.
3033  *	local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3)
3034  *          for upt
3035  *	local kernel page: INVLPG
3036  *	local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE
3037  *          on loading UCR3 into %cr3 for upt
3038  *	local kernel total: INVPCID(CTXGLOB) or invltlb_glob()
3039  *	remote user page, inactive pmap: zero pm_gen
3040  *	remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt,
3041  *          INVPCID(ADDR) for upt)
3042  *	remote kernel page: IPI:INVLPG
3043  *	remote user total, inactive pmap: zero pm_gen
3044  *	remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt,
3045  *          clear PCID_SAVE on loading UCR3 into $cr3 for upt)
3046  *	remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob())
3047  *
3048  *  No PCID.
3049  *	local user page: INVLPG
3050  *	local kernel page: INVLPG
3051  *	local user total: reload %cr3
3052  *	local kernel total: invltlb_glob()
3053  *	remote user page, inactive pmap: -
3054  *	remote user page, active pmap: IPI:INVLPG
3055  *	remote kernel page: IPI:INVLPG
3056  *	remote user total, inactive pmap: -
3057  *	remote user total, active pmap: IPI:(reload %cr3)
3058  *	remote kernel total: IPI:invltlb_glob()
3059  *  Since on return to user mode, the reload of %cr3 with ucr3 causes
3060  *  TLB invalidation, no specific action is required for the user page table.
3061  *
3062  * EPT.  EPT pmaps do not map KVA, all mappings are userspace.
3063  * XXX TODO
3064  */
3065 
3066 /*
3067  * Interrupt the cpus that are executing in the guest context.
3068  * This will force the vcpu to exit and the cached EPT mappings
3069  * will be invalidated by the host before the next vmresume.
3070  */
3071 static __inline void
3072 pmap_invalidate_ept(pmap_t pmap)
3073 {
3074 	smr_seq_t goal;
3075 	int ipinum;
3076 
3077 	sched_pin();
3078 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
3079 	    ("pmap_invalidate_ept: absurd pm_active"));
3080 
3081 	/*
3082 	 * The TLB mappings associated with a vcpu context are not
3083 	 * flushed each time a different vcpu is chosen to execute.
3084 	 *
3085 	 * This is in contrast with a process's vtop mappings that
3086 	 * are flushed from the TLB on each context switch.
3087 	 *
3088 	 * Therefore we need to do more than just a TLB shootdown on
3089 	 * the active cpus in 'pmap->pm_active'. To do this we keep
3090 	 * track of the number of invalidations performed on this pmap.
3091 	 *
3092 	 * Each vcpu keeps a cache of this counter and compares it
3093 	 * just before a vmresume. If the counter is out-of-date an
3094 	 * invept will be done to flush stale mappings from the TLB.
3095 	 *
3096 	 * To ensure that all vCPU threads have observed the new counter
3097 	 * value before returning, we use SMR.  Ordering is important here:
3098 	 * the VMM enters an SMR read section before loading the counter
3099 	 * and after updating the pm_active bit set.  Thus, pm_active is
3100 	 * a superset of active readers, and any reader that has observed
3101 	 * the goal has observed the new counter value.
3102 	 */
3103 	atomic_add_long(&pmap->pm_eptgen, 1);
3104 
3105 	goal = smr_advance(pmap->pm_eptsmr);
3106 
3107 	/*
3108 	 * Force the vcpu to exit and trap back into the hypervisor.
3109 	 */
3110 	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
3111 	ipi_selected(pmap->pm_active, ipinum);
3112 	sched_unpin();
3113 
3114 	/*
3115 	 * Ensure that all active vCPUs will observe the new generation counter
3116 	 * value before executing any more guest instructions.
3117 	 */
3118 	smr_wait(pmap->pm_eptsmr, goal);
3119 }
3120 
3121 static inline void
3122 pmap_invalidate_preipi_pcid(pmap_t pmap)
3123 {
3124 	struct pmap_pcid *pcidp;
3125 	u_int cpuid, i;
3126 
3127 	sched_pin();
3128 
3129 	cpuid = PCPU_GET(cpuid);
3130 	if (pmap != PCPU_GET(curpmap))
3131 		cpuid = 0xffffffff;	/* An impossible value */
3132 
3133 	CPU_FOREACH(i) {
3134 		if (cpuid != i) {
3135 			pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i);
3136 			pcidp->pm_gen = 0;
3137 		}
3138 	}
3139 
3140 	/*
3141 	 * The fence is between stores to pm_gen and the read of the
3142 	 * pm_active mask.  We need to ensure that it is impossible
3143 	 * for us to miss the bit update in pm_active and
3144 	 * simultaneously observe a non-zero pm_gen in
3145 	 * pmap_activate_sw(), otherwise TLB update is missed.
3146 	 * Without the fence, IA32 allows such an outcome.  Note that
3147 	 * pm_active is updated by a locked operation, which provides
3148 	 * the reciprocal fence.
3149 	 */
3150 	atomic_thread_fence_seq_cst();
3151 }
3152 
3153 static void
3154 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused)
3155 {
3156 	sched_pin();
3157 }
3158 
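/*
 * The pre-IPI hook is resolved once at boot via an ifunc: with PCID
 * enabled it must zero the stale per-CPU PCID generations (and issue a
 * fence) before the shootdown IPIs are sent; without PCID, pinning the
 * thread is all that is required.
 */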
3159 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t))
3160 {
3161 	return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid :
3162 	    pmap_invalidate_preipi_nopcid);
3163 }
3164 
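/*
 * The *_cb callbacks below handle the PTI user page table half of a local
 * invalidation.  The constant 'invpcid_works1' argument lets the compiler
 * emit two specialized copies, and the ifunc below selects the variant
 * matching the CPU's INVPCID support, or a no-op when PCID is disabled.
 */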
3165 static inline void
3166 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va,
3167     const bool invpcid_works1)
3168 {
3169 	struct invpcid_descr d;
3170 	uint64_t kcr3, ucr3;
3171 	uint32_t pcid;
3172 
3173 	/*
3174 	 * Because pm_pcid is recalculated on a context switch, we
3175 	 * must ensure there is no preemption, not just pinning.
3176 	 * Otherwise, we might use a stale value below.
3177 	 */
3178 	CRITICAL_ASSERT(curthread);
3179 
3180 	/*
3181 	 * No need to do anything with user page tables invalidation
3182 	 * if there is no user page table, or invalidation is deferred
3183 	 * until the return to userspace.  ucr3_load_mask is stable
3184 	 * because we have preemption disabled.
3185 	 */
3186 	if (pmap->pm_ucr3 == PMAP_NO_CR3 ||
3187 	    PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
3188 		return;
3189 
3190 	pcid = pmap_get_pcid(pmap);
3191 	if (invpcid_works1) {
3192 		d.pcid = pcid | PMAP_PCID_USER_PT;
3193 		d.pad = 0;
3194 		d.addr = va;
3195 		invpcid(&d, INVPCID_ADDR);
3196 	} else {
3197 		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
3198 		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
3199 		pmap_pti_pcid_invlpg(ucr3, kcr3, va);
3200 	}
3201 }
3202 
3203 static void
3204 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
3205 {
3206 	pmap_invalidate_page_pcid_cb(pmap, va, true);
3207 }
3208 
3209 static void
3210 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
3211 {
3212 	pmap_invalidate_page_pcid_cb(pmap, va, false);
3213 }
3214 
3215 static void
3216 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
3217 {
3218 }
3219 
3220 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t))
3221 {
3222 	if (pmap_pcid_enabled)
3223 		return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb :
3224 		    pmap_invalidate_page_pcid_noinvpcid_cb);
3225 	return (pmap_invalidate_page_nopcid_cb);
3226 }
3227 
3228 static void
3229 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
3230     vm_offset_t addr2 __unused)
3231 {
3232 	if (pmap == kernel_pmap) {
3233 		pmap_invlpg(kernel_pmap, va);
3234 	} else if (pmap == PCPU_GET(curpmap)) {
3235 		invlpg(va);
3236 		pmap_invalidate_page_cb(pmap, va);
3237 	}
3238 }
3239 
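/*
 * Invalidate a single page mapping in the given pmap on every CPU on
 * which the pmap is active.  Guest pmaps (EPT/RVI) are handled by
 * pmap_invalidate_ept() instead of an INVLPG-based shootdown.
 */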
3240 void
3241 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
3242 {
3243 	if (pmap_type_guest(pmap)) {
3244 		pmap_invalidate_ept(pmap);
3245 		return;
3246 	}
3247 
3248 	KASSERT(pmap->pm_type == PT_X86,
3249 	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
3250 
3251 	pmap_invalidate_preipi(pmap);
3252 	smp_masked_invlpg(va, pmap, pmap_invalidate_page_curcpu_cb);
3253 }
3254 
3255 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
3256 #define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
3257 
3258 static void
3259 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
3260     const bool invpcid_works1)
3261 {
3262 	struct invpcid_descr d;
3263 	uint64_t kcr3, ucr3;
3264 	uint32_t pcid;
3265 
3266 	CRITICAL_ASSERT(curthread);
3267 
3268 	if (pmap != PCPU_GET(curpmap) ||
3269 	    pmap->pm_ucr3 == PMAP_NO_CR3 ||
3270 	    PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
3271 		return;
3272 
3273 	pcid = pmap_get_pcid(pmap);
3274 	if (invpcid_works1) {
3275 		d.pcid = pcid | PMAP_PCID_USER_PT;
3276 		d.pad = 0;
3277 		for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
3278 			invpcid(&d, INVPCID_ADDR);
3279 	} else {
3280 		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
3281 		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
3282 		pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
3283 	}
3284 }
3285 
3286 static void
3287 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
3288     vm_offset_t eva)
3289 {
3290 	pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
3291 }
3292 
3293 static void
3294 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
3295     vm_offset_t eva)
3296 {
3297 	pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
3298 }
3299 
3300 static void
3301 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
3302     vm_offset_t eva __unused)
3303 {
3304 }
3305 
3306 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
3307     vm_offset_t))
3308 {
3309 	if (pmap_pcid_enabled)
3310 		return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb :
3311 		    pmap_invalidate_range_pcid_noinvpcid_cb);
3312 	return (pmap_invalidate_range_nopcid_cb);
3313 }
3314 
3315 static void
3316 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3317 {
3318 	vm_offset_t addr;
3319 
3320 	if (pmap == kernel_pmap) {
3321 		if (PCPU_GET(pcid_invlpg_workaround)) {
3322 			struct invpcid_descr d = { 0 };
3323 
3324 			invpcid(&d, INVPCID_CTXGLOB);
3325 		} else {
3326 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
3327 				invlpg(addr);
3328 		}
3329 	} else if (pmap == PCPU_GET(curpmap)) {
3330 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
3331 			invlpg(addr);
3332 		pmap_invalidate_range_cb(pmap, sva, eva);
3333 	}
3334 }
3335 
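/*
 * Invalidate a range of page mappings in the given pmap on every CPU on
 * which the pmap is active.  Ranges of at least PMAP_INVLPG_THRESHOLD
 * bytes are handled with a full TLB invalidation instead of per-page
 * INVLPG instructions.
 */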
3336 void
3337 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3338 {
3339 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
3340 		pmap_invalidate_all(pmap);
3341 		return;
3342 	}
3343 
3344 	if (pmap_type_guest(pmap)) {
3345 		pmap_invalidate_ept(pmap);
3346 		return;
3347 	}
3348 
3349 	KASSERT(pmap->pm_type == PT_X86,
3350 	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
3351 
3352 	pmap_invalidate_preipi(pmap);
3353 	smp_masked_invlpg_range(sva, eva, pmap,
3354 	    pmap_invalidate_range_curcpu_cb);
3355 }
3356 
3357 static inline void
3358 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
3359 {
3360 	struct invpcid_descr d;
3361 	uint64_t kcr3;
3362 	uint32_t pcid;
3363 
3364 	if (pmap == kernel_pmap) {
3365 		if (invpcid_works1) {
3366 			bzero(&d, sizeof(d));
3367 			invpcid(&d, INVPCID_CTXGLOB);
3368 		} else {
3369 			invltlb_glob();
3370 		}
3371 	} else if (pmap == PCPU_GET(curpmap)) {
3372 		CRITICAL_ASSERT(curthread);
3373 
3374 		pcid = pmap_get_pcid(pmap);
3375 		if (invpcid_works1) {
3376 			d.pcid = pcid;
3377 			d.pad = 0;
3378 			d.addr = 0;
3379 			invpcid(&d, INVPCID_CTX);
3380 		} else {
3381 			kcr3 = pmap->pm_cr3 | pcid;
3382 			load_cr3(kcr3);
3383 		}
3384 		if (pmap->pm_ucr3 != PMAP_NO_CR3)
3385 			PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
3386 	}
3387 }
3388 
3389 static void
3390 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
3391 {
3392 	pmap_invalidate_all_pcid_cb(pmap, true);
3393 }
3394 
3395 static void
3396 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
3397 {
3398 	pmap_invalidate_all_pcid_cb(pmap, false);
3399 }
3400 
3401 static void
3402 pmap_invalidate_all_nopcid_cb(pmap_t pmap)
3403 {
3404 	if (pmap == kernel_pmap)
3405 		invltlb_glob();
3406 	else if (pmap == PCPU_GET(curpmap))
3407 		invltlb();
3408 }
3409 
3410 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t))
3411 {
3412 	if (pmap_pcid_enabled)
3413 		return (invpcid_works ? pmap_invalidate_all_pcid_invpcid_cb :
3414 		    pmap_invalidate_all_pcid_noinvpcid_cb);
3415 	return (pmap_invalidate_all_nopcid_cb);
3416 }
3417 
3418 static void
3419 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
3420     vm_offset_t addr2 __unused)
3421 {
3422 	pmap_invalidate_all_cb(pmap);
3423 }
3424 
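/*
 * Invalidate all TLB entries for the given pmap on every CPU on which
 * the pmap is active.
 */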
3425 void
3426 pmap_invalidate_all(pmap_t pmap)
3427 {
3428 	if (pmap_type_guest(pmap)) {
3429 		pmap_invalidate_ept(pmap);
3430 		return;
3431 	}
3432 
3433 	KASSERT(pmap->pm_type == PT_X86,
3434 	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
3435 
3436 	pmap_invalidate_preipi(pmap);
3437 	smp_masked_invltlb(pmap, pmap_invalidate_all_curcpu_cb);
3438 }
3439 
3440 static void
3441 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
3442     vm_offset_t addr2 __unused)
3443 {
3444 	wbinvd();
3445 }
3446 
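/*
 * Write back and invalidate all CPU caches by executing WBINVD on every
 * CPU.
 */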
3447 void
3448 pmap_invalidate_cache(void)
3449 {
3450 	sched_pin();
3451 	smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
3452 }
3453 
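/*
 * Argument block for the smp_rendezvous_cpus()-based implementation of
 * pmap_update_pde(): the "store" CPU writes the new PDE while every CPU
 * in "invalidate" flushes the affected TLB entries.
 */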
3454 struct pde_action {
3455 	cpuset_t invalidate;	/* processors that invalidate their TLB */
3456 	pmap_t pmap;
3457 	vm_offset_t va;
3458 	pd_entry_t *pde;
3459 	pd_entry_t newpde;
3460 	u_int store;		/* processor that updates the PDE */
3461 };
3462 
3463 static void
3464 pmap_update_pde_action(void *arg)
3465 {
3466 	struct pde_action *act = arg;
3467 
3468 	if (act->store == PCPU_GET(cpuid))
3469 		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
3470 }
3471 
3472 static void
3473 pmap_update_pde_teardown(void *arg)
3474 {
3475 	struct pde_action *act = arg;
3476 
3477 	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
3478 		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
3479 }
3480 
3481 /*
3482  * Change the page size for the specified virtual address in a way that
3483  * prevents any possibility of the TLB ever having two entries that map the
3484  * same virtual address using different page sizes.  This is the recommended
3485  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
3486  * machine check exception for a TLB state that is improperly diagnosed as a
3487  * hardware error.
3488  */
3489 static void
3490 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
3491 {
3492 	struct pde_action act;
3493 	cpuset_t active, other_cpus;
3494 	u_int cpuid;
3495 
3496 	sched_pin();
3497 	cpuid = PCPU_GET(cpuid);
3498 	other_cpus = all_cpus;
3499 	CPU_CLR(cpuid, &other_cpus);
3500 	if (pmap == kernel_pmap || pmap_type_guest(pmap))
3501 		active = all_cpus;
3502 	else {
3503 		active = pmap->pm_active;
3504 	}
3505 	if (CPU_OVERLAP(&active, &other_cpus)) {
3506 		act.store = cpuid;
3507 		act.invalidate = active;
3508 		act.va = va;
3509 		act.pmap = pmap;
3510 		act.pde = pde;
3511 		act.newpde = newpde;
3512 		CPU_SET(cpuid, &active);
3513 		smp_rendezvous_cpus(active,
3514 		    smp_no_rendezvous_barrier, pmap_update_pde_action,
3515 		    pmap_update_pde_teardown, &act);
3516 	} else {
3517 		pmap_update_pde_store(pmap, pde, newpde);
3518 		if (CPU_ISSET(cpuid, &active))
3519 			pmap_update_pde_invalidate(pmap, va, newpde);
3520 	}
3521 	sched_unpin();
3522 }
3523 
3524 static void
3525 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
3526 {
3527 
3528 	/*
3529 	 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
3530 	 * by a promotion that did not invalidate the 512 4KB page mappings
3531 	 * that might exist in the TLB.  Consequently, at this point, the TLB
3532 	 * may hold both 4KB and 2MB page mappings for the address range [va,
3533 	 * va + NBPDR).  Therefore, the entire range must be invalidated here.
3534 	 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
3535 	 * 4KB page mappings for the address range [va, va + NBPDR), and so a
3536 	 * single INVLPG suffices to invalidate the 2MB page mapping from the
3537 	 * TLB.
3538 	 */
3539 	if ((pde & PG_PROMOTED) != 0)
3540 		pmap_invalidate_range(pmap, va, va + NBPDR - 1);
3541 	else
3542 		pmap_invalidate_page(pmap, va);
3543 }
3544 
3545 DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
3546     (vm_offset_t sva, vm_offset_t eva))
3547 {
3548 
3549 	if ((cpu_feature & CPUID_SS) != 0)
3550 		return (pmap_invalidate_cache_range_selfsnoop);
3551 	if ((cpu_feature & CPUID_CLFSH) != 0)
3552 		return (pmap_force_invalidate_cache_range);
3553 	return (pmap_invalidate_cache_range_all);
3554 }
3555 
3556 #define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
3557 
3558 static void
3559 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
3560 {
3561 
3562 	KASSERT((sva & PAGE_MASK) == 0,
3563 	    ("pmap_invalidate_cache_range: sva not page-aligned"));
3564 	KASSERT((eva & PAGE_MASK) == 0,
3565 	    ("pmap_invalidate_cache_range: eva not page-aligned"));
3566 }
3567 
3568 static void
3569 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
3570 {
3571 
3572 	pmap_invalidate_cache_range_check_align(sva, eva);
3573 }
3574 
3575 void
3576 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
3577 {
3578 
3579 	sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
3580 
3581 	/*
3582 	 * XXX: Some CPUs fault, hang, or trash the local APIC
3583 	 * registers if we use CLFLUSH on the local APIC range.  The
3584 	 * local APIC is always uncached, so we don't need to flush
3585 	 * for that range anyway.
3586 	 */
3587 	if (pmap_kextract(sva) == lapic_paddr)
3588 		return;
3589 
3590 	if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
3591 		/*
3592 		 * Do per-cache line flush.  Use a locked
3593 		 * instruction to ensure that previous stores are
3594 		 * included in the write-back.  The processor
3595 		 * propagates the flush to other processors in the cache
3596 		 * coherence domain.
3597 		 */
3598 		atomic_thread_fence_seq_cst();
3599 		for (; sva < eva; sva += cpu_clflush_line_size)
3600 			clflushopt(sva);
3601 		atomic_thread_fence_seq_cst();
3602 	} else {
3603 		/*
3604 		 * Writes are ordered by CLFLUSH on Intel CPUs.
3605 		 */
3606 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
3607 			mfence();
3608 		for (; sva < eva; sva += cpu_clflush_line_size)
3609 			clflush(sva);
3610 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
3611 			mfence();
3612 	}
3613 }
3614 
3615 static void
3616 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
3617 {
3618 
3619 	pmap_invalidate_cache_range_check_align(sva, eva);
3620 	pmap_invalidate_cache();
3621 }
3622 
3623 /*
3624  * Remove the specified set of pages from the data and instruction caches.
3625  *
3626  * In contrast to pmap_invalidate_cache_range(), this function does not
3627  * rely on the CPU's self-snoop feature, because it is intended for use
3628  * when moving pages into a different cache domain.
3629  */
3630 void
3631 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
3632 {
3633 	vm_offset_t daddr, eva;
3634 	int i;
3635 	bool useclflushopt;
3636 
3637 	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
3638 	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
3639 	    ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
3640 		pmap_invalidate_cache();
3641 	else {
3642 		if (useclflushopt)
3643 			atomic_thread_fence_seq_cst();
3644 		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
3645 			mfence();
3646 		for (i = 0; i < count; i++) {
3647 			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
3648 			eva = daddr + PAGE_SIZE;
3649 			for (; daddr < eva; daddr += cpu_clflush_line_size) {
3650 				if (useclflushopt)
3651 					clflushopt(daddr);
3652 				else
3653 					clflush(daddr);
3654 			}
3655 		}
3656 		if (useclflushopt)
3657 			atomic_thread_fence_seq_cst();
3658 		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
3659 			mfence();
3660 	}
3661 }
3662 
3663 void
3664 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
3665 {
3666 
3667 	pmap_invalidate_cache_range_check_align(sva, eva);
3668 
3669 	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) {
3670 		pmap_force_invalidate_cache_range(sva, eva);
3671 		return;
3672 	}
3673 
3674 	/* See comment in pmap_force_invalidate_cache_range(). */
3675 	if (pmap_kextract(sva) == lapic_paddr)
3676 		return;
3677 
3678 	atomic_thread_fence_seq_cst();
3679 	for (; sva < eva; sva += cpu_clflush_line_size)
3680 		clwb(sva);
3681 	atomic_thread_fence_seq_cst();
3682 }
3683 
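/*
 * Write back the caches for a range of physical addresses.  Addresses
 * below dmaplimit are flushed through the direct map; the remainder is
 * flushed one page at a time through a temporary kernel mapping.
 */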
3684 void
3685 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
3686 {
3687 	pt_entry_t *pte;
3688 	vm_offset_t vaddr;
3689 	int error __diagused;
3690 	int pte_bits;
3691 
3692 	KASSERT((spa & PAGE_MASK) == 0,
3693 	    ("pmap_flush_cache_phys_range: spa not page-aligned"));
3694 	KASSERT((epa & PAGE_MASK) == 0,
3695 	    ("pmap_flush_cache_phys_range: epa not page-aligned"));
3696 
3697 	if (spa < dmaplimit) {
3698 		pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
3699 		    dmaplimit, epa)));
3700 		if (dmaplimit >= epa)
3701 			return;
3702 		spa = dmaplimit;
3703 	}
3704 
3705 	pte_bits = pmap_cache_bits(kernel_pmap, mattr, false) | X86_PG_RW |
3706 	    X86_PG_V;
3707 	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
3708 	    &vaddr);
3709 	KASSERT(error == 0, ("vmem_alloc failed: %d", error));
3710 	pte = vtopte(vaddr);
3711 	for (; spa < epa; spa += PAGE_SIZE) {
3712 		sched_pin();
3713 		pte_store(pte, spa | pte_bits);
3714 		pmap_invlpg(kernel_pmap, vaddr);
3715 		/* XXXKIB atomic inside flush_cache_range are excessive */
3716 		pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
3717 		sched_unpin();
3718 	}
3719 	vmem_free(kernel_arena, vaddr, PAGE_SIZE);
3720 }
3721 
3722 /*
3723  *	Routine:	pmap_extract
3724  *	Function:
3725  *		Extract the physical page address associated
3726  *		with the given map/virtual_address pair.
3727  */
3728 vm_paddr_t
3729 pmap_extract(pmap_t pmap, vm_offset_t va)
3730 {
3731 	pdp_entry_t *pdpe;
3732 	pd_entry_t *pde;
3733 	pt_entry_t *pte, PG_V;
3734 	vm_paddr_t pa;
3735 
3736 	pa = 0;
3737 	PG_V = pmap_valid_bit(pmap);
3738 	PMAP_LOCK(pmap);
3739 	pdpe = pmap_pdpe(pmap, va);
3740 	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
3741 		if ((*pdpe & PG_PS) != 0)
3742 			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
3743 		else {
3744 			pde = pmap_pdpe_to_pde(pdpe, va);
3745 			if ((*pde & PG_V) != 0) {
3746 				if ((*pde & PG_PS) != 0) {
3747 					pa = (*pde & PG_PS_FRAME) |
3748 					    (va & PDRMASK);
3749 				} else {
3750 					pte = pmap_pde_to_pte(pde, va);
3751 					pa = (*pte & PG_FRAME) |
3752 					    (va & PAGE_MASK);
3753 				}
3754 			}
3755 		}
3756 	}
3757 	PMAP_UNLOCK(pmap);
3758 	return (pa);
3759 }
3760 
3761 /*
3762  *	Routine:	pmap_extract_and_hold
3763  *	Function:
3764  *		Atomically extract and hold the physical page
3765  *		with the given pmap and virtual address pair
3766  *		if that mapping permits the given protection.
3767  */
3768 vm_page_t
3769 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
3770 {
3771 	pdp_entry_t pdpe, *pdpep;
3772 	pd_entry_t pde, *pdep;
3773 	pt_entry_t pte, PG_RW, PG_V;
3774 	vm_page_t m;
3775 
3776 	m = NULL;
3777 	PG_RW = pmap_rw_bit(pmap);
3778 	PG_V = pmap_valid_bit(pmap);
3779 	PMAP_LOCK(pmap);
3780 
3781 	pdpep = pmap_pdpe(pmap, va);
3782 	if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0)
3783 		goto out;
3784 	if ((pdpe & PG_PS) != 0) {
3785 		if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)
3786 			goto out;
3787 		m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK));
3788 		goto check_page;
3789 	}
3790 
3791 	pdep = pmap_pdpe_to_pde(pdpep, va);
3792 	if (pdep == NULL || ((pde = *pdep) & PG_V) == 0)
3793 		goto out;
3794 	if ((pde & PG_PS) != 0) {
3795 		if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)
3796 			goto out;
3797 		m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK));
3798 		goto check_page;
3799 	}
3800 
3801 	pte = *pmap_pde_to_pte(pdep, va);
3802 	if ((pte & PG_V) == 0 ||
3803 	    ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0))
3804 		goto out;
3805 	m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
3806 
3807 check_page:
3808 	if (m != NULL && !vm_page_wire_mapped(m))
3809 		m = NULL;
3810 out:
3811 	PMAP_UNLOCK(pmap);
3812 	return (m);
3813 }
3814 
3815 /*
3816  *	Routine:	pmap_kextract
3817  *	Function:
3818  *		Extract the physical page address associated with the given kernel
3819  *		virtual address.
3820  */
3821 vm_paddr_t
3822 pmap_kextract(vm_offset_t va)
3823 {
3824 	pd_entry_t pde;
3825 	vm_paddr_t pa;
3826 
3827 	if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high) {
3828 		pa = DMAP_TO_PHYS(va);
3829 	} else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
3830 		pa = pmap_large_map_kextract(va);
3831 	} else {
3832 		pde = *vtopde(va);
3833 		if (pde & PG_PS) {
3834 			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
3835 		} else {
3836 			/*
3837 			 * Beware of a concurrent promotion that changes the
3838 			 * PDE at this point!  For example, vtopte() must not
3839 			 * be used to access the PTE because it would use the
3840 			 * new PDE.  It is, however, safe to use the old PDE
3841 			 * because the page table page is preserved by the
3842 			 * promotion.
3843 			 */
3844 			pa = *pmap_pde_to_pte(&pde, va);
3845 			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
3846 		}
3847 	}
3848 	return (pa);
3849 }
3850 
3851 /***************************************************
3852  * Low level mapping routines.....
3853  ***************************************************/
3854 
3855 /*
3856  * Add a wired page to the kva.
3857  * Note: not SMP coherent.
3858  */
3859 void
3860 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
3861 {
3862 	pt_entry_t *pte;
3863 
3864 	pte = vtopte(va);
3865 	pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M |
3866 	    X86_PG_RW | X86_PG_V);
3867 }
3868 
3869 static __inline void
3870 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
3871 {
3872 	pt_entry_t *pte;
3873 	int cache_bits;
3874 
3875 	pte = vtopte(va);
3876 	cache_bits = pmap_cache_bits(kernel_pmap, mode, false);
3877 	pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M |
3878 	    X86_PG_RW | X86_PG_V | cache_bits);
3879 }
3880 
3881 /*
3882  * Remove a page from the kernel pagetables.
3883  * Note: not SMP coherent.
3884  */
3885 void
3886 pmap_kremove(vm_offset_t va)
3887 {
3888 	pt_entry_t *pte;
3889 
3890 	pte = vtopte(va);
3891 	pte_clear(pte);
3892 }
3893 
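/*
 * Usage sketch (illustrative only): because pmap_kenter() and
 * pmap_kremove() are not SMP coherent, a caller that exposes the
 * mapping to other CPUs must perform the TLB invalidation itself:
 *
 *	pmap_kenter(va, pa);
 *	... use the mapping ...
 *	pmap_kremove(va);
 *	pmap_invalidate_page(kernel_pmap, va);
 */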
3894 /*
3895  *	Used to map a range of physical addresses into kernel
3896  *	virtual address space.
3897  *
3898  *	The value passed in '*virt' is a suggested virtual address for
3899  *	the mapping. Architectures which can support a direct-mapped
3900  *	physical to virtual region can return the appropriate address
3901  *	within that region, leaving '*virt' unchanged. Other
3902  *	architectures should map the pages starting at '*virt' and
3903  *	update '*virt' with the first usable address after the mapped
3904  *	region.
3905  */
3906 vm_offset_t
3907 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
3908 {
3909 	return PHYS_TO_DMAP(start);
3910 }
3911 
3912 /*
3913  * Add a list of wired pages to the kva.
3914  * This routine is only used for temporary
3915  * kernel mappings that do not need to have
3916  * page modification or references recorded.
3917  * Note that old mappings are simply written
3918  * over.  The pages *must* be wired.
3919  * Note: SMP coherent.  Uses a ranged shootdown IPI.
3919  * Note: SMP coherent.  Uses a ranged shootdown IPI.
3920  */
3921 void
3922 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
3923 {
3924 	pt_entry_t *endpte, oldpte, pa, *pte;
3925 	vm_page_t m;
3926 	int cache_bits;
3927 
3928 	oldpte = 0;
3929 	pte = vtopte(sva);
3930 	endpte = pte + count;
3931 	while (pte < endpte) {
3932 		m = *ma++;
3933 		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, false);
3934 		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
3935 		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
3936 			oldpte |= *pte;
3937 			pte_store(pte, pa | pg_g | pg_nx | X86_PG_A |
3938 			    X86_PG_M | X86_PG_RW | X86_PG_V);
3939 		}
3940 		pte++;
3941 	}
3942 	if (__predict_false((oldpte & X86_PG_V) != 0))
3943 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
3944 		    PAGE_SIZE);
3945 }
3946 
3947 /*
3948  * This routine tears out page mappings from the
3949  * kernel -- it is meant only for temporary mappings.
3950  * Note: SMP coherent.  Uses a ranged shootdown IPI.
3951  */
3952 void
3953 pmap_qremove(vm_offset_t sva, int count)
3954 {
3955 	vm_offset_t va;
3956 
3957 	va = sva;
3958 	while (count-- > 0) {
3959 		/*
3960 		 * pmap_enter() calls within the kernel virtual
3961 		 * address space happen on virtual addresses from
3962 		 * subarenas that import superpage-sized and -aligned
3963 		 * address ranges.  So, the virtual address that we
3964 		 * allocate to use with pmap_qenter() can't be close
3965 		 * enough to one of those pmap_enter() calls for it to
3966 		 * be caught up in a promotion.
3967 		 */
3968 		KASSERT(va >= kva_layout.km_low, ("usermode va %lx", va));
3969 		KASSERT((*vtopde(va) & X86_PG_PS) == 0,
3970 		    ("pmap_qremove on promoted va %#lx", va));
3971 
3972 		pmap_kremove(va);
3973 		va += PAGE_SIZE;
3974 	}
3975 	pmap_invalidate_range(kernel_pmap, sva, va);
3976 }
3977 
3978 /***************************************************
3979  * Page table page management routines.....
3980  ***************************************************/
3981 /*
3982  * Schedule the specified unused page table page to be freed.  Specifically,
3983  * add the page to the specified list of pages that will be released to the
3984  * physical memory manager after the TLB has been updated.
3985  */
3986 static __inline void
3987 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
3988 {
3989 
3990 	if (set_PG_ZERO)
3991 		m->flags |= PG_ZERO;
3992 	else
3993 		m->flags &= ~PG_ZERO;
3994 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
3995 }
3996 
3997 /*
3998  * Inserts the specified page table page into the specified pmap's collection
3999  * of idle page table pages.  Each of a pmap's page table pages is responsible
4000  * for mapping a distinct range of virtual addresses.  The pmap's collection is
4001  * ordered by this virtual address range.
4002  *
4003  * If "promoted" is false, then the page table page "mpte" must be zero filled;
4004  * "mpte"'s valid field will be set to 0.
4005  *
4006  * If "promoted" is true and "allpte_PG_A_set" is false, then "mpte" must
4007  * contain valid mappings with identical attributes except for PG_A; "mpte"'s
4008  * valid field will be set to 1.
4009  *
4010  * If "promoted" and "allpte_PG_A_set" are both true, then "mpte" must contain
4011  * valid mappings with identical attributes including PG_A; "mpte"'s valid
4012  * field will be set to VM_PAGE_BITS_ALL.
4013  */
4014 static __inline int
4015 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4016     bool allpte_PG_A_set)
4017 {
4018 
4019 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4020 	KASSERT(promoted || !allpte_PG_A_set,
4021 	    ("a zero-filled PTP can't have PG_A set in every PTE"));
4022 	mpte->valid = promoted ? (allpte_PG_A_set ? VM_PAGE_BITS_ALL : 1) : 0;
4023 	return (vm_radix_insert(&pmap->pm_root, mpte));
4024 }
4025 
4026 /*
4027  * Removes the page table page mapping the specified virtual address from the
4028  * specified pmap's collection of idle page table pages, and returns it.
4029  * Otherwise, returns NULL if there is no page table page corresponding to the
4030  * specified virtual address.
4031  */
4032 static __inline vm_page_t
4033 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4034 {
4035 
4036 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4037 	return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
4038 }
4039 
4040 /*
4041  * Decrements a page table page's reference count, which is used to record the
4042  * number of valid page table entries within the page.  If the reference count
4043  * drops to zero, then the page table page is unmapped.  Returns true if the
4044  * page table page was unmapped and false otherwise.
4045  */
4046 static inline bool
4047 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4048 {
4049 
4050 	--m->ref_count;
4051 	if (m->ref_count == 0) {
4052 		_pmap_unwire_ptp(pmap, va, m, free);
4053 		return (true);
4054 	} else
4055 		return (false);
4056 }
4057 
4058 static void
4059 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4060 {
4061 	pml5_entry_t *pml5;
4062 	pml4_entry_t *pml4;
4063 	pdp_entry_t *pdp;
4064 	pd_entry_t *pd;
4065 	vm_page_t pdpg, pdppg, pml4pg;
4066 
4067 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4068 
4069 	/*
4070 	 * unmap the page table page
4071 	 */
4072 	if (m->pindex >= NUPDE + NUPDPE + NUPML4E) {
4073 		/* PML4 page */
4074 		MPASS(pmap_is_la57(pmap));
4075 		pml5 = pmap_pml5e(pmap, va);
4076 		*pml5 = 0;
4077 		if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) {
4078 			pml5 = pmap_pml5e_u(pmap, va);
4079 			*pml5 = 0;
4080 		}
4081 	} else if (m->pindex >= NUPDE + NUPDPE) {
4082 		/* PDP page */
4083 		pml4 = pmap_pml4e(pmap, va);
4084 		*pml4 = 0;
4085 		if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
4086 		    va <= VM_MAXUSER_ADDRESS) {
4087 			pml4 = pmap_pml4e_u(pmap, va);
4088 			*pml4 = 0;
4089 		}
4090 	} else if (m->pindex >= NUPDE) {
4091 		/* PD page */
4092 		pdp = pmap_pdpe(pmap, va);
4093 		*pdp = 0;
4094 	} else {
4095 		/* PTE page */
4096 		pd = pmap_pde(pmap, va);
4097 		*pd = 0;
4098 	}
4099 	if (m->pindex < NUPDE) {
4100 		/* We just released a PT, unhold the matching PD */
4101 		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
4102 		pmap_unwire_ptp(pmap, va, pdpg, free);
4103 	} else if (m->pindex < NUPDE + NUPDPE) {
4104 		/* We just released a PD, unhold the matching PDP */
4105 		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
4106 		pmap_unwire_ptp(pmap, va, pdppg, free);
4107 	} else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) {
4108 		/* We just released a PDP, unhold the matching PML4 */
4109 		pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME);
4110 		pmap_unwire_ptp(pmap, va, pml4pg, free);
4111 	}
4112 
4113 	pmap_pt_page_count_adj(pmap, -1);
4114 
4115 	/*
4116 	 * Put the page on a list so that it is released after
4117 	 * *ALL* TLB shootdown is done.
4118 	 */
4119 	pmap_add_delayed_free_list(m, free, true);
4120 }
4121 
4122 /*
4123  * After removing a page table entry, this routine is used to
4124  * conditionally free the page, and manage the reference count.
4125  */
4126 static int
4127 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
4128     struct spglist *free)
4129 {
4130 	vm_page_t mpte;
4131 
4132 	if (va >= VM_MAXUSER_ADDRESS)
4133 		return (0);
4134 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
4135 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
4136 	return (pmap_unwire_ptp(pmap, va, mpte, free));
4137 }
4138 
4139 /*
4140  * Release a page table page reference after a failed attempt to create a
4141  * mapping.
4142  */
4143 static void
4144 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
4145 {
4146 	struct spglist free;
4147 
4148 	SLIST_INIT(&free);
4149 	if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4150 		/*
4151 		 * Although "va" was never mapped, paging-structure caches
4152 		 * could nonetheless have entries that refer to the freed
4153 		 * page table pages.  Invalidate those entries.
4154 		 */
4155 		pmap_invalidate_page(pmap, va);
4156 		vm_page_free_pages_toq(&free, true);
4157 	}
4158 }
4159 
4160 static void
4161 pmap_pinit_pcids(pmap_t pmap, uint32_t pcid, int gen)
4162 {
4163 	struct pmap_pcid *pcidp;
4164 	int i;
4165 
4166 	CPU_FOREACH(i) {
4167 		pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i);
4168 		pcidp->pm_pcid = pcid;
4169 		pcidp->pm_gen = gen;
4170 	}
4171 }
4172 
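/*
 * Initialize the pmap for process 0.  This pmap shares the kernel's
 * top-level page table page instead of allocating its own and is
 * activated directly via pmap_activate_boot().
 */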
4173 void
4174 pmap_pinit0(pmap_t pmap)
4175 {
4176 	struct proc *p;
4177 	struct thread *td;
4178 
4179 	PMAP_LOCK_INIT(pmap);
4180 	pmap->pm_pmltop = kernel_pmap->pm_pmltop;
4181 	pmap->pm_pmltopu = NULL;
4182 	pmap->pm_cr3 = kernel_pmap->pm_cr3;
4183 	/* hack to keep pmap_pti_pcid_invalidate() alive */
4184 	pmap->pm_ucr3 = PMAP_NO_CR3;
4185 	vm_radix_init(&pmap->pm_root);
4186 	CPU_ZERO(&pmap->pm_active);
4187 	TAILQ_INIT(&pmap->pm_pvchunk);
4188 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4189 	pmap->pm_flags = pmap_flags;
4190 	pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK);
4191 	pmap_pinit_pcids(pmap, PMAP_PCID_KERN + 1, 1);
4192 	pmap_activate_boot(pmap);
4193 	td = curthread;
4194 	if (pti) {
4195 		p = td->td_proc;
4196 		PROC_LOCK(p);
4197 		p->p_md.md_flags |= P_MD_KPTI;
4198 		PROC_UNLOCK(p);
4199 	}
4200 	pmap_thread_init_invl_gen(td);
4201 
4202 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
4203 		pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
4204 		    sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
4205 		    UMA_ALIGN_PTR, 0);
4206 	}
4207 }
4208 
4209 void
4210 pmap_pinit_pml4(vm_page_t pml4pg)
4211 {
4212 	pml4_entry_t *pm_pml4;
4213 	int i;
4214 
4215 	pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
4216 
4217 	/* Wire in kernel global address entries. */
4218 	for (i = 0; i < NKPML4E; i++) {
4219 		pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
4220 		    X86_PG_V;
4221 	}
4222 #ifdef KASAN
4223 	for (i = 0; i < NKASANPML4E; i++) {
4224 		pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW |
4225 		    X86_PG_V | pg_nx;
4226 	}
4227 #endif
4228 #ifdef KMSAN
4229 	for (i = 0; i < NKMSANSHADPML4E; i++) {
4230 		pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) |
4231 		    X86_PG_RW | X86_PG_V | pg_nx;
4232 	}
4233 	for (i = 0; i < NKMSANORIGPML4E; i++) {
4234 		pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) |
4235 		    X86_PG_RW | X86_PG_V | pg_nx;
4236 	}
4237 #endif
4238 	for (i = 0; i < ndmpdpphys; i++) {
4239 		pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
4240 		    X86_PG_V;
4241 	}
4242 
4243 	/* install self-referential address mapping entry(s) */
4244 	pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
4245 	    X86_PG_A | X86_PG_M;
4246 
4247 	/* install large map entries if configured */
4248 	for (i = 0; i < lm_ents; i++)
4249 		pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i];
4250 }
4251 
4252 void
4253 pmap_pinit_pml5(vm_page_t pml5pg)
4254 {
4255 	pml5_entry_t *pm_pml5;
4256 	int i;
4257 
4258 	pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
4259 	for (i = 0; i < NPML5EPG / 2; i++)
4260 		pm_pml5[i] = 0;
4261 	for (; i < NPML5EPG; i++)
4262 		pm_pml5[i] = kernel_pmap->pm_pmltop[i];
4263 }
4264 
4265 static void
4266 pmap_pinit_pml4_pti(vm_page_t pml4pgu)
4267 {
4268 	pml4_entry_t *pm_pml4u;
4269 	int i;
4270 
4271 	pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu));
4272 	for (i = 0; i < NPML4EPG; i++)
4273 		pm_pml4u[i] = pti_pml4[i];
4274 }
4275 
4276 static void
4277 pmap_pinit_pml5_pti(vm_page_t pml5pgu)
4278 {
4279 	pml5_entry_t *pm_pml5u;
4280 
4281 	pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu));
4282 	pagezero(pm_pml5u);
4283 
4284 	/*
4285 	 * Add a pml5 entry at the top of KVA pointing to the existing
4286 	 * pml4 pti table, entering all kernel mappings needed for
4287 	 * usermode into the level 5 table.
4288 	 */
4289 	pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] =
4290 	    pmap_kextract((vm_offset_t)pti_pml4) |
4291 	    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
4292 }
4293 
4294 /* Allocate a page table page and do related bookkeeping */
4295 static vm_page_t
4296 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags)
4297 {
4298 	vm_page_t m;
4299 
4300 	m = vm_page_alloc_noobj(flags);
4301 	if (__predict_false(m == NULL))
4302 		return (NULL);
4303 	m->pindex = pindex;
4304 	pmap_pt_page_count_adj(pmap, 1);
4305 	return (m);
4306 }
4307 
4308 static void
4309 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled)
4310 {
4311 	/*
4312 	 * This function assumes the page will need to be unwired,
4313 	 * even though the counterpart allocation in pmap_alloc_pt_page()
4314 	 * doesn't enforce VM_ALLOC_WIRED.  However, all current uses
4315 	 * of pmap_free_pt_page() require unwiring.  The case in which
4316 	 * a PT page doesn't require unwiring because its ref_count has
4317 	 * naturally reached 0 is handled through _pmap_unwire_ptp().
4318 	 */
4319 	vm_page_unwire_noq(m);
4320 	if (zerofilled)
4321 		vm_page_free_zero(m);
4322 	else
4323 		vm_page_free(m);
4324 
4325 	pmap_pt_page_count_adj(pmap, -1);
4326 }
4327 
4328 _Static_assert(sizeof(struct pmap_pcid) == 8, "Fix pcpu zone for pm_pcidp");
4329 
4330 /*
4331  * Initialize a preallocated and zeroed pmap structure,
4332  * such as one in a vmspace structure.
4333  */
4334 int
4335 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
4336 {
4337 	vm_page_t pmltop_pg, pmltop_pgu;
4338 	vm_paddr_t pmltop_phys;
4339 
4340 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4341 
4342 	/*
4343 	 * Allocate the page directory page.  Pass NULL instead of a
4344 	 * pointer to the pmap here to avoid calling
4345 	 * pmap_resident_count_adj() through pmap_pt_page_count_adj(),
4346 	 * since that requires pmap lock.  Instead do the accounting
4347 	 * manually.
4348 	 *
4349 	 * Note that the optimization in the final call to pmap_remove()
4350 	 * that checks for a zero resident_count is effectively disabled
4351 	 * by accounting for the top-level page.  But the optimization
4352 	 * has not been effective since we started using a non-managed
4353 	 * mapping of the shared page.
4354 	 */
4355 	pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO |
4356 	    VM_ALLOC_WAITOK);
4357 	pmap_pt_page_count_pinit(pmap, 1);
4358 
4359 	pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg);
4360 	pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys);
4361 
4362 	if (pmap_pcid_enabled) {
4363 		if (pmap->pm_pcidp == NULL)
4364 			pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8,
4365 			    M_WAITOK);
4366 		pmap_pinit_pcids(pmap, PMAP_PCID_NONE, 0);
4367 	}
4368 	pmap->pm_cr3 = PMAP_NO_CR3;	/* initialize to an invalid value */
4369 	pmap->pm_ucr3 = PMAP_NO_CR3;
4370 	pmap->pm_pmltopu = NULL;
4371 
4372 	pmap->pm_type = pm_type;
4373 
4374 	/*
4375 	 * Do not install the host kernel mappings in the nested page
4376 	 * tables. These mappings are meaningless in the guest physical
4377 	 * address space.
4378 	 * Install minimal kernel mappings in PTI case.
4379 	 */
4380 	switch (pm_type) {
4381 	case PT_X86:
4382 		pmap->pm_cr3 = pmltop_phys;
4383 		if (pmap_is_la57(pmap))
4384 			pmap_pinit_pml5(pmltop_pg);
4385 		else
4386 			pmap_pinit_pml4(pmltop_pg);
4387 		if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {
4388 			/*
4389 			 * As with pmltop_pg, pass NULL instead of a
4390 			 * pointer to the pmap to ensure that the PTI
4391 			 * page is counted explicitly.
4392 			 */
4393 			pmltop_pgu = pmap_alloc_pt_page(NULL, 0,
4394 			    VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
4395 			pmap_pt_page_count_pinit(pmap, 1);
4396 			pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP(
4397 			    VM_PAGE_TO_PHYS(pmltop_pgu));
4398 			if (pmap_is_la57(pmap))
4399 				pmap_pinit_pml5_pti(pmltop_pgu);
4400 			else
4401 				pmap_pinit_pml4_pti(pmltop_pgu);
4402 			pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu);
4403 		}
4404 		if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
4405 			rangeset_init(&pmap->pm_pkru, pkru_dup_range,
4406 			    pkru_free_range, pmap, M_NOWAIT);
4407 		}
4408 		break;
4409 	case PT_EPT:
4410 	case PT_RVI:
4411 		pmap->pm_eptsmr = smr_create("pmap", 0, 0);
4412 		break;
4413 	}
4414 
4415 	vm_radix_init(&pmap->pm_root);
4416 	CPU_ZERO(&pmap->pm_active);
4417 	TAILQ_INIT(&pmap->pm_pvchunk);
4418 	pmap->pm_flags = flags;
4419 	pmap->pm_eptgen = 0;
4420 
4421 	return (1);
4422 }
4423 
4424 int
4425 pmap_pinit(pmap_t pmap)
4426 {
4427 
4428 	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
4429 }
4430 
4431 static void
4432 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte)
4433 {
4434 	vm_page_t mpg;
4435 	struct spglist free;
4436 
4437 	mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
4438 	if (mpg->ref_count != 0)
4439 		return;
4440 	SLIST_INIT(&free);
4441 	_pmap_unwire_ptp(pmap, va, mpg, &free);
4442 	pmap_invalidate_page(pmap, va);
4443 	vm_page_free_pages_toq(&free, true);
4444 }
4445 
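/*
 * Return a pointer to the PML4 entry that maps va.  Without LA57 this
 * is an entry in the pmap's top-level page.  With LA57, the PML4 page
 * table page is allocated on demand, and "addref" controls whether an
 * extra reference is held on it for a caller that will install a new
 * entry.
 */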
4446 static pml4_entry_t *
4447 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
4448     bool addref)
4449 {
4450 	vm_pindex_t pml5index;
4451 	pml5_entry_t *pml5;
4452 	pml4_entry_t *pml4;
4453 	vm_page_t pml4pg;
4454 	pt_entry_t PG_V;
4455 	bool allocated;
4456 
4457 	if (!pmap_is_la57(pmap))
4458 		return (&pmap->pm_pmltop[pmap_pml4e_index(va)]);
4459 
4460 	PG_V = pmap_valid_bit(pmap);
4461 	pml5index = pmap_pml5e_index(va);
4462 	pml5 = &pmap->pm_pmltop[pml5index];
4463 	if ((*pml5 & PG_V) == 0) {
4464 		if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp,
4465 		    va) == NULL)
4466 			return (NULL);
4467 		allocated = true;
4468 	} else {
4469 		allocated = false;
4470 	}
4471 	pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME);
4472 	pml4 = &pml4[pmap_pml4e_index(va)];
4473 	if ((*pml4 & PG_V) == 0) {
4474 		pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME);
4475 		if (allocated && !addref)
4476 			pml4pg->ref_count--;
4477 		else if (!allocated && addref)
4478 			pml4pg->ref_count++;
4479 	}
4480 	return (pml4);
4481 }
4482 
4483 static pdp_entry_t *
4484 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
4485     bool addref)
4486 {
4487 	vm_page_t pdppg;
4488 	pml4_entry_t *pml4;
4489 	pdp_entry_t *pdp;
4490 	pt_entry_t PG_V;
4491 	bool allocated;
4492 
4493 	PG_V = pmap_valid_bit(pmap);
4494 
4495 	pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false);
4496 	if (pml4 == NULL)
4497 		return (NULL);
4498 
4499 	if ((*pml4 & PG_V) == 0) {
4500 		/* Have to allocate a new pdp, recurse */
4501 		if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp,
4502 		    va) == NULL) {
4503 			if (pmap_is_la57(pmap))
4504 				pmap_allocpte_free_unref(pmap, va,
4505 				    pmap_pml5e(pmap, va));
4506 			return (NULL);
4507 		}
4508 		allocated = true;
4509 	} else {
4510 		allocated = false;
4511 	}
4512 	pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
4513 	pdp = &pdp[pmap_pdpe_index(va)];
4514 	if ((*pdp & PG_V) == 0) {
4515 		pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
4516 		if (allocated && !addref)
4517 			pdppg->ref_count--;
4518 		else if (!allocated && addref)
4519 			pdppg->ref_count++;
4520 	}
4521 	return (pdp);
4522 }
4523 
4524 /*
4525  * The ptepindexes, i.e. page indices, of the page table pages encountered
4526  * while translating virtual address va are defined as follows:
4527  * - for the page table page (last level),
4528  *      ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT,
4529  *   in other words, it is just the index of the PDE that maps the page
4530  *   table page.
4531  * - for the page directory page,
4532  *      ptepindex = NUPDE (number of userland PD entries) +
4533  *          (pmap_pde_index(va) >> NPDEPGSHIFT)
4534  *   i.e. index of PDPE is put after the last index of PDE,
4535  * - for the page directory pointer page,
4536  *      ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
4537  *          NPML4EPGSHIFT),
4538  *   i.e. index of pml4e is put after the last index of PDPE,
4539  * - for the PML4 page (if LA57 mode is enabled),
4540  *      ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >>
4541  *          (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT),
4542  *   i.e. index of pml5e is put after the last index of PML4E.
4543  *
4544  * Define an order on the paging entries, where all entries of the
4545  * same height are put together, then heights are put from deepest to
4546  * root.  Then ptepindex is the sequential number of the
4547  * corresponding paging entry in this order.
4548  *
4549  * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of
4550  * LA57 paging structures even in LA48 paging mode. Moreover, the
4551  * ptepindexes are calculated as if the paging structures were 5-level
4552  * regardless of the actual mode of operation.
4553  *
4554  * The root page at PML4/PML5 does not participate in this indexing scheme,
4555  * since it is statically allocated by pmap_pinit() and not by pmap_allocpte().
4556  */
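/*
 * For example, following the scheme above for va == 0: the page table
 * page has ptepindex 0, its page directory page has ptepindex NUPDE,
 * the page directory pointer page has ptepindex NUPDE + NUPDPE, and,
 * in LA57 mode, the PML4 page has ptepindex NUPDE + NUPDPE + NUPML4E.
 */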
4557 static vm_page_t
4558 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
4559     vm_offset_t va)
4560 {
4561 	vm_pindex_t pml5index, pml4index;
4562 	pml5_entry_t *pml5, *pml5u;
4563 	pml4_entry_t *pml4, *pml4u;
4564 	pdp_entry_t *pdp;
4565 	pd_entry_t *pd;
4566 	vm_page_t m, pdpg;
4567 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4568 
4569 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4570 
4571 	PG_A = pmap_accessed_bit(pmap);
4572 	PG_M = pmap_modified_bit(pmap);
4573 	PG_V = pmap_valid_bit(pmap);
4574 	PG_RW = pmap_rw_bit(pmap);
4575 
4576 	/*
4577 	 * Allocate a page table page.
4578 	 */
4579 	m = pmap_alloc_pt_page(pmap, ptepindex,
4580 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
4581 	if (m == NULL)
4582 		return (NULL);
4583 
4584 	/*
4585 	 * Map the pagetable page into the process address space, if
4586 	 * it isn't already there.
4587 	 */
4588 	if (ptepindex >= NUPDE + NUPDPE + NUPML4E) {
4589 		MPASS(pmap_is_la57(pmap));
4590 
4591 		pml5index = pmap_pml5e_index(va);
4592 		pml5 = &pmap->pm_pmltop[pml5index];
4593 		KASSERT((*pml5 & PG_V) == 0,
4594 		    ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5));
4595 		*pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
4596 
4597 		if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) {
4598 			MPASS(pmap->pm_ucr3 != PMAP_NO_CR3);
4599 			*pml5 |= pg_nx;
4600 
4601 			pml5u = &pmap->pm_pmltopu[pml5index];
4602 			*pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
4603 			    PG_A | PG_M;
4604 		}
4605 	} else if (ptepindex >= NUPDE + NUPDPE) {
4606 		pml4index = pmap_pml4e_index(va);
4607 		/* Wire up a new PDPE page */
4608 		pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true);
4609 		if (pml4 == NULL) {
4610 			pmap_free_pt_page(pmap, m, true);
4611 			return (NULL);
4612 		}
4613 		KASSERT((*pml4 & PG_V) == 0,
4614 		    ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4));
4615 		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
4616 
4617 		if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
4618 		    pml4index < NUPML4E) {
4619 			MPASS(pmap->pm_ucr3 != PMAP_NO_CR3);
4620 
4621 			/*
4622 			 * PTI: Make all user-space mappings in the
4623 			 * kernel-mode page table no-execute so that
4624 			 * we detect any programming errors that leave
4625 			 * the kernel-mode page table active on return
4626 			 * to user space.
4627 			 */
4628 			*pml4 |= pg_nx;
4629 
4630 			pml4u = &pmap->pm_pmltopu[pml4index];
4631 			*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
4632 			    PG_A | PG_M;
4633 		}
4634 	} else if (ptepindex >= NUPDE) {
4635 		/* Wire up a new PDE page */
4636 		pdp = pmap_allocpte_getpdp(pmap, lockp, va, true);
4637 		if (pdp == NULL) {
4638 			pmap_free_pt_page(pmap, m, true);
4639 			return (NULL);
4640 		}
4641 		KASSERT((*pdp & PG_V) == 0,
4642 		    ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp));
4643 		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
4644 	} else {
4645 		/* Wire up a new PTE page */
4646 		pdp = pmap_allocpte_getpdp(pmap, lockp, va, false);
4647 		if (pdp == NULL) {
4648 			pmap_free_pt_page(pmap, m, true);
4649 			return (NULL);
4650 		}
4651 		if ((*pdp & PG_V) == 0) {
4652 			/* Have to allocate a new pd, recurse */
4653 			if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va),
4654 			    lockp, va) == NULL) {
4655 				pmap_allocpte_free_unref(pmap, va,
4656 				    pmap_pml4e(pmap, va));
4657 				pmap_free_pt_page(pmap, m, true);
4658 				return (NULL);
4659 			}
4660 		} else {
4661 			/* Add reference to the pd page */
4662 			pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
4663 			pdpg->ref_count++;
4664 		}
4665 		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
4666 
4667 		/* Now we know where the page directory page is */
4668 		pd = &pd[pmap_pde_index(va)];
4669 		KASSERT((*pd & PG_V) == 0,
4670 		    ("pmap %p va %#lx pd %#lx", pmap, va, *pd));
4671 		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
4672 	}
4673 
4674 	return (m);
4675 }
4676 
4677 /*
4678  * This routine is called if the desired page table page does not exist.
4679  *
4680  * If page table page allocation fails, this routine may sleep before
4681  * returning NULL.  It sleeps only if a lock pointer was given.  Sleep
4682  * occurs right before returning to the caller. This way, we never
4683  * drop pmap lock to sleep while a page table page has ref_count == 0,
4684  * which prevents the page from being freed under us.
4685  */
4686 static vm_page_t
4687 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
4688     vm_offset_t va)
4689 {
4690 	vm_page_t m;
4691 
4692 	m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va);
4693 	if (m == NULL && lockp != NULL) {
4694 		RELEASE_PV_LIST_LOCK(lockp);
4695 		PMAP_UNLOCK(pmap);
4696 		PMAP_ASSERT_NOT_IN_DI();
4697 		vm_wait(NULL);
4698 		PMAP_LOCK(pmap);
4699 	}
4700 	return (m);
4701 }
4702 
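/*
 * Return a pointer to the PDE for va, allocating the containing page
 * directory page for user addresses if it is not already present.  The
 * page directory page is returned through "pdpgp" with a reference held
 * for the caller; it is NULL for kernel addresses.  If "lockp" is
 * non-NULL, the function may sleep and retry after an allocation
 * failure.
 */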
4703 static pd_entry_t *
4704 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
4705     struct rwlock **lockp)
4706 {
4707 	pdp_entry_t *pdpe, PG_V;
4708 	pd_entry_t *pde;
4709 	vm_page_t pdpg;
4710 	vm_pindex_t pdpindex;
4711 
4712 	PG_V = pmap_valid_bit(pmap);
4713 
4714 retry:
4715 	pdpe = pmap_pdpe(pmap, va);
4716 	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
4717 		pde = pmap_pdpe_to_pde(pdpe, va);
4718 		if (va < VM_MAXUSER_ADDRESS) {
4719 			/* Add a reference to the pd page. */
4720 			pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
4721 			pdpg->ref_count++;
4722 		} else
4723 			pdpg = NULL;
4724 	} else if (va < VM_MAXUSER_ADDRESS) {
4725 		/* Allocate a pd page. */
4726 		pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT;
4727 		pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va);
4728 		if (pdpg == NULL) {
4729 			if (lockp != NULL)
4730 				goto retry;
4731 			else
4732 				return (NULL);
4733 		}
4734 		pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4735 		pde = &pde[pmap_pde_index(va)];
4736 	} else
4737 		panic("pmap_alloc_pde: missing page table page for va %#lx",
4738 		    va);
4739 	*pdpgp = pdpg;
4740 	return (pde);
4741 }
4742 
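/*
 * Return the page table page that maps va, allocating it if necessary.
 * A 2MB mapping covering va is demoted first.  The returned page holds
 * a reference accounting for the mapping that the caller is about to
 * create.  If "lockp" is non-NULL, the function may sleep and retry
 * after an allocation failure.
 */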
4743 static vm_page_t
4744 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4745 {
4746 	vm_pindex_t ptepindex;
4747 	pd_entry_t *pd, PG_V;
4748 	vm_page_t m;
4749 
4750 	PG_V = pmap_valid_bit(pmap);
4751 
4752 	/*
4753 	 * Calculate pagetable page index
4754 	 */
4755 	ptepindex = pmap_pde_pindex(va);
4756 retry:
4757 	/*
4758 	 * Get the page directory entry
4759 	 */
4760 	pd = pmap_pde(pmap, va);
4761 
4762 	/*
4763 	 * This supports switching from a 2MB page to a
4764 	 * normal 4K page.
4765 	 */
4766 	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
4767 		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
4768 			/*
4769 			 * Invalidation of the 2MB page mapping may have caused
4770 			 * the deallocation of the underlying PD page.
4771 			 */
4772 			pd = NULL;
4773 		}
4774 	}
4775 
4776 	/*
4777 	 * If the page table page is mapped, we just increment the
4778 	 * hold count, and activate it.
4779 	 */
4780 	if (pd != NULL && (*pd & PG_V) != 0) {
4781 		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
4782 		m->ref_count++;
4783 	} else {
4784 		/*
4785 		 * Here if the pte page isn't mapped, or if it has been
4786 		 * deallocated.
4787 		 */
4788 		m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va);
4789 		if (m == NULL && lockp != NULL)
4790 			goto retry;
4791 	}
4792 	return (m);
4793 }
4794 
4795 /***************************************************
4796  * Pmap allocation/deallocation routines.
4797  ***************************************************/
4798 
4799 /*
4800  * Release any resources held by the given physical map.
4801  * Called when a pmap initialized by pmap_pinit is being released.
4802  * Should only be called if the map contains no valid mappings.
4803  */
4804 void
4805 pmap_release(pmap_t pmap)
4806 {
4807 	vm_page_t m;
4808 	int i;
4809 
4810 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
4811 	    ("pmap_release: pmap %p has reserved page table page(s)",
4812 	    pmap));
4813 	KASSERT(CPU_EMPTY(&pmap->pm_active),
4814 	    ("releasing active pmap %p", pmap));
4815 
4816 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
4817 
4818 	if (pmap_is_la57(pmap)) {
4819 		for (i = NPML5EPG / 2; i < NPML5EPG; i++)
4820 			pmap->pm_pmltop[i] = 0;
4821 	} else {
4822 		for (i = 0; i < NKPML4E; i++)	/* KVA */
4823 			pmap->pm_pmltop[KPML4BASE + i] = 0;
4824 #ifdef KASAN
4825 		for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */
4826 			pmap->pm_pmltop[KASANPML4I + i] = 0;
4827 #endif
4828 #ifdef KMSAN
4829 		for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */
4830 			pmap->pm_pmltop[KMSANSHADPML4I + i] = 0;
4831 		for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN shadow map */
4832 			pmap->pm_pmltop[KMSANORIGPML4I + i] = 0;
4833 #endif
4834 		for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
4835 			pmap->pm_pmltop[DMPML4I + i] = 0;
4836 		pmap->pm_pmltop[PML4PML4I] = 0;	/* Recursive Mapping */
4837 		for (i = 0; i < lm_ents; i++)	/* Large Map */
4838 			pmap->pm_pmltop[LMSPML4I + i] = 0;
4839 	}
4840 
4841 	pmap_free_pt_page(NULL, m, true);
4842 	pmap_pt_page_count_pinit(pmap, -1);
4843 
4844 	if (pmap->pm_pmltopu != NULL) {
4845 		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->
4846 		    pm_pmltopu));
4847 		pmap_free_pt_page(NULL, m, false);
4848 		pmap_pt_page_count_pinit(pmap, -1);
4849 	}
4850 	if (pmap->pm_type == PT_X86 &&
4851 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
4852 		rangeset_fini(&pmap->pm_pkru);
4853 
4854 	KASSERT(pmap->pm_stats.resident_count == 0,
4855 	    ("pmap_release: pmap %p resident count %ld != 0",
4856 	    pmap, pmap->pm_stats.resident_count));
4857 }
4858 
4859 static int
4860 kvm_size(SYSCTL_HANDLER_ARGS)
4861 {
4862 	unsigned long ksize = kva_layout.km_high - kva_layout.km_low;
4863 
4864 	return sysctl_handle_long(oidp, &ksize, 0, req);
4865 }
4866 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
4867     0, 0, kvm_size, "LU",
4868     "Size of KVM");
4869 
4870 static int
4871 kvm_free(SYSCTL_HANDLER_ARGS)
4872 {
4873 	unsigned long kfree = kva_layout.km_high - kernel_vm_end;
4874 
4875 	return sysctl_handle_long(oidp, &kfree, 0, req);
4876 }
4877 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
4878     0, 0, kvm_free, "LU",
4879     "Amount of KVM free");
4880 
4881 #ifdef KMSAN
4882 static void
4883 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size)
4884 {
4885 	pdp_entry_t *pdpe;
4886 	pd_entry_t *pde;
4887 	pt_entry_t *pte;
4888 	vm_paddr_t dummypa, dummypd, dummypt;
4889 	int i, npde, npdpg;
4890 
4891 	npdpg = howmany(size, NBPDP);
4892 	npde = size / NBPDR;
4893 
4894 	dummypa = vm_phys_early_alloc(-1, PAGE_SIZE);
4895 	pagezero((void *)PHYS_TO_DMAP(dummypa));
4896 
4897 	dummypt = vm_phys_early_alloc(-1, PAGE_SIZE);
4898 	pagezero((void *)PHYS_TO_DMAP(dummypt));
4899 	dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg);
4900 	for (i = 0; i < npdpg; i++)
4901 		pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i)));
4902 
4903 	pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt);
4904 	for (i = 0; i < NPTEPG; i++)
4905 		pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW |
4906 		    X86_PG_A | X86_PG_M | pg_nx);
4907 
4908 	pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd);
4909 	for (i = 0; i < npde; i++)
4910 		pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx);
4911 
4912 	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(pdppa);
4913 	for (i = 0; i < npdpg; i++)
4914 		pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V |
4915 		    X86_PG_RW | pg_nx);
4916 }
4917 
4918 static void
4919 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end)
4920 {
4921 	vm_size_t size;
4922 
4923 	KASSERT(start % NBPDP == 0, ("unaligned page array start address"));
4924 
4925 	/*
4926 	 * The end of the page array's KVA region is 2MB aligned, see
4927 	 * kmem_init().
4928 	 */
4929 	size = round_2mpage(end) - start;
4930 	pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size);
4931 	pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size);
4932 }
4933 #endif
4934 
4935 /*
4936  * Allocate physical memory for the vm_page array and map it into KVA,
4937  * attempting to back the vm_pages with domain-local memory.
4938  */
4939 void
4940 pmap_page_array_startup(long pages)
4941 {
4942 	pdp_entry_t *pdpe;
4943 	pd_entry_t *pde, newpdir;
4944 	vm_offset_t va, start, end;
4945 	vm_paddr_t pa;
4946 	long pfn;
4947 	int domain, i;
4948 
4949 	vm_page_array_size = pages;
4950 
4951 	start = kva_layout.km_low;
4952 	end = start + pages * sizeof(struct vm_page);
4953 	for (va = start; va < end; va += NBPDR) {
4954 		pfn = first_page + (va - start) / sizeof(struct vm_page);
4955 		domain = vm_phys_domain(ptoa(pfn));
4956 		pdpe = pmap_pdpe(kernel_pmap, va);
4957 		if ((*pdpe & X86_PG_V) == 0) {
4958 			pa = vm_phys_early_alloc(domain, PAGE_SIZE);
4959 			dump_add_page(pa);
4960 			pagezero((void *)PHYS_TO_DMAP(pa));
4961 			*pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW |
4962 			    X86_PG_A | X86_PG_M);
4963 		}
4964 		pde = pmap_pdpe_to_pde(pdpe, va);
4965 		if ((*pde & X86_PG_V) != 0)
4966 			panic("Unexpected pde");
4967 		pa = vm_phys_early_alloc(domain, NBPDR);
4968 		for (i = 0; i < NPDEPG; i++)
4969 			dump_add_page(pa + i * PAGE_SIZE);
4970 		newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A |
4971 		    X86_PG_M | PG_PS | pg_g | pg_nx);
4972 		pde_store(pde, newpdir);
4973 	}
4974 	vm_page_array = (vm_page_t)start;
4975 
4976 #ifdef KMSAN
4977 	pmap_kmsan_page_array_startup(start, end);
4978 #endif
4979 }
4980 
4981 /*
4982  * grow the number of kernel page table entries, if needed
4983  */
4984 static int
4985 pmap_growkernel_nopanic(vm_offset_t addr)
4986 {
4987 	vm_paddr_t paddr;
4988 	vm_page_t nkpg;
4989 	pd_entry_t *pde, newpdir;
4990 	pdp_entry_t *pdpe;
4991 	vm_offset_t end;
4992 	int rv;
4993 
4994 	TSENTER();
4995 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
4996 	rv = KERN_SUCCESS;
4997 
4998 	/*
4999 	 * The kernel map covers two distinct regions of KVA: that used
5000 	 * for dynamic kernel memory allocations, and the uppermost 2GB
5001 	 * of the virtual address space.  The latter is used to map the
5002 	 * kernel and loadable kernel modules.  This scheme enables the
5003 	 * use of a special code generation model for kernel code which
5004 	 * takes advantage of compact addressing modes in machine code.
5005 	 *
5006 	 * Both regions grow upwards; to avoid wasting memory, the gap
5007 	 * in between is unmapped.  If "addr" is above "KERNBASE", the
5008 	 * kernel's region is grown, otherwise the kmem region is grown.
5009 	 *
5010 	 * The correctness of this action is based on the following
5011 	 * argument: vm_map_insert() allocates contiguous ranges of the
5012 	 * kernel virtual address space.  It calls this function if a range
5013 	 * ends after "kernel_vm_end".  If the kernel is mapped between
5014 	 * "kernel_vm_end" and "addr", then the range cannot begin at
5015 	 * "kernel_vm_end".  In fact, its beginning address cannot be less
5016 	 * than the kernel.  Thus, there is no immediate need to allocate
5017 	 * any new kernel page table pages between "kernel_vm_end" and
5018 	 * "KERNBASE".
5019 	 */
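	/*
	 * As a concrete illustration (the numbers are only an example):
	 * with NBPDR == 2MB, a request to grow the map to
	 * addr == KERNBASE + 0x10a12345 lies above KERNBASE, so the
	 * kernel/module region is selected; addr is then rounded up to
	 * the next 2MB boundary, KERNBASE + 0x10c00000, and once the new
	 * page table pages are installed "nkpt" rather than
	 * "kernel_vm_end" is advanced.  A request below KERNBASE would
	 * instead grow the kmem region and advance "kernel_vm_end".
	 */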
5020 	if (KERNBASE < addr) {
5021 		end = KERNBASE + nkpt * NBPDR;
5022 		if (end == 0) {
5023 			TSEXIT();
5024 			return (rv);
5025 		}
5026 	} else {
5027 		end = kernel_vm_end;
5028 	}
5029 
5030 	addr = roundup2(addr, NBPDR);
5031 	if (addr - 1 >= vm_map_max(kernel_map))
5032 		addr = vm_map_max(kernel_map);
5033 	if (addr <= end) {
5034 		/*
5035 		 * The grown region is already mapped, so there is
5036 		 * nothing to do.
5037 		 */
5038 		TSEXIT();
5039 		return (rv);
5040 	}
5041 
5042 	kasan_shadow_map(end, addr - end);
5043 	kmsan_shadow_map(end, addr - end);
5044 	while (end < addr) {
5045 		pdpe = pmap_pdpe(kernel_pmap, end);
5046 		if ((*pdpe & X86_PG_V) == 0) {
5047 			nkpg = pmap_alloc_pt_page(kernel_pmap,
5048 			    pmap_pdpe_pindex(end), VM_ALLOC_INTERRUPT |
5049 			        VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
5050 			if (nkpg == NULL) {
5051 				rv = KERN_RESOURCE_SHORTAGE;
5052 				break;
5053 			}
5054 			paddr = VM_PAGE_TO_PHYS(nkpg);
5055 			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
5056 			    X86_PG_A | X86_PG_M);
5057 			continue; /* try again */
5058 		}
5059 		pde = pmap_pdpe_to_pde(pdpe, end);
5060 		if ((*pde & X86_PG_V) != 0) {
5061 			end = (end + NBPDR) & ~PDRMASK;
5062 			if (end - 1 >= vm_map_max(kernel_map)) {
5063 				end = vm_map_max(kernel_map);
5064 				break;
5065 			}
5066 			continue;
5067 		}
5068 
5069 		nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end),
5070 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOFREE | VM_ALLOC_WIRED |
5071 			VM_ALLOC_ZERO);
5072 		if (nkpg == NULL) {
5073 			rv = KERN_RESOURCE_SHORTAGE;
5074 			break;
5075 		}
5076 
5077 		paddr = VM_PAGE_TO_PHYS(nkpg);
5078 		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
5079 		pde_store(pde, newpdir);
5080 
5081 		end = (end + NBPDR) & ~PDRMASK;
5082 		if (end - 1 >= vm_map_max(kernel_map)) {
5083 			end = vm_map_max(kernel_map);
5084 			break;
5085 		}
5086 	}
5087 
5088 	if (end <= KERNBASE)
5089 		kernel_vm_end = end;
5090 	else
5091 		nkpt = howmany(end - KERNBASE, NBPDR);
5092 	TSEXIT();
5093 	return (rv);
5094 }
5095 
5096 int
5097 pmap_growkernel(vm_offset_t addr)
5098 {
5099 	int rv;
5100 
5101 	rv = pmap_growkernel_nopanic(addr);
5102 	if (rv != KERN_SUCCESS && pmap_growkernel_panic)
5103 		panic("pmap_growkernel: no memory to grow kernel");
5104 	return (rv);
5105 }
5106 
5107 /***************************************************
5108  * Page management routines.
5109  ***************************************************/
5110 
5111 static const uint64_t pc_freemask[_NPCM] = {
5112 	[0 ... _NPCM - 2] = PC_FREEN,
5113 	[_NPCM - 1] = PC_FREEL
5114 };
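/*
 * Each pv_chunk tracks which of its pv entries are free in the pc_map[]
 * bit map, where a set bit means "free".  The entry with index "idx"
 * within a chunk corresponds to pc_map[idx / 64], bit idx % 64; for
 * example, idx == 70 maps to pc_map[1], bit 6.  pc_freemask[] above is
 * the value of pc_map[] for a chunk in which every entry is free.
 */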
5115 
5116 #ifdef PV_STATS
5117 
5118 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count);
5119 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD,
5120     &pc_chunk_count, "Current number of pv entry chunks");
5121 
5122 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs);
5123 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD,
5124     &pc_chunk_allocs, "Total number of pv entry chunks allocated");
5125 
5126 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees);
5127 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD,
5128     &pc_chunk_frees, "Total number of pv entry chunks freed");
5129 
5130 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail);
5131 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD,
5132     &pc_chunk_tryfail,
5133     "Number of failed attempts to get a pv entry chunk page");
5134 
5135 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees);
5136 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD,
5137     &pv_entry_frees, "Total number of pv entries freed");
5138 
5139 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs);
5140 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD,
5141     &pv_entry_allocs, "Total number of pv entries allocated");
5142 
5143 static COUNTER_U64_DEFINE_EARLY(pv_entry_count);
5144 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD,
5145     &pv_entry_count, "Current number of pv entries");
5146 
5147 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare);
5148 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD,
5149     &pv_entry_spare, "Current number of spare pv entries");
5150 #endif
5151 
5152 static void
5153 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
5154 {
5155 
5156 	if (pmap == NULL)
5157 		return;
5158 	pmap_invalidate_all(pmap);
5159 	if (pmap != locked_pmap)
5160 		PMAP_UNLOCK(pmap);
5161 	if (start_di)
5162 		pmap_delayed_invl_finish();
5163 }
5164 
5165 /*
5166  * We are in a serious low memory condition.  Resort to
5167  * drastic measures to free some pages so we can allocate
5168  * another pv entry chunk.
5169  *
5170  * Returns NULL if PV entries were reclaimed from the specified pmap.
5171  *
5172  * We do not, however, unmap 2mpages because subsequent accesses will
5173  * allocate per-page pv entries until repromotion occurs, thereby
5174  * exacerbating the shortage of free pv entries.
5175  */
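/*
 * A typical caller retries its allocation with the reclaimed page, as
 * get_pv_entry() below does; roughly (sketch only):
 *
 *	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 *	if (m == NULL) {
 *		m = reclaim_pv_chunk(pmap, lockp);
 *		if (m == NULL)
 *			goto retry;
 *	}
 */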
5176 static vm_page_t
5177 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
5178 {
5179 	struct pv_chunks_list *pvc;
5180 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
5181 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
5182 	struct md_page *pvh;
5183 	pd_entry_t *pde;
5184 	pmap_t next_pmap, pmap;
5185 	pt_entry_t *pte, tpte;
5186 	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
5187 	pv_entry_t pv;
5188 	vm_offset_t va;
5189 	vm_page_t m, m_pc;
5190 	struct spglist free;
5191 	uint64_t inuse;
5192 	int bit, field, freed;
5193 	bool start_di, restart;
5194 
5195 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
5196 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
5197 	pmap = NULL;
5198 	m_pc = NULL;
5199 	PG_G = PG_A = PG_M = PG_RW = 0;
5200 	SLIST_INIT(&free);
5201 	bzero(&pc_marker_b, sizeof(pc_marker_b));
5202 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
5203 	pc_marker = (struct pv_chunk *)&pc_marker_b;
5204 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
5205 
5206 	/*
5207 	 * A delayed invalidation block should already be active if
5208 	 * pmap_advise() or pmap_remove() called this function by way
5209 	 * of pmap_demote_pde_locked().
5210 	 */
5211 	start_di = pmap_not_in_di();
5212 
5213 	pvc = &pv_chunks[domain];
5214 	mtx_lock(&pvc->pvc_lock);
5215 	pvc->active_reclaims++;
5216 	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
5217 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
5218 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
5219 	    SLIST_EMPTY(&free)) {
5220 		next_pmap = pc->pc_pmap;
5221 		if (next_pmap == NULL) {
5222 			/*
5223 			 * The next chunk is a marker.  However, it is
5224 			 * not our marker, so active_reclaims must be
5225 			 * > 1.  Consequently, the next_chunk code
5226 			 * will not rotate the pv_chunks list.
5227 			 */
5228 			goto next_chunk;
5229 		}
5230 		mtx_unlock(&pvc->pvc_lock);
5231 
5232 		/*
5233 		 * A pv_chunk can only be removed from the pc_lru list
5234 		 * when both the per-domain chunk lock, pvc->pvc_lock, is
5235 		 * owned and the corresponding pmap is locked.
5236 		 */
5237 		if (pmap != next_pmap) {
5238 			restart = false;
5239 			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
5240 			    start_di);
5241 			pmap = next_pmap;
5242 			/* Avoid deadlock and lock recursion. */
5243 			if (pmap > locked_pmap) {
5244 				RELEASE_PV_LIST_LOCK(lockp);
5245 				PMAP_LOCK(pmap);
5246 				if (start_di)
5247 					pmap_delayed_invl_start();
5248 				mtx_lock(&pvc->pvc_lock);
5249 				restart = true;
5250 			} else if (pmap != locked_pmap) {
5251 				if (PMAP_TRYLOCK(pmap)) {
5252 					if (start_di)
5253 						pmap_delayed_invl_start();
5254 					mtx_lock(&pvc->pvc_lock);
5255 					restart = true;
5256 				} else {
5257 					pmap = NULL; /* pmap is not locked */
5258 					mtx_lock(&pvc->pvc_lock);
5259 					pc = TAILQ_NEXT(pc_marker, pc_lru);
5260 					if (pc == NULL ||
5261 					    pc->pc_pmap != next_pmap)
5262 						continue;
5263 					goto next_chunk;
5264 				}
5265 			} else if (start_di)
5266 				pmap_delayed_invl_start();
5267 			PG_G = pmap_global_bit(pmap);
5268 			PG_A = pmap_accessed_bit(pmap);
5269 			PG_M = pmap_modified_bit(pmap);
5270 			PG_RW = pmap_rw_bit(pmap);
5271 			if (restart)
5272 				continue;
5273 		}
5274 
5275 		/*
5276 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
5277 		 */
5278 		freed = 0;
5279 		for (field = 0; field < _NPCM; field++) {
5280 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
5281 			    inuse != 0; inuse &= ~(1UL << bit)) {
5282 				bit = bsfq(inuse);
5283 				pv = &pc->pc_pventry[field * 64 + bit];
5284 				va = pv->pv_va;
5285 				pde = pmap_pde(pmap, va);
5286 				if ((*pde & PG_PS) != 0)
5287 					continue;
5288 				pte = pmap_pde_to_pte(pde, va);
5289 				if ((*pte & PG_W) != 0)
5290 					continue;
5291 				tpte = pte_load_clear(pte);
5292 				if ((tpte & PG_G) != 0)
5293 					pmap_invalidate_page(pmap, va);
5294 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
5295 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5296 					vm_page_dirty(m);
5297 				if ((tpte & PG_A) != 0)
5298 					vm_page_aflag_set(m, PGA_REFERENCED);
5299 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5300 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5301 				m->md.pv_gen++;
5302 				if (TAILQ_EMPTY(&m->md.pv_list) &&
5303 				    (m->flags & PG_FICTITIOUS) == 0) {
5304 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5305 					if (TAILQ_EMPTY(&pvh->pv_list)) {
5306 						vm_page_aflag_clear(m,
5307 						    PGA_WRITEABLE);
5308 					}
5309 				}
5310 				pmap_delayed_invl_page(m);
5311 				pc->pc_map[field] |= 1UL << bit;
5312 				pmap_unuse_pt(pmap, va, *pde, &free);
5313 				freed++;
5314 			}
5315 		}
5316 		if (freed == 0) {
5317 			mtx_lock(&pvc->pvc_lock);
5318 			goto next_chunk;
5319 		}
5320 		/* Every freed mapping is for a 4 KB page. */
5321 		pmap_resident_count_adj(pmap, -freed);
5322 		PV_STAT(counter_u64_add(pv_entry_frees, freed));
5323 		PV_STAT(counter_u64_add(pv_entry_spare, freed));
5324 		PV_STAT(counter_u64_add(pv_entry_count, -freed));
5325 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5326 		if (pc_is_free(pc)) {
5327 			PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV));
5328 			PV_STAT(counter_u64_add(pc_chunk_count, -1));
5329 			PV_STAT(counter_u64_add(pc_chunk_frees, 1));
5330 			/* Entire chunk is free; return it. */
5331 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
5332 			dump_drop_page(m_pc->phys_addr);
5333 			mtx_lock(&pvc->pvc_lock);
5334 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
5335 			break;
5336 		}
5337 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
5338 		mtx_lock(&pvc->pvc_lock);
5339 		/* One freed pv entry in locked_pmap is sufficient. */
5340 		if (pmap == locked_pmap)
5341 			break;
5342 next_chunk:
5343 		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
5344 		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
5345 		if (pvc->active_reclaims == 1 && pmap != NULL) {
5346 			/*
5347 			 * Rotate the pv chunks list so that we do not
5348 			 * scan the same pv chunks that could not be
5349 			 * freed (because they contained a wired
5350 			 * and/or superpage mapping) on every
5351 			 * invocation of reclaim_pv_chunk().
5352 			 */
5353 			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) {
5354 				MPASS(pc->pc_pmap != NULL);
5355 				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
5356 				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
5357 			}
5358 		}
5359 	}
5360 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
5361 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
5362 	pvc->active_reclaims--;
5363 	mtx_unlock(&pvc->pvc_lock);
5364 	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
5365 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
5366 		m_pc = SLIST_FIRST(&free);
5367 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
5368 		/* Recycle a freed page table page. */
5369 		m_pc->ref_count = 1;
5370 	}
5371 	vm_page_free_pages_toq(&free, true);
5372 	return (m_pc);
5373 }
5374 
5375 static vm_page_t
5376 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
5377 {
5378 	vm_page_t m;
5379 	int i, domain;
5380 
5381 	domain = PCPU_GET(domain);
5382 	for (i = 0; i < vm_ndomains; i++) {
5383 		m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
5384 		if (m != NULL)
5385 			break;
5386 		domain = (domain + 1) % vm_ndomains;
5387 	}
5388 
5389 	return (m);
5390 }
5391 
5392 /*
5393  * free the pv_entry back to the free list
5394  */
5395 static void
5396 free_pv_entry(pmap_t pmap, pv_entry_t pv)
5397 {
5398 	struct pv_chunk *pc;
5399 	int idx, field, bit;
5400 
5401 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5402 	PV_STAT(counter_u64_add(pv_entry_frees, 1));
5403 	PV_STAT(counter_u64_add(pv_entry_spare, 1));
5404 	PV_STAT(counter_u64_add(pv_entry_count, -1));
5405 	pc = pv_to_chunk(pv);
5406 	idx = pv - &pc->pc_pventry[0];
5407 	field = idx / 64;
5408 	bit = idx % 64;
5409 	pc->pc_map[field] |= 1ul << bit;
5410 	if (!pc_is_free(pc)) {
5411 		/* 98% of the time, pc is already at the head of the list. */
5412 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
5413 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5414 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
5415 		}
5416 		return;
5417 	}
5418 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5419 	free_pv_chunk(pc);
5420 }
5421 
5422 static void
5423 free_pv_chunk_dequeued(struct pv_chunk *pc)
5424 {
5425 	vm_page_t m;
5426 
5427 	PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV));
5428 	PV_STAT(counter_u64_add(pc_chunk_count, -1));
5429 	PV_STAT(counter_u64_add(pc_chunk_frees, 1));
5430 	counter_u64_add(pv_page_count, -1);
5431 	/* entire chunk is free, return it */
5432 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
5433 	dump_drop_page(m->phys_addr);
5434 	vm_page_unwire_noq(m);
5435 	vm_page_free(m);
5436 }
5437 
5438 static void
5439 free_pv_chunk(struct pv_chunk *pc)
5440 {
5441 	struct pv_chunks_list *pvc;
5442 
5443 	pvc = &pv_chunks[pc_to_domain(pc)];
5444 	mtx_lock(&pvc->pvc_lock);
5445 	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
5446 	mtx_unlock(&pvc->pvc_lock);
5447 	free_pv_chunk_dequeued(pc);
5448 }
5449 
5450 static void
5451 free_pv_chunk_batch(struct pv_chunklist *batch)
5452 {
5453 	struct pv_chunks_list *pvc;
5454 	struct pv_chunk *pc, *npc;
5455 	int i;
5456 
5457 	for (i = 0; i < vm_ndomains; i++) {
5458 		if (TAILQ_EMPTY(&batch[i]))
5459 			continue;
5460 		pvc = &pv_chunks[i];
5461 		mtx_lock(&pvc->pvc_lock);
5462 		TAILQ_FOREACH(pc, &batch[i], pc_list) {
5463 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
5464 		}
5465 		mtx_unlock(&pvc->pvc_lock);
5466 	}
5467 
5468 	for (i = 0; i < vm_ndomains; i++) {
5469 		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
5470 			free_pv_chunk_dequeued(pc);
5471 		}
5472 	}
5473 }
5474 
5475 /*
5476  * Returns a new PV entry, allocating a new PV chunk from the system when
5477  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
5478  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
5479  * returned.
5480  *
5481  * The given PV list lock may be released.
5482  */
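/*
 * A minimal usage sketch, mirroring pmap_try_insert_pv_entry() below;
 * passing NULL instead of a lock pointer disables reclamation:
 *
 *	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 *		pv->pv_va = va;
 *		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 *		m->md.pv_gen++;
 *	}
 */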
5483 static pv_entry_t
5484 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
5485 {
5486 	struct pv_chunks_list *pvc;
5487 	int bit, field;
5488 	pv_entry_t pv;
5489 	struct pv_chunk *pc;
5490 	vm_page_t m;
5491 
5492 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5493 	PV_STAT(counter_u64_add(pv_entry_allocs, 1));
5494 retry:
5495 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
5496 	if (pc != NULL) {
5497 		for (field = 0; field < _NPCM; field++) {
5498 			if (pc->pc_map[field]) {
5499 				bit = bsfq(pc->pc_map[field]);
5500 				break;
5501 			}
5502 		}
5503 		if (field < _NPCM) {
5504 			pv = &pc->pc_pventry[field * 64 + bit];
5505 			pc->pc_map[field] &= ~(1ul << bit);
5506 			/* If this was the last item, move it to tail */
5507 			if (pc_is_full(pc)) {
5508 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5509 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
5510 				    pc_list);
5511 			}
5512 			PV_STAT(counter_u64_add(pv_entry_count, 1));
5513 			PV_STAT(counter_u64_add(pv_entry_spare, -1));
5514 			return (pv);
5515 		}
5516 	}
5517 	/* No free items, allocate another chunk */
5518 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5519 	if (m == NULL) {
5520 		if (lockp == NULL) {
5521 			PV_STAT(counter_u64_add(pc_chunk_tryfail, 1));
5522 			return (NULL);
5523 		}
5524 		m = reclaim_pv_chunk(pmap, lockp);
5525 		if (m == NULL)
5526 			goto retry;
5527 	} else
5528 		counter_u64_add(pv_page_count, 1);
5529 	PV_STAT(counter_u64_add(pc_chunk_count, 1));
5530 	PV_STAT(counter_u64_add(pc_chunk_allocs, 1));
5531 	dump_add_page(m->phys_addr);
5532 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
5533 	pc->pc_pmap = pmap;
5534 	pc->pc_map[0] = PC_FREEN & ~1ul;	/* preallocated bit 0 */
5535 	pc->pc_map[1] = PC_FREEN;
5536 	pc->pc_map[2] = PC_FREEL;
5537 	pvc = &pv_chunks[vm_page_domain(m)];
5538 	mtx_lock(&pvc->pvc_lock);
5539 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
5540 	mtx_unlock(&pvc->pvc_lock);
5541 	pv = &pc->pc_pventry[0];
5542 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
5543 	PV_STAT(counter_u64_add(pv_entry_count, 1));
5544 	PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1));
5545 	return (pv);
5546 }
5547 
5548 /*
5549  * Returns the number of one bits within the given PV chunk map.
5550  *
5551  * The errata for Intel processors state that "POPCNT Instruction May
5552  * Take Longer to Execute Than Expected".  It is believed that the
5553  * issue is the spurious dependency on the destination register.
5554  * Provide a hint to the register rename logic that the destination
5555  * value is overwritten, by clearing it, as suggested in the
5556  * optimization manual.  It should be cheap for unaffected processors
5557  * as well.
5558  *
5559  * Reference numbers for these errata are
5560  * 4th Gen Core: HSD146
5561  * 5th Gen Core: BDM85
5562  * 6th Gen Core: SKL029
5563  */
5564 static int
5565 popcnt_pc_map_pq(uint64_t *map)
5566 {
5567 	u_long result, tmp;
5568 
5569 	__asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
5570 	    "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
5571 	    "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
5572 	    : "=&r" (result), "=&r" (tmp)
5573 	    : "m" (map[0]), "m" (map[1]), "m" (map[2]));
5574 	return (result);
5575 }
5576 
5577 /*
5578  * Ensure that the number of spare PV entries in the specified pmap meets or
5579  * exceeds the given count, "needed".
5580  *
5581  * The given PV list lock may be released.
5582  */
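/*
 * For example, pmap_demote_pde_mpte() below calls
 * reserve_pv_entries(pmap, NPTEPG - 1, lockp) before it rewrites the
 * PDE, so that the subsequent pmap_pv_demote_pde() cannot run out of
 * pv entries partway through the demotion.
 */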
5583 static void
5584 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
5585 {
5586 	struct pv_chunks_list *pvc;
5587 	struct pch new_tail[PMAP_MEMDOM];
5588 	struct pv_chunk *pc;
5589 	vm_page_t m;
5590 	int avail, free, i;
5591 	bool reclaimed;
5592 
5593 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5594 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
5595 
5596 	/*
5597 	 * Newly allocated PV chunks must be stored in a private list until
5598 	 * the required number of PV chunks have been allocated.  Otherwise,
5599 	 * reclaim_pv_chunk() could recycle one of these chunks.  In contrast,
5600 	 * the chunks are added to the pmap's chunk list immediately upon allocation.
5601 	 */
5602 	for (i = 0; i < PMAP_MEMDOM; i++)
5603 		TAILQ_INIT(&new_tail[i]);
5604 retry:
5605 	avail = 0;
5606 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
5607 #ifndef __POPCNT__
5608 		if ((cpu_feature2 & CPUID2_POPCNT) == 0)
5609 			bit_count((bitstr_t *)pc->pc_map, 0,
5610 			    sizeof(pc->pc_map) * NBBY, &free);
5611 		else
5612 #endif
5613 		free = popcnt_pc_map_pq(pc->pc_map);
5614 		if (free == 0)
5615 			break;
5616 		avail += free;
5617 		if (avail >= needed)
5618 			break;
5619 	}
5620 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
5621 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5622 		if (m == NULL) {
5623 			m = reclaim_pv_chunk(pmap, lockp);
5624 			if (m == NULL)
5625 				goto retry;
5626 			reclaimed = true;
5627 		} else
5628 			counter_u64_add(pv_page_count, 1);
5629 		PV_STAT(counter_u64_add(pc_chunk_count, 1));
5630 		PV_STAT(counter_u64_add(pc_chunk_allocs, 1));
5631 		dump_add_page(m->phys_addr);
5632 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
5633 		pc->pc_pmap = pmap;
5634 		pc->pc_map[0] = PC_FREEN;
5635 		pc->pc_map[1] = PC_FREEN;
5636 		pc->pc_map[2] = PC_FREEL;
5637 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
5638 		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
5639 		PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV));
5640 
5641 		/*
5642 		 * The reclaim might have freed a chunk from the current pmap.
5643 		 * If that chunk contained available entries, we need to
5644 		 * re-count the number of available entries.
5645 		 */
5646 		if (reclaimed)
5647 			goto retry;
5648 	}
5649 	for (i = 0; i < vm_ndomains; i++) {
5650 		if (TAILQ_EMPTY(&new_tail[i]))
5651 			continue;
5652 		pvc = &pv_chunks[i];
5653 		mtx_lock(&pvc->pvc_lock);
5654 		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
5655 		mtx_unlock(&pvc->pvc_lock);
5656 	}
5657 }
5658 
5659 /*
5660  * First find and then remove the pv entry for the specified pmap and virtual
5661  * address from the specified pv list.  Returns the pv entry if found and NULL
5662  * otherwise.  This operation can be performed on pv lists for either 4KB or
5663  * 2MB page mappings.
5664  */
5665 static __inline pv_entry_t
5666 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
5667 {
5668 	pv_entry_t pv;
5669 
5670 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5671 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
5672 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5673 			pvh->pv_gen++;
5674 			break;
5675 		}
5676 	}
5677 	return (pv);
5678 }
5679 
5680 /*
5681  * After demotion from a 2MB page mapping to 512 4KB page mappings,
5682  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
5683  * entries for each of the 4KB page mappings.
5684  */
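/*
 * A demotion reuses the existing 2MB pv entry for the first 4KB page,
 * so only NPTEPG - 1 (511) additional pv entries are needed; they are
 * expected to have been set aside by reserve_pv_entries() beforehand.
 */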
5685 static void
5686 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
5687     struct rwlock **lockp)
5688 {
5689 	struct md_page *pvh;
5690 	struct pv_chunk *pc;
5691 	pv_entry_t pv;
5692 	vm_offset_t va_last;
5693 	vm_page_t m;
5694 	int bit, field;
5695 
5696 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5697 	KASSERT((pa & PDRMASK) == 0,
5698 	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
5699 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
5700 
5701 	/*
5702 	 * Transfer the 2mpage's pv entry for this mapping to the first
5703 	 * page's pv list.  Once this transfer begins, the pv list lock
5704 	 * must not be released until the last pv entry is reinstantiated.
5705 	 */
5706 	pvh = pa_to_pvh(pa);
5707 	va = trunc_2mpage(va);
5708 	pv = pmap_pvh_remove(pvh, pmap, va);
5709 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
5710 	m = PHYS_TO_VM_PAGE(pa);
5711 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5712 	m->md.pv_gen++;
5713 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
5714 	PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1));
5715 	va_last = va + NBPDR - PAGE_SIZE;
5716 	for (;;) {
5717 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
5718 		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_pde: missing spare"));
5719 		for (field = 0; field < _NPCM; field++) {
5720 			while (pc->pc_map[field]) {
5721 				bit = bsfq(pc->pc_map[field]);
5722 				pc->pc_map[field] &= ~(1ul << bit);
5723 				pv = &pc->pc_pventry[field * 64 + bit];
5724 				va += PAGE_SIZE;
5725 				pv->pv_va = va;
5726 				m++;
5727 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5728 			    ("pmap_pv_demote_pde: page %p is not managed", m));
5729 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5730 				m->md.pv_gen++;
5731 				if (va == va_last)
5732 					goto out;
5733 			}
5734 		}
5735 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5736 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
5737 	}
5738 out:
5739 	if (pc_is_full(pc)) {
5740 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5741 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
5742 	}
5743 	PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1));
5744 	PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1)));
5745 }
5746 
5747 #if VM_NRESERVLEVEL > 0
5748 /*
5749  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
5750  * replace the many pv entries for the 4KB page mappings by a single pv entry
5751  * for the 2MB page mapping.
5752  */
5753 static void
5754 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
5755     struct rwlock **lockp)
5756 {
5757 	struct md_page *pvh;
5758 	pv_entry_t pv;
5759 	vm_offset_t va_last;
5760 	vm_page_t m;
5761 
5762 	KASSERT((pa & PDRMASK) == 0,
5763 	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
5764 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
5765 
5766 	/*
5767 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
5768 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
5769 	 * a transfer avoids the possibility that get_pv_entry() calls
5770 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
5771 	 * mappings that is being promoted.
5772 	 */
5773 	m = PHYS_TO_VM_PAGE(pa);
5774 	va = trunc_2mpage(va);
5775 	pv = pmap_pvh_remove(&m->md, pmap, va);
5776 	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
5777 	pvh = pa_to_pvh(pa);
5778 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5779 	pvh->pv_gen++;
5780 	/* Free the remaining NPTEPG - 1 pv entries. */
5781 	va_last = va + NBPDR - PAGE_SIZE;
5782 	do {
5783 		m++;
5784 		va += PAGE_SIZE;
5785 		pmap_pvh_free(&m->md, pmap, va);
5786 	} while (va < va_last);
5787 }
5788 #endif /* VM_NRESERVLEVEL > 0 */
5789 
5790 /*
5791  * First find and then destroy the pv entry for the specified pmap and virtual
5792  * address.  This operation can be performed on pv lists for either 4KB or 2MB
5793  * page mappings.
5794  */
5795 static void
5796 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
5797 {
5798 	pv_entry_t pv;
5799 
5800 	pv = pmap_pvh_remove(pvh, pmap, va);
5801 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
5802 	free_pv_entry(pmap, pv);
5803 }
5804 
5805 /*
5806  * Conditionally create the PV entry for a 4KB page mapping if the required
5807  * memory can be allocated without resorting to reclamation.
5808  */
5809 static bool
5810 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
5811     struct rwlock **lockp)
5812 {
5813 	pv_entry_t pv;
5814 
5815 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5816 	/* Pass NULL instead of the lock pointer to disable reclamation. */
5817 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
5818 		pv->pv_va = va;
5819 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5820 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5821 		m->md.pv_gen++;
5822 		return (true);
5823 	} else
5824 		return (false);
5825 }
5826 
5827 /*
5828  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
5829  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
5830  * false if the PV entry cannot be allocated without resorting to reclamation.
5831  */
5832 static bool
5833 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
5834     struct rwlock **lockp)
5835 {
5836 	struct md_page *pvh;
5837 	pv_entry_t pv;
5838 	vm_paddr_t pa;
5839 
5840 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5841 	/* Pass NULL instead of the lock pointer to disable reclamation. */
5842 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
5843 	    NULL : lockp)) == NULL)
5844 		return (false);
5845 	pv->pv_va = va;
5846 	pa = pde & PG_PS_FRAME;
5847 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
5848 	pvh = pa_to_pvh(pa);
5849 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5850 	pvh->pv_gen++;
5851 	return (true);
5852 }
5853 
5854 /*
5855  * Fills a page table page with mappings to consecutive physical pages.
5856  */
5857 static void
5858 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
5859 {
5860 	pt_entry_t *pte;
5861 
5862 	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
5863 		*pte = newpte;
5864 		newpte += PAGE_SIZE;
5865 	}
5866 }
5867 
5868 /*
5869  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
5870  * mapping is invalidated.
5871  */
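/*
 * Callers that already hold a pv list lock use the _locked variant
 * directly; for example (sketch, as in pmap_remove_all() below):
 *
 *	pde = pmap_pde(pmap, va);
 *	(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 */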
5872 static bool
5873 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
5874 {
5875 	struct rwlock *lock;
5876 	bool rv;
5877 
5878 	lock = NULL;
5879 	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
5880 	if (lock != NULL)
5881 		rw_wunlock(lock);
5882 	return (rv);
5883 }
5884 
5885 static void
5886 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused)
5887 {
5888 #ifdef INVARIANTS
5889 #ifdef DIAGNOSTIC
5890 	pt_entry_t *xpte, *ypte;
5891 
5892 	for (xpte = firstpte; xpte < firstpte + NPTEPG;
5893 	    xpte++, newpte += PAGE_SIZE) {
5894 		if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) {
5895 			printf("pmap_demote_pde: xpte %zd and newpte map "
5896 			    "different pages: found %#lx, expected %#lx\n",
5897 			    xpte - firstpte, *xpte, newpte);
5898 			printf("page table dump\n");
5899 			for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++)
5900 				printf("%zd %#lx\n", ypte - firstpte, *ypte);
5901 			panic("firstpte");
5902 		}
5903 	}
5904 #else
5905 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
5906 	    ("pmap_demote_pde: firstpte and newpte map different physical"
5907 	    " addresses"));
5908 #endif
5909 #endif
5910 }
5911 
5912 static void
5913 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
5914     pd_entry_t oldpde, struct rwlock **lockp)
5915 {
5916 	struct spglist free;
5917 	vm_offset_t sva;
5918 
5919 	SLIST_INIT(&free);
5920 	sva = trunc_2mpage(va);
5921 	pmap_remove_pde(pmap, pde, sva, true, &free, lockp);
5922 	if ((oldpde & pmap_global_bit(pmap)) == 0)
5923 		pmap_invalidate_pde_page(pmap, sva, oldpde);
5924 	vm_page_free_pages_toq(&free, true);
5925 	CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p",
5926 	    va, pmap);
5927 }
5928 
5929 static bool
5930 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
5931     struct rwlock **lockp)
5932 {
5933 	return (pmap_demote_pde_mpte(pmap, pde, va, lockp, NULL));
5934 }
5935 
5936 static bool
5937 pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
5938     struct rwlock **lockp, vm_page_t mpte)
5939 {
5940 	pd_entry_t newpde, oldpde;
5941 	pt_entry_t *firstpte, newpte;
5942 	pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
5943 	vm_paddr_t mptepa;
5944 	int PG_PTE_CACHE;
5945 	bool in_kernel;
5946 
5947 	PG_A = pmap_accessed_bit(pmap);
5948 	PG_G = pmap_global_bit(pmap);
5949 	PG_M = pmap_modified_bit(pmap);
5950 	PG_RW = pmap_rw_bit(pmap);
5951 	PG_V = pmap_valid_bit(pmap);
5952 	PG_PTE_CACHE = pmap_cache_mask(pmap, false);
5953 	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
5954 
5955 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5956 	oldpde = *pde;
5957 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
5958 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
5959 	KASSERT((oldpde & PG_MANAGED) == 0 || lockp != NULL,
5960 	    ("pmap_demote_pde: lockp for a managed mapping is NULL"));
5961 	in_kernel = va >= VM_MAXUSER_ADDRESS;
5962 	if (mpte == NULL) {
5963 		/*
5964 		 * Invalidate the 2MB page mapping and return "failure" if the
5965 		 * mapping was never accessed and not wired.
5966 		 */
5967 		if ((oldpde & PG_A) == 0) {
5968 			if ((oldpde & PG_W) == 0) {
5969 				pmap_demote_pde_abort(pmap, va, pde, oldpde,
5970 				    lockp);
5971 				return (false);
5972 			}
5973 			mpte = pmap_remove_pt_page(pmap, va);
5974 			/* Fill the PTP with PTEs that have PG_A cleared. */
5975 			mpte->valid = 0;
5976 		} else if ((mpte = pmap_remove_pt_page(pmap, va)) == NULL) {
5977 			KASSERT((oldpde & PG_W) == 0,
5978     ("pmap_demote_pde: page table page for a wired mapping is missing"));
5979 
5980 			/*
5981 			 * If the page table page is missing and the mapping
5982 			 * is for a kernel address, the mapping must belong to
5983 			 * the direct map.  Page table pages are preallocated
5984 			 * for every other part of the kernel address space,
5985 			 * so the direct map region is the only part of the
5986 			 * kernel address space that must be handled here.
5987 			 */
5988 			KASSERT(!in_kernel || (va >= kva_layout.dmap_low &&
5989 			    va < kva_layout.dmap_high),
5990 			    ("pmap_demote_pde: No saved mpte for va %#lx", va));
5991 
5992 			/*
5993 			 * If the 2MB page mapping belongs to the direct map
5994 			 * region of the kernel's address space, then the page
5995 			 * allocation request specifies the highest possible
5996 			 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
5997 			 * priority is normal.
5998 			 */
5999 			mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
6000 			    (in_kernel ? VM_ALLOC_INTERRUPT : 0) |
6001 			    VM_ALLOC_WIRED);
6002 
6003 			/*
6004 			 * If the allocation of the new page table page fails,
6005 			 * invalidate the 2MB page mapping and return "failure".
6006 			 */
6007 			if (mpte == NULL) {
6008 				pmap_demote_pde_abort(pmap, va, pde, oldpde,
6009 				    lockp);
6010 				return (false);
6011 			}
6012 
6013 			if (!in_kernel)
6014 				mpte->ref_count = NPTEPG;
6015 		}
6016 	}
6017 	mptepa = VM_PAGE_TO_PHYS(mpte);
6018 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
6019 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
6020 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
6021 	    ("pmap_demote_pde: oldpde is missing PG_M"));
6022 	newpte = oldpde & ~PG_PS;
6023 	newpte = pmap_swap_pat(pmap, newpte);
6024 
6025 	/*
6026 	 * If the PTP is not leftover from an earlier promotion or it does not
6027 	 * have PG_A set in every PTE, then fill it.  The new PTEs will all
6028 	 * have PG_A set, unless this is a wired mapping with PG_A clear.
6029 	 */
6030 	if (!vm_page_all_valid(mpte))
6031 		pmap_fill_ptp(firstpte, newpte);
6032 
6033 	pmap_demote_pde_check(firstpte, newpte);
6034 
6035 	/*
6036 	 * If the mapping has changed attributes, update the PTEs.
6037 	 */
6038 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
6039 		pmap_fill_ptp(firstpte, newpte);
6040 
6041 	/*
6042 	 * The spare PV entries must be reserved prior to demoting the
6043 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
6044 	 * of the PDE and the PV lists will be inconsistent, which can result
6045 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
6046 	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
6047 	 * PV entry for the 2MB page mapping that is being demoted.
6048 	 */
6049 	if ((oldpde & PG_MANAGED) != 0)
6050 		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
6051 
6052 	/*
6053 	 * Demote the mapping.  This pmap is locked.  The old PDE has
6054 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
6055 	 * set.  Thus, there is no danger of a race with another
6056 	 * processor changing the setting of PG_A and/or PG_M between
6057 	 * the read above and the store below.
6058 	 */
6059 	if (workaround_erratum383)
6060 		pmap_update_pde(pmap, va, pde, newpde);
6061 	else
6062 		pde_store(pde, newpde);
6063 
6064 	/*
6065 	 * Invalidate a stale recursive mapping of the page table page.
6066 	 */
6067 	if (in_kernel)
6068 		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
6069 
6070 	/*
6071 	 * Demote the PV entry.
6072 	 */
6073 	if ((oldpde & PG_MANAGED) != 0)
6074 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
6075 
6076 	counter_u64_add(pmap_pde_demotions, 1);
6077 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p",
6078 	    va, pmap);
6079 	return (true);
6080 }
6081 
6082 /*
6083  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
6084  */
6085 static void
6086 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
6087 {
6088 	pd_entry_t newpde;
6089 	vm_paddr_t mptepa;
6090 	vm_page_t mpte;
6091 
6092 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
6093 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6094 	mpte = pmap_remove_pt_page(pmap, va);
6095 	KASSERT(mpte != NULL, ("pmap_remove_kernel_pde: missing pt page"));
6096 
6097 	mptepa = VM_PAGE_TO_PHYS(mpte);
6098 	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
6099 
6100 	/*
6101 	 * If this page table page was unmapped by a promotion, then it
6102 	 * contains valid mappings.  Zero it to invalidate those mappings.
6103 	 */
6104 	if (vm_page_any_valid(mpte))
6105 		pagezero((void *)PHYS_TO_DMAP(mptepa));
6106 
6107 	/*
6108 	 * Demote the mapping.
6109 	 */
6110 	if (workaround_erratum383)
6111 		pmap_update_pde(pmap, va, pde, newpde);
6112 	else
6113 		pde_store(pde, newpde);
6114 
6115 	/*
6116 	 * Invalidate a stale recursive mapping of the page table page.
6117 	 */
6118 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
6119 }
6120 
6121 /*
6122  * pmap_remove_pde: remove a 2MB (superpage) mapping from the given pmap
6123  */
6124 static int
6125 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool demote_kpde,
6126     struct spglist *free, struct rwlock **lockp)
6127 {
6128 	struct md_page *pvh;
6129 	pd_entry_t oldpde;
6130 	vm_offset_t eva, va;
6131 	vm_page_t m, mpte;
6132 	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
6133 
6134 	PG_G = pmap_global_bit(pmap);
6135 	PG_A = pmap_accessed_bit(pmap);
6136 	PG_M = pmap_modified_bit(pmap);
6137 	PG_RW = pmap_rw_bit(pmap);
6138 
6139 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6140 	KASSERT((sva & PDRMASK) == 0,
6141 	    ("pmap_remove_pde: sva is not 2mpage aligned"));
6142 	oldpde = pte_load_clear(pdq);
6143 	if (oldpde & PG_W)
6144 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
6145 	if ((oldpde & PG_G) != 0)
6146 		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
6147 	pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE);
6148 	if (oldpde & PG_MANAGED) {
6149 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
6150 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
6151 		pmap_pvh_free(pvh, pmap, sva);
6152 		eva = sva + NBPDR;
6153 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
6154 		    va < eva; va += PAGE_SIZE, m++) {
6155 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
6156 				vm_page_dirty(m);
6157 			if (oldpde & PG_A)
6158 				vm_page_aflag_set(m, PGA_REFERENCED);
6159 			if (TAILQ_EMPTY(&m->md.pv_list) &&
6160 			    TAILQ_EMPTY(&pvh->pv_list))
6161 				vm_page_aflag_clear(m, PGA_WRITEABLE);
6162 			pmap_delayed_invl_page(m);
6163 		}
6164 	}
6165 	if (pmap != kernel_pmap) {
6166 		mpte = pmap_remove_pt_page(pmap, sva);
6167 		if (mpte != NULL) {
6168 			KASSERT(vm_page_any_valid(mpte),
6169 			    ("pmap_remove_pde: pte page not promoted"));
6170 			pmap_pt_page_count_adj(pmap, -1);
6171 			KASSERT(mpte->ref_count == NPTEPG,
6172 			    ("pmap_remove_pde: pte page ref count error"));
6173 			mpte->ref_count = 0;
6174 			pmap_add_delayed_free_list(mpte, free, false);
6175 		}
6176 	} else if (demote_kpde) {
6177 		pmap_remove_kernel_pde(pmap, pdq, sva);
6178 	} else {
6179 		mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(sva));
6180 		if (vm_page_any_valid(mpte)) {
6181 			mpte->valid = 0;
6182 			pmap_zero_page(mpte);
6183 		}
6184 	}
6185 	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
6186 }
6187 
6188 /*
6189  * pmap_remove_pte: remove a single 4KB page mapping from the given pmap
6190  */
6191 static int
6192 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
6193     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
6194 {
6195 	struct md_page *pvh;
6196 	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
6197 	vm_page_t m;
6198 
6199 	PG_A = pmap_accessed_bit(pmap);
6200 	PG_M = pmap_modified_bit(pmap);
6201 	PG_RW = pmap_rw_bit(pmap);
6202 
6203 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6204 	oldpte = pte_load_clear(ptq);
6205 	if (oldpte & PG_W)
6206 		pmap->pm_stats.wired_count -= 1;
6207 	pmap_resident_count_adj(pmap, -1);
6208 	if (oldpte & PG_MANAGED) {
6209 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
6210 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6211 			vm_page_dirty(m);
6212 		if (oldpte & PG_A)
6213 			vm_page_aflag_set(m, PGA_REFERENCED);
6214 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
6215 		pmap_pvh_free(&m->md, pmap, va);
6216 		if (TAILQ_EMPTY(&m->md.pv_list) &&
6217 		    (m->flags & PG_FICTITIOUS) == 0) {
6218 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6219 			if (TAILQ_EMPTY(&pvh->pv_list))
6220 				vm_page_aflag_clear(m, PGA_WRITEABLE);
6221 		}
6222 		pmap_delayed_invl_page(m);
6223 	}
6224 	return (pmap_unuse_pt(pmap, va, ptepde, free));
6225 }
6226 
6227 /*
6228  * Remove a single page from a process address space
6229  */
6230 static void
6231 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
6232     struct spglist *free)
6233 {
6234 	struct rwlock *lock;
6235 	pt_entry_t *pte, PG_V;
6236 
6237 	PG_V = pmap_valid_bit(pmap);
6238 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6239 	if ((*pde & PG_V) == 0)
6240 		return;
6241 	pte = pmap_pde_to_pte(pde, va);
6242 	if ((*pte & PG_V) == 0)
6243 		return;
6244 	lock = NULL;
6245 	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
6246 	if (lock != NULL)
6247 		rw_wunlock(lock);
6248 	pmap_invalidate_page(pmap, va);
6249 }
6250 
6251 /*
6252  * Removes the specified range of addresses from the page table page.
6253  */
6254 static bool
6255 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
6256     pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
6257 {
6258 	pt_entry_t PG_G, *pte;
6259 	vm_offset_t va;
6260 	bool anyvalid;
6261 
6262 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6263 	PG_G = pmap_global_bit(pmap);
6264 	anyvalid = false;
6265 	va = eva;
6266 	for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
6267 	    sva += PAGE_SIZE) {
6268 		if (*pte == 0) {
6269 			if (va != eva) {
6270 				pmap_invalidate_range(pmap, va, sva);
6271 				va = eva;
6272 			}
6273 			continue;
6274 		}
6275 		if ((*pte & PG_G) == 0)
6276 			anyvalid = true;
6277 		else if (va == eva)
6278 			va = sva;
6279 		if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
6280 			sva += PAGE_SIZE;
6281 			break;
6282 		}
6283 	}
6284 	if (va != eva)
6285 		pmap_invalidate_range(pmap, va, sva);
6286 	return (anyvalid);
6287 }
6288 
6289 static void
6290 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
6291 {
6292 	struct rwlock *lock;
6293 	vm_page_t mt;
6294 	vm_offset_t va_next;
6295 	pml5_entry_t *pml5e;
6296 	pml4_entry_t *pml4e;
6297 	pdp_entry_t *pdpe;
6298 	pd_entry_t ptpaddr, *pde;
6299 	pt_entry_t PG_G, PG_V;
6300 	struct spglist free;
6301 	int anyvalid;
6302 
6303 	PG_G = pmap_global_bit(pmap);
6304 	PG_V = pmap_valid_bit(pmap);
6305 
6306 	/*
6307 	 * If there are no resident pages besides the top level page
6308 	 * table page(s), there is nothing to do.  Kernel pmap always
6309 	 * accounts whole preloaded area as resident, which makes its
6310 	 * resident count > 2.
6311 	 * Perform an unsynchronized read.  This is, however, safe.
6312 	 */
6313 	if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ?
6314 	    1 : 0))
6315 		return;
6316 
6317 	anyvalid = 0;
6318 	SLIST_INIT(&free);
6319 
6320 	pmap_delayed_invl_start();
6321 	PMAP_LOCK(pmap);
6322 	if (map_delete)
6323 		pmap_pkru_on_remove(pmap, sva, eva);
6324 
6325 	/*
6326 	 * Special handling for removing a single page: this is a very
6327 	 * common operation, so it is worth short-circuiting the general
6328 	 * loop below.
6329 	 */
6330 	if (sva + PAGE_SIZE == eva) {
6331 		pde = pmap_pde(pmap, sva);
6332 		if (pde && (*pde & PG_PS) == 0) {
6333 			pmap_remove_page(pmap, sva, pde, &free);
6334 			goto out;
6335 		}
6336 	}
6337 
6338 	lock = NULL;
6339 	for (; sva < eva; sva = va_next) {
6340 		if (pmap->pm_stats.resident_count == 0)
6341 			break;
6342 
6343 		if (pmap_is_la57(pmap)) {
6344 			pml5e = pmap_pml5e(pmap, sva);
6345 			if ((*pml5e & PG_V) == 0) {
6346 				va_next = (sva + NBPML5) & ~PML5MASK;
6347 				if (va_next < sva)
6348 					va_next = eva;
6349 				continue;
6350 			}
6351 			pml4e = pmap_pml5e_to_pml4e(pml5e, sva);
6352 		} else {
6353 			pml4e = pmap_pml4e(pmap, sva);
6354 		}
6355 		if ((*pml4e & PG_V) == 0) {
6356 			va_next = (sva + NBPML4) & ~PML4MASK;
6357 			if (va_next < sva)
6358 				va_next = eva;
6359 			continue;
6360 		}
6361 
6362 		va_next = (sva + NBPDP) & ~PDPMASK;
6363 		if (va_next < sva)
6364 			va_next = eva;
6365 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6366 		if ((*pdpe & PG_V) == 0)
6367 			continue;
6368 		if ((*pdpe & PG_PS) != 0) {
6369 			KASSERT(va_next <= eva,
6370 			    ("partial update of non-transparent 1G mapping "
6371 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
6372 			    *pdpe, sva, eva, va_next));
6373 			MPASS(pmap != kernel_pmap); /* XXXKIB */
6374 			MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
6375 			anyvalid = 1;
6376 			*pdpe = 0;
6377 			pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE);
6378 			mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME);
6379 			pmap_unwire_ptp(pmap, sva, mt, &free);
6380 			continue;
6381 		}
6382 
6383 		/*
6384 		 * Calculate index for next page table.
6385 		 */
6386 		va_next = (sva + NBPDR) & ~PDRMASK;
6387 		if (va_next < sva)
6388 			va_next = eva;
6389 
6390 		pde = pmap_pdpe_to_pde(pdpe, sva);
6391 		ptpaddr = *pde;
6392 
6393 		/*
6394 		 * Weed out invalid mappings.
6395 		 */
6396 		if (ptpaddr == 0)
6397 			continue;
6398 
6399 		/*
6400 		 * Check for large page.
6401 		 */
6402 		if ((ptpaddr & PG_PS) != 0) {
6403 			/*
6404 			 * Are we removing the entire large page?  If not,
6405 			 * demote the mapping and fall through.
6406 			 */
6407 			if (sva + NBPDR == va_next && eva >= va_next) {
6408 				/*
6409 				 * The TLB entry for a PG_G mapping is
6410 				 * invalidated by pmap_remove_pde().
6411 				 */
6412 				if ((ptpaddr & PG_G) == 0)
6413 					anyvalid = 1;
6414 				pmap_remove_pde(pmap, pde, sva, true, &free,
6415 				    &lock);
6416 				continue;
6417 			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
6418 			    &lock)) {
6419 				/* The large page mapping was destroyed. */
6420 				continue;
6421 			} else
6422 				ptpaddr = *pde;
6423 		}
6424 
6425 		/*
6426 		 * Limit our scan to either the end of the va represented
6427 		 * by the current page table page, or to the end of the
6428 		 * range being removed.
6429 		 */
6430 		if (va_next > eva)
6431 			va_next = eva;
6432 
6433 		if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
6434 			anyvalid = 1;
6435 	}
6436 	if (lock != NULL)
6437 		rw_wunlock(lock);
6438 out:
6439 	if (anyvalid)
6440 		pmap_invalidate_all(pmap);
6441 	PMAP_UNLOCK(pmap);
6442 	pmap_delayed_invl_finish();
6443 	vm_page_free_pages_toq(&free, true);
6444 }
6445 
6446 /*
6447  *	Remove the given range of addresses from the specified map.
6448  *
6449  *	It is assumed that the start and end are properly
6450  *	rounded to the page size.
6451  */
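/*
 * For example, unmapping a single, page-aligned page is simply (sketch):
 *
 *	pmap_remove(pmap, va, va + PAGE_SIZE);
 *
 * pmap_remove1() short-circuits this common single-page case.
 */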
6452 void
6453 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6454 {
6455 	pmap_remove1(pmap, sva, eva, false);
6456 }
6457 
6458 /*
6459  *	Remove the given range of addresses as part of a logical unmap
6460  *	operation. This has the effect of calling pmap_remove(), but
6461  *	also clears any metadata that should persist for the lifetime
6462  *	of a logical mapping.
6463  */
6464 void
6465 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6466 {
6467 	pmap_remove1(pmap, sva, eva, true);
6468 }
6469 
6470 /*
6471  *	Routine:	pmap_remove_all
6472  *	Function:
6473  *		Removes this physical page from
6474  *		all physical maps in which it resides.
6475  *		Reflects back modify bits to the pager.
6476  *
6477  *	Notes:
6478  *		Original versions of this routine were very
6479  *		inefficient because they iteratively called
6480  *		pmap_remove (slow...)
6481  */
6482 
6483 void
6484 pmap_remove_all(vm_page_t m)
6485 {
6486 	struct md_page *pvh;
6487 	pv_entry_t pv;
6488 	pmap_t pmap;
6489 	struct rwlock *lock;
6490 	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
6491 	pd_entry_t *pde;
6492 	vm_offset_t va;
6493 	struct spglist free;
6494 	int pvh_gen, md_gen;
6495 
6496 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6497 	    ("pmap_remove_all: page %p is not managed", m));
6498 	SLIST_INIT(&free);
6499 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6500 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6501 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
6502 	rw_wlock(lock);
6503 retry:
6504 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
6505 		pmap = PV_PMAP(pv);
6506 		if (!PMAP_TRYLOCK(pmap)) {
6507 			pvh_gen = pvh->pv_gen;
6508 			rw_wunlock(lock);
6509 			PMAP_LOCK(pmap);
6510 			rw_wlock(lock);
6511 			if (pvh_gen != pvh->pv_gen) {
6512 				PMAP_UNLOCK(pmap);
6513 				goto retry;
6514 			}
6515 		}
6516 		va = pv->pv_va;
6517 		pde = pmap_pde(pmap, va);
6518 		(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
6519 		PMAP_UNLOCK(pmap);
6520 	}
6521 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
6522 		pmap = PV_PMAP(pv);
6523 		if (!PMAP_TRYLOCK(pmap)) {
6524 			pvh_gen = pvh->pv_gen;
6525 			md_gen = m->md.pv_gen;
6526 			rw_wunlock(lock);
6527 			PMAP_LOCK(pmap);
6528 			rw_wlock(lock);
6529 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6530 				PMAP_UNLOCK(pmap);
6531 				goto retry;
6532 			}
6533 		}
6534 		PG_A = pmap_accessed_bit(pmap);
6535 		PG_M = pmap_modified_bit(pmap);
6536 		PG_RW = pmap_rw_bit(pmap);
6537 		pmap_resident_count_adj(pmap, -1);
6538 		pde = pmap_pde(pmap, pv->pv_va);
6539 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
6540 		    " a 2mpage in page %p's pv list", m));
6541 		pte = pmap_pde_to_pte(pde, pv->pv_va);
6542 		tpte = pte_load_clear(pte);
6543 		if (tpte & PG_W)
6544 			pmap->pm_stats.wired_count--;
6545 		if (tpte & PG_A)
6546 			vm_page_aflag_set(m, PGA_REFERENCED);
6547 
6548 		/*
6549 		 * Update the vm_page_t clean and reference bits.
6550 		 */
6551 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6552 			vm_page_dirty(m);
6553 		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
6554 		pmap_invalidate_page(pmap, pv->pv_va);
6555 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6556 		m->md.pv_gen++;
6557 		free_pv_entry(pmap, pv);
6558 		PMAP_UNLOCK(pmap);
6559 	}
6560 	vm_page_aflag_clear(m, PGA_WRITEABLE);
6561 	rw_wunlock(lock);
6562 	pmap_delayed_invl_wait(m);
6563 	vm_page_free_pages_toq(&free, true);
6564 }
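
/*
 * Illustrative sketch, not part of the original file: the typical use of
 * pmap_remove_all() above is to strip every mapping of a managed page before
 * the VM recycles it, relying on the function to push any modified-bit state
 * back into the vm_page.  Guarded by #if 0 so it is never compiled.
 */
#if 0
static void
pmap_remove_all_example(vm_page_t m)
{
	/* Only managed pages carry the PV lists that the function walks. */
	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("example: unmanaged page %p", m));
	pmap_remove_all(m);
	/* No pmap maps "m" any longer and PGA_WRITEABLE has been cleared. */
}
#endif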
6565 
6566 /*
6567  * pmap_protect_pde: do the things to protect a 2mpage in a process
6568  */
6569 static bool
6570 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
6571 {
6572 	pd_entry_t newpde, oldpde;
6573 	vm_page_t m, mt;
6574 	bool anychanged;
6575 	pt_entry_t PG_G, PG_M, PG_RW;
6576 
6577 	PG_G = pmap_global_bit(pmap);
6578 	PG_M = pmap_modified_bit(pmap);
6579 	PG_RW = pmap_rw_bit(pmap);
6580 
6581 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6582 	KASSERT((sva & PDRMASK) == 0,
6583 	    ("pmap_protect_pde: sva is not 2mpage aligned"));
6584 	anychanged = false;
6585 retry:
6586 	oldpde = newpde = *pde;
6587 	if ((prot & VM_PROT_WRITE) == 0) {
6588 		if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
6589 		    (PG_MANAGED | PG_M | PG_RW)) {
6590 			m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
6591 			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
6592 				vm_page_dirty(mt);
6593 		}
6594 		newpde &= ~(PG_RW | PG_M);
6595 	}
6596 	if ((prot & VM_PROT_EXECUTE) == 0)
6597 		newpde |= pg_nx;
6598 	if (newpde != oldpde) {
6599 		/*
6600 		 * As an optimization to future operations on this PDE, clear
6601 		 * PG_PROMOTED.  The impending invalidation will remove any
6602 		 * lingering 4KB page mappings from the TLB.
6603 		 */
6604 		if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
6605 			goto retry;
6606 		if ((oldpde & PG_G) != 0)
6607 			pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
6608 		else
6609 			anychanged = true;
6610 	}
6611 	return (anychanged);
6612 }
6613 
6614 /*
6615  *	Set the physical protection on the
6616  *	specified range of this map as requested.
6617  */
6618 void
6619 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
6620 {
6621 	vm_page_t m;
6622 	vm_offset_t va_next;
6623 	pml4_entry_t *pml4e;
6624 	pdp_entry_t *pdpe;
6625 	pd_entry_t ptpaddr, *pde;
6626 	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
6627 	pt_entry_t obits, pbits;
6628 	bool anychanged;
6629 
6630 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
6631 	if (prot == VM_PROT_NONE) {
6632 		pmap_remove(pmap, sva, eva);
6633 		return;
6634 	}
6635 
6636 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
6637 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
6638 		return;
6639 
6640 	PG_G = pmap_global_bit(pmap);
6641 	PG_M = pmap_modified_bit(pmap);
6642 	PG_V = pmap_valid_bit(pmap);
6643 	PG_RW = pmap_rw_bit(pmap);
6644 	anychanged = false;
6645 
6646 	/*
6647 	 * Although this function delays and batches the invalidation
6648 	 * of stale TLB entries, it does not need to call
6649 	 * pmap_delayed_invl_start() and
6650 	 * pmap_delayed_invl_finish(), because it does not
6651 	 * ordinarily destroy mappings.  Stale TLB entries from
6652 	 * protection-only changes need only be invalidated before the
6653 	 * pmap lock is released, because protection-only changes do
6654 	 * not destroy PV entries.  Even operations that iterate over
6655 	 * a physical page's PV list of mappings, like
6656 	 * pmap_remove_write(), acquire the pmap lock for each
6657 	 * mapping.  Consequently, for protection-only changes, the
6658 	 * pmap lock suffices to synchronize both page table and TLB
6659 	 * updates.
6660 	 *
6661 	 * This function only destroys a mapping if pmap_demote_pde()
6662 	 * fails.  In that case, stale TLB entries are immediately
6663 	 * invalidated.
6664 	 */
6665 
6666 	PMAP_LOCK(pmap);
6667 	for (; sva < eva; sva = va_next) {
6668 		pml4e = pmap_pml4e(pmap, sva);
6669 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
6670 			va_next = (sva + NBPML4) & ~PML4MASK;
6671 			if (va_next < sva)
6672 				va_next = eva;
6673 			continue;
6674 		}
6675 
6676 		va_next = (sva + NBPDP) & ~PDPMASK;
6677 		if (va_next < sva)
6678 			va_next = eva;
6679 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6680 		if ((*pdpe & PG_V) == 0)
6681 			continue;
6682 		if ((*pdpe & PG_PS) != 0) {
6683 			KASSERT(va_next <= eva,
6684 			    ("partial update of non-transparent 1G mapping "
6685 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
6686 			    *pdpe, sva, eva, va_next));
6687 retry_pdpe:
6688 			obits = pbits = *pdpe;
6689 			MPASS((pbits & (PG_MANAGED | PG_G)) == 0);
6690 			MPASS(pmap != kernel_pmap); /* XXXKIB */
6691 			if ((prot & VM_PROT_WRITE) == 0)
6692 				pbits &= ~(PG_RW | PG_M);
6693 			if ((prot & VM_PROT_EXECUTE) == 0)
6694 				pbits |= pg_nx;
6695 
6696 			if (pbits != obits) {
6697 				if (!atomic_cmpset_long(pdpe, obits, pbits))
6698 					/* PG_PS cannot be cleared under us. */
6699 					goto retry_pdpe;
6700 				anychanged = true;
6701 			}
6702 			continue;
6703 		}
6704 
6705 		va_next = (sva + NBPDR) & ~PDRMASK;
6706 		if (va_next < sva)
6707 			va_next = eva;
6708 
6709 		pde = pmap_pdpe_to_pde(pdpe, sva);
6710 		ptpaddr = *pde;
6711 
6712 		/*
6713 		 * Weed out invalid mappings.
6714 		 */
6715 		if (ptpaddr == 0)
6716 			continue;
6717 
6718 		/*
6719 		 * Check for large page.
6720 		 */
6721 		if ((ptpaddr & PG_PS) != 0) {
6722 			/*
6723 			 * Are we protecting the entire large page?
6724 			 */
6725 			if (sva + NBPDR == va_next && eva >= va_next) {
6726 				/*
6727 				 * The TLB entry for a PG_G mapping is
6728 				 * invalidated by pmap_protect_pde().
6729 				 */
6730 				if (pmap_protect_pde(pmap, pde, sva, prot))
6731 					anychanged = true;
6732 				continue;
6733 			}
6734 
6735 			/*
6736 			 * Does the large page mapping need to change?  If so,
6737 			 * demote it and fall through.
6738 			 */
6739 			pbits = ptpaddr;
6740 			if ((prot & VM_PROT_WRITE) == 0)
6741 				pbits &= ~(PG_RW | PG_M);
6742 			if ((prot & VM_PROT_EXECUTE) == 0)
6743 				pbits |= pg_nx;
6744 			if (ptpaddr == pbits || !pmap_demote_pde(pmap, pde,
6745 			    sva)) {
6746 				/*
6747 				 * Either the large page mapping doesn't need
6748 				 * to change, or it was destroyed during
6749 				 * demotion.
6750 				 */
6751 				continue;
6752 			}
6753 		}
6754 
6755 		if (va_next > eva)
6756 			va_next = eva;
6757 
6758 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6759 		    sva += PAGE_SIZE) {
6760 retry:
6761 			obits = pbits = *pte;
6762 			if ((pbits & PG_V) == 0)
6763 				continue;
6764 
6765 			if ((prot & VM_PROT_WRITE) == 0) {
6766 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
6767 				    (PG_MANAGED | PG_M | PG_RW)) {
6768 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
6769 					vm_page_dirty(m);
6770 				}
6771 				pbits &= ~(PG_RW | PG_M);
6772 			}
6773 			if ((prot & VM_PROT_EXECUTE) == 0)
6774 				pbits |= pg_nx;
6775 
6776 			if (pbits != obits) {
6777 				if (!atomic_cmpset_long(pte, obits, pbits))
6778 					goto retry;
6779 				if (obits & PG_G)
6780 					pmap_invalidate_page(pmap, sva);
6781 				else
6782 					anychanged = true;
6783 			}
6784 		}
6785 	}
6786 	if (anychanged)
6787 		pmap_invalidate_all(pmap);
6788 	PMAP_UNLOCK(pmap);
6789 }
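
/*
 * Illustrative sketch, not part of the original file: using pmap_protect()
 * above to revoke write access over a page-aligned range while keeping read
 * and execute permissions.  Passing VM_PROT_NONE would instead fall through
 * to pmap_remove().  Guarded by #if 0.
 */
#if 0
static void
pmap_write_protect_example(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	/*
	 * Only the write (and modified) bits are cleared, so PV entries
	 * survive and the pmap lock alone synchronizes the TLB updates.
	 */
	pmap_protect(pmap, sva, eva, VM_PROT_READ | VM_PROT_EXECUTE);
}
#endif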
6790 
6791 static bool
6792 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde)
6793 {
6794 
6795 	if (pmap->pm_type != PT_EPT)
6796 		return (false);
6797 	return ((pde & EPT_PG_EXECUTE) != 0);
6798 }
6799 
6800 #if VM_NRESERVLEVEL > 0
6801 /*
6802  * Tries to promote the 512, contiguous 4KB page mappings that are within a
6803  * single page table page (PTP) to a single 2MB page mapping.  For promotion
6804  * to occur, two conditions must be met: (1) the 4KB page mappings must map
6805  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
6806  * identical characteristics.
6807  */
6808 static bool
6809 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte,
6810     struct rwlock **lockp)
6811 {
6812 	pd_entry_t newpde;
6813 	pt_entry_t *firstpte, oldpte, pa, *pte;
6814 	pt_entry_t allpte_PG_A, PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
6815 	int PG_PTE_CACHE;
6816 
6817 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6818 	if (!pmap_ps_enabled(pmap))
6819 		return (false);
6820 
6821 	PG_A = pmap_accessed_bit(pmap);
6822 	PG_G = pmap_global_bit(pmap);
6823 	PG_M = pmap_modified_bit(pmap);
6824 	PG_V = pmap_valid_bit(pmap);
6825 	PG_RW = pmap_rw_bit(pmap);
6826 	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
6827 	PG_PTE_CACHE = pmap_cache_mask(pmap, false);
6828 
6829 	/*
6830 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
6831 	 * ineligible for promotion due to hardware errata, invalid, or does
6832 	 * not map the first 4KB physical page within a 2MB page.
6833 	 */
6834 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
6835 	newpde = *firstpte;
6836 	if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde)))
6837 		return (false);
6838 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) {
6839 		counter_u64_add(pmap_pde_p_failures, 1);
6840 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
6841 		    " in pmap %p", va, pmap);
6842 		return (false);
6843 	}
6844 
6845 	/*
6846 	 * Both here and in the below "for" loop, to allow for repromotion
6847 	 * after MADV_FREE, conditionally write protect a clean PTE before
6848 	 * possibly aborting the promotion due to other PTE attributes.  Why?
6849 	 * Suppose that MADV_FREE is applied to a part of a superpage, the
6850 	 * address range [S, E).  pmap_advise() will demote the superpage
6851 	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
6852 	 * clear PG_M and PG_A in the PTEs for the rest of [S, E).  Later,
6853 	 * imagine that the memory in [S, E) is recycled, but the last 4KB
6854 	 * page in [S, E) is not the last to be rewritten, or simply accessed.
6855 	 * In other words, there is still a 4KB page in [S, E), call it P,
6856 	 * that is writeable but PG_M and PG_A are clear in P's PTE.  Unless
6857 	 * we write protect P before aborting the promotion, if and when P is
6858 	 * finally rewritten, there won't be a page fault to trigger
6859 	 * repromotion.
6860 	 */
6861 setpde:
6862 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
6863 		/*
6864 		 * When PG_M is already clear, PG_RW can be cleared without
6865 		 * a TLB invalidation.
6866 		 */
6867 		if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW))
6868 			goto setpde;
6869 		newpde &= ~PG_RW;
6870 		CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
6871 		    " in pmap %p", va & ~PDRMASK, pmap);
6872 	}
6873 
6874 	/*
6875 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
6876 	 * PTE maps an unexpected 4KB physical page or does not have identical
6877 	 * characteristics to the first PTE.
6878 	 */
6879 	allpte_PG_A = newpde & PG_A;
6880 	pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE;
6881 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
6882 		oldpte = *pte;
6883 		if ((oldpte & (PG_FRAME | PG_V)) != pa) {
6884 			counter_u64_add(pmap_pde_p_failures, 1);
6885 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
6886 			    " in pmap %p", va, pmap);
6887 			return (false);
6888 		}
6889 setpte:
6890 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
6891 			/*
6892 			 * When PG_M is already clear, PG_RW can be cleared
6893 			 * without a TLB invalidation.
6894 			 */
6895 			if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW))
6896 				goto setpte;
6897 			oldpte &= ~PG_RW;
6898 			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
6899 			    " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
6900 			    (va & ~PDRMASK), pmap);
6901 		}
6902 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
6903 			counter_u64_add(pmap_pde_p_failures, 1);
6904 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
6905 			    " in pmap %p", va, pmap);
6906 			return (false);
6907 		}
6908 		allpte_PG_A &= oldpte;
6909 		pa -= PAGE_SIZE;
6910 	}
6911 
6912 	/*
6913 	 * Unless all PTEs have PG_A set, clear it from the superpage mapping,
6914 	 * so that promotions triggered by speculative mappings, such as
6915 	 * pmap_enter_quick(), don't automatically mark the underlying pages
6916 	 * as referenced.
6917 	 */
6918 	newpde &= ~PG_A | allpte_PG_A;
6919 
6920 	/*
6921 	 * EPT PTEs with PG_M set and PG_A clear are not supported by early
6922 	 * MMUs supporting EPT.
6923 	 */
6924 	KASSERT((newpde & PG_A) != 0 || safe_to_clear_referenced(pmap, newpde),
6925 	    ("unsupported EPT PTE"));
6926 
6927 	/*
6928 	 * Save the PTP in its current state until the PDE mapping the
6929 	 * superpage is demoted by pmap_demote_pde() or destroyed by
6930 	 * pmap_remove_pde().  If PG_A is not set in every PTE, then request
6931 	 * that the PTP be refilled on demotion.
6932 	 */
6933 	if (mpte == NULL)
6934 		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
6935 	KASSERT(mpte >= vm_page_array &&
6936 	    mpte < &vm_page_array[vm_page_array_size],
6937 	    ("pmap_promote_pde: page table page is out of range"));
6938 	KASSERT(mpte->pindex == pmap_pde_pindex(va),
6939 	    ("pmap_promote_pde: page table page's pindex is wrong "
6940 	    "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
6941 	    mpte, mpte->pindex, va, pmap_pde_pindex(va)));
6942 	if (pmap_insert_pt_page(pmap, mpte, true, allpte_PG_A != 0)) {
6943 		counter_u64_add(pmap_pde_p_failures, 1);
6944 		CTR2(KTR_PMAP,
6945 		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
6946 		    pmap);
6947 		return (false);
6948 	}
6949 
6950 	/*
6951 	 * Promote the pv entries.
6952 	 */
6953 	if ((newpde & PG_MANAGED) != 0)
6954 		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
6955 
6956 	/*
6957 	 * Propagate the PAT index to its proper position.
6958 	 */
6959 	newpde = pmap_swap_pat(pmap, newpde);
6960 
6961 	/*
6962 	 * Map the superpage.
6963 	 */
6964 	if (workaround_erratum383)
6965 		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
6966 	else
6967 		pde_store(pde, PG_PROMOTED | PG_PS | newpde);
6968 
6969 	counter_u64_add(pmap_pde_promotions, 1);
6970 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
6971 	    " in pmap %p", va, pmap);
6972 	return (true);
6973 }
6974 #endif /* VM_NRESERVLEVEL > 0 */
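
/*
 * Illustrative sketch, not part of the original file: the eligibility test
 * applied to the first PTE in pmap_promote_pde() above, written in boolean
 * form.  The PTE must be valid and must map the first 4KB page of a 2MB
 * frame, i.e. the physical-frame bits that fall within PDRMASK must be zero.
 * "PG_V" is passed in to mirror the per-pmap valid bit used by this file;
 * the #if 0 guard keeps the helper out of the build.
 */
#if 0
static bool
pmap_promote_first_pte_ok(pt_entry_t pte, pt_entry_t PG_V)
{
	return ((pte & PG_V) != 0 && (pte & PG_FRAME & PDRMASK) == 0);
}
#endif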
6975 
6976 static int
6977 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
6978     int psind)
6979 {
6980 	vm_page_t mp;
6981 	pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V;
6982 
6983 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6984 	KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0,
6985 	    ("psind %d unexpected", psind));
6986 	KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0,
6987 	    ("unaligned phys address %#lx newpte %#lx psind %d",
6988 	    newpte & PG_FRAME, newpte, psind));
6989 	KASSERT((va & (pagesizes[psind] - 1)) == 0,
6990 	    ("unaligned va %#lx psind %d", va, psind));
6991 	KASSERT(va < VM_MAXUSER_ADDRESS,
6992 	    ("kernel mode non-transparent superpage")); /* XXXKIB */
6993 	KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS,
6994 	    ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */
6995 
6996 	PG_V = pmap_valid_bit(pmap);
6997 
6998 restart:
6999 	pten = newpte;
7000 	if (!pmap_pkru_same(pmap, va, va + pagesizes[psind], &pten))
7001 		return (KERN_PROTECTION_FAILURE);
7002 
7003 	if (psind == 2) {	/* 1G */
7004 		pml4e = pmap_pml4e(pmap, va);
7005 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
7006 			mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va),
7007 			    NULL, va);
7008 			if (mp == NULL)
7009 				goto allocf;
7010 			pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
7011 			pdpe = &pdpe[pmap_pdpe_index(va)];
7012 			origpte = *pdpe;
7013 			MPASS(origpte == 0);
7014 		} else {
7015 			pdpe = pmap_pml4e_to_pdpe(pml4e, va);
7016 			KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va));
7017 			origpte = *pdpe;
7018 			if ((origpte & PG_V) == 0) {
7019 				mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
7020 				mp->ref_count++;
7021 			}
7022 		}
7023 		*pdpe = pten;
7024 	} else /* (psind == 1) */ {	/* 2M */
7025 		pde = pmap_pde(pmap, va);
7026 		if (pde == NULL) {
7027 			mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va),
7028 			    NULL, va);
7029 			if (mp == NULL)
7030 				goto allocf;
7031 			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
7032 			pde = &pde[pmap_pde_index(va)];
7033 			origpte = *pde;
7034 			MPASS(origpte == 0);
7035 		} else {
7036 			origpte = *pde;
7037 			if ((origpte & PG_V) == 0) {
7038 				pdpe = pmap_pdpe(pmap, va);
7039 				MPASS(pdpe != NULL && (*pdpe & PG_V) != 0);
7040 				mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
7041 				mp->ref_count++;
7042 			}
7043 		}
7044 		*pde = pten;
7045 	}
7046 	KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 &&
7047 	    (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)),
7048 	    ("va %#lx changing %s phys page origpte %#lx pten %#lx",
7049 	    va, psind == 2 ? "1G" : "2M", origpte, pten));
7050 	if ((pten & PG_W) != 0 && (origpte & PG_W) == 0)
7051 		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
7052 	else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0)
7053 		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
7054 	if ((origpte & PG_V) == 0)
7055 		pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE);
7056 
7057 	return (KERN_SUCCESS);
7058 
7059 allocf:
7060 	if ((flags & PMAP_ENTER_NOSLEEP) != 0)
7061 		return (KERN_RESOURCE_SHORTAGE);
7062 	PMAP_UNLOCK(pmap);
7063 	vm_wait(NULL);
7064 	PMAP_LOCK(pmap);
7065 	goto restart;
7066 }
7067 
7068 /*
7069  *	Insert the given physical page (p) at
7070  *	the specified virtual address (v) in the
7071  *	target physical map with the protection requested.
7072  *
7073  *	If specified, the page will be wired down, meaning
7074  *	that the related pte can not be reclaimed.
7075  *
7076  *	NB:  This is the only routine which MAY NOT lazy-evaluate
7077  *	or lose information.  That is, this routine must actually
7078  *	insert this page into the given map NOW.
7079  *
7080  *	When destroying both a page table and PV entry, this function
7081  *	performs the TLB invalidation before releasing the PV list
7082  *	lock, so we do not need pmap_delayed_invl_page() calls here.
7083  */
7084 int
7085 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
7086     u_int flags, int8_t psind)
7087 {
7088 	struct rwlock *lock;
7089 	pd_entry_t *pde;
7090 	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
7091 	pt_entry_t newpte, origpte;
7092 	pv_entry_t pv;
7093 	vm_paddr_t opa, pa;
7094 	vm_page_t mpte, om;
7095 	int rv;
7096 	bool nosleep;
7097 
7098 	PG_A = pmap_accessed_bit(pmap);
7099 	PG_G = pmap_global_bit(pmap);
7100 	PG_M = pmap_modified_bit(pmap);
7101 	PG_V = pmap_valid_bit(pmap);
7102 	PG_RW = pmap_rw_bit(pmap);
7103 
7104 	va = trunc_page(va);
7105 	KASSERT(va <= kva_layout.km_high, ("pmap_enter: toobig"));
7106 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
7107 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
7108 	    va));
7109 	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va),
7110 	    ("pmap_enter: managed mapping within the clean submap"));
7111 	if ((m->oflags & VPO_UNMANAGED) == 0)
7112 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
7113 	KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
7114 	    ("pmap_enter: flags %u has reserved bits set", flags));
7115 	pa = VM_PAGE_TO_PHYS(m);
7116 	newpte = (pt_entry_t)(pa | PG_A | PG_V);
7117 	if ((flags & VM_PROT_WRITE) != 0)
7118 		newpte |= PG_M;
7119 	if ((prot & VM_PROT_WRITE) != 0)
7120 		newpte |= PG_RW;
7121 	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
7122 	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
7123 	if ((prot & VM_PROT_EXECUTE) == 0)
7124 		newpte |= pg_nx;
7125 	if ((flags & PMAP_ENTER_WIRED) != 0)
7126 		newpte |= PG_W;
7127 	if (va < VM_MAXUSER_ADDRESS)
7128 		newpte |= PG_U;
7129 	if (pmap == kernel_pmap)
7130 		newpte |= PG_G;
7131 	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
7132 
7133 	/*
7134 	 * Set modified bit gratuitously for writeable mappings if
7135 	 * the page is unmanaged. We do not want to take a fault
7136 	 * to do the dirty bit accounting for these mappings.
7137 	 */
7138 	if ((m->oflags & VPO_UNMANAGED) != 0) {
7139 		if ((newpte & PG_RW) != 0)
7140 			newpte |= PG_M;
7141 	} else
7142 		newpte |= PG_MANAGED;
7143 
7144 	lock = NULL;
7145 	PMAP_LOCK(pmap);
7146 	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
7147 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
7148 		    ("managed largepage va %#lx flags %#x", va, flags));
7149 		rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags,
7150 		    psind);
7151 		goto out;
7152 	}
7153 	if (psind == 1) {
7154 		/* Assert the required virtual and physical alignment. */
7155 		KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
7156 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
7157 		rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
7158 		goto out;
7159 	}
7160 	mpte = NULL;
7161 
7162 	/*
7163 	 * In the case that a page table page is not
7164 	 * resident, we are creating it here.
7165 	 */
7166 retry:
7167 	pde = pmap_pde(pmap, va);
7168 	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
7169 	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
7170 		pte = pmap_pde_to_pte(pde, va);
7171 		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
7172 			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7173 			mpte->ref_count++;
7174 		}
7175 	} else if (va < VM_MAXUSER_ADDRESS) {
7176 		/*
7177 		 * Here if the pte page isn't mapped, or if it has been
7178 		 * deallocated.
7179 		 */
7180 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
7181 		mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va),
7182 		    nosleep ? NULL : &lock, va);
7183 		if (mpte == NULL && nosleep) {
7184 			rv = KERN_RESOURCE_SHORTAGE;
7185 			goto out;
7186 		}
7187 		goto retry;
7188 	} else
7189 		panic("pmap_enter: invalid page directory va=%#lx", va);
7190 
7191 	origpte = *pte;
7192 	pv = NULL;
7193 	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
7194 		newpte |= pmap_pkru_get(pmap, va);
7195 
7196 	/*
7197 	 * Is the specified virtual address already mapped?
7198 	 */
7199 	if ((origpte & PG_V) != 0) {
7200 		/*
7201 		 * Wiring change, just update stats. We don't worry about
7202 		 * wiring PT pages as they remain resident as long as there
7203 		 * are valid mappings in them. Hence, if a user page is wired,
7204 		 * the PT page will be also.
7205 		 */
7206 		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
7207 			pmap->pm_stats.wired_count++;
7208 		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
7209 			pmap->pm_stats.wired_count--;
7210 
7211 		/*
7212 		 * Remove the extra PT page reference.
7213 		 */
7214 		if (mpte != NULL) {
7215 			mpte->ref_count--;
7216 			KASSERT(mpte->ref_count > 0,
7217 			    ("pmap_enter: missing reference to page table page,"
7218 			     " va: 0x%lx", va));
7219 		}
7220 
7221 		/*
7222 		 * Has the physical page changed?
7223 		 */
7224 		opa = origpte & PG_FRAME;
7225 		if (opa == pa) {
7226 			/*
7227 			 * No, might be a protection or wiring change.
7228 			 */
7229 			if ((origpte & PG_MANAGED) != 0 &&
7230 			    (newpte & PG_RW) != 0)
7231 				vm_page_aflag_set(m, PGA_WRITEABLE);
7232 			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
7233 				goto unchanged;
7234 			goto validate;
7235 		}
7236 
7237 		/*
7238 		 * The physical page has changed.  Temporarily invalidate
7239 		 * the mapping.  This ensures that all threads sharing the
7240 		 * pmap keep a consistent view of the mapping, which is
7241 		 * necessary for the correct handling of COW faults.  It
7242 		 * also permits reuse of the old mapping's PV entry,
7243 		 * avoiding an allocation.
7244 		 *
7245 		 * For consistency, handle unmanaged mappings the same way.
7246 		 */
7247 		origpte = pte_load_clear(pte);
7248 		KASSERT((origpte & PG_FRAME) == opa,
7249 		    ("pmap_enter: unexpected pa update for %#lx", va));
7250 		if ((origpte & PG_MANAGED) != 0) {
7251 			om = PHYS_TO_VM_PAGE(opa);
7252 
7253 			/*
7254 			 * The pmap lock is sufficient to synchronize with
7255 			 * concurrent calls to pmap_page_test_mappings() and
7256 			 * pmap_ts_referenced().
7257 			 */
7258 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
7259 				vm_page_dirty(om);
7260 			if ((origpte & PG_A) != 0) {
7261 				pmap_invalidate_page(pmap, va);
7262 				vm_page_aflag_set(om, PGA_REFERENCED);
7263 			}
7264 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
7265 			pv = pmap_pvh_remove(&om->md, pmap, va);
7266 			KASSERT(pv != NULL,
7267 			    ("pmap_enter: no PV entry for %#lx", va));
7268 			if ((newpte & PG_MANAGED) == 0)
7269 				free_pv_entry(pmap, pv);
7270 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
7271 			    TAILQ_EMPTY(&om->md.pv_list) &&
7272 			    ((om->flags & PG_FICTITIOUS) != 0 ||
7273 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
7274 				vm_page_aflag_clear(om, PGA_WRITEABLE);
7275 		} else {
7276 			/*
7277 			 * Since this mapping is unmanaged, assume that PG_A
7278 			 * is set.
7279 			 */
7280 			pmap_invalidate_page(pmap, va);
7281 		}
7282 		origpte = 0;
7283 	} else {
7284 		/*
7285 		 * Increment the counters.
7286 		 */
7287 		if ((newpte & PG_W) != 0)
7288 			pmap->pm_stats.wired_count++;
7289 		pmap_resident_count_adj(pmap, 1);
7290 	}
7291 
7292 	/*
7293 	 * Enter on the PV list if part of our managed memory.
7294 	 */
7295 	if ((newpte & PG_MANAGED) != 0) {
7296 		if (pv == NULL) {
7297 			pv = get_pv_entry(pmap, &lock);
7298 			pv->pv_va = va;
7299 		}
7300 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
7301 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7302 		m->md.pv_gen++;
7303 		if ((newpte & PG_RW) != 0)
7304 			vm_page_aflag_set(m, PGA_WRITEABLE);
7305 	}
7306 
7307 	/*
7308 	 * Update the PTE.
7309 	 */
7310 	if ((origpte & PG_V) != 0) {
7311 validate:
7312 		origpte = pte_load_store(pte, newpte);
7313 		KASSERT((origpte & PG_FRAME) == pa,
7314 		    ("pmap_enter: unexpected pa update for %#lx", va));
7315 		if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
7316 		    (PG_M | PG_RW)) {
7317 			if ((origpte & PG_MANAGED) != 0)
7318 				vm_page_dirty(m);
7319 
7320 			/*
7321 			 * Although the PTE may still have PG_RW set, TLB
7322 			 * invalidation may nonetheless be required because
7323 			 * the PTE no longer has PG_M set.
7324 			 */
7325 		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
7326 			/*
7327 			 * This PTE change does not require TLB invalidation.
7328 			 */
7329 			goto unchanged;
7330 		}
7331 		if ((origpte & PG_A) != 0)
7332 			pmap_invalidate_page(pmap, va);
7333 	} else
7334 		pte_store(pte, newpte);
7335 
7336 unchanged:
7337 
7338 #if VM_NRESERVLEVEL > 0
7339 	/*
7340 	 * If both the page table page and the reservation are fully
7341 	 * populated, then attempt promotion.
7342 	 */
7343 	if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
7344 	    (m->flags & PG_FICTITIOUS) == 0 &&
7345 	    vm_reserv_level_iffullpop(m) == 0)
7346 		(void)pmap_promote_pde(pmap, pde, va, mpte, &lock);
7347 #endif
7348 
7349 	rv = KERN_SUCCESS;
7350 out:
7351 	if (lock != NULL)
7352 		rw_wunlock(lock);
7353 	PMAP_UNLOCK(pmap);
7354 	return (rv);
7355 }
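
/*
 * Illustrative sketch, not part of the original file: a minimal caller of
 * pmap_enter() above that wires a single 4KB page.  The flags argument
 * combines the access type with PMAP_ENTER_WIRED and PMAP_ENTER_NOSLEEP;
 * the retry-on-shortage policy shown here is hypothetical, and a managed
 * page is assumed to be exclusively busied by the caller.  Guarded by #if 0.
 */
#if 0
static int
pmap_enter_wired_example(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	int rv;

	do {
		rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
		    VM_PROT_READ | VM_PROT_WRITE | PMAP_ENTER_WIRED |
		    PMAP_ENTER_NOSLEEP, 0);
		if (rv == KERN_RESOURCE_SHORTAGE)
			vm_wait(NULL);	/* wait for a page table page */
	} while (rv == KERN_RESOURCE_SHORTAGE);
	return (rv);
}
#endif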
7356 
7357 /*
7358  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns
7359  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
7360  * value.  See pmap_enter_pde() for the possible error values when "no sleep",
7361  * "no replace", and "no reclaim" are specified.
7362  */
7363 static int
7364 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
7365     struct rwlock **lockp)
7366 {
7367 	pd_entry_t newpde;
7368 	pt_entry_t PG_V;
7369 
7370 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7371 	PG_V = pmap_valid_bit(pmap);
7372 	newpde = VM_PAGE_TO_PHYS(m) |
7373 	    pmap_cache_bits(pmap, m->md.pat_mode, true) | PG_PS | PG_V;
7374 	if ((m->oflags & VPO_UNMANAGED) == 0)
7375 		newpde |= PG_MANAGED;
7376 	if ((prot & VM_PROT_EXECUTE) == 0)
7377 		newpde |= pg_nx;
7378 	if (va < VM_MAXUSER_ADDRESS)
7379 		newpde |= PG_U;
7380 	return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
7381 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp));
7382 }
7383 
7384 /*
7385  * Returns true if every page table entry in the specified page table page is
7386  * zero.
7387  */
7388 static bool
7389 pmap_every_pte_zero(vm_paddr_t pa)
7390 {
7391 	pt_entry_t *pt_end, *pte;
7392 
7393 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
7394 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
7395 	for (pt_end = pte + NPTEPG; pte < pt_end; pte++) {
7396 		if (*pte != 0)
7397 			return (false);
7398 	}
7399 	return (true);
7400 }
7401 
7402 /*
7403  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
7404  * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE,
7405  * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise.  Returns
7406  * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB
7407  * page mapping already exists within the 2MB virtual address range starting
7408  * at the specified virtual address or (2) the requested 2MB page mapping is
7409  * not supported due to hardware errata.  Returns KERN_NO_SPACE if
7410  * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at
7411  * the specified virtual address.  Returns KERN_PROTECTION_FAILURE if the PKRU
7412  * settings are not the same across the 2MB virtual address range starting at
7413  * the specified virtual address.  Returns KERN_RESOURCE_SHORTAGE if either
7414  * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation
7415  * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation
7416  * failed.
7417  *
7418  * The parameter "m" is only used when creating a managed, writeable mapping.
7419  */
7420 static int
7421 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
7422     vm_page_t m, struct rwlock **lockp)
7423 {
7424 	struct spglist free;
7425 	pd_entry_t oldpde, *pde;
7426 	pt_entry_t PG_G, PG_RW, PG_V;
7427 	vm_page_t mt, pdpg;
7428 	vm_page_t uwptpg;
7429 
7430 	PG_G = pmap_global_bit(pmap);
7431 	PG_RW = pmap_rw_bit(pmap);
7432 	KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
7433 	    ("pmap_enter_pde: newpde is missing PG_M"));
7434 	KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
7435 	    PMAP_ENTER_NORECLAIM,
7436 	    ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE"));
7437 	PG_V = pmap_valid_bit(pmap);
7438 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7439 
7440 	if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
7441 	    newpde))) {
7442 		CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx"
7443 		    " in pmap %p", va, pmap);
7444 		return (KERN_FAILURE);
7445 	}
7446 	if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags &
7447 	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
7448 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
7449 		    " in pmap %p", va, pmap);
7450 		return (KERN_RESOURCE_SHORTAGE);
7451 	}
7452 
7453 	/*
7454 	 * If pkru is not same for the whole pde range, return failure
7455 	 * and let vm_fault() cope.  Check after pde allocation, since
7456 	 * it could sleep.
7457 	 */
7458 	if (!pmap_pkru_same(pmap, va, va + NBPDR, &newpde)) {
7459 		pmap_abort_ptp(pmap, va, pdpg);
7460 		return (KERN_PROTECTION_FAILURE);
7461 	}
7462 
7463 	/*
7464 	 * If there are existing mappings, either abort or remove them.
7465 	 */
7466 	oldpde = *pde;
7467 	if ((oldpde & PG_V) != 0) {
7468 		KASSERT(pdpg == NULL || pdpg->ref_count > 1,
7469 		    ("pmap_enter_pde: pdpg's reference count is too low"));
7470 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
7471 			if ((oldpde & PG_PS) != 0) {
7472 				if (pdpg != NULL)
7473 					pdpg->ref_count--;
7474 				CTR2(KTR_PMAP,
7475 				    "pmap_enter_pde: no space for va %#lx"
7476 				    " in pmap %p", va, pmap);
7477 				return (KERN_NO_SPACE);
7478 			} else if (va < VM_MAXUSER_ADDRESS ||
7479 			    !pmap_every_pte_zero(oldpde & PG_FRAME)) {
7480 				if (pdpg != NULL)
7481 					pdpg->ref_count--;
7482 				CTR2(KTR_PMAP,
7483 				    "pmap_enter_pde: failure for va %#lx"
7484 				    " in pmap %p", va, pmap);
7485 				return (KERN_FAILURE);
7486 			}
7487 		}
7488 		/* Break the existing mapping(s). */
7489 		SLIST_INIT(&free);
7490 		if ((oldpde & PG_PS) != 0) {
7491 			/*
7492 			 * The reference to the PD page that was acquired by
7493 			 * pmap_alloc_pde() ensures that it won't be freed.
7494 			 * However, if the PDE resulted from a promotion, and
7495 			 * the mapping is not from kernel_pmap, then
7496 			 * a reserved PT page could be freed.
7497 			 */
7498 			(void)pmap_remove_pde(pmap, pde, va, false, &free,
7499 			    lockp);
7500 			if ((oldpde & PG_G) == 0)
7501 				pmap_invalidate_pde_page(pmap, va, oldpde);
7502 		} else {
7503 			if (va >= VM_MAXUSER_ADDRESS) {
7504 				/*
7505 				 * Try to save the ptp in the trie
7506 				 * before any changes to mappings are
7507 				 * made.  Abort on failure.
7508 				 */
7509 				mt = PHYS_TO_VM_PAGE(oldpde & PG_FRAME);
7510 				if (pmap_insert_pt_page(pmap, mt, false,
7511 				    false)) {
7512 					CTR1(KTR_PMAP,
7513 			    "pmap_enter_pde: cannot ins kern ptp va %#lx",
7514 					    va);
7515 					return (KERN_RESOURCE_SHORTAGE);
7516 				}
7517 				/*
7518 				 * Both pmap_remove_pde() and
7519 				 * pmap_remove_ptes() will zero-fill
7520 				 * the kernel page table page.
7521 				 */
7522 			}
7523 			pmap_delayed_invl_start();
7524 			if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
7525 			    lockp))
7526 		               pmap_invalidate_all(pmap);
7527 				pmap_invalidate_all(pmap);
7528 		}
7529 		if (va < VM_MAXUSER_ADDRESS) {
7530 			vm_page_free_pages_toq(&free, true);
7531 			KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
7532 			    pde));
7533 		} else {
7534 			KASSERT(SLIST_EMPTY(&free),
7535 			    ("pmap_enter_pde: freed kernel page table page"));
7536 		}
7537 	}
7538 
7539 	/*
7540 	 * Allocate leaf ptpage for wired userspace pages.
7541 	 */
7542 	uwptpg = NULL;
7543 	if ((newpde & PG_W) != 0 && pmap != kernel_pmap) {
7544 		uwptpg = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
7545 		    VM_ALLOC_WIRED);
7546 		if (uwptpg == NULL) {
7547 			pmap_abort_ptp(pmap, va, pdpg);
7548 			return (KERN_RESOURCE_SHORTAGE);
7549 		}
7550 		if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
7551 			pmap_free_pt_page(pmap, uwptpg, false);
7552 			pmap_abort_ptp(pmap, va, pdpg);
7553 			return (KERN_RESOURCE_SHORTAGE);
7554 		}
7555 
7556 		uwptpg->ref_count = NPTEPG;
7557 	}
7558 	if ((newpde & PG_MANAGED) != 0) {
7559 		/*
7560 		 * Abort this mapping if its PV entry could not be created.
7561 		 */
7562 		if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
7563 			if (pdpg != NULL)
7564 				pmap_abort_ptp(pmap, va, pdpg);
7565 			else {
7566 				KASSERT(va >= VM_MAXUSER_ADDRESS &&
7567 				    (*pde & (PG_PS | PG_V)) == PG_V,
7568 				    ("pmap_enter_pde: invalid kernel PDE"));
7569 				mt = pmap_remove_pt_page(pmap, va);
7570 				KASSERT(mt != NULL,
7571 				    ("pmap_enter_pde: missing kernel PTP"));
7572 			}
7573 			if (uwptpg != NULL) {
7574 				mt = pmap_remove_pt_page(pmap, va);
7575 				KASSERT(mt == uwptpg,
7576 				    ("removed pt page %p, expected %p", mt,
7577 				    uwptpg));
7578 				uwptpg->ref_count = 1;
7579 				pmap_free_pt_page(pmap, uwptpg, false);
7580 			}
7581 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
7582 			    " in pmap %p", va, pmap);
7583 			return (KERN_RESOURCE_SHORTAGE);
7584 		}
7585 		if ((newpde & PG_RW) != 0) {
7586 			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
7587 				vm_page_aflag_set(mt, PGA_WRITEABLE);
7588 		}
7589 	}
7590 
7591 	/*
7592 	 * Increment counters.
7593 	 */
7594 	if ((newpde & PG_W) != 0)
7595 		pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
7596 	pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE);
7597 
7598 	/*
7599 	 * Map the superpage.  (This is not a promoted mapping; there will not
7600 	 * be any lingering 4KB page mappings in the TLB.)
7601 	 */
7602 	pde_store(pde, newpde);
7603 
7604 	counter_u64_add(pmap_pde_mappings, 1);
7605 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p",
7606 	    va, pmap);
7607 	return (KERN_SUCCESS);
7608 }
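
/*
 * Illustrative sketch, not part of the original file: how a caller such as
 * pmap_enter_object() interprets the return values documented above for
 * pmap_enter_2mpage()/pmap_enter_pde().  Only KERN_SUCCESS and KERN_NO_SPACE
 * mean that the 2MB region needs no further per-4KB work; every other value
 * falls back to 4KB mappings.  Guarded by #if 0.
 */
#if 0
static bool
pmap_enter_2mpage_handled(int rv)
{
	return (rv == KERN_SUCCESS || rv == KERN_NO_SPACE);
}
#endif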
7609 
7610 /*
7611  * Maps a sequence of resident pages belonging to the same object.
7612  * The sequence begins with the given page m_start.  This page is
7613  * mapped at the given virtual address start.  Each subsequent page is
7614  * mapped at a virtual address that is offset from start by the same
7615  * amount as the page is offset from m_start within the object.  The
7616  * last page in the sequence is the page with the largest offset from
7617  * m_start that can be mapped at a virtual address less than the given
7618  * virtual address end.  Not every virtual page between start and end
7619  * is mapped; only those for which a resident page exists with the
7620  * corresponding offset from m_start are mapped.
7621  */
7622 void
7623 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
7624     vm_page_t m_start, vm_prot_t prot)
7625 {
7626 	struct pctrie_iter pages;
7627 	struct rwlock *lock;
7628 	vm_offset_t va;
7629 	vm_page_t m, mpte;
7630 	int rv;
7631 
7632 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
7633 
7634 	mpte = NULL;
7635 	vm_page_iter_limit_init(&pages, m_start->object,
7636 	    m_start->pindex + atop(end - start));
7637 	m = vm_radix_iter_lookup(&pages, m_start->pindex);
7638 	lock = NULL;
7639 	PMAP_LOCK(pmap);
7640 	while (m != NULL) {
7641 		va = start + ptoa(m->pindex - m_start->pindex);
7642 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
7643 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
7644 		    ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
7645 		    KERN_SUCCESS || rv == KERN_NO_SPACE))
7646 			m = vm_radix_iter_jump(&pages, NBPDR / PAGE_SIZE);
7647 		else {
7648 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
7649 			    mpte, &lock);
7650 			m = vm_radix_iter_step(&pages);
7651 		}
7652 	}
7653 	if (lock != NULL)
7654 		rw_wunlock(lock);
7655 	PMAP_UNLOCK(pmap);
7656 }
7657 
7658 /*
7659  * this code makes some *MAJOR* assumptions:
7660  * 1. Current pmap & pmap exists.
7661  * 2. Not wired.
7662  * 3. Read access.
7663  * 4. No page table pages.
7664  * but is *MUCH* faster than pmap_enter...
7665  */
7666 
7667 void
7668 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
7669 {
7670 	struct rwlock *lock;
7671 
7672 	lock = NULL;
7673 	PMAP_LOCK(pmap);
7674 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
7675 	if (lock != NULL)
7676 		rw_wunlock(lock);
7677 	PMAP_UNLOCK(pmap);
7678 }
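
/*
 * Illustrative sketch, not part of the original file: pmap_enter_quick()
 * above as a prefault helper.  Per the assumptions listed before it, the
 * mapping is never wired, never writeable, and the function silently gives
 * up when resources are unavailable, so the caller ignores failure.
 * Guarded by #if 0.
 */
#if 0
static void
pmap_prefault_example(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	/* Best-effort, read-only, speculative mapping. */
	pmap_enter_quick(pmap, va, m, VM_PROT_READ);
}
#endif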
7679 
7680 static vm_page_t
7681 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
7682     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
7683 {
7684 	pd_entry_t *pde;
7685 	pt_entry_t newpte, *pte, PG_V;
7686 
7687 	KASSERT(!VA_IS_CLEANMAP(va) ||
7688 	    (m->oflags & VPO_UNMANAGED) != 0,
7689 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
7690 	PG_V = pmap_valid_bit(pmap);
7691 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7692 	pde = NULL;
7693 
7694 	/*
7695 	 * In the case that a page table page is not
7696 	 * resident, we are creating it here.
7697 	 */
7698 	if (va < VM_MAXUSER_ADDRESS) {
7699 		pdp_entry_t *pdpe;
7700 		vm_pindex_t ptepindex;
7701 
7702 		/*
7703 		 * Calculate pagetable page index
7704 		 */
7705 		ptepindex = pmap_pde_pindex(va);
7706 		if (mpte && (mpte->pindex == ptepindex)) {
7707 			mpte->ref_count++;
7708 		} else {
7709 			/*
7710 			 * If the page table page is mapped, we just increment
7711 			 * the hold count, and activate it.  Otherwise, we
7712 			 * attempt to allocate a page table page, passing NULL
7713 			 * instead of the PV list lock pointer because we don't
7714 			 * intend to sleep.  If this attempt fails, we don't
7715 			 * retry.  Instead, we give up.
7716 			 */
7717 			pdpe = pmap_pdpe(pmap, va);
7718 			if (pdpe != NULL && (*pdpe & PG_V) != 0) {
7719 				if ((*pdpe & PG_PS) != 0)
7720 					return (NULL);
7721 				pde = pmap_pdpe_to_pde(pdpe, va);
7722 				if ((*pde & PG_V) != 0) {
7723 					if ((*pde & PG_PS) != 0)
7724 						return (NULL);
7725 					mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7726 					mpte->ref_count++;
7727 				} else {
7728 					mpte = pmap_allocpte_alloc(pmap,
7729 					    ptepindex, NULL, va);
7730 					if (mpte == NULL)
7731 						return (NULL);
7732 				}
7733 			} else {
7734 				mpte = pmap_allocpte_alloc(pmap, ptepindex,
7735 				    NULL, va);
7736 				if (mpte == NULL)
7737 					return (NULL);
7738 			}
7739 		}
7740 		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
7741 		pte = &pte[pmap_pte_index(va)];
7742 	} else {
7743 		mpte = NULL;
7744 		pte = vtopte(va);
7745 	}
7746 	if (*pte) {
7747 		if (mpte != NULL)
7748 			mpte->ref_count--;
7749 		return (NULL);
7750 	}
7751 
7752 	/*
7753 	 * Enter on the PV list if part of our managed memory.
7754 	 */
7755 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
7756 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
7757 		if (mpte != NULL)
7758 			pmap_abort_ptp(pmap, va, mpte);
7759 		return (NULL);
7760 	}
7761 
7762 	/*
7763 	 * Increment counters
7764 	 */
7765 	pmap_resident_count_adj(pmap, 1);
7766 
7767 	newpte = VM_PAGE_TO_PHYS(m) | PG_V |
7768 	    pmap_cache_bits(pmap, m->md.pat_mode, false);
7769 	if ((m->oflags & VPO_UNMANAGED) == 0)
7770 		newpte |= PG_MANAGED;
7771 	if ((prot & VM_PROT_EXECUTE) == 0)
7772 		newpte |= pg_nx;
7773 	if (va < VM_MAXUSER_ADDRESS)
7774 		newpte |= PG_U | pmap_pkru_get(pmap, va);
7775 	pte_store(pte, newpte);
7776 
7777 #if VM_NRESERVLEVEL > 0
7778 	/*
7779 	 * If both the PTP and the reservation are fully populated, then
7780 	 * attempt promotion.
7781 	 */
7782 	if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
7783 	    (mpte == NULL || mpte->ref_count == NPTEPG) &&
7784 	    (m->flags & PG_FICTITIOUS) == 0 &&
7785 	    vm_reserv_level_iffullpop(m) == 0) {
7786 		if (pde == NULL)
7787 			pde = pmap_pde(pmap, va);
7788 
7789 		/*
7790 		 * If promotion succeeds, then the next call to this function
7791 		 * should not be given the unmapped PTP as a hint.
7792 		 */
7793 		if (pmap_promote_pde(pmap, pde, va, mpte, lockp))
7794 			mpte = NULL;
7795 	}
7796 #endif
7797 
7798 	return (mpte);
7799 }
7800 
7801 /*
7802  * Make a temporary mapping for a physical address.  This is only intended
7803  * to be used for panic dumps.
7804  */
7805 void *
7806 pmap_kenter_temporary(vm_paddr_t pa, int i)
7807 {
7808 	vm_offset_t va;
7809 
7810 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
7811 	pmap_kenter(va, pa);
7812 	pmap_invlpg(kernel_pmap, va);
7813 	return ((void *)crashdumpmap);
7814 }
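
/*
 * Illustrative sketch, not part of the original file: copying one physical
 * page out through the crashdumpmap window provided by
 * pmap_kenter_temporary() above.  Slot 0 is used so the returned pointer is
 * exactly the new mapping; "buf" is a hypothetical destination.  Guarded by
 * #if 0.
 */
#if 0
static void
pmap_kenter_temporary_example(vm_paddr_t pa, void *buf)
{
	void *va;

	va = pmap_kenter_temporary(pa, 0);	/* map "pa" at slot 0 */
	memcpy(buf, va, PAGE_SIZE);		/* copy the page out */
}
#endif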
7815 
7816 /*
7817  * This code maps large physical mmap regions into the
7818  * processor address space.  Note that some shortcuts
7819  * are taken, but the code works.
7820  */
7821 void
7822 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
7823     vm_pindex_t pindex, vm_size_t size)
7824 {
7825 	struct pctrie_iter pages;
7826 	pd_entry_t *pde;
7827 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
7828 	vm_paddr_t pa, ptepa;
7829 	vm_page_t p, pdpg;
7830 	int pat_mode;
7831 
7832 	PG_A = pmap_accessed_bit(pmap);
7833 	PG_M = pmap_modified_bit(pmap);
7834 	PG_V = pmap_valid_bit(pmap);
7835 	PG_RW = pmap_rw_bit(pmap);
7836 
7837 	VM_OBJECT_ASSERT_WLOCKED(object);
7838 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
7839 	    ("pmap_object_init_pt: non-device object"));
7840 	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
7841 		if (!pmap_ps_enabled(pmap))
7842 			return;
7843 		if (!vm_object_populate(object, pindex, pindex + atop(size)))
7844 			return;
7845 		vm_page_iter_init(&pages, object);
7846 		p = vm_radix_iter_lookup(&pages, pindex);
7847 		KASSERT(vm_page_all_valid(p),
7848 		    ("pmap_object_init_pt: invalid page %p", p));
7849 		pat_mode = p->md.pat_mode;
7850 
7851 		/*
7852 		 * Abort the mapping if the first page is not physically
7853 		 * aligned to a 2MB page boundary.
7854 		 */
7855 		ptepa = VM_PAGE_TO_PHYS(p);
7856 		if (ptepa & (NBPDR - 1))
7857 			return;
7858 
7859 		/*
7860 		 * Skip the first page.  Abort the mapping if the rest of
7861 		 * the pages are not physically contiguous or have differing
7862 		 * memory attributes.
7863 		 */
7864 		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
7865 		    pa += PAGE_SIZE) {
7866 			p = vm_radix_iter_next(&pages);
7867 			KASSERT(vm_page_all_valid(p),
7868 			    ("pmap_object_init_pt: invalid page %p", p));
7869 			if (pa != VM_PAGE_TO_PHYS(p) ||
7870 			    pat_mode != p->md.pat_mode)
7871 				return;
7872 		}
7873 
7874 		/*
7875 		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
7876 		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
7877 		 * will not affect the termination of this loop.
7878 		 */
7879 		PMAP_LOCK(pmap);
7880 		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, true);
7881 		    pa < ptepa + size; pa += NBPDR) {
7882 			pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL);
7883 			if (pde == NULL) {
7884 				/*
7885 				 * The creation of mappings below is only an
7886 				 * optimization.  If a page directory page
7887 				 * cannot be allocated without blocking,
7888 				 * continue on to the next mapping rather than
7889 				 * blocking.
7890 				 */
7891 				addr += NBPDR;
7892 				continue;
7893 			}
7894 			if ((*pde & PG_V) == 0) {
7895 				pde_store(pde, pa | PG_PS | PG_M | PG_A |
7896 				    PG_U | PG_RW | PG_V);
7897 				pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE);
7898 				counter_u64_add(pmap_pde_mappings, 1);
7899 			} else {
7900 				/* Continue on if the PDE is already valid. */
7901 				pdpg->ref_count--;
7902 				KASSERT(pdpg->ref_count > 0,
7903 				    ("pmap_object_init_pt: missing reference "
7904 				    "to page directory page, va: 0x%lx", addr));
7905 			}
7906 			addr += NBPDR;
7907 		}
7908 		PMAP_UNLOCK(pmap);
7909 	}
7910 }
7911 
7912 /*
7913  *	Clear the wired attribute from the mappings for the specified range of
7914  *	addresses in the given pmap.  Every valid mapping within that range
7915  *	must have the wired attribute set.  In contrast, invalid mappings
7916  *	cannot have the wired attribute set, so they are ignored.
7917  *
7918  *	The wired attribute of the page table entry is not a hardware
7919  *	feature, so there is no need to invalidate any TLB entries.
7920  *	Since pmap_demote_pde() for the wired entry must never fail,
7921  *	pmap_delayed_invl_start()/finish() calls around the
7922  *	function are not needed.
7923  */
7924 void
7925 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
7926 {
7927 	vm_offset_t va_next;
7928 	pml4_entry_t *pml4e;
7929 	pdp_entry_t *pdpe;
7930 	pd_entry_t *pde;
7931 	pt_entry_t *pte, PG_V, PG_G __diagused;
7932 
7933 	PG_V = pmap_valid_bit(pmap);
7934 	PG_G = pmap_global_bit(pmap);
7935 	PMAP_LOCK(pmap);
7936 	for (; sva < eva; sva = va_next) {
7937 		pml4e = pmap_pml4e(pmap, sva);
7938 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
7939 			va_next = (sva + NBPML4) & ~PML4MASK;
7940 			if (va_next < sva)
7941 				va_next = eva;
7942 			continue;
7943 		}
7944 
7945 		va_next = (sva + NBPDP) & ~PDPMASK;
7946 		if (va_next < sva)
7947 			va_next = eva;
7948 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
7949 		if ((*pdpe & PG_V) == 0)
7950 			continue;
7951 		if ((*pdpe & PG_PS) != 0) {
7952 			KASSERT(va_next <= eva,
7953 			    ("partial update of non-transparent 1G mapping "
7954 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
7955 			    *pdpe, sva, eva, va_next));
7956 			MPASS(pmap != kernel_pmap); /* XXXKIB */
7957 			MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
7958 			atomic_clear_long(pdpe, PG_W);
7959 			pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE;
7960 			continue;
7961 		}
7962 
7963 		va_next = (sva + NBPDR) & ~PDRMASK;
7964 		if (va_next < sva)
7965 			va_next = eva;
7966 		pde = pmap_pdpe_to_pde(pdpe, sva);
7967 		if ((*pde & PG_V) == 0)
7968 			continue;
7969 		if ((*pde & PG_PS) != 0) {
7970 			if ((*pde & PG_W) == 0)
7971 				panic("pmap_unwire: pde %#jx is missing PG_W",
7972 				    (uintmax_t)*pde);
7973 
7974 			/*
7975 			 * Are we unwiring the entire large page?  If not,
7976 			 * demote the mapping and fall through.
7977 			 */
7978 			if (sva + NBPDR == va_next && eva >= va_next) {
7979 				atomic_clear_long(pde, PG_W);
7980 				pmap->pm_stats.wired_count -= NBPDR /
7981 				    PAGE_SIZE;
7982 				continue;
7983 			} else if (!pmap_demote_pde(pmap, pde, sva))
7984 				panic("pmap_unwire: demotion failed");
7985 		}
7986 		if (va_next > eva)
7987 			va_next = eva;
7988 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
7989 		    sva += PAGE_SIZE) {
7990 			if ((*pte & PG_V) == 0)
7991 				continue;
7992 			if ((*pte & PG_W) == 0)
7993 				panic("pmap_unwire: pte %#jx is missing PG_W",
7994 				    (uintmax_t)*pte);
7995 
7996 			/*
7997 			 * PG_W must be cleared atomically.  Although the pmap
7998 			 * lock synchronizes access to PG_W, another processor
7999 			 * could be setting PG_M and/or PG_A concurrently.
8000 			 */
8001 			atomic_clear_long(pte, PG_W);
8002 			pmap->pm_stats.wired_count--;
8003 		}
8004 	}
8005 	PMAP_UNLOCK(pmap);
8006 }
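
/*
 * Illustrative sketch, not part of the original file: pairing pmap_unwire()
 * above with an earlier wired pmap_enter().  Every valid mapping in the
 * range must already be wired or the function panics, so the range here is
 * assumed to be exactly the one that was wired.  Guarded by #if 0.
 */
#if 0
static void
pmap_unwire_example(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	/* Clearing PG_W needs no TLB invalidation; see the comment above. */
	pmap_unwire(pmap, sva, eva);
}
#endif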
8007 
8008 /*
8009  *	Copy the range specified by src_addr/len
8010  *	from the source map to the range dst_addr/len
8011  *	in the destination map.
8012  *
8013  *	This routine is only advisory and need not do anything.
8014  */
8015 void
8016 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
8017     vm_offset_t src_addr)
8018 {
8019 	struct rwlock *lock;
8020 	pml4_entry_t *pml4e;
8021 	pdp_entry_t *pdpe;
8022 	pd_entry_t *pde, srcptepaddr;
8023 	pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte;
8024 	vm_offset_t addr, end_addr, va_next;
8025 	vm_page_t dst_pdpg, dstmpte, srcmpte;
8026 
8027 	if (dst_addr != src_addr)
8028 		return;
8029 
8030 	if (dst_pmap->pm_type != src_pmap->pm_type)
8031 		return;
8032 
8033 	/*
8034 	 * EPT page table entries that require emulation of A/D bits are
8035 	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
8036 	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
8037 	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
8038 	 * implementations flag an EPT misconfiguration for exec-only
8039 	 * mappings we skip this function entirely for emulated pmaps.
8040 	 */
8041 	if (pmap_emulate_ad_bits(dst_pmap))
8042 		return;
8043 
8044 	end_addr = src_addr + len;
8045 	lock = NULL;
8046 	if (dst_pmap < src_pmap) {
8047 		PMAP_LOCK(dst_pmap);
8048 		PMAP_LOCK(src_pmap);
8049 	} else {
8050 		PMAP_LOCK(src_pmap);
8051 		PMAP_LOCK(dst_pmap);
8052 	}
8053 
8054 	PG_A = pmap_accessed_bit(dst_pmap);
8055 	PG_M = pmap_modified_bit(dst_pmap);
8056 	PG_V = pmap_valid_bit(dst_pmap);
8057 
8058 	for (addr = src_addr; addr < end_addr; addr = va_next) {
8059 		KASSERT(addr < UPT_MIN_ADDRESS,
8060 		    ("pmap_copy: invalid to pmap_copy page tables"));
8061 
8062 		pml4e = pmap_pml4e(src_pmap, addr);
8063 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
8064 			va_next = (addr + NBPML4) & ~PML4MASK;
8065 			if (va_next < addr)
8066 				va_next = end_addr;
8067 			continue;
8068 		}
8069 
8070 		va_next = (addr + NBPDP) & ~PDPMASK;
8071 		if (va_next < addr)
8072 			va_next = end_addr;
8073 		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
8074 		if ((*pdpe & PG_V) == 0)
8075 			continue;
8076 		if ((*pdpe & PG_PS) != 0) {
8077 			KASSERT(va_next <= end_addr,
8078 			    ("partial update of non-transparent 1G mapping "
8079 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
8080 			    *pdpe, addr, end_addr, va_next));
8081 			MPASS((addr & PDPMASK) == 0);
8082 			MPASS((*pdpe & PG_MANAGED) == 0);
8083 			srcptepaddr = *pdpe;
8084 			pdpe = pmap_pdpe(dst_pmap, addr);
8085 			if (pdpe == NULL) {
8086 				if (pmap_allocpte_alloc(dst_pmap,
8087 				    pmap_pml4e_pindex(addr), NULL, addr) ==
8088 				    NULL)
8089 					break;
8090 				pdpe = pmap_pdpe(dst_pmap, addr);
8091 			} else {
8092 				pml4e = pmap_pml4e(dst_pmap, addr);
8093 				dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
8094 				dst_pdpg->ref_count++;
8095 			}
8096 			KASSERT(*pdpe == 0,
8097 			    ("1G mapping present in dst pmap "
8098 			    "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
8099 			    *pdpe, addr, end_addr, va_next));
8100 			*pdpe = srcptepaddr & ~PG_W;
8101 			pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE);
8102 			continue;
8103 		}
8104 
8105 		va_next = (addr + NBPDR) & ~PDRMASK;
8106 		if (va_next < addr)
8107 			va_next = end_addr;
8108 
8109 		pde = pmap_pdpe_to_pde(pdpe, addr);
8110 		srcptepaddr = *pde;
8111 		if (srcptepaddr == 0)
8112 			continue;
8113 
8114 		if (srcptepaddr & PG_PS) {
8115 			/*
8116 			 * We can only virtual copy whole superpages.
8117 			 */
8118 			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
8119 				continue;
8120 			pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL);
8121 			if (pde == NULL)
8122 				break;
8123 			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
8124 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
8125 			    PMAP_ENTER_NORECLAIM, &lock))) {
8126 				/*
8127 				 * We leave the dirty bit unchanged because
8128 				 * managed read/write superpage mappings are
8129 				 * required to be dirty.  However, managed
8130 				 * superpage mappings are not required to
8131 				 * have their accessed bit set, so we clear
8132 				 * it because we don't know if this mapping
8133 				 * will be used.
8134 				 */
8135 				srcptepaddr &= ~PG_W;
8136 				if ((srcptepaddr & PG_MANAGED) != 0)
8137 					srcptepaddr &= ~PG_A;
8138 				*pde = srcptepaddr;
8139 				pmap_resident_count_adj(dst_pmap, NBPDR /
8140 				    PAGE_SIZE);
8141 				counter_u64_add(pmap_pde_mappings, 1);
8142 			} else
8143 				pmap_abort_ptp(dst_pmap, addr, dst_pdpg);
8144 			continue;
8145 		}
8146 
8147 		srcptepaddr &= PG_FRAME;
8148 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
8149 		KASSERT(srcmpte->ref_count > 0,
8150 		    ("pmap_copy: source page table page is unused"));
8151 
8152 		if (va_next > end_addr)
8153 			va_next = end_addr;
8154 
8155 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
8156 		src_pte = &src_pte[pmap_pte_index(addr)];
8157 		dstmpte = NULL;
8158 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
8159 			ptetemp = *src_pte;
8160 
8161 			/*
8162 			 * We only virtual copy managed pages.
8163 			 */
8164 			if ((ptetemp & PG_MANAGED) == 0)
8165 				continue;
8166 
8167 			if (dstmpte != NULL) {
8168 				KASSERT(dstmpte->pindex ==
8169 				    pmap_pde_pindex(addr),
8170 				    ("dstmpte pindex/addr mismatch"));
8171 				dstmpte->ref_count++;
8172 			} else if ((dstmpte = pmap_allocpte(dst_pmap, addr,
8173 			    NULL)) == NULL)
8174 				goto out;
8175 			dst_pte = (pt_entry_t *)
8176 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
8177 			dst_pte = &dst_pte[pmap_pte_index(addr)];
8178 			if (*dst_pte == 0 &&
8179 			    pmap_try_insert_pv_entry(dst_pmap, addr,
8180 			    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) {
8181 				/*
8182 				 * Clear the wired, modified, and accessed
8183 				 * (referenced) bits during the copy.
8184 				 */
8185 				*dst_pte = ptetemp & ~(PG_W | PG_M | PG_A);
8186 				pmap_resident_count_adj(dst_pmap, 1);
8187 			} else {
8188 				pmap_abort_ptp(dst_pmap, addr, dstmpte);
8189 				goto out;
8190 			}
8191 			/* Have we copied all of the valid mappings? */
8192 			if (dstmpte->ref_count >= srcmpte->ref_count)
8193 				break;
8194 		}
8195 	}
8196 out:
8197 	if (lock != NULL)
8198 		rw_wunlock(lock);
8199 	PMAP_UNLOCK(src_pmap);
8200 	PMAP_UNLOCK(dst_pmap);
8201 }
8202 
8203 int
8204 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
8205 {
8206 	int error;
8207 
8208 	if (dst_pmap->pm_type != src_pmap->pm_type ||
8209 	    dst_pmap->pm_type != PT_X86 ||
8210 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
8211 		return (0);
8212 	for (;;) {
8213 		if (dst_pmap < src_pmap) {
8214 			PMAP_LOCK(dst_pmap);
8215 			PMAP_LOCK(src_pmap);
8216 		} else {
8217 			PMAP_LOCK(src_pmap);
8218 			PMAP_LOCK(dst_pmap);
8219 		}
8220 		error = pmap_pkru_copy(dst_pmap, src_pmap);
8221 		/* Clean up partial copy on failure due to no memory. */
8222 		if (error == ENOMEM)
8223 			pmap_pkru_deassign_all(dst_pmap);
8224 		PMAP_UNLOCK(src_pmap);
8225 		PMAP_UNLOCK(dst_pmap);
8226 		if (error != ENOMEM)
8227 			break;
8228 		vm_wait(NULL);
8229 	}
8230 	return (error);
8231 }
8232 
8233 /*
8234  * Zero the specified hardware page.
8235  */
8236 void
8237 pmap_zero_page(vm_page_t m)
8238 {
8239 	vm_offset_t va;
8240 
8241 #ifdef TSLOG_PAGEZERO
8242 	TSENTER();
8243 #endif
8244 	va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
8245 	pagezero((void *)va);
8246 #ifdef TSLOG_PAGEZERO
8247 	TSEXIT();
8248 #endif
8249 }
8250 
8251 /*
8252  * Zero an area within a single hardware page.  off and size must not
8253  * cover an area beyond a single hardware page.
8254  */
8255 void
8256 pmap_zero_page_area(vm_page_t m, int off, int size)
8257 {
8258 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
8259 
8260 	if (off == 0 && size == PAGE_SIZE)
8261 		pagezero((void *)va);
8262 	else
8263 		bzero((char *)va + off, size);
8264 }
8265 
8266 /*
8267  * Copy 1 specified hardware page to another.
8268  */
8269 void
8270 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
8271 {
8272 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
8273 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
8274 
8275 	pagecopy((void *)src, (void *)dst);
8276 }
8277 
8278 int unmapped_buf_allowed = 1;
8279 
8280 void
8281 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
8282     vm_offset_t b_offset, int xfersize)
8283 {
8284 	void *a_cp, *b_cp;
8285 	vm_page_t pages[2];
8286 	vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
8287 	int cnt;
8288 	bool mapped;
8289 
8290 	while (xfersize > 0) {
8291 		a_pg_offset = a_offset & PAGE_MASK;
8292 		pages[0] = ma[a_offset >> PAGE_SHIFT];
8293 		b_pg_offset = b_offset & PAGE_MASK;
8294 		pages[1] = mb[b_offset >> PAGE_SHIFT];
8295 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
8296 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
8297 		mapped = pmap_map_io_transient(pages, vaddr, 2, false);
8298 		a_cp = (char *)vaddr[0] + a_pg_offset;
8299 		b_cp = (char *)vaddr[1] + b_pg_offset;
8300 		bcopy(a_cp, b_cp, cnt);
8301 		if (__predict_false(mapped))
8302 			pmap_unmap_io_transient(pages, vaddr, 2, false);
8303 		a_offset += cnt;
8304 		b_offset += cnt;
8305 		xfersize -= cnt;
8306 	}
8307 }
8308 
8309 /*
8310  * Returns true if the pmap's pv is one of the first
8311  * 16 pvs linked to from this page.  This count may
8312  * be changed upwards or downwards in the future; it
8313  * is only necessary that true be returned for a small
8314  * subset of pmaps for proper page aging.
8315  */
8316 bool
8317 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
8318 {
8319 	struct md_page *pvh;
8320 	struct rwlock *lock;
8321 	pv_entry_t pv;
8322 	int loops = 0;
8323 	bool rv;
8324 
8325 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8326 	    ("pmap_page_exists_quick: page %p is not managed", m));
8327 	rv = false;
8328 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8329 	rw_rlock(lock);
8330 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
8331 		if (PV_PMAP(pv) == pmap) {
8332 			rv = true;
8333 			break;
8334 		}
8335 		loops++;
8336 		if (loops >= 16)
8337 			break;
8338 	}
8339 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
8340 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
8341 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
8342 			if (PV_PMAP(pv) == pmap) {
8343 				rv = true;
8344 				break;
8345 			}
8346 			loops++;
8347 			if (loops >= 16)
8348 				break;
8349 		}
8350 	}
8351 	rw_runlock(lock);
8352 	return (rv);
8353 }
8354 
8355 /*
8356  *	pmap_page_wired_mappings:
8357  *
8358  *	Return the number of managed mappings to the given physical page
8359  *	that are wired.
8360  */
8361 int
8362 pmap_page_wired_mappings(vm_page_t m)
8363 {
8364 	struct rwlock *lock;
8365 	struct md_page *pvh;
8366 	pmap_t pmap;
8367 	pt_entry_t *pte;
8368 	pv_entry_t pv;
8369 	int count, md_gen, pvh_gen;
8370 
8371 	if ((m->oflags & VPO_UNMANAGED) != 0)
8372 		return (0);
8373 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8374 	rw_rlock(lock);
8375 restart:
8376 	count = 0;
8377 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
8378 		pmap = PV_PMAP(pv);
8379 		if (!PMAP_TRYLOCK(pmap)) {
8380 			md_gen = m->md.pv_gen;
8381 			rw_runlock(lock);
8382 			PMAP_LOCK(pmap);
8383 			rw_rlock(lock);
8384 			if (md_gen != m->md.pv_gen) {
8385 				PMAP_UNLOCK(pmap);
8386 				goto restart;
8387 			}
8388 		}
8389 		pte = pmap_pte(pmap, pv->pv_va);
8390 		if ((*pte & PG_W) != 0)
8391 			count++;
8392 		PMAP_UNLOCK(pmap);
8393 	}
8394 	if ((m->flags & PG_FICTITIOUS) == 0) {
8395 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
8396 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
8397 			pmap = PV_PMAP(pv);
8398 			if (!PMAP_TRYLOCK(pmap)) {
8399 				md_gen = m->md.pv_gen;
8400 				pvh_gen = pvh->pv_gen;
8401 				rw_runlock(lock);
8402 				PMAP_LOCK(pmap);
8403 				rw_rlock(lock);
8404 				if (md_gen != m->md.pv_gen ||
8405 				    pvh_gen != pvh->pv_gen) {
8406 					PMAP_UNLOCK(pmap);
8407 					goto restart;
8408 				}
8409 			}
8410 			pte = pmap_pde(pmap, pv->pv_va);
8411 			if ((*pte & PG_W) != 0)
8412 				count++;
8413 			PMAP_UNLOCK(pmap);
8414 		}
8415 	}
8416 	rw_runlock(lock);
8417 	return (count);
8418 }
8419 
8420 /*
8421  * Returns true if the given page is mapped individually or as part of
8422  * a 2mpage.  Otherwise, returns false.
8423  */
8424 bool
8425 pmap_page_is_mapped(vm_page_t m)
8426 {
8427 	struct rwlock *lock;
8428 	bool rv;
8429 
8430 	if ((m->oflags & VPO_UNMANAGED) != 0)
8431 		return (false);
8432 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8433 	rw_rlock(lock);
8434 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
8435 	    ((m->flags & PG_FICTITIOUS) == 0 &&
8436 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
8437 	rw_runlock(lock);
8438 	return (rv);
8439 }
8440 
8441 /*
8442  * Destroy all managed, non-wired mappings in the given user-space
8443  * pmap.  This pmap cannot be active on any processor besides the
8444  * caller.
8445  *
8446  * This function cannot be applied to the kernel pmap.  Moreover, it
8447  * is not intended for general use.  It is only to be used during
8448  * process termination.  Consequently, it can be implemented in ways
8449  * that make it faster than pmap_remove().  First, it can more quickly
8450  * destroy mappings by iterating over the pmap's collection of PV
8451  * entries, rather than searching the page table.  Second, it doesn't
8452  * have to test and clear the page table entries atomically, because
8453  * no processor is currently accessing the user address space.  In
8454  * particular, a page table entry's dirty bit won't change state once
8455  * this function starts.
8456  *
8457  * Although this function destroys all of the pmap's managed,
8458  * non-wired mappings, it can delay and batch the invalidation of TLB
8459  * entries without calling pmap_delayed_invl_start() and
8460  * pmap_delayed_invl_finish().  Because the pmap is not active on
8461  * any other processor, none of these TLB entries will ever be used
8462  * before their eventual invalidation.  Consequently, there is no need
8463  * for either pmap_remove_all() or pmap_remove_write() to wait for
8464  * that eventual TLB invalidation.
8465  */
8466 void
8467 pmap_remove_pages(pmap_t pmap)
8468 {
8469 	pd_entry_t ptepde;
8470 	pt_entry_t *pte, tpte;
8471 	pt_entry_t PG_M, PG_RW, PG_V;
8472 	struct spglist free;
8473 	struct pv_chunklist free_chunks[PMAP_MEMDOM];
8474 	vm_page_t m, mpte, mt;
8475 	pv_entry_t pv;
8476 	struct md_page *pvh;
8477 	struct pv_chunk *pc, *npc;
8478 	struct rwlock *lock;
8479 	int64_t bit;
8480 	uint64_t inuse, bitmask;
8481 	int allfree, field, i, idx;
8482 #ifdef PV_STATS
8483 	int freed;
8484 #endif
8485 	bool superpage;
8486 	vm_paddr_t pa;
8487 
8488 	/*
8489 	 * Assert that the given pmap is only active on the current
8490 	 * CPU.  Unfortunately, we cannot block another CPU from
8491 	 * activating the pmap while this function is executing.
8492 	 */
8493 	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
8494 #ifdef INVARIANTS
8495 	{
8496 		cpuset_t other_cpus;
8497 
8498 		other_cpus = all_cpus;
8499 		critical_enter();
8500 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
8501 		CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active);
8502 		critical_exit();
8503 		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
8504 	}
8505 #endif
8506 
8507 	lock = NULL;
8508 	PG_M = pmap_modified_bit(pmap);
8509 	PG_V = pmap_valid_bit(pmap);
8510 	PG_RW = pmap_rw_bit(pmap);
8511 
8512 	for (i = 0; i < PMAP_MEMDOM; i++)
8513 		TAILQ_INIT(&free_chunks[i]);
8514 	SLIST_INIT(&free);
8515 	PMAP_LOCK(pmap);
8516 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
8517 		allfree = 1;
8518 #ifdef PV_STATS
8519 		freed = 0;
8520 #endif
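		/*
		 * Every clear bit in pc_map[] identifies an allocated pv
		 * entry in this chunk; complementing against pc_freemask[]
		 * and scanning with bsfq() below visits each such entry.
		 */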
8521 		for (field = 0; field < _NPCM; field++) {
8522 			inuse = ~pc->pc_map[field] & pc_freemask[field];
8523 			while (inuse != 0) {
8524 				bit = bsfq(inuse);
8525 				bitmask = 1UL << bit;
8526 				idx = field * 64 + bit;
8527 				pv = &pc->pc_pventry[idx];
8528 				inuse &= ~bitmask;
8529 
8530 				pte = pmap_pdpe(pmap, pv->pv_va);
8531 				ptepde = *pte;
8532 				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
8533 				tpte = *pte;
8534 				if ((tpte & (PG_PS | PG_V)) == PG_V) {
8535 					superpage = false;
8536 					ptepde = tpte;
8537 					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
8538 					    PG_FRAME);
8539 					pte = &pte[pmap_pte_index(pv->pv_va)];
8540 					tpte = *pte;
8541 				} else {
8542 					/*
8543 					 * Keep track of whether 'tpte' is a
8544 					 * superpage explicitly instead of
8545 					 * relying on PG_PS being set.
8546 					 *
8547 					 * This is because PG_PS is numerically
8548 					 * identical to PG_PTE_PAT and thus a
8549 					 * regular page could be mistaken for
8550 					 * a superpage.
8551 					 */
8552 					superpage = true;
8553 				}
8554 
8555 				if ((tpte & PG_V) == 0) {
8556 					panic("bad pte va %lx pte %lx",
8557 					    pv->pv_va, tpte);
8558 				}
8559 
8560 /*
8561  * We cannot remove wired pages from a process' mapping at this time
8562  */
8563 				if (tpte & PG_W) {
8564 					allfree = 0;
8565 					continue;
8566 				}
8567 
8568 				/* Mark free */
8569 				pc->pc_map[field] |= bitmask;
8570 
8571 				/*
8572 				 * Because this pmap is not active on other
8573 				 * processors, the dirty bit cannot have
8574 				 * changed state since we last loaded pte.
8575 				 */
8576 				pte_clear(pte);
8577 
8578 				if (superpage)
8579 					pa = tpte & PG_PS_FRAME;
8580 				else
8581 					pa = tpte & PG_FRAME;
8582 
8583 				m = PHYS_TO_VM_PAGE(pa);
8584 				KASSERT(m->phys_addr == pa,
8585 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
8586 				    m, (uintmax_t)m->phys_addr,
8587 				    (uintmax_t)tpte));
8588 
8589 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
8590 				    m < &vm_page_array[vm_page_array_size],
8591 				    ("pmap_remove_pages: bad tpte %#jx",
8592 				    (uintmax_t)tpte));
8593 
8594 				/*
8595 				 * Update the vm_page_t clean/reference bits.
8596 				 */
8597 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
8598 					if (superpage) {
8599 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
8600 							vm_page_dirty(mt);
8601 					} else
8602 						vm_page_dirty(m);
8603 				}
8604 
8605 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
8606 
8607 				if (superpage) {
8608 					pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE);
8609 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
8610 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
8611 					pvh->pv_gen++;
8612 					if (TAILQ_EMPTY(&pvh->pv_list)) {
8613 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
8614 							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
8615 							    TAILQ_EMPTY(&mt->md.pv_list))
8616 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
8617 					}
8618 					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
8619 					if (mpte != NULL) {
8620 						KASSERT(vm_page_any_valid(mpte),
8621 						    ("pmap_remove_pages: pte page not promoted"));
8622 						pmap_pt_page_count_adj(pmap, -1);
8623 						KASSERT(mpte->ref_count == NPTEPG,
8624 						    ("pmap_remove_pages: pte page reference count error"));
8625 						mpte->ref_count = 0;
8626 						pmap_add_delayed_free_list(mpte, &free, false);
8627 					}
8628 				} else {
8629 					pmap_resident_count_adj(pmap, -1);
8630 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
8631 					m->md.pv_gen++;
8632 					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
8633 					    TAILQ_EMPTY(&m->md.pv_list) &&
8634 					    (m->flags & PG_FICTITIOUS) == 0) {
8635 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
8636 						if (TAILQ_EMPTY(&pvh->pv_list))
8637 							vm_page_aflag_clear(m, PGA_WRITEABLE);
8638 					}
8639 				}
8640 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
8641 #ifdef PV_STATS
8642 				freed++;
8643 #endif
8644 			}
8645 		}
8646 		PV_STAT(counter_u64_add(pv_entry_frees, freed));
8647 		PV_STAT(counter_u64_add(pv_entry_spare, freed));
8648 		PV_STAT(counter_u64_add(pv_entry_count, -freed));
8649 		if (allfree) {
8650 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
8651 			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list);
8652 		}
8653 	}
8654 	if (lock != NULL)
8655 		rw_wunlock(lock);
8656 	pmap_invalidate_all(pmap);
8657 	pmap_pkru_deassign_all(pmap);
8658 	free_pv_chunk_batch((struct pv_chunklist *)&free_chunks);
8659 	PMAP_UNLOCK(pmap);
8660 	vm_page_free_pages_toq(&free, true);
8661 }
8662 
8663 static bool
8664 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
8665 {
8666 	struct rwlock *lock;
8667 	pv_entry_t pv;
8668 	struct md_page *pvh;
8669 	pt_entry_t *pte, mask;
8670 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
8671 	pmap_t pmap;
8672 	int md_gen, pvh_gen;
8673 	bool rv;
8674 
8675 	rv = false;
8676 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8677 	rw_rlock(lock);
8678 restart:
8679 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
8680 		pmap = PV_PMAP(pv);
8681 		if (!PMAP_TRYLOCK(pmap)) {
8682 			md_gen = m->md.pv_gen;
8683 			rw_runlock(lock);
8684 			PMAP_LOCK(pmap);
8685 			rw_rlock(lock);
8686 			if (md_gen != m->md.pv_gen) {
8687 				PMAP_UNLOCK(pmap);
8688 				goto restart;
8689 			}
8690 		}
8691 		pte = pmap_pte(pmap, pv->pv_va);
8692 		mask = 0;
8693 		if (modified) {
8694 			PG_M = pmap_modified_bit(pmap);
8695 			PG_RW = pmap_rw_bit(pmap);
8696 			mask |= PG_RW | PG_M;
8697 		}
8698 		if (accessed) {
8699 			PG_A = pmap_accessed_bit(pmap);
8700 			PG_V = pmap_valid_bit(pmap);
8701 			mask |= PG_V | PG_A;
8702 		}
8703 		rv = (*pte & mask) == mask;
8704 		PMAP_UNLOCK(pmap);
8705 		if (rv)
8706 			goto out;
8707 	}
8708 	if ((m->flags & PG_FICTITIOUS) == 0) {
8709 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
8710 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
8711 			pmap = PV_PMAP(pv);
8712 			if (!PMAP_TRYLOCK(pmap)) {
8713 				md_gen = m->md.pv_gen;
8714 				pvh_gen = pvh->pv_gen;
8715 				rw_runlock(lock);
8716 				PMAP_LOCK(pmap);
8717 				rw_rlock(lock);
8718 				if (md_gen != m->md.pv_gen ||
8719 				    pvh_gen != pvh->pv_gen) {
8720 					PMAP_UNLOCK(pmap);
8721 					goto restart;
8722 				}
8723 			}
8724 			pte = pmap_pde(pmap, pv->pv_va);
8725 			mask = 0;
8726 			if (modified) {
8727 				PG_M = pmap_modified_bit(pmap);
8728 				PG_RW = pmap_rw_bit(pmap);
8729 				mask |= PG_RW | PG_M;
8730 			}
8731 			if (accessed) {
8732 				PG_A = pmap_accessed_bit(pmap);
8733 				PG_V = pmap_valid_bit(pmap);
8734 				mask |= PG_V | PG_A;
8735 			}
8736 			rv = (*pte & mask) == mask;
8737 			PMAP_UNLOCK(pmap);
8738 			if (rv)
8739 				goto out;
8740 		}
8741 	}
8742 out:
8743 	rw_runlock(lock);
8744 	return (rv);
8745 }
8746 
8747 /*
8748  *	pmap_is_modified:
8749  *
8750  *	Return whether or not the specified physical page was modified
8751  *	in any physical maps.
8752  */
8753 bool
8754 pmap_is_modified(vm_page_t m)
8755 {
8756 
8757 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8758 	    ("pmap_is_modified: page %p is not managed", m));
8759 
8760 	/*
8761 	 * If the page is not busied then this check is racy.
8762 	 */
8763 	if (!pmap_page_is_write_mapped(m))
8764 		return (false);
8765 	return (pmap_page_test_mappings(m, false, true));
8766 }
8767 
8768 /*
8769  *	pmap_is_prefaultable:
8770  *
8771  *	Return whether or not the specified virtual address is eligible
8772  *	for prefault.
8773  */
8774 bool
8775 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
8776 {
8777 	pd_entry_t *pde;
8778 	pt_entry_t *pte, PG_V;
8779 	bool rv;
8780 
8781 	PG_V = pmap_valid_bit(pmap);
8782 
8783 	/*
8784 	 * Return true if and only if the PTE for the specified virtual
8785 	 * address is allocated but invalid.
8786 	 */
8787 	rv = false;
8788 	PMAP_LOCK(pmap);
8789 	pde = pmap_pde(pmap, addr);
8790 	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
8791 		pte = pmap_pde_to_pte(pde, addr);
8792 		rv = (*pte & PG_V) == 0;
8793 	}
8794 	PMAP_UNLOCK(pmap);
8795 	return (rv);
8796 }
8797 
8798 /*
8799  *	pmap_is_referenced:
8800  *
8801  *	Return whether or not the specified physical page was referenced
8802  *	in any physical maps.
8803  */
8804 bool
8805 pmap_is_referenced(vm_page_t m)
8806 {
8807 
8808 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8809 	    ("pmap_is_referenced: page %p is not managed", m));
8810 	return (pmap_page_test_mappings(m, true, false));
8811 }
8812 
8813 /*
8814  * Clear the write and modified bits in each of the given page's mappings.
8815  */
8816 void
8817 pmap_remove_write(vm_page_t m)
8818 {
8819 	struct md_page *pvh;
8820 	pmap_t pmap;
8821 	struct rwlock *lock;
8822 	pv_entry_t next_pv, pv;
8823 	pd_entry_t *pde;
8824 	pt_entry_t oldpte, *pte, PG_M, PG_RW;
8825 	vm_offset_t va;
8826 	int pvh_gen, md_gen;
8827 
8828 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8829 	    ("pmap_remove_write: page %p is not managed", m));
8830 
8831 	vm_page_assert_busied(m);
8832 	if (!pmap_page_is_write_mapped(m))
8833 		return;
8834 
8835 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8836 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
8837 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
8838 	rw_wlock(lock);
8839 retry:
8840 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
8841 		pmap = PV_PMAP(pv);
8842 		if (!PMAP_TRYLOCK(pmap)) {
8843 			pvh_gen = pvh->pv_gen;
8844 			rw_wunlock(lock);
8845 			PMAP_LOCK(pmap);
8846 			rw_wlock(lock);
8847 			if (pvh_gen != pvh->pv_gen) {
8848 				PMAP_UNLOCK(pmap);
8849 				goto retry;
8850 			}
8851 		}
8852 		PG_RW = pmap_rw_bit(pmap);
8853 		va = pv->pv_va;
8854 		pde = pmap_pde(pmap, va);
8855 		if ((*pde & PG_RW) != 0)
8856 			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
8857 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
8858 		    ("inconsistent pv lock %p %p for page %p",
8859 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
8860 		PMAP_UNLOCK(pmap);
8861 	}
8862 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
8863 		pmap = PV_PMAP(pv);
8864 		if (!PMAP_TRYLOCK(pmap)) {
8865 			pvh_gen = pvh->pv_gen;
8866 			md_gen = m->md.pv_gen;
8867 			rw_wunlock(lock);
8868 			PMAP_LOCK(pmap);
8869 			rw_wlock(lock);
8870 			if (pvh_gen != pvh->pv_gen ||
8871 			    md_gen != m->md.pv_gen) {
8872 				PMAP_UNLOCK(pmap);
8873 				goto retry;
8874 			}
8875 		}
8876 		PG_M = pmap_modified_bit(pmap);
8877 		PG_RW = pmap_rw_bit(pmap);
8878 		pde = pmap_pde(pmap, pv->pv_va);
8879 		KASSERT((*pde & PG_PS) == 0,
8880 		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
8881 		    m));
8882 		pte = pmap_pde_to_pte(pde, pv->pv_va);
8883 		oldpte = *pte;
8884 		if (oldpte & PG_RW) {
8885 			while (!atomic_fcmpset_long(pte, &oldpte, oldpte &
8886 			    ~(PG_RW | PG_M)))
8887 				cpu_spinwait();
8888 			if ((oldpte & PG_M) != 0)
8889 				vm_page_dirty(m);
8890 			pmap_invalidate_page(pmap, pv->pv_va);
8891 		}
8892 		PMAP_UNLOCK(pmap);
8893 	}
8894 	rw_wunlock(lock);
8895 	vm_page_aflag_clear(m, PGA_WRITEABLE);
8896 	pmap_delayed_invl_wait(m);
8897 }
8898 
8899 /*
8900  *	pmap_ts_referenced:
8901  *
8902  *	Return a count of reference bits for a page, clearing those bits.
8903  *	It is not necessary for every reference bit to be cleared, but it
8904  *	is necessary that 0 only be returned when there are truly no
8905  *	reference bits set.
8906  *
8907  *	As an optimization, update the page's dirty field if a modified bit is
8908  *	found while counting reference bits.  This opportunistic update can be
8909  *	performed at low cost and can eliminate the need for some future calls
8910  *	to pmap_is_modified().  However, since this function stops after
8911  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
8912  *	dirty pages.  Those dirty pages will only be detected by a future call
8913  *	to pmap_is_modified().
8914  *
8915  *	A DI block is not needed within this function, because
8916  *	invalidations are performed before the PV list lock is
8917  *	released.
8918  */
8919 int
8920 pmap_ts_referenced(vm_page_t m)
8921 {
8922 	struct md_page *pvh;
8923 	pv_entry_t pv, pvf;
8924 	pmap_t pmap;
8925 	struct rwlock *lock;
8926 	pd_entry_t oldpde, *pde;
8927 	pt_entry_t *pte, PG_A, PG_M, PG_RW;
8928 	vm_offset_t va;
8929 	vm_paddr_t pa;
8930 	int cleared, md_gen, not_cleared, pvh_gen;
8931 	struct spglist free;
8932 	bool demoted;
8933 
8934 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8935 	    ("pmap_ts_referenced: page %p is not managed", m));
8936 	SLIST_INIT(&free);
8937 	cleared = 0;
8938 	pa = VM_PAGE_TO_PHYS(m);
8939 	lock = PHYS_TO_PV_LIST_LOCK(pa);
8940 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
8941 	rw_wlock(lock);
8942 retry:
8943 	not_cleared = 0;
8944 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
8945 		goto small_mappings;
8946 	pv = pvf;
8947 	do {
8948 		if (pvf == NULL)
8949 			pvf = pv;
8950 		pmap = PV_PMAP(pv);
8951 		if (!PMAP_TRYLOCK(pmap)) {
8952 			pvh_gen = pvh->pv_gen;
8953 			rw_wunlock(lock);
8954 			PMAP_LOCK(pmap);
8955 			rw_wlock(lock);
8956 			if (pvh_gen != pvh->pv_gen) {
8957 				PMAP_UNLOCK(pmap);
8958 				goto retry;
8959 			}
8960 		}
8961 		PG_A = pmap_accessed_bit(pmap);
8962 		PG_M = pmap_modified_bit(pmap);
8963 		PG_RW = pmap_rw_bit(pmap);
8964 		va = pv->pv_va;
8965 		pde = pmap_pde(pmap, pv->pv_va);
8966 		oldpde = *pde;
8967 		if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
8968 			/*
8969 			 * Although "oldpde" is mapping a 2MB page, because
8970 			 * this function is called at a 4KB page granularity,
8971 			 * we only update the 4KB page under test.
8972 			 */
8973 			vm_page_dirty(m);
8974 		}
8975 		if ((oldpde & PG_A) != 0) {
8976 			/*
8977 			 * Since this reference bit is shared by 512 4KB
8978 			 * pages, it should not be cleared every time it is
8979 			 * tested.  Apply a simple "hash" function on the
8980 			 * physical page number, the virtual superpage number,
8981 			 * and the pmap address to select one 4KB page out of
8982 			 * the 512 on which testing the reference bit will
8983 			 * result in clearing that reference bit.  This
8984 			 * function is designed to avoid the selection of the
8985 			 * same 4KB page for every 2MB page mapping.
8986 			 *
8987 			 * On demotion, a mapping that hasn't been referenced
8988 			 * is simply destroyed.  To avoid the possibility of a
8989 			 * subsequent page fault on a demoted wired mapping,
8990 			 * always leave its reference bit set.  Moreover,
8991 			 * since the superpage is wired, the current state of
8992 			 * its reference bit won't affect page replacement.
8993 			 */
8994 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
8995 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
8996 			    (oldpde & PG_W) == 0) {
8997 				if (safe_to_clear_referenced(pmap, oldpde)) {
8998 					atomic_clear_long(pde, PG_A);
8999 					pmap_invalidate_page(pmap, pv->pv_va);
9000 					demoted = false;
9001 				} else if (pmap_demote_pde_locked(pmap, pde,
9002 				    pv->pv_va, &lock)) {
9003 					/*
9004 					 * Remove the mapping to a single page
9005 					 * so that a subsequent access may
9006 					 * repromote.  Since the underlying
9007 					 * page table page is fully populated,
9008 					 * this removal never frees a page
9009 					 * table page.
9010 					 */
9011 					demoted = true;
9012 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
9013 					    PG_PS_FRAME);
9014 					pte = pmap_pde_to_pte(pde, va);
9015 					pmap_remove_pte(pmap, pte, va, *pde,
9016 					    NULL, &lock);
9017 					pmap_invalidate_page(pmap, va);
9018 				} else
9019 					demoted = true;
9020 
9021 				if (demoted) {
9022 					/*
9023 					 * The superpage mapping was removed
9024 					 * entirely and therefore 'pv' is no
9025 					 * longer valid.
9026 					 */
9027 					if (pvf == pv)
9028 						pvf = NULL;
9029 					pv = NULL;
9030 				}
9031 				cleared++;
9032 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
9033 				    ("inconsistent pv lock %p %p for page %p",
9034 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
9035 			} else
9036 				not_cleared++;
9037 		}
9038 		PMAP_UNLOCK(pmap);
9039 		/* Rotate the PV list if it has more than one entry. */
9040 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
9041 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
9042 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
9043 			pvh->pv_gen++;
9044 		}
9045 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
9046 			goto out;
9047 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
9048 small_mappings:
9049 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
9050 		goto out;
9051 	pv = pvf;
9052 	do {
9053 		if (pvf == NULL)
9054 			pvf = pv;
9055 		pmap = PV_PMAP(pv);
9056 		if (!PMAP_TRYLOCK(pmap)) {
9057 			pvh_gen = pvh->pv_gen;
9058 			md_gen = m->md.pv_gen;
9059 			rw_wunlock(lock);
9060 			PMAP_LOCK(pmap);
9061 			rw_wlock(lock);
9062 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
9063 				PMAP_UNLOCK(pmap);
9064 				goto retry;
9065 			}
9066 		}
9067 		PG_A = pmap_accessed_bit(pmap);
9068 		PG_M = pmap_modified_bit(pmap);
9069 		PG_RW = pmap_rw_bit(pmap);
9070 		pde = pmap_pde(pmap, pv->pv_va);
9071 		KASSERT((*pde & PG_PS) == 0,
9072 		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
9073 		    m));
9074 		pte = pmap_pde_to_pte(pde, pv->pv_va);
9075 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
9076 			vm_page_dirty(m);
9077 		if ((*pte & PG_A) != 0) {
9078 			if (safe_to_clear_referenced(pmap, *pte)) {
9079 				atomic_clear_long(pte, PG_A);
9080 				pmap_invalidate_page(pmap, pv->pv_va);
9081 				cleared++;
9082 			} else if ((*pte & PG_W) == 0) {
9083 				/*
9084 				 * Wired pages cannot be paged out so
9085 				 * doing accessed bit emulation for
9086 				 * them is wasted effort. We do the
9087 				 * hard work for unwired pages only.
9088 				 */
9089 				pmap_remove_pte(pmap, pte, pv->pv_va,
9090 				    *pde, &free, &lock);
9091 				pmap_invalidate_page(pmap, pv->pv_va);
9092 				cleared++;
9093 				if (pvf == pv)
9094 					pvf = NULL;
9095 				pv = NULL;
9096 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
9097 				    ("inconsistent pv lock %p %p for page %p",
9098 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
9099 			} else
9100 				not_cleared++;
9101 		}
9102 		PMAP_UNLOCK(pmap);
9103 		/* Rotate the PV list if it has more than one entry. */
9104 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
9105 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
9106 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
9107 			m->md.pv_gen++;
9108 		}
9109 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
9110 	    not_cleared < PMAP_TS_REFERENCED_MAX);
9111 out:
9112 	rw_wunlock(lock);
9113 	vm_page_free_pages_toq(&free, true);
9114 	return (cleared + not_cleared);
9115 }
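
/*
 * Illustrative sketch (not part of pmap.c): one way a page-aging scan could
 * consume the count returned by pmap_ts_referenced().  The "act_count"
 * parameter and the decay policy are hypothetical; the real consumer is the
 * page daemon in vm/vm_pageout.c.
 */
#if 0
static void
example_age_page(vm_page_t m, int *act_count)
{
	int refs;

	/* Counts, and mostly clears, up to PMAP_TS_REFERENCED_MAX bits. */
	refs = pmap_ts_referenced(m);
	if (refs > 0)
		*act_count += refs;	/* recently referenced: keep active */
	else if (*act_count > 0)
		(*act_count)--;		/* idle: decay toward reclamation */
}
#endif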
9116 
9117 /*
9118  *	Apply the given advice to the specified range of addresses within the
9119  *	given pmap.  Depending on the advice, clear the referenced and/or
9120  *	modified flags in each mapping and set the mapped page's dirty field.
9121  */
9122 void
9123 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
9124 {
9125 	struct rwlock *lock;
9126 	pml4_entry_t *pml4e;
9127 	pdp_entry_t *pdpe;
9128 	pd_entry_t oldpde, *pde;
9129 	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
9130 	vm_offset_t va, va_next;
9131 	vm_page_t m;
9132 	bool anychanged;
9133 
9134 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
9135 		return;
9136 
9137 	/*
9138 	 * A/D bit emulation requires an alternate code path when clearing
9139 	 * the modified and accessed bits below. Since this function is
9140 	 * advisory in nature we skip it entirely for pmaps that require
9141 	 * A/D bit emulation.
9142 	 */
9143 	if (pmap_emulate_ad_bits(pmap))
9144 		return;
9145 
9146 	PG_A = pmap_accessed_bit(pmap);
9147 	PG_G = pmap_global_bit(pmap);
9148 	PG_M = pmap_modified_bit(pmap);
9149 	PG_V = pmap_valid_bit(pmap);
9150 	PG_RW = pmap_rw_bit(pmap);
9151 	anychanged = false;
9152 	pmap_delayed_invl_start();
9153 	PMAP_LOCK(pmap);
9154 	for (; sva < eva; sva = va_next) {
9155 		pml4e = pmap_pml4e(pmap, sva);
9156 		if (pml4e == NULL || (*pml4e & PG_V) == 0) {
9157 			va_next = (sva + NBPML4) & ~PML4MASK;
9158 			if (va_next < sva)
9159 				va_next = eva;
9160 			continue;
9161 		}
9162 
9163 		va_next = (sva + NBPDP) & ~PDPMASK;
9164 		if (va_next < sva)
9165 			va_next = eva;
9166 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
9167 		if ((*pdpe & PG_V) == 0)
9168 			continue;
9169 		if ((*pdpe & PG_PS) != 0)
9170 			continue;
9171 
9172 		va_next = (sva + NBPDR) & ~PDRMASK;
9173 		if (va_next < sva)
9174 			va_next = eva;
9175 		pde = pmap_pdpe_to_pde(pdpe, sva);
9176 		oldpde = *pde;
9177 		if ((oldpde & PG_V) == 0)
9178 			continue;
9179 		else if ((oldpde & PG_PS) != 0) {
9180 			if ((oldpde & PG_MANAGED) == 0)
9181 				continue;
9182 			lock = NULL;
9183 			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
9184 				if (lock != NULL)
9185 					rw_wunlock(lock);
9186 
9187 				/*
9188 				 * The large page mapping was destroyed.
9189 				 */
9190 				continue;
9191 			}
9192 
9193 			/*
9194 			 * Unless the page mappings are wired, remove the
9195 			 * mapping to a single page so that a subsequent
9196 			 * access may repromote.  Choosing the last page
9197 			 * within the address range [sva, min(va_next, eva))
9198 			 * generally results in more repromotions.  Since the
9199 			 * underlying page table page is fully populated, this
9200 			 * removal never frees a page table page.
9201 			 */
9202 			if ((oldpde & PG_W) == 0) {
9203 				va = eva;
9204 				if (va > va_next)
9205 					va = va_next;
9206 				va -= PAGE_SIZE;
9207 				KASSERT(va >= sva,
9208 				    ("pmap_advise: no address gap"));
9209 				pte = pmap_pde_to_pte(pde, va);
9210 				KASSERT((*pte & PG_V) != 0,
9211 				    ("pmap_advise: invalid PTE"));
9212 				pmap_remove_pte(pmap, pte, va, *pde, NULL,
9213 				    &lock);
9214 				anychanged = true;
9215 			}
9216 			if (lock != NULL)
9217 				rw_wunlock(lock);
9218 		}
9219 		if (va_next > eva)
9220 			va_next = eva;
9221 		va = va_next;
9222 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
9223 		    sva += PAGE_SIZE) {
9224 			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
9225 				goto maybe_invlrng;
9226 			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
9227 				if (advice == MADV_DONTNEED) {
9228 					/*
9229 					 * Future calls to pmap_is_modified()
9230 					 * can be avoided by making the page
9231 					 * dirty now.
9232 					 */
9233 					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
9234 					vm_page_dirty(m);
9235 				}
9236 				atomic_clear_long(pte, PG_M | PG_A);
9237 			} else if ((*pte & PG_A) != 0)
9238 				atomic_clear_long(pte, PG_A);
9239 			else
9240 				goto maybe_invlrng;
9241 
9242 			if ((*pte & PG_G) != 0) {
9243 				if (va == va_next)
9244 					va = sva;
9245 			} else
9246 				anychanged = true;
9247 			continue;
9248 maybe_invlrng:
9249 			if (va != va_next) {
9250 				pmap_invalidate_range(pmap, va, sva);
9251 				va = va_next;
9252 			}
9253 		}
9254 		if (va != va_next)
9255 			pmap_invalidate_range(pmap, va, sva);
9256 	}
9257 	if (anychanged)
9258 		pmap_invalidate_all(pmap);
9259 	PMAP_UNLOCK(pmap);
9260 	pmap_delayed_invl_finish();
9261 }
9262 
9263 /*
9264  *	Clear the modify bits on the specified physical page.
9265  */
9266 void
9267 pmap_clear_modify(vm_page_t m)
9268 {
9269 	struct md_page *pvh;
9270 	pmap_t pmap;
9271 	pv_entry_t next_pv, pv;
9272 	pd_entry_t oldpde, *pde;
9273 	pt_entry_t *pte, PG_M, PG_RW;
9274 	struct rwlock *lock;
9275 	vm_offset_t va;
9276 	int md_gen, pvh_gen;
9277 
9278 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
9279 	    ("pmap_clear_modify: page %p is not managed", m));
9280 	vm_page_assert_busied(m);
9281 
9282 	if (!pmap_page_is_write_mapped(m))
9283 		return;
9284 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
9285 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
9286 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
9287 	rw_wlock(lock);
9288 restart:
9289 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
9290 		pmap = PV_PMAP(pv);
9291 		if (!PMAP_TRYLOCK(pmap)) {
9292 			pvh_gen = pvh->pv_gen;
9293 			rw_wunlock(lock);
9294 			PMAP_LOCK(pmap);
9295 			rw_wlock(lock);
9296 			if (pvh_gen != pvh->pv_gen) {
9297 				PMAP_UNLOCK(pmap);
9298 				goto restart;
9299 			}
9300 		}
9301 		PG_M = pmap_modified_bit(pmap);
9302 		PG_RW = pmap_rw_bit(pmap);
9303 		va = pv->pv_va;
9304 		pde = pmap_pde(pmap, va);
9305 		oldpde = *pde;
9306 		/* If oldpde has PG_RW set, then it also has PG_M set. */
9307 		if ((oldpde & PG_RW) != 0 &&
9308 		    pmap_demote_pde_locked(pmap, pde, va, &lock) &&
9309 		    (oldpde & PG_W) == 0) {
9310 			/*
9311 			 * Write protect the mapping to a single page so that
9312 			 * a subsequent write access may repromote.
9313 			 */
9314 			va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
9315 			pte = pmap_pde_to_pte(pde, va);
9316 			atomic_clear_long(pte, PG_M | PG_RW);
9317 			vm_page_dirty(m);
9318 			pmap_invalidate_page(pmap, va);
9319 		}
9320 		PMAP_UNLOCK(pmap);
9321 	}
9322 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
9323 		pmap = PV_PMAP(pv);
9324 		if (!PMAP_TRYLOCK(pmap)) {
9325 			md_gen = m->md.pv_gen;
9326 			pvh_gen = pvh->pv_gen;
9327 			rw_wunlock(lock);
9328 			PMAP_LOCK(pmap);
9329 			rw_wlock(lock);
9330 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
9331 				PMAP_UNLOCK(pmap);
9332 				goto restart;
9333 			}
9334 		}
9335 		PG_M = pmap_modified_bit(pmap);
9336 		PG_RW = pmap_rw_bit(pmap);
9337 		pde = pmap_pde(pmap, pv->pv_va);
9338 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
9339 		    " a 2mpage in page %p's pv list", m));
9340 		pte = pmap_pde_to_pte(pde, pv->pv_va);
9341 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
9342 			atomic_clear_long(pte, PG_M);
9343 			pmap_invalidate_page(pmap, pv->pv_va);
9344 		}
9345 		PMAP_UNLOCK(pmap);
9346 	}
9347 	rw_wunlock(lock);
9348 }
9349 
9350 /*
9351  * Miscellaneous support routines follow
9352  */
9353 
9354 /* Adjust the properties for a leaf page table entry. */
9355 static __inline void
9356 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask)
9357 {
9358 	u_long opte, npte;
9359 
9360 	opte = *(u_long *)pte;
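	/*
	 * Standard compare-and-swap retry loop: atomic_fcmpset_long()
	 * refreshes "opte" with the current PTE value on failure, so the
	 * loop exits once the update lands or the PTE already carries the
	 * requested bits.
	 */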
9361 	do {
9362 		npte = opte & ~mask;
9363 		npte |= bits;
9364 	} while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte,
9365 	    npte));
9366 }
9367 
9368 /*
9369  * Map a set of physical memory pages into the kernel virtual
9370  * address space. Return a pointer to where it is mapped. This
9371  * routine is intended to be used for mapping device memory,
9372  * NOT real memory.
9373  */
9374 static void *
9375 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags)
9376 {
9377 	struct pmap_preinit_mapping *ppim;
9378 	vm_offset_t va, offset;
9379 	vm_size_t tmpsize;
9380 	int i;
9381 
9382 	offset = pa & PAGE_MASK;
9383 	size = round_page(offset + size);
9384 	pa = trunc_page(pa);
9385 
9386 	if (!pmap_initialized) {
9387 		va = 0;
9388 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
9389 			ppim = pmap_preinit_mapping + i;
9390 			if (ppim->va == 0) {
9391 				ppim->pa = pa;
9392 				ppim->sz = size;
9393 				ppim->mode = mode;
9394 				ppim->va = virtual_avail;
9395 				virtual_avail += size;
9396 				va = ppim->va;
9397 				break;
9398 			}
9399 		}
9400 		if (va == 0)
9401 			panic("%s: too many preinit mappings", __func__);
9402 	} else {
9403 		/*
9404 		 * If we have a preinit mapping, reuse it.
9405 		 */
9406 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
9407 			ppim = pmap_preinit_mapping + i;
9408 			if (ppim->pa == pa && ppim->sz == size &&
9409 			    (ppim->mode == mode ||
9410 			    (flags & MAPDEV_SETATTR) == 0))
9411 				return ((void *)(ppim->va + offset));
9412 		}
9413 		/*
9414 		 * If the specified range of physical addresses fits within
9415 		 * the direct map window, use the direct map.
9416 		 */
9417 		if (pa < dmaplimit && pa + size <= dmaplimit) {
9418 			va = PHYS_TO_DMAP(pa);
9419 			if ((flags & MAPDEV_SETATTR) != 0) {
9420 				PMAP_LOCK(kernel_pmap);
9421 				i = pmap_change_props_locked(va, size,
9422 				    PROT_NONE, mode, flags);
9423 				PMAP_UNLOCK(kernel_pmap);
9424 			} else
9425 				i = 0;
9426 			if (!i)
9427 				return ((void *)(va + offset));
9428 		}
9429 		va = kva_alloc(size);
9430 		if (va == 0)
9431 			panic("%s: Couldn't allocate KVA", __func__);
9432 	}
9433 	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
9434 		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
9435 	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
9436 	if ((flags & MAPDEV_FLUSHCACHE) != 0)
9437 		pmap_invalidate_cache_range(va, va + tmpsize);
9438 	return ((void *)(va + offset));
9439 }
9440 
9441 void *
9442 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
9443 {
9444 
9445 	return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE |
9446 	    MAPDEV_SETATTR));
9447 }
9448 
9449 void *
9450 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
9451 {
9452 
9453 	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
9454 }
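
/*
 * Illustrative sketch (not part of pmap.c): a hypothetical driver mapping a
 * small register window with pmap_mapdev() and tearing it down again with
 * pmap_unmapdev().  The physical address and size are invented; real drivers
 * normally take them from a bus resource rather than hard-coding them.
 */
#if 0
static void
example_map_device_regs(void)
{
	vm_paddr_t regs_pa = 0xfebc0000;	/* hypothetical register BAR */
	vm_size_t regs_sz = 4 * PAGE_SIZE;
	void *regs;

	/* Device memory, so the mapping is made uncacheable. */
	regs = pmap_mapdev(regs_pa, regs_sz);
	/* ... access the registers through "regs" ... */
	pmap_unmapdev(regs, regs_sz);
}
#endif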
9455 
9456 void *
9457 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size)
9458 {
9459 
9460 	return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE,
9461 	    MAPDEV_SETATTR));
9462 }
9463 
9464 void *
9465 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
9466 {
9467 
9468 	return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK,
9469 	    MAPDEV_FLUSHCACHE));
9470 }
9471 
9472 void
9473 pmap_unmapdev(void *p, vm_size_t size)
9474 {
9475 	struct pmap_preinit_mapping *ppim;
9476 	vm_offset_t offset, va;
9477 	int i;
9478 
9479 	va = (vm_offset_t)p;
9480 
9481 	/* If pmap_mapdev() handed out a direct map address, there is nothing to undo. */
9482 	if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high)
9483 		return;
9484 	offset = va & PAGE_MASK;
9485 	size = round_page(offset + size);
9486 	va = trunc_page(va);
9487 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
9488 		ppim = pmap_preinit_mapping + i;
9489 		if (ppim->va == va && ppim->sz == size) {
9490 			if (pmap_initialized)
9491 				return;
9492 			ppim->pa = 0;
9493 			ppim->va = 0;
9494 			ppim->sz = 0;
9495 			ppim->mode = 0;
9496 			if (va + size == virtual_avail)
9497 				virtual_avail = va;
9498 			return;
9499 		}
9500 	}
9501 	if (pmap_initialized) {
9502 		pmap_qremove(va, atop(size));
9503 		kva_free(va, size);
9504 	}
9505 }
9506 
9507 /*
9508  * Tries to demote a 1GB page mapping.
9509  */
9510 static bool
9511 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va, vm_page_t m)
9512 {
9513 	pdp_entry_t newpdpe, oldpdpe;
9514 	pd_entry_t *firstpde, newpde, *pde;
9515 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
9516 	vm_paddr_t pdpgpa;
9517 	vm_page_t pdpg;
9518 
9519 	PG_A = pmap_accessed_bit(pmap);
9520 	PG_M = pmap_modified_bit(pmap);
9521 	PG_V = pmap_valid_bit(pmap);
9522 	PG_RW = pmap_rw_bit(pmap);
9523 
9524 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9525 	oldpdpe = *pdpe;
9526 	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
9527 	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
9528 	if (m == NULL) {
9529 		pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT,
9530 		    VM_ALLOC_WIRED);
9531 		if (pdpg == NULL) {
9532 			CTR2(KTR_PMAP,
9533 			    "pmap_demote_pdpe: failure for va %#lx in pmap %p",
9534 			    va, pmap);
9535 			return (false);
9536 		}
9537 	} else {
9538 		pdpg = m;
9539 		pdpg->pindex = va >> PDPSHIFT;
9540 		pmap_pt_page_count_adj(pmap, 1);
9541 	}
9542 	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
9543 	firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
9544 	newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
9545 	KASSERT((oldpdpe & PG_A) != 0,
9546 	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
9547 	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
9548 	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
9549 	newpde = oldpdpe;
9550 
9551 	/*
9552 	 * Initialize the page directory page.
9553 	 */
9554 	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
9555 		*pde = newpde;
9556 		newpde += NBPDR;
9557 	}
9558 
9559 	/*
9560 	 * Demote the mapping.
9561 	 */
9562 	*pdpe = newpdpe;
9563 
9564 	/*
9565 	 * Invalidate a stale recursive mapping of the page directory page.
9566 	 */
9567 	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
9568 
9569 	counter_u64_add(pmap_pdpe_demotions, 1);
9570 	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
9571 	    " in pmap %p", va, pmap);
9572 	return (true);
9573 }
9574 
9575 /*
9576  * Sets the memory attribute for the specified page.
9577  */
9578 void
9579 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
9580 {
9581 	if (m->md.pat_mode == ma)
9582 		return;
9583 
9584 	m->md.pat_mode = ma;
9585 
9586 	/*
9587 	 * If "m" is a normal page, update its direct mapping.  This update
9588 	 * can be relied upon to perform any cache operations that are
9589 	 * required for data coherence.
9590 	 */
9591 	if ((m->flags & PG_FICTITIOUS) == 0 &&
9592 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
9593 	    m->md.pat_mode))
9594 		panic("memory attribute change on the direct map failed");
9595 }
9596 
9597 void
9598 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma)
9599 {
9600 	int error;
9601 
9602 	if (m->md.pat_mode == ma)
9603 		return;
9604 
9605 	m->md.pat_mode = ma;
9606 
9607 	if ((m->flags & PG_FICTITIOUS) != 0)
9608 		return;
9609 	PMAP_LOCK(kernel_pmap);
9610 	error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
9611 	    PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0);
9612 	PMAP_UNLOCK(kernel_pmap);
9613 	if (error != 0)
9614 		panic("memory attribute change on the direct map failed");
9615 }
9616 
9617 /*
9618  * Changes the specified virtual address range's memory type to that given by
9619  * the parameter "mode".  The specified virtual address range must be
9620  * completely contained within either the direct map or the kernel map.  If
9621  * the virtual address range is contained within the kernel map, then the
9622  * memory type for each of the corresponding ranges of the direct map is also
9623  * changed.  (The corresponding ranges of the direct map are those ranges that
9624  * map the same physical pages as the specified virtual address range.)  These
9625  * changes to the direct map are necessary because Intel describes the
9626  * behavior of their processors as "undefined" if two or more mappings to the
9627  * same physical page have different memory types.
9628  *
9629  * Returns zero if the change completed successfully, and either EINVAL or
9630  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
9631  * of the virtual address range was not mapped, and ENOMEM is returned if
9632  * there was insufficient memory available to complete the change.  In the
9633  * latter case, the memory type may have been changed on some part of the
9634  * virtual address range or the direct map.
9635  */
9636 int
9637 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
9638 {
9639 	int error;
9640 
9641 	PMAP_LOCK(kernel_pmap);
9642 	error = pmap_change_props_locked(va, size, PROT_NONE, mode,
9643 	    MAPDEV_FLUSHCACHE);
9644 	PMAP_UNLOCK(kernel_pmap);
9645 	return (error);
9646 }
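
/*
 * Illustrative sketch (not part of pmap.c): switching an already-mapped KVA
 * range to write-combining with pmap_change_attr() and reporting a failure.
 * The "fb_va" and "fb_size" arguments are hypothetical (e.g., a framebuffer
 * mapping owned by the caller).
 */
#if 0
static int
example_enable_write_combining(vm_offset_t fb_va, vm_size_t fb_size)
{
	int error;

	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
	if (error != 0) {
		/* EINVAL: range not fully mapped; ENOMEM: demotion failed. */
		printf("write-combining not enabled: error %d\n", error);
	}
	return (error);
}
#endif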
9647 
9648 /*
9649  * Changes the specified virtual address range's protections to those
9650  * specified by "prot".  Like pmap_change_attr(), protections for aliases
9651  * in the direct map are updated as well.  Protections on aliasing mappings may
9652  * be a subset of the requested protections; for example, mappings in the direct
9653  * map are never executable.
9654  */
9655 int
9656 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
9657 {
9658 	int error;
9659 
9660 	/* Only supported within the kernel map. */
9661 	if (va < kva_layout.km_low)
9662 		return (EINVAL);
9663 
9664 	PMAP_LOCK(kernel_pmap);
9665 	error = pmap_change_props_locked(va, size, prot, -1,
9666 	    MAPDEV_ASSERTVALID);
9667 	PMAP_UNLOCK(kernel_pmap);
9668 	return (error);
9669 }
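
/*
 * Illustrative sketch (not part of pmap.c): write-protecting a kernel-map
 * range once its contents are final.  The "va" and "len" values come from
 * the caller and are hypothetical; with VM_PROT_READ alone the mapping also
 * loses execute permission, since VM_PROT_EXECUTE is not requested.
 */
#if 0
static int
example_write_protect(vm_offset_t va, vm_size_t len)
{

	return (pmap_change_prot(va, len, VM_PROT_READ));
}
#endif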
9670 
9671 static int
9672 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
9673     int mode, int flags)
9674 {
9675 	vm_offset_t base, offset, tmpva;
9676 	vm_paddr_t pa_start, pa_end, pa_end1;
9677 	pdp_entry_t *pdpe;
9678 	pd_entry_t *pde, pde_bits, pde_mask;
9679 	pt_entry_t *pte, pte_bits, pte_mask;
9680 	int error;
9681 	bool changed;
9682 
9683 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
9684 	base = trunc_page(va);
9685 	offset = va & PAGE_MASK;
9686 	size = round_page(offset + size);
9687 
9688 	/*
9689 	 * Only supported on kernel virtual addresses, including the direct
9690 	 * map but excluding the recursive map.
9691 	 */
9692 	if (base < kva_layout.dmap_low)
9693 		return (EINVAL);
9694 
9695 	/*
9696 	 * Construct our flag sets and masks.  "bits" is the subset of
9697 	 * "mask" that will be set in each modified PTE.
9698 	 *
9699 	 * Mappings in the direct map are never allowed to be executable.
9700 	 */
9701 	pde_bits = pte_bits = 0;
9702 	pde_mask = pte_mask = 0;
9703 	if (mode != -1) {
9704 		pde_bits |= pmap_cache_bits(kernel_pmap, mode, true);
9705 		pde_mask |= X86_PG_PDE_CACHE;
9706 		pte_bits |= pmap_cache_bits(kernel_pmap, mode, false);
9707 		pte_mask |= X86_PG_PTE_CACHE;
9708 	}
9709 	if (prot != VM_PROT_NONE) {
9710 		if ((prot & VM_PROT_WRITE) != 0) {
9711 			pde_bits |= X86_PG_RW;
9712 			pte_bits |= X86_PG_RW;
9713 		}
9714 		if ((prot & VM_PROT_EXECUTE) == 0 ||
9715 		    va < kva_layout.km_low) {
9716 			pde_bits |= pg_nx;
9717 			pte_bits |= pg_nx;
9718 		}
9719 		pde_mask |= X86_PG_RW | pg_nx;
9720 		pte_mask |= X86_PG_RW | pg_nx;
9721 	}
9722 
9723 	/*
9724 	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
9725 	 * into 4KB pages if required.
9726 	 */
9727 	for (tmpva = base; tmpva < base + size; ) {
9728 		pdpe = pmap_pdpe(kernel_pmap, tmpva);
9729 		if (pdpe == NULL || *pdpe == 0) {
9730 			KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
9731 			    ("%s: addr %#lx is not mapped", __func__, tmpva));
9732 			return (EINVAL);
9733 		}
9734 		if (*pdpe & PG_PS) {
9735 			/*
9736 			 * If the current 1GB page already has the required
9737 			 * properties, then we need not demote this page.  Just
9738 			 * increment tmpva to the next 1GB page frame.
9739 			 */
9740 			if ((*pdpe & pde_mask) == pde_bits) {
9741 				tmpva = trunc_1gpage(tmpva) + NBPDP;
9742 				continue;
9743 			}
9744 
9745 			/*
9746 			 * If the current offset aligns with a 1GB page frame
9747 			 * and there is at least 1GB left within the range, then
9748 			 * we need not break down this page into 2MB pages.
9749 			 */
9750 			if ((tmpva & PDPMASK) == 0 &&
9751 			    tmpva + PDPMASK < base + size) {
9752 				tmpva += NBPDP;
9753 				continue;
9754 			}
9755 			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva, NULL))
9756 				return (ENOMEM);
9757 		}
9758 		pde = pmap_pdpe_to_pde(pdpe, tmpva);
9759 		if (*pde == 0) {
9760 			KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
9761 			    ("%s: addr %#lx is not mapped", __func__, tmpva));
9762 			return (EINVAL);
9763 		}
9764 		if (*pde & PG_PS) {
9765 			/*
9766 			 * If the current 2MB page already has the required
9767 			 * properties, then we need not demote this page.  Just
9768 			 * increment tmpva to the next 2MB page frame.
9769 			 */
9770 			if ((*pde & pde_mask) == pde_bits) {
9771 				tmpva = trunc_2mpage(tmpva) + NBPDR;
9772 				continue;
9773 			}
9774 
9775 			/*
9776 			 * If the current offset aligns with a 2MB page frame
9777 			 * and there is at least 2MB left within the range, then
9778 			 * we need not break down this page into 4KB pages.
9779 			 */
9780 			if ((tmpva & PDRMASK) == 0 &&
9781 			    tmpva + PDRMASK < base + size) {
9782 				tmpva += NBPDR;
9783 				continue;
9784 			}
9785 			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
9786 				return (ENOMEM);
9787 		}
9788 		pte = pmap_pde_to_pte(pde, tmpva);
9789 		if (*pte == 0) {
9790 			KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
9791 			    ("%s: addr %#lx is not mapped", __func__, tmpva));
9792 			return (EINVAL);
9793 		}
9794 		tmpva += PAGE_SIZE;
9795 	}
9796 	error = 0;
9797 
9798 	/*
9799 	 * Ok, all the pages exist, so run through them updating their
9800 	 * properties if required.
9801 	 */
9802 	changed = false;
9803 	pa_start = pa_end = 0;
9804 	for (tmpva = base; tmpva < base + size; ) {
9805 		pdpe = pmap_pdpe(kernel_pmap, tmpva);
9806 		if (*pdpe & PG_PS) {
9807 			if ((*pdpe & pde_mask) != pde_bits) {
9808 				pmap_pte_props(pdpe, pde_bits, pde_mask);
9809 				changed = true;
9810 			}
9811 			if (tmpva >= kva_layout.km_low &&
9812 			    (*pdpe & PG_PS_FRAME) < dmaplimit) {
9813 				if (pa_start == pa_end) {
9814 					/* Start physical address run. */
9815 					pa_start = *pdpe & PG_PS_FRAME;
9816 					pa_end = pa_start + NBPDP;
9817 				} else if (pa_end == (*pdpe & PG_PS_FRAME))
9818 					pa_end += NBPDP;
9819 				else {
9820 					/* Run ended, update direct map. */
9821 					error = pmap_change_props_locked(
9822 					    PHYS_TO_DMAP(pa_start),
9823 					    pa_end - pa_start, prot, mode,
9824 					    flags);
9825 					if (error != 0)
9826 						break;
9827 					/* Start physical address run. */
9828 					pa_start = *pdpe & PG_PS_FRAME;
9829 					pa_end = pa_start + NBPDP;
9830 				}
9831 			}
9832 			tmpva = trunc_1gpage(tmpva) + NBPDP;
9833 			continue;
9834 		}
9835 		pde = pmap_pdpe_to_pde(pdpe, tmpva);
9836 		if (*pde & PG_PS) {
9837 			if ((*pde & pde_mask) != pde_bits) {
9838 				pmap_pte_props(pde, pde_bits, pde_mask);
9839 				changed = true;
9840 			}
9841 			if (tmpva >= kva_layout.km_low &&
9842 			    (*pde & PG_PS_FRAME) < dmaplimit) {
9843 				if (pa_start == pa_end) {
9844 					/* Start physical address run. */
9845 					pa_start = *pde & PG_PS_FRAME;
9846 					pa_end = pa_start + NBPDR;
9847 				} else if (pa_end == (*pde & PG_PS_FRAME))
9848 					pa_end += NBPDR;
9849 				else {
9850 					/* Run ended, update direct map. */
9851 					error = pmap_change_props_locked(
9852 					    PHYS_TO_DMAP(pa_start),
9853 					    pa_end - pa_start, prot, mode,
9854 					    flags);
9855 					if (error != 0)
9856 						break;
9857 					/* Start physical address run. */
9858 					pa_start = *pde & PG_PS_FRAME;
9859 					pa_end = pa_start + NBPDR;
9860 				}
9861 			}
9862 			tmpva = trunc_2mpage(tmpva) + NBPDR;
9863 		} else {
9864 			pte = pmap_pde_to_pte(pde, tmpva);
9865 			if ((*pte & pte_mask) != pte_bits) {
9866 				pmap_pte_props(pte, pte_bits, pte_mask);
9867 				changed = true;
9868 			}
9869 			if (tmpva >= kva_layout.km_low &&
9870 			    (*pte & PG_FRAME) < dmaplimit) {
9871 				if (pa_start == pa_end) {
9872 					/* Start physical address run. */
9873 					pa_start = *pte & PG_FRAME;
9874 					pa_end = pa_start + PAGE_SIZE;
9875 				} else if (pa_end == (*pte & PG_FRAME))
9876 					pa_end += PAGE_SIZE;
9877 				else {
9878 					/* Run ended, update direct map. */
9879 					error = pmap_change_props_locked(
9880 					    PHYS_TO_DMAP(pa_start),
9881 					    pa_end - pa_start, prot, mode,
9882 					    flags);
9883 					if (error != 0)
9884 						break;
9885 					/* Start physical address run. */
9886 					pa_start = *pte & PG_FRAME;
9887 					pa_end = pa_start + PAGE_SIZE;
9888 				}
9889 			}
9890 			tmpva += PAGE_SIZE;
9891 		}
9892 	}
9893 	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
9894 		pa_end1 = MIN(pa_end, dmaplimit);
9895 		if (pa_start != pa_end1)
9896 			error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start),
9897 			    pa_end1 - pa_start, prot, mode, flags);
9898 	}
9899 
9900 	/*
9901 	 * Flush CPU caches if required to make sure any data isn't cached that
9902 	 * shouldn't be, etc.
9903 	 */
9904 	if (changed) {
9905 		pmap_invalidate_range(kernel_pmap, base, tmpva);
9906 		if ((flags & MAPDEV_FLUSHCACHE) != 0)
9907 			pmap_invalidate_cache_range(base, tmpva);
9908 	}
9909 	return (error);
9910 }
9911 
9912 /*
9913  * Demotes any mapping within the direct map region that covers more
9914  * than the specified range of physical addresses.  This range's size
9915  * must be a power of two and its starting address must be a multiple
9916  * of its size, which means that any pdp from the mapping is fully
9917  * covered by the range if len > NBPDP.  Since the demotion does not
9918  * change any attributes of the mapping, a TLB invalidation is not
9919  * mandatory.  The caller may, however, request a TLB invalidation.
9920  */
9921 void
9922 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, bool invalidate)
9923 {
9924 	pdp_entry_t *pdpe;
9925 	pd_entry_t *pde;
9926 	vm_offset_t va;
9927 	vm_page_t m, mpte;
9928 	bool changed, rv __diagused;
9929 
9930 	if (len == 0)
9931 		return;
9932 	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
9933 	KASSERT((base & (len - 1)) == 0,
9934 	    ("pmap_demote_DMAP: base is not a multiple of len"));
9935 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "pmap_demote_DMAP");
9936 
9937 	if (len < NBPDP && base < dmaplimit) {
9938 		va = PHYS_TO_DMAP(base);
9939 		changed = false;
9940 
9941 		/*
9942 		 * Assume that it is fine to sleep here.
9943 		 * The only existing caller of pmap_demote_DMAP() is the
9944 		 * x86_mr_split_dmap() function.
9945 		 */
9946 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
9947 		if (len < NBPDR) {
9948 			mpte = vm_page_alloc_noobj(VM_ALLOC_WIRED |
9949 			    VM_ALLOC_WAITOK);
9950 		} else
9951 			mpte = NULL;
9952 
9953 		PMAP_LOCK(kernel_pmap);
9954 		pdpe = pmap_pdpe(kernel_pmap, va);
9955 		if ((*pdpe & X86_PG_V) == 0)
9956 			panic("pmap_demote_DMAP: invalid PDPE");
9957 		if ((*pdpe & PG_PS) != 0) {
9958 			rv = pmap_demote_pdpe(kernel_pmap, pdpe, va, m);
9959 			KASSERT(rv, ("pmap_demote_DMAP: PDPE failed"));
9960 			changed = true;
9961 			m = NULL;
9962 		}
9963 		if (len < NBPDR) {
9964 			pde = pmap_pdpe_to_pde(pdpe, va);
9965 			if ((*pde & X86_PG_V) == 0)
9966 				panic("pmap_demote_DMAP: invalid PDE");
9967 			if ((*pde & PG_PS) != 0) {
9968 				mpte->pindex = pmap_pde_pindex(va);
9969 				pmap_pt_page_count_adj(kernel_pmap, 1);
9970 				rv = pmap_demote_pde_mpte(kernel_pmap, pde, va,
9971 				    NULL, mpte);
9972 				KASSERT(rv, ("pmap_demote_DMAP: PDE failed"));
9973 				changed = true;
9974 				mpte = NULL;
9975 			}
9976 		}
9977 		if (changed && invalidate)
9978 			pmap_invalidate_page(kernel_pmap, va);
9979 		PMAP_UNLOCK(kernel_pmap);
9980 		if (m != NULL) {
9981 			vm_page_unwire_noq(m);
9982 			vm_page_free(m);
9983 		}
9984 		if (mpte != NULL) {
9985 			vm_page_unwire_noq(mpte);
9986 			vm_page_free(mpte);
9987 		}
9988 	}
9989 }
9990 
9991 /*
9992  * Perform the pmap work for mincore(2).  If the page is not both referenced
9993  * and modified by this pmap, its physical address is returned so that the
9994  * caller can find other mappings.
9995  */
9996 int
9997 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
9998 {
9999 	pdp_entry_t *pdpe;
10000 	pd_entry_t *pdep;
10001 	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
10002 	vm_paddr_t pa;
10003 	int val;
10004 
10005 	PG_A = pmap_accessed_bit(pmap);
10006 	PG_M = pmap_modified_bit(pmap);
10007 	PG_V = pmap_valid_bit(pmap);
10008 	PG_RW = pmap_rw_bit(pmap);
10009 
10010 	PMAP_LOCK(pmap);
10011 	pte = 0;
10012 	pa = 0;
10013 	val = 0;
10014 	pdpe = pmap_pdpe(pmap, addr);
10015 	if (pdpe == NULL)
10016 		goto out;
10017 	if ((*pdpe & PG_V) != 0) {
10018 		if ((*pdpe & PG_PS) != 0) {
10019 			pte = *pdpe;
10020 			pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) &
10021 			    PG_FRAME;
10022 			val = MINCORE_PSIND(2);
10023 		} else {
10024 			pdep = pmap_pde(pmap, addr);
10025 			if (pdep != NULL && (*pdep & PG_V) != 0) {
10026 				if ((*pdep & PG_PS) != 0) {
10027 					pte = *pdep;
10028 			/* Compute the physical address of the 4KB page. */
10029 					pa = ((pte & PG_PS_FRAME) | (addr &
10030 					    PDRMASK)) & PG_FRAME;
10031 					val = MINCORE_PSIND(1);
10032 				} else {
10033 					pte = *pmap_pde_to_pte(pdep, addr);
10034 					pa = pte & PG_FRAME;
10035 					val = 0;
10036 				}
10037 			}
10038 		}
10039 	}
10040 	if ((pte & PG_V) != 0) {
10041 		val |= MINCORE_INCORE;
10042 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
10043 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
10044 		if ((pte & PG_A) != 0)
10045 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
10046 	}
10047 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
10048 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
10049 	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
10050 		*pap = pa;
10051 	}
10052 out:
10053 	PMAP_UNLOCK(pmap);
10054 	return (val);
10055 }
10056 
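/*
 * Return the CR3 bits describing whether the cached TLB entries for this
 * pmap may be reused on the current CPU.  CR3_PCID_SAVE is returned when
 * the pmap's PCID is still valid for the CPU's current PCID generation;
 * 0 is returned when a fresh PCID had to be allocated (or for the kernel
 * PCID with PTI enabled), so the TLB is flushed on the %cr3 load.
 */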
10057 static uint64_t
10058 pmap_pcid_alloc(pmap_t pmap, struct pmap_pcid *pcidp)
10059 {
10060 	uint32_t gen, new_gen, pcid_next;
10061 
10062 	CRITICAL_ASSERT(curthread);
10063 	gen = PCPU_GET(pcid_gen);
10064 	if (pcidp->pm_pcid == PMAP_PCID_KERN)
10065 		return (pti ? 0 : CR3_PCID_SAVE);
10066 	if (pcidp->pm_gen == gen)
10067 		return (CR3_PCID_SAVE);
10068 	pcid_next = PCPU_GET(pcid_next);
10069 	KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
10070 	    (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
10071 	    ("cpu %d pcid_next %#x", PCPU_GET(cpuid), pcid_next));
10072 	if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
10073 	    (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
10074 		new_gen = gen + 1;
10075 		if (new_gen == 0)
10076 			new_gen = 1;
10077 		PCPU_SET(pcid_gen, new_gen);
10078 		pcid_next = PMAP_PCID_KERN + 1;
10079 	} else {
10080 		new_gen = gen;
10081 	}
10082 	pcidp->pm_pcid = pcid_next;
10083 	pcidp->pm_gen = new_gen;
10084 	PCPU_SET(pcid_next, pcid_next + 1);
10085 	return (0);
10086 }
10087 
10088 static uint64_t
10089 pmap_pcid_alloc_checked(pmap_t pmap, struct pmap_pcid *pcidp)
10090 {
10091 	uint64_t cached;
10092 
10093 	cached = pmap_pcid_alloc(pmap, pcidp);
10094 	KASSERT(pcidp->pm_pcid < PMAP_PCID_OVERMAX,
10095 	    ("pmap %p cpu %d pcid %#x", pmap, PCPU_GET(cpuid), pcidp->pm_pcid));
10096 	KASSERT(pcidp->pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap,
10097 	    ("non-kernel pmap pmap %p cpu %d pcid %#x",
10098 	    pmap, PCPU_GET(cpuid), pcidp->pm_pcid));
10099 	return (cached);
10100 }
10101 
10102 static void
10103 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap)
10104 {
10105 
10106 	PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ?
10107 	    PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base;
10108 }
10109 
10110 static void
10111 pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
10112 {
10113 	pmap_t old_pmap;
10114 	struct pmap_pcid *pcidp, *old_pcidp;
10115 	uint64_t cached, cr3, kcr3, ucr3;
10116 
10117 	KASSERT((read_rflags() & PSL_I) == 0,
10118 	    ("PCID needs interrupts disabled in pmap_activate_sw()"));
10119 
10120 	/* See the comment in pmap_invalidate_page_pcid(). */
10121 	if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) {
10122 		PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
10123 		old_pmap = PCPU_GET(curpmap);
10124 		MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3);
10125 		old_pcidp = zpcpu_get_cpu(old_pmap->pm_pcidp, cpuid);
10126 		old_pcidp->pm_gen = 0;
10127 	}
10128 
10129 	pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid);
10130 	cached = pmap_pcid_alloc_checked(pmap, pcidp);
10131 	cr3 = rcr3();
10132 	if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
10133 		load_cr3(pmap->pm_cr3 | pcidp->pm_pcid);
10134 	PCPU_SET(curpmap, pmap);
10135 	kcr3 = pmap->pm_cr3 | pcidp->pm_pcid;
10136 	ucr3 = pmap->pm_ucr3 | pcidp->pm_pcid | PMAP_PCID_USER_PT;
10137 
10138 	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3)
10139 		PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
10140 
10141 	PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
10142 	PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
10143 	if (cached)
10144 		counter_u64_add(pcid_save_cnt, 1);
10145 
10146 	pmap_activate_sw_pti_post(td, pmap);
10147 }
10148 
10149 static void
10150 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap,
10151     u_int cpuid)
10152 {
10153 	struct pmap_pcid *pcidp;
10154 	uint64_t cached, cr3;
10155 
10156 	KASSERT((read_rflags() & PSL_I) == 0,
10157 	    ("PCID needs interrupts disabled in pmap_activate_sw()"));
10158 
10159 	pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid);
10160 	cached = pmap_pcid_alloc_checked(pmap, pcidp);
10161 	cr3 = rcr3();
10162 	if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
10163 		load_cr3(pmap->pm_cr3 | pcidp->pm_pcid | cached);
10164 	PCPU_SET(curpmap, pmap);
10165 	if (cached)
10166 		counter_u64_add(pcid_save_cnt, 1);
10167 }
10168 
10169 static void
10170 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap,
10171     u_int cpuid __unused)
10172 {
10173 
10174 	load_cr3(pmap->pm_cr3);
10175 	PCPU_SET(curpmap, pmap);
10176 }
10177 
10178 static void
10179 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap,
10180     u_int cpuid __unused)
10181 {
10182 
10183 	pmap_activate_sw_nopcid_nopti(td, pmap, cpuid);
10184 	PCPU_SET(kcr3, pmap->pm_cr3);
10185 	PCPU_SET(ucr3, pmap->pm_ucr3);
10186 	pmap_activate_sw_pti_post(td, pmap);
10187 }
10188 
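/*
 * Resolved once at boot: pick the pmap activation routine that matches
 * the PCID and PTI configuration, avoiding per-switch branches on these
 * invariants in pmap_activate_sw().
 */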
10189 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t,
10190     u_int))
10191 {
10192 
10193 	if (pmap_pcid_enabled && pti)
10194 		return (pmap_activate_sw_pcid_pti);
10195 	else if (pmap_pcid_enabled && !pti)
10196 		return (pmap_activate_sw_pcid_nopti);
10197 	else if (!pmap_pcid_enabled && pti)
10198 		return (pmap_activate_sw_nopcid_pti);
10199 	else /* if (!pmap_pcid_enabled && !pti) */
10200 		return (pmap_activate_sw_nopcid_nopti);
10201 }
10202 
10203 void
10204 pmap_activate_sw(struct thread *td)
10205 {
10206 	pmap_t oldpmap, pmap;
10207 	u_int cpuid;
10208 
10209 	oldpmap = PCPU_GET(curpmap);
10210 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
10211 	if (oldpmap == pmap) {
10212 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
10213 			mfence();
10214 		return;
10215 	}
10216 	cpuid = PCPU_GET(cpuid);
10217 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
10218 	pmap_activate_sw_mode(td, pmap, cpuid);
10219 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
10220 }
10221 
10222 void
10223 pmap_activate(struct thread *td)
10224 {
10225 	/*
10226 	 * invltlb_{invpcid,}_pcid_handler() is used to handle an
10227 	 * invalidate_all IPI, which checks for curpmap ==
10228 	 * smp_tlb_pmap.  The below sequence of operations has a
10229 	 * window where %CR3 is loaded with the new pmap's PML4
10230 	 * address, but the curpmap value has not yet been updated.
10231 	 * This causes the invltlb IPI handler, which is called
10232 	 * between the updates, to execute as a NOP, which leaves
10233 	 * stale TLB entries.
10234 	 *
10235 	 * Note that the most common use of pmap_activate_sw(), from
10236 	 * a context switch, is immune to this race, because
10237 	 * interrupts are disabled (while the thread lock is owned),
10238 	 * so the IPI is delayed until after curpmap is updated.  Protect
10239 	 * other callers in a similar way, by disabling interrupts
10240 	 * around the %cr3 register reload and curpmap assignment.
10241 	 */
10242 	spinlock_enter();
10243 	pmap_activate_sw(td);
10244 	spinlock_exit();
10245 }
10246 
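/*
 * Record the given pmap as active on the current CPU during boot.  Unlike
 * pmap_activate(), no %cr3 reload is performed here; only pm_active,
 * curpmap, and the per-CPU kcr3/ucr3 values are initialized.
 */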
10247 void
10248 pmap_activate_boot(pmap_t pmap)
10249 {
10250 	uint64_t kcr3;
10251 	u_int cpuid;
10252 
10253 	/*
10254 	 * The kernel_pmap must never be deactivated; we ensure that
10255 	 * by never activating it at all.
10256 	 */
10257 	MPASS(pmap != kernel_pmap);
10258 
10259 	cpuid = PCPU_GET(cpuid);
10260 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
10261 	PCPU_SET(curpmap, pmap);
10262 	if (pti) {
10263 		kcr3 = pmap->pm_cr3;
10264 		if (pmap_pcid_enabled)
10265 			kcr3 |= pmap_get_pcid(pmap) | CR3_PCID_SAVE;
10266 	} else {
10267 		kcr3 = PMAP_NO_CR3;
10268 	}
10269 	PCPU_SET(kcr3, kcr3);
10270 	PCPU_SET(ucr3, PMAP_NO_CR3);
10271 }
10272 
10273 void
10274 pmap_active_cpus(pmap_t pmap, cpuset_t *res)
10275 {
10276 	*res = pmap->pm_active;
10277 }
10278 
10279 void
10280 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
10281 {
10282 }
10283 
10284 /*
10285  *	Increase the starting virtual address of the given mapping if a
10286  *	different alignment might result in more superpage mappings.
10287  */
10288 void
10289 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
10290     vm_offset_t *addr, vm_size_t size)
10291 {
10292 	vm_offset_t superpage_offset;
10293 
10294 	if (size < NBPDR)
10295 		return;
10296 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
10297 		offset += ptoa(object->pg_color);
10298 	superpage_offset = offset & PDRMASK;
10299 	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
10300 	    (*addr & PDRMASK) == superpage_offset)
10301 		return;
10302 	if ((*addr & PDRMASK) < superpage_offset)
10303 		*addr = (*addr & ~PDRMASK) + superpage_offset;
10304 	else
10305 		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
10306 }
10307 
10308 #ifdef INVARIANTS
10309 static unsigned long num_dirty_emulations;
10310 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
10311 	     &num_dirty_emulations, 0, NULL);
10312 
10313 static unsigned long num_accessed_emulations;
10314 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
10315 	     &num_accessed_emulations, 0, NULL);
10316 
10317 static unsigned long num_superpage_accessed_emulations;
10318 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
10319 	     &num_superpage_accessed_emulations, 0, NULL);
10320 
10321 static unsigned long ad_emulation_superpage_promotions;
10322 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
10323 	     &ad_emulation_superpage_promotions, 0, NULL);
10324 #endif	/* INVARIANTS */
10325 
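/*
 * Emulate an accessed or dirty bit update for pmaps that use software
 * A/D bit emulation (e.g. EPT without A/D bit support).  Returns 0 when
 * the fault was resolved by setting PG_A/PG_M in the existing mapping,
 * or -1 when it cannot be handled here.
 */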
10326 int
10327 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
10328 {
10329 	int rv;
10330 	struct rwlock *lock;
10331 #if VM_NRESERVLEVEL > 0
10332 	vm_page_t m, mpte;
10333 #endif
10334 	pd_entry_t *pde;
10335 	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
10336 
10337 	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
10338 	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
10339 
10340 	if (!pmap_emulate_ad_bits(pmap))
10341 		return (-1);
10342 
10343 	PG_A = pmap_accessed_bit(pmap);
10344 	PG_M = pmap_modified_bit(pmap);
10345 	PG_V = pmap_valid_bit(pmap);
10346 	PG_RW = pmap_rw_bit(pmap);
10347 
10348 	rv = -1;
10349 	lock = NULL;
10350 	PMAP_LOCK(pmap);
10351 
10352 	pde = pmap_pde(pmap, va);
10353 	if (pde == NULL || (*pde & PG_V) == 0)
10354 		goto done;
10355 
10356 	if ((*pde & PG_PS) != 0) {
10357 		if (ftype == VM_PROT_READ) {
10358 #ifdef INVARIANTS
10359 			atomic_add_long(&num_superpage_accessed_emulations, 1);
10360 #endif
10361 			*pde |= PG_A;
10362 			rv = 0;
10363 		}
10364 		goto done;
10365 	}
10366 
10367 	pte = pmap_pde_to_pte(pde, va);
10368 	if ((*pte & PG_V) == 0)
10369 		goto done;
10370 
10371 	if (ftype == VM_PROT_WRITE) {
10372 		if ((*pte & PG_RW) == 0)
10373 			goto done;
10374 		/*
10375 		 * Set the modified and accessed bits simultaneously.
10376 		 *
10377 		 * Intel EPT PTEs that do software emulation of A/D bits map
10378 		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
10379 		 * An EPT misconfiguration is triggered if the PTE is writable
10380 		 * but not readable (WR=10). This is avoided by setting PG_A
10381 		 * and PG_M simultaneously.
10382 		 */
10383 		*pte |= PG_M | PG_A;
10384 	} else {
10385 		*pte |= PG_A;
10386 	}
10387 
10388 #if VM_NRESERVLEVEL > 0
10389 	/* try to promote the mapping */
10390 	if (va < VM_MAXUSER_ADDRESS)
10391 		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
10392 	else
10393 		mpte = NULL;
10394 
10395 	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
10396 
10397 	if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
10398 	    (m->flags & PG_FICTITIOUS) == 0 &&
10399 	    vm_reserv_level_iffullpop(m) == 0 &&
10400 	    pmap_promote_pde(pmap, pde, va, mpte, &lock)) {
10401 #ifdef INVARIANTS
10402 		atomic_add_long(&ad_emulation_superpage_promotions, 1);
10403 #endif
10404 	}
10405 #endif
10406 
10407 #ifdef INVARIANTS
10408 	if (ftype == VM_PROT_WRITE)
10409 		atomic_add_long(&num_dirty_emulations, 1);
10410 	else
10411 		atomic_add_long(&num_accessed_emulations, 1);
10412 #endif
10413 	rv = 0;		/* success */
10414 done:
10415 	if (lock != NULL)
10416 		rw_wunlock(lock);
10417 	PMAP_UNLOCK(pmap);
10418 	return (rv);
10419 }
10420 
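/*
 * Copy the page-table entries along the walk of va into ptr[], from the
 * PML4E downward.  The walk stops after recording an invalid or leaf
 * (PG_PS) entry; *num is set to the number of entries returned.
 */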
10421 void
10422 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
10423 {
10424 	pml4_entry_t *pml4;
10425 	pdp_entry_t *pdp;
10426 	pd_entry_t *pde;
10427 	pt_entry_t *pte, PG_V;
10428 	int idx;
10429 
10430 	idx = 0;
10431 	PG_V = pmap_valid_bit(pmap);
10432 	PMAP_LOCK(pmap);
10433 
10434 	pml4 = pmap_pml4e(pmap, va);
10435 	if (pml4 == NULL)
10436 		goto done;
10437 	ptr[idx++] = *pml4;
10438 	if ((*pml4 & PG_V) == 0)
10439 		goto done;
10440 
10441 	pdp = pmap_pml4e_to_pdpe(pml4, va);
10442 	ptr[idx++] = *pdp;
10443 	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
10444 		goto done;
10445 
10446 	pde = pmap_pdpe_to_pde(pdp, va);
10447 	ptr[idx++] = *pde;
10448 	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
10449 		goto done;
10450 
10451 	pte = pmap_pde_to_pte(pde, va);
10452 	ptr[idx++] = *pte;
10453 
10454 done:
10455 	PMAP_UNLOCK(pmap);
10456 	*num = idx;
10457 }
10458 
10459 /**
10460  * Get the kernel virtual address of a set of physical pages.  If some of
10461  * the pages are not covered by the DMAP, set up transient mappings that
10462  * must be removed by a later call to pmap_unmap_io_transient.
10463  *
10464  * \param page        The pages for which the caller wishes to obtain
10465  *                    kernel virtual addresses.
10466  * \param vaddr       On return contains the kernel virtual memory address
10467  *                    of the pages passed in the page parameter.
10468  * \param count       Number of pages passed in.
10469  * \param can_fault   true if the thread using the mapped pages can take
10470  *                    page faults, false otherwise.
10471  *
10472  * \returns true if the caller must call pmap_unmap_io_transient when
10473  *          finished or false otherwise.
10474  *
10475  */
10476 bool
10477 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
10478     bool can_fault)
10479 {
10480 	vm_paddr_t paddr;
10481 	bool needs_mapping;
10482 	int error __unused, i;
10483 
10484 	/*
10485 	 * Allocate any KVA space that we need, this is done in a separate
10486 	 * loop to prevent calling vmem_alloc while pinned.
10487 	 */
10488 	needs_mapping = false;
10489 	for (i = 0; i < count; i++) {
10490 		paddr = VM_PAGE_TO_PHYS(page[i]);
10491 		if (__predict_false(paddr >= dmaplimit)) {
10492 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
10493 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
10494 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
10495 			needs_mapping = true;
10496 		} else {
10497 			vaddr[i] = PHYS_TO_DMAP(paddr);
10498 		}
10499 	}
10500 
10501 	/* Exit early if everything is covered by the DMAP */
10502 	if (!needs_mapping)
10503 		return (false);
10504 
10505 	/*
10506 	 * NB:  The sequence of updating a page table followed by accesses
10507 	 * to the corresponding pages used in the !DMAP case is subject to
10508 	 * the situation described in the "AMD64 Architecture Programmer's
10509 	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
10510 	 * Coherency Considerations".  Therefore, issuing the INVLPG right
10511 	 * after modifying the PTE bits is crucial.
10512 	 */
10513 	if (!can_fault)
10514 		sched_pin();
10515 	for (i = 0; i < count; i++) {
10516 		paddr = VM_PAGE_TO_PHYS(page[i]);
10517 		if (paddr >= dmaplimit) {
10518 			if (can_fault) {
10519 				/*
10520 				 * Slow path: since page faults may occur
10521 				 * while the mappings are active, do not pin
10522 				 * the thread to the CPU; instead add a
10523 				 * global mapping visible to all CPUs.
10524 				 */
10525 				pmap_qenter(vaddr[i], &page[i], 1);
10526 			} else {
10527 				pmap_kenter_attr(vaddr[i], paddr,
10528 				    page[i]->md.pat_mode);
10529 				pmap_invlpg(kernel_pmap, vaddr[i]);
10530 			}
10531 		}
10532 	}
10533 
10534 	return (needs_mapping);
10535 }
10536 
10537 void
10538 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
10539     bool can_fault)
10540 {
10541 	vm_paddr_t paddr;
10542 	int i;
10543 
10544 	if (!can_fault)
10545 		sched_unpin();
10546 	for (i = 0; i < count; i++) {
10547 		paddr = VM_PAGE_TO_PHYS(page[i]);
10548 		if (paddr >= dmaplimit) {
10549 			if (can_fault)
10550 				pmap_qremove(vaddr[i], 1);
10551 			vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
10552 		}
10553 	}
10554 }
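/*
 * Illustrative usage of the transient I/O mapping pair above (the caller,
 * array names, and page count are hypothetical, not taken from this file):
 *
 *	vm_offset_t va[npages];
 *	bool mapped;
 *
 *	mapped = pmap_map_io_transient(pages, va, npages, false);
 *	(access the pages through va[] while the thread stays pinned)
 *	if (mapped)
 *		pmap_unmap_io_transient(pages, va, npages, false);
 */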
10555 
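/*
 * Map a single page for short-term access by the current thread.  Pages
 * covered by the direct map are returned directly; otherwise the page is
 * temporarily entered at the global qframe, which is serialized by the
 * qframe_mtx spin mutex until pmap_quick_remove_page() releases it.
 */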
10556 vm_offset_t
10557 pmap_quick_enter_page(vm_page_t m)
10558 {
10559 	vm_paddr_t paddr;
10560 
10561 	paddr = VM_PAGE_TO_PHYS(m);
10562 	if (paddr < dmaplimit)
10563 		return (PHYS_TO_DMAP(paddr));
10564 	mtx_lock_spin(&qframe_mtx);
10565 	KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
10566 
10567 	/*
10568 	 * Since qframe is exclusively mapped by us, and we do not set
10569 	 * PG_G, we can use INVLPG here.
10570 	 */
10571 	invlpg(qframe);
10572 
10573 	pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
10574 	    X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, false));
10575 	return (qframe);
10576 }
10577 
10578 void
10579 pmap_quick_remove_page(vm_offset_t addr)
10580 {
10581 
10582 	if (addr != qframe)
10583 		return;
10584 	pte_store(vtopte(qframe), 0);
10585 	mtx_unlock_spin(&qframe_mtx);
10586 }
10587 
10588 /*
10589  * Pdp pages from the large map are managed differently from either
10590  * kernel or user page table pages.  They are permanently allocated at
10591  * initialization time, and their reference count is permanently set to
10592  * zero.  The pml4 entries pointing to those pages are copied into
10593  * each allocated pmap.
10594  *
10595  * In contrast, pd and pt pages are managed like user page table
10596  * pages.  They are dynamically allocated, and their reference count
10597  * represents the number of valid entries within the page.
10598  */
10599 static vm_page_t
10600 pmap_large_map_getptp_unlocked(void)
10601 {
10602 	return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO));
10603 }
10604 
10605 static vm_page_t
10606 pmap_large_map_getptp(void)
10607 {
10608 	vm_page_t m;
10609 
10610 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
10611 	m = pmap_large_map_getptp_unlocked();
10612 	if (m == NULL) {
10613 		PMAP_UNLOCK(kernel_pmap);
10614 		vm_wait(NULL);
10615 		PMAP_LOCK(kernel_pmap);
10616 		/* Callers retry. */
10617 	}
10618 	return (m);
10619 }
10620 
10621 static pdp_entry_t *
10622 pmap_large_map_pdpe(vm_offset_t va)
10623 {
10624 	pml4_entry_t *pml4;
10625 	vm_pindex_t pml4_idx;
10626 	vm_paddr_t mphys;
10627 
10628 	KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low +
10629 	    (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va));
10630 	if (la57) {
10631 		pml4 = pmap_pml4e(kernel_pmap, va);
10632 		mphys = *pml4 & PG_FRAME;
10633 	} else {
10634 		pml4_idx = pmap_pml4e_index(va);
10635 
10636 		KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
10637 		    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx "
10638 		    "LMSPML4I %#jx lm_ents %d",
10639 		    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
10640 		KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
10641 		    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
10642 		    "LMSPML4I %#jx lm_ents %d",
10643 		    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
10644 		mphys = kernel_pml4[pml4_idx] & PG_FRAME;
10645 	}
10646 	return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
10647 }
10648 
10649 static pd_entry_t *
10650 pmap_large_map_pde(vm_offset_t va)
10651 {
10652 	pdp_entry_t *pdpe;
10653 	vm_page_t m;
10654 	vm_paddr_t mphys;
10655 
10656 retry:
10657 	pdpe = pmap_large_map_pdpe(va);
10658 	if (*pdpe == 0) {
10659 		m = pmap_large_map_getptp();
10660 		if (m == NULL)
10661 			goto retry;
10662 		mphys = VM_PAGE_TO_PHYS(m);
10663 		*pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
10664 	} else {
10665 		MPASS((*pdpe & X86_PG_PS) == 0);
10666 		mphys = *pdpe & PG_FRAME;
10667 	}
10668 	return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va));
10669 }
10670 
10671 static pt_entry_t *
10672 pmap_large_map_pte(vm_offset_t va)
10673 {
10674 	pd_entry_t *pde;
10675 	vm_page_t m;
10676 	vm_paddr_t mphys;
10677 
10678 retry:
10679 	pde = pmap_large_map_pde(va);
10680 	if (*pde == 0) {
10681 		m = pmap_large_map_getptp();
10682 		if (m == NULL)
10683 			goto retry;
10684 		mphys = VM_PAGE_TO_PHYS(m);
10685 		*pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
10686 		PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++;
10687 	} else {
10688 		MPASS((*pde & X86_PG_PS) == 0);
10689 		mphys = *pde & PG_FRAME;
10690 	}
10691 	return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va));
10692 }
10693 
10694 static vm_paddr_t
10695 pmap_large_map_kextract(vm_offset_t va)
10696 {
10697 	pdp_entry_t *pdpe, pdp;
10698 	pd_entry_t *pde, pd;
10699 	pt_entry_t *pte, pt;
10700 
10701 	KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va),
10702 	    ("not largemap range %#lx", (u_long)va));
10703 	pdpe = pmap_large_map_pdpe(va);
10704 	pdp = *pdpe;
10705 	KASSERT((pdp & X86_PG_V) != 0,
10706 	    ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
10707 	    (u_long)pdpe, pdp));
10708 	if ((pdp & X86_PG_PS) != 0) {
10709 		KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
10710 		    ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
10711 		    (u_long)pdpe, pdp));
10712 		return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK));
10713 	}
10714 	pde = pmap_pdpe_to_pde(pdpe, va);
10715 	pd = *pde;
10716 	KASSERT((pd & X86_PG_V) != 0,
10717 	    ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd));
10718 	if ((pd & X86_PG_PS) != 0)
10719 		return ((pd & PG_PS_FRAME) | (va & PDRMASK));
10720 	pte = pmap_pde_to_pte(pde, va);
10721 	pt = *pte;
10722 	KASSERT((pt & X86_PG_V) != 0,
10723 	    ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt));
10724 	return ((pt & PG_FRAME) | (va & PAGE_MASK));
10725 }
10726 
10727 static int
10728 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
10729     vmem_addr_t *vmem_res)
10730 {
10731 
10732 	/*
10733 	 * Large mappings are all but static.  Consequently, there
10734 	 * is no point in waiting for an earlier allocation to be
10735 	 * freed.
10736 	 */
10737 	return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
10738 	    VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res));
10739 }
10740 
10741 int
10742 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
10743     vm_memattr_t mattr)
10744 {
10745 	pdp_entry_t *pdpe;
10746 	pd_entry_t *pde;
10747 	pt_entry_t *pte;
10748 	vm_offset_t va, inc;
10749 	vmem_addr_t vmem_res;
10750 	vm_paddr_t pa;
10751 	int error;
10752 
10753 	if (len == 0 || spa + len < spa)
10754 		return (EINVAL);
10755 
10756 	/* See if DMAP can serve. */
10757 	if (spa + len <= dmaplimit) {
10758 		va = PHYS_TO_DMAP(spa);
10759 		*addr = (void *)va;
10760 		return (pmap_change_attr(va, len, mattr));
10761 	}
10762 
10763 	/*
10764 	 * No, allocate KVA.  Fit the address with best possible
10765 	 * alignment for superpages.  Fall back to worse align if
10766 	 * failed.
10767 	 */
10768 	error = ENOMEM;
10769 	if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
10770 	    NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
10771 		error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
10772 		    &vmem_res);
10773 	if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
10774 	    NBPDR) + NBPDR)
10775 		error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
10776 		    &vmem_res);
10777 	if (error != 0)
10778 		error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
10779 	if (error != 0)
10780 		return (error);
10781 
10782 	/*
10783 	 * Fill pagetable.  PG_M is not pre-set, we scan modified bits
10784 	 * in the pagetable to minimize flushing.  No need to
10785 	 * invalidate TLB, since we only update invalid entries.
10786 	 */
10787 	PMAP_LOCK(kernel_pmap);
10788 	for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc,
10789 	    len -= inc) {
10790 		if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP &&
10791 		    (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) {
10792 			pdpe = pmap_large_map_pdpe(va);
10793 			MPASS(*pdpe == 0);
10794 			*pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW |
10795 			    X86_PG_V | X86_PG_A | pg_nx |
10796 			    pmap_cache_bits(kernel_pmap, mattr, true);
10797 			inc = NBPDP;
10798 		} else if (len >= NBPDR && (pa & PDRMASK) == 0 &&
10799 		    (va & PDRMASK) == 0) {
10800 			pde = pmap_large_map_pde(va);
10801 			MPASS(*pde == 0);
10802 			*pde = pa | pg_g | X86_PG_PS | X86_PG_RW |
10803 			    X86_PG_V | X86_PG_A | pg_nx |
10804 			    pmap_cache_bits(kernel_pmap, mattr, true);
10805 			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->
10806 			    ref_count++;
10807 			inc = NBPDR;
10808 		} else {
10809 			pte = pmap_large_map_pte(va);
10810 			MPASS(*pte == 0);
10811 			*pte = pa | pg_g | X86_PG_RW | X86_PG_V |
10812 			    X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap,
10813 			    mattr, false);
10814 			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->
10815 			    ref_count++;
10816 			inc = PAGE_SIZE;
10817 		}
10818 	}
10819 	PMAP_UNLOCK(kernel_pmap);
10820 	MPASS(len == 0);
10821 
10822 	*addr = (void *)vmem_res;
10823 	return (0);
10824 }
10825 
10826 void
10827 pmap_large_unmap(void *svaa, vm_size_t len)
10828 {
10829 	vm_offset_t sva, va;
10830 	vm_size_t inc;
10831 	pdp_entry_t *pdpe, pdp;
10832 	pd_entry_t *pde, pd;
10833 	pt_entry_t *pte;
10834 	vm_page_t m;
10835 	struct spglist spgf;
10836 
10837 	sva = (vm_offset_t)svaa;
10838 	if (len == 0 || sva + len < sva || (sva >= kva_layout.dmap_low &&
10839 	    sva + len < kva_layout.dmap_high))
10840 		return;
10841 
10842 	SLIST_INIT(&spgf);
10843 	KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) &&
10844 	    PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1),
10845 	    ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len));
10846 	PMAP_LOCK(kernel_pmap);
10847 	for (va = sva; va < sva + len; va += inc) {
10848 		pdpe = pmap_large_map_pdpe(va);
10849 		pdp = *pdpe;
10850 		KASSERT((pdp & X86_PG_V) != 0,
10851 		    ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
10852 		    (u_long)pdpe, pdp));
10853 		if ((pdp & X86_PG_PS) != 0) {
10854 			KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
10855 			    ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
10856 			    (u_long)pdpe, pdp));
10857 			KASSERT((va & PDPMASK) == 0,
10858 			    ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va,
10859 			    (u_long)pdpe, pdp));
10860 			KASSERT(va + NBPDP <= sva + len,
10861 			    ("unmap covers partial 1GB page, sva %#lx va %#lx "
10862 			    "pdpe %#lx pdp %#lx len %#lx", sva, va,
10863 			    (u_long)pdpe, pdp, len));
10864 			*pdpe = 0;
10865 			inc = NBPDP;
10866 			continue;
10867 		}
10868 		pde = pmap_pdpe_to_pde(pdpe, va);
10869 		pd = *pde;
10870 		KASSERT((pd & X86_PG_V) != 0,
10871 		    ("invalid pd va %#lx pde %#lx pd %#lx", va,
10872 		    (u_long)pde, pd));
10873 		if ((pd & X86_PG_PS) != 0) {
10874 			KASSERT((va & PDRMASK) == 0,
10875 			    ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va,
10876 			    (u_long)pde, pd));
10877 			KASSERT(va + NBPDR <= sva + len,
10878 			    ("unmap covers partial 2MB page, sva %#lx va %#lx "
10879 			    "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde,
10880 			    pd, len));
10881 			pde_store(pde, 0);
10882 			inc = NBPDR;
10883 			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
10884 			m->ref_count--;
10885 			if (m->ref_count == 0) {
10886 				*pdpe = 0;
10887 				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
10888 			}
10889 			continue;
10890 		}
10891 		pte = pmap_pde_to_pte(pde, va);
10892 		KASSERT((*pte & X86_PG_V) != 0,
10893 		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
10894 		    (u_long)pte, *pte));
10895 		pte_clear(pte);
10896 		inc = PAGE_SIZE;
10897 		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte));
10898 		m->ref_count--;
10899 		if (m->ref_count == 0) {
10900 			*pde = 0;
10901 			SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
10902 			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
10903 			m->ref_count--;
10904 			if (m->ref_count == 0) {
10905 				*pdpe = 0;
10906 				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
10907 			}
10908 		}
10909 	}
10910 	pmap_invalidate_range(kernel_pmap, sva, sva + len);
10911 	PMAP_UNLOCK(kernel_pmap);
10912 	vm_page_free_pages_toq(&spgf, false);
10913 	vmem_free(large_vmem, sva, len);
10914 }
10915 
10916 static void
10917 pmap_large_map_wb_fence_mfence(void)
10918 {
10919 
10920 	mfence();
10921 }
10922 
10923 static void
10924 pmap_large_map_wb_fence_atomic(void)
10925 {
10926 
10927 	atomic_thread_fence_seq_cst();
10928 }
10929 
10930 static void
10931 pmap_large_map_wb_fence_nop(void)
10932 {
10933 }
10934 
10935 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void))
10936 {
10937 
10938 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
10939 		return (pmap_large_map_wb_fence_mfence);
10940 	else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
10941 	    CPUID_STDEXT_CLFLUSHOPT)) == 0)
10942 		return (pmap_large_map_wb_fence_atomic);
10943 	else
10944 		/* clflush is strongly enough ordered */
10945 		return (pmap_large_map_wb_fence_nop);
10946 }
10947 
10948 static void
10949 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len)
10950 {
10951 
10952 	for (; len > 0; len -= cpu_clflush_line_size,
10953 	    va += cpu_clflush_line_size)
10954 		clwb(va);
10955 }
10956 
10957 static void
10958 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len)
10959 {
10960 
10961 	for (; len > 0; len -= cpu_clflush_line_size,
10962 	    va += cpu_clflush_line_size)
10963 		clflushopt(va);
10964 }
10965 
10966 static void
10967 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len)
10968 {
10969 
10970 	for (; len > 0; len -= cpu_clflush_line_size,
10971 	    va += cpu_clflush_line_size)
10972 		clflush(va);
10973 }
10974 
10975 static void
10976 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused)
10977 {
10978 }
10979 
10980 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t))
10981 {
10982 
10983 	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0)
10984 		return (pmap_large_map_flush_range_clwb);
10985 	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0)
10986 		return (pmap_large_map_flush_range_clflushopt);
10987 	else if ((cpu_feature & CPUID_CLFSH) != 0)
10988 		return (pmap_large_map_flush_range_clflush);
10989 	else
10990 		return (pmap_large_map_flush_range_nop);
10991 }
10992 
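/*
 * Write back only the parts of the range whose page-table entries have
 * been dirtied (PG_M set) since the previous write-back.  PG_AVAIL1 marks
 * an entry whose flush is in progress, so that a concurrent caller either
 * waits for that flush or repeats it itself.
 */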
10993 static void
10994 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva)
10995 {
10996 	volatile u_long *pe;
10997 	u_long p;
10998 	vm_offset_t va;
10999 	vm_size_t inc;
11000 	bool seen_other;
11001 
11002 	for (va = sva; va < eva; va += inc) {
11003 		inc = 0;
11004 		if ((amd_feature & AMDID_PAGE1GB) != 0) {
11005 			pe = (volatile u_long *)pmap_large_map_pdpe(va);
11006 			p = *pe;
11007 			if ((p & X86_PG_PS) != 0)
11008 				inc = NBPDP;
11009 		}
11010 		if (inc == 0) {
11011 			pe = (volatile u_long *)pmap_large_map_pde(va);
11012 			p = *pe;
11013 			if ((p & X86_PG_PS) != 0)
11014 				inc = NBPDR;
11015 		}
11016 		if (inc == 0) {
11017 			pe = (volatile u_long *)pmap_large_map_pte(va);
11018 			p = *pe;
11019 			inc = PAGE_SIZE;
11020 		}
11021 		seen_other = false;
11022 		for (;;) {
11023 			if ((p & X86_PG_AVAIL1) != 0) {
11024 				/*
11025 				 * Spin-wait for the end of a parallel
11026 				 * write-back.
11027 				 */
11028 				cpu_spinwait();
11029 				p = *pe;
11030 
11031 				/*
11032 				 * If we saw other write-back
11033 				 * occurring, we cannot rely on PG_M to
11034 				 * indicate state of the cache.  The
11035 				 * PG_M bit is cleared before the
11036 				 * flush to avoid ignoring new writes,
11037 				 * and writes which are relevant for
11038 				 * us might happen after.
11039 				 */
11040 				seen_other = true;
11041 				continue;
11042 			}
11043 
11044 			if ((p & X86_PG_M) != 0 || seen_other) {
11045 				if (!atomic_fcmpset_long(pe, &p,
11046 				    (p & ~X86_PG_M) | X86_PG_AVAIL1))
11047 					/*
11048 					 * If we saw PG_M without
11049 					 * PG_AVAIL1, and then on the
11050 					 * next attempt we do not
11051 					 * observe either PG_M or
11052 					 * PG_AVAIL1, the other
11053 					 * write-back started after us
11054 					 * and finished before us.  We
11055 					 * can rely on it doing our
11056 					 * work.
11057 					 */
11058 					continue;
11059 				pmap_large_map_flush_range(va, inc);
11060 				atomic_clear_long(pe, X86_PG_AVAIL1);
11061 			}
11062 			break;
11063 		}
11064 		maybe_yield();
11065 	}
11066 }
11067 
11068 /*
11069  * Write-back cache lines for the given address range.
11070  *
11071  * Must be called only on the range or sub-range returned from
11072  * pmap_large_map().  Must not be called on the coalesced ranges.
11073  *
11074  * Does nothing on CPUs that support none of the CLWB, CLFLUSHOPT, or
11075  * CLFLUSH instructions.
11076  */
11077 void
11078 pmap_large_map_wb(void *svap, vm_size_t len)
11079 {
11080 	vm_offset_t eva, sva;
11081 
11082 	sva = (vm_offset_t)svap;
11083 	eva = sva + len;
11084 	pmap_large_map_wb_fence();
11085 	if (sva >= kva_layout.dmap_low && eva < kva_layout.dmap_high) {
11086 		pmap_large_map_flush_range(sva, len);
11087 	} else {
11088 		KASSERT(sva >= kva_layout.lm_low && eva < kva_layout.lm_high,
11089 		    ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
11090 		pmap_large_map_wb_large(sva, eva);
11091 	}
11092 	pmap_large_map_wb_fence();
11093 }
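/*
 * Illustrative sketch of how a consumer might drive the large map
 * interface (the physical range and length names are hypothetical):
 *
 *	void *va;
 *	int error;
 *
 *	error = pmap_large_map(phys_base, region_len, &va,
 *	    VM_MEMATTR_WRITE_BACK);
 *	if (error == 0) {
 *		(write to the region through va)
 *		pmap_large_map_wb(va, region_len);
 *		pmap_large_unmap(va, region_len);
 *	}
 */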
11094 
11095 static vm_page_t
11096 pmap_pti_alloc_page(void)
11097 {
11098 	vm_page_t m;
11099 
11100 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11101 	m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_WIRED | VM_ALLOC_ZERO);
11102 	return (m);
11103 }
11104 
11105 static bool
11106 pmap_pti_free_page(vm_page_t m)
11107 {
11108 	if (!vm_page_unwire_noq(m))
11109 		return (false);
11110 	vm_page_xbusy_claim(m);
11111 	vm_page_free_zero(m);
11112 	return (true);
11113 }
11114 
11115 static void
11116 pmap_pti_init(void)
11117 {
11118 	vm_page_t pml4_pg;
11119 	pdp_entry_t *pdpe;
11120 	vm_offset_t va;
11121 	int i;
11122 
11123 	if (!pti)
11124 		return;
11125 	pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
11126 	VM_OBJECT_WLOCK(pti_obj);
11127 	pml4_pg = pmap_pti_alloc_page();
11128 	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
11129 	for (va = kva_layout.km_low; va <= kva_layout.km_high &&
11130 	    va >= kva_layout.km_low && va > NBPML4; va += NBPML4) {
11131 		pdpe = pmap_pti_pdpe(va);
11132 		pmap_pti_wire_pte(pdpe);
11133 	}
11134 	pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
11135 	    (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
11136 	pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
11137 	    sizeof(struct gate_descriptor) * NIDT, false);
11138 	CPU_FOREACH(i) {
11139 		/* Doublefault stack IST 1 */
11140 		va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu);
11141 		pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false);
11142 		/* NMI stack IST 2 */
11143 		va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
11144 		pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, va, false);
11145 		/* MC# stack IST 3 */
11146 		va = __pcpu[i].pc_common_tss.tss_ist3 +
11147 		    sizeof(struct nmi_pcpu);
11148 		pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false);
11149 		/* DB# stack IST 4 */
11150 		va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
11151 		pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
11152 	}
11153 	pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
11154 	    true);
11155 	pti_finalized = true;
11156 	VM_OBJECT_WUNLOCK(pti_obj);
11157 }
11158 
11159 static void
11160 pmap_cpu_init(void *arg __unused)
11161 {
11162 	CPU_COPY(&all_cpus, &kernel_pmap->pm_active);
11163 	pmap_pti_init();
11164 }
11165 SYSINIT(pmap_cpu, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_cpu_init, NULL);
11166 
11167 static pdp_entry_t *
11168 pmap_pti_pdpe(vm_offset_t va)
11169 {
11170 	pml4_entry_t *pml4e;
11171 	pdp_entry_t *pdpe;
11172 	vm_page_t m;
11173 	vm_pindex_t pml4_idx;
11174 	vm_paddr_t mphys;
11175 
11176 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11177 
11178 	pml4_idx = pmap_pml4e_index(va);
11179 	pml4e = &pti_pml4[pml4_idx];
11180 	m = NULL;
11181 	if (*pml4e == 0) {
11182 		if (pti_finalized)
11183 			panic("pml4 alloc after finalization\n");
11184 		m = pmap_pti_alloc_page();
11185 		if (*pml4e != 0) {
11186 			pmap_pti_free_page(m);
11187 			mphys = *pml4e & ~PAGE_MASK;
11188 		} else {
11189 			mphys = VM_PAGE_TO_PHYS(m);
11190 			*pml4e = mphys | X86_PG_RW | X86_PG_V;
11191 		}
11192 	} else {
11193 		mphys = *pml4e & ~PAGE_MASK;
11194 	}
11195 	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
11196 	return (pdpe);
11197 }
11198 
11199 static void
11200 pmap_pti_wire_pte(void *pte)
11201 {
11202 	vm_page_t m;
11203 
11204 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11205 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
11206 	m->ref_count++;
11207 }
11208 
11209 static void
11210 pmap_pti_unwire_pde(void *pde, bool only_ref)
11211 {
11212 	vm_page_t m;
11213 
11214 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11215 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
11216 	MPASS(only_ref || m->ref_count > 1);
11217 	pmap_pti_free_page(m);
11218 }
11219 
11220 static void
11221 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
11222 {
11223 	vm_page_t m;
11224 	pd_entry_t *pde;
11225 
11226 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11227 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
11228 	if (pmap_pti_free_page(m)) {
11229 		pde = pmap_pti_pde(va);
11230 		MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
11231 		*pde = 0;
11232 		pmap_pti_unwire_pde(pde, false);
11233 	}
11234 }
11235 
11236 static pd_entry_t *
11237 pmap_pti_pde(vm_offset_t va)
11238 {
11239 	pdp_entry_t *pdpe;
11240 	pd_entry_t *pde;
11241 	vm_page_t m;
11242 	vm_pindex_t pd_idx;
11243 	vm_paddr_t mphys;
11244 
11245 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11246 
11247 	pdpe = pmap_pti_pdpe(va);
11248 	if (*pdpe == 0) {
11249 		m = pmap_pti_alloc_page();
11250 		if (*pdpe != 0) {
11251 			pmap_pti_free_page(m);
11252 			MPASS((*pdpe & X86_PG_PS) == 0);
11253 			mphys = *pdpe & ~PAGE_MASK;
11254 		} else {
11255 			mphys = VM_PAGE_TO_PHYS(m);
11256 			*pdpe = mphys | X86_PG_RW | X86_PG_V;
11257 		}
11258 	} else {
11259 		MPASS((*pdpe & X86_PG_PS) == 0);
11260 		mphys = *pdpe & ~PAGE_MASK;
11261 	}
11262 
11263 	pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
11264 	pd_idx = pmap_pde_index(va);
11265 	pde += pd_idx;
11266 	return (pde);
11267 }
11268 
11269 static pt_entry_t *
11270 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
11271 {
11272 	pd_entry_t *pde;
11273 	pt_entry_t *pte;
11274 	vm_page_t m;
11275 	vm_paddr_t mphys;
11276 
11277 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11278 
11279 	pde = pmap_pti_pde(va);
11280 	if (unwire_pde != NULL) {
11281 		*unwire_pde = true;
11282 		pmap_pti_wire_pte(pde);
11283 	}
11284 	if (*pde == 0) {
11285 		m = pmap_pti_alloc_page();
11286 		if (*pde != 0) {
11287 			pmap_pti_free_page(m);
11288 			MPASS((*pde & X86_PG_PS) == 0);
11289 			mphys = *pde & ~(PAGE_MASK | pg_nx);
11290 		} else {
11291 			mphys = VM_PAGE_TO_PHYS(m);
11292 			*pde = mphys | X86_PG_RW | X86_PG_V;
11293 			if (unwire_pde != NULL)
11294 				*unwire_pde = false;
11295 		}
11296 	} else {
11297 		MPASS((*pde & X86_PG_PS) == 0);
11298 		mphys = *pde & ~(PAGE_MASK | pg_nx);
11299 	}
11300 
11301 	pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
11302 	pte += pmap_pte_index(va);
11303 
11304 	return (pte);
11305 }
11306 
11307 static void
11308 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
11309 {
11310 	vm_paddr_t pa;
11311 	pd_entry_t *pde;
11312 	pt_entry_t *pte, ptev;
11313 	bool unwire_pde;
11314 
11315 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11316 
11317 	sva = trunc_page(sva);
11318 	MPASS(sva > VM_MAXUSER_ADDRESS);
11319 	eva = round_page(eva);
11320 	MPASS(sva < eva);
11321 	for (; sva < eva; sva += PAGE_SIZE) {
11322 		pte = pmap_pti_pte(sva, &unwire_pde);
11323 		pa = pmap_kextract(sva);
11324 		ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
11325 		    (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
11326 		    VM_MEMATTR_DEFAULT, false);
11327 		if (*pte == 0) {
11328 			pte_store(pte, ptev);
11329 			pmap_pti_wire_pte(pte);
11330 		} else {
11331 			KASSERT(!pti_finalized,
11332 			    ("pti overlap after fin %#lx %#lx %#lx",
11333 			    sva, *pte, ptev));
11334 			KASSERT(*pte == ptev,
11335 			    ("pti non-identical pte after fin %#lx %#lx %#lx",
11336 			    sva, *pte, ptev));
11337 		}
11338 		if (unwire_pde) {
11339 			pde = pmap_pti_pde(sva);
11340 			pmap_pti_unwire_pde(pde, true);
11341 		}
11342 	}
11343 }
11344 
11345 void
11346 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
11347 {
11348 
11349 	if (!pti)
11350 		return;
11351 	VM_OBJECT_WLOCK(pti_obj);
11352 	pmap_pti_add_kva_locked(sva, eva, exec);
11353 	VM_OBJECT_WUNLOCK(pti_obj);
11354 }
11355 
11356 void
11357 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
11358 {
11359 	pt_entry_t *pte;
11360 	vm_offset_t va;
11361 
11362 	if (!pti)
11363 		return;
11364 	sva = rounddown2(sva, PAGE_SIZE);
11365 	MPASS(sva > VM_MAXUSER_ADDRESS);
11366 	eva = roundup2(eva, PAGE_SIZE);
11367 	MPASS(sva < eva);
11368 	VM_OBJECT_WLOCK(pti_obj);
11369 	for (va = sva; va < eva; va += PAGE_SIZE) {
11370 		pte = pmap_pti_pte(va, NULL);
11371 		KASSERT((*pte & X86_PG_V) != 0,
11372 		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
11373 		    (u_long)pte, *pte));
11374 		pte_clear(pte);
11375 		pmap_pti_unwire_pte(pte, va);
11376 	}
11377 	pmap_invalidate_range(kernel_pmap, sva, eva);
11378 	VM_OBJECT_WUNLOCK(pti_obj);
11379 }
11380 
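/*
 * Intel protection keys (PKU) support.  Each user pmap tracks its key
 * assignments in the pm_pkru rangeset; the callbacks below duplicate and
 * free the per-range nodes, and the pmap_pkru_*() helpers keep the
 * X86_PG_PKU bits in the PTEs in sync with that rangeset.
 */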
11381 static void *
11382 pkru_dup_range(void *ctx __unused, void *data)
11383 {
11384 	struct pmap_pkru_range *node, *new_node;
11385 
11386 	new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
11387 	if (new_node == NULL)
11388 		return (NULL);
11389 	node = data;
11390 	memcpy(new_node, node, sizeof(*node));
11391 	return (new_node);
11392 }
11393 
11394 static void
11395 pkru_free_range(void *ctx __unused, void *node)
11396 {
11397 
11398 	uma_zfree(pmap_pkru_ranges_zone, node);
11399 }
11400 
11401 static int
11402 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
11403     int flags)
11404 {
11405 	struct pmap_pkru_range *ppr;
11406 	int error;
11407 
11408 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11409 	MPASS(pmap->pm_type == PT_X86);
11410 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
11411 	if ((flags & AMD64_PKRU_EXCL) != 0 &&
11412 	    !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
11413 		return (EBUSY);
11414 	ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
11415 	if (ppr == NULL)
11416 		return (ENOMEM);
11417 	ppr->pkru_keyidx = keyidx;
11418 	ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
11419 	error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
11420 	if (error != 0)
11421 		uma_zfree(pmap_pkru_ranges_zone, ppr);
11422 	return (error);
11423 }
11424 
11425 static int
11426 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
11427 {
11428 
11429 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11430 	MPASS(pmap->pm_type == PT_X86);
11431 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
11432 	return (rangeset_remove(&pmap->pm_pkru, sva, eva));
11433 }
11434 
11435 static void
11436 pmap_pkru_deassign_all(pmap_t pmap)
11437 {
11438 
11439 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11440 	if (pmap->pm_type == PT_X86 &&
11441 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
11442 		rangeset_remove_all(&pmap->pm_pkru);
11443 }
11444 
11445 /*
11446  * Returns true if the PKU setting is the same across the specified address
11447  * range, and false otherwise.  When returning true, updates the referenced PTE
11448  * to reflect the PKU setting.
11449  */
11450 static bool
11451 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
11452 {
11453 	struct pmap_pkru_range *ppr;
11454 	vm_offset_t va;
11455 	u_int keyidx;
11456 
11457 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11458 	KASSERT(pmap->pm_type != PT_X86 || (*pte & X86_PG_PKU_MASK) == 0,
11459 	    ("pte %p has unexpected PKU %ld", pte, *pte & X86_PG_PKU_MASK));
11460 	if (pmap->pm_type != PT_X86 ||
11461 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
11462 	    sva >= VM_MAXUSER_ADDRESS)
11463 		return (true);
11464 	MPASS(eva <= VM_MAXUSER_ADDRESS);
11465 	ppr = rangeset_containing(&pmap->pm_pkru, sva);
11466 	if (ppr == NULL)
11467 		return (rangeset_empty(&pmap->pm_pkru, sva, eva));
11468 	keyidx = ppr->pkru_keyidx;
11469 	while ((va = ppr->pkru_rs_el.re_end) < eva) {
11470 		if ((ppr = rangeset_beginning(&pmap->pm_pkru, va)) == NULL ||
11471 		    keyidx != ppr->pkru_keyidx)
11472 			return (false);
11473 	}
11474 	*pte |= X86_PG_PKU(keyidx);
11475 	return (true);
11476 }
11477 
11478 static pt_entry_t
11479 pmap_pkru_get(pmap_t pmap, vm_offset_t va)
11480 {
11481 	struct pmap_pkru_range *ppr;
11482 
11483 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11484 	if (pmap->pm_type != PT_X86 ||
11485 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
11486 	    va >= VM_MAXUSER_ADDRESS)
11487 		return (0);
11488 	ppr = rangeset_containing(&pmap->pm_pkru, va);
11489 	if (ppr != NULL)
11490 		return (X86_PG_PKU(ppr->pkru_keyidx));
11491 	return (0);
11492 }
11493 
11494 static bool
11495 pred_pkru_on_remove(void *ctx __unused, void *r)
11496 {
11497 	struct pmap_pkru_range *ppr;
11498 
11499 	ppr = r;
11500 	return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
11501 }
11502 
11503 static void
11504 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
11505 {
11506 
11507 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11508 	if (pmap->pm_type == PT_X86 &&
11509 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
11510 		rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
11511 		    pred_pkru_on_remove);
11512 	}
11513 }
11514 
11515 static int
11516 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
11517 {
11518 
11519 	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
11520 	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
11521 	MPASS(dst_pmap->pm_type == PT_X86);
11522 	MPASS(src_pmap->pm_type == PT_X86);
11523 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
11524 	if (src_pmap->pm_pkru.rs_data_ctx == NULL)
11525 		return (0);
11526 	return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
11527 }
11528 
11529 static void
11530 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
11531     u_int keyidx)
11532 {
11533 	pml4_entry_t *pml4e;
11534 	pdp_entry_t *pdpe;
11535 	pd_entry_t newpde, ptpaddr, *pde;
11536 	pt_entry_t newpte, *ptep, pte;
11537 	vm_offset_t va, va_next;
11538 	bool changed;
11539 
11540 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11541 	MPASS(pmap->pm_type == PT_X86);
11542 	MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
11543 
11544 	for (changed = false, va = sva; va < eva; va = va_next) {
11545 		pml4e = pmap_pml4e(pmap, va);
11546 		if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) {
11547 			va_next = (va + NBPML4) & ~PML4MASK;
11548 			if (va_next < va)
11549 				va_next = eva;
11550 			continue;
11551 		}
11552 
11553 		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
11554 		if ((*pdpe & X86_PG_V) == 0) {
11555 			va_next = (va + NBPDP) & ~PDPMASK;
11556 			if (va_next < va)
11557 				va_next = eva;
11558 			continue;
11559 		}
11560 
11561 		va_next = (va + NBPDR) & ~PDRMASK;
11562 		if (va_next < va)
11563 			va_next = eva;
11564 
11565 		pde = pmap_pdpe_to_pde(pdpe, va);
11566 		ptpaddr = *pde;
11567 		if (ptpaddr == 0)
11568 			continue;
11569 
11570 		MPASS((ptpaddr & X86_PG_V) != 0);
11571 		if ((ptpaddr & PG_PS) != 0) {
11572 			if (va + NBPDR == va_next && eva >= va_next) {
11573 				newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
11574 				    X86_PG_PKU(keyidx);
11575 				if (newpde != ptpaddr) {
11576 					*pde = newpde;
11577 					changed = true;
11578 				}
11579 				continue;
11580 			} else if (!pmap_demote_pde(pmap, pde, va)) {
11581 				continue;
11582 			}
11583 		}
11584 
11585 		if (va_next > eva)
11586 			va_next = eva;
11587 
11588 		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
11589 		    ptep++, va += PAGE_SIZE) {
11590 			pte = *ptep;
11591 			if ((pte & X86_PG_V) == 0)
11592 				continue;
11593 			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
11594 			if (newpte != pte) {
11595 				*ptep = newpte;
11596 				changed = true;
11597 			}
11598 		}
11599 	}
11600 	if (changed)
11601 		pmap_invalidate_range(pmap, sva, eva);
11602 }
11603 
11604 static int
11605 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
11606     u_int keyidx, int flags)
11607 {
11608 
11609 	if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
11610 	    (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
11611 		return (EINVAL);
11612 	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
11613 		return (EFAULT);
11614 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
11615 		return (ENOTSUP);
11616 	return (0);
11617 }
11618 
11619 int
11620 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
11621     int flags)
11622 {
11623 	int error;
11624 
11625 	sva = trunc_page(sva);
11626 	eva = round_page(eva);
11627 	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
11628 	if (error != 0)
11629 		return (error);
11630 	for (;;) {
11631 		PMAP_LOCK(pmap);
11632 		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
11633 		if (error == 0)
11634 			pmap_pkru_update_range(pmap, sva, eva, keyidx);
11635 		PMAP_UNLOCK(pmap);
11636 		if (error != ENOMEM)
11637 			break;
11638 		vm_wait(NULL);
11639 	}
11640 	return (error);
11641 }
11642 
11643 int
11644 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
11645 {
11646 	int error;
11647 
11648 	sva = trunc_page(sva);
11649 	eva = round_page(eva);
11650 	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
11651 	if (error != 0)
11652 		return (error);
11653 	for (;;) {
11654 		PMAP_LOCK(pmap);
11655 		error = pmap_pkru_deassign(pmap, sva, eva);
11656 		if (error == 0)
11657 			pmap_pkru_update_range(pmap, sva, eva, 0);
11658 		PMAP_UNLOCK(pmap);
11659 		if (error != ENOMEM)
11660 			break;
11661 		vm_wait(NULL);
11662 	}
11663 	return (error);
11664 }
11665 
11666 #if defined(KASAN) || defined(KMSAN)
11667 
11668 /*
11669  * Reserve enough memory to:
11670  * 1) allocate PDP pages for the shadow map(s), and
11671  * 2) shadow the boot stack of KSTACK_PAGES pages.
11672  * Assuming that the kernel stack does not cross a 1GB boundary, this
11673  * requires one or two PD pages, one or two PT pages, and KSTACK_PAGES
11674  * shadow pages per shadow map.
11675  */
11676 #ifdef KASAN
11677 #define	SAN_EARLY_PAGES	\
11678 	(NKASANPML4E + 2 + 2 + howmany(KSTACK_PAGES, KASAN_SHADOW_SCALE))
11679 #else
11680 #define	SAN_EARLY_PAGES	\
11681 	(NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * (2 + 2 + KSTACK_PAGES))
11682 #endif
11683 
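/*
 * Bump allocator for the bootstrap shadow map: hand out 4KB pages from a
 * statically allocated, page-aligned buffer, returning each page's physical
 * address computed by rebasing the buffer's offset from KERNSTART onto the
 * kernel's load address.
 */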
11684 static uint64_t __nosanitizeaddress __nosanitizememory
11685 pmap_san_enter_early_alloc_4k(uint64_t pabase)
11686 {
11687 	static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE);
11688 	static size_t offset = 0;
11689 	uint64_t pa;
11690 
11691 	if (offset == sizeof(data)) {
11692 		panic("%s: ran out of memory for the bootstrap shadow map",
11693 		    __func__);
11694 	}
11695 
11696 	pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART);
11697 	offset += PAGE_SIZE;
11698 	return (pa);
11699 }
11700 
11701 /*
11702  * Map a shadow page, before the kernel has bootstrapped its page tables.  This
11703  * is currently only used to shadow the temporary boot stack set up by locore.
11704  */
11705 static void __nosanitizeaddress __nosanitizememory
11706 pmap_san_enter_early(vm_offset_t va)
11707 {
11708 	static bool first = true;
11709 	pml4_entry_t *pml4e;
11710 	pdp_entry_t *pdpe;
11711 	pd_entry_t *pde;
11712 	pt_entry_t *pte;
11713 	uint64_t cr3, pa, base;
11714 	int i;
11715 
11716 	base = amd64_loadaddr();
11717 	cr3 = rcr3();
11718 
11719 	if (first) {
11720 		/*
11721 		 * If this is the first call, we need to allocate new PML4Es for
11722 		 * the bootstrap shadow map(s).  We don't know how the PML4 page
11723 		 * was initialized by the boot loader, so we can't simply test
11724 		 * whether the shadow map's PML4Es are zero.
11725 		 */
11726 		first = false;
11727 #ifdef KASAN
11728 		for (i = 0; i < NKASANPML4E; i++) {
11729 			pa = pmap_san_enter_early_alloc_4k(base);
11730 
11731 			pml4e = (pml4_entry_t *)cr3 +
11732 			    pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4);
11733 			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11734 		}
11735 #else
11736 		for (i = 0; i < NKMSANORIGPML4E; i++) {
11737 			pa = pmap_san_enter_early_alloc_4k(base);
11738 
11739 			pml4e = (pml4_entry_t *)cr3 +
11740 			    pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS +
11741 			    i * NBPML4);
11742 			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11743 		}
11744 		for (i = 0; i < NKMSANSHADPML4E; i++) {
11745 			pa = pmap_san_enter_early_alloc_4k(base);
11746 
11747 			pml4e = (pml4_entry_t *)cr3 +
11748 			    pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS +
11749 			    i * NBPML4);
11750 			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11751 		}
11752 #endif
11753 	}
11754 	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va);
11755 	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va);
11756 	if (*pdpe == 0) {
11757 		pa = pmap_san_enter_early_alloc_4k(base);
11758 		*pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V);
11759 	}
11760 	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va);
11761 	if (*pde == 0) {
11762 		pa = pmap_san_enter_early_alloc_4k(base);
11763 		*pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V);
11764 	}
11765 	pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va);
11766 	if (*pte != 0)
11767 		panic("%s: PTE for %#lx is already initialized", __func__, va);
11768 	pa = pmap_san_enter_early_alloc_4k(base);
11769 	*pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V);
11770 }
11771 
11772 static vm_page_t
11773 pmap_san_enter_alloc_4k(void)
11774 {
11775 	vm_page_t m;
11776 
11777 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
11778 	    VM_ALLOC_ZERO);
11779 	if (m == NULL)
11780 		panic("%s: no memory to grow shadow map", __func__);
11781 	return (m);
11782 }
11783 
11784 static vm_page_t
11785 pmap_san_enter_alloc_2m(void)
11786 {
11787 	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
11788 	    NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT));
11789 }
11790 
11791 /*
11792  * Grow a shadow map by at least one 4KB page at the specified address.  Use 2MB
11793  * pages when possible.
11794  */
11795 void __nosanitizeaddress __nosanitizememory
11796 pmap_san_enter(vm_offset_t va)
11797 {
11798 	pdp_entry_t *pdpe;
11799 	pd_entry_t *pde;
11800 	pt_entry_t *pte;
11801 	vm_page_t m;
11802 
11803 	if (kernphys == 0) {
11804 		/*
11805 		 * We're creating a temporary shadow map for the boot stack.
11806 		 */
11807 		pmap_san_enter_early(va);
11808 		return;
11809 	}
11810 
11811 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
11812 
11813 	pdpe = pmap_pdpe(kernel_pmap, va);
11814 	if ((*pdpe & X86_PG_V) == 0) {
11815 		m = pmap_san_enter_alloc_4k();
11816 		*pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
11817 		    X86_PG_V | pg_nx);
11818 	}
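	/*
	 * Prefer a 2MB shadow mapping; fall back to allocating a 4KB page
	 * table page if no contiguous run of pages is available.
	 */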
11819 	pde = pmap_pdpe_to_pde(pdpe, va);
11820 	if ((*pde & X86_PG_V) == 0) {
11821 		m = pmap_san_enter_alloc_2m();
11822 		if (m != NULL) {
11823 			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
11824 			    X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx);
11825 		} else {
11826 			m = pmap_san_enter_alloc_4k();
11827 			*pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
11828 			    X86_PG_V | pg_nx);
11829 		}
11830 	}
11831 	if ((*pde & X86_PG_PS) != 0)
11832 		return;
11833 	pte = pmap_pde_to_pte(pde, va);
11834 	if ((*pte & X86_PG_V) != 0)
11835 		return;
11836 	m = pmap_san_enter_alloc_4k();
11837 	*pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V |
11838 	    X86_PG_M | X86_PG_A | pg_nx);
11839 }
11840 #endif
11841 
11842 /*
11843  * Track a range of the kernel's virtual address space that is contiguous
11844  * in various mapping attributes.
11845  */
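/*
 * The ptes, pdes, and pdpes fields count how many 4KB, 2MB, and 1GB
 * mappings, respectively, were coalesced into the range.
 */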
11846 struct pmap_kernel_map_range {
11847 	vm_offset_t sva;
11848 	pt_entry_t attrs;
11849 	int ptes;
11850 	int pdes;
11851 	int pdpes;
11852 };
11853 
11854 static void
11855 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
11856     vm_offset_t eva)
11857 {
11858 	const char *mode;
11859 	int i, pat_idx;
11860 
11861 	if (eva <= range->sva)
11862 		return;
11863 
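	/*
	 * Map the range's cache attribute bits back to a PAT mode by looking
	 * up the effective PAT table index in pat_index[].
	 */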
11864 	pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
11865 	for (i = 0; i < PAT_INDEX_SIZE; i++)
11866 		if (pat_index[i] == pat_idx)
11867 			break;
11868 
11869 	switch (i) {
11870 	case PAT_WRITE_BACK:
11871 		mode = "WB";
11872 		break;
11873 	case PAT_WRITE_THROUGH:
11874 		mode = "WT";
11875 		break;
11876 	case PAT_UNCACHEABLE:
11877 		mode = "UC";
11878 		break;
11879 	case PAT_UNCACHED:
11880 		mode = "U-";
11881 		break;
11882 	case PAT_WRITE_PROTECTED:
11883 		mode = "WP";
11884 		break;
11885 	case PAT_WRITE_COMBINING:
11886 		mode = "WC";
11887 		break;
11888 	default:
11889 		printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n",
11890 		    __func__, pat_idx, range->sva, eva);
11891 		mode = "??";
11892 		break;
11893 	}
11894 
11895 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
11896 	    range->sva, eva,
11897 	    (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
11898 	    (range->attrs & pg_nx) != 0 ? '-' : 'x',
11899 	    (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
11900 	    (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
11901 	    mode, range->pdpes, range->pdes, range->ptes);
11902 
11903 	/* Reset to sentinel value. */
11904 	range->sva = kva_layout.kva_max;
11905 }
11906 
11907 /*
11908  * Determine whether the attributes specified by a page table entry match those
11909  * being tracked by the current range.  This is not quite as simple as a direct
11910  * flag comparison since some PAT modes have multiple representations.
11911  */
11912 static bool
11913 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
11914 {
11915 	pt_entry_t diff, mask;
11916 
11917 	mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
11918 	diff = (range->attrs ^ attrs) & mask;
11919 	if (diff == 0)
11920 		return (true);
11921 	if ((diff & ~X86_PG_PDE_PAT) == 0 &&
11922 	    pmap_pat_index(kernel_pmap, range->attrs, true) ==
11923 	    pmap_pat_index(kernel_pmap, attrs, true))
11924 		return (true);
11925 	return (false);
11926 }
11927 
11928 static void
11929 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
11930     pt_entry_t attrs)
11931 {
11932 
11933 	memset(range, 0, sizeof(*range));
11934 	range->sva = va;
11935 	range->attrs = attrs;
11936 }
11937 
11938 /*
11939  * Given a leaf PTE, derive the mapping's attributes.  If they do not match
11940  * those of the current run, dump the address range and its attributes, and
11941  * begin a new run.
11942  */
11943 static void
11944 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
11945     vm_offset_t va, pml5_entry_t pml5e, pml4_entry_t pml4e, pdp_entry_t pdpe,
11946     pd_entry_t pde, pt_entry_t pte)
11947 {
11948 	pt_entry_t attrs;
11949 
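	/*
	 * Writability (RW) and user access (U) must be granted at every
	 * paging level to take effect, so intersect them down the hierarchy;
	 * no-execute (NX) at any level suffices, so accumulate it.
	 */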
11950 	if (la57) {
11951 		attrs = pml5e & (X86_PG_RW | X86_PG_U | pg_nx);
11952 		attrs |= pml4e & pg_nx;
11953 		attrs &= pg_nx | (pml4e & (X86_PG_RW | X86_PG_U));
11954 	} else {
11955 		attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
11956 	}
11957 
11958 	attrs |= pdpe & pg_nx;
11959 	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
11960 	if ((pdpe & PG_PS) != 0) {
11961 		attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
11962 	} else if (pde != 0) {
11963 		attrs |= pde & pg_nx;
11964 		attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
11965 	}
11966 	if ((pde & PG_PS) != 0) {
11967 		attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
11968 	} else if (pte != 0) {
11969 		attrs |= pte & pg_nx;
11970 		attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
11971 		attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);
11972 
11973 		/* Canonicalize by always using the PDE PAT bit. */
11974 		if ((attrs & X86_PG_PTE_PAT) != 0)
11975 			attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
11976 	}
11977 
11978 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
11979 		sysctl_kmaps_dump(sb, range, va);
11980 		sysctl_kmaps_reinit(range, va, attrs);
11981 	}
11982 }
11983 
11984 static int
11985 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
11986 {
11987 	struct pmap_kernel_map_range range;
11988 	struct sbuf sbuf, *sb;
11989 	pml5_entry_t pml5e;
11990 	pml4_entry_t pml4e;
11991 	pdp_entry_t *pdp, pdpe;
11992 	pd_entry_t *pd, pde;
11993 	pt_entry_t *pt, pte;
11994 	vm_offset_t sva;
11995 	vm_paddr_t pa;
11996 	int error, j, k, l;
11997 	bool first;
11998 
11999 	error = sysctl_wire_old_buffer(req, 0);
12000 	if (error != 0)
12001 		return (error);
12002 	sb = &sbuf;
12003 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
12004 
12005 	/* Sentinel value. */
12006 	range.sva = kva_layout.kva_max;
12007 	pml5e = 0;	/* avoid reading an uninitialized value when !la57 */
12008 
12009 	/*
12010 	 * Iterate over the kernel page tables without holding the kernel pmap
12011 	 * lock.  Outside of the large map, kernel page table pages are never
12012 	 * freed, so at worst we will observe inconsistencies in the output.
12013 	 * Within the large map, ensure that PDP and PD page addresses are
12014 	 * valid before descending.
12015 	 */
12016 	for (first = true, sva = 0; sva != 0 || first; first = false) {
12017 		if (sva == kva_layout.rec_pt)
12018 			sbuf_printf(sb, "\nRecursive map:\n");
12019 		else if (sva == kva_layout.dmap_low)
12020 			sbuf_printf(sb, "\nDirect map:\n");
12021 #ifdef KASAN
12022 		else if (sva == kva_layout.kasan_shadow_low)
12023 			sbuf_printf(sb, "\nKASAN shadow map:\n");
12024 #endif
12025 #ifdef KMSAN
12026 		else if (sva == kva_layout.kmsan_shadow_low)
12027 			sbuf_printf(sb, "\nKMSAN shadow map:\n");
12028 		else if (sva == kva_layout.kmsan_origin_low)
12029 			sbuf_printf(sb, "\nKMSAN origin map:\n");
12030 #endif
12031 		else if (sva == kva_layout.km_low)
12032 			sbuf_printf(sb, "\nKernel map:\n");
12033 		else if (sva == kva_layout.lm_low)
12034 			sbuf_printf(sb, "\nLarge map:\n");
12035 
12036 		/* Convert to canonical form (sign-extend into the high half). */
12037 		if (la57) {
12038 			if (sva == 1ul << 56) {
12039 				sva |= -1ul << 57;
12040 				continue;
12041 			}
12042 		} else {
12043 			if (sva == 1ul << 47) {
12044 				sva |= -1ul << 48;
12045 				continue;
12046 			}
12047 		}
12048 
12049 restart:
12050 		if (la57) {
12051 			pml5e = *pmap_pml5e(kernel_pmap, sva);
12052 			if ((pml5e & X86_PG_V) == 0) {
12053 				sva = rounddown2(sva, NBPML5);
12054 				sysctl_kmaps_dump(sb, &range, sva);
12055 				sva += NBPML5;
12056 				continue;
12057 			}
12058 		}
12059 		pml4e = *pmap_pml4e(kernel_pmap, sva);
12060 		if ((pml4e & X86_PG_V) == 0) {
12061 			sva = rounddown2(sva, NBPML4);
12062 			sysctl_kmaps_dump(sb, &range, sva);
12063 			sva += NBPML4;
12064 			continue;
12065 		}
12066 		pa = pml4e & PG_FRAME;
12067 		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);
12068 
12069 		for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
12070 			pdpe = pdp[j];
12071 			if ((pdpe & X86_PG_V) == 0) {
12072 				sva = rounddown2(sva, NBPDP);
12073 				sysctl_kmaps_dump(sb, &range, sva);
12074 				sva += NBPDP;
12075 				continue;
12076 			}
12077 			pa = pdpe & PG_FRAME;
12078 			if ((pdpe & PG_PS) != 0) {
12079 				sva = rounddown2(sva, NBPDP);
12080 				sysctl_kmaps_check(sb, &range, sva, pml5e,
12081 				    pml4e, pdpe, 0, 0);
12082 				range.pdpes++;
12083 				sva += NBPDP;
12084 				continue;
12085 			}
12086 			if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
12087 			    vm_phys_paddr_to_vm_page(pa) == NULL) {
12088 				/*
12089 				 * Page table pages for the large map may be
12090 				 * freed.  Validate the next-level address
12091 				 * before descending.
12092 				 */
12093 				sva += NBPDP;
12094 				goto restart;
12095 			}
12096 			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
12097 
12098 			for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
12099 				pde = pd[k];
12100 				if ((pde & X86_PG_V) == 0) {
12101 					sva = rounddown2(sva, NBPDR);
12102 					sysctl_kmaps_dump(sb, &range, sva);
12103 					sva += NBPDR;
12104 					continue;
12105 				}
12106 				pa = pde & PG_FRAME;
12107 				if ((pde & PG_PS) != 0) {
12108 					sva = rounddown2(sva, NBPDR);
12109 					sysctl_kmaps_check(sb, &range, sva,
12110 					    pml5e, pml4e, pdpe, pde, 0);
12111 					range.pdes++;
12112 					sva += NBPDR;
12113 					continue;
12114 				}
12115 				if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
12116 				    vm_phys_paddr_to_vm_page(pa) == NULL) {
12117 					/*
12118 					 * Page table pages for the large map
12119 					 * may be freed.  Validate the
12120 					 * next-level address before descending.
12121 					 */
12122 					sva += NBPDR;
12123 					goto restart;
12124 				}
12125 				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
12126 
12127 				for (l = pmap_pte_index(sva); l < NPTEPG; l++,
12128 				    sva += PAGE_SIZE) {
12129 					pte = pt[l];
12130 					if ((pte & X86_PG_V) == 0) {
12131 						sysctl_kmaps_dump(sb, &range,
12132 						    sva);
12133 						continue;
12134 					}
12135 					sysctl_kmaps_check(sb, &range, sva,
12136 					    pml5e, pml4e, pdpe, pde, pte);
12137 					range.ptes++;
12138 				}
12139 			}
12140 		}
12141 	}
12142 
12143 	error = sbuf_finish(sb);
12144 	sbuf_delete(sb);
12145 	return (error);
12146 }
12147 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
12148     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
12149     NULL, 0, sysctl_kmaps, "A",
12150     "Dump kernel address layout");
12151 
12152 #ifdef DDB
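/*
 * "show pte <addr>": walk the page tables of the debugged thread's pmap (or
 * the current pmap) for the given virtual address and print each level's
 * entry, stopping at the first non-present or large-page entry.
 */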
12153 DB_SHOW_COMMAND(pte, pmap_print_pte)
12154 {
12155 	pmap_t pmap;
12156 	pml5_entry_t *pml5;
12157 	pml4_entry_t *pml4;
12158 	pdp_entry_t *pdp;
12159 	pd_entry_t *pde;
12160 	pt_entry_t *pte, PG_V;
12161 	vm_offset_t va;
12162 
12163 	if (!have_addr) {
12164 		db_printf("show pte addr\n");
12165 		return;
12166 	}
12167 	va = (vm_offset_t)addr;
12168 
12169 	if (kdb_thread != NULL)
12170 		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
12171 	else
12172 		pmap = PCPU_GET(curpmap);
12173 
12174 	PG_V = pmap_valid_bit(pmap);
12175 	db_printf("VA 0x%016lx", va);
12176 
12177 	if (pmap_is_la57(pmap)) {
12178 		pml5 = pmap_pml5e(pmap, va);
12179 		db_printf(" pml5e@0x%016lx 0x%016lx", (uint64_t)pml5, *pml5);
12180 		if ((*pml5 & PG_V) == 0) {
12181 			db_printf("\n");
12182 			return;
12183 		}
12184 		pml4 = pmap_pml5e_to_pml4e(pml5, va);
12185 	} else {
12186 		pml4 = pmap_pml4e(pmap, va);
12187 	}
12188 	db_printf(" pml4e@0x%016lx 0x%016lx", (uint64_t)pml4, *pml4);
12189 	if ((*pml4 & PG_V) == 0) {
12190 		db_printf("\n");
12191 		return;
12192 	}
12193 	pdp = pmap_pml4e_to_pdpe(pml4, va);
12194 	db_printf(" pdpe@0x%016lx 0x%016lx", (uint64_t)pdp, *pdp);
12195 	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
12196 		db_printf("\n");
12197 		return;
12198 	}
12199 	pde = pmap_pdpe_to_pde(pdp, va);
12200 	db_printf(" pde@0x%016lx 0x%016lx", (uint64_t)pde, *pde);
12201 	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
12202 		db_printf("\n");
12203 		return;
12204 	}
12205 	pte = pmap_pde_to_pte(pde, va);
12206 	db_printf(" pte@0x%016lx 0x%016lx\n", (uint64_t)pte, *pte);
12207 }
12208 
12209 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
12210 {
12211 	vm_paddr_t a;
12212 
12213 	if (have_addr) {
12214 		a = (vm_paddr_t)addr;
12215 		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
12216 	} else {
12217 		db_printf("show phys2dmap addr\n");
12218 	}
12219 }
12220 
12221 static void
12222 ptpages_show_page(int level, int idx, vm_page_t pg)
12223 {
12224 	db_printf("l %d i %d pg %p phys %#lx ref %x\n",
12225 	    level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
12226 }
12227 
12228 static void
12229 ptpages_show_complain(int level, int idx, uint64_t pte)
12230 {
12231 	db_printf("l %d i %d pte %#lx\n", level, idx, pte);
12232 }
12233 
12234 static void
12235 ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
12236 {
12237 	vm_page_t pg3, pg2, pg1;
12238 	pml4_entry_t *pml4;
12239 	pdp_entry_t *pdp;
12240 	pd_entry_t *pd;
12241 	int i4, i3, i2;
12242 
12243 	pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
12244 	for (i4 = 0; i4 < num_entries; i4++) {
12245 		if ((pml4[i4] & PG_V) == 0)
12246 			continue;
12247 		pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME);
12248 		if (pg3 == NULL) {
12249 			ptpages_show_complain(3, i4, pml4[i4]);
12250 			continue;
12251 		}
12252 		ptpages_show_page(3, i4, pg3);
12253 		pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3));
12254 		for (i3 = 0; i3 < NPDPEPG; i3++) {
12255 			if ((pdp[i3] & PG_V) == 0)
12256 				continue;
12257 			pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME);
12258 			if (pg2 == NULL) {
12259 				ptpages_show_complain(2, i3, pdp[i3]);
12260 				continue;
12261 			}
12262 			ptpages_show_page(2, i3, pg2);
12263 			pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
12264 			for (i2 = 0; i2 < NPDEPG; i2++) {
12265 				if ((pd[i2] & PG_V) == 0)
12266 					continue;
12267 				pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
12268 				if (pg1 == NULL) {
12269 					ptpages_show_complain(1, i2, pd[i2]);
12270 					continue;
12271 				}
12272 				ptpages_show_page(1, i2, pg1);
12273 			}
12274 		}
12275 	}
12276 }
12277 
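/*
 * "show ptpages [pmap]": dump the vm_page backing every valid page-table
 * page reachable from the pmap's top-level table, one line per page with
 * its level, index, physical address, and reference count.
 */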
12278 DB_SHOW_COMMAND(ptpages, pmap_ptpages)
12279 {
12280 	pmap_t pmap;
12281 	vm_page_t pg;
12282 	pml5_entry_t *pml5;
12283 	uint64_t PG_V;
12284 	int i5;
12285 
12286 	if (have_addr)
12287 		pmap = (pmap_t)addr;
12288 	else
12289 		pmap = PCPU_GET(curpmap);
12290 
12291 	PG_V = pmap_valid_bit(pmap);
12292 
12293 	if (pmap_is_la57(pmap)) {
12294 		pml5 = pmap->pm_pmltop;
12295 		for (i5 = 0; i5 < NUPML5E; i5++) {
12296 			if ((pml5[i5] & PG_V) == 0)
12297 				continue;
12298 			pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
12299 			if (pg == NULL) {
12300 				ptpages_show_complain(4, i5, pml5[i5]);
12301 				continue;
12302 			}
12303 			ptpages_show_page(4, i5, pg);
12304 			ptpages_show_pml4(pg, NPML4EPG, PG_V);
12305 		}
12306 	} else {
12307 		ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
12308 		    (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
12309 	}
12310 }
12311 #endif
12312