xref: /freebsd/sys/arm64/arm64/pmap.c (revision c6a1c1260f02e44b7f44b1e3735ce5dbd785544d)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2003 Peter Wemm
9  * All rights reserved.
10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11  * All rights reserved.
12  * Copyright (c) 2014 Andrew Turner
13  * All rights reserved.
14  * Copyright (c) 2014-2016 The FreeBSD Foundation
15  * All rights reserved.
16  *
17  * This code is derived from software contributed to Berkeley by
18  * the Systems Programming Group of the University of Utah Computer
19  * Science Department and William Jolitz of UUNET Technologies Inc.
20  *
21  * This software was developed by Andrew Turner under sponsorship from
22  * the FreeBSD Foundation.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  * 3. All advertising materials mentioning features or use of this software
33  *    must display the following acknowledgement:
34  *	This product includes software developed by the University of
35  *	California, Berkeley and its contributors.
36  * 4. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  */
52 /*-
53  * Copyright (c) 2003 Networks Associates Technology, Inc.
54  * All rights reserved.
55  *
56  * This software was developed for the FreeBSD Project by Jake Burkholder,
57  * Safeport Network Services, and Network Associates Laboratories, the
58  * Security Research Division of Network Associates, Inc. under
59  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60  * CHATS research program.
61  *
62  * Redistribution and use in source and binary forms, with or without
63  * modification, are permitted provided that the following conditions
64  * are met:
65  * 1. Redistributions of source code must retain the above copyright
66  *    notice, this list of conditions and the following disclaimer.
67  * 2. Redistributions in binary form must reproduce the above copyright
68  *    notice, this list of conditions and the following disclaimer in the
69  *    documentation and/or other materials provided with the distribution.
70  *
71  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81  * SUCH DAMAGE.
82  */
83 
84 #include <sys/cdefs.h>
85 /*
86  *	Manages physical address maps.
87  *
88  *	Since the information managed by this module is
89  *	also stored by the logical address mapping module,
90  *	this module may throw away valid virtual-to-physical
91  *	mappings at almost any time.  However, invalidations
92  *	of virtual-to-physical mappings must be done as
93  *	requested.
94  *
95  *	In order to cope with hardware architectures which
96  *	make virtual-to-physical map invalidates expensive,
97  *	this module may delay invalidate or reduced protection
98  *	operations until such time as they are actually
99  *	necessary.  This module is given full information as
100  *	to which processors are currently using which maps,
101  *	and to when physical maps must be made correct.
102  */
103 
104 #include "opt_vm.h"
105 
106 #include <sys/param.h>
107 #include <sys/asan.h>
108 #include <sys/bitstring.h>
109 #include <sys/bus.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/ktr.h>
113 #include <sys/limits.h>
114 #include <sys/lock.h>
115 #include <sys/malloc.h>
116 #include <sys/mman.h>
117 #include <sys/msan.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/physmem.h>
121 #include <sys/proc.h>
122 #include <sys/rangeset.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132 
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147 
148 #include <machine/asan.h>
149 #include <machine/cpu_feat.h>
150 #include <machine/elf.h>
151 #include <machine/ifunc.h>
152 #include <machine/machdep.h>
153 #include <machine/md_var.h>
154 #include <machine/pcb.h>
155 
/* One PV chunk list per NUMA domain when NUMA is configured. */
#ifdef NUMA
#define	PMAP_MEMDOM	MAXMEMDOM
#else
#define	PMAP_MEMDOM	1
#endif

/* Assert that a pmap is a stage 1 (host) or stage 2 (guest) table. */
#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

/* Number of entries in one page-table page at each level. */
#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

/* Total number of L0/L1/L2 entries reachable from the top level. */
#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#define __pvused
#else
#define PV_STAT(x)	do { } while (0)
#define __pvused	__unused
#endif

/*
 * Page-table-page pindex encoding: L3 page-table pages occupy
 * [0, NUL2E), L2 pages [NUL2E, NUL2E + NUL1E), and L0/L1 pages above.
 */
#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)

/*
 * Guarded-page attribute applied to kernel mappings when the kernel is
 * built for BTI; otherwise 0 so it has no effect when OR'ed into a PTE.
 */
#ifdef __ARM_FEATURE_BTI_DEFAULT
pt_entry_t __read_mostly pmap_gp_attr;
#define	ATTR_KERN_GP		pmap_gp_attr
#else
#define	ATTR_KERN_GP		0
#endif
#define	PMAP_SAN_PTE_BITS	(ATTR_AF | ATTR_S1_XN | pmap_sh_attr | \
  ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))

/*
 * NOTE(review): presumably set when the CPU needs TLB invalidations
 * issued individually rather than batched — confirm where this is set.
 */
static bool __read_mostly pmap_multiple_tlbi = false;

/* Per-L2-region PV list head plus the lock that protects it. */
struct pmap_large_md_page {
	struct rwlock   pv_lock;
	struct md_page  pv_page;
	/* Pad to a power of 2, see pmap_init_pv_table(). */
	int		pv_pad[2];
};

/* Fallback entry for addresses outside every vm_phys segment. */
__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
#define pv_dummy pv_dummy_large.pv_page
__read_mostly static struct pmap_large_md_page *pv_table;
207 
208 static struct pmap_large_md_page *
209 _pa_to_pmdp(vm_paddr_t pa)
210 {
211 	struct vm_phys_seg *seg;
212 
213 	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
214 		return ((struct pmap_large_md_page *)seg->md_first +
215 		    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
216 	return (NULL);
217 }
218 
219 static struct pmap_large_md_page *
220 pa_to_pmdp(vm_paddr_t pa)
221 {
222 	struct pmap_large_md_page *pvd;
223 
224 	pvd = _pa_to_pmdp(pa);
225 	if (pvd == NULL)
226 		panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
227 	return (pvd);
228 }
229 
230 static struct pmap_large_md_page *
231 page_to_pmdp(vm_page_t m)
232 {
233 	struct vm_phys_seg *seg;
234 
235 	seg = &vm_phys_segs[m->segind];
236 	return ((struct pmap_large_md_page *)seg->md_first +
237 	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
238 }
239 
/* Map a physical address or vm_page to its PV list head. */
#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
#define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))

/*
 * Find the PV list lock for a physical address, falling back to the
 * dummy lock when the address is outside every vm_phys segment.
 */
#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
	struct pmap_large_md_page *_pvd;			\
	struct rwlock *_lock;					\
	_pvd = _pa_to_pmdp(pa);					\
	if (__predict_false(_pvd == NULL))			\
		_lock = &pv_dummy_large.pv_lock;		\
	else							\
		_lock = &(_pvd->pv_lock);			\
	_lock;							\
})
253 
254 static struct rwlock *
255 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
256 {
257 	if ((m->flags & PG_FICTITIOUS) == 0)
258 		return (&page_to_pmdp(m)->pv_lock);
259 	else
260 		return (&pv_dummy_large.pv_lock);
261 }
262 
/*
 * Switch *lockp to new_lock: drop the currently held PV list lock (if
 * any) and write-lock the new one.  No-op when they are the same lock.
 */
#define	CHANGE_PV_LIST_LOCK(lockp, new_lock)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock = (new_lock);		\
							\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)		\
			CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))

/* Drop the held PV list lock, if any, and record that none is held. */
#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

/* Convert between page-table entries and vm_page structures. */
#define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
#define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
292 
/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM

/* The kernel's pmap, covering the kernel address space. */
struct pmap kernel_pmap_store;

/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;		/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
 * Always map entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	size;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;
325 
/*
 * Data for the pv entry allocation mechanism.
 */
#ifdef NUMA
/* Return the NUMA domain of the memory backing a pv chunk. */
static __inline int
pc_to_domain(struct pv_chunk *pc)
{
	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
}
#else
/* Without NUMA every pv chunk belongs to the sole domain, 0. */
static __inline int
pc_to_domain(struct pv_chunk *pc __unused)
{
	return (0);
}
#endif
342 
/* Per-domain list of pv chunks, guarded by its own mutex. */
struct pv_chunks_list {
	struct mtx pvc_lock;
	TAILQ_HEAD(pch, pv_chunk) pvc_list;
	int active_reclaims;
} __aligned(CACHE_LINE_SIZE);

struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* The kernel's top-level (TTBR1) translation table. */
extern pt_entry_t pagetable_l0_ttbr1[];

/* Physical memory ranges, stored as (start, end) address pairs. */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

static int pmap_growkernel_panic = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN,
    &pmap_growkernel_panic, 0,
    "panic on failure to allocate kernel page table page");

/* NOTE(review): presumably set when FEAT_LPA is detected — confirm. */
bool pmap_lpa_enabled __read_mostly = false;
pt_entry_t pmap_sh_attr __read_mostly = ATTR_SH(ATTR_SH_IS);

/* L1 blocks are always usable with 4K pages; otherwise LPA is needed. */
#if PAGE_SIZE == PAGE_SIZE_4K
#define	L1_BLOCKS_SUPPORTED	1
#else
#define	L1_BLOCKS_SUPPORTED	(pmap_lpa_enabled)
#endif

#define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)

static bool pmap_l1_supported __read_mostly = false;
381 
/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * that it has currently allocated to a pmap, a cursor ("asid_next") to
 * optimize its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
 */
struct asid_set {
	int asid_bits;			/* Width of an ASID, in bits */
	bitstr_t *asid_set;		/* Allocation bit vector */
	int asid_set_size;		/* Size of the bit vector */
	int asid_next;			/* Search cursor */
	int asid_epoch;			/* Reclamation epoch */
	struct mtx asid_set_mutex;
};

/* The same allocator structure serves stage 1 ASIDs and stage 2 VMIDs. */
static struct asid_set asids;
static struct asid_set vmids;

static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in an VMID");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
    "The last allocated VMID plus one");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
    "The current epoch number");

/* Stage 2 TLB maintenance hooks; NULL until installed elsewhere. */
void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
void (*pmap_stage2_invalidate_all)(uint64_t);

/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))

/* Extract the VA field (bits [55:12]) for a TLBI operand. */
#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)

static int __read_frequently superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");
451 
/*
 * True when Branch Target Identification should be used by userspace. This
 * allows pmap to mark pages as guarded with ATTR_S1_GP.
 */
__read_mostly static bool pmap_bti_support = false;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

TAILQ_HEAD(pv_chunklist, pv_chunk);

/* Forward declarations: PV entry management. */
static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_chunk_batch(struct pv_chunklist *batch);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

/* Forward declarations: mapping creation, demotion, and removal. */
static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(struct thread *td, pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
    vm_prot_t prot, int mode, bool skip_unmapped);
static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va);
static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
    vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
static bool pmap_every_pte_zero(vm_paddr_t pa);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
    bool all_l3e_AF_set);
static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp);
static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, bool demote_kl2e, struct spglist *free,
    struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
    struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
    vm_offset_t va, vm_size_t size);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);

/* Forward declarations: BTI (Branch Target Identification) ranges. */
static uma_zone_t pmap_bti_ranges_zone;
static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pt_entry_t *pte);
static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void *bti_dup_range(void *ctx, void *data);
static void bti_free_range(void *ctx, void *node);
static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
static void pmap_bti_deassign_all(pmap_t pmap);
static void pagezero(void *);

/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*table)
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)
548 
/********************/
/* Inline functions */
/********************/

/* Copy one page of data from "s" to "d". */
static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}
559 
/* Return a pointer to the L0 (top-level) entry for "va". */
static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l0[pmap_l0_index(va)]);
}
566 
567 static __inline pd_entry_t *
568 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
569 {
570 	pd_entry_t *l1;
571 
572 	l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
573 	return (&l1[pmap_l1_index(va)]);
574 }
575 
576 static __inline pd_entry_t *
577 pmap_l1(pmap_t pmap, vm_offset_t va)
578 {
579 	pd_entry_t *l0;
580 
581 	l0 = pmap_l0(pmap, va);
582 	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
583 		return (NULL);
584 
585 	return (pmap_l0_to_l1(l0, va));
586 }
587 
/*
 * Return a pointer to the L2 entry for "va" within the L2 table that
 * the given L1 entry points at.  The L1 entry must be a table
 * descriptor (asserted below).
 */
static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
{
	pd_entry_t l1, *l2p;

	l1 = pmap_load(l1p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
	l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
	return (&l2p[pmap_l2_index(va)]);
}
608 
609 static __inline pd_entry_t *
610 pmap_l2(pmap_t pmap, vm_offset_t va)
611 {
612 	pd_entry_t *l1;
613 
614 	l1 = pmap_l1(pmap, va);
615 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
616 		return (NULL);
617 
618 	return (pmap_l1_to_l2(l1, va));
619 }
620 
/*
 * Return a pointer to the L3 entry for "va" within the L3 table that
 * the given L2 entry points at.  The L2 entry must be a table
 * descriptor (asserted below).
 */
static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t l2;
	pt_entry_t *l3p;

	l2 = pmap_load(l2p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
	l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
	return (&l3p[pmap_l3_index(va)]);
}
642 
/*
 * Returns the lowest valid pde for a given virtual address.
 * The next level may or may not point to a valid page or block.
 *
 * On return, *level holds the level of the returned entry, or -1 when
 * not even the L0 entry is a table descriptor.
 */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l0, *l1, *l2, desc;

	l0 = pmap_l0(pmap, va);
	desc = pmap_load(l0) & ATTR_DESCR_MASK;
	if (desc != L0_TABLE) {
		*level = -1;
		return (NULL);
	}

	l1 = pmap_l0_to_l1(l0, va);
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc != L1_TABLE) {
		/* The walk ends at L0; the L1 entry is not a table. */
		*level = 0;
		return (l0);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc != L2_TABLE) {
		/* The walk ends at L1; the L2 entry is not a table. */
		*level = 1;
		return (l1);
	}

	*level = 2;
	return (l2);
}
676 
/*
 * Returns the lowest valid pte block or table entry for a given virtual
 * address. If there are no valid entries return NULL and set the level to
 * the first invalid level.
 */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l1, *l2, desc;
	pt_entry_t *l3;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL) {
		*level = 0;
		return (NULL);
	}
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc == L1_BLOCK) {
		/* A block mapping at L1; only legal when supported. */
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
		*level = 1;
		return (l1);
	}

	if (desc != L1_TABLE) {
		*level = 1;
		return (NULL);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc == L2_BLOCK) {
		*level = 2;
		return (l2);
	}

	if (desc != L2_TABLE) {
		*level = 2;
		return (NULL);
	}

	*level = 3;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
		return (NULL);

	return (l3);
}
724 
/*
 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
 * level that maps the specified virtual address, then a pointer to that entry
 * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
 * and a diagnostic message is provided, in which case this function panics.
 */
static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
{
	pd_entry_t *l0p, *l1p, *l2p;
	pt_entry_t desc, *l3p;
	int walk_level __diagused;

	KASSERT(level >= 0 && level < 4,
	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
	    level));
	l0p = pmap_l0(pmap, va);
	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
	if (desc == L0_TABLE && level > 0) {
		l1p = pmap_l0_to_l1(l0p, va);
		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
		if (desc == L1_BLOCK && level == 1) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			return (l1p);
		}
		if (desc == L1_TABLE && level > 1) {
			l2p = pmap_l1_to_l2(l1p, va);
			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
			if (desc == L2_BLOCK && level == 2)
				return (l2p);
			else if (desc == L2_TABLE && level > 2) {
				l3p = pmap_l2_to_l3(l2p, va);
				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
				if (desc == L3_PAGE && level == 3)
					return (l3p);
				else
					walk_level = 3;
			} else
				walk_level = 2;
		} else
			walk_level = 1;
	} else
		walk_level = 0;
	/*
	 * A non-NULL "diag" means the caller insists the mapping exists;
	 * panic (under INVARIANTS) reporting where the walk stopped.
	 */
	KASSERT(diag == NULL,
	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
	    diag, va, level, desc, walk_level));
	return (NULL);
}
773 
/*
 * Return true if superpage (large block) mappings may be used for the
 * given pmap.
 */
bool
pmap_ps_enabled(pmap_t pmap)
{
	/*
	 * Promotion requires a hypervisor call when the kernel is running
	 * in EL1. To stop this disable superpage support on non-stage 1
	 * pmaps for now.
	 */
	if (pmap->pm_stage != PM_STAGE1)
		return (false);

#ifdef KMSAN
	/*
	 * The break-before-make in pmap_update_entry() results in a situation
	 * where a CPU may call into the KMSAN runtime while the entry is
	 * invalid.  If the entry is used to map the current thread structure,
	 * then the runtime will attempt to access unmapped memory.  Avoid this
	 * by simply disabling superpage promotion for the kernel map.
	 */
	if (pmap == kernel_pmap)
		return (false);
#endif

	return (superpages_enabled != 0);
}
799 
/*
 * Walk the page tables for "va", returning pointers to each level's
 * entry via *l0/*l1/*l2/*l3.  Returns true when the walk reaches a
 * block or page mapping; levels below a block mapping are set to NULL.
 * Returns false when an invalid entry ends the walk early; pointers
 * beyond the level reached are left unmodified in that case.
 */
bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
    pd_entry_t **l2, pt_entry_t **l3)
{
	pd_entry_t *l0p, *l1p, *l2p;

	if (pmap->pm_l0 == NULL)
		return (false);

	l0p = pmap_l0(pmap, va);
	*l0 = l0p;

	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
		return (false);

	l1p = pmap_l0_to_l1(l0p, va);
	*l1 = l1p;

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
		/* Mapped by an L1 block; no L2 or L3 tables exist. */
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
		*l2 = NULL;
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
		return (false);

	l2p = pmap_l1_to_l2(l1p, va);
	*l2 = l2p;

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
		/* Mapped by an L2 block; no L3 table exists. */
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
		return (false);

	*l3 = pmap_l2_to_l3(l2p, va);

	return (true);
}
843 
844 static __inline int
845 pmap_l3_valid(pt_entry_t l3)
846 {
847 
848 	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
849 }
850 
/* L1 and L2 block descriptors must share the same encoding. */
CTASSERT(L1_BLOCK == L2_BLOCK);

/*
 * Compute the memory-attribute PTE bits for "memattr" in either a
 * stage 1 or stage 2 pmap.  Device memory is also marked
 * execute-never.
 */
static pt_entry_t
pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
{
	pt_entry_t val;

	if (pmap->pm_stage == PM_STAGE1) {
		val = ATTR_S1_IDX(memattr);
		if (memattr == VM_MEMATTR_DEVICE)
			val |= ATTR_S1_XN;
		return (val);
	}

	val = 0;

	/* Stage 2 encodes the attributes directly instead of via MAIR. */
	switch (memattr) {
	case VM_MEMATTR_DEVICE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
		    ATTR_S2_XN(ATTR_S2_XN_ALL));
	case VM_MEMATTR_UNCACHEABLE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
	case VM_MEMATTR_WRITE_BACK:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
	case VM_MEMATTR_WRITE_THROUGH:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
	default:
		panic("%s: invalid memory attribute %x", __func__, memattr);
	}
}
881 
882 static pt_entry_t
883 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
884 {
885 	pt_entry_t val;
886 
887 	val = 0;
888 	if (pmap->pm_stage == PM_STAGE1) {
889 		if ((prot & VM_PROT_EXECUTE) == 0)
890 			val |= ATTR_S1_XN;
891 		if ((prot & VM_PROT_WRITE) == 0)
892 			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
893 	} else {
894 		if ((prot & VM_PROT_WRITE) != 0)
895 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
896 		if ((prot & VM_PROT_READ) != 0)
897 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
898 		if ((prot & VM_PROT_EXECUTE) == 0)
899 			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
900 	}
901 
902 	return (val);
903 }
904 
905 /*
906  * Checks if the PTE is dirty.
907  */
908 static inline int
909 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
910 {
911 
912 	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
913 
914 	if (pmap->pm_stage == PM_STAGE1) {
915 		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
916 		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
917 
918 		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
919 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
920 	}
921 
922 	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
923 	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
924 }
925 
/*
 * Increase the pmap's resident (mapped) page count by "count" pages.
 * The pmap lock must be held.
 */
static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}
933 
/*
 * Decrease the pmap's resident (mapped) page count by "count" pages.
 * The pmap lock must be held; underflow indicates an accounting bug
 * and triggers the KASSERT below.
 */
static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}
944 
945 static vm_paddr_t
946 pmap_early_vtophys(vm_offset_t va)
947 {
948 	vm_paddr_t pa_page;
949 
950 	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
951 	return (pa_page | (va & PAR_LOW_MASK));
952 }
953 
/* State of the bootstrapped DMAP page tables */
struct pmap_bootstrap_state {
	pt_entry_t	*l1;		/* Current L1 table */
	pt_entry_t	*l2;		/* Current L2 table */
	pt_entry_t	*l3;		/* Current L3 table */
	vm_offset_t	freemempos;	/* Bump allocator for table pages */
	vm_offset_t	va;		/* Next VA to map */
	vm_paddr_t	pa;		/* Next PA to map */
	pt_entry_t	table_attrs;	/* Attributes for new table entries */
	u_int		l0_slot;	/* L0 index "l1" was linked from */
	u_int		l1_slot;	/* L1 index "l2" was linked from */
	u_int		l2_slot;	/* L2 index "l3" was linked from */
	bool		dmap_valid;	/* Is the DMAP usable yet? */
};
968 
/* The bootstrap state */
static struct pmap_bootstrap_state bs_state = {
	.l1 = NULL,
	.l2 = NULL,
	.l3 = NULL,
	/* Privileged execute-never; cleared in pmap_bootstrap() for KVA */
	.table_attrs = TATTR_PXN_TABLE,
	/* Out-of-range slots force a table walk on first use */
	.l0_slot = L0_ENTRIES,
	.l1_slot = Ln_ENTRIES,
	.l2_slot = Ln_ENTRIES,
	.dmap_valid = false,
};
980 
/*
 * Ensure state->l1 points at an L1 table covering state->va.  If the VA
 * has moved into a new L0 slot, either reuse the L1 table already linked
 * there (only once the DMAP is usable) or carve a fresh, zeroed table
 * page from state->freemempos and link it into the kernel L0 table.
 */
static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l1_pa;
	pd_entry_t l0e;
	u_int l0_slot;

	/* Link the level 0 table to a level 1 table */
	l0_slot = pmap_l0_index(state->va);
	if (l0_slot != state->l0_slot) {
		/*
		 * Make sure we move from a low address to high address
		 * before the DMAP region is ready. This ensures we never
		 * modify an existing mapping until we can map from a
		 * physical address to a virtual address.
		 */
		MPASS(state->l0_slot < l0_slot ||
		    state->l0_slot == L0_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l2 = NULL;
		state->l3 = NULL;
		state->l1_slot = Ln_ENTRIES;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L0 entry */
		state->l0_slot = l0_slot;
		if (state->dmap_valid) {
			l0e = pagetable_l0_ttbr1[l0_slot];
			if ((l0e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the L1 table already linked here. */
				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
				l1_pa = PTE_TO_PHYS(l0e);
				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
				return;
			}
		}

		/* Create a new L0 table entry */
		state->l1 = (pt_entry_t *)state->freemempos;
		memset_early(state->l1, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
	}
	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
}
1032 
/*
 * Ensure state->l2 points at an L2 table covering state->va, first
 * ensuring the L0 -> L1 link exists.  Reuses an existing L1 entry once
 * the DMAP is usable, otherwise allocates and links a fresh table page.
 */
static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l2_pa;
	pd_entry_t l1e;
	u_int l1_slot;

	/* Make sure there is a valid L0 -> L1 table */
	pmap_bootstrap_l0_table(state);

	/* Link the level 1 table to a level 2 table */
	l1_slot = pmap_l1_index(state->va);
	if (l1_slot != state->l1_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l1_slot < l1_slot ||
		    state->l1_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l3 = NULL;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L1 entry */
		state->l1_slot = l1_slot;
		if (state->dmap_valid) {
			l1e = state->l1[l1_slot];
			if ((l1e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the L2 table already linked here. */
				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
				l2_pa = PTE_TO_PHYS(l1e);
				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
				return;
			}
		}

		/* Create a new L1 table entry */
		state->l2 = (pt_entry_t *)state->freemempos;
		memset_early(state->l2, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l1[l1_slot] == 0);
		pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
		    state->table_attrs | L1_TABLE);
	}
	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
}
1080 
/*
 * Ensure state->l3 points at an L3 table covering state->va, first
 * ensuring the L1 -> L2 link exists.  Reuses an existing L2 entry once
 * the DMAP is usable, otherwise allocates and links a fresh table page.
 */
static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l3_pa;
	pd_entry_t l2e;
	u_int l2_slot;

	/* Make sure there is a valid L1 -> L2 table */
	pmap_bootstrap_l1_table(state);

	/* Link the level 2 table to a level 3 table */
	l2_slot = pmap_l2_index(state->va);
	if (l2_slot != state->l2_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l2_slot < l2_slot ||
		    state->l2_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Check the existing L2 entry */
		state->l2_slot = l2_slot;
		if (state->dmap_valid) {
			l2e = state->l2[l2_slot];
			if ((l2e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the L3 table already linked here. */
				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
				l3_pa = PTE_TO_PHYS(l2e);
				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
				return;
			}
		}

		/* Create a new L2 table entry */
		state->l3 = (pt_entry_t *)state->freemempos;
		memset_early(state->l3, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
		    state->table_attrs | L2_TABLE);
	}
	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
}
1124 
/*
 * Map as much as possible of physmap entry "i" with L2 block entries in
 * the DMAP, starting at the current bootstrap VA/PA and stopping at the
 * reach of the current L1 slot.  Requires the VA to be L2-aligned and at
 * least L2_SIZE of the region to remain.
 */
static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l2_slot;
	bool first;

	if ((physmap[i + 1] - state->pa) < L2_SIZE)
		return;

	/* Make sure there is a valid L1 table */
	pmap_bootstrap_l1_table(state);

	MPASS((state->va & L2_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L2_SIZE;
	    state->va += L2_SIZE, state->pa += L2_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L1 slot can address.  The "first" flag lets an
		 * already L1-aligned start still map its initial block.
		 */
		if (!first && (state->pa & L1_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L2C_ENTRIES
		 * L2 blocks, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 */
		if ((state->pa & L2C_OFFSET) == 0) {
			if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L2C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l2_slot = pmap_l2_index(state->va);
		MPASS((state->pa & L2_OFFSET) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}
1174 
/*
 * Map as much as possible of physmap entry "i" with L3 page entries in
 * the DMAP, starting at the current bootstrap VA/PA and stopping at the
 * reach of the current L2 slot.  Requires the VA to be L3-aligned and at
 * least L3_SIZE of the region to remain.
 */
static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l3_slot;
	bool first;

	if (physmap[i + 1] - state->pa < L3_SIZE)
		return;

	/* Make sure there is a valid L2 table */
	pmap_bootstrap_l2_table(state);

	MPASS((state->va & L3_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    physmap[i + 1] - state->pa >= L3_SIZE;
	    state->va += L3_SIZE, state->pa += L3_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L2 slot can address.  The "first" flag lets an
		 * already L2-aligned start still map its initial page.
		 */
		if (!first && (state->pa & L2_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
		 * L3 pages, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 */
		if ((state->pa & L3C_OFFSET) == 0) {
			if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L3C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l3_slot = pmap_l3_index(state->va);
		MPASS((state->pa & L3_OFFSET) == 0);
		MPASS(state->l3[l3_slot] == 0);
		pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}
1224 
/*
 * Build the direct map (DMAP) covering every physical memory region in
 * physmap[], using the largest mapping size (L1 block, L2 block, L3
 * page) each region's alignment and length permit.  "kernlen" is the
 * size of the loaded kernel, used to seed the table-page bump allocator
 * just past the kernel image.
 */
void
pmap_bootstrap_dmap(vm_size_t kernlen)
{
	vm_paddr_t start_pa, pa;
	uint64_t tcr;
	int i;

	tcr = READ_SPECIALREG(tcr_el1);

	/* Verify that the ASID is set through TTBR0. */
	KASSERT((tcr & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0"));

	if ((tcr & TCR_DS) != 0)
		pmap_lpa_enabled = true;

	pmap_l1_supported = L1_BLOCKS_SUPPORTED;

	start_pa = pmap_early_vtophys(KERNBASE);

	/* Allocate new page table pages from just past the kernel image. */
	bs_state.freemempos = KERNBASE + kernlen;
	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);

	/* Fill in physmap array. */
	physmap_idx = physmem_avail(physmap, nitems(physmap));

	dmap_phys_base = physmap[0] & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;

	/* physmap[] holds (start, end) pairs, hence the stride of 2. */
	for (i = 0; i < physmap_idx; i += 2) {
		bs_state.pa = physmap[i] & ~L3_OFFSET;
		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L3 mappings at the start of the region */
		if ((bs_state.pa & L2_OFFSET) != 0)
			pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa <= physmap[i + 1]);

		if (L1_BLOCKS_SUPPORTED) {
			/* Create L2 mappings at the start of the region */
			if ((bs_state.pa & L1_OFFSET) != 0)
				pmap_bootstrap_l2_block(&bs_state, i);
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create the main L1 block mappings */
			for (; bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
				/* Make sure there is a valid L1 table */
				pmap_bootstrap_l0_table(&bs_state);
				MPASS((bs_state.pa & L1_OFFSET) == 0);
				pmap_store(
				    &bs_state.l1[pmap_l1_index(bs_state.va)],
				    PHYS_TO_PTE(bs_state.pa) | ATTR_AF |
				    pmap_sh_attr |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
			}
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create L2 mappings at the end of the region */
			pmap_bootstrap_l2_block(&bs_state, i);
		} else {
			while (bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
				pmap_bootstrap_l2_block(&bs_state, i);
			}
		}
		MPASS(bs_state.pa <= physmap[i + 1]);

		/* Create L3 mappings at the end of the region */
		pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa == physmap[i + 1]);

		/* Track the highest physical address and DMAP VA mapped. */
		if (bs_state.pa > dmap_phys_max) {
			dmap_phys_max = bs_state.pa;
			dmap_max_addr = bs_state.va;
		}
	}

	pmap_s1_invalidate_all_kernel();

	/* From here on the bootstrap helpers may reuse existing tables. */
	bs_state.dmap_valid = true;

	/* Exclude the kernel and DMAP region */
	pa = pmap_early_vtophys(bs_state.freemempos);
	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
}
1313 
1314 static void
1315 pmap_bootstrap_l2(vm_offset_t va)
1316 {
1317 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1318 
1319 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1320 	bs_state.va = va;
1321 
1322 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1323 		pmap_bootstrap_l1_table(&bs_state);
1324 }
1325 
1326 static void
1327 pmap_bootstrap_l3(vm_offset_t va)
1328 {
1329 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1330 
1331 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1332 	bs_state.va = va;
1333 
1334 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1335 		pmap_bootstrap_l2_table(&bs_state);
1336 }
1337 
1338 /*
1339  *	Bootstrap the system enough to run with virtual memory.
1340  */
1341 void
1342 pmap_bootstrap(void)
1343 {
1344 	vm_offset_t dpcpu, msgbufpv;
1345 	vm_paddr_t start_pa, pa;
1346 	size_t largest_phys_size;
1347 
1348 	/* Set this early so we can use the pagetable walking functions */
1349 	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1350 	mtx_init(&kernel_pmap->pm_mtx, "kernel pmap", NULL, MTX_DEF);
1351 	kernel_pmap->pm_l0_paddr =
1352 	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1353 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1354 	vm_radix_init(&kernel_pmap->pm_root);
1355 	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1356 	kernel_pmap->pm_stage = PM_STAGE1;
1357 	kernel_pmap->pm_levels = 4;
1358 	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1359 	kernel_pmap->pm_asid_set = &asids;
1360 
1361 	/* Reserve some VA space for early BIOS/ACPI mapping */
1362 	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1363 
1364 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1365 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
1366 	virtual_end = VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE;
1367 	kernel_vm_end = virtual_avail;
1368 
1369 	/*
1370 	 * We only use PXN when we know nothing will be executed from it, e.g.
1371 	 * the DMAP region.
1372 	 */
1373 	bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1374 
1375 	/*
1376 	 * Find the physical memory we could use. This needs to be after we
1377 	 * exclude any memory that is mapped into the DMAP region but should
1378 	 * not be used by the kernel, e.g. some UEFI memory types.
1379 	 */
1380 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1381 
1382 	/*
1383 	 * Find space for early allocations. We search for the largest
1384 	 * region. This is because the user may choose a large msgbuf.
1385 	 * This could be smarter, e.g. to allow multiple regions to be
1386 	 * used & switch to the next when one is full.
1387 	 */
1388 	largest_phys_size = 0;
1389 	for (int i = 0; i < physmap_idx; i += 2) {
1390 		if ((physmap[i + 1] - physmap[i]) > largest_phys_size) {
1391 			largest_phys_size = physmap[i + 1] - physmap[i];
1392 			bs_state.freemempos = PHYS_TO_DMAP(physmap[i]);
1393 		}
1394 	}
1395 
1396 	start_pa = pmap_early_vtophys(bs_state.freemempos);
1397 
1398 	/*
1399 	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
1400 	 * loader allocated the first and only l2 page table page used to map
1401 	 * the kernel, preloaded files and module metadata.
1402 	 */
1403 	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1404 	/* And the l3 tables for the early devmap */
1405 	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1406 
1407 	pmap_s1_invalidate_all_kernel();
1408 
1409 #define alloc_pages(var, np)						\
1410 	(var) = bs_state.freemempos;					\
1411 	bs_state.freemempos += (np * PAGE_SIZE);			\
1412 	memset_early((char *)(var), 0, ((np) * PAGE_SIZE));
1413 
1414 	/* Allocate dynamic per-cpu area. */
1415 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1416 	dpcpu_init((void *)dpcpu, 0);
1417 
1418 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1419 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1420 	msgbufp = (void *)msgbufpv;
1421 
1422 	pa = pmap_early_vtophys(bs_state.freemempos);
1423 
1424 	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1425 }
1426 
1427 #if defined(KASAN) || defined(KMSAN)
/*
 * Map sanitizer shadow memory with L2 blocks, taking physical memory
 * from the top of [start_pa, end_pa) while advancing *vap towards eva.
 * Each block used is excluded from the physmem ranges so it is never
 * handed out again.
 */
static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
    vm_offset_t *vap, vm_offset_t eva)
{
	vm_paddr_t pa;
	vm_offset_t va;
	pd_entry_t *l2;

	va = *vap;
	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
	for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
		l2 = pmap_l2(kernel_pmap, va);

		/*
		 * KASAN stack checking results in us having already allocated
		 * part of our shadow map, so we can just skip those segments.
		 * (pa is re-incremented so the same physical block is offered
		 * to the next VA after the loop's decrement.)
		 */
		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
			pa += L2_SIZE;
			continue;
		}

		bzero_early((void *)PHYS_TO_DMAP(pa), L2_SIZE);
		physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
	}
	*vap = va;
}
1456 
1457 /*
1458  * Finish constructing the initial shadow map:
1459  * - Count how many pages from KERNBASE to virtual_avail (scaled for
1460  *   shadow map)
1461  * - Map that entire range using L2 superpages.
1462  */
1463 static void
1464 pmap_bootstrap_san1(vm_offset_t va, int scale)
1465 {
1466 	vm_offset_t eva;
1467 	vm_paddr_t kernstart;
1468 	int i;
1469 
1470 	kernstart = pmap_early_vtophys(KERNBASE);
1471 
1472 	/*
1473 	 * Rebuild physmap one more time, we may have excluded more regions from
1474 	 * allocation since pmap_bootstrap().
1475 	 */
1476 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1477 
1478 	eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1479 
1480 	/*
1481 	 * Find a slot in the physmap large enough for what we needed.  We try to put
1482 	 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1483 	 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1484 	 */
1485 	for (i = physmap_idx - 2; i >= 0; i -= 2) {
1486 		vm_paddr_t plow, phigh;
1487 
1488 		/* L2 mappings must be backed by memory that is L2-aligned */
1489 		plow = roundup2(physmap[i], L2_SIZE);
1490 		phigh = physmap[i + 1];
1491 		if (plow >= phigh)
1492 			continue;
1493 		if (kernstart >= plow && kernstart < phigh)
1494 			phigh = kernstart;
1495 		if (phigh - plow >= L2_SIZE) {
1496 			pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1497 			if (va >= eva)
1498 				break;
1499 		}
1500 	}
1501 	if (i < 0)
1502 		panic("Could not find phys region for shadow map");
1503 
1504 	/*
1505 	 * Done. We should now have a valid shadow address mapped for all KVA
1506 	 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1507 	 * shadow accesses by the sanitizer runtime will succeed for this range.
1508 	 * When the kernel virtual address range is later expanded, as will
1509 	 * happen in vm_mem_init(), the shadow map will be grown as well. This
1510 	 * is handled by pmap_san_enter().
1511 	 */
1512 }
1513 
/*
 * Bootstrap the sanitizer shadow map(s).  KASAN needs a single scaled
 * shadow; KMSAN needs both a shadow and an origin map, each seeded with
 * statically allocated L1/L2 table pages before pmap_bootstrap_san1()
 * fills in the L2 block mappings.
 */
void
pmap_bootstrap_san(void)
{
#ifdef KASAN
	pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
#else
	static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	pd_entry_t *l0, *l1;

	/* The static table pages above only cover a single L1 range. */
	if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
		panic("initial kernel map is too large");

	l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);

	l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
#endif
}
1544 #endif
1545 
1546 /*
1547  *	Initialize a vm_page's machine-dependent fields.
1548  */
1549 void
1550 pmap_page_init(vm_page_t m)
1551 {
1552 
1553 	TAILQ_INIT(&m->md.pv_list);
1554 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1555 }
1556 
1557 static void
1558 pmap_init_asids(struct asid_set *set, int bits)
1559 {
1560 	int i;
1561 
1562 	set->asid_bits = bits;
1563 
1564 	/*
1565 	 * We may be too early in the overall initialization process to use
1566 	 * bit_alloc().
1567 	 */
1568 	set->asid_set_size = 1 << set->asid_bits;
1569 	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1570 	    M_WAITOK | M_ZERO);
1571 	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1572 		bit_set(set->asid_set, i);
1573 	set->asid_next = ASID_FIRST_AVAILABLE;
1574 	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1575 }
1576 
/*
 * Allocate and initialize pv_table, the array of PV list headers with
 * one entry per L2 superpage of physical memory, backing each physical
 * segment's entries with memory from that segment's NUMA domain.
 */
static void
pmap_init_pv_table(void)
{
	struct vm_phys_seg *seg, *next_seg;
	struct pmap_large_md_page *pvd;
	vm_size_t s;
	int domain, i, j, pages;

	/*
	 * We depend on the size being evenly divisible into a page so
	 * that the pv_table array can be indexed directly while
	 * safely spanning multiple pages from different domains.
	 */
	CTASSERT(PAGE_SIZE % sizeof(*pvd) == 0);

	/*
	 * Calculate the size of the array.
	 */
	s = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		s += round_page(pages * sizeof(*pvd));
	}
	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
	if (pv_table == NULL)
		panic("%s: kva_alloc failed\n", __func__);

	/*
	 * Iterate physical segments to allocate domain-local memory for PV
	 * list headers.
	 */
	pvd = pv_table;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		domain = seg->domain;

		s = round_page(pages * sizeof(*pvd));

		/* Back this segment's slice of pv_table page by page. */
		for (j = 0; j < s; j += PAGE_SIZE) {
			vm_page_t m = vm_page_alloc_noobj_domain(domain,
			    VM_ALLOC_ZERO);
			if (m == NULL)
				panic("failed to allocate PV table page");
			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
		}

		/* Initialize the lock and list head of every entry. */
		for (j = 0; j < s / sizeof(*pvd); j++) {
			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
			TAILQ_INIT(&pvd->pv_page.pv_list);
			pvd++;
		}
	}
	/* A dummy entry used when no real pv_table entry applies. */
	pvd = &pv_dummy_large;
	memset(pvd, 0, sizeof(*pvd));
	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
	TAILQ_INIT(&pvd->pv_page.pv_list);

	/*
	 * Set pointers from vm_phys_segs to pv_table.
	 */
	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		seg->md_first = pvd;
		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);

		/*
		 * If there is a following segment, and the final
		 * superpage of this segment and the initial superpage
		 * of the next segment are the same then adjust the
		 * pv_table entry for that next segment down by one so
		 * that the pv_table entries will be shared.
		 */
		if (i + 1 < vm_phys_nsegs) {
			next_seg = &vm_phys_segs[i + 1];
			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
			    pmap_l2_pindex(next_seg->start)) {
				pvd--;
			}
		}
	}
}
1663 
1664 static cpu_feat_en
1665 pmap_dbm_check(const struct cpu_feat *feat __unused, u_int midr __unused)
1666 {
1667 	uint64_t id_aa64mmfr1;
1668 
1669 	id_aa64mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1670 	if (ID_AA64MMFR1_HAFDBS_VAL(id_aa64mmfr1) >=
1671 	    ID_AA64MMFR1_HAFDBS_AF_DBS)
1672 		return (FEAT_DEFAULT_ENABLE);
1673 
1674 	return (FEAT_ALWAYS_DISABLE);
1675 }
1676 
1677 static bool
1678 pmap_dbm_has_errata(const struct cpu_feat *feat __unused, u_int midr,
1679     u_int **errata_list, u_int *errata_count)
1680 {
1681 	/* Disable on Cortex-A55 for erratum 1024718 - all revisions */
1682 	if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
1683 	    CPU_PART(midr) == CPU_PART_CORTEX_A55) {
1684 		static u_int errata_id = 1024718;
1685 
1686 		*errata_list = &errata_id;
1687 		*errata_count = 1;
1688 		return (true);
1689 	}
1690 
1691 	/* Disable on Cortex-A510 for erratum 2051678 - r0p0 to r0p2 */
1692 	if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
1693 	    0, 0, 0, 2)) {
1694 		static u_int errata_id = 2051678;
1695 
1696 		*errata_list = &errata_id;
1697 		*errata_count = 1;
1698 		return (true);
1699 	}
1700 
1701 	return (false);
1702 }
1703 
/*
 * Enable hardware dirty state management on this CPU by setting
 * TCR_EL1.HD, unless an erratum affecting DBM was reported.  The
 * barrier/TLBI sequence below is order-sensitive; do not reorder.
 */
static bool
pmap_dbm_enable(const struct cpu_feat *feat __unused,
    cpu_feat_errata errata_status, u_int *errata_list __unused,
    u_int errata_count)
{
	uint64_t tcr;

	/* Skip if there is an erratum affecting DBM */
	if (errata_status != ERRATA_NONE)
		return (false);

	tcr = READ_SPECIALREG(tcr_el1) | TCR_HD;
	WRITE_SPECIALREG(tcr_el1, tcr);
	isb();
	/* Flush the local TLB for the TCR_HD flag change */
	dsb(nshst);
	__asm __volatile("tlbi vmalle1");
	dsb(nsh);
	isb();

	return (true);
}
1726 
/* Register hardware A/D state management as a per-CPU feature. */
CPU_FEAT(feat_hafdbs, "Hardware management of the Access flag and dirty state",
    pmap_dbm_check, pmap_dbm_has_errata, pmap_dbm_enable, NULL,
    CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU);
1730 
1731 static cpu_feat_en
1732 pmap_multiple_tlbi_check(const struct cpu_feat *feat __unused, u_int midr)
1733 {
1734 	/*
1735 	 * Cortex-A55 erratum 2441007 (Cat B rare)
1736 	 * Present in all revisions
1737 	 */
1738 	if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
1739 	    CPU_PART(midr) == CPU_PART_CORTEX_A55)
1740 		return (FEAT_DEFAULT_DISABLE);
1741 
1742 	/*
1743 	 * Cortex-A76 erratum 1286807 (Cat B rare)
1744 	 * Present in r0p0 - r3p0
1745 	 * Fixed in r3p1
1746 	 */
1747 	if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A76,
1748 	    0, 0, 3, 0))
1749 		return (FEAT_DEFAULT_DISABLE);
1750 
1751 	/*
1752 	 * Cortex-A510 erratum 2441009 (Cat B rare)
1753 	 * Present in r0p0 - r1p1
1754 	 * Fixed in r1p2
1755 	 */
1756 	if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
1757 	    0, 0, 1, 1))
1758 		return (FEAT_DEFAULT_DISABLE);
1759 
1760 	return (FEAT_ALWAYS_DISABLE);
1761 }
1762 
/*
 * Record that the multiple-TLBI erratum workaround is required on this
 * CPU; consumed by the pmap invalidation paths via pmap_multiple_tlbi.
 */
static bool
pmap_multiple_tlbi_enable(const struct cpu_feat *feat __unused,
    cpu_feat_errata errata_status, u_int *errata_list __unused,
    u_int errata_count __unused)
{
	pmap_multiple_tlbi = true;
	return (true);
}
1771 
/* Register the multiple-TLBI erratum workaround as a per-CPU feature. */
CPU_FEAT(errata_multi_tlbi, "Multiple TLBI errata",
    pmap_multiple_tlbi_check, NULL, pmap_multiple_tlbi_enable, NULL,
    CPU_FEAT_EARLY_BOOT | CPU_FEAT_PER_CPU);
1775 
1776 /*
1777  *	Initialize the pmap module.
1778  *
1779  *	Called by vm_mem_init(), to initialize any structures that the pmap
1780  *	system needs to map virtual memory.
1781  */
1782 void
1783 pmap_init(void)
1784 {
1785 	uint64_t mmfr1;
1786 	int i, vmid_bits;
1787 
1788 	/*
1789 	 * Are large page mappings enabled?
1790 	 */
1791 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1792 	if (superpages_enabled) {
1793 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1794 		    ("pmap_init: can't assign to pagesizes[1]"));
1795 		pagesizes[1] = L3C_SIZE;
1796 		KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1797 		    ("pmap_init: can't assign to pagesizes[2]"));
1798 		pagesizes[2] = L2_SIZE;
1799 		if (L1_BLOCKS_SUPPORTED) {
1800 			KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
1801 			    ("pmap_init: can't assign to pagesizes[3]"));
1802 			pagesizes[3] = L1_SIZE;
1803 		}
1804 	}
1805 
1806 	/*
1807 	 * Initialize the ASID allocator.
1808 	 */
1809 	pmap_init_asids(&asids,
1810 	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1811 
1812 	if (has_hyp()) {
1813 		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1814 		vmid_bits = 8;
1815 
1816 		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1817 		    ID_AA64MMFR1_VMIDBits_16)
1818 			vmid_bits = 16;
1819 		pmap_init_asids(&vmids, vmid_bits);
1820 	}
1821 
1822 	/*
1823 	 * Initialize pv chunk lists.
1824 	 */
1825 	for (i = 0; i < PMAP_MEMDOM; i++) {
1826 		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1827 		    MTX_DEF);
1828 		TAILQ_INIT(&pv_chunks[i].pvc_list);
1829 	}
1830 	pmap_init_pv_table();
1831 
1832 	vm_initialized = 1;
1833 }
1834 
/*
 * Superpage mapping/demotion/promotion counters, exported under
 * vm.pmap.{l1,l2c,l2,l3c}.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L1 (1GB/64GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");

SYSCTL_BOOL(_vm_pmap_l1, OID_AUTO, supported, CTLFLAG_RD, &pmap_l1_supported,
    0, "L1 blocks are supported");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L2C (32MB/1GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, "L2 (2MB/32MB) page demotions");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, "L2 (2MB/32MB) page mappings");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, "L2 (2MB/32MB) page promotion failures");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, "L2 (2MB/32MB) page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L3C (64KB/2MB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1889 
1890 /*
1891  * If the given value for "final_only" is false, then any cached intermediate-
1892  * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1893  * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1894  * Otherwise, just the cached final-level entry is invalidated.
1895  */
1896 static __inline void
1897 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1898 {
1899 	if (final_only)
1900 		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1901 	else
1902 		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1903 }
1904 
1905 static __inline void
1906 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1907 {
1908 	if (final_only)
1909 		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
1910 	else
1911 		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
1912 }
1913 
1914 /*
1915  * Invalidates any cached final- and optionally intermediate-level TLB entries
1916  * for the specified virtual address in the given virtual address space.
1917  */
1918 static __inline void
1919 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1920 {
1921 	uint64_t r;
1922 
1923 	PMAP_ASSERT_STAGE1(pmap);
1924 
1925 	dsb(ishst);
1926 	r = TLBI_VA(va);
1927 	if (pmap == kernel_pmap) {
1928 		pmap_s1_invalidate_kernel(r, final_only);
1929 	} else {
1930 		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1931 		pmap_s1_invalidate_user(r, final_only);
1932 	}
1933 	if (pmap_multiple_tlbi) {
1934 		dsb(ish);
1935 		__asm __volatile("tlbi	vale1is, xzr" ::: "memory");
1936 	}
1937 	dsb(ish);
1938 	isb();
1939 }
1940 
1941 static __inline void
1942 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1943 {
1944 	PMAP_ASSERT_STAGE2(pmap);
1945 	MPASS(pmap_stage2_invalidate_range != NULL);
1946 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1947 	    final_only);
1948 }
1949 
1950 static __inline void
1951 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1952 {
1953 	if (pmap->pm_stage == PM_STAGE1)
1954 		pmap_s1_invalidate_page(pmap, va, final_only);
1955 	else
1956 		pmap_s2_invalidate_page(pmap, va, final_only);
1957 }
1958 
1959 /*
1960  * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK
1961  * mappings.  Otherwise, use stride L3_SIZE.
1962  */
1963 static __inline void
1964 pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1965     vm_offset_t stride, bool final_only)
1966 {
1967 	uint64_t end, r, start;
1968 
1969 	PMAP_ASSERT_STAGE1(pmap);
1970 
1971 	dsb(ishst);
1972 	if (pmap == kernel_pmap) {
1973 		start = TLBI_VA(sva);
1974 		end = TLBI_VA(eva);
1975 		for (r = start; r < end; r += TLBI_VA(stride))
1976 			pmap_s1_invalidate_kernel(r, final_only);
1977 	} else {
1978 		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1979 		start |= TLBI_VA(sva);
1980 		end |= TLBI_VA(eva);
1981 		for (r = start; r < end; r += TLBI_VA(stride))
1982 			pmap_s1_invalidate_user(r, final_only);
1983 	}
1984 	if (pmap_multiple_tlbi) {
1985 		dsb(ish);
1986 		__asm __volatile("tlbi	vale1is, xzr" ::: "memory");
1987 	}
1988 	dsb(ish);
1989 	isb();
1990 }
1991 
1992 /*
1993  * Invalidates any cached final- and optionally intermediate-level TLB entries
1994  * for the specified virtual address range in the given virtual address space.
1995  */
1996 static __inline void
1997 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1998     bool final_only)
1999 {
2000 	pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only);
2001 }
2002 
2003 static __inline void
2004 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2005     bool final_only)
2006 {
2007 	PMAP_ASSERT_STAGE2(pmap);
2008 	MPASS(pmap_stage2_invalidate_range != NULL);
2009 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
2010 }
2011 
2012 static __inline void
2013 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2014     bool final_only)
2015 {
2016 	if (pmap->pm_stage == PM_STAGE1)
2017 		pmap_s1_invalidate_range(pmap, sva, eva, final_only);
2018 	else
2019 		pmap_s2_invalidate_range(pmap, sva, eva, final_only);
2020 }
2021 
/*
 * Invalidates all stage 1 TLB entries, for every ASID, on all CPUs in the
 * inner shareable domain.
 */
void
pmap_s1_invalidate_all_kernel(void)
{
	/* Make prior page table stores visible before invalidating. */
	dsb(ishst);
	__asm __volatile("tlbi vmalle1is");
	/* Dummy follow-up TLBI for implementations that require it. */
	if (pmap_multiple_tlbi) {
		dsb(ish);
		__asm __volatile("tlbi	vale1is, xzr" ::: "memory");
	}
	/* Wait for completion, then resynchronize the instruction stream. */
	dsb(ish);
	isb();
}
2034 
2035 /*
2036  * Invalidates all cached intermediate- and final-level TLB entries for the
2037  * given virtual address space.
2038  */
2039 static __inline void
2040 pmap_s1_invalidate_all(pmap_t pmap)
2041 {
2042 	uint64_t r;
2043 
2044 	PMAP_ASSERT_STAGE1(pmap);
2045 
2046 	dsb(ishst);
2047 	if (pmap == kernel_pmap) {
2048 		__asm __volatile("tlbi vmalle1is");
2049 	} else {
2050 		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
2051 		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
2052 	}
2053 	if (pmap_multiple_tlbi) {
2054 		dsb(ish);
2055 		__asm __volatile("tlbi	vale1is, xzr" ::: "memory");
2056 	}
2057 	dsb(ish);
2058 	isb();
2059 }
2060 
2061 static __inline void
2062 pmap_s2_invalidate_all(pmap_t pmap)
2063 {
2064 	PMAP_ASSERT_STAGE2(pmap);
2065 	MPASS(pmap_stage2_invalidate_all != NULL);
2066 	pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
2067 }
2068 
2069 static __inline void
2070 pmap_invalidate_all(pmap_t pmap)
2071 {
2072 	if (pmap->pm_stage == PM_STAGE1)
2073 		pmap_s1_invalidate_all(pmap);
2074 	else
2075 		pmap_s2_invalidate_all(pmap);
2076 }
2077 
2078 /*
2079  *	Routine:	pmap_extract
2080  *	Function:
2081  *		Extract the physical page address associated
2082  *		with the given map/virtual_address pair.
2083  */
2084 vm_paddr_t
2085 pmap_extract(pmap_t pmap, vm_offset_t va)
2086 {
2087 	pt_entry_t *pte, tpte;
2088 	vm_paddr_t pa;
2089 	int lvl;
2090 
2091 	pa = 0;
2092 	PMAP_LOCK(pmap);
2093 	/*
2094 	 * Find the block or page map for this virtual address. pmap_pte
2095 	 * will return either a valid block/page entry, or NULL.
2096 	 */
2097 	pte = pmap_pte(pmap, va, &lvl);
2098 	if (pte != NULL) {
2099 		tpte = pmap_load(pte);
2100 		pa = PTE_TO_PHYS(tpte);
2101 		switch(lvl) {
2102 		case 1:
2103 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
2104 			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
2105 			    ("pmap_extract: Invalid L1 pte found: %lx",
2106 			    tpte & ATTR_DESCR_MASK));
2107 			pa |= (va & L1_OFFSET);
2108 			break;
2109 		case 2:
2110 			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
2111 			    ("pmap_extract: Invalid L2 pte found: %lx",
2112 			    tpte & ATTR_DESCR_MASK));
2113 			pa |= (va & L2_OFFSET);
2114 			break;
2115 		case 3:
2116 			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
2117 			    ("pmap_extract: Invalid L3 pte found: %lx",
2118 			    tpte & ATTR_DESCR_MASK));
2119 			pa |= (va & L3_OFFSET);
2120 			break;
2121 		}
2122 	}
2123 	PMAP_UNLOCK(pmap);
2124 	return (pa);
2125 }
2126 
2127 /*
2128  *	Routine:	pmap_extract_and_hold
2129  *	Function:
2130  *		Atomically extract and hold the physical page
2131  *		with the given pmap and virtual address pair
2132  *		if that mapping permits the given protection.
2133  */
2134 vm_page_t
2135 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
2136 {
2137 	pt_entry_t *pte, tpte;
2138 	vm_offset_t off;
2139 	vm_page_t m;
2140 	int lvl;
2141 	bool use;
2142 
2143 	m = NULL;
2144 	PMAP_LOCK(pmap);
2145 	pte = pmap_pte(pmap, va, &lvl);
2146 	if (pte != NULL) {
2147 		tpte = pmap_load(pte);
2148 
2149 		KASSERT(lvl > 0 && lvl <= 3,
2150 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
2151 		/*
2152 		 * Check that the pte is either a L3 page, or a L1 or L2 block
2153 		 * entry. We can assume L1_BLOCK == L2_BLOCK.
2154 		 */
2155 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
2156 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
2157 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
2158 		     tpte & ATTR_DESCR_MASK));
2159 
2160 		use = false;
2161 		if ((prot & VM_PROT_WRITE) == 0)
2162 			use = true;
2163 		else if (pmap->pm_stage == PM_STAGE1 &&
2164 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
2165 			use = true;
2166 		else if (pmap->pm_stage == PM_STAGE2 &&
2167 		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
2168 		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
2169 			use = true;
2170 
2171 		if (use) {
2172 			switch (lvl) {
2173 			case 1:
2174 				off = va & L1_OFFSET;
2175 				break;
2176 			case 2:
2177 				off = va & L2_OFFSET;
2178 				break;
2179 			case 3:
2180 			default:
2181 				off = 0;
2182 			}
2183 			m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
2184 			if (m != NULL && !vm_page_wire_mapped(m))
2185 				m = NULL;
2186 		}
2187 	}
2188 	PMAP_UNLOCK(pmap);
2189 	return (m);
2190 }
2191 
2192 /*
2193  * Returns true if the entire kernel virtual address range is mapped
2194  */
2195 static bool
2196 pmap_kmapped_range(vm_offset_t sva, vm_size_t size)
2197 {
2198 	pt_entry_t *pte, tpte;
2199 	vm_offset_t eva;
2200 
2201 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS,
2202 	    ("%s: Invalid virtual address: %lx", __func__, sva));
2203 	MPASS(size != 0);
2204 	eva = sva + size - 1;
2205 	KASSERT(eva > sva, ("%s: Size too large: sva %lx, size %lx", __func__,
2206 	    sva, size));
2207 
2208 	while (sva <= eva) {
2209 		pte = pmap_l1(kernel_pmap, sva);
2210 		if (pte == NULL)
2211 			return (false);
2212 		tpte = pmap_load(pte);
2213 		if (tpte == 0)
2214 			return (false);
2215 		if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2216 			sva = (sva & ~L1_OFFSET) + L1_SIZE;
2217 			continue;
2218 		}
2219 
2220 		pte = pmap_l1_to_l2(&tpte, sva);
2221 		tpte = pmap_load(pte);
2222 		if (tpte == 0)
2223 			return (false);
2224 		if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2225 			sva = (sva & ~L2_OFFSET) + L2_SIZE;
2226 			continue;
2227 		}
2228 		pte = pmap_l2_to_l3(&tpte, sva);
2229 		tpte = pmap_load(pte);
2230 		if (tpte == 0)
2231 			return (false);
2232 		MPASS((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_PAGE);
2233 		if ((tpte & ATTR_CONTIGUOUS) == ATTR_CONTIGUOUS)
2234 			sva = (sva & ~L3C_OFFSET) + L3C_SIZE;
2235 		else
2236 			sva = (sva & ~L3_OFFSET) + L3_SIZE;
2237 	}
2238 
2239 	return (true);
2240 }
2241 
2242 /*
2243  * Walks the page tables to translate a kernel virtual address to a
2244  * physical address. Returns true if the kva is valid and stores the
2245  * physical address in pa if it is not NULL.
2246  *
2247  * See the comment above data_abort() for the rationale for specifying
2248  * NO_PERTHREAD_SSP here.
2249  */
2250 bool NO_PERTHREAD_SSP
2251 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
2252 {
2253 	pt_entry_t *pte, tpte;
2254 	register_t intr;
2255 	uint64_t par;
2256 
2257 	/*
2258 	 * Disable interrupts so we don't get interrupted between asking
2259 	 * for address translation, and getting the result back.
2260 	 */
2261 	intr = intr_disable();
2262 	par = arm64_address_translate_s1e1r(va);
2263 	intr_restore(intr);
2264 
2265 	if (PAR_SUCCESS(par)) {
2266 		if (pa != NULL)
2267 			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2268 		return (true);
2269 	}
2270 
2271 	/*
2272 	 * Fall back to walking the page table. The address translation
2273 	 * instruction may fail when the page is in a break-before-make
2274 	 * sequence. As we only clear the valid bit in said sequence we
2275 	 * can walk the page table to find the physical address.
2276 	 */
2277 
2278 	pte = pmap_l1(kernel_pmap, va);
2279 	if (pte == NULL)
2280 		return (false);
2281 
2282 	/*
2283 	 * A concurrent pmap_update_entry() will clear the entry's valid bit
2284 	 * but leave the rest of the entry unchanged.  Therefore, we treat a
2285 	 * non-zero entry as being valid, and we ignore the valid bit when
2286 	 * determining whether the entry maps a block, page, or table.
2287 	 */
2288 	tpte = pmap_load(pte);
2289 	if (tpte == 0)
2290 		return (false);
2291 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2292 		if (pa != NULL)
2293 			*pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2294 		return (true);
2295 	}
2296 	pte = pmap_l1_to_l2(&tpte, va);
2297 	tpte = pmap_load(pte);
2298 	if (tpte == 0)
2299 		return (false);
2300 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2301 		if (pa != NULL)
2302 			*pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2303 		return (true);
2304 	}
2305 	pte = pmap_l2_to_l3(&tpte, va);
2306 	tpte = pmap_load(pte);
2307 	if (tpte == 0)
2308 		return (false);
2309 	if (pa != NULL)
2310 		*pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2311 	return (true);
2312 }
2313 
2314 /*
2315  *	Routine:	pmap_kextract
2316  *	Function:
2317  *		Extract the physical page address associated with the given kernel
2318  *		virtual address.
2319  */
2320 vm_paddr_t
2321 pmap_kextract(vm_offset_t va)
2322 {
2323 	vm_paddr_t pa;
2324 
2325 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2326 		return (DMAP_TO_PHYS(va));
2327 
2328 	if (pmap_klookup(va, &pa) == false)
2329 		return (0);
2330 	return (pa);
2331 }
2332 
2333 /***************************************************
2334  * Low level mapping routines.....
2335  ***************************************************/
2336 
/*
 * Map the physical range [pa, pa + size) into the kernel address space at
 * "sva" with memory attribute index "mode".  The mapping is read/write and
 * non-executable.  L2 block and L3 contiguous (L3C) mappings are used when
 * alignment and size permit.
 */
void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
{
	pd_entry_t *pde;
	pt_entry_t attr, old_l3e, *pte;
	vm_offset_t va;
	vm_page_t mpte;
	int error, lvl;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter: Mapping is not page-sized"));

	attr = ATTR_AF | pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) |
	    ATTR_S1_XN | ATTR_KERN_GP | ATTR_S1_IDX(mode);
	/* Accumulates old PTE bits to pick an invalidation strategy below. */
	old_l3e = 0;
	va = sva;
	while (size != 0) {
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));

		/*
		 * If we have an aligned, contiguous chunk of L2_SIZE, try
		 * to create an L2_BLOCK mapping.
		 */
		if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
		    (pa & L2_OFFSET) == 0 && vm_initialized) {
			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
			KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
			    ("pmap_kenter: Unexpected mapping"));
			PMAP_LOCK(kernel_pmap);
			/*
			 * Stash the displaced L3 page table page --
			 * presumably for reuse on a later demotion; see
			 * pmap_insert_pt_page().
			 */
			error = pmap_insert_pt_page(kernel_pmap, mpte, false,
			    false);
			if (error == 0) {
				attr &= ~ATTR_CONTIGUOUS;

				/*
				 * Although the page table page "mpte" should
				 * be devoid of mappings, the TLB might hold
				 * intermediate entries that reference it, so
				 * we perform a single-page invalidation.
				 */
				pmap_update_entry(kernel_pmap, pde,
				    PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
				    PAGE_SIZE);
			}
			PMAP_UNLOCK(kernel_pmap);
			/* On failure, fall through to L3 mappings instead. */
			if (error == 0) {
				va += L2_SIZE;
				pa += L2_SIZE;
				size -= L2_SIZE;
				continue;
			}
		}

		/*
		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
		 * L3 pages, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 */
		if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
			if (size >= L3C_SIZE)
				attr |= ATTR_CONTIGUOUS;
			else
				attr &= ~ATTR_CONTIGUOUS;
		}

		pte = pmap_l2_to_l3(pde, va);
		old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
		    L3_PAGE);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	/* Only invalidate if at least one replaced entry was valid. */
	if ((old_l3e & ATTR_DESCR_VALID) != 0)
		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
	else {
		/*
		 * Because the old entries were invalid and the new mappings
		 * are not executable, an isb is not required.
		 */
		dsb(ishst);
	}
}
2427 
/*
 * Convenience wrapper around pmap_kenter() that maps the range with the
 * device memory attribute.
 */
void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{

	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
}
2434 
2435 /*
2436  * Remove a page from the kernel pagetables.
2437  */
2438 void
2439 pmap_kremove(vm_offset_t va)
2440 {
2441 	pt_entry_t *pte;
2442 
2443 	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2444 	KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2445 	    ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2446 	pmap_clear(pte);
2447 	pmap_s1_invalidate_page(kernel_pmap, va, true);
2448 }
2449 
2450 /*
2451  * Remove the specified range of mappings from the kernel address space.
2452  *
2453  * Should only be applied to mappings that were created by pmap_kenter() or
2454  * pmap_kenter_device().  Nothing about this function is actually specific
2455  * to device mappings.
2456  */
2457 void
2458 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2459 {
2460 	pt_entry_t *ptep, *ptep_end;
2461 	vm_offset_t va;
2462 	int lvl;
2463 
2464 	KASSERT((sva & L3_OFFSET) == 0,
2465 	    ("pmap_kremove_device: Invalid virtual address"));
2466 	KASSERT((size & PAGE_MASK) == 0,
2467 	    ("pmap_kremove_device: Mapping is not page-sized"));
2468 
2469 	va = sva;
2470 	while (size != 0) {
2471 		ptep = pmap_pte(kernel_pmap, va, &lvl);
2472 		KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2473 		switch (lvl) {
2474 		case 2:
2475 			KASSERT((va & L2_OFFSET) == 0,
2476 			    ("Unaligned virtual address"));
2477 			KASSERT(size >= L2_SIZE, ("Insufficient size"));
2478 
2479 			if (va != sva) {
2480 				pmap_s1_invalidate_range(kernel_pmap, sva, va,
2481 				    true);
2482 			}
2483 			pmap_clear(ptep);
2484 			pmap_s1_invalidate_page(kernel_pmap, va, true);
2485 			PMAP_LOCK(kernel_pmap);
2486 			pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2487 			PMAP_UNLOCK(kernel_pmap);
2488 
2489 			va += L2_SIZE;
2490 			sva = va;
2491 			size -= L2_SIZE;
2492 			break;
2493 		case 3:
2494 			if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2495 				KASSERT((va & L3C_OFFSET) == 0,
2496 				    ("Unaligned L3C virtual address"));
2497 				KASSERT(size >= L3C_SIZE,
2498 				    ("Insufficient L3C size"));
2499 
2500 				ptep_end = ptep + L3C_ENTRIES;
2501 				for (; ptep < ptep_end; ptep++)
2502 					pmap_clear(ptep);
2503 
2504 				va += L3C_SIZE;
2505 				size -= L3C_SIZE;
2506 				break;
2507 			}
2508 			pmap_clear(ptep);
2509 
2510 			va += PAGE_SIZE;
2511 			size -= PAGE_SIZE;
2512 			break;
2513 		default:
2514 			__assert_unreachable();
2515 			break;
2516 		}
2517 	}
2518 	if (va != sva)
2519 		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2520 }
2521 
2522 /*
2523  *	Used to map a range of physical addresses into kernel
2524  *	virtual address space.
2525  *
2526  *	The value passed in '*virt' is a suggested virtual address for
2527  *	the mapping. Architectures which can support a direct-mapped
2528  *	physical to virtual region can return the appropriate address
2529  *	within that region, leaving '*virt' unchanged. Other
2530  *	architectures should map the pages starting at '*virt' and
2531  *	update '*virt' with the first usable address after the mapped
2532  *	region.
2533  */
2534 vm_offset_t
2535 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2536 {
2537 	return PHYS_TO_DMAP(start);
2538 }
2539 
2540 /*
2541  * Add a list of wired pages to the kva
2542  * this routine is only used for temporary
2543  * kernel mappings that do not need to have
2544  * page modification or references recorded.
2545  * Note that old mappings are simply written
2546  * over.  The page *must* be wired.
2547  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2548  */
2549 void
2550 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2551 {
2552 	pd_entry_t *pde;
2553 	pt_entry_t attr, old_l3e, *pte;
2554 	vm_offset_t va;
2555 	vm_page_t m;
2556 	int i, lvl;
2557 
2558 	old_l3e = 0;
2559 	va = sva;
2560 	for (i = 0; i < count; i++) {
2561 		pde = pmap_pde(kernel_pmap, va, &lvl);
2562 		KASSERT(pde != NULL,
2563 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2564 		KASSERT(lvl == 2,
2565 		    ("pmap_qenter: Invalid level %d", lvl));
2566 
2567 		m = ma[i];
2568 		attr = ATTR_AF | pmap_sh_attr |
2569 		    ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2570 		    ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2571 		pte = pmap_l2_to_l3(pde, va);
2572 		old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2573 
2574 		va += L3_SIZE;
2575 	}
2576 	if ((old_l3e & ATTR_DESCR_VALID) != 0)
2577 		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2578 	else {
2579 		/*
2580 		 * Because the old entries were invalid and the new mappings
2581 		 * are not executable, an isb is not required.
2582 		 */
2583 		dsb(ishst);
2584 	}
2585 }
2586 
2587 /*
2588  * This routine tears out page mappings from the
2589  * kernel -- it is meant only for temporary mappings.
2590  */
2591 void
2592 pmap_qremove(vm_offset_t sva, int count)
2593 {
2594 	pt_entry_t *pte;
2595 	vm_offset_t va;
2596 
2597 	KASSERT(ADDR_IS_CANONICAL(sva),
2598 	    ("%s: Address not in canonical form: %lx", __func__, sva));
2599 	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2600 
2601 	va = sva;
2602 	while (count-- > 0) {
2603 		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2604 		if (pte != NULL) {
2605 			pmap_clear(pte);
2606 		}
2607 
2608 		va += PAGE_SIZE;
2609 	}
2610 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2611 }
2612 
2613 /***************************************************
2614  * Page table page management routines.....
2615  ***************************************************/
2616 /*
2617  * Schedule the specified unused page table page to be freed.  Specifically,
2618  * add the page to the specified list of pages that will be released to the
2619  * physical memory manager after the TLB has been updated.
2620  */
2621 static __inline void
2622 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2623 {
2624 
2625 	if (set_PG_ZERO)
2626 		m->flags |= PG_ZERO;
2627 	else
2628 		m->flags &= ~PG_ZERO;
2629 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2630 }
2631 
2632 /*
2633  * Decrements a page table page's reference count, which is used to record the
2634  * number of valid page table entries within the page.  If the reference count
2635  * drops to zero, then the page table page is unmapped.  Returns true if the
2636  * page table page was unmapped and false otherwise.
2637  */
2638 static inline bool
2639 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2640 {
2641 
2642 	--m->ref_count;
2643 	if (m->ref_count == 0) {
2644 		_pmap_unwire_l3(pmap, va, m, free);
2645 		return (true);
2646 	} else
2647 		return (false);
2648 }
2649 
/*
 * Unmap the page table page "m" (whose pindex encodes its level), release
 * the reference it held on its parent page table page, and queue it for
 * deferred freeing.  Called only when the page's reference count has
 * reached zero.
 */
static void
_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * unmap the page table page
	 */
	if (m->pindex >= (NUL2E + NUL1E)) {
		/* l1 page: clear its L0 entry. */
		pd_entry_t *l0;

		l0 = pmap_l0(pmap, va);
		pmap_clear(l0);
	} else if (m->pindex >= NUL2E) {
		/* l2 page: clear its L1 entry. */
		pd_entry_t *l1;

		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
	} else {
		/* l3 page: clear its L2 entry. */
		pd_entry_t *l2;

		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	/* Recursively drop the parent page's reference; it may free too. */
	if (m->pindex < NUL2E) {
		/* We just released an l3, unhold the matching l2 */
		pd_entry_t *l1, tl1;
		vm_page_t l2pg;

		l1 = pmap_l1(pmap, va);
		tl1 = pmap_load(l1);
		l2pg = PTE_TO_VM_PAGE(tl1);
		pmap_unwire_l3(pmap, va, l2pg, free);
	} else if (m->pindex < (NUL2E + NUL1E)) {
		/* We just released an l2, unhold the matching l1 */
		pd_entry_t *l0, tl0;
		vm_page_t l1pg;

		l0 = pmap_l0(pmap, va);
		tl0 = pmap_load(l0);
		l1pg = PTE_TO_VM_PAGE(tl0);
		pmap_unwire_l3(pmap, va, l1pg, free);
	}
	/* Discard intermediate-level TLB entries as well (final_only=false). */
	pmap_invalidate_page(pmap, va, false);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, true);
}
2705 
2706 /*
2707  * After removing a page table entry, this routine is used to
2708  * conditionally free the page, and manage the reference count.
2709  */
2710 static int
2711 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2712     struct spglist *free)
2713 {
2714 	vm_page_t mpte;
2715 
2716 	KASSERT(ADDR_IS_CANONICAL(va),
2717 	    ("%s: Address not in canonical form: %lx", __func__, va));
2718 	if (ADDR_IS_KERNEL(va))
2719 		return (0);
2720 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2721 	mpte = PTE_TO_VM_PAGE(ptepde);
2722 	return (pmap_unwire_l3(pmap, va, mpte, free));
2723 }
2724 
2725 /*
2726  * Release a page table page reference after a failed attempt to create a
2727  * mapping.
2728  */
2729 static void
2730 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2731 {
2732 	struct spglist free;
2733 
2734 	SLIST_INIT(&free);
2735 	if (pmap_unwire_l3(pmap, va, mpte, &free))
2736 		vm_page_free_pages_toq(&free, true);
2737 }
2738 
/*
 * Initialize pmap0, the pmap for proc0/the kernel bootstrap: it adopts the
 * translation table currently installed in TTBR0_EL1 and the ASID reserved
 * for PID 0 rather than allocating fresh ones.
 */
void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	/* Reuse the L0 table already active in TTBR0_EL1. */
	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
	TAILQ_INIT(&pmap->pm_pvchunk);
	vm_radix_init(&pmap->pm_root);
	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
	pmap->pm_stage = PM_STAGE1;
	pmap->pm_levels = 4;
	pmap->pm_ttbr = pmap->pm_l0_paddr;
	pmap->pm_asid_set = &asids;
	pmap->pm_bti = NULL;

	PCPU_SET(curpmap, pmap);
}
2758 
/*
 * Initialize a pmap for the given translation stage (PM_STAGE1 for regular
 * address spaces, PM_STAGE2 for virtual machine guests) with a page table
 * of "levels" levels (3 or 4).  Always returns 1.
 */
int
pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
{
	vm_page_t m;

	/*
	 * allocate the l0 page
	 */
	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);

	TAILQ_INIT(&pmap->pm_pvchunk);
	vm_radix_init(&pmap->pm_root);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);

	MPASS(levels == 3 || levels == 4);
	pmap->pm_levels = levels;
	pmap->pm_stage = stage;
	pmap->pm_bti = NULL;
	switch (stage) {
	case PM_STAGE1:
		pmap->pm_asid_set = &asids;
		/* BTI (branch target identification) state, if supported. */
		if (pmap_bti_support) {
			pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
			    M_ZERO | M_WAITOK);
			rangeset_init(pmap->pm_bti, bti_dup_range,
			    bti_free_range, pmap, M_NOWAIT);
		}
		break;
	case PM_STAGE2:
		/* Stage 2 pmaps draw from the VMID space instead of ASIDs. */
		pmap->pm_asid_set = &vmids;
		break;
	default:
		panic("%s: Invalid pmap type %d", __func__, stage);
		break;
	}

	/* XXX Temporarily disable deferred ASID allocation. */
	pmap_alloc_asid(pmap);

	/*
	 * Allocate the level 1 entry to use as the root. This will increase
	 * the refcount on the level 1 page so it won't be removed until
	 * pmap_release() is called.
	 */
	if (pmap->pm_levels == 3) {
		PMAP_LOCK(pmap);
		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
		PMAP_UNLOCK(pmap);
	}
	/* For 3-level pmaps, "m" is now the L1 page; otherwise the L0 page. */
	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);

	return (1);
}
2816 
/*
 * Initialize a regular (stage 1, 4-level) user pmap.
 */
int
pmap_pinit(pmap_t pmap)
{

	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
}
2823 
2824 /*
2825  * This routine is called if the desired page table page does not exist.
2826  *
2827  * If page table page allocation fails, this routine may sleep before
2828  * returning NULL.  It sleeps only if a lock pointer was given.
2829  *
2830  * Note: If a page allocation fails at page table level two or three,
2831  * one or two pages may be held during the wait, only to be released
2832  * afterwards.  This conservative approach is easily argued to avoid
2833  * race conditions.
2834  */
2835 static vm_page_t
2836 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2837 {
2838 	vm_page_t m, l1pg, l2pg;
2839 
2840 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2841 
2842 	/*
2843 	 * Allocate a page table page.
2844 	 */
2845 	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2846 		if (lockp != NULL) {
2847 			RELEASE_PV_LIST_LOCK(lockp);
2848 			PMAP_UNLOCK(pmap);
2849 			vm_wait(NULL);
2850 			PMAP_LOCK(pmap);
2851 		}
2852 
2853 		/*
2854 		 * Indicate the need to retry.  While waiting, the page table
2855 		 * page may have been allocated.
2856 		 */
2857 		return (NULL);
2858 	}
2859 	m->pindex = ptepindex;
2860 
2861 	/*
2862 	 * Because of AArch64's weak memory consistency model, we must have a
2863 	 * barrier here to ensure that the stores for zeroing "m", whether by
2864 	 * pmap_zero_page() or an earlier function, are visible before adding
2865 	 * "m" to the page table.  Otherwise, a page table walk by another
2866 	 * processor's MMU could see the mapping to "m" and a stale, non-zero
2867 	 * PTE within "m".
2868 	 */
2869 	dmb(ishst);
2870 
2871 	/*
2872 	 * Map the pagetable page into the process address space, if
2873 	 * it isn't already there.
2874 	 */
2875 
2876 	if (ptepindex >= (NUL2E + NUL1E)) {
2877 		pd_entry_t *l0p, l0e;
2878 		vm_pindex_t l0index;
2879 
2880 		l0index = ptepindex - (NUL2E + NUL1E);
2881 		l0p = &pmap->pm_l0[l0index];
2882 		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2883 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2884 		l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2885 
2886 		/*
2887 		 * Mark all kernel memory as not accessible from userspace
2888 		 * and userspace memory as not executable from the kernel.
2889 		 * This has been done for the bootstrap L0 entries in
2890 		 * locore.S.
2891 		 */
2892 		if (pmap == kernel_pmap)
2893 			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2894 		else
2895 			l0e |= TATTR_PXN_TABLE;
2896 		pmap_store(l0p, l0e);
2897 	} else if (ptepindex >= NUL2E) {
2898 		vm_pindex_t l0index, l1index;
2899 		pd_entry_t *l0, *l1;
2900 		pd_entry_t tl0;
2901 
2902 		l1index = ptepindex - NUL2E;
2903 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2904 
2905 		l0 = &pmap->pm_l0[l0index];
2906 		tl0 = pmap_load(l0);
2907 		if (tl0 == 0) {
2908 			/* recurse for allocating page dir */
2909 			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2910 			    lockp) == NULL) {
2911 				vm_page_unwire_noq(m);
2912 				vm_page_free_zero(m);
2913 				return (NULL);
2914 			}
2915 		} else {
2916 			l1pg = PTE_TO_VM_PAGE(tl0);
2917 			l1pg->ref_count++;
2918 		}
2919 
2920 		l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2921 		l1 = &l1[ptepindex & Ln_ADDR_MASK];
2922 		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2923 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2924 		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2925 	} else {
2926 		vm_pindex_t l0index, l1index;
2927 		pd_entry_t *l0, *l1, *l2;
2928 		pd_entry_t tl0, tl1;
2929 
2930 		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2931 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2932 
2933 		l0 = &pmap->pm_l0[l0index];
2934 		tl0 = pmap_load(l0);
2935 		if (tl0 == 0) {
2936 			/* recurse for allocating page dir */
2937 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2938 			    lockp) == NULL) {
2939 				vm_page_unwire_noq(m);
2940 				vm_page_free_zero(m);
2941 				return (NULL);
2942 			}
2943 			tl0 = pmap_load(l0);
2944 			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2945 			l1 = &l1[l1index & Ln_ADDR_MASK];
2946 		} else {
2947 			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2948 			l1 = &l1[l1index & Ln_ADDR_MASK];
2949 			tl1 = pmap_load(l1);
2950 			if (tl1 == 0) {
2951 				/* recurse for allocating page dir */
2952 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2953 				    lockp) == NULL) {
2954 					vm_page_unwire_noq(m);
2955 					vm_page_free_zero(m);
2956 					return (NULL);
2957 				}
2958 			} else {
2959 				l2pg = PTE_TO_VM_PAGE(tl1);
2960 				l2pg->ref_count++;
2961 			}
2962 		}
2963 
2964 		l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2965 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
2966 		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2967 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2968 		pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
2969 	}
2970 
2971 	pmap_resident_count_inc(pmap, 1);
2972 
2973 	return (m);
2974 }
2975 
/*
 * Return a pointer to the L2 entry for "va", allocating an L2 page table
 * page for user addresses if one is not already mapped.  On success,
 * "*l2pgp" is set to the page backing the L2 table (with an added
 * reference) for user addresses, or to NULL for kernel addresses.
 * Returns NULL only when "lockp" is NULL and the allocation fails.
 */
static pd_entry_t *
pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
    struct rwlock **lockp)
{
	pd_entry_t *l1, *l2;
	vm_page_t l2pg;
	vm_pindex_t l2pindex;

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

retry:
	l1 = pmap_l1(pmap, va);
	if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
		l2 = pmap_l1_to_l2(l1, va);
		if (ADDR_IS_USER(va)) {
			/* Add a reference to the L2 page. */
			l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
			l2pg->ref_count++;
		} else
			l2pg = NULL;
	} else if (ADDR_IS_USER(va)) {
		/* Allocate a L2 page. */
		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
		if (l2pg == NULL) {
			/*
			 * With a non-NULL lockp the failed allocation may
			 * have dropped locks to reclaim memory, so retry
			 * the lookup; otherwise the failure is final.
			 */
			if (lockp != NULL)
				goto retry;
			else
				return (NULL);
		}
		l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
		l2 = &l2[pmap_l2_index(va)];
	} else
		panic("pmap_alloc_l2: missing page table page for va %#lx",
		    va);
	*l2pgp = l2pg;
	return (l2);
}
3015 
/*
 * Return, with an added reference, the L3 page table page that maps "va",
 * allocating it (and any missing intermediate levels) if necessary.
 * Returns NULL only when "lockp" is NULL and the allocation fails.
 */
static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *pde, tpde;
#ifdef INVARIANTS
	pt_entry_t *pte;
#endif
	vm_page_t m;
	int lvl;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_l2_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	pde = pmap_pde(pmap, va, &lvl);

	/*
	 * If the page table page is mapped, we just increment the hold count,
	 * and activate it. If we get a level 2 pde it will point to a level 3
	 * table.
	 */
	switch (lvl) {
	case -1:
		/* No valid entry at any level; fall through to allocate. */
		break;
	case 0:
#ifdef INVARIANTS
		pte = pmap_l0_to_l1(pde, va);
		KASSERT(pmap_load(pte) == 0,
		    ("pmap_alloc_l3: TODO: l0 superpages"));
#endif
		break;
	case 1:
#ifdef INVARIANTS
		pte = pmap_l1_to_l2(pde, va);
		KASSERT(pmap_load(pte) == 0,
		    ("pmap_alloc_l3: TODO: l1 superpages"));
#endif
		break;
	case 2:
		tpde = pmap_load(pde);
		if (tpde != 0) {
			/* The L3 page already exists; take a reference. */
			m = PTE_TO_VM_PAGE(tpde);
			m->ref_count++;
			return (m);
		}
		break;
	default:
		panic("pmap_alloc_l3: Invalid level %d", lvl);
	}

	/*
	 * Here if the pte page isn't mapped, or if it has been deallocated.
	 */
	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
	if (m == NULL && lockp != NULL)
		goto retry;

	return (m);
}
3080 
3081 /***************************************************
3082  * Pmap allocation/deallocation routines.
3083  ***************************************************/
3084 
3085 /*
3086  * Release any resources held by the given physical map.
3087  * Called when a pmap initialized by pmap_pinit is being released.
3088  * Should only be called if the map contains no valid mappings.
3089  */
3090 void
3091 pmap_release(pmap_t pmap)
3092 {
3093 	bool rv __diagused;
3094 	struct spglist freelist;
3095 	struct asid_set *set;
3096 	vm_page_t m;
3097 	int asid;
3098 
3099 	if (pmap->pm_levels != 4) {
3100 		PMAP_ASSERT_STAGE2(pmap);
3101 		KASSERT(pmap->pm_stats.resident_count == 1,
3102 		    ("pmap_release: pmap resident count %ld != 0",
3103 		    pmap->pm_stats.resident_count));
3104 		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
3105 		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
3106 
3107 		SLIST_INIT(&freelist);
3108 		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
3109 		PMAP_LOCK(pmap);
3110 		rv = pmap_unwire_l3(pmap, 0, m, &freelist);
3111 		PMAP_UNLOCK(pmap);
3112 		MPASS(rv == true);
3113 		vm_page_free_pages_toq(&freelist, true);
3114 	}
3115 
3116 	KASSERT(pmap->pm_stats.resident_count == 0,
3117 	    ("pmap_release: pmap resident count %ld != 0",
3118 	    pmap->pm_stats.resident_count));
3119 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
3120 	    ("pmap_release: pmap has reserved page table page(s)"));
3121 
3122 	set = pmap->pm_asid_set;
3123 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
3124 
3125 	/*
3126 	 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate
3127 	 * the entries when removing them so rely on a later tlb invalidation.
3128 	 * this will happen when updating the VMID generation. Because of this
3129 	 * we don't reuse VMIDs within a generation.
3130 	 */
3131 	if (pmap->pm_stage == PM_STAGE1) {
3132 		mtx_lock_spin(&set->asid_set_mutex);
3133 		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
3134 			asid = COOKIE_TO_ASID(pmap->pm_cookie);
3135 			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
3136 			    asid < set->asid_set_size,
3137 			    ("pmap_release: pmap cookie has out-of-range asid"));
3138 			bit_clear(set->asid_set, asid);
3139 		}
3140 		mtx_unlock_spin(&set->asid_set_mutex);
3141 
3142 		if (pmap->pm_bti != NULL) {
3143 			rangeset_fini(pmap->pm_bti);
3144 			free(pmap->pm_bti, M_DEVBUF);
3145 		}
3146 	}
3147 
3148 	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
3149 	vm_page_unwire_noq(m);
3150 	vm_page_free_zero(m);
3151 }
3152 
3153 static int
3154 kvm_size(SYSCTL_HANDLER_ARGS)
3155 {
3156 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
3157 
3158 	return sysctl_handle_long(oidp, &ksize, 0, req);
3159 }
3160 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3161     0, 0, kvm_size, "LU",
3162     "Size of KVM");
3163 
3164 static int
3165 kvm_free(SYSCTL_HANDLER_ARGS)
3166 {
3167 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
3168 
3169 	return sysctl_handle_long(oidp, &kfree, 0, req);
3170 }
3171 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3172     0, 0, kvm_free, "LU",
3173     "Amount of KVM free");
3174 
3175 /*
3176  * grow the number of kernel page table entries, if needed
3177  */
static int
pmap_growkernel_nopanic(vm_offset_t addr)
{
	vm_page_t nkpg;
	pd_entry_t *l0, *l1, *l2;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	/* Grow in whole L2 (block-sized) steps, clamped to the map limit. */
	addr = roundup2(addr, L2_SIZE);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	if (kernel_vm_end < addr) {
		kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
		kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
	}
	while (kernel_vm_end < addr) {
		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
		KASSERT(pmap_load(l0) != 0,
		    ("pmap_growkernel: No level 0 kernel entry"));

		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
			    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				return (KERN_RESOURCE_SHORTAGE);
			nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
			/* See the dmb() in _pmap_alloc_l3(). */
			dmb(ishst);
			pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
			continue; /* try again */
		}
		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
		if (pmap_load(l2) != 0) {
			/* Already mapped; advance to the next L2 boundary. */
			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		/* Allocate and install a zeroed L3 page table page. */
		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
		    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
		if (nkpg == NULL)
			return (KERN_RESOURCE_SHORTAGE);
		nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
		/* See the dmb() in _pmap_alloc_l3(). */
		dmb(ishst);
		pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);

		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
	return (KERN_SUCCESS);
}
3238 
3239 int
3240 pmap_growkernel(vm_offset_t addr)
3241 {
3242 	int rv;
3243 
3244 	rv = pmap_growkernel_nopanic(addr);
3245 	if (rv != KERN_SUCCESS && pmap_growkernel_panic)
3246 		panic("pmap_growkernel: no memory to grow kernel");
3247 	return (rv);
3248 }
3249 
3250 /***************************************************
3251  * page management routines.
3252  ***************************************************/
3253 
/*
 * Bitmap template describing a completely free pv chunk: full fields use
 * PC_FREEN and the final (possibly partial) field uses PC_FREEL.
 */
static const uint64_t pc_freemask[_NPCM] = {
	[0 ... _NPCM - 2] = PC_FREEN,
	[_NPCM - 1] = PC_FREEL
};
3258 
#ifdef PV_STATS
/* PV entry/chunk statistics, exported under vm.pmap when built with PV_STATS. */
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif
3283 
3284 /*
3285  * We are in a serious low memory condition.  Resort to
3286  * drastic measures to free some pages so we can allocate
3287  * another pv entry chunk.
3288  *
3289  * Returns NULL if PV entries were reclaimed from the specified pmap.
3290  *
3291  * We do not, however, unmap 2mpages because subsequent accesses will
3292  * allocate per-page pv entries until repromotion occurs, thereby
3293  * exacerbating the shortage of free pv entries.
3294  */
static vm_page_t
reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
{
	struct pv_chunks_list *pvc;
	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
	struct md_page *pvh;
	pd_entry_t *pde;
	pmap_t next_pmap, pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint64_t inuse;
	int bit, field, freed, lvl;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));

	pmap = NULL;
	m_pc = NULL;
	SLIST_INIT(&free);
	/*
	 * Two markers (chunks with a NULL pc_pmap) bracket the portion of
	 * the LRU list this invocation scans, so that the list may be
	 * safely mutated while the locks are dropped.
	 */
	bzero(&pc_marker_b, sizeof(pc_marker_b));
	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
	pc_marker = (struct pv_chunk *)&pc_marker_b;
	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;

	pvc = &pv_chunks[domain];
	mtx_lock(&pvc->pvc_lock);
	pvc->active_reclaims++;
	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
	    SLIST_EMPTY(&free)) {
		next_pmap = pc->pc_pmap;
		if (next_pmap == NULL) {
			/*
			 * The next chunk is a marker.  However, it is
			 * not our marker, so active_reclaims must be
			 * > 1.  Consequently, the next_chunk code
			 * will not rotate the pv_chunks list.
			 */
			goto next_chunk;
		}
		mtx_unlock(&pvc->pvc_lock);

		/*
		 * A pv_chunk can only be removed from the pc_lru list
		 * when both pvc->pvc_lock is owned and the
		 * corresponding pmap is locked.
		 */
		if (pmap != next_pmap) {
			if (pmap != NULL && pmap != locked_pmap)
				PMAP_UNLOCK(pmap);
			pmap = next_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap) {
				RELEASE_PV_LIST_LOCK(lockp);
				PMAP_LOCK(pmap);
				mtx_lock(&pvc->pvc_lock);
				continue;
			} else if (pmap != locked_pmap) {
				if (PMAP_TRYLOCK(pmap)) {
					mtx_lock(&pvc->pvc_lock);
					continue;
				} else {
					pmap = NULL; /* pmap is not locked */
					mtx_lock(&pvc->pvc_lock);
					pc = TAILQ_NEXT(pc_marker, pc_lru);
					if (pc == NULL ||
					    pc->pc_pmap != next_pmap)
						continue;
					goto next_chunk;
				}
			}
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/* Iterate over the in-use (zero) bits in the map. */
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = ffsl(inuse) - 1;
				pv = &pc->pc_pventry[field * 64 + bit];
				va = pv->pv_va;
				pde = pmap_pde(pmap, va, &lvl);
				if (lvl != 2)
					continue;
				pte = pmap_l2_to_l3(pde, va);
				tpte = pmap_load(pte);
				if ((tpte & ATTR_SW_WIRED) != 0)
					continue;
				/* Break up a contiguous (L3C) mapping first. */
				if ((tpte & ATTR_CONTIGUOUS) != 0)
					(void)pmap_demote_l3c(pmap, pte, va);
				tpte = pmap_load_clear(pte);
				m = PTE_TO_VM_PAGE(tpte);
				if (pmap_pte_dirty(pmap, tpte))
					vm_page_dirty(m);
				if ((tpte & ATTR_AF) != 0) {
					pmap_s1_invalidate_page(pmap, va, true);
					vm_page_aflag_set(m, PGA_REFERENCED);
				}
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = page_to_pvh(m);
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
				freed++;
			}
		}
		if (freed == 0) {
			mtx_lock(&pvc->pvc_lock);
			goto next_chunk;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap_resident_count_dec(pmap, freed);
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		if (pc_is_free(pc)) {
			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
			dump_drop_page(m_pc->phys_addr);
			mtx_lock(&pvc->pvc_lock);
			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
			break;
		}
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		mtx_lock(&pvc->pvc_lock);
		/* One freed pv entry in locked_pmap is sufficient. */
		if (pmap == locked_pmap)
			break;

next_chunk:
		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
		if (pvc->active_reclaims == 1 && pmap != NULL) {
			/*
			 * Rotate the pv chunks list so that we do not
			 * scan the same pv chunks that could not be
			 * freed (because they contained a wired
			 * and/or superpage mapping) on every
			 * invocation of reclaim_pv_chunk().
			 */
			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
				MPASS(pc->pc_pmap != NULL);
				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
			}
		}
	}
	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
	pvc->active_reclaims--;
	mtx_unlock(&pvc->pvc_lock);
	if (pmap != NULL && pmap != locked_pmap)
		PMAP_UNLOCK(pmap);
	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->ref_count = 1;
	}
	vm_page_free_pages_toq(&free, true);
	return (m_pc);
}
3476 
3477 static vm_page_t
3478 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3479 {
3480 	vm_page_t m;
3481 	int i, domain;
3482 
3483 	domain = PCPU_GET(domain);
3484 	for (i = 0; i < vm_ndomains; i++) {
3485 		m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3486 		if (m != NULL)
3487 			break;
3488 		domain = (domain + 1) % vm_ndomains;
3489 	}
3490 
3491 	return (m);
3492 }
3493 
3494 /*
3495  * free the pv_entry back to the free list
3496  */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	/* Compute the entry's bit position within the chunk's free bitmap. */
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (!pc_is_free(pc)) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	/* The chunk no longer holds any entries; return it to the system. */
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}
3523 
/*
 * Free the page backing a pv chunk that has already been removed from its
 * per-domain pc_lru list.
 */
static void
free_pv_chunk_dequeued(struct pv_chunk *pc)
{
	vm_page_t m;

	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire_noq(m);
	vm_page_free(m);
}
3538 
/*
 * Remove a pv chunk from its per-domain LRU list and free its backing
 * page.
 */
static void
free_pv_chunk(struct pv_chunk *pc)
{
	struct pv_chunks_list *pvc;

	pvc = &pv_chunks[pc_to_domain(pc)];
	mtx_lock(&pvc->pvc_lock);
	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
	mtx_unlock(&pvc->pvc_lock);
	free_pv_chunk_dequeued(pc);
}
3550 
/*
 * Free a batch of pv chunks, one list per memory domain.  All chunks are
 * first dequeued from their per-domain LRU lists (taking each domain lock
 * only once) and then freed without any locks held.
 */
static void
free_pv_chunk_batch(struct pv_chunklist *batch)
{
	struct pv_chunks_list *pvc;
	struct pv_chunk *pc, *npc;
	int i;

	for (i = 0; i < vm_ndomains; i++) {
		if (TAILQ_EMPTY(&batch[i]))
			continue;
		pvc = &pv_chunks[i];
		mtx_lock(&pvc->pvc_lock);
		TAILQ_FOREACH(pc, &batch[i], pc_list) {
			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
		}
		mtx_unlock(&pvc->pvc_lock);
	}

	for (i = 0; i < vm_ndomains; i++) {
		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
			free_pv_chunk_dequeued(pc);
		}
	}
}
3575 
3576 /*
3577  * Returns a new PV entry, allocating a new PV chunk from the system when
3578  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3579  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3580  * returned.
3581  *
3582  * The given PV list lock may be released.
3583  */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	struct pv_chunks_list *pvc;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	/* First try to carve an entry out of an existing chunk. */
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc_is_full(pc)) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			/* Reclamation is disabled; report failure. */
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
	pc->pc_map[0] &= ~1ul;		/* preallocated bit 0 */
	pvc = &pv_chunks[vm_page_domain(m)];
	mtx_lock(&pvc->pvc_lock);
	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
	mtx_unlock(&pvc->pvc_lock);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	return (pv);
}
3646 
3647 /*
3648  * Ensure that the number of spare PV entries in the specified pmap meets or
3649  * exceeds the given count, "needed".
3650  *
3651  * The given PV list lock may be released.
3652  */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pv_chunks_list *pvc;
	struct pch new_tail[PMAP_MEMDOM];
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free, i;
	bool reclaimed;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&new_tail[i]);
retry:
	/* Count the free entries already available in this pmap's chunks. */
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		bit_count((bitstr_t *)pc->pc_map, 0,
		    sizeof(pc->pc_map) * NBBY, &free);
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	for (reclaimed = false; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
			reclaimed = true;
		}
		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
		dump_add_page(m->phys_addr);
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));

		/*
		 * The reclaim might have freed a chunk from the current pmap.
		 * If that chunk contained available entries, we need to
		 * re-count the number of available entries.
		 */
		if (reclaimed)
			goto retry;
	}
	/* Publish the new chunks on their per-domain LRU lists. */
	for (i = 0; i < vm_ndomains; i++) {
		if (TAILQ_EMPTY(&new_tail[i]))
			continue;
		pvc = &pv_chunks[i];
		mtx_lock(&pvc->pvc_lock);
		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
		mtx_unlock(&pvc->pvc_lock);
	}
}
3720 
3721 /*
3722  * First find and then remove the pv entry for the specified pmap and virtual
3723  * address from the specified pv list.  Returns the pv entry if found and NULL
3724  * otherwise.  This operation can be performed on pv lists for either 4KB or
3725  * 2MB page mappings.
3726  */
3727 static __inline pv_entry_t
3728 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3729 {
3730 	pv_entry_t pv;
3731 
3732 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3733 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3734 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3735 			pvh->pv_gen++;
3736 			break;
3737 		}
3738 	}
3739 	return (pv);
3740 }
3741 
3742 /*
3743  * After demotion from a 2MB page mapping to 512 4KB page mappings,
3744  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3745  * entries for each of the 4KB page mappings.
3746  */
static void
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;
	int bit, field;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((va & L2_OFFSET) == 0,
	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
	KASSERT((pa & L2_OFFSET) == 0,
	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
	va_last = va + L2_SIZE - PAGE_SIZE;
	for (;;) {
		/*
		 * The caller must have pre-reserved enough spare entries
		 * (see reserve_pv_entries()); allocation cannot fail here.
		 */
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
			    ("pmap_pv_demote_l2: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		/* This chunk is exhausted; move it to the tail and continue. */
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	if (pc_is_full(pc)) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
}
3809 
3810 /*
3811  * First find and then destroy the pv entry for the specified pmap and virtual
3812  * address.  This operation can be performed on pv lists for either 4KB or 2MB
3813  * page mappings.
3814  */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	/* The entry must exist; its absence indicates pv list corruption. */
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}
3824 
3825 /*
3826  * Conditionally create the PV entry for a 4KB page mapping if the required
3827  * memory can be allocated without resorting to reclamation.
3828  */
3829 static bool
3830 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3831     struct rwlock **lockp)
3832 {
3833 	pv_entry_t pv;
3834 
3835 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3836 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3837 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3838 		pv->pv_va = va;
3839 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3840 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3841 		m->md.pv_gen++;
3842 		return (true);
3843 	} else
3844 		return (false);
3845 }
3846 
3847 /*
3848  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
3849  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
3850  * false if the PV entry cannot be allocated without resorting to reclamation.
3851  */
3852 static bool
3853 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3854     struct rwlock **lockp)
3855 {
3856 	struct md_page *pvh;
3857 	pv_entry_t pv;
3858 	vm_paddr_t pa;
3859 
3860 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3861 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3862 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3863 	    NULL : lockp)) == NULL)
3864 		return (false);
3865 	pv->pv_va = va;
3866 	pa = PTE_TO_PHYS(l2e);
3867 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3868 	pvh = pa_to_pvh(pa);
3869 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3870 	pvh->pv_gen++;
3871 	return (true);
3872 }
3873 
3874 /*
3875  * Conditionally creates the PV entries for a L3C superpage mapping if
3876  * the required memory can be allocated without resorting to reclamation.
3877  */
static bool
pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp)
{
	pv_entry_t pv;
	vm_offset_t tva;
	vm_paddr_t pa __diagused;
	vm_page_t mt;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((va & L3C_OFFSET) == 0,
	    ("pmap_pv_insert_l3c: va is not aligned"));
	pa = VM_PAGE_TO_PHYS(m);
	KASSERT((pa & L3C_OFFSET) == 0,
	    ("pmap_pv_insert_l3c: pa is not aligned"));
	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
	/* Create one pv entry per 4KB page within the L3C superpage. */
	for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
		/* Pass NULL instead of lockp to disable reclamation. */
		pv = get_pv_entry(pmap, NULL);
		if (__predict_false(pv == NULL)) {
			/*
			 * Allocation failed part way through; unwind the
			 * entries created so far and report failure.
			 */
			while (tva > va) {
				mt--;
				tva -= L3_SIZE;
				pmap_pvh_free(&mt->md, pmap, tva);
			}
			return (false);
		}
		pv->pv_va = tva;
		TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
		mt->md.pv_gen++;
	}
	return (true);
}
3911 
/*
 * Replace the kernel's L2 block mapping at "va", which the caller has
 * already cleared and invalidated, with an L2_TABLE entry referencing the
 * page table page that was preserved in the pmap's radix tree (e.g., by a
 * promotion).  The kernel's L3 page table pages are never freed, so one
 * must always be present here.
 */
static void
pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
{
	pt_entry_t newl2, oldl2 __diagused;
	vm_page_t ml3;
	vm_paddr_t ml3pa;

	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/* The preserved page table page must exist for the kernel pmap. */
	ml3 = pmap_remove_pt_page(pmap, va);
	KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page"));

	ml3pa = VM_PAGE_TO_PHYS(ml3);
	newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;

	/*
	 * If this page table page was unmapped by a promotion, then it
	 * contains valid mappings.  Zero it to invalidate those mappings.
	 */
	if (vm_page_any_valid(ml3))
		pagezero((void *)PHYS_TO_DMAP(ml3pa));

	/*
	 * Demote the mapping.  The caller must have already invalidated the
	 * mapping (i.e., the "break" in break-before-make).
	 */
	oldl2 = pmap_load_store(l2, newl2);
	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
	    __func__, l2, oldl2));
}
3944 
3945 /*
3946  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
3947  */
3948 static int
3949 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e,
3950     bool demote_kl2e, struct spglist *free, struct rwlock **lockp)
3951 {
3952 	struct md_page *pvh;
3953 	pt_entry_t old_l2;
3954 	vm_page_t m, ml3, mt;
3955 
3956 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3957 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3958 	old_l2 = pmap_load_clear(l2);
3959 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3960 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3961 
3962 	/*
3963 	 * Since a promotion must break the 4KB page mappings before making
3964 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3965 	 */
3966 	pmap_s1_invalidate_page(pmap, sva, true);
3967 
3968 	if (old_l2 & ATTR_SW_WIRED)
3969 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3970 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3971 	if (old_l2 & ATTR_SW_MANAGED) {
3972 		m = PTE_TO_VM_PAGE(old_l2);
3973 		pvh = page_to_pvh(m);
3974 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3975 		pmap_pvh_free(pvh, pmap, sva);
3976 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3977 			if (pmap_pte_dirty(pmap, old_l2))
3978 				vm_page_dirty(mt);
3979 			if (old_l2 & ATTR_AF)
3980 				vm_page_aflag_set(mt, PGA_REFERENCED);
3981 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3982 			    TAILQ_EMPTY(&pvh->pv_list))
3983 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3984 		}
3985 	}
3986 	if (pmap != kernel_pmap) {
3987 		ml3 = pmap_remove_pt_page(pmap, sva);
3988 		if (ml3 != NULL) {
3989 			KASSERT(vm_page_any_valid(ml3),
3990 			    ("pmap_remove_l2: l3 page not promoted"));
3991 			pmap_resident_count_dec(pmap, 1);
3992 			KASSERT(ml3->ref_count == NL3PG,
3993 			    ("pmap_remove_l2: l3 page ref count error"));
3994 			ml3->ref_count = 0;
3995 			pmap_add_delayed_free_list(ml3, free, false);
3996 		}
3997 	} else if (demote_kl2e) {
3998 		pmap_remove_kernel_l2(pmap, l2, sva);
3999 	} else {
4000 		ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva));
4001 		if (vm_page_any_valid(ml3)) {
4002 			ml3->valid = 0;
4003 			pmap_zero_page(ml3);
4004 		}
4005 	}
4006 	return (pmap_unuse_pt(pmap, sva, l1e, free));
4007 }
4008 
4009 /*
4010  * pmap_remove_l3: do the things to unmap a page in a process
4011  */
4012 static int
4013 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
4014     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
4015 {
4016 	struct md_page *pvh;
4017 	pt_entry_t old_l3;
4018 	vm_page_t m;
4019 
4020 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4021 	old_l3 = pmap_load(l3);
4022 	if ((old_l3 & ATTR_CONTIGUOUS) != 0)
4023 		(void)pmap_demote_l3c(pmap, l3, va);
4024 	old_l3 = pmap_load_clear(l3);
4025 	pmap_s1_invalidate_page(pmap, va, true);
4026 	if (old_l3 & ATTR_SW_WIRED)
4027 		pmap->pm_stats.wired_count -= 1;
4028 	pmap_resident_count_dec(pmap, 1);
4029 	if (old_l3 & ATTR_SW_MANAGED) {
4030 		m = PTE_TO_VM_PAGE(old_l3);
4031 		if (pmap_pte_dirty(pmap, old_l3))
4032 			vm_page_dirty(m);
4033 		if (old_l3 & ATTR_AF)
4034 			vm_page_aflag_set(m, PGA_REFERENCED);
4035 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4036 		pmap_pvh_free(&m->md, pmap, va);
4037 		if (TAILQ_EMPTY(&m->md.pv_list) &&
4038 		    (m->flags & PG_FICTITIOUS) == 0) {
4039 			pvh = page_to_pvh(m);
4040 			if (TAILQ_EMPTY(&pvh->pv_list))
4041 				vm_page_aflag_clear(m, PGA_WRITEABLE);
4042 		}
4043 	}
4044 	return (pmap_unuse_pt(pmap, va, l2e, free));
4045 }
4046 
4047 /*
4048  * Removes the specified L3C superpage mapping.  Requests TLB invalidations
4049  * to be performed by the caller through the returned "*vap". Returns true
4050  * if the level 3 table "ml3" was unmapped and added to the spglist "free".
4051  * Otherwise, returns false.
4052  */
4053 static bool
4054 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
4055     vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
4056     struct rwlock **lockp)
4057 {
4058 	struct md_page *pvh;
4059 	struct rwlock *new_lock;
4060 	pt_entry_t first_l3e, l3e, *tl3p;
4061 	vm_offset_t tva;
4062 	vm_page_t m, mt;
4063 
4064 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4065 	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4066 	    0, ("pmap_remove_l3c: l3p is not aligned"));
4067 	KASSERT((va & L3C_OFFSET) == 0,
4068 	    ("pmap_remove_l3c: va is not aligned"));
4069 
4070 	/*
4071 	 * Hardware accessed and dirty bit maintenance might only update a
4072 	 * single L3 entry, so we must combine the accessed and dirty bits
4073 	 * from this entire set of contiguous L3 entries.
4074 	 */
4075 	first_l3e = pmap_load_clear(l3p);
4076 	for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4077 		l3e = pmap_load_clear(tl3p);
4078 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4079 		    ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
4080 		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4081 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4082 			first_l3e &= ~ATTR_S1_AP_RW_BIT;
4083 		first_l3e |= l3e & ATTR_AF;
4084 	}
4085 	if ((first_l3e & ATTR_SW_WIRED) != 0)
4086 		pmap->pm_stats.wired_count -= L3C_ENTRIES;
4087 	pmap_resident_count_dec(pmap, L3C_ENTRIES);
4088 	if ((first_l3e & ATTR_SW_MANAGED) != 0) {
4089 		m = PTE_TO_VM_PAGE(first_l3e);
4090 		new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4091 		if (new_lock != *lockp) {
4092 			if (*lockp != NULL) {
4093 				/*
4094 				 * Pending TLB invalidations must be
4095 				 * performed before the PV list lock is
4096 				 * released.  Otherwise, a concurrent
4097 				 * pmap_remove_all() on a physical page
4098 				 * could return while a stale TLB entry
4099 				 * still provides access to that page.
4100 				 */
4101 				if (*vap != va_next) {
4102 					pmap_invalidate_range(pmap, *vap, va,
4103 					    true);
4104 					*vap = va_next;
4105 				}
4106 				rw_wunlock(*lockp);
4107 			}
4108 			*lockp = new_lock;
4109 			rw_wlock(*lockp);
4110 		}
4111 		pvh = page_to_pvh(m);
4112 		for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
4113 		    L3_SIZE) {
4114 			if (pmap_pte_dirty(pmap, first_l3e))
4115 				vm_page_dirty(mt);
4116 			if ((first_l3e & ATTR_AF) != 0)
4117 				vm_page_aflag_set(mt, PGA_REFERENCED);
4118 			pmap_pvh_free(&mt->md, pmap, tva);
4119 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
4120 			    TAILQ_EMPTY(&pvh->pv_list))
4121 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
4122 		}
4123 	}
4124 	if (*vap == va_next)
4125 		*vap = va;
4126 	if (ml3 != NULL) {
4127 		ml3->ref_count -= L3C_ENTRIES;
4128 		if (ml3->ref_count == 0) {
4129 			_pmap_unwire_l3(pmap, va, ml3, free);
4130 			return (true);
4131 		}
4132 	}
4133 	return (false);
4134 }
4135 
4136 /*
4137  * Remove the specified range of addresses from the L3 page table that is
4138  * identified by the given L2 entry.
4139  */
4140 static void
4141 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
4142     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
4143 {
4144 	struct md_page *pvh;
4145 	struct rwlock *new_lock;
4146 	pt_entry_t *l3, old_l3;
4147 	vm_offset_t va;
4148 	vm_page_t l3pg, m;
4149 
4150 	KASSERT(ADDR_IS_CANONICAL(sva),
4151 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
4152 	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
4153 	    ("%s: End address not in canonical form: %lx", __func__, eva));
4154 
4155 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4156 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
4157 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
4158 	l3pg = ADDR_IS_USER(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
4159 	va = eva;
4160 	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
4161 		old_l3 = pmap_load(l3);
4162 		if (!pmap_l3_valid(old_l3)) {
4163 			if (va != eva) {
4164 				pmap_invalidate_range(pmap, va, sva, true);
4165 				va = eva;
4166 			}
4167 			continue;
4168 		}
4169 		if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
4170 			/*
4171 			 * Is this entire set of contiguous L3 entries being
4172 			 * removed?  Handle the possibility that "eva" is zero
4173 			 * because of address wraparound.
4174 			 */
4175 			if ((sva & L3C_OFFSET) == 0 &&
4176 			    sva + L3C_OFFSET <= eva - 1) {
4177 				if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
4178 				    l3pg, free, lockp)) {
4179 					/* The L3 table was unmapped. */
4180 					sva += L3C_SIZE;
4181 					break;
4182 				}
4183 				l3 += L3C_ENTRIES - 1;
4184 				sva += L3C_SIZE - L3_SIZE;
4185 				continue;
4186 			}
4187 
4188 			(void)pmap_demote_l3c(pmap, l3, sva);
4189 		}
4190 		old_l3 = pmap_load_clear(l3);
4191 		if ((old_l3 & ATTR_SW_WIRED) != 0)
4192 			pmap->pm_stats.wired_count--;
4193 		pmap_resident_count_dec(pmap, 1);
4194 		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
4195 			m = PTE_TO_VM_PAGE(old_l3);
4196 			if (pmap_pte_dirty(pmap, old_l3))
4197 				vm_page_dirty(m);
4198 			if ((old_l3 & ATTR_AF) != 0)
4199 				vm_page_aflag_set(m, PGA_REFERENCED);
4200 			new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4201 			if (new_lock != *lockp) {
4202 				if (*lockp != NULL) {
4203 					/*
4204 					 * Pending TLB invalidations must be
4205 					 * performed before the PV list lock is
4206 					 * released.  Otherwise, a concurrent
4207 					 * pmap_remove_all() on a physical page
4208 					 * could return while a stale TLB entry
4209 					 * still provides access to that page.
4210 					 */
4211 					if (va != eva) {
4212 						pmap_invalidate_range(pmap, va,
4213 						    sva, true);
4214 						va = eva;
4215 					}
4216 					rw_wunlock(*lockp);
4217 				}
4218 				*lockp = new_lock;
4219 				rw_wlock(*lockp);
4220 			}
4221 			pmap_pvh_free(&m->md, pmap, sva);
4222 			if (TAILQ_EMPTY(&m->md.pv_list) &&
4223 			    (m->flags & PG_FICTITIOUS) == 0) {
4224 				pvh = page_to_pvh(m);
4225 				if (TAILQ_EMPTY(&pvh->pv_list))
4226 					vm_page_aflag_clear(m, PGA_WRITEABLE);
4227 			}
4228 		}
4229 		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
4230 			/*
4231 			 * _pmap_unwire_l3() has already invalidated the TLB
4232 			 * entries at all levels for "sva".  So, we need not
4233 			 * perform "sva += L3_SIZE;" here.  Moreover, we need
4234 			 * not perform "va = sva;" if "sva" is at the start
4235 			 * of a new valid range consisting of a single page.
4236 			 */
4237 			break;
4238 		}
4239 		if (va == eva)
4240 			va = sva;
4241 	}
4242 	if (va != eva)
4243 		pmap_invalidate_range(pmap, va, sva, true);
4244 }
4245 
/*
 * Remove the given range of addresses from the specified map.  This is the
 * common implementation of pmap_remove() and pmap_map_delete(); when
 * "map_delete" is true, BTI state recorded for the range is removed as well.
 */
static void
pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
{
	struct rwlock *lock;
	vm_offset_t va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t l3_paddr;
	struct spglist free;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	SLIST_INIT(&free);

	PMAP_LOCK(pmap);
	if (map_delete)
		pmap_bti_on_remove(pmap, sva, eva);

	lock = NULL;
	for (; sva < eva; sva = va_next) {
		/* Stop early once the pmap maps nothing. */
		if (pmap->pm_stats.resident_count == 0)
			break;

		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		/* A 1GB block mapping is always removed whole. */
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS(pmap != kernel_pmap);
			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
			pmap_clear(l1);
			pmap_s1_invalidate_page(pmap, sva, true);
			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
			continue;
		}

		/*
		 * Calculate index for next page table.
		 */
		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		l3_paddr = pmap_load(l2);

		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * Remove a fully-covered 2MB mapping whole;
			 * otherwise demote it and fall through to the
			 * 4KB-granular removal below.
			 */
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
				    true, &free, &lock);
				continue;
			} else if (pmap_demote_l2_locked(pmap, l2, sva,
			    &lock) == NULL)
				continue;
			l3_paddr = pmap_load(l2);
		}

		/*
		 * Weed out invalid mappings.
		 */
		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (va_next > eva)
			va_next = eva;

		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
		    &lock);
	}
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}
4344 
4345 /*
4346  *	Remove the given range of addresses from the specified map.
4347  *
4348  *	It is assumed that the start and end are properly
4349  *	rounded to the page size.
4350  */
4351 void
4352 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4353 {
4354 	pmap_remove1(pmap, sva, eva, false);
4355 }
4356 
4357 /*
4358  *	Remove the given range of addresses as part of a logical unmap
4359  *	operation. This has the effect of calling pmap_remove(), but
4360  *	also clears any metadata that should persist for the lifetime
4361  *	of a logical mapping.
4362  */
4363 void
4364 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4365 {
4366 	pmap_remove1(pmap, sva, eva, true);
4367 }
4368 
4369 /*
4370  *	Routine:	pmap_remove_all
4371  *	Function:
4372  *		Removes this physical page from
4373  *		all physical maps in which it resides.
4374  *		Reflects back modify bits to the pager.
4375  *
4376  *	Notes:
4377  *		Original versions of this routine were very
4378  *		inefficient because they iteratively called
4379  *		pmap_remove (slow...)
4380  */
4381 
4382 void
4383 pmap_remove_all(vm_page_t m)
4384 {
4385 	struct md_page *pvh;
4386 	pv_entry_t pv;
4387 	pmap_t pmap;
4388 	struct rwlock *lock;
4389 	pd_entry_t *pde, tpde;
4390 	pt_entry_t *pte, tpte;
4391 	vm_offset_t va;
4392 	struct spglist free;
4393 	int lvl, pvh_gen, md_gen;
4394 
4395 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4396 	    ("pmap_remove_all: page %p is not managed", m));
4397 	SLIST_INIT(&free);
4398 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4399 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4400 	rw_wlock(lock);
4401 retry:
4402 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4403 		pmap = PV_PMAP(pv);
4404 		if (!PMAP_TRYLOCK(pmap)) {
4405 			pvh_gen = pvh->pv_gen;
4406 			rw_wunlock(lock);
4407 			PMAP_LOCK(pmap);
4408 			rw_wlock(lock);
4409 			if (pvh_gen != pvh->pv_gen) {
4410 				PMAP_UNLOCK(pmap);
4411 				goto retry;
4412 			}
4413 		}
4414 		va = pv->pv_va;
4415 		pte = pmap_pte_exists(pmap, va, 2, __func__);
4416 		pmap_demote_l2_locked(pmap, pte, va, &lock);
4417 		PMAP_UNLOCK(pmap);
4418 	}
4419 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4420 		pmap = PV_PMAP(pv);
4421 		if (!PMAP_TRYLOCK(pmap)) {
4422 			pvh_gen = pvh->pv_gen;
4423 			md_gen = m->md.pv_gen;
4424 			rw_wunlock(lock);
4425 			PMAP_LOCK(pmap);
4426 			rw_wlock(lock);
4427 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4428 				PMAP_UNLOCK(pmap);
4429 				goto retry;
4430 			}
4431 		}
4432 		pmap_resident_count_dec(pmap, 1);
4433 
4434 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
4435 		KASSERT(pde != NULL,
4436 		    ("pmap_remove_all: no page directory entry found"));
4437 		KASSERT(lvl == 2,
4438 		    ("pmap_remove_all: invalid pde level %d", lvl));
4439 		tpde = pmap_load(pde);
4440 
4441 		pte = pmap_l2_to_l3(pde, pv->pv_va);
4442 		tpte = pmap_load(pte);
4443 		if ((tpte & ATTR_CONTIGUOUS) != 0)
4444 			(void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4445 		tpte = pmap_load_clear(pte);
4446 		if (tpte & ATTR_SW_WIRED)
4447 			pmap->pm_stats.wired_count--;
4448 		if ((tpte & ATTR_AF) != 0) {
4449 			pmap_invalidate_page(pmap, pv->pv_va, true);
4450 			vm_page_aflag_set(m, PGA_REFERENCED);
4451 		}
4452 
4453 		/*
4454 		 * Update the vm_page_t clean and reference bits.
4455 		 */
4456 		if (pmap_pte_dirty(pmap, tpte))
4457 			vm_page_dirty(m);
4458 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4459 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4460 		m->md.pv_gen++;
4461 		free_pv_entry(pmap, pv);
4462 		PMAP_UNLOCK(pmap);
4463 	}
4464 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4465 	rw_wunlock(lock);
4466 	vm_page_free_pages_toq(&free, true);
4467 }
4468 
4469 /*
4470  * Masks and sets bits in a level 2 page table entries in the specified pmap
4471  */
4472 static void
4473 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4474     pt_entry_t nbits)
4475 {
4476 	pd_entry_t old_l2;
4477 	vm_page_t m, mt;
4478 
4479 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4480 	PMAP_ASSERT_STAGE1(pmap);
4481 	KASSERT((sva & L2_OFFSET) == 0,
4482 	    ("pmap_protect_l2: sva is not 2mpage aligned"));
4483 	old_l2 = pmap_load(l2);
4484 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4485 	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4486 
4487 	/*
4488 	 * Return if the L2 entry already has the desired access restrictions
4489 	 * in place.
4490 	 */
4491 	if ((old_l2 & mask) == nbits)
4492 		return;
4493 
4494 	while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4495 		cpu_spinwait();
4496 
4497 	/*
4498 	 * When a dirty read/write superpage mapping is write protected,
4499 	 * update the dirty field of each of the superpage's constituent 4KB
4500 	 * pages.
4501 	 */
4502 	if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4503 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4504 	    pmap_pte_dirty(pmap, old_l2)) {
4505 		m = PTE_TO_VM_PAGE(old_l2);
4506 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4507 			vm_page_dirty(mt);
4508 	}
4509 
4510 	/*
4511 	 * Since a promotion must break the 4KB page mappings before making
4512 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4513 	 */
4514 	pmap_s1_invalidate_page(pmap, sva, true);
4515 }
4516 
4517 /*
4518  * Masks and sets bits in the specified L3C superpage mapping.
4519  *
4520  * Requests TLB invalidations to be performed by the caller through the
4521  * returned "*vap".
4522  */
4523 static void
4524 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4525     vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4526 {
4527 	pt_entry_t l3e, *tl3p;
4528 	vm_page_t m, mt;
4529 	bool dirty;
4530 
4531 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4532 	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4533 	    0, ("pmap_mask_set_l3c: l3p is not aligned"));
4534 	KASSERT((va & L3C_OFFSET) == 0,
4535 	    ("pmap_mask_set_l3c: va is not aligned"));
4536 	dirty = false;
4537 	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4538 		l3e = pmap_load(tl3p);
4539 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4540 		    ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4541 		while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4542 			cpu_spinwait();
4543 		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4544 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4545 			dirty = true;
4546 	}
4547 
4548 	/*
4549 	 * When a dirty read/write superpage mapping is write protected,
4550 	 * update the dirty field of each of the superpage's constituent 4KB
4551 	 * pages.
4552 	 */
4553 	if ((l3e & ATTR_SW_MANAGED) != 0 &&
4554 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4555 	    dirty) {
4556 		m = PTE_TO_VM_PAGE(pmap_load(l3p));
4557 		for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4558 			vm_page_dirty(mt);
4559 	}
4560 
4561 	if (*vap == va_next)
4562 		*vap = va;
4563 }
4564 
4565 /*
4566  * Masks and sets bits in last level page table entries in the specified
4567  * pmap and range
4568  */
4569 static void
4570 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4571     pt_entry_t nbits, bool invalidate)
4572 {
4573 	vm_offset_t va, va_next;
4574 	pd_entry_t *l0, *l1, *l2;
4575 	pt_entry_t *l3p, l3;
4576 
4577 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4578 	for (; sva < eva; sva = va_next) {
4579 		l0 = pmap_l0(pmap, sva);
4580 		if (pmap_load(l0) == 0) {
4581 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4582 			if (va_next < sva)
4583 				va_next = eva;
4584 			continue;
4585 		}
4586 
4587 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4588 		if (va_next < sva)
4589 			va_next = eva;
4590 		l1 = pmap_l0_to_l1(l0, sva);
4591 		if (pmap_load(l1) == 0)
4592 			continue;
4593 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4594 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4595 			KASSERT(va_next <= eva,
4596 			    ("partial update of non-transparent 1G page "
4597 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4598 			    pmap_load(l1), sva, eva, va_next));
4599 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4600 			if ((pmap_load(l1) & mask) != nbits) {
4601 				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4602 				if (invalidate)
4603 					pmap_s1_invalidate_page(pmap, sva, true);
4604 			}
4605 			continue;
4606 		}
4607 
4608 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4609 		if (va_next < sva)
4610 			va_next = eva;
4611 
4612 		l2 = pmap_l1_to_l2(l1, sva);
4613 		if (pmap_load(l2) == 0)
4614 			continue;
4615 
4616 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4617 			if (sva + L2_SIZE == va_next && eva >= va_next) {
4618 				pmap_protect_l2(pmap, l2, sva, mask, nbits);
4619 				continue;
4620 			} else if ((pmap_load(l2) & mask) == nbits ||
4621 			    pmap_demote_l2(pmap, l2, sva) == NULL)
4622 				continue;
4623 		}
4624 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4625 		    ("pmap_protect: Invalid L2 entry after demotion"));
4626 
4627 		if (va_next > eva)
4628 			va_next = eva;
4629 
4630 		va = va_next;
4631 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4632 		    sva += L3_SIZE) {
4633 			l3 = pmap_load(l3p);
4634 
4635 			/*
4636 			 * Go to the next L3 entry if the current one is
4637 			 * invalid or already has the desired access
4638 			 * restrictions in place.  (The latter case occurs
4639 			 * frequently.  For example, in a "buildworld"
4640 			 * workload, almost 1 out of 4 L3 entries already
4641 			 * have the desired restrictions.)
4642 			 */
4643 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4644 				if (va != va_next) {
4645 					if (invalidate)
4646 						pmap_s1_invalidate_range(pmap,
4647 						    va, sva, true);
4648 					va = va_next;
4649 				}
4650 				if ((l3 & ATTR_CONTIGUOUS) != 0) {
4651 					/*
4652 					 * Does this L3C page extend beyond
4653 					 * the requested range?  Handle the
4654 					 * possibility that "va_next" is zero.
4655 					 */
4656 					if ((sva | L3C_OFFSET) > va_next - 1)
4657 						break;
4658 
4659 					/*
4660 					 * Skip ahead to the last L3_PAGE
4661 					 * within this L3C page.
4662 					 */
4663 					l3p = (pt_entry_t *)((uintptr_t)l3p |
4664 					    ((L3C_ENTRIES - 1) *
4665 					    sizeof(pt_entry_t)));
4666 					sva |= L3C_SIZE - L3_SIZE;
4667 				}
4668 				continue;
4669 			}
4670 
4671 			if ((l3 & ATTR_CONTIGUOUS) != 0) {
4672 				/*
4673 				 * Is this entire set of contiguous L3 entries
4674 				 * being protected?  Handle the possibility
4675 				 * that "va_next" is zero because of address
4676 				 * wraparound.
4677 				 */
4678 				if ((sva & L3C_OFFSET) == 0 &&
4679 				    sva + L3C_OFFSET <= va_next - 1) {
4680 					pmap_mask_set_l3c(pmap, l3p, sva, &va,
4681 					    va_next, mask, nbits);
4682 					l3p += L3C_ENTRIES - 1;
4683 					sva += L3C_SIZE - L3_SIZE;
4684 					continue;
4685 				}
4686 
4687 				(void)pmap_demote_l3c(pmap, l3p, sva);
4688 
4689 				/*
4690 				 * The L3 entry's accessed bit may have changed.
4691 				 */
4692 				l3 = pmap_load(l3p);
4693 			}
4694 			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4695 			    nbits))
4696 				cpu_spinwait();
4697 
4698 			/*
4699 			 * When a dirty read/write mapping is write protected,
4700 			 * update the page's dirty field.
4701 			 */
4702 			if ((l3 & ATTR_SW_MANAGED) != 0 &&
4703 			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4704 			    pmap_pte_dirty(pmap, l3))
4705 				vm_page_dirty(PTE_TO_VM_PAGE(l3));
4706 
4707 			if (va == va_next)
4708 				va = sva;
4709 		}
4710 		if (va != va_next && invalidate)
4711 			pmap_s1_invalidate_range(pmap, va, sva, true);
4712 	}
4713 }
4714 
/*
 * Masks and sets bits in last level page table entries over the given
 * range, taking the pmap lock around pmap_mask_set_locked().
 */
static void
pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
    pt_entry_t nbits, bool invalidate)
{
	PMAP_LOCK(pmap);
	pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
	PMAP_UNLOCK(pmap);
}
4723 
4724 /*
4725  *	Set the physical protection on the
4726  *	specified range of this map as requested.
4727  */
4728 void
4729 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4730 {
4731 	pt_entry_t mask, nbits;
4732 
4733 	PMAP_ASSERT_STAGE1(pmap);
4734 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4735 	if (prot == VM_PROT_NONE) {
4736 		pmap_remove(pmap, sva, eva);
4737 		return;
4738 	}
4739 
4740 	mask = nbits = 0;
4741 	if ((prot & VM_PROT_WRITE) == 0) {
4742 		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4743 		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4744 	}
4745 	if ((prot & VM_PROT_EXECUTE) == 0) {
4746 		mask |= ATTR_S1_XN;
4747 		nbits |= ATTR_S1_XN;
4748 	}
4749 	if (pmap == kernel_pmap) {
4750 		mask |= ATTR_KERN_GP;
4751 		nbits |= ATTR_KERN_GP;
4752 	}
4753 	if (mask == 0)
4754 		return;
4755 
4756 	pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4757 }
4758 
4759 void
4760 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4761 {
4762 
4763 	MPASS((sva & L3_OFFSET) == 0);
4764 	MPASS(((sva + size) & L3_OFFSET) == 0);
4765 
4766 	pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4767 	    ATTR_SW_NO_PROMOTE, false);
4768 }
4769 
4770 /*
4771  * Inserts the specified page table page into the specified pmap's collection
4772  * of idle page table pages.  Each of a pmap's page table pages is responsible
4773  * for mapping a distinct range of virtual addresses.  The pmap's collection is
4774  * ordered by this virtual address range.
4775  *
4776  * If "promoted" is false, then the page table page "mpte" must be zero filled;
4777  * "mpte"'s valid field will be set to 0.
4778  *
4779  * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4780  * contain valid mappings with identical attributes except for ATTR_AF;
4781  * "mpte"'s valid field will be set to 1.
4782  *
4783  * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4784  * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4785  * field will be set to VM_PAGE_BITS_ALL.
4786  */
4787 static __inline int
4788 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4789     bool all_l3e_AF_set)
4790 {
4791 
4792 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4793 	KASSERT(promoted || !all_l3e_AF_set,
4794 	    ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4795 	mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4796 	return (vm_radix_insert(&pmap->pm_root, mpte));
4797 }
4798 
4799 /*
4800  * Removes the page table page mapping the specified virtual address from the
4801  * specified pmap's collection of idle page table pages, and returns it.
4802  * Otherwise, returns NULL if there is no page table page corresponding to the
4803  * specified virtual address.
4804  */
4805 static __inline vm_page_t
4806 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4807 {
4808 
4809 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4810 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4811 }
4812 
4813 /*
4814  * Performs a break-before-make update of a pmap entry. This is needed when
4815  * either promoting or demoting pages to ensure the TLB doesn't get into an
4816  * inconsistent state.
4817  */
4818 static void
4819 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4820     vm_offset_t va, vm_size_t size)
4821 {
4822 	register_t intr;
4823 
4824 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4825 	KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4826 	    ("%s: Updating non-promote pte", __func__));
4827 
4828 	/*
4829 	 * Ensure we don't get switched out with the page table in an
4830 	 * inconsistent state. We also need to ensure no interrupts fire
4831 	 * as they may make use of an address we are about to invalidate.
4832 	 */
4833 	intr = intr_disable();
4834 
4835 	/*
4836 	 * Clear the old mapping's valid bit, but leave the rest of the entry
4837 	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4838 	 * lookup the physical address.
4839 	 */
4840 	pmap_clear_bits(ptep, ATTR_DESCR_VALID);
4841 
4842 	/*
4843 	 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4844 	 * be cached, so we invalidate intermediate entries as well as final
4845 	 * entries.
4846 	 */
4847 	pmap_s1_invalidate_range(pmap, va, va + size, false);
4848 
4849 	/* Create the new mapping */
4850 	pmap_store(ptep, newpte);
4851 	dsb(ishst);
4852 
4853 	intr_restore(intr);
4854 }
4855 
4856 /*
4857  * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping.
4858  */
4859 static void __nosanitizecoverage
4860 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end,
4861     pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size)
4862 {
4863 	pd_entry_t *lip;
4864 	register_t intr;
4865 
4866 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4867 	KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4868 	    ("%s: Updating non-promote pte", __func__));
4869 
4870 	/*
4871 	 * Ensure we don't get switched out with the page table in an
4872 	 * inconsistent state. We also need to ensure no interrupts fire
4873 	 * as they may make use of an address we are about to invalidate.
4874 	 */
4875 	intr = intr_disable();
4876 
4877 	/*
4878 	 * Clear the old mapping's valid bits, but leave the rest of each
4879 	 * entry unchanged, so that a lockless, concurrent pmap_kextract() can
4880 	 * still lookup the physical address.
4881 	 */
4882 	for (lip = ptep; lip < ptep_end; lip++)
4883 		pmap_clear_bits(lip, ATTR_DESCR_VALID);
4884 
4885 	/* Only final entries are changing. */
4886 	pmap_s1_invalidate_strided(pmap, va, va + size, stride, true);
4887 
4888 	/* Create the new mapping. */
4889 	for (lip = ptep; lip < ptep_end; lip++) {
4890 		pmap_store(lip, newpte);
4891 		newpte += stride;
4892 	}
4893 	dsb(ishst);
4894 
4895 	intr_restore(intr);
4896 }
4897 
4898 #if VM_NRESERVLEVEL > 0
4899 /*
4900  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4901  * replace the many pv entries for the 4KB page mappings by a single pv entry
4902  * for the 2MB page mapping.
 *
 * "pa" must be 2MB-aligned; "va" may be any address within the promoted
 * range and is rounded down to the L2 boundary below.
4903  */
4904 static void
4905 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4906     struct rwlock **lockp)
4907 {
4908 	struct md_page *pvh;
4909 	pv_entry_t pv;
4910 	vm_offset_t va_last;
4911 	vm_page_t m;
4912 
4913 	KASSERT((pa & L2_OFFSET) == 0,
4914 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
	/* Switch to (and hold) the pv list lock for the 2MB page. */
4915 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4916 
4917 	/*
4918 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
4919 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
4920 	 * a transfer avoids the possibility that get_pv_entry() calls
4921 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4922 	 * mappings that is being promoted.
4923 	 */
4924 	m = PHYS_TO_VM_PAGE(pa);
4925 	va = va & ~L2_OFFSET;
4926 	pv = pmap_pvh_remove(&m->md, pmap, va);
4927 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4928 	pvh = page_to_pvh(m);
4929 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4930 	pvh->pv_gen++;
4931 	/* Free the remaining NPTEPG - 1 pv entries. */
4932 	va_last = va + L2_SIZE - PAGE_SIZE;
4933 	do {
4934 		m++;
4935 		va += PAGE_SIZE;
4936 		pmap_pvh_free(&m->md, pmap, va);
4937 	} while (va < va_last);
4938 }
4939 
4940 /*
4941  * Tries to promote the 512, contiguous 4KB page mappings that are within a
4942  * single level 2 table entry to a single 2MB page mapping.  For promotion
4943  * to occur, two conditions must be met: (1) the 4KB page mappings must map
4944  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4945  * identical characteristics.
 *
 * Returns true if promotion succeeded, false otherwise.  "mpte" is the page
 * table page mapped by "l2", or NULL, in which case it is looked up here.
4946  */
4947 static bool
4948 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4949     struct rwlock **lockp)
4950 {
4951 	pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4952 
4953 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4954 
4955 	/*
4956 	 * Currently, this function only supports promotion on stage 1 pmaps
4957 	 * because it tests stage 1 specific fields and performs a break-
4958 	 * before-make sequence that is incorrect for stage 2 pmaps.
4959 	 */
4960 	if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4961 		return (false);
4962 
4963 	/*
4964 	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
4965 	 * ineligible for promotion...
4966 	 */
4967 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
4968 	newl2 = pmap_load(firstl3);
4969 	if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4970 		return (false);
4971 	/* ... is not the first physical page within an L2 block */
4972 	if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
4973 	    ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
4974 		counter_u64_add(pmap_l2_p_failures, 1);
4975 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4976 		    " in pmap %p", va, pmap);
4977 		return (false);
4978 	}
4979 
4980 	/*
4981 	 * Both here and in the below "for" loop, to allow for repromotion
4982 	 * after MADV_FREE, conditionally write protect a clean L3E before
4983 	 * possibly aborting the promotion due to other L3E attributes.  Why?
4984 	 * Suppose that MADV_FREE is applied to a part of a superpage, the
4985 	 * address range [S, E).  pmap_advise() will demote the superpage
4986 	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4987 	 * set AP_RO and clear AF in the L3Es for the rest of [S, E).  Later,
4988 	 * imagine that the memory in [S, E) is recycled, but the last 4KB
4989 	 * page in [S, E) is not the last to be rewritten, or simply accessed.
4990 	 * In other words, there is still a 4KB page in [S, E), call it P,
4991 	 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4992 	 * Unless we write protect P before aborting the promotion, if and
4993 	 * when P is finally rewritten, there won't be a page fault to trigger
4994 	 * repromotion.
4995 	 */
4996 setl2:
4997 	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4998 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4999 		/*
5000 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
5001 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
5002 		 */
		/* CAS retry loop: fcmpset reloads newl2 on failure. */
5003 		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
5004 			goto setl2;
5005 		newl2 &= ~ATTR_SW_DBM;
5006 		CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
5007 		    " in pmap %p", va & ~L2_OFFSET, pmap);
5008 	}
5009 
5010 	/*
5011 	 * Examine each of the other L3Es in the specified PTP.  Abort if this
5012 	 * L3E maps an unexpected 4KB physical page or does not have identical
5013 	 * characteristics to the first L3E.  If ATTR_AF is not set in every
5014 	 * PTE, then request that the PTP be refilled on demotion.
5015 	 */
5016 	all_l3e_AF = newl2 & ATTR_AF;
	/*
	 * "pa" carries both the expected physical address and the expected
	 * descriptor bits, so a single comparison below checks both.
	 */
5017 	pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
5018 	    + L2_SIZE - PAGE_SIZE;
5019 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
5020 		oldl3 = pmap_load(l3);
5021 		if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
5022 			counter_u64_add(pmap_l2_p_failures, 1);
5023 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
5024 			    " in pmap %p", va, pmap);
5025 			return (false);
5026 		}
5027 setl3:
5028 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5029 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5030 			/*
5031 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
5032 			 * set, ATTR_SW_DBM can be cleared without a TLB
5033 			 * invalidation.
5034 			 */
5035 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
5036 			    ~ATTR_SW_DBM))
5037 				goto setl3;
5038 			oldl3 &= ~ATTR_SW_DBM;
5039 		}
5040 		if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
5041 			counter_u64_add(pmap_l2_p_failures, 1);
5042 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
5043 			    " in pmap %p", va, pmap);
5044 			return (false);
5045 		}
5046 		all_l3e_AF &= oldl3;
5047 		pa -= PAGE_SIZE;
5048 	}
5049 
5050 	/*
5051 	 * Unless all PTEs have ATTR_AF set, clear it from the superpage
5052 	 * mapping, so that promotions triggered by speculative mappings,
5053 	 * such as pmap_enter_quick(), don't automatically mark the
5054 	 * underlying pages as referenced.
5055 	 */
	/* L2_BLOCK is OR'ed back in at the pmap_update_entry() call below. */
5056 	newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
5057 
5058 	/*
5059 	 * Save the page table page in its current state until the L2
5060 	 * mapping the superpage is demoted by pmap_demote_l2() or
5061 	 * destroyed by pmap_remove_l3().
5062 	 */
5063 	if (mpte == NULL)
5064 		mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5065 	KASSERT(mpte >= vm_page_array &&
5066 	    mpte < &vm_page_array[vm_page_array_size],
5067 	    ("pmap_promote_l2: page table page is out of range"));
5068 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
5069 	    ("pmap_promote_l2: page table page's pindex is wrong"));
5070 	if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
5071 		counter_u64_add(pmap_l2_p_failures, 1);
5072 		CTR2(KTR_PMAP,
5073 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
5074 		    pmap);
5075 		return (false);
5076 	}
5077 
5078 	if ((newl2 & ATTR_SW_MANAGED) != 0)
5079 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
5080 
	/* Break-before-make install of the 2MB block mapping. */
5081 	pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
5082 
5083 	counter_u64_add(pmap_l2_promotions, 1);
5084 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
5085 	    pmap);
5086 	return (true);
5087 }
5088 
5089 /*
5090  * Tries to promote an aligned, contiguous set of base page mappings to a
5091  * single L3C page mapping.  For promotion to occur, two conditions must be
5092  * met: (1) the base page mappings must map aligned, contiguous physical
5093  * memory and (2) the base page mappings must have identical characteristics
5094  * except for the accessed flag.
 *
 * Returns true if promotion succeeded, false otherwise.  "l3p" may point at
 * any L3 entry within the candidate range; it is aligned down below.
5095  */
5096 static bool
5097 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
5098 {
5099 	pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
5100 
5101 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5102 
5103 	/*
5104 	 * Currently, this function only supports promotion on stage 1 pmaps
5105 	 * because it tests stage 1 specific fields and performs a break-
5106 	 * before-make sequence that is incorrect for stage 2 pmaps.
5107 	 */
5108 	if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
5109 		return (false);
5110 
5111 	/*
5112 	 * Compute the address of the first L3 entry in the superpage
5113 	 * candidate.
5114 	 */
5115 	l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
5116 	    sizeof(pt_entry_t)) - 1));
5117 
5118 	firstl3c = pmap_load(l3p);
5119 
5120 	/*
5121 	 * Examine the first L3 entry. Abort if this L3E is ineligible for
5122 	 * promotion...
5123 	 */
5124 	if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
5125 		return (false);
5126 	/* ...is not properly aligned... */
5127 	if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
5128 	    (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
5129 		counter_u64_add(pmap_l3c_p_failures, 1);
5130 		CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5131 		    " in pmap %p", va, pmap);
5132 		return (false);
5133 	}
5134 
5135 	/*
5136 	 * If the first L3 entry is a clean read-write mapping, convert it
5137 	 * to a read-only mapping.  See pmap_promote_l2() for the rationale.
5138 	 */
5139 set_first:
5140 	if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5141 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5142 		/*
5143 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
5144 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
5145 		 */
		/* CAS retry loop: fcmpset reloads firstl3c on failure. */
5146 		if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
5147 			goto set_first;
5148 		firstl3c &= ~ATTR_SW_DBM;
5149 		CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
5150 		    " in pmap %p", va & ~L3C_OFFSET, pmap);
5151 	}
5152 
5153 	/*
5154 	 * Check that the rest of the L3 entries are compatible with the first,
5155 	 * and convert clean read-write mappings to read-only mappings.
5156 	 */
5157 	all_l3e_AF = firstl3c & ATTR_AF;
	/*
	 * "pa" carries both the expected physical address and the expected
	 * descriptor bits, so a single comparison below checks both.
	 */
5158 	pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
5159 	    L3C_SIZE - PAGE_SIZE;
5160 	for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
5161 		oldl3 = pmap_load(l3);
5162 		if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
5163 			counter_u64_add(pmap_l3c_p_failures, 1);
5164 			CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5165 			    " in pmap %p", va, pmap);
5166 			return (false);
5167 		}
5168 set_l3:
5169 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5170 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5171 			/*
5172 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
5173 			 * set, ATTR_SW_DBM can be cleared without a TLB
5174 			 * invalidation.
5175 			 */
5176 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
5177 			    ~ATTR_SW_DBM))
5178 				goto set_l3;
5179 			oldl3 &= ~ATTR_SW_DBM;
			/*
			 * Reconstruct this page's va from the pa offset
			 * within the L3C range for the trace record.
			 */
5180 			CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
5181 			    " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
5182 			    (va & ~L3C_OFFSET), pmap);
5183 		}
5184 		if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
5185 			counter_u64_add(pmap_l3c_p_failures, 1);
5186 			CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5187 			    " in pmap %p", va, pmap);
5188 			return (false);
5189 		}
5190 		all_l3e_AF &= oldl3;
5191 		pa -= PAGE_SIZE;
5192 	}
5193 
5194 	/*
5195 	 * Unless all PTEs have ATTR_AF set, clear it from the superpage
5196 	 * mapping, so that promotions triggered by speculative mappings,
5197 	 * such as pmap_enter_quick(), don't automatically mark the
5198 	 * underlying pages as referenced.
5199 	 */
5200 	firstl3c &= ~ATTR_AF | all_l3e_AF;
5201 
5202 	/*
5203 	 * Remake the mappings with the contiguous bit set.
5204 	 */
5205 	pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c |
5206 	    ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE);
5207 
5208 	counter_u64_add(pmap_l3c_promotions, 1);
5209 	CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
5210 	    pmap);
5211 	return (true);
5212 }
5213 #endif /* VM_NRESERVLEVEL > 0 */
5214 
/*
 * Enter the pre-constructed pte "pte" for a large page of size psind:
 * 3 => 1GB L1 block, 2 => 2MB L2 block, 1 => 64KB L3C contiguous run.
 * Intermediate page table pages are allocated as needed; if allocation
 * fails and PMAP_ENTER_NOSLEEP is not set, the pmap lock is dropped,
 * vm_wait() is called, and the operation restarts from the top.
 *
 * Returns KERN_SUCCESS, KERN_PROTECTION_FAILURE if the range's BTI
 * settings are not uniform, or KERN_RESOURCE_SHORTAGE if a needed page
 * table page could not be allocated under PMAP_ENTER_NOSLEEP.
 */
5215 static int
5216 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
5217     int psind)
5218 {
5219 	pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p;
5220 	vm_page_t mp;
5221 
5222 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5223 	KASSERT(psind > 0 && psind < MAXPAGESIZES,
5224 	    ("psind %d unexpected", psind));
5225 	KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0,
5226 	    ("unaligned phys address %#lx pte %#lx psind %d",
5227 	    PTE_TO_PHYS(pte), pte, psind));
5228 
5229 restart:
5230 	newpte = pte;
5231 	if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
5232 		return (KERN_PROTECTION_FAILURE);
5233 	if (psind == 3) {
5234 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5235 
5236 		KASSERT(pagesizes[psind] == L1_SIZE,
5237 		    ("pagesizes[%d] != L1_SIZE", psind));
5238 		l0p = pmap_l0(pmap, va);
5239 		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
			/* No L1 table yet; allocate one. */
5240 			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
5241 			if (mp == NULL) {
5242 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5243 					return (KERN_RESOURCE_SHORTAGE);
5244 				PMAP_UNLOCK(pmap);
5245 				vm_wait(NULL);
5246 				PMAP_LOCK(pmap);
5247 				goto restart;
5248 			}
5249 			l1p = pmap_l0_to_l1(l0p, va);
5250 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5251 			origpte = pmap_load(l1p);
5252 		} else {
5253 			l1p = pmap_l0_to_l1(l0p, va);
5254 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5255 			origpte = pmap_load(l1p);
5256 			if ((origpte & ATTR_DESCR_VALID) == 0) {
				/* Count the new block against the L1 PTP. */
5257 				mp = PTE_TO_VM_PAGE(pmap_load(l0p));
5258 				mp->ref_count++;
5259 			}
5260 		}
		/* Only an identical 1GB remapping or a fresh entry is legal. */
5261 		KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
5262 		    (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
5263 		    (origpte & ATTR_DESCR_VALID) == 0,
5264 		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
5265 		    va, origpte, newpte));
5266 		pmap_store(l1p, newpte);
5267 	} else if (psind == 2) {
5268 		KASSERT(pagesizes[psind] == L2_SIZE,
5269 		    ("pagesizes[%d] != L2_SIZE", psind));
5270 		l2p = pmap_l2(pmap, va);
5271 		if (l2p == NULL) {
			/* No L2 table yet; allocate one. */
5272 			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
5273 			if (mp == NULL) {
5274 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5275 					return (KERN_RESOURCE_SHORTAGE);
5276 				PMAP_UNLOCK(pmap);
5277 				vm_wait(NULL);
5278 				PMAP_LOCK(pmap);
5279 				goto restart;
5280 			}
5281 			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5282 			l2p = &l2p[pmap_l2_index(va)];
5283 			origpte = pmap_load(l2p);
5284 		} else {
5285 			l1p = pmap_l1(pmap, va);
5286 			origpte = pmap_load(l2p);
5287 			if ((origpte & ATTR_DESCR_VALID) == 0) {
				/* Count the new block against the L2 PTP. */
5288 				mp = PTE_TO_VM_PAGE(pmap_load(l1p));
5289 				mp->ref_count++;
5290 			}
5291 		}
5292 		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5293 		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
5294 		    PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5295 		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
5296 		    va, origpte, newpte));
5297 		pmap_store(l2p, newpte);
5298 	} else /* (psind == 1) */ {
5299 		KASSERT(pagesizes[psind] == L3C_SIZE,
5300 		    ("pagesizes[%d] != L3C_SIZE", psind));
5301 		l2p = pmap_l2(pmap, va);
5302 		if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) {
			/* No L3 table yet; allocate one. */
5303 			mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
5304 			if (mp == NULL) {
5305 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5306 					return (KERN_RESOURCE_SHORTAGE);
5307 				PMAP_UNLOCK(pmap);
5308 				vm_wait(NULL);
5309 				PMAP_LOCK(pmap);
5310 				goto restart;
5311 			}
			/* One reference per L3C constituent page. */
5312 			mp->ref_count += L3C_ENTRIES - 1;
5313 			l3p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5314 			l3p = &l3p[pmap_l3_index(va)];
5315 		} else {
5316 			l3p = pmap_l2_to_l3(l2p, va);
5317 			if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) {
5318 				mp = PTE_TO_VM_PAGE(pmap_load(l2p));
5319 				mp->ref_count += L3C_ENTRIES;
5320 			}
5321 		}
		/* Fill all L3C_ENTRIES PTEs, stepping the pa by L3_SIZE. */
5322 		for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5323 			origpte = pmap_load(tl3p);
5324 			KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5325 			    ((origpte & ATTR_CONTIGUOUS) != 0 &&
5326 			    PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5327 			    ("va %#lx changing 64K phys page l3 %#lx newpte %#lx",
5328 			    va, origpte, newpte));
5329 			pmap_store(tl3p, newpte);
5330 			newpte += L3_SIZE;
5331 		}
5332 	}
5333 	dsb(ishst);
5334 
	/* Update resident and wired counts if the mapping changed state. */
5335 	if ((origpte & ATTR_DESCR_VALID) == 0)
5336 		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
5337 	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
5338 		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
5339 	else if ((newpte & ATTR_SW_WIRED) == 0 &&
5340 	    (origpte & ATTR_SW_WIRED) != 0)
5341 		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
5342 
5343 	return (KERN_SUCCESS);
5344 }
5345 
5346 /*
5347  *	Insert the given physical page (p) at
5348  *	the specified virtual address (v) in the
5349  *	target physical map with the protection requested.
5350  *
5351  *	If specified, the page will be wired down, meaning
5352  *	that the related pte can not be reclaimed.
5353  *
5354  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5355  *	or lose information.  That is, this routine must actually
5356  *	insert this page into the given map NOW.
 *
 *	"flags" carries PMAP_ENTER_* control bits and VM_PROT_* access bits;
 *	"psind" selects the page size (0 = 4KB, 1 = L3C, 2 = L2, 3 = L1 via
 *	PMAP_ENTER_LARGEPAGE).  Returns a KERN_* status.
5357  */
5358 int
5359 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5360     u_int flags, int8_t psind)
5361 {
5362 	struct rwlock *lock;
5363 	pd_entry_t *pde;
5364 	pt_entry_t new_l3, orig_l3;
5365 	pt_entry_t *l2, *l3;
5366 	pv_entry_t pv;
5367 	vm_paddr_t opa, pa;
5368 	vm_page_t mpte, om;
5369 	bool nosleep;
5370 	int full_lvl, lvl, rv;
5371 
5372 	KASSERT(ADDR_IS_CANONICAL(va),
5373 	    ("%s: Address not in canonical form: %lx", __func__, va));
5374 
5375 	va = trunc_page(va);
5376 	if ((m->oflags & VPO_UNMANAGED) == 0)
5377 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
5378 	pa = VM_PAGE_TO_PHYS(m);
	/* Assemble the new L3 entry's attribute bits. */
5379 	new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
5380 	    L3_PAGE);
5381 	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5382 	new_l3 |= pmap_pte_prot(pmap, prot);
5383 	if ((flags & PMAP_ENTER_WIRED) != 0)
5384 		new_l3 |= ATTR_SW_WIRED;
5385 	if (pmap->pm_stage == PM_STAGE1) {
5386 		if (ADDR_IS_USER(va))
5387 			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5388 		else
5389 			new_l3 |= ATTR_S1_UXN;
5390 		if (pmap != kernel_pmap)
5391 			new_l3 |= ATTR_S1_nG;
5392 	} else {
5393 		/*
5394 		 * Clear the access flag on executable mappings, this will be
5395 		 * set later when the page is accessed. The fault handler is
5396 		 * required to invalidate the I-cache.
5397 		 *
5398 		 * TODO: Switch to the valid flag to allow hardware management
5399 		 * of the access flag. Much of the pmap code assumes the
5400 		 * valid flag is set and fails to destroy the old page tables
5401 		 * correctly if it is clear.
5402 		 */
5403 		if (prot & VM_PROT_EXECUTE)
5404 			new_l3 &= ~ATTR_AF;
5405 	}
5406 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5407 		new_l3 |= ATTR_SW_MANAGED;
5408 		if ((prot & VM_PROT_WRITE) != 0) {
5409 			new_l3 |= ATTR_SW_DBM;
			/*
			 * Map clean (read-only) until the first write access
			 * so that modification is tracked via a fault.
			 */
5410 			if ((flags & VM_PROT_WRITE) == 0) {
5411 				if (pmap->pm_stage == PM_STAGE1)
5412 					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5413 				else
5414 					new_l3 &=
5415 					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5416 			}
5417 		}
5418 	}
5419 
5420 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5421 
5422 	lock = NULL;
5423 	PMAP_LOCK(pmap);
5424 	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5425 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5426 		    ("managed largepage va %#lx flags %#x", va, flags));
		/* Convert the L3 entry to the requested block/contig form. */
5427 		if (psind == 3) {
5428 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5429 			new_l3 &= ~L3_PAGE;
5430 			new_l3 |= L1_BLOCK;
5431 		} else if (psind == 2) {
5432 			new_l3 &= ~L3_PAGE;
5433 			new_l3 |= L2_BLOCK;
5434 		} else /* (psind == 1) */
5435 			new_l3 |= ATTR_CONTIGUOUS;
5436 		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5437 		goto out;
5438 	}
5439 	if (psind == 2) {
5440 		/* Assert the required virtual and physical alignment. */
5441 		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5442 		KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind"));
5443 		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5444 		    flags, m, &lock);
5445 		goto out;
5446 	}
5447 	mpte = NULL;
5448 	if (psind == 1) {
5449 		KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned"));
5450 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5451 		rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags,
5452 		    m, &mpte, &lock);
5453 #if VM_NRESERVLEVEL > 0
5454 		/*
5455 		 * Attempt L2 promotion, if both the PTP and a level 1
5456 		 * reservation are fully populated.
5457 		 */
5458 		if (rv == KERN_SUCCESS &&
5459 		    (mpte == NULL || mpte->ref_count == NL3PG) &&
5460 		    (m->flags & PG_FICTITIOUS) == 0 &&
5461 		    vm_reserv_level_iffullpop(m) == 1) {
5462 			pde = pmap_l2(pmap, va);
5463 			(void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5464 		}
5465 #endif
5466 		goto out;
5467 	}
5468 
5469 	/*
5470 	 * In the case that a page table page is not
5471 	 * resident, we are creating it here.
5472 	 */
5473 retry:
5474 	pde = pmap_pde(pmap, va, &lvl);
5475 	if (pde != NULL && lvl == 2) {
5476 		l3 = pmap_l2_to_l3(pde, va);
5477 		if (ADDR_IS_USER(va) && mpte == NULL) {
5478 			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5479 			mpte->ref_count++;
5480 		}
5481 		goto havel3;
5482 	} else if (pde != NULL && lvl == 1) {
		/* A 2MB block covers "va"; demote it to get an L3 table. */
5483 		l2 = pmap_l1_to_l2(pde, va);
5484 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5485 		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5486 			l3 = &l3[pmap_l3_index(va)];
5487 			if (ADDR_IS_USER(va)) {
5488 				mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5489 				mpte->ref_count++;
5490 			}
5491 			goto havel3;
5492 		}
5493 		/* We need to allocate an L3 table. */
5494 	}
5495 	if (ADDR_IS_USER(va)) {
5496 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5497 
5498 		/*
5499 		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5500 		 * to handle the possibility that a superpage mapping for "va"
5501 		 * was created while we slept.
5502 		 */
5503 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5504 		    nosleep ? NULL : &lock);
5505 		if (mpte == NULL && nosleep) {
5506 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5507 			rv = KERN_RESOURCE_SHORTAGE;
5508 			goto out;
5509 		}
5510 		goto retry;
5511 	} else
5512 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5513 
5514 havel3:
5515 	orig_l3 = pmap_load(l3);
5516 	opa = PTE_TO_PHYS(orig_l3);
5517 	pv = NULL;
5518 	new_l3 |= pmap_pte_bti(pmap, va);
5519 
5520 	/*
5521 	 * Is the specified virtual address already mapped?
5522 	 */
5523 	if (pmap_l3_valid(orig_l3)) {
5524 		/*
5525 		 * Wiring change, just update stats. We don't worry about
5526 		 * wiring PT pages as they remain resident as long as there
5527 		 * are valid mappings in them. Hence, if a user page is wired,
5528 		 * the PT page will be also.
5529 		 */
5530 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
5531 		    (orig_l3 & ATTR_SW_WIRED) == 0)
5532 			pmap->pm_stats.wired_count++;
5533 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5534 		    (orig_l3 & ATTR_SW_WIRED) != 0)
5535 			pmap->pm_stats.wired_count--;
5536 
5537 		/*
5538 		 * Remove the extra PT page reference.
5539 		 */
5540 		if (mpte != NULL) {
5541 			mpte->ref_count--;
5542 			KASSERT(mpte->ref_count > 0,
5543 			    ("pmap_enter: missing reference to page table page,"
5544 			     " va: 0x%lx", va));
5545 		}
5546 
5547 		/*
5548 		 * Has the physical page changed?
5549 		 */
5550 		if (opa == pa) {
5551 			/*
5552 			 * No, might be a protection or wiring change.
5553 			 */
5554 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5555 			    (new_l3 & ATTR_SW_DBM) != 0)
5556 				vm_page_aflag_set(m, PGA_WRITEABLE);
5557 			goto validate;
5558 		}
5559 
5560 		/*
5561 		 * The physical page has changed.  Temporarily invalidate
5562 		 * the mapping.
5563 		 */
		/* An L3C constituent must be demoted before it is replaced. */
5564 		if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5565 			(void)pmap_demote_l3c(pmap, l3, va);
5566 		orig_l3 = pmap_load_clear(l3);
5567 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5568 		    ("pmap_enter: unexpected pa update for %#lx", va));
5569 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5570 			om = PHYS_TO_VM_PAGE(opa);
5571 
5572 			/*
5573 			 * The pmap lock is sufficient to synchronize with
5574 			 * concurrent calls to pmap_page_test_mappings() and
5575 			 * pmap_ts_referenced().
5576 			 */
5577 			if (pmap_pte_dirty(pmap, orig_l3))
5578 				vm_page_dirty(om);
5579 			if ((orig_l3 & ATTR_AF) != 0) {
5580 				pmap_invalidate_page(pmap, va, true);
5581 				vm_page_aflag_set(om, PGA_REFERENCED);
5582 			}
5583 			CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
			/*
			 * Recycle the old page's pv entry for the new page if
			 * it is managed; otherwise, free it.
			 */
5584 			pv = pmap_pvh_remove(&om->md, pmap, va);
5585 			if ((m->oflags & VPO_UNMANAGED) != 0)
5586 				free_pv_entry(pmap, pv);
5587 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5588 			    TAILQ_EMPTY(&om->md.pv_list) &&
5589 			    ((om->flags & PG_FICTITIOUS) != 0 ||
5590 			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5591 				vm_page_aflag_clear(om, PGA_WRITEABLE);
5592 		} else {
5593 			KASSERT((orig_l3 & ATTR_AF) != 0,
5594 			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5595 			pmap_invalidate_page(pmap, va, true);
5596 		}
5597 		orig_l3 = 0;
5598 	} else {
5599 		/*
5600 		 * Increment the counters.
5601 		 */
5602 		if ((new_l3 & ATTR_SW_WIRED) != 0)
5603 			pmap->pm_stats.wired_count++;
5604 		pmap_resident_count_inc(pmap, 1);
5605 	}
5606 	/*
5607 	 * Enter on the PV list if part of our managed memory.
5608 	 */
5609 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5610 		if (pv == NULL) {
5611 			pv = get_pv_entry(pmap, &lock);
5612 			pv->pv_va = va;
5613 		}
5614 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5615 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5616 		m->md.pv_gen++;
5617 		if ((new_l3 & ATTR_SW_DBM) != 0)
5618 			vm_page_aflag_set(m, PGA_WRITEABLE);
5619 	}
5620 
5621 validate:
5622 	if (pmap->pm_stage == PM_STAGE1) {
5623 		/*
5624 		 * Sync icache if exec permission and attribute
5625 		 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping
5626 		 * is stored and made valid for hardware table walk. If done
5627 		 * later, then other can access this page before caches are
5628 		 * properly synced. Don't do it for kernel memory which is
5629 		 * mapped with exec permission even if the memory isn't going
5630 		 * to hold executable code. The only time when icache sync is
5631 		 * needed is after kernel module is loaded and the relocation
5632 		 * info is processed. And it's done in elf_cpu_load_file().
5633 		*/
5634 		if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
5635 		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5636 		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5637 			PMAP_ASSERT_STAGE1(pmap);
5638 			cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
5639 			    PAGE_SIZE);
5640 		}
5641 	} else {
5642 		cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5643 	}
5644 
5645 	/*
5646 	 * Update the L3 entry
5647 	 */
5648 	if (pmap_l3_valid(orig_l3)) {
5649 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
5650 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5651 			/* same PA, different attributes */
5652 			if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5653 				(void)pmap_demote_l3c(pmap, l3, va);
5654 			orig_l3 = pmap_load_store(l3, new_l3);
5655 			pmap_invalidate_page(pmap, va, true);
5656 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5657 			    pmap_pte_dirty(pmap, orig_l3))
5658 				vm_page_dirty(m);
5659 		} else {
5660 			/*
5661 			 * orig_l3 == new_l3
5662 			 * This can happens if multiple threads simultaneously
5663 			 * access not yet mapped page. This bad for performance
5664 			 * since this can cause full demotion-NOP-promotion
5665 			 * cycle.
5666 			 * Another possible reasons are:
5667 			 * - VM and pmap memory layout are diverged
5668 			 * - tlb flush is missing somewhere and CPU doesn't see
5669 			 *   actual mapping.
5670 			 */
5671 			CTR4(KTR_PMAP, "%s: already mapped page - "
5672 			    "pmap %p va 0x%#lx pte 0x%lx",
5673 			    __func__, pmap, va, new_l3);
5674 		}
5675 	} else {
5676 		/* New mapping */
5677 		pmap_store(l3, new_l3);
5678 		dsb(ishst);
5679 	}
5680 
5681 #if VM_NRESERVLEVEL > 0
5682 	/*
5683 	 * First, attempt L3C promotion, if the virtual and physical addresses
5684 	 * are aligned with each other and an underlying reservation has the
5685 	 * neighboring L3 pages allocated.  The first condition is simply an
5686 	 * optimization that recognizes some eventual promotion failures early
5687 	 * at a lower run-time cost.  Then, if both a level 1 reservation and
5688 	 * the PTP are fully populated, attempt L2 promotion.
5689 	 */
5690 	if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5691 	    (m->flags & PG_FICTITIOUS) == 0 &&
5692 	    (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
5693 	    pmap_promote_l3c(pmap, l3, va) &&
5694 	    full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG))
5695 		(void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5696 #endif
5697 
5698 	rv = KERN_SUCCESS;
5699 out:
5700 	if (lock != NULL)
5701 		rw_wunlock(lock);
5702 	PMAP_UNLOCK(pmap);
5703 	return (rv);
5704 }
5705 
5706 /*
5707  * Tries to create a read- and/or execute-only L2 page mapping.  Returns
5708  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
5709  * value.  See pmap_enter_l2() for the possible error values when "no sleep",
5710  * "no replace", and "no reclaim" are specified.
5711  */
5712 static int
5713 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5714     struct rwlock **lockp)
5715 {
5716 	pd_entry_t new_l2;
5717 
5718 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5719 	PMAP_ASSERT_STAGE1(pmap);
5720 	KASSERT(ADDR_IS_CANONICAL(va),
5721 	    ("%s: Address not in canonical form: %lx", __func__, va));
5722 
	/* Build a read-only L2 block entry with the page's memory attrs. */
5723 	new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | pmap_sh_attr |
5724 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5725 	    L2_BLOCK);
5726 	if ((m->oflags & VPO_UNMANAGED) == 0)
5727 		new_l2 |= ATTR_SW_MANAGED;
5728 	else
		/*
		 * Unmanaged pages get ATTR_AF up front; managed mappings
		 * leave it clear — presumably so that access is detected via
		 * a fault (NOTE(review): confirm against pmap_enter()).
		 */
5729 		new_l2 |= ATTR_AF;
5730 	if ((prot & VM_PROT_EXECUTE) == 0 ||
5731 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5732 		new_l2 |= ATTR_S1_XN;
5733 	if (ADDR_IS_USER(va))
5734 		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5735 	else
5736 		new_l2 |= ATTR_S1_UXN;
5737 	if (pmap != kernel_pmap)
5738 		new_l2 |= ATTR_S1_nG;
5739 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5740 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5741 }
5742 
5743 /*
5744  * Returns true if every page table entry in the specified page table is
5745  * zero.
5746  */
5747 static bool
5748 pmap_every_pte_zero(vm_paddr_t pa)
5749 {
5750 	pt_entry_t *pt_end, *pte;
5751 
5752 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5753 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
5754 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5755 		if (*pte != 0)
5756 			return (false);
5757 	}
5758 	return (true);
5759 }
5760 
5761 /*
5762  * Tries to create the specified L2 page mapping.  Returns KERN_SUCCESS if
5763  * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
5764  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
5765  * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
5766  * within the L2 virtual address range starting at the specified virtual
5767  * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
5768  * L2 page mapping already exists at the specified virtual address.  Returns
5769  * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
5770  * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
5771  * and a PV entry allocation failed.
5772  */
static int
pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
    vm_page_t m, struct rwlock **lockp)
{
	struct spglist free;
	pd_entry_t *l2, old_l2;
	vm_page_t l2pg, mt;
	vm_page_t uwptpg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
	    PMAP_ENTER_NORECLAIM,
	    ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE"));

	/*
	 * Find or allocate the L2 page table page mapping "va".  "l2pg" may
	 * be NULL only for a kernel address (see the managed-mapping abort
	 * path below).
	 */
	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
		    va, pmap);
		return (KERN_RESOURCE_SHORTAGE);
	}

	/*
	 * If bti is not the same for the whole l2 range, return failure
	 * and let vm_fault() cope.  Check after l2 allocation, since
	 * it could sleep.
	 */
	if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) {
		KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
		pmap_abort_ptp(pmap, va, l2pg);
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If there are existing mappings, either abort or remove them.
	 */
	if ((old_l2 = pmap_load(l2)) != 0) {
		KASSERT(l2pg == NULL || l2pg->ref_count > 1,
		    ("pmap_enter_l2: l2pg's ref count is too low"));
		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
			/*
			 * KERN_NO_SPACE distinguishes an existing L2 block
			 * mapping from existing 4KB mappings (KERN_FAILURE).
			 */
			if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
				if (l2pg != NULL)
					l2pg->ref_count--;
				CTR2(KTR_PMAP,
				    "pmap_enter_l2: no space for va %#lx"
				    " in pmap %p", va, pmap);
				return (KERN_NO_SPACE);
			} else if (ADDR_IS_USER(va) ||
			    !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
				if (l2pg != NULL)
					l2pg->ref_count--;
				CTR2(KTR_PMAP,
				    "pmap_enter_l2: failure for va %#lx"
				    " in pmap %p", va, pmap);
				return (KERN_FAILURE);
			}
		}
		/* Demolish the existing mapping(s). */
		SLIST_INIT(&free);
		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
			(void)pmap_remove_l2(pmap, l2, va,
			    pmap_load(pmap_l1(pmap, va)), false, &free, lockp);
		} else {
			if (ADDR_IS_KERNEL(va)) {
				/*
				 * Try to save the ptp in the trie
				 * before any changes to mappings are
				 * made.  Abort on failure.
				 */
				mt = PTE_TO_VM_PAGE(old_l2);
				if (pmap_insert_pt_page(pmap, mt, false,
				    false)) {
					CTR1(KTR_PMAP,
			    "pmap_enter_l2: cannot ins kern ptp va %#lx",
					    va);
					return (KERN_RESOURCE_SHORTAGE);
				}
				/*
				 * Both pmap_remove_l2() and
				 * pmap_remove_l3_range() will zero fill
				 * the L3 kernel page table page.
				 */
			}
			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
			    &free, lockp);
			if (ADDR_IS_KERNEL(va)) {
				/*
				 * The TLB could have an intermediate
				 * entry for the L3 kernel page table
				 * page, so request an invalidation at
				 * all levels after clearing the
				 * L2_TABLE entry.
				 */
				pmap_clear(l2);
				pmap_s1_invalidate_page(pmap, va, false);
			}
		}
		KASSERT(pmap_load(l2) == 0,
		    ("pmap_enter_l2: non-zero L2 entry %p", l2));
		if (ADDR_IS_USER(va)) {
			vm_page_free_pages_toq(&free, true);
		} else {
			/* Kernel page table pages are never freed here. */
			KASSERT(SLIST_EMPTY(&free),
			    ("pmap_enter_l2: freed kernel page table page"));
		}
	}

	/*
	 * Allocate leaf ptpage for wired userspace pages.
	 */
	uwptpg = NULL;
	if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
		uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
		if (uwptpg == NULL) {
			pmap_abort_ptp(pmap, va, l2pg);
			return (KERN_RESOURCE_SHORTAGE);
		}
		uwptpg->pindex = pmap_l2_pindex(va);
		if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
			vm_page_unwire_noq(uwptpg);
			vm_page_free(uwptpg);
			pmap_abort_ptp(pmap, va, l2pg);
			return (KERN_RESOURCE_SHORTAGE);
		}
		pmap_resident_count_inc(pmap, 1);
		uwptpg->ref_count = NL3PG;
	}
	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
		/*
		 * Abort this mapping if its PV entry could not be created.
		 */
		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
			if (l2pg != NULL)
				pmap_abort_ptp(pmap, va, l2pg);
			else {
				/*
				 * A NULL "l2pg" implies a kernel address
				 * whose saved PTP must be removed again.
				 */
				KASSERT(ADDR_IS_KERNEL(va) &&
				    (pmap_load(l2) & ATTR_DESCR_MASK) ==
				    L2_TABLE,
				    ("pmap_enter_l2: invalid kernel L2E"));
				mt = pmap_remove_pt_page(pmap, va);
				KASSERT(mt != NULL,
				    ("pmap_enter_l2: missing kernel PTP"));
			}
			if (uwptpg != NULL) {
				/* Undo the leaf PTP allocation above. */
				mt = pmap_remove_pt_page(pmap, va);
				KASSERT(mt == uwptpg,
				    ("removed pt page %p, expected %p", mt,
				    uwptpg));
				pmap_resident_count_dec(pmap, 1);
				uwptpg->ref_count = 1;
				vm_page_unwire_noq(uwptpg);
				vm_page_free(uwptpg);
			}
			CTR2(KTR_PMAP,
			    "pmap_enter_l2: failure for va %#lx in pmap %p",
			    va, pmap);
			return (KERN_RESOURCE_SHORTAGE);
		}
		if ((new_l2 & ATTR_SW_DBM) != 0)
			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
				vm_page_aflag_set(mt, PGA_WRITEABLE);
	}

	/*
	 * Increment counters.
	 */
	if ((new_l2 & ATTR_SW_WIRED) != 0)
		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;

	/*
	 * Conditionally sync the icache.  See pmap_enter() for details.
	 */
	if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
	    PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
	    pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
		cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
		    L2_SIZE);
	}

	/*
	 * Map the superpage.
	 */
	pmap_store(l2, new_l2);
	dsb(ishst);

	counter_u64_add(pmap_l2_mappings, 1);
	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
	    va, pmap);

	return (KERN_SUCCESS);
}
5965 
5966 /*
5967  * Tries to create a read- and/or execute-only L3C page mapping.  Returns
5968  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
5969  * value.
5970  */
5971 static int
5972 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
5973     vm_prot_t prot, struct rwlock **lockp)
5974 {
5975 	pt_entry_t l3e;
5976 
5977 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5978 	PMAP_ASSERT_STAGE1(pmap);
5979 	KASSERT(ADDR_IS_CANONICAL(va),
5980 	    ("%s: Address not in canonical form: %lx", __func__, va));
5981 
5982 	l3e = VM_PAGE_TO_PTE(m) | pmap_sh_attr |
5983 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5984 	    ATTR_CONTIGUOUS | L3_PAGE;
5985 	if ((m->oflags & VPO_UNMANAGED) == 0)
5986 		l3e |= ATTR_SW_MANAGED;
5987 	else
5988 		l3e |= ATTR_AF;
5989 	if ((prot & VM_PROT_EXECUTE) == 0 ||
5990 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5991 		l3e |= ATTR_S1_XN;
5992 	if (ADDR_IS_USER(va))
5993 		l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5994 	else
5995 		l3e |= ATTR_S1_UXN;
5996 	if (pmap != kernel_pmap)
5997 		l3e |= ATTR_S1_nG;
5998 	return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
5999 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
6000 }
6001 
/*
 * Tries to create the specified ATTR_CONTIGUOUS (L3C) page mapping.  Returns
 * KERN_SUCCESS if the mapping was created.  Otherwise, returns KERN_FAILURE
 * if PMAP_ENTER_NOREPLACE was specified and a mapping already exists within
 * the L3C virtual address range, or if PMAP_ENTER_NOSLEEP was specified and
 * an L3 page table page allocation failed; KERN_PROTECTION_FAILURE if the
 * BTI attributes are not the same across the range; and
 * KERN_RESOURCE_SHORTAGE if a PV entry allocation failed.  "*ml3p" caches
 * the L3 page table page across successive calls on the same PTP.
 */
static int
pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
    vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
{
	pd_entry_t *l2p, *pde;
	pt_entry_t *l3p, *tl3p;
	vm_page_t mt;
	vm_paddr_t pa;
	vm_pindex_t l2pindex;
	int lvl;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((va & L3C_OFFSET) == 0,
	    ("pmap_enter_l3c: va is not aligned"));
	KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
	    ("pmap_enter_l3c: managed mapping within the clean submap"));
	KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
	    ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS"));

	/*
	 * If the L3 PTP is not resident, we attempt to create it here.
	 */
	if (ADDR_IS_USER(va)) {
		/*
		 * Were we given the correct L3 PTP?  If so, we can simply
		 * increment its ref count.
		 */
		l2pindex = pmap_l2_pindex(va);
		if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
			(*ml3p)->ref_count += L3C_ENTRIES;
		} else {
retry:
			/*
			 * Get the L2 entry.
			 */
			pde = pmap_pde(pmap, va, &lvl);

			/*
			 * If the L2 entry is a superpage, we either abort or
			 * demote depending on the given flags.
			 */
			if (lvl == 1) {
				l2p = pmap_l1_to_l2(pde, va);
				if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
				    L2_BLOCK) {
					if ((flags & PMAP_ENTER_NOREPLACE) != 0)
						return (KERN_FAILURE);
					l3p = pmap_demote_l2_locked(pmap, l2p,
					    va, lockp);
					if (l3p != NULL) {
						*ml3p = PTE_TO_VM_PAGE(
						    pmap_load(l2p));
						(*ml3p)->ref_count +=
						    L3C_ENTRIES;
						goto have_l3p;
					}
				}
				/* We need to allocate an L3 PTP. */
			}

			/*
			 * If the L3 PTP is mapped, we just increment its ref
			 * count.  Otherwise, we attempt to allocate it.
			 */
			if (lvl == 2 && pmap_load(pde) != 0) {
				*ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
				(*ml3p)->ref_count += L3C_ENTRIES;
			} else {
				*ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
				    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
				if (*ml3p == NULL) {
					if ((flags & PMAP_ENTER_NOSLEEP) != 0)
						return (KERN_FAILURE);

					/*
					 * The page table may have changed
					 * while we slept.
					 */
					goto retry;
				}
				(*ml3p)->ref_count += L3C_ENTRIES - 1;
			}
		}
		l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
	} else {
		/* Kernel addresses never have an associated user PTP. */
		*ml3p = NULL;

		/*
		 * If the L2 entry is a superpage, we either abort or demote
		 * depending on the given flags.
		 */
		pde = pmap_pde(kernel_pmap, va, &lvl);
		if (lvl == 1) {
			l2p = pmap_l1_to_l2(pde, va);
			KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
			    ("pmap_enter_l3c: missing L2 block"));
			if ((flags & PMAP_ENTER_NOREPLACE) != 0)
				return (KERN_FAILURE);
			l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
		} else {
			KASSERT(lvl == 2,
			    ("pmap_enter_l3c: Invalid level %d", lvl));
			l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
			    pmap_load(pde)));
		}
	}
have_l3p:
	/* "l3p" now points at the first of the L3C_ENTRIES target PTEs. */
	l3p = &l3p[pmap_l3_index(va)];

	/*
	 * If bti is not the same for the whole L3C range, return failure
	 * and let vm_fault() cope.  Check after L3 allocation, since
	 * it could sleep.
	 */
	if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) {
		KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP"));
		(*ml3p)->ref_count -= L3C_ENTRIES - 1;
		pmap_abort_ptp(pmap, va, *ml3p);
		*ml3p = NULL;
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If there are existing mappings, either abort or remove them.
	 */
	if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
		for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
			if (pmap_load(tl3p) != 0) {
				if (*ml3p != NULL)
					(*ml3p)->ref_count -= L3C_ENTRIES;
				return (KERN_FAILURE);
			}
		}
	} else {
		/*
		 * Because we increment the L3 page's reference count above,
		 * it is guaranteed not to be freed here and we can pass NULL
		 * instead of a valid free list.
		 */
		pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
		    va + L3C_SIZE, NULL, lockp);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((l3e & ATTR_SW_MANAGED) != 0) {
		if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
			if (*ml3p != NULL) {
				(*ml3p)->ref_count -= L3C_ENTRIES - 1;
				pmap_abort_ptp(pmap, va, *ml3p);
				*ml3p = NULL;
			}
			return (KERN_RESOURCE_SHORTAGE);
		}
		if ((l3e & ATTR_SW_DBM) != 0)
			for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
				vm_page_aflag_set(mt, PGA_WRITEABLE);
	}

	/*
	 * Increment counters.
	 */
	if ((l3e & ATTR_SW_WIRED) != 0)
		pmap->pm_stats.wired_count += L3C_ENTRIES;
	pmap_resident_count_inc(pmap, L3C_ENTRIES);

	pa = VM_PAGE_TO_PHYS(m);
	KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));

	/*
	 * Sync the icache before the mapping is stored.
	 */
	if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);

	/*
	 * Map the superpage.
	 */
	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
		pmap_store(tl3p, l3e);
		l3e += L3_SIZE;
	}
	dsb(ishst);

	counter_u64_add(pmap_l3c_mappings, 1);
	CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
	    va, pmap);
	return (KERN_SUCCESS);
}
6193 
6194 /*
6195  * Maps a sequence of resident pages belonging to the same object.
6196  * The sequence begins with the given page m_start.  This page is
6197  * mapped at the given virtual address start.  Each subsequent page is
6198  * mapped at a virtual address that is offset from start by the same
6199  * amount as the page is offset from m_start within the object.  The
6200  * last page in the sequence is the page with the largest offset from
6201  * m_start that can be mapped at a virtual address less than the given
6202  * virtual address end.  Not every virtual page between start and end
6203  * is mapped; only those for which a resident page exists with the
6204  * corresponding offset from m_start are mapped.
6205  */
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
	struct pctrie_iter pages;
	struct rwlock *lock;
	vm_offset_t va;
	vm_page_t m, mpte;
	int rv;

	VM_OBJECT_ASSERT_LOCKED(m_start->object);

	/* "mpte" caches the L3 PTP between successive 4KB/L3C mappings. */
	mpte = NULL;
	vm_page_iter_limit_init(&pages, m_start->object,
	    m_start->pindex + atop(end - start));
	m = vm_radix_iter_lookup(&pages, m_start->pindex);
	lock = NULL;
	PMAP_LOCK(pmap);
	while (m != NULL) {
		va = start + ptoa(m->pindex - m_start->pindex);
		/*
		 * Prefer the largest aligned mapping the page's psind
		 * allows.  KERN_NO_SPACE means a superpage mapping already
		 * exists, so the range can simply be skipped.
		 */
		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
		    m->psind == 2 && pmap_ps_enabled(pmap) &&
		    ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
		    KERN_SUCCESS || rv == KERN_NO_SPACE)) {
			m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE);
		} else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
		    m->psind >= 1 && pmap_ps_enabled(pmap) &&
		    ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
		    &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) {
			m = vm_radix_iter_jump(&pages, L3C_ENTRIES);
		} else {
			/*
			 * In general, if a superpage mapping were possible,
			 * it would have been created above.  That said, if
			 * start and end are not superpage aligned, then
			 * promotion might be possible at the ends of [start,
			 * end).  However, in practice, those promotion
			 * attempts are so unlikely to succeed that they are
			 * not worth trying.
			 */
			mpte = pmap_enter_quick_locked(pmap, va, m, prot |
			    VM_PROT_NO_PROMOTE, mpte, &lock);
			m = vm_radix_iter_step(&pages);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}
6255 
6256 /*
6257  * this code makes some *MAJOR* assumptions:
6258  * 1. Current pmap & pmap exists.
6259  * 2. Not wired.
6260  * 3. Read access.
6261  * 4. No page table pages.
6262  * but is *MUCH* faster than pmap_enter...
6263  */
6264 
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	struct rwlock *lock;

	/*
	 * Delegate to pmap_enter_quick_locked(), discarding its returned PTP
	 * hint, and release any PV list lock that it acquired.
	 */
	lock = NULL;
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}
6277 
/*
 * Tries to create a read-only (and possibly executable) 4KB page mapping for
 * "m" at "va" without blocking.  Returns the L3 page table page for "va" as
 * a hint for the next call, or NULL if the mapping was not created (because
 * a mapping already exists, or a PTP or PV entry allocation failed).  "mpte"
 * is the hint from the previous call.  After a successful mapping, L3C and
 * L2 promotion are attempted unless VM_PROT_NO_PROMOTE is specified.
 */
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
	pt_entry_t *l1, *l2, *l3, l3_val;
	vm_paddr_t pa;
	int full_lvl, lvl;

	KASSERT(!VA_IS_CLEANMAP(va) ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	l2 = NULL;

	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (ADDR_IS_USER(va)) {
		vm_pindex_t l2pindex;

		/*
		 * Calculate pagetable page index
		 */
		l2pindex = pmap_l2_pindex(va);
		if (mpte && (mpte->pindex == l2pindex)) {
			/* The hint from the previous call is still valid. */
			mpte->ref_count++;
		} else {
			/*
			 * If the page table page is mapped, we just increment
			 * the hold count, and activate it.  Otherwise, we
			 * attempt to allocate a page table page, passing NULL
			 * instead of the PV list lock pointer because we don't
			 * intend to sleep.  If this attempt fails, we don't
			 * retry.  Instead, we give up.
			 */
			l1 = pmap_l1(pmap, va);
			if (l1 != NULL && pmap_load(l1) != 0) {
				/* Never replace an existing block mapping. */
				if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
				    L1_BLOCK)
					return (NULL);
				l2 = pmap_l1_to_l2(l1, va);
				if (pmap_load(l2) != 0) {
					if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
					    L2_BLOCK)
						return (NULL);
					mpte = PTE_TO_VM_PAGE(pmap_load(l2));
					mpte->ref_count++;
				} else {
					mpte = _pmap_alloc_l3(pmap, l2pindex,
					    NULL);
					if (mpte == NULL)
						return (mpte);
				}
			} else {
				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
				if (mpte == NULL)
					return (mpte);
			}
		}
		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
		l3 = &l3[pmap_l3_index(va)];
	} else {
		/* Kernel page table pages are preallocated and never freed. */
		mpte = NULL;
		l2 = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(l2 != NULL,
		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
		     va));
		KASSERT(lvl == 2,
		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
		l3 = pmap_l2_to_l3(l2, va);
	}

	/*
	 * Abort if a mapping already exists.
	 */
	if (pmap_load(l3) != 0) {
		if (mpte != NULL)
			mpte->ref_count--;
		return (NULL);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0 &&
	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
		if (mpte != NULL)
			pmap_abort_ptp(pmap, va, mpte);
		return (NULL);
	}

	/*
	 * Increment counters
	 */
	pmap_resident_count_inc(pmap, 1);

	pa = VM_PAGE_TO_PHYS(m);
	l3_val = PHYS_TO_PTE(pa) | pmap_sh_attr |
	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
	l3_val |= pmap_pte_bti(pmap, va);
	if ((prot & VM_PROT_EXECUTE) == 0 ||
	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
		l3_val |= ATTR_S1_XN;
	if (ADDR_IS_USER(va))
		l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
	else
		l3_val |= ATTR_S1_UXN;
	if (pmap != kernel_pmap)
		l3_val |= ATTR_S1_nG;

	/*
	 * Now validate mapping with RO protection
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0)
		l3_val |= ATTR_SW_MANAGED;
	else
		l3_val |= ATTR_AF;

	/* Sync icache before the mapping is stored to PTE */
	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);

	pmap_store(l3, l3_val);
	dsb(ishst);

#if VM_NRESERVLEVEL > 0
	/*
	 * First, attempt L3C promotion, if the virtual and physical addresses
	 * are aligned with each other and an underlying reservation has the
	 * neighboring L3 pages allocated.  The first condition is simply an
	 * optimization that recognizes some eventual promotion failures early
	 * at a lower run-time cost.  Then, attempt L2 promotion, if both a
	 * level 1 reservation and the PTP are fully populated.
	 */
	if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
	    (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
	    (m->flags & PG_FICTITIOUS) == 0 &&
	    (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
	    pmap_promote_l3c(pmap, l3, va) &&
	    full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) {
		if (l2 == NULL)
			l2 = pmap_l2(pmap, va);

		/*
		 * If promotion succeeds, then the next call to this function
		 * should not be given the unmapped PTP as a hint.
		 */
		if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
			mpte = NULL;
	}
#endif

	return (mpte);
}
6438 
6439 /*
6440  * This code maps large physical mmap regions into the
6441  * processor address space.  Note that some shortcuts
6442  * are taken, but the code works.
6443  */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{

	/* Intentionally a no-op on arm64 beyond these sanity checks. */
	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
	    ("pmap_object_init_pt: non-device object"));
}
6453 
6454 /*
6455  *	Clear the wired attribute from the mappings for the specified range of
6456  *	addresses in the given pmap.  Every valid mapping within that range
6457  *	must have the wired attribute set.  In contrast, invalid mappings
6458  *	cannot have the wired attribute set, so they are ignored.
6459  *
6460  *	The wired attribute of the page table entry is not a hardware feature,
6461  *	so there is no need to invalidate any TLB entries.
6462  */
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t *l3;
	bool partial_l3c;	/* unwiring only part of an L3C range? */

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			/* Skip the entire unmapped L0 range. */
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		l1 = pmap_l0_to_l1(l0, sva);
		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		if (pmap_load(l1) == 0)
			continue;

		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS(pmap != kernel_pmap);
			MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
			    ATTR_SW_WIRED)) == ATTR_SW_WIRED);
			pmap_clear_bits(l1, ATTR_SW_WIRED);
			pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (pmap_load(l2) == 0)
			continue;

		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
				panic("pmap_unwire: l2 %#jx is missing "
				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));

			/*
			 * Are we unwiring the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_clear_bits(l2, ATTR_SW_WIRED);
				pmap->pm_stats.wired_count -= L2_SIZE /
				    PAGE_SIZE;
				continue;
			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
				panic("pmap_unwire: demotion failed");
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_unwire: Invalid l2 entry after demotion"));

		if (va_next > eva)
			va_next = eva;
		for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
		    sva != va_next; l3++, sva += L3_SIZE) {
			if (pmap_load(l3) == 0)
				continue;
			if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
				/*
				 * Avoid demotion for whole-page unwiring.
				 */
				if ((sva & L3C_OFFSET) == 0) {
					/*
					 * Handle the possibility that
					 * "va_next" is zero because of
					 * address wraparound.
					 */
					partial_l3c = sva + L3C_OFFSET >
					    va_next - 1;
				}
				if (partial_l3c)
					(void)pmap_demote_l3c(pmap, l3, sva);
			}
			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
				panic("pmap_unwire: l3 %#jx is missing "
				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));

			/*
			 * ATTR_SW_WIRED must be cleared atomically.  Although
			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
			 * the System MMU may write to the entry concurrently.
			 */
			pmap_clear_bits(l3, ATTR_SW_WIRED);
			pmap->pm_stats.wired_count--;
		}
	}
	PMAP_UNLOCK(pmap);
}
6567 
6568 /*
6569  * This function requires that the caller has already added one to ml3's
6570  * ref_count in anticipation of creating a 4KB page mapping.
6571  */
/*
 * Copies the L3C mapping "l3e" into "pmap" at "va", clearing the wired and
 * accessed bits.  Returns true on success and false if a mapping already
 * exists within the range or a PV entry cannot be allocated.
 */
static bool
pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
    vm_page_t ml3, struct rwlock **lockp)
{
	pt_entry_t *tl3p;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((va & L3C_OFFSET) == 0,
	    ("pmap_copy_l3c: va is not aligned"));
	KASSERT((l3e & ATTR_SW_MANAGED) != 0,
	    ("pmap_copy_l3c: l3e is not managed"));

	/*
	 * Abort if a mapping already exists.
	 */
	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
		if (pmap_load(tl3p) != 0) {
			/* Give back the caller's anticipatory reference. */
			if (ml3 != NULL)
				ml3->ref_count--;
			return (false);
		}

	if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
		if (ml3 != NULL)
			pmap_abort_ptp(pmap, va, ml3);
		return (false);
	}
	/* Account for the remaining L3C entries. */
	ml3->ref_count += L3C_ENTRIES - 1;

	/*
	 * Clear the wired and accessed bits.  However, leave the dirty bit
	 * unchanged because read/write superpage mappings are required to be
	 * dirty.
	 */
	l3e &= ~(ATTR_SW_WIRED | ATTR_AF);

	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
		pmap_store(tl3p, l3e);
		l3e += L3_SIZE;
	}
	pmap_resident_count_inc(pmap, L3C_ENTRIES);
	counter_u64_add(pmap_l3c_mappings, 1);
	CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
	    va, pmap);
	return (true);
}
6618 
6619 /*
6620  *	Copy the range specified by src_addr/len
6621  *	from the source map to the range dst_addr/len
6622  *	in the destination map.
6623  *
6624  *	This routine is only advisory and need not do anything.
6625  *
6626  *	Because the executable mappings created by this routine are copied,
6627  *	it should not have to flush the instruction cache.
6628  */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
	struct rwlock *lock;
	pd_entry_t *l0, *l1, *l2, srcptepaddr;
	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
	vm_offset_t addr, end_addr, va_next;
	vm_page_t dst_m, dstmpte, srcmpte;

	PMAP_ASSERT_STAGE1(dst_pmap);
	PMAP_ASSERT_STAGE1(src_pmap);

	/*
	 * Since this routine is advisory, do nothing unless the source and
	 * destination ranges coincide, which lets the same page table
	 * indices be used in both pmaps.
	 */
	if (dst_addr != src_addr)
		return;
	end_addr = src_addr + len;
	lock = NULL;
	/* Acquire both pmap locks in address order to avoid deadlock. */
	if (dst_pmap < src_pmap) {
		PMAP_LOCK(dst_pmap);
		PMAP_LOCK(src_pmap);
	} else {
		PMAP_LOCK(src_pmap);
		PMAP_LOCK(dst_pmap);
	}
	/* Walk the source page tables, one L0/L1/L2 range at a time. */
	for (addr = src_addr; addr < end_addr; addr = va_next) {
		l0 = pmap_l0(src_pmap, addr);
		if (pmap_load(l0) == 0) {
			/* Skip the whole unmapped L0 range. */
			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
			if (va_next < addr)
				va_next = end_addr;
			continue;
		}

		va_next = (addr + L1_SIZE) & ~L1_OFFSET;
		if (va_next < addr)
			va_next = end_addr;
		l1 = pmap_l0_to_l1(l0, addr);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/* Copy an entire 1GB block mapping with one store. */
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= end_addr,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
			    pmap_load(l1), addr, end_addr, va_next));
			srcptepaddr = pmap_load(l1);
			l1 = pmap_l1(dst_pmap, addr);
			if (l1 == NULL) {
				if (_pmap_alloc_l3(dst_pmap,
				    pmap_l0_pindex(addr), NULL) == NULL)
					break;
				l1 = pmap_l1(dst_pmap, addr);
			} else {
				/*
				 * The destination L1 table already exists;
				 * take an extra reference on it for the new
				 * mapping.
				 */
				l0 = pmap_l0(dst_pmap, addr);
				dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
				dst_m->ref_count++;
			}
			KASSERT(pmap_load(l1) == 0,
			    ("1G mapping present in dst pmap "
			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
			    pmap_load(l1), addr, end_addr, va_next));
			/* The copy is never wired, even if the source is. */
			pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
			pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
			continue;
		}

		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
		if (va_next < addr)
			va_next = end_addr;
		l2 = pmap_l1_to_l2(l1, addr);
		srcptepaddr = pmap_load(l2);
		if (srcptepaddr == 0)
			continue;
		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * We can only virtual copy whole superpages.
			 */
			if ((addr & L2_OFFSET) != 0 ||
			    addr + L2_SIZE > end_addr)
				continue;
			l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
			if (l2 == NULL)
				break;
			if (pmap_load(l2) == 0 &&
			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
			    PMAP_ENTER_NORECLAIM, &lock))) {
				/*
				 * We leave the dirty bit unchanged because
				 * managed read/write superpage mappings are
				 * required to be dirty.  However, managed
				 * superpage mappings are not required to
				 * have their accessed bit set, so we clear
				 * it because we don't know if this mapping
				 * will be used.
				 */
				srcptepaddr &= ~ATTR_SW_WIRED;
				if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
					srcptepaddr &= ~ATTR_AF;
				pmap_store(l2, srcptepaddr);
				pmap_resident_count_inc(dst_pmap, L2_SIZE /
				    PAGE_SIZE);
				counter_u64_add(pmap_l2_mappings, 1);
			} else
				pmap_abort_ptp(dst_pmap, addr, dst_m);
			continue;
		}
		/* The source L2 entry references an L3 table; copy its PTEs. */
		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_copy: invalid L2 entry"));
		srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
		KASSERT(srcmpte->ref_count > 0,
		    ("pmap_copy: source page table page is unused"));
		if (va_next > end_addr)
			va_next = end_addr;
		src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
		src_pte = &src_pte[pmap_l3_index(addr)];
		dstmpte = NULL;
		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
			ptetemp = pmap_load(src_pte);

			/*
			 * We only virtual copy managed pages.
			 */
			if ((ptetemp & ATTR_SW_MANAGED) == 0)
				continue;

			if (dstmpte != NULL) {
				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
				    ("dstmpte pindex/addr mismatch"));
				dstmpte->ref_count++;
			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
			    NULL)) == NULL)
				goto out;
			dst_pte = (pt_entry_t *)
			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
			dst_pte = &dst_pte[pmap_l3_index(addr)];
			/*
			 * Copy a whole L3C superpage at once when the source
			 * run is contiguous and lies entirely within range.
			 */
			if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
			    L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
			    va_next - 1) {
				if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
				    ptetemp, dstmpte, &lock))
					goto out;
				addr += L3C_SIZE - PAGE_SIZE;
				src_pte += L3C_ENTRIES - 1;
			} else if (pmap_load(dst_pte) == 0 &&
			    pmap_try_insert_pv_entry(dst_pmap, addr,
			    PTE_TO_VM_PAGE(ptetemp), &lock)) {
				/*
				 * Clear the wired, contiguous, modified, and
				 * accessed bits from the destination PTE.
				 * The contiguous bit is cleared because we
				 * are not copying the entire L3C superpage.
				 */
				mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
				    ATTR_AF;
				nbits = 0;
				if ((ptetemp & ATTR_SW_DBM) != 0)
					nbits |= ATTR_S1_AP_RW_BIT;
				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
				pmap_resident_count_inc(dst_pmap, 1);
			} else {
				pmap_abort_ptp(dst_pmap, addr, dstmpte);
				goto out;
			}
			/* Have we copied all of the valid mappings? */
			if (dstmpte->ref_count >= srcmpte->ref_count)
				break;
		}
	}
out:
	/*
	 * XXX This barrier may not be needed because the destination pmap is
	 * not active.
	 */
	dsb(ishst);

	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(src_pmap);
	PMAP_UNLOCK(dst_pmap);
}
6810 
6811 int
6812 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6813 {
6814 	int error;
6815 
6816 	if (dst_pmap->pm_stage != src_pmap->pm_stage)
6817 		return (EINVAL);
6818 
6819 	if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6820 		return (0);
6821 
6822 	for (;;) {
6823 		if (dst_pmap < src_pmap) {
6824 			PMAP_LOCK(dst_pmap);
6825 			PMAP_LOCK(src_pmap);
6826 		} else {
6827 			PMAP_LOCK(src_pmap);
6828 			PMAP_LOCK(dst_pmap);
6829 		}
6830 		error = pmap_bti_copy(dst_pmap, src_pmap);
6831 		/* Clean up partial copy on failure due to no memory. */
6832 		if (error == ENOMEM)
6833 			pmap_bti_deassign_all(dst_pmap);
6834 		PMAP_UNLOCK(src_pmap);
6835 		PMAP_UNLOCK(dst_pmap);
6836 		if (error != ENOMEM)
6837 			break;
6838 		vm_wait(NULL);
6839 	}
6840 	return (error);
6841 }
6842 
6843 /*
6844  *	pmap_zero_page zeros the specified hardware page by mapping
6845  *	the page into KVM and using bzero to clear its contents.
6846  */
6847 void
6848 pmap_zero_page(vm_page_t m)
6849 {
6850 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6851 
6852 	pagezero((void *)va);
6853 }
6854 
6855 /*
6856  *	pmap_zero_page_area zeros the specified hardware page by mapping
6857  *	the page into KVM and using bzero to clear its contents.
6858  *
6859  *	off and size may not cover an area beyond a single hardware page.
6860  */
6861 void
6862 pmap_zero_page_area(vm_page_t m, int off, int size)
6863 {
6864 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6865 
6866 	if (off == 0 && size == PAGE_SIZE)
6867 		pagezero((void *)va);
6868 	else
6869 		bzero((char *)va + off, size);
6870 }
6871 
6872 /*
6873  *	pmap_copy_page copies the specified (machine independent)
6874  *	page by mapping the page into virtual memory and using
6875  *	bcopy to copy the page, one machine dependent page at a
6876  *	time.
6877  */
6878 void
6879 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6880 {
6881 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6882 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6883 
6884 	pagecopy((void *)src, (void *)dst);
6885 }
6886 
/*
 * Permit I/O on buffers that lack a kernel virtual mapping; with the
 * direct map available, their pages can be reached without creating
 * transient mappings.
 */
int unmapped_buf_allowed = 1;
6888 
6889 void
6890 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6891     vm_offset_t b_offset, int xfersize)
6892 {
6893 	void *a_cp, *b_cp;
6894 	vm_page_t m_a, m_b;
6895 	vm_paddr_t p_a, p_b;
6896 	vm_offset_t a_pg_offset, b_pg_offset;
6897 	int cnt;
6898 
6899 	while (xfersize > 0) {
6900 		a_pg_offset = a_offset & PAGE_MASK;
6901 		m_a = ma[a_offset >> PAGE_SHIFT];
6902 		p_a = m_a->phys_addr;
6903 		b_pg_offset = b_offset & PAGE_MASK;
6904 		m_b = mb[b_offset >> PAGE_SHIFT];
6905 		p_b = m_b->phys_addr;
6906 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6907 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6908 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6909 			panic("!DMAP a %lx", p_a);
6910 		} else {
6911 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6912 		}
6913 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
6914 			panic("!DMAP b %lx", p_b);
6915 		} else {
6916 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
6917 		}
6918 		bcopy(a_cp, b_cp, cnt);
6919 		a_offset += cnt;
6920 		b_offset += cnt;
6921 		xfersize -= cnt;
6922 	}
6923 }
6924 
6925 vm_offset_t
6926 pmap_quick_enter_page(vm_page_t m)
6927 {
6928 
6929 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
6930 }
6931 
void
pmap_quick_remove_page(vm_offset_t addr)
{
	/*
	 * Nothing to undo: pmap_quick_enter_page() returns a permanent
	 * direct map address and creates no temporary mapping.
	 */
}
6936 
6937 /*
6938  * Returns true if the pmap's pv is one of the first
6939  * 16 pvs linked to from this page.  This count may
6940  * be changed upwards or downwards in the future; it
6941  * is only necessary that true be returned for a small
6942  * subset of pmaps for proper page aging.
6943  */
6944 bool
6945 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6946 {
6947 	struct md_page *pvh;
6948 	struct rwlock *lock;
6949 	pv_entry_t pv;
6950 	int loops = 0;
6951 	bool rv;
6952 
6953 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6954 	    ("pmap_page_exists_quick: page %p is not managed", m));
6955 	rv = false;
6956 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6957 	rw_rlock(lock);
6958 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6959 		if (PV_PMAP(pv) == pmap) {
6960 			rv = true;
6961 			break;
6962 		}
6963 		loops++;
6964 		if (loops >= 16)
6965 			break;
6966 	}
6967 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6968 		pvh = page_to_pvh(m);
6969 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6970 			if (PV_PMAP(pv) == pmap) {
6971 				rv = true;
6972 				break;
6973 			}
6974 			loops++;
6975 			if (loops >= 16)
6976 				break;
6977 		}
6978 	}
6979 	rw_runlock(lock);
6980 	return (rv);
6981 }
6982 
6983 /*
6984  *	pmap_page_wired_mappings:
6985  *
6986  *	Return the number of managed mappings to the given physical page
6987  *	that are wired.
6988  */
int
pmap_page_wired_mappings(vm_page_t m)
{
	struct rwlock *lock;
	struct md_page *pvh;
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;
	int count, md_gen, pvh_gen;

	/* Unmanaged pages carry no PV entries, so no mappings are tracked. */
	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (0);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	count = 0;
	/* Count wired 4KB (L3) mappings of the page. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the PV list lock before blocking on the pmap
			 * lock to preserve lock ordering; if the PV list
			 * changed meanwhile (generation count advanced),
			 * restart the scan.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
			count++;
		PMAP_UNLOCK(pmap);
	}
	if ((m->flags & PG_FICTITIOUS) == 0) {
		/* Also count wired L2 block mappings containing the page. */
		pvh = page_to_pvh(m);
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			if (!PMAP_TRYLOCK(pmap)) {
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
			if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
				count++;
			PMAP_UNLOCK(pmap);
		}
	}
	rw_runlock(lock);
	return (count);
}
7047 
7048 /*
7049  * Returns true if the given page is mapped individually or as part of
7050  * a 2mpage.  Otherwise, returns false.
7051  */
7052 bool
7053 pmap_page_is_mapped(vm_page_t m)
7054 {
7055 	struct rwlock *lock;
7056 	bool rv;
7057 
7058 	if ((m->oflags & VPO_UNMANAGED) != 0)
7059 		return (false);
7060 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7061 	rw_rlock(lock);
7062 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
7063 	    ((m->flags & PG_FICTITIOUS) == 0 &&
7064 	    !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
7065 	rw_runlock(lock);
7066 	return (rv);
7067 }
7068 
7069 /*
7070  * Destroy all managed, non-wired mappings in the given user-space
7071  * pmap.  This pmap cannot be active on any processor besides the
7072  * caller.
7073  *
7074  * This function cannot be applied to the kernel pmap.  Moreover, it
7075  * is not intended for general use.  It is only to be used during
7076  * process termination.  Consequently, it can be implemented in ways
7077  * that make it faster than pmap_remove().  First, it can more quickly
7078  * destroy mappings by iterating over the pmap's collection of PV
7079  * entries, rather than searching the page table.  Second, it doesn't
7080  * have to test and clear the page table entries atomically, because
7081  * no processor is currently accessing the user address space.  In
7082  * particular, a page table entry's dirty bit won't change state once
7083  * this function starts.
7084  */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;
	struct spglist free;
	struct pv_chunklist free_chunks[PMAP_MEMDOM];
	vm_page_t m, ml3, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct rwlock *lock;
	int64_t bit;
	uint64_t inuse, bitmask;
	int allfree, field, i, idx, lvl;
	int freed __pvused;
	vm_paddr_t pa;

	lock = NULL;

	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&free_chunks[i]);
	SLIST_INIT(&free);
	PMAP_LOCK(pmap);
	/*
	 * Iterate over the pmap's PV chunks rather than walking the page
	 * tables; each in-use PV entry identifies one mapping to destroy.
	 */
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		allfree = 1;
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/* A clear bit in pc_map marks an in-use PV entry. */
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = ffsl(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pde = pmap_pde(pmap, pv->pv_va, &lvl);
				KASSERT(pde != NULL,
				    ("Attempting to remove an unmapped page"));

				/* Locate the PTE: lvl 1 is an L2 block, lvl 2 an L3 page. */
				switch(lvl) {
				case 1:
					pte = pmap_l1_to_l2(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L2_BLOCK,
					    ("Attempting to remove an invalid "
					    "block: %lx", tpte));
					break;
				case 2:
					pte = pmap_l2_to_l3(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L3_PAGE,
					    ("Attempting to remove an invalid "
					     "page: %lx", tpte));
					break;
				default:
					panic(
					    "Invalid page directory level: %d",
					    lvl);
				}

				/*
				 * We cannot remove wired mappings at this time.
				 *
				 * For L3C superpages, all of the constituent PTEs
				 * should have the wired bit set, so we don't
				 * check for ATTR_CONTIGUOUS here.
				 */
				if (tpte & ATTR_SW_WIRED) {
					allfree = 0;
					continue;
				}

				/* Mark free */
				pc->pc_map[field] |= bitmask;

				/*
				 * Because this pmap is not active on other
				 * processors, the dirty bit cannot have
				 * changed state since we last loaded pte.
				 */
				pmap_clear(pte);

				pa = PTE_TO_PHYS(tpte);

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad pte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean/reference bits.
				 *
				 * We don't check for ATTR_CONTIGUOUS here
				 * because writeable L3C superpages are expected
				 * to be dirty, i.e., every constituent PTE
				 * should be dirty.
				 */
				if (pmap_pte_dirty(pmap, tpte)) {
					switch (lvl) {
					case 1:
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
						break;
					case 2:
						vm_page_dirty(m);
						break;
					}
				}

				/* Switch to the PV list lock for this page. */
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				/* Unlink the PV entry; release the PT page for blocks. */
				switch (lvl) {
				case 1:
					pmap_resident_count_dec(pmap,
					    L2_SIZE / PAGE_SIZE);
					pvh = page_to_pvh(m);
					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
					pvh->pv_gen++;
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					ml3 = pmap_remove_pt_page(pmap,
					    pv->pv_va);
					if (ml3 != NULL) {
						KASSERT(vm_page_any_valid(ml3),
						    ("pmap_remove_pages: l3 page not promoted"));
						pmap_resident_count_dec(pmap,1);
						KASSERT(ml3->ref_count == NL3PG,
						    ("pmap_remove_pages: l3 page ref count error"));
						ml3->ref_count = 0;
						pmap_add_delayed_free_list(ml3,
						    &free, false);
					}
					break;
				case 2:
					pmap_resident_count_dec(pmap, 1);
					TAILQ_REMOVE(&m->md.pv_list, pv,
					    pv_next);
					m->md.pv_gen++;
					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = page_to_pvh(m);
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m,
							    PGA_WRITEABLE);
					}
					break;
				}
				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
				    &free);
				freed++;
			}
		}
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		/* Free the chunk only if every entry in it was reclaimed. */
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
			    pc_list);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	pmap_invalidate_all(pmap);
	pmap_bti_deassign_all(pmap);
	free_pv_chunk_batch(free_chunks);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}
7268 
7269 /*
7270  * This is used to check if a page has been accessed or modified.
7271  */
static bool
pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
{
	struct rwlock *lock;
	pv_entry_t pv;
	struct md_page *pvh;
	pt_entry_t l3e, mask, *pte, value;
	pmap_t pmap;
	int md_gen, pvh_gen;
	bool rv;

	rv = false;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	/* First test the page's own 4KB (L3) mappings. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the PV list lock before blocking on the pmap
			 * lock; restart the scan if the list changed in the
			 * meantime.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		mask = 0;
		value = 0;
		if (modified) {
			/* A writable (AP == RW) mapping is treated as dirty. */
			mask |= ATTR_S1_AP_RW_BIT;
			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
		}
		if (accessed) {
			/* Accessed requires AF set on a valid L3 descriptor. */
			mask |= ATTR_AF | ATTR_DESCR_MASK;
			value |= ATTR_AF | L3_PAGE;
		}
		l3e = pmap_load(pte);
		if ((l3e & ATTR_CONTIGUOUS) != 0)
			l3e = pmap_load_l3c(pte);
		PMAP_UNLOCK(pmap);
		rv = (l3e & mask) == value;
		if (rv)
			goto out;
	}
	if ((m->flags & PG_FICTITIOUS) == 0) {
		/* Then test any L2 block mappings containing the page. */
		pvh = page_to_pvh(m);
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			PMAP_ASSERT_STAGE1(pmap);
			if (!PMAP_TRYLOCK(pmap)) {
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
			mask = 0;
			value = 0;
			if (modified) {
				mask |= ATTR_S1_AP_RW_BIT;
				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
			}
			if (accessed) {
				mask |= ATTR_AF | ATTR_DESCR_MASK;
				value |= ATTR_AF | L2_BLOCK;
			}
			rv = (pmap_load(pte) & mask) == value;
			PMAP_UNLOCK(pmap);
			if (rv)
				goto out;
		}
	}
out:
	rw_runlock(lock);
	return (rv);
}
7357 
7358 /*
7359  *	pmap_is_modified:
7360  *
7361  *	Return whether or not the specified physical page was modified
7362  *	in any physical maps.
7363  */
7364 bool
7365 pmap_is_modified(vm_page_t m)
7366 {
7367 
7368 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7369 	    ("pmap_is_modified: page %p is not managed", m));
7370 
7371 	/*
7372 	 * If the page is not busied then this check is racy.
7373 	 */
7374 	if (!pmap_page_is_write_mapped(m))
7375 		return (false);
7376 	return (pmap_page_test_mappings(m, false, true));
7377 }
7378 
7379 /*
7380  *	pmap_is_prefaultable:
7381  *
7382  *	Return whether or not the specified virtual address is eligible
7383  *	for prefault.
7384  */
7385 bool
7386 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
7387 {
7388 	pd_entry_t *pde;
7389 	pt_entry_t *pte;
7390 	bool rv;
7391 	int lvl;
7392 
7393 	/*
7394 	 * Return true if and only if the L3 entry for the specified virtual
7395 	 * address is allocated but invalid.
7396 	 */
7397 	rv = false;
7398 	PMAP_LOCK(pmap);
7399 	pde = pmap_pde(pmap, addr, &lvl);
7400 	if (pde != NULL && lvl == 2) {
7401 		pte = pmap_l2_to_l3(pde, addr);
7402 		rv = pmap_load(pte) == 0;
7403 	}
7404 	PMAP_UNLOCK(pmap);
7405 	return (rv);
7406 }
7407 
7408 /*
7409  *	pmap_is_referenced:
7410  *
7411  *	Return whether or not the specified physical page was referenced
7412  *	in any physical maps.
7413  */
bool
pmap_is_referenced(vm_page_t m)
{
	/* Test for the accessed bit in any of the page's mappings. */
	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_referenced: page %p is not managed", m));
	return (pmap_page_test_mappings(m, true, false));
}
7422 
7423 /*
7424  * Clear the write and modified bits in each of the given page's mappings.
7425  */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pt_entry_t oldpte, *pte, set, clear, mask, val;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));
	vm_page_assert_busied(m);

	/* Nothing to do if no mapping can write the page. */
	if (!pmap_page_is_write_mapped(m))
		return;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/*
	 * First demote any writable L2 block mapping containing the page,
	 * so that write access can be removed from just the page's own
	 * 4KB mapping in the loop below.
	 */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Reacquire in order; restart if the list changed. */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	/* Now write-protect every remaining 4KB mapping of the page. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		oldpte = pmap_load(pte);
		if ((oldpte & ATTR_SW_DBM) != 0) {
			if ((oldpte & ATTR_CONTIGUOUS) != 0) {
				(void)pmap_demote_l3c(pmap, pte, pv->pv_va);

				/*
				 * The L3 entry's accessed bit may have
				 * changed.
				 */
				oldpte = pmap_load(pte);
			}
			/* Select the read-only encoding for this stage. */
			if (pmap->pm_stage == PM_STAGE1) {
				set = ATTR_S1_AP_RW_BIT;
				clear = 0;
				mask = ATTR_S1_AP_RW_BIT;
				val = ATTR_S1_AP(ATTR_S1_AP_RW);
			} else {
				set = 0;
				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
			}
			clear |= ATTR_SW_DBM;
			/* Atomically remove write access and the DBM bit. */
			while (!atomic_fcmpset_64(pte, &oldpte,
			    (oldpte | set) & ~clear))
				cpu_spinwait();

			/* If the mapping was writable, the page may be dirty. */
			if ((oldpte & mask) == val)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}
7520 
7521 /*
7522  *	pmap_ts_referenced:
7523  *
7524  *	Return a count of reference bits for a page, clearing those bits.
7525  *	It is not necessary for every reference bit to be cleared, but it
7526  *	is necessary that 0 only be returned when there are truly no
7527  *	reference bits set.
7528  *
7529  *	As an optimization, update the page's dirty field if a modified bit is
7530  *	found while counting reference bits.  This opportunistic update can be
7531  *	performed at low cost and can eliminate the need for some future calls
7532  *	to pmap_is_modified().  However, since this function stops after
7533  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7534  *	dirty pages.  Those dirty pages will only be detected by a future call
7535  *	to pmap_is_modified().
7536  */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, md_gen, not_cleared, pvh_gen;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	SLIST_INIT(&free);
	cleared = 0;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
retry:
	not_cleared = 0;
	/* First pass: L2 block mappings of the superpage containing m. */
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Reacquire in order; restart if the list changed. */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte)) {
			/*
			 * Although "tpte" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((tpte & ATTR_AF) != 0) {
			pa = VM_PAGE_TO_PHYS(m);

			/*
			 * Since this reference bit is shared by 512 4KB pages,
			 * it should not be cleared every time it is tested.
			 * Apply a simple "hash" function on the physical page
			 * number, the virtual superpage number, and the pmap
			 * address to select one 4KB page out of the 512 on
			 * which testing the reference bit will result in
			 * clearing that reference bit.  This function is
			 * designed to avoid the selection of the same 4KB page
			 * for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, va, true);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	/* Second pass: the page's individual 4KB (L3) mappings. */
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		if ((tpte & ATTR_AF) != 0) {
			if ((tpte & ATTR_SW_WIRED) == 0) {
				/*
				 * Clear the accessed bit in this L3 entry
				 * regardless of the contiguous bit.
				 */
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, pv->pv_va, true);
				cleared++;
			} else
				not_cleared++;
		} else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
		    (pmap_load_l3c(pte) & ATTR_AF) != 0) {
			/*
			 * An L3C superpage mapping is regarded as accessed
			 * until the accessed bit has been cleared in all
			 * of its constituent entries.
			 */
			not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
	return (cleared + not_cleared);
}
7684 
7685 /*
7686  *	Apply the given advice to the specified range of addresses within the
7687  *	given pmap.  Depending on the advice, clear the referenced and/or
7688  *	modified flags in each mapping and set the mapped page's dirty field.
7689  */
7690 void
7691 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7692 {
7693 	struct rwlock *lock;
7694 	vm_offset_t va, va_next, dva;
7695 	vm_page_t m;
7696 	pd_entry_t *l0, *l1, *l2, oldl2;
7697 	pt_entry_t *l3, *dl3, oldl3;
7698 
7699 	PMAP_ASSERT_STAGE1(pmap);
7700 
7701 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
7702 		return;
7703 
7704 	PMAP_LOCK(pmap);
7705 	for (; sva < eva; sva = va_next) {
7706 		l0 = pmap_l0(pmap, sva);
7707 		if (pmap_load(l0) == 0) {
7708 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
7709 			if (va_next < sva)
7710 				va_next = eva;
7711 			continue;
7712 		}
7713 
7714 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
7715 		if (va_next < sva)
7716 			va_next = eva;
7717 		l1 = pmap_l0_to_l1(l0, sva);
7718 		if (pmap_load(l1) == 0)
7719 			continue;
7720 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
7721 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7722 			continue;
7723 		}
7724 
7725 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
7726 		if (va_next < sva)
7727 			va_next = eva;
7728 		l2 = pmap_l1_to_l2(l1, sva);
7729 		oldl2 = pmap_load(l2);
7730 		if (oldl2 == 0)
7731 			continue;
7732 		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
7733 			if ((oldl2 & ATTR_SW_MANAGED) == 0)
7734 				continue;
7735 			lock = NULL;
7736 			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
7737 				if (lock != NULL)
7738 					rw_wunlock(lock);
7739 
7740 				/*
7741 				 * The 2MB page mapping was destroyed.
7742 				 */
7743 				continue;
7744 			}
7745 
7746 			/*
7747 			 * Unless the page mappings are wired, remove the
7748 			 * mapping to a single page so that a subsequent
7749 			 * access may repromote.  Choosing the last page
7750 			 * within the address range [sva, min(va_next, eva))
7751 			 * generally results in more repromotions.  Since the
7752 			 * underlying page table page is fully populated, this
7753 			 * removal never frees a page table page.
7754 			 */
7755 			if ((oldl2 & ATTR_SW_WIRED) == 0) {
7756 				va = eva;
7757 				if (va > va_next)
7758 					va = va_next;
7759 				va -= PAGE_SIZE;
7760 				KASSERT(va >= sva,
7761 				    ("pmap_advise: no address gap"));
7762 				l3 = pmap_l2_to_l3(l2, va);
7763 				KASSERT(pmap_load(l3) != 0,
7764 				    ("pmap_advise: invalid PTE"));
7765 				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
7766 				    NULL, &lock);
7767 			}
7768 			if (lock != NULL)
7769 				rw_wunlock(lock);
7770 		}
7771 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
7772 		    ("pmap_advise: invalid L2 entry after demotion"));
7773 		if (va_next > eva)
7774 			va_next = eva;
7775 		va = va_next;
7776 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
7777 		    sva += L3_SIZE) {
7778 			oldl3 = pmap_load(l3);
7779 			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
7780 			    (ATTR_SW_MANAGED | L3_PAGE))
7781 				goto maybe_invlrng;
7782 			else if (pmap_pte_dirty(pmap, oldl3)) {
7783 				if (advice == MADV_DONTNEED) {
7784 					/*
7785 					 * Future calls to pmap_is_modified()
7786 					 * can be avoided by making the page
7787 					 * dirty now.
7788 					 */
7789 					m = PTE_TO_VM_PAGE(oldl3);
7790 					vm_page_dirty(m);
7791 				}
7792 				if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
7793 					/*
7794 					 * Unconditionally demote the L3C
7795 					 * superpage because we do not allow
7796 					 * writeable, clean superpages.
7797 					 */
7798 					(void)pmap_demote_l3c(pmap, l3, sva);
7799 
7800 					/*
7801                                          * Destroy the final mapping before the
7802                                          * next L3C boundary or va_next,
7803 					 * whichever comes first, so that a
7804 					 * subsequent access may act as a
7805 					 * repromotion trigger.
7806 					 */
7807                                         if ((oldl3 & ATTR_SW_WIRED) == 0) {
7808 						dva = MIN((sva & ~L3C_OFFSET) +
7809 						    L3C_SIZE - PAGE_SIZE,
7810 						    va_next - PAGE_SIZE);
7811 						dl3 = pmap_l2_to_l3(l2, dva);
7812 						KASSERT(pmap_load(dl3) != 0,
7813 						    ("pmap_advise: invalid PTE"));
7814 						lock = NULL;
7815 						pmap_remove_l3(pmap, dl3, dva,
7816 						    pmap_load(l2), NULL, &lock);
7817 						if (lock != NULL)
7818 							rw_wunlock(lock);
7819 					}
7820 
7821 					/*
7822 					 * The L3 entry's accessed bit may have
7823 					 * changed.
7824 					 */
7825 					oldl3 = pmap_load(l3);
7826 				}
7827 
7828 				/*
7829 				 * Check that we did not just destroy this entry so
7830 				 * we avoid corrupting the page able.
7831 				 */
7832 				if (oldl3 != 0) {
7833 					while (!atomic_fcmpset_long(l3, &oldl3,
7834 					    (oldl3 & ~ATTR_AF) |
7835 					    ATTR_S1_AP(ATTR_S1_AP_RO)))
7836 						cpu_spinwait();
7837 				}
7838 			} else if ((oldl3 & ATTR_AF) != 0) {
7839 				/*
7840 				 * Clear the accessed bit in this L3 entry
7841 				 * regardless of the contiguous bit.
7842 				 */
7843 				pmap_clear_bits(l3, ATTR_AF);
7844 			} else
7845 				goto maybe_invlrng;
7846 			if (va == va_next)
7847 				va = sva;
7848 			continue;
7849 maybe_invlrng:
7850 			if (va != va_next) {
7851 				pmap_s1_invalidate_range(pmap, va, sva, true);
7852 				va = va_next;
7853 			}
7854 		}
7855 		if (va != va_next)
7856 			pmap_s1_invalidate_range(pmap, va, sva, true);
7857 	}
7858 	PMAP_UNLOCK(pmap);
7859 }
7860 
7861 /*
7862  *	Clear the modify bits on the specified physical page.
7863  */
7864 void
7865 pmap_clear_modify(vm_page_t m)
7866 {
7867 	struct md_page *pvh;
7868 	struct rwlock *lock;
7869 	pmap_t pmap;
7870 	pv_entry_t next_pv, pv;
7871 	pd_entry_t *l2, oldl2;
7872 	pt_entry_t *l3, oldl3;
7873 	vm_offset_t va;
7874 	int md_gen, pvh_gen;
7875 
7876 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7877 	    ("pmap_clear_modify: page %p is not managed", m));
7878 	vm_page_assert_busied(m);
7879 
7880 	if (!pmap_page_is_write_mapped(m))
7881 		return;
7882 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7883 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7884 	rw_wlock(lock);
7885 restart:
7886 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7887 		pmap = PV_PMAP(pv);
7888 		PMAP_ASSERT_STAGE1(pmap);
7889 		if (!PMAP_TRYLOCK(pmap)) {
7890 			pvh_gen = pvh->pv_gen;
7891 			rw_wunlock(lock);
7892 			PMAP_LOCK(pmap);
7893 			rw_wlock(lock);
7894 			if (pvh_gen != pvh->pv_gen) {
7895 				PMAP_UNLOCK(pmap);
7896 				goto restart;
7897 			}
7898 		}
7899 		va = pv->pv_va;
7900 		l2 = pmap_l2(pmap, va);
7901 		oldl2 = pmap_load(l2);
7902 		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7903 		if ((oldl2 & ATTR_SW_DBM) != 0 &&
7904 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7905 		    (oldl2 & ATTR_SW_WIRED) == 0) {
7906 			/*
7907 			 * Write protect the mapping to a single page so that
7908 			 * a subsequent write access may repromote.
7909 			 */
7910 			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7911 			l3 = pmap_l2_to_l3(l2, va);
7912 			oldl3 = pmap_load(l3);
7913 			while (!atomic_fcmpset_long(l3, &oldl3,
7914 			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
7915 				cpu_spinwait();
7916 			vm_page_dirty(m);
7917 			pmap_s1_invalidate_page(pmap, va, true);
7918 		}
7919 		PMAP_UNLOCK(pmap);
7920 	}
7921 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7922 		pmap = PV_PMAP(pv);
7923 		PMAP_ASSERT_STAGE1(pmap);
7924 		if (!PMAP_TRYLOCK(pmap)) {
7925 			md_gen = m->md.pv_gen;
7926 			pvh_gen = pvh->pv_gen;
7927 			rw_wunlock(lock);
7928 			PMAP_LOCK(pmap);
7929 			rw_wlock(lock);
7930 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7931 				PMAP_UNLOCK(pmap);
7932 				goto restart;
7933 			}
7934 		}
7935 		l2 = pmap_l2(pmap, pv->pv_va);
7936 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
7937 		oldl3 = pmap_load(l3);
7938 		KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
7939 		    (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
7940 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
7941 		    ("writeable L3C superpage not dirty"));
7942 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
7943 			if ((oldl3 & ATTR_CONTIGUOUS) != 0)
7944 				(void)pmap_demote_l3c(pmap, l3, pv->pv_va);
7945 			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
7946 			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
7947 		}
7948 		PMAP_UNLOCK(pmap);
7949 	}
7950 	rw_wunlock(lock);
7951 }
7952 
/*
 * Map a range of physical addresses [pa, pa + size) for firmware/BIOS use
 * and return a kernel virtual address for it.  Uses the direct map when the
 * range is covered by it; before VM initialization, falls back to mapping
 * whole 2MB L2 blocks from a reserved preinit VA region; afterwards, uses
 * kva_alloc() plus pmap_kenter().
 */
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t va, offset;
	pd_entry_t old_l2e, *pde;
	pt_entry_t *l2;
	int i, lvl, l2_blocks, free_l2_count, start_idx;

	/* Use the DMAP region if we can */
	if (PHYS_IN_DMAP(pa) && PHYS_IN_DMAP(pa + size - 1) &&
	    pmap_kmapped_range(PHYS_TO_DMAP(pa), size))
		return ((void *)PHYS_TO_DMAP(pa));

	if (!vm_initialized) {
		/*
		 * No L3 ptables so map entire L2 blocks where start VA is:
		 * 	preinit_map_va + start_idx * L2_SIZE
		 * There may be duplicate mappings (multiple VA -> same PA) but
		 * ARM64 dcache is always PIPT so that's acceptable.
		 */
		 if (size == 0)
			 return (NULL);

		 /* Calculate how many L2 blocks are needed for the mapping */
		l2_blocks = (roundup2(pa + size, L2_SIZE) -
		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;

		offset = pa & L2_OFFSET;

		if (preinit_map_va == 0)
			return (NULL);

		/* Map 2MiB L2 blocks from reserved VA space */

		free_l2_count = 0;
		start_idx = -1;
		/* Find enough free contiguous VA space */
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (free_l2_count > 0 && ppim->pa != 0) {
				/* Not enough space here */
				free_l2_count = 0;
				start_idx = -1;
				continue;
			}

			if (ppim->pa == 0) {
				/* Free L2 block */
				if (start_idx == -1)
					start_idx = i;
				free_l2_count++;
				if (free_l2_count == l2_blocks)
					break;
			}
		}
		if (free_l2_count != l2_blocks)
			panic("%s: too many preinit mappings", __func__);

		va = preinit_map_va + (start_idx * L2_SIZE);
		for (i = start_idx; i < start_idx + l2_blocks; i++) {
			/* Mark entries as allocated */
			ppim = pmap_preinit_mapping + i;
			ppim->pa = pa;
			ppim->va = va + offset;
			ppim->size = size;
		}

		/* Map L2 blocks */
		pa = rounddown2(pa, L2_SIZE);
		old_l2e = 0;
		for (i = 0; i < l2_blocks; i++) {
			pde = pmap_pde(kernel_pmap, va, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
			    va));
			KASSERT(lvl == 1,
			    ("pmap_mapbios: Invalid level %d", lvl));

			/* Insert L2_BLOCK */
			l2 = pmap_l1_to_l2(pde, va);
			old_l2e |= pmap_load_store(l2,
			    PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
			    ATTR_S1_XN | ATTR_KERN_GP |
			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);

			va += L2_SIZE;
			pa += L2_SIZE;
		}
		/* If any replaced entry was valid, flush stale TLB entries. */
		if ((old_l2e & ATTR_DESCR_VALID) != 0)
			pmap_s1_invalidate_all_kernel();
		else {
			/*
			 * Because the old entries were invalid and the new
			 * mappings are not executable, an isb is not required.
			 */
			dsb(ishst);
		}

		va = preinit_map_va + (start_idx * L2_SIZE);

	} else {
		/* kva_alloc may be used to map the pages */
		offset = pa & PAGE_MASK;
		size = round_page(offset + size);

		va = kva_alloc(size);
		if (va == 0)
			panic("%s: Couldn't allocate KVA", __func__);

		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));

		/* L3 table is linked */
		va = trunc_page(va);
		pa = trunc_page(pa);
		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
	}

	return ((void *)(va + offset));
}
8074 
/*
 * Undo a pmap_mapbios() mapping of "size" bytes starting at "p".  DMAP
 * addresses only have their attributes reset; preinit L2-block mappings are
 * cleared and the kernel TLB flushed; kva_alloc'ed mappings are kremoved
 * and the KVA freed.
 */
void
pmap_unmapbios(void *p, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t offset, va, va_trunc;
	pd_entry_t *pde;
	pt_entry_t *l2;
	int error __diagused, i, lvl, l2_blocks, block;
	bool preinit_map;

	va = (vm_offset_t)p;
	if (VIRT_IN_DMAP(va)) {
		KASSERT(VIRT_IN_DMAP(va + size - 1),
		    ("%s: End address not in DMAP region: %lx", __func__,
		    va + size - 1));
		/* Ensure the attributes are as expected for the DMAP region */
		PMAP_LOCK(kernel_pmap);
		error = pmap_change_props_locked(va, size,
		    PROT_READ | PROT_WRITE, VM_MEMATTR_DEFAULT, false);
		PMAP_UNLOCK(kernel_pmap);
		KASSERT(error == 0, ("%s: Failed to reset DMAP attributes: %d",
		    __func__, error));

		return;
	}

	l2_blocks =
	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));

	/* Remove preinit mapping */
	preinit_map = false;
	block = 0;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == va) {
			KASSERT(ppim->size == size,
			    ("pmap_unmapbios: size mismatch"));
			/* Release this slot back to the preinit pool. */
			ppim->va = 0;
			ppim->pa = 0;
			ppim->size = 0;
			preinit_map = true;
			offset = block * L2_SIZE;
			va_trunc = rounddown2(va, L2_SIZE) + offset;

			/* Remove L2_BLOCK */
			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
			    va_trunc));
			l2 = pmap_l1_to_l2(pde, va_trunc);
			pmap_clear(l2);

			if (block == (l2_blocks - 1))
				break;
			block++;
		}
	}
	if (preinit_map) {
		/* Flush the now-stale kernel TLB entries. */
		pmap_s1_invalidate_all_kernel();
		return;
	}

	/* Unmap the pages reserved with kva_alloc. */
	if (vm_initialized) {
		offset = va & PAGE_MASK;
		size = round_page(offset + size);
		va = trunc_page(va);

		/* Unmap and invalidate the pages */
		pmap_kremove_device(va, size);

		kva_free(va, size);
	}
}
8150 
8151 /*
8152  * Sets the memory attribute for the specified page.
8153  */
8154 void
8155 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
8156 {
8157 	if (m->md.pv_memattr == ma)
8158 		return;
8159 
8160 	m->md.pv_memattr = ma;
8161 
8162 	/*
8163 	 * If "m" is a normal page, update its direct mapping.  This update
8164 	 * can be relied upon to perform any cache operations that are
8165 	 * required for data coherence.
8166 	 */
8167 	if ((m->flags & PG_FICTITIOUS) == 0 &&
8168 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
8169 	    m->md.pv_memattr) != 0)
8170 		panic("memory attribute change on the direct map failed");
8171 }
8172 
8173 /*
8174  * Changes the specified virtual address range's memory type to that given by
8175  * the parameter "mode".  The specified virtual address range must be
8176  * completely contained within either the direct map or the kernel map.  If
8177  * the virtual address range is contained within the kernel map, then the
8178  * memory type for each of the corresponding ranges of the direct map is also
8179  * changed.  (The corresponding ranges of the direct map are those ranges that
8180  * map the same physical pages as the specified virtual address range.)  These
8181  * changes to the direct map are necessary because Intel describes the
8182  * behavior of their processors as "undefined" if two or more mappings to the
8183  * same physical page have different memory types.
8184  *
8185  * Returns zero if the change completed successfully, and either EINVAL or
8186  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
8187  * of the virtual address range was not mapped, and ENOMEM is returned if
8188  * there was insufficient memory available to complete the change.  In the
8189  * latter case, the memory type may have been changed on some part of the
8190  * virtual address range or the direct map.
8191  */
8192 int
8193 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
8194 {
8195 	int error;
8196 
8197 	PMAP_LOCK(kernel_pmap);
8198 	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
8199 	PMAP_UNLOCK(kernel_pmap);
8200 	return (error);
8201 }
8202 
8203 /*
8204  * Changes the specified virtual address range's protections to those
8205  * specified by "prot".  Like pmap_change_attr(), protections for aliases
8206  * in the direct map are updated as well.  Protections on aliasing mappings may
8207  * be a subset of the requested protections; for example, mappings in the direct
8208  * map are never executable.
8209  */
8210 int
8211 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
8212 {
8213 	int error;
8214 
8215 	/* Only supported within the kernel map. */
8216 	if (va < VM_MIN_KERNEL_ADDRESS)
8217 		return (EINVAL);
8218 
8219 	PMAP_LOCK(kernel_pmap);
8220 	error = pmap_change_props_locked(va, size, prot, -1, false);
8221 	PMAP_UNLOCK(kernel_pmap);
8222 	return (error);
8223 }
8224 
/*
 * Change the memory attributes ("mode") and/or protections ("prot") of the
 * kernel mappings covering [va, va + size).  A mode of -1 leaves the memory
 * attributes unchanged; PROT_NONE leaves the protections unchanged.  When
 * "skip_unmapped" is true, unmapped subranges are skipped instead of
 * returning EINVAL.  Superpage mappings are demoted as needed so that only
 * the requested range is affected, and the function recurses once per
 * updated entry to keep any DMAP alias in sync.  Returns 0 on success or
 * EINVAL on failure.
 */
static int
pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
    int mode, bool skip_unmapped)
{
	vm_offset_t base, offset, tmpva;
	vm_size_t pte_size;
	vm_paddr_t pa;
	pt_entry_t pte, *ptep, *newpte;
	pt_entry_t bits, mask;
	int lvl, rv;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	/* The range must lie wholly in the DMAP or the kernel map. */
	if (!VIRT_IN_DMAP(base) &&
	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
		return (EINVAL);

	/*
	 * Build the set of PTE bits to change ("mask") and their new
	 * values ("bits") from the requested mode and protections.
	 */
	bits = 0;
	mask = 0;
	if (mode != -1) {
		bits = ATTR_S1_IDX(mode);
		mask = ATTR_S1_IDX_MASK;
		if (mode == VM_MEMATTR_DEVICE) {
			mask |= ATTR_S1_XN;
			bits |= ATTR_S1_XN;
		}
	}
	if (prot != VM_PROT_NONE) {
		/* Don't mark the DMAP as executable. It never is on arm64. */
		if (VIRT_IN_DMAP(base)) {
			prot &= ~VM_PROT_EXECUTE;
			/*
			 * XXX Mark the DMAP as writable for now. We rely
			 * on this in ddb & dtrace to insert breakpoint
			 * instructions.
			 */
			prot |= VM_PROT_WRITE;
		}

		if ((prot & VM_PROT_WRITE) == 0) {
			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
		}
		if ((prot & VM_PROT_EXECUTE) == 0) {
			bits |= ATTR_S1_PXN;
		}
		bits |= ATTR_S1_UXN;
		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
	}

	for (tmpva = base; tmpva < base + size; ) {
		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
		if (ptep == NULL && !skip_unmapped) {
			return (EINVAL);
		} else if ((ptep == NULL && skip_unmapped) ||
		    (pmap_load(ptep) & mask) == bits) {
			/*
			 * We already have the correct attribute or there
			 * is no memory mapped at this address and we are
			 * skipping unmapped memory.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
				break;
			case 2:
				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
				break;
			case 3:
				tmpva += PAGE_SIZE;
				break;
			}
		} else {
			/* We can't demote/promote this entry */
			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);

			/*
			 * Find the entry and demote it if the requested change
			 * only applies to part of the address range mapped by
			 * the entry.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
				if ((tmpva & L1_OFFSET) == 0 &&
				    (base + size - tmpva) >= L1_SIZE) {
					pte_size = L1_SIZE;
					break;
				}
				newpte = pmap_demote_l1(kernel_pmap, ptep,
				    tmpva & ~L1_OFFSET);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l1_to_l2(ptep, tmpva);
				/* FALLTHROUGH */
			case 2:
				/* L2C (contiguous L2) superpage handling. */
				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
					if ((tmpva & L2C_OFFSET) == 0 &&
					    (base + size - tmpva) >= L2C_SIZE) {
						pte_size = L2C_SIZE;
						break;
					}
					if (!pmap_demote_l2c(kernel_pmap, ptep,
					    tmpva))
						return (EINVAL);
				}
				if ((tmpva & L2_OFFSET) == 0 &&
				    (base + size - tmpva) >= L2_SIZE) {
					pte_size = L2_SIZE;
					break;
				}
				newpte = pmap_demote_l2(kernel_pmap, ptep,
				    tmpva);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l2_to_l3(ptep, tmpva);
				/* FALLTHROUGH */
			case 3:
				/* L3C (contiguous L3) superpage handling. */
				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
					if ((tmpva & L3C_OFFSET) == 0 &&
					    (base + size - tmpva) >= L3C_SIZE) {
						pte_size = L3C_SIZE;
						break;
					}
					if (!pmap_demote_l3c(kernel_pmap, ptep,
					    tmpva))
						return (EINVAL);
				}
				pte_size = PAGE_SIZE;
				break;
			}

			/* Update the entry */
			pte = pmap_load(ptep);
			pte &= ~mask;
			pte |= bits;

			switch (pte_size) {
			case L2C_SIZE:
				pmap_update_strided(kernel_pmap, ptep, ptep +
				    L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
				break;
			case L3C_SIZE:
				pmap_update_strided(kernel_pmap, ptep, ptep +
				    L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
				break;
			default:
				/*
				 * We are updating a single block or page entry,
				 * so regardless of pte_size pass PAGE_SIZE in
				 * order that a single TLB invalidation is
				 * performed.
				 */
				pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
				    PAGE_SIZE);
				break;
			}

			pa = PTE_TO_PHYS(pte);
			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
				/*
				 * Keep the DMAP memory in sync.
				 */
				rv = pmap_change_props_locked(
				    PHYS_TO_DMAP(pa), pte_size,
				    prot, mode, true);
				if (rv != 0)
					return (rv);
			}

			/*
			 * If moving to a non-cacheable entry flush
			 * the cache.
			 */
			if (mode == VM_MEMATTR_UNCACHEABLE)
				cpu_dcache_wbinv_range((void *)tmpva, pte_size);
			tmpva += pte_size;
		}
	}

	return (0);
}
8413 
8414 /*
8415  * Create an L2 table to map all addresses within an L1 mapping.
8416  */
8417 static pt_entry_t *
8418 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
8419 {
8420 	pt_entry_t *l2, newl2, oldl1;
8421 	vm_offset_t tmpl1;
8422 	vm_paddr_t l2phys, phys;
8423 	vm_page_t ml2;
8424 	int i;
8425 
8426 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8427 	oldl1 = pmap_load(l1);
8428 	PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8429 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
8430 	    ("pmap_demote_l1: Demoting a non-block entry"));
8431 	KASSERT((va & L1_OFFSET) == 0,
8432 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
8433 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
8434 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
8435 	KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
8436 	    ("pmap_demote_l1: Demoting entry with no-demote flag set"));
8437 
8438 	tmpl1 = 0;
8439 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
8440 		tmpl1 = kva_alloc(PAGE_SIZE);
8441 		if (tmpl1 == 0)
8442 			return (NULL);
8443 	}
8444 
8445 	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
8446 	    NULL) {
8447 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
8448 		    " in pmap %p", va, pmap);
8449 		l2 = NULL;
8450 		goto fail;
8451 	}
8452 
8453 	l2phys = VM_PAGE_TO_PHYS(ml2);
8454 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
8455 
8456 	/* Address the range points at */
8457 	phys = PTE_TO_PHYS(oldl1);
8458 	/* The attributed from the old l1 table to be copied */
8459 	newl2 = oldl1 & ATTR_MASK;
8460 
8461 	/* Create the new entries */
8462 	newl2 |= ATTR_CONTIGUOUS;
8463 	for (i = 0; i < Ln_ENTRIES; i++) {
8464 		l2[i] = newl2 | phys;
8465 		phys += L2_SIZE;
8466 	}
8467 	KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) |
8468 	    L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0],
8469 	    ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
8470 
8471 	if (tmpl1 != 0) {
8472 		pmap_kenter(tmpl1, PAGE_SIZE,
8473 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
8474 		    VM_MEMATTR_WRITE_BACK);
8475 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
8476 	}
8477 
8478 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
8479 
8480 	counter_u64_add(pmap_l1_demotions, 1);
8481 fail:
8482 	if (tmpl1 != 0) {
8483 		pmap_kremove(tmpl1);
8484 		kva_free(tmpl1, PAGE_SIZE);
8485 	}
8486 
8487 	return (l2);
8488 }
8489 
8490 static void
8491 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8492 {
8493 	pt_entry_t *l3;
8494 
8495 	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8496 		*l3 = newl3;
8497 		newl3 += L3_SIZE;
8498 	}
8499 }
8500 
/*
 * Diagnostic check used by pmap_demote_l2_locked(): verify that every L3
 * entry in the new page table page maps the physical page expected from
 * "newl3e".  Compiled away unless INVARIANTS is defined; with DIAGNOSTIC
 * it additionally dumps the whole table before panicking.
 */
static void
pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
{
#ifdef INVARIANTS
#ifdef DIAGNOSTIC
	pt_entry_t *xl3p, *yl3p;

	for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
	    xl3p++, newl3e += PAGE_SIZE) {
		if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
			printf("pmap_demote_l2: xl3e %zd and newl3e map "
			    "different pages: found %#lx, expected %#lx\n",
			    xl3p - firstl3p, pmap_load(xl3p), newl3e);
			printf("page table dump\n");
			for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
			    yl3p++) {
				printf("%zd %#lx\n", yl3p - firstl3p,
				    pmap_load(yl3p));
			}
			panic("firstpte");
		}
	}
#else
	/* Without DIAGNOSTIC, only the first entry is checked. */
	KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
	    ("pmap_demote_l2: firstl3 and newl3e map different physical"
	    " addresses"));
#endif
#endif
}
8530 
/*
 * Abort an L2 demotion by removing the 2MB mapping entirely, then freeing
 * any pages released by the removal.  Used when the demotion cannot proceed
 * (e.g. page table page allocation failure).
 */
static void
pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
    struct rwlock **lockp)
{
	struct spglist free;

	SLIST_INIT(&free);
	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true,
	    &free, lockp);
	vm_page_free_pages_toq(&free, true);
}
8542 
8543 /*
8544  * Create an L3 table to map all addresses within an L2 mapping.
8545  */
8546 static pt_entry_t *
8547 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8548     struct rwlock **lockp)
8549 {
8550 	pt_entry_t *l3, newl3, oldl2;
8551 	vm_offset_t tmpl2;
8552 	vm_paddr_t l3phys;
8553 	vm_page_t ml3;
8554 
8555 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8556 	PMAP_ASSERT_STAGE1(pmap);
8557 	KASSERT(ADDR_IS_CANONICAL(va),
8558 	    ("%s: Address not in canonical form: %lx", __func__, va));
8559 
8560 	l3 = NULL;
8561 	oldl2 = pmap_load(l2);
8562 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8563 	    ("pmap_demote_l2: Demoting a non-block entry"));
8564 	KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8565 	    ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8566 	va &= ~L2_OFFSET;
8567 
8568 	tmpl2 = 0;
8569 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8570 		tmpl2 = kva_alloc(PAGE_SIZE);
8571 		if (tmpl2 == 0)
8572 			return (NULL);
8573 	}
8574 
8575 	/*
8576 	 * Invalidate the 2MB page mapping and return "failure" if the
8577 	 * mapping was never accessed and not wired.
8578 	 */
8579 	if ((oldl2 & ATTR_AF) == 0) {
8580 		if ((oldl2 & ATTR_SW_WIRED) == 0) {
8581 			pmap_demote_l2_abort(pmap, va, l2, lockp);
8582 			CTR2(KTR_PMAP,
8583 			    "pmap_demote_l2: failure for va %#lx in pmap %p",
8584 			    va, pmap);
8585 			goto fail;
8586 		}
8587 		ml3 = pmap_remove_pt_page(pmap, va);
8588 		/* Fill the PTP with L3Es that have ATTR_AF cleared. */
8589 		ml3->valid = 0;
8590 	} else if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8591 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8592 		    ("pmap_demote_l2: page table page for a wired mapping"
8593 		    " is missing"));
8594 
8595 		/*
8596 		 * If the page table page is missing and the mapping
8597 		 * is for a kernel address, the mapping must belong to
8598 		 * either the direct map or the early kernel memory.
8599 		 * Page table pages are preallocated for every other
8600 		 * part of the kernel address space, so the direct map
8601 		 * region and early kernel memory are the only parts of the
8602 		 * kernel address space that must be handled here.
8603 		 */
8604 		KASSERT(ADDR_IS_USER(va) || VIRT_IN_DMAP(va) ||
8605 		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8606 		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
8607 
8608 		/*
8609 		 * If the 2MB page mapping belongs to the direct map
8610 		 * region of the kernel's address space, then the page
8611 		 * allocation request specifies the highest possible
8612 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
8613 		 * priority is normal.
8614 		 */
8615 		ml3 = vm_page_alloc_noobj(
8616 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8617 		    VM_ALLOC_WIRED);
8618 
8619 		/*
8620 		 * If the allocation of the new page table page fails,
8621 		 * invalidate the 2MB page mapping and return "failure".
8622 		 */
8623 		if (ml3 == NULL) {
8624 			pmap_demote_l2_abort(pmap, va, l2, lockp);
8625 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8626 			    " in pmap %p", va, pmap);
8627 			goto fail;
8628 		}
8629 		ml3->pindex = pmap_l2_pindex(va);
8630 
8631 		if (ADDR_IS_USER(va)) {
8632 			ml3->ref_count = NL3PG;
8633 			pmap_resident_count_inc(pmap, 1);
8634 		}
8635 	}
8636 	l3phys = VM_PAGE_TO_PHYS(ml3);
8637 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
8638 	newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8639 	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8640 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8641 	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8642 
8643 	/*
8644 	 * If the PTP is not leftover from an earlier promotion or it does not
8645 	 * have ATTR_AF set in every L3E, then fill it.  The new L3Es will all
8646 	 * have ATTR_AF set, unless this is a wired mapping with ATTR_AF clear.
8647 	 *
8648 	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8649 	 * performs a dsb().  That dsb() ensures that the stores for filling
8650 	 * "l3" are visible before "l3" is added to the page table.
8651 	 */
8652 	if (!vm_page_all_valid(ml3))
8653 		pmap_fill_l3(l3, newl3);
8654 
8655 	pmap_demote_l2_check(l3, newl3);
8656 
8657 	/*
8658 	 * If the mapping has changed attributes, update the L3Es.
8659 	 */
8660 	if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8661 		pmap_fill_l3(l3, newl3);
8662 
8663 	/*
8664 	 * Map the temporary page so we don't lose access to the l2 table.
8665 	 */
8666 	if (tmpl2 != 0) {
8667 		pmap_kenter(tmpl2, PAGE_SIZE,
8668 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
8669 		    VM_MEMATTR_WRITE_BACK);
8670 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8671 	}
8672 
8673 	/*
8674 	 * The spare PV entries must be reserved prior to demoting the
8675 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
8676 	 * of the L2 and the PV lists will be inconsistent, which can result
8677 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8678 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8679 	 * PV entry for the 2MB page mapping that is being demoted.
8680 	 */
8681 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
8682 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8683 
8684 	/*
8685 	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8686 	 * the 2MB page mapping.
8687 	 */
8688 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8689 
8690 	/*
8691 	 * Demote the PV entry.
8692 	 */
8693 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
8694 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8695 
8696 	counter_u64_add(pmap_l2_demotions, 1);
8697 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8698 	    " in pmap %p %lx", va, pmap, l3[0]);
8699 
8700 fail:
8701 	if (tmpl2 != 0) {
8702 		pmap_kremove(tmpl2);
8703 		kva_free(tmpl2, PAGE_SIZE);
8704 	}
8705 
8706 	return (l3);
8707 
8708 }
8709 
8710 static pt_entry_t *
8711 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8712 {
8713 	struct rwlock *lock;
8714 	pt_entry_t *l3;
8715 
8716 	lock = NULL;
8717 	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8718 	if (lock != NULL)
8719 		rw_wunlock(lock);
8720 	return (l3);
8721 }
8722 
8723 /*
8724  * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
8725  */
8726 static bool
8727 pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
8728 {
8729 	pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
8730 	vm_offset_t tmpl3;
8731 	register_t intr;
8732 
8733 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8734 	PMAP_ASSERT_STAGE1(pmap);
8735 	l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
8736 	    sizeof(pd_entry_t)) - 1));
8737 	l2c_end = l2c_start + L2C_ENTRIES;
8738 	tmpl3 = 0;
8739 	if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
8740 	    (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
8741 		tmpl3 = kva_alloc(PAGE_SIZE);
8742 		if (tmpl3 == 0)
8743 			return (false);
8744 		pmap_kenter(tmpl3, PAGE_SIZE,
8745 		    DMAP_TO_PHYS((vm_offset_t)l2c_start) & ~L3_OFFSET,
8746 		    VM_MEMATTR_WRITE_BACK);
8747 		l2c_start = (pd_entry_t *)(tmpl3 +
8748 		    ((vm_offset_t)l2c_start & PAGE_MASK));
8749 		l2c_end = (pd_entry_t *)(tmpl3 +
8750 		    ((vm_offset_t)l2c_end & PAGE_MASK));
8751 	}
8752 	mask = 0;
8753 	nbits = ATTR_DESCR_VALID;
8754 	intr = intr_disable();
8755 
8756 	/*
8757 	 * Break the mappings.
8758 	 */
8759 	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8760 		/*
8761 		 * Clear the mapping's contiguous and valid bits, but leave
8762 		 * the rest of the entry unchanged, so that a lockless,
8763 		 * concurrent pmap_kextract() can still lookup the physical
8764 		 * address.
8765 		 */
8766 		l2e = pmap_load(tl2p);
8767 		KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
8768 		    ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
8769 		KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8770 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8771 		    ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
8772 		while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
8773 		    ATTR_DESCR_VALID)))
8774 			cpu_spinwait();
8775 
8776 		/*
8777 		 * Hardware accessed and dirty bit maintenance might only
8778 		 * update a single L2 entry, so we must combine the accessed
8779 		 * and dirty bits from this entire set of contiguous L2
8780 		 * entries.
8781 		 */
8782 		if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8783 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8784 			mask = ATTR_S1_AP_RW_BIT;
8785 		nbits |= l2e & ATTR_AF;
8786 	}
8787 	if ((nbits & ATTR_AF) != 0) {
8788 		pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
8789 		    L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
8790 	}
8791 
8792 	/*
8793 	 * Remake the mappings, updating the accessed and dirty bits.
8794 	 */
8795 	l2e = (pmap_load(l2c_start) & ~mask) | nbits;
8796 	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8797 		pmap_store(tl2p, l2e);
8798 		l2e += L2_SIZE;
8799 	}
8800 	dsb(ishst);
8801 
8802 	intr_restore(intr);
8803 	if (tmpl3 != 0) {
8804 		pmap_kremove(tmpl3);
8805 		kva_free(tmpl3, PAGE_SIZE);
8806 	}
8807 	counter_u64_add(pmap_l2c_demotions, 1);
8808 	CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
8809 	    va, pmap);
8810 	return (true);
8811 }
8812 
8813 /*
8814  * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8815  */
8816 static bool
8817 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8818 {
8819 	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8820 	vm_offset_t tmpl3;
8821 	register_t intr;
8822 
8823 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8824 	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8825 	    sizeof(pt_entry_t)) - 1));
8826 	l3c_end = l3c_start + L3C_ENTRIES;
8827 	tmpl3 = 0;
8828 	if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8829 	    (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8830 		tmpl3 = kva_alloc(PAGE_SIZE);
8831 		if (tmpl3 == 0)
8832 			return (false);
8833 		pmap_kenter(tmpl3, PAGE_SIZE,
8834 		    DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
8835 		    VM_MEMATTR_WRITE_BACK);
8836 		l3c_start = (pt_entry_t *)(tmpl3 +
8837 		    ((vm_offset_t)l3c_start & PAGE_MASK));
8838 		l3c_end = (pt_entry_t *)(tmpl3 +
8839 		    ((vm_offset_t)l3c_end & PAGE_MASK));
8840 	}
8841 	mask = 0;
8842 	nbits = ATTR_DESCR_VALID;
8843 	intr = intr_disable();
8844 
8845 	/*
8846 	 * Break the mappings.
8847 	 */
8848 	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8849 		/*
8850 		 * Clear the mapping's contiguous and valid bits, but leave
8851 		 * the rest of the entry unchanged, so that a lockless,
8852 		 * concurrent pmap_kextract() can still lookup the physical
8853 		 * address.
8854 		 */
8855 		l3e = pmap_load(tl3p);
8856 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8857 		    ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
8858 		KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8859 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8860 		    ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
8861 		while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
8862 		    ATTR_DESCR_VALID)))
8863 			cpu_spinwait();
8864 
8865 		/*
8866 		 * Hardware accessed and dirty bit maintenance might only
8867 		 * update a single L3 entry, so we must combine the accessed
8868 		 * and dirty bits from this entire set of contiguous L3
8869 		 * entries.
8870 		 */
8871 		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8872 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8873 			mask = ATTR_S1_AP_RW_BIT;
8874 		nbits |= l3e & ATTR_AF;
8875 	}
8876 	if ((nbits & ATTR_AF) != 0) {
8877 		pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
8878 		    ~L3C_OFFSET, true);
8879 	}
8880 
8881 	/*
8882 	 * Remake the mappings, updating the accessed and dirty bits.
8883 	 */
8884 	l3e = (pmap_load(l3c_start) & ~mask) | nbits;
8885 	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8886 		pmap_store(tl3p, l3e);
8887 		l3e += L3_SIZE;
8888 	}
8889 	dsb(ishst);
8890 
8891 	intr_restore(intr);
8892 	if (tmpl3 != 0) {
8893 		pmap_kremove(tmpl3);
8894 		kva_free(tmpl3, PAGE_SIZE);
8895 	}
8896 	counter_u64_add(pmap_l3c_demotions, 1);
8897 	CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
8898 	    va, pmap);
8899 	return (true);
8900 }
8901 
8902 /*
8903  * Accumulate the accessed and dirty bits within a L3C superpage and
8904  * return the specified PTE with them applied correctly.
8905  */
8906 static pt_entry_t
8907 pmap_load_l3c(pt_entry_t *l3p)
8908 {
8909 	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8910 
8911 	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8912 	    sizeof(pt_entry_t)) - 1));
8913 	l3c_end = l3c_start + L3C_ENTRIES;
8914 	mask = 0;
8915 	nbits = 0;
8916 	/* Iterate over each mapping in the superpage. */
8917 	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8918 		l3e = pmap_load(tl3p);
8919 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8920 		    ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
8921 		/* Update mask if the current page has its dirty bit set. */
8922 		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8923 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8924 			mask = ATTR_S1_AP_RW_BIT;
8925 		/* Update nbits if the accessed bit is set. */
8926 		nbits |= l3e & ATTR_AF;
8927 	}
8928 	return ((pmap_load(l3p) & ~mask) | nbits);
8929 }
8930 
8931 /*
8932  * Perform the pmap work for mincore(2).  If the page is not both referenced and
8933  * modified by this pmap, returns its physical address so that the caller can
8934  * find other mappings.
8935  */
8936 int
8937 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
8938 {
8939 	pt_entry_t *pte, tpte;
8940 	vm_paddr_t mask, pa;
8941 	int lvl, psind, val;
8942 	bool managed;
8943 
8944 	PMAP_ASSERT_STAGE1(pmap);
8945 	PMAP_LOCK(pmap);
8946 	pte = pmap_pte(pmap, addr, &lvl);
8947 	if (pte != NULL) {
8948 		tpte = pmap_load(pte);
8949 
8950 		switch (lvl) {
8951 		case 3:
8952 			mask = L3_OFFSET;
8953 			psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 1 : 0;
8954 			break;
8955 		case 2:
8956 			mask = L2_OFFSET;
8957 			psind = 2;
8958 			break;
8959 		case 1:
8960 			mask = L1_OFFSET;
8961 			psind = 3;
8962 			break;
8963 		default:
8964 			panic("pmap_mincore: invalid level %d", lvl);
8965 		}
8966 
8967 		managed = (tpte & ATTR_SW_MANAGED) != 0;
8968 		val = MINCORE_INCORE | MINCORE_PSIND(psind);
8969 		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
8970 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
8971 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8972 		if ((tpte & ATTR_AF) == ATTR_AF)
8973 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8974 
8975 		pa = PTE_TO_PHYS(tpte) | (addr & mask);
8976 	} else {
8977 		managed = false;
8978 		val = 0;
8979 	}
8980 
8981 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8982 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
8983 		*pap = pa;
8984 	}
8985 	PMAP_UNLOCK(pmap);
8986 	return (val);
8987 }
8988 
8989 /*
8990  * Garbage collect every ASID that is neither active on a processor nor
8991  * reserved.
8992  */
8993 static void
8994 pmap_reset_asid_set(pmap_t pmap)
8995 {
8996 	pmap_t curpmap;
8997 	int asid, cpuid, epoch;
8998 	struct asid_set *set;
8999 	enum pmap_stage stage;
9000 
9001 	set = pmap->pm_asid_set;
9002 	stage = pmap->pm_stage;
9003 
9004 	set = pmap->pm_asid_set;
9005 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
9006 	mtx_assert(&set->asid_set_mutex, MA_OWNED);
9007 
9008 	/*
9009 	 * Ensure that the store to asid_epoch is globally visible before the
9010 	 * loads from pc_curpmap are performed.
9011 	 */
9012 	epoch = set->asid_epoch + 1;
9013 	if (epoch == INT_MAX)
9014 		epoch = 0;
9015 	set->asid_epoch = epoch;
9016 	dsb(ishst);
9017 	if (stage == PM_STAGE1) {
9018 		__asm __volatile("tlbi vmalle1is");
9019 	} else {
9020 		KASSERT(pmap_clean_stage2_tlbi != NULL,
9021 		    ("%s: Unset stage 2 tlb invalidation callback\n",
9022 		    __func__));
9023 		pmap_clean_stage2_tlbi();
9024 	}
9025 	dsb(ish);
9026 	bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
9027 	    set->asid_set_size - 1);
9028 	CPU_FOREACH(cpuid) {
9029 		if (cpuid == curcpu)
9030 			continue;
9031 		if (stage == PM_STAGE1) {
9032 			curpmap = pcpu_find(cpuid)->pc_curpmap;
9033 			PMAP_ASSERT_STAGE1(pmap);
9034 		} else {
9035 			curpmap = pcpu_find(cpuid)->pc_curvmpmap;
9036 			if (curpmap == NULL)
9037 				continue;
9038 			PMAP_ASSERT_STAGE2(pmap);
9039 		}
9040 		KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
9041 		asid = COOKIE_TO_ASID(curpmap->pm_cookie);
9042 		if (asid == -1)
9043 			continue;
9044 		bit_set(set->asid_set, asid);
9045 		curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
9046 	}
9047 }
9048 
9049 /*
9050  * Allocate a new ASID for the specified pmap.
9051  */
9052 static void
9053 pmap_alloc_asid(pmap_t pmap)
9054 {
9055 	struct asid_set *set;
9056 	int new_asid;
9057 
9058 	set = pmap->pm_asid_set;
9059 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
9060 
9061 	mtx_lock_spin(&set->asid_set_mutex);
9062 
9063 	/*
9064 	 * While this processor was waiting to acquire the asid set mutex,
9065 	 * pmap_reset_asid_set() running on another processor might have
9066 	 * updated this pmap's cookie to the current epoch.  In which case, we
9067 	 * don't need to allocate a new ASID.
9068 	 */
9069 	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
9070 		goto out;
9071 
9072 	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
9073 	    &new_asid);
9074 	if (new_asid == -1) {
9075 		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
9076 		    set->asid_next, &new_asid);
9077 		if (new_asid == -1) {
9078 			pmap_reset_asid_set(pmap);
9079 			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
9080 			    set->asid_set_size, &new_asid);
9081 			KASSERT(new_asid != -1, ("ASID allocation failure"));
9082 		}
9083 	}
9084 	bit_set(set->asid_set, new_asid);
9085 	set->asid_next = new_asid + 1;
9086 	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
9087 out:
9088 	mtx_unlock_spin(&set->asid_set_mutex);
9089 }
9090 
9091 static uint64_t __read_mostly ttbr_flags;
9092 
9093 /*
9094  * Compute the value that should be stored in ttbr0 to activate the specified
9095  * pmap.  This value may change from time to time.
9096  */
9097 uint64_t
9098 pmap_to_ttbr0(pmap_t pmap)
9099 {
9100 	uint64_t ttbr;
9101 
9102 	ttbr = pmap->pm_ttbr;
9103 	ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
9104 	ttbr |= ttbr_flags;
9105 
9106 	return (ttbr);
9107 }
9108 
/*
 * smp_rendezvous() callback: set the Common not Private (CnP) flag in both
 * translation table base registers on every CPU, and record it in
 * ttbr_flags (once, from the initiating CPU) so that future values built by
 * pmap_to_ttbr0() carry it too.
 */
static void
pmap_set_cnp(void *arg)
{
	uint64_t ttbr0, ttbr1;
	u_int cpuid;

	cpuid = *(u_int *)arg;
	if (cpuid == curcpu) {
		/*
		 * Set the flags while all CPUs are handling the
		 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls
		 * to pmap_to_ttbr0 after this will have the CnP flag set.
		 * The dsb after invalidating the TLB will act as a barrier
		 * to ensure all CPUs can observe this change.
		 */
		ttbr_flags |= TTBR_CnP;
	}

	ttbr0 = READ_SPECIALREG(ttbr0_el1);
	ttbr0 |= TTBR_CnP;

	ttbr1 = READ_SPECIALREG(ttbr1_el1);
	ttbr1 |= TTBR_CnP;

	/* Update ttbr{0,1}_el1 with the CnP flag */
	WRITE_SPECIALREG(ttbr0_el1, ttbr0);
	WRITE_SPECIALREG(ttbr1_el1, ttbr1);
	isb();
	/* Invalidate stale TLB entries and wait for completion. */
	__asm __volatile("tlbi vmalle1is");
	dsb(ish);
	isb();
}
9141 
9142 /*
9143  * Defer enabling some features until we have read the ID registers to know
9144  * if they are supported on all CPUs.
9145  */
9146 static void
9147 pmap_init_mp(void *dummy __unused)
9148 {
9149 	uint64_t reg;
9150 
9151 	get_kernel_reg(ID_AA64PFR1_EL1, &reg);
9152 	if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
9153 		if (bootverbose)
9154 			printf("Enabling BTI\n");
9155 		pmap_bti_support = true;
9156 
9157 		pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
9158 		    sizeof(struct rs_el), NULL, NULL, NULL, NULL,
9159 		    UMA_ALIGN_PTR, 0);
9160 	}
9161 }
9162 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
9163 
9164 /*
9165  * Defer enabling CnP until we have read the ID registers to know if it's
9166  * supported on all CPUs.
9167  */
9168 static void
9169 pmap_init_cnp(void *dummy __unused)
9170 {
9171 	uint64_t reg;
9172 	u_int cpuid;
9173 
9174 	get_kernel_reg(ID_AA64MMFR2_EL1, &reg);
9175 	if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
9176 		if (bootverbose)
9177 			printf("Enabling CnP\n");
9178 		cpuid = curcpu;
9179 		smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
9180 	}
9181 
9182 }
9183 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
9184 
/*
 * Activate the given stage 1 (user) or stage 2 (VM) pmap on the current
 * CPU, allocating a fresh ASID if the pmap's cookie belongs to a stale
 * epoch.  Returns false if the pmap was already active (only a "dsb" is
 * issued in that case) and true otherwise.
 *
 * "td" may be NULL for stage 2 pmaps (see pmap_activate_vm()); it is only
 * dereferenced on the stage 1 path.
 */
static bool
pmap_activate_int(struct thread *td, pmap_t pmap)
{
	struct asid_set *set;
	int epoch;

	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));

	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
		/*
		 * Handle the possibility that the old thread was preempted
		 * after an "ic" or "tlbi" instruction but before it performed
		 * a "dsb" instruction.  If the old thread migrates to a new
		 * processor, its completion of a "dsb" instruction on that
		 * new processor does not guarantee that the "ic" or "tlbi"
		 * instructions performed on the old processor have completed.
		 */
		dsb(ish);
		return (false);
	}

	set = pmap->pm_asid_set;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));

	/*
	 * Ensure that the store to curpmap is globally visible before the
	 * load from asid_epoch is performed.
	 */
	if (pmap->pm_stage == PM_STAGE1)
		PCPU_SET(curpmap, pmap);
	else
		PCPU_SET(curvmpmap, pmap);
	dsb(ish);
	/* A stale epoch means the pmap's ASID may have been reclaimed. */
	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
	if (epoch >= 0 && epoch != set->asid_epoch)
		pmap_alloc_asid(pmap);

	if (pmap->pm_stage == PM_STAGE1) {
		uint64_t new_tcr, tcr;

		/* Apply the new thread's per-process TCR fields, if changed. */
		new_tcr = td->td_proc->p_md.md_tcr;
		tcr = READ_SPECIALREG(tcr_el1);
		if ((tcr & MD_TCR_FIELDS) != new_tcr) {
			tcr &= ~MD_TCR_FIELDS;
			tcr |= new_tcr;
			WRITE_SPECIALREG(tcr_el1, tcr);
		}
		set_ttbr0(pmap_to_ttbr0(pmap));
		if (PCPU_GET(bcast_tlbi_workaround) != 0)
			invalidate_local_icache();
	}
	return (true);
}
9240 
9241 void
9242 pmap_activate_vm(pmap_t pmap)
9243 {
9244 
9245 	PMAP_ASSERT_STAGE2(pmap);
9246 
9247 	(void)pmap_activate_int(NULL, pmap);
9248 }
9249 
9250 void
9251 pmap_activate(struct thread *td)
9252 {
9253 	pmap_t	pmap;
9254 
9255 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
9256 	PMAP_ASSERT_STAGE1(pmap);
9257 	critical_enter();
9258 	(void)pmap_activate_int(td, pmap);
9259 	critical_exit();
9260 }
9261 
9262 /*
9263  * Activate the thread we are switching to.
9264  * To simplify the assembly in cpu_throw return the new threads pcb.
9265  */
9266 struct pcb *
9267 pmap_switch(struct thread *new)
9268 {
9269 	pcpu_bp_harden bp_harden;
9270 	struct pcb *pcb;
9271 
9272 	/* Store the new curthread */
9273 	PCPU_SET(curthread, new);
9274 
9275 	/* And the new pcb */
9276 	pcb = new->td_pcb;
9277 	PCPU_SET(curpcb, pcb);
9278 
9279 	/*
9280 	 * TODO: We may need to flush the cache here if switching
9281 	 * to a user process.
9282 	 */
9283 
9284 	if (pmap_activate_int(new, vmspace_pmap(new->td_proc->p_vmspace))) {
9285 		/*
9286 		 * Stop userspace from training the branch predictor against
9287 		 * other processes. This will call into a CPU specific
9288 		 * function that clears the branch predictor state.
9289 		 */
9290 		bp_harden = PCPU_GET(bp_harden);
9291 		if (bp_harden != NULL)
9292 			bp_harden();
9293 	}
9294 
9295 	return (pcb);
9296 }
9297 
/*
 * Synchronize the instruction cache with recently written data for the
 * range [va, va + sz) in the given pmap.  Kernel addresses are synced
 * directly; user addresses are translated page by page and synced through
 * the direct map.
 */
void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{

	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	if (ADDR_IS_KERNEL(va)) {
		cpu_icache_sync_range((void *)va, sz);
	} else {
		u_int len, offset;
		vm_paddr_t pa;

		/* Find the length of data in this page to flush */
		offset = va & PAGE_MASK;
		len = imin(PAGE_SIZE - offset, sz);

		while (sz != 0) {
			/* Extract the physical address & find it in the DMAP */
			pa = pmap_extract(pmap, va);
			/* Skip holes: pmap_extract() returns 0 if unmapped. */
			if (pa != 0)
				cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
				    len);

			/* Move to the next page */
			sz -= len;
			va += len;
			/* Set the length for the next iteration */
			len = imin(PAGE_SIZE, sz);
		}
	}
}
9331 
/*
 * Handle a stage 2 (guest) translation or access flag fault.  When the
 * faulting entry exists, sets ATTR_AF and ATTR_DESCR_VALID on it and
 * returns KERN_SUCCESS; otherwise returns KERN_FAILURE so the caller can
 * fall back to the full fault path.
 */
static int
pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pd_entry_t *pdep;
	pt_entry_t *ptep, pte;
	int rv, lvl, dfsc;

	PMAP_ASSERT_STAGE2(pmap);
	rv = KERN_FAILURE;

	/* Data and insn aborts use same encoding for FSC field. */
	dfsc = esr & ISS_DATA_DFSC_MASK;
	switch (dfsc) {
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		PMAP_LOCK(pmap);
		/* The pde must sit one level above the faulting level. */
		pdep = pmap_pde(pmap, far, &lvl);
		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
			PMAP_UNLOCK(pmap);
			break;
		}

		/* Descend one level to reach the faulting entry. */
		switch (lvl) {
		case 0:
			ptep = pmap_l0_to_l1(pdep, far);
			break;
		case 1:
			ptep = pmap_l1_to_l2(pdep, far);
			break;
		case 2:
			ptep = pmap_l2_to_l3(pdep, far);
			break;
		default:
			panic("%s: Invalid pde level %d", __func__,lvl);
		}
		goto fault_exec;

	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
fault_exec:
		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
			/*
			 * If accessing an executable page invalidate
			 * the I-cache so it will be valid when we
			 * continue execution in the guest. The D-cache
			 * is assumed to already be clean to the Point
			 * of Coherency.
			 */
			if ((pte & ATTR_S2_XN_MASK) !=
			    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
				invalidate_icache();
			}
			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	}

	return (rv);
}
9398 
/*
 * Fast path for instruction/data aborts: handle access flag faults,
 * software dirty-bit (permission) faults, and transient translation faults
 * without involving the VM system.  Returns KERN_SUCCESS if the fault was
 * resolved here, KERN_FAILURE if the caller must run the full fault path.
 */
int
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pt_entry_t pte, *ptep;
	register_t intr;
	uint64_t ec, par;
	int lvl, rv;

	rv = KERN_FAILURE;

	/* Only instruction and data aborts are handled here. */
	ec = ESR_ELx_EXCEPTION(esr);
	switch (ec) {
	case EXCP_INSN_ABORT_L:
	case EXCP_INSN_ABORT:
	case EXCP_DATA_ABORT_L:
	case EXCP_DATA_ABORT:
		break;
	default:
		return (rv);
	}

	if (pmap->pm_stage == PM_STAGE2)
		return (pmap_stage2_fault(pmap, esr, far));

	/* Data and insn aborts use same encoding for FSC field. */
	switch (esr & ISS_DATA_DFSC_MASK) {
	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		/* Access flag fault: set ATTR_AF on the existing entry. */
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL) {
			pmap_set_bits(ptep, ATTR_AF);
			rv = KERN_SUCCESS;
			/*
			 * XXXMJ as an optimization we could mark the entry
			 * dirty if this is a write fault.
			 */
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_PF_L1:
	case ISS_DATA_DFSC_PF_L2:
	case ISS_DATA_DFSC_PF_L3:
		/*
		 * Permission fault on a write to an entry carrying the
		 * software dirty-bit: upgrade it to writeable.
		 */
		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
		    (esr & ISS_DATA_WnR) == 0)
			return (rv);
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL &&
		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
			if ((pte & ATTR_S1_AP_RW_BIT) ==
			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
				pmap_s1_invalidate_page(pmap, far, true);
			}
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		/*
		 * Retry the translation.  A break-before-make sequence can
		 * produce a transient fault.
		 */
		if (pmap == kernel_pmap) {
			/*
			 * The translation fault may have occurred within a
			 * critical section.  Therefore, we must check the
			 * address without acquiring the kernel pmap's lock.
			 */
			if (pmap_klookup(far, NULL))
				rv = KERN_SUCCESS;
		} else {
			bool owned;

			/*
			 * In the EFIRT driver we lock the pmap before
			 * calling into the runtime service. As the lock
			 * is already owned by the current thread skip
			 * locking it again.
			 */
			owned = PMAP_OWNED(pmap);
			if (!owned)
				PMAP_LOCK(pmap);
			/* Ask the MMU to check the address. */
			intr = intr_disable();
			par = arm64_address_translate_s1e0r(far);
			intr_restore(intr);
			if (!owned)
				PMAP_UNLOCK(pmap);

			/*
			 * If the translation was successful, then we can
			 * return success to the trap handler.
			 */
			if (PAR_SUCCESS(par))
				rv = KERN_SUCCESS;
		}
		break;
	}

	return (rv);
}
9506 
9507 /*
9508  *	Increase the starting virtual address of the given mapping if a
9509  *	different alignment might result in more superpage mappings.
9510  */
9511 void
9512 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
9513     vm_offset_t *addr, vm_size_t size)
9514 {
9515 	vm_offset_t superpage_offset;
9516 
9517 	if (size < L3C_SIZE)
9518 		return;
9519 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
9520 		offset += ptoa(object->pg_color);
9521 
9522 	/*
9523 	 * Considering the object's physical alignment, is the mapping large
9524 	 * enough to encompass an L2 (2MB/32MB) superpage ...
9525 	 */
9526 	superpage_offset = offset & L2_OFFSET;
9527 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) {
9528 		/*
9529 		 * If the virtual and physical alignments differ, then
9530 		 * increase the virtual address so that the alignments match.
9531 		 */
9532 		if ((*addr & L2_OFFSET) < superpage_offset)
9533 			*addr = (*addr & ~L2_OFFSET) + superpage_offset;
9534 		else if ((*addr & L2_OFFSET) > superpage_offset)
9535 			*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) +
9536 			    superpage_offset;
9537 		return;
9538 	}
9539 	/* ... or an L3C (64KB/2MB) superpage? */
9540 	superpage_offset = offset & L3C_OFFSET;
9541 	if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) {
9542 		if ((*addr & L3C_OFFSET) < superpage_offset)
9543 			*addr = (*addr & ~L3C_OFFSET) + superpage_offset;
9544 		else if ((*addr & L3C_OFFSET) > superpage_offset)
9545 			*addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) +
9546 			    superpage_offset;
9547 	}
9548 }
9549 
9550 /**
9551  * Get the kernel virtual address of a set of physical pages. If there are
9552  * physical addresses not covered by the DMAP perform a transient mapping
9553  * that will be removed when calling pmap_unmap_io_transient.
9554  *
9555  * \param page        The pages the caller wishes to obtain the virtual
9556  *                    address on the kernel memory map.
9557  * \param vaddr       On return contains the kernel virtual memory address
9558  *                    of the pages passed in the page parameter.
9559  * \param count       Number of pages passed in.
9560  * \param can_fault   true if the thread using the mapped pages can take
9561  *                    page faults, false otherwise.
9562  *
9563  * \returns true if the caller must call pmap_unmap_io_transient when
9564  *          finished or false otherwise.
9565  *
9566  */
9567 bool
9568 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9569     bool can_fault)
9570 {
9571 	vm_paddr_t paddr;
9572 	bool needs_mapping;
9573 	int error __diagused, i;
9574 
9575 	/*
9576 	 * Allocate any KVA space that we need, this is done in a separate
9577 	 * loop to prevent calling vmem_alloc while pinned.
9578 	 */
9579 	needs_mapping = false;
9580 	for (i = 0; i < count; i++) {
9581 		paddr = VM_PAGE_TO_PHYS(page[i]);
9582 		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
9583 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
9584 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
9585 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
9586 			needs_mapping = true;
9587 		} else {
9588 			vaddr[i] = PHYS_TO_DMAP(paddr);
9589 		}
9590 	}
9591 
9592 	/* Exit early if everything is covered by the DMAP */
9593 	if (!needs_mapping)
9594 		return (false);
9595 
9596 	if (!can_fault)
9597 		sched_pin();
9598 	for (i = 0; i < count; i++) {
9599 		paddr = VM_PAGE_TO_PHYS(page[i]);
9600 		if (!PHYS_IN_DMAP(paddr)) {
9601 			panic(
9602 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
9603 		}
9604 	}
9605 
9606 	return (needs_mapping);
9607 }
9608 
9609 void
9610 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9611     bool can_fault)
9612 {
9613 	vm_paddr_t paddr;
9614 	int i;
9615 
9616 	if (!can_fault)
9617 		sched_unpin();
9618 	for (i = 0; i < count; i++) {
9619 		paddr = VM_PAGE_TO_PHYS(page[i]);
9620 		if (!PHYS_IN_DMAP(paddr)) {
9621 			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
9622 		}
9623 	}
9624 }
9625 
9626 bool
9627 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
9628 {
9629 
9630 	return (mode >= 0 && mode < VM_MEMATTR_END);
9631 }
9632 
9633 static void *
9634 bti_dup_range(void *ctx __unused, void *data)
9635 {
9636 	struct rs_el *node, *new_node;
9637 
9638 	new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9639 	if (new_node == NULL)
9640 		return (NULL);
9641 	node = data;
9642 	memcpy(new_node, node, sizeof(*node));
9643 	return (new_node);
9644 }
9645 
9646 static void
9647 bti_free_range(void *ctx __unused, void *node)
9648 {
9649 
9650 	uma_zfree(pmap_bti_ranges_zone, node);
9651 }
9652 
9653 static int
9654 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9655 {
9656 	struct rs_el *rs;
9657 	int error;
9658 
9659 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9660 	PMAP_ASSERT_STAGE1(pmap);
9661 	MPASS(pmap->pm_bti != NULL);
9662 	rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9663 	if (rs == NULL)
9664 		return (ENOMEM);
9665 	error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
9666 	if (error != 0)
9667 		uma_zfree(pmap_bti_ranges_zone, rs);
9668 	return (error);
9669 }
9670 
9671 static void
9672 pmap_bti_deassign_all(pmap_t pmap)
9673 {
9674 
9675 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9676 	if (pmap->pm_bti != NULL)
9677 		rangeset_remove_all(pmap->pm_bti);
9678 }
9679 
9680 /*
9681  * Returns true if the BTI setting is the same across the specified address
9682  * range, and false otherwise.  When returning true, updates the referenced PTE
9683  * to reflect the BTI setting.
9684  *
9685  * Only stage 1 pmaps support BTI.  The kernel pmap is always a stage 1 pmap
9686  * that has the same BTI setting implicitly across its entire address range.
9687  */
9688 static bool
9689 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
9690 {
9691 	struct rs_el *rs;
9692 	vm_offset_t va;
9693 
9694 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9695 	KASSERT(ADDR_IS_CANONICAL(sva),
9696 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
9697 	KASSERT(ADDR_IS_CANONICAL(eva),
9698 	    ("%s: End address not in canonical form: %lx", __func__, eva));
9699 	KASSERT((*pte & ATTR_S1_GP) == 0,
9700 	    ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte));
9701 
9702 	if (pmap == kernel_pmap) {
9703 		*pte |= ATTR_KERN_GP;
9704 		return (true);
9705 	}
9706 	if (pmap->pm_bti == NULL)
9707 		return (true);
9708 	PMAP_ASSERT_STAGE1(pmap);
9709 	rs = rangeset_containing(pmap->pm_bti, sva);
9710 	if (rs == NULL)
9711 		return (rangeset_empty(pmap->pm_bti, sva, eva));
9712 	while ((va = rs->re_end) < eva) {
9713 		if ((rs = rangeset_beginning(pmap->pm_bti, va)) == NULL)
9714 			return (false);
9715 	}
9716 	*pte |= ATTR_S1_GP;
9717 	return (true);
9718 }
9719 
9720 static pt_entry_t
9721 pmap_pte_bti(pmap_t pmap, vm_offset_t va)
9722 {
9723 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9724 	MPASS(ADDR_IS_CANONICAL(va));
9725 
9726 	if (pmap->pm_stage != PM_STAGE1)
9727 		return (0);
9728 	if (pmap == kernel_pmap)
9729 		return (ATTR_KERN_GP);
9730 	if (pmap->pm_bti != NULL &&
9731 	    rangeset_containing(pmap->pm_bti, va) != NULL)
9732 		return (ATTR_S1_GP);
9733 	return (0);
9734 }
9735 
9736 static void
9737 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9738 {
9739 
9740 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9741 	if (pmap->pm_bti != NULL)
9742 		rangeset_remove(pmap->pm_bti, sva, eva);
9743 }
9744 
/*
 * Copy the source pmap's BTI rangeset into the destination pmap (used on
 * fork).  Returns 0 on success or the error from rangeset_copy().
 */
static int
pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
{

	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
	MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
	MPASS(src_pmap->pm_bti != NULL);
	MPASS(dst_pmap->pm_bti != NULL);
	/* An empty source rangeset needs no copying. */
	if (src_pmap->pm_bti->rs_data_ctx == NULL)
		return (0);
	return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
}
9758 
9759 static void
9760 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
9761 {
9762 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9763 	PMAP_ASSERT_STAGE1(pmap);
9764 
9765 	pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
9766 	    true);
9767 }
9768 
/*
 * Mark the user address range [sva, eva) as BTI-guarded in "pmap":
 * record the range in the pmap's BTI rangeset and, on success, set
 * ATTR_S1_GP on any PTEs already present in the range.
 *
 * Returns 0 on success (or when the pmap has no BTI tracking), EINVAL
 * for non-canonical addresses or a stage 2 pmap, EFAULT for an empty or
 * kernel-space range, or an error from pmap_bti_assign().
 */
int
pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	int error;

	/* Nothing to track when BTI was never set up for this pmap. */
	if (pmap->pm_bti == NULL)
		return (0);
	if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
		return (EINVAL);
	if (pmap->pm_stage != PM_STAGE1)
		return (EINVAL);
	if (eva <= sva || ADDR_IS_KERNEL(eva))
		return (EFAULT);

	sva = trunc_page(sva);
	eva = round_page(eva);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_bti_assign(pmap, sva, eva);
		if (error == 0)
			pmap_bti_update_range(pmap, sva, eva, true);
		PMAP_UNLOCK(pmap);
		/* Drop the lock, wait for memory, and retry on ENOMEM. */
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}
9797 
9798 #if defined(KASAN) || defined(KMSAN)
9799 static pd_entry_t	*pmap_san_early_l2;
9800 
9801 #define	SAN_BOOTSTRAP_L2_SIZE	(1 * L2_SIZE)
9802 #define	SAN_BOOTSTRAP_SIZE	(2 * PAGE_SIZE)
9803 static vm_offset_t __nosanitizeaddress
9804 pmap_san_enter_bootstrap_alloc_l2(void)
9805 {
9806 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
9807 	static size_t offset = 0;
9808 	vm_offset_t addr;
9809 
9810 	if (offset + L2_SIZE > sizeof(bootstrap_data)) {
9811 		panic("%s: out of memory for the bootstrap shadow map L2 entries",
9812 		    __func__);
9813 	}
9814 
9815 	addr = (uintptr_t)&bootstrap_data[offset];
9816 	offset += L2_SIZE;
9817 	return (addr);
9818 }
9819 
9820 /*
9821  * SAN L1 + L2 pages, maybe L3 entries later?
9822  */
9823 static vm_offset_t __nosanitizeaddress
9824 pmap_san_enter_bootstrap_alloc_pages(int npages)
9825 {
9826 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
9827 	static size_t offset = 0;
9828 	vm_offset_t addr;
9829 
9830 	if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
9831 		panic("%s: out of memory for the bootstrap shadow map",
9832 		    __func__);
9833 	}
9834 
9835 	addr = (uintptr_t)&bootstrap_data[offset];
9836 	offset += (npages * PAGE_SIZE);
9837 	return (addr);
9838 }
9839 
/*
 * Build the initial (pre-pmap_bootstrap()) shadow-map page tables:
 * allocate L1 and L2 pages from the static bootstrap pool, link them in
 * starting at KASAN_MIN_ADDRESS, and cache the resulting L2 table for
 * later use by pmap_san_enter().
 */
static void __nosanitizeaddress
pmap_san_enter_bootstrap(void)
{
	vm_offset_t freemempos;

	/* L1, L2 */
	freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
	bs_state.freemempos = freemempos;
	bs_state.va = KASAN_MIN_ADDRESS;
	pmap_bootstrap_l1_table(&bs_state);
	pmap_san_early_l2 = bs_state.l2;
}
9852 
9853 static vm_page_t
9854 pmap_san_enter_alloc_l3(void)
9855 {
9856 	vm_page_t m;
9857 
9858 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
9859 	    VM_ALLOC_ZERO);
9860 	if (m == NULL)
9861 		panic("%s: no memory to grow shadow map", __func__);
9862 	return (m);
9863 }
9864 
9865 static vm_page_t
9866 pmap_san_enter_alloc_l2(void)
9867 {
9868 	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
9869 	    Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
9870 }
9871 
/*
 * Ensure that shadow-map address "va" is backed by a writable mapping,
 * allocating any missing page-table levels.  Before pmap_bootstrap()
 * has run (virtual_avail == 0), entries are instead created in the
 * small statically allocated bootstrap shadow map.
 */
void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)
{
	pd_entry_t *l1, *l2;
	pt_entry_t *l3;
	vm_page_t m;

	if (virtual_avail == 0) {
		vm_offset_t block;
		int slot;
		bool first;

		/* Temporary shadow map prior to pmap_bootstrap(). */
		first = pmap_san_early_l2 == NULL;
		if (first)
			pmap_san_enter_bootstrap();

		l2 = pmap_san_early_l2;
		slot = pmap_l2_index(va);

		if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
			/* New entries come only from the first bootstrap. */
			MPASS(first);
			block = pmap_san_enter_bootstrap_alloc_l2();
			pmap_store(&l2[slot],
			    PHYS_TO_PTE(pmap_early_vtophys(block)) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
			/* Make the PTE store visible before any use. */
			dmb(ishst);
		}

		return;
	}

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	l1 = pmap_l1(kernel_pmap, va);
	MPASS(l1 != NULL);
	if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
		m = pmap_san_enter_alloc_l3();
		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
	}
	l2 = pmap_l1_to_l2(l1, va);
	if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
		/* Prefer an L2 block; fall back to an L3 table on failure. */
		m = pmap_san_enter_alloc_l2();
		if (m != NULL) {
			pmap_store(l2, VM_PAGE_TO_PTE(m) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
		} else {
			m = pmap_san_enter_alloc_l3();
			pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
		}
		dmb(ishst);
	}
	/* An L2 block already maps this address; no L3 entry is needed. */
	if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
		return;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
		return;
	m = pmap_san_enter_alloc_l3();
	pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
	dmb(ishst);
}
9932 #endif /* KASAN || KMSAN */
9933 
9934 /*
9935  * Track a range of the kernel's virtual address space that is contiguous
9936  * in various mapping attributes.
9937  */
9938 struct pmap_kernel_map_range {
9939 	vm_offset_t sva;
9940 	pt_entry_t attrs;
9941 	int l3pages;
9942 	int l3contig;
9943 	int l2blocks;
9944 	int l2contig;
9945 	int l1blocks;
9946 };
9947 
/*
 * Emit one line describing the attribute run [range->sva, eva) into the
 * sbuf, then reset the range to its sentinel.  Empty or inverted ranges
 * are skipped silently.
 */
static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{
	const char *mode;
	int index;

	if (eva <= range->sva)
		return;

	/* Decode the stage 1 memory-attribute index into a short label. */
	index = range->attrs & ATTR_S1_IDX_MASK;
	switch (index) {
	case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
		mode = "DEV-NP";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
		mode = "DEV";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
		mode = "UC";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
		mode = "WB";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
		mode = "WT";
		break;
	default:
		printf(
		    "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
		    __func__, index, range->sva, eva);
		mode = "??";
		break;
	}

	/*
	 * Flags: w = writable, x = privileged-executable (no PXN),
	 * X = user-executable (no UXN), u/s = user/system accessible,
	 * g = guarded (BTI), followed by counts per mapping size.
	 */
	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
	    (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
	    (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
	    (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
	    (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
	    mode, range->l1blocks, range->l2contig, range->l2blocks,
	    range->l3contig, range->l3pages);

	/* Reset to sentinel value. */
	range->sva = 0xfffffffffffffffful;
}
9996 
9997 /*
9998  * Determine whether the attributes specified by a page table entry match those
9999  * being tracked by the current range.
10000  */
10001 static bool
10002 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
10003 {
10004 
10005 	return (range->attrs == attrs);
10006 }
10007 
10008 static void
10009 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
10010     pt_entry_t attrs)
10011 {
10012 
10013 	memset(range, 0, sizeof(*range));
10014 	range->sva = va;
10015 	range->attrs = attrs;
10016 }
10017 
10018 /* Get the block/page attributes that correspond to the table attributes */
10019 static pt_entry_t
10020 sysctl_kmaps_table_attrs(pd_entry_t table)
10021 {
10022 	pt_entry_t attrs;
10023 
10024 	attrs = 0;
10025 	if ((table & TATTR_UXN_TABLE) != 0)
10026 		attrs |= ATTR_S1_UXN;
10027 	if ((table & TATTR_PXN_TABLE) != 0)
10028 		attrs |= ATTR_S1_PXN;
10029 	if ((table & TATTR_AP_TABLE_RO) != 0)
10030 		attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
10031 
10032 	return (attrs);
10033 }
10034 
10035 /* Read the block/page attributes we care about */
10036 static pt_entry_t
10037 sysctl_kmaps_block_attrs(pt_entry_t block)
10038 {
10039 	return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
10040 	    ATTR_S1_GP));
10041 }
10042 
10043 /*
10044  * Given a leaf PTE, derive the mapping's attributes.  If they do not match
10045  * those of the current run, dump the address range and its attributes, and
10046  * begin a new run.
10047  */
10048 static void
10049 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
10050     vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
10051     pt_entry_t l3e)
10052 {
10053 	pt_entry_t attrs;
10054 
10055 	attrs = sysctl_kmaps_table_attrs(l0e);
10056 
10057 	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
10058 		attrs |= sysctl_kmaps_block_attrs(l1e);
10059 		goto done;
10060 	}
10061 	attrs |= sysctl_kmaps_table_attrs(l1e);
10062 
10063 	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
10064 		attrs |= sysctl_kmaps_block_attrs(l2e);
10065 		goto done;
10066 	}
10067 	attrs |= sysctl_kmaps_table_attrs(l2e);
10068 	attrs |= sysctl_kmaps_block_attrs(l3e);
10069 
10070 done:
10071 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
10072 		sysctl_kmaps_dump(sb, range, va);
10073 		sysctl_kmaps_reinit(range, va, attrs);
10074 	}
10075 }
10076 
/*
 * Handler for the vm.pmap.kernel_maps sysctl: walk the kernel page
 * tables and emit one line per run of mappings with identical
 * attributes, with section headers for each well-known region.
 */
static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pd_entry_t l0e, *l1, l1e, *l2, l2e;
	pt_entry_t *l3, l3e;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = 0xfffffffffffffffful;

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock.  Kernel page table pages are never freed, so at worst we will
	 * observe inconsistencies in the output.
	 */
	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
	    i++) {
		/* Print a header at the start of each well-known region. */
		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
			sbuf_printf(sb, "\nDirect map:\n");
		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
			sbuf_printf(sb, "\nKernel map:\n");
#ifdef KASAN
		else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
			sbuf_printf(sb, "\nKASAN shadow map:\n");
#endif
#ifdef KMSAN
		else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN shadow map:\n");
		else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN origin map:\n");
#endif

		l0e = kernel_pmap->pm_l0[i];
		if ((l0e & ATTR_DESCR_VALID) == 0) {
			/* An invalid entry ends any pending run. */
			sysctl_kmaps_dump(sb, &range, sva);
			sva += L0_SIZE;
			continue;
		}
		pa = PTE_TO_PHYS(l0e);
		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
			l1e = l1[j];
			if ((l1e & ATTR_DESCR_VALID) == 0) {
				sysctl_kmaps_dump(sb, &range, sva);
				sva += L1_SIZE;
				continue;
			}
			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
				    0, 0);
				range.l1blocks++;
				sva += L1_SIZE;
				continue;
			}
			pa = PTE_TO_PHYS(l1e);
			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
				l2e = l2[k];
				if ((l2e & ATTR_DESCR_VALID) == 0) {
					sysctl_kmaps_dump(sb, &range, sva);
					sva += L2_SIZE;
					continue;
				}
				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, 0);
					/*
					 * Count a contiguous run once, at
					 * its first (aligned) entry.
					 */
					if ((l2e & ATTR_CONTIGUOUS) != 0)
						range.l2contig +=
						    k % L2C_ENTRIES == 0 ?
						    1 : 0;
					else
						range.l2blocks++;
					sva += L2_SIZE;
					continue;
				}
				pa = PTE_TO_PHYS(l2e);
				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
				    l++, sva += L3_SIZE) {
					l3e = l3[l];
					if ((l3e & ATTR_DESCR_VALID) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, l3e);
					if ((l3e & ATTR_CONTIGUOUS) != 0)
						range.l3contig +=
						    l % L3C_ENTRIES == 0 ?
						    1 : 0;
					else
						range.l3pages++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
/* vm.pmap.kernel_maps: dump the kernel address-space layout. */
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");
10197 
10198 
/* Page-zeroing implementations selected between by the ifunc below. */
void pagezero_simple(void *);
void pagezero_cache(void *);
void pagezero_mops(void *);

/*
 * Resolve pagezero to the best available implementation at boot:
 * prefer the FEAT_MOPS memory-set instructions when the CPU advertises
 * them (HWCAP2_MOPS), then cache-assisted zeroing when DCZID_EL0.DZP
 * does not prohibit it, falling back to plain stores otherwise.
 */
DEFINE_IFUNC(static, void, pagezero, (void *))
{
	uint32_t dczid_el0;

	dczid_el0 = READ_SPECIALREG(dczid_el0);

	if (elf_hwcap2 & HWCAP2_MOPS)
		return (pagezero_mops);
	else if ((dczid_el0 & DCZID_DZP) == 0)
		return (pagezero_cache);
	else
		return (pagezero_simple);
}
10216