xref: /freebsd/sys/riscv/riscv/pmap.c (revision 27595bea69400ae71fa778cf1dcb52a793911ad4)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1991 Regents of the University of California.
5  * All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  * Copyright (c) 2003 Peter Wemm
11  * All rights reserved.
12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13  * All rights reserved.
14  * Copyright (c) 2014 Andrew Turner
15  * All rights reserved.
16  * Copyright (c) 2014 The FreeBSD Foundation
17  * All rights reserved.
18  * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
19  * All rights reserved.
20  *
21  * This code is derived from software contributed to Berkeley by
22  * the Systems Programming Group of the University of Utah Computer
23  * Science Department and William Jolitz of UUNET Technologies Inc.
24  *
25  * Portions of this software were developed by Andrew Turner under
26  * sponsorship from The FreeBSD Foundation.
27  *
28  * Portions of this software were developed by SRI International and the
29  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
30  * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
31  *
32  * Portions of this software were developed by the University of Cambridge
33  * Computer Laboratory as part of the CTSRD Project, with support from the
34  * UK Higher Education Innovation Fund (HEIF).
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgement:
46  *	This product includes software developed by the University of
47  *	California, Berkeley and its contributors.
48  * 4. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  */
64 /*-
65  * Copyright (c) 2003 Networks Associates Technology, Inc.
66  * All rights reserved.
67  *
68  * This software was developed for the FreeBSD Project by Jake Burkholder,
69  * Safeport Network Services, and Network Associates Laboratories, the
70  * Security Research Division of Network Associates, Inc. under
71  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
72  * CHATS research program.
73  *
74  * Redistribution and use in source and binary forms, with or without
75  * modification, are permitted provided that the following conditions
76  * are met:
77  * 1. Redistributions of source code must retain the above copyright
78  *    notice, this list of conditions and the following disclaimer.
79  * 2. Redistributions in binary form must reproduce the above copyright
80  *    notice, this list of conditions and the following disclaimer in the
81  *    documentation and/or other materials provided with the distribution.
82  *
83  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
84  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
85  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
86  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
87  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
88  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
89  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
90  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
91  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
92  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
93  * SUCH DAMAGE.
94  */
95 
96 /*
97  *	Manages physical address maps.
98  *
99  *	Since the information managed by this module is
100  *	also stored by the logical address mapping module,
101  *	this module may throw away valid virtual-to-physical
102  *	mappings at almost any time.  However, invalidations
103  *	of virtual-to-physical mappings must be done as
104  *	requested.
105  *
106  *	In order to cope with hardware architectures which
107  *	make virtual-to-physical map invalidations expensive,
108  *	this module may delay invalidation or protection-reduction
109  *	operations until such time as they are actually
110  *	necessary.  This module is given full information as
111  *	to which processors are currently using which maps,
112  *	and as to when physical maps must be made correct.
113  */
114 
115 #include "opt_pmap.h"
116 
117 #include <sys/param.h>
118 #include <sys/systm.h>
119 #include <sys/bitstring.h>
120 #include <sys/bus.h>
121 #include <sys/cpuset.h>
122 #include <sys/kernel.h>
123 #include <sys/ktr.h>
124 #include <sys/lock.h>
125 #include <sys/malloc.h>
126 #include <sys/mman.h>
127 #include <sys/msgbuf.h>
128 #include <sys/mutex.h>
129 #include <sys/physmem.h>
130 #include <sys/proc.h>
131 #include <sys/rwlock.h>
132 #include <sys/sbuf.h>
133 #include <sys/sx.h>
134 #include <sys/vmem.h>
135 #include <sys/vmmeter.h>
136 #include <sys/sched.h>
137 #include <sys/sysctl.h>
138 #include <sys/smp.h>
139 
140 #include <vm/vm.h>
141 #include <vm/vm_param.h>
142 #include <vm/vm_kern.h>
143 #include <vm/vm_page.h>
144 #include <vm/vm_map.h>
145 #include <vm/vm_object.h>
146 #include <vm/vm_extern.h>
147 #include <vm/vm_pageout.h>
148 #include <vm/vm_pager.h>
149 #include <vm/vm_phys.h>
150 #include <vm/vm_radix.h>
151 #include <vm/vm_reserv.h>
152 #include <vm/vm_dumpset.h>
153 #include <vm/uma.h>
154 
155 #include <machine/machdep.h>
156 #include <machine/md_var.h>
157 #include <machine/pcb.h>
158 #include <machine/sbi.h>
159 #include <machine/thead.h>
160 
161 /*
162  * Boundary values for the page table page index space:
163  *
164  * L3 pages: [0, NUL2E)
165  * L2 pages: [NUL2E, NUL2E + NUL1E)
166  * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
167  *
168  * Note that these ranges are used in both SV39 and SV48 mode.  In SV39 mode the
169  * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages
170  * in a set of page tables.
171  */
172 #define	NUL0E		Ln_ENTRIES
173 #define	NUL1E		(Ln_ENTRIES * NUL0E)
174 #define	NUL2E		(Ln_ENTRIES * NUL1E)
175 
176 #ifdef PV_STATS
177 #define PV_STAT(x)	do { x ; } while (0)
178 #define	__pv_stat_used
179 #else
180 #define PV_STAT(x)	do { } while (0)
181 #define	__pv_stat_used	__unused
182 #endif
183 
184 #define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
185 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
186 #define	pa_index(pa)		((pa) >> L2_SHIFT)
187 #define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
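/*
 * Worked example (editorial sketch, not in the original file): with
 * Ln_ENTRIES == 512, the index boundaries above are
 *	NUL0E = 512, NUL1E = 512 * 512, NUL2E = 512 * 512 * 512,
 * so for a user address va = 0x1000200000,
 *	pmap_l2_pindex(va) = va >> L2_SHIFT = 0x8001
 * indexes the L3 page table page mapping va, and
 *	pmap_l1_pindex(va) = NUL2E + (va >> L1_SHIFT) = NUL2E + 0x40
 * indexes the L2 page table page covering it, consistent with the ranges
 * [0, NUL2E) and [NUL2E, NUL2E + NUL1E) described earlier.
 */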
188 
189 #define	NPV_LIST_LOCKS	MAXCPU
190 
191 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
192 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
193 
194 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
195 	struct rwlock **_lockp = (lockp);		\
196 	struct rwlock *_new_lock;			\
197 							\
198 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
199 	if (_new_lock != *_lockp) {			\
200 		if (*_lockp != NULL)			\
201 			rw_wunlock(*_lockp);		\
202 		*_lockp = _new_lock;			\
203 		rw_wlock(*_lockp);			\
204 	}						\
205 } while (0)
206 
207 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
208 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
209 
210 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
211 	struct rwlock **_lockp = (lockp);		\
212 							\
213 	if (*_lockp != NULL) {				\
214 		rw_wunlock(*_lockp);			\
215 		*_lockp = NULL;				\
216 	}						\
217 } while (0)
218 
219 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
220 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
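/*
 * Illustrative usage sketch (editorial, not in the original file): callers
 * that iterate over pages typically carry a single lock pointer and let the
 * macros above switch it as the hashed lock changes, e.g.
 *
 *	struct rwlock *lock = NULL;
 *
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);   take or switch the lock
 *	    ... manipulate m's pv_list ...
 *	RELEASE_PV_LIST_LOCK(&lock);                drop it, if held
 *
 * so that only the pv list lock covering the current page's 2MB-aligned
 * physical range is write-held at any moment.
 */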
221 
222 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
223     "VM/pmap parameters");
224 
225 /* The list of all the user pmaps */
226 LIST_HEAD(pmaplist, pmap);
227 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();
228 
229 enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39;
230 SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
231     &pmap_mode, 0,
232     "translation mode, 0 = SV39, 1 = SV48");
233 
234 struct pmap kernel_pmap_store;
235 
236 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
237 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
238 vm_offset_t kernel_vm_end = 0;
239 
240 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
241 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
242 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
243 
244 static int pmap_growkernel_panic = 0;
245 SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN,
246     &pmap_growkernel_panic, 0,
247     "panic on failure to allocate kernel page table page");
248 
249 /* This code assumes all L1 DMAP entries will be used */
250 CTASSERT((DMAP_MIN_ADDRESS  & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
251 CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_ADDRESS);
252 
253 /*
254  * This code assumes that the early DEVMAP is L2_SIZE aligned.
255  */
256 CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0);
257 
258 static struct rwlock_padalign pvh_global_lock;
259 static struct mtx_padalign allpmaps_lock;
260 
261 static int __read_frequently superpages_enabled = 1;
262 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
263     CTLFLAG_RDTUN, &superpages_enabled, 0,
264     "Enable support for transparent superpages");
265 
266 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
267     "2MB page mapping counters");
268 
269 static u_long pmap_l2_demotions;
270 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
271     &pmap_l2_demotions, 0,
272     "2MB page demotions");
273 
274 static u_long pmap_l2_mappings;
275 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
276     &pmap_l2_mappings, 0,
277     "2MB page mappings");
278 
279 static u_long pmap_l2_p_failures;
280 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
281     &pmap_l2_p_failures, 0,
282     "2MB page promotion failures");
283 
284 static u_long pmap_l2_promotions;
285 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
286     &pmap_l2_promotions, 0,
287     "2MB page promotions");
288 
289 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
290     "L1 (1GB) page mapping counters");
291 
292 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
293 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
294     &pmap_l1_demotions, "L1 (1GB) page demotions");
295 
296 /*
297  * Data for the pv entry allocation mechanism
298  */
299 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
300 static struct mtx pv_chunks_mutex;
301 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
302 static struct md_page *pv_table;
303 static struct md_page pv_dummy;
304 
305 extern cpuset_t all_harts;
306 
307 /*
308  * Internal flags for pmap_enter()'s helper functions.
309  */
310 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
311 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
312 
313 static void	free_pv_chunk(struct pv_chunk *pc);
314 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
315 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
316 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
317 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
318 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
319 		    vm_offset_t va);
320 static bool	pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va);
321 static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
322 static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
323 		    vm_offset_t va, struct rwlock **lockp);
324 static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
325 		    u_int flags, vm_page_t m, struct rwlock **lockp);
326 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
327     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
328 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
329     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
330 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
331     vm_page_t m, struct rwlock **lockp);
332 
333 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
334 		struct rwlock **lockp);
335 
336 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
337     struct spglist *free);
338 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
339 
340 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
341 
342 static uint64_t pmap_satp_mode(void);
343 
344 #define	pmap_clear(pte)			pmap_store(pte, 0)
345 #define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
346 #define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
347 #define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
348 #define	pmap_load(pte)			atomic_load_64(pte)
349 #define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
350 #define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)
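/*
 * Illustrative sketch (editorial, not in the original file): these wrappers
 * keep every PTE access a single atomic 64-bit operation.  A typical
 * read-then-update sequence is
 *
 *	pt_entry_t *l3, old;
 *
 *	l3 = pmap_l3(kernel_pmap, va);
 *	old = pmap_load(l3);			atomic 64-bit read
 *	if ((old & PTE_V) != 0)
 *		pmap_clear_bits(l3, PTE_W);	atomically clear write perm
 *
 * while pmap_load_clear() is used when the old value is needed and the entry
 * must be torn down in one atomic swap.
 */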
351 
352 /********************/
353 /* Inline functions */
354 /********************/
355 
356 static __inline void
357 pagecopy(void *s, void *d)
358 {
359 
360 	memcpy(d, s, PAGE_SIZE);
361 }
362 
363 static __inline void
364 pagezero(void *p)
365 {
366 
367 	bzero(p, PAGE_SIZE);
368 }
369 
370 #define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & Ln_ADDR_MASK)
371 #define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
372 #define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
373 #define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)
374 
375 #define	PTE_TO_PHYS(pte) \
376     ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
377 #define	L2PTE_TO_PHYS(l2) \
378     ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
379 #define	L1PTE_TO_PHYS(l1) \
380     ((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT)
381 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
382 
383 /*
384  * Construct a page table entry of the specified level pointing to physical
385  * address pa, with PTE bits 'bits'.
386  *
387  * A leaf PTE of any level must point to an address matching its alignment,
388  * e.g. L2 pages must be 2MB aligned in memory.
389  */
390 #define	L1_PTE(pa, bits)	((((pa) >> L1_SHIFT) << PTE_PPN2_S) | (bits))
391 #define	L2_PTE(pa, bits)	((((pa) >> L2_SHIFT) << PTE_PPN1_S) | (bits))
392 #define	L3_PTE(pa, bits)	((((pa) >> L3_SHIFT) << PTE_PPN0_S) | (bits))
393 
394 /*
395  * Construct a page directory entry (PDE), pointing to next level entry at pa,
396  * with PTE bits 'bits'.
397  *
398  * Unlike PTEs, page directory entries can point to any 4K-aligned physical
399  * address.
400  */
401 #define	L0_PDE(pa, bits)	L3_PTE(pa, bits)
402 #define	L1_PDE(pa, bits)	L3_PTE(pa, bits)
403 #define	L2_PDE(pa, bits)	L3_PTE(pa, bits)
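/*
 * Worked example (editorial, not in the original file): for the 2MB-aligned
 * physical address pa = 0x80200000,
 *	L2_PTE(pa, PTE_KERN) == ((pa >> L2_SHIFT) << PTE_PPN1_S) | PTE_KERN
 * and the extraction macro above inverts it exactly:
 *	L2PTE_TO_PHYS(L2_PTE(pa, PTE_KERN)) == 0x80200000.
 * The PDE macros reuse L3_PTE() because a directory entry only needs its
 * target to be 4K-aligned, regardless of the directory's own level.
 */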
404 
405 static __inline pd_entry_t *
406 pmap_l0(pmap_t pmap, vm_offset_t va)
407 {
408 	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
409 	KASSERT(VIRT_IS_VALID(va),
410 	    ("%s: malformed virtual address %#lx", __func__, va));
411 	return (&pmap->pm_top[pmap_l0_index(va)]);
412 }
413 
414 static __inline pd_entry_t *
415 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
416 {
417 	vm_paddr_t phys;
418 	pd_entry_t *l1;
419 
420 	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
421 	phys = PTE_TO_PHYS(pmap_load(l0));
422 	l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
423 
424 	return (&l1[pmap_l1_index(va)]);
425 }
426 
427 static __inline pd_entry_t *
428 pmap_l1(pmap_t pmap, vm_offset_t va)
429 {
430 	pd_entry_t *l0;
431 
432 	KASSERT(VIRT_IS_VALID(va),
433 	    ("%s: malformed virtual address %#lx", __func__, va));
434 	if (pmap_mode == PMAP_MODE_SV39) {
435 		return (&pmap->pm_top[pmap_l1_index(va)]);
436 	} else {
437 		l0 = pmap_l0(pmap, va);
438 		if ((pmap_load(l0) & PTE_V) == 0)
439 			return (NULL);
440 		if ((pmap_load(l0) & PTE_RX) != 0)
441 			return (NULL);
442 		return (pmap_l0_to_l1(l0, va));
443 	}
444 }
445 
446 static __inline pd_entry_t *
447 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
448 {
449 	vm_paddr_t phys;
450 	pd_entry_t *l2;
451 
452 	phys = PTE_TO_PHYS(pmap_load(l1));
453 	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
454 
455 	return (&l2[pmap_l2_index(va)]);
456 }
457 
458 static __inline pd_entry_t *
459 pmap_l2(pmap_t pmap, vm_offset_t va)
460 {
461 	pd_entry_t *l1;
462 
463 	l1 = pmap_l1(pmap, va);
464 	if (l1 == NULL)
465 		return (NULL);
466 	if ((pmap_load(l1) & PTE_V) == 0)
467 		return (NULL);
468 	if ((pmap_load(l1) & PTE_RX) != 0)
469 		return (NULL);
470 
471 	return (pmap_l1_to_l2(l1, va));
472 }
473 
474 static __inline pt_entry_t *
475 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
476 {
477 	vm_paddr_t phys;
478 	pt_entry_t *l3;
479 
480 	phys = PTE_TO_PHYS(pmap_load(l2));
481 	l3 = (pd_entry_t *)PHYS_TO_DMAP(phys);
482 
483 	return (&l3[pmap_l3_index(va)]);
484 }
485 
486 static __inline pt_entry_t *
487 pmap_l3(pmap_t pmap, vm_offset_t va)
488 {
489 	pd_entry_t *l2;
490 
491 	l2 = pmap_l2(pmap, va);
492 	if (l2 == NULL)
493 		return (NULL);
494 	if ((pmap_load(l2) & PTE_V) == 0)
495 		return (NULL);
496 	if ((pmap_load(l2) & PTE_RX) != 0)
497 		return (NULL);
498 
499 	return (pmap_l2_to_l3(l2, va));
500 }
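/*
 * Illustrative sketch (editorial, not in the original file): the walkers
 * above compose into a complete software page-table walk, returning NULL as
 * soon as a level is absent or is itself a leaf (PTE_RX set means there is
 * no next level to descend into):
 *
 *	pt_entry_t *l3;
 *
 *	l3 = pmap_l3(pmap, va);
 *	if (l3 != NULL && (pmap_load(l3) & PTE_V) != 0)
 *		pa = PTE_TO_PHYS(pmap_load(l3)) | (va & L3_OFFSET);
 *
 * pmap_extract() later in this file follows this pattern, with an extra case
 * for 2MB superpage leaves found at L2.
 */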
501 
502 static __inline void
503 pmap_resident_count_inc(pmap_t pmap, int count)
504 {
505 
506 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
507 	pmap->pm_stats.resident_count += count;
508 }
509 
510 static __inline void
511 pmap_resident_count_dec(pmap_t pmap, int count)
512 {
513 
514 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
515 	KASSERT(pmap->pm_stats.resident_count >= count,
516 	    ("pmap %p resident count underflow %ld %d", pmap,
517 	    pmap->pm_stats.resident_count, count));
518 	pmap->pm_stats.resident_count -= count;
519 }
520 
521 static void
522 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
523     pt_entry_t entry)
524 {
525 	struct pmap *user_pmap;
526 	pd_entry_t *l1;
527 
528 	/*
529 	 * Distribute new kernel L1 entry to all the user pmaps.  This is only
530 	 * necessary with three-level paging configured: with four-level paging
531 	 * the kernel's half of the top-level page table page is static and can
532 	 * simply be copied at pmap initialization time.
533 	 */
534 	if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39)
535 		return;
536 
537 	mtx_lock(&allpmaps_lock);
538 	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
539 		l1 = &user_pmap->pm_top[l1index];
540 		pmap_store(l1, entry);
541 	}
542 	mtx_unlock(&allpmaps_lock);
543 }
544 
545 /*
546  * Holds the PTE mode bits (defined in pte.h) for defining e.g. cacheability.
547  *
548  * The indices correspond to the VM_MEMATTR_* defines in riscv/include/vm.h.
549  *
550  * The array will be empty if no mode bits are supported by the CPU, e.g. when
551  * lacking the Svpbmt extension.
552  */
553 static __read_frequently pt_entry_t memattr_bits[VM_MEMATTR_TOTAL];
554 static __read_frequently pt_entry_t memattr_mask;
555 
556 static __inline pt_entry_t
557 pmap_memattr_bits(vm_memattr_t mode)
558 {
559 	KASSERT(pmap_is_valid_memattr(kernel_pmap, mode),
560 	    ("invalid memory mode %u\n", mode));
561 	return (memattr_bits[(int)mode]);
562 }
563 
564 /*
565  * This should only be used during pmap bootstrap e.g. by
566  * pmap_create_pagetables().
567  */
568 static pt_entry_t *
569 pmap_early_alloc_tables(vm_paddr_t *freemempos, int npages)
570 {
571 	pt_entry_t *pt;
572 
573 	pt = (pt_entry_t *)*freemempos;
574 	*freemempos += npages * PAGE_SIZE;
575 	bzero(pt, npages * PAGE_SIZE);
576 
577 	return (pt);
578 }
579 
580 /*
581  *	Construct the Direct Map -- a linear mapping of physical memory into
582  *	the kernel address space.
583  *
584  *	We walk the list of physical memory segments (of arbitrary size and
585  *	alignment) mapping each appropriately. Consequently, the DMAP address
586  *	space will have unmapped regions corresponding to the holes between
587  *	physical memory segments.
588  */
589 static vm_paddr_t
590 pmap_bootstrap_dmap(pd_entry_t *l1, vm_paddr_t freemempos)
591 {
592 	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
593 	vm_offset_t va;
594 	vm_paddr_t min_pa, max_pa, pa, endpa;
595 	pd_entry_t *l3, *l2;
596 	pt_entry_t memattr;
597 	u_int l1slot, l2slot, l3slot;
598 	int physmap_idx;
599 
600 	physmap_idx = physmem_avail(physmap, nitems(physmap));
601 	min_pa = physmap[0];
602 	max_pa = physmap[physmap_idx - 1];
603 
604 	printf("physmap_idx %u\n", physmap_idx);
605 	printf("min_pa %lx\n", min_pa);
606 	printf("max_pa %lx\n", max_pa);
607 
608 	/* Set the limits of the DMAP region. */
609 	dmap_phys_base = rounddown(min_pa, L1_SIZE);
610 	dmap_phys_max = max_pa;
611 
612 	memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
613 
614 	/*
615 	 * Walk the physmap table, using the largest page sizes possible for each
616 	 * mapping. So, for each physmap entry, map as needed/able:
617 	 *  - 4K/L3 page prefix
618 	 *  - 2M/L2 superpage prefix
619 	 *  - 1G/L1 superpages
620 	 *  - 2M/L2 superpage suffix
621 	 *  - 4K/L3 page suffix
622 	 */
623 	l3 = l2 = NULL;
624 	l2slot = l1slot = Ln_ENTRIES; /* sentinel value */
625 	for (int idx = 0; idx < physmap_idx; idx += 2) {
626 		pa = rounddown(physmap[idx], L3_SIZE);
627 		endpa = physmap[idx + 1];
628 
629 		/* Virtual address for this range. */
630 		va = PHYS_TO_DMAP(pa);
631 
632 		/* Any 2MB possible for this range? */
633 		if (roundup(pa, L2_SIZE) + L2_SIZE > endpa)
634 			goto l3end;
635 
636 		/* Loop until the next 2MB boundary. */
637 		while ((pa & L2_OFFSET) != 0) {
638 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
639 				/* Need to alloc another page table. */
640 				l2 = pmap_early_alloc_tables(&freemempos, 1);
641 
642 				/* Link it. */
643 				l1slot = pmap_l1_index(va);
644 				pmap_store(&l1[l1slot],
645 				    L1_PDE((vm_paddr_t)l2, PTE_V));
646 			}
647 
648 			if (l3 == NULL || pmap_l2_index(va) != l2slot) {
649 				l3 = pmap_early_alloc_tables(&freemempos, 1);
650 
651 				/* Link it to L2. */
652 				l2slot = pmap_l2_index(va);
653 				pmap_store(&l2[l2slot],
654 				    L2_PDE((vm_paddr_t)l3, PTE_V));
655 			}
656 
657 			/* map l3 pages */
658 			l3slot = pmap_l3_index(va);
659 			pmap_store(&l3[l3slot], L3_PTE(pa, PTE_KERN | memattr));
660 
661 			pa += L3_SIZE;
662 			va += L3_SIZE;
663 		}
664 
665 		/* Any 1GB possible for remaining range? */
666 		if (roundup(pa, L1_SIZE) + L1_SIZE > endpa)
667 			goto l2end;
668 
669 		/* Loop until the next 1GB boundary. */
670 		while ((pa & L1_OFFSET) != 0) {
671 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
672 				/* Need to alloc another page table. */
673 				l2 = pmap_early_alloc_tables(&freemempos, 1);
674 
675 				/* Link it. */
676 				l1slot = pmap_l1_index(va);
677 				pmap_store(&l1[l1slot],
678 				    L1_PDE((vm_paddr_t)l2, PTE_V));
679 			}
680 
681 			/* map l2 pages */
682 			l2slot = pmap_l2_index(va);
683 			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
684 
685 			pa += L2_SIZE;
686 			va += L2_SIZE;
687 		}
688 
689 		/* Map what we can with 1GB superpages. */
690 		while (pa + L1_SIZE - 1 < endpa) {
691 			/* map l1 pages */
692 			l1slot = pmap_l1_index(va);
693 			pmap_store(&l1[l1slot], L1_PTE(pa, PTE_KERN | memattr));
694 
695 			pa += L1_SIZE;
696 			va += L1_SIZE;
697 		}
698 
699 l2end:
700 		/* Map what we can with 2MB superpages. */
701 		while (pa + L2_SIZE - 1 < endpa) {
702 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
703 				/* Need to alloc another page table. */
704 				l2 = pmap_early_alloc_tables(&freemempos, 1);
705 
706 				/* Link it. */
707 				l1slot = pmap_l1_index(va);
708 				pmap_store(&l1[l1slot],
709 				    L1_PDE((vm_paddr_t)l2, PTE_V));
710 			}
711 
712 			/* map l2 pages */
713 			l2slot = pmap_l2_index(va);
714 			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
715 
716 			pa += L2_SIZE;
717 			va += L2_SIZE;
718 		}
719 
720 l3end:
721 		while (pa < endpa) {
722 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
723 				/* Need to alloc another page table. */
724 				l2 = pmap_early_alloc_tables(&freemempos, 1);
725 
726 				/* Link it. */
727 				l1slot = pmap_l1_index(va);
728 				pmap_store(&l1[l1slot],
729 				    L1_PDE((vm_paddr_t)l2, PTE_V));
730 			}
731 
732 			if (l3 == NULL || pmap_l2_index(va) != l2slot) {
733 				l3 = pmap_early_alloc_tables(&freemempos, 1);
734 
735 				/* Link it to L2. */
736 				l2slot = pmap_l2_index(va);
737 				pmap_store(&l2[l2slot],
738 				    L2_PDE((vm_paddr_t)l3, PTE_V));
739 			}
740 
741 			/* map l3 pages */
742 			l3slot = pmap_l3_index(va);
743 			pmap_store(&l3[l3slot], L3_PTE(pa, PTE_KERN | memattr));
744 
745 			pa += L3_SIZE;
746 			va += L3_SIZE;
747 		}
748 	}
749 
750 	/* And finally, the limit on DMAP VA. */
751 	dmap_max_addr = va;
752 
753 	return (freemempos);
754 }
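/*
 * Worked example (editorial, not in the original file): a physmap segment
 * [0x80201000, 0xc0000000) is covered by the loop above as
 *	0x80201000 - 0x80400000   4K L3 pages (prefix up to a 2MB boundary)
 *	0x80400000 - 0xc0000000   2MB L2 superpages
 * with no 1GB mappings, because roundup(pa, L1_SIZE) + L1_SIZE already
 * overshoots the segment end.  A segment spanning whole 1GB-aligned
 * gigabytes is instead mapped with L1 superpages in the middle, with 2MB
 * and 4K mappings only for the unaligned prefix and suffix.
 */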
755 
756 /*
757  *	Create a new set of pagetables to run the kernel with.
758  *
759  *	An initial, temporary setup was created in locore.S, which serves well
760  *	enough to get us this far. It mapped kernstart -> KERNBASE, using 2MB
761  *	superpages, and created a 1GB identity map, which allows this function
762  *	to dereference physical addresses.
763  *
764  *	The memory backing these page tables is allocated in the space
765  *	immediately following the kernel's preload area. Depending on the size
766  *	of this area, some, all, or none of these pages can be implicitly
767  *	mapped by the kernel's 2MB mappings. This memory will only ever be
768  *	accessed through the direct map, however.
769  */
770 static vm_paddr_t
771 pmap_create_pagetables(vm_paddr_t kernstart, vm_size_t kernlen,
772     vm_paddr_t *root_pt_phys)
773 {
774 	pt_entry_t *l0, *l1, *kern_l2, *kern_l3, *devmap_l3;
775 	pt_entry_t memattr;
776 	pd_entry_t *devmap_l2;
777 	vm_paddr_t kernend, freemempos, pa;
778 	int nkernl2, nkernl3, ndevmapl3;
779 	int i, slot;
780 	int mode;
781 
782 	kernend = kernstart + kernlen;
783 
784 	/* Static allocations begin after the kernel staging area. */
785 	freemempos = roundup2(kernend, PAGE_SIZE);
786 
787 	/* Detect Sv48 mode. */
788 	mode = PMAP_MODE_SV39;
789 	TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
790 
791 	if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) {
792 		/*
793 		 * Sv48 mode: allocate an L0 page table to be the root. The
794 		 * layout of KVA is otherwise identical to Sv39.
795 		 */
796 		l0 = pmap_early_alloc_tables(&freemempos, 1);
797 		*root_pt_phys = (vm_paddr_t)l0;
798 		pmap_mode = PMAP_MODE_SV48;
799 	} else {
800 		l0 = NULL;
801 	}
802 
803 	/*
804 	 * Allocate an L1 page table.
805 	 */
806 	l1 = pmap_early_alloc_tables(&freemempos, 1);
807 	if (pmap_mode == PMAP_MODE_SV39)
808 		*root_pt_phys = (vm_paddr_t)l1;
809 
810 	/*
811 	 * Allocate a set of L2 page tables for KVA. Most likely, only 1 is
812 	 * needed.
813 	 */
814 	nkernl2 = howmany(howmany(kernlen, L2_SIZE), Ln_ENTRIES);
815 	kern_l2 = pmap_early_alloc_tables(&freemempos, nkernl2);
816 
817 	/*
818 	 * Allocate an L2 page table for the static devmap, located at the end
819 	 * of KVA. We can expect that the devmap will always be less than 1GB
820 	 * in size.
821 	 */
822 	devmap_l2 = pmap_early_alloc_tables(&freemempos, 1);
823 
824 	/* Allocate L3 page tables for the devmap. */
825 	ndevmapl3 = howmany(howmany(PMAP_MAPDEV_EARLY_SIZE, L3_SIZE),
826 	    Ln_ENTRIES);
827 	devmap_l3 = pmap_early_alloc_tables(&freemempos, ndevmapl3);
828 
829 	/*
830 	 * Allocate some L3 bootstrap pages, for early KVA allocations before
831 	 * vm_mem_init() has run. For example, the message buffer.
832 	 *
833 	 * A somewhat arbitrary choice of 32MB. This should be more than enough
834 	 * for any early allocations. There is no need to worry about waste, as
835 	 * whatever is not used will be consumed by later calls to
836 	 * pmap_growkernel().
837 	 */
838 	nkernl3 = 16;
839 	kern_l3 = pmap_early_alloc_tables(&freemempos, nkernl3);
840 
841 	/* Bootstrap the direct map. */
842 	freemempos = pmap_bootstrap_dmap(l1, freemempos);
843 
844 	/* Allocations are done. */
845 	if (freemempos < roundup2(kernend, L2_SIZE))
846 		freemempos = roundup2(kernend, L2_SIZE);
847 
848 	/* Memory attributes for standard/main memory. */
849 	memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
850 
851 	/*
852 	 * Map the kernel (and preloaded modules or data) using L2 superpages.
853 	 *
854 	 * kernstart is 2MB-aligned. This is enforced by loader(8) and required
855 	 * by locore assembly.
856 	 *
857 	 * TODO: eventually, this should be done with proper permissions for
858 	 * each segment, rather than mapping the entire kernel and preloaded
859 	 * modules RWX.
860 	 */
861 	slot = pmap_l2_index(KERNBASE);
862 	for (pa = kernstart; pa < kernend; pa += L2_SIZE, slot++) {
863 		pmap_store(&kern_l2[slot],
864 		    L2_PTE(pa, PTE_KERN | PTE_X | memattr));
865 	}
866 
867 	/*
868 	 * Connect the L3 bootstrap pages to the kernel L2 table. The L3 PTEs
869 	 * themselves are invalid.
870 	 */
871 	slot = pmap_l2_index(freemempos - kernstart + KERNBASE);
872 	for (i = 0; i < nkernl3; i++, slot++) {
873 		pa = (vm_paddr_t)kern_l3 + ptoa(i);
874 		pmap_store(&kern_l2[slot], L2_PDE(pa, PTE_V));
875 	}
876 
877 	/* Connect the L2 tables to the L1 table. */
878 	slot = pmap_l1_index(KERNBASE);
879 	for (i = 0; i < nkernl2; i++, slot++) {
880 		pa = (vm_paddr_t)kern_l2 + ptoa(i);
881 		pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
882 	}
883 
884 	/* Connect the L1 table to L0, if in use. */
885 	if (pmap_mode == PMAP_MODE_SV48) {
886 		slot = pmap_l0_index(KERNBASE);
887 		pmap_store(&l0[slot], L0_PDE((vm_paddr_t)l1, PTE_V));
888 	}
889 
890 	/*
891 	 * Connect the devmap L3 pages to the L2 table. The devmap PTEs
892 	 * themselves are invalid.
893 	 */
894 	slot = pmap_l2_index(DEVMAP_MIN_VADDR);
895 	for (i = 0; i < ndevmapl3; i++, slot++) {
896 		pa = (vm_paddr_t)devmap_l3 + ptoa(i);
897 		pmap_store(&devmap_l2[slot], L2_PDE(pa, PTE_V));
898 	}
899 
900 	/* Connect the devmap L2 pages to the L1 table. */
901 	slot = pmap_l1_index(DEVMAP_MIN_VADDR);
902 	pa = (vm_paddr_t)devmap_l2;
903 	pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
904 
905 	/* Return the next position of free memory */
906 	return (freemempos);
907 }
908 
909 /*
910  *	Bootstrap the system enough to run with virtual memory.
911  */
912 void
913 pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen)
914 {
915 	vm_paddr_t freemempos, pa;
916 	vm_paddr_t root_pt_phys;
917 	vm_offset_t freeva;
918 	vm_offset_t dpcpu, msgbufpv;
919 	pt_entry_t *pte;
920 	int i;
921 
922 	printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen);
923 
924 	PMAP_LOCK_INIT(kernel_pmap);
925 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
926 	vm_radix_init(&kernel_pmap->pm_root);
927 
928 	rw_init(&pvh_global_lock, "pmap pv global");
929 
930 	/*
931 	 * Set the current CPU as active in the kernel pmap. Secondary cores
932 	 * will add themselves later in init_secondary(). The SBI firmware
933 	 * may rely on this mask being precise, so CPU_FILL() is not used.
934 	 */
935 	CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);
936 
937 	/*
938 	 * Set up the memory attribute bits.
939 	 */
940 	if (has_svpbmt) {
941 		memattr_bits[VM_MEMATTR_PMA] = PTE_MA_NONE;
942 		memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_MA_NC;
943 		memattr_bits[VM_MEMATTR_DEVICE] = PTE_MA_IO;
944 		memattr_mask = PTE_MA_MASK;
945 	} else if (has_errata_thead_pbmt) {
946 		memattr_bits[VM_MEMATTR_PMA] = PTE_THEAD_MA_NONE;
947 		memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_THEAD_MA_NC;
948 		memattr_bits[VM_MEMATTR_DEVICE] = PTE_THEAD_MA_IO;
949 		memattr_mask = PTE_THEAD_MA_MASK;
950 	}
951 
952 	/* Create a new set of pagetables to run the kernel in. */
953 	freemempos = pmap_create_pagetables(kernstart, kernlen, &root_pt_phys);
954 
955 	/* Switch to the newly created page tables. */
956 	kernel_pmap->pm_stage = PM_STAGE1;
957 	kernel_pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(root_pt_phys);
958 	kernel_pmap->pm_satp = atop(root_pt_phys) | pmap_satp_mode();
959 	csr_write(satp, kernel_pmap->pm_satp);
960 	sfence_vma();
961 
962 	/*
963 	 * Now, we need to make a few more static reservations from KVA.
964 	 *
965 	 * Set freeva to freemempos virtual address, and be sure to advance
966 	 * them together.
967 	 */
968 	freeva = freemempos - kernstart + KERNBASE;
969 #define reserve_space(var, pa, size)					\
970 	do {								\
971 		var = freeva;						\
972 		pa = freemempos;					\
973 		freeva += size;						\
974 		freemempos += size;					\
975 	} while (0)
976 
977 	/* Allocate the dynamic per-cpu area. */
978 	reserve_space(dpcpu, pa, DPCPU_SIZE);
979 
980 	/* Map it. */
981 	pte = pmap_l3(kernel_pmap, dpcpu);
982 	KASSERT(pte != NULL, ("Bootstrap pages missing"));
983 	for (i = 0; i < howmany(DPCPU_SIZE, PAGE_SIZE); i++)
984 		pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
985 		    pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
986 
987 	/* Now, it can be initialized. */
988 	dpcpu_init((void *)dpcpu, 0);
989 
990 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
991 	reserve_space(msgbufpv, pa, round_page(msgbufsize));
992 	msgbufp = (void *)msgbufpv;
993 
994 	/* Map it. */
995 	pte = pmap_l3(kernel_pmap, msgbufpv);
996 	KASSERT(pte != NULL, ("Bootstrap pages missing"));
997 	for (i = 0; i < howmany(msgbufsize, PAGE_SIZE); i++)
998 		pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
999 		    pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
1000 
1001 #undef	reserve_space
1002 
1003 	/* Mark the bounds of our available virtual address space */
1004 	virtual_avail = kernel_vm_end = freeva;
1005 	virtual_end = DEVMAP_MIN_VADDR;
1006 
1007 	/* Exclude the reserved physical memory from allocations. */
1008 	physmem_exclude_region(kernstart, freemempos - kernstart,
1009 	    EXFLAG_NOALLOC);
1010 }
1011 
1012 /*
1013  *	Initialize a vm_page's machine-dependent fields.
1014  */
1015 void
1016 pmap_page_init(vm_page_t m)
1017 {
1018 
1019 	TAILQ_INIT(&m->md.pv_list);
1020 	m->md.pv_memattr = VM_MEMATTR_DEFAULT;
1021 }
1022 
1023 /*
1024  *	Initialize the pmap module.
1025  *
1026  *	Called by vm_mem_init(), to initialize any structures that the pmap
1027  *	system needs to map virtual memory.
1028  */
1029 void
1030 pmap_init(void)
1031 {
1032 	vm_size_t s;
1033 	int i, pv_npg;
1034 
1035 	/*
1036 	 * Initialize the pv chunk and pmap list mutexes.
1037 	 */
1038 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1039 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);
1040 
1041 	/*
1042 	 * Initialize the pool of pv list locks.
1043 	 */
1044 	for (i = 0; i < NPV_LIST_LOCKS; i++)
1045 		rw_init(&pv_list_locks[i], "pmap pv list");
1046 
1047 	/*
1048 	 * Calculate the size of the pv head table for superpages.
1049 	 */
1050 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
1051 
1052 	/*
1053 	 * Allocate memory for the pv head table for superpages.
1054 	 */
1055 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1056 	s = round_page(s);
1057 	pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
1058 	for (i = 0; i < pv_npg; i++)
1059 		TAILQ_INIT(&pv_table[i].pv_list);
1060 	TAILQ_INIT(&pv_dummy.pv_list);
1061 
1062 	if (superpages_enabled)
1063 		pagesizes[1] = L2_SIZE;
1064 }
1065 
1066 #ifdef SMP
1067 /*
1068  * For SMP, these functions have to use IPIs for coherence.
1069  *
1070  * In general, the calling thread uses a plain fence to order the
1071  * writes to the page tables before invoking an SBI callback to invoke
1072  * sfence_vma() on remote CPUs.
1073  */
1074 static void
1075 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1076 {
1077 	cpuset_t mask;
1078 
1079 	sched_pin();
1080 	mask = pmap->pm_active;
1081 	CPU_CLR(PCPU_GET(hart), &mask);
1082 	fence();
1083 	if (!CPU_EMPTY(&mask) && smp_started)
1084 		sbi_remote_sfence_vma(mask.__bits, va, 1);
1085 	sfence_vma_page(va);
1086 	sched_unpin();
1087 }
1088 
1089 static void
1090 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1091 {
1092 	cpuset_t mask;
1093 
1094 	sched_pin();
1095 	mask = pmap->pm_active;
1096 	CPU_CLR(PCPU_GET(hart), &mask);
1097 	fence();
1098 	if (!CPU_EMPTY(&mask) && smp_started)
1099 		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);
1100 
1101 	/*
1102 	 * Might consider a loop of sfence_vma_page() for a small
1103 	 * number of pages in the future.
1104 	 */
1105 	sfence_vma();
1106 	sched_unpin();
1107 }
1108 
1109 static void
1110 pmap_invalidate_all(pmap_t pmap)
1111 {
1112 	cpuset_t mask;
1113 
1114 	sched_pin();
1115 	mask = pmap->pm_active;
1116 	CPU_CLR(PCPU_GET(hart), &mask);
1117 
1118 	/*
1119 	 * XXX: The SBI doc doesn't detail how to specify x0 as the
1120 	 * address to perform a global fence.  BBL currently treats
1121 	 * all sfence_vma requests as global however.
1122 	 */
1123 	fence();
1124 	if (!CPU_EMPTY(&mask) && smp_started)
1125 		sbi_remote_sfence_vma(mask.__bits, 0, 0);
1126 	sfence_vma();
1127 	sched_unpin();
1128 }
1129 #else
1130 /*
1131  * Normal, non-SMP, invalidation functions.
1132  * We inline these within pmap.c for speed.
1133  */
1134 static __inline void
1135 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1136 {
1137 
1138 	sfence_vma_page(va);
1139 }
1140 
1141 static __inline void
1142 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1143 {
1144 
1145 	/*
1146 	 * Might consider a loop of sfence_vma_page() for a small
1147 	 * number of pages in the future.
1148 	 */
1149 	sfence_vma();
1150 }
1151 
1152 static __inline void
1153 pmap_invalidate_all(pmap_t pmap)
1154 {
1155 
1156 	sfence_vma();
1157 }
1158 #endif
1159 
1160 /*
1161  *	Routine:	pmap_extract
1162  *	Function:
1163  *		Extract the physical page address associated
1164  *		with the given map/virtual_address pair.
1165  */
1166 vm_paddr_t
1167 pmap_extract(pmap_t pmap, vm_offset_t va)
1168 {
1169 	pd_entry_t *l2p, l2;
1170 	pt_entry_t *l3p;
1171 	vm_paddr_t pa;
1172 
1173 	pa = 0;
1174 
1175 	/*
1176 	 * Start with an L2 lookup, L1 superpages are currently not implemented.
1177 	 */
1178 	PMAP_LOCK(pmap);
1179 	l2p = pmap_l2(pmap, va);
1180 	if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) {
1181 		if ((l2 & PTE_RWX) == 0) {
1182 			l3p = pmap_l2_to_l3(l2p, va);
1183 			pa = PTE_TO_PHYS(pmap_load(l3p));
1184 			pa |= (va & L3_OFFSET);
1185 		} else {
1186 			/* L2 is a superpage mapping. */
1187 			pa = L2PTE_TO_PHYS(l2);
1188 			pa |= (va & L2_OFFSET);
1189 		}
1190 	}
1191 	PMAP_UNLOCK(pmap);
1192 	return (pa);
1193 }
1194 
1195 /*
1196  *	Routine:	pmap_extract_and_hold
1197  *	Function:
1198  *		Atomically extract and hold the physical page
1199  *		with the given pmap and virtual address pair
1200  *		if that mapping permits the given protection.
1201  */
1202 vm_page_t
1203 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1204 {
1205 	pd_entry_t *l2p, l2;
1206 	pt_entry_t *l3p, l3;
1207 	vm_page_t m;
1208 
1209 	m = NULL;
1210 	PMAP_LOCK(pmap);
1211 	l2p = pmap_l2(pmap, va);
1212 	if (l2p == NULL || ((l2 = pmap_load(l2p)) & PTE_V) == 0) {
1213 		;
1214 	} else if ((l2 & PTE_RWX) != 0) {
1215 		if ((l2 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
1216 			m = PHYS_TO_VM_PAGE(L2PTE_TO_PHYS(l2) +
1217 			    (va & L2_OFFSET));
1218 		}
1219 	} else {
1220 		l3p = pmap_l2_to_l3(l2p, va);
1221 		if ((l3 = pmap_load(l3p)) != 0) {
1222 			if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0)
1223 				m = PTE_TO_VM_PAGE(l3);
1224 		}
1225 	}
1226 	if (m != NULL && !vm_page_wire_mapped(m))
1227 		m = NULL;
1228 	PMAP_UNLOCK(pmap);
1229 	return (m);
1230 }
1231 
1232 /*
1233  *	Routine:	pmap_kextract
1234  *	Function:
1235  *		Extract the physical page address associated with the given kernel
1236  *		virtual address.
1237  */
1238 vm_paddr_t
1239 pmap_kextract(vm_offset_t va)
1240 {
1241 	pd_entry_t *l2, l2e;
1242 	pt_entry_t *l3;
1243 	vm_paddr_t pa;
1244 
1245 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1246 		pa = DMAP_TO_PHYS(va);
1247 	} else {
1248 		l2 = pmap_l2(kernel_pmap, va);
1249 		if (l2 == NULL)
1250 			panic("pmap_kextract: No l2");
1251 		l2e = pmap_load(l2);
1252 		/*
1253 		 * Beware of concurrent promotion and demotion! We must
1254 		 * use l2e rather than loading from l2 multiple times to
1255 		 * ensure we see a consistent state, including the
1256 		 * implicit load in pmap_l2_to_l3.  It is, however, safe
1257 		 * to use an old l2e because the L3 page is preserved by
1258 		 * promotion.
1259 		 */
1260 		if ((l2e & PTE_RX) != 0) {
1261 			/* superpages */
1262 			pa = L2PTE_TO_PHYS(l2e);
1263 			pa |= (va & L2_OFFSET);
1264 			return (pa);
1265 		}
1266 
1267 		l3 = pmap_l2_to_l3(&l2e, va);
1268 		pa = PTE_TO_PHYS(pmap_load(l3));
1269 		pa |= (va & PAGE_MASK);
1270 	}
1271 	return (pa);
1272 }
1273 
1274 /***************************************************
1275  * Low level mapping routines.....
1276  ***************************************************/
1277 
1278 void
1279 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1280 {
1281 	pt_entry_t entry;
1282 	pt_entry_t *l3;
1283 	pt_entry_t memattr;
1284 	vm_offset_t va;
1285 	pn_t pn;
1286 
1287 	KASSERT((pa & L3_OFFSET) == 0,
1288 	   ("pmap_kenter: Invalid physical address"));
1289 	KASSERT((sva & L3_OFFSET) == 0,
1290 	   ("pmap_kenter: Invalid virtual address"));
1291 	KASSERT((size & PAGE_MASK) == 0,
1292 	    ("pmap_kenter: Mapping is not page-sized"));
1293 
1294 	memattr = pmap_memattr_bits(mode);
1295 	va = sva;
1296 	while (size != 0) {
1297 		l3 = pmap_l3(kernel_pmap, va);
1298 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1299 
1300 		pn = (pa / PAGE_SIZE);
1301 		entry = PTE_KERN;
1302 		entry |= memattr;
1303 		entry |= (pn << PTE_PPN0_S);
1304 		pmap_store(l3, entry);
1305 
1306 		va += PAGE_SIZE;
1307 		pa += PAGE_SIZE;
1308 		size -= PAGE_SIZE;
1309 	}
1310 	pmap_invalidate_range(kernel_pmap, sva, va);
1311 }
1312 
1313 void
1314 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1315 {
1316 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
1317 }
1318 
1319 /*
1320  * Remove a page from the kernel pagetables.
1321  * Note: not SMP coherent.
1322  */
1323 void
1324 pmap_kremove(vm_offset_t va)
1325 {
1326 	pt_entry_t *l3;
1327 
1328 	l3 = pmap_l3(kernel_pmap, va);
1329 	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
1330 
1331 	pmap_clear(l3);
1332 	sfence_vma();
1333 }
1334 
1335 void
1336 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1337 {
1338 	pt_entry_t *l3;
1339 	vm_offset_t va;
1340 
1341 	KASSERT((sva & L3_OFFSET) == 0,
1342 	   ("pmap_kremove_device: Invalid virtual address"));
1343 	KASSERT((size & PAGE_MASK) == 0,
1344 	    ("pmap_kremove_device: Mapping is not page-sized"));
1345 
1346 	va = sva;
1347 	while (size != 0) {
1348 		l3 = pmap_l3(kernel_pmap, va);
1349 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1350 		pmap_clear(l3);
1351 
1352 		va += PAGE_SIZE;
1353 		size -= PAGE_SIZE;
1354 	}
1355 
1356 	pmap_invalidate_range(kernel_pmap, sva, va);
1357 }
1358 
1359 /*
1360  *	Used to map a range of physical addresses into kernel
1361  *	virtual address space.
1362  *
1363  *	The value passed in '*virt' is a suggested virtual address for
1364  *	the mapping. Architectures which can support a direct-mapped
1365  *	physical to virtual region can return the appropriate address
1366  *	within that region, leaving '*virt' unchanged. Other
1367  *	architectures should map the pages starting at '*virt' and
1368  *	update '*virt' with the first usable address after the mapped
1369  *	region.
1370  */
1371 vm_offset_t
1372 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1373 {
1374 
1375 	return PHYS_TO_DMAP(start);
1376 }
1377 
1378 /*
1379  * Add a list of wired pages to the kva
1380  * this routine is only used for temporary
1381  * kernel mappings that do not need to have
1382  * page modification or references recorded.
1383  * Note that old mappings are simply written
1384  * over.  The page *must* be wired.
1385  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1386  */
1387 void
1388 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1389 {
1390 	pt_entry_t *l3;
1391 	vm_paddr_t pa;
1392 	vm_offset_t va;
1393 	vm_page_t m;
1394 	pt_entry_t entry;
1395 	pn_t pn;
1396 	int i;
1397 
1398 	va = sva;
1399 	for (i = 0; i < count; i++) {
1400 		m = ma[i];
1401 		pa = VM_PAGE_TO_PHYS(m);
1402 		pn = (pa / PAGE_SIZE);
1403 		l3 = pmap_l3(kernel_pmap, va);
1404 
1405 		entry = PTE_KERN;
1406 		entry |= pmap_memattr_bits(m->md.pv_memattr);
1407 		entry |= (pn << PTE_PPN0_S);
1408 		pmap_store(l3, entry);
1409 
1410 		va += L3_SIZE;
1411 	}
1412 	pmap_invalidate_range(kernel_pmap, sva, va);
1413 }
1414 
1415 /*
1416  * This routine tears out page mappings from the
1417  * kernel -- it is meant only for temporary mappings.
1418  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1419  */
1420 void
1421 pmap_qremove(vm_offset_t sva, int count)
1422 {
1423 	pt_entry_t *l3;
1424 	vm_offset_t va;
1425 
1426 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1427 
1428 	for (va = sva; count-- > 0; va += PAGE_SIZE) {
1429 		l3 = pmap_l3(kernel_pmap, va);
1430 		KASSERT(l3 != NULL, ("pmap_qremove: Invalid address"));
1431 		pmap_clear(l3);
1432 	}
1433 	pmap_invalidate_range(kernel_pmap, sva, va);
1434 }
1435 
1436 bool
1437 pmap_ps_enabled(pmap_t pmap __unused)
1438 {
1439 
1440 	return (superpages_enabled);
1441 }
1442 
1443 /***************************************************
1444  * Page table page management routines.....
1445  ***************************************************/
1446 /*
1447  * Schedule the specified unused page table page to be freed.  Specifically,
1448  * add the page to the specified list of pages that will be released to the
1449  * physical memory manager after the TLB has been updated.
1450  */
1451 static __inline void
1452 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
1453 {
1454 
1455 	if (set_PG_ZERO)
1456 		m->flags |= PG_ZERO;
1457 	else
1458 		m->flags &= ~PG_ZERO;
1459 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1460 }
1461 
1462 /*
1463  * Inserts the specified page table page into the specified pmap's collection
1464  * of idle page table pages.  Each of a pmap's page table pages is responsible
1465  * for mapping a distinct range of virtual addresses.  The pmap's collection is
1466  * ordered by this virtual address range.
1467  *
1468  * If "promoted" is false, then the page table page "mpte" must be zero filled;
1469  * "mpte"'s valid field will be set to 0.
1470  *
1471  * If "promoted" is true and "all_l3e_PTE_A_set" is false, then "mpte" must
1472  * contain valid mappings with identical attributes except for PTE_A;
1473  * "mpte"'s valid field will be set to 1.
1474  *
1475  * If "promoted" and "all_l3e_PTE_A_set" are both true, then "mpte" must contain
1476  * valid mappings with identical attributes including PTE_A; "mpte"'s valid
1477  * field will be set to VM_PAGE_BITS_ALL.
1478  */
1479 static __inline int
1480 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
1481     bool all_l3e_PTE_A_set)
1482 {
1483 
1484 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1485 	KASSERT(promoted || !all_l3e_PTE_A_set,
1486 	    ("a zero-filled PTP can't have PTE_A set in every PTE"));
1487 	mpte->valid = promoted ? (all_l3e_PTE_A_set ? VM_PAGE_BITS_ALL : 1) : 0;
1488 	return (vm_radix_insert(&pmap->pm_root, mpte));
1489 }
1490 
1491 /*
1492  * Removes the page table page mapping the specified virtual address from the
1493  * specified pmap's collection of idle page table pages, and returns it.
1494  * Otherwise, returns NULL if there is no page table page corresponding to the
1495  * specified virtual address.
1496  */
1497 static __inline vm_page_t
1498 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1499 {
1500 
1501 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1502 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
1503 }
1504 
1505 /*
1506  * Decrements a page table page's reference count, which is used to record the
1507  * number of valid page table entries within the page.  If the reference count
1508  * drops to zero, then the page table page is unmapped.  Returns true if the
1509  * page table page was unmapped and false otherwise.
1510  */
1511 static inline bool
1512 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1513 {
1514 	KASSERT(m->ref_count > 0,
1515 	    ("%s: page %p ref count underflow", __func__, m));
1516 
1517 	--m->ref_count;
1518 	if (m->ref_count == 0) {
1519 		_pmap_unwire_ptp(pmap, va, m, free);
1520 		return (true);
1521 	} else {
1522 		return (false);
1523 	}
1524 }
1525 
1526 static void
1527 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1528 {
1529 
1530 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1531 	if (m->pindex >= NUL2E + NUL1E) {
1532 		pd_entry_t *l0;
1533 		l0 = pmap_l0(pmap, va);
1534 		pmap_clear(l0);
1535 	} else if (m->pindex >= NUL2E) {
1536 		pd_entry_t *l1;
1537 		l1 = pmap_l1(pmap, va);
1538 		pmap_clear(l1);
1539 		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
1540 	} else {
1541 		pd_entry_t *l2;
1542 		l2 = pmap_l2(pmap, va);
1543 		pmap_clear(l2);
1544 	}
1545 	pmap_resident_count_dec(pmap, 1);
1546 	if (m->pindex < NUL2E) {
1547 		pd_entry_t *l1;
1548 		vm_page_t pdpg;
1549 
1550 		l1 = pmap_l1(pmap, va);
1551 		pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1552 		pmap_unwire_ptp(pmap, va, pdpg, free);
1553 	} else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) {
1554 		pd_entry_t *l0;
1555 		vm_page_t pdpg;
1556 
1557 		l0 = pmap_l0(pmap, va);
1558 		pdpg = PTE_TO_VM_PAGE(pmap_load(l0));
1559 		pmap_unwire_ptp(pmap, va, pdpg, free);
1560 	}
1561 	pmap_invalidate_page(pmap, va);
1562 
1563 	vm_wire_sub(1);
1564 
1565 	/*
1566 	 * Put page on a list so that it is released after
1567 	 * *ALL* TLB shootdown is done
1568 	 */
1569 	pmap_add_delayed_free_list(m, free, true);
1570 }
1571 
1572 /*
1573  * After removing a page table entry, this routine is used to
1574  * conditionally free the page, and manage the reference count.
1575  */
1576 static int
1577 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1578     struct spglist *free)
1579 {
1580 	vm_page_t mpte;
1581 
1582 	if (va >= VM_MAXUSER_ADDRESS)
1583 		return (0);
1584 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1585 	mpte = PTE_TO_VM_PAGE(ptepde);
1586 	return (pmap_unwire_ptp(pmap, va, mpte, free));
1587 }
1588 
1589 static uint64_t
1590 pmap_satp_mode(void)
1591 {
1592 	return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48);
1593 }
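/*
 * Illustrative sketch (editorial, not in the original file): the satp value
 * installed for a pmap is just the mode field returned above OR'd with the
 * physical page number of the root page-table page:
 *
 *	pmap->pm_satp = pmap_satp_mode() | (root_pt_phys >> PAGE_SHIFT);
 *	csr_write(satp, pmap->pm_satp);
 *	sfence_vma();
 *
 * pmap_bootstrap() builds and installs the kernel's value this way (using
 * atop(), which is equivalent to the shift), while pmap_pinit0() and
 * pmap_pinit_stage() below compose pm_satp the same way and leave the
 * csr_write() until the pmap is activated.
 */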
1594 
1595 void
1596 pmap_pinit0(pmap_t pmap)
1597 {
1598 	PMAP_LOCK_INIT(pmap);
1599 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1600 	pmap->pm_stage = PM_STAGE1;
1601 	pmap->pm_top = kernel_pmap->pm_top;
1602 	pmap->pm_satp = pmap_satp_mode() |
1603 	    (vtophys(pmap->pm_top) >> PAGE_SHIFT);
1604 	CPU_ZERO(&pmap->pm_active);
1605 	TAILQ_INIT(&pmap->pm_pvchunk);
1606 	vm_radix_init(&pmap->pm_root);
1607 	pmap_activate_boot(pmap);
1608 }
1609 
1610 int
1611 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage)
1612 {
1613 	vm_paddr_t topphys;
1614 	vm_page_t m;
1615 	size_t i;
1616 
1617 	/*
1618 	 * Top directory is 4 pages in hypervisor case.
1619 	 * Current address space layout makes 3 of them unused.
1620 	 */
1621 	if (stage == PM_STAGE1)
1622 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
1623 		    VM_ALLOC_WAITOK);
1624 	else
1625 		m = vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
1626 		    4, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT);
1627 
1628 	topphys = VM_PAGE_TO_PHYS(m);
1629 	pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys);
1630 	pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT);
1631 	pmap->pm_stage = stage;
1632 
1633 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1634 
1635 	CPU_ZERO(&pmap->pm_active);
1636 
1637 	if (stage == PM_STAGE2)
1638 		goto finish;
1639 
1640 	if (pmap_mode == PMAP_MODE_SV39) {
1641 		/*
1642 		 * Copy L1 entries from the kernel pmap.  This must be done with
1643 		 * the allpmaps lock held to avoid races with
1644 		 * pmap_distribute_l1().
1645 		 */
1646 		mtx_lock(&allpmaps_lock);
1647 		LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1648 		for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS);
1649 		    i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++)
1650 			pmap->pm_top[i] = kernel_pmap->pm_top[i];
1651 		for (i = pmap_l1_index(DMAP_MIN_ADDRESS);
1652 		    i < pmap_l1_index(DMAP_MAX_ADDRESS); i++)
1653 			pmap->pm_top[i] = kernel_pmap->pm_top[i];
1654 		mtx_unlock(&allpmaps_lock);
1655 	} else {
1656 		i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS);
1657 		pmap->pm_top[i] = kernel_pmap->pm_top[i];
1658 	}
1659 
1660 finish:
1661 	TAILQ_INIT(&pmap->pm_pvchunk);
1662 	vm_radix_init(&pmap->pm_root);
1663 
1664 	return (1);
1665 }
1666 
1667 int
1668 pmap_pinit(pmap_t pmap)
1669 {
1670 
1671 	return (pmap_pinit_stage(pmap, PM_STAGE1));
1672 }
1673 
1674 /*
1675  * This routine is called if the desired page table page does not exist.
1676  *
1677  * If page table page allocation fails, this routine may sleep before
1678  * returning NULL.  It sleeps only if a lock pointer was given.
1679  *
1680  * Note: If a page allocation fails at page table level two or three,
1681  * one or two pages may be held during the wait, only to be released
1682  * afterwards.  This conservative approach makes it easy to argue that
1683  * race conditions are avoided.
1684  */
1685 static vm_page_t
1686 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1687 {
1688 	vm_page_t m, pdpg;
1689 	pt_entry_t entry;
1690 	vm_paddr_t phys;
1691 	pn_t pn;
1692 
1693 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1694 
1695 	/*
1696 	 * Allocate a page table page.
1697 	 */
1698 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1699 	if (m == NULL) {
1700 		if (lockp != NULL) {
1701 			RELEASE_PV_LIST_LOCK(lockp);
1702 			PMAP_UNLOCK(pmap);
1703 			rw_runlock(&pvh_global_lock);
1704 			vm_wait(NULL);
1705 			rw_rlock(&pvh_global_lock);
1706 			PMAP_LOCK(pmap);
1707 		}
1708 
1709 		/*
1710 		 * Indicate the need to retry.  While waiting, the page table
1711 		 * page may have been allocated.
1712 		 */
1713 		return (NULL);
1714 	}
1715 	m->pindex = ptepindex;
1716 
1717 	/*
1718 	 * Map the pagetable page into the process address space, if
1719 	 * it isn't already there.
1720 	 */
1721 	pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
1722 	if (ptepindex >= NUL2E + NUL1E) {
1723 		pd_entry_t *l0;
1724 		vm_pindex_t l0index;
1725 
1726 		KASSERT(pmap_mode != PMAP_MODE_SV39,
1727 		    ("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
1728 		KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
1729 		    ("%s: pindex %#lx out of range", __func__, ptepindex));
1730 
1731 		l0index = ptepindex - (NUL2E + NUL1E);
1732 		l0 = &pmap->pm_top[l0index];
1733 		KASSERT((pmap_load(l0) & PTE_V) == 0,
1734 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));
1735 
1736 		entry = PTE_V | (pn << PTE_PPN0_S);
1737 		pmap_store(l0, entry);
1738 	} else if (ptepindex >= NUL2E) {
1739 		pd_entry_t *l0, *l1;
1740 		vm_pindex_t l0index, l1index;
1741 
1742 		l1index = ptepindex - NUL2E;
1743 		if (pmap_mode == PMAP_MODE_SV39) {
1744 			l1 = &pmap->pm_top[l1index];
1745 		} else {
1746 			l0index = l1index >> Ln_ENTRIES_SHIFT;
1747 			l0 = &pmap->pm_top[l0index];
1748 			if (pmap_load(l0) == 0) {
1749 				/* Recurse to allocate the L1 page. */
1750 				if (_pmap_alloc_l3(pmap,
1751 				    NUL2E + NUL1E + l0index, lockp) == NULL)
1752 					goto fail;
1753 				phys = PTE_TO_PHYS(pmap_load(l0));
1754 			} else {
1755 				phys = PTE_TO_PHYS(pmap_load(l0));
1756 				pdpg = PHYS_TO_VM_PAGE(phys);
1757 				pdpg->ref_count++;
1758 			}
1759 			l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1760 			l1 = &l1[ptepindex & Ln_ADDR_MASK];
1761 		}
1762 		KASSERT((pmap_load(l1) & PTE_V) == 0,
1763 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
1764 
1765 		entry = PTE_V | (pn << PTE_PPN0_S);
1766 		pmap_store(l1, entry);
1767 		pmap_distribute_l1(pmap, l1index, entry);
1768 	} else {
1769 		vm_pindex_t l0index, l1index;
1770 		pd_entry_t *l0, *l1, *l2;
1771 
1772 		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
1773 		if (pmap_mode == PMAP_MODE_SV39) {
1774 			l1 = &pmap->pm_top[l1index];
1775 			if (pmap_load(l1) == 0) {
1776 				/* recurse for allocating page dir */
1777 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1778 				    lockp) == NULL)
1779 					goto fail;
1780 			} else {
1781 				pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1782 				pdpg->ref_count++;
1783 			}
1784 		} else {
1785 			l0index = l1index >> Ln_ENTRIES_SHIFT;
1786 			l0 = &pmap->pm_top[l0index];
1787 			if (pmap_load(l0) == 0) {
1788 				/* Recurse to allocate the L1 entry. */
1789 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1790 				    lockp) == NULL)
1791 					goto fail;
1792 				phys = PTE_TO_PHYS(pmap_load(l0));
1793 				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1794 				l1 = &l1[l1index & Ln_ADDR_MASK];
1795 			} else {
1796 				phys = PTE_TO_PHYS(pmap_load(l0));
1797 				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1798 				l1 = &l1[l1index & Ln_ADDR_MASK];
1799 				if (pmap_load(l1) == 0) {
1800 					/* Recurse to allocate the L2 page. */
1801 					if (_pmap_alloc_l3(pmap,
1802 					    NUL2E + l1index, lockp) == NULL)
1803 						goto fail;
1804 				} else {
1805 					pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1806 					pdpg->ref_count++;
1807 				}
1808 			}
1809 		}
1810 
1811 		phys = PTE_TO_PHYS(pmap_load(l1));
1812 		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1813 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
1814 		KASSERT((pmap_load(l2) & PTE_V) == 0,
1815 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
1816 
1817 		entry = PTE_V | (pn << PTE_PPN0_S);
1818 		pmap_store(l2, entry);
1819 	}
1820 
1821 	pmap_resident_count_inc(pmap, 1);
1822 
1823 	return (m);
1824 
1825 fail:
1826 	vm_page_unwire_noq(m);
1827 	vm_page_free_zero(m);
1828 	return (NULL);
1829 }
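/*
 * Illustrative summary (derived from the code above) of how ptepindex
 * selects the page table level being allocated:
 *
 *	[0, NUL2E)				an L3 page table page
 *	[NUL2E, NUL2E + NUL1E)			an L2 page directory page
 *	[NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)	an L1 page (SV48 mode only)
 *
 * The function recurses to allocate a missing parent before installing the
 * newly allocated page into it.
 */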
1830 
1831 static vm_page_t
1832 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1833 {
1834 	pd_entry_t *l1;
1835 	vm_page_t l2pg;
1836 	vm_pindex_t pindex;
1837 
1838 retry:
1839 	l1 = pmap_l1(pmap, va);
1840 	if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
1841 		KASSERT((pmap_load(l1) & PTE_RWX) == 0,
1842 		    ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
1843 		    pmap_load(l1), va));
1844 		/* Add a reference to the L2 page. */
1845 		l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
1846 		l2pg->ref_count++;
1847 	} else {
1848 		/* Allocate a L2 page. */
1849 		pindex = pmap_l1_pindex(va);
1850 		l2pg = _pmap_alloc_l3(pmap, pindex, lockp);
1851 		if (l2pg == NULL && lockp != NULL)
1852 			goto retry;
1853 	}
1854 	return (l2pg);
1855 }
1856 
1857 static vm_page_t
1858 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1859 {
1860 	vm_pindex_t ptepindex;
1861 	pd_entry_t *l2;
1862 	vm_page_t m;
1863 
1864 	/*
1865 	 * Calculate pagetable page index
1866 	 */
1867 	ptepindex = pmap_l2_pindex(va);
1868 retry:
1869 	/*
1870 	 * Get the page directory entry
1871 	 */
1872 	l2 = pmap_l2(pmap, va);
1873 
1874 	/*
1875 	 * If the page table page is mapped, we just increment the
1876 	 * hold count, and activate it.
1877 	 */
1878 	if (l2 != NULL && pmap_load(l2) != 0) {
1879 		m = PTE_TO_VM_PAGE(pmap_load(l2));
1880 		m->ref_count++;
1881 	} else {
1882 		/*
1883 		 * Here if the pte page isn't mapped, or if it has been
1884 		 * deallocated.
1885 		 */
1886 		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1887 		if (m == NULL && lockp != NULL)
1888 			goto retry;
1889 	}
1890 	return (m);
1891 }
1892 
1893 /***************************************************
1894  * Pmap allocation/deallocation routines.
1895  ***************************************************/
1896 
1897 /*
1898  * Release any resources held by the given physical map.
1899  * Called when a pmap initialized by pmap_pinit is being released.
1900  * Should only be called if the map contains no valid mappings.
1901  */
1902 void
1903 pmap_release(pmap_t pmap)
1904 {
1905 	vm_page_t m;
1906 	int npages;
1907 	int i;
1908 
1909 	KASSERT(pmap->pm_stats.resident_count == 0,
1910 	    ("pmap_release: pmap resident count %ld != 0",
1911 	    pmap->pm_stats.resident_count));
1912 	KASSERT(CPU_EMPTY(&pmap->pm_active),
1913 	    ("releasing active pmap %p", pmap));
1914 
1915 	if (pmap->pm_stage == PM_STAGE2)
1916 		goto finish;
1917 
1918 	if (pmap_mode == PMAP_MODE_SV39) {
1919 		mtx_lock(&allpmaps_lock);
1920 		LIST_REMOVE(pmap, pm_list);
1921 		mtx_unlock(&allpmaps_lock);
1922 	}
1923 
1924 finish:
1925 	npages = pmap->pm_stage == PM_STAGE2 ? 4 : 1;
1926 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
1927 	for (i = 0; i < npages; i++) {
1928 		vm_page_unwire_noq(m);
1929 		vm_page_free(m);
1930 		m++;
1931 	}
1932 }
1933 
1934 static int
1935 kvm_size(SYSCTL_HANDLER_ARGS)
1936 {
1937 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1938 
1939 	return sysctl_handle_long(oidp, &ksize, 0, req);
1940 }
1941 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1942     0, 0, kvm_size, "LU",
1943     "Size of KVM");
1944 
1945 static int
1946 kvm_free(SYSCTL_HANDLER_ARGS)
1947 {
1948 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1949 
1950 	return sysctl_handle_long(oidp, &kfree, 0, req);
1951 }
1952 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1953     0, 0, kvm_free, "LU",
1954     "Amount of KVM free");
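/*
 * Usage note (editorial): the two read-only sysctls above may be queried
 * from userland, e.g.
 *
 *	$ sysctl vm.kvm_size vm.kvm_free
 *
 * to see how large the kernel virtual address space is and how much of it
 * has not yet been claimed by pmap_growkernel().
 */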
1955 
1956 /*
1957  * grow the number of kernel page table entries, if needed
1958  */
1959 static int
1960 pmap_growkernel_nopanic(vm_offset_t addr)
1961 {
1962 	vm_paddr_t paddr;
1963 	vm_page_t nkpg;
1964 	pd_entry_t *l1, *l2;
1965 	pt_entry_t entry;
1966 	pn_t pn;
1967 
1968 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1969 
1970 	addr = roundup2(addr, L2_SIZE);
1971 	if (addr - 1 >= vm_map_max(kernel_map))
1972 		addr = vm_map_max(kernel_map);
1973 	while (kernel_vm_end < addr) {
1974 		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
1975 		if (pmap_load(l1) == 0) {
1976 			/* We need a new PDP entry */
1977 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
1978 			    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1979 			if (nkpg == NULL)
1980 				return (KERN_RESOURCE_SHORTAGE);
1981 
1982 			nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
1983 			paddr = VM_PAGE_TO_PHYS(nkpg);
1984 
1985 			pn = (paddr / PAGE_SIZE);
1986 			entry = (PTE_V);
1987 			entry |= (pn << PTE_PPN0_S);
1988 			pmap_store(l1, entry);
1989 			pmap_distribute_l1(kernel_pmap,
1990 			    pmap_l1_index(kernel_vm_end), entry);
1991 			continue; /* try again */
1992 		}
1993 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1994 		if ((pmap_load(l2) & PTE_V) != 0 &&
1995 		    (pmap_load(l2) & PTE_RWX) == 0) {
1996 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1997 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1998 				kernel_vm_end = vm_map_max(kernel_map);
1999 				break;
2000 			}
2001 			continue;
2002 		}
2003 
2004 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2005 		    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2006 		if (nkpg == NULL)
2007 			return (KERN_RESOURCE_SHORTAGE);
2008 		nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
2009 		paddr = VM_PAGE_TO_PHYS(nkpg);
2010 
2011 		pn = (paddr / PAGE_SIZE);
2012 		entry = (PTE_V);
2013 		entry |= (pn << PTE_PPN0_S);
2014 		pmap_store(l2, entry);
2015 
2016 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
2017 
2018 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2019 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2020 			kernel_vm_end = vm_map_max(kernel_map);
2021 			break;
2022 		}
2023 	}
2024 
2025 	return (KERN_SUCCESS);
2026 }
2027 
2028 int
2029 pmap_growkernel(vm_offset_t addr)
2030 {
2031 	int rv;
2032 
2033 	rv = pmap_growkernel_nopanic(addr);
2034 	if (rv != KERN_SUCCESS && pmap_growkernel_panic)
2035 		panic("pmap_growkernel: no memory to grow kernel");
2036 	return (rv);
2037 }
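/*
 * Illustrative note (editorial): the kernel map grows in L2_SIZE (2MB)
 * steps.  "addr" is rounded up to a 2MB boundary, and each new L3 page
 * table page installed above extends kernel_vm_end by a further 2MB of
 * mappable kernel virtual address space.
 */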
2038 
2039 /***************************************************
2040  * page management routines.
2041  ***************************************************/
2042 
2043 static const uint64_t pc_freemask[_NPCM] = {
2044 	[0 ... _NPCM - 2] = PC_FREEN,
2045 	[_NPCM - 1] = PC_FREEL
2046 };
2047 
2048 #ifdef PV_STATS
2049 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2050 
2051 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2052 	"Current number of pv entry chunks");
2053 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2054 	"Current number of pv entry chunks allocated");
2055 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2056 	"Number of pv entry chunk frees");
2057 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2058 	"Number of times tried to get a chunk page but failed.");
2059 
2060 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2061 static int pv_entry_spare;
2062 
2063 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2064 	"Current number of pv entry frees");
2065 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2066 	"Current number of pv entry allocs");
2067 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2068 	"Current number of pv entries");
2069 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2070 	"Current number of spare pv entries");
2071 #endif
2072 
2073 /*
2074  * We are in a serious low memory condition.  Resort to
2075  * drastic measures to free some pages so we can allocate
2076  * another pv entry chunk.
2077  *
2078  * Returns NULL if PV entries were reclaimed from the specified pmap.
2079  *
2080  * We do not, however, unmap 2mpages because subsequent accesses will
2081  * allocate per-page pv entries until repromotion occurs, thereby
2082  * exacerbating the shortage of free pv entries.
2083  */
2084 static vm_page_t
2085 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2086 {
2087 
2088 	panic("RISCVTODO: reclaim_pv_chunk");
2089 }
2090 
2091 /*
2092  * free the pv_entry back to the free list
2093  */
2094 static void
2095 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2096 {
2097 	struct pv_chunk *pc;
2098 	int idx, field, bit;
2099 
2100 	rw_assert(&pvh_global_lock, RA_LOCKED);
2101 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2102 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2103 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2104 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2105 	pc = pv_to_chunk(pv);
2106 	idx = pv - &pc->pc_pventry[0];
2107 	field = idx / 64;
2108 	bit = idx % 64;
2109 	pc->pc_map[field] |= 1ul << bit;
2110 	if (!pc_is_free(pc)) {
2111 		/* 98% of the time, pc is already at the head of the list. */
2112 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2113 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2114 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2115 		}
2116 		return;
2117 	}
2118 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2119 	free_pv_chunk(pc);
2120 }
2121 
2122 static void
2123 free_pv_chunk(struct pv_chunk *pc)
2124 {
2125 	vm_page_t m;
2126 
2127 	mtx_lock(&pv_chunks_mutex);
2128 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2129 	mtx_unlock(&pv_chunks_mutex);
2130 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2131 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2132 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2133 	/* entire chunk is free, return it */
2134 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2135 	dump_drop_page(m->phys_addr);
2136 	vm_page_unwire_noq(m);
2137 	vm_page_free(m);
2138 }
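/*
 * Illustrative note (editorial): pv chunk pages are added to the kernel
 * dump bitmap with dump_add_page() when allocated and dropped with
 * dump_drop_page() here, so minidumps only include chunks that are still
 * in use.
 */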
2139 
2140 /*
2141  * Returns a new PV entry, allocating a new PV chunk from the system when
2142  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2143  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2144  * returned.
2145  *
2146  * The given PV list lock may be released.
2147  */
2148 static pv_entry_t
2149 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2150 {
2151 	int bit, field;
2152 	pv_entry_t pv;
2153 	struct pv_chunk *pc;
2154 	vm_page_t m;
2155 
2156 	rw_assert(&pvh_global_lock, RA_LOCKED);
2157 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2158 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2159 retry:
2160 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2161 	if (pc != NULL) {
2162 		for (field = 0; field < _NPCM; field++) {
2163 			if (pc->pc_map[field]) {
2164 				bit = ffsl(pc->pc_map[field]) - 1;
2165 				break;
2166 			}
2167 		}
2168 		if (field < _NPCM) {
2169 			pv = &pc->pc_pventry[field * 64 + bit];
2170 			pc->pc_map[field] &= ~(1ul << bit);
2171 			/* If this was the last free entry, move chunk to the tail. */
2172 			if (pc_is_full(pc)) {
2173 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2174 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2175 				    pc_list);
2176 			}
2177 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2178 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2179 			return (pv);
2180 		}
2181 	}
2182 	/* No free items, allocate another chunk */
2183 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2184 	if (m == NULL) {
2185 		if (lockp == NULL) {
2186 			PV_STAT(pc_chunk_tryfail++);
2187 			return (NULL);
2188 		}
2189 		m = reclaim_pv_chunk(pmap, lockp);
2190 		if (m == NULL)
2191 			goto retry;
2192 	}
2193 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2194 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2195 	dump_add_page(m->phys_addr);
2196 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2197 	pc->pc_pmap = pmap;
2198 	pc->pc_map[0] = PC_FREEN & ~1ul;	/* preallocated bit 0 */
2199 	pc->pc_map[1] = PC_FREEN;
2200 	pc->pc_map[2] = PC_FREEL;
2201 	mtx_lock(&pv_chunks_mutex);
2202 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2203 	mtx_unlock(&pv_chunks_mutex);
2204 	pv = &pc->pc_pventry[0];
2205 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2206 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2207 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2208 	return (pv);
2209 }
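/*
 * Illustrative note (derived from the code above): a pv chunk is a single
 * wired page holding _NPCPV pv entries plus the pc_map[_NPCM] bitmap, in
 * which a set bit marks a free entry.  A freshly allocated chunk therefore
 * starts as PC_FREEN/PC_FREEL with bit 0 already clear, accounting for the
 * entry handed back to the caller.
 */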
2210 
2211 /*
2212  * Ensure that the number of spare PV entries in the specified pmap meets or
2213  * exceeds the given count, "needed".
2214  *
2215  * The given PV list lock may be released.
2216  */
2217 static void
2218 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2219 {
2220 	struct pch new_tail;
2221 	struct pv_chunk *pc;
2222 	vm_page_t m;
2223 	int avail, free;
2224 	bool reclaimed;
2225 
2226 	rw_assert(&pvh_global_lock, RA_LOCKED);
2227 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2228 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2229 
2230 	/*
2231 	 * Newly allocated PV chunks must be stored in a private list until
2232 	 * the required number of PV chunks have been allocated.  Otherwise,
2233 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
2234 	 * contrast, these chunks must be added to the pmap upon allocation.
2235 	 */
2236 	TAILQ_INIT(&new_tail);
2237 retry:
2238 	avail = 0;
2239 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2240 		bit_count((bitstr_t *)pc->pc_map, 0,
2241 		    sizeof(pc->pc_map) * NBBY, &free);
2242 		if (free == 0)
2243 			break;
2244 		avail += free;
2245 		if (avail >= needed)
2246 			break;
2247 	}
2248 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
2249 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2250 		if (m == NULL) {
2251 			m = reclaim_pv_chunk(pmap, lockp);
2252 			if (m == NULL)
2253 				goto retry;
2254 			reclaimed = true;
2255 		}
2256 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2257 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2258 		dump_add_page(m->phys_addr);
2259 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2260 		pc->pc_pmap = pmap;
2261 		pc->pc_map[0] = PC_FREEN;
2262 		pc->pc_map[1] = PC_FREEN;
2263 		pc->pc_map[2] = PC_FREEL;
2264 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2265 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2266 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2267 
2268 		/*
2269 		 * The reclaim might have freed a chunk from the current pmap.
2270 		 * If that chunk contained available entries, we need to
2271 		 * re-count the number of available entries.
2272 		 */
2273 		if (reclaimed)
2274 			goto retry;
2275 	}
2276 	if (!TAILQ_EMPTY(&new_tail)) {
2277 		mtx_lock(&pv_chunks_mutex);
2278 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2279 		mtx_unlock(&pv_chunks_mutex);
2280 	}
2281 }
2282 
2283 /*
2284  * First find and then remove the pv entry for the specified pmap and virtual
2285  * address from the specified pv list.  Returns the pv entry if found and NULL
2286  * otherwise.  This operation can be performed on pv lists for either 4KB or
2287  * 2MB page mappings.
2288  */
2289 static __inline pv_entry_t
2290 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2291 {
2292 	pv_entry_t pv;
2293 
2294 	rw_assert(&pvh_global_lock, RA_LOCKED);
2295 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2296 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2297 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2298 			pvh->pv_gen++;
2299 			break;
2300 		}
2301 	}
2302 	return (pv);
2303 }
2304 
2305 /*
2306  * First find and then destroy the pv entry for the specified pmap and virtual
2307  * address.  This operation can be performed on pv lists for either 4KB or 2MB
2308  * page mappings.
2309  */
2310 static void
2311 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2312 {
2313 	pv_entry_t pv;
2314 
2315 	pv = pmap_pvh_remove(pvh, pmap, va);
2316 
2317 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
2318 	free_pv_entry(pmap, pv);
2319 }
2320 
2321 /*
2322  * Conditionally create the PV entry for a 4KB page mapping if the required
2323  * memory can be allocated without resorting to reclamation.
2324  */
2325 static bool
2326 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2327     struct rwlock **lockp)
2328 {
2329 	pv_entry_t pv;
2330 
2331 	rw_assert(&pvh_global_lock, RA_LOCKED);
2332 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2333 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2334 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2335 		pv->pv_va = va;
2336 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2337 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2338 		m->md.pv_gen++;
2339 		return (true);
2340 	} else
2341 		return (false);
2342 }
2343 
2344 /*
2345  * After demotion from a 2MB page mapping to 512 4KB page mappings,
2346  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2347  * entries for each of the 4KB page mappings.
2348  */
2349 static void __unused
2350 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2351     struct rwlock **lockp)
2352 {
2353 	struct md_page *pvh;
2354 	struct pv_chunk *pc;
2355 	pv_entry_t pv;
2356 	vm_page_t m;
2357 	vm_offset_t va_last;
2358 	int bit, field;
2359 
2360 	rw_assert(&pvh_global_lock, RA_LOCKED);
2361 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2362 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2363 
2364 	/*
2365 	 * Transfer the 2mpage's pv entry for this mapping to the first
2366 	 * page's pv list.  Once this transfer begins, the pv list lock
2367 	 * must not be released until the last pv entry is reinstantiated.
2368 	 */
2369 	pvh = pa_to_pvh(pa);
2370 	va &= ~L2_OFFSET;
2371 	pv = pmap_pvh_remove(pvh, pmap, va);
2372 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2373 	m = PHYS_TO_VM_PAGE(pa);
2374 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2375 	m->md.pv_gen++;
2376 	/* Instantiate the remaining 511 pv entries. */
2377 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2378 	va_last = va + L2_SIZE - PAGE_SIZE;
2379 	for (;;) {
2380 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2381 		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
2382 		for (field = 0; field < _NPCM; field++) {
2383 			while (pc->pc_map[field] != 0) {
2384 				bit = ffsl(pc->pc_map[field]) - 1;
2385 				pc->pc_map[field] &= ~(1ul << bit);
2386 				pv = &pc->pc_pventry[field * 64 + bit];
2387 				va += PAGE_SIZE;
2388 				pv->pv_va = va;
2389 				m++;
2390 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2391 			    ("pmap_pv_demote_l2: page %p is not managed", m));
2392 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2393 				m->md.pv_gen++;
2394 				if (va == va_last)
2395 					goto out;
2396 			}
2397 		}
2398 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2399 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2400 	}
2401 out:
2402 	if (pc_is_full(pc)) {
2403 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2404 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2405 	}
2406 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2407 	PV_STAT(atomic_add_int(&pv_entry_spare, -(Ln_ENTRIES - 1)));
2408 }
2409 
2410 #if VM_NRESERVLEVEL > 0
2411 static void
2412 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2413     struct rwlock **lockp)
2414 {
2415 	struct md_page *pvh;
2416 	pv_entry_t pv;
2417 	vm_page_t m;
2418 	vm_offset_t va_last;
2419 
2420 	rw_assert(&pvh_global_lock, RA_LOCKED);
2421 	KASSERT((pa & L2_OFFSET) == 0,
2422 	    ("pmap_pv_promote_l2: misaligned pa %#lx", pa));
2423 
2424 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2425 
2426 	m = PHYS_TO_VM_PAGE(pa);
2427 	va = va & ~L2_OFFSET;
2428 	pv = pmap_pvh_remove(&m->md, pmap, va);
2429 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
2430 	pvh = pa_to_pvh(pa);
2431 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2432 	pvh->pv_gen++;
2433 
2434 	va_last = va + L2_SIZE - PAGE_SIZE;
2435 	do {
2436 		m++;
2437 		va += PAGE_SIZE;
2438 		pmap_pvh_free(&m->md, pmap, va);
2439 	} while (va < va_last);
2440 }
2441 #endif /* VM_NRESERVLEVEL > 0 */
2442 
2443 /*
2444  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
2445  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
2446  * false if the PV entry cannot be allocated without resorting to reclamation.
2447  */
2448 static bool
2449 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2450     struct rwlock **lockp)
2451 {
2452 	struct md_page *pvh;
2453 	pv_entry_t pv;
2454 	vm_paddr_t pa;
2455 
2456 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2457 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2458 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2459 	    NULL : lockp)) == NULL)
2460 		return (false);
2461 	pv->pv_va = va;
2462 	pa = PTE_TO_PHYS(l2e);
2463 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2464 	pvh = pa_to_pvh(pa);
2465 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2466 	pvh->pv_gen++;
2467 	return (true);
2468 }
2469 
2470 static void
2471 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
2472 {
2473 	pt_entry_t newl2, oldl2 __diagused;
2474 	vm_page_t ml3;
2475 	vm_paddr_t ml3pa;
2476 
2477 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
2478 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
2479 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2480 
2481 	ml3 = pmap_remove_pt_page(pmap, va);
2482 	if (ml3 == NULL)
2483 		panic("pmap_remove_kernel_l2: Missing pt page");
2484 
2485 	ml3pa = VM_PAGE_TO_PHYS(ml3);
2486 	newl2 = ml3pa | PTE_V;
2487 
2488 	/*
2489 	 * If this page table page was unmapped by a promotion, then it
2490 	 * contains valid mappings.  Zero it to invalidate those mappings.
2491 	 */
2492 	if (vm_page_any_valid(ml3))
2493 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
2494 
2495 	/*
2496 	 * Demote the mapping.
2497 	 */
2498 	oldl2 = pmap_load_store(l2, newl2);
2499 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2500 	    __func__, l2, oldl2));
2501 }
2502 
2503 /*
2504  * pmap_remove_l2: Unmap a level 2 (2MB) superpage mapping.
2505  */
2506 static int
2507 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2508     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2509 {
2510 	struct md_page *pvh;
2511 	pt_entry_t oldl2;
2512 	vm_offset_t eva, va;
2513 	vm_page_t m, ml3;
2514 
2515 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2516 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2517 	oldl2 = pmap_load_clear(l2);
2518 	KASSERT((oldl2 & PTE_RWX) != 0,
2519 	    ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
2520 
2521 	/*
2522 	 * The sfence.vma documentation states that it is sufficient to specify
2523 	 * a single address within a superpage mapping.  However, since we do
2524 	 * not perform any invalidation upon promotion, TLBs may still be
2525 	 * caching 4KB mappings within the superpage, so we must invalidate the
2526 	 * entire range.
2527 	 */
2528 	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
2529 	if ((oldl2 & PTE_SW_WIRED) != 0)
2530 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2531 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2532 	if ((oldl2 & PTE_SW_MANAGED) != 0) {
2533 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
2534 		pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
2535 		pmap_pvh_free(pvh, pmap, sva);
2536 		eva = sva + L2_SIZE;
2537 		for (va = sva, m = PTE_TO_VM_PAGE(oldl2);
2538 		    va < eva; va += PAGE_SIZE, m++) {
2539 			if ((oldl2 & PTE_D) != 0)
2540 				vm_page_dirty(m);
2541 			if ((oldl2 & PTE_A) != 0)
2542 				vm_page_aflag_set(m, PGA_REFERENCED);
2543 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2544 			    TAILQ_EMPTY(&pvh->pv_list))
2545 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2546 		}
2547 	}
2548 	if (pmap == kernel_pmap) {
2549 		pmap_remove_kernel_l2(pmap, l2, sva);
2550 	} else {
2551 		ml3 = pmap_remove_pt_page(pmap, sva);
2552 		if (ml3 != NULL) {
2553 			KASSERT(vm_page_any_valid(ml3),
2554 			    ("pmap_remove_l2: l3 page not promoted"));
2555 			pmap_resident_count_dec(pmap, 1);
2556 			KASSERT(ml3->ref_count == Ln_ENTRIES,
2557 			    ("pmap_remove_l2: l3 page ref count error"));
2558 			ml3->ref_count = 1;
2559 			vm_page_unwire_noq(ml3);
2560 			pmap_add_delayed_free_list(ml3, free, false);
2561 		}
2562 	}
2563 	return (pmap_unuse_pt(pmap, sva, l1e, free));
2564 }
2565 
2566 /*
2567  * pmap_remove_l3: Unmap a single 4KB page from a process's address space.
2568  */
2569 static int
2570 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2571     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2572 {
2573 	struct md_page *pvh;
2574 	pt_entry_t old_l3;
2575 	vm_page_t m;
2576 
2577 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2578 	old_l3 = pmap_load_clear(l3);
2579 	pmap_invalidate_page(pmap, va);
2580 	if (old_l3 & PTE_SW_WIRED)
2581 		pmap->pm_stats.wired_count -= 1;
2582 	pmap_resident_count_dec(pmap, 1);
2583 	if (old_l3 & PTE_SW_MANAGED) {
2584 		m = PTE_TO_VM_PAGE(old_l3);
2585 		if ((old_l3 & PTE_D) != 0)
2586 			vm_page_dirty(m);
2587 		if (old_l3 & PTE_A)
2588 			vm_page_aflag_set(m, PGA_REFERENCED);
2589 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2590 		pmap_pvh_free(&m->md, pmap, va);
2591 		if (TAILQ_EMPTY(&m->md.pv_list) &&
2592 		    (m->flags & PG_FICTITIOUS) == 0) {
2593 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2594 			if (TAILQ_EMPTY(&pvh->pv_list))
2595 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2596 		}
2597 	}
2598 
2599 	return (pmap_unuse_pt(pmap, va, l2e, free));
2600 }
2601 
2602 /*
2603  *	Remove the given range of addresses from the specified map.
2604  *
2605  *	It is assumed that the start and end are properly
2606  *	rounded to the page size.
2607  */
2608 void
2609 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2610 {
2611 	struct spglist free;
2612 	struct rwlock *lock;
2613 	vm_offset_t va, va_next;
2614 	pd_entry_t *l0, *l1, *l2, l2e;
2615 	pt_entry_t *l3;
2616 
2617 	/*
2618 	 * Perform an unsynchronized read.  This is, however, safe.
2619 	 */
2620 	if (pmap->pm_stats.resident_count == 0)
2621 		return;
2622 
2623 	SLIST_INIT(&free);
2624 
2625 	rw_rlock(&pvh_global_lock);
2626 	PMAP_LOCK(pmap);
2627 
2628 	lock = NULL;
2629 	for (; sva < eva; sva = va_next) {
2630 		if (pmap->pm_stats.resident_count == 0)
2631 			break;
2632 
2633 		if (pmap_mode == PMAP_MODE_SV48) {
2634 			l0 = pmap_l0(pmap, sva);
2635 			if (pmap_load(l0) == 0) {
2636 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2637 				if (va_next < sva)
2638 					va_next = eva;
2639 				continue;
2640 			}
2641 			l1 = pmap_l0_to_l1(l0, sva);
2642 		} else {
2643 			l1 = pmap_l1(pmap, sva);
2644 		}
2645 
2646 		if (pmap_load(l1) == 0) {
2647 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2648 			if (va_next < sva)
2649 				va_next = eva;
2650 			continue;
2651 		}
2652 
2653 		/*
2654 		 * Calculate index for next page table.
2655 		 */
2656 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2657 		if (va_next < sva)
2658 			va_next = eva;
2659 
2660 		l2 = pmap_l1_to_l2(l1, sva);
2661 		if ((l2e = pmap_load(l2)) == 0)
2662 			continue;
2663 		if ((l2e & PTE_RWX) != 0) {
2664 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2665 				(void)pmap_remove_l2(pmap, l2, sva,
2666 				    pmap_load(l1), &free, &lock);
2667 				continue;
2668 			} else if (!pmap_demote_l2_locked(pmap, l2, sva,
2669 			    &lock)) {
2670 				/*
2671 				 * The large page mapping was destroyed.
2672 				 */
2673 				continue;
2674 			}
2675 			l2e = pmap_load(l2);
2676 		}
2677 
2678 		/*
2679 		 * Limit our scan to either the end of the va represented
2680 		 * by the current page table page, or to the end of the
2681 		 * range being removed.
2682 		 */
2683 		if (va_next > eva)
2684 			va_next = eva;
2685 
2686 		va = va_next;
2687 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2688 		    sva += L3_SIZE) {
2689 			if (pmap_load(l3) == 0) {
2690 				if (va != va_next) {
2691 					pmap_invalidate_range(pmap, va, sva);
2692 					va = va_next;
2693 				}
2694 				continue;
2695 			}
2696 			if (va == va_next)
2697 				va = sva;
2698 			if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
2699 				sva += L3_SIZE;
2700 				break;
2701 			}
2702 		}
2703 		if (va != va_next)
2704 			pmap_invalidate_range(pmap, va, sva);
2705 	}
2706 	if (lock != NULL)
2707 		rw_wunlock(lock);
2708 	rw_runlock(&pvh_global_lock);
2709 	PMAP_UNLOCK(pmap);
2710 	vm_page_free_pages_toq(&free, false);
2711 }
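/*
 * Illustrative note (editorial): page table pages released while tearing
 * down mappings are not freed inline.  They are queued on the local "free"
 * list via pmap_add_delayed_free_list() and only returned to the VM system
 * by vm_page_free_pages_toq() once the TLB invalidations above have been
 * issued.
 */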
2712 
2713 /*
2714  *	Routine:	pmap_remove_all
2715  *	Function:
2716  *		Removes this physical page from
2717  *		all physical maps in which it resides.
2718  *		Reflects back modify bits to the pager.
2719  *
2720  *	Notes:
2721  *		Original versions of this routine were very
2722  *		inefficient because they iteratively called
2723  *		pmap_remove (slow...)
2724  */
2725 
2726 void
2727 pmap_remove_all(vm_page_t m)
2728 {
2729 	struct spglist free;
2730 	struct md_page *pvh;
2731 	pmap_t pmap;
2732 	pt_entry_t *l3, l3e;
2733 	pd_entry_t *l2, l2e __diagused;
2734 	pv_entry_t pv;
2735 	vm_offset_t va;
2736 
2737 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2738 	    ("pmap_remove_all: page %p is not managed", m));
2739 	SLIST_INIT(&free);
2740 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2741 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
2742 
2743 	rw_wlock(&pvh_global_lock);
2744 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2745 		pmap = PV_PMAP(pv);
2746 		PMAP_LOCK(pmap);
2747 		va = pv->pv_va;
2748 		l2 = pmap_l2(pmap, va);
2749 		(void)pmap_demote_l2(pmap, l2, va);
2750 		PMAP_UNLOCK(pmap);
2751 	}
2752 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2753 		pmap = PV_PMAP(pv);
2754 		PMAP_LOCK(pmap);
2755 		pmap_resident_count_dec(pmap, 1);
2756 		l2 = pmap_l2(pmap, pv->pv_va);
2757 		KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
2758 		l2e = pmap_load(l2);
2759 
2760 		KASSERT((l2e & PTE_RX) == 0,
2761 		    ("pmap_remove_all: found a superpage in %p's pv list", m));
2762 
2763 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
2764 		l3e = pmap_load_clear(l3);
2765 		pmap_invalidate_page(pmap, pv->pv_va);
2766 		if (l3e & PTE_SW_WIRED)
2767 			pmap->pm_stats.wired_count--;
2768 		if ((l3e & PTE_A) != 0)
2769 			vm_page_aflag_set(m, PGA_REFERENCED);
2770 
2771 		/*
2772 		 * Update the vm_page_t clean and reference bits.
2773 		 */
2774 		if ((l3e & PTE_D) != 0)
2775 			vm_page_dirty(m);
2776 		pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
2777 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2778 		m->md.pv_gen++;
2779 		free_pv_entry(pmap, pv);
2780 		PMAP_UNLOCK(pmap);
2781 	}
2782 	vm_page_aflag_clear(m, PGA_WRITEABLE);
2783 	rw_wunlock(&pvh_global_lock);
2784 	vm_page_free_pages_toq(&free, false);
2785 }
2786 
2787 /*
2788  *	Set the physical protection on the
2789  *	specified range of this map as requested.
2790  */
2791 void
2792 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2793 {
2794 	pd_entry_t *l0, *l1, *l2, l2e;
2795 	pt_entry_t *l3, l3e, mask;
2796 	vm_page_t m, mt;
2797 	vm_offset_t va_next;
2798 	bool anychanged, pv_lists_locked;
2799 
2800 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2801 		pmap_remove(pmap, sva, eva);
2802 		return;
2803 	}
2804 
2805 	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2806 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
2807 		return;
2808 
2809 	anychanged = false;
2810 	pv_lists_locked = false;
2811 	mask = 0;
2812 	if ((prot & VM_PROT_WRITE) == 0)
2813 		mask |= PTE_W | PTE_D;
2814 	if ((prot & VM_PROT_EXECUTE) == 0)
2815 		mask |= PTE_X;
2816 resume:
2817 	PMAP_LOCK(pmap);
2818 	for (; sva < eva; sva = va_next) {
2819 		if (pmap_mode == PMAP_MODE_SV48) {
2820 			l0 = pmap_l0(pmap, sva);
2821 			if (pmap_load(l0) == 0) {
2822 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2823 				if (va_next < sva)
2824 					va_next = eva;
2825 				continue;
2826 			}
2827 			l1 = pmap_l0_to_l1(l0, sva);
2828 		} else {
2829 			l1 = pmap_l1(pmap, sva);
2830 		}
2831 
2832 		if (pmap_load(l1) == 0) {
2833 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2834 			if (va_next < sva)
2835 				va_next = eva;
2836 			continue;
2837 		}
2838 
2839 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2840 		if (va_next < sva)
2841 			va_next = eva;
2842 
2843 		l2 = pmap_l1_to_l2(l1, sva);
2844 		if ((l2e = pmap_load(l2)) == 0)
2845 			continue;
2846 		if ((l2e & PTE_RWX) != 0) {
2847 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2848 retryl2:
2849 				if ((prot & VM_PROT_WRITE) == 0 &&
2850 				    (l2e & (PTE_SW_MANAGED | PTE_D)) ==
2851 				    (PTE_SW_MANAGED | PTE_D)) {
2852 					m = PTE_TO_VM_PAGE(l2e);
2853 					for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
2854 						vm_page_dirty(mt);
2855 				}
2856 				if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
2857 					goto retryl2;
2858 				anychanged = true;
2859 				continue;
2860 			} else {
2861 				if (!pv_lists_locked) {
2862 					pv_lists_locked = true;
2863 					if (!rw_try_rlock(&pvh_global_lock)) {
2864 						if (anychanged)
2865 							pmap_invalidate_all(
2866 							    pmap);
2867 						PMAP_UNLOCK(pmap);
2868 						rw_rlock(&pvh_global_lock);
2869 						goto resume;
2870 					}
2871 				}
2872 				if (!pmap_demote_l2(pmap, l2, sva)) {
2873 					/*
2874 					 * The large page mapping was destroyed.
2875 					 */
2876 					continue;
2877 				}
2878 			}
2879 		}
2880 
2881 		if (va_next > eva)
2882 			va_next = eva;
2883 
2884 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2885 		    sva += L3_SIZE) {
2886 			l3e = pmap_load(l3);
2887 retryl3:
2888 			if ((l3e & PTE_V) == 0)
2889 				continue;
2890 			if ((prot & VM_PROT_WRITE) == 0 &&
2891 			    (l3e & (PTE_SW_MANAGED | PTE_D)) ==
2892 			    (PTE_SW_MANAGED | PTE_D)) {
2893 				m = PTE_TO_VM_PAGE(l3e);
2894 				vm_page_dirty(m);
2895 			}
2896 			if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
2897 				goto retryl3;
2898 			anychanged = true;
2899 		}
2900 	}
2901 	if (anychanged)
2902 		pmap_invalidate_all(pmap);
2903 	if (pv_lists_locked)
2904 		rw_runlock(&pvh_global_lock);
2905 	PMAP_UNLOCK(pmap);
2906 }
2907 
2908 int
2909 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
2910 {
2911 	pd_entry_t *l2, l2e;
2912 	pt_entry_t bits, *pte, oldpte;
2913 	int rv;
2914 
2915 	KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va));
2916 
2917 	rv = 0;
2918 	PMAP_LOCK(pmap);
2919 	l2 = pmap_l2(pmap, va);
2920 	if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
2921 		goto done;
2922 	if ((l2e & PTE_RWX) == 0) {
2923 		pte = pmap_l2_to_l3(l2, va);
2924 		if (((oldpte = pmap_load(pte)) & PTE_V) == 0)
2925 			goto done;
2926 	} else {
2927 		pte = l2;
2928 		oldpte = l2e;
2929 	}
2930 
2931 	if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) ||
2932 	    (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) ||
2933 	    (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) ||
2934 	    (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0))
2935 		goto done;
2936 
2937 	bits = PTE_A;
2938 	if (ftype == VM_PROT_WRITE)
2939 		bits |= PTE_D;
2940 
2941 	/*
2942 	 * Spurious faults can occur if the implementation caches invalid
2943 	 * entries in the TLB, or if simultaneous accesses on multiple CPUs
2944 	 * race with each other.
2945 	 */
2946 	if ((oldpte & bits) != bits)
2947 		pmap_store_bits(pte, bits);
2948 	sfence_vma();
2949 	rv = 1;
2950 done:
2951 	PMAP_UNLOCK(pmap);
2952 	return (rv);
2953 }
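/*
 * Illustrative note (editorial): besides filtering genuine protection
 * faults, pmap_fault() performs software accessed/dirty maintenance: PTE_A
 * is set on any valid access and PTE_D is additionally set for writes.
 * This resolves faults taken on implementations that do not update A/D
 * bits in hardware, as well as the spurious faults mentioned above.
 */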
2954 
2955 /*
2956  *	Demote the specified L1 page to separate L2 pages.
2957  *	Demote the specified 1GB L1 mapping to separate 2MB L2 mappings.
2958  */
2959 static bool
2960 pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va)
2961 {
2962 	vm_page_t m;
2963 	pt_entry_t *l2, oldl1, newl2;
2964 	pd_entry_t newl1;
2965 	vm_paddr_t l2phys;
2966 
2967 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2968 
2969 	oldl1 = pmap_load(l1);
2970 	KASSERT((oldl1 & PTE_RWX) != 0,
2971 	    ("pmap_demote_l1: oldl1 is not a leaf PTE"));
2972 	KASSERT((oldl1 & PTE_A) != 0,
2973 	    ("pmap_demote_l1: oldl1 is missing PTE_A"));
2974 	KASSERT((oldl1 & (PTE_D | PTE_W)) != PTE_W,
2975 	    ("pmap_demote_l1: not dirty!"));
2976 	KASSERT((oldl1 & PTE_SW_MANAGED) == 0,
2977 	    ("pmap_demote_l1: L1 table shouldn't be managed"));
2978 	KASSERT(VIRT_IN_DMAP(va),
2979 	    ("pmap_demote_l1: is unsupported for non-DMAP va=%#lx", va));
2980 
2981 	/* Demoting L1 means we need to allocate a new page-table page. */
2982 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
2983 	if (m == NULL) {
2984 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx in pmap %p",
2985 		    va, pmap);
2986 		return (false);
2987 	}
2988 
2989 	l2phys = VM_PAGE_TO_PHYS(m);
2990 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
2991 
2992 	/*
2993 	 * Create new entries, relying on the fact that only the low bits
2994 	 * (index) of the physical address are changing.
2995 	 */
2996 	newl2 = oldl1;
2997 	for (int i = 0; i < Ln_ENTRIES; i++)
2998 		pmap_store(&l2[i], newl2 | (i << PTE_PPN1_S));
2999 
3000 	/*
3001 	 * And update the L1 entry.
3002 	 *
3003 	 * NB: flushing the TLB is the responsibility of the caller. Cached
3004 	 * translations are still "correct" for demoted mappings until some
3005 	 * subset of the demoted range is modified.
3006 	 */
3007 	newl1 = ((l2phys / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
3008 	pmap_store(l1, newl1);
3009 
3010 	counter_u64_add(pmap_l1_demotions, 1);
3011 	CTR2(KTR_PMAP, "pmap_demote_l1: success for va %#lx in pmap %p",
3012 	    va, pmap);
3013 	return (true);
3014 }
3015 
3016 static bool
3017 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va)
3018 {
3019 	struct rwlock *lock;
3020 	bool rv;
3021 
3022 	lock = NULL;
3023 	rv = pmap_demote_l2_locked(pmap, l2, va, &lock);
3024 	if (lock != NULL)
3025 		rw_wunlock(lock);
3026 	return (rv);
3027 }
3028 
3029 /*
3030  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3031  * mapping is invalidated.
3032  */
3033 static bool
3034 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
3035     struct rwlock **lockp)
3036 {
3037 	struct spglist free;
3038 	vm_page_t mpte;
3039 	pd_entry_t newl2, oldl2;
3040 	pt_entry_t *firstl3, newl3;
3041 	vm_paddr_t mptepa;
3042 	int i;
3043 
3044 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3045 
3046 	oldl2 = pmap_load(l2);
3047 	KASSERT((oldl2 & PTE_RWX) != 0,
3048 	    ("pmap_demote_l2_locked: oldl2 is not a leaf entry"));
3049 	if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
3050 	    NULL) {
3051 		KASSERT((oldl2 & PTE_SW_WIRED) == 0,
3052 		    ("pmap_demote_l2_locked: page table page for a wired mapping is missing"));
3053 		if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj(
3054 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
3055 		    VM_ALLOC_WIRED)) == NULL) {
3056 			SLIST_INIT(&free);
3057 			(void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET,
3058 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
3059 			vm_page_free_pages_toq(&free, true);
3060 			CTR2(KTR_PMAP, "pmap_demote_l2_locked: "
3061 			    "failure for va %#lx in pmap %p", va, pmap);
3062 			return (false);
3063 		}
3064 		mpte->pindex = pmap_l2_pindex(va);
3065 		if (va < VM_MAXUSER_ADDRESS) {
3066 			mpte->ref_count = Ln_ENTRIES;
3067 			pmap_resident_count_inc(pmap, 1);
3068 		}
3069 	}
3070 	mptepa = VM_PAGE_TO_PHYS(mpte);
3071 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3072 	newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
3073 	KASSERT((oldl2 & PTE_A) != 0,
3074 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_A"));
3075 	KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W,
3076 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_D"));
3077 	newl3 = oldl2;
3078 
3079 	/*
3080 	 * If the page table page is not leftover from an earlier promotion,
3081 	 * initialize it.
3082 	 */
3083 	if (!vm_page_all_valid(mpte)) {
3084 		for (i = 0; i < Ln_ENTRIES; i++)
3085 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
3086 	}
3087 	KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3),
3088 	    ("pmap_demote_l2_locked: firstl3 and newl3 map different physical "
3089 	    "addresses"));
3090 
3091 	/*
3092 	 * If the mapping has changed attributes, update the PTEs.
3093 	 */
3094 	if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE))
3095 		for (i = 0; i < Ln_ENTRIES; i++)
3096 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
3097 
3098 	/*
3099 	 * The spare PV entries must be reserved prior to demoting the
3100 	 * mapping, that is, prior to changing the L2 entry.  Otherwise, the
3101 	 * state of the L2 entry and the PV lists will be inconsistent, which
3102 	 * can result in reclaim_pv_chunk() attempting to remove a PV entry from
3103 	 * the wrong PV list and pmap_pv_demote_l2() failing to find the
3104 	 * expected PV entry for the 2MB page mapping that is being demoted.
3105 	 */
3106 	if ((oldl2 & PTE_SW_MANAGED) != 0)
3107 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
3108 
3109 	/*
3110 	 * Demote the mapping.
3111 	 */
3112 	pmap_store(l2, newl2);
3113 
3114 	/*
3115 	 * Demote the PV entry.
3116 	 */
3117 	if ((oldl2 & PTE_SW_MANAGED) != 0)
3118 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
3119 
3120 	atomic_add_long(&pmap_l2_demotions, 1);
3121 	CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p",
3122 	    va, pmap);
3123 	return (true);
3124 }
3125 
3126 #if VM_NRESERVLEVEL > 0
3127 static bool
3128 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3,
3129     struct rwlock **lockp)
3130 {
3131 	pt_entry_t all_l3e_PTE_A, *firstl3, firstl3e, *l3, l3e;
3132 	vm_paddr_t pa;
3133 
3134 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3135 	if (!pmap_ps_enabled(pmap))
3136 		return (false);
3137 
3138 	KASSERT((pmap_load(l2) & PTE_RWX) == 0,
3139 	    ("pmap_promote_l2: invalid l2 entry %p", l2));
3140 
3141 	/*
3142 	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
3143 	 * ineligible for promotion or does not map the first 4KB physical page
3144 	 * within a 2MB page.
3145 	 */
3146 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
3147 	firstl3e = pmap_load(firstl3);
3148 	pa = PTE_TO_PHYS(firstl3e);
3149 	if ((pa & L2_OFFSET) != 0) {
3150 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3151 		    va, pmap);
3152 		atomic_add_long(&pmap_l2_p_failures, 1);
3153 		return (false);
3154 	}
3155 
3156 	/*
3157 	 * Downgrade a clean, writable mapping to read-only to ensure that the
3158 	 * hardware does not set PTE_D while we are comparing PTEs.
3159 	 *
3160 	 * Upon a write access to a clean mapping, the implementation will
3161 	 * either atomically check protections and set PTE_D, or raise a page
3162 	 * fault.  In the latter case, the pmap lock provides atomicity.  Thus,
3163 	 * we do not issue an sfence.vma here and instead rely on pmap_fault()
3164 	 * to do so lazily.
3165 	 */
3166 	while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) {
3167 		if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) {
3168 			firstl3e &= ~PTE_W;
3169 			break;
3170 		}
3171 	}
3172 
3173 	/*
3174 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3175 	 * PTE maps an unexpected 4KB physical page or does not have identical
3176 	 * characteristics to the first PTE.
3177 	 */
3178 	all_l3e_PTE_A = firstl3e & PTE_A;
3179 	pa += L2_SIZE - PAGE_SIZE;
3180 	for (l3 = firstl3 + Ln_ENTRIES - 1; l3 > firstl3; l3--) {
3181 		l3e = pmap_load(l3);
3182 		if (PTE_TO_PHYS(l3e) != pa) {
3183 			CTR2(KTR_PMAP,
3184 			    "pmap_promote_l2: failure for va %#lx pmap %p",
3185 			    va, pmap);
3186 			atomic_add_long(&pmap_l2_p_failures, 1);
3187 			return (false);
3188 		}
3189 		while ((l3e & (PTE_W | PTE_D)) == PTE_W) {
3190 			if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) {
3191 				l3e &= ~PTE_W;
3192 				break;
3193 			}
3194 		}
3195 		if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) {
3196 			CTR2(KTR_PMAP,
3197 			    "pmap_promote_l2: failure for va %#lx pmap %p",
3198 			    va, pmap);
3199 			atomic_add_long(&pmap_l2_p_failures, 1);
3200 			return (false);
3201 		}
3202 		all_l3e_PTE_A &= l3e;
3203 		pa -= PAGE_SIZE;
3204 	}
3205 
3206 	/*
3207 	 * Unless all PTEs have PTE_A set, clear it from the superpage
3208 	 * mapping, so that promotions triggered by speculative mappings,
3209 	 * such as pmap_enter_quick(), don't automatically mark the
3210 	 * underlying pages as referenced.
3211 	 */
3212 	firstl3e &= ~PTE_A | all_l3e_PTE_A;
3213 
3214 	/*
3215 	 * Save the page table page in its current state until the L2
3216 	 * mapping the superpage is demoted by pmap_demote_l2() or
3217 	 * destroyed by pmap_remove_l3().
3218 	 */
3219 	if (ml3 == NULL)
3220 		ml3 = PTE_TO_VM_PAGE(pmap_load(l2));
3221 	KASSERT(ml3->pindex == pmap_l2_pindex(va),
3222 	    ("pmap_promote_l2: page table page's pindex is wrong"));
3223 	if (pmap_insert_pt_page(pmap, ml3, true, all_l3e_PTE_A != 0)) {
3224 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3225 		    va, pmap);
3226 		atomic_add_long(&pmap_l2_p_failures, 1);
3227 		return (false);
3228 	}
3229 
3230 	if ((firstl3e & PTE_SW_MANAGED) != 0)
3231 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp);
3232 
3233 	pmap_store(l2, firstl3e);
3234 
3235 	atomic_add_long(&pmap_l2_promotions, 1);
3236 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
3237 	    pmap);
3238 	return (true);
3239 }
3240 #endif
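/*
 * Illustrative summary (derived from the code above): promotion to a 2MB
 * mapping succeeds only when all 512 L3 PTEs in the page table page map
 * physically contiguous, 2MB-aligned pages and agree on the attributes in
 * PTE_PROMOTE.  Clean but writable PTEs are first downgraded to read-only
 * so that the hardware cannot set PTE_D while the PTEs are being compared.
 */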
3241 
3242 /*
3243  *	Insert the given physical page (p) at
3244  *	the specified virtual address (v) in the
3245  *	target physical map with the protection requested.
3246  *
3247  *	If specified, the page will be wired down, meaning
3248  *	that the related pte can not be reclaimed.
3249  *
3250  *	NB:  This is the only routine which MAY NOT lazy-evaluate
3251  *	or lose information.  That is, this routine must actually
3252  *	insert this page into the given map NOW.
3253  */
3254 int
3255 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3256     u_int flags, int8_t psind)
3257 {
3258 	struct rwlock *lock;
3259 	pd_entry_t *l2, l2e;
3260 	pt_entry_t new_l3, orig_l3;
3261 	pt_entry_t *l3;
3262 	pv_entry_t pv;
3263 	vm_paddr_t opa, pa;
3264 	vm_page_t mpte, om;
3265 	pn_t pn;
3266 	int rv;
3267 	bool nosleep;
3268 
3269 	va = trunc_page(va);
3270 	if ((m->oflags & VPO_UNMANAGED) == 0)
3271 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
3272 	pa = VM_PAGE_TO_PHYS(m);
3273 	pn = (pa / PAGE_SIZE);
3274 
3275 	new_l3 = PTE_V | PTE_R | PTE_A;
3276 	if (prot & VM_PROT_EXECUTE)
3277 		new_l3 |= PTE_X;
3278 	if (flags & VM_PROT_WRITE)
3279 		new_l3 |= PTE_D;
3280 	if (prot & VM_PROT_WRITE)
3281 		new_l3 |= PTE_W;
3282 	if (va < VM_MAX_USER_ADDRESS)
3283 		new_l3 |= PTE_U;
3284 
3285 	new_l3 |= (pn << PTE_PPN0_S);
3286 	if ((flags & PMAP_ENTER_WIRED) != 0)
3287 		new_l3 |= PTE_SW_WIRED;
3288 	new_l3 |= pmap_memattr_bits(m->md.pv_memattr);
3289 
3290 	/*
3291 	 * Set modified bit gratuitously for writeable mappings if
3292 	 * the page is unmanaged. We do not want to take a fault
3293 	 * to do the dirty bit accounting for these mappings.
3294 	 */
3295 	if ((m->oflags & VPO_UNMANAGED) != 0) {
3296 		if (prot & VM_PROT_WRITE)
3297 			new_l3 |= PTE_D;
3298 	} else
3299 		new_l3 |= PTE_SW_MANAGED;
3300 
3301 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
3302 
3303 	lock = NULL;
3304 	mpte = NULL;
3305 	rw_rlock(&pvh_global_lock);
3306 	PMAP_LOCK(pmap);
3307 	if (psind == 1) {
3308 		/* Assert the required virtual and physical alignment. */
3309 		KASSERT((va & L2_OFFSET) == 0,
3310 		    ("pmap_enter: va %#lx unaligned", va));
3311 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
3312 		rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock);
3313 		goto out;
3314 	}
3315 
3316 	l2 = pmap_l2(pmap, va);
3317 	if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
3318 	    ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2,
3319 	    va, &lock))) {
3320 		l3 = pmap_l2_to_l3(l2, va);
3321 		if (va < VM_MAXUSER_ADDRESS) {
3322 			mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3323 			mpte->ref_count++;
3324 		}
3325 	} else if (va < VM_MAXUSER_ADDRESS) {
3326 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
3327 		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
3328 		if (mpte == NULL && nosleep) {
3329 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
3330 			if (lock != NULL)
3331 				rw_wunlock(lock);
3332 			rw_runlock(&pvh_global_lock);
3333 			PMAP_UNLOCK(pmap);
3334 			return (KERN_RESOURCE_SHORTAGE);
3335 		}
3336 		l3 = pmap_l3(pmap, va);
3337 	} else {
3338 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
3339 	}
3340 
3341 	orig_l3 = pmap_load(l3);
3342 	opa = PTE_TO_PHYS(orig_l3);
3343 	pv = NULL;
3344 
3345 	/*
3346 	 * Is the specified virtual address already mapped?
3347 	 */
3348 	if ((orig_l3 & PTE_V) != 0) {
3349 		/*
3350 		 * Wiring change, just update stats. We don't worry about
3351 		 * wiring PT pages as they remain resident as long as there
3352 		 * are valid mappings in them. Hence, if a user page is wired,
3353 		 * the PT page will be also.
3354 		 */
3355 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
3356 		    (orig_l3 & PTE_SW_WIRED) == 0)
3357 			pmap->pm_stats.wired_count++;
3358 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
3359 		    (orig_l3 & PTE_SW_WIRED) != 0)
3360 			pmap->pm_stats.wired_count--;
3361 
3362 		/*
3363 		 * Remove the extra PT page reference.
3364 		 */
3365 		if (mpte != NULL) {
3366 			mpte->ref_count--;
3367 			KASSERT(mpte->ref_count > 0,
3368 			    ("pmap_enter: missing reference to page table page,"
3369 			     " va: 0x%lx", va));
3370 		}
3371 
3372 		/*
3373 		 * Has the physical page changed?
3374 		 */
3375 		if (opa == pa) {
3376 			/*
3377 			 * No, might be a protection or wiring change.
3378 			 */
3379 			if ((orig_l3 & PTE_SW_MANAGED) != 0 &&
3380 			    (new_l3 & PTE_W) != 0)
3381 				vm_page_aflag_set(m, PGA_WRITEABLE);
3382 			goto validate;
3383 		}
3384 
3385 		/*
3386 		 * The physical page has changed.  Temporarily invalidate
3387 		 * the mapping.  This ensures that all threads sharing the
3388 		 * pmap keep a consistent view of the mapping, which is
3389 		 * necessary for the correct handling of COW faults.  It
3390 		 * also permits reuse of the old mapping's PV entry,
3391 		 * avoiding an allocation.
3392 		 *
3393 		 * For consistency, handle unmanaged mappings the same way.
3394 		 */
3395 		orig_l3 = pmap_load_clear(l3);
3396 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
3397 		    ("pmap_enter: unexpected pa update for %#lx", va));
3398 		if ((orig_l3 & PTE_SW_MANAGED) != 0) {
3399 			om = PHYS_TO_VM_PAGE(opa);
3400 
3401 			/*
3402 			 * The pmap lock is sufficient to synchronize with
3403 			 * concurrent calls to pmap_page_test_mappings() and
3404 			 * pmap_ts_referenced().
3405 			 */
3406 			if ((orig_l3 & PTE_D) != 0)
3407 				vm_page_dirty(om);
3408 			if ((orig_l3 & PTE_A) != 0)
3409 				vm_page_aflag_set(om, PGA_REFERENCED);
3410 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3411 			pv = pmap_pvh_remove(&om->md, pmap, va);
3412 			KASSERT(pv != NULL,
3413 			    ("pmap_enter: no PV entry for %#lx", va));
3414 			if ((new_l3 & PTE_SW_MANAGED) == 0)
3415 				free_pv_entry(pmap, pv);
3416 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3417 			    TAILQ_EMPTY(&om->md.pv_list) &&
3418 			    ((om->flags & PG_FICTITIOUS) != 0 ||
3419 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3420 				vm_page_aflag_clear(om, PGA_WRITEABLE);
3421 		}
3422 		pmap_invalidate_page(pmap, va);
3423 		orig_l3 = 0;
3424 	} else {
3425 		/*
3426 		 * Increment the counters.
3427 		 */
3428 		if ((new_l3 & PTE_SW_WIRED) != 0)
3429 			pmap->pm_stats.wired_count++;
3430 		pmap_resident_count_inc(pmap, 1);
3431 	}
3432 	/*
3433 	 * Enter on the PV list if part of our managed memory.
3434 	 */
3435 	if ((new_l3 & PTE_SW_MANAGED) != 0) {
3436 		if (pv == NULL) {
3437 			pv = get_pv_entry(pmap, &lock);
3438 			pv->pv_va = va;
3439 		}
3440 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3441 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3442 		m->md.pv_gen++;
3443 		if ((new_l3 & PTE_W) != 0)
3444 			vm_page_aflag_set(m, PGA_WRITEABLE);
3445 	}
3446 
3447 validate:
3448 	/*
3449 	 * Sync the i-cache on all harts before updating the PTE
3450 	 * if the new PTE is executable.
3451 	 */
3452 	if (prot & VM_PROT_EXECUTE)
3453 		pmap_sync_icache(pmap, va, PAGE_SIZE);
3454 
3455 	/*
3456 	 * Update the L3 entry.
3457 	 */
3458 	if (orig_l3 != 0) {
3459 		orig_l3 = pmap_load_store(l3, new_l3);
3460 		pmap_invalidate_page(pmap, va);
3461 		KASSERT(PTE_TO_PHYS(orig_l3) == pa,
3462 		    ("pmap_enter: invalid update"));
3463 		if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) ==
3464 		    (PTE_D | PTE_SW_MANAGED))
3465 			vm_page_dirty(m);
3466 	} else {
3467 		pmap_store(l3, new_l3);
3468 	}
3469 
3470 #if VM_NRESERVLEVEL > 0
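	/*
	 * If both the page table page and the reservation are fully
	 * populated, then attempt promotion to a 2MB superpage.
	 */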
3471 	if (mpte != NULL && mpte->ref_count == Ln_ENTRIES &&
3472 	    (m->flags & PG_FICTITIOUS) == 0 &&
3473 	    vm_reserv_level_iffullpop(m) == 0)
3474 		(void)pmap_promote_l2(pmap, l2, va, mpte, &lock);
3475 #endif
3476 
3477 	rv = KERN_SUCCESS;
3478 out:
3479 	if (lock != NULL)
3480 		rw_wunlock(lock);
3481 	rw_runlock(&pvh_global_lock);
3482 	PMAP_UNLOCK(pmap);
3483 	return (rv);
3484 }
3485 
3486 /*
3487  * Release a page table page reference after a failed attempt to create a
3488  * mapping.
3489  */
3490 static void
3491 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t l2pg)
3492 {
3493 	struct spglist free;
3494 
3495 	SLIST_INIT(&free);
3496 	if (pmap_unwire_ptp(pmap, va, l2pg, &free)) {
3497 		/*
3498 		 * Although "va" is not mapped, paging-structure
3499 		 * caches could nonetheless have entries that
3500 		 * refer to the freed page table pages.
3501 		 * Invalidate those entries.
3502 		 */
3503 		pmap_invalidate_page(pmap, va);
3504 		vm_page_free_pages_toq(&free, true);
3505 	}
3506 }
3507 
3508 /*
3509  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns
3510  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
3511  * value.  See pmap_enter_l2() for the possible error values when "no sleep",
3512  * "no replace", and "no reclaim" are specified.
3513  */
3514 static int
3515 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3516     struct rwlock **lockp)
3517 {
3518 	pd_entry_t new_l2;
3519 	pn_t pn;
3520 
3521 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3522 
3523 	pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
3524 	new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V |
3525 	    pmap_memattr_bits(m->md.pv_memattr));
3526 	if ((m->oflags & VPO_UNMANAGED) == 0)
3527 		new_l2 |= PTE_SW_MANAGED;
3528 	if ((prot & VM_PROT_EXECUTE) != 0)
3529 		new_l2 |= PTE_X;
3530 	if (va < VM_MAXUSER_ADDRESS)
3531 		new_l2 |= PTE_U;
3532 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
3533 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp));
3534 }
3535 
3536 /*
3537  * Returns true if every page table entry in the specified page table is
3538  * zero.
3539  */
3540 static bool
3541 pmap_every_pte_zero(vm_paddr_t pa)
3542 {
3543 	pt_entry_t *pt_end, *pte;
3544 
3545 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
3546 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
3547 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
3548 		if (*pte != 0)
3549 			return (false);
3550 	}
3551 	return (true);
3552 }
3553 
3554 /*
3555  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
3556  * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
3557  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
3558  * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists
3559  * within the 2MB virtual address range starting at the specified virtual
3560  * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
3561  * 2MB page mapping already exists at the specified virtual address.  Returns
3562  * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
3563  * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
3564  * and a PV entry allocation failed.
3565  *
3566  * The parameter "m" is only used when creating a managed, writeable mapping.
3567  */
3568 static int
3569 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
3570     vm_page_t m, struct rwlock **lockp)
3571 {
3572 	struct spglist free;
3573 	pd_entry_t *l2, *l3, oldl2;
3574 	vm_offset_t sva;
3575 	vm_page_t l2pg, mt;
3576 	vm_page_t uwptpg;
3577 
3578 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3579 
3580 	if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3581 	    NULL : lockp)) == NULL) {
3582 		CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page"
3583 		    " for va %#lx in pmap %p", va, pmap);
3584 		return (KERN_RESOURCE_SHORTAGE);
3585 	}
3586 
3587 	l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
3588 	l2 = &l2[pmap_l2_index(va)];
3589 	if ((oldl2 = pmap_load(l2)) != 0) {
3590 		KASSERT(l2pg->ref_count > 1,
3591 		    ("pmap_enter_l2: l2pg's ref count is too low"));
3592 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3593 			if ((oldl2 & PTE_RWX) != 0) {
3594 				l2pg->ref_count--;
3595 				CTR2(KTR_PMAP,
3596 				    "pmap_enter_l2: no space for va %#lx"
3597 				    " in pmap %p", va, pmap);
3598 				return (KERN_NO_SPACE);
3599 			} else if (va < VM_MAXUSER_ADDRESS ||
3600 			    !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) {
3601 				l2pg->ref_count--;
3602 				CTR2(KTR_PMAP, "pmap_enter_l2:"
3603 				    " failed to replace existing mapping"
3604 				    " for va %#lx in pmap %p", va, pmap);
3605 				return (KERN_FAILURE);
3606 			}
3607 		}
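		/*
		 * Remove the existing mapping(s) within the 2MB range: either
		 * a single superpage mapping or the individual 4KB mappings.
		 */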
3608 		SLIST_INIT(&free);
3609 		if ((oldl2 & PTE_RWX) != 0)
3610 			(void)pmap_remove_l2(pmap, l2, va,
3611 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
3612 		else
3613 			for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) {
3614 				l3 = pmap_l2_to_l3(l2, sva);
3615 				if ((pmap_load(l3) & PTE_V) != 0 &&
3616 				    pmap_remove_l3(pmap, l3, sva, oldl2, &free,
3617 				    lockp) != 0)
3618 					break;
3619 			}
3620 		vm_page_free_pages_toq(&free, true);
3621 		if (va >= VM_MAXUSER_ADDRESS) {
3622 			/*
3623 			 * Both pmap_remove_l2() and pmap_remove_l3() will
3624 			 * leave the kernel page table page zero filled.
3625 			 */
3626 			mt = PTE_TO_VM_PAGE(pmap_load(l2));
3627 			if (pmap_insert_pt_page(pmap, mt, false, false))
3628 				panic("pmap_enter_l2: trie insert failed");
3629 		} else
3630 			KASSERT(pmap_load(l2) == 0,
3631 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
3632 	}
3633 
3634 	/*
3635 	 * Allocate leaf ptpage for wired userspace pages.
3636 	 */
3637 	uwptpg = NULL;
3638 	if ((new_l2 & PTE_SW_WIRED) != 0 && pmap != kernel_pmap) {
3639 		uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3640 		if (uwptpg == NULL) {
3641 			pmap_abort_ptp(pmap, va, l2pg);
3642 			return (KERN_RESOURCE_SHORTAGE);
3643 		}
3644 		uwptpg->pindex = pmap_l2_pindex(va);
3645 		if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
3646 			vm_page_unwire_noq(uwptpg);
3647 			vm_page_free(uwptpg);
3648 			pmap_abort_ptp(pmap, va, l2pg);
3649 			return (KERN_RESOURCE_SHORTAGE);
3650 		}
3651 		pmap_resident_count_inc(pmap, 1);
3652 		uwptpg->ref_count = Ln_ENTRIES;
3653 	}
3654 	if ((new_l2 & PTE_SW_MANAGED) != 0) {
3655 		/*
3656 		 * Abort this mapping if its PV entry could not be created.
3657 		 */
3658 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
3659 			pmap_abort_ptp(pmap, va, l2pg);
3660 			if (uwptpg != NULL) {
3661 				mt = pmap_remove_pt_page(pmap, va);
3662 				KASSERT(mt == uwptpg,
3663 				    ("removed pt page %p, expected %p", mt,
3664 				    uwptpg));
3665 				pmap_resident_count_dec(pmap, 1);
3666 				uwptpg->ref_count = 1;
3667 				vm_page_unwire_noq(uwptpg);
3668 				vm_page_free(uwptpg);
3669 			}
3670 			CTR2(KTR_PMAP,
3671 			    "pmap_enter_l2: failed to create PV entry"
3672 			    " for va %#lx in pmap %p", va, pmap);
3673 			return (KERN_RESOURCE_SHORTAGE);
3674 		}
3675 		if ((new_l2 & PTE_W) != 0)
3676 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3677 				vm_page_aflag_set(mt, PGA_WRITEABLE);
3678 	}
3679 
3680 	/*
3681 	 * Increment counters.
3682 	 */
3683 	if ((new_l2 & PTE_SW_WIRED) != 0)
3684 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
3685 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
3686 
3687 	/*
3688 	 * Map the superpage.
3689 	 */
3690 	pmap_store(l2, new_l2);
3691 
3692 	atomic_add_long(&pmap_l2_mappings, 1);
3693 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
3694 	    va, pmap);
3695 
3696 	return (KERN_SUCCESS);
3697 }
3698 
3699 /*
3700  * Maps a sequence of resident pages belonging to the same object.
3701  * The sequence begins with the given page m_start.  This page is
3702  * mapped at the given virtual address start.  Each subsequent page is
3703  * mapped at a virtual address that is offset from start by the same
3704  * amount as the page is offset from m_start within the object.  The
3705  * last page in the sequence is the page with the largest offset from
3706  * m_start that can be mapped at a virtual address less than the given
3707  * virtual address end.  Not every virtual page between start and end
3708  * is mapped; only those for which a resident page exists with the
3709  * corresponding offset from m_start are mapped.
3710  */
3711 void
3712 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3713     vm_page_t m_start, vm_prot_t prot)
3714 {
3715 	struct pctrie_iter pages;
3716 	struct rwlock *lock;
3717 	vm_offset_t va;
3718 	vm_page_t m, mpte;
3719 	int rv;
3720 
3721 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3722 
3723 	mpte = NULL;
3724 	vm_page_iter_limit_init(&pages, m_start->object,
3725 	    m_start->pindex + atop(end - start));
3726 	m = vm_radix_iter_lookup(&pages, m_start->pindex);
3727 	lock = NULL;
3728 	rw_rlock(&pvh_global_lock);
3729 	PMAP_LOCK(pmap);
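	/*
	 * Use a 2MB mapping when the virtual address is superpage-aligned,
	 * the range fits, superpage mappings are enabled, and the page begins
	 * a fully populated superpage-sized run (psind == 1); otherwise fall
	 * back to a 4KB mapping.
	 */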
3730 	while (m != NULL) {
3731 		va = start + ptoa(m->pindex - m_start->pindex);
3732 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
3733 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
3734 		    ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
3735 		    KERN_SUCCESS || rv == KERN_NO_SPACE)) {
3736 			m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE);
3737 		} else {
3738 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
3739 			    &lock);
3740 			m = vm_radix_iter_step(&pages);
3741 		}
3742 	}
3743 	if (lock != NULL)
3744 		rw_wunlock(lock);
3745 	rw_runlock(&pvh_global_lock);
3746 	PMAP_UNLOCK(pmap);
3747 }
3748 
3749 /*
3750  * This code makes some *MAJOR* assumptions:
3751  * 1. The current pmap and the given pmap exist.
3752  * 2. The mapping is not wired.
3753  * 3. Only read access is required.
3754  * 4. No page table pages.
3755  * but it is *MUCH* faster than pmap_enter...
3756  */
3757 
3758 void
3759 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3760 {
3761 	struct rwlock *lock;
3762 
3763 	lock = NULL;
3764 	rw_rlock(&pvh_global_lock);
3765 	PMAP_LOCK(pmap);
3766 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3767 	if (lock != NULL)
3768 		rw_wunlock(lock);
3769 	rw_runlock(&pvh_global_lock);
3770 	PMAP_UNLOCK(pmap);
3771 }
3772 
3773 static vm_page_t
3774 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3775     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3776 {
3777 	struct spglist free;
3778 	pd_entry_t *l2;
3779 	pt_entry_t *l3, newl3;
3780 
3781 	KASSERT(!VA_IS_CLEANMAP(va) ||
3782 	    (m->oflags & VPO_UNMANAGED) != 0,
3783 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3784 	rw_assert(&pvh_global_lock, RA_LOCKED);
3785 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3786 	l2 = NULL;
3787 
3788 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3789 	/*
3790 	 * In the case that a page table page is not
3791 	 * resident, we are creating it here.
3792 	 */
3793 	if (va < VM_MAXUSER_ADDRESS) {
3794 		vm_pindex_t l2pindex;
3795 
3796 		/*
3797 		 * Calculate pagetable page index
3798 		 */
3799 		l2pindex = pmap_l2_pindex(va);
3800 		if (mpte && (mpte->pindex == l2pindex)) {
3801 			mpte->ref_count++;
3802 		} else {
3803 			/*
3804 			 * Get the l2 entry
3805 			 */
3806 			l2 = pmap_l2(pmap, va);
3807 
3808 			/*
3809 			 * If the page table page is mapped, we just increment
3810 			 * the hold count, and activate it.  Otherwise, we
3811 			 * attempt to allocate a page table page.  If this
3812 			 * attempt fails, we don't retry.  Instead, we give up.
3813 			 */
3814 			if (l2 != NULL && pmap_load(l2) != 0) {
3815 				if ((pmap_load(l2) & PTE_RWX) != 0)
3816 					return (NULL);
3817 				mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3818 				mpte->ref_count++;
3819 			} else {
3820 				/*
3821 				 * Pass NULL instead of the PV list lock
3822 				 * pointer, because we don't intend to sleep.
3823 				 */
3824 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3825 				if (mpte == NULL)
3826 					return (mpte);
3827 			}
3828 		}
3829 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3830 		l3 = &l3[pmap_l3_index(va)];
3831 	} else {
3832 		mpte = NULL;
3833 		l3 = pmap_l3(kernel_pmap, va);
3834 	}
3835 	if (l3 == NULL)
3836 		panic("pmap_enter_quick_locked: No l3");
3837 	if (pmap_load(l3) != 0) {
3838 		if (mpte != NULL)
3839 			mpte->ref_count--;
3840 		return (NULL);
3841 	}
3842 
3843 	/*
3844 	 * Enter on the PV list if part of our managed memory.
3845 	 */
3846 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3847 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3848 		if (mpte != NULL) {
3849 			SLIST_INIT(&free);
3850 			if (pmap_unwire_ptp(pmap, va, mpte, &free))
3851 				vm_page_free_pages_toq(&free, false);
3852 		}
3853 		return (NULL);
3854 	}
3855 
3856 	/*
3857 	 * Increment counters
3858 	 */
3859 	pmap_resident_count_inc(pmap, 1);
3860 
3861 	newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
3862 	    PTE_V | PTE_R | pmap_memattr_bits(m->md.pv_memattr);
3863 	if ((prot & VM_PROT_EXECUTE) != 0)
3864 		newl3 |= PTE_X;
3865 	if ((m->oflags & VPO_UNMANAGED) == 0)
3866 		newl3 |= PTE_SW_MANAGED;
3867 	if (va < VM_MAX_USER_ADDRESS)
3868 		newl3 |= PTE_U;
3869 
3870 	/*
3871 	 * Sync the i-cache on all harts before updating the PTE
3872 	 * if the new PTE is executable.
3873 	 */
3874 	if (prot & VM_PROT_EXECUTE)
3875 		pmap_sync_icache(pmap, va, PAGE_SIZE);
3876 
3877 	pmap_store(l3, newl3);
3878 
3879 #if VM_NRESERVLEVEL > 0
3880 	/*
3881 	 * If both the PTP and the reservation are fully populated, then attempt
3882 	 * promotion.
3883 	 */
3884 	if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
3885 	    (mpte == NULL || mpte->ref_count == Ln_ENTRIES) &&
3886 	    (m->flags & PG_FICTITIOUS) == 0 &&
3887 	    vm_reserv_level_iffullpop(m) == 0) {
3888 		if (l2 == NULL)
3889 			l2 = pmap_l2(pmap, va);
3890 
3891 		/*
3892 		 * If promotion succeeds, then the next call to this function
3893 		 * should not be given the unmapped PTP as a hint.
3894 		 */
3895 		if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
3896 			mpte = NULL;
3897 	}
3898 #endif
3899 
3900 	return (mpte);
3901 }
3902 
3903 /*
3904  * This code maps large physical mmap regions into the
3905  * processor address space.  Note that some shortcuts
3906  * are taken, but the code works.
3907  */
3908 void
3909 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3910     vm_pindex_t pindex, vm_size_t size)
3911 {
3912 
3913 	VM_OBJECT_ASSERT_WLOCKED(object);
3914 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3915 	    ("pmap_object_init_pt: non-device object"));
3916 }
3917 
3918 /*
3919  *	Clear the wired attribute from the mappings for the specified range of
3920  *	addresses in the given pmap.  Every valid mapping within that range
3921  *	must have the wired attribute set.  In contrast, invalid mappings
3922  *	cannot have the wired attribute set, so they are ignored.
3923  *
3924  *	The wired attribute of the page table entry is not a hardware feature,
3925  *	so there is no need to invalidate any TLB entries.
3926  */
3927 void
3928 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3929 {
3930 	vm_offset_t va_next;
3931 	pd_entry_t *l0, *l1, *l2, l2e;
3932 	pt_entry_t *l3, l3e;
3933 	bool pv_lists_locked;
3934 
3935 	pv_lists_locked = false;
3936 retry:
3937 	PMAP_LOCK(pmap);
3938 	for (; sva < eva; sva = va_next) {
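		/*
		 * In SV48 mode an additional level (L0) sits above L1; skip
		 * ranges whose L0 or L1 entry is not populated.
		 */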
3939 		if (pmap_mode == PMAP_MODE_SV48) {
3940 			l0 = pmap_l0(pmap, sva);
3941 			if (pmap_load(l0) == 0) {
3942 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3943 				if (va_next < sva)
3944 					va_next = eva;
3945 				continue;
3946 			}
3947 			l1 = pmap_l0_to_l1(l0, sva);
3948 		} else {
3949 			l1 = pmap_l1(pmap, sva);
3950 		}
3951 
3952 		if (pmap_load(l1) == 0) {
3953 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3954 			if (va_next < sva)
3955 				va_next = eva;
3956 			continue;
3957 		}
3958 
3959 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3960 		if (va_next < sva)
3961 			va_next = eva;
3962 
3963 		l2 = pmap_l1_to_l2(l1, sva);
3964 		if ((l2e = pmap_load(l2)) == 0)
3965 			continue;
3966 		if ((l2e & PTE_RWX) != 0) {
3967 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3968 				if ((l2e & PTE_SW_WIRED) == 0)
3969 					panic("pmap_unwire: l2 %#jx is missing "
3970 					    "PTE_SW_WIRED", (uintmax_t)l2e);
3971 				pmap_clear_bits(l2, PTE_SW_WIRED);
3972 				continue;
3973 			} else {
3974 				if (!pv_lists_locked) {
3975 					pv_lists_locked = true;
3976 					if (!rw_try_rlock(&pvh_global_lock)) {
3977 						PMAP_UNLOCK(pmap);
3978 						rw_rlock(&pvh_global_lock);
3979 						/* Repeat sva. */
3980 						goto retry;
3981 					}
3982 				}
3983 				if (!pmap_demote_l2(pmap, l2, sva))
3984 					panic("pmap_unwire: demotion failed");
3985 			}
3986 		}
3987 
3988 		if (va_next > eva)
3989 			va_next = eva;
3990 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3991 		    sva += L3_SIZE) {
3992 			if ((l3e = pmap_load(l3)) == 0)
3993 				continue;
3994 			if ((l3e & PTE_SW_WIRED) == 0)
3995 				panic("pmap_unwire: l3 %#jx is missing "
3996 				    "PTE_SW_WIRED", (uintmax_t)l3e);
3997 
3998 			/*
3999 			 * PTE_SW_WIRED must be cleared atomically.  Although the
4000 			 * pmap lock synchronizes access to it, another processor
4001 			 * could be setting PTE_D and/or PTE_A concurrently.
4002 			 */
4003 			pmap_clear_bits(l3, PTE_SW_WIRED);
4004 			pmap->pm_stats.wired_count--;
4005 		}
4006 	}
4007 	if (pv_lists_locked)
4008 		rw_runlock(&pvh_global_lock);
4009 	PMAP_UNLOCK(pmap);
4010 }
4011 
4012 /*
4013  *	Copy the range specified by src_addr/len
4014  *	from the source map to the range dst_addr/len
4015  *	in the destination map.
4016  *
4017  *	This routine is only advisory and need not do anything.
4018  */
4019 
4020 void
4021 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4022     vm_offset_t src_addr)
4023 {
4024 
4025 }
4026 
4027 /*
4028  *	pmap_zero_page zeros the specified hardware page by mapping
4029  *	the page into KVM and using bzero to clear its contents.
4030  */
4031 void
4032 pmap_zero_page(vm_page_t m)
4033 {
4034 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4035 
4036 	pagezero((void *)va);
4037 }
4038 
4039 /*
4040  *	pmap_zero_page_area zeros the specified hardware page by mapping
4041  *	the page into KVM and using bzero to clear its contents.
4042  *
4043  *	off and size may not cover an area beyond a single hardware page.
4044  */
4045 void
4046 pmap_zero_page_area(vm_page_t m, int off, int size)
4047 {
4048 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4049 
4050 	if (off == 0 && size == PAGE_SIZE)
4051 		pagezero((void *)va);
4052 	else
4053 		bzero((char *)va + off, size);
4054 }
4055 
4056 /*
4057  *	pmap_copy_page copies the specified (machine independent)
4058  *	page by mapping the page into virtual memory and using
4059  *	bcopy to copy the page, one machine dependent page at a
4060  *	time.
4061  */
4062 void
4063 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4064 {
4065 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4066 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4067 
4068 	pagecopy((void *)src, (void *)dst);
4069 }
4070 
4071 int unmapped_buf_allowed = 1;
4072 
4073 void
4074 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4075     vm_offset_t b_offset, int xfersize)
4076 {
4077 	void *a_cp, *b_cp;
4078 	vm_page_t m_a, m_b;
4079 	vm_paddr_t p_a, p_b;
4080 	vm_offset_t a_pg_offset, b_pg_offset;
4081 	int cnt;
4082 
4083 	while (xfersize > 0) {
4084 		a_pg_offset = a_offset & PAGE_MASK;
4085 		m_a = ma[a_offset >> PAGE_SHIFT];
4086 		p_a = m_a->phys_addr;
4087 		b_pg_offset = b_offset & PAGE_MASK;
4088 		m_b = mb[b_offset >> PAGE_SHIFT];
4089 		p_b = m_b->phys_addr;
4090 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4091 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4092 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
4093 			panic("!DMAP a %lx", p_a);
4094 		} else {
4095 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
4096 		}
4097 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
4098 			panic("!DMAP b %lx", p_b);
4099 		} else {
4100 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
4101 		}
4102 		bcopy(a_cp, b_cp, cnt);
4103 		a_offset += cnt;
4104 		b_offset += cnt;
4105 		xfersize -= cnt;
4106 	}
4107 }
4108 
4109 vm_offset_t
4110 pmap_quick_enter_page(vm_page_t m)
4111 {
4112 
4113 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
4114 }
4115 
4116 void
4117 pmap_quick_remove_page(vm_offset_t addr)
4118 {
4119 }
4120 
4121 /*
4122  * Returns true if the pmap's pv is one of the first
4123  * 16 pvs linked to from this page.  This count may
4124  * be changed upwards or downwards in the future; it
4125  * is only necessary that true be returned for a small
4126  * subset of pmaps for proper page aging.
4127  */
4128 bool
4129 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4130 {
4131 	struct md_page *pvh;
4132 	struct rwlock *lock;
4133 	pv_entry_t pv;
4134 	int loops = 0;
4135 	bool rv;
4136 
4137 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4138 	    ("pmap_page_exists_quick: page %p is not managed", m));
4139 	rv = false;
4140 	rw_rlock(&pvh_global_lock);
4141 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4142 	rw_rlock(lock);
4143 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4144 		if (PV_PMAP(pv) == pmap) {
4145 			rv = true;
4146 			break;
4147 		}
4148 		loops++;
4149 		if (loops >= 16)
4150 			break;
4151 	}
4152 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4153 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4154 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4155 			if (PV_PMAP(pv) == pmap) {
4156 				rv = true;
4157 				break;
4158 			}
4159 			loops++;
4160 			if (loops >= 16)
4161 				break;
4162 		}
4163 	}
4164 	rw_runlock(lock);
4165 	rw_runlock(&pvh_global_lock);
4166 	return (rv);
4167 }
4168 
4169 /*
4170  *	pmap_page_wired_mappings:
4171  *
4172  *	Return the number of managed mappings to the given physical page
4173  *	that are wired.
4174  */
4175 int
4176 pmap_page_wired_mappings(vm_page_t m)
4177 {
4178 	struct md_page *pvh;
4179 	struct rwlock *lock;
4180 	pmap_t pmap;
4181 	pd_entry_t *l2;
4182 	pt_entry_t *l3;
4183 	pv_entry_t pv;
4184 	int count, md_gen, pvh_gen;
4185 
4186 	if ((m->oflags & VPO_UNMANAGED) != 0)
4187 		return (0);
4188 	rw_rlock(&pvh_global_lock);
4189 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4190 	rw_rlock(lock);
4191 restart:
4192 	count = 0;
4193 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4194 		pmap = PV_PMAP(pv);
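		/*
		 * If the pmap lock cannot be acquired without blocking, drop
		 * the PV list lock, take the pmap lock, and restart if the PV
		 * list has changed (detected via the generation count).
		 */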
4195 		if (!PMAP_TRYLOCK(pmap)) {
4196 			md_gen = m->md.pv_gen;
4197 			rw_runlock(lock);
4198 			PMAP_LOCK(pmap);
4199 			rw_rlock(lock);
4200 			if (md_gen != m->md.pv_gen) {
4201 				PMAP_UNLOCK(pmap);
4202 				goto restart;
4203 			}
4204 		}
4205 		l2 = pmap_l2(pmap, pv->pv_va);
4206 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4207 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4208 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4209 		if ((pmap_load(l3) & PTE_SW_WIRED) != 0)
4210 			count++;
4211 		PMAP_UNLOCK(pmap);
4212 	}
4213 	if ((m->flags & PG_FICTITIOUS) == 0) {
4214 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4215 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4216 			pmap = PV_PMAP(pv);
4217 			if (!PMAP_TRYLOCK(pmap)) {
4218 				md_gen = m->md.pv_gen;
4219 				pvh_gen = pvh->pv_gen;
4220 				rw_runlock(lock);
4221 				PMAP_LOCK(pmap);
4222 				rw_rlock(lock);
4223 				if (md_gen != m->md.pv_gen ||
4224 				    pvh_gen != pvh->pv_gen) {
4225 					PMAP_UNLOCK(pmap);
4226 					goto restart;
4227 				}
4228 			}
4229 			l2 = pmap_l2(pmap, pv->pv_va);
4230 			if ((pmap_load(l2) & PTE_SW_WIRED) != 0)
4231 				count++;
4232 			PMAP_UNLOCK(pmap);
4233 		}
4234 	}
4235 	rw_runlock(lock);
4236 	rw_runlock(&pvh_global_lock);
4237 	return (count);
4238 }
4239 
4240 /*
4241  * Returns true if the given page is mapped individually or as part of
4242  * a 2mpage.  Otherwise, returns false.
4243  */
4244 bool
4245 pmap_page_is_mapped(vm_page_t m)
4246 {
4247 	struct rwlock *lock;
4248 	bool rv;
4249 
4250 	if ((m->oflags & VPO_UNMANAGED) != 0)
4251 		return (false);
4252 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4253 	rw_rlock(lock);
4254 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4255 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4256 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4257 	rw_runlock(lock);
4258 	return (rv);
4259 }
4260 
4261 static void
4262 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv,
4263     struct spglist *free, bool superpage)
4264 {
4265 	struct md_page *pvh;
4266 	vm_page_t mpte, mt;
4267 
4268 	if (superpage) {
4269 		pmap_resident_count_dec(pmap, Ln_ENTRIES);
4270 		pvh = pa_to_pvh(m->phys_addr);
4271 		TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4272 		pvh->pv_gen++;
4273 		if (TAILQ_EMPTY(&pvh->pv_list)) {
4274 			for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
4275 				if (TAILQ_EMPTY(&mt->md.pv_list) &&
4276 				    (mt->a.flags & PGA_WRITEABLE) != 0)
4277 					vm_page_aflag_clear(mt, PGA_WRITEABLE);
4278 		}
4279 		mpte = pmap_remove_pt_page(pmap, pv->pv_va);
4280 		if (mpte != NULL) {
4281 			KASSERT(vm_page_any_valid(mpte),
4282 			    ("pmap_remove_pages: pte page not promoted"));
4283 			pmap_resident_count_dec(pmap, 1);
4284 			KASSERT(mpte->ref_count == Ln_ENTRIES,
4285 			    ("pmap_remove_pages: pte page ref count error"));
4286 			mpte->ref_count = 0;
4287 			pmap_add_delayed_free_list(mpte, free, false);
4288 		}
4289 	} else {
4290 		pmap_resident_count_dec(pmap, 1);
4291 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4292 		m->md.pv_gen++;
4293 		if (TAILQ_EMPTY(&m->md.pv_list) &&
4294 		    (m->a.flags & PGA_WRITEABLE) != 0) {
4295 			pvh = pa_to_pvh(m->phys_addr);
4296 			if (TAILQ_EMPTY(&pvh->pv_list))
4297 				vm_page_aflag_clear(m, PGA_WRITEABLE);
4298 		}
4299 	}
4300 }
4301 
4302 /*
4303  * Destroy all managed, non-wired mappings in the given user-space
4304  * pmap.  This pmap cannot be active on any processor besides the
4305  * caller.
4306  *
4307  * This function cannot be applied to the kernel pmap.  Moreover, it
4308  * is not intended for general use.  It is only to be used during
4309  * process termination.  Consequently, it can be implemented in ways
4310  * that make it faster than pmap_remove().  First, it can more quickly
4311  * destroy mappings by iterating over the pmap's collection of PV
4312  * entries, rather than searching the page table.  Second, it doesn't
4313  * have to test and clear the page table entries atomically, because
4314  * no processor is currently accessing the user address space.  In
4315  * particular, a page table entry's dirty bit won't change state once
4316  * this function starts.
4317  */
4318 void
4319 pmap_remove_pages(pmap_t pmap)
4320 {
4321 	struct spglist free;
4322 	pd_entry_t ptepde;
4323 	pt_entry_t *pte, tpte;
4324 	vm_page_t m, mt;
4325 	pv_entry_t pv;
4326 	struct pv_chunk *pc, *npc;
4327 	struct rwlock *lock;
4328 	int64_t bit;
4329 	uint64_t inuse, bitmask;
4330 	int allfree, field, freed __pv_stat_used, idx;
4331 	bool superpage;
4332 
4333 	lock = NULL;
4334 
4335 	SLIST_INIT(&free);
4336 	rw_rlock(&pvh_global_lock);
4337 	PMAP_LOCK(pmap);
4338 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4339 		allfree = 1;
4340 		freed = 0;
4341 		for (field = 0; field < _NPCM; field++) {
4342 			inuse = ~pc->pc_map[field] & pc_freemask[field];
4343 			while (inuse != 0) {
4344 				bit = ffsl(inuse) - 1;
4345 				bitmask = 1UL << bit;
4346 				idx = field * 64 + bit;
4347 				pv = &pc->pc_pventry[idx];
4348 				inuse &= ~bitmask;
4349 
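				/*
				 * Walk from the L1 entry down to determine
				 * whether this PV entry maps a 2MB superpage
				 * or a 4KB page.
				 */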
4350 				pte = pmap_l1(pmap, pv->pv_va);
4351 				ptepde = pmap_load(pte);
4352 				pte = pmap_l1_to_l2(pte, pv->pv_va);
4353 				tpte = pmap_load(pte);
4354 
4355 				KASSERT((tpte & PTE_V) != 0,
4356 				    ("L2 PTE is invalid... bogus PV entry? "
4357 				    "va=%#lx, pte=%#lx", pv->pv_va, tpte));
4358 				if ((tpte & PTE_RWX) != 0) {
4359 					superpage = true;
4360 				} else {
4361 					ptepde = tpte;
4362 					pte = pmap_l2_to_l3(pte, pv->pv_va);
4363 					tpte = pmap_load(pte);
4364 					superpage = false;
4365 				}
4366 
4367 				/*
4368 				 * We cannot remove wired pages from a
4369 				 * process' mapping at this time.
4370 				 */
4371 				if (tpte & PTE_SW_WIRED) {
4372 					allfree = 0;
4373 					continue;
4374 				}
4375 
4376 				m = PTE_TO_VM_PAGE(tpte);
4377 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4378 				    m < &vm_page_array[vm_page_array_size],
4379 				    ("pmap_remove_pages: bad pte %#jx",
4380 				    (uintmax_t)tpte));
4381 
4382 				pmap_clear(pte);
4383 
4384 				/*
4385 				 * Update the vm_page_t clean/reference bits.
4386 				 */
4387 				if ((tpte & (PTE_D | PTE_W)) ==
4388 				    (PTE_D | PTE_W)) {
4389 					if (superpage)
4390 						for (mt = m;
4391 						    mt < &m[Ln_ENTRIES]; mt++)
4392 							vm_page_dirty(mt);
4393 					else
4394 						vm_page_dirty(m);
4395 				}
4396 
4397 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
4398 
4399 				/* Mark free */
4400 				pc->pc_map[field] |= bitmask;
4401 
4402 				pmap_remove_pages_pv(pmap, m, pv, &free,
4403 				    superpage);
4404 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
4405 				freed++;
4406 			}
4407 		}
4408 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
4409 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
4410 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
4411 		if (allfree) {
4412 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4413 			free_pv_chunk(pc);
4414 		}
4415 	}
4416 	if (lock != NULL)
4417 		rw_wunlock(lock);
4418 	pmap_invalidate_all(pmap);
4419 	rw_runlock(&pvh_global_lock);
4420 	PMAP_UNLOCK(pmap);
4421 	vm_page_free_pages_toq(&free, false);
4422 }
4423 
4424 static bool
4425 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
4426 {
4427 	struct md_page *pvh;
4428 	struct rwlock *lock;
4429 	pd_entry_t *l2;
4430 	pt_entry_t *l3, mask;
4431 	pv_entry_t pv;
4432 	pmap_t pmap;
4433 	int md_gen, pvh_gen;
4434 	bool rv;
4435 
4436 	mask = 0;
4437 	if (modified)
4438 		mask |= PTE_D;
4439 	if (accessed)
4440 		mask |= PTE_A;
4441 
4442 	rv = false;
4443 	rw_rlock(&pvh_global_lock);
4444 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4445 	rw_rlock(lock);
4446 restart:
4447 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4448 		pmap = PV_PMAP(pv);
4449 		if (!PMAP_TRYLOCK(pmap)) {
4450 			md_gen = m->md.pv_gen;
4451 			rw_runlock(lock);
4452 			PMAP_LOCK(pmap);
4453 			rw_rlock(lock);
4454 			if (md_gen != m->md.pv_gen) {
4455 				PMAP_UNLOCK(pmap);
4456 				goto restart;
4457 			}
4458 		}
4459 		l2 = pmap_l2(pmap, pv->pv_va);
4460 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4461 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4462 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4463 		rv = (pmap_load(l3) & mask) == mask;
4464 		PMAP_UNLOCK(pmap);
4465 		if (rv)
4466 			goto out;
4467 	}
4468 	if ((m->flags & PG_FICTITIOUS) == 0) {
4469 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4470 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4471 			pmap = PV_PMAP(pv);
4472 			if (!PMAP_TRYLOCK(pmap)) {
4473 				md_gen = m->md.pv_gen;
4474 				pvh_gen = pvh->pv_gen;
4475 				rw_runlock(lock);
4476 				PMAP_LOCK(pmap);
4477 				rw_rlock(lock);
4478 				if (md_gen != m->md.pv_gen ||
4479 				    pvh_gen != pvh->pv_gen) {
4480 					PMAP_UNLOCK(pmap);
4481 					goto restart;
4482 				}
4483 			}
4484 			l2 = pmap_l2(pmap, pv->pv_va);
4485 			rv = (pmap_load(l2) & mask) == mask;
4486 			PMAP_UNLOCK(pmap);
4487 			if (rv)
4488 				goto out;
4489 		}
4490 	}
4491 out:
4492 	rw_runlock(lock);
4493 	rw_runlock(&pvh_global_lock);
4494 	return (rv);
4495 }
4496 
4497 /*
4498  *	pmap_is_modified:
4499  *
4500  *	Return whether or not the specified physical page was modified
4501  *	in any physical maps.
4502  */
4503 bool
4504 pmap_is_modified(vm_page_t m)
4505 {
4506 
4507 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4508 	    ("pmap_is_modified: page %p is not managed", m));
4509 
4510 	/*
4511 	 * If the page is not busied then this check is racy.
4512 	 */
4513 	if (!pmap_page_is_write_mapped(m))
4514 		return (false);
4515 	return (pmap_page_test_mappings(m, false, true));
4516 }
4517 
4518 /*
4519  *	pmap_is_prefaultable:
4520  *
4521  *	Return whether or not the specified virtual address is eligible
4522  *	for prefault.
4523  */
4524 bool
4525 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4526 {
4527 	pt_entry_t *l3;
4528 	bool rv;
4529 
4530 	/*
4531 	 * Return true if and only if the L3 entry for the specified virtual
4532 	 * address is allocated but invalid.
4533 	 */
4534 	rv = false;
4535 	PMAP_LOCK(pmap);
4536 	l3 = pmap_l3(pmap, addr);
4537 	if (l3 != NULL && pmap_load(l3) == 0) {
4538 		rv = true;
4539 	}
4540 	PMAP_UNLOCK(pmap);
4541 	return (rv);
4542 }
4543 
4544 /*
4545  *	pmap_is_referenced:
4546  *
4547  *	Return whether or not the specified physical page was referenced
4548  *	in any physical maps.
4549  */
4550 bool
4551 pmap_is_referenced(vm_page_t m)
4552 {
4553 
4554 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4555 	    ("pmap_is_referenced: page %p is not managed", m));
4556 	return (pmap_page_test_mappings(m, true, false));
4557 }
4558 
4559 /*
4560  * Clear the write and modified bits in each of the given page's mappings.
4561  */
4562 void
4563 pmap_remove_write(vm_page_t m)
4564 {
4565 	struct md_page *pvh;
4566 	struct rwlock *lock;
4567 	pmap_t pmap;
4568 	pd_entry_t *l2;
4569 	pt_entry_t *l3, oldl3, newl3;
4570 	pv_entry_t next_pv, pv;
4571 	vm_offset_t va;
4572 	int md_gen, pvh_gen;
4573 
4574 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4575 	    ("pmap_remove_write: page %p is not managed", m));
4576 	vm_page_assert_busied(m);
4577 
4578 	if (!pmap_page_is_write_mapped(m))
4579 		return;
4580 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4581 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4582 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4583 	rw_rlock(&pvh_global_lock);
4584 retry_pv_loop:
4585 	rw_wlock(lock);
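	/*
	 * First demote any writable 2MB mappings of the page, then clear
	 * PTE_W and PTE_D from each remaining 4KB mapping.
	 */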
4586 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4587 		pmap = PV_PMAP(pv);
4588 		if (!PMAP_TRYLOCK(pmap)) {
4589 			pvh_gen = pvh->pv_gen;
4590 			rw_wunlock(lock);
4591 			PMAP_LOCK(pmap);
4592 			rw_wlock(lock);
4593 			if (pvh_gen != pvh->pv_gen) {
4594 				PMAP_UNLOCK(pmap);
4595 				rw_wunlock(lock);
4596 				goto retry_pv_loop;
4597 			}
4598 		}
4599 		va = pv->pv_va;
4600 		l2 = pmap_l2(pmap, va);
4601 		if ((pmap_load(l2) & PTE_W) != 0)
4602 			(void)pmap_demote_l2_locked(pmap, l2, va, &lock);
4603 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4604 		    ("inconsistent pv lock %p %p for page %p",
4605 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4606 		PMAP_UNLOCK(pmap);
4607 	}
4608 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4609 		pmap = PV_PMAP(pv);
4610 		if (!PMAP_TRYLOCK(pmap)) {
4611 			pvh_gen = pvh->pv_gen;
4612 			md_gen = m->md.pv_gen;
4613 			rw_wunlock(lock);
4614 			PMAP_LOCK(pmap);
4615 			rw_wlock(lock);
4616 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4617 				PMAP_UNLOCK(pmap);
4618 				rw_wunlock(lock);
4619 				goto retry_pv_loop;
4620 			}
4621 		}
4622 		l2 = pmap_l2(pmap, pv->pv_va);
4623 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4624 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4625 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4626 		oldl3 = pmap_load(l3);
4627 retry:
4628 		if ((oldl3 & PTE_W) != 0) {
4629 			newl3 = oldl3 & ~(PTE_D | PTE_W);
4630 			if (!atomic_fcmpset_long(l3, &oldl3, newl3))
4631 				goto retry;
4632 			if ((oldl3 & PTE_D) != 0)
4633 				vm_page_dirty(m);
4634 			pmap_invalidate_page(pmap, pv->pv_va);
4635 		}
4636 		PMAP_UNLOCK(pmap);
4637 	}
4638 	rw_wunlock(lock);
4639 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4640 	rw_runlock(&pvh_global_lock);
4641 }
4642 
4643 /*
4644  *	pmap_ts_referenced:
4645  *
4646  *	Return a count of reference bits for a page, clearing those bits.
4647  *	It is not necessary for every reference bit to be cleared, but it
4648  *	is necessary that 0 only be returned when there are truly no
4649  *	reference bits set.
4650  *
4651  *	As an optimization, update the page's dirty field if a modified bit is
4652  *	found while counting reference bits.  This opportunistic update can be
4653  *	performed at low cost and can eliminate the need for some future calls
4654  *	to pmap_is_modified().  However, since this function stops after
4655  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4656  *	dirty pages.  Those dirty pages will only be detected by a future call
4657  *	to pmap_is_modified().
4658  */
4659 int
4660 pmap_ts_referenced(vm_page_t m)
4661 {
4662 	struct spglist free;
4663 	struct md_page *pvh;
4664 	struct rwlock *lock;
4665 	pv_entry_t pv, pvf;
4666 	pmap_t pmap;
4667 	pd_entry_t *l2, l2e;
4668 	pt_entry_t *l3, l3e;
4669 	vm_paddr_t pa;
4670 	vm_offset_t va;
4671 	int cleared, md_gen, not_cleared, pvh_gen;
4672 
4673 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4674 	    ("pmap_ts_referenced: page %p is not managed", m));
4675 	SLIST_INIT(&free);
4676 	cleared = 0;
4677 	pa = VM_PAGE_TO_PHYS(m);
4678 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4679 
4680 	lock = PHYS_TO_PV_LIST_LOCK(pa);
4681 	rw_rlock(&pvh_global_lock);
4682 	rw_wlock(lock);
4683 retry:
4684 	not_cleared = 0;
4685 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4686 		goto small_mappings;
4687 	pv = pvf;
4688 	do {
4689 		pmap = PV_PMAP(pv);
4690 		if (!PMAP_TRYLOCK(pmap)) {
4691 			pvh_gen = pvh->pv_gen;
4692 			rw_wunlock(lock);
4693 			PMAP_LOCK(pmap);
4694 			rw_wlock(lock);
4695 			if (pvh_gen != pvh->pv_gen) {
4696 				PMAP_UNLOCK(pmap);
4697 				goto retry;
4698 			}
4699 		}
4700 		va = pv->pv_va;
4701 		l2 = pmap_l2(pmap, va);
4702 		l2e = pmap_load(l2);
4703 		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
4704 			/*
4705 			 * Although l2e is mapping a 2MB page, because
4706 			 * this function is called at a 4KB page granularity,
4707 			 * we only update the 4KB page under test.
4708 			 */
4709 			vm_page_dirty(m);
4710 		}
4711 		if ((l2e & PTE_A) != 0) {
4712 			/*
4713 			 * Since this reference bit is shared by 512 4KB
4714 			 * pages, it should not be cleared every time it is
4715 			 * tested.  Apply a simple "hash" function on the
4716 			 * physical page number, the virtual superpage number,
4717 			 * and the pmap address to select one 4KB page out of
4718 			 * the 512 on which testing the reference bit will
4719 			 * result in clearing that reference bit.  This
4720 			 * function is designed to avoid the selection of the
4721 			 * same 4KB page for every 2MB page mapping.
4722 			 *
4723 			 * On demotion, a mapping that hasn't been referenced
4724 			 * is simply destroyed.  To avoid the possibility of a
4725 			 * subsequent page fault on a demoted wired mapping,
4726 			 * always leave its reference bit set.  Moreover,
4727 			 * since the superpage is wired, the current state of
4728 			 * its reference bit won't affect page replacement.
4729 			 */
4730 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4731 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4732 			    (l2e & PTE_SW_WIRED) == 0) {
4733 				pmap_clear_bits(l2, PTE_A);
4734 				pmap_invalidate_page(pmap, va);
4735 				cleared++;
4736 			} else
4737 				not_cleared++;
4738 		}
4739 		PMAP_UNLOCK(pmap);
4740 		/* Rotate the PV list if it has more than one entry. */
4741 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4742 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4743 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4744 			pvh->pv_gen++;
4745 		}
4746 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4747 			goto out;
4748 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4749 small_mappings:
4750 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4751 		goto out;
4752 	pv = pvf;
4753 	do {
4754 		pmap = PV_PMAP(pv);
4755 		if (!PMAP_TRYLOCK(pmap)) {
4756 			pvh_gen = pvh->pv_gen;
4757 			md_gen = m->md.pv_gen;
4758 			rw_wunlock(lock);
4759 			PMAP_LOCK(pmap);
4760 			rw_wlock(lock);
4761 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4762 				PMAP_UNLOCK(pmap);
4763 				goto retry;
4764 			}
4765 		}
4766 		l2 = pmap_l2(pmap, pv->pv_va);
4767 
4768 		KASSERT((pmap_load(l2) & PTE_RX) == 0,
4769 		    ("pmap_ts_referenced: found an invalid l2 table"));
4770 
4771 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4772 		l3e = pmap_load(l3);
4773 		if ((l3e & PTE_D) != 0)
4774 			vm_page_dirty(m);
4775 		if ((l3e & PTE_A) != 0) {
4776 			if ((l3e & PTE_SW_WIRED) == 0) {
4777 				/*
4778 				 * Wired pages cannot be paged out so
4779 				 * doing accessed bit emulation for
4780 				 * them is wasted effort. We do the
4781 				 * hard work for unwired pages only.
4782 				 */
4783 				pmap_clear_bits(l3, PTE_A);
4784 				pmap_invalidate_page(pmap, pv->pv_va);
4785 				cleared++;
4786 			} else
4787 				not_cleared++;
4788 		}
4789 		PMAP_UNLOCK(pmap);
4790 		/* Rotate the PV list if it has more than one entry. */
4791 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4792 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4793 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4794 			m->md.pv_gen++;
4795 		}
4796 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4797 	    not_cleared < PMAP_TS_REFERENCED_MAX);
4798 out:
4799 	rw_wunlock(lock);
4800 	rw_runlock(&pvh_global_lock);
4801 	vm_page_free_pages_toq(&free, false);
4802 	return (cleared + not_cleared);
4803 }
4804 
4805 /*
4806  *	Apply the given advice to the specified range of addresses within the
4807  *	given pmap.  Depending on the advice, clear the referenced and/or
4808  *	modified flags in each mapping and set the mapped page's dirty field.
4809  */
4810 void
4811 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4812 {
4813 }
4814 
4815 /*
4816  *	Clear the modify bits on the specified physical page.
4817  */
4818 void
4819 pmap_clear_modify(vm_page_t m)
4820 {
4821 	struct md_page *pvh;
4822 	struct rwlock *lock;
4823 	pmap_t pmap;
4824 	pv_entry_t next_pv, pv;
4825 	pd_entry_t *l2, oldl2;
4826 	pt_entry_t *l3;
4827 	vm_offset_t va;
4828 	int md_gen, pvh_gen;
4829 
4830 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4831 	    ("%s: page %p is not managed", __func__, m));
4832 	vm_page_assert_busied(m);
4833 
4834 	if (!pmap_page_is_write_mapped(m))
4835 	        return;
4836 
4837 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4838 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4839 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4840 	rw_rlock(&pvh_global_lock);
4841 	rw_wlock(lock);
4842 restart:
4843 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4844 		pmap = PV_PMAP(pv);
4845 		if (!PMAP_TRYLOCK(pmap)) {
4846 			pvh_gen = pvh->pv_gen;
4847 			rw_wunlock(lock);
4848 			PMAP_LOCK(pmap);
4849 			rw_wlock(lock);
4850 			if (pvh_gen != pvh->pv_gen) {
4851 				PMAP_UNLOCK(pmap);
4852 				goto restart;
4853 			}
4854 		}
4855 		va = pv->pv_va;
4856 		l2 = pmap_l2(pmap, va);
4857 		oldl2 = pmap_load(l2);
4858 		/* If oldl2 has PTE_W set, then it also has PTE_D set. */
4859 		if ((oldl2 & PTE_W) != 0 &&
4860 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
4861 		    (oldl2 & PTE_SW_WIRED) == 0) {
4862 			/*
4863 			 * Write protect the mapping to a single page so that
4864 			 * a subsequent write access may repromote.
4865 			 */
4866 			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
4867 			l3 = pmap_l2_to_l3(l2, va);
4868 			pmap_clear_bits(l3, PTE_D | PTE_W);
4869 			vm_page_dirty(m);
4870 			pmap_invalidate_page(pmap, va);
4871 		}
4872 		PMAP_UNLOCK(pmap);
4873 	}
4874 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4875 		pmap = PV_PMAP(pv);
4876 		if (!PMAP_TRYLOCK(pmap)) {
4877 			md_gen = m->md.pv_gen;
4878 			pvh_gen = pvh->pv_gen;
4879 			rw_wunlock(lock);
4880 			PMAP_LOCK(pmap);
4881 			rw_wlock(lock);
4882 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4883 				PMAP_UNLOCK(pmap);
4884 				goto restart;
4885 			}
4886 		}
4887 		l2 = pmap_l2(pmap, pv->pv_va);
4888 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4889 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4890 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4891 		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
4892 			pmap_clear_bits(l3, PTE_D | PTE_W);
4893 			pmap_invalidate_page(pmap, pv->pv_va);
4894 		}
4895 		PMAP_UNLOCK(pmap);
4896 	}
4897 	rw_wunlock(lock);
4898 	rw_runlock(&pvh_global_lock);
4899 }
4900 
4901 void *
4902 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4903 {
4904 
4905         return ((void *)PHYS_TO_DMAP(pa));
4906 }
4907 
4908 void
4909 pmap_unmapbios(void *p, vm_size_t size)
4910 {
4911 }
4912 
4913 /*
4914  * Sets the memory attribute for the specified page.
4915  */
4916 void
4917 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4918 {
4919 	if (m->md.pv_memattr == ma)
4920 		return;
4921 
4922 	m->md.pv_memattr = ma;
4923 
4924 	/*
4925 	 * If "m" is a normal page, update its direct mapping.  This update
4926 	 * can be relied upon to perform any cache operations that are
4927 	 * required for data coherence.
4928 	 */
4929 	if ((m->flags & PG_FICTITIOUS) == 0 &&
4930 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4931 	    m->md.pv_memattr) != 0)
4932 		panic("memory attribute change on the direct map failed");
4933 }
4934 
4935 /*
4936  * Changes the specified virtual address range's memory type to that given by
4937  * the parameter "mode".  The specified virtual address range must be
4938  * completely contained within either the direct map or the kernel map.
4939  *
4940  * Returns zero if the change completed successfully, and either EINVAL or
4941  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4942  * of the virtual address range was not mapped, and ENOMEM is returned if
4943  * there was insufficient memory available to complete the change.  In the
4944  * latter case, the memory type may have been changed on some part of the
4945  * virtual address range.
4946  */
4947 int
4948 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4949 {
4950 	int error;
4951 
4952 	PMAP_LOCK(kernel_pmap);
4953 	error = pmap_change_attr_locked(va, size, mode);
4954 	PMAP_UNLOCK(kernel_pmap);
4955 	return (error);
4956 }
4957 
4958 static int
4959 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4960 {
4961 	vm_offset_t base, offset, tmpva;
4962 	vm_paddr_t phys;
4963 	pd_entry_t *l1, l1e;
4964 	pd_entry_t *l2, l2e;
4965 	pt_entry_t *l3, l3e;
4966 	pt_entry_t bits, mask;
4967 	bool anychanged = false;
4968 	int error = 0;
4969 
4970 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4971 	base = trunc_page(va);
4972 	offset = va & PAGE_MASK;
4973 	size = round_page(offset + size);
4974 
4975 	if (!VIRT_IN_DMAP(base) &&
4976 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
4977 		return (EINVAL);
4978 
4979 	bits = pmap_memattr_bits(mode);
4980 	mask = memattr_mask;
4981 
4982 	/* First loop: perform PTE validation and demotions as necessary. */
4983 	for (tmpva = base; tmpva < base + size; ) {
4984 		l1 = pmap_l1(kernel_pmap, tmpva);
4985 		if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0)
4986 			return (EINVAL);
4987 		if ((l1e & PTE_RWX) != 0) {
4988 			/*
4989 			 * If the existing PTE has the correct attributes, then
4990 			 * no need to demote.
4991 			 */
4992 			if ((l1e & mask) == bits) {
4993 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
4994 				continue;
4995 			}
4996 
4997 			/*
4998 			 * If the 1GB page fits in the remaining range, we
4999 			 * don't need to demote.
5000 			 */
5001 			if ((tmpva & L1_OFFSET) == 0 &&
5002 			    tmpva + L1_SIZE <= base + size) {
5003 				tmpva += L1_SIZE;
5004 				continue;
5005 			}
5006 
5007 			if (!pmap_demote_l1(kernel_pmap, l1, tmpva))
5008 				return (EINVAL);
5009 		}
5010 		l2 = pmap_l1_to_l2(l1, tmpva);
5011 		if (((l2e = pmap_load(l2)) & PTE_V) == 0)
5012 			return (EINVAL);
5013 		if ((l2e & PTE_RWX) != 0) {
5014 			/*
5015 			 * If the existing PTE has the correct attributes, then
5016 			 * no need to demote.
5017 			 */
5018 			if ((l2e & mask) == bits) {
5019 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
5020 				continue;
5021 			}
5022 
5023 			/*
5024 			 * If the 2MB page fits in the remaining range, we
5025 			 * don't need to demote.
5026 			 */
5027 			if ((tmpva & L2_OFFSET) == 0 &&
5028 			    tmpva + L2_SIZE <= base + size) {
5029 				tmpva += L2_SIZE;
5030 				continue;
5031 			}
5032 
5033 			if (!pmap_demote_l2(kernel_pmap, l2, tmpva))
5034 				panic("l2 demotion failed");
5035 		}
5036 		l3 = pmap_l2_to_l3(l2, tmpva);
5037 		if (((l3e = pmap_load(l3)) & PTE_V) == 0)
5038 			return (EINVAL);
5039 
5040 		tmpva += PAGE_SIZE;
5041 	}
5042 
5043 	/* Second loop: perform PTE updates. */
5044 	for (tmpva = base; tmpva < base + size; ) {
5045 		l1 = pmap_l1(kernel_pmap, tmpva);
5046 		l1e = pmap_load(l1);
5047 		if ((l1e & PTE_RWX) != 0) {
5048 			/* Unchanged. */
5049 			if ((l1e & mask) == bits) {
5050 				tmpva += L1_SIZE;
5051 				continue;
5052 			}
5053 
5054 			l1e &= ~mask;
5055 			l1e |= bits;
5056 			pmap_store(l1, l1e);
5057 			anychanged = true;
5058 
5059 			/* Update corresponding DMAP entry */
5060 			phys = L1PTE_TO_PHYS(l1e);
5061 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5062 				error = pmap_change_attr_locked(
5063 				    PHYS_TO_DMAP(phys), L1_SIZE, mode);
5064 				if (error != 0)
5065 					break;
5066 			}
5067 			tmpva += L1_SIZE;
5068 			continue;
5069 		}
5070 
5071 		l2 = pmap_l1_to_l2(l1, tmpva);
5072 		l2e = pmap_load(l2);
5073 		if ((l2e & PTE_RWX) != 0) {
5074 			/* Unchanged. */
5075 			if ((l2e & mask) == bits) {
5076 				tmpva += L2_SIZE;
5077 				continue;
5078 			}
5079 
5080 			l2e &= ~mask;
5081 			l2e |= bits;
5082 			pmap_store(l2, l2e);
5083 			anychanged = true;
5084 
5085 			/* Update corresponding DMAP entry */
5086 			phys = L2PTE_TO_PHYS(l2e);
5087 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5088 				error = pmap_change_attr_locked(
5089 				    PHYS_TO_DMAP(phys), L2_SIZE, mode);
5090 				if (error != 0)
5091 					break;
5092 			}
5093 			tmpva += L2_SIZE;
5094 			continue;
5095 		}
5096 
5097 		l3 = pmap_l2_to_l3(l2, tmpva);
5098 		l3e = pmap_load(l3);
5099 
5100 		/* Unchanged. */
5101 		if ((l3e & mask) == bits) {
5102 			tmpva += PAGE_SIZE;
5103 			continue;
5104 		}
5105 
5106 		l3e &= ~mask;
5107 		l3e |= bits;
5108 		pmap_store(l3, l3e);
5109 		anychanged = true;
5110 
5111 		phys = PTE_TO_PHYS(l3e);
5112 		if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5113 			error = pmap_change_attr_locked(PHYS_TO_DMAP(phys),
5114 			    L3_SIZE, mode);
5115 			if (error != 0)
5116 				break;
5117 		}
5118 		tmpva += PAGE_SIZE;
5119 	}
5120 
5121 	if (anychanged) {
5122 		pmap_invalidate_range(kernel_pmap, base, tmpva);
5123 		if (mode == VM_MEMATTR_UNCACHEABLE)
5124 			cpu_dcache_wbinv_range(base, size);
5125 	}
5126 
5127 	return (error);
5128 }
5129 
5130 /*
5131  * Perform the pmap work for mincore(2).  If the page is not both referenced and
5132  * modified by this pmap, returns its physical address so that the caller can
5133  * find other mappings.
5134  */
5135 int
5136 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
5137 {
5138 	pt_entry_t *l2, *l3, tpte;
5139 	vm_paddr_t pa;
5140 	int val;
5141 	bool managed;
5142 
5143 	PMAP_LOCK(pmap);
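	/*
	 * A leaf L2 entry (one with R/W/X bits set) maps a 2MB superpage;
	 * otherwise descend to the L3 entry for a 4KB mapping.
	 */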
5144 	l2 = pmap_l2(pmap, addr);
5145 	if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
5146 		if ((tpte & PTE_RWX) != 0) {
5147 			pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
5148 			val = MINCORE_INCORE | MINCORE_PSIND(1);
5149 		} else {
5150 			l3 = pmap_l2_to_l3(l2, addr);
5151 			tpte = pmap_load(l3);
5152 			if ((tpte & PTE_V) == 0) {
5153 				PMAP_UNLOCK(pmap);
5154 				return (0);
5155 			}
5156 			pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
5157 			val = MINCORE_INCORE;
5158 		}
5159 
5160 		if ((tpte & PTE_D) != 0)
5161 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5162 		if ((tpte & PTE_A) != 0)
5163 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5164 		managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
5165 	} else {
5166 		managed = false;
5167 		val = 0;
5168 	}
5169 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5170 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
5171 		*pap = pa;
5172 	}
5173 	PMAP_UNLOCK(pmap);
5174 	return (val);
5175 }
5176 
5177 void
5178 pmap_activate_sw(struct thread *td)
5179 {
5180 	pmap_t oldpmap, pmap;
5181 	u_int hart;
5182 
5183 	oldpmap = PCPU_GET(curpmap);
5184 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5185 	if (pmap == oldpmap)
5186 		return;
5187 	csr_write(satp, pmap->pm_satp);
5188 
5189 	hart = PCPU_GET(hart);
5190 #ifdef SMP
5191 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
5192 	CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
5193 #else
5194 	CPU_SET(hart, &pmap->pm_active);
5195 	CPU_CLR(hart, &oldpmap->pm_active);
5196 #endif
5197 	PCPU_SET(curpmap, pmap);
5198 
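	/*
	 * Writing satp is not guaranteed to invalidate cached address
	 * translations, so issue an SFENCE.VMA to ensure that stale
	 * translations for the old address space are not used.
	 */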
5199 	sfence_vma();
5200 }
5201 
5202 void
5203 pmap_activate(struct thread *td)
5204 {
5205 
5206 	critical_enter();
5207 	pmap_activate_sw(td);
5208 	critical_exit();
5209 }
5210 
5211 void
5212 pmap_activate_boot(pmap_t pmap)
5213 {
5214 	u_int hart;
5215 
5216 	hart = PCPU_GET(hart);
5217 #ifdef SMP
5218 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
5219 #else
5220 	CPU_SET(hart, &pmap->pm_active);
5221 #endif
5222 	PCPU_SET(curpmap, pmap);
5223 }
5224 
5225 void
5226 pmap_active_cpus(pmap_t pmap, cpuset_t *res)
5227 {
5228 	*res = pmap->pm_active;
5229 }
5230 
5231 void
5232 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
5233 {
5234 	cpuset_t mask;
5235 
5236 	/*
5237 	 * From the RISC-V User-Level ISA V2.2:
5238 	 *
5239 	 * "To make a store to instruction memory visible to all
5240 	 * RISC-V harts, the writing hart has to execute a data FENCE
5241 	 * before requesting that all remote RISC-V harts execute a
5242 	 * FENCE.I."
5243 	 *
5244 	 * However, this is slightly misleading; we still need to
5245 	 * perform a FENCE.I for the local hart, as FENCE does nothing
5246 	 * for its icache. FENCE.I alone is also sufficient for the
5247 	 * local hart.
5248 	 */
5249 	sched_pin();
5250 	mask = all_harts;
5251 	CPU_CLR(PCPU_GET(hart), &mask);
5252 	fence_i();
5253 	if (!CPU_EMPTY(&mask) && smp_started) {
5254 		fence();
5255 		sbi_remote_fence_i(mask.__bits);
5256 	}
5257 	sched_unpin();
5258 }
5259 
5260 /*
5261  *	Increase the starting virtual address of the given mapping if a
5262  *	different alignment might result in more superpage mappings.
5263  */
5264 void
5265 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5266     vm_offset_t *addr, vm_size_t size)
5267 {
5268 	vm_offset_t superpage_offset;
5269 
5270 	if (size < L2_SIZE)
5271 		return;
5272 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5273 		offset += ptoa(object->pg_color);
5274 	superpage_offset = offset & L2_OFFSET;
5275 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
5276 	    (*addr & L2_OFFSET) == superpage_offset)
5277 		return;
5278 	if ((*addr & L2_OFFSET) < superpage_offset)
5279 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
5280 	else
5281 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
5282 }
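
/*
 * As an illustrative example (values chosen arbitrarily): with 2MB (L2_SIZE)
 * superpages, an object offset yielding superpage_offset == 0x1000 and a
 * proposed *addr with (*addr & L2_OFFSET) == 0x3000, the requested address
 * already lies past the desired offset within its superpage, so *addr is
 * rounded up to the next 2MB boundary and then advanced by 0x1000.  The
 * adjustment is only made when the mapping is large enough to still contain
 * at least one fully aligned superpage afterwards.
 */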
5283 
5284 /**
5285  * Get the kernel virtual address of a set of physical pages. If any of the
5286  * physical addresses are not covered by the DMAP, perform a transient
5287  * mapping that will be removed when pmap_unmap_io_transient() is called.
5288  *
5289  * \param page        The pages for which the caller wishes to obtain
5290  *                    virtual addresses in the kernel memory map.
5291  * \param vaddr       On return contains the kernel virtual memory address
5292  *                    of the pages passed in the page parameter.
5293  * \param count       Number of pages passed in.
5294  * \param can_fault   true if the thread using the mapped pages can take
5295  *                    page faults, false otherwise.
5296  *
5297  * \returns true if the caller must call pmap_unmap_io_transient when
5298  *          finished or false otherwise.
5299  *
5300  */
5301 bool
5302 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5303     bool can_fault)
5304 {
5305 	vm_paddr_t paddr;
5306 	bool needs_mapping;
5307 	int error __diagused, i;
5308 
5309 	/*
5310 	 * Allocate any KVA space that we need; this is done in a separate
5311 	 * loop to prevent calling vmem_alloc while pinned.
5312 	 */
5313 	needs_mapping = false;
5314 	for (i = 0; i < count; i++) {
5315 		paddr = VM_PAGE_TO_PHYS(page[i]);
5316 		if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
5317 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
5318 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
5319 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
5320 			needs_mapping = true;
5321 		} else {
5322 			vaddr[i] = PHYS_TO_DMAP(paddr);
5323 		}
5324 	}
5325 
5326 	/* Exit early if everything is covered by the DMAP */
5327 	if (!needs_mapping)
5328 		return (false);
5329 
5330 	if (!can_fault)
5331 		sched_pin();
5332 	for (i = 0; i < count; i++) {
5333 		paddr = VM_PAGE_TO_PHYS(page[i]);
5334 		if (paddr >= DMAP_MAX_PHYSADDR) {
5335 			panic(
5336 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
5337 		}
5338 	}
5339 
5340 	return (needs_mapping);
5341 }
5342 
5343 void
5344 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5345     bool can_fault)
5346 {
5347 	vm_paddr_t paddr;
5348 	int i;
5349 
5350 	if (!can_fault)
5351 		sched_unpin();
5352 	for (i = 0; i < count; i++) {
5353 		paddr = VM_PAGE_TO_PHYS(page[i]);
5354 		if (paddr >= DMAP_MAX_PHYSADDR) {
5355 			panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
5356 		}
5357 	}
5358 }
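
/*
 * A minimal usage sketch (hypothetical caller, not part of this file), where
 * m is a vm_page_t and src a kernel buffer: the two routines are intended to
 * be used as a pair, with the pages accessed only through the returned
 * addresses in between, e.g.:
 *
 *	vm_offset_t va[1];
 *	bool mapped;
 *
 *	mapped = pmap_map_io_transient(&m, va, 1, false);
 *	bcopy(src, (void *)va[0], PAGE_SIZE);
 *	if (mapped)
 *		pmap_unmap_io_transient(&m, va, 1, false);
 */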
5359 
5360 bool
5361 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
5362 {
5363 
5364 	return (mode >= VM_MEMATTR_DEFAULT && mode <= VM_MEMATTR_LAST);
5365 }
5366 
5367 bool
5368 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
5369     pt_entry_t **l3)
5370 {
5371 	pd_entry_t *l1p, *l2p;
5372 
5373 	/* Get l1 directory entry. */
5374 	l1p = pmap_l1(pmap, va);
5375 	*l1 = l1p;
5376 
5377 	if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0)
5378 		return (false);
5379 
5380 	if ((pmap_load(l1p) & PTE_RX) != 0) {
5381 		*l2 = NULL;
5382 		*l3 = NULL;
5383 		return (true);
5384 	}
5385 
5386 	/* Get l2 directory entry. */
5387 	l2p = pmap_l1_to_l2(l1p, va);
5388 	*l2 = l2p;
5389 
5390 	if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0)
5391 		return (false);
5392 
5393 	if ((pmap_load(l2p) & PTE_RX) != 0) {
5394 		*l3 = NULL;
5395 		return (true);
5396 	}
5397 
5398 	/* Get l3 page table entry. */
5399 	*l3 = pmap_l2_to_l3(l2p, va);
5400 
5401 	return (true);
5402 }
5403 
5404 /*
5405  * Track a range of the kernel's virtual address space that is contiguous
5406  * in various mapping attributes.
5407  */
5408 struct pmap_kernel_map_range {
5409 	vm_offset_t sva;	/* start of the current run */
5410 	pt_entry_t attrs;	/* PTE bits shared by every mapping in the run */
5411 	int l3pages;		/* 4KB (L3) page mappings in the run */
5412 	int l2pages;		/* 2MB (L2) superpage mappings in the run */
5413 	int l1pages;		/* 1GB (L1) superpage mappings in the run */
5414 };
5415 
5416 static void
5417 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
5418     vm_offset_t eva)
5419 {
5420 	char *mode;
5421 	int i;
5422 
5423 	if (eva <= range->sva)
5424 		return;
5425 
5426 	for (i = 0; i < nitems(memattr_bits); i++)
5427 		if ((range->attrs & memattr_mask) == memattr_bits[i])
5428 			break;
5429 
5430 	switch (i) {
5431 	case VM_MEMATTR_PMA:
5432 		mode = "PMA";
5433 		break;
5434 	case VM_MEMATTR_UNCACHEABLE:
5435 		mode = "NC ";
5436 		break;
5437 	case VM_MEMATTR_DEVICE:
5438 		mode = "IO ";
5439 		break;
5440 	default:
5441 		mode = "???";
5442 		break;
5443 	}
5444 
5445 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
5446 	    range->sva, eva,
5447 	    (range->attrs & PTE_W) == PTE_W ? 'w' : '-',
5448 	    (range->attrs & PTE_X) == PTE_X ? 'x' : '-',
5449 	    (range->attrs & PTE_U) == PTE_U ? 'u' : 's',
5450 	    (range->attrs & PTE_G) == PTE_G ? 'g' : '-',
5451 	    mode, range->l1pages, range->l2pages, range->l3pages);
5452 
5453 	/* Reset to sentinel value. */
5454 	range->sva = 0xfffffffffffffffful;
5455 }
5456 
5457 /*
5458  * Determine whether the attributes specified by a page table entry match those
5459  * being tracked by the current range.
5460  */
5461 static bool
5462 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
5463 {
5464 
5465 	return (range->attrs == attrs);
5466 }
5467 
5468 static void
5469 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
5470     pt_entry_t attrs)
5471 {
5472 
5473 	memset(range, 0, sizeof(*range));
5474 	range->sva = va;
5475 	range->attrs = attrs;
5476 }
5477 
5478 /*
5479  * Given a leaf PTE, derive the mapping's attributes. If they do not match
5480  * those of the current run, dump the address range and its attributes, and
5481  * begin a new run.
5482  */
5483 static void
5484 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
5485     vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e)
5486 {
5487 	pt_entry_t attrs;
5488 
5489 	/* The PTE global bit is inherited by lower levels. */
5490 	attrs = l1e & PTE_G;
5491 	if ((l1e & PTE_RWX) != 0) {
5492 		attrs |= l1e & (PTE_RWX | PTE_U);
5493 		attrs |= l1e & memattr_mask;
5494 	} else if (l2e != 0)
5495 		attrs |= l2e & PTE_G;
5496 
5497 	if ((l2e & PTE_RWX) != 0) {
5498 		attrs |= l2e & (PTE_RWX | PTE_U);
5499 		attrs |= l2e & memattr_mask;
5500 	} else if (l3e != 0) {
5501 		attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);
5502 		attrs |= l3e & memattr_mask;
5503 	}
5504 
5505 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
5506 		sysctl_kmaps_dump(sb, range, va);
5507 		sysctl_kmaps_reinit(range, va, attrs);
5508 	}
5509 }
5510 
5511 static int
5512 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
5513 {
5514 	struct pmap_kernel_map_range range;
5515 	struct sbuf sbuf, *sb;
5516 	pd_entry_t *l1, l1e, *l2, l2e;
5517 	pt_entry_t *l3, l3e;
5518 	vm_offset_t sva;
5519 	vm_paddr_t pa;
5520 	int error, i, j, k;
5521 
5522 	error = sysctl_wire_old_buffer(req, 0);
5523 	if (error != 0)
5524 		return (error);
5525 	sb = &sbuf;
5526 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
5527 
5528 	/* Sentinel value. */
5529 	range.sva = 0xfffffffffffffffful;
5530 
5531 	/*
5532 	 * Iterate over the kernel page tables without holding the kernel pmap
5533 	 * lock. Kernel page table pages are never freed, so at worst we will
5534 	 * observe inconsistencies in the output.
5535 	 */
5536 	sva = VM_MIN_KERNEL_ADDRESS;
5537 	for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) {
5538 		if (i == pmap_l1_index(DMAP_MIN_ADDRESS))
5539 			sbuf_printf(sb, "\nDirect map:\n");
5540 		else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS))
5541 			sbuf_printf(sb, "\nKernel map:\n");
5542 
5543 		l1 = pmap_l1(kernel_pmap, sva);
5544 		l1e = pmap_load(l1);
5545 		if ((l1e & PTE_V) == 0) {
5546 			sysctl_kmaps_dump(sb, &range, sva);
5547 			sva += L1_SIZE;
5548 			continue;
5549 		}
5550 		if ((l1e & PTE_RWX) != 0) {
5551 			sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0);
5552 			range.l1pages++;
5553 			sva += L1_SIZE;
5554 			continue;
5555 		}
5556 		pa = PTE_TO_PHYS(l1e);
5557 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
5558 
5559 		for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) {
5560 			l2e = l2[j];
5561 			if ((l2e & PTE_V) == 0) {
5562 				sysctl_kmaps_dump(sb, &range, sva);
5563 				sva += L2_SIZE;
5564 				continue;
5565 			}
5566 			if ((l2e & PTE_RWX) != 0) {
5567 				sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0);
5568 				range.l2pages++;
5569 				sva += L2_SIZE;
5570 				continue;
5571 			}
5572 			pa = PTE_TO_PHYS(l2e);
5573 			l3 = (pd_entry_t *)PHYS_TO_DMAP(pa);
5574 
5575 			for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++,
5576 			    sva += L3_SIZE) {
5577 				l3e = l3[k];
5578 				if ((l3e & PTE_V) == 0) {
5579 					sysctl_kmaps_dump(sb, &range, sva);
5580 					continue;
5581 				}
5582 				sysctl_kmaps_check(sb, &range, sva,
5583 				    l1e, l2e, l3e);
5584 				range.l3pages++;
5585 			}
5586 		}
5587 	}
5588 
5589 	error = sbuf_finish(sb);
5590 	sbuf_delete(sb);
5591 	return (error);
5592 }
5593 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
5594     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
5595     NULL, 0, sysctl_kmaps, "A",
5596     "Dump kernel address layout");
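
/*
 * The resulting table can be retrieved with "sysctl vm.pmap.kernel_maps";
 * CTLFLAG_SKIP merely hides the node from a plain "sysctl vm.pmap" listing.
 */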
5597