xref: /freebsd/sys/riscv/riscv/pmap.c (revision 9a6e9d7799235d3e122bcc8065e865ae265a6ce2)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1991 Regents of the University of California.
5  * All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  * Copyright (c) 2003 Peter Wemm
11  * All rights reserved.
12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13  * All rights reserved.
14  * Copyright (c) 2014 Andrew Turner
15  * All rights reserved.
16  * Copyright (c) 2014 The FreeBSD Foundation
17  * All rights reserved.
18  * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
19  * All rights reserved.
20  *
21  * This code is derived from software contributed to Berkeley by
22  * the Systems Programming Group of the University of Utah Computer
23  * Science Department and William Jolitz of UUNET Technologies Inc.
24  *
25  * Portions of this software were developed by Andrew Turner under
26  * sponsorship from The FreeBSD Foundation.
27  *
28  * Portions of this software were developed by SRI International and the
29  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
30  * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
31  *
32  * Portions of this software were developed by the University of Cambridge
33  * Computer Laboratory as part of the CTSRD Project, with support from the
34  * UK Higher Education Innovation Fund (HEIF).
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgement:
46  *	This product includes software developed by the University of
47  *	California, Berkeley and its contributors.
48  * 4. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  */
64 /*-
65  * Copyright (c) 2003 Networks Associates Technology, Inc.
66  * All rights reserved.
67  *
68  * This software was developed for the FreeBSD Project by Jake Burkholder,
69  * Safeport Network Services, and Network Associates Laboratories, the
70  * Security Research Division of Network Associates, Inc. under
71  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
72  * CHATS research program.
73  *
74  * Redistribution and use in source and binary forms, with or without
75  * modification, are permitted provided that the following conditions
76  * are met:
77  * 1. Redistributions of source code must retain the above copyright
78  *    notice, this list of conditions and the following disclaimer.
79  * 2. Redistributions in binary form must reproduce the above copyright
80  *    notice, this list of conditions and the following disclaimer in the
81  *    documentation and/or other materials provided with the distribution.
82  *
83  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
84  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
85  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
86  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
87  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
88  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
89  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
90  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
91  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
92  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
93  * SUCH DAMAGE.
94  */
95 
96 /*
97  *	Manages physical address maps.
98  *
99  *	Since the information managed by this module is
100  *	also stored by the logical address mapping module,
101  *	this module may throw away valid virtual-to-physical
102  *	mappings at almost any time.  However, invalidations
103  *	of virtual-to-physical mappings must be done as
104  *	requested.
105  *
106  *	In order to cope with hardware architectures which
107  *	make virtual-to-physical map invalidates expensive,
108  *	this module may delay invalidate or reduced protection
109  *	operations until such time as they are actually
110  *	necessary.  This module is given full information as
111  *	to which processors are currently using which maps,
112  *	and to when physical maps must be made correct.
113  */
114 
115 #include "opt_pmap.h"
116 
117 #include <sys/param.h>
118 #include <sys/systm.h>
119 #include <sys/bitstring.h>
120 #include <sys/bus.h>
121 #include <sys/cpuset.h>
122 #include <sys/kernel.h>
123 #include <sys/ktr.h>
124 #include <sys/lock.h>
125 #include <sys/malloc.h>
126 #include <sys/mman.h>
127 #include <sys/msgbuf.h>
128 #include <sys/mutex.h>
129 #include <sys/physmem.h>
130 #include <sys/proc.h>
131 #include <sys/rwlock.h>
132 #include <sys/sbuf.h>
133 #include <sys/sx.h>
134 #include <sys/vmem.h>
135 #include <sys/vmmeter.h>
136 #include <sys/sched.h>
137 #include <sys/sysctl.h>
138 #include <sys/smp.h>
139 
140 #include <vm/vm.h>
141 #include <vm/vm_param.h>
142 #include <vm/vm_kern.h>
143 #include <vm/vm_page.h>
144 #include <vm/vm_map.h>
145 #include <vm/vm_object.h>
146 #include <vm/vm_extern.h>
147 #include <vm/vm_pageout.h>
148 #include <vm/vm_pager.h>
149 #include <vm/vm_phys.h>
150 #include <vm/vm_radix.h>
151 #include <vm/vm_reserv.h>
152 #include <vm/vm_dumpset.h>
153 #include <vm/uma.h>
154 
155 #include <machine/machdep.h>
156 #include <machine/md_var.h>
157 #include <machine/pcb.h>
158 #include <machine/sbi.h>
159 #include <machine/thead.h>
160 
161 /*
162  * Boundary values for the page table page index space:
163  *
164  * L3 pages: [0, NUL2E)
165  * L2 pages: [NUL2E, NUL2E + NUL1E)
166  * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
167  *
168  * Note that these ranges are used in both SV39 and SV48 mode.  In SV39 mode the
169  * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages
170  * in a set of page tables.
171  */
/*
 * Sizes of the three page table page index ranges listed above: NUL2E
 * bounds the L3 range, NUL1E the L2 range and NUL0E the L1 range.  Each
 * constant is Ln_ENTRIES times the previous one.
 */
#define	NUL0E		Ln_ENTRIES
#define	NUL1E		(Ln_ENTRIES * NUL0E)
#define	NUL2E		(Ln_ENTRIES * NUL1E)
175 
/*
 * PV_STAT(x) executes the statement x only when PV_STATS is defined;
 * otherwise it expands to an empty statement.  __pv_stat_used expands to
 * nothing with PV_STATS and to __unused without it (presumably to annotate
 * variables that are referenced only from PV_STAT()).
 */
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#define	__pv_stat_used
#else
#define PV_STAT(x)	do { } while (0)
#define	__pv_stat_used	__unused
#endif
183 
/*
 * Index helpers.
 *
 * pmap_l2_pindex(v) is the L2_SHIFT-granular index of v; pmap_l1_pindex(v)
 * is the L1_SHIFT-granular index of v biased by NUL2E.
 * pa_index(pa) is the L2_SHIFT-granular index of a physical address and
 * pa_to_pvh(pa) is the address of the pv_table[] element at that index.
 */
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
#define	pa_index(pa)		((pa) >> L2_SHIFT)
#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
188 
/*
 * The PV list locks form a fixed pool of NPV_LIST_LOCKS (MAXCPU) entries.
 * PHYS_TO_PV_LIST_LOCK(pa) picks the pool entry for a physical address by
 * taking pa_index(pa) modulo the pool size, so every address with the same
 * pa_index() maps to the same lock.
 */
#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
193 
/*
 * Make *lockp refer to the PV list lock covering physical address pa.
 *
 * If that lock differs from the one currently recorded in *lockp, the old
 * lock (when non-NULL) is write-unlocked, the new lock is stored in *lockp
 * and then write-locked.  When *lockp already names the right lock nothing
 * is done.
 */
#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)
206 
/*
 * Same as CHANGE_PV_LIST_LOCK_TO_PHYS(), keyed by the physical address of
 * the vm_page m.
 */
#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
209 
/*
 * Write-unlock the PV list lock recorded in *lockp, if any, and reset
 * *lockp to NULL.  A NULL *lockp is left untouched.
 */
#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)
218 
/* The PV list lock covering the physical address of the vm_page m. */
#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
221 
/* Parent node (vm.pmap) for the pmap sysctls declared in this file. */
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/*
 * The list of all the user pmaps.  pmap_distribute_l1() walks it with
 * allpmaps_lock held.
 */
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();

/*
 * The translation mode in use, SV39 unless changed during bootstrap.  The
 * sysctl is flagged CTLFLAG_NOFETCH; the "vm.pmap.mode" tunable is read
 * explicitly by pmap_create_pagetables().
 */
enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39;
SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_mode, 0,
    "translation mode, 0 = SV39, 1 = SV48");
233 
/* Storage for the kernel's pmap (presumably what kernel_pmap refers to). */
struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

/*
 * Direct map bounds.  pmap_bootstrap_dmap() sets dmap_phys_base to the
 * lowest physical address rounded down to L1_SIZE, dmap_phys_max to the
 * highest physical address, and dmap_max_addr to the VA just past the last
 * mapping it creates.
 */
vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
243 
/*
 * Boot-time tunable (vm.pmap.growkernel_panic), off by default.  Per its
 * description, requests a panic when a kernel page table page cannot be
 * allocated.
 */
static int pmap_growkernel_panic = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN,
    &pmap_growkernel_panic, 0,
    "panic on failure to allocate kernel page table page");
248 
/*
 * This code assumes all L1 DMAP entries will be used, so both ends of the
 * DMAP region must be L1_SIZE-aligned.
 */
CTASSERT((DMAP_MIN_ADDRESS  & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_ADDRESS);

/*
 * This code assumes that the early DEVMAP is L2_SIZE aligned; the check
 * below enforces that its size is a multiple of L2_SIZE.
 */
CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0);
257 
/* Global PV lock; initialized in pmap_bootstrap() as "pmap pv global". */
static struct rwlock_padalign pvh_global_lock;
/* Held while walking the allpmaps list (see pmap_distribute_l1()). */
static struct mtx_padalign allpmaps_lock;

/* Boot-time tunable (vm.pmap.superpages_enabled), on by default. */
static int __read_frequently superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN, &superpages_enabled, 0,
    "Enable support for transparent superpages");
265 
/*
 * Read-only superpage statistics, exported under vm.pmap.l2 (2MB pages)
 * and vm.pmap.l1 (1GB pages).  The L2 counters are plain u_long values;
 * the L1 demotion counter is a counter(9)-style 64-bit counter.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0,
    "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0,
    "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0,
    "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0,
    "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L1 (1GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l1_demotions, "L1 (1GB) page demotions");
295 
/*
 * Data for the pv entry allocation mechanism
 *
 * pv_list_locks[] is the lock pool indexed by PHYS_TO_PV_LIST_LOCK(), and
 * pv_table[] is the array indexed by pa_to_pvh().  pv_chunks_mutex
 * presumably protects the pv_chunks list -- confirm against its users.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

/* Defined outside this file. */
extern cpuset_t all_harts;
306 
/*
 * Internal flags for pmap_enter()'s helper functions.  They are passed in
 * the 'flags' argument of helpers such as pmap_enter_l2().
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
312 
/*
 * Forward declarations of file-local helpers.
 */

/* PV entry and PV chunk management. */
static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
/* Superpage demotion and mapping entry/removal. */
static bool	pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va);
static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
		    vm_offset_t va, struct rwlock **lockp);
static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

/* Page table page allocation and release. */
static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

static int pmap_change_attr_locked(void *va, vm_size_t size, int mode);

static uint64_t pmap_satp_mode(void);
343 
/*
 * Page table entry accessors.  Every access goes through a 64-bit atomic
 * primitive:
 *
 *	pmap_load		atomic load of the entry
 *	pmap_store		atomic store of a new entry
 *	pmap_clear		store of zero
 *	pmap_load_store		atomic swap, yielding the previous entry
 *	pmap_load_clear		swap with zero
 *	pmap_store_bits		atomically set the given bits
 *	pmap_clear_bits		atomically clear the given bits
 */
#define	pmap_clear(pte)			pmap_store(pte, 0)
#define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
#define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
#define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
#define	pmap_load(pte)			atomic_load_64(pte)
#define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
#define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)
351 
352 /********************/
353 /* Inline functions */
354 /********************/
355 
356 static __inline void
pagecopy(void * s,void * d)357 pagecopy(void *s, void *d)
358 {
359 
360 	memcpy(d, s, PAGE_SIZE);
361 }
362 
363 static __inline void
pagezero(void * p)364 pagezero(void *p)
365 {
366 
367 	bzero(p, PAGE_SIZE);
368 }
369 
/*
 * Extract the table index that 'va' selects at each page table level: shift
 * by the level's Ln_SHIFT and mask with Ln_ADDR_MASK.
 */
#define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)

/*
 * Recover the physical address encoded in an entry.  The PTE_HI_MASK bits
 * are discarded first.  PTE_TO_PHYS() uses the whole PPN field and yields a
 * PAGE_SIZE-granular address; L2PTE_TO_PHYS() and L1PTE_TO_PHYS() start at
 * PPN1/PPN2 and yield L2_SIZE-/L1_SIZE-granular addresses respectively.
 * PTE_TO_VM_PAGE() maps the PTE_TO_PHYS() result to its vm_page.
 */
#define	PTE_TO_PHYS(pte) \
    ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
#define	L2PTE_TO_PHYS(l2) \
    ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
#define	L1PTE_TO_PHYS(l1) \
    ((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT)
#define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
382 
/*
 * Construct a page table entry of the specified level pointing to physical
 * address pa, with PTE bits 'bits'.
 *
 * A leaf PTE of any level must point to an address matching its alignment,
 * e.g. L2 pages must be 2MB aligned in memory.  The macros enforce nothing:
 * the low-order bits of pa below the level's shift are simply dropped.
 */
#define	L1_PTE(pa, bits)	((((pa) >> L1_SHIFT) << PTE_PPN2_S) | (bits))
#define	L2_PTE(pa, bits)	((((pa) >> L2_SHIFT) << PTE_PPN1_S) | (bits))
#define	L3_PTE(pa, bits)	((((pa) >> L3_SHIFT) << PTE_PPN0_S) | (bits))

/*
 * Construct a page directory entry (PDE), pointing to next level entry at pa,
 * with PTE bits 'bits'.
 *
 * Unlike PTEs, page directory entries can point to any 4K-aligned physical
 * address.  For that reason every PDE constructor encodes pa exactly as
 * L3_PTE() does.
 */
#define	L0_PDE(pa, bits)	L3_PTE(pa, bits)
#define	L1_PDE(pa, bits)	L3_PTE(pa, bits)
#define	L2_PDE(pa, bits)	L3_PTE(pa, bits)
404 
405 static __inline pd_entry_t *
pmap_l0(pmap_t pmap,vm_offset_t va)406 pmap_l0(pmap_t pmap, vm_offset_t va)
407 {
408 	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
409 	KASSERT(VIRT_IS_VALID(va),
410 	    ("%s: malformed virtual address %#lx", __func__, va));
411 	return (&pmap->pm_top[pmap_l0_index(va)]);
412 }
413 
414 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t * l0,vm_offset_t va)415 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
416 {
417 	vm_paddr_t phys;
418 	pd_entry_t *l1;
419 
420 	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
421 	phys = PTE_TO_PHYS(pmap_load(l0));
422 	l1 = PHYS_TO_DMAP(phys);
423 
424 	return (&l1[pmap_l1_index(va)]);
425 }
426 
427 static __inline pd_entry_t *
pmap_l1(pmap_t pmap,vm_offset_t va)428 pmap_l1(pmap_t pmap, vm_offset_t va)
429 {
430 	pd_entry_t *l0;
431 
432 	KASSERT(VIRT_IS_VALID(va),
433 	    ("%s: malformed virtual address %#lx", __func__, va));
434 	if (pmap_mode == PMAP_MODE_SV39) {
435 		return (&pmap->pm_top[pmap_l1_index(va)]);
436 	} else {
437 		l0 = pmap_l0(pmap, va);
438 		if ((pmap_load(l0) & PTE_V) == 0)
439 			return (NULL);
440 		if ((pmap_load(l0) & PTE_RX) != 0)
441 			return (NULL);
442 		return (pmap_l0_to_l1(l0, va));
443 	}
444 }
445 
446 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t * l1,vm_offset_t va)447 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
448 {
449 	vm_paddr_t phys;
450 	pd_entry_t *l2;
451 
452 	phys = PTE_TO_PHYS(pmap_load(l1));
453 	l2 = PHYS_TO_DMAP(phys);
454 
455 	return (&l2[pmap_l2_index(va)]);
456 }
457 
458 static __inline pd_entry_t *
pmap_l2(pmap_t pmap,vm_offset_t va)459 pmap_l2(pmap_t pmap, vm_offset_t va)
460 {
461 	pd_entry_t *l1;
462 
463 	l1 = pmap_l1(pmap, va);
464 	if (l1 == NULL)
465 		return (NULL);
466 	if ((pmap_load(l1) & PTE_V) == 0)
467 		return (NULL);
468 	if ((pmap_load(l1) & PTE_RX) != 0)
469 		return (NULL);
470 
471 	return (pmap_l1_to_l2(l1, va));
472 }
473 
474 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t * l2,vm_offset_t va)475 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
476 {
477 	vm_paddr_t phys;
478 	pt_entry_t *l3;
479 
480 	phys = PTE_TO_PHYS(pmap_load(l2));
481 	l3 = PHYS_TO_DMAP(phys);
482 
483 	return (&l3[pmap_l3_index(va)]);
484 }
485 
486 static __inline pt_entry_t *
pmap_l3(pmap_t pmap,vm_offset_t va)487 pmap_l3(pmap_t pmap, vm_offset_t va)
488 {
489 	pd_entry_t *l2;
490 
491 	l2 = pmap_l2(pmap, va);
492 	if (l2 == NULL)
493 		return (NULL);
494 	if ((pmap_load(l2) & PTE_V) == 0)
495 		return (NULL);
496 	if ((pmap_load(l2) & PTE_RX) != 0)
497 		return (NULL);
498 
499 	return (pmap_l2_to_l3(l2, va));
500 }
501 
502 static __inline void
pmap_resident_count_inc(pmap_t pmap,int count)503 pmap_resident_count_inc(pmap_t pmap, int count)
504 {
505 
506 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
507 	pmap->pm_stats.resident_count += count;
508 }
509 
510 static __inline void
pmap_resident_count_dec(pmap_t pmap,int count)511 pmap_resident_count_dec(pmap_t pmap, int count)
512 {
513 
514 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
515 	KASSERT(pmap->pm_stats.resident_count >= count,
516 	    ("pmap %p resident count underflow %ld %d", pmap,
517 	    pmap->pm_stats.resident_count, count));
518 	pmap->pm_stats.resident_count -= count;
519 }
520 
521 static void
pmap_distribute_l1(struct pmap * pmap,vm_pindex_t l1index,pt_entry_t entry)522 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
523     pt_entry_t entry)
524 {
525 	struct pmap *user_pmap;
526 	pd_entry_t *l1;
527 
528 	/*
529 	 * Distribute new kernel L1 entry to all the user pmaps.  This is only
530 	 * necessary with three-level paging configured: with four-level paging
531 	 * the kernel's half of the top-level page table page is static and can
532 	 * simply be copied at pmap initialization time.
533 	 */
534 	if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39)
535 		return;
536 
537 	mtx_lock(&allpmaps_lock);
538 	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
539 		l1 = &user_pmap->pm_top[l1index];
540 		pmap_store(l1, entry);
541 	}
542 	mtx_unlock(&allpmaps_lock);
543 }
544 
/*
 * Holds the PTE mode bits (defined in pte.h) for defining e.g. cacheability.
 *
 * The indices correspond to the VM_MEMATTR_* defines in riscv/include/vm.h.
 *
 * Every element stays zero if no mode bits are supported by the CPU, e.g.
 * when lacking the Svpbmt extension; pmap_bootstrap() fills in the entries
 * only when it detects such support.  memattr_mask is set alongside, to the
 * mask covering the mode bits in use.
 */
static __read_frequently pt_entry_t memattr_bits[VM_MEMATTR_TOTAL];
static __read_frequently pt_entry_t memattr_mask;
555 
556 static __inline pt_entry_t
pmap_memattr_bits(vm_memattr_t mode)557 pmap_memattr_bits(vm_memattr_t mode)
558 {
559 	KASSERT(pmap_is_valid_memattr(kernel_pmap, mode),
560 	    ("invalid memory mode %u\n", mode));
561 	return (memattr_bits[(int)mode]);
562 }
563 
564 /*
565  * This should only be used during pmap bootstrap e.g. by
566  * pmap_create_pagetables().
567  */
568 static pt_entry_t *
pmap_early_alloc_tables(vm_paddr_t * freemempos,int npages)569 pmap_early_alloc_tables(vm_paddr_t *freemempos, int npages)
570 {
571 	pt_entry_t *pt;
572 
573 	pt = (pt_entry_t *)*freemempos;
574 	*freemempos += npages * PAGE_SIZE;
575 	bzero(pt, npages * PAGE_SIZE);
576 
577 	return (pt);
578 }
579 
580 /*
581  *	Construct the Direct Map -- a linear mapping of physical memory into
582  *	the kernel address space.
583  *
584  *	We walk the list of physical memory segments (of arbitrary size and
585  *	alignment) mapping each appropriately. Consequently, the DMAP address
586  *	space will have unmapped regions corresponding to the holes between
587  *	physical memory segments.
588  */
589 static vm_paddr_t
pmap_bootstrap_dmap(pd_entry_t * l1,vm_paddr_t freemempos)590 pmap_bootstrap_dmap(pd_entry_t *l1, vm_paddr_t freemempos)
591 {
592 	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
593 	vm_offset_t va;
594 	vm_paddr_t min_pa, max_pa, pa, endpa;
595 	pd_entry_t *l3, *l2;
596 	pt_entry_t memattr;
597 	u_int l1slot, l2slot, l3slot;
598 	int physmap_idx;
599 
600 	physmap_idx = physmem_avail(physmap, nitems(physmap));
601 	min_pa = physmap[0];
602 	max_pa = physmap[physmap_idx - 1];
603 
604 	printf("physmap_idx %u\n", physmap_idx);
605 	printf("min_pa %lx\n", min_pa);
606 	printf("max_pa %lx\n", max_pa);
607 
608 	/* Set the limits of the DMAP region. */
609 	dmap_phys_base = rounddown(min_pa, L1_SIZE);
610 	dmap_phys_max = max_pa;
611 
612 	memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
613 
614 	/*
615 	 * Walk the physmap table, using the largest page sizes possible for each
616 	 * mapping. So, for each physmap entry, map as needed/able:
617 	 *  - 4K/L3 page prefix
618 	 *  - 2M/L2 superpage prefix
619 	 *  - 1G/L1 superpages
620 	 *  - 2M/L2 superpage suffix
621 	 *  - 4K/L3 page suffix
622 	 */
623 	l3 = l2 = NULL;
624 	l2slot = l1slot = Ln_ENTRIES; /* sentinel value */
625 	for (int idx = 0; idx < physmap_idx; idx += 2) {
626 		pa = rounddown(physmap[idx], L3_SIZE);
627 		endpa = physmap[idx + 1];
628 
629 		/* Virtual address for this range. */
630 		va = PHYS_TO_DMAP_ADDR(pa);
631 
632 		/* Any 2MB possible for this range? */
633 		if (roundup(pa, L2_SIZE) + L2_SIZE > endpa)
634 			goto l3end;
635 
636 		/* Loop until the next 2MB boundary. */
637 		while ((pa & L2_OFFSET) != 0) {
638 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
639 				/* Need to alloc another page table. */
640 				l2 = pmap_early_alloc_tables(&freemempos, 1);
641 
642 				/* Link it. */
643 				l1slot = pmap_l1_index(va);
644 				pmap_store(&l1[l1slot],
645 				    L1_PDE((vm_paddr_t)l2, PTE_V));
646 			}
647 
648 			if (l3 == NULL || pmap_l2_index(va) != l2slot) {
649 				l3 = pmap_early_alloc_tables(&freemempos, 1);
650 
651 				/* Link it to L2. */
652 				l2slot = pmap_l2_index(va);
653 				pmap_store(&l2[l2slot],
654 				    L2_PDE((vm_paddr_t)l3, PTE_V));
655 			}
656 
657 			/* map l3 pages */
658 			l3slot = pmap_l3_index(va);
659 			pmap_store(&l3[l3slot], L3_PTE(pa, PTE_KERN | memattr));
660 
661 			pa += L3_SIZE;
662 			va += L3_SIZE;
663 		}
664 
665 		/* Any 1GB possible for remaining range? */
666 		if (roundup(pa, L1_SIZE) + L1_SIZE > endpa)
667 			goto l2end;
668 
669 		/* Loop until the next 1GB boundary. */
670 		while ((pa & L1_OFFSET) != 0) {
671 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
672 				/* Need to alloc another page table. */
673 				l2 = pmap_early_alloc_tables(&freemempos, 1);
674 
675 				/* Link it. */
676 				l1slot = pmap_l1_index(va);
677 				pmap_store(&l1[l1slot],
678 				    L1_PDE((vm_paddr_t)l2, PTE_V));
679 			}
680 
681 			/* map l2 pages */
682 			l2slot = pmap_l2_index(va);
683 			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
684 
685 			pa += L2_SIZE;
686 			va += L2_SIZE;
687 		}
688 
689 		/* Map what we can with 1GB superpages. */
690 		while (pa + L1_SIZE - 1 < endpa) {
691 			/* map l1 pages */
692 			l1slot = pmap_l1_index(va);
693 			pmap_store(&l1[l1slot], L1_PTE(pa, PTE_KERN | memattr));
694 
695 			pa += L1_SIZE;
696 			va += L1_SIZE;
697 		}
698 
699 l2end:
700 		/* Map what we can with 2MB superpages. */
701 		while (pa + L2_SIZE - 1 < endpa) {
702 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
703 				/* Need to alloc another page table. */
704 				l2 = pmap_early_alloc_tables(&freemempos, 1);
705 
706 				/* Link it. */
707 				l1slot = pmap_l1_index(va);
708 				pmap_store(&l1[l1slot],
709 				    L1_PDE((vm_paddr_t)l2, PTE_V));
710 			}
711 
712 			/* map l2 pages */
713 			l2slot = pmap_l2_index(va);
714 			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
715 
716 			pa += L2_SIZE;
717 			va += L2_SIZE;
718 		}
719 
720 l3end:
721 		while (pa < endpa) {
722 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
723 				/* Need to alloc another page table. */
724 				l2 = pmap_early_alloc_tables(&freemempos, 1);
725 
726 				/* Link it. */
727 				l1slot = pmap_l1_index(va);
728 				pmap_store(&l1[l1slot],
729 				    L1_PDE((vm_paddr_t)l2, PTE_V));
730 			}
731 
732 			if (l3 == NULL || pmap_l2_index(va) != l2slot) {
733 				l3 = pmap_early_alloc_tables(&freemempos, 1);
734 
735 				/* Link it to L2. */
736 				l2slot = pmap_l2_index(va);
737 				pmap_store(&l2[l2slot],
738 				    L2_PDE((vm_paddr_t)l3, PTE_V));
739 			}
740 
741 			/* map l3 pages */
742 			l3slot = pmap_l3_index(va);
743 			pmap_store(&l3[l3slot], L3_PTE(pa, PTE_KERN | memattr));
744 
745 			pa += L3_SIZE;
746 			va += L3_SIZE;
747 		}
748 	}
749 
750 	/* And finally, the limit on DMAP VA. */
751 	dmap_max_addr = va;
752 
753 	return (freemempos);
754 }
755 
756 /*
757  *	Create a new set of pagetables to run the kernel with.
758  *
759  *	An initial, temporary setup was created in locore.S, which serves well
760  *	enough to get us this far. It mapped kernstart -> KERNBASE, using 2MB
761  *	superpages, and created a 1GB identity map, which allows this function
762  *	to dereference physical addresses.
763  *
764  *	The memory backing these page tables is allocated in the space
765  *	immediately following the kernel's preload area. Depending on the size
766  *	of this area, some, all, or none of these pages can be implicitly
767  *	mapped by the kernel's 2MB mappings. This memory will only ever be
768  *	accessed through the direct map, however.
769  */
770 static vm_paddr_t
pmap_create_pagetables(vm_paddr_t kernstart,vm_size_t kernlen,vm_paddr_t * root_pt_phys)771 pmap_create_pagetables(vm_paddr_t kernstart, vm_size_t kernlen,
772     vm_paddr_t *root_pt_phys)
773 {
774 	pt_entry_t *l0, *l1, *kern_l2, *kern_l3, *devmap_l3;
775 	pt_entry_t memattr;
776 	pd_entry_t *devmap_l2;
777 	vm_paddr_t kernend, freemempos, pa;
778 	int nkernl2, nkernl3, ndevmapl3;
779 	int i, slot;
780 	int mode;
781 
782 	kernend = kernstart + kernlen;
783 
784 	/* Static allocations begin after the kernel staging area. */
785 	freemempos = roundup2(kernend, PAGE_SIZE);
786 
787 	/* Detect Sv48 mode. */
788 	mode = PMAP_MODE_SV39;
789 	TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
790 
791 	if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) {
792 		/*
793 		 * Sv48 mode: allocate an L0 page table to be the root. The
794 		 * layout of KVA is otherwise identical to Sv39.
795 		 */
796 		l0 = pmap_early_alloc_tables(&freemempos, 1);
797 		*root_pt_phys = (vm_paddr_t)l0;
798 		pmap_mode = PMAP_MODE_SV48;
799 	} else {
800 		l0 = NULL;
801 	}
802 
803 	/*
804 	 * Allocate an L1 page table.
805 	 */
806 	l1 = pmap_early_alloc_tables(&freemempos, 1);
807 	if (pmap_mode == PMAP_MODE_SV39)
808 		*root_pt_phys = (vm_paddr_t)l1;
809 
810 	/*
811 	 * Allocate a set of L2 page tables for KVA. Most likely, only 1 is
812 	 * needed.
813 	 */
814 	nkernl2 = howmany(howmany(kernlen, L2_SIZE), Ln_ENTRIES);
815 	kern_l2 = pmap_early_alloc_tables(&freemempos, nkernl2);
816 
817 	/*
818 	 * Allocate an L2 page table for the static devmap, located at the end
819 	 * of KVA. We can expect that the devmap will always be less than 1GB
820 	 * in size.
821 	 */
822 	devmap_l2 = pmap_early_alloc_tables(&freemempos, 1);
823 
824 	/* Allocate L3 page tables for the devmap. */
825 	ndevmapl3 = howmany(howmany(PMAP_MAPDEV_EARLY_SIZE, L3_SIZE),
826 	    Ln_ENTRIES);
827 	devmap_l3 = pmap_early_alloc_tables(&freemempos, ndevmapl3);
828 
829 	/*
830 	 * Allocate some L3 bootstrap pages, for early KVA allocations before
831 	 * vm_mem_init() has run. For example, the message buffer.
832 	 *
833 	 * A somewhat arbitrary choice of 32MB. This should be more than enough
834 	 * for any early allocations. There is no need to worry about waste, as
835 	 * whatever is not used will be consumed by later calls to
836 	 * pmap_growkernel().
837 	 */
838 	nkernl3 = 16;
839 	kern_l3 = pmap_early_alloc_tables(&freemempos, nkernl3);
840 
841 	/* Bootstrap the direct map. */
842 	freemempos = pmap_bootstrap_dmap(l1, freemempos);
843 
844 	/* Allocations are done. */
845 	if (freemempos < roundup2(kernend, L2_SIZE))
846 		freemempos = roundup2(kernend, L2_SIZE);
847 
848 	/* Memory attributes for standard/main memory. */
849 	memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
850 
851 	/*
852 	 * Map the kernel (and preloaded modules or data) using L2 superpages.
853 	 *
854 	 * kernstart is 2MB-aligned. This is enforced by loader(8) and required
855 	 * by locore assembly.
856 	 *
857 	 * TODO: eventually, this should be done with proper permissions for
858 	 * each segment, rather than mapping the entire kernel and preloaded
859 	 * modules RWX.
860 	 */
861 	slot = pmap_l2_index(KERNBASE);
862 	for (pa = kernstart; pa < kernend; pa += L2_SIZE, slot++) {
863 		pmap_store(&kern_l2[slot],
864 		    L2_PTE(pa, PTE_KERN | PTE_X | memattr));
865 	}
866 
867 	/*
868 	 * Connect the L3 bootstrap pages to the kernel L2 table. The L3 PTEs
869 	 * themselves are invalid.
870 	 */
871 	slot = pmap_l2_index(freemempos - kernstart + KERNBASE);
872 	for (i = 0; i < nkernl3; i++, slot++) {
873 		pa = (vm_paddr_t)kern_l3 + ptoa(i);
874 		pmap_store(&kern_l2[slot], L2_PDE(pa, PTE_V));
875 	}
876 
877 	/* Connect the L2 tables to the L1 table. */
878 	slot = pmap_l1_index(KERNBASE);
879 	for (i = 0; i < nkernl2; i++, slot++) {
880 		pa = (vm_paddr_t)kern_l2 + ptoa(i);
881 		pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
882 	}
883 
884 	/* Connect the L1 table to L0, if in use. */
885 	if (pmap_mode == PMAP_MODE_SV48) {
886 		slot = pmap_l0_index(KERNBASE);
887 		pmap_store(&l0[slot], L0_PDE((vm_paddr_t)l1, PTE_V));
888 	}
889 
890 	/*
891 	 * Connect the devmap L3 pages to the L2 table. The devmap PTEs
892 	 * themselves are invalid.
893 	 */
894 	slot = pmap_l2_index(DEVMAP_MIN_VADDR);
895 	for (i = 0; i < ndevmapl3; i++, slot++) {
896 		pa = (vm_paddr_t)devmap_l3 + ptoa(i);
897 		pmap_store(&devmap_l2[slot], L2_PDE(pa, PTE_V));
898 	}
899 
900 	/* Connect the devmap L2 pages to the L1 table. */
901 	slot = pmap_l1_index(DEVMAP_MIN_VADDR);
902 	pa = (vm_paddr_t)devmap_l2;
903 	pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
904 
905 	/* Return the next position of free memory */
906 	return (freemempos);
907 }
908 
909 /*
910  *	Bootstrap the system enough to run with virtual memory.
911  */
912 void
pmap_bootstrap(vm_paddr_t kernstart,vm_size_t kernlen)913 pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen)
914 {
915 	vm_paddr_t freemempos, pa;
916 	vm_paddr_t root_pt_phys;
917 	vm_offset_t freeva;
918 	vm_offset_t dpcpu, msgbufpv;
919 	pt_entry_t *pte;
920 	int i;
921 
922 	printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen);
923 
924 	mtx_init(&kernel_pmap->pm_mtx, "kernel pmap", NULL, MTX_DEF);
925 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
926 	vm_radix_init(&kernel_pmap->pm_root);
927 
928 	rw_init(&pvh_global_lock, "pmap pv global");
929 
930 	/*
931 	 * Set the current CPU as active in the kernel pmap. Secondary cores
932 	 * will add themselves later in init_secondary(). The SBI firmware
933 	 * may rely on this mask being precise, so CPU_FILL() is not used.
934 	 */
935 	CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);
936 
937 	/*
938 	 * Set up the memory attribute bits.
939 	 */
940 	if (has_svpbmt) {
941 		memattr_bits[VM_MEMATTR_PMA] = PTE_MA_NONE;
942 		memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_MA_NC;
943 		memattr_bits[VM_MEMATTR_DEVICE] = PTE_MA_IO;
944 		memattr_mask = PTE_MA_MASK;
945 	} else if (has_errata_thead_pbmt) {
946 		memattr_bits[VM_MEMATTR_PMA] = PTE_THEAD_MA_NONE;
947 		memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_THEAD_MA_NC;
948 		memattr_bits[VM_MEMATTR_DEVICE] = PTE_THEAD_MA_IO;
949 		memattr_mask = PTE_THEAD_MA_MASK;
950 	}
951 
952 	/* Create a new set of pagetables to run the kernel in. */
953 	freemempos = pmap_create_pagetables(kernstart, kernlen, &root_pt_phys);
954 
955 	/* Switch to the newly created page tables. */
956 	kernel_pmap->pm_stage = PM_STAGE1;
957 	kernel_pmap->pm_top = PHYS_TO_DMAP(root_pt_phys);
958 	kernel_pmap->pm_satp = atop(root_pt_phys) | pmap_satp_mode();
959 	csr_write(satp, kernel_pmap->pm_satp);
960 	sfence_vma();
961 
962 	/*
963 	 * Now, we need to make a few more static reservations from KVA.
964 	 *
965 	 * Set freeva to freemempos virtual address, and be sure to advance
966 	 * them together.
967 	 */
968 	freeva = freemempos - kernstart + KERNBASE;
969 #define reserve_space(var, pa, size)					\
970 	do {								\
971 		var = freeva;						\
972 		pa = freemempos;					\
973 		freeva += size;						\
974 		freemempos += size;					\
975 	} while (0)
976 
977 	/* Allocate the dynamic per-cpu area. */
978 	reserve_space(dpcpu, pa, DPCPU_SIZE);
979 
980 	/* Map it. */
981 	pte = pmap_l3(kernel_pmap, dpcpu);
982 	KASSERT(pte != NULL, ("Bootstrap pages missing"));
983 	for (i = 0; i < howmany(DPCPU_SIZE, PAGE_SIZE); i++)
984 		pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
985 		    pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
986 
987 	/* Now, it can be initialized. */
988 	dpcpu_init((void *)dpcpu, 0);
989 
990 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
991 	reserve_space(msgbufpv, pa, round_page(msgbufsize));
992 	msgbufp = (void *)msgbufpv;
993 
994 	/* Map it. */
995 	pte = pmap_l3(kernel_pmap, msgbufpv);
996 	KASSERT(pte != NULL, ("Bootstrap pages missing"));
997 	for (i = 0; i < howmany(msgbufsize, PAGE_SIZE); i++)
998 		pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
999 		    pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
1000 
1001 #undef	reserve_space
1002 
1003 	/* Mark the bounds of our available virtual address space */
1004 	virtual_avail = kernel_vm_end = freeva;
1005 	virtual_end = DEVMAP_MIN_VADDR;
1006 
1007 	/* Exclude the reserved physical memory from allocations. */
1008 	physmem_exclude_region(kernstart, freemempos - kernstart,
1009 	    EXFLAG_NOALLOC);
1010 }
1011 
1012 /*
1013  *	Initialize a vm_page's machine-dependent fields.
1014  */
1015 void
pmap_page_init(vm_page_t m)1016 pmap_page_init(vm_page_t m)
1017 {
1018 
1019 	TAILQ_INIT(&m->md.pv_list);
1020 	m->md.pv_memattr = VM_MEMATTR_DEFAULT;
1021 }
1022 
1023 /*
1024  *	Initialize the pmap module.
1025  *
1026  *	Called by vm_mem_init(), to initialize any structures that the pmap
1027  *	system needs to map virtual memory.
1028  */
1029 void
pmap_init(void)1030 pmap_init(void)
1031 {
1032 	vm_size_t s;
1033 	int i, pv_npg;
1034 
1035 	/*
1036 	 * Initialize the pv chunk and pmap list mutexes.
1037 	 */
1038 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1039 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);
1040 
1041 	/*
1042 	 * Initialize the pool of pv list locks.
1043 	 */
1044 	for (i = 0; i < NPV_LIST_LOCKS; i++)
1045 		rw_init(&pv_list_locks[i], "pmap pv list");
1046 
1047 	/*
1048 	 * Calculate the size of the pv head table for superpages.
1049 	 */
1050 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
1051 
1052 	/*
1053 	 * Allocate memory for the pv head table for superpages.
1054 	 */
1055 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1056 	s = round_page(s);
1057 	pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
1058 	for (i = 0; i < pv_npg; i++)
1059 		TAILQ_INIT(&pv_table[i].pv_list);
1060 	TAILQ_INIT(&pv_dummy.pv_list);
1061 
1062 	if (superpages_enabled)
1063 		pagesizes[1] = L2_SIZE;
1064 }
1065 
1066 #ifdef SMP
1067 /*
1068  * For SMP, these functions have to use IPIs for coherence.
1069  *
1070  * In general, the calling thread uses a plain fence to order the
1071  * writes to the page tables before invoking an SBI callback to invoke
1072  * sfence_vma() on remote CPUs.
1073  */
1074 static void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va)1075 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1076 {
1077 	cpuset_t mask;
1078 
1079 	sched_pin();
1080 	mask = pmap->pm_active;
1081 	CPU_CLR(PCPU_GET(hart), &mask);
1082 	fence();
1083 	if (!CPU_EMPTY(&mask) && smp_started)
1084 		sbi_remote_sfence_vma(mask.__bits, va, 1);
1085 	sfence_vma_page(va);
1086 	sched_unpin();
1087 }
1088 
1089 static void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)1090 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1091 {
1092 	cpuset_t mask;
1093 
1094 	sched_pin();
1095 	mask = pmap->pm_active;
1096 	CPU_CLR(PCPU_GET(hart), &mask);
1097 	fence();
1098 	if (!CPU_EMPTY(&mask) && smp_started)
1099 		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);
1100 
1101 	/*
1102 	 * Might consider a loop of sfence_vma_page() for a small
1103 	 * number of pages in the future.
1104 	 */
1105 	sfence_vma();
1106 	sched_unpin();
1107 }
1108 
1109 static void
pmap_invalidate_all(pmap_t pmap)1110 pmap_invalidate_all(pmap_t pmap)
1111 {
1112 	cpuset_t mask;
1113 
1114 	sched_pin();
1115 	mask = pmap->pm_active;
1116 	CPU_CLR(PCPU_GET(hart), &mask);
1117 
1118 	/*
1119 	 * XXX: The SBI doc doesn't detail how to specify x0 as the
1120 	 * address to perform a global fence.  BBL currently treats
1121 	 * all sfence_vma requests as global however.
1122 	 */
1123 	fence();
1124 	if (!CPU_EMPTY(&mask) && smp_started)
1125 		sbi_remote_sfence_vma(mask.__bits, 0, 0);
1126 	sfence_vma();
1127 	sched_unpin();
1128 }
1129 #else
1130 /*
1131  * Normal, non-SMP, invalidation functions.
1132  * We inline these within pmap.c for speed.
1133  */
1134 static __inline void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va)1135 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1136 {
1137 
1138 	sfence_vma_page(va);
1139 }
1140 
1141 static __inline void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)1142 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1143 {
1144 
1145 	/*
1146 	 * Might consider a loop of sfence_vma_page() for a small
1147 	 * number of pages in the future.
1148 	 */
1149 	sfence_vma();
1150 }
1151 
1152 static __inline void
pmap_invalidate_all(pmap_t pmap)1153 pmap_invalidate_all(pmap_t pmap)
1154 {
1155 
1156 	sfence_vma();
1157 }
1158 #endif
1159 
1160 /*
1161  *	Routine:	pmap_extract
1162  *	Function:
1163  *		Extract the physical page address associated
1164  *		with the given map/virtual_address pair.
1165  */
1166 vm_paddr_t
pmap_extract(pmap_t pmap,vm_offset_t va)1167 pmap_extract(pmap_t pmap, vm_offset_t va)
1168 {
1169 	pd_entry_t *l2p, l2;
1170 	pt_entry_t *l3p;
1171 	vm_paddr_t pa;
1172 
1173 	pa = 0;
1174 
1175 	/*
1176 	 * Start with an L2 lookup, L1 superpages are currently not implemented.
1177 	 */
1178 	PMAP_LOCK(pmap);
1179 	l2p = pmap_l2(pmap, va);
1180 	if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) {
1181 		if ((l2 & PTE_RWX) == 0) {
1182 			l3p = pmap_l2_to_l3(l2p, va);
1183 			pa = PTE_TO_PHYS(pmap_load(l3p));
1184 			pa |= (va & L3_OFFSET);
1185 		} else {
1186 			/* L2 is a superpage mapping. */
1187 			pa = L2PTE_TO_PHYS(l2);
1188 			pa |= (va & L2_OFFSET);
1189 		}
1190 	}
1191 	PMAP_UNLOCK(pmap);
1192 	return (pa);
1193 }
1194 
1195 /*
1196  *	Routine:	pmap_extract_and_hold
1197  *	Function:
1198  *		Atomically extract and hold the physical page
1199  *		with the given pmap and virtual address pair
1200  *		if that mapping permits the given protection.
1201  */
1202 vm_page_t
pmap_extract_and_hold(pmap_t pmap,vm_offset_t va,vm_prot_t prot)1203 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1204 {
1205 	pd_entry_t *l2p, l2;
1206 	pt_entry_t *l3p, l3;
1207 	vm_page_t m;
1208 
1209 	m = NULL;
1210 	PMAP_LOCK(pmap);
1211 	l2p = pmap_l2(pmap, va);
1212 	if (l2p == NULL || ((l2 = pmap_load(l2p)) & PTE_V) == 0) {
1213 		;
1214 	} else if ((l2 & PTE_RWX) != 0) {
1215 		if ((l2 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
1216 			m = PHYS_TO_VM_PAGE(L2PTE_TO_PHYS(l2) +
1217 			    (va & L2_OFFSET));
1218 		}
1219 	} else {
1220 		l3p = pmap_l2_to_l3(l2p, va);
1221 		if ((l3 = pmap_load(l3p)) != 0) {
1222 			if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0)
1223 				m = PTE_TO_VM_PAGE(l3);
1224 		}
1225 	}
1226 	if (m != NULL && !vm_page_wire_mapped(m))
1227 		m = NULL;
1228 	PMAP_UNLOCK(pmap);
1229 	return (m);
1230 }
1231 
1232 /*
1233  *	Routine:	pmap_kextract
1234  *	Function:
1235  *		Extract the physical page address associated with the given kernel
1236  *		virtual address.
1237  */
1238 vm_paddr_t
pmap_kextract(vm_offset_t va)1239 pmap_kextract(vm_offset_t va)
1240 {
1241 	pd_entry_t *l2, l2e;
1242 	pt_entry_t *l3;
1243 	vm_paddr_t pa;
1244 
1245 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1246 		pa = DMAP_TO_PHYS(va);
1247 	} else {
1248 		l2 = pmap_l2(kernel_pmap, va);
1249 		if (l2 == NULL)
1250 			panic("pmap_kextract: No l2");
1251 		l2e = pmap_load(l2);
1252 		/*
1253 		 * Beware of concurrent promotion and demotion! We must
1254 		 * use l2e rather than loading from l2 multiple times to
1255 		 * ensure we see a consistent state, including the
1256 		 * implicit load in pmap_l2_to_l3.  It is, however, safe
1257 		 * to use an old l2e because the L3 page is preserved by
1258 		 * promotion.
1259 		 */
1260 		if ((l2e & PTE_RX) != 0) {
1261 			/* superpages */
1262 			pa = L2PTE_TO_PHYS(l2e);
1263 			pa |= (va & L2_OFFSET);
1264 			return (pa);
1265 		}
1266 
1267 		l3 = pmap_l2_to_l3(&l2e, va);
1268 		pa = PTE_TO_PHYS(pmap_load(l3));
1269 		pa |= (va & PAGE_MASK);
1270 	}
1271 	return (pa);
1272 }
1273 
1274 /***************************************************
1275  * Low level mapping routines.....
1276  ***************************************************/
1277 
1278 void
pmap_kenter(vm_offset_t sva,vm_size_t size,vm_paddr_t pa,int mode)1279 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1280 {
1281 	pt_entry_t entry;
1282 	pt_entry_t *l3;
1283 	pt_entry_t memattr;
1284 	vm_offset_t va;
1285 	pn_t pn;
1286 
1287 	KASSERT((pa & L3_OFFSET) == 0,
1288 	   ("pmap_kenter_device: Invalid physical address"));
1289 	KASSERT((sva & L3_OFFSET) == 0,
1290 	   ("pmap_kenter_device: Invalid virtual address"));
1291 	KASSERT((size & PAGE_MASK) == 0,
1292 	    ("pmap_kenter_device: Mapping is not page-sized"));
1293 
1294 	memattr = pmap_memattr_bits(mode);
1295 	va = sva;
1296 	while (size != 0) {
1297 		l3 = pmap_l3(kernel_pmap, va);
1298 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1299 
1300 		pn = (pa / PAGE_SIZE);
1301 		entry = PTE_KERN;
1302 		entry |= memattr;
1303 		entry |= (pn << PTE_PPN0_S);
1304 		pmap_store(l3, entry);
1305 
1306 		va += PAGE_SIZE;
1307 		pa += PAGE_SIZE;
1308 		size -= PAGE_SIZE;
1309 	}
1310 	pmap_invalidate_range(kernel_pmap, sva, va);
1311 }
1312 
1313 void
pmap_kenter_device(vm_offset_t sva,vm_size_t size,vm_paddr_t pa)1314 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1315 {
1316 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
1317 }
1318 
1319 /*
1320  * Remove a page from the kernel pagetables.
1321  * Note: not SMP coherent.
1322  */
1323 void
pmap_kremove(vm_offset_t va)1324 pmap_kremove(vm_offset_t va)
1325 {
1326 	pt_entry_t *l3;
1327 
1328 	l3 = pmap_l3(kernel_pmap, va);
1329 	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
1330 
1331 	pmap_clear(l3);
1332 	sfence_vma();
1333 }
1334 
1335 void
pmap_kremove_device(vm_offset_t sva,vm_size_t size)1336 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1337 {
1338 	pt_entry_t *l3;
1339 	vm_offset_t va;
1340 
1341 	KASSERT((sva & L3_OFFSET) == 0,
1342 	   ("pmap_kremove_device: Invalid virtual address"));
1343 	KASSERT((size & PAGE_MASK) == 0,
1344 	    ("pmap_kremove_device: Mapping is not page-sized"));
1345 
1346 	va = sva;
1347 	while (size != 0) {
1348 		l3 = pmap_l3(kernel_pmap, va);
1349 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1350 		pmap_clear(l3);
1351 
1352 		va += PAGE_SIZE;
1353 		size -= PAGE_SIZE;
1354 	}
1355 
1356 	pmap_invalidate_range(kernel_pmap, sva, va);
1357 }
1358 
1359 /*
1360  *	Used to map a range of physical addresses into kernel
1361  *	virtual address space.
1362  *
1363  *	The value passed in '*virt' is a suggested virtual address for
1364  *	the mapping. Architectures which can support a direct-mapped
1365  *	physical to virtual region can return the appropriate address
1366  *	within that region, leaving '*virt' unchanged. Other
1367  *	architectures should map the pages starting at '*virt' and
1368  *	update '*virt' with the first usable address after the mapped
1369  *	region.
1370  */
1371 void *
pmap_map(vm_offset_t * virt,vm_paddr_t start,vm_paddr_t end,int prot)1372 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1373 {
1374 
1375 	return (PHYS_TO_DMAP(start));
1376 }
1377 
1378 /*
1379  * Add a list of wired pages to the kva
1380  * this routine is only used for temporary
1381  * kernel mappings that do not need to have
1382  * page modification or references recorded.
1383  * Note that old mappings are simply written
1384  * over.  The page *must* be wired.
1385  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1386  */
1387 void
pmap_qenter(void * sva,vm_page_t * ma,int count)1388 pmap_qenter(void *sva, vm_page_t *ma, int count)
1389 {
1390 	pt_entry_t *l3;
1391 	vm_paddr_t pa;
1392 	vm_offset_t va;
1393 	vm_page_t m;
1394 	pt_entry_t entry;
1395 	pn_t pn;
1396 	int i;
1397 
1398 	va = (vm_offset_t)sva;
1399 	for (i = 0; i < count; i++) {
1400 		m = ma[i];
1401 		pa = VM_PAGE_TO_PHYS(m);
1402 		pn = (pa / PAGE_SIZE);
1403 		l3 = pmap_l3(kernel_pmap, va);
1404 
1405 		entry = PTE_KERN;
1406 		entry |= pmap_memattr_bits(m->md.pv_memattr);
1407 		entry |= (pn << PTE_PPN0_S);
1408 		pmap_store(l3, entry);
1409 
1410 		va += L3_SIZE;
1411 	}
1412 	pmap_invalidate_range(kernel_pmap, (vm_offset_t)sva, va);
1413 }
1414 
1415 /*
1416  * This routine tears out page mappings from the
1417  * kernel -- it is meant only for temporary mappings.
1418  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1419  */
1420 void
pmap_qremove(void * sva,int count)1421 pmap_qremove(void *sva, int count)
1422 {
1423 	pt_entry_t *l3;
1424 	vm_offset_t va;
1425 
1426 	va = (vm_offset_t)sva;
1427 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %p", sva));
1428 
1429 	for (; count-- > 0; va += PAGE_SIZE) {
1430 		l3 = pmap_l3(kernel_pmap, va);
1431 		KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
1432 		pmap_clear(l3);
1433 	}
1434 	pmap_invalidate_range(kernel_pmap, (vm_offset_t)sva, va);
1435 }
1436 
1437 bool
pmap_ps_enabled(pmap_t pmap __unused)1438 pmap_ps_enabled(pmap_t pmap __unused)
1439 {
1440 
1441 	return (superpages_enabled);
1442 }
1443 
1444 /***************************************************
1445  * Page table page management routines.....
1446  ***************************************************/
1447 /*
1448  * Schedule the specified unused page table page to be freed.  Specifically,
1449  * add the page to the specified list of pages that will be released to the
1450  * physical memory manager after the TLB has been updated.
1451  */
1452 static __inline void
pmap_add_delayed_free_list(vm_page_t m,struct spglist * free,bool set_PG_ZERO)1453 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
1454 {
1455 
1456 	if (set_PG_ZERO)
1457 		m->flags |= PG_ZERO;
1458 	else
1459 		m->flags &= ~PG_ZERO;
1460 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1461 }
1462 
1463 /*
1464  * Inserts the specified page table page into the specified pmap's collection
1465  * of idle page table pages.  Each of a pmap's page table pages is responsible
1466  * for mapping a distinct range of virtual addresses.  The pmap's collection is
1467  * ordered by this virtual address range.
1468  *
1469  * If "promoted" is false, then the page table page "mpte" must be zero filled;
1470  * "mpte"'s valid field will be set to 0.
1471  *
1472  * If "promoted" is true and "all_l3e_PTE_A_set" is false, then "mpte" must
1473  * contain valid mappings with identical attributes except for PTE_A;
1474  * "mpte"'s valid field will be set to 1.
1475  *
1476  * If "promoted" and "all_l3e_PTE_A_set" are both true, then "mpte" must contain
1477  * valid mappings with identical attributes including PTE_A; "mpte"'s valid
1478  * field will be set to VM_PAGE_BITS_ALL.
1479  */
1480 static __inline int
pmap_insert_pt_page(pmap_t pmap,vm_page_t mpte,bool promoted,bool all_l3e_PTE_A_set)1481 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
1482     bool all_l3e_PTE_A_set)
1483 {
1484 
1485 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1486 	KASSERT(promoted || !all_l3e_PTE_A_set,
1487 	    ("a zero-filled PTP can't have PTE_A set in every PTE"));
1488 	mpte->valid = promoted ? (all_l3e_PTE_A_set ? VM_PAGE_BITS_ALL : 1) : 0;
1489 	return (vm_radix_insert(&pmap->pm_root, mpte));
1490 }
1491 
1492 /*
1493  * Removes the page table page mapping the specified virtual address from the
1494  * specified pmap's collection of idle page table pages, and returns it.
1495  * Otherwise, returns NULL if there is no page table page corresponding to the
1496  * specified virtual address.
1497  */
1498 static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap,vm_offset_t va)1499 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1500 {
1501 
1502 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1503 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
1504 }
1505 
1506 /*
1507  * Decrements a page table page's reference count, which is used to record the
1508  * number of valid page table entries within the page.  If the reference count
1509  * drops to zero, then the page table page is unmapped.  Returns true if the
1510  * page table page was unmapped and false otherwise.
1511  */
1512 static inline bool
pmap_unwire_ptp(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)1513 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1514 {
1515 	KASSERT(m->ref_count > 0,
1516 	    ("%s: page %p ref count underflow", __func__, m));
1517 
1518 	--m->ref_count;
1519 	if (m->ref_count == 0) {
1520 		_pmap_unwire_ptp(pmap, va, m, free);
1521 		return (true);
1522 	} else {
1523 		return (false);
1524 	}
1525 }
1526 
1527 static void
_pmap_unwire_ptp(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)1528 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1529 {
1530 
1531 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1532 	if (m->pindex >= NUL2E + NUL1E) {
1533 		pd_entry_t *l0;
1534 		l0 = pmap_l0(pmap, va);
1535 		pmap_clear(l0);
1536 	} else if (m->pindex >= NUL2E) {
1537 		pd_entry_t *l1;
1538 		l1 = pmap_l1(pmap, va);
1539 		pmap_clear(l1);
1540 		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
1541 	} else {
1542 		pd_entry_t *l2;
1543 		l2 = pmap_l2(pmap, va);
1544 		pmap_clear(l2);
1545 	}
1546 	pmap_resident_count_dec(pmap, 1);
1547 	if (m->pindex < NUL2E) {
1548 		pd_entry_t *l1;
1549 		vm_page_t pdpg;
1550 
1551 		l1 = pmap_l1(pmap, va);
1552 		pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1553 		pmap_unwire_ptp(pmap, va, pdpg, free);
1554 	} else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) {
1555 		pd_entry_t *l0;
1556 		vm_page_t pdpg;
1557 
1558 		l0 = pmap_l0(pmap, va);
1559 		pdpg = PTE_TO_VM_PAGE(pmap_load(l0));
1560 		pmap_unwire_ptp(pmap, va, pdpg, free);
1561 	}
1562 	pmap_invalidate_page(pmap, va);
1563 
1564 	vm_wire_sub(1);
1565 
1566 	/*
1567 	 * Put page on a list so that it is released after
1568 	 * *ALL* TLB shootdown is done
1569 	 */
1570 	pmap_add_delayed_free_list(m, free, true);
1571 }
1572 
1573 /*
1574  * After removing a page table entry, this routine is used to
1575  * conditionally free the page, and manage the reference count.
1576  */
1577 static int
pmap_unuse_pt(pmap_t pmap,vm_offset_t va,pd_entry_t ptepde,struct spglist * free)1578 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1579     struct spglist *free)
1580 {
1581 	vm_page_t mpte;
1582 
1583 	if (va >= VM_MAXUSER_ADDRESS)
1584 		return (0);
1585 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1586 	mpte = PTE_TO_VM_PAGE(ptepde);
1587 	return (pmap_unwire_ptp(pmap, va, mpte, free));
1588 }
1589 
1590 static uint64_t
pmap_satp_mode(void)1591 pmap_satp_mode(void)
1592 {
1593 	return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48);
1594 }
1595 
1596 void
pmap_pinit0(pmap_t pmap)1597 pmap_pinit0(pmap_t pmap)
1598 {
1599 	PMAP_LOCK_INIT(pmap);
1600 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1601 	pmap->pm_stage = PM_STAGE1;
1602 	pmap->pm_top = kernel_pmap->pm_top;
1603 	pmap->pm_satp = pmap_satp_mode() |
1604 	    (vtophys(pmap->pm_top) >> PAGE_SHIFT);
1605 	CPU_ZERO(&pmap->pm_active);
1606 	TAILQ_INIT(&pmap->pm_pvchunk);
1607 	vm_radix_init(&pmap->pm_root);
1608 	pmap_activate_boot(pmap);
1609 }
1610 
1611 int
pmap_pinit_stage(pmap_t pmap,enum pmap_stage stage)1612 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage)
1613 {
1614 	vm_paddr_t topphys;
1615 	vm_page_t m;
1616 	size_t i;
1617 
1618 	/*
1619 	 * Top directory is 4 pages in hypervisor case.
1620 	 * Current address space layout makes 3 of them unused.
1621 	 */
1622 	if (stage == PM_STAGE1)
1623 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
1624 		    VM_ALLOC_WAITOK);
1625 	else
1626 		m = vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
1627 		    4, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT);
1628 
1629 	topphys = VM_PAGE_TO_PHYS(m);
1630 	pmap->pm_top = PHYS_TO_DMAP(topphys);
1631 	pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT);
1632 	pmap->pm_stage = stage;
1633 
1634 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1635 
1636 	CPU_ZERO(&pmap->pm_active);
1637 
1638 	if (stage == PM_STAGE2)
1639 		goto finish;
1640 
1641 	if (pmap_mode == PMAP_MODE_SV39) {
1642 		/*
1643 		 * Copy L1 entries from the kernel pmap.  This must be done with
1644 		 * the allpmaps lock held to avoid races with
1645 		 * pmap_distribute_l1().
1646 		 */
1647 		mtx_lock(&allpmaps_lock);
1648 		LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1649 		for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS);
1650 		    i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++)
1651 			pmap->pm_top[i] = kernel_pmap->pm_top[i];
1652 		for (i = pmap_l1_index(DMAP_MIN_ADDRESS);
1653 		    i < pmap_l1_index(DMAP_MAX_ADDRESS); i++)
1654 			pmap->pm_top[i] = kernel_pmap->pm_top[i];
1655 		mtx_unlock(&allpmaps_lock);
1656 	} else {
1657 		i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS);
1658 		pmap->pm_top[i] = kernel_pmap->pm_top[i];
1659 	}
1660 
1661 finish:
1662 	TAILQ_INIT(&pmap->pm_pvchunk);
1663 	vm_radix_init(&pmap->pm_root);
1664 
1665 	return (1);
1666 }
1667 
1668 int
pmap_pinit(pmap_t pmap)1669 pmap_pinit(pmap_t pmap)
1670 {
1671 
1672 	return (pmap_pinit_stage(pmap, PM_STAGE1));
1673 }
1674 
1675 /*
1676  * This routine is called if the desired page table page does not exist.
1677  *
1678  * If page table page allocation fails, this routine may sleep before
1679  * returning NULL.  It sleeps only if a lock pointer was given.
1680  *
1681  * Note: If a page allocation fails at page table level two or three,
1682  * one or two pages may be held during the wait, only to be released
1683  * afterwards.  This conservative approach is easily argued to avoid
1684  * race conditions.
1685  */
1686 static vm_page_t
_pmap_alloc_l3(pmap_t pmap,vm_pindex_t ptepindex,struct rwlock ** lockp)1687 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1688 {
1689 	vm_page_t m, pdpg;
1690 	pt_entry_t entry;
1691 	vm_paddr_t phys;
1692 	pn_t pn;
1693 
1694 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1695 
1696 	/*
1697 	 * Allocate a page table page.
1698 	 */
1699 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1700 	if (m == NULL) {
1701 		if (lockp != NULL) {
1702 			RELEASE_PV_LIST_LOCK(lockp);
1703 			PMAP_UNLOCK(pmap);
1704 			rw_runlock(&pvh_global_lock);
1705 			vm_wait(NULL);
1706 			rw_rlock(&pvh_global_lock);
1707 			PMAP_LOCK(pmap);
1708 		}
1709 
1710 		/*
1711 		 * Indicate the need to retry.  While waiting, the page table
1712 		 * page may have been allocated.
1713 		 */
1714 		return (NULL);
1715 	}
1716 	m->pindex = ptepindex;
1717 
1718 	/*
1719 	 * Map the pagetable page into the process address space, if
1720 	 * it isn't already there.
1721 	 */
1722 	pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
1723 	if (ptepindex >= NUL2E + NUL1E) {
1724 		pd_entry_t *l0;
1725 		vm_pindex_t l0index;
1726 
1727 		KASSERT(pmap_mode != PMAP_MODE_SV39,
1728 		    ("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
1729 		KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
1730 		    ("%s: pindex %#lx out of range", __func__, ptepindex));
1731 
1732 		l0index = ptepindex - (NUL2E + NUL1E);
1733 		l0 = &pmap->pm_top[l0index];
1734 		KASSERT((pmap_load(l0) & PTE_V) == 0,
1735 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));
1736 
1737 		entry = PTE_V | (pn << PTE_PPN0_S);
1738 		pmap_store(l0, entry);
1739 	} else if (ptepindex >= NUL2E) {
1740 		pd_entry_t *l0, *l1;
1741 		vm_pindex_t l0index, l1index;
1742 
1743 		l1index = ptepindex - NUL2E;
1744 		if (pmap_mode == PMAP_MODE_SV39) {
1745 			l1 = &pmap->pm_top[l1index];
1746 		} else {
1747 			l0index = l1index >> Ln_ENTRIES_SHIFT;
1748 			l0 = &pmap->pm_top[l0index];
1749 			if (pmap_load(l0) == 0) {
1750 				/* Recurse to allocate the L1 page. */
1751 				if (_pmap_alloc_l3(pmap,
1752 				    NUL2E + NUL1E + l0index, lockp) == NULL)
1753 					goto fail;
1754 				phys = PTE_TO_PHYS(pmap_load(l0));
1755 			} else {
1756 				phys = PTE_TO_PHYS(pmap_load(l0));
1757 				pdpg = PHYS_TO_VM_PAGE(phys);
1758 				pdpg->ref_count++;
1759 			}
1760 			l1 = PHYS_TO_DMAP(phys);
1761 			l1 = &l1[ptepindex & Ln_ADDR_MASK];
1762 		}
1763 		KASSERT((pmap_load(l1) & PTE_V) == 0,
1764 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
1765 
1766 		entry = PTE_V | (pn << PTE_PPN0_S);
1767 		pmap_store(l1, entry);
1768 		pmap_distribute_l1(pmap, l1index, entry);
1769 	} else {
1770 		vm_pindex_t l0index, l1index;
1771 		pd_entry_t *l0, *l1, *l2;
1772 
1773 		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
1774 		if (pmap_mode == PMAP_MODE_SV39) {
1775 			l1 = &pmap->pm_top[l1index];
1776 			if (pmap_load(l1) == 0) {
1777 				/* recurse for allocating page dir */
1778 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1779 				    lockp) == NULL)
1780 					goto fail;
1781 			} else {
1782 				pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1783 				pdpg->ref_count++;
1784 			}
1785 		} else {
1786 			l0index = l1index >> Ln_ENTRIES_SHIFT;
1787 			l0 = &pmap->pm_top[l0index];
1788 			if (pmap_load(l0) == 0) {
1789 				/* Recurse to allocate the L1 entry. */
1790 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1791 				    lockp) == NULL)
1792 					goto fail;
1793 				phys = PTE_TO_PHYS(pmap_load(l0));
1794 				l1 = PHYS_TO_DMAP(phys);
1795 				l1 = &l1[l1index & Ln_ADDR_MASK];
1796 			} else {
1797 				phys = PTE_TO_PHYS(pmap_load(l0));
1798 				l1 = PHYS_TO_DMAP(phys);
1799 				l1 = &l1[l1index & Ln_ADDR_MASK];
1800 				if (pmap_load(l1) == 0) {
1801 					/* Recurse to allocate the L2 page. */
1802 					if (_pmap_alloc_l3(pmap,
1803 					    NUL2E + l1index, lockp) == NULL)
1804 						goto fail;
1805 				} else {
1806 					pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1807 					pdpg->ref_count++;
1808 				}
1809 			}
1810 		}
1811 
1812 		phys = PTE_TO_PHYS(pmap_load(l1));
1813 		l2 = PHYS_TO_DMAP(phys);
1814 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
1815 		KASSERT((pmap_load(l2) & PTE_V) == 0,
1816 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
1817 
1818 		entry = PTE_V | (pn << PTE_PPN0_S);
1819 		pmap_store(l2, entry);
1820 	}
1821 
1822 	pmap_resident_count_inc(pmap, 1);
1823 
1824 	return (m);
1825 
1826 fail:
1827 	vm_page_unwire_noq(m);
1828 	vm_page_free_zero(m);
1829 	return (NULL);
1830 }
1831 
1832 static vm_page_t
pmap_alloc_l2(pmap_t pmap,vm_offset_t va,struct rwlock ** lockp)1833 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1834 {
1835 	pd_entry_t *l1;
1836 	vm_page_t l2pg;
1837 	vm_pindex_t pindex;
1838 
1839 retry:
1840 	l1 = pmap_l1(pmap, va);
1841 	if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
1842 		KASSERT((pmap_load(l1) & PTE_RWX) == 0,
1843 		    ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
1844 		    pmap_load(l1), va));
1845 		/* Add a reference to the L2 page. */
1846 		l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
1847 		l2pg->ref_count++;
1848 	} else {
1849 		/* Allocate a L2 page. */
1850 		pindex = pmap_l1_pindex(va);
1851 		l2pg = _pmap_alloc_l3(pmap, pindex, lockp);
1852 		if (l2pg == NULL && lockp != NULL)
1853 			goto retry;
1854 	}
1855 	return (l2pg);
1856 }
1857 
1858 static vm_page_t
pmap_alloc_l3(pmap_t pmap,vm_offset_t va,struct rwlock ** lockp)1859 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1860 {
1861 	vm_pindex_t ptepindex;
1862 	pd_entry_t *l2;
1863 	vm_page_t m;
1864 
1865 	/*
1866 	 * Calculate pagetable page index
1867 	 */
1868 	ptepindex = pmap_l2_pindex(va);
1869 retry:
1870 	/*
1871 	 * Get the page directory entry
1872 	 */
1873 	l2 = pmap_l2(pmap, va);
1874 
1875 	/*
1876 	 * If the page table page is mapped, we just increment the
1877 	 * hold count, and activate it.
1878 	 */
1879 	if (l2 != NULL && pmap_load(l2) != 0) {
1880 		m = PTE_TO_VM_PAGE(pmap_load(l2));
1881 		m->ref_count++;
1882 	} else {
1883 		/*
1884 		 * Here if the pte page isn't mapped, or if it has been
1885 		 * deallocated.
1886 		 */
1887 		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1888 		if (m == NULL && lockp != NULL)
1889 			goto retry;
1890 	}
1891 	return (m);
1892 }
1893 
1894 /***************************************************
1895  * Pmap allocation/deallocation routines.
1896  ***************************************************/
1897 
1898 /*
1899  * Release any resources held by the given physical map.
1900  * Called when a pmap initialized by pmap_pinit is being released.
1901  * Should only be called if the map contains no valid mappings.
1902  */
1903 void
pmap_release(pmap_t pmap)1904 pmap_release(pmap_t pmap)
1905 {
1906 	vm_page_t m;
1907 	int npages;
1908 	int i;
1909 
1910 	KASSERT(pmap->pm_stats.resident_count == 0,
1911 	    ("pmap_release: pmap resident count %ld != 0",
1912 	    pmap->pm_stats.resident_count));
1913 	KASSERT(CPU_EMPTY(&pmap->pm_active),
1914 	    ("releasing active pmap %p", pmap));
1915 
1916 	if (pmap->pm_stage == PM_STAGE2)
1917 		goto finish;
1918 
1919 	if (pmap_mode == PMAP_MODE_SV39) {
1920 		mtx_lock(&allpmaps_lock);
1921 		LIST_REMOVE(pmap, pm_list);
1922 		mtx_unlock(&allpmaps_lock);
1923 	}
1924 
1925 finish:
1926 	npages = pmap->pm_stage == PM_STAGE2 ? 4 : 1;
1927 	m = DMAP_TO_VM_PAGE(pmap->pm_top);
1928 	for (i = 0; i < npages; i++) {
1929 		vm_page_unwire_noq(m);
1930 		vm_page_free(m);
1931 		m++;
1932 	}
1933 }
1934 
1935 static int
kvm_size(SYSCTL_HANDLER_ARGS)1936 kvm_size(SYSCTL_HANDLER_ARGS)
1937 {
1938 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1939 
1940 	return sysctl_handle_long(oidp, &ksize, 0, req);
1941 }
1942 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1943     0, 0, kvm_size, "LU",
1944     "Size of KVM");
1945 
1946 static int
kvm_free(SYSCTL_HANDLER_ARGS)1947 kvm_free(SYSCTL_HANDLER_ARGS)
1948 {
1949 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1950 
1951 	return sysctl_handle_long(oidp, &kfree, 0, req);
1952 }
1953 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1954     0, 0, kvm_free, "LU",
1955     "Amount of KVM free");
1956 
1957 /*
1958  * grow the number of kernel page table entries, if needed
1959  */
1960 static int
pmap_growkernel_nopanic(vm_offset_t addr)1961 pmap_growkernel_nopanic(vm_offset_t addr)
1962 {
1963 	vm_paddr_t paddr;
1964 	vm_page_t nkpg;
1965 	pd_entry_t *l1, *l2;
1966 	pt_entry_t entry;
1967 	pn_t pn;
1968 
1969 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1970 
1971 	addr = roundup2(addr, L2_SIZE);
1972 	if (addr - 1 >= vm_map_max(kernel_map))
1973 		addr = vm_map_max(kernel_map);
1974 	while (kernel_vm_end < addr) {
1975 		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
1976 		if (pmap_load(l1) == 0) {
1977 			/* We need a new PDP entry */
1978 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
1979 			    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1980 			if (nkpg == NULL)
1981 				return (KERN_RESOURCE_SHORTAGE);
1982 
1983 			nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
1984 			paddr = VM_PAGE_TO_PHYS(nkpg);
1985 
1986 			pn = (paddr / PAGE_SIZE);
1987 			entry = (PTE_V);
1988 			entry |= (pn << PTE_PPN0_S);
1989 			pmap_store(l1, entry);
1990 			pmap_distribute_l1(kernel_pmap,
1991 			    pmap_l1_index(kernel_vm_end), entry);
1992 			continue; /* try again */
1993 		}
1994 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1995 		if ((pmap_load(l2) & PTE_V) != 0 &&
1996 		    (pmap_load(l2) & PTE_RWX) == 0) {
1997 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1998 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1999 				kernel_vm_end = vm_map_max(kernel_map);
2000 				break;
2001 			}
2002 			continue;
2003 		}
2004 
2005 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2006 		    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2007 		if (nkpg == NULL)
2008 			return (KERN_RESOURCE_SHORTAGE);
2009 		nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
2010 		paddr = VM_PAGE_TO_PHYS(nkpg);
2011 
2012 		pn = (paddr / PAGE_SIZE);
2013 		entry = (PTE_V);
2014 		entry |= (pn << PTE_PPN0_S);
2015 		pmap_store(l2, entry);
2016 
2017 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
2018 
2019 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2020 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2021 			kernel_vm_end = vm_map_max(kernel_map);
2022 			break;
2023 		}
2024 	}
2025 
2026 	return (KERN_SUCCESS);
2027 }
2028 
2029 int
pmap_growkernel(vm_offset_t addr)2030 pmap_growkernel(vm_offset_t addr)
2031 {
2032 	int rv;
2033 
2034 	rv = pmap_growkernel_nopanic(addr);
2035 	if (rv != KERN_SUCCESS && pmap_growkernel_panic)
2036 		panic("pmap_growkernel: no memory to grow kernel");
2037 	return (rv);
2038 }
2039 
2040 /***************************************************
2041  * page management routines.
2042  ***************************************************/
2043 
2044 static const uint64_t pc_freemask[_NPCM] = {
2045 	[0 ... _NPCM - 2] = PC_FREEN,
2046 	[_NPCM - 1] = PC_FREEL
2047 };
2048 
#ifdef PV_STATS
/* PV chunk statistics, exported read-only under vm.pmap. */
static int pc_chunk_count;
static int pc_chunk_allocs;
static int pc_chunk_frees;
static int pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD,
    &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD,
    &pc_chunk_allocs, 0,
    "Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD,
    &pc_chunk_frees, 0,
    "Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD,
    &pc_chunk_tryfail, 0,
    "Number of times tried to get a chunk page but failed.");

/* PV entry statistics, exported read-only under vm.pmap. */
static long pv_entry_frees;
static long pv_entry_allocs;
static long pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD,
    &pv_entry_frees, 0,
    "Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD,
    &pv_entry_allocs, 0,
    "Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD,
    &pv_entry_count, 0,
    "Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD,
    &pv_entry_spare, 0,
    "Current number of spare pv entries");
#endif
2073 
2074 /*
2075  * We are in a serious low memory condition.  Resort to
2076  * drastic measures to free some pages so we can allocate
2077  * another pv entry chunk.
2078  *
2079  * Returns NULL if PV entries were reclaimed from the specified pmap.
2080  *
2081  * We do not, however, unmap 2mpages because subsequent accesses will
2082  * allocate per-page pv entries until repromotion occurs, thereby
2083  * exacerbating the shortage of free pv entries.
2084  */
2085 static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap,struct rwlock ** lockp)2086 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2087 {
2088 
2089 	panic("RISCVTODO: reclaim_pv_chunk");
2090 }
2091 
2092 /*
2093  * free the pv_entry back to the free list
2094  */
2095 static void
free_pv_entry(pmap_t pmap,pv_entry_t pv)2096 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2097 {
2098 	struct pv_chunk *pc;
2099 	int idx, field, bit;
2100 
2101 	rw_assert(&pvh_global_lock, RA_LOCKED);
2102 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2103 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2104 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2105 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2106 	pc = pv_to_chunk(pv);
2107 	idx = pv - &pc->pc_pventry[0];
2108 	field = idx / 64;
2109 	bit = idx % 64;
2110 	pc->pc_map[field] |= 1ul << bit;
2111 	if (!pc_is_free(pc)) {
2112 		/* 98% of the time, pc is already at the head of the list. */
2113 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2114 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2115 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2116 		}
2117 		return;
2118 	}
2119 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2120 	free_pv_chunk(pc);
2121 }
2122 
2123 static void
free_pv_chunk(struct pv_chunk * pc)2124 free_pv_chunk(struct pv_chunk *pc)
2125 {
2126 	vm_page_t m;
2127 
2128 	mtx_lock(&pv_chunks_mutex);
2129  	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2130 	mtx_unlock(&pv_chunks_mutex);
2131 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2132 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2133 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2134 	/* entire chunk is free, return it */
2135 	m = DMAP_TO_VM_PAGE(pc);
2136 	dump_drop_page(m->phys_addr);
2137 	vm_page_unwire_noq(m);
2138 	vm_page_free(m);
2139 }
2140 
2141 /*
2142  * Returns a new PV entry, allocating a new PV chunk from the system when
2143  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2144  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2145  * returned.
2146  *
2147  * The given PV list lock may be released.
2148  */
2149 static pv_entry_t
get_pv_entry(pmap_t pmap,struct rwlock ** lockp)2150 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2151 {
2152 	int bit, field;
2153 	pv_entry_t pv;
2154 	struct pv_chunk *pc;
2155 	vm_page_t m;
2156 
2157 	rw_assert(&pvh_global_lock, RA_LOCKED);
2158 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2159 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2160 retry:
2161 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2162 	if (pc != NULL) {
2163 		for (field = 0; field < _NPCM; field++) {
2164 			if (pc->pc_map[field]) {
2165 				bit = ffsl(pc->pc_map[field]) - 1;
2166 				break;
2167 			}
2168 		}
2169 		if (field < _NPCM) {
2170 			pv = &pc->pc_pventry[field * 64 + bit];
2171 			pc->pc_map[field] &= ~(1ul << bit);
2172 			/* If this was the last item, move it to tail */
2173 			if (pc_is_full(pc)) {
2174 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2175 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2176 				    pc_list);
2177 			}
2178 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2179 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2180 			return (pv);
2181 		}
2182 	}
2183 	/* No free items, allocate another chunk */
2184 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2185 	if (m == NULL) {
2186 		if (lockp == NULL) {
2187 			PV_STAT(pc_chunk_tryfail++);
2188 			return (NULL);
2189 		}
2190 		m = reclaim_pv_chunk(pmap, lockp);
2191 		if (m == NULL)
2192 			goto retry;
2193 	}
2194 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2195 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2196 	dump_add_page(m->phys_addr);
2197 	pc = VM_PAGE_TO_DMAP(m);
2198 	pc->pc_pmap = pmap;
2199 	pc->pc_map[0] = PC_FREEN & ~1ul;	/* preallocated bit 0 */
2200 	pc->pc_map[1] = PC_FREEN;
2201 	pc->pc_map[2] = PC_FREEL;
2202 	mtx_lock(&pv_chunks_mutex);
2203 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2204 	mtx_unlock(&pv_chunks_mutex);
2205 	pv = &pc->pc_pventry[0];
2206 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2207 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2208 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2209 	return (pv);
2210 }
2211 
2212 /*
2213  * Ensure that the number of spare PV entries in the specified pmap meets or
2214  * exceeds the given count, "needed".
2215  *
2216  * The given PV list lock may be released.
2217  */
2218 static void
reserve_pv_entries(pmap_t pmap,int needed,struct rwlock ** lockp)2219 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2220 {
2221 	struct pch new_tail;
2222 	struct pv_chunk *pc;
2223 	vm_page_t m;
2224 	int avail, free;
2225 	bool reclaimed;
2226 
2227 	rw_assert(&pvh_global_lock, RA_LOCKED);
2228 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2229 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2230 
2231 	/*
2232 	 * Newly allocated PV chunks must be stored in a private list until
2233 	 * the required number of PV chunks have been allocated.  Otherwise,
2234 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
2235 	 * contrast, these chunks must be added to the pmap upon allocation.
2236 	 */
2237 	TAILQ_INIT(&new_tail);
2238 retry:
2239 	avail = 0;
2240 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2241 		bit_count((bitstr_t *)pc->pc_map, 0,
2242 		    sizeof(pc->pc_map) * NBBY, &free);
2243 		if (free == 0)
2244 			break;
2245 		avail += free;
2246 		if (avail >= needed)
2247 			break;
2248 	}
2249 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
2250 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2251 		if (m == NULL) {
2252 			m = reclaim_pv_chunk(pmap, lockp);
2253 			if (m == NULL)
2254 				goto retry;
2255 			reclaimed = true;
2256 		}
2257 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2258 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2259 		dump_add_page(m->phys_addr);
2260 		pc = VM_PAGE_TO_DMAP(m);
2261 		pc->pc_pmap = pmap;
2262 		pc->pc_map[0] = PC_FREEN;
2263 		pc->pc_map[1] = PC_FREEN;
2264 		pc->pc_map[2] = PC_FREEL;
2265 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2266 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2267 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2268 
2269 		/*
2270 		 * The reclaim might have freed a chunk from the current pmap.
2271 		 * If that chunk contained available entries, we need to
2272 		 * re-count the number of available entries.
2273 		 */
2274 		if (reclaimed)
2275 			goto retry;
2276 	}
2277 	if (!TAILQ_EMPTY(&new_tail)) {
2278 		mtx_lock(&pv_chunks_mutex);
2279 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2280 		mtx_unlock(&pv_chunks_mutex);
2281 	}
2282 }
2283 
2284 /*
2285  * First find and then remove the pv entry for the specified pmap and virtual
2286  * address from the specified pv list.  Returns the pv entry if found and NULL
2287  * otherwise.  This operation can be performed on pv lists for either 4KB or
2288  * 2MB page mappings.
2289  */
2290 static __inline pv_entry_t
pmap_pvh_remove(struct md_page * pvh,pmap_t pmap,vm_offset_t va)2291 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2292 {
2293 	pv_entry_t pv;
2294 
2295 	rw_assert(&pvh_global_lock, RA_LOCKED);
2296 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2297 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2298 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2299 			pvh->pv_gen++;
2300 			break;
2301 		}
2302 	}
2303 	return (pv);
2304 }
2305 
2306 /*
2307  * First find and then destroy the pv entry for the specified pmap and virtual
2308  * address.  This operation can be performed on pv lists for either 4KB or 2MB
2309  * page mappings.
2310  */
2311 static void
pmap_pvh_free(struct md_page * pvh,pmap_t pmap,vm_offset_t va)2312 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2313 {
2314 	pv_entry_t pv;
2315 
2316 	pv = pmap_pvh_remove(pvh, pmap, va);
2317 
2318 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
2319 	free_pv_entry(pmap, pv);
2320 }
2321 
2322 /*
2323  * Conditionally create the PV entry for a 4KB page mapping if the required
2324  * memory can be allocated without resorting to reclamation.
2325  */
2326 static bool
pmap_try_insert_pv_entry(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)2327 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2328     struct rwlock **lockp)
2329 {
2330 	pv_entry_t pv;
2331 
2332 	rw_assert(&pvh_global_lock, RA_LOCKED);
2333 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2334 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2335 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2336 		pv->pv_va = va;
2337 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2338 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2339 		m->md.pv_gen++;
2340 		return (true);
2341 	} else
2342 		return (false);
2343 }
2344 
2345 /*
2346  * After demotion from a 2MB page mapping to 512 4KB page mappings,
2347  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2348  * entries for each of the 4KB page mappings.
2349  */
2350 static void __unused
pmap_pv_demote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)2351 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2352     struct rwlock **lockp)
2353 {
2354 	struct md_page *pvh;
2355 	struct pv_chunk *pc;
2356 	pv_entry_t pv;
2357 	vm_page_t m;
2358 	vm_offset_t va_last;
2359 	int bit, field;
2360 
2361 	rw_assert(&pvh_global_lock, RA_LOCKED);
2362 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2363 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2364 
2365 	/*
2366 	 * Transfer the 2mpage's pv entry for this mapping to the first
2367 	 * page's pv list.  Once this transfer begins, the pv list lock
2368 	 * must not be released until the last pv entry is reinstantiated.
2369 	 */
2370 	pvh = pa_to_pvh(pa);
2371 	va &= ~L2_OFFSET;
2372 	pv = pmap_pvh_remove(pvh, pmap, va);
2373 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2374 	m = PHYS_TO_VM_PAGE(pa);
2375 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2376 	m->md.pv_gen++;
2377 	/* Instantiate the remaining 511 pv entries. */
2378 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2379 	va_last = va + L2_SIZE - PAGE_SIZE;
2380 	for (;;) {
2381 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2382 		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
2383 		for (field = 0; field < _NPCM; field++) {
2384 			while (pc->pc_map[field] != 0) {
2385 				bit = ffsl(pc->pc_map[field]) - 1;
2386 				pc->pc_map[field] &= ~(1ul << bit);
2387 				pv = &pc->pc_pventry[field * 64 + bit];
2388 				va += PAGE_SIZE;
2389 				pv->pv_va = va;
2390 				m++;
2391 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2392 			    ("pmap_pv_demote_l2: page %p is not managed", m));
2393 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2394 				m->md.pv_gen++;
2395 				if (va == va_last)
2396 					goto out;
2397 			}
2398 		}
2399 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2400 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2401 	}
2402 out:
2403 	if (pc_is_full(pc)) {
2404 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2405 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2406 	}
2407 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2408 	PV_STAT(atomic_add_int(&pv_entry_spare, -(Ln_ENTRIES - 1)));
2409 }
2410 
#if VM_NRESERVLEVEL > 0
/*
 * After promotion of 4KB page mappings to a single 2MB page mapping,
 * move the first page's pv entry onto the 2MB page's pv list and free
 * the pv entries of the remaining 4KB pages.  "pa" must be 2MB aligned.
 */
static void
pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_page_t page;
	vm_offset_t last_va;

	rw_assert(&pvh_global_lock, RA_LOCKED);
	KASSERT((pa & L2_OFFSET) == 0,
	    ("pmap_pv_promote_l2: misaligned pa %#lx", pa));

	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/* Transfer the first page's pv entry to the superpage's list. */
	page = PHYS_TO_VM_PAGE(pa);
	va &= ~L2_OFFSET;
	pv = pmap_pvh_remove(&page->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	pvh->pv_gen++;

	/* Free the pv entries of every following page in the 2MB run. */
	last_va = va + L2_SIZE - PAGE_SIZE;
	do {
		page++;
		va += PAGE_SIZE;
		pmap_pvh_free(&page->md, pmap, va);
	} while (va < last_va);
}
#endif /* VM_NRESERVLEVEL > 0 */
2443 
2444 /*
2445  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
2446  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
2447  * false if the PV entry cannot be allocated without resorting to reclamation.
2448  */
2449 static bool
pmap_pv_insert_l2(pmap_t pmap,vm_offset_t va,pd_entry_t l2e,u_int flags,struct rwlock ** lockp)2450 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2451     struct rwlock **lockp)
2452 {
2453 	struct md_page *pvh;
2454 	pv_entry_t pv;
2455 	vm_paddr_t pa;
2456 
2457 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2458 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2459 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2460 	    NULL : lockp)) == NULL)
2461 		return (false);
2462 	pv->pv_va = va;
2463 	pa = PTE_TO_PHYS(l2e);
2464 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2465 	pvh = pa_to_pvh(pa);
2466 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2467 	pvh->pv_gen++;
2468 	return (true);
2469 }
2470 
2471 static void
pmap_remove_kernel_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)2472 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
2473 {
2474 	pt_entry_t newl2, oldl2 __diagused;
2475 	vm_page_t ml3;
2476 	vm_paddr_t ml3pa;
2477 
2478 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
2479 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
2480 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2481 
2482 	ml3 = pmap_remove_pt_page(pmap, va);
2483 	if (ml3 == NULL)
2484 		panic("pmap_remove_kernel_l2: Missing pt page");
2485 
2486 	ml3pa = VM_PAGE_TO_PHYS(ml3);
2487 	newl2 = ml3pa | PTE_V;
2488 
2489 	/*
2490 	 * If this page table page was unmapped by a promotion, then it
2491 	 * contains valid mappings.  Zero it to invalidate those mappings.
2492 	 */
2493 	if (vm_page_any_valid(ml3))
2494 		pagezero(PHYS_TO_DMAP(ml3pa));
2495 
2496 	/*
2497 	 * Demote the mapping.
2498 	 */
2499 	oldl2 = pmap_load_store(l2, newl2);
2500 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2501 	    __func__, l2, oldl2));
2502 }
2503 
2504 /*
2505  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
2506  */
2507 static int
pmap_remove_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pd_entry_t l1e,struct spglist * free,struct rwlock ** lockp)2508 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2509     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2510 {
2511 	struct md_page *pvh;
2512 	pt_entry_t oldl2;
2513 	vm_offset_t eva, va;
2514 	vm_page_t m, ml3;
2515 
2516 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2517 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2518 	oldl2 = pmap_load_clear(l2);
2519 	KASSERT((oldl2 & PTE_RWX) != 0,
2520 	    ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
2521 
2522 	/*
2523 	 * The sfence.vma documentation states that it is sufficient to specify
2524 	 * a single address within a superpage mapping.  However, since we do
2525 	 * not perform any invalidation upon promotion, TLBs may still be
2526 	 * caching 4KB mappings within the superpage, so we must invalidate the
2527 	 * entire range.
2528 	 */
2529 	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
2530 	if ((oldl2 & PTE_SW_WIRED) != 0)
2531 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2532 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2533 	if ((oldl2 & PTE_SW_MANAGED) != 0) {
2534 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
2535 		pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
2536 		pmap_pvh_free(pvh, pmap, sva);
2537 		eva = sva + L2_SIZE;
2538 		for (va = sva, m = PTE_TO_VM_PAGE(oldl2);
2539 		    va < eva; va += PAGE_SIZE, m++) {
2540 			if ((oldl2 & PTE_D) != 0)
2541 				vm_page_dirty(m);
2542 			if ((oldl2 & PTE_A) != 0)
2543 				vm_page_aflag_set(m, PGA_REFERENCED);
2544 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2545 			    TAILQ_EMPTY(&pvh->pv_list))
2546 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2547 		}
2548 	}
2549 	if (pmap == kernel_pmap) {
2550 		pmap_remove_kernel_l2(pmap, l2, sva);
2551 	} else {
2552 		ml3 = pmap_remove_pt_page(pmap, sva);
2553 		if (ml3 != NULL) {
2554 			KASSERT(vm_page_any_valid(ml3),
2555 			    ("pmap_remove_l2: l3 page not promoted"));
2556 			pmap_resident_count_dec(pmap, 1);
2557 			KASSERT(ml3->ref_count == Ln_ENTRIES,
2558 			    ("pmap_remove_l2: l3 page ref count error"));
2559 			ml3->ref_count = 1;
2560 			vm_page_unwire_noq(ml3);
2561 			pmap_add_delayed_free_list(ml3, free, false);
2562 		}
2563 	}
2564 	return (pmap_unuse_pt(pmap, sva, l1e, free));
2565 }
2566 
2567 /*
2568  * pmap_remove_l3: do the things to unmap a page in a process
2569  */
2570 static int
pmap_remove_l3(pmap_t pmap,pt_entry_t * l3,vm_offset_t va,pd_entry_t l2e,struct spglist * free,struct rwlock ** lockp)2571 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2572     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2573 {
2574 	struct md_page *pvh;
2575 	pt_entry_t old_l3;
2576 	vm_page_t m;
2577 
2578 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2579 	old_l3 = pmap_load_clear(l3);
2580 	pmap_invalidate_page(pmap, va);
2581 	if (old_l3 & PTE_SW_WIRED)
2582 		pmap->pm_stats.wired_count -= 1;
2583 	pmap_resident_count_dec(pmap, 1);
2584 	if (old_l3 & PTE_SW_MANAGED) {
2585 		m = PTE_TO_VM_PAGE(old_l3);
2586 		if ((old_l3 & PTE_D) != 0)
2587 			vm_page_dirty(m);
2588 		if (old_l3 & PTE_A)
2589 			vm_page_aflag_set(m, PGA_REFERENCED);
2590 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2591 		pmap_pvh_free(&m->md, pmap, va);
2592 		if (TAILQ_EMPTY(&m->md.pv_list) &&
2593 		    (m->flags & PG_FICTITIOUS) == 0) {
2594 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2595 			if (TAILQ_EMPTY(&pvh->pv_list))
2596 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2597 		}
2598 	}
2599 
2600 	return (pmap_unuse_pt(pmap, va, l2e, free));
2601 }
2602 
2603 /*
2604  *	Remove the given range of addresses from the specified map.
2605  *
2606  *	It is assumed that the start and end are properly
2607  *	rounded to the page size.
2608  */
2609 void
pmap_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)2610 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2611 {
2612 	struct spglist free;
2613 	struct rwlock *lock;
2614 	vm_offset_t va, va_next;
2615 	pd_entry_t *l0, *l1, *l2, l2e;
2616 	pt_entry_t *l3;
2617 
2618 	/*
2619 	 * Perform an unsynchronized read.  This is, however, safe.
2620 	 */
2621 	if (pmap->pm_stats.resident_count == 0)
2622 		return;
2623 
2624 	SLIST_INIT(&free);
2625 
2626 	rw_rlock(&pvh_global_lock);
2627 	PMAP_LOCK(pmap);
2628 
2629 	lock = NULL;
2630 	for (; sva < eva; sva = va_next) {
2631 		if (pmap->pm_stats.resident_count == 0)
2632 			break;
2633 
2634 		if (pmap_mode == PMAP_MODE_SV48) {
2635 			l0 = pmap_l0(pmap, sva);
2636 			if (pmap_load(l0) == 0) {
2637 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2638 				if (va_next < sva)
2639 					va_next = eva;
2640 				continue;
2641 			}
2642 			l1 = pmap_l0_to_l1(l0, sva);
2643 		} else {
2644 			l1 = pmap_l1(pmap, sva);
2645 		}
2646 
2647 		if (pmap_load(l1) == 0) {
2648 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2649 			if (va_next < sva)
2650 				va_next = eva;
2651 			continue;
2652 		}
2653 
2654 		/*
2655 		 * Calculate index for next page table.
2656 		 */
2657 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2658 		if (va_next < sva)
2659 			va_next = eva;
2660 
2661 		l2 = pmap_l1_to_l2(l1, sva);
2662 		if ((l2e = pmap_load(l2)) == 0)
2663 			continue;
2664 		if ((l2e & PTE_RWX) != 0) {
2665 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2666 				(void)pmap_remove_l2(pmap, l2, sva,
2667 				    pmap_load(l1), &free, &lock);
2668 				continue;
2669 			} else if (!pmap_demote_l2_locked(pmap, l2, sva,
2670 			    &lock)) {
2671 				/*
2672 				 * The large page mapping was destroyed.
2673 				 */
2674 				continue;
2675 			}
2676 			l2e = pmap_load(l2);
2677 		}
2678 
2679 		/*
2680 		 * Limit our scan to either the end of the va represented
2681 		 * by the current page table page, or to the end of the
2682 		 * range being removed.
2683 		 */
2684 		if (va_next > eva)
2685 			va_next = eva;
2686 
2687 		va = va_next;
2688 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2689 		    sva += L3_SIZE) {
2690 			if (pmap_load(l3) == 0) {
2691 				if (va != va_next) {
2692 					pmap_invalidate_range(pmap, va, sva);
2693 					va = va_next;
2694 				}
2695 				continue;
2696 			}
2697 			if (va == va_next)
2698 				va = sva;
2699 			if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
2700 				sva += L3_SIZE;
2701 				break;
2702 			}
2703 		}
2704 		if (va != va_next)
2705 			pmap_invalidate_range(pmap, va, sva);
2706 	}
2707 	if (lock != NULL)
2708 		rw_wunlock(lock);
2709 	rw_runlock(&pvh_global_lock);
2710 	PMAP_UNLOCK(pmap);
2711 	vm_page_free_pages_toq(&free, false);
2712 }
2713 
2714 /*
2715  *	Routine:	pmap_remove_all
2716  *	Function:
2717  *		Removes this physical page from
2718  *		all physical maps in which it resides.
2719  *		Reflects back modify bits to the pager.
2720  *
2721  *	Notes:
2722  *		Original versions of this routine were very
2723  *		inefficient because they iteratively called
2724  *		pmap_remove (slow...)
2725  */
2726 
2727 void
pmap_remove_all(vm_page_t m)2728 pmap_remove_all(vm_page_t m)
2729 {
2730 	struct spglist free;
2731 	struct md_page *pvh;
2732 	pmap_t pmap;
2733 	pt_entry_t *l3, l3e;
2734 	pd_entry_t *l2, l2e __diagused;
2735 	pv_entry_t pv;
2736 	vm_offset_t va;
2737 
2738 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2739 	    ("pmap_remove_all: page %p is not managed", m));
2740 	SLIST_INIT(&free);
2741 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2742 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
2743 
2744 	rw_wlock(&pvh_global_lock);
2745 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2746 		pmap = PV_PMAP(pv);
2747 		PMAP_LOCK(pmap);
2748 		va = pv->pv_va;
2749 		l2 = pmap_l2(pmap, va);
2750 		(void)pmap_demote_l2(pmap, l2, va);
2751 		PMAP_UNLOCK(pmap);
2752 	}
2753 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2754 		pmap = PV_PMAP(pv);
2755 		PMAP_LOCK(pmap);
2756 		pmap_resident_count_dec(pmap, 1);
2757 		l2 = pmap_l2(pmap, pv->pv_va);
2758 		KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
2759 		l2e = pmap_load(l2);
2760 
2761 		KASSERT((l2e & PTE_RX) == 0,
2762 		    ("pmap_remove_all: found a superpage in %p's pv list", m));
2763 
2764 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
2765 		l3e = pmap_load_clear(l3);
2766 		pmap_invalidate_page(pmap, pv->pv_va);
2767 		if (l3e & PTE_SW_WIRED)
2768 			pmap->pm_stats.wired_count--;
2769 		if ((l3e & PTE_A) != 0)
2770 			vm_page_aflag_set(m, PGA_REFERENCED);
2771 
2772 		/*
2773 		 * Update the vm_page_t clean and reference bits.
2774 		 */
2775 		if ((l3e & PTE_D) != 0)
2776 			vm_page_dirty(m);
2777 		pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
2778 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2779 		m->md.pv_gen++;
2780 		free_pv_entry(pmap, pv);
2781 		PMAP_UNLOCK(pmap);
2782 	}
2783 	vm_page_aflag_clear(m, PGA_WRITEABLE);
2784 	rw_wunlock(&pvh_global_lock);
2785 	vm_page_free_pages_toq(&free, false);
2786 }
2787 
2788 /*
2789  *	Set the physical protection on the
2790  *	specified range of this map as requested.
2791  */
2792 void
pmap_protect(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_prot_t prot)2793 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2794 {
2795 	pd_entry_t *l0, *l1, *l2, l2e;
2796 	pt_entry_t *l3, l3e, mask;
2797 	vm_page_t m, mt;
2798 	vm_offset_t va_next;
2799 	bool anychanged, pv_lists_locked;
2800 
2801 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2802 		pmap_remove(pmap, sva, eva);
2803 		return;
2804 	}
2805 
2806 	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2807 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
2808 		return;
2809 
2810 	anychanged = false;
2811 	pv_lists_locked = false;
2812 	mask = 0;
2813 	if ((prot & VM_PROT_WRITE) == 0)
2814 		mask |= PTE_W | PTE_D;
2815 	if ((prot & VM_PROT_EXECUTE) == 0)
2816 		mask |= PTE_X;
2817 resume:
2818 	PMAP_LOCK(pmap);
2819 	for (; sva < eva; sva = va_next) {
2820 		if (pmap_mode == PMAP_MODE_SV48) {
2821 			l0 = pmap_l0(pmap, sva);
2822 			if (pmap_load(l0) == 0) {
2823 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2824 				if (va_next < sva)
2825 					va_next = eva;
2826 				continue;
2827 			}
2828 			l1 = pmap_l0_to_l1(l0, sva);
2829 		} else {
2830 			l1 = pmap_l1(pmap, sva);
2831 		}
2832 
2833 		if (pmap_load(l1) == 0) {
2834 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2835 			if (va_next < sva)
2836 				va_next = eva;
2837 			continue;
2838 		}
2839 
2840 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2841 		if (va_next < sva)
2842 			va_next = eva;
2843 
2844 		l2 = pmap_l1_to_l2(l1, sva);
2845 		if ((l2e = pmap_load(l2)) == 0)
2846 			continue;
2847 		if ((l2e & PTE_RWX) != 0) {
2848 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2849 retryl2:
2850 				if ((prot & VM_PROT_WRITE) == 0 &&
2851 				    (l2e & (PTE_SW_MANAGED | PTE_D)) ==
2852 				    (PTE_SW_MANAGED | PTE_D)) {
2853 					m = PTE_TO_VM_PAGE(l2e);
2854 					for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
2855 						vm_page_dirty(mt);
2856 				}
2857 				if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
2858 					goto retryl2;
2859 				anychanged = true;
2860 				continue;
2861 			} else {
2862 				if (!pv_lists_locked) {
2863 					pv_lists_locked = true;
2864 					if (!rw_try_rlock(&pvh_global_lock)) {
2865 						if (anychanged)
2866 							pmap_invalidate_all(
2867 							    pmap);
2868 						PMAP_UNLOCK(pmap);
2869 						rw_rlock(&pvh_global_lock);
2870 						goto resume;
2871 					}
2872 				}
2873 				if (!pmap_demote_l2(pmap, l2, sva)) {
2874 					/*
2875 					 * The large page mapping was destroyed.
2876 					 */
2877 					continue;
2878 				}
2879 			}
2880 		}
2881 
2882 		if (va_next > eva)
2883 			va_next = eva;
2884 
2885 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2886 		    sva += L3_SIZE) {
2887 			l3e = pmap_load(l3);
2888 retryl3:
2889 			if ((l3e & PTE_V) == 0)
2890 				continue;
2891 			if ((prot & VM_PROT_WRITE) == 0 &&
2892 			    (l3e & (PTE_SW_MANAGED | PTE_D)) ==
2893 			    (PTE_SW_MANAGED | PTE_D)) {
2894 				m = PTE_TO_VM_PAGE(l3e);
2895 				vm_page_dirty(m);
2896 			}
2897 			if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
2898 				goto retryl3;
2899 			anychanged = true;
2900 		}
2901 	}
2902 	if (anychanged)
2903 		pmap_invalidate_all(pmap);
2904 	if (pv_lists_locked)
2905 		rw_runlock(&pvh_global_lock);
2906 	PMAP_UNLOCK(pmap);
2907 }
2908 
2909 int
pmap_fault(pmap_t pmap,vm_offset_t va,vm_prot_t ftype)2910 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
2911 {
2912 	pd_entry_t *l2, l2e;
2913 	pt_entry_t bits, *pte, oldpte;
2914 	int rv;
2915 
2916 	KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va));
2917 
2918 	rv = 0;
2919 	PMAP_LOCK(pmap);
2920 	l2 = pmap_l2(pmap, va);
2921 	if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
2922 		goto done;
2923 	if ((l2e & PTE_RWX) == 0) {
2924 		pte = pmap_l2_to_l3(l2, va);
2925 		if (((oldpte = pmap_load(pte)) & PTE_V) == 0)
2926 			goto done;
2927 	} else {
2928 		pte = l2;
2929 		oldpte = l2e;
2930 	}
2931 
2932 	if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) ||
2933 	    (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) ||
2934 	    (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) ||
2935 	    (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0))
2936 		goto done;
2937 
2938 	bits = PTE_A;
2939 	if (ftype == VM_PROT_WRITE)
2940 		bits |= PTE_D;
2941 
2942 	/*
2943 	 * Spurious faults can occur if the implementation caches invalid
2944 	 * entries in the TLB, or if simultaneous accesses on multiple CPUs
2945 	 * race with each other.
2946 	 */
2947 	if ((oldpte & bits) != bits)
2948 		pmap_store_bits(pte, bits);
2949 	sfence_vma();
2950 	rv = 1;
2951 done:
2952 	PMAP_UNLOCK(pmap);
2953 	return (rv);
2954 }
2955 
2956 /*
2957  *	Demote the specified L1 page to separate L2 pages.
2958  *	Currently only used for DMAP entries.
2959  */
2960 static bool
pmap_demote_l1(pmap_t pmap,pd_entry_t * l1,vm_offset_t va)2961 pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va)
2962 {
2963 	vm_page_t m;
2964 	pt_entry_t *l2, oldl1, newl2;
2965 	pd_entry_t newl1;
2966 	vm_paddr_t l2phys;
2967 
2968 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2969 
2970 	oldl1 = pmap_load(l1);
2971 	KASSERT((oldl1 & PTE_RWX) != 0,
2972 	    ("pmap_demote_l1: oldl1 is not a leaf PTE"));
2973 	KASSERT((oldl1 & PTE_A) != 0,
2974 	    ("pmap_demote_l1: oldl1 is missing PTE_A"));
2975 	KASSERT((oldl1 & (PTE_D | PTE_W)) != PTE_W,
2976 	    ("pmap_demote_l1: not dirty!"));
2977 	KASSERT((oldl1 & PTE_SW_MANAGED) == 0,
2978 	    ("pmap_demote_l1: L1 table shouldn't be managed"));
2979 	KASSERT(VIRT_IN_DMAP(va),
2980 	    ("pmap_demote_l1: is unsupported for non-DMAP va=%#lx", va));
2981 
2982 	/* Demoting L1 means we need to allocate a new page-table page. */
2983 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
2984 	if (m == NULL) {
2985 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx in pmap %p",
2986 		    va, pmap);
2987 		return (false);
2988 	}
2989 
2990 	l2phys = VM_PAGE_TO_PHYS(m);
2991 	l2 = PHYS_TO_DMAP(l2phys);
2992 
2993 	/*
2994 	 * Create new entries, relying on the fact that only the low bits
2995 	 * (index) of the physical address are changing.
2996 	 */
2997 	newl2 = oldl1;
2998 	for (int i = 0; i < Ln_ENTRIES; i++)
2999 		pmap_store(&l2[i], newl2 | (i << PTE_PPN1_S));
3000 
3001 	/*
3002 	 * And update the L1 entry.
3003 	 *
3004 	 * NB: flushing the TLB is the responsibility of the caller. Cached
3005 	 * translations are still "correct" for demoted mappings until some
3006 	 * subset of the demoted range is modified.
3007 	 */
3008 	newl1 = ((l2phys / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
3009 	pmap_store(l1, newl1);
3010 
3011 	counter_u64_add(pmap_l1_demotions, 1);
3012 	CTR2(KTR_PMAP, "pmap_demote_l1: success for va %#lx in pmap %p",
3013 	    va, pmap);
3014 	return (true);
3015 }
3016 
3017 static bool
pmap_demote_l2(pmap_t pmap,pd_entry_t * l2,vm_offset_t va)3018 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va)
3019 {
3020 	struct rwlock *lock;
3021 	bool rv;
3022 
3023 	lock = NULL;
3024 	rv = pmap_demote_l2_locked(pmap, l2, va, &lock);
3025 	if (lock != NULL)
3026 		rw_wunlock(lock);
3027 	return (rv);
3028 }
3029 
3030 /*
3031  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3032  * mapping is invalidated.
 *
 * The caller must hold the pmap lock (asserted below).  "l2" points at the
 * L2 leaf entry that maps "va".  "lockp" is handed through to
 * pmap_remove_l2(), reserve_pv_entries() and pmap_pv_demote_l2().
 *
 * Returns true if the L2 entry was replaced by a pointer to a page table
 * page of 4KB mappings, and false if the 2MB mapping was removed instead.
3033  */
3034 static bool
pmap_demote_l2_locked(pmap_t pmap,pd_entry_t * l2,vm_offset_t va,struct rwlock ** lockp)3035 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
3036     struct rwlock **lockp)
3037 {
3038 	struct spglist free;
3039 	vm_page_t mpte;
3040 	pd_entry_t newl2, oldl2;
3041 	pt_entry_t *firstl3, newl3;
3042 	vm_paddr_t mptepa;
3043 	int i;
3044 
3045 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3046 
3047 	oldl2 = pmap_load(l2);
3048 	KASSERT((oldl2 & PTE_RWX) != 0,
3049 	    ("pmap_demote_l2_locked: oldl2 is not a leaf entry"));
	/*
	 * Look up the page table page recorded for this 2MB range.  The
	 * lookup is skipped when PTE_A is clear; in that case, or when no
	 * page is found, the block below decides between allocating a new
	 * page and giving up.
	 */
3050 	if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
3051 	    NULL) {
3052 		KASSERT((oldl2 & PTE_SW_WIRED) == 0,
3053 		    ("pmap_demote_l2_locked: page table page for a wired mapping is missing"));
		/*
		 * A mapping without PTE_A is never demoted, and neither is
		 * one for which a new page table page cannot be allocated;
		 * both cases remove the 2MB mapping and fail.  Because "||"
		 * short-circuits, "mpte" is left unassigned only on the path
		 * that returns from inside this block.
		 */
3054 		if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj(
3055 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
3056 		    VM_ALLOC_WIRED)) == NULL) {
3057 			SLIST_INIT(&free);
3058 			(void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET,
3059 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
3060 			vm_page_free_pages_toq(&free, true);
3061 			CTR2(KTR_PMAP, "pmap_demote_l2_locked: "
3062 			    "failure for va %#lx in pmap %p", va, pmap);
3063 			return (false);
3064 		}
		/*
		 * Set up the newly allocated page table page.  For user
		 * addresses it is counted as holding Ln_ENTRIES references
		 * and is added to the pmap's resident count.
		 */
3065 		mpte->pindex = pmap_l2_pindex(va);
3066 		if (va < VM_MAXUSER_ADDRESS) {
3067 			mpte->ref_count = Ln_ENTRIES;
3068 			pmap_resident_count_inc(pmap, 1);
3069 		}
3070 	}
	/*
	 * The replacement L2 entry carries only PTE_V plus the page table
	 * page's PPN, i.e. it is a pointer rather than a leaf.  "newl3"
	 * starts out as a copy of the old leaf entry.
	 */
3071 	mptepa = VM_PAGE_TO_PHYS(mpte);
3072 	firstl3 = PHYS_TO_DMAP(mptepa);
3073 	newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
3074 	KASSERT((oldl2 & PTE_A) != 0,
3075 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_A"));
3076 	KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W,
3077 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_D"));
3078 	newl3 = oldl2;
3079 
3080 	/*
3081 	 * If the page table page is not leftover from an earlier promotion,
3082 	 * initialize it.
3083 	 */
3084 	if (!vm_page_all_valid(mpte)) {
		/* Entry i adds i to the PPN field, i.e. the i-th 4KB page. */
3085 		for (i = 0; i < Ln_ENTRIES; i++)
3086 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
3087 	}
3088 	KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3),
3089 	    ("pmap_demote_l2_locked: firstl3 and newl3 map different physical "
3090 	    "addresses"));
3091 
3092 	/*
3093 	 * If the mapping has changed attributes, update the PTEs.
3094 	 */
3095 	if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE))
3096 		for (i = 0; i < Ln_ENTRIES; i++)
3097 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
3098 
3099 	/*
3100 	 * The spare PV entries must be reserved prior to demoting the
3101 	 * mapping, that is, prior to changing the L2 entry.  Otherwise, the
3102 	 * state of the L2 entry and the PV lists will be inconsistent, which
3103 	 * can result in reclaim_pv_chunk() attempting to remove a PV entry from
3104 	 * the wrong PV list and pmap_pv_demote_l2() failing to find the
3105 	 * expected PV entry for the 2MB page mapping that is being demoted.
3106 	 */
3107 	if ((oldl2 & PTE_SW_MANAGED) != 0)
3108 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
3109 
3110 	/*
3111 	 * Demote the mapping.
3112 	 */
3113 	pmap_store(l2, newl2);
3114 
3115 	/*
3116 	 * Demote the PV entry.
3117 	 */
3118 	if ((oldl2 & PTE_SW_MANAGED) != 0)
3119 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
3120 
3121 	atomic_add_long(&pmap_l2_demotions, 1);
3122 	CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p",
3123 	    va, pmap);
3124 	return (true);
3125 }
3126 
3127 #if VM_NRESERVLEVEL > 0
/*
 * Tries to replace the page table page referenced by the L2 entry "l2"
 * with a single 2MB leaf mapping.  Returns true if the L2 entry was
 * rewritten as a leaf.  Returns false if promotion was not possible; the
 * L2 entry is then left unchanged, although individual L3 entries may
 * already have had PTE_W cleared by the downgrade loops below.
 *
 * Promotion is refused when superpages are disabled for the pmap, when
 * the first L3 entry's physical address is not 2MB aligned, when any other
 * L3 entry maps an unexpected physical page or differs from the first in
 * its PTE_PROMOTE bits, or when pmap_insert_pt_page() fails.
 *
 * "ml3" is the page table page; if it is NULL the page is derived from
 * the L2 entry.  The caller must hold the pmap lock (asserted below).
 * "lockp" is passed on to pmap_pv_promote_l2().
 */
3128 static bool
pmap_promote_l2(pmap_t pmap,pd_entry_t * l2,vm_offset_t va,vm_page_t ml3,struct rwlock ** lockp)3129 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3,
3130     struct rwlock **lockp)
3131 {
3132 	pt_entry_t all_l3e_PTE_A, *firstl3, firstl3e, *l3, l3e;
3133 	vm_paddr_t pa;
3134 
3135 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3136 	if (!pmap_ps_enabled(pmap))
3137 		return (false);
3138 
3139 	KASSERT((pmap_load(l2) & PTE_RWX) == 0,
3140 	    ("pmap_promote_l2: invalid l2 entry %p", l2));
3141 
3142 	/*
3143 	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
3144 	 * ineligible for promotion or does not map the first 4KB physical page
3145 	 * within a 2MB page.
3146 	 */
3147 	firstl3 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
3148 	firstl3e = pmap_load(firstl3);
3149 	pa = PTE_TO_PHYS(firstl3e);
3150 	if ((pa & L2_OFFSET) != 0) {
3151 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3152 		    va, pmap);
3153 		atomic_add_long(&pmap_l2_p_failures, 1);
3154 		return (false);
3155 	}
3156 
3157 	/*
3158 	 * Downgrade a clean, writable mapping to read-only to ensure that the
3159 	 * hardware does not set PTE_D while we are comparing PTEs.
3160 	 *
3161 	 * Upon a write access to a clean mapping, the implementation will
3162 	 * either atomically check protections and set PTE_D, or raise a page
3163 	 * fault.  In the latter case, the pmap lock provides atomicity.  Thus,
3164 	 * we do not issue an sfence.vma here and instead rely on pmap_fault()
3165 	 * to do so lazily.
3166 	 */
	/*
	 * The loop condition is re-evaluated against the value reloaded by a
	 * failed compare-and-set, so the loop also ends if the entry stops
	 * being "writable but clean".
	 */
3167 	while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) {
3168 		if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) {
3169 			firstl3e &= ~PTE_W;
3170 			break;
3171 		}
3172 	}
3173 
3174 	/*
3175 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3176 	 * PTE maps an unexpected 4KB physical page or does not have identical
3177 	 * characteristics to the first PTE.
3178 	 */
	/*
	 * The table is walked from its last entry down to the second one,
	 * with "pa" tracking the physical address each entry must map.
	 * "all_l3e_PTE_A" starts as the first entry's PTE_A bit alone.
	 */
3179 	all_l3e_PTE_A = firstl3e & PTE_A;
3180 	pa += L2_SIZE - PAGE_SIZE;
3181 	for (l3 = firstl3 + Ln_ENTRIES - 1; l3 > firstl3; l3--) {
3182 		l3e = pmap_load(l3);
3183 		if (PTE_TO_PHYS(l3e) != pa) {
3184 			CTR2(KTR_PMAP,
3185 			    "pmap_promote_l2: failure for va %#lx pmap %p",
3186 			    va, pmap);
3187 			atomic_add_long(&pmap_l2_p_failures, 1);
3188 			return (false);
3189 		}
		/* Same clean-and-writable downgrade as for the first entry. */
3190 		while ((l3e & (PTE_W | PTE_D)) == PTE_W) {
3191 			if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) {
3192 				l3e &= ~PTE_W;
3193 				break;
3194 			}
3195 		}
3196 		if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) {
3197 			CTR2(KTR_PMAP,
3198 			    "pmap_promote_l2: failure for va %#lx pmap %p",
3199 			    va, pmap);
3200 			atomic_add_long(&pmap_l2_p_failures, 1);
3201 			return (false);
3202 		}
		/*
		 * Only PTE_A can be set in all_l3e_PTE_A, so this clears it
		 * as soon as one entry without PTE_A is seen.
		 */
3203 		all_l3e_PTE_A &= l3e;
3204 		pa -= PAGE_SIZE;
3205 	}
3206 
3207 	/*
3208 	 * Unless all PTEs have PTE_A set, clear it from the superpage
3209 	 * mapping, so that promotions triggered by speculative mappings,
3210 	 * such as pmap_enter_quick(), don't automatically mark the
3211 	 * underlying pages as referenced.
3212 	 */
3213 	firstl3e &= ~PTE_A | all_l3e_PTE_A;
3214 
3215 	/*
3216 	 * Save the page table page in its current state until the L2
3217 	 * mapping the superpage is demoted by pmap_demote_l2() or
3218 	 * destroyed by pmap_remove_l3().
3219 	 */
3220 	if (ml3 == NULL)
3221 		ml3 = PTE_TO_VM_PAGE(pmap_load(l2));
3222 	KASSERT(ml3->pindex == pmap_l2_pindex(va),
3223 	    ("pmap_promote_l2: page table page's pindex is wrong"));
	/* A non-zero return from pmap_insert_pt_page() aborts the promotion. */
3224 	if (pmap_insert_pt_page(pmap, ml3, true, all_l3e_PTE_A != 0)) {
3225 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3226 		    va, pmap);
3227 		atomic_add_long(&pmap_l2_p_failures, 1);
3228 		return (false);
3229 	}
3230 
	/* Managed mappings also need their PV entries promoted. */
3231 	if ((firstl3e & PTE_SW_MANAGED) != 0)
3232 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp);
3233 
	/* Install the (possibly adjusted) first L3 entry as the 2MB leaf. */
3234 	pmap_store(l2, firstl3e);
3235 
3236 	atomic_add_long(&pmap_l2_promotions, 1);
3237 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
3238 	    pmap);
3239 	return (true);
3240 }
3241 #endif
3242 
3243 /*
3244  *	Insert the given physical page (p) at
3245  *	the specified virtual address (v) in the
3246  *	target physical map with the protection requested.
3247  *
3248  *	If specified, the page will be wired down, meaning
3249  *	that the related pte can not be reclaimed.
3250  *
3251  *	NB:  This is the only routine which MAY NOT lazy-evaluate
3252  *	or lose information.  That is, this routine must actually
3253  *	insert this page into the given map NOW.
 *
 *	"va" is truncated to a page boundary.  "prot" selects the PTE_X
 *	and PTE_W permission bits; "flags" supplies PMAP_ENTER_WIRED and
 *	PMAP_ENTER_NOSLEEP, and its VM_PROT_WRITE bit pre-sets PTE_D.
 *	When "psind" is 1 the request is handed to pmap_enter_l2() and its
 *	result is returned.
 *
 *	Otherwise returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when
 *	PMAP_ENTER_NOSLEEP was given and pmap_alloc_l3() returned NULL.
3254  */
3255 int
pmap_enter(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,u_int flags,int8_t psind)3256 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3257     u_int flags, int8_t psind)
3258 {
3259 	struct rwlock *lock;
3260 	pd_entry_t *l2, l2e;
3261 	pt_entry_t new_l3, orig_l3;
3262 	pt_entry_t *l3;
3263 	pv_entry_t pv;
3264 	vm_paddr_t opa, pa;
3265 	vm_page_t mpte, om;
3266 	pn_t pn;
3267 	int rv;
3268 	bool nosleep;
3269 
3270 	va = trunc_page(va);
3271 	if ((m->oflags & VPO_UNMANAGED) == 0)
3272 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
3273 	pa = VM_PAGE_TO_PHYS(m);
3274 	pn = (pa / PAGE_SIZE);
3275 
	/*
	 * Build the new leaf entry.  Every mapping is valid, readable and
	 * marked accessed; the remaining bits depend on prot, flags and va.
	 * NOTE(review): the VM_PROT_WRITE bit in "flags" presumably reflects
	 * the access type that triggered this call -- confirm with callers.
	 */
3276 	new_l3 = PTE_V | PTE_R | PTE_A;
3277 	if (prot & VM_PROT_EXECUTE)
3278 		new_l3 |= PTE_X;
3279 	if (flags & VM_PROT_WRITE)
3280 		new_l3 |= PTE_D;
3281 	if (prot & VM_PROT_WRITE)
3282 		new_l3 |= PTE_W;
3283 	if (va < VM_MAX_USER_ADDRESS)
3284 		new_l3 |= PTE_U;
3285 
3286 	new_l3 |= (pn << PTE_PPN0_S);
3287 	if ((flags & PMAP_ENTER_WIRED) != 0)
3288 		new_l3 |= PTE_SW_WIRED;
3289 	new_l3 |= pmap_memattr_bits(m->md.pv_memattr);
3290 
3291 	/*
3292 	 * Set modified bit gratuitously for writeable mappings if
3293 	 * the page is unmanaged. We do not want to take a fault
3294 	 * to do the dirty bit accounting for these mappings.
3295 	 */
3296 	if ((m->oflags & VPO_UNMANAGED) != 0) {
3297 		if (prot & VM_PROT_WRITE)
3298 			new_l3 |= PTE_D;
3299 	} else
3300 		new_l3 |= PTE_SW_MANAGED;
3301 
3302 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
3303 
3304 	lock = NULL;
3305 	mpte = NULL;
3306 	rw_rlock(&pvh_global_lock);
3307 	PMAP_LOCK(pmap);
	/* A superpage request is delegated entirely to pmap_enter_l2(). */
3308 	if (psind == 1) {
3309 		/* Assert the required virtual and physical alignment. */
3310 		KASSERT((va & L2_OFFSET) == 0,
3311 		    ("pmap_enter: va %#lx unaligned", va));
3312 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
3313 		rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock);
3314 		goto out;
3315 	}
3316 
	/*
	 * Locate the L3 entry for va.  A valid L2 entry is used directly if
	 * it points to an L3 table, or after a successful demotion if it is
	 * a 2MB leaf.  Otherwise a user address gets an L3 page from
	 * pmap_alloc_l3(), and a kernel address panics.
	 */
3317 	l2 = pmap_l2(pmap, va);
3318 	if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
3319 	    ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2,
3320 	    va, &lock))) {
3321 		l3 = pmap_l2_to_l3(l2, va);
3322 		if (va < VM_MAXUSER_ADDRESS) {
			/*
			 * Take a reference on the page table page for the
			 * mapping about to be created; it is dropped again
			 * below if va turns out to be mapped already.
			 */
3323 			mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3324 			mpte->ref_count++;
3325 		}
3326 	} else if (va < VM_MAXUSER_ADDRESS) {
3327 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
3328 		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
3329 		if (mpte == NULL && nosleep) {
			/*
			 * This early return does not pass through "out", so
			 * the locks are released here by hand.
			 */
3330 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
3331 			if (lock != NULL)
3332 				rw_wunlock(lock);
3333 			rw_runlock(&pvh_global_lock);
3334 			PMAP_UNLOCK(pmap);
3335 			return (KERN_RESOURCE_SHORTAGE);
3336 		}
3337 		l3 = pmap_l3(pmap, va);
3338 	} else {
3339 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
3340 	}
3341 
3342 	orig_l3 = pmap_load(l3);
3343 	opa = PTE_TO_PHYS(orig_l3);
3344 	pv = NULL;
3345 
3346 	/*
3347 	 * Is the specified virtual address already mapped?
3348 	 */
3349 	if ((orig_l3 & PTE_V) != 0) {
3350 		/*
3351 		 * Wiring change, just update stats. We don't worry about
3352 		 * wiring PT pages as they remain resident as long as there
3353 		 * are valid mappings in them. Hence, if a user page is wired,
3354 		 * the PT page will be also.
3355 		 */
3356 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
3357 		    (orig_l3 & PTE_SW_WIRED) == 0)
3358 			pmap->pm_stats.wired_count++;
3359 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
3360 		    (orig_l3 & PTE_SW_WIRED) != 0)
3361 			pmap->pm_stats.wired_count--;
3362 
3363 		/*
3364 		 * Remove the extra PT page reference.
3365 		 */
3366 		if (mpte != NULL) {
3367 			mpte->ref_count--;
3368 			KASSERT(mpte->ref_count > 0,
3369 			    ("pmap_enter: missing reference to page table page,"
3370 			     " va: 0x%lx", va));
3371 		}
3372 
3373 		/*
3374 		 * Has the physical page changed?
3375 		 */
3376 		if (opa == pa) {
3377 			/*
3378 			 * No, might be a protection or wiring change.
3379 			 */
3380 			if ((orig_l3 & PTE_SW_MANAGED) != 0 &&
3381 			    (new_l3 & PTE_W) != 0)
3382 				vm_page_aflag_set(m, PGA_WRITEABLE);
			/* orig_l3 stays non-zero: "validate" does an update. */
3383 			goto validate;
3384 		}
3385 
3386 		/*
3387 		 * The physical page has changed.  Temporarily invalidate
3388 		 * the mapping.  This ensures that all threads sharing the
3389 		 * pmap keep a consistent view of the mapping, which is
3390 		 * necessary for the correct handling of COW faults.  It
3391 		 * also permits reuse of the old mapping's PV entry,
3392 		 * avoiding an allocation.
3393 		 *
3394 		 * For consistency, handle unmanaged mappings the same way.
3395 		 */
3396 		orig_l3 = pmap_load_clear(l3);
3397 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
3398 		    ("pmap_enter: unexpected pa update for %#lx", va));
3399 		if ((orig_l3 & PTE_SW_MANAGED) != 0) {
3400 			om = PHYS_TO_VM_PAGE(opa);
3401 
3402 			/*
3403 			 * The pmap lock is sufficient to synchronize with
3404 			 * concurrent calls to pmap_page_test_mappings() and
3405 			 * pmap_ts_referenced().
3406 			 */
3407 			if ((orig_l3 & PTE_D) != 0)
3408 				vm_page_dirty(om);
3409 			if ((orig_l3 & PTE_A) != 0)
3410 				vm_page_aflag_set(om, PGA_REFERENCED);
3411 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3412 			pv = pmap_pvh_remove(&om->md, pmap, va);
3413 			KASSERT(pv != NULL,
3414 			    ("pmap_enter: no PV entry for %#lx", va));
			/*
			 * The old PV entry is kept for reuse only when the
			 * new mapping is managed as well.
			 * NOTE(review): "pv" is not reset to NULL after being
			 * freed; it is not used again because the PV insertion
			 * below is guarded by the same PTE_SW_MANAGED test.
			 */
3415 			if ((new_l3 & PTE_SW_MANAGED) == 0)
3416 				free_pv_entry(pmap, pv);
			/*
			 * Clear PGA_WRITEABLE on the old page once both its
			 * own PV list and (unless the page is fictitious) the
			 * list returned by pa_to_pvh() are empty.
			 */
3417 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3418 			    TAILQ_EMPTY(&om->md.pv_list) &&
3419 			    ((om->flags & PG_FICTITIOUS) != 0 ||
3420 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3421 				vm_page_aflag_clear(om, PGA_WRITEABLE);
3422 		}
3423 		pmap_invalidate_page(pmap, va);
		/* From here on the slot is treated as a fresh mapping. */
3424 		orig_l3 = 0;
3425 	} else {
3426 		/*
3427 		 * Increment the counters.
3428 		 */
3429 		if ((new_l3 & PTE_SW_WIRED) != 0)
3430 			pmap->pm_stats.wired_count++;
3431 		pmap_resident_count_inc(pmap, 1);
3432 	}
3433 	/*
3434 	 * Enter on the PV list if part of our managed memory.
3435 	 */
3436 	if ((new_l3 & PTE_SW_MANAGED) != 0) {
3437 		if (pv == NULL) {
3438 			pv = get_pv_entry(pmap, &lock);
3439 			pv->pv_va = va;
3440 		}
3441 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3442 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3443 		m->md.pv_gen++;
3444 		if ((new_l3 & PTE_W) != 0)
3445 			vm_page_aflag_set(m, PGA_WRITEABLE);
3446 	}
3447 
3448 validate:
3449 	/*
3450 	 * Sync the i-cache on all harts before updating the PTE
3451 	 * if the new PTE is executable.
3452 	 */
3453 	if (prot & VM_PROT_EXECUTE)
3454 		pmap_sync_icache(pmap, va, PAGE_SIZE);
3455 
3456 	/*
3457 	 * Update the L3 entry.
3458 	 */
	/*
	 * orig_l3 is non-zero only when an existing mapping of the same
	 * physical page is being changed in place; that case needs a TLB
	 * invalidation and must carry over a pending dirty bit.
	 */
3459 	if (orig_l3 != 0) {
3460 		orig_l3 = pmap_load_store(l3, new_l3);
3461 		pmap_invalidate_page(pmap, va);
3462 		KASSERT(PTE_TO_PHYS(orig_l3) == pa,
3463 		    ("pmap_enter: invalid update"));
3464 		if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) ==
3465 		    (PTE_D | PTE_SW_MANAGED))
3466 			vm_page_dirty(m);
3467 	} else {
3468 		pmap_store(l3, new_l3);
3469 	}
3470 
3471 #if VM_NRESERVLEVEL > 0
	/*
	 * Attempt a 2MB promotion once the page table page holds
	 * Ln_ENTRIES references, the page is not fictitious, and
	 * vm_reserv_level_iffullpop() returns 0 for it.
	 */
3472 	if (mpte != NULL && mpte->ref_count == Ln_ENTRIES &&
3473 	    (m->flags & PG_FICTITIOUS) == 0 &&
3474 	    vm_reserv_level_iffullpop(m) == 0)
3475 		(void)pmap_promote_l2(pmap, l2, va, mpte, &lock);
3476 #endif
3477 
3478 	rv = KERN_SUCCESS;
3479 out:
3480 	if (lock != NULL)
3481 		rw_wunlock(lock);
3482 	rw_runlock(&pvh_global_lock);
3483 	PMAP_UNLOCK(pmap);
3484 	return (rv);
3485 }
3486 
3487 /*
3488  * Release a page table page reference after a failed attempt to create a
3489  * mapping.
3490  */
3491 static void
pmap_abort_ptp(pmap_t pmap,vm_offset_t va,vm_page_t l2pg)3492 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t l2pg)
3493 {
3494 	struct spglist free;
3495 
3496 	SLIST_INIT(&free);
3497 	if (pmap_unwire_ptp(pmap, va, l2pg, &free)) {
3498 		/*
3499 		 * Although "va" is not mapped, paging-structure
3500 		 * caches could nonetheless have entries that
3501 		 * refer to the freed page table pages.
3502 		 * Invalidate those entries.
3503 		 */
3504 		pmap_invalidate_page(pmap, va);
3505 		vm_page_free_pages_toq(&free, true);
3506 	}
3507 }
3508 
3509 /*
3510  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns
3511  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
3512  * value.  See pmap_enter_l2() for the possible error values when "no sleep",
3513  * "no replace", and "no reclaim" are specified.
3514  */
3515 static int
pmap_enter_2mpage(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,struct rwlock ** lockp)3516 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3517     struct rwlock **lockp)
3518 {
3519 	pd_entry_t new_l2;
3520 	pn_t pn;
3521 
3522 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3523 
3524 	pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
3525 	new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V |
3526 	    pmap_memattr_bits(m->md.pv_memattr));
3527 	if ((m->oflags & VPO_UNMANAGED) == 0)
3528 		new_l2 |= PTE_SW_MANAGED;
3529 	if ((prot & VM_PROT_EXECUTE) != 0)
3530 		new_l2 |= PTE_X;
3531 	if (va < VM_MAXUSER_ADDRESS)
3532 		new_l2 |= PTE_U;
3533 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
3534 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp));
3535 }
3536 
3537 /*
3538  * Returns true if every page table entry in the specified page table is
3539  * zero.
3540  */
3541 static bool
pmap_every_pte_zero(vm_paddr_t pa)3542 pmap_every_pte_zero(vm_paddr_t pa)
3543 {
3544 	pt_entry_t *pt_end, *pte;
3545 
3546 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
3547 	pte = PHYS_TO_DMAP(pa);
3548 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
3549 		if (*pte != 0)
3550 			return (false);
3551 	}
3552 	return (true);
3553 }
3554 
3555 /*
3556  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
3557  * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
3558  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
3559  * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists
3560  * within the 2MB virtual address range starting at the specified virtual
3561  * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
3562  * 2MB page mapping already exists at the specified virtual address.  Returns
3563  * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
3564  * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
3565  * and a PV entry allocation failed.
3566  *
3567  * The parameter "m" is only used when creating a managed, writeable mapping.
 *
 * The caller must hold the pmap lock (asserted below).  "new_l2" is the
 * complete leaf entry to install.
3568  */
3569 static int
pmap_enter_l2(pmap_t pmap,vm_offset_t va,pd_entry_t new_l2,u_int flags,vm_page_t m,struct rwlock ** lockp)3570 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
3571     vm_page_t m, struct rwlock **lockp)
3572 {
3573 	struct spglist free;
3574 	pd_entry_t *l2, *l3, oldl2;
3575 	vm_offset_t sva;
3576 	vm_page_t l2pg, mt;
3577 	vm_page_t uwptpg;
3578 
3579 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3580 
	/*
	 * Obtain the page that holds the L2 entry for va.  With
	 * PMAP_ENTER_NOSLEEP a NULL lock pointer is passed instead of lockp.
	 */
3581 	if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3582 	    NULL : lockp)) == NULL) {
3583 		CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page"
3584 		    " for va %#lx in pmap %p", va, pmap);
3585 		return (KERN_RESOURCE_SHORTAGE);
3586 	}
3587 
	/* Index into that page, via the direct map, to reach va's L2 entry. */
3588 	l2 = VM_PAGE_TO_DMAP(l2pg);
3589 	l2 = &l2[pmap_l2_index(va)];
3590 	if ((oldl2 = pmap_load(l2)) != 0) {
3591 		KASSERT(l2pg->ref_count > 1,
3592 		    ("pmap_enter_l2: l2pg's ref count is too low"));
3593 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
			/*
			 * Both failure returns below first drop one reference
			 * on l2pg (presumably the one acquired by
			 * pmap_alloc_l2() above).
			 */
3594 			if ((oldl2 & PTE_RWX) != 0) {
3595 				l2pg->ref_count--;
3596 				CTR2(KTR_PMAP,
3597 				    "pmap_enter_l2: no space for va %#lx"
3598 				    " in pmap %p", va, pmap);
3599 				return (KERN_NO_SPACE);
			/*
			 * The old entry points to an L3 table.  For a user
			 * address that alone is a conflict; for a kernel
			 * address it is one only if the table holds a
			 * non-zero entry.
			 */
3600 			} else if (va < VM_MAXUSER_ADDRESS ||
3601 			    !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) {
3602 				l2pg->ref_count--;
3603 				CTR2(KTR_PMAP, "pmap_enter_l2:"
3604 				    " failed to replace existing mapping"
3605 				    " for va %#lx in pmap %p", va, pmap);
3606 				return (KERN_FAILURE);
3607 			}
3608 		}
		/*
		 * Remove whatever is mapped in the range: either the old 2MB
		 * leaf, or each valid 4KB mapping in turn.  The 4KB loop
		 * stops early when pmap_remove_l3() returns non-zero
		 * (presumably because the L3 page itself was released).
		 */
3609 		SLIST_INIT(&free);
3610 		if ((oldl2 & PTE_RWX) != 0)
3611 			(void)pmap_remove_l2(pmap, l2, va,
3612 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
3613 		else
3614 			for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) {
3615 				l3 = pmap_l2_to_l3(l2, sva);
3616 				if ((pmap_load(l3) & PTE_V) != 0 &&
3617 				    pmap_remove_l3(pmap, l3, sva, oldl2, &free,
3618 				    lockp) != 0)
3619 					break;
3620 			}
3621 		vm_page_free_pages_toq(&free, true);
3622 		if (va >= VM_MAXUSER_ADDRESS) {
3623 			/*
3624 			 * Both pmap_remove_l2() and pmap_remove_l3() will
3625 			 * leave the kernel page table page zero filled.
3626 			 */
			/*
			 * Record the page table page with
			 * pmap_insert_pt_page(); a failure here is fatal.
			 */
3627 			mt = PTE_TO_VM_PAGE(pmap_load(l2));
3628 			if (pmap_insert_pt_page(pmap, mt, false, false))
3629 				panic("pmap_enter_l2: trie insert failed");
3630 		} else
3631 			KASSERT(pmap_load(l2) == 0,
3632 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
3633 	}
3634 
3635 	/*
3636 	 * Allocate leaf ptpage for wired userspace pages.
3637 	 */
3638 	uwptpg = NULL;
3639 	if ((new_l2 & PTE_SW_WIRED) != 0 && pmap != kernel_pmap) {
3640 		uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3641 		if (uwptpg == NULL) {
3642 			pmap_abort_ptp(pmap, va, l2pg);
3643 			return (KERN_RESOURCE_SHORTAGE);
3644 		}
3645 		uwptpg->pindex = pmap_l2_pindex(va);
3646 		if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
			/* Undo the allocation before giving up. */
3647 			vm_page_unwire_noq(uwptpg);
3648 			vm_page_free(uwptpg);
3649 			pmap_abort_ptp(pmap, va, l2pg);
3650 			return (KERN_RESOURCE_SHORTAGE);
3651 		}
3652 		pmap_resident_count_inc(pmap, 1);
3653 		uwptpg->ref_count = Ln_ENTRIES;
3654 	}
3655 	if ((new_l2 & PTE_SW_MANAGED) != 0) {
3656 		/*
3657 		 * Abort this mapping if its PV entry could not be created.
3658 		 */
3659 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
3660 			pmap_abort_ptp(pmap, va, l2pg);
			/*
			 * Also undo the wired leaf page table page set up
			 * above: take it back out, restore the resident
			 * count, and free it.
			 */
3661 			if (uwptpg != NULL) {
3662 				mt = pmap_remove_pt_page(pmap, va);
3663 				KASSERT(mt == uwptpg,
3664 				    ("removed pt page %p, expected %p", mt,
3665 				    uwptpg));
3666 				pmap_resident_count_dec(pmap, 1);
3667 				uwptpg->ref_count = 1;
3668 				vm_page_unwire_noq(uwptpg);
3669 				vm_page_free(uwptpg);
3670 			}
3671 			CTR2(KTR_PMAP,
3672 			    "pmap_enter_l2: failed to create PV entry"
3673 			    " for va %#lx in pmap %p", va, pmap);
3674 			return (KERN_RESOURCE_SHORTAGE);
3675 		}
		/* Flag every 4KB page of a writeable superpage as writeable. */
3676 		if ((new_l2 & PTE_W) != 0)
3677 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3678 				vm_page_aflag_set(mt, PGA_WRITEABLE);
3679 	}
3680 
3681 	/*
3682 	 * Increment counters.
3683 	 */
3684 	if ((new_l2 & PTE_SW_WIRED) != 0)
3685 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
3686 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
3687 
3688 	/*
3689 	 * Map the superpage.
3690 	 */
3691 	pmap_store(l2, new_l2);
3692 
3693 	atomic_add_long(&pmap_l2_mappings, 1);
3694 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
3695 	    va, pmap);
3696 
3697 	return (KERN_SUCCESS);
3698 }
3699 
3700 /*
3701  * Maps a sequence of resident pages belonging to the same object.
3702  * The sequence begins with the given page m_start.  This page is
3703  * mapped at the given virtual address start.  Each subsequent page is
3704  * mapped at a virtual address that is offset from start by the same
3705  * amount as the page is offset from m_start within the object.  The
3706  * last page in the sequence is the page with the largest offset from
3707  * m_start that can be mapped at a virtual address less than the given
3708  * virtual address end.  Not every virtual page between start and end
3709  * is mapped; only those for which a resident page exists with the
3710  * corresponding offset from m_start are mapped.
3711  */
3712 void
pmap_enter_object(pmap_t pmap,vm_offset_t start,vm_offset_t end,vm_page_t m_start,vm_prot_t prot)3713 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3714     vm_page_t m_start, vm_prot_t prot)
3715 {
3716 	struct pctrie_iter pages;
3717 	struct rwlock *lock;
3718 	vm_offset_t va;
3719 	vm_page_t m, mpte;
3720 	int rv;
3721 
3722 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3723 
3724 	mpte = NULL;
3725 	vm_page_iter_limit_init(&pages, m_start->object,
3726 	    m_start->pindex + atop(end - start));
3727 	m = vm_radix_iter_lookup(&pages, m_start->pindex);
3728 	lock = NULL;
3729 	rw_rlock(&pvh_global_lock);
3730 	PMAP_LOCK(pmap);
3731 	while (m != NULL) {
3732 		va = start + ptoa(m->pindex - m_start->pindex);
3733 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
3734 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
3735 		    ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
3736 		    KERN_SUCCESS || rv == KERN_NO_SPACE)) {
3737 			m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE);
3738 		} else {
3739 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
3740 			    &lock);
3741 			m = vm_radix_iter_step(&pages);
3742 		}
3743 	}
3744 	if (lock != NULL)
3745 		rw_wunlock(lock);
3746 	rw_runlock(&pvh_global_lock);
3747 	PMAP_UNLOCK(pmap);
3748 }
3749 
3750 /*
3751  * this code makes some *MAJOR* assumptions:
3752  * 1. Current pmap & pmap exists.
3753  * 2. Not wired.
3754  * 3. Read access.
3755  * 4. No page table pages.
3756  * but is *MUCH* faster than pmap_enter...
3757  */
3758 
3759 void
pmap_enter_quick(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)3760 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3761 {
3762 	struct rwlock *lock;
3763 
3764 	lock = NULL;
3765 	rw_rlock(&pvh_global_lock);
3766 	PMAP_LOCK(pmap);
3767 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3768 	if (lock != NULL)
3769 		rw_wunlock(lock);
3770 	rw_runlock(&pvh_global_lock);
3771 	PMAP_UNLOCK(pmap);
3772 }
3773 
3774 static vm_page_t
pmap_enter_quick_locked(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,vm_page_t mpte,struct rwlock ** lockp)3775 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3776     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3777 {
3778 	struct spglist free;
3779 	pd_entry_t *l2;
3780 	pt_entry_t *l3, newl3;
3781 
3782 	KASSERT(!VA_IS_CLEANMAP(va) ||
3783 	    (m->oflags & VPO_UNMANAGED) != 0,
3784 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3785 	rw_assert(&pvh_global_lock, RA_LOCKED);
3786 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3787 	l2 = NULL;
3788 
3789 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3790 	/*
3791 	 * In the case that a page table page is not
3792 	 * resident, we are creating it here.
3793 	 */
3794 	if (va < VM_MAXUSER_ADDRESS) {
3795 		vm_pindex_t l2pindex;
3796 
3797 		/*
3798 		 * Calculate pagetable page index
3799 		 */
3800 		l2pindex = pmap_l2_pindex(va);
3801 		if (mpte && (mpte->pindex == l2pindex)) {
3802 			mpte->ref_count++;
3803 		} else {
3804 			/*
3805 			 * Get the l2 entry
3806 			 */
3807 			l2 = pmap_l2(pmap, va);
3808 
3809 			/*
3810 			 * If the page table page is mapped, we just increment
3811 			 * the hold count, and activate it.  Otherwise, we
3812 			 * attempt to allocate a page table page.  If this
3813 			 * attempt fails, we don't retry.  Instead, we give up.
3814 			 */
3815 			if (l2 != NULL && pmap_load(l2) != 0) {
3816 				if ((pmap_load(l2) & PTE_RWX) != 0)
3817 					return (NULL);
3818 				mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3819 				mpte->ref_count++;
3820 			} else {
3821 				/*
3822 				 * Pass NULL instead of the PV list lock
3823 				 * pointer, because we don't intend to sleep.
3824 				 */
3825 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3826 				if (mpte == NULL)
3827 					return (mpte);
3828 			}
3829 		}
3830 		l3 = VM_PAGE_TO_DMAP(mpte);
3831 		l3 = &l3[pmap_l3_index(va)];
3832 	} else {
3833 		mpte = NULL;
3834 		l3 = pmap_l3(kernel_pmap, va);
3835 	}
3836 	if (l3 == NULL)
3837 		panic("pmap_enter_quick_locked: No l3");
3838 	if (pmap_load(l3) != 0) {
3839 		if (mpte != NULL)
3840 			mpte->ref_count--;
3841 		return (NULL);
3842 	}
3843 
3844 	/*
3845 	 * Enter on the PV list if part of our managed memory.
3846 	 */
3847 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3848 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3849 		if (mpte != NULL) {
3850 			SLIST_INIT(&free);
3851 			if (pmap_unwire_ptp(pmap, va, mpte, &free))
3852 				vm_page_free_pages_toq(&free, false);
3853 		}
3854 		return (NULL);
3855 	}
3856 
3857 	/*
3858 	 * Increment counters
3859 	 */
3860 	pmap_resident_count_inc(pmap, 1);
3861 
3862 	newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
3863 	    PTE_V | PTE_R | pmap_memattr_bits(m->md.pv_memattr);
3864 	if ((prot & VM_PROT_EXECUTE) != 0)
3865 		newl3 |= PTE_X;
3866 	if ((m->oflags & VPO_UNMANAGED) == 0)
3867 		newl3 |= PTE_SW_MANAGED;
3868 	if (va < VM_MAX_USER_ADDRESS)
3869 		newl3 |= PTE_U;
3870 
3871 	/*
3872 	 * Sync the i-cache on all harts before updating the PTE
3873 	 * if the new PTE is executable.
3874 	 */
3875 	if (prot & VM_PROT_EXECUTE)
3876 		pmap_sync_icache(pmap, va, PAGE_SIZE);
3877 
3878 	pmap_store(l3, newl3);
3879 
3880 #if VM_NRESERVLEVEL > 0
3881 	/*
3882 	 * If both the PTP and the reservation are fully populated, then attempt
3883 	 * promotion.
3884 	 */
3885 	if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
3886 	    (mpte == NULL || mpte->ref_count == Ln_ENTRIES) &&
3887 	    (m->flags & PG_FICTITIOUS) == 0 &&
3888 	    vm_reserv_level_iffullpop(m) == 0) {
3889 		if (l2 == NULL)
3890 			l2 = pmap_l2(pmap, va);
3891 
3892 		/*
3893 		 * If promotion succeeds, then the next call to this function
3894 		 * should not be given the unmapped PTP as a hint.
3895 		 */
3896 		if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
3897 			mpte = NULL;
3898 	}
3899 #endif
3900 
3901 	return (mpte);
3902 }
3903 
3904 /*
3905  * This code maps large physical mmap regions into the
3906  * processor address space.  Note that some shortcuts
3907  * are taken, but the code works.
3908  */
3909 void
pmap_object_init_pt(pmap_t pmap,vm_offset_t addr,vm_object_t object,vm_pindex_t pindex,vm_size_t size)3910 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3911     vm_pindex_t pindex, vm_size_t size)
3912 {
3913 
3914 	VM_OBJECT_ASSERT_WLOCKED(object);
3915 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3916 	    ("pmap_object_init_pt: non-device object"));
3917 }
3918 
3919 /*
3920  *	Clear the wired attribute from the mappings for the specified range of
3921  *	addresses in the given pmap.  Every valid mapping within that range
3922  *	must have the wired attribute set.  In contrast, invalid mappings
3923  *	cannot have the wired attribute set, so they are ignored.
3924  *
3925  *	The wired attribute of the page table entry is not a hardware feature,
3926  *	so there is no need to invalidate any TLB entries.
3927  */
3928 void
pmap_unwire(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)3929 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3930 {
3931 	vm_offset_t va_next;
3932 	pd_entry_t *l0, *l1, *l2, l2e;
3933 	pt_entry_t *l3, l3e;
3934 	bool pv_lists_locked;
3935 
3936 	pv_lists_locked = false;
3937 retry:
3938 	PMAP_LOCK(pmap);
3939 	for (; sva < eva; sva = va_next) {
3940 		if (pmap_mode == PMAP_MODE_SV48) {
3941 			l0 = pmap_l0(pmap, sva);
3942 			if (pmap_load(l0) == 0) {
3943 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3944 				if (va_next < sva)
3945 					va_next = eva;
3946 				continue;
3947 			}
3948 			l1 = pmap_l0_to_l1(l0, sva);
3949 		} else {
3950 			l1 = pmap_l1(pmap, sva);
3951 		}
3952 
3953 		if (pmap_load(l1) == 0) {
3954 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3955 			if (va_next < sva)
3956 				va_next = eva;
3957 			continue;
3958 		}
3959 
3960 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3961 		if (va_next < sva)
3962 			va_next = eva;
3963 
3964 		l2 = pmap_l1_to_l2(l1, sva);
3965 		if ((l2e = pmap_load(l2)) == 0)
3966 			continue;
3967 		if ((l2e & PTE_RWX) != 0) {
3968 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3969 				if ((l2e & PTE_SW_WIRED) == 0)
3970 					panic("pmap_unwire: l2 %#jx is missing "
3971 					    "PTE_SW_WIRED", (uintmax_t)l2e);
3972 				pmap_clear_bits(l2, PTE_SW_WIRED);
3973 				continue;
3974 			} else {
3975 				if (!pv_lists_locked) {
3976 					pv_lists_locked = true;
3977 					if (!rw_try_rlock(&pvh_global_lock)) {
3978 						PMAP_UNLOCK(pmap);
3979 						rw_rlock(&pvh_global_lock);
3980 						/* Repeat sva. */
3981 						goto retry;
3982 					}
3983 				}
3984 				if (!pmap_demote_l2(pmap, l2, sva))
3985 					panic("pmap_unwire: demotion failed");
3986 			}
3987 		}
3988 
3989 		if (va_next > eva)
3990 			va_next = eva;
3991 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3992 		    sva += L3_SIZE) {
3993 			if ((l3e = pmap_load(l3)) == 0)
3994 				continue;
3995 			if ((l3e & PTE_SW_WIRED) == 0)
3996 				panic("pmap_unwire: l3 %#jx is missing "
3997 				    "PTE_SW_WIRED", (uintmax_t)l3e);
3998 
3999 			/*
4000 			 * PG_W must be cleared atomically.  Although the pmap
4001 			 * lock synchronizes access to PG_W, another processor
4002 			 * could be setting PG_M and/or PG_A concurrently.
4003 			 */
4004 			pmap_clear_bits(l3, PTE_SW_WIRED);
4005 			pmap->pm_stats.wired_count--;
4006 		}
4007 	}
4008 	if (pv_lists_locked)
4009 		rw_runlock(&pvh_global_lock);
4010 	PMAP_UNLOCK(pmap);
4011 }
4012 
4013 /*
4014  *	Copy the range specified by src_addr/len
4015  *	from the source map to the range dst_addr/len
4016  *	in the destination map.
4017  *
4018  *	This routine is only advisory and need not do anything.
4019  */
4020 
4021 void
pmap_copy(pmap_t dst_pmap,pmap_t src_pmap,vm_offset_t dst_addr,vm_size_t len,vm_offset_t src_addr)4022 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4023     vm_offset_t src_addr)
4024 {
4025 
4026 }
4027 
4028 /*
4029  *	pmap_zero_page zeros the specified hardware page by mapping
4030  *	the page into KVM and using bzero to clear its contents.
4031  */
4032 void
pmap_zero_page(vm_page_t m)4033 pmap_zero_page(vm_page_t m)
4034 {
4035 	pagezero(VM_PAGE_TO_DMAP(m));
4036 }
4037 
4038 /*
4039  *	pmap_zero_page_area zeros the specified hardware page by mapping
4040  *	the page into KVM and using bzero to clear its contents.
4041  *
4042  *	off and size may not cover an area beyond a single hardware page.
4043  */
4044 void
pmap_zero_page_area(vm_page_t m,int off,int size)4045 pmap_zero_page_area(vm_page_t m, int off, int size)
4046 {
4047 	void *va = VM_PAGE_TO_DMAP(m);
4048 
4049 	if (off == 0 && size == PAGE_SIZE)
4050 		pagezero(va);
4051 	else
4052 		bzero((char *)va + off, size);
4053 }
4054 
4055 /*
4056  *	pmap_copy_page copies the specified (machine independent)
4057  *	page by mapping the page into virtual memory and using
4058  *	bcopy to copy the page, one machine dependent page at a
4059  *	time.
4060  */
4061 void
pmap_copy_page(vm_page_t msrc,vm_page_t mdst)4062 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4063 {
4064 	void *src = VM_PAGE_TO_DMAP(msrc);
4065 	void *dst = VM_PAGE_TO_DMAP(mdst);
4066 
4067 	pagecopy(src, dst);
4068 }
4069 
4070 int unmapped_buf_allowed = 1;
4071 
4072 void
pmap_copy_pages(vm_page_t ma[],vm_offset_t a_offset,vm_page_t mb[],vm_offset_t b_offset,int xfersize)4073 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4074     vm_offset_t b_offset, int xfersize)
4075 {
4076 	void *a_cp, *b_cp;
4077 	vm_page_t m_a, m_b;
4078 	vm_paddr_t p_a, p_b;
4079 	vm_offset_t a_pg_offset, b_pg_offset;
4080 	int cnt;
4081 
4082 	while (xfersize > 0) {
4083 		a_pg_offset = a_offset & PAGE_MASK;
4084 		m_a = ma[a_offset >> PAGE_SHIFT];
4085 		p_a = m_a->phys_addr;
4086 		b_pg_offset = b_offset & PAGE_MASK;
4087 		m_b = mb[b_offset >> PAGE_SHIFT];
4088 		p_b = m_b->phys_addr;
4089 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4090 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4091 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
4092 			panic("!DMAP a %lx", p_a);
4093 		} else {
4094 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
4095 		}
4096 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
4097 			panic("!DMAP b %lx", p_b);
4098 		} else {
4099 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
4100 		}
4101 		bcopy(a_cp, b_cp, cnt);
4102 		a_offset += cnt;
4103 		b_offset += cnt;
4104 		xfersize -= cnt;
4105 	}
4106 }
4107 
4108 void *
pmap_quick_enter_page(vm_page_t m)4109 pmap_quick_enter_page(vm_page_t m)
4110 {
4111 
4112 	return (VM_PAGE_TO_DMAP(m));
4113 }
4114 
/*
 * Counterpart of pmap_quick_enter_page().  That function creates no
 * mapping, so there is nothing to tear down here.
 */
void
pmap_quick_remove_page(void *addr)
{
	/* Intentionally empty. */
}
4119 
4120 /*
4121  * Returns true if the pmap's pv is one of the first
4122  * 16 pvs linked to from this page.  This count may
4123  * be changed upwards or downwards in the future; it
4124  * is only necessary that true be returned for a small
4125  * subset of pmaps for proper page aging.
4126  */
4127 bool
pmap_page_exists_quick(pmap_t pmap,vm_page_t m)4128 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4129 {
4130 	struct md_page *pvh;
4131 	struct rwlock *lock;
4132 	pv_entry_t pv;
4133 	int loops = 0;
4134 	bool rv;
4135 
4136 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4137 	    ("pmap_page_exists_quick: page %p is not managed", m));
4138 	rv = false;
4139 	rw_rlock(&pvh_global_lock);
4140 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4141 	rw_rlock(lock);
4142 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4143 		if (PV_PMAP(pv) == pmap) {
4144 			rv = true;
4145 			break;
4146 		}
4147 		loops++;
4148 		if (loops >= 16)
4149 			break;
4150 	}
4151 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4152 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4153 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4154 			if (PV_PMAP(pv) == pmap) {
4155 				rv = true;
4156 				break;
4157 			}
4158 			loops++;
4159 			if (loops >= 16)
4160 				break;
4161 		}
4162 	}
4163 	rw_runlock(lock);
4164 	rw_runlock(&pvh_global_lock);
4165 	return (rv);
4166 }
4167 
4168 /*
4169  *	pmap_page_wired_mappings:
4170  *
4171  *	Return the number of managed mappings to the given physical page
4172  *	that are wired.
4173  */
4174 int
pmap_page_wired_mappings(vm_page_t m)4175 pmap_page_wired_mappings(vm_page_t m)
4176 {
4177 	struct md_page *pvh;
4178 	struct rwlock *lock;
4179 	pmap_t pmap;
4180 	pd_entry_t *l2;
4181 	pt_entry_t *l3;
4182 	pv_entry_t pv;
4183 	int count, md_gen, pvh_gen;
4184 
4185 	if ((m->oflags & VPO_UNMANAGED) != 0)
4186 		return (0);
4187 	rw_rlock(&pvh_global_lock);
4188 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4189 	rw_rlock(lock);
4190 restart:
4191 	count = 0;
4192 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4193 		pmap = PV_PMAP(pv);
4194 		if (!PMAP_TRYLOCK(pmap)) {
4195 			md_gen = m->md.pv_gen;
4196 			rw_runlock(lock);
4197 			PMAP_LOCK(pmap);
4198 			rw_rlock(lock);
4199 			if (md_gen != m->md.pv_gen) {
4200 				PMAP_UNLOCK(pmap);
4201 				goto restart;
4202 			}
4203 		}
4204 		l2 = pmap_l2(pmap, pv->pv_va);
4205 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4206 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4207 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4208 		if ((pmap_load(l3) & PTE_SW_WIRED) != 0)
4209 			count++;
4210 		PMAP_UNLOCK(pmap);
4211 	}
4212 	if ((m->flags & PG_FICTITIOUS) == 0) {
4213 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4214 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4215 			pmap = PV_PMAP(pv);
4216 			if (!PMAP_TRYLOCK(pmap)) {
4217 				md_gen = m->md.pv_gen;
4218 				pvh_gen = pvh->pv_gen;
4219 				rw_runlock(lock);
4220 				PMAP_LOCK(pmap);
4221 				rw_rlock(lock);
4222 				if (md_gen != m->md.pv_gen ||
4223 				    pvh_gen != pvh->pv_gen) {
4224 					PMAP_UNLOCK(pmap);
4225 					goto restart;
4226 				}
4227 			}
4228 			l2 = pmap_l2(pmap, pv->pv_va);
4229 			if ((pmap_load(l2) & PTE_SW_WIRED) != 0)
4230 				count++;
4231 			PMAP_UNLOCK(pmap);
4232 		}
4233 	}
4234 	rw_runlock(lock);
4235 	rw_runlock(&pvh_global_lock);
4236 	return (count);
4237 }
4238 
4239 /*
4240  * Returns true if the given page is mapped individually or as part of
4241  * a 2mpage.  Otherwise, returns false.
4242  */
4243 bool
pmap_page_is_mapped(vm_page_t m)4244 pmap_page_is_mapped(vm_page_t m)
4245 {
4246 	struct rwlock *lock;
4247 	bool rv;
4248 
4249 	if ((m->oflags & VPO_UNMANAGED) != 0)
4250 		return (false);
4251 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4252 	rw_rlock(lock);
4253 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4254 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4255 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4256 	rw_runlock(lock);
4257 	return (rv);
4258 }
4259 
4260 static void
pmap_remove_pages_pv(pmap_t pmap,vm_page_t m,pv_entry_t pv,struct spglist * free,bool superpage)4261 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv,
4262     struct spglist *free, bool superpage)
4263 {
4264 	struct md_page *pvh;
4265 	vm_page_t mpte, mt;
4266 
4267 	if (superpage) {
4268 		pmap_resident_count_dec(pmap, Ln_ENTRIES);
4269 		pvh = pa_to_pvh(m->phys_addr);
4270 		TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4271 		pvh->pv_gen++;
4272 		if (TAILQ_EMPTY(&pvh->pv_list)) {
4273 			for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
4274 				if (TAILQ_EMPTY(&mt->md.pv_list) &&
4275 				    (mt->a.flags & PGA_WRITEABLE) != 0)
4276 					vm_page_aflag_clear(mt, PGA_WRITEABLE);
4277 		}
4278 		mpte = pmap_remove_pt_page(pmap, pv->pv_va);
4279 		if (mpte != NULL) {
4280 			KASSERT(vm_page_any_valid(mpte),
4281 			    ("pmap_remove_pages: pte page not promoted"));
4282 			pmap_resident_count_dec(pmap, 1);
4283 			KASSERT(mpte->ref_count == Ln_ENTRIES,
4284 			    ("pmap_remove_pages: pte page ref count error"));
4285 			mpte->ref_count = 0;
4286 			pmap_add_delayed_free_list(mpte, free, false);
4287 		}
4288 	} else {
4289 		pmap_resident_count_dec(pmap, 1);
4290 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4291 		m->md.pv_gen++;
4292 		if (TAILQ_EMPTY(&m->md.pv_list) &&
4293 		    (m->a.flags & PGA_WRITEABLE) != 0) {
4294 			pvh = pa_to_pvh(m->phys_addr);
4295 			if (TAILQ_EMPTY(&pvh->pv_list))
4296 				vm_page_aflag_clear(m, PGA_WRITEABLE);
4297 		}
4298 	}
4299 }
4300 
4301 /*
4302  * Destroy all managed, non-wired mappings in the given user-space
4303  * pmap.  This pmap cannot be active on any processor besides the
4304  * caller.
4305  *
4306  * This function cannot be applied to the kernel pmap.  Moreover, it
4307  * is not intended for general use.  It is only to be used during
4308  * process termination.  Consequently, it can be implemented in ways
4309  * that make it faster than pmap_remove().  First, it can more quickly
4310  * destroy mappings by iterating over the pmap's collection of PV
4311  * entries, rather than searching the page table.  Second, it doesn't
4312  * have to test and clear the page table entries atomically, because
4313  * no processor is currently accessing the user address space.  In
4314  * particular, a page table entry's dirty bit won't change state once
4315  * this function starts.
4316  */
4317 void
pmap_remove_pages(pmap_t pmap)4318 pmap_remove_pages(pmap_t pmap)
4319 {
4320 	struct spglist free;
4321 	pd_entry_t ptepde;
4322 	pt_entry_t *pte, tpte;
4323 	vm_page_t m, mt;
4324 	pv_entry_t pv;
4325 	struct pv_chunk *pc, *npc;
4326 	struct rwlock *lock;
4327 	int64_t bit;
4328 	uint64_t inuse, bitmask;
4329 	int allfree, field, freed __pv_stat_used, idx;
4330 	bool superpage;
4331 
4332 	lock = NULL;
4333 
4334 	SLIST_INIT(&free);
4335 	rw_rlock(&pvh_global_lock);
4336 	PMAP_LOCK(pmap);
4337 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4338 		allfree = 1;
4339 		freed = 0;
4340 		for (field = 0; field < _NPCM; field++) {
4341 			inuse = ~pc->pc_map[field] & pc_freemask[field];
4342 			while (inuse != 0) {
4343 				bit = ffsl(inuse) - 1;
4344 				bitmask = 1UL << bit;
4345 				idx = field * 64 + bit;
4346 				pv = &pc->pc_pventry[idx];
4347 				inuse &= ~bitmask;
4348 
4349 				pte = pmap_l1(pmap, pv->pv_va);
4350 				ptepde = pmap_load(pte);
4351 				pte = pmap_l1_to_l2(pte, pv->pv_va);
4352 				tpte = pmap_load(pte);
4353 
4354 				KASSERT((tpte & PTE_V) != 0,
4355 				    ("L2 PTE is invalid... bogus PV entry? "
4356 				    "va=%#lx, pte=%#lx", pv->pv_va, tpte));
4357 				if ((tpte & PTE_RWX) != 0) {
4358 					superpage = true;
4359 				} else {
4360 					ptepde = tpte;
4361 					pte = pmap_l2_to_l3(pte, pv->pv_va);
4362 					tpte = pmap_load(pte);
4363 					superpage = false;
4364 				}
4365 
4366 				/*
4367 				 * We cannot remove wired pages from a
4368 				 * process' mapping at this time.
4369 				 */
4370 				if (tpte & PTE_SW_WIRED) {
4371 					allfree = 0;
4372 					continue;
4373 				}
4374 
4375 				m = PTE_TO_VM_PAGE(tpte);
4376 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4377 				    m < &vm_page_array[vm_page_array_size],
4378 				    ("pmap_remove_pages: bad pte %#jx",
4379 				    (uintmax_t)tpte));
4380 
4381 				pmap_clear(pte);
4382 
4383 				/*
4384 				 * Update the vm_page_t clean/reference bits.
4385 				 */
4386 				if ((tpte & (PTE_D | PTE_W)) ==
4387 				    (PTE_D | PTE_W)) {
4388 					if (superpage)
4389 						for (mt = m;
4390 						    mt < &m[Ln_ENTRIES]; mt++)
4391 							vm_page_dirty(mt);
4392 					else
4393 						vm_page_dirty(m);
4394 				}
4395 
4396 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
4397 
4398 				/* Mark free */
4399 				pc->pc_map[field] |= bitmask;
4400 
4401 				pmap_remove_pages_pv(pmap, m, pv, &free,
4402 				    superpage);
4403 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
4404 				freed++;
4405 			}
4406 		}
4407 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
4408 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
4409 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
4410 		if (allfree) {
4411 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4412 			free_pv_chunk(pc);
4413 		}
4414 	}
4415 	if (lock != NULL)
4416 		rw_wunlock(lock);
4417 	pmap_invalidate_all(pmap);
4418 	rw_runlock(&pvh_global_lock);
4419 	PMAP_UNLOCK(pmap);
4420 	vm_page_free_pages_toq(&free, false);
4421 }
4422 
4423 static bool
pmap_page_test_mappings(vm_page_t m,bool accessed,bool modified)4424 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
4425 {
4426 	struct md_page *pvh;
4427 	struct rwlock *lock;
4428 	pd_entry_t *l2;
4429 	pt_entry_t *l3, mask;
4430 	pv_entry_t pv;
4431 	pmap_t pmap;
4432 	int md_gen, pvh_gen;
4433 	bool rv;
4434 
4435 	mask = 0;
4436 	if (modified)
4437 		mask |= PTE_D;
4438 	if (accessed)
4439 		mask |= PTE_A;
4440 
4441 	rv = false;
4442 	rw_rlock(&pvh_global_lock);
4443 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4444 	rw_rlock(lock);
4445 restart:
4446 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4447 		pmap = PV_PMAP(pv);
4448 		if (!PMAP_TRYLOCK(pmap)) {
4449 			md_gen = m->md.pv_gen;
4450 			rw_runlock(lock);
4451 			PMAP_LOCK(pmap);
4452 			rw_rlock(lock);
4453 			if (md_gen != m->md.pv_gen) {
4454 				PMAP_UNLOCK(pmap);
4455 				goto restart;
4456 			}
4457 		}
4458 		l2 = pmap_l2(pmap, pv->pv_va);
4459 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4460 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4461 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4462 		rv = (pmap_load(l3) & mask) == mask;
4463 		PMAP_UNLOCK(pmap);
4464 		if (rv)
4465 			goto out;
4466 	}
4467 	if ((m->flags & PG_FICTITIOUS) == 0) {
4468 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4469 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4470 			pmap = PV_PMAP(pv);
4471 			if (!PMAP_TRYLOCK(pmap)) {
4472 				md_gen = m->md.pv_gen;
4473 				pvh_gen = pvh->pv_gen;
4474 				rw_runlock(lock);
4475 				PMAP_LOCK(pmap);
4476 				rw_rlock(lock);
4477 				if (md_gen != m->md.pv_gen ||
4478 				    pvh_gen != pvh->pv_gen) {
4479 					PMAP_UNLOCK(pmap);
4480 					goto restart;
4481 				}
4482 			}
4483 			l2 = pmap_l2(pmap, pv->pv_va);
4484 			rv = (pmap_load(l2) & mask) == mask;
4485 			PMAP_UNLOCK(pmap);
4486 			if (rv)
4487 				goto out;
4488 		}
4489 	}
4490 out:
4491 	rw_runlock(lock);
4492 	rw_runlock(&pvh_global_lock);
4493 	return (rv);
4494 }
4495 
4496 /*
4497  *	pmap_is_modified:
4498  *
4499  *	Return whether or not the specified physical page was modified
4500  *	in any physical maps.
4501  */
4502 bool
pmap_is_modified(vm_page_t m)4503 pmap_is_modified(vm_page_t m)
4504 {
4505 
4506 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4507 	    ("pmap_is_modified: page %p is not managed", m));
4508 
4509 	/*
4510 	 * If the page is not busied then this check is racy.
4511 	 */
4512 	if (!pmap_page_is_write_mapped(m))
4513 		return (false);
4514 	return (pmap_page_test_mappings(m, false, true));
4515 }
4516 
4517 /*
4518  *	pmap_is_prefaultable:
4519  *
4520  *	Return whether or not the specified virtual address is eligible
4521  *	for prefault.
4522  */
4523 bool
pmap_is_prefaultable(pmap_t pmap,vm_offset_t addr)4524 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4525 {
4526 	pt_entry_t *l3;
4527 	bool rv;
4528 
4529 	/*
4530 	 * Return true if and only if the L3 entry for the specified virtual
4531 	 * address is allocated but invalid.
4532 	 */
4533 	rv = false;
4534 	PMAP_LOCK(pmap);
4535 	l3 = pmap_l3(pmap, addr);
4536 	if (l3 != NULL && pmap_load(l3) == 0) {
4537 		rv = true;
4538 	}
4539 	PMAP_UNLOCK(pmap);
4540 	return (rv);
4541 }
4542 
4543 /*
4544  *	pmap_is_referenced:
4545  *
4546  *	Return whether or not the specified physical page was referenced
4547  *	in any physical maps.
4548  */
4549 bool
pmap_is_referenced(vm_page_t m)4550 pmap_is_referenced(vm_page_t m)
4551 {
4552 
4553 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4554 	    ("pmap_is_referenced: page %p is not managed", m));
4555 	return (pmap_page_test_mappings(m, true, false));
4556 }
4557 
4558 /*
4559  * Clear the write and modified bits in each of the given page's mappings.
4560  */
4561 void
pmap_remove_write(vm_page_t m)4562 pmap_remove_write(vm_page_t m)
4563 {
4564 	struct md_page *pvh;
4565 	struct rwlock *lock;
4566 	pmap_t pmap;
4567 	pd_entry_t *l2;
4568 	pt_entry_t *l3, oldl3, newl3;
4569 	pv_entry_t next_pv, pv;
4570 	vm_offset_t va;
4571 	int md_gen, pvh_gen;
4572 
4573 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4574 	    ("pmap_remove_write: page %p is not managed", m));
4575 	vm_page_assert_busied(m);
4576 
4577 	if (!pmap_page_is_write_mapped(m))
4578 		return;
4579 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4580 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4581 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4582 	rw_rlock(&pvh_global_lock);
4583 retry_pv_loop:
4584 	rw_wlock(lock);
4585 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4586 		pmap = PV_PMAP(pv);
4587 		if (!PMAP_TRYLOCK(pmap)) {
4588 			pvh_gen = pvh->pv_gen;
4589 			rw_wunlock(lock);
4590 			PMAP_LOCK(pmap);
4591 			rw_wlock(lock);
4592 			if (pvh_gen != pvh->pv_gen) {
4593 				PMAP_UNLOCK(pmap);
4594 				rw_wunlock(lock);
4595 				goto retry_pv_loop;
4596 			}
4597 		}
4598 		va = pv->pv_va;
4599 		l2 = pmap_l2(pmap, va);
4600 		if ((pmap_load(l2) & PTE_W) != 0)
4601 			(void)pmap_demote_l2_locked(pmap, l2, va, &lock);
4602 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4603 		    ("inconsistent pv lock %p %p for page %p",
4604 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4605 		PMAP_UNLOCK(pmap);
4606 	}
4607 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4608 		pmap = PV_PMAP(pv);
4609 		if (!PMAP_TRYLOCK(pmap)) {
4610 			pvh_gen = pvh->pv_gen;
4611 			md_gen = m->md.pv_gen;
4612 			rw_wunlock(lock);
4613 			PMAP_LOCK(pmap);
4614 			rw_wlock(lock);
4615 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4616 				PMAP_UNLOCK(pmap);
4617 				rw_wunlock(lock);
4618 				goto retry_pv_loop;
4619 			}
4620 		}
4621 		l2 = pmap_l2(pmap, pv->pv_va);
4622 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4623 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4624 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4625 		oldl3 = pmap_load(l3);
4626 retry:
4627 		if ((oldl3 & PTE_W) != 0) {
4628 			newl3 = oldl3 & ~(PTE_D | PTE_W);
4629 			if (!atomic_fcmpset_long(l3, &oldl3, newl3))
4630 				goto retry;
4631 			if ((oldl3 & PTE_D) != 0)
4632 				vm_page_dirty(m);
4633 			pmap_invalidate_page(pmap, pv->pv_va);
4634 		}
4635 		PMAP_UNLOCK(pmap);
4636 	}
4637 	rw_wunlock(lock);
4638 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4639 	rw_runlock(&pvh_global_lock);
4640 }
4641 
4642 /*
4643  *	pmap_ts_referenced:
4644  *
4645  *	Return a count of reference bits for a page, clearing those bits.
4646  *	It is not necessary for every reference bit to be cleared, but it
4647  *	is necessary that 0 only be returned when there are truly no
4648  *	reference bits set.
4649  *
4650  *	As an optimization, update the page's dirty field if a modified bit is
4651  *	found while counting reference bits.  This opportunistic update can be
4652  *	performed at low cost and can eliminate the need for some future calls
4653  *	to pmap_is_modified().  However, since this function stops after
4654  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4655  *	dirty pages.  Those dirty pages will only be detected by a future call
4656  *	to pmap_is_modified().
4657  */
4658 int
pmap_ts_referenced(vm_page_t m)4659 pmap_ts_referenced(vm_page_t m)
4660 {
4661 	struct spglist free;
4662 	struct md_page *pvh;
4663 	struct rwlock *lock;
4664 	pv_entry_t pv, pvf;
4665 	pmap_t pmap;
4666 	pd_entry_t *l2, l2e;
4667 	pt_entry_t *l3, l3e;
4668 	vm_paddr_t pa;
4669 	vm_offset_t va;
4670 	int cleared, md_gen, not_cleared, pvh_gen;
4671 
4672 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4673 	    ("pmap_ts_referenced: page %p is not managed", m));
4674 	SLIST_INIT(&free);
4675 	cleared = 0;
4676 	pa = VM_PAGE_TO_PHYS(m);
4677 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4678 
4679 	lock = PHYS_TO_PV_LIST_LOCK(pa);
4680 	rw_rlock(&pvh_global_lock);
4681 	rw_wlock(lock);
4682 retry:
4683 	not_cleared = 0;
4684 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4685 		goto small_mappings;
4686 	pv = pvf;
4687 	do {
4688 		pmap = PV_PMAP(pv);
4689 		if (!PMAP_TRYLOCK(pmap)) {
4690 			pvh_gen = pvh->pv_gen;
4691 			rw_wunlock(lock);
4692 			PMAP_LOCK(pmap);
4693 			rw_wlock(lock);
4694 			if (pvh_gen != pvh->pv_gen) {
4695 				PMAP_UNLOCK(pmap);
4696 				goto retry;
4697 			}
4698 		}
4699 		va = pv->pv_va;
4700 		l2 = pmap_l2(pmap, va);
4701 		l2e = pmap_load(l2);
4702 		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
4703 			/*
4704 			 * Although l2e is mapping a 2MB page, because
4705 			 * this function is called at a 4KB page granularity,
4706 			 * we only update the 4KB page under test.
4707 			 */
4708 			vm_page_dirty(m);
4709 		}
4710 		if ((l2e & PTE_A) != 0) {
4711 			/*
4712 			 * Since this reference bit is shared by 512 4KB
4713 			 * pages, it should not be cleared every time it is
4714 			 * tested.  Apply a simple "hash" function on the
4715 			 * physical page number, the virtual superpage number,
4716 			 * and the pmap address to select one 4KB page out of
4717 			 * the 512 on which testing the reference bit will
4718 			 * result in clearing that reference bit.  This
4719 			 * function is designed to avoid the selection of the
4720 			 * same 4KB page for every 2MB page mapping.
4721 			 *
4722 			 * On demotion, a mapping that hasn't been referenced
4723 			 * is simply destroyed.  To avoid the possibility of a
4724 			 * subsequent page fault on a demoted wired mapping,
4725 			 * always leave its reference bit set.  Moreover,
4726 			 * since the superpage is wired, the current state of
4727 			 * its reference bit won't affect page replacement.
4728 			 */
4729 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4730 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4731 			    (l2e & PTE_SW_WIRED) == 0) {
4732 				pmap_clear_bits(l2, PTE_A);
4733 				pmap_invalidate_page(pmap, va);
4734 				cleared++;
4735 			} else
4736 				not_cleared++;
4737 		}
4738 		PMAP_UNLOCK(pmap);
4739 		/* Rotate the PV list if it has more than one entry. */
4740 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4741 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4742 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4743 			pvh->pv_gen++;
4744 		}
4745 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4746 			goto out;
4747 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4748 small_mappings:
4749 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4750 		goto out;
4751 	pv = pvf;
4752 	do {
4753 		pmap = PV_PMAP(pv);
4754 		if (!PMAP_TRYLOCK(pmap)) {
4755 			pvh_gen = pvh->pv_gen;
4756 			md_gen = m->md.pv_gen;
4757 			rw_wunlock(lock);
4758 			PMAP_LOCK(pmap);
4759 			rw_wlock(lock);
4760 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4761 				PMAP_UNLOCK(pmap);
4762 				goto retry;
4763 			}
4764 		}
4765 		l2 = pmap_l2(pmap, pv->pv_va);
4766 
4767 		KASSERT((pmap_load(l2) & PTE_RX) == 0,
4768 		    ("pmap_ts_referenced: found an invalid l2 table"));
4769 
4770 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4771 		l3e = pmap_load(l3);
4772 		if ((l3e & PTE_D) != 0)
4773 			vm_page_dirty(m);
4774 		if ((l3e & PTE_A) != 0) {
4775 			if ((l3e & PTE_SW_WIRED) == 0) {
4776 				/*
4777 				 * Wired pages cannot be paged out so
4778 				 * doing accessed bit emulation for
4779 				 * them is wasted effort. We do the
4780 				 * hard work for unwired pages only.
4781 				 */
4782 				pmap_clear_bits(l3, PTE_A);
4783 				pmap_invalidate_page(pmap, pv->pv_va);
4784 				cleared++;
4785 			} else
4786 				not_cleared++;
4787 		}
4788 		PMAP_UNLOCK(pmap);
4789 		/* Rotate the PV list if it has more than one entry. */
4790 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4791 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4792 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4793 			m->md.pv_gen++;
4794 		}
4795 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4796 	    not_cleared < PMAP_TS_REFERENCED_MAX);
4797 out:
4798 	rw_wunlock(lock);
4799 	rw_runlock(&pvh_global_lock);
4800 	vm_page_free_pages_toq(&free, false);
4801 	return (cleared + not_cleared);
4802 }
4803 
4804 /*
4805  *	Apply the given advice to the specified range of addresses within the
4806  *	given pmap.  Depending on the advice, clear the referenced and/or
4807  *	modified flags in each mapping and set the mapped page's dirty field.
4808  */
4809 void
pmap_advise(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,int advice)4810 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4811 {
4812 }
4813 
4814 /*
4815  *	Clear the modify bits on the specified physical page.
4816  */
4817 void
pmap_clear_modify(vm_page_t m)4818 pmap_clear_modify(vm_page_t m)
4819 {
4820 	struct md_page *pvh;
4821 	struct rwlock *lock;
4822 	pmap_t pmap;
4823 	pv_entry_t next_pv, pv;
4824 	pd_entry_t *l2, oldl2;
4825 	pt_entry_t *l3;
4826 	vm_offset_t va;
4827 	int md_gen, pvh_gen;
4828 
4829 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4830 	    ("%s: page %p is not managed", __func__, m));
4831 	vm_page_assert_busied(m);
4832 
4833 	if (!pmap_page_is_write_mapped(m))
4834 	        return;
4835 
4836 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4837 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4838 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4839 	rw_rlock(&pvh_global_lock);
4840 	rw_wlock(lock);
4841 restart:
4842 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4843 		pmap = PV_PMAP(pv);
4844 		if (!PMAP_TRYLOCK(pmap)) {
4845 			pvh_gen = pvh->pv_gen;
4846 			rw_wunlock(lock);
4847 			PMAP_LOCK(pmap);
4848 			rw_wlock(lock);
4849 			if (pvh_gen != pvh->pv_gen) {
4850 				PMAP_UNLOCK(pmap);
4851 				goto restart;
4852 			}
4853 		}
4854 		va = pv->pv_va;
4855 		l2 = pmap_l2(pmap, va);
4856 		oldl2 = pmap_load(l2);
4857 		/* If oldl2 has PTE_W set, then it also has PTE_D set. */
4858 		if ((oldl2 & PTE_W) != 0 &&
4859 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
4860 		    (oldl2 & PTE_SW_WIRED) == 0) {
4861 			/*
4862 			 * Write protect the mapping to a single page so that
4863 			 * a subsequent write access may repromote.
4864 			 */
4865 			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
4866 			l3 = pmap_l2_to_l3(l2, va);
4867 			pmap_clear_bits(l3, PTE_D | PTE_W);
4868 			vm_page_dirty(m);
4869 			pmap_invalidate_page(pmap, va);
4870 		}
4871 		PMAP_UNLOCK(pmap);
4872 	}
4873 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4874 		pmap = PV_PMAP(pv);
4875 		if (!PMAP_TRYLOCK(pmap)) {
4876 			md_gen = m->md.pv_gen;
4877 			pvh_gen = pvh->pv_gen;
4878 			rw_wunlock(lock);
4879 			PMAP_LOCK(pmap);
4880 			rw_wlock(lock);
4881 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4882 				PMAP_UNLOCK(pmap);
4883 				goto restart;
4884 			}
4885 		}
4886 		l2 = pmap_l2(pmap, pv->pv_va);
4887 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4888 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4889 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4890 		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
4891 			pmap_clear_bits(l3, PTE_D | PTE_W);
4892 			pmap_invalidate_page(pmap, pv->pv_va);
4893 		}
4894 		PMAP_UNLOCK(pmap);
4895 	}
4896 	rw_wunlock(lock);
4897 	rw_runlock(&pvh_global_lock);
4898 }
4899 
4900 void *
pmap_mapbios(vm_paddr_t pa,vm_size_t size)4901 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4902 {
4903 
4904         return (PHYS_TO_DMAP(pa));
4905 }
4906 
4907 void
pmap_unmapbios(void * p,vm_size_t size)4908 pmap_unmapbios(void *p, vm_size_t size)
4909 {
4910 }
4911 
4912 /*
4913  * Sets the memory attribute for the specified page.
4914  */
4915 void
pmap_page_set_memattr(vm_page_t m,vm_memattr_t ma)4916 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4917 {
4918 	if (m->md.pv_memattr == ma)
4919 		return;
4920 
4921 	m->md.pv_memattr = ma;
4922 
4923 	/*
4924 	 * If "m" is a normal page, update its direct mapping.  This update
4925 	 * can be relied upon to perform any cache operations that are
4926 	 * required for data coherence.
4927 	 */
4928 	if ((m->flags & PG_FICTITIOUS) == 0 &&
4929 	    pmap_change_attr(VM_PAGE_TO_DMAP(m), PAGE_SIZE,
4930 	    m->md.pv_memattr) != 0)
4931 		panic("memory attribute change on the direct map failed");
4932 }
4933 
4934 /*
4935  * Changes the specified virtual address range's memory type to that given by
4936  * the parameter "mode".  The specified virtual address range must be
4937  * completely contained within either the direct map or the kernel map.
4938  *
4939  * Returns zero if the change completed successfully, and either EINVAL or
4940  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4941  * of the virtual address range was not mapped, and ENOMEM is returned if
4942  * there was insufficient memory available to complete the change.  In the
4943  * latter case, the memory type may have been changed on some part of the
4944  * virtual address range.
4945  */
4946 int
pmap_change_attr(void * va,vm_size_t size,int mode)4947 pmap_change_attr(void *va, vm_size_t size, int mode)
4948 {
4949 	int error;
4950 
4951 	PMAP_LOCK(kernel_pmap);
4952 	error = pmap_change_attr_locked(va, size, mode);
4953 	PMAP_UNLOCK(kernel_pmap);
4954 	return (error);
4955 }
4956 
4957 static int
pmap_change_attr_locked(void * addr,vm_size_t size,int mode)4958 pmap_change_attr_locked(void *addr, vm_size_t size, int mode)
4959 {
4960 	vm_offset_t base, offset, tmpva, va;
4961 	vm_paddr_t phys;
4962 	pd_entry_t *l1, l1e;
4963 	pd_entry_t *l2, l2e;
4964 	pt_entry_t *l3, l3e;
4965 	pt_entry_t bits, mask;
4966 	bool anychanged = false;
4967 	int error = 0;
4968 
4969 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4970 	va = (vm_offset_t)addr;
4971 	base = trunc_page(va);
4972 	offset = va & PAGE_MASK;
4973 	size = round_page(offset + size);
4974 
4975 	if (!VIRT_IN_DMAP(base) &&
4976 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
4977 		return (EINVAL);
4978 
4979 	bits = pmap_memattr_bits(mode);
4980 	mask = memattr_mask;
4981 
4982 	/* First loop: perform PTE validation and demotions as necessary. */
4983 	for (tmpva = base; tmpva < base + size; ) {
4984 		l1 = pmap_l1(kernel_pmap, tmpva);
4985 		if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0)
4986 			return (EINVAL);
4987 		if ((l1e & PTE_RWX) != 0) {
4988 			/*
4989 			 * If the existing PTE has the correct attributes, then
4990 			 * no need to demote.
4991 			 */
4992 			if ((l1e & mask) == bits) {
4993 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
4994 				continue;
4995 			}
4996 
4997 			/*
4998 			 * If the 1GB page fits in the remaining range, we
4999 			 * don't need to demote.
5000 			 */
5001 			if ((tmpva & L1_OFFSET) == 0 &&
5002 			    tmpva + L1_SIZE <= base + size) {
5003 				tmpva += L1_SIZE;
5004 				continue;
5005 			}
5006 
5007 			if (!pmap_demote_l1(kernel_pmap, l1, tmpva))
5008 				return (EINVAL);
5009 		}
5010 		l2 = pmap_l1_to_l2(l1, tmpva);
5011 		if (((l2e = pmap_load(l2)) & PTE_V) == 0)
5012 			return (EINVAL);
5013 		if ((l2e & PTE_RWX) != 0) {
5014 			/*
5015 			 * If the existing PTE has the correct attributes, then
5016 			 * no need to demote.
5017 			 */
5018 			if ((l2e & mask) == bits) {
5019 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
5020 				continue;
5021 			}
5022 
5023 			/*
5024 			 * If the 2MB page fits in the remaining range, we
5025 			 * don't need to demote.
5026 			 */
5027 			if ((tmpva & L2_OFFSET) == 0 &&
5028 			    tmpva + L2_SIZE <= base + size) {
5029 				tmpva += L2_SIZE;
5030 				continue;
5031 			}
5032 
5033 			if (!pmap_demote_l2(kernel_pmap, l2, tmpva))
5034 				panic("l2 demotion failed");
5035 		}
5036 		l3 = pmap_l2_to_l3(l2, tmpva);
5037 		if (((l3e = pmap_load(l3)) & PTE_V) == 0)
5038 			return (EINVAL);
5039 
5040 		tmpva += PAGE_SIZE;
5041 	}
5042 
5043 	/* Second loop: perform PTE updates. */
5044 	for (tmpva = base; tmpva < base + size; ) {
5045 		l1 = pmap_l1(kernel_pmap, tmpva);
5046 		l1e = pmap_load(l1);
5047 		if ((l1e & PTE_RWX) != 0) {
5048 			/* Unchanged. */
5049 			if ((l1e & mask) == bits) {
5050 				tmpva += L1_SIZE;
5051 				continue;
5052 			}
5053 
5054 			l1e &= ~mask;
5055 			l1e |= bits;
5056 			pmap_store(l1, l1e);
5057 			anychanged = true;
5058 
5059 			/* Update corresponding DMAP entry */
5060 			phys = L1PTE_TO_PHYS(l1e);
5061 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5062 				error = pmap_change_attr_locked(
5063 				    PHYS_TO_DMAP(phys), L1_SIZE, mode);
5064 				if (error != 0)
5065 					break;
5066 			}
5067 			tmpva += L1_SIZE;
5068 			continue;
5069 		}
5070 
5071 		l2 = pmap_l1_to_l2(l1, tmpva);
5072 		l2e = pmap_load(l2);
5073 		if ((l2e & PTE_RWX) != 0) {
5074 			/* Unchanged. */
5075 			if ((l2e & mask) == bits) {
5076 				tmpva += L2_SIZE;
5077 				continue;
5078 			}
5079 
5080 			l2e &= ~mask;
5081 			l2e |= bits;
5082 			pmap_store(l2, l2e);
5083 			anychanged = true;
5084 
5085 			/* Update corresponding DMAP entry */
5086 			phys = L2PTE_TO_PHYS(l2e);
5087 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5088 				error = pmap_change_attr_locked(
5089 				    PHYS_TO_DMAP(phys), L2_SIZE, mode);
5090 				if (error != 0)
5091 					break;
5092 			}
5093 			tmpva += L2_SIZE;
5094 			continue;
5095 		}
5096 
5097 		l3 = pmap_l2_to_l3(l2, tmpva);
5098 		l3e = pmap_load(l3);
5099 
5100 		/* Unchanged. */
5101 		if ((l3e & mask) == bits) {
5102 			tmpva += PAGE_SIZE;
5103 			continue;
5104 		}
5105 
5106 		l3e &= ~mask;
5107 		l3e |= bits;
5108 		pmap_store(l3, l3e);
5109 		anychanged = true;
5110 
5111 		phys = PTE_TO_PHYS(l3e);
5112 		if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5113 			error = pmap_change_attr_locked(PHYS_TO_DMAP(phys),
5114 			    L3_SIZE, mode);
5115 			if (error != 0)
5116 				break;
5117 		}
5118 		tmpva += PAGE_SIZE;
5119 	}
5120 
5121 	if (anychanged) {
5122 		pmap_invalidate_range(kernel_pmap, base, tmpva);
5123 		if (mode == VM_MEMATTR_UNCACHEABLE)
5124 			cpu_dcache_wbinv_range((void *)base, size);
5125 	}
5126 
5127 	return (error);
5128 }
5129 
5130 /*
5131  * Perform the pmap work for mincore(2).  If the page is not both referenced and
5132  * modified by this pmap, returns its physical address so that the caller can
5133  * find other mappings.
5134  */
5135 int
pmap_mincore(pmap_t pmap,vm_offset_t addr,vm_paddr_t * pap)5136 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
5137 {
5138 	pt_entry_t *l2, *l3, tpte;
5139 	vm_paddr_t pa;
5140 	int val;
5141 	bool managed;
5142 
5143 	PMAP_LOCK(pmap);
5144 	l2 = pmap_l2(pmap, addr);
5145 	if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
5146 		if ((tpte & PTE_RWX) != 0) {
5147 			pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
5148 			val = MINCORE_INCORE | MINCORE_PSIND(1);
5149 		} else {
5150 			l3 = pmap_l2_to_l3(l2, addr);
5151 			tpte = pmap_load(l3);
5152 			if ((tpte & PTE_V) == 0) {
5153 				PMAP_UNLOCK(pmap);
5154 				return (0);
5155 			}
5156 			pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
5157 			val = MINCORE_INCORE;
5158 		}
5159 
5160 		if ((tpte & PTE_D) != 0)
5161 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5162 		if ((tpte & PTE_A) != 0)
5163 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5164 		managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
5165 	} else {
5166 		managed = false;
5167 		val = 0;
5168 	}
5169 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5170 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
5171 		*pap = pa;
5172 	}
5173 	PMAP_UNLOCK(pmap);
5174 	return (val);
5175 }
5176 
5177 void
pmap_activate_sw(struct thread * td)5178 pmap_activate_sw(struct thread *td)
5179 {
5180 	pmap_t oldpmap, pmap;
5181 	u_int hart;
5182 
5183 	oldpmap = PCPU_GET(curpmap);
5184 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5185 	if (pmap == oldpmap)
5186 		return;
5187 	csr_write(satp, pmap->pm_satp);
5188 
5189 	hart = PCPU_GET(hart);
5190 #ifdef SMP
5191 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
5192 	CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
5193 #else
5194 	CPU_SET(hart, &pmap->pm_active);
5195 	CPU_CLR(hart, &oldpmap->pm_active);
5196 #endif
5197 	PCPU_SET(curpmap, pmap);
5198 
5199 	sfence_vma();
5200 }
5201 
/*
 * Activate the pmap of the thread's process by running pmap_activate_sw()
 * inside a critical section.
 */
void
pmap_activate(struct thread *td)
{

	critical_enter();
	pmap_activate_sw(td);
	critical_exit();
}
5210 
5211 void
pmap_activate_boot(pmap_t pmap)5212 pmap_activate_boot(pmap_t pmap)
5213 {
5214 	u_int hart;
5215 
5216 	hart = PCPU_GET(hart);
5217 #ifdef SMP
5218 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
5219 #else
5220 	CPU_SET(hart, &pmap->pm_active);
5221 #endif
5222 	PCPU_SET(curpmap, pmap);
5223 }
5224 
5225 void
pmap_active_cpus(pmap_t pmap,cpuset_t * res)5226 pmap_active_cpus(pmap_t pmap, cpuset_t *res)
5227 {
5228 	*res = pmap->pm_active;
5229 }
5230 
5231 void
pmap_sync_icache(pmap_t pmap,vm_offset_t va,vm_size_t sz)5232 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
5233 {
5234 	cpuset_t mask;
5235 
5236 	/*
5237 	 * From the RISC-V User-Level ISA V2.2:
5238 	 *
5239 	 * "To make a store to instruction memory visible to all
5240 	 * RISC-V harts, the writing hart has to execute a data FENCE
5241 	 * before requesting that all remote RISC-V harts execute a
5242 	 * FENCE.I."
5243 	 *
5244 	 * However, this is slightly misleading; we still need to
5245 	 * perform a FENCE.I for the local hart, as FENCE does nothing
5246 	 * for its icache. FENCE.I alone is also sufficient for the
5247 	 * local hart.
5248 	 */
5249 	sched_pin();
5250 	mask = all_harts;
5251 	CPU_CLR(PCPU_GET(hart), &mask);
5252 	fence_i();
5253 	if (!CPU_EMPTY(&mask) && smp_started) {
5254 		fence();
5255 		sbi_remote_fence_i(mask.__bits);
5256 	}
5257 	sched_unpin();
5258 }
5259 
5260 /*
5261  *	Increase the starting virtual address of the given mapping if a
5262  *	different alignment might result in more superpage mappings.
5263  */
5264 void
pmap_align_superpage(vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t size)5265 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5266     vm_offset_t *addr, vm_size_t size)
5267 {
5268 	vm_offset_t superpage_offset;
5269 
5270 	if (size < L2_SIZE)
5271 		return;
5272 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5273 		offset += ptoa(object->pg_color);
5274 	superpage_offset = offset & L2_OFFSET;
5275 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
5276 	    (*addr & L2_OFFSET) == superpage_offset)
5277 		return;
5278 	if ((*addr & L2_OFFSET) < superpage_offset)
5279 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
5280 	else
5281 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
5282 }
5283 
5284 /**
5285  * Get the kernel virtual address of a set of physical pages. If there are
5286  * physical addresses not covered by the DMAP perform a transient mapping
5287  * that will be removed when calling pmap_unmap_io_transient.
5288  *
5289  * \param page        The pages the caller wishes to obtain the virtual
5290  *                    address on the kernel memory map.
5291  * \param vaddr       On return contains the kernel virtual memory address
5292  *                    of the pages passed in the page parameter.
5293  * \param count       Number of pages passed in.
5294  * \param can_fault   true if the thread using the mapped pages can take
5295  *                    page faults, false otherwise.
5296  *
5297  * \returns true if the caller must call pmap_unmap_io_transient when
5298  *          finished or false otherwise.
5299  *
5300  */
5301 bool
pmap_map_io_transient(vm_page_t page[],void * vaddr[],int count,bool can_fault)5302 pmap_map_io_transient(vm_page_t page[], void *vaddr[], int count,
5303     bool can_fault)
5304 {
5305 	vm_paddr_t paddr;
5306 	vmem_addr_t addr;
5307 	bool needs_mapping;
5308 	int error __diagused, i;
5309 
5310 	/*
5311 	 * Allocate any KVA space that we need, this is done in a separate
5312 	 * loop to prevent calling vmem_alloc while pinned.
5313 	 */
5314 	needs_mapping = false;
5315 	for (i = 0; i < count; i++) {
5316 		paddr = VM_PAGE_TO_PHYS(page[i]);
5317 		if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
5318 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
5319 			    M_BESTFIT | M_WAITOK, &addr);
5320 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
5321 			vaddr[i] = (void *)addr;
5322 			needs_mapping = true;
5323 		} else {
5324 			vaddr[i] = PHYS_TO_DMAP(paddr);
5325 		}
5326 	}
5327 
5328 	/* Exit early if everything is covered by the DMAP */
5329 	if (!needs_mapping)
5330 		return (false);
5331 
5332 	if (!can_fault)
5333 		sched_pin();
5334 	for (i = 0; i < count; i++) {
5335 		paddr = VM_PAGE_TO_PHYS(page[i]);
5336 		if (paddr >= DMAP_MAX_PHYSADDR) {
5337 			panic(
5338 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
5339 		}
5340 	}
5341 
5342 	return (needs_mapping);
5343 }
5344 
5345 void
pmap_unmap_io_transient(vm_page_t page[],void * vaddr[],int count,bool can_fault)5346 pmap_unmap_io_transient(vm_page_t page[], void *vaddr[], int count,
5347     bool can_fault)
5348 {
5349 	vm_paddr_t paddr;
5350 	int i;
5351 
5352 	if (!can_fault)
5353 		sched_unpin();
5354 	for (i = 0; i < count; i++) {
5355 		paddr = VM_PAGE_TO_PHYS(page[i]);
5356 		if (paddr >= DMAP_MAX_PHYSADDR) {
5357 			panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
5358 		}
5359 	}
5360 }
5361 
5362 bool
pmap_is_valid_memattr(pmap_t pmap __unused,vm_memattr_t mode)5363 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
5364 {
5365 
5366 	return (mode >= VM_MEMATTR_DEFAULT && mode <= VM_MEMATTR_LAST);
5367 }
5368 
5369 bool
pmap_get_tables(pmap_t pmap,vm_offset_t va,pd_entry_t ** l1,pd_entry_t ** l2,pt_entry_t ** l3)5370 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
5371     pt_entry_t **l3)
5372 {
5373 	pd_entry_t *l1p, *l2p;
5374 
5375 	/* Get l1 directory entry. */
5376 	l1p = pmap_l1(pmap, va);
5377 	*l1 = l1p;
5378 
5379 	if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0)
5380 		return (false);
5381 
5382 	if ((pmap_load(l1p) & PTE_RX) != 0) {
5383 		*l2 = NULL;
5384 		*l3 = NULL;
5385 		return (true);
5386 	}
5387 
5388 	/* Get l2 directory entry. */
5389 	l2p = pmap_l1_to_l2(l1p, va);
5390 	*l2 = l2p;
5391 
5392 	if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0)
5393 		return (false);
5394 
5395 	if ((pmap_load(l2p) & PTE_RX) != 0) {
5396 		*l3 = NULL;
5397 		return (true);
5398 	}
5399 
5400 	/* Get l3 page table entry. */
5401 	*l3 = pmap_l2_to_l3(l2p, va);
5402 
5403 	return (true);
5404 }
5405 
5406 /*
5407  * Track a range of the kernel's virtual address space that is contiguous
5408  * in various mapping attributes.
5409  */
5410 struct pmap_kernel_map_range {
5411 	vm_offset_t sva;
5412 	pt_entry_t attrs;
5413 	int l3pages;
5414 	int l2pages;
5415 	int l1pages;
5416 };
5417 
5418 static void
sysctl_kmaps_dump(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t eva)5419 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
5420     vm_offset_t eva)
5421 {
5422 	char *mode;
5423 	int i;
5424 
5425 	if (eva <= range->sva)
5426 		return;
5427 
5428 	for (i = 0; i < nitems(memattr_bits); i++)
5429 		if ((range->attrs & memattr_mask) == memattr_bits[i])
5430 			break;
5431 
5432 	switch (i) {
5433 	case VM_MEMATTR_PMA:
5434 		mode = "PMA";
5435 		break;
5436 	case VM_MEMATTR_UNCACHEABLE:
5437 		mode = "NC ";
5438 		break;
5439 	case VM_MEMATTR_DEVICE:
5440 		mode = "IO ";
5441 		break;
5442 	default:
5443 		mode = "???";
5444 		break;
5445 	}
5446 
5447 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
5448 	    range->sva, eva,
5449 	    (range->attrs & PTE_W) == PTE_W ? 'w' : '-',
5450 	    (range->attrs & PTE_X) == PTE_X ? 'x' : '-',
5451 	    (range->attrs & PTE_U) == PTE_U ? 'u' : 's',
5452 	    (range->attrs & PTE_G) == PTE_G ? 'g' : '-',
5453 	    mode, range->l1pages, range->l2pages, range->l3pages);
5454 
5455 	/* Reset to sentinel value. */
5456 	range->sva = 0xfffffffffffffffful;
5457 }
5458 
5459 /*
5460  * Determine whether the attributes specified by a page table entry match those
5461  * being tracked by the current range.
5462  */
5463 static bool
sysctl_kmaps_match(struct pmap_kernel_map_range * range,pt_entry_t attrs)5464 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
5465 {
5466 
5467 	return (range->attrs == attrs);
5468 }
5469 
5470 static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range * range,vm_offset_t va,pt_entry_t attrs)5471 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
5472     pt_entry_t attrs)
5473 {
5474 
5475 	memset(range, 0, sizeof(*range));
5476 	range->sva = va;
5477 	range->attrs = attrs;
5478 }
5479 
5480 /*
5481  * Given a leaf PTE, derive the mapping's attributes. If they do not match
5482  * those of the current run, dump the address range and its attributes, and
5483  * begin a new run.
5484  */
5485 static void
sysctl_kmaps_check(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t va,pd_entry_t l1e,pd_entry_t l2e,pt_entry_t l3e)5486 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
5487     vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e)
5488 {
5489 	pt_entry_t attrs;
5490 
5491 	/* The PTE global bit is inherited by lower levels. */
5492 	attrs = l1e & PTE_G;
5493 	if ((l1e & PTE_RWX) != 0) {
5494 		attrs |= l1e & (PTE_RWX | PTE_U);
5495 		attrs |= l1e & memattr_mask;
5496 	} else if (l2e != 0)
5497 		attrs |= l2e & PTE_G;
5498 
5499 	if ((l2e & PTE_RWX) != 0) {
5500 		attrs |= l2e & (PTE_RWX | PTE_U);
5501 		attrs |= l2e & memattr_mask;
5502 	} else if (l3e != 0) {
5503 		attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);
5504 		attrs |= l3e & memattr_mask;
5505 	}
5506 
5507 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
5508 		sysctl_kmaps_dump(sb, range, va);
5509 		sysctl_kmaps_reinit(range, va, attrs);
5510 	}
5511 }
5512 
5513 static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)5514 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
5515 {
5516 	struct pmap_kernel_map_range range;
5517 	struct sbuf sbuf, *sb;
5518 	pd_entry_t *l1, l1e, *l2, l2e;
5519 	pt_entry_t *l3, l3e;
5520 	vm_offset_t sva;
5521 	vm_paddr_t pa;
5522 	int error, i, j, k;
5523 
5524 	error = sysctl_wire_old_buffer(req, 0);
5525 	if (error != 0)
5526 		return (error);
5527 	sb = &sbuf;
5528 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
5529 
5530 	/* Sentinel value. */
5531 	range.sva = 0xfffffffffffffffful;
5532 
5533 	/*
5534 	 * Iterate over the kernel page tables without holding the kernel pmap
5535 	 * lock. Kernel page table pages are never freed, so at worst we will
5536 	 * observe inconsistencies in the output.
5537 	 */
5538 	sva = VM_MIN_KERNEL_ADDRESS;
5539 	for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) {
5540 		if (i == pmap_l1_index(DMAP_MIN_ADDRESS))
5541 			sbuf_printf(sb, "\nDirect map:\n");
5542 		else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS))
5543 			sbuf_printf(sb, "\nKernel map:\n");
5544 
5545 		l1 = pmap_l1(kernel_pmap, sva);
5546 		l1e = pmap_load(l1);
5547 		if ((l1e & PTE_V) == 0) {
5548 			sysctl_kmaps_dump(sb, &range, sva);
5549 			sva += L1_SIZE;
5550 			continue;
5551 		}
5552 		if ((l1e & PTE_RWX) != 0) {
5553 			sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0);
5554 			range.l1pages++;
5555 			sva += L1_SIZE;
5556 			continue;
5557 		}
5558 		pa = PTE_TO_PHYS(l1e);
5559 		l2 = PHYS_TO_DMAP(pa);
5560 
5561 		for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) {
5562 			l2e = l2[j];
5563 			if ((l2e & PTE_V) == 0) {
5564 				sysctl_kmaps_dump(sb, &range, sva);
5565 				sva += L2_SIZE;
5566 				continue;
5567 			}
5568 			if ((l2e & PTE_RWX) != 0) {
5569 				sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0);
5570 				range.l2pages++;
5571 				sva += L2_SIZE;
5572 				continue;
5573 			}
5574 			pa = PTE_TO_PHYS(l2e);
5575 			l3 = PHYS_TO_DMAP(pa);
5576 
5577 			for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++,
5578 			    sva += L3_SIZE) {
5579 				l3e = l3[k];
5580 				if ((l3e & PTE_V) == 0) {
5581 					sysctl_kmaps_dump(sb, &range, sva);
5582 					continue;
5583 				}
5584 				sysctl_kmaps_check(sb, &range, sva,
5585 				    l1e, l2e, l3e);
5586 				range.l3pages++;
5587 			}
5588 		}
5589 	}
5590 
5591 	error = sbuf_finish(sb);
5592 	sbuf_delete(sb);
5593 	return (error);
5594 }
5595 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
5596     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
5597     NULL, 0, sysctl_kmaps, "A",
5598     "Dump kernel address layout");
5599