1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 1991 Regents of the University of California.
5 * All rights reserved.
6 * Copyright (c) 1994 John S. Dyson
7 * All rights reserved.
8 * Copyright (c) 1994 David Greenman
9 * All rights reserved.
10 * Copyright (c) 2003 Peter Wemm
11 * All rights reserved.
12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13 * All rights reserved.
14 * Copyright (c) 2014 Andrew Turner
15 * All rights reserved.
16 * Copyright (c) 2014 The FreeBSD Foundation
17 * All rights reserved.
18 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
19 * All rights reserved.
20 *
21 * This code is derived from software contributed to Berkeley by
22 * the Systems Programming Group of the University of Utah Computer
23 * Science Department and William Jolitz of UUNET Technologies Inc.
24 *
25 * Portions of this software were developed by Andrew Turner under
26 * sponsorship from The FreeBSD Foundation.
27 *
28 * Portions of this software were developed by SRI International and the
29 * University of Cambridge Computer Laboratory under DARPA/AFRL contract
30 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
31 *
32 * Portions of this software were developed by the University of Cambridge
33 * Computer Laboratory as part of the CTSRD Project, with support from the
34 * UK Higher Education Innovation Fund (HEIF).
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 */
64 /*-
65 * Copyright (c) 2003 Networks Associates Technology, Inc.
66 * All rights reserved.
67 *
68 * This software was developed for the FreeBSD Project by Jake Burkholder,
69 * Safeport Network Services, and Network Associates Laboratories, the
70 * Security Research Division of Network Associates, Inc. under
71 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
72 * CHATS research program.
73 *
74 * Redistribution and use in source and binary forms, with or without
75 * modification, are permitted provided that the following conditions
76 * are met:
77 * 1. Redistributions of source code must retain the above copyright
78 * notice, this list of conditions and the following disclaimer.
79 * 2. Redistributions in binary form must reproduce the above copyright
80 * notice, this list of conditions and the following disclaimer in the
81 * documentation and/or other materials provided with the distribution.
82 *
83 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
84 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
85 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
86 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
87 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
88 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
89 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
90 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
91 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
92 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
93 * SUCH DAMAGE.
94 */
95
96 /*
97 * Manages physical address maps.
98 *
99 * Since the information managed by this module is
100 * also stored by the logical address mapping module,
101 * this module may throw away valid virtual-to-physical
102 * mappings at almost any time. However, invalidations
103 * of virtual-to-physical mappings must be done as
104 * requested.
105 *
106 * In order to cope with hardware architectures which
107 * make virtual-to-physical map invalidates expensive,
108 * this module may delay invalidate or reduced protection
109 * operations until such time as they are actually
110 * necessary. This module is given full information as
111 * to which processors are currently using which maps,
112 * and to when physical maps must be made correct.
113 */
114
115 #include "opt_pmap.h"
116
117 #include <sys/param.h>
118 #include <sys/systm.h>
119 #include <sys/bitstring.h>
120 #include <sys/bus.h>
121 #include <sys/cpuset.h>
122 #include <sys/kernel.h>
123 #include <sys/ktr.h>
124 #include <sys/lock.h>
125 #include <sys/malloc.h>
126 #include <sys/mman.h>
127 #include <sys/msgbuf.h>
128 #include <sys/mutex.h>
129 #include <sys/physmem.h>
130 #include <sys/proc.h>
131 #include <sys/rwlock.h>
132 #include <sys/sbuf.h>
133 #include <sys/sx.h>
134 #include <sys/vmem.h>
135 #include <sys/vmmeter.h>
136 #include <sys/sched.h>
137 #include <sys/sysctl.h>
138 #include <sys/smp.h>
139
140 #include <vm/vm.h>
141 #include <vm/vm_param.h>
142 #include <vm/vm_kern.h>
143 #include <vm/vm_page.h>
144 #include <vm/vm_map.h>
145 #include <vm/vm_object.h>
146 #include <vm/vm_extern.h>
147 #include <vm/vm_pageout.h>
148 #include <vm/vm_pager.h>
149 #include <vm/vm_phys.h>
150 #include <vm/vm_radix.h>
151 #include <vm/vm_reserv.h>
152 #include <vm/vm_dumpset.h>
153 #include <vm/uma.h>
154
155 #include <machine/machdep.h>
156 #include <machine/md_var.h>
157 #include <machine/pcb.h>
158 #include <machine/sbi.h>
159 #include <machine/thead.h>
160
161 /*
162 * Boundary values for the page table page index space:
163 *
164 * L3 pages: [0, NUL2E)
165 * L2 pages: [NUL2E, NUL2E + NUL1E)
166 * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
167 *
168 * Note that these ranges are used in both SV39 and SV48 mode. In SV39 mode the
169 * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages
170 * in a set of page tables.
171 */
172 #define NUL0E Ln_ENTRIES
173 #define NUL1E (Ln_ENTRIES * NUL0E)
174 #define NUL2E (Ln_ENTRIES * NUL1E)
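/*
 * For reference, with Ln_ENTRIES == 512 (the 9-bit RISC-V page table fanout)
 * these evaluate to NUL0E = 512, NUL1E = 262144 and NUL2E = 134217728. For
 * example, for a user address va, pmap_l1_pindex(va) = NUL2E + (va >> L1_SHIFT)
 * falls in the L2 page range [NUL2E, NUL2E + NUL1E) described above.
 */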
175
176 #ifdef PV_STATS
177 #define PV_STAT(x) do { x ; } while (0)
178 #define __pv_stat_used
179 #else
180 #define PV_STAT(x) do { } while (0)
181 #define __pv_stat_used __unused
182 #endif
183
184 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT))
185 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
186 #define pa_index(pa) ((pa) >> L2_SHIFT)
187 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
188
189 #define NPV_LIST_LOCKS MAXCPU
190
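/*
 * A physical address is hashed to a PV list lock by its 2MB frame number
 * (pa_index) modulo NPV_LIST_LOCKS, so every 4KB page belonging to the same
 * potential 2MB superpage shares one lock with the region's pv_table entry.
 */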
191 #define PHYS_TO_PV_LIST_LOCK(pa) \
192 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
193
194 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
195 struct rwlock **_lockp = (lockp); \
196 struct rwlock *_new_lock; \
197 \
198 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
199 if (_new_lock != *_lockp) { \
200 if (*_lockp != NULL) \
201 rw_wunlock(*_lockp); \
202 *_lockp = _new_lock; \
203 rw_wlock(*_lockp); \
204 } \
205 } while (0)
206
207 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
208 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
209
210 #define RELEASE_PV_LIST_LOCK(lockp) do { \
211 struct rwlock **_lockp = (lockp); \
212 \
213 if (*_lockp != NULL) { \
214 rw_wunlock(*_lockp); \
215 *_lockp = NULL; \
216 } \
217 } while (0)
218
219 #define VM_PAGE_TO_PV_LIST_LOCK(m) \
220 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
221
222 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
223 "VM/pmap parameters");
224
225 /* The list of all the user pmaps */
226 LIST_HEAD(pmaplist, pmap);
227 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(allpmaps);
228
229 enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39;
230 SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
231 &pmap_mode, 0,
232 "translation mode, 0 = SV39, 1 = SV48");
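/*
 * The vm.pmap.mode tunable is fetched by hand in pmap_create_pagetables(),
 * before SYSINITs run; CTLFLAG_NOFETCH prevents the sysctl layer from
 * re-applying the raw tunable and overriding the mode actually selected
 * once hardware support (MMU_SV48) has been checked.
 */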
233
234 struct pmap kernel_pmap_store;
235
236 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
237 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
238 vm_offset_t kernel_vm_end = 0;
239
240 vm_paddr_t dmap_phys_base; /* The start of the dmap region */
241 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */
242 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */
243
244 static int pmap_growkernel_panic = 0;
245 SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN,
246 &pmap_growkernel_panic, 0,
247 "panic on failure to allocate kernel page table page");
248
249 /* This code assumes all L1 DMAP entries will be used */
250 CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
251 CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);
252
253 /*
254 * This code assumes that the early DEVMAP is L2_SIZE aligned.
255 */
256 CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0);
257
258 static struct rwlock_padalign pvh_global_lock;
259 static struct mtx_padalign allpmaps_lock;
260
261 static int __read_frequently superpages_enabled = 1;
262 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
263 CTLFLAG_RDTUN, &superpages_enabled, 0,
264 "Enable support for transparent superpages");
265
266 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
267 "2MB page mapping counters");
268
269 static u_long pmap_l2_demotions;
270 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
271 &pmap_l2_demotions, 0,
272 "2MB page demotions");
273
274 static u_long pmap_l2_mappings;
275 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
276 &pmap_l2_mappings, 0,
277 "2MB page mappings");
278
279 static u_long pmap_l2_p_failures;
280 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
281 &pmap_l2_p_failures, 0,
282 "2MB page promotion failures");
283
284 static u_long pmap_l2_promotions;
285 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
286 &pmap_l2_promotions, 0,
287 "2MB page promotions");
288
289 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
290 "L1 (1GB) page mapping counters");
291
292 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
293 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
294 &pmap_l1_demotions, "L1 (1GB) page demotions");
295
296 /*
297 * Data for the pv entry allocation mechanism
298 */
299 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
300 static struct mtx pv_chunks_mutex;
301 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
302 static struct md_page *pv_table;
303 static struct md_page pv_dummy;
304
305 extern cpuset_t all_harts;
306
307 /*
308 * Internal flags for pmap_enter()'s helper functions.
309 */
310 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
311 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
312
313 static void free_pv_chunk(struct pv_chunk *pc);
314 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
315 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
316 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
317 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
318 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
319 vm_offset_t va);
320 static bool pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va);
321 static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
322 static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
323 vm_offset_t va, struct rwlock **lockp);
324 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
325 u_int flags, vm_page_t m, struct rwlock **lockp);
326 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
327 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
328 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
329 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
330 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
331 vm_page_t m, struct rwlock **lockp);
332
333 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
334 struct rwlock **lockp);
335
336 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
337 struct spglist *free);
338 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
339
340 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
341
342 static uint64_t pmap_satp_mode(void);
343
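/*
 * PTE accessors. All page table entries are read and written through these
 * 64-bit atomics; pmap_load_store() and pmap_load_clear() additionally return
 * the previous entry, which callers use when tearing down old mappings.
 */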
344 #define pmap_clear(pte) pmap_store(pte, 0)
345 #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits)
346 #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry)
347 #define pmap_load_clear(pte) pmap_load_store(pte, 0)
348 #define pmap_load(pte) atomic_load_64(pte)
349 #define pmap_store(pte, entry) atomic_store_64(pte, entry)
350 #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits)
351
352 /********************/
353 /* Inline functions */
354 /********************/
355
356 static __inline void
357 pagecopy(void *s, void *d)
358 {
359
360 memcpy(d, s, PAGE_SIZE);
361 }
362
363 static __inline void
364 pagezero(void *p)
365 {
366
367 bzero(p, PAGE_SIZE);
368 }
369
370 #define pmap_l0_index(va) (((va) >> L0_SHIFT) & Ln_ADDR_MASK)
371 #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK)
372 #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK)
373 #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK)
374
375 #define PTE_TO_PHYS(pte) \
376 ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
377 #define L2PTE_TO_PHYS(l2) \
378 ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
379 #define L1PTE_TO_PHYS(l1) \
380 ((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT)
381 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
382
383 /*
384 * Construct a page table entry of the specified level pointing to physical
385 * address pa, with PTE bits 'bits'.
386 *
387 * A leaf PTE of any level must point to an address matching its alignment,
388 * e.g. L2 pages must be 2MB aligned in memory.
389 */
390 #define L1_PTE(pa, bits) ((((pa) >> L1_SHIFT) << PTE_PPN2_S) | (bits))
391 #define L2_PTE(pa, bits) ((((pa) >> L2_SHIFT) << PTE_PPN1_S) | (bits))
392 #define L3_PTE(pa, bits) ((((pa) >> L3_SHIFT) << PTE_PPN0_S) | (bits))
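/*
 * Illustrative example, assuming the usual constants (PAGE_SHIFT == 12,
 * L2_SHIFT == 21, PTE_PPN1_S == 19): L2_PTE(0x80000000, PTE_KERN) computes
 * (0x80000000 >> 21) << 19 == 0x20000000, i.e. PPN 0x80000 placed at bit 10,
 * OR'ed with PTE_KERN. The physical address must be 2MB aligned.
 */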
393
394 /*
395 * Construct a page directory entry (PDE), pointing to next level entry at pa,
396 * with PTE bits 'bits'.
397 *
398 * Unlike PTEs, page directory entries can point to any 4K-aligned physical
399 * address.
400 */
401 #define L0_PDE(pa, bits) L3_PTE(pa, bits)
402 #define L1_PDE(pa, bits) L3_PTE(pa, bits)
403 #define L2_PDE(pa, bits) L3_PTE(pa, bits)
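/*
 * All three expand to L3_PTE() because a page directory entry always points
 * at a 4KB page table page, so its PPN is expressed at 4KB granularity
 * regardless of the directory's level.
 */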
404
405 static __inline pd_entry_t *
406 pmap_l0(pmap_t pmap, vm_offset_t va)
407 {
408 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
409 KASSERT(VIRT_IS_VALID(va),
410 ("%s: malformed virtual address %#lx", __func__, va));
411 return (&pmap->pm_top[pmap_l0_index(va)]);
412 }
413
414 static __inline pd_entry_t *
415 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
416 {
417 vm_paddr_t phys;
418 pd_entry_t *l1;
419
420 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
421 phys = PTE_TO_PHYS(pmap_load(l0));
422 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
423
424 return (&l1[pmap_l1_index(va)]);
425 }
426
427 static __inline pd_entry_t *
428 pmap_l1(pmap_t pmap, vm_offset_t va)
429 {
430 pd_entry_t *l0;
431
432 KASSERT(VIRT_IS_VALID(va),
433 ("%s: malformed virtual address %#lx", __func__, va));
434 if (pmap_mode == PMAP_MODE_SV39) {
435 return (&pmap->pm_top[pmap_l1_index(va)]);
436 } else {
437 l0 = pmap_l0(pmap, va);
438 if ((pmap_load(l0) & PTE_V) == 0)
439 return (NULL);
440 if ((pmap_load(l0) & PTE_RX) != 0)
441 return (NULL);
442 return (pmap_l0_to_l1(l0, va));
443 }
444 }
445
446 static __inline pd_entry_t *
447 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
448 {
449 vm_paddr_t phys;
450 pd_entry_t *l2;
451
452 phys = PTE_TO_PHYS(pmap_load(l1));
453 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
454
455 return (&l2[pmap_l2_index(va)]);
456 }
457
458 static __inline pd_entry_t *
459 pmap_l2(pmap_t pmap, vm_offset_t va)
460 {
461 pd_entry_t *l1;
462
463 l1 = pmap_l1(pmap, va);
464 if (l1 == NULL)
465 return (NULL);
466 if ((pmap_load(l1) & PTE_V) == 0)
467 return (NULL);
468 if ((pmap_load(l1) & PTE_RX) != 0)
469 return (NULL);
470
471 return (pmap_l1_to_l2(l1, va));
472 }
473
474 static __inline pt_entry_t *
475 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
476 {
477 vm_paddr_t phys;
478 pt_entry_t *l3;
479
480 phys = PTE_TO_PHYS(pmap_load(l2));
481 l3 = (pt_entry_t *)PHYS_TO_DMAP(phys);
482
483 return (&l3[pmap_l3_index(va)]);
484 }
485
486 static __inline pt_entry_t *
487 pmap_l3(pmap_t pmap, vm_offset_t va)
488 {
489 pd_entry_t *l2;
490
491 l2 = pmap_l2(pmap, va);
492 if (l2 == NULL)
493 return (NULL);
494 if ((pmap_load(l2) & PTE_V) == 0)
495 return (NULL);
496 if ((pmap_load(l2) & PTE_RX) != 0)
497 return (NULL);
498
499 return (pmap_l2_to_l3(l2, va));
500 }
501
502 static __inline void
503 pmap_resident_count_inc(pmap_t pmap, int count)
504 {
505
506 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
507 pmap->pm_stats.resident_count += count;
508 }
509
510 static __inline void
511 pmap_resident_count_dec(pmap_t pmap, int count)
512 {
513
514 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
515 KASSERT(pmap->pm_stats.resident_count >= count,
516 ("pmap %p resident count underflow %ld %d", pmap,
517 pmap->pm_stats.resident_count, count));
518 pmap->pm_stats.resident_count -= count;
519 }
520
521 static void
522 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
523 pt_entry_t entry)
524 {
525 struct pmap *user_pmap;
526 pd_entry_t *l1;
527
528 /*
529 * Distribute new kernel L1 entry to all the user pmaps. This is only
530 * necessary with three-level paging configured: with four-level paging
531 * the kernel's half of the top-level page table page is static and can
532 * simply be copied at pmap initialization time.
533 */
534 if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39)
535 return;
536
537 mtx_lock(&allpmaps_lock);
538 LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
539 l1 = &user_pmap->pm_top[l1index];
540 pmap_store(l1, entry);
541 }
542 mtx_unlock(&allpmaps_lock);
543 }
544
545 /*
546 * Holds the PTE mode bits (defined in pte.h) for defining e.g. cacheability.
547 *
548 * The indices correspond to the VM_MEMATTR_* defines in riscv/include/vm.h.
549 *
550 * The array will be empty if no mode bits are supported by the CPU, e.g. when
551 * lacking the Svpbmt extension.
552 */
553 static __read_frequently pt_entry_t memattr_bits[VM_MEMATTR_TOTAL];
554 static __read_frequently pt_entry_t memattr_mask;
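/*
 * If the CPU has neither Svpbmt nor the T-Head errata, these remain zero, so
 * pmap_memattr_bits() below returns 0 for every mode and all mappings simply
 * use the platform's fixed physical memory attributes.
 */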
555
556 static __inline pt_entry_t
557 pmap_memattr_bits(vm_memattr_t mode)
558 {
559 KASSERT(pmap_is_valid_memattr(kernel_pmap, mode),
560 ("invalid memory mode %u\n", mode));
561 return (memattr_bits[(int)mode]);
562 }
563
564 /*
565  * This should only be used during pmap bootstrap, e.g. by
566  * pmap_create_pagetables(), while *freemempos is still directly addressable.
567 */
568 static pt_entry_t *
569 pmap_early_alloc_tables(vm_paddr_t *freemempos, int npages)
570 {
571 pt_entry_t *pt;
572
573 pt = (pt_entry_t *)*freemempos;
574 *freemempos += npages * PAGE_SIZE;
575 bzero(pt, npages * PAGE_SIZE);
576
577 return (pt);
578 }
579
580 /*
581 * Construct the direct map -- a linear mapping of physical memory into
582 * the kernel address space.
583 *
584 * We walk the list of physical memory segments (of arbitrary size and
585 * address) mapping each appropriately using L2 and L1 superpages.
586 * Consequently, the DMAP address space will have unmapped regions
587 * corresponding to any holes between physical memory segments.
588 *
589 * The lowest usable physical address will always be mapped to
590 * DMAP_MIN_ADDRESS.
591 */
592 static vm_paddr_t
593 pmap_bootstrap_dmap(pd_entry_t *l1, vm_paddr_t freemempos)
594 {
595 vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
596 vm_offset_t va;
597 vm_paddr_t min_pa, max_pa, pa, endpa;
598 pd_entry_t *l2;
599 pt_entry_t memattr;
600 u_int l1slot, l2slot;
601 int physmap_idx;
602
603 physmap_idx = physmem_avail(physmap, nitems(physmap));
604 min_pa = physmap[0];
605 max_pa = physmap[physmap_idx - 1];
606
607 printf("physmap_idx %u\n", physmap_idx);
608 printf("min_pa %lx\n", min_pa);
609 printf("max_pa %lx\n", max_pa);
610
611 /* Set the limits of the DMAP region. */
612 dmap_phys_base = rounddown(min_pa, L1_SIZE);
613 dmap_phys_max = max_pa;
614
615 memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
616
617 /* Walk the physmap table. */
618 l2 = NULL;
619 l1slot = Ln_ENTRIES; /* sentinel value */
620 for (int idx = 0; idx < physmap_idx; idx += 2) {
621 pa = rounddown(physmap[idx], L2_SIZE);
622 endpa = physmap[idx + 1];
623
624 /* Virtual address for this range. */
625 va = PHYS_TO_DMAP(pa);
626
627 /* Any 1GB possible for this range? */
628 if (roundup(pa, L1_SIZE) + L1_SIZE > endpa)
629 goto l2end;
630
631 /* Loop until the next 1GB boundary. */
632 while ((pa & L1_OFFSET) != 0) {
633 if (l2 == NULL || pmap_l1_index(va) != l1slot) {
634 /* Need to alloc another page table. */
635 l2 = pmap_early_alloc_tables(&freemempos, 1);
636
637 /* Link it. */
638 l1slot = pmap_l1_index(va);
639 pmap_store(&l1[l1slot],
640 L1_PDE((vm_paddr_t)l2, PTE_V));
641 }
642
643 /* map l2 pages */
644 l2slot = pmap_l2_index(va);
645 pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
646
647 pa += L2_SIZE;
648 va += L2_SIZE;
649 }
650
651 /* Map what we can with 1GB superpages. */
652 while (pa + L1_SIZE - 1 < endpa) {
653 /* map l1 pages */
654 l1slot = pmap_l1_index(va);
655 pmap_store(&l1[l1slot], L1_PTE(pa, PTE_KERN | memattr));
656
657 pa += L1_SIZE;
658 va += L1_SIZE;
659 }
660
661 l2end:
662 while (pa < endpa) {
663 if (l2 == NULL || pmap_l1_index(va) != l1slot) {
664 /* Need to alloc another page table. */
665 l2 = pmap_early_alloc_tables(&freemempos, 1);
666
667 /* Link it. */
668 l1slot = pmap_l1_index(va);
669 pmap_store(&l1[l1slot],
670 L1_PDE((vm_paddr_t)l2, PTE_V));
671 }
672
673 /* map l2 pages */
674 l2slot = pmap_l2_index(va);
675 pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
676
677 pa += L2_SIZE;
678 va += L2_SIZE;
679 }
680 }
681
682 /* And finally, the limit on DMAP VA. */
683 dmap_max_addr = va;
684
685 return (freemempos);
686 }
687
688 /*
689 * Create a new set of pagetables to run the kernel with.
690 *
691 * An initial, temporary setup was created in locore.S, which serves well
692 * enough to get us this far. It mapped kernstart -> KERNBASE, using 2MB
693 * superpages, and created a 1GB identity map, which allows this function
694 * to dereference physical addresses.
695 *
696 * The memory backing these page tables is allocated in the space
697 * immediately following the kernel's preload area. Depending on the size
698 * of this area, some, all, or none of these pages can be implicitly
699 * mapped by the kernel's 2MB mappings. This memory will only ever be
700 * accessed through the direct map, however.
701 */
702 static vm_paddr_t
703 pmap_create_pagetables(vm_paddr_t kernstart, vm_size_t kernlen,
704 vm_paddr_t *root_pt_phys)
705 {
706 pt_entry_t *l0, *l1, *kern_l2, *kern_l3, *devmap_l3;
707 pt_entry_t memattr;
708 pd_entry_t *devmap_l2;
709 vm_paddr_t kernend, freemempos, pa;
710 int nkernl2, nkernl3, ndevmapl3;
711 int i, slot;
712 int mode;
713
714 kernend = kernstart + kernlen;
715
716 /* Static allocations begin after the kernel staging area. */
717 freemempos = roundup2(kernend, PAGE_SIZE);
718
719 /* Detect Sv48 mode. */
720 mode = PMAP_MODE_SV39;
721 TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
722
723 if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) {
724 /*
725 * Sv48 mode: allocate an L0 page table to be the root. The
726 * layout of KVA is otherwise identical to Sv39.
727 */
728 l0 = pmap_early_alloc_tables(&freemempos, 1);
729 *root_pt_phys = (vm_paddr_t)l0;
730 pmap_mode = PMAP_MODE_SV48;
731 } else {
732 l0 = NULL;
733 }
734
735 /*
736 * Allocate an L1 page table.
737 */
738 l1 = pmap_early_alloc_tables(&freemempos, 1);
739 if (pmap_mode == PMAP_MODE_SV39)
740 *root_pt_phys = (vm_paddr_t)l1;
741
742 /*
743 * Allocate a set of L2 page tables for KVA. Most likely, only 1 is
744  * needed, since a single 512-entry L2 page maps a full 1GB of KVA.
745 */
746 nkernl2 = howmany(howmany(kernlen, L2_SIZE), Ln_ENTRIES);
747 kern_l2 = pmap_early_alloc_tables(&freemempos, nkernl2);
748
749 /*
750 * Allocate an L2 page table for the static devmap, located at the end
751 * of KVA. We can expect that the devmap will always be less than 1GB
752 * in size.
753 */
754 devmap_l2 = pmap_early_alloc_tables(&freemempos, 1);
755
756 /* Allocate L3 page tables for the devmap. */
757 ndevmapl3 = howmany(howmany(PMAP_MAPDEV_EARLY_SIZE, L3_SIZE),
758 Ln_ENTRIES);
759 devmap_l3 = pmap_early_alloc_tables(&freemempos, ndevmapl3);
760
761 /*
762 * Allocate some L3 bootstrap pages, for early KVA allocations before
763 * vm_mem_init() has run. For example, the message buffer.
764 *
765 * A somewhat arbitrary choice of 32MB. This should be more than enough
766 * for any early allocations. There is no need to worry about waste, as
767 * whatever is not used will be consumed by later calls to
768 * pmap_growkernel().
769 */
770 nkernl3 = 16;
771 kern_l3 = pmap_early_alloc_tables(&freemempos, nkernl3);
772
773 /* Bootstrap the direct map. */
774 freemempos = pmap_bootstrap_dmap(l1, freemempos);
775
776 /* Allocations are done. */
777 if (freemempos < roundup2(kernend, L2_SIZE))
778 freemempos = roundup2(kernend, L2_SIZE);
779
780 /* Memory attributes for standard/main memory. */
781 memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
782
783 /*
784 * Map the kernel (and preloaded modules or data) using L2 superpages.
785 *
786 * kernstart is 2MB-aligned. This is enforced by loader(8) and required
787 * by locore assembly.
788 *
789 * TODO: eventually, this should be done with proper permissions for
790 * each segment, rather than mapping the entire kernel and preloaded
791 * modules RWX.
792 */
793 slot = pmap_l2_index(KERNBASE);
794 for (pa = kernstart; pa < kernend; pa += L2_SIZE, slot++) {
795 pmap_store(&kern_l2[slot],
796 L2_PTE(pa, PTE_KERN | PTE_X | memattr));
797 }
798
799 /*
800 * Connect the L3 bootstrap pages to the kernel L2 table. The L3 PTEs
801 * themselves are invalid.
802 */
803 slot = pmap_l2_index(freemempos - kernstart + KERNBASE);
804 for (i = 0; i < nkernl3; i++, slot++) {
805 pa = (vm_paddr_t)kern_l3 + ptoa(i);
806 pmap_store(&kern_l2[slot], L2_PDE(pa, PTE_V));
807 }
808
809 /* Connect the L2 tables to the L1 table. */
810 slot = pmap_l1_index(KERNBASE);
811 for (i = 0; i < nkernl2; i++, slot++) {
812 pa = (vm_paddr_t)kern_l2 + ptoa(i);
813 pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
814 }
815
816 /* Connect the L1 table to L0, if in use. */
817 if (pmap_mode == PMAP_MODE_SV48) {
818 slot = pmap_l0_index(KERNBASE);
819 pmap_store(&l0[slot], L0_PDE((vm_paddr_t)l1, PTE_V));
820 }
821
822 /*
823 * Connect the devmap L3 pages to the L2 table. The devmap PTEs
824 * themselves are invalid.
825 */
826 slot = pmap_l2_index(DEVMAP_MIN_VADDR);
827 for (i = 0; i < ndevmapl3; i++, slot++) {
828 pa = (vm_paddr_t)devmap_l3 + ptoa(i);
829 pmap_store(&devmap_l2[slot], L2_PDE(pa, PTE_V));
830 }
831
832 /* Connect the devmap L2 pages to the L1 table. */
833 slot = pmap_l1_index(DEVMAP_MIN_VADDR);
834 pa = (vm_paddr_t)devmap_l2;
835 pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
836
837 /* Return the next position of free memory */
838 return (freemempos);
839 }
840
841 /*
842 * Bootstrap the system enough to run with virtual memory.
843 */
844 void
845 pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen)
846 {
847 vm_paddr_t freemempos, pa;
848 vm_paddr_t root_pt_phys;
849 vm_offset_t freeva;
850 vm_offset_t dpcpu, msgbufpv;
851 pt_entry_t *pte;
852 int i;
853
854 printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen);
855
856 PMAP_LOCK_INIT(kernel_pmap);
857 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
858 vm_radix_init(&kernel_pmap->pm_root);
859
860 rw_init(&pvh_global_lock, "pmap pv global");
861
862 /*
863 * Set the current CPU as active in the kernel pmap. Secondary cores
864 * will add themselves later in init_secondary(). The SBI firmware
865 * may rely on this mask being precise, so CPU_FILL() is not used.
866 */
867 CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);
868
869 /*
870 * Set up the memory attribute bits.
871 */
872 if (has_svpbmt) {
873 memattr_bits[VM_MEMATTR_PMA] = PTE_MA_NONE;
874 memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_MA_NC;
875 memattr_bits[VM_MEMATTR_DEVICE] = PTE_MA_IO;
876 memattr_mask = PTE_MA_MASK;
877 } else if (has_errata_thead_pbmt) {
878 memattr_bits[VM_MEMATTR_PMA] = PTE_THEAD_MA_NONE;
879 memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_THEAD_MA_NC;
880 memattr_bits[VM_MEMATTR_DEVICE] = PTE_THEAD_MA_IO;
881 memattr_mask = PTE_THEAD_MA_MASK;
882 }
883
884 /* Create a new set of pagetables to run the kernel in. */
885 freemempos = pmap_create_pagetables(kernstart, kernlen, &root_pt_phys);
886
887 /* Switch to the newly created page tables. */
888 kernel_pmap->pm_stage = PM_STAGE1;
889 kernel_pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(root_pt_phys);
890 kernel_pmap->pm_satp = atop(root_pt_phys) | pmap_satp_mode();
891 csr_write(satp, kernel_pmap->pm_satp);
892 sfence_vma();
893
894 /*
895 * Now, we need to make a few more static reservations from KVA.
896 *
897 * Set freeva to freemempos virtual address, and be sure to advance
898 * them together.
899 */
900 freeva = freemempos - kernstart + KERNBASE;
901 #define reserve_space(var, pa, size) \
902 do { \
903 var = freeva; \
904 pa = freemempos; \
905 freeva += size; \
906 freemempos += size; \
907 } while (0)
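/*
 * freeva and freemempos advance in lockstep, preserving the constant offset
 * (KERNBASE - kernstart) between them; each reserved VA is then explicitly
 * mapped below to the physical pages handed out at the same step.
 */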
908
909 /* Allocate the dynamic per-cpu area. */
910 reserve_space(dpcpu, pa, DPCPU_SIZE);
911
912 /* Map it. */
913 pte = pmap_l3(kernel_pmap, dpcpu);
914 KASSERT(pte != NULL, ("Bootstrap pages missing"));
915 for (i = 0; i < howmany(DPCPU_SIZE, PAGE_SIZE); i++)
916 pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
917 pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
918
919 /* Now, it can be initialized. */
920 dpcpu_init((void *)dpcpu, 0);
921
922 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
923 reserve_space(msgbufpv, pa, round_page(msgbufsize));
924 msgbufp = (void *)msgbufpv;
925
926 /* Map it. */
927 pte = pmap_l3(kernel_pmap, msgbufpv);
928 KASSERT(pte != NULL, ("Bootstrap pages missing"));
929 for (i = 0; i < howmany(msgbufsize, PAGE_SIZE); i++)
930 pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
931 pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
932
933 #undef reserve_space
934
935 /* Mark the bounds of our available virtual address space */
936 virtual_avail = kernel_vm_end = freeva;
937 virtual_end = DEVMAP_MIN_VADDR;
938
939 /* Exclude the reserved physical memory from allocations. */
940 physmem_exclude_region(kernstart, freemempos - kernstart,
941 EXFLAG_NOALLOC);
942 }
943
944 /*
945 * Initialize a vm_page's machine-dependent fields.
946 */
947 void
948 pmap_page_init(vm_page_t m)
949 {
950
951 TAILQ_INIT(&m->md.pv_list);
952 m->md.pv_memattr = VM_MEMATTR_DEFAULT;
953 }
954
955 /*
956 * Initialize the pmap module.
957 *
958 * Called by vm_mem_init(), to initialize any structures that the pmap
959 * system needs to map virtual memory.
960 */
961 void
962 pmap_init(void)
963 {
964 vm_size_t s;
965 int i, pv_npg;
966
967 /*
968 * Initialize the pv chunk and pmap list mutexes.
969 */
970 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
971 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);
972
973 /*
974 * Initialize the pool of pv list locks.
975 */
976 for (i = 0; i < NPV_LIST_LOCKS; i++)
977 rw_init(&pv_list_locks[i], "pmap pv list");
978
979 /*
980 * Calculate the size of the pv head table for superpages.
981 */
982 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
983
984 /*
985 * Allocate memory for the pv head table for superpages.
986 */
987 s = (vm_size_t)(pv_npg * sizeof(struct md_page));
988 s = round_page(s);
989 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
990 for (i = 0; i < pv_npg; i++)
991 TAILQ_INIT(&pv_table[i].pv_list);
992 TAILQ_INIT(&pv_dummy.pv_list);
993
994 if (superpages_enabled)
995 pagesizes[1] = L2_SIZE;
996 }
997
998 #ifdef SMP
999 /*
1000 * For SMP, these functions have to use IPIs for coherence.
1001 *
1002 * In general, the calling thread uses a plain fence to order the
1003 * writes to the page tables before invoking an SBI callback to invoke
1004 * sfence_vma() on remote CPUs.
1005 */
1006 static void
1007 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1008 {
1009 cpuset_t mask;
1010
1011 sched_pin();
1012 mask = pmap->pm_active;
1013 CPU_CLR(PCPU_GET(hart), &mask);
1014 fence();
1015 if (!CPU_EMPTY(&mask) && smp_started)
1016 sbi_remote_sfence_vma(mask.__bits, va, 1);
1017 sfence_vma_page(va);
1018 sched_unpin();
1019 }
1020
1021 static void
1022 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1023 {
1024 cpuset_t mask;
1025
1026 sched_pin();
1027 mask = pmap->pm_active;
1028 CPU_CLR(PCPU_GET(hart), &mask);
1029 fence();
1030 if (!CPU_EMPTY(&mask) && smp_started)
1031 sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);
1032
1033 /*
1034 * Might consider a loop of sfence_vma_page() for a small
1035 * number of pages in the future.
1036 */
1037 sfence_vma();
1038 sched_unpin();
1039 }
1040
1041 static void
1042 pmap_invalidate_all(pmap_t pmap)
1043 {
1044 cpuset_t mask;
1045
1046 sched_pin();
1047 mask = pmap->pm_active;
1048 CPU_CLR(PCPU_GET(hart), &mask);
1049
1050 /*
1051 * XXX: The SBI doc doesn't detail how to specify x0 as the
1052 * address to perform a global fence. BBL currently treats
1053 * all sfence_vma requests as global however.
1054 */
1055 fence();
1056 if (!CPU_EMPTY(&mask) && smp_started)
1057 sbi_remote_sfence_vma(mask.__bits, 0, 0);
1058 sfence_vma();
1059 sched_unpin();
1060 }
1061 #else
1062 /*
1063 * Normal, non-SMP, invalidation functions.
1064 * We inline these within pmap.c for speed.
1065 */
1066 static __inline void
1067 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1068 {
1069
1070 sfence_vma_page(va);
1071 }
1072
1073 static __inline void
1074 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1075 {
1076
1077 /*
1078 * Might consider a loop of sfence_vma_page() for a small
1079 * number of pages in the future.
1080 */
1081 sfence_vma();
1082 }
1083
1084 static __inline void
1085 pmap_invalidate_all(pmap_t pmap)
1086 {
1087
1088 sfence_vma();
1089 }
1090 #endif
1091
1092 /*
1093 * Routine: pmap_extract
1094 * Function:
1095 * Extract the physical page address associated
1096 * with the given map/virtual_address pair.
1097 */
1098 vm_paddr_t
1099 pmap_extract(pmap_t pmap, vm_offset_t va)
1100 {
1101 pd_entry_t *l2p, l2;
1102 pt_entry_t *l3p;
1103 vm_paddr_t pa;
1104
1105 pa = 0;
1106
1107 /*
1108 * Start with an L2 lookup; L1 superpages are currently not implemented.
1109 */
1110 PMAP_LOCK(pmap);
1111 l2p = pmap_l2(pmap, va);
1112 if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) {
1113 if ((l2 & PTE_RWX) == 0) {
1114 l3p = pmap_l2_to_l3(l2p, va);
1115 pa = PTE_TO_PHYS(pmap_load(l3p));
1116 pa |= (va & L3_OFFSET);
1117 } else {
1118 /* L2 is a superpage mapping. */
1119 pa = L2PTE_TO_PHYS(l2);
1120 pa |= (va & L2_OFFSET);
1121 }
1122 }
1123 PMAP_UNLOCK(pmap);
1124 return (pa);
1125 }
1126
1127 /*
1128 * Routine: pmap_extract_and_hold
1129 * Function:
1130 * Atomically extract and hold the physical page
1131 * with the given pmap and virtual address pair
1132 * if that mapping permits the given protection.
1133 */
1134 vm_page_t
1135 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1136 {
1137 pt_entry_t *l3p, l3;
1138 vm_page_t m;
1139
1140 m = NULL;
1141 PMAP_LOCK(pmap);
1142 l3p = pmap_l3(pmap, va);
1143 if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
1144 if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
1145 m = PTE_TO_VM_PAGE(l3);
1146 if (!vm_page_wire_mapped(m))
1147 m = NULL;
1148 }
1149 }
1150 PMAP_UNLOCK(pmap);
1151 return (m);
1152 }
1153
1154 /*
1155 * Routine: pmap_kextract
1156 * Function:
1157 * Extract the physical page address associated with the given kernel
1158 * virtual address.
1159 */
1160 vm_paddr_t
1161 pmap_kextract(vm_offset_t va)
1162 {
1163 pd_entry_t *l2, l2e;
1164 pt_entry_t *l3;
1165 vm_paddr_t pa;
1166
1167 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1168 pa = DMAP_TO_PHYS(va);
1169 } else {
1170 l2 = pmap_l2(kernel_pmap, va);
1171 if (l2 == NULL)
1172 panic("pmap_kextract: No l2");
1173 l2e = pmap_load(l2);
1174 /*
1175 * Beware of concurrent promotion and demotion! We must
1176 * use l2e rather than loading from l2 multiple times to
1177 * ensure we see a consistent state, including the
1178 * implicit load in pmap_l2_to_l3. It is, however, safe
1179 * to use an old l2e because the L3 page is preserved by
1180 * promotion.
1181 */
1182 if ((l2e & PTE_RX) != 0) {
1183 /* superpages */
1184 pa = L2PTE_TO_PHYS(l2e);
1185 pa |= (va & L2_OFFSET);
1186 return (pa);
1187 }
1188
1189 l3 = pmap_l2_to_l3(&l2e, va);
1190 pa = PTE_TO_PHYS(pmap_load(l3));
1191 pa |= (va & PAGE_MASK);
1192 }
1193 return (pa);
1194 }
1195
1196 /***************************************************
1197 * Low level mapping routines.....
1198 ***************************************************/
1199
1200 void
1201 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1202 {
1203 pt_entry_t entry;
1204 pt_entry_t *l3;
1205 pt_entry_t memattr;
1206 vm_offset_t va;
1207 pn_t pn;
1208
1209 KASSERT((pa & L3_OFFSET) == 0,
1210 ("pmap_kenter: Invalid physical address"));
1211 KASSERT((sva & L3_OFFSET) == 0,
1212 ("pmap_kenter: Invalid virtual address"));
1213 KASSERT((size & PAGE_MASK) == 0,
1214 ("pmap_kenter: Mapping is not page-sized"));
1215
1216 memattr = pmap_memattr_bits(mode);
1217 va = sva;
1218 while (size != 0) {
1219 l3 = pmap_l3(kernel_pmap, va);
1220 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1221
1222 pn = (pa / PAGE_SIZE);
1223 entry = PTE_KERN;
1224 entry |= memattr;
1225 entry |= (pn << PTE_PPN0_S);
1226 pmap_store(l3, entry);
1227
1228 va += PAGE_SIZE;
1229 pa += PAGE_SIZE;
1230 size -= PAGE_SIZE;
1231 }
1232 pmap_invalidate_range(kernel_pmap, sva, va);
1233 }
1234
1235 void
1236 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1237 {
1238 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
1239 }
1240
1241 /*
1242 * Remove a page from the kernel pagetables.
1243 * Note: not SMP coherent.
1244 */
1245 void
1246 pmap_kremove(vm_offset_t va)
1247 {
1248 pt_entry_t *l3;
1249
1250 l3 = pmap_l3(kernel_pmap, va);
1251 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
1252
1253 pmap_clear(l3);
1254 sfence_vma();
1255 }
1256
1257 void
1258 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1259 {
1260 pt_entry_t *l3;
1261 vm_offset_t va;
1262
1263 KASSERT((sva & L3_OFFSET) == 0,
1264 ("pmap_kremove_device: Invalid virtual address"));
1265 KASSERT((size & PAGE_MASK) == 0,
1266 ("pmap_kremove_device: Mapping is not page-sized"));
1267
1268 va = sva;
1269 while (size != 0) {
1270 l3 = pmap_l3(kernel_pmap, va);
1271 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1272 pmap_clear(l3);
1273
1274 va += PAGE_SIZE;
1275 size -= PAGE_SIZE;
1276 }
1277
1278 pmap_invalidate_range(kernel_pmap, sva, va);
1279 }
1280
1281 /*
1282 * Used to map a range of physical addresses into kernel
1283 * virtual address space.
1284 *
1285 * The value passed in '*virt' is a suggested virtual address for
1286 * the mapping. Architectures which can support a direct-mapped
1287 * physical to virtual region can return the appropriate address
1288 * within that region, leaving '*virt' unchanged. Other
1289 * architectures should map the pages starting at '*virt' and
1290 * update '*virt' with the first usable address after the mapped
1291 * region.
1292 */
1293 vm_offset_t
1294 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1295 {
1296
1297 return (PHYS_TO_DMAP(start));
1298 }
1299
1300 /*
1301 * Add a list of wired pages to the kva.
1302 * This routine is only used for temporary
1303 * kernel mappings that do not need to have
1304 * page modification or references recorded.
1305 * Note that old mappings are simply written
1306 * over. The page *must* be wired.
1307 * Note: SMP coherent. Uses a ranged shootdown IPI.
1308 */
1309 void
1310 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1311 {
1312 pt_entry_t *l3;
1313 vm_paddr_t pa;
1314 vm_offset_t va;
1315 vm_page_t m;
1316 pt_entry_t entry;
1317 pn_t pn;
1318 int i;
1319
1320 va = sva;
1321 for (i = 0; i < count; i++) {
1322 m = ma[i];
1323 pa = VM_PAGE_TO_PHYS(m);
1324 pn = (pa / PAGE_SIZE);
1325 l3 = pmap_l3(kernel_pmap, va);
1326
1327 entry = PTE_KERN;
1328 entry |= pmap_memattr_bits(m->md.pv_memattr);
1329 entry |= (pn << PTE_PPN0_S);
1330 pmap_store(l3, entry);
1331
1332 va += L3_SIZE;
1333 }
1334 pmap_invalidate_range(kernel_pmap, sva, va);
1335 }
1336
1337 /*
1338 * This routine tears out page mappings from the
1339 * kernel -- it is meant only for temporary mappings.
1340 * Note: SMP coherent. Uses a ranged shootdown IPI.
1341 */
1342 void
1343 pmap_qremove(vm_offset_t sva, int count)
1344 {
1345 pt_entry_t *l3;
1346 vm_offset_t va;
1347
1348 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1349
1350 for (va = sva; count-- > 0; va += PAGE_SIZE) {
1351 l3 = pmap_l3(kernel_pmap, va);
1352 KASSERT(l3 != NULL, ("pmap_qremove: Invalid address"));
1353 pmap_clear(l3);
1354 }
1355 pmap_invalidate_range(kernel_pmap, sva, va);
1356 }
1357
1358 bool
1359 pmap_ps_enabled(pmap_t pmap __unused)
1360 {
1361
1362 return (superpages_enabled);
1363 }
1364
1365 /***************************************************
1366 * Page table page management routines.....
1367 ***************************************************/
1368 /*
1369 * Schedule the specified unused page table page to be freed. Specifically,
1370 * add the page to the specified list of pages that will be released to the
1371 * physical memory manager after the TLB has been updated.
1372 */
1373 static __inline void
1374 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
1375 {
1376
1377 if (set_PG_ZERO)
1378 m->flags |= PG_ZERO;
1379 else
1380 m->flags &= ~PG_ZERO;
1381 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1382 }
1383
1384 /*
1385 * Inserts the specified page table page into the specified pmap's collection
1386 * of idle page table pages. Each of a pmap's page table pages is responsible
1387 * for mapping a distinct range of virtual addresses. The pmap's collection is
1388 * ordered by this virtual address range.
1389 *
1390 * If "promoted" is false, then the page table page "mpte" must be zero filled;
1391 * "mpte"'s valid field will be set to 0.
1392 *
1393 * If "promoted" is true and "all_l3e_PTE_A_set" is false, then "mpte" must
1394 * contain valid mappings with identical attributes except for PTE_A;
1395 * "mpte"'s valid field will be set to 1.
1396 *
1397 * If "promoted" and "all_l3e_PTE_A_set" are both true, then "mpte" must contain
1398 * valid mappings with identical attributes including PTE_A; "mpte"'s valid
1399 * field will be set to VM_PAGE_BITS_ALL.
1400 */
1401 static __inline int
1402 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
1403 bool all_l3e_PTE_A_set)
1404 {
1405
1406 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1407 KASSERT(promoted || !all_l3e_PTE_A_set,
1408 ("a zero-filled PTP can't have PTE_A set in every PTE"));
1409 mpte->valid = promoted ? (all_l3e_PTE_A_set ? VM_PAGE_BITS_ALL : 1) : 0;
1410 return (vm_radix_insert(&pmap->pm_root, mpte));
1411 }
1412
1413 /*
1414 * Removes the page table page mapping the specified virtual address from the
1415 * specified pmap's collection of idle page table pages, and returns it.
1416 * Otherwise, returns NULL if there is no page table page corresponding to the
1417 * specified virtual address.
1418 */
1419 static __inline vm_page_t
1420 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1421 {
1422
1423 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1424 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
1425 }
1426
1427 /*
1428 * Decrements a page table page's reference count, which is used to record the
1429 * number of valid page table entries within the page. If the reference count
1430 * drops to zero, then the page table page is unmapped. Returns true if the
1431 * page table page was unmapped and false otherwise.
1432 */
1433 static inline bool
1434 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1435 {
1436 KASSERT(m->ref_count > 0,
1437 ("%s: page %p ref count underflow", __func__, m));
1438
1439 --m->ref_count;
1440 if (m->ref_count == 0) {
1441 _pmap_unwire_ptp(pmap, va, m, free);
1442 return (true);
1443 } else {
1444 return (false);
1445 }
1446 }
1447
1448 static void
1449 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1450 {
1451
1452 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1453 if (m->pindex >= NUL2E + NUL1E) {
1454 pd_entry_t *l0;
1455 l0 = pmap_l0(pmap, va);
1456 pmap_clear(l0);
1457 } else if (m->pindex >= NUL2E) {
1458 pd_entry_t *l1;
1459 l1 = pmap_l1(pmap, va);
1460 pmap_clear(l1);
1461 pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
1462 } else {
1463 pd_entry_t *l2;
1464 l2 = pmap_l2(pmap, va);
1465 pmap_clear(l2);
1466 }
1467 pmap_resident_count_dec(pmap, 1);
1468 if (m->pindex < NUL2E) {
1469 pd_entry_t *l1;
1470 vm_page_t pdpg;
1471
1472 l1 = pmap_l1(pmap, va);
1473 pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1474 pmap_unwire_ptp(pmap, va, pdpg, free);
1475 } else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) {
1476 pd_entry_t *l0;
1477 vm_page_t pdpg;
1478
1479 l0 = pmap_l0(pmap, va);
1480 pdpg = PTE_TO_VM_PAGE(pmap_load(l0));
1481 pmap_unwire_ptp(pmap, va, pdpg, free);
1482 }
1483 pmap_invalidate_page(pmap, va);
1484
1485 vm_wire_sub(1);
1486
1487 /*
1488 * Put page on a list so that it is released after
1489 * *ALL* TLB shootdown is done
1490 */
1491 pmap_add_delayed_free_list(m, free, true);
1492 }
1493
1494 /*
1495 * After removing a page table entry, this routine is used to
1496 * conditionally free the page, and manage the reference count.
1497 */
1498 static int
1499 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1500 struct spglist *free)
1501 {
1502 vm_page_t mpte;
1503
1504 if (va >= VM_MAXUSER_ADDRESS)
1505 return (0);
1506 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1507 mpte = PTE_TO_VM_PAGE(ptepde);
1508 return (pmap_unwire_ptp(pmap, va, mpte, free));
1509 }
1510
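/*
 * Return the SATP mode field for the paging mode chosen at boot (Sv39 or
 * Sv48); callers OR it with the root page table's PPN to form pm_satp.
 */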
1511 static uint64_t
1512 pmap_satp_mode(void)
1513 {
1514 return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48);
1515 }
1516
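/*
 * Initialize the pmap used by proc0. It shares the kernel's top-level page
 * table instead of allocating its own and is activated immediately on the
 * boot hart via pmap_activate_boot().
 */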
1517 void
1518 pmap_pinit0(pmap_t pmap)
1519 {
1520 PMAP_LOCK_INIT(pmap);
1521 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1522 pmap->pm_stage = PM_STAGE1;
1523 pmap->pm_top = kernel_pmap->pm_top;
1524 pmap->pm_satp = pmap_satp_mode() |
1525 (vtophys(pmap->pm_top) >> PAGE_SHIFT);
1526 CPU_ZERO(&pmap->pm_active);
1527 TAILQ_INIT(&pmap->pm_pvchunk);
1528 vm_radix_init(&pmap->pm_root);
1529 pmap_activate_boot(pmap);
1530 }
1531
1532 int
1533 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage)
1534 {
1535 vm_paddr_t topphys;
1536 vm_page_t m;
1537 size_t i;
1538
1539 /*
1540 * Top directory is 4 pages in hypervisor case.
1541 * Current address space layout makes 3 of them unused.
1542 */
1543 if (stage == PM_STAGE1)
1544 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
1545 VM_ALLOC_WAITOK);
1546 else
1547 m = vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
1548 4, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT);
1549
1550 topphys = VM_PAGE_TO_PHYS(m);
1551 pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys);
1552 pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT);
1553 pmap->pm_stage = stage;
1554
1555 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1556
1557 CPU_ZERO(&pmap->pm_active);
1558
1559 if (stage == PM_STAGE2)
1560 goto finish;
1561
1562 if (pmap_mode == PMAP_MODE_SV39) {
1563 /*
1564 * Copy L1 entries from the kernel pmap. This must be done with
1565 * the allpmaps lock held to avoid races with
1566 * pmap_distribute_l1().
1567 */
1568 mtx_lock(&allpmaps_lock);
1569 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1570 for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS);
1571 i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++)
1572 pmap->pm_top[i] = kernel_pmap->pm_top[i];
1573 for (i = pmap_l1_index(DMAP_MIN_ADDRESS);
1574 i < pmap_l1_index(DMAP_MAX_ADDRESS); i++)
1575 pmap->pm_top[i] = kernel_pmap->pm_top[i];
1576 mtx_unlock(&allpmaps_lock);
1577 } else {
1578 i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS);
1579 pmap->pm_top[i] = kernel_pmap->pm_top[i];
1580 }
1581
1582 finish:
1583 TAILQ_INIT(&pmap->pm_pvchunk);
1584 vm_radix_init(&pmap->pm_root);
1585
1586 return (1);
1587 }
1588
1589 int
1590 pmap_pinit(pmap_t pmap)
1591 {
1592
1593 return (pmap_pinit_stage(pmap, PM_STAGE1));
1594 }
1595
1596 /*
1597 * This routine is called if the desired page table page does not exist.
1598 *
1599 * If page table page allocation fails, this routine may sleep before
1600 * returning NULL. It sleeps only if a lock pointer was given.
1601 *
1602 * Note: If a page allocation fails at page table level two or three,
1603 * one or two pages may be held during the wait, only to be released
1604 * afterwards. This conservative approach is easily argued to avoid
1605 * race conditions.
1606 */
1607 static vm_page_t
1608 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1609 {
1610 vm_page_t m, pdpg;
1611 pt_entry_t entry;
1612 vm_paddr_t phys;
1613 pn_t pn;
1614
1615 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1616
1617 /*
1618 * Allocate a page table page.
1619 */
1620 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1621 if (m == NULL) {
1622 if (lockp != NULL) {
1623 RELEASE_PV_LIST_LOCK(lockp);
1624 PMAP_UNLOCK(pmap);
1625 rw_runlock(&pvh_global_lock);
1626 vm_wait(NULL);
1627 rw_rlock(&pvh_global_lock);
1628 PMAP_LOCK(pmap);
1629 }
1630
1631 /*
1632 * Indicate the need to retry. While waiting, the page table
1633 * page may have been allocated.
1634 */
1635 return (NULL);
1636 }
1637 m->pindex = ptepindex;
1638
1639 /*
1640 * Map the pagetable page into the process address space, if
1641 * it isn't already there.
1642 */
1643 pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
1644 if (ptepindex >= NUL2E + NUL1E) {
1645 pd_entry_t *l0;
1646 vm_pindex_t l0index;
1647
1648 KASSERT(pmap_mode != PMAP_MODE_SV39,
1649 ("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
1650 KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
1651 ("%s: pindex %#lx out of range", __func__, ptepindex));
1652
1653 l0index = ptepindex - (NUL2E + NUL1E);
1654 l0 = &pmap->pm_top[l0index];
1655 KASSERT((pmap_load(l0) & PTE_V) == 0,
1656 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));
1657
1658 entry = PTE_V | (pn << PTE_PPN0_S);
1659 pmap_store(l0, entry);
1660 } else if (ptepindex >= NUL2E) {
1661 pd_entry_t *l0, *l1;
1662 vm_pindex_t l0index, l1index;
1663
1664 l1index = ptepindex - NUL2E;
1665 if (pmap_mode == PMAP_MODE_SV39) {
1666 l1 = &pmap->pm_top[l1index];
1667 } else {
1668 l0index = l1index >> Ln_ENTRIES_SHIFT;
1669 l0 = &pmap->pm_top[l0index];
1670 if (pmap_load(l0) == 0) {
1671 /* Recurse to allocate the L1 page. */
1672 if (_pmap_alloc_l3(pmap,
1673 NUL2E + NUL1E + l0index, lockp) == NULL)
1674 goto fail;
1675 phys = PTE_TO_PHYS(pmap_load(l0));
1676 } else {
1677 phys = PTE_TO_PHYS(pmap_load(l0));
1678 pdpg = PHYS_TO_VM_PAGE(phys);
1679 pdpg->ref_count++;
1680 }
1681 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1682 l1 = &l1[ptepindex & Ln_ADDR_MASK];
1683 }
1684 KASSERT((pmap_load(l1) & PTE_V) == 0,
1685 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
1686
1687 entry = PTE_V | (pn << PTE_PPN0_S);
1688 pmap_store(l1, entry);
1689 pmap_distribute_l1(pmap, l1index, entry);
1690 } else {
1691 vm_pindex_t l0index, l1index;
1692 pd_entry_t *l0, *l1, *l2;
1693
1694 l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
1695 if (pmap_mode == PMAP_MODE_SV39) {
1696 l1 = &pmap->pm_top[l1index];
1697 if (pmap_load(l1) == 0) {
1698 /* recurse for allocating page dir */
1699 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1700 lockp) == NULL)
1701 goto fail;
1702 } else {
1703 pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1704 pdpg->ref_count++;
1705 }
1706 } else {
1707 l0index = l1index >> Ln_ENTRIES_SHIFT;
1708 l0 = &pmap->pm_top[l0index];
1709 if (pmap_load(l0) == 0) {
1710 /* Recurse to allocate the L1 entry. */
1711 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1712 lockp) == NULL)
1713 goto fail;
1714 phys = PTE_TO_PHYS(pmap_load(l0));
1715 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1716 l1 = &l1[l1index & Ln_ADDR_MASK];
1717 } else {
1718 phys = PTE_TO_PHYS(pmap_load(l0));
1719 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1720 l1 = &l1[l1index & Ln_ADDR_MASK];
1721 if (pmap_load(l1) == 0) {
1722 /* Recurse to allocate the L2 page. */
1723 if (_pmap_alloc_l3(pmap,
1724 NUL2E + l1index, lockp) == NULL)
1725 goto fail;
1726 } else {
1727 pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1728 pdpg->ref_count++;
1729 }
1730 }
1731 }
1732
1733 phys = PTE_TO_PHYS(pmap_load(l1));
1734 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1735 l2 = &l2[ptepindex & Ln_ADDR_MASK];
1736 KASSERT((pmap_load(l2) & PTE_V) == 0,
1737 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
1738
1739 entry = PTE_V | (pn << PTE_PPN0_S);
1740 pmap_store(l2, entry);
1741 }
1742
1743 pmap_resident_count_inc(pmap, 1);
1744
1745 return (m);
1746
1747 fail:
1748 vm_page_unwire_noq(m);
1749 vm_page_free_zero(m);
1750 return (NULL);
1751 }
1752
1753 static vm_page_t
1754 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1755 {
1756 pd_entry_t *l1;
1757 vm_page_t l2pg;
1758 vm_pindex_t pindex;
1759
1760 retry:
1761 l1 = pmap_l1(pmap, va);
1762 if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
1763 KASSERT((pmap_load(l1) & PTE_RWX) == 0,
1764 ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
1765 pmap_load(l1), va));
1766 /* Add a reference to the L2 page. */
1767 l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
1768 l2pg->ref_count++;
1769 } else {
1770 /* Allocate a L2 page. */
1771 pindex = pmap_l1_pindex(va);
1772 l2pg = _pmap_alloc_l3(pmap, pindex, lockp);
1773 if (l2pg == NULL && lockp != NULL)
1774 goto retry;
1775 }
1776 return (l2pg);
1777 }
1778
1779 static vm_page_t
1780 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1781 {
1782 vm_pindex_t ptepindex;
1783 pd_entry_t *l2;
1784 vm_page_t m;
1785
1786 /*
1787 * Calculate pagetable page index
1788 */
1789 ptepindex = pmap_l2_pindex(va);
1790 retry:
1791 /*
1792 * Get the page directory entry
1793 */
1794 l2 = pmap_l2(pmap, va);
1795
1796 /*
1797 * If the page table page is mapped, we just increment the
1798 * hold count, and activate it.
1799 */
1800 if (l2 != NULL && pmap_load(l2) != 0) {
1801 m = PTE_TO_VM_PAGE(pmap_load(l2));
1802 m->ref_count++;
1803 } else {
1804 /*
1805 * Here if the pte page isn't mapped, or if it has been
1806 * deallocated.
1807 */
1808 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1809 if (m == NULL && lockp != NULL)
1810 goto retry;
1811 }
1812 return (m);
1813 }
1814
1815 /***************************************************
1816 * Pmap allocation/deallocation routines.
1817 ***************************************************/
1818
1819 /*
1820 * Release any resources held by the given physical map.
1821 * Called when a pmap initialized by pmap_pinit is being released.
1822 * Should only be called if the map contains no valid mappings.
1823 */
1824 void
1825 pmap_release(pmap_t pmap)
1826 {
1827 vm_page_t m;
1828 int npages;
1829 int i;
1830
1831 KASSERT(pmap->pm_stats.resident_count == 0,
1832 ("pmap_release: pmap resident count %ld != 0",
1833 pmap->pm_stats.resident_count));
1834 KASSERT(CPU_EMPTY(&pmap->pm_active),
1835 ("releasing active pmap %p", pmap));
1836
1837 if (pmap->pm_stage == PM_STAGE2)
1838 goto finish;
1839
1840 if (pmap_mode == PMAP_MODE_SV39) {
1841 mtx_lock(&allpmaps_lock);
1842 LIST_REMOVE(pmap, pm_list);
1843 mtx_unlock(&allpmaps_lock);
1844 }
1845
1846 finish:
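/*
 * A stage 2 pmap's root page table is allocated as four contiguous
 * pages; stage 1 pmaps use a single top-level page.
 */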
1847 npages = pmap->pm_stage == PM_STAGE2 ? 4 : 1;
1848 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
1849 for (i = 0; i < npages; i++) {
1850 vm_page_unwire_noq(m);
1851 vm_page_free(m);
1852 m++;
1853 }
1854 }
1855
1856 static int
1857 kvm_size(SYSCTL_HANDLER_ARGS)
1858 {
1859 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1860
1861 return (sysctl_handle_long(oidp, &ksize, 0, req));
1862 }
1863 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1864 0, 0, kvm_size, "LU",
1865 "Size of KVM");
1866
1867 static int
1868 kvm_free(SYSCTL_HANDLER_ARGS)
1869 {
1870 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1871
1872 return (sysctl_handle_long(oidp, &kfree, 0, req));
1873 }
1874 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1875 0, 0, kvm_free, "LU",
1876 "Amount of KVM free");
1877
1878 /*
1879 * grow the number of kernel page table entries, if needed
1880 */
1881 static int
1882 pmap_growkernel_nopanic(vm_offset_t addr)
1883 {
1884 vm_paddr_t paddr;
1885 vm_page_t nkpg;
1886 pd_entry_t *l1, *l2;
1887 pt_entry_t entry;
1888 pn_t pn;
1889
1890 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1891
1892 addr = roundup2(addr, L2_SIZE);
1893 if (addr - 1 >= vm_map_max(kernel_map))
1894 addr = vm_map_max(kernel_map);
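/*
 * Grow the kernel map in L2_SIZE (2MB) steps: make sure an L1 entry
 * exists for kernel_vm_end, then install a new page table page in the
 * corresponding L2 slot.
 */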
1895 while (kernel_vm_end < addr) {
1896 l1 = pmap_l1(kernel_pmap, kernel_vm_end);
1897 if (pmap_load(l1) == 0) {
1898 /* We need a new L1 entry */
1899 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
1900 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1901 if (nkpg == NULL)
1902 return (KERN_RESOURCE_SHORTAGE);
1903
1904 nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
1905 paddr = VM_PAGE_TO_PHYS(nkpg);
1906
1907 pn = (paddr / PAGE_SIZE);
1908 entry = (PTE_V);
1909 entry |= (pn << PTE_PPN0_S);
1910 pmap_store(l1, entry);
1911 pmap_distribute_l1(kernel_pmap,
1912 pmap_l1_index(kernel_vm_end), entry);
1913 continue; /* try again */
1914 }
1915 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1916 if ((pmap_load(l2) & PTE_V) != 0 &&
1917 (pmap_load(l2) & PTE_RWX) == 0) {
1918 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1919 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1920 kernel_vm_end = vm_map_max(kernel_map);
1921 break;
1922 }
1923 continue;
1924 }
1925
1926 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
1927 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1928 if (nkpg == NULL)
1929 return (KERN_RESOURCE_SHORTAGE);
1930 nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
1931 paddr = VM_PAGE_TO_PHYS(nkpg);
1932
1933 pn = (paddr / PAGE_SIZE);
1934 entry = (PTE_V);
1935 entry |= (pn << PTE_PPN0_S);
1936 pmap_store(l2, entry);
1937
1938 pmap_invalidate_page(kernel_pmap, kernel_vm_end);
1939
1940 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1941 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1942 kernel_vm_end = vm_map_max(kernel_map);
1943 break;
1944 }
1945 }
1946
1947 return (KERN_SUCCESS);
1948 }
1949
1950 int
1951 pmap_growkernel(vm_offset_t addr)
1952 {
1953 int rv;
1954
1955 rv = pmap_growkernel_nopanic(addr);
1956 if (rv != KERN_SUCCESS && pmap_growkernel_panic)
1957 panic("pmap_growkernel: no memory to grow kernel");
1958 return (rv);
1959 }
1960
1961 /***************************************************
1962 * Page management routines.
1963 ***************************************************/
1964
1965 static const uint64_t pc_freemask[_NPCM] = {
1966 [0 ... _NPCM - 2] = PC_FREEN,
1967 [_NPCM - 1] = PC_FREEL
1968 };
1969
1970 #ifdef PV_STATS
1971 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1972
1973 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1974 "Current number of pv entry chunks");
1975 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1976 "Current number of pv entry chunks allocated");
1977 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1978 "Current number of pv entry chunks frees");
1979 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1980 "Number of times tried to get a chunk page but failed.");
1981
1982 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
1983 static int pv_entry_spare;
1984
1985 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1986 "Current number of pv entry frees");
1987 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1988 "Current number of pv entry allocs");
1989 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1990 "Current number of pv entries");
1991 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1992 "Current number of spare pv entries");
1993 #endif
1994
1995 /*
1996 * We are in a serious low memory condition. Resort to
1997 * drastic measures to free some pages so we can allocate
1998 * another pv entry chunk.
1999 *
2000 * Returns NULL if PV entries were reclaimed from the specified pmap.
2001 *
2002 * We do not, however, unmap 2mpages because subsequent accesses will
2003 * allocate per-page pv entries until repromotion occurs, thereby
2004 * exacerbating the shortage of free pv entries.
2005 */
2006 static vm_page_t
2007 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2008 {
2009
2010 panic("RISCVTODO: reclaim_pv_chunk");
2011 }
2012
2013 /*
2014 * free the pv_entry back to the free list
2015 */
2016 static void
2017 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2018 {
2019 struct pv_chunk *pc;
2020 int idx, field, bit;
2021
2022 rw_assert(&pvh_global_lock, RA_LOCKED);
2023 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2024 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2025 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2026 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2027 pc = pv_to_chunk(pv);
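/*
 * Locate the entry's slot within its chunk and set the corresponding
 * bit in pc_map (64 entries per bitmap word) to mark the entry free.
 */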
2028 idx = pv - &pc->pc_pventry[0];
2029 field = idx / 64;
2030 bit = idx % 64;
2031 pc->pc_map[field] |= 1ul << bit;
2032 if (!pc_is_free(pc)) {
2033 /* 98% of the time, pc is already at the head of the list. */
2034 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2035 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2036 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2037 }
2038 return;
2039 }
2040 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2041 free_pv_chunk(pc);
2042 }
2043
2044 static void
2045 free_pv_chunk(struct pv_chunk *pc)
2046 {
2047 vm_page_t m;
2048
2049 mtx_lock(&pv_chunks_mutex);
2050 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2051 mtx_unlock(&pv_chunks_mutex);
2052 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2053 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2054 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2055 /* entire chunk is free, return it */
2056 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2057 dump_drop_page(m->phys_addr);
2058 vm_page_unwire_noq(m);
2059 vm_page_free(m);
2060 }
2061
2062 /*
2063 * Returns a new PV entry, allocating a new PV chunk from the system when
2064 * needed. If this PV chunk allocation fails and a PV list lock pointer was
2065 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
2066 * returned.
2067 *
2068 * The given PV list lock may be released.
2069 */
2070 static pv_entry_t
2071 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2072 {
2073 int bit, field;
2074 pv_entry_t pv;
2075 struct pv_chunk *pc;
2076 vm_page_t m;
2077
2078 rw_assert(&pvh_global_lock, RA_LOCKED);
2079 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2080 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2081 retry:
2082 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2083 if (pc != NULL) {
2084 for (field = 0; field < _NPCM; field++) {
2085 if (pc->pc_map[field]) {
2086 bit = ffsl(pc->pc_map[field]) - 1;
2087 break;
2088 }
2089 }
2090 if (field < _NPCM) {
2091 pv = &pc->pc_pventry[field * 64 + bit];
2092 pc->pc_map[field] &= ~(1ul << bit);
2093 /* If this was the last item, move it to tail */
2094 if (pc_is_full(pc)) {
2095 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2096 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2097 pc_list);
2098 }
2099 PV_STAT(atomic_add_long(&pv_entry_count, 1));
2100 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2101 return (pv);
2102 }
2103 }
2104 /* No free items, allocate another chunk */
2105 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2106 if (m == NULL) {
2107 if (lockp == NULL) {
2108 PV_STAT(pc_chunk_tryfail++);
2109 return (NULL);
2110 }
2111 m = reclaim_pv_chunk(pmap, lockp);
2112 if (m == NULL)
2113 goto retry;
2114 }
2115 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2116 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2117 dump_add_page(m->phys_addr);
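/*
 * Initialize the new chunk. Bit 0 of pc_map[0] is left clear because
 * that entry is handed to the caller below; the remaining set bits
 * mark free entries.
 */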
2118 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2119 pc->pc_pmap = pmap;
2120 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */
2121 pc->pc_map[1] = PC_FREEN;
2122 pc->pc_map[2] = PC_FREEL;
2123 mtx_lock(&pv_chunks_mutex);
2124 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2125 mtx_unlock(&pv_chunks_mutex);
2126 pv = &pc->pc_pventry[0];
2127 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2128 PV_STAT(atomic_add_long(&pv_entry_count, 1));
2129 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2130 return (pv);
2131 }
2132
2133 /*
2134 * Ensure that the number of spare PV entries in the specified pmap meets or
2135 * exceeds the given count, "needed".
2136 *
2137 * The given PV list lock may be released.
2138 */
2139 static void
2140 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2141 {
2142 struct pch new_tail;
2143 struct pv_chunk *pc;
2144 vm_page_t m;
2145 int avail, free;
2146 bool reclaimed;
2147
2148 rw_assert(&pvh_global_lock, RA_LOCKED);
2149 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2150 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2151
2152 /*
2153 * Newly allocated PV chunks must be stored in a private list until
2154 * the required number of PV chunks have been allocated. Otherwise,
2155 * reclaim_pv_chunk() could recycle one of these chunks. In
2156 * contrast, these chunks must be added to the pmap upon allocation.
2157 */
2158 TAILQ_INIT(&new_tail);
2159 retry:
2160 avail = 0;
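/*
 * Count the PV entries that are already free in this pmap's chunks;
 * only the shortfall is made up by allocating new chunks below.
 */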
2161 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2162 bit_count((bitstr_t *)pc->pc_map, 0,
2163 sizeof(pc->pc_map) * NBBY, &free);
2164 if (free == 0)
2165 break;
2166 avail += free;
2167 if (avail >= needed)
2168 break;
2169 }
2170 for (reclaimed = false; avail < needed; avail += _NPCPV) {
2171 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2172 if (m == NULL) {
2173 m = reclaim_pv_chunk(pmap, lockp);
2174 if (m == NULL)
2175 goto retry;
2176 reclaimed = true;
2177 }
2178 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2179 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2180 dump_add_page(m->phys_addr);
2181 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2182 pc->pc_pmap = pmap;
2183 pc->pc_map[0] = PC_FREEN;
2184 pc->pc_map[1] = PC_FREEN;
2185 pc->pc_map[2] = PC_FREEL;
2186 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2187 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2188 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2189
2190 /*
2191 * The reclaim might have freed a chunk from the current pmap.
2192 * If that chunk contained available entries, we need to
2193 * re-count the number of available entries.
2194 */
2195 if (reclaimed)
2196 goto retry;
2197 }
2198 if (!TAILQ_EMPTY(&new_tail)) {
2199 mtx_lock(&pv_chunks_mutex);
2200 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2201 mtx_unlock(&pv_chunks_mutex);
2202 }
2203 }
2204
2205 /*
2206 * First find and then remove the pv entry for the specified pmap and virtual
2207 * address from the specified pv list. Returns the pv entry if found and NULL
2208 * otherwise. This operation can be performed on pv lists for either 4KB or
2209 * 2MB page mappings.
2210 */
2211 static __inline pv_entry_t
2212 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2213 {
2214 pv_entry_t pv;
2215
2216 rw_assert(&pvh_global_lock, RA_LOCKED);
2217 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2218 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2219 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2220 pvh->pv_gen++;
2221 break;
2222 }
2223 }
2224 return (pv);
2225 }
2226
2227 /*
2228 * First find and then destroy the pv entry for the specified pmap and virtual
2229 * address. This operation can be performed on pv lists for either 4KB or 2MB
2230 * page mappings.
2231 */
2232 static void
2233 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2234 {
2235 pv_entry_t pv;
2236
2237 pv = pmap_pvh_remove(pvh, pmap, va);
2238
2239 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
2240 free_pv_entry(pmap, pv);
2241 }
2242
2243 /*
2244 * Conditionally create the PV entry for a 4KB page mapping if the required
2245 * memory can be allocated without resorting to reclamation.
2246 */
2247 static bool
2248 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2249 struct rwlock **lockp)
2250 {
2251 pv_entry_t pv;
2252
2253 rw_assert(&pvh_global_lock, RA_LOCKED);
2254 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2255 /* Pass NULL instead of the lock pointer to disable reclamation. */
2256 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2257 pv->pv_va = va;
2258 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2259 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2260 m->md.pv_gen++;
2261 return (true);
2262 } else
2263 return (false);
2264 }
2265
2266 /*
2267 * After demotion from a 2MB page mapping to 512 4KB page mappings,
2268 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2269 * entries for each of the 4KB page mappings.
2270 */
2271 static void __unused
2272 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2273 struct rwlock **lockp)
2274 {
2275 struct md_page *pvh;
2276 struct pv_chunk *pc;
2277 pv_entry_t pv;
2278 vm_page_t m;
2279 vm_offset_t va_last;
2280 int bit, field;
2281
2282 rw_assert(&pvh_global_lock, RA_LOCKED);
2283 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2284 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2285
2286 /*
2287 * Transfer the 2mpage's pv entry for this mapping to the first
2288 * page's pv list. Once this transfer begins, the pv list lock
2289 * must not be released until the last pv entry is reinstantiated.
2290 */
2291 pvh = pa_to_pvh(pa);
2292 va &= ~L2_OFFSET;
2293 pv = pmap_pvh_remove(pvh, pmap, va);
2294 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2295 m = PHYS_TO_VM_PAGE(pa);
2296 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2297 m->md.pv_gen++;
2298 /* Instantiate the remaining 511 pv entries. */
2299 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2300 va_last = va + L2_SIZE - PAGE_SIZE;
2301 for (;;) {
2302 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2303 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
2304 for (field = 0; field < _NPCM; field++) {
2305 while (pc->pc_map[field] != 0) {
2306 bit = ffsl(pc->pc_map[field]) - 1;
2307 pc->pc_map[field] &= ~(1ul << bit);
2308 pv = &pc->pc_pventry[field * 64 + bit];
2309 va += PAGE_SIZE;
2310 pv->pv_va = va;
2311 m++;
2312 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2313 ("pmap_pv_demote_l2: page %p is not managed", m));
2314 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2315 m->md.pv_gen++;
2316 if (va == va_last)
2317 goto out;
2318 }
2319 }
2320 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2321 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2322 }
2323 out:
2324 if (pc_is_full(pc)) {
2325 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2326 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2327 }
2328 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2329 PV_STAT(atomic_add_int(&pv_entry_spare, -(Ln_ENTRIES - 1)));
2330 }
2331
2332 #if VM_NRESERVLEVEL > 0
2333 static void
2334 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2335 struct rwlock **lockp)
2336 {
2337 struct md_page *pvh;
2338 pv_entry_t pv;
2339 vm_page_t m;
2340 vm_offset_t va_last;
2341
2342 rw_assert(&pvh_global_lock, RA_LOCKED);
2343 KASSERT((pa & L2_OFFSET) == 0,
2344 ("pmap_pv_promote_l2: misaligned pa %#lx", pa));
2345
2346 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2347
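/*
 * Transfer the first 4KB page's pv entry to the 2MB page's pv list,
 * then free the pv entries for the remaining 511 constituent pages.
 */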
2348 m = PHYS_TO_VM_PAGE(pa);
2349 va = va & ~L2_OFFSET;
2350 pv = pmap_pvh_remove(&m->md, pmap, va);
2351 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
2352 pvh = pa_to_pvh(pa);
2353 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2354 pvh->pv_gen++;
2355
2356 va_last = va + L2_SIZE - PAGE_SIZE;
2357 do {
2358 m++;
2359 va += PAGE_SIZE;
2360 pmap_pvh_free(&m->md, pmap, va);
2361 } while (va < va_last);
2362 }
2363 #endif /* VM_NRESERVLEVEL > 0 */
2364
2365 /*
2366 * Create the PV entry for a 2MB page mapping. Always returns true unless the
2367 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
2368 * false if the PV entry cannot be allocated without resorting to reclamation.
2369 */
2370 static bool
2371 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2372 struct rwlock **lockp)
2373 {
2374 struct md_page *pvh;
2375 pv_entry_t pv;
2376 vm_paddr_t pa;
2377
2378 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2379 /* Pass NULL instead of the lock pointer to disable reclamation. */
2380 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2381 NULL : lockp)) == NULL)
2382 return (false);
2383 pv->pv_va = va;
2384 pa = PTE_TO_PHYS(l2e);
2385 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2386 pvh = pa_to_pvh(pa);
2387 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2388 pvh->pv_gen++;
2389 return (true);
2390 }
2391
2392 static void
2393 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
2394 {
2395 pt_entry_t newl2, oldl2 __diagused;
2396 vm_page_t ml3;
2397 vm_paddr_t ml3pa;
2398
2399 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
2400 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
2401 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2402
2403 ml3 = pmap_remove_pt_page(pmap, va);
2404 if (ml3 == NULL)
2405 panic("pmap_remove_kernel_l2: Missing pt page");
2406
2407 ml3pa = VM_PAGE_TO_PHYS(ml3);
2408 newl2 = ml3pa | PTE_V;
2409
2410 /*
2411 * If this page table page was unmapped by a promotion, then it
2412 * contains valid mappings. Zero it to invalidate those mappings.
2413 */
2414 if (vm_page_any_valid(ml3))
2415 pagezero((void *)PHYS_TO_DMAP(ml3pa));
2416
2417 /*
2418 * Demote the mapping.
2419 */
2420 oldl2 = pmap_load_store(l2, newl2);
2421 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2422 __func__, l2, oldl2));
2423 }
2424
2425 /*
2426 * pmap_remove_l2: Unmap a 2MB (level 2) superpage mapping.
2427 */
2428 static int
2429 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2430 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2431 {
2432 struct md_page *pvh;
2433 pt_entry_t oldl2;
2434 vm_offset_t eva, va;
2435 vm_page_t m, ml3;
2436
2437 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2438 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2439 oldl2 = pmap_load_clear(l2);
2440 KASSERT((oldl2 & PTE_RWX) != 0,
2441 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
2442
2443 /*
2444 * The sfence.vma documentation states that it is sufficient to specify
2445 * a single address within a superpage mapping. However, since we do
2446 * not perform any invalidation upon promotion, TLBs may still be
2447 * caching 4KB mappings within the superpage, so we must invalidate the
2448 * entire range.
2449 */
2450 pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
2451 if ((oldl2 & PTE_SW_WIRED) != 0)
2452 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2453 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2454 if ((oldl2 & PTE_SW_MANAGED) != 0) {
2455 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
2456 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
2457 pmap_pvh_free(pvh, pmap, sva);
2458 eva = sva + L2_SIZE;
2459 for (va = sva, m = PTE_TO_VM_PAGE(oldl2);
2460 va < eva; va += PAGE_SIZE, m++) {
2461 if ((oldl2 & PTE_D) != 0)
2462 vm_page_dirty(m);
2463 if ((oldl2 & PTE_A) != 0)
2464 vm_page_aflag_set(m, PGA_REFERENCED);
2465 if (TAILQ_EMPTY(&m->md.pv_list) &&
2466 TAILQ_EMPTY(&pvh->pv_list))
2467 vm_page_aflag_clear(m, PGA_WRITEABLE);
2468 }
2469 }
2470 if (pmap == kernel_pmap) {
2471 pmap_remove_kernel_l2(pmap, l2, sva);
2472 } else {
2473 ml3 = pmap_remove_pt_page(pmap, sva);
2474 if (ml3 != NULL) {
2475 KASSERT(vm_page_any_valid(ml3),
2476 ("pmap_remove_l2: l3 page not promoted"));
2477 pmap_resident_count_dec(pmap, 1);
2478 KASSERT(ml3->ref_count == Ln_ENTRIES,
2479 ("pmap_remove_l2: l3 page ref count error"));
2480 ml3->ref_count = 1;
2481 vm_page_unwire_noq(ml3);
2482 pmap_add_delayed_free_list(ml3, free, false);
2483 }
2484 }
2485 return (pmap_unuse_pt(pmap, sva, l1e, free));
2486 }
2487
2488 /*
2489 * pmap_remove_l3: Unmap a single 4KB page from a process address space.
2490 */
2491 static int
2492 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2493 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2494 {
2495 struct md_page *pvh;
2496 pt_entry_t old_l3;
2497 vm_page_t m;
2498
2499 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2500 old_l3 = pmap_load_clear(l3);
2501 pmap_invalidate_page(pmap, va);
2502 if (old_l3 & PTE_SW_WIRED)
2503 pmap->pm_stats.wired_count -= 1;
2504 pmap_resident_count_dec(pmap, 1);
2505 if (old_l3 & PTE_SW_MANAGED) {
2506 m = PTE_TO_VM_PAGE(old_l3);
2507 if ((old_l3 & PTE_D) != 0)
2508 vm_page_dirty(m);
2509 if (old_l3 & PTE_A)
2510 vm_page_aflag_set(m, PGA_REFERENCED);
2511 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2512 pmap_pvh_free(&m->md, pmap, va);
2513 if (TAILQ_EMPTY(&m->md.pv_list) &&
2514 (m->flags & PG_FICTITIOUS) == 0) {
2515 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2516 if (TAILQ_EMPTY(&pvh->pv_list))
2517 vm_page_aflag_clear(m, PGA_WRITEABLE);
2518 }
2519 }
2520
2521 return (pmap_unuse_pt(pmap, va, l2e, free));
2522 }
2523
2524 /*
2525 * Remove the given range of addresses from the specified map.
2526 *
2527 * It is assumed that the start and end are properly
2528 * rounded to the page size.
2529 */
2530 void
2531 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2532 {
2533 struct spglist free;
2534 struct rwlock *lock;
2535 vm_offset_t va, va_next;
2536 pd_entry_t *l0, *l1, *l2, l2e;
2537 pt_entry_t *l3;
2538
2539 /*
2540 * Perform an unsynchronized read. This is, however, safe.
2541 */
2542 if (pmap->pm_stats.resident_count == 0)
2543 return;
2544
2545 SLIST_INIT(&free);
2546
2547 rw_rlock(&pvh_global_lock);
2548 PMAP_LOCK(pmap);
2549
2550 lock = NULL;
2551 for (; sva < eva; sva = va_next) {
2552 if (pmap->pm_stats.resident_count == 0)
2553 break;
2554
2555 if (pmap_mode == PMAP_MODE_SV48) {
2556 l0 = pmap_l0(pmap, sva);
2557 if (pmap_load(l0) == 0) {
2558 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2559 if (va_next < sva)
2560 va_next = eva;
2561 continue;
2562 }
2563 l1 = pmap_l0_to_l1(l0, sva);
2564 } else {
2565 l1 = pmap_l1(pmap, sva);
2566 }
2567
2568 if (pmap_load(l1) == 0) {
2569 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2570 if (va_next < sva)
2571 va_next = eva;
2572 continue;
2573 }
2574
2575 /*
2576 * Calculate index for next page table.
2577 */
2578 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2579 if (va_next < sva)
2580 va_next = eva;
2581
2582 l2 = pmap_l1_to_l2(l1, sva);
2583 if ((l2e = pmap_load(l2)) == 0)
2584 continue;
2585 if ((l2e & PTE_RWX) != 0) {
2586 if (sva + L2_SIZE == va_next && eva >= va_next) {
2587 (void)pmap_remove_l2(pmap, l2, sva,
2588 pmap_load(l1), &free, &lock);
2589 continue;
2590 } else if (!pmap_demote_l2_locked(pmap, l2, sva,
2591 &lock)) {
2592 /*
2593 * The large page mapping was destroyed.
2594 */
2595 continue;
2596 }
2597 l2e = pmap_load(l2);
2598 }
2599
2600 /*
2601 * Limit our scan to either the end of the va represented
2602 * by the current page table page, or to the end of the
2603 * range being removed.
2604 */
2605 if (va_next > eva)
2606 va_next = eva;
2607
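/*
 * "va" tracks the start of a contiguous run of removed 4KB mappings
 * so that a single TLB invalidation can cover the whole run.
 */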
2608 va = va_next;
2609 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2610 sva += L3_SIZE) {
2611 if (pmap_load(l3) == 0) {
2612 if (va != va_next) {
2613 pmap_invalidate_range(pmap, va, sva);
2614 va = va_next;
2615 }
2616 continue;
2617 }
2618 if (va == va_next)
2619 va = sva;
2620 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
2621 sva += L3_SIZE;
2622 break;
2623 }
2624 }
2625 if (va != va_next)
2626 pmap_invalidate_range(pmap, va, sva);
2627 }
2628 if (lock != NULL)
2629 rw_wunlock(lock);
2630 rw_runlock(&pvh_global_lock);
2631 PMAP_UNLOCK(pmap);
2632 vm_page_free_pages_toq(&free, false);
2633 }
2634
2635 /*
2636 * Routine: pmap_remove_all
2637 * Function:
2638 * Removes this physical page from
2639 * all physical maps in which it resides.
2640 * Reflects back modify bits to the pager.
2641 *
2642 * Notes:
2643 * Original versions of this routine were very
2644 * inefficient because they iteratively called
2645 * pmap_remove (slow...)
2646 */
2647
2648 void
2649 pmap_remove_all(vm_page_t m)
2650 {
2651 struct spglist free;
2652 struct md_page *pvh;
2653 pmap_t pmap;
2654 pt_entry_t *l3, l3e;
2655 pd_entry_t *l2, l2e __diagused;
2656 pv_entry_t pv;
2657 vm_offset_t va;
2658
2659 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2660 ("pmap_remove_all: page %p is not managed", m));
2661 SLIST_INIT(&free);
2662 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2663 pa_to_pvh(VM_PAGE_TO_PHYS(m));
2664
2665 rw_wlock(&pvh_global_lock);
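/*
 * First demote any 2MB mappings of the page so that the loop below
 * only has to deal with 4KB mappings.
 */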
2666 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2667 pmap = PV_PMAP(pv);
2668 PMAP_LOCK(pmap);
2669 va = pv->pv_va;
2670 l2 = pmap_l2(pmap, va);
2671 (void)pmap_demote_l2(pmap, l2, va);
2672 PMAP_UNLOCK(pmap);
2673 }
2674 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2675 pmap = PV_PMAP(pv);
2676 PMAP_LOCK(pmap);
2677 pmap_resident_count_dec(pmap, 1);
2678 l2 = pmap_l2(pmap, pv->pv_va);
2679 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
2680 l2e = pmap_load(l2);
2681
2682 KASSERT((l2e & PTE_RX) == 0,
2683 ("pmap_remove_all: found a superpage in %p's pv list", m));
2684
2685 l3 = pmap_l2_to_l3(l2, pv->pv_va);
2686 l3e = pmap_load_clear(l3);
2687 pmap_invalidate_page(pmap, pv->pv_va);
2688 if (l3e & PTE_SW_WIRED)
2689 pmap->pm_stats.wired_count--;
2690 if ((l3e & PTE_A) != 0)
2691 vm_page_aflag_set(m, PGA_REFERENCED);
2692
2693 /*
2694 * Update the vm_page_t clean and reference bits.
2695 */
2696 if ((l3e & PTE_D) != 0)
2697 vm_page_dirty(m);
2698 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
2699 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2700 m->md.pv_gen++;
2701 free_pv_entry(pmap, pv);
2702 PMAP_UNLOCK(pmap);
2703 }
2704 vm_page_aflag_clear(m, PGA_WRITEABLE);
2705 rw_wunlock(&pvh_global_lock);
2706 vm_page_free_pages_toq(&free, false);
2707 }
2708
2709 /*
2710 * Set the physical protection on the
2711 * specified range of this map as requested.
2712 */
2713 void
2714 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2715 {
2716 pd_entry_t *l0, *l1, *l2, l2e;
2717 pt_entry_t *l3, l3e, mask;
2718 vm_page_t m, mt;
2719 vm_offset_t va_next;
2720 bool anychanged, pv_lists_locked;
2721
2722 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2723 pmap_remove(pmap, sva, eva);
2724 return;
2725 }
2726
2727 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2728 (VM_PROT_WRITE | VM_PROT_EXECUTE))
2729 return;
2730
2731 anychanged = false;
2732 pv_lists_locked = false;
2733 mask = 0;
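/*
 * Build the set of PTE bits to clear. Revoking write access also
 * clears PTE_D; any modification recorded there is transferred to the
 * vm_page via vm_page_dirty() before the bits are cleared below.
 */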
2734 if ((prot & VM_PROT_WRITE) == 0)
2735 mask |= PTE_W | PTE_D;
2736 if ((prot & VM_PROT_EXECUTE) == 0)
2737 mask |= PTE_X;
2738 resume:
2739 PMAP_LOCK(pmap);
2740 for (; sva < eva; sva = va_next) {
2741 if (pmap_mode == PMAP_MODE_SV48) {
2742 l0 = pmap_l0(pmap, sva);
2743 if (pmap_load(l0) == 0) {
2744 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2745 if (va_next < sva)
2746 va_next = eva;
2747 continue;
2748 }
2749 l1 = pmap_l0_to_l1(l0, sva);
2750 } else {
2751 l1 = pmap_l1(pmap, sva);
2752 }
2753
2754 if (pmap_load(l1) == 0) {
2755 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2756 if (va_next < sva)
2757 va_next = eva;
2758 continue;
2759 }
2760
2761 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2762 if (va_next < sva)
2763 va_next = eva;
2764
2765 l2 = pmap_l1_to_l2(l1, sva);
2766 if ((l2e = pmap_load(l2)) == 0)
2767 continue;
2768 if ((l2e & PTE_RWX) != 0) {
2769 if (sva + L2_SIZE == va_next && eva >= va_next) {
2770 retryl2:
2771 if ((prot & VM_PROT_WRITE) == 0 &&
2772 (l2e & (PTE_SW_MANAGED | PTE_D)) ==
2773 (PTE_SW_MANAGED | PTE_D)) {
2774 m = PTE_TO_VM_PAGE(l2e);
2775 for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
2776 vm_page_dirty(mt);
2777 }
2778 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
2779 goto retryl2;
2780 anychanged = true;
2781 continue;
2782 } else {
2783 if (!pv_lists_locked) {
2784 pv_lists_locked = true;
2785 if (!rw_try_rlock(&pvh_global_lock)) {
2786 if (anychanged)
2787 pmap_invalidate_all(
2788 pmap);
2789 PMAP_UNLOCK(pmap);
2790 rw_rlock(&pvh_global_lock);
2791 goto resume;
2792 }
2793 }
2794 if (!pmap_demote_l2(pmap, l2, sva)) {
2795 /*
2796 * The large page mapping was destroyed.
2797 */
2798 continue;
2799 }
2800 }
2801 }
2802
2803 if (va_next > eva)
2804 va_next = eva;
2805
2806 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2807 sva += L3_SIZE) {
2808 l3e = pmap_load(l3);
2809 retryl3:
2810 if ((l3e & PTE_V) == 0)
2811 continue;
2812 if ((prot & VM_PROT_WRITE) == 0 &&
2813 (l3e & (PTE_SW_MANAGED | PTE_D)) ==
2814 (PTE_SW_MANAGED | PTE_D)) {
2815 m = PTE_TO_VM_PAGE(l3e);
2816 vm_page_dirty(m);
2817 }
2818 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
2819 goto retryl3;
2820 anychanged = true;
2821 }
2822 }
2823 if (anychanged)
2824 pmap_invalidate_all(pmap);
2825 if (pv_lists_locked)
2826 rw_runlock(&pvh_global_lock);
2827 PMAP_UNLOCK(pmap);
2828 }
2829
2830 int
2831 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
2832 {
2833 pd_entry_t *l2, l2e;
2834 pt_entry_t bits, *pte, oldpte;
2835 int rv;
2836
2837 KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va));
2838
2839 rv = 0;
2840 PMAP_LOCK(pmap);
2841 l2 = pmap_l2(pmap, va);
2842 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
2843 goto done;
2844 if ((l2e & PTE_RWX) == 0) {
2845 pte = pmap_l2_to_l3(l2, va);
2846 if (((oldpte = pmap_load(pte)) & PTE_V) == 0)
2847 goto done;
2848 } else {
2849 pte = l2;
2850 oldpte = l2e;
2851 }
2852
2853 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) ||
2854 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) ||
2855 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) ||
2856 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0))
2857 goto done;
2858
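/*
 * The access is permitted by the PTE; set the accessed bit, and the
 * dirty bit for write faults. RISC-V implementations are allowed to
 * raise a page fault instead of updating A/D in hardware, so the
 * update is performed here.
 */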
2859 bits = PTE_A;
2860 if (ftype == VM_PROT_WRITE)
2861 bits |= PTE_D;
2862
2863 /*
2864 * Spurious faults can occur if the implementation caches invalid
2865 * entries in the TLB, or if simultaneous accesses on multiple CPUs
2866 * race with each other.
2867 */
2868 if ((oldpte & bits) != bits)
2869 pmap_store_bits(pte, bits);
2870 sfence_vma();
2871 rv = 1;
2872 done:
2873 PMAP_UNLOCK(pmap);
2874 return (rv);
2875 }
2876
2877 /*
2878 * Demote the specified L1 page to separate L2 pages.
2879 * Currently only used for DMAP entries.
2880 */
2881 static bool
2882 pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va)
2883 {
2884 vm_page_t m;
2885 pt_entry_t *l2, oldl1, newl2;
2886 pd_entry_t newl1;
2887 vm_paddr_t l2phys;
2888
2889 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2890
2891 oldl1 = pmap_load(l1);
2892 KASSERT((oldl1 & PTE_RWX) != 0,
2893 ("pmap_demote_l1: oldl1 is not a leaf PTE"));
2894 KASSERT((oldl1 & PTE_A) != 0,
2895 ("pmap_demote_l1: oldl1 is missing PTE_A"));
2896 KASSERT((oldl1 & (PTE_D | PTE_W)) != PTE_W,
2897 ("pmap_demote_l1: not dirty!"));
2898 KASSERT((oldl1 & PTE_SW_MANAGED) == 0,
2899 ("pmap_demote_l1: L1 table shouldn't be managed"));
2900 KASSERT(VIRT_IN_DMAP(va),
2901 ("pmap_demote_l1: is unsupported for non-DMAP va=%#lx", va));
2902
2903 /* Demoting L1 means we need to allocate a new page-table page. */
2904 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
2905 if (m == NULL) {
2906 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx in pmap %p",
2907 va, pmap);
2908 return (false);
2909 }
2910
2911 l2phys = VM_PAGE_TO_PHYS(m);
2912 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
2913
2914 /*
2915 * Create new entries, relying on the fact that only the low bits
2916 * (index) of the physical address are changing.
2917 */
2918 newl2 = oldl1;
2919 for (int i = 0; i < Ln_ENTRIES; i++)
2920 pmap_store(&l2[i], newl2 | (i << PTE_PPN1_S));
2921
2922 /*
2923 * And update the L1 entry.
2924 *
2925 * NB: flushing the TLB is the responsibility of the caller. Cached
2926 * translations are still "correct" for demoted mappings until some
2927 * subset of the demoted range is modified.
2928 */
2929 newl1 = ((l2phys / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
2930 pmap_store(l1, newl1);
2931
2932 counter_u64_add(pmap_l1_demotions, 1);
2933 CTR2(KTR_PMAP, "pmap_demote_l1: success for va %#lx in pmap %p",
2934 va, pmap);
2935 return (true);
2936 }
2937
2938 static bool
2939 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va)
2940 {
2941 struct rwlock *lock;
2942 bool rv;
2943
2944 lock = NULL;
2945 rv = pmap_demote_l2_locked(pmap, l2, va, &lock);
2946 if (lock != NULL)
2947 rw_wunlock(lock);
2948 return (rv);
2949 }
2950
2951 /*
2952 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page
2953 * mapping is invalidated.
2954 */
2955 static bool
2956 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
2957 struct rwlock **lockp)
2958 {
2959 struct spglist free;
2960 vm_page_t mpte;
2961 pd_entry_t newl2, oldl2;
2962 pt_entry_t *firstl3, newl3;
2963 vm_paddr_t mptepa;
2964 int i;
2965
2966 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2967
2968 oldl2 = pmap_load(l2);
2969 KASSERT((oldl2 & PTE_RWX) != 0,
2970 ("pmap_demote_l2_locked: oldl2 is not a leaf entry"));
2971 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
2972 NULL) {
2973 KASSERT((oldl2 & PTE_SW_WIRED) == 0,
2974 ("pmap_demote_l2_locked: page table page for a wired mapping is missing"));
2975 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj(
2976 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
2977 VM_ALLOC_WIRED)) == NULL) {
2978 SLIST_INIT(&free);
2979 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET,
2980 pmap_load(pmap_l1(pmap, va)), &free, lockp);
2981 vm_page_free_pages_toq(&free, true);
2982 CTR2(KTR_PMAP, "pmap_demote_l2_locked: "
2983 "failure for va %#lx in pmap %p", va, pmap);
2984 return (false);
2985 }
2986 mpte->pindex = pmap_l2_pindex(va);
2987 if (va < VM_MAXUSER_ADDRESS) {
2988 mpte->ref_count = Ln_ENTRIES;
2989 pmap_resident_count_inc(pmap, 1);
2990 }
2991 }
2992 mptepa = VM_PAGE_TO_PHYS(mpte);
2993 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2994 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
2995 KASSERT((oldl2 & PTE_A) != 0,
2996 ("pmap_demote_l2_locked: oldl2 is missing PTE_A"));
2997 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W,
2998 ("pmap_demote_l2_locked: oldl2 is missing PTE_D"));
2999 newl3 = oldl2;
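/*
 * Each new L3 entry inherits the superpage's permissions and
 * attributes; successive entries differ only in the PPN, which is
 * advanced one 4KB page at a time by (i << PTE_PPN0_S) below.
 */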
3000
3001 /*
3002 * If the page table page is not leftover from an earlier promotion,
3003 * initialize it.
3004 */
3005 if (!vm_page_all_valid(mpte)) {
3006 for (i = 0; i < Ln_ENTRIES; i++)
3007 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
3008 }
3009 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3),
3010 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical "
3011 "addresses"));
3012
3013 /*
3014 * If the mapping has changed attributes, update the PTEs.
3015 */
3016 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE))
3017 for (i = 0; i < Ln_ENTRIES; i++)
3018 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
3019
3020 /*
3021 * The spare PV entries must be reserved prior to demoting the
3022 * mapping, that is, prior to changing the L2 entry. Otherwise, the
3023 * state of the L2 entry and the PV lists will be inconsistent, which
3024 * can result in reclaim_pv_chunk() attempting to remove a PV entry from
3025 * the wrong PV list and pmap_pv_demote_l2() failing to find the
3026 * expected PV entry for the 2MB page mapping that is being demoted.
3027 */
3028 if ((oldl2 & PTE_SW_MANAGED) != 0)
3029 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
3030
3031 /*
3032 * Demote the mapping.
3033 */
3034 pmap_store(l2, newl2);
3035
3036 /*
3037 * Demote the PV entry.
3038 */
3039 if ((oldl2 & PTE_SW_MANAGED) != 0)
3040 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
3041
3042 atomic_add_long(&pmap_l2_demotions, 1);
3043 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p",
3044 va, pmap);
3045 return (true);
3046 }
3047
3048 #if VM_NRESERVLEVEL > 0
3049 static bool
3050 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3,
3051 struct rwlock **lockp)
3052 {
3053 pt_entry_t all_l3e_PTE_A, *firstl3, firstl3e, *l3, l3e;
3054 vm_paddr_t pa;
3055
3056 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3057 if (!pmap_ps_enabled(pmap))
3058 return (false);
3059
3060 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
3061 ("pmap_promote_l2: invalid l2 entry %p", l2));
3062
3063 /*
3064 * Examine the first L3E in the specified PTP. Abort if this L3E is
3065 * ineligible for promotion or does not map the first 4KB physical page
3066 * within a 2MB page.
3067 */
3068 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
3069 firstl3e = pmap_load(firstl3);
3070 pa = PTE_TO_PHYS(firstl3e);
3071 if ((pa & L2_OFFSET) != 0) {
3072 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3073 va, pmap);
3074 atomic_add_long(&pmap_l2_p_failures, 1);
3075 return (false);
3076 }
3077
3078 /*
3079 * Downgrade a clean, writable mapping to read-only to ensure that the
3080 * hardware does not set PTE_D while we are comparing PTEs.
3081 *
3082 * Upon a write access to a clean mapping, the implementation will
3083 * either atomically check protections and set PTE_D, or raise a page
3084 * fault. In the latter case, the pmap lock provides atomicity. Thus,
3085 * we do not issue an sfence.vma here and instead rely on pmap_fault()
3086 * to do so lazily.
3087 */
3088 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) {
3089 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) {
3090 firstl3e &= ~PTE_W;
3091 break;
3092 }
3093 }
3094
3095 /*
3096 * Examine each of the other PTEs in the specified PTP. Abort if this
3097 * PTE maps an unexpected 4KB physical page or does not have identical
3098 * characteristics to the first PTE.
3099 */
3100 all_l3e_PTE_A = firstl3e & PTE_A;
3101 pa += L2_SIZE - PAGE_SIZE;
3102 for (l3 = firstl3 + Ln_ENTRIES - 1; l3 > firstl3; l3--) {
3103 l3e = pmap_load(l3);
3104 if (PTE_TO_PHYS(l3e) != pa) {
3105 CTR2(KTR_PMAP,
3106 "pmap_promote_l2: failure for va %#lx pmap %p",
3107 va, pmap);
3108 atomic_add_long(&pmap_l2_p_failures, 1);
3109 return (false);
3110 }
3111 while ((l3e & (PTE_W | PTE_D)) == PTE_W) {
3112 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) {
3113 l3e &= ~PTE_W;
3114 break;
3115 }
3116 }
3117 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) {
3118 CTR2(KTR_PMAP,
3119 "pmap_promote_l2: failure for va %#lx pmap %p",
3120 va, pmap);
3121 atomic_add_long(&pmap_l2_p_failures, 1);
3122 return (false);
3123 }
3124 all_l3e_PTE_A &= l3e;
3125 pa -= PAGE_SIZE;
3126 }
3127
3128 /*
3129 * Unless all PTEs have PTE_A set, clear it from the superpage
3130 * mapping, so that promotions triggered by speculative mappings,
3131 * such as pmap_enter_quick(), don't automatically mark the
3132 * underlying pages as referenced.
3133 */
3134 firstl3e &= ~PTE_A | all_l3e_PTE_A;
3135
3136 /*
3137 * Save the page table page in its current state until the L2
3138 * mapping the superpage is demoted by pmap_demote_l2() or
3139 * destroyed by pmap_remove_l3().
3140 */
3141 if (ml3 == NULL)
3142 ml3 = PTE_TO_VM_PAGE(pmap_load(l2));
3143 KASSERT(ml3->pindex == pmap_l2_pindex(va),
3144 ("pmap_promote_l2: page table page's pindex is wrong"));
3145 if (pmap_insert_pt_page(pmap, ml3, true, all_l3e_PTE_A != 0)) {
3146 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3147 va, pmap);
3148 atomic_add_long(&pmap_l2_p_failures, 1);
3149 return (false);
3150 }
3151
3152 if ((firstl3e & PTE_SW_MANAGED) != 0)
3153 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp);
3154
3155 pmap_store(l2, firstl3e);
3156
3157 atomic_add_long(&pmap_l2_promotions, 1);
3158 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
3159 pmap);
3160 return (true);
3161 }
3162 #endif
3163
3164 /*
3165 * Insert the given physical page (p) at
3166 * the specified virtual address (v) in the
3167 * target physical map with the protection requested.
3168 *
3169 * If specified, the page will be wired down, meaning
3170 * that the related pte can not be reclaimed.
3171 *
3172 * NB: This is the only routine which MAY NOT lazy-evaluate
3173 * or lose information. That is, this routine must actually
3174 * insert this page into the given map NOW.
3175 */
3176 int
3177 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3178 u_int flags, int8_t psind)
3179 {
3180 struct rwlock *lock;
3181 pd_entry_t *l2, l2e;
3182 pt_entry_t new_l3, orig_l3;
3183 pt_entry_t *l3;
3184 pv_entry_t pv;
3185 vm_paddr_t opa, pa;
3186 vm_page_t mpte, om;
3187 pn_t pn;
3188 int rv;
3189 bool nosleep;
3190
3191 va = trunc_page(va);
3192 if ((m->oflags & VPO_UNMANAGED) == 0)
3193 VM_PAGE_OBJECT_BUSY_ASSERT(m);
3194 pa = VM_PAGE_TO_PHYS(m);
3195 pn = (pa / PAGE_SIZE);
3196
3197 new_l3 = PTE_V | PTE_R | PTE_A;
3198 if (prot & VM_PROT_EXECUTE)
3199 new_l3 |= PTE_X;
3200 if (flags & VM_PROT_WRITE)
3201 new_l3 |= PTE_D;
3202 if (prot & VM_PROT_WRITE)
3203 new_l3 |= PTE_W;
3204 if (va < VM_MAX_USER_ADDRESS)
3205 new_l3 |= PTE_U;
3206
3207 new_l3 |= (pn << PTE_PPN0_S);
3208 if ((flags & PMAP_ENTER_WIRED) != 0)
3209 new_l3 |= PTE_SW_WIRED;
3210 new_l3 |= pmap_memattr_bits(m->md.pv_memattr);
3211
3212 /*
3213 * Set modified bit gratuitously for writeable mappings if
3214 * the page is unmanaged. We do not want to take a fault
3215 * to do the dirty bit accounting for these mappings.
3216 */
3217 if ((m->oflags & VPO_UNMANAGED) != 0) {
3218 if (prot & VM_PROT_WRITE)
3219 new_l3 |= PTE_D;
3220 } else
3221 new_l3 |= PTE_SW_MANAGED;
3222
3223 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
3224
3225 lock = NULL;
3226 mpte = NULL;
3227 rw_rlock(&pvh_global_lock);
3228 PMAP_LOCK(pmap);
3229 if (psind == 1) {
3230 /* Assert the required virtual and physical alignment. */
3231 KASSERT((va & L2_OFFSET) == 0,
3232 ("pmap_enter: va %#lx unaligned", va));
3233 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
3234 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock);
3235 goto out;
3236 }
3237
3238 l2 = pmap_l2(pmap, va);
3239 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
3240 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2,
3241 va, &lock))) {
3242 l3 = pmap_l2_to_l3(l2, va);
3243 if (va < VM_MAXUSER_ADDRESS) {
3244 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3245 mpte->ref_count++;
3246 }
3247 } else if (va < VM_MAXUSER_ADDRESS) {
3248 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
3249 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
3250 if (mpte == NULL && nosleep) {
3251 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
3252 if (lock != NULL)
3253 rw_wunlock(lock);
3254 rw_runlock(&pvh_global_lock);
3255 PMAP_UNLOCK(pmap);
3256 return (KERN_RESOURCE_SHORTAGE);
3257 }
3258 l3 = pmap_l3(pmap, va);
3259 } else {
3260 panic("pmap_enter: missing L3 table for kernel va %#lx", va);
3261 }
3262
3263 orig_l3 = pmap_load(l3);
3264 opa = PTE_TO_PHYS(orig_l3);
3265 pv = NULL;
3266
3267 /*
3268 * Is the specified virtual address already mapped?
3269 */
3270 if ((orig_l3 & PTE_V) != 0) {
3271 /*
3272 * Wiring change, just update stats. We don't worry about
3273 * wiring PT pages as they remain resident as long as there
3274 * are valid mappings in them. Hence, if a user page is wired,
3275 * the PT page will be also.
3276 */
3277 if ((flags & PMAP_ENTER_WIRED) != 0 &&
3278 (orig_l3 & PTE_SW_WIRED) == 0)
3279 pmap->pm_stats.wired_count++;
3280 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
3281 (orig_l3 & PTE_SW_WIRED) != 0)
3282 pmap->pm_stats.wired_count--;
3283
3284 /*
3285 * Remove the extra PT page reference.
3286 */
3287 if (mpte != NULL) {
3288 mpte->ref_count--;
3289 KASSERT(mpte->ref_count > 0,
3290 ("pmap_enter: missing reference to page table page,"
3291 " va: 0x%lx", va));
3292 }
3293
3294 /*
3295 * Has the physical page changed?
3296 */
3297 if (opa == pa) {
3298 /*
3299 * No, might be a protection or wiring change.
3300 */
3301 if ((orig_l3 & PTE_SW_MANAGED) != 0 &&
3302 (new_l3 & PTE_W) != 0)
3303 vm_page_aflag_set(m, PGA_WRITEABLE);
3304 goto validate;
3305 }
3306
3307 /*
3308 * The physical page has changed. Temporarily invalidate
3309 * the mapping. This ensures that all threads sharing the
3310 * pmap keep a consistent view of the mapping, which is
3311 * necessary for the correct handling of COW faults. It
3312 * also permits reuse of the old mapping's PV entry,
3313 * avoiding an allocation.
3314 *
3315 * For consistency, handle unmanaged mappings the same way.
3316 */
3317 orig_l3 = pmap_load_clear(l3);
3318 KASSERT(PTE_TO_PHYS(orig_l3) == opa,
3319 ("pmap_enter: unexpected pa update for %#lx", va));
3320 if ((orig_l3 & PTE_SW_MANAGED) != 0) {
3321 om = PHYS_TO_VM_PAGE(opa);
3322
3323 /*
3324 * The pmap lock is sufficient to synchronize with
3325 * concurrent calls to pmap_page_test_mappings() and
3326 * pmap_ts_referenced().
3327 */
3328 if ((orig_l3 & PTE_D) != 0)
3329 vm_page_dirty(om);
3330 if ((orig_l3 & PTE_A) != 0)
3331 vm_page_aflag_set(om, PGA_REFERENCED);
3332 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3333 pv = pmap_pvh_remove(&om->md, pmap, va);
3334 KASSERT(pv != NULL,
3335 ("pmap_enter: no PV entry for %#lx", va));
3336 if ((new_l3 & PTE_SW_MANAGED) == 0)
3337 free_pv_entry(pmap, pv);
3338 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3339 TAILQ_EMPTY(&om->md.pv_list) &&
3340 ((om->flags & PG_FICTITIOUS) != 0 ||
3341 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3342 vm_page_aflag_clear(om, PGA_WRITEABLE);
3343 }
3344 pmap_invalidate_page(pmap, va);
3345 orig_l3 = 0;
3346 } else {
3347 /*
3348 * Increment the counters.
3349 */
3350 if ((new_l3 & PTE_SW_WIRED) != 0)
3351 pmap->pm_stats.wired_count++;
3352 pmap_resident_count_inc(pmap, 1);
3353 }
3354 /*
3355 * Enter on the PV list if part of our managed memory.
3356 */
3357 if ((new_l3 & PTE_SW_MANAGED) != 0) {
3358 if (pv == NULL) {
3359 pv = get_pv_entry(pmap, &lock);
3360 pv->pv_va = va;
3361 }
3362 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3363 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3364 m->md.pv_gen++;
3365 if ((new_l3 & PTE_W) != 0)
3366 vm_page_aflag_set(m, PGA_WRITEABLE);
3367 }
3368
3369 validate:
3370 /*
3371 * Sync the i-cache on all harts before updating the PTE
3372 * if the new PTE is executable.
3373 */
3374 if (prot & VM_PROT_EXECUTE)
3375 pmap_sync_icache(pmap, va, PAGE_SIZE);
3376
3377 /*
3378 * Update the L3 entry.
3379 */
3380 if (orig_l3 != 0) {
3381 orig_l3 = pmap_load_store(l3, new_l3);
3382 pmap_invalidate_page(pmap, va);
3383 KASSERT(PTE_TO_PHYS(orig_l3) == pa,
3384 ("pmap_enter: invalid update"));
3385 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) ==
3386 (PTE_D | PTE_SW_MANAGED))
3387 vm_page_dirty(m);
3388 } else {
3389 pmap_store(l3, new_l3);
3390 }
3391
3392 #if VM_NRESERVLEVEL > 0
3393 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES &&
3394 (m->flags & PG_FICTITIOUS) == 0 &&
3395 vm_reserv_level_iffullpop(m) == 0)
3396 (void)pmap_promote_l2(pmap, l2, va, mpte, &lock);
3397 #endif
3398
3399 rv = KERN_SUCCESS;
3400 out:
3401 if (lock != NULL)
3402 rw_wunlock(lock);
3403 rw_runlock(&pvh_global_lock);
3404 PMAP_UNLOCK(pmap);
3405 return (rv);
3406 }
3407
3408 /*
3409 * Release a page table page reference after a failed attempt to create a
3410 * mapping.
3411 */
3412 static void
3413 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t l2pg)
3414 {
3415 struct spglist free;
3416
3417 SLIST_INIT(&free);
3418 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) {
3419 /*
3420 * Although "va" is not mapped, paging-structure
3421 * caches could nonetheless have entries that
3422 * refer to the freed page table pages.
3423 * Invalidate those entries.
3424 */
3425 pmap_invalidate_page(pmap, va);
3426 vm_page_free_pages_toq(&free, true);
3427 }
3428 }
3429
3430 /*
3431 * Tries to create a read- and/or execute-only 2MB page mapping. Returns
3432 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
3433 * value. See pmap_enter_l2() for the possible error values when "no sleep",
3434 * "no replace", and "no reclaim" are specified.
3435 */
3436 static int
3437 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3438 struct rwlock **lockp)
3439 {
3440 pd_entry_t new_l2;
3441 pn_t pn;
3442
3443 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3444
3445 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
3446 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V |
3447 pmap_memattr_bits(m->md.pv_memattr));
3448 if ((m->oflags & VPO_UNMANAGED) == 0)
3449 new_l2 |= PTE_SW_MANAGED;
3450 if ((prot & VM_PROT_EXECUTE) != 0)
3451 new_l2 |= PTE_X;
3452 if (va < VM_MAXUSER_ADDRESS)
3453 new_l2 |= PTE_U;
3454 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
3455 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp));
3456 }
3457
3458 /*
3459 * Returns true if every page table entry in the specified page table is
3460 * zero.
3461 */
3462 static bool
3463 pmap_every_pte_zero(vm_paddr_t pa)
3464 {
3465 pt_entry_t *pt_end, *pte;
3466
3467 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
3468 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
3469 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
3470 if (*pte != 0)
3471 return (false);
3472 }
3473 return (true);
3474 }
3475
3476 /*
3477 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if
3478 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
3479 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
3480 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists
3481 * within the 2MB virtual address range starting at the specified virtual
3482 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
3483 * 2MB page mapping already exists at the specified virtual address. Returns
3484 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
3485 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
3486 * and a PV entry allocation failed.
3487 *
3488 * The parameter "m" is only used when creating a managed, writeable mapping.
3489 */
3490 static int
3491 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
3492 vm_page_t m, struct rwlock **lockp)
3493 {
3494 struct spglist free;
3495 pd_entry_t *l2, *l3, oldl2;
3496 vm_offset_t sva;
3497 vm_page_t l2pg, mt;
3498 vm_page_t uwptpg;
3499
3500 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3501
3502 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3503 NULL : lockp)) == NULL) {
3504 CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page"
3505 " for va %#lx in pmap %p", va, pmap);
3506 return (KERN_RESOURCE_SHORTAGE);
3507 }
3508
3509 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
3510 l2 = &l2[pmap_l2_index(va)];
3511 if ((oldl2 = pmap_load(l2)) != 0) {
3512 KASSERT(l2pg->ref_count > 1,
3513 ("pmap_enter_l2: l2pg's ref count is too low"));
3514 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3515 if ((oldl2 & PTE_RWX) != 0) {
3516 l2pg->ref_count--;
3517 CTR2(KTR_PMAP,
3518 "pmap_enter_l2: no space for va %#lx"
3519 " in pmap %p", va, pmap);
3520 return (KERN_NO_SPACE);
3521 } else if (va < VM_MAXUSER_ADDRESS ||
3522 !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) {
3523 l2pg->ref_count--;
3524 CTR2(KTR_PMAP, "pmap_enter_l2:"
3525 " failed to replace existing mapping"
3526 " for va %#lx in pmap %p", va, pmap);
3527 return (KERN_FAILURE);
3528 }
3529 }
3530 SLIST_INIT(&free);
3531 if ((oldl2 & PTE_RWX) != 0)
3532 (void)pmap_remove_l2(pmap, l2, va,
3533 pmap_load(pmap_l1(pmap, va)), &free, lockp);
3534 else
3535 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) {
3536 l3 = pmap_l2_to_l3(l2, sva);
3537 if ((pmap_load(l3) & PTE_V) != 0 &&
3538 pmap_remove_l3(pmap, l3, sva, oldl2, &free,
3539 lockp) != 0)
3540 break;
3541 }
3542 vm_page_free_pages_toq(&free, true);
3543 if (va >= VM_MAXUSER_ADDRESS) {
3544 /*
3545 * Both pmap_remove_l2() and pmap_remove_l3() will
3546 * leave the kernel page table page zero filled.
3547 */
3548 mt = PTE_TO_VM_PAGE(pmap_load(l2));
3549 if (pmap_insert_pt_page(pmap, mt, false, false))
3550 panic("pmap_enter_l2: trie insert failed");
3551 } else
3552 KASSERT(pmap_load(l2) == 0,
3553 ("pmap_enter_l2: non-zero L2 entry %p", l2));
3554 }
3555
3556 /*
3557 * Allocate leaf ptpage for wired userspace pages.
3558 */
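/*
 * (Pre-allocating and wiring the L3 page here presumably ensures that a
 * later demotion of this wired superpage cannot fail for lack of a page
 * table page.)
 */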
3559 uwptpg = NULL;
3560 if ((new_l2 & PTE_SW_WIRED) != 0 && pmap != kernel_pmap) {
3561 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3562 if (uwptpg == NULL) {
3563 pmap_abort_ptp(pmap, va, l2pg);
3564 return (KERN_RESOURCE_SHORTAGE);
3565 }
3566 uwptpg->pindex = pmap_l2_pindex(va);
3567 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
3568 vm_page_unwire_noq(uwptpg);
3569 vm_page_free(uwptpg);
3570 pmap_abort_ptp(pmap, va, l2pg);
3571 return (KERN_RESOURCE_SHORTAGE);
3572 }
3573 pmap_resident_count_inc(pmap, 1);
3574 uwptpg->ref_count = Ln_ENTRIES;
3575 }
3576 if ((new_l2 & PTE_SW_MANAGED) != 0) {
3577 /*
3578 * Abort this mapping if its PV entry could not be created.
3579 */
3580 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
3581 pmap_abort_ptp(pmap, va, l2pg);
3582 if (uwptpg != NULL) {
3583 mt = pmap_remove_pt_page(pmap, va);
3584 KASSERT(mt == uwptpg,
3585 ("removed pt page %p, expected %p", mt,
3586 uwptpg));
3587 pmap_resident_count_dec(pmap, 1);
3588 uwptpg->ref_count = 1;
3589 vm_page_unwire_noq(uwptpg);
3590 vm_page_free(uwptpg);
3591 }
3592 CTR2(KTR_PMAP,
3593 "pmap_enter_l2: failed to create PV entry"
3594 " for va %#lx in pmap %p", va, pmap);
3595 return (KERN_RESOURCE_SHORTAGE);
3596 }
3597 if ((new_l2 & PTE_W) != 0)
3598 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3599 vm_page_aflag_set(mt, PGA_WRITEABLE);
3600 }
3601
3602 /*
3603 * Increment counters.
3604 */
3605 if ((new_l2 & PTE_SW_WIRED) != 0)
3606 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
3607 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
3608
3609 /*
3610 * Map the superpage.
3611 */
3612 pmap_store(l2, new_l2);
3613
3614 atomic_add_long(&pmap_l2_mappings, 1);
3615 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
3616 va, pmap);
3617
3618 return (KERN_SUCCESS);
3619 }
3620
3621 /*
3622 * Maps a sequence of resident pages belonging to the same object.
3623 * The sequence begins with the given page m_start. This page is
3624 * mapped at the given virtual address start. Each subsequent page is
3625 * mapped at a virtual address that is offset from start by the same
3626 * amount as the page is offset from m_start within the object. The
3627 * last page in the sequence is the page with the largest offset from
3628 * m_start that can be mapped at a virtual address less than the given
3629 * virtual address end. Not every virtual page between start and end
3630 * is mapped; only those for which a resident page exists with the
3631 * corresponding offset from m_start are mapped.
3632 */
3633 void
3634 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3635 vm_page_t m_start, vm_prot_t prot)
3636 {
3637 struct pctrie_iter pages;
3638 struct rwlock *lock;
3639 vm_offset_t va;
3640 vm_page_t m, mpte;
3641 int rv;
3642
3643 VM_OBJECT_ASSERT_LOCKED(m_start->object);
3644
3645 mpte = NULL;
3646 vm_page_iter_limit_init(&pages, m_start->object,
3647 m_start->pindex + atop(end - start));
3648 m = vm_radix_iter_lookup(&pages, m_start->pindex);
3649 lock = NULL;
3650 rw_rlock(&pvh_global_lock);
3651 PMAP_LOCK(pmap);
3652 while (m != NULL) {
3653 va = start + ptoa(m->pindex - m_start->pindex);
3654 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
3655 m->psind == 1 && pmap_ps_enabled(pmap) &&
3656 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
3657 KERN_SUCCESS || rv == KERN_NO_SPACE)) {
3658 m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE);
3659 } else {
3660 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
3661 &lock);
3662 m = vm_radix_iter_step(&pages);
3663 }
3664 }
3665 if (lock != NULL)
3666 rw_wunlock(lock);
3667 rw_runlock(&pvh_global_lock);
3668 PMAP_UNLOCK(pmap);
3669 }
3670
3671 /*
3672 * This code makes some *MAJOR* assumptions:
3673 * 1. The current pmap and the given pmap exist.
3674 * 2. The mapping is not wired.
3675 * 3. Only read access is needed.
3676 * 4. No page table pages are required.
3677 * It is, however, *MUCH* faster than pmap_enter()...
3678 */
3679
3680 void
3681 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3682 {
3683 struct rwlock *lock;
3684
3685 lock = NULL;
3686 rw_rlock(&pvh_global_lock);
3687 PMAP_LOCK(pmap);
3688 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3689 if (lock != NULL)
3690 rw_wunlock(lock);
3691 rw_runlock(&pvh_global_lock);
3692 PMAP_UNLOCK(pmap);
3693 }
3694
3695 static vm_page_t
3696 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3697 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3698 {
3699 struct spglist free;
3700 pd_entry_t *l2;
3701 pt_entry_t *l3, newl3;
3702
3703 KASSERT(!VA_IS_CLEANMAP(va) ||
3704 (m->oflags & VPO_UNMANAGED) != 0,
3705 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3706 rw_assert(&pvh_global_lock, RA_LOCKED);
3707 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3708 l2 = NULL;
3709
3710 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3711 /*
3712 * If the page table page is not resident,
3713 * we create it here.
3714 */
3715 if (va < VM_MAXUSER_ADDRESS) {
3716 vm_pindex_t l2pindex;
3717
3718 /*
3719 * Calculate the page table page index.
3720 */
3721 l2pindex = pmap_l2_pindex(va);
3722 if (mpte && (mpte->pindex == l2pindex)) {
3723 mpte->ref_count++;
3724 } else {
3725 /*
3726 * Get the l2 entry
3727 */
3728 l2 = pmap_l2(pmap, va);
3729
3730 /*
3731 * If the page table page is mapped, we just increment
3732 * the hold count, and activate it. Otherwise, we
3733 * attempt to allocate a page table page. If this
3734 * attempt fails, we don't retry. Instead, we give up.
3735 */
3736 if (l2 != NULL && pmap_load(l2) != 0) {
3737 if ((pmap_load(l2) & PTE_RWX) != 0)
3738 return (NULL);
3739 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3740 mpte->ref_count++;
3741 } else {
3742 /*
3743 * Pass NULL instead of the PV list lock
3744 * pointer, because we don't intend to sleep.
3745 */
3746 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3747 if (mpte == NULL)
3748 return (mpte);
3749 }
3750 }
3751 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3752 l3 = &l3[pmap_l3_index(va)];
3753 } else {
3754 mpte = NULL;
3755 l3 = pmap_l3(kernel_pmap, va);
3756 }
3757 if (l3 == NULL)
3758 panic("pmap_enter_quick_locked: No l3");
3759 if (pmap_load(l3) != 0) {
3760 if (mpte != NULL)
3761 mpte->ref_count--;
3762 return (NULL);
3763 }
3764
3765 /*
3766 * Enter on the PV list if part of our managed memory.
3767 */
3768 if ((m->oflags & VPO_UNMANAGED) == 0 &&
3769 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3770 if (mpte != NULL) {
3771 SLIST_INIT(&free);
3772 if (pmap_unwire_ptp(pmap, va, mpte, &free))
3773 vm_page_free_pages_toq(&free, false);
3774 }
3775 return (NULL);
3776 }
3777
3778 /*
3779 * Increment counters
3780 */
3781 pmap_resident_count_inc(pmap, 1);
3782
3783 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
3784 PTE_V | PTE_R | pmap_memattr_bits(m->md.pv_memattr);
3785 if ((prot & VM_PROT_EXECUTE) != 0)
3786 newl3 |= PTE_X;
3787 if ((m->oflags & VPO_UNMANAGED) == 0)
3788 newl3 |= PTE_SW_MANAGED;
3789 if (va < VM_MAX_USER_ADDRESS)
3790 newl3 |= PTE_U;
3791
3792 /*
3793 * Sync the i-cache on all harts before updating the PTE
3794 * if the new PTE is executable.
3795 */
3796 if (prot & VM_PROT_EXECUTE)
3797 pmap_sync_icache(pmap, va, PAGE_SIZE);
3798
3799 pmap_store(l3, newl3);
3800
3801 #if VM_NRESERVLEVEL > 0
3802 /*
3803 * If both the PTP and the reservation are fully populated, then attempt
3804 * promotion.
3805 */
3806 if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
3807 (mpte == NULL || mpte->ref_count == Ln_ENTRIES) &&
3808 (m->flags & PG_FICTITIOUS) == 0 &&
3809 vm_reserv_level_iffullpop(m) == 0) {
3810 if (l2 == NULL)
3811 l2 = pmap_l2(pmap, va);
3812
3813 /*
3814 * If promotion succeeds, then the next call to this function
3815 * should not be given the unmapped PTP as a hint.
3816 */
3817 if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
3818 mpte = NULL;
3819 }
3820 #endif
3821
3822 return (mpte);
3823 }
3824
3825 /*
3826 * This routine would preload page table mappings for large
3827 * physical mmap regions. On this pmap it only asserts that the
3828 * object is a device or SG object and creates no mappings.
3829 */
3830 void
3831 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3832 vm_pindex_t pindex, vm_size_t size)
3833 {
3834
3835 VM_OBJECT_ASSERT_WLOCKED(object);
3836 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3837 ("pmap_object_init_pt: non-device object"));
3838 }
3839
3840 /*
3841 * Clear the wired attribute from the mappings for the specified range of
3842 * addresses in the given pmap. Every valid mapping within that range
3843 * must have the wired attribute set. In contrast, invalid mappings
3844 * cannot have the wired attribute set, so they are ignored.
3845 *
3846 * The wired attribute of the page table entry is not a hardware feature,
3847 * so there is no need to invalidate any TLB entries.
3848 */
3849 void
3850 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3851 {
3852 vm_offset_t va_next;
3853 pd_entry_t *l0, *l1, *l2, l2e;
3854 pt_entry_t *l3, l3e;
3855 bool pv_lists_locked;
3856
3857 pv_lists_locked = false;
3858 retry:
3859 PMAP_LOCK(pmap);
3860 for (; sva < eva; sva = va_next) {
3861 if (pmap_mode == PMAP_MODE_SV48) {
3862 l0 = pmap_l0(pmap, sva);
3863 if (pmap_load(l0) == 0) {
3864 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3865 if (va_next < sva)
3866 va_next = eva;
3867 continue;
3868 }
3869 l1 = pmap_l0_to_l1(l0, sva);
3870 } else {
3871 l1 = pmap_l1(pmap, sva);
3872 }
3873
3874 if (pmap_load(l1) == 0) {
3875 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3876 if (va_next < sva)
3877 va_next = eva;
3878 continue;
3879 }
3880
3881 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3882 if (va_next < sva)
3883 va_next = eva;
3884
3885 l2 = pmap_l1_to_l2(l1, sva);
3886 if ((l2e = pmap_load(l2)) == 0)
3887 continue;
3888 if ((l2e & PTE_RWX) != 0) {
3889 if (sva + L2_SIZE == va_next && eva >= va_next) {
3890 if ((l2e & PTE_SW_WIRED) == 0)
3891 panic("pmap_unwire: l2 %#jx is missing "
3892 "PTE_SW_WIRED", (uintmax_t)l2e);
3893 pmap_clear_bits(l2, PTE_SW_WIRED);
3894 continue;
3895 } else {
3896 if (!pv_lists_locked) {
3897 pv_lists_locked = true;
3898 if (!rw_try_rlock(&pvh_global_lock)) {
3899 PMAP_UNLOCK(pmap);
3900 rw_rlock(&pvh_global_lock);
3901 /* Repeat sva. */
3902 goto retry;
3903 }
3904 }
3905 if (!pmap_demote_l2(pmap, l2, sva))
3906 panic("pmap_unwire: demotion failed");
3907 }
3908 }
3909
3910 if (va_next > eva)
3911 va_next = eva;
3912 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3913 sva += L3_SIZE) {
3914 if ((l3e = pmap_load(l3)) == 0)
3915 continue;
3916 if ((l3e & PTE_SW_WIRED) == 0)
3917 panic("pmap_unwire: l3 %#jx is missing "
3918 "PTE_SW_WIRED", (uintmax_t)l3e);
3919
3920 /*
3921 * PTE_SW_WIRED must be cleared atomically. Although the pmap
3922 * lock synchronizes access to it, another processor could be
3923 * setting PTE_D and/or PTE_A concurrently.
3924 */
3925 pmap_clear_bits(l3, PTE_SW_WIRED);
3926 pmap->pm_stats.wired_count--;
3927 }
3928 }
3929 if (pv_lists_locked)
3930 rw_runlock(&pvh_global_lock);
3931 PMAP_UNLOCK(pmap);
3932 }
3933
3934 /*
3935 * Copy the range specified by src_addr/len
3936 * from the source map to the range dst_addr/len
3937 * in the destination map.
3938 *
3939 * This routine is only advisory and need not do anything.
3940 */
3941
3942 void
3943 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3944 vm_offset_t src_addr)
3945 {
3946
3947 }
3948
3949 /*
3950 * pmap_zero_page zeros the specified hardware page by mapping
3951 * the page into KVM and using bzero to clear its contents.
3952 */
3953 void
3954 pmap_zero_page(vm_page_t m)
3955 {
3956 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3957
3958 pagezero((void *)va);
3959 }
3960
3961 /*
3962 * pmap_zero_page_area zeros the specified hardware page by mapping
3963 * the page into KVM and using bzero to clear its contents.
3964 *
3965 * off and size may not cover an area beyond a single hardware page.
3966 */
3967 void
3968 pmap_zero_page_area(vm_page_t m, int off, int size)
3969 {
3970 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3971
3972 if (off == 0 && size == PAGE_SIZE)
3973 pagezero((void *)va);
3974 else
3975 bzero((char *)va + off, size);
3976 }
3977
3978 /*
3979 * pmap_copy_page copies the specified (machine independent)
3980 * page by mapping the page into virtual memory and using
3981 * bcopy to copy the page, one machine dependent page at a
3982 * time.
3983 */
3984 void
3985 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3986 {
3987 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3988 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3989
3990 pagecopy((void *)src, (void *)dst);
3991 }
3992
3993 int unmapped_buf_allowed = 1;
3994
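/*
 * Copy "xfersize" bytes from the pages in "ma", starting at byte offset
 * "a_offset", to the pages in "mb" starting at "b_offset", one
 * page-bounded chunk at a time via the direct map.
 */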
3995 void
3996 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
3997 vm_offset_t b_offset, int xfersize)
3998 {
3999 void *a_cp, *b_cp;
4000 vm_page_t m_a, m_b;
4001 vm_paddr_t p_a, p_b;
4002 vm_offset_t a_pg_offset, b_pg_offset;
4003 int cnt;
4004
4005 while (xfersize > 0) {
4006 a_pg_offset = a_offset & PAGE_MASK;
4007 m_a = ma[a_offset >> PAGE_SHIFT];
4008 p_a = m_a->phys_addr;
4009 b_pg_offset = b_offset & PAGE_MASK;
4010 m_b = mb[b_offset >> PAGE_SHIFT];
4011 p_b = m_b->phys_addr;
4012 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4013 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4014 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
4015 panic("!DMAP a %lx", p_a);
4016 } else {
4017 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
4018 }
4019 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
4020 panic("!DMAP b %lx", p_b);
4021 } else {
4022 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
4023 }
4024 bcopy(a_cp, b_cp, cnt);
4025 a_offset += cnt;
4026 b_offset += cnt;
4027 xfersize -= cnt;
4028 }
4029 }
4030
4031 vm_offset_t
4032 pmap_quick_enter_page(vm_page_t m)
4033 {
4034
4035 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
4036 }
4037
4038 void
4039 pmap_quick_remove_page(vm_offset_t addr)
4040 {
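/*
 * Nothing to do here: pmap_quick_enter_page() returns a direct map
 * address, so no temporary mapping was created.
 */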
4041 }
4042
4043 /*
4044 * Returns true if the pmap's pv is one of the first
4045 * 16 pvs linked to from this page. This count may
4046 * be changed upwards or downwards in the future; it
4047 * is only necessary that true be returned for a small
4048 * subset of pmaps for proper page aging.
4049 */
4050 bool
4051 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4052 {
4053 struct md_page *pvh;
4054 struct rwlock *lock;
4055 pv_entry_t pv;
4056 int loops = 0;
4057 bool rv;
4058
4059 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4060 ("pmap_page_exists_quick: page %p is not managed", m));
4061 rv = false;
4062 rw_rlock(&pvh_global_lock);
4063 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4064 rw_rlock(lock);
4065 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4066 if (PV_PMAP(pv) == pmap) {
4067 rv = true;
4068 break;
4069 }
4070 loops++;
4071 if (loops >= 16)
4072 break;
4073 }
4074 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4075 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4076 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4077 if (PV_PMAP(pv) == pmap) {
4078 rv = true;
4079 break;
4080 }
4081 loops++;
4082 if (loops >= 16)
4083 break;
4084 }
4085 }
4086 rw_runlock(lock);
4087 rw_runlock(&pvh_global_lock);
4088 return (rv);
4089 }
4090
4091 /*
4092 * pmap_page_wired_mappings:
4093 *
4094 * Return the number of managed mappings to the given physical page
4095 * that are wired.
4096 */
4097 int
4098 pmap_page_wired_mappings(vm_page_t m)
4099 {
4100 struct md_page *pvh;
4101 struct rwlock *lock;
4102 pmap_t pmap;
4103 pd_entry_t *l2;
4104 pt_entry_t *l3;
4105 pv_entry_t pv;
4106 int count, md_gen, pvh_gen;
4107
4108 if ((m->oflags & VPO_UNMANAGED) != 0)
4109 return (0);
4110 rw_rlock(&pvh_global_lock);
4111 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4112 rw_rlock(lock);
4113 restart:
4114 count = 0;
4115 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4116 pmap = PV_PMAP(pv);
4117 if (!PMAP_TRYLOCK(pmap)) {
4118 md_gen = m->md.pv_gen;
4119 rw_runlock(lock);
4120 PMAP_LOCK(pmap);
4121 rw_rlock(lock);
4122 if (md_gen != m->md.pv_gen) {
4123 PMAP_UNLOCK(pmap);
4124 goto restart;
4125 }
4126 }
4127 l2 = pmap_l2(pmap, pv->pv_va);
4128 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4129 ("%s: found a 2mpage in page %p's pv list", __func__, m));
4130 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4131 if ((pmap_load(l3) & PTE_SW_WIRED) != 0)
4132 count++;
4133 PMAP_UNLOCK(pmap);
4134 }
4135 if ((m->flags & PG_FICTITIOUS) == 0) {
4136 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4137 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4138 pmap = PV_PMAP(pv);
4139 if (!PMAP_TRYLOCK(pmap)) {
4140 md_gen = m->md.pv_gen;
4141 pvh_gen = pvh->pv_gen;
4142 rw_runlock(lock);
4143 PMAP_LOCK(pmap);
4144 rw_rlock(lock);
4145 if (md_gen != m->md.pv_gen ||
4146 pvh_gen != pvh->pv_gen) {
4147 PMAP_UNLOCK(pmap);
4148 goto restart;
4149 }
4150 }
4151 l2 = pmap_l2(pmap, pv->pv_va);
4152 if ((pmap_load(l2) & PTE_SW_WIRED) != 0)
4153 count++;
4154 PMAP_UNLOCK(pmap);
4155 }
4156 }
4157 rw_runlock(lock);
4158 rw_runlock(&pvh_global_lock);
4159 return (count);
4160 }
4161
4162 /*
4163 * Returns true if the given page is mapped individually or as part of
4164 * a 2mpage. Otherwise, returns false.
4165 */
4166 bool
4167 pmap_page_is_mapped(vm_page_t m)
4168 {
4169 struct rwlock *lock;
4170 bool rv;
4171
4172 if ((m->oflags & VPO_UNMANAGED) != 0)
4173 return (false);
4174 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4175 rw_rlock(lock);
4176 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4177 ((m->flags & PG_FICTITIOUS) == 0 &&
4178 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4179 rw_runlock(lock);
4180 return (rv);
4181 }
4182
4183 static void
4184 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv,
4185 struct spglist *free, bool superpage)
4186 {
4187 struct md_page *pvh;
4188 vm_page_t mpte, mt;
4189
4190 if (superpage) {
4191 pmap_resident_count_dec(pmap, Ln_ENTRIES);
4192 pvh = pa_to_pvh(m->phys_addr);
4193 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4194 pvh->pv_gen++;
4195 if (TAILQ_EMPTY(&pvh->pv_list)) {
4196 for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
4197 if (TAILQ_EMPTY(&mt->md.pv_list) &&
4198 (mt->a.flags & PGA_WRITEABLE) != 0)
4199 vm_page_aflag_clear(mt, PGA_WRITEABLE);
4200 }
4201 mpte = pmap_remove_pt_page(pmap, pv->pv_va);
4202 if (mpte != NULL) {
4203 KASSERT(vm_page_any_valid(mpte),
4204 ("pmap_remove_pages: pte page not promoted"));
4205 pmap_resident_count_dec(pmap, 1);
4206 KASSERT(mpte->ref_count == Ln_ENTRIES,
4207 ("pmap_remove_pages: pte page ref count error"));
4208 mpte->ref_count = 0;
4209 pmap_add_delayed_free_list(mpte, free, false);
4210 }
4211 } else {
4212 pmap_resident_count_dec(pmap, 1);
4213 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4214 m->md.pv_gen++;
4215 if (TAILQ_EMPTY(&m->md.pv_list) &&
4216 (m->a.flags & PGA_WRITEABLE) != 0) {
4217 pvh = pa_to_pvh(m->phys_addr);
4218 if (TAILQ_EMPTY(&pvh->pv_list))
4219 vm_page_aflag_clear(m, PGA_WRITEABLE);
4220 }
4221 }
4222 }
4223
4224 /*
4225 * Destroy all managed, non-wired mappings in the given user-space
4226 * pmap. This pmap cannot be active on any processor besides the
4227 * caller.
4228 *
4229 * This function cannot be applied to the kernel pmap. Moreover, it
4230 * is not intended for general use. It is only to be used during
4231 * process termination. Consequently, it can be implemented in ways
4232 * that make it faster than pmap_remove(). First, it can more quickly
4233 * destroy mappings by iterating over the pmap's collection of PV
4234 * entries, rather than searching the page table. Second, it doesn't
4235 * have to test and clear the page table entries atomically, because
4236 * no processor is currently accessing the user address space. In
4237 * particular, a page table entry's dirty bit won't change state once
4238 * this function starts.
4239 */
4240 void
4241 pmap_remove_pages(pmap_t pmap)
4242 {
4243 struct spglist free;
4244 pd_entry_t ptepde;
4245 pt_entry_t *pte, tpte;
4246 vm_page_t m, mt;
4247 pv_entry_t pv;
4248 struct pv_chunk *pc, *npc;
4249 struct rwlock *lock;
4250 int64_t bit;
4251 uint64_t inuse, bitmask;
4252 int allfree, field, freed __pv_stat_used, idx;
4253 bool superpage;
4254
4255 lock = NULL;
4256
4257 SLIST_INIT(&free);
4258 rw_rlock(&pvh_global_lock);
4259 PMAP_LOCK(pmap);
4260 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4261 allfree = 1;
4262 freed = 0;
4263 for (field = 0; field < _NPCM; field++) {
4264 inuse = ~pc->pc_map[field] & pc_freemask[field];
4265 while (inuse != 0) {
4266 bit = ffsl(inuse) - 1;
4267 bitmask = 1UL << bit;
4268 idx = field * 64 + bit;
4269 pv = &pc->pc_pventry[idx];
4270 inuse &= ~bitmask;
4271
4272 pte = pmap_l1(pmap, pv->pv_va);
4273 ptepde = pmap_load(pte);
4274 pte = pmap_l1_to_l2(pte, pv->pv_va);
4275 tpte = pmap_load(pte);
4276
4277 KASSERT((tpte & PTE_V) != 0,
4278 ("L2 PTE is invalid... bogus PV entry? "
4279 "va=%#lx, pte=%#lx", pv->pv_va, tpte));
4280 if ((tpte & PTE_RWX) != 0) {
4281 superpage = true;
4282 } else {
4283 ptepde = tpte;
4284 pte = pmap_l2_to_l3(pte, pv->pv_va);
4285 tpte = pmap_load(pte);
4286 superpage = false;
4287 }
4288
4289 /*
4290 * We cannot remove wired pages from a
4291 * process' mapping at this time.
4292 */
4293 if (tpte & PTE_SW_WIRED) {
4294 allfree = 0;
4295 continue;
4296 }
4297
4298 m = PTE_TO_VM_PAGE(tpte);
4299 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4300 m < &vm_page_array[vm_page_array_size],
4301 ("pmap_remove_pages: bad pte %#jx",
4302 (uintmax_t)tpte));
4303
4304 pmap_clear(pte);
4305
4306 /*
4307 * Update the vm_page_t clean/reference bits.
4308 */
4309 if ((tpte & (PTE_D | PTE_W)) ==
4310 (PTE_D | PTE_W)) {
4311 if (superpage)
4312 for (mt = m;
4313 mt < &m[Ln_ENTRIES]; mt++)
4314 vm_page_dirty(mt);
4315 else
4316 vm_page_dirty(m);
4317 }
4318
4319 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
4320
4321 /* Mark free */
4322 pc->pc_map[field] |= bitmask;
4323
4324 pmap_remove_pages_pv(pmap, m, pv, &free,
4325 superpage);
4326 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
4327 freed++;
4328 }
4329 }
4330 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
4331 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
4332 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
4333 if (allfree) {
4334 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4335 free_pv_chunk(pc);
4336 }
4337 }
4338 if (lock != NULL)
4339 rw_wunlock(lock);
4340 pmap_invalidate_all(pmap);
4341 rw_runlock(&pvh_global_lock);
4342 PMAP_UNLOCK(pmap);
4343 vm_page_free_pages_toq(&free, false);
4344 }
4345
4346 static bool
4347 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
4348 {
4349 struct md_page *pvh;
4350 struct rwlock *lock;
4351 pd_entry_t *l2;
4352 pt_entry_t *l3, mask;
4353 pv_entry_t pv;
4354 pmap_t pmap;
4355 int md_gen, pvh_gen;
4356 bool rv;
4357
4358 mask = 0;
4359 if (modified)
4360 mask |= PTE_D;
4361 if (accessed)
4362 mask |= PTE_A;
4363
4364 rv = false;
4365 rw_rlock(&pvh_global_lock);
4366 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4367 rw_rlock(lock);
4368 restart:
4369 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4370 pmap = PV_PMAP(pv);
4371 if (!PMAP_TRYLOCK(pmap)) {
4372 md_gen = m->md.pv_gen;
4373 rw_runlock(lock);
4374 PMAP_LOCK(pmap);
4375 rw_rlock(lock);
4376 if (md_gen != m->md.pv_gen) {
4377 PMAP_UNLOCK(pmap);
4378 goto restart;
4379 }
4380 }
4381 l2 = pmap_l2(pmap, pv->pv_va);
4382 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4383 ("%s: found a 2mpage in page %p's pv list", __func__, m));
4384 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4385 rv = (pmap_load(l3) & mask) == mask;
4386 PMAP_UNLOCK(pmap);
4387 if (rv)
4388 goto out;
4389 }
4390 if ((m->flags & PG_FICTITIOUS) == 0) {
4391 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4392 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4393 pmap = PV_PMAP(pv);
4394 if (!PMAP_TRYLOCK(pmap)) {
4395 md_gen = m->md.pv_gen;
4396 pvh_gen = pvh->pv_gen;
4397 rw_runlock(lock);
4398 PMAP_LOCK(pmap);
4399 rw_rlock(lock);
4400 if (md_gen != m->md.pv_gen ||
4401 pvh_gen != pvh->pv_gen) {
4402 PMAP_UNLOCK(pmap);
4403 goto restart;
4404 }
4405 }
4406 l2 = pmap_l2(pmap, pv->pv_va);
4407 rv = (pmap_load(l2) & mask) == mask;
4408 PMAP_UNLOCK(pmap);
4409 if (rv)
4410 goto out;
4411 }
4412 }
4413 out:
4414 rw_runlock(lock);
4415 rw_runlock(&pvh_global_lock);
4416 return (rv);
4417 }
4418
4419 /*
4420 * pmap_is_modified:
4421 *
4422 * Return whether or not the specified physical page was modified
4423 * in any physical maps.
4424 */
4425 bool
4426 pmap_is_modified(vm_page_t m)
4427 {
4428
4429 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4430 ("pmap_is_modified: page %p is not managed", m));
4431
4432 /*
4433 * If the page is not busied then this check is racy.
4434 */
4435 if (!pmap_page_is_write_mapped(m))
4436 return (false);
4437 return (pmap_page_test_mappings(m, false, true));
4438 }
4439
4440 /*
4441 * pmap_is_prefaultable:
4442 *
4443 * Return whether or not the specified virtual address is eligible
4444 * for prefault.
4445 */
4446 bool
4447 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4448 {
4449 pt_entry_t *l3;
4450 bool rv;
4451
4452 /*
4453 * Return true if and only if the L3 entry for the specified virtual
4454 * address is allocated but invalid.
4455 */
4456 rv = false;
4457 PMAP_LOCK(pmap);
4458 l3 = pmap_l3(pmap, addr);
4459 if (l3 != NULL && pmap_load(l3) == 0) {
4460 rv = true;
4461 }
4462 PMAP_UNLOCK(pmap);
4463 return (rv);
4464 }
4465
4466 /*
4467 * pmap_is_referenced:
4468 *
4469 * Return whether or not the specified physical page was referenced
4470 * in any physical maps.
4471 */
4472 bool
4473 pmap_is_referenced(vm_page_t m)
4474 {
4475
4476 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4477 ("pmap_is_referenced: page %p is not managed", m));
4478 return (pmap_page_test_mappings(m, true, false));
4479 }
4480
4481 /*
4482 * Clear the write and modified bits in each of the given page's mappings.
4483 */
4484 void
4485 pmap_remove_write(vm_page_t m)
4486 {
4487 struct md_page *pvh;
4488 struct rwlock *lock;
4489 pmap_t pmap;
4490 pd_entry_t *l2;
4491 pt_entry_t *l3, oldl3, newl3;
4492 pv_entry_t next_pv, pv;
4493 vm_offset_t va;
4494 int md_gen, pvh_gen;
4495
4496 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4497 ("pmap_remove_write: page %p is not managed", m));
4498 vm_page_assert_busied(m);
4499
4500 if (!pmap_page_is_write_mapped(m))
4501 return;
4502 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4503 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4504 pa_to_pvh(VM_PAGE_TO_PHYS(m));
4505 rw_rlock(&pvh_global_lock);
4506 retry_pv_loop:
4507 rw_wlock(lock);
4508 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4509 pmap = PV_PMAP(pv);
4510 if (!PMAP_TRYLOCK(pmap)) {
4511 pvh_gen = pvh->pv_gen;
4512 rw_wunlock(lock);
4513 PMAP_LOCK(pmap);
4514 rw_wlock(lock);
4515 if (pvh_gen != pvh->pv_gen) {
4516 PMAP_UNLOCK(pmap);
4517 rw_wunlock(lock);
4518 goto retry_pv_loop;
4519 }
4520 }
4521 va = pv->pv_va;
4522 l2 = pmap_l2(pmap, va);
4523 if ((pmap_load(l2) & PTE_W) != 0)
4524 (void)pmap_demote_l2_locked(pmap, l2, va, &lock);
4525 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4526 ("inconsistent pv lock %p %p for page %p",
4527 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4528 PMAP_UNLOCK(pmap);
4529 }
4530 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4531 pmap = PV_PMAP(pv);
4532 if (!PMAP_TRYLOCK(pmap)) {
4533 pvh_gen = pvh->pv_gen;
4534 md_gen = m->md.pv_gen;
4535 rw_wunlock(lock);
4536 PMAP_LOCK(pmap);
4537 rw_wlock(lock);
4538 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4539 PMAP_UNLOCK(pmap);
4540 rw_wunlock(lock);
4541 goto retry_pv_loop;
4542 }
4543 }
4544 l2 = pmap_l2(pmap, pv->pv_va);
4545 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4546 ("%s: found a 2mpage in page %p's pv list", __func__, m));
4547 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4548 oldl3 = pmap_load(l3);
4549 retry:
4550 if ((oldl3 & PTE_W) != 0) {
4551 newl3 = oldl3 & ~(PTE_D | PTE_W);
4552 if (!atomic_fcmpset_long(l3, &oldl3, newl3))
4553 goto retry;
4554 if ((oldl3 & PTE_D) != 0)
4555 vm_page_dirty(m);
4556 pmap_invalidate_page(pmap, pv->pv_va);
4557 }
4558 PMAP_UNLOCK(pmap);
4559 }
4560 rw_wunlock(lock);
4561 vm_page_aflag_clear(m, PGA_WRITEABLE);
4562 rw_runlock(&pvh_global_lock);
4563 }
4564
4565 /*
4566 * pmap_ts_referenced:
4567 *
4568 * Return a count of reference bits for a page, clearing those bits.
4569 * It is not necessary for every reference bit to be cleared, but it
4570 * is necessary that 0 only be returned when there are truly no
4571 * reference bits set.
4572 *
4573 * As an optimization, update the page's dirty field if a modified bit is
4574 * found while counting reference bits. This opportunistic update can be
4575 * performed at low cost and can eliminate the need for some future calls
4576 * to pmap_is_modified(). However, since this function stops after
4577 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4578 * dirty pages. Those dirty pages will only be detected by a future call
4579 * to pmap_is_modified().
4580 */
4581 int
4582 pmap_ts_referenced(vm_page_t m)
4583 {
4584 struct spglist free;
4585 struct md_page *pvh;
4586 struct rwlock *lock;
4587 pv_entry_t pv, pvf;
4588 pmap_t pmap;
4589 pd_entry_t *l2, l2e;
4590 pt_entry_t *l3, l3e;
4591 vm_paddr_t pa;
4592 vm_offset_t va;
4593 int cleared, md_gen, not_cleared, pvh_gen;
4594
4595 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4596 ("pmap_ts_referenced: page %p is not managed", m));
4597 SLIST_INIT(&free);
4598 cleared = 0;
4599 pa = VM_PAGE_TO_PHYS(m);
4600 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4601
4602 lock = PHYS_TO_PV_LIST_LOCK(pa);
4603 rw_rlock(&pvh_global_lock);
4604 rw_wlock(lock);
4605 retry:
4606 not_cleared = 0;
4607 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4608 goto small_mappings;
4609 pv = pvf;
4610 do {
4611 pmap = PV_PMAP(pv);
4612 if (!PMAP_TRYLOCK(pmap)) {
4613 pvh_gen = pvh->pv_gen;
4614 rw_wunlock(lock);
4615 PMAP_LOCK(pmap);
4616 rw_wlock(lock);
4617 if (pvh_gen != pvh->pv_gen) {
4618 PMAP_UNLOCK(pmap);
4619 goto retry;
4620 }
4621 }
4622 va = pv->pv_va;
4623 l2 = pmap_l2(pmap, va);
4624 l2e = pmap_load(l2);
4625 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
4626 /*
4627 * Although l2e is mapping a 2MB page, because
4628 * this function is called at a 4KB page granularity,
4629 * we only update the 4KB page under test.
4630 */
4631 vm_page_dirty(m);
4632 }
4633 if ((l2e & PTE_A) != 0) {
4634 /*
4635 * Since this reference bit is shared by 512 4KB
4636 * pages, it should not be cleared every time it is
4637 * tested. Apply a simple "hash" function on the
4638 * physical page number, the virtual superpage number,
4639 * and the pmap address to select one 4KB page out of
4640 * the 512 on which testing the reference bit will
4641 * result in clearing that reference bit. This
4642 * function is designed to avoid the selection of the
4643 * same 4KB page for every 2MB page mapping.
4644 *
4645 * On demotion, a mapping that hasn't been referenced
4646 * is simply destroyed. To avoid the possibility of a
4647 * subsequent page fault on a demoted wired mapping,
4648 * always leave its reference bit set. Moreover,
4649 * since the superpage is wired, the current state of
4650 * its reference bit won't affect page replacement.
4651 */
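/*
 * Concretely, with Ln_ENTRIES == 512 the low nine bits of
 * (pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^ (uintptr_t)pmap select one
 * of the 512 constituent 4KB pages; PTE_A is cleared only when the
 * page under test is the selected one and the mapping is not wired.
 */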
4652 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4653 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4654 (l2e & PTE_SW_WIRED) == 0) {
4655 pmap_clear_bits(l2, PTE_A);
4656 pmap_invalidate_page(pmap, va);
4657 cleared++;
4658 } else
4659 not_cleared++;
4660 }
4661 PMAP_UNLOCK(pmap);
4662 /* Rotate the PV list if it has more than one entry. */
4663 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4664 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4665 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4666 pvh->pv_gen++;
4667 }
4668 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4669 goto out;
4670 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4671 small_mappings:
4672 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4673 goto out;
4674 pv = pvf;
4675 do {
4676 pmap = PV_PMAP(pv);
4677 if (!PMAP_TRYLOCK(pmap)) {
4678 pvh_gen = pvh->pv_gen;
4679 md_gen = m->md.pv_gen;
4680 rw_wunlock(lock);
4681 PMAP_LOCK(pmap);
4682 rw_wlock(lock);
4683 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4684 PMAP_UNLOCK(pmap);
4685 goto retry;
4686 }
4687 }
4688 l2 = pmap_l2(pmap, pv->pv_va);
4689
4690 KASSERT((pmap_load(l2) & PTE_RX) == 0,
4691 ("pmap_ts_referenced: found an invalid l2 table"));
4692
4693 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4694 l3e = pmap_load(l3);
4695 if ((l3e & PTE_D) != 0)
4696 vm_page_dirty(m);
4697 if ((l3e & PTE_A) != 0) {
4698 if ((l3e & PTE_SW_WIRED) == 0) {
4699 /*
4700 * Wired pages cannot be paged out so
4701 * doing accessed bit emulation for
4702 * them is wasted effort. We do the
4703 * hard work for unwired pages only.
4704 */
4705 pmap_clear_bits(l3, PTE_A);
4706 pmap_invalidate_page(pmap, pv->pv_va);
4707 cleared++;
4708 } else
4709 not_cleared++;
4710 }
4711 PMAP_UNLOCK(pmap);
4712 /* Rotate the PV list if it has more than one entry. */
4713 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4714 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4715 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4716 m->md.pv_gen++;
4717 }
4718 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4719 not_cleared < PMAP_TS_REFERENCED_MAX);
4720 out:
4721 rw_wunlock(lock);
4722 rw_runlock(&pvh_global_lock);
4723 vm_page_free_pages_toq(&free, false);
4724 return (cleared + not_cleared);
4725 }
4726
4727 /*
4728 * Apply the given advice to the specified range of addresses within the
4729 * given pmap. Depending on the advice, clear the referenced and/or
4730 * modified flags in each mapping and set the mapped page's dirty field.
4731 */
4732 void
4733 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4734 {
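/* Currently a no-op: the advice is ignored by this pmap. */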
4735 }
4736
4737 /*
4738 * Clear the modify bits on the specified physical page.
4739 */
4740 void
4741 pmap_clear_modify(vm_page_t m)
4742 {
4743 struct md_page *pvh;
4744 struct rwlock *lock;
4745 pmap_t pmap;
4746 pv_entry_t next_pv, pv;
4747 pd_entry_t *l2, oldl2;
4748 pt_entry_t *l3;
4749 vm_offset_t va;
4750 int md_gen, pvh_gen;
4751
4752 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4753 ("%s: page %p is not managed", __func__, m));
4754 vm_page_assert_busied(m);
4755
4756 if (!pmap_page_is_write_mapped(m))
4757 return;
4758
4759 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4760 pa_to_pvh(VM_PAGE_TO_PHYS(m));
4761 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4762 rw_rlock(&pvh_global_lock);
4763 rw_wlock(lock);
4764 restart:
4765 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4766 pmap = PV_PMAP(pv);
4767 if (!PMAP_TRYLOCK(pmap)) {
4768 pvh_gen = pvh->pv_gen;
4769 rw_wunlock(lock);
4770 PMAP_LOCK(pmap);
4771 rw_wlock(lock);
4772 if (pvh_gen != pvh->pv_gen) {
4773 PMAP_UNLOCK(pmap);
4774 goto restart;
4775 }
4776 }
4777 va = pv->pv_va;
4778 l2 = pmap_l2(pmap, va);
4779 oldl2 = pmap_load(l2);
4780 /* If oldl2 has PTE_W set, then it also has PTE_D set. */
4781 if ((oldl2 & PTE_W) != 0 &&
4782 pmap_demote_l2_locked(pmap, l2, va, &lock) &&
4783 (oldl2 & PTE_SW_WIRED) == 0) {
4784 /*
4785 * Write protect the mapping to a single page so that
4786 * a subsequent write access may repromote.
4787 */
4788 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
4789 l3 = pmap_l2_to_l3(l2, va);
4790 pmap_clear_bits(l3, PTE_D | PTE_W);
4791 vm_page_dirty(m);
4792 pmap_invalidate_page(pmap, va);
4793 }
4794 PMAP_UNLOCK(pmap);
4795 }
4796 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4797 pmap = PV_PMAP(pv);
4798 if (!PMAP_TRYLOCK(pmap)) {
4799 md_gen = m->md.pv_gen;
4800 pvh_gen = pvh->pv_gen;
4801 rw_wunlock(lock);
4802 PMAP_LOCK(pmap);
4803 rw_wlock(lock);
4804 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4805 PMAP_UNLOCK(pmap);
4806 goto restart;
4807 }
4808 }
4809 l2 = pmap_l2(pmap, pv->pv_va);
4810 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4811 ("%s: found a 2mpage in page %p's pv list", __func__, m));
4812 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4813 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
4814 pmap_clear_bits(l3, PTE_D | PTE_W);
4815 pmap_invalidate_page(pmap, pv->pv_va);
4816 }
4817 PMAP_UNLOCK(pmap);
4818 }
4819 rw_wunlock(lock);
4820 rw_runlock(&pvh_global_lock);
4821 }
4822
4823 void *
4824 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4825 {
4826
4827 return ((void *)PHYS_TO_DMAP(pa));
4828 }
4829
4830 void
4831 pmap_unmapbios(void *p, vm_size_t size)
4832 {
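/* Nothing to undo: pmap_mapbios() simply returns a direct map address. */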
4833 }
4834
4835 /*
4836 * Sets the memory attribute for the specified page.
4837 */
4838 void
4839 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4840 {
4841
4842 m->md.pv_memattr = ma;
4843
4844 /*
4845 * If "m" is a normal page, update its direct mapping. This update
4846 * can be relied upon to perform any cache operations that are
4847 * required for data coherence.
4848 */
4849 if ((m->flags & PG_FICTITIOUS) == 0 &&
4850 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4851 m->md.pv_memattr) != 0)
4852 panic("memory attribute change on the direct map failed");
4853 }
4854
4855 /*
4856 * Changes the specified virtual address range's memory type to that given by
4857 * the parameter "mode". The specified virtual address range must be
4858 * completely contained within either the direct map or the kernel map.
4859 *
4860 * Returns zero if the change completed successfully, and either EINVAL or
4861 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
4862 * of the virtual address range was not mapped, and ENOMEM is returned if
4863 * there was insufficient memory available to complete the change. In the
4864 * latter case, the memory type may have been changed on some part of the
4865 * virtual address range.
4866 */
4867 int
4868 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4869 {
4870 int error;
4871
4872 PMAP_LOCK(kernel_pmap);
4873 error = pmap_change_attr_locked(va, size, mode);
4874 PMAP_UNLOCK(kernel_pmap);
4875 return (error);
4876 }
4877
4878 static int
4879 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4880 {
4881 vm_offset_t base, offset, tmpva;
4882 vm_paddr_t phys;
4883 pd_entry_t *l1, l1e;
4884 pd_entry_t *l2, l2e;
4885 pt_entry_t *l3, l3e;
4886 pt_entry_t bits, mask;
4887 bool anychanged = false;
4888 int error = 0;
4889
4890 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4891 base = trunc_page(va);
4892 offset = va & PAGE_MASK;
4893 size = round_page(offset + size);
4894
4895 if (!VIRT_IN_DMAP(base) &&
4896 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
4897 return (EINVAL);
4898
4899 bits = pmap_memattr_bits(mode);
4900 mask = memattr_mask;
4901
4902 /* First loop: perform PTE validation and demotions as necessary. */
4903 for (tmpva = base; tmpva < base + size; ) {
4904 l1 = pmap_l1(kernel_pmap, tmpva);
4905 if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0)
4906 return (EINVAL);
4907 if ((l1e & PTE_RWX) != 0) {
4908 /*
4909 * If the existing PTE has the correct attributes, then
4910 * no need to demote.
4911 */
4912 if ((l1e & mask) == bits) {
4913 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
4914 continue;
4915 }
4916
4917 /*
4918 * If the 1GB page fits in the remaining range, we
4919 * don't need to demote.
4920 */
4921 if ((tmpva & L1_OFFSET) == 0 &&
4922 tmpva + L1_SIZE <= base + size) {
4923 tmpva += L1_SIZE;
4924 continue;
4925 }
4926
4927 if (!pmap_demote_l1(kernel_pmap, l1, tmpva))
4928 return (EINVAL);
4929 }
4930 l2 = pmap_l1_to_l2(l1, tmpva);
4931 if (((l2e = pmap_load(l2)) & PTE_V) == 0)
4932 return (EINVAL);
4933 if ((l2e & PTE_RWX) != 0) {
4934 /*
4935 * If the existing PTE has the correct attributes, then
4936 * no need to demote.
4937 */
4938 if ((l2e & mask) == bits) {
4939 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
4940 continue;
4941 }
4942
4943 /*
4944 * If the 2MB page fits in the remaining range, we
4945 * don't need to demote.
4946 */
4947 if ((tmpva & L2_OFFSET) == 0 &&
4948 tmpva + L2_SIZE <= base + size) {
4949 tmpva += L2_SIZE;
4950 continue;
4951 }
4952
4953 if (!pmap_demote_l2(kernel_pmap, l2, tmpva))
4954 panic("l2 demotion failed");
4955 }
4956 l3 = pmap_l2_to_l3(l2, tmpva);
4957 if (((l3e = pmap_load(l3)) & PTE_V) == 0)
4958 return (EINVAL);
4959
4960 tmpva += PAGE_SIZE;
4961 }
4962
4963 /* Second loop: perform PTE updates. */
4964 for (tmpva = base; tmpva < base + size; ) {
4965 l1 = pmap_l1(kernel_pmap, tmpva);
4966 l1e = pmap_load(l1);
4967 if ((l1e & PTE_RWX) != 0) {
4968 /* Unchanged. */
4969 if ((l1e & mask) == bits) {
4970 tmpva += L1_SIZE;
4971 continue;
4972 }
4973
4974 l1e &= ~mask;
4975 l1e |= bits;
4976 pmap_store(l1, l1e);
4977 anychanged = true;
4978
4979 /* Update corresponding DMAP entry */
4980 phys = L1PTE_TO_PHYS(l1e);
4981 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
4982 error = pmap_change_attr_locked(
4983 PHYS_TO_DMAP(phys), L1_SIZE, mode);
4984 if (error != 0)
4985 break;
4986 }
4987 tmpva += L1_SIZE;
4988 continue;
4989 }
4990
4991 l2 = pmap_l1_to_l2(l1, tmpva);
4992 l2e = pmap_load(l2);
4993 if ((l2e & PTE_RWX) != 0) {
4994 /* Unchanged. */
4995 if ((l2e & mask) == bits) {
4996 tmpva += L2_SIZE;
4997 continue;
4998 }
4999
5000 l2e &= ~mask;
5001 l2e |= bits;
5002 pmap_store(l2, l2e);
5003 anychanged = true;
5004
5005 /* Update corresponding DMAP entry */
5006 phys = L2PTE_TO_PHYS(l2e);
5007 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5008 error = pmap_change_attr_locked(
5009 PHYS_TO_DMAP(phys), L2_SIZE, mode);
5010 if (error != 0)
5011 break;
5012 }
5013 tmpva += L2_SIZE;
5014 continue;
5015 }
5016
5017 l3 = pmap_l2_to_l3(l2, tmpva);
5018 l3e = pmap_load(l3);
5019
5020 /* Unchanged. */
5021 if ((l3e & mask) == bits) {
5022 tmpva += PAGE_SIZE;
5023 continue;
5024 }
5025
5026 l3e &= ~mask;
5027 l3e |= bits;
5028 pmap_store(l3, l3e);
5029 anychanged = true;
5030
5031 phys = PTE_TO_PHYS(l3e);
5032 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5033 error = pmap_change_attr_locked(PHYS_TO_DMAP(phys),
5034 L3_SIZE, mode);
5035 if (error != 0)
5036 break;
5037 }
5038 tmpva += PAGE_SIZE;
5039 }
5040
5041 if (anychanged) {
5042 pmap_invalidate_range(kernel_pmap, base, tmpva);
5043 if (mode == VM_MEMATTR_UNCACHEABLE)
5044 cpu_dcache_wbinv_range(base, size);
5045 }
5046
5047 return (error);
5048 }
5049
5050 /*
5051 * Perform the pmap work for mincore(2). If the page is not both referenced and
5052 * modified by this pmap, returns its physical address so that the caller can
5053 * find other mappings.
5054 */
5055 int
5056 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
5057 {
5058 pt_entry_t *l2, *l3, tpte;
5059 vm_paddr_t pa;
5060 int val;
5061 bool managed;
5062
5063 PMAP_LOCK(pmap);
5064 l2 = pmap_l2(pmap, addr);
5065 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
5066 if ((tpte & PTE_RWX) != 0) {
5067 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
5068 val = MINCORE_INCORE | MINCORE_PSIND(1);
5069 } else {
5070 l3 = pmap_l2_to_l3(l2, addr);
5071 tpte = pmap_load(l3);
5072 if ((tpte & PTE_V) == 0) {
5073 PMAP_UNLOCK(pmap);
5074 return (0);
5075 }
5076 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
5077 val = MINCORE_INCORE;
5078 }
5079
5080 if ((tpte & PTE_D) != 0)
5081 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5082 if ((tpte & PTE_A) != 0)
5083 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5084 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
5085 } else {
5086 managed = false;
5087 val = 0;
5088 }
5089 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5090 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
5091 *pap = pa;
5092 }
5093 PMAP_UNLOCK(pmap);
5094 return (val);
5095 }
5096
5097 void
5098 pmap_activate_sw(struct thread *td)
5099 {
5100 pmap_t oldpmap, pmap;
5101 u_int hart;
5102
5103 oldpmap = PCPU_GET(curpmap);
5104 pmap = vmspace_pmap(td->td_proc->p_vmspace);
5105 if (pmap == oldpmap)
5106 return;
5107 csr_write(satp, pmap->pm_satp);
5108
5109 hart = PCPU_GET(hart);
5110 #ifdef SMP
5111 CPU_SET_ATOMIC(hart, &pmap->pm_active);
5112 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
5113 #else
5114 CPU_SET(hart, &pmap->pm_active);
5115 CPU_CLR(hart, &oldpmap->pm_active);
5116 #endif
5117 PCPU_SET(curpmap, pmap);
5118
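/*
 * A satp write by itself does not flush cached translations, so
 * explicitly invalidate them for the new address space below.
 */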
5119 sfence_vma();
5120 }
5121
5122 void
5123 pmap_activate(struct thread *td)
5124 {
5125
5126 critical_enter();
5127 pmap_activate_sw(td);
5128 critical_exit();
5129 }
5130
5131 void
5132 pmap_activate_boot(pmap_t pmap)
5133 {
5134 u_int hart;
5135
5136 hart = PCPU_GET(hart);
5137 #ifdef SMP
5138 CPU_SET_ATOMIC(hart, &pmap->pm_active);
5139 #else
5140 CPU_SET(hart, &pmap->pm_active);
5141 #endif
5142 PCPU_SET(curpmap, pmap);
5143 }
5144
5145 void
5146 pmap_active_cpus(pmap_t pmap, cpuset_t *res)
5147 {
5148 *res = pmap->pm_active;
5149 }
5150
5151 void
5152 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
5153 {
5154 cpuset_t mask;
5155
5156 /*
5157 * From the RISC-V User-Level ISA V2.2:
5158 *
5159 * "To make a store to instruction memory visible to all
5160 * RISC-V harts, the writing hart has to execute a data FENCE
5161 * before requesting that all remote RISC-V harts execute a
5162 * FENCE.I."
5163 *
5164 * However, this is slightly misleading; we still need to
5165 * perform a FENCE.I for the local hart, as FENCE does nothing
5166 * for its icache. FENCE.I alone is also sufficient for the
5167 * local hart.
5168 */
5169 sched_pin();
5170 mask = all_harts;
5171 CPU_CLR(PCPU_GET(hart), &mask);
5172 fence_i();
5173 if (!CPU_EMPTY(&mask) && smp_started) {
5174 fence();
5175 sbi_remote_fence_i(mask.__bits);
5176 }
5177 sched_unpin();
5178 }
5179
5180 /*
5181 * Increase the starting virtual address of the given mapping if a
5182 * different alignment might result in more superpage mappings.
5183 */
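/*
 * For example, if "offset" falls 0x45000 bytes into a 2MB superpage and
 * "*addr" is 0x10000 bytes past a 2MB boundary, "*addr" is advanced so
 * that (*addr & L2_OFFSET) == 0x45000, aligning the virtual pages with a
 * potential 2MB physical reservation.
 */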
5184 void
5185 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5186 vm_offset_t *addr, vm_size_t size)
5187 {
5188 vm_offset_t superpage_offset;
5189
5190 if (size < L2_SIZE)
5191 return;
5192 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5193 offset += ptoa(object->pg_color);
5194 superpage_offset = offset & L2_OFFSET;
5195 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
5196 (*addr & L2_OFFSET) == superpage_offset)
5197 return;
5198 if ((*addr & L2_OFFSET) < superpage_offset)
5199 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
5200 else
5201 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
5202 }
5203
5204 /**
5205 * Get the kernel virtual address of a set of physical pages. If there are
5206 * physical addresses not covered by the DMAP perform a transient mapping
5207 * that will be removed when calling pmap_unmap_io_transient.
5208 *
5209 * \param page The pages the caller wishes to obtain the virtual
5210 * address on the kernel memory map.
5211 * \param vaddr On return contains the kernel virtual memory address
5212 * of the pages passed in the page parameter.
5213 * \param count Number of pages passed in.
5214 * \param can_fault true if the thread using the mapped pages can take
5215 * page faults, false otherwise.
5216 *
5217 * \returns true if the caller must call pmap_unmap_io_transient when
5218 * finished or false otherwise.
5219 *
5220 */
5221 bool
5222 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5223 bool can_fault)
5224 {
5225 vm_paddr_t paddr;
5226 bool needs_mapping;
5227 int error __diagused, i;
5228
5229 /*
5230 * Allocate any KVA space that we need; this is done in a separate
5231 * loop to avoid calling vmem_alloc() while pinned.
5232 */
5233 needs_mapping = false;
5234 for (i = 0; i < count; i++) {
5235 paddr = VM_PAGE_TO_PHYS(page[i]);
5236 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
5237 error = vmem_alloc(kernel_arena, PAGE_SIZE,
5238 M_BESTFIT | M_WAITOK, &vaddr[i]);
5239 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
5240 needs_mapping = true;
5241 } else {
5242 vaddr[i] = PHYS_TO_DMAP(paddr);
5243 }
5244 }
5245
5246 /* Exit early if everything is covered by the DMAP */
5247 if (!needs_mapping)
5248 return (false);
5249
5250 if (!can_fault)
5251 sched_pin();
5252 for (i = 0; i < count; i++) {
5253 paddr = VM_PAGE_TO_PHYS(page[i]);
5254 if (paddr >= DMAP_MAX_PHYSADDR) {
5255 panic(
5256 "pmap_map_io_transient: TODO: Map out of DMAP data");
5257 }
5258 }
5259
5260 return (needs_mapping);
5261 }
5262
5263 void
5264 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5265 bool can_fault)
5266 {
5267 vm_paddr_t paddr;
5268 int i;
5269
5270 if (!can_fault)
5271 sched_unpin();
5272 for (i = 0; i < count; i++) {
5273 paddr = VM_PAGE_TO_PHYS(page[i]);
5274 if (paddr >= DMAP_MAX_PHYSADDR) {
5275 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
5276 }
5277 }
5278 }
5279
5280 bool
5281 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
5282 {
5283
5284 return (mode >= VM_MEMATTR_DEFAULT && mode <= VM_MEMATTR_LAST);
5285 }
5286
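/*
 * Look up the page table entries mapping "va" in "pmap". Pointers to the
 * L1, L2, and L3 entries are returned via "l1", "l2", and "l3"; "*l2" and
 * "*l3" are left NULL when "va" is covered by a 1GB or 2MB leaf entry,
 * respectively.
 */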
5287 bool
5288 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
5289 pt_entry_t **l3)
5290 {
5291 pd_entry_t *l1p, *l2p;
5292
5293 /* Get l1 directory entry. */
5294 l1p = pmap_l1(pmap, va);
5295 *l1 = l1p;
5296
5297 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0)
5298 return (false);
5299
5300 if ((pmap_load(l1p) & PTE_RX) != 0) {
5301 *l2 = NULL;
5302 *l3 = NULL;
5303 return (true);
5304 }
5305
5306 /* Get l2 directory entry. */
5307 l2p = pmap_l1_to_l2(l1p, va);
5308 *l2 = l2p;
5309
5310 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0)
5311 return (false);
5312
5313 if ((pmap_load(l2p) & PTE_RX) != 0) {
5314 *l3 = NULL;
5315 return (true);
5316 }
5317
5318 /* Get l3 page table entry. */
5319 *l3 = pmap_l2_to_l3(l2p, va);
5320
5321 return (true);
5322 }
5323
5324 /*
5325 * Track a range of the kernel's virtual address space that is contiguous
5326 * in various mapping attributes.
5327 */
5328 struct pmap_kernel_map_range {
5329 vm_offset_t sva;
5330 pt_entry_t attrs;
5331 int l3pages;
5332 int l2pages;
5333 int l1pages;
5334 };
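/*
 * Each line emitted by sysctl_kmaps_dump() below has the form
 *   0x<sva>-0x<eva> r<w|-><x|-><u|s><g|-> <mode> <#1GB> <#2MB> <#4KB>
 */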
5335
5336 static void
5337 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
5338 vm_offset_t eva)
5339 {
5340 char *mode;
5341 int i;
5342
5343 if (eva <= range->sva)
5344 return;
5345
5346 for (i = 0; i < nitems(memattr_bits); i++)
5347 if ((range->attrs & memattr_mask) == memattr_bits[i])
5348 break;
5349
5350 switch (i) {
5351 case VM_MEMATTR_PMA:
5352 mode = "PMA";
5353 break;
5354 case VM_MEMATTR_UNCACHEABLE:
5355 mode = "NC ";
5356 break;
5357 case VM_MEMATTR_DEVICE:
5358 mode = "IO ";
5359 break;
5360 default:
5361 mode = "???";
5362 break;
5363 }
5364
5365 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
5366 range->sva, eva,
5367 (range->attrs & PTE_W) == PTE_W ? 'w' : '-',
5368 (range->attrs & PTE_X) == PTE_X ? 'x' : '-',
5369 (range->attrs & PTE_U) == PTE_U ? 'u' : 's',
5370 (range->attrs & PTE_G) == PTE_G ? 'g' : '-',
5371 mode, range->l1pages, range->l2pages, range->l3pages);
5372
5373 /* Reset to sentinel value. */
5374 range->sva = 0xfffffffffffffffful;
5375 }
5376
5377 /*
5378 * Determine whether the attributes specified by a page table entry match those
5379 * being tracked by the current range.
5380 */
5381 static bool
5382 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
5383 {
5384
5385 return (range->attrs == attrs);
5386 }
5387
5388 static void
5389 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
5390 pt_entry_t attrs)
5391 {
5392
5393 memset(range, 0, sizeof(*range));
5394 range->sva = va;
5395 range->attrs = attrs;
5396 }
5397
5398 /*
5399 * Given a leaf PTE, derive the mapping's attributes. If they do not match
5400 * those of the current run, dump the address range and its attributes, and
5401 * begin a new run.
5402 */
5403 static void
5404 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
5405 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e)
5406 {
5407 pt_entry_t attrs;
5408
5409 /* The PTE global bit is inherited by lower levels. */
5410 attrs = l1e & PTE_G;
5411 if ((l1e & PTE_RWX) != 0) {
5412 attrs |= l1e & (PTE_RWX | PTE_U);
5413 attrs |= l1e & memattr_mask;
5414 } else if (l2e != 0)
5415 attrs |= l2e & PTE_G;
5416
5417 if ((l2e & PTE_RWX) != 0) {
5418 attrs |= l2e & (PTE_RWX | PTE_U);
5419 attrs |= l2e & memattr_mask;
5420 } else if (l3e != 0) {
5421 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);
5422 attrs |= l3e & memattr_mask;
5423 }
5424
5425 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
5426 sysctl_kmaps_dump(sb, range, va);
5427 sysctl_kmaps_reinit(range, va, attrs);
5428 }
5429 }
5430
5431 static int
5432 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
5433 {
5434 struct pmap_kernel_map_range range;
5435 struct sbuf sbuf, *sb;
5436 pd_entry_t *l1, l1e, *l2, l2e;
5437 pt_entry_t *l3, l3e;
5438 vm_offset_t sva;
5439 vm_paddr_t pa;
5440 int error, i, j, k;
5441
5442 error = sysctl_wire_old_buffer(req, 0);
5443 if (error != 0)
5444 return (error);
5445 sb = &sbuf;
5446 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
5447
5448 /* Sentinel value. */
5449 range.sva = 0xfffffffffffffffful;
5450
5451 /*
5452 * Iterate over the kernel page tables without holding the kernel pmap
5453 * lock. Kernel page table pages are never freed, so at worst we will
5454 * observe inconsistencies in the output.
5455 */
5456 sva = VM_MIN_KERNEL_ADDRESS;
5457 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) {
5458 if (i == pmap_l1_index(DMAP_MIN_ADDRESS))
5459 sbuf_printf(sb, "\nDirect map:\n");
5460 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS))
5461 sbuf_printf(sb, "\nKernel map:\n");
5462
5463 l1 = pmap_l1(kernel_pmap, sva);
5464 l1e = pmap_load(l1);
5465 if ((l1e & PTE_V) == 0) {
5466 sysctl_kmaps_dump(sb, &range, sva);
5467 sva += L1_SIZE;
5468 continue;
5469 }
5470 if ((l1e & PTE_RWX) != 0) {
5471 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0);
5472 range.l1pages++;
5473 sva += L1_SIZE;
5474 continue;
5475 }
5476 pa = PTE_TO_PHYS(l1e);
5477 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
5478
5479 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) {
5480 l2e = l2[j];
5481 if ((l2e & PTE_V) == 0) {
5482 sysctl_kmaps_dump(sb, &range, sva);
5483 sva += L2_SIZE;
5484 continue;
5485 }
5486 if ((l2e & PTE_RWX) != 0) {
5487 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0);
5488 range.l2pages++;
5489 sva += L2_SIZE;
5490 continue;
5491 }
5492 pa = PTE_TO_PHYS(l2e);
5493 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa);
5494
5495 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++,
5496 sva += L3_SIZE) {
5497 l3e = l3[k];
5498 if ((l3e & PTE_V) == 0) {
5499 sysctl_kmaps_dump(sb, &range, sva);
5500 continue;
5501 }
5502 sysctl_kmaps_check(sb, &range, sva,
5503 l1e, l2e, l3e);
5504 range.l3pages++;
5505 }
5506 }
5507 }
5508
5509 error = sbuf_finish(sb);
5510 sbuf_delete(sb);
5511 return (error);
5512 }
5513 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
5514 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
5515 NULL, 0, sysctl_kmaps, "A",
5516 "Dump kernel address layout");
5517