1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 1991 Regents of the University of California.
5 * All rights reserved.
6 * Copyright (c) 1994 John S. Dyson
7 * All rights reserved.
8 * Copyright (c) 1994 David Greenman
9 * All rights reserved.
10 * Copyright (c) 2003 Peter Wemm
11 * All rights reserved.
12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13 * All rights reserved.
14 * Copyright (c) 2014 Andrew Turner
15 * All rights reserved.
16 * Copyright (c) 2014 The FreeBSD Foundation
17 * All rights reserved.
18 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
19 * All rights reserved.
20 *
21 * This code is derived from software contributed to Berkeley by
22 * the Systems Programming Group of the University of Utah Computer
23 * Science Department and William Jolitz of UUNET Technologies Inc.
24 *
25 * Portions of this software were developed by Andrew Turner under
26 * sponsorship from The FreeBSD Foundation.
27 *
28 * Portions of this software were developed by SRI International and the
29 * University of Cambridge Computer Laboratory under DARPA/AFRL contract
30 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
31 *
32 * Portions of this software were developed by the University of Cambridge
33 * Computer Laboratory as part of the CTSRD Project, with support from the
34 * UK Higher Education Innovation Fund (HEIF).
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 */
64 /*-
65 * Copyright (c) 2003 Networks Associates Technology, Inc.
66 * All rights reserved.
67 *
68 * This software was developed for the FreeBSD Project by Jake Burkholder,
69 * Safeport Network Services, and Network Associates Laboratories, the
70 * Security Research Division of Network Associates, Inc. under
71 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
72 * CHATS research program.
73 *
74 * Redistribution and use in source and binary forms, with or without
75 * modification, are permitted provided that the following conditions
76 * are met:
77 * 1. Redistributions of source code must retain the above copyright
78 * notice, this list of conditions and the following disclaimer.
79 * 2. Redistributions in binary form must reproduce the above copyright
80 * notice, this list of conditions and the following disclaimer in the
81 * documentation and/or other materials provided with the distribution.
82 *
83 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
84 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
85 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
86 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
87 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
88 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
89 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
90 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
91 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
92 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
93 * SUCH DAMAGE.
94 */
95
96 /*
97 * Manages physical address maps.
98 *
99 * Since the information managed by this module is
100 * also stored by the logical address mapping module,
101 * this module may throw away valid virtual-to-physical
102 * mappings at almost any time. However, invalidations
103 * of virtual-to-physical mappings must be done as
104 * requested.
105 *
106 * In order to cope with hardware architectures which
107 * make virtual-to-physical map invalidates expensive,
 108 * this module may delay invalidation or reduced-protection
109 * operations until such time as they are actually
110 * necessary. This module is given full information as
111 * to which processors are currently using which maps,
 112 * and as to when physical maps must be made correct.
113 */
114
115 #include "opt_pmap.h"
116
117 #include <sys/param.h>
118 #include <sys/systm.h>
119 #include <sys/bitstring.h>
120 #include <sys/bus.h>
121 #include <sys/cpuset.h>
122 #include <sys/kernel.h>
123 #include <sys/ktr.h>
124 #include <sys/lock.h>
125 #include <sys/malloc.h>
126 #include <sys/mman.h>
127 #include <sys/msgbuf.h>
128 #include <sys/mutex.h>
129 #include <sys/physmem.h>
130 #include <sys/proc.h>
131 #include <sys/rwlock.h>
132 #include <sys/sbuf.h>
133 #include <sys/sx.h>
134 #include <sys/vmem.h>
135 #include <sys/vmmeter.h>
136 #include <sys/sched.h>
137 #include <sys/sysctl.h>
138 #include <sys/smp.h>
139
140 #include <vm/vm.h>
141 #include <vm/vm_param.h>
142 #include <vm/vm_kern.h>
143 #include <vm/vm_page.h>
144 #include <vm/vm_map.h>
145 #include <vm/vm_object.h>
146 #include <vm/vm_extern.h>
147 #include <vm/vm_pageout.h>
148 #include <vm/vm_pager.h>
149 #include <vm/vm_phys.h>
150 #include <vm/vm_radix.h>
151 #include <vm/vm_reserv.h>
152 #include <vm/vm_dumpset.h>
153 #include <vm/uma.h>
154
155 #include <machine/machdep.h>
156 #include <machine/md_var.h>
157 #include <machine/pcb.h>
158 #include <machine/sbi.h>
159 #include <machine/thead.h>
160
161 /*
162 * Boundary values for the page table page index space:
163 *
164 * L3 pages: [0, NUL2E)
165 * L2 pages: [NUL2E, NUL2E + NUL1E)
166 * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
167 *
168 * Note that these ranges are used in both SV39 and SV48 mode. In SV39 mode the
169 * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages
170 * in a set of page tables.
171 */
172 #define NUL0E Ln_ENTRIES
173 #define NUL1E (Ln_ENTRIES * NUL0E)
174 #define NUL2E (Ln_ENTRIES * NUL1E)
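/*
 * For example, with 4KB pages and 8-byte PTEs (Ln_ENTRIES == 512), these
 * boundaries work out to:
 *
 *	L3 pages: [0, 512^3)				= [0, 134217728)
 *	L2 pages: [NUL2E, NUL2E + 512^2)		= [134217728, 134479872)
 *	L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + 512)	= [134479872, 134480384)
 */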
175
176 #ifdef PV_STATS
177 #define PV_STAT(x) do { x ; } while (0)
178 #define __pv_stat_used
179 #else
180 #define PV_STAT(x) do { } while (0)
181 #define __pv_stat_used __unused
182 #endif
183
184 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT))
185 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
186 #define pa_index(pa) ((pa) >> L2_SHIFT)
187 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
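/*
 * Note that pa_index() groups physical memory by 2MB (L2) frame, so pv_table
 * and the pv list locks below cover one potential superpage each rather than
 * one 4KB page each.
 */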
188
189 #define NPV_LIST_LOCKS MAXCPU
190
191 #define PHYS_TO_PV_LIST_LOCK(pa) \
192 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
193
194 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
195 struct rwlock **_lockp = (lockp); \
196 struct rwlock *_new_lock; \
197 \
198 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
199 if (_new_lock != *_lockp) { \
200 if (*_lockp != NULL) \
201 rw_wunlock(*_lockp); \
202 *_lockp = _new_lock; \
203 rw_wlock(*_lockp); \
204 } \
205 } while (0)
206
207 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
208 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
209
210 #define RELEASE_PV_LIST_LOCK(lockp) do { \
211 struct rwlock **_lockp = (lockp); \
212 \
213 if (*_lockp != NULL) { \
214 rw_wunlock(*_lockp); \
215 *_lockp = NULL; \
216 } \
217 } while (0)
218
219 #define VM_PAGE_TO_PV_LIST_LOCK(m) \
220 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
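#if 0
/*
 * Illustrative sketch (not compiled): the intended usage pattern for the
 * lock macros above.  The caller starts with no PV list lock held, lets
 * CHANGE_PV_LIST_LOCK_TO_VM_PAGE() acquire (and, when crossing into a
 * different 2MB region, switch) the appropriate lock, and releases whatever
 * lock is still held before returning.  "example_pv_walk" and its page
 * array are hypothetical and exist only for this example.
 */
static void
example_pv_walk(vm_page_t *pages, int count)
{
	struct rwlock *lock;
	int i;

	lock = NULL;
	for (i = 0; i < count; i++) {
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, pages[i]);
		/* ... operate on pages[i]'s PV list here ... */
	}
	if (lock != NULL)
		rw_wunlock(lock);
}
#endif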
221
222 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
223 "VM/pmap parameters");
224
225 /* The list of all the user pmaps */
226 LIST_HEAD(pmaplist, pmap);
227 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();
228
229 enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39;
230 SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
231 &pmap_mode, 0,
232 "translation mode, 0 = SV39, 1 = SV48");
233
234 struct pmap kernel_pmap_store;
235
236 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
237 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
238 vm_offset_t kernel_vm_end = 0;
239
240 vm_paddr_t dmap_phys_base; /* The start of the dmap region */
241 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */
242 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */
243
244 static int pmap_growkernel_panic = 0;
245 SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN,
246 &pmap_growkernel_panic, 0,
247 "panic on failure to allocate kernel page table page");
248
249 /* This code assumes all L1 DMAP entries will be used */
250 CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
251 CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);
252
253 /*
254 * This code assumes that the early DEVMAP is L2_SIZE aligned.
255 */
256 CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0);
257
258 static struct rwlock_padalign pvh_global_lock;
259 static struct mtx_padalign allpmaps_lock;
260
261 static int __read_frequently superpages_enabled = 1;
262 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
263 CTLFLAG_RDTUN, &superpages_enabled, 0,
264 "Enable support for transparent superpages");
265
266 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
267 "2MB page mapping counters");
268
269 static u_long pmap_l2_demotions;
270 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
271 &pmap_l2_demotions, 0,
272 "2MB page demotions");
273
274 static u_long pmap_l2_mappings;
275 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
276 &pmap_l2_mappings, 0,
277 "2MB page mappings");
278
279 static u_long pmap_l2_p_failures;
280 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
281 &pmap_l2_p_failures, 0,
282 "2MB page promotion failures");
283
284 static u_long pmap_l2_promotions;
285 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
286 &pmap_l2_promotions, 0,
287 "2MB page promotions");
288
289 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
290 "L1 (1GB) page mapping counters");
291
292 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
293 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
294 &pmap_l1_demotions, "L1 (1GB) page demotions");
295
296 /*
297 * Data for the pv entry allocation mechanism
298 */
299 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
300 static struct mtx pv_chunks_mutex;
301 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
302 static struct md_page *pv_table;
303 static struct md_page pv_dummy;
304
305 extern cpuset_t all_harts;
306
307 /*
308 * Internal flags for pmap_enter()'s helper functions.
309 */
310 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
311 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
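/*
 * These values are presumably chosen from the range that vm/pmap.h reserves
 * for machine-dependent use (PMAP_ENTER_RESERVED), so they cannot collide
 * with the MI PMAP_ENTER_* flags passed in by callers.
 */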
312
313 static void free_pv_chunk(struct pv_chunk *pc);
314 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
315 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
316 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
317 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
318 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
319 vm_offset_t va);
320 static bool pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va);
321 static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
322 static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
323 vm_offset_t va, struct rwlock **lockp);
324 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
325 u_int flags, vm_page_t m, struct rwlock **lockp);
326 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
327 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
328 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
329 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
330 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
331 vm_page_t m, struct rwlock **lockp);
332
333 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
334 struct rwlock **lockp);
335
336 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
337 struct spglist *free);
338 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
339
340 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
341
342 static uint64_t pmap_satp_mode(void);
343
344 #define pmap_clear(pte) pmap_store(pte, 0)
345 #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits)
346 #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry)
347 #define pmap_load_clear(pte) pmap_load_store(pte, 0)
348 #define pmap_load(pte) atomic_load_64(pte)
349 #define pmap_store(pte, entry) atomic_store_64(pte, entry)
350 #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits)
351
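/*
 * For example, a typical teardown reads and clears a PTE in one atomic step
 * and then examines the returned bits:
 *
 *	old_l3 = pmap_load_clear(l3);
 *	if ((old_l3 & PTE_D) != 0)
 *		vm_page_dirty(m);
 *
 * (Illustrative only; the removal code later in this file follows this
 * pattern.)
 */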
352 /********************/
353 /* Inline functions */
354 /********************/
355
356 static __inline void
357 pagecopy(void *s, void *d)
358 {
359
360 memcpy(d, s, PAGE_SIZE);
361 }
362
363 static __inline void
364 pagezero(void *p)
365 {
366
367 bzero(p, PAGE_SIZE);
368 }
369
370 #define pmap_l0_index(va) (((va) >> L0_SHIFT) & Ln_ADDR_MASK)
371 #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK)
372 #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK)
373 #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK)
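/*
 * With 512-entry tables (Ln_ADDR_MASK == 0x1ff) these select the standard
 * Sv39/Sv48 index fields of a virtual address:
 *
 *	L3 index = VA[20:12]	L2 index = VA[29:21]
 *	L1 index = VA[38:30]	L0 index = VA[47:39] (Sv48 only)
 */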
374
375 #define PTE_TO_PHYS(pte) \
376 ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
377 #define L2PTE_TO_PHYS(l2) \
378 ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
379 #define L1PTE_TO_PHYS(l1) \
380 ((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT)
381 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
382
383 /*
384 * Construct a page table entry of the specified level pointing to physical
385 * address pa, with PTE bits 'bits'.
386 *
387 * A leaf PTE of any level must point to an address matching its alignment,
388 * e.g. L2 pages must be 2MB aligned in memory.
389 */
390 #define L1_PTE(pa, bits) ((((pa) >> L1_SHIFT) << PTE_PPN2_S) | (bits))
391 #define L2_PTE(pa, bits) ((((pa) >> L2_SHIFT) << PTE_PPN1_S) | (bits))
392 #define L3_PTE(pa, bits) ((((pa) >> L3_SHIFT) << PTE_PPN0_S) | (bits))
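/*
 * Worked example, assuming the usual encoding (L2_SHIFT == 21 and
 * PTE_PPN1_S == 19): for a 2MB-aligned pa = 0x80200000,
 *
 *	L2_PTE(pa, PTE_KERN) == ((pa >> 21) << 19) | PTE_KERN
 *	                     == (pa >> 2) | PTE_KERN
 *
 * i.e. the PPN field is simply pa >> PAGE_SHIFT placed at bit 10 of the PTE.
 */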
393
394 /*
395 * Construct a page directory entry (PDE), pointing to next level entry at pa,
396 * with PTE bits 'bits'.
397 *
398 * Unlike PTEs, page directory entries can point to any 4K-aligned physical
399 * address.
400 */
401 #define L0_PDE(pa, bits) L3_PTE(pa, bits)
402 #define L1_PDE(pa, bits) L3_PTE(pa, bits)
403 #define L2_PDE(pa, bits) L3_PTE(pa, bits)
404
405 static __inline pd_entry_t *
406 pmap_l0(pmap_t pmap, vm_offset_t va)
407 {
408 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
409 KASSERT(VIRT_IS_VALID(va),
410 ("%s: malformed virtual address %#lx", __func__, va));
411 return (&pmap->pm_top[pmap_l0_index(va)]);
412 }
413
414 static __inline pd_entry_t *
415 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
416 {
417 vm_paddr_t phys;
418 pd_entry_t *l1;
419
420 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
421 phys = PTE_TO_PHYS(pmap_load(l0));
422 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
423
424 return (&l1[pmap_l1_index(va)]);
425 }
426
427 static __inline pd_entry_t *
428 pmap_l1(pmap_t pmap, vm_offset_t va)
429 {
430 pd_entry_t *l0;
431
432 KASSERT(VIRT_IS_VALID(va),
433 ("%s: malformed virtual address %#lx", __func__, va));
434 if (pmap_mode == PMAP_MODE_SV39) {
435 return (&pmap->pm_top[pmap_l1_index(va)]);
436 } else {
437 l0 = pmap_l0(pmap, va);
438 if ((pmap_load(l0) & PTE_V) == 0)
439 return (NULL);
440 if ((pmap_load(l0) & PTE_RX) != 0)
441 return (NULL);
442 return (pmap_l0_to_l1(l0, va));
443 }
444 }
445
446 static __inline pd_entry_t *
447 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
448 {
449 vm_paddr_t phys;
450 pd_entry_t *l2;
451
452 phys = PTE_TO_PHYS(pmap_load(l1));
453 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
454
455 return (&l2[pmap_l2_index(va)]);
456 }
457
458 static __inline pd_entry_t *
459 pmap_l2(pmap_t pmap, vm_offset_t va)
460 {
461 pd_entry_t *l1;
462
463 l1 = pmap_l1(pmap, va);
464 if (l1 == NULL)
465 return (NULL);
466 if ((pmap_load(l1) & PTE_V) == 0)
467 return (NULL);
468 if ((pmap_load(l1) & PTE_RX) != 0)
469 return (NULL);
470
471 return (pmap_l1_to_l2(l1, va));
472 }
473
474 static __inline pt_entry_t *
475 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
476 {
477 vm_paddr_t phys;
478 pt_entry_t *l3;
479
480 phys = PTE_TO_PHYS(pmap_load(l2));
481 	l3 = (pt_entry_t *)PHYS_TO_DMAP(phys);
482
483 return (&l3[pmap_l3_index(va)]);
484 }
485
486 static __inline pt_entry_t *
487 pmap_l3(pmap_t pmap, vm_offset_t va)
488 {
489 pd_entry_t *l2;
490
491 l2 = pmap_l2(pmap, va);
492 if (l2 == NULL)
493 return (NULL);
494 if ((pmap_load(l2) & PTE_V) == 0)
495 return (NULL);
496 if ((pmap_load(l2) & PTE_RX) != 0)
497 return (NULL);
498
499 return (pmap_l2_to_l3(l2, va));
500 }
501
502 static __inline void
503 pmap_resident_count_inc(pmap_t pmap, int count)
504 {
505
506 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
507 pmap->pm_stats.resident_count += count;
508 }
509
510 static __inline void
511 pmap_resident_count_dec(pmap_t pmap, int count)
512 {
513
514 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
515 KASSERT(pmap->pm_stats.resident_count >= count,
516 ("pmap %p resident count underflow %ld %d", pmap,
517 pmap->pm_stats.resident_count, count));
518 pmap->pm_stats.resident_count -= count;
519 }
520
521 static void
522 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
523 pt_entry_t entry)
524 {
525 struct pmap *user_pmap;
526 pd_entry_t *l1;
527
528 /*
529 * Distribute new kernel L1 entry to all the user pmaps. This is only
530 * necessary with three-level paging configured: with four-level paging
531 * the kernel's half of the top-level page table page is static and can
532 * simply be copied at pmap initialization time.
533 */
534 if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39)
535 return;
536
537 mtx_lock(&allpmaps_lock);
538 LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
539 l1 = &user_pmap->pm_top[l1index];
540 pmap_store(l1, entry);
541 }
542 mtx_unlock(&allpmaps_lock);
543 }
544
545 /*
546 * Holds the PTE mode bits (defined in pte.h) that control e.g. cacheability.
547 *
548 * The indices correspond to the VM_MEMATTR_* defines in riscv/include/vm.h.
549 *
550 * The array will be empty if no mode bits are supported by the CPU, e.g. when
551 * lacking the Svpbmt extension.
552 */
553 static __read_frequently pt_entry_t memattr_bits[VM_MEMATTR_TOTAL];
554 static __read_frequently pt_entry_t memattr_mask;
555
556 static __inline pt_entry_t
557 pmap_memattr_bits(vm_memattr_t mode)
558 {
559 KASSERT(pmap_is_valid_memattr(kernel_pmap, mode),
560 ("invalid memory mode %u\n", mode));
561 return (memattr_bits[(int)mode]);
562 }
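/*
 * For example, pmap_memattr_bits(VM_MEMATTR_DEVICE) returns PTE_MA_IO on
 * Svpbmt hardware (or the T-Head equivalent set up in pmap_bootstrap()) and
 * 0 when no memory-attribute extension was detected, in which case the
 * table above remains all zeroes.
 */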
563
564 /*
565 * This should only be used during pmap bootstrap e.g. by
566 * pmap_create_pagetables().
567 */
568 static pt_entry_t *
569 pmap_early_alloc_tables(vm_paddr_t *freemempos, int npages)
570 {
571 pt_entry_t *pt;
572
573 pt = (pt_entry_t *)*freemempos;
574 *freemempos += npages * PAGE_SIZE;
575 bzero(pt, npages * PAGE_SIZE);
576
577 return (pt);
578 }
579
580 /*
581 * Construct the Direct Map -- a linear mapping of physical memory into
582 * the kernel address space.
583 *
584 * We walk the list of physical memory segments (of arbitrary size and
585 * alignment) mapping each appropriately. Consequently, the DMAP address
586 * space will have unmapped regions corresponding to the holes between
587 * physical memory segments.
588 */
589 static vm_paddr_t
590 pmap_bootstrap_dmap(pd_entry_t *l1, vm_paddr_t freemempos)
591 {
592 vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
593 vm_offset_t va;
594 vm_paddr_t min_pa, max_pa, pa, endpa;
595 pd_entry_t *l3, *l2;
596 pt_entry_t memattr;
597 u_int l1slot, l2slot, l3slot;
598 int physmap_idx;
599
600 physmap_idx = physmem_avail(physmap, nitems(physmap));
601 min_pa = physmap[0];
602 max_pa = physmap[physmap_idx - 1];
603
604 printf("physmap_idx %u\n", physmap_idx);
605 printf("min_pa %lx\n", min_pa);
606 printf("max_pa %lx\n", max_pa);
607
608 /* Set the limits of the DMAP region. */
609 dmap_phys_base = rounddown(min_pa, L1_SIZE);
610 dmap_phys_max = max_pa;
611
612 memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
613
614 /*
615 * Walk the physmap table, using the largest page sizes possible for each
616 * mapping. So, for each physmap entry, map as needed/able:
617 * - 4K/L3 page prefix
618 * - 2M/L2 superpage prefix
619 * - 1G/L1 superpages
620 * - 2M/L2 superpage suffix
621 * - 4K/L3 page suffix
622 */
623 l3 = l2 = NULL;
624 l2slot = l1slot = Ln_ENTRIES; /* sentinel value */
625 for (int idx = 0; idx < physmap_idx; idx += 2) {
626 pa = rounddown(physmap[idx], L3_SIZE);
627 endpa = physmap[idx + 1];
628
629 /* Virtual address for this range. */
630 va = PHYS_TO_DMAP(pa);
631
632 /* Any 2MB possible for this range? */
633 if (roundup(pa, L2_SIZE) + L2_SIZE > endpa)
634 goto l3end;
635
636 /* Loop until the next 2MB boundary. */
637 while ((pa & L2_OFFSET) != 0) {
638 if (l2 == NULL || pmap_l1_index(va) != l1slot) {
639 /* Need to alloc another page table. */
640 l2 = pmap_early_alloc_tables(&freemempos, 1);
641
642 /* Link it. */
643 l1slot = pmap_l1_index(va);
644 pmap_store(&l1[l1slot],
645 L1_PDE((vm_paddr_t)l2, PTE_V));
646 }
647
648 if (l3 == NULL || pmap_l2_index(va) != l2slot) {
649 l3 = pmap_early_alloc_tables(&freemempos, 1);
650
651 /* Link it to L2. */
652 l2slot = pmap_l2_index(va);
653 pmap_store(&l2[l2slot],
654 L2_PDE((vm_paddr_t)l3, PTE_V));
655 }
656
657 /* map l3 pages */
658 l3slot = pmap_l3_index(va);
659 pmap_store(&l3[l3slot], L3_PTE(pa, PTE_KERN | memattr));
660
661 pa += L3_SIZE;
662 va += L3_SIZE;
663 }
664
665 /* Any 1GB possible for remaining range? */
666 if (roundup(pa, L1_SIZE) + L1_SIZE > endpa)
667 goto l2end;
668
669 /* Loop until the next 1GB boundary. */
670 while ((pa & L1_OFFSET) != 0) {
671 if (l2 == NULL || pmap_l1_index(va) != l1slot) {
672 /* Need to alloc another page table. */
673 l2 = pmap_early_alloc_tables(&freemempos, 1);
674
675 /* Link it. */
676 l1slot = pmap_l1_index(va);
677 pmap_store(&l1[l1slot],
678 L1_PDE((vm_paddr_t)l2, PTE_V));
679 }
680
681 /* map l2 pages */
682 l2slot = pmap_l2_index(va);
683 pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
684
685 pa += L2_SIZE;
686 va += L2_SIZE;
687 }
688
689 /* Map what we can with 1GB superpages. */
690 while (pa + L1_SIZE - 1 < endpa) {
691 /* map l1 pages */
692 l1slot = pmap_l1_index(va);
693 pmap_store(&l1[l1slot], L1_PTE(pa, PTE_KERN | memattr));
694
695 pa += L1_SIZE;
696 va += L1_SIZE;
697 }
698
699 l2end:
700 /* Map what we can with 2MB superpages. */
701 while (pa + L2_SIZE - 1 < endpa) {
702 if (l2 == NULL || pmap_l1_index(va) != l1slot) {
703 /* Need to alloc another page table. */
704 l2 = pmap_early_alloc_tables(&freemempos, 1);
705
706 /* Link it. */
707 l1slot = pmap_l1_index(va);
708 pmap_store(&l1[l1slot],
709 L1_PDE((vm_paddr_t)l2, PTE_V));
710 }
711
712 /* map l2 pages */
713 l2slot = pmap_l2_index(va);
714 pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
715
716 pa += L2_SIZE;
717 va += L2_SIZE;
718 }
719
720 l3end:
721 while (pa < endpa) {
722 if (l2 == NULL || pmap_l1_index(va) != l1slot) {
723 /* Need to alloc another page table. */
724 l2 = pmap_early_alloc_tables(&freemempos, 1);
725
726 /* Link it. */
727 l1slot = pmap_l1_index(va);
728 pmap_store(&l1[l1slot],
729 L1_PDE((vm_paddr_t)l2, PTE_V));
730 }
731
732 if (l3 == NULL || pmap_l2_index(va) != l2slot) {
733 l3 = pmap_early_alloc_tables(&freemempos, 1);
734
735 /* Link it to L2. */
736 l2slot = pmap_l2_index(va);
737 pmap_store(&l2[l2slot],
738 L2_PDE((vm_paddr_t)l3, PTE_V));
739 }
740
741 /* map l3 pages */
742 l3slot = pmap_l3_index(va);
743 pmap_store(&l3[l3slot], L3_PTE(pa, PTE_KERN | memattr));
744
745 pa += L3_SIZE;
746 va += L3_SIZE;
747 }
748 }
749
750 /* And finally, the limit on DMAP VA. */
751 dmap_max_addr = va;
752
753 return (freemempos);
754 }
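/*
 * For instance, a physmap segment [0x80200000, 0x100000000) starts
 * 2MB-aligned but not 1GB-aligned, so the walk above emits no 4KB prefix,
 * 511 2MB mappings up to the 1GB boundary at 0xC0000000, a single 1GB
 * mapping for [0xC0000000, 0x100000000), and no 2MB or 4KB suffix.
 */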
755
756 /*
757 * Create a new set of pagetables to run the kernel with.
758 *
759 * An initial, temporary setup was created in locore.S, which serves well
760 * enough to get us this far. It mapped kernstart -> KERNBASE, using 2MB
761 * superpages, and created a 1GB identity map, which allows this function
762 * to dereference physical addresses.
763 *
764 * The memory backing these page tables is allocated in the space
765 * immediately following the kernel's preload area. Depending on the size
766 * of this area, some, all, or none of these pages can be implicitly
767 * mapped by the kernel's 2MB mappings. This memory will only ever be
768 * accessed through the direct map, however.
769 */
770 static vm_paddr_t
771 pmap_create_pagetables(vm_paddr_t kernstart, vm_size_t kernlen,
772 vm_paddr_t *root_pt_phys)
773 {
774 pt_entry_t *l0, *l1, *kern_l2, *kern_l3, *devmap_l3;
775 pt_entry_t memattr;
776 pd_entry_t *devmap_l2;
777 vm_paddr_t kernend, freemempos, pa;
778 int nkernl2, nkernl3, ndevmapl3;
779 int i, slot;
780 int mode;
781
782 kernend = kernstart + kernlen;
783
784 /* Static allocations begin after the kernel staging area. */
785 freemempos = roundup2(kernend, PAGE_SIZE);
786
787 /* Detect Sv48 mode. */
788 mode = PMAP_MODE_SV39;
789 TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
790
791 if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) {
792 /*
793 * Sv48 mode: allocate an L0 page table to be the root. The
794 * layout of KVA is otherwise identical to Sv39.
795 */
796 l0 = pmap_early_alloc_tables(&freemempos, 1);
797 *root_pt_phys = (vm_paddr_t)l0;
798 pmap_mode = PMAP_MODE_SV48;
799 } else {
800 l0 = NULL;
801 }
802
803 /*
804 * Allocate an L1 page table.
805 */
806 l1 = pmap_early_alloc_tables(&freemempos, 1);
807 if (pmap_mode == PMAP_MODE_SV39)
808 *root_pt_phys = (vm_paddr_t)l1;
809
810 /*
811 * Allocate a set of L2 page tables for KVA. Most likely, only 1 is
812 * needed.
813 */
814 nkernl2 = howmany(howmany(kernlen, L2_SIZE), Ln_ENTRIES);
815 kern_l2 = pmap_early_alloc_tables(&freemempos, nkernl2);
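	/*
	 * For instance, a 30MB kernel needs howmany(30MB, L2_SIZE) == 15 L2
	 * entries, so a single 512-entry L2 page suffices.
	 */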
816
817 /*
818 * Allocate an L2 page table for the static devmap, located at the end
819 * of KVA. We can expect that the devmap will always be less than 1GB
820 * in size.
821 */
822 devmap_l2 = pmap_early_alloc_tables(&freemempos, 1);
823
824 /* Allocate L3 page tables for the devmap. */
825 ndevmapl3 = howmany(howmany(PMAP_MAPDEV_EARLY_SIZE, L3_SIZE),
826 Ln_ENTRIES);
827 devmap_l3 = pmap_early_alloc_tables(&freemempos, ndevmapl3);
828
829 /*
830 * Allocate some L3 bootstrap pages, for early KVA allocations before
831 * vm_mem_init() has run. For example, the message buffer.
832 *
833 * A somewhat arbitrary choice of 32MB. This should be more than enough
834 * for any early allocations. There is no need to worry about waste, as
835 * whatever is not used will be consumed by later calls to
836 * pmap_growkernel().
837 */
838 nkernl3 = 16;
839 kern_l3 = pmap_early_alloc_tables(&freemempos, nkernl3);
840
841 /* Bootstrap the direct map. */
842 freemempos = pmap_bootstrap_dmap(l1, freemempos);
843
844 /* Allocations are done. */
845 if (freemempos < roundup2(kernend, L2_SIZE))
846 freemempos = roundup2(kernend, L2_SIZE);
847
848 /* Memory attributes for standard/main memory. */
849 memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
850
851 /*
852 * Map the kernel (and preloaded modules or data) using L2 superpages.
853 *
854 * kernstart is 2MB-aligned. This is enforced by loader(8) and required
855 * by locore assembly.
856 *
857 * TODO: eventually, this should be done with proper permissions for
858 * each segment, rather than mapping the entire kernel and preloaded
859 * modules RWX.
860 */
861 slot = pmap_l2_index(KERNBASE);
862 for (pa = kernstart; pa < kernend; pa += L2_SIZE, slot++) {
863 pmap_store(&kern_l2[slot],
864 L2_PTE(pa, PTE_KERN | PTE_X | memattr));
865 }
866
867 /*
868 * Connect the L3 bootstrap pages to the kernel L2 table. The L3 PTEs
869 * themselves are invalid.
870 */
871 slot = pmap_l2_index(freemempos - kernstart + KERNBASE);
872 for (i = 0; i < nkernl3; i++, slot++) {
873 pa = (vm_paddr_t)kern_l3 + ptoa(i);
874 pmap_store(&kern_l2[slot], L2_PDE(pa, PTE_V));
875 }
876
877 /* Connect the L2 tables to the L1 table. */
878 slot = pmap_l1_index(KERNBASE);
879 for (i = 0; i < nkernl2; i++, slot++) {
880 pa = (vm_paddr_t)kern_l2 + ptoa(i);
881 pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
882 }
883
884 /* Connect the L1 table to L0, if in use. */
885 if (pmap_mode == PMAP_MODE_SV48) {
886 slot = pmap_l0_index(KERNBASE);
887 pmap_store(&l0[slot], L0_PDE((vm_paddr_t)l1, PTE_V));
888 }
889
890 /*
891 * Connect the devmap L3 pages to the L2 table. The devmap PTEs
892 * themselves are invalid.
893 */
894 slot = pmap_l2_index(DEVMAP_MIN_VADDR);
895 for (i = 0; i < ndevmapl3; i++, slot++) {
896 pa = (vm_paddr_t)devmap_l3 + ptoa(i);
897 pmap_store(&devmap_l2[slot], L2_PDE(pa, PTE_V));
898 }
899
900 /* Connect the devmap L2 pages to the L1 table. */
901 slot = pmap_l1_index(DEVMAP_MIN_VADDR);
902 pa = (vm_paddr_t)devmap_l2;
903 pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
904
905 /* Return the next position of free memory */
906 return (freemempos);
907 }
908
909 /*
910 * Bootstrap the system enough to run with virtual memory.
911 */
912 void
913 pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen)
914 {
915 vm_paddr_t freemempos, pa;
916 vm_paddr_t root_pt_phys;
917 vm_offset_t freeva;
918 vm_offset_t dpcpu, msgbufpv;
919 pt_entry_t *pte;
920 int i;
921
922 printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen);
923
924 PMAP_LOCK_INIT(kernel_pmap);
925 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
926 vm_radix_init(&kernel_pmap->pm_root);
927
928 rw_init(&pvh_global_lock, "pmap pv global");
929
930 /*
931 * Set the current CPU as active in the kernel pmap. Secondary cores
932 * will add themselves later in init_secondary(). The SBI firmware
933 * may rely on this mask being precise, so CPU_FILL() is not used.
934 */
935 CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);
936
937 /*
938 * Set up the memory attribute bits.
939 */
940 if (has_svpbmt) {
941 memattr_bits[VM_MEMATTR_PMA] = PTE_MA_NONE;
942 memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_MA_NC;
943 memattr_bits[VM_MEMATTR_DEVICE] = PTE_MA_IO;
944 memattr_mask = PTE_MA_MASK;
945 } else if (has_errata_thead_pbmt) {
946 memattr_bits[VM_MEMATTR_PMA] = PTE_THEAD_MA_NONE;
947 memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_THEAD_MA_NC;
948 memattr_bits[VM_MEMATTR_DEVICE] = PTE_THEAD_MA_IO;
949 memattr_mask = PTE_THEAD_MA_MASK;
950 }
951
952 /* Create a new set of pagetables to run the kernel in. */
953 freemempos = pmap_create_pagetables(kernstart, kernlen, &root_pt_phys);
954
955 /* Switch to the newly created page tables. */
956 kernel_pmap->pm_stage = PM_STAGE1;
957 kernel_pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(root_pt_phys);
958 kernel_pmap->pm_satp = atop(root_pt_phys) | pmap_satp_mode();
959 csr_write(satp, kernel_pmap->pm_satp);
960 sfence_vma();
961
962 /*
963 * Now, we need to make a few more static reservations from KVA.
964 *
965 * Set freeva to freemempos virtual address, and be sure to advance
966 * them together.
967 */
968 freeva = freemempos - kernstart + KERNBASE;
969 #define reserve_space(var, pa, size) \
970 do { \
971 var = freeva; \
972 pa = freemempos; \
973 freeva += size; \
974 freemempos += size; \
975 } while (0)
976
977 /* Allocate the dynamic per-cpu area. */
978 reserve_space(dpcpu, pa, DPCPU_SIZE);
979
980 /* Map it. */
981 pte = pmap_l3(kernel_pmap, dpcpu);
982 KASSERT(pte != NULL, ("Bootstrap pages missing"));
983 for (i = 0; i < howmany(DPCPU_SIZE, PAGE_SIZE); i++)
984 pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
985 pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
986
987 /* Now, it can be initialized. */
988 dpcpu_init((void *)dpcpu, 0);
989
990 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
991 reserve_space(msgbufpv, pa, round_page(msgbufsize));
992 msgbufp = (void *)msgbufpv;
993
994 /* Map it. */
995 pte = pmap_l3(kernel_pmap, msgbufpv);
996 KASSERT(pte != NULL, ("Bootstrap pages missing"));
997 for (i = 0; i < howmany(msgbufsize, PAGE_SIZE); i++)
998 pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
999 pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
1000
1001 #undef reserve_space
1002
1003 /* Mark the bounds of our available virtual address space */
1004 virtual_avail = kernel_vm_end = freeva;
1005 virtual_end = DEVMAP_MIN_VADDR;
1006
1007 /* Exclude the reserved physical memory from allocations. */
1008 physmem_exclude_region(kernstart, freemempos - kernstart,
1009 EXFLAG_NOALLOC);
1010 }
1011
1012 /*
1013 * Initialize a vm_page's machine-dependent fields.
1014 */
1015 void
1016 pmap_page_init(vm_page_t m)
1017 {
1018
1019 TAILQ_INIT(&m->md.pv_list);
1020 m->md.pv_memattr = VM_MEMATTR_DEFAULT;
1021 }
1022
1023 /*
1024 * Initialize the pmap module.
1025 *
1026 * Called by vm_mem_init(), to initialize any structures that the pmap
1027 * system needs to map virtual memory.
1028 */
1029 void
1030 pmap_init(void)
1031 {
1032 vm_size_t s;
1033 int i, pv_npg;
1034
1035 /*
1036 * Initialize the pv chunk and pmap list mutexes.
1037 */
1038 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1039 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);
1040
1041 /*
1042 * Initialize the pool of pv list locks.
1043 */
1044 for (i = 0; i < NPV_LIST_LOCKS; i++)
1045 rw_init(&pv_list_locks[i], "pmap pv list");
1046
1047 /*
1048 * Calculate the size of the pv head table for superpages.
1049 */
1050 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
1051
1052 /*
1053 * Allocate memory for the pv head table for superpages.
1054 */
1055 s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1056 s = round_page(s);
1057 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
1058 for (i = 0; i < pv_npg; i++)
1059 TAILQ_INIT(&pv_table[i].pv_list);
1060 TAILQ_INIT(&pv_dummy.pv_list);
1061
1062 if (superpages_enabled)
1063 pagesizes[1] = L2_SIZE;
1064 }
1065
1066 #ifdef SMP
1067 /*
1068 * For SMP, these functions have to use IPIs for coherence.
1069 *
1070 * In general, the calling thread uses a plain fence to order the
1071 * writes to the page tables before invoking an SBI callback to invoke
1072 * sfence_vma() on remote CPUs.
1073 */
1074 static void
1075 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1076 {
1077 cpuset_t mask;
1078
1079 sched_pin();
1080 mask = pmap->pm_active;
1081 CPU_CLR(PCPU_GET(hart), &mask);
1082 fence();
1083 if (!CPU_EMPTY(&mask) && smp_started)
1084 sbi_remote_sfence_vma(mask.__bits, va, 1);
1085 sfence_vma_page(va);
1086 sched_unpin();
1087 }
1088
1089 static void
1090 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1091 {
1092 cpuset_t mask;
1093
1094 sched_pin();
1095 mask = pmap->pm_active;
1096 CPU_CLR(PCPU_GET(hart), &mask);
1097 fence();
1098 if (!CPU_EMPTY(&mask) && smp_started)
1099 sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);
1100
1101 /*
1102 * Might consider a loop of sfence_vma_page() for a small
1103 * number of pages in the future.
1104 */
1105 sfence_vma();
1106 sched_unpin();
1107 }
1108
1109 static void
1110 pmap_invalidate_all(pmap_t pmap)
1111 {
1112 cpuset_t mask;
1113
1114 sched_pin();
1115 mask = pmap->pm_active;
1116 CPU_CLR(PCPU_GET(hart), &mask);
1117
1118 /*
1119 * XXX: The SBI doc doesn't detail how to specify x0 as the
1120 * address to perform a global fence. BBL currently treats
1121 * all sfence_vma requests as global however.
1122 */
1123 fence();
1124 if (!CPU_EMPTY(&mask) && smp_started)
1125 sbi_remote_sfence_vma(mask.__bits, 0, 0);
1126 sfence_vma();
1127 sched_unpin();
1128 }
1129 #else
1130 /*
1131 * Normal, non-SMP, invalidation functions.
1132 * We inline these within pmap.c for speed.
1133 */
1134 static __inline void
1135 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1136 {
1137
1138 sfence_vma_page(va);
1139 }
1140
1141 static __inline void
1142 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1143 {
1144
1145 /*
1146 * Might consider a loop of sfence_vma_page() for a small
1147 * number of pages in the future.
1148 */
1149 sfence_vma();
1150 }
1151
1152 static __inline void
1153 pmap_invalidate_all(pmap_t pmap)
1154 {
1155
1156 sfence_vma();
1157 }
1158 #endif
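/*
 * Callers of the invalidation routines follow a common pattern: update the
 * PTE first, then invalidate, e.g.
 *
 *	pmap_store(l3, entry);
 *	...
 *	pmap_invalidate_page(pmap, va);
 *
 * so that the fence() issued in the SMP variants orders the page table
 * write before the local and remote sfence.vma.
 */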
1159
1160 /*
1161 * Routine: pmap_extract
1162 * Function:
1163 * Extract the physical page address associated
1164 * with the given map/virtual_address pair.
1165 */
1166 vm_paddr_t
1167 pmap_extract(pmap_t pmap, vm_offset_t va)
1168 {
1169 pd_entry_t *l2p, l2;
1170 pt_entry_t *l3p;
1171 vm_paddr_t pa;
1172
1173 pa = 0;
1174
1175 /*
1176 * Start with an L2 lookup, L1 superpages are currently not implemented.
1177 */
1178 PMAP_LOCK(pmap);
1179 l2p = pmap_l2(pmap, va);
1180 if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) {
1181 if ((l2 & PTE_RWX) == 0) {
1182 l3p = pmap_l2_to_l3(l2p, va);
1183 pa = PTE_TO_PHYS(pmap_load(l3p));
1184 pa |= (va & L3_OFFSET);
1185 } else {
1186 /* L2 is a superpage mapping. */
1187 pa = L2PTE_TO_PHYS(l2);
1188 pa |= (va & L2_OFFSET);
1189 }
1190 }
1191 PMAP_UNLOCK(pmap);
1192 return (pa);
1193 }
1194
1195 /*
1196 * Routine: pmap_extract_and_hold
1197 * Function:
1198 * Atomically extract and hold the physical page
1199 * with the given pmap and virtual address pair
1200 * if that mapping permits the given protection.
1201 */
1202 vm_page_t
1203 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1204 {
1205 pd_entry_t *l2p, l2;
1206 pt_entry_t *l3p, l3;
1207 vm_page_t m;
1208
1209 m = NULL;
1210 PMAP_LOCK(pmap);
1211 l2p = pmap_l2(pmap, va);
1212 if (l2p == NULL || ((l2 = pmap_load(l2p)) & PTE_V) == 0) {
1213 ;
1214 } else if ((l2 & PTE_RWX) != 0) {
1215 if ((l2 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
1216 m = PHYS_TO_VM_PAGE(L2PTE_TO_PHYS(l2) +
1217 (va & L2_OFFSET));
1218 }
1219 } else {
1220 l3p = pmap_l2_to_l3(l2p, va);
1221 if ((l3 = pmap_load(l3p)) != 0) {
1222 if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0)
1223 m = PTE_TO_VM_PAGE(l3);
1224 }
1225 }
1226 if (m != NULL && !vm_page_wire_mapped(m))
1227 m = NULL;
1228 PMAP_UNLOCK(pmap);
1229 return (m);
1230 }
1231
1232 /*
1233 * Routine: pmap_kextract
1234 * Function:
1235 * Extract the physical page address associated with the given kernel
1236 * virtual address.
1237 */
1238 vm_paddr_t
1239 pmap_kextract(vm_offset_t va)
1240 {
1241 pd_entry_t *l2, l2e;
1242 pt_entry_t *l3;
1243 vm_paddr_t pa;
1244
1245 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1246 pa = DMAP_TO_PHYS(va);
1247 } else {
1248 l2 = pmap_l2(kernel_pmap, va);
1249 if (l2 == NULL)
1250 panic("pmap_kextract: No l2");
1251 l2e = pmap_load(l2);
1252 /*
1253 * Beware of concurrent promotion and demotion! We must
1254 * use l2e rather than loading from l2 multiple times to
1255 * ensure we see a consistent state, including the
1256 * implicit load in pmap_l2_to_l3. It is, however, safe
1257 * to use an old l2e because the L3 page is preserved by
1258 * promotion.
1259 */
1260 if ((l2e & PTE_RX) != 0) {
1261 /* superpages */
1262 pa = L2PTE_TO_PHYS(l2e);
1263 pa |= (va & L2_OFFSET);
1264 return (pa);
1265 }
1266
1267 l3 = pmap_l2_to_l3(&l2e, va);
1268 pa = PTE_TO_PHYS(pmap_load(l3));
1269 pa |= (va & PAGE_MASK);
1270 }
1271 return (pa);
1272 }
1273
1274 /***************************************************
1275 * Low level mapping routines.....
1276 ***************************************************/
1277
1278 void
1279 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1280 {
1281 pt_entry_t entry;
1282 pt_entry_t *l3;
1283 pt_entry_t memattr;
1284 vm_offset_t va;
1285 pn_t pn;
1286
1287 KASSERT((pa & L3_OFFSET) == 0,
1288 	    ("pmap_kenter: Invalid physical address"));
1289 KASSERT((sva & L3_OFFSET) == 0,
1290 	    ("pmap_kenter: Invalid virtual address"));
1291 KASSERT((size & PAGE_MASK) == 0,
1292 	    ("pmap_kenter: Mapping is not page-sized"));
1293
1294 memattr = pmap_memattr_bits(mode);
1295 va = sva;
1296 while (size != 0) {
1297 l3 = pmap_l3(kernel_pmap, va);
1298 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1299
1300 pn = (pa / PAGE_SIZE);
1301 entry = PTE_KERN;
1302 entry |= memattr;
1303 entry |= (pn << PTE_PPN0_S);
1304 pmap_store(l3, entry);
1305
1306 va += PAGE_SIZE;
1307 pa += PAGE_SIZE;
1308 size -= PAGE_SIZE;
1309 }
1310 pmap_invalidate_range(kernel_pmap, sva, va);
1311 }
1312
1313 void
1314 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1315 {
1316 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
1317 }
1318
1319 /*
1320 * Remove a page from the kernel pagetables.
1321 * Note: not SMP coherent.
1322 */
1323 void
1324 pmap_kremove(vm_offset_t va)
1325 {
1326 pt_entry_t *l3;
1327
1328 l3 = pmap_l3(kernel_pmap, va);
1329 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
1330
1331 pmap_clear(l3);
1332 sfence_vma();
1333 }
1334
1335 void
1336 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1337 {
1338 pt_entry_t *l3;
1339 vm_offset_t va;
1340
1341 KASSERT((sva & L3_OFFSET) == 0,
1342 ("pmap_kremove_device: Invalid virtual address"));
1343 KASSERT((size & PAGE_MASK) == 0,
1344 ("pmap_kremove_device: Mapping is not page-sized"));
1345
1346 va = sva;
1347 while (size != 0) {
1348 l3 = pmap_l3(kernel_pmap, va);
1349 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1350 pmap_clear(l3);
1351
1352 va += PAGE_SIZE;
1353 size -= PAGE_SIZE;
1354 }
1355
1356 pmap_invalidate_range(kernel_pmap, sva, va);
1357 }
1358
1359 /*
1360 * Used to map a range of physical addresses into kernel
1361 * virtual address space.
1362 *
1363 * The value passed in '*virt' is a suggested virtual address for
1364 * the mapping. Architectures which can support a direct-mapped
1365 * physical to virtual region can return the appropriate address
1366 * within that region, leaving '*virt' unchanged. Other
1367 * architectures should map the pages starting at '*virt' and
1368 * update '*virt' with the first usable address after the mapped
1369 * region.
1370 */
1371 vm_offset_t
1372 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1373 {
1374
1375 return PHYS_TO_DMAP(start);
1376 }
1377
1378 /*
1379 * Add a list of wired pages to the kva.
1380 * This routine is only used for temporary
1381 * kernel mappings that do not need to have
1382 * page modification or references recorded.
1383 * Note that old mappings are simply written
1384 * over. The page *must* be wired.
1385 * Note: SMP coherent. Uses a ranged shootdown IPI.
1386 */
1387 void
1388 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1389 {
1390 pt_entry_t *l3;
1391 vm_paddr_t pa;
1392 vm_offset_t va;
1393 vm_page_t m;
1394 pt_entry_t entry;
1395 pn_t pn;
1396 int i;
1397
1398 va = sva;
1399 for (i = 0; i < count; i++) {
1400 m = ma[i];
1401 pa = VM_PAGE_TO_PHYS(m);
1402 pn = (pa / PAGE_SIZE);
1403 l3 = pmap_l3(kernel_pmap, va);
1404
1405 entry = PTE_KERN;
1406 entry |= pmap_memattr_bits(m->md.pv_memattr);
1407 entry |= (pn << PTE_PPN0_S);
1408 pmap_store(l3, entry);
1409
1410 va += L3_SIZE;
1411 }
1412 pmap_invalidate_range(kernel_pmap, sva, va);
1413 }
1414
1415 /*
1416 * This routine tears out page mappings from the
1417 * kernel -- it is meant only for temporary mappings.
1418 * Note: SMP coherent. Uses a ranged shootdown IPI.
1419 */
1420 void
1421 pmap_qremove(vm_offset_t sva, int count)
1422 {
1423 pt_entry_t *l3;
1424 vm_offset_t va;
1425
1426 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1427
1428 for (va = sva; count-- > 0; va += PAGE_SIZE) {
1429 l3 = pmap_l3(kernel_pmap, va);
1430 		KASSERT(l3 != NULL, ("pmap_qremove: Invalid address"));
1431 pmap_clear(l3);
1432 }
1433 pmap_invalidate_range(kernel_pmap, sva, va);
1434 }
1435
1436 bool
1437 pmap_ps_enabled(pmap_t pmap __unused)
1438 {
1439
1440 return (superpages_enabled);
1441 }
1442
1443 /***************************************************
1444 * Page table page management routines.....
1445 ***************************************************/
1446 /*
1447 * Schedule the specified unused page table page to be freed. Specifically,
1448 * add the page to the specified list of pages that will be released to the
1449 * physical memory manager after the TLB has been updated.
1450 */
1451 static __inline void
1452 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
1453 {
1454
1455 if (set_PG_ZERO)
1456 m->flags |= PG_ZERO;
1457 else
1458 m->flags &= ~PG_ZERO;
1459 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1460 }
1461
1462 /*
1463 * Inserts the specified page table page into the specified pmap's collection
1464 * of idle page table pages. Each of a pmap's page table pages is responsible
1465 * for mapping a distinct range of virtual addresses. The pmap's collection is
1466 * ordered by this virtual address range.
1467 *
1468 * If "promoted" is false, then the page table page "mpte" must be zero filled;
1469 * "mpte"'s valid field will be set to 0.
1470 *
1471 * If "promoted" is true and "all_l3e_PTE_A_set" is false, then "mpte" must
1472 * contain valid mappings with identical attributes except for PTE_A;
1473 * "mpte"'s valid field will be set to 1.
1474 *
1475 * If "promoted" and "all_l3e_PTE_A_set" are both true, then "mpte" must contain
1476 * valid mappings with identical attributes including PTE_A; "mpte"'s valid
1477 * field will be set to VM_PAGE_BITS_ALL.
1478 */
1479 static __inline int
1480 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
1481 bool all_l3e_PTE_A_set)
1482 {
1483
1484 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1485 KASSERT(promoted || !all_l3e_PTE_A_set,
1486 ("a zero-filled PTP can't have PTE_A set in every PTE"));
1487 mpte->valid = promoted ? (all_l3e_PTE_A_set ? VM_PAGE_BITS_ALL : 1) : 0;
1488 return (vm_radix_insert(&pmap->pm_root, mpte));
1489 }
1490
1491 /*
1492 * Removes the page table page mapping the specified virtual address from the
1493 * specified pmap's collection of idle page table pages, and returns it.
1494 * Otherwise, returns NULL if there is no page table page corresponding to the
1495 * specified virtual address.
1496 */
1497 static __inline vm_page_t
1498 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1499 {
1500
1501 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1502 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
1503 }
1504
1505 /*
1506 * Decrements a page table page's reference count, which is used to record the
1507 * number of valid page table entries within the page. If the reference count
1508 * drops to zero, then the page table page is unmapped. Returns true if the
1509 * page table page was unmapped and false otherwise.
1510 */
1511 static inline bool
1512 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1513 {
1514 KASSERT(m->ref_count > 0,
1515 ("%s: page %p ref count underflow", __func__, m));
1516
1517 --m->ref_count;
1518 if (m->ref_count == 0) {
1519 _pmap_unwire_ptp(pmap, va, m, free);
1520 return (true);
1521 } else {
1522 return (false);
1523 }
1524 }
1525
1526 static void
1527 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1528 {
1529
1530 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1531 if (m->pindex >= NUL2E + NUL1E) {
1532 pd_entry_t *l0;
1533 l0 = pmap_l0(pmap, va);
1534 pmap_clear(l0);
1535 } else if (m->pindex >= NUL2E) {
1536 pd_entry_t *l1;
1537 l1 = pmap_l1(pmap, va);
1538 pmap_clear(l1);
1539 pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
1540 } else {
1541 pd_entry_t *l2;
1542 l2 = pmap_l2(pmap, va);
1543 pmap_clear(l2);
1544 }
1545 pmap_resident_count_dec(pmap, 1);
1546 if (m->pindex < NUL2E) {
1547 pd_entry_t *l1;
1548 vm_page_t pdpg;
1549
1550 l1 = pmap_l1(pmap, va);
1551 pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1552 pmap_unwire_ptp(pmap, va, pdpg, free);
1553 } else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) {
1554 pd_entry_t *l0;
1555 vm_page_t pdpg;
1556
1557 l0 = pmap_l0(pmap, va);
1558 pdpg = PTE_TO_VM_PAGE(pmap_load(l0));
1559 pmap_unwire_ptp(pmap, va, pdpg, free);
1560 }
1561 pmap_invalidate_page(pmap, va);
1562
1563 vm_wire_sub(1);
1564
1565 /*
1566 * Put page on a list so that it is released after
1567 * *ALL* TLB shootdown is done
1568 */
1569 pmap_add_delayed_free_list(m, free, true);
1570 }
1571
1572 /*
1573 * After removing a page table entry, this routine is used to
1574 * conditionally free the page, and manage the reference count.
1575 */
1576 static int
1577 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1578 struct spglist *free)
1579 {
1580 vm_page_t mpte;
1581
1582 if (va >= VM_MAXUSER_ADDRESS)
1583 return (0);
1584 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1585 mpte = PTE_TO_VM_PAGE(ptepde);
1586 return (pmap_unwire_ptp(pmap, va, mpte, free));
1587 }
1588
1589 static uint64_t
1590 pmap_satp_mode(void)
1591 {
1592 return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48);
1593 }
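/*
 * Per the RISC-V privileged spec, the satp MODE field (bits 63:60) is 8 for
 * Sv39 and 9 for Sv48, so a root page table at physical address 0x80400000
 * under Sv39 yields pm_satp == SATP_MODE_SV39 | 0x80400 (the root PPN).
 */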
1594
1595 void
1596 pmap_pinit0(pmap_t pmap)
1597 {
1598 PMAP_LOCK_INIT(pmap);
1599 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1600 pmap->pm_stage = PM_STAGE1;
1601 pmap->pm_top = kernel_pmap->pm_top;
1602 pmap->pm_satp = pmap_satp_mode() |
1603 (vtophys(pmap->pm_top) >> PAGE_SHIFT);
1604 CPU_ZERO(&pmap->pm_active);
1605 TAILQ_INIT(&pmap->pm_pvchunk);
1606 vm_radix_init(&pmap->pm_root);
1607 pmap_activate_boot(pmap);
1608 }
1609
1610 int
1611 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage)
1612 {
1613 vm_paddr_t topphys;
1614 vm_page_t m;
1615 size_t i;
1616
1617 /*
1618 * The top-level directory is 4 pages in the hypervisor (stage 2) case.
1619 * The current address space layout leaves 3 of them unused.
1620 */
1621 if (stage == PM_STAGE1)
1622 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
1623 VM_ALLOC_WAITOK);
1624 else
1625 m = vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
1626 4, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT);
1627
1628 topphys = VM_PAGE_TO_PHYS(m);
1629 pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys);
1630 pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT);
1631 pmap->pm_stage = stage;
1632
1633 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1634
1635 CPU_ZERO(&pmap->pm_active);
1636
1637 if (stage == PM_STAGE2)
1638 goto finish;
1639
1640 if (pmap_mode == PMAP_MODE_SV39) {
1641 /*
1642 * Copy L1 entries from the kernel pmap. This must be done with
1643 * the allpmaps lock held to avoid races with
1644 * pmap_distribute_l1().
1645 */
1646 mtx_lock(&allpmaps_lock);
1647 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1648 for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS);
1649 i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++)
1650 pmap->pm_top[i] = kernel_pmap->pm_top[i];
1651 for (i = pmap_l1_index(DMAP_MIN_ADDRESS);
1652 i < pmap_l1_index(DMAP_MAX_ADDRESS); i++)
1653 pmap->pm_top[i] = kernel_pmap->pm_top[i];
1654 mtx_unlock(&allpmaps_lock);
1655 } else {
1656 i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS);
1657 pmap->pm_top[i] = kernel_pmap->pm_top[i];
1658 }
1659
1660 finish:
1661 TAILQ_INIT(&pmap->pm_pvchunk);
1662 vm_radix_init(&pmap->pm_root);
1663
1664 return (1);
1665 }
1666
1667 int
1668 pmap_pinit(pmap_t pmap)
1669 {
1670
1671 return (pmap_pinit_stage(pmap, PM_STAGE1));
1672 }
1673
1674 /*
1675 * This routine is called if the desired page table page does not exist.
1676 *
1677 * If page table page allocation fails, this routine may sleep before
1678 * returning NULL. It sleeps only if a lock pointer was given.
1679 *
1680 * Note: If a page allocation fails at page table level two or three,
1681 * one or two pages may be held during the wait, only to be released
1682 * afterwards. This conservative approach is easily argued to avoid
1683 * race conditions.
1684 */
1685 static vm_page_t
1686 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1687 {
1688 vm_page_t m, pdpg;
1689 pt_entry_t entry;
1690 vm_paddr_t phys;
1691 pn_t pn;
1692
1693 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1694
1695 /*
1696 * Allocate a page table page.
1697 */
1698 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1699 if (m == NULL) {
1700 if (lockp != NULL) {
1701 RELEASE_PV_LIST_LOCK(lockp);
1702 PMAP_UNLOCK(pmap);
1703 rw_runlock(&pvh_global_lock);
1704 vm_wait(NULL);
1705 rw_rlock(&pvh_global_lock);
1706 PMAP_LOCK(pmap);
1707 }
1708
1709 /*
1710 * Indicate the need to retry. While waiting, the page table
1711 * page may have been allocated.
1712 */
1713 return (NULL);
1714 }
1715 m->pindex = ptepindex;
1716
1717 /*
1718 * Map the pagetable page into the process address space, if
1719 * it isn't already there.
1720 */
1721 pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
1722 if (ptepindex >= NUL2E + NUL1E) {
1723 pd_entry_t *l0;
1724 vm_pindex_t l0index;
1725
1726 KASSERT(pmap_mode != PMAP_MODE_SV39,
1727 ("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
1728 KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
1729 ("%s: pindex %#lx out of range", __func__, ptepindex));
1730
1731 l0index = ptepindex - (NUL2E + NUL1E);
1732 l0 = &pmap->pm_top[l0index];
1733 KASSERT((pmap_load(l0) & PTE_V) == 0,
1734 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));
1735
1736 entry = PTE_V | (pn << PTE_PPN0_S);
1737 pmap_store(l0, entry);
1738 } else if (ptepindex >= NUL2E) {
1739 pd_entry_t *l0, *l1;
1740 vm_pindex_t l0index, l1index;
1741
1742 l1index = ptepindex - NUL2E;
1743 if (pmap_mode == PMAP_MODE_SV39) {
1744 l1 = &pmap->pm_top[l1index];
1745 } else {
1746 l0index = l1index >> Ln_ENTRIES_SHIFT;
1747 l0 = &pmap->pm_top[l0index];
1748 if (pmap_load(l0) == 0) {
1749 /* Recurse to allocate the L1 page. */
1750 if (_pmap_alloc_l3(pmap,
1751 NUL2E + NUL1E + l0index, lockp) == NULL)
1752 goto fail;
1753 phys = PTE_TO_PHYS(pmap_load(l0));
1754 } else {
1755 phys = PTE_TO_PHYS(pmap_load(l0));
1756 pdpg = PHYS_TO_VM_PAGE(phys);
1757 pdpg->ref_count++;
1758 }
1759 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1760 l1 = &l1[ptepindex & Ln_ADDR_MASK];
1761 }
1762 KASSERT((pmap_load(l1) & PTE_V) == 0,
1763 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
1764
1765 entry = PTE_V | (pn << PTE_PPN0_S);
1766 pmap_store(l1, entry);
1767 pmap_distribute_l1(pmap, l1index, entry);
1768 } else {
1769 vm_pindex_t l0index, l1index;
1770 pd_entry_t *l0, *l1, *l2;
1771
1772 l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
1773 if (pmap_mode == PMAP_MODE_SV39) {
1774 l1 = &pmap->pm_top[l1index];
1775 if (pmap_load(l1) == 0) {
1776 /* recurse for allocating page dir */
1777 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1778 lockp) == NULL)
1779 goto fail;
1780 } else {
1781 pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1782 pdpg->ref_count++;
1783 }
1784 } else {
1785 l0index = l1index >> Ln_ENTRIES_SHIFT;
1786 l0 = &pmap->pm_top[l0index];
1787 if (pmap_load(l0) == 0) {
1788 /* Recurse to allocate the L1 entry. */
1789 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1790 lockp) == NULL)
1791 goto fail;
1792 phys = PTE_TO_PHYS(pmap_load(l0));
1793 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1794 l1 = &l1[l1index & Ln_ADDR_MASK];
1795 } else {
1796 phys = PTE_TO_PHYS(pmap_load(l0));
1797 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1798 l1 = &l1[l1index & Ln_ADDR_MASK];
1799 if (pmap_load(l1) == 0) {
1800 /* Recurse to allocate the L2 page. */
1801 if (_pmap_alloc_l3(pmap,
1802 NUL2E + l1index, lockp) == NULL)
1803 goto fail;
1804 } else {
1805 pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1806 pdpg->ref_count++;
1807 }
1808 }
1809 }
1810
1811 phys = PTE_TO_PHYS(pmap_load(l1));
1812 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1813 l2 = &l2[ptepindex & Ln_ADDR_MASK];
1814 KASSERT((pmap_load(l2) & PTE_V) == 0,
1815 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
1816
1817 entry = PTE_V | (pn << PTE_PPN0_S);
1818 pmap_store(l2, entry);
1819 }
1820
1821 pmap_resident_count_inc(pmap, 1);
1822
1823 return (m);
1824
1825 fail:
1826 vm_page_unwire_noq(m);
1827 vm_page_free_zero(m);
1828 return (NULL);
1829 }
1830
1831 static vm_page_t
1832 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1833 {
1834 pd_entry_t *l1;
1835 vm_page_t l2pg;
1836 vm_pindex_t pindex;
1837
1838 retry:
1839 l1 = pmap_l1(pmap, va);
1840 if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
1841 KASSERT((pmap_load(l1) & PTE_RWX) == 0,
1842 ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
1843 pmap_load(l1), va));
1844 /* Add a reference to the L2 page. */
1845 l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
1846 l2pg->ref_count++;
1847 } else {
1848 /* Allocate a L2 page. */
1849 pindex = pmap_l1_pindex(va);
1850 l2pg = _pmap_alloc_l3(pmap, pindex, lockp);
1851 if (l2pg == NULL && lockp != NULL)
1852 goto retry;
1853 }
1854 return (l2pg);
1855 }
1856
1857 static vm_page_t
1858 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1859 {
1860 vm_pindex_t ptepindex;
1861 pd_entry_t *l2;
1862 vm_page_t m;
1863
1864 /*
1865 * Calculate pagetable page index
1866 */
1867 ptepindex = pmap_l2_pindex(va);
1868 retry:
1869 /*
1870 * Get the page directory entry
1871 */
1872 l2 = pmap_l2(pmap, va);
1873
1874 /*
1875 * If the page table page is mapped, we just increment the
1876 * hold count, and activate it.
1877 */
1878 if (l2 != NULL && pmap_load(l2) != 0) {
1879 m = PTE_TO_VM_PAGE(pmap_load(l2));
1880 m->ref_count++;
1881 } else {
1882 /*
1883 * Here if the pte page isn't mapped, or if it has been
1884 * deallocated.
1885 */
1886 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1887 if (m == NULL && lockp != NULL)
1888 goto retry;
1889 }
1890 return (m);
1891 }
1892
1893 /***************************************************
1894 * Pmap allocation/deallocation routines.
1895 ***************************************************/
1896
1897 /*
1898 * Release any resources held by the given physical map.
1899 * Called when a pmap initialized by pmap_pinit is being released.
1900 * Should only be called if the map contains no valid mappings.
1901 */
1902 void
1903 pmap_release(pmap_t pmap)
1904 {
1905 vm_page_t m;
1906 int npages;
1907 int i;
1908
1909 KASSERT(pmap->pm_stats.resident_count == 0,
1910 ("pmap_release: pmap resident count %ld != 0",
1911 pmap->pm_stats.resident_count));
1912 KASSERT(CPU_EMPTY(&pmap->pm_active),
1913 ("releasing active pmap %p", pmap));
1914
1915 if (pmap->pm_stage == PM_STAGE2)
1916 goto finish;
1917
1918 if (pmap_mode == PMAP_MODE_SV39) {
1919 mtx_lock(&allpmaps_lock);
1920 LIST_REMOVE(pmap, pm_list);
1921 mtx_unlock(&allpmaps_lock);
1922 }
1923
1924 finish:
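/*
 * A stage 2 root page table spans four contiguous pages, so all of them
 * must be released; a stage 1 root occupies a single page.
 */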
1925 npages = pmap->pm_stage == PM_STAGE2 ? 4 : 1;
1926 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
1927 for (i = 0; i < npages; i++) {
1928 vm_page_unwire_noq(m);
1929 vm_page_free(m);
1930 m++;
1931 }
1932 }
1933
1934 static int
1935 kvm_size(SYSCTL_HANDLER_ARGS)
1936 {
1937 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1938
1939 return sysctl_handle_long(oidp, &ksize, 0, req);
1940 }
1941 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1942 0, 0, kvm_size, "LU",
1943 "Size of KVM");
1944
1945 static int
1946 kvm_free(SYSCTL_HANDLER_ARGS)
1947 {
1948 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1949
1950 return sysctl_handle_long(oidp, &kfree, 0, req);
1951 }
1952 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1953 0, 0, kvm_free, "LU",
1954 "Amount of KVM free");
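/*
 * Both values are exported read-only and can be inspected from userland,
 * e.g. (illustrative output only):
 *   $ sysctl vm.kvm_size vm.kvm_free
 *   vm.kvm_size: 137438953472
 *   vm.kvm_free: 137436856320
 */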
1955
1956 /*
1957 * grow the number of kernel page table entries, if needed
1958 */
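/*
 * The loop below advances kernel_vm_end in 2MB (L2_SIZE) steps: it first
 * ensures that a valid L1 entry exists for the address, allocating a new
 * L2 directory page if needed, and then installs a fresh L3 page table
 * for the 2MB region unless one is already present.
 */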
1959 static int
1960 pmap_growkernel_nopanic(vm_offset_t addr)
1961 {
1962 vm_paddr_t paddr;
1963 vm_page_t nkpg;
1964 pd_entry_t *l1, *l2;
1965 pt_entry_t entry;
1966 pn_t pn;
1967
1968 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1969
1970 addr = roundup2(addr, L2_SIZE);
1971 if (addr - 1 >= vm_map_max(kernel_map))
1972 addr = vm_map_max(kernel_map);
1973 while (kernel_vm_end < addr) {
1974 l1 = pmap_l1(kernel_pmap, kernel_vm_end);
1975 if (pmap_load(l1) == 0) {
1976 /* We need a new PDP entry */
1977 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
1978 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1979 if (nkpg == NULL)
1980 return (KERN_RESOURCE_SHORTAGE);
1981
1982 nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
1983 paddr = VM_PAGE_TO_PHYS(nkpg);
1984
1985 pn = (paddr / PAGE_SIZE);
1986 entry = (PTE_V);
1987 entry |= (pn << PTE_PPN0_S);
1988 pmap_store(l1, entry);
1989 pmap_distribute_l1(kernel_pmap,
1990 pmap_l1_index(kernel_vm_end), entry);
1991 continue; /* try again */
1992 }
1993 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1994 if ((pmap_load(l2) & PTE_V) != 0 &&
1995 (pmap_load(l2) & PTE_RWX) == 0) {
1996 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1997 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1998 kernel_vm_end = vm_map_max(kernel_map);
1999 break;
2000 }
2001 continue;
2002 }
2003
2004 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2005 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2006 if (nkpg == NULL)
2007 return (KERN_RESOURCE_SHORTAGE);
2008 nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
2009 paddr = VM_PAGE_TO_PHYS(nkpg);
2010
2011 pn = (paddr / PAGE_SIZE);
2012 entry = (PTE_V);
2013 entry |= (pn << PTE_PPN0_S);
2014 pmap_store(l2, entry);
2015
2016 pmap_invalidate_page(kernel_pmap, kernel_vm_end);
2017
2018 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2019 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2020 kernel_vm_end = vm_map_max(kernel_map);
2021 break;
2022 }
2023 }
2024
2025 return (KERN_SUCCESS);
2026 }
2027
2028 int
2029 pmap_growkernel(vm_offset_t addr)
2030 {
2031 int rv;
2032
2033 rv = pmap_growkernel_nopanic(addr);
2034 if (rv != KERN_SUCCESS && pmap_growkernel_panic)
2035 panic("pmap_growkernel: no memory to grow kernel");
2036 return (rv);
2037 }
2038
2039 /***************************************************
2040 * page management routines.
2041 ***************************************************/
2042
2043 static const uint64_t pc_freemask[_NPCM] = {
2044 [0 ... _NPCM - 2] = PC_FREEN,
2045 [_NPCM - 1] = PC_FREEL
2046 };
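/*
 * A chunk with this bitmap is completely free: every pc_map word except
 * the last is PC_FREEN (all entries free), while the final word uses
 * PC_FREEL, which presumably sets only the bits that correspond to
 * entries actually present in a chunk.
 */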
2047
2048 #ifdef PV_STATS
2049 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2050
2051 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2052 "Current number of pv entry chunks");
2053 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2054 "Current number of pv entry chunks allocated");
2055 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2056 "Current number of pv entry chunk frees");
2057 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2058 "Number of times a chunk page allocation was attempted but failed.");
2059
2060 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2061 static int pv_entry_spare;
2062
2063 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2064 "Current number of pv entry frees");
2065 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2066 "Current number of pv entry allocs");
2067 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2068 "Current number of pv entries");
2069 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2070 "Current number of spare pv entries");
2071 #endif
2072
2073 /*
2074 * We are in a serious low memory condition. Resort to
2075 * drastic measures to free some pages so we can allocate
2076 * another pv entry chunk.
2077 *
2078 * Returns NULL if PV entries were reclaimed from the specified pmap.
2079 *
2080 * We do not, however, unmap 2mpages because subsequent accesses will
2081 * allocate per-page pv entries until repromotion occurs, thereby
2082 * exacerbating the shortage of free pv entries.
2083 */
2084 static vm_page_t
2085 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2086 {
2087
2088 panic("RISCVTODO: reclaim_pv_chunk");
2089 }
2090
2091 /*
2092 * free the pv_entry back to the free list
2093 */
2094 static void
2095 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2096 {
2097 struct pv_chunk *pc;
2098 int idx, field, bit;
2099
2100 rw_assert(&pvh_global_lock, RA_LOCKED);
2101 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2102 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2103 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2104 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2105 pc = pv_to_chunk(pv);
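/* Mark the entry free again in the chunk's 64-bit-per-word bitmap. */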
2106 idx = pv - &pc->pc_pventry[0];
2107 field = idx / 64;
2108 bit = idx % 64;
2109 pc->pc_map[field] |= 1ul << bit;
2110 if (!pc_is_free(pc)) {
2111 /* 98% of the time, pc is already at the head of the list. */
2112 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2113 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2114 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2115 }
2116 return;
2117 }
2118 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2119 free_pv_chunk(pc);
2120 }
2121
2122 static void
2123 free_pv_chunk(struct pv_chunk *pc)
2124 {
2125 vm_page_t m;
2126
2127 mtx_lock(&pv_chunks_mutex);
2128 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2129 mtx_unlock(&pv_chunks_mutex);
2130 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2131 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2132 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2133 /* entire chunk is free, return it */
2134 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2135 dump_drop_page(m->phys_addr);
2136 vm_page_unwire_noq(m);
2137 vm_page_free(m);
2138 }
2139
2140 /*
2141 * Returns a new PV entry, allocating a new PV chunk from the system when
2142 * needed. If this PV chunk allocation fails and a PV list lock pointer was
2143 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
2144 * returned.
2145 *
2146 * The given PV list lock may be released.
2147 */
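/*
 * Each chunk carries _NPCPV entries; a set bit in pc_map[] marks a free
 * entry, so allocation clears a bit and freeing sets it.
 */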
2148 static pv_entry_t
2149 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2150 {
2151 int bit, field;
2152 pv_entry_t pv;
2153 struct pv_chunk *pc;
2154 vm_page_t m;
2155
2156 rw_assert(&pvh_global_lock, RA_LOCKED);
2157 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2158 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2159 retry:
2160 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2161 if (pc != NULL) {
2162 for (field = 0; field < _NPCM; field++) {
2163 if (pc->pc_map[field]) {
2164 bit = ffsl(pc->pc_map[field]) - 1;
2165 break;
2166 }
2167 }
2168 if (field < _NPCM) {
2169 pv = &pc->pc_pventry[field * 64 + bit];
2170 pc->pc_map[field] &= ~(1ul << bit);
2171 /* If this was the last item, move it to tail */
2172 if (pc_is_full(pc)) {
2173 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2174 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2175 pc_list);
2176 }
2177 PV_STAT(atomic_add_long(&pv_entry_count, 1));
2178 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2179 return (pv);
2180 }
2181 }
2182 /* No free items, allocate another chunk */
2183 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2184 if (m == NULL) {
2185 if (lockp == NULL) {
2186 PV_STAT(pc_chunk_tryfail++);
2187 return (NULL);
2188 }
2189 m = reclaim_pv_chunk(pmap, lockp);
2190 if (m == NULL)
2191 goto retry;
2192 }
2193 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2194 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2195 dump_add_page(m->phys_addr);
2196 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2197 pc->pc_pmap = pmap;
2198 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */
2199 pc->pc_map[1] = PC_FREEN;
2200 pc->pc_map[2] = PC_FREEL;
2201 mtx_lock(&pv_chunks_mutex);
2202 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2203 mtx_unlock(&pv_chunks_mutex);
2204 pv = &pc->pc_pventry[0];
2205 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2206 PV_STAT(atomic_add_long(&pv_entry_count, 1));
2207 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2208 return (pv);
2209 }
2210
2211 /*
2212 * Ensure that the number of spare PV entries in the specified pmap meets or
2213 * exceeds the given count, "needed".
2214 *
2215 * The given PV list lock may be released.
2216 */
2217 static void
2218 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2219 {
2220 struct pch new_tail;
2221 struct pv_chunk *pc;
2222 vm_page_t m;
2223 int avail, free;
2224 bool reclaimed;
2225
2226 rw_assert(&pvh_global_lock, RA_LOCKED);
2227 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2228 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2229
2230 /*
2231 * Newly allocated PV chunks must be stored in a private list until
2232 * the required number of PV chunks have been allocated. Otherwise,
2233 * reclaim_pv_chunk() could recycle one of these chunks. In
2234 * contrast, these chunks must be added to the pmap upon allocation.
2235 */
2236 TAILQ_INIT(&new_tail);
2237 retry:
2238 avail = 0;
2239 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2240 bit_count((bitstr_t *)pc->pc_map, 0,
2241 sizeof(pc->pc_map) * NBBY, &free);
2242 if (free == 0)
2243 break;
2244 avail += free;
2245 if (avail >= needed)
2246 break;
2247 }
2248 for (reclaimed = false; avail < needed; avail += _NPCPV) {
2249 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2250 if (m == NULL) {
2251 m = reclaim_pv_chunk(pmap, lockp);
2252 if (m == NULL)
2253 goto retry;
2254 reclaimed = true;
2255 }
2256 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2257 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2258 dump_add_page(m->phys_addr);
2259 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2260 pc->pc_pmap = pmap;
2261 pc->pc_map[0] = PC_FREEN;
2262 pc->pc_map[1] = PC_FREEN;
2263 pc->pc_map[2] = PC_FREEL;
2264 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2265 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2266 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2267
2268 /*
2269 * The reclaim might have freed a chunk from the current pmap.
2270 * If that chunk contained available entries, we need to
2271 * re-count the number of available entries.
2272 */
2273 if (reclaimed)
2274 goto retry;
2275 }
2276 if (!TAILQ_EMPTY(&new_tail)) {
2277 mtx_lock(&pv_chunks_mutex);
2278 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2279 mtx_unlock(&pv_chunks_mutex);
2280 }
2281 }
2282
2283 /*
2284 * First find and then remove the pv entry for the specified pmap and virtual
2285 * address from the specified pv list. Returns the pv entry if found and NULL
2286 * otherwise. This operation can be performed on pv lists for either 4KB or
2287 * 2MB page mappings.
2288 */
2289 static __inline pv_entry_t
2290 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2291 {
2292 pv_entry_t pv;
2293
2294 rw_assert(&pvh_global_lock, RA_LOCKED);
2295 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2296 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2297 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2298 pvh->pv_gen++;
2299 break;
2300 }
2301 }
2302 return (pv);
2303 }
2304
2305 /*
2306 * First find and then destroy the pv entry for the specified pmap and virtual
2307 * address. This operation can be performed on pv lists for either 4KB or 2MB
2308 * page mappings.
2309 */
2310 static void
2311 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2312 {
2313 pv_entry_t pv;
2314
2315 pv = pmap_pvh_remove(pvh, pmap, va);
2316
2317 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
2318 free_pv_entry(pmap, pv);
2319 }
2320
2321 /*
2322 * Conditionally create the PV entry for a 4KB page mapping if the required
2323 * memory can be allocated without resorting to reclamation.
2324 */
2325 static bool
2326 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2327 struct rwlock **lockp)
2328 {
2329 pv_entry_t pv;
2330
2331 rw_assert(&pvh_global_lock, RA_LOCKED);
2332 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2333 /* Pass NULL instead of the lock pointer to disable reclamation. */
2334 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2335 pv->pv_va = va;
2336 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2337 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2338 m->md.pv_gen++;
2339 return (true);
2340 } else
2341 return (false);
2342 }
2343
2344 /*
2345 * After demotion from a 2MB page mapping to 512 4KB page mappings,
2346 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2347 * entries for each of the 4KB page mappings.
2348 */
2349 static void __unused
2350 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2351 struct rwlock **lockp)
2352 {
2353 struct md_page *pvh;
2354 struct pv_chunk *pc;
2355 pv_entry_t pv;
2356 vm_page_t m;
2357 vm_offset_t va_last;
2358 int bit, field;
2359
2360 rw_assert(&pvh_global_lock, RA_LOCKED);
2361 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2362 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2363
2364 /*
2365 * Transfer the 2mpage's pv entry for this mapping to the first
2366 * page's pv list. Once this transfer begins, the pv list lock
2367 * must not be released until the last pv entry is reinstantiated.
2368 */
2369 pvh = pa_to_pvh(pa);
2370 va &= ~L2_OFFSET;
2371 pv = pmap_pvh_remove(pvh, pmap, va);
2372 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2373 m = PHYS_TO_VM_PAGE(pa);
2374 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2375 m->md.pv_gen++;
2376 /* Instantiate the remaining 511 pv entries. */
2377 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2378 va_last = va + L2_SIZE - PAGE_SIZE;
2379 for (;;) {
2380 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2381 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
2382 for (field = 0; field < _NPCM; field++) {
2383 while (pc->pc_map[field] != 0) {
2384 bit = ffsl(pc->pc_map[field]) - 1;
2385 pc->pc_map[field] &= ~(1ul << bit);
2386 pv = &pc->pc_pventry[field * 64 + bit];
2387 va += PAGE_SIZE;
2388 pv->pv_va = va;
2389 m++;
2390 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2391 ("pmap_pv_demote_l2: page %p is not managed", m));
2392 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2393 m->md.pv_gen++;
2394 if (va == va_last)
2395 goto out;
2396 }
2397 }
2398 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2399 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2400 }
2401 out:
2402 if (pc_is_full(pc)) {
2403 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2404 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2405 }
2406 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2407 PV_STAT(atomic_add_int(&pv_entry_spare, -(Ln_ENTRIES - 1)));
2408 }
2409
2410 #if VM_NRESERVLEVEL > 0
2411 static void
2412 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2413 struct rwlock **lockp)
2414 {
2415 struct md_page *pvh;
2416 pv_entry_t pv;
2417 vm_page_t m;
2418 vm_offset_t va_last;
2419
2420 rw_assert(&pvh_global_lock, RA_LOCKED);
2421 KASSERT((pa & L2_OFFSET) == 0,
2422 ("pmap_pv_promote_l2: misaligned pa %#lx", pa));
2423
2424 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2425
2426 m = PHYS_TO_VM_PAGE(pa);
2427 va = va & ~L2_OFFSET;
2428 pv = pmap_pvh_remove(&m->md, pmap, va);
2429 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
2430 pvh = pa_to_pvh(pa);
2431 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2432 pvh->pv_gen++;
2433
2434 va_last = va + L2_SIZE - PAGE_SIZE;
2435 do {
2436 m++;
2437 va += PAGE_SIZE;
2438 pmap_pvh_free(&m->md, pmap, va);
2439 } while (va < va_last);
2440 }
2441 #endif /* VM_NRESERVLEVEL > 0 */
2442
2443 /*
2444 * Create the PV entry for a 2MB page mapping. Always returns true unless the
2445 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
2446 * false if the PV entry cannot be allocated without resorting to reclamation.
2447 */
2448 static bool
2449 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2450 struct rwlock **lockp)
2451 {
2452 struct md_page *pvh;
2453 pv_entry_t pv;
2454 vm_paddr_t pa;
2455
2456 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2457 /* Pass NULL instead of the lock pointer to disable reclamation. */
2458 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2459 NULL : lockp)) == NULL)
2460 return (false);
2461 pv->pv_va = va;
2462 pa = PTE_TO_PHYS(l2e);
2463 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2464 pvh = pa_to_pvh(pa);
2465 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2466 pvh->pv_gen++;
2467 return (true);
2468 }
2469
2470 static void
2471 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
2472 {
2473 pt_entry_t newl2, oldl2 __diagused;
2474 vm_page_t ml3;
2475 vm_paddr_t ml3pa;
2476
2477 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
2478 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
2479 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2480
2481 ml3 = pmap_remove_pt_page(pmap, va);
2482 if (ml3 == NULL)
2483 panic("pmap_remove_kernel_l2: Missing pt page");
2484
2485 ml3pa = VM_PAGE_TO_PHYS(ml3);
2486 newl2 = ml3pa | PTE_V;
2487
2488 /*
2489 * If this page table page was unmapped by a promotion, then it
2490 * contains valid mappings. Zero it to invalidate those mappings.
2491 */
2492 if (vm_page_any_valid(ml3))
2493 pagezero((void *)PHYS_TO_DMAP(ml3pa));
2494
2495 /*
2496 * Demote the mapping.
2497 */
2498 oldl2 = pmap_load_store(l2, newl2);
2499 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2500 __func__, l2, oldl2));
2501 }
2502
2503 /*
2504 * pmap_remove_l2: Unmap a 2MB (level 2) superpage mapping.
2505 */
2506 static int
2507 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2508 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2509 {
2510 struct md_page *pvh;
2511 pt_entry_t oldl2;
2512 vm_offset_t eva, va;
2513 vm_page_t m, ml3;
2514
2515 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2516 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2517 oldl2 = pmap_load_clear(l2);
2518 KASSERT((oldl2 & PTE_RWX) != 0,
2519 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
2520
2521 /*
2522 * The sfence.vma documentation states that it is sufficient to specify
2523 * a single address within a superpage mapping. However, since we do
2524 * not perform any invalidation upon promotion, TLBs may still be
2525 * caching 4KB mappings within the superpage, so we must invalidate the
2526 * entire range.
2527 */
2528 pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
2529 if ((oldl2 & PTE_SW_WIRED) != 0)
2530 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2531 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2532 if ((oldl2 & PTE_SW_MANAGED) != 0) {
2533 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
2534 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
2535 pmap_pvh_free(pvh, pmap, sva);
2536 eva = sva + L2_SIZE;
2537 for (va = sva, m = PTE_TO_VM_PAGE(oldl2);
2538 va < eva; va += PAGE_SIZE, m++) {
2539 if ((oldl2 & PTE_D) != 0)
2540 vm_page_dirty(m);
2541 if ((oldl2 & PTE_A) != 0)
2542 vm_page_aflag_set(m, PGA_REFERENCED);
2543 if (TAILQ_EMPTY(&m->md.pv_list) &&
2544 TAILQ_EMPTY(&pvh->pv_list))
2545 vm_page_aflag_clear(m, PGA_WRITEABLE);
2546 }
2547 }
2548 if (pmap == kernel_pmap) {
2549 pmap_remove_kernel_l2(pmap, l2, sva);
2550 } else {
2551 ml3 = pmap_remove_pt_page(pmap, sva);
2552 if (ml3 != NULL) {
2553 KASSERT(vm_page_any_valid(ml3),
2554 ("pmap_remove_l2: l3 page not promoted"));
2555 pmap_resident_count_dec(pmap, 1);
2556 KASSERT(ml3->ref_count == Ln_ENTRIES,
2557 ("pmap_remove_l2: l3 page ref count error"));
2558 ml3->ref_count = 1;
2559 vm_page_unwire_noq(ml3);
2560 pmap_add_delayed_free_list(ml3, free, false);
2561 }
2562 }
2563 return (pmap_unuse_pt(pmap, sva, l1e, free));
2564 }
2565
2566 /*
2567 * pmap_remove_l3: Unmap a single 4KB page mapping from a pmap.
2568 */
2569 static int
2570 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2571 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2572 {
2573 struct md_page *pvh;
2574 pt_entry_t old_l3;
2575 vm_page_t m;
2576
2577 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2578 old_l3 = pmap_load_clear(l3);
2579 pmap_invalidate_page(pmap, va);
2580 if (old_l3 & PTE_SW_WIRED)
2581 pmap->pm_stats.wired_count -= 1;
2582 pmap_resident_count_dec(pmap, 1);
2583 if (old_l3 & PTE_SW_MANAGED) {
2584 m = PTE_TO_VM_PAGE(old_l3);
2585 if ((old_l3 & PTE_D) != 0)
2586 vm_page_dirty(m);
2587 if (old_l3 & PTE_A)
2588 vm_page_aflag_set(m, PGA_REFERENCED);
2589 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2590 pmap_pvh_free(&m->md, pmap, va);
2591 if (TAILQ_EMPTY(&m->md.pv_list) &&
2592 (m->flags & PG_FICTITIOUS) == 0) {
2593 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2594 if (TAILQ_EMPTY(&pvh->pv_list))
2595 vm_page_aflag_clear(m, PGA_WRITEABLE);
2596 }
2597 }
2598
2599 return (pmap_unuse_pt(pmap, va, l2e, free));
2600 }
2601
2602 /*
2603 * Remove the given range of addresses from the specified map.
2604 *
2605 * It is assumed that the start and end are properly
2606 * rounded to the page size.
2607 */
2608 void
2609 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2610 {
2611 struct spglist free;
2612 struct rwlock *lock;
2613 vm_offset_t va, va_next;
2614 pd_entry_t *l0, *l1, *l2, l2e;
2615 pt_entry_t *l3;
2616
2617 /*
2618 * Perform an unsynchronized read. This is, however, safe.
2619 */
2620 if (pmap->pm_stats.resident_count == 0)
2621 return;
2622
2623 SLIST_INIT(&free);
2624
2625 rw_rlock(&pvh_global_lock);
2626 PMAP_LOCK(pmap);
2627
2628 lock = NULL;
2629 for (; sva < eva; sva = va_next) {
2630 if (pmap->pm_stats.resident_count == 0)
2631 break;
2632
2633 if (pmap_mode == PMAP_MODE_SV48) {
2634 l0 = pmap_l0(pmap, sva);
2635 if (pmap_load(l0) == 0) {
2636 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2637 if (va_next < sva)
2638 va_next = eva;
2639 continue;
2640 }
2641 l1 = pmap_l0_to_l1(l0, sva);
2642 } else {
2643 l1 = pmap_l1(pmap, sva);
2644 }
2645
2646 if (pmap_load(l1) == 0) {
2647 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2648 if (va_next < sva)
2649 va_next = eva;
2650 continue;
2651 }
2652
2653 /*
2654 * Calculate index for next page table.
2655 */
2656 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2657 if (va_next < sva)
2658 va_next = eva;
2659
2660 l2 = pmap_l1_to_l2(l1, sva);
2661 if ((l2e = pmap_load(l2)) == 0)
2662 continue;
2663 if ((l2e & PTE_RWX) != 0) {
2664 if (sva + L2_SIZE == va_next && eva >= va_next) {
2665 (void)pmap_remove_l2(pmap, l2, sva,
2666 pmap_load(l1), &free, &lock);
2667 continue;
2668 } else if (!pmap_demote_l2_locked(pmap, l2, sva,
2669 &lock)) {
2670 /*
2671 * The large page mapping was destroyed.
2672 */
2673 continue;
2674 }
2675 l2e = pmap_load(l2);
2676 }
2677
2678 /*
2679 * Limit our scan to either the end of the va represented
2680 * by the current page table page, or to the end of the
2681 * range being removed.
2682 */
2683 if (va_next > eva)
2684 va_next = eva;
2685
2686 va = va_next;
2687 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2688 sva += L3_SIZE) {
2689 if (pmap_load(l3) == 0) {
2690 if (va != va_next) {
2691 pmap_invalidate_range(pmap, va, sva);
2692 va = va_next;
2693 }
2694 continue;
2695 }
2696 if (va == va_next)
2697 va = sva;
2698 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
2699 sva += L3_SIZE;
2700 break;
2701 }
2702 }
2703 if (va != va_next)
2704 pmap_invalidate_range(pmap, va, sva);
2705 }
2706 if (lock != NULL)
2707 rw_wunlock(lock);
2708 rw_runlock(&pvh_global_lock);
2709 PMAP_UNLOCK(pmap);
2710 vm_page_free_pages_toq(&free, false);
2711 }
2712
2713 /*
2714 * Routine: pmap_remove_all
2715 * Function:
2716 * Removes this physical page from
2717 * all physical maps in which it resides.
2718 * Reflects back modify bits to the pager.
2719 *
2720 * Notes:
2721 * Original versions of this routine were very
2722 * inefficient because they iteratively called
2723 * pmap_remove (slow...)
2724 */
2725
2726 void
2727 pmap_remove_all(vm_page_t m)
2728 {
2729 struct spglist free;
2730 struct md_page *pvh;
2731 pmap_t pmap;
2732 pt_entry_t *l3, l3e;
2733 pd_entry_t *l2, l2e __diagused;
2734 pv_entry_t pv;
2735 vm_offset_t va;
2736
2737 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2738 ("pmap_remove_all: page %p is not managed", m));
2739 SLIST_INIT(&free);
2740 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2741 pa_to_pvh(VM_PAGE_TO_PHYS(m));
2742
2743 rw_wlock(&pvh_global_lock);
2744 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2745 pmap = PV_PMAP(pv);
2746 PMAP_LOCK(pmap);
2747 va = pv->pv_va;
2748 l2 = pmap_l2(pmap, va);
2749 (void)pmap_demote_l2(pmap, l2, va);
2750 PMAP_UNLOCK(pmap);
2751 }
2752 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2753 pmap = PV_PMAP(pv);
2754 PMAP_LOCK(pmap);
2755 pmap_resident_count_dec(pmap, 1);
2756 l2 = pmap_l2(pmap, pv->pv_va);
2757 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
2758 l2e = pmap_load(l2);
2759
2760 KASSERT((l2e & PTE_RX) == 0,
2761 ("pmap_remove_all: found a superpage in %p's pv list", m));
2762
2763 l3 = pmap_l2_to_l3(l2, pv->pv_va);
2764 l3e = pmap_load_clear(l3);
2765 pmap_invalidate_page(pmap, pv->pv_va);
2766 if (l3e & PTE_SW_WIRED)
2767 pmap->pm_stats.wired_count--;
2768 if ((l3e & PTE_A) != 0)
2769 vm_page_aflag_set(m, PGA_REFERENCED);
2770
2771 /*
2772 * Update the vm_page_t clean and reference bits.
2773 */
2774 if ((l3e & PTE_D) != 0)
2775 vm_page_dirty(m);
2776 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
2777 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2778 m->md.pv_gen++;
2779 free_pv_entry(pmap, pv);
2780 PMAP_UNLOCK(pmap);
2781 }
2782 vm_page_aflag_clear(m, PGA_WRITEABLE);
2783 rw_wunlock(&pvh_global_lock);
2784 vm_page_free_pages_toq(&free, false);
2785 }
2786
2787 /*
2788 * Set the physical protection on the
2789 * specified range of this map as requested.
2790 */
2791 void
2792 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2793 {
2794 pd_entry_t *l0, *l1, *l2, l2e;
2795 pt_entry_t *l3, l3e, mask;
2796 vm_page_t m, mt;
2797 vm_offset_t va_next;
2798 bool anychanged, pv_lists_locked;
2799
2800 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2801 pmap_remove(pmap, sva, eva);
2802 return;
2803 }
2804
2805 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2806 (VM_PROT_WRITE | VM_PROT_EXECUTE))
2807 return;
2808
2809 anychanged = false;
2810 pv_lists_locked = false;
2811 mask = 0;
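/*
 * When write permission is revoked, PTE_D is cleared as well; any dirty
 * state is transferred to the vm_page below before the PTE is rewritten.
 */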
2812 if ((prot & VM_PROT_WRITE) == 0)
2813 mask |= PTE_W | PTE_D;
2814 if ((prot & VM_PROT_EXECUTE) == 0)
2815 mask |= PTE_X;
2816 resume:
2817 PMAP_LOCK(pmap);
2818 for (; sva < eva; sva = va_next) {
2819 if (pmap_mode == PMAP_MODE_SV48) {
2820 l0 = pmap_l0(pmap, sva);
2821 if (pmap_load(l0) == 0) {
2822 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2823 if (va_next < sva)
2824 va_next = eva;
2825 continue;
2826 }
2827 l1 = pmap_l0_to_l1(l0, sva);
2828 } else {
2829 l1 = pmap_l1(pmap, sva);
2830 }
2831
2832 if (pmap_load(l1) == 0) {
2833 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2834 if (va_next < sva)
2835 va_next = eva;
2836 continue;
2837 }
2838
2839 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2840 if (va_next < sva)
2841 va_next = eva;
2842
2843 l2 = pmap_l1_to_l2(l1, sva);
2844 if ((l2e = pmap_load(l2)) == 0)
2845 continue;
2846 if ((l2e & PTE_RWX) != 0) {
2847 if (sva + L2_SIZE == va_next && eva >= va_next) {
2848 retryl2:
2849 if ((prot & VM_PROT_WRITE) == 0 &&
2850 (l2e & (PTE_SW_MANAGED | PTE_D)) ==
2851 (PTE_SW_MANAGED | PTE_D)) {
2852 m = PTE_TO_VM_PAGE(l2e);
2853 for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
2854 vm_page_dirty(mt);
2855 }
2856 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
2857 goto retryl2;
2858 anychanged = true;
2859 continue;
2860 } else {
2861 if (!pv_lists_locked) {
2862 pv_lists_locked = true;
2863 if (!rw_try_rlock(&pvh_global_lock)) {
2864 if (anychanged)
2865 pmap_invalidate_all(
2866 pmap);
2867 PMAP_UNLOCK(pmap);
2868 rw_rlock(&pvh_global_lock);
2869 goto resume;
2870 }
2871 }
2872 if (!pmap_demote_l2(pmap, l2, sva)) {
2873 /*
2874 * The large page mapping was destroyed.
2875 */
2876 continue;
2877 }
2878 }
2879 }
2880
2881 if (va_next > eva)
2882 va_next = eva;
2883
2884 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2885 sva += L3_SIZE) {
2886 l3e = pmap_load(l3);
2887 retryl3:
2888 if ((l3e & PTE_V) == 0)
2889 continue;
2890 if ((prot & VM_PROT_WRITE) == 0 &&
2891 (l3e & (PTE_SW_MANAGED | PTE_D)) ==
2892 (PTE_SW_MANAGED | PTE_D)) {
2893 m = PTE_TO_VM_PAGE(l3e);
2894 vm_page_dirty(m);
2895 }
2896 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
2897 goto retryl3;
2898 anychanged = true;
2899 }
2900 }
2901 if (anychanged)
2902 pmap_invalidate_all(pmap);
2903 if (pv_lists_locked)
2904 rw_runlock(&pvh_global_lock);
2905 PMAP_UNLOCK(pmap);
2906 }
2907
2908 int
2909 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
2910 {
2911 pd_entry_t *l2, l2e;
2912 pt_entry_t bits, *pte, oldpte;
2913 int rv;
2914
2915 KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va));
2916
2917 rv = 0;
2918 PMAP_LOCK(pmap);
2919 l2 = pmap_l2(pmap, va);
2920 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
2921 goto done;
2922 if ((l2e & PTE_RWX) == 0) {
2923 pte = pmap_l2_to_l3(l2, va);
2924 if (((oldpte = pmap_load(pte)) & PTE_V) == 0)
2925 goto done;
2926 } else {
2927 pte = l2;
2928 oldpte = l2e;
2929 }
2930
2931 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) ||
2932 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) ||
2933 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) ||
2934 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0))
2935 goto done;
2936
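/*
 * The access itself implies PTE_A; a write additionally implies PTE_D.
 * Setting the bits here handles implementations that do not update them
 * in hardware.
 */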
2937 bits = PTE_A;
2938 if (ftype == VM_PROT_WRITE)
2939 bits |= PTE_D;
2940
2941 /*
2942 * Spurious faults can occur if the implementation caches invalid
2943 * entries in the TLB, or if simultaneous accesses on multiple CPUs
2944 * race with each other.
2945 */
2946 if ((oldpte & bits) != bits)
2947 pmap_store_bits(pte, bits);
2948 sfence_vma();
2949 rv = 1;
2950 done:
2951 PMAP_UNLOCK(pmap);
2952 return (rv);
2953 }
2954
2955 /*
2956 * Demote the specified L1 page to separate L2 pages.
2957 * Currently only used for DMAP entries.
2958 */
2959 static bool
2960 pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va)
2961 {
2962 vm_page_t m;
2963 pt_entry_t *l2, oldl1, newl2;
2964 pd_entry_t newl1;
2965 vm_paddr_t l2phys;
2966
2967 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2968
2969 oldl1 = pmap_load(l1);
2970 KASSERT((oldl1 & PTE_RWX) != 0,
2971 ("pmap_demote_l1: oldl1 is not a leaf PTE"));
2972 KASSERT((oldl1 & PTE_A) != 0,
2973 ("pmap_demote_l1: oldl1 is missing PTE_A"));
2974 KASSERT((oldl1 & (PTE_D | PTE_W)) != PTE_W,
2975 ("pmap_demote_l1: not dirty!"));
2976 KASSERT((oldl1 & PTE_SW_MANAGED) == 0,
2977 ("pmap_demote_l1: L1 table shouldn't be managed"));
2978 KASSERT(VIRT_IN_DMAP(va),
2979 ("pmap_demote_l1: is unsupported for non-DMAP va=%#lx", va));
2980
2981 /* Demoting L1 means we need to allocate a new page-table page. */
2982 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
2983 if (m == NULL) {
2984 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx in pmap %p",
2985 va, pmap);
2986 return (false);
2987 }
2988
2989 l2phys = VM_PAGE_TO_PHYS(m);
2990 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
2991
2992 /*
2993 * Create new entries, relying on the fact that only the low bits
2994 * (index) of the physical address are changing.
2995 */
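/*
 * Each successive entry advances the mapped physical address by L2_SIZE,
 * because the loop index is inserted at the PPN[1] field.
 */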
2996 newl2 = oldl1;
2997 for (int i = 0; i < Ln_ENTRIES; i++)
2998 pmap_store(&l2[i], newl2 | (i << PTE_PPN1_S));
2999
3000 /*
3001 * And update the L1 entry.
3002 *
3003 * NB: flushing the TLB is the responsibility of the caller. Cached
3004 * translations are still "correct" for demoted mappings until some
3005 * subset of the demoted range is modified.
3006 */
3007 newl1 = ((l2phys / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
3008 pmap_store(l1, newl1);
3009
3010 counter_u64_add(pmap_l1_demotions, 1);
3011 CTR2(KTR_PMAP, "pmap_demote_l1: success for va %#lx in pmap %p",
3012 va, pmap);
3013 return (true);
3014 }
3015
3016 static bool
3017 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va)
3018 {
3019 struct rwlock *lock;
3020 bool rv;
3021
3022 lock = NULL;
3023 rv = pmap_demote_l2_locked(pmap, l2, va, &lock);
3024 if (lock != NULL)
3025 rw_wunlock(lock);
3026 return (rv);
3027 }
3028
3029 /*
3030 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page
3031 * mapping is invalidated.
3032 */
3033 static bool
3034 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
3035 struct rwlock **lockp)
3036 {
3037 struct spglist free;
3038 vm_page_t mpte;
3039 pd_entry_t newl2, oldl2;
3040 pt_entry_t *firstl3, newl3;
3041 vm_paddr_t mptepa;
3042 int i;
3043
3044 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3045
3046 oldl2 = pmap_load(l2);
3047 KASSERT((oldl2 & PTE_RWX) != 0,
3048 ("pmap_demote_l2_locked: oldl2 is not a leaf entry"));
3049 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
3050 NULL) {
3051 KASSERT((oldl2 & PTE_SW_WIRED) == 0,
3052 ("pmap_demote_l2_locked: page table page for a wired mapping is missing"));
3053 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj(
3054 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
3055 VM_ALLOC_WIRED)) == NULL) {
3056 SLIST_INIT(&free);
3057 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET,
3058 pmap_load(pmap_l1(pmap, va)), &free, lockp);
3059 vm_page_free_pages_toq(&free, true);
3060 CTR2(KTR_PMAP, "pmap_demote_l2_locked: "
3061 "failure for va %#lx in pmap %p", va, pmap);
3062 return (false);
3063 }
3064 mpte->pindex = pmap_l2_pindex(va);
3065 if (va < VM_MAXUSER_ADDRESS) {
3066 mpte->ref_count = Ln_ENTRIES;
3067 pmap_resident_count_inc(pmap, 1);
3068 }
3069 }
3070 mptepa = VM_PAGE_TO_PHYS(mpte);
3071 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3072 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
3073 KASSERT((oldl2 & PTE_A) != 0,
3074 ("pmap_demote_l2_locked: oldl2 is missing PTE_A"));
3075 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W,
3076 ("pmap_demote_l2_locked: oldl2 is missing PTE_D"));
3077 newl3 = oldl2;
3078
3079 /*
3080 * If the page table page is not leftover from an earlier promotion,
3081 * initialize it.
3082 */
3083 if (!vm_page_all_valid(mpte)) {
3084 for (i = 0; i < Ln_ENTRIES; i++)
3085 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
3086 }
3087 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3),
3088 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical "
3089 "addresses"));
3090
3091 /*
3092 * If the mapping has changed attributes, update the PTEs.
3093 */
3094 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE))
3095 for (i = 0; i < Ln_ENTRIES; i++)
3096 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
3097
3098 /*
3099 * The spare PV entries must be reserved prior to demoting the
3100 * mapping, that is, prior to changing the L2 entry. Otherwise, the
3101 * state of the L2 entry and the PV lists will be inconsistent, which
3102 * can result in reclaim_pv_chunk() attempting to remove a PV entry from
3103 * the wrong PV list and pmap_pv_demote_l2() failing to find the
3104 * expected PV entry for the 2MB page mapping that is being demoted.
3105 */
3106 if ((oldl2 & PTE_SW_MANAGED) != 0)
3107 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
3108
3109 /*
3110 * Demote the mapping.
3111 */
3112 pmap_store(l2, newl2);
3113
3114 /*
3115 * Demote the PV entry.
3116 */
3117 if ((oldl2 & PTE_SW_MANAGED) != 0)
3118 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
3119
3120 atomic_add_long(&pmap_l2_demotions, 1);
3121 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p",
3122 va, pmap);
3123 return (true);
3124 }
3125
3126 #if VM_NRESERVLEVEL > 0
3127 static bool
3128 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3,
3129 struct rwlock **lockp)
3130 {
3131 pt_entry_t all_l3e_PTE_A, *firstl3, firstl3e, *l3, l3e;
3132 vm_paddr_t pa;
3133
3134 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3135 if (!pmap_ps_enabled(pmap))
3136 return (false);
3137
3138 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
3139 ("pmap_promote_l2: invalid l2 entry %p", l2));
3140
3141 /*
3142 * Examine the first L3E in the specified PTP. Abort if this L3E is
3143 * ineligible for promotion or does not map the first 4KB physical page
3144 * within a 2MB page.
3145 */
3146 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
3147 firstl3e = pmap_load(firstl3);
3148 pa = PTE_TO_PHYS(firstl3e);
3149 if ((pa & L2_OFFSET) != 0) {
3150 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3151 va, pmap);
3152 atomic_add_long(&pmap_l2_p_failures, 1);
3153 return (false);
3154 }
3155
3156 /*
3157 * Downgrade a clean, writable mapping to read-only to ensure that the
3158 * hardware does not set PTE_D while we are comparing PTEs.
3159 *
3160 * Upon a write access to a clean mapping, the implementation will
3161 * either atomically check protections and set PTE_D, or raise a page
3162 * fault. In the latter case, the pmap lock provides atomicity. Thus,
3163 * we do not issue an sfence.vma here and instead rely on pmap_fault()
3164 * to do so lazily.
3165 */
3166 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) {
3167 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) {
3168 firstl3e &= ~PTE_W;
3169 break;
3170 }
3171 }
3172
3173 /*
3174 * Examine each of the other PTEs in the specified PTP. Abort if this
3175 * PTE maps an unexpected 4KB physical page or does not have identical
3176 * characteristics to the first PTE.
3177 */
3178 all_l3e_PTE_A = firstl3e & PTE_A;
3179 pa += L2_SIZE - PAGE_SIZE;
3180 for (l3 = firstl3 + Ln_ENTRIES - 1; l3 > firstl3; l3--) {
3181 l3e = pmap_load(l3);
3182 if (PTE_TO_PHYS(l3e) != pa) {
3183 CTR2(KTR_PMAP,
3184 "pmap_promote_l2: failure for va %#lx pmap %p",
3185 va, pmap);
3186 atomic_add_long(&pmap_l2_p_failures, 1);
3187 return (false);
3188 }
3189 while ((l3e & (PTE_W | PTE_D)) == PTE_W) {
3190 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) {
3191 l3e &= ~PTE_W;
3192 break;
3193 }
3194 }
3195 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) {
3196 CTR2(KTR_PMAP,
3197 "pmap_promote_l2: failure for va %#lx pmap %p",
3198 va, pmap);
3199 atomic_add_long(&pmap_l2_p_failures, 1);
3200 return (false);
3201 }
3202 all_l3e_PTE_A &= l3e;
3203 pa -= PAGE_SIZE;
3204 }
3205
3206 /*
3207 * Unless all PTEs have PTE_A set, clear it from the superpage
3208 * mapping, so that promotions triggered by speculative mappings,
3209 * such as pmap_enter_quick(), don't automatically mark the
3210 * underlying pages as referenced.
3211 */
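/*
 * Note that "~PTE_A | all_l3e_PTE_A" evaluates to all ones when every
 * PTE had PTE_A set, leaving firstl3e unchanged in that case.
 */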
3212 firstl3e &= ~PTE_A | all_l3e_PTE_A;
3213
3214 /*
3215 * Save the page table page in its current state until the L2
3216 * mapping the superpage is demoted by pmap_demote_l2() or
3217 * destroyed by pmap_remove_l3().
3218 */
3219 if (ml3 == NULL)
3220 ml3 = PTE_TO_VM_PAGE(pmap_load(l2));
3221 KASSERT(ml3->pindex == pmap_l2_pindex(va),
3222 ("pmap_promote_l2: page table page's pindex is wrong"));
3223 if (pmap_insert_pt_page(pmap, ml3, true, all_l3e_PTE_A != 0)) {
3224 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3225 va, pmap);
3226 atomic_add_long(&pmap_l2_p_failures, 1);
3227 return (false);
3228 }
3229
3230 if ((firstl3e & PTE_SW_MANAGED) != 0)
3231 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp);
3232
3233 pmap_store(l2, firstl3e);
3234
3235 atomic_add_long(&pmap_l2_promotions, 1);
3236 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
3237 pmap);
3238 return (true);
3239 }
3240 #endif
3241
3242 /*
3243 * Insert the given physical page (p) at
3244 * the specified virtual address (v) in the
3245 * target physical map with the protection requested.
3246 *
3247 * If specified, the page will be wired down, meaning
3248 * that the related pte can not be reclaimed.
3249 *
3250 * NB: This is the only routine which MAY NOT lazy-evaluate
3251 * or lose information. That is, this routine must actually
3252 * insert this page into the given map NOW.
3253 */
3254 int
3255 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3256 u_int flags, int8_t psind)
3257 {
3258 struct rwlock *lock;
3259 pd_entry_t *l2, l2e;
3260 pt_entry_t new_l3, orig_l3;
3261 pt_entry_t *l3;
3262 pv_entry_t pv;
3263 vm_paddr_t opa, pa;
3264 vm_page_t mpte, om;
3265 pn_t pn;
3266 int rv;
3267 bool nosleep;
3268
3269 va = trunc_page(va);
3270 if ((m->oflags & VPO_UNMANAGED) == 0)
3271 VM_PAGE_OBJECT_BUSY_ASSERT(m);
3272 pa = VM_PAGE_TO_PHYS(m);
3273 pn = (pa / PAGE_SIZE);
3274
3275 new_l3 = PTE_V | PTE_R | PTE_A;
3276 if (prot & VM_PROT_EXECUTE)
3277 new_l3 |= PTE_X;
3278 if (flags & VM_PROT_WRITE)
3279 new_l3 |= PTE_D;
3280 if (prot & VM_PROT_WRITE)
3281 new_l3 |= PTE_W;
3282 if (va < VM_MAX_USER_ADDRESS)
3283 new_l3 |= PTE_U;
3284
3285 new_l3 |= (pn << PTE_PPN0_S);
3286 if ((flags & PMAP_ENTER_WIRED) != 0)
3287 new_l3 |= PTE_SW_WIRED;
3288 new_l3 |= pmap_memattr_bits(m->md.pv_memattr);
3289
3290 /*
3291 * Set modified bit gratuitously for writeable mappings if
3292 * the page is unmanaged. We do not want to take a fault
3293 * to do the dirty bit accounting for these mappings.
3294 */
3295 if ((m->oflags & VPO_UNMANAGED) != 0) {
3296 if (prot & VM_PROT_WRITE)
3297 new_l3 |= PTE_D;
3298 } else
3299 new_l3 |= PTE_SW_MANAGED;
3300
3301 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
3302
3303 lock = NULL;
3304 mpte = NULL;
3305 rw_rlock(&pvh_global_lock);
3306 PMAP_LOCK(pmap);
3307 if (psind == 1) {
3308 /* Assert the required virtual and physical alignment. */
3309 KASSERT((va & L2_OFFSET) == 0,
3310 ("pmap_enter: va %#lx unaligned", va));
3311 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
3312 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock);
3313 goto out;
3314 }
3315
3316 l2 = pmap_l2(pmap, va);
3317 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
3318 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2,
3319 va, &lock))) {
3320 l3 = pmap_l2_to_l3(l2, va);
3321 if (va < VM_MAXUSER_ADDRESS) {
3322 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3323 mpte->ref_count++;
3324 }
3325 } else if (va < VM_MAXUSER_ADDRESS) {
3326 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
3327 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
3328 if (mpte == NULL && nosleep) {
3329 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
3330 if (lock != NULL)
3331 rw_wunlock(lock);
3332 rw_runlock(&pvh_global_lock);
3333 PMAP_UNLOCK(pmap);
3334 return (KERN_RESOURCE_SHORTAGE);
3335 }
3336 l3 = pmap_l3(pmap, va);
3337 } else {
3338 panic("pmap_enter: missing L3 table for kernel va %#lx", va);
3339 }
3340
3341 orig_l3 = pmap_load(l3);
3342 opa = PTE_TO_PHYS(orig_l3);
3343 pv = NULL;
3344
3345 /*
3346 * Is the specified virtual address already mapped?
3347 */
3348 if ((orig_l3 & PTE_V) != 0) {
3349 /*
3350 * Wiring change, just update stats. We don't worry about
3351 * wiring PT pages as they remain resident as long as there
3352 * are valid mappings in them. Hence, if a user page is wired,
3353 * the PT page will be also.
3354 */
3355 if ((flags & PMAP_ENTER_WIRED) != 0 &&
3356 (orig_l3 & PTE_SW_WIRED) == 0)
3357 pmap->pm_stats.wired_count++;
3358 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
3359 (orig_l3 & PTE_SW_WIRED) != 0)
3360 pmap->pm_stats.wired_count--;
3361
3362 /*
3363 * Remove the extra PT page reference.
3364 */
3365 if (mpte != NULL) {
3366 mpte->ref_count--;
3367 KASSERT(mpte->ref_count > 0,
3368 ("pmap_enter: missing reference to page table page,"
3369 " va: 0x%lx", va));
3370 }
3371
3372 /*
3373 * Has the physical page changed?
3374 */
3375 if (opa == pa) {
3376 /*
3377 * No, might be a protection or wiring change.
3378 */
3379 if ((orig_l3 & PTE_SW_MANAGED) != 0 &&
3380 (new_l3 & PTE_W) != 0)
3381 vm_page_aflag_set(m, PGA_WRITEABLE);
3382 goto validate;
3383 }
3384
3385 /*
3386 * The physical page has changed. Temporarily invalidate
3387 * the mapping. This ensures that all threads sharing the
3388 * pmap keep a consistent view of the mapping, which is
3389 * necessary for the correct handling of COW faults. It
3390 * also permits reuse of the old mapping's PV entry,
3391 * avoiding an allocation.
3392 *
3393 * For consistency, handle unmanaged mappings the same way.
3394 */
3395 orig_l3 = pmap_load_clear(l3);
3396 KASSERT(PTE_TO_PHYS(orig_l3) == opa,
3397 ("pmap_enter: unexpected pa update for %#lx", va));
3398 if ((orig_l3 & PTE_SW_MANAGED) != 0) {
3399 om = PHYS_TO_VM_PAGE(opa);
3400
3401 /*
3402 * The pmap lock is sufficient to synchronize with
3403 * concurrent calls to pmap_page_test_mappings() and
3404 * pmap_ts_referenced().
3405 */
3406 if ((orig_l3 & PTE_D) != 0)
3407 vm_page_dirty(om);
3408 if ((orig_l3 & PTE_A) != 0)
3409 vm_page_aflag_set(om, PGA_REFERENCED);
3410 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3411 pv = pmap_pvh_remove(&om->md, pmap, va);
3412 KASSERT(pv != NULL,
3413 ("pmap_enter: no PV entry for %#lx", va));
3414 if ((new_l3 & PTE_SW_MANAGED) == 0)
3415 free_pv_entry(pmap, pv);
3416 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3417 TAILQ_EMPTY(&om->md.pv_list) &&
3418 ((om->flags & PG_FICTITIOUS) != 0 ||
3419 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3420 vm_page_aflag_clear(om, PGA_WRITEABLE);
3421 }
3422 pmap_invalidate_page(pmap, va);
3423 orig_l3 = 0;
3424 } else {
3425 /*
3426 * Increment the counters.
3427 */
3428 if ((new_l3 & PTE_SW_WIRED) != 0)
3429 pmap->pm_stats.wired_count++;
3430 pmap_resident_count_inc(pmap, 1);
3431 }
3432 /*
3433 * Enter on the PV list if part of our managed memory.
3434 */
3435 if ((new_l3 & PTE_SW_MANAGED) != 0) {
3436 if (pv == NULL) {
3437 pv = get_pv_entry(pmap, &lock);
3438 pv->pv_va = va;
3439 }
3440 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3441 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3442 m->md.pv_gen++;
3443 if ((new_l3 & PTE_W) != 0)
3444 vm_page_aflag_set(m, PGA_WRITEABLE);
3445 }
3446
3447 validate:
3448 /*
3449 * Sync the i-cache on all harts before updating the PTE
3450 * if the new PTE is executable.
3451 */
3452 if (prot & VM_PROT_EXECUTE)
3453 pmap_sync_icache(pmap, va, PAGE_SIZE);
3454
3455 /*
3456 * Update the L3 entry.
3457 */
3458 if (orig_l3 != 0) {
3459 orig_l3 = pmap_load_store(l3, new_l3);
3460 pmap_invalidate_page(pmap, va);
3461 KASSERT(PTE_TO_PHYS(orig_l3) == pa,
3462 ("pmap_enter: invalid update"));
3463 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) ==
3464 (PTE_D | PTE_SW_MANAGED))
3465 vm_page_dirty(m);
3466 } else {
3467 pmap_store(l3, new_l3);
3468 }
3469
3470 #if VM_NRESERVLEVEL > 0
3471 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES &&
3472 (m->flags & PG_FICTITIOUS) == 0 &&
3473 vm_reserv_level_iffullpop(m) == 0)
3474 (void)pmap_promote_l2(pmap, l2, va, mpte, &lock);
3475 #endif
3476
3477 rv = KERN_SUCCESS;
3478 out:
3479 if (lock != NULL)
3480 rw_wunlock(lock);
3481 rw_runlock(&pvh_global_lock);
3482 PMAP_UNLOCK(pmap);
3483 return (rv);
3484 }
3485
3486 /*
3487 * Release a page table page reference after a failed attempt to create a
3488 * mapping.
3489 */
3490 static void
3491 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t l2pg)
3492 {
3493 struct spglist free;
3494
3495 SLIST_INIT(&free);
3496 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) {
3497 /*
3498 * Although "va" is not mapped, paging-structure
3499 * caches could nonetheless have entries that
3500 * refer to the freed page table pages.
3501 * Invalidate those entries.
3502 */
3503 pmap_invalidate_page(pmap, va);
3504 vm_page_free_pages_toq(&free, true);
3505 }
3506 }
3507
3508 /*
3509 * Tries to create a read- and/or execute-only 2MB page mapping. Returns
3510 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
3511 * value. See pmap_enter_l2() for the possible error values when "no sleep",
3512 * "no replace", and "no reclaim" are specified.
3513 */
3514 static int
3515 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3516 struct rwlock **lockp)
3517 {
3518 pd_entry_t new_l2;
3519 pn_t pn;
3520
3521 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3522
3523 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
3524 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V |
3525 pmap_memattr_bits(m->md.pv_memattr));
3526 if ((m->oflags & VPO_UNMANAGED) == 0)
3527 new_l2 |= PTE_SW_MANAGED;
3528 if ((prot & VM_PROT_EXECUTE) != 0)
3529 new_l2 |= PTE_X;
3530 if (va < VM_MAXUSER_ADDRESS)
3531 new_l2 |= PTE_U;
3532 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
3533 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp));
3534 }
3535
3536 /*
3537 * Returns true if every page table entry in the specified page table is
3538 * zero.
3539 */
3540 static bool
3541 pmap_every_pte_zero(vm_paddr_t pa)
3542 {
3543 pt_entry_t *pt_end, *pte;
3544
3545 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
3546 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
3547 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
3548 if (*pte != 0)
3549 return (false);
3550 }
3551 return (true);
3552 }
3553
3554 /*
3555 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if
3556 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
3557 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
3558 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists
3559 * within the 2MB virtual address range starting at the specified virtual
3560 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
3561 * 2MB page mapping already exists at the specified virtual address. Returns
3562 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
3563 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
3564 * and a PV entry allocation failed.
3565 *
3566 * The parameter "m" is only used when creating a managed, writeable mapping.
3567 */
3568 static int
3569 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
3570 vm_page_t m, struct rwlock **lockp)
3571 {
3572 struct spglist free;
3573 pd_entry_t *l2, *l3, oldl2;
3574 vm_offset_t sva;
3575 vm_page_t l2pg, mt;
3576 vm_page_t uwptpg;
3577
3578 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3579
3580 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3581 NULL : lockp)) == NULL) {
3582 CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page"
3583 " for va %#lx in pmap %p", va, pmap);
3584 return (KERN_RESOURCE_SHORTAGE);
3585 }
3586
3587 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
3588 l2 = &l2[pmap_l2_index(va)];
3589 if ((oldl2 = pmap_load(l2)) != 0) {
3590 KASSERT(l2pg->ref_count > 1,
3591 ("pmap_enter_l2: l2pg's ref count is too low"));
3592 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3593 if ((oldl2 & PTE_RWX) != 0) {
3594 l2pg->ref_count--;
3595 CTR2(KTR_PMAP,
3596 "pmap_enter_l2: no space for va %#lx"
3597 " in pmap %p", va, pmap);
3598 return (KERN_NO_SPACE);
3599 } else if (va < VM_MAXUSER_ADDRESS ||
3600 !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) {
3601 l2pg->ref_count--;
3602 CTR2(KTR_PMAP, "pmap_enter_l2:"
3603 " failed to replace existing mapping"
3604 " for va %#lx in pmap %p", va, pmap);
3605 return (KERN_FAILURE);
3606 }
3607 }
3608 SLIST_INIT(&free);
3609 if ((oldl2 & PTE_RWX) != 0)
3610 (void)pmap_remove_l2(pmap, l2, va,
3611 pmap_load(pmap_l1(pmap, va)), &free, lockp);
3612 else
3613 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) {
3614 l3 = pmap_l2_to_l3(l2, sva);
3615 if ((pmap_load(l3) & PTE_V) != 0 &&
3616 pmap_remove_l3(pmap, l3, sva, oldl2, &free,
3617 lockp) != 0)
3618 break;
3619 }
3620 vm_page_free_pages_toq(&free, true);
3621 if (va >= VM_MAXUSER_ADDRESS) {
3622 /*
3623 * Both pmap_remove_l2() and pmap_remove_l3() will
3624 * leave the kernel page table page zero filled.
3625 */
3626 mt = PTE_TO_VM_PAGE(pmap_load(l2));
3627 if (pmap_insert_pt_page(pmap, mt, false, false))
3628 panic("pmap_enter_l2: trie insert failed");
3629 } else
3630 KASSERT(pmap_load(l2) == 0,
3631 ("pmap_enter_l2: non-zero L2 entry %p", l2));
3632 }
3633
3634 /*
3635 * Allocate leaf ptpage for wired userspace pages.
3636 */
3637 uwptpg = NULL;
3638 if ((new_l2 & PTE_SW_WIRED) != 0 && pmap != kernel_pmap) {
3639 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3640 if (uwptpg == NULL) {
3641 pmap_abort_ptp(pmap, va, l2pg);
3642 return (KERN_RESOURCE_SHORTAGE);
3643 }
3644 uwptpg->pindex = pmap_l2_pindex(va);
3645 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
3646 vm_page_unwire_noq(uwptpg);
3647 vm_page_free(uwptpg);
3648 pmap_abort_ptp(pmap, va, l2pg);
3649 return (KERN_RESOURCE_SHORTAGE);
3650 }
3651 pmap_resident_count_inc(pmap, 1);
3652 uwptpg->ref_count = Ln_ENTRIES;
3653 }
3654 if ((new_l2 & PTE_SW_MANAGED) != 0) {
3655 /*
3656 * Abort this mapping if its PV entry could not be created.
3657 */
3658 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
3659 pmap_abort_ptp(pmap, va, l2pg);
3660 if (uwptpg != NULL) {
3661 mt = pmap_remove_pt_page(pmap, va);
3662 KASSERT(mt == uwptpg,
3663 ("removed pt page %p, expected %p", mt,
3664 uwptpg));
3665 pmap_resident_count_dec(pmap, 1);
3666 uwptpg->ref_count = 1;
3667 vm_page_unwire_noq(uwptpg);
3668 vm_page_free(uwptpg);
3669 }
3670 CTR2(KTR_PMAP,
3671 "pmap_enter_l2: failed to create PV entry"
3672 " for va %#lx in pmap %p", va, pmap);
3673 return (KERN_RESOURCE_SHORTAGE);
3674 }
3675 if ((new_l2 & PTE_W) != 0)
3676 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3677 vm_page_aflag_set(mt, PGA_WRITEABLE);
3678 }
3679
3680 /*
3681 * Increment counters.
3682 */
3683 if ((new_l2 & PTE_SW_WIRED) != 0)
3684 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
3685 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
3686
3687 /*
3688 * Map the superpage.
3689 */
3690 pmap_store(l2, new_l2);
3691
3692 atomic_add_long(&pmap_l2_mappings, 1);
3693 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
3694 va, pmap);
3695
3696 return (KERN_SUCCESS);
3697 }
3698
3699 /*
3700 * Maps a sequence of resident pages belonging to the same object.
3701 * The sequence begins with the given page m_start. This page is
3702 * mapped at the given virtual address start. Each subsequent page is
3703 * mapped at a virtual address that is offset from start by the same
3704 * amount as the page is offset from m_start within the object. The
3705 * last page in the sequence is the page with the largest offset from
3706 * m_start that can be mapped at a virtual address less than the given
3707 * virtual address end. Not every virtual page between start and end
3708 * is mapped; only those for which a resident page exists with the
3709 * corresponding offset from m_start are mapped.
3710 */
3711 void
3712 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3713 vm_page_t m_start, vm_prot_t prot)
3714 {
3715 struct pctrie_iter pages;
3716 struct rwlock *lock;
3717 vm_offset_t va;
3718 vm_page_t m, mpte;
3719 int rv;
3720
3721 VM_OBJECT_ASSERT_LOCKED(m_start->object);
3722
3723 mpte = NULL;
3724 vm_page_iter_limit_init(&pages, m_start->object,
3725 m_start->pindex + atop(end - start));
3726 m = vm_radix_iter_lookup(&pages, m_start->pindex);
3727 lock = NULL;
3728 rw_rlock(&pvh_global_lock);
3729 PMAP_LOCK(pmap);
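	/*
	 * For each resident page, prefer a 2MB superpage mapping when the
	 * virtual address is superpage aligned, the whole superpage fits
	 * below "end", the page starts a fully populated superpage run
	 * (psind == 1), and superpages are enabled; otherwise fall back
	 * to a 4KB mapping via pmap_enter_quick_locked().
	 */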
3730 while (m != NULL) {
3731 va = start + ptoa(m->pindex - m_start->pindex);
3732 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
3733 m->psind == 1 && pmap_ps_enabled(pmap) &&
3734 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
3735 KERN_SUCCESS || rv == KERN_NO_SPACE)) {
3736 m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE);
3737 } else {
3738 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
3739 &lock);
3740 m = vm_radix_iter_step(&pages);
3741 }
3742 }
3743 if (lock != NULL)
3744 rw_wunlock(lock);
3745 rw_runlock(&pvh_global_lock);
3746 PMAP_UNLOCK(pmap);
3747 }
3748
3749 /*
3750  * This code makes some *MAJOR* assumptions:
3751  * 1. The current pmap and the given pmap exist.
3752  * 2. The mapping is not wired.
3753  * 3. Read access only.
3754  * 4. No page table pages.
3755  * It is, however, *MUCH* faster than pmap_enter...
3756 */
3757
3758 void
3759 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3760 {
3761 struct rwlock *lock;
3762
3763 lock = NULL;
3764 rw_rlock(&pvh_global_lock);
3765 PMAP_LOCK(pmap);
3766 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3767 if (lock != NULL)
3768 rw_wunlock(lock);
3769 rw_runlock(&pvh_global_lock);
3770 PMAP_UNLOCK(pmap);
3771 }
3772
3773 static vm_page_t
3774 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3775 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3776 {
3777 struct spglist free;
3778 pd_entry_t *l2;
3779 pt_entry_t *l3, newl3;
3780
3781 KASSERT(!VA_IS_CLEANMAP(va) ||
3782 (m->oflags & VPO_UNMANAGED) != 0,
3783 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3784 rw_assert(&pvh_global_lock, RA_LOCKED);
3785 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3786 l2 = NULL;
3787
3788 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3789 /*
3790 * In the case that a page table page is not
3791 * resident, we are creating it here.
3792 */
3793 if (va < VM_MAXUSER_ADDRESS) {
3794 vm_pindex_t l2pindex;
3795
3796 /*
3797 * Calculate pagetable page index
3798 */
3799 l2pindex = pmap_l2_pindex(va);
3800 if (mpte && (mpte->pindex == l2pindex)) {
3801 mpte->ref_count++;
3802 } else {
3803 /*
3804 * Get the l2 entry
3805 */
3806 l2 = pmap_l2(pmap, va);
3807
3808 /*
3809 * If the page table page is mapped, we just increment
3810 * the hold count, and activate it. Otherwise, we
3811 * attempt to allocate a page table page. If this
3812 * attempt fails, we don't retry. Instead, we give up.
3813 */
3814 if (l2 != NULL && pmap_load(l2) != 0) {
3815 if ((pmap_load(l2) & PTE_RWX) != 0)
3816 return (NULL);
3817 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3818 mpte->ref_count++;
3819 } else {
3820 /*
3821 * Pass NULL instead of the PV list lock
3822 * pointer, because we don't intend to sleep.
3823 */
3824 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3825 if (mpte == NULL)
3826 return (mpte);
3827 }
3828 }
3829 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3830 l3 = &l3[pmap_l3_index(va)];
3831 } else {
3832 mpte = NULL;
3833 l3 = pmap_l3(kernel_pmap, va);
3834 }
3835 if (l3 == NULL)
3836 panic("pmap_enter_quick_locked: No l3");
3837 if (pmap_load(l3) != 0) {
3838 if (mpte != NULL)
3839 mpte->ref_count--;
3840 return (NULL);
3841 }
3842
3843 /*
3844 * Enter on the PV list if part of our managed memory.
3845 */
3846 if ((m->oflags & VPO_UNMANAGED) == 0 &&
3847 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3848 if (mpte != NULL) {
3849 SLIST_INIT(&free);
3850 if (pmap_unwire_ptp(pmap, va, mpte, &free))
3851 vm_page_free_pages_toq(&free, false);
3852 }
3853 return (NULL);
3854 }
3855
3856 /*
3857 * Increment counters
3858 */
3859 pmap_resident_count_inc(pmap, 1);
3860
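	/*
	 * Construct the L3 PTE: valid and readable, with the execute,
	 * managed, and user bits added as appropriate.  The mapping
	 * created here is never writeable.
	 */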
3861 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
3862 PTE_V | PTE_R | pmap_memattr_bits(m->md.pv_memattr);
3863 if ((prot & VM_PROT_EXECUTE) != 0)
3864 newl3 |= PTE_X;
3865 if ((m->oflags & VPO_UNMANAGED) == 0)
3866 newl3 |= PTE_SW_MANAGED;
3867 if (va < VM_MAX_USER_ADDRESS)
3868 newl3 |= PTE_U;
3869
3870 /*
3871 * Sync the i-cache on all harts before updating the PTE
3872 * if the new PTE is executable.
3873 */
3874 if (prot & VM_PROT_EXECUTE)
3875 pmap_sync_icache(pmap, va, PAGE_SIZE);
3876
3877 pmap_store(l3, newl3);
3878
3879 #if VM_NRESERVLEVEL > 0
3880 /*
3881 * If both the PTP and the reservation are fully populated, then attempt
3882 * promotion.
3883 */
3884 if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
3885 (mpte == NULL || mpte->ref_count == Ln_ENTRIES) &&
3886 (m->flags & PG_FICTITIOUS) == 0 &&
3887 vm_reserv_level_iffullpop(m) == 0) {
3888 if (l2 == NULL)
3889 l2 = pmap_l2(pmap, va);
3890
3891 /*
3892 * If promotion succeeds, then the next call to this function
3893 * should not be given the unmapped PTP as a hint.
3894 */
3895 if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
3896 mpte = NULL;
3897 }
3898 #endif
3899
3900 return (mpte);
3901 }
3902
3903 /*
3904 * This code maps large physical mmap regions into the
3905 * processor address space. Note that some shortcuts
3906 * are taken, but the code works.
3907 */
3908 void
3909 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3910 vm_pindex_t pindex, vm_size_t size)
3911 {
3912
3913 VM_OBJECT_ASSERT_WLOCKED(object);
3914 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3915 ("pmap_object_init_pt: non-device object"));
3916 }
3917
3918 /*
3919 * Clear the wired attribute from the mappings for the specified range of
3920 * addresses in the given pmap. Every valid mapping within that range
3921 * must have the wired attribute set. In contrast, invalid mappings
3922 * cannot have the wired attribute set, so they are ignored.
3923 *
3924 * The wired attribute of the page table entry is not a hardware feature,
3925 * so there is no need to invalidate any TLB entries.
3926 */
3927 void
3928 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3929 {
3930 vm_offset_t va_next;
3931 pd_entry_t *l0, *l1, *l2, l2e;
3932 pt_entry_t *l3, l3e;
3933 bool pv_lists_locked;
3934
3935 pv_lists_locked = false;
3936 retry:
3937 PMAP_LOCK(pmap);
3938 for (; sva < eva; sva = va_next) {
3939 if (pmap_mode == PMAP_MODE_SV48) {
3940 l0 = pmap_l0(pmap, sva);
3941 if (pmap_load(l0) == 0) {
3942 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3943 if (va_next < sva)
3944 va_next = eva;
3945 continue;
3946 }
3947 l1 = pmap_l0_to_l1(l0, sva);
3948 } else {
3949 l1 = pmap_l1(pmap, sva);
3950 }
3951
3952 if (pmap_load(l1) == 0) {
3953 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3954 if (va_next < sva)
3955 va_next = eva;
3956 continue;
3957 }
3958
3959 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3960 if (va_next < sva)
3961 va_next = eva;
3962
3963 l2 = pmap_l1_to_l2(l1, sva);
3964 if ((l2e = pmap_load(l2)) == 0)
3965 continue;
3966 if ((l2e & PTE_RWX) != 0) {
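			/*
			 * If the 2MB mapping is wholly contained within the
			 * given range, clear its wired bit directly.
			 * Otherwise, demote it to 4KB mappings so that only
			 * the pages within the range are unwired below.
			 */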
3967 if (sva + L2_SIZE == va_next && eva >= va_next) {
3968 if ((l2e & PTE_SW_WIRED) == 0)
3969 panic("pmap_unwire: l2 %#jx is missing "
3970 "PTE_SW_WIRED", (uintmax_t)l2e);
3971 pmap_clear_bits(l2, PTE_SW_WIRED);
3972 continue;
3973 } else {
3974 if (!pv_lists_locked) {
3975 pv_lists_locked = true;
3976 if (!rw_try_rlock(&pvh_global_lock)) {
3977 PMAP_UNLOCK(pmap);
3978 rw_rlock(&pvh_global_lock);
3979 /* Repeat sva. */
3980 goto retry;
3981 }
3982 }
3983 if (!pmap_demote_l2(pmap, l2, sva))
3984 panic("pmap_unwire: demotion failed");
3985 }
3986 }
3987
3988 if (va_next > eva)
3989 va_next = eva;
3990 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3991 sva += L3_SIZE) {
3992 if ((l3e = pmap_load(l3)) == 0)
3993 continue;
3994 if ((l3e & PTE_SW_WIRED) == 0)
3995 panic("pmap_unwire: l3 %#jx is missing "
3996 "PTE_SW_WIRED", (uintmax_t)l3e);
3997
3998 /*
3999			 * PTE_SW_WIRED must be cleared atomically.  Although the
4000			 * pmap lock synchronizes access to it, another processor
4001			 * could be setting PTE_D and/or PTE_A concurrently.
4002 */
4003 pmap_clear_bits(l3, PTE_SW_WIRED);
4004 pmap->pm_stats.wired_count--;
4005 }
4006 }
4007 if (pv_lists_locked)
4008 rw_runlock(&pvh_global_lock);
4009 PMAP_UNLOCK(pmap);
4010 }
4011
4012 /*
4013 * Copy the range specified by src_addr/len
4014 * from the source map to the range dst_addr/len
4015 * in the destination map.
4016 *
4017 * This routine is only advisory and need not do anything.
4018 */
4019
4020 void
4021 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4022 vm_offset_t src_addr)
4023 {
4024
4025 }
4026
4027 /*
4028 * pmap_zero_page zeros the specified hardware page by mapping
4029 * the page into KVM and using bzero to clear its contents.
4030 */
4031 void
4032 pmap_zero_page(vm_page_t m)
4033 {
4034 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4035
4036 pagezero((void *)va);
4037 }
4038
4039 /*
4040  * pmap_zero_page_area zeros the specified area of a hardware page by
4041  * mapping the page into KVM and using bzero to clear the given range.
4042 *
4043 * off and size may not cover an area beyond a single hardware page.
4044 */
4045 void
4046 pmap_zero_page_area(vm_page_t m, int off, int size)
4047 {
4048 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4049
4050 if (off == 0 && size == PAGE_SIZE)
4051 pagezero((void *)va);
4052 else
4053 bzero((char *)va + off, size);
4054 }
4055
4056 /*
4057 * pmap_copy_page copies the specified (machine independent)
4058 * page by mapping the page into virtual memory and using
4059 * bcopy to copy the page, one machine dependent page at a
4060 * time.
4061 */
4062 void
4063 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4064 {
4065 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4066 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4067
4068 pagecopy((void *)src, (void *)dst);
4069 }
4070
4071 int unmapped_buf_allowed = 1;
4072
4073 void
4074 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4075 vm_offset_t b_offset, int xfersize)
4076 {
4077 void *a_cp, *b_cp;
4078 vm_page_t m_a, m_b;
4079 vm_paddr_t p_a, p_b;
4080 vm_offset_t a_pg_offset, b_pg_offset;
4081 int cnt;
4082
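	/*
	 * Copy "xfersize" bytes in chunks that do not cross a page
	 * boundary in either the source or the destination, addressing
	 * each page through the direct map.
	 */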
4083 while (xfersize > 0) {
4084 a_pg_offset = a_offset & PAGE_MASK;
4085 m_a = ma[a_offset >> PAGE_SHIFT];
4086 p_a = m_a->phys_addr;
4087 b_pg_offset = b_offset & PAGE_MASK;
4088 m_b = mb[b_offset >> PAGE_SHIFT];
4089 p_b = m_b->phys_addr;
4090 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4091 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4092 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
4093 panic("!DMAP a %lx", p_a);
4094 } else {
4095 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
4096 }
4097 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
4098 panic("!DMAP b %lx", p_b);
4099 } else {
4100 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
4101 }
4102 bcopy(a_cp, b_cp, cnt);
4103 a_offset += cnt;
4104 b_offset += cnt;
4105 xfersize -= cnt;
4106 }
4107 }
4108
4109 vm_offset_t
4110 pmap_quick_enter_page(vm_page_t m)
4111 {
4112
4113 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
4114 }
4115
4116 void
4117 pmap_quick_remove_page(vm_offset_t addr)
4118 {
4119 }
4120
4121 /*
4122 * Returns true if the pmap's pv is one of the first
4123 * 16 pvs linked to from this page. This count may
4124 * be changed upwards or downwards in the future; it
4125 * is only necessary that true be returned for a small
4126 * subset of pmaps for proper page aging.
4127 */
4128 bool
4129 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4130 {
4131 struct md_page *pvh;
4132 struct rwlock *lock;
4133 pv_entry_t pv;
4134 int loops = 0;
4135 bool rv;
4136
4137 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4138 ("pmap_page_exists_quick: page %p is not managed", m));
4139 rv = false;
4140 rw_rlock(&pvh_global_lock);
4141 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4142 rw_rlock(lock);
4143 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4144 if (PV_PMAP(pv) == pmap) {
4145 rv = true;
4146 break;
4147 }
4148 loops++;
4149 if (loops >= 16)
4150 break;
4151 }
4152 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4153 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4154 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4155 if (PV_PMAP(pv) == pmap) {
4156 rv = true;
4157 break;
4158 }
4159 loops++;
4160 if (loops >= 16)
4161 break;
4162 }
4163 }
4164 rw_runlock(lock);
4165 rw_runlock(&pvh_global_lock);
4166 return (rv);
4167 }
4168
4169 /*
4170 * pmap_page_wired_mappings:
4171 *
4172 * Return the number of managed mappings to the given physical page
4173 * that are wired.
4174 */
4175 int
4176 pmap_page_wired_mappings(vm_page_t m)
4177 {
4178 struct md_page *pvh;
4179 struct rwlock *lock;
4180 pmap_t pmap;
4181 pd_entry_t *l2;
4182 pt_entry_t *l3;
4183 pv_entry_t pv;
4184 int count, md_gen, pvh_gen;
4185
4186 if ((m->oflags & VPO_UNMANAGED) != 0)
4187 return (0);
4188 rw_rlock(&pvh_global_lock);
4189 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4190 rw_rlock(lock);
4191 restart:
4192 count = 0;
4193 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4194 pmap = PV_PMAP(pv);
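		/*
		 * To preserve lock ordering, only try-lock the pmap while
		 * the PV list lock is held.  On failure, drop the PV list
		 * lock, block on the pmap lock, and use the generation
		 * count to detect concurrent changes to the PV list.
		 */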
4195 if (!PMAP_TRYLOCK(pmap)) {
4196 md_gen = m->md.pv_gen;
4197 rw_runlock(lock);
4198 PMAP_LOCK(pmap);
4199 rw_rlock(lock);
4200 if (md_gen != m->md.pv_gen) {
4201 PMAP_UNLOCK(pmap);
4202 goto restart;
4203 }
4204 }
4205 l2 = pmap_l2(pmap, pv->pv_va);
4206 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4207 ("%s: found a 2mpage in page %p's pv list", __func__, m));
4208 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4209 if ((pmap_load(l3) & PTE_SW_WIRED) != 0)
4210 count++;
4211 PMAP_UNLOCK(pmap);
4212 }
4213 if ((m->flags & PG_FICTITIOUS) == 0) {
4214 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4215 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4216 pmap = PV_PMAP(pv);
4217 if (!PMAP_TRYLOCK(pmap)) {
4218 md_gen = m->md.pv_gen;
4219 pvh_gen = pvh->pv_gen;
4220 rw_runlock(lock);
4221 PMAP_LOCK(pmap);
4222 rw_rlock(lock);
4223 if (md_gen != m->md.pv_gen ||
4224 pvh_gen != pvh->pv_gen) {
4225 PMAP_UNLOCK(pmap);
4226 goto restart;
4227 }
4228 }
4229 l2 = pmap_l2(pmap, pv->pv_va);
4230 if ((pmap_load(l2) & PTE_SW_WIRED) != 0)
4231 count++;
4232 PMAP_UNLOCK(pmap);
4233 }
4234 }
4235 rw_runlock(lock);
4236 rw_runlock(&pvh_global_lock);
4237 return (count);
4238 }
4239
4240 /*
4241 * Returns true if the given page is mapped individually or as part of
4242 * a 2mpage. Otherwise, returns false.
4243 */
4244 bool
4245 pmap_page_is_mapped(vm_page_t m)
4246 {
4247 struct rwlock *lock;
4248 bool rv;
4249
4250 if ((m->oflags & VPO_UNMANAGED) != 0)
4251 return (false);
4252 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4253 rw_rlock(lock);
4254 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4255 ((m->flags & PG_FICTITIOUS) == 0 &&
4256 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4257 rw_runlock(lock);
4258 return (rv);
4259 }
4260
4261 static void
4262 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv,
4263 struct spglist *free, bool superpage)
4264 {
4265 struct md_page *pvh;
4266 vm_page_t mpte, mt;
4267
4268 if (superpage) {
4269 pmap_resident_count_dec(pmap, Ln_ENTRIES);
4270 pvh = pa_to_pvh(m->phys_addr);
4271 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4272 pvh->pv_gen++;
4273 if (TAILQ_EMPTY(&pvh->pv_list)) {
4274 for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
4275 if (TAILQ_EMPTY(&mt->md.pv_list) &&
4276 (mt->a.flags & PGA_WRITEABLE) != 0)
4277 vm_page_aflag_clear(mt, PGA_WRITEABLE);
4278 }
4279 mpte = pmap_remove_pt_page(pmap, pv->pv_va);
4280 if (mpte != NULL) {
4281 KASSERT(vm_page_any_valid(mpte),
4282 ("pmap_remove_pages: pte page not promoted"));
4283 pmap_resident_count_dec(pmap, 1);
4284 KASSERT(mpte->ref_count == Ln_ENTRIES,
4285 ("pmap_remove_pages: pte page ref count error"));
4286 mpte->ref_count = 0;
4287 pmap_add_delayed_free_list(mpte, free, false);
4288 }
4289 } else {
4290 pmap_resident_count_dec(pmap, 1);
4291 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4292 m->md.pv_gen++;
4293 if (TAILQ_EMPTY(&m->md.pv_list) &&
4294 (m->a.flags & PGA_WRITEABLE) != 0) {
4295 pvh = pa_to_pvh(m->phys_addr);
4296 if (TAILQ_EMPTY(&pvh->pv_list))
4297 vm_page_aflag_clear(m, PGA_WRITEABLE);
4298 }
4299 }
4300 }
4301
4302 /*
4303 * Destroy all managed, non-wired mappings in the given user-space
4304 * pmap. This pmap cannot be active on any processor besides the
4305 * caller.
4306 *
4307 * This function cannot be applied to the kernel pmap. Moreover, it
4308 * is not intended for general use. It is only to be used during
4309 * process termination. Consequently, it can be implemented in ways
4310 * that make it faster than pmap_remove(). First, it can more quickly
4311 * destroy mappings by iterating over the pmap's collection of PV
4312 * entries, rather than searching the page table. Second, it doesn't
4313 * have to test and clear the page table entries atomically, because
4314 * no processor is currently accessing the user address space. In
4315 * particular, a page table entry's dirty bit won't change state once
4316 * this function starts.
4317 */
4318 void
4319 pmap_remove_pages(pmap_t pmap)
4320 {
4321 struct spglist free;
4322 pd_entry_t ptepde;
4323 pt_entry_t *pte, tpte;
4324 vm_page_t m, mt;
4325 pv_entry_t pv;
4326 struct pv_chunk *pc, *npc;
4327 struct rwlock *lock;
4328 int64_t bit;
4329 uint64_t inuse, bitmask;
4330 int allfree, field, freed __pv_stat_used, idx;
4331 bool superpage;
4332
4333 lock = NULL;
4334
4335 SLIST_INIT(&free);
4336 rw_rlock(&pvh_global_lock);
4337 PMAP_LOCK(pmap);
4338 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4339 allfree = 1;
4340 freed = 0;
4341 for (field = 0; field < _NPCM; field++) {
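			/*
			 * Compute the subset of this bitmap word that
			 * corresponds to allocated PV entries and visit
			 * each one in turn.
			 */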
4342 inuse = ~pc->pc_map[field] & pc_freemask[field];
4343 while (inuse != 0) {
4344 bit = ffsl(inuse) - 1;
4345 bitmask = 1UL << bit;
4346 idx = field * 64 + bit;
4347 pv = &pc->pc_pventry[idx];
4348 inuse &= ~bitmask;
4349
4350 pte = pmap_l1(pmap, pv->pv_va);
4351 ptepde = pmap_load(pte);
4352 pte = pmap_l1_to_l2(pte, pv->pv_va);
4353 tpte = pmap_load(pte);
4354
4355 KASSERT((tpte & PTE_V) != 0,
4356 ("L2 PTE is invalid... bogus PV entry? "
4357 "va=%#lx, pte=%#lx", pv->pv_va, tpte));
4358 if ((tpte & PTE_RWX) != 0) {
4359 superpage = true;
4360 } else {
4361 ptepde = tpte;
4362 pte = pmap_l2_to_l3(pte, pv->pv_va);
4363 tpte = pmap_load(pte);
4364 superpage = false;
4365 }
4366
4367 /*
4368 * We cannot remove wired pages from a
4369 * process' mapping at this time.
4370 */
4371 if (tpte & PTE_SW_WIRED) {
4372 allfree = 0;
4373 continue;
4374 }
4375
4376 m = PTE_TO_VM_PAGE(tpte);
4377 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4378 m < &vm_page_array[vm_page_array_size],
4379 ("pmap_remove_pages: bad pte %#jx",
4380 (uintmax_t)tpte));
4381
4382 pmap_clear(pte);
4383
4384 /*
4385 * Update the vm_page_t clean/reference bits.
4386 */
4387 if ((tpte & (PTE_D | PTE_W)) ==
4388 (PTE_D | PTE_W)) {
4389 if (superpage)
4390 for (mt = m;
4391 mt < &m[Ln_ENTRIES]; mt++)
4392 vm_page_dirty(mt);
4393 else
4394 vm_page_dirty(m);
4395 }
4396
4397 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
4398
4399 /* Mark free */
4400 pc->pc_map[field] |= bitmask;
4401
4402 pmap_remove_pages_pv(pmap, m, pv, &free,
4403 superpage);
4404 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
4405 freed++;
4406 }
4407 }
4408 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
4409 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
4410 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
4411 if (allfree) {
4412 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4413 free_pv_chunk(pc);
4414 }
4415 }
4416 if (lock != NULL)
4417 rw_wunlock(lock);
4418 pmap_invalidate_all(pmap);
4419 rw_runlock(&pvh_global_lock);
4420 PMAP_UNLOCK(pmap);
4421 vm_page_free_pages_toq(&free, false);
4422 }
4423
4424 static bool
4425 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
4426 {
4427 struct md_page *pvh;
4428 struct rwlock *lock;
4429 pd_entry_t *l2;
4430 pt_entry_t *l3, mask;
4431 pv_entry_t pv;
4432 pmap_t pmap;
4433 int md_gen, pvh_gen;
4434 bool rv;
4435
4436 mask = 0;
4437 if (modified)
4438 mask |= PTE_D;
4439 if (accessed)
4440 mask |= PTE_A;
4441
4442 rv = false;
4443 rw_rlock(&pvh_global_lock);
4444 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4445 rw_rlock(lock);
4446 restart:
4447 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4448 pmap = PV_PMAP(pv);
4449 if (!PMAP_TRYLOCK(pmap)) {
4450 md_gen = m->md.pv_gen;
4451 rw_runlock(lock);
4452 PMAP_LOCK(pmap);
4453 rw_rlock(lock);
4454 if (md_gen != m->md.pv_gen) {
4455 PMAP_UNLOCK(pmap);
4456 goto restart;
4457 }
4458 }
4459 l2 = pmap_l2(pmap, pv->pv_va);
4460 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4461 ("%s: found a 2mpage in page %p's pv list", __func__, m));
4462 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4463 rv = (pmap_load(l3) & mask) == mask;
4464 PMAP_UNLOCK(pmap);
4465 if (rv)
4466 goto out;
4467 }
4468 if ((m->flags & PG_FICTITIOUS) == 0) {
4469 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4470 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4471 pmap = PV_PMAP(pv);
4472 if (!PMAP_TRYLOCK(pmap)) {
4473 md_gen = m->md.pv_gen;
4474 pvh_gen = pvh->pv_gen;
4475 rw_runlock(lock);
4476 PMAP_LOCK(pmap);
4477 rw_rlock(lock);
4478 if (md_gen != m->md.pv_gen ||
4479 pvh_gen != pvh->pv_gen) {
4480 PMAP_UNLOCK(pmap);
4481 goto restart;
4482 }
4483 }
4484 l2 = pmap_l2(pmap, pv->pv_va);
4485 rv = (pmap_load(l2) & mask) == mask;
4486 PMAP_UNLOCK(pmap);
4487 if (rv)
4488 goto out;
4489 }
4490 }
4491 out:
4492 rw_runlock(lock);
4493 rw_runlock(&pvh_global_lock);
4494 return (rv);
4495 }
4496
4497 /*
4498 * pmap_is_modified:
4499 *
4500 * Return whether or not the specified physical page was modified
4501 * in any physical maps.
4502 */
4503 bool
4504 pmap_is_modified(vm_page_t m)
4505 {
4506
4507 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4508 ("pmap_is_modified: page %p is not managed", m));
4509
4510 /*
4511 * If the page is not busied then this check is racy.
4512 */
4513 if (!pmap_page_is_write_mapped(m))
4514 return (false);
4515 return (pmap_page_test_mappings(m, false, true));
4516 }
4517
4518 /*
4519 * pmap_is_prefaultable:
4520 *
4521 * Return whether or not the specified virtual address is eligible
4522 * for prefault.
4523 */
4524 bool
4525 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4526 {
4527 pt_entry_t *l3;
4528 bool rv;
4529
4530 /*
4531 * Return true if and only if the L3 entry for the specified virtual
4532 * address is allocated but invalid.
4533 */
4534 rv = false;
4535 PMAP_LOCK(pmap);
4536 l3 = pmap_l3(pmap, addr);
4537 if (l3 != NULL && pmap_load(l3) == 0) {
4538 rv = true;
4539 }
4540 PMAP_UNLOCK(pmap);
4541 return (rv);
4542 }
4543
4544 /*
4545 * pmap_is_referenced:
4546 *
4547 * Return whether or not the specified physical page was referenced
4548 * in any physical maps.
4549 */
4550 bool
4551 pmap_is_referenced(vm_page_t m)
4552 {
4553
4554 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4555 ("pmap_is_referenced: page %p is not managed", m));
4556 return (pmap_page_test_mappings(m, true, false));
4557 }
4558
4559 /*
4560 * Clear the write and modified bits in each of the given page's mappings.
4561 */
4562 void
4563 pmap_remove_write(vm_page_t m)
4564 {
4565 struct md_page *pvh;
4566 struct rwlock *lock;
4567 pmap_t pmap;
4568 pd_entry_t *l2;
4569 pt_entry_t *l3, oldl3, newl3;
4570 pv_entry_t next_pv, pv;
4571 vm_offset_t va;
4572 int md_gen, pvh_gen;
4573
4574 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4575 ("pmap_remove_write: page %p is not managed", m));
4576 vm_page_assert_busied(m);
4577
4578 if (!pmap_page_is_write_mapped(m))
4579 return;
4580 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4581 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4582 pa_to_pvh(VM_PAGE_TO_PHYS(m));
4583 rw_rlock(&pvh_global_lock);
4584 retry_pv_loop:
4585 rw_wlock(lock);
4586 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4587 pmap = PV_PMAP(pv);
4588 if (!PMAP_TRYLOCK(pmap)) {
4589 pvh_gen = pvh->pv_gen;
4590 rw_wunlock(lock);
4591 PMAP_LOCK(pmap);
4592 rw_wlock(lock);
4593 if (pvh_gen != pvh->pv_gen) {
4594 PMAP_UNLOCK(pmap);
4595 rw_wunlock(lock);
4596 goto retry_pv_loop;
4597 }
4598 }
4599 va = pv->pv_va;
4600 l2 = pmap_l2(pmap, va);
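		/*
		 * If the 2MB mapping is writeable, demote it so that write
		 * access can be removed from the constituent 4KB mappings
		 * in the loop below.
		 */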
4601 if ((pmap_load(l2) & PTE_W) != 0)
4602 (void)pmap_demote_l2_locked(pmap, l2, va, &lock);
4603 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4604 ("inconsistent pv lock %p %p for page %p",
4605 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4606 PMAP_UNLOCK(pmap);
4607 }
4608 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4609 pmap = PV_PMAP(pv);
4610 if (!PMAP_TRYLOCK(pmap)) {
4611 pvh_gen = pvh->pv_gen;
4612 md_gen = m->md.pv_gen;
4613 rw_wunlock(lock);
4614 PMAP_LOCK(pmap);
4615 rw_wlock(lock);
4616 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4617 PMAP_UNLOCK(pmap);
4618 rw_wunlock(lock);
4619 goto retry_pv_loop;
4620 }
4621 }
4622 l2 = pmap_l2(pmap, pv->pv_va);
4623 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4624 ("%s: found a 2mpage in page %p's pv list", __func__, m));
4625 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4626 oldl3 = pmap_load(l3);
4627 retry:
4628 if ((oldl3 & PTE_W) != 0) {
4629 newl3 = oldl3 & ~(PTE_D | PTE_W);
4630 if (!atomic_fcmpset_long(l3, &oldl3, newl3))
4631 goto retry;
4632 if ((oldl3 & PTE_D) != 0)
4633 vm_page_dirty(m);
4634 pmap_invalidate_page(pmap, pv->pv_va);
4635 }
4636 PMAP_UNLOCK(pmap);
4637 }
4638 rw_wunlock(lock);
4639 vm_page_aflag_clear(m, PGA_WRITEABLE);
4640 rw_runlock(&pvh_global_lock);
4641 }
4642
4643 /*
4644 * pmap_ts_referenced:
4645 *
4646 * Return a count of reference bits for a page, clearing those bits.
4647 * It is not necessary for every reference bit to be cleared, but it
4648 * is necessary that 0 only be returned when there are truly no
4649 * reference bits set.
4650 *
4651 * As an optimization, update the page's dirty field if a modified bit is
4652 * found while counting reference bits. This opportunistic update can be
4653 * performed at low cost and can eliminate the need for some future calls
4654 * to pmap_is_modified(). However, since this function stops after
4655 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4656 * dirty pages. Those dirty pages will only be detected by a future call
4657 * to pmap_is_modified().
4658 */
4659 int
4660 pmap_ts_referenced(vm_page_t m)
4661 {
4662 struct spglist free;
4663 struct md_page *pvh;
4664 struct rwlock *lock;
4665 pv_entry_t pv, pvf;
4666 pmap_t pmap;
4667 pd_entry_t *l2, l2e;
4668 pt_entry_t *l3, l3e;
4669 vm_paddr_t pa;
4670 vm_offset_t va;
4671 int cleared, md_gen, not_cleared, pvh_gen;
4672
4673 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4674 ("pmap_ts_referenced: page %p is not managed", m));
4675 SLIST_INIT(&free);
4676 cleared = 0;
4677 pa = VM_PAGE_TO_PHYS(m);
4678 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4679
4680 lock = PHYS_TO_PV_LIST_LOCK(pa);
4681 rw_rlock(&pvh_global_lock);
4682 rw_wlock(lock);
4683 retry:
4684 not_cleared = 0;
4685 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4686 goto small_mappings;
4687 pv = pvf;
4688 do {
4689 pmap = PV_PMAP(pv);
4690 if (!PMAP_TRYLOCK(pmap)) {
4691 pvh_gen = pvh->pv_gen;
4692 rw_wunlock(lock);
4693 PMAP_LOCK(pmap);
4694 rw_wlock(lock);
4695 if (pvh_gen != pvh->pv_gen) {
4696 PMAP_UNLOCK(pmap);
4697 goto retry;
4698 }
4699 }
4700 va = pv->pv_va;
4701 l2 = pmap_l2(pmap, va);
4702 l2e = pmap_load(l2);
4703 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
4704 /*
4705 * Although l2e is mapping a 2MB page, because
4706 * this function is called at a 4KB page granularity,
4707 * we only update the 4KB page under test.
4708 */
4709 vm_page_dirty(m);
4710 }
4711 if ((l2e & PTE_A) != 0) {
4712 /*
4713 * Since this reference bit is shared by 512 4KB
4714 * pages, it should not be cleared every time it is
4715 * tested. Apply a simple "hash" function on the
4716 * physical page number, the virtual superpage number,
4717 * and the pmap address to select one 4KB page out of
4718 * the 512 on which testing the reference bit will
4719 * result in clearing that reference bit. This
4720 * function is designed to avoid the selection of the
4721 * same 4KB page for every 2MB page mapping.
4722 *
4723 * On demotion, a mapping that hasn't been referenced
4724 * is simply destroyed. To avoid the possibility of a
4725 * subsequent page fault on a demoted wired mapping,
4726 * always leave its reference bit set. Moreover,
4727 * since the superpage is wired, the current state of
4728 * its reference bit won't affect page replacement.
4729 */
4730 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4731 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4732 (l2e & PTE_SW_WIRED) == 0) {
4733 pmap_clear_bits(l2, PTE_A);
4734 pmap_invalidate_page(pmap, va);
4735 cleared++;
4736 } else
4737 not_cleared++;
4738 }
4739 PMAP_UNLOCK(pmap);
4740 /* Rotate the PV list if it has more than one entry. */
4741 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4742 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4743 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4744 pvh->pv_gen++;
4745 }
4746 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4747 goto out;
4748 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4749 small_mappings:
4750 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4751 goto out;
4752 pv = pvf;
4753 do {
4754 pmap = PV_PMAP(pv);
4755 if (!PMAP_TRYLOCK(pmap)) {
4756 pvh_gen = pvh->pv_gen;
4757 md_gen = m->md.pv_gen;
4758 rw_wunlock(lock);
4759 PMAP_LOCK(pmap);
4760 rw_wlock(lock);
4761 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4762 PMAP_UNLOCK(pmap);
4763 goto retry;
4764 }
4765 }
4766 l2 = pmap_l2(pmap, pv->pv_va);
4767
4768 KASSERT((pmap_load(l2) & PTE_RX) == 0,
4769 ("pmap_ts_referenced: found an invalid l2 table"));
4770
4771 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4772 l3e = pmap_load(l3);
4773 if ((l3e & PTE_D) != 0)
4774 vm_page_dirty(m);
4775 if ((l3e & PTE_A) != 0) {
4776 if ((l3e & PTE_SW_WIRED) == 0) {
4777 /*
4778 * Wired pages cannot be paged out so
4779 * doing accessed bit emulation for
4780 * them is wasted effort. We do the
4781 * hard work for unwired pages only.
4782 */
4783 pmap_clear_bits(l3, PTE_A);
4784 pmap_invalidate_page(pmap, pv->pv_va);
4785 cleared++;
4786 } else
4787 not_cleared++;
4788 }
4789 PMAP_UNLOCK(pmap);
4790 /* Rotate the PV list if it has more than one entry. */
4791 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4792 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4793 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4794 m->md.pv_gen++;
4795 }
4796 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4797 not_cleared < PMAP_TS_REFERENCED_MAX);
4798 out:
4799 rw_wunlock(lock);
4800 rw_runlock(&pvh_global_lock);
4801 vm_page_free_pages_toq(&free, false);
4802 return (cleared + not_cleared);
4803 }
4804
4805 /*
4806 * Apply the given advice to the specified range of addresses within the
4807 * given pmap. Depending on the advice, clear the referenced and/or
4808 * modified flags in each mapping and set the mapped page's dirty field.
4809 */
4810 void
4811 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4812 {
4813 }
4814
4815 /*
4816 * Clear the modify bits on the specified physical page.
4817 */
4818 void
4819 pmap_clear_modify(vm_page_t m)
4820 {
4821 struct md_page *pvh;
4822 struct rwlock *lock;
4823 pmap_t pmap;
4824 pv_entry_t next_pv, pv;
4825 pd_entry_t *l2, oldl2;
4826 pt_entry_t *l3;
4827 vm_offset_t va;
4828 int md_gen, pvh_gen;
4829
4830 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4831 ("%s: page %p is not managed", __func__, m));
4832 vm_page_assert_busied(m);
4833
4834 if (!pmap_page_is_write_mapped(m))
4835 return;
4836
4837 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4838 pa_to_pvh(VM_PAGE_TO_PHYS(m));
4839 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4840 rw_rlock(&pvh_global_lock);
4841 rw_wlock(lock);
4842 restart:
4843 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4844 pmap = PV_PMAP(pv);
4845 if (!PMAP_TRYLOCK(pmap)) {
4846 pvh_gen = pvh->pv_gen;
4847 rw_wunlock(lock);
4848 PMAP_LOCK(pmap);
4849 rw_wlock(lock);
4850 if (pvh_gen != pvh->pv_gen) {
4851 PMAP_UNLOCK(pmap);
4852 goto restart;
4853 }
4854 }
4855 va = pv->pv_va;
4856 l2 = pmap_l2(pmap, va);
4857 oldl2 = pmap_load(l2);
4858 /* If oldl2 has PTE_W set, then it also has PTE_D set. */
4859 if ((oldl2 & PTE_W) != 0 &&
4860 pmap_demote_l2_locked(pmap, l2, va, &lock) &&
4861 (oldl2 & PTE_SW_WIRED) == 0) {
4862 /*
4863 * Write protect the mapping to a single page so that
4864 * a subsequent write access may repromote.
4865 */
4866 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
4867 l3 = pmap_l2_to_l3(l2, va);
4868 pmap_clear_bits(l3, PTE_D | PTE_W);
4869 vm_page_dirty(m);
4870 pmap_invalidate_page(pmap, va);
4871 }
4872 PMAP_UNLOCK(pmap);
4873 }
4874 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4875 pmap = PV_PMAP(pv);
4876 if (!PMAP_TRYLOCK(pmap)) {
4877 md_gen = m->md.pv_gen;
4878 pvh_gen = pvh->pv_gen;
4879 rw_wunlock(lock);
4880 PMAP_LOCK(pmap);
4881 rw_wlock(lock);
4882 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4883 PMAP_UNLOCK(pmap);
4884 goto restart;
4885 }
4886 }
4887 l2 = pmap_l2(pmap, pv->pv_va);
4888 KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4889 ("%s: found a 2mpage in page %p's pv list", __func__, m));
4890 l3 = pmap_l2_to_l3(l2, pv->pv_va);
4891 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
4892 pmap_clear_bits(l3, PTE_D | PTE_W);
4893 pmap_invalidate_page(pmap, pv->pv_va);
4894 }
4895 PMAP_UNLOCK(pmap);
4896 }
4897 rw_wunlock(lock);
4898 rw_runlock(&pvh_global_lock);
4899 }
4900
4901 void *
4902 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4903 {
4904
4905 return ((void *)PHYS_TO_DMAP(pa));
4906 }
4907
4908 void
4909 pmap_unmapbios(void *p, vm_size_t size)
4910 {
4911 }
4912
4913 /*
4914 * Sets the memory attribute for the specified page.
4915 */
4916 void
4917 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4918 {
4919 if (m->md.pv_memattr == ma)
4920 return;
4921
4922 m->md.pv_memattr = ma;
4923
4924 /*
4925 * If "m" is a normal page, update its direct mapping. This update
4926 * can be relied upon to perform any cache operations that are
4927 * required for data coherence.
4928 */
4929 if ((m->flags & PG_FICTITIOUS) == 0 &&
4930 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4931 m->md.pv_memattr) != 0)
4932 panic("memory attribute change on the direct map failed");
4933 }
4934
4935 /*
4936 * Changes the specified virtual address range's memory type to that given by
4937 * the parameter "mode". The specified virtual address range must be
4938 * completely contained within either the direct map or the kernel map.
4939 *
4940 * Returns zero if the change completed successfully, and either EINVAL or
4941 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
4942 * of the virtual address range was not mapped, and ENOMEM is returned if
4943 * there was insufficient memory available to complete the change. In the
4944 * latter case, the memory type may have been changed on some part of the
4945 * virtual address range.
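 *
 * For example, a caller that wants an uncached view of a physical range
 * covered by the direct map might do (a sketch; "pa" and "len" are
 * hypothetical):
 *
 *	error = pmap_change_attr(PHYS_TO_DMAP(pa), len,
 *	    VM_MEMATTR_UNCACHEABLE);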
4946 */
4947 int
4948 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4949 {
4950 int error;
4951
4952 PMAP_LOCK(kernel_pmap);
4953 error = pmap_change_attr_locked(va, size, mode);
4954 PMAP_UNLOCK(kernel_pmap);
4955 return (error);
4956 }
4957
4958 static int
4959 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4960 {
4961 vm_offset_t base, offset, tmpva;
4962 vm_paddr_t phys;
4963 pd_entry_t *l1, l1e;
4964 pd_entry_t *l2, l2e;
4965 pt_entry_t *l3, l3e;
4966 pt_entry_t bits, mask;
4967 bool anychanged = false;
4968 int error = 0;
4969
4970 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4971 base = trunc_page(va);
4972 offset = va & PAGE_MASK;
4973 size = round_page(offset + size);
4974
4975 if (!VIRT_IN_DMAP(base) &&
4976 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
4977 return (EINVAL);
4978
4979 bits = pmap_memattr_bits(mode);
4980 mask = memattr_mask;
4981
4982 /* First loop: perform PTE validation and demotions as necessary. */
4983 for (tmpva = base; tmpva < base + size; ) {
4984 l1 = pmap_l1(kernel_pmap, tmpva);
4985 if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0)
4986 return (EINVAL);
4987 if ((l1e & PTE_RWX) != 0) {
4988 /*
4989 * If the existing PTE has the correct attributes, then
4990 * no need to demote.
4991 */
4992 if ((l1e & mask) == bits) {
4993 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
4994 continue;
4995 }
4996
4997 /*
4998 * If the 1GB page fits in the remaining range, we
4999 * don't need to demote.
5000 */
5001 if ((tmpva & L1_OFFSET) == 0 &&
5002 tmpva + L1_SIZE <= base + size) {
5003 tmpva += L1_SIZE;
5004 continue;
5005 }
5006
5007 if (!pmap_demote_l1(kernel_pmap, l1, tmpva))
5008 return (EINVAL);
5009 }
5010 l2 = pmap_l1_to_l2(l1, tmpva);
5011 if (((l2e = pmap_load(l2)) & PTE_V) == 0)
5012 return (EINVAL);
5013 if ((l2e & PTE_RWX) != 0) {
5014 /*
5015 * If the existing PTE has the correct attributes, then
5016 * no need to demote.
5017 */
5018 if ((l2e & mask) == bits) {
5019 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
5020 continue;
5021 }
5022
5023 /*
5024 * If the 2MB page fits in the remaining range, we
5025 * don't need to demote.
5026 */
5027 if ((tmpva & L2_OFFSET) == 0 &&
5028 tmpva + L2_SIZE <= base + size) {
5029 tmpva += L2_SIZE;
5030 continue;
5031 }
5032
5033 if (!pmap_demote_l2(kernel_pmap, l2, tmpva))
5034 panic("l2 demotion failed");
5035 }
5036 l3 = pmap_l2_to_l3(l2, tmpva);
5037 if (((l3e = pmap_load(l3)) & PTE_V) == 0)
5038 return (EINVAL);
5039
5040 tmpva += PAGE_SIZE;
5041 }
5042
5043 /* Second loop: perform PTE updates. */
5044 for (tmpva = base; tmpva < base + size; ) {
5045 l1 = pmap_l1(kernel_pmap, tmpva);
5046 l1e = pmap_load(l1);
5047 if ((l1e & PTE_RWX) != 0) {
5048 /* Unchanged. */
5049 if ((l1e & mask) == bits) {
5050 tmpva += L1_SIZE;
5051 continue;
5052 }
5053
5054 l1e &= ~mask;
5055 l1e |= bits;
5056 pmap_store(l1, l1e);
5057 anychanged = true;
5058
5059 /* Update corresponding DMAP entry */
5060 phys = L1PTE_TO_PHYS(l1e);
5061 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5062 error = pmap_change_attr_locked(
5063 PHYS_TO_DMAP(phys), L1_SIZE, mode);
5064 if (error != 0)
5065 break;
5066 }
5067 tmpva += L1_SIZE;
5068 continue;
5069 }
5070
5071 l2 = pmap_l1_to_l2(l1, tmpva);
5072 l2e = pmap_load(l2);
5073 if ((l2e & PTE_RWX) != 0) {
5074 /* Unchanged. */
5075 if ((l2e & mask) == bits) {
5076 tmpva += L2_SIZE;
5077 continue;
5078 }
5079
5080 l2e &= ~mask;
5081 l2e |= bits;
5082 pmap_store(l2, l2e);
5083 anychanged = true;
5084
5085 /* Update corresponding DMAP entry */
5086 phys = L2PTE_TO_PHYS(l2e);
5087 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5088 error = pmap_change_attr_locked(
5089 PHYS_TO_DMAP(phys), L2_SIZE, mode);
5090 if (error != 0)
5091 break;
5092 }
5093 tmpva += L2_SIZE;
5094 continue;
5095 }
5096
5097 l3 = pmap_l2_to_l3(l2, tmpva);
5098 l3e = pmap_load(l3);
5099
5100 /* Unchanged. */
5101 if ((l3e & mask) == bits) {
5102 tmpva += PAGE_SIZE;
5103 continue;
5104 }
5105
5106 l3e &= ~mask;
5107 l3e |= bits;
5108 pmap_store(l3, l3e);
5109 anychanged = true;
5110
5111 phys = PTE_TO_PHYS(l3e);
5112 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5113 error = pmap_change_attr_locked(PHYS_TO_DMAP(phys),
5114 L3_SIZE, mode);
5115 if (error != 0)
5116 break;
5117 }
5118 tmpva += PAGE_SIZE;
5119 }
5120
5121 if (anychanged) {
5122 pmap_invalidate_range(kernel_pmap, base, tmpva);
5123 if (mode == VM_MEMATTR_UNCACHEABLE)
5124 cpu_dcache_wbinv_range(base, size);
5125 }
5126
5127 return (error);
5128 }
5129
5130 /*
5131 * Perform the pmap work for mincore(2). If the page is not both referenced and
5132 * modified by this pmap, returns its physical address so that the caller can
5133 * find other mappings.
5134 */
5135 int
5136 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
5137 {
5138 pt_entry_t *l2, *l3, tpte;
5139 vm_paddr_t pa;
5140 int val;
5141 bool managed;
5142
5143 PMAP_LOCK(pmap);
5144 l2 = pmap_l2(pmap, addr);
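	/*
	 * If the L2 entry is a leaf, the address is backed by a 2MB
	 * superpage and is reported with MINCORE_PSIND(1); otherwise the
	 * L3 entry for the 4KB page is examined.
	 */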
5145 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
5146 if ((tpte & PTE_RWX) != 0) {
5147 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
5148 val = MINCORE_INCORE | MINCORE_PSIND(1);
5149 } else {
5150 l3 = pmap_l2_to_l3(l2, addr);
5151 tpte = pmap_load(l3);
5152 if ((tpte & PTE_V) == 0) {
5153 PMAP_UNLOCK(pmap);
5154 return (0);
5155 }
5156 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
5157 val = MINCORE_INCORE;
5158 }
5159
5160 if ((tpte & PTE_D) != 0)
5161 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5162 if ((tpte & PTE_A) != 0)
5163 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5164 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
5165 } else {
5166 managed = false;
5167 val = 0;
5168 }
5169 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5170 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
5171 *pap = pa;
5172 }
5173 PMAP_UNLOCK(pmap);
5174 return (val);
5175 }
5176
5177 void
5178 pmap_activate_sw(struct thread *td)
5179 {
5180 pmap_t oldpmap, pmap;
5181 u_int hart;
5182
5183 oldpmap = PCPU_GET(curpmap);
5184 pmap = vmspace_pmap(td->td_proc->p_vmspace);
5185 if (pmap == oldpmap)
5186 return;
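	/*
	 * Writing satp installs the new root page table and address space
	 * ID; the sfence.vma below flushes any stale translations.
	 */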
5187 csr_write(satp, pmap->pm_satp);
5188
5189 hart = PCPU_GET(hart);
5190 #ifdef SMP
5191 CPU_SET_ATOMIC(hart, &pmap->pm_active);
5192 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
5193 #else
5194 CPU_SET(hart, &pmap->pm_active);
5195 CPU_CLR(hart, &oldpmap->pm_active);
5196 #endif
5197 PCPU_SET(curpmap, pmap);
5198
5199 sfence_vma();
5200 }
5201
5202 void
5203 pmap_activate(struct thread *td)
5204 {
5205
5206 critical_enter();
5207 pmap_activate_sw(td);
5208 critical_exit();
5209 }
5210
5211 void
5212 pmap_activate_boot(pmap_t pmap)
5213 {
5214 u_int hart;
5215
5216 hart = PCPU_GET(hart);
5217 #ifdef SMP
5218 CPU_SET_ATOMIC(hart, &pmap->pm_active);
5219 #else
5220 CPU_SET(hart, &pmap->pm_active);
5221 #endif
5222 PCPU_SET(curpmap, pmap);
5223 }
5224
5225 void
5226 pmap_active_cpus(pmap_t pmap, cpuset_t *res)
5227 {
5228 *res = pmap->pm_active;
5229 }
5230
5231 void
5232 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
5233 {
5234 cpuset_t mask;
5235
5236 /*
5237 * From the RISC-V User-Level ISA V2.2:
5238 *
5239 * "To make a store to instruction memory visible to all
5240 * RISC-V harts, the writing hart has to execute a data FENCE
5241 * before requesting that all remote RISC-V harts execute a
5242 * FENCE.I."
5243 *
5244 * However, this is slightly misleading; we still need to
5245 * perform a FENCE.I for the local hart, as FENCE does nothing
5246 * for its icache. FENCE.I alone is also sufficient for the
5247 * local hart.
5248 */
5249 sched_pin();
5250 mask = all_harts;
5251 CPU_CLR(PCPU_GET(hart), &mask);
5252 fence_i();
5253 if (!CPU_EMPTY(&mask) && smp_started) {
5254 fence();
5255 sbi_remote_fence_i(mask.__bits);
5256 }
5257 sched_unpin();
5258 }
5259
5260 /*
5261 * Increase the starting virtual address of the given mapping if a
5262 * different alignment might result in more superpage mappings.
5263 */
5264 void
5265 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5266 vm_offset_t *addr, vm_size_t size)
5267 {
5268 vm_offset_t superpage_offset;
5269
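	/*
	 * Adjust "*addr" so that its offset within a 2MB virtual page
	 * matches the mapping's offset within the object.  For example,
	 * if the mapping starts 4KB past a 2MB boundary in the object,
	 * choose an address that is also 4KB past a 2MB boundary.
	 */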
5270 if (size < L2_SIZE)
5271 return;
5272 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5273 offset += ptoa(object->pg_color);
5274 superpage_offset = offset & L2_OFFSET;
5275 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
5276 (*addr & L2_OFFSET) == superpage_offset)
5277 return;
5278 if ((*addr & L2_OFFSET) < superpage_offset)
5279 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
5280 else
5281 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
5282 }
5283
5284 /**
5285 * Get the kernel virtual address of a set of physical pages. If there are
5286 * physical addresses not covered by the DMAP perform a transient mapping
5287 * that will be removed when calling pmap_unmap_io_transient.
5288 *
5289 * \param page The pages the caller wishes to obtain the virtual
5290 * address on the kernel memory map.
5291 * \param vaddr On return contains the kernel virtual memory address
5292 * of the pages passed in the page parameter.
5293 * \param count Number of pages passed in.
5294 * \param can_fault true if the thread using the mapped pages can take
5295 * page faults, false otherwise.
5296 *
5297 * \returns true if the caller must call pmap_unmap_io_transient when
5298 * finished or false otherwise.
5299 *
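 * A minimal calling sketch (the page and buffer handling around it are
 * hypothetical):
 *
 *	vm_offset_t va;
 *	bool mapped;
 *
 *	mapped = pmap_map_io_transient(&m, &va, 1, false);
 *	(access the page contents through "va")
 *	if (mapped)
 *		pmap_unmap_io_transient(&m, &va, 1, false);
 *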
5300 */
5301 bool
5302 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5303 bool can_fault)
5304 {
5305 vm_paddr_t paddr;
5306 bool needs_mapping;
5307 int error __diagused, i;
5308
5309 /*
5310 * Allocate any KVA space that we need, this is done in a separate
5311 * loop to prevent calling vmem_alloc while pinned.
5312 */
5313 needs_mapping = false;
5314 for (i = 0; i < count; i++) {
5315 paddr = VM_PAGE_TO_PHYS(page[i]);
5316 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
5317 error = vmem_alloc(kernel_arena, PAGE_SIZE,
5318 M_BESTFIT | M_WAITOK, &vaddr[i]);
5319 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
5320 needs_mapping = true;
5321 } else {
5322 vaddr[i] = PHYS_TO_DMAP(paddr);
5323 }
5324 }
5325
5326 /* Exit early if everything is covered by the DMAP */
5327 if (!needs_mapping)
5328 return (false);
5329
5330 if (!can_fault)
5331 sched_pin();
5332 for (i = 0; i < count; i++) {
5333 paddr = VM_PAGE_TO_PHYS(page[i]);
5334 if (paddr >= DMAP_MAX_PHYSADDR) {
5335 panic(
5336 "pmap_map_io_transient: TODO: Map out of DMAP data");
5337 }
5338 }
5339
5340 return (needs_mapping);
5341 }
5342
5343 void
5344 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5345 bool can_fault)
5346 {
5347 vm_paddr_t paddr;
5348 int i;
5349
5350 if (!can_fault)
5351 sched_unpin();
5352 for (i = 0; i < count; i++) {
5353 paddr = VM_PAGE_TO_PHYS(page[i]);
5354 if (paddr >= DMAP_MAX_PHYSADDR) {
5355 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
5356 }
5357 }
5358 }
5359
5360 bool
5361 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
5362 {
5363
5364 return (mode >= VM_MEMATTR_DEFAULT && mode <= VM_MEMATTR_LAST);
5365 }
5366
5367 bool
5368 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
5369 pt_entry_t **l3)
5370 {
5371 pd_entry_t *l1p, *l2p;
5372
5373 /* Get l1 directory entry. */
5374 l1p = pmap_l1(pmap, va);
5375 *l1 = l1p;
5376
5377 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0)
5378 return (false);
5379
5380 if ((pmap_load(l1p) & PTE_RX) != 0) {
5381 *l2 = NULL;
5382 *l3 = NULL;
5383 return (true);
5384 }
5385
5386 /* Get l2 directory entry. */
5387 l2p = pmap_l1_to_l2(l1p, va);
5388 *l2 = l2p;
5389
5390 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0)
5391 return (false);
5392
5393 if ((pmap_load(l2p) & PTE_RX) != 0) {
5394 *l3 = NULL;
5395 return (true);
5396 }
5397
5398 /* Get l3 page table entry. */
5399 *l3 = pmap_l2_to_l3(l2p, va);
5400
5401 return (true);
5402 }
5403
5404 /*
5405 * Track a range of the kernel's virtual address space that is contiguous
5406 * in various mapping attributes.
5407 */
5408 struct pmap_kernel_map_range {
5409 vm_offset_t sva;
5410 pt_entry_t attrs;
5411 int l3pages;
5412 int l2pages;
5413 int l1pages;
5414 };
5415
5416 static void
5417 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
5418 vm_offset_t eva)
5419 {
5420 char *mode;
5421 int i;
5422
5423 if (eva <= range->sva)
5424 return;
5425
5426 for (i = 0; i < nitems(memattr_bits); i++)
5427 if ((range->attrs & memattr_mask) == memattr_bits[i])
5428 break;
5429
5430 switch (i) {
5431 case VM_MEMATTR_PMA:
5432 mode = "PMA";
5433 break;
5434 case VM_MEMATTR_UNCACHEABLE:
5435 mode = "NC ";
5436 break;
5437 case VM_MEMATTR_DEVICE:
5438 mode = "IO ";
5439 break;
5440 default:
5441 mode = "???";
5442 break;
5443 }
5444
5445 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
5446 range->sva, eva,
5447 (range->attrs & PTE_W) == PTE_W ? 'w' : '-',
5448 (range->attrs & PTE_X) == PTE_X ? 'x' : '-',
5449 (range->attrs & PTE_U) == PTE_U ? 'u' : 's',
5450 (range->attrs & PTE_G) == PTE_G ? 'g' : '-',
5451 mode, range->l1pages, range->l2pages, range->l3pages);
5452
5453 /* Reset to sentinel value. */
5454 range->sva = 0xfffffffffffffffful;
5455 }
5456
5457 /*
5458 * Determine whether the attributes specified by a page table entry match those
5459 * being tracked by the current range.
5460 */
5461 static bool
5462 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
5463 {
5464
5465 return (range->attrs == attrs);
5466 }
5467
5468 static void
5469 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
5470 pt_entry_t attrs)
5471 {
5472
5473 memset(range, 0, sizeof(*range));
5474 range->sva = va;
5475 range->attrs = attrs;
5476 }
5477
5478 /*
5479 * Given a leaf PTE, derive the mapping's attributes. If they do not match
5480 * those of the current run, dump the address range and its attributes, and
5481 * begin a new run.
5482 */
5483 static void
5484 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
5485 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e)
5486 {
5487 pt_entry_t attrs;
5488
5489 /* The PTE global bit is inherited by lower levels. */
5490 attrs = l1e & PTE_G;
5491 if ((l1e & PTE_RWX) != 0) {
5492 attrs |= l1e & (PTE_RWX | PTE_U);
5493 attrs |= l1e & memattr_mask;
5494 } else if (l2e != 0)
5495 attrs |= l2e & PTE_G;
5496
5497 if ((l2e & PTE_RWX) != 0) {
5498 attrs |= l2e & (PTE_RWX | PTE_U);
5499 attrs |= l2e & memattr_mask;
5500 } else if (l3e != 0) {
5501 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);
5502 attrs |= l3e & memattr_mask;
5503 }
5504
5505 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
5506 sysctl_kmaps_dump(sb, range, va);
5507 sysctl_kmaps_reinit(range, va, attrs);
5508 }
5509 }
5510
5511 static int
5512 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
5513 {
5514 struct pmap_kernel_map_range range;
5515 struct sbuf sbuf, *sb;
5516 pd_entry_t *l1, l1e, *l2, l2e;
5517 pt_entry_t *l3, l3e;
5518 vm_offset_t sva;
5519 vm_paddr_t pa;
5520 int error, i, j, k;
5521
5522 error = sysctl_wire_old_buffer(req, 0);
5523 if (error != 0)
5524 return (error);
5525 sb = &sbuf;
5526 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
5527
5528 /* Sentinel value. */
5529 range.sva = 0xfffffffffffffffful;
5530
5531 /*
5532 * Iterate over the kernel page tables without holding the kernel pmap
5533 * lock. Kernel page table pages are never freed, so at worst we will
5534 * observe inconsistencies in the output.
5535 */
5536 sva = VM_MIN_KERNEL_ADDRESS;
5537 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) {
5538 if (i == pmap_l1_index(DMAP_MIN_ADDRESS))
5539 sbuf_printf(sb, "\nDirect map:\n");
5540 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS))
5541 sbuf_printf(sb, "\nKernel map:\n");
5542
5543 l1 = pmap_l1(kernel_pmap, sva);
5544 l1e = pmap_load(l1);
5545 if ((l1e & PTE_V) == 0) {
5546 sysctl_kmaps_dump(sb, &range, sva);
5547 sva += L1_SIZE;
5548 continue;
5549 }
5550 if ((l1e & PTE_RWX) != 0) {
5551 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0);
5552 range.l1pages++;
5553 sva += L1_SIZE;
5554 continue;
5555 }
5556 pa = PTE_TO_PHYS(l1e);
5557 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
5558
5559 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) {
5560 l2e = l2[j];
5561 if ((l2e & PTE_V) == 0) {
5562 sysctl_kmaps_dump(sb, &range, sva);
5563 sva += L2_SIZE;
5564 continue;
5565 }
5566 if ((l2e & PTE_RWX) != 0) {
5567 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0);
5568 range.l2pages++;
5569 sva += L2_SIZE;
5570 continue;
5571 }
5572 pa = PTE_TO_PHYS(l2e);
5573 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa);
5574
5575 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++,
5576 sva += L3_SIZE) {
5577 l3e = l3[k];
5578 if ((l3e & PTE_V) == 0) {
5579 sysctl_kmaps_dump(sb, &range, sva);
5580 continue;
5581 }
5582 sysctl_kmaps_check(sb, &range, sva,
5583 l1e, l2e, l3e);
5584 range.l3pages++;
5585 }
5586 }
5587 }
5588
5589 error = sbuf_finish(sb);
5590 sbuf_delete(sb);
5591 return (error);
5592 }
5593 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
5594 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
5595 NULL, 0, sysctl_kmaps, "A",
5596 "Dump kernel address layout");
5597