1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 */
52 /*-
53 * Copyright (c) 2003 Networks Associates Technology, Inc.
54 * All rights reserved.
55 *
56 * This software was developed for the FreeBSD Project by Jake Burkholder,
57 * Safeport Network Services, and Network Associates Laboratories, the
58 * Security Research Division of Network Associates, Inc. under
59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60 * CHATS research program.
61 *
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
64 * are met:
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 * notice, this list of conditions and the following disclaimer in the
69 * documentation and/or other materials provided with the distribution.
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81 * SUCH DAMAGE.
82 */
83
84 #include <sys/cdefs.h>
85 /*
86 * Manages physical address maps.
87 *
88 * Since the information managed by this module is
89 * also stored by the logical address mapping module,
90 * this module may throw away valid virtual-to-physical
91 * mappings at almost any time. However, invalidations
92 * of virtual-to-physical mappings must be done as
93 * requested.
94 *
95 * In order to cope with hardware architectures which
96 * make virtual-to-physical map invalidates expensive,
97  * this module may delay invalidation or protection-reduction
98  * operations until such time as they are actually
99  * necessary.  This module is given full information as
100  * to which processors are currently using which maps,
101  * and when physical maps must be made correct.
102 */
103
104 #include "opt_vm.h"
105
106 #include <sys/param.h>
107 #include <sys/asan.h>
108 #include <sys/bitstring.h>
109 #include <sys/bus.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/ktr.h>
113 #include <sys/limits.h>
114 #include <sys/lock.h>
115 #include <sys/malloc.h>
116 #include <sys/mman.h>
117 #include <sys/msan.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/physmem.h>
121 #include <sys/proc.h>
122 #include <sys/rangeset.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147
148 #include <machine/asan.h>
149 #include <machine/cpu_feat.h>
150 #include <machine/machdep.h>
151 #include <machine/md_var.h>
152 #include <machine/pcb.h>
153
154 #ifdef NUMA
155 #define PMAP_MEMDOM MAXMEMDOM
156 #else
157 #define PMAP_MEMDOM 1
158 #endif
159
160 #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1)
161 #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2)
162
163 #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t)))
164 #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t)))
165 #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t)))
166 #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t)))
167
168 #define NUL0E L0_ENTRIES
169 #define NUL1E (NUL0E * NL1PG)
170 #define NUL2E (NUL1E * NL2PG)
171
172 #ifdef PV_STATS
173 #define PV_STAT(x) do { x ; } while (0)
174 #define __pvused
175 #else
176 #define PV_STAT(x) do { } while (0)
177 #define __pvused __unused
178 #endif
179
180 #define pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT))
181 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT))
182 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
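/*
 * Illustrative note on the scheme above: page table pages are identified by
 * pindex, with the ranges [0, NUL2E), [NUL2E, NUL2E + NUL1E) and
 * [NUL2E + NUL1E, ...) reserved for the pages holding L3, L2 and L1 tables
 * respectively, e.g. the L3 table page that maps va has pindex
 * pmap_l2_pindex(va) == va >> L2_SHIFT, so every page table page in a pmap
 * gets a unique pindex.
 */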
183
184 #ifdef __ARM_FEATURE_BTI_DEFAULT
185 pt_entry_t __read_mostly pmap_gp_attr;
186 #define ATTR_KERN_GP pmap_gp_attr
187 #else
188 #define ATTR_KERN_GP 0
189 #endif
190 #define PMAP_SAN_PTE_BITS (ATTR_AF | ATTR_S1_XN | pmap_sh_attr | \
191 ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
192
193 struct pmap_large_md_page {
194 struct rwlock pv_lock;
195 struct md_page pv_page;
196 /* Pad to a power of 2, see pmap_init_pv_table(). */
197 int pv_pad[2];
198 };
199
200 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
201 #define pv_dummy pv_dummy_large.pv_page
202 __read_mostly static struct pmap_large_md_page *pv_table;
203
204 static struct pmap_large_md_page *
205 _pa_to_pmdp(vm_paddr_t pa)
206 {
207 struct vm_phys_seg *seg;
208
209 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
210 return ((struct pmap_large_md_page *)seg->md_first +
211 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
212 return (NULL);
213 }
214
215 static struct pmap_large_md_page *
216 pa_to_pmdp(vm_paddr_t pa)
217 {
218 struct pmap_large_md_page *pvd;
219
220 pvd = _pa_to_pmdp(pa);
221 if (pvd == NULL)
222 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
223 return (pvd);
224 }
225
226 static struct pmap_large_md_page *
227 page_to_pmdp(vm_page_t m)
228 {
229 struct vm_phys_seg *seg;
230
231 seg = &vm_phys_segs[m->segind];
232 return ((struct pmap_large_md_page *)seg->md_first +
233 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
234 }
235
236 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page))
237 #define page_to_pvh(m) (&(page_to_pmdp(m)->pv_page))
238
239 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \
240 struct pmap_large_md_page *_pvd; \
241 struct rwlock *_lock; \
242 _pvd = _pa_to_pmdp(pa); \
243 if (__predict_false(_pvd == NULL)) \
244 _lock = &pv_dummy_large.pv_lock; \
245 else \
246 _lock = &(_pvd->pv_lock); \
247 _lock; \
248 })
249
250 static struct rwlock *
251 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
252 {
253 if ((m->flags & PG_FICTITIOUS) == 0)
254 return (&page_to_pmdp(m)->pv_lock);
255 else
256 return (&pv_dummy_large.pv_lock);
257 }
258
259 #define CHANGE_PV_LIST_LOCK(lockp, new_lock) do { \
260 struct rwlock **_lockp = (lockp); \
261 struct rwlock *_new_lock = (new_lock); \
262 \
263 if (_new_lock != *_lockp) { \
264 if (*_lockp != NULL) \
265 rw_wunlock(*_lockp); \
266 *_lockp = _new_lock; \
267 rw_wlock(*_lockp); \
268 } \
269 } while (0)
270
271 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) \
272 CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))
273
274 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
275 CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))
276
277 #define RELEASE_PV_LIST_LOCK(lockp) do { \
278 struct rwlock **_lockp = (lockp); \
279 \
280 if (*_lockp != NULL) { \
281 rw_wunlock(*_lockp); \
282 *_lockp = NULL; \
283 } \
284 } while (0)
285
286 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
287 #define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
288
289 /*
290 * The presence of this flag indicates that the mapping is writeable.
291 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
292 * it is dirty. This flag may only be set on managed mappings.
293 *
294 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
295 * as a software managed bit.
296 */
297 #define ATTR_SW_DBM ATTR_DBM
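/*
 * Example (illustrative only): for a writeable, managed stage 1 mapping the
 * dirty state is therefore encoded as
 *
 *	ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)	-> writeable but still clean
 *	ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)	-> dirty
 *
 * which is exactly the test performed by pmap_pte_dirty() below.
 */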
298
299 struct pmap kernel_pmap_store;
300
301 /* Used for mapping ACPI memory before VM is initialized */
302 #define PMAP_PREINIT_MAPPING_COUNT 32
303 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
304 static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */
305 static int vm_initialized = 0; /* No need to use pre-init maps when set */
306
307 /*
308  * Reserve a few L2 blocks starting from the 'preinit_map_va' pointer.
309 * Always map entire L2 block for simplicity.
310 * VA of L2 block = preinit_map_va + i * L2_SIZE
311 */
312 static struct pmap_preinit_mapping {
313 vm_paddr_t pa;
314 vm_offset_t va;
315 vm_size_t size;
316 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
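/*
 * Example (illustrative only): slot i of the array above covers the KVA
 * range
 *
 *	[preinit_map_va + i * L2_SIZE, preinit_map_va + (i + 1) * L2_SIZE)
 *
 * so the whole pre-init window is PMAP_PREINIT_MAPPING_SIZE bytes; with 4KB
 * base pages (L2_SIZE == 2MB) that is 32 x 2MB = 64MB of reserved KVA, set
 * aside in pmap_bootstrap().
 */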
317
318 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
319 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
320 vm_offset_t kernel_vm_end = 0;
321
322 /*
323 * Data for the pv entry allocation mechanism.
324 */
325 #ifdef NUMA
326 static __inline int
327 pc_to_domain(struct pv_chunk *pc)
328 {
329 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
330 }
331 #else
332 static __inline int
333 pc_to_domain(struct pv_chunk *pc __unused)
334 {
335 return (0);
336 }
337 #endif
338
339 struct pv_chunks_list {
340 struct mtx pvc_lock;
341 TAILQ_HEAD(pch, pv_chunk) pvc_list;
342 int active_reclaims;
343 } __aligned(CACHE_LINE_SIZE);
344
345 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
346
347 vm_paddr_t dmap_phys_base; /* The start of the dmap region */
348 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */
349 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */
350
351 extern pt_entry_t pagetable_l0_ttbr1[];
352
353 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
354 static vm_paddr_t physmap[PHYSMAP_SIZE];
355 static u_int physmap_idx;
356
357 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
358 "VM/pmap parameters");
359
360 static int pmap_growkernel_panic = 0;
361 SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN,
362 &pmap_growkernel_panic, 0,
363 "panic on failure to allocate kernel page table page");
364
365 bool pmap_lpa_enabled __read_mostly = false;
366 pt_entry_t pmap_sh_attr __read_mostly = ATTR_SH(ATTR_SH_IS);
367
368 #if PAGE_SIZE == PAGE_SIZE_4K
369 #define L1_BLOCKS_SUPPORTED 1
370 #else
371 #define L1_BLOCKS_SUPPORTED (pmap_lpa_enabled)
372 #endif
373
374 #define PMAP_ASSERT_L1_BLOCKS_SUPPORTED MPASS(L1_BLOCKS_SUPPORTED)
375
376 static bool pmap_l1_supported __read_mostly = false;
377
378 /*
379 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
380  * it has currently allocated to a pmap, a cursor ("asid_next") to
381 * optimize its search for a free ASID in the bit vector, and an epoch number
382 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
383 * ASIDs that are not currently active on a processor.
384 *
385 * The current epoch number is always in the range [0, INT_MAX). Negative
386 * numbers and INT_MAX are reserved for special cases that are described
387 * below.
388 */
389 struct asid_set {
390 int asid_bits;
391 bitstr_t *asid_set;
392 int asid_set_size;
393 int asid_next;
394 int asid_epoch;
395 struct mtx asid_set_mutex;
396 };
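/*
 * Illustrative sketch (not the actual code; see pmap_alloc_asid() and
 * pmap_reset_asid_set()) of how the fields above cooperate, assuming the
 * bitstring(9) primitives already used by pmap_init_asids():
 *
 *	mtx_lock_spin(&set->asid_set_mutex);
 *	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, &new);
 *	if (new == -1) {
 *		(wrapped: the real code reclaims inactive ASIDs and advances
 *		 asid_epoch before retrying from ASID_FIRST_AVAILABLE)
 *	}
 *	bit_set(set->asid_set, new);
 *	set->asid_next = new + 1;
 *	mtx_unlock_spin(&set->asid_set_mutex);
 */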
397
398 static struct asid_set asids;
399 static struct asid_set vmids;
400
401 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
402 "ASID allocator");
403 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
404 "The number of bits in an ASID");
405 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
406 "The last allocated ASID plus one");
407 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
408 "The current epoch number");
409
410 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
411 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
412     "The number of bits in a VMID");
413 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
414 "The last allocated VMID plus one");
415 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
416 "The current epoch number");
417
418 void (*pmap_clean_stage2_tlbi)(void);
419 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
420 void (*pmap_stage2_invalidate_all)(uint64_t);
421
422 /*
423 * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved
424 * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for
425 * dynamically allocated ASIDs have a non-negative epoch number.
426 *
427 * An invalid ASID is represented by -1.
428 *
429 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
430 * which indicates that an ASID should never be allocated to the pmap, and
431 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
432 * allocated when the pmap is next activated.
433 */
434 #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \
435 ((u_long)(epoch) << 32)))
436 #define COOKIE_TO_ASID(cookie) ((int)(cookie))
437 #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32))
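/*
 * Example (illustrative only): the kernel pmap is initialized in
 * pmap_bootstrap() with
 *
 *	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
 *
 * so it never receives a dynamically allocated ASID, whereas a pmap whose
 * cookie is COOKIE_FROM(-1, INT_MAX) will be given one the next time it is
 * activated.  Decoding works in the obvious way, e.g.
 * COOKIE_TO_ASID(COOKIE_FROM(7, 3)) == 7 and
 * COOKIE_TO_EPOCH(COOKIE_FROM(7, 3)) == 3.
 */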
438
439 #define TLBI_VA_SHIFT 12
440 #define TLBI_VA_MASK ((1ul << 44) - 1)
441 #define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
442
443 static int __read_frequently superpages_enabled = 1;
444 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
445 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
446 "Are large page mappings enabled?");
447
448 /*
449 * True when Branch Target Identification should be used by userspace. This
450 * allows pmap to mark pages as guarded with ATTR_S1_GP.
451 */
452 __read_mostly static bool pmap_bti_support = false;
453
454 /*
455 * Internal flags for pmap_enter()'s helper functions.
456 */
457 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
458 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
459
460 TAILQ_HEAD(pv_chunklist, pv_chunk);
461
462 static void free_pv_chunk(struct pv_chunk *pc);
463 static void free_pv_chunk_batch(struct pv_chunklist *batch);
464 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
465 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
466 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
467 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
468 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
469 vm_offset_t va);
470
471 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
472 static bool pmap_activate_int(pmap_t pmap);
473 static void pmap_alloc_asid(pmap_t pmap);
474 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
475 vm_prot_t prot, int mode, bool skip_unmapped);
476 static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
477 pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
478 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
479 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
480 vm_offset_t va, struct rwlock **lockp);
481 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
482 static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va);
483 static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
484 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
485 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
486 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
487 u_int flags, vm_page_t m, struct rwlock **lockp);
488 static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
489 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
490 static bool pmap_every_pte_zero(vm_paddr_t pa);
491 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
492 bool all_l3e_AF_set);
493 static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
494 static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
495 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
496 static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
497 struct rwlock **lockp);
498 static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
499 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
500 pd_entry_t l1e, bool demote_kl2e, struct spglist *free,
501 struct rwlock **lockp);
502 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
503 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
504 static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
505 vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
506 struct rwlock **lockp);
507 static void pmap_reset_asid_set(pmap_t pmap);
508 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
509 vm_page_t m, struct rwlock **lockp);
510
511 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
512 struct rwlock **lockp);
513
514 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
515 struct spglist *free);
516 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
517 static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
518 vm_offset_t va, vm_size_t size);
519 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
520
521 static uma_zone_t pmap_bti_ranges_zone;
522 static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
523 pt_entry_t *pte);
524 static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
525 static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
526 static void *bti_dup_range(void *ctx, void *data);
527 static void bti_free_range(void *ctx, void *node);
528 static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
529 static void pmap_bti_deassign_all(pmap_t pmap);
530
531 /*
532 * These load the old table data and store the new value.
533 * They need to be atomic as the System MMU may write to the table at
534 * the same time as the CPU.
535 */
536 #define pmap_clear(table) atomic_store_64(table, 0)
537 #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits)
538 #define pmap_load(table) (*table)
539 #define pmap_load_clear(table) atomic_swap_64(table, 0)
540 #define pmap_load_store(table, entry) atomic_swap_64(table, entry)
541 #define pmap_set_bits(table, bits) atomic_set_64(table, bits)
542 #define pmap_store(table, entry) atomic_store_64(table, entry)
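/*
 * Typical usage (sketch only): an entry is loaded once and the cached value
 * examined, e.g.
 *
 *	l2e = pmap_load(l2p);
 *	if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK)
 *		...
 *
 * while updates go through pmap_store()/pmap_set_bits()/pmap_clear(), or
 * pmap_load_clear()/pmap_load_store() when the previous contents are also
 * needed, so that the hardware walker never observes a torn entry.
 */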
543
544 /********************/
545 /* Inline functions */
546 /********************/
547
548 static __inline void
549 pagecopy(void *s, void *d)
550 {
551
552 memcpy(d, s, PAGE_SIZE);
553 }
554
555 static __inline pd_entry_t *
556 pmap_l0(pmap_t pmap, vm_offset_t va)
557 {
558
559 return (&pmap->pm_l0[pmap_l0_index(va)]);
560 }
561
562 static __inline pd_entry_t *
563 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
564 {
565 pd_entry_t *l1;
566
567 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
568 return (&l1[pmap_l1_index(va)]);
569 }
570
571 static __inline pd_entry_t *
572 pmap_l1(pmap_t pmap, vm_offset_t va)
573 {
574 pd_entry_t *l0;
575
576 l0 = pmap_l0(pmap, va);
577 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
578 return (NULL);
579
580 return (pmap_l0_to_l1(l0, va));
581 }
582
583 static __inline pd_entry_t *
584 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
585 {
586 pd_entry_t l1, *l2p;
587
588 l1 = pmap_load(l1p);
589
590 KASSERT(ADDR_IS_CANONICAL(va),
591 ("%s: Address not in canonical form: %lx", __func__, va));
592 /*
593 * The valid bit may be clear if pmap_update_entry() is concurrently
594 * modifying the entry, so for KVA only the entry type may be checked.
595 */
596 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
597 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
598 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
599 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
600 l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
601 return (&l2p[pmap_l2_index(va)]);
602 }
603
604 static __inline pd_entry_t *
605 pmap_l2(pmap_t pmap, vm_offset_t va)
606 {
607 pd_entry_t *l1;
608
609 l1 = pmap_l1(pmap, va);
610 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
611 return (NULL);
612
613 return (pmap_l1_to_l2(l1, va));
614 }
615
616 static __inline pt_entry_t *
617 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
618 {
619 pd_entry_t l2;
620 pt_entry_t *l3p;
621
622 l2 = pmap_load(l2p);
623
624 KASSERT(ADDR_IS_CANONICAL(va),
625 ("%s: Address not in canonical form: %lx", __func__, va));
626 /*
627 * The valid bit may be clear if pmap_update_entry() is concurrently
628 * modifying the entry, so for KVA only the entry type may be checked.
629 */
630 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
631 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
632 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
633 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
634 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
635 return (&l3p[pmap_l3_index(va)]);
636 }
637
638 /*
639 * Returns the lowest valid pde for a given virtual address.
640 * The next level may or may not point to a valid page or block.
641 */
642 static __inline pd_entry_t *
643 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
644 {
645 pd_entry_t *l0, *l1, *l2, desc;
646
647 l0 = pmap_l0(pmap, va);
648 desc = pmap_load(l0) & ATTR_DESCR_MASK;
649 if (desc != L0_TABLE) {
650 *level = -1;
651 return (NULL);
652 }
653
654 l1 = pmap_l0_to_l1(l0, va);
655 desc = pmap_load(l1) & ATTR_DESCR_MASK;
656 if (desc != L1_TABLE) {
657 *level = 0;
658 return (l0);
659 }
660
661 l2 = pmap_l1_to_l2(l1, va);
662 desc = pmap_load(l2) & ATTR_DESCR_MASK;
663 if (desc != L2_TABLE) {
664 *level = 1;
665 return (l1);
666 }
667
668 *level = 2;
669 return (l2);
670 }
671
672 /*
673 * Returns the lowest valid pte block or table entry for a given virtual
674 * address. If there are no valid entries return NULL and set the level to
675 * the first invalid level.
676 */
677 static __inline pt_entry_t *
678 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
679 {
680 pd_entry_t *l1, *l2, desc;
681 pt_entry_t *l3;
682
683 l1 = pmap_l1(pmap, va);
684 if (l1 == NULL) {
685 *level = 0;
686 return (NULL);
687 }
688 desc = pmap_load(l1) & ATTR_DESCR_MASK;
689 if (desc == L1_BLOCK) {
690 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
691 *level = 1;
692 return (l1);
693 }
694
695 if (desc != L1_TABLE) {
696 *level = 1;
697 return (NULL);
698 }
699
700 l2 = pmap_l1_to_l2(l1, va);
701 desc = pmap_load(l2) & ATTR_DESCR_MASK;
702 if (desc == L2_BLOCK) {
703 *level = 2;
704 return (l2);
705 }
706
707 if (desc != L2_TABLE) {
708 *level = 2;
709 return (NULL);
710 }
711
712 *level = 3;
713 l3 = pmap_l2_to_l3(l2, va);
714 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
715 return (NULL);
716
717 return (l3);
718 }
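/*
 * Illustrative use of the walkers above (sketch, not from the original
 * source):
 *
 *	pte = pmap_pte(pmap, va, &lvl);
 *	if (pte == NULL)
 *		(no mapping; lvl holds the first invalid level)
 *	else if (lvl == 1 || lvl == 2)
 *		(*pte is an L1_BLOCK or L2_BLOCK entry covering a superpage)
 *	else
 *		(lvl == 3: *pte is an L3_PAGE entry mapping a single page)
 */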
719
720 /*
721 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
722 * level that maps the specified virtual address, then a pointer to that entry
723 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled
724 * and a diagnostic message is provided, in which case this function panics.
725 */
726 static __always_inline pt_entry_t *
727 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
728 {
729 pd_entry_t *l0p, *l1p, *l2p;
730 pt_entry_t desc, *l3p;
731 int walk_level __diagused;
732
733 KASSERT(level >= 0 && level < 4,
734 ("%s: %s passed an out-of-range level (%d)", __func__, diag,
735 level));
736 l0p = pmap_l0(pmap, va);
737 desc = pmap_load(l0p) & ATTR_DESCR_MASK;
738 if (desc == L0_TABLE && level > 0) {
739 l1p = pmap_l0_to_l1(l0p, va);
740 desc = pmap_load(l1p) & ATTR_DESCR_MASK;
741 if (desc == L1_BLOCK && level == 1) {
742 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
743 return (l1p);
744 }
745 if (desc == L1_TABLE && level > 1) {
746 l2p = pmap_l1_to_l2(l1p, va);
747 desc = pmap_load(l2p) & ATTR_DESCR_MASK;
748 if (desc == L2_BLOCK && level == 2)
749 return (l2p);
750 else if (desc == L2_TABLE && level > 2) {
751 l3p = pmap_l2_to_l3(l2p, va);
752 desc = pmap_load(l3p) & ATTR_DESCR_MASK;
753 if (desc == L3_PAGE && level == 3)
754 return (l3p);
755 else
756 walk_level = 3;
757 } else
758 walk_level = 2;
759 } else
760 walk_level = 1;
761 } else
762 walk_level = 0;
763 KASSERT(diag == NULL,
764 ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
765 diag, va, level, desc, walk_level));
766 return (NULL);
767 }
768
769 bool
770 pmap_ps_enabled(pmap_t pmap)
771 {
772 /*
773 * Promotion requires a hypervisor call when the kernel is running
774  * in EL1. To stop this, disable superpage support on non-stage 1
775 * pmaps for now.
776 */
777 if (pmap->pm_stage != PM_STAGE1)
778 return (false);
779
780 #ifdef KMSAN
781 /*
782 * The break-before-make in pmap_update_entry() results in a situation
783 * where a CPU may call into the KMSAN runtime while the entry is
784 * invalid. If the entry is used to map the current thread structure,
785 * then the runtime will attempt to access unmapped memory. Avoid this
786 * by simply disabling superpage promotion for the kernel map.
787 */
788 if (pmap == kernel_pmap)
789 return (false);
790 #endif
791
792 return (superpages_enabled != 0);
793 }
794
795 bool
796 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
797 pd_entry_t **l2, pt_entry_t **l3)
798 {
799 pd_entry_t *l0p, *l1p, *l2p;
800
801 if (pmap->pm_l0 == NULL)
802 return (false);
803
804 l0p = pmap_l0(pmap, va);
805 *l0 = l0p;
806
807 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
808 return (false);
809
810 l1p = pmap_l0_to_l1(l0p, va);
811 *l1 = l1p;
812
813 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
814 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
815 *l2 = NULL;
816 *l3 = NULL;
817 return (true);
818 }
819
820 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
821 return (false);
822
823 l2p = pmap_l1_to_l2(l1p, va);
824 *l2 = l2p;
825
826 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
827 *l3 = NULL;
828 return (true);
829 }
830
831 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
832 return (false);
833
834 *l3 = pmap_l2_to_l3(l2p, va);
835
836 return (true);
837 }
838
839 static __inline int
840 pmap_l3_valid(pt_entry_t l3)
841 {
842
843 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
844 }
845
846 CTASSERT(L1_BLOCK == L2_BLOCK);
847
848 static pt_entry_t
849 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
850 {
851 pt_entry_t val;
852
853 if (pmap->pm_stage == PM_STAGE1) {
854 val = ATTR_S1_IDX(memattr);
855 if (memattr == VM_MEMATTR_DEVICE)
856 val |= ATTR_S1_XN;
857 return (val);
858 }
859
860 val = 0;
861
862 switch (memattr) {
863 case VM_MEMATTR_DEVICE:
864 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
865 ATTR_S2_XN(ATTR_S2_XN_ALL));
866 case VM_MEMATTR_UNCACHEABLE:
867 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
868 case VM_MEMATTR_WRITE_BACK:
869 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
870 case VM_MEMATTR_WRITE_THROUGH:
871 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
872 default:
873 panic("%s: invalid memory attribute %x", __func__, memattr);
874 }
875 }
876
877 static pt_entry_t
878 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
879 {
880 pt_entry_t val;
881
882 val = 0;
883 if (pmap->pm_stage == PM_STAGE1) {
884 if ((prot & VM_PROT_EXECUTE) == 0)
885 val |= ATTR_S1_XN;
886 if ((prot & VM_PROT_WRITE) == 0)
887 val |= ATTR_S1_AP(ATTR_S1_AP_RO);
888 } else {
889 if ((prot & VM_PROT_WRITE) != 0)
890 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
891 if ((prot & VM_PROT_READ) != 0)
892 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
893 if ((prot & VM_PROT_EXECUTE) == 0)
894 val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
895 }
896
897 return (val);
898 }
899
900 /*
901 * Checks if the PTE is dirty.
902 */
903 static inline int
904 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
905 {
906
907 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
908
909 if (pmap->pm_stage == PM_STAGE1) {
910 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
911 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
912
913 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
914 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
915 }
916
917 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
918 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
919 }
920
921 static __inline void
922 pmap_resident_count_inc(pmap_t pmap, int count)
923 {
924
925 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
926 pmap->pm_stats.resident_count += count;
927 }
928
929 static __inline void
930 pmap_resident_count_dec(pmap_t pmap, int count)
931 {
932
933 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
934 KASSERT(pmap->pm_stats.resident_count >= count,
935 ("pmap %p resident count underflow %ld %d", pmap,
936 pmap->pm_stats.resident_count, count));
937 pmap->pm_stats.resident_count -= count;
938 }
939
940 static vm_paddr_t
941 pmap_early_vtophys(vm_offset_t va)
942 {
943 vm_paddr_t pa_page;
944
945 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
946 return (pa_page | (va & PAR_LOW_MASK));
947 }
948
949 /* State of the bootstrapped DMAP page tables */
950 struct pmap_bootstrap_state {
951 pt_entry_t *l1;
952 pt_entry_t *l2;
953 pt_entry_t *l3;
954 vm_offset_t freemempos;
955 vm_offset_t va;
956 vm_paddr_t pa;
957 pt_entry_t table_attrs;
958 u_int l0_slot;
959 u_int l1_slot;
960 u_int l2_slot;
961 bool dmap_valid;
962 };
963
964 /* The bootstrap state */
965 static struct pmap_bootstrap_state bs_state = {
966 .l1 = NULL,
967 .l2 = NULL,
968 .l3 = NULL,
969 .table_attrs = TATTR_PXN_TABLE,
970 .l0_slot = L0_ENTRIES,
971 .l1_slot = Ln_ENTRIES,
972 .l2_slot = Ln_ENTRIES,
973 .dmap_valid = false,
974 };
975
976 static void
977 pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
978 {
979 vm_paddr_t l1_pa;
980 pd_entry_t l0e;
981 u_int l0_slot;
982
983 /* Link the level 0 table to a level 1 table */
984 l0_slot = pmap_l0_index(state->va);
985 if (l0_slot != state->l0_slot) {
986 /*
987 * Make sure we move from a low address to high address
988 * before the DMAP region is ready. This ensures we never
989 * modify an existing mapping until we can map from a
990 * physical address to a virtual address.
991 */
992 MPASS(state->l0_slot < l0_slot ||
993 state->l0_slot == L0_ENTRIES ||
994 state->dmap_valid);
995
996 /* Reset lower levels */
997 state->l2 = NULL;
998 state->l3 = NULL;
999 state->l1_slot = Ln_ENTRIES;
1000 state->l2_slot = Ln_ENTRIES;
1001
1002 /* Check the existing L0 entry */
1003 state->l0_slot = l0_slot;
1004 if (state->dmap_valid) {
1005 l0e = pagetable_l0_ttbr1[l0_slot];
1006 if ((l0e & ATTR_DESCR_VALID) != 0) {
1007 MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
1008 l1_pa = PTE_TO_PHYS(l0e);
1009 state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
1010 return;
1011 }
1012 }
1013
1014 /* Create a new L0 table entry */
1015 state->l1 = (pt_entry_t *)state->freemempos;
1016 memset(state->l1, 0, PAGE_SIZE);
1017 state->freemempos += PAGE_SIZE;
1018
1019 l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
1020 MPASS((l1_pa & Ln_TABLE_MASK) == 0);
1021 MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
1022 pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
1023 TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
1024 }
1025 KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
1026 }
1027
1028 static void
1029 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
1030 {
1031 vm_paddr_t l2_pa;
1032 pd_entry_t l1e;
1033 u_int l1_slot;
1034
1035 /* Make sure there is a valid L0 -> L1 table */
1036 pmap_bootstrap_l0_table(state);
1037
1038 /* Link the level 1 table to a level 2 table */
1039 l1_slot = pmap_l1_index(state->va);
1040 if (l1_slot != state->l1_slot) {
1041 /* See pmap_bootstrap_l0_table for a description */
1042 MPASS(state->l1_slot < l1_slot ||
1043 state->l1_slot == Ln_ENTRIES ||
1044 state->dmap_valid);
1045
1046 /* Reset lower levels */
1047 state->l3 = NULL;
1048 state->l2_slot = Ln_ENTRIES;
1049
1050 /* Check the existing L1 entry */
1051 state->l1_slot = l1_slot;
1052 if (state->dmap_valid) {
1053 l1e = state->l1[l1_slot];
1054 if ((l1e & ATTR_DESCR_VALID) != 0) {
1055 MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
1056 l2_pa = PTE_TO_PHYS(l1e);
1057 state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
1058 return;
1059 }
1060 }
1061
1062 /* Create a new L1 table entry */
1063 state->l2 = (pt_entry_t *)state->freemempos;
1064 memset(state->l2, 0, PAGE_SIZE);
1065 state->freemempos += PAGE_SIZE;
1066
1067 l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1068 MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1069 MPASS(state->l1[l1_slot] == 0);
1070 pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
1071 state->table_attrs | L1_TABLE);
1072 }
1073 KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1074 }
1075
1076 static void
1077 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1078 {
1079 vm_paddr_t l3_pa;
1080 pd_entry_t l2e;
1081 u_int l2_slot;
1082
1083 /* Make sure there is a valid L1 -> L2 table */
1084 pmap_bootstrap_l1_table(state);
1085
1086 /* Link the level 2 table to a level 3 table */
1087 l2_slot = pmap_l2_index(state->va);
1088 if (l2_slot != state->l2_slot) {
1089 /* See pmap_bootstrap_l0_table for a description */
1090 MPASS(state->l2_slot < l2_slot ||
1091 state->l2_slot == Ln_ENTRIES ||
1092 state->dmap_valid);
1093
1094 /* Check the existing L2 entry */
1095 state->l2_slot = l2_slot;
1096 if (state->dmap_valid) {
1097 l2e = state->l2[l2_slot];
1098 if ((l2e & ATTR_DESCR_VALID) != 0) {
1099 MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1100 l3_pa = PTE_TO_PHYS(l2e);
1101 state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1102 return;
1103 }
1104 }
1105
1106 /* Create a new L2 table entry */
1107 state->l3 = (pt_entry_t *)state->freemempos;
1108 memset(state->l3, 0, PAGE_SIZE);
1109 state->freemempos += PAGE_SIZE;
1110
1111 l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1112 MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1113 MPASS(state->l2[l2_slot] == 0);
1114 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
1115 state->table_attrs | L2_TABLE);
1116 }
1117 KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1118 }
1119
1120 static void
1121 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1122 {
1123 pt_entry_t contig;
1124 u_int l2_slot;
1125 bool first;
1126
1127 if ((physmap[i + 1] - state->pa) < L2_SIZE)
1128 return;
1129
1130 /* Make sure there is a valid L1 table */
1131 pmap_bootstrap_l1_table(state);
1132
1133 MPASS((state->va & L2_OFFSET) == 0);
1134 for (first = true, contig = 0;
1135 state->va < DMAP_MAX_ADDRESS &&
1136 (physmap[i + 1] - state->pa) >= L2_SIZE;
1137 state->va += L2_SIZE, state->pa += L2_SIZE) {
1138 /*
1139 * Stop if we are about to walk off the end of what the
1140 * current L1 slot can address.
1141 */
1142 if (!first && (state->pa & L1_OFFSET) == 0)
1143 break;
1144
1145 /*
1146 * If we have an aligned, contiguous chunk of L2C_ENTRIES
1147 * L2 blocks, set the contiguous bit within each PTE so that
1148 * the chunk can be cached using only one TLB entry.
1149 */
1150 if ((state->pa & L2C_OFFSET) == 0) {
1151 if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
1152 physmap[i + 1] - state->pa >= L2C_SIZE) {
1153 contig = ATTR_CONTIGUOUS;
1154 } else {
1155 contig = 0;
1156 }
1157 }
1158
1159 first = false;
1160 l2_slot = pmap_l2_index(state->va);
1161 MPASS((state->pa & L2_OFFSET) == 0);
1162 MPASS(state->l2[l2_slot] == 0);
1163 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
1164 ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
1165 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
1166 }
1167 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1168 }
1169
1170 static void
1171 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1172 {
1173 pt_entry_t contig;
1174 u_int l3_slot;
1175 bool first;
1176
1177 if (physmap[i + 1] - state->pa < L3_SIZE)
1178 return;
1179
1180 /* Make sure there is a valid L2 table */
1181 pmap_bootstrap_l2_table(state);
1182
1183 MPASS((state->va & L3_OFFSET) == 0);
1184 for (first = true, contig = 0;
1185 state->va < DMAP_MAX_ADDRESS &&
1186 physmap[i + 1] - state->pa >= L3_SIZE;
1187 state->va += L3_SIZE, state->pa += L3_SIZE) {
1188 /*
1189 * Stop if we are about to walk off the end of what the
1190 * current L2 slot can address.
1191 */
1192 if (!first && (state->pa & L2_OFFSET) == 0)
1193 break;
1194
1195 /*
1196 * If we have an aligned, contiguous chunk of L3C_ENTRIES
1197 * L3 pages, set the contiguous bit within each PTE so that
1198 * the chunk can be cached using only one TLB entry.
1199 */
1200 if ((state->pa & L3C_OFFSET) == 0) {
1201 if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
1202 physmap[i + 1] - state->pa >= L3C_SIZE) {
1203 contig = ATTR_CONTIGUOUS;
1204 } else {
1205 contig = 0;
1206 }
1207 }
1208
1209 first = false;
1210 l3_slot = pmap_l3_index(state->va);
1211 MPASS((state->pa & L3_OFFSET) == 0);
1212 MPASS(state->l3[l3_slot] == 0);
1213 pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
1214 ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
1215 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
1216 }
1217 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1218 }
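/*
 * Note on the ATTR_CONTIGUOUS settings in the two functions above
 * (illustrative, assuming the usual 4KB translation granule): L3C_ENTRIES is
 * 16, so a contiguous run covers 16 x 4KB = 64KB at level 3, and L2C_ENTRIES
 * is likewise 16, so a run covers 16 x 2MB = 32MB at level 2; in both cases
 * the hint lets the TLB hold the whole run in a single entry.
 */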
1219
1220 void
1221 pmap_bootstrap_dmap(vm_size_t kernlen)
1222 {
1223 vm_paddr_t start_pa, pa;
1224 uint64_t tcr;
1225 int i;
1226
1227 tcr = READ_SPECIALREG(tcr_el1);
1228
1229 /* Verify that the ASID is set through TTBR0. */
1230 KASSERT((tcr & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1231
1232 if ((tcr & TCR_DS) != 0)
1233 pmap_lpa_enabled = true;
1234
1235 pmap_l1_supported = L1_BLOCKS_SUPPORTED;
1236
1237 start_pa = pmap_early_vtophys(KERNBASE);
1238
1239 bs_state.freemempos = KERNBASE + kernlen;
1240 bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1241
1242 /* Fill in physmap array. */
1243 physmap_idx = physmem_avail(physmap, nitems(physmap));
1244
1245 dmap_phys_base = physmap[0] & ~L1_OFFSET;
1246 dmap_phys_max = 0;
1247 dmap_max_addr = 0;
1248
1249 for (i = 0; i < physmap_idx; i += 2) {
1250 bs_state.pa = physmap[i] & ~L3_OFFSET;
1251 bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1252
1253 /* Create L3 mappings at the start of the region */
1254 if ((bs_state.pa & L2_OFFSET) != 0)
1255 pmap_bootstrap_l3_page(&bs_state, i);
1256 MPASS(bs_state.pa <= physmap[i + 1]);
1257
1258 if (L1_BLOCKS_SUPPORTED) {
1259 /* Create L2 mappings at the start of the region */
1260 if ((bs_state.pa & L1_OFFSET) != 0)
1261 pmap_bootstrap_l2_block(&bs_state, i);
1262 MPASS(bs_state.pa <= physmap[i + 1]);
1263
1264 /* Create the main L1 block mappings */
1265 for (; bs_state.va < DMAP_MAX_ADDRESS &&
1266 (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1267 bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1268 /* Make sure there is a valid L1 table */
1269 pmap_bootstrap_l0_table(&bs_state);
1270 MPASS((bs_state.pa & L1_OFFSET) == 0);
1271 pmap_store(
1272 &bs_state.l1[pmap_l1_index(bs_state.va)],
1273 PHYS_TO_PTE(bs_state.pa) | ATTR_AF |
1274 pmap_sh_attr |
1275 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1276 ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
1277 }
1278 MPASS(bs_state.pa <= physmap[i + 1]);
1279
1280 /* Create L2 mappings at the end of the region */
1281 pmap_bootstrap_l2_block(&bs_state, i);
1282 } else {
1283 while (bs_state.va < DMAP_MAX_ADDRESS &&
1284 (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1285 pmap_bootstrap_l2_block(&bs_state, i);
1286 }
1287 }
1288 MPASS(bs_state.pa <= physmap[i + 1]);
1289
1290 /* Create L3 mappings at the end of the region */
1291 pmap_bootstrap_l3_page(&bs_state, i);
1292 MPASS(bs_state.pa == physmap[i + 1]);
1293
1294 if (bs_state.pa > dmap_phys_max) {
1295 dmap_phys_max = bs_state.pa;
1296 dmap_max_addr = bs_state.va;
1297 }
1298 }
1299
1300 cpu_tlb_flushID();
1301
1302 bs_state.dmap_valid = true;
1303
1304 /* Exclude the kernel and DMAP region */
1305 pa = pmap_early_vtophys(bs_state.freemempos);
1306 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1307 }
1308
1309 static void
1310 pmap_bootstrap_l2(vm_offset_t va)
1311 {
1312 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1313
1314 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
1315 bs_state.va = va;
1316
1317 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1318 pmap_bootstrap_l1_table(&bs_state);
1319 }
1320
1321 static void
1322 pmap_bootstrap_l3(vm_offset_t va)
1323 {
1324 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1325
1326 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
1327 bs_state.va = va;
1328
1329 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1330 pmap_bootstrap_l2_table(&bs_state);
1331 }
1332
1333 /*
1334 * Bootstrap the system enough to run with virtual memory.
1335 */
1336 void
1337 pmap_bootstrap(void)
1338 {
1339 vm_offset_t dpcpu, msgbufpv;
1340 vm_paddr_t start_pa, pa;
1341 size_t largest_phys_size;
1342
1343 /* Set this early so we can use the pagetable walking functions */
1344 kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1345 PMAP_LOCK_INIT(kernel_pmap);
1346 kernel_pmap->pm_l0_paddr =
1347 pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1348 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1349 vm_radix_init(&kernel_pmap->pm_root);
1350 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1351 kernel_pmap->pm_stage = PM_STAGE1;
1352 kernel_pmap->pm_levels = 4;
1353 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1354 kernel_pmap->pm_asid_set = &asids;
1355
1356 /* Reserve some VA space for early BIOS/ACPI mapping */
1357 preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1358
1359 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1360 virtual_avail = roundup2(virtual_avail, L1_SIZE);
1361 virtual_end = VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE;
1362 kernel_vm_end = virtual_avail;
1363
1364 /*
1365 * We only use PXN when we know nothing will be executed from it, e.g.
1366 * the DMAP region.
1367 */
1368 bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1369
1370 /*
1371 * Find the physical memory we could use. This needs to be after we
1372 * exclude any memory that is mapped into the DMAP region but should
1373 * not be used by the kernel, e.g. some UEFI memory types.
1374 */
1375 physmap_idx = physmem_avail(physmap, nitems(physmap));
1376
1377 /*
1378  * Find space for early allocations. We search for the largest
1379  * region because the user may choose a large msgbuf. This could
1380  * be smarter, e.g. allowing multiple regions to be used and
1381  * switching to the next when one is full.
1382 */
1383 largest_phys_size = 0;
1384 for (int i = 0; i < physmap_idx; i += 2) {
1385 if ((physmap[i + 1] - physmap[i]) > largest_phys_size) {
1386 largest_phys_size = physmap[i + 1] - physmap[i];
1387 bs_state.freemempos = PHYS_TO_DMAP(physmap[i]);
1388 }
1389 }
1390
1391 start_pa = pmap_early_vtophys(bs_state.freemempos);
1392
1393 /*
1394 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the
1395 * loader allocated the first and only l2 page table page used to map
1396 * the kernel, preloaded files and module metadata.
1397 */
1398 pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1399 /* And the l3 tables for the early devmap */
1400 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1401
1402 cpu_tlb_flushID();
1403
1404 #define alloc_pages(var, np) \
1405 (var) = bs_state.freemempos; \
1406 bs_state.freemempos += (np * PAGE_SIZE); \
1407 memset((char *)(var), 0, ((np) * PAGE_SIZE));
1408
1409 /* Allocate dynamic per-cpu area. */
1410 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1411 dpcpu_init((void *)dpcpu, 0);
1412
1413 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1414 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1415 msgbufp = (void *)msgbufpv;
1416
1417 pa = pmap_early_vtophys(bs_state.freemempos);
1418
1419 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1420 }
1421
1422 #if defined(KASAN) || defined(KMSAN)
1423 static void
1424 pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1425 vm_offset_t *vap, vm_offset_t eva)
1426 {
1427 vm_paddr_t pa;
1428 vm_offset_t va;
1429 pd_entry_t *l2;
1430
1431 va = *vap;
1432 pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1433 for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
1434 l2 = pmap_l2(kernel_pmap, va);
1435
1436 /*
1437 * KASAN stack checking results in us having already allocated
1438 * part of our shadow map, so we can just skip those segments.
1439 */
1440 if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1441 pa += L2_SIZE;
1442 continue;
1443 }
1444
1445 bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
1446 physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
1447 pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
1448 }
1449 *vap = va;
1450 }
1451
1452 /*
1453 * Finish constructing the initial shadow map:
1454 * - Count how many pages from KERNBASE to virtual_avail (scaled for
1455 * shadow map)
1456 * - Map that entire range using L2 superpages.
1457 */
1458 static void
1459 pmap_bootstrap_san1(vm_offset_t va, int scale)
1460 {
1461 vm_offset_t eva;
1462 vm_paddr_t kernstart;
1463 int i;
1464
1465 kernstart = pmap_early_vtophys(KERNBASE);
1466
1467 /*
1468  * Rebuild physmap one more time; we may have excluded more regions from
1469 * allocation since pmap_bootstrap().
1470 */
1471 physmap_idx = physmem_avail(physmap, nitems(physmap));
1472
1473 eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1474
1475 /*
1476 * Find a slot in the physmap large enough for what we needed. We try to put
1477 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1478 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1479 */
1480 for (i = physmap_idx - 2; i >= 0; i -= 2) {
1481 vm_paddr_t plow, phigh;
1482
1483 /* L2 mappings must be backed by memory that is L2-aligned */
1484 plow = roundup2(physmap[i], L2_SIZE);
1485 phigh = physmap[i + 1];
1486 if (plow >= phigh)
1487 continue;
1488 if (kernstart >= plow && kernstart < phigh)
1489 phigh = kernstart;
1490 if (phigh - plow >= L2_SIZE) {
1491 pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1492 if (va >= eva)
1493 break;
1494 }
1495 }
1496 if (i < 0)
1497 panic("Could not find phys region for shadow map");
1498
1499 /*
1500 * Done. We should now have a valid shadow address mapped for all KVA
1501 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1502 * shadow accesses by the sanitizer runtime will succeed for this range.
1503 * When the kernel virtual address range is later expanded, as will
1504 * happen in vm_mem_init(), the shadow map will be grown as well. This
1505 * is handled by pmap_san_enter().
1506 */
1507 }
1508
1509 void
1510 pmap_bootstrap_san(void)
1511 {
1512 #ifdef KASAN
1513 pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
1514 #else
1515 static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1516 static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1517 pd_entry_t *l0, *l1;
1518
1519 if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
1520 panic("initial kernel map is too large");
1521
1522 l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
1523 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1524 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
1525 l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
1526 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1527 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
1528 pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);
1529
1530 l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
1531 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1532 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
1533 l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
1534 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1535 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
1536 pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
1537 #endif
1538 }
1539 #endif
1540
1541 /*
1542 * Initialize a vm_page's machine-dependent fields.
1543 */
1544 void
1545 pmap_page_init(vm_page_t m)
1546 {
1547
1548 TAILQ_INIT(&m->md.pv_list);
1549 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1550 }
1551
1552 static void
1553 pmap_init_asids(struct asid_set *set, int bits)
1554 {
1555 int i;
1556
1557 set->asid_bits = bits;
1558
1559 /*
1560 * We may be too early in the overall initialization process to use
1561 * bit_alloc().
1562 */
1563 set->asid_set_size = 1 << set->asid_bits;
1564 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1565 M_WAITOK | M_ZERO);
1566 for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1567 bit_set(set->asid_set, i);
1568 set->asid_next = ASID_FIRST_AVAILABLE;
1569 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1570 }
1571
1572 static void
1573 pmap_init_pv_table(void)
1574 {
1575 struct vm_phys_seg *seg, *next_seg;
1576 struct pmap_large_md_page *pvd;
1577 vm_size_t s;
1578 int domain, i, j, pages;
1579
1580 /*
1581 * We depend on the size being evenly divisible into a page so
1582 * that the pv_table array can be indexed directly while
1583 * safely spanning multiple pages from different domains.
1584 */
1585 CTASSERT(PAGE_SIZE % sizeof(*pvd) == 0);
1586
1587 /*
1588 * Calculate the size of the array.
1589 */
1590 s = 0;
1591 for (i = 0; i < vm_phys_nsegs; i++) {
1592 seg = &vm_phys_segs[i];
1593 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1594 pmap_l2_pindex(seg->start);
1595 s += round_page(pages * sizeof(*pvd));
1596 }
1597 pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1598 if (pv_table == NULL)
1599 panic("%s: kva_alloc failed\n", __func__);
1600
1601 /*
1602 * Iterate physical segments to allocate domain-local memory for PV
1603 * list headers.
1604 */
1605 pvd = pv_table;
1606 for (i = 0; i < vm_phys_nsegs; i++) {
1607 seg = &vm_phys_segs[i];
1608 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1609 pmap_l2_pindex(seg->start);
1610 domain = seg->domain;
1611
1612 s = round_page(pages * sizeof(*pvd));
1613
1614 for (j = 0; j < s; j += PAGE_SIZE) {
1615 vm_page_t m = vm_page_alloc_noobj_domain(domain,
1616 VM_ALLOC_ZERO);
1617 if (m == NULL)
1618 panic("failed to allocate PV table page");
1619 pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1620 }
1621
1622 for (j = 0; j < s / sizeof(*pvd); j++) {
1623 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1624 TAILQ_INIT(&pvd->pv_page.pv_list);
1625 pvd++;
1626 }
1627 }
1628 pvd = &pv_dummy_large;
1629 memset(pvd, 0, sizeof(*pvd));
1630 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1631 TAILQ_INIT(&pvd->pv_page.pv_list);
1632
1633 /*
1634 * Set pointers from vm_phys_segs to pv_table.
1635 */
1636 for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1637 seg = &vm_phys_segs[i];
1638 seg->md_first = pvd;
1639 pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1640 pmap_l2_pindex(seg->start);
1641
1642 /*
1643 * If there is a following segment, and the final
1644 * superpage of this segment and the initial superpage
1645  * of the next segment are the same, then adjust the
1646 * pv_table entry for that next segment down by one so
1647 * that the pv_table entries will be shared.
1648 */
1649 if (i + 1 < vm_phys_nsegs) {
1650 next_seg = &vm_phys_segs[i + 1];
1651 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1652 pmap_l2_pindex(next_seg->start)) {
1653 pvd--;
1654 }
1655 }
1656 }
1657 }
1658
1659 static bool
1660 pmap_dbm_check(const struct cpu_feat *feat __unused, u_int midr __unused)
1661 {
1662 uint64_t id_aa64mmfr1;
1663
1664 id_aa64mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1665 return (ID_AA64MMFR1_HAFDBS_VAL(id_aa64mmfr1) >=
1666 ID_AA64MMFR1_HAFDBS_AF_DBS);
1667 }
1668
1669 static bool
1670 pmap_dbm_has_errata(const struct cpu_feat *feat __unused, u_int midr,
1671 u_int **errata_list, u_int *errata_count)
1672 {
1673 /* Disable on Cortex-A55 for erratum 1024718 - all revisions */
1674 if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_ARM,
1675 CPU_PART_CORTEX_A55, 0, 0)) {
1676 static u_int errata_id = 1024718;
1677
1678 *errata_list = &errata_id;
1679 *errata_count = 1;
1680 return (true);
1681 }
1682
1683 /* Disable on Cortex-A510 for erratum 2051678 - r0p0 to r0p2 */
1684 if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK | CPU_VAR_MASK,
1685 CPU_IMPL_ARM, CPU_PART_CORTEX_A510, 0, 0)) {
1686 if (CPU_REV(PCPU_GET(midr)) < 3) {
1687 static u_int errata_id = 2051678;
1688
1689 *errata_list = &errata_id;
1690 *errata_count = 1;
1691 return (true);
1692 }
1693 }
1694
1695 return (false);
1696 }
1697
1698 static void
1699 pmap_dbm_enable(const struct cpu_feat *feat __unused,
1700 cpu_feat_errata errata_status, u_int *errata_list __unused,
1701 u_int errata_count)
1702 {
1703 uint64_t tcr;
1704
1705 /* Skip if there is an erratum affecting DBM */
1706 if (errata_status != ERRATA_NONE)
1707 return;
1708
1709 tcr = READ_SPECIALREG(tcr_el1) | TCR_HD;
1710 WRITE_SPECIALREG(tcr_el1, tcr);
1711 isb();
1712 /* Flush the local TLB for the TCR_HD flag change */
1713 dsb(nshst);
1714 __asm __volatile("tlbi vmalle1");
1715 dsb(nsh);
1716 isb();
1717 }
1718
1719 static struct cpu_feat feat_dbm = {
1720 .feat_name = "FEAT_HAFDBS (DBM)",
1721 .feat_check = pmap_dbm_check,
1722 .feat_has_errata = pmap_dbm_has_errata,
1723 .feat_enable = pmap_dbm_enable,
1724 .feat_flags = CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU,
1725 };
1726 DATA_SET(cpu_feat_set, feat_dbm);
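/*
 * A sketch of how the registration above is consumed (the cpu_feat
 * framework lives outside this file, so the exact sequence is assumed):
 * for each CPU the framework calls pmap_dbm_check() to test
 * ID_AA64MMFR1_EL1 for FEAT_HAFDBS, pmap_dbm_has_errata() to collect any
 * erratum that disqualifies the feature on that part, and, only when no
 * erratum applies, pmap_dbm_enable() to set TCR_EL1.HD.  CPU_FEAT_PER_CPU
 * requests that this be repeated on every CPU rather than once globally.
 */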
1727
1728 /*
1729 * Initialize the pmap module.
1730 *
1731 * Called by vm_mem_init(), to initialize any structures that the pmap
1732 * system needs to map virtual memory.
1733 */
1734 void
1735 pmap_init(void)
1736 {
1737 uint64_t mmfr1;
1738 int i, vmid_bits;
1739
1740 /*
1741 * Are large page mappings enabled?
1742 */
1743 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1744 if (superpages_enabled) {
1745 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1746 ("pmap_init: can't assign to pagesizes[1]"));
1747 pagesizes[1] = L3C_SIZE;
1748 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1749 ("pmap_init: can't assign to pagesizes[2]"));
1750 pagesizes[2] = L2_SIZE;
1751 if (L1_BLOCKS_SUPPORTED) {
1752 KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
1753 ("pmap_init: can't assign to pagesizes[3]"));
1754 pagesizes[3] = L1_SIZE;
1755 }
1756 }
1757
1758 /*
1759 * Initialize the ASID allocator.
1760 */
1761 pmap_init_asids(&asids,
1762 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1763
1764 if (has_hyp()) {
1765 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1766 vmid_bits = 8;
1767
1768 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1769 ID_AA64MMFR1_VMIDBits_16)
1770 vmid_bits = 16;
1771 pmap_init_asids(&vmids, vmid_bits);
1772 }
1773
1774 /*
1775 * Initialize pv chunk lists.
1776 */
1777 for (i = 0; i < PMAP_MEMDOM; i++) {
1778 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1779 MTX_DEF);
1780 TAILQ_INIT(&pv_chunks[i].pvc_list);
1781 }
1782 pmap_init_pv_table();
1783
1784 vm_initialized = 1;
1785 }
1786
1787 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1788 "L1 (1GB/64GB) page mapping counters");
1789
1790 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
1791 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
1792 &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");
1793
1794 SYSCTL_BOOL(_vm_pmap_l1, OID_AUTO, supported, CTLFLAG_RD, &pmap_l1_supported,
1795 0, "L1 blocks are supported");
1796
1797 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1798 "L2C (32MB/1GB) page mapping counters");
1799
1800 static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
1801 SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
1802 &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");
1803
1804 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1805 "2MB page mapping counters");
1806
1807 static COUNTER_U64_DEFINE_EARLY(pmap_l2_demotions);
1808 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1809 &pmap_l2_demotions, "L2 (2MB/32MB) page demotions");
1810
1811 static COUNTER_U64_DEFINE_EARLY(pmap_l2_mappings);
1812 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1813 &pmap_l2_mappings, "L2 (2MB/32MB) page mappings");
1814
1815 static COUNTER_U64_DEFINE_EARLY(pmap_l2_p_failures);
1816 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1817 &pmap_l2_p_failures, "L2 (2MB/32MB) page promotion failures");
1818
1819 static COUNTER_U64_DEFINE_EARLY(pmap_l2_promotions);
1820 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1821 &pmap_l2_promotions, "L2 (2MB/32MB) page promotions");
1822
1823 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1824 "L3C (64KB/2MB) page mapping counters");
1825
1826 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
1827 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
1828 &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");
1829
1830 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
1831 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
1832 &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");
1833
1834 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
1835 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
1836 &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");
1837
1838 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
1839 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
1840 &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1841
1842 /*
1843 * If the given value for "final_only" is false, then any cached intermediate-
1844 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1845 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1846 * Otherwise, just the cached final-level entry is invalidated.
1847 */
1848 static __inline void
1849 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1850 {
1851 if (final_only)
1852 __asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1853 else
1854 __asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1855 }
1856
1857 static __inline void
1858 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1859 {
1860 if (final_only)
1861 __asm __volatile("tlbi vale1is, %0" : : "r" (r));
1862 else
1863 __asm __volatile("tlbi vae1is, %0" : : "r" (r));
1864 }
1865
1866 /*
1867 * Invalidates any cached final- and optionally intermediate-level TLB entries
1868 * for the specified virtual address in the given virtual address space.
1869 */
1870 static __inline void
1871 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1872 {
1873 uint64_t r;
1874
1875 PMAP_ASSERT_STAGE1(pmap);
1876
1877 dsb(ishst);
1878 r = TLBI_VA(va);
1879 if (pmap == kernel_pmap) {
1880 pmap_s1_invalidate_kernel(r, final_only);
1881 } else {
1882 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1883 pmap_s1_invalidate_user(r, final_only);
1884 }
1885 dsb(ish);
1886 isb();
1887 }
1888
1889 static __inline void
1890 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1891 {
1892 PMAP_ASSERT_STAGE2(pmap);
1893 MPASS(pmap_stage2_invalidate_range != NULL);
1894 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1895 final_only);
1896 }
1897
1898 static __inline void
1899 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1900 {
1901 if (pmap->pm_stage == PM_STAGE1)
1902 pmap_s1_invalidate_page(pmap, va, final_only);
1903 else
1904 pmap_s2_invalidate_page(pmap, va, final_only);
1905 }
1906
1907 /*
1908 * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK
1909 * mappings. Otherwise, use stride L3_SIZE.
1910 */
1911 static __inline void
1912 pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1913 vm_offset_t stride, bool final_only)
1914 {
1915 uint64_t end, r, start;
1916
1917 PMAP_ASSERT_STAGE1(pmap);
1918
1919 dsb(ishst);
1920 if (pmap == kernel_pmap) {
1921 start = TLBI_VA(sva);
1922 end = TLBI_VA(eva);
1923 for (r = start; r < end; r += TLBI_VA(stride))
1924 pmap_s1_invalidate_kernel(r, final_only);
1925 } else {
1926 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1927 start |= TLBI_VA(sva);
1928 end |= TLBI_VA(eva);
1929 for (r = start; r < end; r += TLBI_VA(stride))
1930 pmap_s1_invalidate_user(r, final_only);
1931 }
1932 dsb(ish);
1933 isb();
1934 }
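/*
 * For illustration, invalidating a single 2MB L2_BLOCK mapping would use
 * stride L2_SIZE, e.g.:
 *
 *	pmap_s1_invalidate_strided(pmap, va, va + L2_SIZE, L2_SIZE, true);
 *
 * so that one TLBI is issued for the block rather than one per 4KB page.
 */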
1935
1936 /*
1937 * Invalidates any cached final- and optionally intermediate-level TLB entries
1938 * for the specified virtual address range in the given virtual address space.
1939 */
1940 static __inline void
1941 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1942 bool final_only)
1943 {
1944 pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only);
1945 }
1946
1947 static __inline void
1948 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1949 bool final_only)
1950 {
1951 PMAP_ASSERT_STAGE2(pmap);
1952 MPASS(pmap_stage2_invalidate_range != NULL);
1953 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1954 }
1955
1956 static __inline void
1957 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1958 bool final_only)
1959 {
1960 if (pmap->pm_stage == PM_STAGE1)
1961 pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1962 else
1963 pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1964 }
1965
1966 /*
1967 * Invalidates all cached intermediate- and final-level TLB entries for the
1968 * given virtual address space.
1969 */
1970 static __inline void
1971 pmap_s1_invalidate_all(pmap_t pmap)
1972 {
1973 uint64_t r;
1974
1975 PMAP_ASSERT_STAGE1(pmap);
1976
1977 dsb(ishst);
1978 if (pmap == kernel_pmap) {
1979 __asm __volatile("tlbi vmalle1is");
1980 } else {
1981 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1982 __asm __volatile("tlbi aside1is, %0" : : "r" (r));
1983 }
1984 dsb(ish);
1985 isb();
1986 }
1987
1988 static __inline void
1989 pmap_s2_invalidate_all(pmap_t pmap)
1990 {
1991 PMAP_ASSERT_STAGE2(pmap);
1992 MPASS(pmap_stage2_invalidate_all != NULL);
1993 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1994 }
1995
1996 static __inline void
1997 pmap_invalidate_all(pmap_t pmap)
1998 {
1999 if (pmap->pm_stage == PM_STAGE1)
2000 pmap_s1_invalidate_all(pmap);
2001 else
2002 pmap_s2_invalidate_all(pmap);
2003 }
2004
2005 /*
2006 * Routine: pmap_extract
2007 * Function:
2008 * Extract the physical page address associated
2009 * with the given map/virtual_address pair.
2010 */
2011 vm_paddr_t
2012 pmap_extract(pmap_t pmap, vm_offset_t va)
2013 {
2014 pt_entry_t *pte, tpte;
2015 vm_paddr_t pa;
2016 int lvl;
2017
2018 pa = 0;
2019 PMAP_LOCK(pmap);
2020 /*
2021 * Find the block or page map for this virtual address. pmap_pte
2022 * will return either a valid block/page entry, or NULL.
2023 */
2024 pte = pmap_pte(pmap, va, &lvl);
2025 if (pte != NULL) {
2026 tpte = pmap_load(pte);
2027 pa = PTE_TO_PHYS(tpte);
2028 switch (lvl) {
2029 case 1:
2030 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
2031 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
2032 ("pmap_extract: Invalid L1 pte found: %lx",
2033 tpte & ATTR_DESCR_MASK));
2034 pa |= (va & L1_OFFSET);
2035 break;
2036 case 2:
2037 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
2038 ("pmap_extract: Invalid L2 pte found: %lx",
2039 tpte & ATTR_DESCR_MASK));
2040 pa |= (va & L2_OFFSET);
2041 break;
2042 case 3:
2043 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
2044 ("pmap_extract: Invalid L3 pte found: %lx",
2045 tpte & ATTR_DESCR_MASK));
2046 pa |= (va & L3_OFFSET);
2047 break;
2048 }
2049 }
2050 PMAP_UNLOCK(pmap);
2051 return (pa);
2052 }
2053
2054 /*
2055 * Routine: pmap_extract_and_hold
2056 * Function:
2057 * Atomically extract and hold the physical page
2058 * with the given pmap and virtual address pair
2059 * if that mapping permits the given protection.
2060 */
2061 vm_page_t
2062 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
2063 {
2064 pt_entry_t *pte, tpte;
2065 vm_offset_t off;
2066 vm_page_t m;
2067 int lvl;
2068 bool use;
2069
2070 m = NULL;
2071 PMAP_LOCK(pmap);
2072 pte = pmap_pte(pmap, va, &lvl);
2073 if (pte != NULL) {
2074 tpte = pmap_load(pte);
2075
2076 KASSERT(lvl > 0 && lvl <= 3,
2077 ("pmap_extract_and_hold: Invalid level %d", lvl));
2078 /*
2079 * Check that the pte is either an L3 page, or an L1 or L2 block
2080 * entry. We can assume L1_BLOCK == L2_BLOCK.
2081 */
2082 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
2083 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
2084 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
2085 tpte & ATTR_DESCR_MASK));
2086
2087 use = false;
2088 if ((prot & VM_PROT_WRITE) == 0)
2089 use = true;
2090 else if (pmap->pm_stage == PM_STAGE1 &&
2091 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
2092 use = true;
2093 else if (pmap->pm_stage == PM_STAGE2 &&
2094 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
2095 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
2096 use = true;
2097
2098 if (use) {
2099 switch (lvl) {
2100 case 1:
2101 off = va & L1_OFFSET;
2102 break;
2103 case 2:
2104 off = va & L2_OFFSET;
2105 break;
2106 case 3:
2107 default:
2108 off = 0;
2109 }
2110 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
2111 if (m != NULL && !vm_page_wire_mapped(m))
2112 m = NULL;
2113 }
2114 }
2115 PMAP_UNLOCK(pmap);
2116 return (m);
2117 }
2118
2119 /*
2120 * Returns true if the entire kernel virtual address range is mapped
2121 */
2122 static bool
2123 pmap_kmapped_range(vm_offset_t sva, vm_size_t size)
2124 {
2125 pt_entry_t *pte, tpte;
2126 vm_offset_t eva;
2127
2128 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS,
2129 ("%s: Invalid virtual address: %lx", __func__, sva));
2130 MPASS(size != 0);
2131 eva = sva + size - 1;
2132 KASSERT(eva > sva, ("%s: Size too large: sva %lx, size %lx", __func__,
2133 sva, size));
2134
2135 while (sva <= eva) {
2136 pte = pmap_l1(kernel_pmap, sva);
2137 if (pte == NULL)
2138 return (false);
2139 tpte = pmap_load(pte);
2140 if (tpte == 0)
2141 return (false);
2142 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2143 sva = (sva & ~L1_OFFSET) + L1_SIZE;
2144 continue;
2145 }
2146
2147 pte = pmap_l1_to_l2(&tpte, sva);
2148 tpte = pmap_load(pte);
2149 if (tpte == 0)
2150 return (false);
2151 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2152 sva = (sva & ~L2_OFFSET) + L2_SIZE;
2153 continue;
2154 }
2155 pte = pmap_l2_to_l3(&tpte, sva);
2156 tpte = pmap_load(pte);
2157 if (tpte == 0)
2158 return (false);
2159 MPASS((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_PAGE);
2160 if ((tpte & ATTR_CONTIGUOUS) == ATTR_CONTIGUOUS)
2161 sva = (sva & ~L3C_OFFSET) + L3C_SIZE;
2162 else
2163 sva = (sva & ~L3_OFFSET) + L3_SIZE;
2164 }
2165
2166 return (true);
2167 }
2168
2169 /*
2170 * Walks the page tables to translate a kernel virtual address to a
2171 * physical address. Returns true if the kva is valid and stores the
2172 * physical address in pa if it is not NULL.
2173 *
2174 * See the comment above data_abort() for the rationale for specifying
2175 * NO_PERTHREAD_SSP here.
2176 */
2177 bool NO_PERTHREAD_SSP
2178 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
2179 {
2180 pt_entry_t *pte, tpte;
2181 register_t intr;
2182 uint64_t par;
2183
2184 /*
2185 * Disable interrupts so we don't get interrupted between asking
2186 * for address translation, and getting the result back.
2187 */
2188 intr = intr_disable();
2189 par = arm64_address_translate_s1e1r(va);
2190 intr_restore(intr);
2191
2192 if (PAR_SUCCESS(par)) {
2193 if (pa != NULL)
2194 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2195 return (true);
2196 }
2197
2198 /*
2199 * Fall back to walking the page table. The address translation
2200 * instruction may fail when the page is in a break-before-make
2201 * sequence. As we only clear the valid bit in said sequence we
2202 * can walk the page table to find the physical address.
2203 */
2204
2205 pte = pmap_l1(kernel_pmap, va);
2206 if (pte == NULL)
2207 return (false);
2208
2209 /*
2210 * A concurrent pmap_update_entry() will clear the entry's valid bit
2211 * but leave the rest of the entry unchanged. Therefore, we treat a
2212 * non-zero entry as being valid, and we ignore the valid bit when
2213 * determining whether the entry maps a block, page, or table.
2214 */
2215 tpte = pmap_load(pte);
2216 if (tpte == 0)
2217 return (false);
2218 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2219 if (pa != NULL)
2220 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2221 return (true);
2222 }
2223 pte = pmap_l1_to_l2(&tpte, va);
2224 tpte = pmap_load(pte);
2225 if (tpte == 0)
2226 return (false);
2227 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2228 if (pa != NULL)
2229 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2230 return (true);
2231 }
2232 pte = pmap_l2_to_l3(&tpte, va);
2233 tpte = pmap_load(pte);
2234 if (tpte == 0)
2235 return (false);
2236 if (pa != NULL)
2237 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2238 return (true);
2239 }
2240
2241 /*
2242 * Routine: pmap_kextract
2243 * Function:
2244 * Extract the physical page address associated with the given kernel
2245 * virtual address.
2246 */
2247 vm_paddr_t
2248 pmap_kextract(vm_offset_t va)
2249 {
2250 vm_paddr_t pa;
2251
2252 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2253 return (DMAP_TO_PHYS(va));
2254
2255 if (pmap_klookup(va, &pa) == false)
2256 return (0);
2257 return (pa);
2258 }
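/*
 * For illustration: addresses within the direct map are resolved with
 * simple arithmetic via DMAP_TO_PHYS(), while other kernel addresses,
 * e.g. those returned by kva_alloc(), go through pmap_klookup() and
 * yield 0 when nothing is currently mapped at the given address.
 */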
2259
2260 /***************************************************
2261 * Low level mapping routines.....
2262 ***************************************************/
2263
2264 void
2265 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
2266 {
2267 pd_entry_t *pde;
2268 pt_entry_t attr, old_l3e, *pte;
2269 vm_offset_t va;
2270 vm_page_t mpte;
2271 int error, lvl;
2272
2273 KASSERT((pa & L3_OFFSET) == 0,
2274 ("pmap_kenter: Invalid physical address"));
2275 KASSERT((sva & L3_OFFSET) == 0,
2276 ("pmap_kenter: Invalid virtual address"));
2277 KASSERT((size & PAGE_MASK) == 0,
2278 ("pmap_kenter: Mapping is not page-sized"));
2279
2280 attr = ATTR_AF | pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) |
2281 ATTR_S1_XN | ATTR_KERN_GP | ATTR_S1_IDX(mode);
2282 old_l3e = 0;
2283 va = sva;
2284 while (size != 0) {
2285 pde = pmap_pde(kernel_pmap, va, &lvl);
2286 KASSERT(pde != NULL,
2287 ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
2288 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2289
2290 /*
2291 * If we have an aligned, contiguous chunk of L2_SIZE, try
2292 * to create an L2_BLOCK mapping.
2293 */
2294 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
2295 (pa & L2_OFFSET) == 0 && vm_initialized) {
2296 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
2297 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
2298 ("pmap_kenter: Unexpected mapping"));
2299 PMAP_LOCK(kernel_pmap);
2300 error = pmap_insert_pt_page(kernel_pmap, mpte, false,
2301 false);
2302 if (error == 0) {
2303 attr &= ~ATTR_CONTIGUOUS;
2304
2305 /*
2306 * Although the page table page "mpte" should
2307 * be devoid of mappings, the TLB might hold
2308 * intermediate entries that reference it, so
2309 * we perform a single-page invalidation.
2310 */
2311 pmap_update_entry(kernel_pmap, pde,
2312 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
2313 PAGE_SIZE);
2314 }
2315 PMAP_UNLOCK(kernel_pmap);
2316 if (error == 0) {
2317 va += L2_SIZE;
2318 pa += L2_SIZE;
2319 size -= L2_SIZE;
2320 continue;
2321 }
2322 }
2323
2324 /*
2325 * If we have an aligned, contiguous chunk of L3C_ENTRIES
2326 * L3 pages, set the contiguous bit within each PTE so that
2327 * the chunk can be cached using only one TLB entry.
2328 */
2329 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
2330 if (size >= L3C_SIZE)
2331 attr |= ATTR_CONTIGUOUS;
2332 else
2333 attr &= ~ATTR_CONTIGUOUS;
2334 }
2335
2336 pte = pmap_l2_to_l3(pde, va);
2337 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
2338 L3_PAGE);
2339
2340 va += PAGE_SIZE;
2341 pa += PAGE_SIZE;
2342 size -= PAGE_SIZE;
2343 }
2344 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2345 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2346 else {
2347 /*
2348 * Because the old entries were invalid and the new mappings
2349 * are not executable, an isb is not required.
2350 */
2351 dsb(ishst);
2352 }
2353 }
2354
2355 void
2356 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2357 {
2358
2359 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2360 }
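/*
 * Illustrative use (the caller here is hypothetical): a driver mapping a
 * page-sized device register window might do
 *
 *	va = kva_alloc(PAGE_SIZE);
 *	pmap_kenter_device(va, PAGE_SIZE, pa);
 *
 * and later undo the mapping with pmap_kremove_device(va, PAGE_SIZE)
 * followed by kva_free(va, PAGE_SIZE).
 */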
2361
2362 /*
2363 * Remove a page from the kernel pagetables.
2364 */
2365 void
2366 pmap_kremove(vm_offset_t va)
2367 {
2368 pt_entry_t *pte;
2369
2370 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2371 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2372 ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2373 pmap_clear(pte);
2374 pmap_s1_invalidate_page(kernel_pmap, va, true);
2375 }
2376
2377 /*
2378 * Remove the specified range of mappings from the kernel address space.
2379 *
2380 * Should only be applied to mappings that were created by pmap_kenter() or
2381 * pmap_kenter_device(). Nothing about this function is actually specific
2382 * to device mappings.
2383 */
2384 void
2385 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2386 {
2387 pt_entry_t *ptep, *ptep_end;
2388 vm_offset_t va;
2389 int lvl;
2390
2391 KASSERT((sva & L3_OFFSET) == 0,
2392 ("pmap_kremove_device: Invalid virtual address"));
2393 KASSERT((size & PAGE_MASK) == 0,
2394 ("pmap_kremove_device: Mapping is not page-sized"));
2395
2396 va = sva;
2397 while (size != 0) {
2398 ptep = pmap_pte(kernel_pmap, va, &lvl);
2399 KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2400 switch (lvl) {
2401 case 2:
2402 KASSERT((va & L2_OFFSET) == 0,
2403 ("Unaligned virtual address"));
2404 KASSERT(size >= L2_SIZE, ("Insufficient size"));
2405
2406 if (va != sva) {
2407 pmap_s1_invalidate_range(kernel_pmap, sva, va,
2408 true);
2409 }
2410 pmap_clear(ptep);
2411 pmap_s1_invalidate_page(kernel_pmap, va, true);
2412 PMAP_LOCK(kernel_pmap);
2413 pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2414 PMAP_UNLOCK(kernel_pmap);
2415
2416 va += L2_SIZE;
2417 sva = va;
2418 size -= L2_SIZE;
2419 break;
2420 case 3:
2421 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2422 KASSERT((va & L3C_OFFSET) == 0,
2423 ("Unaligned L3C virtual address"));
2424 KASSERT(size >= L3C_SIZE,
2425 ("Insufficient L3C size"));
2426
2427 ptep_end = ptep + L3C_ENTRIES;
2428 for (; ptep < ptep_end; ptep++)
2429 pmap_clear(ptep);
2430
2431 va += L3C_SIZE;
2432 size -= L3C_SIZE;
2433 break;
2434 }
2435 pmap_clear(ptep);
2436
2437 va += PAGE_SIZE;
2438 size -= PAGE_SIZE;
2439 break;
2440 default:
2441 __assert_unreachable();
2442 break;
2443 }
2444 }
2445 if (va != sva)
2446 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2447 }
2448
2449 /*
2450 * Used to map a range of physical addresses into kernel
2451 * virtual address space.
2452 *
2453 * The value passed in '*virt' is a suggested virtual address for
2454 * the mapping. Architectures which can support a direct-mapped
2455 * physical to virtual region can return the appropriate address
2456 * within that region, leaving '*virt' unchanged. Other
2457 * architectures should map the pages starting at '*virt' and
2458 * update '*virt' with the first usable address after the mapped
2459 * region.
2460 */
2461 vm_offset_t
2462 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2463 {
2464 return PHYS_TO_DMAP(start);
2465 }
2466
2467 /*
2468 * Add a list of wired pages to the kva.  This routine is only used
2469 * for temporary kernel mappings that do not need to have page
2470 * modification or references recorded.  Note that old mappings are
2471 * simply written over.  The page *must* be wired.
2472 * Note: SMP coherent.  On arm64 the TLB invalidation is performed
2473 * with broadcast "tlbi" instructions rather than a ranged shootdown
2474 * IPI.
2475 */
2476 void
2477 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2478 {
2479 pd_entry_t *pde;
2480 pt_entry_t attr, old_l3e, *pte;
2481 vm_offset_t va;
2482 vm_page_t m;
2483 int i, lvl;
2484
2485 old_l3e = 0;
2486 va = sva;
2487 for (i = 0; i < count; i++) {
2488 pde = pmap_pde(kernel_pmap, va, &lvl);
2489 KASSERT(pde != NULL,
2490 ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2491 KASSERT(lvl == 2,
2492 ("pmap_qenter: Invalid level %d", lvl));
2493
2494 m = ma[i];
2495 attr = ATTR_AF | pmap_sh_attr |
2496 ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2497 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2498 pte = pmap_l2_to_l3(pde, va);
2499 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2500
2501 va += L3_SIZE;
2502 }
2503 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2504 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2505 else {
2506 /*
2507 * Because the old entries were invalid and the new mappings
2508 * are not executable, an isb is not required.
2509 */
2510 dsb(ishst);
2511 }
2512 }
2513
2514 /*
2515 * This routine tears out page mappings from the
2516 * kernel -- it is meant only for temporary mappings.
2517 */
2518 void
2519 pmap_qremove(vm_offset_t sva, int count)
2520 {
2521 pt_entry_t *pte;
2522 vm_offset_t va;
2523
2524 KASSERT(ADDR_IS_CANONICAL(sva),
2525 ("%s: Address not in canonical form: %lx", __func__, sva));
2526 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2527
2528 va = sva;
2529 while (count-- > 0) {
2530 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2531 if (pte != NULL) {
2532 pmap_clear(pte);
2533 }
2534
2535 va += PAGE_SIZE;
2536 }
2537 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2538 }
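/*
 * pmap_qenter() and pmap_qremove() are used as a pair for short-lived
 * kernel mappings; pmap_init_pv_table() above, for example, uses
 * pmap_qenter() to map each freshly allocated PV table page.
 */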
2539
2540 /***************************************************
2541 * Page table page management routines.....
2542 ***************************************************/
2543 /*
2544 * Schedule the specified unused page table page to be freed. Specifically,
2545 * add the page to the specified list of pages that will be released to the
2546 * physical memory manager after the TLB has been updated.
2547 */
2548 static __inline void
2549 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2550 {
2551
2552 if (set_PG_ZERO)
2553 m->flags |= PG_ZERO;
2554 else
2555 m->flags &= ~PG_ZERO;
2556 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2557 }
2558
2559 /*
2560 * Decrements a page table page's reference count, which is used to record the
2561 * number of valid page table entries within the page. If the reference count
2562 * drops to zero, then the page table page is unmapped. Returns true if the
2563 * page table page was unmapped and false otherwise.
2564 */
2565 static inline bool
2566 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2567 {
2568
2569 --m->ref_count;
2570 if (m->ref_count == 0) {
2571 _pmap_unwire_l3(pmap, va, m, free);
2572 return (true);
2573 } else
2574 return (false);
2575 }
2576
2577 static void
2578 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2579 {
2580
2581 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2582 /*
2583 * unmap the page table page
2584 */
2585 if (m->pindex >= (NUL2E + NUL1E)) {
2586 /* l1 page */
2587 pd_entry_t *l0;
2588
2589 l0 = pmap_l0(pmap, va);
2590 pmap_clear(l0);
2591 } else if (m->pindex >= NUL2E) {
2592 /* l2 page */
2593 pd_entry_t *l1;
2594
2595 l1 = pmap_l1(pmap, va);
2596 pmap_clear(l1);
2597 } else {
2598 /* l3 page */
2599 pd_entry_t *l2;
2600
2601 l2 = pmap_l2(pmap, va);
2602 pmap_clear(l2);
2603 }
2604 pmap_resident_count_dec(pmap, 1);
2605 if (m->pindex < NUL2E) {
2606 /* We just released an l3, unhold the matching l2 */
2607 pd_entry_t *l1, tl1;
2608 vm_page_t l2pg;
2609
2610 l1 = pmap_l1(pmap, va);
2611 tl1 = pmap_load(l1);
2612 l2pg = PTE_TO_VM_PAGE(tl1);
2613 pmap_unwire_l3(pmap, va, l2pg, free);
2614 } else if (m->pindex < (NUL2E + NUL1E)) {
2615 /* We just released an l2, unhold the matching l1 */
2616 pd_entry_t *l0, tl0;
2617 vm_page_t l1pg;
2618
2619 l0 = pmap_l0(pmap, va);
2620 tl0 = pmap_load(l0);
2621 l1pg = PTE_TO_VM_PAGE(tl0);
2622 pmap_unwire_l3(pmap, va, l1pg, free);
2623 }
2624 pmap_invalidate_page(pmap, va, false);
2625
2626 /*
2627 * Put page on a list so that it is released after
2628 * *ALL* TLB shootdown is done
2629 */
2630 pmap_add_delayed_free_list(m, free, true);
2631 }
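/*
 * The pindex comparisons above rely on how page table pages are numbered:
 * indices in [0, NUL2E) are L3 page table pages, indices in
 * [NUL2E, NUL2E + NUL1E) are L2 pages, and indices at or above
 * NUL2E + NUL1E are L1 pages referenced from the L0 table.
 * _pmap_alloc_l3() below uses the same encoding.
 */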
2632
2633 /*
2634 * After removing a page table entry, this routine is used to
2635 * conditionally free the page, and manage the reference count.
2636 */
2637 static int
2638 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2639 struct spglist *free)
2640 {
2641 vm_page_t mpte;
2642
2643 KASSERT(ADDR_IS_CANONICAL(va),
2644 ("%s: Address not in canonical form: %lx", __func__, va));
2645 if (ADDR_IS_KERNEL(va))
2646 return (0);
2647 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2648 mpte = PTE_TO_VM_PAGE(ptepde);
2649 return (pmap_unwire_l3(pmap, va, mpte, free));
2650 }
2651
2652 /*
2653 * Release a page table page reference after a failed attempt to create a
2654 * mapping.
2655 */
2656 static void
2657 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2658 {
2659 struct spglist free;
2660
2661 SLIST_INIT(&free);
2662 if (pmap_unwire_l3(pmap, va, mpte, &free))
2663 vm_page_free_pages_toq(&free, true);
2664 }
2665
2666 void
2667 pmap_pinit0(pmap_t pmap)
2668 {
2669
2670 PMAP_LOCK_INIT(pmap);
2671 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2672 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2673 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2674 TAILQ_INIT(&pmap->pm_pvchunk);
2675 vm_radix_init(&pmap->pm_root);
2676 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2677 pmap->pm_stage = PM_STAGE1;
2678 pmap->pm_levels = 4;
2679 pmap->pm_ttbr = pmap->pm_l0_paddr;
2680 pmap->pm_asid_set = &asids;
2681 pmap->pm_bti = NULL;
2682
2683 PCPU_SET(curpmap, pmap);
2684 }
2685
2686 int
2687 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2688 {
2689 vm_page_t m;
2690
2691 /*
2692 * allocate the l0 page
2693 */
2694 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2695 VM_ALLOC_ZERO);
2696 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2697 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2698
2699 TAILQ_INIT(&pmap->pm_pvchunk);
2700 vm_radix_init(&pmap->pm_root);
2701 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2702 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2703
2704 MPASS(levels == 3 || levels == 4);
2705 pmap->pm_levels = levels;
2706 pmap->pm_stage = stage;
2707 pmap->pm_bti = NULL;
2708 switch (stage) {
2709 case PM_STAGE1:
2710 pmap->pm_asid_set = &asids;
2711 if (pmap_bti_support) {
2712 pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
2713 M_ZERO | M_WAITOK);
2714 rangeset_init(pmap->pm_bti, bti_dup_range,
2715 bti_free_range, pmap, M_NOWAIT);
2716 }
2717 break;
2718 case PM_STAGE2:
2719 pmap->pm_asid_set = &vmids;
2720 break;
2721 default:
2722 panic("%s: Invalid pmap type %d", __func__, stage);
2723 break;
2724 }
2725
2726 /* XXX Temporarily disable deferred ASID allocation. */
2727 pmap_alloc_asid(pmap);
2728
2729 /*
2730 * Allocate the level 1 entry to use as the root. This will increase
2731 * the refcount on the level 1 page so it won't be removed until
2732 * pmap_release() is called.
2733 */
2734 if (pmap->pm_levels == 3) {
2735 PMAP_LOCK(pmap);
2736 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2737 PMAP_UNLOCK(pmap);
2738 }
2739 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2740
2741 return (1);
2742 }
2743
2744 int
2745 pmap_pinit(pmap_t pmap)
2746 {
2747
2748 return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2749 }
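/*
 * A stage 2 (guest) pmap is instead created by calling pmap_pinit_stage()
 * with PM_STAGE2, which selects the VMID set that pmap_init() prepares
 * when has_hyp() is true.
 */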
2750
2751 /*
2752 * This routine is called if the desired page table page does not exist.
2753 *
2754 * If page table page allocation fails, this routine may sleep before
2755 * returning NULL. It sleeps only if a lock pointer was given.
2756 *
2757 * Note: If a page allocation fails at page table level two or three,
2758 * one or two pages may be held during the wait, only to be released
2759 * afterwards. This conservative approach is easily argued to avoid
2760 * race conditions.
2761 */
2762 static vm_page_t
2763 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2764 {
2765 vm_page_t m, l1pg, l2pg;
2766
2767 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2768
2769 /*
2770 * Allocate a page table page.
2771 */
2772 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2773 if (lockp != NULL) {
2774 RELEASE_PV_LIST_LOCK(lockp);
2775 PMAP_UNLOCK(pmap);
2776 vm_wait(NULL);
2777 PMAP_LOCK(pmap);
2778 }
2779
2780 /*
2781 * Indicate the need to retry. While waiting, the page table
2782 * page may have been allocated.
2783 */
2784 return (NULL);
2785 }
2786 m->pindex = ptepindex;
2787
2788 /*
2789 * Because of AArch64's weak memory consistency model, we must have a
2790 * barrier here to ensure that the stores for zeroing "m", whether by
2791 * pmap_zero_page() or an earlier function, are visible before adding
2792 * "m" to the page table. Otherwise, a page table walk by another
2793 * processor's MMU could see the mapping to "m" and a stale, non-zero
2794 * PTE within "m".
2795 */
2796 dmb(ishst);
2797
2798 /*
2799 * Map the pagetable page into the process address space, if
2800 * it isn't already there.
2801 */
2802
2803 if (ptepindex >= (NUL2E + NUL1E)) {
2804 pd_entry_t *l0p, l0e;
2805 vm_pindex_t l0index;
2806
2807 l0index = ptepindex - (NUL2E + NUL1E);
2808 l0p = &pmap->pm_l0[l0index];
2809 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2810 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2811 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2812
2813 /*
2814 * Mark all kernel memory as not accessible from userspace
2815 * and userspace memory as not executable from the kernel.
2816 * This has been done for the bootstrap L0 entries in
2817 * locore.S.
2818 */
2819 if (pmap == kernel_pmap)
2820 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2821 else
2822 l0e |= TATTR_PXN_TABLE;
2823 pmap_store(l0p, l0e);
2824 } else if (ptepindex >= NUL2E) {
2825 vm_pindex_t l0index, l1index;
2826 pd_entry_t *l0, *l1;
2827 pd_entry_t tl0;
2828
2829 l1index = ptepindex - NUL2E;
2830 l0index = l1index >> Ln_ENTRIES_SHIFT;
2831
2832 l0 = &pmap->pm_l0[l0index];
2833 tl0 = pmap_load(l0);
2834 if (tl0 == 0) {
2835 /* recurse for allocating page dir */
2836 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2837 lockp) == NULL) {
2838 vm_page_unwire_noq(m);
2839 vm_page_free_zero(m);
2840 return (NULL);
2841 }
2842 } else {
2843 l1pg = PTE_TO_VM_PAGE(tl0);
2844 l1pg->ref_count++;
2845 }
2846
2847 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2848 l1 = &l1[ptepindex & Ln_ADDR_MASK];
2849 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2850 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2851 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2852 } else {
2853 vm_pindex_t l0index, l1index;
2854 pd_entry_t *l0, *l1, *l2;
2855 pd_entry_t tl0, tl1;
2856
2857 l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2858 l0index = l1index >> Ln_ENTRIES_SHIFT;
2859
2860 l0 = &pmap->pm_l0[l0index];
2861 tl0 = pmap_load(l0);
2862 if (tl0 == 0) {
2863 /* recurse for allocating page dir */
2864 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2865 lockp) == NULL) {
2866 vm_page_unwire_noq(m);
2867 vm_page_free_zero(m);
2868 return (NULL);
2869 }
2870 tl0 = pmap_load(l0);
2871 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2872 l1 = &l1[l1index & Ln_ADDR_MASK];
2873 } else {
2874 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2875 l1 = &l1[l1index & Ln_ADDR_MASK];
2876 tl1 = pmap_load(l1);
2877 if (tl1 == 0) {
2878 /* recurse for allocating page dir */
2879 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2880 lockp) == NULL) {
2881 vm_page_unwire_noq(m);
2882 vm_page_free_zero(m);
2883 return (NULL);
2884 }
2885 } else {
2886 l2pg = PTE_TO_VM_PAGE(tl1);
2887 l2pg->ref_count++;
2888 }
2889 }
2890
2891 l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2892 l2 = &l2[ptepindex & Ln_ADDR_MASK];
2893 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2894 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2895 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
2896 }
2897
2898 pmap_resident_count_inc(pmap, 1);
2899
2900 return (m);
2901 }
2902
2903 static pd_entry_t *
2904 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2905 struct rwlock **lockp)
2906 {
2907 pd_entry_t *l1, *l2;
2908 vm_page_t l2pg;
2909 vm_pindex_t l2pindex;
2910
2911 KASSERT(ADDR_IS_CANONICAL(va),
2912 ("%s: Address not in canonical form: %lx", __func__, va));
2913
2914 retry:
2915 l1 = pmap_l1(pmap, va);
2916 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2917 l2 = pmap_l1_to_l2(l1, va);
2918 if (!ADDR_IS_KERNEL(va)) {
2919 /* Add a reference to the L2 page. */
2920 l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
2921 l2pg->ref_count++;
2922 } else
2923 l2pg = NULL;
2924 } else if (!ADDR_IS_KERNEL(va)) {
2925 /* Allocate a L2 page. */
2926 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2927 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2928 if (l2pg == NULL) {
2929 if (lockp != NULL)
2930 goto retry;
2931 else
2932 return (NULL);
2933 }
2934 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2935 l2 = &l2[pmap_l2_index(va)];
2936 } else
2937 panic("pmap_alloc_l2: missing page table page for va %#lx",
2938 va);
2939 *l2pgp = l2pg;
2940 return (l2);
2941 }
2942
2943 static vm_page_t
2944 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2945 {
2946 vm_pindex_t ptepindex;
2947 pd_entry_t *pde, tpde;
2948 #ifdef INVARIANTS
2949 pt_entry_t *pte;
2950 #endif
2951 vm_page_t m;
2952 int lvl;
2953
2954 /*
2955 * Calculate pagetable page index
2956 */
2957 ptepindex = pmap_l2_pindex(va);
2958 retry:
2959 /*
2960 * Get the page directory entry
2961 */
2962 pde = pmap_pde(pmap, va, &lvl);
2963
2964 /*
2965 * If the page table page is mapped, we just increment the hold count,
2966 * and activate it. If we get a level 2 pde it will point to a level 3
2967 * table.
2968 */
2969 switch (lvl) {
2970 case -1:
2971 break;
2972 case 0:
2973 #ifdef INVARIANTS
2974 pte = pmap_l0_to_l1(pde, va);
2975 KASSERT(pmap_load(pte) == 0,
2976 ("pmap_alloc_l3: TODO: l0 superpages"));
2977 #endif
2978 break;
2979 case 1:
2980 #ifdef INVARIANTS
2981 pte = pmap_l1_to_l2(pde, va);
2982 KASSERT(pmap_load(pte) == 0,
2983 ("pmap_alloc_l3: TODO: l1 superpages"));
2984 #endif
2985 break;
2986 case 2:
2987 tpde = pmap_load(pde);
2988 if (tpde != 0) {
2989 m = PTE_TO_VM_PAGE(tpde);
2990 m->ref_count++;
2991 return (m);
2992 }
2993 break;
2994 default:
2995 panic("pmap_alloc_l3: Invalid level %d", lvl);
2996 }
2997
2998 /*
2999 * Here if the pte page isn't mapped, or if it has been deallocated.
3000 */
3001 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
3002 if (m == NULL && lockp != NULL)
3003 goto retry;
3004
3005 return (m);
3006 }
3007
3008 /***************************************************
3009 * Pmap allocation/deallocation routines.
3010 ***************************************************/
3011
3012 /*
3013 * Release any resources held by the given physical map.
3014 * Called when a pmap initialized by pmap_pinit is being released.
3015 * Should only be called if the map contains no valid mappings.
3016 */
3017 void
3018 pmap_release(pmap_t pmap)
3019 {
3020 bool rv __diagused;
3021 struct spglist freelist;
3022 struct asid_set *set;
3023 vm_page_t m;
3024 int asid;
3025
3026 if (pmap->pm_levels != 4) {
3027 PMAP_ASSERT_STAGE2(pmap);
3028 KASSERT(pmap->pm_stats.resident_count == 1,
3029 ("pmap_release: pmap resident count %ld != 0",
3030 pmap->pm_stats.resident_count));
3031 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
3032 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
3033
3034 SLIST_INIT(&freelist);
3035 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
3036 PMAP_LOCK(pmap);
3037 rv = pmap_unwire_l3(pmap, 0, m, &freelist);
3038 PMAP_UNLOCK(pmap);
3039 MPASS(rv == true);
3040 vm_page_free_pages_toq(&freelist, true);
3041 }
3042
3043 KASSERT(pmap->pm_stats.resident_count == 0,
3044 ("pmap_release: pmap resident count %ld != 0",
3045 pmap->pm_stats.resident_count));
3046 KASSERT(vm_radix_is_empty(&pmap->pm_root),
3047 ("pmap_release: pmap has reserved page table page(s)"));
3048
3049 set = pmap->pm_asid_set;
3050 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
3051
3052 /*
3053 * Allow the ASID to be reused. For stage 2 VMIDs we don't invalidate
3054 * the TLB entries when removing them and instead rely on a later TLB
3055 * invalidation that happens when the VMID generation is updated.
3056 * Because of this we don't reuse VMIDs within a generation.
3057 */
3058 if (pmap->pm_stage == PM_STAGE1) {
3059 mtx_lock_spin(&set->asid_set_mutex);
3060 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
3061 asid = COOKIE_TO_ASID(pmap->pm_cookie);
3062 KASSERT(asid >= ASID_FIRST_AVAILABLE &&
3063 asid < set->asid_set_size,
3064 ("pmap_release: pmap cookie has out-of-range asid"));
3065 bit_clear(set->asid_set, asid);
3066 }
3067 mtx_unlock_spin(&set->asid_set_mutex);
3068
3069 if (pmap->pm_bti != NULL) {
3070 rangeset_fini(pmap->pm_bti);
3071 free(pmap->pm_bti, M_DEVBUF);
3072 }
3073 }
3074
3075 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
3076 vm_page_unwire_noq(m);
3077 vm_page_free_zero(m);
3078 }
3079
3080 static int
3081 kvm_size(SYSCTL_HANDLER_ARGS)
3082 {
3083 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
3084
3085 return sysctl_handle_long(oidp, &ksize, 0, req);
3086 }
3087 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3088 0, 0, kvm_size, "LU",
3089 "Size of KVM");
3090
3091 static int
3092 kvm_free(SYSCTL_HANDLER_ARGS)
3093 {
3094 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
3095
3096 return sysctl_handle_long(oidp, &kfree, 0, req);
3097 }
3098 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3099 0, 0, kvm_free, "LU",
3100 "Amount of KVM free");
3101
3102 /*
3103 * grow the number of kernel page table entries, if needed
3104 */
3105 static int
3106 pmap_growkernel_nopanic(vm_offset_t addr)
3107 {
3108 vm_page_t nkpg;
3109 pd_entry_t *l0, *l1, *l2;
3110
3111 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
3112
3113 addr = roundup2(addr, L2_SIZE);
3114 if (addr - 1 >= vm_map_max(kernel_map))
3115 addr = vm_map_max(kernel_map);
3116 if (kernel_vm_end < addr) {
3117 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
3118 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
3119 }
3120 while (kernel_vm_end < addr) {
3121 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
3122 KASSERT(pmap_load(l0) != 0,
3123 ("pmap_growkernel: No level 0 kernel entry"));
3124
3125 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
3126 if (pmap_load(l1) == 0) {
3127 /* We need a new L1 table entry */
3128 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3129 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3130 if (nkpg == NULL)
3131 return (KERN_RESOURCE_SHORTAGE);
3132 nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
3133 /* See the dmb() in _pmap_alloc_l3(). */
3134 dmb(ishst);
3135 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
3136 continue; /* try again */
3137 }
3138 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
3139 if (pmap_load(l2) != 0) {
3140 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3141 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3142 kernel_vm_end = vm_map_max(kernel_map);
3143 break;
3144 }
3145 continue;
3146 }
3147
3148 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3149 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3150 if (nkpg == NULL)
3151 return (KERN_RESOURCE_SHORTAGE);
3152 nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
3153 /* See the dmb() in _pmap_alloc_l3(). */
3154 dmb(ishst);
3155 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
3156
3157 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3158 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3159 kernel_vm_end = vm_map_max(kernel_map);
3160 break;
3161 }
3162 }
3163 return (KERN_SUCCESS);
3164 }
3165
3166 int
3167 pmap_growkernel(vm_offset_t addr)
3168 {
3169 int rv;
3170
3171 rv = pmap_growkernel_nopanic(addr);
3172 if (rv != KERN_SUCCESS && pmap_growkernel_panic)
3173 panic("pmap_growkernel: no memory to grow kernel");
3174 return (rv);
3175 }
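/*
 * Kernel page table growth proceeds in L2_SIZE steps: each iteration of
 * pmap_growkernel_nopanic() installs a page of L3 entries behind an empty
 * L2 entry, advancing kernel_vm_end by L2_SIZE, and first fills an empty
 * L1 entry with a page of L2 entries when one is needed.
 */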
3176
3177 /***************************************************
3178 * page management routines.
3179 ***************************************************/
3180
3181 static const uint64_t pc_freemask[_NPCM] = {
3182 [0 ... _NPCM - 2] = PC_FREEN,
3183 [_NPCM - 1] = PC_FREEL
3184 };
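/*
 * In pc_map[] a set bit marks a free pv_entry slot within the chunk:
 * allocation clears the bit and freeing sets it again.  pc_freemask is
 * the all-free pattern, with PC_FREEL covering only the slots that exist
 * in the final, partially populated word.
 */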
3185
3186 #ifdef PV_STATS
3187 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
3188
3189 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
3190 "Current number of pv entry chunks");
3191 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
3192 "Current number of pv entry chunks allocated");
3193 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
3194 "Current number of pv entry chunks frees");
3195 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
3196 "Number of times tried to get a chunk page but failed.");
3197
3198 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
3199 static int pv_entry_spare;
3200
3201 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
3202 "Current number of pv entry frees");
3203 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
3204 "Current number of pv entry allocs");
3205 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
3206 "Current number of pv entries");
3207 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3208 "Current number of spare pv entries");
3209 #endif
3210
3211 /*
3212 * We are in a serious low memory condition. Resort to
3213 * drastic measures to free some pages so we can allocate
3214 * another pv entry chunk.
3215 *
3216 * Returns NULL if PV entries were reclaimed from the specified pmap.
3217 *
3218 * We do not, however, unmap 2mpages because subsequent accesses will
3219 * allocate per-page pv entries until repromotion occurs, thereby
3220 * exacerbating the shortage of free pv entries.
3221 */
3222 static vm_page_t
3223 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3224 {
3225 struct pv_chunks_list *pvc;
3226 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3227 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3228 struct md_page *pvh;
3229 pd_entry_t *pde;
3230 pmap_t next_pmap, pmap;
3231 pt_entry_t *pte, tpte;
3232 pv_entry_t pv;
3233 vm_offset_t va;
3234 vm_page_t m, m_pc;
3235 struct spglist free;
3236 uint64_t inuse;
3237 int bit, field, freed, lvl;
3238
3239 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3240 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3241
3242 pmap = NULL;
3243 m_pc = NULL;
3244 SLIST_INIT(&free);
3245 bzero(&pc_marker_b, sizeof(pc_marker_b));
3246 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3247 pc_marker = (struct pv_chunk *)&pc_marker_b;
3248 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3249
3250 pvc = &pv_chunks[domain];
3251 mtx_lock(&pvc->pvc_lock);
3252 pvc->active_reclaims++;
3253 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3254 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3255 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3256 SLIST_EMPTY(&free)) {
3257 next_pmap = pc->pc_pmap;
3258 if (next_pmap == NULL) {
3259 /*
3260 * The next chunk is a marker. However, it is
3261 * not our marker, so active_reclaims must be
3262 * > 1. Consequently, the next_chunk code
3263 * will not rotate the pv_chunks list.
3264 */
3265 goto next_chunk;
3266 }
3267 mtx_unlock(&pvc->pvc_lock);
3268
3269 /*
3270 * A pv_chunk can only be removed from the pc_lru list
3271 * when both pvc->pvc_lock is owned and the
3272 * corresponding pmap is locked.
3273 */
3274 if (pmap != next_pmap) {
3275 if (pmap != NULL && pmap != locked_pmap)
3276 PMAP_UNLOCK(pmap);
3277 pmap = next_pmap;
3278 /* Avoid deadlock and lock recursion. */
3279 if (pmap > locked_pmap) {
3280 RELEASE_PV_LIST_LOCK(lockp);
3281 PMAP_LOCK(pmap);
3282 mtx_lock(&pvc->pvc_lock);
3283 continue;
3284 } else if (pmap != locked_pmap) {
3285 if (PMAP_TRYLOCK(pmap)) {
3286 mtx_lock(&pvc->pvc_lock);
3287 continue;
3288 } else {
3289 pmap = NULL; /* pmap is not locked */
3290 mtx_lock(&pvc->pvc_lock);
3291 pc = TAILQ_NEXT(pc_marker, pc_lru);
3292 if (pc == NULL ||
3293 pc->pc_pmap != next_pmap)
3294 continue;
3295 goto next_chunk;
3296 }
3297 }
3298 }
3299
3300 /*
3301 * Destroy every non-wired, 4 KB page mapping in the chunk.
3302 */
3303 freed = 0;
3304 for (field = 0; field < _NPCM; field++) {
3305 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3306 inuse != 0; inuse &= ~(1UL << bit)) {
3307 bit = ffsl(inuse) - 1;
3308 pv = &pc->pc_pventry[field * 64 + bit];
3309 va = pv->pv_va;
3310 pde = pmap_pde(pmap, va, &lvl);
3311 if (lvl != 2)
3312 continue;
3313 pte = pmap_l2_to_l3(pde, va);
3314 tpte = pmap_load(pte);
3315 if ((tpte & ATTR_SW_WIRED) != 0)
3316 continue;
3317 if ((tpte & ATTR_CONTIGUOUS) != 0)
3318 (void)pmap_demote_l3c(pmap, pte, va);
3319 tpte = pmap_load_clear(pte);
3320 m = PTE_TO_VM_PAGE(tpte);
3321 if (pmap_pte_dirty(pmap, tpte))
3322 vm_page_dirty(m);
3323 if ((tpte & ATTR_AF) != 0) {
3324 pmap_s1_invalidate_page(pmap, va, true);
3325 vm_page_aflag_set(m, PGA_REFERENCED);
3326 }
3327 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3328 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3329 m->md.pv_gen++;
3330 if (TAILQ_EMPTY(&m->md.pv_list) &&
3331 (m->flags & PG_FICTITIOUS) == 0) {
3332 pvh = page_to_pvh(m);
3333 if (TAILQ_EMPTY(&pvh->pv_list)) {
3334 vm_page_aflag_clear(m,
3335 PGA_WRITEABLE);
3336 }
3337 }
3338 pc->pc_map[field] |= 1UL << bit;
3339 pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3340 freed++;
3341 }
3342 }
3343 if (freed == 0) {
3344 mtx_lock(&pvc->pvc_lock);
3345 goto next_chunk;
3346 }
3347 /* Every freed mapping is for a 4 KB page. */
3348 pmap_resident_count_dec(pmap, freed);
3349 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3350 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3351 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3352 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3353 if (pc_is_free(pc)) {
3354 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3355 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3356 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3357 /* Entire chunk is free; return it. */
3358 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3359 dump_drop_page(m_pc->phys_addr);
3360 mtx_lock(&pvc->pvc_lock);
3361 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3362 break;
3363 }
3364 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3365 mtx_lock(&pvc->pvc_lock);
3366 /* One freed pv entry in locked_pmap is sufficient. */
3367 if (pmap == locked_pmap)
3368 break;
3369
3370 next_chunk:
3371 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3372 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3373 if (pvc->active_reclaims == 1 && pmap != NULL) {
3374 /*
3375 * Rotate the pv chunks list so that we do not
3376 * scan the same pv chunks that could not be
3377 * freed (because they contained a wired
3378 * and/or superpage mapping) on every
3379 * invocation of reclaim_pv_chunk().
3380 */
3381 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3382 MPASS(pc->pc_pmap != NULL);
3383 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3384 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3385 }
3386 }
3387 }
3388 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3389 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3390 pvc->active_reclaims--;
3391 mtx_unlock(&pvc->pvc_lock);
3392 if (pmap != NULL && pmap != locked_pmap)
3393 PMAP_UNLOCK(pmap);
3394 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3395 m_pc = SLIST_FIRST(&free);
3396 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3397 /* Recycle a freed page table page. */
3398 m_pc->ref_count = 1;
3399 }
3400 vm_page_free_pages_toq(&free, true);
3401 return (m_pc);
3402 }
3403
3404 static vm_page_t
3405 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3406 {
3407 vm_page_t m;
3408 int i, domain;
3409
3410 domain = PCPU_GET(domain);
3411 for (i = 0; i < vm_ndomains; i++) {
3412 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3413 if (m != NULL)
3414 break;
3415 domain = (domain + 1) % vm_ndomains;
3416 }
3417
3418 return (m);
3419 }
3420
3421 /*
3422 * free the pv_entry back to the free list
3423 */
3424 static void
3425 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3426 {
3427 struct pv_chunk *pc;
3428 int idx, field, bit;
3429
3430 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3431 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3432 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3433 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3434 pc = pv_to_chunk(pv);
3435 idx = pv - &pc->pc_pventry[0];
3436 field = idx / 64;
3437 bit = idx % 64;
3438 pc->pc_map[field] |= 1ul << bit;
3439 if (!pc_is_free(pc)) {
3440 /* 98% of the time, pc is already at the head of the list. */
3441 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3442 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3443 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3444 }
3445 return;
3446 }
3447 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3448 free_pv_chunk(pc);
3449 }
3450
3451 static void
3452 free_pv_chunk_dequeued(struct pv_chunk *pc)
3453 {
3454 vm_page_t m;
3455
3456 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3457 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3458 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3459 /* entire chunk is free, return it */
3460 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3461 dump_drop_page(m->phys_addr);
3462 vm_page_unwire_noq(m);
3463 vm_page_free(m);
3464 }
3465
3466 static void
3467 free_pv_chunk(struct pv_chunk *pc)
3468 {
3469 struct pv_chunks_list *pvc;
3470
3471 pvc = &pv_chunks[pc_to_domain(pc)];
3472 mtx_lock(&pvc->pvc_lock);
3473 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3474 mtx_unlock(&pvc->pvc_lock);
3475 free_pv_chunk_dequeued(pc);
3476 }
3477
3478 static void
3479 free_pv_chunk_batch(struct pv_chunklist *batch)
3480 {
3481 struct pv_chunks_list *pvc;
3482 struct pv_chunk *pc, *npc;
3483 int i;
3484
3485 for (i = 0; i < vm_ndomains; i++) {
3486 if (TAILQ_EMPTY(&batch[i]))
3487 continue;
3488 pvc = &pv_chunks[i];
3489 mtx_lock(&pvc->pvc_lock);
3490 TAILQ_FOREACH(pc, &batch[i], pc_list) {
3491 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3492 }
3493 mtx_unlock(&pvc->pvc_lock);
3494 }
3495
3496 for (i = 0; i < vm_ndomains; i++) {
3497 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3498 free_pv_chunk_dequeued(pc);
3499 }
3500 }
3501 }
3502
3503 /*
3504 * Returns a new PV entry, allocating a new PV chunk from the system when
3505 * needed. If this PV chunk allocation fails and a PV list lock pointer was
3506 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
3507 * returned.
3508 *
3509 * The given PV list lock may be released.
3510 */
3511 static pv_entry_t
3512 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3513 {
3514 struct pv_chunks_list *pvc;
3515 int bit, field;
3516 pv_entry_t pv;
3517 struct pv_chunk *pc;
3518 vm_page_t m;
3519
3520 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3521 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3522 retry:
3523 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3524 if (pc != NULL) {
3525 for (field = 0; field < _NPCM; field++) {
3526 if (pc->pc_map[field]) {
3527 bit = ffsl(pc->pc_map[field]) - 1;
3528 break;
3529 }
3530 }
3531 if (field < _NPCM) {
3532 pv = &pc->pc_pventry[field * 64 + bit];
3533 pc->pc_map[field] &= ~(1ul << bit);
3534 /* If this was the last item, move it to tail */
3535 if (pc_is_full(pc)) {
3536 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3537 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3538 pc_list);
3539 }
3540 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3541 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3542 return (pv);
3543 }
3544 }
3545 /* No free items, allocate another chunk */
3546 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3547 if (m == NULL) {
3548 if (lockp == NULL) {
3549 PV_STAT(pc_chunk_tryfail++);
3550 return (NULL);
3551 }
3552 m = reclaim_pv_chunk(pmap, lockp);
3553 if (m == NULL)
3554 goto retry;
3555 }
3556 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3557 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3558 dump_add_page(m->phys_addr);
3559 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3560 pc->pc_pmap = pmap;
3561 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3562 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */
3563 pvc = &pv_chunks[vm_page_domain(m)];
3564 mtx_lock(&pvc->pvc_lock);
3565 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3566 mtx_unlock(&pvc->pvc_lock);
3567 pv = &pc->pc_pventry[0];
3568 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3569 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3570 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3571 return (pv);
3572 }
3573
3574 /*
3575 * Ensure that the number of spare PV entries in the specified pmap meets or
3576 * exceeds the given count, "needed".
3577 *
3578 * The given PV list lock may be released.
3579 */
3580 static void
3581 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3582 {
3583 struct pv_chunks_list *pvc;
3584 struct pch new_tail[PMAP_MEMDOM];
3585 struct pv_chunk *pc;
3586 vm_page_t m;
3587 int avail, free, i;
3588 bool reclaimed;
3589
3590 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3591 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3592
3593 /*
3594 * Newly allocated PV chunks must be stored in a private list until
3595 * the required number of PV chunks have been allocated. Otherwise,
3596 * reclaim_pv_chunk() could recycle one of these chunks. In
3597 * contrast, these chunks must be added to the pmap upon allocation.
3598 */
3599 for (i = 0; i < PMAP_MEMDOM; i++)
3600 TAILQ_INIT(&new_tail[i]);
3601 retry:
3602 avail = 0;
3603 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3604 bit_count((bitstr_t *)pc->pc_map, 0,
3605 sizeof(pc->pc_map) * NBBY, &free);
3606 if (free == 0)
3607 break;
3608 avail += free;
3609 if (avail >= needed)
3610 break;
3611 }
3612 for (reclaimed = false; avail < needed; avail += _NPCPV) {
3613 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3614 if (m == NULL) {
3615 m = reclaim_pv_chunk(pmap, lockp);
3616 if (m == NULL)
3617 goto retry;
3618 reclaimed = true;
3619 }
3620 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3621 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3622 dump_add_page(m->phys_addr);
3623 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3624 pc->pc_pmap = pmap;
3625 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3626 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3627 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3628 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3629
3630 /*
3631 * The reclaim might have freed a chunk from the current pmap.
3632 * If that chunk contained available entries, we need to
3633 * re-count the number of available entries.
3634 */
3635 if (reclaimed)
3636 goto retry;
3637 }
3638 for (i = 0; i < vm_ndomains; i++) {
3639 if (TAILQ_EMPTY(&new_tail[i]))
3640 continue;
3641 pvc = &pv_chunks[i];
3642 mtx_lock(&pvc->pvc_lock);
3643 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3644 mtx_unlock(&pvc->pvc_lock);
3645 }
3646 }
3647
3648 /*
3649 * First find and then remove the pv entry for the specified pmap and virtual
3650 * address from the specified pv list. Returns the pv entry if found and NULL
3651 * otherwise. This operation can be performed on pv lists for either 4KB or
3652 * 2MB page mappings.
3653 */
3654 static __inline pv_entry_t
3655 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3656 {
3657 pv_entry_t pv;
3658
3659 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3660 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3661 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3662 pvh->pv_gen++;
3663 break;
3664 }
3665 }
3666 return (pv);
3667 }
3668
3669 /*
3670 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3671 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3672 * entries for each of the 4KB page mappings.
3673 */
3674 static void
3675 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3676 struct rwlock **lockp)
3677 {
3678 struct md_page *pvh;
3679 struct pv_chunk *pc;
3680 pv_entry_t pv;
3681 vm_offset_t va_last;
3682 vm_page_t m;
3683 int bit, field;
3684
3685 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3686 KASSERT((va & L2_OFFSET) == 0,
3687 ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3688 KASSERT((pa & L2_OFFSET) == 0,
3689 ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3690 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3691
3692 /*
3693 * Transfer the 2mpage's pv entry for this mapping to the first
3694 * page's pv list. Once this transfer begins, the pv list lock
3695 * must not be released until the last pv entry is reinstantiated.
3696 */
3697 pvh = pa_to_pvh(pa);
3698 pv = pmap_pvh_remove(pvh, pmap, va);
3699 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3700 m = PHYS_TO_VM_PAGE(pa);
3701 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3702 m->md.pv_gen++;
3703 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3704 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3705 va_last = va + L2_SIZE - PAGE_SIZE;
3706 for (;;) {
3707 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3708 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3709 for (field = 0; field < _NPCM; field++) {
3710 while (pc->pc_map[field]) {
3711 bit = ffsl(pc->pc_map[field]) - 1;
3712 pc->pc_map[field] &= ~(1ul << bit);
3713 pv = &pc->pc_pventry[field * 64 + bit];
3714 va += PAGE_SIZE;
3715 pv->pv_va = va;
3716 m++;
3717 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3718 ("pmap_pv_demote_l2: page %p is not managed", m));
3719 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3720 m->md.pv_gen++;
3721 if (va == va_last)
3722 goto out;
3723 }
3724 }
3725 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3726 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3727 }
3728 out:
3729 if (pc_is_full(pc)) {
3730 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3731 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3732 }
3733 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3734 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3735 }
3736
3737 /*
3738 * First find and then destroy the pv entry for the specified pmap and virtual
3739 * address. This operation can be performed on pv lists for either 4KB or 2MB
3740 * page mappings.
3741 */
3742 static void
3743 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3744 {
3745 pv_entry_t pv;
3746
3747 pv = pmap_pvh_remove(pvh, pmap, va);
3748 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3749 free_pv_entry(pmap, pv);
3750 }
3751
3752 /*
3753 * Conditionally create the PV entry for a 4KB page mapping if the required
3754 * memory can be allocated without resorting to reclamation.
3755 */
3756 static bool
3757 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3758 struct rwlock **lockp)
3759 {
3760 pv_entry_t pv;
3761
3762 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3763 /* Pass NULL instead of the lock pointer to disable reclamation. */
3764 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3765 pv->pv_va = va;
3766 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3767 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3768 m->md.pv_gen++;
3769 return (true);
3770 } else
3771 return (false);
3772 }
3773
3774 /*
3775 * Create the PV entry for a 2MB page mapping. Always returns true unless the
3776 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
3777 * false if the PV entry cannot be allocated without resorting to reclamation.
3778 */
3779 static bool
3780 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3781 struct rwlock **lockp)
3782 {
3783 struct md_page *pvh;
3784 pv_entry_t pv;
3785 vm_paddr_t pa;
3786
3787 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3788 /* Pass NULL instead of the lock pointer to disable reclamation. */
3789 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3790 NULL : lockp)) == NULL)
3791 return (false);
3792 pv->pv_va = va;
3793 pa = PTE_TO_PHYS(l2e);
3794 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3795 pvh = pa_to_pvh(pa);
3796 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3797 pvh->pv_gen++;
3798 return (true);
3799 }
3800
3801 /*
3802 * Conditionally creates the PV entries for a L3C superpage mapping if
3803 * the required memory can be allocated without resorting to reclamation.
3804 */
3805 static bool
3806 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3807 struct rwlock **lockp)
3808 {
3809 pv_entry_t pv;
3810 vm_offset_t tva;
3811 vm_paddr_t pa __diagused;
3812 vm_page_t mt;
3813
3814 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3815 KASSERT((va & L3C_OFFSET) == 0,
3816 ("pmap_pv_insert_l3c: va is not aligned"));
3817 pa = VM_PAGE_TO_PHYS(m);
3818 KASSERT((pa & L3C_OFFSET) == 0,
3819 ("pmap_pv_insert_l3c: pa is not aligned"));
3820 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3821 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3822 /* Pass NULL instead of lockp to disable reclamation. */
3823 pv = get_pv_entry(pmap, NULL);
3824 if (__predict_false(pv == NULL)) {
3825 while (tva > va) {
3826 mt--;
3827 tva -= L3_SIZE;
3828 pmap_pvh_free(&mt->md, pmap, tva);
3829 }
3830 return (false);
3831 }
3832 pv->pv_va = tva;
3833 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3834 mt->md.pv_gen++;
3835 }
3836 return (true);
3837 }
3838
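/*
 * Replace the given kernel L2 block entry with an L2_TABLE entry pointing
 * to the page table page that was saved for this address, zeroing that
 * page first if it still contains mappings left over from a promotion.
 */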
3839 static void
3840 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3841 {
3842 pt_entry_t newl2, oldl2 __diagused;
3843 vm_page_t ml3;
3844 vm_paddr_t ml3pa;
3845
3846 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3847 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3848 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3849
3850 ml3 = pmap_remove_pt_page(pmap, va);
3851 KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page"));
3852
3853 ml3pa = VM_PAGE_TO_PHYS(ml3);
3854 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3855
3856 /*
3857 * If this page table page was unmapped by a promotion, then it
3858 * contains valid mappings. Zero it to invalidate those mappings.
3859 */
3860 if (vm_page_any_valid(ml3))
3861 pagezero((void *)PHYS_TO_DMAP(ml3pa));
3862
3863 /*
3864 * Demote the mapping. The caller must have already invalidated the
3865 * mapping (i.e., the "break" in break-before-make).
3866 */
3867 oldl2 = pmap_load_store(l2, newl2);
3868 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3869 __func__, l2, oldl2));
3870 }
3871
3872 /*
3873  * pmap_remove_l2: Unmap a level 2 superpage.
3874 */
3875 static int
3876 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e,
3877 bool demote_kl2e, struct spglist *free, struct rwlock **lockp)
3878 {
3879 struct md_page *pvh;
3880 pt_entry_t old_l2;
3881 vm_page_t m, ml3, mt;
3882
3883 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3884 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3885 old_l2 = pmap_load_clear(l2);
3886 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3887 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3888
3889 /*
3890 * Since a promotion must break the 4KB page mappings before making
3891 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3892 */
3893 pmap_s1_invalidate_page(pmap, sva, true);
3894
3895 if (old_l2 & ATTR_SW_WIRED)
3896 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3897 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3898 if (old_l2 & ATTR_SW_MANAGED) {
3899 m = PTE_TO_VM_PAGE(old_l2);
3900 pvh = page_to_pvh(m);
3901 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3902 pmap_pvh_free(pvh, pmap, sva);
3903 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3904 if (pmap_pte_dirty(pmap, old_l2))
3905 vm_page_dirty(mt);
3906 if (old_l2 & ATTR_AF)
3907 vm_page_aflag_set(mt, PGA_REFERENCED);
3908 if (TAILQ_EMPTY(&mt->md.pv_list) &&
3909 TAILQ_EMPTY(&pvh->pv_list))
3910 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3911 }
3912 }
3913 if (pmap != kernel_pmap) {
3914 ml3 = pmap_remove_pt_page(pmap, sva);
3915 if (ml3 != NULL) {
3916 KASSERT(vm_page_any_valid(ml3),
3917 ("pmap_remove_l2: l3 page not promoted"));
3918 pmap_resident_count_dec(pmap, 1);
3919 KASSERT(ml3->ref_count == NL3PG,
3920 ("pmap_remove_l2: l3 page ref count error"));
3921 ml3->ref_count = 0;
3922 pmap_add_delayed_free_list(ml3, free, false);
3923 }
3924 } else if (demote_kl2e) {
3925 pmap_remove_kernel_l2(pmap, l2, sva);
3926 } else {
3927 ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva));
3928 if (vm_page_any_valid(ml3)) {
3929 ml3->valid = 0;
3930 pmap_zero_page(ml3);
3931 }
3932 }
3933 return (pmap_unuse_pt(pmap, sva, l1e, free));
3934 }
3935
3936 /*
3937  * pmap_remove_l3: Unmap a single 4KB page from the given pmap.
3938 */
3939 static int
3940 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3941 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3942 {
3943 struct md_page *pvh;
3944 pt_entry_t old_l3;
3945 vm_page_t m;
3946
3947 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3948 old_l3 = pmap_load(l3);
3949 if ((old_l3 & ATTR_CONTIGUOUS) != 0)
3950 (void)pmap_demote_l3c(pmap, l3, va);
3951 old_l3 = pmap_load_clear(l3);
3952 pmap_s1_invalidate_page(pmap, va, true);
3953 if (old_l3 & ATTR_SW_WIRED)
3954 pmap->pm_stats.wired_count -= 1;
3955 pmap_resident_count_dec(pmap, 1);
3956 if (old_l3 & ATTR_SW_MANAGED) {
3957 m = PTE_TO_VM_PAGE(old_l3);
3958 if (pmap_pte_dirty(pmap, old_l3))
3959 vm_page_dirty(m);
3960 if (old_l3 & ATTR_AF)
3961 vm_page_aflag_set(m, PGA_REFERENCED);
3962 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3963 pmap_pvh_free(&m->md, pmap, va);
3964 if (TAILQ_EMPTY(&m->md.pv_list) &&
3965 (m->flags & PG_FICTITIOUS) == 0) {
3966 pvh = page_to_pvh(m);
3967 if (TAILQ_EMPTY(&pvh->pv_list))
3968 vm_page_aflag_clear(m, PGA_WRITEABLE);
3969 }
3970 }
3971 return (pmap_unuse_pt(pmap, va, l2e, free));
3972 }
3973
3974 /*
3975 * Removes the specified L3C superpage mapping. Requests TLB invalidations
3976 * to be performed by the caller through the returned "*vap". Returns true
3977 * if the level 3 table "ml3" was unmapped and added to the spglist "free".
3978 * Otherwise, returns false.
3979 */
3980 static bool
3981 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
3982 vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
3983 struct rwlock **lockp)
3984 {
3985 struct md_page *pvh;
3986 struct rwlock *new_lock;
3987 pt_entry_t first_l3e, l3e, *tl3p;
3988 vm_offset_t tva;
3989 vm_page_t m, mt;
3990
3991 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3992 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
3993 0, ("pmap_remove_l3c: l3p is not aligned"));
3994 KASSERT((va & L3C_OFFSET) == 0,
3995 ("pmap_remove_l3c: va is not aligned"));
3996
3997 /*
3998 * Hardware accessed and dirty bit maintenance might only update a
3999 * single L3 entry, so we must combine the accessed and dirty bits
4000 * from this entire set of contiguous L3 entries.
4001 */
4002 first_l3e = pmap_load_clear(l3p);
4003 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4004 l3e = pmap_load_clear(tl3p);
4005 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4006 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
4007 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4008 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4009 first_l3e &= ~ATTR_S1_AP_RW_BIT;
4010 first_l3e |= l3e & ATTR_AF;
4011 }
4012 if ((first_l3e & ATTR_SW_WIRED) != 0)
4013 pmap->pm_stats.wired_count -= L3C_ENTRIES;
4014 pmap_resident_count_dec(pmap, L3C_ENTRIES);
4015 if ((first_l3e & ATTR_SW_MANAGED) != 0) {
4016 m = PTE_TO_VM_PAGE(first_l3e);
4017 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4018 if (new_lock != *lockp) {
4019 if (*lockp != NULL) {
4020 /*
4021 * Pending TLB invalidations must be
4022 * performed before the PV list lock is
4023 * released. Otherwise, a concurrent
4024 * pmap_remove_all() on a physical page
4025 * could return while a stale TLB entry
4026 * still provides access to that page.
4027 */
4028 if (*vap != va_next) {
4029 pmap_invalidate_range(pmap, *vap, va,
4030 true);
4031 *vap = va_next;
4032 }
4033 rw_wunlock(*lockp);
4034 }
4035 *lockp = new_lock;
4036 rw_wlock(*lockp);
4037 }
4038 pvh = page_to_pvh(m);
4039 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
4040 L3_SIZE) {
4041 if (pmap_pte_dirty(pmap, first_l3e))
4042 vm_page_dirty(mt);
4043 if ((first_l3e & ATTR_AF) != 0)
4044 vm_page_aflag_set(mt, PGA_REFERENCED);
4045 pmap_pvh_free(&mt->md, pmap, tva);
4046 if (TAILQ_EMPTY(&mt->md.pv_list) &&
4047 TAILQ_EMPTY(&pvh->pv_list))
4048 vm_page_aflag_clear(mt, PGA_WRITEABLE);
4049 }
4050 }
4051 if (*vap == va_next)
4052 *vap = va;
4053 if (ml3 != NULL) {
4054 ml3->ref_count -= L3C_ENTRIES;
4055 if (ml3->ref_count == 0) {
4056 _pmap_unwire_l3(pmap, va, ml3, free);
4057 return (true);
4058 }
4059 }
4060 return (false);
4061 }
4062
4063 /*
4064 * Remove the specified range of addresses from the L3 page table that is
4065 * identified by the given L2 entry.
4066 */
4067 static void
4068 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
4069 vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
4070 {
4071 struct md_page *pvh;
4072 struct rwlock *new_lock;
4073 pt_entry_t *l3, old_l3;
4074 vm_offset_t va;
4075 vm_page_t l3pg, m;
4076
4077 KASSERT(ADDR_IS_CANONICAL(sva),
4078 ("%s: Start address not in canonical form: %lx", __func__, sva));
4079 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
4080 ("%s: End address not in canonical form: %lx", __func__, eva));
4081
4082 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4083 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
4084 ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
4085 l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
4086 va = eva;
4087 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
4088 old_l3 = pmap_load(l3);
4089 if (!pmap_l3_valid(old_l3)) {
4090 if (va != eva) {
4091 pmap_invalidate_range(pmap, va, sva, true);
4092 va = eva;
4093 }
4094 continue;
4095 }
4096 if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
4097 /*
4098 * Is this entire set of contiguous L3 entries being
4099 * removed? Handle the possibility that "eva" is zero
4100 * because of address wraparound.
4101 */
4102 if ((sva & L3C_OFFSET) == 0 &&
4103 sva + L3C_OFFSET <= eva - 1) {
4104 if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
4105 l3pg, free, lockp)) {
4106 /* The L3 table was unmapped. */
4107 sva += L3C_SIZE;
4108 break;
4109 }
4110 l3 += L3C_ENTRIES - 1;
4111 sva += L3C_SIZE - L3_SIZE;
4112 continue;
4113 }
4114
4115 (void)pmap_demote_l3c(pmap, l3, sva);
4116 }
4117 old_l3 = pmap_load_clear(l3);
4118 if ((old_l3 & ATTR_SW_WIRED) != 0)
4119 pmap->pm_stats.wired_count--;
4120 pmap_resident_count_dec(pmap, 1);
4121 if ((old_l3 & ATTR_SW_MANAGED) != 0) {
4122 m = PTE_TO_VM_PAGE(old_l3);
4123 if (pmap_pte_dirty(pmap, old_l3))
4124 vm_page_dirty(m);
4125 if ((old_l3 & ATTR_AF) != 0)
4126 vm_page_aflag_set(m, PGA_REFERENCED);
4127 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4128 if (new_lock != *lockp) {
4129 if (*lockp != NULL) {
4130 /*
4131 * Pending TLB invalidations must be
4132 * performed before the PV list lock is
4133 * released. Otherwise, a concurrent
4134 * pmap_remove_all() on a physical page
4135 * could return while a stale TLB entry
4136 * still provides access to that page.
4137 */
4138 if (va != eva) {
4139 pmap_invalidate_range(pmap, va,
4140 sva, true);
4141 va = eva;
4142 }
4143 rw_wunlock(*lockp);
4144 }
4145 *lockp = new_lock;
4146 rw_wlock(*lockp);
4147 }
4148 pmap_pvh_free(&m->md, pmap, sva);
4149 if (TAILQ_EMPTY(&m->md.pv_list) &&
4150 (m->flags & PG_FICTITIOUS) == 0) {
4151 pvh = page_to_pvh(m);
4152 if (TAILQ_EMPTY(&pvh->pv_list))
4153 vm_page_aflag_clear(m, PGA_WRITEABLE);
4154 }
4155 }
4156 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
4157 /*
4158 * _pmap_unwire_l3() has already invalidated the TLB
4159 * entries at all levels for "sva". So, we need not
4160 * perform "sva += L3_SIZE;" here. Moreover, we need
4161 * not perform "va = sva;" if "sva" is at the start
4162 * of a new valid range consisting of a single page.
4163 */
4164 break;
4165 }
4166 if (va == eva)
4167 va = sva;
4168 }
4169 if (va != eva)
4170 pmap_invalidate_range(pmap, va, sva, true);
4171 }
4172
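/*
 * Remove the given range of addresses from the specified pmap.  If
 * "map_delete" is true, also remove any BTI metadata associated with the
 * range.
 */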
4173 static void
4174 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
4175 {
4176 struct rwlock *lock;
4177 vm_offset_t va_next;
4178 pd_entry_t *l0, *l1, *l2;
4179 pt_entry_t l3_paddr;
4180 struct spglist free;
4181
4182 /*
4183 * Perform an unsynchronized read. This is, however, safe.
4184 */
4185 if (pmap->pm_stats.resident_count == 0)
4186 return;
4187
4188 SLIST_INIT(&free);
4189
4190 PMAP_LOCK(pmap);
4191 if (map_delete)
4192 pmap_bti_on_remove(pmap, sva, eva);
4193
4194 lock = NULL;
4195 for (; sva < eva; sva = va_next) {
4196 if (pmap->pm_stats.resident_count == 0)
4197 break;
4198
4199 l0 = pmap_l0(pmap, sva);
4200 if (pmap_load(l0) == 0) {
4201 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4202 if (va_next < sva)
4203 va_next = eva;
4204 continue;
4205 }
4206
4207 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4208 if (va_next < sva)
4209 va_next = eva;
4210 l1 = pmap_l0_to_l1(l0, sva);
4211 if (pmap_load(l1) == 0)
4212 continue;
4213 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4214 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4215 KASSERT(va_next <= eva,
4216 ("partial update of non-transparent 1G page "
4217 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4218 pmap_load(l1), sva, eva, va_next));
4219 MPASS(pmap != kernel_pmap);
4220 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4221 pmap_clear(l1);
4222 pmap_s1_invalidate_page(pmap, sva, true);
4223 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
4224 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
4225 continue;
4226 }
4227
4228 /*
4229 * Calculate index for next page table.
4230 */
4231 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4232 if (va_next < sva)
4233 va_next = eva;
4234
4235 l2 = pmap_l1_to_l2(l1, sva);
4236 l3_paddr = pmap_load(l2);
4237
4238 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4239 if (sva + L2_SIZE == va_next && eva >= va_next) {
4240 pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
4241 true, &free, &lock);
4242 continue;
4243 } else if (pmap_demote_l2_locked(pmap, l2, sva,
4244 &lock) == NULL)
4245 continue;
4246 l3_paddr = pmap_load(l2);
4247 }
4248
4249 /*
4250 * Weed out invalid mappings.
4251 */
4252 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
4253 continue;
4254
4255 /*
4256 * Limit our scan to either the end of the va represented
4257 * by the current page table page, or to the end of the
4258 * range being removed.
4259 */
4260 if (va_next > eva)
4261 va_next = eva;
4262
4263 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
4264 &lock);
4265 }
4266 if (lock != NULL)
4267 rw_wunlock(lock);
4268 PMAP_UNLOCK(pmap);
4269 vm_page_free_pages_toq(&free, true);
4270 }
4271
4272 /*
4273 * Remove the given range of addresses from the specified map.
4274 *
4275 * It is assumed that the start and end are properly
4276 * rounded to the page size.
4277 */
4278 void
4279 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4280 {
4281 pmap_remove1(pmap, sva, eva, false);
4282 }
4283
4284 /*
4285 * Remove the given range of addresses as part of a logical unmap
4286 * operation. This has the effect of calling pmap_remove(), but
4287 * also clears any metadata that should persist for the lifetime
4288 * of a logical mapping.
4289 */
4290 void
4291 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4292 {
4293 pmap_remove1(pmap, sva, eva, true);
4294 }
4295
4296 /*
4297 * Routine: pmap_remove_all
4298 * Function:
4299 * Removes this physical page from
4300 * all physical maps in which it resides.
4301 * Reflects back modify bits to the pager.
4302 *
4303 * Notes:
4304 * Original versions of this routine were very
4305 * inefficient because they iteratively called
4306 * pmap_remove (slow...)
4307 */
4308
4309 void
4310 pmap_remove_all(vm_page_t m)
4311 {
4312 struct md_page *pvh;
4313 pv_entry_t pv;
4314 pmap_t pmap;
4315 struct rwlock *lock;
4316 pd_entry_t *pde, tpde;
4317 pt_entry_t *pte, tpte;
4318 vm_offset_t va;
4319 struct spglist free;
4320 int lvl, pvh_gen, md_gen;
4321
4322 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4323 ("pmap_remove_all: page %p is not managed", m));
4324 SLIST_INIT(&free);
4325 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4326 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4327 rw_wlock(lock);
4328 retry:
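	/*
	 * First demote any 2MB mappings of the page, so that the loop
	 * below only has to remove 4KB mappings.
	 */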
4329 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4330 pmap = PV_PMAP(pv);
4331 if (!PMAP_TRYLOCK(pmap)) {
4332 pvh_gen = pvh->pv_gen;
4333 rw_wunlock(lock);
4334 PMAP_LOCK(pmap);
4335 rw_wlock(lock);
4336 if (pvh_gen != pvh->pv_gen) {
4337 PMAP_UNLOCK(pmap);
4338 goto retry;
4339 }
4340 }
4341 va = pv->pv_va;
4342 pte = pmap_pte_exists(pmap, va, 2, __func__);
4343 pmap_demote_l2_locked(pmap, pte, va, &lock);
4344 PMAP_UNLOCK(pmap);
4345 }
4346 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4347 pmap = PV_PMAP(pv);
4348 if (!PMAP_TRYLOCK(pmap)) {
4349 pvh_gen = pvh->pv_gen;
4350 md_gen = m->md.pv_gen;
4351 rw_wunlock(lock);
4352 PMAP_LOCK(pmap);
4353 rw_wlock(lock);
4354 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4355 PMAP_UNLOCK(pmap);
4356 goto retry;
4357 }
4358 }
4359 pmap_resident_count_dec(pmap, 1);
4360
4361 pde = pmap_pde(pmap, pv->pv_va, &lvl);
4362 KASSERT(pde != NULL,
4363 ("pmap_remove_all: no page directory entry found"));
4364 KASSERT(lvl == 2,
4365 ("pmap_remove_all: invalid pde level %d", lvl));
4366 tpde = pmap_load(pde);
4367
4368 pte = pmap_l2_to_l3(pde, pv->pv_va);
4369 tpte = pmap_load(pte);
4370 if ((tpte & ATTR_CONTIGUOUS) != 0)
4371 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4372 tpte = pmap_load_clear(pte);
4373 if (tpte & ATTR_SW_WIRED)
4374 pmap->pm_stats.wired_count--;
4375 if ((tpte & ATTR_AF) != 0) {
4376 pmap_invalidate_page(pmap, pv->pv_va, true);
4377 vm_page_aflag_set(m, PGA_REFERENCED);
4378 }
4379
4380 /*
4381 * Update the vm_page_t clean and reference bits.
4382 */
4383 if (pmap_pte_dirty(pmap, tpte))
4384 vm_page_dirty(m);
4385 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4386 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4387 m->md.pv_gen++;
4388 free_pv_entry(pmap, pv);
4389 PMAP_UNLOCK(pmap);
4390 }
4391 vm_page_aflag_clear(m, PGA_WRITEABLE);
4392 rw_wunlock(lock);
4393 vm_page_free_pages_toq(&free, true);
4394 }
4395
4396 /*
4397  * Masks and sets bits in a level 2 page table entry in the specified pmap.
4398 */
4399 static void
4400 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4401 pt_entry_t nbits)
4402 {
4403 pd_entry_t old_l2;
4404 vm_page_t m, mt;
4405
4406 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4407 PMAP_ASSERT_STAGE1(pmap);
4408 KASSERT((sva & L2_OFFSET) == 0,
4409 ("pmap_protect_l2: sva is not 2mpage aligned"));
4410 old_l2 = pmap_load(l2);
4411 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4412 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4413
4414 /*
4415 * Return if the L2 entry already has the desired access restrictions
4416 * in place.
4417 */
4418 if ((old_l2 & mask) == nbits)
4419 return;
4420
4421 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4422 cpu_spinwait();
4423
4424 /*
4425 * When a dirty read/write superpage mapping is write protected,
4426 * update the dirty field of each of the superpage's constituent 4KB
4427 * pages.
4428 */
4429 if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4430 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4431 pmap_pte_dirty(pmap, old_l2)) {
4432 m = PTE_TO_VM_PAGE(old_l2);
4433 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4434 vm_page_dirty(mt);
4435 }
4436
4437 /*
4438 * Since a promotion must break the 4KB page mappings before making
4439 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4440 */
4441 pmap_s1_invalidate_page(pmap, sva, true);
4442 }
4443
4444 /*
4445 * Masks and sets bits in the specified L3C superpage mapping.
4446 *
4447 * Requests TLB invalidations to be performed by the caller through the
4448 * returned "*vap".
4449 */
4450 static void
4451 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4452 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4453 {
4454 pt_entry_t l3e, *tl3p;
4455 vm_page_t m, mt;
4456 bool dirty;
4457
4458 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4459 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4460 0, ("pmap_mask_set_l3c: l3p is not aligned"));
4461 KASSERT((va & L3C_OFFSET) == 0,
4462 ("pmap_mask_set_l3c: va is not aligned"));
4463 dirty = false;
4464 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4465 l3e = pmap_load(tl3p);
4466 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4467 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4468 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4469 cpu_spinwait();
4470 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4471 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4472 dirty = true;
4473 }
4474
4475 /*
4476 * When a dirty read/write superpage mapping is write protected,
4477 * update the dirty field of each of the superpage's constituent 4KB
4478 * pages.
4479 */
4480 if ((l3e & ATTR_SW_MANAGED) != 0 &&
4481 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4482 dirty) {
4483 m = PTE_TO_VM_PAGE(pmap_load(l3p));
4484 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4485 vm_page_dirty(mt);
4486 }
4487
4488 if (*vap == va_next)
4489 *vap = va;
4490 }
4491
4492 /*
4493  * Masks and sets bits in the last level page table entries in the
4494  * specified pmap and range.
4495 */
4496 static void
4497 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4498 pt_entry_t nbits, bool invalidate)
4499 {
4500 vm_offset_t va, va_next;
4501 pd_entry_t *l0, *l1, *l2;
4502 pt_entry_t *l3p, l3;
4503
4504 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4505 for (; sva < eva; sva = va_next) {
4506 l0 = pmap_l0(pmap, sva);
4507 if (pmap_load(l0) == 0) {
4508 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4509 if (va_next < sva)
4510 va_next = eva;
4511 continue;
4512 }
4513
4514 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4515 if (va_next < sva)
4516 va_next = eva;
4517 l1 = pmap_l0_to_l1(l0, sva);
4518 if (pmap_load(l1) == 0)
4519 continue;
4520 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4521 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4522 KASSERT(va_next <= eva,
4523 ("partial update of non-transparent 1G page "
4524 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4525 pmap_load(l1), sva, eva, va_next));
4526 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4527 if ((pmap_load(l1) & mask) != nbits) {
4528 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4529 if (invalidate)
4530 pmap_s1_invalidate_page(pmap, sva, true);
4531 }
4532 continue;
4533 }
4534
4535 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4536 if (va_next < sva)
4537 va_next = eva;
4538
4539 l2 = pmap_l1_to_l2(l1, sva);
4540 if (pmap_load(l2) == 0)
4541 continue;
4542
4543 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4544 if (sva + L2_SIZE == va_next && eva >= va_next) {
4545 pmap_protect_l2(pmap, l2, sva, mask, nbits);
4546 continue;
4547 } else if ((pmap_load(l2) & mask) == nbits ||
4548 pmap_demote_l2(pmap, l2, sva) == NULL)
4549 continue;
4550 }
4551 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4552 ("pmap_protect: Invalid L2 entry after demotion"));
4553
4554 if (va_next > eva)
4555 va_next = eva;
4556
4557 va = va_next;
4558 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4559 sva += L3_SIZE) {
4560 l3 = pmap_load(l3p);
4561
4562 /*
4563 * Go to the next L3 entry if the current one is
4564 * invalid or already has the desired access
4565 * restrictions in place. (The latter case occurs
4566 * frequently. For example, in a "buildworld"
4567 * workload, almost 1 out of 4 L3 entries already
4568 * have the desired restrictions.)
4569 */
4570 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4571 if (va != va_next) {
4572 if (invalidate)
4573 pmap_s1_invalidate_range(pmap,
4574 va, sva, true);
4575 va = va_next;
4576 }
4577 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4578 /*
4579 * Does this L3C page extend beyond
4580 * the requested range? Handle the
4581 * possibility that "va_next" is zero.
4582 */
4583 if ((sva | L3C_OFFSET) > va_next - 1)
4584 break;
4585
4586 /*
4587 * Skip ahead to the last L3_PAGE
4588 * within this L3C page.
4589 */
4590 l3p = (pt_entry_t *)((uintptr_t)l3p |
4591 ((L3C_ENTRIES - 1) *
4592 sizeof(pt_entry_t)));
4593 sva |= L3C_SIZE - L3_SIZE;
4594 }
4595 continue;
4596 }
4597
4598 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4599 /*
4600 * Is this entire set of contiguous L3 entries
4601 * being protected? Handle the possibility
4602 * that "va_next" is zero because of address
4603 * wraparound.
4604 */
4605 if ((sva & L3C_OFFSET) == 0 &&
4606 sva + L3C_OFFSET <= va_next - 1) {
4607 pmap_mask_set_l3c(pmap, l3p, sva, &va,
4608 va_next, mask, nbits);
4609 l3p += L3C_ENTRIES - 1;
4610 sva += L3C_SIZE - L3_SIZE;
4611 continue;
4612 }
4613
4614 (void)pmap_demote_l3c(pmap, l3p, sva);
4615
4616 /*
4617 * The L3 entry's accessed bit may have changed.
4618 */
4619 l3 = pmap_load(l3p);
4620 }
4621 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4622 nbits))
4623 cpu_spinwait();
4624
4625 /*
4626 * When a dirty read/write mapping is write protected,
4627 * update the page's dirty field.
4628 */
4629 if ((l3 & ATTR_SW_MANAGED) != 0 &&
4630 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4631 pmap_pte_dirty(pmap, l3))
4632 vm_page_dirty(PTE_TO_VM_PAGE(l3));
4633
4634 if (va == va_next)
4635 va = sva;
4636 }
4637 if (va != va_next && invalidate)
4638 pmap_s1_invalidate_range(pmap, va, sva, true);
4639 }
4640 }
4641
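/*
 * Lock the pmap and apply pmap_mask_set_locked() to the given range.
 */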
4642 static void
4643 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4644 pt_entry_t nbits, bool invalidate)
4645 {
4646 PMAP_LOCK(pmap);
4647 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4648 PMAP_UNLOCK(pmap);
4649 }
4650
4651 /*
4652 * Set the physical protection on the
4653 * specified range of this map as requested.
4654 */
4655 void
4656 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4657 {
4658 pt_entry_t mask, nbits;
4659
4660 PMAP_ASSERT_STAGE1(pmap);
4661 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4662 if (prot == VM_PROT_NONE) {
4663 pmap_remove(pmap, sva, eva);
4664 return;
4665 }
4666
4667 mask = nbits = 0;
4668 if ((prot & VM_PROT_WRITE) == 0) {
4669 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4670 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4671 }
4672 if ((prot & VM_PROT_EXECUTE) == 0) {
4673 mask |= ATTR_S1_XN;
4674 nbits |= ATTR_S1_XN;
4675 }
4676 if (pmap == kernel_pmap) {
4677 mask |= ATTR_KERN_GP;
4678 nbits |= ATTR_KERN_GP;
4679 }
4680 if (mask == 0)
4681 return;
4682
4683 pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4684 }
4685
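/*
 * Prevent superpage promotion within the given range of kernel virtual
 * addresses by setting ATTR_SW_NO_PROMOTE on the existing mappings.
 */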
4686 void
4687 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4688 {
4689
4690 MPASS((sva & L3_OFFSET) == 0);
4691 MPASS(((sva + size) & L3_OFFSET) == 0);
4692
4693 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4694 ATTR_SW_NO_PROMOTE, false);
4695 }
4696
4697 /*
4698 * Inserts the specified page table page into the specified pmap's collection
4699 * of idle page table pages. Each of a pmap's page table pages is responsible
4700 * for mapping a distinct range of virtual addresses. The pmap's collection is
4701 * ordered by this virtual address range.
4702 *
4703 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4704 * "mpte"'s valid field will be set to 0.
4705 *
4706 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4707 * contain valid mappings with identical attributes except for ATTR_AF;
4708 * "mpte"'s valid field will be set to 1.
4709 *
4710 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4711 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4712 * field will be set to VM_PAGE_BITS_ALL.
4713 */
4714 static __inline int
4715 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4716 bool all_l3e_AF_set)
4717 {
4718
4719 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4720 KASSERT(promoted || !all_l3e_AF_set,
4721 ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4722 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4723 return (vm_radix_insert(&pmap->pm_root, mpte));
4724 }
4725
4726 /*
4727 * Removes the page table page mapping the specified virtual address from the
4728 * specified pmap's collection of idle page table pages, and returns it.
4729 * Otherwise, returns NULL if there is no page table page corresponding to the
4730 * specified virtual address.
4731 */
4732 static __inline vm_page_t
4733 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4734 {
4735
4736 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4737 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4738 }
4739
4740 /*
4741 * Performs a break-before-make update of a pmap entry. This is needed when
4742 * either promoting or demoting pages to ensure the TLB doesn't get into an
4743 * inconsistent state.
4744 */
4745 static void
4746 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4747 vm_offset_t va, vm_size_t size)
4748 {
4749 register_t intr;
4750
4751 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4752 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4753 ("%s: Updating non-promote pte", __func__));
4754
4755 /*
4756 * Ensure we don't get switched out with the page table in an
4757 * inconsistent state. We also need to ensure no interrupts fire
4758 * as they may make use of an address we are about to invalidate.
4759 */
4760 intr = intr_disable();
4761
4762 /*
4763 * Clear the old mapping's valid bit, but leave the rest of the entry
4764 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4765 * lookup the physical address.
4766 */
4767 pmap_clear_bits(ptep, ATTR_DESCR_VALID);
4768
4769 /*
4770 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4771 * be cached, so we invalidate intermediate entries as well as final
4772 * entries.
4773 */
4774 pmap_s1_invalidate_range(pmap, va, va + size, false);
4775
4776 /* Create the new mapping */
4777 pmap_store(ptep, newpte);
4778 dsb(ishst);
4779
4780 intr_restore(intr);
4781 }
4782
4783 /*
4784 * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping.
4785 */
4786 static void __nosanitizecoverage
4787 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end,
4788 pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size)
4789 {
4790 pd_entry_t *lip;
4791 register_t intr;
4792
4793 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4794 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4795 ("%s: Updating non-promote pte", __func__));
4796
4797 /*
4798 * Ensure we don't get switched out with the page table in an
4799 * inconsistent state. We also need to ensure no interrupts fire
4800 * as they may make use of an address we are about to invalidate.
4801 */
4802 intr = intr_disable();
4803
4804 /*
4805 * Clear the old mapping's valid bits, but leave the rest of each
4806 * entry unchanged, so that a lockless, concurrent pmap_kextract() can
4807 * still lookup the physical address.
4808 */
4809 for (lip = ptep; lip < ptep_end; lip++)
4810 pmap_clear_bits(lip, ATTR_DESCR_VALID);
4811
4812 /* Only final entries are changing. */
4813 pmap_s1_invalidate_strided(pmap, va, va + size, stride, true);
4814
4815 /* Create the new mapping. */
4816 for (lip = ptep; lip < ptep_end; lip++) {
4817 pmap_store(lip, newpte);
4818 newpte += stride;
4819 }
4820 dsb(ishst);
4821
4822 intr_restore(intr);
4823 }
4824
4825 #if VM_NRESERVLEVEL > 0
4826 /*
4827 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4828 * replace the many pv entries for the 4KB page mappings by a single pv entry
4829 * for the 2MB page mapping.
4830 */
4831 static void
4832 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4833 struct rwlock **lockp)
4834 {
4835 struct md_page *pvh;
4836 pv_entry_t pv;
4837 vm_offset_t va_last;
4838 vm_page_t m;
4839
4840 KASSERT((pa & L2_OFFSET) == 0,
4841 ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4842 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4843
4844 /*
4845 * Transfer the first page's pv entry for this mapping to the 2mpage's
4846 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
4847 * a transfer avoids the possibility that get_pv_entry() calls
4848 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4849 * mappings that is being promoted.
4850 */
4851 m = PHYS_TO_VM_PAGE(pa);
4852 va = va & ~L2_OFFSET;
4853 pv = pmap_pvh_remove(&m->md, pmap, va);
4854 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4855 pvh = page_to_pvh(m);
4856 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4857 pvh->pv_gen++;
4858 /* Free the remaining NPTEPG - 1 pv entries. */
4859 va_last = va + L2_SIZE - PAGE_SIZE;
4860 do {
4861 m++;
4862 va += PAGE_SIZE;
4863 pmap_pvh_free(&m->md, pmap, va);
4864 } while (va < va_last);
4865 }
4866
4867 /*
4868 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4869 * single level 2 table entry to a single 2MB page mapping. For promotion
4870 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4871 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4872 * identical characteristics.
4873 */
4874 static bool
4875 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4876 struct rwlock **lockp)
4877 {
4878 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4879
4880 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4881
4882 /*
4883 * Currently, this function only supports promotion on stage 1 pmaps
4884 * because it tests stage 1 specific fields and performs a break-
4885 * before-make sequence that is incorrect for stage 2 pmaps.
4886 */
4887 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4888 return (false);
4889
4890 /*
4891 * Examine the first L3E in the specified PTP. Abort if this L3E is
4892 * ineligible for promotion...
4893 */
4894 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
4895 newl2 = pmap_load(firstl3);
4896 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4897 return (false);
4898 /* ... is not the first physical page within an L2 block */
4899 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
4900 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
4901 counter_u64_add(pmap_l2_p_failures, 1);
4902 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4903 " in pmap %p", va, pmap);
4904 return (false);
4905 }
4906
4907 /*
4908 * Both here and in the below "for" loop, to allow for repromotion
4909 * after MADV_FREE, conditionally write protect a clean L3E before
4910 * possibly aborting the promotion due to other L3E attributes. Why?
4911 * Suppose that MADV_FREE is applied to a part of a superpage, the
4912 * address range [S, E). pmap_advise() will demote the superpage
4913 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4914 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later,
4915 * imagine that the memory in [S, E) is recycled, but the last 4KB
4916 * page in [S, E) is not the last to be rewritten, or simply accessed.
4917 * In other words, there is still a 4KB page in [S, E), call it P,
4918 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4919 * Unless we write protect P before aborting the promotion, if and
4920 * when P is finally rewritten, there won't be a page fault to trigger
4921 * repromotion.
4922 */
4923 setl2:
4924 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4925 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4926 /*
4927 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4928 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4929 */
4930 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4931 goto setl2;
4932 newl2 &= ~ATTR_SW_DBM;
4933 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
4934 " in pmap %p", va & ~L2_OFFSET, pmap);
4935 }
4936
4937 /*
4938 * Examine each of the other L3Es in the specified PTP. Abort if this
4939 * L3E maps an unexpected 4KB physical page or does not have identical
4940 * characteristics to the first L3E. If ATTR_AF is not set in every
4941 * PTE, then request that the PTP be refilled on demotion.
4942 */
4943 all_l3e_AF = newl2 & ATTR_AF;
4944 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
4945 + L2_SIZE - PAGE_SIZE;
4946 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4947 oldl3 = pmap_load(l3);
4948 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4949 counter_u64_add(pmap_l2_p_failures, 1);
4950 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4951 " in pmap %p", va, pmap);
4952 return (false);
4953 }
4954 setl3:
4955 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4956 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4957 /*
4958 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4959 * set, ATTR_SW_DBM can be cleared without a TLB
4960 * invalidation.
4961 */
4962 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4963 ~ATTR_SW_DBM))
4964 goto setl3;
4965 oldl3 &= ~ATTR_SW_DBM;
4966 }
4967 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
4968 counter_u64_add(pmap_l2_p_failures, 1);
4969 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4970 " in pmap %p", va, pmap);
4971 return (false);
4972 }
4973 all_l3e_AF &= oldl3;
4974 pa -= PAGE_SIZE;
4975 }
4976
4977 /*
4978 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4979 * mapping, so that promotions triggered by speculative mappings,
4980 * such as pmap_enter_quick(), don't automatically mark the
4981 * underlying pages as referenced.
4982 */
4983 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
4984
4985 /*
4986 * Save the page table page in its current state until the L2
4987 * mapping the superpage is demoted by pmap_demote_l2() or
4988 * destroyed by pmap_remove_l3().
4989 */
4990 if (mpte == NULL)
4991 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
4992 KASSERT(mpte >= vm_page_array &&
4993 mpte < &vm_page_array[vm_page_array_size],
4994 ("pmap_promote_l2: page table page is out of range"));
4995 KASSERT(mpte->pindex == pmap_l2_pindex(va),
4996 ("pmap_promote_l2: page table page's pindex is wrong"));
4997 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
4998 counter_u64_add(pmap_l2_p_failures, 1);
4999 CTR2(KTR_PMAP,
5000 "pmap_promote_l2: failure for va %#lx in pmap %p", va,
5001 pmap);
5002 return (false);
5003 }
5004
5005 if ((newl2 & ATTR_SW_MANAGED) != 0)
5006 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
5007
5008 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
5009
5010 counter_u64_add(pmap_l2_promotions, 1);
5011 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
5012 pmap);
5013 return (true);
5014 }
5015
5016 /*
5017 * Tries to promote an aligned, contiguous set of base page mappings to a
5018 * single L3C page mapping. For promotion to occur, two conditions must be
5019 * met: (1) the base page mappings must map aligned, contiguous physical
5020 * memory and (2) the base page mappings must have identical characteristics
5021 * except for the accessed flag.
5022 */
5023 static bool
5024 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
5025 {
5026 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
5027
5028 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5029
5030 /*
5031 * Currently, this function only supports promotion on stage 1 pmaps
5032 * because it tests stage 1 specific fields and performs a break-
5033 * before-make sequence that is incorrect for stage 2 pmaps.
5034 */
5035 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
5036 return (false);
5037
5038 /*
5039 * Compute the address of the first L3 entry in the superpage
5040 * candidate.
5041 */
5042 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
5043 sizeof(pt_entry_t)) - 1));
5044
5045 firstl3c = pmap_load(l3p);
5046
5047 /*
5048 * Examine the first L3 entry. Abort if this L3E is ineligible for
5049 * promotion...
5050 */
5051 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
5052 return (false);
5053 /* ...is not properly aligned... */
5054 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
5055 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
5056 counter_u64_add(pmap_l3c_p_failures, 1);
5057 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5058 " in pmap %p", va, pmap);
5059 return (false);
5060 }
5061
5062 /*
5063 * If the first L3 entry is a clean read-write mapping, convert it
5064 * to a read-only mapping. See pmap_promote_l2() for the rationale.
5065 */
5066 set_first:
5067 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5068 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5069 /*
5070 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
5071 * ATTR_SW_DBM can be cleared without a TLB invalidation.
5072 */
5073 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
5074 goto set_first;
5075 firstl3c &= ~ATTR_SW_DBM;
5076 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
5077 " in pmap %p", va & ~L3C_OFFSET, pmap);
5078 }
5079
5080 /*
5081 * Check that the rest of the L3 entries are compatible with the first,
5082 * and convert clean read-write mappings to read-only mappings.
5083 */
5084 all_l3e_AF = firstl3c & ATTR_AF;
5085 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
5086 L3C_SIZE - PAGE_SIZE;
5087 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
5088 oldl3 = pmap_load(l3);
5089 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
5090 counter_u64_add(pmap_l3c_p_failures, 1);
5091 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5092 " in pmap %p", va, pmap);
5093 return (false);
5094 }
5095 set_l3:
5096 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5097 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5098 /*
5099 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
5100 * set, ATTR_SW_DBM can be cleared without a TLB
5101 * invalidation.
5102 */
5103 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
5104 ~ATTR_SW_DBM))
5105 goto set_l3;
5106 oldl3 &= ~ATTR_SW_DBM;
5107 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
5108 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
5109 (va & ~L3C_OFFSET), pmap);
5110 }
5111 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
5112 counter_u64_add(pmap_l3c_p_failures, 1);
5113 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5114 " in pmap %p", va, pmap);
5115 return (false);
5116 }
5117 all_l3e_AF &= oldl3;
5118 pa -= PAGE_SIZE;
5119 }
5120
5121 /*
5122 * Unless all PTEs have ATTR_AF set, clear it from the superpage
5123 * mapping, so that promotions triggered by speculative mappings,
5124 * such as pmap_enter_quick(), don't automatically mark the
5125 * underlying pages as referenced.
5126 */
5127 firstl3c &= ~ATTR_AF | all_l3e_AF;
5128
5129 /*
5130 * Remake the mappings with the contiguous bit set.
5131 */
5132 pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c |
5133 ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE);
5134
5135 counter_u64_add(pmap_l3c_promotions, 1);
5136 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
5137 pmap);
5138 return (true);
5139 }
5140 #endif /* VM_NRESERVLEVEL > 0 */
5141
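/*
 * Directly install a large mapping at "va": a 64KB L3C mapping when psind
 * is 1, a 2MB L2 block when psind is 2, or a 1GB L1 block when psind is 3.
 * Any missing intermediate page table pages are allocated, sleeping if the
 * flags permit.
 */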
5142 static int
5143 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
5144 int psind)
5145 {
5146 pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p;
5147 vm_page_t mp;
5148
5149 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5150 KASSERT(psind > 0 && psind < MAXPAGESIZES,
5151 ("psind %d unexpected", psind));
5152 KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0,
5153 ("unaligned phys address %#lx pte %#lx psind %d",
5154 PTE_TO_PHYS(pte), pte, psind));
5155
5156 restart:
5157 newpte = pte;
5158 if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
5159 return (KERN_PROTECTION_FAILURE);
5160 if (psind == 3) {
5161 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5162
5163 KASSERT(pagesizes[psind] == L1_SIZE,
5164 ("pagesizes[%d] != L1_SIZE", psind));
5165 l0p = pmap_l0(pmap, va);
5166 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
5167 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
5168 if (mp == NULL) {
5169 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5170 return (KERN_RESOURCE_SHORTAGE);
5171 PMAP_UNLOCK(pmap);
5172 vm_wait(NULL);
5173 PMAP_LOCK(pmap);
5174 goto restart;
5175 }
5176 l1p = pmap_l0_to_l1(l0p, va);
5177 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5178 origpte = pmap_load(l1p);
5179 } else {
5180 l1p = pmap_l0_to_l1(l0p, va);
5181 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5182 origpte = pmap_load(l1p);
5183 if ((origpte & ATTR_DESCR_VALID) == 0) {
5184 mp = PTE_TO_VM_PAGE(pmap_load(l0p));
5185 mp->ref_count++;
5186 }
5187 }
5188 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
5189 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
5190 (origpte & ATTR_DESCR_VALID) == 0,
5191 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
5192 va, origpte, newpte));
5193 pmap_store(l1p, newpte);
5194 } else if (psind == 2) {
5195 KASSERT(pagesizes[psind] == L2_SIZE,
5196 ("pagesizes[%d] != L2_SIZE", psind));
5197 l2p = pmap_l2(pmap, va);
5198 if (l2p == NULL) {
5199 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
5200 if (mp == NULL) {
5201 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5202 return (KERN_RESOURCE_SHORTAGE);
5203 PMAP_UNLOCK(pmap);
5204 vm_wait(NULL);
5205 PMAP_LOCK(pmap);
5206 goto restart;
5207 }
5208 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5209 l2p = &l2p[pmap_l2_index(va)];
5210 origpte = pmap_load(l2p);
5211 } else {
5212 l1p = pmap_l1(pmap, va);
5213 origpte = pmap_load(l2p);
5214 if ((origpte & ATTR_DESCR_VALID) == 0) {
5215 mp = PTE_TO_VM_PAGE(pmap_load(l1p));
5216 mp->ref_count++;
5217 }
5218 }
5219 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5220 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
5221 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5222 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
5223 va, origpte, newpte));
5224 pmap_store(l2p, newpte);
5225 } else /* (psind == 1) */ {
5226 KASSERT(pagesizes[psind] == L3C_SIZE,
5227 ("pagesizes[%d] != L3C_SIZE", psind));
5228 l2p = pmap_l2(pmap, va);
5229 if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) {
5230 mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
5231 if (mp == NULL) {
5232 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5233 return (KERN_RESOURCE_SHORTAGE);
5234 PMAP_UNLOCK(pmap);
5235 vm_wait(NULL);
5236 PMAP_LOCK(pmap);
5237 goto restart;
5238 }
5239 mp->ref_count += L3C_ENTRIES - 1;
5240 l3p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5241 l3p = &l3p[pmap_l3_index(va)];
5242 } else {
5243 l3p = pmap_l2_to_l3(l2p, va);
5244 if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) {
5245 mp = PTE_TO_VM_PAGE(pmap_load(l2p));
5246 mp->ref_count += L3C_ENTRIES;
5247 }
5248 }
5249 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5250 origpte = pmap_load(tl3p);
5251 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5252 ((origpte & ATTR_CONTIGUOUS) != 0 &&
5253 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5254 ("va %#lx changing 64K phys page l3 %#lx newpte %#lx",
5255 va, origpte, newpte));
5256 pmap_store(tl3p, newpte);
5257 newpte += L3_SIZE;
5258 }
5259 }
5260 dsb(ishst);
5261
5262 if ((origpte & ATTR_DESCR_VALID) == 0)
5263 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
5264 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
5265 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
5266 else if ((newpte & ATTR_SW_WIRED) == 0 &&
5267 (origpte & ATTR_SW_WIRED) != 0)
5268 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
5269
5270 return (KERN_SUCCESS);
5271 }
5272
5273 /*
5274 * Insert the given physical page (p) at
5275 * the specified virtual address (v) in the
5276 * target physical map with the protection requested.
5277 *
5278 * If specified, the page will be wired down, meaning
5279 * that the related pte can not be reclaimed.
5280 *
5281 * NB: This is the only routine which MAY NOT lazy-evaluate
5282 * or lose information. That is, this routine must actually
5283 * insert this page into the given map NOW.
5284 */
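/*
 * Illustrative sketch (editor's addition, hedged): a caller resolving a
 * read fault might use this interface roughly as follows.  The variables
 * "pmap", "va", "m", and "wired" are assumptions for the example only and
 * do not refer to code in this file.
 *
 *	u_int flags = VM_PROT_READ | PMAP_ENTER_NOSLEEP |
 *	    (wired ? PMAP_ENTER_WIRED : 0);
 *	int rv = pmap_enter(pmap, va, m, VM_PROT_READ, flags, 0);
 *
 * With PMAP_ENTER_NOSLEEP, a failed page table page allocation is reported
 * as KERN_RESOURCE_SHORTAGE and the caller is expected to retry; without
 * it, the routine may sleep waiting for free pages.
 */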
5285 int
5286 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5287 u_int flags, int8_t psind)
5288 {
5289 struct rwlock *lock;
5290 pd_entry_t *pde;
5291 pt_entry_t new_l3, orig_l3;
5292 pt_entry_t *l2, *l3;
5293 pv_entry_t pv;
5294 vm_paddr_t opa, pa;
5295 vm_page_t mpte, om;
5296 bool nosleep;
5297 int full_lvl, lvl, rv;
5298
5299 KASSERT(ADDR_IS_CANONICAL(va),
5300 ("%s: Address not in canonical form: %lx", __func__, va));
5301
5302 va = trunc_page(va);
5303 if ((m->oflags & VPO_UNMANAGED) == 0)
5304 VM_PAGE_OBJECT_BUSY_ASSERT(m);
5305 pa = VM_PAGE_TO_PHYS(m);
5306 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
5307 L3_PAGE);
5308 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5309 new_l3 |= pmap_pte_prot(pmap, prot);
5310 if ((flags & PMAP_ENTER_WIRED) != 0)
5311 new_l3 |= ATTR_SW_WIRED;
5312 if (pmap->pm_stage == PM_STAGE1) {
5313 if (!ADDR_IS_KERNEL(va))
5314 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5315 else
5316 new_l3 |= ATTR_S1_UXN;
5317 if (pmap != kernel_pmap)
5318 new_l3 |= ATTR_S1_nG;
5319 } else {
5320 /*
5321 * Clear the access flag on executable mappings, this will be
5322 * set later when the page is accessed. The fault handler is
5323 * required to invalidate the I-cache.
5324 *
5325 * TODO: Switch to the valid flag to allow hardware management
5326 * of the access flag. Much of the pmap code assumes the
5327 * valid flag is set and fails to destroy the old page tables
5328 * correctly if it is clear.
5329 */
5330 if (prot & VM_PROT_EXECUTE)
5331 new_l3 &= ~ATTR_AF;
5332 }
5333 if ((m->oflags & VPO_UNMANAGED) == 0) {
5334 new_l3 |= ATTR_SW_MANAGED;
5335 if ((prot & VM_PROT_WRITE) != 0) {
5336 new_l3 |= ATTR_SW_DBM;
5337 if ((flags & VM_PROT_WRITE) == 0) {
5338 if (pmap->pm_stage == PM_STAGE1)
5339 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5340 else
5341 new_l3 &=
5342 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5343 }
5344 }
5345 }
5346
5347 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5348
5349 lock = NULL;
5350 PMAP_LOCK(pmap);
5351 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5352 KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5353 ("managed largepage va %#lx flags %#x", va, flags));
5354 if (psind == 3) {
5355 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5356 new_l3 &= ~L3_PAGE;
5357 new_l3 |= L1_BLOCK;
5358 } else if (psind == 2) {
5359 new_l3 &= ~L3_PAGE;
5360 new_l3 |= L2_BLOCK;
5361 } else /* (psind == 1) */
5362 new_l3 |= ATTR_CONTIGUOUS;
5363 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5364 goto out;
5365 }
5366 if (psind == 2) {
5367 /* Assert the required virtual and physical alignment. */
5368 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5369 KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind"));
5370 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5371 flags, m, &lock);
5372 goto out;
5373 }
5374 mpte = NULL;
5375 if (psind == 1) {
5376 KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned"));
5377 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5378 rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags,
5379 m, &mpte, &lock);
5380 #if VM_NRESERVLEVEL > 0
5381 /*
5382 * Attempt L2 promotion, if both the PTP and a level 1
5383 * reservation are fully populated.
5384 */
5385 if (rv == KERN_SUCCESS &&
5386 (mpte == NULL || mpte->ref_count == NL3PG) &&
5387 (m->flags & PG_FICTITIOUS) == 0 &&
5388 vm_reserv_level_iffullpop(m) == 1) {
5389 pde = pmap_l2(pmap, va);
5390 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5391 }
5392 #endif
5393 goto out;
5394 }
5395
5396 /*
5397 * In the case that a page table page is not
5398 * resident, we are creating it here.
5399 */
5400 retry:
5401 pde = pmap_pde(pmap, va, &lvl);
5402 if (pde != NULL && lvl == 2) {
5403 l3 = pmap_l2_to_l3(pde, va);
5404 if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
5405 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5406 mpte->ref_count++;
5407 }
5408 goto havel3;
5409 } else if (pde != NULL && lvl == 1) {
5410 l2 = pmap_l1_to_l2(pde, va);
5411 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5412 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5413 l3 = &l3[pmap_l3_index(va)];
5414 if (!ADDR_IS_KERNEL(va)) {
5415 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5416 mpte->ref_count++;
5417 }
5418 goto havel3;
5419 }
5420 /* We need to allocate an L3 table. */
5421 }
5422 if (!ADDR_IS_KERNEL(va)) {
5423 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5424
5425 /*
5426 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5427 * to handle the possibility that a superpage mapping for "va"
5428 * was created while we slept.
5429 */
5430 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5431 nosleep ? NULL : &lock);
5432 if (mpte == NULL && nosleep) {
5433 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5434 rv = KERN_RESOURCE_SHORTAGE;
5435 goto out;
5436 }
5437 goto retry;
5438 } else
5439 panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5440
5441 havel3:
5442 orig_l3 = pmap_load(l3);
5443 opa = PTE_TO_PHYS(orig_l3);
5444 pv = NULL;
5445 new_l3 |= pmap_pte_bti(pmap, va);
5446
5447 /*
5448 * Is the specified virtual address already mapped?
5449 */
5450 if (pmap_l3_valid(orig_l3)) {
5451 /*
5452 * Wiring change, just update stats. We don't worry about
5453 * wiring PT pages as they remain resident as long as there
5454 * are valid mappings in them. Hence, if a user page is wired,
5455 * the PT page will be also.
5456 */
5457 if ((flags & PMAP_ENTER_WIRED) != 0 &&
5458 (orig_l3 & ATTR_SW_WIRED) == 0)
5459 pmap->pm_stats.wired_count++;
5460 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5461 (orig_l3 & ATTR_SW_WIRED) != 0)
5462 pmap->pm_stats.wired_count--;
5463
5464 /*
5465 * Remove the extra PT page reference.
5466 */
5467 if (mpte != NULL) {
5468 mpte->ref_count--;
5469 KASSERT(mpte->ref_count > 0,
5470 ("pmap_enter: missing reference to page table page,"
5471 " va: 0x%lx", va));
5472 }
5473
5474 /*
5475 * Has the physical page changed?
5476 */
5477 if (opa == pa) {
5478 /*
5479 * No, might be a protection or wiring change.
5480 */
5481 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5482 (new_l3 & ATTR_SW_DBM) != 0)
5483 vm_page_aflag_set(m, PGA_WRITEABLE);
5484 goto validate;
5485 }
5486
5487 /*
5488 * The physical page has changed. Temporarily invalidate
5489 * the mapping.
5490 */
5491 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5492 (void)pmap_demote_l3c(pmap, l3, va);
5493 orig_l3 = pmap_load_clear(l3);
5494 KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5495 ("pmap_enter: unexpected pa update for %#lx", va));
5496 if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5497 om = PHYS_TO_VM_PAGE(opa);
5498
5499 /*
5500 * The pmap lock is sufficient to synchronize with
5501 * concurrent calls to pmap_page_test_mappings() and
5502 * pmap_ts_referenced().
5503 */
5504 if (pmap_pte_dirty(pmap, orig_l3))
5505 vm_page_dirty(om);
5506 if ((orig_l3 & ATTR_AF) != 0) {
5507 pmap_invalidate_page(pmap, va, true);
5508 vm_page_aflag_set(om, PGA_REFERENCED);
5509 }
5510 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
5511 pv = pmap_pvh_remove(&om->md, pmap, va);
5512 if ((m->oflags & VPO_UNMANAGED) != 0)
5513 free_pv_entry(pmap, pv);
5514 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5515 TAILQ_EMPTY(&om->md.pv_list) &&
5516 ((om->flags & PG_FICTITIOUS) != 0 ||
5517 TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5518 vm_page_aflag_clear(om, PGA_WRITEABLE);
5519 } else {
5520 KASSERT((orig_l3 & ATTR_AF) != 0,
5521 ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5522 pmap_invalidate_page(pmap, va, true);
5523 }
5524 orig_l3 = 0;
5525 } else {
5526 /*
5527 * Increment the counters.
5528 */
5529 if ((new_l3 & ATTR_SW_WIRED) != 0)
5530 pmap->pm_stats.wired_count++;
5531 pmap_resident_count_inc(pmap, 1);
5532 }
5533 /*
5534 * Enter on the PV list if part of our managed memory.
5535 */
5536 if ((m->oflags & VPO_UNMANAGED) == 0) {
5537 if (pv == NULL) {
5538 pv = get_pv_entry(pmap, &lock);
5539 pv->pv_va = va;
5540 }
5541 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5542 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5543 m->md.pv_gen++;
5544 if ((new_l3 & ATTR_SW_DBM) != 0)
5545 vm_page_aflag_set(m, PGA_WRITEABLE);
5546 }
5547
5548 validate:
5549 if (pmap->pm_stage == PM_STAGE1) {
5550 /*
5551 * Sync the icache if the mapping has exec permission and the
5552 * attribute VM_MEMATTR_WRITE_BACK is set.  Do it now, before the
5553 * mapping is stored and made valid for the hardware table walk.
5554 * If done later, another CPU could access this page before the
5555 * caches are properly synced.  Don't do it for kernel memory,
5556 * which is mapped with exec permission even if the memory isn't
5557 * going to hold executable code.  The only time an icache sync is
5558 * needed there is after a kernel module is loaded and its
5559 * relocation info is processed, and that is done in elf_cpu_load_file().
5560 */
5561 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5562 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5563 (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5564 PMAP_ASSERT_STAGE1(pmap);
5565 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
5566 PAGE_SIZE);
5567 }
5568 } else {
5569 cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5570 }
5571
5572 /*
5573 * Update the L3 entry
5574 */
5575 if (pmap_l3_valid(orig_l3)) {
5576 KASSERT(opa == pa, ("pmap_enter: invalid update"));
5577 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5578 /* same PA, different attributes */
5579 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5580 (void)pmap_demote_l3c(pmap, l3, va);
5581 orig_l3 = pmap_load_store(l3, new_l3);
5582 pmap_invalidate_page(pmap, va, true);
5583 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5584 pmap_pte_dirty(pmap, orig_l3))
5585 vm_page_dirty(m);
5586 } else {
5587 /*
5588 * orig_l3 == new_l3
5589 * This can happen if multiple threads simultaneously
5590 * access a not yet mapped page.  This is bad for performance
5591 * since it can cause a full demotion-NOP-promotion
5592 * cycle.
5593 * Other possible reasons are:
5594 * - the VM and pmap memory layouts have diverged
5595 * - a TLB flush is missing somewhere and the CPU doesn't see
5596 * the actual mapping.
5597 */
5598 CTR4(KTR_PMAP, "%s: already mapped page - "
5599 "pmap %p va 0x%#lx pte 0x%lx",
5600 __func__, pmap, va, new_l3);
5601 }
5602 } else {
5603 /* New mapping */
5604 pmap_store(l3, new_l3);
5605 dsb(ishst);
5606 }
5607
5608 #if VM_NRESERVLEVEL > 0
5609 /*
5610 * First, attempt L3C promotion, if the virtual and physical addresses
5611 * are aligned with each other and an underlying reservation has the
5612 * neighboring L3 pages allocated. The first condition is simply an
5613 * optimization that recognizes some eventual promotion failures early
5614 * at a lower run-time cost. Then, if both a level 1 reservation and
5615 * the PTP are fully populated, attempt L2 promotion.
5616 */
5617 if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5618 (m->flags & PG_FICTITIOUS) == 0 &&
5619 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
5620 pmap_promote_l3c(pmap, l3, va) &&
5621 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG))
5622 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5623 #endif
5624
5625 rv = KERN_SUCCESS;
5626 out:
5627 if (lock != NULL)
5628 rw_wunlock(lock);
5629 PMAP_UNLOCK(pmap);
5630 return (rv);
5631 }
5632
5633 /*
5634 * Tries to create a read- and/or execute-only L2 page mapping. Returns
5635 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5636 * value. See pmap_enter_l2() for the possible error values when "no sleep",
5637 * "no replace", and "no reclaim" are specified.
5638 */
5639 static int
5640 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5641 struct rwlock **lockp)
5642 {
5643 pd_entry_t new_l2;
5644
5645 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5646 PMAP_ASSERT_STAGE1(pmap);
5647 KASSERT(ADDR_IS_CANONICAL(va),
5648 ("%s: Address not in canonical form: %lx", __func__, va));
5649
5650 new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | pmap_sh_attr |
5651 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5652 L2_BLOCK);
5653 if ((m->oflags & VPO_UNMANAGED) == 0)
5654 new_l2 |= ATTR_SW_MANAGED;
5655 else
5656 new_l2 |= ATTR_AF;
5657 if ((prot & VM_PROT_EXECUTE) == 0 ||
5658 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5659 new_l2 |= ATTR_S1_XN;
5660 if (!ADDR_IS_KERNEL(va))
5661 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5662 else
5663 new_l2 |= ATTR_S1_UXN;
5664 if (pmap != kernel_pmap)
5665 new_l2 |= ATTR_S1_nG;
5666 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5667 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5668 }
5669
5670 /*
5671 * Returns true if every page table entry in the specified page table is
5672 * zero.
5673 */
5674 static bool
5675 pmap_every_pte_zero(vm_paddr_t pa)
5676 {
5677 pt_entry_t *pt_end, *pte;
5678
5679 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5680 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
5681 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5682 if (*pte != 0)
5683 return (false);
5684 }
5685 return (true);
5686 }
5687
5688 /*
5689 * Tries to create the specified L2 page mapping. Returns KERN_SUCCESS if
5690 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
5691 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
5692 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
5693 * within the L2 virtual address range starting at the specified virtual
5694 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
5695 * L2 page mapping already exists at the specified virtual address. Returns
5696 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
5697 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
5698 * and a PV entry allocation failed.
5699 */
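/*
 * Illustrative sketch (editor's addition): a caller that opportunistically
 * attempts a superpage mapping can distinguish the error values described
 * above, treating KERN_NO_SPACE like success because an L2 mapping already
 * covers the address, and falling back to smaller mappings otherwise:
 *
 *	rv = pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
 *	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, &lock);
 *	if (rv == KERN_SUCCESS || rv == KERN_NO_SPACE)
 *		advance va by L2_SIZE and continue;
 *	else
 *		map the range with 4KB (or L3C) mappings instead;
 *
 * This mirrors how pmap_enter_object() below consumes the return value.
 */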
5700 static int
5701 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
5702 vm_page_t m, struct rwlock **lockp)
5703 {
5704 struct spglist free;
5705 pd_entry_t *l2, old_l2;
5706 vm_page_t l2pg, mt;
5707 vm_page_t uwptpg;
5708
5709 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5710 KASSERT(ADDR_IS_CANONICAL(va),
5711 ("%s: Address not in canonical form: %lx", __func__, va));
5712 KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
5713 PMAP_ENTER_NORECLAIM,
5714 ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE"));
5715
5716 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
5717 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
5718 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
5719 va, pmap);
5720 return (KERN_RESOURCE_SHORTAGE);
5721 }
5722
5723 /*
5724 * If bti is not the same for the whole l2 range, return failure
5725 * and let vm_fault() cope. Check after l2 allocation, since
5726 * it could sleep.
5727 */
5728 if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) {
5729 KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
5730 pmap_abort_ptp(pmap, va, l2pg);
5731 return (KERN_PROTECTION_FAILURE);
5732 }
5733
5734 /*
5735 * If there are existing mappings, either abort or remove them.
5736 */
5737 if ((old_l2 = pmap_load(l2)) != 0) {
5738 KASSERT(l2pg == NULL || l2pg->ref_count > 1,
5739 ("pmap_enter_l2: l2pg's ref count is too low"));
5740 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5741 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5742 if (l2pg != NULL)
5743 l2pg->ref_count--;
5744 CTR2(KTR_PMAP,
5745 "pmap_enter_l2: no space for va %#lx"
5746 " in pmap %p", va, pmap);
5747 return (KERN_NO_SPACE);
5748 } else if (!ADDR_IS_KERNEL(va) ||
5749 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
5750 if (l2pg != NULL)
5751 l2pg->ref_count--;
5752 CTR2(KTR_PMAP,
5753 "pmap_enter_l2: failure for va %#lx"
5754 " in pmap %p", va, pmap);
5755 return (KERN_FAILURE);
5756 }
5757 }
5758 SLIST_INIT(&free);
5759 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5760 (void)pmap_remove_l2(pmap, l2, va,
5761 pmap_load(pmap_l1(pmap, va)), false, &free, lockp);
5762 } else {
5763 if (ADDR_IS_KERNEL(va)) {
5764 /*
5765 * Try to save the ptp in the trie
5766 * before any changes to mappings are
5767 * made. Abort on failure.
5768 */
5769 mt = PTE_TO_VM_PAGE(old_l2);
5770 if (pmap_insert_pt_page(pmap, mt, false,
5771 false)) {
5772 CTR1(KTR_PMAP,
5773 "pmap_enter_l2: cannot ins kern ptp va %#lx",
5774 va);
5775 return (KERN_RESOURCE_SHORTAGE);
5776 }
5777 /*
5778 * Both pmap_remove_l2() and
5779 * pmap_remove_l3_range() will zero fill
5780 * the L3 kernel page table page.
5781 */
5782 }
5783 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
5784 &free, lockp);
5785 if (ADDR_IS_KERNEL(va)) {
5786 /*
5787 * The TLB could have an intermediate
5788 * entry for the L3 kernel page table
5789 * page, so request an invalidation at
5790 * all levels after clearing the
5791 * L2_TABLE entry.
5792 */
5793 pmap_clear(l2);
5794 pmap_s1_invalidate_page(pmap, va, false);
5795 }
5796 }
5797 KASSERT(pmap_load(l2) == 0,
5798 ("pmap_enter_l2: non-zero L2 entry %p", l2));
5799 if (!ADDR_IS_KERNEL(va)) {
5800 vm_page_free_pages_toq(&free, true);
5801 } else {
5802 KASSERT(SLIST_EMPTY(&free),
5803 ("pmap_enter_l2: freed kernel page table page"));
5804 }
5805 }
5806
5807 /*
5808 * Allocate leaf ptpage for wired userspace pages.
5809 */
5810 uwptpg = NULL;
5811 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
5812 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5813 if (uwptpg == NULL) {
5814 pmap_abort_ptp(pmap, va, l2pg);
5815 return (KERN_RESOURCE_SHORTAGE);
5816 }
5817 uwptpg->pindex = pmap_l2_pindex(va);
5818 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
5819 vm_page_unwire_noq(uwptpg);
5820 vm_page_free(uwptpg);
5821 pmap_abort_ptp(pmap, va, l2pg);
5822 return (KERN_RESOURCE_SHORTAGE);
5823 }
5824 pmap_resident_count_inc(pmap, 1);
5825 uwptpg->ref_count = NL3PG;
5826 }
5827 if ((new_l2 & ATTR_SW_MANAGED) != 0) {
5828 /*
5829 * Abort this mapping if its PV entry could not be created.
5830 */
5831 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
5832 if (l2pg != NULL)
5833 pmap_abort_ptp(pmap, va, l2pg);
5834 else {
5835 KASSERT(ADDR_IS_KERNEL(va) &&
5836 (pmap_load(l2) & ATTR_DESCR_MASK) ==
5837 L2_TABLE,
5838 ("pmap_enter_l2: invalid kernel L2E"));
5839 mt = pmap_remove_pt_page(pmap, va);
5840 KASSERT(mt != NULL,
5841 ("pmap_enter_l2: missing kernel PTP"));
5842 }
5843 if (uwptpg != NULL) {
5844 mt = pmap_remove_pt_page(pmap, va);
5845 KASSERT(mt == uwptpg,
5846 ("removed pt page %p, expected %p", mt,
5847 uwptpg));
5848 pmap_resident_count_dec(pmap, 1);
5849 uwptpg->ref_count = 1;
5850 vm_page_unwire_noq(uwptpg);
5851 vm_page_free(uwptpg);
5852 }
5853 CTR2(KTR_PMAP,
5854 "pmap_enter_l2: failure for va %#lx in pmap %p",
5855 va, pmap);
5856 return (KERN_RESOURCE_SHORTAGE);
5857 }
5858 if ((new_l2 & ATTR_SW_DBM) != 0)
5859 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5860 vm_page_aflag_set(mt, PGA_WRITEABLE);
5861 }
5862
5863 /*
5864 * Increment counters.
5865 */
5866 if ((new_l2 & ATTR_SW_WIRED) != 0)
5867 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
5868 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
5869
5870 /*
5871 * Conditionally sync the icache. See pmap_enter() for details.
5872 */
5873 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
5874 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
5875 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
5876 cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
5877 L2_SIZE);
5878 }
5879
5880 /*
5881 * Map the superpage.
5882 */
5883 pmap_store(l2, new_l2);
5884 dsb(ishst);
5885
5886 counter_u64_add(pmap_l2_mappings, 1);
5887 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
5888 va, pmap);
5889
5890 return (KERN_SUCCESS);
5891 }
5892
5893 /*
5894 * Tries to create a read- and/or execute-only L3C page mapping. Returns
5895 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5896 * value.
5897 */
5898 static int
5899 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
5900 vm_prot_t prot, struct rwlock **lockp)
5901 {
5902 pt_entry_t l3e;
5903
5904 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5905 PMAP_ASSERT_STAGE1(pmap);
5906 KASSERT(ADDR_IS_CANONICAL(va),
5907 ("%s: Address not in canonical form: %lx", __func__, va));
5908
5909 l3e = VM_PAGE_TO_PTE(m) | pmap_sh_attr |
5910 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5911 ATTR_CONTIGUOUS | L3_PAGE;
5912 if ((m->oflags & VPO_UNMANAGED) == 0)
5913 l3e |= ATTR_SW_MANAGED;
5914 else
5915 l3e |= ATTR_AF;
5916 if ((prot & VM_PROT_EXECUTE) == 0 ||
5917 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5918 l3e |= ATTR_S1_XN;
5919 if (!ADDR_IS_KERNEL(va))
5920 l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5921 else
5922 l3e |= ATTR_S1_UXN;
5923 if (pmap != kernel_pmap)
5924 l3e |= ATTR_S1_nG;
5925 return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
5926 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
5927 }
5928
5929 static int
5930 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
5931 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
5932 {
5933 pd_entry_t *l2p, *pde;
5934 pt_entry_t *l3p, *tl3p;
5935 vm_page_t mt;
5936 vm_paddr_t pa;
5937 vm_pindex_t l2pindex;
5938 int lvl;
5939
5940 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5941 KASSERT((va & L3C_OFFSET) == 0,
5942 ("pmap_enter_l3c: va is not aligned"));
5943 KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
5944 ("pmap_enter_l3c: managed mapping within the clean submap"));
5945 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
5946 ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS"));
5947
5948 /*
5949 * If the L3 PTP is not resident, we attempt to create it here.
5950 */
5951 if (!ADDR_IS_KERNEL(va)) {
5952 /*
5953 * Were we given the correct L3 PTP? If so, we can simply
5954 * increment its ref count.
5955 */
5956 l2pindex = pmap_l2_pindex(va);
5957 if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
5958 (*ml3p)->ref_count += L3C_ENTRIES;
5959 } else {
5960 retry:
5961 /*
5962 * Get the L2 entry.
5963 */
5964 pde = pmap_pde(pmap, va, &lvl);
5965
5966 /*
5967 * If the L2 entry is a superpage, we either abort or
5968 * demote depending on the given flags.
5969 */
5970 if (lvl == 1) {
5971 l2p = pmap_l1_to_l2(pde, va);
5972 if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
5973 L2_BLOCK) {
5974 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5975 return (KERN_FAILURE);
5976 l3p = pmap_demote_l2_locked(pmap, l2p,
5977 va, lockp);
5978 if (l3p != NULL) {
5979 *ml3p = PTE_TO_VM_PAGE(
5980 pmap_load(l2p));
5981 (*ml3p)->ref_count +=
5982 L3C_ENTRIES;
5983 goto have_l3p;
5984 }
5985 }
5986 /* We need to allocate an L3 PTP. */
5987 }
5988
5989 /*
5990 * If the L3 PTP is mapped, we just increment its ref
5991 * count. Otherwise, we attempt to allocate it.
5992 */
5993 if (lvl == 2 && pmap_load(pde) != 0) {
5994 *ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
5995 (*ml3p)->ref_count += L3C_ENTRIES;
5996 } else {
5997 *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
5998 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
5999 if (*ml3p == NULL) {
6000 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
6001 return (KERN_FAILURE);
6002
6003 /*
6004 * The page table may have changed
6005 * while we slept.
6006 */
6007 goto retry;
6008 }
6009 (*ml3p)->ref_count += L3C_ENTRIES - 1;
6010 }
6011 }
6012 l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
6013 } else {
6014 *ml3p = NULL;
6015
6016 /*
6017 * If the L2 entry is a superpage, we either abort or demote
6018 * depending on the given flags.
6019 */
6020 pde = pmap_pde(kernel_pmap, va, &lvl);
6021 if (lvl == 1) {
6022 l2p = pmap_l1_to_l2(pde, va);
6023 KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
6024 ("pmap_enter_l3c: missing L2 block"));
6025 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
6026 return (KERN_FAILURE);
6027 l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
6028 } else {
6029 KASSERT(lvl == 2,
6030 ("pmap_enter_l3c: Invalid level %d", lvl));
6031 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
6032 pmap_load(pde)));
6033 }
6034 }
6035 have_l3p:
6036 l3p = &l3p[pmap_l3_index(va)];
6037
6038 /*
6039 * If bti is not the same for the whole L3C range, return failure
6040 * and let vm_fault() cope. Check after L3 allocation, since
6041 * it could sleep.
6042 */
6043 if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) {
6044 KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP"));
6045 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
6046 pmap_abort_ptp(pmap, va, *ml3p);
6047 *ml3p = NULL;
6048 return (KERN_PROTECTION_FAILURE);
6049 }
6050
6051 /*
6052 * If there are existing mappings, either abort or remove them.
6053 */
6054 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
6055 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6056 if (pmap_load(tl3p) != 0) {
6057 if (*ml3p != NULL)
6058 (*ml3p)->ref_count -= L3C_ENTRIES;
6059 return (KERN_FAILURE);
6060 }
6061 }
6062 } else {
6063 /*
6064 * Because we increment the L3 page's reference count above,
6065 * it is guaranteed not to be freed here and we can pass NULL
6066 * instead of a valid free list.
6067 */
6068 pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
6069 va + L3C_SIZE, NULL, lockp);
6070 }
6071
6072 /*
6073 * Enter on the PV list if part of our managed memory.
6074 */
6075 if ((l3e & ATTR_SW_MANAGED) != 0) {
6076 if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
6077 if (*ml3p != NULL) {
6078 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
6079 pmap_abort_ptp(pmap, va, *ml3p);
6080 *ml3p = NULL;
6081 }
6082 return (KERN_RESOURCE_SHORTAGE);
6083 }
6084 if ((l3e & ATTR_SW_DBM) != 0)
6085 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
6086 vm_page_aflag_set(mt, PGA_WRITEABLE);
6087 }
6088
6089 /*
6090 * Increment counters.
6091 */
6092 if ((l3e & ATTR_SW_WIRED) != 0)
6093 pmap->pm_stats.wired_count += L3C_ENTRIES;
6094 pmap_resident_count_inc(pmap, L3C_ENTRIES);
6095
6096 pa = VM_PAGE_TO_PHYS(m);
6097 KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));
6098
6099 /*
6100 * Sync the icache before the mapping is stored.
6101 */
6102 if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
6103 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
6104 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);
6105
6106 /*
6107 * Map the superpage.
6108 */
6109 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6110 pmap_store(tl3p, l3e);
6111 l3e += L3_SIZE;
6112 }
6113 dsb(ishst);
6114
6115 counter_u64_add(pmap_l3c_mappings, 1);
6116 CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
6117 va, pmap);
6118 return (KERN_SUCCESS);
6119 }
6120
6121 /*
6122 * Maps a sequence of resident pages belonging to the same object.
6123 * The sequence begins with the given page m_start. This page is
6124 * mapped at the given virtual address start. Each subsequent page is
6125 * mapped at a virtual address that is offset from start by the same
6126 * amount as the page is offset from m_start within the object. The
6127 * last page in the sequence is the page with the largest offset from
6128 * m_start that can be mapped at a virtual address less than the given
6129 * virtual address end. Not every virtual page between start and end
6130 * is mapped; only those for which a resident page exists with the
6131 * corresponding offset from m_start are mapped.
6132 */
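/*
 * Editor's note (illustrative example): if m_start->pindex is 8 and a
 * later resident page in the run has pindex 13, that page is mapped at
 * start + ptoa(13 - 8), provided this address is still below "end".
 * Offsets within [start, end) for which no resident page exists are
 * simply left unmapped.
 */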
6133 void
6134 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
6135 vm_page_t m_start, vm_prot_t prot)
6136 {
6137 struct pctrie_iter pages;
6138 struct rwlock *lock;
6139 vm_offset_t va;
6140 vm_page_t m, mpte;
6141 int rv;
6142
6143 VM_OBJECT_ASSERT_LOCKED(m_start->object);
6144
6145 mpte = NULL;
6146 vm_page_iter_limit_init(&pages, m_start->object,
6147 m_start->pindex + atop(end - start));
6148 m = vm_radix_iter_lookup(&pages, m_start->pindex);
6149 lock = NULL;
6150 PMAP_LOCK(pmap);
6151 while (m != NULL) {
6152 va = start + ptoa(m->pindex - m_start->pindex);
6153 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
6154 m->psind == 2 && pmap_ps_enabled(pmap) &&
6155 ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
6156 KERN_SUCCESS || rv == KERN_NO_SPACE)) {
6157 m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE);
6158 } else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
6159 m->psind >= 1 && pmap_ps_enabled(pmap) &&
6160 ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
6161 &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) {
6162 m = vm_radix_iter_jump(&pages, L3C_ENTRIES);
6163 } else {
6164 /*
6165 * In general, if a superpage mapping were possible,
6166 * it would have been created above. That said, if
6167 * start and end are not superpage aligned, then
6168 * promotion might be possible at the ends of [start,
6169 * end). However, in practice, those promotion
6170 * attempts are so unlikely to succeed that they are
6171 * not worth trying.
6172 */
6173 mpte = pmap_enter_quick_locked(pmap, va, m, prot |
6174 VM_PROT_NO_PROMOTE, mpte, &lock);
6175 m = vm_radix_iter_step(&pages);
6176 }
6177 }
6178 if (lock != NULL)
6179 rw_wunlock(lock);
6180 PMAP_UNLOCK(pmap);
6181 }
6182
6183 /*
6184 * this code makes some *MAJOR* assumptions:
6185 * 1. Current pmap & pmap exists.
6186 * 2. Not wired.
6187 * 3. Read access.
6188 * 4. No page table pages.
6189 * but is *MUCH* faster than pmap_enter...
6190 */
6191
6192 void
6193 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
6194 {
6195 struct rwlock *lock;
6196
6197 lock = NULL;
6198 PMAP_LOCK(pmap);
6199 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
6200 if (lock != NULL)
6201 rw_wunlock(lock);
6202 PMAP_UNLOCK(pmap);
6203 }
6204
6205 static vm_page_t
6206 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
6207 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
6208 {
6209 pt_entry_t *l1, *l2, *l3, l3_val;
6210 vm_paddr_t pa;
6211 int full_lvl, lvl;
6212
6213 KASSERT(!VA_IS_CLEANMAP(va) ||
6214 (m->oflags & VPO_UNMANAGED) != 0,
6215 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
6216 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6217 PMAP_ASSERT_STAGE1(pmap);
6218 KASSERT(ADDR_IS_CANONICAL(va),
6219 ("%s: Address not in canonical form: %lx", __func__, va));
6220 l2 = NULL;
6221
6222 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
6223 /*
6224 * In the case that a page table page is not
6225 * resident, we are creating it here.
6226 */
6227 if (!ADDR_IS_KERNEL(va)) {
6228 vm_pindex_t l2pindex;
6229
6230 /*
6231 * Calculate pagetable page index
6232 */
6233 l2pindex = pmap_l2_pindex(va);
6234 if (mpte && (mpte->pindex == l2pindex)) {
6235 mpte->ref_count++;
6236 } else {
6237 /*
6238 * If the page table page is mapped, we just increment
6239 * the hold count, and activate it. Otherwise, we
6240 * attempt to allocate a page table page, passing NULL
6241 * instead of the PV list lock pointer because we don't
6242 * intend to sleep. If this attempt fails, we don't
6243 * retry. Instead, we give up.
6244 */
6245 l1 = pmap_l1(pmap, va);
6246 if (l1 != NULL && pmap_load(l1) != 0) {
6247 if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
6248 L1_BLOCK)
6249 return (NULL);
6250 l2 = pmap_l1_to_l2(l1, va);
6251 if (pmap_load(l2) != 0) {
6252 if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
6253 L2_BLOCK)
6254 return (NULL);
6255 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
6256 mpte->ref_count++;
6257 } else {
6258 mpte = _pmap_alloc_l3(pmap, l2pindex,
6259 NULL);
6260 if (mpte == NULL)
6261 return (mpte);
6262 }
6263 } else {
6264 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
6265 if (mpte == NULL)
6266 return (mpte);
6267 }
6268 }
6269 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
6270 l3 = &l3[pmap_l3_index(va)];
6271 } else {
6272 mpte = NULL;
6273 l2 = pmap_pde(kernel_pmap, va, &lvl);
6274 KASSERT(l2 != NULL,
6275 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
6276 va));
6277 KASSERT(lvl == 2,
6278 ("pmap_enter_quick_locked: Invalid level %d", lvl));
6279 l3 = pmap_l2_to_l3(l2, va);
6280 }
6281
6282 /*
6283 * Abort if a mapping already exists.
6284 */
6285 if (pmap_load(l3) != 0) {
6286 if (mpte != NULL)
6287 mpte->ref_count--;
6288 return (NULL);
6289 }
6290
6291 /*
6292 * Enter on the PV list if part of our managed memory.
6293 */
6294 if ((m->oflags & VPO_UNMANAGED) == 0 &&
6295 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
6296 if (mpte != NULL)
6297 pmap_abort_ptp(pmap, va, mpte);
6298 return (NULL);
6299 }
6300
6301 /*
6302 * Increment counters
6303 */
6304 pmap_resident_count_inc(pmap, 1);
6305
6306 pa = VM_PAGE_TO_PHYS(m);
6307 l3_val = PHYS_TO_PTE(pa) | pmap_sh_attr |
6308 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
6309 l3_val |= pmap_pte_bti(pmap, va);
6310 if ((prot & VM_PROT_EXECUTE) == 0 ||
6311 m->md.pv_memattr == VM_MEMATTR_DEVICE)
6312 l3_val |= ATTR_S1_XN;
6313 if (!ADDR_IS_KERNEL(va))
6314 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
6315 else
6316 l3_val |= ATTR_S1_UXN;
6317 if (pmap != kernel_pmap)
6318 l3_val |= ATTR_S1_nG;
6319
6320 /*
6321 * Now validate mapping with RO protection
6322 */
6323 if ((m->oflags & VPO_UNMANAGED) == 0)
6324 l3_val |= ATTR_SW_MANAGED;
6325 else
6326 l3_val |= ATTR_AF;
6327
6328 /* Sync icache before the mapping is stored to PTE */
6329 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
6330 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
6331 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
6332
6333 pmap_store(l3, l3_val);
6334 dsb(ishst);
6335
6336 #if VM_NRESERVLEVEL > 0
6337 /*
6338 * First, attempt L3C promotion, if the virtual and physical addresses
6339 * are aligned with each other and an underlying reservation has the
6340 * neighboring L3 pages allocated. The first condition is simply an
6341 * optimization that recognizes some eventual promotion failures early
6342 * at a lower run-time cost. Then, attempt L2 promotion, if both a
6343 * level 1 reservation and the PTP are fully populated.
6344 */
6345 if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
6346 (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
6347 (m->flags & PG_FICTITIOUS) == 0 &&
6348 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
6349 pmap_promote_l3c(pmap, l3, va) &&
6350 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) {
6351 if (l2 == NULL)
6352 l2 = pmap_l2(pmap, va);
6353
6354 /*
6355 * If promotion succeeds, then the next call to this function
6356 * should not be given the unmapped PTP as a hint.
6357 */
6358 if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
6359 mpte = NULL;
6360 }
6361 #endif
6362
6363 return (mpte);
6364 }
6365
6366 /*
6367 * This code maps large physical mmap regions into the
6368 * processor address space. Note that some shortcuts
6369 * are taken, but the code works.
6370 */
6371 void
6372 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6373 vm_pindex_t pindex, vm_size_t size)
6374 {
6375
6376 VM_OBJECT_ASSERT_WLOCKED(object);
6377 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6378 ("pmap_object_init_pt: non-device object"));
6379 }
6380
6381 /*
6382 * Clear the wired attribute from the mappings for the specified range of
6383 * addresses in the given pmap. Every valid mapping within that range
6384 * must have the wired attribute set. In contrast, invalid mappings
6385 * cannot have the wired attribute set, so they are ignored.
6386 *
6387 * The wired attribute of the page table entry is not a hardware feature,
6388 * so there is no need to invalidate any TLB entries.
6389 */
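/*
 * Editor's sketch (assumption): a typical caller unwires an entire map
 * entry at once, for example
 *
 *	pmap_unwire(vm_map_pmap(map), entry->start, entry->end);
 *
 * Superpage mappings that only partially overlap [sva, eva) are demoted
 * below so that just the requested range loses ATTR_SW_WIRED.
 */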
6390 void
6391 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6392 {
6393 vm_offset_t va_next;
6394 pd_entry_t *l0, *l1, *l2;
6395 pt_entry_t *l3;
6396 bool partial_l3c;
6397
6398 PMAP_LOCK(pmap);
6399 for (; sva < eva; sva = va_next) {
6400 l0 = pmap_l0(pmap, sva);
6401 if (pmap_load(l0) == 0) {
6402 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6403 if (va_next < sva)
6404 va_next = eva;
6405 continue;
6406 }
6407
6408 l1 = pmap_l0_to_l1(l0, sva);
6409 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6410 if (va_next < sva)
6411 va_next = eva;
6412 if (pmap_load(l1) == 0)
6413 continue;
6414
6415 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6416 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6417 KASSERT(va_next <= eva,
6418 ("partial update of non-transparent 1G page "
6419 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
6420 pmap_load(l1), sva, eva, va_next));
6421 MPASS(pmap != kernel_pmap);
6422 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
6423 ATTR_SW_WIRED)) == ATTR_SW_WIRED);
6424 pmap_clear_bits(l1, ATTR_SW_WIRED);
6425 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
6426 continue;
6427 }
6428
6429 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6430 if (va_next < sva)
6431 va_next = eva;
6432
6433 l2 = pmap_l1_to_l2(l1, sva);
6434 if (pmap_load(l2) == 0)
6435 continue;
6436
6437 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
6438 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
6439 panic("pmap_unwire: l2 %#jx is missing "
6440 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
6441
6442 /*
6443 * Are we unwiring the entire large page? If not,
6444 * demote the mapping and fall through.
6445 */
6446 if (sva + L2_SIZE == va_next && eva >= va_next) {
6447 pmap_clear_bits(l2, ATTR_SW_WIRED);
6448 pmap->pm_stats.wired_count -= L2_SIZE /
6449 PAGE_SIZE;
6450 continue;
6451 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
6452 panic("pmap_unwire: demotion failed");
6453 }
6454 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6455 ("pmap_unwire: Invalid l2 entry after demotion"));
6456
6457 if (va_next > eva)
6458 va_next = eva;
6459 for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
6460 sva != va_next; l3++, sva += L3_SIZE) {
6461 if (pmap_load(l3) == 0)
6462 continue;
6463 if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
6464 /*
6465 * Avoid demotion for whole-page unwiring.
6466 */
6467 if ((sva & L3C_OFFSET) == 0) {
6468 /*
6469 * Handle the possibility that
6470 * "va_next" is zero because of
6471 * address wraparound.
6472 */
6473 partial_l3c = sva + L3C_OFFSET >
6474 va_next - 1;
6475 }
6476 if (partial_l3c)
6477 (void)pmap_demote_l3c(pmap, l3, sva);
6478 }
6479 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
6480 panic("pmap_unwire: l3 %#jx is missing "
6481 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
6482
6483 /*
6484 * ATTR_SW_WIRED must be cleared atomically. Although
6485 * the pmap lock synchronizes access to ATTR_SW_WIRED,
6486 * the System MMU may write to the entry concurrently.
6487 */
6488 pmap_clear_bits(l3, ATTR_SW_WIRED);
6489 pmap->pm_stats.wired_count--;
6490 }
6491 }
6492 PMAP_UNLOCK(pmap);
6493 }
6494
6495 /*
6496 * This function requires that the caller has already added one to ml3's
6497 * ref_count in anticipation of creating a 4KB page mapping.
6498 */
6499 static bool
6500 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
6501 vm_page_t ml3, struct rwlock **lockp)
6502 {
6503 pt_entry_t *tl3p;
6504
6505 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6506 KASSERT((va & L3C_OFFSET) == 0,
6507 ("pmap_copy_l3c: va is not aligned"));
6508 KASSERT((l3e & ATTR_SW_MANAGED) != 0,
6509 ("pmap_copy_l3c: l3e is not managed"));
6510
6511 /*
6512 * Abort if a mapping already exists.
6513 */
6514 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
6515 if (pmap_load(tl3p) != 0) {
6516 if (ml3 != NULL)
6517 ml3->ref_count--;
6518 return (false);
6519 }
6520
6521 if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
6522 if (ml3 != NULL)
6523 pmap_abort_ptp(pmap, va, ml3);
6524 return (false);
6525 }
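/*
 * Editor's note (descriptive): the caller has already contributed one
 * reference (see the comment above this function), so adding
 * L3C_ENTRIES - 1 leaves the PTP holding one reference per L3 entry
 * created below.
 */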
6526 ml3->ref_count += L3C_ENTRIES - 1;
6527
6528 /*
6529 * Clear the wired and accessed bits. However, leave the dirty bit
6530 * unchanged because read/write superpage mappings are required to be
6531 * dirty.
6532 */
6533 l3e &= ~(ATTR_SW_WIRED | ATTR_AF);
6534
6535 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6536 pmap_store(tl3p, l3e);
6537 l3e += L3_SIZE;
6538 }
6539 pmap_resident_count_inc(pmap, L3C_ENTRIES);
6540 counter_u64_add(pmap_l3c_mappings, 1);
6541 CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
6542 va, pmap);
6543 return (true);
6544 }
6545
6546 /*
6547 * Copy the range specified by src_addr/len
6548 * from the source map to the range dst_addr/len
6549 * in the destination map.
6550 *
6551 * This routine is only advisory and need not do anything.
6552 *
6553 * Because the executable mappings created by this routine are copied,
6554 * it should not have to flush the instruction cache.
6555 */
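/*
 * Editor's sketch (assumption): the intended use is fork-style address
 * space duplication, copying a range to the same address in the child,
 * for example
 *
 *	pmap_copy(vm_map_pmap(dst_map), vm_map_pmap(src_map),
 *	    entry->start, entry->end - entry->start, entry->start);
 *
 * Because the routine is advisory, callers must tolerate any subset of
 * the mappings (including none) actually being copied.
 */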
6556 void
6557 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6558 vm_offset_t src_addr)
6559 {
6560 struct rwlock *lock;
6561 pd_entry_t *l0, *l1, *l2, srcptepaddr;
6562 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
6563 vm_offset_t addr, end_addr, va_next;
6564 vm_page_t dst_m, dstmpte, srcmpte;
6565
6566 PMAP_ASSERT_STAGE1(dst_pmap);
6567 PMAP_ASSERT_STAGE1(src_pmap);
6568
6569 if (dst_addr != src_addr)
6570 return;
6571 end_addr = src_addr + len;
6572 lock = NULL;
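/*
 * Editor's note (descriptive): the two pmap locks are always taken in
 * a fixed (address) order so that concurrent copies involving the same
 * pair of pmaps cannot deadlock against each other.
 */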
6573 if (dst_pmap < src_pmap) {
6574 PMAP_LOCK(dst_pmap);
6575 PMAP_LOCK(src_pmap);
6576 } else {
6577 PMAP_LOCK(src_pmap);
6578 PMAP_LOCK(dst_pmap);
6579 }
6580 for (addr = src_addr; addr < end_addr; addr = va_next) {
6581 l0 = pmap_l0(src_pmap, addr);
6582 if (pmap_load(l0) == 0) {
6583 va_next = (addr + L0_SIZE) & ~L0_OFFSET;
6584 if (va_next < addr)
6585 va_next = end_addr;
6586 continue;
6587 }
6588
6589 va_next = (addr + L1_SIZE) & ~L1_OFFSET;
6590 if (va_next < addr)
6591 va_next = end_addr;
6592 l1 = pmap_l0_to_l1(l0, addr);
6593 if (pmap_load(l1) == 0)
6594 continue;
6595 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6596 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6597 KASSERT(va_next <= end_addr,
6598 ("partial update of non-transparent 1G page "
6599 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6600 pmap_load(l1), addr, end_addr, va_next));
6601 srcptepaddr = pmap_load(l1);
6602 l1 = pmap_l1(dst_pmap, addr);
6603 if (l1 == NULL) {
6604 if (_pmap_alloc_l3(dst_pmap,
6605 pmap_l0_pindex(addr), NULL) == NULL)
6606 break;
6607 l1 = pmap_l1(dst_pmap, addr);
6608 } else {
6609 l0 = pmap_l0(dst_pmap, addr);
6610 dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
6611 dst_m->ref_count++;
6612 }
6613 KASSERT(pmap_load(l1) == 0,
6614 ("1G mapping present in dst pmap "
6615 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6616 pmap_load(l1), addr, end_addr, va_next));
6617 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
6618 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
6619 continue;
6620 }
6621
6622 va_next = (addr + L2_SIZE) & ~L2_OFFSET;
6623 if (va_next < addr)
6624 va_next = end_addr;
6625 l2 = pmap_l1_to_l2(l1, addr);
6626 srcptepaddr = pmap_load(l2);
6627 if (srcptepaddr == 0)
6628 continue;
6629 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
6630 /*
6631 * We can only virtual copy whole superpages.
6632 */
6633 if ((addr & L2_OFFSET) != 0 ||
6634 addr + L2_SIZE > end_addr)
6635 continue;
6636 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
6637 if (l2 == NULL)
6638 break;
6639 if (pmap_load(l2) == 0 &&
6640 ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
6641 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
6642 PMAP_ENTER_NORECLAIM, &lock))) {
6643 /*
6644 * We leave the dirty bit unchanged because
6645 * managed read/write superpage mappings are
6646 * required to be dirty. However, managed
6647 * superpage mappings are not required to
6648 * have their accessed bit set, so we clear
6649 * it because we don't know if this mapping
6650 * will be used.
6651 */
6652 srcptepaddr &= ~ATTR_SW_WIRED;
6653 if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
6654 srcptepaddr &= ~ATTR_AF;
6655 pmap_store(l2, srcptepaddr);
6656 pmap_resident_count_inc(dst_pmap, L2_SIZE /
6657 PAGE_SIZE);
6658 counter_u64_add(pmap_l2_mappings, 1);
6659 } else
6660 pmap_abort_ptp(dst_pmap, addr, dst_m);
6661 continue;
6662 }
6663 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
6664 ("pmap_copy: invalid L2 entry"));
6665 srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
6666 KASSERT(srcmpte->ref_count > 0,
6667 ("pmap_copy: source page table page is unused"));
6668 if (va_next > end_addr)
6669 va_next = end_addr;
6670 src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
6671 src_pte = &src_pte[pmap_l3_index(addr)];
6672 dstmpte = NULL;
6673 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6674 ptetemp = pmap_load(src_pte);
6675
6676 /*
6677 * We only virtual copy managed pages.
6678 */
6679 if ((ptetemp & ATTR_SW_MANAGED) == 0)
6680 continue;
6681
6682 if (dstmpte != NULL) {
6683 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
6684 ("dstmpte pindex/addr mismatch"));
6685 dstmpte->ref_count++;
6686 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
6687 NULL)) == NULL)
6688 goto out;
6689 dst_pte = (pt_entry_t *)
6690 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
6691 dst_pte = &dst_pte[pmap_l3_index(addr)];
6692 if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
6693 L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
6694 va_next - 1) {
6695 if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
6696 ptetemp, dstmpte, &lock))
6697 goto out;
6698 addr += L3C_SIZE - PAGE_SIZE;
6699 src_pte += L3C_ENTRIES - 1;
6700 } else if (pmap_load(dst_pte) == 0 &&
6701 pmap_try_insert_pv_entry(dst_pmap, addr,
6702 PTE_TO_VM_PAGE(ptetemp), &lock)) {
6703 /*
6704 * Clear the wired, contiguous, modified, and
6705 * accessed bits from the destination PTE.
6706 * The contiguous bit is cleared because we
6707 * are not copying the entire L3C superpage.
6708 */
6709 mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
6710 ATTR_AF;
6711 nbits = 0;
6712 if ((ptetemp & ATTR_SW_DBM) != 0)
6713 nbits |= ATTR_S1_AP_RW_BIT;
6714 pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
6715 pmap_resident_count_inc(dst_pmap, 1);
6716 } else {
6717 pmap_abort_ptp(dst_pmap, addr, dstmpte);
6718 goto out;
6719 }
6720 /* Have we copied all of the valid mappings? */
6721 if (dstmpte->ref_count >= srcmpte->ref_count)
6722 break;
6723 }
6724 }
6725 out:
6726 /*
6727 * XXX This barrier may not be needed because the destination pmap is
6728 * not active.
6729 */
6730 dsb(ishst);
6731
6732 if (lock != NULL)
6733 rw_wunlock(lock);
6734 PMAP_UNLOCK(src_pmap);
6735 PMAP_UNLOCK(dst_pmap);
6736 }
6737
6738 int
6739 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6740 {
6741 int error;
6742
6743 if (dst_pmap->pm_stage != src_pmap->pm_stage)
6744 return (EINVAL);
6745
6746 if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6747 return (0);
6748
6749 for (;;) {
6750 if (dst_pmap < src_pmap) {
6751 PMAP_LOCK(dst_pmap);
6752 PMAP_LOCK(src_pmap);
6753 } else {
6754 PMAP_LOCK(src_pmap);
6755 PMAP_LOCK(dst_pmap);
6756 }
6757 error = pmap_bti_copy(dst_pmap, src_pmap);
6758 /* Clean up partial copy on failure due to no memory. */
6759 if (error == ENOMEM)
6760 pmap_bti_deassign_all(dst_pmap);
6761 PMAP_UNLOCK(src_pmap);
6762 PMAP_UNLOCK(dst_pmap);
6763 if (error != ENOMEM)
6764 break;
6765 vm_wait(NULL);
6766 }
6767 return (error);
6768 }
6769
6770 /*
6771 * pmap_zero_page zeros the specified hardware page, accessing it
6772 * through the direct map.
6773 */
6774 void
6775 pmap_zero_page(vm_page_t m)
6776 {
6777 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6778
6779 pagezero((void *)va);
6780 }
6781
6782 /*
6783 * pmap_zero_page_area zeros the specified area of a hardware page,
6784 * accessing it through the direct map.
6785 *
6786 * off and size may not cover an area beyond a single hardware page.
6787 */
6788 void
6789 pmap_zero_page_area(vm_page_t m, int off, int size)
6790 {
6791 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6792
6793 if (off == 0 && size == PAGE_SIZE)
6794 pagezero((void *)va);
6795 else
6796 bzero((char *)va + off, size);
6797 }
6798
6799 /*
6800 * pmap_copy_page copies the specified (machine independent)
6801 * page by mapping the page into virtual memory and using
6802 * bcopy to copy the page, one machine dependent page at a
6803 * time.
6804 */
6805 void
6806 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6807 {
6808 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6809 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6810
6811 pagecopy((void *)src, (void *)dst);
6812 }
6813
6814 int unmapped_buf_allowed = 1;
6815
6816 void
6817 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6818 vm_offset_t b_offset, int xfersize)
6819 {
6820 void *a_cp, *b_cp;
6821 vm_page_t m_a, m_b;
6822 vm_paddr_t p_a, p_b;
6823 vm_offset_t a_pg_offset, b_pg_offset;
6824 int cnt;
6825
6826 while (xfersize > 0) {
6827 a_pg_offset = a_offset & PAGE_MASK;
6828 m_a = ma[a_offset >> PAGE_SHIFT];
6829 p_a = m_a->phys_addr;
6830 b_pg_offset = b_offset & PAGE_MASK;
6831 m_b = mb[b_offset >> PAGE_SHIFT];
6832 p_b = m_b->phys_addr;
6833 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6834 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6835 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6836 panic("!DMAP a %lx", p_a);
6837 } else {
6838 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6839 }
6840 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
6841 panic("!DMAP b %lx", p_b);
6842 } else {
6843 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
6844 }
6845 bcopy(a_cp, b_cp, cnt);
6846 a_offset += cnt;
6847 b_offset += cnt;
6848 xfersize -= cnt;
6849 }
6850 }
6851
6852 vm_offset_t
6853 pmap_quick_enter_page(vm_page_t m)
6854 {
6855
6856 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
6857 }
6858
6859 void
6860 pmap_quick_remove_page(vm_offset_t addr)
6861 {
6862 }
6863
6864 /*
6865 * Returns true if the pmap's pv is one of the first
6866 * 16 pvs linked to from this page. This count may
6867 * be changed upwards or downwards in the future; it
6868 * is only necessary that true be returned for a small
6869 * subset of pmaps for proper page aging.
6870 */
6871 bool
6872 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6873 {
6874 struct md_page *pvh;
6875 struct rwlock *lock;
6876 pv_entry_t pv;
6877 int loops = 0;
6878 bool rv;
6879
6880 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6881 ("pmap_page_exists_quick: page %p is not managed", m));
6882 rv = false;
6883 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6884 rw_rlock(lock);
6885 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6886 if (PV_PMAP(pv) == pmap) {
6887 rv = true;
6888 break;
6889 }
6890 loops++;
6891 if (loops >= 16)
6892 break;
6893 }
6894 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6895 pvh = page_to_pvh(m);
6896 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6897 if (PV_PMAP(pv) == pmap) {
6898 rv = true;
6899 break;
6900 }
6901 loops++;
6902 if (loops >= 16)
6903 break;
6904 }
6905 }
6906 rw_runlock(lock);
6907 return (rv);
6908 }
6909
6910 /*
6911 * pmap_page_wired_mappings:
6912 *
6913 * Return the number of managed mappings to the given physical page
6914 * that are wired.
6915 */
6916 int
6917 pmap_page_wired_mappings(vm_page_t m)
6918 {
6919 struct rwlock *lock;
6920 struct md_page *pvh;
6921 pmap_t pmap;
6922 pt_entry_t *pte;
6923 pv_entry_t pv;
6924 int count, md_gen, pvh_gen;
6925
6926 if ((m->oflags & VPO_UNMANAGED) != 0)
6927 return (0);
6928 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6929 rw_rlock(lock);
6930 restart:
6931 count = 0;
6932 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6933 pmap = PV_PMAP(pv);
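/*
 * Editor's note (descriptive): if the pmap lock cannot be acquired
 * without blocking, the PV list lock is dropped and retaken around
 * the blocking PMAP_LOCK().  A change in the page's pv_gen indicates
 * that the PV list may have been modified meanwhile, so the scan
 * restarts.
 */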
6934 if (!PMAP_TRYLOCK(pmap)) {
6935 md_gen = m->md.pv_gen;
6936 rw_runlock(lock);
6937 PMAP_LOCK(pmap);
6938 rw_rlock(lock);
6939 if (md_gen != m->md.pv_gen) {
6940 PMAP_UNLOCK(pmap);
6941 goto restart;
6942 }
6943 }
6944 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6945 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6946 count++;
6947 PMAP_UNLOCK(pmap);
6948 }
6949 if ((m->flags & PG_FICTITIOUS) == 0) {
6950 pvh = page_to_pvh(m);
6951 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6952 pmap = PV_PMAP(pv);
6953 if (!PMAP_TRYLOCK(pmap)) {
6954 md_gen = m->md.pv_gen;
6955 pvh_gen = pvh->pv_gen;
6956 rw_runlock(lock);
6957 PMAP_LOCK(pmap);
6958 rw_rlock(lock);
6959 if (md_gen != m->md.pv_gen ||
6960 pvh_gen != pvh->pv_gen) {
6961 PMAP_UNLOCK(pmap);
6962 goto restart;
6963 }
6964 }
6965 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6966 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6967 count++;
6968 PMAP_UNLOCK(pmap);
6969 }
6970 }
6971 rw_runlock(lock);
6972 return (count);
6973 }
6974
6975 /*
6976 * Returns true if the given page is mapped individually or as part of
6977 * a 2mpage. Otherwise, returns false.
6978 */
6979 bool
6980 pmap_page_is_mapped(vm_page_t m)
6981 {
6982 struct rwlock *lock;
6983 bool rv;
6984
6985 if ((m->oflags & VPO_UNMANAGED) != 0)
6986 return (false);
6987 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6988 rw_rlock(lock);
6989 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6990 ((m->flags & PG_FICTITIOUS) == 0 &&
6991 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
6992 rw_runlock(lock);
6993 return (rv);
6994 }
6995
6996 /*
6997 * Destroy all managed, non-wired mappings in the given user-space
6998 * pmap. This pmap cannot be active on any processor besides the
6999 * caller.
7000 *
7001 * This function cannot be applied to the kernel pmap. Moreover, it
7002 * is not intended for general use. It is only to be used during
7003 * process termination. Consequently, it can be implemented in ways
7004 * that make it faster than pmap_remove(). First, it can more quickly
7005 * destroy mappings by iterating over the pmap's collection of PV
7006 * entries, rather than searching the page table. Second, it doesn't
7007 * have to test and clear the page table entries atomically, because
7008 * no processor is currently accessing the user address space. In
7009 * particular, a page table entry's dirty bit won't change state once
7010 * this function starts.
7011 */
7012 void
7013 pmap_remove_pages(pmap_t pmap)
7014 {
7015 pd_entry_t *pde;
7016 pt_entry_t *pte, tpte;
7017 struct spglist free;
7018 struct pv_chunklist free_chunks[PMAP_MEMDOM];
7019 vm_page_t m, ml3, mt;
7020 pv_entry_t pv;
7021 struct md_page *pvh;
7022 struct pv_chunk *pc, *npc;
7023 struct rwlock *lock;
7024 int64_t bit;
7025 uint64_t inuse, bitmask;
7026 int allfree, field, i, idx, lvl;
7027 int freed __pvused;
7028 vm_paddr_t pa;
7029
7030 lock = NULL;
7031
7032 for (i = 0; i < PMAP_MEMDOM; i++)
7033 TAILQ_INIT(&free_chunks[i]);
7034 SLIST_INIT(&free);
7035 PMAP_LOCK(pmap);
7036 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
7037 allfree = 1;
7038 freed = 0;
7039 for (field = 0; field < _NPCM; field++) {
7040 inuse = ~pc->pc_map[field] & pc_freemask[field];
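			/*
			 * Each clear bit in pc_map marks an allocated PV
			 * entry, so "inuse" is a bitmap of the entries to
			 * visit.  The pc_pventry[] index is recovered below
			 * as field * 64 + bit; e.g., field 1, bit 3 selects
			 * pc_pventry[67].
			 */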
7041 while (inuse != 0) {
7042 bit = ffsl(inuse) - 1;
7043 bitmask = 1UL << bit;
7044 idx = field * 64 + bit;
7045 pv = &pc->pc_pventry[idx];
7046 inuse &= ~bitmask;
7047
7048 pde = pmap_pde(pmap, pv->pv_va, &lvl);
7049 KASSERT(pde != NULL,
7050 ("Attempting to remove an unmapped page"));
7051
7052 switch (lvl) {
7053 case 1:
7054 pte = pmap_l1_to_l2(pde, pv->pv_va);
7055 tpte = pmap_load(pte);
7056 KASSERT((tpte & ATTR_DESCR_MASK) ==
7057 L2_BLOCK,
7058 ("Attempting to remove an invalid "
7059 "block: %lx", tpte));
7060 break;
7061 case 2:
7062 pte = pmap_l2_to_l3(pde, pv->pv_va);
7063 tpte = pmap_load(pte);
7064 KASSERT((tpte & ATTR_DESCR_MASK) ==
7065 L3_PAGE,
7066 ("Attempting to remove an invalid "
7067 "page: %lx", tpte));
7068 break;
7069 default:
7070 panic(
7071 "Invalid page directory level: %d",
7072 lvl);
7073 }
7074
7075 /*
7076 * We cannot remove wired mappings at this time.
7077 *
7078 * For L3C superpages, all of the constituent PTEs
7079 * should have the wired bit set, so we don't
7080 * check for ATTR_CONTIGUOUS here.
7081 */
7082 if (tpte & ATTR_SW_WIRED) {
7083 allfree = 0;
7084 continue;
7085 }
7086
7087 /* Mark free */
7088 pc->pc_map[field] |= bitmask;
7089
7090 /*
7091 * Because this pmap is not active on other
7092 * processors, the dirty bit cannot have
7093 * changed state since we last loaded pte.
7094 */
7095 pmap_clear(pte);
7096
7097 pa = PTE_TO_PHYS(tpte);
7098
7099 m = PHYS_TO_VM_PAGE(pa);
7100 KASSERT(m->phys_addr == pa,
7101 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
7102 m, (uintmax_t)m->phys_addr,
7103 (uintmax_t)tpte));
7104
7105 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
7106 m < &vm_page_array[vm_page_array_size],
7107 ("pmap_remove_pages: bad pte %#jx",
7108 (uintmax_t)tpte));
7109
7110 /*
7111 * Update the vm_page_t clean/reference bits.
7112 *
7113 * We don't check for ATTR_CONTIGUOUS here
7114 * because writeable L3C superpages are expected
7115 * to be dirty, i.e., every constituent PTE
7116 * should be dirty.
7117 */
7118 if (pmap_pte_dirty(pmap, tpte)) {
7119 switch (lvl) {
7120 case 1:
7121 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
7122 vm_page_dirty(mt);
7123 break;
7124 case 2:
7125 vm_page_dirty(m);
7126 break;
7127 }
7128 }
7129
7130 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
7131
7132 switch (lvl) {
7133 case 1:
7134 pmap_resident_count_dec(pmap,
7135 L2_SIZE / PAGE_SIZE);
7136 pvh = page_to_pvh(m);
7137 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7138 pvh->pv_gen++;
7139 if (TAILQ_EMPTY(&pvh->pv_list)) {
7140 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
7141 if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
7142 TAILQ_EMPTY(&mt->md.pv_list))
7143 vm_page_aflag_clear(mt, PGA_WRITEABLE);
7144 }
7145 ml3 = pmap_remove_pt_page(pmap,
7146 pv->pv_va);
7147 if (ml3 != NULL) {
7148 KASSERT(vm_page_any_valid(ml3),
7149 ("pmap_remove_pages: l3 page not promoted"));
7150 pmap_resident_count_dec(pmap, 1);
7151 KASSERT(ml3->ref_count == NL3PG,
7152 ("pmap_remove_pages: l3 page ref count error"));
7153 ml3->ref_count = 0;
7154 pmap_add_delayed_free_list(ml3,
7155 &free, false);
7156 }
7157 break;
7158 case 2:
7159 pmap_resident_count_dec(pmap, 1);
7160 TAILQ_REMOVE(&m->md.pv_list, pv,
7161 pv_next);
7162 m->md.pv_gen++;
7163 if ((m->a.flags & PGA_WRITEABLE) != 0 &&
7164 TAILQ_EMPTY(&m->md.pv_list) &&
7165 (m->flags & PG_FICTITIOUS) == 0) {
7166 pvh = page_to_pvh(m);
7167 if (TAILQ_EMPTY(&pvh->pv_list))
7168 vm_page_aflag_clear(m,
7169 PGA_WRITEABLE);
7170 }
7171 break;
7172 }
7173 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
7174 &free);
7175 freed++;
7176 }
7177 }
7178 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
7179 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
7180 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
7181 if (allfree) {
7182 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
7183 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
7184 pc_list);
7185 }
7186 }
7187 if (lock != NULL)
7188 rw_wunlock(lock);
7189 pmap_invalidate_all(pmap);
7190 pmap_bti_deassign_all(pmap);
7191 free_pv_chunk_batch(free_chunks);
7192 PMAP_UNLOCK(pmap);
7193 vm_page_free_pages_toq(&free, true);
7194 }
7195
7196 /*
7197 * This is used to check if a page has been accessed or modified.
7198 */
7199 static bool
7200 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
7201 {
7202 struct rwlock *lock;
7203 pv_entry_t pv;
7204 struct md_page *pvh;
7205 pt_entry_t l3e, mask, *pte, value;
7206 pmap_t pmap;
7207 int md_gen, pvh_gen;
7208 bool rv;
7209
7210 rv = false;
7211 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7212 rw_rlock(lock);
7213 restart:
7214 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7215 pmap = PV_PMAP(pv);
7216 PMAP_ASSERT_STAGE1(pmap);
7217 if (!PMAP_TRYLOCK(pmap)) {
7218 md_gen = m->md.pv_gen;
7219 rw_runlock(lock);
7220 PMAP_LOCK(pmap);
7221 rw_rlock(lock);
7222 if (md_gen != m->md.pv_gen) {
7223 PMAP_UNLOCK(pmap);
7224 goto restart;
7225 }
7226 }
7227 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7228 mask = 0;
7229 value = 0;
7230 if (modified) {
7231 mask |= ATTR_S1_AP_RW_BIT;
7232 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
7233 }
7234 if (accessed) {
7235 mask |= ATTR_AF | ATTR_DESCR_MASK;
7236 value |= ATTR_AF | L3_PAGE;
7237 }
7238 l3e = pmap_load(pte);
7239 if ((l3e & ATTR_CONTIGUOUS) != 0)
7240 l3e = pmap_load_l3c(pte);
7241 PMAP_UNLOCK(pmap);
7242 rv = (l3e & mask) == value;
7243 if (rv)
7244 goto out;
7245 }
7246 if ((m->flags & PG_FICTITIOUS) == 0) {
7247 pvh = page_to_pvh(m);
7248 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
7249 pmap = PV_PMAP(pv);
7250 PMAP_ASSERT_STAGE1(pmap);
7251 if (!PMAP_TRYLOCK(pmap)) {
7252 md_gen = m->md.pv_gen;
7253 pvh_gen = pvh->pv_gen;
7254 rw_runlock(lock);
7255 PMAP_LOCK(pmap);
7256 rw_rlock(lock);
7257 if (md_gen != m->md.pv_gen ||
7258 pvh_gen != pvh->pv_gen) {
7259 PMAP_UNLOCK(pmap);
7260 goto restart;
7261 }
7262 }
7263 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
7264 mask = 0;
7265 value = 0;
7266 if (modified) {
7267 mask |= ATTR_S1_AP_RW_BIT;
7268 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
7269 }
7270 if (accessed) {
7271 mask |= ATTR_AF | ATTR_DESCR_MASK;
7272 value |= ATTR_AF | L2_BLOCK;
7273 }
7274 rv = (pmap_load(pte) & mask) == value;
7275 PMAP_UNLOCK(pmap);
7276 if (rv)
7277 goto out;
7278 }
7279 }
7280 out:
7281 rw_runlock(lock);
7282 return (rv);
7283 }
7284
7285 /*
7286 * pmap_is_modified:
7287 *
7288 * Return whether or not the specified physical page was modified
7289 * in any physical maps.
7290 */
7291 bool
7292 pmap_is_modified(vm_page_t m)
7293 {
7294
7295 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7296 ("pmap_is_modified: page %p is not managed", m));
7297
7298 /*
7299 * If the page is not busied then this check is racy.
7300 */
7301 if (!pmap_page_is_write_mapped(m))
7302 return (false);
7303 return (pmap_page_test_mappings(m, false, true));
7304 }
7305
7306 /*
7307 * pmap_is_prefaultable:
7308 *
7309 * Return whether or not the specified virtual address is eligible
7310 * for prefault.
7311 */
7312 bool
7313 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
7314 {
7315 pd_entry_t *pde;
7316 pt_entry_t *pte;
7317 bool rv;
7318 int lvl;
7319
7320 /*
7321 * Return true if and only if the L3 entry for the specified virtual
7322 * address is allocated but invalid.
7323 */
7324 rv = false;
7325 PMAP_LOCK(pmap);
7326 pde = pmap_pde(pmap, addr, &lvl);
7327 if (pde != NULL && lvl == 2) {
7328 pte = pmap_l2_to_l3(pde, addr);
7329 rv = pmap_load(pte) == 0;
7330 }
7331 PMAP_UNLOCK(pmap);
7332 return (rv);
7333 }
7334
7335 /*
7336 * pmap_is_referenced:
7337 *
7338 * Return whether or not the specified physical page was referenced
7339 * in any physical maps.
7340 */
7341 bool
7342 pmap_is_referenced(vm_page_t m)
7343 {
7344
7345 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7346 ("pmap_is_referenced: page %p is not managed", m));
7347 return (pmap_page_test_mappings(m, true, false));
7348 }
7349
7350 /*
7351 * Clear the write and modified bits in each of the given page's mappings.
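 *
 * The first loop below demotes any writable 2MB mappings found on the
 * page's superpage pv list; the second loop then write-protects the
 * remaining 4KB mappings (demoting L3C superpages as needed) and
 * transfers any dirty state to the vm_page.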
7352 */
7353 void
7354 pmap_remove_write(vm_page_t m)
7355 {
7356 struct md_page *pvh;
7357 pmap_t pmap;
7358 struct rwlock *lock;
7359 pv_entry_t next_pv, pv;
7360 pt_entry_t oldpte, *pte, set, clear, mask, val;
7361 vm_offset_t va;
7362 int md_gen, pvh_gen;
7363
7364 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7365 ("pmap_remove_write: page %p is not managed", m));
7366 vm_page_assert_busied(m);
7367
7368 if (!pmap_page_is_write_mapped(m))
7369 return;
7370 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7371 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7372 rw_wlock(lock);
7373 retry:
7374 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7375 pmap = PV_PMAP(pv);
7376 PMAP_ASSERT_STAGE1(pmap);
7377 if (!PMAP_TRYLOCK(pmap)) {
7378 pvh_gen = pvh->pv_gen;
7379 rw_wunlock(lock);
7380 PMAP_LOCK(pmap);
7381 rw_wlock(lock);
7382 if (pvh_gen != pvh->pv_gen) {
7383 PMAP_UNLOCK(pmap);
7384 goto retry;
7385 }
7386 }
7387 va = pv->pv_va;
7388 pte = pmap_pte_exists(pmap, va, 2, __func__);
7389 if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
7390 (void)pmap_demote_l2_locked(pmap, pte, va, &lock);
7391 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7392 ("inconsistent pv lock %p %p for page %p",
7393 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7394 PMAP_UNLOCK(pmap);
7395 }
7396 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7397 pmap = PV_PMAP(pv);
7398 if (!PMAP_TRYLOCK(pmap)) {
7399 pvh_gen = pvh->pv_gen;
7400 md_gen = m->md.pv_gen;
7401 rw_wunlock(lock);
7402 PMAP_LOCK(pmap);
7403 rw_wlock(lock);
7404 if (pvh_gen != pvh->pv_gen ||
7405 md_gen != m->md.pv_gen) {
7406 PMAP_UNLOCK(pmap);
7407 goto retry;
7408 }
7409 }
7410 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7411 oldpte = pmap_load(pte);
7412 if ((oldpte & ATTR_SW_DBM) != 0) {
7413 if ((oldpte & ATTR_CONTIGUOUS) != 0) {
7414 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
7415
7416 /*
7417 * The L3 entry's accessed bit may have
7418 * changed.
7419 */
7420 oldpte = pmap_load(pte);
7421 }
7422 if (pmap->pm_stage == PM_STAGE1) {
7423 set = ATTR_S1_AP_RW_BIT;
7424 clear = 0;
7425 mask = ATTR_S1_AP_RW_BIT;
7426 val = ATTR_S1_AP(ATTR_S1_AP_RW);
7427 } else {
7428 set = 0;
7429 clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7430 mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7431 val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7432 }
7433 clear |= ATTR_SW_DBM;
7434 while (!atomic_fcmpset_64(pte, &oldpte,
7435 (oldpte | set) & ~clear))
7436 cpu_spinwait();
7437
7438 if ((oldpte & mask) == val)
7439 vm_page_dirty(m);
7440 pmap_invalidate_page(pmap, pv->pv_va, true);
7441 }
7442 PMAP_UNLOCK(pmap);
7443 }
7444 rw_wunlock(lock);
7445 vm_page_aflag_clear(m, PGA_WRITEABLE);
7446 }
7447
7448 /*
7449 * pmap_ts_referenced:
7450 *
7451 * Return a count of reference bits for a page, clearing those bits.
7452 * It is not necessary for every reference bit to be cleared, but it
7453 * is necessary that 0 only be returned when there are truly no
7454 * reference bits set.
7455 *
7456 * As an optimization, update the page's dirty field if a modified bit is
7457 * found while counting reference bits. This opportunistic update can be
7458 * performed at low cost and can eliminate the need for some future calls
7459 * to pmap_is_modified(). However, since this function stops after
7460 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7461 * dirty pages. Those dirty pages will only be detected by a future call
7462 * to pmap_is_modified().
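 *
 * The 2MB mappings on the page's superpage pv list are scanned before the
 * 4KB mappings, and each pv list is rotated as it is scanned so that
 * successive calls spread the clearing across mappings.  The scan stops
 * once PMAP_TS_REFERENCED_MAX reference bits have been counted.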
7463 */
7464 int
7465 pmap_ts_referenced(vm_page_t m)
7466 {
7467 struct md_page *pvh;
7468 pv_entry_t pv, pvf;
7469 pmap_t pmap;
7470 struct rwlock *lock;
7471 pt_entry_t *pte, tpte;
7472 vm_offset_t va;
7473 vm_paddr_t pa;
7474 int cleared, md_gen, not_cleared, pvh_gen;
7475 struct spglist free;
7476
7477 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7478 ("pmap_ts_referenced: page %p is not managed", m));
7479 SLIST_INIT(&free);
7480 cleared = 0;
7481 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7482 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7483 rw_wlock(lock);
7484 retry:
7485 not_cleared = 0;
7486 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
7487 goto small_mappings;
7488 pv = pvf;
7489 do {
7490 if (pvf == NULL)
7491 pvf = pv;
7492 pmap = PV_PMAP(pv);
7493 if (!PMAP_TRYLOCK(pmap)) {
7494 pvh_gen = pvh->pv_gen;
7495 rw_wunlock(lock);
7496 PMAP_LOCK(pmap);
7497 rw_wlock(lock);
7498 if (pvh_gen != pvh->pv_gen) {
7499 PMAP_UNLOCK(pmap);
7500 goto retry;
7501 }
7502 }
7503 va = pv->pv_va;
7504 pte = pmap_pte_exists(pmap, va, 2, __func__);
7505 tpte = pmap_load(pte);
7506 if (pmap_pte_dirty(pmap, tpte)) {
7507 /*
7508 * Although "tpte" is mapping a 2MB page, because
7509 * this function is called at a 4KB page granularity,
7510 * we only update the 4KB page under test.
7511 */
7512 vm_page_dirty(m);
7513 }
7514 if ((tpte & ATTR_AF) != 0) {
7515 pa = VM_PAGE_TO_PHYS(m);
7516
7517 /*
7518 * Since this reference bit is shared by 512 4KB pages,
7519 * it should not be cleared every time it is tested.
7520 * Apply a simple "hash" function on the physical page
7521 * number, the virtual superpage number, and the pmap
7522 * address to select one 4KB page out of the 512 on
7523 * which testing the reference bit will result in
7524 * clearing that reference bit. This function is
7525 * designed to avoid the selection of the same 4KB page
7526 * for every 2MB page mapping.
7527 *
7528 * On demotion, a mapping that hasn't been referenced
7529 * is simply destroyed. To avoid the possibility of a
7530 * subsequent page fault on a demoted wired mapping,
7531 * always leave its reference bit set. Moreover,
7532 * since the superpage is wired, the current state of
7533 * its reference bit won't affect page replacement.
7534 */
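			/*
			 * Concretely: for a given 2MB mapping, the value of
			 * (pa >> PAGE_SHIFT) takes 512 consecutive values
			 * across the constituent 4KB pages, so exactly one of
			 * them makes the low bits of the XOR zero.  Which page
			 * that is varies from mapping to mapping because the
			 * virtual superpage number and the pmap address are
			 * mixed in.
			 */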
7535 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
7536 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
7537 (tpte & ATTR_SW_WIRED) == 0) {
7538 pmap_clear_bits(pte, ATTR_AF);
7539 pmap_invalidate_page(pmap, va, true);
7540 cleared++;
7541 } else
7542 not_cleared++;
7543 }
7544 PMAP_UNLOCK(pmap);
7545 /* Rotate the PV list if it has more than one entry. */
7546 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7547 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7548 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
7549 pvh->pv_gen++;
7550 }
7551 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
7552 goto out;
7553 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
7554 small_mappings:
7555 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
7556 goto out;
7557 pv = pvf;
7558 do {
7559 if (pvf == NULL)
7560 pvf = pv;
7561 pmap = PV_PMAP(pv);
7562 if (!PMAP_TRYLOCK(pmap)) {
7563 pvh_gen = pvh->pv_gen;
7564 md_gen = m->md.pv_gen;
7565 rw_wunlock(lock);
7566 PMAP_LOCK(pmap);
7567 rw_wlock(lock);
7568 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7569 PMAP_UNLOCK(pmap);
7570 goto retry;
7571 }
7572 }
7573 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7574 tpte = pmap_load(pte);
7575 if (pmap_pte_dirty(pmap, tpte))
7576 vm_page_dirty(m);
7577 if ((tpte & ATTR_AF) != 0) {
7578 if ((tpte & ATTR_SW_WIRED) == 0) {
7579 /*
7580 * Clear the accessed bit in this L3 entry
7581 * regardless of the contiguous bit.
7582 */
7583 pmap_clear_bits(pte, ATTR_AF);
7584 pmap_invalidate_page(pmap, pv->pv_va, true);
7585 cleared++;
7586 } else
7587 not_cleared++;
7588 } else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
7589 (pmap_load_l3c(pte) & ATTR_AF) != 0) {
7590 /*
7591 * An L3C superpage mapping is regarded as accessed
7592 * until the accessed bit has been cleared in all
7593 * of its constituent entries.
7594 */
7595 not_cleared++;
7596 }
7597 PMAP_UNLOCK(pmap);
7598 /* Rotate the PV list if it has more than one entry. */
7599 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7600 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7601 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7602 m->md.pv_gen++;
7603 }
7604 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
7605 not_cleared < PMAP_TS_REFERENCED_MAX);
7606 out:
7607 rw_wunlock(lock);
7608 vm_page_free_pages_toq(&free, true);
7609 return (cleared + not_cleared);
7610 }
7611
7612 /*
7613 * Apply the given advice to the specified range of addresses within the
7614 * given pmap. Depending on the advice, clear the referenced and/or
7615 * modified flags in each mapping and set the mapped page's dirty field.
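 *
 * This is typically reached from the madvise(2) path; for example,
 * madvise(addr, len, MADV_FREE) leads vm_map_madvise() to call this
 * function on the process's pmap for the affected range.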
7616 */
7617 void
7618 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7619 {
7620 struct rwlock *lock;
7621 vm_offset_t va, va_next, dva;
7622 vm_page_t m;
7623 pd_entry_t *l0, *l1, *l2, oldl2;
7624 pt_entry_t *l3, *dl3, oldl3;
7625
7626 PMAP_ASSERT_STAGE1(pmap);
7627
7628 if (advice != MADV_DONTNEED && advice != MADV_FREE)
7629 return;
7630
7631 PMAP_LOCK(pmap);
7632 for (; sva < eva; sva = va_next) {
7633 l0 = pmap_l0(pmap, sva);
7634 if (pmap_load(l0) == 0) {
7635 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
7636 if (va_next < sva)
7637 va_next = eva;
7638 continue;
7639 }
7640
7641 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
7642 if (va_next < sva)
7643 va_next = eva;
7644 l1 = pmap_l0_to_l1(l0, sva);
7645 if (pmap_load(l1) == 0)
7646 continue;
7647 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
7648 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7649 continue;
7650 }
7651
7652 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
7653 if (va_next < sva)
7654 va_next = eva;
7655 l2 = pmap_l1_to_l2(l1, sva);
7656 oldl2 = pmap_load(l2);
7657 if (oldl2 == 0)
7658 continue;
7659 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
7660 if ((oldl2 & ATTR_SW_MANAGED) == 0)
7661 continue;
7662 lock = NULL;
7663 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
7664 if (lock != NULL)
7665 rw_wunlock(lock);
7666
7667 /*
7668 * The 2MB page mapping was destroyed.
7669 */
7670 continue;
7671 }
7672
7673 /*
7674 * Unless the page mappings are wired, remove the
7675 * mapping to a single page so that a subsequent
7676 * access may repromote. Choosing the last page
7677 * within the address range [sva, min(va_next, eva))
7678 * generally results in more repromotions. Since the
7679 * underlying page table page is fully populated, this
7680 * removal never frees a page table page.
7681 */
7682 if ((oldl2 & ATTR_SW_WIRED) == 0) {
7683 va = eva;
7684 if (va > va_next)
7685 va = va_next;
7686 va -= PAGE_SIZE;
7687 KASSERT(va >= sva,
7688 ("pmap_advise: no address gap"));
7689 l3 = pmap_l2_to_l3(l2, va);
7690 KASSERT(pmap_load(l3) != 0,
7691 ("pmap_advise: invalid PTE"));
7692 pmap_remove_l3(pmap, l3, va, pmap_load(l2),
7693 NULL, &lock);
7694 }
7695 if (lock != NULL)
7696 rw_wunlock(lock);
7697 }
7698 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
7699 ("pmap_advise: invalid L2 entry after demotion"));
7700 if (va_next > eva)
7701 va_next = eva;
7702 va = va_next;
7703 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
7704 sva += L3_SIZE) {
7705 oldl3 = pmap_load(l3);
7706 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
7707 (ATTR_SW_MANAGED | L3_PAGE))
7708 goto maybe_invlrng;
7709 else if (pmap_pte_dirty(pmap, oldl3)) {
7710 if (advice == MADV_DONTNEED) {
7711 /*
7712 * Future calls to pmap_is_modified()
7713 * can be avoided by making the page
7714 * dirty now.
7715 */
7716 m = PTE_TO_VM_PAGE(oldl3);
7717 vm_page_dirty(m);
7718 }
7719 if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
7720 /*
7721 * Unconditionally demote the L3C
7722 * superpage because we do not allow
7723 * writeable, clean superpages.
7724 */
7725 (void)pmap_demote_l3c(pmap, l3, sva);
7726
7727 /*
7728 * Destroy the final mapping before the
7729 * next L3C boundary or va_next,
7730 * whichever comes first, so that a
7731 * subsequent access may act as a
7732 * repromotion trigger.
7733 */
7734 if ((oldl3 & ATTR_SW_WIRED) == 0) {
7735 dva = MIN((sva & ~L3C_OFFSET) +
7736 L3C_SIZE - PAGE_SIZE,
7737 va_next - PAGE_SIZE);
7738 dl3 = pmap_l2_to_l3(l2, dva);
7739 KASSERT(pmap_load(dl3) != 0,
7740 ("pmap_advise: invalid PTE"));
7741 lock = NULL;
7742 pmap_remove_l3(pmap, dl3, dva,
7743 pmap_load(l2), NULL, &lock);
7744 if (lock != NULL)
7745 rw_wunlock(lock);
7746 }
7747
7748 /*
7749 * The L3 entry's accessed bit may have
7750 * changed.
7751 */
7752 oldl3 = pmap_load(l3);
7753 }
7754
7755 /*
7756 * Check that we did not just destroy this entry so
7757 * we avoid corrupting the page table.
7758 */
7759 if (oldl3 != 0) {
7760 while (!atomic_fcmpset_long(l3, &oldl3,
7761 (oldl3 & ~ATTR_AF) |
7762 ATTR_S1_AP(ATTR_S1_AP_RO)))
7763 cpu_spinwait();
7764 }
7765 } else if ((oldl3 & ATTR_AF) != 0) {
7766 /*
7767 * Clear the accessed bit in this L3 entry
7768 * regardless of the contiguous bit.
7769 */
7770 pmap_clear_bits(l3, ATTR_AF);
7771 } else
7772 goto maybe_invlrng;
7773 if (va == va_next)
7774 va = sva;
7775 continue;
7776 maybe_invlrng:
7777 if (va != va_next) {
7778 pmap_s1_invalidate_range(pmap, va, sva, true);
7779 va = va_next;
7780 }
7781 }
7782 if (va != va_next)
7783 pmap_s1_invalidate_range(pmap, va, sva, true);
7784 }
7785 PMAP_UNLOCK(pmap);
7786 }
7787
7788 /*
7789 * Clear the modify bits on the specified physical page.
7790 */
7791 void
7792 pmap_clear_modify(vm_page_t m)
7793 {
7794 struct md_page *pvh;
7795 struct rwlock *lock;
7796 pmap_t pmap;
7797 pv_entry_t next_pv, pv;
7798 pd_entry_t *l2, oldl2;
7799 pt_entry_t *l3, oldl3;
7800 vm_offset_t va;
7801 int md_gen, pvh_gen;
7802
7803 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7804 ("pmap_clear_modify: page %p is not managed", m));
7805 vm_page_assert_busied(m);
7806
7807 if (!pmap_page_is_write_mapped(m))
7808 return;
7809 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7810 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7811 rw_wlock(lock);
7812 restart:
7813 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7814 pmap = PV_PMAP(pv);
7815 PMAP_ASSERT_STAGE1(pmap);
7816 if (!PMAP_TRYLOCK(pmap)) {
7817 pvh_gen = pvh->pv_gen;
7818 rw_wunlock(lock);
7819 PMAP_LOCK(pmap);
7820 rw_wlock(lock);
7821 if (pvh_gen != pvh->pv_gen) {
7822 PMAP_UNLOCK(pmap);
7823 goto restart;
7824 }
7825 }
7826 va = pv->pv_va;
7827 l2 = pmap_l2(pmap, va);
7828 oldl2 = pmap_load(l2);
7829 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7830 if ((oldl2 & ATTR_SW_DBM) != 0 &&
7831 pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7832 (oldl2 & ATTR_SW_WIRED) == 0) {
7833 /*
7834 * Write protect the mapping to a single page so that
7835 * a subsequent write access may repromote.
7836 */
7837 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7838 l3 = pmap_l2_to_l3(l2, va);
7839 oldl3 = pmap_load(l3);
7840 while (!atomic_fcmpset_long(l3, &oldl3,
7841 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
7842 cpu_spinwait();
7843 vm_page_dirty(m);
7844 pmap_s1_invalidate_page(pmap, va, true);
7845 }
7846 PMAP_UNLOCK(pmap);
7847 }
7848 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7849 pmap = PV_PMAP(pv);
7850 PMAP_ASSERT_STAGE1(pmap);
7851 if (!PMAP_TRYLOCK(pmap)) {
7852 md_gen = m->md.pv_gen;
7853 pvh_gen = pvh->pv_gen;
7854 rw_wunlock(lock);
7855 PMAP_LOCK(pmap);
7856 rw_wlock(lock);
7857 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7858 PMAP_UNLOCK(pmap);
7859 goto restart;
7860 }
7861 }
7862 l2 = pmap_l2(pmap, pv->pv_va);
7863 l3 = pmap_l2_to_l3(l2, pv->pv_va);
7864 oldl3 = pmap_load(l3);
7865 KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
7866 (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
7867 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
7868 ("writeable L3C superpage not dirty"));
7869 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
7870 if ((oldl3 & ATTR_CONTIGUOUS) != 0)
7871 (void)pmap_demote_l3c(pmap, l3, pv->pv_va);
7872 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
7873 pmap_s1_invalidate_page(pmap, pv->pv_va, true);
7874 }
7875 PMAP_UNLOCK(pmap);
7876 }
7877 rw_wunlock(lock);
7878 }
7879
7880 void *
7881 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7882 {
7883 struct pmap_preinit_mapping *ppim;
7884 vm_offset_t va, offset;
7885 pd_entry_t old_l2e, *pde;
7886 pt_entry_t *l2;
7887 int i, lvl, l2_blocks, free_l2_count, start_idx;
7888
7889 /* Use the DMAP region if we can */
7890 if (PHYS_IN_DMAP(pa) && PHYS_IN_DMAP(pa + size - 1) &&
7891 pmap_kmapped_range(PHYS_TO_DMAP(pa), size))
7892 return ((void *)PHYS_TO_DMAP(pa));
7893
7894 if (!vm_initialized) {
7895 /*
7896 * No L3 ptables so map entire L2 blocks where start VA is:
7897 * preinit_map_va + start_idx * L2_SIZE
7898 * There may be duplicate mappings (multiple VA -> same PA) but
7899 * ARM64 dcache is always PIPT so that's acceptable.
7900 */
7901 if (size == 0)
7902 return (NULL);
7903
7904 /* Calculate how many L2 blocks are needed for the mapping */
7905 l2_blocks = (roundup2(pa + size, L2_SIZE) -
7906 rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
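		/*
		 * For example, mapping 3MiB that starts 1MiB into an L2 block
		 * spans two blocks: roundup2(pa + size) - rounddown2(pa) is
		 * 4MiB, so l2_blocks == 2.
		 */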
7907
7908 offset = pa & L2_OFFSET;
7909
7910 if (preinit_map_va == 0)
7911 return (NULL);
7912
7913 /* Map 2MiB L2 blocks from reserved VA space */
7914
7915 free_l2_count = 0;
7916 start_idx = -1;
7917 /* Find enough free contiguous VA space */
7918 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7919 ppim = pmap_preinit_mapping + i;
7920 if (free_l2_count > 0 && ppim->pa != 0) {
7921 /* Not enough space here */
7922 free_l2_count = 0;
7923 start_idx = -1;
7924 continue;
7925 }
7926
7927 if (ppim->pa == 0) {
7928 /* Free L2 block */
7929 if (start_idx == -1)
7930 start_idx = i;
7931 free_l2_count++;
7932 if (free_l2_count == l2_blocks)
7933 break;
7934 }
7935 }
7936 if (free_l2_count != l2_blocks)
7937 panic("%s: too many preinit mappings", __func__);
7938
7939 va = preinit_map_va + (start_idx * L2_SIZE);
7940 for (i = start_idx; i < start_idx + l2_blocks; i++) {
7941 /* Mark entries as allocated */
7942 ppim = pmap_preinit_mapping + i;
7943 ppim->pa = pa;
7944 ppim->va = va + offset;
7945 ppim->size = size;
7946 }
7947
7948 /* Map L2 blocks */
7949 pa = rounddown2(pa, L2_SIZE);
7950 old_l2e = 0;
7951 for (i = 0; i < l2_blocks; i++) {
7952 pde = pmap_pde(kernel_pmap, va, &lvl);
7953 KASSERT(pde != NULL,
7954 ("pmap_mapbios: Invalid page entry, va: 0x%lx",
7955 va));
7956 KASSERT(lvl == 1,
7957 ("pmap_mapbios: Invalid level %d", lvl));
7958
7959 /* Insert L2_BLOCK */
7960 l2 = pmap_l1_to_l2(pde, va);
7961 old_l2e |= pmap_load_store(l2,
7962 PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
7963 ATTR_S1_XN | ATTR_KERN_GP |
7964 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
7965
7966 va += L2_SIZE;
7967 pa += L2_SIZE;
7968 }
7969 if ((old_l2e & ATTR_DESCR_VALID) != 0)
7970 pmap_s1_invalidate_all(kernel_pmap);
7971 else {
7972 /*
7973 * Because the old entries were invalid and the new
7974 * mappings are not executable, an isb is not required.
7975 */
7976 dsb(ishst);
7977 }
7978
7979 va = preinit_map_va + (start_idx * L2_SIZE);
7980
7981 } else {
7982 /* kva_alloc may be used to map the pages */
7983 offset = pa & PAGE_MASK;
7984 size = round_page(offset + size);
7985
7986 va = kva_alloc(size);
7987 if (va == 0)
7988 panic("%s: Couldn't allocate KVA", __func__);
7989
7990 pde = pmap_pde(kernel_pmap, va, &lvl);
7991 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
7992
7993 /* L3 table is linked */
7994 va = trunc_page(va);
7995 pa = trunc_page(pa);
7996 pmap_kenter(va, size, pa, memory_mapping_mode(pa));
7997 }
7998
7999 return ((void *)(va + offset));
8000 }
8001
8002 void
8003 pmap_unmapbios(void *p, vm_size_t size)
8004 {
8005 struct pmap_preinit_mapping *ppim;
8006 vm_offset_t offset, va, va_trunc;
8007 pd_entry_t *pde;
8008 pt_entry_t *l2;
8009 int error __diagused, i, lvl, l2_blocks, block;
8010 bool preinit_map;
8011
8012 va = (vm_offset_t)p;
8013 if (VIRT_IN_DMAP(va)) {
8014 KASSERT(VIRT_IN_DMAP(va + size - 1),
8015 ("%s: End address not in DMAP region: %lx", __func__,
8016 va + size - 1));
8017 /* Ensure the attributes are as expected for the DMAP region */
8018 PMAP_LOCK(kernel_pmap);
8019 error = pmap_change_props_locked(va, size,
8020 PROT_READ | PROT_WRITE, VM_MEMATTR_DEFAULT, false);
8021 PMAP_UNLOCK(kernel_pmap);
8022 KASSERT(error == 0, ("%s: Failed to reset DMAP attributes: %d",
8023 __func__, error));
8024
8025 return;
8026 }
8027
8028 l2_blocks =
8029 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
8030 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
8031
8032 /* Remove preinit mapping */
8033 preinit_map = false;
8034 block = 0;
8035 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
8036 ppim = pmap_preinit_mapping + i;
8037 if (ppim->va == va) {
8038 KASSERT(ppim->size == size,
8039 ("pmap_unmapbios: size mismatch"));
8040 ppim->va = 0;
8041 ppim->pa = 0;
8042 ppim->size = 0;
8043 preinit_map = true;
8044 offset = block * L2_SIZE;
8045 va_trunc = rounddown2(va, L2_SIZE) + offset;
8046
8047 /* Remove L2_BLOCK */
8048 pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
8049 KASSERT(pde != NULL,
8050 ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
8051 va_trunc));
8052 l2 = pmap_l1_to_l2(pde, va_trunc);
8053 pmap_clear(l2);
8054
8055 if (block == (l2_blocks - 1))
8056 break;
8057 block++;
8058 }
8059 }
8060 if (preinit_map) {
8061 pmap_s1_invalidate_all(kernel_pmap);
8062 return;
8063 }
8064
8065 /* Unmap the pages reserved with kva_alloc. */
8066 if (vm_initialized) {
8067 offset = va & PAGE_MASK;
8068 size = round_page(offset + size);
8069 va = trunc_page(va);
8070
8071 /* Unmap and invalidate the pages */
8072 pmap_kremove_device(va, size);
8073
8074 kva_free(va, size);
8075 }
8076 }
8077
8078 /*
8079 * Sets the memory attribute for the specified page.
8080 */
8081 void
8082 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
8083 {
8084 if (m->md.pv_memattr == ma)
8085 return;
8086
8087 m->md.pv_memattr = ma;
8088
8089 /*
8090 * If "m" is a normal page, update its direct mapping. This update
8091 * can be relied upon to perform any cache operations that are
8092 * required for data coherence.
8093 */
8094 if ((m->flags & PG_FICTITIOUS) == 0 &&
8095 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
8096 m->md.pv_memattr) != 0)
8097 panic("memory attribute change on the direct map failed");
8098 }
8099
8100 /*
8101 * Changes the specified virtual address range's memory type to that given by
8102 * the parameter "mode". The specified virtual address range must be
8103 * completely contained within either the direct map or the kernel map. If
8104 * the virtual address range is contained within the kernel map, then the
8105 * memory type for each of the corresponding ranges of the direct map is also
8106 * changed. (The corresponding ranges of the direct map are those ranges that
8107 * map the same physical pages as the specified virtual address range.) These
8108 * changes to the direct map are necessary because Intel describes the
8109 * behavior of their processors as "undefined" if two or more mappings to the
8110 * same physical page have different memory types.
8111 *
8112 * Returns zero if the change completed successfully, and either EINVAL or
8113 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
8114 * of the virtual address range was not mapped, and ENOMEM is returned if
8115 * there was insufficient memory available to complete the change. In the
8116 * latter case, the memory type may have been changed on some part of the
8117 * virtual address range or the direct map.
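 *
 * Illustrative (hypothetical) use: a driver holding a DMAP address for a
 * buffer at physical address "buf_pa" of length "buf_size" could disable
 * caching on it with
 *	pmap_change_attr(PHYS_TO_DMAP(buf_pa), buf_size,
 *	    VM_MEMATTR_UNCACHEABLE);
 * where "buf_pa" and "buf_size" are placeholder names.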
8118 */
8119 int
8120 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
8121 {
8122 int error;
8123
8124 PMAP_LOCK(kernel_pmap);
8125 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
8126 PMAP_UNLOCK(kernel_pmap);
8127 return (error);
8128 }
8129
8130 /*
8131 * Changes the specified virtual address range's protections to those
8132 * specified by "prot". Like pmap_change_attr(), protections for aliases
8133 * in the direct map are updated as well. Protections on aliasing mappings may
8134 * be a subset of the requested protections; for example, mappings in the direct
8135 * map are never executable.
8136 */
8137 int
8138 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
8139 {
8140 int error;
8141
8142 /* Only supported within the kernel map. */
8143 if (va < VM_MIN_KERNEL_ADDRESS)
8144 return (EINVAL);
8145
8146 PMAP_LOCK(kernel_pmap);
8147 error = pmap_change_props_locked(va, size, prot, -1, false);
8148 PMAP_UNLOCK(kernel_pmap);
8149 return (error);
8150 }
8151
8152 static int
8153 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
8154 int mode, bool skip_unmapped)
8155 {
8156 vm_offset_t base, offset, tmpva;
8157 vm_size_t pte_size;
8158 vm_paddr_t pa;
8159 pt_entry_t pte, *ptep, *newpte;
8160 pt_entry_t bits, mask;
8161 int lvl, rv;
8162
8163 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
8164 base = trunc_page(va);
8165 offset = va & PAGE_MASK;
8166 size = round_page(offset + size);
8167
8168 if (!VIRT_IN_DMAP(base) &&
8169 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
8170 return (EINVAL);
8171
8172 bits = 0;
8173 mask = 0;
8174 if (mode != -1) {
8175 bits = ATTR_S1_IDX(mode);
8176 mask = ATTR_S1_IDX_MASK;
8177 if (mode == VM_MEMATTR_DEVICE) {
8178 mask |= ATTR_S1_XN;
8179 bits |= ATTR_S1_XN;
8180 }
8181 }
8182 if (prot != VM_PROT_NONE) {
8183 /* Don't mark the DMAP as executable. It never is on arm64. */
8184 if (VIRT_IN_DMAP(base)) {
8185 prot &= ~VM_PROT_EXECUTE;
8186 /*
8187 * XXX Mark the DMAP as writable for now. We rely
8188 * on this in ddb & dtrace to insert breakpoint
8189 * instructions.
8190 */
8191 prot |= VM_PROT_WRITE;
8192 }
8193
8194 if ((prot & VM_PROT_WRITE) == 0) {
8195 bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
8196 }
8197 if ((prot & VM_PROT_EXECUTE) == 0) {
8198 bits |= ATTR_S1_PXN;
8199 }
8200 bits |= ATTR_S1_UXN;
8201 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
8202 }
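	/*
	 * Example of the resulting mask/bits: a request to make a kernel-map
	 * range read-only and non-executable (prot == VM_PROT_READ,
	 * mode == -1) yields
	 *	bits == ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_S1_PXN | ATTR_S1_UXN
	 *	mask == ATTR_S1_AP_MASK | ATTR_S1_XN
	 * so only the access-permission and execute-never fields of each
	 * entry are rewritten below.
	 */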
8203
8204 for (tmpva = base; tmpva < base + size; ) {
8205 ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
8206 if (ptep == NULL && !skip_unmapped) {
8207 return (EINVAL);
8208 } else if ((ptep == NULL && skip_unmapped) ||
8209 (pmap_load(ptep) & mask) == bits) {
8210 /*
8211 * We already have the correct attribute or there
8212 * is no memory mapped at this address and we are
8213 * skipping unmapped memory.
8214 */
8215 switch (lvl) {
8216 default:
8217 panic("Invalid DMAP table level: %d\n", lvl);
8218 case 1:
8219 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
8220 break;
8221 case 2:
8222 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
8223 break;
8224 case 3:
8225 tmpva += PAGE_SIZE;
8226 break;
8227 }
8228 } else {
8229 /* We can't demote/promote this entry */
8230 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
8231
8232 /*
8233 * Find the entry and demote it if the requested change
8234 * only applies to part of the address range mapped by
8235 * the entry.
8236 */
8237 switch (lvl) {
8238 default:
8239 panic("Invalid DMAP table level: %d\n", lvl);
8240 case 1:
8241 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8242 if ((tmpva & L1_OFFSET) == 0 &&
8243 (base + size - tmpva) >= L1_SIZE) {
8244 pte_size = L1_SIZE;
8245 break;
8246 }
8247 newpte = pmap_demote_l1(kernel_pmap, ptep,
8248 tmpva & ~L1_OFFSET);
8249 if (newpte == NULL)
8250 return (EINVAL);
8251 ptep = pmap_l1_to_l2(ptep, tmpva);
8252 /* FALLTHROUGH */
8253 case 2:
8254 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
8255 if ((tmpva & L2C_OFFSET) == 0 &&
8256 (base + size - tmpva) >= L2C_SIZE) {
8257 pte_size = L2C_SIZE;
8258 break;
8259 }
8260 if (!pmap_demote_l2c(kernel_pmap, ptep,
8261 tmpva))
8262 return (EINVAL);
8263 }
8264 if ((tmpva & L2_OFFSET) == 0 &&
8265 (base + size - tmpva) >= L2_SIZE) {
8266 pte_size = L2_SIZE;
8267 break;
8268 }
8269 newpte = pmap_demote_l2(kernel_pmap, ptep,
8270 tmpva);
8271 if (newpte == NULL)
8272 return (EINVAL);
8273 ptep = pmap_l2_to_l3(ptep, tmpva);
8274 /* FALLTHROUGH */
8275 case 3:
8276 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
8277 if ((tmpva & L3C_OFFSET) == 0 &&
8278 (base + size - tmpva) >= L3C_SIZE) {
8279 pte_size = L3C_SIZE;
8280 break;
8281 }
8282 if (!pmap_demote_l3c(kernel_pmap, ptep,
8283 tmpva))
8284 return (EINVAL);
8285 }
8286 pte_size = PAGE_SIZE;
8287 break;
8288 }
8289
8290 /* Update the entry */
8291 pte = pmap_load(ptep);
8292 pte &= ~mask;
8293 pte |= bits;
8294
8295 switch (pte_size) {
8296 case L2C_SIZE:
8297 pmap_update_strided(kernel_pmap, ptep, ptep +
8298 L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
8299 break;
8300 case L3C_SIZE:
8301 pmap_update_strided(kernel_pmap, ptep, ptep +
8302 L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
8303 break;
8304 default:
8305 /*
8306 * We are updating a single block or page entry,
8307 * so regardless of pte_size pass PAGE_SIZE in
8308 * order that a single TLB invalidation is
8309 * performed.
8310 */
8311 pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
8312 PAGE_SIZE);
8313 break;
8314 }
8315
8316 pa = PTE_TO_PHYS(pte);
8317 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
8318 /*
8319 * Keep the DMAP memory in sync.
8320 */
8321 rv = pmap_change_props_locked(
8322 PHYS_TO_DMAP(pa), pte_size,
8323 prot, mode, true);
8324 if (rv != 0)
8325 return (rv);
8326 }
8327
8328 /*
8329 * If moving to a non-cacheable entry flush
8330 * the cache.
8331 */
8332 if (mode == VM_MEMATTR_UNCACHEABLE)
8333 cpu_dcache_wbinv_range((void *)tmpva, pte_size);
8334 tmpva += pte_size;
8335 }
8336 }
8337
8338 return (0);
8339 }
8340
8341 /*
8342 * Create an L2 table to map all addresses within an L1 mapping.
8343 */
8344 static pt_entry_t *
8345 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
8346 {
8347 pt_entry_t *l2, newl2, oldl1;
8348 vm_offset_t tmpl1;
8349 vm_paddr_t l2phys, phys;
8350 vm_page_t ml2;
8351 int i;
8352
8353 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8354 oldl1 = pmap_load(l1);
8355 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8356 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
8357 ("pmap_demote_l1: Demoting a non-block entry"));
8358 KASSERT((va & L1_OFFSET) == 0,
8359 ("pmap_demote_l1: Invalid virtual address %#lx", va));
8360 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
8361 ("pmap_demote_l1: Level 1 table shouldn't be managed"));
8362 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
8363 ("pmap_demote_l1: Demoting entry with no-demote flag set"));
8364
8365 tmpl1 = 0;
8366 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
8367 tmpl1 = kva_alloc(PAGE_SIZE);
8368 if (tmpl1 == 0)
8369 return (NULL);
8370 }
8371
8372 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
8373 NULL) {
8374 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
8375 " in pmap %p", va, pmap);
8376 l2 = NULL;
8377 goto fail;
8378 }
8379
8380 l2phys = VM_PAGE_TO_PHYS(ml2);
8381 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
8382
8383 /* Address the range points at */
8384 phys = PTE_TO_PHYS(oldl1);
8385 /* The attributes from the old l1 entry to be copied */
8386 newl2 = oldl1 & ATTR_MASK;
8387
8388 /* Create the new entries */
8389 newl2 |= ATTR_CONTIGUOUS;
8390 for (i = 0; i < Ln_ENTRIES; i++) {
8391 l2[i] = newl2 | phys;
8392 phys += L2_SIZE;
8393 }
8394 KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) |
8395 L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0],
8396 ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
8397
8398 if (tmpl1 != 0) {
8399 pmap_kenter(tmpl1, PAGE_SIZE,
8400 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
8401 VM_MEMATTR_WRITE_BACK);
8402 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
8403 }
8404
8405 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
8406
8407 counter_u64_add(pmap_l1_demotions, 1);
8408 fail:
8409 if (tmpl1 != 0) {
8410 pmap_kremove(tmpl1);
8411 kva_free(tmpl1, PAGE_SIZE);
8412 }
8413
8414 return (l2);
8415 }
8416
8417 static void
8418 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8419 {
8420 pt_entry_t *l3;
8421
8422 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8423 *l3 = newl3;
8424 newl3 += L3_SIZE;
8425 }
8426 }
8427
8428 static void
8429 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8430 {
8431 #ifdef INVARIANTS
8432 #ifdef DIAGNOSTIC
8433 pt_entry_t *xl3p, *yl3p;
8434
8435 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8436 xl3p++, newl3e += PAGE_SIZE) {
8437 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8438 printf("pmap_demote_l2: xl3e %zd and newl3e map "
8439 "different pages: found %#lx, expected %#lx\n",
8440 xl3p - firstl3p, pmap_load(xl3p), newl3e);
8441 printf("page table dump\n");
8442 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8443 yl3p++) {
8444 printf("%zd %#lx\n", yl3p - firstl3p,
8445 pmap_load(yl3p));
8446 }
8447 panic("firstpte");
8448 }
8449 }
8450 #else
8451 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8452 ("pmap_demote_l2: firstl3 and newl3e map different physical"
8453 " addresses"));
8454 #endif
8455 #endif
8456 }
8457
8458 static void
8459 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8460 struct rwlock **lockp)
8461 {
8462 struct spglist free;
8463
8464 SLIST_INIT(&free);
8465 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true,
8466 &free, lockp);
8467 vm_page_free_pages_toq(&free, true);
8468 }
8469
8470 /*
8471 * Create an L3 table to map all addresses within an L2 mapping.
8472 */
8473 static pt_entry_t *
8474 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8475 struct rwlock **lockp)
8476 {
8477 pt_entry_t *l3, newl3, oldl2;
8478 vm_offset_t tmpl2;
8479 vm_paddr_t l3phys;
8480 vm_page_t ml3;
8481
8482 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8483 PMAP_ASSERT_STAGE1(pmap);
8484 KASSERT(ADDR_IS_CANONICAL(va),
8485 ("%s: Address not in canonical form: %lx", __func__, va));
8486
8487 l3 = NULL;
8488 oldl2 = pmap_load(l2);
8489 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8490 ("pmap_demote_l2: Demoting a non-block entry"));
8491 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8492 ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8493 va &= ~L2_OFFSET;
8494
8495 tmpl2 = 0;
8496 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8497 tmpl2 = kva_alloc(PAGE_SIZE);
8498 if (tmpl2 == 0)
8499 return (NULL);
8500 }
8501
8502 /*
8503 * Invalidate the 2MB page mapping and return "failure" if the
8504 * mapping was never accessed and not wired.
8505 */
8506 if ((oldl2 & ATTR_AF) == 0) {
8507 if ((oldl2 & ATTR_SW_WIRED) == 0) {
8508 pmap_demote_l2_abort(pmap, va, l2, lockp);
8509 CTR2(KTR_PMAP,
8510 "pmap_demote_l2: failure for va %#lx in pmap %p",
8511 va, pmap);
8512 goto fail;
8513 }
8514 ml3 = pmap_remove_pt_page(pmap, va);
8515 /* Fill the PTP with L3Es that have ATTR_AF cleared. */
8516 ml3->valid = 0;
8517 } else if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8518 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8519 ("pmap_demote_l2: page table page for a wired mapping"
8520 " is missing"));
8521
8522 /*
8523 * If the page table page is missing and the mapping
8524 * is for a kernel address, the mapping must belong to
8525 * either the direct map or the early kernel memory.
8526 * Page table pages are preallocated for every other
8527 * part of the kernel address space, so the direct map
8528 * region and early kernel memory are the only parts of the
8529 * kernel address space that must be handled here.
8530 */
8531 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
8532 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8533 ("pmap_demote_l2: No saved mpte for va %#lx", va));
8534
8535 /*
8536 * If the 2MB page mapping belongs to the direct map
8537 * region of the kernel's address space, then the page
8538 * allocation request specifies the highest possible
8539 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
8540 * priority is normal.
8541 */
8542 ml3 = vm_page_alloc_noobj(
8543 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8544 VM_ALLOC_WIRED);
8545
8546 /*
8547 * If the allocation of the new page table page fails,
8548 * invalidate the 2MB page mapping and return "failure".
8549 */
8550 if (ml3 == NULL) {
8551 pmap_demote_l2_abort(pmap, va, l2, lockp);
8552 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8553 " in pmap %p", va, pmap);
8554 goto fail;
8555 }
8556 ml3->pindex = pmap_l2_pindex(va);
8557
8558 if (!ADDR_IS_KERNEL(va)) {
8559 ml3->ref_count = NL3PG;
8560 pmap_resident_count_inc(pmap, 1);
8561 }
8562 }
8563 l3phys = VM_PAGE_TO_PHYS(ml3);
8564 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
8565 newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8566 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8567 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8568 ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8569
8570 /*
8571 * If the PTP is not leftover from an earlier promotion or it does not
8572 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all
8573 * have ATTR_AF set, unless this is a wired mapping with ATTR_AF clear.
8574 *
8575 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8576 * performs a dsb(). That dsb() ensures that the stores for filling
8577 * "l3" are visible before "l3" is added to the page table.
8578 */
8579 if (!vm_page_all_valid(ml3))
8580 pmap_fill_l3(l3, newl3);
8581
8582 pmap_demote_l2_check(l3, newl3);
8583
8584 /*
8585 * If the mapping has changed attributes, update the L3Es.
8586 */
8587 if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8588 pmap_fill_l3(l3, newl3);
8589
8590 /*
8591 * Map the temporary page so we don't lose access to the l2 table.
8592 */
8593 if (tmpl2 != 0) {
8594 pmap_kenter(tmpl2, PAGE_SIZE,
8595 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
8596 VM_MEMATTR_WRITE_BACK);
8597 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8598 }
8599
8600 /*
8601 * The spare PV entries must be reserved prior to demoting the
8602 * mapping, that is, prior to changing the PDE. Otherwise, the state
8603 * of the L2 and the PV lists will be inconsistent, which can result
8604 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8605 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8606 * PV entry for the 2MB page mapping that is being demoted.
8607 */
8608 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8609 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8610
8611 /*
8612 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8613 * the 2MB page mapping.
8614 */
8615 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8616
8617 /*
8618 * Demote the PV entry.
8619 */
8620 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8621 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8622
8623 counter_u64_add(pmap_l2_demotions, 1);
8624 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8625 " in pmap %p %lx", va, pmap, l3[0]);
8626
8627 fail:
8628 if (tmpl2 != 0) {
8629 pmap_kremove(tmpl2);
8630 kva_free(tmpl2, PAGE_SIZE);
8631 }
8632
8633 return (l3);
8635 }
8636
8637 static pt_entry_t *
8638 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8639 {
8640 struct rwlock *lock;
8641 pt_entry_t *l3;
8642
8643 lock = NULL;
8644 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8645 if (lock != NULL)
8646 rw_wunlock(lock);
8647 return (l3);
8648 }
8649
8650 /*
8651 * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
8652 */
8653 static bool
8654 pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
8655 {
8656 pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
8657 vm_offset_t tmpl3;
8658 register_t intr;
8659
8660 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8661 PMAP_ASSERT_STAGE1(pmap);
8662 l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
8663 sizeof(pd_entry_t)) - 1));
8664 l2c_end = l2c_start + L2C_ENTRIES;
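	/*
	 * The mask above rounds "l2p" down to the first entry of its
	 * contiguous set; e.g., with a 4KB translation granule, L2C_ENTRIES
	 * is 16, so the pointer is aligned down to a 16 * 8 == 128 byte
	 * boundary within the L2 page table page.
	 */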
8665 tmpl3 = 0;
8666 if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
8667 (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
8668 tmpl3 = kva_alloc(PAGE_SIZE);
8669 if (tmpl3 == 0)
8670 return (false);
8671 pmap_kenter(tmpl3, PAGE_SIZE,
8672 DMAP_TO_PHYS((vm_offset_t)l2c_start) & ~L3_OFFSET,
8673 VM_MEMATTR_WRITE_BACK);
8674 l2c_start = (pd_entry_t *)(tmpl3 +
8675 ((vm_offset_t)l2c_start & PAGE_MASK));
8676 l2c_end = (pd_entry_t *)(tmpl3 +
8677 ((vm_offset_t)l2c_end & PAGE_MASK));
8678 }
8679 mask = 0;
8680 nbits = ATTR_DESCR_VALID;
8681 intr = intr_disable();
8682
8683 /*
8684 * Break the mappings.
8685 */
8686 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8687 /*
8688 * Clear the mapping's contiguous and valid bits, but leave
8689 * the rest of the entry unchanged, so that a lockless,
8690 * concurrent pmap_kextract() can still lookup the physical
8691 * address.
8692 */
8693 l2e = pmap_load(tl2p);
8694 KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
8695 ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
8696 KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8697 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8698 ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
8699 while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
8700 ATTR_DESCR_VALID)))
8701 cpu_spinwait();
8702
8703 /*
8704 * Hardware accessed and dirty bit maintenance might only
8705 * update a single L2 entry, so we must combine the accessed
8706 * and dirty bits from this entire set of contiguous L2
8707 * entries.
8708 */
8709 if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8710 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8711 mask = ATTR_S1_AP_RW_BIT;
8712 nbits |= l2e & ATTR_AF;
8713 }
8714 if ((nbits & ATTR_AF) != 0) {
8715 pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
8716 L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
8717 }
8718
8719 /*
8720 * Remake the mappings, updating the accessed and dirty bits.
8721 */
8722 l2e = (pmap_load(l2c_start) & ~mask) | nbits;
8723 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8724 pmap_store(tl2p, l2e);
8725 l2e += L2_SIZE;
8726 }
8727 dsb(ishst);
8728
8729 intr_restore(intr);
8730 if (tmpl3 != 0) {
8731 pmap_kremove(tmpl3);
8732 kva_free(tmpl3, PAGE_SIZE);
8733 }
8734 counter_u64_add(pmap_l2c_demotions, 1);
8735 CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
8736 va, pmap);
8737 return (true);
8738 }
8739
8740 /*
8741 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8742 */
8743 static bool
8744 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8745 {
8746 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8747 vm_offset_t tmpl3;
8748 register_t intr;
8749
8750 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8751 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8752 sizeof(pt_entry_t)) - 1));
8753 l3c_end = l3c_start + L3C_ENTRIES;
8754 tmpl3 = 0;
8755 if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8756 (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8757 tmpl3 = kva_alloc(PAGE_SIZE);
8758 if (tmpl3 == 0)
8759 return (false);
8760 pmap_kenter(tmpl3, PAGE_SIZE,
8761 DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
8762 VM_MEMATTR_WRITE_BACK);
8763 l3c_start = (pt_entry_t *)(tmpl3 +
8764 ((vm_offset_t)l3c_start & PAGE_MASK));
8765 l3c_end = (pt_entry_t *)(tmpl3 +
8766 ((vm_offset_t)l3c_end & PAGE_MASK));
8767 }
8768 mask = 0;
8769 nbits = ATTR_DESCR_VALID;
8770 intr = intr_disable();
8771
8772 /*
8773 * Break the mappings.
8774 */
8775 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8776 /*
8777 * Clear the mapping's contiguous and valid bits, but leave
8778 * the rest of the entry unchanged, so that a lockless,
8779 * concurrent pmap_kextract() can still lookup the physical
8780 * address.
8781 */
8782 l3e = pmap_load(tl3p);
8783 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8784 ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
8785 KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8786 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8787 ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
8788 while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
8789 ATTR_DESCR_VALID)))
8790 cpu_spinwait();
8791
8792 /*
8793 * Hardware accessed and dirty bit maintenance might only
8794 * update a single L3 entry, so we must combine the accessed
8795 * and dirty bits from this entire set of contiguous L3
8796 * entries.
8797 */
8798 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8799 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8800 mask = ATTR_S1_AP_RW_BIT;
8801 nbits |= l3e & ATTR_AF;
8802 }
8803 if ((nbits & ATTR_AF) != 0) {
8804 pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
8805 ~L3C_OFFSET, true);
8806 }
8807
8808 /*
8809 * Remake the mappings, updating the accessed and dirty bits.
8810 */
8811 l3e = (pmap_load(l3c_start) & ~mask) | nbits;
8812 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8813 pmap_store(tl3p, l3e);
8814 l3e += L3_SIZE;
8815 }
8816 dsb(ishst);
8817
8818 intr_restore(intr);
8819 if (tmpl3 != 0) {
8820 pmap_kremove(tmpl3);
8821 kva_free(tmpl3, PAGE_SIZE);
8822 }
8823 counter_u64_add(pmap_l3c_demotions, 1);
8824 CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
8825 va, pmap);
8826 return (true);
8827 }
8828
8829 /*
8830 * Accumulate the accessed and dirty bits within a L3C superpage and
8831 * return the specified PTE with them applied correctly.
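 *
 * A per-entry scan is required because hardware accessed and dirty bit
 * updates may land in any single one of the contiguous L3 entries, so
 * reading only *l3p could under-report the superpage's state.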
8832 */
8833 static pt_entry_t
8834 pmap_load_l3c(pt_entry_t *l3p)
8835 {
8836 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8837
8838 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8839 sizeof(pt_entry_t)) - 1));
8840 l3c_end = l3c_start + L3C_ENTRIES;
8841 mask = 0;
8842 nbits = 0;
8843 /* Iterate over each mapping in the superpage. */
8844 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8845 l3e = pmap_load(tl3p);
8846 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8847 ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
8848 /* Update mask if the current page has its dirty bit set. */
8849 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8850 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8851 mask = ATTR_S1_AP_RW_BIT;
8852 /* Update nbits if the accessed bit is set. */
8853 nbits |= l3e & ATTR_AF;
8854 }
8855 return ((pmap_load(l3p) & ~mask) | nbits);
8856 }
8857
8858 /*
8859 * Perform the pmap work for mincore(2). If the page is not both referenced and
8860 * modified by this pmap, return its physical address so that the caller can
8861 * find other mappings.
8862 */
8863 int
8864 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
8865 {
8866 pt_entry_t *pte, tpte;
8867 vm_paddr_t mask, pa;
8868 int lvl, psind, val;
8869 bool managed;
8870
8871 PMAP_ASSERT_STAGE1(pmap);
8872 PMAP_LOCK(pmap);
8873 pte = pmap_pte(pmap, addr, &lvl);
8874 if (pte != NULL) {
8875 tpte = pmap_load(pte);
8876
8877 switch (lvl) {
8878 case 3:
8879 mask = L3_OFFSET;
8880 psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 1 : 0;
8881 break;
8882 case 2:
8883 mask = L2_OFFSET;
8884 psind = 2;
8885 break;
8886 case 1:
8887 mask = L1_OFFSET;
8888 psind = 3;
8889 break;
8890 default:
8891 panic("pmap_mincore: invalid level %d", lvl);
8892 }
8893
8894 managed = (tpte & ATTR_SW_MANAGED) != 0;
8895 val = MINCORE_INCORE | MINCORE_PSIND(psind);
8896 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
8897 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
8898 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8899 if ((tpte & ATTR_AF) == ATTR_AF)
8900 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8901
8902 pa = PTE_TO_PHYS(tpte) | (addr & mask);
8903 } else {
8904 managed = false;
8905 val = 0;
8906 }
8907
8908 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8909 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
8910 *pap = pa;
8911 }
8912 PMAP_UNLOCK(pmap);
8913 return (val);
8914 }
8915
8916 /*
8917 * Garbage collect every ASID that is neither active on a processor nor
8918 * reserved.
8919 */
8920 static void
8921 pmap_reset_asid_set(pmap_t pmap)
8922 {
8923 pmap_t curpmap;
8924 int asid, cpuid, epoch;
8925 struct asid_set *set;
8926 enum pmap_stage stage;
8927
8928 set = pmap->pm_asid_set;
8929 stage = pmap->pm_stage;
8930
8932 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8933 mtx_assert(&set->asid_set_mutex, MA_OWNED);
8934
8935 /*
8936 * Ensure that the store to asid_epoch is globally visible before the
8937 * loads from pc_curpmap are performed.
8938 */
8939 epoch = set->asid_epoch + 1;
8940 if (epoch == INT_MAX)
8941 epoch = 0;
8942 set->asid_epoch = epoch;
8943 dsb(ishst);
8944 if (stage == PM_STAGE1) {
8945 __asm __volatile("tlbi vmalle1is");
8946 } else {
8947 KASSERT(pmap_clean_stage2_tlbi != NULL,
8948 ("%s: Unset stage 2 tlb invalidation callback\n",
8949 __func__));
8950 pmap_clean_stage2_tlbi();
8951 }
8952 dsb(ish);
8953 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
8954 set->asid_set_size - 1);
8955 CPU_FOREACH(cpuid) {
8956 if (cpuid == curcpu)
8957 continue;
8958 if (stage == PM_STAGE1) {
8959 curpmap = pcpu_find(cpuid)->pc_curpmap;
8960 PMAP_ASSERT_STAGE1(pmap);
8961 } else {
8962 curpmap = pcpu_find(cpuid)->pc_curvmpmap;
8963 if (curpmap == NULL)
8964 continue;
8965 PMAP_ASSERT_STAGE2(pmap);
8966 }
8967 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
8968 asid = COOKIE_TO_ASID(curpmap->pm_cookie);
8969 if (asid == -1)
8970 continue;
8971 bit_set(set->asid_set, asid);
8972 curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
8973 }
8974 }
8975
8976 /*
8977 * Allocate a new ASID for the specified pmap.
8978 */
8979 static void
8980 pmap_alloc_asid(pmap_t pmap)
8981 {
8982 struct asid_set *set;
8983 int new_asid;
8984
8985 set = pmap->pm_asid_set;
8986 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8987
8988 mtx_lock_spin(&set->asid_set_mutex);
8989
8990 /*
8991 * While this processor was waiting to acquire the asid set mutex,
8992 * pmap_reset_asid_set() running on another processor might have
8993 * updated this pmap's cookie to the current epoch, in which case we
8994 * don't need to allocate a new ASID.
8995 */
8996 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
8997 goto out;
8998
8999 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
9000 &new_asid);
9001 if (new_asid == -1) {
9002 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
9003 set->asid_next, &new_asid);
9004 if (new_asid == -1) {
9005 pmap_reset_asid_set(pmap);
9006 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
9007 set->asid_set_size, &new_asid);
9008 KASSERT(new_asid != -1, ("ASID allocation failure"));
9009 }
9010 }
9011 bit_set(set->asid_set, new_asid);
9012 set->asid_next = new_asid + 1;
9013 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
9014 out:
9015 mtx_unlock_spin(&set->asid_set_mutex);
9016 }
9017
9018 static uint64_t __read_mostly ttbr_flags;
9019
9020 /*
9021 * Compute the value that should be stored in ttbr0 to activate the specified
9022 * pmap. This value may change from time to time.
9023 */
9024 uint64_t
9025 pmap_to_ttbr0(pmap_t pmap)
9026 {
9027 uint64_t ttbr;
9028
9029 ttbr = pmap->pm_ttbr;
9030 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
9031 ttbr |= ttbr_flags;
9032
9033 return (ttbr);
9034 }
9035
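/*
 * smp_rendezvous() callback used by pmap_init_cnp(). The initiating CPU
 * records TTBR_CnP in ttbr_flags so that later calls to pmap_to_ttbr0()
 * include it; every CPU then sets the Common not Private (CnP) bit in its
 * ttbr0_el1 and ttbr1_el1 and invalidates its TLB.
 */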
9036 static void
9037 pmap_set_cnp(void *arg)
9038 {
9039 uint64_t ttbr0, ttbr1;
9040 u_int cpuid;
9041
9042 cpuid = *(u_int *)arg;
9043 if (cpuid == curcpu) {
9044 /*
9045 * Set the flags while all CPUs are handling the
9046 * smp_rendezvous, so they will not call pmap_to_ttbr0. Any calls
9047 * to pmap_to_ttbr0 after this will have the CnP flag set.
9048 * The dsb after invalidating the TLB will act as a barrier
9049 * to ensure all CPUs can observe this change.
9050 */
9051 ttbr_flags |= TTBR_CnP;
9052 }
9053
9054 ttbr0 = READ_SPECIALREG(ttbr0_el1);
9055 ttbr0 |= TTBR_CnP;
9056
9057 ttbr1 = READ_SPECIALREG(ttbr1_el1);
9058 ttbr1 |= TTBR_CnP;
9059
9060 /* Update ttbr{0,1}_el1 with the CnP flag */
9061 WRITE_SPECIALREG(ttbr0_el1, ttbr0);
9062 WRITE_SPECIALREG(ttbr1_el1, ttbr1);
9063 isb();
9064 __asm __volatile("tlbi vmalle1is");
9065 dsb(ish);
9066 isb();
9067 }
9068
9069 /*
9070 * Defer enabling some features until we have read the ID registers to know
9071 * if they are supported on all CPUs.
9072 */
9073 static void
9074 pmap_init_mp(void *dummy __unused)
9075 {
9076 uint64_t reg;
9077
9078 if (get_kernel_reg(ID_AA64PFR1_EL1, &reg)) {
9079 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
9080 if (bootverbose)
9081 printf("Enabling BTI\n");
9082 pmap_bti_support = true;
9083
9084 pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
9085 sizeof(struct rs_el), NULL, NULL, NULL, NULL,
9086 UMA_ALIGN_PTR, 0);
9087 }
9088 }
9089 }
9090 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
9091
9092 /*
9093 * Defer enabling CnP until we have read the ID registers to know if it's
9094 * supported on all CPUs.
9095 */
9096 static void
9097 pmap_init_cnp(void *dummy __unused)
9098 {
9099 uint64_t reg;
9100 u_int cpuid;
9101
9102 if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
9103 return;
9104
9105 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
9106 if (bootverbose)
9107 printf("Enabling CnP\n");
9108 cpuid = curcpu;
9109 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
9110 }
9112 }
9113 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
9114
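/*
 * Activate the given pmap on the current CPU. Returns false if the pmap is
 * already active. Otherwise, records it as the current stage 1 or stage 2
 * pmap, allocates a fresh ASID if the pmap's cookie belongs to an old epoch,
 * loads ttbr0 for stage 1 pmaps, and returns true.
 */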
9115 static bool
9116 pmap_activate_int(pmap_t pmap)
9117 {
9118 struct asid_set *set;
9119 int epoch;
9120
9121 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
9122 KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
9123
9124 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
9125 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
9126 /*
9127 * Handle the possibility that the old thread was preempted
9128 * after an "ic" or "tlbi" instruction but before it performed
9129 * a "dsb" instruction. If the old thread migrates to a new
9130 * processor, its completion of a "dsb" instruction on that
9131 * new processor does not guarantee that the "ic" or "tlbi"
9132 * instructions performed on the old processor have completed.
9133 */
9134 dsb(ish);
9135 return (false);
9136 }
9137
9138 set = pmap->pm_asid_set;
9139 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
9140
9141 /*
9142 * Ensure that the store to curpmap is globally visible before the
9143 * load from asid_epoch is performed.
9144 */
9145 if (pmap->pm_stage == PM_STAGE1)
9146 PCPU_SET(curpmap, pmap);
9147 else
9148 PCPU_SET(curvmpmap, pmap);
9149 dsb(ish);
9150 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
9151 if (epoch >= 0 && epoch != set->asid_epoch)
9152 pmap_alloc_asid(pmap);
9153
9154 if (pmap->pm_stage == PM_STAGE1) {
9155 set_ttbr0(pmap_to_ttbr0(pmap));
9156 if (PCPU_GET(bcast_tlbi_workaround) != 0)
9157 invalidate_local_icache();
9158 }
9159 return (true);
9160 }
9161
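/*
 * Activate a stage 2 pmap on the current CPU on behalf of a virtual machine.
 */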
9162 void
9163 pmap_activate_vm(pmap_t pmap)
9164 {
9165
9166 PMAP_ASSERT_STAGE2(pmap);
9167
9168 (void)pmap_activate_int(pmap);
9169 }
9170
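/*
 * Make the stage 1 pmap of the given thread's process the active user pmap
 * on the current CPU.
 */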
9171 void
9172 pmap_activate(struct thread *td)
9173 {
9174 pmap_t pmap;
9175
9176 pmap = vmspace_pmap(td->td_proc->p_vmspace);
9177 PMAP_ASSERT_STAGE1(pmap);
9178 critical_enter();
9179 (void)pmap_activate_int(pmap);
9180 critical_exit();
9181 }
9182
9183 /*
9184 * Activate the thread we are switching to.
9185 * To simplify the assembly in cpu_throw, return the new thread's pcb.
9186 */
9187 struct pcb *
9188 pmap_switch(struct thread *new)
9189 {
9190 pcpu_bp_harden bp_harden;
9191 struct pcb *pcb;
9192
9193 /* Store the new curthread */
9194 PCPU_SET(curthread, new);
9195
9196 /* And the new pcb */
9197 pcb = new->td_pcb;
9198 PCPU_SET(curpcb, pcb);
9199
9200 /*
9201 * TODO: We may need to flush the cache here if switching
9202 * to a user process.
9203 */
9204
9205 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
9206 /*
9207 * Stop userspace from training the branch predictor against
9208 * other processes. This will call into a CPU specific
9209 * function that clears the branch predictor state.
9210 */
9211 bp_harden = PCPU_GET(bp_harden);
9212 if (bp_harden != NULL)
9213 bp_harden();
9214 }
9215
9216 return (pcb);
9217 }
9218
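/*
 * Synchronize the I-cache with the D-cache for the given range of the pmap's
 * address space. Kernel addresses are synced directly; user addresses are
 * resolved one page at a time and synced through the direct map.
 */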
9219 void
9220 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
9221 {
9222
9223 PMAP_ASSERT_STAGE1(pmap);
9224 KASSERT(ADDR_IS_CANONICAL(va),
9225 ("%s: Address not in canonical form: %lx", __func__, va));
9226
9227 if (ADDR_IS_KERNEL(va)) {
9228 cpu_icache_sync_range((void *)va, sz);
9229 } else {
9230 u_int len, offset;
9231 vm_paddr_t pa;
9232
9233 /* Find the length of data in this page to flush */
9234 offset = va & PAGE_MASK;
9235 len = imin(PAGE_SIZE - offset, sz);
9236
9237 while (sz != 0) {
9238 /* Extract the physical address & find it in the DMAP */
9239 pa = pmap_extract(pmap, va);
9240 if (pa != 0)
9241 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
9242 len);
9243
9244 /* Move to the next page */
9245 sz -= len;
9246 va += len;
9247 /* Set the length for the next iteration */
9248 len = imin(PAGE_SIZE, sz);
9249 }
9250 }
9251 }
9252
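/*
 * Handle a stage 2 translation or access flag fault described by the ESR and
 * FAR values. If the fault can be resolved by setting the access flag (and
 * descriptor valid bit) on an existing PTE, do so, invalidate the I-cache for
 * executable pages, and return KERN_SUCCESS; otherwise return KERN_FAILURE.
 */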
9253 static int
9254 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9255 {
9256 pd_entry_t *pdep;
9257 pt_entry_t *ptep, pte;
9258 int rv, lvl, dfsc;
9259
9260 PMAP_ASSERT_STAGE2(pmap);
9261 rv = KERN_FAILURE;
9262
9263 /* Data and insn aborts use the same encoding for the FSC field. */
9264 dfsc = esr & ISS_DATA_DFSC_MASK;
9265 switch (dfsc) {
9266 case ISS_DATA_DFSC_TF_L0:
9267 case ISS_DATA_DFSC_TF_L1:
9268 case ISS_DATA_DFSC_TF_L2:
9269 case ISS_DATA_DFSC_TF_L3:
9270 PMAP_LOCK(pmap);
9271 pdep = pmap_pde(pmap, far, &lvl);
9272 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
9273 PMAP_UNLOCK(pmap);
9274 break;
9275 }
9276
9277 switch (lvl) {
9278 case 0:
9279 ptep = pmap_l0_to_l1(pdep, far);
9280 break;
9281 case 1:
9282 ptep = pmap_l1_to_l2(pdep, far);
9283 break;
9284 case 2:
9285 ptep = pmap_l2_to_l3(pdep, far);
9286 break;
9287 default:
9288 panic("%s: Invalid pde level %d", __func__, lvl);
9289 }
9290 goto fault_exec;
9291
9292 case ISS_DATA_DFSC_AFF_L1:
9293 case ISS_DATA_DFSC_AFF_L2:
9294 case ISS_DATA_DFSC_AFF_L3:
9295 PMAP_LOCK(pmap);
9296 ptep = pmap_pte(pmap, far, &lvl);
9297 fault_exec:
9298 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
9299 /*
9300 * If accessing an executable page invalidate
9301 * the I-cache so it will be valid when we
9302 * continue execution in the guest. The D-cache
9303 * is assumed to already be clean to the Point
9304 * of Coherency.
9305 */
9306 if ((pte & ATTR_S2_XN_MASK) !=
9307 ATTR_S2_XN(ATTR_S2_XN_NONE)) {
9308 invalidate_icache();
9309 }
9310 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
9311 rv = KERN_SUCCESS;
9312 }
9313 PMAP_UNLOCK(pmap);
9314 break;
9315 }
9316
9317 return (rv);
9318 }
9319
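/*
 * Attempt to resolve an instruction or data abort in the pmap layer. This
 * handles access flag faults, permission faults on dirty-bit-managed
 * (ATTR_SW_DBM) mappings, and transient translation faults produced by a
 * concurrent break-before-make sequence. Returns KERN_SUCCESS if the fault
 * was handled and the faulting access can simply be retried.
 */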
9320 int
9321 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9322 {
9323 pt_entry_t pte, *ptep;
9324 register_t intr;
9325 uint64_t ec, par;
9326 int lvl, rv;
9327
9328 rv = KERN_FAILURE;
9329
9330 ec = ESR_ELx_EXCEPTION(esr);
9331 switch (ec) {
9332 case EXCP_INSN_ABORT_L:
9333 case EXCP_INSN_ABORT:
9334 case EXCP_DATA_ABORT_L:
9335 case EXCP_DATA_ABORT:
9336 break;
9337 default:
9338 return (rv);
9339 }
9340
9341 if (pmap->pm_stage == PM_STAGE2)
9342 return (pmap_stage2_fault(pmap, esr, far));
9343
9344 /* Data and insn aborts use the same encoding for the FSC field. */
9345 switch (esr & ISS_DATA_DFSC_MASK) {
9346 case ISS_DATA_DFSC_AFF_L1:
9347 case ISS_DATA_DFSC_AFF_L2:
9348 case ISS_DATA_DFSC_AFF_L3:
9349 PMAP_LOCK(pmap);
9350 ptep = pmap_pte(pmap, far, &lvl);
9351 if (ptep != NULL) {
9352 pmap_set_bits(ptep, ATTR_AF);
9353 rv = KERN_SUCCESS;
9354 /*
9355 * XXXMJ as an optimization we could mark the entry
9356 * dirty if this is a write fault.
9357 */
9358 }
9359 PMAP_UNLOCK(pmap);
9360 break;
9361 case ISS_DATA_DFSC_PF_L1:
9362 case ISS_DATA_DFSC_PF_L2:
9363 case ISS_DATA_DFSC_PF_L3:
9364 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
9365 (esr & ISS_DATA_WnR) == 0)
9366 return (rv);
9367 PMAP_LOCK(pmap);
9368 ptep = pmap_pte(pmap, far, &lvl);
9369 if (ptep != NULL &&
9370 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
9371 if ((pte & ATTR_S1_AP_RW_BIT) ==
9372 ATTR_S1_AP(ATTR_S1_AP_RO)) {
9373 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
9374 pmap_s1_invalidate_page(pmap, far, true);
9375 }
9376 rv = KERN_SUCCESS;
9377 }
9378 PMAP_UNLOCK(pmap);
9379 break;
9380 case ISS_DATA_DFSC_TF_L0:
9381 case ISS_DATA_DFSC_TF_L1:
9382 case ISS_DATA_DFSC_TF_L2:
9383 case ISS_DATA_DFSC_TF_L3:
9384 /*
9385 * Retry the translation. A break-before-make sequence can
9386 * produce a transient fault.
9387 */
9388 if (pmap == kernel_pmap) {
9389 /*
9390 * The translation fault may have occurred within a
9391 * critical section. Therefore, we must check the
9392 * address without acquiring the kernel pmap's lock.
9393 */
9394 if (pmap_klookup(far, NULL))
9395 rv = KERN_SUCCESS;
9396 } else {
9397 bool owned;
9398
9399 /*
9400 * In the EFIRT driver we lock the pmap before
9401 * calling into the runtime service. As the lock
9402 * is already owned by the current thread, skip
9403 * locking it again.
9404 */
9405 owned = PMAP_OWNED(pmap);
9406 if (!owned)
9407 PMAP_LOCK(pmap);
9408 /* Ask the MMU to check the address. */
9409 intr = intr_disable();
9410 par = arm64_address_translate_s1e0r(far);
9411 intr_restore(intr);
9412 if (!owned)
9413 PMAP_UNLOCK(pmap);
9414
9415 /*
9416 * If the translation was successful, then we can
9417 * return success to the trap handler.
9418 */
9419 if (PAR_SUCCESS(par))
9420 rv = KERN_SUCCESS;
9421 }
9422 break;
9423 }
9424
9425 return (rv);
9426 }
9427
9428 /*
9429 * Increase the starting virtual address of the given mapping if a
9430 * different alignment might result in more superpage mappings.
9431 */
9432 void
9433 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
9434 vm_offset_t *addr, vm_size_t size)
9435 {
9436 vm_offset_t superpage_offset;
9437
9438 if (size < L3C_SIZE)
9439 return;
9440 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
9441 offset += ptoa(object->pg_color);
9442
9443 /*
9444 * Considering the object's physical alignment, is the mapping large
9445 * enough to encompass an L2 (2MB/32MB) superpage ...
9446 */
9447 superpage_offset = offset & L2_OFFSET;
9448 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) {
9449 /*
9450 * If the virtual and physical alignments differ, then
9451 * increase the virtual address so that the alignments match.
9452 */
9453 if ((*addr & L2_OFFSET) < superpage_offset)
9454 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
9455 else if ((*addr & L2_OFFSET) > superpage_offset)
9456 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) +
9457 superpage_offset;
9458 return;
9459 }
9460 /* ... or an L3C (64KB/2MB) superpage? */
9461 superpage_offset = offset & L3C_OFFSET;
9462 if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) {
9463 if ((*addr & L3C_OFFSET) < superpage_offset)
9464 *addr = (*addr & ~L3C_OFFSET) + superpage_offset;
9465 else if ((*addr & L3C_OFFSET) > superpage_offset)
9466 *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) +
9467 superpage_offset;
9468 }
9469 }
9470
9471 /**
9472 * Get the kernel virtual address of a set of physical pages. If there are
9473 * physical addresses not covered by the DMAP, create transient mappings
9474 * that will be removed when calling pmap_unmap_io_transient.
9475 *
9476 * \param page The pages for which the caller wishes to obtain
9477 * kernel virtual addresses.
9478 * \param vaddr On return contains the kernel virtual memory address
9479 * of the pages passed in the page parameter.
9480 * \param count Number of pages passed in.
9481 * \param can_fault true if the thread using the mapped pages can take
9482 * page faults, false otherwise.
9483 *
9484 * \returns true if the caller must call pmap_unmap_io_transient when
9485 * finished or false otherwise.
9486 *
9487 */
9488 bool
9489 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9490 bool can_fault)
9491 {
9492 vm_paddr_t paddr;
9493 bool needs_mapping;
9494 int error __diagused, i;
9495
9496 /*
9497 * Allocate any KVA space that we need; this is done in a separate
9498 * loop to prevent calling vmem_alloc while pinned.
9499 */
9500 needs_mapping = false;
9501 for (i = 0; i < count; i++) {
9502 paddr = VM_PAGE_TO_PHYS(page[i]);
9503 if (__predict_false(!PHYS_IN_DMAP(paddr))) {
9504 error = vmem_alloc(kernel_arena, PAGE_SIZE,
9505 M_BESTFIT | M_WAITOK, &vaddr[i]);
9506 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
9507 needs_mapping = true;
9508 } else {
9509 vaddr[i] = PHYS_TO_DMAP(paddr);
9510 }
9511 }
9512
9513 /* Exit early if everything is covered by the DMAP */
9514 if (!needs_mapping)
9515 return (false);
9516
9517 if (!can_fault)
9518 sched_pin();
9519 for (i = 0; i < count; i++) {
9520 paddr = VM_PAGE_TO_PHYS(page[i]);
9521 if (!PHYS_IN_DMAP(paddr)) {
9522 panic(
9523 "pmap_map_io_transient: TODO: Map out of DMAP data");
9524 }
9525 }
9526
9527 return (needs_mapping);
9528 }
9529
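/*
 * Release the transient mappings created by pmap_map_io_transient(). This
 * only needs to be called when that function returned true. An illustrative
 * caller pattern (a sketch, not code taken from this file):
 *
 *	mapped = pmap_map_io_transient(pages, vaddrs, npages, false);
 *	... access the pages through vaddrs[] ...
 *	if (mapped)
 *		pmap_unmap_io_transient(pages, vaddrs, npages, false);
 */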
9530 void
9531 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9532 bool can_fault)
9533 {
9534 vm_paddr_t paddr;
9535 int i;
9536
9537 if (!can_fault)
9538 sched_unpin();
9539 for (i = 0; i < count; i++) {
9540 paddr = VM_PAGE_TO_PHYS(page[i]);
9541 if (!PHYS_IN_DMAP(paddr)) {
9542 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
9543 }
9544 }
9545 }
9546
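/*
 * Return true if the given memory attribute can be expressed by this pmap
 * implementation in a page table entry.
 */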
9547 bool
9548 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
9549 {
9550
9551 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
9552 }
9553
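/*
 * Callbacks used by a pmap's pm_bti rangeset to duplicate and free the nodes
 * that track BTI (guarded page) address ranges.
 */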
9554 static void *
9555 bti_dup_range(void *ctx __unused, void *data)
9556 {
9557 struct rs_el *node, *new_node;
9558
9559 new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9560 if (new_node == NULL)
9561 return (NULL);
9562 node = data;
9563 memcpy(new_node, node, sizeof(*node));
9564 return (new_node);
9565 }
9566
9567 static void
9568 bti_free_range(void *ctx __unused, void *node)
9569 {
9570
9571 uma_zfree(pmap_bti_ranges_zone, node);
9572 }
9573
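/*
 * Record that the range [sva, eva) should be mapped as BTI guarded pages by
 * inserting it into the pmap's pm_bti rangeset.
 */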
9574 static int
9575 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9576 {
9577 struct rs_el *rs;
9578 int error;
9579
9580 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9581 PMAP_ASSERT_STAGE1(pmap);
9582 MPASS(pmap->pm_bti != NULL);
9583 rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9584 if (rs == NULL)
9585 return (ENOMEM);
9586 error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
9587 if (error != 0)
9588 uma_zfree(pmap_bti_ranges_zone, rs);
9589 return (error);
9590 }
9591
9592 static void
9593 pmap_bti_deassign_all(pmap_t pmap)
9594 {
9595
9596 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9597 if (pmap->pm_bti != NULL)
9598 rangeset_remove_all(pmap->pm_bti);
9599 }
9600
9601 /*
9602 * Returns true if the BTI setting is the same across the specified address
9603 * range, and false otherwise. When returning true, updates the referenced PTE
9604 * to reflect the BTI setting.
9605 *
9606 * Only stage 1 pmaps support BTI. The kernel pmap is always a stage 1 pmap
9607 * that has the same BTI setting implicitly across its entire address range.
9608 */
9609 static bool
9610 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
9611 {
9612 struct rs_el *rs;
9613 vm_offset_t va;
9614
9615 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9616 KASSERT(ADDR_IS_CANONICAL(sva),
9617 ("%s: Start address not in canonical form: %lx", __func__, sva));
9618 KASSERT(ADDR_IS_CANONICAL(eva),
9619 ("%s: End address not in canonical form: %lx", __func__, eva));
9620 KASSERT((*pte & ATTR_S1_GP) == 0,
9621 ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte));
9622
9623 if (pmap == kernel_pmap) {
9624 *pte |= ATTR_KERN_GP;
9625 return (true);
9626 }
9627 if (pmap->pm_bti == NULL)
9628 return (true);
9629 PMAP_ASSERT_STAGE1(pmap);
9630 rs = rangeset_containing(pmap->pm_bti, sva);
9631 if (rs == NULL)
9632 return (rangeset_empty(pmap->pm_bti, sva, eva));
9633 while ((va = rs->re_end) < eva) {
9634 if ((rs = rangeset_beginning(pmap->pm_bti, va)) == NULL)
9635 return (false);
9636 }
9637 *pte |= ATTR_S1_GP;
9638 return (true);
9639 }
9640
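/*
 * Return the guarded page (GP) bit to set in a new PTE for the given virtual
 * address: kernel mappings always use ATTR_KERN_GP, user mappings use
 * ATTR_S1_GP only when the address lies within a registered BTI range, and
 * stage 2 pmaps never set it.
 */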
9641 static pt_entry_t
9642 pmap_pte_bti(pmap_t pmap, vm_offset_t va)
9643 {
9644 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9645 MPASS(ADDR_IS_CANONICAL(va));
9646
9647 if (pmap->pm_stage != PM_STAGE1)
9648 return (0);
9649 if (pmap == kernel_pmap)
9650 return (ATTR_KERN_GP);
9651 if (pmap->pm_bti != NULL &&
9652 rangeset_containing(pmap->pm_bti, va) != NULL)
9653 return (ATTR_S1_GP);
9654 return (0);
9655 }
9656
9657 static void
9658 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9659 {
9660
9661 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9662 if (pmap->pm_bti != NULL)
9663 rangeset_remove(pmap->pm_bti, sva, eva);
9664 }
9665
9666 static int
9667 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
9668 {
9669
9670 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
9671 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
9672 MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
9673 MPASS(src_pmap->pm_bti != NULL);
9674 MPASS(dst_pmap->pm_bti != NULL);
9675 if (src_pmap->pm_bti->rs_data_ctx == NULL)
9676 return (0);
9677 return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
9678 }
9679
9680 static void
9681 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
9682 {
9683 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9684 PMAP_ASSERT_STAGE1(pmap);
9685
9686 pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
9687 true);
9688 }
9689
9690 int
9691 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9692 {
9693 int error;
9694
9695 if (pmap->pm_bti == NULL)
9696 return (0);
9697 if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
9698 return (EINVAL);
9699 if (pmap->pm_stage != PM_STAGE1)
9700 return (EINVAL);
9701 if (eva <= sva || ADDR_IS_KERNEL(eva))
9702 return (EFAULT);
9703
9704 sva = trunc_page(sva);
9705 eva = round_page(eva);
9706 for (;;) {
9707 PMAP_LOCK(pmap);
9708 error = pmap_bti_assign(pmap, sva, eva);
9709 if (error == 0)
9710 pmap_bti_update_range(pmap, sva, eva, true);
9711 PMAP_UNLOCK(pmap);
9712 if (error != ENOMEM)
9713 break;
9714 vm_wait(NULL);
9715 }
9716 return (error);
9717 }
9718
9719 #if defined(KASAN) || defined(KMSAN)
9720 static pd_entry_t *pmap_san_early_l2;
9721
9722 #define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE)
9723 #define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE)
9724 static vm_offset_t __nosanitizeaddress
9725 pmap_san_enter_bootstrap_alloc_l2(void)
9726 {
9727 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
9728 static size_t offset = 0;
9729 vm_offset_t addr;
9730
9731 if (offset + L2_SIZE > sizeof(bootstrap_data)) {
9732 panic("%s: out of memory for the bootstrap shadow map L2 entries",
9733 __func__);
9734 }
9735
9736 addr = (uintptr_t)&bootstrap_data[offset];
9737 offset += L2_SIZE;
9738 return (addr);
9739 }
9740
9741 /*
9742 * SAN L1 + L2 pages, maybe L3 entries later?
9743 */
9744 static vm_offset_t __nosanitizeaddress
9745 pmap_san_enter_bootstrap_alloc_pages(int npages)
9746 {
9747 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
9748 static size_t offset = 0;
9749 vm_offset_t addr;
9750
9751 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
9752 panic("%s: out of memory for the bootstrap shadow map",
9753 __func__);
9754 }
9755
9756 addr = (uintptr_t)&bootstrap_data[offset];
9757 offset += (npages * PAGE_SIZE);
9758 return (addr);
9759 }
9760
9761 static void __nosanitizeaddress
9762 pmap_san_enter_bootstrap(void)
9763 {
9764 vm_offset_t freemempos;
9765
9766 /* L1, L2 */
9767 freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
9768 bs_state.freemempos = freemempos;
9769 bs_state.va = KASAN_MIN_ADDRESS;
9770 pmap_bootstrap_l1_table(&bs_state);
9771 pmap_san_early_l2 = bs_state.l2;
9772 }
9773
9774 static vm_page_t
9775 pmap_san_enter_alloc_l3(void)
9776 {
9777 vm_page_t m;
9778
9779 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
9780 VM_ALLOC_ZERO);
9781 if (m == NULL)
9782 panic("%s: no memory to grow shadow map", __func__);
9783 return (m);
9784 }
9785
9786 static vm_page_t
9787 pmap_san_enter_alloc_l2(void)
9788 {
9789 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
9790 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
9791 }
9792
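/*
 * Ensure that the shadow map has a valid mapping for the page containing va,
 * allocating page table pages and shadow pages as needed. Before
 * pmap_bootstrap() has run (virtual_avail == 0), a statically allocated
 * bootstrap shadow map is used instead.
 */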
9793 void __nosanitizeaddress __nosanitizememory
9794 pmap_san_enter(vm_offset_t va)
9795 {
9796 pd_entry_t *l1, *l2;
9797 pt_entry_t *l3;
9798 vm_page_t m;
9799
9800 if (virtual_avail == 0) {
9801 vm_offset_t block;
9802 int slot;
9803 bool first;
9804
9805 /* Temporary shadow map prior to pmap_bootstrap(). */
9806 first = pmap_san_early_l2 == NULL;
9807 if (first)
9808 pmap_san_enter_bootstrap();
9809
9810 l2 = pmap_san_early_l2;
9811 slot = pmap_l2_index(va);
9812
9813 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
9814 MPASS(first);
9815 block = pmap_san_enter_bootstrap_alloc_l2();
9816 pmap_store(&l2[slot],
9817 PHYS_TO_PTE(pmap_early_vtophys(block)) |
9818 PMAP_SAN_PTE_BITS | L2_BLOCK);
9819 dmb(ishst);
9820 }
9821
9822 return;
9823 }
9824
9825 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
9826 l1 = pmap_l1(kernel_pmap, va);
9827 MPASS(l1 != NULL);
9828 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
9829 m = pmap_san_enter_alloc_l3();
9830 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
9831 }
9832 l2 = pmap_l1_to_l2(l1, va);
9833 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
9834 m = pmap_san_enter_alloc_l2();
9835 if (m != NULL) {
9836 pmap_store(l2, VM_PAGE_TO_PTE(m) |
9837 PMAP_SAN_PTE_BITS | L2_BLOCK);
9838 } else {
9839 m = pmap_san_enter_alloc_l3();
9840 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
9841 }
9842 dmb(ishst);
9843 }
9844 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
9845 return;
9846 l3 = pmap_l2_to_l3(l2, va);
9847 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
9848 return;
9849 m = pmap_san_enter_alloc_l3();
9850 pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
9851 dmb(ishst);
9852 }
9853 #endif /* KASAN || KMSAN */
9854
9855 /*
9856 * Track a range of the kernel's virtual address space that is contiguous
9857 * in various mapping attributes.
9858 */
9859 struct pmap_kernel_map_range {
9860 vm_offset_t sva;
9861 pt_entry_t attrs;
9862 int l3pages;
9863 int l3contig;
9864 int l2blocks;
9865 int l2contig;
9866 int l1blocks;
9867 };
9868
9869 static void
9870 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
9871 vm_offset_t eva)
9872 {
9873 const char *mode;
9874 int index;
9875
9876 if (eva <= range->sva)
9877 return;
9878
9879 index = range->attrs & ATTR_S1_IDX_MASK;
9880 switch (index) {
9881 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
9882 mode = "DEV-NP";
9883 break;
9884 case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
9885 mode = "DEV";
9886 break;
9887 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
9888 mode = "UC";
9889 break;
9890 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
9891 mode = "WB";
9892 break;
9893 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
9894 mode = "WT";
9895 break;
9896 default:
9897 printf(
9898 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
9899 __func__, index, range->sva, eva);
9900 mode = "??";
9901 break;
9902 }
9903
9904 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n",
9905 range->sva, eva,
9906 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
9907 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
9908 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
9909 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
9910 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
9911 mode, range->l1blocks, range->l2contig, range->l2blocks,
9912 range->l3contig, range->l3pages);
9913
9914 /* Reset to sentinel value. */
9915 range->sva = 0xfffffffffffffffful;
9916 }
9917
9918 /*
9919 * Determine whether the attributes specified by a page table entry match those
9920 * being tracked by the current range.
9921 */
9922 static bool
9923 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
9924 {
9925
9926 return (range->attrs == attrs);
9927 }
9928
9929 static void
9930 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
9931 pt_entry_t attrs)
9932 {
9933
9934 memset(range, 0, sizeof(*range));
9935 range->sva = va;
9936 range->attrs = attrs;
9937 }
9938
9939 /* Get the block/page attributes that correspond to the table attributes */
9940 static pt_entry_t
9941 sysctl_kmaps_table_attrs(pd_entry_t table)
9942 {
9943 pt_entry_t attrs;
9944
9945 attrs = 0;
9946 if ((table & TATTR_UXN_TABLE) != 0)
9947 attrs |= ATTR_S1_UXN;
9948 if ((table & TATTR_PXN_TABLE) != 0)
9949 attrs |= ATTR_S1_PXN;
9950 if ((table & TATTR_AP_TABLE_RO) != 0)
9951 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
9952
9953 return (attrs);
9954 }
9955
9956 /* Read the block/page attributes we care about */
9957 static pt_entry_t
9958 sysctl_kmaps_block_attrs(pt_entry_t block)
9959 {
9960 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
9961 ATTR_S1_GP));
9962 }
9963
9964 /*
9965 * Given a leaf PTE, derive the mapping's attributes. If they do not match
9966 * those of the current run, dump the address range and its attributes, and
9967 * begin a new run.
9968 */
9969 static void
9970 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
9971 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
9972 pt_entry_t l3e)
9973 {
9974 pt_entry_t attrs;
9975
9976 attrs = sysctl_kmaps_table_attrs(l0e);
9977
9978 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9979 attrs |= sysctl_kmaps_block_attrs(l1e);
9980 goto done;
9981 }
9982 attrs |= sysctl_kmaps_table_attrs(l1e);
9983
9984 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9985 attrs |= sysctl_kmaps_block_attrs(l2e);
9986 goto done;
9987 }
9988 attrs |= sysctl_kmaps_table_attrs(l2e);
9989 attrs |= sysctl_kmaps_block_attrs(l3e);
9990
9991 done:
9992 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
9993 sysctl_kmaps_dump(sb, range, va);
9994 sysctl_kmaps_reinit(range, va, attrs);
9995 }
9996 }
9997
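/*
 * Handler for the vm.pmap.kernel_maps sysctl. Walks the kernel page tables
 * and emits one line per run of mappings that share the same attributes,
 * together with counts of the L1 blocks, L2 blocks, contiguous L2 and L3
 * runs, and L3 pages that make up the run.
 */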
9998 static int
9999 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
10000 {
10001 struct pmap_kernel_map_range range;
10002 struct sbuf sbuf, *sb;
10003 pd_entry_t l0e, *l1, l1e, *l2, l2e;
10004 pt_entry_t *l3, l3e;
10005 vm_offset_t sva;
10006 vm_paddr_t pa;
10007 int error, i, j, k, l;
10008
10009 error = sysctl_wire_old_buffer(req, 0);
10010 if (error != 0)
10011 return (error);
10012 sb = &sbuf;
10013 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
10014
10015 /* Sentinel value. */
10016 range.sva = 0xfffffffffffffffful;
10017
10018 /*
10019 * Iterate over the kernel page tables without holding the kernel pmap
10020 * lock. Kernel page table pages are never freed, so at worst we will
10021 * observe inconsistencies in the output.
10022 */
10023 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
10024 i++) {
10025 if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
10026 sbuf_printf(sb, "\nDirect map:\n");
10027 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
10028 sbuf_printf(sb, "\nKernel map:\n");
10029 #ifdef KASAN
10030 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
10031 sbuf_printf(sb, "\nKASAN shadow map:\n");
10032 #endif
10033 #ifdef KMSAN
10034 else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
10035 sbuf_printf(sb, "\nKMSAN shadow map:\n");
10036 else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
10037 sbuf_printf(sb, "\nKMSAN origin map:\n");
10038 #endif
10039
10040 l0e = kernel_pmap->pm_l0[i];
10041 if ((l0e & ATTR_DESCR_VALID) == 0) {
10042 sysctl_kmaps_dump(sb, &range, sva);
10043 sva += L0_SIZE;
10044 continue;
10045 }
10046 pa = PTE_TO_PHYS(l0e);
10047 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
10048
10049 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
10050 l1e = l1[j];
10051 if ((l1e & ATTR_DESCR_VALID) == 0) {
10052 sysctl_kmaps_dump(sb, &range, sva);
10053 sva += L1_SIZE;
10054 continue;
10055 }
10056 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
10057 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
10058 sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
10059 0, 0);
10060 range.l1blocks++;
10061 sva += L1_SIZE;
10062 continue;
10063 }
10064 pa = PTE_TO_PHYS(l1e);
10065 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
10066
10067 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
10068 l2e = l2[k];
10069 if ((l2e & ATTR_DESCR_VALID) == 0) {
10070 sysctl_kmaps_dump(sb, &range, sva);
10071 sva += L2_SIZE;
10072 continue;
10073 }
10074 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
10075 sysctl_kmaps_check(sb, &range, sva,
10076 l0e, l1e, l2e, 0);
10077 if ((l2e & ATTR_CONTIGUOUS) != 0)
10078 range.l2contig +=
10079 k % L2C_ENTRIES == 0 ?
10080 1 : 0;
10081 else
10082 range.l2blocks++;
10083 sva += L2_SIZE;
10084 continue;
10085 }
10086 pa = PTE_TO_PHYS(l2e);
10087 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
10088
10089 for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
10090 l++, sva += L3_SIZE) {
10091 l3e = l3[l];
10092 if ((l3e & ATTR_DESCR_VALID) == 0) {
10093 sysctl_kmaps_dump(sb, &range,
10094 sva);
10095 continue;
10096 }
10097 sysctl_kmaps_check(sb, &range, sva,
10098 l0e, l1e, l2e, l3e);
10099 if ((l3e & ATTR_CONTIGUOUS) != 0)
10100 range.l3contig +=
10101 l % L3C_ENTRIES == 0 ?
10102 1 : 0;
10103 else
10104 range.l3pages++;
10105 }
10106 }
10107 }
10108 }
10109
10110 error = sbuf_finish(sb);
10111 sbuf_delete(sb);
10112 return (error);
10113 }
10114 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
10115 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
10116 NULL, 0, sysctl_kmaps, "A",
10117 "Dump kernel address layout");
10118