1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 */
52 /*-
53 * Copyright (c) 2003 Networks Associates Technology, Inc.
54 * All rights reserved.
55 *
56 * This software was developed for the FreeBSD Project by Jake Burkholder,
57 * Safeport Network Services, and Network Associates Laboratories, the
58 * Security Research Division of Network Associates, Inc. under
59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60 * CHATS research program.
61 *
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
64 * are met:
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 * notice, this list of conditions and the following disclaimer in the
69 * documentation and/or other materials provided with the distribution.
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81 * SUCH DAMAGE.
82 */
83
84 #include <sys/cdefs.h>
85 /*
86 * Manages physical address maps.
87 *
88 * Since the information managed by this module is
89 * also stored by the logical address mapping module,
90 * this module may throw away valid virtual-to-physical
91 * mappings at almost any time. However, invalidations
92 * of virtual-to-physical mappings must be done as
93 * requested.
94 *
95 * In order to cope with hardware architectures which
96 * make virtual-to-physical map invalidates expensive,
97 * this module may delay invalidate or reduced protection
98 * operations until such time as they are actually
99 * necessary. This module is given full information as
100 * to which processors are currently using which maps,
101 * and to when physical maps must be made correct.
102 */
103
104 #include "opt_vm.h"
105
106 #include <sys/param.h>
107 #include <sys/asan.h>
108 #include <sys/bitstring.h>
109 #include <sys/bus.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/ktr.h>
113 #include <sys/limits.h>
114 #include <sys/lock.h>
115 #include <sys/malloc.h>
116 #include <sys/mman.h>
117 #include <sys/msan.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/physmem.h>
121 #include <sys/proc.h>
122 #include <sys/rangeset.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147
148 #include <machine/asan.h>
149 #include <machine/cpu_feat.h>
150 #include <machine/machdep.h>
151 #include <machine/md_var.h>
152 #include <machine/pcb.h>
153
154 #ifdef NUMA
155 #define PMAP_MEMDOM MAXMEMDOM
156 #else
157 #define PMAP_MEMDOM 1
158 #endif
159
160 #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1)
161 #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2)
162
163 #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t)))
164 #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t)))
165 #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t)))
166 #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t)))
167
168 #define NUL0E L0_ENTRIES
169 #define NUL1E (NUL0E * NL1PG)
170 #define NUL2E (NUL1E * NL2PG)
171
172 #ifdef PV_STATS
173 #define PV_STAT(x) do { x ; } while (0)
174 #define __pvused
175 #else
176 #define PV_STAT(x) do { } while (0)
177 #define __pvused __unused
178 #endif
179
180 #define pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT))
181 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT))
182 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
183
184 #ifdef __ARM_FEATURE_BTI_DEFAULT
185 pt_entry_t __read_mostly pmap_gp_attr;
186 #define ATTR_KERN_GP pmap_gp_attr
187 #else
188 #define ATTR_KERN_GP 0
189 #endif
190 #define PMAP_SAN_PTE_BITS (ATTR_AF | ATTR_S1_XN | pmap_sh_attr | \
191 ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
192
193 struct pmap_large_md_page {
194 struct rwlock pv_lock;
195 struct md_page pv_page;
196 /* Pad to a power of 2, see pmap_init_pv_table(). */
197 int pv_pad[2];
198 };
199
200 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
201 #define pv_dummy pv_dummy_large.pv_page
202 __read_mostly static struct pmap_large_md_page *pv_table;
203
204 static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)205 _pa_to_pmdp(vm_paddr_t pa)
206 {
207 struct vm_phys_seg *seg;
208
209 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
210 return ((struct pmap_large_md_page *)seg->md_first +
211 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
212 return (NULL);
213 }
214
215 static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)216 pa_to_pmdp(vm_paddr_t pa)
217 {
218 struct pmap_large_md_page *pvd;
219
220 pvd = _pa_to_pmdp(pa);
221 if (pvd == NULL)
222 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
223 return (pvd);
224 }
225
226 static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)227 page_to_pmdp(vm_page_t m)
228 {
229 struct vm_phys_seg *seg;
230
231 seg = &vm_phys_segs[m->segind];
232 return ((struct pmap_large_md_page *)seg->md_first +
233 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
234 }
235
236 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page))
237 #define page_to_pvh(m) (&(page_to_pmdp(m)->pv_page))
238
239 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \
240 struct pmap_large_md_page *_pvd; \
241 struct rwlock *_lock; \
242 _pvd = _pa_to_pmdp(pa); \
243 if (__predict_false(_pvd == NULL)) \
244 _lock = &pv_dummy_large.pv_lock; \
245 else \
246 _lock = &(_pvd->pv_lock); \
247 _lock; \
248 })
249
250 static struct rwlock *
VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)251 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
252 {
253 if ((m->flags & PG_FICTITIOUS) == 0)
254 return (&page_to_pmdp(m)->pv_lock);
255 else
256 return (&pv_dummy_large.pv_lock);
257 }
258
259 #define CHANGE_PV_LIST_LOCK(lockp, new_lock) do { \
260 struct rwlock **_lockp = (lockp); \
261 struct rwlock *_new_lock = (new_lock); \
262 \
263 if (_new_lock != *_lockp) { \
264 if (*_lockp != NULL) \
265 rw_wunlock(*_lockp); \
266 *_lockp = _new_lock; \
267 rw_wlock(*_lockp); \
268 } \
269 } while (0)
270
271 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) \
272 CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))
273
274 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
275 CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))
276
277 #define RELEASE_PV_LIST_LOCK(lockp) do { \
278 struct rwlock **_lockp = (lockp); \
279 \
280 if (*_lockp != NULL) { \
281 rw_wunlock(*_lockp); \
282 *_lockp = NULL; \
283 } \
284 } while (0)
285
286 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
287 #define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
288
289 /*
290 * The presence of this flag indicates that the mapping is writeable.
291 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
292 * it is dirty. This flag may only be set on managed mappings.
293 *
294 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
295 * as a software managed bit.
296 */
297 #define ATTR_SW_DBM ATTR_DBM
298
299 struct pmap kernel_pmap_store;
300
301 /* Used for mapping ACPI memory before VM is initialized */
302 #define PMAP_PREINIT_MAPPING_COUNT 32
303 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
304 static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */
305 static int vm_initialized = 0; /* No need to use pre-init maps when set */
306
307 /*
308 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
309 * Always map entire L2 block for simplicity.
310 * VA of L2 block = preinit_map_va + i * L2_SIZE
311 */
312 static struct pmap_preinit_mapping {
313 vm_paddr_t pa;
314 vm_offset_t va;
315 vm_size_t size;
316 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
317
318 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
319 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
320 vm_offset_t kernel_vm_end = 0;
321
322 /*
323 * Data for the pv entry allocation mechanism.
324 */
325 #ifdef NUMA
326 static __inline int
pc_to_domain(struct pv_chunk * pc)327 pc_to_domain(struct pv_chunk *pc)
328 {
329 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
330 }
331 #else
332 static __inline int
pc_to_domain(struct pv_chunk * pc __unused)333 pc_to_domain(struct pv_chunk *pc __unused)
334 {
335 return (0);
336 }
337 #endif
338
339 struct pv_chunks_list {
340 struct mtx pvc_lock;
341 TAILQ_HEAD(pch, pv_chunk) pvc_list;
342 int active_reclaims;
343 } __aligned(CACHE_LINE_SIZE);
344
345 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
346
347 vm_paddr_t dmap_phys_base; /* The start of the dmap region */
348 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */
349 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */
350
351 extern pt_entry_t pagetable_l0_ttbr1[];
352
353 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
354 static vm_paddr_t physmap[PHYSMAP_SIZE];
355 static u_int physmap_idx;
356
357 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
358 "VM/pmap parameters");
359
360 bool pmap_lpa_enabled __read_mostly = false;
361 pt_entry_t pmap_sh_attr __read_mostly = ATTR_SH(ATTR_SH_IS);
362
363 #if PAGE_SIZE == PAGE_SIZE_4K
364 #define L1_BLOCKS_SUPPORTED 1
365 #else
366 #define L1_BLOCKS_SUPPORTED (pmap_lpa_enabled)
367 #endif
368
369 #define PMAP_ASSERT_L1_BLOCKS_SUPPORTED MPASS(L1_BLOCKS_SUPPORTED)
370
371 static bool pmap_l1_supported __read_mostly = false;
372
373 /*
374 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
375 * that it has currently allocated to a pmap, a cursor ("asid_next") to
376 * optimize its search for a free ASID in the bit vector, and an epoch number
377 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
378 * ASIDs that are not currently active on a processor.
379 *
380 * The current epoch number is always in the range [0, INT_MAX). Negative
381 * numbers and INT_MAX are reserved for special cases that are described
382 * below.
383 */
384 struct asid_set {
385 int asid_bits;
386 bitstr_t *asid_set;
387 int asid_set_size;
388 int asid_next;
389 int asid_epoch;
390 struct mtx asid_set_mutex;
391 };
392
393 static struct asid_set asids;
394 static struct asid_set vmids;
395
396 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
397 "ASID allocator");
398 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
399 "The number of bits in an ASID");
400 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
401 "The last allocated ASID plus one");
402 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
403 "The current epoch number");
404
405 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
406 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
407 "The number of bits in an VMID");
408 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
409 "The last allocated VMID plus one");
410 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
411 "The current epoch number");
412
413 void (*pmap_clean_stage2_tlbi)(void);
414 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
415 void (*pmap_stage2_invalidate_all)(uint64_t);
416
417 /*
418 * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved
419 * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for
420 * dynamically allocated ASIDs have a non-negative epoch number.
421 *
422 * An invalid ASID is represented by -1.
423 *
424 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
425 * which indicates that an ASID should never be allocated to the pmap, and
426 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
427 * allocated when the pmap is next activated.
428 */
429 #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \
430 ((u_long)(epoch) << 32)))
431 #define COOKIE_TO_ASID(cookie) ((int)(cookie))
432 #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32))
433
434 #define TLBI_VA_SHIFT 12
435 #define TLBI_VA_MASK ((1ul << 44) - 1)
436 #define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
437
438 static int __read_frequently superpages_enabled = 1;
439 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
440 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
441 "Are large page mappings enabled?");
442
443 /*
444 * True when Branch Target Identification should be used by userspace. This
445 * allows pmap to mark pages as guarded with ATTR_S1_GP.
446 */
447 __read_mostly static bool pmap_bti_support = false;
448
449 /*
450 * Internal flags for pmap_enter()'s helper functions.
451 */
452 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
453 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
454
455 TAILQ_HEAD(pv_chunklist, pv_chunk);
456
457 static void free_pv_chunk(struct pv_chunk *pc);
458 static void free_pv_chunk_batch(struct pv_chunklist *batch);
459 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
460 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
461 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
462 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
463 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
464 vm_offset_t va);
465
466 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
467 static bool pmap_activate_int(pmap_t pmap);
468 static void pmap_alloc_asid(pmap_t pmap);
469 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
470 vm_prot_t prot, int mode, bool skip_unmapped);
471 static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
472 pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
473 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
474 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
475 vm_offset_t va, struct rwlock **lockp);
476 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
477 static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va);
478 static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
479 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
480 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
481 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
482 u_int flags, vm_page_t m, struct rwlock **lockp);
483 static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
484 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
485 static bool pmap_every_pte_zero(vm_paddr_t pa);
486 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
487 bool all_l3e_AF_set);
488 static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
489 static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
490 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
491 static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
492 struct rwlock **lockp);
493 static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
494 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
495 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
496 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
497 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
498 static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
499 vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
500 struct rwlock **lockp);
501 static void pmap_reset_asid_set(pmap_t pmap);
502 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
503 vm_page_t m, struct rwlock **lockp);
504
505 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
506 struct rwlock **lockp);
507
508 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
509 struct spglist *free);
510 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
511 static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
512 vm_offset_t va, vm_size_t size);
513 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
514
515 static uma_zone_t pmap_bti_ranges_zone;
516 static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
517 pt_entry_t *pte);
518 static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
519 static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
520 static void *bti_dup_range(void *ctx, void *data);
521 static void bti_free_range(void *ctx, void *node);
522 static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
523 static void pmap_bti_deassign_all(pmap_t pmap);
524
525 /*
526 * These load the old table data and store the new value.
527 * They need to be atomic as the System MMU may write to the table at
528 * the same time as the CPU.
529 */
530 #define pmap_clear(table) atomic_store_64(table, 0)
531 #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits)
532 #define pmap_load(table) (*table)
533 #define pmap_load_clear(table) atomic_swap_64(table, 0)
534 #define pmap_load_store(table, entry) atomic_swap_64(table, entry)
535 #define pmap_set_bits(table, bits) atomic_set_64(table, bits)
536 #define pmap_store(table, entry) atomic_store_64(table, entry)
537
538 /********************/
539 /* Inline functions */
540 /********************/
541
542 static __inline void
pagecopy(void * s,void * d)543 pagecopy(void *s, void *d)
544 {
545
546 memcpy(d, s, PAGE_SIZE);
547 }
548
549 static __inline pd_entry_t *
pmap_l0(pmap_t pmap,vm_offset_t va)550 pmap_l0(pmap_t pmap, vm_offset_t va)
551 {
552
553 return (&pmap->pm_l0[pmap_l0_index(va)]);
554 }
555
556 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t * l0,vm_offset_t va)557 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
558 {
559 pd_entry_t *l1;
560
561 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
562 return (&l1[pmap_l1_index(va)]);
563 }
564
565 static __inline pd_entry_t *
pmap_l1(pmap_t pmap,vm_offset_t va)566 pmap_l1(pmap_t pmap, vm_offset_t va)
567 {
568 pd_entry_t *l0;
569
570 l0 = pmap_l0(pmap, va);
571 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
572 return (NULL);
573
574 return (pmap_l0_to_l1(l0, va));
575 }
576
577 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t * l1p,vm_offset_t va)578 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
579 {
580 pd_entry_t l1, *l2p;
581
582 l1 = pmap_load(l1p);
583
584 KASSERT(ADDR_IS_CANONICAL(va),
585 ("%s: Address not in canonical form: %lx", __func__, va));
586 /*
587 * The valid bit may be clear if pmap_update_entry() is concurrently
588 * modifying the entry, so for KVA only the entry type may be checked.
589 */
590 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
591 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
592 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
593 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
594 l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
595 return (&l2p[pmap_l2_index(va)]);
596 }
597
598 static __inline pd_entry_t *
pmap_l2(pmap_t pmap,vm_offset_t va)599 pmap_l2(pmap_t pmap, vm_offset_t va)
600 {
601 pd_entry_t *l1;
602
603 l1 = pmap_l1(pmap, va);
604 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
605 return (NULL);
606
607 return (pmap_l1_to_l2(l1, va));
608 }
609
610 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t * l2p,vm_offset_t va)611 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
612 {
613 pd_entry_t l2;
614 pt_entry_t *l3p;
615
616 l2 = pmap_load(l2p);
617
618 KASSERT(ADDR_IS_CANONICAL(va),
619 ("%s: Address not in canonical form: %lx", __func__, va));
620 /*
621 * The valid bit may be clear if pmap_update_entry() is concurrently
622 * modifying the entry, so for KVA only the entry type may be checked.
623 */
624 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
625 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
626 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
627 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
628 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
629 return (&l3p[pmap_l3_index(va)]);
630 }
631
632 /*
633 * Returns the lowest valid pde for a given virtual address.
634 * The next level may or may not point to a valid page or block.
635 */
636 static __inline pd_entry_t *
pmap_pde(pmap_t pmap,vm_offset_t va,int * level)637 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
638 {
639 pd_entry_t *l0, *l1, *l2, desc;
640
641 l0 = pmap_l0(pmap, va);
642 desc = pmap_load(l0) & ATTR_DESCR_MASK;
643 if (desc != L0_TABLE) {
644 *level = -1;
645 return (NULL);
646 }
647
648 l1 = pmap_l0_to_l1(l0, va);
649 desc = pmap_load(l1) & ATTR_DESCR_MASK;
650 if (desc != L1_TABLE) {
651 *level = 0;
652 return (l0);
653 }
654
655 l2 = pmap_l1_to_l2(l1, va);
656 desc = pmap_load(l2) & ATTR_DESCR_MASK;
657 if (desc != L2_TABLE) {
658 *level = 1;
659 return (l1);
660 }
661
662 *level = 2;
663 return (l2);
664 }
665
666 /*
667 * Returns the lowest valid pte block or table entry for a given virtual
668 * address. If there are no valid entries return NULL and set the level to
669 * the first invalid level.
670 */
671 static __inline pt_entry_t *
pmap_pte(pmap_t pmap,vm_offset_t va,int * level)672 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
673 {
674 pd_entry_t *l1, *l2, desc;
675 pt_entry_t *l3;
676
677 l1 = pmap_l1(pmap, va);
678 if (l1 == NULL) {
679 *level = 0;
680 return (NULL);
681 }
682 desc = pmap_load(l1) & ATTR_DESCR_MASK;
683 if (desc == L1_BLOCK) {
684 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
685 *level = 1;
686 return (l1);
687 }
688
689 if (desc != L1_TABLE) {
690 *level = 1;
691 return (NULL);
692 }
693
694 l2 = pmap_l1_to_l2(l1, va);
695 desc = pmap_load(l2) & ATTR_DESCR_MASK;
696 if (desc == L2_BLOCK) {
697 *level = 2;
698 return (l2);
699 }
700
701 if (desc != L2_TABLE) {
702 *level = 2;
703 return (NULL);
704 }
705
706 *level = 3;
707 l3 = pmap_l2_to_l3(l2, va);
708 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
709 return (NULL);
710
711 return (l3);
712 }
713
714 /*
715 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
716 * level that maps the specified virtual address, then a pointer to that entry
717 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled
718 * and a diagnostic message is provided, in which case this function panics.
719 */
720 static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap,vm_offset_t va,int level,const char * diag)721 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
722 {
723 pd_entry_t *l0p, *l1p, *l2p;
724 pt_entry_t desc, *l3p;
725 int walk_level __diagused;
726
727 KASSERT(level >= 0 && level < 4,
728 ("%s: %s passed an out-of-range level (%d)", __func__, diag,
729 level));
730 l0p = pmap_l0(pmap, va);
731 desc = pmap_load(l0p) & ATTR_DESCR_MASK;
732 if (desc == L0_TABLE && level > 0) {
733 l1p = pmap_l0_to_l1(l0p, va);
734 desc = pmap_load(l1p) & ATTR_DESCR_MASK;
735 if (desc == L1_BLOCK && level == 1) {
736 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
737 return (l1p);
738 }
739 if (desc == L1_TABLE && level > 1) {
740 l2p = pmap_l1_to_l2(l1p, va);
741 desc = pmap_load(l2p) & ATTR_DESCR_MASK;
742 if (desc == L2_BLOCK && level == 2)
743 return (l2p);
744 else if (desc == L2_TABLE && level > 2) {
745 l3p = pmap_l2_to_l3(l2p, va);
746 desc = pmap_load(l3p) & ATTR_DESCR_MASK;
747 if (desc == L3_PAGE && level == 3)
748 return (l3p);
749 else
750 walk_level = 3;
751 } else
752 walk_level = 2;
753 } else
754 walk_level = 1;
755 } else
756 walk_level = 0;
757 KASSERT(diag == NULL,
758 ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
759 diag, va, level, desc, walk_level));
760 return (NULL);
761 }
762
763 bool
pmap_ps_enabled(pmap_t pmap)764 pmap_ps_enabled(pmap_t pmap)
765 {
766 /*
767 * Promotion requires a hypervisor call when the kernel is running
768 * in EL1. To stop this disable superpage support on non-stage 1
769 * pmaps for now.
770 */
771 if (pmap->pm_stage != PM_STAGE1)
772 return (false);
773
774 #ifdef KMSAN
775 /*
776 * The break-before-make in pmap_update_entry() results in a situation
777 * where a CPU may call into the KMSAN runtime while the entry is
778 * invalid. If the entry is used to map the current thread structure,
779 * then the runtime will attempt to access unmapped memory. Avoid this
780 * by simply disabling superpage promotion for the kernel map.
781 */
782 if (pmap == kernel_pmap)
783 return (false);
784 #endif
785
786 return (superpages_enabled != 0);
787 }
788
789 bool
pmap_get_tables(pmap_t pmap,vm_offset_t va,pd_entry_t ** l0,pd_entry_t ** l1,pd_entry_t ** l2,pt_entry_t ** l3)790 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
791 pd_entry_t **l2, pt_entry_t **l3)
792 {
793 pd_entry_t *l0p, *l1p, *l2p;
794
795 if (pmap->pm_l0 == NULL)
796 return (false);
797
798 l0p = pmap_l0(pmap, va);
799 *l0 = l0p;
800
801 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
802 return (false);
803
804 l1p = pmap_l0_to_l1(l0p, va);
805 *l1 = l1p;
806
807 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
808 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
809 *l2 = NULL;
810 *l3 = NULL;
811 return (true);
812 }
813
814 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
815 return (false);
816
817 l2p = pmap_l1_to_l2(l1p, va);
818 *l2 = l2p;
819
820 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
821 *l3 = NULL;
822 return (true);
823 }
824
825 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
826 return (false);
827
828 *l3 = pmap_l2_to_l3(l2p, va);
829
830 return (true);
831 }
832
833 static __inline int
pmap_l3_valid(pt_entry_t l3)834 pmap_l3_valid(pt_entry_t l3)
835 {
836
837 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
838 }
839
840 CTASSERT(L1_BLOCK == L2_BLOCK);
841
842 static pt_entry_t
pmap_pte_memattr(pmap_t pmap,vm_memattr_t memattr)843 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
844 {
845 pt_entry_t val;
846
847 if (pmap->pm_stage == PM_STAGE1) {
848 val = ATTR_S1_IDX(memattr);
849 if (memattr == VM_MEMATTR_DEVICE)
850 val |= ATTR_S1_XN;
851 return (val);
852 }
853
854 val = 0;
855
856 switch (memattr) {
857 case VM_MEMATTR_DEVICE:
858 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
859 ATTR_S2_XN(ATTR_S2_XN_ALL));
860 case VM_MEMATTR_UNCACHEABLE:
861 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
862 case VM_MEMATTR_WRITE_BACK:
863 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
864 case VM_MEMATTR_WRITE_THROUGH:
865 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
866 default:
867 panic("%s: invalid memory attribute %x", __func__, memattr);
868 }
869 }
870
871 static pt_entry_t
pmap_pte_prot(pmap_t pmap,vm_prot_t prot)872 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
873 {
874 pt_entry_t val;
875
876 val = 0;
877 if (pmap->pm_stage == PM_STAGE1) {
878 if ((prot & VM_PROT_EXECUTE) == 0)
879 val |= ATTR_S1_XN;
880 if ((prot & VM_PROT_WRITE) == 0)
881 val |= ATTR_S1_AP(ATTR_S1_AP_RO);
882 } else {
883 if ((prot & VM_PROT_WRITE) != 0)
884 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
885 if ((prot & VM_PROT_READ) != 0)
886 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
887 if ((prot & VM_PROT_EXECUTE) == 0)
888 val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
889 }
890
891 return (val);
892 }
893
894 /*
895 * Checks if the PTE is dirty.
896 */
897 static inline int
pmap_pte_dirty(pmap_t pmap,pt_entry_t pte)898 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
899 {
900
901 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
902
903 if (pmap->pm_stage == PM_STAGE1) {
904 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
905 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
906
907 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
908 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
909 }
910
911 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
912 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
913 }
914
915 static __inline void
pmap_resident_count_inc(pmap_t pmap,int count)916 pmap_resident_count_inc(pmap_t pmap, int count)
917 {
918
919 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
920 pmap->pm_stats.resident_count += count;
921 }
922
923 static __inline void
pmap_resident_count_dec(pmap_t pmap,int count)924 pmap_resident_count_dec(pmap_t pmap, int count)
925 {
926
927 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
928 KASSERT(pmap->pm_stats.resident_count >= count,
929 ("pmap %p resident count underflow %ld %d", pmap,
930 pmap->pm_stats.resident_count, count));
931 pmap->pm_stats.resident_count -= count;
932 }
933
934 static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)935 pmap_early_vtophys(vm_offset_t va)
936 {
937 vm_paddr_t pa_page;
938
939 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
940 return (pa_page | (va & PAR_LOW_MASK));
941 }
942
943 /* State of the bootstrapped DMAP page tables */
944 struct pmap_bootstrap_state {
945 pt_entry_t *l1;
946 pt_entry_t *l2;
947 pt_entry_t *l3;
948 vm_offset_t freemempos;
949 vm_offset_t va;
950 vm_paddr_t pa;
951 pt_entry_t table_attrs;
952 u_int l0_slot;
953 u_int l1_slot;
954 u_int l2_slot;
955 bool dmap_valid;
956 };
957
958 /* The bootstrap state */
959 static struct pmap_bootstrap_state bs_state = {
960 .l1 = NULL,
961 .l2 = NULL,
962 .l3 = NULL,
963 .table_attrs = TATTR_PXN_TABLE,
964 .l0_slot = L0_ENTRIES,
965 .l1_slot = Ln_ENTRIES,
966 .l2_slot = Ln_ENTRIES,
967 .dmap_valid = false,
968 };
969
970 static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state * state)971 pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
972 {
973 vm_paddr_t l1_pa;
974 pd_entry_t l0e;
975 u_int l0_slot;
976
977 /* Link the level 0 table to a level 1 table */
978 l0_slot = pmap_l0_index(state->va);
979 if (l0_slot != state->l0_slot) {
980 /*
981 * Make sure we move from a low address to high address
982 * before the DMAP region is ready. This ensures we never
983 * modify an existing mapping until we can map from a
984 * physical address to a virtual address.
985 */
986 MPASS(state->l0_slot < l0_slot ||
987 state->l0_slot == L0_ENTRIES ||
988 state->dmap_valid);
989
990 /* Reset lower levels */
991 state->l2 = NULL;
992 state->l3 = NULL;
993 state->l1_slot = Ln_ENTRIES;
994 state->l2_slot = Ln_ENTRIES;
995
996 /* Check the existing L0 entry */
997 state->l0_slot = l0_slot;
998 if (state->dmap_valid) {
999 l0e = pagetable_l0_ttbr1[l0_slot];
1000 if ((l0e & ATTR_DESCR_VALID) != 0) {
1001 MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
1002 l1_pa = PTE_TO_PHYS(l0e);
1003 state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
1004 return;
1005 }
1006 }
1007
1008 /* Create a new L0 table entry */
1009 state->l1 = (pt_entry_t *)state->freemempos;
1010 memset(state->l1, 0, PAGE_SIZE);
1011 state->freemempos += PAGE_SIZE;
1012
1013 l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
1014 MPASS((l1_pa & Ln_TABLE_MASK) == 0);
1015 MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
1016 pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
1017 TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
1018 }
1019 KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
1020 }
1021
1022 static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state * state)1023 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
1024 {
1025 vm_paddr_t l2_pa;
1026 pd_entry_t l1e;
1027 u_int l1_slot;
1028
1029 /* Make sure there is a valid L0 -> L1 table */
1030 pmap_bootstrap_l0_table(state);
1031
1032 /* Link the level 1 table to a level 2 table */
1033 l1_slot = pmap_l1_index(state->va);
1034 if (l1_slot != state->l1_slot) {
1035 /* See pmap_bootstrap_l0_table for a description */
1036 MPASS(state->l1_slot < l1_slot ||
1037 state->l1_slot == Ln_ENTRIES ||
1038 state->dmap_valid);
1039
1040 /* Reset lower levels */
1041 state->l3 = NULL;
1042 state->l2_slot = Ln_ENTRIES;
1043
1044 /* Check the existing L1 entry */
1045 state->l1_slot = l1_slot;
1046 if (state->dmap_valid) {
1047 l1e = state->l1[l1_slot];
1048 if ((l1e & ATTR_DESCR_VALID) != 0) {
1049 MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
1050 l2_pa = PTE_TO_PHYS(l1e);
1051 state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
1052 return;
1053 }
1054 }
1055
1056 /* Create a new L1 table entry */
1057 state->l2 = (pt_entry_t *)state->freemempos;
1058 memset(state->l2, 0, PAGE_SIZE);
1059 state->freemempos += PAGE_SIZE;
1060
1061 l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1062 MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1063 MPASS(state->l1[l1_slot] == 0);
1064 pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
1065 state->table_attrs | L1_TABLE);
1066 }
1067 KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1068 }
1069
1070 static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state * state)1071 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1072 {
1073 vm_paddr_t l3_pa;
1074 pd_entry_t l2e;
1075 u_int l2_slot;
1076
1077 /* Make sure there is a valid L1 -> L2 table */
1078 pmap_bootstrap_l1_table(state);
1079
1080 /* Link the level 2 table to a level 3 table */
1081 l2_slot = pmap_l2_index(state->va);
1082 if (l2_slot != state->l2_slot) {
1083 /* See pmap_bootstrap_l0_table for a description */
1084 MPASS(state->l2_slot < l2_slot ||
1085 state->l2_slot == Ln_ENTRIES ||
1086 state->dmap_valid);
1087
1088 /* Check the existing L2 entry */
1089 state->l2_slot = l2_slot;
1090 if (state->dmap_valid) {
1091 l2e = state->l2[l2_slot];
1092 if ((l2e & ATTR_DESCR_VALID) != 0) {
1093 MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1094 l3_pa = PTE_TO_PHYS(l2e);
1095 state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1096 return;
1097 }
1098 }
1099
1100 /* Create a new L2 table entry */
1101 state->l3 = (pt_entry_t *)state->freemempos;
1102 memset(state->l3, 0, PAGE_SIZE);
1103 state->freemempos += PAGE_SIZE;
1104
1105 l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1106 MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1107 MPASS(state->l2[l2_slot] == 0);
1108 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
1109 state->table_attrs | L2_TABLE);
1110 }
1111 KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1112 }
1113
1114 static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state * state,int i)1115 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1116 {
1117 pt_entry_t contig;
1118 u_int l2_slot;
1119 bool first;
1120
1121 if ((physmap[i + 1] - state->pa) < L2_SIZE)
1122 return;
1123
1124 /* Make sure there is a valid L1 table */
1125 pmap_bootstrap_l1_table(state);
1126
1127 MPASS((state->va & L2_OFFSET) == 0);
1128 for (first = true, contig = 0;
1129 state->va < DMAP_MAX_ADDRESS &&
1130 (physmap[i + 1] - state->pa) >= L2_SIZE;
1131 state->va += L2_SIZE, state->pa += L2_SIZE) {
1132 /*
1133 * Stop if we are about to walk off the end of what the
1134 * current L1 slot can address.
1135 */
1136 if (!first && (state->pa & L1_OFFSET) == 0)
1137 break;
1138
1139 /*
1140 * If we have an aligned, contiguous chunk of L2C_ENTRIES
1141 * L2 blocks, set the contiguous bit within each PTE so that
1142 * the chunk can be cached using only one TLB entry.
1143 */
1144 if ((state->pa & L2C_OFFSET) == 0) {
1145 if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
1146 physmap[i + 1] - state->pa >= L2C_SIZE) {
1147 contig = ATTR_CONTIGUOUS;
1148 } else {
1149 contig = 0;
1150 }
1151 }
1152
1153 first = false;
1154 l2_slot = pmap_l2_index(state->va);
1155 MPASS((state->pa & L2_OFFSET) == 0);
1156 MPASS(state->l2[l2_slot] == 0);
1157 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
1158 ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
1159 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
1160 }
1161 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1162 }
1163
1164 static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state * state,int i)1165 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1166 {
1167 pt_entry_t contig;
1168 u_int l3_slot;
1169 bool first;
1170
1171 if (physmap[i + 1] - state->pa < L3_SIZE)
1172 return;
1173
1174 /* Make sure there is a valid L2 table */
1175 pmap_bootstrap_l2_table(state);
1176
1177 MPASS((state->va & L3_OFFSET) == 0);
1178 for (first = true, contig = 0;
1179 state->va < DMAP_MAX_ADDRESS &&
1180 physmap[i + 1] - state->pa >= L3_SIZE;
1181 state->va += L3_SIZE, state->pa += L3_SIZE) {
1182 /*
1183 * Stop if we are about to walk off the end of what the
1184 * current L2 slot can address.
1185 */
1186 if (!first && (state->pa & L2_OFFSET) == 0)
1187 break;
1188
1189 /*
1190 * If we have an aligned, contiguous chunk of L3C_ENTRIES
1191 * L3 pages, set the contiguous bit within each PTE so that
1192 * the chunk can be cached using only one TLB entry.
1193 */
1194 if ((state->pa & L3C_OFFSET) == 0) {
1195 if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
1196 physmap[i + 1] - state->pa >= L3C_SIZE) {
1197 contig = ATTR_CONTIGUOUS;
1198 } else {
1199 contig = 0;
1200 }
1201 }
1202
1203 first = false;
1204 l3_slot = pmap_l3_index(state->va);
1205 MPASS((state->pa & L3_OFFSET) == 0);
1206 MPASS(state->l3[l3_slot] == 0);
1207 pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
1208 ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
1209 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
1210 }
1211 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1212 }
1213
1214 static void
pmap_bootstrap_dmap(void)1215 pmap_bootstrap_dmap(void)
1216 {
1217 int i;
1218
1219 /* Fill in physmap array. */
1220 physmap_idx = physmem_avail(physmap, nitems(physmap));
1221
1222 dmap_phys_base = physmap[0] & ~L1_OFFSET;
1223 dmap_phys_max = 0;
1224 dmap_max_addr = 0;
1225
1226 for (i = 0; i < physmap_idx; i += 2) {
1227 bs_state.pa = physmap[i] & ~L3_OFFSET;
1228 bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1229
1230 /* Create L3 mappings at the start of the region */
1231 if ((bs_state.pa & L2_OFFSET) != 0)
1232 pmap_bootstrap_l3_page(&bs_state, i);
1233 MPASS(bs_state.pa <= physmap[i + 1]);
1234
1235 if (L1_BLOCKS_SUPPORTED) {
1236 /* Create L2 mappings at the start of the region */
1237 if ((bs_state.pa & L1_OFFSET) != 0)
1238 pmap_bootstrap_l2_block(&bs_state, i);
1239 MPASS(bs_state.pa <= physmap[i + 1]);
1240
1241 /* Create the main L1 block mappings */
1242 for (; bs_state.va < DMAP_MAX_ADDRESS &&
1243 (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1244 bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1245 /* Make sure there is a valid L1 table */
1246 pmap_bootstrap_l0_table(&bs_state);
1247 MPASS((bs_state.pa & L1_OFFSET) == 0);
1248 pmap_store(
1249 &bs_state.l1[pmap_l1_index(bs_state.va)],
1250 PHYS_TO_PTE(bs_state.pa) | ATTR_AF |
1251 pmap_sh_attr |
1252 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1253 ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
1254 }
1255 MPASS(bs_state.pa <= physmap[i + 1]);
1256
1257 /* Create L2 mappings at the end of the region */
1258 pmap_bootstrap_l2_block(&bs_state, i);
1259 } else {
1260 while (bs_state.va < DMAP_MAX_ADDRESS &&
1261 (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1262 pmap_bootstrap_l2_block(&bs_state, i);
1263 }
1264 }
1265 MPASS(bs_state.pa <= physmap[i + 1]);
1266
1267 /* Create L3 mappings at the end of the region */
1268 pmap_bootstrap_l3_page(&bs_state, i);
1269 MPASS(bs_state.pa == physmap[i + 1]);
1270
1271 if (bs_state.pa > dmap_phys_max) {
1272 dmap_phys_max = bs_state.pa;
1273 dmap_max_addr = bs_state.va;
1274 }
1275 }
1276
1277 cpu_tlb_flushID();
1278 }
1279
1280 static void
pmap_bootstrap_l2(vm_offset_t va)1281 pmap_bootstrap_l2(vm_offset_t va)
1282 {
1283 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1284
1285 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1286 bs_state.va = va;
1287
1288 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1289 pmap_bootstrap_l1_table(&bs_state);
1290 }
1291
1292 static void
pmap_bootstrap_l3(vm_offset_t va)1293 pmap_bootstrap_l3(vm_offset_t va)
1294 {
1295 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1296
1297 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1298 bs_state.va = va;
1299
1300 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1301 pmap_bootstrap_l2_table(&bs_state);
1302 }
1303
1304 /*
1305 * Bootstrap the system enough to run with virtual memory.
1306 */
1307 void
pmap_bootstrap(vm_size_t kernlen)1308 pmap_bootstrap(vm_size_t kernlen)
1309 {
1310 vm_offset_t dpcpu, msgbufpv;
1311 vm_paddr_t start_pa, pa;
1312 uint64_t tcr;
1313
1314 tcr = READ_SPECIALREG(tcr_el1);
1315
1316 /* Verify that the ASID is set through TTBR0. */
1317 KASSERT((tcr & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1318
1319 if ((tcr & TCR_DS) != 0)
1320 pmap_lpa_enabled = true;
1321
1322 pmap_l1_supported = L1_BLOCKS_SUPPORTED;
1323
1324 /* Set this early so we can use the pagetable walking functions */
1325 kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1326 PMAP_LOCK_INIT(kernel_pmap);
1327 kernel_pmap->pm_l0_paddr =
1328 pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1329 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1330 vm_radix_init(&kernel_pmap->pm_root);
1331 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1332 kernel_pmap->pm_stage = PM_STAGE1;
1333 kernel_pmap->pm_levels = 4;
1334 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1335 kernel_pmap->pm_asid_set = &asids;
1336
1337 bs_state.freemempos = KERNBASE + kernlen;
1338 bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1339
1340 /* Create a direct map region early so we can use it for pa -> va */
1341 pmap_bootstrap_dmap();
1342 bs_state.dmap_valid = true;
1343
1344 /*
1345 * We only use PXN when we know nothing will be executed from it, e.g.
1346 * the DMAP region.
1347 */
1348 bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1349
1350 start_pa = pa = pmap_early_vtophys(KERNBASE);
1351
1352 /*
1353 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the
1354 * loader allocated the first and only l2 page table page used to map
1355 * the kernel, preloaded files and module metadata.
1356 */
1357 pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1358 /* And the l3 tables for the early devmap */
1359 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1360
1361 cpu_tlb_flushID();
1362
1363 #define alloc_pages(var, np) \
1364 (var) = bs_state.freemempos; \
1365 bs_state.freemempos += (np * PAGE_SIZE); \
1366 memset((char *)(var), 0, ((np) * PAGE_SIZE));
1367
1368 /* Allocate dynamic per-cpu area. */
1369 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1370 dpcpu_init((void *)dpcpu, 0);
1371
1372 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1373 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1374 msgbufp = (void *)msgbufpv;
1375
1376 /* Reserve some VA space for early BIOS/ACPI mapping */
1377 preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1378
1379 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1380 virtual_avail = roundup2(virtual_avail, L1_SIZE);
1381 virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1382 kernel_vm_end = virtual_avail;
1383
1384 pa = pmap_early_vtophys(bs_state.freemempos);
1385
1386 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1387
1388 cpu_tlb_flushID();
1389 }
1390
1391 #if defined(KASAN) || defined(KMSAN)
1392 static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa,vm_paddr_t end_pa,vm_offset_t * vap,vm_offset_t eva)1393 pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1394 vm_offset_t *vap, vm_offset_t eva)
1395 {
1396 vm_paddr_t pa;
1397 vm_offset_t va;
1398 pd_entry_t *l2;
1399
1400 va = *vap;
1401 pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1402 for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
1403 l2 = pmap_l2(kernel_pmap, va);
1404
1405 /*
1406 * KASAN stack checking results in us having already allocated
1407 * part of our shadow map, so we can just skip those segments.
1408 */
1409 if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1410 pa += L2_SIZE;
1411 continue;
1412 }
1413
1414 bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
1415 physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
1416 pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
1417 }
1418 *vap = va;
1419 }
1420
1421 /*
1422 * Finish constructing the initial shadow map:
1423 * - Count how many pages from KERNBASE to virtual_avail (scaled for
1424 * shadow map)
1425 * - Map that entire range using L2 superpages.
1426 */
1427 static void
pmap_bootstrap_san1(vm_offset_t va,int scale)1428 pmap_bootstrap_san1(vm_offset_t va, int scale)
1429 {
1430 vm_offset_t eva;
1431 vm_paddr_t kernstart;
1432 int i;
1433
1434 kernstart = pmap_early_vtophys(KERNBASE);
1435
1436 /*
1437 * Rebuild physmap one more time, we may have excluded more regions from
1438 * allocation since pmap_bootstrap().
1439 */
1440 physmap_idx = physmem_avail(physmap, nitems(physmap));
1441
1442 eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1443
1444 /*
1445 * Find a slot in the physmap large enough for what we needed. We try to put
1446 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1447 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1448 */
1449 for (i = physmap_idx - 2; i >= 0; i -= 2) {
1450 vm_paddr_t plow, phigh;
1451
1452 /* L2 mappings must be backed by memory that is L2-aligned */
1453 plow = roundup2(physmap[i], L2_SIZE);
1454 phigh = physmap[i + 1];
1455 if (plow >= phigh)
1456 continue;
1457 if (kernstart >= plow && kernstart < phigh)
1458 phigh = kernstart;
1459 if (phigh - plow >= L2_SIZE) {
1460 pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1461 if (va >= eva)
1462 break;
1463 }
1464 }
1465 if (i < 0)
1466 panic("Could not find phys region for shadow map");
1467
1468 /*
1469 * Done. We should now have a valid shadow address mapped for all KVA
1470 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1471 * shadow accesses by the sanitizer runtime will succeed for this range.
1472 * When the kernel virtual address range is later expanded, as will
1473 * happen in vm_mem_init(), the shadow map will be grown as well. This
1474 * is handled by pmap_san_enter().
1475 */
1476 }
1477
1478 void
pmap_bootstrap_san(void)1479 pmap_bootstrap_san(void)
1480 {
1481 #ifdef KASAN
1482 pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
1483 #else
1484 static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1485 static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1486 pd_entry_t *l0, *l1;
1487
1488 if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
1489 panic("initial kernel map is too large");
1490
1491 l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
1492 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1493 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
1494 l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
1495 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1496 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
1497 pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);
1498
1499 l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
1500 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1501 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
1502 l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
1503 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1504 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
1505 pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
1506 #endif
1507 }
1508 #endif
1509
1510 /*
1511 * Initialize a vm_page's machine-dependent fields.
1512 */
1513 void
pmap_page_init(vm_page_t m)1514 pmap_page_init(vm_page_t m)
1515 {
1516
1517 TAILQ_INIT(&m->md.pv_list);
1518 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1519 }
1520
1521 static void
pmap_init_asids(struct asid_set * set,int bits)1522 pmap_init_asids(struct asid_set *set, int bits)
1523 {
1524 int i;
1525
1526 set->asid_bits = bits;
1527
1528 /*
1529 * We may be too early in the overall initialization process to use
1530 * bit_alloc().
1531 */
1532 set->asid_set_size = 1 << set->asid_bits;
1533 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1534 M_WAITOK | M_ZERO);
1535 for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1536 bit_set(set->asid_set, i);
1537 set->asid_next = ASID_FIRST_AVAILABLE;
1538 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1539 }
1540
1541 static void
pmap_init_pv_table(void)1542 pmap_init_pv_table(void)
1543 {
1544 struct vm_phys_seg *seg, *next_seg;
1545 struct pmap_large_md_page *pvd;
1546 vm_size_t s;
1547 int domain, i, j, pages;
1548
1549 /*
1550 * We depend on the size being evenly divisible into a page so
1551 * that the pv_table array can be indexed directly while
1552 * safely spanning multiple pages from different domains.
1553 */
1554 CTASSERT(PAGE_SIZE % sizeof(*pvd) == 0);
1555
1556 /*
1557 * Calculate the size of the array.
1558 */
1559 s = 0;
1560 for (i = 0; i < vm_phys_nsegs; i++) {
1561 seg = &vm_phys_segs[i];
1562 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1563 pmap_l2_pindex(seg->start);
1564 s += round_page(pages * sizeof(*pvd));
1565 }
1566 pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1567 if (pv_table == NULL)
1568 panic("%s: kva_alloc failed\n", __func__);
1569
1570 /*
1571 * Iterate physical segments to allocate domain-local memory for PV
1572 * list headers.
1573 */
1574 pvd = pv_table;
1575 for (i = 0; i < vm_phys_nsegs; i++) {
1576 seg = &vm_phys_segs[i];
1577 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1578 pmap_l2_pindex(seg->start);
1579 domain = seg->domain;
1580
1581 s = round_page(pages * sizeof(*pvd));
1582
1583 for (j = 0; j < s; j += PAGE_SIZE) {
1584 vm_page_t m = vm_page_alloc_noobj_domain(domain,
1585 VM_ALLOC_ZERO);
1586 if (m == NULL)
1587 panic("failed to allocate PV table page");
1588 pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1589 }
1590
1591 for (j = 0; j < s / sizeof(*pvd); j++) {
1592 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1593 TAILQ_INIT(&pvd->pv_page.pv_list);
1594 pvd++;
1595 }
1596 }
1597 pvd = &pv_dummy_large;
1598 memset(pvd, 0, sizeof(*pvd));
1599 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1600 TAILQ_INIT(&pvd->pv_page.pv_list);
1601
1602 /*
1603 * Set pointers from vm_phys_segs to pv_table.
1604 */
1605 for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1606 seg = &vm_phys_segs[i];
1607 seg->md_first = pvd;
1608 pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1609 pmap_l2_pindex(seg->start);
1610
1611 /*
1612 * If there is a following segment, and the final
1613 * superpage of this segment and the initial superpage
1614 * of the next segment are the same then adjust the
1615 * pv_table entry for that next segment down by one so
1616 * that the pv_table entries will be shared.
1617 */
1618 if (i + 1 < vm_phys_nsegs) {
1619 next_seg = &vm_phys_segs[i + 1];
1620 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1621 pmap_l2_pindex(next_seg->start)) {
1622 pvd--;
1623 }
1624 }
1625 }
1626 }
1627
1628 static bool
pmap_dbm_check(const struct cpu_feat * feat __unused,u_int midr __unused)1629 pmap_dbm_check(const struct cpu_feat *feat __unused, u_int midr __unused)
1630 {
1631 uint64_t id_aa64mmfr1;
1632
1633 id_aa64mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1634 return (ID_AA64MMFR1_HAFDBS_VAL(id_aa64mmfr1) >=
1635 ID_AA64MMFR1_HAFDBS_AF_DBS);
1636 }
1637
1638 static bool
pmap_dbm_has_errata(const struct cpu_feat * feat __unused,u_int midr,u_int ** errata_list,u_int * errata_count)1639 pmap_dbm_has_errata(const struct cpu_feat *feat __unused, u_int midr,
1640 u_int **errata_list, u_int *errata_count)
1641 {
1642 /* Disable on Cortex-A55 for erratum 1024718 - all revisions */
1643 if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_ARM,
1644 CPU_PART_CORTEX_A55, 0, 0)) {
1645 static u_int errata_id = 1024718;
1646
1647 *errata_list = &errata_id;
1648 *errata_count = 1;
1649 return (true);
1650 }
1651
1652 /* Disable on Cortex-A510 for erratum 2051678 - r0p0 to r0p2 */
1653 if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK | CPU_VAR_MASK,
1654 CPU_IMPL_ARM, CPU_PART_CORTEX_A510, 0, 0)) {
1655 if (CPU_REV(PCPU_GET(midr)) < 3) {
1656 static u_int errata_id = 2051678;
1657
1658 *errata_list = &errata_id;
1659 *errata_count = 1;
1660 return (true);
1661 }
1662 }
1663
1664 return (false);
1665 }
1666
1667 static void
pmap_dbm_enable(const struct cpu_feat * feat __unused,cpu_feat_errata errata_status,u_int * errata_list __unused,u_int errata_count)1668 pmap_dbm_enable(const struct cpu_feat *feat __unused,
1669 cpu_feat_errata errata_status, u_int *errata_list __unused,
1670 u_int errata_count)
1671 {
1672 uint64_t tcr;
1673
1674 /* Skip if there is an erratum affecting DBM */
1675 if (errata_status != ERRATA_NONE)
1676 return;
1677
1678 tcr = READ_SPECIALREG(tcr_el1) | TCR_HD;
1679 WRITE_SPECIALREG(tcr_el1, tcr);
1680 isb();
1681 /* Flush the local TLB for the TCR_HD flag change */
1682 dsb(nshst);
1683 __asm __volatile("tlbi vmalle1");
1684 dsb(nsh);
1685 isb();
1686 }
1687
1688 static struct cpu_feat feat_dbm = {
1689 .feat_name = "FEAT_HAFDBS (DBM)",
1690 .feat_check = pmap_dbm_check,
1691 .feat_has_errata = pmap_dbm_has_errata,
1692 .feat_enable = pmap_dbm_enable,
1693 .feat_flags = CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU,
1694 };
1695 DATA_SET(cpu_feat_set, feat_dbm);
1696
1697 /*
1698 * Initialize the pmap module.
1699 *
1700 * Called by vm_mem_init(), to initialize any structures that the pmap
1701 * system needs to map virtual memory.
1702 */
1703 void
1704 pmap_init(void)
1705 {
1706 uint64_t mmfr1;
1707 int i, vmid_bits;
1708
1709 /*
1710 * Are large page mappings enabled?
1711 */
1712 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1713 if (superpages_enabled) {
1714 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1715 ("pmap_init: can't assign to pagesizes[1]"));
1716 pagesizes[1] = L3C_SIZE;
1717 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1718 ("pmap_init: can't assign to pagesizes[2]"));
1719 pagesizes[2] = L2_SIZE;
1720 if (L1_BLOCKS_SUPPORTED) {
1721 KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
1722 ("pmap_init: can't assign to pagesizes[3]"));
1723 pagesizes[3] = L1_SIZE;
1724 }
1725 }
1726
1727 /*
1728 * Initialize the ASID allocator.
1729 */
1730 pmap_init_asids(&asids,
1731 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1732
1733 if (has_hyp()) {
1734 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1735 vmid_bits = 8;
1736
1737 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1738 ID_AA64MMFR1_VMIDBits_16)
1739 vmid_bits = 16;
1740 pmap_init_asids(&vmids, vmid_bits);
1741 }
1742
1743 /*
1744 * Initialize pv chunk lists.
1745 */
1746 for (i = 0; i < PMAP_MEMDOM; i++) {
1747 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1748 MTX_DEF);
1749 TAILQ_INIT(&pv_chunks[i].pvc_list);
1750 }
1751 pmap_init_pv_table();
1752
1753 vm_initialized = 1;
1754 }
1755
1756 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1757 "L1 (1GB/64GB) page mapping counters");
1758
1759 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
1760 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
1761 &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");
1762
1763 SYSCTL_BOOL(_vm_pmap_l1, OID_AUTO, supported, CTLFLAG_RD, &pmap_l1_supported,
1764 0, "L1 blocks are supported");
1765
1766 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1767 "L2C (32MB/1GB) page mapping counters");
1768
1769 static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
1770 SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
1771 &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");
1772
1773 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1774 "2MB page mapping counters");
1775
1776 static u_long pmap_l2_demotions;
1777 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1778 &pmap_l2_demotions, 0, "2MB page demotions");
1779
1780 static u_long pmap_l2_mappings;
1781 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1782 &pmap_l2_mappings, 0, "2MB page mappings");
1783
1784 static u_long pmap_l2_p_failures;
1785 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1786 &pmap_l2_p_failures, 0, "2MB page promotion failures");
1787
1788 static u_long pmap_l2_promotions;
1789 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1790 &pmap_l2_promotions, 0, "2MB page promotions");
1791
1792 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1793 "L3C (64KB/2MB) page mapping counters");
1794
1795 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
1796 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
1797 &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");
1798
1799 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
1800 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
1801 &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");
1802
1803 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
1804 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
1805 &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");
1806
1807 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
1808 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
1809 &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1810
1811 /*
1812 * If the given value for "final_only" is false, then any cached intermediate-
1813 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1814 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1815 * Otherwise, just the cached final-level entry is invalidated.
1816 */
1817 static __inline void
1818 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1819 {
1820 if (final_only)
1821 __asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1822 else
1823 __asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1824 }
1825
1826 static __inline void
1827 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1828 {
1829 if (final_only)
1830 __asm __volatile("tlbi vale1is, %0" : : "r" (r));
1831 else
1832 __asm __volatile("tlbi vae1is, %0" : : "r" (r));
1833 }
1834
1835 /*
1836 * Invalidates any cached final- and optionally intermediate-level TLB entries
1837 * for the specified virtual address in the given virtual address space.
1838 */
1839 static __inline void
1840 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1841 {
1842 uint64_t r;
1843
1844 PMAP_ASSERT_STAGE1(pmap);
1845
1846 dsb(ishst);
1847 r = TLBI_VA(va);
1848 if (pmap == kernel_pmap) {
1849 pmap_s1_invalidate_kernel(r, final_only);
1850 } else {
1851 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1852 pmap_s1_invalidate_user(r, final_only);
1853 }
1854 dsb(ish);
1855 isb();
1856 }
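
/*
 * Illustrative usage of "final_only" (a sketch based on callers later in
 * this file): pmap_kremove() clears a single L3 entry and passes true,
 * preserving cached intermediate entries, e.g.
 *
 *	pmap_clear(pte);
 *	pmap_s1_invalidate_page(kernel_pmap, va, true);
 *
 * whereas _pmap_unwire_l3() removes a table entry and therefore passes
 * false so that cached intermediate entries are flushed as well.
 */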
1857
1858 static __inline void
1859 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1860 {
1861 PMAP_ASSERT_STAGE2(pmap);
1862 MPASS(pmap_stage2_invalidate_range != NULL);
1863 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1864 final_only);
1865 }
1866
1867 static __inline void
1868 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1869 {
1870 if (pmap->pm_stage == PM_STAGE1)
1871 pmap_s1_invalidate_page(pmap, va, final_only);
1872 else
1873 pmap_s2_invalidate_page(pmap, va, final_only);
1874 }
1875
1876 /*
1877 * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK
1878 * mappings. Otherwise, use stride L3_SIZE.
1879 */
1880 static __inline void
1881 pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1882 vm_offset_t stride, bool final_only)
1883 {
1884 uint64_t end, r, start;
1885
1886 PMAP_ASSERT_STAGE1(pmap);
1887
1888 dsb(ishst);
1889 if (pmap == kernel_pmap) {
1890 start = TLBI_VA(sva);
1891 end = TLBI_VA(eva);
1892 for (r = start; r < end; r += TLBI_VA(stride))
1893 pmap_s1_invalidate_kernel(r, final_only);
1894 } else {
1895 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1896 start |= TLBI_VA(sva);
1897 end |= TLBI_VA(eva);
1898 for (r = start; r < end; r += TLBI_VA(stride))
1899 pmap_s1_invalidate_user(r, final_only);
1900 }
1901 dsb(ish);
1902 isb();
1903 }
1904
1905 /*
1906 * Invalidates any cached final- and optionally intermediate-level TLB entries
1907 * for the specified virtual address range in the given virtual address space.
1908 */
1909 static __inline void
1910 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1911 bool final_only)
1912 {
1913 pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only);
1914 }
1915
1916 static __inline void
1917 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1918 bool final_only)
1919 {
1920 PMAP_ASSERT_STAGE2(pmap);
1921 MPASS(pmap_stage2_invalidate_range != NULL);
1922 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1923 }
1924
1925 static __inline void
1926 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1927 bool final_only)
1928 {
1929 if (pmap->pm_stage == PM_STAGE1)
1930 pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1931 else
1932 pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1933 }
1934
1935 /*
1936 * Invalidates all cached intermediate- and final-level TLB entries for the
1937 * given virtual address space.
1938 */
1939 static __inline void
1940 pmap_s1_invalidate_all(pmap_t pmap)
1941 {
1942 uint64_t r;
1943
1944 PMAP_ASSERT_STAGE1(pmap);
1945
1946 dsb(ishst);
1947 if (pmap == kernel_pmap) {
1948 __asm __volatile("tlbi vmalle1is");
1949 } else {
1950 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1951 __asm __volatile("tlbi aside1is, %0" : : "r" (r));
1952 }
1953 dsb(ish);
1954 isb();
1955 }
1956
1957 static __inline void
1958 pmap_s2_invalidate_all(pmap_t pmap)
1959 {
1960 PMAP_ASSERT_STAGE2(pmap);
1961 MPASS(pmap_stage2_invalidate_all != NULL);
1962 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1963 }
1964
1965 static __inline void
1966 pmap_invalidate_all(pmap_t pmap)
1967 {
1968 if (pmap->pm_stage == PM_STAGE1)
1969 pmap_s1_invalidate_all(pmap);
1970 else
1971 pmap_s2_invalidate_all(pmap);
1972 }
1973
1974 /*
1975 * Routine: pmap_extract
1976 * Function:
1977 * Extract the physical page address associated
1978 * with the given map/virtual_address pair.
1979 */
1980 vm_paddr_t
1981 pmap_extract(pmap_t pmap, vm_offset_t va)
1982 {
1983 pt_entry_t *pte, tpte;
1984 vm_paddr_t pa;
1985 int lvl;
1986
1987 pa = 0;
1988 PMAP_LOCK(pmap);
1989 /*
1990 * Find the block or page map for this virtual address. pmap_pte
1991 * will return either a valid block/page entry, or NULL.
1992 */
1993 pte = pmap_pte(pmap, va, &lvl);
1994 if (pte != NULL) {
1995 tpte = pmap_load(pte);
1996 pa = PTE_TO_PHYS(tpte);
1997 switch(lvl) {
1998 case 1:
1999 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
2000 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
2001 ("pmap_extract: Invalid L1 pte found: %lx",
2002 tpte & ATTR_DESCR_MASK));
2003 pa |= (va & L1_OFFSET);
2004 break;
2005 case 2:
2006 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
2007 ("pmap_extract: Invalid L2 pte found: %lx",
2008 tpte & ATTR_DESCR_MASK));
2009 pa |= (va & L2_OFFSET);
2010 break;
2011 case 3:
2012 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
2013 ("pmap_extract: Invalid L3 pte found: %lx",
2014 tpte & ATTR_DESCR_MASK));
2015 pa |= (va & L3_OFFSET);
2016 break;
2017 }
2018 }
2019 PMAP_UNLOCK(pmap);
2020 return (pa);
2021 }
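
/*
 * Usage sketch (illustrative; "buf" is a hypothetical kernel pointer):
 *
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(kernel_pmap, (vm_offset_t)buf);
 *
 * A return value of 0 indicates that no valid block or page mapping was
 * found for the address.
 */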
2022
2023 /*
2024 * Routine: pmap_extract_and_hold
2025 * Function:
2026 * Atomically extract and hold the physical page
2027 * with the given pmap and virtual address pair
2028 * if that mapping permits the given protection.
2029 */
2030 vm_page_t
2031 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
2032 {
2033 pt_entry_t *pte, tpte;
2034 vm_offset_t off;
2035 vm_page_t m;
2036 int lvl;
2037 bool use;
2038
2039 m = NULL;
2040 PMAP_LOCK(pmap);
2041 pte = pmap_pte(pmap, va, &lvl);
2042 if (pte != NULL) {
2043 tpte = pmap_load(pte);
2044
2045 KASSERT(lvl > 0 && lvl <= 3,
2046 ("pmap_extract_and_hold: Invalid level %d", lvl));
2047 /*
2048 * Check that the pte is either an L3 page, or an L1 or L2 block
2049 * entry. We can assume L1_BLOCK == L2_BLOCK.
2050 */
2051 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
2052 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
2053 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
2054 tpte & ATTR_DESCR_MASK));
2055
2056 use = false;
2057 if ((prot & VM_PROT_WRITE) == 0)
2058 use = true;
2059 else if (pmap->pm_stage == PM_STAGE1 &&
2060 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
2061 use = true;
2062 else if (pmap->pm_stage == PM_STAGE2 &&
2063 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
2064 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
2065 use = true;
2066
2067 if (use) {
2068 switch (lvl) {
2069 case 1:
2070 off = va & L1_OFFSET;
2071 break;
2072 case 2:
2073 off = va & L2_OFFSET;
2074 break;
2075 case 3:
2076 default:
2077 off = 0;
2078 }
2079 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
2080 if (m != NULL && !vm_page_wire_mapped(m))
2081 m = NULL;
2082 }
2083 }
2084 PMAP_UNLOCK(pmap);
2085 return (m);
2086 }
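
/*
 * Usage sketch (illustrative): the returned page has been wired by
 * vm_page_wire_mapped(), so a caller is expected to drop that wiring when
 * it is done with the page, e.g.
 *
 *	m = pmap_extract_and_hold(pmap, va, VM_PROT_READ);
 *	if (m != NULL) {
 *		(access the page contents)
 *		vm_page_unwire(m, PQ_ACTIVE);
 *	}
 *
 * NULL is returned when the mapping is absent or does not permit the
 * requested protection.
 */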
2087
2088 /*
2089 * Walks the page tables to translate a kernel virtual address to a
2090 * physical address. Returns true if the kva is valid, and stores the
2091 * physical address in *pa if pa is not NULL.
2092 *
2093 * See the comment above data_abort() for the rationale for specifying
2094 * NO_PERTHREAD_SSP here.
2095 */
2096 bool NO_PERTHREAD_SSP
2097 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
2098 {
2099 pt_entry_t *pte, tpte;
2100 register_t intr;
2101 uint64_t par;
2102
2103 /*
2104 * Disable interrupts so we don't get interrupted between asking
2105 * for address translation, and getting the result back.
2106 */
2107 intr = intr_disable();
2108 par = arm64_address_translate_s1e1r(va);
2109 intr_restore(intr);
2110
2111 if (PAR_SUCCESS(par)) {
2112 if (pa != NULL)
2113 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2114 return (true);
2115 }
2116
2117 /*
2118 * Fall back to walking the page table. The address translation
2119 * instruction may fail when the page is in a break-before-make
2120 * sequence. As we only clear the valid bit in said sequence we
2121 * can walk the page table to find the physical address.
2122 */
2123
2124 pte = pmap_l1(kernel_pmap, va);
2125 if (pte == NULL)
2126 return (false);
2127
2128 /*
2129 * A concurrent pmap_update_entry() will clear the entry's valid bit
2130 * but leave the rest of the entry unchanged. Therefore, we treat a
2131 * non-zero entry as being valid, and we ignore the valid bit when
2132 * determining whether the entry maps a block, page, or table.
2133 */
2134 tpte = pmap_load(pte);
2135 if (tpte == 0)
2136 return (false);
2137 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2138 if (pa != NULL)
2139 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2140 return (true);
2141 }
2142 pte = pmap_l1_to_l2(&tpte, va);
2143 tpte = pmap_load(pte);
2144 if (tpte == 0)
2145 return (false);
2146 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2147 if (pa != NULL)
2148 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2149 return (true);
2150 }
2151 pte = pmap_l2_to_l3(&tpte, va);
2152 tpte = pmap_load(pte);
2153 if (tpte == 0)
2154 return (false);
2155 if (pa != NULL)
2156 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2157 return (true);
2158 }
2159
2160 /*
2161 * Routine: pmap_kextract
2162 * Function:
2163 * Extract the physical page address associated with the given kernel
2164 * virtual address.
2165 */
2166 vm_paddr_t
2167 pmap_kextract(vm_offset_t va)
2168 {
2169 vm_paddr_t pa;
2170
2171 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2172 return (DMAP_TO_PHYS(va));
2173
2174 if (pmap_klookup(va, &pa) == false)
2175 return (0);
2176 return (pa);
2177 }
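
/*
 * Illustrative note: for an address inside the direct map the translation
 * is pure arithmetic, so, assuming the physical address is covered by the
 * DMAP, something like
 *
 *	pa = pmap_kextract(PHYS_TO_DMAP(0x80000000));
 *
 * returns 0x80000000 without walking the page tables; only addresses
 * outside the DMAP fall back to pmap_klookup(), which yields 0 on failure.
 */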
2178
2179 /***************************************************
2180 * Low level mapping routines.....
2181 ***************************************************/
2182
2183 void
2184 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
2185 {
2186 pd_entry_t *pde;
2187 pt_entry_t attr, old_l3e, *pte;
2188 vm_offset_t va;
2189 vm_page_t mpte;
2190 int error, lvl;
2191
2192 KASSERT((pa & L3_OFFSET) == 0,
2193 ("pmap_kenter: Invalid physical address"));
2194 KASSERT((sva & L3_OFFSET) == 0,
2195 ("pmap_kenter: Invalid virtual address"));
2196 KASSERT((size & PAGE_MASK) == 0,
2197 ("pmap_kenter: Mapping is not page-sized"));
2198
2199 attr = ATTR_AF | pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) |
2200 ATTR_S1_XN | ATTR_KERN_GP | ATTR_S1_IDX(mode);
2201 old_l3e = 0;
2202 va = sva;
2203 while (size != 0) {
2204 pde = pmap_pde(kernel_pmap, va, &lvl);
2205 KASSERT(pde != NULL,
2206 ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
2207 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2208
2209 /*
2210 * If we have an aligned, contiguous chunk of L2_SIZE, try
2211 * to create an L2_BLOCK mapping.
2212 */
2213 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
2214 (pa & L2_OFFSET) == 0 && vm_initialized) {
2215 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
2216 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
2217 ("pmap_kenter: Unexpected mapping"));
2218 PMAP_LOCK(kernel_pmap);
2219 error = pmap_insert_pt_page(kernel_pmap, mpte, false,
2220 false);
2221 if (error == 0) {
2222 attr &= ~ATTR_CONTIGUOUS;
2223
2224 /*
2225 * Although the page table page "mpte" should
2226 * be devoid of mappings, the TLB might hold
2227 * intermediate entries that reference it, so
2228 * we perform a single-page invalidation.
2229 */
2230 pmap_update_entry(kernel_pmap, pde,
2231 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
2232 PAGE_SIZE);
2233 }
2234 PMAP_UNLOCK(kernel_pmap);
2235 if (error == 0) {
2236 va += L2_SIZE;
2237 pa += L2_SIZE;
2238 size -= L2_SIZE;
2239 continue;
2240 }
2241 }
2242
2243 /*
2244 * If we have an aligned, contiguous chunk of L3C_ENTRIES
2245 * L3 pages, set the contiguous bit within each PTE so that
2246 * the chunk can be cached using only one TLB entry.
2247 */
2248 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
2249 if (size >= L3C_SIZE)
2250 attr |= ATTR_CONTIGUOUS;
2251 else
2252 attr &= ~ATTR_CONTIGUOUS;
2253 }
2254
2255 pte = pmap_l2_to_l3(pde, va);
2256 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
2257 L3_PAGE);
2258
2259 va += PAGE_SIZE;
2260 pa += PAGE_SIZE;
2261 size -= PAGE_SIZE;
2262 }
2263 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2264 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2265 else {
2266 /*
2267 * Because the old entries were invalid and the new mappings
2268 * are not executable, an isb is not required.
2269 */
2270 dsb(ishst);
2271 }
2272 }
2273
2274 void
2275 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2276 {
2277
2278 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2279 }
2280
2281 /*
2282 * Remove a page from the kernel pagetables.
2283 */
2284 void
2285 pmap_kremove(vm_offset_t va)
2286 {
2287 pt_entry_t *pte;
2288
2289 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2290 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2291 ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2292 pmap_clear(pte);
2293 pmap_s1_invalidate_page(kernel_pmap, va, true);
2294 }
2295
2296 /*
2297 * Remove the specified range of mappings from the kernel address space.
2298 *
2299 * Should only be applied to mappings that were created by pmap_kenter() or
2300 * pmap_kenter_device(). Nothing about this function is actually specific
2301 * to device mappings.
2302 */
2303 void
2304 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2305 {
2306 pt_entry_t *ptep, *ptep_end;
2307 vm_offset_t va;
2308 int lvl;
2309
2310 KASSERT((sva & L3_OFFSET) == 0,
2311 ("pmap_kremove_device: Invalid virtual address"));
2312 KASSERT((size & PAGE_MASK) == 0,
2313 ("pmap_kremove_device: Mapping is not page-sized"));
2314
2315 va = sva;
2316 while (size != 0) {
2317 ptep = pmap_pte(kernel_pmap, va, &lvl);
2318 KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2319 switch (lvl) {
2320 case 2:
2321 KASSERT((va & L2_OFFSET) == 0,
2322 ("Unaligned virtual address"));
2323 KASSERT(size >= L2_SIZE, ("Insufficient size"));
2324
2325 if (va != sva) {
2326 pmap_s1_invalidate_range(kernel_pmap, sva, va,
2327 true);
2328 }
2329 pmap_clear(ptep);
2330 pmap_s1_invalidate_page(kernel_pmap, va, true);
2331 PMAP_LOCK(kernel_pmap);
2332 pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2333 PMAP_UNLOCK(kernel_pmap);
2334
2335 va += L2_SIZE;
2336 sva = va;
2337 size -= L2_SIZE;
2338 break;
2339 case 3:
2340 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2341 KASSERT((va & L3C_OFFSET) == 0,
2342 ("Unaligned L3C virtual address"));
2343 KASSERT(size >= L3C_SIZE,
2344 ("Insufficient L3C size"));
2345
2346 ptep_end = ptep + L3C_ENTRIES;
2347 for (; ptep < ptep_end; ptep++)
2348 pmap_clear(ptep);
2349
2350 va += L3C_SIZE;
2351 size -= L3C_SIZE;
2352 break;
2353 }
2354 pmap_clear(ptep);
2355
2356 va += PAGE_SIZE;
2357 size -= PAGE_SIZE;
2358 break;
2359 default:
2360 __assert_unreachable();
2361 break;
2362 }
2363 }
2364 if (va != sva)
2365 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2366 }
2367
2368 /*
2369 * Used to map a range of physical addresses into kernel
2370 * virtual address space.
2371 *
2372 * The value passed in '*virt' is a suggested virtual address for
2373 * the mapping. Architectures which can support a direct-mapped
2374 * physical to virtual region can return the appropriate address
2375 * within that region, leaving '*virt' unchanged. Other
2376 * architectures should map the pages starting at '*virt' and
2377 * update '*virt' with the first usable address after the mapped
2378 * region.
2379 */
2380 vm_offset_t
2381 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2382 {
2383 return PHYS_TO_DMAP(start);
2384 }
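
/*
 * Sketch of the contract described above (illustrative values): because
 * arm64 provides a direct map, a call such as
 *
 *	va = pmap_map(&virt, pa_start, pa_end, VM_PROT_READ | VM_PROT_WRITE);
 *
 * returns PHYS_TO_DMAP(pa_start) and leaves "virt" unchanged; the "virt",
 * "end", and "prot" arguments are effectively unused here.
 */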
2385
2386 /*
2387 * Add a list of wired pages to the kva.  This routine is only used
2388 * for temporary
2389 * kernel mappings that do not need to have
2390 * page modification or references recorded.
2391 * Note that old mappings are simply written
2392 * over. The page *must* be wired.
2393 * Note: SMP coherent. Uses a ranged, broadcast TLB invalidation.
2394 */
2395 void
2396 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2397 {
2398 pd_entry_t *pde;
2399 pt_entry_t attr, old_l3e, *pte;
2400 vm_offset_t va;
2401 vm_page_t m;
2402 int i, lvl;
2403
2404 old_l3e = 0;
2405 va = sva;
2406 for (i = 0; i < count; i++) {
2407 pde = pmap_pde(kernel_pmap, va, &lvl);
2408 KASSERT(pde != NULL,
2409 ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2410 KASSERT(lvl == 2,
2411 ("pmap_qenter: Invalid level %d", lvl));
2412
2413 m = ma[i];
2414 attr = ATTR_AF | pmap_sh_attr |
2415 ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2416 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2417 pte = pmap_l2_to_l3(pde, va);
2418 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2419
2420 va += L3_SIZE;
2421 }
2422 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2423 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2424 else {
2425 /*
2426 * Because the old entries were invalid and the new mappings
2427 * are not executable, an isb is not required.
2428 */
2429 dsb(ishst);
2430 }
2431 }
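
/*
 * Usage sketch (illustrative): pmap_qenter() and pmap_qremove() are used
 * as a pair for short-lived kernel mappings of wired pages, for example
 *
 *	pmap_qenter(va, ma, npages);
 *	(access the mapping)
 *	pmap_qremove(va, npages);
 *
 * where "va" is kernel virtual address space reserved by the caller (e.g.
 * with kva_alloc()) and "ma" is an array of "npages" wired vm_page_t
 * pointers.  pmap_init_pv_table() above uses pmap_qenter() to map the
 * freshly allocated PV table pages.
 */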
2432
2433 /*
2434 * This routine tears out page mappings from the
2435 * kernel -- it is meant only for temporary mappings.
2436 */
2437 void
2438 pmap_qremove(vm_offset_t sva, int count)
2439 {
2440 pt_entry_t *pte;
2441 vm_offset_t va;
2442
2443 KASSERT(ADDR_IS_CANONICAL(sva),
2444 ("%s: Address not in canonical form: %lx", __func__, sva));
2445 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2446
2447 va = sva;
2448 while (count-- > 0) {
2449 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2450 if (pte != NULL) {
2451 pmap_clear(pte);
2452 }
2453
2454 va += PAGE_SIZE;
2455 }
2456 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2457 }
2458
2459 /***************************************************
2460 * Page table page management routines.....
2461 ***************************************************/
2462 /*
2463 * Schedule the specified unused page table page to be freed. Specifically,
2464 * add the page to the specified list of pages that will be released to the
2465 * physical memory manager after the TLB has been updated.
2466 */
2467 static __inline void
2468 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2469 {
2470
2471 if (set_PG_ZERO)
2472 m->flags |= PG_ZERO;
2473 else
2474 m->flags &= ~PG_ZERO;
2475 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2476 }
2477
2478 /*
2479 * Decrements a page table page's reference count, which is used to record the
2480 * number of valid page table entries within the page. If the reference count
2481 * drops to zero, then the page table page is unmapped. Returns true if the
2482 * page table page was unmapped and false otherwise.
2483 */
2484 static inline bool
2485 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2486 {
2487
2488 --m->ref_count;
2489 if (m->ref_count == 0) {
2490 _pmap_unwire_l3(pmap, va, m, free);
2491 return (true);
2492 } else
2493 return (false);
2494 }
2495
2496 static void
2497 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2498 {
2499
2500 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2501 /*
2502 * unmap the page table page
2503 */
2504 if (m->pindex >= (NUL2E + NUL1E)) {
2505 /* l1 page */
2506 pd_entry_t *l0;
2507
2508 l0 = pmap_l0(pmap, va);
2509 pmap_clear(l0);
2510 } else if (m->pindex >= NUL2E) {
2511 /* l2 page */
2512 pd_entry_t *l1;
2513
2514 l1 = pmap_l1(pmap, va);
2515 pmap_clear(l1);
2516 } else {
2517 /* l3 page */
2518 pd_entry_t *l2;
2519
2520 l2 = pmap_l2(pmap, va);
2521 pmap_clear(l2);
2522 }
2523 pmap_resident_count_dec(pmap, 1);
2524 if (m->pindex < NUL2E) {
2525 /* We just released an l3, unhold the matching l2 */
2526 pd_entry_t *l1, tl1;
2527 vm_page_t l2pg;
2528
2529 l1 = pmap_l1(pmap, va);
2530 tl1 = pmap_load(l1);
2531 l2pg = PTE_TO_VM_PAGE(tl1);
2532 pmap_unwire_l3(pmap, va, l2pg, free);
2533 } else if (m->pindex < (NUL2E + NUL1E)) {
2534 /* We just released an l2, unhold the matching l1 */
2535 pd_entry_t *l0, tl0;
2536 vm_page_t l1pg;
2537
2538 l0 = pmap_l0(pmap, va);
2539 tl0 = pmap_load(l0);
2540 l1pg = PTE_TO_VM_PAGE(tl0);
2541 pmap_unwire_l3(pmap, va, l1pg, free);
2542 }
2543 pmap_invalidate_page(pmap, va, false);
2544
2545 /*
2546 * Put page on a list so that it is released after
2547 * *ALL* TLB shootdown is done
2548 */
2549 pmap_add_delayed_free_list(m, free, true);
2550 }
2551
2552 /*
2553 * After removing a page table entry, this routine is used to
2554 * conditionally free the page, and manage the reference count.
2555 */
2556 static int
2557 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2558 struct spglist *free)
2559 {
2560 vm_page_t mpte;
2561
2562 KASSERT(ADDR_IS_CANONICAL(va),
2563 ("%s: Address not in canonical form: %lx", __func__, va));
2564 if (ADDR_IS_KERNEL(va))
2565 return (0);
2566 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2567 mpte = PTE_TO_VM_PAGE(ptepde);
2568 return (pmap_unwire_l3(pmap, va, mpte, free));
2569 }
2570
2571 /*
2572 * Release a page table page reference after a failed attempt to create a
2573 * mapping.
2574 */
2575 static void
2576 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2577 {
2578 struct spglist free;
2579
2580 SLIST_INIT(&free);
2581 if (pmap_unwire_l3(pmap, va, mpte, &free))
2582 vm_page_free_pages_toq(&free, true);
2583 }
2584
2585 void
2586 pmap_pinit0(pmap_t pmap)
2587 {
2588
2589 PMAP_LOCK_INIT(pmap);
2590 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2591 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2592 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2593 TAILQ_INIT(&pmap->pm_pvchunk);
2594 vm_radix_init(&pmap->pm_root);
2595 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2596 pmap->pm_stage = PM_STAGE1;
2597 pmap->pm_levels = 4;
2598 pmap->pm_ttbr = pmap->pm_l0_paddr;
2599 pmap->pm_asid_set = &asids;
2600 pmap->pm_bti = NULL;
2601
2602 PCPU_SET(curpmap, pmap);
2603 }
2604
2605 int
2606 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2607 {
2608 vm_page_t m;
2609
2610 /*
2611 * allocate the l0 page
2612 */
2613 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2614 VM_ALLOC_ZERO);
2615 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2616 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2617
2618 TAILQ_INIT(&pmap->pm_pvchunk);
2619 vm_radix_init(&pmap->pm_root);
2620 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2621 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2622
2623 MPASS(levels == 3 || levels == 4);
2624 pmap->pm_levels = levels;
2625 pmap->pm_stage = stage;
2626 pmap->pm_bti = NULL;
2627 switch (stage) {
2628 case PM_STAGE1:
2629 pmap->pm_asid_set = &asids;
2630 if (pmap_bti_support) {
2631 pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
2632 M_ZERO | M_WAITOK);
2633 rangeset_init(pmap->pm_bti, bti_dup_range,
2634 bti_free_range, pmap, M_NOWAIT);
2635 }
2636 break;
2637 case PM_STAGE2:
2638 pmap->pm_asid_set = &vmids;
2639 break;
2640 default:
2641 panic("%s: Invalid pmap type %d", __func__, stage);
2642 break;
2643 }
2644
2645 /* XXX Temporarily disable deferred ASID allocation. */
2646 pmap_alloc_asid(pmap);
2647
2648 /*
2649 * Allocate the level 1 entry to use as the root. This will increase
2650 * the refcount on the level 1 page so it won't be removed until
2651 * pmap_release() is called.
2652 */
2653 if (pmap->pm_levels == 3) {
2654 PMAP_LOCK(pmap);
2655 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2656 PMAP_UNLOCK(pmap);
2657 }
2658 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2659
2660 return (1);
2661 }
2662
2663 int
2664 pmap_pinit(pmap_t pmap)
2665 {
2666
2667 return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2668 }
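
/*
 * Illustrative note: pmap_pinit() covers the common stage 1, 4-level case.
 * A hypervisor creating a stage 2 (guest physical) pmap would instead call
 * something along the lines of
 *
 *	pmap_pinit_stage(pmap, PM_STAGE2, 3);
 *
 * which selects the VMID allocator and, for a 3-level table, allocates an
 * additional level 1 page to serve as the root (see pmap_pinit_stage()
 * above).
 */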
2669
2670 /*
2671 * This routine is called if the desired page table page does not exist.
2672 *
2673 * If page table page allocation fails, this routine may sleep before
2674 * returning NULL. It sleeps only if a lock pointer was given.
2675 *
2676 * Note: If a page allocation fails at page table level two or three,
2677 * one or two pages may be held during the wait, only to be released
2678 * afterwards. This conservative approach makes it easy to show that
2679 * no race conditions can occur.
2680 */
2681 static vm_page_t
2682 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2683 {
2684 vm_page_t m, l1pg, l2pg;
2685
2686 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2687
2688 /*
2689 * Allocate a page table page.
2690 */
2691 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2692 if (lockp != NULL) {
2693 RELEASE_PV_LIST_LOCK(lockp);
2694 PMAP_UNLOCK(pmap);
2695 vm_wait(NULL);
2696 PMAP_LOCK(pmap);
2697 }
2698
2699 /*
2700 * Indicate the need to retry. While waiting, the page table
2701 * page may have been allocated.
2702 */
2703 return (NULL);
2704 }
2705 m->pindex = ptepindex;
2706
2707 /*
2708 * Because of AArch64's weak memory consistency model, we must have a
2709 * barrier here to ensure that the stores for zeroing "m", whether by
2710 * pmap_zero_page() or an earlier function, are visible before adding
2711 * "m" to the page table. Otherwise, a page table walk by another
2712 * processor's MMU could see the mapping to "m" and a stale, non-zero
2713 * PTE within "m".
2714 */
2715 dmb(ishst);
2716
2717 /*
2718 * Map the pagetable page into the process address space, if
2719 * it isn't already there.
2720 */
2721
2722 if (ptepindex >= (NUL2E + NUL1E)) {
2723 pd_entry_t *l0p, l0e;
2724 vm_pindex_t l0index;
2725
2726 l0index = ptepindex - (NUL2E + NUL1E);
2727 l0p = &pmap->pm_l0[l0index];
2728 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2729 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2730 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2731
2732 /*
2733 * Mark all kernel memory as not accessible from userspace
2734 * and userspace memory as not executable from the kernel.
2735 * This has been done for the bootstrap L0 entries in
2736 * locore.S.
2737 */
2738 if (pmap == kernel_pmap)
2739 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2740 else
2741 l0e |= TATTR_PXN_TABLE;
2742 pmap_store(l0p, l0e);
2743 } else if (ptepindex >= NUL2E) {
2744 vm_pindex_t l0index, l1index;
2745 pd_entry_t *l0, *l1;
2746 pd_entry_t tl0;
2747
2748 l1index = ptepindex - NUL2E;
2749 l0index = l1index >> Ln_ENTRIES_SHIFT;
2750
2751 l0 = &pmap->pm_l0[l0index];
2752 tl0 = pmap_load(l0);
2753 if (tl0 == 0) {
2754 /* recurse for allocating page dir */
2755 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2756 lockp) == NULL) {
2757 vm_page_unwire_noq(m);
2758 vm_page_free_zero(m);
2759 return (NULL);
2760 }
2761 } else {
2762 l1pg = PTE_TO_VM_PAGE(tl0);
2763 l1pg->ref_count++;
2764 }
2765
2766 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2767 l1 = &l1[ptepindex & Ln_ADDR_MASK];
2768 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2769 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2770 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2771 } else {
2772 vm_pindex_t l0index, l1index;
2773 pd_entry_t *l0, *l1, *l2;
2774 pd_entry_t tl0, tl1;
2775
2776 l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2777 l0index = l1index >> Ln_ENTRIES_SHIFT;
2778
2779 l0 = &pmap->pm_l0[l0index];
2780 tl0 = pmap_load(l0);
2781 if (tl0 == 0) {
2782 /* recurse for allocating page dir */
2783 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2784 lockp) == NULL) {
2785 vm_page_unwire_noq(m);
2786 vm_page_free_zero(m);
2787 return (NULL);
2788 }
2789 tl0 = pmap_load(l0);
2790 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2791 l1 = &l1[l1index & Ln_ADDR_MASK];
2792 } else {
2793 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2794 l1 = &l1[l1index & Ln_ADDR_MASK];
2795 tl1 = pmap_load(l1);
2796 if (tl1 == 0) {
2797 /* recurse for allocating page dir */
2798 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2799 lockp) == NULL) {
2800 vm_page_unwire_noq(m);
2801 vm_page_free_zero(m);
2802 return (NULL);
2803 }
2804 } else {
2805 l2pg = PTE_TO_VM_PAGE(tl1);
2806 l2pg->ref_count++;
2807 }
2808 }
2809
2810 l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2811 l2 = &l2[ptepindex & Ln_ADDR_MASK];
2812 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2813 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2814 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
2815 }
2816
2817 pmap_resident_count_inc(pmap, 1);
2818
2819 return (m);
2820 }
2821
2822 static pd_entry_t *
2823 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2824 struct rwlock **lockp)
2825 {
2826 pd_entry_t *l1, *l2;
2827 vm_page_t l2pg;
2828 vm_pindex_t l2pindex;
2829
2830 KASSERT(ADDR_IS_CANONICAL(va),
2831 ("%s: Address not in canonical form: %lx", __func__, va));
2832
2833 retry:
2834 l1 = pmap_l1(pmap, va);
2835 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2836 l2 = pmap_l1_to_l2(l1, va);
2837 if (!ADDR_IS_KERNEL(va)) {
2838 /* Add a reference to the L2 page. */
2839 l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
2840 l2pg->ref_count++;
2841 } else
2842 l2pg = NULL;
2843 } else if (!ADDR_IS_KERNEL(va)) {
2844 /* Allocate a L2 page. */
2845 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2846 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2847 if (l2pg == NULL) {
2848 if (lockp != NULL)
2849 goto retry;
2850 else
2851 return (NULL);
2852 }
2853 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2854 l2 = &l2[pmap_l2_index(va)];
2855 } else
2856 panic("pmap_alloc_l2: missing page table page for va %#lx",
2857 va);
2858 *l2pgp = l2pg;
2859 return (l2);
2860 }
2861
2862 static vm_page_t
2863 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2864 {
2865 vm_pindex_t ptepindex;
2866 pd_entry_t *pde, tpde;
2867 #ifdef INVARIANTS
2868 pt_entry_t *pte;
2869 #endif
2870 vm_page_t m;
2871 int lvl;
2872
2873 /*
2874 * Calculate pagetable page index
2875 */
2876 ptepindex = pmap_l2_pindex(va);
2877 retry:
2878 /*
2879 * Get the page directory entry
2880 */
2881 pde = pmap_pde(pmap, va, &lvl);
2882
2883 /*
2884 * If the page table page is mapped, we just increment the hold count,
2885 * and activate it. If we get a level 2 pde it will point to a level 3
2886 * table.
2887 */
2888 switch (lvl) {
2889 case -1:
2890 break;
2891 case 0:
2892 #ifdef INVARIANTS
2893 pte = pmap_l0_to_l1(pde, va);
2894 KASSERT(pmap_load(pte) == 0,
2895 ("pmap_alloc_l3: TODO: l0 superpages"));
2896 #endif
2897 break;
2898 case 1:
2899 #ifdef INVARIANTS
2900 pte = pmap_l1_to_l2(pde, va);
2901 KASSERT(pmap_load(pte) == 0,
2902 ("pmap_alloc_l3: TODO: l1 superpages"));
2903 #endif
2904 break;
2905 case 2:
2906 tpde = pmap_load(pde);
2907 if (tpde != 0) {
2908 m = PTE_TO_VM_PAGE(tpde);
2909 m->ref_count++;
2910 return (m);
2911 }
2912 break;
2913 default:
2914 panic("pmap_alloc_l3: Invalid level %d", lvl);
2915 }
2916
2917 /*
2918 * Here if the pte page isn't mapped, or if it has been deallocated.
2919 */
2920 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2921 if (m == NULL && lockp != NULL)
2922 goto retry;
2923
2924 return (m);
2925 }
2926
2927 /***************************************************
2928 * Pmap allocation/deallocation routines.
2929 ***************************************************/
2930
2931 /*
2932 * Release any resources held by the given physical map.
2933 * Called when a pmap initialized by pmap_pinit is being released.
2934 * Should only be called if the map contains no valid mappings.
2935 */
2936 void
2937 pmap_release(pmap_t pmap)
2938 {
2939 bool rv __diagused;
2940 struct spglist freelist;
2941 struct asid_set *set;
2942 vm_page_t m;
2943 int asid;
2944
2945 if (pmap->pm_levels != 4) {
2946 PMAP_ASSERT_STAGE2(pmap);
2947 KASSERT(pmap->pm_stats.resident_count == 1,
2948 ("pmap_release: pmap resident count %ld != 0",
2949 pmap->pm_stats.resident_count));
2950 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2951 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2952
2953 SLIST_INIT(&freelist);
2954 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2955 PMAP_LOCK(pmap);
2956 rv = pmap_unwire_l3(pmap, 0, m, &freelist);
2957 PMAP_UNLOCK(pmap);
2958 MPASS(rv == true);
2959 vm_page_free_pages_toq(&freelist, true);
2960 }
2961
2962 KASSERT(pmap->pm_stats.resident_count == 0,
2963 ("pmap_release: pmap resident count %ld != 0",
2964 pmap->pm_stats.resident_count));
2965 KASSERT(vm_radix_is_empty(&pmap->pm_root),
2966 ("pmap_release: pmap has reserved page table page(s)"));
2967
2968 set = pmap->pm_asid_set;
2969 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2970
2971 /*
2972 * Allow the ASID to be reused. For stage 2 (VMID) pmaps we don't
2973 * invalidate the TLB when removing entries; instead we rely on the TLB
2974 * invalidation that happens when the VMID generation is updated.
2975 * Because of this we don't reuse VMIDs within a generation.
2976 */
2977 if (pmap->pm_stage == PM_STAGE1) {
2978 mtx_lock_spin(&set->asid_set_mutex);
2979 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2980 asid = COOKIE_TO_ASID(pmap->pm_cookie);
2981 KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2982 asid < set->asid_set_size,
2983 ("pmap_release: pmap cookie has out-of-range asid"));
2984 bit_clear(set->asid_set, asid);
2985 }
2986 mtx_unlock_spin(&set->asid_set_mutex);
2987
2988 if (pmap->pm_bti != NULL) {
2989 rangeset_fini(pmap->pm_bti);
2990 free(pmap->pm_bti, M_DEVBUF);
2991 }
2992 }
2993
2994 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2995 vm_page_unwire_noq(m);
2996 vm_page_free_zero(m);
2997 }
2998
2999 static int
3000 kvm_size(SYSCTL_HANDLER_ARGS)
3001 {
3002 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
3003
3004 return sysctl_handle_long(oidp, &ksize, 0, req);
3005 }
3006 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3007 0, 0, kvm_size, "LU",
3008 "Size of KVM");
3009
3010 static int
3011 kvm_free(SYSCTL_HANDLER_ARGS)
3012 {
3013 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
3014
3015 return sysctl_handle_long(oidp, &kfree, 0, req);
3016 }
3017 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3018 0, 0, kvm_free, "LU",
3019 "Amount of KVM free");
3020
3021 /*
3022 * grow the number of kernel page table entries, if needed
3023 */
3024 void
3025 pmap_growkernel(vm_offset_t addr)
3026 {
3027 vm_page_t nkpg;
3028 pd_entry_t *l0, *l1, *l2;
3029
3030 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
3031
3032 addr = roundup2(addr, L2_SIZE);
3033 if (addr - 1 >= vm_map_max(kernel_map))
3034 addr = vm_map_max(kernel_map);
3035 if (kernel_vm_end < addr) {
3036 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
3037 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
3038 }
3039 while (kernel_vm_end < addr) {
3040 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
3041 KASSERT(pmap_load(l0) != 0,
3042 ("pmap_growkernel: No level 0 kernel entry"));
3043
3044 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
3045 if (pmap_load(l1) == 0) {
3046 /* We need a new PDP entry */
3047 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3048 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3049 if (nkpg == NULL)
3050 panic("pmap_growkernel: no memory to grow kernel");
3051 nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
3052 /* See the dmb() in _pmap_alloc_l3(). */
3053 dmb(ishst);
3054 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
3055 continue; /* try again */
3056 }
3057 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
3058 if (pmap_load(l2) != 0) {
3059 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3060 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3061 kernel_vm_end = vm_map_max(kernel_map);
3062 break;
3063 }
3064 continue;
3065 }
3066
3067 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3068 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3069 if (nkpg == NULL)
3070 panic("pmap_growkernel: no memory to grow kernel");
3071 nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
3072 /* See the dmb() in _pmap_alloc_l3(). */
3073 dmb(ishst);
3074 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
3075
3076 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3077 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3078 kernel_vm_end = vm_map_max(kernel_map);
3079 break;
3080 }
3081 }
3082 }
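
/*
 * Worked example (illustrative, assuming 4K base pages so that L2_SIZE is
 * 2MB): a request to grow the kernel map by a single page is first rounded
 * up with roundup2(addr, L2_SIZE), so the loop above always advances
 * kernel_vm_end in whole 2MB steps, allocating a new page table page for
 * an empty L1 entry and/or an empty L2 entry as needed along the way.
 */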
3083
3084 /***************************************************
3085 * page management routines.
3086 ***************************************************/
3087
3088 static const uint64_t pc_freemask[_NPCM] = {
3089 [0 ... _NPCM - 2] = PC_FREEN,
3090 [_NPCM - 1] = PC_FREEL
3091 };
3092
3093 #ifdef PV_STATS
3094 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
3095
3096 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
3097 "Current number of pv entry chunks");
3098 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
3099 "Current number of pv entry chunks allocated");
3100 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
3101 "Current number of pv entry chunks frees");
3102 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
3103 "Number of times tried to get a chunk page but failed.");
3104
3105 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
3106 static int pv_entry_spare;
3107
3108 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
3109 "Current number of pv entry frees");
3110 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
3111 "Current number of pv entry allocs");
3112 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
3113 "Current number of pv entries");
3114 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3115 "Current number of spare pv entries");
3116 #endif
3117
3118 /*
3119 * We are in a serious low memory condition. Resort to
3120 * drastic measures to free some pages so we can allocate
3121 * another pv entry chunk.
3122 *
3123 * Returns NULL if PV entries were reclaimed from the specified pmap.
3124 *
3125 * We do not, however, unmap 2mpages because subsequent accesses will
3126 * allocate per-page pv entries until repromotion occurs, thereby
3127 * exacerbating the shortage of free pv entries.
3128 */
3129 static vm_page_t
3130 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3131 {
3132 struct pv_chunks_list *pvc;
3133 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3134 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3135 struct md_page *pvh;
3136 pd_entry_t *pde;
3137 pmap_t next_pmap, pmap;
3138 pt_entry_t *pte, tpte;
3139 pv_entry_t pv;
3140 vm_offset_t va;
3141 vm_page_t m, m_pc;
3142 struct spglist free;
3143 uint64_t inuse;
3144 int bit, field, freed, lvl;
3145
3146 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3147 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3148
3149 pmap = NULL;
3150 m_pc = NULL;
3151 SLIST_INIT(&free);
3152 bzero(&pc_marker_b, sizeof(pc_marker_b));
3153 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3154 pc_marker = (struct pv_chunk *)&pc_marker_b;
3155 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3156
3157 pvc = &pv_chunks[domain];
3158 mtx_lock(&pvc->pvc_lock);
3159 pvc->active_reclaims++;
3160 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3161 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3162 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3163 SLIST_EMPTY(&free)) {
3164 next_pmap = pc->pc_pmap;
3165 if (next_pmap == NULL) {
3166 /*
3167 * The next chunk is a marker. However, it is
3168 * not our marker, so active_reclaims must be
3169 * > 1. Consequently, the next_chunk code
3170 * will not rotate the pv_chunks list.
3171 */
3172 goto next_chunk;
3173 }
3174 mtx_unlock(&pvc->pvc_lock);
3175
3176 /*
3177 * A pv_chunk can only be removed from the pc_lru list
3178 * when both pvc->pvc_lock is owned and the
3179 * corresponding pmap is locked.
3180 */
3181 if (pmap != next_pmap) {
3182 if (pmap != NULL && pmap != locked_pmap)
3183 PMAP_UNLOCK(pmap);
3184 pmap = next_pmap;
3185 /* Avoid deadlock and lock recursion. */
3186 if (pmap > locked_pmap) {
3187 RELEASE_PV_LIST_LOCK(lockp);
3188 PMAP_LOCK(pmap);
3189 mtx_lock(&pvc->pvc_lock);
3190 continue;
3191 } else if (pmap != locked_pmap) {
3192 if (PMAP_TRYLOCK(pmap)) {
3193 mtx_lock(&pvc->pvc_lock);
3194 continue;
3195 } else {
3196 pmap = NULL; /* pmap is not locked */
3197 mtx_lock(&pvc->pvc_lock);
3198 pc = TAILQ_NEXT(pc_marker, pc_lru);
3199 if (pc == NULL ||
3200 pc->pc_pmap != next_pmap)
3201 continue;
3202 goto next_chunk;
3203 }
3204 }
3205 }
3206
3207 /*
3208 * Destroy every non-wired, 4 KB page mapping in the chunk.
3209 */
3210 freed = 0;
3211 for (field = 0; field < _NPCM; field++) {
3212 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3213 inuse != 0; inuse &= ~(1UL << bit)) {
3214 bit = ffsl(inuse) - 1;
3215 pv = &pc->pc_pventry[field * 64 + bit];
3216 va = pv->pv_va;
3217 pde = pmap_pde(pmap, va, &lvl);
3218 if (lvl != 2)
3219 continue;
3220 pte = pmap_l2_to_l3(pde, va);
3221 tpte = pmap_load(pte);
3222 if ((tpte & ATTR_SW_WIRED) != 0)
3223 continue;
3224 if ((tpte & ATTR_CONTIGUOUS) != 0)
3225 (void)pmap_demote_l3c(pmap, pte, va);
3226 tpte = pmap_load_clear(pte);
3227 m = PTE_TO_VM_PAGE(tpte);
3228 if (pmap_pte_dirty(pmap, tpte))
3229 vm_page_dirty(m);
3230 if ((tpte & ATTR_AF) != 0) {
3231 pmap_s1_invalidate_page(pmap, va, true);
3232 vm_page_aflag_set(m, PGA_REFERENCED);
3233 }
3234 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3235 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3236 m->md.pv_gen++;
3237 if (TAILQ_EMPTY(&m->md.pv_list) &&
3238 (m->flags & PG_FICTITIOUS) == 0) {
3239 pvh = page_to_pvh(m);
3240 if (TAILQ_EMPTY(&pvh->pv_list)) {
3241 vm_page_aflag_clear(m,
3242 PGA_WRITEABLE);
3243 }
3244 }
3245 pc->pc_map[field] |= 1UL << bit;
3246 pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3247 freed++;
3248 }
3249 }
3250 if (freed == 0) {
3251 mtx_lock(&pvc->pvc_lock);
3252 goto next_chunk;
3253 }
3254 /* Every freed mapping is for a 4 KB page. */
3255 pmap_resident_count_dec(pmap, freed);
3256 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3257 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3258 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3259 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3260 if (pc_is_free(pc)) {
3261 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3262 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3263 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3264 /* Entire chunk is free; return it. */
3265 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3266 dump_drop_page(m_pc->phys_addr);
3267 mtx_lock(&pvc->pvc_lock);
3268 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3269 break;
3270 }
3271 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3272 mtx_lock(&pvc->pvc_lock);
3273 /* One freed pv entry in locked_pmap is sufficient. */
3274 if (pmap == locked_pmap)
3275 break;
3276
3277 next_chunk:
3278 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3279 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3280 if (pvc->active_reclaims == 1 && pmap != NULL) {
3281 /*
3282 * Rotate the pv chunks list so that we do not
3283 * scan the same pv chunks that could not be
3284 * freed (because they contained a wired
3285 * and/or superpage mapping) on every
3286 * invocation of reclaim_pv_chunk().
3287 */
3288 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3289 MPASS(pc->pc_pmap != NULL);
3290 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3291 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3292 }
3293 }
3294 }
3295 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3296 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3297 pvc->active_reclaims--;
3298 mtx_unlock(&pvc->pvc_lock);
3299 if (pmap != NULL && pmap != locked_pmap)
3300 PMAP_UNLOCK(pmap);
3301 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3302 m_pc = SLIST_FIRST(&free);
3303 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3304 /* Recycle a freed page table page. */
3305 m_pc->ref_count = 1;
3306 }
3307 vm_page_free_pages_toq(&free, true);
3308 return (m_pc);
3309 }
3310
3311 static vm_page_t
3312 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3313 {
3314 vm_page_t m;
3315 int i, domain;
3316
3317 domain = PCPU_GET(domain);
3318 for (i = 0; i < vm_ndomains; i++) {
3319 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3320 if (m != NULL)
3321 break;
3322 domain = (domain + 1) % vm_ndomains;
3323 }
3324
3325 return (m);
3326 }
3327
3328 /*
3329 * free the pv_entry back to the free list
3330 */
3331 static void
3332 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3333 {
3334 struct pv_chunk *pc;
3335 int idx, field, bit;
3336
3337 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3338 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3339 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3340 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3341 pc = pv_to_chunk(pv);
3342 idx = pv - &pc->pc_pventry[0];
3343 field = idx / 64;
3344 bit = idx % 64;
3345 pc->pc_map[field] |= 1ul << bit;
3346 if (!pc_is_free(pc)) {
3347 /* 98% of the time, pc is already at the head of the list. */
3348 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3349 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3350 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3351 }
3352 return;
3353 }
3354 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3355 free_pv_chunk(pc);
3356 }
3357
3358 static void
3359 free_pv_chunk_dequeued(struct pv_chunk *pc)
3360 {
3361 vm_page_t m;
3362
3363 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3364 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3365 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3366 /* entire chunk is free, return it */
3367 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3368 dump_drop_page(m->phys_addr);
3369 vm_page_unwire_noq(m);
3370 vm_page_free(m);
3371 }
3372
3373 static void
3374 free_pv_chunk(struct pv_chunk *pc)
3375 {
3376 struct pv_chunks_list *pvc;
3377
3378 pvc = &pv_chunks[pc_to_domain(pc)];
3379 mtx_lock(&pvc->pvc_lock);
3380 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3381 mtx_unlock(&pvc->pvc_lock);
3382 free_pv_chunk_dequeued(pc);
3383 }
3384
3385 static void
3386 free_pv_chunk_batch(struct pv_chunklist *batch)
3387 {
3388 struct pv_chunks_list *pvc;
3389 struct pv_chunk *pc, *npc;
3390 int i;
3391
3392 for (i = 0; i < vm_ndomains; i++) {
3393 if (TAILQ_EMPTY(&batch[i]))
3394 continue;
3395 pvc = &pv_chunks[i];
3396 mtx_lock(&pvc->pvc_lock);
3397 TAILQ_FOREACH(pc, &batch[i], pc_list) {
3398 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3399 }
3400 mtx_unlock(&pvc->pvc_lock);
3401 }
3402
3403 for (i = 0; i < vm_ndomains; i++) {
3404 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3405 free_pv_chunk_dequeued(pc);
3406 }
3407 }
3408 }
3409
3410 /*
3411 * Returns a new PV entry, allocating a new PV chunk from the system when
3412 * needed. If this PV chunk allocation fails and a PV list lock pointer was
3413 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
3414 * returned.
3415 *
3416 * The given PV list lock may be released.
3417 */
3418 static pv_entry_t
3419 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3420 {
3421 struct pv_chunks_list *pvc;
3422 int bit, field;
3423 pv_entry_t pv;
3424 struct pv_chunk *pc;
3425 vm_page_t m;
3426
3427 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3428 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3429 retry:
3430 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3431 if (pc != NULL) {
3432 for (field = 0; field < _NPCM; field++) {
3433 if (pc->pc_map[field]) {
3434 bit = ffsl(pc->pc_map[field]) - 1;
3435 break;
3436 }
3437 }
3438 if (field < _NPCM) {
3439 pv = &pc->pc_pventry[field * 64 + bit];
3440 pc->pc_map[field] &= ~(1ul << bit);
3441 /* If this was the last item, move it to tail */
3442 if (pc_is_full(pc)) {
3443 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3444 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3445 pc_list);
3446 }
3447 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3448 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3449 return (pv);
3450 }
3451 }
3452 /* No free items, allocate another chunk */
3453 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3454 if (m == NULL) {
3455 if (lockp == NULL) {
3456 PV_STAT(pc_chunk_tryfail++);
3457 return (NULL);
3458 }
3459 m = reclaim_pv_chunk(pmap, lockp);
3460 if (m == NULL)
3461 goto retry;
3462 }
3463 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3464 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3465 dump_add_page(m->phys_addr);
3466 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3467 pc->pc_pmap = pmap;
3468 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3469 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */
3470 pvc = &pv_chunks[vm_page_domain(m)];
3471 mtx_lock(&pvc->pvc_lock);
3472 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3473 mtx_unlock(&pvc->pvc_lock);
3474 pv = &pc->pc_pventry[0];
3475 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3476 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3477 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3478 return (pv);
3479 }
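
/*
 * Usage sketch (illustrative): callers that can tolerate reclamation pass
 * a PV list lock pointer, which may be dropped and reacquired, e.g.
 *
 *	pv = get_pv_entry(pmap, &lock);
 *
 * while callers that cannot allow the lock to be dropped pass NULL and
 * must handle a NULL return themselves:
 *
 *	if ((pv = get_pv_entry(pmap, NULL)) == NULL)
 *		(fall back or fail the mapping attempt)
 */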
3480
3481 /*
3482 * Ensure that the number of spare PV entries in the specified pmap meets or
3483 * exceeds the given count, "needed".
3484 *
3485 * The given PV list lock may be released.
3486 */
3487 static void
3488 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3489 {
3490 struct pv_chunks_list *pvc;
3491 struct pch new_tail[PMAP_MEMDOM];
3492 struct pv_chunk *pc;
3493 vm_page_t m;
3494 int avail, free, i;
3495 bool reclaimed;
3496
3497 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3498 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3499
3500 /*
3501 * Newly allocated PV chunks must be stored in a private list until
3502 * the required number of PV chunks have been allocated. Otherwise,
3503 * reclaim_pv_chunk() could recycle one of these chunks. In
3504 * contrast, these chunks must be added to the pmap upon allocation.
3505 */
3506 for (i = 0; i < PMAP_MEMDOM; i++)
3507 TAILQ_INIT(&new_tail[i]);
3508 retry:
3509 avail = 0;
3510 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3511 bit_count((bitstr_t *)pc->pc_map, 0,
3512 sizeof(pc->pc_map) * NBBY, &free);
3513 if (free == 0)
3514 break;
3515 avail += free;
3516 if (avail >= needed)
3517 break;
3518 }
3519 for (reclaimed = false; avail < needed; avail += _NPCPV) {
3520 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3521 if (m == NULL) {
3522 m = reclaim_pv_chunk(pmap, lockp);
3523 if (m == NULL)
3524 goto retry;
3525 reclaimed = true;
3526 }
3527 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3528 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3529 dump_add_page(m->phys_addr);
3530 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3531 pc->pc_pmap = pmap;
3532 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3533 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3534 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3535 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3536
3537 /*
3538 * The reclaim might have freed a chunk from the current pmap.
3539 * If that chunk contained available entries, we need to
3540 * re-count the number of available entries.
3541 */
3542 if (reclaimed)
3543 goto retry;
3544 }
3545 for (i = 0; i < vm_ndomains; i++) {
3546 if (TAILQ_EMPTY(&new_tail[i]))
3547 continue;
3548 pvc = &pv_chunks[i];
3549 mtx_lock(&pvc->pvc_lock);
3550 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3551 mtx_unlock(&pvc->pvc_lock);
3552 }
3553 }
3554
3555 /*
3556 * First find and then remove the pv entry for the specified pmap and virtual
3557 * address from the specified pv list. Returns the pv entry if found and NULL
3558 * otherwise. This operation can be performed on pv lists for either 4KB or
3559 * 2MB page mappings.
3560 */
3561 static __inline pv_entry_t
3562 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3563 {
3564 pv_entry_t pv;
3565
3566 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3567 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3568 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3569 pvh->pv_gen++;
3570 break;
3571 }
3572 }
3573 return (pv);
3574 }
3575
3576 /*
3577 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3578 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3579 * entries for each of the 4KB page mappings.
3580 */
3581 static void
3582 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3583 struct rwlock **lockp)
3584 {
3585 struct md_page *pvh;
3586 struct pv_chunk *pc;
3587 pv_entry_t pv;
3588 vm_offset_t va_last;
3589 vm_page_t m;
3590 int bit, field;
3591
3592 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3593 KASSERT((va & L2_OFFSET) == 0,
3594 ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3595 KASSERT((pa & L2_OFFSET) == 0,
3596 ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3597 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3598
3599 /*
3600 * Transfer the 2mpage's pv entry for this mapping to the first
3601 * page's pv list. Once this transfer begins, the pv list lock
3602 * must not be released until the last pv entry is reinstantiated.
3603 */
3604 pvh = pa_to_pvh(pa);
3605 pv = pmap_pvh_remove(pvh, pmap, va);
3606 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3607 m = PHYS_TO_VM_PAGE(pa);
3608 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3609 m->md.pv_gen++;
3610 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3611 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3612 va_last = va + L2_SIZE - PAGE_SIZE;
3613 for (;;) {
3614 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3615 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3616 for (field = 0; field < _NPCM; field++) {
3617 while (pc->pc_map[field]) {
3618 bit = ffsl(pc->pc_map[field]) - 1;
3619 pc->pc_map[field] &= ~(1ul << bit);
3620 pv = &pc->pc_pventry[field * 64 + bit];
3621 va += PAGE_SIZE;
3622 pv->pv_va = va;
3623 m++;
3624 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3625 ("pmap_pv_demote_l2: page %p is not managed", m));
3626 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3627 m->md.pv_gen++;
3628 if (va == va_last)
3629 goto out;
3630 }
3631 }
3632 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3633 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3634 }
3635 out:
3636 if (pc_is_full(pc)) {
3637 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3638 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3639 }
3640 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3641 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3642 }
3643
3644 /*
3645 * First find and then destroy the pv entry for the specified pmap and virtual
3646 * address. This operation can be performed on pv lists for either 4KB or 2MB
3647 * page mappings.
3648 */
3649 static void
3650 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3651 {
3652 pv_entry_t pv;
3653
3654 pv = pmap_pvh_remove(pvh, pmap, va);
3655 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3656 free_pv_entry(pmap, pv);
3657 }
3658
3659 /*
3660 * Conditionally create the PV entry for a 4KB page mapping if the required
3661 * memory can be allocated without resorting to reclamation.
3662 */
3663 static bool
3664 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3665 struct rwlock **lockp)
3666 {
3667 pv_entry_t pv;
3668
3669 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3670 /* Pass NULL instead of the lock pointer to disable reclamation. */
3671 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3672 pv->pv_va = va;
3673 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3674 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3675 m->md.pv_gen++;
3676 return (true);
3677 } else
3678 return (false);
3679 }
3680
3681 /*
3682 * Create the PV entry for a 2MB page mapping. Always returns true unless the
3683 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
3684 * false if the PV entry cannot be allocated without resorting to reclamation.
3685 */
3686 static bool
3687 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3688 struct rwlock **lockp)
3689 {
3690 struct md_page *pvh;
3691 pv_entry_t pv;
3692 vm_paddr_t pa;
3693
3694 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3695 /* Pass NULL instead of the lock pointer to disable reclamation. */
3696 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3697 NULL : lockp)) == NULL)
3698 return (false);
3699 pv->pv_va = va;
3700 pa = PTE_TO_PHYS(l2e);
3701 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3702 pvh = pa_to_pvh(pa);
3703 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3704 pvh->pv_gen++;
3705 return (true);
3706 }
3707
3708 /*
3709 * Conditionally creates the PV entries for a L3C superpage mapping if
3710 * the required memory can be allocated without resorting to reclamation.
3711 */
3712 static bool
3713 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3714 struct rwlock **lockp)
3715 {
3716 pv_entry_t pv;
3717 vm_offset_t tva;
3718 vm_paddr_t pa __diagused;
3719 vm_page_t mt;
3720
3721 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3722 KASSERT((va & L3C_OFFSET) == 0,
3723 ("pmap_pv_insert_l3c: va is not aligned"));
3724 pa = VM_PAGE_TO_PHYS(m);
3725 KASSERT((pa & L3C_OFFSET) == 0,
3726 ("pmap_pv_insert_l3c: pa is not aligned"));
3727 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3728 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3729 /* Pass NULL instead of lockp to disable reclamation. */
3730 pv = get_pv_entry(pmap, NULL);
3731 if (__predict_false(pv == NULL)) {
3732 while (tva > va) {
3733 mt--;
3734 tva -= L3_SIZE;
3735 pmap_pvh_free(&mt->md, pmap, tva);
3736 }
3737 return (false);
3738 }
3739 pv->pv_va = tva;
3740 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3741 mt->md.pv_gen++;
3742 }
3743 return (true);
3744 }
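/*
 * Note on terminology: an "L3C superpage" in this file is a run of
 * L3C_ENTRIES consecutive level 3 entries, all marked ATTR_CONTIGUOUS, that
 * map L3C_OFFSET-aligned, physically contiguous memory.  (On a 4KB granule
 * the contiguous hint presumably covers 16 entries, i.e. a 64KB region, but
 * the exact geometry comes from the L3C_* definitions elsewhere in the tree.)
 * The unwind loop above simply walks back over the entries already inserted
 * when a PV allocation fails partway through.
 */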
3745
3746 static void
3747 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3748 {
3749 pt_entry_t newl2, oldl2 __diagused;
3750 vm_page_t ml3;
3751 vm_paddr_t ml3pa;
3752
3753 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3754 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3755 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3756
3757 ml3 = pmap_remove_pt_page(pmap, va);
3758 if (ml3 == NULL)
3759 panic("pmap_remove_kernel_l2: Missing pt page");
3760
3761 ml3pa = VM_PAGE_TO_PHYS(ml3);
3762 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3763
3764 /*
3765 * If this page table page was unmapped by a promotion, then it
3766 * contains valid mappings. Zero it to invalidate those mappings.
3767 */
3768 if (vm_page_any_valid(ml3))
3769 pagezero((void *)PHYS_TO_DMAP(ml3pa));
3770
3771 /*
3772 * Demote the mapping. The caller must have already invalidated the
3773 * mapping (i.e., the "break" in break-before-make).
3774 */
3775 oldl2 = pmap_load_store(l2, newl2);
3776 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3777 __func__, l2, oldl2));
3778 }
3779
3780 /*
3781  * pmap_remove_l2: Remove a level 2 (2MB) superpage mapping.
3782 */
3783 static int
3784 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3785 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3786 {
3787 struct md_page *pvh;
3788 pt_entry_t old_l2;
3789 vm_page_t m, ml3, mt;
3790
3791 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3792 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3793 old_l2 = pmap_load_clear(l2);
3794 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3795 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3796
3797 /*
3798 * Since a promotion must break the 4KB page mappings before making
3799 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3800 */
3801 pmap_s1_invalidate_page(pmap, sva, true);
3802
3803 if (old_l2 & ATTR_SW_WIRED)
3804 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3805 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3806 if (old_l2 & ATTR_SW_MANAGED) {
3807 m = PTE_TO_VM_PAGE(old_l2);
3808 pvh = page_to_pvh(m);
3809 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3810 pmap_pvh_free(pvh, pmap, sva);
3811 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3812 if (pmap_pte_dirty(pmap, old_l2))
3813 vm_page_dirty(mt);
3814 if (old_l2 & ATTR_AF)
3815 vm_page_aflag_set(mt, PGA_REFERENCED);
3816 if (TAILQ_EMPTY(&mt->md.pv_list) &&
3817 TAILQ_EMPTY(&pvh->pv_list))
3818 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3819 }
3820 }
3821 if (pmap == kernel_pmap) {
3822 pmap_remove_kernel_l2(pmap, l2, sva);
3823 } else {
3824 ml3 = pmap_remove_pt_page(pmap, sva);
3825 if (ml3 != NULL) {
3826 KASSERT(vm_page_any_valid(ml3),
3827 ("pmap_remove_l2: l3 page not promoted"));
3828 pmap_resident_count_dec(pmap, 1);
3829 KASSERT(ml3->ref_count == NL3PG,
3830 ("pmap_remove_l2: l3 page ref count error"));
3831 ml3->ref_count = 0;
3832 pmap_add_delayed_free_list(ml3, free, false);
3833 }
3834 }
3835 return (pmap_unuse_pt(pmap, sva, l1e, free));
3836 }
3837
3838 /*
3839  * pmap_remove_l3: Remove a single 4KB page mapping from a pmap.
3840 */
3841 static int
3842 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3843 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3844 {
3845 struct md_page *pvh;
3846 pt_entry_t old_l3;
3847 vm_page_t m;
3848
3849 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3850 old_l3 = pmap_load(l3);
3851 if ((old_l3 & ATTR_CONTIGUOUS) != 0)
3852 (void)pmap_demote_l3c(pmap, l3, va);
3853 old_l3 = pmap_load_clear(l3);
3854 pmap_s1_invalidate_page(pmap, va, true);
3855 if (old_l3 & ATTR_SW_WIRED)
3856 pmap->pm_stats.wired_count -= 1;
3857 pmap_resident_count_dec(pmap, 1);
3858 if (old_l3 & ATTR_SW_MANAGED) {
3859 m = PTE_TO_VM_PAGE(old_l3);
3860 if (pmap_pte_dirty(pmap, old_l3))
3861 vm_page_dirty(m);
3862 if (old_l3 & ATTR_AF)
3863 vm_page_aflag_set(m, PGA_REFERENCED);
3864 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3865 pmap_pvh_free(&m->md, pmap, va);
3866 if (TAILQ_EMPTY(&m->md.pv_list) &&
3867 (m->flags & PG_FICTITIOUS) == 0) {
3868 pvh = page_to_pvh(m);
3869 if (TAILQ_EMPTY(&pvh->pv_list))
3870 vm_page_aflag_clear(m, PGA_WRITEABLE);
3871 }
3872 }
3873 return (pmap_unuse_pt(pmap, va, l2e, free));
3874 }
3875
3876 /*
3877 * Removes the specified L3C superpage mapping. Requests TLB invalidations
3878 * to be performed by the caller through the returned "*vap". Returns true
3879 * if the level 3 table "ml3" was unmapped and added to the spglist "free".
3880 * Otherwise, returns false.
3881 */
3882 static bool
3883 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
3884 vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
3885 struct rwlock **lockp)
3886 {
3887 struct md_page *pvh;
3888 struct rwlock *new_lock;
3889 pt_entry_t first_l3e, l3e, *tl3p;
3890 vm_offset_t tva;
3891 vm_page_t m, mt;
3892
3893 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3894 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
3895 0, ("pmap_remove_l3c: l3p is not aligned"));
3896 KASSERT((va & L3C_OFFSET) == 0,
3897 ("pmap_remove_l3c: va is not aligned"));
3898
3899 /*
3900 * Hardware accessed and dirty bit maintenance might only update a
3901 * single L3 entry, so we must combine the accessed and dirty bits
3902 * from this entire set of contiguous L3 entries.
3903 */
3904 first_l3e = pmap_load_clear(l3p);
3905 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
3906 l3e = pmap_load_clear(tl3p);
3907 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
3908 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
3909 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
3910 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
3911 first_l3e &= ~ATTR_S1_AP_RW_BIT;
3912 first_l3e |= l3e & ATTR_AF;
3913 }
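	/*
	 * At this point "first_l3e" is intended to summarize the whole
	 * contiguous run: it carries ATTR_AF if any constituent entry was
	 * referenced, and it reads as dirty if any constituent entry was
	 * dirty, so the single tests on "first_l3e" below apply to every
	 * 4KB page being removed.
	 */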
3914 if ((first_l3e & ATTR_SW_WIRED) != 0)
3915 pmap->pm_stats.wired_count -= L3C_ENTRIES;
3916 pmap_resident_count_dec(pmap, L3C_ENTRIES);
3917 if ((first_l3e & ATTR_SW_MANAGED) != 0) {
3918 m = PTE_TO_VM_PAGE(first_l3e);
3919 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3920 if (new_lock != *lockp) {
3921 if (*lockp != NULL) {
3922 /*
3923 * Pending TLB invalidations must be
3924 * performed before the PV list lock is
3925 * released. Otherwise, a concurrent
3926 * pmap_remove_all() on a physical page
3927 * could return while a stale TLB entry
3928 * still provides access to that page.
3929 */
3930 if (*vap != va_next) {
3931 pmap_invalidate_range(pmap, *vap, va,
3932 true);
3933 *vap = va_next;
3934 }
3935 rw_wunlock(*lockp);
3936 }
3937 *lockp = new_lock;
3938 rw_wlock(*lockp);
3939 }
3940 pvh = page_to_pvh(m);
3941 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
3942 L3_SIZE) {
3943 if (pmap_pte_dirty(pmap, first_l3e))
3944 vm_page_dirty(mt);
3945 if ((first_l3e & ATTR_AF) != 0)
3946 vm_page_aflag_set(mt, PGA_REFERENCED);
3947 pmap_pvh_free(&mt->md, pmap, tva);
3948 if (TAILQ_EMPTY(&mt->md.pv_list) &&
3949 TAILQ_EMPTY(&pvh->pv_list))
3950 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3951 }
3952 }
3953 if (*vap == va_next)
3954 *vap = va;
3955 if (ml3 != NULL) {
3956 ml3->ref_count -= L3C_ENTRIES;
3957 if (ml3->ref_count == 0) {
3958 _pmap_unwire_l3(pmap, va, ml3, free);
3959 return (true);
3960 }
3961 }
3962 return (false);
3963 }
3964
3965 /*
3966 * Remove the specified range of addresses from the L3 page table that is
3967 * identified by the given L2 entry.
3968 */
3969 static void
3970 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3971 vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3972 {
3973 struct md_page *pvh;
3974 struct rwlock *new_lock;
3975 pt_entry_t *l3, old_l3;
3976 vm_offset_t va;
3977 vm_page_t l3pg, m;
3978
3979 KASSERT(ADDR_IS_CANONICAL(sva),
3980 ("%s: Start address not in canonical form: %lx", __func__, sva));
3981 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3982 ("%s: End address not in canonical form: %lx", __func__, eva));
3983
3984 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3985 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3986 ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3987 l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
3988 va = eva;
3989 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3990 old_l3 = pmap_load(l3);
3991 if (!pmap_l3_valid(old_l3)) {
3992 if (va != eva) {
3993 pmap_invalidate_range(pmap, va, sva, true);
3994 va = eva;
3995 }
3996 continue;
3997 }
3998 if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
3999 /*
4000 * Is this entire set of contiguous L3 entries being
4001 * removed? Handle the possibility that "eva" is zero
4002 * because of address wraparound.
4003 */
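			/*
			 * (Concretely: if "eva" wrapped to zero, the unsigned
			 * expression "eva - 1" becomes the highest address,
			 * so "sva + L3C_OFFSET <= eva - 1" still tests
			 * whether the whole L3C range fits below the end of
			 * the region without itself overflowing.)
			 */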
4004 if ((sva & L3C_OFFSET) == 0 &&
4005 sva + L3C_OFFSET <= eva - 1) {
4006 if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
4007 l3pg, free, lockp)) {
4008 /* The L3 table was unmapped. */
4009 sva += L3C_SIZE;
4010 break;
4011 }
4012 l3 += L3C_ENTRIES - 1;
4013 sva += L3C_SIZE - L3_SIZE;
4014 continue;
4015 }
4016
4017 (void)pmap_demote_l3c(pmap, l3, sva);
4018 }
4019 old_l3 = pmap_load_clear(l3);
4020 if ((old_l3 & ATTR_SW_WIRED) != 0)
4021 pmap->pm_stats.wired_count--;
4022 pmap_resident_count_dec(pmap, 1);
4023 if ((old_l3 & ATTR_SW_MANAGED) != 0) {
4024 m = PTE_TO_VM_PAGE(old_l3);
4025 if (pmap_pte_dirty(pmap, old_l3))
4026 vm_page_dirty(m);
4027 if ((old_l3 & ATTR_AF) != 0)
4028 vm_page_aflag_set(m, PGA_REFERENCED);
4029 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4030 if (new_lock != *lockp) {
4031 if (*lockp != NULL) {
4032 /*
4033 * Pending TLB invalidations must be
4034 * performed before the PV list lock is
4035 * released. Otherwise, a concurrent
4036 * pmap_remove_all() on a physical page
4037 * could return while a stale TLB entry
4038 * still provides access to that page.
4039 */
4040 if (va != eva) {
4041 pmap_invalidate_range(pmap, va,
4042 sva, true);
4043 va = eva;
4044 }
4045 rw_wunlock(*lockp);
4046 }
4047 *lockp = new_lock;
4048 rw_wlock(*lockp);
4049 }
4050 pmap_pvh_free(&m->md, pmap, sva);
4051 if (TAILQ_EMPTY(&m->md.pv_list) &&
4052 (m->flags & PG_FICTITIOUS) == 0) {
4053 pvh = page_to_pvh(m);
4054 if (TAILQ_EMPTY(&pvh->pv_list))
4055 vm_page_aflag_clear(m, PGA_WRITEABLE);
4056 }
4057 }
4058 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
4059 /*
4060 * _pmap_unwire_l3() has already invalidated the TLB
4061 * entries at all levels for "sva". So, we need not
4062 * perform "sva += L3_SIZE;" here. Moreover, we need
4063 * not perform "va = sva;" if "sva" is at the start
4064 * of a new valid range consisting of a single page.
4065 */
4066 break;
4067 }
4068 if (va == eva)
4069 va = sva;
4070 }
4071 if (va != eva)
4072 pmap_invalidate_range(pmap, va, sva, true);
4073 }
4074
4075 static void
4076 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
4077 {
4078 struct rwlock *lock;
4079 vm_offset_t va_next;
4080 pd_entry_t *l0, *l1, *l2;
4081 pt_entry_t l3_paddr;
4082 struct spglist free;
4083
4084 /*
4085 * Perform an unsynchronized read. This is, however, safe.
4086 */
4087 if (pmap->pm_stats.resident_count == 0)
4088 return;
4089
4090 SLIST_INIT(&free);
4091
4092 PMAP_LOCK(pmap);
4093 if (map_delete)
4094 pmap_bti_on_remove(pmap, sva, eva);
4095
4096 lock = NULL;
4097 for (; sva < eva; sva = va_next) {
4098 if (pmap->pm_stats.resident_count == 0)
4099 break;
4100
4101 l0 = pmap_l0(pmap, sva);
4102 if (pmap_load(l0) == 0) {
4103 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4104 if (va_next < sva)
4105 va_next = eva;
4106 continue;
4107 }
4108
4109 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4110 if (va_next < sva)
4111 va_next = eva;
4112 l1 = pmap_l0_to_l1(l0, sva);
4113 if (pmap_load(l1) == 0)
4114 continue;
4115 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4116 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4117 KASSERT(va_next <= eva,
4118 ("partial update of non-transparent 1G page "
4119 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4120 pmap_load(l1), sva, eva, va_next));
4121 MPASS(pmap != kernel_pmap);
4122 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4123 pmap_clear(l1);
4124 pmap_s1_invalidate_page(pmap, sva, true);
4125 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
4126 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
4127 continue;
4128 }
4129
4130 /*
4131 * Calculate index for next page table.
4132 */
4133 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4134 if (va_next < sva)
4135 va_next = eva;
4136
4137 l2 = pmap_l1_to_l2(l1, sva);
4138 if (l2 == NULL)
4139 continue;
4140
4141 l3_paddr = pmap_load(l2);
4142
4143 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4144 if (sva + L2_SIZE == va_next && eva >= va_next) {
4145 pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
4146 &free, &lock);
4147 continue;
4148 } else if (pmap_demote_l2_locked(pmap, l2, sva,
4149 &lock) == NULL)
4150 continue;
4151 l3_paddr = pmap_load(l2);
4152 }
4153
4154 /*
4155 * Weed out invalid mappings.
4156 */
4157 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
4158 continue;
4159
4160 /*
4161 * Limit our scan to either the end of the va represented
4162 * by the current page table page, or to the end of the
4163 * range being removed.
4164 */
4165 if (va_next > eva)
4166 va_next = eva;
4167
4168 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
4169 &lock);
4170 }
4171 if (lock != NULL)
4172 rw_wunlock(lock);
4173 PMAP_UNLOCK(pmap);
4174 vm_page_free_pages_toq(&free, true);
4175 }
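/*
 * The walk above proceeds top-down: an empty L0 or L1 entry lets an entire
 * L0- or L1-sized span be skipped via "va_next"; a 1GB L1 block is cleared
 * directly; a 2MB L2 block is either removed whole (when the request covers
 * it) or demoted first; and anything else is handed to
 * pmap_remove_l3_range() one L3 table at a time.  Pages queued on the "free"
 * list are released only after the pmap lock has been dropped.
 */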
4176
4177 /*
4178 * Remove the given range of addresses from the specified map.
4179 *
4180 * It is assumed that the start and end are properly
4181 * rounded to the page size.
4182 */
4183 void
4184 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4185 {
4186 pmap_remove1(pmap, sva, eva, false);
4187 }
4188
4189 /*
4190 * Remove the given range of addresses as part of a logical unmap
4191 * operation. This has the effect of calling pmap_remove(), but
4192 * also clears any metadata that should persist for the lifetime
4193 * of a logical mapping.
4194 */
4195 void
4196 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4197 {
4198 pmap_remove1(pmap, sva, eva, true);
4199 }
4200
4201 /*
4202 * Routine: pmap_remove_all
4203 * Function:
4204 * Removes this physical page from
4205 * all physical maps in which it resides.
4206 * Reflects back modify bits to the pager.
4207 *
4208 * Notes:
4209 * Original versions of this routine were very
4210 * inefficient because they iteratively called
4211 * pmap_remove (slow...)
4212 */
4213
4214 void
4215 pmap_remove_all(vm_page_t m)
4216 {
4217 struct md_page *pvh;
4218 pv_entry_t pv;
4219 pmap_t pmap;
4220 struct rwlock *lock;
4221 pd_entry_t *pde, tpde;
4222 pt_entry_t *pte, tpte;
4223 vm_offset_t va;
4224 struct spglist free;
4225 int lvl, pvh_gen, md_gen;
4226
4227 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4228 ("pmap_remove_all: page %p is not managed", m));
4229 SLIST_INIT(&free);
4230 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4231 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4232 rw_wlock(lock);
4233 retry:
4234 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4235 pmap = PV_PMAP(pv);
4236 if (!PMAP_TRYLOCK(pmap)) {
4237 pvh_gen = pvh->pv_gen;
4238 rw_wunlock(lock);
4239 PMAP_LOCK(pmap);
4240 rw_wlock(lock);
4241 if (pvh_gen != pvh->pv_gen) {
4242 PMAP_UNLOCK(pmap);
4243 goto retry;
4244 }
4245 }
4246 va = pv->pv_va;
4247 pte = pmap_pte_exists(pmap, va, 2, __func__);
4248 pmap_demote_l2_locked(pmap, pte, va, &lock);
4249 PMAP_UNLOCK(pmap);
4250 }
4251 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4252 pmap = PV_PMAP(pv);
4253 if (!PMAP_TRYLOCK(pmap)) {
4254 pvh_gen = pvh->pv_gen;
4255 md_gen = m->md.pv_gen;
4256 rw_wunlock(lock);
4257 PMAP_LOCK(pmap);
4258 rw_wlock(lock);
4259 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4260 PMAP_UNLOCK(pmap);
4261 goto retry;
4262 }
4263 }
4264 pmap_resident_count_dec(pmap, 1);
4265
4266 pde = pmap_pde(pmap, pv->pv_va, &lvl);
4267 KASSERT(pde != NULL,
4268 ("pmap_remove_all: no page directory entry found"));
4269 KASSERT(lvl == 2,
4270 ("pmap_remove_all: invalid pde level %d", lvl));
4271 tpde = pmap_load(pde);
4272
4273 pte = pmap_l2_to_l3(pde, pv->pv_va);
4274 tpte = pmap_load(pte);
4275 if ((tpte & ATTR_CONTIGUOUS) != 0)
4276 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4277 tpte = pmap_load_clear(pte);
4278 if (tpte & ATTR_SW_WIRED)
4279 pmap->pm_stats.wired_count--;
4280 if ((tpte & ATTR_AF) != 0) {
4281 pmap_invalidate_page(pmap, pv->pv_va, true);
4282 vm_page_aflag_set(m, PGA_REFERENCED);
4283 }
4284
4285 /*
4286 * Update the vm_page_t clean and reference bits.
4287 */
4288 if (pmap_pte_dirty(pmap, tpte))
4289 vm_page_dirty(m);
4290 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4291 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4292 m->md.pv_gen++;
4293 free_pv_entry(pmap, pv);
4294 PMAP_UNLOCK(pmap);
4295 }
4296 vm_page_aflag_clear(m, PGA_WRITEABLE);
4297 rw_wunlock(lock);
4298 vm_page_free_pages_toq(&free, true);
4299 }
4300
4301 /*
4302  * Masks and sets bits in a level 2 page table entry in the specified pmap.
4303 */
4304 static void
4305 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4306 pt_entry_t nbits)
4307 {
4308 pd_entry_t old_l2;
4309 vm_page_t m, mt;
4310
4311 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4312 PMAP_ASSERT_STAGE1(pmap);
4313 KASSERT((sva & L2_OFFSET) == 0,
4314 ("pmap_protect_l2: sva is not 2mpage aligned"));
4315 old_l2 = pmap_load(l2);
4316 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4317 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4318
4319 /*
4320 * Return if the L2 entry already has the desired access restrictions
4321 * in place.
4322 */
4323 if ((old_l2 & mask) == nbits)
4324 return;
4325
4326 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4327 cpu_spinwait();
4328
4329 /*
4330 * When a dirty read/write superpage mapping is write protected,
4331 * update the dirty field of each of the superpage's constituent 4KB
4332 * pages.
4333 */
4334 if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4335 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4336 pmap_pte_dirty(pmap, old_l2)) {
4337 m = PTE_TO_VM_PAGE(old_l2);
4338 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4339 vm_page_dirty(mt);
4340 }
4341
4342 /*
4343 * Since a promotion must break the 4KB page mappings before making
4344 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4345 */
4346 pmap_s1_invalidate_page(pmap, sva, true);
4347 }
4348
4349 /*
4350 * Masks and sets bits in the specified L3C superpage mapping.
4351 *
4352 * Requests TLB invalidations to be performed by the caller through the
4353 * returned "*vap".
4354 */
4355 static void
4356 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4357 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4358 {
4359 pt_entry_t l3e, *tl3p;
4360 vm_page_t m, mt;
4361 bool dirty;
4362
4363 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4364 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4365 0, ("pmap_mask_set_l3c: l3p is not aligned"));
4366 KASSERT((va & L3C_OFFSET) == 0,
4367 ("pmap_mask_set_l3c: va is not aligned"));
4368 dirty = false;
4369 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4370 l3e = pmap_load(tl3p);
4371 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4372 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4373 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4374 cpu_spinwait();
4375 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4376 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4377 dirty = true;
4378 }
4379
4380 /*
4381 * When a dirty read/write superpage mapping is write protected,
4382 * update the dirty field of each of the superpage's constituent 4KB
4383 * pages.
4384 */
4385 if ((l3e & ATTR_SW_MANAGED) != 0 &&
4386 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4387 dirty) {
4388 m = PTE_TO_VM_PAGE(pmap_load(l3p));
4389 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4390 vm_page_dirty(mt);
4391 }
4392
4393 if (*vap == va_next)
4394 *vap = va;
4395 }
4396
4397 /*
4398  * Masks and sets bits in the last-level page table entries in the specified
4399  * pmap and range.
4400 */
4401 static void
4402 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4403 pt_entry_t nbits, bool invalidate)
4404 {
4405 vm_offset_t va, va_next;
4406 pd_entry_t *l0, *l1, *l2;
4407 pt_entry_t *l3p, l3;
4408
4409 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4410 for (; sva < eva; sva = va_next) {
4411 l0 = pmap_l0(pmap, sva);
4412 if (pmap_load(l0) == 0) {
4413 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4414 if (va_next < sva)
4415 va_next = eva;
4416 continue;
4417 }
4418
4419 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4420 if (va_next < sva)
4421 va_next = eva;
4422 l1 = pmap_l0_to_l1(l0, sva);
4423 if (pmap_load(l1) == 0)
4424 continue;
4425 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4426 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4427 KASSERT(va_next <= eva,
4428 ("partial update of non-transparent 1G page "
4429 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4430 pmap_load(l1), sva, eva, va_next));
4431 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4432 if ((pmap_load(l1) & mask) != nbits) {
4433 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4434 if (invalidate)
4435 pmap_s1_invalidate_page(pmap, sva, true);
4436 }
4437 continue;
4438 }
4439
4440 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4441 if (va_next < sva)
4442 va_next = eva;
4443
4444 l2 = pmap_l1_to_l2(l1, sva);
4445 if (pmap_load(l2) == 0)
4446 continue;
4447
4448 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4449 if (sva + L2_SIZE == va_next && eva >= va_next) {
4450 pmap_protect_l2(pmap, l2, sva, mask, nbits);
4451 continue;
4452 } else if ((pmap_load(l2) & mask) == nbits ||
4453 pmap_demote_l2(pmap, l2, sva) == NULL)
4454 continue;
4455 }
4456 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4457 ("pmap_protect: Invalid L2 entry after demotion"));
4458
4459 if (va_next > eva)
4460 va_next = eva;
4461
4462 va = va_next;
4463 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4464 sva += L3_SIZE) {
4465 l3 = pmap_load(l3p);
4466
4467 /*
4468 * Go to the next L3 entry if the current one is
4469 * invalid or already has the desired access
4470 * restrictions in place. (The latter case occurs
4471 * frequently. For example, in a "buildworld"
4472 * workload, almost 1 out of 4 L3 entries already
4473 * have the desired restrictions.)
4474 */
4475 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4476 if (va != va_next) {
4477 if (invalidate)
4478 pmap_s1_invalidate_range(pmap,
4479 va, sva, true);
4480 va = va_next;
4481 }
4482 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4483 /*
4484 * Does this L3C page extend beyond
4485 * the requested range? Handle the
4486 * possibility that "va_next" is zero.
4487 */
4488 if ((sva | L3C_OFFSET) > va_next - 1)
4489 break;
4490
4491 /*
4492 * Skip ahead to the last L3_PAGE
4493 * within this L3C page.
4494 */
4495 l3p = (pt_entry_t *)((uintptr_t)l3p |
4496 ((L3C_ENTRIES - 1) *
4497 sizeof(pt_entry_t)));
4498 sva |= L3C_SIZE - L3_SIZE;
4499 }
4500 continue;
4501 }
4502
4503 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4504 /*
4505 * Is this entire set of contiguous L3 entries
4506 * being protected? Handle the possibility
4507 * that "va_next" is zero because of address
4508 * wraparound.
4509 */
4510 if ((sva & L3C_OFFSET) == 0 &&
4511 sva + L3C_OFFSET <= va_next - 1) {
4512 pmap_mask_set_l3c(pmap, l3p, sva, &va,
4513 va_next, mask, nbits);
4514 l3p += L3C_ENTRIES - 1;
4515 sva += L3C_SIZE - L3_SIZE;
4516 continue;
4517 }
4518
4519 (void)pmap_demote_l3c(pmap, l3p, sva);
4520
4521 /*
4522 * The L3 entry's accessed bit may have changed.
4523 */
4524 l3 = pmap_load(l3p);
4525 }
4526 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4527 nbits))
4528 cpu_spinwait();
4529
4530 /*
4531 * When a dirty read/write mapping is write protected,
4532 * update the page's dirty field.
4533 */
4534 if ((l3 & ATTR_SW_MANAGED) != 0 &&
4535 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4536 pmap_pte_dirty(pmap, l3))
4537 vm_page_dirty(PTE_TO_VM_PAGE(l3));
4538
4539 if (va == va_next)
4540 va = sva;
4541 }
4542 if (va != va_next && invalidate)
4543 pmap_s1_invalidate_range(pmap, va, sva, true);
4544 }
4545 }
4546
4547 static void
4548 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4549 pt_entry_t nbits, bool invalidate)
4550 {
4551 PMAP_LOCK(pmap);
4552 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4553 PMAP_UNLOCK(pmap);
4554 }
4555
4556 /*
4557 * Set the physical protection on the
4558 * specified range of this map as requested.
4559 */
4560 void
4561 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4562 {
4563 pt_entry_t mask, nbits;
4564
4565 PMAP_ASSERT_STAGE1(pmap);
4566 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4567 if (prot == VM_PROT_NONE) {
4568 pmap_remove(pmap, sva, eva);
4569 return;
4570 }
4571
4572 mask = nbits = 0;
4573 if ((prot & VM_PROT_WRITE) == 0) {
4574 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4575 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4576 }
4577 if ((prot & VM_PROT_EXECUTE) == 0) {
4578 mask |= ATTR_S1_XN;
4579 nbits |= ATTR_S1_XN;
4580 }
4581 if (pmap == kernel_pmap) {
4582 mask |= ATTR_KERN_GP;
4583 nbits |= ATTR_KERN_GP;
4584 }
4585 if (mask == 0)
4586 return;
4587
4588 pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4589 }
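/*
 * As a concrete example of the mask/nbits encoding above: a call such as
 * pmap_protect(pmap, sva, eva, VM_PROT_READ) on a user pmap builds
 *
 *	mask  = ATTR_S1_AP_RW_BIT | ATTR_SW_DBM | ATTR_S1_XN;
 *	nbits = ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_S1_XN;
 *
 * so every affected PTE is rewritten as (pte & ~mask) | nbits, clearing the
 * software dirty-bit modifier and making the mapping read-only and
 * non-executable in a single pass.
 */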
4590
4591 void
4592 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4593 {
4594
4595 MPASS((sva & L3_OFFSET) == 0);
4596 MPASS(((sva + size) & L3_OFFSET) == 0);
4597
4598 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4599 ATTR_SW_NO_PROMOTE, false);
4600 }
4601
4602 /*
4603 * Inserts the specified page table page into the specified pmap's collection
4604 * of idle page table pages. Each of a pmap's page table pages is responsible
4605 * for mapping a distinct range of virtual addresses. The pmap's collection is
4606 * ordered by this virtual address range.
4607 *
4608 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4609 * "mpte"'s valid field will be set to 0.
4610 *
4611 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4612 * contain valid mappings with identical attributes except for ATTR_AF;
4613 * "mpte"'s valid field will be set to 1.
4614 *
4615 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4616 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4617 * field will be set to VM_PAGE_BITS_ALL.
4618 */
4619 static __inline int
4620 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4621 bool all_l3e_AF_set)
4622 {
4623
4624 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4625 KASSERT(promoted || !all_l3e_AF_set,
4626 ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4627 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4628 return (vm_radix_insert(&pmap->pm_root, mpte));
4629 }
4630
4631 /*
4632 * Removes the page table page mapping the specified virtual address from the
4633 * specified pmap's collection of idle page table pages, and returns it.
4634 * Otherwise, returns NULL if there is no page table page corresponding to the
4635 * specified virtual address.
4636 */
4637 static __inline vm_page_t
4638 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4639 {
4640
4641 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4642 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4643 }
4644
4645 /*
4646 * Performs a break-before-make update of a pmap entry. This is needed when
4647 * either promoting or demoting pages to ensure the TLB doesn't get into an
4648 * inconsistent state.
4649 */
4650 static void
4651 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4652 vm_offset_t va, vm_size_t size)
4653 {
4654 register_t intr;
4655
4656 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4657 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4658 ("%s: Updating non-promote pte", __func__));
4659
4660 /*
4661 * Ensure we don't get switched out with the page table in an
4662 * inconsistent state. We also need to ensure no interrupts fire
4663 * as they may make use of an address we are about to invalidate.
4664 */
4665 intr = intr_disable();
4666
4667 /*
4668 * Clear the old mapping's valid bit, but leave the rest of the entry
4669 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4670 * lookup the physical address.
4671 */
4672 pmap_clear_bits(ptep, ATTR_DESCR_VALID);
4673
4674 /*
4675 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4676 * be cached, so we invalidate intermediate entries as well as final
4677 * entries.
4678 */
4679 pmap_s1_invalidate_range(pmap, va, va + size, false);
4680
4681 /* Create the new mapping */
4682 pmap_store(ptep, newpte);
4683 dsb(ishst);
4684
4685 intr_restore(intr);
4686 }
4687
4688 /*
4689 * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping.
4690 */
4691 static void __nosanitizecoverage
4692 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end,
4693 pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size)
4694 {
4695 pd_entry_t *lip;
4696 register_t intr;
4697
4698 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4699 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4700 ("%s: Updating non-promote pte", __func__));
4701
4702 /*
4703 * Ensure we don't get switched out with the page table in an
4704 * inconsistent state. We also need to ensure no interrupts fire
4705 * as they may make use of an address we are about to invalidate.
4706 */
4707 intr = intr_disable();
4708
4709 /*
4710 * Clear the old mapping's valid bits, but leave the rest of each
4711 * entry unchanged, so that a lockless, concurrent pmap_kextract() can
4712 * still lookup the physical address.
4713 */
4714 for (lip = ptep; lip < ptep_end; lip++)
4715 pmap_clear_bits(lip, ATTR_DESCR_VALID);
4716
4717 /* Only final entries are changing. */
4718 pmap_s1_invalidate_strided(pmap, va, va + size, stride, true);
4719
4720 /* Create the new mapping. */
4721 for (lip = ptep; lip < ptep_end; lip++) {
4722 pmap_store(lip, newpte);
4723 newpte += stride;
4724 }
4725 dsb(ishst);
4726
4727 intr_restore(intr);
4728 }
4729
4730 #if VM_NRESERVLEVEL > 0
4731 /*
4732 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4733 * replace the many pv entries for the 4KB page mappings by a single pv entry
4734 * for the 2MB page mapping.
4735 */
4736 static void
4737 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4738 struct rwlock **lockp)
4739 {
4740 struct md_page *pvh;
4741 pv_entry_t pv;
4742 vm_offset_t va_last;
4743 vm_page_t m;
4744
4745 KASSERT((pa & L2_OFFSET) == 0,
4746 ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4747 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4748
4749 /*
4750 * Transfer the first page's pv entry for this mapping to the 2mpage's
4751 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
4752 * a transfer avoids the possibility that get_pv_entry() calls
4753 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4754 * mappings that is being promoted.
4755 */
4756 m = PHYS_TO_VM_PAGE(pa);
4757 va = va & ~L2_OFFSET;
4758 pv = pmap_pvh_remove(&m->md, pmap, va);
4759 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4760 pvh = page_to_pvh(m);
4761 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4762 pvh->pv_gen++;
4763 /* Free the remaining NPTEPG - 1 pv entries. */
4764 va_last = va + L2_SIZE - PAGE_SIZE;
4765 do {
4766 m++;
4767 va += PAGE_SIZE;
4768 pmap_pvh_free(&m->md, pmap, va);
4769 } while (va < va_last);
4770 }
4771
4772 /*
4773 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4774 * single level 2 table entry to a single 2MB page mapping. For promotion
4775 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4776 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4777 * identical characteristics.
4778 */
4779 static bool
4780 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4781 struct rwlock **lockp)
4782 {
4783 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4784
4785 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4786
4787 /*
4788 * Currently, this function only supports promotion on stage 1 pmaps
4789 * because it tests stage 1 specific fields and performs a break-
4790 * before-make sequence that is incorrect for stage 2 pmaps.
4791 */
4792 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4793 return (false);
4794
4795 /*
4796 * Examine the first L3E in the specified PTP. Abort if this L3E is
4797 * ineligible for promotion...
4798 */
4799 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
4800 newl2 = pmap_load(firstl3);
4801 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4802 return (false);
4803 /* ... is not the first physical page within an L2 block */
4804 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
4805 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
4806 atomic_add_long(&pmap_l2_p_failures, 1);
4807 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4808 " in pmap %p", va, pmap);
4809 return (false);
4810 }
4811
4812 /*
4813 * Both here and in the below "for" loop, to allow for repromotion
4814 * after MADV_FREE, conditionally write protect a clean L3E before
4815 * possibly aborting the promotion due to other L3E attributes. Why?
4816 * Suppose that MADV_FREE is applied to a part of a superpage, the
4817 * address range [S, E). pmap_advise() will demote the superpage
4818 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4819 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later,
4820 * imagine that the memory in [S, E) is recycled, but the last 4KB
4821 * page in [S, E) is not the last to be rewritten, or simply accessed.
4822 * In other words, there is still a 4KB page in [S, E), call it P,
4823 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4824 * Unless we write protect P before aborting the promotion, if and
4825 * when P is finally rewritten, there won't be a page fault to trigger
4826 * repromotion.
4827 */
4828 setl2:
4829 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4830 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4831 /*
4832 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4833 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4834 */
4835 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4836 goto setl2;
4837 newl2 &= ~ATTR_SW_DBM;
4838 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
4839 " in pmap %p", va & ~L2_OFFSET, pmap);
4840 }
4841
4842 /*
4843 * Examine each of the other L3Es in the specified PTP. Abort if this
4844 * L3E maps an unexpected 4KB physical page or does not have identical
4845 * characteristics to the first L3E. If ATTR_AF is not set in every
4846 * PTE, then request that the PTP be refilled on demotion.
4847 */
4848 all_l3e_AF = newl2 & ATTR_AF;
4849 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
4850 + L2_SIZE - PAGE_SIZE;
4851 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4852 oldl3 = pmap_load(l3);
4853 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4854 atomic_add_long(&pmap_l2_p_failures, 1);
4855 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4856 " in pmap %p", va, pmap);
4857 return (false);
4858 }
4859 setl3:
4860 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4861 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4862 /*
4863 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4864 * set, ATTR_SW_DBM can be cleared without a TLB
4865 * invalidation.
4866 */
4867 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4868 ~ATTR_SW_DBM))
4869 goto setl3;
4870 oldl3 &= ~ATTR_SW_DBM;
4871 }
4872 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
4873 atomic_add_long(&pmap_l2_p_failures, 1);
4874 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4875 " in pmap %p", va, pmap);
4876 return (false);
4877 }
4878 all_l3e_AF &= oldl3;
4879 pa -= PAGE_SIZE;
4880 }
4881
4882 /*
4883 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4884 * mapping, so that promotions triggered by speculative mappings,
4885 * such as pmap_enter_quick(), don't automatically mark the
4886 * underlying pages as referenced.
4887 */
4888 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
4889
4890 /*
4891 * Save the page table page in its current state until the L2
4892 * mapping the superpage is demoted by pmap_demote_l2() or
4893 * destroyed by pmap_remove_l3().
4894 */
4895 if (mpte == NULL)
4896 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
4897 KASSERT(mpte >= vm_page_array &&
4898 mpte < &vm_page_array[vm_page_array_size],
4899 ("pmap_promote_l2: page table page is out of range"));
4900 KASSERT(mpte->pindex == pmap_l2_pindex(va),
4901 ("pmap_promote_l2: page table page's pindex is wrong"));
4902 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
4903 atomic_add_long(&pmap_l2_p_failures, 1);
4904 CTR2(KTR_PMAP,
4905 "pmap_promote_l2: failure for va %#lx in pmap %p", va,
4906 pmap);
4907 return (false);
4908 }
4909
4910 if ((newl2 & ATTR_SW_MANAGED) != 0)
4911 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
4912
4913 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
4914
4915 atomic_add_long(&pmap_l2_promotions, 1);
4916 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
4917 pmap);
4918 return (true);
4919 }
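/*
 * To make the contiguity check above concrete: "pa" starts as the physical
 * address (plus descriptor bits) expected in the PTP's last L3E and is
 * stepped down by PAGE_SIZE as the loop walks from the last entry back
 * toward firstl3, so promotion only succeeds when the 512 L3Es map 512
 * consecutive 4KB frames, the first of which the earlier check required to
 * be 2MB-aligned, and, modulo the ATTR_AF/ATTR_SW_DBM handling noted above,
 * all entries share identical attributes.
 */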
4920
4921 /*
4922 * Tries to promote an aligned, contiguous set of base page mappings to a
4923 * single L3C page mapping. For promotion to occur, two conditions must be
4924 * met: (1) the base page mappings must map aligned, contiguous physical
4925 * memory and (2) the base page mappings must have identical characteristics
4926 * except for the accessed flag.
4927 */
4928 static bool
4929 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
4930 {
4931 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
4932
4933 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4934
4935 /*
4936 * Currently, this function only supports promotion on stage 1 pmaps
4937 * because it tests stage 1 specific fields and performs a break-
4938 * before-make sequence that is incorrect for stage 2 pmaps.
4939 */
4940 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4941 return (false);
4942
4943 /*
4944 * Compute the address of the first L3 entry in the superpage
4945 * candidate.
4946 */
4947 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
4948 sizeof(pt_entry_t)) - 1));
4949
4950 firstl3c = pmap_load(l3p);
4951
4952 /*
4953 * Examine the first L3 entry. Abort if this L3E is ineligible for
4954 * promotion...
4955 */
4956 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
4957 return (false);
4958 /* ...is not properly aligned... */
4959 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
4960 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
4961 counter_u64_add(pmap_l3c_p_failures, 1);
4962 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4963 " in pmap %p", va, pmap);
4964 return (false);
4965 }
4966
4967 /*
4968 * If the first L3 entry is a clean read-write mapping, convert it
4969 * to a read-only mapping. See pmap_promote_l2() for the rationale.
4970 */
4971 set_first:
4972 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4973 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4974 /*
4975 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4976 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4977 */
4978 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
4979 goto set_first;
4980 firstl3c &= ~ATTR_SW_DBM;
4981 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4982 " in pmap %p", va & ~L3C_OFFSET, pmap);
4983 }
4984
4985 /*
4986 * Check that the rest of the L3 entries are compatible with the first,
4987 * and convert clean read-write mappings to read-only mappings.
4988 */
4989 all_l3e_AF = firstl3c & ATTR_AF;
4990 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
4991 L3C_SIZE - PAGE_SIZE;
4992 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
4993 oldl3 = pmap_load(l3);
4994 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4995 counter_u64_add(pmap_l3c_p_failures, 1);
4996 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4997 " in pmap %p", va, pmap);
4998 return (false);
4999 }
5000 set_l3:
5001 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5002 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5003 /*
5004 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
5005 * set, ATTR_SW_DBM can be cleared without a TLB
5006 * invalidation.
5007 */
5008 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
5009 ~ATTR_SW_DBM))
5010 goto set_l3;
5011 oldl3 &= ~ATTR_SW_DBM;
5012 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
5013 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
5014 (va & ~L3C_OFFSET), pmap);
5015 }
5016 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
5017 counter_u64_add(pmap_l3c_p_failures, 1);
5018 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5019 " in pmap %p", va, pmap);
5020 return (false);
5021 }
5022 all_l3e_AF &= oldl3;
5023 pa -= PAGE_SIZE;
5024 }
5025
5026 /*
5027 * Unless all PTEs have ATTR_AF set, clear it from the superpage
5028 * mapping, so that promotions triggered by speculative mappings,
5029 * such as pmap_enter_quick(), don't automatically mark the
5030 * underlying pages as referenced.
5031 */
5032 firstl3c &= ~ATTR_AF | all_l3e_AF;
5033
5034 /*
5035 * Remake the mappings with the contiguous bit set.
5036 */
5037 pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c |
5038 ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE);
5039
5040 counter_u64_add(pmap_l3c_promotions, 1);
5041 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
5042 pmap);
5043 return (true);
5044 }
5045 #endif /* VM_NRESERVLEVEL > 0 */
5046
5047 static int
5048 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
5049 int psind)
5050 {
5051 pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p;
5052 vm_page_t mp;
5053
5054 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5055 KASSERT(psind > 0 && psind < MAXPAGESIZES,
5056 ("psind %d unexpected", psind));
5057 KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0,
5058 ("unaligned phys address %#lx pte %#lx psind %d",
5059 PTE_TO_PHYS(pte), pte, psind));
5060
5061 restart:
5062 newpte = pte;
5063 if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
5064 return (KERN_PROTECTION_FAILURE);
5065 if (psind == 3) {
5066 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5067
5068 KASSERT(pagesizes[psind] == L1_SIZE,
5069 ("pagesizes[%d] != L1_SIZE", psind));
5070 l0p = pmap_l0(pmap, va);
5071 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
5072 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
5073 if (mp == NULL) {
5074 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5075 return (KERN_RESOURCE_SHORTAGE);
5076 PMAP_UNLOCK(pmap);
5077 vm_wait(NULL);
5078 PMAP_LOCK(pmap);
5079 goto restart;
5080 }
5081 l1p = pmap_l0_to_l1(l0p, va);
5082 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5083 origpte = pmap_load(l1p);
5084 } else {
5085 l1p = pmap_l0_to_l1(l0p, va);
5086 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5087 origpte = pmap_load(l1p);
5088 if ((origpte & ATTR_DESCR_VALID) == 0) {
5089 mp = PTE_TO_VM_PAGE(pmap_load(l0p));
5090 mp->ref_count++;
5091 }
5092 }
5093 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
5094 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
5095 (origpte & ATTR_DESCR_VALID) == 0,
5096 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
5097 va, origpte, newpte));
5098 pmap_store(l1p, newpte);
5099 } else if (psind == 2) {
5100 KASSERT(pagesizes[psind] == L2_SIZE,
5101 ("pagesizes[%d] != L2_SIZE", psind));
5102 l2p = pmap_l2(pmap, va);
5103 if (l2p == NULL) {
5104 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
5105 if (mp == NULL) {
5106 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5107 return (KERN_RESOURCE_SHORTAGE);
5108 PMAP_UNLOCK(pmap);
5109 vm_wait(NULL);
5110 PMAP_LOCK(pmap);
5111 goto restart;
5112 }
5113 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5114 l2p = &l2p[pmap_l2_index(va)];
5115 origpte = pmap_load(l2p);
5116 } else {
5117 l1p = pmap_l1(pmap, va);
5118 origpte = pmap_load(l2p);
5119 if ((origpte & ATTR_DESCR_VALID) == 0) {
5120 mp = PTE_TO_VM_PAGE(pmap_load(l1p));
5121 mp->ref_count++;
5122 }
5123 }
5124 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5125 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
5126 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5127 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
5128 va, origpte, newpte));
5129 pmap_store(l2p, newpte);
5130 } else /* (psind == 1) */ {
5131 KASSERT(pagesizes[psind] == L3C_SIZE,
5132 ("pagesizes[%d] != L3C_SIZE", psind));
5133 l2p = pmap_l2(pmap, va);
5134 if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) {
5135 mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
5136 if (mp == NULL) {
5137 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5138 return (KERN_RESOURCE_SHORTAGE);
5139 PMAP_UNLOCK(pmap);
5140 vm_wait(NULL);
5141 PMAP_LOCK(pmap);
5142 goto restart;
5143 }
5144 mp->ref_count += L3C_ENTRIES - 1;
5145 l3p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5146 l3p = &l3p[pmap_l3_index(va)];
5147 } else {
5148 l3p = pmap_l2_to_l3(l2p, va);
5149 if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) {
5150 mp = PTE_TO_VM_PAGE(pmap_load(l2p));
5151 mp->ref_count += L3C_ENTRIES;
5152 }
5153 }
5154 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5155 origpte = pmap_load(tl3p);
5156 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5157 ((origpte & ATTR_CONTIGUOUS) != 0 &&
5158 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5159 ("va %#lx changing 64K phys page l3 %#lx newpte %#lx",
5160 va, origpte, newpte));
5161 pmap_store(tl3p, newpte);
5162 newpte += L3_SIZE;
5163 }
5164 }
5165 dsb(ishst);
5166
5167 if ((origpte & ATTR_DESCR_VALID) == 0)
5168 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
5169 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
5170 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
5171 else if ((newpte & ATTR_SW_WIRED) == 0 &&
5172 (origpte & ATTR_SW_WIRED) != 0)
5173 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
5174
5175 return (KERN_SUCCESS);
5176 }
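/*
 * Worked example (illustrative only, assuming the usual 4KB base page
 * size): the "psind" values handled in the largepage cases above select
 * the block level and hence the number of base pages charged to the
 * resident and wired counters:
 *
 *	psind 3 -> L1 block, pagesizes[3] == L1_SIZE  (1GB):  262144 pages
 *	psind 2 -> L2 block, pagesizes[2] == L2_SIZE  (2MB):     512 pages
 *	psind 1 -> L3C run,  pagesizes[1] == L3C_SIZE (64KB):     16 pages
 *
 * so, for example, a wired 2MB largepage adds L2_SIZE / PAGE_SIZE == 512
 * to pm_stats.wired_count.
 */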
5177
5178 /*
5179 * Insert the given physical page (p) at
5180 * the specified virtual address (v) in the
5181 * target physical map with the protection requested.
5182 *
5183 * If specified, the page will be wired down, meaning
5184 * that the related pte can not be reclaimed.
5185 *
5186 * NB: This is the only routine which MAY NOT lazy-evaluate
5187 * or lose information. That is, this routine must actually
5188 * insert this page into the given map NOW.
5189 */
5190 int
5191 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5192 u_int flags, int8_t psind)
5193 {
5194 struct rwlock *lock;
5195 pd_entry_t *pde;
5196 pt_entry_t new_l3, orig_l3;
5197 pt_entry_t *l2, *l3;
5198 pv_entry_t pv;
5199 vm_paddr_t opa, pa;
5200 vm_page_t mpte, om;
5201 bool nosleep;
5202 int full_lvl, lvl, rv;
5203
5204 KASSERT(ADDR_IS_CANONICAL(va),
5205 ("%s: Address not in canonical form: %lx", __func__, va));
5206
5207 va = trunc_page(va);
5208 if ((m->oflags & VPO_UNMANAGED) == 0)
5209 VM_PAGE_OBJECT_BUSY_ASSERT(m);
5210 pa = VM_PAGE_TO_PHYS(m);
5211 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
5212 L3_PAGE);
5213 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5214 new_l3 |= pmap_pte_prot(pmap, prot);
5215 if ((flags & PMAP_ENTER_WIRED) != 0)
5216 new_l3 |= ATTR_SW_WIRED;
5217 if (pmap->pm_stage == PM_STAGE1) {
5218 if (!ADDR_IS_KERNEL(va))
5219 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5220 else
5221 new_l3 |= ATTR_S1_UXN;
5222 if (pmap != kernel_pmap)
5223 new_l3 |= ATTR_S1_nG;
5224 } else {
5225 /*
5226 * Clear the access flag on executable mappings; it will be
5227 * set later when the page is accessed. The fault handler is
5228 * required to invalidate the I-cache.
5229 *
5230 * TODO: Switch to the valid flag to allow hardware management
5231 * of the access flag. Much of the pmap code assumes the
5232 * valid flag is set and fails to destroy the old page tables
5233 * correctly if it is clear.
5234 */
5235 if (prot & VM_PROT_EXECUTE)
5236 new_l3 &= ~ATTR_AF;
5237 }
5238 if ((m->oflags & VPO_UNMANAGED) == 0) {
5239 new_l3 |= ATTR_SW_MANAGED;
5240 if ((prot & VM_PROT_WRITE) != 0) {
5241 new_l3 |= ATTR_SW_DBM;
5242 if ((flags & VM_PROT_WRITE) == 0) {
5243 if (pmap->pm_stage == PM_STAGE1)
5244 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5245 else
5246 new_l3 &=
5247 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5248 }
5249 }
5250 }
5251
5252 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5253
5254 lock = NULL;
5255 PMAP_LOCK(pmap);
5256 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5257 KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5258 ("managed largepage va %#lx flags %#x", va, flags));
5259 if (psind == 3) {
5260 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5261 new_l3 &= ~L3_PAGE;
5262 new_l3 |= L1_BLOCK;
5263 } else if (psind == 2) {
5264 new_l3 &= ~L3_PAGE;
5265 new_l3 |= L2_BLOCK;
5266 } else /* (psind == 1) */
5267 new_l3 |= ATTR_CONTIGUOUS;
5268 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5269 goto out;
5270 }
5271 if (psind == 2) {
5272 /* Assert the required virtual and physical alignment. */
5273 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5274 KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind"));
5275 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5276 flags, m, &lock);
5277 goto out;
5278 }
5279 mpte = NULL;
5280 if (psind == 1) {
5281 KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned"));
5282 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5283 rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags,
5284 m, &mpte, &lock);
5285 #if VM_NRESERVLEVEL > 0
5286 /*
5287 * Attempt L2 promotion, if both the PTP and a level 1
5288 * reservation are fully populated.
5289 */
5290 if (rv == KERN_SUCCESS &&
5291 (mpte == NULL || mpte->ref_count == NL3PG) &&
5292 (m->flags & PG_FICTITIOUS) == 0 &&
5293 vm_reserv_level_iffullpop(m) == 1) {
5294 pde = pmap_l2(pmap, va);
5295 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5296 }
5297 #endif
5298 goto out;
5299 }
5300
5301 /*
5302 * In the case that a page table page is not
5303 * resident, we are creating it here.
5304 */
5305 retry:
5306 pde = pmap_pde(pmap, va, &lvl);
5307 if (pde != NULL && lvl == 2) {
5308 l3 = pmap_l2_to_l3(pde, va);
5309 if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
5310 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5311 mpte->ref_count++;
5312 }
5313 goto havel3;
5314 } else if (pde != NULL && lvl == 1) {
5315 l2 = pmap_l1_to_l2(pde, va);
5316 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5317 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5318 l3 = &l3[pmap_l3_index(va)];
5319 if (!ADDR_IS_KERNEL(va)) {
5320 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5321 mpte->ref_count++;
5322 }
5323 goto havel3;
5324 }
5325 /* We need to allocate an L3 table. */
5326 }
5327 if (!ADDR_IS_KERNEL(va)) {
5328 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5329
5330 /*
5331 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5332 * to handle the possibility that a superpage mapping for "va"
5333 * was created while we slept.
5334 */
5335 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5336 nosleep ? NULL : &lock);
5337 if (mpte == NULL && nosleep) {
5338 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5339 rv = KERN_RESOURCE_SHORTAGE;
5340 goto out;
5341 }
5342 goto retry;
5343 } else
5344 panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5345
5346 havel3:
5347 orig_l3 = pmap_load(l3);
5348 opa = PTE_TO_PHYS(orig_l3);
5349 pv = NULL;
5350 new_l3 |= pmap_pte_bti(pmap, va);
5351
5352 /*
5353 * Is the specified virtual address already mapped?
5354 */
5355 if (pmap_l3_valid(orig_l3)) {
5356 /*
5357 * Wiring change, just update stats. We don't worry about
5358 * wiring PT pages as they remain resident as long as there
5359 * are valid mappings in them. Hence, if a user page is wired,
5360 * the PT page will be also.
5361 */
5362 if ((flags & PMAP_ENTER_WIRED) != 0 &&
5363 (orig_l3 & ATTR_SW_WIRED) == 0)
5364 pmap->pm_stats.wired_count++;
5365 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5366 (orig_l3 & ATTR_SW_WIRED) != 0)
5367 pmap->pm_stats.wired_count--;
5368
5369 /*
5370 * Remove the extra PT page reference.
5371 */
5372 if (mpte != NULL) {
5373 mpte->ref_count--;
5374 KASSERT(mpte->ref_count > 0,
5375 ("pmap_enter: missing reference to page table page,"
5376 " va: 0x%lx", va));
5377 }
5378
5379 /*
5380 * Has the physical page changed?
5381 */
5382 if (opa == pa) {
5383 /*
5384 * No, might be a protection or wiring change.
5385 */
5386 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5387 (new_l3 & ATTR_SW_DBM) != 0)
5388 vm_page_aflag_set(m, PGA_WRITEABLE);
5389 goto validate;
5390 }
5391
5392 /*
5393 * The physical page has changed. Temporarily invalidate
5394 * the mapping.
5395 */
5396 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5397 (void)pmap_demote_l3c(pmap, l3, va);
5398 orig_l3 = pmap_load_clear(l3);
5399 KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5400 ("pmap_enter: unexpected pa update for %#lx", va));
5401 if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5402 om = PHYS_TO_VM_PAGE(opa);
5403
5404 /*
5405 * The pmap lock is sufficient to synchronize with
5406 * concurrent calls to pmap_page_test_mappings() and
5407 * pmap_ts_referenced().
5408 */
5409 if (pmap_pte_dirty(pmap, orig_l3))
5410 vm_page_dirty(om);
5411 if ((orig_l3 & ATTR_AF) != 0) {
5412 pmap_invalidate_page(pmap, va, true);
5413 vm_page_aflag_set(om, PGA_REFERENCED);
5414 }
5415 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
5416 pv = pmap_pvh_remove(&om->md, pmap, va);
5417 if ((m->oflags & VPO_UNMANAGED) != 0)
5418 free_pv_entry(pmap, pv);
5419 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5420 TAILQ_EMPTY(&om->md.pv_list) &&
5421 ((om->flags & PG_FICTITIOUS) != 0 ||
5422 TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5423 vm_page_aflag_clear(om, PGA_WRITEABLE);
5424 } else {
5425 KASSERT((orig_l3 & ATTR_AF) != 0,
5426 ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5427 pmap_invalidate_page(pmap, va, true);
5428 }
5429 orig_l3 = 0;
5430 } else {
5431 /*
5432 * Increment the counters.
5433 */
5434 if ((new_l3 & ATTR_SW_WIRED) != 0)
5435 pmap->pm_stats.wired_count++;
5436 pmap_resident_count_inc(pmap, 1);
5437 }
5438 /*
5439 * Enter on the PV list if part of our managed memory.
5440 */
5441 if ((m->oflags & VPO_UNMANAGED) == 0) {
5442 if (pv == NULL) {
5443 pv = get_pv_entry(pmap, &lock);
5444 pv->pv_va = va;
5445 }
5446 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5447 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5448 m->md.pv_gen++;
5449 if ((new_l3 & ATTR_SW_DBM) != 0)
5450 vm_page_aflag_set(m, PGA_WRITEABLE);
5451 }
5452
5453 validate:
5454 if (pmap->pm_stage == PM_STAGE1) {
5455 /*
5456 * Sync the icache if the mapping has execute permission and the
5457 * VM_MEMATTR_WRITE_BACK attribute is set. Do it now, before the
5458 * mapping is stored and made valid for the hardware table walk.
5459 * If it were done later, other CPUs could access this page before
5460 * the caches are properly synced. Don't do it for kernel memory,
5461 * which is mapped with exec permission even if the memory isn't
5462 * going to hold executable code. The only time an icache sync is
5463 * needed for kernel memory is after a kernel module is loaded and
5464 * its relocation info is processed, which is done in elf_cpu_load_file().
5465 */
5466 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5467 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5468 (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5469 PMAP_ASSERT_STAGE1(pmap);
5470 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
5471 PAGE_SIZE);
5472 }
5473 } else {
5474 cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5475 }
5476
5477 /*
5478 * Update the L3 entry
5479 */
5480 if (pmap_l3_valid(orig_l3)) {
5481 KASSERT(opa == pa, ("pmap_enter: invalid update"));
5482 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5483 /* same PA, different attributes */
5484 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5485 (void)pmap_demote_l3c(pmap, l3, va);
5486 orig_l3 = pmap_load_store(l3, new_l3);
5487 pmap_invalidate_page(pmap, va, true);
5488 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5489 pmap_pte_dirty(pmap, orig_l3))
5490 vm_page_dirty(m);
5491 } else {
5492 /*
5493 * orig_l3 == new_l3
5494 * This can happen if multiple threads simultaneously
5495 * access a not-yet-mapped page. It is bad for performance
5496 * since it can cause a full demotion-NOP-promotion
5497 * cycle.
5498 * Other possible reasons are:
5499 * - the VM and pmap memory layouts have diverged
5500 * - a TLB flush is missing somewhere and the CPU doesn't
5501 * see the actual mapping.
5502 */
5503 CTR4(KTR_PMAP, "%s: already mapped page - "
5504 "pmap %p va 0x%#lx pte 0x%lx",
5505 __func__, pmap, va, new_l3);
5506 }
5507 } else {
5508 /* New mapping */
5509 pmap_store(l3, new_l3);
5510 dsb(ishst);
5511 }
5512
5513 #if VM_NRESERVLEVEL > 0
5514 /*
5515 * First, attempt L3C promotion, if the virtual and physical addresses
5516 * are aligned with each other and an underlying reservation has the
5517 * neighboring L3 pages allocated. The first condition is simply an
5518 * optimization that recognizes some eventual promotion failures early
5519 * at a lower run-time cost. Then, if both a level 1 reservation and
5520 * the PTP are fully populated, attempt L2 promotion.
5521 */
5522 if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5523 (m->flags & PG_FICTITIOUS) == 0 &&
5524 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
5525 pmap_promote_l3c(pmap, l3, va) &&
5526 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG))
5527 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5528 #endif
5529
5530 rv = KERN_SUCCESS;
5531 out:
5532 if (lock != NULL)
5533 rw_wunlock(lock);
5534 PMAP_UNLOCK(pmap);
5535 return (rv);
5536 }
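/*
 * Illustrative sketch (hypothetical caller, not part of the build): a
 * machine-independent caller might install a single wired, writable 4KB
 * mapping and retry on a transient resource shortage.  The helper name
 * and the retry policy below are assumptions for illustration only.
 *
 *	static int
 *	example_enter_wired(pmap_t pmap, vm_offset_t va, vm_page_t m)
 *	{
 *		int rv;
 *
 *		do {
 *			rv = pmap_enter(pmap, va, m,
 *			    VM_PROT_READ | VM_PROT_WRITE,
 *			    VM_PROT_READ | VM_PROT_WRITE | PMAP_ENTER_WIRED |
 *			    PMAP_ENTER_NOSLEEP, 0);
 *			if (rv == KERN_RESOURCE_SHORTAGE)
 *				vm_wait(NULL);
 *		} while (rv == KERN_RESOURCE_SHORTAGE);
 *		return (rv);
 *	}
 */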
5537
5538 /*
5539 * Tries to create a read- and/or execute-only L2 page mapping. Returns
5540 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5541 * value. See pmap_enter_l2() for the possible error values when "no sleep",
5542 * "no replace", and "no reclaim" are specified.
5543 */
5544 static int
5545 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5546 struct rwlock **lockp)
5547 {
5548 pd_entry_t new_l2;
5549
5550 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5551 PMAP_ASSERT_STAGE1(pmap);
5552 KASSERT(ADDR_IS_CANONICAL(va),
5553 ("%s: Address not in canonical form: %lx", __func__, va));
5554
5555 new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | pmap_sh_attr |
5556 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5557 L2_BLOCK);
5558 if ((m->oflags & VPO_UNMANAGED) == 0)
5559 new_l2 |= ATTR_SW_MANAGED;
5560 else
5561 new_l2 |= ATTR_AF;
5562 if ((prot & VM_PROT_EXECUTE) == 0 ||
5563 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5564 new_l2 |= ATTR_S1_XN;
5565 if (!ADDR_IS_KERNEL(va))
5566 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5567 else
5568 new_l2 |= ATTR_S1_UXN;
5569 if (pmap != kernel_pmap)
5570 new_l2 |= ATTR_S1_nG;
5571 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5572 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5573 }
5574
5575 /*
5576 * Returns true if every page table entry in the specified page table is
5577 * zero.
5578 */
5579 static bool
5580 pmap_every_pte_zero(vm_paddr_t pa)
5581 {
5582 pt_entry_t *pt_end, *pte;
5583
5584 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5585 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
5586 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5587 if (*pte != 0)
5588 return (false);
5589 }
5590 return (true);
5591 }
5592
5593 /*
5594 * Tries to create the specified L2 page mapping. Returns KERN_SUCCESS if
5595 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
5596 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
5597 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
5598 * within the L2 virtual address range starting at the specified virtual
5599 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
5600 * L2 page mapping already exists at the specified virtual address. Returns
5601 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
5602 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
5603 * and a PV entry allocation failed.
5604 */
5605 static int
5606 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
5607 vm_page_t m, struct rwlock **lockp)
5608 {
5609 struct spglist free;
5610 pd_entry_t *l2, old_l2;
5611 vm_page_t l2pg, mt;
5612 vm_page_t uwptpg;
5613
5614 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5615 KASSERT(ADDR_IS_CANONICAL(va),
5616 ("%s: Address not in canonical form: %lx", __func__, va));
5617
5618 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
5619 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
5620 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
5621 va, pmap);
5622 return (KERN_RESOURCE_SHORTAGE);
5623 }
5624
5625 /*
5626 * If bti is not the same for the whole l2 range, return failure
5627 * and let vm_fault() cope. Check after l2 allocation, since
5628 * it could sleep.
5629 */
5630 if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) {
5631 KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
5632 pmap_abort_ptp(pmap, va, l2pg);
5633 return (KERN_PROTECTION_FAILURE);
5634 }
5635
5636 /*
5637 * If there are existing mappings, either abort or remove them.
5638 */
5639 if ((old_l2 = pmap_load(l2)) != 0) {
5640 KASSERT(l2pg == NULL || l2pg->ref_count > 1,
5641 ("pmap_enter_l2: l2pg's ref count is too low"));
5642 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5643 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5644 if (l2pg != NULL)
5645 l2pg->ref_count--;
5646 CTR2(KTR_PMAP,
5647 "pmap_enter_l2: no space for va %#lx"
5648 " in pmap %p", va, pmap);
5649 return (KERN_NO_SPACE);
5650 } else if (!ADDR_IS_KERNEL(va) ||
5651 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
5652 if (l2pg != NULL)
5653 l2pg->ref_count--;
5654 CTR2(KTR_PMAP,
5655 "pmap_enter_l2: failure for va %#lx"
5656 " in pmap %p", va, pmap);
5657 return (KERN_FAILURE);
5658 }
5659 }
5660 SLIST_INIT(&free);
5661 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
5662 (void)pmap_remove_l2(pmap, l2, va,
5663 pmap_load(pmap_l1(pmap, va)), &free, lockp);
5664 else
5665 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
5666 &free, lockp);
5667 if (!ADDR_IS_KERNEL(va)) {
5668 vm_page_free_pages_toq(&free, true);
5669 KASSERT(pmap_load(l2) == 0,
5670 ("pmap_enter_l2: non-zero L2 entry %p", l2));
5671 } else {
5672 KASSERT(SLIST_EMPTY(&free),
5673 ("pmap_enter_l2: freed kernel page table page"));
5674
5675 /*
5676 * Both pmap_remove_l2() and pmap_remove_l3_range()
5677 * will leave the kernel page table page zero filled.
5678 * Nonetheless, the TLB could have an intermediate
5679 * entry for the kernel page table page, so request
5680 * an invalidation at all levels after clearing
5681 * the L2_TABLE entry.
5682 */
5683 mt = PTE_TO_VM_PAGE(pmap_load(l2));
5684 if (pmap_insert_pt_page(pmap, mt, false, false))
5685 panic("pmap_enter_l2: trie insert failed");
5686 pmap_clear(l2);
5687 pmap_s1_invalidate_page(pmap, va, false);
5688 }
5689 }
5690
5691 /*
5692 * Allocate leaf ptpage for wired userspace pages.
5693 */
5694 uwptpg = NULL;
5695 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
5696 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5697 if (uwptpg == NULL) {
5698 pmap_abort_ptp(pmap, va, l2pg);
5699 return (KERN_RESOURCE_SHORTAGE);
5700 }
5701 uwptpg->pindex = pmap_l2_pindex(va);
5702 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
5703 vm_page_unwire_noq(uwptpg);
5704 vm_page_free(uwptpg);
5705 pmap_abort_ptp(pmap, va, l2pg);
5706 return (KERN_RESOURCE_SHORTAGE);
5707 }
5708 pmap_resident_count_inc(pmap, 1);
5709 uwptpg->ref_count = NL3PG;
5710 }
5711 if ((new_l2 & ATTR_SW_MANAGED) != 0) {
5712 /*
5713 * Abort this mapping if its PV entry could not be created.
5714 */
5715 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
5716 if (l2pg != NULL)
5717 pmap_abort_ptp(pmap, va, l2pg);
5718 if (uwptpg != NULL) {
5719 mt = pmap_remove_pt_page(pmap, va);
5720 KASSERT(mt == uwptpg,
5721 ("removed pt page %p, expected %p", mt,
5722 uwptpg));
5723 pmap_resident_count_dec(pmap, 1);
5724 uwptpg->ref_count = 1;
5725 vm_page_unwire_noq(uwptpg);
5726 vm_page_free(uwptpg);
5727 }
5728 CTR2(KTR_PMAP,
5729 "pmap_enter_l2: failure for va %#lx in pmap %p",
5730 va, pmap);
5731 return (KERN_RESOURCE_SHORTAGE);
5732 }
5733 if ((new_l2 & ATTR_SW_DBM) != 0)
5734 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5735 vm_page_aflag_set(mt, PGA_WRITEABLE);
5736 }
5737
5738 /*
5739 * Increment counters.
5740 */
5741 if ((new_l2 & ATTR_SW_WIRED) != 0)
5742 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
5743 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
5744
5745 /*
5746 * Conditionally sync the icache. See pmap_enter() for details.
5747 */
5748 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
5749 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
5750 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
5751 cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
5752 L2_SIZE);
5753 }
5754
5755 /*
5756 * Map the superpage.
5757 */
5758 pmap_store(l2, new_l2);
5759 dsb(ishst);
5760
5761 atomic_add_long(&pmap_l2_mappings, 1);
5762 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
5763 va, pmap);
5764
5765 return (KERN_SUCCESS);
5766 }
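/*
 * Illustrative sketch (not part of the build): a caller that tries to map
 * a superpage speculatively can dispatch on the return values documented
 * above, roughly as
 *
 *	rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock);
 *	if (rv == KERN_SUCCESS || rv == KERN_NO_SPACE) {
 *		va += L2_SIZE;		(superpage present; skip ahead)
 *	} else {
 *		... fall back to 4KB mappings ...
 *	}
 *
 * which mirrors how pmap_enter_object() below treats KERN_NO_SPACE as
 * "skip ahead" rather than as a hard failure.
 */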
5767
5768 /*
5769 * Tries to create a read- and/or execute-only L3C page mapping. Returns
5770 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5771 * value.
5772 */
5773 static int
5774 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
5775 vm_prot_t prot, struct rwlock **lockp)
5776 {
5777 pt_entry_t l3e;
5778
5779 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5780 PMAP_ASSERT_STAGE1(pmap);
5781 KASSERT(ADDR_IS_CANONICAL(va),
5782 ("%s: Address not in canonical form: %lx", __func__, va));
5783
5784 l3e = VM_PAGE_TO_PTE(m) | pmap_sh_attr |
5785 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5786 ATTR_CONTIGUOUS | L3_PAGE;
5787 if ((m->oflags & VPO_UNMANAGED) == 0)
5788 l3e |= ATTR_SW_MANAGED;
5789 else
5790 l3e |= ATTR_AF;
5791 if ((prot & VM_PROT_EXECUTE) == 0 ||
5792 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5793 l3e |= ATTR_S1_XN;
5794 if (!ADDR_IS_KERNEL(va))
5795 l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5796 else
5797 l3e |= ATTR_S1_UXN;
5798 if (pmap != kernel_pmap)
5799 l3e |= ATTR_S1_nG;
5800 return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
5801 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
5802 }
5803
5804 static int
5805 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
5806 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
5807 {
5808 pd_entry_t *l2p, *pde;
5809 pt_entry_t *l3p, *tl3p;
5810 vm_page_t mt;
5811 vm_paddr_t pa;
5812 vm_pindex_t l2pindex;
5813 int lvl;
5814
5815 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5816 KASSERT((va & L3C_OFFSET) == 0,
5817 ("pmap_enter_l3c: va is not aligned"));
5818 KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
5819 ("pmap_enter_l3c: managed mapping within the clean submap"));
5820 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
5821 ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS"));
5822
5823 /*
5824 * If the L3 PTP is not resident, we attempt to create it here.
5825 */
5826 if (!ADDR_IS_KERNEL(va)) {
5827 /*
5828 * Were we given the correct L3 PTP? If so, we can simply
5829 * increment its ref count.
5830 */
5831 l2pindex = pmap_l2_pindex(va);
5832 if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
5833 (*ml3p)->ref_count += L3C_ENTRIES;
5834 } else {
5835 retry:
5836 /*
5837 * Get the L2 entry.
5838 */
5839 pde = pmap_pde(pmap, va, &lvl);
5840
5841 /*
5842 * If the L2 entry is a superpage, we either abort or
5843 * demote depending on the given flags.
5844 */
5845 if (lvl == 1) {
5846 l2p = pmap_l1_to_l2(pde, va);
5847 if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
5848 L2_BLOCK) {
5849 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5850 return (KERN_FAILURE);
5851 l3p = pmap_demote_l2_locked(pmap, l2p,
5852 va, lockp);
5853 if (l3p != NULL) {
5854 *ml3p = PTE_TO_VM_PAGE(
5855 pmap_load(l2p));
5856 (*ml3p)->ref_count +=
5857 L3C_ENTRIES;
5858 goto have_l3p;
5859 }
5860 }
5861 /* We need to allocate an L3 PTP. */
5862 }
5863
5864 /*
5865 * If the L3 PTP is mapped, we just increment its ref
5866 * count. Otherwise, we attempt to allocate it.
5867 */
5868 if (lvl == 2 && pmap_load(pde) != 0) {
5869 *ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
5870 (*ml3p)->ref_count += L3C_ENTRIES;
5871 } else {
5872 *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
5873 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
5874 if (*ml3p == NULL) {
5875 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5876 return (KERN_FAILURE);
5877
5878 /*
5879 * The page table may have changed
5880 * while we slept.
5881 */
5882 goto retry;
5883 }
5884 (*ml3p)->ref_count += L3C_ENTRIES - 1;
5885 }
5886 }
5887 l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
5888 } else {
5889 *ml3p = NULL;
5890
5891 /*
5892 * If the L2 entry is a superpage, we either abort or demote
5893 * depending on the given flags.
5894 */
5895 pde = pmap_pde(kernel_pmap, va, &lvl);
5896 if (lvl == 1) {
5897 l2p = pmap_l1_to_l2(pde, va);
5898 KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
5899 ("pmap_enter_l3c: missing L2 block"));
5900 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5901 return (KERN_FAILURE);
5902 l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
5903 } else {
5904 KASSERT(lvl == 2,
5905 ("pmap_enter_l3c: Invalid level %d", lvl));
5906 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
5907 pmap_load(pde)));
5908 }
5909 }
5910 have_l3p:
5911 l3p = &l3p[pmap_l3_index(va)];
5912
5913 /*
5914 * If bti is not the same for the whole L3C range, return failure
5915 * and let vm_fault() cope. Check after L3 allocation, since
5916 * it could sleep.
5917 */
5918 if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) {
5919 KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP"));
5920 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
5921 pmap_abort_ptp(pmap, va, *ml3p);
5922 *ml3p = NULL;
5923 return (KERN_PROTECTION_FAILURE);
5924 }
5925
5926 /*
5927 * If there are existing mappings, either abort or remove them.
5928 */
5929 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5930 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5931 if (pmap_load(tl3p) != 0) {
5932 if (*ml3p != NULL)
5933 (*ml3p)->ref_count -= L3C_ENTRIES;
5934 return (KERN_FAILURE);
5935 }
5936 }
5937 } else {
5938 /*
5939 * Because we increment the L3 page's reference count above,
5940 * it is guaranteed not to be freed here and we can pass NULL
5941 * instead of a valid free list.
5942 */
5943 pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
5944 va + L3C_SIZE, NULL, lockp);
5945 }
5946
5947 /*
5948 * Enter on the PV list if part of our managed memory.
5949 */
5950 if ((l3e & ATTR_SW_MANAGED) != 0) {
5951 if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
5952 if (*ml3p != NULL) {
5953 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
5954 pmap_abort_ptp(pmap, va, *ml3p);
5955 *ml3p = NULL;
5956 }
5957 return (KERN_RESOURCE_SHORTAGE);
5958 }
5959 if ((l3e & ATTR_SW_DBM) != 0)
5960 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
5961 vm_page_aflag_set(mt, PGA_WRITEABLE);
5962 }
5963
5964 /*
5965 * Increment counters.
5966 */
5967 if ((l3e & ATTR_SW_WIRED) != 0)
5968 pmap->pm_stats.wired_count += L3C_ENTRIES;
5969 pmap_resident_count_inc(pmap, L3C_ENTRIES);
5970
5971 pa = VM_PAGE_TO_PHYS(m);
5972 KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));
5973
5974 /*
5975 * Sync the icache before the mapping is stored.
5976 */
5977 if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
5978 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5979 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);
5980
5981 /*
5982 * Map the superpage.
5983 */
5984 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5985 pmap_store(tl3p, l3e);
5986 l3e += L3_SIZE;
5987 }
5988 dsb(ishst);
5989
5990 counter_u64_add(pmap_l3c_mappings, 1);
5991 CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
5992 va, pmap);
5993 return (KERN_SUCCESS);
5994 }
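/*
 * Illustrative sketch (not part of the build): an L3C "superpage" is
 * simply L3C_ENTRIES (16 with 4KB base pages) consecutive L3 PTEs that
 * all carry ATTR_CONTIGUOUS and map physically contiguous, 64KB-aligned
 * memory.  The store loop above is equivalent to
 *
 *	for (i = 0; i < L3C_ENTRIES; i++)
 *		pmap_store(&l3p[i], l3e + i * L3_SIZE);
 *
 * which allows the TLB to cache the whole 64KB range in a single entry.
 */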
5995
5996 /*
5997 * Maps a sequence of resident pages belonging to the same object.
5998 * The sequence begins with the given page m_start. This page is
5999 * mapped at the given virtual address start. Each subsequent page is
6000 * mapped at a virtual address that is offset from start by the same
6001 * amount as the page is offset from m_start within the object. The
6002 * last page in the sequence is the page with the largest offset from
6003 * m_start that can be mapped at a virtual address less than the given
6004 * virtual address end. Not every virtual page between start and end
6005 * is mapped; only those for which a resident page exists with the
6006 * corresponding offset from m_start are mapped.
6007 */
6008 void
6009 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
6010 vm_page_t m_start, vm_prot_t prot)
6011 {
6012 struct rwlock *lock;
6013 vm_offset_t va;
6014 vm_page_t m, mpte;
6015 vm_pindex_t diff, psize;
6016 int rv;
6017
6018 VM_OBJECT_ASSERT_LOCKED(m_start->object);
6019
6020 psize = atop(end - start);
6021 mpte = NULL;
6022 m = m_start;
6023 lock = NULL;
6024 PMAP_LOCK(pmap);
6025 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
6026 va = start + ptoa(diff);
6027 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
6028 m->psind == 2 && pmap_ps_enabled(pmap) &&
6029 ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
6030 KERN_SUCCESS || rv == KERN_NO_SPACE))
6031 m = &m[L2_SIZE / PAGE_SIZE - 1];
6032 else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
6033 m->psind >= 1 && pmap_ps_enabled(pmap) &&
6034 ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
6035 &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE))
6036 m = &m[L3C_ENTRIES - 1];
6037 else {
6038 /*
6039 * In general, if a superpage mapping were possible,
6040 * it would have been created above. That said, if
6041 * start and end are not superpage aligned, then
6042 * promotion might be possible at the ends of [start,
6043 * end). However, in practice, those promotion
6044 * attempts are so unlikely to succeed that they are
6045 * not worth trying.
6046 */
6047 mpte = pmap_enter_quick_locked(pmap, va, m, prot |
6048 VM_PROT_NO_PROMOTE, mpte, &lock);
6049 }
6050 m = TAILQ_NEXT(m, listq);
6051 }
6052 if (lock != NULL)
6053 rw_wunlock(lock);
6054 PMAP_UNLOCK(pmap);
6055 }
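/*
 * Illustrative sketch (hypothetical caller, not part of the build): the
 * machine-independent VM invokes this with the backing object locked,
 * roughly as
 *
 *	VM_OBJECT_RLOCK(object);
 *	m_start = vm_page_find_least(object, pindex);
 *	pmap_enter_object(vm_map_pmap(map), start, end, m_start, prot);
 *	VM_OBJECT_RUNLOCK(object);
 *
 * so that the resident pages examined above cannot be freed while they
 * are being mapped.
 */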
6056
6057 /*
6058 * This code makes some *MAJOR* assumptions:
6059 * 1. The current pmap and the given pmap exist.
6060 * 2. Not wired.
6061 * 3. Read access.
6062 * 4. No page table pages.
6063 * but it is *MUCH* faster than pmap_enter...
6064 */
6065
6066 void
6067 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
6068 {
6069 struct rwlock *lock;
6070
6071 lock = NULL;
6072 PMAP_LOCK(pmap);
6073 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
6074 if (lock != NULL)
6075 rw_wunlock(lock);
6076 PMAP_UNLOCK(pmap);
6077 }
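/*
 * Illustrative sketch (hypothetical caller, not part of the build):
 * pmap_enter_quick() is intended for opportunistic prefaulting of
 * resident pages, where failure to map is acceptable, e.g.
 *
 *	if (vm_page_all_valid(m))
 *		pmap_enter_quick(vm_map_pmap(map), addr, m,
 *		    prot & ~VM_PROT_WRITE);
 *
 * Write access is stripped because the mapping is created read-only.
 */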
6078
6079 static vm_page_t
6080 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
6081 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
6082 {
6083 pt_entry_t *l1, *l2, *l3, l3_val;
6084 vm_paddr_t pa;
6085 int full_lvl, lvl;
6086
6087 KASSERT(!VA_IS_CLEANMAP(va) ||
6088 (m->oflags & VPO_UNMANAGED) != 0,
6089 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
6090 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6091 PMAP_ASSERT_STAGE1(pmap);
6092 KASSERT(ADDR_IS_CANONICAL(va),
6093 ("%s: Address not in canonical form: %lx", __func__, va));
6094 l2 = NULL;
6095
6096 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
6097 /*
6098 * In the case that a page table page is not
6099 * resident, we are creating it here.
6100 */
6101 if (!ADDR_IS_KERNEL(va)) {
6102 vm_pindex_t l2pindex;
6103
6104 /*
6105 * Calculate pagetable page index
6106 */
6107 l2pindex = pmap_l2_pindex(va);
6108 if (mpte && (mpte->pindex == l2pindex)) {
6109 mpte->ref_count++;
6110 } else {
6111 /*
6112 * If the page table page is mapped, we just increment
6113 * the hold count, and activate it. Otherwise, we
6114 * attempt to allocate a page table page, passing NULL
6115 * instead of the PV list lock pointer because we don't
6116 * intend to sleep. If this attempt fails, we don't
6117 * retry. Instead, we give up.
6118 */
6119 l1 = pmap_l1(pmap, va);
6120 if (l1 != NULL && pmap_load(l1) != 0) {
6121 if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
6122 L1_BLOCK)
6123 return (NULL);
6124 l2 = pmap_l1_to_l2(l1, va);
6125 if (pmap_load(l2) != 0) {
6126 if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
6127 L2_BLOCK)
6128 return (NULL);
6129 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
6130 mpte->ref_count++;
6131 } else {
6132 mpte = _pmap_alloc_l3(pmap, l2pindex,
6133 NULL);
6134 if (mpte == NULL)
6135 return (mpte);
6136 }
6137 } else {
6138 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
6139 if (mpte == NULL)
6140 return (mpte);
6141 }
6142 }
6143 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
6144 l3 = &l3[pmap_l3_index(va)];
6145 } else {
6146 mpte = NULL;
6147 l2 = pmap_pde(kernel_pmap, va, &lvl);
6148 KASSERT(l2 != NULL,
6149 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
6150 va));
6151 KASSERT(lvl == 2,
6152 ("pmap_enter_quick_locked: Invalid level %d", lvl));
6153 l3 = pmap_l2_to_l3(l2, va);
6154 }
6155
6156 /*
6157 * Abort if a mapping already exists.
6158 */
6159 if (pmap_load(l3) != 0) {
6160 if (mpte != NULL)
6161 mpte->ref_count--;
6162 return (NULL);
6163 }
6164
6165 /*
6166 * Enter on the PV list if part of our managed memory.
6167 */
6168 if ((m->oflags & VPO_UNMANAGED) == 0 &&
6169 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
6170 if (mpte != NULL)
6171 pmap_abort_ptp(pmap, va, mpte);
6172 return (NULL);
6173 }
6174
6175 /*
6176 * Increment counters
6177 */
6178 pmap_resident_count_inc(pmap, 1);
6179
6180 pa = VM_PAGE_TO_PHYS(m);
6181 l3_val = PHYS_TO_PTE(pa) | pmap_sh_attr |
6182 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
6183 l3_val |= pmap_pte_bti(pmap, va);
6184 if ((prot & VM_PROT_EXECUTE) == 0 ||
6185 m->md.pv_memattr == VM_MEMATTR_DEVICE)
6186 l3_val |= ATTR_S1_XN;
6187 if (!ADDR_IS_KERNEL(va))
6188 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
6189 else
6190 l3_val |= ATTR_S1_UXN;
6191 if (pmap != kernel_pmap)
6192 l3_val |= ATTR_S1_nG;
6193
6194 /*
6195 * Now validate mapping with RO protection
6196 */
6197 if ((m->oflags & VPO_UNMANAGED) == 0)
6198 l3_val |= ATTR_SW_MANAGED;
6199 else
6200 l3_val |= ATTR_AF;
6201
6202 /* Sync icache before the mapping is stored to PTE */
6203 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
6204 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
6205 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
6206
6207 pmap_store(l3, l3_val);
6208 dsb(ishst);
6209
6210 #if VM_NRESERVLEVEL > 0
6211 /*
6212 * First, attempt L3C promotion, if the virtual and physical addresses
6213 * are aligned with each other and an underlying reservation has the
6214 * neighboring L3 pages allocated. The first condition is simply an
6215 * optimization that recognizes some eventual promotion failures early
6216 * at a lower run-time cost. Then, attempt L2 promotion, if both a
6217 * level 1 reservation and the PTP are fully populated.
6218 */
6219 if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
6220 (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
6221 (m->flags & PG_FICTITIOUS) == 0 &&
6222 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
6223 pmap_promote_l3c(pmap, l3, va) &&
6224 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) {
6225 if (l2 == NULL)
6226 l2 = pmap_l2(pmap, va);
6227
6228 /*
6229 * If promotion succeeds, then the next call to this function
6230 * should not be given the unmapped PTP as a hint.
6231 */
6232 if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
6233 mpte = NULL;
6234 }
6235 #endif
6236
6237 return (mpte);
6238 }
6239
6240 /*
6241 * This code maps large physical mmap regions into the
6242 * processor address space. Note that some shortcuts
6243 * are taken, but the code works.
6244 */
6245 void
6246 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6247 vm_pindex_t pindex, vm_size_t size)
6248 {
6249
6250 VM_OBJECT_ASSERT_WLOCKED(object);
6251 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6252 ("pmap_object_init_pt: non-device object"));
6253 }
6254
6255 /*
6256 * Clear the wired attribute from the mappings for the specified range of
6257 * addresses in the given pmap. Every valid mapping within that range
6258 * must have the wired attribute set. In contrast, invalid mappings
6259 * cannot have the wired attribute set, so they are ignored.
6260 *
6261 * The wired attribute of the page table entry is not a hardware feature,
6262 * so there is no need to invalidate any TLB entries.
6263 */
6264 void
6265 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6266 {
6267 vm_offset_t va_next;
6268 pd_entry_t *l0, *l1, *l2;
6269 pt_entry_t *l3;
6270 bool partial_l3c;
6271
6272 PMAP_LOCK(pmap);
6273 for (; sva < eva; sva = va_next) {
6274 l0 = pmap_l0(pmap, sva);
6275 if (pmap_load(l0) == 0) {
6276 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6277 if (va_next < sva)
6278 va_next = eva;
6279 continue;
6280 }
6281
6282 l1 = pmap_l0_to_l1(l0, sva);
6283 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6284 if (va_next < sva)
6285 va_next = eva;
6286 if (pmap_load(l1) == 0)
6287 continue;
6288
6289 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6290 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6291 KASSERT(va_next <= eva,
6292 ("partial update of non-transparent 1G page "
6293 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
6294 pmap_load(l1), sva, eva, va_next));
6295 MPASS(pmap != kernel_pmap);
6296 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
6297 ATTR_SW_WIRED)) == ATTR_SW_WIRED);
6298 pmap_clear_bits(l1, ATTR_SW_WIRED);
6299 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
6300 continue;
6301 }
6302
6303 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6304 if (va_next < sva)
6305 va_next = eva;
6306
6307 l2 = pmap_l1_to_l2(l1, sva);
6308 if (pmap_load(l2) == 0)
6309 continue;
6310
6311 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
6312 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
6313 panic("pmap_unwire: l2 %#jx is missing "
6314 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
6315
6316 /*
6317 * Are we unwiring the entire large page? If not,
6318 * demote the mapping and fall through.
6319 */
6320 if (sva + L2_SIZE == va_next && eva >= va_next) {
6321 pmap_clear_bits(l2, ATTR_SW_WIRED);
6322 pmap->pm_stats.wired_count -= L2_SIZE /
6323 PAGE_SIZE;
6324 continue;
6325 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
6326 panic("pmap_unwire: demotion failed");
6327 }
6328 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6329 ("pmap_unwire: Invalid l2 entry after demotion"));
6330
6331 if (va_next > eva)
6332 va_next = eva;
6333 for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
6334 sva != va_next; l3++, sva += L3_SIZE) {
6335 if (pmap_load(l3) == 0)
6336 continue;
6337 if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
6338 /*
6339 * Avoid demotion for whole-page unwiring.
6340 */
6341 if ((sva & L3C_OFFSET) == 0) {
6342 /*
6343 * Handle the possibility that
6344 * "va_next" is zero because of
6345 * address wraparound.
6346 */
6347 partial_l3c = sva + L3C_OFFSET >
6348 va_next - 1;
6349 }
6350 if (partial_l3c)
6351 (void)pmap_demote_l3c(pmap, l3, sva);
6352 }
6353 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
6354 panic("pmap_unwire: l3 %#jx is missing "
6355 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
6356
6357 /*
6358 * ATTR_SW_WIRED must be cleared atomically. Although
6359 * the pmap lock synchronizes access to ATTR_SW_WIRED,
6360 * the System MMU may write to the entry concurrently.
6361 */
6362 pmap_clear_bits(l3, ATTR_SW_WIRED);
6363 pmap->pm_stats.wired_count--;
6364 }
6365 }
6366 PMAP_UNLOCK(pmap);
6367 }
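/*
 * Illustrative sketch (not part of the build): unwiring is driven by the
 * machine-independent layer over a page-aligned range whose valid
 * mappings are all currently wired, e.g.
 *
 *	pmap_unwire(vm_map_pmap(map), trunc_page(start), round_page(end));
 *
 * A valid mapping in the range that is not wired triggers a panic, as
 * enforced above.
 */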
6368
6369 /*
6370 * This function requires that the caller has already added one to ml3's
6371 * ref_count in anticipation of creating a 4KB page mapping.
6372 */
6373 static bool
6374 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
6375 vm_page_t ml3, struct rwlock **lockp)
6376 {
6377 pt_entry_t *tl3p;
6378
6379 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6380 KASSERT((va & L3C_OFFSET) == 0,
6381 ("pmap_copy_l3c: va is not aligned"));
6382 KASSERT((l3e & ATTR_SW_MANAGED) != 0,
6383 ("pmap_copy_l3c: l3e is not managed"));
6384
6385 /*
6386 * Abort if a mapping already exists.
6387 */
6388 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
6389 if (pmap_load(tl3p) != 0) {
6390 if (ml3 != NULL)
6391 ml3->ref_count--;
6392 return (false);
6393 }
6394
6395 if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
6396 if (ml3 != NULL)
6397 pmap_abort_ptp(pmap, va, ml3);
6398 return (false);
6399 }
6400 ml3->ref_count += L3C_ENTRIES - 1;
6401
6402 /*
6403 * Clear the wired and accessed bits. However, leave the dirty bit
6404 * unchanged because read/write superpage mappings are required to be
6405 * dirty.
6406 */
6407 l3e &= ~(ATTR_SW_WIRED | ATTR_AF);
6408
6409 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6410 pmap_store(tl3p, l3e);
6411 l3e += L3_SIZE;
6412 }
6413 pmap_resident_count_inc(pmap, L3C_ENTRIES);
6414 counter_u64_add(pmap_l3c_mappings, 1);
6415 CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
6416 va, pmap);
6417 return (true);
6418 }
6419
6420 /*
6421 * Copy the range specified by src_addr/len
6422 * from the source map to the range dst_addr/len
6423 * in the destination map.
6424 *
6425 * This routine is only advisory and need not do anything.
6426 *
6427 * Because the executable mappings created by this routine are copied,
6428 * it should not have to flush the instruction cache.
6429 */
6430 void
6431 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6432 vm_offset_t src_addr)
6433 {
6434 struct rwlock *lock;
6435 pd_entry_t *l0, *l1, *l2, srcptepaddr;
6436 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
6437 vm_offset_t addr, end_addr, va_next;
6438 vm_page_t dst_m, dstmpte, srcmpte;
6439
6440 PMAP_ASSERT_STAGE1(dst_pmap);
6441 PMAP_ASSERT_STAGE1(src_pmap);
6442
6443 if (dst_addr != src_addr)
6444 return;
6445 end_addr = src_addr + len;
6446 lock = NULL;
6447 if (dst_pmap < src_pmap) {
6448 PMAP_LOCK(dst_pmap);
6449 PMAP_LOCK(src_pmap);
6450 } else {
6451 PMAP_LOCK(src_pmap);
6452 PMAP_LOCK(dst_pmap);
6453 }
6454 for (addr = src_addr; addr < end_addr; addr = va_next) {
6455 l0 = pmap_l0(src_pmap, addr);
6456 if (pmap_load(l0) == 0) {
6457 va_next = (addr + L0_SIZE) & ~L0_OFFSET;
6458 if (va_next < addr)
6459 va_next = end_addr;
6460 continue;
6461 }
6462
6463 va_next = (addr + L1_SIZE) & ~L1_OFFSET;
6464 if (va_next < addr)
6465 va_next = end_addr;
6466 l1 = pmap_l0_to_l1(l0, addr);
6467 if (pmap_load(l1) == 0)
6468 continue;
6469 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6470 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6471 KASSERT(va_next <= end_addr,
6472 ("partial update of non-transparent 1G page "
6473 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6474 pmap_load(l1), addr, end_addr, va_next));
6475 srcptepaddr = pmap_load(l1);
6476 l1 = pmap_l1(dst_pmap, addr);
6477 if (l1 == NULL) {
6478 if (_pmap_alloc_l3(dst_pmap,
6479 pmap_l0_pindex(addr), NULL) == NULL)
6480 break;
6481 l1 = pmap_l1(dst_pmap, addr);
6482 } else {
6483 l0 = pmap_l0(dst_pmap, addr);
6484 dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
6485 dst_m->ref_count++;
6486 }
6487 KASSERT(pmap_load(l1) == 0,
6488 ("1G mapping present in dst pmap "
6489 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6490 pmap_load(l1), addr, end_addr, va_next));
6491 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
6492 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
6493 continue;
6494 }
6495
6496 va_next = (addr + L2_SIZE) & ~L2_OFFSET;
6497 if (va_next < addr)
6498 va_next = end_addr;
6499 l2 = pmap_l1_to_l2(l1, addr);
6500 srcptepaddr = pmap_load(l2);
6501 if (srcptepaddr == 0)
6502 continue;
6503 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
6504 /*
6505 * We can only virtual copy whole superpages.
6506 */
6507 if ((addr & L2_OFFSET) != 0 ||
6508 addr + L2_SIZE > end_addr)
6509 continue;
6510 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
6511 if (l2 == NULL)
6512 break;
6513 if (pmap_load(l2) == 0 &&
6514 ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
6515 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
6516 PMAP_ENTER_NORECLAIM, &lock))) {
6517 /*
6518 * We leave the dirty bit unchanged because
6519 * managed read/write superpage mappings are
6520 * required to be dirty. However, managed
6521 * superpage mappings are not required to
6522 * have their accessed bit set, so we clear
6523 * it because we don't know if this mapping
6524 * will be used.
6525 */
6526 srcptepaddr &= ~ATTR_SW_WIRED;
6527 if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
6528 srcptepaddr &= ~ATTR_AF;
6529 pmap_store(l2, srcptepaddr);
6530 pmap_resident_count_inc(dst_pmap, L2_SIZE /
6531 PAGE_SIZE);
6532 atomic_add_long(&pmap_l2_mappings, 1);
6533 } else
6534 pmap_abort_ptp(dst_pmap, addr, dst_m);
6535 continue;
6536 }
6537 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
6538 ("pmap_copy: invalid L2 entry"));
6539 srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
6540 KASSERT(srcmpte->ref_count > 0,
6541 ("pmap_copy: source page table page is unused"));
6542 if (va_next > end_addr)
6543 va_next = end_addr;
6544 src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
6545 src_pte = &src_pte[pmap_l3_index(addr)];
6546 dstmpte = NULL;
6547 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6548 ptetemp = pmap_load(src_pte);
6549
6550 /*
6551 * We only virtual copy managed pages.
6552 */
6553 if ((ptetemp & ATTR_SW_MANAGED) == 0)
6554 continue;
6555
6556 if (dstmpte != NULL) {
6557 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
6558 ("dstmpte pindex/addr mismatch"));
6559 dstmpte->ref_count++;
6560 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
6561 NULL)) == NULL)
6562 goto out;
6563 dst_pte = (pt_entry_t *)
6564 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
6565 dst_pte = &dst_pte[pmap_l3_index(addr)];
6566 if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
6567 L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
6568 va_next - 1) {
6569 if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
6570 ptetemp, dstmpte, &lock))
6571 goto out;
6572 addr += L3C_SIZE - PAGE_SIZE;
6573 src_pte += L3C_ENTRIES - 1;
6574 } else if (pmap_load(dst_pte) == 0 &&
6575 pmap_try_insert_pv_entry(dst_pmap, addr,
6576 PTE_TO_VM_PAGE(ptetemp), &lock)) {
6577 /*
6578 * Clear the wired, contiguous, modified, and
6579 * accessed bits from the destination PTE.
6580 * The contiguous bit is cleared because we
6581 * are not copying the entire L3C superpage.
6582 */
6583 mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
6584 ATTR_AF;
6585 nbits = 0;
6586 if ((ptetemp & ATTR_SW_DBM) != 0)
6587 nbits |= ATTR_S1_AP_RW_BIT;
6588 pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
6589 pmap_resident_count_inc(dst_pmap, 1);
6590 } else {
6591 pmap_abort_ptp(dst_pmap, addr, dstmpte);
6592 goto out;
6593 }
6594 /* Have we copied all of the valid mappings? */
6595 if (dstmpte->ref_count >= srcmpte->ref_count)
6596 break;
6597 }
6598 }
6599 out:
6600 /*
6601 * XXX This barrier may not be needed because the destination pmap is
6602 * not active.
6603 */
6604 dsb(ishst);
6605
6606 if (lock != NULL)
6607 rw_wunlock(lock);
6608 PMAP_UNLOCK(src_pmap);
6609 PMAP_UNLOCK(dst_pmap);
6610 }
6611
6612 int
6613 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6614 {
6615 int error;
6616
6617 if (dst_pmap->pm_stage != src_pmap->pm_stage)
6618 return (EINVAL);
6619
6620 if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6621 return (0);
6622
6623 for (;;) {
6624 if (dst_pmap < src_pmap) {
6625 PMAP_LOCK(dst_pmap);
6626 PMAP_LOCK(src_pmap);
6627 } else {
6628 PMAP_LOCK(src_pmap);
6629 PMAP_LOCK(dst_pmap);
6630 }
6631 error = pmap_bti_copy(dst_pmap, src_pmap);
6632 /* Clean up partial copy on failure due to no memory. */
6633 if (error == ENOMEM)
6634 pmap_bti_deassign_all(dst_pmap);
6635 PMAP_UNLOCK(src_pmap);
6636 PMAP_UNLOCK(dst_pmap);
6637 if (error != ENOMEM)
6638 break;
6639 vm_wait(NULL);
6640 }
6641 return (error);
6642 }
6643
6644 /*
6645 * pmap_zero_page zeros the specified hardware page by accessing
6646 * it through the direct map (DMAP) and clearing its contents.
6647 */
6648 void
6649 pmap_zero_page(vm_page_t m)
6650 {
6651 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6652
6653 pagezero((void *)va);
6654 }
6655
6656 /*
6657 * pmap_zero_page_area zeros part of the specified hardware page by
6658 * accessing it through the direct map (DMAP).
6659 *
6660 * off and size may not cover an area beyond a single hardware page.
6661 */
6662 void
6663 pmap_zero_page_area(vm_page_t m, int off, int size)
6664 {
6665 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6666
6667 if (off == 0 && size == PAGE_SIZE)
6668 pagezero((void *)va);
6669 else
6670 bzero((char *)va + off, size);
6671 }
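/*
 * Illustrative sketch (not part of the build): partial zeroing through
 * the direct map, e.g. clearing only the second half of a 4KB page:
 *
 *	pmap_zero_page_area(m, PAGE_SIZE / 2, PAGE_SIZE / 2);
 */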
6672
6673 /*
6674 * pmap_copy_page copies the specified (machine independent)
6675 * page by accessing the source and destination pages through
6676 * the direct map (DMAP), one machine dependent page at a
6677 * time.
6678 */
6679 void
6680 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6681 {
6682 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6683 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6684
6685 pagecopy((void *)src, (void *)dst);
6686 }
6687
6688 int unmapped_buf_allowed = 1;
6689
6690 void
6691 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6692 vm_offset_t b_offset, int xfersize)
6693 {
6694 void *a_cp, *b_cp;
6695 vm_page_t m_a, m_b;
6696 vm_paddr_t p_a, p_b;
6697 vm_offset_t a_pg_offset, b_pg_offset;
6698 int cnt;
6699
6700 while (xfersize > 0) {
6701 a_pg_offset = a_offset & PAGE_MASK;
6702 m_a = ma[a_offset >> PAGE_SHIFT];
6703 p_a = m_a->phys_addr;
6704 b_pg_offset = b_offset & PAGE_MASK;
6705 m_b = mb[b_offset >> PAGE_SHIFT];
6706 p_b = m_b->phys_addr;
6707 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6708 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6709 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6710 panic("!DMAP a %lx", p_a);
6711 } else {
6712 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6713 }
6714 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
6715 panic("!DMAP b %lx", p_b);
6716 } else {
6717 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
6718 }
6719 bcopy(a_cp, b_cp, cnt);
6720 a_offset += cnt;
6721 b_offset += cnt;
6722 xfersize -= cnt;
6723 }
6724 }
6725
6726 vm_offset_t
6727 pmap_quick_enter_page(vm_page_t m)
6728 {
6729
6730 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
6731 }
6732
6733 void
6734 pmap_quick_remove_page(vm_offset_t addr)
6735 {
6736 }
6737
6738 /*
6739 * Returns true if the pmap's pv is one of the first
6740 * 16 pvs linked to from this page. This count may
6741 * be changed upwards or downwards in the future; it
6742 * is only necessary that true be returned for a small
6743 * subset of pmaps for proper page aging.
6744 */
6745 bool
6746 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6747 {
6748 struct md_page *pvh;
6749 struct rwlock *lock;
6750 pv_entry_t pv;
6751 int loops = 0;
6752 bool rv;
6753
6754 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6755 ("pmap_page_exists_quick: page %p is not managed", m));
6756 rv = false;
6757 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6758 rw_rlock(lock);
6759 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6760 if (PV_PMAP(pv) == pmap) {
6761 rv = true;
6762 break;
6763 }
6764 loops++;
6765 if (loops >= 16)
6766 break;
6767 }
6768 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6769 pvh = page_to_pvh(m);
6770 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6771 if (PV_PMAP(pv) == pmap) {
6772 rv = true;
6773 break;
6774 }
6775 loops++;
6776 if (loops >= 16)
6777 break;
6778 }
6779 }
6780 rw_runlock(lock);
6781 return (rv);
6782 }
6783
6784 /*
6785 * pmap_page_wired_mappings:
6786 *
6787 * Return the number of managed mappings to the given physical page
6788 * that are wired.
6789 */
6790 int
6791 pmap_page_wired_mappings(vm_page_t m)
6792 {
6793 struct rwlock *lock;
6794 struct md_page *pvh;
6795 pmap_t pmap;
6796 pt_entry_t *pte;
6797 pv_entry_t pv;
6798 int count, md_gen, pvh_gen;
6799
6800 if ((m->oflags & VPO_UNMANAGED) != 0)
6801 return (0);
6802 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6803 rw_rlock(lock);
6804 restart:
6805 count = 0;
6806 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6807 pmap = PV_PMAP(pv);
6808 if (!PMAP_TRYLOCK(pmap)) {
6809 md_gen = m->md.pv_gen;
6810 rw_runlock(lock);
6811 PMAP_LOCK(pmap);
6812 rw_rlock(lock);
6813 if (md_gen != m->md.pv_gen) {
6814 PMAP_UNLOCK(pmap);
6815 goto restart;
6816 }
6817 }
6818 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6819 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6820 count++;
6821 PMAP_UNLOCK(pmap);
6822 }
6823 if ((m->flags & PG_FICTITIOUS) == 0) {
6824 pvh = page_to_pvh(m);
6825 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6826 pmap = PV_PMAP(pv);
6827 if (!PMAP_TRYLOCK(pmap)) {
6828 md_gen = m->md.pv_gen;
6829 pvh_gen = pvh->pv_gen;
6830 rw_runlock(lock);
6831 PMAP_LOCK(pmap);
6832 rw_rlock(lock);
6833 if (md_gen != m->md.pv_gen ||
6834 pvh_gen != pvh->pv_gen) {
6835 PMAP_UNLOCK(pmap);
6836 goto restart;
6837 }
6838 }
6839 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6840 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6841 count++;
6842 PMAP_UNLOCK(pmap);
6843 }
6844 }
6845 rw_runlock(lock);
6846 return (count);
6847 }
6848
6849 /*
6850 * Returns true if the given page is mapped individually or as part of
6851 * a 2mpage. Otherwise, returns false.
6852 */
6853 bool
6854 pmap_page_is_mapped(vm_page_t m)
6855 {
6856 struct rwlock *lock;
6857 bool rv;
6858
6859 if ((m->oflags & VPO_UNMANAGED) != 0)
6860 return (false);
6861 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6862 rw_rlock(lock);
6863 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6864 ((m->flags & PG_FICTITIOUS) == 0 &&
6865 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
6866 rw_runlock(lock);
6867 return (rv);
6868 }
6869
6870 /*
6871 * Destroy all managed, non-wired mappings in the given user-space
6872 * pmap. This pmap cannot be active on any processor besides the
6873 * caller.
6874 *
6875 * This function cannot be applied to the kernel pmap. Moreover, it
6876 * is not intended for general use. It is only to be used during
6877 * process termination. Consequently, it can be implemented in ways
6878 * that make it faster than pmap_remove(). First, it can more quickly
6879 * destroy mappings by iterating over the pmap's collection of PV
6880 * entries, rather than searching the page table. Second, it doesn't
6881 * have to test and clear the page table entries atomically, because
6882 * no processor is currently accessing the user address space. In
6883 * particular, a page table entry's dirty bit won't change state once
6884 * this function starts.
6885 */
6886 void
6887 pmap_remove_pages(pmap_t pmap)
6888 {
6889 pd_entry_t *pde;
6890 pt_entry_t *pte, tpte;
6891 struct spglist free;
6892 struct pv_chunklist free_chunks[PMAP_MEMDOM];
6893 vm_page_t m, ml3, mt;
6894 pv_entry_t pv;
6895 struct md_page *pvh;
6896 struct pv_chunk *pc, *npc;
6897 struct rwlock *lock;
6898 int64_t bit;
6899 uint64_t inuse, bitmask;
6900 int allfree, field, i, idx, lvl;
6901 int freed __pvused;
6902 vm_paddr_t pa;
6903
6904 lock = NULL;
6905
6906 for (i = 0; i < PMAP_MEMDOM; i++)
6907 TAILQ_INIT(&free_chunks[i]);
6908 SLIST_INIT(&free);
6909 PMAP_LOCK(pmap);
6910 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
6911 allfree = 1;
6912 freed = 0;
6913 for (field = 0; field < _NPCM; field++) {
6914 inuse = ~pc->pc_map[field] & pc_freemask[field];
6915 while (inuse != 0) {
6916 bit = ffsl(inuse) - 1;
6917 bitmask = 1UL << bit;
6918 idx = field * 64 + bit;
6919 pv = &pc->pc_pventry[idx];
6920 inuse &= ~bitmask;
6921
6922 pde = pmap_pde(pmap, pv->pv_va, &lvl);
6923 KASSERT(pde != NULL,
6924 ("Attempting to remove an unmapped page"));
6925
6926 switch(lvl) {
6927 case 1:
6928 pte = pmap_l1_to_l2(pde, pv->pv_va);
6929 tpte = pmap_load(pte);
6930 KASSERT((tpte & ATTR_DESCR_MASK) ==
6931 L2_BLOCK,
6932 ("Attempting to remove an invalid "
6933 "block: %lx", tpte));
6934 break;
6935 case 2:
6936 pte = pmap_l2_to_l3(pde, pv->pv_va);
6937 tpte = pmap_load(pte);
6938 KASSERT((tpte & ATTR_DESCR_MASK) ==
6939 L3_PAGE,
6940 ("Attempting to remove an invalid "
6941 "page: %lx", tpte));
6942 break;
6943 default:
6944 panic(
6945 "Invalid page directory level: %d",
6946 lvl);
6947 }
6948
6949 /*
6950 * We cannot remove wired mappings at this time.
6951 *
6952 * For L3C superpages, all of the constituent PTEs
6953 * should have the wired bit set, so we don't
6954 * check for ATTR_CONTIGUOUS here.
6955 */
6956 if (tpte & ATTR_SW_WIRED) {
6957 allfree = 0;
6958 continue;
6959 }
6960
6961 /* Mark free */
6962 pc->pc_map[field] |= bitmask;
6963
6964 /*
6965 * Because this pmap is not active on other
6966 * processors, the dirty bit cannot have
6967 * changed state since we last loaded pte.
6968 */
6969 pmap_clear(pte);
6970
6971 pa = PTE_TO_PHYS(tpte);
6972
6973 m = PHYS_TO_VM_PAGE(pa);
6974 KASSERT(m->phys_addr == pa,
6975 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
6976 m, (uintmax_t)m->phys_addr,
6977 (uintmax_t)tpte));
6978
6979 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
6980 m < &vm_page_array[vm_page_array_size],
6981 ("pmap_remove_pages: bad pte %#jx",
6982 (uintmax_t)tpte));
6983
6984 /*
6985 * Update the vm_page_t clean/reference bits.
6986 *
6987 * We don't check for ATTR_CONTIGUOUS here
6988 * because writeable L3C superpages are expected
6989 * to be dirty, i.e., every constituent PTE
6990 * should be dirty.
6991 */
6992 if (pmap_pte_dirty(pmap, tpte)) {
6993 switch (lvl) {
6994 case 1:
6995 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6996 vm_page_dirty(mt);
6997 break;
6998 case 2:
6999 vm_page_dirty(m);
7000 break;
7001 }
7002 }
7003
7004 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
7005
7006 switch (lvl) {
7007 case 1:
7008 pmap_resident_count_dec(pmap,
7009 L2_SIZE / PAGE_SIZE);
7010 pvh = page_to_pvh(m);
7011 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7012 pvh->pv_gen++;
7013 if (TAILQ_EMPTY(&pvh->pv_list)) {
7014 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
7015 if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
7016 TAILQ_EMPTY(&mt->md.pv_list))
7017 vm_page_aflag_clear(mt, PGA_WRITEABLE);
7018 }
7019 ml3 = pmap_remove_pt_page(pmap,
7020 pv->pv_va);
7021 if (ml3 != NULL) {
7022 KASSERT(vm_page_any_valid(ml3),
7023 ("pmap_remove_pages: l3 page not promoted"));
7024 pmap_resident_count_dec(pmap, 1);
7025 KASSERT(ml3->ref_count == NL3PG,
7026 ("pmap_remove_pages: l3 page ref count error"));
7027 ml3->ref_count = 0;
7028 pmap_add_delayed_free_list(ml3,
7029 &free, false);
7030 }
7031 break;
7032 case 2:
7033 pmap_resident_count_dec(pmap, 1);
7034 TAILQ_REMOVE(&m->md.pv_list, pv,
7035 pv_next);
7036 m->md.pv_gen++;
7037 if ((m->a.flags & PGA_WRITEABLE) != 0 &&
7038 TAILQ_EMPTY(&m->md.pv_list) &&
7039 (m->flags & PG_FICTITIOUS) == 0) {
7040 pvh = page_to_pvh(m);
7041 if (TAILQ_EMPTY(&pvh->pv_list))
7042 vm_page_aflag_clear(m,
7043 PGA_WRITEABLE);
7044 }
7045 break;
7046 }
7047 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
7048 &free);
7049 freed++;
7050 }
7051 }
7052 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
7053 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
7054 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
7055 if (allfree) {
7056 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
7057 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
7058 pc_list);
7059 }
7060 }
7061 if (lock != NULL)
7062 rw_wunlock(lock);
7063 pmap_invalidate_all(pmap);
7064 pmap_bti_deassign_all(pmap);
7065 free_pv_chunk_batch(free_chunks);
7066 PMAP_UNLOCK(pmap);
7067 vm_page_free_pages_toq(&free, true);
7068 }
7069
7070 /*
7071 * This is used to check if a page has been accessed or modified.
7072 */
7073 static bool
7074 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
7075 {
7076 struct rwlock *lock;
7077 pv_entry_t pv;
7078 struct md_page *pvh;
7079 pt_entry_t l3e, mask, *pte, value;
7080 pmap_t pmap;
7081 int md_gen, pvh_gen;
7082 bool rv;
7083
7084 rv = false;
7085 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7086 rw_rlock(lock);
7087 restart:
7088 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7089 pmap = PV_PMAP(pv);
7090 PMAP_ASSERT_STAGE1(pmap);
7091 if (!PMAP_TRYLOCK(pmap)) {
7092 md_gen = m->md.pv_gen;
7093 rw_runlock(lock);
7094 PMAP_LOCK(pmap);
7095 rw_rlock(lock);
7096 if (md_gen != m->md.pv_gen) {
7097 PMAP_UNLOCK(pmap);
7098 goto restart;
7099 }
7100 }
7101 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7102 mask = 0;
7103 value = 0;
7104 if (modified) {
7105 mask |= ATTR_S1_AP_RW_BIT;
7106 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
7107 }
7108 if (accessed) {
7109 mask |= ATTR_AF | ATTR_DESCR_MASK;
7110 value |= ATTR_AF | L3_PAGE;
7111 }
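/*
 * The comparison below treats a writeable mapping (AP == RW) as
 * modified and a valid L3 page with ATTR_AF set as accessed.
 */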
7112 l3e = pmap_load(pte);
7113 if ((l3e & ATTR_CONTIGUOUS) != 0)
7114 l3e = pmap_load_l3c(pte);
7115 PMAP_UNLOCK(pmap);
7116 rv = (l3e & mask) == value;
7117 if (rv)
7118 goto out;
7119 }
7120 if ((m->flags & PG_FICTITIOUS) == 0) {
7121 pvh = page_to_pvh(m);
7122 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
7123 pmap = PV_PMAP(pv);
7124 PMAP_ASSERT_STAGE1(pmap);
7125 if (!PMAP_TRYLOCK(pmap)) {
7126 md_gen = m->md.pv_gen;
7127 pvh_gen = pvh->pv_gen;
7128 rw_runlock(lock);
7129 PMAP_LOCK(pmap);
7130 rw_rlock(lock);
7131 if (md_gen != m->md.pv_gen ||
7132 pvh_gen != pvh->pv_gen) {
7133 PMAP_UNLOCK(pmap);
7134 goto restart;
7135 }
7136 }
7137 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
7138 mask = 0;
7139 value = 0;
7140 if (modified) {
7141 mask |= ATTR_S1_AP_RW_BIT;
7142 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
7143 }
7144 if (accessed) {
7145 mask |= ATTR_AF | ATTR_DESCR_MASK;
7146 value |= ATTR_AF | L2_BLOCK;
7147 }
7148 rv = (pmap_load(pte) & mask) == value;
7149 PMAP_UNLOCK(pmap);
7150 if (rv)
7151 goto out;
7152 }
7153 }
7154 out:
7155 rw_runlock(lock);
7156 return (rv);
7157 }
7158
7159 /*
7160 * pmap_is_modified:
7161 *
7162 * Return whether or not the specified physical page was modified
7163 * in any physical maps.
7164 */
7165 bool
7166 pmap_is_modified(vm_page_t m)
7167 {
7168
7169 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7170 ("pmap_is_modified: page %p is not managed", m));
7171
7172 /*
7173 * If the page is not busied then this check is racy.
7174 */
7175 if (!pmap_page_is_write_mapped(m))
7176 return (false);
7177 return (pmap_page_test_mappings(m, false, true));
7178 }
7179
7180 /*
7181 * pmap_is_prefaultable:
7182 *
7183 * Return whether or not the specified virtual address is eligible
7184 * for prefault.
7185 */
7186 bool
7187 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
7188 {
7189 pd_entry_t *pde;
7190 pt_entry_t *pte;
7191 bool rv;
7192 int lvl;
7193
7194 /*
7195 * Return true if and only if the L3 entry for the specified virtual
7196 * address is allocated but invalid.
7197 */
7198 rv = false;
7199 PMAP_LOCK(pmap);
7200 pde = pmap_pde(pmap, addr, &lvl);
7201 if (pde != NULL && lvl == 2) {
7202 pte = pmap_l2_to_l3(pde, addr);
7203 rv = pmap_load(pte) == 0;
7204 }
7205 PMAP_UNLOCK(pmap);
7206 return (rv);
7207 }
7208
7209 /*
7210 * pmap_is_referenced:
7211 *
7212 * Return whether or not the specified physical page was referenced
7213 * in any physical maps.
7214 */
7215 bool
7216 pmap_is_referenced(vm_page_t m)
7217 {
7218
7219 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7220 ("pmap_is_referenced: page %p is not managed", m));
7221 return (pmap_page_test_mappings(m, true, false));
7222 }
7223
7224 /*
7225 * Clear the write and modified bits in each of the given page's mappings.
7226 */
7227 void
7228 pmap_remove_write(vm_page_t m)
7229 {
7230 struct md_page *pvh;
7231 pmap_t pmap;
7232 struct rwlock *lock;
7233 pv_entry_t next_pv, pv;
7234 pt_entry_t oldpte, *pte, set, clear, mask, val;
7235 vm_offset_t va;
7236 int md_gen, pvh_gen;
7237
7238 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7239 ("pmap_remove_write: page %p is not managed", m));
7240 vm_page_assert_busied(m);
7241
7242 if (!pmap_page_is_write_mapped(m))
7243 return;
7244 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7245 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7246 rw_wlock(lock);
7247 retry:
7248 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7249 pmap = PV_PMAP(pv);
7250 PMAP_ASSERT_STAGE1(pmap);
7251 if (!PMAP_TRYLOCK(pmap)) {
7252 pvh_gen = pvh->pv_gen;
7253 rw_wunlock(lock);
7254 PMAP_LOCK(pmap);
7255 rw_wlock(lock);
7256 if (pvh_gen != pvh->pv_gen) {
7257 PMAP_UNLOCK(pmap);
7258 goto retry;
7259 }
7260 }
7261 va = pv->pv_va;
7262 pte = pmap_pte_exists(pmap, va, 2, __func__);
7263 if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
7264 (void)pmap_demote_l2_locked(pmap, pte, va, &lock);
7265 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7266 ("inconsistent pv lock %p %p for page %p",
7267 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7268 PMAP_UNLOCK(pmap);
7269 }
7270 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7271 pmap = PV_PMAP(pv);
7272 if (!PMAP_TRYLOCK(pmap)) {
7273 pvh_gen = pvh->pv_gen;
7274 md_gen = m->md.pv_gen;
7275 rw_wunlock(lock);
7276 PMAP_LOCK(pmap);
7277 rw_wlock(lock);
7278 if (pvh_gen != pvh->pv_gen ||
7279 md_gen != m->md.pv_gen) {
7280 PMAP_UNLOCK(pmap);
7281 goto retry;
7282 }
7283 }
7284 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7285 oldpte = pmap_load(pte);
7286 if ((oldpte & ATTR_SW_DBM) != 0) {
7287 if ((oldpte & ATTR_CONTIGUOUS) != 0) {
7288 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
7289
7290 /*
7291 * The L3 entry's accessed bit may have
7292 * changed.
7293 */
7294 oldpte = pmap_load(pte);
7295 }
7296 if (pmap->pm_stage == PM_STAGE1) {
7297 set = ATTR_S1_AP_RW_BIT;
7298 clear = 0;
7299 mask = ATTR_S1_AP_RW_BIT;
7300 val = ATTR_S1_AP(ATTR_S1_AP_RW);
7301 } else {
7302 set = 0;
7303 clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7304 mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7305 val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7306 }
7307 clear |= ATTR_SW_DBM;
7308 while (!atomic_fcmpset_64(pte, &oldpte,
7309 (oldpte | set) & ~clear))
7310 cpu_spinwait();
7311
7312 if ((oldpte & mask) == val)
7313 vm_page_dirty(m);
7314 pmap_invalidate_page(pmap, pv->pv_va, true);
7315 }
7316 PMAP_UNLOCK(pmap);
7317 }
7318 rw_wunlock(lock);
7319 vm_page_aflag_clear(m, PGA_WRITEABLE);
7320 }
7321
7322 /*
7323 * pmap_ts_referenced:
7324 *
7325 * Return a count of reference bits for a page, clearing those bits.
7326 * It is not necessary for every reference bit to be cleared, but it
7327 * is necessary that 0 only be returned when there are truly no
7328 * reference bits set.
7329 *
7330 * As an optimization, update the page's dirty field if a modified bit is
7331 * found while counting reference bits. This opportunistic update can be
7332 * performed at low cost and can eliminate the need for some future calls
7333 * to pmap_is_modified(). However, since this function stops after
7334 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7335 * dirty pages. Those dirty pages will only be detected by a future call
7336 * to pmap_is_modified().
7337 */
7338 int
7339 pmap_ts_referenced(vm_page_t m)
7340 {
7341 struct md_page *pvh;
7342 pv_entry_t pv, pvf;
7343 pmap_t pmap;
7344 struct rwlock *lock;
7345 pt_entry_t *pte, tpte;
7346 vm_offset_t va;
7347 vm_paddr_t pa;
7348 int cleared, md_gen, not_cleared, pvh_gen;
7349 struct spglist free;
7350
7351 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7352 ("pmap_ts_referenced: page %p is not managed", m));
7353 SLIST_INIT(&free);
7354 cleared = 0;
7355 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7356 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7357 rw_wlock(lock);
7358 retry:
7359 not_cleared = 0;
7360 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
7361 goto small_mappings;
7362 pv = pvf;
7363 do {
7364 if (pvf == NULL)
7365 pvf = pv;
7366 pmap = PV_PMAP(pv);
7367 if (!PMAP_TRYLOCK(pmap)) {
7368 pvh_gen = pvh->pv_gen;
7369 rw_wunlock(lock);
7370 PMAP_LOCK(pmap);
7371 rw_wlock(lock);
7372 if (pvh_gen != pvh->pv_gen) {
7373 PMAP_UNLOCK(pmap);
7374 goto retry;
7375 }
7376 }
7377 va = pv->pv_va;
7378 pte = pmap_pte_exists(pmap, va, 2, __func__);
7379 tpte = pmap_load(pte);
7380 if (pmap_pte_dirty(pmap, tpte)) {
7381 /*
7382 * Although "tpte" is mapping a 2MB page, because
7383 * this function is called at a 4KB page granularity,
7384 * we only update the 4KB page under test.
7385 */
7386 vm_page_dirty(m);
7387 }
7388 if ((tpte & ATTR_AF) != 0) {
7389 pa = VM_PAGE_TO_PHYS(m);
7390
7391 /*
7392 * Since this reference bit is shared by 512 4KB pages,
7393 * it should not be cleared every time it is tested.
7394 * Apply a simple "hash" function on the physical page
7395 * number, the virtual superpage number, and the pmap
7396 * address to select one 4KB page out of the 512 on
7397 * which testing the reference bit will result in
7398 * clearing that reference bit. This function is
7399 * designed to avoid the selection of the same 4KB page
7400 * for every 2MB page mapping.
7401 *
7402 * On demotion, a mapping that hasn't been referenced
7403 * is simply destroyed. To avoid the possibility of a
7404 * subsequent page fault on a demoted wired mapping,
7405 * always leave its reference bit set. Moreover,
7406 * since the superpage is wired, the current state of
7407 * its reference bit won't affect page replacement.
7408 */
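/*
 * With 4KB pages, Ln_ENTRIES is 512 and (pa >> PAGE_SHIFT) takes a
 * distinct low-order value for each of the superpage's 512 constituent
 * pages, so exactly one of those pages satisfies the test below for a
 * given 2MB mapping.
 */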
7409 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
7410 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
7411 (tpte & ATTR_SW_WIRED) == 0) {
7412 pmap_clear_bits(pte, ATTR_AF);
7413 pmap_invalidate_page(pmap, va, true);
7414 cleared++;
7415 } else
7416 not_cleared++;
7417 }
7418 PMAP_UNLOCK(pmap);
7419 /* Rotate the PV list if it has more than one entry. */
7420 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7421 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7422 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
7423 pvh->pv_gen++;
7424 }
7425 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
7426 goto out;
7427 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
7428 small_mappings:
7429 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
7430 goto out;
7431 pv = pvf;
7432 do {
7433 if (pvf == NULL)
7434 pvf = pv;
7435 pmap = PV_PMAP(pv);
7436 if (!PMAP_TRYLOCK(pmap)) {
7437 pvh_gen = pvh->pv_gen;
7438 md_gen = m->md.pv_gen;
7439 rw_wunlock(lock);
7440 PMAP_LOCK(pmap);
7441 rw_wlock(lock);
7442 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7443 PMAP_UNLOCK(pmap);
7444 goto retry;
7445 }
7446 }
7447 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7448 tpte = pmap_load(pte);
7449 if (pmap_pte_dirty(pmap, tpte))
7450 vm_page_dirty(m);
7451 if ((tpte & ATTR_AF) != 0) {
7452 if ((tpte & ATTR_SW_WIRED) == 0) {
7453 /*
7454 * Clear the accessed bit in this L3 entry
7455 * regardless of the contiguous bit.
7456 */
7457 pmap_clear_bits(pte, ATTR_AF);
7458 pmap_invalidate_page(pmap, pv->pv_va, true);
7459 cleared++;
7460 } else
7461 not_cleared++;
7462 } else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
7463 (pmap_load_l3c(pte) & ATTR_AF) != 0) {
7464 /*
7465 * An L3C superpage mapping is regarded as accessed
7466 * until the accessed bit has been cleared in all
7467 * of its constituent entries.
7468 */
7469 not_cleared++;
7470 }
7471 PMAP_UNLOCK(pmap);
7472 /* Rotate the PV list if it has more than one entry. */
7473 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7474 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7475 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7476 m->md.pv_gen++;
7477 }
7478 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
7479 not_cleared < PMAP_TS_REFERENCED_MAX);
7480 out:
7481 rw_wunlock(lock);
7482 vm_page_free_pages_toq(&free, true);
7483 return (cleared + not_cleared);
7484 }
7485
7486 /*
7487 * Apply the given advice to the specified range of addresses within the
7488 * given pmap. Depending on the advice, clear the referenced and/or
7489 * modified flags in each mapping and set the mapped page's dirty field.
7490 */
7491 void
7492 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7493 {
7494 struct rwlock *lock;
7495 vm_offset_t va, va_next, dva;
7496 vm_page_t m;
7497 pd_entry_t *l0, *l1, *l2, oldl2;
7498 pt_entry_t *l3, *dl3, oldl3;
7499
7500 PMAP_ASSERT_STAGE1(pmap);
7501
7502 if (advice != MADV_DONTNEED && advice != MADV_FREE)
7503 return;
7504
7505 PMAP_LOCK(pmap);
7506 for (; sva < eva; sva = va_next) {
7507 l0 = pmap_l0(pmap, sva);
7508 if (pmap_load(l0) == 0) {
7509 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
7510 if (va_next < sva)
7511 va_next = eva;
7512 continue;
7513 }
7514
7515 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
7516 if (va_next < sva)
7517 va_next = eva;
7518 l1 = pmap_l0_to_l1(l0, sva);
7519 if (pmap_load(l1) == 0)
7520 continue;
7521 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
7522 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7523 continue;
7524 }
7525
7526 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
7527 if (va_next < sva)
7528 va_next = eva;
7529 l2 = pmap_l1_to_l2(l1, sva);
7530 oldl2 = pmap_load(l2);
7531 if (oldl2 == 0)
7532 continue;
7533 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
7534 if ((oldl2 & ATTR_SW_MANAGED) == 0)
7535 continue;
7536 lock = NULL;
7537 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
7538 if (lock != NULL)
7539 rw_wunlock(lock);
7540
7541 /*
7542 * The 2MB page mapping was destroyed.
7543 */
7544 continue;
7545 }
7546
7547 /*
7548 * Unless the page mappings are wired, remove the
7549 * mapping to a single page so that a subsequent
7550 * access may repromote. Choosing the last page
7551 * within the address range [sva, min(va_next, eva))
7552 * generally results in more repromotions. Since the
7553 * underlying page table page is fully populated, this
7554 * removal never frees a page table page.
7555 */
7556 if ((oldl2 & ATTR_SW_WIRED) == 0) {
7557 va = eva;
7558 if (va > va_next)
7559 va = va_next;
7560 va -= PAGE_SIZE;
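/*
 * "va" is now the last small page below min(va_next, eva), i.e., the
 * final 4KB page of the demoted 2MB mapping that lies within the range
 * being advised.
 */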
7561 KASSERT(va >= sva,
7562 ("pmap_advise: no address gap"));
7563 l3 = pmap_l2_to_l3(l2, va);
7564 KASSERT(pmap_load(l3) != 0,
7565 ("pmap_advise: invalid PTE"));
7566 pmap_remove_l3(pmap, l3, va, pmap_load(l2),
7567 NULL, &lock);
7568 }
7569 if (lock != NULL)
7570 rw_wunlock(lock);
7571 }
7572 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
7573 ("pmap_advise: invalid L2 entry after demotion"));
7574 if (va_next > eva)
7575 va_next = eva;
7576 va = va_next;
7577 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
7578 sva += L3_SIZE) {
7579 oldl3 = pmap_load(l3);
7580 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
7581 (ATTR_SW_MANAGED | L3_PAGE))
7582 goto maybe_invlrng;
7583 else if (pmap_pte_dirty(pmap, oldl3)) {
7584 if (advice == MADV_DONTNEED) {
7585 /*
7586 * Future calls to pmap_is_modified()
7587 * can be avoided by making the page
7588 * dirty now.
7589 */
7590 m = PTE_TO_VM_PAGE(oldl3);
7591 vm_page_dirty(m);
7592 }
7593 if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
7594 /*
7595 * Unconditionally demote the L3C
7596 * superpage because we do not allow
7597 * writeable, clean superpages.
7598 */
7599 (void)pmap_demote_l3c(pmap, l3, sva);
7600
7601 /*
7602 * Destroy the final mapping before the
7603 * next L3C boundary or va_next,
7604 * whichever comes first, so that a
7605 * subsequent access may act as a
7606 * repromotion trigger.
7607 */
7608 if ((oldl3 & ATTR_SW_WIRED) == 0) {
7609 dva = MIN((sva & ~L3C_OFFSET) +
7610 L3C_SIZE - PAGE_SIZE,
7611 va_next - PAGE_SIZE);
7612 dl3 = pmap_l2_to_l3(l2, dva);
7613 KASSERT(pmap_load(dl3) != 0,
7614 ("pmap_advise: invalid PTE"));
7615 lock = NULL;
7616 pmap_remove_l3(pmap, dl3, dva,
7617 pmap_load(l2), NULL, &lock);
7618 if (lock != NULL)
7619 rw_wunlock(lock);
7620 }
7621
7622 /*
7623 * The L3 entry's accessed bit may have
7624 * changed.
7625 */
7626 oldl3 = pmap_load(l3);
7627 }
7628
7629 /*
7630 * Check that we did not just destroy this entry so
7631 * we avoid corrupting the page table.
7632 */
7633 if (oldl3 != 0) {
7634 while (!atomic_fcmpset_long(l3, &oldl3,
7635 (oldl3 & ~ATTR_AF) |
7636 ATTR_S1_AP(ATTR_S1_AP_RO)))
7637 cpu_spinwait();
7638 }
7639 } else if ((oldl3 & ATTR_AF) != 0) {
7640 /*
7641 * Clear the accessed bit in this L3 entry
7642 * regardless of the contiguous bit.
7643 */
7644 pmap_clear_bits(l3, ATTR_AF);
7645 } else
7646 goto maybe_invlrng;
7647 if (va == va_next)
7648 va = sva;
7649 continue;
7650 maybe_invlrng:
7651 if (va != va_next) {
7652 pmap_s1_invalidate_range(pmap, va, sva, true);
7653 va = va_next;
7654 }
7655 }
7656 if (va != va_next)
7657 pmap_s1_invalidate_range(pmap, va, sva, true);
7658 }
7659 PMAP_UNLOCK(pmap);
7660 }
7661
7662 /*
7663 * Clear the modify bits on the specified physical page.
7664 */
7665 void
7666 pmap_clear_modify(vm_page_t m)
7667 {
7668 struct md_page *pvh;
7669 struct rwlock *lock;
7670 pmap_t pmap;
7671 pv_entry_t next_pv, pv;
7672 pd_entry_t *l2, oldl2;
7673 pt_entry_t *l3, oldl3;
7674 vm_offset_t va;
7675 int md_gen, pvh_gen;
7676
7677 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7678 ("pmap_clear_modify: page %p is not managed", m));
7679 vm_page_assert_busied(m);
7680
7681 if (!pmap_page_is_write_mapped(m))
7682 return;
7683 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7684 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7685 rw_wlock(lock);
7686 restart:
7687 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7688 pmap = PV_PMAP(pv);
7689 PMAP_ASSERT_STAGE1(pmap);
7690 if (!PMAP_TRYLOCK(pmap)) {
7691 pvh_gen = pvh->pv_gen;
7692 rw_wunlock(lock);
7693 PMAP_LOCK(pmap);
7694 rw_wlock(lock);
7695 if (pvh_gen != pvh->pv_gen) {
7696 PMAP_UNLOCK(pmap);
7697 goto restart;
7698 }
7699 }
7700 va = pv->pv_va;
7701 l2 = pmap_l2(pmap, va);
7702 oldl2 = pmap_load(l2);
7703 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7704 if ((oldl2 & ATTR_SW_DBM) != 0 &&
7705 pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7706 (oldl2 & ATTR_SW_WIRED) == 0) {
7707 /*
7708 * Write protect the mapping to a single page so that
7709 * a subsequent write access may repromote.
7710 */
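/*
 * Convert "va" from the 2MB mapping's base address to the address of
 * the 4KB page within it that maps "m".
 */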
7711 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7712 l3 = pmap_l2_to_l3(l2, va);
7713 oldl3 = pmap_load(l3);
7714 while (!atomic_fcmpset_long(l3, &oldl3,
7715 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
7716 cpu_spinwait();
7717 vm_page_dirty(m);
7718 pmap_s1_invalidate_page(pmap, va, true);
7719 }
7720 PMAP_UNLOCK(pmap);
7721 }
7722 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7723 pmap = PV_PMAP(pv);
7724 PMAP_ASSERT_STAGE1(pmap);
7725 if (!PMAP_TRYLOCK(pmap)) {
7726 md_gen = m->md.pv_gen;
7727 pvh_gen = pvh->pv_gen;
7728 rw_wunlock(lock);
7729 PMAP_LOCK(pmap);
7730 rw_wlock(lock);
7731 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7732 PMAP_UNLOCK(pmap);
7733 goto restart;
7734 }
7735 }
7736 l2 = pmap_l2(pmap, pv->pv_va);
7737 l3 = pmap_l2_to_l3(l2, pv->pv_va);
7738 oldl3 = pmap_load(l3);
7739 KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
7740 (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
7741 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
7742 ("writeable L3C superpage not dirty"));
7743 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
7744 if ((oldl3 & ATTR_CONTIGUOUS) != 0)
7745 (void)pmap_demote_l3c(pmap, l3, pv->pv_va);
7746 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
7747 pmap_s1_invalidate_page(pmap, pv->pv_va, true);
7748 }
7749 PMAP_UNLOCK(pmap);
7750 }
7751 rw_wunlock(lock);
7752 }
7753
7754 void *
7755 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7756 {
7757 struct pmap_preinit_mapping *ppim;
7758 vm_offset_t va, offset;
7759 pd_entry_t old_l2e, *pde;
7760 pt_entry_t *l2;
7761 int i, lvl, l2_blocks, free_l2_count, start_idx;
7762
7763 if (!vm_initialized) {
7764 /*
7765 * No L3 ptables so map entire L2 blocks where start VA is:
7766 * preinit_map_va + start_idx * L2_SIZE
7767 * There may be duplicate mappings (multiple VA -> same PA) but
7768 * ARM64 dcache is always PIPT so that's acceptable.
7769 */
7770 if (size == 0)
7771 return (NULL);
7772
7773 /* Calculate how many L2 blocks are needed for the mapping */
7774 l2_blocks = (roundup2(pa + size, L2_SIZE) -
7775 rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
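/*
 * For example, a 3MiB request that begins 1MiB into an L2 block spans
 * 4MiB after the rounding above, so l2_blocks is 2.
 */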
7776
7777 offset = pa & L2_OFFSET;
7778
7779 if (preinit_map_va == 0)
7780 return (NULL);
7781
7782 /* Map 2MiB L2 blocks from reserved VA space */
7783
7784 free_l2_count = 0;
7785 start_idx = -1;
7786 /* Find enough free contiguous VA space */
7787 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7788 ppim = pmap_preinit_mapping + i;
7789 if (free_l2_count > 0 && ppim->pa != 0) {
7790 /* Not enough space here */
7791 free_l2_count = 0;
7792 start_idx = -1;
7793 continue;
7794 }
7795
7796 if (ppim->pa == 0) {
7797 /* Free L2 block */
7798 if (start_idx == -1)
7799 start_idx = i;
7800 free_l2_count++;
7801 if (free_l2_count == l2_blocks)
7802 break;
7803 }
7804 }
7805 if (free_l2_count != l2_blocks)
7806 panic("%s: too many preinit mappings", __func__);
7807
7808 va = preinit_map_va + (start_idx * L2_SIZE);
7809 for (i = start_idx; i < start_idx + l2_blocks; i++) {
7810 /* Mark entries as allocated */
7811 ppim = pmap_preinit_mapping + i;
7812 ppim->pa = pa;
7813 ppim->va = va + offset;
7814 ppim->size = size;
7815 }
7816
7817 /* Map L2 blocks */
7818 pa = rounddown2(pa, L2_SIZE);
7819 old_l2e = 0;
7820 for (i = 0; i < l2_blocks; i++) {
7821 pde = pmap_pde(kernel_pmap, va, &lvl);
7822 KASSERT(pde != NULL,
7823 ("pmap_mapbios: Invalid page entry, va: 0x%lx",
7824 va));
7825 KASSERT(lvl == 1,
7826 ("pmap_mapbios: Invalid level %d", lvl));
7827
7828 /* Insert L2_BLOCK */
7829 l2 = pmap_l1_to_l2(pde, va);
7830 old_l2e |= pmap_load_store(l2,
7831 PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
7832 ATTR_S1_XN | ATTR_KERN_GP |
7833 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
7834
7835 va += L2_SIZE;
7836 pa += L2_SIZE;
7837 }
7838 if ((old_l2e & ATTR_DESCR_VALID) != 0)
7839 pmap_s1_invalidate_all(kernel_pmap);
7840 else {
7841 /*
7842 * Because the old entries were invalid and the new
7843 * mappings are not executable, an isb is not required.
7844 */
7845 dsb(ishst);
7846 }
7847
7848 va = preinit_map_va + (start_idx * L2_SIZE);
7849
7850 } else {
7851 /* kva_alloc may be used to map the pages */
7852 offset = pa & PAGE_MASK;
7853 size = round_page(offset + size);
7854
7855 va = kva_alloc(size);
7856 if (va == 0)
7857 panic("%s: Couldn't allocate KVA", __func__);
7858
7859 pde = pmap_pde(kernel_pmap, va, &lvl);
7860 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
7861
7862 /* L3 table is linked */
7863 va = trunc_page(va);
7864 pa = trunc_page(pa);
7865 pmap_kenter(va, size, pa, memory_mapping_mode(pa));
7866 }
7867
7868 return ((void *)(va + offset));
7869 }
7870
7871 void
7872 pmap_unmapbios(void *p, vm_size_t size)
7873 {
7874 struct pmap_preinit_mapping *ppim;
7875 vm_offset_t offset, va, va_trunc;
7876 pd_entry_t *pde;
7877 pt_entry_t *l2;
7878 int i, lvl, l2_blocks, block;
7879 bool preinit_map;
7880
7881 va = (vm_offset_t)p;
7882 l2_blocks =
7883 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
7884 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
7885
7886 /* Remove preinit mapping */
7887 preinit_map = false;
7888 block = 0;
7889 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7890 ppim = pmap_preinit_mapping + i;
7891 if (ppim->va == va) {
7892 KASSERT(ppim->size == size,
7893 ("pmap_unmapbios: size mismatch"));
7894 ppim->va = 0;
7895 ppim->pa = 0;
7896 ppim->size = 0;
7897 preinit_map = true;
7898 offset = block * L2_SIZE;
7899 va_trunc = rounddown2(va, L2_SIZE) + offset;
7900
7901 /* Remove L2_BLOCK */
7902 pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
7903 KASSERT(pde != NULL,
7904 ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
7905 va_trunc));
7906 l2 = pmap_l1_to_l2(pde, va_trunc);
7907 pmap_clear(l2);
7908
7909 if (block == (l2_blocks - 1))
7910 break;
7911 block++;
7912 }
7913 }
7914 if (preinit_map) {
7915 pmap_s1_invalidate_all(kernel_pmap);
7916 return;
7917 }
7918
7919 /* Unmap the pages reserved with kva_alloc. */
7920 if (vm_initialized) {
7921 offset = va & PAGE_MASK;
7922 size = round_page(offset + size);
7923 va = trunc_page(va);
7924
7925 /* Unmap and invalidate the pages */
7926 pmap_kremove_device(va, size);
7927
7928 kva_free(va, size);
7929 }
7930 }
7931
7932 /*
7933 * Sets the memory attribute for the specified page.
7934 */
7935 void
7936 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7937 {
7938
7939 m->md.pv_memattr = ma;
7940
7941 /*
7942 * If "m" is a normal page, update its direct mapping. This update
7943 * can be relied upon to perform any cache operations that are
7944 * required for data coherence.
7945 */
7946 if ((m->flags & PG_FICTITIOUS) == 0 &&
7947 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7948 m->md.pv_memattr) != 0)
7949 panic("memory attribute change on the direct map failed");
7950 }
7951
7952 /*
7953 * Changes the specified virtual address range's memory type to that given by
7954 * the parameter "mode". The specified virtual address range must be
7955 * completely contained within either the direct map or the kernel map. If
7956 * the virtual address range is contained within the kernel map, then the
7957 * memory type for each of the corresponding ranges of the direct map is also
7958 * changed. (The corresponding ranges of the direct map are those ranges that
7959 * map the same physical pages as the specified virtual address range.) These
7960 * changes to the direct map are necessary because Intel describes the
7961 * behavior of their processors as "undefined" if two or more mappings to the
7962 * same physical page have different memory types.
7963 *
7964 * Returns zero if the change completed successfully, and either EINVAL or
7965 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
7966 * of the virtual address range was not mapped, and ENOMEM is returned if
7967 * there was insufficient memory available to complete the change. In the
7968 * latter case, the memory type may have been changed on some part of the
7969 * virtual address range or the direct map.
7970 */
7971 int
7972 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7973 {
7974 int error;
7975
7976 PMAP_LOCK(kernel_pmap);
7977 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
7978 PMAP_UNLOCK(kernel_pmap);
7979 return (error);
7980 }
7981
7982 /*
7983 * Changes the specified virtual address range's protections to those
7984 * specified by "prot". Like pmap_change_attr(), protections for aliases
7985 * in the direct map are updated as well. Protections on aliasing mappings may
7986 * be a subset of the requested protections; for example, mappings in the direct
7987 * map are never executable.
7988 */
7989 int
7990 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
7991 {
7992 int error;
7993
7994 /* Only supported within the kernel map. */
7995 if (va < VM_MIN_KERNEL_ADDRESS)
7996 return (EINVAL);
7997
7998 PMAP_LOCK(kernel_pmap);
7999 error = pmap_change_props_locked(va, size, prot, -1, false);
8000 PMAP_UNLOCK(kernel_pmap);
8001 return (error);
8002 }
8003
8004 static int
8005 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
8006 int mode, bool skip_unmapped)
8007 {
8008 vm_offset_t base, offset, tmpva;
8009 vm_size_t pte_size;
8010 vm_paddr_t pa;
8011 pt_entry_t pte, *ptep, *newpte;
8012 pt_entry_t bits, mask;
8013 int lvl, rv;
8014
8015 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
8016 base = trunc_page(va);
8017 offset = va & PAGE_MASK;
8018 size = round_page(offset + size);
8019
8020 if (!VIRT_IN_DMAP(base) &&
8021 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
8022 return (EINVAL);
8023
8024 bits = 0;
8025 mask = 0;
8026 if (mode != -1) {
8027 bits = ATTR_S1_IDX(mode);
8028 mask = ATTR_S1_IDX_MASK;
8029 if (mode == VM_MEMATTR_DEVICE) {
8030 mask |= ATTR_S1_XN;
8031 bits |= ATTR_S1_XN;
8032 }
8033 }
8034 if (prot != VM_PROT_NONE) {
8035 /* Don't mark the DMAP as executable. It never is on arm64. */
8036 if (VIRT_IN_DMAP(base)) {
8037 prot &= ~VM_PROT_EXECUTE;
8038 /*
8039 * XXX Mark the DMAP as writable for now. We rely
8040 * on this in ddb & dtrace to insert breakpoint
8041 * instructions.
8042 */
8043 prot |= VM_PROT_WRITE;
8044 }
8045
8046 if ((prot & VM_PROT_WRITE) == 0) {
8047 bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
8048 }
8049 if ((prot & VM_PROT_EXECUTE) == 0) {
8050 bits |= ATTR_S1_PXN;
8051 }
8052 bits |= ATTR_S1_UXN;
8053 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
8054 }
8055
8056 for (tmpva = base; tmpva < base + size; ) {
8057 ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
8058 if (ptep == NULL && !skip_unmapped) {
8059 return (EINVAL);
8060 } else if ((ptep == NULL && skip_unmapped) ||
8061 (pmap_load(ptep) & mask) == bits) {
8062 /*
8063 * We already have the correct attribute or there
8064 * is no memory mapped at this address and we are
8065 * skipping unmapped memory.
8066 */
8067 switch (lvl) {
8068 default:
8069 panic("Invalid DMAP table level: %d\n", lvl);
8070 case 1:
8071 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
8072 break;
8073 case 2:
8074 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
8075 break;
8076 case 3:
8077 tmpva += PAGE_SIZE;
8078 break;
8079 }
8080 } else {
8081 /* We can't demote/promote this entry */
8082 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
8083
8084 /*
8085 * Find the entry and demote it if the requested change
8086 * only applies to part of the address range mapped by
8087 * the entry.
8088 */
8089 switch (lvl) {
8090 default:
8091 panic("Invalid DMAP table level: %d\n", lvl);
8092 case 1:
8093 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8094 if ((tmpva & L1_OFFSET) == 0 &&
8095 (base + size - tmpva) >= L1_SIZE) {
8096 pte_size = L1_SIZE;
8097 break;
8098 }
8099 newpte = pmap_demote_l1(kernel_pmap, ptep,
8100 tmpva & ~L1_OFFSET);
8101 if (newpte == NULL)
8102 return (EINVAL);
8103 ptep = pmap_l1_to_l2(ptep, tmpva);
8104 /* FALLTHROUGH */
8105 case 2:
8106 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
8107 if ((tmpva & L2C_OFFSET) == 0 &&
8108 (base + size - tmpva) >= L2C_SIZE) {
8109 pte_size = L2C_SIZE;
8110 break;
8111 }
8112 if (!pmap_demote_l2c(kernel_pmap, ptep,
8113 tmpva))
8114 return (EINVAL);
8115 }
8116 if ((tmpva & L2_OFFSET) == 0 &&
8117 (base + size - tmpva) >= L2_SIZE) {
8118 pte_size = L2_SIZE;
8119 break;
8120 }
8121 newpte = pmap_demote_l2(kernel_pmap, ptep,
8122 tmpva);
8123 if (newpte == NULL)
8124 return (EINVAL);
8125 ptep = pmap_l2_to_l3(ptep, tmpva);
8126 /* FALLTHROUGH */
8127 case 3:
8128 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
8129 if ((tmpva & L3C_OFFSET) == 0 &&
8130 (base + size - tmpva) >= L3C_SIZE) {
8131 pte_size = L3C_SIZE;
8132 break;
8133 }
8134 if (!pmap_demote_l3c(kernel_pmap, ptep,
8135 tmpva))
8136 return (EINVAL);
8137 }
8138 pte_size = PAGE_SIZE;
8139 break;
8140 }
8141
8142 /* Update the entry */
8143 pte = pmap_load(ptep);
8144 pte &= ~mask;
8145 pte |= bits;
8146
8147 switch (pte_size) {
8148 case L2C_SIZE:
8149 pmap_update_strided(kernel_pmap, ptep, ptep +
8150 L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
8151 break;
8152 case L3C_SIZE:
8153 pmap_update_strided(kernel_pmap, ptep, ptep +
8154 L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
8155 break;
8156 default:
8157 /*
8158 * We are updating a single block or page entry,
8159 * so regardless of pte_size pass PAGE_SIZE in
8160 * order that a single TLB invalidation is
8161 * performed.
8162 */
8163 pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
8164 PAGE_SIZE);
8165 break;
8166 }
8167
8168 pa = PTE_TO_PHYS(pte);
8169 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
8170 /*
8171 * Keep the DMAP memory in sync.
8172 */
8173 rv = pmap_change_props_locked(
8174 PHYS_TO_DMAP(pa), pte_size,
8175 prot, mode, true);
8176 if (rv != 0)
8177 return (rv);
8178 }
8179
8180 /*
8181 * If moving to a non-cacheable entry flush
8182 * the cache.
8183 */
8184 if (mode == VM_MEMATTR_UNCACHEABLE)
8185 cpu_dcache_wbinv_range((void *)tmpva, pte_size);
8186 tmpva += pte_size;
8187 }
8188 }
8189
8190 return (0);
8191 }
8192
8193 /*
8194 * Create an L2 table to map all addresses within an L1 mapping.
8195 */
8196 static pt_entry_t *
8197 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
8198 {
8199 pt_entry_t *l2, newl2, oldl1;
8200 vm_offset_t tmpl1;
8201 vm_paddr_t l2phys, phys;
8202 vm_page_t ml2;
8203 int i;
8204
8205 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8206 oldl1 = pmap_load(l1);
8207 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8208 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
8209 ("pmap_demote_l1: Demoting a non-block entry"));
8210 KASSERT((va & L1_OFFSET) == 0,
8211 ("pmap_demote_l1: Invalid virtual address %#lx", va));
8212 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
8213 ("pmap_demote_l1: Level 1 table shouldn't be managed"));
8214 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
8215 ("pmap_demote_l1: Demoting entry with no-demote flag set"));
8216
8217 tmpl1 = 0;
8218 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
8219 tmpl1 = kva_alloc(PAGE_SIZE);
8220 if (tmpl1 == 0)
8221 return (NULL);
8222 }
8223
8224 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
8225 NULL) {
8226 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
8227 " in pmap %p", va, pmap);
8228 l2 = NULL;
8229 goto fail;
8230 }
8231
8232 l2phys = VM_PAGE_TO_PHYS(ml2);
8233 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
8234
8235 /* The physical address that the old L1 block mapping points at */
8236 phys = PTE_TO_PHYS(oldl1);
8237 /* The attributes from the old L1 entry to be copied */
8238 newl2 = oldl1 & ATTR_MASK;
8239
8240 /* Create the new entries */
8241 newl2 |= ATTR_CONTIGUOUS;
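/*
 * With ATTR_CONTIGUOUS set, the Ln_ENTRIES L2 blocks written below are
 * grouped into L2C superpage mappings covering the demoted L1 block.
 */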
8242 for (i = 0; i < Ln_ENTRIES; i++) {
8243 l2[i] = newl2 | phys;
8244 phys += L2_SIZE;
8245 }
8246 KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) |
8247 L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0],
8248 ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
8249
8250 if (tmpl1 != 0) {
8251 pmap_kenter(tmpl1, PAGE_SIZE,
8252 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
8253 VM_MEMATTR_WRITE_BACK);
8254 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
8255 }
8256
8257 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
8258
8259 counter_u64_add(pmap_l1_demotions, 1);
8260 fail:
8261 if (tmpl1 != 0) {
8262 pmap_kremove(tmpl1);
8263 kva_free(tmpl1, PAGE_SIZE);
8264 }
8265
8266 return (l2);
8267 }
8268
8269 static void
8270 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8271 {
8272 pt_entry_t *l3;
8273
8274 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8275 *l3 = newl3;
8276 newl3 += L3_SIZE;
8277 }
8278 }
8279
8280 static void
8281 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8282 {
8283 #ifdef INVARIANTS
8284 #ifdef DIAGNOSTIC
8285 pt_entry_t *xl3p, *yl3p;
8286
8287 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8288 xl3p++, newl3e += PAGE_SIZE) {
8289 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8290 printf("pmap_demote_l2: xl3e %zd and newl3e map "
8291 "different pages: found %#lx, expected %#lx\n",
8292 xl3p - firstl3p, pmap_load(xl3p), newl3e);
8293 printf("page table dump\n");
8294 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8295 yl3p++) {
8296 printf("%zd %#lx\n", yl3p - firstl3p,
8297 pmap_load(yl3p));
8298 }
8299 panic("firstpte");
8300 }
8301 }
8302 #else
8303 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8304 ("pmap_demote_l2: firstl3 and newl3e map different physical"
8305 " addresses"));
8306 #endif
8307 #endif
8308 }
8309
8310 static void
8311 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8312 struct rwlock **lockp)
8313 {
8314 struct spglist free;
8315
8316 SLIST_INIT(&free);
8317 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
8318 lockp);
8319 vm_page_free_pages_toq(&free, true);
8320 }
8321
8322 /*
8323 * Create an L3 table to map all addresses within an L2 mapping.
8324 */
8325 static pt_entry_t *
8326 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8327 struct rwlock **lockp)
8328 {
8329 pt_entry_t *l3, newl3, oldl2;
8330 vm_offset_t tmpl2;
8331 vm_paddr_t l3phys;
8332 vm_page_t ml3;
8333
8334 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8335 PMAP_ASSERT_STAGE1(pmap);
8336 KASSERT(ADDR_IS_CANONICAL(va),
8337 ("%s: Address not in canonical form: %lx", __func__, va));
8338
8339 l3 = NULL;
8340 oldl2 = pmap_load(l2);
8341 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8342 ("pmap_demote_l2: Demoting a non-block entry"));
8343 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8344 ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8345 va &= ~L2_OFFSET;
8346
8347 tmpl2 = 0;
8348 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8349 tmpl2 = kva_alloc(PAGE_SIZE);
8350 if (tmpl2 == 0)
8351 return (NULL);
8352 }
8353
8354 /*
8355 * Invalidate the 2MB page mapping and return "failure" if the
8356 * mapping was never accessed.
8357 */
8358 if ((oldl2 & ATTR_AF) == 0) {
8359 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8360 ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
8361 pmap_demote_l2_abort(pmap, va, l2, lockp);
8362 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
8363 va, pmap);
8364 goto fail;
8365 }
8366
8367 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8368 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8369 ("pmap_demote_l2: page table page for a wired mapping"
8370 " is missing"));
8371
8372 /*
8373 * If the page table page is missing and the mapping
8374 * is for a kernel address, the mapping must belong to
8375 * either the direct map or the early kernel memory.
8376 * Page table pages are preallocated for every other
8377 * part of the kernel address space, so the direct map
8378 * region and early kernel memory are the only parts of the
8379 * kernel address space that must be handled here.
8380 */
8381 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
8382 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8383 ("pmap_demote_l2: No saved mpte for va %#lx", va));
8384
8385 /*
8386 * If the 2MB page mapping belongs to the direct map
8387 * region of the kernel's address space, then the page
8388 * allocation request specifies the highest possible
8389 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
8390 * priority is normal.
8391 */
8392 ml3 = vm_page_alloc_noobj(
8393 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8394 VM_ALLOC_WIRED);
8395
8396 /*
8397 * If the allocation of the new page table page fails,
8398 * invalidate the 2MB page mapping and return "failure".
8399 */
8400 if (ml3 == NULL) {
8401 pmap_demote_l2_abort(pmap, va, l2, lockp);
8402 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8403 " in pmap %p", va, pmap);
8404 goto fail;
8405 }
8406 ml3->pindex = pmap_l2_pindex(va);
8407
8408 if (!ADDR_IS_KERNEL(va)) {
8409 ml3->ref_count = NL3PG;
8410 pmap_resident_count_inc(pmap, 1);
8411 }
8412 }
8413 l3phys = VM_PAGE_TO_PHYS(ml3);
8414 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
8415 newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8416 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8417 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8418 ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8419
8420 /*
8421 * If the PTP is not leftover from an earlier promotion or it does not
8422 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all
8423 * have ATTR_AF set.
8424 *
8425 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8426 * performs a dsb(). That dsb() ensures that the stores for filling
8427 * "l3" are visible before "l3" is added to the page table.
8428 */
8429 if (!vm_page_all_valid(ml3))
8430 pmap_fill_l3(l3, newl3);
8431
8432 pmap_demote_l2_check(l3, newl3);
8433
8434 /*
8435 * If the mapping has changed attributes, update the L3Es.
8436 */
8437 if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8438 pmap_fill_l3(l3, newl3);
8439
8440 /*
8441 * Map the temporary page so we don't lose access to the l2 table.
8442 */
8443 if (tmpl2 != 0) {
8444 pmap_kenter(tmpl2, PAGE_SIZE,
8445 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
8446 VM_MEMATTR_WRITE_BACK);
8447 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8448 }
8449
8450 /*
8451 * The spare PV entries must be reserved prior to demoting the
8452 * mapping, that is, prior to changing the PDE. Otherwise, the state
8453 * of the L2 and the PV lists will be inconsistent, which can result
8454 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8455 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8456 * PV entry for the 2MB page mapping that is being demoted.
8457 */
8458 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8459 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8460
8461 /*
8462 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8463 * the 2MB page mapping.
8464 */
8465 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8466
8467 /*
8468 * Demote the PV entry.
8469 */
8470 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8471 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8472
8473 atomic_add_long(&pmap_l2_demotions, 1);
8474 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8475 " in pmap %p %lx", va, pmap, l3[0]);
8476
8477 fail:
8478 if (tmpl2 != 0) {
8479 pmap_kremove(tmpl2);
8480 kva_free(tmpl2, PAGE_SIZE);
8481 }
8482
8483 return (l3);
8484
8485 }
8486
8487 static pt_entry_t *
8488 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8489 {
8490 struct rwlock *lock;
8491 pt_entry_t *l3;
8492
8493 lock = NULL;
8494 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8495 if (lock != NULL)
8496 rw_wunlock(lock);
8497 return (l3);
8498 }
8499
8500 /*
8501 * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
8502 */
8503 static bool
8504 pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
8505 {
8506 pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
8507 vm_offset_t tmpl3;
8508 register_t intr;
8509
8510 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8511 PMAP_ASSERT_STAGE1(pmap);
8512 l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
8513 sizeof(pd_entry_t)) - 1));
8514 l2c_end = l2c_start + L2C_ENTRIES;
8515 tmpl3 = 0;
8516 if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
8517 (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
8518 tmpl3 = kva_alloc(PAGE_SIZE);
8519 if (tmpl3 == 0)
8520 return (false);
8521 pmap_kenter(tmpl3, PAGE_SIZE,
8522 DMAP_TO_PHYS((vm_offset_t)l2c_start) & ~L3_OFFSET,
8523 VM_MEMATTR_WRITE_BACK);
8524 l2c_start = (pd_entry_t *)(tmpl3 +
8525 ((vm_offset_t)l2c_start & PAGE_MASK));
8526 l2c_end = (pd_entry_t *)(tmpl3 +
8527 ((vm_offset_t)l2c_end & PAGE_MASK));
8528 }
8529 mask = 0;
8530 nbits = ATTR_DESCR_VALID;
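/*
 * "nbits" accumulates ATTR_DESCR_VALID plus ATTR_AF from any accessed
 * entry, and "mask" selects ATTR_S1_AP_RW_BIT for clearing when any
 * entry is dirty; both are applied to every entry when the mappings are
 * remade below.
 */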
8531 intr = intr_disable();
8532
8533 /*
8534 * Break the mappings.
8535 */
8536 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8537 /*
8538 * Clear the mapping's contiguous and valid bits, but leave
8539 * the rest of the entry unchanged, so that a lockless,
8540 * concurrent pmap_kextract() can still lookup the physical
8541 * address.
8542 */
8543 l2e = pmap_load(tl2p);
8544 KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
8545 ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
8546 KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8547 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8548 ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
8549 while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
8550 ATTR_DESCR_VALID)))
8551 cpu_spinwait();
8552
8553 /*
8554 * Hardware accessed and dirty bit maintenance might only
8555 * update a single L2 entry, so we must combine the accessed
8556 * and dirty bits from this entire set of contiguous L2
8557 * entries.
8558 */
8559 if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8560 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8561 mask = ATTR_S1_AP_RW_BIT;
8562 nbits |= l2e & ATTR_AF;
8563 }
8564 if ((nbits & ATTR_AF) != 0) {
8565 pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
8566 L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
8567 }
8568
8569 /*
8570 * Remake the mappings, updating the accessed and dirty bits.
8571 */
8572 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8573 l2e = pmap_load(tl2p);
8574 while (!atomic_fcmpset_64(tl2p, &l2e, (l2e & ~mask) | nbits))
8575 cpu_spinwait();
8576 }
8577 dsb(ishst);
8578
8579 intr_restore(intr);
8580 if (tmpl3 != 0) {
8581 pmap_kremove(tmpl3);
8582 kva_free(tmpl3, PAGE_SIZE);
8583 }
8584 counter_u64_add(pmap_l2c_demotions, 1);
8585 CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
8586 va, pmap);
8587 return (true);
8588 }
8589
8590 /*
8591 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8592 */
8593 static bool
8594 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8595 {
8596 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8597 vm_offset_t tmpl3;
8598 register_t intr;
8599
8600 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8601 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8602 sizeof(pt_entry_t)) - 1));
8603 l3c_end = l3c_start + L3C_ENTRIES;
8604 tmpl3 = 0;
8605 if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8606 (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8607 tmpl3 = kva_alloc(PAGE_SIZE);
8608 if (tmpl3 == 0)
8609 return (false);
8610 pmap_kenter(tmpl3, PAGE_SIZE,
8611 DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
8612 VM_MEMATTR_WRITE_BACK);
8613 l3c_start = (pt_entry_t *)(tmpl3 +
8614 ((vm_offset_t)l3c_start & PAGE_MASK));
8615 l3c_end = (pt_entry_t *)(tmpl3 +
8616 ((vm_offset_t)l3c_end & PAGE_MASK));
8617 }
8618 mask = 0;
8619 nbits = ATTR_DESCR_VALID;
8620 intr = intr_disable();
8621
8622 /*
8623 * Break the mappings.
8624 */
8625 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8626 /*
8627 * Clear the mapping's contiguous and valid bits, but leave
8628 * the rest of the entry unchanged, so that a lockless,
8629 * concurrent pmap_kextract() can still lookup the physical
8630 * address.
8631 */
8632 l3e = pmap_load(tl3p);
8633 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8634 ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
8635 KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8636 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8637 ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
8638 while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
8639 ATTR_DESCR_VALID)))
8640 cpu_spinwait();
8641
8642 /*
8643 * Hardware accessed and dirty bit maintenance might only
8644 * update a single L3 entry, so we must combine the accessed
8645 * and dirty bits from this entire set of contiguous L3
8646 * entries.
8647 */
8648 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8649 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8650 mask = ATTR_S1_AP_RW_BIT;
8651 nbits |= l3e & ATTR_AF;
8652 }
8653 if ((nbits & ATTR_AF) != 0) {
8654 pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
8655 ~L3C_OFFSET, true);
8656 }
8657
8658 /*
8659 * Remake the mappings, updating the accessed and dirty bits.
8660 */
8661 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8662 l3e = pmap_load(tl3p);
8663 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
8664 cpu_spinwait();
8665 }
8666 dsb(ishst);
8667
8668 intr_restore(intr);
8669 if (tmpl3 != 0) {
8670 pmap_kremove(tmpl3);
8671 kva_free(tmpl3, PAGE_SIZE);
8672 }
8673 counter_u64_add(pmap_l3c_demotions, 1);
8674 CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
8675 va, pmap);
8676 return (true);
8677 }
8678
8679 /*
8680 * Accumulate the accessed and dirty bits within a L3C superpage and
8681 * return the specified PTE with them applied correctly.
8682 */
8683 static pt_entry_t
8684 pmap_load_l3c(pt_entry_t *l3p)
8685 {
8686 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8687
8688 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8689 sizeof(pt_entry_t)) - 1));
8690 l3c_end = l3c_start + L3C_ENTRIES;
8691 mask = 0;
8692 nbits = 0;
8693 /* Iterate over each mapping in the superpage. */
8694 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8695 l3e = pmap_load(tl3p);
8696 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8697 ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
8698 /* Update mask if the current page has its dirty bit set. */
8699 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8700 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8701 mask = ATTR_S1_AP_RW_BIT;
8702 /* Update nbits if the accessed bit is set. */
8703 nbits |= l3e & ATTR_AF;
8704 }
8705 return ((pmap_load(l3p) & ~mask) | nbits);
8706 }
8707
8708 /*
8709 * Perform the pmap work for mincore(2). If the page is not both referenced and
8710 * modified by this pmap, returns its physical address so that the caller can
8711 * find other mappings.
8712 */
8713 int
8714 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
8715 {
8716 pt_entry_t *pte, tpte;
8717 vm_paddr_t mask, pa;
8718 int lvl, psind, val;
8719 bool managed;
8720
8721 PMAP_ASSERT_STAGE1(pmap);
8722 PMAP_LOCK(pmap);
8723 pte = pmap_pte(pmap, addr, &lvl);
8724 if (pte != NULL) {
8725 tpte = pmap_load(pte);
8726
8727 switch (lvl) {
8728 case 3:
8729 mask = L3_OFFSET;
8730 psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 1 : 0;
8731 break;
8732 case 2:
8733 mask = L2_OFFSET;
8734 psind = 2;
8735 break;
8736 case 1:
8737 mask = L1_OFFSET;
8738 psind = 3;
8739 break;
8740 default:
8741 panic("pmap_mincore: invalid level %d", lvl);
8742 }
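/*
 * With 4KB granules these psind values correspond to 64KB (L3C),
 * 2MB (L2), and 1GB (L1) mappings, respectively.
 */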
8743
8744 managed = (tpte & ATTR_SW_MANAGED) != 0;
8745 val = MINCORE_INCORE | MINCORE_PSIND(psind);
8746 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
8747 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
8748 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8749 if ((tpte & ATTR_AF) == ATTR_AF)
8750 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8751
8752 pa = PTE_TO_PHYS(tpte) | (addr & mask);
8753 } else {
8754 managed = false;
8755 val = 0;
8756 }
8757
8758 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8759 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
8760 *pap = pa;
8761 }
8762 PMAP_UNLOCK(pmap);
8763 return (val);
8764 }
8765
8766 /*
8767 * Garbage collect every ASID that is neither active on a processor nor
8768 * reserved.
8769 */
8770 static void
8771 pmap_reset_asid_set(pmap_t pmap)
8772 {
8773 pmap_t curpmap;
8774 int asid, cpuid, epoch;
8775 struct asid_set *set;
8776 enum pmap_stage stage;
8777
8778 set = pmap->pm_asid_set;
8779 stage = pmap->pm_stage;
8780
8782 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8783 mtx_assert(&set->asid_set_mutex, MA_OWNED);
8784
8785 /*
8786 * Ensure that the store to asid_epoch is globally visible before the
8787 * loads from pc_curpmap are performed.
8788 */
8789 epoch = set->asid_epoch + 1;
8790 if (epoch == INT_MAX)
8791 epoch = 0;
8792 set->asid_epoch = epoch;
8793 dsb(ishst);
8794 if (stage == PM_STAGE1) {
8795 __asm __volatile("tlbi vmalle1is");
8796 } else {
8797 KASSERT(pmap_clean_stage2_tlbi != NULL,
8798 ("%s: Unset stage 2 tlb invalidation callback\n",
8799 __func__));
8800 pmap_clean_stage2_tlbi();
8801 }
8802 dsb(ish);
8803 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
8804 set->asid_set_size - 1);
8805 CPU_FOREACH(cpuid) {
8806 if (cpuid == curcpu)
8807 continue;
8808 if (stage == PM_STAGE1) {
8809 curpmap = pcpu_find(cpuid)->pc_curpmap;
8810 PMAP_ASSERT_STAGE1(pmap);
8811 } else {
8812 curpmap = pcpu_find(cpuid)->pc_curvmpmap;
8813 if (curpmap == NULL)
8814 continue;
8815 PMAP_ASSERT_STAGE2(pmap);
8816 }
8817 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
8818 asid = COOKIE_TO_ASID(curpmap->pm_cookie);
8819 if (asid == -1)
8820 continue;
8821 bit_set(set->asid_set, asid);
8822 curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
8823 }
8824 }
8825
8826 /*
8827 * Allocate a new ASID for the specified pmap.
8828 */
8829 static void
8830 pmap_alloc_asid(pmap_t pmap)
8831 {
8832 struct asid_set *set;
8833 int new_asid;
8834
8835 set = pmap->pm_asid_set;
8836 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8837
8838 mtx_lock_spin(&set->asid_set_mutex);
8839
8840 /*
8841 * While this processor was waiting to acquire the asid set mutex,
8842 * pmap_reset_asid_set() running on another processor might have
8843 * updated this pmap's cookie to the current epoch. In which case, we
8844 * don't need to allocate a new ASID.
8845 */
8846 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
8847 goto out;
8848
8849 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
8850 &new_asid);
8851 if (new_asid == -1) {
8852 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8853 set->asid_next, &new_asid);
8854 if (new_asid == -1) {
8855 pmap_reset_asid_set(pmap);
8856 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8857 set->asid_set_size, &new_asid);
8858 KASSERT(new_asid != -1, ("ASID allocation failure"));
8859 }
8860 }
8861 bit_set(set->asid_set, new_asid);
8862 set->asid_next = new_asid + 1;
8863 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
8864 out:
8865 mtx_unlock_spin(&set->asid_set_mutex);
8866 }
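
/*
 * Worked example of the allocation policy above (illustrative only; the
 * concrete numbers are hypothetical): if asid_next is 200, the first
 * bit_ffc_at() scans [200, asid_set_size) for a clear bit.  If that range
 * is exhausted, the second scan wraps around to [ASID_FIRST_AVAILABLE, 200).
 * Only when both scans fail is pmap_reset_asid_set() invoked, which bumps
 * asid_epoch so that every pmap holding a cookie from the old epoch is
 * forced back through this allocator on its next activation.
 */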
8867
8868 static uint64_t __read_mostly ttbr_flags;
8869
8870 /*
8871 * Compute the value that should be stored in ttbr0 to activate the specified
8872 * pmap. This value may change from time to time.
8873 */
8874 uint64_t
8875 pmap_to_ttbr0(pmap_t pmap)
8876 {
8877 uint64_t ttbr;
8878
8879 ttbr = pmap->pm_ttbr;
8880 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
8881 ttbr |= ttbr_flags;
8882
8883 return (ttbr);
8884 }
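
/*
 * Illustrative sketch of the value built above (not authoritative; see the
 * TTBR_* and ASID_TO_OPERAND definitions for the real encoding): the ASID
 * taken from the pmap's cookie occupies the upper bits of ttbr0_el1, the
 * page table's physical address occupies the lower bits, and ttbr_flags
 * contributes globally enabled bits such as TTBR_CnP.  For a hypothetical
 * pmap with ASID 5 and a root table at physical address 0x40001000, the
 * result is roughly:
 *
 *	ASID_TO_OPERAND(5) | 0x40001000 | ttbr_flags
 */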
8885
8886 static void
8887 pmap_set_cnp(void *arg)
8888 {
8889 uint64_t ttbr0, ttbr1;
8890 u_int cpuid;
8891
8892 cpuid = *(u_int *)arg;
8893 if (cpuid == curcpu) {
8894 /*
8895 * Set the flags while all CPUs are handling the
8896 * smp_rendezvous, so they will not call pmap_to_ttbr0. Any calls
8897 * to pmap_to_ttbr0 after this will have the CnP flag set.
8898 * The dsb after invalidating the TLB will act as a barrier
8899 * to ensure all CPUs can observe this change.
8900 */
8901 ttbr_flags |= TTBR_CnP;
8902 }
8903
8904 ttbr0 = READ_SPECIALREG(ttbr0_el1);
8905 ttbr0 |= TTBR_CnP;
8906
8907 ttbr1 = READ_SPECIALREG(ttbr1_el1);
8908 ttbr1 |= TTBR_CnP;
8909
8910 /* Update ttbr{0,1}_el1 with the CnP flag */
8911 WRITE_SPECIALREG(ttbr0_el1, ttbr0);
8912 WRITE_SPECIALREG(ttbr1_el1, ttbr1);
8913 isb();
8914 __asm __volatile("tlbi vmalle1is");
8915 dsb(ish);
8916 isb();
8917 }
8918
8919 /*
8920 * Defer enabling some features until we have read the ID registers to know
8921 * if they are supported on all CPUs.
8922 */
8923 static void
8924 pmap_init_mp(void *dummy __unused)
8925 {
8926 uint64_t reg;
8927
8928 if (get_kernel_reg(ID_AA64PFR1_EL1, &reg)) {
8929 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
8930 if (bootverbose)
8931 printf("Enabling BTI\n");
8932 pmap_bti_support = true;
8933
8934 pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
8935 sizeof(struct rs_el), NULL, NULL, NULL, NULL,
8936 UMA_ALIGN_PTR, 0);
8937 }
8938 }
8939 }
8940 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
8941
8942 /*
8943 * Defer enabling CnP until we have read the ID registers to know if it's
8944 * supported on all CPUs.
8945 */
8946 static void
8947 pmap_init_cnp(void *dummy __unused)
8948 {
8949 uint64_t reg;
8950 u_int cpuid;
8951
8952 if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
8953 return;
8954
8955 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
8956 if (bootverbose)
8957 printf("Enabling CnP\n");
8958 cpuid = curcpu;
8959 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
8960 }
8961
8962 }
8963 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
8964
8965 static bool
8966 pmap_activate_int(pmap_t pmap)
8967 {
8968 struct asid_set *set;
8969 int epoch;
8970
8971 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
8972 KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
8973
8974 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
8975 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
8976 /*
8977 * Handle the possibility that the old thread was preempted
8978 * after an "ic" or "tlbi" instruction but before it performed
8979 * a "dsb" instruction. If the old thread migrates to a new
8980 * processor, its completion of a "dsb" instruction on that
8981 * new processor does not guarantee that the "ic" or "tlbi"
8982 * instructions performed on the old processor have completed.
8983 */
8984 dsb(ish);
8985 return (false);
8986 }
8987
8988 set = pmap->pm_asid_set;
8989 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8990
8991 /*
8992 * Ensure that the store to curpmap is globally visible before the
8993 * load from asid_epoch is performed.
8994 */
8995 if (pmap->pm_stage == PM_STAGE1)
8996 PCPU_SET(curpmap, pmap);
8997 else
8998 PCPU_SET(curvmpmap, pmap);
8999 dsb(ish);
9000 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
9001 if (epoch >= 0 && epoch != set->asid_epoch)
9002 pmap_alloc_asid(pmap);
9003
9004 if (pmap->pm_stage == PM_STAGE1) {
9005 set_ttbr0(pmap_to_ttbr0(pmap));
9006 if (PCPU_GET(bcast_tlbi_workaround) != 0)
9007 invalidate_local_icache();
9008 }
9009 return (true);
9010 }
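
/*
 * Why the ordering above matters (informal sketch): the store to
 * pc_curpmap/pc_curvmpmap followed by the dsb here, and the mirrored
 * ordering in pmap_reset_asid_set() (store to asid_epoch, dsb, then loads
 * of the per-CPU current pmaps), form a Dekker-style handshake.  Either a
 * concurrent reset observes this CPU's new current pmap and re-reserves its
 * identifier under the new epoch, or this CPU observes the new asid_epoch
 * and calls pmap_alloc_asid() for a fresh one.  In both cases the
 * identifier that ends up in use is valid for the current epoch.
 */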
9011
9012 void
9013 pmap_activate_vm(pmap_t pmap)
9014 {
9015
9016 PMAP_ASSERT_STAGE2(pmap);
9017
9018 (void)pmap_activate_int(pmap);
9019 }
9020
9021 void
9022 pmap_activate(struct thread *td)
9023 {
9024 pmap_t pmap;
9025
9026 pmap = vmspace_pmap(td->td_proc->p_vmspace);
9027 PMAP_ASSERT_STAGE1(pmap);
9028 critical_enter();
9029 (void)pmap_activate_int(pmap);
9030 critical_exit();
9031 }
9032
9033 /*
9034 * Activate the thread we are switching to.
9035 * To simplify the assembly in cpu_throw, return the new thread's pcb.
9036 */
9037 struct pcb *
9038 pmap_switch(struct thread *new)
9039 {
9040 pcpu_bp_harden bp_harden;
9041 struct pcb *pcb;
9042
9043 /* Store the new curthread */
9044 PCPU_SET(curthread, new);
9045
9046 /* And the new pcb */
9047 pcb = new->td_pcb;
9048 PCPU_SET(curpcb, pcb);
9049
9050 /*
9051 * TODO: We may need to flush the cache here if switching
9052 * to a user process.
9053 */
9054
9055 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
9056 /*
9057 * Stop userspace from training the branch predictor against
9058 * other processes. This will call into a CPU-specific
9059 * function that clears the branch predictor state.
9060 */
9061 bp_harden = PCPU_GET(bp_harden);
9062 if (bp_harden != NULL)
9063 bp_harden();
9064 }
9065
9066 return (pcb);
9067 }
9068
9069 void
9070 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
9071 {
9072
9073 PMAP_ASSERT_STAGE1(pmap);
9074 KASSERT(ADDR_IS_CANONICAL(va),
9075 ("%s: Address not in canonical form: %lx", __func__, va));
9076
9077 if (ADDR_IS_KERNEL(va)) {
9078 cpu_icache_sync_range((void *)va, sz);
9079 } else {
9080 u_int len, offset;
9081 vm_paddr_t pa;
9082
9083 /* Find the length of data in this page to flush */
9084 offset = va & PAGE_MASK;
9085 len = imin(PAGE_SIZE - offset, sz);
9086
9087 while (sz != 0) {
9088 /* Extract the physical address & find it in the DMAP */
9089 pa = pmap_extract(pmap, va);
9090 if (pa != 0)
9091 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
9092 len);
9093
9094 /* Move to the next page */
9095 sz -= len;
9096 va += len;
9097 /* Set the length for the next iteration */
9098 len = imin(PAGE_SIZE, sz);
9099 }
9100 }
9101 }
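
/*
 * Worked example of the chunking above (hypothetical addresses): syncing
 * va = 0x20000ff0 with sz = 0x30 first handles the 0x10 bytes remaining in
 * the current page (offset 0xff0, len = PAGE_SIZE - 0xff0), then a second
 * iteration handles the remaining 0x20 bytes at the start of the next page.
 * Each chunk is looked up separately with pmap_extract() since the two
 * pages need not be physically contiguous.
 */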
9102
9103 static int
9104 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9105 {
9106 pd_entry_t *pdep;
9107 pt_entry_t *ptep, pte;
9108 int rv, lvl, dfsc;
9109
9110 PMAP_ASSERT_STAGE2(pmap);
9111 rv = KERN_FAILURE;
9112
9113 /* Data and insn aborts use the same encoding for the FSC field. */
9114 dfsc = esr & ISS_DATA_DFSC_MASK;
9115 switch (dfsc) {
9116 case ISS_DATA_DFSC_TF_L0:
9117 case ISS_DATA_DFSC_TF_L1:
9118 case ISS_DATA_DFSC_TF_L2:
9119 case ISS_DATA_DFSC_TF_L3:
9120 PMAP_LOCK(pmap);
9121 pdep = pmap_pde(pmap, far, &lvl);
9122 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
9123 PMAP_UNLOCK(pmap);
9124 break;
9125 }
9126
9127 switch (lvl) {
9128 case 0:
9129 ptep = pmap_l0_to_l1(pdep, far);
9130 break;
9131 case 1:
9132 ptep = pmap_l1_to_l2(pdep, far);
9133 break;
9134 case 2:
9135 ptep = pmap_l2_to_l3(pdep, far);
9136 break;
9137 default:
9138 panic("%s: Invalid pde level %d", __func__, lvl);
9139 }
9140 goto fault_exec;
9141
9142 case ISS_DATA_DFSC_AFF_L1:
9143 case ISS_DATA_DFSC_AFF_L2:
9144 case ISS_DATA_DFSC_AFF_L3:
9145 PMAP_LOCK(pmap);
9146 ptep = pmap_pte(pmap, far, &lvl);
9147 fault_exec:
9148 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
9149 /*
9150 * If accessing an executable page, invalidate
9151 * the I-cache so it will be valid when we
9152 * continue execution in the guest. The D-cache
9153 * is assumed to already be clean to the Point
9154 * of Coherency.
9155 */
9156 if ((pte & ATTR_S2_XN_MASK) !=
9157 ATTR_S2_XN(ATTR_S2_XN_NONE)) {
9158 invalidate_icache();
9159 }
9160 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
9161 rv = KERN_SUCCESS;
9162 }
9163 PMAP_UNLOCK(pmap);
9164 break;
9165 }
9166
9167 return (rv);
9168 }
9169
9170 int
9171 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9172 {
9173 pt_entry_t pte, *ptep;
9174 register_t intr;
9175 uint64_t ec, par;
9176 int lvl, rv;
9177
9178 rv = KERN_FAILURE;
9179
9180 ec = ESR_ELx_EXCEPTION(esr);
9181 switch (ec) {
9182 case EXCP_INSN_ABORT_L:
9183 case EXCP_INSN_ABORT:
9184 case EXCP_DATA_ABORT_L:
9185 case EXCP_DATA_ABORT:
9186 break;
9187 default:
9188 return (rv);
9189 }
9190
9191 if (pmap->pm_stage == PM_STAGE2)
9192 return (pmap_stage2_fault(pmap, esr, far));
9193
9194 /* Data and insn aborts use the same encoding for the FSC field. */
9195 switch (esr & ISS_DATA_DFSC_MASK) {
9196 case ISS_DATA_DFSC_AFF_L1:
9197 case ISS_DATA_DFSC_AFF_L2:
9198 case ISS_DATA_DFSC_AFF_L3:
9199 PMAP_LOCK(pmap);
9200 ptep = pmap_pte(pmap, far, &lvl);
9201 if (ptep != NULL) {
9202 pmap_set_bits(ptep, ATTR_AF);
9203 rv = KERN_SUCCESS;
9204 /*
9205 * XXXMJ as an optimization we could mark the entry
9206 * dirty if this is a write fault.
9207 */
9208 }
9209 PMAP_UNLOCK(pmap);
9210 break;
9211 case ISS_DATA_DFSC_PF_L1:
9212 case ISS_DATA_DFSC_PF_L2:
9213 case ISS_DATA_DFSC_PF_L3:
9214 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
9215 (esr & ISS_DATA_WnR) == 0)
9216 return (rv);
9217 PMAP_LOCK(pmap);
9218 ptep = pmap_pte(pmap, far, &lvl);
9219 if (ptep != NULL &&
9220 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
9221 if ((pte & ATTR_S1_AP_RW_BIT) ==
9222 ATTR_S1_AP(ATTR_S1_AP_RO)) {
9223 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
9224 pmap_s1_invalidate_page(pmap, far, true);
9225 }
9226 rv = KERN_SUCCESS;
9227 }
9228 PMAP_UNLOCK(pmap);
9229 break;
9230 case ISS_DATA_DFSC_TF_L0:
9231 case ISS_DATA_DFSC_TF_L1:
9232 case ISS_DATA_DFSC_TF_L2:
9233 case ISS_DATA_DFSC_TF_L3:
9234 /*
9235 * Retry the translation. A break-before-make sequence can
9236 * produce a transient fault.
9237 */
9238 if (pmap == kernel_pmap) {
9239 /*
9240 * The translation fault may have occurred within a
9241 * critical section. Therefore, we must check the
9242 * address without acquiring the kernel pmap's lock.
9243 */
9244 if (pmap_klookup(far, NULL))
9245 rv = KERN_SUCCESS;
9246 } else {
9247 bool owned;
9248
9249 /*
9250 * In the EFIRT driver we lock the pmap before
9251 * calling into the runtime service. As the lock
9252 * is already owned by the current thread skip
9253 * locking it again.
9254 */
9255 owned = PMAP_OWNED(pmap);
9256 if (!owned)
9257 PMAP_LOCK(pmap);
9258 /* Ask the MMU to check the address. */
9259 intr = intr_disable();
9260 par = arm64_address_translate_s1e0r(far);
9261 intr_restore(intr);
9262 if (!owned)
9263 PMAP_UNLOCK(pmap);
9264
9265 /*
9266 * If the translation was successful, then we can
9267 * return success to the trap handler.
9268 */
9269 if (PAR_SUCCESS(par))
9270 rv = KERN_SUCCESS;
9271 }
9272 break;
9273 }
9274
9275 return (rv);
9276 }
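
/*
 * Hypothetical caller sketch (names and structure invented for
 * illustration; the real arm64 abort handlers live elsewhere): pmap_fault()
 * is intended to be tried first so that access-flag faults, DBM-based
 * dirty-bit promotion, and transient break-before-make translation faults
 * can be resolved without entering the generic VM fault path.
 */
#if 0
static void
example_handle_abort(pmap_t pmap, uint64_t esr, uint64_t far)
{

	if (pmap_fault(pmap, esr, far) == KERN_SUCCESS)
		return;		/* Resolved by the pmap layer alone. */
	/* Otherwise fall back to the machine-independent fault handler. */
}
#endif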
9277
9278 /*
9279 * Increase the starting virtual address of the given mapping if a
9280 * different alignment might result in more superpage mappings.
9281 */
9282 void
9283 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
9284 vm_offset_t *addr, vm_size_t size)
9285 {
9286 vm_offset_t superpage_offset;
9287
9288 if (size < L3C_SIZE)
9289 return;
9290 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
9291 offset += ptoa(object->pg_color);
9292
9293 /*
9294 * Considering the object's physical alignment, is the mapping large
9295 * enough to encompass an L2 (2MB/32MB) superpage ...
9296 */
9297 superpage_offset = offset & L2_OFFSET;
9298 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) {
9299 /*
9300 * If the virtual and physical alignments differ, then
9301 * increase the virtual address so that the alignments match.
9302 */
9303 if ((*addr & L2_OFFSET) < superpage_offset)
9304 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
9305 else if ((*addr & L2_OFFSET) > superpage_offset)
9306 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) +
9307 superpage_offset;
9308 return;
9309 }
9310 /* ... or an L3C (64KB/2MB) superpage? */
9311 superpage_offset = offset & L3C_OFFSET;
9312 if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) {
9313 if ((*addr & L3C_OFFSET) < superpage_offset)
9314 *addr = (*addr & ~L3C_OFFSET) + superpage_offset;
9315 else if ((*addr & L3C_OFFSET) > superpage_offset)
9316 *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) +
9317 superpage_offset;
9318 }
9319 }
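
/*
 * Worked example of the L2 case above, assuming a 4K granule (L2_SIZE is
 * 2MB) and hypothetical values: with offset = 0x201000 the object's
 * alignment within a 2MB superpage is superpage_offset = 0x1000.  A
 * proposed *addr of 0x7f0000400000 has (*addr & L2_OFFSET) == 0, which is
 * less than 0x1000, so *addr is advanced to 0x7f0000401000.  After the
 * adjustment the virtual and physical offsets within the 2MB region agree,
 * allowing 2MB-aligned chunks of the object to be mapped with L2 blocks.
 */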
9320
9321 /**
9322 * Get the kernel virtual address of a set of physical pages. If there are
9323 * physical addresses not covered by the DMAP, perform a transient mapping
9324 * that will be removed when calling pmap_unmap_io_transient.
9325 *
9326 * \param page The pages for which the caller wishes to obtain kernel
9327 * virtual addresses on the kernel memory map.
9328 * \param vaddr On return contains the kernel virtual memory address
9329 * of the pages passed in the page parameter.
9330 * \param count Number of pages passed in.
9331 * \param can_fault true if the thread using the mapped pages can take
9332 * page faults, false otherwise.
9333 *
9334 * \returns true if the caller must call pmap_unmap_io_transient when
9335 * finished or false otherwise.
9336 *
9337 */
9338 bool
9339 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9340 bool can_fault)
9341 {
9342 vm_paddr_t paddr;
9343 bool needs_mapping;
9344 int error __diagused, i;
9345
9346 /*
9347 * Allocate any KVA space that we need; this is done in a separate
9348 * loop to prevent calling vmem_alloc while pinned.
9349 */
9350 needs_mapping = false;
9351 for (i = 0; i < count; i++) {
9352 paddr = VM_PAGE_TO_PHYS(page[i]);
9353 if (__predict_false(!PHYS_IN_DMAP(paddr))) {
9354 error = vmem_alloc(kernel_arena, PAGE_SIZE,
9355 M_BESTFIT | M_WAITOK, &vaddr[i]);
9356 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
9357 needs_mapping = true;
9358 } else {
9359 vaddr[i] = PHYS_TO_DMAP(paddr);
9360 }
9361 }
9362
9363 /* Exit early if everything is covered by the DMAP */
9364 if (!needs_mapping)
9365 return (false);
9366
9367 if (!can_fault)
9368 sched_pin();
9369 for (i = 0; i < count; i++) {
9370 paddr = VM_PAGE_TO_PHYS(page[i]);
9371 if (!PHYS_IN_DMAP(paddr)) {
9372 panic(
9373 "pmap_map_io_transient: TODO: Map out of DMAP data");
9374 }
9375 }
9376
9377 return (needs_mapping);
9378 }
9379
9380 void
9381 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9382 bool can_fault)
9383 {
9384 vm_paddr_t paddr;
9385 int i;
9386
9387 if (!can_fault)
9388 sched_unpin();
9389 for (i = 0; i < count; i++) {
9390 paddr = VM_PAGE_TO_PHYS(page[i]);
9391 if (!PHYS_IN_DMAP(paddr)) {
9392 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
9393 }
9394 }
9395 }
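
/*
 * Hypothetical usage sketch for the pair of routines above (function and
 * buffer names are invented): any pmap_map_io_transient() call that
 * returns true must be balanced by pmap_unmap_io_transient() with the same
 * arguments once the caller is done touching the mappings.
 */
#if 0
static void
example_copy_pages(vm_page_t m[2], void *dst)
{
	vm_offset_t va[2];
	bool mapped;

	mapped = pmap_map_io_transient(m, va, 2, true);
	memcpy(dst, (void *)va[0], PAGE_SIZE);
	memcpy((char *)dst + PAGE_SIZE, (void *)va[1], PAGE_SIZE);
	if (mapped)
		pmap_unmap_io_transient(m, va, 2, true);
}
#endif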
9396
9397 bool
9398 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
9399 {
9400
9401 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
9402 }
9403
9404 static void *
9405 bti_dup_range(void *ctx __unused, void *data)
9406 {
9407 struct rs_el *node, *new_node;
9408
9409 new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9410 if (new_node == NULL)
9411 return (NULL);
9412 node = data;
9413 memcpy(new_node, node, sizeof(*node));
9414 return (new_node);
9415 }
9416
9417 static void
9418 bti_free_range(void *ctx __unused, void *node)
9419 {
9420
9421 uma_zfree(pmap_bti_ranges_zone, node);
9422 }
9423
9424 static int
9425 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9426 {
9427 struct rs_el *rs;
9428 int error;
9429
9430 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9431 PMAP_ASSERT_STAGE1(pmap);
9432 MPASS(pmap->pm_bti != NULL);
9433 rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9434 if (rs == NULL)
9435 return (ENOMEM);
9436 error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
9437 if (error != 0)
9438 uma_zfree(pmap_bti_ranges_zone, rs);
9439 return (error);
9440 }
9441
9442 static void
9443 pmap_bti_deassign_all(pmap_t pmap)
9444 {
9445
9446 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9447 if (pmap->pm_bti != NULL)
9448 rangeset_remove_all(pmap->pm_bti);
9449 }
9450
9451 /*
9452 * Returns true if the BTI setting is the same across the specified address
9453 * range, and false otherwise. When returning true, updates the referenced PTE
9454 * to reflect the BTI setting.
9455 *
9456 * Only stage 1 pmaps support BTI. The kernel pmap is always a stage 1 pmap
9457 * that has the same BTI setting implicitly across its entire address range.
9458 */
9459 static bool
9460 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
9461 {
9462 struct rs_el *rs;
9463 vm_offset_t va;
9464
9465 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9466 KASSERT(ADDR_IS_CANONICAL(sva),
9467 ("%s: Start address not in canonical form: %lx", __func__, sva));
9468 KASSERT(ADDR_IS_CANONICAL(eva),
9469 ("%s: End address not in canonical form: %lx", __func__, eva));
9470 KASSERT((*pte & ATTR_S1_GP) == 0,
9471 ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte));
9472
9473 if (pmap == kernel_pmap) {
9474 *pte |= ATTR_KERN_GP;
9475 return (true);
9476 }
9477 if (pmap->pm_bti == NULL)
9478 return (true);
9479 PMAP_ASSERT_STAGE1(pmap);
9480 rs = rangeset_containing(pmap->pm_bti, sva);
9481 if (rs == NULL)
9482 return (rangeset_empty(pmap->pm_bti, sva, eva));
9483 while ((va = rs->re_end) < eva) {
9484 if ((rs = rangeset_beginning(pmap->pm_bti, va)) == NULL)
9485 return (false);
9486 }
9487 *pte |= ATTR_S1_GP;
9488 return (true);
9489 }
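
/*
 * Example of the walk above (hypothetical ranges): if pm_bti records the
 * ranges [A, B) and [B, C) and the caller asks about [A, C), the loop
 * advances from one range to the next, finds the whole query covered, and
 * applies ATTR_S1_GP.  If instead part of [A, C) is covered and part is
 * not, the function reports false so that a caller does not create a
 * single large mapping with a mixed GP requirement.
 */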
9490
9491 static pt_entry_t
9492 pmap_pte_bti(pmap_t pmap, vm_offset_t va)
9493 {
9494 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9495 MPASS(ADDR_IS_CANONICAL(va));
9496
9497 if (pmap->pm_stage != PM_STAGE1)
9498 return (0);
9499 if (pmap == kernel_pmap)
9500 return (ATTR_KERN_GP);
9501 if (pmap->pm_bti != NULL &&
9502 rangeset_containing(pmap->pm_bti, va) != NULL)
9503 return (ATTR_S1_GP);
9504 return (0);
9505 }
9506
9507 static void
9508 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9509 {
9510
9511 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9512 if (pmap->pm_bti != NULL)
9513 rangeset_remove(pmap->pm_bti, sva, eva);
9514 }
9515
9516 static int
9517 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
9518 {
9519
9520 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
9521 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
9522 MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
9523 MPASS(src_pmap->pm_bti != NULL);
9524 MPASS(dst_pmap->pm_bti != NULL);
9525 if (src_pmap->pm_bti->rs_data_ctx == NULL)
9526 return (0);
9527 return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
9528 }
9529
9530 static void
9531 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
9532 {
9533 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9534 PMAP_ASSERT_STAGE1(pmap);
9535
9536 pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
9537 true);
9538 }
9539
9540 int
9541 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9542 {
9543 int error;
9544
9545 if (pmap->pm_bti == NULL)
9546 return (0);
9547 if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
9548 return (EINVAL);
9549 if (pmap->pm_stage != PM_STAGE1)
9550 return (EINVAL);
9551 if (eva <= sva || ADDR_IS_KERNEL(eva))
9552 return (EFAULT);
9553
9554 sva = trunc_page(sva);
9555 eva = round_page(eva);
9556 for (;;) {
9557 PMAP_LOCK(pmap);
9558 error = pmap_bti_assign(pmap, sva, eva);
9559 if (error == 0)
9560 pmap_bti_update_range(pmap, sva, eva, true);
9561 PMAP_UNLOCK(pmap);
9562 if (error != ENOMEM)
9563 break;
9564 vm_wait(NULL);
9565 }
9566 return (error);
9567 }
9568
9569 #if defined(KASAN) || defined(KMSAN)
9570 static pd_entry_t *pmap_san_early_l2;
9571
9572 #define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE)
9573 #define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE)
9574 static vm_offset_t __nosanitizeaddress
9575 pmap_san_enter_bootstrap_alloc_l2(void)
9576 {
9577 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
9578 static size_t offset = 0;
9579 vm_offset_t addr;
9580
9581 if (offset + L2_SIZE > sizeof(bootstrap_data)) {
9582 panic("%s: out of memory for the bootstrap shadow map L2 entries",
9583 __func__);
9584 }
9585
9586 addr = (uintptr_t)&bootstrap_data[offset];
9587 offset += L2_SIZE;
9588 return (addr);
9589 }
9590
9591 /*
9592 * SAN L1 + L2 pages, maybe L3 entries later?
9593 */
9594 static vm_offset_t __nosanitizeaddress
9595 pmap_san_enter_bootstrap_alloc_pages(int npages)
9596 {
9597 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
9598 static size_t offset = 0;
9599 vm_offset_t addr;
9600
9601 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
9602 panic("%s: out of memory for the bootstrap shadow map",
9603 __func__);
9604 }
9605
9606 addr = (uintptr_t)&bootstrap_data[offset];
9607 offset += (npages * PAGE_SIZE);
9608 return (addr);
9609 }
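
/*
 * Capacity note for the two bump allocators above: SAN_BOOTSTRAP_SIZE is
 * 2 * PAGE_SIZE, which is exactly what pmap_san_enter_bootstrap() consumes
 * for its L1 and L2 table pages, and SAN_BOOTSTRAP_L2_SIZE provides a
 * single L2_SIZE-aligned block for early shadow map L2 entries.  Any
 * further request against either pool panics, so they only cover the
 * window before the regular page allocator is available.
 */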
9610
9611 static void __nosanitizeaddress
9612 pmap_san_enter_bootstrap(void)
9613 {
9614 vm_offset_t freemempos;
9615
9616 /* L1, L2 */
9617 freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
9618 bs_state.freemempos = freemempos;
9619 bs_state.va = KASAN_MIN_ADDRESS;
9620 pmap_bootstrap_l1_table(&bs_state);
9621 pmap_san_early_l2 = bs_state.l2;
9622 }
9623
9624 static vm_page_t
9625 pmap_san_enter_alloc_l3(void)
9626 {
9627 vm_page_t m;
9628
9629 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
9630 VM_ALLOC_ZERO);
9631 if (m == NULL)
9632 panic("%s: no memory to grow shadow map", __func__);
9633 return (m);
9634 }
9635
9636 static vm_page_t
9637 pmap_san_enter_alloc_l2(void)
9638 {
9639 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
9640 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
9641 }
9642
9643 void __nosanitizeaddress __nosanitizememory
9644 pmap_san_enter(vm_offset_t va)
9645 {
9646 pd_entry_t *l1, *l2;
9647 pt_entry_t *l3;
9648 vm_page_t m;
9649
9650 if (virtual_avail == 0) {
9651 vm_offset_t block;
9652 int slot;
9653 bool first;
9654
9655 /* Temporary shadow map prior to pmap_bootstrap(). */
9656 first = pmap_san_early_l2 == NULL;
9657 if (first)
9658 pmap_san_enter_bootstrap();
9659
9660 l2 = pmap_san_early_l2;
9661 slot = pmap_l2_index(va);
9662
9663 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
9664 MPASS(first);
9665 block = pmap_san_enter_bootstrap_alloc_l2();
9666 pmap_store(&l2[slot],
9667 PHYS_TO_PTE(pmap_early_vtophys(block)) |
9668 PMAP_SAN_PTE_BITS | L2_BLOCK);
9669 dmb(ishst);
9670 }
9671
9672 return;
9673 }
9674
9675 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
9676 l1 = pmap_l1(kernel_pmap, va);
9677 MPASS(l1 != NULL);
9678 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
9679 m = pmap_san_enter_alloc_l3();
9680 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
9681 }
9682 l2 = pmap_l1_to_l2(l1, va);
9683 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
9684 m = pmap_san_enter_alloc_l2();
9685 if (m != NULL) {
9686 pmap_store(l2, VM_PAGE_TO_PTE(m) |
9687 PMAP_SAN_PTE_BITS | L2_BLOCK);
9688 } else {
9689 m = pmap_san_enter_alloc_l3();
9690 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
9691 }
9692 dmb(ishst);
9693 }
9694 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
9695 return;
9696 l3 = pmap_l2_to_l3(l2, va);
9697 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
9698 return;
9699 m = pmap_san_enter_alloc_l3();
9700 pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
9701 dmb(ishst);
9702 }
9703 #endif /* KASAN || KMSAN */
9704
9705 /*
9706 * Track a range of the kernel's virtual address space that is contiguous
9707 * in various mapping attributes.
9708 */
9709 struct pmap_kernel_map_range {
9710 vm_offset_t sva;
9711 pt_entry_t attrs;
9712 int l3pages;
9713 int l3contig;
9714 int l2blocks;
9715 int l2contig;
9716 int l1blocks;
9717 };
9718
9719 static void
9720 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
9721 vm_offset_t eva)
9722 {
9723 const char *mode;
9724 int index;
9725
9726 if (eva <= range->sva)
9727 return;
9728
9729 index = range->attrs & ATTR_S1_IDX_MASK;
9730 switch (index) {
9731 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
9732 mode = "DEV-NP";
9733 break;
9734 case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
9735 mode = "DEV";
9736 break;
9737 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
9738 mode = "UC";
9739 break;
9740 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
9741 mode = "WB";
9742 break;
9743 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
9744 mode = "WT";
9745 break;
9746 default:
9747 printf(
9748 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
9749 __func__, index, range->sva, eva);
9750 mode = "??";
9751 break;
9752 }
9753
9754 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n",
9755 range->sva, eva,
9756 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
9757 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
9758 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
9759 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
9760 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
9761 mode, range->l1blocks, range->l2contig, range->l2blocks,
9762 range->l3contig, range->l3pages);
9763
9764 /* Reset to sentinel value. */
9765 range->sva = 0xfffffffffffffffful;
9766 }
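
/*
 * A hypothetical line produced by the format above, for a 16MB writeback
 * region mapped with eight 2MB blocks (addresses and counts invented):
 *
 *	0xffff000040000000-0xffff000041000000 rw--s-     WB 0 0 8 0 0
 *
 * The flag column reads: readable, writable or '-', kernel-executable
 * ('x') or '-', user-executable ('X') or '-', user ('u') or supervisor
 * ('s'), and guarded ('g') or '-'.  The trailing counts are L1 blocks,
 * contiguous L2 runs, L2 blocks, contiguous L3 runs, and L3 pages.
 */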
9767
9768 /*
9769 * Determine whether the attributes specified by a page table entry match those
9770 * being tracked by the current range.
9771 */
9772 static bool
9773 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
9774 {
9775
9776 return (range->attrs == attrs);
9777 }
9778
9779 static void
9780 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
9781 pt_entry_t attrs)
9782 {
9783
9784 memset(range, 0, sizeof(*range));
9785 range->sva = va;
9786 range->attrs = attrs;
9787 }
9788
9789 /* Get the block/page attributes that correspond to the table attributes */
9790 static pt_entry_t
9791 sysctl_kmaps_table_attrs(pd_entry_t table)
9792 {
9793 pt_entry_t attrs;
9794
9795 attrs = 0;
9796 if ((table & TATTR_UXN_TABLE) != 0)
9797 attrs |= ATTR_S1_UXN;
9798 if ((table & TATTR_PXN_TABLE) != 0)
9799 attrs |= ATTR_S1_PXN;
9800 if ((table & TATTR_AP_TABLE_RO) != 0)
9801 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
9802
9803 return (attrs);
9804 }
9805
9806 /* Read the block/page attributes we care about */
9807 static pt_entry_t
9808 sysctl_kmaps_block_attrs(pt_entry_t block)
9809 {
9810 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
9811 ATTR_S1_GP));
9812 }
9813
9814 /*
9815 * Given a leaf PTE, derive the mapping's attributes. If they do not match
9816 * those of the current run, dump the address range and its attributes, and
9817 * begin a new run.
9818 */
9819 static void
9820 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
9821 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
9822 pt_entry_t l3e)
9823 {
9824 pt_entry_t attrs;
9825
9826 attrs = sysctl_kmaps_table_attrs(l0e);
9827
9828 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9829 attrs |= sysctl_kmaps_block_attrs(l1e);
9830 goto done;
9831 }
9832 attrs |= sysctl_kmaps_table_attrs(l1e);
9833
9834 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9835 attrs |= sysctl_kmaps_block_attrs(l2e);
9836 goto done;
9837 }
9838 attrs |= sysctl_kmaps_table_attrs(l2e);
9839 attrs |= sysctl_kmaps_block_attrs(l3e);
9840
9841 done:
9842 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
9843 sysctl_kmaps_dump(sb, range, va);
9844 sysctl_kmaps_reinit(range, va, attrs);
9845 }
9846 }
9847
9848 static int
9849 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
9850 {
9851 struct pmap_kernel_map_range range;
9852 struct sbuf sbuf, *sb;
9853 pd_entry_t l0e, *l1, l1e, *l2, l2e;
9854 pt_entry_t *l3, l3e;
9855 vm_offset_t sva;
9856 vm_paddr_t pa;
9857 int error, i, j, k, l;
9858
9859 error = sysctl_wire_old_buffer(req, 0);
9860 if (error != 0)
9861 return (error);
9862 sb = &sbuf;
9863 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
9864
9865 /* Sentinel value. */
9866 range.sva = 0xfffffffffffffffful;
9867
9868 /*
9869 * Iterate over the kernel page tables without holding the kernel pmap
9870 * lock. Kernel page table pages are never freed, so at worst we will
9871 * observe inconsistencies in the output.
9872 */
9873 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
9874 i++) {
9875 if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
9876 sbuf_printf(sb, "\nDirect map:\n");
9877 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
9878 sbuf_printf(sb, "\nKernel map:\n");
9879 #ifdef KASAN
9880 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
9881 sbuf_printf(sb, "\nKASAN shadow map:\n");
9882 #endif
9883 #ifdef KMSAN
9884 else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
9885 sbuf_printf(sb, "\nKMSAN shadow map:\n");
9886 else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
9887 sbuf_printf(sb, "\nKMSAN origin map:\n");
9888 #endif
9889
9890 l0e = kernel_pmap->pm_l0[i];
9891 if ((l0e & ATTR_DESCR_VALID) == 0) {
9892 sysctl_kmaps_dump(sb, &range, sva);
9893 sva += L0_SIZE;
9894 continue;
9895 }
9896 pa = PTE_TO_PHYS(l0e);
9897 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9898
9899 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
9900 l1e = l1[j];
9901 if ((l1e & ATTR_DESCR_VALID) == 0) {
9902 sysctl_kmaps_dump(sb, &range, sva);
9903 sva += L1_SIZE;
9904 continue;
9905 }
9906 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
9907 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
9908 sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
9909 0, 0);
9910 range.l1blocks++;
9911 sva += L1_SIZE;
9912 continue;
9913 }
9914 pa = PTE_TO_PHYS(l1e);
9915 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9916
9917 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
9918 l2e = l2[k];
9919 if ((l2e & ATTR_DESCR_VALID) == 0) {
9920 sysctl_kmaps_dump(sb, &range, sva);
9921 sva += L2_SIZE;
9922 continue;
9923 }
9924 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
9925 sysctl_kmaps_check(sb, &range, sva,
9926 l0e, l1e, l2e, 0);
9927 if ((l2e & ATTR_CONTIGUOUS) != 0)
9928 range.l2contig +=
9929 k % L2C_ENTRIES == 0 ?
9930 1 : 0;
9931 else
9932 range.l2blocks++;
9933 sva += L2_SIZE;
9934 continue;
9935 }
9936 pa = PTE_TO_PHYS(l2e);
9937 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
9938
9939 for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
9940 l++, sva += L3_SIZE) {
9941 l3e = l3[l];
9942 if ((l3e & ATTR_DESCR_VALID) == 0) {
9943 sysctl_kmaps_dump(sb, &range,
9944 sva);
9945 continue;
9946 }
9947 sysctl_kmaps_check(sb, &range, sva,
9948 l0e, l1e, l2e, l3e);
9949 if ((l3e & ATTR_CONTIGUOUS) != 0)
9950 range.l3contig +=
9951 l % L3C_ENTRIES == 0 ?
9952 1 : 0;
9953 else
9954 range.l3pages++;
9955 }
9956 }
9957 }
9958 }
9959
9960 error = sbuf_finish(sb);
9961 sbuf_delete(sb);
9962 return (error);
9963 }
9964 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
9965 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
9966 NULL, 0, sysctl_kmaps, "A",
9967 "Dump kernel address layout");
9968