1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 */
52 /*-
53 * Copyright (c) 2003 Networks Associates Technology, Inc.
54 * All rights reserved.
55 *
56 * This software was developed for the FreeBSD Project by Jake Burkholder,
57 * Safeport Network Services, and Network Associates Laboratories, the
58 * Security Research Division of Network Associates, Inc. under
59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60 * CHATS research program.
61 *
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
64 * are met:
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 * notice, this list of conditions and the following disclaimer in the
69 * documentation and/or other materials provided with the distribution.
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81 * SUCH DAMAGE.
82 */
83
84 #include <sys/cdefs.h>
85 /*
86 * Manages physical address maps.
87 *
88 * Since the information managed by this module is
89 * also stored by the logical address mapping module,
90 * this module may throw away valid virtual-to-physical
91 * mappings at almost any time. However, invalidations
92 * of virtual-to-physical mappings must be done as
93 * requested.
94 *
95 * In order to cope with hardware architectures which
96 * make virtual-to-physical map invalidates expensive,
97 * this module may delay invalidate or reduced protection
98 * operations until such time as they are actually
99 * necessary. This module is given full information as
100 * to which processors are currently using which maps,
101 * and to when physical maps must be made correct.
102 */
103
104 #include "opt_vm.h"
105
106 #include <sys/param.h>
107 #include <sys/asan.h>
108 #include <sys/bitstring.h>
109 #include <sys/bus.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/ktr.h>
113 #include <sys/limits.h>
114 #include <sys/lock.h>
115 #include <sys/malloc.h>
116 #include <sys/mman.h>
117 #include <sys/msan.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/physmem.h>
121 #include <sys/proc.h>
122 #include <sys/rangeset.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147
148 #include <machine/asan.h>
149 #include <machine/cpu.h>
150 #include <machine/cpu_feat.h>
151 #include <machine/elf.h>
152 #include <machine/ifunc.h>
153 #include <machine/machdep.h>
154 #include <machine/md_var.h>
155 #include <machine/pcb.h>
156 #include <machine/rsi.h>
157
/*
 * Number of memory domains for which per-domain pmap state is kept (it sizes
 * the pv_chunks array): MAXMEMDOM on NUMA kernels, otherwise 1.
 */
#ifdef NUMA
#define	PMAP_MEMDOM	MAXMEMDOM
#else
#define	PMAP_MEMDOM	1
#endif

/* Assert (via MPASS) that the pmap is a stage 1, respectively stage 2, pmap. */
#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

/* Number of entries that fit in one page of a table at each level. */
#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

/*
 * Entry counts derived from L0_ENTRIES: NUL1E is the number of L1 entries
 * reachable from NUL0E L0 entries, and NUL2E the number of L2 entries
 * reachable from those.  They are used below as the bases of the L1 and L0
 * pindex ranges.
 */
#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

/*
 * PV_STAT(x) evaluates 'x' only when PV_STATS is defined.  __pvused expands
 * to __unused when PV_STATS is not defined, for annotating objects that are
 * only referenced from PV_STAT().
 */
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#define __pvused
#else
#define PV_STAT(x)	do { } while (0)
#define __pvused	__unused
#endif

/*
 * Convert an address to a per-level index.  The L2 index is simply the
 * address shifted by L2_SHIFT; the L1 and L0 indices are offset by NUL2E and
 * NUL2E + NUL1E respectively (presumably so that the three ranges can share
 * one index space for page table pages -- confirm against the users).
 * pmap_l2_pindex() is also applied to physical addresses in this file to
 * index the per-superpage PV array.
 */
#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
187
/*
 * ATTR_KERN_GP is OR'ed into PMAP_SAN_PTE_BITS below.  When the kernel is
 * compiled with BTI enabled by default it is the run-time value of
 * pmap_gp_attr; otherwise it is 0.
 */
#ifdef __ARM_FEATURE_BTI_DEFAULT
pt_entry_t __read_mostly pmap_gp_attr;
#define	ATTR_KERN_GP		pmap_gp_attr
#else
#define	ATTR_KERN_GP		0
#endif
/*
 * Attribute bits combined for (going by the name) sanitizer shadow mappings:
 * access flag set, stage 1 execute-never, the shareability in pmap_sh_attr,
 * ATTR_KERN_GP, write-back memory and read/write access permission.
 */
#define	PMAP_SAN_PTE_BITS	(ATTR_AF | ATTR_S1_XN | pmap_sh_attr | \
    ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))

/*
 * NOTE(review): not referenced in this part of the file.  Presumably set
 * when TLB invalidation instructions must be repeated (e.g. as an erratum
 * workaround) -- confirm against the code that sets it.
 */
static bool __read_mostly pmap_multiple_tlbi = false;
198
/*
 * PV bookkeeping for one L2-sized (superpage) region of physical memory: the
 * PV list head and the rwlock returned as that region's PV list lock.  The
 * per-segment arrays of these are indexed by pmap_l2_pindex() of the
 * physical address.
 */
struct pmap_large_md_page {
	struct rwlock   pv_lock;	/* PV list lock for this region */
	struct md_page  pv_page;	/* PV list head for this region */
	/* Pad to a power of 2, see pmap_init_pv_table(). */
	int pv_pad[2];
};
205
/*
 * Fallback entry whose lock is used when a physical address is not covered
 * by any vm_phys segment (PHYS_TO_PV_LIST_LOCK) or when a page is fictitious
 * (VM_PAGE_TO_PV_LIST_LOCK).  pv_dummy names its PV list head.
 */
__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
#define pv_dummy pv_dummy_large.pv_page
/*
 * NOTE(review): not referenced in this part of the file; presumably the base
 * of the array set up by pmap_init_pv_table() -- confirm.
 */
__read_mostly static struct pmap_large_md_page *pv_table;

/*
 * NOTE(review): not referenced in this part of the file.  Going by the name
 * and the <machine/rsi.h> include, presumably the physical address bit (or
 * base) that marks memory shared with the non-secure world -- confirm.
 */
__read_mostly uint64_t prot_ns_shared_pa;
211
212 static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)213 _pa_to_pmdp(vm_paddr_t pa)
214 {
215 struct vm_phys_seg *seg;
216
217 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
218 return ((struct pmap_large_md_page *)seg->md_first +
219 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
220 return (NULL);
221 }
222
223 static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)224 pa_to_pmdp(vm_paddr_t pa)
225 {
226 struct pmap_large_md_page *pvd;
227
228 pvd = _pa_to_pmdp(pa);
229 if (pvd == NULL)
230 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
231 return (pvd);
232 }
233
234 static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)235 page_to_pmdp(vm_page_t m)
236 {
237 struct vm_phys_seg *seg;
238
239 seg = &vm_phys_segs[m->segind];
240 return ((struct pmap_large_md_page *)seg->md_first +
241 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
242 }
243
/*
 * Evaluate to the PV list head (struct md_page) of the superpage-sized
 * region containing a physical address or a vm_page.  pa_to_pvh() goes
 * through pa_to_pmdp() and therefore panics when 'pa' lies outside every
 * vm_phys segment.
 */
#define	pa_to_pvh(pa)		(&(pa_to_pmdp(pa)->pv_page))
#define	page_to_pvh(m)		(&(page_to_pmdp(m)->pv_page))

/*
 * Evaluate to the PV list lock for physical address 'pa'.  An address that
 * _pa_to_pmdp() cannot resolve gets the lock embedded in pv_dummy_large
 * instead of causing a panic.
 */
#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
	struct pmap_large_md_page *_pvd;			\
	struct rwlock *_lock;					\
	_pvd = _pa_to_pmdp(pa);					\
	if (__predict_false(_pvd == NULL))			\
		_lock = &pv_dummy_large.pv_lock;		\
	else							\
		_lock = &(_pvd->pv_lock);			\
	_lock;							\
})
257
258 static struct rwlock *
VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)259 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
260 {
261 if ((m->flags & PG_FICTITIOUS) == 0)
262 return (&page_to_pmdp(m)->pv_lock);
263 else
264 return (&pv_dummy_large.pv_lock);
265 }
266
/*
 * Make *lockp refer to 'new_lock', held in write mode.  Nothing happens when
 * *lockp already equals 'new_lock'; otherwise the lock currently stored in
 * *lockp (if any) is write-unlocked before 'new_lock' is write-locked.
 */
#define	CHANGE_PV_LIST_LOCK(lockp, new_lock)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock = (new_lock);		\
							\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

/* As CHANGE_PV_LIST_LOCK(), selecting the lock by physical address. */
#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)		\
			CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))

/* As CHANGE_PV_LIST_LOCK(), selecting the lock by vm_page. */
#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))

/*
 * Write-unlock the lock stored in *lockp, if there is one, and reset *lockp
 * to NULL.
 */
#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

/*
 * Convert between a page table entry and a vm_page by way of the physical
 * address (PTE_TO_PHYS / PHYS_TO_PTE).
 */
#define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
#define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
296
/*
 * NOTE(review): not referenced in this part of the file.  By their names
 * these are a mutex, a reserved virtual address and the PTE that maps it,
 * presumably for temporary mappings -- confirm against the users.
 */
static struct mtx cmap_lock;
static void *cmap1_addr;
static pt_entry_t *cmap1_pte;

/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM

/* Storage for the kernel's pmap. */
struct pmap kernel_pmap_store;

/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;		/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
 * Always map entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;	/* physical address of the mapping */
	void		*va;	/* virtual address of the mapping */
	vm_size_t	size;	/* size of the mapping */
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;
333
334 /*
335 * Data for the pv entry allocation mechanism.
336 */
337 #ifdef NUMA
338 static __inline int
pc_to_domain(struct pv_chunk * pc)339 pc_to_domain(struct pv_chunk *pc)
340 {
341 return (vm_phys_domain(DMAP_TO_PHYS(pc)));
342 }
343 #else
344 static __inline int
pc_to_domain(struct pv_chunk * pc __unused)345 pc_to_domain(struct pv_chunk *pc __unused)
346 {
347 return (0);
348 }
349 #endif
350
/*
 * A list of PV chunks together with the mutex stored alongside it.  One
 * instance exists per memory domain (see pv_chunks[] below), each aligned to
 * a cache line.
 */
struct pv_chunks_list {
	struct mtx pvc_lock;
	TAILQ_HEAD(pch, pv_chunk) pvc_list;
	/*
	 * NOTE(review): not referenced in this part of the file; presumably
	 * the number of reclaim passes currently running -- confirm.
	 */
	int active_reclaims;
} __aligned(CACHE_LINE_SIZE);

/* Per-domain PV chunk lists, indexed by the value pc_to_domain() returns. */
struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
358
vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
/* Memory attribute for the dmap; initialised to write-back. */
static int dmap_attr = VM_MEMATTR_WRITE_BACK;

/* L0 table defined outside this file (by its name, the one used for TTBR1). */
extern pt_entry_t pagetable_l0_ttbr1[];

/*
 * Physical memory map storage.  NOTE(review): sized at two slots per
 * segment, which suggests (start, end) pairs -- confirm against the code
 * that fills it.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;

/* Root of the vm.pmap sysctl tree. */
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/* vm.pmap.growkernel_panic: read-only boot-time tunable, off by default. */
static int pmap_growkernel_panic = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN,
    &pmap_growkernel_panic, 0,
    "panic on failure to allocate kernel page table page");

/*
 * pmap_lpa_enabled also decides whether L1 blocks are usable with page sizes
 * other than 4K (see L1_BLOCKS_SUPPORTED below).  pmap_sh_attr holds the
 * shareability attribute applied to mappings; it defaults to inner
 * shareable.
 */
bool pmap_lpa_enabled __read_mostly = false;
pt_entry_t pmap_sh_attr __read_mostly = ATTR_SH(ATTR_SH_IS);

/*
 * L1 block mappings are always available with 4K pages; with other page
 * sizes they depend on pmap_lpa_enabled.
 */
#if PAGE_SIZE == PAGE_SIZE_4K
#define	L1_BLOCKS_SUPPORTED	1
#else
#define	L1_BLOCKS_SUPPORTED	(pmap_lpa_enabled)
#endif

/* Assert (via MPASS) that L1 block mappings are supported. */
#define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)

/*
 * NOTE(review): not referenced in this part of the file; distinct from
 * L1_BLOCKS_SUPPORTED above -- confirm its meaning against the code that
 * sets it.
 */
static bool pmap_l1_supported __read_mostly = false;
390
/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * it has currently allocated to a pmap, a cursor ("asid_next") to optimize
 * its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
 *
 * The same structure is instantiated twice in this file: once for ASIDs
 * ("asids") and once for VMIDs ("vmids").
 */
struct asid_set {
	int asid_bits;			/* number of bits in an ID */
	bitstr_t *asid_set;		/* bit vector of allocated IDs */
	int asid_set_size;		/* size of the bit vector */
	int asid_next;			/* search cursor */
	int asid_epoch;			/* current epoch number */
	struct mtx asid_set_mutex;	/* mutex kept with the allocator state */
};
410
411 static struct asid_set asids;
412 static struct asid_set vmids;
413
414 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
415 "ASID allocator");
416 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
417 "The number of bits in an ASID");
418 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
419 "The last allocated ASID plus one");
420 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
421 "The current epoch number");
422
423 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
424 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
425 "The number of bits in an VMID");
426 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
427 "The last allocated VMID plus one");
428 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
429 "The current epoch number");
430
/*
 * Function pointers for stage 2 TLB maintenance.  They are declared here
 * without an initialiser.  NOTE(review): presumably installed by the
 * hypervisor code before any stage 2 pmap is used -- confirm.
 */
void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
void (*pmap_stage2_invalidate_all)(uint64_t);
434
/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 *
 * Layout: the ASID occupies the low 32 bits of the cookie and the epoch the
 * high 32 bits.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))

/*
 * TLBI_VA() converts an address into the page-number field used as a TLBI
 * operand: the address is shifted right by 12 bits and truncated to 44
 * bits.
 */
#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
455
/*
 * vm.pmap.superpages_enabled: consulted by pmap_ps_enabled().  Exported as a
 * read-only tunable whose value is not fetched automatically
 * (CTLFLAG_NOFETCH).
 */
static int __read_frequently superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * True when Branch Target Identification should be used by userspace.  This
 * allows pmap to mark pages as guarded with ATTR_S1_GP.
 */
__read_mostly static bool pmap_bti_support = false;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

/* Tail queue head type for batches of PV chunks (see free_pv_chunk_batch()). */
TAILQ_HEAD(pv_chunklist, pv_chunk);
474
/*
 * Forward declarations for file-local helpers.
 */

/* PV entry and PV chunk helpers. */
static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_chunk_batch(struct pv_chunklist *batch);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

/* Mapping creation, removal, promotion/demotion and ASID helpers. */
static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(struct thread *td, pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_props_locked(void *addr, vm_size_t size,
    vm_prot_t prot, int mode, int old_mode, bool skip_unmapped);
static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va);
static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
    vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
static bool pmap_every_pte_zero(vm_paddr_t pa);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
    bool all_l3e_AF_set);
static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp);
static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, bool demote_kl2e, struct spglist *free,
    struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
    struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

/* Page table page allocation and release helpers. */
static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
    vm_offset_t va, vm_size_t size);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);

/* Branch Target Identification (BTI) range helpers and their UMA zone. */
static uma_zone_t pmap_bti_ranges_zone;
static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pt_entry_t *pte);
static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void *bti_dup_range(void *ctx, void *data);
static void bti_free_range(void *ctx, void *node);
static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
static void pmap_bti_deassign_all(pmap_t pmap);
static void pagezero(void *);

/*
 * NOTE(review): by their names these switch a mapping between protected and
 * unprotected state based on an L3 entry -- confirm against the definitions.
 */
static void pmap_set_protected(pt_entry_t old_l3);
static void pmap_set_unprotected(pt_entry_t new_l3);
547
/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 *
 * Every accessor here operates on a pointer to a 64-bit table entry.  All
 * of them except pmap_load() are built on the atomic_*_64() primitives;
 * pmap_load() is a plain dereference.  pmap_load_clear() and
 * pmap_load_store() are swaps and evaluate to the previous entry.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*table)
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)
560
561 /********************/
562 /* Inline functions */
563 /********************/
564
565 static __inline void
pagecopy(void * s,void * d)566 pagecopy(void *s, void *d)
567 {
568
569 memcpy(d, s, PAGE_SIZE);
570 }
571
572 static __inline pd_entry_t *
pmap_l0(pmap_t pmap,vm_offset_t va)573 pmap_l0(pmap_t pmap, vm_offset_t va)
574 {
575
576 return (&pmap->pm_l0[pmap_l0_index(va)]);
577 }
578
579 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t * l0,vm_offset_t va)580 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
581 {
582 pd_entry_t *l1;
583
584 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
585 return (&l1[pmap_l1_index(va)]);
586 }
587
588 static __inline pd_entry_t *
pmap_l1(pmap_t pmap,vm_offset_t va)589 pmap_l1(pmap_t pmap, vm_offset_t va)
590 {
591 pd_entry_t *l0;
592
593 l0 = pmap_l0(pmap, va);
594 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
595 return (NULL);
596
597 return (pmap_l0_to_l1(l0, va));
598 }
599
600 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t * l1p,vm_offset_t va)601 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
602 {
603 pd_entry_t l1, *l2p;
604
605 l1 = pmap_load(l1p);
606
607 KASSERT(ADDR_IS_CANONICAL(va),
608 ("%s: Address not in canonical form: %lx", __func__, va));
609 /*
610 * The valid bit may be clear if pmap_update_entry() is concurrently
611 * modifying the entry, so for KVA only the entry type may be checked.
612 */
613 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
614 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
615 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
616 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
617 l2p = PHYS_TO_DMAP(PTE_TO_PHYS(l1));
618 return (&l2p[pmap_l2_index(va)]);
619 }
620
621 static __inline pd_entry_t *
pmap_l2(pmap_t pmap,vm_offset_t va)622 pmap_l2(pmap_t pmap, vm_offset_t va)
623 {
624 pd_entry_t *l1;
625
626 l1 = pmap_l1(pmap, va);
627 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
628 return (NULL);
629
630 return (pmap_l1_to_l2(l1, va));
631 }
632
633 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t * l2p,vm_offset_t va)634 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
635 {
636 pd_entry_t l2;
637 pt_entry_t *l3p;
638
639 l2 = pmap_load(l2p);
640
641 KASSERT(ADDR_IS_CANONICAL(va),
642 ("%s: Address not in canonical form: %lx", __func__, va));
643 /*
644 * The valid bit may be clear if pmap_update_entry() is concurrently
645 * modifying the entry, so for KVA only the entry type may be checked.
646 */
647 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
648 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
649 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
650 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
651 l3p = PHYS_TO_DMAP(PTE_TO_PHYS(l2));
652 return (&l3p[pmap_l3_index(va)]);
653 }
654
655 /*
656 * Returns the lowest valid pde for a given virtual address.
657 * The next level may or may not point to a valid page or block.
658 */
659 static __inline pd_entry_t *
pmap_pde(pmap_t pmap,vm_offset_t va,int * level)660 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
661 {
662 pd_entry_t *l0, *l1, *l2, desc;
663
664 l0 = pmap_l0(pmap, va);
665 desc = pmap_load(l0) & ATTR_DESCR_MASK;
666 if (desc != L0_TABLE) {
667 *level = -1;
668 return (NULL);
669 }
670
671 l1 = pmap_l0_to_l1(l0, va);
672 desc = pmap_load(l1) & ATTR_DESCR_MASK;
673 if (desc != L1_TABLE) {
674 *level = 0;
675 return (l0);
676 }
677
678 l2 = pmap_l1_to_l2(l1, va);
679 desc = pmap_load(l2) & ATTR_DESCR_MASK;
680 if (desc != L2_TABLE) {
681 *level = 1;
682 return (l1);
683 }
684
685 *level = 2;
686 return (l2);
687 }
688
689 /*
690 * Returns the lowest valid pte block or table entry for a given virtual
691 * address. If there are no valid entries return NULL and set the level to
692 * the first invalid level.
693 */
694 static __inline pt_entry_t *
pmap_pte(pmap_t pmap,vm_offset_t va,int * level)695 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
696 {
697 pd_entry_t *l1, *l2, desc;
698 pt_entry_t *l3;
699
700 l1 = pmap_l1(pmap, va);
701 if (l1 == NULL) {
702 *level = 0;
703 return (NULL);
704 }
705 desc = pmap_load(l1) & ATTR_DESCR_MASK;
706 if (desc == L1_BLOCK) {
707 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
708 *level = 1;
709 return (l1);
710 }
711
712 if (desc != L1_TABLE) {
713 *level = 1;
714 return (NULL);
715 }
716
717 l2 = pmap_l1_to_l2(l1, va);
718 desc = pmap_load(l2) & ATTR_DESCR_MASK;
719 if (desc == L2_BLOCK) {
720 *level = 2;
721 return (l2);
722 }
723
724 if (desc != L2_TABLE) {
725 *level = 2;
726 return (NULL);
727 }
728
729 *level = 3;
730 l3 = pmap_l2_to_l3(l2, va);
731 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
732 return (NULL);
733
734 return (l3);
735 }
736
737 /*
738 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
739 * level that maps the specified virtual address, then a pointer to that entry
740 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled
741 * and a diagnostic message is provided, in which case this function panics.
742 */
743 static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap,vm_offset_t va,int level,const char * diag)744 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
745 {
746 pd_entry_t *l0p, *l1p, *l2p;
747 pt_entry_t desc, *l3p;
748 int walk_level __diagused;
749
750 KASSERT(level >= 0 && level < 4,
751 ("%s: %s passed an out-of-range level (%d)", __func__, diag,
752 level));
753 l0p = pmap_l0(pmap, va);
754 desc = pmap_load(l0p) & ATTR_DESCR_MASK;
755 if (desc == L0_TABLE && level > 0) {
756 l1p = pmap_l0_to_l1(l0p, va);
757 desc = pmap_load(l1p) & ATTR_DESCR_MASK;
758 if (desc == L1_BLOCK && level == 1) {
759 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
760 return (l1p);
761 }
762 if (desc == L1_TABLE && level > 1) {
763 l2p = pmap_l1_to_l2(l1p, va);
764 desc = pmap_load(l2p) & ATTR_DESCR_MASK;
765 if (desc == L2_BLOCK && level == 2)
766 return (l2p);
767 else if (desc == L2_TABLE && level > 2) {
768 l3p = pmap_l2_to_l3(l2p, va);
769 desc = pmap_load(l3p) & ATTR_DESCR_MASK;
770 if (desc == L3_PAGE && level == 3)
771 return (l3p);
772 else
773 walk_level = 3;
774 } else
775 walk_level = 2;
776 } else
777 walk_level = 1;
778 } else
779 walk_level = 0;
780 KASSERT(diag == NULL,
781 ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
782 diag, va, level, desc, walk_level));
783 return (NULL);
784 }
785
786 bool
pmap_ps_enabled(pmap_t pmap)787 pmap_ps_enabled(pmap_t pmap)
788 {
789 /*
790 * Promotion requires a hypervisor call when the kernel is running
791 * in EL1. To stop this disable superpage support on non-stage 1
792 * pmaps for now.
793 */
794 if (pmap->pm_stage != PM_STAGE1)
795 return (false);
796
797 #ifdef KMSAN
798 /*
799 * The break-before-make in pmap_update_entry() results in a situation
800 * where a CPU may call into the KMSAN runtime while the entry is
801 * invalid. If the entry is used to map the current thread structure,
802 * then the runtime will attempt to access unmapped memory. Avoid this
803 * by simply disabling superpage promotion for the kernel map.
804 */
805 if (pmap == kernel_pmap)
806 return (false);
807 #endif
808
809 return (superpages_enabled != 0);
810 }
811
812 bool
pmap_vs_enabled(void)813 pmap_vs_enabled(void)
814 {
815 /*
816 * 8 and 16 are the only values hardware can support, but allow for the
817 * possibility of artificially restricting the bits, e.g. for testing.
818 */
819 KASSERT(vmids.asid_bits <= 16, ("VMID bits %d > 16", vmids.asid_bits));
820 return (vmids.asid_bits > 8);
821 }
822
823 bool
pmap_get_tables(pmap_t pmap,vm_offset_t va,pd_entry_t ** l0,pd_entry_t ** l1,pd_entry_t ** l2,pt_entry_t ** l3)824 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
825 pd_entry_t **l2, pt_entry_t **l3)
826 {
827 pd_entry_t *l0p, *l1p, *l2p;
828
829 if (pmap->pm_l0 == NULL)
830 return (false);
831
832 l0p = pmap_l0(pmap, va);
833 *l0 = l0p;
834
835 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
836 return (false);
837
838 l1p = pmap_l0_to_l1(l0p, va);
839 *l1 = l1p;
840
841 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
842 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
843 *l2 = NULL;
844 *l3 = NULL;
845 return (true);
846 }
847
848 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
849 return (false);
850
851 l2p = pmap_l1_to_l2(l1p, va);
852 *l2 = l2p;
853
854 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
855 *l3 = NULL;
856 return (true);
857 }
858
859 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
860 return (false);
861
862 *l3 = pmap_l2_to_l3(l2p, va);
863
864 return (true);
865 }
866
867 static __inline int
pmap_l3_valid(pt_entry_t l3)868 pmap_l3_valid(pt_entry_t l3)
869 {
870
871 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
872 }
873
/*
 * L1 and L2 block descriptors must share one encoding.  NOTE(review):
 * presumably code elsewhere handles both levels with a single comparison and
 * relies on this -- confirm.
 */
CTASSERT(L1_BLOCK == L2_BLOCK);
875
876 static pt_entry_t
pmap_pte_memattr(pmap_t pmap,vm_memattr_t memattr)877 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
878 {
879 pt_entry_t val;
880
881 if (pmap->pm_stage == PM_STAGE1) {
882 val = ATTR_S1_IDX(memattr);
883 if (memattr == VM_MEMATTR_DEVICE)
884 val |= ATTR_S1_XN;
885 return (val);
886 }
887
888 val = 0;
889
890 switch (memattr) {
891 case VM_MEMATTR_DEVICE:
892 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
893 ATTR_S2_XN(ATTR_S2_XN_ALL));
894 case VM_MEMATTR_UNCACHEABLE:
895 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
896 case VM_MEMATTR_WRITE_BACK:
897 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
898 case VM_MEMATTR_WRITE_THROUGH:
899 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
900 default:
901 panic("%s: invalid memory attribute %x", __func__, memattr);
902 }
903 }
904
905 static pt_entry_t
pmap_pte_prot(pmap_t pmap,vm_prot_t prot)906 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
907 {
908 pt_entry_t val;
909
910 val = 0;
911 if (pmap->pm_stage == PM_STAGE1) {
912 if ((prot & VM_PROT_EXECUTE) == 0)
913 val |= ATTR_S1_XN;
914 if ((prot & VM_PROT_WRITE) == 0)
915 val |= ATTR_S1_AP(ATTR_S1_AP_RO);
916 } else {
917 if ((prot & VM_PROT_WRITE) != 0)
918 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
919 if ((prot & VM_PROT_READ) != 0)
920 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
921 if ((prot & VM_PROT_EXECUTE) == 0)
922 val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
923 }
924
925 return (val);
926 }
927
928 /*
929 * Checks if the PTE is dirty.
930 */
931 static inline int
pmap_pte_dirty(pmap_t pmap,pt_entry_t pte)932 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
933 {
934
935 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
936
937 if (pmap->pm_stage == PM_STAGE1) {
938 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
939 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
940
941 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
942 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
943 }
944
945 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
946 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
947 }
948
949 static __inline void
pmap_resident_count_inc(pmap_t pmap,int count)950 pmap_resident_count_inc(pmap_t pmap, int count)
951 {
952
953 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
954 pmap->pm_stats.resident_count += count;
955 }
956
957 static __inline void
pmap_resident_count_dec(pmap_t pmap,int count)958 pmap_resident_count_dec(pmap_t pmap, int count)
959 {
960
961 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
962 KASSERT(pmap->pm_stats.resident_count >= count,
963 ("pmap %p resident count underflow %ld %d", pmap,
964 pmap->pm_stats.resident_count, count));
965 pmap->pm_stats.resident_count -= count;
966 }
967
968 static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)969 pmap_early_vtophys(vm_offset_t va)
970 {
971 vm_paddr_t pa_page;
972
973 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
974 return (pa_page | (va & PAR_LOW_MASK));
975 }
976
/*
 * State of the bootstrapped DMAP page tables.
 *
 * This is a cursor shared by the pmap_bootstrap_*() helpers: it remembers
 * which table at each level currently covers "va" so that consecutive
 * mappings do not need to walk or allocate tables again.
 */
struct pmap_bootstrap_state {
	pt_entry_t *l1;			/* L1 table covering va, or NULL */
	pt_entry_t *l2;			/* L2 table covering va, or NULL */
	pt_entry_t *l3;			/* L3 table covering va, or NULL */
	vm_offset_t freemempos;		/* Next free address for early allocations */
	vm_offset_t va;			/* Virtual address being mapped */
	vm_paddr_t pa;			/* Physical address being mapped */
	pt_entry_t table_attrs;		/* Attributes or'd into new L1/L2 table entries */
	u_int l0_slot;			/* L0 index of l1; L0_ENTRIES when unset */
	u_int l1_slot;			/* L1 index of l2; Ln_ENTRIES when unset */
	u_int l2_slot;			/* L2 index of l3; Ln_ENTRIES when unset */
	bool dmap_valid;		/* Set once existing tables may be found via the DMAP */
};
991
/*
 * The bootstrap state.
 *
 * The slots start at their one-past-the-end values (L0_ENTRIES/Ln_ENTRIES)
 * so that the first lookup at each level never matches a cached slot.
 * Tables are initially created with TATTR_PXN_TABLE; pmap_bootstrap() clears
 * that attribute before it creates the remaining kernel tables.
 */
static struct pmap_bootstrap_state bs_state = {
	.l1 = NULL,
	.l2 = NULL,
	.l3 = NULL,
	.table_attrs = TATTR_PXN_TABLE,
	.l0_slot = L0_ENTRIES,
	.l1_slot = Ln_ENTRIES,
	.l2_slot = Ln_ENTRIES,
	.dmap_valid = false,
};
1003
/*
 * Make sure state->l1 points at the L1 table that covers state->va.
 *
 * Nothing is done when state->va still falls in the L0 slot recorded in
 * state->l0_slot.  Otherwise the cached lower-level tables and slots are
 * reset and state->l1 is set to either:
 *  - the L1 table already referenced by pagetable_l0_ttbr1, which is only
 *    consulted once state->dmap_valid is set because the table is reached
 *    through PHYS_TO_DMAP(); or
 *  - a newly zeroed page taken from state->freemempos, which is then linked
 *    into pagetable_l0_ttbr1.
 */
static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l1_pa;
	pd_entry_t l0e;
	u_int l0_slot;

	/* Link the level 0 table to a level 1 table */
	l0_slot = pmap_l0_index(state->va);
	if (l0_slot != state->l0_slot) {
		/*
		 * Make sure we move from a low address to high address
		 * before the DMAP region is ready. This ensures we never
		 * modify an existing mapping until we can map from a
		 * physical address to a virtual address.
		 */
		MPASS(state->l0_slot < l0_slot ||
		    state->l0_slot == L0_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l2 = NULL;
		state->l3 = NULL;
		state->l1_slot = Ln_ENTRIES;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L0 entry */
		state->l0_slot = l0_slot;
		if (state->dmap_valid) {
			l0e = pagetable_l0_ttbr1[l0_slot];
			if ((l0e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the table that is already linked. */
				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
				l1_pa = PTE_TO_PHYS(l0e);
				state->l1 = PHYS_TO_DMAP(l1_pa);
				return;
			}
		}

		/* Create a new L0 table entry */
		state->l1 = (pt_entry_t *)state->freemempos;
		memset_early(state->l1, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		/* The new table must be table-aligned and the slot unused. */
		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
	}
	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
}
1055
/*
 * Make sure state->l2 points at the L2 table that covers state->va.
 *
 * The L1 table is set up first through pmap_bootstrap_l0_table().  When
 * state->va has moved to a different L1 slot, state->l2 is set to either
 * the table already linked in the L1 slot (only consulted once
 * state->dmap_valid is set) or a newly zeroed page from state->freemempos
 * that is linked into the slot with state->table_attrs.
 */
static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l2_pa;
	pd_entry_t l1e;
	u_int l1_slot;

	/* Make sure there is a valid L0 -> L1 table */
	pmap_bootstrap_l0_table(state);

	/* Link the level 1 table to a level 2 table */
	l1_slot = pmap_l1_index(state->va);
	if (l1_slot != state->l1_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l1_slot < l1_slot ||
		    state->l1_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l3 = NULL;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L1 entry */
		state->l1_slot = l1_slot;
		if (state->dmap_valid) {
			l1e = state->l1[l1_slot];
			if ((l1e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the table that is already linked. */
				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
				l2_pa = PTE_TO_PHYS(l1e);
				state->l2 = PHYS_TO_DMAP(l2_pa);
				return;
			}
		}

		/* Create a new L1 table entry */
		state->l2 = (pt_entry_t *)state->freemempos;
		memset_early(state->l2, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		/* The new table must be table-aligned and the slot unused. */
		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l1[l1_slot] == 0);
		pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
		    state->table_attrs | L1_TABLE);
	}
	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
}
1103
/*
 * Make sure state->l3 points at the L3 table that covers state->va.
 *
 * The L2 table is set up first through pmap_bootstrap_l1_table().  When
 * state->va has moved to a different L2 slot, state->l3 is set to either
 * the table already linked in the L2 slot (only consulted once
 * state->dmap_valid is set) or a newly zeroed page from state->freemempos
 * that is linked into the slot with state->table_attrs.
 */
static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l3_pa;
	pd_entry_t l2e;
	u_int l2_slot;

	/* Make sure there is a valid L1 -> L2 table */
	pmap_bootstrap_l1_table(state);

	/* Link the level 2 table to a level 3 table */
	l2_slot = pmap_l2_index(state->va);
	if (l2_slot != state->l2_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l2_slot < l2_slot ||
		    state->l2_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Check the existing L2 entry */
		state->l2_slot = l2_slot;
		if (state->dmap_valid) {
			l2e = state->l2[l2_slot];
			if ((l2e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the table that is already linked. */
				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
				l3_pa = PTE_TO_PHYS(l2e);
				state->l3 = PHYS_TO_DMAP(l3_pa);
				return;
			}
		}

		/* Create a new L2 table entry */
		state->l3 = (pt_entry_t *)state->freemempos;
		memset_early(state->l3, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		/* The new table must be table-aligned and the slot unused. */
		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
		    state->table_attrs | L2_TABLE);
	}
	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
}
1147
/*
 * Map DMAP memory with L2 blocks, starting at state->va / state->pa.
 *
 * "i" indexes the physmap[] pair being mapped; physmap[i + 1] is the end of
 * the region.  Blocks are created while at least L2_SIZE bytes of the region
 * remain and state->va is below DMAP_MAX_ADDRESS, but never beyond the L1
 * slot the call started in.  state->va and state->pa are advanced past the
 * memory that was mapped.  Nothing is done when less than L2_SIZE remains.
 */
static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l2_slot;
	bool first;

	if ((physmap[i + 1] - state->pa) < L2_SIZE)
		return;

	/* Make sure there is a valid L1 table */
	pmap_bootstrap_l1_table(state);

	MPASS((state->va & L2_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L2_SIZE;
	    state->va += L2_SIZE, state->pa += L2_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L1 slot can address.
		 */
		if (!first && (state->pa & L1_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L2C_ENTRIES
		 * L2 blocks, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 *
		 * The decision is only re-evaluated at an L2C-aligned
		 * physical address; "contig" keeps its value for the blocks
		 * in between.
		 */
		if ((state->pa & L2C_OFFSET) == 0) {
			if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L2C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l2_slot = pmap_l2_index(state->va);
		MPASS((state->pa & L2_OFFSET) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
	}
	/* va and pa must have advanced in lock step. */
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}
1197
/*
 * Map DMAP memory with L3 pages, starting at state->va / state->pa.
 *
 * "i" indexes the physmap[] pair being mapped; physmap[i + 1] is the end of
 * the region.  Pages are created while at least L3_SIZE bytes of the region
 * remain and state->va is below DMAP_MAX_ADDRESS, but never beyond the L2
 * slot the call started in.  state->va and state->pa are advanced past the
 * memory that was mapped.  Nothing is done when less than L3_SIZE remains.
 */
static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l3_slot;
	bool first;

	if (physmap[i + 1] - state->pa < L3_SIZE)
		return;

	/* Make sure there is a valid L2 table */
	pmap_bootstrap_l2_table(state);

	MPASS((state->va & L3_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    physmap[i + 1] - state->pa >= L3_SIZE;
	    state->va += L3_SIZE, state->pa += L3_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L2 slot can address.
		 */
		if (!first && (state->pa & L2_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
		 * L3 pages, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 *
		 * The decision is only re-evaluated at an L3C-aligned
		 * physical address; "contig" keeps its value for the pages
		 * in between.
		 */
		if ((state->pa & L3C_OFFSET) == 0) {
			if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L3C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l3_slot = pmap_l3_index(state->va);
		MPASS((state->pa & L3_OFFSET) == 0);
		MPASS(state->l3[l3_slot] == 0);
		pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
	}
	/* va and pa must have advanced in lock step. */
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}
1247
/*
 * Create the direct map (DMAP) for every region returned by physmem_avail().
 *
 * "kernlen" gives the length of the kernel image from KERNBASE; page tables
 * are allocated from the first page boundary at or after KERNBASE + kernlen.
 *
 * Each region is mapped with the largest mapping its alignment and length
 * allow: L3 pages up to the first L2 boundary, L2 blocks up to the first L1
 * boundary, L1 blocks (when L1_BLOCKS_SUPPORTED), and then L2 blocks and L3
 * pages again for the tail.  On return bs_state.dmap_valid is set, and the
 * physical range from the kernel start to the last allocated page table has
 * been excluded from allocation.
 */
void
pmap_bootstrap_dmap(vm_size_t kernlen)
{
	vm_paddr_t start_pa, pa;
	uint64_t tcr;
	int i;

	tcr = READ_SPECIALREG(tcr_el1);

	/* Verify that the ASID is set through TTBR0. */
	KASSERT((tcr & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0"));

	if ((tcr & TCR_DS) != 0)
		pmap_lpa_enabled = true;

	pmap_l1_supported = L1_BLOCKS_SUPPORTED;

	/* Remember where the kernel starts for the exclusion below. */
	start_pa = pmap_early_vtophys(KERNBASE);

	bs_state.freemempos = KERNBASE + kernlen;
	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);

	/* Fill in physmap array. */
	physmap_idx = physmem_avail(physmap, nitems(physmap));

	/* The DMAP starts at the L1-aligned base of the first region. */
	dmap_phys_base = physmap[0] & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;

	/* physmap[] holds (start, end) pairs. */
	for (i = 0; i < physmap_idx; i += 2) {
		bs_state.pa = physmap[i] & ~L3_OFFSET;
		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L3 mappings at the start of the region */
		if ((bs_state.pa & L2_OFFSET) != 0)
			pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa <= physmap[i + 1]);

		if (L1_BLOCKS_SUPPORTED) {
			/* Create L2 mappings at the start of the region */
			if ((bs_state.pa & L1_OFFSET) != 0)
				pmap_bootstrap_l2_block(&bs_state, i);
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create the main L1 block mappings */
			for (; bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
				/* Make sure there is a valid L1 table */
				pmap_bootstrap_l0_table(&bs_state);
				MPASS((bs_state.pa & L1_OFFSET) == 0);
				pmap_store(
				    &bs_state.l1[pmap_l1_index(bs_state.va)],
				    PHYS_TO_PTE(bs_state.pa) | ATTR_AF |
				    pmap_sh_attr |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
			}
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create L2 mappings at the end of the region */
			pmap_bootstrap_l2_block(&bs_state, i);
		} else {
			/*
			 * Each call stops at the end of an L1 slot, so keep
			 * calling until less than an L2 block remains.
			 */
			while (bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
				pmap_bootstrap_l2_block(&bs_state, i);
			}
		}
		MPASS(bs_state.pa <= physmap[i + 1]);

		/* Create L3 mappings at the end of the region */
		pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa == physmap[i + 1]);

		/* Track the highest physical address covered by the DMAP. */
		if (bs_state.pa > dmap_phys_max) {
			dmap_phys_max = bs_state.pa;
			dmap_max_addr = bs_state.va;
		}
	}

	pmap_s1_invalidate_all_kernel();

	bs_state.dmap_valid = true;

	/* Exclude the kernel and DMAP region */
	pa = pmap_early_vtophys(bs_state.freemempos);
	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
}
1336
1337 static void
pmap_bootstrap_l2(vm_offset_t va)1338 pmap_bootstrap_l2(vm_offset_t va)
1339 {
1340 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1341
1342 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1343 bs_state.va = va;
1344
1345 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1346 pmap_bootstrap_l1_table(&bs_state);
1347 }
1348
1349 static void
pmap_bootstrap_l3(vm_offset_t va)1350 pmap_bootstrap_l3(vm_offset_t va)
1351 {
1352 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1353
1354 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1355 bs_state.va = va;
1356
1357 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1358 pmap_bootstrap_l2_table(&bs_state);
1359 }
1360
1361 /*
1362 * Bootstrap the system enough to run with virtual memory.
1363 */
1364 void
pmap_bootstrap(void)1365 pmap_bootstrap(void)
1366 {
1367 vm_offset_t dpcpu, msgbufpv;
1368 vm_paddr_t start_pa, pa;
1369 size_t largest_phys_size;
1370
1371 /* Set this early so we can use the pagetable walking functions */
1372 kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1373 mtx_init(&kernel_pmap->pm_mtx, "kernel pmap", NULL, MTX_DEF);
1374 kernel_pmap->pm_l0_paddr =
1375 pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1376 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1377 vm_radix_init(&kernel_pmap->pm_root);
1378 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1379 kernel_pmap->pm_stage = PM_STAGE1;
1380 kernel_pmap->pm_levels = 4;
1381 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1382 kernel_pmap->pm_asid_set = &asids;
1383
1384 /* Reserve some VA space for early BIOS/ACPI mapping */
1385 preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1386
1387 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1388 virtual_avail = roundup2(virtual_avail, L1_SIZE);
1389 virtual_end = VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE - L2_SIZE;
1390 kernel_vm_end = virtual_avail;
1391
1392 /*
1393 * We only use PXN when we know nothing will be executed from it, e.g.
1394 * the DMAP region.
1395 */
1396 bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1397
1398 /*
1399 * Find the physical memory we could use. This needs to be after we
1400 * exclude any memory that is mapped into the DMAP region but should
1401 * not be used by the kernel, e.g. some UEFI memory types.
1402 */
1403 physmap_idx = physmem_avail(physmap, nitems(physmap));
1404
1405 /*
1406 * Find space for early allocations. We search for the largest
1407 * region. This is because the user may choose a large msgbuf.
1408 * This could be smarter, e.g. to allow multiple regions to be
1409 * used & switch to the next when one is full.
1410 */
1411 largest_phys_size = 0;
1412 for (int i = 0; i < physmap_idx; i += 2) {
1413 if ((physmap[i + 1] - physmap[i]) > largest_phys_size) {
1414 largest_phys_size = physmap[i + 1] - physmap[i];
1415 bs_state.freemempos = PHYS_TO_DMAP_ADDR(physmap[i]);
1416 }
1417 }
1418
1419 start_pa = pmap_early_vtophys(bs_state.freemempos);
1420
1421 /*
1422 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the
1423 * loader allocated the first and only l2 page table page used to map
1424 * the kernel, preloaded files and module metadata.
1425 */
1426 pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1427 /* And the l3 tables for the early devmap */
1428 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1429
1430 pmap_s1_invalidate_all_kernel();
1431
1432 #define alloc_pages(var, np) \
1433 (var) = bs_state.freemempos; \
1434 bs_state.freemempos += (np * PAGE_SIZE); \
1435 memset_early((char *)(var), 0, ((np) * PAGE_SIZE));
1436
1437 /* Allocate dynamic per-cpu area. */
1438 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1439 dpcpu_init((void *)dpcpu, 0);
1440
1441 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1442 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1443 msgbufp = (void *)msgbufpv;
1444
1445 /* Allocate space for the CPU0 CMAP */
1446 bs_state.va = virtual_end;
1447 pmap_bootstrap_l2_table(&bs_state);
1448 pmap_store(&bs_state.l3[pmap_l3_index(bs_state.va)],
1449 PHYS_TO_PTE(pmap_early_vtophys((vm_offset_t)bs_state.l3)) |
1450 ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
1451 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L3_PAGE);
1452 dsb(ishst);
1453
1454 mtx_init(&cmap_lock, "SYSMAPS", NULL, MTX_DEF);
1455 cmap1_addr = (void *)(virtual_end + L3_SIZE);
1456 cmap1_pte = &bs_state.l3[pmap_l3_index((vm_offset_t)cmap1_addr)];
1457
1458 pa = pmap_early_vtophys(bs_state.freemempos);
1459
1460 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1461 }
1462
1463 #if defined(KASAN) || defined(KMSAN)
1464 static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa,vm_paddr_t end_pa,vm_offset_t * vap,vm_offset_t eva)1465 pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1466 vm_offset_t *vap, vm_offset_t eva)
1467 {
1468 vm_paddr_t pa;
1469 vm_offset_t va;
1470 pd_entry_t *l2;
1471
1472 va = *vap;
1473 pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1474 for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
1475 l2 = pmap_l2(kernel_pmap, va);
1476
1477 /*
1478 * KASAN stack checking results in us having already allocated
1479 * part of our shadow map, so we can just skip those segments.
1480 */
1481 if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1482 pa += L2_SIZE;
1483 continue;
1484 }
1485
1486 bzero_early(PHYS_TO_DMAP(pa), L2_SIZE);
1487 physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
1488 pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
1489 }
1490 *vap = va;
1491 }
1492
1493 /*
1494 * Finish constructing the initial shadow map:
1495 * - Count how many pages from KERNBASE to virtual_avail (scaled for
1496 * shadow map)
1497 * - Map that entire range using L2 superpages.
1498 */
1499 static void
pmap_bootstrap_san1(vm_offset_t va,int scale)1500 pmap_bootstrap_san1(vm_offset_t va, int scale)
1501 {
1502 vm_offset_t eva;
1503 vm_paddr_t kernstart;
1504 int i;
1505
1506 kernstart = pmap_early_vtophys(KERNBASE);
1507
1508 /*
1509 * Rebuild physmap one more time, we may have excluded more regions from
1510 * allocation since pmap_bootstrap().
1511 */
1512 physmap_idx = physmem_avail(physmap, nitems(physmap));
1513
1514 eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1515
1516 /*
1517 * Find a slot in the physmap large enough for what we needed. We try to put
1518 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1519 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1520 */
1521 for (i = physmap_idx - 2; i >= 0; i -= 2) {
1522 vm_paddr_t plow, phigh;
1523
1524 /* L2 mappings must be backed by memory that is L2-aligned */
1525 plow = roundup2(physmap[i], L2_SIZE);
1526 phigh = physmap[i + 1];
1527 if (plow >= phigh)
1528 continue;
1529 if (kernstart >= plow && kernstart < phigh)
1530 phigh = kernstart;
1531 if (phigh - plow >= L2_SIZE) {
1532 pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1533 if (va >= eva)
1534 break;
1535 }
1536 }
1537 if (i < 0)
1538 panic("Could not find phys region for shadow map");
1539
1540 /*
1541 * Done. We should now have a valid shadow address mapped for all KVA
1542 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1543 * shadow accesses by the sanitizer runtime will succeed for this range.
1544 * When the kernel virtual address range is later expanded, as will
1545 * happen in vm_mem_init(), the shadow map will be grown as well. This
1546 * is handled by pmap_san_enter().
1547 */
1548 }
1549
/*
 * Build the initial sanitizer shadow map(s).
 *
 * With KASAN a single shadow map is built at KASAN_MIN_ADDRESS, scaled by
 * KASAN_SHADOW_SCALE.  Otherwise (KMSAN) two unscaled maps are built, at
 * KMSAN_SHAD_MIN_ADDRESS and KMSAN_ORIG_MIN_ADDRESS.  Each of those first
 * gets an L0 and an L1 table entry pointing at two statically allocated
 * page-table pages.
 */
void
pmap_bootstrap_san(void)
{
#ifdef KASAN
	pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
#else
	/* Two pages each: the first is linked from L0, the second from L1. */
	static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	pd_entry_t *l0, *l1;

	/* The static tables above only provide one L1 entry per map. */
	if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
		panic("initial kernel map is too large");

	/* Shadow map */
	l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);

	/* Origin map */
	l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
#endif
}
1580 #endif
1581
1582 /*
1583 * Initialize a vm_page's machine-dependent fields.
1584 */
1585 void
pmap_page_init(vm_page_t m)1586 pmap_page_init(vm_page_t m)
1587 {
1588
1589 TAILQ_INIT(&m->md.pv_list);
1590 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1591 m->md.pv_flags = 0;
1592 }
1593
1594 static void
pmap_init_asids(struct asid_set * set,int bits)1595 pmap_init_asids(struct asid_set *set, int bits)
1596 {
1597 int i;
1598
1599 set->asid_bits = bits;
1600
1601 /*
1602 * We may be too early in the overall initialization process to use
1603 * bit_alloc().
1604 */
1605 set->asid_set_size = 1 << set->asid_bits;
1606 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1607 M_WAITOK | M_ZERO);
1608 for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1609 bit_set(set->asid_set, i);
1610 set->asid_next = ASID_FIRST_AVAILABLE;
1611 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1612 }
1613
/*
 * Allocate and initialize pv_table, the array of PV list headers with one
 * entry per L2-sized (superpage) frame of every physical segment.
 *
 * Virtual address space is reserved for the whole array first.  Each
 * segment's part is then backed with pages allocated from that segment's
 * domain, and every header's lock and list are initialized.  Finally each
 * segment's md_first is pointed at its first header, and pv_dummy_large is
 * initialized as well.
 */
static void
pmap_init_pv_table(void)
{
	struct vm_phys_seg *seg, *next_seg;
	struct pmap_large_md_page *pvd;
	vm_size_t s;
	int domain, i, j, pages;

	/*
	 * We depend on the size being evenly divisible into a page so
	 * that the pv_table array can be indexed directly while
	 * safely spanning multiple pages from different domains.
	 */
	CTASSERT(PAGE_SIZE % sizeof(*pvd) == 0);

	/*
	 * Calculate the size of the array.
	 */
	s = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		/* Each segment's share is rounded up to whole pages. */
		s += round_page(pages * sizeof(*pvd));
	}
	pv_table = kva_alloc(s);
	if (pv_table == NULL)
		panic("%s: kva_alloc failed\n", __func__);

	/*
	 * Iterate physical segments to allocate domain-local memory for PV
	 * list headers.
	 */
	pvd = pv_table;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		domain = seg->domain;

		s = round_page(pages * sizeof(*pvd));

		/* Back this segment's headers one page at a time. */
		for (j = 0; j < s; j += PAGE_SIZE) {
			vm_page_t m = vm_page_alloc_noobj_domain(domain,
			    VM_ALLOC_ZERO);
			if (m == NULL)
				panic("failed to allocate PV table page");
			pmap_qenter((char *)pvd + j, &m, 1);
		}

		/*
		 * Initialize every header in the pages just mapped,
		 * including those in the rounding slack.
		 */
		for (j = 0; j < s / sizeof(*pvd); j++) {
			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
			TAILQ_INIT(&pvd->pv_page.pv_list);
			pvd++;
		}
	}
	pvd = &pv_dummy_large;
	memset(pvd, 0, sizeof(*pvd));
	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
	TAILQ_INIT(&pvd->pv_page.pv_list);

	/*
	 * Set pointers from vm_phys_segs to pv_table.
	 */
	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		seg->md_first = pvd;
		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);

		/*
		 * If there is a following segment, and the final
		 * superpage of this segment and the initial superpage
		 * of the next segment are the same then adjust the
		 * pv_table entry for that next segment down by one so
		 * that the pv_table entries will be shared.
		 */
		if (i + 1 < vm_phys_nsegs) {
			next_seg = &vm_phys_segs[i + 1];
			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
			    pmap_l2_pindex(next_seg->start)) {
				pvd--;
			}
		}
	}
}
1700
1701 static cpu_feat_en
pmap_dbm_check(const struct cpu_feat * feat __unused,u_int midr __unused)1702 pmap_dbm_check(const struct cpu_feat *feat __unused, u_int midr __unused)
1703 {
1704 uint64_t id_aa64mmfr1;
1705
1706 id_aa64mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1707 if (ID_AA64MMFR1_HAFDBS_VAL(id_aa64mmfr1) >=
1708 ID_AA64MMFR1_HAFDBS_AF_DBS)
1709 return (FEAT_DEFAULT_ENABLE);
1710
1711 return (FEAT_ALWAYS_DISABLE);
1712 }
1713
1714 static bool
pmap_dbm_has_errata(const struct cpu_feat * feat __unused,u_int midr,u_int ** errata_list,u_int * errata_count)1715 pmap_dbm_has_errata(const struct cpu_feat *feat __unused, u_int midr,
1716 u_int **errata_list, u_int *errata_count)
1717 {
1718 /* Disable on Cortex-A55 for erratum 1024718 - all revisions */
1719 if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
1720 CPU_PART(midr) == CPU_PART_CORTEX_A55) {
1721 static u_int errata_id = 1024718;
1722
1723 *errata_list = &errata_id;
1724 *errata_count = 1;
1725 return (true);
1726 }
1727
1728 /* Disable on Cortex-A510 for erratum 2051678 - r0p0 to r0p2 */
1729 if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
1730 0, 0, 0, 2)) {
1731 static u_int errata_id = 2051678;
1732
1733 *errata_list = &errata_id;
1734 *errata_count = 1;
1735 return (true);
1736 }
1737
1738 return (false);
1739 }
1740
/*
 * Enable the feature on the current CPU by setting TCR_HD in TCR_EL1.
 *
 * Returns false, leaving the register untouched, when any erratum was
 * reported for this CPU; returns true once the bit has been set and the
 * local TLB has been flushed.
 */
static bool
pmap_dbm_enable(const struct cpu_feat *feat __unused,
    cpu_feat_errata errata_status, u_int *errata_list __unused,
    u_int errata_count)
{
	uint64_t tcr;

	/* Skip if there is an erratum affecting DBM */
	if (errata_status != ERRATA_NONE)
		return (false);

	/* Read-modify-write so that every other TCR_EL1 bit is preserved. */
	tcr = READ_SPECIALREG(tcr_el1) | TCR_HD;
	WRITE_SPECIALREG(tcr_el1, tcr);
	isb();
	/* Flush the local TLB for the TCR_HD flag change */
	dsb(nshst);
	__asm __volatile("tlbi vmalle1");
	dsb(nsh);
	isb();

	return (true);
}
1763
/*
 * Register the feature with the CPU feature framework, using the check,
 * errata and enable callbacks defined above.
 * NOTE(review): the NULL argument and the exact meaning of the
 * CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU flags are defined by the CPU_FEAT()
 * macro, which is not visible here -- confirm against its definition.
 */
CPU_FEAT(feat_hafdbs, "Hardware management of the Access flag and dirty state",
    pmap_dbm_check, pmap_dbm_has_errata, pmap_dbm_enable, NULL,
    CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU);
1767
1768 static cpu_feat_en
pmap_multiple_tlbi_check(const struct cpu_feat * feat __unused,u_int midr)1769 pmap_multiple_tlbi_check(const struct cpu_feat *feat __unused, u_int midr)
1770 {
1771 /*
1772 * Cortex-A55 erratum 2441007 (Cat B rare)
1773 * Present in all revisions
1774 */
1775 if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
1776 CPU_PART(midr) == CPU_PART_CORTEX_A55)
1777 return (FEAT_DEFAULT_DISABLE);
1778
1779 /*
1780 * Cortex-A76 erratum 1286807 (Cat B rare)
1781 * Present in r0p0 - r3p0
1782 * Fixed in r3p1
1783 */
1784 if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A76,
1785 0, 0, 3, 0))
1786 return (FEAT_DEFAULT_DISABLE);
1787
1788 /*
1789 * Cortex-A510 erratum 2441009 (Cat B rare)
1790 * Present in r0p0 - r1p1
1791 * Fixed in r1p2
1792 */
1793 if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
1794 0, 0, 1, 1))
1795 return (FEAT_DEFAULT_DISABLE);
1796
1797 return (FEAT_ALWAYS_DISABLE);
1798 }
1799
1800 static bool
pmap_multiple_tlbi_enable(const struct cpu_feat * feat __unused,cpu_feat_errata errata_status,u_int * errata_list __unused,u_int errata_count __unused)1801 pmap_multiple_tlbi_enable(const struct cpu_feat *feat __unused,
1802 cpu_feat_errata errata_status, u_int *errata_list __unused,
1803 u_int errata_count __unused)
1804 {
1805 pmap_multiple_tlbi = true;
1806 return (true);
1807 }
1808
/*
 * Register the workaround with the CPU feature framework.  No errata
 * callback is supplied; the affected CPUs are identified by the check
 * callback itself.
 * NOTE(review): the exact meaning of CPU_FEAT_EARLY_BOOT | CPU_FEAT_PER_CPU
 * is defined by the CPU_FEAT() macro, which is not visible here -- confirm
 * against its definition.
 */
CPU_FEAT(errata_multi_tlbi, "Multiple TLBI errata",
    pmap_multiple_tlbi_check, NULL, pmap_multiple_tlbi_enable, NULL,
    CPU_FEAT_EARLY_BOOT | CPU_FEAT_PER_CPU);
1812
1813 /*
1814 * Initialize the pmap module.
1815 *
1816 * Called by vm_mem_init(), to initialize any structures that the pmap
1817 * system needs to map virtual memory.
1818 */
1819 void
pmap_init(void)1820 pmap_init(void)
1821 {
1822 uint64_t mmfr1;
1823 int i, vmid_bits;
1824
1825 /*
1826 * Are large page mappings enabled?
1827 */
1828 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1829 if (superpages_enabled) {
1830 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1831 ("pmap_init: can't assign to pagesizes[1]"));
1832 pagesizes[1] = L3C_SIZE;
1833 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1834 ("pmap_init: can't assign to pagesizes[2]"));
1835 pagesizes[2] = L2_SIZE;
1836 if (L1_BLOCKS_SUPPORTED) {
1837 KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
1838 ("pmap_init: can't assign to pagesizes[3]"));
1839 pagesizes[3] = L1_SIZE;
1840 }
1841 }
1842
1843 /*
1844 * Initialize the ASID allocator.
1845 */
1846 pmap_init_asids(&asids,
1847 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1848
1849 if (has_hyp()) {
1850 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1851 vmid_bits = 8;
1852
1853 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1854 ID_AA64MMFR1_VMIDBits_16)
1855 vmid_bits = 16;
1856 pmap_init_asids(&vmids, vmid_bits);
1857 }
1858
1859 /*
1860 * Initialize pv chunk lists.
1861 */
1862 for (i = 0; i < PMAP_MEMDOM; i++) {
1863 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1864 MTX_DEF);
1865 TAILQ_INIT(&pv_chunks[i].pvc_list);
1866 }
1867 pmap_init_pv_table();
1868
1869 vm_initialized = 1;
1870 }
1871
/*
 * vm.pmap.l1: read-only statistics and capabilities for L1 block mappings.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L1 (1GB/64GB) page mapping counters");

/* vm.pmap.l1.demotions */
static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");

/* vm.pmap.l1.supported: exports pmap_l1_supported */
SYSCTL_BOOL(_vm_pmap_l1, OID_AUTO, supported, CTLFLAG_RD, &pmap_l1_supported,
    0, "L1 blocks are supported");
1881
/*
 * vm.pmap.l2c: read-only statistics for L2C mappings.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L2C (32MB/1GB) page mapping counters");

/* vm.pmap.l2c.demotions */
static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");
1888
1889 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1890 "2MB page mapping counters");
1891
1892 static COUNTER_U64_DEFINE_EARLY(pmap_l2_demotions);
1893 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1894 &pmap_l2_demotions, "L2 (2MB/32MB) page demotions");
1895
1896 static COUNTER_U64_DEFINE_EARLY(pmap_l2_mappings);
1897 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1898 &pmap_l2_mappings, "L2 (2MB/32MB) page mappings");
1899
1900 static COUNTER_U64_DEFINE_EARLY(pmap_l2_p_failures);
1901 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1902 &pmap_l2_p_failures, "L2 (2MB/32MB) page promotion failures");
1903
1904 static COUNTER_U64_DEFINE_EARLY(pmap_l2_promotions);
1905 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1906 &pmap_l2_promotions, "L2 (2MB/32MB) page promotions");
1907
/*
 * vm.pmap.l3c sysctl subtree: counters for L3C mappings.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L3C (64KB/2MB) page mapping counters");

/* Count of L3C demotions. */
static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");

/* Count of L3C mappings. */
static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");

/* Count of failed L3C promotion attempts. */
static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");

/* Count of L3C promotions. */
static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1926
/*
 * Issue one inner-shareable stage 1 TLB invalidation for the pre-encoded
 * operand "r" without matching on an ASID.
 *
 * When "final_only" is true, only a cached final-level entry, i.e., an
 * L{1,2}_BLOCK or L3_PAGE entry, is invalidated.  When it is false, any
 * cached intermediate-level entries, i.e., L{0,1,2}_TABLE entries, are
 * invalidated as well.
 */
static __inline void
pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
{
	if (!final_only) {
		/* Final- and intermediate-level entries. */
		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
	} else {
		/* Final-level entry only. */
		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
	}
}
1941
/*
 * Issue one inner-shareable stage 1 TLB invalidation for the pre-encoded
 * operand "r".  "tlbi vale1is" is used when "final_only" is true and
 * "tlbi vae1is" otherwise; the caller is expected to have merged the ASID
 * into "r".
 */
static __inline void
pmap_s1_invalidate_user(uint64_t r, bool final_only)
{
	if (!final_only) {
		/* Final- and intermediate-level entries. */
		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
	} else {
		/* Final-level entry only. */
		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
	}
}
1950
1951 /*
1952 * Invalidates any cached final- and optionally intermediate-level TLB entries
1953 * for the specified virtual address in the given virtual address space.
1954 */
1955 static __inline void
pmap_s1_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1956 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1957 {
1958 uint64_t r;
1959
1960 PMAP_ASSERT_STAGE1(pmap);
1961
1962 dsb(ishst);
1963 r = TLBI_VA(va);
1964 if (pmap == kernel_pmap) {
1965 pmap_s1_invalidate_kernel(r, final_only);
1966 } else {
1967 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1968 pmap_s1_invalidate_user(r, final_only);
1969 }
1970 if (pmap_multiple_tlbi) {
1971 dsb(ish);
1972 __asm __volatile("tlbi vale1is, xzr" ::: "memory");
1973 }
1974 dsb(ish);
1975 isb();
1976 }
1977
1978 static __inline void
pmap_s2_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1979 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1980 {
1981 PMAP_ASSERT_STAGE2(pmap);
1982 MPASS(pmap_stage2_invalidate_range != NULL);
1983 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1984 final_only);
1985 }
1986
1987 static __inline void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1988 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1989 {
1990 if (pmap->pm_stage == PM_STAGE1)
1991 pmap_s1_invalidate_page(pmap, va, final_only);
1992 else
1993 pmap_s2_invalidate_page(pmap, va, final_only);
1994 }
1995
1996 /*
1997 * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK
1998 * mappings. Otherwise, use stride L3_SIZE.
1999 */
2000 static __inline void
pmap_s1_invalidate_strided(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_offset_t stride,bool final_only)2001 pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2002 vm_offset_t stride, bool final_only)
2003 {
2004 uint64_t end, r, start;
2005
2006 PMAP_ASSERT_STAGE1(pmap);
2007
2008 dsb(ishst);
2009 if (pmap == kernel_pmap) {
2010 start = TLBI_VA(sva);
2011 end = TLBI_VA(eva);
2012 for (r = start; r < end; r += TLBI_VA(stride))
2013 pmap_s1_invalidate_kernel(r, final_only);
2014 } else {
2015 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
2016 start |= TLBI_VA(sva);
2017 end |= TLBI_VA(eva);
2018 for (r = start; r < end; r += TLBI_VA(stride))
2019 pmap_s1_invalidate_user(r, final_only);
2020 }
2021 if (pmap_multiple_tlbi) {
2022 dsb(ish);
2023 __asm __volatile("tlbi vale1is, xzr" ::: "memory");
2024 }
2025 dsb(ish);
2026 isb();
2027 }
2028
2029 /*
2030 * Invalidates any cached final- and optionally intermediate-level TLB entries
2031 * for the specified virtual address range in the given virtual address space.
2032 */
2033 static __inline void
pmap_s1_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)2034 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2035 bool final_only)
2036 {
2037 pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only);
2038 }
2039
2040 static __inline void
pmap_s2_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)2041 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2042 bool final_only)
2043 {
2044 PMAP_ASSERT_STAGE2(pmap);
2045 MPASS(pmap_stage2_invalidate_range != NULL);
2046 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
2047 }
2048
2049 static __inline void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)2050 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2051 bool final_only)
2052 {
2053 if (pmap->pm_stage == PM_STAGE1)
2054 pmap_s1_invalidate_range(pmap, sva, eva, final_only);
2055 else
2056 pmap_s2_invalidate_range(pmap, sva, eva, final_only);
2057 }
2058
2059 void
pmap_s1_invalidate_all_kernel(void)2060 pmap_s1_invalidate_all_kernel(void)
2061 {
2062 dsb(ishst);
2063 __asm __volatile("tlbi vmalle1is");
2064 if (pmap_multiple_tlbi) {
2065 dsb(ish);
2066 __asm __volatile("tlbi vale1is, xzr" ::: "memory");
2067 }
2068 dsb(ish);
2069 isb();
2070 }
2071
2072 /*
2073 * Invalidates all cached intermediate- and final-level TLB entries for the
2074 * given virtual address space.
2075 */
2076 static __inline void
pmap_s1_invalidate_all(pmap_t pmap)2077 pmap_s1_invalidate_all(pmap_t pmap)
2078 {
2079 uint64_t r;
2080
2081 PMAP_ASSERT_STAGE1(pmap);
2082
2083 dsb(ishst);
2084 if (pmap == kernel_pmap) {
2085 __asm __volatile("tlbi vmalle1is");
2086 } else {
2087 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
2088 __asm __volatile("tlbi aside1is, %0" : : "r" (r));
2089 }
2090 if (pmap_multiple_tlbi) {
2091 dsb(ish);
2092 __asm __volatile("tlbi vale1is, xzr" ::: "memory");
2093 }
2094 dsb(ish);
2095 isb();
2096 }
2097
2098 static __inline void
pmap_s2_invalidate_all(pmap_t pmap)2099 pmap_s2_invalidate_all(pmap_t pmap)
2100 {
2101 PMAP_ASSERT_STAGE2(pmap);
2102 MPASS(pmap_stage2_invalidate_all != NULL);
2103 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
2104 }
2105
2106 static __inline void
pmap_invalidate_all(pmap_t pmap)2107 pmap_invalidate_all(pmap_t pmap)
2108 {
2109 if (pmap->pm_stage == PM_STAGE1)
2110 pmap_s1_invalidate_all(pmap);
2111 else
2112 pmap_s2_invalidate_all(pmap);
2113 }
2114
2115 /*
2116 * Routine: pmap_extract
2117 * Function:
2118 * Extract the physical page address associated
2119 * with the given map/virtual_address pair.
2120 */
2121 vm_paddr_t
pmap_extract(pmap_t pmap,vm_offset_t va)2122 pmap_extract(pmap_t pmap, vm_offset_t va)
2123 {
2124 pt_entry_t *pte, tpte;
2125 vm_paddr_t pa;
2126 int lvl;
2127
2128 pa = 0;
2129 PMAP_LOCK(pmap);
2130 /*
2131 * Find the block or page map for this virtual address. pmap_pte
2132 * will return either a valid block/page entry, or NULL.
2133 */
2134 pte = pmap_pte(pmap, va, &lvl);
2135 if (pte != NULL) {
2136 tpte = pmap_load(pte);
2137 pa = PTE_TO_PHYS(tpte);
2138 switch(lvl) {
2139 case 1:
2140 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
2141 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
2142 ("pmap_extract: Invalid L1 pte found: %lx",
2143 tpte & ATTR_DESCR_MASK));
2144 pa |= (va & L1_OFFSET);
2145 break;
2146 case 2:
2147 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
2148 ("pmap_extract: Invalid L2 pte found: %lx",
2149 tpte & ATTR_DESCR_MASK));
2150 pa |= (va & L2_OFFSET);
2151 break;
2152 case 3:
2153 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
2154 ("pmap_extract: Invalid L3 pte found: %lx",
2155 tpte & ATTR_DESCR_MASK));
2156 pa |= (va & L3_OFFSET);
2157 break;
2158 }
2159 }
2160 PMAP_UNLOCK(pmap);
2161 return (pa);
2162 }
2163
2164 /*
2165 * Routine: pmap_extract_and_hold
2166 * Function:
2167 * Atomically extract and hold the physical page
2168 * with the given pmap and virtual address pair
2169 * if that mapping permits the given protection.
2170 */
2171 vm_page_t
pmap_extract_and_hold(pmap_t pmap,vm_offset_t va,vm_prot_t prot)2172 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
2173 {
2174 pt_entry_t *pte, tpte;
2175 vm_offset_t off;
2176 vm_page_t m;
2177 int lvl;
2178 bool use;
2179
2180 m = NULL;
2181 PMAP_LOCK(pmap);
2182 pte = pmap_pte(pmap, va, &lvl);
2183 if (pte != NULL) {
2184 tpte = pmap_load(pte);
2185
2186 KASSERT(lvl > 0 && lvl <= 3,
2187 ("pmap_extract_and_hold: Invalid level %d", lvl));
2188 /*
2189 * Check that the pte is either a L3 page, or a L1 or L2 block
2190 * entry. We can assume L1_BLOCK == L2_BLOCK.
2191 */
2192 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
2193 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
2194 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
2195 tpte & ATTR_DESCR_MASK));
2196
2197 use = false;
2198 if ((prot & VM_PROT_WRITE) == 0)
2199 use = true;
2200 else if (pmap->pm_stage == PM_STAGE1 &&
2201 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
2202 use = true;
2203 else if (pmap->pm_stage == PM_STAGE2 &&
2204 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
2205 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
2206 use = true;
2207
2208 if (use) {
2209 switch (lvl) {
2210 case 1:
2211 off = va & L1_OFFSET;
2212 break;
2213 case 2:
2214 off = va & L2_OFFSET;
2215 break;
2216 case 3:
2217 default:
2218 off = 0;
2219 }
2220 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
2221 if (m != NULL && !vm_page_wire_mapped(m))
2222 m = NULL;
2223 }
2224 }
2225 PMAP_UNLOCK(pmap);
2226 return (m);
2227 }
2228
2229 /*
2230 * Returns true if the entire kernel virtual address range is mapped
2231 */
2232 static bool
pmap_kmapped_range(void * va,vm_size_t size)2233 pmap_kmapped_range(void *va, vm_size_t size)
2234 {
2235 pt_entry_t *pte, tpte;
2236 vm_offset_t eva, sva;
2237
2238 sva = (vm_offset_t)va;
2239 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS,
2240 ("%s: Invalid virtual address: %lx", __func__, sva));
2241 MPASS(size != 0);
2242 eva = sva + size - 1;
2243 KASSERT(eva > sva, ("%s: Size too large: sva %lx, size %lx", __func__,
2244 sva, size));
2245
2246 while (sva <= eva) {
2247 pte = pmap_l1(kernel_pmap, sva);
2248 if (pte == NULL)
2249 return (false);
2250 tpte = pmap_load(pte);
2251 if (tpte == 0)
2252 return (false);
2253 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2254 sva = (sva & ~L1_OFFSET) + L1_SIZE;
2255 continue;
2256 }
2257
2258 pte = pmap_l1_to_l2(&tpte, sva);
2259 tpte = pmap_load(pte);
2260 if (tpte == 0)
2261 return (false);
2262 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2263 sva = (sva & ~L2_OFFSET) + L2_SIZE;
2264 continue;
2265 }
2266 pte = pmap_l2_to_l3(&tpte, sva);
2267 tpte = pmap_load(pte);
2268 if (tpte == 0)
2269 return (false);
2270 MPASS((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_PAGE);
2271 if ((tpte & ATTR_CONTIGUOUS) == ATTR_CONTIGUOUS)
2272 sva = (sva & ~L3C_OFFSET) + L3C_SIZE;
2273 else
2274 sva = (sva & ~L3_OFFSET) + L3_SIZE;
2275 }
2276
2277 return (true);
2278 }
2279
2280 /*
2281 * Walks the page tables to translate a kernel virtual address to a
2282 * physical address. Returns true if the kva is valid and stores the
2283 * physical address in pa if it is not NULL.
2284 *
2285 * See the comment above data_abort() for the rationale for specifying
2286 * NO_PERTHREAD_SSP here.
2287 */
2288 bool NO_PERTHREAD_SSP
pmap_klookup(vm_offset_t va,vm_paddr_t * pa)2289 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
2290 {
2291 pt_entry_t *pte, tpte;
2292 register_t intr;
2293 uint64_t par;
2294
2295 /*
2296 * Disable interrupts so we don't get interrupted between asking
2297 * for address translation, and getting the result back.
2298 */
2299 intr = intr_disable();
2300 par = arm64_address_translate_s1e1r(va);
2301 intr_restore(intr);
2302
2303 if (PAR_SUCCESS(par)) {
2304 if (pa != NULL)
2305 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2306 return (true);
2307 }
2308
2309 /*
2310 * Fall back to walking the page table. The address translation
2311 * instruction may fail when the page is in a break-before-make
2312 * sequence. As we only clear the valid bit in said sequence we
2313 * can walk the page table to find the physical address.
2314 */
2315
2316 pte = pmap_l1(kernel_pmap, va);
2317 if (pte == NULL)
2318 return (false);
2319
2320 /*
2321 * A concurrent pmap_update_entry() will clear the entry's valid bit
2322 * but leave the rest of the entry unchanged. Therefore, we treat a
2323 * non-zero entry as being valid, and we ignore the valid bit when
2324 * determining whether the entry maps a block, page, or table.
2325 */
2326 tpte = pmap_load(pte);
2327 if (tpte == 0)
2328 return (false);
2329 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2330 if (pa != NULL)
2331 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2332 return (true);
2333 }
2334 pte = pmap_l1_to_l2(&tpte, va);
2335 tpte = pmap_load(pte);
2336 if (tpte == 0)
2337 return (false);
2338 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2339 if (pa != NULL)
2340 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2341 return (true);
2342 }
2343 pte = pmap_l2_to_l3(&tpte, va);
2344 tpte = pmap_load(pte);
2345 if (tpte == 0)
2346 return (false);
2347 if (pa != NULL)
2348 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2349 return (true);
2350 }
2351
2352 /*
2353 * Routine: pmap_kextract
2354 * Function:
2355 * Extract the physical page address associated with the given kernel
2356 * virtual address.
2357 */
2358 vm_paddr_t
pmap_kextract(vm_offset_t va)2359 pmap_kextract(vm_offset_t va)
2360 {
2361 vm_paddr_t pa;
2362
2363 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2364 return (DMAP_TO_PHYS(va));
2365
2366 if (pmap_klookup(va, &pa) == false)
2367 return (0);
2368 return (pa);
2369 }
2370
2371 /***************************************************
2372 * Low level mapping routines.....
2373 ***************************************************/
2374
2375 void
pmap_kenter(vm_offset_t sva,vm_size_t size,vm_paddr_t pa,int mode)2376 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
2377 {
2378 pd_entry_t *pde;
2379 pt_entry_t attr, old_l3e, *pte;
2380 vm_offset_t va;
2381 vm_page_t mpte;
2382 int error, lvl;
2383
2384 KASSERT((pa & L3_OFFSET) == 0,
2385 ("pmap_kenter: Invalid physical address"));
2386 KASSERT((sva & L3_OFFSET) == 0,
2387 ("pmap_kenter: Invalid virtual address"));
2388 KASSERT((size & PAGE_MASK) == 0,
2389 ("pmap_kenter: Mapping is not page-sized"));
2390
2391 /* CCA - Map devices as nonsecure */
2392 if (in_realm() && (mode == VM_MEMATTR_DEVICE ||
2393 mode == VM_MEMATTR_DEVICE_NP))
2394 pa |= prot_ns_shared_pa;
2395
2396 attr = ATTR_AF | pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) |
2397 ATTR_S1_XN | ATTR_KERN_GP | ATTR_S1_IDX(mode);
2398 old_l3e = 0;
2399 va = sva;
2400 while (size != 0) {
2401 pde = pmap_pde(kernel_pmap, va, &lvl);
2402 KASSERT(pde != NULL,
2403 ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
2404 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2405
2406 /*
2407 * If we have an aligned, contiguous chunk of L2_SIZE, try
2408 * to create an L2_BLOCK mapping.
2409 */
2410 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
2411 (pa & L2_OFFSET) == 0 && vm_initialized) {
2412 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
2413 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
2414 ("pmap_kenter: Unexpected mapping"));
2415 PMAP_LOCK(kernel_pmap);
2416 error = pmap_insert_pt_page(kernel_pmap, mpte, false,
2417 false);
2418 if (error == 0) {
2419 attr &= ~ATTR_CONTIGUOUS;
2420
2421 /*
2422 * Although the page table page "mpte" should
2423 * be devoid of mappings, the TLB might hold
2424 * intermediate entries that reference it, so
2425 * we perform a single-page invalidation.
2426 */
2427 pmap_update_entry(kernel_pmap, pde,
2428 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
2429 PAGE_SIZE);
2430 }
2431 PMAP_UNLOCK(kernel_pmap);
2432 if (error == 0) {
2433 va += L2_SIZE;
2434 pa += L2_SIZE;
2435 size -= L2_SIZE;
2436 continue;
2437 }
2438 }
2439
2440 /*
2441 * If we have an aligned, contiguous chunk of L3C_ENTRIES
2442 * L3 pages, set the contiguous bit within each PTE so that
2443 * the chunk can be cached using only one TLB entry.
2444 */
2445 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
2446 if (size >= L3C_SIZE)
2447 attr |= ATTR_CONTIGUOUS;
2448 else
2449 attr &= ~ATTR_CONTIGUOUS;
2450 }
2451
2452 pte = pmap_l2_to_l3(pde, va);
2453 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
2454 L3_PAGE);
2455
2456 va += PAGE_SIZE;
2457 pa += PAGE_SIZE;
2458 size -= PAGE_SIZE;
2459 }
2460 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2461 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2462 else {
2463 /*
2464 * Because the old entries were invalid and the new mappings
2465 * are not executable, an isb is not required.
2466 */
2467 dsb(ishst);
2468 }
2469 }
2470
2471 void
pmap_kenter_device(vm_offset_t sva,vm_size_t size,vm_paddr_t pa)2472 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2473 {
2474
2475 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2476 }
2477
2478 /*
2479 * Remove a page from the kernel pagetables.
2480 */
2481 void
pmap_kremove(vm_offset_t va)2482 pmap_kremove(vm_offset_t va)
2483 {
2484 pt_entry_t *pte;
2485
2486 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2487 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2488 ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2489 pmap_clear(pte);
2490 pmap_s1_invalidate_page(kernel_pmap, va, true);
2491 }
2492
2493 /*
2494 * Remove the specified range of mappings from the kernel address space.
2495 *
2496 * Should only be applied to mappings that were created by pmap_kenter() or
2497 * pmap_kenter_device(). Nothing about this function is actually specific
2498 * to device mappings.
2499 */
2500 void
pmap_kremove_device(vm_offset_t sva,vm_size_t size)2501 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2502 {
2503 pt_entry_t *ptep, *ptep_end;
2504 vm_offset_t va;
2505 int lvl;
2506
2507 KASSERT((sva & L3_OFFSET) == 0,
2508 ("pmap_kremove_device: Invalid virtual address"));
2509 KASSERT((size & PAGE_MASK) == 0,
2510 ("pmap_kremove_device: Mapping is not page-sized"));
2511
2512 va = sva;
2513 while (size != 0) {
2514 ptep = pmap_pte(kernel_pmap, va, &lvl);
2515 KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2516 switch (lvl) {
2517 case 2:
2518 KASSERT((va & L2_OFFSET) == 0,
2519 ("Unaligned virtual address"));
2520 KASSERT(size >= L2_SIZE, ("Insufficient size"));
2521
2522 if (va != sva) {
2523 pmap_s1_invalidate_range(kernel_pmap, sva, va,
2524 true);
2525 }
2526 pmap_clear(ptep);
2527 pmap_s1_invalidate_page(kernel_pmap, va, true);
2528 PMAP_LOCK(kernel_pmap);
2529 pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2530 PMAP_UNLOCK(kernel_pmap);
2531
2532 va += L2_SIZE;
2533 sva = va;
2534 size -= L2_SIZE;
2535 break;
2536 case 3:
2537 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2538 KASSERT((va & L3C_OFFSET) == 0,
2539 ("Unaligned L3C virtual address"));
2540 KASSERT(size >= L3C_SIZE,
2541 ("Insufficient L3C size"));
2542
2543 ptep_end = ptep + L3C_ENTRIES;
2544 for (; ptep < ptep_end; ptep++)
2545 pmap_clear(ptep);
2546
2547 va += L3C_SIZE;
2548 size -= L3C_SIZE;
2549 break;
2550 }
2551 pmap_clear(ptep);
2552
2553 va += PAGE_SIZE;
2554 size -= PAGE_SIZE;
2555 break;
2556 default:
2557 __assert_unreachable();
2558 break;
2559 }
2560 }
2561 if (va != sva)
2562 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2563 }
2564
2565 /*
2566 * Used to map a range of physical addresses into kernel
2567 * virtual address space.
2568 *
2569 * The value passed in '*virt' is a suggested virtual address for
2570 * the mapping. Architectures which can support a direct-mapped
2571 * physical to virtual region can return the appropriate address
2572 * within that region, leaving '*virt' unchanged. Other
2573 * architectures should map the pages starting at '*virt' and
2574 * update '*virt' with the first usable address after the mapped
2575 * region.
2576 */
2577 void *
pmap_map(vm_offset_t * virt,vm_paddr_t start,vm_paddr_t end,int prot)2578 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2579 {
2580 return (PHYS_TO_DMAP(start));
2581 }
2582
2583 /*
2584 * Add a list of wired pages to the kva
2585 * this routine is only used for temporary
2586 * kernel mappings that do not need to have
2587 * page modification or references recorded.
2588 * Note that old mappings are simply written
2589 * over. The page *must* be wired.
2590 * Note: SMP coherent. Uses a ranged shootdown IPI.
2591 */
2592 void
pmap_qenter(void * sva,vm_page_t * ma,int count)2593 pmap_qenter(void *sva, vm_page_t *ma, int count)
2594 {
2595 pd_entry_t *pde;
2596 pt_entry_t attr, old_l3e, *pte;
2597 vm_offset_t va;
2598 vm_page_t m;
2599 int i, lvl;
2600
2601 old_l3e = 0;
2602 va = (vm_offset_t)sva;
2603 for (i = 0; i < count; i++) {
2604 pde = pmap_pde(kernel_pmap, va, &lvl);
2605 KASSERT(pde != NULL,
2606 ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2607 KASSERT(lvl == 2,
2608 ("pmap_qenter: Invalid level %d", lvl));
2609
2610 m = ma[i];
2611 attr = ATTR_AF | pmap_sh_attr |
2612 ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2613 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2614 pte = pmap_l2_to_l3(pde, va);
2615 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2616
2617 va += L3_SIZE;
2618 }
2619 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2620 pmap_s1_invalidate_range(kernel_pmap, (vm_offset_t)sva, va,
2621 true);
2622 else {
2623 /*
2624 * Because the old entries were invalid and the new mappings
2625 * are not executable, an isb is not required.
2626 */
2627 dsb(ishst);
2628 }
2629 }
2630
2631 /*
2632 * This routine tears out page mappings from the
2633 * kernel -- it is meant only for temporary mappings.
2634 */
2635 void
pmap_qremove(void * sva,int count)2636 pmap_qremove(void *sva, int count)
2637 {
2638 pt_entry_t *pte;
2639 vm_offset_t va;
2640
2641 va = (vm_offset_t)sva;
2642
2643 KASSERT(ADDR_IS_CANONICAL(va),
2644 ("%s: Address not in canonical form: %p", __func__, sva));
2645 KASSERT(ADDR_IS_KERNEL(va), ("usermode va %p", sva));
2646
2647 while (count-- > 0) {
2648 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2649 if (pte != NULL) {
2650 pmap_clear(pte);
2651 }
2652
2653 va += PAGE_SIZE;
2654 }
2655 pmap_s1_invalidate_range(kernel_pmap, (vm_offset_t)sva, va, true);
2656 }
2657
2658 /***************************************************
2659 * Page table page management routines.....
2660 ***************************************************/
2661 /*
2662 * Schedule the specified unused page table page to be freed. Specifically,
2663 * add the page to the specified list of pages that will be released to the
2664 * physical memory manager after the TLB has been updated.
2665 */
2666 static __inline void
pmap_add_delayed_free_list(vm_page_t m,struct spglist * free,bool set_PG_ZERO)2667 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2668 {
2669
2670 if (set_PG_ZERO)
2671 m->flags |= PG_ZERO;
2672 else
2673 m->flags &= ~PG_ZERO;
2674 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2675 }
2676
2677 /*
2678 * Decrements a page table page's reference count, which is used to record the
2679 * number of valid page table entries within the page. If the reference count
2680 * drops to zero, then the page table page is unmapped. Returns true if the
2681 * page table page was unmapped and false otherwise.
2682 */
2683 static inline bool
pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)2684 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2685 {
2686
2687 --m->ref_count;
2688 if (m->ref_count == 0) {
2689 _pmap_unwire_l3(pmap, va, m, free);
2690 return (true);
2691 } else
2692 return (false);
2693 }
2694
2695 static void
_pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)2696 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2697 {
2698
2699 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2700 /*
2701 * unmap the page table page
2702 */
2703 if (m->pindex >= (NUL2E + NUL1E)) {
2704 /* l1 page */
2705 pd_entry_t *l0;
2706
2707 l0 = pmap_l0(pmap, va);
2708 pmap_clear(l0);
2709 } else if (m->pindex >= NUL2E) {
2710 /* l2 page */
2711 pd_entry_t *l1;
2712
2713 l1 = pmap_l1(pmap, va);
2714 pmap_clear(l1);
2715 } else {
2716 /* l3 page */
2717 pd_entry_t *l2;
2718
2719 l2 = pmap_l2(pmap, va);
2720 pmap_clear(l2);
2721 }
2722 pmap_resident_count_dec(pmap, 1);
2723 if (m->pindex < NUL2E) {
2724 /* We just released an l3, unhold the matching l2 */
2725 pd_entry_t *l1, tl1;
2726 vm_page_t l2pg;
2727
2728 l1 = pmap_l1(pmap, va);
2729 tl1 = pmap_load(l1);
2730 l2pg = PTE_TO_VM_PAGE(tl1);
2731 pmap_unwire_l3(pmap, va, l2pg, free);
2732 } else if (m->pindex < (NUL2E + NUL1E)) {
2733 /* We just released an l2, unhold the matching l1 */
2734 pd_entry_t *l0, tl0;
2735 vm_page_t l1pg;
2736
2737 l0 = pmap_l0(pmap, va);
2738 tl0 = pmap_load(l0);
2739 l1pg = PTE_TO_VM_PAGE(tl0);
2740 pmap_unwire_l3(pmap, va, l1pg, free);
2741 }
2742 pmap_invalidate_page(pmap, va, false);
2743
2744 /*
2745 * Put page on a list so that it is released after
2746 * *ALL* TLB shootdown is done
2747 */
2748 pmap_add_delayed_free_list(m, free, true);
2749 }
2750
2751 /*
2752 * After removing a page table entry, this routine is used to
2753 * conditionally free the page, and manage the reference count.
2754 */
2755 static int
pmap_unuse_pt(pmap_t pmap,vm_offset_t va,pd_entry_t ptepde,struct spglist * free)2756 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2757 struct spglist *free)
2758 {
2759 vm_page_t mpte;
2760
2761 KASSERT(ADDR_IS_CANONICAL(va),
2762 ("%s: Address not in canonical form: %lx", __func__, va));
2763 if (ADDR_IS_KERNEL(va))
2764 return (0);
2765 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2766 mpte = PTE_TO_VM_PAGE(ptepde);
2767 return (pmap_unwire_l3(pmap, va, mpte, free));
2768 }
2769
2770 /*
2771 * Release a page table page reference after a failed attempt to create a
2772 * mapping.
2773 */
2774 static void
pmap_abort_ptp(pmap_t pmap,vm_offset_t va,vm_page_t mpte)2775 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2776 {
2777 struct spglist free;
2778
2779 SLIST_INIT(&free);
2780 if (pmap_unwire_l3(pmap, va, mpte, &free))
2781 vm_page_free_pages_toq(&free, true);
2782 }
2783
2784 void
pmap_pinit0(pmap_t pmap)2785 pmap_pinit0(pmap_t pmap)
2786 {
2787
2788 PMAP_LOCK_INIT(pmap);
2789 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2790 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2791 pmap->pm_l0 = PHYS_TO_DMAP(pmap->pm_l0_paddr);
2792 TAILQ_INIT(&pmap->pm_pvchunk);
2793 vm_radix_init(&pmap->pm_root);
2794 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2795 pmap->pm_stage = PM_STAGE1;
2796 pmap->pm_levels = 4;
2797 pmap->pm_ttbr = pmap->pm_l0_paddr;
2798 pmap->pm_asid_set = &asids;
2799 pmap->pm_bti = NULL;
2800
2801 PCPU_SET(curpmap, pmap);
2802 }
2803
2804 int
pmap_pinit_stage(pmap_t pmap,enum pmap_stage stage,int levels)2805 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2806 {
2807 vm_page_t m;
2808
2809 /*
2810 * allocate the l0 page
2811 */
2812 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2813 VM_ALLOC_ZERO);
2814 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2815 pmap->pm_l0 = PHYS_TO_DMAP(pmap->pm_l0_paddr);
2816
2817 TAILQ_INIT(&pmap->pm_pvchunk);
2818 vm_radix_init(&pmap->pm_root);
2819 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2820 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2821
2822 MPASS(levels == 3 || levels == 4);
2823 pmap->pm_levels = levels;
2824 pmap->pm_stage = stage;
2825 pmap->pm_bti = NULL;
2826 switch (stage) {
2827 case PM_STAGE1:
2828 pmap->pm_asid_set = &asids;
2829 if (pmap_bti_support) {
2830 pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
2831 M_ZERO | M_WAITOK);
2832 rangeset_init(pmap->pm_bti, bti_dup_range,
2833 bti_free_range, pmap, M_NOWAIT);
2834 }
2835 break;
2836 case PM_STAGE2:
2837 pmap->pm_asid_set = &vmids;
2838 break;
2839 default:
2840 panic("%s: Invalid pmap type %d", __func__, stage);
2841 break;
2842 }
2843
2844 /* XXX Temporarily disable deferred ASID allocation. */
2845 pmap_alloc_asid(pmap);
2846
2847 /*
2848 * Allocate the level 1 entry to use as the root. This will increase
2849 * the refcount on the level 1 page so it won't be removed until
2850 * pmap_release() is called.
2851 */
2852 if (pmap->pm_levels == 3) {
2853 PMAP_LOCK(pmap);
2854 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2855 PMAP_UNLOCK(pmap);
2856 }
2857 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2858
2859 return (1);
2860 }
2861
2862 int
pmap_pinit(pmap_t pmap)2863 pmap_pinit(pmap_t pmap)
2864 {
2865
2866 return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2867 }
2868
2869 /*
2870 * This routine is called if the desired page table page does not exist.
2871 *
2872 * If page table page allocation fails, this routine may sleep before
2873 * returning NULL. It sleeps only if a lock pointer was given.
2874 *
2875 * Note: If a page allocation fails at page table level two or three,
2876 * one or two pages may be held during the wait, only to be released
2877 * afterwards. This conservative approach is easily argued to avoid
2878 * race conditions.
2879 */
2880 static vm_page_t
_pmap_alloc_l3(pmap_t pmap,vm_pindex_t ptepindex,struct rwlock ** lockp)2881 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2882 {
2883 vm_page_t m, l1pg, l2pg;
2884
2885 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2886
2887 /*
2888 * Allocate a page table page.
2889 */
2890 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2891 if (lockp != NULL) {
2892 RELEASE_PV_LIST_LOCK(lockp);
2893 PMAP_UNLOCK(pmap);
2894 vm_wait(NULL);
2895 PMAP_LOCK(pmap);
2896 }
2897
2898 /*
2899 * Indicate the need to retry. While waiting, the page table
2900 * page may have been allocated.
2901 */
2902 return (NULL);
2903 }
2904 m->pindex = ptepindex;
2905
2906 /*
2907 * Because of AArch64's weak memory consistency model, we must have a
2908 * barrier here to ensure that the stores for zeroing "m", whether by
2909 * pmap_zero_page() or an earlier function, are visible before adding
2910 * "m" to the page table. Otherwise, a page table walk by another
2911 * processor's MMU could see the mapping to "m" and a stale, non-zero
2912 * PTE within "m".
2913 */
2914 dmb(ishst);
2915
2916 /*
2917 * Map the pagetable page into the process address space, if
2918 * it isn't already there.
2919 */
2920
2921 if (ptepindex >= (NUL2E + NUL1E)) {
2922 pd_entry_t *l0p, l0e;
2923 vm_pindex_t l0index;
2924
2925 l0index = ptepindex - (NUL2E + NUL1E);
2926 l0p = &pmap->pm_l0[l0index];
2927 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2928 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2929 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2930
2931 /*
2932 * Mark all kernel memory as not accessible from userspace
2933 * and userspace memory as not executable from the kernel.
2934 * This has been done for the bootstrap L0 entries in
2935 * locore.S.
2936 */
2937 if (pmap == kernel_pmap)
2938 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2939 else
2940 l0e |= TATTR_PXN_TABLE;
2941 pmap_store(l0p, l0e);
2942 } else if (ptepindex >= NUL2E) {
2943 vm_pindex_t l0index, l1index;
2944 pd_entry_t *l0, *l1;
2945 pd_entry_t tl0;
2946
2947 l1index = ptepindex - NUL2E;
2948 l0index = l1index >> Ln_ENTRIES_SHIFT;
2949
2950 l0 = &pmap->pm_l0[l0index];
2951 tl0 = pmap_load(l0);
2952 if (tl0 == 0) {
2953 /* recurse for allocating page dir */
2954 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2955 lockp) == NULL) {
2956 vm_page_unwire_noq(m);
2957 vm_page_free_zero(m);
2958 return (NULL);
2959 }
2960 } else {
2961 l1pg = PTE_TO_VM_PAGE(tl0);
2962 l1pg->ref_count++;
2963 }
2964
2965 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2966 l1 = &l1[ptepindex & Ln_ADDR_MASK];
2967 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2968 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2969 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2970 } else {
2971 vm_pindex_t l0index, l1index;
2972 pd_entry_t *l0, *l1, *l2;
2973 pd_entry_t tl0, tl1;
2974
2975 l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2976 l0index = l1index >> Ln_ENTRIES_SHIFT;
2977
2978 l0 = &pmap->pm_l0[l0index];
2979 tl0 = pmap_load(l0);
2980 if (tl0 == 0) {
2981 /* recurse for allocating page dir */
2982 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2983 lockp) == NULL) {
2984 vm_page_unwire_noq(m);
2985 vm_page_free_zero(m);
2986 return (NULL);
2987 }
2988 tl0 = pmap_load(l0);
2989 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2990 l1 = &l1[l1index & Ln_ADDR_MASK];
2991 } else {
2992 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2993 l1 = &l1[l1index & Ln_ADDR_MASK];
2994 tl1 = pmap_load(l1);
2995 if (tl1 == 0) {
2996 /* recurse for allocating page dir */
2997 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2998 lockp) == NULL) {
2999 vm_page_unwire_noq(m);
3000 vm_page_free_zero(m);
3001 return (NULL);
3002 }
3003 } else {
3004 l2pg = PTE_TO_VM_PAGE(tl1);
3005 l2pg->ref_count++;
3006 }
3007 }
3008
3009 l2 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
3010 l2 = &l2[ptepindex & Ln_ADDR_MASK];
3011 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
3012 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
3013 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
3014 }
3015
3016 pmap_resident_count_inc(pmap, 1);
3017
3018 return (m);
3019 }
3020
3021 static pd_entry_t *
pmap_alloc_l2(pmap_t pmap,vm_offset_t va,vm_page_t * l2pgp,struct rwlock ** lockp)3022 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
3023 struct rwlock **lockp)
3024 {
3025 pd_entry_t *l1, *l2;
3026 vm_page_t l2pg;
3027 vm_pindex_t l2pindex;
3028
3029 KASSERT(ADDR_IS_CANONICAL(va),
3030 ("%s: Address not in canonical form: %lx", __func__, va));
3031
3032 retry:
3033 l1 = pmap_l1(pmap, va);
3034 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
3035 l2 = pmap_l1_to_l2(l1, va);
3036 if (ADDR_IS_USER(va)) {
3037 /* Add a reference to the L2 page. */
3038 l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
3039 l2pg->ref_count++;
3040 } else
3041 l2pg = NULL;
3042 } else if (ADDR_IS_USER(va)) {
3043 /* Allocate a L2 page. */
3044 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
3045 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
3046 if (l2pg == NULL) {
3047 if (lockp != NULL)
3048 goto retry;
3049 else
3050 return (NULL);
3051 }
3052 l2 = VM_PAGE_TO_DMAP(l2pg);
3053 l2 = &l2[pmap_l2_index(va)];
3054 } else
3055 panic("pmap_alloc_l2: missing page table page for va %#lx",
3056 va);
3057 *l2pgp = l2pg;
3058 return (l2);
3059 }
3060
3061 static vm_page_t
pmap_alloc_l3(pmap_t pmap,vm_offset_t va,struct rwlock ** lockp)3062 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
3063 {
3064 vm_pindex_t ptepindex;
3065 pd_entry_t *pde, tpde;
3066 #ifdef INVARIANTS
3067 pt_entry_t *pte;
3068 #endif
3069 vm_page_t m;
3070 int lvl;
3071
3072 /*
3073 * Calculate pagetable page index
3074 */
3075 ptepindex = pmap_l2_pindex(va);
3076 retry:
3077 /*
3078 * Get the page directory entry
3079 */
3080 pde = pmap_pde(pmap, va, &lvl);
3081
3082 /*
3083 * If the page table page is mapped, we just increment the hold count,
3084 * and activate it. If we get a level 2 pde it will point to a level 3
3085 * table.
3086 */
3087 switch (lvl) {
3088 case -1:
3089 break;
3090 case 0:
3091 #ifdef INVARIANTS
3092 pte = pmap_l0_to_l1(pde, va);
3093 KASSERT(pmap_load(pte) == 0,
3094 ("pmap_alloc_l3: TODO: l0 superpages"));
3095 #endif
3096 break;
3097 case 1:
3098 #ifdef INVARIANTS
3099 pte = pmap_l1_to_l2(pde, va);
3100 KASSERT(pmap_load(pte) == 0,
3101 ("pmap_alloc_l3: TODO: l1 superpages"));
3102 #endif
3103 break;
3104 case 2:
3105 tpde = pmap_load(pde);
3106 if (tpde != 0) {
3107 m = PTE_TO_VM_PAGE(tpde);
3108 m->ref_count++;
3109 return (m);
3110 }
3111 break;
3112 default:
3113 panic("pmap_alloc_l3: Invalid level %d", lvl);
3114 }
3115
3116 /*
3117 * Here if the pte page isn't mapped, or if it has been deallocated.
3118 */
3119 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
3120 if (m == NULL && lockp != NULL)
3121 goto retry;
3122
3123 return (m);
3124 }
3125
3126 /***************************************************
3127 * Pmap allocation/deallocation routines.
3128 ***************************************************/
3129
3130 /*
3131 * Release any resources held by the given physical map.
3132 * Called when a pmap initialized by pmap_pinit is being released.
3133 * Should only be called if the map contains no valid mappings.
3134 */
3135 void
pmap_release(pmap_t pmap)3136 pmap_release(pmap_t pmap)
3137 {
3138 bool rv __diagused;
3139 struct spglist freelist;
3140 struct asid_set *set;
3141 vm_page_t m;
3142 int asid;
3143
3144 if (pmap->pm_levels != 4) {
3145 PMAP_ASSERT_STAGE2(pmap);
3146 KASSERT(pmap->pm_stats.resident_count == 1,
3147 ("pmap_release: pmap resident count %ld != 0",
3148 pmap->pm_stats.resident_count));
3149 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
3150 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
3151
3152 SLIST_INIT(&freelist);
3153 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
3154 PMAP_LOCK(pmap);
3155 rv = pmap_unwire_l3(pmap, 0, m, &freelist);
3156 PMAP_UNLOCK(pmap);
3157 MPASS(rv == true);
3158 vm_page_free_pages_toq(&freelist, true);
3159 }
3160
3161 KASSERT(pmap->pm_stats.resident_count == 0,
3162 ("pmap_release: pmap resident count %ld != 0",
3163 pmap->pm_stats.resident_count));
3164 KASSERT(vm_radix_is_empty(&pmap->pm_root),
3165 ("pmap_release: pmap has reserved page table page(s)"));
3166
3167 set = pmap->pm_asid_set;
3168 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
3169
3170 /*
3171 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate
3172 * the entries when removing them so rely on a later tlb invalidation.
3173 * this will happen when updating the VMID generation. Because of this
3174 * we don't reuse VMIDs within a generation.
3175 */
3176 if (pmap->pm_stage == PM_STAGE1) {
3177 mtx_lock_spin(&set->asid_set_mutex);
3178 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
3179 asid = COOKIE_TO_ASID(pmap->pm_cookie);
3180 KASSERT(asid >= ASID_FIRST_AVAILABLE &&
3181 asid < set->asid_set_size,
3182 ("pmap_release: pmap cookie has out-of-range asid"));
3183 bit_clear(set->asid_set, asid);
3184 }
3185 mtx_unlock_spin(&set->asid_set_mutex);
3186
3187 if (pmap->pm_bti != NULL) {
3188 rangeset_fini(pmap->pm_bti);
3189 free(pmap->pm_bti, M_DEVBUF);
3190 }
3191 }
3192
3193 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
3194 vm_page_unwire_noq(m);
3195 vm_page_free_zero(m);
3196 }
3197
3198 static int
kvm_size(SYSCTL_HANDLER_ARGS)3199 kvm_size(SYSCTL_HANDLER_ARGS)
3200 {
3201 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
3202
3203 return sysctl_handle_long(oidp, &ksize, 0, req);
3204 }
3205 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3206 0, 0, kvm_size, "LU",
3207 "Size of KVM");
3208
3209 static int
kvm_free(SYSCTL_HANDLER_ARGS)3210 kvm_free(SYSCTL_HANDLER_ARGS)
3211 {
3212 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
3213
3214 return sysctl_handle_long(oidp, &kfree, 0, req);
3215 }
3216 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3217 0, 0, kvm_free, "LU",
3218 "Amount of KVM free");
3219
3220 /*
3221 * grow the number of kernel page table entries, if needed
3222 */
3223 static int
pmap_growkernel_nopanic(vm_offset_t addr)3224 pmap_growkernel_nopanic(vm_offset_t addr)
3225 {
3226 vm_page_t nkpg;
3227 pd_entry_t *l0, *l1, *l2;
3228
3229 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
3230
3231 addr = roundup2(addr, L2_SIZE);
3232 if (addr - 1 >= vm_map_max(kernel_map))
3233 addr = vm_map_max(kernel_map);
3234 if (kernel_vm_end < addr) {
3235 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
3236 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
3237 }
3238 while (kernel_vm_end < addr) {
3239 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
3240 KASSERT(pmap_load(l0) != 0,
3241 ("pmap_growkernel: No level 0 kernel entry"));
3242
3243 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
3244 if (pmap_load(l1) == 0) {
3245 /* We need a new PDP entry */
3246 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3247 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3248 if (nkpg == NULL)
3249 return (KERN_RESOURCE_SHORTAGE);
3250 nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
3251 /* See the dmb() in _pmap_alloc_l3(). */
3252 dmb(ishst);
3253 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
3254 continue; /* try again */
3255 }
3256 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
3257 if (pmap_load(l2) != 0) {
3258 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3259 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3260 kernel_vm_end = vm_map_max(kernel_map);
3261 break;
3262 }
3263 continue;
3264 }
3265
3266 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3267 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3268 if (nkpg == NULL)
3269 return (KERN_RESOURCE_SHORTAGE);
3270 nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
3271 /* See the dmb() in _pmap_alloc_l3(). */
3272 dmb(ishst);
3273 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
3274
3275 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3276 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3277 kernel_vm_end = vm_map_max(kernel_map);
3278 break;
3279 }
3280 }
3281 return (KERN_SUCCESS);
3282 }
3283
3284 int
pmap_growkernel(vm_offset_t addr)3285 pmap_growkernel(vm_offset_t addr)
3286 {
3287 int rv;
3288
3289 rv = pmap_growkernel_nopanic(addr);
3290 if (rv != KERN_SUCCESS && pmap_growkernel_panic)
3291 panic("pmap_growkernel: no memory to grow kernel");
3292 return (rv);
3293 }
3294
3295 /***************************************************
3296 * page management routines.
3297 ***************************************************/
3298
3299 static const uint64_t pc_freemask[_NPCM] = {
3300 [0 ... _NPCM - 2] = PC_FREEN,
3301 [_NPCM - 1] = PC_FREEL
3302 };
3303
#ifdef PV_STATS
/* PV chunk counters, exported read-only under the vm.pmap sysctl node. */
static int pc_chunk_count;
static int pc_chunk_allocs;
static int pc_chunk_frees;
static int pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD,
    &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD,
    &pc_chunk_allocs, 0,
    "Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD,
    &pc_chunk_frees, 0,
    "Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD,
    &pc_chunk_tryfail, 0,
    "Number of times tried to get a chunk page but failed.");

/* PV entry counters, exported read-only under the vm.pmap sysctl node. */
static long pv_entry_frees;
static long pv_entry_allocs;
static long pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD,
    &pv_entry_frees, 0,
    "Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD,
    &pv_entry_allocs, 0,
    "Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD,
    &pv_entry_count, 0,
    "Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD,
    &pv_entry_spare, 0,
    "Current number of spare pv entries");
#endif
3328
3329 /*
3330 * We are in a serious low memory condition. Resort to
3331 * drastic measures to free some pages so we can allocate
3332 * another pv entry chunk.
3333 *
3334 * Returns NULL if PV entries were reclaimed from the specified pmap.
3335 *
3336 * We do not, however, unmap 2mpages because subsequent accesses will
3337 * allocate per-page pv entries until repromotion occurs, thereby
3338 * exacerbating the shortage of free pv entries.
3339 */
3340 static vm_page_t
reclaim_pv_chunk_domain(pmap_t locked_pmap,struct rwlock ** lockp,int domain)3341 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3342 {
3343 struct pv_chunks_list *pvc;
3344 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3345 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3346 struct md_page *pvh;
3347 pd_entry_t *pde;
3348 pmap_t next_pmap, pmap;
3349 pt_entry_t *pte, tpte;
3350 pv_entry_t pv;
3351 vm_offset_t va;
3352 vm_page_t m, m_pc;
3353 struct spglist free;
3354 uint64_t inuse;
3355 int bit, field, freed, lvl;
3356
3357 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3358 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3359
3360 pmap = NULL;
3361 m_pc = NULL;
3362 SLIST_INIT(&free);
3363 bzero(&pc_marker_b, sizeof(pc_marker_b));
3364 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3365 pc_marker = (struct pv_chunk *)&pc_marker_b;
3366 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3367
3368 pvc = &pv_chunks[domain];
3369 mtx_lock(&pvc->pvc_lock);
3370 pvc->active_reclaims++;
3371 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3372 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3373 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3374 SLIST_EMPTY(&free)) {
3375 next_pmap = pc->pc_pmap;
3376 if (next_pmap == NULL) {
3377 /*
3378 * The next chunk is a marker. However, it is
3379 * not our marker, so active_reclaims must be
3380 * > 1. Consequently, the next_chunk code
3381 * will not rotate the pv_chunks list.
3382 */
3383 goto next_chunk;
3384 }
3385 mtx_unlock(&pvc->pvc_lock);
3386
3387 /*
3388 * A pv_chunk can only be removed from the pc_lru list
3389 * when both pvc->pvc_lock is owned and the
3390 * corresponding pmap is locked.
3391 */
3392 if (pmap != next_pmap) {
3393 if (pmap != NULL && pmap != locked_pmap)
3394 PMAP_UNLOCK(pmap);
3395 pmap = next_pmap;
3396 /* Avoid deadlock and lock recursion. */
3397 if (pmap > locked_pmap) {
3398 RELEASE_PV_LIST_LOCK(lockp);
3399 PMAP_LOCK(pmap);
3400 mtx_lock(&pvc->pvc_lock);
3401 continue;
3402 } else if (pmap != locked_pmap) {
3403 if (PMAP_TRYLOCK(pmap)) {
3404 mtx_lock(&pvc->pvc_lock);
3405 continue;
3406 } else {
3407 pmap = NULL; /* pmap is not locked */
3408 mtx_lock(&pvc->pvc_lock);
3409 pc = TAILQ_NEXT(pc_marker, pc_lru);
3410 if (pc == NULL ||
3411 pc->pc_pmap != next_pmap)
3412 continue;
3413 goto next_chunk;
3414 }
3415 }
3416 }
3417
3418 /*
3419 * Destroy every non-wired, 4 KB page mapping in the chunk.
3420 */
3421 freed = 0;
3422 for (field = 0; field < _NPCM; field++) {
3423 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3424 inuse != 0; inuse &= ~(1UL << bit)) {
3425 bit = ffsl(inuse) - 1;
3426 pv = &pc->pc_pventry[field * 64 + bit];
3427 va = pv->pv_va;
3428 pde = pmap_pde(pmap, va, &lvl);
3429 if (lvl != 2)
3430 continue;
3431 pte = pmap_l2_to_l3(pde, va);
3432 tpte = pmap_load(pte);
3433 if ((tpte & ATTR_SW_WIRED) != 0)
3434 continue;
3435 if ((tpte & ATTR_CONTIGUOUS) != 0)
3436 (void)pmap_demote_l3c(pmap, pte, va);
3437 tpte = pmap_load_clear(pte);
3438 m = PTE_TO_VM_PAGE(tpte);
3439 if (pmap_pte_dirty(pmap, tpte))
3440 vm_page_dirty(m);
3441 if ((tpte & ATTR_AF) != 0) {
3442 pmap_s1_invalidate_page(pmap, va, true);
3443 vm_page_aflag_set(m, PGA_REFERENCED);
3444 }
3445 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3446 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3447 m->md.pv_gen++;
3448 if (TAILQ_EMPTY(&m->md.pv_list) &&
3449 (m->flags & PG_FICTITIOUS) == 0) {
3450 pvh = page_to_pvh(m);
3451 if (TAILQ_EMPTY(&pvh->pv_list)) {
3452 vm_page_aflag_clear(m,
3453 PGA_WRITEABLE);
3454 }
3455 }
3456 pc->pc_map[field] |= 1UL << bit;
3457 pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3458 freed++;
3459 }
3460 }
3461 if (freed == 0) {
3462 mtx_lock(&pvc->pvc_lock);
3463 goto next_chunk;
3464 }
3465 /* Every freed mapping is for a 4 KB page. */
3466 pmap_resident_count_dec(pmap, freed);
3467 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3468 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3469 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3470 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3471 if (pc_is_free(pc)) {
3472 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3473 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3474 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3475 /* Entire chunk is free; return it. */
3476 m_pc = DMAP_TO_VM_PAGE(pc);
3477 dump_drop_page(m_pc->phys_addr);
3478 mtx_lock(&pvc->pvc_lock);
3479 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3480 break;
3481 }
3482 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3483 mtx_lock(&pvc->pvc_lock);
3484 /* One freed pv entry in locked_pmap is sufficient. */
3485 if (pmap == locked_pmap)
3486 break;
3487
3488 next_chunk:
3489 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3490 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3491 if (pvc->active_reclaims == 1 && pmap != NULL) {
3492 /*
3493 * Rotate the pv chunks list so that we do not
3494 * scan the same pv chunks that could not be
3495 * freed (because they contained a wired
3496 * and/or superpage mapping) on every
3497 * invocation of reclaim_pv_chunk().
3498 */
3499 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3500 MPASS(pc->pc_pmap != NULL);
3501 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3502 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3503 }
3504 }
3505 }
3506 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3507 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3508 pvc->active_reclaims--;
3509 mtx_unlock(&pvc->pvc_lock);
3510 if (pmap != NULL && pmap != locked_pmap)
3511 PMAP_UNLOCK(pmap);
3512 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3513 m_pc = SLIST_FIRST(&free);
3514 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3515 /* Recycle a freed page table page. */
3516 m_pc->ref_count = 1;
3517 }
3518 vm_page_free_pages_toq(&free, true);
3519 return (m_pc);
3520 }
3521
3522 static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap,struct rwlock ** lockp)3523 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3524 {
3525 vm_page_t m;
3526 int i, domain;
3527
3528 domain = PCPU_GET(domain);
3529 for (i = 0; i < vm_ndomains; i++) {
3530 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3531 if (m != NULL)
3532 break;
3533 domain = (domain + 1) % vm_ndomains;
3534 }
3535
3536 return (m);
3537 }
3538
3539 /*
3540 * free the pv_entry back to the free list
3541 */
3542 static void
free_pv_entry(pmap_t pmap,pv_entry_t pv)3543 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3544 {
3545 struct pv_chunk *pc;
3546 int idx, field, bit;
3547
3548 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3549 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3550 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3551 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3552 pc = pv_to_chunk(pv);
3553 idx = pv - &pc->pc_pventry[0];
3554 field = idx / 64;
3555 bit = idx % 64;
3556 pc->pc_map[field] |= 1ul << bit;
3557 if (!pc_is_free(pc)) {
3558 /* 98% of the time, pc is already at the head of the list. */
3559 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3560 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3561 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3562 }
3563 return;
3564 }
3565 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3566 free_pv_chunk(pc);
3567 }
3568
3569 static void
free_pv_chunk_dequeued(struct pv_chunk * pc)3570 free_pv_chunk_dequeued(struct pv_chunk *pc)
3571 {
3572 vm_page_t m;
3573
3574 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3575 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3576 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3577 /* entire chunk is free, return it */
3578 m = DMAP_TO_VM_PAGE(pc);
3579 dump_drop_page(m->phys_addr);
3580 vm_page_unwire_noq(m);
3581 vm_page_free(m);
3582 }
3583
3584 static void
free_pv_chunk(struct pv_chunk * pc)3585 free_pv_chunk(struct pv_chunk *pc)
3586 {
3587 struct pv_chunks_list *pvc;
3588
3589 pvc = &pv_chunks[pc_to_domain(pc)];
3590 mtx_lock(&pvc->pvc_lock);
3591 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3592 mtx_unlock(&pvc->pvc_lock);
3593 free_pv_chunk_dequeued(pc);
3594 }
3595
3596 static void
free_pv_chunk_batch(struct pv_chunklist * batch)3597 free_pv_chunk_batch(struct pv_chunklist *batch)
3598 {
3599 struct pv_chunks_list *pvc;
3600 struct pv_chunk *pc, *npc;
3601 int i;
3602
3603 for (i = 0; i < vm_ndomains; i++) {
3604 if (TAILQ_EMPTY(&batch[i]))
3605 continue;
3606 pvc = &pv_chunks[i];
3607 mtx_lock(&pvc->pvc_lock);
3608 TAILQ_FOREACH(pc, &batch[i], pc_list) {
3609 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3610 }
3611 mtx_unlock(&pvc->pvc_lock);
3612 }
3613
3614 for (i = 0; i < vm_ndomains; i++) {
3615 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3616 free_pv_chunk_dequeued(pc);
3617 }
3618 }
3619 }
3620
3621 /*
3622 * Returns a new PV entry, allocating a new PV chunk from the system when
3623 * needed. If this PV chunk allocation fails and a PV list lock pointer was
3624 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
3625 * returned.
3626 *
3627 * The given PV list lock may be released.
3628 */
3629 static pv_entry_t
get_pv_entry(pmap_t pmap,struct rwlock ** lockp)3630 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3631 {
3632 struct pv_chunks_list *pvc;
3633 int bit, field;
3634 pv_entry_t pv;
3635 struct pv_chunk *pc;
3636 vm_page_t m;
3637
3638 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3639 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3640 retry:
3641 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3642 if (pc != NULL) {
3643 for (field = 0; field < _NPCM; field++) {
3644 if (pc->pc_map[field]) {
3645 bit = ffsl(pc->pc_map[field]) - 1;
3646 break;
3647 }
3648 }
3649 if (field < _NPCM) {
3650 pv = &pc->pc_pventry[field * 64 + bit];
3651 pc->pc_map[field] &= ~(1ul << bit);
3652 /* If this was the last item, move it to tail */
3653 if (pc_is_full(pc)) {
3654 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3655 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3656 pc_list);
3657 }
3658 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3659 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3660 return (pv);
3661 }
3662 }
3663 /* No free items, allocate another chunk */
3664 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3665 if (m == NULL) {
3666 if (lockp == NULL) {
3667 PV_STAT(pc_chunk_tryfail++);
3668 return (NULL);
3669 }
3670 m = reclaim_pv_chunk(pmap, lockp);
3671 if (m == NULL)
3672 goto retry;
3673 }
3674 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3675 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3676 dump_add_page(m->phys_addr);
3677 pc = PHYS_TO_DMAP(m->phys_addr);
3678 pc->pc_pmap = pmap;
3679 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3680 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */
3681 pvc = &pv_chunks[vm_page_domain(m)];
3682 mtx_lock(&pvc->pvc_lock);
3683 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3684 mtx_unlock(&pvc->pvc_lock);
3685 pv = &pc->pc_pventry[0];
3686 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3687 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3688 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3689 return (pv);
3690 }
3691
3692 /*
3693 * Ensure that the number of spare PV entries in the specified pmap meets or
3694 * exceeds the given count, "needed".
3695 *
3696 * The given PV list lock may be released.
3697 */
3698 static void
reserve_pv_entries(pmap_t pmap,int needed,struct rwlock ** lockp)3699 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3700 {
3701 struct pv_chunks_list *pvc;
3702 struct pch new_tail[PMAP_MEMDOM];
3703 struct pv_chunk *pc;
3704 vm_page_t m;
3705 int avail, free, i;
3706 bool reclaimed;
3707
3708 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3709 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3710
3711 /*
3712 * Newly allocated PV chunks must be stored in a private list until
3713 * the required number of PV chunks have been allocated. Otherwise,
3714 * reclaim_pv_chunk() could recycle one of these chunks. In
3715 * contrast, these chunks must be added to the pmap upon allocation.
3716 */
3717 for (i = 0; i < PMAP_MEMDOM; i++)
3718 TAILQ_INIT(&new_tail[i]);
3719 retry:
3720 avail = 0;
3721 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3722 bit_count((bitstr_t *)pc->pc_map, 0,
3723 sizeof(pc->pc_map) * NBBY, &free);
3724 if (free == 0)
3725 break;
3726 avail += free;
3727 if (avail >= needed)
3728 break;
3729 }
3730 for (reclaimed = false; avail < needed; avail += _NPCPV) {
3731 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3732 if (m == NULL) {
3733 m = reclaim_pv_chunk(pmap, lockp);
3734 if (m == NULL)
3735 goto retry;
3736 reclaimed = true;
3737 }
3738 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3739 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3740 dump_add_page(m->phys_addr);
3741 pc = PHYS_TO_DMAP(m->phys_addr);
3742 pc->pc_pmap = pmap;
3743 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3744 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3745 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3746 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3747
3748 /*
3749 * The reclaim might have freed a chunk from the current pmap.
3750 * If that chunk contained available entries, we need to
3751 * re-count the number of available entries.
3752 */
3753 if (reclaimed)
3754 goto retry;
3755 }
3756 for (i = 0; i < vm_ndomains; i++) {
3757 if (TAILQ_EMPTY(&new_tail[i]))
3758 continue;
3759 pvc = &pv_chunks[i];
3760 mtx_lock(&pvc->pvc_lock);
3761 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3762 mtx_unlock(&pvc->pvc_lock);
3763 }
3764 }
3765
3766 /*
3767 * First find and then remove the pv entry for the specified pmap and virtual
3768 * address from the specified pv list. Returns the pv entry if found and NULL
3769 * otherwise. This operation can be performed on pv lists for either 4KB or
3770 * 2MB page mappings.
3771 */
3772 static __inline pv_entry_t
pmap_pvh_remove(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3773 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3774 {
3775 pv_entry_t pv;
3776
3777 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3778 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3779 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3780 pvh->pv_gen++;
3781 break;
3782 }
3783 }
3784 return (pv);
3785 }
3786
3787 /*
3788 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3789 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3790 * entries for each of the 4KB page mappings.
3791 */
3792 static void
pmap_pv_demote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)3793 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3794 struct rwlock **lockp)
3795 {
3796 struct md_page *pvh;
3797 struct pv_chunk *pc;
3798 pv_entry_t pv;
3799 vm_offset_t va_last;
3800 vm_page_t m;
3801 int bit, field;
3802
3803 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3804 KASSERT((va & L2_OFFSET) == 0,
3805 ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3806 KASSERT((pa & L2_OFFSET) == 0,
3807 ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3808 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3809
3810 /*
3811 * Transfer the 2mpage's pv entry for this mapping to the first
3812 * page's pv list. Once this transfer begins, the pv list lock
3813 * must not be released until the last pv entry is reinstantiated.
3814 */
3815 pvh = pa_to_pvh(pa);
3816 pv = pmap_pvh_remove(pvh, pmap, va);
3817 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3818 m = PHYS_TO_VM_PAGE(pa);
3819 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3820 m->md.pv_gen++;
3821 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3822 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3823 va_last = va + L2_SIZE - PAGE_SIZE;
3824 for (;;) {
3825 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3826 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3827 for (field = 0; field < _NPCM; field++) {
3828 while (pc->pc_map[field]) {
3829 bit = ffsl(pc->pc_map[field]) - 1;
3830 pc->pc_map[field] &= ~(1ul << bit);
3831 pv = &pc->pc_pventry[field * 64 + bit];
3832 va += PAGE_SIZE;
3833 pv->pv_va = va;
3834 m++;
3835 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3836 ("pmap_pv_demote_l2: page %p is not managed", m));
3837 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3838 m->md.pv_gen++;
3839 if (va == va_last)
3840 goto out;
3841 }
3842 }
3843 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3844 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3845 }
3846 out:
3847 if (pc_is_full(pc)) {
3848 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3849 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3850 }
3851 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3852 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3853 }
3854
3855 /*
3856 * First find and then destroy the pv entry for the specified pmap and virtual
3857 * address. This operation can be performed on pv lists for either 4KB or 2MB
3858 * page mappings.
3859 */
3860 static void
pmap_pvh_free(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3861 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3862 {
3863 pv_entry_t pv;
3864
3865 pv = pmap_pvh_remove(pvh, pmap, va);
3866 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3867 free_pv_entry(pmap, pv);
3868 }
3869
3870 /*
3871 * Conditionally create the PV entry for a 4KB page mapping if the required
3872 * memory can be allocated without resorting to reclamation.
3873 */
3874 static bool
pmap_try_insert_pv_entry(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)3875 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3876 struct rwlock **lockp)
3877 {
3878 pv_entry_t pv;
3879
3880 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3881 /* Pass NULL instead of the lock pointer to disable reclamation. */
3882 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3883 pv->pv_va = va;
3884 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3885 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3886 m->md.pv_gen++;
3887 return (true);
3888 } else
3889 return (false);
3890 }
3891
3892 /*
3893 * Create the PV entry for a 2MB page mapping. Always returns true unless the
3894 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
3895 * false if the PV entry cannot be allocated without resorting to reclamation.
3896 */
3897 static bool
pmap_pv_insert_l2(pmap_t pmap,vm_offset_t va,pd_entry_t l2e,u_int flags,struct rwlock ** lockp)3898 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3899 struct rwlock **lockp)
3900 {
3901 struct md_page *pvh;
3902 pv_entry_t pv;
3903 vm_paddr_t pa;
3904
3905 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3906 /* Pass NULL instead of the lock pointer to disable reclamation. */
3907 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3908 NULL : lockp)) == NULL)
3909 return (false);
3910 pv->pv_va = va;
3911 pa = PTE_TO_PHYS(l2e);
3912 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3913 pvh = pa_to_pvh(pa);
3914 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3915 pvh->pv_gen++;
3916 return (true);
3917 }
3918
3919 /*
3920 * Conditionally creates the PV entries for a L3C superpage mapping if
3921 * the required memory can be allocated without resorting to reclamation.
3922 */
3923 static bool
pmap_pv_insert_l3c(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)3924 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3925 struct rwlock **lockp)
3926 {
3927 pv_entry_t pv;
3928 vm_offset_t tva;
3929 vm_paddr_t pa __diagused;
3930 vm_page_t mt;
3931
3932 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3933 KASSERT((va & L3C_OFFSET) == 0,
3934 ("pmap_pv_insert_l3c: va is not aligned"));
3935 pa = VM_PAGE_TO_PHYS(m);
3936 KASSERT((pa & L3C_OFFSET) == 0,
3937 ("pmap_pv_insert_l3c: pa is not aligned"));
3938 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3939 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3940 /* Pass NULL instead of lockp to disable reclamation. */
3941 pv = get_pv_entry(pmap, NULL);
3942 if (__predict_false(pv == NULL)) {
3943 while (tva > va) {
3944 mt--;
3945 tva -= L3_SIZE;
3946 pmap_pvh_free(&mt->md, pmap, tva);
3947 }
3948 return (false);
3949 }
3950 pv->pv_va = tva;
3951 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3952 mt->md.pv_gen++;
3953 }
3954 return (true);
3955 }
3956
3957 static void
pmap_remove_kernel_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)3958 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3959 {
3960 pt_entry_t newl2, oldl2 __diagused;
3961 vm_page_t ml3;
3962 vm_paddr_t ml3pa;
3963
3964 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3965 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3966 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3967
3968 ml3 = pmap_remove_pt_page(pmap, va);
3969 KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page"));
3970
3971 ml3pa = VM_PAGE_TO_PHYS(ml3);
3972 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3973
3974 /*
3975 * If this page table page was unmapped by a promotion, then it
3976 * contains valid mappings. Zero it to invalidate those mappings.
3977 */
3978 if (vm_page_any_valid(ml3))
3979 pagezero(PHYS_TO_DMAP(ml3pa));
3980
3981 /*
3982 * Demote the mapping. The caller must have already invalidated the
3983 * mapping (i.e., the "break" in break-before-make).
3984 */
3985 oldl2 = pmap_load_store(l2, newl2);
3986 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3987 __func__, l2, oldl2));
3988 }
3989
3990 /*
3991 * pmap_remove_l2: Do the things to unmap a level 2 superpage.
3992 */
3993 static int
pmap_remove_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pd_entry_t l1e,bool demote_kl2e,struct spglist * free,struct rwlock ** lockp)3994 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e,
3995 bool demote_kl2e, struct spglist *free, struct rwlock **lockp)
3996 {
3997 struct md_page *pvh;
3998 pt_entry_t old_l2;
3999 vm_page_t m, ml3, mt;
4000
4001 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4002 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
4003 old_l2 = pmap_load_clear(l2);
4004 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4005 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
4006
4007 /*
4008 * Since a promotion must break the 4KB page mappings before making
4009 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4010 */
4011 pmap_s1_invalidate_page(pmap, sva, true);
4012
4013 if (old_l2 & ATTR_SW_WIRED)
4014 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
4015 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
4016 if (old_l2 & ATTR_SW_MANAGED) {
4017 m = PTE_TO_VM_PAGE(old_l2);
4018 pvh = page_to_pvh(m);
4019 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4020 pmap_pvh_free(pvh, pmap, sva);
4021 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
4022 if (pmap_pte_dirty(pmap, old_l2))
4023 vm_page_dirty(mt);
4024 if (old_l2 & ATTR_AF)
4025 vm_page_aflag_set(mt, PGA_REFERENCED);
4026 if (TAILQ_EMPTY(&mt->md.pv_list) &&
4027 TAILQ_EMPTY(&pvh->pv_list))
4028 vm_page_aflag_clear(mt, PGA_WRITEABLE);
4029 }
4030 }
4031 if (pmap != kernel_pmap) {
4032 ml3 = pmap_remove_pt_page(pmap, sva);
4033 if (ml3 != NULL) {
4034 KASSERT(vm_page_any_valid(ml3),
4035 ("pmap_remove_l2: l3 page not promoted"));
4036 pmap_resident_count_dec(pmap, 1);
4037 KASSERT(ml3->ref_count == NL3PG,
4038 ("pmap_remove_l2: l3 page ref count error"));
4039 ml3->ref_count = 0;
4040 pmap_add_delayed_free_list(ml3, free, false);
4041 }
4042 } else if (demote_kl2e) {
4043 pmap_remove_kernel_l2(pmap, l2, sva);
4044 } else {
4045 ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva));
4046 if (vm_page_any_valid(ml3)) {
4047 ml3->valid = 0;
4048 pmap_zero_page(ml3);
4049 }
4050 }
4051 return (pmap_unuse_pt(pmap, sva, l1e, free));
4052 }
4053
4054 /*
4055 * pmap_remove_l3: do the things to unmap a page in a process
4056 */
4057 static int
pmap_remove_l3(pmap_t pmap,pt_entry_t * l3,vm_offset_t va,pd_entry_t l2e,struct spglist * free,struct rwlock ** lockp)4058 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
4059 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
4060 {
4061 struct md_page *pvh;
4062 pt_entry_t old_l3;
4063 vm_page_t m;
4064
4065 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4066 old_l3 = pmap_load(l3);
4067 if ((old_l3 & ATTR_CONTIGUOUS) != 0)
4068 (void)pmap_demote_l3c(pmap, l3, va);
4069 old_l3 = pmap_load_clear(l3);
4070 pmap_s1_invalidate_page(pmap, va, true);
4071 if (old_l3 & ATTR_SW_WIRED)
4072 pmap->pm_stats.wired_count -= 1;
4073 pmap_resident_count_dec(pmap, 1);
4074 if (old_l3 & ATTR_SW_MANAGED) {
4075 m = PTE_TO_VM_PAGE(old_l3);
4076 if (pmap_pte_dirty(pmap, old_l3))
4077 vm_page_dirty(m);
4078 if (old_l3 & ATTR_AF)
4079 vm_page_aflag_set(m, PGA_REFERENCED);
4080 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4081 pmap_pvh_free(&m->md, pmap, va);
4082 if (TAILQ_EMPTY(&m->md.pv_list) &&
4083 (m->flags & PG_FICTITIOUS) == 0) {
4084 pvh = page_to_pvh(m);
4085 if (TAILQ_EMPTY(&pvh->pv_list))
4086 vm_page_aflag_clear(m, PGA_WRITEABLE);
4087 }
4088 }
4089 return (pmap_unuse_pt(pmap, va, l2e, free));
4090 }
4091
/*
 * Removes the specified L3C superpage mapping.  Requests TLB invalidations
 * to be performed by the caller through the returned "*vap".  Returns true
 * if the level 3 table "ml3" was unmapped and added to the spglist "free".
 * Otherwise, returns false.
 *
 * "l3p" points to the first of the L3C_ENTRIES contiguous L3 entries and
 * "va" is the corresponding (L3C-aligned) virtual address.
 *
 * "*vap" is the start of the caller's range of pending TLB invalidations;
 * the value "va_next" means that no invalidation is pending.  If no range is
 * pending on return from the removal, "*vap" is set to "va" so that the
 * caller invalidates this mapping.
 *
 * "ml3" may be NULL, in which case no page table page reference counting is
 * performed and false is returned.
 */
static bool
pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
    vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct rwlock *new_lock;
	pt_entry_t first_l3e, l3e, *tl3p;
	vm_offset_t tva;
	vm_page_t m, mt;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
	    0, ("pmap_remove_l3c: l3p is not aligned"));
	KASSERT((va & L3C_OFFSET) == 0,
	    ("pmap_remove_l3c: va is not aligned"));

	/*
	 * Hardware accessed and dirty bit maintenance might only update a
	 * single L3 entry, so we must combine the accessed and dirty bits
	 * from this entire set of contiguous L3 entries.
	 */
	first_l3e = pmap_load_clear(l3p);
	for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
		l3e = pmap_load_clear(tl3p);
		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
		    ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
		/*
		 * If this entry has ATTR_SW_DBM set and is read/write, fold
		 * that into "first_l3e" by making it read/write as well.
		 */
		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
			first_l3e &= ~ATTR_S1_AP_RW_BIT;
		first_l3e |= l3e & ATTR_AF;
	}
	if ((first_l3e & ATTR_SW_WIRED) != 0)
		pmap->pm_stats.wired_count -= L3C_ENTRIES;
	pmap_resident_count_dec(pmap, L3C_ENTRIES);
	if ((first_l3e & ATTR_SW_MANAGED) != 0) {
		m = PTE_TO_VM_PAGE(first_l3e);
		/* Switch to the PV list lock that covers "m", if different. */
		new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
		if (new_lock != *lockp) {
			if (*lockp != NULL) {
				/*
				 * Pending TLB invalidations must be
				 * performed before the PV list lock is
				 * released.  Otherwise, a concurrent
				 * pmap_remove_all() on a physical page
				 * could return while a stale TLB entry
				 * still provides access to that page.
				 */
				if (*vap != va_next) {
					pmap_invalidate_range(pmap, *vap, va,
					    true);
					*vap = va_next;
				}
				rw_wunlock(*lockp);
			}
			*lockp = new_lock;
			rw_wlock(*lockp);
		}
		pvh = page_to_pvh(m);
		/*
		 * Apply the combined dirty and accessed state to every page
		 * of the superpage and free each page's PV entry.
		 */
		for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
		    L3_SIZE) {
			if (pmap_pte_dirty(pmap, first_l3e))
				vm_page_dirty(mt);
			if ((first_l3e & ATTR_AF) != 0)
				vm_page_aflag_set(mt, PGA_REFERENCED);
			pmap_pvh_free(&mt->md, pmap, tva);
			if (TAILQ_EMPTY(&mt->md.pv_list) &&
			    TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(mt, PGA_WRITEABLE);
		}
	}
	/* Start a new pending invalidation range if none is open. */
	if (*vap == va_next)
		*vap = va;
	if (ml3 != NULL) {
		/* Drop one reference per removed L3 entry. */
		ml3->ref_count -= L3C_ENTRIES;
		if (ml3->ref_count == 0) {
			_pmap_unwire_l3(pmap, va, ml3, free);
			return (true);
		}
	}
	return (false);
}
4180
/*
 * Remove the specified range of addresses from the L3 page table that is
 * identified by the given L2 entry.
 *
 * The range [sva, eva) must lie within the address range covered by a single
 * L3 page table; this is asserted.  Page table pages that become unused are
 * added to "free".  "*lockp" is the currently held PV list lock, or NULL; it
 * is switched as needed and is left held for the caller to release.
 *
 * TLB invalidations are batched: the local variable "va" holds the start of
 * the current run of removed mappings that still needs to be invalidated,
 * and the value "eva" means that no run is pending.
 */
static void
pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
    vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	struct rwlock *new_lock;
	pt_entry_t *l3, old_l3;
	vm_offset_t va;
	vm_page_t l3pg, m;

	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Start address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
	    ("%s: End address not in canonical form: %lx", __func__, eva));

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
	/* Only user addresses get the L3 page table page unwired below. */
	l3pg = ADDR_IS_USER(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
	va = eva;
	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
		old_l3 = pmap_load(l3);
		if (!pmap_l3_valid(old_l3)) {
			/* An invalid entry ends the pending run; flush it. */
			if (va != eva) {
				pmap_invalidate_range(pmap, va, sva, true);
				va = eva;
			}
			continue;
		}
		if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
			/*
			 * Is this entire set of contiguous L3 entries being
			 * removed?  Handle the possibility that "eva" is zero
			 * because of address wraparound.
			 */
			if ((sva & L3C_OFFSET) == 0 &&
			    sva + L3C_OFFSET <= eva - 1) {
				if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
				    l3pg, free, lockp)) {
					/* The L3 table was unmapped. */
					sva += L3C_SIZE;
					break;
				}
				/*
				 * Advance to the last entry of the L3C page;
				 * the loop increment steps past it.
				 */
				l3 += L3C_ENTRIES - 1;
				sva += L3C_SIZE - L3_SIZE;
				continue;
			}

			/* Only part of the L3C page is removed: demote it. */
			(void)pmap_demote_l3c(pmap, l3, sva);
		}
		old_l3 = pmap_load_clear(l3);
		if ((old_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count--;
		pmap_resident_count_dec(pmap, 1);
		/* Below will only be true in a realm environment. */
		if (PTE_TO_PHYS(old_l3) & prot_ns_shared_pa)
			pmap_set_protected(old_l3);
		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
			m = PTE_TO_VM_PAGE(old_l3);
			if (pmap_pte_dirty(pmap, old_l3))
				vm_page_dirty(m);
			if ((old_l3 & ATTR_AF) != 0)
				vm_page_aflag_set(m, PGA_REFERENCED);
			/* Switch to the PV list lock covering "m". */
			new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
			if (new_lock != *lockp) {
				if (*lockp != NULL) {
					/*
					 * Pending TLB invalidations must be
					 * performed before the PV list lock is
					 * released.  Otherwise, a concurrent
					 * pmap_remove_all() on a physical page
					 * could return while a stale TLB entry
					 * still provides access to that page.
					 */
					if (va != eva) {
						pmap_invalidate_range(pmap, va,
						    sva, true);
						va = eva;
					}
					rw_wunlock(*lockp);
				}
				*lockp = new_lock;
				rw_wlock(*lockp);
			}
			pmap_pvh_free(&m->md, pmap, sva);
			if (TAILQ_EMPTY(&m->md.pv_list) &&
			    (m->flags & PG_FICTITIOUS) == 0) {
				pvh = page_to_pvh(m);
				if (TAILQ_EMPTY(&pvh->pv_list))
					vm_page_aflag_clear(m, PGA_WRITEABLE);
			}
		}
		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
			/*
			 * _pmap_unwire_l3() has already invalidated the TLB
			 * entries at all levels for "sva".  So, we need not
			 * perform "sva += L3_SIZE;" here.  Moreover, we need
			 * not perform "va = sva;" if "sva" is at the start
			 * of a new valid range consisting of a single page.
			 */
			break;
		}
		/* Begin a new pending invalidation run at this page. */
		if (va == eva)
			va = sva;
	}
	/* Flush whatever run is still pending when the loop ends. */
	if (va != eva)
		pmap_invalidate_range(pmap, va, sva, true);
}
4293
/*
 * Common implementation of pmap_remove() and pmap_map_delete(): remove the
 * mappings in the range [sva, eva) from "pmap".
 *
 * If "map_delete" is true, pmap_bti_on_remove() is additionally called for
 * the range before any mapping is removed.
 *
 * The function walks the page table hierarchy one L0, L1 or L2 region at a
 * time.  "va_next" is the start of the next region to examine; whenever its
 * computation wraps around, it is set to "eva" so that the loop terminates.
 * Page table pages released along the way are collected on a local list and
 * freed after the pmap lock has been dropped.
 */
static void
pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
{
	struct rwlock *lock;
	vm_offset_t va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t l3_paddr;
	struct spglist free;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	SLIST_INIT(&free);

	PMAP_LOCK(pmap);
	if (map_delete)
		pmap_bti_on_remove(pmap, sva, eva);

	/* No PV list lock is held yet; callees acquire one on demand. */
	lock = NULL;
	for (; sva < eva; sva = va_next) {
		/* Nothing is left to remove once the pmap is empty. */
		if (pmap->pm_stats.resident_count == 0)
			break;

		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/*
			 * An L1 block mapping is removed as a whole; the
			 * assertions require that the range covers it
			 * entirely and that it is an unmanaged user mapping.
			 */
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS(pmap != kernel_pmap);
			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
			pmap_clear(l1);
			pmap_s1_invalidate_page(pmap, sva, true);
			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
			continue;
		}

		/*
		 * Calculate index for next page table.
		 */
		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		l3_paddr = pmap_load(l2);

		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * Remove the whole 2MB mapping if the range covers
			 * it entirely; otherwise demote it so that only the
			 * requested 4KB pages are removed below.  If the
			 * demotion fails, move on to the next region.
			 */
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
				    true, &free, &lock);
				continue;
			} else if (pmap_demote_l2_locked(pmap, l2, sva,
			    &lock) == NULL)
				continue;
			l3_paddr = pmap_load(l2);
		}

		/*
		 * Weed out invalid mappings.
		 */
		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (va_next > eva)
			va_next = eva;

		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
		    &lock);
	}
	/* Release the PV list lock left held by the callees, if any. */
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}
4392
4393 /*
4394 * Remove the given range of addresses from the specified map.
4395 *
4396 * It is assumed that the start and end are properly
4397 * rounded to the page size.
4398 */
4399 void
pmap_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4400 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4401 {
4402 pmap_remove1(pmap, sva, eva, false);
4403 }
4404
4405 /*
4406 * Remove the given range of addresses as part of a logical unmap
4407 * operation. This has the effect of calling pmap_remove(), but
4408 * also clears any metadata that should persist for the lifetime
4409 * of a logical mapping.
4410 */
4411 void
pmap_map_delete(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4412 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4413 {
4414 pmap_remove1(pmap, sva, eva, true);
4415 }
4416
/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 *
 *	The page "m" must be managed; this is asserted.  The work is done
 *	in two passes while holding the page's PV list lock: first every
 *	2MB mapping on the superpage PV list is demoted, then every 4KB
 *	mapping on the page's own PV list is removed.
 */

void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t *pde, tpde;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	struct spglist free;
	int lvl, pvh_gen, md_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	SLIST_INIT(&free);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	/* Fictitious pages use the dummy PV list head instead. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/* Pass 1: demote every 2MB mapping that includes this page. */
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock could not be taken while holding the
			 * PV list lock.  Drop the PV list lock, block on the
			 * pmap lock, and start over if the PV list changed
			 * in the meantime (detected via its generation
			 * count).
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		pmap_demote_l2_locked(pmap, pte, va, &lock);
		PMAP_UNLOCK(pmap);
	}
	/* Pass 2: remove every 4KB mapping of this page. */
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Same lock dance as above, but here a change to
			 * either PV list forces a restart.
			 */
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pmap_resident_count_dec(pmap, 1);

		pde = pmap_pde(pmap, pv->pv_va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_remove_all: no page directory entry found"));
		KASSERT(lvl == 2,
		    ("pmap_remove_all: invalid pde level %d", lvl));
		tpde = pmap_load(pde);

		pte = pmap_l2_to_l3(pde, pv->pv_va);
		tpte = pmap_load(pte);
		/* Demote an L3C mapping so that only this entry is cleared. */
		if ((tpte & ATTR_CONTIGUOUS) != 0)
			(void)pmap_demote_l3c(pmap, pte, pv->pv_va);
		tpte = pmap_load_clear(pte);
		if (tpte & ATTR_SW_WIRED)
			pmap->pm_stats.wired_count--;
		/* The TLB is only invalidated if the entry was accessed. */
		if ((tpte & ATTR_AF) != 0) {
			pmap_invalidate_page(pmap, pv->pv_va, true);
			vm_page_aflag_set(m, PGA_REFERENCED);
		}

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	/* No mappings remain, so the page cannot be written through one. */
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
}
4516
4517 /*
4518 * Masks and sets bits in a level 2 page table entries in the specified pmap
4519 */
4520 static void
pmap_protect_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pt_entry_t mask,pt_entry_t nbits)4521 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4522 pt_entry_t nbits)
4523 {
4524 pd_entry_t old_l2;
4525 vm_page_t m, mt;
4526
4527 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4528 PMAP_ASSERT_STAGE1(pmap);
4529 KASSERT((sva & L2_OFFSET) == 0,
4530 ("pmap_protect_l2: sva is not 2mpage aligned"));
4531 old_l2 = pmap_load(l2);
4532 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4533 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4534
4535 /*
4536 * Return if the L2 entry already has the desired access restrictions
4537 * in place.
4538 */
4539 if ((old_l2 & mask) == nbits)
4540 return;
4541
4542 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4543 cpu_spinwait();
4544
4545 /*
4546 * When a dirty read/write superpage mapping is write protected,
4547 * update the dirty field of each of the superpage's constituent 4KB
4548 * pages.
4549 */
4550 if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4551 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4552 pmap_pte_dirty(pmap, old_l2)) {
4553 m = PTE_TO_VM_PAGE(old_l2);
4554 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4555 vm_page_dirty(mt);
4556 }
4557
4558 /*
4559 * Since a promotion must break the 4KB page mappings before making
4560 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4561 */
4562 pmap_s1_invalidate_page(pmap, sva, true);
4563 }
4564
4565 /*
4566 * Masks and sets bits in the specified L3C superpage mapping.
4567 *
4568 * Requests TLB invalidations to be performed by the caller through the
4569 * returned "*vap".
4570 */
4571 static void
pmap_mask_set_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,vm_offset_t * vap,vm_offset_t va_next,pt_entry_t mask,pt_entry_t nbits)4572 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4573 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4574 {
4575 pt_entry_t l3e, *tl3p;
4576 vm_page_t m, mt;
4577 bool dirty;
4578
4579 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4580 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4581 0, ("pmap_mask_set_l3c: l3p is not aligned"));
4582 KASSERT((va & L3C_OFFSET) == 0,
4583 ("pmap_mask_set_l3c: va is not aligned"));
4584 dirty = false;
4585 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4586 l3e = pmap_load(tl3p);
4587 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4588 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4589 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4590 cpu_spinwait();
4591 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4592 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4593 dirty = true;
4594 }
4595
4596 /*
4597 * When a dirty read/write superpage mapping is write protected,
4598 * update the dirty field of each of the superpage's constituent 4KB
4599 * pages.
4600 */
4601 if ((l3e & ATTR_SW_MANAGED) != 0 &&
4602 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4603 dirty) {
4604 m = PTE_TO_VM_PAGE(pmap_load(l3p));
4605 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4606 vm_page_dirty(mt);
4607 }
4608
4609 if (*vap == va_next)
4610 *vap = va;
4611 }
4612
/*
 * Masks and sets bits in last level page table entries in the specified
 * pmap and range
 *
 * For every valid mapping in [sva, eva), the bits selected by "mask" are
 * replaced by "nbits".  The pmap lock must be held by the caller.  TLB
 * invalidations are only issued if "invalidate" is true.
 *
 * Within an L3 page table the invalidations are batched: the local variable
 * "va" holds the start of the current run of modified entries that still
 * needs to be invalidated, and the value "va_next" means that no run is
 * pending.
 */
static void
pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
    pt_entry_t nbits, bool invalidate)
{
	vm_offset_t va, va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t *l3p, l3;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	for (; sva < eva; sva = va_next) {
		/*
		 * At each level, "va_next" is the start of the next region;
		 * it is set to "eva" if its computation wraps around.
		 */
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/*
			 * An L1 block mapping is updated as a whole; the
			 * assertions require that the range covers it
			 * entirely and that it is unmanaged.
			 */
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
			if ((pmap_load(l1) & mask) != nbits) {
				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
				if (invalidate)
					pmap_s1_invalidate_page(pmap, sva, true);
			}
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (pmap_load(l2) == 0)
			continue;

		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * Update the whole 2MB mapping if the range covers
			 * it entirely.  Otherwise skip it if it already has
			 * the requested bits or cannot be demoted; if it was
			 * demoted, its 4KB mappings are processed below.
			 */
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_protect_l2(pmap, l2, sva, mask, nbits);
				continue;
			} else if ((pmap_load(l2) & mask) == nbits ||
			    pmap_demote_l2(pmap, l2, sva) == NULL)
				continue;
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_protect: Invalid L2 entry after demotion"));

		/* Do not scan past the end of the requested range. */
		if (va_next > eva)
			va_next = eva;

		va = va_next;
		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
		    sva += L3_SIZE) {
			l3 = pmap_load(l3p);

			/*
			 * Go to the next L3 entry if the current one is
			 * invalid or already has the desired access
			 * restrictions in place.  (The latter case occurs
			 * frequently.  For example, in a "buildworld"
			 * workload, almost 1 out of 4 L3 entries already
			 * have the desired restrictions.)
			 */
			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
				/* This entry ends the pending run; flush it. */
				if (va != va_next) {
					if (invalidate)
						pmap_s1_invalidate_range(pmap,
						    va, sva, true);
					va = va_next;
				}
				if ((l3 & ATTR_CONTIGUOUS) != 0) {
					/*
					 * Does this L3C page extend beyond
					 * the requested range?  Handle the
					 * possibility that "va_next" is zero.
					 */
					if ((sva | L3C_OFFSET) > va_next - 1)
						break;

					/*
					 * Skip ahead to the last L3_PAGE
					 * within this L3C page.
					 */
					l3p = (pt_entry_t *)((uintptr_t)l3p |
					    ((L3C_ENTRIES - 1) *
					    sizeof(pt_entry_t)));
					sva |= L3C_SIZE - L3_SIZE;
				}
				continue;
			}

			if ((l3 & ATTR_CONTIGUOUS) != 0) {
				/*
				 * Is this entire set of contiguous L3 entries
				 * being protected?  Handle the possibility
				 * that "va_next" is zero because of address
				 * wraparound.
				 */
				if ((sva & L3C_OFFSET) == 0 &&
				    sva + L3C_OFFSET <= va_next - 1) {
					pmap_mask_set_l3c(pmap, l3p, sva, &va,
					    va_next, mask, nbits);
					/*
					 * Advance to the last entry of the
					 * L3C page; the loop increment steps
					 * past it.
					 */
					l3p += L3C_ENTRIES - 1;
					sva += L3C_SIZE - L3_SIZE;
					continue;
				}

				/* Only part of the L3C page changes: demote. */
				(void)pmap_demote_l3c(pmap, l3p, sva);

				/*
				 * The L3 entry's accessed bit may have changed.
				 */
				l3 = pmap_load(l3p);
			}
			/*
			 * Update the entry atomically; a failed
			 * atomic_fcmpset_64() refreshes "l3" for the retry.
			 */
			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
			    nbits))
				cpu_spinwait();

			/*
			 * When a dirty read/write mapping is write protected,
			 * update the page's dirty field.
			 */
			if ((l3 & ATTR_SW_MANAGED) != 0 &&
			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
			    pmap_pte_dirty(pmap, l3))
				vm_page_dirty(PTE_TO_VM_PAGE(l3));

			/* Begin a new pending invalidation run at this page. */
			if (va == va_next)
				va = sva;
		}
		/* Flush whatever run is still pending for this page table. */
		if (va != va_next && invalidate)
			pmap_s1_invalidate_range(pmap, va, sva, true);
	}
}
4762
4763 static void
pmap_mask_set(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t mask,pt_entry_t nbits,bool invalidate)4764 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4765 pt_entry_t nbits, bool invalidate)
4766 {
4767 PMAP_LOCK(pmap);
4768 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4769 PMAP_UNLOCK(pmap);
4770 }
4771
4772 /*
4773 * Set the physical protection on the
4774 * specified range of this map as requested.
4775 */
4776 void
pmap_protect(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_prot_t prot)4777 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4778 {
4779 pt_entry_t mask, nbits;
4780
4781 PMAP_ASSERT_STAGE1(pmap);
4782 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4783 if (prot == VM_PROT_NONE) {
4784 pmap_remove(pmap, sva, eva);
4785 return;
4786 }
4787
4788 mask = nbits = 0;
4789 if ((prot & VM_PROT_WRITE) == 0) {
4790 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4791 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4792 }
4793 if ((prot & VM_PROT_EXECUTE) == 0) {
4794 mask |= ATTR_S1_XN;
4795 nbits |= ATTR_S1_XN;
4796 }
4797 if (pmap == kernel_pmap) {
4798 mask |= ATTR_KERN_GP;
4799 nbits |= ATTR_KERN_GP;
4800 }
4801 if (mask == 0)
4802 return;
4803
4804 pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4805 }
4806
4807 void
pmap_disable_promotion(vm_offset_t sva,vm_size_t size)4808 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4809 {
4810
4811 MPASS((sva & L3_OFFSET) == 0);
4812 MPASS(((sva + size) & L3_OFFSET) == 0);
4813
4814 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4815 ATTR_SW_NO_PROMOTE, false);
4816 }
4817
4818 /*
4819 * Inserts the specified page table page into the specified pmap's collection
4820 * of idle page table pages. Each of a pmap's page table pages is responsible
4821 * for mapping a distinct range of virtual addresses. The pmap's collection is
4822 * ordered by this virtual address range.
4823 *
4824 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4825 * "mpte"'s valid field will be set to 0.
4826 *
4827 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4828 * contain valid mappings with identical attributes except for ATTR_AF;
4829 * "mpte"'s valid field will be set to 1.
4830 *
4831 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4832 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4833 * field will be set to VM_PAGE_BITS_ALL.
4834 */
4835 static __inline int
pmap_insert_pt_page(pmap_t pmap,vm_page_t mpte,bool promoted,bool all_l3e_AF_set)4836 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4837 bool all_l3e_AF_set)
4838 {
4839
4840 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4841 KASSERT(promoted || !all_l3e_AF_set,
4842 ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4843 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4844 return (vm_radix_insert(&pmap->pm_root, mpte));
4845 }
4846
4847 /*
4848 * Removes the page table page mapping the specified virtual address from the
4849 * specified pmap's collection of idle page table pages, and returns it.
4850 * Otherwise, returns NULL if there is no page table page corresponding to the
4851 * specified virtual address.
4852 */
4853 static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap,vm_offset_t va)4854 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4855 {
4856
4857 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4858 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4859 }
4860
4861 /*
4862 * Performs a break-before-make update of a pmap entry. This is needed when
4863 * either promoting or demoting pages to ensure the TLB doesn't get into an
4864 * inconsistent state.
4865 */
4866 static void
pmap_update_entry(pmap_t pmap,pd_entry_t * ptep,pd_entry_t newpte,vm_offset_t va,vm_size_t size)4867 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4868 vm_offset_t va, vm_size_t size)
4869 {
4870 register_t intr;
4871
4872 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4873 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4874 ("%s: Updating non-promote pte", __func__));
4875
4876 /*
4877 * Ensure we don't get switched out with the page table in an
4878 * inconsistent state. We also need to ensure no interrupts fire
4879 * as they may make use of an address we are about to invalidate.
4880 */
4881 intr = intr_disable();
4882
4883 /*
4884 * Clear the old mapping's valid bit, but leave the rest of the entry
4885 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4886 * lookup the physical address.
4887 */
4888 pmap_clear_bits(ptep, ATTR_DESCR_VALID);
4889
4890 /*
4891 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4892 * be cached, so we invalidate intermediate entries as well as final
4893 * entries.
4894 */
4895 pmap_s1_invalidate_range(pmap, va, va + size, false);
4896
4897 /* Create the new mapping */
4898 pmap_store(ptep, newpte);
4899 dsb(ishst);
4900
4901 intr_restore(intr);
4902 }
4903
4904 /*
4905 * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping.
4906 */
4907 static void __nosanitizecoverage
pmap_update_strided(pmap_t pmap,pd_entry_t * ptep,pd_entry_t * ptep_end,pd_entry_t newpte,vm_offset_t va,vm_offset_t stride,vm_size_t size)4908 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end,
4909 pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size)
4910 {
4911 pd_entry_t *lip;
4912 register_t intr;
4913
4914 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4915 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4916 ("%s: Updating non-promote pte", __func__));
4917
4918 /*
4919 * Ensure we don't get switched out with the page table in an
4920 * inconsistent state. We also need to ensure no interrupts fire
4921 * as they may make use of an address we are about to invalidate.
4922 */
4923 intr = intr_disable();
4924
4925 /*
4926 * Clear the old mapping's valid bits, but leave the rest of each
4927 * entry unchanged, so that a lockless, concurrent pmap_kextract() can
4928 * still lookup the physical address.
4929 */
4930 for (lip = ptep; lip < ptep_end; lip++)
4931 pmap_clear_bits(lip, ATTR_DESCR_VALID);
4932
4933 /* Only final entries are changing. */
4934 pmap_s1_invalidate_strided(pmap, va, va + size, stride, true);
4935
4936 /* Create the new mapping. */
4937 for (lip = ptep; lip < ptep_end; lip++) {
4938 pmap_store(lip, newpte);
4939 newpte += stride;
4940 }
4941 dsb(ishst);
4942
4943 intr_restore(intr);
4944 }
4945
4946 #if VM_NRESERVLEVEL > 0
4947 /*
4948 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4949 * replace the many pv entries for the 4KB page mappings by a single pv entry
4950 * for the 2MB page mapping.
4951 */
4952 static void
pmap_pv_promote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)4953 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4954 struct rwlock **lockp)
4955 {
4956 struct md_page *pvh;
4957 pv_entry_t pv;
4958 vm_offset_t va_last;
4959 vm_page_t m;
4960
4961 KASSERT((pa & L2_OFFSET) == 0,
4962 ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4963 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4964
4965 /*
4966 * Transfer the first page's pv entry for this mapping to the 2mpage's
4967 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
4968 * a transfer avoids the possibility that get_pv_entry() calls
4969 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4970 * mappings that is being promoted.
4971 */
4972 m = PHYS_TO_VM_PAGE(pa);
4973 va = va & ~L2_OFFSET;
4974 pv = pmap_pvh_remove(&m->md, pmap, va);
4975 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4976 pvh = page_to_pvh(m);
4977 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4978 pvh->pv_gen++;
4979 /* Free the remaining NPTEPG - 1 pv entries. */
4980 va_last = va + L2_SIZE - PAGE_SIZE;
4981 do {
4982 m++;
4983 va += PAGE_SIZE;
4984 pmap_pvh_free(&m->md, pmap, va);
4985 } while (va < va_last);
4986 }
4987
4988 /*
4989 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4990 * single level 2 table entry to a single 2MB page mapping. For promotion
4991 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4992 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4993 * identical characteristics.
4994 */
4995 static bool
pmap_promote_l2(pmap_t pmap,pd_entry_t * l2,vm_offset_t va,vm_page_t mpte,struct rwlock ** lockp)4996 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4997 struct rwlock **lockp)
4998 {
4999 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
5000
5001 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5002
5003 /*
5004 * Currently, this function only supports promotion on stage 1 pmaps
5005 * because it tests stage 1 specific fields and performs a break-
5006 * before-make sequence that is incorrect for stage 2 pmaps.
5007 */
5008 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
5009 return (false);
5010
5011 /*
5012 * Examine the first L3E in the specified PTP. Abort if this L3E is
5013 * ineligible for promotion...
5014 */
5015 firstl3 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
5016 newl2 = pmap_load(firstl3);
5017 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
5018 return (false);
5019 /* ... is not the first physical page within an L2 block */
5020 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
5021 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
5022 counter_u64_add(pmap_l2_p_failures, 1);
5023 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
5024 " in pmap %p", va, pmap);
5025 return (false);
5026 }
5027
5028 /*
5029 * Both here and in the below "for" loop, to allow for repromotion
5030 * after MADV_FREE, conditionally write protect a clean L3E before
5031 * possibly aborting the promotion due to other L3E attributes. Why?
5032 * Suppose that MADV_FREE is applied to a part of a superpage, the
5033 * address range [S, E). pmap_advise() will demote the superpage
5034 * mapping, destroy the 4KB page mapping at the end of [S, E), and
5035 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later,
5036 * imagine that the memory in [S, E) is recycled, but the last 4KB
5037 * page in [S, E) is not the last to be rewritten, or simply accessed.
5038 * In other words, there is still a 4KB page in [S, E), call it P,
5039 * that is writeable but AP_RO is set and AF is clear in P's L3E.
5040 * Unless we write protect P before aborting the promotion, if and
5041 * when P is finally rewritten, there won't be a page fault to trigger
5042 * repromotion.
5043 */
5044 setl2:
5045 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5046 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5047 /*
5048 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
5049 * ATTR_SW_DBM can be cleared without a TLB invalidation.
5050 */
5051 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
5052 goto setl2;
5053 newl2 &= ~ATTR_SW_DBM;
5054 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
5055 " in pmap %p", va & ~L2_OFFSET, pmap);
5056 }
5057
5058 /*
5059 * Examine each of the other L3Es in the specified PTP. Abort if this
5060 * L3E maps an unexpected 4KB physical page or does not have identical
5061 * characteristics to the first L3E. If ATTR_AF is not set in every
5062 * PTE, then request that the PTP be refilled on demotion.
5063 */
5064 all_l3e_AF = newl2 & ATTR_AF;
5065 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
5066 + L2_SIZE - PAGE_SIZE;
5067 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
5068 oldl3 = pmap_load(l3);
5069 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
5070 counter_u64_add(pmap_l2_p_failures, 1);
5071 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
5072 " in pmap %p", va, pmap);
5073 return (false);
5074 }
5075 setl3:
5076 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5077 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5078 /*
5079 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
5080 * set, ATTR_SW_DBM can be cleared without a TLB
5081 * invalidation.
5082 */
5083 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
5084 ~ATTR_SW_DBM))
5085 goto setl3;
5086 oldl3 &= ~ATTR_SW_DBM;
5087 }
5088 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
5089 counter_u64_add(pmap_l2_p_failures, 1);
5090 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
5091 " in pmap %p", va, pmap);
5092 return (false);
5093 }
5094 all_l3e_AF &= oldl3;
5095 pa -= PAGE_SIZE;
5096 }
5097
5098 /*
5099 * Unless all PTEs have ATTR_AF set, clear it from the superpage
5100 * mapping, so that promotions triggered by speculative mappings,
5101 * such as pmap_enter_quick(), don't automatically mark the
5102 * underlying pages as referenced.
5103 */
5104 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
5105
5106 /*
5107 * Save the page table page in its current state until the L2
5108 * mapping the superpage is demoted by pmap_demote_l2() or
5109 * destroyed by pmap_remove_l3().
5110 */
5111 if (mpte == NULL)
5112 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5113 KASSERT(mpte >= vm_page_array &&
5114 mpte < &vm_page_array[vm_page_array_size],
5115 ("pmap_promote_l2: page table page is out of range"));
5116 KASSERT(mpte->pindex == pmap_l2_pindex(va),
5117 ("pmap_promote_l2: page table page's pindex is wrong"));
5118 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
5119 counter_u64_add(pmap_l2_p_failures, 1);
5120 CTR2(KTR_PMAP,
5121 "pmap_promote_l2: failure for va %#lx in pmap %p", va,
5122 pmap);
5123 return (false);
5124 }
5125
5126 if ((newl2 & ATTR_SW_MANAGED) != 0)
5127 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
5128
5129 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
5130
5131 counter_u64_add(pmap_l2_promotions, 1);
5132 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
5133 pmap);
5134 return (true);
5135 }
5136
5137 /*
5138 * Tries to promote an aligned, contiguous set of base page mappings to a
5139 * single L3C page mapping. For promotion to occur, two conditions must be
5140 * met: (1) the base page mappings must map aligned, contiguous physical
5141 * memory and (2) the base page mappings must have identical characteristics
5142 * except for the accessed flag.
5143 */
5144 static bool
pmap_promote_l3c(pmap_t pmap,pd_entry_t * l3p,vm_offset_t va)5145 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
5146 {
5147 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
5148
5149 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5150
5151 /*
5152 * Currently, this function only supports promotion on stage 1 pmaps
5153 * because it tests stage 1 specific fields and performs a break-
5154 * before-make sequence that is incorrect for stage 2 pmaps.
5155 */
5156 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
5157 return (false);
5158
5159 /*
5160 * Compute the address of the first L3 entry in the superpage
5161 * candidate.
5162 */
5163 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
5164 sizeof(pt_entry_t)) - 1));
5165
5166 firstl3c = pmap_load(l3p);
5167
5168 /*
5169 * Examine the first L3 entry. Abort if this L3E is ineligible for
5170 * promotion...
5171 */
5172 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
5173 return (false);
5174 /* ...is not properly aligned... */
5175 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
5176 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
5177 counter_u64_add(pmap_l3c_p_failures, 1);
5178 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5179 " in pmap %p", va, pmap);
5180 return (false);
5181 }
5182
5183 /*
5184 * If the first L3 entry is a clean read-write mapping, convert it
5185 * to a read-only mapping. See pmap_promote_l2() for the rationale.
5186 */
5187 set_first:
5188 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5189 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5190 /*
5191 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
5192 * ATTR_SW_DBM can be cleared without a TLB invalidation.
5193 */
5194 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
5195 goto set_first;
5196 firstl3c &= ~ATTR_SW_DBM;
5197 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
5198 " in pmap %p", va & ~L3C_OFFSET, pmap);
5199 }
5200
5201 /*
5202 * Check that the rest of the L3 entries are compatible with the first,
5203 * and convert clean read-write mappings to read-only mappings.
5204 */
5205 all_l3e_AF = firstl3c & ATTR_AF;
5206 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
5207 L3C_SIZE - PAGE_SIZE;
5208 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
5209 oldl3 = pmap_load(l3);
5210 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
5211 counter_u64_add(pmap_l3c_p_failures, 1);
5212 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5213 " in pmap %p", va, pmap);
5214 return (false);
5215 }
5216 set_l3:
5217 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5218 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5219 /*
5220 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
5221 * set, ATTR_SW_DBM can be cleared without a TLB
5222 * invalidation.
5223 */
5224 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
5225 ~ATTR_SW_DBM))
5226 goto set_l3;
5227 oldl3 &= ~ATTR_SW_DBM;
5228 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
5229 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
5230 (va & ~L3C_OFFSET), pmap);
5231 }
5232 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
5233 counter_u64_add(pmap_l3c_p_failures, 1);
5234 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5235 " in pmap %p", va, pmap);
5236 return (false);
5237 }
5238 all_l3e_AF &= oldl3;
5239 pa -= PAGE_SIZE;
5240 }
5241
5242 /*
5243 * Unless all PTEs have ATTR_AF set, clear it from the superpage
5244 * mapping, so that promotions triggered by speculative mappings,
5245 * such as pmap_enter_quick(), don't automatically mark the
5246 * underlying pages as referenced.
5247 */
5248 firstl3c &= ~ATTR_AF | all_l3e_AF;
5249
5250 /*
5251 * Remake the mappings with the contiguous bit set.
5252 */
5253 pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c |
5254 ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE);
5255
5256 counter_u64_add(pmap_l3c_promotions, 1);
5257 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
5258 pmap);
5259 return (true);
5260 }
5261 #endif /* VM_NRESERVLEVEL > 0 */
5262
5263 static int
pmap_enter_largepage(pmap_t pmap,vm_offset_t va,pt_entry_t pte,int flags,int psind)5264 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
5265 int psind)
5266 {
5267 pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p;
5268 vm_page_t mp;
5269
5270 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5271 KASSERT(psind > 0 && psind < MAXPAGESIZES,
5272 ("psind %d unexpected", psind));
5273 KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0,
5274 ("unaligned phys address %#lx pte %#lx psind %d",
5275 PTE_TO_PHYS(pte), pte, psind));
5276
5277 restart:
5278 newpte = pte;
5279 if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
5280 return (KERN_PROTECTION_FAILURE);
5281 if (psind == 3) {
5282 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5283
5284 KASSERT(pagesizes[psind] == L1_SIZE,
5285 ("pagesizes[%d] != L1_SIZE", psind));
5286 l0p = pmap_l0(pmap, va);
5287 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
5288 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
5289 if (mp == NULL) {
5290 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5291 return (KERN_RESOURCE_SHORTAGE);
5292 PMAP_UNLOCK(pmap);
5293 vm_wait(NULL);
5294 PMAP_LOCK(pmap);
5295 goto restart;
5296 }
5297 l1p = pmap_l0_to_l1(l0p, va);
5298 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5299 origpte = pmap_load(l1p);
5300 } else {
5301 l1p = pmap_l0_to_l1(l0p, va);
5302 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5303 origpte = pmap_load(l1p);
5304 if ((origpte & ATTR_DESCR_VALID) == 0) {
5305 mp = PTE_TO_VM_PAGE(pmap_load(l0p));
5306 mp->ref_count++;
5307 }
5308 }
5309 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
5310 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
5311 (origpte & ATTR_DESCR_VALID) == 0,
5312 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
5313 va, origpte, newpte));
5314 pmap_store(l1p, newpte);
5315 } else if (psind == 2) {
5316 KASSERT(pagesizes[psind] == L2_SIZE,
5317 ("pagesizes[%d] != L2_SIZE", psind));
5318 l2p = pmap_l2(pmap, va);
5319 if (l2p == NULL) {
5320 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
5321 if (mp == NULL) {
5322 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5323 return (KERN_RESOURCE_SHORTAGE);
5324 PMAP_UNLOCK(pmap);
5325 vm_wait(NULL);
5326 PMAP_LOCK(pmap);
5327 goto restart;
5328 }
5329 l2p = VM_PAGE_TO_DMAP(mp);
5330 l2p = &l2p[pmap_l2_index(va)];
5331 origpte = pmap_load(l2p);
5332 } else {
5333 l1p = pmap_l1(pmap, va);
5334 origpte = pmap_load(l2p);
5335 if ((origpte & ATTR_DESCR_VALID) == 0) {
5336 mp = PTE_TO_VM_PAGE(pmap_load(l1p));
5337 mp->ref_count++;
5338 }
5339 }
5340 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5341 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
5342 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5343 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
5344 va, origpte, newpte));
5345 pmap_store(l2p, newpte);
5346 } else /* (psind == 1) */ {
5347 KASSERT(pagesizes[psind] == L3C_SIZE,
5348 ("pagesizes[%d] != L3C_SIZE", psind));
5349 l2p = pmap_l2(pmap, va);
5350 if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) {
5351 mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
5352 if (mp == NULL) {
5353 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5354 return (KERN_RESOURCE_SHORTAGE);
5355 PMAP_UNLOCK(pmap);
5356 vm_wait(NULL);
5357 PMAP_LOCK(pmap);
5358 goto restart;
5359 }
5360 mp->ref_count += L3C_ENTRIES - 1;
5361 l3p = VM_PAGE_TO_DMAP(mp);
5362 l3p = &l3p[pmap_l3_index(va)];
5363 } else {
5364 l3p = pmap_l2_to_l3(l2p, va);
5365 if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) {
5366 mp = PTE_TO_VM_PAGE(pmap_load(l2p));
5367 mp->ref_count += L3C_ENTRIES;
5368 }
5369 }
5370 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5371 origpte = pmap_load(tl3p);
5372 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5373 ((origpte & ATTR_CONTIGUOUS) != 0 &&
5374 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5375 ("va %#lx changing 64K phys page l3 %#lx newpte %#lx",
5376 va, origpte, newpte));
5377 pmap_store(tl3p, newpte);
5378 newpte += L3_SIZE;
5379 }
5380 }
5381 dsb(ishst);
5382
5383 if ((origpte & ATTR_DESCR_VALID) == 0)
5384 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
5385 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
5386 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
5387 else if ((newpte & ATTR_SW_WIRED) == 0 &&
5388 (origpte & ATTR_SW_WIRED) != 0)
5389 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
5390
5391 return (KERN_SUCCESS);
5392 }
5393
5394 static void
pmap_set_unprotected(pt_entry_t new_l3)5395 pmap_set_unprotected(pt_entry_t new_l3)
5396 {
5397 vm_paddr_t pa;
5398
5399 pa = PTE_TO_PHYS(new_l3) & ~prot_ns_shared_pa;
5400
5401 rsi_set_addr_range_state(pa, pa + L3_SIZE, RSI_RIPAS_EMPTY,
5402 RSI_CHANGE_DESTROYED, NULL);
5403 }
5404
5405 static void
pmap_set_protected(pt_entry_t old_l3)5406 pmap_set_protected(pt_entry_t old_l3)
5407 {
5408 vm_paddr_t pa;
5409
5410 pa = PTE_TO_PHYS(old_l3) & ~prot_ns_shared_pa;
5411
5412 rsi_set_addr_range_state(pa, pa + L3_SIZE, RSI_RIPAS_RAM,
5413 RSI_CHANGE_DESTROYED, NULL);
5414 }
5415
5416 /*
5417 * Insert the given physical page (p) at
5418 * the specified virtual address (v) in the
5419 * target physical map with the protection requested.
5420 *
5421 * If specified, the page will be wired down, meaning
5422 * that the related pte can not be reclaimed.
5423 *
5424 * NB: This is the only routine which MAY NOT lazy-evaluate
5425 * or lose information. That is, this routine must actually
5426 * insert this page into the given map NOW.
5427 */
5428 int
pmap_enter(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,u_int flags,int8_t psind)5429 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5430 u_int flags, int8_t psind)
5431 {
5432 struct rwlock *lock;
5433 pd_entry_t *pde;
5434 pt_entry_t new_l3, orig_l3;
5435 pt_entry_t *l2, *l3;
5436 pv_entry_t pv;
5437 vm_paddr_t opa, pa;
5438 vm_page_t mpte, om;
5439 bool nosleep;
5440 int full_lvl, lvl, rv;
5441
5442 KASSERT(ADDR_IS_CANONICAL(va),
5443 ("%s: Address not in canonical form: %lx", __func__, va));
5444
5445 va = trunc_page(va);
5446 if ((m->oflags & VPO_UNMANAGED) == 0)
5447 VM_PAGE_OBJECT_BUSY_ASSERT(m);
5448 pa = VM_PAGE_TO_PHYS(m);
5449 if (in_realm() && (flags & PMAP_ENTER_UNPROTECTED) != 0)
5450 pa |= prot_ns_shared_pa;
5451 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
5452 L3_PAGE);
5453 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5454 new_l3 |= pmap_pte_prot(pmap, prot);
5455 if ((flags & PMAP_ENTER_WIRED) != 0)
5456 new_l3 |= ATTR_SW_WIRED;
5457 if (pmap->pm_stage == PM_STAGE1) {
5458 if (ADDR_IS_USER(va))
5459 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5460 else
5461 new_l3 |= ATTR_S1_UXN;
5462 if (pmap != kernel_pmap)
5463 new_l3 |= ATTR_S1_nG;
5464 } else {
5465 /*
5466 * Clear the access flag on executable mappings, this will be
5467 * set later when the page is accessed. The fault handler is
5468 * required to invalidate the I-cache.
5469 *
5470 * TODO: Switch to the valid flag to allow hardware management
5471 * of the access flag. Much of the pmap code assumes the
5472 * valid flag is set and fails to destroy the old page tables
5473 * correctly if it is clear.
5474 */
5475 if (prot & VM_PROT_EXECUTE)
5476 new_l3 &= ~ATTR_AF;
5477 }
5478 if ((m->oflags & VPO_UNMANAGED) == 0) {
5479 new_l3 |= ATTR_SW_MANAGED;
5480 if ((prot & VM_PROT_WRITE) != 0) {
5481 new_l3 |= ATTR_SW_DBM;
5482 if ((flags & VM_PROT_WRITE) == 0) {
5483 if (pmap->pm_stage == PM_STAGE1)
5484 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5485 else
5486 new_l3 &=
5487 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5488 }
5489 }
5490 }
5491
5492 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5493
5494 lock = NULL;
5495 PMAP_LOCK(pmap);
5496 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5497 KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5498 ("managed largepage va %#lx flags %#x", va, flags));
5499 if (psind == 3) {
5500 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5501 new_l3 &= ~L3_PAGE;
5502 new_l3 |= L1_BLOCK;
5503 } else if (psind == 2) {
5504 new_l3 &= ~L3_PAGE;
5505 new_l3 |= L2_BLOCK;
5506 } else /* (psind == 1) */
5507 new_l3 |= ATTR_CONTIGUOUS;
5508 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5509 goto out;
5510 }
5511 if (psind == 2) {
5512 /* Assert the required virtual and physical alignment. */
5513 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5514 KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind"));
5515 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5516 flags, m, &lock);
5517 goto out;
5518 }
5519 mpte = NULL;
5520 if (psind == 1) {
5521 KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned"));
5522 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5523 rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags,
5524 m, &mpte, &lock);
5525 #if VM_NRESERVLEVEL > 0
5526 /*
5527 * Attempt L2 promotion, if both the PTP and a level 1
5528 * reservation are fully populated.
5529 */
5530 if (rv == KERN_SUCCESS &&
5531 (mpte == NULL || mpte->ref_count == NL3PG) &&
5532 (m->flags & PG_FICTITIOUS) == 0 &&
5533 vm_reserv_level_iffullpop(m) == 1) {
5534 pde = pmap_l2(pmap, va);
5535 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5536 }
5537 #endif
5538 goto out;
5539 }
5540
5541 /*
5542 * In the case that a page table page is not
5543 * resident, we are creating it here.
5544 */
5545 retry:
5546 pde = pmap_pde(pmap, va, &lvl);
5547 if (pde != NULL && lvl == 2) {
5548 l3 = pmap_l2_to_l3(pde, va);
5549 if (ADDR_IS_USER(va) && mpte == NULL) {
5550 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5551 mpte->ref_count++;
5552 }
5553 goto havel3;
5554 } else if (pde != NULL && lvl == 1) {
5555 l2 = pmap_l1_to_l2(pde, va);
5556 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5557 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5558 l3 = &l3[pmap_l3_index(va)];
5559 if (ADDR_IS_USER(va)) {
5560 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5561 mpte->ref_count++;
5562 }
5563 goto havel3;
5564 }
5565 /* We need to allocate an L3 table. */
5566 }
5567 if (ADDR_IS_USER(va)) {
5568 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5569
5570 /*
5571 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5572 * to handle the possibility that a superpage mapping for "va"
5573 * was created while we slept.
5574 */
5575 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5576 nosleep ? NULL : &lock);
5577 if (mpte == NULL && nosleep) {
5578 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5579 rv = KERN_RESOURCE_SHORTAGE;
5580 goto out;
5581 }
5582 goto retry;
5583 } else
5584 panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5585
5586 havel3:
5587 orig_l3 = pmap_load(l3);
5588 opa = PTE_TO_PHYS(orig_l3);
5589 pv = NULL;
5590 new_l3 |= pmap_pte_bti(pmap, va);
5591
5592 /*
5593 * Is the specified virtual address already mapped?
5594 */
5595 if (pmap_l3_valid(orig_l3)) {
5596 /*
5597 * Wiring change, just update stats. We don't worry about
5598 * wiring PT pages as they remain resident as long as there
5599 * are valid mappings in them. Hence, if a user page is wired,
5600 * the PT page will be also.
5601 */
5602 if ((flags & PMAP_ENTER_WIRED) != 0 &&
5603 (orig_l3 & ATTR_SW_WIRED) == 0)
5604 pmap->pm_stats.wired_count++;
5605 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5606 (orig_l3 & ATTR_SW_WIRED) != 0)
5607 pmap->pm_stats.wired_count--;
5608
5609 /*
5610 * Remove the extra PT page reference.
5611 */
5612 if (mpte != NULL) {
5613 mpte->ref_count--;
5614 KASSERT(mpte->ref_count > 0,
5615 ("pmap_enter: missing reference to page table page,"
5616 " va: 0x%lx", va));
5617 }
5618
5619 /*
5620 * Has the physical page changed?
5621 */
5622 if (opa == pa) {
5623 /*
5624 * No, might be a protection or wiring change.
5625 */
5626 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5627 (new_l3 & ATTR_SW_DBM) != 0)
5628 vm_page_aflag_set(m, PGA_WRITEABLE);
5629 goto validate;
5630 }
5631
5632 /*
5633 * The physical page has changed. Temporarily invalidate
5634 * the mapping.
5635 */
5636 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5637 (void)pmap_demote_l3c(pmap, l3, va);
5638 orig_l3 = pmap_load_clear(l3);
5639 KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5640 ("pmap_enter: unexpected pa update for %#lx", va));
5641 if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5642 om = PHYS_TO_VM_PAGE(opa);
5643
5644 /*
5645 * The pmap lock is sufficient to synchronize with
5646 * concurrent calls to pmap_page_test_mappings() and
5647 * pmap_ts_referenced().
5648 */
5649 if (pmap_pte_dirty(pmap, orig_l3))
5650 vm_page_dirty(om);
5651 if ((orig_l3 & ATTR_AF) != 0) {
5652 pmap_invalidate_page(pmap, va, true);
5653 vm_page_aflag_set(om, PGA_REFERENCED);
5654 }
5655 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
5656 pv = pmap_pvh_remove(&om->md, pmap, va);
5657 if ((m->oflags & VPO_UNMANAGED) != 0)
5658 free_pv_entry(pmap, pv);
5659 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5660 TAILQ_EMPTY(&om->md.pv_list) &&
5661 ((om->flags & PG_FICTITIOUS) != 0 ||
5662 TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5663 vm_page_aflag_clear(om, PGA_WRITEABLE);
5664 } else {
5665 KASSERT((orig_l3 & ATTR_AF) != 0,
5666 ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5667 pmap_invalidate_page(pmap, va, true);
5668 }
5669 orig_l3 = 0;
5670 } else {
5671 /*
5672 * Increment the counters.
5673 */
5674 if ((new_l3 & ATTR_SW_WIRED) != 0)
5675 pmap->pm_stats.wired_count++;
5676 pmap_resident_count_inc(pmap, 1);
5677 }
5678 /*
5679 * Enter on the PV list if part of our managed memory.
5680 */
5681 if ((m->oflags & VPO_UNMANAGED) == 0) {
5682 if (pv == NULL) {
5683 pv = get_pv_entry(pmap, &lock);
5684 pv->pv_va = va;
5685 }
5686 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5687 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5688 m->md.pv_gen++;
5689 if ((new_l3 & ATTR_SW_DBM) != 0)
5690 vm_page_aflag_set(m, PGA_WRITEABLE);
5691 }
5692
5693 validate:
5694 if (pmap->pm_stage == PM_STAGE1) {
5695 /*
5696 * Sync icache if exec permission and attribute
5697 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping
5698 * is stored and made valid for hardware table walk. If done
5699 * later, then other can access this page before caches are
5700 * properly synced. Don't do it for kernel memory which is
5701 * mapped with exec permission even if the memory isn't going
5702 * to hold executable code. The only time when icache sync is
5703 * needed is after kernel module is loaded and the relocation
5704 * info is processed. And it's done in elf_cpu_load_file().
5705 */
5706 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5707 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5708 (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5709 PMAP_ASSERT_STAGE1(pmap);
5710 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
5711 }
5712 } else {
5713 cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
5714 }
5715
5716 /*
5717 * Update the L3 entry
5718 */
5719 if (pmap_l3_valid(orig_l3)) {
5720 KASSERT(opa == pa, ("pmap_enter: invalid update"));
5721 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5722 /* same PA, different attributes */
5723 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5724 (void)pmap_demote_l3c(pmap, l3, va);
5725 orig_l3 = pmap_load_store(l3, new_l3);
5726 pmap_invalidate_page(pmap, va, true);
5727 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5728 pmap_pte_dirty(pmap, orig_l3))
5729 vm_page_dirty(m);
5730 } else {
5731 /*
5732 * orig_l3 == new_l3
5733 * This can happens if multiple threads simultaneously
5734 * access not yet mapped page. This bad for performance
5735 * since this can cause full demotion-NOP-promotion
5736 * cycle.
5737 * Another possible reasons are:
5738 * - VM and pmap memory layout are diverged
5739 * - tlb flush is missing somewhere and CPU doesn't see
5740 * actual mapping.
5741 */
5742 CTR4(KTR_PMAP, "%s: already mapped page - "
5743 "pmap %p va 0x%#lx pte 0x%lx",
5744 __func__, pmap, va, new_l3);
5745 }
5746 } else {
5747 /* New mapping */
5748 pmap_store(l3, new_l3);
5749 dsb(ishst);
5750 }
5751
5752 #if VM_NRESERVLEVEL > 0
5753 /*
5754 * First, attempt L3C promotion, if the virtual and physical addresses
5755 * are aligned with each other and an underlying reservation has the
5756 * neighboring L3 pages allocated. The first condition is simply an
5757 * optimization that recognizes some eventual promotion failures early
5758 * at a lower run-time cost. Then, if both a level 1 reservation and
5759 * the PTP are fully populated, attempt L2 promotion.
5760 */
5761 if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5762 (m->flags & PG_FICTITIOUS) == 0 &&
5763 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
5764 pmap_promote_l3c(pmap, l3, va) &&
5765 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG))
5766 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5767 #endif
5768
5769 rv = KERN_SUCCESS;
5770
5771 if (in_realm() && (flags & PMAP_ENTER_UNPROTECTED) != 0)
5772 pmap_set_unprotected(new_l3);
5773
5774 out:
5775 if (lock != NULL)
5776 rw_wunlock(lock);
5777 PMAP_UNLOCK(pmap);
5778 return (rv);
5779 }
5780
5781 /*
5782 * Tries to create a read- and/or execute-only L2 page mapping. Returns
5783 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5784 * value. See pmap_enter_l2() for the possible error values when "no sleep",
5785 * "no replace", and "no reclaim" are specified.
5786 */
5787 static int
pmap_enter_l2_rx(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,struct rwlock ** lockp)5788 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5789 struct rwlock **lockp)
5790 {
5791 pd_entry_t new_l2;
5792
5793 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5794 PMAP_ASSERT_STAGE1(pmap);
5795 KASSERT(ADDR_IS_CANONICAL(va),
5796 ("%s: Address not in canonical form: %lx", __func__, va));
5797
5798 new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | pmap_sh_attr |
5799 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5800 L2_BLOCK);
5801 if ((m->oflags & VPO_UNMANAGED) == 0)
5802 new_l2 |= ATTR_SW_MANAGED;
5803 else
5804 new_l2 |= ATTR_AF;
5805 if ((prot & VM_PROT_EXECUTE) == 0 ||
5806 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5807 new_l2 |= ATTR_S1_XN;
5808 if (ADDR_IS_USER(va))
5809 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5810 else
5811 new_l2 |= ATTR_S1_UXN;
5812 if (pmap != kernel_pmap)
5813 new_l2 |= ATTR_S1_nG;
5814 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5815 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5816 }
5817
5818 /*
5819 * Returns true if every page table entry in the specified page table is
5820 * zero.
5821 */
5822 static bool
pmap_every_pte_zero(vm_paddr_t pa)5823 pmap_every_pte_zero(vm_paddr_t pa)
5824 {
5825 pt_entry_t *pt_end, *pte;
5826
5827 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5828 pte = PHYS_TO_DMAP(pa);
5829 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5830 if (*pte != 0)
5831 return (false);
5832 }
5833 return (true);
5834 }
5835
5836 /*
5837 * Tries to create the specified L2 page mapping. Returns KERN_SUCCESS if
5838 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
5839 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
5840 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
5841 * within the L2 virtual address range starting at the specified virtual
5842 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
5843 * L2 page mapping already exists at the specified virtual address. Returns
5844 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
5845 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
5846 * and a PV entry allocation failed.
5847 */
5848 static int
pmap_enter_l2(pmap_t pmap,vm_offset_t va,pd_entry_t new_l2,u_int flags,vm_page_t m,struct rwlock ** lockp)5849 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
5850 vm_page_t m, struct rwlock **lockp)
5851 {
5852 struct spglist free;
5853 pd_entry_t *l2, old_l2;
5854 vm_page_t l2pg, mt;
5855 vm_page_t uwptpg;
5856
5857 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5858 KASSERT(ADDR_IS_CANONICAL(va),
5859 ("%s: Address not in canonical form: %lx", __func__, va));
5860 KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
5861 PMAP_ENTER_NORECLAIM,
5862 ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE"));
5863
5864 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
5865 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
5866 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
5867 va, pmap);
5868 return (KERN_RESOURCE_SHORTAGE);
5869 }
5870
5871 /*
5872 * If bti is not the same for the whole l2 range, return failure
5873 * and let vm_fault() cope. Check after l2 allocation, since
5874 * it could sleep.
5875 */
5876 if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) {
5877 KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
5878 pmap_abort_ptp(pmap, va, l2pg);
5879 return (KERN_PROTECTION_FAILURE);
5880 }
5881
5882 /*
5883 * If there are existing mappings, either abort or remove them.
5884 */
5885 if ((old_l2 = pmap_load(l2)) != 0) {
5886 KASSERT(l2pg == NULL || l2pg->ref_count > 1,
5887 ("pmap_enter_l2: l2pg's ref count is too low"));
5888 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5889 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5890 if (l2pg != NULL)
5891 l2pg->ref_count--;
5892 CTR2(KTR_PMAP,
5893 "pmap_enter_l2: no space for va %#lx"
5894 " in pmap %p", va, pmap);
5895 return (KERN_NO_SPACE);
5896 } else if (ADDR_IS_USER(va) ||
5897 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
5898 if (l2pg != NULL)
5899 l2pg->ref_count--;
5900 CTR2(KTR_PMAP,
5901 "pmap_enter_l2: failure for va %#lx"
5902 " in pmap %p", va, pmap);
5903 return (KERN_FAILURE);
5904 }
5905 }
5906 SLIST_INIT(&free);
5907 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5908 (void)pmap_remove_l2(pmap, l2, va,
5909 pmap_load(pmap_l1(pmap, va)), false, &free, lockp);
5910 } else {
5911 if (ADDR_IS_KERNEL(va)) {
5912 /*
5913 * Try to save the ptp in the trie
5914 * before any changes to mappings are
5915 * made. Abort on failure.
5916 */
5917 mt = PTE_TO_VM_PAGE(old_l2);
5918 if (pmap_insert_pt_page(pmap, mt, false,
5919 false)) {
5920 CTR1(KTR_PMAP,
5921 "pmap_enter_l2: cannot ins kern ptp va %#lx",
5922 va);
5923 return (KERN_RESOURCE_SHORTAGE);
5924 }
5925 /*
5926 * Both pmap_remove_l2() and
5927 * pmap_remove_l3_range() will zero fill
5928 * the L3 kernel page table page.
5929 */
5930 }
5931 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
5932 &free, lockp);
5933 if (ADDR_IS_KERNEL(va)) {
5934 /*
5935 * The TLB could have an intermediate
5936 * entry for the L3 kernel page table
5937 * page, so request an invalidation at
5938 * all levels after clearing the
5939 * L2_TABLE entry.
5940 */
5941 pmap_clear(l2);
5942 pmap_s1_invalidate_page(pmap, va, false);
5943 }
5944 }
5945 KASSERT(pmap_load(l2) == 0,
5946 ("pmap_enter_l2: non-zero L2 entry %p", l2));
5947 if (ADDR_IS_USER(va)) {
5948 vm_page_free_pages_toq(&free, true);
5949 } else {
5950 KASSERT(SLIST_EMPTY(&free),
5951 ("pmap_enter_l2: freed kernel page table page"));
5952 }
5953 }
5954
5955 /*
5956 * Allocate leaf ptpage for wired userspace pages.
5957 */
5958 uwptpg = NULL;
5959 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
5960 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5961 if (uwptpg == NULL) {
5962 pmap_abort_ptp(pmap, va, l2pg);
5963 return (KERN_RESOURCE_SHORTAGE);
5964 }
5965 uwptpg->pindex = pmap_l2_pindex(va);
5966 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
5967 vm_page_unwire_noq(uwptpg);
5968 vm_page_free(uwptpg);
5969 pmap_abort_ptp(pmap, va, l2pg);
5970 return (KERN_RESOURCE_SHORTAGE);
5971 }
5972 pmap_resident_count_inc(pmap, 1);
5973 uwptpg->ref_count = NL3PG;
5974 }
5975 if ((new_l2 & ATTR_SW_MANAGED) != 0) {
5976 /*
5977 * Abort this mapping if its PV entry could not be created.
5978 */
5979 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
5980 if (l2pg != NULL)
5981 pmap_abort_ptp(pmap, va, l2pg);
5982 else {
5983 KASSERT(ADDR_IS_KERNEL(va) &&
5984 (pmap_load(l2) & ATTR_DESCR_MASK) ==
5985 L2_TABLE,
5986 ("pmap_enter_l2: invalid kernel L2E"));
5987 mt = pmap_remove_pt_page(pmap, va);
5988 KASSERT(mt != NULL,
5989 ("pmap_enter_l2: missing kernel PTP"));
5990 }
5991 if (uwptpg != NULL) {
5992 mt = pmap_remove_pt_page(pmap, va);
5993 KASSERT(mt == uwptpg,
5994 ("removed pt page %p, expected %p", mt,
5995 uwptpg));
5996 pmap_resident_count_dec(pmap, 1);
5997 uwptpg->ref_count = 1;
5998 vm_page_unwire_noq(uwptpg);
5999 vm_page_free(uwptpg);
6000 }
6001 CTR2(KTR_PMAP,
6002 "pmap_enter_l2: failure for va %#lx in pmap %p",
6003 va, pmap);
6004 return (KERN_RESOURCE_SHORTAGE);
6005 }
6006 if ((new_l2 & ATTR_SW_DBM) != 0)
6007 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6008 vm_page_aflag_set(mt, PGA_WRITEABLE);
6009 }
6010
6011 /*
6012 * Increment counters.
6013 */
6014 if ((new_l2 & ATTR_SW_WIRED) != 0)
6015 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
6016 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
6017
6018 /*
6019 * Conditionally sync the icache. See pmap_enter() for details.
6020 */
6021 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
6022 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
6023 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
6024 cpu_icache_sync_range(PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
6025 L2_SIZE);
6026 }
6027
6028 /*
6029 * Map the superpage.
6030 */
6031 pmap_store(l2, new_l2);
6032 dsb(ishst);
6033
6034 counter_u64_add(pmap_l2_mappings, 1);
6035 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
6036 va, pmap);
6037
6038 return (KERN_SUCCESS);
6039 }
6040
6041 /*
6042 * Tries to create a read- and/or execute-only L3C page mapping. Returns
6043 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
6044 * value.
6045 */
6046 static int
pmap_enter_l3c_rx(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_page_t * ml3p,vm_prot_t prot,struct rwlock ** lockp)6047 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
6048 vm_prot_t prot, struct rwlock **lockp)
6049 {
6050 pt_entry_t l3e;
6051
6052 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6053 PMAP_ASSERT_STAGE1(pmap);
6054 KASSERT(ADDR_IS_CANONICAL(va),
6055 ("%s: Address not in canonical form: %lx", __func__, va));
6056
6057 l3e = VM_PAGE_TO_PTE(m) | pmap_sh_attr |
6058 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
6059 ATTR_CONTIGUOUS | L3_PAGE;
6060 if ((m->oflags & VPO_UNMANAGED) == 0)
6061 l3e |= ATTR_SW_MANAGED;
6062 else
6063 l3e |= ATTR_AF;
6064 if ((prot & VM_PROT_EXECUTE) == 0 ||
6065 m->md.pv_memattr == VM_MEMATTR_DEVICE)
6066 l3e |= ATTR_S1_XN;
6067 if (ADDR_IS_USER(va))
6068 l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
6069 else
6070 l3e |= ATTR_S1_UXN;
6071 if (pmap != kernel_pmap)
6072 l3e |= ATTR_S1_nG;
6073 return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
6074 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
6075 }
6076
6077 static int
pmap_enter_l3c(pmap_t pmap,vm_offset_t va,pt_entry_t l3e,u_int flags,vm_page_t m,vm_page_t * ml3p,struct rwlock ** lockp)6078 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
6079 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
6080 {
6081 pd_entry_t *l2p, *pde;
6082 pt_entry_t *l3p, *tl3p;
6083 vm_page_t mt;
6084 vm_paddr_t pa;
6085 vm_pindex_t l2pindex;
6086 int lvl;
6087
6088 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6089 KASSERT((va & L3C_OFFSET) == 0,
6090 ("pmap_enter_l3c: va is not aligned"));
6091 KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
6092 ("pmap_enter_l3c: managed mapping within the clean submap"));
6093 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
6094 ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS"));
6095
6096 /*
6097 * If the L3 PTP is not resident, we attempt to create it here.
6098 */
6099 if (ADDR_IS_USER(va)) {
6100 /*
6101 * Were we given the correct L3 PTP? If so, we can simply
6102 * increment its ref count.
6103 */
6104 l2pindex = pmap_l2_pindex(va);
6105 if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
6106 (*ml3p)->ref_count += L3C_ENTRIES;
6107 } else {
6108 retry:
6109 /*
6110 * Get the L2 entry.
6111 */
6112 pde = pmap_pde(pmap, va, &lvl);
6113
6114 /*
6115 * If the L2 entry is a superpage, we either abort or
6116 * demote depending on the given flags.
6117 */
6118 if (lvl == 1) {
6119 l2p = pmap_l1_to_l2(pde, va);
6120 if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
6121 L2_BLOCK) {
6122 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
6123 return (KERN_FAILURE);
6124 l3p = pmap_demote_l2_locked(pmap, l2p,
6125 va, lockp);
6126 if (l3p != NULL) {
6127 *ml3p = PTE_TO_VM_PAGE(
6128 pmap_load(l2p));
6129 (*ml3p)->ref_count +=
6130 L3C_ENTRIES;
6131 goto have_l3p;
6132 }
6133 }
6134 /* We need to allocate an L3 PTP. */
6135 }
6136
6137 /*
6138 * If the L3 PTP is mapped, we just increment its ref
6139 * count. Otherwise, we attempt to allocate it.
6140 */
6141 if (lvl == 2 && pmap_load(pde) != 0) {
6142 *ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
6143 (*ml3p)->ref_count += L3C_ENTRIES;
6144 } else {
6145 *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
6146 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
6147 if (*ml3p == NULL) {
6148 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
6149 return (KERN_FAILURE);
6150
6151 /*
6152 * The page table may have changed
6153 * while we slept.
6154 */
6155 goto retry;
6156 }
6157 (*ml3p)->ref_count += L3C_ENTRIES - 1;
6158 }
6159 }
6160 l3p = VM_PAGE_TO_DMAP(*ml3p);
6161 } else {
6162 *ml3p = NULL;
6163
6164 /*
6165 * If the L2 entry is a superpage, we either abort or demote
6166 * depending on the given flags.
6167 */
6168 pde = pmap_pde(kernel_pmap, va, &lvl);
6169 if (lvl == 1) {
6170 l2p = pmap_l1_to_l2(pde, va);
6171 KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
6172 ("pmap_enter_l3c: missing L2 block"));
6173 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
6174 return (KERN_FAILURE);
6175 l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
6176 } else {
6177 KASSERT(lvl == 2,
6178 ("pmap_enter_l3c: Invalid level %d", lvl));
6179 l3p = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(pde)));
6180 }
6181 }
6182 have_l3p:
6183 l3p = &l3p[pmap_l3_index(va)];
6184
6185 /*
6186 * If bti is not the same for the whole L3C range, return failure
6187 * and let vm_fault() cope. Check after L3 allocation, since
6188 * it could sleep.
6189 */
6190 if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) {
6191 KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP"));
6192 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
6193 pmap_abort_ptp(pmap, va, *ml3p);
6194 *ml3p = NULL;
6195 return (KERN_PROTECTION_FAILURE);
6196 }
6197
6198 /*
6199 * If there are existing mappings, either abort or remove them.
6200 */
6201 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
6202 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6203 if (pmap_load(tl3p) != 0) {
6204 if (*ml3p != NULL)
6205 (*ml3p)->ref_count -= L3C_ENTRIES;
6206 return (KERN_FAILURE);
6207 }
6208 }
6209 } else {
6210 /*
6211 * Because we increment the L3 page's reference count above,
6212 * it is guaranteed not to be freed here and we can pass NULL
6213 * instead of a valid free list.
6214 */
6215 pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
6216 va + L3C_SIZE, NULL, lockp);
6217 }
6218
6219 /*
6220 * Enter on the PV list if part of our managed memory.
6221 */
6222 if ((l3e & ATTR_SW_MANAGED) != 0) {
6223 if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
6224 if (*ml3p != NULL) {
6225 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
6226 pmap_abort_ptp(pmap, va, *ml3p);
6227 *ml3p = NULL;
6228 }
6229 return (KERN_RESOURCE_SHORTAGE);
6230 }
6231 if ((l3e & ATTR_SW_DBM) != 0)
6232 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
6233 vm_page_aflag_set(mt, PGA_WRITEABLE);
6234 }
6235
6236 /*
6237 * Increment counters.
6238 */
6239 if ((l3e & ATTR_SW_WIRED) != 0)
6240 pmap->pm_stats.wired_count += L3C_ENTRIES;
6241 pmap_resident_count_inc(pmap, L3C_ENTRIES);
6242
6243 pa = VM_PAGE_TO_PHYS(m);
6244 KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));
6245
6246 /*
6247 * Sync the icache before the mapping is stored.
6248 */
6249 if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
6250 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
6251 cpu_icache_sync_range(PHYS_TO_DMAP(pa), L3C_SIZE);
6252
6253 /*
6254 * Map the superpage.
6255 */
6256 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6257 pmap_store(tl3p, l3e);
6258 l3e += L3_SIZE;
6259 }
6260 dsb(ishst);
6261
6262 counter_u64_add(pmap_l3c_mappings, 1);
6263 CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
6264 va, pmap);
6265 return (KERN_SUCCESS);
6266 }
6267
6268 /*
6269 * Maps a sequence of resident pages belonging to the same object.
6270 * The sequence begins with the given page m_start. This page is
6271 * mapped at the given virtual address start. Each subsequent page is
6272 * mapped at a virtual address that is offset from start by the same
6273 * amount as the page is offset from m_start within the object. The
6274 * last page in the sequence is the page with the largest offset from
6275 * m_start that can be mapped at a virtual address less than the given
6276 * virtual address end. Not every virtual page between start and end
6277 * is mapped; only those for which a resident page exists with the
6278 * corresponding offset from m_start are mapped.
6279 */
6280 void
pmap_enter_object(pmap_t pmap,vm_offset_t start,vm_offset_t end,vm_page_t m_start,vm_prot_t prot)6281 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
6282 vm_page_t m_start, vm_prot_t prot)
6283 {
6284 struct pctrie_iter pages;
6285 struct rwlock *lock;
6286 vm_offset_t va;
6287 vm_page_t m, mpte;
6288 int rv;
6289
6290 VM_OBJECT_ASSERT_LOCKED(m_start->object);
6291
6292 mpte = NULL;
6293 vm_page_iter_limit_init(&pages, m_start->object,
6294 m_start->pindex + atop(end - start));
6295 m = vm_radix_iter_lookup(&pages, m_start->pindex);
6296 lock = NULL;
6297 PMAP_LOCK(pmap);
6298 while (m != NULL) {
6299 va = start + ptoa(m->pindex - m_start->pindex);
6300 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
6301 m->psind == 2 && pmap_ps_enabled(pmap) &&
6302 ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
6303 KERN_SUCCESS || rv == KERN_NO_SPACE)) {
6304 m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE);
6305 } else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
6306 m->psind >= 1 && pmap_ps_enabled(pmap) &&
6307 ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
6308 &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) {
6309 m = vm_radix_iter_jump(&pages, L3C_ENTRIES);
6310 } else {
6311 /*
6312 * In general, if a superpage mapping were possible,
6313 * it would have been created above. That said, if
6314 * start and end are not superpage aligned, then
6315 * promotion might be possible at the ends of [start,
6316 * end). However, in practice, those promotion
6317 * attempts are so unlikely to succeed that they are
6318 * not worth trying.
6319 */
6320 mpte = pmap_enter_quick_locked(pmap, va, m, prot |
6321 VM_PROT_NO_PROMOTE, mpte, &lock);
6322 m = vm_radix_iter_step(&pages);
6323 }
6324 }
6325 if (lock != NULL)
6326 rw_wunlock(lock);
6327 PMAP_UNLOCK(pmap);
6328 }
6329
6330 /*
6331 * this code makes some *MAJOR* assumptions:
6332 * 1. Current pmap & pmap exists.
6333 * 2. Not wired.
6334 * 3. Read access.
6335 * 4. No page table pages.
6336 * but is *MUCH* faster than pmap_enter...
6337 */
6338
6339 void
pmap_enter_quick(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)6340 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
6341 {
6342 struct rwlock *lock;
6343
6344 lock = NULL;
6345 PMAP_LOCK(pmap);
6346 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
6347 if (lock != NULL)
6348 rw_wunlock(lock);
6349 PMAP_UNLOCK(pmap);
6350 }
6351
6352 static vm_page_t
pmap_enter_quick_locked(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,vm_page_t mpte,struct rwlock ** lockp)6353 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
6354 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
6355 {
6356 pt_entry_t *l1, *l2, *l3, l3_val;
6357 vm_paddr_t pa;
6358 int full_lvl, lvl;
6359
6360 KASSERT(!VA_IS_CLEANMAP(va) ||
6361 (m->oflags & VPO_UNMANAGED) != 0,
6362 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
6363 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6364 PMAP_ASSERT_STAGE1(pmap);
6365 KASSERT(ADDR_IS_CANONICAL(va),
6366 ("%s: Address not in canonical form: %lx", __func__, va));
6367 l2 = NULL;
6368
6369 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
6370 /*
6371 * In the case that a page table page is not
6372 * resident, we are creating it here.
6373 */
6374 if (ADDR_IS_USER(va)) {
6375 vm_pindex_t l2pindex;
6376
6377 /*
6378 * Calculate pagetable page index
6379 */
6380 l2pindex = pmap_l2_pindex(va);
6381 if (mpte && (mpte->pindex == l2pindex)) {
6382 mpte->ref_count++;
6383 } else {
6384 /*
6385 * If the page table page is mapped, we just increment
6386 * the hold count, and activate it. Otherwise, we
6387 * attempt to allocate a page table page, passing NULL
6388 * instead of the PV list lock pointer because we don't
6389 * intend to sleep. If this attempt fails, we don't
6390 * retry. Instead, we give up.
6391 */
6392 l1 = pmap_l1(pmap, va);
6393 if (l1 != NULL && pmap_load(l1) != 0) {
6394 if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
6395 L1_BLOCK)
6396 return (NULL);
6397 l2 = pmap_l1_to_l2(l1, va);
6398 if (pmap_load(l2) != 0) {
6399 if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
6400 L2_BLOCK)
6401 return (NULL);
6402 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
6403 mpte->ref_count++;
6404 } else {
6405 mpte = _pmap_alloc_l3(pmap, l2pindex,
6406 NULL);
6407 if (mpte == NULL)
6408 return (mpte);
6409 }
6410 } else {
6411 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
6412 if (mpte == NULL)
6413 return (mpte);
6414 }
6415 }
6416 l3 = VM_PAGE_TO_DMAP(mpte);
6417 l3 = &l3[pmap_l3_index(va)];
6418 } else {
6419 mpte = NULL;
6420 l2 = pmap_pde(kernel_pmap, va, &lvl);
6421 KASSERT(l2 != NULL,
6422 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
6423 va));
6424 KASSERT(lvl == 2,
6425 ("pmap_enter_quick_locked: Invalid level %d", lvl));
6426 l3 = pmap_l2_to_l3(l2, va);
6427 }
6428
6429 /*
6430 * Abort if a mapping already exists.
6431 */
6432 if (pmap_load(l3) != 0) {
6433 if (mpte != NULL)
6434 mpte->ref_count--;
6435 return (NULL);
6436 }
6437
6438 /*
6439 * Enter on the PV list if part of our managed memory.
6440 */
6441 if ((m->oflags & VPO_UNMANAGED) == 0 &&
6442 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
6443 if (mpte != NULL)
6444 pmap_abort_ptp(pmap, va, mpte);
6445 return (NULL);
6446 }
6447
6448 /*
6449 * Increment counters
6450 */
6451 pmap_resident_count_inc(pmap, 1);
6452
6453 pa = VM_PAGE_TO_PHYS(m);
6454 l3_val = PHYS_TO_PTE(pa) | pmap_sh_attr |
6455 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
6456 l3_val |= pmap_pte_bti(pmap, va);
6457 if ((prot & VM_PROT_EXECUTE) == 0 ||
6458 m->md.pv_memattr == VM_MEMATTR_DEVICE)
6459 l3_val |= ATTR_S1_XN;
6460 if (ADDR_IS_USER(va))
6461 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
6462 else
6463 l3_val |= ATTR_S1_UXN;
6464 if (pmap != kernel_pmap)
6465 l3_val |= ATTR_S1_nG;
6466
6467 /*
6468 * Now validate mapping with RO protection
6469 */
6470 if ((m->oflags & VPO_UNMANAGED) == 0)
6471 l3_val |= ATTR_SW_MANAGED;
6472 else
6473 l3_val |= ATTR_AF;
6474
6475 /* Sync icache before the mapping is stored to PTE */
6476 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
6477 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
6478 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
6479
6480 pmap_store(l3, l3_val);
6481 dsb(ishst);
6482
6483 #if VM_NRESERVLEVEL > 0
6484 /*
6485 * First, attempt L3C promotion, if the virtual and physical addresses
6486 * are aligned with each other and an underlying reservation has the
6487 * neighboring L3 pages allocated. The first condition is simply an
6488 * optimization that recognizes some eventual promotion failures early
6489 * at a lower run-time cost. Then, attempt L2 promotion, if both a
6490 * level 1 reservation and the PTP are fully populated.
6491 */
6492 if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
6493 (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
6494 (m->flags & PG_FICTITIOUS) == 0 &&
6495 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
6496 pmap_promote_l3c(pmap, l3, va) &&
6497 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) {
6498 if (l2 == NULL)
6499 l2 = pmap_l2(pmap, va);
6500
6501 /*
6502 * If promotion succeeds, then the next call to this function
6503 * should not be given the unmapped PTP as a hint.
6504 */
6505 if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
6506 mpte = NULL;
6507 }
6508 #endif
6509
6510 return (mpte);
6511 }
6512
6513 /*
6514 * This code maps large physical mmap regions into the
6515 * processor address space. Note that some shortcuts
6516 * are taken, but the code works.
6517 */
6518 void
pmap_object_init_pt(pmap_t pmap,vm_offset_t addr,vm_object_t object,vm_pindex_t pindex,vm_size_t size)6519 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6520 vm_pindex_t pindex, vm_size_t size)
6521 {
6522
6523 VM_OBJECT_ASSERT_WLOCKED(object);
6524 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6525 ("pmap_object_init_pt: non-device object"));
6526 }
6527
6528 /*
6529 * Clear the wired attribute from the mappings for the specified range of
6530 * addresses in the given pmap. Every valid mapping within that range
6531 * must have the wired attribute set. In contrast, invalid mappings
6532 * cannot have the wired attribute set, so they are ignored.
6533 *
6534 * The wired attribute of the page table entry is not a hardware feature,
6535 * so there is no need to invalidate any TLB entries.
6536 */
6537 void
pmap_unwire(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)6538 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6539 {
6540 vm_offset_t va_next;
6541 pd_entry_t *l0, *l1, *l2;
6542 pt_entry_t *l3;
6543 bool partial_l3c;
6544
6545 PMAP_LOCK(pmap);
6546 for (; sva < eva; sva = va_next) {
6547 l0 = pmap_l0(pmap, sva);
6548 if (pmap_load(l0) == 0) {
6549 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6550 if (va_next < sva)
6551 va_next = eva;
6552 continue;
6553 }
6554
6555 l1 = pmap_l0_to_l1(l0, sva);
6556 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6557 if (va_next < sva)
6558 va_next = eva;
6559 if (pmap_load(l1) == 0)
6560 continue;
6561
6562 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6563 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6564 KASSERT(va_next <= eva,
6565 ("partial update of non-transparent 1G page "
6566 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
6567 pmap_load(l1), sva, eva, va_next));
6568 MPASS(pmap != kernel_pmap);
6569 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
6570 ATTR_SW_WIRED)) == ATTR_SW_WIRED);
6571 pmap_clear_bits(l1, ATTR_SW_WIRED);
6572 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
6573 continue;
6574 }
6575
6576 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6577 if (va_next < sva)
6578 va_next = eva;
6579
6580 l2 = pmap_l1_to_l2(l1, sva);
6581 if (pmap_load(l2) == 0)
6582 continue;
6583
6584 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
6585 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
6586 panic("pmap_unwire: l2 %#jx is missing "
6587 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
6588
6589 /*
6590 * Are we unwiring the entire large page? If not,
6591 * demote the mapping and fall through.
6592 */
6593 if (sva + L2_SIZE == va_next && eva >= va_next) {
6594 pmap_clear_bits(l2, ATTR_SW_WIRED);
6595 pmap->pm_stats.wired_count -= L2_SIZE /
6596 PAGE_SIZE;
6597 continue;
6598 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
6599 panic("pmap_unwire: demotion failed");
6600 }
6601 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6602 ("pmap_unwire: Invalid l2 entry after demotion"));
6603
6604 if (va_next > eva)
6605 va_next = eva;
6606 for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
6607 sva != va_next; l3++, sva += L3_SIZE) {
6608 if (pmap_load(l3) == 0)
6609 continue;
6610 if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
6611 /*
6612 * Avoid demotion for whole-page unwiring.
6613 */
6614 if ((sva & L3C_OFFSET) == 0) {
6615 /*
6616 * Handle the possibility that
6617 * "va_next" is zero because of
6618 * address wraparound.
6619 */
6620 partial_l3c = sva + L3C_OFFSET >
6621 va_next - 1;
6622 }
6623 if (partial_l3c)
6624 (void)pmap_demote_l3c(pmap, l3, sva);
6625 }
6626 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
6627 panic("pmap_unwire: l3 %#jx is missing "
6628 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
6629
6630 /*
6631 * ATTR_SW_WIRED must be cleared atomically. Although
6632 * the pmap lock synchronizes access to ATTR_SW_WIRED,
6633 * the System MMU may write to the entry concurrently.
6634 */
6635 pmap_clear_bits(l3, ATTR_SW_WIRED);
6636 pmap->pm_stats.wired_count--;
6637 }
6638 }
6639 PMAP_UNLOCK(pmap);
6640 }
6641
6642 /*
6643 * This function requires that the caller has already added one to ml3's
6644 * ref_count in anticipation of creating a 4KB page mapping.
6645 */
6646 static bool
pmap_copy_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,pt_entry_t l3e,vm_page_t ml3,struct rwlock ** lockp)6647 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
6648 vm_page_t ml3, struct rwlock **lockp)
6649 {
6650 pt_entry_t *tl3p;
6651
6652 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6653 KASSERT((va & L3C_OFFSET) == 0,
6654 ("pmap_copy_l3c: va is not aligned"));
6655 KASSERT((l3e & ATTR_SW_MANAGED) != 0,
6656 ("pmap_copy_l3c: l3e is not managed"));
6657
6658 /*
6659 * Abort if a mapping already exists.
6660 */
6661 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
6662 if (pmap_load(tl3p) != 0) {
6663 if (ml3 != NULL)
6664 ml3->ref_count--;
6665 return (false);
6666 }
6667
6668 if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
6669 if (ml3 != NULL)
6670 pmap_abort_ptp(pmap, va, ml3);
6671 return (false);
6672 }
6673 ml3->ref_count += L3C_ENTRIES - 1;
6674
6675 /*
6676 * Clear the wired and accessed bits. However, leave the dirty bit
6677 * unchanged because read/write superpage mappings are required to be
6678 * dirty.
6679 */
6680 l3e &= ~(ATTR_SW_WIRED | ATTR_AF);
6681
6682 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6683 pmap_store(tl3p, l3e);
6684 l3e += L3_SIZE;
6685 }
6686 pmap_resident_count_inc(pmap, L3C_ENTRIES);
6687 counter_u64_add(pmap_l3c_mappings, 1);
6688 CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
6689 va, pmap);
6690 return (true);
6691 }
6692
6693 /*
6694 * Copy the range specified by src_addr/len
6695 * from the source map to the range dst_addr/len
6696 * in the destination map.
6697 *
6698 * This routine is only advisory and need not do anything.
6699 *
6700 * Because the executable mappings created by this routine are copied,
6701 * it should not have to flush the instruction cache.
6702 */
6703 void
pmap_copy(pmap_t dst_pmap,pmap_t src_pmap,vm_offset_t dst_addr,vm_size_t len,vm_offset_t src_addr)6704 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6705 vm_offset_t src_addr)
6706 {
6707 struct rwlock *lock;
6708 pd_entry_t *l0, *l1, *l2, srcptepaddr;
6709 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
6710 vm_offset_t addr, end_addr, va_next;
6711 vm_page_t dst_m, dstmpte, srcmpte;
6712
6713 PMAP_ASSERT_STAGE1(dst_pmap);
6714 PMAP_ASSERT_STAGE1(src_pmap);
6715
6716 if (dst_addr != src_addr)
6717 return;
6718 end_addr = src_addr + len;
6719 lock = NULL;
6720 if (dst_pmap < src_pmap) {
6721 PMAP_LOCK(dst_pmap);
6722 PMAP_LOCK(src_pmap);
6723 } else {
6724 PMAP_LOCK(src_pmap);
6725 PMAP_LOCK(dst_pmap);
6726 }
6727 for (addr = src_addr; addr < end_addr; addr = va_next) {
6728 l0 = pmap_l0(src_pmap, addr);
6729 if (pmap_load(l0) == 0) {
6730 va_next = (addr + L0_SIZE) & ~L0_OFFSET;
6731 if (va_next < addr)
6732 va_next = end_addr;
6733 continue;
6734 }
6735
6736 va_next = (addr + L1_SIZE) & ~L1_OFFSET;
6737 if (va_next < addr)
6738 va_next = end_addr;
6739 l1 = pmap_l0_to_l1(l0, addr);
6740 if (pmap_load(l1) == 0)
6741 continue;
6742 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6743 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6744 KASSERT(va_next <= end_addr,
6745 ("partial update of non-transparent 1G page "
6746 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6747 pmap_load(l1), addr, end_addr, va_next));
6748 srcptepaddr = pmap_load(l1);
6749 l1 = pmap_l1(dst_pmap, addr);
6750 if (l1 == NULL) {
6751 if (_pmap_alloc_l3(dst_pmap,
6752 pmap_l0_pindex(addr), NULL) == NULL)
6753 break;
6754 l1 = pmap_l1(dst_pmap, addr);
6755 } else {
6756 l0 = pmap_l0(dst_pmap, addr);
6757 dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
6758 dst_m->ref_count++;
6759 }
6760 KASSERT(pmap_load(l1) == 0,
6761 ("1G mapping present in dst pmap "
6762 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6763 pmap_load(l1), addr, end_addr, va_next));
6764 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
6765 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
6766 continue;
6767 }
6768
6769 va_next = (addr + L2_SIZE) & ~L2_OFFSET;
6770 if (va_next < addr)
6771 va_next = end_addr;
6772 l2 = pmap_l1_to_l2(l1, addr);
6773 srcptepaddr = pmap_load(l2);
6774 if (srcptepaddr == 0)
6775 continue;
6776 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
6777 /*
6778 * We can only virtual copy whole superpages.
6779 */
6780 if ((addr & L2_OFFSET) != 0 ||
6781 addr + L2_SIZE > end_addr)
6782 continue;
6783 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
6784 if (l2 == NULL)
6785 break;
6786 if (pmap_load(l2) == 0 &&
6787 ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
6788 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
6789 PMAP_ENTER_NORECLAIM, &lock))) {
6790 /*
6791 * We leave the dirty bit unchanged because
6792 * managed read/write superpage mappings are
6793 * required to be dirty. However, managed
6794 * superpage mappings are not required to
6795 * have their accessed bit set, so we clear
6796 * it because we don't know if this mapping
6797 * will be used.
6798 */
6799 srcptepaddr &= ~ATTR_SW_WIRED;
6800 if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
6801 srcptepaddr &= ~ATTR_AF;
6802 pmap_store(l2, srcptepaddr);
6803 pmap_resident_count_inc(dst_pmap, L2_SIZE /
6804 PAGE_SIZE);
6805 counter_u64_add(pmap_l2_mappings, 1);
6806 } else
6807 pmap_abort_ptp(dst_pmap, addr, dst_m);
6808 continue;
6809 }
6810 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
6811 ("pmap_copy: invalid L2 entry"));
6812 srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
6813 KASSERT(srcmpte->ref_count > 0,
6814 ("pmap_copy: source page table page is unused"));
6815 if (va_next > end_addr)
6816 va_next = end_addr;
6817 src_pte = PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
6818 src_pte = &src_pte[pmap_l3_index(addr)];
6819 dstmpte = NULL;
6820 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6821 ptetemp = pmap_load(src_pte);
6822
6823 /*
6824 * We only virtual copy managed pages.
6825 */
6826 if ((ptetemp & ATTR_SW_MANAGED) == 0)
6827 continue;
6828
6829 if (dstmpte != NULL) {
6830 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
6831 ("dstmpte pindex/addr mismatch"));
6832 dstmpte->ref_count++;
6833 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
6834 NULL)) == NULL)
6835 goto out;
6836 dst_pte = VM_PAGE_TO_DMAP(dstmpte);
6837 dst_pte = &dst_pte[pmap_l3_index(addr)];
6838 if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
6839 L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
6840 va_next - 1) {
6841 if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
6842 ptetemp, dstmpte, &lock))
6843 goto out;
6844 addr += L3C_SIZE - PAGE_SIZE;
6845 src_pte += L3C_ENTRIES - 1;
6846 } else if (pmap_load(dst_pte) == 0 &&
6847 pmap_try_insert_pv_entry(dst_pmap, addr,
6848 PTE_TO_VM_PAGE(ptetemp), &lock)) {
6849 /*
6850 * Clear the wired, contiguous, modified, and
6851 * accessed bits from the destination PTE.
6852 * The contiguous bit is cleared because we
6853 * are not copying the entire L3C superpage.
6854 */
6855 mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
6856 ATTR_AF;
6857 nbits = 0;
6858 if ((ptetemp & ATTR_SW_DBM) != 0)
6859 nbits |= ATTR_S1_AP_RW_BIT;
6860 pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
6861 pmap_resident_count_inc(dst_pmap, 1);
6862 } else {
6863 pmap_abort_ptp(dst_pmap, addr, dstmpte);
6864 goto out;
6865 }
6866 /* Have we copied all of the valid mappings? */
6867 if (dstmpte->ref_count >= srcmpte->ref_count)
6868 break;
6869 }
6870 }
6871 out:
6872 /*
6873 * XXX This barrier may not be needed because the destination pmap is
6874 * not active.
6875 */
6876 dsb(ishst);
6877
6878 if (lock != NULL)
6879 rw_wunlock(lock);
6880 PMAP_UNLOCK(src_pmap);
6881 PMAP_UNLOCK(dst_pmap);
6882 }
6883
6884 int
pmap_vmspace_copy(pmap_t dst_pmap,pmap_t src_pmap)6885 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6886 {
6887 int error;
6888
6889 if (dst_pmap->pm_stage != src_pmap->pm_stage)
6890 return (EINVAL);
6891
6892 if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6893 return (0);
6894
6895 for (;;) {
6896 if (dst_pmap < src_pmap) {
6897 PMAP_LOCK(dst_pmap);
6898 PMAP_LOCK(src_pmap);
6899 } else {
6900 PMAP_LOCK(src_pmap);
6901 PMAP_LOCK(dst_pmap);
6902 }
6903 error = pmap_bti_copy(dst_pmap, src_pmap);
6904 /* Clean up partial copy on failure due to no memory. */
6905 if (error == ENOMEM)
6906 pmap_bti_deassign_all(dst_pmap);
6907 PMAP_UNLOCK(src_pmap);
6908 PMAP_UNLOCK(dst_pmap);
6909 if (error != ENOMEM)
6910 break;
6911 vm_wait(NULL);
6912 }
6913 return (error);
6914 }
6915
6916 /*
6917 * pmap_zero_page zeros the specified hardware page by mapping
6918 * the page into KVM and using bzero to clear its contents.
6919 */
6920 void
pmap_zero_page(vm_page_t m)6921 pmap_zero_page(vm_page_t m)
6922 {
6923 void *va = VM_PAGE_TO_DMAP(m);
6924
6925 pagezero(va);
6926 m->md.pv_flags &= ~PV_MTE_TAGGED;
6927 }
6928
6929 /*
6930 * pmap_zero_page_area zeros the specified hardware page by mapping
6931 * the page into KVM and using bzero to clear its contents.
6932 *
6933 * off and size may not cover an area beyond a single hardware page.
6934 */
6935 void
pmap_zero_page_area(vm_page_t m,int off,int size)6936 pmap_zero_page_area(vm_page_t m, int off, int size)
6937 {
6938 void *va = VM_PAGE_TO_DMAP(m);
6939
6940 if (off == 0 && size == PAGE_SIZE)
6941 pagezero(va);
6942 else
6943 bzero((char *)va + off, size);
6944 }
6945
6946 /*
6947 * pmap_copy_page copies the specified (machine independent)
6948 * page by mapping the page into virtual memory and using
6949 * bcopy to copy the page, one machine dependent page at a
6950 * time.
6951 */
6952 void
pmap_copy_page(vm_page_t msrc,vm_page_t mdst)6953 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6954 {
6955 void *src = VM_PAGE_TO_DMAP(msrc);
6956 void *dst = VM_PAGE_TO_DMAP(mdst);
6957
6958 /*
6959 * On a page copy, check whether the src page is tagged. If it is,
6960 * we must copy the tags before copying the contents of the page.
6961 */
6962 if ((msrc->md.pv_flags & PV_MTE_TAGGED) != 0)
6963 mte_copy_tags(msrc, mdst, src, dst);
6964 else
6965 mdst->md.pv_flags &= ~PV_MTE_TAGGED;
6966
6967 pagecopy(src, dst);
6968 }
6969
/*
 * Global flag, initialized to 1.  NOTE(review): presumably advertises to
 * the rest of the kernel that unmapped I/O buffers may be used on this
 * platform -- confirm against the consumers of this variable.
 */
int unmapped_buf_allowed = 1;
6971
6972 void
pmap_copy_pages(vm_page_t ma[],vm_offset_t a_offset,vm_page_t mb[],vm_offset_t b_offset,int xfersize)6973 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6974 vm_offset_t b_offset, int xfersize)
6975 {
6976 void *a_cp, *b_cp;
6977 vm_page_t m_a, m_b;
6978 vm_paddr_t p_a, p_b;
6979 vm_offset_t a_pg_offset, b_pg_offset;
6980 int cnt;
6981
6982 while (xfersize > 0) {
6983 KASSERT(ADDR_IS_CANONICAL(a_offset),
6984 ("%s: Address not in canonical form: %lx", __func__, a_offset));
6985
6986 a_pg_offset = a_offset & PAGE_MASK;
6987 m_a = ma[a_offset >> PAGE_SHIFT];
6988 p_a = m_a->phys_addr;
6989 b_pg_offset = b_offset & PAGE_MASK;
6990 m_b = mb[b_offset >> PAGE_SHIFT];
6991 p_b = m_b->phys_addr;
6992 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6993 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6994 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6995 panic("!DMAP a %lx", p_a);
6996 } else {
6997 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6998 }
6999 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
7000 panic("!DMAP b %lx", p_b);
7001 } else {
7002 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
7003 }
7004 bcopy(a_cp, b_cp, cnt);
7005 a_offset += cnt;
7006 b_offset += cnt;
7007 xfersize -= cnt;
7008 }
7009 }
7010
7011 void *
pmap_quick_enter_page(vm_page_t m)7012 pmap_quick_enter_page(vm_page_t m)
7013 {
7014
7015 return (VM_PAGE_TO_DMAP(m));
7016 }
7017
/*
 * Counterpart of pmap_quick_enter_page().  Nothing is done here and the
 * address argument is ignored.
 */
void
pmap_quick_remove_page(void *addr)
{

	/* Intentionally empty. */
	(void)addr;
}
7022
7023 /*
7024 * Returns true if the pmap's pv is one of the first
7025 * 16 pvs linked to from this page. This count may
7026 * be changed upwards or downwards in the future; it
7027 * is only necessary that true be returned for a small
7028 * subset of pmaps for proper page aging.
7029 */
7030 bool
pmap_page_exists_quick(pmap_t pmap,vm_page_t m)7031 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
7032 {
7033 struct md_page *pvh;
7034 struct rwlock *lock;
7035 pv_entry_t pv;
7036 int loops = 0;
7037 bool rv;
7038
7039 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7040 ("pmap_page_exists_quick: page %p is not managed", m));
7041 rv = false;
7042 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7043 rw_rlock(lock);
7044 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7045 if (PV_PMAP(pv) == pmap) {
7046 rv = true;
7047 break;
7048 }
7049 loops++;
7050 if (loops >= 16)
7051 break;
7052 }
7053 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
7054 pvh = page_to_pvh(m);
7055 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
7056 if (PV_PMAP(pv) == pmap) {
7057 rv = true;
7058 break;
7059 }
7060 loops++;
7061 if (loops >= 16)
7062 break;
7063 }
7064 }
7065 rw_runlock(lock);
7066 return (rv);
7067 }
7068
/*
 *	pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
 *
 *	Unmanaged pages always yield 0.  Both the page's own pv list
 *	(level 3 entries) and, for non-fictitious pages, the pv list of the
 *	containing superpage (level 2 entries) are counted.
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
	struct rwlock *lock;
	struct md_page *pvh;
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;
	int count, md_gen, pvh_gen;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (0);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	/* The count is reset whenever the scan has to start over. */
	count = 0;
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock could not be taken without blocking.
			 * Record the list generation, drop the pv list lock,
			 * block on the pmap lock, and retake the pv list
			 * lock.  If the generation changed in the meantime,
			 * start over.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
			count++;
		PMAP_UNLOCK(pmap);
	}
	if ((m->flags & PG_FICTITIOUS) == 0) {
		pvh = page_to_pvh(m);
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			if (!PMAP_TRYLOCK(pmap)) {
				/*
				 * Same protocol as above, but the generation
				 * of both lists is checked.
				 */
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
			if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
				count++;
			PMAP_UNLOCK(pmap);
		}
	}
	rw_runlock(lock);
	return (count);
}
7133
7134 /*
7135 * Returns true if the given page is mapped individually or as part of
7136 * a 2mpage. Otherwise, returns false.
7137 */
7138 bool
pmap_page_is_mapped(vm_page_t m)7139 pmap_page_is_mapped(vm_page_t m)
7140 {
7141 struct rwlock *lock;
7142 bool rv;
7143
7144 if ((m->oflags & VPO_UNMANAGED) != 0)
7145 return (false);
7146 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7147 rw_rlock(lock);
7148 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
7149 ((m->flags & PG_FICTITIOUS) == 0 &&
7150 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
7151 rw_runlock(lock);
7152 return (rv);
7153 }
7154
/*
 * Destroy all managed, non-wired mappings in the given user-space
 * pmap.  This pmap cannot be active on any processor besides the
 * caller.
 *
 * This function cannot be applied to the kernel pmap.  Moreover, it
 * is not intended for general use.  It is only to be used during
 * process termination.  Consequently, it can be implemented in ways
 * that make it faster than pmap_remove().  First, it can more quickly
 * destroy mappings by iterating over the pmap's collection of PV
 * entries, rather than searching the page table.  Second, it doesn't
 * have to test and clear the page table entries atomically, because
 * no processor is currently accessing the user address space.  In
 * particular, a page table entry's dirty bit won't change state once
 * this function starts.
 *
 * PV chunks that end up with no wired entries left are collected per
 * memory domain and released in one batch; page table pages queued on
 * the local "free" list are released after the pmap lock is dropped.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;
	struct spglist free;
	struct pv_chunklist free_chunks[PMAP_MEMDOM];
	vm_page_t m, ml3, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct rwlock *lock;
	int64_t bit;
	uint64_t inuse, bitmask;
	int allfree, field, i, idx, lvl;
	int freed __pvused;
	vm_paddr_t pa;

	lock = NULL;

	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&free_chunks[i]);
	SLIST_INIT(&free);
	PMAP_LOCK(pmap);
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		/* Cleared below if any wired entry must stay in the chunk. */
		allfree = 1;
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/* A clear bit in pc_map marks an entry in use. */
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				/* Pick the lowest in-use entry of this word. */
				bit = ffsl(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pde = pmap_pde(pmap, pv->pv_va, &lvl);
				KASSERT(pde != NULL,
				    ("Attempting to remove an unmapped page"));

				/*
				 * lvl is the level of the directory entry
				 * found: 1 means the mapping is an L2 block,
				 * 2 means it is an L3 page.
				 */
				switch(lvl) {
				case 1:
					pte = pmap_l1_to_l2(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L2_BLOCK,
					    ("Attempting to remove an invalid "
					    "block: %lx", tpte));
					break;
				case 2:
					pte = pmap_l2_to_l3(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L3_PAGE,
					    ("Attempting to remove an invalid "
					    "page: %lx", tpte));
					break;
				default:
					panic(
					    "Invalid page directory level: %d",
					    lvl);
				}

				/*
				 * We cannot remove wired mappings at this time.
				 *
				 * For L3C superpages, all of the constituent PTEs
				 * should have the wired bit set, so we don't
				 * check for ATTR_CONTIGUOUS here.
				 */
				if (tpte & ATTR_SW_WIRED) {
					allfree = 0;
					continue;
				}

				/* Mark free */
				pc->pc_map[field] |= bitmask;

				/*
				 * Because this pmap is not active on other
				 * processors, the dirty bit cannot have
				 * changed state since we last loaded pte.
				 */
				pmap_clear(pte);

				pa = PTE_TO_PHYS(tpte);

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad pte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean/reference bits.
				 *
				 * We don't check for ATTR_CONTIGUOUS here
				 * because writeable L3C superpages are expected
				 * to be dirty, i.e., every constituent PTE
				 * should be dirty.
				 */
				if (pmap_pte_dirty(pmap, tpte)) {
					switch (lvl) {
					case 1:
						/* Dirty every page of the block. */
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
						break;
					case 2:
						vm_page_dirty(m);
						break;
					}
				}

				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				switch (lvl) {
				case 1:
					pmap_resident_count_dec(pmap,
					    L2_SIZE / PAGE_SIZE);
					pvh = page_to_pvh(m);
					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
					pvh->pv_gen++;
					/*
					 * Once the superpage has no mappings
					 * left, clear PGA_WRITEABLE on each
					 * constituent page that has no 4KB
					 * mappings either.
					 */
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					/*
					 * If a page table page was kept for
					 * this address, queue it for freeing.
					 */
					ml3 = pmap_remove_pt_page(pmap,
					    pv->pv_va);
					if (ml3 != NULL) {
						KASSERT(vm_page_any_valid(ml3),
						    ("pmap_remove_pages: l3 page not promoted"));
						pmap_resident_count_dec(pmap,1);
						KASSERT(ml3->ref_count == NL3PG,
						    ("pmap_remove_pages: l3 page ref count error"));
						ml3->ref_count = 0;
						pmap_add_delayed_free_list(ml3,
						    &free, false);
					}
					break;
				case 2:
					pmap_resident_count_dec(pmap, 1);
					TAILQ_REMOVE(&m->md.pv_list, pv,
					    pv_next);
					m->md.pv_gen++;
					/*
					 * Clear PGA_WRITEABLE only when neither
					 * the page nor its superpage has any
					 * mapping left.
					 */
					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = page_to_pvh(m);
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m,
							    PGA_WRITEABLE);
					}
					break;
				}
				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
				    &free);
				freed++;
			}
		}
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		if (allfree) {
			/* Move the now-empty chunk to its domain's free list. */
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
			    pc_list);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	pmap_invalidate_all(pmap);
	pmap_bti_deassign_all(pmap);
	free_pv_chunk_batch(free_chunks);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}
7354
/*
 * This is used to check if a page has been accessed or modified.
 *
 * Walks the page's 4KB mappings and then, for non-fictitious pages, the
 * 2MB mappings of the containing superpage.  Returns true as soon as one
 * mapping satisfies every requested condition:
 *   - "modified": the entry's ATTR_S1_AP_RW_BIT field equals
 *     ATTR_S1_AP(ATTR_S1_AP_RW);
 *   - "accessed": the entry has ATTR_AF set and is a valid descriptor of
 *     the expected kind (L3_PAGE or L2_BLOCK).
 * Returns false if no mapping matches.
 */
static bool
pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
{
	struct rwlock *lock;
	pv_entry_t pv;
	struct md_page *pvh;
	pt_entry_t l3e, mask, *pte, value;
	pmap_t pmap;
	int md_gen, pvh_gen;
	bool rv;

	rv = false;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the pv list lock before blocking on the pmap
			 * lock; restart the scan if the list generation
			 * changed while it was dropped.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		/* Build the bits to inspect and the value they must have. */
		mask = 0;
		value = 0;
		if (modified) {
			mask |= ATTR_S1_AP_RW_BIT;
			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
		}
		if (accessed) {
			mask |= ATTR_AF | ATTR_DESCR_MASK;
			value |= ATTR_AF | L3_PAGE;
		}
		l3e = pmap_load(pte);
		/*
		 * For an entry with ATTR_CONTIGUOUS set, use the value
		 * returned by pmap_load_l3c() instead of the single entry.
		 */
		if ((l3e & ATTR_CONTIGUOUS) != 0)
			l3e = pmap_load_l3c(pte);
		PMAP_UNLOCK(pmap);
		rv = (l3e & mask) == value;
		if (rv)
			goto out;
	}
	if ((m->flags & PG_FICTITIOUS) == 0) {
		pvh = page_to_pvh(m);
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			PMAP_ASSERT_STAGE1(pmap);
			if (!PMAP_TRYLOCK(pmap)) {
				/* As above, checking both generations. */
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
			mask = 0;
			value = 0;
			if (modified) {
				mask |= ATTR_S1_AP_RW_BIT;
				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
			}
			if (accessed) {
				mask |= ATTR_AF | ATTR_DESCR_MASK;
				value |= ATTR_AF | L2_BLOCK;
			}
			rv = (pmap_load(pte) & mask) == value;
			PMAP_UNLOCK(pmap);
			if (rv)
				goto out;
		}
	}
out:
	rw_runlock(lock);
	return (rv);
}
7443
7444 /*
7445 * pmap_is_modified:
7446 *
7447 * Return whether or not the specified physical page was modified
7448 * in any physical maps.
7449 */
7450 bool
pmap_is_modified(vm_page_t m)7451 pmap_is_modified(vm_page_t m)
7452 {
7453
7454 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7455 ("pmap_is_modified: page %p is not managed", m));
7456
7457 /*
7458 * If the page is not busied then this check is racy.
7459 */
7460 if (!pmap_page_is_write_mapped(m))
7461 return (false);
7462 return (pmap_page_test_mappings(m, false, true));
7463 }
7464
7465 /*
7466 * pmap_is_prefaultable:
7467 *
7468 * Return whether or not the specified virtual address is eligible
7469 * for prefault.
7470 */
7471 bool
pmap_is_prefaultable(pmap_t pmap,vm_offset_t addr)7472 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
7473 {
7474 pd_entry_t *pde;
7475 pt_entry_t *pte;
7476 bool rv;
7477 int lvl;
7478
7479 /*
7480 * Return true if and only if the L3 entry for the specified virtual
7481 * address is allocated but invalid.
7482 */
7483 rv = false;
7484 PMAP_LOCK(pmap);
7485 pde = pmap_pde(pmap, addr, &lvl);
7486 if (pde != NULL && lvl == 2) {
7487 pte = pmap_l2_to_l3(pde, addr);
7488 rv = pmap_load(pte) == 0;
7489 }
7490 PMAP_UNLOCK(pmap);
7491 return (rv);
7492 }
7493
7494 /*
7495 * pmap_is_referenced:
7496 *
7497 * Return whether or not the specified physical page was referenced
7498 * in any physical maps.
7499 */
7500 bool
pmap_is_referenced(vm_page_t m)7501 pmap_is_referenced(vm_page_t m)
7502 {
7503
7504 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7505 ("pmap_is_referenced: page %p is not managed", m));
7506 return (pmap_page_test_mappings(m, true, false));
7507 }
7508
/*
 * Clear the write and modified bits in each of the given page's mappings.
 *
 * The page must be managed and busied.  Nothing is done when the page is
 * not write mapped.  2MB mappings that have ATTR_SW_DBM set are demoted
 * first; each 4KB mapping with ATTR_SW_DBM set is then made read-only, and
 * the page is marked dirty if the old entry was writable.  Finally
 * PGA_WRITEABLE is cleared on the page.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pt_entry_t oldpte, *pte, set, clear, mask, val;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	/* Fictitious pages use the dummy superpage pv list. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/* Pass 1: demote writable 2MB mappings. */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the pv list lock before blocking on the pmap
			 * lock; retry from the top if the list changed.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	/* Pass 2: write protect the 4KB mappings. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		oldpte = pmap_load(pte);
		if ((oldpte & ATTR_SW_DBM) != 0) {
			if ((oldpte & ATTR_CONTIGUOUS) != 0) {
				(void)pmap_demote_l3c(pmap, pte, pv->pv_va);

				/*
				 * The L3 entry's accessed bit may have
				 * changed.
				 */
				oldpte = pmap_load(pte);
			}
			/*
			 * Select the bits to set and clear to make the entry
			 * read-only, and the mask/val pair that identifies an
			 * old entry that was writable.  These differ between
			 * stage 1 and stage 2 pmaps.
			 */
			if (pmap->pm_stage == PM_STAGE1) {
				set = ATTR_S1_AP_RW_BIT;
				clear = 0;
				mask = ATTR_S1_AP_RW_BIT;
				val = ATTR_S1_AP(ATTR_S1_AP_RW);
			} else {
				set = 0;
				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
			}
			clear |= ATTR_SW_DBM;
			/*
			 * Update the entry atomically; on failure oldpte is
			 * refreshed with the current value and the update is
			 * retried.
			 */
			while (!atomic_fcmpset_64(pte, &oldpte,
			    (oldpte | set) & ~clear))
				cpu_spinwait();

			if ((oldpte & mask) == val)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}
7606
/*
 *	pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 *
 *	The returned value is "cleared + not_cleared": mappings whose
 *	accessed bit was cleared plus mappings found accessed but left
 *	untouched.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, md_gen, not_cleared, pvh_gen;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	/*
	 * NOTE(review): nothing visible in this function adds pages to
	 * "free" before it is handed to vm_page_free_pages_toq() below.
	 */
	SLIST_INIT(&free);
	cleared = 0;
	/* Fictitious pages use the dummy superpage pv list. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
retry:
	/* "cleared" survives a retry; "not_cleared" is recounted. */
	not_cleared = 0;
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	/*
	 * Walk the 2MB mappings.  Each visited entry is rotated to the tail
	 * of the list, so the walk ends when the first entry visited (pvf)
	 * is back at the head.
	 */
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the pv list lock before blocking on the pmap
			 * lock; retry if the list changed meanwhile.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte)) {
			/*
			 * Although "tpte" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((tpte & ATTR_AF) != 0) {
			pa = VM_PAGE_TO_PHYS(m);

			/*
			 * Since this reference bit is shared by 512 4KB pages,
			 * it should not be cleared every time it is tested.
			 * Apply a simple "hash" function on the physical page
			 * number, the virtual superpage number, and the pmap
			 * address to select one 4KB page out of the 512 on
			 * which testing the reference bit will result in
			 * clearing that reference bit.  This function is
			 * designed to avoid the selection of the same 4KB page
			 * for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, va, true);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	/* Walk the 4KB mappings using the same rotation scheme. */
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		if ((tpte & ATTR_AF) != 0) {
			if ((tpte & ATTR_SW_WIRED) == 0) {
				/*
				 * Clear the accessed bit in this L3 entry
				 * regardless of the contiguous bit.
				 */
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, pv->pv_va, true);
				cleared++;
			} else
				not_cleared++;
		} else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
		    (pmap_load_l3c(pte) & ATTR_AF) != 0) {
			/*
			 * An L3C superpage mapping is regarded as accessed
			 * until the accessed bit has been cleared in all
			 * of its constituent entries.
			 */
			not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
	return (cleared + not_cleared);
}
7770
/*
 *	Apply the given advice to the specified range of addresses within the
 *	given pmap.  Depending on the advice, clear the referenced and/or
 *	modified flags in each mapping and set the mapped page's dirty field.
 *
 *	Only MADV_DONTNEED and MADV_FREE are acted upon; any other advice
 *	returns immediately.  The pmap must be a stage 1 pmap.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	struct rwlock *lock;
	vm_offset_t va, va_next, dva;
	vm_page_t m;
	pd_entry_t *l0, *l1, *l2, oldl2;
	pt_entry_t *l3, *dl3, oldl3;

	PMAP_ASSERT_STAGE1(pmap);

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		/*
		 * At each level, va_next is advanced to the next boundary of
		 * that level; if the addition wrapped around, it is clamped
		 * to eva.  An empty entry skips the whole region it covers.
		 */
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		/* L1 block mappings are skipped. */
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l2 = pmap_l1_to_l2(l1, sva);
		oldl2 = pmap_load(l2);
		if (oldl2 == 0)
			continue;
		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
			/* Unmanaged 2MB mappings are left alone. */
			if ((oldl2 & ATTR_SW_MANAGED) == 0)
				continue;
			lock = NULL;
			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
				if (lock != NULL)
					rw_wunlock(lock);

				/*
				 * The 2MB page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Choosing the last page
			 * within the address range [sva, min(va_next, eva))
			 * generally results in more repromotions.  Since the
			 * underlying page table page is fully populated, this
			 * removal never frees a page table page.
			 */
			if ((oldl2 & ATTR_SW_WIRED) == 0) {
				va = eva;
				if (va > va_next)
					va = va_next;
				va -= PAGE_SIZE;
				KASSERT(va >= sva,
				    ("pmap_advise: no address gap"));
				l3 = pmap_l2_to_l3(l2, va);
				KASSERT(pmap_load(l3) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
				    NULL, &lock);
			}
			if (lock != NULL)
				rw_wunlock(lock);
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_advise: invalid L2 entry after demotion"));
		if (va_next > eva)
			va_next = eva;
		/*
		 * "va" is the start of the current run of modified entries
		 * that still needs a TLB invalidation; va == va_next means
		 * that no run is pending.
		 */
		va = va_next;
		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
		    sva += L3_SIZE) {
			oldl3 = pmap_load(l3);
			/* Only valid, managed L3 pages are processed. */
			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
			    (ATTR_SW_MANAGED | L3_PAGE))
				goto maybe_invlrng;
			else if (pmap_pte_dirty(pmap, oldl3)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PTE_TO_VM_PAGE(oldl3);
					vm_page_dirty(m);
				}
				if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
					/*
					 * Unconditionally demote the L3C
					 * superpage because we do not allow
					 * writeable, clean superpages.
					 */
					(void)pmap_demote_l3c(pmap, l3, sva);

					/*
                                         * Destroy the final mapping before the
                                         * next L3C boundary or va_next,
					 * whichever comes first, so that a
					 * subsequent access may act as a
					 * repromotion trigger.
					 */
                                        if ((oldl3 & ATTR_SW_WIRED) == 0) {
						dva = MIN((sva & ~L3C_OFFSET) +
						    L3C_SIZE - PAGE_SIZE,
						    va_next - PAGE_SIZE);
						dl3 = pmap_l2_to_l3(l2, dva);
						KASSERT(pmap_load(dl3) != 0,
						    ("pmap_advise: invalid PTE"));
						lock = NULL;
						pmap_remove_l3(pmap, dl3, dva,
						    pmap_load(l2), NULL, &lock);
						if (lock != NULL)
							rw_wunlock(lock);
					}

					/*
					 * The L3 entry's accessed bit may have
					 * changed.
					 */
					oldl3 = pmap_load(l3);
				}

				/*
				 * Check that we did not just destroy this entry so
				 * we avoid corrupting the page table.
				 */
				if (oldl3 != 0) {
					/*
					 * Atomically clear the accessed bit
					 * and make the entry read-only; on
					 * failure oldl3 is refreshed and the
					 * update is retried.
					 */
					while (!atomic_fcmpset_long(l3, &oldl3,
					    (oldl3 & ~ATTR_AF) |
					    ATTR_S1_AP(ATTR_S1_AP_RO)))
						cpu_spinwait();
				}
			} else if ((oldl3 & ATTR_AF) != 0) {
				/*
				 * Clear the accessed bit in this L3 entry
				 * regardless of the contiguous bit.
				 */
				pmap_clear_bits(l3, ATTR_AF);
			} else
				goto maybe_invlrng;
			/* The entry changed: open a run if none is pending. */
			if (va == va_next)
				va = sva;
			continue;
maybe_invlrng:
			/* Unchanged entry: flush and close any pending run. */
			if (va != va_next) {
				pmap_s1_invalidate_range(pmap, va, sva, true);
				va = va_next;
			}
		}
		/* Flush a run that extends to the end of this L3 table. */
		if (va != va_next)
			pmap_s1_invalidate_range(pmap, va, sva, true);
	}
	PMAP_UNLOCK(pmap);
}
7946
7947 /*
7948 * Clear the modify bits on the specified physical page.
7949 */
7950 void
pmap_clear_modify(vm_page_t m)7951 pmap_clear_modify(vm_page_t m)
7952 {
7953 struct md_page *pvh;
7954 struct rwlock *lock;
7955 pmap_t pmap;
7956 pv_entry_t next_pv, pv;
7957 pd_entry_t *l2, oldl2;
7958 pt_entry_t *l3, oldl3;
7959 vm_offset_t va;
7960 int md_gen, pvh_gen;
7961
7962 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7963 ("pmap_clear_modify: page %p is not managed", m));
7964 vm_page_assert_busied(m);
7965
7966 if (!pmap_page_is_write_mapped(m))
7967 return;
7968 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7969 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7970 rw_wlock(lock);
7971 restart:
7972 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7973 pmap = PV_PMAP(pv);
7974 PMAP_ASSERT_STAGE1(pmap);
7975 if (!PMAP_TRYLOCK(pmap)) {
7976 pvh_gen = pvh->pv_gen;
7977 rw_wunlock(lock);
7978 PMAP_LOCK(pmap);
7979 rw_wlock(lock);
7980 if (pvh_gen != pvh->pv_gen) {
7981 PMAP_UNLOCK(pmap);
7982 goto restart;
7983 }
7984 }
7985 va = pv->pv_va;
7986 l2 = pmap_l2(pmap, va);
7987 oldl2 = pmap_load(l2);
7988 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7989 if ((oldl2 & ATTR_SW_DBM) != 0 &&
7990 pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7991 (oldl2 & ATTR_SW_WIRED) == 0) {
7992 /*
7993 * Write protect the mapping to a single page so that
7994 * a subsequent write access may repromote.
7995 */
7996 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7997 l3 = pmap_l2_to_l3(l2, va);
7998 oldl3 = pmap_load(l3);
7999 while (!atomic_fcmpset_long(l3, &oldl3,
8000 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
8001 cpu_spinwait();
8002 vm_page_dirty(m);
8003 pmap_s1_invalidate_page(pmap, va, true);
8004 }
8005 PMAP_UNLOCK(pmap);
8006 }
8007 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
8008 pmap = PV_PMAP(pv);
8009 PMAP_ASSERT_STAGE1(pmap);
8010 if (!PMAP_TRYLOCK(pmap)) {
8011 md_gen = m->md.pv_gen;
8012 pvh_gen = pvh->pv_gen;
8013 rw_wunlock(lock);
8014 PMAP_LOCK(pmap);
8015 rw_wlock(lock);
8016 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
8017 PMAP_UNLOCK(pmap);
8018 goto restart;
8019 }
8020 }
8021 l2 = pmap_l2(pmap, pv->pv_va);
8022 l3 = pmap_l2_to_l3(l2, pv->pv_va);
8023 oldl3 = pmap_load(l3);
8024 KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
8025 (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8026 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8027 ("writeable L3C superpage not dirty"));
8028 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
8029 if ((oldl3 & ATTR_CONTIGUOUS) != 0)
8030 (void)pmap_demote_l3c(pmap, l3, pv->pv_va);
8031 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
8032 pmap_s1_invalidate_page(pmap, pv->pv_va, true);
8033 }
8034 PMAP_UNLOCK(pmap);
8035 }
8036 rw_wunlock(lock);
8037 }
8038
8039 void *
pmap_mapbios(vm_paddr_t pa,vm_size_t size)8040 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
8041 {
8042 struct pmap_preinit_mapping *ppim;
8043 vm_offset_t va, offset;
8044 pd_entry_t old_l2e, *pde;
8045 pt_entry_t *l2;
8046 int i, lvl, l2_blocks, free_l2_count, start_idx;
8047
8048 /* Use the DMAP region if we can */
8049 if (PHYS_IN_DMAP(pa) && PHYS_IN_DMAP(pa + size - 1) &&
8050 pmap_kmapped_range(PHYS_TO_DMAP(pa), size))
8051 return (PHYS_TO_DMAP(pa));
8052
8053 if (!vm_initialized) {
8054 /*
8055 * No L3 ptables so map entire L2 blocks where start VA is:
8056 * preinit_map_va + start_idx * L2_SIZE
8057 * There may be duplicate mappings (multiple VA -> same PA) but
8058 * ARM64 dcache is always PIPT so that's acceptable.
8059 */
8060 if (size == 0)
8061 return (NULL);
8062
8063 /* Calculate how many L2 blocks are needed for the mapping */
8064 l2_blocks = (roundup2(pa + size, L2_SIZE) -
8065 rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
8066
8067 offset = pa & L2_OFFSET;
8068
8069 if (preinit_map_va == 0)
8070 return (NULL);
8071
8072 /* Map 2MiB L2 blocks from reserved VA space */
8073
8074 free_l2_count = 0;
8075 start_idx = -1;
8076 /* Find enough free contiguous VA space */
8077 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
8078 ppim = pmap_preinit_mapping + i;
8079 if (free_l2_count > 0 && ppim->pa != 0) {
8080 /* Not enough space here */
8081 free_l2_count = 0;
8082 start_idx = -1;
8083 continue;
8084 }
8085
8086 if (ppim->pa == 0) {
8087 /* Free L2 block */
8088 if (start_idx == -1)
8089 start_idx = i;
8090 free_l2_count++;
8091 if (free_l2_count == l2_blocks)
8092 break;
8093 }
8094 }
8095 if (free_l2_count != l2_blocks)
8096 panic("%s: too many preinit mappings", __func__);
8097
8098 va = preinit_map_va + (start_idx * L2_SIZE);
8099 for (i = start_idx; i < start_idx + l2_blocks; i++) {
8100 /* Mark entries as allocated */
8101 ppim = pmap_preinit_mapping + i;
8102 ppim->pa = pa;
8103 ppim->va = (char *)va + offset;
8104 ppim->size = size;
8105 }
8106
8107 /* Map L2 blocks */
8108 pa = rounddown2(pa, L2_SIZE);
8109 old_l2e = 0;
8110 for (i = 0; i < l2_blocks; i++) {
8111 pde = pmap_pde(kernel_pmap, va, &lvl);
8112 KASSERT(pde != NULL,
8113 ("pmap_mapbios: Invalid page entry, va: 0x%lx",
8114 va));
8115 KASSERT(lvl == 1,
8116 ("pmap_mapbios: Invalid level %d", lvl));
8117
8118 /* Insert L2_BLOCK */
8119 l2 = pmap_l1_to_l2(pde, va);
8120 old_l2e |= pmap_load_store(l2,
8121 PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
8122 ATTR_S1_XN | ATTR_KERN_GP |
8123 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
8124
8125 va += L2_SIZE;
8126 pa += L2_SIZE;
8127 }
8128 if ((old_l2e & ATTR_DESCR_VALID) != 0)
8129 pmap_s1_invalidate_all_kernel();
8130 else {
8131 /*
8132 * Because the old entries were invalid and the new
8133 * mappings are not executable, an isb is not required.
8134 */
8135 dsb(ishst);
8136 }
8137
8138 va = preinit_map_va + (start_idx * L2_SIZE);
8139
8140 } else {
8141 /* kva_alloc may be used to map the pages */
8142 offset = pa & PAGE_MASK;
8143 size = round_page(offset + size);
8144
8145 va = (vm_offset_t)kva_alloc(size);
8146 if (va == 0)
8147 panic("%s: Couldn't allocate KVA", __func__);
8148
8149 pde = pmap_pde(kernel_pmap, va, &lvl);
8150 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
8151
8152 /* L3 table is linked */
8153 va = trunc_page(va);
8154 pa = trunc_page(pa);
8155 pmap_kenter(va, size, pa, memory_mapping_mode(pa));
8156 }
8157
8158 return ((void *)(va + offset));
8159 }
8160
8161 void
pmap_unmapbios(void * p,vm_size_t size)8162 pmap_unmapbios(void *p, vm_size_t size)
8163 {
8164 struct pmap_preinit_mapping *ppim;
8165 char *va;
8166 vm_offset_t offset, va_trunc;
8167 pd_entry_t *pde;
8168 pt_entry_t *l2;
8169 int error __diagused, i, lvl, l2_blocks, block;
8170 bool preinit_map;
8171
8172 va = p;
8173 if (VIRT_IN_DMAP(va)) {
8174 KASSERT(VIRT_IN_DMAP(va + size - 1),
8175 ("%s: End address not in DMAP region: %p", __func__,
8176 va + size - 1));
8177 /* Ensure the attributes are as expected for the DMAP region */
8178 PMAP_LOCK(kernel_pmap);
8179 error = pmap_change_props_locked(va, size,
8180 PROT_READ | PROT_WRITE, VM_MEMATTR_DEFAULT, -1, false);
8181 PMAP_UNLOCK(kernel_pmap);
8182 KASSERT(error == 0, ("%s: Failed to reset DMAP attributes: %d",
8183 __func__, error));
8184
8185 return;
8186 }
8187
8188 l2_blocks =
8189 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
8190 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
8191
8192 /* Remove preinit mapping */
8193 preinit_map = false;
8194 block = 0;
8195 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
8196 ppim = pmap_preinit_mapping + i;
8197 if (ppim->va == va) {
8198 KASSERT(ppim->size == size,
8199 ("pmap_unmapbios: size mismatch"));
8200 ppim->va = NULL;
8201 ppim->pa = 0;
8202 ppim->size = 0;
8203 preinit_map = true;
8204 offset = block * L2_SIZE;
8205 va_trunc = rounddown2((vm_offset_t)va, L2_SIZE) +
8206 offset;
8207
8208 /* Remove L2_BLOCK */
8209 pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
8210 KASSERT(pde != NULL,
8211 ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
8212 va_trunc));
8213 l2 = pmap_l1_to_l2(pde, va_trunc);
8214 pmap_clear(l2);
8215
8216 if (block == (l2_blocks - 1))
8217 break;
8218 block++;
8219 }
8220 }
8221 if (preinit_map) {
8222 pmap_s1_invalidate_all_kernel();
8223 return;
8224 }
8225
8226 /* Unmap the pages reserved with kva_alloc. */
8227 if (vm_initialized) {
8228 offset = (vm_offset_t)va & PAGE_MASK;
8229 size = round_page(offset + size);
8230 va = trunc_page(va);
8231
8232 /* Unmap and invalidate the pages */
8233 pmap_kremove_device((vm_offset_t)va, size);
8234
8235 kva_free(va, size);
8236 }
8237 }
8238
8239 /*
8240 * Sets the memory attribute for the specified page.
8241 */
8242 void
pmap_page_set_memattr(vm_page_t m,vm_memattr_t ma)8243 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
8244 {
8245 if (m->md.pv_memattr == ma)
8246 return;
8247
8248 m->md.pv_memattr = ma;
8249
8250 /*
8251 * If "m" is a normal page, update its direct mapping. This update
8252 * can be relied upon to perform any cache operations that are
8253 * required for data coherence.
8254 */
8255 if ((m->flags & PG_FICTITIOUS) == 0 &&
8256 pmap_change_attr(VM_PAGE_TO_DMAP(m), PAGE_SIZE,
8257 m->md.pv_memattr) != 0)
8258 panic("memory attribute change on the direct map failed");
8259 }
8260
8261 /*
8262 * Changes the specified virtual address range's memory type to that given by
8263 * the parameter "mode". The specified virtual address range must be
8264 * completely contained within either the direct map or the kernel map. If
8265 * the virtual address range is contained within the kernel map, then the
8266 * memory type for each of the corresponding ranges of the direct map is also
8267 * changed. (The corresponding ranges of the direct map are those ranges that
8268 * map the same physical pages as the specified virtual address range.) These
8269 * changes to the direct map are necessary because Intel describes the
8270 * behavior of their processors as "undefined" if two or more mappings to the
8271 * same physical page have different memory types.
8272 *
8273 * Returns zero if the change completed successfully, and either EINVAL or
8274 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
8275 * of the virtual address range was not mapped, and ENOMEM is returned if
8276 * there was insufficient memory available to complete the change. In the
8277 * latter case, the memory type may have been changed on some part of the
8278 * virtual address range or the direct map.
8279 */
8280 int
pmap_change_attr(void * va,vm_size_t size,int mode)8281 pmap_change_attr(void *va, vm_size_t size, int mode)
8282 {
8283 int error;
8284
8285 PMAP_LOCK(kernel_pmap);
8286 error = pmap_change_props_locked(va, size, PROT_NONE, mode, -1, false);
8287 PMAP_UNLOCK(kernel_pmap);
8288 return (error);
8289 }
8290
8291 int
pmap_change_dmap_attr(int mode)8292 pmap_change_dmap_attr(int mode)
8293 {
8294 int error;
8295
8296 KASSERT(mode == VM_MEMATTR_WRITE_BACK ||
8297 mode == VM_MEMATTR_TAGGED,
8298 ("%s: mode %d must be compatible with write-back", __func__, mode));
8299
8300 PMAP_LOCK(kernel_pmap);
8301 error = pmap_change_props_locked((void *)DMAP_MIN_ADDRESS,
8302 dmap_max_addr - DMAP_MIN_ADDRESS, PROT_NONE, mode, dmap_attr, true);
8303 if (error == 0)
8304 dmap_attr = mode;
8305 PMAP_UNLOCK(kernel_pmap);
8306 return (error);
8307 }
8308
8309 /*
8310 * Changes the specified virtual address range's protections to those
8311 * specified by "prot". Like pmap_change_attr(), protections for aliases
8312 * in the direct map are updated as well. Protections on aliasing mappings may
8313 * be a subset of the requested protections; for example, mappings in the direct
8314 * map are never executable.
8315 */
8316 int
pmap_change_prot(void * va,vm_size_t size,vm_prot_t prot)8317 pmap_change_prot(void *va, vm_size_t size, vm_prot_t prot)
8318 {
8319 int error;
8320
8321 /* Only supported within the kernel map. */
8322 if ((vm_offset_t)va < VM_MIN_KERNEL_ADDRESS)
8323 return (EINVAL);
8324
8325 PMAP_LOCK(kernel_pmap);
8326 error = pmap_change_props_locked(va, size, prot, -1, -1, false);
8327 PMAP_UNLOCK(kernel_pmap);
8328 return (error);
8329 }
8330
8331 static int
pmap_change_props_locked(void * addr,vm_size_t size,vm_prot_t prot,int mode,int old_mode,bool skip_unmapped)8332 pmap_change_props_locked(void *addr, vm_size_t size, vm_prot_t prot,
8333 int mode, int old_mode, bool skip_unmapped)
8334 {
8335 vm_offset_t base, offset, tmpva, va;
8336 vm_size_t pte_size;
8337 vm_paddr_t pa;
8338 pt_entry_t pte, *ptep, *newpte;
8339 pt_entry_t bits, mask, old_mode_bits, old_mode_mask;
8340 char *tmpptep;
8341 int lvl, rv;
8342
8343 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
8344 va = (vm_offset_t)addr;
8345 base = trunc_page(va);
8346 offset = va & PAGE_MASK;
8347 size = round_page(offset + size);
8348
8349 if (!VIRT_IN_DMAP(base) &&
8350 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
8351 return (EINVAL);
8352
8353 bits = old_mode_bits = 0;
8354 mask = old_mode_mask = 0;
8355 if (mode != -1) {
8356 bits = ATTR_S1_IDX(mode);
8357 mask = ATTR_S1_IDX_MASK;
8358 if (mode == VM_MEMATTR_DEVICE) {
8359 mask |= ATTR_S1_XN;
8360 bits |= ATTR_S1_XN;
8361 }
8362 }
8363 if (old_mode != -1) {
8364 old_mode_bits = ATTR_S1_IDX(old_mode);
8365 old_mode_mask = ATTR_S1_IDX_MASK;
8366 }
8367 if (prot != VM_PROT_NONE) {
8368 /* Don't mark the DMAP as executable. It never is on arm64. */
8369 if (VIRT_IN_DMAP(base)) {
8370 prot &= ~VM_PROT_EXECUTE;
8371 /*
8372 * XXX Mark the DMAP as writable for now. We rely
8373 * on this in ddb & dtrace to insert breakpoint
8374 * instructions.
8375 */
8376 prot |= VM_PROT_WRITE;
8377 }
8378
8379 if ((prot & VM_PROT_WRITE) == 0) {
8380 bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
8381 }
8382 if ((prot & VM_PROT_EXECUTE) == 0) {
8383 bits |= ATTR_S1_PXN;
8384 }
8385 bits |= ATTR_S1_UXN;
8386 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
8387 }
8388
8389 for (tmpva = base; tmpva < base + size; ) {
8390 ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
8391 if (ptep == NULL && !skip_unmapped) {
8392 return (EINVAL);
8393 } else if ((ptep == NULL && skip_unmapped) ||
8394 (pmap_load(ptep) & mask) == bits ||
8395 (pmap_load(ptep) & old_mode_mask) != old_mode_bits) {
8396 /*
8397 * We already have one of the following meaning
8398 * we can skip this memory region::
8399 * - No memory mapped at this address
8400 * - The new attributes are already set
8401 * - The expected attributes are incorrect
8402 */
8403 switch (lvl) {
8404 default:
8405 panic("Invalid DMAP table level: %d\n", lvl);
8406 case 1:
8407 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
8408 break;
8409 case 2:
8410 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
8411 break;
8412 case 3:
8413 tmpva += PAGE_SIZE;
8414 break;
8415 }
8416 } else {
8417 /* We can't demote/promote this entry */
8418 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
8419
8420 /*
8421 * Find the entry and demote it if the requested change
8422 * only applies to part of the address range mapped by
8423 * the entry.
8424 */
8425 switch (lvl) {
8426 default:
8427 panic("Invalid DMAP table level: %d\n", lvl);
8428 case 1:
8429 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8430 if ((tmpva & L1_OFFSET) == 0 &&
8431 (base + size - tmpva) >= L1_SIZE) {
8432 pte_size = L1_SIZE;
8433 break;
8434 }
8435 newpte = pmap_demote_l1(kernel_pmap, ptep,
8436 tmpva & ~L1_OFFSET);
8437 if (newpte == NULL)
8438 return (EINVAL);
8439 ptep = pmap_l1_to_l2(ptep, tmpva);
8440 /* FALLTHROUGH */
8441 case 2:
8442 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
8443 if ((tmpva & L2C_OFFSET) == 0 &&
8444 (base + size - tmpva) >= L2C_SIZE) {
8445 pte_size = L2C_SIZE;
8446 break;
8447 }
8448 if (!pmap_demote_l2c(kernel_pmap, ptep,
8449 tmpva))
8450 return (EINVAL);
8451 }
8452 if ((tmpva & L2_OFFSET) == 0 &&
8453 (base + size - tmpva) >= L2_SIZE) {
8454 pte_size = L2_SIZE;
8455 break;
8456 }
8457 newpte = pmap_demote_l2(kernel_pmap, ptep,
8458 tmpva);
8459 if (newpte == NULL)
8460 return (EINVAL);
8461 ptep = pmap_l2_to_l3(ptep, tmpva);
8462 /* FALLTHROUGH */
8463 case 3:
8464 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
8465 if ((tmpva & L3C_OFFSET) == 0 &&
8466 (base + size - tmpva) >= L3C_SIZE) {
8467 pte_size = L3C_SIZE;
8468 break;
8469 }
8470 if (!pmap_demote_l3c(kernel_pmap, ptep,
8471 tmpva))
8472 return (EINVAL);
8473 }
8474 pte_size = PAGE_SIZE;
8475 break;
8476 }
8477
8478 tmpptep = 0;
8479 if (tmpva <= (vm_offset_t)ptep &&
8480 tmpva + pte_size > (vm_offset_t)ptep) {
8481 vm_paddr_t pte_pa;
8482
8483 mtx_lock(&cmap_lock);
8484 tmpptep = cmap1_addr;
8485 pte_pa = DMAP_TO_PHYS((vm_offset_t)ptep);
8486 pmap_store(cmap1_pte, ATTR_AF |
8487 pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) |
8488 ATTR_S1_XN | ATTR_KERN_GP |
8489 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
8490 PHYS_TO_PTE(pte_pa &~L3_OFFSET) | L3_PAGE);
8491 dsb(ishst);
8492 ptep = (pt_entry_t *)(tmpptep +
8493 ((vm_offset_t)ptep & PAGE_MASK));
8494 }
8495
8496 /* Update the entry */
8497 pte = pmap_load(ptep);
8498 pte &= ~mask;
8499 pte |= bits;
8500
8501 switch (pte_size) {
8502 case L2C_SIZE:
8503 pmap_update_strided(kernel_pmap, ptep, ptep +
8504 L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
8505 break;
8506 case L3C_SIZE:
8507 pmap_update_strided(kernel_pmap, ptep, ptep +
8508 L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
8509 break;
8510 default:
8511 /*
8512 * We are updating a single block or page entry,
8513 * so regardless of pte_size pass PAGE_SIZE in
8514 * order that a single TLB invalidation is
8515 * performed.
8516 */
8517 pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
8518 PAGE_SIZE);
8519 break;
8520 }
8521
8522 if (tmpptep != 0) {
8523 pmap_clear(cmap1_pte);
8524 pmap_s1_invalidate_page(kernel_pmap,
8525 (vm_offset_t)tmpptep, true);
8526 mtx_unlock(&cmap_lock);
8527 }
8528
8529 pa = PTE_TO_PHYS(pte);
8530 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
8531 int dmap_mode;
8532
8533 /*
8534 * When booting on HW with MTE enabled we may
8535 * need to swap to a tagged type for the DMAP
8536 * to allow tags to be set through it.
8537 */
8538 if (mode == VM_MEMATTR_WRITE_BACK)
8539 dmap_mode = dmap_attr;
8540 else
8541 dmap_mode = mode;
8542
8543 /*
8544 * Keep the DMAP memory in sync.
8545 */
8546 rv = pmap_change_props_locked(
8547 PHYS_TO_DMAP(pa), pte_size,
8548 prot, dmap_mode, old_mode, true);
8549 if (rv != 0)
8550 return (rv);
8551 }
8552
8553 /*
8554 * If moving to a non-cacheable entry flush
8555 * the cache.
8556 */
8557 if (mode == VM_MEMATTR_UNCACHEABLE)
8558 cpu_dcache_wbinv_range((void *)tmpva, pte_size);
8559 tmpva += pte_size;
8560 }
8561 }
8562
8563 return (0);
8564 }
8565
8566 /*
8567 * Create an L2 table to map all addresses within an L1 mapping.
8568 */
8569 static pt_entry_t *
pmap_demote_l1(pmap_t pmap,pt_entry_t * l1,vm_offset_t va)8570 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
8571 {
8572 pt_entry_t *l2, newl2, oldl1;
8573 char *tmpl1;
8574 vm_paddr_t l2phys, phys;
8575 vm_page_t ml2;
8576 int i;
8577
8578 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8579 oldl1 = pmap_load(l1);
8580 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8581 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
8582 ("pmap_demote_l1: Demoting a non-block entry"));
8583 KASSERT((va & L1_OFFSET) == 0,
8584 ("pmap_demote_l1: Invalid virtual address %#lx", va));
8585 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
8586 ("pmap_demote_l1: Level 1 table shouldn't be managed"));
8587 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
8588 ("pmap_demote_l1: Demoting entry with no-demote flag set"));
8589
8590 tmpl1 = NULL;
8591 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
8592 tmpl1 = kva_alloc(PAGE_SIZE);
8593 if (tmpl1 == NULL)
8594 return (NULL);
8595 }
8596
8597 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
8598 NULL) {
8599 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
8600 " in pmap %p", va, pmap);
8601 l2 = NULL;
8602 goto fail;
8603 }
8604
8605 l2phys = VM_PAGE_TO_PHYS(ml2);
8606 l2 = PHYS_TO_DMAP(l2phys);
8607
8608 /* Address the range points at */
8609 phys = PTE_TO_PHYS(oldl1);
8610 /* The attributed from the old l1 table to be copied */
8611 newl2 = oldl1 & ATTR_MASK;
8612
8613 /* Create the new entries */
8614 newl2 |= ATTR_CONTIGUOUS;
8615 for (i = 0; i < Ln_ENTRIES; i++) {
8616 l2[i] = newl2 | phys;
8617 phys += L2_SIZE;
8618 }
8619 KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) |
8620 L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0],
8621 ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
8622
8623 if (tmpl1 != NULL) {
8624 pmap_kenter((vm_offset_t)tmpl1, PAGE_SIZE,
8625 DMAP_TO_PHYS(l1) & ~L3_OFFSET,
8626 VM_MEMATTR_WRITE_BACK);
8627 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
8628 }
8629
8630 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
8631
8632 counter_u64_add(pmap_l1_demotions, 1);
8633 fail:
8634 if (tmpl1 != NULL) {
8635 pmap_kremove((vm_offset_t)tmpl1);
8636 kva_free(tmpl1, PAGE_SIZE);
8637 }
8638
8639 return (l2);
8640 }
8641
8642 static void
pmap_fill_l3(pt_entry_t * firstl3,pt_entry_t newl3)8643 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8644 {
8645 pt_entry_t *l3;
8646
8647 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8648 *l3 = newl3;
8649 newl3 += L3_SIZE;
8650 }
8651 }
8652
8653 static void
pmap_demote_l2_check(pt_entry_t * firstl3p __unused,pt_entry_t newl3e __unused)8654 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8655 {
8656 #ifdef INVARIANTS
8657 #ifdef DIAGNOSTIC
8658 pt_entry_t *xl3p, *yl3p;
8659
8660 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8661 xl3p++, newl3e += PAGE_SIZE) {
8662 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8663 printf("pmap_demote_l2: xl3e %zd and newl3e map "
8664 "different pages: found %#lx, expected %#lx\n",
8665 xl3p - firstl3p, pmap_load(xl3p), newl3e);
8666 printf("page table dump\n");
8667 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8668 yl3p++) {
8669 printf("%zd %#lx\n", yl3p - firstl3p,
8670 pmap_load(yl3p));
8671 }
8672 panic("firstpte");
8673 }
8674 }
8675 #else
8676 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8677 ("pmap_demote_l2: firstl3 and newl3e map different physical"
8678 " addresses"));
8679 #endif
8680 #endif
8681 }
8682
8683 static void
pmap_demote_l2_abort(pmap_t pmap,vm_offset_t va,pt_entry_t * l2,struct rwlock ** lockp)8684 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8685 struct rwlock **lockp)
8686 {
8687 struct spglist free;
8688
8689 SLIST_INIT(&free);
8690 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true,
8691 &free, lockp);
8692 vm_page_free_pages_toq(&free, true);
8693 }
8694
8695 /*
8696 * Create an L3 table to map all addresses within an L2 mapping.
8697 */
8698 static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap,pt_entry_t * l2,vm_offset_t va,struct rwlock ** lockp)8699 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8700 struct rwlock **lockp)
8701 {
8702 pt_entry_t *l3, newl3, oldl2;
8703 char *tmpl2;
8704 vm_paddr_t l3phys;
8705 vm_page_t ml3;
8706
8707 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8708 PMAP_ASSERT_STAGE1(pmap);
8709 KASSERT(ADDR_IS_CANONICAL(va),
8710 ("%s: Address not in canonical form: %lx", __func__, va));
8711
8712 l3 = NULL;
8713 oldl2 = pmap_load(l2);
8714 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8715 ("pmap_demote_l2: Demoting a non-block entry"));
8716 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8717 ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8718 va &= ~L2_OFFSET;
8719
8720 tmpl2 = NULL;
8721 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8722 tmpl2 = kva_alloc(PAGE_SIZE);
8723 if (tmpl2 == NULL)
8724 return (NULL);
8725 }
8726
8727 /*
8728 * Invalidate the 2MB page mapping and return "failure" if the
8729 * mapping was never accessed and not wired.
8730 */
8731 if ((oldl2 & ATTR_AF) == 0) {
8732 if ((oldl2 & ATTR_SW_WIRED) == 0) {
8733 pmap_demote_l2_abort(pmap, va, l2, lockp);
8734 CTR2(KTR_PMAP,
8735 "pmap_demote_l2: failure for va %#lx in pmap %p",
8736 va, pmap);
8737 goto fail;
8738 }
8739 ml3 = pmap_remove_pt_page(pmap, va);
8740 /* Fill the PTP with L3Es that have ATTR_AF cleared. */
8741 ml3->valid = 0;
8742 } else if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8743 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8744 ("pmap_demote_l2: page table page for a wired mapping"
8745 " is missing"));
8746
8747 /*
8748 * If the page table page is missing and the mapping
8749 * is for a kernel address, the mapping must belong to
8750 * either the direct map or the early kernel memory.
8751 * Page table pages are preallocated for every other
8752 * part of the kernel address space, so the direct map
8753 * region and early kernel memory are the only parts of the
8754 * kernel address space that must be handled here.
8755 */
8756 KASSERT(ADDR_IS_USER(va) || VIRT_IN_DMAP(va) ||
8757 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8758 ("pmap_demote_l2: No saved mpte for va %#lx", va));
8759
8760 /*
8761 * If the 2MB page mapping belongs to the direct map
8762 * region of the kernel's address space, then the page
8763 * allocation request specifies the highest possible
8764 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
8765 * priority is normal.
8766 */
8767 ml3 = vm_page_alloc_noobj(
8768 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8769 VM_ALLOC_WIRED);
8770
8771 /*
8772 * If the allocation of the new page table page fails,
8773 * invalidate the 2MB page mapping and return "failure".
8774 */
8775 if (ml3 == NULL) {
8776 pmap_demote_l2_abort(pmap, va, l2, lockp);
8777 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8778 " in pmap %p", va, pmap);
8779 goto fail;
8780 }
8781 ml3->pindex = pmap_l2_pindex(va);
8782
8783 if (ADDR_IS_USER(va)) {
8784 ml3->ref_count = NL3PG;
8785 pmap_resident_count_inc(pmap, 1);
8786 }
8787 }
8788 l3phys = VM_PAGE_TO_PHYS(ml3);
8789 l3 = PHYS_TO_DMAP(l3phys);
8790 newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8791 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8792 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8793 ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8794
8795 /*
8796 * If the PTP is not leftover from an earlier promotion or it does not
8797 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all
8798 * have ATTR_AF set, unless this is a wired mapping with ATTR_AF clear.
8799 *
8800 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8801 * performs a dsb(). That dsb() ensures that the stores for filling
8802 * "l3" are visible before "l3" is added to the page table.
8803 */
8804 if (!vm_page_all_valid(ml3))
8805 pmap_fill_l3(l3, newl3);
8806
8807 pmap_demote_l2_check(l3, newl3);
8808
8809 /*
8810 * If the mapping has changed attributes, update the L3Es.
8811 */
8812 if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8813 pmap_fill_l3(l3, newl3);
8814
8815 /*
8816 * Map the temporary page so we don't lose access to the l2 table.
8817 */
8818 if (tmpl2 != NULL) {
8819 pmap_kenter((vm_offset_t)tmpl2, PAGE_SIZE,
8820 DMAP_TO_PHYS(l2) & ~L3_OFFSET,
8821 VM_MEMATTR_WRITE_BACK);
8822 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8823 }
8824
8825 /*
8826 * The spare PV entries must be reserved prior to demoting the
8827 * mapping, that is, prior to changing the PDE. Otherwise, the state
8828 * of the L2 and the PV lists will be inconsistent, which can result
8829 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8830 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8831 * PV entry for the 2MB page mapping that is being demoted.
8832 */
8833 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8834 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8835
8836 /*
8837 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8838 * the 2MB page mapping.
8839 */
8840 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8841
8842 /*
8843 * Demote the PV entry.
8844 */
8845 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8846 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8847
8848 counter_u64_add(pmap_l2_demotions, 1);
8849 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8850 " in pmap %p %lx", va, pmap, l3[0]);
8851
8852 fail:
8853 if (tmpl2 != NULL) {
8854 pmap_kremove((vm_offset_t)tmpl2);
8855 kva_free(tmpl2, PAGE_SIZE);
8856 }
8857
8858 return (l3);
8859
8860 }
8861
8862 static pt_entry_t *
pmap_demote_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)8863 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8864 {
8865 struct rwlock *lock;
8866 pt_entry_t *l3;
8867
8868 lock = NULL;
8869 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8870 if (lock != NULL)
8871 rw_wunlock(lock);
8872 return (l3);
8873 }
8874
8875 /*
8876 * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
8877 */
8878 static bool
pmap_demote_l2c(pmap_t pmap,pt_entry_t * l2p,vm_offset_t va)8879 pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
8880 {
8881 pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
8882 char *tmpl3;
8883 register_t intr;
8884
8885 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8886 PMAP_ASSERT_STAGE1(pmap);
8887 l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
8888 sizeof(pd_entry_t)) - 1));
8889 l2c_end = l2c_start + L2C_ENTRIES;
8890 tmpl3 = NULL;
8891 if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
8892 (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
8893 tmpl3 = kva_alloc(PAGE_SIZE);
8894 if (tmpl3 == NULL)
8895 return (false);
8896 pmap_kenter((vm_offset_t)tmpl3, PAGE_SIZE,
8897 DMAP_TO_PHYS(l2c_start) & ~L3_OFFSET,
8898 VM_MEMATTR_WRITE_BACK);
8899 l2c_start = (pd_entry_t *)(tmpl3 +
8900 ((vm_offset_t)l2c_start & PAGE_MASK));
8901 l2c_end = (pd_entry_t *)(tmpl3 +
8902 ((vm_offset_t)l2c_end & PAGE_MASK));
8903 }
8904 mask = 0;
8905 nbits = ATTR_DESCR_VALID;
8906 intr = intr_disable();
8907
8908 /*
8909 * Break the mappings.
8910 */
8911 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8912 /*
8913 * Clear the mapping's contiguous and valid bits, but leave
8914 * the rest of the entry unchanged, so that a lockless,
8915 * concurrent pmap_kextract() can still lookup the physical
8916 * address.
8917 */
8918 l2e = pmap_load(tl2p);
8919 KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
8920 ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
8921 KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8922 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8923 ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
8924 while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
8925 ATTR_DESCR_VALID)))
8926 cpu_spinwait();
8927
8928 /*
8929 * Hardware accessed and dirty bit maintenance might only
8930 * update a single L2 entry, so we must combine the accessed
8931 * and dirty bits from this entire set of contiguous L2
8932 * entries.
8933 */
8934 if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8935 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8936 mask = ATTR_S1_AP_RW_BIT;
8937 nbits |= l2e & ATTR_AF;
8938 }
8939 if ((nbits & ATTR_AF) != 0) {
8940 pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
8941 L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
8942 }
8943
8944 /*
8945 * Remake the mappings, updating the accessed and dirty bits.
8946 */
8947 l2e = (pmap_load(l2c_start) & ~mask) | nbits;
8948 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8949 pmap_store(tl2p, l2e);
8950 l2e += L2_SIZE;
8951 }
8952 dsb(ishst);
8953
8954 intr_restore(intr);
8955 if (tmpl3 != NULL) {
8956 pmap_kremove((vm_offset_t)tmpl3);
8957 kva_free(tmpl3, PAGE_SIZE);
8958 }
8959 counter_u64_add(pmap_l2c_demotions, 1);
8960 CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
8961 va, pmap);
8962 return (true);
8963 }
8964
8965 /*
8966 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8967 */
8968 static bool
pmap_demote_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va)8969 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8970 {
8971 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8972 char *tmpl3;
8973 register_t intr;
8974
8975 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8976 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8977 sizeof(pt_entry_t)) - 1));
8978 l3c_end = l3c_start + L3C_ENTRIES;
8979 tmpl3 = NULL;
8980 if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8981 (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8982 tmpl3 = kva_alloc(PAGE_SIZE);
8983 if (tmpl3 == NULL)
8984 return (false);
8985 pmap_kenter((vm_offset_t)tmpl3, PAGE_SIZE,
8986 DMAP_TO_PHYS(l3c_start) & ~L3_OFFSET,
8987 VM_MEMATTR_WRITE_BACK);
8988 l3c_start = (pt_entry_t *)(tmpl3 +
8989 ((vm_offset_t)l3c_start & PAGE_MASK));
8990 l3c_end = (pt_entry_t *)(tmpl3 +
8991 ((vm_offset_t)l3c_end & PAGE_MASK));
8992 }
8993 mask = 0;
8994 nbits = ATTR_DESCR_VALID;
8995 intr = intr_disable();
8996
8997 /*
8998 * Break the mappings.
8999 */
9000 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
9001 /*
9002 * Clear the mapping's contiguous and valid bits, but leave
9003 * the rest of the entry unchanged, so that a lockless,
9004 * concurrent pmap_kextract() can still lookup the physical
9005 * address.
9006 */
9007 l3e = pmap_load(tl3p);
9008 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
9009 ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
9010 KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
9011 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
9012 ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
9013 while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
9014 ATTR_DESCR_VALID)))
9015 cpu_spinwait();
9016
9017 /*
9018 * Hardware accessed and dirty bit maintenance might only
9019 * update a single L3 entry, so we must combine the accessed
9020 * and dirty bits from this entire set of contiguous L3
9021 * entries.
9022 */
9023 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
9024 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
9025 mask = ATTR_S1_AP_RW_BIT;
9026 nbits |= l3e & ATTR_AF;
9027 }
9028 if ((nbits & ATTR_AF) != 0) {
9029 pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
9030 ~L3C_OFFSET, true);
9031 }
9032
9033 /*
9034 * Remake the mappings, updating the accessed and dirty bits.
9035 */
9036 l3e = (pmap_load(l3c_start) & ~mask) | nbits;
9037 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
9038 pmap_store(tl3p, l3e);
9039 l3e += L3_SIZE;
9040 }
9041 dsb(ishst);
9042
9043 intr_restore(intr);
9044 if (tmpl3 != NULL) {
9045 pmap_kremove((vm_offset_t)tmpl3);
9046 kva_free(tmpl3, PAGE_SIZE);
9047 }
9048 counter_u64_add(pmap_l3c_demotions, 1);
9049 CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
9050 va, pmap);
9051 return (true);
9052 }
9053
9054 /*
9055 * Accumulate the accessed and dirty bits within a L3C superpage and
9056 * return the specified PTE with them applied correctly.
9057 */
9058 static pt_entry_t
pmap_load_l3c(pt_entry_t * l3p)9059 pmap_load_l3c(pt_entry_t *l3p)
9060 {
9061 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
9062
9063 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
9064 sizeof(pt_entry_t)) - 1));
9065 l3c_end = l3c_start + L3C_ENTRIES;
9066 mask = 0;
9067 nbits = 0;
9068 /* Iterate over each mapping in the superpage. */
9069 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
9070 l3e = pmap_load(tl3p);
9071 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
9072 ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
9073 /* Update mask if the current page has its dirty bit set. */
9074 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
9075 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
9076 mask = ATTR_S1_AP_RW_BIT;
9077 /* Update nbits if the accessed bit is set. */
9078 nbits |= l3e & ATTR_AF;
9079 }
9080 return ((pmap_load(l3p) & ~mask) | nbits);
9081 }
9082
9083 /*
9084 * Perform the pmap work for mincore(2). If the page is not both referenced and
9085 * modified by this pmap, returns its physical address so that the caller can
9086 * find other mappings.
9087 */
9088 int
pmap_mincore(pmap_t pmap,vm_offset_t addr,vm_paddr_t * pap)9089 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
9090 {
9091 pt_entry_t *pte, tpte;
9092 vm_paddr_t mask, pa;
9093 int lvl, psind, val;
9094 bool managed;
9095
9096 PMAP_ASSERT_STAGE1(pmap);
9097 PMAP_LOCK(pmap);
9098 pte = pmap_pte(pmap, addr, &lvl);
9099 if (pte != NULL) {
9100 tpte = pmap_load(pte);
9101
9102 switch (lvl) {
9103 case 3:
9104 mask = L3_OFFSET;
9105 psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 1 : 0;
9106 break;
9107 case 2:
9108 mask = L2_OFFSET;
9109 psind = 2;
9110 break;
9111 case 1:
9112 mask = L1_OFFSET;
9113 psind = 3;
9114 break;
9115 default:
9116 panic("pmap_mincore: invalid level %d", lvl);
9117 }
9118
9119 managed = (tpte & ATTR_SW_MANAGED) != 0;
9120 val = MINCORE_INCORE | MINCORE_PSIND(psind);
9121 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
9122 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
9123 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
9124 if ((tpte & ATTR_AF) == ATTR_AF)
9125 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
9126
9127 pa = PTE_TO_PHYS(tpte) | (addr & mask);
9128 } else {
9129 managed = false;
9130 val = 0;
9131 }
9132
9133 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
9134 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
9135 *pap = pa;
9136 }
9137 PMAP_UNLOCK(pmap);
9138 return (val);
9139 }
9140
9141 /*
9142 * Garbage collect every ASID that is neither active on a processor nor
9143 * reserved.
9144 */
9145 static void
pmap_reset_asid_set(pmap_t pmap)9146 pmap_reset_asid_set(pmap_t pmap)
9147 {
9148 pmap_t curpmap;
9149 int asid, cpuid, epoch;
9150 struct asid_set *set;
9151 enum pmap_stage stage;
9152
9153 set = pmap->pm_asid_set;
9154 stage = pmap->pm_stage;
9155
9156 set = pmap->pm_asid_set;
9157 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
9158 mtx_assert(&set->asid_set_mutex, MA_OWNED);
9159
9160 /*
9161 * Ensure that the store to asid_epoch is globally visible before the
9162 * loads from pc_curpmap are performed.
9163 */
9164 epoch = set->asid_epoch + 1;
9165 if (epoch == INT_MAX)
9166 epoch = 0;
9167 set->asid_epoch = epoch;
9168 dsb(ishst);
9169 if (stage == PM_STAGE1) {
9170 __asm __volatile("tlbi vmalle1is");
9171 } else {
9172 KASSERT(pmap_clean_stage2_tlbi != NULL,
9173 ("%s: Unset stage 2 tlb invalidation callback\n",
9174 __func__));
9175 pmap_clean_stage2_tlbi();
9176 }
9177 dsb(ish);
9178 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
9179 set->asid_set_size - 1);
9180 CPU_FOREACH(cpuid) {
9181 if (cpuid == curcpu)
9182 continue;
9183 if (stage == PM_STAGE1) {
9184 curpmap = pcpu_find(cpuid)->pc_curpmap;
9185 PMAP_ASSERT_STAGE1(pmap);
9186 } else {
9187 curpmap = pcpu_find(cpuid)->pc_curvmpmap;
9188 if (curpmap == NULL)
9189 continue;
9190 PMAP_ASSERT_STAGE2(pmap);
9191 }
9192 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
9193 asid = COOKIE_TO_ASID(curpmap->pm_cookie);
9194 if (asid == -1)
9195 continue;
9196 bit_set(set->asid_set, asid);
9197 curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
9198 }
9199 }
9200
9201 /*
9202 * Allocate a new ASID for the specified pmap.
9203 */
9204 static void
pmap_alloc_asid(pmap_t pmap)9205 pmap_alloc_asid(pmap_t pmap)
9206 {
9207 struct asid_set *set;
9208 int new_asid;
9209
9210 set = pmap->pm_asid_set;
9211 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
9212
9213 mtx_lock_spin(&set->asid_set_mutex);
9214
9215 /*
9216 * While this processor was waiting to acquire the asid set mutex,
9217 * pmap_reset_asid_set() running on another processor might have
9218 * updated this pmap's cookie to the current epoch. In which case, we
9219 * don't need to allocate a new ASID.
9220 */
9221 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
9222 goto out;
9223
9224 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
9225 &new_asid);
9226 if (new_asid == -1) {
9227 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
9228 set->asid_next, &new_asid);
9229 if (new_asid == -1) {
9230 pmap_reset_asid_set(pmap);
9231 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
9232 set->asid_set_size, &new_asid);
9233 KASSERT(new_asid != -1, ("ASID allocation failure"));
9234 }
9235 }
9236 bit_set(set->asid_set, new_asid);
9237 set->asid_next = new_asid + 1;
9238 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
9239 out:
9240 mtx_unlock_spin(&set->asid_set_mutex);
9241 }
9242
9243 static uint64_t __read_mostly ttbr_flags;
9244
9245 /*
9246 * Compute the value that should be stored in ttbr0 to activate the specified
9247 * pmap. This value may change from time to time.
9248 */
9249 uint64_t
pmap_to_ttbr0(pmap_t pmap)9250 pmap_to_ttbr0(pmap_t pmap)
9251 {
9252 uint64_t ttbr;
9253
9254 ttbr = pmap->pm_ttbr;
9255 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
9256 ttbr |= ttbr_flags;
9257
9258 return (ttbr);
9259 }
9260
9261 static void
pmap_set_cnp(void * arg)9262 pmap_set_cnp(void *arg)
9263 {
9264 uint64_t ttbr0, ttbr1;
9265 u_int cpuid;
9266
9267 cpuid = *(u_int *)arg;
9268 if (cpuid == curcpu) {
9269 /*
9270 * Set the flags while all CPUs are handling the
9271 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls
9272 * to pmap_to_ttbr0 after this will have the CnP flag set.
9273 * The dsb after invalidating the TLB will act as a barrier
9274 * to ensure all CPUs can observe this change.
9275 */
9276 ttbr_flags |= TTBR_CnP;
9277 }
9278
9279 ttbr0 = READ_SPECIALREG(ttbr0_el1);
9280 ttbr0 |= TTBR_CnP;
9281
9282 ttbr1 = READ_SPECIALREG(ttbr1_el1);
9283 ttbr1 |= TTBR_CnP;
9284
9285 /* Update ttbr{0,1}_el1 with the CnP flag */
9286 WRITE_SPECIALREG(ttbr0_el1, ttbr0);
9287 WRITE_SPECIALREG(ttbr1_el1, ttbr1);
9288 isb();
9289 __asm __volatile("tlbi vmalle1is");
9290 dsb(ish);
9291 isb();
9292 }
9293
9294 /*
9295 * Defer enabling some features until we have read the ID registers to know
9296 * if they are supported on all CPUs.
9297 */
9298 static void
pmap_init_mp(void * dummy __unused)9299 pmap_init_mp(void *dummy __unused)
9300 {
9301 uint64_t reg;
9302
9303 get_kernel_reg(ID_AA64PFR1_EL1, ®);
9304 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
9305 if (bootverbose)
9306 printf("Enabling BTI\n");
9307 pmap_bti_support = true;
9308
9309 pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
9310 sizeof(struct rs_el), NULL, NULL, NULL, NULL,
9311 UMA_ALIGN_PTR, 0);
9312 }
9313 }
9314 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
9315
9316 /*
9317 * Defer enabling CnP until we have read the ID registers to know if it's
9318 * supported on all CPUs.
9319 */
9320 static void
pmap_init_cnp(void * dummy __unused)9321 pmap_init_cnp(void *dummy __unused)
9322 {
9323 uint64_t reg;
9324 u_int cpuid;
9325
9326 get_kernel_reg(ID_AA64MMFR2_EL1, ®);
9327 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
9328 if (bootverbose)
9329 printf("Enabling CnP\n");
9330 cpuid = curcpu;
9331 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
9332 }
9333
9334 }
9335 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
9336
9337 static bool
pmap_activate_int(struct thread * td,pmap_t pmap)9338 pmap_activate_int(struct thread *td, pmap_t pmap)
9339 {
9340 struct asid_set *set;
9341 int epoch;
9342
9343 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
9344 KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
9345
9346 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
9347 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
9348 /*
9349 * Handle the possibility that the old thread was preempted
9350 * after an "ic" or "tlbi" instruction but before it performed
9351 * a "dsb" instruction. If the old thread migrates to a new
9352 * processor, its completion of a "dsb" instruction on that
9353 * new processor does not guarantee that the "ic" or "tlbi"
9354 * instructions performed on the old processor have completed.
9355 */
9356 dsb(ish);
9357 return (false);
9358 }
9359
9360 set = pmap->pm_asid_set;
9361 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
9362
9363 /*
9364 * Ensure that the store to curpmap is globally visible before the
9365 * load from asid_epoch is performed.
9366 */
9367 if (pmap->pm_stage == PM_STAGE1)
9368 PCPU_SET(curpmap, pmap);
9369 else
9370 PCPU_SET(curvmpmap, pmap);
9371 dsb(ish);
9372 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
9373 if (epoch >= 0 && epoch != set->asid_epoch)
9374 pmap_alloc_asid(pmap);
9375
9376 if (pmap->pm_stage == PM_STAGE1) {
9377 uint64_t new_tcr, tcr;
9378
9379 new_tcr = td->td_proc->p_md.md_tcr;
9380 tcr = READ_SPECIALREG(tcr_el1);
9381 if ((tcr & MD_TCR_FIELDS) != new_tcr) {
9382 tcr &= ~MD_TCR_FIELDS;
9383 tcr |= new_tcr;
9384 WRITE_SPECIALREG(tcr_el1, tcr);
9385 }
9386 set_ttbr0(pmap_to_ttbr0(pmap));
9387 if (PCPU_GET(bcast_tlbi_workaround) != 0)
9388 invalidate_local_icache();
9389 }
9390 return (true);
9391 }
9392
9393 void
pmap_activate_vm(pmap_t pmap)9394 pmap_activate_vm(pmap_t pmap)
9395 {
9396
9397 PMAP_ASSERT_STAGE2(pmap);
9398
9399 (void)pmap_activate_int(NULL, pmap);
9400 }
9401
9402 void
pmap_activate(struct thread * td)9403 pmap_activate(struct thread *td)
9404 {
9405 pmap_t pmap;
9406
9407 pmap = vmspace_pmap(td->td_proc->p_vmspace);
9408 PMAP_ASSERT_STAGE1(pmap);
9409 critical_enter();
9410 (void)pmap_activate_int(td, pmap);
9411 critical_exit();
9412 }
9413
9414 /*
9415 * Activate the thread we are switching to.
9416 * To simplify the assembly in cpu_throw return the new threads pcb.
9417 */
9418 struct pcb *
pmap_switch(struct thread * new)9419 pmap_switch(struct thread *new)
9420 {
9421 pcpu_bp_harden bp_harden;
9422 struct pcb *pcb;
9423 uint64_t sctlr;
9424
9425 /* Store the new curthread */
9426 PCPU_SET(curthread, new);
9427
9428 /* And the new pcb */
9429 pcb = new->td_pcb;
9430 PCPU_SET(curpcb, pcb);
9431
9432 if ((new->td_proc->p_flag & P_KPROC) == 0) {
9433 sctlr = READ_SPECIALREG(sctlr_el1);
9434 if ((sctlr & SCTLR_USER_MASK) != new->td_md.md_sctlr) {
9435 sctlr &= ~SCTLR_USER_MASK;
9436 sctlr |= new->td_md.md_sctlr;
9437 WRITE_SPECIALREG(sctlr_el1, sctlr);
9438 isb();
9439 }
9440 }
9441
9442 /*
9443 * TODO: We may need to flush the cache here if switching
9444 * to a user process.
9445 */
9446
9447 if (pmap_activate_int(new, vmspace_pmap(new->td_proc->p_vmspace))) {
9448 /*
9449 * Stop userspace from training the branch predictor against
9450 * other processes. This will call into a CPU specific
9451 * function that clears the branch predictor state.
9452 */
9453 bp_harden = PCPU_GET(bp_harden);
9454 if (bp_harden != NULL)
9455 bp_harden();
9456 }
9457
9458 return (pcb);
9459 }
9460
9461 void
pmap_sync_icache(pmap_t pmap,vm_offset_t va,vm_size_t sz)9462 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
9463 {
9464
9465 PMAP_ASSERT_STAGE1(pmap);
9466 KASSERT(ADDR_IS_CANONICAL(va),
9467 ("%s: Address not in canonical form: %lx", __func__, va));
9468
9469 if (ADDR_IS_KERNEL(va)) {
9470 cpu_icache_sync_range((void *)va, sz);
9471 } else {
9472 u_int len, offset;
9473 vm_paddr_t pa;
9474
9475 /* Find the length of data in this page to flush */
9476 offset = va & PAGE_MASK;
9477 len = imin(PAGE_SIZE - offset, sz);
9478
9479 while (sz != 0) {
9480 /* Extract the physical address & find it in the DMAP */
9481 pa = pmap_extract(pmap, va);
9482 if (pa != 0)
9483 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
9484
9485 /* Move to the next page */
9486 sz -= len;
9487 va += len;
9488 /* Set the length for the next iteration */
9489 len = imin(PAGE_SIZE, sz);
9490 }
9491 }
9492 }
9493
9494 static int
pmap_stage2_fault(pmap_t pmap,uint64_t esr,uint64_t far)9495 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9496 {
9497 pd_entry_t *pdep;
9498 pt_entry_t *ptep, pte;
9499 int rv, lvl, dfsc;
9500
9501 PMAP_ASSERT_STAGE2(pmap);
9502 rv = KERN_FAILURE;
9503
9504 /* Data and insn aborts use same encoding for FSC field. */
9505 dfsc = esr & ISS_DATA_DFSC_MASK;
9506 switch (dfsc) {
9507 case ISS_DATA_DFSC_TF_L0:
9508 case ISS_DATA_DFSC_TF_L1:
9509 case ISS_DATA_DFSC_TF_L2:
9510 case ISS_DATA_DFSC_TF_L3:
9511 PMAP_LOCK(pmap);
9512 pdep = pmap_pde(pmap, far, &lvl);
9513 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
9514 PMAP_UNLOCK(pmap);
9515 break;
9516 }
9517
9518 switch (lvl) {
9519 case 0:
9520 ptep = pmap_l0_to_l1(pdep, far);
9521 break;
9522 case 1:
9523 ptep = pmap_l1_to_l2(pdep, far);
9524 break;
9525 case 2:
9526 ptep = pmap_l2_to_l3(pdep, far);
9527 break;
9528 default:
9529 panic("%s: Invalid pde level %d", __func__,lvl);
9530 }
9531 goto fault_exec;
9532
9533 case ISS_DATA_DFSC_AFF_L1:
9534 case ISS_DATA_DFSC_AFF_L2:
9535 case ISS_DATA_DFSC_AFF_L3:
9536 PMAP_LOCK(pmap);
9537 ptep = pmap_pte(pmap, far, &lvl);
9538 fault_exec:
9539 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
9540 /*
9541 * If accessing an executable page invalidate
9542 * the I-cache so it will be valid when we
9543 * continue execution in the guest. The D-cache
9544 * is assumed to already be clean to the Point
9545 * of Coherency.
9546 */
9547 if ((pte & ATTR_S2_XN_MASK) !=
9548 ATTR_S2_XN(ATTR_S2_XN_NONE)) {
9549 invalidate_icache();
9550 }
9551 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
9552 rv = KERN_SUCCESS;
9553 }
9554 PMAP_UNLOCK(pmap);
9555 break;
9556 }
9557
9558 return (rv);
9559 }
9560
9561 int
pmap_fault(pmap_t pmap,uint64_t esr,uint64_t far)9562 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9563 {
9564 pt_entry_t pte, *ptep;
9565 register_t intr;
9566 uint64_t ec, par;
9567 int lvl, rv;
9568
9569 rv = KERN_FAILURE;
9570
9571 ec = ESR_ELx_EXCEPTION(esr);
9572 switch (ec) {
9573 case EXCP_INSN_ABORT_L:
9574 case EXCP_INSN_ABORT:
9575 case EXCP_DATA_ABORT_L:
9576 case EXCP_DATA_ABORT:
9577 break;
9578 default:
9579 return (rv);
9580 }
9581
9582 if (pmap->pm_stage == PM_STAGE2)
9583 return (pmap_stage2_fault(pmap, esr, far));
9584
9585 /* Data and insn aborts use same encoding for FSC field. */
9586 switch (esr & ISS_DATA_DFSC_MASK) {
9587 case ISS_DATA_DFSC_AFF_L1:
9588 case ISS_DATA_DFSC_AFF_L2:
9589 case ISS_DATA_DFSC_AFF_L3:
9590 PMAP_LOCK(pmap);
9591 ptep = pmap_pte(pmap, far, &lvl);
9592 if (ptep != NULL) {
9593 pmap_set_bits(ptep, ATTR_AF);
9594 rv = KERN_SUCCESS;
9595 /*
9596 * XXXMJ as an optimization we could mark the entry
9597 * dirty if this is a write fault.
9598 */
9599 }
9600 PMAP_UNLOCK(pmap);
9601 break;
9602 case ISS_DATA_DFSC_PF_L1:
9603 case ISS_DATA_DFSC_PF_L2:
9604 case ISS_DATA_DFSC_PF_L3:
9605 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
9606 (esr & ISS_DATA_WnR) == 0)
9607 return (rv);
9608 PMAP_LOCK(pmap);
9609 ptep = pmap_pte(pmap, far, &lvl);
9610 if (ptep != NULL &&
9611 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
9612 if ((pte & ATTR_S1_AP_RW_BIT) ==
9613 ATTR_S1_AP(ATTR_S1_AP_RO)) {
9614 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
9615 pmap_s1_invalidate_page(pmap, far, true);
9616 }
9617 rv = KERN_SUCCESS;
9618 }
9619 PMAP_UNLOCK(pmap);
9620 break;
9621 case ISS_DATA_DFSC_TF_L0:
9622 case ISS_DATA_DFSC_TF_L1:
9623 case ISS_DATA_DFSC_TF_L2:
9624 case ISS_DATA_DFSC_TF_L3:
9625 /*
9626 * Retry the translation. A break-before-make sequence can
9627 * produce a transient fault.
9628 */
9629 if (pmap == kernel_pmap) {
9630 /*
9631 * The translation fault may have occurred within a
9632 * critical section. Therefore, we must check the
9633 * address without acquiring the kernel pmap's lock.
9634 */
9635 if (pmap_klookup(far, NULL))
9636 rv = KERN_SUCCESS;
9637 } else {
9638 bool owned;
9639
9640 /*
9641 * In the EFIRT driver we lock the pmap before
9642 * calling into the runtime service. As the lock
9643 * is already owned by the current thread skip
9644 * locking it again.
9645 */
9646 owned = PMAP_OWNED(pmap);
9647 if (!owned)
9648 PMAP_LOCK(pmap);
9649 /* Ask the MMU to check the address. */
9650 intr = intr_disable();
9651 par = arm64_address_translate_s1e0r(far);
9652 intr_restore(intr);
9653 if (!owned)
9654 PMAP_UNLOCK(pmap);
9655
9656 /*
9657 * If the translation was successful, then we can
9658 * return success to the trap handler.
9659 */
9660 if (PAR_SUCCESS(par))
9661 rv = KERN_SUCCESS;
9662 }
9663 break;
9664 }
9665
9666 return (rv);
9667 }
9668
9669 /*
9670 * Increase the starting virtual address of the given mapping if a
9671 * different alignment might result in more superpage mappings.
9672 */
9673 void
pmap_align_superpage(vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t size)9674 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
9675 vm_offset_t *addr, vm_size_t size)
9676 {
9677 vm_offset_t superpage_offset;
9678
9679 if (size < L3C_SIZE)
9680 return;
9681 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
9682 offset += ptoa(object->pg_color);
9683
9684 /*
9685 * Considering the object's physical alignment, is the mapping large
9686 * enough to encompass an L2 (2MB/32MB) superpage ...
9687 */
9688 superpage_offset = offset & L2_OFFSET;
9689 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) {
9690 /*
9691 * If the virtual and physical alignments differ, then
9692 * increase the virtual address so that the alignments match.
9693 */
9694 if ((*addr & L2_OFFSET) < superpage_offset)
9695 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
9696 else if ((*addr & L2_OFFSET) > superpage_offset)
9697 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) +
9698 superpage_offset;
9699 return;
9700 }
9701 /* ... or an L3C (64KB/2MB) superpage? */
9702 superpage_offset = offset & L3C_OFFSET;
9703 if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) {
9704 if ((*addr & L3C_OFFSET) < superpage_offset)
9705 *addr = (*addr & ~L3C_OFFSET) + superpage_offset;
9706 else if ((*addr & L3C_OFFSET) > superpage_offset)
9707 *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) +
9708 superpage_offset;
9709 }
9710 }
9711
9712 /**
9713 * Get the kernel virtual address of a set of physical pages. If there are
9714 * physical addresses not covered by the DMAP perform a transient mapping
9715 * that will be removed when calling pmap_unmap_io_transient.
9716 *
9717 * \param page The pages the caller wishes to obtain the virtual
9718 * address on the kernel memory map.
9719 * \param vaddr On return contains the kernel virtual memory address
9720 * of the pages passed in the page parameter.
9721 * \param count Number of pages passed in.
9722 * \param can_fault true if the thread using the mapped pages can take
9723 * page faults, false otherwise.
9724 *
9725 * \returns true if the caller must call pmap_unmap_io_transient when
9726 * finished or false otherwise.
9727 *
9728 */
9729 bool
pmap_map_io_transient(vm_page_t page[],void * vaddr[],int count,bool can_fault)9730 pmap_map_io_transient(vm_page_t page[], void *vaddr[], int count,
9731 bool can_fault)
9732 {
9733 vm_paddr_t paddr;
9734 vmem_addr_t addr;
9735 bool needs_mapping;
9736 int error __diagused, i;
9737
9738 /*
9739 * Allocate any KVA space that we need, this is done in a separate
9740 * loop to prevent calling vmem_alloc while pinned.
9741 */
9742 needs_mapping = false;
9743 for (i = 0; i < count; i++) {
9744 paddr = VM_PAGE_TO_PHYS(page[i]);
9745 if (__predict_false(!PHYS_IN_DMAP(paddr))) {
9746 error = vmem_alloc(kernel_arena, PAGE_SIZE,
9747 M_BESTFIT | M_WAITOK, &addr);
9748 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
9749 vaddr[i] = (void *)addr;
9750 needs_mapping = true;
9751 } else {
9752 vaddr[i] = PHYS_TO_DMAP(paddr);
9753 }
9754 }
9755
9756 /* Exit early if everything is covered by the DMAP */
9757 if (!needs_mapping)
9758 return (false);
9759
9760 if (!can_fault)
9761 sched_pin();
9762 for (i = 0; i < count; i++) {
9763 paddr = VM_PAGE_TO_PHYS(page[i]);
9764 if (!PHYS_IN_DMAP(paddr)) {
9765 panic(
9766 "pmap_map_io_transient: TODO: Map out of DMAP data");
9767 }
9768 }
9769
9770 return (needs_mapping);
9771 }
9772
9773 void
pmap_unmap_io_transient(vm_page_t page[],void * vaddr[],int count,bool can_fault)9774 pmap_unmap_io_transient(vm_page_t page[], void *vaddr[], int count,
9775 bool can_fault)
9776 {
9777 vm_paddr_t paddr;
9778 int i;
9779
9780 if (!can_fault)
9781 sched_unpin();
9782 for (i = 0; i < count; i++) {
9783 paddr = VM_PAGE_TO_PHYS(page[i]);
9784 if (!PHYS_IN_DMAP(paddr)) {
9785 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
9786 }
9787 }
9788 }
9789
9790 bool
pmap_is_valid_memattr(pmap_t pmap __unused,vm_memattr_t mode)9791 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
9792 {
9793
9794 return (mode >= 0 && mode < VM_MEMATTR_END);
9795 }
9796
9797 static void *
bti_dup_range(void * ctx __unused,void * data)9798 bti_dup_range(void *ctx __unused, void *data)
9799 {
9800 struct rs_el *node, *new_node;
9801
9802 new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9803 if (new_node == NULL)
9804 return (NULL);
9805 node = data;
9806 memcpy(new_node, node, sizeof(*node));
9807 return (new_node);
9808 }
9809
9810 static void
bti_free_range(void * ctx __unused,void * node)9811 bti_free_range(void *ctx __unused, void *node)
9812 {
9813
9814 uma_zfree(pmap_bti_ranges_zone, node);
9815 }
9816
9817 static int
pmap_bti_assign(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9818 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9819 {
9820 struct rs_el *rs;
9821 int error;
9822
9823 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9824 PMAP_ASSERT_STAGE1(pmap);
9825 MPASS(pmap->pm_bti != NULL);
9826 rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9827 if (rs == NULL)
9828 return (ENOMEM);
9829 error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
9830 if (error != 0)
9831 uma_zfree(pmap_bti_ranges_zone, rs);
9832 return (error);
9833 }
9834
9835 static void
pmap_bti_deassign_all(pmap_t pmap)9836 pmap_bti_deassign_all(pmap_t pmap)
9837 {
9838
9839 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9840 if (pmap->pm_bti != NULL)
9841 rangeset_remove_all(pmap->pm_bti);
9842 }
9843
9844 /*
9845 * Returns true if the BTI setting is the same across the specified address
9846 * range, and false otherwise. When returning true, updates the referenced PTE
9847 * to reflect the BTI setting.
9848 *
9849 * Only stage 1 pmaps support BTI. The kernel pmap is always a stage 1 pmap
9850 * that has the same BTI setting implicitly across its entire address range.
9851 */
9852 static bool
pmap_bti_same(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t * pte)9853 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
9854 {
9855 struct rs_el *rs;
9856 vm_offset_t va;
9857
9858 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9859 KASSERT(ADDR_IS_CANONICAL(sva),
9860 ("%s: Start address not in canonical form: %lx", __func__, sva));
9861 KASSERT(ADDR_IS_CANONICAL(eva),
9862 ("%s: End address not in canonical form: %lx", __func__, eva));
9863 KASSERT((*pte & ATTR_S1_GP) == 0,
9864 ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte));
9865
9866 if (pmap == kernel_pmap) {
9867 *pte |= ATTR_KERN_GP;
9868 return (true);
9869 }
9870 if (pmap->pm_bti == NULL)
9871 return (true);
9872 PMAP_ASSERT_STAGE1(pmap);
9873 rs = rangeset_containing(pmap->pm_bti, sva);
9874 if (rs == NULL)
9875 return (rangeset_empty(pmap->pm_bti, sva, eva));
9876 while ((va = rs->re_end) < eva) {
9877 if ((rs = rangeset_beginning(pmap->pm_bti, va)) == NULL)
9878 return (false);
9879 }
9880 *pte |= ATTR_S1_GP;
9881 return (true);
9882 }
9883
9884 static pt_entry_t
pmap_pte_bti(pmap_t pmap,vm_offset_t va)9885 pmap_pte_bti(pmap_t pmap, vm_offset_t va)
9886 {
9887 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9888 MPASS(ADDR_IS_CANONICAL(va));
9889
9890 if (pmap->pm_stage != PM_STAGE1)
9891 return (0);
9892 if (pmap == kernel_pmap)
9893 return (ATTR_KERN_GP);
9894 if (pmap->pm_bti != NULL &&
9895 rangeset_containing(pmap->pm_bti, va) != NULL)
9896 return (ATTR_S1_GP);
9897 return (0);
9898 }
9899
9900 static void
pmap_bti_on_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9901 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9902 {
9903
9904 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9905 if (pmap->pm_bti != NULL)
9906 rangeset_remove(pmap->pm_bti, sva, eva);
9907 }
9908
9909 static int
pmap_bti_copy(pmap_t dst_pmap,pmap_t src_pmap)9910 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
9911 {
9912
9913 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
9914 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
9915 MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
9916 MPASS(src_pmap->pm_bti != NULL);
9917 MPASS(dst_pmap->pm_bti != NULL);
9918 if (src_pmap->pm_bti->rs_data_ctx == NULL)
9919 return (0);
9920 return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
9921 }
9922
9923 static void
pmap_bti_update_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool set)9924 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
9925 {
9926 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9927 PMAP_ASSERT_STAGE1(pmap);
9928
9929 pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
9930 true);
9931 }
9932
9933 int
pmap_bti_set(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9934 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9935 {
9936 int error;
9937
9938 if (pmap->pm_bti == NULL)
9939 return (0);
9940 if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
9941 return (EINVAL);
9942 if (pmap->pm_stage != PM_STAGE1)
9943 return (EINVAL);
9944 if (eva <= sva || ADDR_IS_KERNEL(eva))
9945 return (EFAULT);
9946
9947 sva = trunc_page(sva);
9948 eva = round_page(eva);
9949 for (;;) {
9950 PMAP_LOCK(pmap);
9951 error = pmap_bti_assign(pmap, sva, eva);
9952 if (error == 0)
9953 pmap_bti_update_range(pmap, sva, eva, true);
9954 PMAP_UNLOCK(pmap);
9955 if (error != ENOMEM)
9956 break;
9957 vm_wait(NULL);
9958 }
9959 return (error);
9960 }
9961
9962 #if defined(KASAN) || defined(KMSAN)
9963 static pd_entry_t *pmap_san_early_l2;
9964
9965 #define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE)
9966 #define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE)
9967 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_l2(void)9968 pmap_san_enter_bootstrap_alloc_l2(void)
9969 {
9970 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
9971 static size_t offset = 0;
9972 vm_offset_t addr;
9973
9974 if (offset + L2_SIZE > sizeof(bootstrap_data)) {
9975 panic("%s: out of memory for the bootstrap shadow map L2 entries",
9976 __func__);
9977 }
9978
9979 addr = (uintptr_t)&bootstrap_data[offset];
9980 offset += L2_SIZE;
9981 return (addr);
9982 }
9983
9984 /*
9985 * SAN L1 + L2 pages, maybe L3 entries later?
9986 */
9987 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_pages(int npages)9988 pmap_san_enter_bootstrap_alloc_pages(int npages)
9989 {
9990 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
9991 static size_t offset = 0;
9992 vm_offset_t addr;
9993
9994 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
9995 panic("%s: out of memory for the bootstrap shadow map",
9996 __func__);
9997 }
9998
9999 addr = (uintptr_t)&bootstrap_data[offset];
10000 offset += (npages * PAGE_SIZE);
10001 return (addr);
10002 }
10003
10004 static void __nosanitizeaddress
pmap_san_enter_bootstrap(void)10005 pmap_san_enter_bootstrap(void)
10006 {
10007 vm_offset_t freemempos;
10008
10009 /* L1, L2 */
10010 freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
10011 bs_state.freemempos = freemempos;
10012 bs_state.va = KASAN_MIN_ADDRESS;
10013 pmap_bootstrap_l1_table(&bs_state);
10014 pmap_san_early_l2 = bs_state.l2;
10015 }
10016
10017 static vm_page_t
pmap_san_enter_alloc_l3(void)10018 pmap_san_enter_alloc_l3(void)
10019 {
10020 vm_page_t m;
10021
10022 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
10023 VM_ALLOC_ZERO);
10024 if (m == NULL)
10025 panic("%s: no memory to grow shadow map", __func__);
10026 return (m);
10027 }
10028
10029 static vm_page_t
pmap_san_enter_alloc_l2(void)10030 pmap_san_enter_alloc_l2(void)
10031 {
10032 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
10033 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
10034 }
10035
/*
 * Make sure the sanitizer shadow map contains a valid mapping for va,
 * allocating page table pages and backing memory where entries are missing.
 * NOTE(review): va is presumably an address inside the shadow region --
 * confirm against the callers.
 *
 * Two modes are handled:
 *  - While virtual_avail is still 0, the mapping is installed as an L2 block
 *    in the early L2 table (pmap_san_early_l2), using memory from the static
 *    bootstrap allocators.
 *  - Afterwards, the kernel pmap is walked from L1 downwards.  Missing table
 *    levels are allocated, an L2 block mapping is preferred, and a single L3
 *    page mapping is used when no contiguous run is available.
 *
 * In the second mode the caller must hold kernel_map->system_mtx.
 */
void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)
{
	pd_entry_t *l1, *l2;
	pt_entry_t *l3;
	vm_page_t m;

	if (virtual_avail == 0) {
		vm_offset_t block;
		int slot;
		bool first;

		/* Temporary shadow map prior to pmap_bootstrap(). */
		first = pmap_san_early_l2 == NULL;
		if (first)
			pmap_san_enter_bootstrap();

		l2 = pmap_san_early_l2;
		slot = pmap_l2_index(va);

		if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
			/*
			 * An unmapped slot is only expected on the call that
			 * created the early tables.
			 */
			MPASS(first);
			block = pmap_san_enter_bootstrap_alloc_l2();
			pmap_store(&l2[slot],
			    PHYS_TO_PTE(pmap_early_vtophys(block)) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
			/* Store barrier after the page table update. */
			dmb(ishst);
		}

		return;
	}

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	l1 = pmap_l1(kernel_pmap, va);
	MPASS(l1 != NULL);
	if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
		/* No L2 table yet: a single zeroed page serves as the table. */
		m = pmap_san_enter_alloc_l3();
		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
	}
	l2 = pmap_l1_to_l2(l1, va);
	if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
		/*
		 * Prefer backing the whole L2 entry with one block mapping;
		 * if no contiguous run is available, install an L3 table
		 * instead and map page by page below.
		 */
		m = pmap_san_enter_alloc_l2();
		if (m != NULL) {
			pmap_store(l2, VM_PAGE_TO_PTE(m) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
		} else {
			m = pmap_san_enter_alloc_l3();
			pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
		}
		dmb(ishst);
	}
	/* A block mapping already covers va; nothing more to do. */
	if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
		return;
	l3 = pmap_l2_to_l3(l2, va);
	/* The page is already mapped. */
	if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
		return;
	m = pmap_san_enter_alloc_l3();
	pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
	dmb(ishst);
}
10096 #endif /* KASAN || KMSAN */
10097
/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 *
 * The counters record how the range is mapped; they are reported by
 * sysctl_kmaps_dump() and cleared by sysctl_kmaps_reinit().
 */
struct pmap_kernel_map_range {
	/* Start of the run; 0xffffffffffffffff means no run is open. */
	vm_offset_t sva;
	/* Attribute bits shared by every mapping in the run. */
	pt_entry_t attrs;
	/* L3 page entries without ATTR_CONTIGUOUS. */
	int l3pages;
	/* ATTR_CONTIGUOUS L3 entries found at an L3C_ENTRIES-aligned index. */
	int l3contig;
	/* L2 block entries without ATTR_CONTIGUOUS. */
	int l2blocks;
	/* ATTR_CONTIGUOUS L2 blocks found at an L2C_ENTRIES-aligned index. */
	int l2contig;
	/* L1 block entries. */
	int l1blocks;
};
10111
10112 static void
sysctl_kmaps_dump(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t eva)10113 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
10114 vm_offset_t eva)
10115 {
10116 const char *mode;
10117 int index;
10118
10119 if (eva <= range->sva)
10120 return;
10121
10122 index = range->attrs & ATTR_S1_IDX_MASK;
10123 switch (index) {
10124 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
10125 mode = "DEV-NP";
10126 break;
10127 case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
10128 mode = "DEV";
10129 break;
10130 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
10131 mode = "UC";
10132 break;
10133 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
10134 mode = "WB";
10135 break;
10136 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
10137 mode = "WT";
10138 break;
10139 case ATTR_S1_IDX(VM_MEMATTR_TAGGED):
10140 mode = "TAGGED";
10141 break;
10142 default:
10143 printf(
10144 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
10145 __func__, index, range->sva, eva);
10146 mode = "??";
10147 break;
10148 }
10149
10150 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n",
10151 range->sva, eva,
10152 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
10153 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
10154 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
10155 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
10156 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
10157 mode, range->l1blocks, range->l2contig, range->l2blocks,
10158 range->l3contig, range->l3pages);
10159
10160 /* Reset to sentinel value. */
10161 range->sva = 0xfffffffffffffffful;
10162 }
10163
10164 /*
10165 * Determine whether the attributes specified by a page table entry match those
10166 * being tracked by the current range.
10167 */
10168 static bool
sysctl_kmaps_match(struct pmap_kernel_map_range * range,pt_entry_t attrs)10169 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
10170 {
10171
10172 return (range->attrs == attrs);
10173 }
10174
10175 static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range * range,vm_offset_t va,pt_entry_t attrs)10176 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
10177 pt_entry_t attrs)
10178 {
10179
10180 memset(range, 0, sizeof(*range));
10181 range->sva = va;
10182 range->attrs = attrs;
10183 }
10184
10185 /* Get the block/page attributes that correspond to the table attributes */
10186 static pt_entry_t
sysctl_kmaps_table_attrs(pd_entry_t table)10187 sysctl_kmaps_table_attrs(pd_entry_t table)
10188 {
10189 pt_entry_t attrs;
10190
10191 attrs = 0;
10192 if ((table & TATTR_UXN_TABLE) != 0)
10193 attrs |= ATTR_S1_UXN;
10194 if ((table & TATTR_PXN_TABLE) != 0)
10195 attrs |= ATTR_S1_PXN;
10196 if ((table & TATTR_AP_TABLE_RO) != 0)
10197 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
10198
10199 return (attrs);
10200 }
10201
10202 /* Read the block/page attributes we care about */
10203 static pt_entry_t
sysctl_kmaps_block_attrs(pt_entry_t block)10204 sysctl_kmaps_block_attrs(pt_entry_t block)
10205 {
10206 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
10207 ATTR_S1_GP));
10208 }
10209
10210 /*
10211 * Given a leaf PTE, derive the mapping's attributes. If they do not match
10212 * those of the current run, dump the address range and its attributes, and
10213 * begin a new run.
10214 */
10215 static void
sysctl_kmaps_check(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t va,pd_entry_t l0e,pd_entry_t l1e,pd_entry_t l2e,pt_entry_t l3e)10216 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
10217 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
10218 pt_entry_t l3e)
10219 {
10220 pt_entry_t attrs;
10221
10222 attrs = sysctl_kmaps_table_attrs(l0e);
10223
10224 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
10225 attrs |= sysctl_kmaps_block_attrs(l1e);
10226 goto done;
10227 }
10228 attrs |= sysctl_kmaps_table_attrs(l1e);
10229
10230 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
10231 attrs |= sysctl_kmaps_block_attrs(l2e);
10232 goto done;
10233 }
10234 attrs |= sysctl_kmaps_table_attrs(l2e);
10235 attrs |= sysctl_kmaps_block_attrs(l3e);
10236
10237 done:
10238 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
10239 sysctl_kmaps_dump(sb, range, va);
10240 sysctl_kmaps_reinit(range, va, attrs);
10241 }
10242 }
10243
/*
 * Sysctl handler that produces a textual dump of the kernel's mappings.
 *
 * The kernel pmap's page tables are walked from 0xffff000000000000 through
 * the last L0 entry.  Adjacent mappings with identical attributes are
 * merged into runs, and each run is printed as one line by
 * sysctl_kmaps_dump().  A heading is printed when the walk reaches the L0
 * slot that starts the direct map, the kernel map or a sanitizer map.
 *
 * Returns 0 on success or an error from wiring the output buffer or from
 * finishing the sbuf.
 */
static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pd_entry_t l0e, *l1, l1e, *l2, l2e;
	pt_entry_t *l3, l3e;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = 0xfffffffffffffffful;

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock.  Kernel page table pages are never freed, so at worst we will
	 * observe inconsistencies in the output.
	 */
	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
	    i++) {
		/* Print a heading when entering a well-known region. */
		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
			sbuf_printf(sb, "\nDirect map:\n");
		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
			sbuf_printf(sb, "\nKernel map:\n");
#ifdef KASAN
		else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
			sbuf_printf(sb, "\nKASAN shadow map:\n");
#endif
#ifdef KMSAN
		else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN shadow map:\n");
		else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN origin map:\n");
#endif

		l0e = kernel_pmap->pm_l0[i];
		if ((l0e & ATTR_DESCR_VALID) == 0) {
			/* A hole ends the current run; skip the whole L0 span. */
			sysctl_kmaps_dump(sb, &range, sva);
			sva += L0_SIZE;
			continue;
		}
		pa = PTE_TO_PHYS(l0e);
		l1 = PHYS_TO_DMAP(pa);

		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
			l1e = l1[j];
			if ((l1e & ATTR_DESCR_VALID) == 0) {
				sysctl_kmaps_dump(sb, &range, sva);
				sva += L1_SIZE;
				continue;
			}
			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
				/* Leaf at L1: account for it and move on. */
				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
				    0, 0);
				range.l1blocks++;
				sva += L1_SIZE;
				continue;
			}
			pa = PTE_TO_PHYS(l1e);
			l2 = PHYS_TO_DMAP(pa);

			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
				l2e = l2[k];
				if ((l2e & ATTR_DESCR_VALID) == 0) {
					sysctl_kmaps_dump(sb, &range, sva);
					sva += L2_SIZE;
					continue;
				}
				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, 0);
					/*
					 * Contiguous blocks are counted only
					 * at L2C_ENTRIES-aligned indices.
					 */
					if ((l2e & ATTR_CONTIGUOUS) != 0)
						range.l2contig +=
						    k % L2C_ENTRIES == 0 ?
						    1 : 0;
					else
						range.l2blocks++;
					sva += L2_SIZE;
					continue;
				}
				pa = PTE_TO_PHYS(l2e);
				l3 = PHYS_TO_DMAP(pa);

				/* sva advances in the loop header at this level. */
				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
				    l++, sva += L3_SIZE) {
					l3e = l3[l];
					if ((l3e & ATTR_DESCR_VALID) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, l3e);
					/*
					 * Contiguous pages are counted only
					 * at L3C_ENTRIES-aligned indices.
					 */
					if ((l3e & ATTR_CONTIGUOUS) != 0)
						range.l3contig +=
						    l % L3C_ENTRIES == 0 ?
						    1 : 0;
					else
						range.l3pages++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
/*
 * Register sysctl_kmaps() as the read-only string node "kernel_maps" under
 * the _vm_pmap parent.  The node is flagged CTLFLAG_SKIP.
 * NOTE(review): CTLFLAG_SKIP presumably keeps it out of bulk listings --
 * confirm against sysctl(9).
 */
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");
10364
10365
/*
 * Candidate page-zeroing routines, defined outside this part of the file.
 * The pagezero ifunc resolver picks one of them.
 */
void pagezero_simple(void *);
void pagezero_cache(void *);
void pagezero_mops(void *);
10369
10370 DEFINE_IFUNC(static, void, pagezero, (void *))
10371 {
10372 uint32_t dczid_el0;
10373
10374 dczid_el0 = READ_SPECIALREG(dczid_el0);
10375
10376 if (elf_hwcap2 & HWCAP2_MOPS)
10377 return (pagezero_mops);
10378 else if ((dczid_el0 & DCZID_DZP) == 0)
10379 return (pagezero_cache);
10380 else
10381 return (pagezero_simple);
10382 }
10383