1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 */
52 /*-
53 * Copyright (c) 2003 Networks Associates Technology, Inc.
54 * All rights reserved.
55 *
56 * This software was developed for the FreeBSD Project by Jake Burkholder,
57 * Safeport Network Services, and Network Associates Laboratories, the
58 * Security Research Division of Network Associates, Inc. under
59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60 * CHATS research program.
61 *
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
64 * are met:
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 * notice, this list of conditions and the following disclaimer in the
69 * documentation and/or other materials provided with the distribution.
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81 * SUCH DAMAGE.
82 */
83
84 #include <sys/cdefs.h>
85 /*
86 * Manages physical address maps.
87 *
88 * Since the information managed by this module is
89 * also stored by the logical address mapping module,
90 * this module may throw away valid virtual-to-physical
91 * mappings at almost any time. However, invalidations
92 * of virtual-to-physical mappings must be done as
93 * requested.
94 *
95 * In order to cope with hardware architectures which
96 * make virtual-to-physical map invalidates expensive,
97 * this module may delay invalidate or reduced protection
98 * operations until such time as they are actually
99 * necessary. This module is given full information as
100 * to which processors are currently using which maps,
101 * and to when physical maps must be made correct.
102 */
103
104 #include "opt_vm.h"
105
106 #include <sys/param.h>
107 #include <sys/asan.h>
108 #include <sys/bitstring.h>
109 #include <sys/bus.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/ktr.h>
113 #include <sys/limits.h>
114 #include <sys/lock.h>
115 #include <sys/malloc.h>
116 #include <sys/mman.h>
117 #include <sys/msan.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/physmem.h>
121 #include <sys/proc.h>
122 #include <sys/rangeset.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147
148 #include <machine/asan.h>
149 #include <machine/cpu_feat.h>
150 #include <machine/elf.h>
151 #include <machine/ifunc.h>
152 #include <machine/machdep.h>
153 #include <machine/md_var.h>
154 #include <machine/pcb.h>
155 #include <machine/rsi.h>
156
/*
 * Number of per-domain PV chunk lists: MAXMEMDOM when NUMA is configured,
 * otherwise a single list.
 */
#ifdef NUMA
#define	PMAP_MEMDOM	MAXMEMDOM
#else
#define	PMAP_MEMDOM	1
#endif

/* Assert that the given pmap's pm_stage is the expected translation stage. */
#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

/* Number of entries that fit in one page of a level 0..3 table. */
#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

/*
 * Cumulative entry counts: L0_ENTRIES at level 0, multiplied by the
 * entries-per-page of each lower level.  These are the offsets used by the
 * pmap_l*_pindex() macros below.
 */
#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

/*
 * PV_STAT(x) executes x only when PV_STATS is defined.  __pvused expands to
 * __unused when PV_STATS is not defined and to nothing otherwise.
 */
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#define __pvused
#else
#define PV_STAT(x)	do { } while (0)
#define __pvused	__unused
#endif

/*
 * Page index derived from a virtual address at each level.  The L2 indices
 * start at zero, the L1 indices follow after NUL2E, and the L0 indices follow
 * after NUL2E + NUL1E, so the three ranges do not overlap.
 */
#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)

/*
 * ATTR_KERN_GP is the run-time variable pmap_gp_attr when the kernel is built
 * with BTI by default, and 0 otherwise.
 */
#ifdef __ARM_FEATURE_BTI_DEFAULT
pt_entry_t __read_mostly pmap_gp_attr;
#define	ATTR_KERN_GP		pmap_gp_attr
#else
#define	ATTR_KERN_GP		0
#endif
/*
 * PTE attribute bits combined for sanitizer mappings: access flag set,
 * execute-never, write-back memory, read/write access.
 * NOTE(review): the name suggests use for KASAN/KMSAN shadow maps -- the
 * users are not visible in this part of the file.
 */
#define	PMAP_SAN_PTE_BITS	(ATTR_AF | ATTR_S1_XN | pmap_sh_attr | \
	ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))

/* NOTE(review): set and consumed outside the visible part of this file. */
static bool __read_mostly pmap_multiple_tlbi = false;

/*
 * Per-L2-sized-region PV metadata: the PV list head for the region together
 * with the lock that protects it.
 */
struct pmap_large_md_page {
	struct rwlock   pv_lock;
	struct md_page  pv_page;
	/* Pad to a power of 2, see pmap_init_pv_table(). */
	int		pv_pad[2];
};

/*
 * Fallback entry whose lock is used when a page or physical address has no
 * entry of its own (see PHYS_TO_PV_LIST_LOCK() and VM_PAGE_TO_PV_LIST_LOCK()).
 */
__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
#define pv_dummy pv_dummy_large.pv_page
__read_mostly static struct pmap_large_md_page *pv_table;

/* NOTE(review): not referenced in the visible part of this file. */
__read_mostly uint64_t prot_ns_shared_pa;
210
211 static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)212 _pa_to_pmdp(vm_paddr_t pa)
213 {
214 struct vm_phys_seg *seg;
215
216 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
217 return ((struct pmap_large_md_page *)seg->md_first +
218 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
219 return (NULL);
220 }
221
222 static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)223 pa_to_pmdp(vm_paddr_t pa)
224 {
225 struct pmap_large_md_page *pvd;
226
227 pvd = _pa_to_pmdp(pa);
228 if (pvd == NULL)
229 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
230 return (pvd);
231 }
232
233 static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)234 page_to_pmdp(vm_page_t m)
235 {
236 struct vm_phys_seg *seg;
237
238 seg = &vm_phys_segs[m->segind];
239 return ((struct pmap_large_md_page *)seg->md_first +
240 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
241 }
242
/*
 * PV list head for a physical address or a vm_page.  pa_to_pvh() goes through
 * pa_to_pmdp() and therefore panics if "pa" is not within a vm_phys segment.
 */
#define	pa_to_pvh(pa)		(&(pa_to_pmdp(pa)->pv_page))
#define	page_to_pvh(m)		(&(page_to_pmdp(m)->pv_page))

/*
 * Evaluates to the PV list lock that covers physical address "pa".  When
 * "pa" is not within any vm_phys segment the lock of pv_dummy_large is used
 * instead, so the result is never NULL.
 */
#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
	struct pmap_large_md_page *_pvd;			\
	struct rwlock *_lock;					\
	_pvd = _pa_to_pmdp(pa);					\
	if (__predict_false(_pvd == NULL))			\
		_lock = &pv_dummy_large.pv_lock;		\
	else							\
		_lock = &(_pvd->pv_lock);			\
	_lock;							\
})
256
257 static struct rwlock *
VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)258 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
259 {
260 if ((m->flags & PG_FICTITIOUS) == 0)
261 return (&page_to_pmdp(m)->pv_lock);
262 else
263 return (&pv_dummy_large.pv_lock);
264 }
265
/*
 * Switch the write-held PV list lock tracked by *lockp to new_lock.  Nothing
 * happens if the two are already the same lock; otherwise the old lock (if
 * any) is write-unlocked and the new one is write-locked and recorded in
 * *lockp.
 */
#define	CHANGE_PV_LIST_LOCK(lockp, new_lock)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock = (new_lock);		\
							\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

/* As above, selecting the new lock from a physical address. */
#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)		\
			CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))

/* As above, selecting the new lock from a vm_page. */
#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))

/*
 * Write-unlock the PV list lock tracked by *lockp, if any, and reset *lockp
 * to NULL.
 */
#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

/* Convert between a page table entry's physical address and its vm_page. */
#define	PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
#define	VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
295
/*
 * NOTE(review): cmap_lock, cmap1_addr and cmap1_pte are not used in the
 * visible part of this file; presumably a lock-protected scratch mapping --
 * confirm against their users.
 */
static struct mtx cmap_lock;
static void *cmap1_addr;
static pt_entry_t *cmap1_pte;

/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM

/* Storage for the kernel's pmap. */
struct pmap kernel_pmap_store;

/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;		/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
 * Always map entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;	/* physical address of the mapping */
	void		*va;	/* virtual address of the mapping */
	vm_size_t	size;	/* size of the mapping */
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;
332
333 /*
334 * Data for the pv entry allocation mechanism.
335 */
/*
 * Return the memory domain of PV chunk "pc".
 *
 * With NUMA this is vm_phys_domain() of the physical address behind the
 * chunk's direct-map address; without NUMA it is always domain 0 and "pc" is
 * not examined.
 */
static __inline int
pc_to_domain(struct pv_chunk *pc)
{
#ifdef NUMA
	return (vm_phys_domain(DMAP_TO_PHYS(pc)));
#else
	(void)pc;
	return (0);
#endif
}
349
/*
 * A lock-protected list of PV chunks.  One instance exists per memory domain
 * (see pv_chunks[] below), each aligned to a cache line.
 */
struct pv_chunks_list {
	struct mtx pvc_lock;			/* protects this structure */
	TAILQ_HEAD(pch, pv_chunk) pvc_list;	/* the chunks themselves */
	int active_reclaims;	/* NOTE(review): presumably counts reclaims in progress -- confirm */
} __aligned(CACHE_LINE_SIZE);

struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* The TTBR1 level 0 page table, defined outside this file. */
extern pt_entry_t pagetable_l0_ttbr1[];

/*
 * Physical memory ranges.
 * NOTE(review): the factor of two suggests (start, end) pairs, with
 * physmap_idx counting the slots in use -- confirm against the code that
 * fills the array.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/* Read-only tunable; see the description string for its meaning. */
static int pmap_growkernel_panic = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN,
    &pmap_growkernel_panic, 0,
    "panic on failure to allocate kernel page table page");

bool pmap_lpa_enabled __read_mostly = false;
pt_entry_t pmap_sh_attr __read_mostly = ATTR_SH(ATTR_SH_IS);

/*
 * L1 block mappings are always available with 4K pages; with any other page
 * size they are available only when pmap_lpa_enabled is set.
 */
#if PAGE_SIZE == PAGE_SIZE_4K
#define	L1_BLOCKS_SUPPORTED	1
#else
#define	L1_BLOCKS_SUPPORTED	(pmap_lpa_enabled)
#endif

#define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)

/* NOTE(review): set and consumed outside the visible part of this file. */
static bool pmap_l1_supported __read_mostly = false;
388
/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * it has currently allocated to a pmap, a cursor ("asid_next") to
 * optimize its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
 */
struct asid_set {
	int asid_bits;			/* number of bits in an ID */
	bitstr_t *asid_set;		/* bit vector of allocated IDs */
	int asid_set_size;		/* size of the bit vector */
	int asid_next;			/* search cursor */
	int asid_epoch;			/* current epoch number */
	struct mtx asid_set_mutex;	/* NOTE(review): presumably protects the fields above -- confirm */
};

/*
 * Two instances of the allocator: "asids" and "vmids".  The same structure
 * is used for both, exported under vm.pmap.asid and vm.pmap.vmid below.
 */
static struct asid_set asids;
static struct asid_set vmids;

static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");

/* NOTE(review): unlike the asid node, this node does not set CTLFLAG_MPSAFE. */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in an VMID");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
    "The last allocated VMID plus one");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
    "The current epoch number");

/*
 * Stage 2 TLB maintenance hooks.  They have no initializer here.
 * NOTE(review): presumably installed by the hypervisor code -- confirm.
 */
void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
void (*pmap_stage2_invalidate_all)(uint64_t);
432
/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 *
 * Layout: the ASID occupies the low 32 bits and the epoch the high 32 bits.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |		\
					    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))

/*
 * TLBI_VA() converts an address into a TLB invalidate operand: the address
 * shifted right by 12 bits and truncated to 44 bits.
 */
#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)

/* Non-zero when large page mappings may be used; see pmap_ps_enabled(). */
static int __read_frequently superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * True when Branch Target Identification should be used by userspace.  This
 * allows pmap to mark pages as guarded with ATTR_S1_GP.
 */
__read_mostly static bool pmap_bti_support = false;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
/* List head type for batches of PV chunks (see free_pv_chunk_batch()). */
TAILQ_HEAD(pv_chunklist, pv_chunk);

/*
 * Forward declarations of file-local functions.
 */

/* PV entry and PV chunk management. */
static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_chunk_batch(struct pv_chunklist *batch);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

/* Mapping creation, removal, promotion and demotion helpers. */
static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(struct thread *td, pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_props_locked(void *addr, vm_size_t size,
    vm_prot_t prot, int mode, bool skip_unmapped);
static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va);
static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
    vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
static bool pmap_every_pte_zero(vm_paddr_t pa);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
    bool all_l3e_AF_set);
static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp);
static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, bool demote_kl2e, struct spglist *free,
    struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
    struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

/* Page table page allocation and release. */
static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
    vm_offset_t va, vm_size_t size);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);

/* Branch Target Identification (BTI) range tracking. */
static uma_zone_t pmap_bti_ranges_zone;
static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pt_entry_t *pte);
static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void *bti_dup_range(void *ctx, void *data);
static void bti_free_range(void *ctx, void *node);
static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
static void pmap_bti_deassign_all(pmap_t pmap);
static void pagezero(void *);

/* NOTE(review): defined outside the visible part of this file. */
static void pmap_set_protected(pt_entry_t old_l3);
static void pmap_set_unprotected(pt_entry_t new_l3);
545
/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 *
 * "table" is a pointer to a 64-bit page table entry.  The store, swap,
 * set-bits and clear-bits operations map onto the 64-bit atomic primitives;
 * pmap_load() is a plain dereference of the entry.  The argument of
 * pmap_load() is parenthesized so that an expression such as "l3 + 1" reads
 * the intended entry rather than expanding to "*l3 + 1".
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*(table))
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)
558
559 /********************/
560 /* Inline functions */
561 /********************/
562
563 static __inline void
pagecopy(void * s,void * d)564 pagecopy(void *s, void *d)
565 {
566
567 memcpy(d, s, PAGE_SIZE);
568 }
569
570 static __inline pd_entry_t *
pmap_l0(pmap_t pmap,vm_offset_t va)571 pmap_l0(pmap_t pmap, vm_offset_t va)
572 {
573
574 return (&pmap->pm_l0[pmap_l0_index(va)]);
575 }
576
577 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t * l0,vm_offset_t va)578 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
579 {
580 pd_entry_t *l1;
581
582 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
583 return (&l1[pmap_l1_index(va)]);
584 }
585
586 static __inline pd_entry_t *
pmap_l1(pmap_t pmap,vm_offset_t va)587 pmap_l1(pmap_t pmap, vm_offset_t va)
588 {
589 pd_entry_t *l0;
590
591 l0 = pmap_l0(pmap, va);
592 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
593 return (NULL);
594
595 return (pmap_l0_to_l1(l0, va));
596 }
597
598 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t * l1p,vm_offset_t va)599 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
600 {
601 pd_entry_t l1, *l2p;
602
603 l1 = pmap_load(l1p);
604
605 KASSERT(ADDR_IS_CANONICAL(va),
606 ("%s: Address not in canonical form: %lx", __func__, va));
607 /*
608 * The valid bit may be clear if pmap_update_entry() is concurrently
609 * modifying the entry, so for KVA only the entry type may be checked.
610 */
611 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
612 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
613 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
614 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
615 l2p = PHYS_TO_DMAP(PTE_TO_PHYS(l1));
616 return (&l2p[pmap_l2_index(va)]);
617 }
618
619 static __inline pd_entry_t *
pmap_l2(pmap_t pmap,vm_offset_t va)620 pmap_l2(pmap_t pmap, vm_offset_t va)
621 {
622 pd_entry_t *l1;
623
624 l1 = pmap_l1(pmap, va);
625 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
626 return (NULL);
627
628 return (pmap_l1_to_l2(l1, va));
629 }
630
631 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t * l2p,vm_offset_t va)632 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
633 {
634 pd_entry_t l2;
635 pt_entry_t *l3p;
636
637 l2 = pmap_load(l2p);
638
639 KASSERT(ADDR_IS_CANONICAL(va),
640 ("%s: Address not in canonical form: %lx", __func__, va));
641 /*
642 * The valid bit may be clear if pmap_update_entry() is concurrently
643 * modifying the entry, so for KVA only the entry type may be checked.
644 */
645 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
646 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
647 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
648 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
649 l3p = PHYS_TO_DMAP(PTE_TO_PHYS(l2));
650 return (&l3p[pmap_l3_index(va)]);
651 }
652
653 /*
654 * Returns the lowest valid pde for a given virtual address.
655 * The next level may or may not point to a valid page or block.
656 */
657 static __inline pd_entry_t *
pmap_pde(pmap_t pmap,vm_offset_t va,int * level)658 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
659 {
660 pd_entry_t *l0, *l1, *l2, desc;
661
662 l0 = pmap_l0(pmap, va);
663 desc = pmap_load(l0) & ATTR_DESCR_MASK;
664 if (desc != L0_TABLE) {
665 *level = -1;
666 return (NULL);
667 }
668
669 l1 = pmap_l0_to_l1(l0, va);
670 desc = pmap_load(l1) & ATTR_DESCR_MASK;
671 if (desc != L1_TABLE) {
672 *level = 0;
673 return (l0);
674 }
675
676 l2 = pmap_l1_to_l2(l1, va);
677 desc = pmap_load(l2) & ATTR_DESCR_MASK;
678 if (desc != L2_TABLE) {
679 *level = 1;
680 return (l1);
681 }
682
683 *level = 2;
684 return (l2);
685 }
686
687 /*
688 * Returns the lowest valid pte block or table entry for a given virtual
689 * address. If there are no valid entries return NULL and set the level to
690 * the first invalid level.
691 */
692 static __inline pt_entry_t *
pmap_pte(pmap_t pmap,vm_offset_t va,int * level)693 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
694 {
695 pd_entry_t *l1, *l2, desc;
696 pt_entry_t *l3;
697
698 l1 = pmap_l1(pmap, va);
699 if (l1 == NULL) {
700 *level = 0;
701 return (NULL);
702 }
703 desc = pmap_load(l1) & ATTR_DESCR_MASK;
704 if (desc == L1_BLOCK) {
705 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
706 *level = 1;
707 return (l1);
708 }
709
710 if (desc != L1_TABLE) {
711 *level = 1;
712 return (NULL);
713 }
714
715 l2 = pmap_l1_to_l2(l1, va);
716 desc = pmap_load(l2) & ATTR_DESCR_MASK;
717 if (desc == L2_BLOCK) {
718 *level = 2;
719 return (l2);
720 }
721
722 if (desc != L2_TABLE) {
723 *level = 2;
724 return (NULL);
725 }
726
727 *level = 3;
728 l3 = pmap_l2_to_l3(l2, va);
729 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
730 return (NULL);
731
732 return (l3);
733 }
734
735 /*
736 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
737 * level that maps the specified virtual address, then a pointer to that entry
738 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled
739 * and a diagnostic message is provided, in which case this function panics.
740 */
741 static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap,vm_offset_t va,int level,const char * diag)742 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
743 {
744 pd_entry_t *l0p, *l1p, *l2p;
745 pt_entry_t desc, *l3p;
746 int walk_level __diagused;
747
748 KASSERT(level >= 0 && level < 4,
749 ("%s: %s passed an out-of-range level (%d)", __func__, diag,
750 level));
751 l0p = pmap_l0(pmap, va);
752 desc = pmap_load(l0p) & ATTR_DESCR_MASK;
753 if (desc == L0_TABLE && level > 0) {
754 l1p = pmap_l0_to_l1(l0p, va);
755 desc = pmap_load(l1p) & ATTR_DESCR_MASK;
756 if (desc == L1_BLOCK && level == 1) {
757 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
758 return (l1p);
759 }
760 if (desc == L1_TABLE && level > 1) {
761 l2p = pmap_l1_to_l2(l1p, va);
762 desc = pmap_load(l2p) & ATTR_DESCR_MASK;
763 if (desc == L2_BLOCK && level == 2)
764 return (l2p);
765 else if (desc == L2_TABLE && level > 2) {
766 l3p = pmap_l2_to_l3(l2p, va);
767 desc = pmap_load(l3p) & ATTR_DESCR_MASK;
768 if (desc == L3_PAGE && level == 3)
769 return (l3p);
770 else
771 walk_level = 3;
772 } else
773 walk_level = 2;
774 } else
775 walk_level = 1;
776 } else
777 walk_level = 0;
778 KASSERT(diag == NULL,
779 ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
780 diag, va, level, desc, walk_level));
781 return (NULL);
782 }
783
784 bool
pmap_ps_enabled(pmap_t pmap)785 pmap_ps_enabled(pmap_t pmap)
786 {
787 /*
788 * Promotion requires a hypervisor call when the kernel is running
789 * in EL1. To stop this disable superpage support on non-stage 1
790 * pmaps for now.
791 */
792 if (pmap->pm_stage != PM_STAGE1)
793 return (false);
794
795 #ifdef KMSAN
796 /*
797 * The break-before-make in pmap_update_entry() results in a situation
798 * where a CPU may call into the KMSAN runtime while the entry is
799 * invalid. If the entry is used to map the current thread structure,
800 * then the runtime will attempt to access unmapped memory. Avoid this
801 * by simply disabling superpage promotion for the kernel map.
802 */
803 if (pmap == kernel_pmap)
804 return (false);
805 #endif
806
807 return (superpages_enabled != 0);
808 }
809
810 bool
pmap_vs_enabled(void)811 pmap_vs_enabled(void)
812 {
813 /*
814 * 8 and 16 are the only values hardware can support, but allow for the
815 * possibility of artificially restricting the bits, e.g. for testing.
816 */
817 KASSERT(vmids.asid_bits <= 16, ("VMID bits %d > 16", vmids.asid_bits));
818 return (vmids.asid_bits > 8);
819 }
820
821 bool
pmap_get_tables(pmap_t pmap,vm_offset_t va,pd_entry_t ** l0,pd_entry_t ** l1,pd_entry_t ** l2,pt_entry_t ** l3)822 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
823 pd_entry_t **l2, pt_entry_t **l3)
824 {
825 pd_entry_t *l0p, *l1p, *l2p;
826
827 if (pmap->pm_l0 == NULL)
828 return (false);
829
830 l0p = pmap_l0(pmap, va);
831 *l0 = l0p;
832
833 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
834 return (false);
835
836 l1p = pmap_l0_to_l1(l0p, va);
837 *l1 = l1p;
838
839 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
840 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
841 *l2 = NULL;
842 *l3 = NULL;
843 return (true);
844 }
845
846 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
847 return (false);
848
849 l2p = pmap_l1_to_l2(l1p, va);
850 *l2 = l2p;
851
852 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
853 *l3 = NULL;
854 return (true);
855 }
856
857 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
858 return (false);
859
860 *l3 = pmap_l2_to_l3(l2p, va);
861
862 return (true);
863 }
864
865 static __inline int
pmap_l3_valid(pt_entry_t l3)866 pmap_l3_valid(pt_entry_t l3)
867 {
868
869 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
870 }
871
/*
 * Compile-time check that L1 and L2 block descriptors share one encoding.
 * NOTE(review): presumably relied upon by code that handles both levels with
 * a single constant -- confirm against the users.
 */
CTASSERT(L1_BLOCK == L2_BLOCK);
873
874 static pt_entry_t
pmap_pte_memattr(pmap_t pmap,vm_memattr_t memattr)875 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
876 {
877 pt_entry_t val;
878
879 if (pmap->pm_stage == PM_STAGE1) {
880 val = ATTR_S1_IDX(memattr);
881 if (memattr == VM_MEMATTR_DEVICE)
882 val |= ATTR_S1_XN;
883 return (val);
884 }
885
886 val = 0;
887
888 switch (memattr) {
889 case VM_MEMATTR_DEVICE:
890 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
891 ATTR_S2_XN(ATTR_S2_XN_ALL));
892 case VM_MEMATTR_UNCACHEABLE:
893 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
894 case VM_MEMATTR_WRITE_BACK:
895 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
896 case VM_MEMATTR_WRITE_THROUGH:
897 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
898 default:
899 panic("%s: invalid memory attribute %x", __func__, memattr);
900 }
901 }
902
903 static pt_entry_t
pmap_pte_prot(pmap_t pmap,vm_prot_t prot)904 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
905 {
906 pt_entry_t val;
907
908 val = 0;
909 if (pmap->pm_stage == PM_STAGE1) {
910 if ((prot & VM_PROT_EXECUTE) == 0)
911 val |= ATTR_S1_XN;
912 if ((prot & VM_PROT_WRITE) == 0)
913 val |= ATTR_S1_AP(ATTR_S1_AP_RO);
914 } else {
915 if ((prot & VM_PROT_WRITE) != 0)
916 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
917 if ((prot & VM_PROT_READ) != 0)
918 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
919 if ((prot & VM_PROT_EXECUTE) == 0)
920 val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
921 }
922
923 return (val);
924 }
925
926 /*
927 * Checks if the PTE is dirty.
928 */
929 static inline int
pmap_pte_dirty(pmap_t pmap,pt_entry_t pte)930 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
931 {
932
933 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
934
935 if (pmap->pm_stage == PM_STAGE1) {
936 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
937 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
938
939 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
940 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
941 }
942
943 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
944 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
945 }
946
947 static __inline void
pmap_resident_count_inc(pmap_t pmap,int count)948 pmap_resident_count_inc(pmap_t pmap, int count)
949 {
950
951 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
952 pmap->pm_stats.resident_count += count;
953 }
954
955 static __inline void
pmap_resident_count_dec(pmap_t pmap,int count)956 pmap_resident_count_dec(pmap_t pmap, int count)
957 {
958
959 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
960 KASSERT(pmap->pm_stats.resident_count >= count,
961 ("pmap %p resident count underflow %ld %d", pmap,
962 pmap->pm_stats.resident_count, count));
963 pmap->pm_stats.resident_count -= count;
964 }
965
966 static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)967 pmap_early_vtophys(vm_offset_t va)
968 {
969 vm_paddr_t pa_page;
970
971 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
972 return (pa_page | (va & PAR_LOW_MASK));
973 }
974
975 /* State of the bootstrapped DMAP page tables */
976 struct pmap_bootstrap_state {
977 pt_entry_t *l1;
978 pt_entry_t *l2;
979 pt_entry_t *l3;
980 vm_offset_t freemempos;
981 vm_offset_t va;
982 vm_paddr_t pa;
983 pt_entry_t table_attrs;
984 u_int l0_slot;
985 u_int l1_slot;
986 u_int l2_slot;
987 bool dmap_valid;
988 };
989
990 /* The bootstrap state */
991 static struct pmap_bootstrap_state bs_state = {
992 .l1 = NULL,
993 .l2 = NULL,
994 .l3 = NULL,
995 .table_attrs = TATTR_PXN_TABLE,
996 .l0_slot = L0_ENTRIES,
997 .l1_slot = Ln_ENTRIES,
998 .l2_slot = Ln_ENTRIES,
999 .dmap_valid = false,
1000 };
1001
1002 static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state * state)1003 pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
1004 {
1005 vm_paddr_t l1_pa;
1006 pd_entry_t l0e;
1007 u_int l0_slot;
1008
1009 /* Link the level 0 table to a level 1 table */
1010 l0_slot = pmap_l0_index(state->va);
1011 if (l0_slot != state->l0_slot) {
1012 /*
1013 * Make sure we move from a low address to high address
1014 * before the DMAP region is ready. This ensures we never
1015 * modify an existing mapping until we can map from a
1016 * physical address to a virtual address.
1017 */
1018 MPASS(state->l0_slot < l0_slot ||
1019 state->l0_slot == L0_ENTRIES ||
1020 state->dmap_valid);
1021
1022 /* Reset lower levels */
1023 state->l2 = NULL;
1024 state->l3 = NULL;
1025 state->l1_slot = Ln_ENTRIES;
1026 state->l2_slot = Ln_ENTRIES;
1027
1028 /* Check the existing L0 entry */
1029 state->l0_slot = l0_slot;
1030 if (state->dmap_valid) {
1031 l0e = pagetable_l0_ttbr1[l0_slot];
1032 if ((l0e & ATTR_DESCR_VALID) != 0) {
1033 MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
1034 l1_pa = PTE_TO_PHYS(l0e);
1035 state->l1 = PHYS_TO_DMAP(l1_pa);
1036 return;
1037 }
1038 }
1039
1040 /* Create a new L0 table entry */
1041 state->l1 = (pt_entry_t *)state->freemempos;
1042 memset_early(state->l1, 0, PAGE_SIZE);
1043 state->freemempos += PAGE_SIZE;
1044
1045 l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
1046 MPASS((l1_pa & Ln_TABLE_MASK) == 0);
1047 MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
1048 pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
1049 TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
1050 }
1051 KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
1052 }
1053
1054 static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state * state)1055 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
1056 {
1057 vm_paddr_t l2_pa;
1058 pd_entry_t l1e;
1059 u_int l1_slot;
1060
1061 /* Make sure there is a valid L0 -> L1 table */
1062 pmap_bootstrap_l0_table(state);
1063
1064 /* Link the level 1 table to a level 2 table */
1065 l1_slot = pmap_l1_index(state->va);
1066 if (l1_slot != state->l1_slot) {
1067 /* See pmap_bootstrap_l0_table for a description */
1068 MPASS(state->l1_slot < l1_slot ||
1069 state->l1_slot == Ln_ENTRIES ||
1070 state->dmap_valid);
1071
1072 /* Reset lower levels */
1073 state->l3 = NULL;
1074 state->l2_slot = Ln_ENTRIES;
1075
1076 /* Check the existing L1 entry */
1077 state->l1_slot = l1_slot;
1078 if (state->dmap_valid) {
1079 l1e = state->l1[l1_slot];
1080 if ((l1e & ATTR_DESCR_VALID) != 0) {
1081 MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
1082 l2_pa = PTE_TO_PHYS(l1e);
1083 state->l2 = PHYS_TO_DMAP(l2_pa);
1084 return;
1085 }
1086 }
1087
1088 /* Create a new L1 table entry */
1089 state->l2 = (pt_entry_t *)state->freemempos;
1090 memset_early(state->l2, 0, PAGE_SIZE);
1091 state->freemempos += PAGE_SIZE;
1092
1093 l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1094 MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1095 MPASS(state->l1[l1_slot] == 0);
1096 pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
1097 state->table_attrs | L1_TABLE);
1098 }
1099 KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1100 }
1101
1102 static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state * state)1103 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1104 {
1105 vm_paddr_t l3_pa;
1106 pd_entry_t l2e;
1107 u_int l2_slot;
1108
1109 /* Make sure there is a valid L1 -> L2 table */
1110 pmap_bootstrap_l1_table(state);
1111
1112 /* Link the level 2 table to a level 3 table */
1113 l2_slot = pmap_l2_index(state->va);
1114 if (l2_slot != state->l2_slot) {
1115 /* See pmap_bootstrap_l0_table for a description */
1116 MPASS(state->l2_slot < l2_slot ||
1117 state->l2_slot == Ln_ENTRIES ||
1118 state->dmap_valid);
1119
1120 /* Check the existing L2 entry */
1121 state->l2_slot = l2_slot;
1122 if (state->dmap_valid) {
1123 l2e = state->l2[l2_slot];
1124 if ((l2e & ATTR_DESCR_VALID) != 0) {
1125 MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1126 l3_pa = PTE_TO_PHYS(l2e);
1127 state->l3 = PHYS_TO_DMAP(l3_pa);
1128 return;
1129 }
1130 }
1131
1132 /* Create a new L2 table entry */
1133 state->l3 = (pt_entry_t *)state->freemempos;
1134 memset_early(state->l3, 0, PAGE_SIZE);
1135 state->freemempos += PAGE_SIZE;
1136
1137 l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1138 MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1139 MPASS(state->l2[l2_slot] == 0);
1140 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
1141 state->table_attrs | L2_TABLE);
1142 }
1143 KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1144 }
1145
1146 static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state * state,int i)1147 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1148 {
1149 pt_entry_t contig;
1150 u_int l2_slot;
1151 bool first;
1152
1153 if ((physmap[i + 1] - state->pa) < L2_SIZE)
1154 return;
1155
1156 /* Make sure there is a valid L1 table */
1157 pmap_bootstrap_l1_table(state);
1158
1159 MPASS((state->va & L2_OFFSET) == 0);
1160 for (first = true, contig = 0;
1161 state->va < DMAP_MAX_ADDRESS &&
1162 (physmap[i + 1] - state->pa) >= L2_SIZE;
1163 state->va += L2_SIZE, state->pa += L2_SIZE) {
1164 /*
1165 * Stop if we are about to walk off the end of what the
1166 * current L1 slot can address.
1167 */
1168 if (!first && (state->pa & L1_OFFSET) == 0)
1169 break;
1170
1171 /*
1172 * If we have an aligned, contiguous chunk of L2C_ENTRIES
1173 * L2 blocks, set the contiguous bit within each PTE so that
1174 * the chunk can be cached using only one TLB entry.
1175 */
1176 if ((state->pa & L2C_OFFSET) == 0) {
1177 if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
1178 physmap[i + 1] - state->pa >= L2C_SIZE) {
1179 contig = ATTR_CONTIGUOUS;
1180 } else {
1181 contig = 0;
1182 }
1183 }
1184
1185 first = false;
1186 l2_slot = pmap_l2_index(state->va);
1187 MPASS((state->pa & L2_OFFSET) == 0);
1188 MPASS(state->l2[l2_slot] == 0);
1189 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
1190 ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
1191 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
1192 }
1193 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1194 }
1195
1196 static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state * state,int i)1197 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1198 {
1199 pt_entry_t contig;
1200 u_int l3_slot;
1201 bool first;
1202
1203 if (physmap[i + 1] - state->pa < L3_SIZE)
1204 return;
1205
1206 /* Make sure there is a valid L2 table */
1207 pmap_bootstrap_l2_table(state);
1208
1209 MPASS((state->va & L3_OFFSET) == 0);
1210 for (first = true, contig = 0;
1211 state->va < DMAP_MAX_ADDRESS &&
1212 physmap[i + 1] - state->pa >= L3_SIZE;
1213 state->va += L3_SIZE, state->pa += L3_SIZE) {
1214 /*
1215 * Stop if we are about to walk off the end of what the
1216 * current L2 slot can address.
1217 */
1218 if (!first && (state->pa & L2_OFFSET) == 0)
1219 break;
1220
1221 /*
1222 * If we have an aligned, contiguous chunk of L3C_ENTRIES
1223 * L3 pages, set the contiguous bit within each PTE so that
1224 * the chunk can be cached using only one TLB entry.
1225 */
1226 if ((state->pa & L3C_OFFSET) == 0) {
1227 if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
1228 physmap[i + 1] - state->pa >= L3C_SIZE) {
1229 contig = ATTR_CONTIGUOUS;
1230 } else {
1231 contig = 0;
1232 }
1233 }
1234
1235 first = false;
1236 l3_slot = pmap_l3_index(state->va);
1237 MPASS((state->pa & L3_OFFSET) == 0);
1238 MPASS(state->l3[l3_slot] == 0);
1239 pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
1240 ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
1241 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
1242 }
1243 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1244 }
1245
1246 void
pmap_bootstrap_dmap(vm_size_t kernlen)1247 pmap_bootstrap_dmap(vm_size_t kernlen)
1248 {
1249 vm_paddr_t start_pa, pa;
1250 uint64_t tcr;
1251 int i;
1252
1253 tcr = READ_SPECIALREG(tcr_el1);
1254
1255 /* Verify that the ASID is set through TTBR0. */
1256 KASSERT((tcr & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1257
1258 if ((tcr & TCR_DS) != 0)
1259 pmap_lpa_enabled = true;
1260
1261 pmap_l1_supported = L1_BLOCKS_SUPPORTED;
1262
1263 start_pa = pmap_early_vtophys(KERNBASE);
1264
1265 bs_state.freemempos = KERNBASE + kernlen;
1266 bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1267
1268 /* Fill in physmap array. */
1269 physmap_idx = physmem_avail(physmap, nitems(physmap));
1270
1271 dmap_phys_base = physmap[0] & ~L1_OFFSET;
1272 dmap_phys_max = 0;
1273 dmap_max_addr = 0;
1274
1275 for (i = 0; i < physmap_idx; i += 2) {
1276 bs_state.pa = physmap[i] & ~L3_OFFSET;
1277 bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1278
1279 /* Create L3 mappings at the start of the region */
1280 if ((bs_state.pa & L2_OFFSET) != 0)
1281 pmap_bootstrap_l3_page(&bs_state, i);
1282 MPASS(bs_state.pa <= physmap[i + 1]);
1283
1284 if (L1_BLOCKS_SUPPORTED) {
1285 /* Create L2 mappings at the start of the region */
1286 if ((bs_state.pa & L1_OFFSET) != 0)
1287 pmap_bootstrap_l2_block(&bs_state, i);
1288 MPASS(bs_state.pa <= physmap[i + 1]);
1289
1290 /* Create the main L1 block mappings */
1291 for (; bs_state.va < DMAP_MAX_ADDRESS &&
1292 (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1293 bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1294 /* Make sure there is a valid L1 table */
1295 pmap_bootstrap_l0_table(&bs_state);
1296 MPASS((bs_state.pa & L1_OFFSET) == 0);
1297 pmap_store(
1298 &bs_state.l1[pmap_l1_index(bs_state.va)],
1299 PHYS_TO_PTE(bs_state.pa) | ATTR_AF |
1300 pmap_sh_attr |
1301 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1302 ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
1303 }
1304 MPASS(bs_state.pa <= physmap[i + 1]);
1305
1306 /* Create L2 mappings at the end of the region */
1307 pmap_bootstrap_l2_block(&bs_state, i);
1308 } else {
1309 while (bs_state.va < DMAP_MAX_ADDRESS &&
1310 (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1311 pmap_bootstrap_l2_block(&bs_state, i);
1312 }
1313 }
1314 MPASS(bs_state.pa <= physmap[i + 1]);
1315
1316 /* Create L3 mappings at the end of the region */
1317 pmap_bootstrap_l3_page(&bs_state, i);
1318 MPASS(bs_state.pa == physmap[i + 1]);
1319
1320 if (bs_state.pa > dmap_phys_max) {
1321 dmap_phys_max = bs_state.pa;
1322 dmap_max_addr = bs_state.va;
1323 }
1324 }
1325
1326 pmap_s1_invalidate_all_kernel();
1327
1328 bs_state.dmap_valid = true;
1329
1330 /* Exclude the kernel and DMAP region */
1331 pa = pmap_early_vtophys(bs_state.freemempos);
1332 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1333 }
1334
1335 static void
pmap_bootstrap_l2(vm_offset_t va)1336 pmap_bootstrap_l2(vm_offset_t va)
1337 {
1338 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1339
1340 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1341 bs_state.va = va;
1342
1343 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1344 pmap_bootstrap_l1_table(&bs_state);
1345 }
1346
1347 static void
pmap_bootstrap_l3(vm_offset_t va)1348 pmap_bootstrap_l3(vm_offset_t va)
1349 {
1350 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1351
1352 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1353 bs_state.va = va;
1354
1355 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1356 pmap_bootstrap_l2_table(&bs_state);
1357 }
1358
1359 /*
1360 * Bootstrap the system enough to run with virtual memory.
1361 */
1362 void
pmap_bootstrap(void)1363 pmap_bootstrap(void)
1364 {
1365 vm_offset_t dpcpu, msgbufpv;
1366 vm_paddr_t start_pa, pa;
1367 size_t largest_phys_size;
1368
1369 /* Set this early so we can use the pagetable walking functions */
1370 kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1371 mtx_init(&kernel_pmap->pm_mtx, "kernel pmap", NULL, MTX_DEF);
1372 kernel_pmap->pm_l0_paddr =
1373 pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1374 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1375 vm_radix_init(&kernel_pmap->pm_root);
1376 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1377 kernel_pmap->pm_stage = PM_STAGE1;
1378 kernel_pmap->pm_levels = 4;
1379 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1380 kernel_pmap->pm_asid_set = &asids;
1381
1382 /* Reserve some VA space for early BIOS/ACPI mapping */
1383 preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1384
1385 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1386 virtual_avail = roundup2(virtual_avail, L1_SIZE);
1387 virtual_end = VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE - L2_SIZE;
1388 kernel_vm_end = virtual_avail;
1389
1390 /*
1391 * We only use PXN when we know nothing will be executed from it, e.g.
1392 * the DMAP region.
1393 */
1394 bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1395
1396 /*
1397 * Find the physical memory we could use. This needs to be after we
1398 * exclude any memory that is mapped into the DMAP region but should
1399 * not be used by the kernel, e.g. some UEFI memory types.
1400 */
1401 physmap_idx = physmem_avail(physmap, nitems(physmap));
1402
1403 /*
1404 * Find space for early allocations. We search for the largest
1405 * region. This is because the user may choose a large msgbuf.
1406 * This could be smarter, e.g. to allow multiple regions to be
1407 * used & switch to the next when one is full.
1408 */
1409 largest_phys_size = 0;
1410 for (int i = 0; i < physmap_idx; i += 2) {
1411 if ((physmap[i + 1] - physmap[i]) > largest_phys_size) {
1412 largest_phys_size = physmap[i + 1] - physmap[i];
1413 bs_state.freemempos = PHYS_TO_DMAP_ADDR(physmap[i]);
1414 }
1415 }
1416
1417 start_pa = pmap_early_vtophys(bs_state.freemempos);
1418
1419 /*
1420 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the
1421 * loader allocated the first and only l2 page table page used to map
1422 * the kernel, preloaded files and module metadata.
1423 */
1424 pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1425 /* And the l3 tables for the early devmap */
1426 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1427
1428 pmap_s1_invalidate_all_kernel();
1429
1430 #define alloc_pages(var, np) \
1431 (var) = bs_state.freemempos; \
1432 bs_state.freemempos += (np * PAGE_SIZE); \
1433 memset_early((char *)(var), 0, ((np) * PAGE_SIZE));
1434
1435 /* Allocate dynamic per-cpu area. */
1436 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1437 dpcpu_init((void *)dpcpu, 0);
1438
1439 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1440 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1441 msgbufp = (void *)msgbufpv;
1442
1443 /* Allocate space for the CPU0 CMAP */
1444 bs_state.va = virtual_end;
1445 pmap_bootstrap_l2_table(&bs_state);
1446 pmap_store(&bs_state.l3[pmap_l3_index(bs_state.va)],
1447 PHYS_TO_PTE(pmap_early_vtophys((vm_offset_t)bs_state.l3)) |
1448 ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
1449 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L3_PAGE);
1450 dsb(ishst);
1451
1452 mtx_init(&cmap_lock, "SYSMAPS", NULL, MTX_DEF);
1453 cmap1_addr = (void *)(virtual_end + L3_SIZE);
1454 cmap1_pte = &bs_state.l3[pmap_l3_index((vm_offset_t)cmap1_addr)];
1455
1456 pa = pmap_early_vtophys(bs_state.freemempos);
1457
1458 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1459 }
1460
1461 #if defined(KASAN) || defined(KMSAN)
1462 static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa,vm_paddr_t end_pa,vm_offset_t * vap,vm_offset_t eva)1463 pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1464 vm_offset_t *vap, vm_offset_t eva)
1465 {
1466 vm_paddr_t pa;
1467 vm_offset_t va;
1468 pd_entry_t *l2;
1469
1470 va = *vap;
1471 pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1472 for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
1473 l2 = pmap_l2(kernel_pmap, va);
1474
1475 /*
1476 * KASAN stack checking results in us having already allocated
1477 * part of our shadow map, so we can just skip those segments.
1478 */
1479 if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1480 pa += L2_SIZE;
1481 continue;
1482 }
1483
1484 bzero_early(PHYS_TO_DMAP(pa), L2_SIZE);
1485 physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
1486 pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
1487 }
1488 *vap = va;
1489 }
1490
1491 /*
1492 * Finish constructing the initial shadow map:
1493 * - Count how many pages from KERNBASE to virtual_avail (scaled for
1494 * shadow map)
1495 * - Map that entire range using L2 superpages.
1496 */
1497 static void
pmap_bootstrap_san1(vm_offset_t va,int scale)1498 pmap_bootstrap_san1(vm_offset_t va, int scale)
1499 {
1500 vm_offset_t eva;
1501 vm_paddr_t kernstart;
1502 int i;
1503
1504 kernstart = pmap_early_vtophys(KERNBASE);
1505
1506 /*
1507 * Rebuild physmap one more time, we may have excluded more regions from
1508 * allocation since pmap_bootstrap().
1509 */
1510 physmap_idx = physmem_avail(physmap, nitems(physmap));
1511
1512 eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1513
1514 /*
1515 * Find a slot in the physmap large enough for what we needed. We try to put
1516 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1517 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1518 */
1519 for (i = physmap_idx - 2; i >= 0; i -= 2) {
1520 vm_paddr_t plow, phigh;
1521
1522 /* L2 mappings must be backed by memory that is L2-aligned */
1523 plow = roundup2(physmap[i], L2_SIZE);
1524 phigh = physmap[i + 1];
1525 if (plow >= phigh)
1526 continue;
1527 if (kernstart >= plow && kernstart < phigh)
1528 phigh = kernstart;
1529 if (phigh - plow >= L2_SIZE) {
1530 pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1531 if (va >= eva)
1532 break;
1533 }
1534 }
1535 if (i < 0)
1536 panic("Could not find phys region for shadow map");
1537
1538 /*
1539 * Done. We should now have a valid shadow address mapped for all KVA
1540 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1541 * shadow accesses by the sanitizer runtime will succeed for this range.
1542 * When the kernel virtual address range is later expanded, as will
1543 * happen in vm_mem_init(), the shadow map will be grown as well. This
1544 * is handled by pmap_san_enter().
1545 */
1546 }
1547
1548 void
pmap_bootstrap_san(void)1549 pmap_bootstrap_san(void)
1550 {
1551 #ifdef KASAN
1552 pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
1553 #else
1554 static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1555 static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1556 pd_entry_t *l0, *l1;
1557
1558 if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
1559 panic("initial kernel map is too large");
1560
1561 l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
1562 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1563 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
1564 l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
1565 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1566 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
1567 pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);
1568
1569 l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
1570 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1571 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
1572 l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
1573 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1574 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
1575 pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
1576 #endif
1577 }
1578 #endif
1579
1580 /*
1581 * Initialize a vm_page's machine-dependent fields.
1582 */
1583 void
pmap_page_init(vm_page_t m)1584 pmap_page_init(vm_page_t m)
1585 {
1586
1587 TAILQ_INIT(&m->md.pv_list);
1588 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1589 }
1590
1591 static void
pmap_init_asids(struct asid_set * set,int bits)1592 pmap_init_asids(struct asid_set *set, int bits)
1593 {
1594 int i;
1595
1596 set->asid_bits = bits;
1597
1598 /*
1599 * We may be too early in the overall initialization process to use
1600 * bit_alloc().
1601 */
1602 set->asid_set_size = 1 << set->asid_bits;
1603 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1604 M_WAITOK | M_ZERO);
1605 for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1606 bit_set(set->asid_set, i);
1607 set->asid_next = ASID_FIRST_AVAILABLE;
1608 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1609 }
1610
1611 static void
pmap_init_pv_table(void)1612 pmap_init_pv_table(void)
1613 {
1614 struct vm_phys_seg *seg, *next_seg;
1615 struct pmap_large_md_page *pvd;
1616 vm_size_t s;
1617 int domain, i, j, pages;
1618
1619 /*
1620 * We depend on the size being evenly divisible into a page so
1621 * that the pv_table array can be indexed directly while
1622 * safely spanning multiple pages from different domains.
1623 */
1624 CTASSERT(PAGE_SIZE % sizeof(*pvd) == 0);
1625
1626 /*
1627 * Calculate the size of the array.
1628 */
1629 s = 0;
1630 for (i = 0; i < vm_phys_nsegs; i++) {
1631 seg = &vm_phys_segs[i];
1632 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1633 pmap_l2_pindex(seg->start);
1634 s += round_page(pages * sizeof(*pvd));
1635 }
1636 pv_table = kva_alloc(s);
1637 if (pv_table == NULL)
1638 panic("%s: kva_alloc failed\n", __func__);
1639
1640 /*
1641 * Iterate physical segments to allocate domain-local memory for PV
1642 * list headers.
1643 */
1644 pvd = pv_table;
1645 for (i = 0; i < vm_phys_nsegs; i++) {
1646 seg = &vm_phys_segs[i];
1647 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1648 pmap_l2_pindex(seg->start);
1649 domain = seg->domain;
1650
1651 s = round_page(pages * sizeof(*pvd));
1652
1653 for (j = 0; j < s; j += PAGE_SIZE) {
1654 vm_page_t m = vm_page_alloc_noobj_domain(domain,
1655 VM_ALLOC_ZERO);
1656 if (m == NULL)
1657 panic("failed to allocate PV table page");
1658 pmap_qenter((char *)pvd + j, &m, 1);
1659 }
1660
1661 for (j = 0; j < s / sizeof(*pvd); j++) {
1662 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1663 TAILQ_INIT(&pvd->pv_page.pv_list);
1664 pvd++;
1665 }
1666 }
1667 pvd = &pv_dummy_large;
1668 memset(pvd, 0, sizeof(*pvd));
1669 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1670 TAILQ_INIT(&pvd->pv_page.pv_list);
1671
1672 /*
1673 * Set pointers from vm_phys_segs to pv_table.
1674 */
1675 for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1676 seg = &vm_phys_segs[i];
1677 seg->md_first = pvd;
1678 pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1679 pmap_l2_pindex(seg->start);
1680
1681 /*
1682 * If there is a following segment, and the final
1683 * superpage of this segment and the initial superpage
1684 * of the next segment are the same then adjust the
1685 * pv_table entry for that next segment down by one so
1686 * that the pv_table entries will be shared.
1687 */
1688 if (i + 1 < vm_phys_nsegs) {
1689 next_seg = &vm_phys_segs[i + 1];
1690 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1691 pmap_l2_pindex(next_seg->start)) {
1692 pvd--;
1693 }
1694 }
1695 }
1696 }
1697
1698 static cpu_feat_en
pmap_dbm_check(const struct cpu_feat * feat __unused,u_int midr __unused)1699 pmap_dbm_check(const struct cpu_feat *feat __unused, u_int midr __unused)
1700 {
1701 uint64_t id_aa64mmfr1;
1702
1703 id_aa64mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1704 if (ID_AA64MMFR1_HAFDBS_VAL(id_aa64mmfr1) >=
1705 ID_AA64MMFR1_HAFDBS_AF_DBS)
1706 return (FEAT_DEFAULT_ENABLE);
1707
1708 return (FEAT_ALWAYS_DISABLE);
1709 }
1710
1711 static bool
pmap_dbm_has_errata(const struct cpu_feat * feat __unused,u_int midr,u_int ** errata_list,u_int * errata_count)1712 pmap_dbm_has_errata(const struct cpu_feat *feat __unused, u_int midr,
1713 u_int **errata_list, u_int *errata_count)
1714 {
1715 /* Disable on Cortex-A55 for erratum 1024718 - all revisions */
1716 if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
1717 CPU_PART(midr) == CPU_PART_CORTEX_A55) {
1718 static u_int errata_id = 1024718;
1719
1720 *errata_list = &errata_id;
1721 *errata_count = 1;
1722 return (true);
1723 }
1724
1725 /* Disable on Cortex-A510 for erratum 2051678 - r0p0 to r0p2 */
1726 if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
1727 0, 0, 0, 2)) {
1728 static u_int errata_id = 2051678;
1729
1730 *errata_list = &errata_id;
1731 *errata_count = 1;
1732 return (true);
1733 }
1734
1735 return (false);
1736 }
1737
1738 static bool
pmap_dbm_enable(const struct cpu_feat * feat __unused,cpu_feat_errata errata_status,u_int * errata_list __unused,u_int errata_count)1739 pmap_dbm_enable(const struct cpu_feat *feat __unused,
1740 cpu_feat_errata errata_status, u_int *errata_list __unused,
1741 u_int errata_count)
1742 {
1743 uint64_t tcr;
1744
1745 /* Skip if there is an erratum affecting DBM */
1746 if (errata_status != ERRATA_NONE)
1747 return (false);
1748
1749 tcr = READ_SPECIALREG(tcr_el1) | TCR_HD;
1750 WRITE_SPECIALREG(tcr_el1, tcr);
1751 isb();
1752 /* Flush the local TLB for the TCR_HD flag change */
1753 dsb(nshst);
1754 __asm __volatile("tlbi vmalle1");
1755 dsb(nsh);
1756 isb();
1757
1758 return (true);
1759 }
1760
1761 CPU_FEAT(feat_hafdbs, "Hardware management of the Access flag and dirty state",
1762 pmap_dbm_check, pmap_dbm_has_errata, pmap_dbm_enable, NULL,
1763 CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU);
1764
1765 static cpu_feat_en
pmap_multiple_tlbi_check(const struct cpu_feat * feat __unused,u_int midr)1766 pmap_multiple_tlbi_check(const struct cpu_feat *feat __unused, u_int midr)
1767 {
1768 /*
1769 * Cortex-A55 erratum 2441007 (Cat B rare)
1770 * Present in all revisions
1771 */
1772 if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
1773 CPU_PART(midr) == CPU_PART_CORTEX_A55)
1774 return (FEAT_DEFAULT_DISABLE);
1775
1776 /*
1777 * Cortex-A76 erratum 1286807 (Cat B rare)
1778 * Present in r0p0 - r3p0
1779 * Fixed in r3p1
1780 */
1781 if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A76,
1782 0, 0, 3, 0))
1783 return (FEAT_DEFAULT_DISABLE);
1784
1785 /*
1786 * Cortex-A510 erratum 2441009 (Cat B rare)
1787 * Present in r0p0 - r1p1
1788 * Fixed in r1p2
1789 */
1790 if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
1791 0, 0, 1, 1))
1792 return (FEAT_DEFAULT_DISABLE);
1793
1794 return (FEAT_ALWAYS_DISABLE);
1795 }
1796
1797 static bool
pmap_multiple_tlbi_enable(const struct cpu_feat * feat __unused,cpu_feat_errata errata_status,u_int * errata_list __unused,u_int errata_count __unused)1798 pmap_multiple_tlbi_enable(const struct cpu_feat *feat __unused,
1799 cpu_feat_errata errata_status, u_int *errata_list __unused,
1800 u_int errata_count __unused)
1801 {
1802 pmap_multiple_tlbi = true;
1803 return (true);
1804 }
1805
1806 CPU_FEAT(errata_multi_tlbi, "Multiple TLBI errata",
1807 pmap_multiple_tlbi_check, NULL, pmap_multiple_tlbi_enable, NULL,
1808 CPU_FEAT_EARLY_BOOT | CPU_FEAT_PER_CPU);
1809
1810 /*
1811 * Initialize the pmap module.
1812 *
1813 * Called by vm_mem_init(), to initialize any structures that the pmap
1814 * system needs to map virtual memory.
1815 */
1816 void
pmap_init(void)1817 pmap_init(void)
1818 {
1819 uint64_t mmfr1;
1820 int i, vmid_bits;
1821
1822 /*
1823 * Are large page mappings enabled?
1824 */
1825 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1826 if (superpages_enabled) {
1827 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1828 ("pmap_init: can't assign to pagesizes[1]"));
1829 pagesizes[1] = L3C_SIZE;
1830 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1831 ("pmap_init: can't assign to pagesizes[2]"));
1832 pagesizes[2] = L2_SIZE;
1833 if (L1_BLOCKS_SUPPORTED) {
1834 KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
1835 ("pmap_init: can't assign to pagesizes[3]"));
1836 pagesizes[3] = L1_SIZE;
1837 }
1838 }
1839
1840 /*
1841 * Initialize the ASID allocator.
1842 */
1843 pmap_init_asids(&asids,
1844 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1845
1846 if (has_hyp()) {
1847 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1848 vmid_bits = 8;
1849
1850 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1851 ID_AA64MMFR1_VMIDBits_16)
1852 vmid_bits = 16;
1853 pmap_init_asids(&vmids, vmid_bits);
1854 }
1855
1856 /*
1857 * Initialize pv chunk lists.
1858 */
1859 for (i = 0; i < PMAP_MEMDOM; i++) {
1860 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1861 MTX_DEF);
1862 TAILQ_INIT(&pv_chunks[i].pvc_list);
1863 }
1864 pmap_init_pv_table();
1865
1866 vm_initialized = 1;
1867 }
1868
1869 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1870 "L1 (1GB/64GB) page mapping counters");
1871
1872 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
1873 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
1874 &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");
1875
1876 SYSCTL_BOOL(_vm_pmap_l1, OID_AUTO, supported, CTLFLAG_RD, &pmap_l1_supported,
1877 0, "L1 blocks are supported");
1878
1879 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1880 "L2C (32MB/1GB) page mapping counters");
1881
1882 static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
1883 SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
1884 &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");
1885
1886 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1887 "2MB page mapping counters");
1888
1889 static COUNTER_U64_DEFINE_EARLY(pmap_l2_demotions);
1890 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1891 &pmap_l2_demotions, "L2 (2MB/32MB) page demotions");
1892
1893 static COUNTER_U64_DEFINE_EARLY(pmap_l2_mappings);
1894 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1895 &pmap_l2_mappings, "L2 (2MB/32MB) page mappings");
1896
1897 static COUNTER_U64_DEFINE_EARLY(pmap_l2_p_failures);
1898 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1899 &pmap_l2_p_failures, "L2 (2MB/32MB) page promotion failures");
1900
1901 static COUNTER_U64_DEFINE_EARLY(pmap_l2_promotions);
1902 SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1903 &pmap_l2_promotions, "L2 (2MB/32MB) page promotions");
1904
/*
 * vm.pmap.l3c: read-only sysctl node grouping the L3C mapping statistics
 * declared below.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L3C (64KB/2MB) page mapping counters");

/* vm.pmap.l3c.demotions: read-only 64-bit counter. */
static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");

/* vm.pmap.l3c.mappings: read-only 64-bit counter. */
static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");

/* vm.pmap.l3c.p_failures: read-only 64-bit counter. */
static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");

/* vm.pmap.l3c.promotions: read-only 64-bit counter. */
static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1923
/*
 * Issue one TLB invalidation for the kernel address encoded in the TLBI
 * operand "r".
 *
 * When "final_only" is true, only a cached final-level entry (an
 * L{1,2}_BLOCK or L3_PAGE entry) is invalidated.  When it is false, cached
 * intermediate-level entries (L{0,1,2}_TABLE entries) are invalidated as
 * well.  No barriers are issued here; that is left to the caller.
 */
static __inline void
pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
{
	if (!final_only) {
		/* Final- and intermediate-level entries. */
		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
		return;
	}

	/* Final-level entry only. */
	__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
}
1938
/*
 * Issue one TLB invalidation for the user address and ASID encoded in the
 * TLBI operand "r".
 *
 * When "final_only" is true, only a cached final-level entry is invalidated;
 * otherwise cached intermediate-level (table) entries are invalidated too.
 * No barriers are issued here; that is left to the caller.
 */
static __inline void
pmap_s1_invalidate_user(uint64_t r, bool final_only)
{
	if (!final_only) {
		/* Final- and intermediate-level entries. */
		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
		return;
	}

	/* Final-level entry only. */
	__asm __volatile("tlbi vale1is, %0" : : "r" (r));
}
1947
/*
 * Invalidates any cached final- and optionally intermediate-level TLB entries
 * for the specified virtual address in the given virtual address space.
 *
 * "pmap" must be a stage 1 pmap (asserted).  For the kernel pmap the
 * invalidation goes through pmap_s1_invalidate_kernel(); for any other pmap
 * the pmap's ASID is merged into the TLBI operand and
 * pmap_s1_invalidate_user() is used.  If "final_only" is true, only the
 * final-level entry is invalidated.
 */
static __inline void
pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
{
	uint64_t r;

	PMAP_ASSERT_STAGE1(pmap);

	/* Order earlier stores (e.g., the PTE update) before the TLBI. */
	dsb(ishst);
	r = TLBI_VA(va);
	if (pmap == kernel_pmap) {
		pmap_s1_invalidate_kernel(r, final_only);
	} else {
		/* Tag the operand with this address space's ASID. */
		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		pmap_s1_invalidate_user(r, final_only);
	}
	if (pmap_multiple_tlbi) {
		/*
		 * NOTE(review): when pmap_multiple_tlbi is set, a second TLBI
		 * is issued after a barrier.  Presumably a CPU erratum
		 * workaround -- confirm where the flag is set.
		 */
		dsb(ish);
		__asm __volatile("tlbi vale1is, xzr" ::: "memory");
	}
	/* Wait for the invalidation to complete, then resynchronize. */
	dsb(ish);
	isb();
}
1974
1975 static __inline void
pmap_s2_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1976 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1977 {
1978 PMAP_ASSERT_STAGE2(pmap);
1979 MPASS(pmap_stage2_invalidate_range != NULL);
1980 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1981 final_only);
1982 }
1983
1984 static __inline void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1985 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1986 {
1987 if (pmap->pm_stage == PM_STAGE1)
1988 pmap_s1_invalidate_page(pmap, va, final_only);
1989 else
1990 pmap_s2_invalidate_page(pmap, va, final_only);
1991 }
1992
/*
 * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK
 * mappings.  Otherwise, use stride L3_SIZE.
 *
 * Issues one TLBI per "stride" bytes over the stage 1 virtual address range
 * [sva, eva) of "pmap".  If "final_only" is true, only final-level entries
 * are invalidated.  The whole sequence is bracketed by a single pair of
 * barriers rather than one pair per TLBI.
 */
static __inline void
pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    vm_offset_t stride, bool final_only)
{
	uint64_t end, r, start;

	PMAP_ASSERT_STAGE1(pmap);

	/* Order earlier stores (e.g., the PTE updates) before the TLBIs. */
	dsb(ishst);
	if (pmap == kernel_pmap) {
		start = TLBI_VA(sva);
		end = TLBI_VA(eva);
		for (r = start; r < end; r += TLBI_VA(stride))
			pmap_s1_invalidate_kernel(r, final_only);
	} else {
		/*
		 * Both bounds carry the same ASID bits, so the loop's
		 * comparison is decided by the address portion alone.
		 */
		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		start |= TLBI_VA(sva);
		end |= TLBI_VA(eva);
		for (r = start; r < end; r += TLBI_VA(stride))
			pmap_s1_invalidate_user(r, final_only);
	}
	if (pmap_multiple_tlbi) {
		/*
		 * NOTE(review): when pmap_multiple_tlbi is set, one more TLBI
		 * is issued after a barrier.  Presumably a CPU erratum
		 * workaround -- confirm where the flag is set.
		 */
		dsb(ish);
		__asm __volatile("tlbi vale1is, xzr" ::: "memory");
	}
	/* Wait for the invalidations to complete, then resynchronize. */
	dsb(ish);
	isb();
}
2025
2026 /*
2027 * Invalidates any cached final- and optionally intermediate-level TLB entries
2028 * for the specified virtual address range in the given virtual address space.
2029 */
2030 static __inline void
pmap_s1_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)2031 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2032 bool final_only)
2033 {
2034 pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only);
2035 }
2036
/*
 * Invalidate the TLB entries for the range [sva, eva) of a stage 2 pmap by
 * calling the pmap_stage2_invalidate_range hook, which must be set
 * (asserted).  "final_only" is passed through unchanged.
 */
static __inline void
pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    bool final_only)
{
	PMAP_ASSERT_STAGE2(pmap);
	MPASS(pmap_stage2_invalidate_range != NULL);
	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
}
2045
2046 static __inline void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)2047 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2048 bool final_only)
2049 {
2050 if (pmap->pm_stage == PM_STAGE1)
2051 pmap_s1_invalidate_range(pmap, sva, eva, final_only);
2052 else
2053 pmap_s2_invalidate_range(pmap, sva, eva, final_only);
2054 }
2055
/*
 * Invalidate all stage 1 TLB entries with a single "tlbi vmalle1is",
 * bracketed by the same barrier sequence used by the other stage 1
 * invalidation routines.  Takes no pmap argument.
 */
void
pmap_s1_invalidate_all_kernel(void)
{
	/* Order earlier stores (e.g., PTE updates) before the TLBI. */
	dsb(ishst);
	__asm __volatile("tlbi vmalle1is");
	if (pmap_multiple_tlbi) {
		/*
		 * NOTE(review): when pmap_multiple_tlbi is set, one more TLBI
		 * is issued after a barrier.  Presumably a CPU erratum
		 * workaround -- confirm where the flag is set.
		 */
		dsb(ish);
		__asm __volatile("tlbi vale1is, xzr" ::: "memory");
	}
	/* Wait for the invalidation to complete, then resynchronize. */
	dsb(ish);
	isb();
}
2068
/*
 * Invalidates all cached intermediate- and final-level TLB entries for the
 * given virtual address space.
 *
 * "pmap" must be a stage 1 pmap (asserted).  The kernel pmap is handled with
 * "tlbi vmalle1is"; any other pmap is handled with a by-ASID invalidation
 * ("tlbi aside1is") using the ASID from the pmap's cookie.
 */
static __inline void
pmap_s1_invalidate_all(pmap_t pmap)
{
	uint64_t r;

	PMAP_ASSERT_STAGE1(pmap);

	/* Order earlier stores (e.g., PTE updates) before the TLBI. */
	dsb(ishst);
	if (pmap == kernel_pmap) {
		__asm __volatile("tlbi vmalle1is");
	} else {
		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
	}
	if (pmap_multiple_tlbi) {
		/*
		 * NOTE(review): when pmap_multiple_tlbi is set, one more TLBI
		 * is issued after a barrier.  Presumably a CPU erratum
		 * workaround -- confirm where the flag is set.
		 */
		dsb(ish);
		__asm __volatile("tlbi vale1is, xzr" ::: "memory");
	}
	/* Wait for the invalidation to complete, then resynchronize. */
	dsb(ish);
	isb();
}
2094
/*
 * Invalidate all TLB entries of a stage 2 pmap by calling the
 * pmap_stage2_invalidate_all hook, which must be set (asserted).
 */
static __inline void
pmap_s2_invalidate_all(pmap_t pmap)
{
	PMAP_ASSERT_STAGE2(pmap);
	MPASS(pmap_stage2_invalidate_all != NULL);
	pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
}
2102
2103 static __inline void
pmap_invalidate_all(pmap_t pmap)2104 pmap_invalidate_all(pmap_t pmap)
2105 {
2106 if (pmap->pm_stage == PM_STAGE1)
2107 pmap_s1_invalidate_all(pmap);
2108 else
2109 pmap_s2_invalidate_all(pmap);
2110 }
2111
2112 /*
2113 * Routine: pmap_extract
2114 * Function:
2115 * Extract the physical page address associated
2116 * with the given map/virtual_address pair.
2117 */
2118 vm_paddr_t
pmap_extract(pmap_t pmap,vm_offset_t va)2119 pmap_extract(pmap_t pmap, vm_offset_t va)
2120 {
2121 pt_entry_t *pte, tpte;
2122 vm_paddr_t pa;
2123 int lvl;
2124
2125 pa = 0;
2126 PMAP_LOCK(pmap);
2127 /*
2128 * Find the block or page map for this virtual address. pmap_pte
2129 * will return either a valid block/page entry, or NULL.
2130 */
2131 pte = pmap_pte(pmap, va, &lvl);
2132 if (pte != NULL) {
2133 tpte = pmap_load(pte);
2134 pa = PTE_TO_PHYS(tpte);
2135 switch(lvl) {
2136 case 1:
2137 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
2138 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
2139 ("pmap_extract: Invalid L1 pte found: %lx",
2140 tpte & ATTR_DESCR_MASK));
2141 pa |= (va & L1_OFFSET);
2142 break;
2143 case 2:
2144 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
2145 ("pmap_extract: Invalid L2 pte found: %lx",
2146 tpte & ATTR_DESCR_MASK));
2147 pa |= (va & L2_OFFSET);
2148 break;
2149 case 3:
2150 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
2151 ("pmap_extract: Invalid L3 pte found: %lx",
2152 tpte & ATTR_DESCR_MASK));
2153 pa |= (va & L3_OFFSET);
2154 break;
2155 }
2156 }
2157 PMAP_UNLOCK(pmap);
2158 return (pa);
2159 }
2160
2161 /*
2162 * Routine: pmap_extract_and_hold
2163 * Function:
2164 * Atomically extract and hold the physical page
2165 * with the given pmap and virtual address pair
2166 * if that mapping permits the given protection.
2167 */
2168 vm_page_t
pmap_extract_and_hold(pmap_t pmap,vm_offset_t va,vm_prot_t prot)2169 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
2170 {
2171 pt_entry_t *pte, tpte;
2172 vm_offset_t off;
2173 vm_page_t m;
2174 int lvl;
2175 bool use;
2176
2177 m = NULL;
2178 PMAP_LOCK(pmap);
2179 pte = pmap_pte(pmap, va, &lvl);
2180 if (pte != NULL) {
2181 tpte = pmap_load(pte);
2182
2183 KASSERT(lvl > 0 && lvl <= 3,
2184 ("pmap_extract_and_hold: Invalid level %d", lvl));
2185 /*
2186 * Check that the pte is either a L3 page, or a L1 or L2 block
2187 * entry. We can assume L1_BLOCK == L2_BLOCK.
2188 */
2189 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
2190 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
2191 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
2192 tpte & ATTR_DESCR_MASK));
2193
2194 use = false;
2195 if ((prot & VM_PROT_WRITE) == 0)
2196 use = true;
2197 else if (pmap->pm_stage == PM_STAGE1 &&
2198 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
2199 use = true;
2200 else if (pmap->pm_stage == PM_STAGE2 &&
2201 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
2202 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
2203 use = true;
2204
2205 if (use) {
2206 switch (lvl) {
2207 case 1:
2208 off = va & L1_OFFSET;
2209 break;
2210 case 2:
2211 off = va & L2_OFFSET;
2212 break;
2213 case 3:
2214 default:
2215 off = 0;
2216 }
2217 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
2218 if (m != NULL && !vm_page_wire_mapped(m))
2219 m = NULL;
2220 }
2221 }
2222 PMAP_UNLOCK(pmap);
2223 return (m);
2224 }
2225
2226 /*
2227 * Returns true if the entire kernel virtual address range is mapped
2228 */
2229 static bool
pmap_kmapped_range(void * va,vm_size_t size)2230 pmap_kmapped_range(void *va, vm_size_t size)
2231 {
2232 pt_entry_t *pte, tpte;
2233 vm_offset_t eva, sva;
2234
2235 sva = (vm_offset_t)va;
2236 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS,
2237 ("%s: Invalid virtual address: %lx", __func__, sva));
2238 MPASS(size != 0);
2239 eva = sva + size - 1;
2240 KASSERT(eva > sva, ("%s: Size too large: sva %lx, size %lx", __func__,
2241 sva, size));
2242
2243 while (sva <= eva) {
2244 pte = pmap_l1(kernel_pmap, sva);
2245 if (pte == NULL)
2246 return (false);
2247 tpte = pmap_load(pte);
2248 if (tpte == 0)
2249 return (false);
2250 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2251 sva = (sva & ~L1_OFFSET) + L1_SIZE;
2252 continue;
2253 }
2254
2255 pte = pmap_l1_to_l2(&tpte, sva);
2256 tpte = pmap_load(pte);
2257 if (tpte == 0)
2258 return (false);
2259 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2260 sva = (sva & ~L2_OFFSET) + L2_SIZE;
2261 continue;
2262 }
2263 pte = pmap_l2_to_l3(&tpte, sva);
2264 tpte = pmap_load(pte);
2265 if (tpte == 0)
2266 return (false);
2267 MPASS((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_PAGE);
2268 if ((tpte & ATTR_CONTIGUOUS) == ATTR_CONTIGUOUS)
2269 sva = (sva & ~L3C_OFFSET) + L3C_SIZE;
2270 else
2271 sva = (sva & ~L3_OFFSET) + L3_SIZE;
2272 }
2273
2274 return (true);
2275 }
2276
/*
 * Walks the page tables to translate a kernel virtual address to a
 * physical address. Returns true if the kva is valid and stores the
 * physical address in pa if it is not NULL.
 *
 * The hardware address translation operation is tried first; the software
 * page table walk is only a fallback for when that reports failure.  No
 * pmap lock is taken.
 *
 * See the comment above data_abort() for the rationale for specifying
 * NO_PERTHREAD_SSP here.
 */
bool NO_PERTHREAD_SSP
pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
{
	pt_entry_t *pte, tpte;
	register_t intr;
	uint64_t par;

	/*
	 * Disable interrupts so we don't get interrupted between asking
	 * for address translation, and getting the result back.
	 */
	intr = intr_disable();
	par = arm64_address_translate_s1e1r(va);
	intr_restore(intr);

	if (PAR_SUCCESS(par)) {
		/* Combine the translated frame with the low bits of va. */
		if (pa != NULL)
			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
		return (true);
	}

	/*
	 * Fall back to walking the page table. The address translation
	 * instruction may fail when the page is in a break-before-make
	 * sequence. As we only clear the valid bit in said sequence we
	 * can walk the page table to find the physical address.
	 */

	pte = pmap_l1(kernel_pmap, va);
	if (pte == NULL)
		return (false);

	/*
	 * A concurrent pmap_update_entry() will clear the entry's valid bit
	 * but leave the rest of the entry unchanged.  Therefore, we treat a
	 * non-zero entry as being valid, and we ignore the valid bit when
	 * determining whether the entry maps a block, page, or table.
	 */
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		/* L1 block mapping. */
		if (pa != NULL)
			*pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
		return (true);
	}
	/* Descend using the local copy of the L1 entry. */
	pte = pmap_l1_to_l2(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		/* L2 block mapping. */
		if (pa != NULL)
			*pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
		return (true);
	}
	/* Descend using the local copy of the L2 entry. */
	pte = pmap_l2_to_l3(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	/* L3 page mapping. */
	if (pa != NULL)
		*pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
	return (true);
}
2348
2349 /*
2350 * Routine: pmap_kextract
2351 * Function:
2352 * Extract the physical page address associated with the given kernel
2353 * virtual address.
2354 */
2355 vm_paddr_t
pmap_kextract(vm_offset_t va)2356 pmap_kextract(vm_offset_t va)
2357 {
2358 vm_paddr_t pa;
2359
2360 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2361 return (DMAP_TO_PHYS(va));
2362
2363 if (pmap_klookup(va, &pa) == false)
2364 return (0);
2365 return (pa);
2366 }
2367
2368 /***************************************************
2369 * Low level mapping routines.....
2370 ***************************************************/
2371
/*
 * Map the physical range [pa, pa + size) at the kernel virtual address "sva"
 * using memory attribute index "mode".  "sva", "pa" and "size" must all be
 * page aligned (asserted).  The mappings are read/write and marked
 * non-executable (ATTR_S1_XN).
 *
 * Where alignment and the remaining size allow, an L2 block mapping is
 * attempted (only once vm_initialized is set).  Otherwise L3 pages are
 * written, with ATTR_CONTIGUOUS set on aligned runs of L3C_SIZE.  The TLB is
 * only invalidated if one of the overwritten L3 entries was valid.
 */
void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
{
	pd_entry_t *pde;
	pt_entry_t attr, old_l3e, *pte;
	vm_offset_t va;
	vm_page_t mpte;
	int error, lvl;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter: Mapping is not page-sized"));

	/* CCA - Map devices as nonsecure */
	if (in_realm() && (mode == VM_MEMATTR_DEVICE ||
	    mode == VM_MEMATTR_DEVICE_NP))
		pa |= prot_ns_shared_pa;

	attr = ATTR_AF | pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) |
	    ATTR_S1_XN | ATTR_KERN_GP | ATTR_S1_IDX(mode);
	/* Accumulates every replaced L3 entry to detect prior valid ones. */
	old_l3e = 0;
	va = sva;
	while (size != 0) {
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));

		/*
		 * If we have an aligned, contiguous chunk of L2_SIZE, try
		 * to create an L2_BLOCK mapping.
		 */
		if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
		    (pa & L2_OFFSET) == 0 && vm_initialized) {
			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
			KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
			    ("pmap_kenter: Unexpected mapping"));
			PMAP_LOCK(kernel_pmap);
			error = pmap_insert_pt_page(kernel_pmap, mpte, false,
			    false);
			if (error == 0) {
				attr &= ~ATTR_CONTIGUOUS;

				/*
				 * Although the page table page "mpte" should
				 * be devoid of mappings, the TLB might hold
				 * intermediate entries that reference it, so
				 * we perform a single-page invalidation.
				 */
				pmap_update_entry(kernel_pmap, pde,
				    PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
				    PAGE_SIZE);
			}
			PMAP_UNLOCK(kernel_pmap);
			if (error == 0) {
				va += L2_SIZE;
				pa += L2_SIZE;
				size -= L2_SIZE;
				continue;
			}
			/* On failure, fall through and map with L3 pages. */
		}

		/*
		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
		 * L3 pages, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 *
		 * "attr" is only re-evaluated at an L3C boundary, so the
		 * decision made there carries over to the following pages.
		 */
		if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
			if (size >= L3C_SIZE)
				attr |= ATTR_CONTIGUOUS;
			else
				attr &= ~ATTR_CONTIGUOUS;
		}

		pte = pmap_l2_to_l3(pde, va);
		old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
		    L3_PAGE);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	if ((old_l3e & ATTR_DESCR_VALID) != 0)
		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
	else {
		/*
		 * Because the old entries were invalid and the new mappings
		 * are not executable, an isb is not required.
		 */
		dsb(ishst);
	}
}
2467
2468 void
pmap_kenter_device(vm_offset_t sva,vm_size_t size,vm_paddr_t pa)2469 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2470 {
2471
2472 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2473 }
2474
/*
 * Remove a page from the kernel pagetables.
 *
 * The L3 entry for "va" is looked up with pmap_pte_exists(), cleared, and
 * the corresponding final-level TLB entry is invalidated.  The entry must
 * not be part of an ATTR_CONTIGUOUS run (asserted).
 */
void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
	KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
	    ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
	/* Clear the entry first, then invalidate the TLB. */
	pmap_clear(pte);
	pmap_s1_invalidate_page(kernel_pmap, va, true);
}
2489
/*
 * Remove the specified range of mappings from the kernel address space.
 *
 * Should only be applied to mappings that were created by pmap_kenter() or
 * pmap_kenter_device().  Nothing about this function is actually specific
 * to device mappings.
 *
 * "sva" and "size" must be page aligned (asserted).  While the range is
 * walked, "sva" marks the start of the L3 entries that have been cleared
 * but not yet invalidated in the TLB.  Those are flushed in one ranged
 * invalidation, either before an L2 block is handled or at the end.
 */
void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *ptep, *ptep_end;
	vm_offset_t va;
	int lvl;

	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kremove_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kremove_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		ptep = pmap_pte(kernel_pmap, va, &lvl);
		KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
		switch (lvl) {
		case 2:
			/* L2 block mapping: must be removed as a whole. */
			KASSERT((va & L2_OFFSET) == 0,
			    ("Unaligned virtual address"));
			KASSERT(size >= L2_SIZE, ("Insufficient size"));

			/* Flush the pending L3 removals first. */
			if (va != sva) {
				pmap_s1_invalidate_range(kernel_pmap, sva, va,
				    true);
			}
			pmap_clear(ptep);
			pmap_s1_invalidate_page(kernel_pmap, va, true);
			PMAP_LOCK(kernel_pmap);
			pmap_remove_kernel_l2(kernel_pmap, ptep, va);
			PMAP_UNLOCK(kernel_pmap);

			va += L2_SIZE;
			/* Nothing is pending any more; restart the batch. */
			sva = va;
			size -= L2_SIZE;
			break;
		case 3:
			if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
				/* Clear the whole contiguous run at once. */
				KASSERT((va & L3C_OFFSET) == 0,
				    ("Unaligned L3C virtual address"));
				KASSERT(size >= L3C_SIZE,
				    ("Insufficient L3C size"));

				ptep_end = ptep + L3C_ENTRIES;
				for (; ptep < ptep_end; ptep++)
					pmap_clear(ptep);

				va += L3C_SIZE;
				size -= L3C_SIZE;
				break;
			}
			pmap_clear(ptep);

			va += PAGE_SIZE;
			size -= PAGE_SIZE;
			break;
		default:
			__assert_unreachable();
			break;
		}
	}
	/* Flush whatever L3 removals are still pending. */
	if (va != sva)
		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
}
2561
/*
 *	Used to map a range of physical addresses into kernel
 *	virtual address space.
 *
 *	The value passed in '*virt' is a suggested virtual address for
 *	the mapping. Architectures which can support a direct-mapped
 *	physical to virtual region can return the appropriate address
 *	within that region, leaving '*virt' unchanged. Other
 *	architectures should map the pages starting at '*virt' and
 *	update '*virt' with the first usable address after the mapped
 *	region.
 *
 *	This implementation takes the direct-map route: it returns
 *	PHYS_TO_DMAP(start) and does not use 'virt', 'end' or 'prot'.
 */
void *
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	return (PHYS_TO_DMAP(start));
}
2579
2580 /*
2581 * Add a list of wired pages to the kva
2582 * this routine is only used for temporary
2583 * kernel mappings that do not need to have
2584 * page modification or references recorded.
2585 * Note that old mappings are simply written
2586 * over. The page *must* be wired.
2587 * Note: SMP coherent. Uses a ranged shootdown IPI.
2588 */
2589 void
pmap_qenter(void * sva,vm_page_t * ma,int count)2590 pmap_qenter(void *sva, vm_page_t *ma, int count)
2591 {
2592 pd_entry_t *pde;
2593 pt_entry_t attr, old_l3e, *pte;
2594 vm_offset_t va;
2595 vm_page_t m;
2596 int i, lvl;
2597
2598 old_l3e = 0;
2599 va = (vm_offset_t)sva;
2600 for (i = 0; i < count; i++) {
2601 pde = pmap_pde(kernel_pmap, va, &lvl);
2602 KASSERT(pde != NULL,
2603 ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2604 KASSERT(lvl == 2,
2605 ("pmap_qenter: Invalid level %d", lvl));
2606
2607 m = ma[i];
2608 attr = ATTR_AF | pmap_sh_attr |
2609 ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2610 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2611 pte = pmap_l2_to_l3(pde, va);
2612 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2613
2614 va += L3_SIZE;
2615 }
2616 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2617 pmap_s1_invalidate_range(kernel_pmap, (vm_offset_t)sva, va,
2618 true);
2619 else {
2620 /*
2621 * Because the old entries were invalid and the new mappings
2622 * are not executable, an isb is not required.
2623 */
2624 dsb(ishst);
2625 }
2626 }
2627
2628 /*
2629 * This routine tears out page mappings from the
2630 * kernel -- it is meant only for temporary mappings.
2631 */
2632 void
pmap_qremove(void * sva,int count)2633 pmap_qremove(void *sva, int count)
2634 {
2635 pt_entry_t *pte;
2636 vm_offset_t va;
2637
2638 va = (vm_offset_t)sva;
2639
2640 KASSERT(ADDR_IS_CANONICAL(va),
2641 ("%s: Address not in canonical form: %p", __func__, sva));
2642 KASSERT(ADDR_IS_KERNEL(va), ("usermode va %p", sva));
2643
2644 while (count-- > 0) {
2645 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2646 if (pte != NULL) {
2647 pmap_clear(pte);
2648 }
2649
2650 va += PAGE_SIZE;
2651 }
2652 pmap_s1_invalidate_range(kernel_pmap, (vm_offset_t)sva, va, true);
2653 }
2654
2655 /***************************************************
2656 * Page table page management routines.....
2657 ***************************************************/
2658 /*
2659 * Schedule the specified unused page table page to be freed. Specifically,
2660 * add the page to the specified list of pages that will be released to the
2661 * physical memory manager after the TLB has been updated.
2662 */
2663 static __inline void
pmap_add_delayed_free_list(vm_page_t m,struct spglist * free,bool set_PG_ZERO)2664 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2665 {
2666
2667 if (set_PG_ZERO)
2668 m->flags |= PG_ZERO;
2669 else
2670 m->flags &= ~PG_ZERO;
2671 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2672 }
2673
2674 /*
2675 * Decrements a page table page's reference count, which is used to record the
2676 * number of valid page table entries within the page. If the reference count
2677 * drops to zero, then the page table page is unmapped. Returns true if the
2678 * page table page was unmapped and false otherwise.
2679 */
2680 static inline bool
pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)2681 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2682 {
2683
2684 --m->ref_count;
2685 if (m->ref_count == 0) {
2686 _pmap_unwire_l3(pmap, va, m, free);
2687 return (true);
2688 } else
2689 return (false);
2690 }
2691
/*
 * Unmap the page table page "m", whose reference count has dropped to zero,
 * and queue it on "free" for release.  The pmap lock must be held.
 *
 * The page's level is derived from its pindex: indices at or above
 * NUL2E + NUL1E are L1 pages, indices at or above NUL2E are L2 pages, and
 * the rest are L3 pages.  After the entry pointing at "m" is cleared, the
 * reference that "m" held on its parent page table page is dropped, which
 * may recursively unmap the parent as well.
 */
static void
_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * unmap the page table page
	 */
	if (m->pindex >= (NUL2E + NUL1E)) {
		/* l1 page */
		pd_entry_t *l0;

		l0 = pmap_l0(pmap, va);
		pmap_clear(l0);
	} else if (m->pindex >= NUL2E) {
		/* l2 page */
		pd_entry_t *l1;

		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
	} else {
		/* l3 page */
		pd_entry_t *l2;

		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	/* An L1 page has no parent page to unhold, so neither branch runs. */
	if (m->pindex < NUL2E) {
		/* We just released an l3, unhold the matching l2 */
		pd_entry_t *l1, tl1;
		vm_page_t l2pg;

		l1 = pmap_l1(pmap, va);
		tl1 = pmap_load(l1);
		l2pg = PTE_TO_VM_PAGE(tl1);
		pmap_unwire_l3(pmap, va, l2pg, free);
	} else if (m->pindex < (NUL2E + NUL1E)) {
		/* We just released an l2, unhold the matching l1 */
		pd_entry_t *l0, tl0;
		vm_page_t l1pg;

		l0 = pmap_l0(pmap, va);
		tl0 = pmap_load(l0);
		l1pg = PTE_TO_VM_PAGE(tl0);
		pmap_unwire_l3(pmap, va, l1pg, free);
	}
	/* Also drop cached intermediate-level entries (final_only false). */
	pmap_invalidate_page(pmap, va, false);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, true);
}
2747
2748 /*
2749 * After removing a page table entry, this routine is used to
2750 * conditionally free the page, and manage the reference count.
2751 */
2752 static int
pmap_unuse_pt(pmap_t pmap,vm_offset_t va,pd_entry_t ptepde,struct spglist * free)2753 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2754 struct spglist *free)
2755 {
2756 vm_page_t mpte;
2757
2758 KASSERT(ADDR_IS_CANONICAL(va),
2759 ("%s: Address not in canonical form: %lx", __func__, va));
2760 if (ADDR_IS_KERNEL(va))
2761 return (0);
2762 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2763 mpte = PTE_TO_VM_PAGE(ptepde);
2764 return (pmap_unwire_l3(pmap, va, mpte, free));
2765 }
2766
2767 /*
2768 * Release a page table page reference after a failed attempt to create a
2769 * mapping.
2770 */
2771 static void
pmap_abort_ptp(pmap_t pmap,vm_offset_t va,vm_page_t mpte)2772 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2773 {
2774 struct spglist free;
2775
2776 SLIST_INIT(&free);
2777 if (pmap_unwire_l3(pmap, va, mpte, &free))
2778 vm_page_free_pages_toq(&free, true);
2779 }
2780
2781 void
pmap_pinit0(pmap_t pmap)2782 pmap_pinit0(pmap_t pmap)
2783 {
2784
2785 PMAP_LOCK_INIT(pmap);
2786 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2787 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2788 pmap->pm_l0 = PHYS_TO_DMAP(pmap->pm_l0_paddr);
2789 TAILQ_INIT(&pmap->pm_pvchunk);
2790 vm_radix_init(&pmap->pm_root);
2791 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2792 pmap->pm_stage = PM_STAGE1;
2793 pmap->pm_levels = 4;
2794 pmap->pm_ttbr = pmap->pm_l0_paddr;
2795 pmap->pm_asid_set = &asids;
2796 pmap->pm_bti = NULL;
2797
2798 PCPU_SET(curpmap, pmap);
2799 }
2800
/*
 * Initialize "pmap" for the given translation stage with either 3 or 4
 * levels of page table (asserted).  A zeroed, wired L0 page is allocated,
 * sleeping if necessary (VM_ALLOC_WAITOK).  Stage 1 pmaps draw from the
 * ASID set and, when BTI is supported, get a rangeset for BTI ranges;
 * stage 2 pmaps draw from the VMID set.  Always returns 1.
 */
int
pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
{
	vm_page_t m;

	/*
	 * allocate the l0 page
	 */
	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
	pmap->pm_l0 = PHYS_TO_DMAP(pmap->pm_l0_paddr);

	TAILQ_INIT(&pmap->pm_pvchunk);
	vm_radix_init(&pmap->pm_root);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	/* Placeholder cookie; pmap_alloc_asid() is called below. */
	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);

	MPASS(levels == 3 || levels == 4);
	pmap->pm_levels = levels;
	pmap->pm_stage = stage;
	pmap->pm_bti = NULL;
	switch (stage) {
	case PM_STAGE1:
		pmap->pm_asid_set = &asids;
		if (pmap_bti_support) {
			pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
			    M_ZERO | M_WAITOK);
			rangeset_init(pmap->pm_bti, bti_dup_range,
			    bti_free_range, pmap, M_NOWAIT);
		}
		break;
	case PM_STAGE2:
		pmap->pm_asid_set = &vmids;
		break;
	default:
		panic("%s: Invalid pmap type %d", __func__, stage);
		break;
	}

	/* XXX Temporarily disable deferred ASID allocation. */
	pmap_alloc_asid(pmap);

	/*
	 * Allocate the level 1 entry to use as the root. This will increase
	 * the refcount on the level 1 page so it won't be removed until
	 * pmap_release() is called.
	 *
	 * NOTE(review): _pmap_alloc_l3() returns NULL when its page
	 * allocation fails, and with a NULL lock pointer it does not sleep.
	 * The result is used below without a check -- confirm that failure
	 * cannot happen here or add a retry.
	 */
	if (pmap->pm_levels == 3) {
		PMAP_LOCK(pmap);
		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
		PMAP_UNLOCK(pmap);
	}
	/* With 4 levels "m" is still the L0 page; with 3 it is the L1 page. */
	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);

	return (1);
}
2858
/*
 * Initialize "pmap" as a 4-level stage 1 pmap.  Returns the result of
 * pmap_pinit_stage().
 */
int
pmap_pinit(pmap_t pmap)
{

	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
}
2865
2866 /*
2867 * This routine is called if the desired page table page does not exist.
2868 *
2869 * If page table page allocation fails, this routine may sleep before
2870 * returning NULL. It sleeps only if a lock pointer was given.
2871 *
2872 * Note: If a page allocation fails at page table level two or three,
2873 * one or two pages may be held during the wait, only to be released
2874 * afterwards. This conservative approach is easily argued to avoid
2875 * race conditions.
2876 */
2877 static vm_page_t
_pmap_alloc_l3(pmap_t pmap,vm_pindex_t ptepindex,struct rwlock ** lockp)2878 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2879 {
2880 vm_page_t m, l1pg, l2pg;
2881
2882 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2883
2884 /*
2885 * Allocate a page table page.
2886 */
2887 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2888 if (lockp != NULL) {
2889 RELEASE_PV_LIST_LOCK(lockp);
2890 PMAP_UNLOCK(pmap);
2891 vm_wait(NULL);
2892 PMAP_LOCK(pmap);
2893 }
2894
2895 /*
2896 * Indicate the need to retry. While waiting, the page table
2897 * page may have been allocated.
2898 */
2899 return (NULL);
2900 }
2901 m->pindex = ptepindex;
2902
2903 /*
2904 * Because of AArch64's weak memory consistency model, we must have a
2905 * barrier here to ensure that the stores for zeroing "m", whether by
2906 * pmap_zero_page() or an earlier function, are visible before adding
2907 * "m" to the page table. Otherwise, a page table walk by another
2908 * processor's MMU could see the mapping to "m" and a stale, non-zero
2909 * PTE within "m".
2910 */
2911 dmb(ishst);
2912
2913 /*
2914 * Map the pagetable page into the process address space, if
2915 * it isn't already there.
2916 */
2917
2918 if (ptepindex >= (NUL2E + NUL1E)) {
2919 pd_entry_t *l0p, l0e;
2920 vm_pindex_t l0index;
2921
2922 l0index = ptepindex - (NUL2E + NUL1E);
2923 l0p = &pmap->pm_l0[l0index];
2924 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2925 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2926 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2927
2928 /*
2929 * Mark all kernel memory as not accessible from userspace
2930 * and userspace memory as not executable from the kernel.
2931 * This has been done for the bootstrap L0 entries in
2932 * locore.S.
2933 */
2934 if (pmap == kernel_pmap)
2935 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2936 else
2937 l0e |= TATTR_PXN_TABLE;
2938 pmap_store(l0p, l0e);
2939 } else if (ptepindex >= NUL2E) {
2940 vm_pindex_t l0index, l1index;
2941 pd_entry_t *l0, *l1;
2942 pd_entry_t tl0;
2943
2944 l1index = ptepindex - NUL2E;
2945 l0index = l1index >> Ln_ENTRIES_SHIFT;
2946
2947 l0 = &pmap->pm_l0[l0index];
2948 tl0 = pmap_load(l0);
2949 if (tl0 == 0) {
2950 /* recurse for allocating page dir */
2951 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2952 lockp) == NULL) {
2953 vm_page_unwire_noq(m);
2954 vm_page_free_zero(m);
2955 return (NULL);
2956 }
2957 } else {
2958 l1pg = PTE_TO_VM_PAGE(tl0);
2959 l1pg->ref_count++;
2960 }
2961
2962 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2963 l1 = &l1[ptepindex & Ln_ADDR_MASK];
2964 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2965 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2966 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2967 } else {
2968 vm_pindex_t l0index, l1index;
2969 pd_entry_t *l0, *l1, *l2;
2970 pd_entry_t tl0, tl1;
2971
2972 l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2973 l0index = l1index >> Ln_ENTRIES_SHIFT;
2974
2975 l0 = &pmap->pm_l0[l0index];
2976 tl0 = pmap_load(l0);
2977 if (tl0 == 0) {
2978 /* recurse for allocating page dir */
2979 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2980 lockp) == NULL) {
2981 vm_page_unwire_noq(m);
2982 vm_page_free_zero(m);
2983 return (NULL);
2984 }
2985 tl0 = pmap_load(l0);
2986 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2987 l1 = &l1[l1index & Ln_ADDR_MASK];
2988 } else {
2989 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2990 l1 = &l1[l1index & Ln_ADDR_MASK];
2991 tl1 = pmap_load(l1);
2992 if (tl1 == 0) {
2993 /* recurse for allocating page dir */
2994 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2995 lockp) == NULL) {
2996 vm_page_unwire_noq(m);
2997 vm_page_free_zero(m);
2998 return (NULL);
2999 }
3000 } else {
3001 l2pg = PTE_TO_VM_PAGE(tl1);
3002 l2pg->ref_count++;
3003 }
3004 }
3005
3006 l2 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
3007 l2 = &l2[ptepindex & Ln_ADDR_MASK];
3008 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
3009 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
3010 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
3011 }
3012
3013 pmap_resident_count_inc(pmap, 1);
3014
3015 return (m);
3016 }
3017
3018 static pd_entry_t *
pmap_alloc_l2(pmap_t pmap,vm_offset_t va,vm_page_t * l2pgp,struct rwlock ** lockp)3019 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
3020 struct rwlock **lockp)
3021 {
3022 pd_entry_t *l1, *l2;
3023 vm_page_t l2pg;
3024 vm_pindex_t l2pindex;
3025
3026 KASSERT(ADDR_IS_CANONICAL(va),
3027 ("%s: Address not in canonical form: %lx", __func__, va));
3028
3029 retry:
3030 l1 = pmap_l1(pmap, va);
3031 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
3032 l2 = pmap_l1_to_l2(l1, va);
3033 if (ADDR_IS_USER(va)) {
3034 /* Add a reference to the L2 page. */
3035 l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
3036 l2pg->ref_count++;
3037 } else
3038 l2pg = NULL;
3039 } else if (ADDR_IS_USER(va)) {
3040 /* Allocate a L2 page. */
3041 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
3042 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
3043 if (l2pg == NULL) {
3044 if (lockp != NULL)
3045 goto retry;
3046 else
3047 return (NULL);
3048 }
3049 l2 = VM_PAGE_TO_DMAP(l2pg);
3050 l2 = &l2[pmap_l2_index(va)];
3051 } else
3052 panic("pmap_alloc_l2: missing page table page for va %#lx",
3053 va);
3054 *l2pgp = l2pg;
3055 return (l2);
3056 }
3057
3058 static vm_page_t
pmap_alloc_l3(pmap_t pmap,vm_offset_t va,struct rwlock ** lockp)3059 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
3060 {
3061 vm_pindex_t ptepindex;
3062 pd_entry_t *pde, tpde;
3063 #ifdef INVARIANTS
3064 pt_entry_t *pte;
3065 #endif
3066 vm_page_t m;
3067 int lvl;
3068
3069 /*
3070 * Calculate pagetable page index
3071 */
3072 ptepindex = pmap_l2_pindex(va);
3073 retry:
3074 /*
3075 * Get the page directory entry
3076 */
3077 pde = pmap_pde(pmap, va, &lvl);
3078
3079 /*
3080 * If the page table page is mapped, we just increment the hold count,
3081 * and activate it. If we get a level 2 pde it will point to a level 3
3082 * table.
3083 */
3084 switch (lvl) {
3085 case -1:
3086 break;
3087 case 0:
3088 #ifdef INVARIANTS
3089 pte = pmap_l0_to_l1(pde, va);
3090 KASSERT(pmap_load(pte) == 0,
3091 ("pmap_alloc_l3: TODO: l0 superpages"));
3092 #endif
3093 break;
3094 case 1:
3095 #ifdef INVARIANTS
3096 pte = pmap_l1_to_l2(pde, va);
3097 KASSERT(pmap_load(pte) == 0,
3098 ("pmap_alloc_l3: TODO: l1 superpages"));
3099 #endif
3100 break;
3101 case 2:
3102 tpde = pmap_load(pde);
3103 if (tpde != 0) {
3104 m = PTE_TO_VM_PAGE(tpde);
3105 m->ref_count++;
3106 return (m);
3107 }
3108 break;
3109 default:
3110 panic("pmap_alloc_l3: Invalid level %d", lvl);
3111 }
3112
3113 /*
3114 * Here if the pte page isn't mapped, or if it has been deallocated.
3115 */
3116 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
3117 if (m == NULL && lockp != NULL)
3118 goto retry;
3119
3120 return (m);
3121 }
3122
3123 /***************************************************
3124 * Pmap allocation/deallocation routines.
3125 ***************************************************/
3126
3127 /*
3128 * Release any resources held by the given physical map.
3129 * Called when a pmap initialized by pmap_pinit is being released.
3130 * Should only be called if the map contains no valid mappings.
3131 */
3132 void
pmap_release(pmap_t pmap)3133 pmap_release(pmap_t pmap)
3134 {
3135 bool rv __diagused;
3136 struct spglist freelist;
3137 struct asid_set *set;
3138 vm_page_t m;
3139 int asid;
3140
3141 if (pmap->pm_levels != 4) {
3142 PMAP_ASSERT_STAGE2(pmap);
3143 KASSERT(pmap->pm_stats.resident_count == 1,
3144 ("pmap_release: pmap resident count %ld != 0",
3145 pmap->pm_stats.resident_count));
3146 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
3147 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
3148
3149 SLIST_INIT(&freelist);
3150 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
3151 PMAP_LOCK(pmap);
3152 rv = pmap_unwire_l3(pmap, 0, m, &freelist);
3153 PMAP_UNLOCK(pmap);
3154 MPASS(rv == true);
3155 vm_page_free_pages_toq(&freelist, true);
3156 }
3157
3158 KASSERT(pmap->pm_stats.resident_count == 0,
3159 ("pmap_release: pmap resident count %ld != 0",
3160 pmap->pm_stats.resident_count));
3161 KASSERT(vm_radix_is_empty(&pmap->pm_root),
3162 ("pmap_release: pmap has reserved page table page(s)"));
3163
3164 set = pmap->pm_asid_set;
3165 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
3166
3167 /*
3168 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate
3169 * the entries when removing them so rely on a later tlb invalidation.
3170 * this will happen when updating the VMID generation. Because of this
3171 * we don't reuse VMIDs within a generation.
3172 */
3173 if (pmap->pm_stage == PM_STAGE1) {
3174 mtx_lock_spin(&set->asid_set_mutex);
3175 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
3176 asid = COOKIE_TO_ASID(pmap->pm_cookie);
3177 KASSERT(asid >= ASID_FIRST_AVAILABLE &&
3178 asid < set->asid_set_size,
3179 ("pmap_release: pmap cookie has out-of-range asid"));
3180 bit_clear(set->asid_set, asid);
3181 }
3182 mtx_unlock_spin(&set->asid_set_mutex);
3183
3184 if (pmap->pm_bti != NULL) {
3185 rangeset_fini(pmap->pm_bti);
3186 free(pmap->pm_bti, M_DEVBUF);
3187 }
3188 }
3189
3190 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
3191 vm_page_unwire_noq(m);
3192 vm_page_free_zero(m);
3193 }
3194
3195 static int
kvm_size(SYSCTL_HANDLER_ARGS)3196 kvm_size(SYSCTL_HANDLER_ARGS)
3197 {
3198 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
3199
3200 return sysctl_handle_long(oidp, &ksize, 0, req);
3201 }
3202 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3203 0, 0, kvm_size, "LU",
3204 "Size of KVM");
3205
3206 static int
kvm_free(SYSCTL_HANDLER_ARGS)3207 kvm_free(SYSCTL_HANDLER_ARGS)
3208 {
3209 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
3210
3211 return sysctl_handle_long(oidp, &kfree, 0, req);
3212 }
3213 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
3214 0, 0, kvm_free, "LU",
3215 "Amount of KVM free");
3216
3217 /*
3218 * grow the number of kernel page table entries, if needed
3219 */
3220 static int
pmap_growkernel_nopanic(vm_offset_t addr)3221 pmap_growkernel_nopanic(vm_offset_t addr)
3222 {
3223 vm_page_t nkpg;
3224 pd_entry_t *l0, *l1, *l2;
3225
3226 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
3227
3228 addr = roundup2(addr, L2_SIZE);
3229 if (addr - 1 >= vm_map_max(kernel_map))
3230 addr = vm_map_max(kernel_map);
3231 if (kernel_vm_end < addr) {
3232 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
3233 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
3234 }
3235 while (kernel_vm_end < addr) {
3236 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
3237 KASSERT(pmap_load(l0) != 0,
3238 ("pmap_growkernel: No level 0 kernel entry"));
3239
3240 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
3241 if (pmap_load(l1) == 0) {
3242 /* We need a new PDP entry */
3243 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3244 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3245 if (nkpg == NULL)
3246 return (KERN_RESOURCE_SHORTAGE);
3247 nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
3248 /* See the dmb() in _pmap_alloc_l3(). */
3249 dmb(ishst);
3250 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
3251 continue; /* try again */
3252 }
3253 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
3254 if (pmap_load(l2) != 0) {
3255 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3256 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3257 kernel_vm_end = vm_map_max(kernel_map);
3258 break;
3259 }
3260 continue;
3261 }
3262
3263 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3264 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3265 if (nkpg == NULL)
3266 return (KERN_RESOURCE_SHORTAGE);
3267 nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
3268 /* See the dmb() in _pmap_alloc_l3(). */
3269 dmb(ishst);
3270 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
3271
3272 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3273 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3274 kernel_vm_end = vm_map_max(kernel_map);
3275 break;
3276 }
3277 }
3278 return (KERN_SUCCESS);
3279 }
3280
3281 int
pmap_growkernel(vm_offset_t addr)3282 pmap_growkernel(vm_offset_t addr)
3283 {
3284 int rv;
3285
3286 rv = pmap_growkernel_nopanic(addr);
3287 if (rv != KERN_SUCCESS && pmap_growkernel_panic)
3288 panic("pmap_growkernel: no memory to grow kernel");
3289 return (rv);
3290 }
3291
3292 /***************************************************
3293 * page management routines.
3294 ***************************************************/
3295
3296 static const uint64_t pc_freemask[_NPCM] = {
3297 [0 ... _NPCM - 2] = PC_FREEN,
3298 [_NPCM - 1] = PC_FREEL
3299 };
3300
3301 #ifdef PV_STATS
3302 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
3303
3304 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
3305 "Current number of pv entry chunks");
3306 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
3307 "Current number of pv entry chunks allocated");
3308 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
3309 "Current number of pv entry chunks frees");
3310 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
3311 "Number of times tried to get a chunk page but failed.");
3312
3313 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
3314 static int pv_entry_spare;
3315
3316 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
3317 "Current number of pv entry frees");
3318 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
3319 "Current number of pv entry allocs");
3320 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
3321 "Current number of pv entries");
3322 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3323 "Current number of spare pv entries");
3324 #endif
3325
3326 /*
3327 * We are in a serious low memory condition. Resort to
3328 * drastic measures to free some pages so we can allocate
3329 * another pv entry chunk.
3330 *
3331 * Returns NULL if PV entries were reclaimed from the specified pmap.
3332 *
3333 * We do not, however, unmap 2mpages because subsequent accesses will
3334 * allocate per-page pv entries until repromotion occurs, thereby
3335 * exacerbating the shortage of free pv entries.
3336 */
3337 static vm_page_t
reclaim_pv_chunk_domain(pmap_t locked_pmap,struct rwlock ** lockp,int domain)3338 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3339 {
3340 struct pv_chunks_list *pvc;
3341 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3342 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3343 struct md_page *pvh;
3344 pd_entry_t *pde;
3345 pmap_t next_pmap, pmap;
3346 pt_entry_t *pte, tpte;
3347 pv_entry_t pv;
3348 vm_offset_t va;
3349 vm_page_t m, m_pc;
3350 struct spglist free;
3351 uint64_t inuse;
3352 int bit, field, freed, lvl;
3353
3354 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3355 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3356
3357 pmap = NULL;
3358 m_pc = NULL;
3359 SLIST_INIT(&free);
3360 bzero(&pc_marker_b, sizeof(pc_marker_b));
3361 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3362 pc_marker = (struct pv_chunk *)&pc_marker_b;
3363 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3364
3365 pvc = &pv_chunks[domain];
3366 mtx_lock(&pvc->pvc_lock);
3367 pvc->active_reclaims++;
3368 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3369 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3370 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3371 SLIST_EMPTY(&free)) {
3372 next_pmap = pc->pc_pmap;
3373 if (next_pmap == NULL) {
3374 /*
3375 * The next chunk is a marker. However, it is
3376 * not our marker, so active_reclaims must be
3377 * > 1. Consequently, the next_chunk code
3378 * will not rotate the pv_chunks list.
3379 */
3380 goto next_chunk;
3381 }
3382 mtx_unlock(&pvc->pvc_lock);
3383
3384 /*
3385 * A pv_chunk can only be removed from the pc_lru list
3386 * when both pvc->pvc_lock is owned and the
3387 * corresponding pmap is locked.
3388 */
3389 if (pmap != next_pmap) {
3390 if (pmap != NULL && pmap != locked_pmap)
3391 PMAP_UNLOCK(pmap);
3392 pmap = next_pmap;
3393 /* Avoid deadlock and lock recursion. */
3394 if (pmap > locked_pmap) {
3395 RELEASE_PV_LIST_LOCK(lockp);
3396 PMAP_LOCK(pmap);
3397 mtx_lock(&pvc->pvc_lock);
3398 continue;
3399 } else if (pmap != locked_pmap) {
3400 if (PMAP_TRYLOCK(pmap)) {
3401 mtx_lock(&pvc->pvc_lock);
3402 continue;
3403 } else {
3404 pmap = NULL; /* pmap is not locked */
3405 mtx_lock(&pvc->pvc_lock);
3406 pc = TAILQ_NEXT(pc_marker, pc_lru);
3407 if (pc == NULL ||
3408 pc->pc_pmap != next_pmap)
3409 continue;
3410 goto next_chunk;
3411 }
3412 }
3413 }
3414
3415 /*
3416 * Destroy every non-wired, 4 KB page mapping in the chunk.
3417 */
3418 freed = 0;
3419 for (field = 0; field < _NPCM; field++) {
3420 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3421 inuse != 0; inuse &= ~(1UL << bit)) {
3422 bit = ffsl(inuse) - 1;
3423 pv = &pc->pc_pventry[field * 64 + bit];
3424 va = pv->pv_va;
3425 pde = pmap_pde(pmap, va, &lvl);
3426 if (lvl != 2)
3427 continue;
3428 pte = pmap_l2_to_l3(pde, va);
3429 tpte = pmap_load(pte);
3430 if ((tpte & ATTR_SW_WIRED) != 0)
3431 continue;
3432 if ((tpte & ATTR_CONTIGUOUS) != 0)
3433 (void)pmap_demote_l3c(pmap, pte, va);
3434 tpte = pmap_load_clear(pte);
3435 m = PTE_TO_VM_PAGE(tpte);
3436 if (pmap_pte_dirty(pmap, tpte))
3437 vm_page_dirty(m);
3438 if ((tpte & ATTR_AF) != 0) {
3439 pmap_s1_invalidate_page(pmap, va, true);
3440 vm_page_aflag_set(m, PGA_REFERENCED);
3441 }
3442 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3443 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3444 m->md.pv_gen++;
3445 if (TAILQ_EMPTY(&m->md.pv_list) &&
3446 (m->flags & PG_FICTITIOUS) == 0) {
3447 pvh = page_to_pvh(m);
3448 if (TAILQ_EMPTY(&pvh->pv_list)) {
3449 vm_page_aflag_clear(m,
3450 PGA_WRITEABLE);
3451 }
3452 }
3453 pc->pc_map[field] |= 1UL << bit;
3454 pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3455 freed++;
3456 }
3457 }
3458 if (freed == 0) {
3459 mtx_lock(&pvc->pvc_lock);
3460 goto next_chunk;
3461 }
3462 /* Every freed mapping is for a 4 KB page. */
3463 pmap_resident_count_dec(pmap, freed);
3464 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3465 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3466 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3467 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3468 if (pc_is_free(pc)) {
3469 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3470 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3471 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3472 /* Entire chunk is free; return it. */
3473 m_pc = DMAP_TO_VM_PAGE(pc);
3474 dump_drop_page(m_pc->phys_addr);
3475 mtx_lock(&pvc->pvc_lock);
3476 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3477 break;
3478 }
3479 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3480 mtx_lock(&pvc->pvc_lock);
3481 /* One freed pv entry in locked_pmap is sufficient. */
3482 if (pmap == locked_pmap)
3483 break;
3484
3485 next_chunk:
3486 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3487 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3488 if (pvc->active_reclaims == 1 && pmap != NULL) {
3489 /*
3490 * Rotate the pv chunks list so that we do not
3491 * scan the same pv chunks that could not be
3492 * freed (because they contained a wired
3493 * and/or superpage mapping) on every
3494 * invocation of reclaim_pv_chunk().
3495 */
3496 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3497 MPASS(pc->pc_pmap != NULL);
3498 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3499 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3500 }
3501 }
3502 }
3503 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3504 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3505 pvc->active_reclaims--;
3506 mtx_unlock(&pvc->pvc_lock);
3507 if (pmap != NULL && pmap != locked_pmap)
3508 PMAP_UNLOCK(pmap);
3509 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3510 m_pc = SLIST_FIRST(&free);
3511 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3512 /* Recycle a freed page table page. */
3513 m_pc->ref_count = 1;
3514 }
3515 vm_page_free_pages_toq(&free, true);
3516 return (m_pc);
3517 }
3518
3519 static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap,struct rwlock ** lockp)3520 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3521 {
3522 vm_page_t m;
3523 int i, domain;
3524
3525 domain = PCPU_GET(domain);
3526 for (i = 0; i < vm_ndomains; i++) {
3527 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3528 if (m != NULL)
3529 break;
3530 domain = (domain + 1) % vm_ndomains;
3531 }
3532
3533 return (m);
3534 }
3535
3536 /*
3537 * free the pv_entry back to the free list
3538 */
3539 static void
free_pv_entry(pmap_t pmap,pv_entry_t pv)3540 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3541 {
3542 struct pv_chunk *pc;
3543 int idx, field, bit;
3544
3545 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3546 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3547 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3548 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3549 pc = pv_to_chunk(pv);
3550 idx = pv - &pc->pc_pventry[0];
3551 field = idx / 64;
3552 bit = idx % 64;
3553 pc->pc_map[field] |= 1ul << bit;
3554 if (!pc_is_free(pc)) {
3555 /* 98% of the time, pc is already at the head of the list. */
3556 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3557 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3558 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3559 }
3560 return;
3561 }
3562 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3563 free_pv_chunk(pc);
3564 }
3565
3566 static void
free_pv_chunk_dequeued(struct pv_chunk * pc)3567 free_pv_chunk_dequeued(struct pv_chunk *pc)
3568 {
3569 vm_page_t m;
3570
3571 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3572 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3573 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3574 /* entire chunk is free, return it */
3575 m = DMAP_TO_VM_PAGE(pc);
3576 dump_drop_page(m->phys_addr);
3577 vm_page_unwire_noq(m);
3578 vm_page_free(m);
3579 }
3580
3581 static void
free_pv_chunk(struct pv_chunk * pc)3582 free_pv_chunk(struct pv_chunk *pc)
3583 {
3584 struct pv_chunks_list *pvc;
3585
3586 pvc = &pv_chunks[pc_to_domain(pc)];
3587 mtx_lock(&pvc->pvc_lock);
3588 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3589 mtx_unlock(&pvc->pvc_lock);
3590 free_pv_chunk_dequeued(pc);
3591 }
3592
3593 static void
free_pv_chunk_batch(struct pv_chunklist * batch)3594 free_pv_chunk_batch(struct pv_chunklist *batch)
3595 {
3596 struct pv_chunks_list *pvc;
3597 struct pv_chunk *pc, *npc;
3598 int i;
3599
3600 for (i = 0; i < vm_ndomains; i++) {
3601 if (TAILQ_EMPTY(&batch[i]))
3602 continue;
3603 pvc = &pv_chunks[i];
3604 mtx_lock(&pvc->pvc_lock);
3605 TAILQ_FOREACH(pc, &batch[i], pc_list) {
3606 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3607 }
3608 mtx_unlock(&pvc->pvc_lock);
3609 }
3610
3611 for (i = 0; i < vm_ndomains; i++) {
3612 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3613 free_pv_chunk_dequeued(pc);
3614 }
3615 }
3616 }
3617
3618 /*
3619 * Returns a new PV entry, allocating a new PV chunk from the system when
3620 * needed. If this PV chunk allocation fails and a PV list lock pointer was
3621 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
3622 * returned.
3623 *
3624 * The given PV list lock may be released.
3625 */
3626 static pv_entry_t
get_pv_entry(pmap_t pmap,struct rwlock ** lockp)3627 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3628 {
3629 struct pv_chunks_list *pvc;
3630 int bit, field;
3631 pv_entry_t pv;
3632 struct pv_chunk *pc;
3633 vm_page_t m;
3634
3635 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3636 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3637 retry:
3638 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3639 if (pc != NULL) {
3640 for (field = 0; field < _NPCM; field++) {
3641 if (pc->pc_map[field]) {
3642 bit = ffsl(pc->pc_map[field]) - 1;
3643 break;
3644 }
3645 }
3646 if (field < _NPCM) {
3647 pv = &pc->pc_pventry[field * 64 + bit];
3648 pc->pc_map[field] &= ~(1ul << bit);
3649 /* If this was the last item, move it to tail */
3650 if (pc_is_full(pc)) {
3651 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3652 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3653 pc_list);
3654 }
3655 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3656 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3657 return (pv);
3658 }
3659 }
3660 /* No free items, allocate another chunk */
3661 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3662 if (m == NULL) {
3663 if (lockp == NULL) {
3664 PV_STAT(pc_chunk_tryfail++);
3665 return (NULL);
3666 }
3667 m = reclaim_pv_chunk(pmap, lockp);
3668 if (m == NULL)
3669 goto retry;
3670 }
3671 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3672 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3673 dump_add_page(m->phys_addr);
3674 pc = PHYS_TO_DMAP(m->phys_addr);
3675 pc->pc_pmap = pmap;
3676 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3677 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */
3678 pvc = &pv_chunks[vm_page_domain(m)];
3679 mtx_lock(&pvc->pvc_lock);
3680 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3681 mtx_unlock(&pvc->pvc_lock);
3682 pv = &pc->pc_pventry[0];
3683 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3684 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3685 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3686 return (pv);
3687 }
3688
3689 /*
3690 * Ensure that the number of spare PV entries in the specified pmap meets or
3691 * exceeds the given count, "needed".
3692 *
3693 * The given PV list lock may be released.
3694 */
3695 static void
reserve_pv_entries(pmap_t pmap,int needed,struct rwlock ** lockp)3696 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3697 {
3698 struct pv_chunks_list *pvc;
3699 struct pch new_tail[PMAP_MEMDOM];
3700 struct pv_chunk *pc;
3701 vm_page_t m;
3702 int avail, free, i;
3703 bool reclaimed;
3704
3705 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3706 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3707
3708 /*
3709 * Newly allocated PV chunks must be stored in a private list until
3710 * the required number of PV chunks have been allocated. Otherwise,
3711 * reclaim_pv_chunk() could recycle one of these chunks. In
3712 * contrast, these chunks must be added to the pmap upon allocation.
3713 */
3714 for (i = 0; i < PMAP_MEMDOM; i++)
3715 TAILQ_INIT(&new_tail[i]);
3716 retry:
3717 avail = 0;
3718 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3719 bit_count((bitstr_t *)pc->pc_map, 0,
3720 sizeof(pc->pc_map) * NBBY, &free);
3721 if (free == 0)
3722 break;
3723 avail += free;
3724 if (avail >= needed)
3725 break;
3726 }
3727 for (reclaimed = false; avail < needed; avail += _NPCPV) {
3728 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3729 if (m == NULL) {
3730 m = reclaim_pv_chunk(pmap, lockp);
3731 if (m == NULL)
3732 goto retry;
3733 reclaimed = true;
3734 }
3735 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3736 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3737 dump_add_page(m->phys_addr);
3738 pc = PHYS_TO_DMAP(m->phys_addr);
3739 pc->pc_pmap = pmap;
3740 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3741 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3742 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3743 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3744
3745 /*
3746 * The reclaim might have freed a chunk from the current pmap.
3747 * If that chunk contained available entries, we need to
3748 * re-count the number of available entries.
3749 */
3750 if (reclaimed)
3751 goto retry;
3752 }
3753 for (i = 0; i < vm_ndomains; i++) {
3754 if (TAILQ_EMPTY(&new_tail[i]))
3755 continue;
3756 pvc = &pv_chunks[i];
3757 mtx_lock(&pvc->pvc_lock);
3758 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3759 mtx_unlock(&pvc->pvc_lock);
3760 }
3761 }
3762
3763 /*
3764 * First find and then remove the pv entry for the specified pmap and virtual
3765 * address from the specified pv list. Returns the pv entry if found and NULL
3766 * otherwise. This operation can be performed on pv lists for either 4KB or
3767 * 2MB page mappings.
3768 */
3769 static __inline pv_entry_t
pmap_pvh_remove(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3770 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3771 {
3772 pv_entry_t pv;
3773
3774 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3775 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3776 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3777 pvh->pv_gen++;
3778 break;
3779 }
3780 }
3781 return (pv);
3782 }
3783
3784 /*
3785 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3786 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3787 * entries for each of the 4KB page mappings.
3788 */
3789 static void
pmap_pv_demote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)3790 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3791 struct rwlock **lockp)
3792 {
3793 struct md_page *pvh;
3794 struct pv_chunk *pc;
3795 pv_entry_t pv;
3796 vm_offset_t va_last;
3797 vm_page_t m;
3798 int bit, field;
3799
3800 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3801 KASSERT((va & L2_OFFSET) == 0,
3802 ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3803 KASSERT((pa & L2_OFFSET) == 0,
3804 ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3805 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3806
3807 /*
3808 * Transfer the 2mpage's pv entry for this mapping to the first
3809 * page's pv list. Once this transfer begins, the pv list lock
3810 * must not be released until the last pv entry is reinstantiated.
3811 */
3812 pvh = pa_to_pvh(pa);
3813 pv = pmap_pvh_remove(pvh, pmap, va);
3814 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3815 m = PHYS_TO_VM_PAGE(pa);
3816 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3817 m->md.pv_gen++;
3818 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3819 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3820 va_last = va + L2_SIZE - PAGE_SIZE;
3821 for (;;) {
3822 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3823 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3824 for (field = 0; field < _NPCM; field++) {
3825 while (pc->pc_map[field]) {
3826 bit = ffsl(pc->pc_map[field]) - 1;
3827 pc->pc_map[field] &= ~(1ul << bit);
3828 pv = &pc->pc_pventry[field * 64 + bit];
3829 va += PAGE_SIZE;
3830 pv->pv_va = va;
3831 m++;
3832 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3833 ("pmap_pv_demote_l2: page %p is not managed", m));
3834 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3835 m->md.pv_gen++;
3836 if (va == va_last)
3837 goto out;
3838 }
3839 }
3840 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3841 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3842 }
3843 out:
3844 if (pc_is_full(pc)) {
3845 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3846 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3847 }
3848 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3849 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3850 }
3851
3852 /*
3853 * First find and then destroy the pv entry for the specified pmap and virtual
3854 * address. This operation can be performed on pv lists for either 4KB or 2MB
3855 * page mappings.
3856 */
3857 static void
pmap_pvh_free(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3858 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3859 {
3860 pv_entry_t pv;
3861
3862 pv = pmap_pvh_remove(pvh, pmap, va);
3863 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3864 free_pv_entry(pmap, pv);
3865 }
3866
3867 /*
3868 * Conditionally create the PV entry for a 4KB page mapping if the required
3869 * memory can be allocated without resorting to reclamation.
3870 */
3871 static bool
pmap_try_insert_pv_entry(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)3872 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3873 struct rwlock **lockp)
3874 {
3875 pv_entry_t pv;
3876
3877 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3878 /* Pass NULL instead of the lock pointer to disable reclamation. */
3879 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3880 pv->pv_va = va;
3881 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3882 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3883 m->md.pv_gen++;
3884 return (true);
3885 } else
3886 return (false);
3887 }
3888
3889 /*
3890 * Create the PV entry for a 2MB page mapping. Always returns true unless the
3891 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
3892 * false if the PV entry cannot be allocated without resorting to reclamation.
3893 */
3894 static bool
pmap_pv_insert_l2(pmap_t pmap,vm_offset_t va,pd_entry_t l2e,u_int flags,struct rwlock ** lockp)3895 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3896 struct rwlock **lockp)
3897 {
3898 struct md_page *pvh;
3899 pv_entry_t pv;
3900 vm_paddr_t pa;
3901
3902 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3903 /* Pass NULL instead of the lock pointer to disable reclamation. */
3904 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3905 NULL : lockp)) == NULL)
3906 return (false);
3907 pv->pv_va = va;
3908 pa = PTE_TO_PHYS(l2e);
3909 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3910 pvh = pa_to_pvh(pa);
3911 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3912 pvh->pv_gen++;
3913 return (true);
3914 }
3915
3916 /*
3917 * Conditionally creates the PV entries for a L3C superpage mapping if
3918 * the required memory can be allocated without resorting to reclamation.
3919 */
3920 static bool
pmap_pv_insert_l3c(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)3921 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3922 struct rwlock **lockp)
3923 {
3924 pv_entry_t pv;
3925 vm_offset_t tva;
3926 vm_paddr_t pa __diagused;
3927 vm_page_t mt;
3928
3929 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3930 KASSERT((va & L3C_OFFSET) == 0,
3931 ("pmap_pv_insert_l3c: va is not aligned"));
3932 pa = VM_PAGE_TO_PHYS(m);
3933 KASSERT((pa & L3C_OFFSET) == 0,
3934 ("pmap_pv_insert_l3c: pa is not aligned"));
3935 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3936 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3937 /* Pass NULL instead of lockp to disable reclamation. */
3938 pv = get_pv_entry(pmap, NULL);
3939 if (__predict_false(pv == NULL)) {
3940 while (tva > va) {
3941 mt--;
3942 tva -= L3_SIZE;
3943 pmap_pvh_free(&mt->md, pmap, tva);
3944 }
3945 return (false);
3946 }
3947 pv->pv_va = tva;
3948 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3949 mt->md.pv_gen++;
3950 }
3951 return (true);
3952 }
3953
3954 static void
pmap_remove_kernel_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)3955 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3956 {
3957 pt_entry_t newl2, oldl2 __diagused;
3958 vm_page_t ml3;
3959 vm_paddr_t ml3pa;
3960
3961 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3962 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3963 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3964
3965 ml3 = pmap_remove_pt_page(pmap, va);
3966 KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page"));
3967
3968 ml3pa = VM_PAGE_TO_PHYS(ml3);
3969 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3970
3971 /*
3972 * If this page table page was unmapped by a promotion, then it
3973 * contains valid mappings. Zero it to invalidate those mappings.
3974 */
3975 if (vm_page_any_valid(ml3))
3976 pagezero(PHYS_TO_DMAP(ml3pa));
3977
3978 /*
3979 * Demote the mapping. The caller must have already invalidated the
3980 * mapping (i.e., the "break" in break-before-make).
3981 */
3982 oldl2 = pmap_load_store(l2, newl2);
3983 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3984 __func__, l2, oldl2));
3985 }
3986
3987 /*
3988 * pmap_remove_l2: Do the things to unmap a level 2 superpage.
3989 */
3990 static int
pmap_remove_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pd_entry_t l1e,bool demote_kl2e,struct spglist * free,struct rwlock ** lockp)3991 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e,
3992 bool demote_kl2e, struct spglist *free, struct rwlock **lockp)
3993 {
3994 struct md_page *pvh;
3995 pt_entry_t old_l2;
3996 vm_page_t m, ml3, mt;
3997
3998 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3999 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
4000 old_l2 = pmap_load_clear(l2);
4001 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4002 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
4003
4004 /*
4005 * Since a promotion must break the 4KB page mappings before making
4006 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4007 */
4008 pmap_s1_invalidate_page(pmap, sva, true);
4009
4010 if (old_l2 & ATTR_SW_WIRED)
4011 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
4012 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
4013 if (old_l2 & ATTR_SW_MANAGED) {
4014 m = PTE_TO_VM_PAGE(old_l2);
4015 pvh = page_to_pvh(m);
4016 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4017 pmap_pvh_free(pvh, pmap, sva);
4018 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
4019 if (pmap_pte_dirty(pmap, old_l2))
4020 vm_page_dirty(mt);
4021 if (old_l2 & ATTR_AF)
4022 vm_page_aflag_set(mt, PGA_REFERENCED);
4023 if (TAILQ_EMPTY(&mt->md.pv_list) &&
4024 TAILQ_EMPTY(&pvh->pv_list))
4025 vm_page_aflag_clear(mt, PGA_WRITEABLE);
4026 }
4027 }
4028 if (pmap != kernel_pmap) {
4029 ml3 = pmap_remove_pt_page(pmap, sva);
4030 if (ml3 != NULL) {
4031 KASSERT(vm_page_any_valid(ml3),
4032 ("pmap_remove_l2: l3 page not promoted"));
4033 pmap_resident_count_dec(pmap, 1);
4034 KASSERT(ml3->ref_count == NL3PG,
4035 ("pmap_remove_l2: l3 page ref count error"));
4036 ml3->ref_count = 0;
4037 pmap_add_delayed_free_list(ml3, free, false);
4038 }
4039 } else if (demote_kl2e) {
4040 pmap_remove_kernel_l2(pmap, l2, sva);
4041 } else {
4042 ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva));
4043 if (vm_page_any_valid(ml3)) {
4044 ml3->valid = 0;
4045 pmap_zero_page(ml3);
4046 }
4047 }
4048 return (pmap_unuse_pt(pmap, sva, l1e, free));
4049 }
4050
4051 /*
4052 * pmap_remove_l3: do the things to unmap a page in a process
4053 */
4054 static int
pmap_remove_l3(pmap_t pmap,pt_entry_t * l3,vm_offset_t va,pd_entry_t l2e,struct spglist * free,struct rwlock ** lockp)4055 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
4056 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
4057 {
4058 struct md_page *pvh;
4059 pt_entry_t old_l3;
4060 vm_page_t m;
4061
4062 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4063 old_l3 = pmap_load(l3);
4064 if ((old_l3 & ATTR_CONTIGUOUS) != 0)
4065 (void)pmap_demote_l3c(pmap, l3, va);
4066 old_l3 = pmap_load_clear(l3);
4067 pmap_s1_invalidate_page(pmap, va, true);
4068 if (old_l3 & ATTR_SW_WIRED)
4069 pmap->pm_stats.wired_count -= 1;
4070 pmap_resident_count_dec(pmap, 1);
4071 if (old_l3 & ATTR_SW_MANAGED) {
4072 m = PTE_TO_VM_PAGE(old_l3);
4073 if (pmap_pte_dirty(pmap, old_l3))
4074 vm_page_dirty(m);
4075 if (old_l3 & ATTR_AF)
4076 vm_page_aflag_set(m, PGA_REFERENCED);
4077 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4078 pmap_pvh_free(&m->md, pmap, va);
4079 if (TAILQ_EMPTY(&m->md.pv_list) &&
4080 (m->flags & PG_FICTITIOUS) == 0) {
4081 pvh = page_to_pvh(m);
4082 if (TAILQ_EMPTY(&pvh->pv_list))
4083 vm_page_aflag_clear(m, PGA_WRITEABLE);
4084 }
4085 }
4086 return (pmap_unuse_pt(pmap, va, l2e, free));
4087 }
4088
4089 /*
4090 * Removes the specified L3C superpage mapping. Requests TLB invalidations
4091 * to be performed by the caller through the returned "*vap". Returns true
4092 * if the level 3 table "ml3" was unmapped and added to the spglist "free".
4093 * Otherwise, returns false.
4094 */
4095 static bool
pmap_remove_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,vm_offset_t * vap,vm_offset_t va_next,vm_page_t ml3,struct spglist * free,struct rwlock ** lockp)4096 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
4097 vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
4098 struct rwlock **lockp)
4099 {
4100 struct md_page *pvh;
4101 struct rwlock *new_lock;
4102 pt_entry_t first_l3e, l3e, *tl3p;
4103 vm_offset_t tva;
4104 vm_page_t m, mt;
4105
4106 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4107 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4108 0, ("pmap_remove_l3c: l3p is not aligned"));
4109 KASSERT((va & L3C_OFFSET) == 0,
4110 ("pmap_remove_l3c: va is not aligned"));
4111
4112 /*
4113 * Hardware accessed and dirty bit maintenance might only update a
4114 * single L3 entry, so we must combine the accessed and dirty bits
4115 * from this entire set of contiguous L3 entries.
4116 */
4117 first_l3e = pmap_load_clear(l3p);
4118 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4119 l3e = pmap_load_clear(tl3p);
4120 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4121 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
4122 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4123 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4124 first_l3e &= ~ATTR_S1_AP_RW_BIT;
4125 first_l3e |= l3e & ATTR_AF;
4126 }
4127 if ((first_l3e & ATTR_SW_WIRED) != 0)
4128 pmap->pm_stats.wired_count -= L3C_ENTRIES;
4129 pmap_resident_count_dec(pmap, L3C_ENTRIES);
4130 if ((first_l3e & ATTR_SW_MANAGED) != 0) {
4131 m = PTE_TO_VM_PAGE(first_l3e);
4132 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4133 if (new_lock != *lockp) {
4134 if (*lockp != NULL) {
4135 /*
4136 * Pending TLB invalidations must be
4137 * performed before the PV list lock is
4138 * released. Otherwise, a concurrent
4139 * pmap_remove_all() on a physical page
4140 * could return while a stale TLB entry
4141 * still provides access to that page.
4142 */
4143 if (*vap != va_next) {
4144 pmap_invalidate_range(pmap, *vap, va,
4145 true);
4146 *vap = va_next;
4147 }
4148 rw_wunlock(*lockp);
4149 }
4150 *lockp = new_lock;
4151 rw_wlock(*lockp);
4152 }
4153 pvh = page_to_pvh(m);
4154 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
4155 L3_SIZE) {
4156 if (pmap_pte_dirty(pmap, first_l3e))
4157 vm_page_dirty(mt);
4158 if ((first_l3e & ATTR_AF) != 0)
4159 vm_page_aflag_set(mt, PGA_REFERENCED);
4160 pmap_pvh_free(&mt->md, pmap, tva);
4161 if (TAILQ_EMPTY(&mt->md.pv_list) &&
4162 TAILQ_EMPTY(&pvh->pv_list))
4163 vm_page_aflag_clear(mt, PGA_WRITEABLE);
4164 }
4165 }
4166 if (*vap == va_next)
4167 *vap = va;
4168 if (ml3 != NULL) {
4169 ml3->ref_count -= L3C_ENTRIES;
4170 if (ml3->ref_count == 0) {
4171 _pmap_unwire_l3(pmap, va, ml3, free);
4172 return (true);
4173 }
4174 }
4175 return (false);
4176 }
4177
4178 /*
4179 * Remove the specified range of addresses from the L3 page table that is
4180 * identified by the given L2 entry.
4181 */
4182 static void
pmap_remove_l3_range(pmap_t pmap,pd_entry_t l2e,vm_offset_t sva,vm_offset_t eva,struct spglist * free,struct rwlock ** lockp)4183 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
4184 vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
4185 {
4186 struct md_page *pvh;
4187 struct rwlock *new_lock;
4188 pt_entry_t *l3, old_l3;
4189 vm_offset_t va;
4190 vm_page_t l3pg, m;
4191
4192 KASSERT(ADDR_IS_CANONICAL(sva),
4193 ("%s: Start address not in canonical form: %lx", __func__, sva));
4194 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
4195 ("%s: End address not in canonical form: %lx", __func__, eva));
4196
4197 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4198 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
4199 ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
4200 l3pg = ADDR_IS_USER(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
4201 va = eva;
4202 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
4203 old_l3 = pmap_load(l3);
4204 if (!pmap_l3_valid(old_l3)) {
4205 if (va != eva) {
4206 pmap_invalidate_range(pmap, va, sva, true);
4207 va = eva;
4208 }
4209 continue;
4210 }
4211 if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
4212 /*
4213 * Is this entire set of contiguous L3 entries being
4214 * removed? Handle the possibility that "eva" is zero
4215 * because of address wraparound.
4216 */
4217 if ((sva & L3C_OFFSET) == 0 &&
4218 sva + L3C_OFFSET <= eva - 1) {
4219 if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
4220 l3pg, free, lockp)) {
4221 /* The L3 table was unmapped. */
4222 sva += L3C_SIZE;
4223 break;
4224 }
4225 l3 += L3C_ENTRIES - 1;
4226 sva += L3C_SIZE - L3_SIZE;
4227 continue;
4228 }
4229
4230 (void)pmap_demote_l3c(pmap, l3, sva);
4231 }
4232 old_l3 = pmap_load_clear(l3);
4233 if ((old_l3 & ATTR_SW_WIRED) != 0)
4234 pmap->pm_stats.wired_count--;
4235 pmap_resident_count_dec(pmap, 1);
4236 /* Below will only be true in a realm environment. */
4237 if (PTE_TO_PHYS(old_l3) & prot_ns_shared_pa)
4238 pmap_set_protected(old_l3);
4239 if ((old_l3 & ATTR_SW_MANAGED) != 0) {
4240 m = PTE_TO_VM_PAGE(old_l3);
4241 if (pmap_pte_dirty(pmap, old_l3))
4242 vm_page_dirty(m);
4243 if ((old_l3 & ATTR_AF) != 0)
4244 vm_page_aflag_set(m, PGA_REFERENCED);
4245 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4246 if (new_lock != *lockp) {
4247 if (*lockp != NULL) {
4248 /*
4249 * Pending TLB invalidations must be
4250 * performed before the PV list lock is
4251 * released. Otherwise, a concurrent
4252 * pmap_remove_all() on a physical page
4253 * could return while a stale TLB entry
4254 * still provides access to that page.
4255 */
4256 if (va != eva) {
4257 pmap_invalidate_range(pmap, va,
4258 sva, true);
4259 va = eva;
4260 }
4261 rw_wunlock(*lockp);
4262 }
4263 *lockp = new_lock;
4264 rw_wlock(*lockp);
4265 }
4266 pmap_pvh_free(&m->md, pmap, sva);
4267 if (TAILQ_EMPTY(&m->md.pv_list) &&
4268 (m->flags & PG_FICTITIOUS) == 0) {
4269 pvh = page_to_pvh(m);
4270 if (TAILQ_EMPTY(&pvh->pv_list))
4271 vm_page_aflag_clear(m, PGA_WRITEABLE);
4272 }
4273 }
4274 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
4275 /*
4276 * _pmap_unwire_l3() has already invalidated the TLB
4277 * entries at all levels for "sva". So, we need not
4278 * perform "sva += L3_SIZE;" here. Moreover, we need
4279 * not perform "va = sva;" if "sva" is at the start
4280 * of a new valid range consisting of a single page.
4281 */
4282 break;
4283 }
4284 if (va == eva)
4285 va = sva;
4286 }
4287 if (va != eva)
4288 pmap_invalidate_range(pmap, va, sva, true);
4289 }
4290
4291 static void
pmap_remove1(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool map_delete)4292 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
4293 {
4294 struct rwlock *lock;
4295 vm_offset_t va_next;
4296 pd_entry_t *l0, *l1, *l2;
4297 pt_entry_t l3_paddr;
4298 struct spglist free;
4299
4300 /*
4301 * Perform an unsynchronized read. This is, however, safe.
4302 */
4303 if (pmap->pm_stats.resident_count == 0)
4304 return;
4305
4306 SLIST_INIT(&free);
4307
4308 PMAP_LOCK(pmap);
4309 if (map_delete)
4310 pmap_bti_on_remove(pmap, sva, eva);
4311
4312 lock = NULL;
4313 for (; sva < eva; sva = va_next) {
4314 if (pmap->pm_stats.resident_count == 0)
4315 break;
4316
4317 l0 = pmap_l0(pmap, sva);
4318 if (pmap_load(l0) == 0) {
4319 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4320 if (va_next < sva)
4321 va_next = eva;
4322 continue;
4323 }
4324
4325 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4326 if (va_next < sva)
4327 va_next = eva;
4328 l1 = pmap_l0_to_l1(l0, sva);
4329 if (pmap_load(l1) == 0)
4330 continue;
4331 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4332 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4333 KASSERT(va_next <= eva,
4334 ("partial update of non-transparent 1G page "
4335 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4336 pmap_load(l1), sva, eva, va_next));
4337 MPASS(pmap != kernel_pmap);
4338 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4339 pmap_clear(l1);
4340 pmap_s1_invalidate_page(pmap, sva, true);
4341 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
4342 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
4343 continue;
4344 }
4345
4346 /*
4347 * Calculate index for next page table.
4348 */
4349 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4350 if (va_next < sva)
4351 va_next = eva;
4352
4353 l2 = pmap_l1_to_l2(l1, sva);
4354 l3_paddr = pmap_load(l2);
4355
4356 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4357 if (sva + L2_SIZE == va_next && eva >= va_next) {
4358 pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
4359 true, &free, &lock);
4360 continue;
4361 } else if (pmap_demote_l2_locked(pmap, l2, sva,
4362 &lock) == NULL)
4363 continue;
4364 l3_paddr = pmap_load(l2);
4365 }
4366
4367 /*
4368 * Weed out invalid mappings.
4369 */
4370 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
4371 continue;
4372
4373 /*
4374 * Limit our scan to either the end of the va represented
4375 * by the current page table page, or to the end of the
4376 * range being removed.
4377 */
4378 if (va_next > eva)
4379 va_next = eva;
4380
4381 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
4382 &lock);
4383 }
4384 if (lock != NULL)
4385 rw_wunlock(lock);
4386 PMAP_UNLOCK(pmap);
4387 vm_page_free_pages_toq(&free, true);
4388 }
4389
4390 /*
4391 * Remove the given range of addresses from the specified map.
4392 *
4393 * It is assumed that the start and end are properly
4394 * rounded to the page size.
4395 */
4396 void
pmap_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4397 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4398 {
4399 pmap_remove1(pmap, sva, eva, false);
4400 }
4401
4402 /*
4403 * Remove the given range of addresses as part of a logical unmap
4404 * operation. This has the effect of calling pmap_remove(), but
4405 * also clears any metadata that should persist for the lifetime
4406 * of a logical mapping.
4407 */
4408 void
pmap_map_delete(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4409 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4410 {
4411 pmap_remove1(pmap, sva, eva, true);
4412 }
4413
4414 /*
4415 * Routine: pmap_remove_all
4416 * Function:
4417 * Removes this physical page from
4418 * all physical maps in which it resides.
4419 * Reflects back modify bits to the pager.
4420 *
4421 * Notes:
4422 * Original versions of this routine were very
4423 * inefficient because they iteratively called
4424 * pmap_remove (slow...)
4425 */
4426
4427 void
pmap_remove_all(vm_page_t m)4428 pmap_remove_all(vm_page_t m)
4429 {
4430 struct md_page *pvh;
4431 pv_entry_t pv;
4432 pmap_t pmap;
4433 struct rwlock *lock;
4434 pd_entry_t *pde, tpde;
4435 pt_entry_t *pte, tpte;
4436 vm_offset_t va;
4437 struct spglist free;
4438 int lvl, pvh_gen, md_gen;
4439
4440 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4441 ("pmap_remove_all: page %p is not managed", m));
4442 SLIST_INIT(&free);
4443 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4444 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4445 rw_wlock(lock);
4446 retry:
4447 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4448 pmap = PV_PMAP(pv);
4449 if (!PMAP_TRYLOCK(pmap)) {
4450 pvh_gen = pvh->pv_gen;
4451 rw_wunlock(lock);
4452 PMAP_LOCK(pmap);
4453 rw_wlock(lock);
4454 if (pvh_gen != pvh->pv_gen) {
4455 PMAP_UNLOCK(pmap);
4456 goto retry;
4457 }
4458 }
4459 va = pv->pv_va;
4460 pte = pmap_pte_exists(pmap, va, 2, __func__);
4461 pmap_demote_l2_locked(pmap, pte, va, &lock);
4462 PMAP_UNLOCK(pmap);
4463 }
4464 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4465 pmap = PV_PMAP(pv);
4466 if (!PMAP_TRYLOCK(pmap)) {
4467 pvh_gen = pvh->pv_gen;
4468 md_gen = m->md.pv_gen;
4469 rw_wunlock(lock);
4470 PMAP_LOCK(pmap);
4471 rw_wlock(lock);
4472 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4473 PMAP_UNLOCK(pmap);
4474 goto retry;
4475 }
4476 }
4477 pmap_resident_count_dec(pmap, 1);
4478
4479 pde = pmap_pde(pmap, pv->pv_va, &lvl);
4480 KASSERT(pde != NULL,
4481 ("pmap_remove_all: no page directory entry found"));
4482 KASSERT(lvl == 2,
4483 ("pmap_remove_all: invalid pde level %d", lvl));
4484 tpde = pmap_load(pde);
4485
4486 pte = pmap_l2_to_l3(pde, pv->pv_va);
4487 tpte = pmap_load(pte);
4488 if ((tpte & ATTR_CONTIGUOUS) != 0)
4489 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4490 tpte = pmap_load_clear(pte);
4491 if (tpte & ATTR_SW_WIRED)
4492 pmap->pm_stats.wired_count--;
4493 if ((tpte & ATTR_AF) != 0) {
4494 pmap_invalidate_page(pmap, pv->pv_va, true);
4495 vm_page_aflag_set(m, PGA_REFERENCED);
4496 }
4497
4498 /*
4499 * Update the vm_page_t clean and reference bits.
4500 */
4501 if (pmap_pte_dirty(pmap, tpte))
4502 vm_page_dirty(m);
4503 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4504 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4505 m->md.pv_gen++;
4506 free_pv_entry(pmap, pv);
4507 PMAP_UNLOCK(pmap);
4508 }
4509 vm_page_aflag_clear(m, PGA_WRITEABLE);
4510 rw_wunlock(lock);
4511 vm_page_free_pages_toq(&free, true);
4512 }
4513
4514 /*
4515 * Masks and sets bits in a level 2 page table entries in the specified pmap
4516 */
4517 static void
pmap_protect_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pt_entry_t mask,pt_entry_t nbits)4518 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4519 pt_entry_t nbits)
4520 {
4521 pd_entry_t old_l2;
4522 vm_page_t m, mt;
4523
4524 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4525 PMAP_ASSERT_STAGE1(pmap);
4526 KASSERT((sva & L2_OFFSET) == 0,
4527 ("pmap_protect_l2: sva is not 2mpage aligned"));
4528 old_l2 = pmap_load(l2);
4529 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4530 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4531
4532 /*
4533 * Return if the L2 entry already has the desired access restrictions
4534 * in place.
4535 */
4536 if ((old_l2 & mask) == nbits)
4537 return;
4538
4539 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4540 cpu_spinwait();
4541
4542 /*
4543 * When a dirty read/write superpage mapping is write protected,
4544 * update the dirty field of each of the superpage's constituent 4KB
4545 * pages.
4546 */
4547 if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4548 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4549 pmap_pte_dirty(pmap, old_l2)) {
4550 m = PTE_TO_VM_PAGE(old_l2);
4551 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4552 vm_page_dirty(mt);
4553 }
4554
4555 /*
4556 * Since a promotion must break the 4KB page mappings before making
4557 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4558 */
4559 pmap_s1_invalidate_page(pmap, sva, true);
4560 }
4561
4562 /*
4563 * Masks and sets bits in the specified L3C superpage mapping.
4564 *
4565 * Requests TLB invalidations to be performed by the caller through the
4566 * returned "*vap".
4567 */
4568 static void
pmap_mask_set_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,vm_offset_t * vap,vm_offset_t va_next,pt_entry_t mask,pt_entry_t nbits)4569 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4570 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4571 {
4572 pt_entry_t l3e, *tl3p;
4573 vm_page_t m, mt;
4574 bool dirty;
4575
4576 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4577 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4578 0, ("pmap_mask_set_l3c: l3p is not aligned"));
4579 KASSERT((va & L3C_OFFSET) == 0,
4580 ("pmap_mask_set_l3c: va is not aligned"));
4581 dirty = false;
4582 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4583 l3e = pmap_load(tl3p);
4584 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4585 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4586 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4587 cpu_spinwait();
4588 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4589 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4590 dirty = true;
4591 }
4592
4593 /*
4594 * When a dirty read/write superpage mapping is write protected,
4595 * update the dirty field of each of the superpage's constituent 4KB
4596 * pages.
4597 */
4598 if ((l3e & ATTR_SW_MANAGED) != 0 &&
4599 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4600 dirty) {
4601 m = PTE_TO_VM_PAGE(pmap_load(l3p));
4602 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4603 vm_page_dirty(mt);
4604 }
4605
4606 if (*vap == va_next)
4607 *vap = va;
4608 }
4609
4610 /*
4611 * Masks and sets bits in last level page table entries in the specified
4612 * pmap and range
4613 */
4614 static void
pmap_mask_set_locked(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t mask,pt_entry_t nbits,bool invalidate)4615 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4616 pt_entry_t nbits, bool invalidate)
4617 {
4618 vm_offset_t va, va_next;
4619 pd_entry_t *l0, *l1, *l2;
4620 pt_entry_t *l3p, l3;
4621
4622 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4623 for (; sva < eva; sva = va_next) {
4624 l0 = pmap_l0(pmap, sva);
4625 if (pmap_load(l0) == 0) {
4626 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4627 if (va_next < sva)
4628 va_next = eva;
4629 continue;
4630 }
4631
4632 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4633 if (va_next < sva)
4634 va_next = eva;
4635 l1 = pmap_l0_to_l1(l0, sva);
4636 if (pmap_load(l1) == 0)
4637 continue;
4638 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4639 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4640 KASSERT(va_next <= eva,
4641 ("partial update of non-transparent 1G page "
4642 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4643 pmap_load(l1), sva, eva, va_next));
4644 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4645 if ((pmap_load(l1) & mask) != nbits) {
4646 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4647 if (invalidate)
4648 pmap_s1_invalidate_page(pmap, sva, true);
4649 }
4650 continue;
4651 }
4652
4653 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4654 if (va_next < sva)
4655 va_next = eva;
4656
4657 l2 = pmap_l1_to_l2(l1, sva);
4658 if (pmap_load(l2) == 0)
4659 continue;
4660
4661 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4662 if (sva + L2_SIZE == va_next && eva >= va_next) {
4663 pmap_protect_l2(pmap, l2, sva, mask, nbits);
4664 continue;
4665 } else if ((pmap_load(l2) & mask) == nbits ||
4666 pmap_demote_l2(pmap, l2, sva) == NULL)
4667 continue;
4668 }
4669 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4670 ("pmap_protect: Invalid L2 entry after demotion"));
4671
4672 if (va_next > eva)
4673 va_next = eva;
4674
4675 va = va_next;
4676 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4677 sva += L3_SIZE) {
4678 l3 = pmap_load(l3p);
4679
4680 /*
4681 * Go to the next L3 entry if the current one is
4682 * invalid or already has the desired access
4683 * restrictions in place. (The latter case occurs
4684 * frequently. For example, in a "buildworld"
4685 * workload, almost 1 out of 4 L3 entries already
4686 * have the desired restrictions.)
4687 */
4688 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4689 if (va != va_next) {
4690 if (invalidate)
4691 pmap_s1_invalidate_range(pmap,
4692 va, sva, true);
4693 va = va_next;
4694 }
4695 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4696 /*
4697 * Does this L3C page extend beyond
4698 * the requested range? Handle the
4699 * possibility that "va_next" is zero.
4700 */
4701 if ((sva | L3C_OFFSET) > va_next - 1)
4702 break;
4703
4704 /*
4705 * Skip ahead to the last L3_PAGE
4706 * within this L3C page.
4707 */
4708 l3p = (pt_entry_t *)((uintptr_t)l3p |
4709 ((L3C_ENTRIES - 1) *
4710 sizeof(pt_entry_t)));
4711 sva |= L3C_SIZE - L3_SIZE;
4712 }
4713 continue;
4714 }
4715
4716 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4717 /*
4718 * Is this entire set of contiguous L3 entries
4719 * being protected? Handle the possibility
4720 * that "va_next" is zero because of address
4721 * wraparound.
4722 */
4723 if ((sva & L3C_OFFSET) == 0 &&
4724 sva + L3C_OFFSET <= va_next - 1) {
4725 pmap_mask_set_l3c(pmap, l3p, sva, &va,
4726 va_next, mask, nbits);
4727 l3p += L3C_ENTRIES - 1;
4728 sva += L3C_SIZE - L3_SIZE;
4729 continue;
4730 }
4731
4732 (void)pmap_demote_l3c(pmap, l3p, sva);
4733
4734 /*
4735 * The L3 entry's accessed bit may have changed.
4736 */
4737 l3 = pmap_load(l3p);
4738 }
4739 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4740 nbits))
4741 cpu_spinwait();
4742
4743 /*
4744 * When a dirty read/write mapping is write protected,
4745 * update the page's dirty field.
4746 */
4747 if ((l3 & ATTR_SW_MANAGED) != 0 &&
4748 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4749 pmap_pte_dirty(pmap, l3))
4750 vm_page_dirty(PTE_TO_VM_PAGE(l3));
4751
4752 if (va == va_next)
4753 va = sva;
4754 }
4755 if (va != va_next && invalidate)
4756 pmap_s1_invalidate_range(pmap, va, sva, true);
4757 }
4758 }
4759
4760 static void
pmap_mask_set(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t mask,pt_entry_t nbits,bool invalidate)4761 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4762 pt_entry_t nbits, bool invalidate)
4763 {
4764 PMAP_LOCK(pmap);
4765 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4766 PMAP_UNLOCK(pmap);
4767 }
4768
4769 /*
4770 * Set the physical protection on the
4771 * specified range of this map as requested.
4772 */
4773 void
pmap_protect(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_prot_t prot)4774 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4775 {
4776 pt_entry_t mask, nbits;
4777
4778 PMAP_ASSERT_STAGE1(pmap);
4779 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4780 if (prot == VM_PROT_NONE) {
4781 pmap_remove(pmap, sva, eva);
4782 return;
4783 }
4784
4785 mask = nbits = 0;
4786 if ((prot & VM_PROT_WRITE) == 0) {
4787 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4788 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4789 }
4790 if ((prot & VM_PROT_EXECUTE) == 0) {
4791 mask |= ATTR_S1_XN;
4792 nbits |= ATTR_S1_XN;
4793 }
4794 if (pmap == kernel_pmap) {
4795 mask |= ATTR_KERN_GP;
4796 nbits |= ATTR_KERN_GP;
4797 }
4798 if (mask == 0)
4799 return;
4800
4801 pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4802 }
4803
4804 void
pmap_disable_promotion(vm_offset_t sva,vm_size_t size)4805 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4806 {
4807
4808 MPASS((sva & L3_OFFSET) == 0);
4809 MPASS(((sva + size) & L3_OFFSET) == 0);
4810
4811 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4812 ATTR_SW_NO_PROMOTE, false);
4813 }
4814
4815 /*
4816 * Inserts the specified page table page into the specified pmap's collection
4817 * of idle page table pages. Each of a pmap's page table pages is responsible
4818 * for mapping a distinct range of virtual addresses. The pmap's collection is
4819 * ordered by this virtual address range.
4820 *
4821 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4822 * "mpte"'s valid field will be set to 0.
4823 *
4824 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4825 * contain valid mappings with identical attributes except for ATTR_AF;
4826 * "mpte"'s valid field will be set to 1.
4827 *
4828 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4829 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4830 * field will be set to VM_PAGE_BITS_ALL.
4831 */
4832 static __inline int
pmap_insert_pt_page(pmap_t pmap,vm_page_t mpte,bool promoted,bool all_l3e_AF_set)4833 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4834 bool all_l3e_AF_set)
4835 {
4836
4837 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4838 KASSERT(promoted || !all_l3e_AF_set,
4839 ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4840 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4841 return (vm_radix_insert(&pmap->pm_root, mpte));
4842 }
4843
4844 /*
4845 * Removes the page table page mapping the specified virtual address from the
4846 * specified pmap's collection of idle page table pages, and returns it.
4847 * Otherwise, returns NULL if there is no page table page corresponding to the
4848 * specified virtual address.
4849 */
4850 static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap,vm_offset_t va)4851 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4852 {
4853
4854 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4855 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4856 }
4857
4858 /*
4859 * Performs a break-before-make update of a pmap entry. This is needed when
4860 * either promoting or demoting pages to ensure the TLB doesn't get into an
4861 * inconsistent state.
4862 */
4863 static void
pmap_update_entry(pmap_t pmap,pd_entry_t * ptep,pd_entry_t newpte,vm_offset_t va,vm_size_t size)4864 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4865 vm_offset_t va, vm_size_t size)
4866 {
4867 register_t intr;
4868
4869 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4870 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4871 ("%s: Updating non-promote pte", __func__));
4872
4873 /*
4874 * Ensure we don't get switched out with the page table in an
4875 * inconsistent state. We also need to ensure no interrupts fire
4876 * as they may make use of an address we are about to invalidate.
4877 */
4878 intr = intr_disable();
4879
4880 /*
4881 * Clear the old mapping's valid bit, but leave the rest of the entry
4882 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4883 * lookup the physical address.
4884 */
4885 pmap_clear_bits(ptep, ATTR_DESCR_VALID);
4886
4887 /*
4888 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4889 * be cached, so we invalidate intermediate entries as well as final
4890 * entries.
4891 */
4892 pmap_s1_invalidate_range(pmap, va, va + size, false);
4893
4894 /* Create the new mapping */
4895 pmap_store(ptep, newpte);
4896 dsb(ishst);
4897
4898 intr_restore(intr);
4899 }
4900
4901 /*
4902 * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping.
4903 */
4904 static void __nosanitizecoverage
pmap_update_strided(pmap_t pmap,pd_entry_t * ptep,pd_entry_t * ptep_end,pd_entry_t newpte,vm_offset_t va,vm_offset_t stride,vm_size_t size)4905 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end,
4906 pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size)
4907 {
4908 pd_entry_t *lip;
4909 register_t intr;
4910
4911 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4912 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4913 ("%s: Updating non-promote pte", __func__));
4914
4915 /*
4916 * Ensure we don't get switched out with the page table in an
4917 * inconsistent state. We also need to ensure no interrupts fire
4918 * as they may make use of an address we are about to invalidate.
4919 */
4920 intr = intr_disable();
4921
4922 /*
4923 * Clear the old mapping's valid bits, but leave the rest of each
4924 * entry unchanged, so that a lockless, concurrent pmap_kextract() can
4925 * still lookup the physical address.
4926 */
4927 for (lip = ptep; lip < ptep_end; lip++)
4928 pmap_clear_bits(lip, ATTR_DESCR_VALID);
4929
4930 /* Only final entries are changing. */
4931 pmap_s1_invalidate_strided(pmap, va, va + size, stride, true);
4932
4933 /* Create the new mapping. */
4934 for (lip = ptep; lip < ptep_end; lip++) {
4935 pmap_store(lip, newpte);
4936 newpte += stride;
4937 }
4938 dsb(ishst);
4939
4940 intr_restore(intr);
4941 }
4942
4943 #if VM_NRESERVLEVEL > 0
4944 /*
4945 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4946 * replace the many pv entries for the 4KB page mappings by a single pv entry
4947 * for the 2MB page mapping.
4948 */
4949 static void
pmap_pv_promote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)4950 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4951 struct rwlock **lockp)
4952 {
4953 struct md_page *pvh;
4954 pv_entry_t pv;
4955 vm_offset_t va_last;
4956 vm_page_t m;
4957
4958 KASSERT((pa & L2_OFFSET) == 0,
4959 ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4960 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4961
4962 /*
4963 * Transfer the first page's pv entry for this mapping to the 2mpage's
4964 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
4965 * a transfer avoids the possibility that get_pv_entry() calls
4966 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4967 * mappings that is being promoted.
4968 */
4969 m = PHYS_TO_VM_PAGE(pa);
4970 va = va & ~L2_OFFSET;
4971 pv = pmap_pvh_remove(&m->md, pmap, va);
4972 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4973 pvh = page_to_pvh(m);
4974 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4975 pvh->pv_gen++;
4976 /* Free the remaining NPTEPG - 1 pv entries. */
4977 va_last = va + L2_SIZE - PAGE_SIZE;
4978 do {
4979 m++;
4980 va += PAGE_SIZE;
4981 pmap_pvh_free(&m->md, pmap, va);
4982 } while (va < va_last);
4983 }
4984
4985 /*
4986 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4987 * single level 2 table entry to a single 2MB page mapping. For promotion
4988 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4989 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4990 * identical characteristics.
4991 */
4992 static bool
pmap_promote_l2(pmap_t pmap,pd_entry_t * l2,vm_offset_t va,vm_page_t mpte,struct rwlock ** lockp)4993 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4994 struct rwlock **lockp)
4995 {
4996 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4997
4998 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4999
5000 /*
5001 * Currently, this function only supports promotion on stage 1 pmaps
5002 * because it tests stage 1 specific fields and performs a break-
5003 * before-make sequence that is incorrect for stage 2 pmaps.
5004 */
5005 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
5006 return (false);
5007
5008 /*
5009 * Examine the first L3E in the specified PTP. Abort if this L3E is
5010 * ineligible for promotion...
5011 */
5012 firstl3 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
5013 newl2 = pmap_load(firstl3);
5014 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
5015 return (false);
5016 /* ... is not the first physical page within an L2 block */
5017 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
5018 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
5019 counter_u64_add(pmap_l2_p_failures, 1);
5020 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
5021 " in pmap %p", va, pmap);
5022 return (false);
5023 }
5024
5025 /*
5026 * Both here and in the below "for" loop, to allow for repromotion
5027 * after MADV_FREE, conditionally write protect a clean L3E before
5028 * possibly aborting the promotion due to other L3E attributes. Why?
5029 * Suppose that MADV_FREE is applied to a part of a superpage, the
5030 * address range [S, E). pmap_advise() will demote the superpage
5031 * mapping, destroy the 4KB page mapping at the end of [S, E), and
5032 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later,
5033 * imagine that the memory in [S, E) is recycled, but the last 4KB
5034 * page in [S, E) is not the last to be rewritten, or simply accessed.
5035 * In other words, there is still a 4KB page in [S, E), call it P,
5036 * that is writeable but AP_RO is set and AF is clear in P's L3E.
5037 * Unless we write protect P before aborting the promotion, if and
5038 * when P is finally rewritten, there won't be a page fault to trigger
5039 * repromotion.
5040 */
5041 setl2:
5042 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5043 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5044 /*
5045 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
5046 * ATTR_SW_DBM can be cleared without a TLB invalidation.
5047 */
5048 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
5049 goto setl2;
5050 newl2 &= ~ATTR_SW_DBM;
5051 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
5052 " in pmap %p", va & ~L2_OFFSET, pmap);
5053 }
5054
5055 /*
5056 * Examine each of the other L3Es in the specified PTP. Abort if this
5057 * L3E maps an unexpected 4KB physical page or does not have identical
5058 * characteristics to the first L3E. If ATTR_AF is not set in every
5059 * PTE, then request that the PTP be refilled on demotion.
5060 */
5061 all_l3e_AF = newl2 & ATTR_AF;
5062 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
5063 + L2_SIZE - PAGE_SIZE;
5064 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
5065 oldl3 = pmap_load(l3);
5066 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
5067 counter_u64_add(pmap_l2_p_failures, 1);
5068 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
5069 " in pmap %p", va, pmap);
5070 return (false);
5071 }
5072 setl3:
5073 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5074 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5075 /*
5076 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
5077 * set, ATTR_SW_DBM can be cleared without a TLB
5078 * invalidation.
5079 */
5080 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
5081 ~ATTR_SW_DBM))
5082 goto setl3;
5083 oldl3 &= ~ATTR_SW_DBM;
5084 }
5085 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
5086 counter_u64_add(pmap_l2_p_failures, 1);
5087 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
5088 " in pmap %p", va, pmap);
5089 return (false);
5090 }
5091 all_l3e_AF &= oldl3;
5092 pa -= PAGE_SIZE;
5093 }
5094
5095 /*
5096 * Unless all PTEs have ATTR_AF set, clear it from the superpage
5097 * mapping, so that promotions triggered by speculative mappings,
5098 * such as pmap_enter_quick(), don't automatically mark the
5099 * underlying pages as referenced.
5100 */
5101 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
5102
5103 /*
5104 * Save the page table page in its current state until the L2
5105 * mapping the superpage is demoted by pmap_demote_l2() or
5106 * destroyed by pmap_remove_l3().
5107 */
5108 if (mpte == NULL)
5109 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5110 KASSERT(mpte >= vm_page_array &&
5111 mpte < &vm_page_array[vm_page_array_size],
5112 ("pmap_promote_l2: page table page is out of range"));
5113 KASSERT(mpte->pindex == pmap_l2_pindex(va),
5114 ("pmap_promote_l2: page table page's pindex is wrong"));
5115 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
5116 counter_u64_add(pmap_l2_p_failures, 1);
5117 CTR2(KTR_PMAP,
5118 "pmap_promote_l2: failure for va %#lx in pmap %p", va,
5119 pmap);
5120 return (false);
5121 }
5122
5123 if ((newl2 & ATTR_SW_MANAGED) != 0)
5124 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
5125
5126 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
5127
5128 counter_u64_add(pmap_l2_promotions, 1);
5129 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
5130 pmap);
5131 return (true);
5132 }
5133
5134 /*
5135 * Tries to promote an aligned, contiguous set of base page mappings to a
5136 * single L3C page mapping. For promotion to occur, two conditions must be
5137 * met: (1) the base page mappings must map aligned, contiguous physical
5138 * memory and (2) the base page mappings must have identical characteristics
5139 * except for the accessed flag.
5140 */
5141 static bool
pmap_promote_l3c(pmap_t pmap,pd_entry_t * l3p,vm_offset_t va)5142 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
5143 {
5144 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
5145
5146 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5147
5148 /*
5149 * Currently, this function only supports promotion on stage 1 pmaps
5150 * because it tests stage 1 specific fields and performs a break-
5151 * before-make sequence that is incorrect for stage 2 pmaps.
5152 */
5153 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
5154 return (false);
5155
5156 /*
5157 * Compute the address of the first L3 entry in the superpage
5158 * candidate.
5159 */
5160 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
5161 sizeof(pt_entry_t)) - 1));
5162
5163 firstl3c = pmap_load(l3p);
5164
5165 /*
5166 * Examine the first L3 entry. Abort if this L3E is ineligible for
5167 * promotion...
5168 */
5169 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
5170 return (false);
5171 /* ...is not properly aligned... */
5172 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
5173 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
5174 counter_u64_add(pmap_l3c_p_failures, 1);
5175 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5176 " in pmap %p", va, pmap);
5177 return (false);
5178 }
5179
5180 /*
5181 * If the first L3 entry is a clean read-write mapping, convert it
5182 * to a read-only mapping. See pmap_promote_l2() for the rationale.
5183 */
5184 set_first:
5185 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5186 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5187 /*
5188 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
5189 * ATTR_SW_DBM can be cleared without a TLB invalidation.
5190 */
5191 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
5192 goto set_first;
5193 firstl3c &= ~ATTR_SW_DBM;
5194 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
5195 " in pmap %p", va & ~L3C_OFFSET, pmap);
5196 }
5197
5198 /*
5199 * Check that the rest of the L3 entries are compatible with the first,
5200 * and convert clean read-write mappings to read-only mappings.
5201 */
5202 all_l3e_AF = firstl3c & ATTR_AF;
5203 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
5204 L3C_SIZE - PAGE_SIZE;
5205 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
5206 oldl3 = pmap_load(l3);
5207 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
5208 counter_u64_add(pmap_l3c_p_failures, 1);
5209 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5210 " in pmap %p", va, pmap);
5211 return (false);
5212 }
5213 set_l3:
5214 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
5215 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
5216 /*
5217 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
5218 * set, ATTR_SW_DBM can be cleared without a TLB
5219 * invalidation.
5220 */
5221 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
5222 ~ATTR_SW_DBM))
5223 goto set_l3;
5224 oldl3 &= ~ATTR_SW_DBM;
5225 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
5226 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
5227 (va & ~L3C_OFFSET), pmap);
5228 }
5229 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
5230 counter_u64_add(pmap_l3c_p_failures, 1);
5231 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
5232 " in pmap %p", va, pmap);
5233 return (false);
5234 }
5235 all_l3e_AF &= oldl3;
5236 pa -= PAGE_SIZE;
5237 }
5238
5239 /*
5240 * Unless all PTEs have ATTR_AF set, clear it from the superpage
5241 * mapping, so that promotions triggered by speculative mappings,
5242 * such as pmap_enter_quick(), don't automatically mark the
5243 * underlying pages as referenced.
5244 */
5245 firstl3c &= ~ATTR_AF | all_l3e_AF;
5246
5247 /*
5248 * Remake the mappings with the contiguous bit set.
5249 */
5250 pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c |
5251 ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE);
5252
5253 counter_u64_add(pmap_l3c_promotions, 1);
5254 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
5255 pmap);
5256 return (true);
5257 }
5258 #endif /* VM_NRESERVLEVEL > 0 */
5259
5260 static int
pmap_enter_largepage(pmap_t pmap,vm_offset_t va,pt_entry_t pte,int flags,int psind)5261 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
5262 int psind)
5263 {
5264 pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p;
5265 vm_page_t mp;
5266
5267 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5268 KASSERT(psind > 0 && psind < MAXPAGESIZES,
5269 ("psind %d unexpected", psind));
5270 KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0,
5271 ("unaligned phys address %#lx pte %#lx psind %d",
5272 PTE_TO_PHYS(pte), pte, psind));
5273
5274 restart:
5275 newpte = pte;
5276 if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
5277 return (KERN_PROTECTION_FAILURE);
5278 if (psind == 3) {
5279 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5280
5281 KASSERT(pagesizes[psind] == L1_SIZE,
5282 ("pagesizes[%d] != L1_SIZE", psind));
5283 l0p = pmap_l0(pmap, va);
5284 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
5285 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
5286 if (mp == NULL) {
5287 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5288 return (KERN_RESOURCE_SHORTAGE);
5289 PMAP_UNLOCK(pmap);
5290 vm_wait(NULL);
5291 PMAP_LOCK(pmap);
5292 goto restart;
5293 }
5294 l1p = pmap_l0_to_l1(l0p, va);
5295 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5296 origpte = pmap_load(l1p);
5297 } else {
5298 l1p = pmap_l0_to_l1(l0p, va);
5299 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
5300 origpte = pmap_load(l1p);
5301 if ((origpte & ATTR_DESCR_VALID) == 0) {
5302 mp = PTE_TO_VM_PAGE(pmap_load(l0p));
5303 mp->ref_count++;
5304 }
5305 }
5306 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
5307 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
5308 (origpte & ATTR_DESCR_VALID) == 0,
5309 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
5310 va, origpte, newpte));
5311 pmap_store(l1p, newpte);
5312 } else if (psind == 2) {
5313 KASSERT(pagesizes[psind] == L2_SIZE,
5314 ("pagesizes[%d] != L2_SIZE", psind));
5315 l2p = pmap_l2(pmap, va);
5316 if (l2p == NULL) {
5317 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
5318 if (mp == NULL) {
5319 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5320 return (KERN_RESOURCE_SHORTAGE);
5321 PMAP_UNLOCK(pmap);
5322 vm_wait(NULL);
5323 PMAP_LOCK(pmap);
5324 goto restart;
5325 }
5326 l2p = VM_PAGE_TO_DMAP(mp);
5327 l2p = &l2p[pmap_l2_index(va)];
5328 origpte = pmap_load(l2p);
5329 } else {
5330 l1p = pmap_l1(pmap, va);
5331 origpte = pmap_load(l2p);
5332 if ((origpte & ATTR_DESCR_VALID) == 0) {
5333 mp = PTE_TO_VM_PAGE(pmap_load(l1p));
5334 mp->ref_count++;
5335 }
5336 }
5337 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5338 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
5339 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5340 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
5341 va, origpte, newpte));
5342 pmap_store(l2p, newpte);
5343 } else /* (psind == 1) */ {
5344 KASSERT(pagesizes[psind] == L3C_SIZE,
5345 ("pagesizes[%d] != L3C_SIZE", psind));
5346 l2p = pmap_l2(pmap, va);
5347 if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) {
5348 mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
5349 if (mp == NULL) {
5350 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5351 return (KERN_RESOURCE_SHORTAGE);
5352 PMAP_UNLOCK(pmap);
5353 vm_wait(NULL);
5354 PMAP_LOCK(pmap);
5355 goto restart;
5356 }
5357 mp->ref_count += L3C_ENTRIES - 1;
5358 l3p = VM_PAGE_TO_DMAP(mp);
5359 l3p = &l3p[pmap_l3_index(va)];
5360 } else {
5361 l3p = pmap_l2_to_l3(l2p, va);
5362 if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) {
5363 mp = PTE_TO_VM_PAGE(pmap_load(l2p));
5364 mp->ref_count += L3C_ENTRIES;
5365 }
5366 }
5367 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5368 origpte = pmap_load(tl3p);
5369 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5370 ((origpte & ATTR_CONTIGUOUS) != 0 &&
5371 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5372 ("va %#lx changing 64K phys page l3 %#lx newpte %#lx",
5373 va, origpte, newpte));
5374 pmap_store(tl3p, newpte);
5375 newpte += L3_SIZE;
5376 }
5377 }
5378 dsb(ishst);
5379
5380 if ((origpte & ATTR_DESCR_VALID) == 0)
5381 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
5382 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
5383 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
5384 else if ((newpte & ATTR_SW_WIRED) == 0 &&
5385 (origpte & ATTR_SW_WIRED) != 0)
5386 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
5387
5388 return (KERN_SUCCESS);
5389 }
5390
5391 static void
pmap_set_unprotected(pt_entry_t new_l3)5392 pmap_set_unprotected(pt_entry_t new_l3)
5393 {
5394 vm_paddr_t pa;
5395
5396 pa = PTE_TO_PHYS(new_l3) & ~prot_ns_shared_pa;
5397
5398 rsi_set_addr_range_state(pa, pa + L3_SIZE, RSI_RIPAS_EMPTY,
5399 RSI_CHANGE_DESTROYED, NULL);
5400 }
5401
5402 static void
pmap_set_protected(pt_entry_t old_l3)5403 pmap_set_protected(pt_entry_t old_l3)
5404 {
5405 vm_paddr_t pa;
5406
5407 pa = PTE_TO_PHYS(old_l3) & ~prot_ns_shared_pa;
5408
5409 rsi_set_addr_range_state(pa, pa + L3_SIZE, RSI_RIPAS_RAM,
5410 RSI_CHANGE_DESTROYED, NULL);
5411 }
5412
5413 /*
5414 * Insert the given physical page (p) at
5415 * the specified virtual address (v) in the
5416 * target physical map with the protection requested.
5417 *
5418 * If specified, the page will be wired down, meaning
5419 * that the related pte can not be reclaimed.
5420 *
5421 * NB: This is the only routine which MAY NOT lazy-evaluate
5422 * or lose information. That is, this routine must actually
5423 * insert this page into the given map NOW.
5424 */
5425 int
pmap_enter(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,u_int flags,int8_t psind)5426 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5427 u_int flags, int8_t psind)
5428 {
5429 struct rwlock *lock;
5430 pd_entry_t *pde;
5431 pt_entry_t new_l3, orig_l3;
5432 pt_entry_t *l2, *l3;
5433 pv_entry_t pv;
5434 vm_paddr_t opa, pa;
5435 vm_page_t mpte, om;
5436 bool nosleep;
5437 int full_lvl, lvl, rv;
5438
5439 KASSERT(ADDR_IS_CANONICAL(va),
5440 ("%s: Address not in canonical form: %lx", __func__, va));
5441
5442 va = trunc_page(va);
5443 if ((m->oflags & VPO_UNMANAGED) == 0)
5444 VM_PAGE_OBJECT_BUSY_ASSERT(m);
5445 pa = VM_PAGE_TO_PHYS(m);
5446 if (in_realm() && (flags & PMAP_ENTER_UNPROTECTED) != 0)
5447 pa |= prot_ns_shared_pa;
5448 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
5449 L3_PAGE);
5450 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5451 new_l3 |= pmap_pte_prot(pmap, prot);
5452 if ((flags & PMAP_ENTER_WIRED) != 0)
5453 new_l3 |= ATTR_SW_WIRED;
5454 if (pmap->pm_stage == PM_STAGE1) {
5455 if (ADDR_IS_USER(va))
5456 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5457 else
5458 new_l3 |= ATTR_S1_UXN;
5459 if (pmap != kernel_pmap)
5460 new_l3 |= ATTR_S1_nG;
5461 } else {
5462 /*
5463 * Clear the access flag on executable mappings, this will be
5464 * set later when the page is accessed. The fault handler is
5465 * required to invalidate the I-cache.
5466 *
5467 * TODO: Switch to the valid flag to allow hardware management
5468 * of the access flag. Much of the pmap code assumes the
5469 * valid flag is set and fails to destroy the old page tables
5470 * correctly if it is clear.
5471 */
5472 if (prot & VM_PROT_EXECUTE)
5473 new_l3 &= ~ATTR_AF;
5474 }
5475 if ((m->oflags & VPO_UNMANAGED) == 0) {
5476 new_l3 |= ATTR_SW_MANAGED;
5477 if ((prot & VM_PROT_WRITE) != 0) {
5478 new_l3 |= ATTR_SW_DBM;
5479 if ((flags & VM_PROT_WRITE) == 0) {
5480 if (pmap->pm_stage == PM_STAGE1)
5481 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5482 else
5483 new_l3 &=
5484 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5485 }
5486 }
5487 }
5488
5489 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5490
5491 lock = NULL;
5492 PMAP_LOCK(pmap);
5493 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5494 KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5495 ("managed largepage va %#lx flags %#x", va, flags));
5496 if (psind == 3) {
5497 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5498 new_l3 &= ~L3_PAGE;
5499 new_l3 |= L1_BLOCK;
5500 } else if (psind == 2) {
5501 new_l3 &= ~L3_PAGE;
5502 new_l3 |= L2_BLOCK;
5503 } else /* (psind == 1) */
5504 new_l3 |= ATTR_CONTIGUOUS;
5505 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5506 goto out;
5507 }
5508 if (psind == 2) {
5509 /* Assert the required virtual and physical alignment. */
5510 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5511 KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind"));
5512 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5513 flags, m, &lock);
5514 goto out;
5515 }
5516 mpte = NULL;
5517 if (psind == 1) {
5518 KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned"));
5519 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5520 rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags,
5521 m, &mpte, &lock);
5522 #if VM_NRESERVLEVEL > 0
5523 /*
5524 * Attempt L2 promotion, if both the PTP and a level 1
5525 * reservation are fully populated.
5526 */
5527 if (rv == KERN_SUCCESS &&
5528 (mpte == NULL || mpte->ref_count == NL3PG) &&
5529 (m->flags & PG_FICTITIOUS) == 0 &&
5530 vm_reserv_level_iffullpop(m) == 1) {
5531 pde = pmap_l2(pmap, va);
5532 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5533 }
5534 #endif
5535 goto out;
5536 }
5537
5538 /*
5539 * In the case that a page table page is not
5540 * resident, we are creating it here.
5541 */
5542 retry:
5543 pde = pmap_pde(pmap, va, &lvl);
5544 if (pde != NULL && lvl == 2) {
5545 l3 = pmap_l2_to_l3(pde, va);
5546 if (ADDR_IS_USER(va) && mpte == NULL) {
5547 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5548 mpte->ref_count++;
5549 }
5550 goto havel3;
5551 } else if (pde != NULL && lvl == 1) {
5552 l2 = pmap_l1_to_l2(pde, va);
5553 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5554 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5555 l3 = &l3[pmap_l3_index(va)];
5556 if (ADDR_IS_USER(va)) {
5557 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5558 mpte->ref_count++;
5559 }
5560 goto havel3;
5561 }
5562 /* We need to allocate an L3 table. */
5563 }
5564 if (ADDR_IS_USER(va)) {
5565 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5566
5567 /*
5568 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5569 * to handle the possibility that a superpage mapping for "va"
5570 * was created while we slept.
5571 */
5572 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5573 nosleep ? NULL : &lock);
5574 if (mpte == NULL && nosleep) {
5575 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5576 rv = KERN_RESOURCE_SHORTAGE;
5577 goto out;
5578 }
5579 goto retry;
5580 } else
5581 panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5582
5583 havel3:
5584 orig_l3 = pmap_load(l3);
5585 opa = PTE_TO_PHYS(orig_l3);
5586 pv = NULL;
5587 new_l3 |= pmap_pte_bti(pmap, va);
5588
5589 /*
5590 * Is the specified virtual address already mapped?
5591 */
5592 if (pmap_l3_valid(orig_l3)) {
5593 /*
5594 * Wiring change, just update stats. We don't worry about
5595 * wiring PT pages as they remain resident as long as there
5596 * are valid mappings in them. Hence, if a user page is wired,
5597 * the PT page will be also.
5598 */
5599 if ((flags & PMAP_ENTER_WIRED) != 0 &&
5600 (orig_l3 & ATTR_SW_WIRED) == 0)
5601 pmap->pm_stats.wired_count++;
5602 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5603 (orig_l3 & ATTR_SW_WIRED) != 0)
5604 pmap->pm_stats.wired_count--;
5605
5606 /*
5607 * Remove the extra PT page reference.
5608 */
5609 if (mpte != NULL) {
5610 mpte->ref_count--;
5611 KASSERT(mpte->ref_count > 0,
5612 ("pmap_enter: missing reference to page table page,"
5613 " va: 0x%lx", va));
5614 }
5615
5616 /*
5617 * Has the physical page changed?
5618 */
5619 if (opa == pa) {
5620 /*
5621 * No, might be a protection or wiring change.
5622 */
5623 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5624 (new_l3 & ATTR_SW_DBM) != 0)
5625 vm_page_aflag_set(m, PGA_WRITEABLE);
5626 goto validate;
5627 }
5628
5629 /*
5630 * The physical page has changed. Temporarily invalidate
5631 * the mapping.
5632 */
5633 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5634 (void)pmap_demote_l3c(pmap, l3, va);
5635 orig_l3 = pmap_load_clear(l3);
5636 KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5637 ("pmap_enter: unexpected pa update for %#lx", va));
5638 if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5639 om = PHYS_TO_VM_PAGE(opa);
5640
5641 /*
5642 * The pmap lock is sufficient to synchronize with
5643 * concurrent calls to pmap_page_test_mappings() and
5644 * pmap_ts_referenced().
5645 */
5646 if (pmap_pte_dirty(pmap, orig_l3))
5647 vm_page_dirty(om);
5648 if ((orig_l3 & ATTR_AF) != 0) {
5649 pmap_invalidate_page(pmap, va, true);
5650 vm_page_aflag_set(om, PGA_REFERENCED);
5651 }
5652 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
5653 pv = pmap_pvh_remove(&om->md, pmap, va);
5654 if ((m->oflags & VPO_UNMANAGED) != 0)
5655 free_pv_entry(pmap, pv);
5656 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5657 TAILQ_EMPTY(&om->md.pv_list) &&
5658 ((om->flags & PG_FICTITIOUS) != 0 ||
5659 TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5660 vm_page_aflag_clear(om, PGA_WRITEABLE);
5661 } else {
5662 KASSERT((orig_l3 & ATTR_AF) != 0,
5663 ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5664 pmap_invalidate_page(pmap, va, true);
5665 }
5666 orig_l3 = 0;
5667 } else {
5668 /*
5669 * Increment the counters.
5670 */
5671 if ((new_l3 & ATTR_SW_WIRED) != 0)
5672 pmap->pm_stats.wired_count++;
5673 pmap_resident_count_inc(pmap, 1);
5674 }
5675 /*
5676 * Enter on the PV list if part of our managed memory.
5677 */
5678 if ((m->oflags & VPO_UNMANAGED) == 0) {
5679 if (pv == NULL) {
5680 pv = get_pv_entry(pmap, &lock);
5681 pv->pv_va = va;
5682 }
5683 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5684 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5685 m->md.pv_gen++;
5686 if ((new_l3 & ATTR_SW_DBM) != 0)
5687 vm_page_aflag_set(m, PGA_WRITEABLE);
5688 }
5689
5690 validate:
5691 if (pmap->pm_stage == PM_STAGE1) {
5692 /*
5693 * Sync icache if exec permission and attribute
5694 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping
5695 * is stored and made valid for hardware table walk. If done
5696 * later, then other can access this page before caches are
5697 * properly synced. Don't do it for kernel memory which is
5698 * mapped with exec permission even if the memory isn't going
5699 * to hold executable code. The only time when icache sync is
5700 * needed is after kernel module is loaded and the relocation
5701 * info is processed. And it's done in elf_cpu_load_file().
5702 */
5703 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5704 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5705 (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5706 PMAP_ASSERT_STAGE1(pmap);
5707 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
5708 }
5709 } else {
5710 cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
5711 }
5712
5713 /*
5714 * Update the L3 entry
5715 */
5716 if (pmap_l3_valid(orig_l3)) {
5717 KASSERT(opa == pa, ("pmap_enter: invalid update"));
5718 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5719 /* same PA, different attributes */
5720 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5721 (void)pmap_demote_l3c(pmap, l3, va);
5722 orig_l3 = pmap_load_store(l3, new_l3);
5723 pmap_invalidate_page(pmap, va, true);
5724 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5725 pmap_pte_dirty(pmap, orig_l3))
5726 vm_page_dirty(m);
5727 } else {
5728 /*
5729 * orig_l3 == new_l3
5730 * This can happens if multiple threads simultaneously
5731 * access not yet mapped page. This bad for performance
5732 * since this can cause full demotion-NOP-promotion
5733 * cycle.
5734 * Another possible reasons are:
5735 * - VM and pmap memory layout are diverged
5736 * - tlb flush is missing somewhere and CPU doesn't see
5737 * actual mapping.
5738 */
5739 CTR4(KTR_PMAP, "%s: already mapped page - "
5740 "pmap %p va 0x%#lx pte 0x%lx",
5741 __func__, pmap, va, new_l3);
5742 }
5743 } else {
5744 /* New mapping */
5745 pmap_store(l3, new_l3);
5746 dsb(ishst);
5747 }
5748
5749 #if VM_NRESERVLEVEL > 0
5750 /*
5751 * First, attempt L3C promotion, if the virtual and physical addresses
5752 * are aligned with each other and an underlying reservation has the
5753 * neighboring L3 pages allocated. The first condition is simply an
5754 * optimization that recognizes some eventual promotion failures early
5755 * at a lower run-time cost. Then, if both a level 1 reservation and
5756 * the PTP are fully populated, attempt L2 promotion.
5757 */
5758 if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5759 (m->flags & PG_FICTITIOUS) == 0 &&
5760 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
5761 pmap_promote_l3c(pmap, l3, va) &&
5762 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG))
5763 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5764 #endif
5765
5766 rv = KERN_SUCCESS;
5767
5768 if (in_realm() && (flags & PMAP_ENTER_UNPROTECTED) != 0)
5769 pmap_set_unprotected(new_l3);
5770
5771 out:
5772 if (lock != NULL)
5773 rw_wunlock(lock);
5774 PMAP_UNLOCK(pmap);
5775 return (rv);
5776 }
5777
5778 /*
5779 * Tries to create a read- and/or execute-only L2 page mapping. Returns
5780 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5781 * value. See pmap_enter_l2() for the possible error values when "no sleep",
5782 * "no replace", and "no reclaim" are specified.
5783 */
5784 static int
pmap_enter_l2_rx(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,struct rwlock ** lockp)5785 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5786 struct rwlock **lockp)
5787 {
5788 pd_entry_t new_l2;
5789
5790 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5791 PMAP_ASSERT_STAGE1(pmap);
5792 KASSERT(ADDR_IS_CANONICAL(va),
5793 ("%s: Address not in canonical form: %lx", __func__, va));
5794
5795 new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | pmap_sh_attr |
5796 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5797 L2_BLOCK);
5798 if ((m->oflags & VPO_UNMANAGED) == 0)
5799 new_l2 |= ATTR_SW_MANAGED;
5800 else
5801 new_l2 |= ATTR_AF;
5802 if ((prot & VM_PROT_EXECUTE) == 0 ||
5803 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5804 new_l2 |= ATTR_S1_XN;
5805 if (ADDR_IS_USER(va))
5806 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5807 else
5808 new_l2 |= ATTR_S1_UXN;
5809 if (pmap != kernel_pmap)
5810 new_l2 |= ATTR_S1_nG;
5811 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5812 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5813 }
5814
5815 /*
5816 * Returns true if every page table entry in the specified page table is
5817 * zero.
5818 */
5819 static bool
pmap_every_pte_zero(vm_paddr_t pa)5820 pmap_every_pte_zero(vm_paddr_t pa)
5821 {
5822 pt_entry_t *pt_end, *pte;
5823
5824 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5825 pte = PHYS_TO_DMAP(pa);
5826 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5827 if (*pte != 0)
5828 return (false);
5829 }
5830 return (true);
5831 }
5832
5833 /*
5834 * Tries to create the specified L2 page mapping. Returns KERN_SUCCESS if
5835 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
5836 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
5837 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
5838 * within the L2 virtual address range starting at the specified virtual
5839 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
5840 * L2 page mapping already exists at the specified virtual address. Returns
5841 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
5842 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
5843 * and a PV entry allocation failed.
5844 */
5845 static int
pmap_enter_l2(pmap_t pmap,vm_offset_t va,pd_entry_t new_l2,u_int flags,vm_page_t m,struct rwlock ** lockp)5846 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
5847 vm_page_t m, struct rwlock **lockp)
5848 {
5849 struct spglist free;
5850 pd_entry_t *l2, old_l2;
5851 vm_page_t l2pg, mt;
5852 vm_page_t uwptpg;
5853
5854 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5855 KASSERT(ADDR_IS_CANONICAL(va),
5856 ("%s: Address not in canonical form: %lx", __func__, va));
5857 KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
5858 PMAP_ENTER_NORECLAIM,
5859 ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE"));
5860
5861 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
5862 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
5863 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
5864 va, pmap);
5865 return (KERN_RESOURCE_SHORTAGE);
5866 }
5867
5868 /*
5869 * If bti is not the same for the whole l2 range, return failure
5870 * and let vm_fault() cope. Check after l2 allocation, since
5871 * it could sleep.
5872 */
5873 if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) {
5874 KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
5875 pmap_abort_ptp(pmap, va, l2pg);
5876 return (KERN_PROTECTION_FAILURE);
5877 }
5878
5879 /*
5880 * If there are existing mappings, either abort or remove them.
5881 */
5882 if ((old_l2 = pmap_load(l2)) != 0) {
5883 KASSERT(l2pg == NULL || l2pg->ref_count > 1,
5884 ("pmap_enter_l2: l2pg's ref count is too low"));
5885 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5886 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5887 if (l2pg != NULL)
5888 l2pg->ref_count--;
5889 CTR2(KTR_PMAP,
5890 "pmap_enter_l2: no space for va %#lx"
5891 " in pmap %p", va, pmap);
5892 return (KERN_NO_SPACE);
5893 } else if (ADDR_IS_USER(va) ||
5894 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
5895 if (l2pg != NULL)
5896 l2pg->ref_count--;
5897 CTR2(KTR_PMAP,
5898 "pmap_enter_l2: failure for va %#lx"
5899 " in pmap %p", va, pmap);
5900 return (KERN_FAILURE);
5901 }
5902 }
5903 SLIST_INIT(&free);
5904 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5905 (void)pmap_remove_l2(pmap, l2, va,
5906 pmap_load(pmap_l1(pmap, va)), false, &free, lockp);
5907 } else {
5908 if (ADDR_IS_KERNEL(va)) {
5909 /*
5910 * Try to save the ptp in the trie
5911 * before any changes to mappings are
5912 * made. Abort on failure.
5913 */
5914 mt = PTE_TO_VM_PAGE(old_l2);
5915 if (pmap_insert_pt_page(pmap, mt, false,
5916 false)) {
5917 CTR1(KTR_PMAP,
5918 "pmap_enter_l2: cannot ins kern ptp va %#lx",
5919 va);
5920 return (KERN_RESOURCE_SHORTAGE);
5921 }
5922 /*
5923 * Both pmap_remove_l2() and
5924 * pmap_remove_l3_range() will zero fill
5925 * the L3 kernel page table page.
5926 */
5927 }
5928 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
5929 &free, lockp);
5930 if (ADDR_IS_KERNEL(va)) {
5931 /*
5932 * The TLB could have an intermediate
5933 * entry for the L3 kernel page table
5934 * page, so request an invalidation at
5935 * all levels after clearing the
5936 * L2_TABLE entry.
5937 */
5938 pmap_clear(l2);
5939 pmap_s1_invalidate_page(pmap, va, false);
5940 }
5941 }
5942 KASSERT(pmap_load(l2) == 0,
5943 ("pmap_enter_l2: non-zero L2 entry %p", l2));
5944 if (ADDR_IS_USER(va)) {
5945 vm_page_free_pages_toq(&free, true);
5946 } else {
5947 KASSERT(SLIST_EMPTY(&free),
5948 ("pmap_enter_l2: freed kernel page table page"));
5949 }
5950 }
5951
5952 /*
5953 * Allocate leaf ptpage for wired userspace pages.
5954 */
5955 uwptpg = NULL;
5956 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
5957 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5958 if (uwptpg == NULL) {
5959 pmap_abort_ptp(pmap, va, l2pg);
5960 return (KERN_RESOURCE_SHORTAGE);
5961 }
5962 uwptpg->pindex = pmap_l2_pindex(va);
5963 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
5964 vm_page_unwire_noq(uwptpg);
5965 vm_page_free(uwptpg);
5966 pmap_abort_ptp(pmap, va, l2pg);
5967 return (KERN_RESOURCE_SHORTAGE);
5968 }
5969 pmap_resident_count_inc(pmap, 1);
5970 uwptpg->ref_count = NL3PG;
5971 }
5972 if ((new_l2 & ATTR_SW_MANAGED) != 0) {
5973 /*
5974 * Abort this mapping if its PV entry could not be created.
5975 */
5976 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
5977 if (l2pg != NULL)
5978 pmap_abort_ptp(pmap, va, l2pg);
5979 else {
5980 KASSERT(ADDR_IS_KERNEL(va) &&
5981 (pmap_load(l2) & ATTR_DESCR_MASK) ==
5982 L2_TABLE,
5983 ("pmap_enter_l2: invalid kernel L2E"));
5984 mt = pmap_remove_pt_page(pmap, va);
5985 KASSERT(mt != NULL,
5986 ("pmap_enter_l2: missing kernel PTP"));
5987 }
5988 if (uwptpg != NULL) {
5989 mt = pmap_remove_pt_page(pmap, va);
5990 KASSERT(mt == uwptpg,
5991 ("removed pt page %p, expected %p", mt,
5992 uwptpg));
5993 pmap_resident_count_dec(pmap, 1);
5994 uwptpg->ref_count = 1;
5995 vm_page_unwire_noq(uwptpg);
5996 vm_page_free(uwptpg);
5997 }
5998 CTR2(KTR_PMAP,
5999 "pmap_enter_l2: failure for va %#lx in pmap %p",
6000 va, pmap);
6001 return (KERN_RESOURCE_SHORTAGE);
6002 }
6003 if ((new_l2 & ATTR_SW_DBM) != 0)
6004 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6005 vm_page_aflag_set(mt, PGA_WRITEABLE);
6006 }
6007
6008 /*
6009 * Increment counters.
6010 */
6011 if ((new_l2 & ATTR_SW_WIRED) != 0)
6012 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
6013 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
6014
6015 /*
6016 * Conditionally sync the icache. See pmap_enter() for details.
6017 */
6018 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
6019 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
6020 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
6021 cpu_icache_sync_range(PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
6022 L2_SIZE);
6023 }
6024
6025 /*
6026 * Map the superpage.
6027 */
6028 pmap_store(l2, new_l2);
6029 dsb(ishst);
6030
6031 counter_u64_add(pmap_l2_mappings, 1);
6032 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
6033 va, pmap);
6034
6035 return (KERN_SUCCESS);
6036 }
6037
6038 /*
6039 * Tries to create a read- and/or execute-only L3C page mapping. Returns
6040 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
6041 * value.
6042 */
6043 static int
pmap_enter_l3c_rx(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_page_t * ml3p,vm_prot_t prot,struct rwlock ** lockp)6044 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
6045 vm_prot_t prot, struct rwlock **lockp)
6046 {
6047 pt_entry_t l3e;
6048
6049 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6050 PMAP_ASSERT_STAGE1(pmap);
6051 KASSERT(ADDR_IS_CANONICAL(va),
6052 ("%s: Address not in canonical form: %lx", __func__, va));
6053
6054 l3e = VM_PAGE_TO_PTE(m) | pmap_sh_attr |
6055 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
6056 ATTR_CONTIGUOUS | L3_PAGE;
6057 if ((m->oflags & VPO_UNMANAGED) == 0)
6058 l3e |= ATTR_SW_MANAGED;
6059 else
6060 l3e |= ATTR_AF;
6061 if ((prot & VM_PROT_EXECUTE) == 0 ||
6062 m->md.pv_memattr == VM_MEMATTR_DEVICE)
6063 l3e |= ATTR_S1_XN;
6064 if (ADDR_IS_USER(va))
6065 l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
6066 else
6067 l3e |= ATTR_S1_UXN;
6068 if (pmap != kernel_pmap)
6069 l3e |= ATTR_S1_nG;
6070 return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
6071 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
6072 }
6073
6074 static int
pmap_enter_l3c(pmap_t pmap,vm_offset_t va,pt_entry_t l3e,u_int flags,vm_page_t m,vm_page_t * ml3p,struct rwlock ** lockp)6075 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
6076 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
6077 {
6078 pd_entry_t *l2p, *pde;
6079 pt_entry_t *l3p, *tl3p;
6080 vm_page_t mt;
6081 vm_paddr_t pa;
6082 vm_pindex_t l2pindex;
6083 int lvl;
6084
6085 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6086 KASSERT((va & L3C_OFFSET) == 0,
6087 ("pmap_enter_l3c: va is not aligned"));
6088 KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
6089 ("pmap_enter_l3c: managed mapping within the clean submap"));
6090 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
6091 ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS"));
6092
6093 /*
6094 * If the L3 PTP is not resident, we attempt to create it here.
6095 */
6096 if (ADDR_IS_USER(va)) {
6097 /*
6098 * Were we given the correct L3 PTP? If so, we can simply
6099 * increment its ref count.
6100 */
6101 l2pindex = pmap_l2_pindex(va);
6102 if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
6103 (*ml3p)->ref_count += L3C_ENTRIES;
6104 } else {
6105 retry:
6106 /*
6107 * Get the L2 entry.
6108 */
6109 pde = pmap_pde(pmap, va, &lvl);
6110
6111 /*
6112 * If the L2 entry is a superpage, we either abort or
6113 * demote depending on the given flags.
6114 */
6115 if (lvl == 1) {
6116 l2p = pmap_l1_to_l2(pde, va);
6117 if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
6118 L2_BLOCK) {
6119 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
6120 return (KERN_FAILURE);
6121 l3p = pmap_demote_l2_locked(pmap, l2p,
6122 va, lockp);
6123 if (l3p != NULL) {
6124 *ml3p = PTE_TO_VM_PAGE(
6125 pmap_load(l2p));
6126 (*ml3p)->ref_count +=
6127 L3C_ENTRIES;
6128 goto have_l3p;
6129 }
6130 }
6131 /* We need to allocate an L3 PTP. */
6132 }
6133
6134 /*
6135 * If the L3 PTP is mapped, we just increment its ref
6136 * count. Otherwise, we attempt to allocate it.
6137 */
6138 if (lvl == 2 && pmap_load(pde) != 0) {
6139 *ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
6140 (*ml3p)->ref_count += L3C_ENTRIES;
6141 } else {
6142 *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
6143 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
6144 if (*ml3p == NULL) {
6145 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
6146 return (KERN_FAILURE);
6147
6148 /*
6149 * The page table may have changed
6150 * while we slept.
6151 */
6152 goto retry;
6153 }
6154 (*ml3p)->ref_count += L3C_ENTRIES - 1;
6155 }
6156 }
6157 l3p = VM_PAGE_TO_DMAP(*ml3p);
6158 } else {
6159 *ml3p = NULL;
6160
6161 /*
6162 * If the L2 entry is a superpage, we either abort or demote
6163 * depending on the given flags.
6164 */
6165 pde = pmap_pde(kernel_pmap, va, &lvl);
6166 if (lvl == 1) {
6167 l2p = pmap_l1_to_l2(pde, va);
6168 KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
6169 ("pmap_enter_l3c: missing L2 block"));
6170 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
6171 return (KERN_FAILURE);
6172 l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
6173 } else {
6174 KASSERT(lvl == 2,
6175 ("pmap_enter_l3c: Invalid level %d", lvl));
6176 l3p = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(pde)));
6177 }
6178 }
6179 have_l3p:
6180 l3p = &l3p[pmap_l3_index(va)];
6181
6182 /*
6183 * If bti is not the same for the whole L3C range, return failure
6184 * and let vm_fault() cope. Check after L3 allocation, since
6185 * it could sleep.
6186 */
6187 if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) {
6188 KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP"));
6189 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
6190 pmap_abort_ptp(pmap, va, *ml3p);
6191 *ml3p = NULL;
6192 return (KERN_PROTECTION_FAILURE);
6193 }
6194
6195 /*
6196 * If there are existing mappings, either abort or remove them.
6197 */
6198 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
6199 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6200 if (pmap_load(tl3p) != 0) {
6201 if (*ml3p != NULL)
6202 (*ml3p)->ref_count -= L3C_ENTRIES;
6203 return (KERN_FAILURE);
6204 }
6205 }
6206 } else {
6207 /*
6208 * Because we increment the L3 page's reference count above,
6209 * it is guaranteed not to be freed here and we can pass NULL
6210 * instead of a valid free list.
6211 */
6212 pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
6213 va + L3C_SIZE, NULL, lockp);
6214 }
6215
6216 /*
6217 * Enter on the PV list if part of our managed memory.
6218 */
6219 if ((l3e & ATTR_SW_MANAGED) != 0) {
6220 if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
6221 if (*ml3p != NULL) {
6222 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
6223 pmap_abort_ptp(pmap, va, *ml3p);
6224 *ml3p = NULL;
6225 }
6226 return (KERN_RESOURCE_SHORTAGE);
6227 }
6228 if ((l3e & ATTR_SW_DBM) != 0)
6229 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
6230 vm_page_aflag_set(mt, PGA_WRITEABLE);
6231 }
6232
6233 /*
6234 * Increment counters.
6235 */
6236 if ((l3e & ATTR_SW_WIRED) != 0)
6237 pmap->pm_stats.wired_count += L3C_ENTRIES;
6238 pmap_resident_count_inc(pmap, L3C_ENTRIES);
6239
6240 pa = VM_PAGE_TO_PHYS(m);
6241 KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));
6242
6243 /*
6244 * Sync the icache before the mapping is stored.
6245 */
6246 if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
6247 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
6248 cpu_icache_sync_range(PHYS_TO_DMAP(pa), L3C_SIZE);
6249
6250 /*
6251 * Map the superpage.
6252 */
6253 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6254 pmap_store(tl3p, l3e);
6255 l3e += L3_SIZE;
6256 }
6257 dsb(ishst);
6258
6259 counter_u64_add(pmap_l3c_mappings, 1);
6260 CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
6261 va, pmap);
6262 return (KERN_SUCCESS);
6263 }
6264
6265 /*
6266 * Maps a sequence of resident pages belonging to the same object.
6267 * The sequence begins with the given page m_start. This page is
6268 * mapped at the given virtual address start. Each subsequent page is
6269 * mapped at a virtual address that is offset from start by the same
6270 * amount as the page is offset from m_start within the object. The
6271 * last page in the sequence is the page with the largest offset from
6272 * m_start that can be mapped at a virtual address less than the given
6273 * virtual address end. Not every virtual page between start and end
6274 * is mapped; only those for which a resident page exists with the
6275 * corresponding offset from m_start are mapped.
6276 */
6277 void
pmap_enter_object(pmap_t pmap,vm_offset_t start,vm_offset_t end,vm_page_t m_start,vm_prot_t prot)6278 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
6279 vm_page_t m_start, vm_prot_t prot)
6280 {
6281 struct pctrie_iter pages;
6282 struct rwlock *lock;
6283 vm_offset_t va;
6284 vm_page_t m, mpte;
6285 int rv;
6286
6287 VM_OBJECT_ASSERT_LOCKED(m_start->object);
6288
6289 mpte = NULL;
6290 vm_page_iter_limit_init(&pages, m_start->object,
6291 m_start->pindex + atop(end - start));
6292 m = vm_radix_iter_lookup(&pages, m_start->pindex);
6293 lock = NULL;
6294 PMAP_LOCK(pmap);
6295 while (m != NULL) {
6296 va = start + ptoa(m->pindex - m_start->pindex);
6297 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
6298 m->psind == 2 && pmap_ps_enabled(pmap) &&
6299 ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
6300 KERN_SUCCESS || rv == KERN_NO_SPACE)) {
6301 m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE);
6302 } else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
6303 m->psind >= 1 && pmap_ps_enabled(pmap) &&
6304 ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
6305 &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) {
6306 m = vm_radix_iter_jump(&pages, L3C_ENTRIES);
6307 } else {
6308 /*
6309 * In general, if a superpage mapping were possible,
6310 * it would have been created above. That said, if
6311 * start and end are not superpage aligned, then
6312 * promotion might be possible at the ends of [start,
6313 * end). However, in practice, those promotion
6314 * attempts are so unlikely to succeed that they are
6315 * not worth trying.
6316 */
6317 mpte = pmap_enter_quick_locked(pmap, va, m, prot |
6318 VM_PROT_NO_PROMOTE, mpte, &lock);
6319 m = vm_radix_iter_step(&pages);
6320 }
6321 }
6322 if (lock != NULL)
6323 rw_wunlock(lock);
6324 PMAP_UNLOCK(pmap);
6325 }
6326
6327 /*
6328 * this code makes some *MAJOR* assumptions:
6329 * 1. Current pmap & pmap exists.
6330 * 2. Not wired.
6331 * 3. Read access.
6332 * 4. No page table pages.
6333 * but is *MUCH* faster than pmap_enter...
6334 */
6335
6336 void
pmap_enter_quick(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)6337 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
6338 {
6339 struct rwlock *lock;
6340
6341 lock = NULL;
6342 PMAP_LOCK(pmap);
6343 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
6344 if (lock != NULL)
6345 rw_wunlock(lock);
6346 PMAP_UNLOCK(pmap);
6347 }
6348
6349 static vm_page_t
pmap_enter_quick_locked(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,vm_page_t mpte,struct rwlock ** lockp)6350 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
6351 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
6352 {
6353 pt_entry_t *l1, *l2, *l3, l3_val;
6354 vm_paddr_t pa;
6355 int full_lvl, lvl;
6356
6357 KASSERT(!VA_IS_CLEANMAP(va) ||
6358 (m->oflags & VPO_UNMANAGED) != 0,
6359 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
6360 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6361 PMAP_ASSERT_STAGE1(pmap);
6362 KASSERT(ADDR_IS_CANONICAL(va),
6363 ("%s: Address not in canonical form: %lx", __func__, va));
6364 l2 = NULL;
6365
6366 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
6367 /*
6368 * In the case that a page table page is not
6369 * resident, we are creating it here.
6370 */
6371 if (ADDR_IS_USER(va)) {
6372 vm_pindex_t l2pindex;
6373
6374 /*
6375 * Calculate pagetable page index
6376 */
6377 l2pindex = pmap_l2_pindex(va);
6378 if (mpte && (mpte->pindex == l2pindex)) {
6379 mpte->ref_count++;
6380 } else {
6381 /*
6382 * If the page table page is mapped, we just increment
6383 * the hold count, and activate it. Otherwise, we
6384 * attempt to allocate a page table page, passing NULL
6385 * instead of the PV list lock pointer because we don't
6386 * intend to sleep. If this attempt fails, we don't
6387 * retry. Instead, we give up.
6388 */
6389 l1 = pmap_l1(pmap, va);
6390 if (l1 != NULL && pmap_load(l1) != 0) {
6391 if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
6392 L1_BLOCK)
6393 return (NULL);
6394 l2 = pmap_l1_to_l2(l1, va);
6395 if (pmap_load(l2) != 0) {
6396 if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
6397 L2_BLOCK)
6398 return (NULL);
6399 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
6400 mpte->ref_count++;
6401 } else {
6402 mpte = _pmap_alloc_l3(pmap, l2pindex,
6403 NULL);
6404 if (mpte == NULL)
6405 return (mpte);
6406 }
6407 } else {
6408 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
6409 if (mpte == NULL)
6410 return (mpte);
6411 }
6412 }
6413 l3 = VM_PAGE_TO_DMAP(mpte);
6414 l3 = &l3[pmap_l3_index(va)];
6415 } else {
6416 mpte = NULL;
6417 l2 = pmap_pde(kernel_pmap, va, &lvl);
6418 KASSERT(l2 != NULL,
6419 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
6420 va));
6421 KASSERT(lvl == 2,
6422 ("pmap_enter_quick_locked: Invalid level %d", lvl));
6423 l3 = pmap_l2_to_l3(l2, va);
6424 }
6425
6426 /*
6427 * Abort if a mapping already exists.
6428 */
6429 if (pmap_load(l3) != 0) {
6430 if (mpte != NULL)
6431 mpte->ref_count--;
6432 return (NULL);
6433 }
6434
6435 /*
6436 * Enter on the PV list if part of our managed memory.
6437 */
6438 if ((m->oflags & VPO_UNMANAGED) == 0 &&
6439 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
6440 if (mpte != NULL)
6441 pmap_abort_ptp(pmap, va, mpte);
6442 return (NULL);
6443 }
6444
6445 /*
6446 * Increment counters
6447 */
6448 pmap_resident_count_inc(pmap, 1);
6449
6450 pa = VM_PAGE_TO_PHYS(m);
6451 l3_val = PHYS_TO_PTE(pa) | pmap_sh_attr |
6452 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
6453 l3_val |= pmap_pte_bti(pmap, va);
6454 if ((prot & VM_PROT_EXECUTE) == 0 ||
6455 m->md.pv_memattr == VM_MEMATTR_DEVICE)
6456 l3_val |= ATTR_S1_XN;
6457 if (ADDR_IS_USER(va))
6458 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
6459 else
6460 l3_val |= ATTR_S1_UXN;
6461 if (pmap != kernel_pmap)
6462 l3_val |= ATTR_S1_nG;
6463
6464 /*
6465 * Now validate mapping with RO protection
6466 */
6467 if ((m->oflags & VPO_UNMANAGED) == 0)
6468 l3_val |= ATTR_SW_MANAGED;
6469 else
6470 l3_val |= ATTR_AF;
6471
6472 /* Sync icache before the mapping is stored to PTE */
6473 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
6474 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
6475 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
6476
6477 pmap_store(l3, l3_val);
6478 dsb(ishst);
6479
6480 #if VM_NRESERVLEVEL > 0
6481 /*
6482 * First, attempt L3C promotion, if the virtual and physical addresses
6483 * are aligned with each other and an underlying reservation has the
6484 * neighboring L3 pages allocated. The first condition is simply an
6485 * optimization that recognizes some eventual promotion failures early
6486 * at a lower run-time cost. Then, attempt L2 promotion, if both a
6487 * level 1 reservation and the PTP are fully populated.
6488 */
6489 if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
6490 (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
6491 (m->flags & PG_FICTITIOUS) == 0 &&
6492 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
6493 pmap_promote_l3c(pmap, l3, va) &&
6494 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) {
6495 if (l2 == NULL)
6496 l2 = pmap_l2(pmap, va);
6497
6498 /*
6499 * If promotion succeeds, then the next call to this function
6500 * should not be given the unmapped PTP as a hint.
6501 */
6502 if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
6503 mpte = NULL;
6504 }
6505 #endif
6506
6507 return (mpte);
6508 }
6509
6510 /*
6511 * This code maps large physical mmap regions into the
6512 * processor address space. Note that some shortcuts
6513 * are taken, but the code works.
6514 */
6515 void
pmap_object_init_pt(pmap_t pmap,vm_offset_t addr,vm_object_t object,vm_pindex_t pindex,vm_size_t size)6516 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6517 vm_pindex_t pindex, vm_size_t size)
6518 {
6519
6520 VM_OBJECT_ASSERT_WLOCKED(object);
6521 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6522 ("pmap_object_init_pt: non-device object"));
6523 }
6524
6525 /*
6526 * Clear the wired attribute from the mappings for the specified range of
6527 * addresses in the given pmap. Every valid mapping within that range
6528 * must have the wired attribute set. In contrast, invalid mappings
6529 * cannot have the wired attribute set, so they are ignored.
6530 *
6531 * The wired attribute of the page table entry is not a hardware feature,
6532 * so there is no need to invalidate any TLB entries.
6533 */
6534 void
pmap_unwire(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)6535 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6536 {
6537 vm_offset_t va_next;
6538 pd_entry_t *l0, *l1, *l2;
6539 pt_entry_t *l3;
6540 bool partial_l3c;
6541
6542 PMAP_LOCK(pmap);
6543 for (; sva < eva; sva = va_next) {
6544 l0 = pmap_l0(pmap, sva);
6545 if (pmap_load(l0) == 0) {
6546 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6547 if (va_next < sva)
6548 va_next = eva;
6549 continue;
6550 }
6551
6552 l1 = pmap_l0_to_l1(l0, sva);
6553 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6554 if (va_next < sva)
6555 va_next = eva;
6556 if (pmap_load(l1) == 0)
6557 continue;
6558
6559 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6560 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6561 KASSERT(va_next <= eva,
6562 ("partial update of non-transparent 1G page "
6563 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
6564 pmap_load(l1), sva, eva, va_next));
6565 MPASS(pmap != kernel_pmap);
6566 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
6567 ATTR_SW_WIRED)) == ATTR_SW_WIRED);
6568 pmap_clear_bits(l1, ATTR_SW_WIRED);
6569 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
6570 continue;
6571 }
6572
6573 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6574 if (va_next < sva)
6575 va_next = eva;
6576
6577 l2 = pmap_l1_to_l2(l1, sva);
6578 if (pmap_load(l2) == 0)
6579 continue;
6580
6581 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
6582 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
6583 panic("pmap_unwire: l2 %#jx is missing "
6584 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
6585
6586 /*
6587 * Are we unwiring the entire large page? If not,
6588 * demote the mapping and fall through.
6589 */
6590 if (sva + L2_SIZE == va_next && eva >= va_next) {
6591 pmap_clear_bits(l2, ATTR_SW_WIRED);
6592 pmap->pm_stats.wired_count -= L2_SIZE /
6593 PAGE_SIZE;
6594 continue;
6595 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
6596 panic("pmap_unwire: demotion failed");
6597 }
6598 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6599 ("pmap_unwire: Invalid l2 entry after demotion"));
6600
6601 if (va_next > eva)
6602 va_next = eva;
6603 for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
6604 sva != va_next; l3++, sva += L3_SIZE) {
6605 if (pmap_load(l3) == 0)
6606 continue;
6607 if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
6608 /*
6609 * Avoid demotion for whole-page unwiring.
6610 */
6611 if ((sva & L3C_OFFSET) == 0) {
6612 /*
6613 * Handle the possibility that
6614 * "va_next" is zero because of
6615 * address wraparound.
6616 */
6617 partial_l3c = sva + L3C_OFFSET >
6618 va_next - 1;
6619 }
6620 if (partial_l3c)
6621 (void)pmap_demote_l3c(pmap, l3, sva);
6622 }
6623 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
6624 panic("pmap_unwire: l3 %#jx is missing "
6625 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
6626
6627 /*
6628 * ATTR_SW_WIRED must be cleared atomically. Although
6629 * the pmap lock synchronizes access to ATTR_SW_WIRED,
6630 * the System MMU may write to the entry concurrently.
6631 */
6632 pmap_clear_bits(l3, ATTR_SW_WIRED);
6633 pmap->pm_stats.wired_count--;
6634 }
6635 }
6636 PMAP_UNLOCK(pmap);
6637 }
6638
6639 /*
6640 * This function requires that the caller has already added one to ml3's
6641 * ref_count in anticipation of creating a 4KB page mapping.
6642 */
6643 static bool
pmap_copy_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,pt_entry_t l3e,vm_page_t ml3,struct rwlock ** lockp)6644 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
6645 vm_page_t ml3, struct rwlock **lockp)
6646 {
6647 pt_entry_t *tl3p;
6648
6649 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6650 KASSERT((va & L3C_OFFSET) == 0,
6651 ("pmap_copy_l3c: va is not aligned"));
6652 KASSERT((l3e & ATTR_SW_MANAGED) != 0,
6653 ("pmap_copy_l3c: l3e is not managed"));
6654
6655 /*
6656 * Abort if a mapping already exists.
6657 */
6658 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
6659 if (pmap_load(tl3p) != 0) {
6660 if (ml3 != NULL)
6661 ml3->ref_count--;
6662 return (false);
6663 }
6664
6665 if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
6666 if (ml3 != NULL)
6667 pmap_abort_ptp(pmap, va, ml3);
6668 return (false);
6669 }
6670 ml3->ref_count += L3C_ENTRIES - 1;
6671
6672 /*
6673 * Clear the wired and accessed bits. However, leave the dirty bit
6674 * unchanged because read/write superpage mappings are required to be
6675 * dirty.
6676 */
6677 l3e &= ~(ATTR_SW_WIRED | ATTR_AF);
6678
6679 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6680 pmap_store(tl3p, l3e);
6681 l3e += L3_SIZE;
6682 }
6683 pmap_resident_count_inc(pmap, L3C_ENTRIES);
6684 counter_u64_add(pmap_l3c_mappings, 1);
6685 CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
6686 va, pmap);
6687 return (true);
6688 }
6689
6690 /*
6691 * Copy the range specified by src_addr/len
6692 * from the source map to the range dst_addr/len
6693 * in the destination map.
6694 *
6695 * This routine is only advisory and need not do anything.
6696 *
6697 * Because the executable mappings created by this routine are copied,
6698 * it should not have to flush the instruction cache.
6699 */
6700 void
pmap_copy(pmap_t dst_pmap,pmap_t src_pmap,vm_offset_t dst_addr,vm_size_t len,vm_offset_t src_addr)6701 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6702 vm_offset_t src_addr)
6703 {
6704 struct rwlock *lock;
6705 pd_entry_t *l0, *l1, *l2, srcptepaddr;
6706 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
6707 vm_offset_t addr, end_addr, va_next;
6708 vm_page_t dst_m, dstmpte, srcmpte;
6709
6710 PMAP_ASSERT_STAGE1(dst_pmap);
6711 PMAP_ASSERT_STAGE1(src_pmap);
6712
6713 if (dst_addr != src_addr)
6714 return;
6715 end_addr = src_addr + len;
6716 lock = NULL;
6717 if (dst_pmap < src_pmap) {
6718 PMAP_LOCK(dst_pmap);
6719 PMAP_LOCK(src_pmap);
6720 } else {
6721 PMAP_LOCK(src_pmap);
6722 PMAP_LOCK(dst_pmap);
6723 }
6724 for (addr = src_addr; addr < end_addr; addr = va_next) {
6725 l0 = pmap_l0(src_pmap, addr);
6726 if (pmap_load(l0) == 0) {
6727 va_next = (addr + L0_SIZE) & ~L0_OFFSET;
6728 if (va_next < addr)
6729 va_next = end_addr;
6730 continue;
6731 }
6732
6733 va_next = (addr + L1_SIZE) & ~L1_OFFSET;
6734 if (va_next < addr)
6735 va_next = end_addr;
6736 l1 = pmap_l0_to_l1(l0, addr);
6737 if (pmap_load(l1) == 0)
6738 continue;
6739 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6740 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6741 KASSERT(va_next <= end_addr,
6742 ("partial update of non-transparent 1G page "
6743 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6744 pmap_load(l1), addr, end_addr, va_next));
6745 srcptepaddr = pmap_load(l1);
6746 l1 = pmap_l1(dst_pmap, addr);
6747 if (l1 == NULL) {
6748 if (_pmap_alloc_l3(dst_pmap,
6749 pmap_l0_pindex(addr), NULL) == NULL)
6750 break;
6751 l1 = pmap_l1(dst_pmap, addr);
6752 } else {
6753 l0 = pmap_l0(dst_pmap, addr);
6754 dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
6755 dst_m->ref_count++;
6756 }
6757 KASSERT(pmap_load(l1) == 0,
6758 ("1G mapping present in dst pmap "
6759 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6760 pmap_load(l1), addr, end_addr, va_next));
6761 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
6762 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
6763 continue;
6764 }
6765
6766 va_next = (addr + L2_SIZE) & ~L2_OFFSET;
6767 if (va_next < addr)
6768 va_next = end_addr;
6769 l2 = pmap_l1_to_l2(l1, addr);
6770 srcptepaddr = pmap_load(l2);
6771 if (srcptepaddr == 0)
6772 continue;
6773 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
6774 /*
6775 * We can only virtual copy whole superpages.
6776 */
6777 if ((addr & L2_OFFSET) != 0 ||
6778 addr + L2_SIZE > end_addr)
6779 continue;
6780 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
6781 if (l2 == NULL)
6782 break;
6783 if (pmap_load(l2) == 0 &&
6784 ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
6785 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
6786 PMAP_ENTER_NORECLAIM, &lock))) {
6787 /*
6788 * We leave the dirty bit unchanged because
6789 * managed read/write superpage mappings are
6790 * required to be dirty. However, managed
6791 * superpage mappings are not required to
6792 * have their accessed bit set, so we clear
6793 * it because we don't know if this mapping
6794 * will be used.
6795 */
6796 srcptepaddr &= ~ATTR_SW_WIRED;
6797 if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
6798 srcptepaddr &= ~ATTR_AF;
6799 pmap_store(l2, srcptepaddr);
6800 pmap_resident_count_inc(dst_pmap, L2_SIZE /
6801 PAGE_SIZE);
6802 counter_u64_add(pmap_l2_mappings, 1);
6803 } else
6804 pmap_abort_ptp(dst_pmap, addr, dst_m);
6805 continue;
6806 }
6807 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
6808 ("pmap_copy: invalid L2 entry"));
6809 srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
6810 KASSERT(srcmpte->ref_count > 0,
6811 ("pmap_copy: source page table page is unused"));
6812 if (va_next > end_addr)
6813 va_next = end_addr;
6814 src_pte = PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
6815 src_pte = &src_pte[pmap_l3_index(addr)];
6816 dstmpte = NULL;
6817 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6818 ptetemp = pmap_load(src_pte);
6819
6820 /*
6821 * We only virtual copy managed pages.
6822 */
6823 if ((ptetemp & ATTR_SW_MANAGED) == 0)
6824 continue;
6825
6826 if (dstmpte != NULL) {
6827 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
6828 ("dstmpte pindex/addr mismatch"));
6829 dstmpte->ref_count++;
6830 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
6831 NULL)) == NULL)
6832 goto out;
6833 dst_pte = VM_PAGE_TO_DMAP(dstmpte);
6834 dst_pte = &dst_pte[pmap_l3_index(addr)];
6835 if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
6836 L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
6837 va_next - 1) {
6838 if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
6839 ptetemp, dstmpte, &lock))
6840 goto out;
6841 addr += L3C_SIZE - PAGE_SIZE;
6842 src_pte += L3C_ENTRIES - 1;
6843 } else if (pmap_load(dst_pte) == 0 &&
6844 pmap_try_insert_pv_entry(dst_pmap, addr,
6845 PTE_TO_VM_PAGE(ptetemp), &lock)) {
6846 /*
6847 * Clear the wired, contiguous, modified, and
6848 * accessed bits from the destination PTE.
6849 * The contiguous bit is cleared because we
6850 * are not copying the entire L3C superpage.
6851 */
6852 mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
6853 ATTR_AF;
6854 nbits = 0;
6855 if ((ptetemp & ATTR_SW_DBM) != 0)
6856 nbits |= ATTR_S1_AP_RW_BIT;
6857 pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
6858 pmap_resident_count_inc(dst_pmap, 1);
6859 } else {
6860 pmap_abort_ptp(dst_pmap, addr, dstmpte);
6861 goto out;
6862 }
6863 /* Have we copied all of the valid mappings? */
6864 if (dstmpte->ref_count >= srcmpte->ref_count)
6865 break;
6866 }
6867 }
6868 out:
6869 /*
6870 * XXX This barrier may not be needed because the destination pmap is
6871 * not active.
6872 */
6873 dsb(ishst);
6874
6875 if (lock != NULL)
6876 rw_wunlock(lock);
6877 PMAP_UNLOCK(src_pmap);
6878 PMAP_UNLOCK(dst_pmap);
6879 }
6880
6881 int
pmap_vmspace_copy(pmap_t dst_pmap,pmap_t src_pmap)6882 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6883 {
6884 int error;
6885
6886 if (dst_pmap->pm_stage != src_pmap->pm_stage)
6887 return (EINVAL);
6888
6889 if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6890 return (0);
6891
6892 for (;;) {
6893 if (dst_pmap < src_pmap) {
6894 PMAP_LOCK(dst_pmap);
6895 PMAP_LOCK(src_pmap);
6896 } else {
6897 PMAP_LOCK(src_pmap);
6898 PMAP_LOCK(dst_pmap);
6899 }
6900 error = pmap_bti_copy(dst_pmap, src_pmap);
6901 /* Clean up partial copy on failure due to no memory. */
6902 if (error == ENOMEM)
6903 pmap_bti_deassign_all(dst_pmap);
6904 PMAP_UNLOCK(src_pmap);
6905 PMAP_UNLOCK(dst_pmap);
6906 if (error != ENOMEM)
6907 break;
6908 vm_wait(NULL);
6909 }
6910 return (error);
6911 }
6912
6913 /*
6914 * pmap_zero_page zeros the specified hardware page by mapping
6915 * the page into KVM and using bzero to clear its contents.
6916 */
6917 void
pmap_zero_page(vm_page_t m)6918 pmap_zero_page(vm_page_t m)
6919 {
6920 void *va = VM_PAGE_TO_DMAP(m);
6921
6922 pagezero(va);
6923 }
6924
6925 /*
6926 * pmap_zero_page_area zeros the specified hardware page by mapping
6927 * the page into KVM and using bzero to clear its contents.
6928 *
6929 * off and size may not cover an area beyond a single hardware page.
6930 */
6931 void
pmap_zero_page_area(vm_page_t m,int off,int size)6932 pmap_zero_page_area(vm_page_t m, int off, int size)
6933 {
6934 void *va = VM_PAGE_TO_DMAP(m);
6935
6936 if (off == 0 && size == PAGE_SIZE)
6937 pagezero(va);
6938 else
6939 bzero((char *)va + off, size);
6940 }
6941
6942 /*
6943 * pmap_copy_page copies the specified (machine independent)
6944 * page by mapping the page into virtual memory and using
6945 * bcopy to copy the page, one machine dependent page at a
6946 * time.
6947 */
6948 void
pmap_copy_page(vm_page_t msrc,vm_page_t mdst)6949 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6950 {
6951 void *src = VM_PAGE_TO_DMAP(msrc);
6952 void *dst = VM_PAGE_TO_DMAP(mdst);
6953
6954 pagecopy(src, dst);
6955 }
6956
/*
 * NOTE(review): presumably read by the buffer cache to decide whether
 * unmapped I/O buffers may be used; it is enabled (1) here.  Confirm
 * against the consumers of this variable.
 */
int unmapped_buf_allowed = 1;
6958
6959 void
pmap_copy_pages(vm_page_t ma[],vm_offset_t a_offset,vm_page_t mb[],vm_offset_t b_offset,int xfersize)6960 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6961 vm_offset_t b_offset, int xfersize)
6962 {
6963 void *a_cp, *b_cp;
6964 vm_page_t m_a, m_b;
6965 vm_paddr_t p_a, p_b;
6966 vm_offset_t a_pg_offset, b_pg_offset;
6967 int cnt;
6968
6969 while (xfersize > 0) {
6970 a_pg_offset = a_offset & PAGE_MASK;
6971 m_a = ma[a_offset >> PAGE_SHIFT];
6972 p_a = m_a->phys_addr;
6973 b_pg_offset = b_offset & PAGE_MASK;
6974 m_b = mb[b_offset >> PAGE_SHIFT];
6975 p_b = m_b->phys_addr;
6976 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6977 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6978 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6979 panic("!DMAP a %lx", p_a);
6980 } else {
6981 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6982 }
6983 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
6984 panic("!DMAP b %lx", p_b);
6985 } else {
6986 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
6987 }
6988 bcopy(a_cp, b_cp, cnt);
6989 a_offset += cnt;
6990 b_offset += cnt;
6991 xfersize -= cnt;
6992 }
6993 }
6994
6995 void *
pmap_quick_enter_page(vm_page_t m)6996 pmap_quick_enter_page(vm_page_t m)
6997 {
6998
6999 return (VM_PAGE_TO_DMAP(m));
7000 }
7001
/*
 * Counterpart of pmap_quick_enter_page().  That function only returns a
 * direct map address, so there is nothing to undo and this is a no-op.
 */
void
pmap_quick_remove_page(void *addr)
{

	/* Intentionally empty; "addr" is not used. */
	(void)addr;
}
7006
7007 /*
7008 * Returns true if the pmap's pv is one of the first
7009 * 16 pvs linked to from this page. This count may
7010 * be changed upwards or downwards in the future; it
7011 * is only necessary that true be returned for a small
7012 * subset of pmaps for proper page aging.
7013 */
7014 bool
pmap_page_exists_quick(pmap_t pmap,vm_page_t m)7015 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
7016 {
7017 struct md_page *pvh;
7018 struct rwlock *lock;
7019 pv_entry_t pv;
7020 int loops = 0;
7021 bool rv;
7022
7023 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7024 ("pmap_page_exists_quick: page %p is not managed", m));
7025 rv = false;
7026 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7027 rw_rlock(lock);
7028 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7029 if (PV_PMAP(pv) == pmap) {
7030 rv = true;
7031 break;
7032 }
7033 loops++;
7034 if (loops >= 16)
7035 break;
7036 }
7037 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
7038 pvh = page_to_pvh(m);
7039 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
7040 if (PV_PMAP(pv) == pmap) {
7041 rv = true;
7042 break;
7043 }
7044 loops++;
7045 if (loops >= 16)
7046 break;
7047 }
7048 }
7049 rw_runlock(lock);
7050 return (rv);
7051 }
7052
7053 /*
7054 * pmap_page_wired_mappings:
7055 *
7056 * Return the number of managed mappings to the given physical page
7057 * that are wired.
7058 */
7059 int
pmap_page_wired_mappings(vm_page_t m)7060 pmap_page_wired_mappings(vm_page_t m)
7061 {
7062 struct rwlock *lock;
7063 struct md_page *pvh;
7064 pmap_t pmap;
7065 pt_entry_t *pte;
7066 pv_entry_t pv;
7067 int count, md_gen, pvh_gen;
7068
7069 if ((m->oflags & VPO_UNMANAGED) != 0)
7070 return (0);
7071 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7072 rw_rlock(lock);
7073 restart:
7074 count = 0;
7075 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7076 pmap = PV_PMAP(pv);
7077 if (!PMAP_TRYLOCK(pmap)) {
7078 md_gen = m->md.pv_gen;
7079 rw_runlock(lock);
7080 PMAP_LOCK(pmap);
7081 rw_rlock(lock);
7082 if (md_gen != m->md.pv_gen) {
7083 PMAP_UNLOCK(pmap);
7084 goto restart;
7085 }
7086 }
7087 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7088 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
7089 count++;
7090 PMAP_UNLOCK(pmap);
7091 }
7092 if ((m->flags & PG_FICTITIOUS) == 0) {
7093 pvh = page_to_pvh(m);
7094 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
7095 pmap = PV_PMAP(pv);
7096 if (!PMAP_TRYLOCK(pmap)) {
7097 md_gen = m->md.pv_gen;
7098 pvh_gen = pvh->pv_gen;
7099 rw_runlock(lock);
7100 PMAP_LOCK(pmap);
7101 rw_rlock(lock);
7102 if (md_gen != m->md.pv_gen ||
7103 pvh_gen != pvh->pv_gen) {
7104 PMAP_UNLOCK(pmap);
7105 goto restart;
7106 }
7107 }
7108 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
7109 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
7110 count++;
7111 PMAP_UNLOCK(pmap);
7112 }
7113 }
7114 rw_runlock(lock);
7115 return (count);
7116 }
7117
7118 /*
7119 * Returns true if the given page is mapped individually or as part of
7120 * a 2mpage. Otherwise, returns false.
7121 */
7122 bool
pmap_page_is_mapped(vm_page_t m)7123 pmap_page_is_mapped(vm_page_t m)
7124 {
7125 struct rwlock *lock;
7126 bool rv;
7127
7128 if ((m->oflags & VPO_UNMANAGED) != 0)
7129 return (false);
7130 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7131 rw_rlock(lock);
7132 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
7133 ((m->flags & PG_FICTITIOUS) == 0 &&
7134 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
7135 rw_runlock(lock);
7136 return (rv);
7137 }
7138
/*
 * Destroy all managed, non-wired mappings in the given user-space
 * pmap.  This pmap cannot be active on any processor besides the
 * caller.
 *
 * This function cannot be applied to the kernel pmap.  Moreover, it
 * is not intended for general use.  It is only to be used during
 * process termination.  Consequently, it can be implemented in ways
 * that make it faster than pmap_remove().  First, it can more quickly
 * destroy mappings by iterating over the pmap's collection of PV
 * entries, rather than searching the page table.  Second, it doesn't
 * have to test and clear the page table entries atomically, because
 * no processor is currently accessing the user address space.  In
 * particular, a page table entry's dirty bit won't change state once
 * this function starts.
 *
 * Wired mappings are left in place, together with their PV entries.
 * PV chunks that end up with no allocated entries are collected per
 * memory domain and freed in one batch; page table pages released
 * along the way are gathered on a local list and freed after the pmap
 * lock has been dropped.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;
	struct spglist free;
	struct pv_chunklist free_chunks[PMAP_MEMDOM];
	vm_page_t m, ml3, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct rwlock *lock;
	int64_t bit;
	uint64_t inuse, bitmask;
	int allfree, field, i, idx, lvl;
	int freed __pvused;
	vm_paddr_t pa;

	lock = NULL;

	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&free_chunks[i]);
	SLIST_INIT(&free);
	PMAP_LOCK(pmap);
	/* Visit every PV chunk that belongs to this pmap. */
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		allfree = 1;
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/*
			 * A set bit in pc_map marks a free PV entry (see
			 * "Mark free" below), so the inverted map, limited
			 * to the valid bits, gives the allocated entries.
			 */
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = ffsl(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				/*
				 * Find the entry that maps pv_va.  "lvl" is
				 * the level of the directory entry that was
				 * found: 1 means the mapping is an L2 block,
				 * 2 means it is an L3 page.
				 */
				pde = pmap_pde(pmap, pv->pv_va, &lvl);
				KASSERT(pde != NULL,
				    ("Attempting to remove an unmapped page"));

				switch(lvl) {
				case 1:
					pte = pmap_l1_to_l2(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L2_BLOCK,
					    ("Attempting to remove an invalid "
					    "block: %lx", tpte));
					break;
				case 2:
					pte = pmap_l2_to_l3(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L3_PAGE,
					    ("Attempting to remove an invalid "
					    "page: %lx", tpte));
					break;
				default:
					panic(
					    "Invalid page directory level: %d",
					    lvl);
				}

				/*
				 * We cannot remove wired mappings at this time.
				 *
				 * For L3C superpages, all of the constituent PTEs
				 * should have the wired bit set, so we don't
				 * check for ATTR_CONTIGUOUS here.
				 *
				 * The PV entry stays allocated, so the chunk
				 * cannot be freed as a whole.
				 */
				if (tpte & ATTR_SW_WIRED) {
					allfree = 0;
					continue;
				}

				/* Mark free */
				pc->pc_map[field] |= bitmask;

				/*
				 * Because this pmap is not active on other
				 * processors, the dirty bit cannot have
				 * changed state since we last loaded pte.
				 */
				pmap_clear(pte);

				pa = PTE_TO_PHYS(tpte);

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad pte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean/reference bits.
				 *
				 * We don't check for ATTR_CONTIGUOUS here
				 * because writeable L3C superpages are expected
				 * to be dirty, i.e., every constituent PTE
				 * should be dirty.
				 *
				 * A dirty L2 block dirties every 4KB page
				 * that it covers.
				 */
				if (pmap_pte_dirty(pmap, tpte)) {
					switch (lvl) {
					case 1:
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
						break;
					case 2:
						vm_page_dirty(m);
						break;
					}
				}

				/*
				 * NOTE(review): presumably makes "lock" the
				 * PV list lock covering m before its PV lists
				 * are edited below -- confirm against the
				 * macro definition.
				 */
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				switch (lvl) {
				case 1:
					/*
					 * An L2 block accounts for
					 * L2_SIZE / PAGE_SIZE resident pages.
					 */
					pmap_resident_count_dec(pmap,
					    L2_SIZE / PAGE_SIZE);
					pvh = page_to_pvh(m);
					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
					pvh->pv_gen++;
					/*
					 * With the last superpage mapping
					 * gone, a constituent page that also
					 * has no 4KB mappings is no longer
					 * writeable.
					 */
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					/*
					 * If an L3 page table page was kept
					 * for this superpage, it is released
					 * as well.
					 */
					ml3 = pmap_remove_pt_page(pmap,
					    pv->pv_va);
					if (ml3 != NULL) {
						KASSERT(vm_page_any_valid(ml3),
						    ("pmap_remove_pages: l3 page not promoted"));
						pmap_resident_count_dec(pmap,1);
						KASSERT(ml3->ref_count == NL3PG,
						    ("pmap_remove_pages: l3 page ref count error"));
						ml3->ref_count = 0;
						pmap_add_delayed_free_list(ml3,
						    &free, false);
					}
					break;
				case 2:
					pmap_resident_count_dec(pmap, 1);
					TAILQ_REMOVE(&m->md.pv_list, pv,
					    pv_next);
					m->md.pv_gen++;
					/*
					 * The page stops being writeable only
					 * when it has neither 4KB nor
					 * superpage mappings left.
					 */
					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = page_to_pvh(m);
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m,
							    PGA_WRITEABLE);
					}
					break;
				}
				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
				    &free);
				freed++;
			}
		}
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		/*
		 * A chunk in which no wired entry was found is unlinked from
		 * the pmap and queued on its domain's list for batch freeing.
		 */
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
			    pc_list);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	pmap_invalidate_all(pmap);
	pmap_bti_deassign_all(pmap);
	free_pv_chunk_batch(free_chunks);
	PMAP_UNLOCK(pmap);
	/* Free the collected page table pages after dropping the pmap lock. */
	vm_page_free_pages_toq(&free, true);
}
7338
7339 /*
7340 * This is used to check if a page has been accessed or modified.
7341 */
7342 static bool
pmap_page_test_mappings(vm_page_t m,bool accessed,bool modified)7343 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
7344 {
7345 struct rwlock *lock;
7346 pv_entry_t pv;
7347 struct md_page *pvh;
7348 pt_entry_t l3e, mask, *pte, value;
7349 pmap_t pmap;
7350 int md_gen, pvh_gen;
7351 bool rv;
7352
7353 rv = false;
7354 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7355 rw_rlock(lock);
7356 restart:
7357 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7358 pmap = PV_PMAP(pv);
7359 PMAP_ASSERT_STAGE1(pmap);
7360 if (!PMAP_TRYLOCK(pmap)) {
7361 md_gen = m->md.pv_gen;
7362 rw_runlock(lock);
7363 PMAP_LOCK(pmap);
7364 rw_rlock(lock);
7365 if (md_gen != m->md.pv_gen) {
7366 PMAP_UNLOCK(pmap);
7367 goto restart;
7368 }
7369 }
7370 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7371 mask = 0;
7372 value = 0;
7373 if (modified) {
7374 mask |= ATTR_S1_AP_RW_BIT;
7375 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
7376 }
7377 if (accessed) {
7378 mask |= ATTR_AF | ATTR_DESCR_MASK;
7379 value |= ATTR_AF | L3_PAGE;
7380 }
7381 l3e = pmap_load(pte);
7382 if ((l3e & ATTR_CONTIGUOUS) != 0)
7383 l3e = pmap_load_l3c(pte);
7384 PMAP_UNLOCK(pmap);
7385 rv = (l3e & mask) == value;
7386 if (rv)
7387 goto out;
7388 }
7389 if ((m->flags & PG_FICTITIOUS) == 0) {
7390 pvh = page_to_pvh(m);
7391 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
7392 pmap = PV_PMAP(pv);
7393 PMAP_ASSERT_STAGE1(pmap);
7394 if (!PMAP_TRYLOCK(pmap)) {
7395 md_gen = m->md.pv_gen;
7396 pvh_gen = pvh->pv_gen;
7397 rw_runlock(lock);
7398 PMAP_LOCK(pmap);
7399 rw_rlock(lock);
7400 if (md_gen != m->md.pv_gen ||
7401 pvh_gen != pvh->pv_gen) {
7402 PMAP_UNLOCK(pmap);
7403 goto restart;
7404 }
7405 }
7406 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
7407 mask = 0;
7408 value = 0;
7409 if (modified) {
7410 mask |= ATTR_S1_AP_RW_BIT;
7411 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
7412 }
7413 if (accessed) {
7414 mask |= ATTR_AF | ATTR_DESCR_MASK;
7415 value |= ATTR_AF | L2_BLOCK;
7416 }
7417 rv = (pmap_load(pte) & mask) == value;
7418 PMAP_UNLOCK(pmap);
7419 if (rv)
7420 goto out;
7421 }
7422 }
7423 out:
7424 rw_runlock(lock);
7425 return (rv);
7426 }
7427
7428 /*
7429 * pmap_is_modified:
7430 *
7431 * Return whether or not the specified physical page was modified
7432 * in any physical maps.
7433 */
7434 bool
pmap_is_modified(vm_page_t m)7435 pmap_is_modified(vm_page_t m)
7436 {
7437
7438 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7439 ("pmap_is_modified: page %p is not managed", m));
7440
7441 /*
7442 * If the page is not busied then this check is racy.
7443 */
7444 if (!pmap_page_is_write_mapped(m))
7445 return (false);
7446 return (pmap_page_test_mappings(m, false, true));
7447 }
7448
7449 /*
7450 * pmap_is_prefaultable:
7451 *
7452 * Return whether or not the specified virtual address is eligible
7453 * for prefault.
7454 */
7455 bool
pmap_is_prefaultable(pmap_t pmap,vm_offset_t addr)7456 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
7457 {
7458 pd_entry_t *pde;
7459 pt_entry_t *pte;
7460 bool rv;
7461 int lvl;
7462
7463 /*
7464 * Return true if and only if the L3 entry for the specified virtual
7465 * address is allocated but invalid.
7466 */
7467 rv = false;
7468 PMAP_LOCK(pmap);
7469 pde = pmap_pde(pmap, addr, &lvl);
7470 if (pde != NULL && lvl == 2) {
7471 pte = pmap_l2_to_l3(pde, addr);
7472 rv = pmap_load(pte) == 0;
7473 }
7474 PMAP_UNLOCK(pmap);
7475 return (rv);
7476 }
7477
7478 /*
7479 * pmap_is_referenced:
7480 *
7481 * Return whether or not the specified physical page was referenced
7482 * in any physical maps.
7483 */
7484 bool
pmap_is_referenced(vm_page_t m)7485 pmap_is_referenced(vm_page_t m)
7486 {
7487
7488 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7489 ("pmap_is_referenced: page %p is not managed", m));
7490 return (pmap_page_test_mappings(m, true, false));
7491 }
7492
/*
 * Clear the write and modified bits in each of the given page's mappings.
 *
 * The page must be managed and busied.  Nothing is done when the page
 * has no writeable mapping.  Writeable 2MB mappings are demoted first;
 * every 4KB mapping with ATTR_SW_DBM set is then write protected, and
 * the page is marked dirty if that mapping was writable at the time.
 * PGA_WRITEABLE is cleared on the page before returning.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pt_entry_t oldpte, *pte, set, clear, mask, val;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	/* Fictitious pages use pv_dummy instead of a superpage pv head. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/*
	 * First pass: demote each 2MB mapping that has ATTR_SW_DBM set, so
	 * that the second pass can deal with it at 4KB granularity.
	 */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		/*
		 * If the pmap lock cannot be taken without blocking, drop
		 * the pv list lock while waiting for it, and start over if
		 * the pv list changed in the meantime.
		 */
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	/* Second pass: write protect the 4KB mappings. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		oldpte = pmap_load(pte);
		if ((oldpte & ATTR_SW_DBM) != 0) {
			if ((oldpte & ATTR_CONTIGUOUS) != 0) {
				(void)pmap_demote_l3c(pmap, pte, pv->pv_va);

				/*
				 * The L3 entry's accessed bit may have
				 * changed.
				 */
				oldpte = pmap_load(pte);
			}
			/*
			 * Stage 1 removes write access by setting the AP
			 * read-only bit, while stage 2 removes it by clearing
			 * the S2AP write bit.  "mask" and "val" describe what
			 * a writable entry looks like in either format.
			 */
			if (pmap->pm_stage == PM_STAGE1) {
				set = ATTR_S1_AP_RW_BIT;
				clear = 0;
				mask = ATTR_S1_AP_RW_BIT;
				val = ATTR_S1_AP(ATTR_S1_AP_RW);
			} else {
				set = 0;
				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
			}
			clear |= ATTR_SW_DBM;
			/*
			 * Update the entry atomically; on failure oldpte is
			 * refreshed with the current value and the update is
			 * tried again.
			 */
			while (!atomic_fcmpset_64(pte, &oldpte,
			    (oldpte | set) & ~clear))
				cpu_spinwait();

			/* The replaced entry was writable: record it as dirty. */
			if ((oldpte & mask) == val)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}
7590
/*
 *	pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 *
 *	The page must be managed.  The returned value is the sum of the
 *	reference bits that were cleared and those that were found set but
 *	deliberately left alone.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, md_gen, not_cleared, pvh_gen;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	/*
	 * NOTE(review): nothing in this function adds a page to "free", so
	 * the vm_page_free_pages_toq() call at the end is handed an empty
	 * list.
	 */
	SLIST_INIT(&free);
	cleared = 0;
	/* Fictitious pages use pv_dummy instead of a superpage pv head. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
retry:
	/* "cleared" survives a retry; only "not_cleared" is recounted. */
	not_cleared = 0;
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	/*
	 * Examine the 2MB mappings.  Each examined entry is rotated to the
	 * tail of the list, so the loop ends once the entry that was first
	 * on entry ("pvf") is at the head again.
	 */
	do {
		/*
		 * NOTE(review): pvf is non-NULL when the loop is entered and
		 * is never reset inside it, so this test looks always false.
		 */
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		/*
		 * If the pmap lock cannot be taken without blocking, drop
		 * the pv list lock while waiting for it, and start over if
		 * the pv list changed in the meantime.
		 */
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte)) {
			/*
			 * Although "tpte" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((tpte & ATTR_AF) != 0) {
			pa = VM_PAGE_TO_PHYS(m);

			/*
			 * Since this reference bit is shared by 512 4KB pages,
			 * it should not be cleared every time it is tested.
			 * Apply a simple "hash" function on the physical page
			 * number, the virtual superpage number, and the pmap
			 * address to select one 4KB page out of the 512 on
			 * which testing the reference bit will result in
			 * clearing that reference bit.  This function is
			 * designed to avoid the selection of the same 4KB page
			 * for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, va, true);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	/* Examine the 4KB mappings, rotating the list in the same way. */
	do {
		/* NOTE(review): as above, this test looks always false. */
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		if ((tpte & ATTR_AF) != 0) {
			/* A wired mapping keeps its accessed bit. */
			if ((tpte & ATTR_SW_WIRED) == 0) {
				/*
				 * Clear the accessed bit in this L3 entry
				 * regardless of the contiguous bit.
				 */
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, pv->pv_va, true);
				cleared++;
			} else
				not_cleared++;
		} else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
		    (pmap_load_l3c(pte) & ATTR_AF) != 0) {
			/*
			 * An L3C superpage mapping is regarded as accessed
			 * until the accessed bit has been cleared in all
			 * of its constituent entries.
			 */
			not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
	return (cleared + not_cleared);
}
7754
/*
 *	Apply the given advice to the specified range of addresses within the
 *	given pmap.  Depending on the advice, clear the referenced and/or
 *	modified flags in each mapping and set the mapped page's dirty field.
 *
 *	Only MADV_DONTNEED and MADV_FREE are acted upon; any other advice
 *	returns immediately.  The pmap must be a stage 1 pmap.  Managed 2MB
 *	mappings in the range are demoted so that the advice can be applied
 *	to their 4KB entries.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	struct rwlock *lock;
	vm_offset_t va, va_next, dva;
	vm_page_t m;
	pd_entry_t *l0, *l1, *l2, oldl2;
	pt_entry_t *l3, *dl3, oldl3;

	PMAP_ASSERT_STAGE1(pmap);

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;

	PMAP_LOCK(pmap);
	/*
	 * Walk the range one L2 region at a time.  At each level "va_next"
	 * is the start of the next region of that level, or eva if that
	 * address computation wrapped around.
	 */
	for (; sva < eva; sva = va_next) {
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		/* L1 block mappings are skipped. */
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l2 = pmap_l1_to_l2(l1, sva);
		oldl2 = pmap_load(l2);
		if (oldl2 == 0)
			continue;
		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
			/* Unmanaged 2MB mappings are left untouched. */
			if ((oldl2 & ATTR_SW_MANAGED) == 0)
				continue;
			lock = NULL;
			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
				if (lock != NULL)
					rw_wunlock(lock);

				/*
				 * The 2MB page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Choosing the last page
			 * within the address range [sva, min(va_next, eva))
			 * generally results in more repromotions.  Since the
			 * underlying page table page is fully populated, this
			 * removal never frees a page table page.
			 */
			if ((oldl2 & ATTR_SW_WIRED) == 0) {
				va = eva;
				if (va > va_next)
					va = va_next;
				va -= PAGE_SIZE;
				KASSERT(va >= sva,
				    ("pmap_advise: no address gap"));
				l3 = pmap_l2_to_l3(l2, va);
				KASSERT(pmap_load(l3) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
				    NULL, &lock);
			}
			if (lock != NULL)
				rw_wunlock(lock);
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_advise: invalid L2 entry after demotion"));
		if (va_next > eva)
			va_next = eva;
		/*
		 * "va" tracks a pending TLB invalidation.  While it equals
		 * va_next nothing is pending; otherwise [va, sva) is a run
		 * of consecutive entries that were changed and still have
		 * to be invalidated.
		 */
		va = va_next;
		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
		    sva += L3_SIZE) {
			oldl3 = pmap_load(l3);
			/* Only valid, managed L3 page entries are changed. */
			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
			    (ATTR_SW_MANAGED | L3_PAGE))
				goto maybe_invlrng;
			else if (pmap_pte_dirty(pmap, oldl3)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PTE_TO_VM_PAGE(oldl3);
					vm_page_dirty(m);
				}
				if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
					/*
					 * Unconditionally demote the L3C
					 * superpage because we do not allow
					 * writeable, clean superpages.
					 */
					(void)pmap_demote_l3c(pmap, l3, sva);

					/*
					 * Destroy the final mapping before the
					 * next L3C boundary or va_next,
					 * whichever comes first, so that a
					 * subsequent access may act as a
					 * repromotion trigger.
					 */
					if ((oldl3 & ATTR_SW_WIRED) == 0) {
						dva = MIN((sva & ~L3C_OFFSET) +
						    L3C_SIZE - PAGE_SIZE,
						    va_next - PAGE_SIZE);
						dl3 = pmap_l2_to_l3(l2, dva);
						KASSERT(pmap_load(dl3) != 0,
						    ("pmap_advise: invalid PTE"));
						lock = NULL;
						pmap_remove_l3(pmap, dl3, dva,
						    pmap_load(l2), NULL, &lock);
						if (lock != NULL)
							rw_wunlock(lock);
					}

					/*
					 * The L3 entry's accessed bit may have
					 * changed.
					 */
					oldl3 = pmap_load(l3);
				}

				/*
				 * Check that we did not just destroy this entry so
				 * we avoid corrupting the page table.
				 *
				 * The entry is made clean and unreferenced by
				 * atomically clearing ATTR_AF and setting
				 * the read-only access permission.
				 */
				if (oldl3 != 0) {
					while (!atomic_fcmpset_long(l3, &oldl3,
					    (oldl3 & ~ATTR_AF) |
					    ATTR_S1_AP(ATTR_S1_AP_RO)))
						cpu_spinwait();
				}
			} else if ((oldl3 & ATTR_AF) != 0) {
				/*
				 * Clear the accessed bit in this L3 entry
				 * regardless of the contiguous bit.
				 */
				pmap_clear_bits(l3, ATTR_AF);
			} else
				goto maybe_invlrng;
			/* The entry was changed: start a run if none is open. */
			if (va == va_next)
				va = sva;
			continue;
maybe_invlrng:
			/* An unchanged entry ends the run; flush it. */
			if (va != va_next) {
				pmap_s1_invalidate_range(pmap, va, sva, true);
				va = va_next;
			}
		}
		/* Flush a run that extends to the end of this L2 region. */
		if (va != va_next)
			pmap_s1_invalidate_range(pmap, va, sva, true);
	}
	PMAP_UNLOCK(pmap);
}
7930
/*
 * Clear the modify bits on the specified physical page.
 *
 * The page must be managed and busied; both conditions are asserted.  If
 * pmap_page_is_write_mapped() reports that the page has no writeable
 * mapping, nothing is done.  Otherwise the page's PV list lock is write-held
 * while two PV lists are walked: first the list selected through
 * page_to_pvh() (or pv_dummy for a fictitious page), whose entries are
 * examined as L2 mappings, and then the page's own md.pv_list, whose entries
 * are examined as L3 mappings.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *l2, oldl2;
	pt_entry_t *l3, oldl3;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	/* A fictitious page uses pv_dummy in place of page_to_pvh(m). */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
restart:
	/* First pass: mappings recorded on the "pvh" list (L2 entries). */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock could not be taken while holding the
			 * PV list lock.  Drop the PV list lock, block on the
			 * pmap lock, and retake the PV list lock.  If the
			 * list's generation count moved in the meantime,
			 * start over from the top.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		oldl2 = pmap_load(l2);
		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
		if ((oldl2 & ATTR_SW_DBM) != 0 &&
		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
		    (oldl2 & ATTR_SW_WIRED) == 0) {
			/*
			 * Write protect the mapping to a single page so that
			 * a subsequent write access may repromote.
			 */
			/* Advance "va" to the 4KB page within the 2MB range. */
			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
			l3 = pmap_l2_to_l3(l2, va);
			oldl3 = pmap_load(l3);
			/*
			 * Atomically clear ATTR_SW_DBM and set the read-only
			 * access permission, retrying if the entry changes
			 * between the load and the store.
			 */
			while (!atomic_fcmpset_long(l3, &oldl3,
			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
				cpu_spinwait();
			vm_page_dirty(m);
			pmap_s1_invalidate_page(pmap, va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	/* Second pass: mappings recorded on the page's own list (L3 entries). */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Same lock-order dance as above, but both lists'
			 * generation counts are checked before continuing.
			 */
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);
		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		oldl3 = pmap_load(l3);
		KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
		    (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
		    ("writeable L3C superpage not dirty"));
		/*
		 * Only entries with ATTR_SW_DBM set and ATTR_S1_AP_RW_BIT
		 * clear are touched.  A contiguous (L3C) mapping is demoted
		 * first so that just this one entry is made read-only.
		 */
		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
			if ((oldl3 & ATTR_CONTIGUOUS) != 0)
				(void)pmap_demote_l3c(pmap, l3, pv->pv_va);
			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
}
8022
/*
 * Return a kernel virtual address through which the "size" bytes of
 * physical memory starting at "pa" can be accessed.
 *
 * Three strategies are tried in order:
 *  1. If both ends of the range lie in the direct map and
 *     pmap_kmapped_range() accepts it, the direct map address is returned
 *     and nothing new is mapped.
 *  2. While vm_initialized is false, whole L2 blocks are mapped from the
 *     region starting at preinit_map_va and recorded in
 *     pmap_preinit_mapping[].  NULL is returned if "size" is zero or
 *     preinit_map_va is zero; the function panics if no run of free slots
 *     is long enough.
 *  3. Otherwise KVA is obtained with kva_alloc() and mapped with
 *     pmap_kenter(); failure to allocate KVA panics.
 */
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t va, offset;
	pd_entry_t old_l2e, *pde;
	pt_entry_t *l2;
	int i, lvl, l2_blocks, free_l2_count, start_idx;

	/* Use the DMAP region if we can */
	if (PHYS_IN_DMAP(pa) && PHYS_IN_DMAP(pa + size - 1) &&
	    pmap_kmapped_range(PHYS_TO_DMAP(pa), size))
		return (PHYS_TO_DMAP(pa));

	if (!vm_initialized) {
		/*
		 * No L3 ptables so map entire L2 blocks where start VA is:
		 * 	preinit_map_va + start_idx * L2_SIZE
		 * There may be duplicate mappings (multiple VA -> same PA) but
		 * ARM64 dcache is always PIPT so that's acceptable.
		 */
		if (size == 0)
			return (NULL);

		/* Calculate how many L2 blocks are needed for the mapping */
		l2_blocks = (roundup2(pa + size, L2_SIZE) -
		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;

		/* Byte offset of "pa" within its first L2 block. */
		offset = pa & L2_OFFSET;

		if (preinit_map_va == 0)
			return (NULL);

		/* Map 2MiB L2 blocks from reserved VA space */

		free_l2_count = 0;
		start_idx = -1;
		/* Find enough free contiguous VA space */
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			/* A used slot (pa != 0) ends the current free run. */
			if (free_l2_count > 0 && ppim->pa != 0) {
				/* Not enough space here */
				free_l2_count = 0;
				start_idx = -1;
				continue;
			}

			if (ppim->pa == 0) {
				/* Free L2 block */
				if (start_idx == -1)
					start_idx = i;
				free_l2_count++;
				if (free_l2_count == l2_blocks)
					break;
			}
		}
		if (free_l2_count != l2_blocks)
			panic("%s: too many preinit mappings", __func__);

		va = preinit_map_va + (start_idx * L2_SIZE);
		/*
		 * Every slot of the run records the same pa/va/size triple,
		 * where the recorded va is the address returned to the
		 * caller.
		 */
		for (i = start_idx; i < start_idx + l2_blocks; i++) {
			/* Mark entries as allocated */
			ppim = pmap_preinit_mapping + i;
			ppim->pa = pa;
			ppim->va = (char *)va + offset;
			ppim->size = size;
		}

		/* Map L2 blocks */
		pa = rounddown2(pa, L2_SIZE);
		/*
		 * "old_l2e" accumulates (ORs together) the previous contents
		 * of every replaced entry so that a single test after the
		 * loop decides whether a TLB invalidation is needed.
		 */
		old_l2e = 0;
		for (i = 0; i < l2_blocks; i++) {
			pde = pmap_pde(kernel_pmap, va, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
			    va));
			KASSERT(lvl == 1,
			    ("pmap_mapbios: Invalid level %d", lvl));

			/* Insert L2_BLOCK */
			l2 = pmap_l1_to_l2(pde, va);
			old_l2e |= pmap_load_store(l2,
			    PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
			    ATTR_S1_XN | ATTR_KERN_GP |
			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);

			va += L2_SIZE;
			pa += L2_SIZE;
		}
		if ((old_l2e & ATTR_DESCR_VALID) != 0)
			pmap_s1_invalidate_all_kernel();
		else {
			/*
			 * Because the old entries were invalid and the new
			 * mappings are not executable, an isb is not required.
			 */
			dsb(ishst);
		}

		/* The loop above advanced "va"; rewind it to the run start. */
		va = preinit_map_va + (start_idx * L2_SIZE);

	} else {
		/* kva_alloc may be used to map the pages */
		offset = pa & PAGE_MASK;
		size = round_page(offset + size);

		va = (vm_offset_t)kva_alloc(size);
		if (va == 0)
			panic("%s: Couldn't allocate KVA", __func__);

		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));

		/* L3 table is linked */
		va = trunc_page(va);
		pa = trunc_page(pa);
		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
	}

	/* Both non-DMAP paths return the mapping base plus the sub-offset. */
	return ((void *)(va + offset));
}
8144
/*
 * Release a mapping of "size" bytes at address "p".  The three cases handled
 * here mirror the three strategies in pmap_mapbios():
 *  - a direct map address has its properties reset to read/write with
 *    VM_MEMATTR_DEFAULT and is otherwise left in place;
 *  - an address recorded in pmap_preinit_mapping[] has its slots released
 *    and the corresponding L2 block entries cleared;
 *  - any other address is, once vm_initialized is set, unmapped with
 *    pmap_kremove_device() and its KVA returned with kva_free().
 */
void
pmap_unmapbios(void *p, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	char *va;
	vm_offset_t offset, va_trunc;
	pd_entry_t *pde;
	pt_entry_t *l2;
	int error __diagused, i, lvl, l2_blocks, block;
	bool preinit_map;

	va = p;
	if (VIRT_IN_DMAP(va)) {
		KASSERT(VIRT_IN_DMAP(va + size - 1),
		    ("%s: End address not in DMAP region: %p", __func__,
		    va + size - 1));
		/* Ensure the attributes are as expected for the DMAP region */
		PMAP_LOCK(kernel_pmap);
		error = pmap_change_props_locked(va, size,
		    PROT_READ | PROT_WRITE, VM_MEMATTR_DEFAULT, false);
		PMAP_UNLOCK(kernel_pmap);
		KASSERT(error == 0, ("%s: Failed to reset DMAP attributes: %d",
		    __func__, error));

		return;
	}

	/* Number of L2 blocks spanned by [va, va + size). */
	l2_blocks =
	    (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));

	/* Remove preinit mapping */
	preinit_map = false;
	block = 0;
	/*
	 * Every slot whose recorded va equals "va" belongs to this mapping.
	 * "block" counts the matches found so far and selects which L2 block
	 * of the mapping is cleared for the current match.
	 */
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == va) {
			KASSERT(ppim->size == size,
			    ("pmap_unmapbios: size mismatch"));
			ppim->va = NULL;
			ppim->pa = 0;
			ppim->size = 0;
			preinit_map = true;
			offset = block * L2_SIZE;
			va_trunc = rounddown2((vm_offset_t)va, L2_SIZE) +
			    offset;

			/* Remove L2_BLOCK */
			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
			    va_trunc));
			l2 = pmap_l1_to_l2(pde, va_trunc);
			pmap_clear(l2);

			/* Stop once the last expected block was removed. */
			if (block == (l2_blocks - 1))
				break;
			block++;
		}
	}
	if (preinit_map) {
		pmap_s1_invalidate_all_kernel();
		return;
	}

	/* Unmap the pages reserved with kva_alloc. */
	if (vm_initialized) {
		offset = (vm_offset_t)va & PAGE_MASK;
		size = round_page(offset + size);
		va = trunc_page(va);

		/* Unmap and invalidate the pages */
		pmap_kremove_device((vm_offset_t)va, size);

		kva_free(va, size);
	}
}
8222
8223 /*
8224 * Sets the memory attribute for the specified page.
8225 */
8226 void
pmap_page_set_memattr(vm_page_t m,vm_memattr_t ma)8227 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
8228 {
8229 if (m->md.pv_memattr == ma)
8230 return;
8231
8232 m->md.pv_memattr = ma;
8233
8234 /*
8235 * If "m" is a normal page, update its direct mapping. This update
8236 * can be relied upon to perform any cache operations that are
8237 * required for data coherence.
8238 */
8239 if ((m->flags & PG_FICTITIOUS) == 0 &&
8240 pmap_change_attr(VM_PAGE_TO_DMAP(m), PAGE_SIZE,
8241 m->md.pv_memattr) != 0)
8242 panic("memory attribute change on the direct map failed");
8243 }
8244
8245 /*
8246 * Changes the specified virtual address range's memory type to that given by
8247 * the parameter "mode". The specified virtual address range must be
8248 * completely contained within either the direct map or the kernel map. If
8249 * the virtual address range is contained within the kernel map, then the
8250 * memory type for each of the corresponding ranges of the direct map is also
8251 * changed. (The corresponding ranges of the direct map are those ranges that
8252 * map the same physical pages as the specified virtual address range.) These
8253 * changes to the direct map are necessary because Intel describes the
8254 * behavior of their processors as "undefined" if two or more mappings to the
8255 * same physical page have different memory types.
8256 *
 * Returns zero if the change completed successfully, and EINVAL if the change
 * failed.  Specifically, EINVAL is returned if some part of the virtual
 * address range was not mapped, or if a larger mapping that had to be demoted
 * could not be (for example, because there was insufficient memory available
 * for a new page table page).  In the latter case, the memory type may have
 * been changed on some part of the virtual address range or the direct map.
8263 */
8264 int
pmap_change_attr(void * va,vm_size_t size,int mode)8265 pmap_change_attr(void *va, vm_size_t size, int mode)
8266 {
8267 int error;
8268
8269 PMAP_LOCK(kernel_pmap);
8270 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
8271 PMAP_UNLOCK(kernel_pmap);
8272 return (error);
8273 }
8274
8275 /*
8276 * Changes the specified virtual address range's protections to those
8277 * specified by "prot". Like pmap_change_attr(), protections for aliases
8278 * in the direct map are updated as well. Protections on aliasing mappings may
8279 * be a subset of the requested protections; for example, mappings in the direct
8280 * map are never executable.
8281 */
8282 int
pmap_change_prot(void * va,vm_size_t size,vm_prot_t prot)8283 pmap_change_prot(void *va, vm_size_t size, vm_prot_t prot)
8284 {
8285 int error;
8286
8287 /* Only supported within the kernel map. */
8288 if ((vm_offset_t)va < VM_MIN_KERNEL_ADDRESS)
8289 return (EINVAL);
8290
8291 PMAP_LOCK(kernel_pmap);
8292 error = pmap_change_props_locked(va, size, prot, -1, false);
8293 PMAP_UNLOCK(kernel_pmap);
8294 return (error);
8295 }
8296
/*
 * Apply a memory attribute ("mode") and/or protection ("prot") change to the
 * kernel virtual address range [addr, addr + size), rounded out to page
 * boundaries.  The kernel pmap lock must be held by the caller.
 *
 * A "mode" of -1 leaves the memory attribute alone and a "prot" of
 * VM_PROT_NONE leaves the protection alone.  When "skip_unmapped" is true,
 * addresses without a page table entry are stepped over instead of causing
 * EINVAL.  A mapping that is larger than the part of the range it covers is
 * demoted first.  When the address being changed is outside the direct map
 * but its physical address is within it, the function calls itself to apply
 * the same change to the direct map alias.
 *
 * Returns 0 on success, or EINVAL if the base address is in neither the
 * direct map nor the kernel address range, if an unmapped address is found
 * while "skip_unmapped" is false, or if a demotion fails.  On failure the
 * part of the range that was already processed stays changed.
 */
static int
pmap_change_props_locked(void *addr, vm_size_t size, vm_prot_t prot,
    int mode, bool skip_unmapped)
{
	vm_offset_t base, offset, tmpva, va;
	vm_size_t pte_size;
	vm_paddr_t pa;
	pt_entry_t pte, *ptep, *newpte;
	pt_entry_t bits, mask;
	char *tmpptep;
	int lvl, rv;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	va = (vm_offset_t)addr;
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	if (!VIRT_IN_DMAP(base) &&
	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
		return (EINVAL);

	/*
	 * Build "mask" (the entry bits this call owns) and "bits" (the value
	 * those bits must take).  An entry already satisfying
	 * (entry & mask) == bits needs no work.
	 */
	bits = 0;
	mask = 0;
	if (mode != -1) {
		bits = ATTR_S1_IDX(mode);
		mask = ATTR_S1_IDX_MASK;
		/* Device memory is additionally forced to be non-executable. */
		if (mode == VM_MEMATTR_DEVICE) {
			mask |= ATTR_S1_XN;
			bits |= ATTR_S1_XN;
		}
	}
	if (prot != VM_PROT_NONE) {
		/* Don't mark the DMAP as executable. It never is on arm64. */
		if (VIRT_IN_DMAP(base)) {
			prot &= ~VM_PROT_EXECUTE;
			/*
			 * XXX Mark the DMAP as writable for now. We rely
			 * on this in ddb & dtrace to insert breakpoint
			 * instructions.
			 */
			prot |= VM_PROT_WRITE;
		}

		if ((prot & VM_PROT_WRITE) == 0) {
			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
		}
		if ((prot & VM_PROT_EXECUTE) == 0) {
			bits |= ATTR_S1_PXN;
		}
		/* User execute-never is always set when "prot" is given. */
		bits |= ATTR_S1_UXN;
		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
	}

	/* "tmpva" is advanced inside the loop by the size of each mapping. */
	for (tmpva = base; tmpva < base + size; ) {
		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
		if (ptep == NULL && !skip_unmapped) {
			return (EINVAL);
		} else if ((ptep == NULL && skip_unmapped) ||
		    (pmap_load(ptep) & mask) == bits) {
			/*
			 * We already have the correct attribute or there
			 * is no memory mapped at this address and we are
			 * skipping unmapped memory.
			 */
			/* Step to the end of the region covered at "lvl". */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
				break;
			case 2:
				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
				break;
			case 3:
				tmpva += PAGE_SIZE;
				break;
			}
		} else {
			/* We can't demote/promote this entry */
			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);

			/*
			 * Find the entry and demote it if the requested change
			 * only applies to part of the address range mapped by
			 * the entry.
			 */
			/*
			 * Each case either accepts the mapping at its current
			 * size (setting "pte_size" and breaking out) or
			 * demotes it and falls through to the next smaller
			 * level with "ptep" pointing at the new entry.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
				if ((tmpva & L1_OFFSET) == 0 &&
				    (base + size - tmpva) >= L1_SIZE) {
					pte_size = L1_SIZE;
					break;
				}
				newpte = pmap_demote_l1(kernel_pmap, ptep,
				    tmpva & ~L1_OFFSET);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l1_to_l2(ptep, tmpva);
				/* FALLTHROUGH */
			case 2:
				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
					if ((tmpva & L2C_OFFSET) == 0 &&
					    (base + size - tmpva) >= L2C_SIZE) {
						pte_size = L2C_SIZE;
						break;
					}
					if (!pmap_demote_l2c(kernel_pmap, ptep,
					    tmpva))
						return (EINVAL);
				}
				if ((tmpva & L2_OFFSET) == 0 &&
				    (base + size - tmpva) >= L2_SIZE) {
					pte_size = L2_SIZE;
					break;
				}
				newpte = pmap_demote_l2(kernel_pmap, ptep,
				    tmpva);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l2_to_l3(ptep, tmpva);
				/* FALLTHROUGH */
			case 3:
				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
					if ((tmpva & L3C_OFFSET) == 0 &&
					    (base + size - tmpva) >= L3C_SIZE) {
						pte_size = L3C_SIZE;
						break;
					}
					if (!pmap_demote_l3c(kernel_pmap, ptep,
					    tmpva))
						return (EINVAL);
				}
				pte_size = PAGE_SIZE;
				break;
			}

			/*
			 * If the entry about to be rewritten is itself reached
			 * through an address inside the range being changed
			 * (ptep lies within [tmpva, tmpva + pte_size)), access
			 * it through a temporary alias at cmap1_addr instead.
			 * cmap_lock is held for as long as the alias exists.
			 * NOTE(review): presumably this keeps the entry
			 * reachable while its own mapping is replaced --
			 * confirm.
			 */
			tmpptep = 0;
			if (tmpva <= (vm_offset_t)ptep &&
			    tmpva + pte_size > (vm_offset_t)ptep) {
				vm_paddr_t pte_pa;

				mtx_lock(&cmap_lock);
				tmpptep = cmap1_addr;
				pte_pa = DMAP_TO_PHYS((vm_offset_t)ptep);
				pmap_store(cmap1_pte, ATTR_AF |
				    pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) |
				    ATTR_S1_XN | ATTR_KERN_GP |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    PHYS_TO_PTE(pte_pa &~L3_OFFSET) | L3_PAGE);
				dsb(ishst);
				/* Same offset within the page, alias base. */
				ptep = (pt_entry_t *)(tmpptep +
				    ((vm_offset_t)ptep & PAGE_MASK));
			}

			/* Update the entry */
			pte = pmap_load(ptep);
			pte &= ~mask;
			pte |= bits;

			switch (pte_size) {
			case L2C_SIZE:
				pmap_update_strided(kernel_pmap, ptep, ptep +
				    L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
				break;
			case L3C_SIZE:
				pmap_update_strided(kernel_pmap, ptep, ptep +
				    L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
				break;
			default:
				/*
				 * We are updating a single block or page entry,
				 * so regardless of pte_size pass PAGE_SIZE in
				 * order that a single TLB invalidation is
				 * performed.
				 */
				pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
				    PAGE_SIZE);
				break;
			}

			/* Tear down the temporary alias, if one was created. */
			if (tmpptep != 0) {
				pmap_clear(cmap1_pte);
				pmap_s1_invalidate_page(kernel_pmap,
				    (vm_offset_t)tmpptep, true);
				mtx_unlock(&cmap_lock);
			}

			pa = PTE_TO_PHYS(pte);
			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
				/*
				 * Keep the DMAP memory in sync.
				 */
				/* Unmapped DMAP addresses are skipped here. */
				rv = pmap_change_props_locked(
				    PHYS_TO_DMAP(pa), pte_size,
				    prot, mode, true);
				if (rv != 0)
					return (rv);
			}

			/*
			 * If moving to a non-cacheable entry flush
			 * the cache.
			 */
			if (mode == VM_MEMATTR_UNCACHEABLE)
				cpu_dcache_wbinv_range((void *)tmpva, pte_size);
			tmpva += pte_size;
		}
	}

	return (0);
}
8512
8513 /*
8514 * Create an L2 table to map all addresses within an L1 mapping.
8515 */
8516 static pt_entry_t *
pmap_demote_l1(pmap_t pmap,pt_entry_t * l1,vm_offset_t va)8517 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
8518 {
8519 pt_entry_t *l2, newl2, oldl1;
8520 char *tmpl1;
8521 vm_paddr_t l2phys, phys;
8522 vm_page_t ml2;
8523 int i;
8524
8525 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8526 oldl1 = pmap_load(l1);
8527 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8528 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
8529 ("pmap_demote_l1: Demoting a non-block entry"));
8530 KASSERT((va & L1_OFFSET) == 0,
8531 ("pmap_demote_l1: Invalid virtual address %#lx", va));
8532 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
8533 ("pmap_demote_l1: Level 1 table shouldn't be managed"));
8534 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
8535 ("pmap_demote_l1: Demoting entry with no-demote flag set"));
8536
8537 tmpl1 = NULL;
8538 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
8539 tmpl1 = kva_alloc(PAGE_SIZE);
8540 if (tmpl1 == NULL)
8541 return (NULL);
8542 }
8543
8544 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
8545 NULL) {
8546 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
8547 " in pmap %p", va, pmap);
8548 l2 = NULL;
8549 goto fail;
8550 }
8551
8552 l2phys = VM_PAGE_TO_PHYS(ml2);
8553 l2 = PHYS_TO_DMAP(l2phys);
8554
8555 /* Address the range points at */
8556 phys = PTE_TO_PHYS(oldl1);
8557 /* The attributed from the old l1 table to be copied */
8558 newl2 = oldl1 & ATTR_MASK;
8559
8560 /* Create the new entries */
8561 newl2 |= ATTR_CONTIGUOUS;
8562 for (i = 0; i < Ln_ENTRIES; i++) {
8563 l2[i] = newl2 | phys;
8564 phys += L2_SIZE;
8565 }
8566 KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) |
8567 L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0],
8568 ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
8569
8570 if (tmpl1 != NULL) {
8571 pmap_kenter((vm_offset_t)tmpl1, PAGE_SIZE,
8572 DMAP_TO_PHYS(l1) & ~L3_OFFSET,
8573 VM_MEMATTR_WRITE_BACK);
8574 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
8575 }
8576
8577 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
8578
8579 counter_u64_add(pmap_l1_demotions, 1);
8580 fail:
8581 if (tmpl1 != NULL) {
8582 pmap_kremove((vm_offset_t)tmpl1);
8583 kva_free(tmpl1, PAGE_SIZE);
8584 }
8585
8586 return (l2);
8587 }
8588
8589 static void
pmap_fill_l3(pt_entry_t * firstl3,pt_entry_t newl3)8590 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8591 {
8592 pt_entry_t *l3;
8593
8594 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8595 *l3 = newl3;
8596 newl3 += L3_SIZE;
8597 }
8598 }
8599
8600 static void
pmap_demote_l2_check(pt_entry_t * firstl3p __unused,pt_entry_t newl3e __unused)8601 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8602 {
8603 #ifdef INVARIANTS
8604 #ifdef DIAGNOSTIC
8605 pt_entry_t *xl3p, *yl3p;
8606
8607 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8608 xl3p++, newl3e += PAGE_SIZE) {
8609 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8610 printf("pmap_demote_l2: xl3e %zd and newl3e map "
8611 "different pages: found %#lx, expected %#lx\n",
8612 xl3p - firstl3p, pmap_load(xl3p), newl3e);
8613 printf("page table dump\n");
8614 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8615 yl3p++) {
8616 printf("%zd %#lx\n", yl3p - firstl3p,
8617 pmap_load(yl3p));
8618 }
8619 panic("firstpte");
8620 }
8621 }
8622 #else
8623 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8624 ("pmap_demote_l2: firstl3 and newl3e map different physical"
8625 " addresses"));
8626 #endif
8627 #endif
8628 }
8629
8630 static void
pmap_demote_l2_abort(pmap_t pmap,vm_offset_t va,pt_entry_t * l2,struct rwlock ** lockp)8631 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8632 struct rwlock **lockp)
8633 {
8634 struct spglist free;
8635
8636 SLIST_INIT(&free);
8637 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true,
8638 &free, lockp);
8639 vm_page_free_pages_toq(&free, true);
8640 }
8641
8642 /*
8643 * Create an L3 table to map all addresses within an L2 mapping.
8644 */
8645 static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap,pt_entry_t * l2,vm_offset_t va,struct rwlock ** lockp)8646 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8647 struct rwlock **lockp)
8648 {
8649 pt_entry_t *l3, newl3, oldl2;
8650 char *tmpl2;
8651 vm_paddr_t l3phys;
8652 vm_page_t ml3;
8653
8654 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8655 PMAP_ASSERT_STAGE1(pmap);
8656 KASSERT(ADDR_IS_CANONICAL(va),
8657 ("%s: Address not in canonical form: %lx", __func__, va));
8658
8659 l3 = NULL;
8660 oldl2 = pmap_load(l2);
8661 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8662 ("pmap_demote_l2: Demoting a non-block entry"));
8663 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8664 ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8665 va &= ~L2_OFFSET;
8666
8667 tmpl2 = NULL;
8668 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8669 tmpl2 = kva_alloc(PAGE_SIZE);
8670 if (tmpl2 == NULL)
8671 return (NULL);
8672 }
8673
8674 /*
8675 * Invalidate the 2MB page mapping and return "failure" if the
8676 * mapping was never accessed and not wired.
8677 */
8678 if ((oldl2 & ATTR_AF) == 0) {
8679 if ((oldl2 & ATTR_SW_WIRED) == 0) {
8680 pmap_demote_l2_abort(pmap, va, l2, lockp);
8681 CTR2(KTR_PMAP,
8682 "pmap_demote_l2: failure for va %#lx in pmap %p",
8683 va, pmap);
8684 goto fail;
8685 }
8686 ml3 = pmap_remove_pt_page(pmap, va);
8687 /* Fill the PTP with L3Es that have ATTR_AF cleared. */
8688 ml3->valid = 0;
8689 } else if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8690 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8691 ("pmap_demote_l2: page table page for a wired mapping"
8692 " is missing"));
8693
8694 /*
8695 * If the page table page is missing and the mapping
8696 * is for a kernel address, the mapping must belong to
8697 * either the direct map or the early kernel memory.
8698 * Page table pages are preallocated for every other
8699 * part of the kernel address space, so the direct map
8700 * region and early kernel memory are the only parts of the
8701 * kernel address space that must be handled here.
8702 */
8703 KASSERT(ADDR_IS_USER(va) || VIRT_IN_DMAP(va) ||
8704 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8705 ("pmap_demote_l2: No saved mpte for va %#lx", va));
8706
8707 /*
8708 * If the 2MB page mapping belongs to the direct map
8709 * region of the kernel's address space, then the page
8710 * allocation request specifies the highest possible
8711 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
8712 * priority is normal.
8713 */
8714 ml3 = vm_page_alloc_noobj(
8715 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8716 VM_ALLOC_WIRED);
8717
8718 /*
8719 * If the allocation of the new page table page fails,
8720 * invalidate the 2MB page mapping and return "failure".
8721 */
8722 if (ml3 == NULL) {
8723 pmap_demote_l2_abort(pmap, va, l2, lockp);
8724 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8725 " in pmap %p", va, pmap);
8726 goto fail;
8727 }
8728 ml3->pindex = pmap_l2_pindex(va);
8729
8730 if (ADDR_IS_USER(va)) {
8731 ml3->ref_count = NL3PG;
8732 pmap_resident_count_inc(pmap, 1);
8733 }
8734 }
8735 l3phys = VM_PAGE_TO_PHYS(ml3);
8736 l3 = PHYS_TO_DMAP(l3phys);
8737 newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8738 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8739 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8740 ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8741
8742 /*
8743 * If the PTP is not leftover from an earlier promotion or it does not
8744 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all
8745 * have ATTR_AF set, unless this is a wired mapping with ATTR_AF clear.
8746 *
8747 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8748 * performs a dsb(). That dsb() ensures that the stores for filling
8749 * "l3" are visible before "l3" is added to the page table.
8750 */
8751 if (!vm_page_all_valid(ml3))
8752 pmap_fill_l3(l3, newl3);
8753
8754 pmap_demote_l2_check(l3, newl3);
8755
8756 /*
8757 * If the mapping has changed attributes, update the L3Es.
8758 */
8759 if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8760 pmap_fill_l3(l3, newl3);
8761
8762 /*
8763 * Map the temporary page so we don't lose access to the l2 table.
8764 */
8765 if (tmpl2 != NULL) {
8766 pmap_kenter((vm_offset_t)tmpl2, PAGE_SIZE,
8767 DMAP_TO_PHYS(l2) & ~L3_OFFSET,
8768 VM_MEMATTR_WRITE_BACK);
8769 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8770 }
8771
8772 /*
8773 * The spare PV entries must be reserved prior to demoting the
8774 * mapping, that is, prior to changing the PDE. Otherwise, the state
8775 * of the L2 and the PV lists will be inconsistent, which can result
8776 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8777 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8778 * PV entry for the 2MB page mapping that is being demoted.
8779 */
8780 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8781 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8782
8783 /*
8784 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8785 * the 2MB page mapping.
8786 */
8787 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8788
8789 /*
8790 * Demote the PV entry.
8791 */
8792 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8793 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8794
8795 counter_u64_add(pmap_l2_demotions, 1);
8796 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8797 " in pmap %p %lx", va, pmap, l3[0]);
8798
8799 fail:
8800 if (tmpl2 != NULL) {
8801 pmap_kremove((vm_offset_t)tmpl2);
8802 kva_free(tmpl2, PAGE_SIZE);
8803 }
8804
8805 return (l3);
8806
8807 }
8808
8809 static pt_entry_t *
pmap_demote_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)8810 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8811 {
8812 struct rwlock *lock;
8813 pt_entry_t *l3;
8814
8815 lock = NULL;
8816 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8817 if (lock != NULL)
8818 rw_wunlock(lock);
8819 return (l3);
8820 }
8821
/*
 * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
 *
 * "l2p" may point at any entry of the contiguous run; the run's first entry
 * is found by aligning the pointer down.  The pmap lock must be held.  The
 * entries are first invalidated one by one, then rewritten without
 * ATTR_CONTIGUOUS, with interrupts disabled in between.  The accessed and
 * dirty state found on any entry of the run is applied to all of the
 * rewritten entries.
 *
 * Returns true on success, or false if the temporary KVA needed to reach the
 * entries could not be allocated, in which case nothing has been changed.
 */
static bool
pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
	char *tmpl3;
	register_t intr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	/* Align down to the first entry of the contiguous run. */
	l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
	    sizeof(pd_entry_t)) - 1));
	l2c_end = l2c_start + L2C_ENTRIES;
	tmpl3 = NULL;
	/*
	 * If the addresses of the entries themselves overlap the virtual
	 * range being demoted, access the entries through a temporary alias
	 * of their page for the duration of the operation.
	 */
	if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
	    (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
		tmpl3 = kva_alloc(PAGE_SIZE);
		if (tmpl3 == NULL)
			return (false);
		pmap_kenter((vm_offset_t)tmpl3, PAGE_SIZE,
		    DMAP_TO_PHYS(l2c_start) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l2c_start = (pd_entry_t *)(tmpl3 +
		    ((vm_offset_t)l2c_start & PAGE_MASK));
		l2c_end = (pd_entry_t *)(tmpl3 +
		    ((vm_offset_t)l2c_end & PAGE_MASK));
	}
	/*
	 * "mask" collects the bit to clear in the rewritten entries when any
	 * old entry was dirty; "nbits" collects the bits to set: the valid
	 * bit, plus ATTR_AF when any old entry had it.
	 */
	mask = 0;
	nbits = ATTR_DESCR_VALID;
	intr = intr_disable();

	/*
	 * Break the mappings.
	 */
	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
		/*
		 * Clear the mapping's contiguous and valid bits, but leave
		 * the rest of the entry unchanged, so that a lockless,
		 * concurrent pmap_kextract() can still lookup the physical
		 * address.
		 */
		l2e = pmap_load(tl2p);
		KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
		    ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
		KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
		    ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
		while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
		    ATTR_DESCR_VALID)))
			cpu_spinwait();

		/*
		 * Hardware accessed and dirty bit maintenance might only
		 * update a single L2 entry, so we must combine the accessed
		 * and dirty bits from this entire set of contiguous L2
		 * entries.
		 */
		if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
			mask = ATTR_S1_AP_RW_BIT;
		nbits |= l2e & ATTR_AF;
	}
	/*
	 * The TLB is only invalidated when at least one entry had ATTR_AF
	 * set.  NOTE(review): presumably an entry that was never accessed
	 * cannot be cached in a TLB -- confirm.
	 */
	if ((nbits & ATTR_AF) != 0) {
		pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
		    L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
	}

	/*
	 * Remake the mappings, updating the accessed and dirty bits.
	 */
	l2e = (pmap_load(l2c_start) & ~mask) | nbits;
	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
		pmap_store(tl2p, l2e);
		/* Each successive entry maps the next L2_SIZE of memory. */
		l2e += L2_SIZE;
	}
	dsb(ishst);

	intr_restore(intr);
	/* Drop the temporary alias of the page table page, if any. */
	if (tmpl3 != NULL) {
		pmap_kremove((vm_offset_t)tmpl3);
		kva_free(tmpl3, PAGE_SIZE);
	}
	counter_u64_add(pmap_l2c_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
	    va, pmap);
	return (true);
}
8911
/*
 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
 *
 * "l3p" may point at any entry of the contiguous run; the run's first entry
 * is found by aligning the pointer down.  The pmap lock must be held.  The
 * entries are first invalidated one by one, then rewritten without
 * ATTR_CONTIGUOUS, with interrupts disabled in between.  The accessed and
 * dirty state found on any entry of the run is applied to all of the
 * rewritten entries.
 *
 * Returns true on success, or false if the temporary KVA needed to reach the
 * entries could not be allocated, in which case nothing has been changed.
 */
static bool
pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
{
	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
	char *tmpl3;
	register_t intr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Align down to the first entry of the contiguous run. */
	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
	    sizeof(pt_entry_t)) - 1));
	l3c_end = l3c_start + L3C_ENTRIES;
	tmpl3 = NULL;
	/*
	 * If the addresses of the entries themselves overlap the virtual
	 * range being demoted, access the entries through a temporary alias
	 * of their page for the duration of the operation.
	 */
	if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
	    (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
		tmpl3 = kva_alloc(PAGE_SIZE);
		if (tmpl3 == NULL)
			return (false);
		pmap_kenter((vm_offset_t)tmpl3, PAGE_SIZE,
		    DMAP_TO_PHYS(l3c_start) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l3c_start = (pt_entry_t *)(tmpl3 +
		    ((vm_offset_t)l3c_start & PAGE_MASK));
		l3c_end = (pt_entry_t *)(tmpl3 +
		    ((vm_offset_t)l3c_end & PAGE_MASK));
	}
	/*
	 * "mask" collects the bit to clear in the rewritten entries when any
	 * old entry was dirty; "nbits" collects the bits to set: the valid
	 * bit, plus ATTR_AF when any old entry had it.
	 */
	mask = 0;
	nbits = ATTR_DESCR_VALID;
	intr = intr_disable();

	/*
	 * Break the mappings.
	 */
	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
		/*
		 * Clear the mapping's contiguous and valid bits, but leave
		 * the rest of the entry unchanged, so that a lockless,
		 * concurrent pmap_kextract() can still lookup the physical
		 * address.
		 */
		l3e = pmap_load(tl3p);
		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
		    ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
		KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
		    ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
		while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
		    ATTR_DESCR_VALID)))
			cpu_spinwait();

		/*
		 * Hardware accessed and dirty bit maintenance might only
		 * update a single L3 entry, so we must combine the accessed
		 * and dirty bits from this entire set of contiguous L3
		 * entries.
		 */
		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
			mask = ATTR_S1_AP_RW_BIT;
		nbits |= l3e & ATTR_AF;
	}
	/*
	 * The TLB is only invalidated when at least one entry had ATTR_AF
	 * set.  NOTE(review): presumably an entry that was never accessed
	 * cannot be cached in a TLB -- confirm.
	 */
	if ((nbits & ATTR_AF) != 0) {
		pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
		    ~L3C_OFFSET, true);
	}

	/*
	 * Remake the mappings, updating the accessed and dirty bits.
	 */
	l3e = (pmap_load(l3c_start) & ~mask) | nbits;
	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
		pmap_store(tl3p, l3e);
		/* Each successive entry maps the next L3_SIZE of memory. */
		l3e += L3_SIZE;
	}
	dsb(ishst);

	intr_restore(intr);
	/* Drop the temporary alias of the page table page, if any. */
	if (tmpl3 != NULL) {
		pmap_kremove((vm_offset_t)tmpl3);
		kva_free(tmpl3, PAGE_SIZE);
	}
	counter_u64_add(pmap_l3c_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
	    va, pmap);
	return (true);
}
9000
9001 /*
9002 * Accumulate the accessed and dirty bits within a L3C superpage and
9003 * return the specified PTE with them applied correctly.
9004 */
9005 static pt_entry_t
pmap_load_l3c(pt_entry_t * l3p)9006 pmap_load_l3c(pt_entry_t *l3p)
9007 {
9008 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
9009
9010 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
9011 sizeof(pt_entry_t)) - 1));
9012 l3c_end = l3c_start + L3C_ENTRIES;
9013 mask = 0;
9014 nbits = 0;
9015 /* Iterate over each mapping in the superpage. */
9016 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
9017 l3e = pmap_load(tl3p);
9018 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
9019 ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
9020 /* Update mask if the current page has its dirty bit set. */
9021 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
9022 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
9023 mask = ATTR_S1_AP_RW_BIT;
9024 /* Update nbits if the accessed bit is set. */
9025 nbits |= l3e & ATTR_AF;
9026 }
9027 return ((pmap_load(l3p) & ~mask) | nbits);
9028 }
9029
9030 /*
9031 * Perform the pmap work for mincore(2). If the page is not both referenced and
9032 * modified by this pmap, returns its physical address so that the caller can
9033 * find other mappings.
9034 */
9035 int
pmap_mincore(pmap_t pmap,vm_offset_t addr,vm_paddr_t * pap)9036 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
9037 {
9038 pt_entry_t *pte, tpte;
9039 vm_paddr_t mask, pa;
9040 int lvl, psind, val;
9041 bool managed;
9042
9043 PMAP_ASSERT_STAGE1(pmap);
9044 PMAP_LOCK(pmap);
9045 pte = pmap_pte(pmap, addr, &lvl);
9046 if (pte != NULL) {
9047 tpte = pmap_load(pte);
9048
9049 switch (lvl) {
9050 case 3:
9051 mask = L3_OFFSET;
9052 psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 1 : 0;
9053 break;
9054 case 2:
9055 mask = L2_OFFSET;
9056 psind = 2;
9057 break;
9058 case 1:
9059 mask = L1_OFFSET;
9060 psind = 3;
9061 break;
9062 default:
9063 panic("pmap_mincore: invalid level %d", lvl);
9064 }
9065
9066 managed = (tpte & ATTR_SW_MANAGED) != 0;
9067 val = MINCORE_INCORE | MINCORE_PSIND(psind);
9068 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
9069 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
9070 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
9071 if ((tpte & ATTR_AF) == ATTR_AF)
9072 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
9073
9074 pa = PTE_TO_PHYS(tpte) | (addr & mask);
9075 } else {
9076 managed = false;
9077 val = 0;
9078 }
9079
9080 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
9081 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
9082 *pap = pa;
9083 }
9084 PMAP_UNLOCK(pmap);
9085 return (val);
9086 }
9087
9088 /*
9089 * Garbage collect every ASID that is neither active on a processor nor
9090 * reserved.
9091 */
9092 static void
pmap_reset_asid_set(pmap_t pmap)9093 pmap_reset_asid_set(pmap_t pmap)
9094 {
9095 pmap_t curpmap;
9096 int asid, cpuid, epoch;
9097 struct asid_set *set;
9098 enum pmap_stage stage;
9099
9100 set = pmap->pm_asid_set;
9101 stage = pmap->pm_stage;
9102
9103 set = pmap->pm_asid_set;
9104 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
9105 mtx_assert(&set->asid_set_mutex, MA_OWNED);
9106
9107 /*
9108 * Ensure that the store to asid_epoch is globally visible before the
9109 * loads from pc_curpmap are performed.
9110 */
9111 epoch = set->asid_epoch + 1;
9112 if (epoch == INT_MAX)
9113 epoch = 0;
9114 set->asid_epoch = epoch;
9115 dsb(ishst);
9116 if (stage == PM_STAGE1) {
9117 __asm __volatile("tlbi vmalle1is");
9118 } else {
9119 KASSERT(pmap_clean_stage2_tlbi != NULL,
9120 ("%s: Unset stage 2 tlb invalidation callback\n",
9121 __func__));
9122 pmap_clean_stage2_tlbi();
9123 }
9124 dsb(ish);
9125 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
9126 set->asid_set_size - 1);
9127 CPU_FOREACH(cpuid) {
9128 if (cpuid == curcpu)
9129 continue;
9130 if (stage == PM_STAGE1) {
9131 curpmap = pcpu_find(cpuid)->pc_curpmap;
9132 PMAP_ASSERT_STAGE1(pmap);
9133 } else {
9134 curpmap = pcpu_find(cpuid)->pc_curvmpmap;
9135 if (curpmap == NULL)
9136 continue;
9137 PMAP_ASSERT_STAGE2(pmap);
9138 }
9139 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
9140 asid = COOKIE_TO_ASID(curpmap->pm_cookie);
9141 if (asid == -1)
9142 continue;
9143 bit_set(set->asid_set, asid);
9144 curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
9145 }
9146 }
9147
9148 /*
9149 * Allocate a new ASID for the specified pmap.
9150 */
9151 static void
pmap_alloc_asid(pmap_t pmap)9152 pmap_alloc_asid(pmap_t pmap)
9153 {
9154 struct asid_set *set;
9155 int new_asid;
9156
9157 set = pmap->pm_asid_set;
9158 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
9159
9160 mtx_lock_spin(&set->asid_set_mutex);
9161
9162 /*
9163 * While this processor was waiting to acquire the asid set mutex,
9164 * pmap_reset_asid_set() running on another processor might have
9165 * updated this pmap's cookie to the current epoch. In which case, we
9166 * don't need to allocate a new ASID.
9167 */
9168 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
9169 goto out;
9170
9171 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
9172 &new_asid);
9173 if (new_asid == -1) {
9174 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
9175 set->asid_next, &new_asid);
9176 if (new_asid == -1) {
9177 pmap_reset_asid_set(pmap);
9178 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
9179 set->asid_set_size, &new_asid);
9180 KASSERT(new_asid != -1, ("ASID allocation failure"));
9181 }
9182 }
9183 bit_set(set->asid_set, new_asid);
9184 set->asid_next = new_asid + 1;
9185 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
9186 out:
9187 mtx_unlock_spin(&set->asid_set_mutex);
9188 }
9189
9190 static uint64_t __read_mostly ttbr_flags;
9191
9192 /*
9193 * Compute the value that should be stored in ttbr0 to activate the specified
9194 * pmap. This value may change from time to time.
9195 */
9196 uint64_t
pmap_to_ttbr0(pmap_t pmap)9197 pmap_to_ttbr0(pmap_t pmap)
9198 {
9199 uint64_t ttbr;
9200
9201 ttbr = pmap->pm_ttbr;
9202 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
9203 ttbr |= ttbr_flags;
9204
9205 return (ttbr);
9206 }
9207
9208 static void
pmap_set_cnp(void * arg)9209 pmap_set_cnp(void *arg)
9210 {
9211 uint64_t ttbr0, ttbr1;
9212 u_int cpuid;
9213
9214 cpuid = *(u_int *)arg;
9215 if (cpuid == curcpu) {
9216 /*
9217 * Set the flags while all CPUs are handling the
9218 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls
9219 * to pmap_to_ttbr0 after this will have the CnP flag set.
9220 * The dsb after invalidating the TLB will act as a barrier
9221 * to ensure all CPUs can observe this change.
9222 */
9223 ttbr_flags |= TTBR_CnP;
9224 }
9225
9226 ttbr0 = READ_SPECIALREG(ttbr0_el1);
9227 ttbr0 |= TTBR_CnP;
9228
9229 ttbr1 = READ_SPECIALREG(ttbr1_el1);
9230 ttbr1 |= TTBR_CnP;
9231
9232 /* Update ttbr{0,1}_el1 with the CnP flag */
9233 WRITE_SPECIALREG(ttbr0_el1, ttbr0);
9234 WRITE_SPECIALREG(ttbr1_el1, ttbr1);
9235 isb();
9236 __asm __volatile("tlbi vmalle1is");
9237 dsb(ish);
9238 isb();
9239 }
9240
9241 /*
9242 * Defer enabling some features until we have read the ID registers to know
9243 * if they are supported on all CPUs.
9244 */
9245 static void
pmap_init_mp(void * dummy __unused)9246 pmap_init_mp(void *dummy __unused)
9247 {
9248 uint64_t reg;
9249
9250 get_kernel_reg(ID_AA64PFR1_EL1, ®);
9251 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
9252 if (bootverbose)
9253 printf("Enabling BTI\n");
9254 pmap_bti_support = true;
9255
9256 pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
9257 sizeof(struct rs_el), NULL, NULL, NULL, NULL,
9258 UMA_ALIGN_PTR, 0);
9259 }
9260 }
9261 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
9262
9263 /*
9264 * Defer enabling CnP until we have read the ID registers to know if it's
9265 * supported on all CPUs.
9266 */
9267 static void
pmap_init_cnp(void * dummy __unused)9268 pmap_init_cnp(void *dummy __unused)
9269 {
9270 uint64_t reg;
9271 u_int cpuid;
9272
9273 get_kernel_reg(ID_AA64MMFR2_EL1, ®);
9274 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
9275 if (bootverbose)
9276 printf("Enabling CnP\n");
9277 cpuid = curcpu;
9278 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
9279 }
9280
9281 }
9282 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
9283
9284 static bool
pmap_activate_int(struct thread * td,pmap_t pmap)9285 pmap_activate_int(struct thread *td, pmap_t pmap)
9286 {
9287 struct asid_set *set;
9288 int epoch;
9289
9290 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
9291 KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
9292
9293 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
9294 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
9295 /*
9296 * Handle the possibility that the old thread was preempted
9297 * after an "ic" or "tlbi" instruction but before it performed
9298 * a "dsb" instruction. If the old thread migrates to a new
9299 * processor, its completion of a "dsb" instruction on that
9300 * new processor does not guarantee that the "ic" or "tlbi"
9301 * instructions performed on the old processor have completed.
9302 */
9303 dsb(ish);
9304 return (false);
9305 }
9306
9307 set = pmap->pm_asid_set;
9308 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
9309
9310 /*
9311 * Ensure that the store to curpmap is globally visible before the
9312 * load from asid_epoch is performed.
9313 */
9314 if (pmap->pm_stage == PM_STAGE1)
9315 PCPU_SET(curpmap, pmap);
9316 else
9317 PCPU_SET(curvmpmap, pmap);
9318 dsb(ish);
9319 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
9320 if (epoch >= 0 && epoch != set->asid_epoch)
9321 pmap_alloc_asid(pmap);
9322
9323 if (pmap->pm_stage == PM_STAGE1) {
9324 uint64_t new_tcr, tcr;
9325
9326 new_tcr = td->td_proc->p_md.md_tcr;
9327 tcr = READ_SPECIALREG(tcr_el1);
9328 if ((tcr & MD_TCR_FIELDS) != new_tcr) {
9329 tcr &= ~MD_TCR_FIELDS;
9330 tcr |= new_tcr;
9331 WRITE_SPECIALREG(tcr_el1, tcr);
9332 }
9333 set_ttbr0(pmap_to_ttbr0(pmap));
9334 if (PCPU_GET(bcast_tlbi_workaround) != 0)
9335 invalidate_local_icache();
9336 }
9337 return (true);
9338 }
9339
9340 void
pmap_activate_vm(pmap_t pmap)9341 pmap_activate_vm(pmap_t pmap)
9342 {
9343
9344 PMAP_ASSERT_STAGE2(pmap);
9345
9346 (void)pmap_activate_int(NULL, pmap);
9347 }
9348
9349 void
pmap_activate(struct thread * td)9350 pmap_activate(struct thread *td)
9351 {
9352 pmap_t pmap;
9353
9354 pmap = vmspace_pmap(td->td_proc->p_vmspace);
9355 PMAP_ASSERT_STAGE1(pmap);
9356 critical_enter();
9357 (void)pmap_activate_int(td, pmap);
9358 critical_exit();
9359 }
9360
9361 /*
9362 * Activate the thread we are switching to.
9363 * To simplify the assembly in cpu_throw return the new threads pcb.
9364 */
9365 struct pcb *
pmap_switch(struct thread * new)9366 pmap_switch(struct thread *new)
9367 {
9368 pcpu_bp_harden bp_harden;
9369 struct pcb *pcb;
9370 uint64_t sctlr;
9371
9372 /* Store the new curthread */
9373 PCPU_SET(curthread, new);
9374
9375 /* And the new pcb */
9376 pcb = new->td_pcb;
9377 PCPU_SET(curpcb, pcb);
9378
9379 if ((new->td_proc->p_flag & P_KPROC) == 0) {
9380 sctlr = READ_SPECIALREG(sctlr_el1);
9381 if ((sctlr & SCTLR_USER_MASK) != new->td_md.md_sctlr) {
9382 sctlr &= ~SCTLR_USER_MASK;
9383 sctlr |= new->td_md.md_sctlr;
9384 WRITE_SPECIALREG(sctlr_el1, sctlr);
9385 isb();
9386 }
9387 }
9388
9389 /*
9390 * TODO: We may need to flush the cache here if switching
9391 * to a user process.
9392 */
9393
9394 if (pmap_activate_int(new, vmspace_pmap(new->td_proc->p_vmspace))) {
9395 /*
9396 * Stop userspace from training the branch predictor against
9397 * other processes. This will call into a CPU specific
9398 * function that clears the branch predictor state.
9399 */
9400 bp_harden = PCPU_GET(bp_harden);
9401 if (bp_harden != NULL)
9402 bp_harden();
9403 }
9404
9405 return (pcb);
9406 }
9407
9408 void
pmap_sync_icache(pmap_t pmap,vm_offset_t va,vm_size_t sz)9409 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
9410 {
9411
9412 PMAP_ASSERT_STAGE1(pmap);
9413 KASSERT(ADDR_IS_CANONICAL(va),
9414 ("%s: Address not in canonical form: %lx", __func__, va));
9415
9416 if (ADDR_IS_KERNEL(va)) {
9417 cpu_icache_sync_range((void *)va, sz);
9418 } else {
9419 u_int len, offset;
9420 vm_paddr_t pa;
9421
9422 /* Find the length of data in this page to flush */
9423 offset = va & PAGE_MASK;
9424 len = imin(PAGE_SIZE - offset, sz);
9425
9426 while (sz != 0) {
9427 /* Extract the physical address & find it in the DMAP */
9428 pa = pmap_extract(pmap, va);
9429 if (pa != 0)
9430 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
9431
9432 /* Move to the next page */
9433 sz -= len;
9434 va += len;
9435 /* Set the length for the next iteration */
9436 len = imin(PAGE_SIZE, sz);
9437 }
9438 }
9439 }
9440
/*
 * Try to resolve a stage 2 fault described by "esr" at address "far"
 * within the pmap itself.
 *
 * Handles translation faults and access flag faults: when a non-zero entry
 * exists for the address, ATTR_AF and ATTR_DESCR_VALID are set on it.
 * Returns KERN_SUCCESS if an entry was updated and KERN_FAILURE otherwise,
 * including for every other fault status code.
 */
static int
pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pd_entry_t *pdep;
	pt_entry_t *ptep, pte;
	int rv, lvl, dfsc;

	PMAP_ASSERT_STAGE2(pmap);
	rv = KERN_FAILURE;

	/* Data and insn aborts use same encoding for FSC field. */
	dfsc = esr & ISS_DATA_DFSC_MASK;
	switch (dfsc) {
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		PMAP_LOCK(pmap);
		pdep = pmap_pde(pmap, far, &lvl);
		/*
		 * Give up unless the level returned by pmap_pde() is the one
		 * expected for the faulting level.
		 */
		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
			PMAP_UNLOCK(pmap);
			break;
		}

		/* Step down one level to the entry that faulted. */
		switch (lvl) {
		case 0:
			ptep = pmap_l0_to_l1(pdep, far);
			break;
		case 1:
			ptep = pmap_l1_to_l2(pdep, far);
			break;
		case 2:
			ptep = pmap_l2_to_l3(pdep, far);
			break;
		default:
			panic("%s: Invalid pde level %d", __func__,lvl);
		}
		/* Join the access flag path with the pmap lock still held. */
		goto fault_exec;

	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
fault_exec:
		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
			/*
			 * If accessing an executable page invalidate
			 * the I-cache so it will be valid when we
			 * continue execution in the guest. The D-cache
			 * is assumed to already be clean to the Point
			 * of Coherency.
			 */
			if ((pte & ATTR_S2_XN_MASK) !=
			    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
				invalidate_icache();
			}
			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
			rv = KERN_SUCCESS;
		}
		/* Both entry paths reach here with the lock held. */
		PMAP_UNLOCK(pmap);
		break;
	}

	return (rv);
}
9507
/*
 * Try to resolve a fault described by "esr" at address "far" within the
 * pmap layer, without involving the VM fault handler.
 *
 * Only instruction and data aborts are considered.  Stage 2 pmaps are
 * passed on to pmap_stage2_fault().  For stage 1 pmaps:
 *  - access flag faults set ATTR_AF on the existing entry;
 *  - permission faults caused by a data write to an entry with ATTR_SW_DBM
 *    clear ATTR_S1_AP_RW_BIT on a read-only entry;
 *  - translation faults are retried, as a break-before-make sequence can
 *    produce a transient fault.
 *
 * Returns KERN_SUCCESS when the faulting access can simply be restarted and
 * KERN_FAILURE otherwise.
 */
int
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pt_entry_t pte, *ptep;
	register_t intr;
	uint64_t ec, par;
	int lvl, rv;

	rv = KERN_FAILURE;

	/* Anything other than an instruction or data abort is not ours. */
	ec = ESR_ELx_EXCEPTION(esr);
	switch (ec) {
	case EXCP_INSN_ABORT_L:
	case EXCP_INSN_ABORT:
	case EXCP_DATA_ABORT_L:
	case EXCP_DATA_ABORT:
		break;
	default:
		return (rv);
	}

	if (pmap->pm_stage == PM_STAGE2)
		return (pmap_stage2_fault(pmap, esr, far));

	/* Data and insn aborts use same encoding for FSC field. */
	switch (esr & ISS_DATA_DFSC_MASK) {
	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		/* Access flag fault: mark the entry as accessed. */
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL) {
			pmap_set_bits(ptep, ATTR_AF);
			rv = KERN_SUCCESS;
			/*
			 * XXXMJ as an optimization we could mark the entry
			 * dirty if this is a write fault.
			 */
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_PF_L1:
	case ISS_DATA_DFSC_PF_L2:
	case ISS_DATA_DFSC_PF_L3:
		/* Only permission faults from data writes are handled. */
		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
		    (esr & ISS_DATA_WnR) == 0)
			return (rv);
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL &&
		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
			/*
			 * The entry has ATTR_SW_DBM set.  If it is still
			 * read-only, make it writable and invalidate the
			 * page's TLB entry.  If it is already writable there
			 * is nothing to change and the access is restarted.
			 */
			if ((pte & ATTR_S1_AP_RW_BIT) ==
			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
				pmap_s1_invalidate_page(pmap, far, true);
			}
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		/*
		 * Retry the translation. A break-before-make sequence can
		 * produce a transient fault.
		 */
		if (pmap == kernel_pmap) {
			/*
			 * The translation fault may have occurred within a
			 * critical section. Therefore, we must check the
			 * address without acquiring the kernel pmap's lock.
			 */
			if (pmap_klookup(far, NULL))
				rv = KERN_SUCCESS;
		} else {
			bool owned;

			/*
			 * In the EFIRT driver we lock the pmap before
			 * calling into the runtime service. As the lock
			 * is already owned by the current thread skip
			 * locking it again.
			 */
			owned = PMAP_OWNED(pmap);
			if (!owned)
				PMAP_LOCK(pmap);
			/* Ask the MMU to check the address. */
			intr = intr_disable();
			par = arm64_address_translate_s1e0r(far);
			intr_restore(intr);
			if (!owned)
				PMAP_UNLOCK(pmap);

			/*
			 * If the translation was successful, then we can
			 * return success to the trap handler.
			 */
			if (PAR_SUCCESS(par))
				rv = KERN_SUCCESS;
		}
		break;
	}

	return (rv);
}
9615
9616 /*
9617 * Increase the starting virtual address of the given mapping if a
9618 * different alignment might result in more superpage mappings.
9619 */
9620 void
pmap_align_superpage(vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t size)9621 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
9622 vm_offset_t *addr, vm_size_t size)
9623 {
9624 vm_offset_t superpage_offset;
9625
9626 if (size < L3C_SIZE)
9627 return;
9628 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
9629 offset += ptoa(object->pg_color);
9630
9631 /*
9632 * Considering the object's physical alignment, is the mapping large
9633 * enough to encompass an L2 (2MB/32MB) superpage ...
9634 */
9635 superpage_offset = offset & L2_OFFSET;
9636 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) {
9637 /*
9638 * If the virtual and physical alignments differ, then
9639 * increase the virtual address so that the alignments match.
9640 */
9641 if ((*addr & L2_OFFSET) < superpage_offset)
9642 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
9643 else if ((*addr & L2_OFFSET) > superpage_offset)
9644 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) +
9645 superpage_offset;
9646 return;
9647 }
9648 /* ... or an L3C (64KB/2MB) superpage? */
9649 superpage_offset = offset & L3C_OFFSET;
9650 if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) {
9651 if ((*addr & L3C_OFFSET) < superpage_offset)
9652 *addr = (*addr & ~L3C_OFFSET) + superpage_offset;
9653 else if ((*addr & L3C_OFFSET) > superpage_offset)
9654 *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) +
9655 superpage_offset;
9656 }
9657 }
9658
9659 /**
9660 * Get the kernel virtual address of a set of physical pages. If there are
9661 * physical addresses not covered by the DMAP perform a transient mapping
9662 * that will be removed when calling pmap_unmap_io_transient.
9663 *
9664 * \param page The pages the caller wishes to obtain the virtual
9665 * address on the kernel memory map.
9666 * \param vaddr On return contains the kernel virtual memory address
9667 * of the pages passed in the page parameter.
9668 * \param count Number of pages passed in.
9669 * \param can_fault true if the thread using the mapped pages can take
9670 * page faults, false otherwise.
9671 *
9672 * \returns true if the caller must call pmap_unmap_io_transient when
9673 * finished or false otherwise.
9674 *
9675 */
9676 bool
pmap_map_io_transient(vm_page_t page[],void * vaddr[],int count,bool can_fault)9677 pmap_map_io_transient(vm_page_t page[], void *vaddr[], int count,
9678 bool can_fault)
9679 {
9680 vm_paddr_t paddr;
9681 vmem_addr_t addr;
9682 bool needs_mapping;
9683 int error __diagused, i;
9684
9685 /*
9686 * Allocate any KVA space that we need, this is done in a separate
9687 * loop to prevent calling vmem_alloc while pinned.
9688 */
9689 needs_mapping = false;
9690 for (i = 0; i < count; i++) {
9691 paddr = VM_PAGE_TO_PHYS(page[i]);
9692 if (__predict_false(!PHYS_IN_DMAP(paddr))) {
9693 error = vmem_alloc(kernel_arena, PAGE_SIZE,
9694 M_BESTFIT | M_WAITOK, &addr);
9695 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
9696 vaddr[i] = (void *)addr;
9697 needs_mapping = true;
9698 } else {
9699 vaddr[i] = PHYS_TO_DMAP(paddr);
9700 }
9701 }
9702
9703 /* Exit early if everything is covered by the DMAP */
9704 if (!needs_mapping)
9705 return (false);
9706
9707 if (!can_fault)
9708 sched_pin();
9709 for (i = 0; i < count; i++) {
9710 paddr = VM_PAGE_TO_PHYS(page[i]);
9711 if (!PHYS_IN_DMAP(paddr)) {
9712 panic(
9713 "pmap_map_io_transient: TODO: Map out of DMAP data");
9714 }
9715 }
9716
9717 return (needs_mapping);
9718 }
9719
9720 void
pmap_unmap_io_transient(vm_page_t page[],void * vaddr[],int count,bool can_fault)9721 pmap_unmap_io_transient(vm_page_t page[], void *vaddr[], int count,
9722 bool can_fault)
9723 {
9724 vm_paddr_t paddr;
9725 int i;
9726
9727 if (!can_fault)
9728 sched_unpin();
9729 for (i = 0; i < count; i++) {
9730 paddr = VM_PAGE_TO_PHYS(page[i]);
9731 if (!PHYS_IN_DMAP(paddr)) {
9732 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
9733 }
9734 }
9735 }
9736
9737 bool
pmap_is_valid_memattr(pmap_t pmap __unused,vm_memattr_t mode)9738 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
9739 {
9740
9741 return (mode >= 0 && mode < VM_MEMATTR_END);
9742 }
9743
9744 static void *
bti_dup_range(void * ctx __unused,void * data)9745 bti_dup_range(void *ctx __unused, void *data)
9746 {
9747 struct rs_el *node, *new_node;
9748
9749 new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9750 if (new_node == NULL)
9751 return (NULL);
9752 node = data;
9753 memcpy(new_node, node, sizeof(*node));
9754 return (new_node);
9755 }
9756
9757 static void
bti_free_range(void * ctx __unused,void * node)9758 bti_free_range(void *ctx __unused, void *node)
9759 {
9760
9761 uma_zfree(pmap_bti_ranges_zone, node);
9762 }
9763
9764 static int
pmap_bti_assign(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9765 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9766 {
9767 struct rs_el *rs;
9768 int error;
9769
9770 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9771 PMAP_ASSERT_STAGE1(pmap);
9772 MPASS(pmap->pm_bti != NULL);
9773 rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9774 if (rs == NULL)
9775 return (ENOMEM);
9776 error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
9777 if (error != 0)
9778 uma_zfree(pmap_bti_ranges_zone, rs);
9779 return (error);
9780 }
9781
9782 static void
pmap_bti_deassign_all(pmap_t pmap)9783 pmap_bti_deassign_all(pmap_t pmap)
9784 {
9785
9786 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9787 if (pmap->pm_bti != NULL)
9788 rangeset_remove_all(pmap->pm_bti);
9789 }
9790
9791 /*
9792 * Returns true if the BTI setting is the same across the specified address
9793 * range, and false otherwise. When returning true, updates the referenced PTE
9794 * to reflect the BTI setting.
9795 *
9796 * Only stage 1 pmaps support BTI. The kernel pmap is always a stage 1 pmap
9797 * that has the same BTI setting implicitly across its entire address range.
9798 */
9799 static bool
pmap_bti_same(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t * pte)9800 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
9801 {
9802 struct rs_el *rs;
9803 vm_offset_t va;
9804
9805 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9806 KASSERT(ADDR_IS_CANONICAL(sva),
9807 ("%s: Start address not in canonical form: %lx", __func__, sva));
9808 KASSERT(ADDR_IS_CANONICAL(eva),
9809 ("%s: End address not in canonical form: %lx", __func__, eva));
9810 KASSERT((*pte & ATTR_S1_GP) == 0,
9811 ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte));
9812
9813 if (pmap == kernel_pmap) {
9814 *pte |= ATTR_KERN_GP;
9815 return (true);
9816 }
9817 if (pmap->pm_bti == NULL)
9818 return (true);
9819 PMAP_ASSERT_STAGE1(pmap);
9820 rs = rangeset_containing(pmap->pm_bti, sva);
9821 if (rs == NULL)
9822 return (rangeset_empty(pmap->pm_bti, sva, eva));
9823 while ((va = rs->re_end) < eva) {
9824 if ((rs = rangeset_beginning(pmap->pm_bti, va)) == NULL)
9825 return (false);
9826 }
9827 *pte |= ATTR_S1_GP;
9828 return (true);
9829 }
9830
9831 static pt_entry_t
pmap_pte_bti(pmap_t pmap,vm_offset_t va)9832 pmap_pte_bti(pmap_t pmap, vm_offset_t va)
9833 {
9834 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9835 MPASS(ADDR_IS_CANONICAL(va));
9836
9837 if (pmap->pm_stage != PM_STAGE1)
9838 return (0);
9839 if (pmap == kernel_pmap)
9840 return (ATTR_KERN_GP);
9841 if (pmap->pm_bti != NULL &&
9842 rangeset_containing(pmap->pm_bti, va) != NULL)
9843 return (ATTR_S1_GP);
9844 return (0);
9845 }
9846
9847 static void
pmap_bti_on_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9848 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9849 {
9850
9851 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9852 if (pmap->pm_bti != NULL)
9853 rangeset_remove(pmap->pm_bti, sva, eva);
9854 }
9855
9856 static int
pmap_bti_copy(pmap_t dst_pmap,pmap_t src_pmap)9857 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
9858 {
9859
9860 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
9861 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
9862 MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
9863 MPASS(src_pmap->pm_bti != NULL);
9864 MPASS(dst_pmap->pm_bti != NULL);
9865 if (src_pmap->pm_bti->rs_data_ctx == NULL)
9866 return (0);
9867 return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
9868 }
9869
9870 static void
pmap_bti_update_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool set)9871 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
9872 {
9873 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9874 PMAP_ASSERT_STAGE1(pmap);
9875
9876 pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
9877 true);
9878 }
9879
9880 int
pmap_bti_set(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9881 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9882 {
9883 int error;
9884
9885 if (pmap->pm_bti == NULL)
9886 return (0);
9887 if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
9888 return (EINVAL);
9889 if (pmap->pm_stage != PM_STAGE1)
9890 return (EINVAL);
9891 if (eva <= sva || ADDR_IS_KERNEL(eva))
9892 return (EFAULT);
9893
9894 sva = trunc_page(sva);
9895 eva = round_page(eva);
9896 for (;;) {
9897 PMAP_LOCK(pmap);
9898 error = pmap_bti_assign(pmap, sva, eva);
9899 if (error == 0)
9900 pmap_bti_update_range(pmap, sva, eva, true);
9901 PMAP_UNLOCK(pmap);
9902 if (error != ENOMEM)
9903 break;
9904 vm_wait(NULL);
9905 }
9906 return (error);
9907 }
9908
9909 #if defined(KASAN) || defined(KMSAN)
/*
 * L2 table of the temporary shadow map used before pmap_bootstrap().  It is
 * NULL until pmap_san_enter_bootstrap() has run.
 */
static pd_entry_t *pmap_san_early_l2;

/* Size of the static pool handing out L2_SIZE blocks for the early map. */
#define	SAN_BOOTSTRAP_L2_SIZE	(1 * L2_SIZE)
/* Size of the static pool handing out page table pages for the early map. */
#define	SAN_BOOTSTRAP_SIZE	(2 * PAGE_SIZE)
9914 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_l2(void)9915 pmap_san_enter_bootstrap_alloc_l2(void)
9916 {
9917 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
9918 static size_t offset = 0;
9919 vm_offset_t addr;
9920
9921 if (offset + L2_SIZE > sizeof(bootstrap_data)) {
9922 panic("%s: out of memory for the bootstrap shadow map L2 entries",
9923 __func__);
9924 }
9925
9926 addr = (uintptr_t)&bootstrap_data[offset];
9927 offset += L2_SIZE;
9928 return (addr);
9929 }
9930
9931 /*
9932 * SAN L1 + L2 pages, maybe L3 entries later?
9933 */
9934 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_pages(int npages)9935 pmap_san_enter_bootstrap_alloc_pages(int npages)
9936 {
9937 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
9938 static size_t offset = 0;
9939 vm_offset_t addr;
9940
9941 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
9942 panic("%s: out of memory for the bootstrap shadow map",
9943 __func__);
9944 }
9945
9946 addr = (uintptr_t)&bootstrap_data[offset];
9947 offset += (npages * PAGE_SIZE);
9948 return (addr);
9949 }
9950
9951 static void __nosanitizeaddress
pmap_san_enter_bootstrap(void)9952 pmap_san_enter_bootstrap(void)
9953 {
9954 vm_offset_t freemempos;
9955
9956 /* L1, L2 */
9957 freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
9958 bs_state.freemempos = freemempos;
9959 bs_state.va = KASAN_MIN_ADDRESS;
9960 pmap_bootstrap_l1_table(&bs_state);
9961 pmap_san_early_l2 = bs_state.l2;
9962 }
9963
9964 static vm_page_t
pmap_san_enter_alloc_l3(void)9965 pmap_san_enter_alloc_l3(void)
9966 {
9967 vm_page_t m;
9968
9969 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
9970 VM_ALLOC_ZERO);
9971 if (m == NULL)
9972 panic("%s: no memory to grow shadow map", __func__);
9973 return (m);
9974 }
9975
9976 static vm_page_t
pmap_san_enter_alloc_l2(void)9977 pmap_san_enter_alloc_l2(void)
9978 {
9979 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
9980 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
9981 }
9982
9983 void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)9984 pmap_san_enter(vm_offset_t va)
9985 {
9986 pd_entry_t *l1, *l2;
9987 pt_entry_t *l3;
9988 vm_page_t m;
9989
9990 if (virtual_avail == 0) {
9991 vm_offset_t block;
9992 int slot;
9993 bool first;
9994
9995 /* Temporary shadow map prior to pmap_bootstrap(). */
9996 first = pmap_san_early_l2 == NULL;
9997 if (first)
9998 pmap_san_enter_bootstrap();
9999
10000 l2 = pmap_san_early_l2;
10001 slot = pmap_l2_index(va);
10002
10003 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
10004 MPASS(first);
10005 block = pmap_san_enter_bootstrap_alloc_l2();
10006 pmap_store(&l2[slot],
10007 PHYS_TO_PTE(pmap_early_vtophys(block)) |
10008 PMAP_SAN_PTE_BITS | L2_BLOCK);
10009 dmb(ishst);
10010 }
10011
10012 return;
10013 }
10014
10015 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
10016 l1 = pmap_l1(kernel_pmap, va);
10017 MPASS(l1 != NULL);
10018 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
10019 m = pmap_san_enter_alloc_l3();
10020 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
10021 }
10022 l2 = pmap_l1_to_l2(l1, va);
10023 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
10024 m = pmap_san_enter_alloc_l2();
10025 if (m != NULL) {
10026 pmap_store(l2, VM_PAGE_TO_PTE(m) |
10027 PMAP_SAN_PTE_BITS | L2_BLOCK);
10028 } else {
10029 m = pmap_san_enter_alloc_l3();
10030 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
10031 }
10032 dmb(ishst);
10033 }
10034 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
10035 return;
10036 l3 = pmap_l2_to_l3(l2, va);
10037 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
10038 return;
10039 m = pmap_san_enter_alloc_l3();
10040 pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
10041 dmb(ishst);
10042 }
10043 #endif /* KASAN || KMSAN */
10044
/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	/*
	 * Start of the range; sysctl_kmaps_dump() resets it to the all-ones
	 * sentinel after printing.
	 */
	vm_offset_t sva;
	/* Attribute bits shared by the range; compared by sysctl_kmaps_match(). */
	pt_entry_t attrs;
	/*
	 * Per mapping-size counters printed by sysctl_kmaps_dump().
	 * NOTE(review): presumably the number of mappings of each kind seen
	 * in the range -- they are maintained outside this view; confirm.
	 */
	int l3pages;
	int l3contig;
	int l2blocks;
	int l2contig;
	int l1blocks;
};
10058
10059 static void
sysctl_kmaps_dump(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t eva)10060 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
10061 vm_offset_t eva)
10062 {
10063 const char *mode;
10064 int index;
10065
10066 if (eva <= range->sva)
10067 return;
10068
10069 index = range->attrs & ATTR_S1_IDX_MASK;
10070 switch (index) {
10071 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
10072 mode = "DEV-NP";
10073 break;
10074 case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
10075 mode = "DEV";
10076 break;
10077 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
10078 mode = "UC";
10079 break;
10080 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
10081 mode = "WB";
10082 break;
10083 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
10084 mode = "WT";
10085 break;
10086 case ATTR_S1_IDX(VM_MEMATTR_TAGGED):
10087 mode = "TAGGED";
10088 break;
10089 default:
10090 printf(
10091 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
10092 __func__, index, range->sva, eva);
10093 mode = "??";
10094 break;
10095 }
10096
10097 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n",
10098 range->sva, eva,
10099 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
10100 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
10101 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
10102 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
10103 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
10104 mode, range->l1blocks, range->l2contig, range->l2blocks,
10105 range->l3contig, range->l3pages);
10106
10107 /* Reset to sentinel value. */
10108 range->sva = 0xfffffffffffffffful;
10109 }
10110
10111 /*
10112 * Determine whether the attributes specified by a page table entry match those
10113 * being tracked by the current range.
10114 */
10115 static bool
sysctl_kmaps_match(struct pmap_kernel_map_range * range,pt_entry_t attrs)10116 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
10117 {
10118
10119 return (range->attrs == attrs);
10120 }
10121
10122 static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range * range,vm_offset_t va,pt_entry_t attrs)10123 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
10124 pt_entry_t attrs)
10125 {
10126
10127 memset(range, 0, sizeof(*range));
10128 range->sva = va;
10129 range->attrs = attrs;
10130 }
10131
10132 /* Get the block/page attributes that correspond to the table attributes */
10133 static pt_entry_t
sysctl_kmaps_table_attrs(pd_entry_t table)10134 sysctl_kmaps_table_attrs(pd_entry_t table)
10135 {
10136 pt_entry_t attrs;
10137
10138 attrs = 0;
10139 if ((table & TATTR_UXN_TABLE) != 0)
10140 attrs |= ATTR_S1_UXN;
10141 if ((table & TATTR_PXN_TABLE) != 0)
10142 attrs |= ATTR_S1_PXN;
10143 if ((table & TATTR_AP_TABLE_RO) != 0)
10144 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
10145
10146 return (attrs);
10147 }
10148
10149 /* Read the block/page attributes we care about */
10150 static pt_entry_t
sysctl_kmaps_block_attrs(pt_entry_t block)10151 sysctl_kmaps_block_attrs(pt_entry_t block)
10152 {
10153 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
10154 ATTR_S1_GP));
10155 }
10156
10157 /*
10158 * Given a leaf PTE, derive the mapping's attributes. If they do not match
10159 * those of the current run, dump the address range and its attributes, and
10160 * begin a new run.
10161 */
10162 static void
sysctl_kmaps_check(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t va,pd_entry_t l0e,pd_entry_t l1e,pd_entry_t l2e,pt_entry_t l3e)10163 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
10164 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
10165 pt_entry_t l3e)
10166 {
10167 pt_entry_t attrs;
10168
10169 attrs = sysctl_kmaps_table_attrs(l0e);
10170
10171 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
10172 attrs |= sysctl_kmaps_block_attrs(l1e);
10173 goto done;
10174 }
10175 attrs |= sysctl_kmaps_table_attrs(l1e);
10176
10177 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
10178 attrs |= sysctl_kmaps_block_attrs(l2e);
10179 goto done;
10180 }
10181 attrs |= sysctl_kmaps_table_attrs(l2e);
10182 attrs |= sysctl_kmaps_block_attrs(l3e);
10183
10184 done:
10185 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
10186 sysctl_kmaps_dump(sb, range, va);
10187 sysctl_kmaps_reinit(range, va, attrs);
10188 }
10189 }
10190
10191 static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)10192 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
10193 {
10194 struct pmap_kernel_map_range range;
10195 struct sbuf sbuf, *sb;
10196 pd_entry_t l0e, *l1, l1e, *l2, l2e;
10197 pt_entry_t *l3, l3e;
10198 vm_offset_t sva;
10199 vm_paddr_t pa;
10200 int error, i, j, k, l;
10201
10202 error = sysctl_wire_old_buffer(req, 0);
10203 if (error != 0)
10204 return (error);
10205 sb = &sbuf;
10206 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
10207
10208 /* Sentinel value. */
10209 range.sva = 0xfffffffffffffffful;
10210
10211 /*
10212 * Iterate over the kernel page tables without holding the kernel pmap
10213 * lock. Kernel page table pages are never freed, so at worst we will
10214 * observe inconsistencies in the output.
10215 */
10216 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
10217 i++) {
10218 if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
10219 sbuf_printf(sb, "\nDirect map:\n");
10220 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
10221 sbuf_printf(sb, "\nKernel map:\n");
10222 #ifdef KASAN
10223 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
10224 sbuf_printf(sb, "\nKASAN shadow map:\n");
10225 #endif
10226 #ifdef KMSAN
10227 else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
10228 sbuf_printf(sb, "\nKMSAN shadow map:\n");
10229 else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
10230 sbuf_printf(sb, "\nKMSAN origin map:\n");
10231 #endif
10232
10233 l0e = kernel_pmap->pm_l0[i];
10234 if ((l0e & ATTR_DESCR_VALID) == 0) {
10235 sysctl_kmaps_dump(sb, &range, sva);
10236 sva += L0_SIZE;
10237 continue;
10238 }
10239 pa = PTE_TO_PHYS(l0e);
10240 l1 = PHYS_TO_DMAP(pa);
10241
10242 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
10243 l1e = l1[j];
10244 if ((l1e & ATTR_DESCR_VALID) == 0) {
10245 sysctl_kmaps_dump(sb, &range, sva);
10246 sva += L1_SIZE;
10247 continue;
10248 }
10249 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
10250 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
10251 sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
10252 0, 0);
10253 range.l1blocks++;
10254 sva += L1_SIZE;
10255 continue;
10256 }
10257 pa = PTE_TO_PHYS(l1e);
10258 l2 = PHYS_TO_DMAP(pa);
10259
10260 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
10261 l2e = l2[k];
10262 if ((l2e & ATTR_DESCR_VALID) == 0) {
10263 sysctl_kmaps_dump(sb, &range, sva);
10264 sva += L2_SIZE;
10265 continue;
10266 }
10267 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
10268 sysctl_kmaps_check(sb, &range, sva,
10269 l0e, l1e, l2e, 0);
10270 if ((l2e & ATTR_CONTIGUOUS) != 0)
10271 range.l2contig +=
10272 k % L2C_ENTRIES == 0 ?
10273 1 : 0;
10274 else
10275 range.l2blocks++;
10276 sva += L2_SIZE;
10277 continue;
10278 }
10279 pa = PTE_TO_PHYS(l2e);
10280 l3 = PHYS_TO_DMAP(pa);
10281
10282 for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
10283 l++, sva += L3_SIZE) {
10284 l3e = l3[l];
10285 if ((l3e & ATTR_DESCR_VALID) == 0) {
10286 sysctl_kmaps_dump(sb, &range,
10287 sva);
10288 continue;
10289 }
10290 sysctl_kmaps_check(sb, &range, sva,
10291 l0e, l1e, l2e, l3e);
10292 if ((l3e & ATTR_CONTIGUOUS) != 0)
10293 range.l3contig +=
10294 l % L3C_ENTRIES == 0 ?
10295 1 : 0;
10296 else
10297 range.l3pages++;
10298 }
10299 }
10300 }
10301 }
10302
10303 error = sbuf_finish(sb);
10304 sbuf_delete(sb);
10305 return (error);
10306 }
10307 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
10308 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
10309 NULL, 0, sysctl_kmaps, "A",
10310 "Dump kernel address layout");
10311
10312
10313 void pagezero_simple(void *);
10314 void pagezero_cache(void *);
10315 void pagezero_mops(void *);
10316
10317 DEFINE_IFUNC(static, void, pagezero, (void *))
10318 {
10319 uint32_t dczid_el0;
10320
10321 dczid_el0 = READ_SPECIALREG(dczid_el0);
10322
10323 if (elf_hwcap2 & HWCAP2_MOPS)
10324 return (pagezero_mops);
10325 else if ((dczid_el0 & DCZID_DZP) == 0)
10326 return (pagezero_cache);
10327 else
10328 return (pagezero_simple);
10329 }
10330