1 /*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2003 Peter Wemm 9 * All rights reserved. 10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * Copyright (c) 2014 Andrew Turner 13 * All rights reserved. 14 * Copyright (c) 2014-2016 The FreeBSD Foundation 15 * All rights reserved. 16 * 17 * This code is derived from software contributed to Berkeley by 18 * the Systems Programming Group of the University of Utah Computer 19 * Science Department and William Jolitz of UUNET Technologies Inc. 20 * 21 * This software was developed by Andrew Turner under sponsorship from 22 * the FreeBSD Foundation. 23 * 24 * Redistribution and use in source and binary forms, with or without 25 * modification, are permitted provided that the following conditions 26 * are met: 27 * 1. Redistributions of source code must retain the above copyright 28 * notice, this list of conditions and the following disclaimer. 29 * 2. Redistributions in binary form must reproduce the above copyright 30 * notice, this list of conditions and the following disclaimer in the 31 * documentation and/or other materials provided with the distribution. 32 * 3. All advertising materials mentioning features or use of this software 33 * must display the following acknowledgement: 34 * This product includes software developed by the University of 35 * California, Berkeley and its contributors. 36 * 4. Neither the name of the University nor the names of its contributors 37 * may be used to endorse or promote products derived from this software 38 * without specific prior written permission. 
39 * 40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 50 * SUCH DAMAGE. 51 */ 52 /*- 53 * Copyright (c) 2003 Networks Associates Technology, Inc. 54 * All rights reserved. 55 * 56 * This software was developed for the FreeBSD Project by Jake Burkholder, 57 * Safeport Network Services, and Network Associates Laboratories, the 58 * Security Research Division of Network Associates, Inc. under 59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 60 * CHATS research program. 61 * 62 * Redistribution and use in source and binary forms, with or without 63 * modification, are permitted provided that the following conditions 64 * are met: 65 * 1. Redistributions of source code must retain the above copyright 66 * notice, this list of conditions and the following disclaimer. 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in the 69 * documentation and/or other materials provided with the distribution. 
70 * 71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 81 * SUCH DAMAGE. 82 */ 83 84 #include <sys/cdefs.h> 85 /* 86 * Manages physical address maps. 87 * 88 * Since the information managed by this module is 89 * also stored by the logical address mapping module, 90 * this module may throw away valid virtual-to-physical 91 * mappings at almost any time. However, invalidations 92 * of virtual-to-physical mappings must be done as 93 * requested. 94 * 95 * In order to cope with hardware architectures which 96 * make virtual-to-physical map invalidates expensive, 97 * this module may delay invalidate or reduced protection 98 * operations until such time as they are actually 99 * necessary. This module is given full information as 100 * to which processors are currently using which maps, 101 * and to when physical maps must be made correct. 
102 */ 103 104 #include "opt_vm.h" 105 106 #include <sys/param.h> 107 #include <sys/asan.h> 108 #include <sys/bitstring.h> 109 #include <sys/bus.h> 110 #include <sys/systm.h> 111 #include <sys/kernel.h> 112 #include <sys/ktr.h> 113 #include <sys/limits.h> 114 #include <sys/lock.h> 115 #include <sys/malloc.h> 116 #include <sys/mman.h> 117 #include <sys/msan.h> 118 #include <sys/msgbuf.h> 119 #include <sys/mutex.h> 120 #include <sys/physmem.h> 121 #include <sys/proc.h> 122 #include <sys/rangeset.h> 123 #include <sys/rwlock.h> 124 #include <sys/sbuf.h> 125 #include <sys/sx.h> 126 #include <sys/vmem.h> 127 #include <sys/vmmeter.h> 128 #include <sys/sched.h> 129 #include <sys/sysctl.h> 130 #include <sys/_unrhdr.h> 131 #include <sys/smp.h> 132 133 #include <vm/vm.h> 134 #include <vm/vm_param.h> 135 #include <vm/vm_kern.h> 136 #include <vm/vm_page.h> 137 #include <vm/vm_map.h> 138 #include <vm/vm_object.h> 139 #include <vm/vm_extern.h> 140 #include <vm/vm_pageout.h> 141 #include <vm/vm_pager.h> 142 #include <vm/vm_phys.h> 143 #include <vm/vm_radix.h> 144 #include <vm/vm_reserv.h> 145 #include <vm/vm_dumpset.h> 146 #include <vm/uma.h> 147 148 #include <machine/asan.h> 149 #include <machine/cpu.h> 150 #include <machine/cpu_feat.h> 151 #include <machine/elf.h> 152 #include <machine/ifunc.h> 153 #include <machine/machdep.h> 154 #include <machine/md_var.h> 155 #include <machine/pcb.h> 156 #include <machine/rsi.h> 157 158 #ifdef NUMA 159 #define PMAP_MEMDOM MAXMEMDOM 160 #else 161 #define PMAP_MEMDOM 1 162 #endif 163 164 #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1) 165 #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2) 166 167 #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) 168 #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) 169 #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) 170 #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) 171 172 #define NUL0E L0_ENTRIES 173 #define NUL1E (NUL0E * NL1PG) 174 #define NUL2E (NUL1E * NL2PG) 175 176 
#ifdef PV_STATS 177 #define PV_STAT(x) do { x ; } while (0) 178 #define __pvused 179 #else 180 #define PV_STAT(x) do { } while (0) 181 #define __pvused __unused 182 #endif 183 184 #define pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT)) 185 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) 186 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 187 188 #ifdef __ARM_FEATURE_BTI_DEFAULT 189 pt_entry_t __read_mostly pmap_gp_attr; 190 #define ATTR_KERN_GP pmap_gp_attr 191 #else 192 #define ATTR_KERN_GP 0 193 #endif 194 #define PMAP_SAN_PTE_BITS (ATTR_AF | ATTR_S1_XN | pmap_sh_attr | \ 195 ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW)) 196 197 static bool __read_mostly pmap_multiple_tlbi = false; 198 199 struct pmap_large_md_page { 200 struct rwlock pv_lock; 201 struct md_page pv_page; 202 /* Pad to a power of 2, see pmap_init_pv_table(). */ 203 int pv_pad[2]; 204 }; 205 206 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 207 #define pv_dummy pv_dummy_large.pv_page 208 __read_mostly static struct pmap_large_md_page *pv_table; 209 210 __read_mostly uint64_t prot_ns_shared_pa; 211 212 static struct pmap_large_md_page * 213 _pa_to_pmdp(vm_paddr_t pa) 214 { 215 struct vm_phys_seg *seg; 216 217 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) 218 return ((struct pmap_large_md_page *)seg->md_first + 219 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start)); 220 return (NULL); 221 } 222 223 static struct pmap_large_md_page * 224 pa_to_pmdp(vm_paddr_t pa) 225 { 226 struct pmap_large_md_page *pvd; 227 228 pvd = _pa_to_pmdp(pa); 229 if (pvd == NULL) 230 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa); 231 return (pvd); 232 } 233 234 static struct pmap_large_md_page * 235 page_to_pmdp(vm_page_t m) 236 { 237 struct vm_phys_seg *seg; 238 239 seg = &vm_phys_segs[m->segind]; 240 return ((struct pmap_large_md_page *)seg->md_first + 241 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start)); 242 } 243 244 #define 
pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) 245 #define page_to_pvh(m) (&(page_to_pmdp(m)->pv_page)) 246 247 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \ 248 struct pmap_large_md_page *_pvd; \ 249 struct rwlock *_lock; \ 250 _pvd = _pa_to_pmdp(pa); \ 251 if (__predict_false(_pvd == NULL)) \ 252 _lock = &pv_dummy_large.pv_lock; \ 253 else \ 254 _lock = &(_pvd->pv_lock); \ 255 _lock; \ 256 }) 257 258 static struct rwlock * 259 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m) 260 { 261 if ((m->flags & PG_FICTITIOUS) == 0) 262 return (&page_to_pmdp(m)->pv_lock); 263 else 264 return (&pv_dummy_large.pv_lock); 265 } 266 267 #define CHANGE_PV_LIST_LOCK(lockp, new_lock) do { \ 268 struct rwlock **_lockp = (lockp); \ 269 struct rwlock *_new_lock = (new_lock); \ 270 \ 271 if (_new_lock != *_lockp) { \ 272 if (*_lockp != NULL) \ 273 rw_wunlock(*_lockp); \ 274 *_lockp = _new_lock; \ 275 rw_wlock(*_lockp); \ 276 } \ 277 } while (0) 278 279 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) \ 280 CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa)) 281 282 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 283 CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m)) 284 285 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 286 struct rwlock **_lockp = (lockp); \ 287 \ 288 if (*_lockp != NULL) { \ 289 rw_wunlock(*_lockp); \ 290 *_lockp = NULL; \ 291 } \ 292 } while (0) 293 294 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte)) 295 #define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) 296 297 static struct mtx cmap_lock; 298 static void *cmap1_addr; 299 static pt_entry_t *cmap1_pte; 300 301 /* 302 * The presence of this flag indicates that the mapping is writeable. 303 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise 304 * it is dirty. This flag may only be set on managed mappings. 305 * 306 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it 307 * as a software managed bit. 
308 */ 309 #define ATTR_SW_DBM ATTR_DBM 310 311 struct pmap kernel_pmap_store; 312 313 /* Used for mapping ACPI memory before VM is initialized */ 314 #define PMAP_PREINIT_MAPPING_COUNT 32 315 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) 316 static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ 317 static int vm_initialized = 0; /* No need to use pre-init maps when set */ 318 319 /* 320 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. 321 * Always map entire L2 block for simplicity. 322 * VA of L2 block = preinit_map_va + i * L2_SIZE 323 */ 324 static struct pmap_preinit_mapping { 325 vm_paddr_t pa; 326 void *va; 327 vm_size_t size; 328 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 329 330 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 331 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 332 vm_offset_t kernel_vm_end = 0; 333 334 /* 335 * Data for the pv entry allocation mechanism. 
336 */ 337 #ifdef NUMA 338 static __inline int 339 pc_to_domain(struct pv_chunk *pc) 340 { 341 return (vm_phys_domain(DMAP_TO_PHYS(pc))); 342 } 343 #else 344 static __inline int 345 pc_to_domain(struct pv_chunk *pc __unused) 346 { 347 return (0); 348 } 349 #endif 350 351 struct pv_chunks_list { 352 struct mtx pvc_lock; 353 TAILQ_HEAD(pch, pv_chunk) pvc_list; 354 int active_reclaims; 355 } __aligned(CACHE_LINE_SIZE); 356 357 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 358 359 vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 360 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 361 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 362 static int dmap_attr = VM_MEMATTR_WRITE_BACK; 363 364 extern pt_entry_t pagetable_l0_ttbr1[]; 365 366 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 367 static vm_paddr_t physmap[PHYSMAP_SIZE]; 368 static u_int physmap_idx; 369 370 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 371 "VM/pmap parameters"); 372 373 static int pmap_growkernel_panic = 0; 374 SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN, 375 &pmap_growkernel_panic, 0, 376 "panic on failure to allocate kernel page table page"); 377 378 bool pmap_lpa_enabled __read_mostly = false; 379 pt_entry_t pmap_sh_attr __read_mostly = ATTR_SH(ATTR_SH_IS); 380 381 #if PAGE_SIZE == PAGE_SIZE_4K 382 #define L1_BLOCKS_SUPPORTED 1 383 #else 384 #define L1_BLOCKS_SUPPORTED (pmap_lpa_enabled) 385 #endif 386 387 #define PMAP_ASSERT_L1_BLOCKS_SUPPORTED MPASS(L1_BLOCKS_SUPPORTED) 388 389 static bool pmap_l1_supported __read_mostly = false; 390 391 /* 392 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs 393 * that it has currently allocated to a pmap, a cursor ("asid_next") to 394 * optimize its search for a free ASID in the bit vector, and an epoch number 395 * ("asid_epoch") to indicate when it has reclaimed all previously allocated 396 * ASIDs that are not 
currently active on a processor. 397 * 398 * The current epoch number is always in the range [0, INT_MAX). Negative 399 * numbers and INT_MAX are reserved for special cases that are described 400 * below. 401 */ 402 struct asid_set { 403 int asid_bits; 404 bitstr_t *asid_set; 405 int asid_set_size; 406 int asid_next; 407 int asid_epoch; 408 struct mtx asid_set_mutex; 409 }; 410 411 static struct asid_set asids; 412 static struct asid_set vmids; 413 414 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 415 "ASID allocator"); 416 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0, 417 "The number of bits in an ASID"); 418 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0, 419 "The last allocated ASID plus one"); 420 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0, 421 "The current epoch number"); 422 423 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator"); 424 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0, 425 "The number of bits in an VMID"); 426 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0, 427 "The last allocated VMID plus one"); 428 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0, 429 "The current epoch number"); 430 431 void (*pmap_clean_stage2_tlbi)(void); 432 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool); 433 void (*pmap_stage2_invalidate_all)(uint64_t); 434 435 /* 436 * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved 437 * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for 438 * dynamically allocated ASIDs have a non-negative epoch number. 439 * 440 * An invalid ASID is represented by -1. 
441 * 442 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN), 443 * which indicates that an ASID should never be allocated to the pmap, and 444 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be 445 * allocated when the pmap is next activated. 446 */ 447 #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \ 448 ((u_long)(epoch) << 32))) 449 #define COOKIE_TO_ASID(cookie) ((int)(cookie)) 450 #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32)) 451 452 #define TLBI_VA_SHIFT 12 453 #define TLBI_VA_MASK ((1ul << 44) - 1) 454 #define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK) 455 456 static int __read_frequently superpages_enabled = 1; 457 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 458 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, 459 "Are large page mappings enabled?"); 460 461 /* 462 * True when Branch Target Identification should be used by userspace. This 463 * allows pmap to mark pages as guarded with ATTR_S1_GP. 464 */ 465 __read_mostly static bool pmap_bti_support = false; 466 467 /* 468 * Internal flags for pmap_enter()'s helper functions. 469 */ 470 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 471 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. 
*/ 472 473 TAILQ_HEAD(pv_chunklist, pv_chunk); 474 475 static void free_pv_chunk(struct pv_chunk *pc); 476 static void free_pv_chunk_batch(struct pv_chunklist *batch); 477 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 478 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 479 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 480 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 481 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 482 vm_offset_t va); 483 484 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 485 static bool pmap_activate_int(struct thread *td, pmap_t pmap); 486 static void pmap_alloc_asid(pmap_t pmap); 487 static int pmap_change_props_locked(void *addr, vm_size_t size, 488 vm_prot_t prot, int mode, int old_mode, bool skip_unmapped); 489 static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 490 pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp); 491 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); 492 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, 493 vm_offset_t va, struct rwlock **lockp); 494 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); 495 static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va); 496 static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va); 497 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 498 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 499 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 500 u_int flags, vm_page_t m, struct rwlock **lockp); 501 static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags, 502 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp); 503 static bool pmap_every_pte_zero(vm_paddr_t pa); 504 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t 
mpte, bool promoted, 505 bool all_l3e_AF_set); 506 static pt_entry_t pmap_load_l3c(pt_entry_t *l3p); 507 static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 508 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits); 509 static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m, 510 struct rwlock **lockp); 511 static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); 512 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 513 pd_entry_t l1e, bool demote_kl2e, struct spglist *free, 514 struct rwlock **lockp); 515 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 516 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); 517 static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 518 vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free, 519 struct rwlock **lockp); 520 static void pmap_reset_asid_set(pmap_t pmap); 521 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 522 vm_page_t m, struct rwlock **lockp); 523 524 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 525 struct rwlock **lockp); 526 527 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, 528 struct spglist *free); 529 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 530 static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, 531 vm_offset_t va, vm_size_t size); 532 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 533 534 static uma_zone_t pmap_bti_ranges_zone; 535 static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 536 pt_entry_t *pte); 537 static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va); 538 static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 539 static void *bti_dup_range(void *ctx, void *data); 540 static void bti_free_range(void *ctx, void *node); 
541 static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap); 542 static void pmap_bti_deassign_all(pmap_t pmap); 543 static void pagezero(void *); 544 545 static void pmap_set_protected(pt_entry_t old_l3); 546 static void pmap_set_unprotected(pt_entry_t new_l3); 547 548 /* 549 * These load the old table data and store the new value. 550 * They need to be atomic as the System MMU may write to the table at 551 * the same time as the CPU. 552 */ 553 #define pmap_clear(table) atomic_store_64(table, 0) 554 #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) 555 #define pmap_load(table) (*table) 556 #define pmap_load_clear(table) atomic_swap_64(table, 0) 557 #define pmap_load_store(table, entry) atomic_swap_64(table, entry) 558 #define pmap_set_bits(table, bits) atomic_set_64(table, bits) 559 #define pmap_store(table, entry) atomic_store_64(table, entry) 560 561 /********************/ 562 /* Inline functions */ 563 /********************/ 564 565 static __inline void 566 pagecopy(void *s, void *d) 567 { 568 569 memcpy(d, s, PAGE_SIZE); 570 } 571 572 static __inline pd_entry_t * 573 pmap_l0(pmap_t pmap, vm_offset_t va) 574 { 575 576 return (&pmap->pm_l0[pmap_l0_index(va)]); 577 } 578 579 static __inline pd_entry_t * 580 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) 581 { 582 pd_entry_t *l1; 583 584 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0))); 585 return (&l1[pmap_l1_index(va)]); 586 } 587 588 static __inline pd_entry_t * 589 pmap_l1(pmap_t pmap, vm_offset_t va) 590 { 591 pd_entry_t *l0; 592 593 l0 = pmap_l0(pmap, va); 594 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) 595 return (NULL); 596 597 return (pmap_l0_to_l1(l0, va)); 598 } 599 600 static __inline pd_entry_t * 601 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va) 602 { 603 pd_entry_t l1, *l2p; 604 605 l1 = pmap_load(l1p); 606 607 KASSERT(ADDR_IS_CANONICAL(va), 608 ("%s: Address not in canonical form: %lx", __func__, va)); 609 /* 610 * The valid bit may be clear if pmap_update_entry() is 
concurrently 611 * modifying the entry, so for KVA only the entry type may be checked. 612 */ 613 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0, 614 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va)); 615 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, 616 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va)); 617 l2p = PHYS_TO_DMAP(PTE_TO_PHYS(l1)); 618 return (&l2p[pmap_l2_index(va)]); 619 } 620 621 static __inline pd_entry_t * 622 pmap_l2(pmap_t pmap, vm_offset_t va) 623 { 624 pd_entry_t *l1; 625 626 l1 = pmap_l1(pmap, va); 627 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) 628 return (NULL); 629 630 return (pmap_l1_to_l2(l1, va)); 631 } 632 633 static __inline pt_entry_t * 634 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va) 635 { 636 pd_entry_t l2; 637 pt_entry_t *l3p; 638 639 l2 = pmap_load(l2p); 640 641 KASSERT(ADDR_IS_CANONICAL(va), 642 ("%s: Address not in canonical form: %lx", __func__, va)); 643 /* 644 * The valid bit may be clear if pmap_update_entry() is concurrently 645 * modifying the entry, so for KVA only the entry type may be checked. 646 */ 647 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0, 648 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va)); 649 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, 650 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va)); 651 l3p = PHYS_TO_DMAP(PTE_TO_PHYS(l2)); 652 return (&l3p[pmap_l3_index(va)]); 653 } 654 655 /* 656 * Returns the lowest valid pde for a given virtual address. 657 * The next level may or may not point to a valid page or block. 
658 */ 659 static __inline pd_entry_t * 660 pmap_pde(pmap_t pmap, vm_offset_t va, int *level) 661 { 662 pd_entry_t *l0, *l1, *l2, desc; 663 664 l0 = pmap_l0(pmap, va); 665 desc = pmap_load(l0) & ATTR_DESCR_MASK; 666 if (desc != L0_TABLE) { 667 *level = -1; 668 return (NULL); 669 } 670 671 l1 = pmap_l0_to_l1(l0, va); 672 desc = pmap_load(l1) & ATTR_DESCR_MASK; 673 if (desc != L1_TABLE) { 674 *level = 0; 675 return (l0); 676 } 677 678 l2 = pmap_l1_to_l2(l1, va); 679 desc = pmap_load(l2) & ATTR_DESCR_MASK; 680 if (desc != L2_TABLE) { 681 *level = 1; 682 return (l1); 683 } 684 685 *level = 2; 686 return (l2); 687 } 688 689 /* 690 * Returns the lowest valid pte block or table entry for a given virtual 691 * address. If there are no valid entries return NULL and set the level to 692 * the first invalid level. 693 */ 694 static __inline pt_entry_t * 695 pmap_pte(pmap_t pmap, vm_offset_t va, int *level) 696 { 697 pd_entry_t *l1, *l2, desc; 698 pt_entry_t *l3; 699 700 l1 = pmap_l1(pmap, va); 701 if (l1 == NULL) { 702 *level = 0; 703 return (NULL); 704 } 705 desc = pmap_load(l1) & ATTR_DESCR_MASK; 706 if (desc == L1_BLOCK) { 707 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 708 *level = 1; 709 return (l1); 710 } 711 712 if (desc != L1_TABLE) { 713 *level = 1; 714 return (NULL); 715 } 716 717 l2 = pmap_l1_to_l2(l1, va); 718 desc = pmap_load(l2) & ATTR_DESCR_MASK; 719 if (desc == L2_BLOCK) { 720 *level = 2; 721 return (l2); 722 } 723 724 if (desc != L2_TABLE) { 725 *level = 2; 726 return (NULL); 727 } 728 729 *level = 3; 730 l3 = pmap_l2_to_l3(l2, va); 731 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) 732 return (NULL); 733 734 return (l3); 735 } 736 737 /* 738 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified 739 * level that maps the specified virtual address, then a pointer to that entry 740 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled 741 * and a diagnostic message is provided, in which case this function panics. 
742 */ 743 static __always_inline pt_entry_t * 744 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag) 745 { 746 pd_entry_t *l0p, *l1p, *l2p; 747 pt_entry_t desc, *l3p; 748 int walk_level __diagused; 749 750 KASSERT(level >= 0 && level < 4, 751 ("%s: %s passed an out-of-range level (%d)", __func__, diag, 752 level)); 753 l0p = pmap_l0(pmap, va); 754 desc = pmap_load(l0p) & ATTR_DESCR_MASK; 755 if (desc == L0_TABLE && level > 0) { 756 l1p = pmap_l0_to_l1(l0p, va); 757 desc = pmap_load(l1p) & ATTR_DESCR_MASK; 758 if (desc == L1_BLOCK && level == 1) { 759 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 760 return (l1p); 761 } 762 if (desc == L1_TABLE && level > 1) { 763 l2p = pmap_l1_to_l2(l1p, va); 764 desc = pmap_load(l2p) & ATTR_DESCR_MASK; 765 if (desc == L2_BLOCK && level == 2) 766 return (l2p); 767 else if (desc == L2_TABLE && level > 2) { 768 l3p = pmap_l2_to_l3(l2p, va); 769 desc = pmap_load(l3p) & ATTR_DESCR_MASK; 770 if (desc == L3_PAGE && level == 3) 771 return (l3p); 772 else 773 walk_level = 3; 774 } else 775 walk_level = 2; 776 } else 777 walk_level = 1; 778 } else 779 walk_level = 0; 780 KASSERT(diag == NULL, 781 ("%s: va %#lx not mapped at level %d, desc %ld at level %d", 782 diag, va, level, desc, walk_level)); 783 return (NULL); 784 } 785 786 bool 787 pmap_ps_enabled(pmap_t pmap) 788 { 789 /* 790 * Promotion requires a hypervisor call when the kernel is running 791 * in EL1. To stop this disable superpage support on non-stage 1 792 * pmaps for now. 793 */ 794 if (pmap->pm_stage != PM_STAGE1) 795 return (false); 796 797 #ifdef KMSAN 798 /* 799 * The break-before-make in pmap_update_entry() results in a situation 800 * where a CPU may call into the KMSAN runtime while the entry is 801 * invalid. If the entry is used to map the current thread structure, 802 * then the runtime will attempt to access unmapped memory. Avoid this 803 * by simply disabling superpage promotion for the kernel map. 
804 */ 805 if (pmap == kernel_pmap) 806 return (false); 807 #endif 808 809 return (superpages_enabled != 0); 810 } 811 812 bool 813 pmap_vs_enabled(void) 814 { 815 /* 816 * 8 and 16 are the only values hardware can support, but allow for the 817 * possibility of artificially restricting the bits, e.g. for testing. 818 */ 819 KASSERT(vmids.asid_bits <= 16, ("VMID bits %d > 16", vmids.asid_bits)); 820 return (vmids.asid_bits > 8); 821 } 822 823 bool 824 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, 825 pd_entry_t **l2, pt_entry_t **l3) 826 { 827 pd_entry_t *l0p, *l1p, *l2p; 828 829 if (pmap->pm_l0 == NULL) 830 return (false); 831 832 l0p = pmap_l0(pmap, va); 833 *l0 = l0p; 834 835 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) 836 return (false); 837 838 l1p = pmap_l0_to_l1(l0p, va); 839 *l1 = l1p; 840 841 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { 842 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 843 *l2 = NULL; 844 *l3 = NULL; 845 return (true); 846 } 847 848 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) 849 return (false); 850 851 l2p = pmap_l1_to_l2(l1p, va); 852 *l2 = l2p; 853 854 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { 855 *l3 = NULL; 856 return (true); 857 } 858 859 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) 860 return (false); 861 862 *l3 = pmap_l2_to_l3(l2p, va); 863 864 return (true); 865 } 866 867 static __inline int 868 pmap_l3_valid(pt_entry_t l3) 869 { 870 871 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); 872 } 873 874 CTASSERT(L1_BLOCK == L2_BLOCK); 875 876 static pt_entry_t 877 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr) 878 { 879 pt_entry_t val; 880 881 if (pmap->pm_stage == PM_STAGE1) { 882 val = ATTR_S1_IDX(memattr); 883 if (memattr == VM_MEMATTR_DEVICE) 884 val |= ATTR_S1_XN; 885 return (val); 886 } 887 888 val = 0; 889 890 switch (memattr) { 891 case VM_MEMATTR_DEVICE: 892 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) | 893 ATTR_S2_XN(ATTR_S2_XN_ALL)); 894 case 
VM_MEMATTR_UNCACHEABLE: 895 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC)); 896 case VM_MEMATTR_WRITE_BACK: 897 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB)); 898 case VM_MEMATTR_WRITE_THROUGH: 899 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT)); 900 default: 901 panic("%s: invalid memory attribute %x", __func__, memattr); 902 } 903 } 904 905 static pt_entry_t 906 pmap_pte_prot(pmap_t pmap, vm_prot_t prot) 907 { 908 pt_entry_t val; 909 910 val = 0; 911 if (pmap->pm_stage == PM_STAGE1) { 912 if ((prot & VM_PROT_EXECUTE) == 0) 913 val |= ATTR_S1_XN; 914 if ((prot & VM_PROT_WRITE) == 0) 915 val |= ATTR_S1_AP(ATTR_S1_AP_RO); 916 } else { 917 if ((prot & VM_PROT_WRITE) != 0) 918 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 919 if ((prot & VM_PROT_READ) != 0) 920 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ); 921 if ((prot & VM_PROT_EXECUTE) == 0) 922 val |= ATTR_S2_XN(ATTR_S2_XN_ALL); 923 } 924 925 return (val); 926 } 927 928 /* 929 * Checks if the PTE is dirty. 930 */ 931 static inline int 932 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte) 933 { 934 935 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); 936 937 if (pmap->pm_stage == PM_STAGE1) { 938 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0, 939 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); 940 941 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 942 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)); 943 } 944 945 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 946 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)); 947 } 948 949 static __inline void 950 pmap_resident_count_inc(pmap_t pmap, int count) 951 { 952 953 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 954 pmap->pm_stats.resident_count += count; 955 } 956 957 static __inline void 958 pmap_resident_count_dec(pmap_t pmap, int count) 959 { 960 961 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 962 KASSERT(pmap->pm_stats.resident_count >= count, 963 ("pmap %p resident count underflow %ld %d", pmap, 964 pmap->pm_stats.resident_count, count)); 965 pmap->pm_stats.resident_count -= 
count;
}

/*
 * Translate the virtual address "va" to a physical address before the
 * regular pmap lookup functions can be used.  The page frame comes from
 * arm64_address_translate_s1e1r() masked with PAR_PA_MASK, and the low
 * bits are taken from "va" itself.
 */
static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)
{
	vm_paddr_t pa_page;

	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
	return (pa_page | (va & PAR_LOW_MASK));
}

/* State of the bootstrapped DMAP page tables */
struct pmap_bootstrap_state {
	pt_entry_t	*l1;		/* L1 table in use, or NULL */
	pt_entry_t	*l2;		/* L2 table in use, or NULL */
	pt_entry_t	*l3;		/* L3 table in use, or NULL */
	vm_offset_t	freemempos;	/* Next address for new allocations */
	vm_offset_t	va;		/* Virtual address being set up */
	vm_paddr_t	pa;		/* Physical address being mapped */
	pt_entry_t	table_attrs;	/* Attrs for new L1/L2 table entries */
	u_int		l0_slot;	/* L0 index of l1; L0_ENTRIES: none */
	u_int		l1_slot;	/* L1 index of l2; Ln_ENTRIES: none */
	u_int		l2_slot;	/* L2 index of l3; Ln_ENTRIES: none */
	bool		dmap_valid;	/* Set by pmap_bootstrap_dmap() */
};

/* The bootstrap state */
static struct pmap_bootstrap_state bs_state = {
	.l1 = NULL,
	.l2 = NULL,
	.l3 = NULL,
	.table_attrs = TATTR_PXN_TABLE,
	.l0_slot = L0_ENTRIES,
	.l1_slot = Ln_ENTRIES,
	.l2_slot = Ln_ENTRIES,
	.dmap_valid = false,
};

/*
 * Make state->l1 point at the L1 table that covers state->va.
 *
 * Nothing is done while state->va stays within the L0 slot used by the
 * previous call.  Otherwise the cached lower-level tables are dropped
 * and, once the DMAP is valid, an existing L0 entry is reused.  If there
 * is no usable entry a zeroed page is taken from state->freemempos and
 * linked into pagetable_l0_ttbr1.
 */
static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l1_pa;
	pd_entry_t l0e;
	u_int l0_slot;

	/* Link the level 0 table to a level 1 table */
	l0_slot = pmap_l0_index(state->va);
	if (l0_slot != state->l0_slot) {
		/*
		 * Make sure we move from a low address to high address
		 * before the DMAP region is ready. This ensures we never
		 * modify an existing mapping until we can map from a
		 * physical address to a virtual address.
*/
		MPASS(state->l0_slot < l0_slot ||
		    state->l0_slot == L0_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l2 = NULL;
		state->l3 = NULL;
		state->l1_slot = Ln_ENTRIES;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L0 entry */
		state->l0_slot = l0_slot;
		if (state->dmap_valid) {
			l0e = pagetable_l0_ttbr1[l0_slot];
			if ((l0e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the table through its DMAP address. */
				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
				l1_pa = PTE_TO_PHYS(l0e);
				state->l1 = PHYS_TO_DMAP(l1_pa);
				return;
			}
		}

		/* Create a new L0 table entry */
		state->l1 = (pt_entry_t *)state->freemempos;
		memset_early(state->l1, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
	}
	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
}

/*
 * Make state->l2 point at the L2 table that covers state->va.
 *
 * The L1 table is set up first through pmap_bootstrap_l0_table().  When
 * state->va moved to a different L1 slot the cached L3 table is dropped
 * and, once the DMAP is valid, an existing L1 entry is reused.  If there
 * is no usable entry a zeroed page is taken from state->freemempos and
 * linked into the L1 table using state->table_attrs.
 */
static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l2_pa;
	pd_entry_t l1e;
	u_int l1_slot;

	/* Make sure there is a valid L0 -> L1 table */
	pmap_bootstrap_l0_table(state);

	/* Link the level 1 table to a level 2 table */
	l1_slot = pmap_l1_index(state->va);
	if (l1_slot != state->l1_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l1_slot < l1_slot ||
		    state->l1_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l3 = NULL;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L1 entry */
		state->l1_slot = l1_slot;
		if (state->dmap_valid) {
			l1e = state->l1[l1_slot];
			if ((l1e & ATTR_DESCR_VALID) != 0) {
				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
				l2_pa =
PTE_TO_PHYS(l1e);
				state->l2 = PHYS_TO_DMAP(l2_pa);
				return;
			}
		}

		/* Create a new L1 table entry */
		state->l2 = (pt_entry_t *)state->freemempos;
		memset_early(state->l2, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l1[l1_slot] == 0);
		pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
		    state->table_attrs | L1_TABLE);
	}
	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
}

/*
 * Make state->l3 point at the L3 table that covers state->va.
 *
 * The L2 table is set up first through pmap_bootstrap_l1_table().  When
 * state->va moved to a different L2 slot an existing L2 entry is reused
 * once the DMAP is valid; otherwise a zeroed page is taken from
 * state->freemempos and linked into the L2 table using
 * state->table_attrs.
 */
static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l3_pa;
	pd_entry_t l2e;
	u_int l2_slot;

	/* Make sure there is a valid L1 -> L2 table */
	pmap_bootstrap_l1_table(state);

	/* Link the level 2 table to a level 3 table */
	l2_slot = pmap_l2_index(state->va);
	if (l2_slot != state->l2_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l2_slot < l2_slot ||
		    state->l2_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Check the existing L2 entry */
		state->l2_slot = l2_slot;
		if (state->dmap_valid) {
			l2e = state->l2[l2_slot];
			if ((l2e & ATTR_DESCR_VALID) != 0) {
				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
				l3_pa = PTE_TO_PHYS(l2e);
				state->l3 = PHYS_TO_DMAP(l3_pa);
				return;
			}
		}

		/* Create a new L2 table entry */
		state->l3 = (pt_entry_t *)state->freemempos;
		memset_early(state->l3, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
		    state->table_attrs | L2_TABLE);
	}
	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
}

/*
 * Map physical memory from state->pa up to physmap[i + 1] into the DMAP
 * using L2 blocks, advancing state->va and state->pa as it goes.
 *
 * Mapping stops when less than L2_SIZE of the region remains, when
 * state->va reaches DMAP_MAX_ADDRESS, or when state->pa reaches the next
 * L1_SIZE boundary.  Nothing is done when less than L2_SIZE remains on
 * entry.
 */
static void
pmap_bootstrap_l2_block(struct
pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l2_slot;
	bool first;

	/* Not enough of physmap region "i" is left for a single L2 block. */
	if ((physmap[i + 1] - state->pa) < L2_SIZE)
		return;

	/* Make sure there is a valid L1 -> L2 table to store the blocks in */
	pmap_bootstrap_l1_table(state);

	MPASS((state->va & L2_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L2_SIZE;
	    state->va += L2_SIZE, state->pa += L2_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L1 slot can address.
		 */
		if (!first && (state->pa & L1_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L2C_ENTRIES
		 * L2 blocks, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 *
		 * The decision is only re-evaluated at the start of each
		 * L2C-aligned chunk and then applies to every block in it.
		 */
		if ((state->pa & L2C_OFFSET) == 0) {
			if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L2C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l2_slot = pmap_l2_index(state->va);
		MPASS((state->pa & L2_OFFSET) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

/*
 * Map physical memory from state->pa up to physmap[i + 1] into the DMAP
 * using L3 pages, advancing state->va and state->pa as it goes.
 *
 * Mapping stops when less than L3_SIZE of the region remains, when
 * state->va reaches DMAP_MAX_ADDRESS, or when state->pa reaches the next
 * L2_SIZE boundary.  Nothing is done when less than L3_SIZE remains on
 * entry.
 */
static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l3_slot;
	bool first;

	/* Not enough of physmap region "i" is left for a single L3 page. */
	if (physmap[i + 1] - state->pa < L3_SIZE)
		return;

	/* Make sure there is a valid L2 -> L3 table to store the pages in */
	pmap_bootstrap_l2_table(state);

	MPASS((state->va & L3_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    physmap[i + 1] - state->pa >= L3_SIZE;
	    state->va += L3_SIZE,
state->pa += L3_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L2 slot can address.
		 */
		if (!first && (state->pa & L2_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
		 * L3 pages, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 *
		 * The decision is only re-evaluated at the start of each
		 * L3C-aligned chunk and then applies to every page in it.
		 */
		if ((state->pa & L3C_OFFSET) == 0) {
			if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L3C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l3_slot = pmap_l3_index(state->va);
		MPASS((state->pa & L3_OFFSET) == 0);
		MPASS(state->l3[l3_slot] == 0);
		pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

/*
 * Create the direct map (DMAP) of physical memory.
 *
 * "kernlen" is added to KERNBASE, and the result rounded up to a page,
 * to find the first address from which page table pages are allocated.
 * Each range reported by physmem_avail() is mapped with L3 pages, L2
 * blocks and, where supported, L1 blocks.  The memory from KERNBASE up
 * to the last page table page allocated here is then excluded from
 * later allocation.
 */
void
pmap_bootstrap_dmap(vm_size_t kernlen)
{
	vm_paddr_t start_pa, pa;
	uint64_t tcr;
	int i;

	tcr = READ_SPECIALREG(tcr_el1);

	/*
	 * Verify that the ASID is set through TTBR0.
	 * NOTE(review): the assertion message below names pmap_bootstrap
	 * rather than this function.
	 */
	KASSERT((tcr & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0"));

	if ((tcr & TCR_DS) != 0)
		pmap_lpa_enabled = true;

	pmap_l1_supported = L1_BLOCKS_SUPPORTED;

	start_pa = pmap_early_vtophys(KERNBASE);

	bs_state.freemempos = KERNBASE + kernlen;
	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);

	/* Fill in physmap array.
*/
	physmap_idx = physmem_avail(physmap, nitems(physmap));

	/* The DMAP starts at the first physical range, aligned down to L1. */
	dmap_phys_base = physmap[0] & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;

	/* physmap holds (start, end) pairs, hence the stride of two. */
	for (i = 0; i < physmap_idx; i += 2) {
		bs_state.pa = physmap[i] & ~L3_OFFSET;
		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L3 mappings at the start of the region */
		if ((bs_state.pa & L2_OFFSET) != 0)
			pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa <= physmap[i + 1]);

		if (L1_BLOCKS_SUPPORTED) {
			/* Create L2 mappings at the start of the region */
			if ((bs_state.pa & L1_OFFSET) != 0)
				pmap_bootstrap_l2_block(&bs_state, i);
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create the main L1 block mappings */
			for (; bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
				/* Make sure there is a valid L0 -> L1 table */
				pmap_bootstrap_l0_table(&bs_state);
				MPASS((bs_state.pa & L1_OFFSET) == 0);
				pmap_store(
				    &bs_state.l1[pmap_l1_index(bs_state.va)],
				    PHYS_TO_PTE(bs_state.pa) | ATTR_AF |
				    pmap_sh_attr |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
			}
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create L2 mappings at the end of the region */
			pmap_bootstrap_l2_block(&bs_state, i);
		} else {
			/*
			 * Without L1 blocks the bulk of the region is mapped
			 * by L2 blocks; each call stops at an L1 boundary so
			 * keep calling until no whole L2 block is left.
			 */
			while (bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
				pmap_bootstrap_l2_block(&bs_state, i);
			}
		}
		MPASS(bs_state.pa <= physmap[i + 1]);

		/* Create L3 mappings at the end of the region */
		pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa == physmap[i + 1]);

		/* Track the highest physical address mapped so far. */
		if (bs_state.pa > dmap_phys_max) {
			dmap_phys_max = bs_state.pa;
			dmap_max_addr = bs_state.va;
		}
	}

	pmap_s1_invalidate_all_kernel();

	/* From here on existing page table pages are reached via the DMAP. */
	bs_state.dmap_valid = true;
/* Exclude the kernel and DMAP region */
	pa = pmap_early_vtophys(bs_state.freemempos);
	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
}

/*
 * Make sure an L2 table exists for every L1_SIZE step from "va", which
 * must be L1_SIZE aligned, up to VM_MAX_KERNEL_ADDRESS.
 */
static void
pmap_bootstrap_l2(vm_offset_t va)
{
	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));

	/*
	 * Leave bs_state.pa alone; it is only needed to bootstrap blocks
	 * and pages.
	 */
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
		pmap_bootstrap_l1_table(&bs_state);
}

/*
 * Make sure an L3 table exists for every L2_SIZE step from "va", which
 * must be L2_SIZE aligned, up to VM_MAX_KERNEL_ADDRESS.
 */
static void
pmap_bootstrap_l3(vm_offset_t va)
{
	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	/*
	 * Leave bs_state.pa alone; it is only needed to bootstrap blocks
	 * and pages.
	 */
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
		pmap_bootstrap_l2_table(&bs_state);
}

/*
 *	Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(void)
{
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t start_pa, pa;
	size_t largest_phys_size;

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
	mtx_init(&kernel_pmap->pm_mtx, "kernel pmap", NULL, MTX_DEF);
	kernel_pmap->pm_l0_paddr =
	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	vm_radix_init(&kernel_pmap->pm_root);
	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
	kernel_pmap->pm_stage = PM_STAGE1;
	kernel_pmap->pm_levels = 4;
	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
	kernel_pmap->pm_asid_set = &asids;

	/* Reserve some VA space for early BIOS/ACPI mapping */
	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);

	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
	virtual_avail = roundup2(virtual_avail, L1_SIZE);
virtual_end = VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE - L2_SIZE; 1390 kernel_vm_end = virtual_avail; 1391 1392 /* 1393 * We only use PXN when we know nothing will be executed from it, e.g. 1394 * the DMAP region. 1395 */ 1396 bs_state.table_attrs &= ~TATTR_PXN_TABLE; 1397 1398 /* 1399 * Find the physical memory we could use. This needs to be after we 1400 * exclude any memory that is mapped into the DMAP region but should 1401 * not be used by the kernel, e.g. some UEFI memory types. 1402 */ 1403 physmap_idx = physmem_avail(physmap, nitems(physmap)); 1404 1405 /* 1406 * Find space for early allocations. We search for the largest 1407 * region. This is because the user may choose a large msgbuf. 1408 * This could be smarter, e.g. to allow multiple regions to be 1409 * used & switch to the next when one is full. 1410 */ 1411 largest_phys_size = 0; 1412 for (int i = 0; i < physmap_idx; i += 2) { 1413 if ((physmap[i + 1] - physmap[i]) > largest_phys_size) { 1414 largest_phys_size = physmap[i + 1] - physmap[i]; 1415 bs_state.freemempos = PHYS_TO_DMAP_ADDR(physmap[i]); 1416 } 1417 } 1418 1419 start_pa = pmap_early_vtophys(bs_state.freemempos); 1420 1421 /* 1422 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the 1423 * loader allocated the first and only l2 page table page used to map 1424 * the kernel, preloaded files and module metadata. 1425 */ 1426 pmap_bootstrap_l2(KERNBASE + L1_SIZE); 1427 /* And the l3 tables for the early devmap */ 1428 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE)); 1429 1430 pmap_s1_invalidate_all_kernel(); 1431 1432 #define alloc_pages(var, np) \ 1433 (var) = bs_state.freemempos; \ 1434 bs_state.freemempos += (np * PAGE_SIZE); \ 1435 memset_early((char *)(var), 0, ((np) * PAGE_SIZE)); 1436 1437 /* Allocate dynamic per-cpu area. */ 1438 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 1439 dpcpu_init((void *)dpcpu, 0); 1440 1441 /* Allocate memory for the msgbuf, e.g. 
for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	/*
	 * Allocate space for the CPU0 CMAP
	 *
	 * The L3 slot for virtual_end is filled with an entry that refers to
	 * the physical page of the L3 table itself; cmap1_pte below points at
	 * the slot for the page that follows it.
	 */
	bs_state.va = virtual_end;
	pmap_bootstrap_l2_table(&bs_state);
	pmap_store(&bs_state.l3[pmap_l3_index(bs_state.va)],
	    PHYS_TO_PTE(pmap_early_vtophys((vm_offset_t)bs_state.l3)) |
	    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
	    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L3_PAGE);
	dsb(ishst);

	mtx_init(&cmap_lock, "SYSMAPS", NULL, MTX_DEF);
	cmap1_addr = (void *)(virtual_end + L3_SIZE);
	cmap1_pte = &bs_state.l3[pmap_l3_index((vm_offset_t)cmap1_addr)];

	pa = pmap_early_vtophys(bs_state.freemempos);

	/* Keep everything allocated above out of the page allocator. */
	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
}

#if defined(KASAN) || defined(KMSAN)
/*
 * Back the shadow addresses from *vap up to "eva" with L2 blocks taken
 * from the physical range [start_pa, end_pa), working from the top of
 * that range downwards.
 *
 * Addresses that already have a valid L2 entry are skipped without
 * consuming physical memory.  Each block used is zeroed and excluded
 * from later allocation.  On return *vap holds the first address that
 * was not processed.
 */
static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
    vm_offset_t *vap, vm_offset_t eva)
{
	vm_paddr_t pa;
	vm_offset_t va;
	pd_entry_t *l2;

	va = *vap;
	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
	for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
		l2 = pmap_l2(kernel_pmap, va);

		/*
		 * KASAN stack checking results in us having already allocated
		 * part of our shadow map, so we can just skip those segments.
		 */
		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
			/* Cancel the loop's decrement; this block is unused. */
			pa += L2_SIZE;
			continue;
		}

		bzero_early(PHYS_TO_DMAP(pa), L2_SIZE);
		physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
	}
	*vap = va;
}

/*
 * Finish constructing the initial shadow map:
 * - Count how many pages from KERNBASE to virtual_avail (scaled for
 *   shadow map)
 * - Map that entire range using L2 superpages.
*/
static void
pmap_bootstrap_san1(vm_offset_t va, int scale)
{
	vm_offset_t eva;
	vm_paddr_t kernstart;
	int i;

	kernstart = pmap_early_vtophys(KERNBASE);

	/*
	 * Rebuild physmap one more time, we may have excluded more regions from
	 * allocation since pmap_bootstrap().
	 */
	physmap_idx = physmem_avail(physmap, nitems(physmap));

	/* The shadow of the initial kernel map ends here. */
	eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;

	/*
	 * Find a slot in the physmap large enough for what we needed.  We try
	 * to put the shadow map as high up as we can to avoid depleting the
	 * lower 4GB in case it's needed for, e.g., an xhci controller that can
	 * only do 32-bit DMA.
	 */
	for (i = physmap_idx - 2; i >= 0; i -= 2) {
		vm_paddr_t plow, phigh;

		/* L2 mappings must be backed by memory that is L2-aligned */
		plow = roundup2(physmap[i], L2_SIZE);
		phigh = physmap[i + 1];
		if (plow >= phigh)
			continue;
		/* Only use the part of the range below the kernel's start. */
		if (kernstart >= plow && kernstart < phigh)
			phigh = kernstart;
		if (phigh - plow >= L2_SIZE) {
			pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
			if (va >= eva)
				break;
		}
	}
	/* Every range was tried without covering the whole shadow map. */
	if (i < 0)
		panic("Could not find phys region for shadow map");

	/*
	 * Done. We should now have a valid shadow address mapped for all KVA
	 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
	 * shadow accesses by the sanitizer runtime will succeed for this range.
	 * When the kernel virtual address range is later expanded, as will
	 * happen in vm_mem_init(), the shadow map will be grown as well. This
	 * is handled by pmap_san_enter().
*/
}

/*
 * Create the initial sanitizer shadow mappings.
 *
 * With KASAN a single shadow map is built at KASAN_MIN_ADDRESS, scaled
 * by KASAN_SHADOW_SCALE.  Otherwise (KMSAN) two unscaled maps are built
 * at KMSAN_SHAD_MIN_ADDRESS and KMSAN_ORIG_MIN_ADDRESS.  For each of
 * them a static two-page buffer supplies the page table pages: the
 * first page is linked from the L0 entry and the second from the L1
 * entry.
 */
void
pmap_bootstrap_san(void)
{
#ifdef KASAN
	pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
#else
	static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	pd_entry_t *l0, *l1;

	/* Only one L1 entry is set up per map below. */
	if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
		panic("initial kernel map is too large");

	l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);

	l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
#endif
}
#endif

/*
 *	Initialize a vm_page's machine-dependent fields.
 *
 *	The page starts with an empty pv list, the write-back memory
 *	attribute and no flags.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
	m->md.pv_flags = 0;
}

/*
 * Initialize the ID allocator "set" to manage 1 << "bits" IDs.  The IDs
 * below ASID_FIRST_AVAILABLE are marked as in use so that they are never
 * handed out.
 */
static void
pmap_init_asids(struct asid_set *set, int bits)
{
	int i;

	set->asid_bits = bits;

	/*
	 * We may be too early in the overall initialization process to use
	 * bit_alloc().
*/
	set->asid_set_size = 1 << set->asid_bits;
	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
	    M_WAITOK | M_ZERO);
	/* Reserve the IDs below the first one that may be allocated. */
	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
		bit_set(set->asid_set, i);
	set->asid_next = ASID_FIRST_AVAILABLE;
	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
}

/*
 * Allocate and initialize pv_table, the array of struct
 * pmap_large_md_page with one entry per L2_SIZE superpage of each
 * physical memory segment.
 *
 * Virtual address space for the whole array is reserved first.  It is
 * then backed segment by segment with pages allocated from that
 * segment's memory domain, and finally each segment's md_first is
 * pointed at its first entry.
 */
static void
pmap_init_pv_table(void)
{
	struct vm_phys_seg *seg, *next_seg;
	struct pmap_large_md_page *pvd;
	vm_size_t s;
	int domain, i, j, pages;

	/*
	 * We depend on the size being evenly divisible into a page so
	 * that the pv_table array can be indexed directly while
	 * safely spanning multiple pages from different domains.
	 */
	CTASSERT(PAGE_SIZE % sizeof(*pvd) == 0);

	/*
	 * Calculate the size of the array.  Each segment's share is rounded
	 * up to a whole number of pages.
	 */
	s = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		s += round_page(pages * sizeof(*pvd));
	}
	pv_table = kva_alloc(s);
	if (pv_table == NULL)
		panic("%s: kva_alloc failed\n", __func__);

	/*
	 * Iterate physical segments to allocate domain-local memory for PV
	 * list headers.
*/
	pvd = pv_table;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		domain = seg->domain;

		s = round_page(pages * sizeof(*pvd));

		/* Back this segment's part of the array one page at a time. */
		for (j = 0; j < s; j += PAGE_SIZE) {
			vm_page_t m = vm_page_alloc_noobj_domain(domain,
			    VM_ALLOC_ZERO);
			if (m == NULL)
				panic("failed to allocate PV table page");
			pmap_qenter((char *)pvd + j, &m, 1);
		}

		/*
		 * Initialize every entry that fits in the pages just mapped,
		 * including those in the round-up at the end.
		 */
		for (j = 0; j < s / sizeof(*pvd); j++) {
			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
			TAILQ_INIT(&pvd->pv_page.pv_list);
			pvd++;
		}
	}
	/* Set up the stand-alone dummy entry as well. */
	pvd = &pv_dummy_large;
	memset(pvd, 0, sizeof(*pvd));
	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
	TAILQ_INIT(&pvd->pv_page.pv_list);

	/*
	 * Set pointers from vm_phys_segs to pv_table.
	 */
	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		seg->md_first = pvd;
		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);

		/*
		 * If there is a following segment, and the final
		 * superpage of this segment and the initial superpage
		 * of the next segment are the same then adjust the
		 * pv_table entry for that next segment down by one so
		 * that the pv_table entries will be shared.
*/
		if (i + 1 < vm_phys_nsegs) {
			next_seg = &vm_phys_segs[i + 1];
			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
			    pmap_l2_pindex(next_seg->start)) {
				pvd--;
			}
		}
	}
}

/*
 * CPU feature check for hardware dirty state management: enabled by
 * default when the HAFDBS field of ID_AA64MMFR1_EL1 is at least
 * ID_AA64MMFR1_HAFDBS_AF_DBS, otherwise always disabled.
 */
static cpu_feat_en
pmap_dbm_check(const struct cpu_feat *feat __unused, u_int midr __unused)
{
	uint64_t id_aa64mmfr1;

	id_aa64mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
	if (ID_AA64MMFR1_HAFDBS_VAL(id_aa64mmfr1) >=
	    ID_AA64MMFR1_HAFDBS_AF_DBS)
		return (FEAT_DEFAULT_ENABLE);

	return (FEAT_ALWAYS_DISABLE);
}

/*
 * Report whether the CPU identified by "midr" has an erratum that
 * affects hardware dirty state management.  When it does, true is
 * returned and *errata_list / *errata_count describe the single
 * matching erratum; otherwise false is returned and the outputs are
 * left untouched.
 */
static bool
pmap_dbm_has_errata(const struct cpu_feat *feat __unused, u_int midr,
    u_int **errata_list, u_int *errata_count)
{
	/* Disable on Cortex-A55 for erratum 1024718 - all revisions */
	if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
	    CPU_PART(midr) == CPU_PART_CORTEX_A55) {
		static u_int errata_id = 1024718;

		*errata_list = &errata_id;
		*errata_count = 1;
		return (true);
	}

	/* Disable on Cortex-A510 for erratum 2051678 - r0p0 to r0p2 */
	if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
	    0, 0, 0, 2)) {
		static u_int errata_id = 2051678;

		*errata_list = &errata_id;
		*errata_count = 1;
		return (true);
	}

	return (false);
}

/*
 * Enable hardware dirty state management on the current CPU by setting
 * TCR_HD in TCR_EL1 and flushing the local TLB.  Returns false without
 * changing anything when "errata_status" is not ERRATA_NONE.
 */
static bool
pmap_dbm_enable(const struct cpu_feat *feat __unused,
    cpu_feat_errata errata_status, u_int *errata_list __unused,
    u_int errata_count)
{
	uint64_t tcr;

	/* Skip if there is an erratum affecting DBM */
	if (errata_status != ERRATA_NONE)
		return (false);

	tcr = READ_SPECIALREG(tcr_el1) | TCR_HD;
	WRITE_SPECIALREG(tcr_el1, tcr);
	isb();
	/* Flush the local TLB for the TCR_HD flag change */
	dsb(nshst);
	__asm __volatile("tlbi vmalle1");
	dsb(nsh);
	isb();

	return (true);
}

/* Register the check, errata and enable callbacks defined above. */
CPU_FEAT(feat_hafdbs, "Hardware management of
the Access flag and dirty state",
    pmap_dbm_check, pmap_dbm_has_errata, pmap_dbm_enable, NULL,
    CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU);

/*
 * CPU feature check for the "multiple TLBI" errata workaround.
 *
 * The workaround is enabled by default on the Arm cores listed in the
 * first comment below, disabled by default on Cortex-A55 and on
 * Cortex-A510 r0p0 - r1p1, and always disabled on everything else.
 */
static cpu_feat_en
pmap_multiple_tlbi_check(const struct cpu_feat *feat __unused, u_int midr)
{
	/*
	 * ARM C1-Premium erratum 4193780
	 * ARM C1-Ultra erratum 4193780
	 * ARM Cortex-A76 erratum 4193800
	 * ARM Cortex-A76AE erratum 4193801
	 * ARM Cortex-A77 erratum 4193798
	 * ARM Cortex-A78 erratum 4193791
	 * ARM Cortex-A78AE erratum 4193793
	 * ARM Cortex-A78C erratum 4193794
	 * ARM Cortex-A710 erratum 4193788
	 * ARM Cortex-X1 erratum 4193791
	 * ARM Cortex-X1C erratum 4193792
	 * ARM Cortex-X2 erratum 4193788
	 * ARM Cortex-X3 erratum 4193786
	 * ARM Cortex-X4 erratum 4118414
	 * ARM Cortex-X925 erratum 4193781
	 * ARM Neoverse-N1 erratum 4193800
	 * ARM Neoverse-N2 erratum 4193789
	 * ARM Neoverse-V1 erratum 4193790
	 * ARM Neoverse-V2 erratum 4193787
	 * ARM Neoverse-V3 erratum 4193784
	 * ARM Neoverse-V3AE erratum 4193784
	 * Present in all revisions
	 */
	if (CPU_IMPL(midr) == CPU_IMPL_ARM) {
		/* Parts that are not listed fall through to the checks below. */
		switch(CPU_PART(midr)) {
		case CPU_PART_C1_PREMIUM:
		case CPU_PART_C1_ULTRA:
		case CPU_PART_CORTEX_A76:
		case CPU_PART_CORTEX_A76AE:
		case CPU_PART_CORTEX_A77:
		case CPU_PART_CORTEX_A78:
		case CPU_PART_CORTEX_A78AE:
		case CPU_PART_CORTEX_A78C:
		case CPU_PART_CORTEX_A710:
		case CPU_PART_CORTEX_X1:
		case CPU_PART_CORTEX_X1C:
		case CPU_PART_CORTEX_X2:
		case CPU_PART_CORTEX_X3:
		case CPU_PART_CORTEX_X4:
		case CPU_PART_CORTEX_X925:
		case CPU_PART_NEOVERSE_N1:
		case CPU_PART_NEOVERSE_N2:
		case CPU_PART_NEOVERSE_V1:
		case CPU_PART_NEOVERSE_V2:
		case CPU_PART_NEOVERSE_V3:
		case CPU_PART_NEOVERSE_V3AE:
			return (FEAT_DEFAULT_ENABLE);
		}
	}

	/*
	 * Cortex-A55 erratum 2441007 (Cat B rare)
	 * Present in all revisions
	 */
if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
	    CPU_PART(midr) == CPU_PART_CORTEX_A55)
		return (FEAT_DEFAULT_DISABLE);

	/*
	 * Cortex-A510 erratum 2441009 (Cat B rare)
	 * Present in r0p0 - r1p1
	 * Fixed in r1p2
	 */
	if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
	    0, 0, 1, 1))
		return (FEAT_DEFAULT_DISABLE);

	return (FEAT_ALWAYS_DISABLE);
}

/*
 * Turn on the "multiple TLBI" workaround by setting pmap_multiple_tlbi,
 * which the TLB invalidation functions in this file test.  Always
 * returns true.
 */
static bool
pmap_multiple_tlbi_enable(const struct cpu_feat *feat __unused,
    cpu_feat_errata errata_status, u_int *errata_list __unused,
    u_int errata_count __unused)
{
	pmap_multiple_tlbi = true;
	return (true);
}

/* Register the check and enable callbacks defined above. */
CPU_FEAT(errata_multi_tlbi, "Multiple TLBI errata",
    pmap_multiple_tlbi_check, NULL, pmap_multiple_tlbi_enable, NULL,
    CPU_FEAT_EARLY_BOOT | CPU_FEAT_PER_CPU);

/*
 *	Initialize the pmap module.
 *
 *	Called by vm_mem_init(), to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	uint64_t mmfr1;
	int i, vmid_bits;

	/*
	 * Are large page mappings enabled?
	 *
	 * If so, advertise the larger page sizes in increasing order: L3C,
	 * L2 and, where L1 blocks are supported, L1.
	 */
	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
	if (superpages_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = L3C_SIZE;
		KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
		    ("pmap_init: can't assign to pagesizes[2]"));
		pagesizes[2] = L2_SIZE;
		if (L1_BLOCKS_SUPPORTED) {
			KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
			    ("pmap_init: can't assign to pagesizes[3]"));
			pagesizes[3] = L1_SIZE;
		}
	}

	/*
	 * Initialize the ASID allocator.
	 */
	pmap_init_asids(&asids,
	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ?
16 : 8);

	/*
	 * With hypervisor support also initialize the VMID allocator, using
	 * 16 bits when ID_AA64MMFR1_EL1 reports them and 8 bits otherwise.
	 */
	if (has_hyp()) {
		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
		vmid_bits = 8;

		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
		    ID_AA64MMFR1_VMIDBits_16)
			vmid_bits = 16;
		pmap_init_asids(&vmids, vmid_bits);
	}

	/*
	 * Initialize pv chunk lists.
	 */
	for (i = 0; i < PMAP_MEMDOM; i++) {
		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
		    MTX_DEF);
		TAILQ_INIT(&pv_chunks[i].pvc_list);
	}
	pmap_init_pv_table();

	vm_initialized = 1;
}

/* vm.pmap.l1: L1 block demotion counter and L1 block support flag. */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L1 (1GB/64GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");

SYSCTL_BOOL(_vm_pmap_l1, OID_AUTO, supported, CTLFLAG_RD, &pmap_l1_supported,
    0, "L1 blocks are supported");

/* vm.pmap.l2c: demotion counter for L2C mappings. */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L2C (32MB/1GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");

/*
 * vm.pmap.l2: counters for L2 block mappings.
 * NOTE(review): unlike the counters below it, this node's description
 * only mentions the 2MB size.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, "L2 (2MB/32MB) page demotions");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, "L2 (2MB/32MB) page mappings");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, "L2 (2MB/32MB) page promotion
failures");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, "L2 (2MB/32MB) page promotions");

/*
 * vm.pmap.l3c: demotion, mapping, promotion failure and promotion
 * counters for L3C mappings.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L3C (64KB/2MB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");

/*
 * If the given value for "final_only" is false, then any cached intermediate-
 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
 * Otherwise, just the cached final-level entry is invalidated.
1974 */ 1975 static __inline void 1976 pmap_s1_invalidate_kernel(uint64_t r, bool final_only) 1977 { 1978 if (final_only) 1979 __asm __volatile("tlbi vaale1is, %0" : : "r" (r)); 1980 else 1981 __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); 1982 } 1983 1984 static __inline void 1985 pmap_s1_invalidate_user(uint64_t r, bool final_only) 1986 { 1987 if (final_only) 1988 __asm __volatile("tlbi vale1is, %0" : : "r" (r)); 1989 else 1990 __asm __volatile("tlbi vae1is, %0" : : "r" (r)); 1991 } 1992 1993 /* 1994 * Invalidates any cached final- and optionally intermediate-level TLB entries 1995 * for the specified virtual address in the given virtual address space. 1996 */ 1997 static __inline void 1998 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1999 { 2000 uint64_t r; 2001 2002 PMAP_ASSERT_STAGE1(pmap); 2003 2004 dsb(ishst); 2005 r = TLBI_VA(va); 2006 if (pmap == kernel_pmap) { 2007 pmap_s1_invalidate_kernel(r, final_only); 2008 } else { 2009 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 2010 pmap_s1_invalidate_user(r, final_only); 2011 } 2012 if (pmap_multiple_tlbi) { 2013 dsb(ish); 2014 __asm __volatile("tlbi vale1is, xzr" ::: "memory"); 2015 } 2016 dsb(ish); 2017 isb(); 2018 } 2019 2020 static __inline void 2021 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 2022 { 2023 PMAP_ASSERT_STAGE2(pmap); 2024 MPASS(pmap_stage2_invalidate_range != NULL); 2025 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE, 2026 final_only); 2027 } 2028 2029 static __inline void 2030 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 2031 { 2032 if (pmap->pm_stage == PM_STAGE1) 2033 pmap_s1_invalidate_page(pmap, va, final_only); 2034 else 2035 pmap_s2_invalidate_page(pmap, va, final_only); 2036 } 2037 2038 /* 2039 * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK 2040 * mappings. Otherwise, use stride L3_SIZE. 
2041 */ 2042 static __inline void 2043 pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2044 vm_offset_t stride, bool final_only) 2045 { 2046 uint64_t end, r, start; 2047 2048 PMAP_ASSERT_STAGE1(pmap); 2049 2050 dsb(ishst); 2051 if (pmap == kernel_pmap) { 2052 start = TLBI_VA(sva); 2053 end = TLBI_VA(eva); 2054 for (r = start; r < end; r += TLBI_VA(stride)) 2055 pmap_s1_invalidate_kernel(r, final_only); 2056 } else { 2057 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 2058 start |= TLBI_VA(sva); 2059 end |= TLBI_VA(eva); 2060 for (r = start; r < end; r += TLBI_VA(stride)) 2061 pmap_s1_invalidate_user(r, final_only); 2062 } 2063 if (pmap_multiple_tlbi) { 2064 dsb(ish); 2065 __asm __volatile("tlbi vale1is, xzr" ::: "memory"); 2066 } 2067 dsb(ish); 2068 isb(); 2069 } 2070 2071 /* 2072 * Invalidates any cached final- and optionally intermediate-level TLB entries 2073 * for the specified virtual address range in the given virtual address space. 2074 */ 2075 static __inline void 2076 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2077 bool final_only) 2078 { 2079 pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only); 2080 } 2081 2082 static __inline void 2083 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2084 bool final_only) 2085 { 2086 PMAP_ASSERT_STAGE2(pmap); 2087 MPASS(pmap_stage2_invalidate_range != NULL); 2088 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only); 2089 } 2090 2091 static __inline void 2092 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2093 bool final_only) 2094 { 2095 if (pmap->pm_stage == PM_STAGE1) 2096 pmap_s1_invalidate_range(pmap, sva, eva, final_only); 2097 else 2098 pmap_s2_invalidate_range(pmap, sva, eva, final_only); 2099 } 2100 2101 void 2102 pmap_s1_invalidate_all_kernel(void) 2103 { 2104 dsb(ishst); 2105 __asm __volatile("tlbi vmalle1is"); 2106 if (pmap_multiple_tlbi) { 2107 dsb(ish); 2108 
__asm __volatile("tlbi vale1is, xzr" ::: "memory"); 2109 } 2110 dsb(ish); 2111 isb(); 2112 } 2113 2114 /* 2115 * Invalidates all cached intermediate- and final-level TLB entries for the 2116 * given virtual address space. 2117 */ 2118 static __inline void 2119 pmap_s1_invalidate_all(pmap_t pmap) 2120 { 2121 uint64_t r; 2122 2123 PMAP_ASSERT_STAGE1(pmap); 2124 2125 dsb(ishst); 2126 if (pmap == kernel_pmap) { 2127 __asm __volatile("tlbi vmalle1is"); 2128 } else { 2129 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 2130 __asm __volatile("tlbi aside1is, %0" : : "r" (r)); 2131 } 2132 if (pmap_multiple_tlbi) { 2133 dsb(ish); 2134 __asm __volatile("tlbi vale1is, xzr" ::: "memory"); 2135 } 2136 dsb(ish); 2137 isb(); 2138 } 2139 2140 static __inline void 2141 pmap_s2_invalidate_all(pmap_t pmap) 2142 { 2143 PMAP_ASSERT_STAGE2(pmap); 2144 MPASS(pmap_stage2_invalidate_all != NULL); 2145 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap)); 2146 } 2147 2148 static __inline void 2149 pmap_invalidate_all(pmap_t pmap) 2150 { 2151 if (pmap->pm_stage == PM_STAGE1) 2152 pmap_s1_invalidate_all(pmap); 2153 else 2154 pmap_s2_invalidate_all(pmap); 2155 } 2156 2157 /* 2158 * Routine: pmap_extract 2159 * Function: 2160 * Extract the physical page address associated 2161 * with the given map/virtual_address pair. 2162 */ 2163 vm_paddr_t 2164 pmap_extract(pmap_t pmap, vm_offset_t va) 2165 { 2166 pt_entry_t *pte, tpte; 2167 vm_paddr_t pa; 2168 int lvl; 2169 2170 pa = 0; 2171 PMAP_LOCK(pmap); 2172 /* 2173 * Find the block or page map for this virtual address. pmap_pte 2174 * will return either a valid block/page entry, or NULL. 
2175 */ 2176 pte = pmap_pte(pmap, va, &lvl); 2177 if (pte != NULL) { 2178 tpte = pmap_load(pte); 2179 pa = PTE_TO_PHYS(tpte); 2180 switch(lvl) { 2181 case 1: 2182 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 2183 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, 2184 ("pmap_extract: Invalid L1 pte found: %lx", 2185 tpte & ATTR_DESCR_MASK)); 2186 pa |= (va & L1_OFFSET); 2187 break; 2188 case 2: 2189 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, 2190 ("pmap_extract: Invalid L2 pte found: %lx", 2191 tpte & ATTR_DESCR_MASK)); 2192 pa |= (va & L2_OFFSET); 2193 break; 2194 case 3: 2195 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, 2196 ("pmap_extract: Invalid L3 pte found: %lx", 2197 tpte & ATTR_DESCR_MASK)); 2198 pa |= (va & L3_OFFSET); 2199 break; 2200 } 2201 } 2202 PMAP_UNLOCK(pmap); 2203 return (pa); 2204 } 2205 2206 /* 2207 * Routine: pmap_extract_and_hold 2208 * Function: 2209 * Atomically extract and hold the physical page 2210 * with the given pmap and virtual address pair 2211 * if that mapping permits the given protection. 2212 */ 2213 vm_page_t 2214 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 2215 { 2216 pt_entry_t *pte, tpte; 2217 vm_offset_t off; 2218 vm_page_t m; 2219 int lvl; 2220 bool use; 2221 2222 m = NULL; 2223 PMAP_LOCK(pmap); 2224 pte = pmap_pte(pmap, va, &lvl); 2225 if (pte != NULL) { 2226 tpte = pmap_load(pte); 2227 2228 KASSERT(lvl > 0 && lvl <= 3, 2229 ("pmap_extract_and_hold: Invalid level %d", lvl)); 2230 /* 2231 * Check that the pte is either a L3 page, or a L1 or L2 block 2232 * entry. We can assume L1_BLOCK == L2_BLOCK. 
2233 */ 2234 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || 2235 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), 2236 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, 2237 tpte & ATTR_DESCR_MASK)); 2238 2239 use = false; 2240 if ((prot & VM_PROT_WRITE) == 0) 2241 use = true; 2242 else if (pmap->pm_stage == PM_STAGE1 && 2243 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) 2244 use = true; 2245 else if (pmap->pm_stage == PM_STAGE2 && 2246 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 2247 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE))) 2248 use = true; 2249 2250 if (use) { 2251 switch (lvl) { 2252 case 1: 2253 off = va & L1_OFFSET; 2254 break; 2255 case 2: 2256 off = va & L2_OFFSET; 2257 break; 2258 case 3: 2259 default: 2260 off = 0; 2261 } 2262 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off); 2263 if (m != NULL && !vm_page_wire_mapped(m)) 2264 m = NULL; 2265 } 2266 } 2267 PMAP_UNLOCK(pmap); 2268 return (m); 2269 } 2270 2271 /* 2272 * Returns true if the entire kernel virtual address range is mapped 2273 */ 2274 static bool 2275 pmap_kmapped_range(void *va, vm_size_t size) 2276 { 2277 pt_entry_t *pte, tpte; 2278 vm_offset_t eva, sva; 2279 2280 sva = (vm_offset_t)va; 2281 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, 2282 ("%s: Invalid virtual address: %lx", __func__, sva)); 2283 MPASS(size != 0); 2284 eva = sva + size - 1; 2285 KASSERT(eva > sva, ("%s: Size too large: sva %lx, size %lx", __func__, 2286 sva, size)); 2287 2288 while (sva <= eva) { 2289 pte = pmap_l1(kernel_pmap, sva); 2290 if (pte == NULL) 2291 return (false); 2292 tpte = pmap_load(pte); 2293 if (tpte == 0) 2294 return (false); 2295 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 2296 sva = (sva & ~L1_OFFSET) + L1_SIZE; 2297 continue; 2298 } 2299 2300 pte = pmap_l1_to_l2(&tpte, sva); 2301 tpte = pmap_load(pte); 2302 if (tpte == 0) 2303 return (false); 2304 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 2305 sva = (sva & ~L2_OFFSET) + L2_SIZE; 2306 continue; 2307 } 
2308 pte = pmap_l2_to_l3(&tpte, sva); 2309 tpte = pmap_load(pte); 2310 if (tpte == 0) 2311 return (false); 2312 MPASS((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_PAGE); 2313 if ((tpte & ATTR_CONTIGUOUS) == ATTR_CONTIGUOUS) 2314 sva = (sva & ~L3C_OFFSET) + L3C_SIZE; 2315 else 2316 sva = (sva & ~L3_OFFSET) + L3_SIZE; 2317 } 2318 2319 return (true); 2320 } 2321 2322 /* 2323 * Walks the page tables to translate a kernel virtual address to a 2324 * physical address. Returns true if the kva is valid and stores the 2325 * physical address in pa if it is not NULL. 2326 * 2327 * See the comment above data_abort() for the rationale for specifying 2328 * NO_PERTHREAD_SSP here. 2329 */ 2330 bool NO_PERTHREAD_SSP 2331 pmap_klookup(vm_offset_t va, vm_paddr_t *pa) 2332 { 2333 pt_entry_t *pte, tpte; 2334 register_t intr; 2335 uint64_t par; 2336 2337 /* 2338 * Disable interrupts so we don't get interrupted between asking 2339 * for address translation, and getting the result back. 2340 */ 2341 intr = intr_disable(); 2342 par = arm64_address_translate_s1e1r(va); 2343 intr_restore(intr); 2344 2345 if (PAR_SUCCESS(par)) { 2346 if (pa != NULL) 2347 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK); 2348 return (true); 2349 } 2350 2351 /* 2352 * Fall back to walking the page table. The address translation 2353 * instruction may fail when the page is in a break-before-make 2354 * sequence. As we only clear the valid bit in said sequence we 2355 * can walk the page table to find the physical address. 2356 */ 2357 2358 pte = pmap_l1(kernel_pmap, va); 2359 if (pte == NULL) 2360 return (false); 2361 2362 /* 2363 * A concurrent pmap_update_entry() will clear the entry's valid bit 2364 * but leave the rest of the entry unchanged. Therefore, we treat a 2365 * non-zero entry as being valid, and we ignore the valid bit when 2366 * determining whether the entry maps a block, page, or table. 
2367 */ 2368 tpte = pmap_load(pte); 2369 if (tpte == 0) 2370 return (false); 2371 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 2372 if (pa != NULL) 2373 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET); 2374 return (true); 2375 } 2376 pte = pmap_l1_to_l2(&tpte, va); 2377 tpte = pmap_load(pte); 2378 if (tpte == 0) 2379 return (false); 2380 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 2381 if (pa != NULL) 2382 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET); 2383 return (true); 2384 } 2385 pte = pmap_l2_to_l3(&tpte, va); 2386 tpte = pmap_load(pte); 2387 if (tpte == 0) 2388 return (false); 2389 if (pa != NULL) 2390 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET); 2391 return (true); 2392 } 2393 2394 /* 2395 * Routine: pmap_kextract 2396 * Function: 2397 * Extract the physical page address associated with the given kernel 2398 * virtual address. 2399 */ 2400 vm_paddr_t 2401 pmap_kextract(vm_offset_t va) 2402 { 2403 vm_paddr_t pa; 2404 2405 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 2406 return (DMAP_TO_PHYS(va)); 2407 2408 if (pmap_klookup(va, &pa) == false) 2409 return (0); 2410 return (pa); 2411 } 2412 2413 /*************************************************** 2414 * Low level mapping routines..... 
2415 ***************************************************/ 2416 2417 void 2418 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 2419 { 2420 pd_entry_t *pde; 2421 pt_entry_t attr, old_l3e, *pte; 2422 vm_offset_t va; 2423 vm_page_t mpte; 2424 int error, lvl; 2425 2426 KASSERT((pa & L3_OFFSET) == 0, 2427 ("pmap_kenter: Invalid physical address")); 2428 KASSERT((sva & L3_OFFSET) == 0, 2429 ("pmap_kenter: Invalid virtual address")); 2430 KASSERT((size & PAGE_MASK) == 0, 2431 ("pmap_kenter: Mapping is not page-sized")); 2432 2433 /* CCA - Map devices as nonsecure */ 2434 if (in_realm() && (mode == VM_MEMATTR_DEVICE || 2435 mode == VM_MEMATTR_DEVICE_NP)) 2436 pa |= prot_ns_shared_pa; 2437 2438 attr = ATTR_AF | pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) | 2439 ATTR_S1_XN | ATTR_KERN_GP | ATTR_S1_IDX(mode); 2440 old_l3e = 0; 2441 va = sva; 2442 while (size != 0) { 2443 pde = pmap_pde(kernel_pmap, va, &lvl); 2444 KASSERT(pde != NULL, 2445 ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); 2446 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); 2447 2448 /* 2449 * If we have an aligned, contiguous chunk of L2_SIZE, try 2450 * to create an L2_BLOCK mapping. 2451 */ 2452 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE && 2453 (pa & L2_OFFSET) == 0 && vm_initialized) { 2454 mpte = PTE_TO_VM_PAGE(pmap_load(pde)); 2455 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)), 2456 ("pmap_kenter: Unexpected mapping")); 2457 PMAP_LOCK(kernel_pmap); 2458 error = pmap_insert_pt_page(kernel_pmap, mpte, false, 2459 false); 2460 if (error == 0) { 2461 attr &= ~ATTR_CONTIGUOUS; 2462 2463 /* 2464 * Although the page table page "mpte" should 2465 * be devoid of mappings, the TLB might hold 2466 * intermediate entries that reference it, so 2467 * we perform a single-page invalidation. 
2468 */ 2469 pmap_update_entry(kernel_pmap, pde, 2470 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va, 2471 PAGE_SIZE); 2472 } 2473 PMAP_UNLOCK(kernel_pmap); 2474 if (error == 0) { 2475 va += L2_SIZE; 2476 pa += L2_SIZE; 2477 size -= L2_SIZE; 2478 continue; 2479 } 2480 } 2481 2482 /* 2483 * If we have an aligned, contiguous chunk of L3C_ENTRIES 2484 * L3 pages, set the contiguous bit within each PTE so that 2485 * the chunk can be cached using only one TLB entry. 2486 */ 2487 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) { 2488 if (size >= L3C_SIZE) 2489 attr |= ATTR_CONTIGUOUS; 2490 else 2491 attr &= ~ATTR_CONTIGUOUS; 2492 } 2493 2494 pte = pmap_l2_to_l3(pde, va); 2495 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr | 2496 L3_PAGE); 2497 2498 va += PAGE_SIZE; 2499 pa += PAGE_SIZE; 2500 size -= PAGE_SIZE; 2501 } 2502 if ((old_l3e & ATTR_DESCR_VALID) != 0) 2503 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2504 else { 2505 /* 2506 * Because the old entries were invalid and the new mappings 2507 * are not executable, an isb is not required. 2508 */ 2509 dsb(ishst); 2510 } 2511 } 2512 2513 void 2514 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 2515 { 2516 2517 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 2518 } 2519 2520 /* 2521 * Remove a page from the kernel pagetables. 2522 */ 2523 void 2524 pmap_kremove(vm_offset_t va) 2525 { 2526 pt_entry_t *pte; 2527 2528 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__); 2529 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0, 2530 ("pmap_kremove: unexpected ATTR_CONTIGUOUS")); 2531 pmap_clear(pte); 2532 pmap_s1_invalidate_page(kernel_pmap, va, true); 2533 } 2534 2535 /* 2536 * Remove the specified range of mappings from the kernel address space. 2537 * 2538 * Should only be applied to mappings that were created by pmap_kenter() or 2539 * pmap_kenter_device(). Nothing about this function is actually specific 2540 * to device mappings. 
2541 */ 2542 void 2543 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 2544 { 2545 pt_entry_t *ptep, *ptep_end; 2546 vm_offset_t va; 2547 int lvl; 2548 2549 KASSERT((sva & L3_OFFSET) == 0, 2550 ("pmap_kremove_device: Invalid virtual address")); 2551 KASSERT((size & PAGE_MASK) == 0, 2552 ("pmap_kremove_device: Mapping is not page-sized")); 2553 2554 va = sva; 2555 while (size != 0) { 2556 ptep = pmap_pte(kernel_pmap, va, &lvl); 2557 KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va)); 2558 switch (lvl) { 2559 case 2: 2560 KASSERT((va & L2_OFFSET) == 0, 2561 ("Unaligned virtual address")); 2562 KASSERT(size >= L2_SIZE, ("Insufficient size")); 2563 2564 if (va != sva) { 2565 pmap_s1_invalidate_range(kernel_pmap, sva, va, 2566 true); 2567 } 2568 pmap_clear(ptep); 2569 pmap_s1_invalidate_page(kernel_pmap, va, true); 2570 PMAP_LOCK(kernel_pmap); 2571 pmap_remove_kernel_l2(kernel_pmap, ptep, va); 2572 PMAP_UNLOCK(kernel_pmap); 2573 2574 va += L2_SIZE; 2575 sva = va; 2576 size -= L2_SIZE; 2577 break; 2578 case 3: 2579 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) { 2580 KASSERT((va & L3C_OFFSET) == 0, 2581 ("Unaligned L3C virtual address")); 2582 KASSERT(size >= L3C_SIZE, 2583 ("Insufficient L3C size")); 2584 2585 ptep_end = ptep + L3C_ENTRIES; 2586 for (; ptep < ptep_end; ptep++) 2587 pmap_clear(ptep); 2588 2589 va += L3C_SIZE; 2590 size -= L3C_SIZE; 2591 break; 2592 } 2593 pmap_clear(ptep); 2594 2595 va += PAGE_SIZE; 2596 size -= PAGE_SIZE; 2597 break; 2598 default: 2599 __assert_unreachable(); 2600 break; 2601 } 2602 } 2603 if (va != sva) 2604 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2605 } 2606 2607 /* 2608 * Used to map a range of physical addresses into kernel 2609 * virtual address space. 2610 * 2611 * The value passed in '*virt' is a suggested virtual address for 2612 * the mapping. 
Architectures which can support a direct-mapped 2613 * physical to virtual region can return the appropriate address 2614 * within that region, leaving '*virt' unchanged. Other 2615 * architectures should map the pages starting at '*virt' and 2616 * update '*virt' with the first usable address after the mapped 2617 * region. 2618 */ 2619 void * 2620 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2621 { 2622 return (PHYS_TO_DMAP(start)); 2623 } 2624 2625 /* 2626 * Add a list of wired pages to the kva 2627 * this routine is only used for temporary 2628 * kernel mappings that do not need to have 2629 * page modification or references recorded. 2630 * Note that old mappings are simply written 2631 * over. The page *must* be wired. 2632 * Note: SMP coherent. Uses a ranged shootdown IPI. 2633 */ 2634 void 2635 pmap_qenter(void *sva, vm_page_t *ma, int count) 2636 { 2637 pd_entry_t *pde; 2638 pt_entry_t attr, old_l3e, *pte; 2639 vm_offset_t va; 2640 vm_page_t m; 2641 int i, lvl; 2642 2643 old_l3e = 0; 2644 va = (vm_offset_t)sva; 2645 for (i = 0; i < count; i++) { 2646 pde = pmap_pde(kernel_pmap, va, &lvl); 2647 KASSERT(pde != NULL, 2648 ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); 2649 KASSERT(lvl == 2, 2650 ("pmap_qenter: Invalid level %d", lvl)); 2651 2652 m = ma[i]; 2653 attr = ATTR_AF | pmap_sh_attr | 2654 ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 2655 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE; 2656 pte = pmap_l2_to_l3(pde, va); 2657 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr); 2658 2659 va += L3_SIZE; 2660 } 2661 if ((old_l3e & ATTR_DESCR_VALID) != 0) 2662 pmap_s1_invalidate_range(kernel_pmap, (vm_offset_t)sva, va, 2663 true); 2664 else { 2665 /* 2666 * Because the old entries were invalid and the new mappings 2667 * are not executable, an isb is not required. 
2668 */ 2669 dsb(ishst); 2670 } 2671 } 2672 2673 /* 2674 * This routine tears out page mappings from the 2675 * kernel -- it is meant only for temporary mappings. 2676 */ 2677 void 2678 pmap_qremove(void *sva, int count) 2679 { 2680 pt_entry_t *pte; 2681 vm_offset_t va; 2682 2683 va = (vm_offset_t)sva; 2684 2685 KASSERT(ADDR_IS_CANONICAL(va), 2686 ("%s: Address not in canonical form: %p", __func__, sva)); 2687 KASSERT(ADDR_IS_KERNEL(va), ("usermode va %p", sva)); 2688 2689 while (count-- > 0) { 2690 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL); 2691 if (pte != NULL) { 2692 pmap_clear(pte); 2693 } 2694 2695 va += PAGE_SIZE; 2696 } 2697 pmap_s1_invalidate_range(kernel_pmap, (vm_offset_t)sva, va, true); 2698 } 2699 2700 /*************************************************** 2701 * Page table page management routines..... 2702 ***************************************************/ 2703 /* 2704 * Schedule the specified unused page table page to be freed. Specifically, 2705 * add the page to the specified list of pages that will be released to the 2706 * physical memory manager after the TLB has been updated. 2707 */ 2708 static __inline void 2709 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) 2710 { 2711 2712 if (set_PG_ZERO) 2713 m->flags |= PG_ZERO; 2714 else 2715 m->flags &= ~PG_ZERO; 2716 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2717 } 2718 2719 /* 2720 * Decrements a page table page's reference count, which is used to record the 2721 * number of valid page table entries within the page. If the reference count 2722 * drops to zero, then the page table page is unmapped. Returns true if the 2723 * page table page was unmapped and false otherwise. 
2724 */ 2725 static inline bool 2726 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2727 { 2728 2729 --m->ref_count; 2730 if (m->ref_count == 0) { 2731 _pmap_unwire_l3(pmap, va, m, free); 2732 return (true); 2733 } else 2734 return (false); 2735 } 2736 2737 static void 2738 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2739 { 2740 2741 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2742 /* 2743 * unmap the page table page 2744 */ 2745 if (m->pindex >= (NUL2E + NUL1E)) { 2746 /* l1 page */ 2747 pd_entry_t *l0; 2748 2749 l0 = pmap_l0(pmap, va); 2750 pmap_clear(l0); 2751 } else if (m->pindex >= NUL2E) { 2752 /* l2 page */ 2753 pd_entry_t *l1; 2754 2755 l1 = pmap_l1(pmap, va); 2756 pmap_clear(l1); 2757 } else { 2758 /* l3 page */ 2759 pd_entry_t *l2; 2760 2761 l2 = pmap_l2(pmap, va); 2762 pmap_clear(l2); 2763 } 2764 pmap_resident_count_dec(pmap, 1); 2765 if (m->pindex < NUL2E) { 2766 /* We just released an l3, unhold the matching l2 */ 2767 pd_entry_t *l1, tl1; 2768 vm_page_t l2pg; 2769 2770 l1 = pmap_l1(pmap, va); 2771 tl1 = pmap_load(l1); 2772 l2pg = PTE_TO_VM_PAGE(tl1); 2773 pmap_unwire_l3(pmap, va, l2pg, free); 2774 } else if (m->pindex < (NUL2E + NUL1E)) { 2775 /* We just released an l2, unhold the matching l1 */ 2776 pd_entry_t *l0, tl0; 2777 vm_page_t l1pg; 2778 2779 l0 = pmap_l0(pmap, va); 2780 tl0 = pmap_load(l0); 2781 l1pg = PTE_TO_VM_PAGE(tl0); 2782 pmap_unwire_l3(pmap, va, l1pg, free); 2783 } 2784 pmap_invalidate_page(pmap, va, false); 2785 2786 /* 2787 * Put page on a list so that it is released after 2788 * *ALL* TLB shootdown is done 2789 */ 2790 pmap_add_delayed_free_list(m, free, true); 2791 } 2792 2793 /* 2794 * After removing a page table entry, this routine is used to 2795 * conditionally free the page, and manage the reference count. 
2796 */ 2797 static int 2798 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2799 struct spglist *free) 2800 { 2801 vm_page_t mpte; 2802 2803 KASSERT(ADDR_IS_CANONICAL(va), 2804 ("%s: Address not in canonical form: %lx", __func__, va)); 2805 if (ADDR_IS_KERNEL(va)) 2806 return (0); 2807 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2808 mpte = PTE_TO_VM_PAGE(ptepde); 2809 return (pmap_unwire_l3(pmap, va, mpte, free)); 2810 } 2811 2812 /* 2813 * Release a page table page reference after a failed attempt to create a 2814 * mapping. 2815 */ 2816 static void 2817 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 2818 { 2819 struct spglist free; 2820 2821 SLIST_INIT(&free); 2822 if (pmap_unwire_l3(pmap, va, mpte, &free)) 2823 vm_page_free_pages_toq(&free, true); 2824 } 2825 2826 void 2827 pmap_pinit0(pmap_t pmap) 2828 { 2829 2830 PMAP_LOCK_INIT(pmap); 2831 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2832 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); 2833 pmap->pm_l0 = PHYS_TO_DMAP(pmap->pm_l0_paddr); 2834 TAILQ_INIT(&pmap->pm_pvchunk); 2835 vm_radix_init(&pmap->pm_root); 2836 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); 2837 pmap->pm_stage = PM_STAGE1; 2838 pmap->pm_levels = 4; 2839 pmap->pm_ttbr = pmap->pm_l0_paddr; 2840 pmap->pm_asid_set = &asids; 2841 pmap->pm_bti = NULL; 2842 2843 PCPU_SET(curpmap, pmap); 2844 } 2845 2846 int 2847 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels) 2848 { 2849 vm_page_t m; 2850 2851 /* 2852 * allocate the l0 page 2853 */ 2854 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED | 2855 VM_ALLOC_ZERO); 2856 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); 2857 pmap->pm_l0 = PHYS_TO_DMAP(pmap->pm_l0_paddr); 2858 2859 TAILQ_INIT(&pmap->pm_pvchunk); 2860 vm_radix_init(&pmap->pm_root); 2861 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2862 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); 2863 2864 MPASS(levels == 3 || levels == 4); 2865 pmap->pm_levels = levels; 2866 
pmap->pm_stage = stage; 2867 pmap->pm_bti = NULL; 2868 switch (stage) { 2869 case PM_STAGE1: 2870 pmap->pm_asid_set = &asids; 2871 if (pmap_bti_support) { 2872 pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF, 2873 M_ZERO | M_WAITOK); 2874 rangeset_init(pmap->pm_bti, bti_dup_range, 2875 bti_free_range, pmap, M_NOWAIT); 2876 } 2877 break; 2878 case PM_STAGE2: 2879 pmap->pm_asid_set = &vmids; 2880 break; 2881 default: 2882 panic("%s: Invalid pmap type %d", __func__, stage); 2883 break; 2884 } 2885 2886 /* XXX Temporarily disable deferred ASID allocation. */ 2887 pmap_alloc_asid(pmap); 2888 2889 /* 2890 * Allocate the level 1 entry to use as the root. This will increase 2891 * the refcount on the level 1 page so it won't be removed until 2892 * pmap_release() is called. 2893 */ 2894 if (pmap->pm_levels == 3) { 2895 PMAP_LOCK(pmap); 2896 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL); 2897 PMAP_UNLOCK(pmap); 2898 } 2899 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m); 2900 2901 return (1); 2902 } 2903 2904 int 2905 pmap_pinit(pmap_t pmap) 2906 { 2907 2908 return (pmap_pinit_stage(pmap, PM_STAGE1, 4)); 2909 } 2910 2911 /* 2912 * This routine is called if the desired page table page does not exist. 2913 * 2914 * If page table page allocation fails, this routine may sleep before 2915 * returning NULL. It sleeps only if a lock pointer was given. 2916 * 2917 * Note: If a page allocation fails at page table level two or three, 2918 * one or two pages may be held during the wait, only to be released 2919 * afterwards. This conservative approach is easily argued to avoid 2920 * race conditions. 2921 */ 2922 static vm_page_t 2923 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2924 { 2925 vm_page_t m, l1pg, l2pg; 2926 2927 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2928 2929 /* 2930 * Allocate a page table page. 
2931 */ 2932 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2933 if (lockp != NULL) { 2934 RELEASE_PV_LIST_LOCK(lockp); 2935 PMAP_UNLOCK(pmap); 2936 vm_wait(NULL); 2937 PMAP_LOCK(pmap); 2938 } 2939 2940 /* 2941 * Indicate the need to retry. While waiting, the page table 2942 * page may have been allocated. 2943 */ 2944 return (NULL); 2945 } 2946 m->pindex = ptepindex; 2947 2948 /* 2949 * Because of AArch64's weak memory consistency model, we must have a 2950 * barrier here to ensure that the stores for zeroing "m", whether by 2951 * pmap_zero_page() or an earlier function, are visible before adding 2952 * "m" to the page table. Otherwise, a page table walk by another 2953 * processor's MMU could see the mapping to "m" and a stale, non-zero 2954 * PTE within "m". 2955 */ 2956 dmb(ishst); 2957 2958 /* 2959 * Map the pagetable page into the process address space, if 2960 * it isn't already there. 2961 */ 2962 2963 if (ptepindex >= (NUL2E + NUL1E)) { 2964 pd_entry_t *l0p, l0e; 2965 vm_pindex_t l0index; 2966 2967 l0index = ptepindex - (NUL2E + NUL1E); 2968 l0p = &pmap->pm_l0[l0index]; 2969 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0, 2970 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p))); 2971 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE; 2972 2973 /* 2974 * Mark all kernel memory as not accessible from userspace 2975 * and userspace memory as not executable from the kernel. 2976 * This has been done for the bootstrap L0 entries in 2977 * locore.S. 
2978 */ 2979 if (pmap == kernel_pmap) 2980 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0; 2981 else 2982 l0e |= TATTR_PXN_TABLE; 2983 pmap_store(l0p, l0e); 2984 } else if (ptepindex >= NUL2E) { 2985 vm_pindex_t l0index, l1index; 2986 pd_entry_t *l0, *l1; 2987 pd_entry_t tl0; 2988 2989 l1index = ptepindex - NUL2E; 2990 l0index = l1index >> Ln_ENTRIES_SHIFT; 2991 2992 l0 = &pmap->pm_l0[l0index]; 2993 tl0 = pmap_load(l0); 2994 if (tl0 == 0) { 2995 /* recurse for allocating page dir */ 2996 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, 2997 lockp) == NULL) { 2998 vm_page_unwire_noq(m); 2999 vm_page_free_zero(m); 3000 return (NULL); 3001 } 3002 } else { 3003 l1pg = PTE_TO_VM_PAGE(tl0); 3004 l1pg->ref_count++; 3005 } 3006 3007 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0))); 3008 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 3009 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, 3010 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 3011 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE); 3012 } else { 3013 vm_pindex_t l0index, l1index; 3014 pd_entry_t *l0, *l1, *l2; 3015 pd_entry_t tl0, tl1; 3016 3017 l1index = ptepindex >> Ln_ENTRIES_SHIFT; 3018 l0index = l1index >> Ln_ENTRIES_SHIFT; 3019 3020 l0 = &pmap->pm_l0[l0index]; 3021 tl0 = pmap_load(l0); 3022 if (tl0 == 0) { 3023 /* recurse for allocating page dir */ 3024 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 3025 lockp) == NULL) { 3026 vm_page_unwire_noq(m); 3027 vm_page_free_zero(m); 3028 return (NULL); 3029 } 3030 tl0 = pmap_load(l0); 3031 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(tl0)); 3032 l1 = &l1[l1index & Ln_ADDR_MASK]; 3033 } else { 3034 l1 = PHYS_TO_DMAP(PTE_TO_PHYS(tl0)); 3035 l1 = &l1[l1index & Ln_ADDR_MASK]; 3036 tl1 = pmap_load(l1); 3037 if (tl1 == 0) { 3038 /* recurse for allocating page dir */ 3039 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 3040 lockp) == NULL) { 3041 vm_page_unwire_noq(m); 3042 vm_page_free_zero(m); 3043 return (NULL); 3044 } 3045 } else { 3046 l2pg = PTE_TO_VM_PAGE(tl1); 3047 l2pg->ref_count++; 3048 } 3049 
} 3050 3051 l2 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1))); 3052 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 3053 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0, 3054 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 3055 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE); 3056 } 3057 3058 pmap_resident_count_inc(pmap, 1); 3059 3060 return (m); 3061 } 3062 3063 static pd_entry_t * 3064 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, 3065 struct rwlock **lockp) 3066 { 3067 pd_entry_t *l1, *l2; 3068 vm_page_t l2pg; 3069 vm_pindex_t l2pindex; 3070 3071 KASSERT(ADDR_IS_CANONICAL(va), 3072 ("%s: Address not in canonical form: %lx", __func__, va)); 3073 3074 retry: 3075 l1 = pmap_l1(pmap, va); 3076 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { 3077 l2 = pmap_l1_to_l2(l1, va); 3078 if (ADDR_IS_USER(va)) { 3079 /* Add a reference to the L2 page. */ 3080 l2pg = PTE_TO_VM_PAGE(pmap_load(l1)); 3081 l2pg->ref_count++; 3082 } else 3083 l2pg = NULL; 3084 } else if (ADDR_IS_USER(va)) { 3085 /* Allocate a L2 page. 
*/ 3086 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 3087 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 3088 if (l2pg == NULL) { 3089 if (lockp != NULL) 3090 goto retry; 3091 else 3092 return (NULL); 3093 } 3094 l2 = VM_PAGE_TO_DMAP(l2pg); 3095 l2 = &l2[pmap_l2_index(va)]; 3096 } else 3097 panic("pmap_alloc_l2: missing page table page for va %#lx", 3098 va); 3099 *l2pgp = l2pg; 3100 return (l2); 3101 } 3102 3103 static vm_page_t 3104 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 3105 { 3106 vm_pindex_t ptepindex; 3107 pd_entry_t *pde, tpde; 3108 #ifdef INVARIANTS 3109 pt_entry_t *pte; 3110 #endif 3111 vm_page_t m; 3112 int lvl; 3113 3114 /* 3115 * Calculate pagetable page index 3116 */ 3117 ptepindex = pmap_l2_pindex(va); 3118 retry: 3119 /* 3120 * Get the page directory entry 3121 */ 3122 pde = pmap_pde(pmap, va, &lvl); 3123 3124 /* 3125 * If the page table page is mapped, we just increment the hold count, 3126 * and activate it. If we get a level 2 pde it will point to a level 3 3127 * table. 3128 */ 3129 switch (lvl) { 3130 case -1: 3131 break; 3132 case 0: 3133 #ifdef INVARIANTS 3134 pte = pmap_l0_to_l1(pde, va); 3135 KASSERT(pmap_load(pte) == 0, 3136 ("pmap_alloc_l3: TODO: l0 superpages")); 3137 #endif 3138 break; 3139 case 1: 3140 #ifdef INVARIANTS 3141 pte = pmap_l1_to_l2(pde, va); 3142 KASSERT(pmap_load(pte) == 0, 3143 ("pmap_alloc_l3: TODO: l1 superpages")); 3144 #endif 3145 break; 3146 case 2: 3147 tpde = pmap_load(pde); 3148 if (tpde != 0) { 3149 m = PTE_TO_VM_PAGE(tpde); 3150 m->ref_count++; 3151 return (m); 3152 } 3153 break; 3154 default: 3155 panic("pmap_alloc_l3: Invalid level %d", lvl); 3156 } 3157 3158 /* 3159 * Here if the pte page isn't mapped, or if it has been deallocated. 
3160 */ 3161 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 3162 if (m == NULL && lockp != NULL) 3163 goto retry; 3164 3165 return (m); 3166 } 3167 3168 /*************************************************** 3169 * Pmap allocation/deallocation routines. 3170 ***************************************************/ 3171 3172 /* 3173 * Release any resources held by the given physical map. 3174 * Called when a pmap initialized by pmap_pinit is being released. 3175 * Should only be called if the map contains no valid mappings. 3176 */ 3177 void 3178 pmap_release(pmap_t pmap) 3179 { 3180 bool rv __diagused; 3181 struct spglist freelist; 3182 struct asid_set *set; 3183 vm_page_t m; 3184 int asid; 3185 3186 if (pmap->pm_levels != 4) { 3187 PMAP_ASSERT_STAGE2(pmap); 3188 KASSERT(pmap->pm_stats.resident_count == 1, 3189 ("pmap_release: pmap resident count %ld != 0", 3190 pmap->pm_stats.resident_count)); 3191 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID, 3192 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0])); 3193 3194 SLIST_INIT(&freelist); 3195 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr); 3196 PMAP_LOCK(pmap); 3197 rv = pmap_unwire_l3(pmap, 0, m, &freelist); 3198 PMAP_UNLOCK(pmap); 3199 MPASS(rv == true); 3200 vm_page_free_pages_toq(&freelist, true); 3201 } 3202 3203 KASSERT(pmap->pm_stats.resident_count == 0, 3204 ("pmap_release: pmap resident count %ld != 0", 3205 pmap->pm_stats.resident_count)); 3206 KASSERT(vm_radix_is_empty(&pmap->pm_root), 3207 ("pmap_release: pmap has reserved page table page(s)")); 3208 3209 set = pmap->pm_asid_set; 3210 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 3211 3212 /* 3213 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate 3214 * the entries when removing them so rely on a later tlb invalidation. 3215 * this will happen when updating the VMID generation. Because of this 3216 * we don't reuse VMIDs within a generation. 
3217 */ 3218 if (pmap->pm_stage == PM_STAGE1) { 3219 mtx_lock_spin(&set->asid_set_mutex); 3220 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) { 3221 asid = COOKIE_TO_ASID(pmap->pm_cookie); 3222 KASSERT(asid >= ASID_FIRST_AVAILABLE && 3223 asid < set->asid_set_size, 3224 ("pmap_release: pmap cookie has out-of-range asid")); 3225 bit_clear(set->asid_set, asid); 3226 } 3227 mtx_unlock_spin(&set->asid_set_mutex); 3228 3229 if (pmap->pm_bti != NULL) { 3230 rangeset_fini(pmap->pm_bti); 3231 free(pmap->pm_bti, M_DEVBUF); 3232 } 3233 } 3234 3235 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); 3236 vm_page_unwire_noq(m); 3237 vm_page_free_zero(m); 3238 } 3239 3240 static int 3241 kvm_size(SYSCTL_HANDLER_ARGS) 3242 { 3243 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 3244 3245 return sysctl_handle_long(oidp, &ksize, 0, req); 3246 } 3247 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 3248 0, 0, kvm_size, "LU", 3249 "Size of KVM"); 3250 3251 static int 3252 kvm_free(SYSCTL_HANDLER_ARGS) 3253 { 3254 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 3255 3256 return sysctl_handle_long(oidp, &kfree, 0, req); 3257 } 3258 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 3259 0, 0, kvm_free, "LU", 3260 "Amount of KVM free"); 3261 3262 /* 3263 * grow the number of kernel page table entries, if needed 3264 */ 3265 static int 3266 pmap_growkernel_nopanic(vm_offset_t addr) 3267 { 3268 vm_page_t nkpg; 3269 pd_entry_t *l0, *l1, *l2; 3270 3271 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 3272 3273 addr = roundup2(addr, L2_SIZE); 3274 if (addr - 1 >= vm_map_max(kernel_map)) 3275 addr = vm_map_max(kernel_map); 3276 if (kernel_vm_end < addr) { 3277 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 3278 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 3279 } 3280 while (kernel_vm_end < addr) { 3281 l0 = pmap_l0(kernel_pmap, kernel_vm_end); 3282 KASSERT(pmap_load(l0) != 0, 3283 
("pmap_growkernel: No level 0 kernel entry")); 3284 3285 l1 = pmap_l0_to_l1(l0, kernel_vm_end); 3286 if (pmap_load(l1) == 0) { 3287 /* We need a new PDP entry */ 3288 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 3289 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3290 if (nkpg == NULL) 3291 return (KERN_RESOURCE_SHORTAGE); 3292 nkpg->pindex = pmap_l1_pindex(kernel_vm_end); 3293 /* See the dmb() in _pmap_alloc_l3(). */ 3294 dmb(ishst); 3295 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE); 3296 continue; /* try again */ 3297 } 3298 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 3299 if (pmap_load(l2) != 0) { 3300 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 3301 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3302 kernel_vm_end = vm_map_max(kernel_map); 3303 break; 3304 } 3305 continue; 3306 } 3307 3308 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 3309 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3310 if (nkpg == NULL) 3311 return (KERN_RESOURCE_SHORTAGE); 3312 nkpg->pindex = pmap_l2_pindex(kernel_vm_end); 3313 /* See the dmb() in _pmap_alloc_l3(). */ 3314 dmb(ishst); 3315 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE); 3316 3317 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 3318 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3319 kernel_vm_end = vm_map_max(kernel_map); 3320 break; 3321 } 3322 } 3323 return (KERN_SUCCESS); 3324 } 3325 3326 int 3327 pmap_growkernel(vm_offset_t addr) 3328 { 3329 int rv; 3330 3331 rv = pmap_growkernel_nopanic(addr); 3332 if (rv != KERN_SUCCESS && pmap_growkernel_panic) 3333 panic("pmap_growkernel: no memory to grow kernel"); 3334 return (rv); 3335 } 3336 3337 /*************************************************** 3338 * page management routines. 3339 ***************************************************/ 3340 3341 static const uint64_t pc_freemask[_NPCM] = { 3342 [0 ... 
_NPCM - 2] = PC_FREEN, 3343 [_NPCM - 1] = PC_FREEL 3344 }; 3345 3346 #ifdef PV_STATS 3347 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 3348 3349 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 3350 "Current number of pv entry chunks"); 3351 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 3352 "Current number of pv entry chunks allocated"); 3353 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 3354 "Current number of pv entry chunks frees"); 3355 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 3356 "Number of times tried to get a chunk page but failed."); 3357 3358 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 3359 static int pv_entry_spare; 3360 3361 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 3362 "Current number of pv entry frees"); 3363 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 3364 "Current number of pv entry allocs"); 3365 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 3366 "Current number of pv entries"); 3367 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 3368 "Current number of spare pv entries"); 3369 #endif 3370 3371 /* 3372 * We are in a serious low memory condition. Resort to 3373 * drastic measures to free some pages so we can allocate 3374 * another pv entry chunk. 3375 * 3376 * Returns NULL if PV entries were reclaimed from the specified pmap. 3377 * 3378 * We do not, however, unmap 2mpages because subsequent accesses will 3379 * allocate per-page pv entries until repromotion occurs, thereby 3380 * exacerbating the shortage of free pv entries. 
3381 */ 3382 static vm_page_t 3383 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 3384 { 3385 struct pv_chunks_list *pvc; 3386 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 3387 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 3388 struct md_page *pvh; 3389 pd_entry_t *pde; 3390 pmap_t next_pmap, pmap; 3391 pt_entry_t *pte, tpte; 3392 pv_entry_t pv; 3393 vm_offset_t va; 3394 vm_page_t m, m_pc; 3395 struct spglist free; 3396 uint64_t inuse; 3397 int bit, field, freed, lvl; 3398 3399 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 3400 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 3401 3402 pmap = NULL; 3403 m_pc = NULL; 3404 SLIST_INIT(&free); 3405 bzero(&pc_marker_b, sizeof(pc_marker_b)); 3406 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 3407 pc_marker = (struct pv_chunk *)&pc_marker_b; 3408 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 3409 3410 pvc = &pv_chunks[domain]; 3411 mtx_lock(&pvc->pvc_lock); 3412 pvc->active_reclaims++; 3413 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 3414 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 3415 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 3416 SLIST_EMPTY(&free)) { 3417 next_pmap = pc->pc_pmap; 3418 if (next_pmap == NULL) { 3419 /* 3420 * The next chunk is a marker. However, it is 3421 * not our marker, so active_reclaims must be 3422 * > 1. Consequently, the next_chunk code 3423 * will not rotate the pv_chunks list. 3424 */ 3425 goto next_chunk; 3426 } 3427 mtx_unlock(&pvc->pvc_lock); 3428 3429 /* 3430 * A pv_chunk can only be removed from the pc_lru list 3431 * when both pvc->pvc_lock is owned and the 3432 * corresponding pmap is locked. 3433 */ 3434 if (pmap != next_pmap) { 3435 if (pmap != NULL && pmap != locked_pmap) 3436 PMAP_UNLOCK(pmap); 3437 pmap = next_pmap; 3438 /* Avoid deadlock and lock recursion. 
*/ 3439 if (pmap > locked_pmap) { 3440 RELEASE_PV_LIST_LOCK(lockp); 3441 PMAP_LOCK(pmap); 3442 mtx_lock(&pvc->pvc_lock); 3443 continue; 3444 } else if (pmap != locked_pmap) { 3445 if (PMAP_TRYLOCK(pmap)) { 3446 mtx_lock(&pvc->pvc_lock); 3447 continue; 3448 } else { 3449 pmap = NULL; /* pmap is not locked */ 3450 mtx_lock(&pvc->pvc_lock); 3451 pc = TAILQ_NEXT(pc_marker, pc_lru); 3452 if (pc == NULL || 3453 pc->pc_pmap != next_pmap) 3454 continue; 3455 goto next_chunk; 3456 } 3457 } 3458 } 3459 3460 /* 3461 * Destroy every non-wired, 4 KB page mapping in the chunk. 3462 */ 3463 freed = 0; 3464 for (field = 0; field < _NPCM; field++) { 3465 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 3466 inuse != 0; inuse &= ~(1UL << bit)) { 3467 bit = ffsl(inuse) - 1; 3468 pv = &pc->pc_pventry[field * 64 + bit]; 3469 va = pv->pv_va; 3470 pde = pmap_pde(pmap, va, &lvl); 3471 if (lvl != 2) 3472 continue; 3473 pte = pmap_l2_to_l3(pde, va); 3474 tpte = pmap_load(pte); 3475 if ((tpte & ATTR_SW_WIRED) != 0) 3476 continue; 3477 if ((tpte & ATTR_CONTIGUOUS) != 0) 3478 (void)pmap_demote_l3c(pmap, pte, va); 3479 tpte = pmap_load_clear(pte); 3480 m = PTE_TO_VM_PAGE(tpte); 3481 if (pmap_pte_dirty(pmap, tpte)) 3482 vm_page_dirty(m); 3483 if ((tpte & ATTR_AF) != 0) { 3484 pmap_s1_invalidate_page(pmap, va, true); 3485 vm_page_aflag_set(m, PGA_REFERENCED); 3486 } 3487 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3488 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3489 m->md.pv_gen++; 3490 if (TAILQ_EMPTY(&m->md.pv_list) && 3491 (m->flags & PG_FICTITIOUS) == 0) { 3492 pvh = page_to_pvh(m); 3493 if (TAILQ_EMPTY(&pvh->pv_list)) { 3494 vm_page_aflag_clear(m, 3495 PGA_WRITEABLE); 3496 } 3497 } 3498 pc->pc_map[field] |= 1UL << bit; 3499 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 3500 freed++; 3501 } 3502 } 3503 if (freed == 0) { 3504 mtx_lock(&pvc->pvc_lock); 3505 goto next_chunk; 3506 } 3507 /* Every freed mapping is for a 4 KB page. 
*/ 3508 pmap_resident_count_dec(pmap, freed); 3509 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3510 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3511 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3512 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3513 if (pc_is_free(pc)) { 3514 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3515 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3516 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3517 /* Entire chunk is free; return it. */ 3518 m_pc = DMAP_TO_VM_PAGE(pc); 3519 dump_drop_page(m_pc->phys_addr); 3520 mtx_lock(&pvc->pvc_lock); 3521 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3522 break; 3523 } 3524 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3525 mtx_lock(&pvc->pvc_lock); 3526 /* One freed pv entry in locked_pmap is sufficient. */ 3527 if (pmap == locked_pmap) 3528 break; 3529 3530 next_chunk: 3531 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 3532 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 3533 if (pvc->active_reclaims == 1 && pmap != NULL) { 3534 /* 3535 * Rotate the pv chunks list so that we do not 3536 * scan the same pv chunks that could not be 3537 * freed (because they contained a wired 3538 * and/or superpage mapping) on every 3539 * invocation of reclaim_pv_chunk(). 3540 */ 3541 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){ 3542 MPASS(pc->pc_pmap != NULL); 3543 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3544 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 3545 } 3546 } 3547 } 3548 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 3549 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 3550 pvc->active_reclaims--; 3551 mtx_unlock(&pvc->pvc_lock); 3552 if (pmap != NULL && pmap != locked_pmap) 3553 PMAP_UNLOCK(pmap); 3554 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 3555 m_pc = SLIST_FIRST(&free); 3556 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3557 /* Recycle a freed page table page. 
*/ 3558 m_pc->ref_count = 1; 3559 } 3560 vm_page_free_pages_toq(&free, true); 3561 return (m_pc); 3562 } 3563 3564 static vm_page_t 3565 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 3566 { 3567 vm_page_t m; 3568 int i, domain; 3569 3570 domain = PCPU_GET(domain); 3571 for (i = 0; i < vm_ndomains; i++) { 3572 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 3573 if (m != NULL) 3574 break; 3575 domain = (domain + 1) % vm_ndomains; 3576 } 3577 3578 return (m); 3579 } 3580 3581 /* 3582 * free the pv_entry back to the free list 3583 */ 3584 static void 3585 free_pv_entry(pmap_t pmap, pv_entry_t pv) 3586 { 3587 struct pv_chunk *pc; 3588 int idx, field, bit; 3589 3590 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3591 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3592 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3593 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3594 pc = pv_to_chunk(pv); 3595 idx = pv - &pc->pc_pventry[0]; 3596 field = idx / 64; 3597 bit = idx % 64; 3598 pc->pc_map[field] |= 1ul << bit; 3599 if (!pc_is_free(pc)) { 3600 /* 98% of the time, pc is already at the head of the list. 
*/ 3601 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 3602 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3603 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3604 } 3605 return; 3606 } 3607 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3608 free_pv_chunk(pc); 3609 } 3610 3611 static void 3612 free_pv_chunk_dequeued(struct pv_chunk *pc) 3613 { 3614 vm_page_t m; 3615 3616 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3617 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3618 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3619 /* entire chunk is free, return it */ 3620 m = DMAP_TO_VM_PAGE(pc); 3621 dump_drop_page(m->phys_addr); 3622 vm_page_unwire_noq(m); 3623 vm_page_free(m); 3624 } 3625 3626 static void 3627 free_pv_chunk(struct pv_chunk *pc) 3628 { 3629 struct pv_chunks_list *pvc; 3630 3631 pvc = &pv_chunks[pc_to_domain(pc)]; 3632 mtx_lock(&pvc->pvc_lock); 3633 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3634 mtx_unlock(&pvc->pvc_lock); 3635 free_pv_chunk_dequeued(pc); 3636 } 3637 3638 static void 3639 free_pv_chunk_batch(struct pv_chunklist *batch) 3640 { 3641 struct pv_chunks_list *pvc; 3642 struct pv_chunk *pc, *npc; 3643 int i; 3644 3645 for (i = 0; i < vm_ndomains; i++) { 3646 if (TAILQ_EMPTY(&batch[i])) 3647 continue; 3648 pvc = &pv_chunks[i]; 3649 mtx_lock(&pvc->pvc_lock); 3650 TAILQ_FOREACH(pc, &batch[i], pc_list) { 3651 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3652 } 3653 mtx_unlock(&pvc->pvc_lock); 3654 } 3655 3656 for (i = 0; i < vm_ndomains; i++) { 3657 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 3658 free_pv_chunk_dequeued(pc); 3659 } 3660 } 3661 } 3662 3663 /* 3664 * Returns a new PV entry, allocating a new PV chunk from the system when 3665 * needed. If this PV chunk allocation fails and a PV list lock pointer was 3666 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 3667 * returned. 3668 * 3669 * The given PV list lock may be released. 
3670 */ 3671 static pv_entry_t 3672 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 3673 { 3674 struct pv_chunks_list *pvc; 3675 int bit, field; 3676 pv_entry_t pv; 3677 struct pv_chunk *pc; 3678 vm_page_t m; 3679 3680 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3681 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 3682 retry: 3683 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3684 if (pc != NULL) { 3685 for (field = 0; field < _NPCM; field++) { 3686 if (pc->pc_map[field]) { 3687 bit = ffsl(pc->pc_map[field]) - 1; 3688 break; 3689 } 3690 } 3691 if (field < _NPCM) { 3692 pv = &pc->pc_pventry[field * 64 + bit]; 3693 pc->pc_map[field] &= ~(1ul << bit); 3694 /* If this was the last item, move it to tail */ 3695 if (pc_is_full(pc)) { 3696 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3697 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 3698 pc_list); 3699 } 3700 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3701 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 3702 return (pv); 3703 } 3704 } 3705 /* No free items, allocate another chunk */ 3706 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3707 if (m == NULL) { 3708 if (lockp == NULL) { 3709 PV_STAT(pc_chunk_tryfail++); 3710 return (NULL); 3711 } 3712 m = reclaim_pv_chunk(pmap, lockp); 3713 if (m == NULL) 3714 goto retry; 3715 } 3716 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3717 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3718 dump_add_page(m->phys_addr); 3719 pc = PHYS_TO_DMAP(m->phys_addr); 3720 pc->pc_pmap = pmap; 3721 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 3722 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */ 3723 pvc = &pv_chunks[vm_page_domain(m)]; 3724 mtx_lock(&pvc->pvc_lock); 3725 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 3726 mtx_unlock(&pvc->pvc_lock); 3727 pv = &pc->pc_pventry[0]; 3728 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3729 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3730 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3731 return (pv); 3732 } 3733 3734 /* 3735 * Ensure that the number of 
spare PV entries in the specified pmap meets or 3736 * exceeds the given count, "needed". 3737 * 3738 * The given PV list lock may be released. 3739 */ 3740 static void 3741 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3742 { 3743 struct pv_chunks_list *pvc; 3744 struct pch new_tail[PMAP_MEMDOM]; 3745 struct pv_chunk *pc; 3746 vm_page_t m; 3747 int avail, free, i; 3748 bool reclaimed; 3749 3750 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3751 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3752 3753 /* 3754 * Newly allocated PV chunks must be stored in a private list until 3755 * the required number of PV chunks have been allocated. Otherwise, 3756 * reclaim_pv_chunk() could recycle one of these chunks. In 3757 * contrast, these chunks must be added to the pmap upon allocation. 3758 */ 3759 for (i = 0; i < PMAP_MEMDOM; i++) 3760 TAILQ_INIT(&new_tail[i]); 3761 retry: 3762 avail = 0; 3763 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3764 bit_count((bitstr_t *)pc->pc_map, 0, 3765 sizeof(pc->pc_map) * NBBY, &free); 3766 if (free == 0) 3767 break; 3768 avail += free; 3769 if (avail >= needed) 3770 break; 3771 } 3772 for (reclaimed = false; avail < needed; avail += _NPCPV) { 3773 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3774 if (m == NULL) { 3775 m = reclaim_pv_chunk(pmap, lockp); 3776 if (m == NULL) 3777 goto retry; 3778 reclaimed = true; 3779 } 3780 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3781 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3782 dump_add_page(m->phys_addr); 3783 pc = PHYS_TO_DMAP(m->phys_addr); 3784 pc->pc_pmap = pmap; 3785 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 3786 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3787 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 3788 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3789 3790 /* 3791 * The reclaim might have freed a chunk from the current pmap. 
3792 * If that chunk contained available entries, we need to 3793 * re-count the number of available entries. 3794 */ 3795 if (reclaimed) 3796 goto retry; 3797 } 3798 for (i = 0; i < vm_ndomains; i++) { 3799 if (TAILQ_EMPTY(&new_tail[i])) 3800 continue; 3801 pvc = &pv_chunks[i]; 3802 mtx_lock(&pvc->pvc_lock); 3803 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 3804 mtx_unlock(&pvc->pvc_lock); 3805 } 3806 } 3807 3808 /* 3809 * First find and then remove the pv entry for the specified pmap and virtual 3810 * address from the specified pv list. Returns the pv entry if found and NULL 3811 * otherwise. This operation can be performed on pv lists for either 4KB or 3812 * 2MB page mappings. 3813 */ 3814 static __inline pv_entry_t 3815 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3816 { 3817 pv_entry_t pv; 3818 3819 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3820 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3821 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3822 pvh->pv_gen++; 3823 break; 3824 } 3825 } 3826 return (pv); 3827 } 3828 3829 /* 3830 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3831 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3832 * entries for each of the 4KB page mappings. 3833 */ 3834 static void 3835 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3836 struct rwlock **lockp) 3837 { 3838 struct md_page *pvh; 3839 struct pv_chunk *pc; 3840 pv_entry_t pv; 3841 vm_offset_t va_last; 3842 vm_page_t m; 3843 int bit, field; 3844 3845 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3846 KASSERT((va & L2_OFFSET) == 0, 3847 ("pmap_pv_demote_l2: va is not 2mpage aligned")); 3848 KASSERT((pa & L2_OFFSET) == 0, 3849 ("pmap_pv_demote_l2: pa is not 2mpage aligned")); 3850 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3851 3852 /* 3853 * Transfer the 2mpage's pv entry for this mapping to the first 3854 * page's pv list. 
Once this transfer begins, the pv list lock 3855 * must not be released until the last pv entry is reinstantiated. 3856 */ 3857 pvh = pa_to_pvh(pa); 3858 pv = pmap_pvh_remove(pvh, pmap, va); 3859 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 3860 m = PHYS_TO_VM_PAGE(pa); 3861 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3862 m->md.pv_gen++; 3863 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */ 3864 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 3865 va_last = va + L2_SIZE - PAGE_SIZE; 3866 for (;;) { 3867 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3868 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare")); 3869 for (field = 0; field < _NPCM; field++) { 3870 while (pc->pc_map[field]) { 3871 bit = ffsl(pc->pc_map[field]) - 1; 3872 pc->pc_map[field] &= ~(1ul << bit); 3873 pv = &pc->pc_pventry[field * 64 + bit]; 3874 va += PAGE_SIZE; 3875 pv->pv_va = va; 3876 m++; 3877 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3878 ("pmap_pv_demote_l2: page %p is not managed", m)); 3879 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3880 m->md.pv_gen++; 3881 if (va == va_last) 3882 goto out; 3883 } 3884 } 3885 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3886 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3887 } 3888 out: 3889 if (pc_is_full(pc)) { 3890 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3891 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3892 } 3893 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 3894 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); 3895 } 3896 3897 /* 3898 * First find and then destroy the pv entry for the specified pmap and virtual 3899 * address. This operation can be performed on pv lists for either 4KB or 2MB 3900 * page mappings. 
3901 */ 3902 static void 3903 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3904 { 3905 pv_entry_t pv; 3906 3907 pv = pmap_pvh_remove(pvh, pmap, va); 3908 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3909 free_pv_entry(pmap, pv); 3910 } 3911 3912 /* 3913 * Conditionally create the PV entry for a 4KB page mapping if the required 3914 * memory can be allocated without resorting to reclamation. 3915 */ 3916 static bool 3917 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3918 struct rwlock **lockp) 3919 { 3920 pv_entry_t pv; 3921 3922 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3923 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3924 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3925 pv->pv_va = va; 3926 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3927 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3928 m->md.pv_gen++; 3929 return (true); 3930 } else 3931 return (false); 3932 } 3933 3934 /* 3935 * Create the PV entry for a 2MB page mapping. Always returns true unless the 3936 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 3937 * false if the PV entry cannot be allocated without resorting to reclamation. 3938 */ 3939 static bool 3940 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 3941 struct rwlock **lockp) 3942 { 3943 struct md_page *pvh; 3944 pv_entry_t pv; 3945 vm_paddr_t pa; 3946 3947 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3948 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3949 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 
3950 NULL : lockp)) == NULL) 3951 return (false); 3952 pv->pv_va = va; 3953 pa = PTE_TO_PHYS(l2e); 3954 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3955 pvh = pa_to_pvh(pa); 3956 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3957 pvh->pv_gen++; 3958 return (true); 3959 } 3960 3961 /* 3962 * Conditionally creates the PV entries for a L3C superpage mapping if 3963 * the required memory can be allocated without resorting to reclamation. 3964 */ 3965 static bool 3966 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m, 3967 struct rwlock **lockp) 3968 { 3969 pv_entry_t pv; 3970 vm_offset_t tva; 3971 vm_paddr_t pa __diagused; 3972 vm_page_t mt; 3973 3974 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3975 KASSERT((va & L3C_OFFSET) == 0, 3976 ("pmap_pv_insert_l3c: va is not aligned")); 3977 pa = VM_PAGE_TO_PHYS(m); 3978 KASSERT((pa & L3C_OFFSET) == 0, 3979 ("pmap_pv_insert_l3c: pa is not aligned")); 3980 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3981 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) { 3982 /* Pass NULL instead of lockp to disable reclamation. 
*/ 3983 pv = get_pv_entry(pmap, NULL); 3984 if (__predict_false(pv == NULL)) { 3985 while (tva > va) { 3986 mt--; 3987 tva -= L3_SIZE; 3988 pmap_pvh_free(&mt->md, pmap, tva); 3989 } 3990 return (false); 3991 } 3992 pv->pv_va = tva; 3993 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next); 3994 mt->md.pv_gen++; 3995 } 3996 return (true); 3997 } 3998 3999 static void 4000 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 4001 { 4002 pt_entry_t newl2, oldl2 __diagused; 4003 vm_page_t ml3; 4004 vm_paddr_t ml3pa; 4005 4006 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 4007 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 4008 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4009 4010 ml3 = pmap_remove_pt_page(pmap, va); 4011 KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page")); 4012 4013 ml3pa = VM_PAGE_TO_PHYS(ml3); 4014 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE; 4015 4016 /* 4017 * If this page table page was unmapped by a promotion, then it 4018 * contains valid mappings. Zero it to invalidate those mappings. 4019 */ 4020 if (vm_page_any_valid(ml3)) 4021 pagezero(PHYS_TO_DMAP(ml3pa)); 4022 4023 /* 4024 * Demote the mapping. The caller must have already invalidated the 4025 * mapping (i.e., the "break" in break-before-make). 4026 */ 4027 oldl2 = pmap_load_store(l2, newl2); 4028 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 4029 __func__, l2, oldl2)); 4030 } 4031 4032 /* 4033 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 
4034 */ 4035 static int 4036 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, 4037 bool demote_kl2e, struct spglist *free, struct rwlock **lockp) 4038 { 4039 struct md_page *pvh; 4040 pt_entry_t old_l2; 4041 vm_page_t m, ml3, mt; 4042 4043 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4044 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 4045 old_l2 = pmap_load_clear(l2); 4046 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 4047 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); 4048 4049 /* 4050 * Since a promotion must break the 4KB page mappings before making 4051 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 4052 */ 4053 pmap_s1_invalidate_page(pmap, sva, true); 4054 4055 if (old_l2 & ATTR_SW_WIRED) 4056 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 4057 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 4058 if (old_l2 & ATTR_SW_MANAGED) { 4059 m = PTE_TO_VM_PAGE(old_l2); 4060 pvh = page_to_pvh(m); 4061 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 4062 pmap_pvh_free(pvh, pmap, sva); 4063 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) { 4064 if (pmap_pte_dirty(pmap, old_l2)) 4065 vm_page_dirty(mt); 4066 if (old_l2 & ATTR_AF) 4067 vm_page_aflag_set(mt, PGA_REFERENCED); 4068 if (TAILQ_EMPTY(&mt->md.pv_list) && 4069 TAILQ_EMPTY(&pvh->pv_list)) 4070 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4071 } 4072 } 4073 if (pmap != kernel_pmap) { 4074 ml3 = pmap_remove_pt_page(pmap, sva); 4075 if (ml3 != NULL) { 4076 KASSERT(vm_page_any_valid(ml3), 4077 ("pmap_remove_l2: l3 page not promoted")); 4078 pmap_resident_count_dec(pmap, 1); 4079 KASSERT(ml3->ref_count == NL3PG, 4080 ("pmap_remove_l2: l3 page ref count error")); 4081 ml3->ref_count = 0; 4082 pmap_add_delayed_free_list(ml3, free, false); 4083 } 4084 } else if (demote_kl2e) { 4085 pmap_remove_kernel_l2(pmap, l2, sva); 4086 } else { 4087 ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva)); 4088 if (vm_page_any_valid(ml3)) { 4089 ml3->valid = 0; 
4090 pmap_zero_page(ml3); 4091 } 4092 } 4093 return (pmap_unuse_pt(pmap, sva, l1e, free)); 4094 } 4095 4096 /* 4097 * pmap_remove_l3: do the things to unmap a page in a process 4098 */ 4099 static int 4100 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 4101 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 4102 { 4103 struct md_page *pvh; 4104 pt_entry_t old_l3; 4105 vm_page_t m; 4106 4107 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4108 old_l3 = pmap_load(l3); 4109 if ((old_l3 & ATTR_CONTIGUOUS) != 0) 4110 (void)pmap_demote_l3c(pmap, l3, va); 4111 old_l3 = pmap_load_clear(l3); 4112 pmap_s1_invalidate_page(pmap, va, true); 4113 if (old_l3 & ATTR_SW_WIRED) 4114 pmap->pm_stats.wired_count -= 1; 4115 pmap_resident_count_dec(pmap, 1); 4116 if (old_l3 & ATTR_SW_MANAGED) { 4117 m = PTE_TO_VM_PAGE(old_l3); 4118 if (pmap_pte_dirty(pmap, old_l3)) 4119 vm_page_dirty(m); 4120 if (old_l3 & ATTR_AF) 4121 vm_page_aflag_set(m, PGA_REFERENCED); 4122 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 4123 pmap_pvh_free(&m->md, pmap, va); 4124 if (TAILQ_EMPTY(&m->md.pv_list) && 4125 (m->flags & PG_FICTITIOUS) == 0) { 4126 pvh = page_to_pvh(m); 4127 if (TAILQ_EMPTY(&pvh->pv_list)) 4128 vm_page_aflag_clear(m, PGA_WRITEABLE); 4129 } 4130 } 4131 return (pmap_unuse_pt(pmap, va, l2e, free)); 4132 } 4133 4134 /* 4135 * Removes the specified L3C superpage mapping. Requests TLB invalidations 4136 * to be performed by the caller through the returned "*vap". Returns true 4137 * if the level 3 table "ml3" was unmapped and added to the spglist "free". 4138 * Otherwise, returns false. 
4139 */ 4140 static bool 4141 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap, 4142 vm_offset_t va_next, vm_page_t ml3, struct spglist *free, 4143 struct rwlock **lockp) 4144 { 4145 struct md_page *pvh; 4146 struct rwlock *new_lock; 4147 pt_entry_t first_l3e, l3e, *tl3p; 4148 vm_offset_t tva; 4149 vm_page_t m, mt; 4150 4151 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4152 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) == 4153 0, ("pmap_remove_l3c: l3p is not aligned")); 4154 KASSERT((va & L3C_OFFSET) == 0, 4155 ("pmap_remove_l3c: va is not aligned")); 4156 4157 /* 4158 * Hardware accessed and dirty bit maintenance might only update a 4159 * single L3 entry, so we must combine the accessed and dirty bits 4160 * from this entire set of contiguous L3 entries. 4161 */ 4162 first_l3e = pmap_load_clear(l3p); 4163 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 4164 l3e = pmap_load_clear(tl3p); 4165 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 4166 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS")); 4167 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) == 4168 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW))) 4169 first_l3e &= ~ATTR_S1_AP_RW_BIT; 4170 first_l3e |= l3e & ATTR_AF; 4171 } 4172 if ((first_l3e & ATTR_SW_WIRED) != 0) 4173 pmap->pm_stats.wired_count -= L3C_ENTRIES; 4174 pmap_resident_count_dec(pmap, L3C_ENTRIES); 4175 if ((first_l3e & ATTR_SW_MANAGED) != 0) { 4176 m = PTE_TO_VM_PAGE(first_l3e); 4177 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4178 if (new_lock != *lockp) { 4179 if (*lockp != NULL) { 4180 /* 4181 * Pending TLB invalidations must be 4182 * performed before the PV list lock is 4183 * released. Otherwise, a concurrent 4184 * pmap_remove_all() on a physical page 4185 * could return while a stale TLB entry 4186 * still provides access to that page. 
4187 */ 4188 if (*vap != va_next) { 4189 pmap_invalidate_range(pmap, *vap, va, 4190 true); 4191 *vap = va_next; 4192 } 4193 rw_wunlock(*lockp); 4194 } 4195 *lockp = new_lock; 4196 rw_wlock(*lockp); 4197 } 4198 pvh = page_to_pvh(m); 4199 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += 4200 L3_SIZE) { 4201 if (pmap_pte_dirty(pmap, first_l3e)) 4202 vm_page_dirty(mt); 4203 if ((first_l3e & ATTR_AF) != 0) 4204 vm_page_aflag_set(mt, PGA_REFERENCED); 4205 pmap_pvh_free(&mt->md, pmap, tva); 4206 if (TAILQ_EMPTY(&mt->md.pv_list) && 4207 TAILQ_EMPTY(&pvh->pv_list)) 4208 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4209 } 4210 } 4211 if (*vap == va_next) 4212 *vap = va; 4213 if (ml3 != NULL) { 4214 ml3->ref_count -= L3C_ENTRIES; 4215 if (ml3->ref_count == 0) { 4216 _pmap_unwire_l3(pmap, va, ml3, free); 4217 return (true); 4218 } 4219 } 4220 return (false); 4221 } 4222 4223 /* 4224 * Remove the specified range of addresses from the L3 page table that is 4225 * identified by the given L2 entry. 4226 */ 4227 static void 4228 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, 4229 vm_offset_t eva, struct spglist *free, struct rwlock **lockp) 4230 { 4231 struct md_page *pvh; 4232 struct rwlock *new_lock; 4233 pt_entry_t *l3, old_l3; 4234 vm_offset_t va; 4235 vm_page_t l3pg, m; 4236 4237 KASSERT(ADDR_IS_CANONICAL(sva), 4238 ("%s: Start address not in canonical form: %lx", __func__, sva)); 4239 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS, 4240 ("%s: End address not in canonical form: %lx", __func__, eva)); 4241 4242 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4243 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), 4244 ("pmap_remove_l3_range: range crosses an L3 page table boundary")); 4245 l3pg = ADDR_IS_USER(sva) ? 
PTE_TO_VM_PAGE(l2e) : NULL; 4246 va = eva; 4247 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 4248 old_l3 = pmap_load(l3); 4249 if (!pmap_l3_valid(old_l3)) { 4250 if (va != eva) { 4251 pmap_invalidate_range(pmap, va, sva, true); 4252 va = eva; 4253 } 4254 continue; 4255 } 4256 if ((old_l3 & ATTR_CONTIGUOUS) != 0) { 4257 /* 4258 * Is this entire set of contiguous L3 entries being 4259 * removed? Handle the possibility that "eva" is zero 4260 * because of address wraparound. 4261 */ 4262 if ((sva & L3C_OFFSET) == 0 && 4263 sva + L3C_OFFSET <= eva - 1) { 4264 if (pmap_remove_l3c(pmap, l3, sva, &va, eva, 4265 l3pg, free, lockp)) { 4266 /* The L3 table was unmapped. */ 4267 sva += L3C_SIZE; 4268 break; 4269 } 4270 l3 += L3C_ENTRIES - 1; 4271 sva += L3C_SIZE - L3_SIZE; 4272 continue; 4273 } 4274 4275 (void)pmap_demote_l3c(pmap, l3, sva); 4276 } 4277 old_l3 = pmap_load_clear(l3); 4278 if ((old_l3 & ATTR_SW_WIRED) != 0) 4279 pmap->pm_stats.wired_count--; 4280 pmap_resident_count_dec(pmap, 1); 4281 /* Below will only be true in a realm environment. */ 4282 if (PTE_TO_PHYS(old_l3) & prot_ns_shared_pa) 4283 pmap_set_protected(old_l3); 4284 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 4285 m = PTE_TO_VM_PAGE(old_l3); 4286 if (pmap_pte_dirty(pmap, old_l3)) 4287 vm_page_dirty(m); 4288 if ((old_l3 & ATTR_AF) != 0) 4289 vm_page_aflag_set(m, PGA_REFERENCED); 4290 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4291 if (new_lock != *lockp) { 4292 if (*lockp != NULL) { 4293 /* 4294 * Pending TLB invalidations must be 4295 * performed before the PV list lock is 4296 * released. Otherwise, a concurrent 4297 * pmap_remove_all() on a physical page 4298 * could return while a stale TLB entry 4299 * still provides access to that page. 
4300 */ 4301 if (va != eva) { 4302 pmap_invalidate_range(pmap, va, 4303 sva, true); 4304 va = eva; 4305 } 4306 rw_wunlock(*lockp); 4307 } 4308 *lockp = new_lock; 4309 rw_wlock(*lockp); 4310 } 4311 pmap_pvh_free(&m->md, pmap, sva); 4312 if (TAILQ_EMPTY(&m->md.pv_list) && 4313 (m->flags & PG_FICTITIOUS) == 0) { 4314 pvh = page_to_pvh(m); 4315 if (TAILQ_EMPTY(&pvh->pv_list)) 4316 vm_page_aflag_clear(m, PGA_WRITEABLE); 4317 } 4318 } 4319 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 4320 /* 4321 * _pmap_unwire_l3() has already invalidated the TLB 4322 * entries at all levels for "sva". So, we need not 4323 * perform "sva += L3_SIZE;" here. Moreover, we need 4324 * not perform "va = sva;" if "sva" is at the start 4325 * of a new valid range consisting of a single page. 4326 */ 4327 break; 4328 } 4329 if (va == eva) 4330 va = sva; 4331 } 4332 if (va != eva) 4333 pmap_invalidate_range(pmap, va, sva, true); 4334 } 4335 4336 static void 4337 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) 4338 { 4339 struct rwlock *lock; 4340 vm_offset_t va_next; 4341 pd_entry_t *l0, *l1, *l2; 4342 pt_entry_t l3_paddr; 4343 struct spglist free; 4344 4345 /* 4346 * Perform an unsynchronized read. This is, however, safe. 
4347 */ 4348 if (pmap->pm_stats.resident_count == 0) 4349 return; 4350 4351 SLIST_INIT(&free); 4352 4353 PMAP_LOCK(pmap); 4354 if (map_delete) 4355 pmap_bti_on_remove(pmap, sva, eva); 4356 4357 lock = NULL; 4358 for (; sva < eva; sva = va_next) { 4359 if (pmap->pm_stats.resident_count == 0) 4360 break; 4361 4362 l0 = pmap_l0(pmap, sva); 4363 if (pmap_load(l0) == 0) { 4364 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4365 if (va_next < sva) 4366 va_next = eva; 4367 continue; 4368 } 4369 4370 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4371 if (va_next < sva) 4372 va_next = eva; 4373 l1 = pmap_l0_to_l1(l0, sva); 4374 if (pmap_load(l1) == 0) 4375 continue; 4376 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4377 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4378 KASSERT(va_next <= eva, 4379 ("partial update of non-transparent 1G page " 4380 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4381 pmap_load(l1), sva, eva, va_next)); 4382 MPASS(pmap != kernel_pmap); 4383 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 4384 pmap_clear(l1); 4385 pmap_s1_invalidate_page(pmap, sva, true); 4386 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); 4387 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); 4388 continue; 4389 } 4390 4391 /* 4392 * Calculate index for next page table. 4393 */ 4394 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4395 if (va_next < sva) 4396 va_next = eva; 4397 4398 l2 = pmap_l1_to_l2(l1, sva); 4399 l3_paddr = pmap_load(l2); 4400 4401 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 4402 if (sva + L2_SIZE == va_next && eva >= va_next) { 4403 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 4404 true, &free, &lock); 4405 continue; 4406 } else if (pmap_demote_l2_locked(pmap, l2, sva, 4407 &lock) == NULL) 4408 continue; 4409 l3_paddr = pmap_load(l2); 4410 } 4411 4412 /* 4413 * Weed out invalid mappings. 
4414 */ 4415 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 4416 continue; 4417 4418 /* 4419 * Limit our scan to either the end of the va represented 4420 * by the current page table page, or to the end of the 4421 * range being removed. 4422 */ 4423 if (va_next > eva) 4424 va_next = eva; 4425 4426 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 4427 &lock); 4428 } 4429 if (lock != NULL) 4430 rw_wunlock(lock); 4431 PMAP_UNLOCK(pmap); 4432 vm_page_free_pages_toq(&free, true); 4433 } 4434 4435 /* 4436 * Remove the given range of addresses from the specified map. 4437 * 4438 * It is assumed that the start and end are properly 4439 * rounded to the page size. 4440 */ 4441 void 4442 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4443 { 4444 pmap_remove1(pmap, sva, eva, false); 4445 } 4446 4447 /* 4448 * Remove the given range of addresses as part of a logical unmap 4449 * operation. This has the effect of calling pmap_remove(), but 4450 * also clears any metadata that should persist for the lifetime 4451 * of a logical mapping. 4452 */ 4453 void 4454 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4455 { 4456 pmap_remove1(pmap, sva, eva, true); 4457 } 4458 4459 /* 4460 * Routine: pmap_remove_all 4461 * Function: 4462 * Removes this physical page from 4463 * all physical maps in which it resides. 4464 * Reflects back modify bits to the pager. 4465 * 4466 * Notes: 4467 * Original versions of this routine were very 4468 * inefficient because they iteratively called 4469 * pmap_remove (slow...) 
4470 */ 4471 4472 void 4473 pmap_remove_all(vm_page_t m) 4474 { 4475 struct md_page *pvh; 4476 pv_entry_t pv; 4477 pmap_t pmap; 4478 struct rwlock *lock; 4479 pd_entry_t *pde, tpde; 4480 pt_entry_t *pte, tpte; 4481 vm_offset_t va; 4482 struct spglist free; 4483 int lvl, pvh_gen, md_gen; 4484 4485 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4486 ("pmap_remove_all: page %p is not managed", m)); 4487 SLIST_INIT(&free); 4488 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4489 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 4490 rw_wlock(lock); 4491 retry: 4492 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4493 pmap = PV_PMAP(pv); 4494 if (!PMAP_TRYLOCK(pmap)) { 4495 pvh_gen = pvh->pv_gen; 4496 rw_wunlock(lock); 4497 PMAP_LOCK(pmap); 4498 rw_wlock(lock); 4499 if (pvh_gen != pvh->pv_gen) { 4500 PMAP_UNLOCK(pmap); 4501 goto retry; 4502 } 4503 } 4504 va = pv->pv_va; 4505 pte = pmap_pte_exists(pmap, va, 2, __func__); 4506 pmap_demote_l2_locked(pmap, pte, va, &lock); 4507 PMAP_UNLOCK(pmap); 4508 } 4509 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4510 pmap = PV_PMAP(pv); 4511 if (!PMAP_TRYLOCK(pmap)) { 4512 pvh_gen = pvh->pv_gen; 4513 md_gen = m->md.pv_gen; 4514 rw_wunlock(lock); 4515 PMAP_LOCK(pmap); 4516 rw_wlock(lock); 4517 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4518 PMAP_UNLOCK(pmap); 4519 goto retry; 4520 } 4521 } 4522 pmap_resident_count_dec(pmap, 1); 4523 4524 pde = pmap_pde(pmap, pv->pv_va, &lvl); 4525 KASSERT(pde != NULL, 4526 ("pmap_remove_all: no page directory entry found")); 4527 KASSERT(lvl == 2, 4528 ("pmap_remove_all: invalid pde level %d", lvl)); 4529 tpde = pmap_load(pde); 4530 4531 pte = pmap_l2_to_l3(pde, pv->pv_va); 4532 tpte = pmap_load(pte); 4533 if ((tpte & ATTR_CONTIGUOUS) != 0) 4534 (void)pmap_demote_l3c(pmap, pte, pv->pv_va); 4535 tpte = pmap_load_clear(pte); 4536 if (tpte & ATTR_SW_WIRED) 4537 pmap->pm_stats.wired_count--; 4538 if ((tpte & ATTR_AF) != 0) { 4539 pmap_invalidate_page(pmap, pv->pv_va, true); 4540 
vm_page_aflag_set(m, PGA_REFERENCED); 4541 } 4542 4543 /* 4544 * Update the vm_page_t clean and reference bits. 4545 */ 4546 if (pmap_pte_dirty(pmap, tpte)) 4547 vm_page_dirty(m); 4548 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); 4549 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4550 m->md.pv_gen++; 4551 free_pv_entry(pmap, pv); 4552 PMAP_UNLOCK(pmap); 4553 } 4554 vm_page_aflag_clear(m, PGA_WRITEABLE); 4555 rw_wunlock(lock); 4556 vm_page_free_pages_toq(&free, true); 4557 } 4558 4559 /* 4560 * Masks and sets bits in a level 2 page table entries in the specified pmap 4561 */ 4562 static void 4563 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, 4564 pt_entry_t nbits) 4565 { 4566 pd_entry_t old_l2; 4567 vm_page_t m, mt; 4568 4569 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4570 PMAP_ASSERT_STAGE1(pmap); 4571 KASSERT((sva & L2_OFFSET) == 0, 4572 ("pmap_protect_l2: sva is not 2mpage aligned")); 4573 old_l2 = pmap_load(l2); 4574 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 4575 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); 4576 4577 /* 4578 * Return if the L2 entry already has the desired access restrictions 4579 * in place. 4580 */ 4581 if ((old_l2 & mask) == nbits) 4582 return; 4583 4584 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) 4585 cpu_spinwait(); 4586 4587 /* 4588 * When a dirty read/write superpage mapping is write protected, 4589 * update the dirty field of each of the superpage's constituent 4KB 4590 * pages. 4591 */ 4592 if ((old_l2 & ATTR_SW_MANAGED) != 0 && 4593 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 4594 pmap_pte_dirty(pmap, old_l2)) { 4595 m = PTE_TO_VM_PAGE(old_l2); 4596 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4597 vm_page_dirty(mt); 4598 } 4599 4600 /* 4601 * Since a promotion must break the 4KB page mappings before making 4602 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 
4603 */ 4604 pmap_s1_invalidate_page(pmap, sva, true); 4605 } 4606 4607 /* 4608 * Masks and sets bits in the specified L3C superpage mapping. 4609 * 4610 * Requests TLB invalidations to be performed by the caller through the 4611 * returned "*vap". 4612 */ 4613 static void 4614 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 4615 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits) 4616 { 4617 pt_entry_t l3e, *tl3p; 4618 vm_page_t m, mt; 4619 bool dirty; 4620 4621 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4622 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) == 4623 0, ("pmap_mask_set_l3c: l3p is not aligned")); 4624 KASSERT((va & L3C_OFFSET) == 0, 4625 ("pmap_mask_set_l3c: va is not aligned")); 4626 dirty = false; 4627 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 4628 l3e = pmap_load(tl3p); 4629 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 4630 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS")); 4631 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits)) 4632 cpu_spinwait(); 4633 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) == 4634 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW))) 4635 dirty = true; 4636 } 4637 4638 /* 4639 * When a dirty read/write superpage mapping is write protected, 4640 * update the dirty field of each of the superpage's constituent 4KB 4641 * pages. 
4642 */ 4643 if ((l3e & ATTR_SW_MANAGED) != 0 && 4644 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 4645 dirty) { 4646 m = PTE_TO_VM_PAGE(pmap_load(l3p)); 4647 for (mt = m; mt < &m[L3C_ENTRIES]; mt++) 4648 vm_page_dirty(mt); 4649 } 4650 4651 if (*vap == va_next) 4652 *vap = va; 4653 } 4654 4655 /* 4656 * Masks and sets bits in last level page table entries in the specified 4657 * pmap and range 4658 */ 4659 static void 4660 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask, 4661 pt_entry_t nbits, bool invalidate) 4662 { 4663 vm_offset_t va, va_next; 4664 pd_entry_t *l0, *l1, *l2; 4665 pt_entry_t *l3p, l3; 4666 4667 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4668 for (; sva < eva; sva = va_next) { 4669 l0 = pmap_l0(pmap, sva); 4670 if (pmap_load(l0) == 0) { 4671 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4672 if (va_next < sva) 4673 va_next = eva; 4674 continue; 4675 } 4676 4677 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4678 if (va_next < sva) 4679 va_next = eva; 4680 l1 = pmap_l0_to_l1(l0, sva); 4681 if (pmap_load(l1) == 0) 4682 continue; 4683 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4684 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4685 KASSERT(va_next <= eva, 4686 ("partial update of non-transparent 1G page " 4687 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4688 pmap_load(l1), sva, eva, va_next)); 4689 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 4690 if ((pmap_load(l1) & mask) != nbits) { 4691 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); 4692 if (invalidate) 4693 pmap_s1_invalidate_page(pmap, sva, true); 4694 } 4695 continue; 4696 } 4697 4698 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4699 if (va_next < sva) 4700 va_next = eva; 4701 4702 l2 = pmap_l1_to_l2(l1, sva); 4703 if (pmap_load(l2) == 0) 4704 continue; 4705 4706 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 4707 if (sva + L2_SIZE == va_next && eva >= va_next) { 4708 pmap_protect_l2(pmap, l2, sva, mask, nbits); 4709 continue; 4710 } else if ((pmap_load(l2) & mask) == nbits || 4711 
pmap_demote_l2(pmap, l2, sva) == NULL) 4712 continue; 4713 } 4714 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 4715 ("pmap_protect: Invalid L2 entry after demotion")); 4716 4717 if (va_next > eva) 4718 va_next = eva; 4719 4720 va = va_next; 4721 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, 4722 sva += L3_SIZE) { 4723 l3 = pmap_load(l3p); 4724 4725 /* 4726 * Go to the next L3 entry if the current one is 4727 * invalid or already has the desired access 4728 * restrictions in place. (The latter case occurs 4729 * frequently. For example, in a "buildworld" 4730 * workload, almost 1 out of 4 L3 entries already 4731 * have the desired restrictions.) 4732 */ 4733 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { 4734 if (va != va_next) { 4735 if (invalidate) 4736 pmap_s1_invalidate_range(pmap, 4737 va, sva, true); 4738 va = va_next; 4739 } 4740 if ((l3 & ATTR_CONTIGUOUS) != 0) { 4741 /* 4742 * Does this L3C page extend beyond 4743 * the requested range? Handle the 4744 * possibility that "va_next" is zero. 4745 */ 4746 if ((sva | L3C_OFFSET) > va_next - 1) 4747 break; 4748 4749 /* 4750 * Skip ahead to the last L3_PAGE 4751 * within this L3C page. 4752 */ 4753 l3p = (pt_entry_t *)((uintptr_t)l3p | 4754 ((L3C_ENTRIES - 1) * 4755 sizeof(pt_entry_t))); 4756 sva |= L3C_SIZE - L3_SIZE; 4757 } 4758 continue; 4759 } 4760 4761 if ((l3 & ATTR_CONTIGUOUS) != 0) { 4762 /* 4763 * Is this entire set of contiguous L3 entries 4764 * being protected? Handle the possibility 4765 * that "va_next" is zero because of address 4766 * wraparound. 4767 */ 4768 if ((sva & L3C_OFFSET) == 0 && 4769 sva + L3C_OFFSET <= va_next - 1) { 4770 pmap_mask_set_l3c(pmap, l3p, sva, &va, 4771 va_next, mask, nbits); 4772 l3p += L3C_ENTRIES - 1; 4773 sva += L3C_SIZE - L3_SIZE; 4774 continue; 4775 } 4776 4777 (void)pmap_demote_l3c(pmap, l3p, sva); 4778 4779 /* 4780 * The L3 entry's accessed bit may have changed. 
4781 */ 4782 l3 = pmap_load(l3p); 4783 } 4784 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | 4785 nbits)) 4786 cpu_spinwait(); 4787 4788 /* 4789 * When a dirty read/write mapping is write protected, 4790 * update the page's dirty field. 4791 */ 4792 if ((l3 & ATTR_SW_MANAGED) != 0 && 4793 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 4794 pmap_pte_dirty(pmap, l3)) 4795 vm_page_dirty(PTE_TO_VM_PAGE(l3)); 4796 4797 if (va == va_next) 4798 va = sva; 4799 } 4800 if (va != va_next && invalidate) 4801 pmap_s1_invalidate_range(pmap, va, sva, true); 4802 } 4803 } 4804 4805 static void 4806 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask, 4807 pt_entry_t nbits, bool invalidate) 4808 { 4809 PMAP_LOCK(pmap); 4810 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate); 4811 PMAP_UNLOCK(pmap); 4812 } 4813 4814 /* 4815 * Set the physical protection on the 4816 * specified range of this map as requested. 4817 */ 4818 void 4819 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4820 { 4821 pt_entry_t mask, nbits; 4822 4823 PMAP_ASSERT_STAGE1(pmap); 4824 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4825 if (prot == VM_PROT_NONE) { 4826 pmap_remove(pmap, sva, eva); 4827 return; 4828 } 4829 4830 mask = nbits = 0; 4831 if ((prot & VM_PROT_WRITE) == 0) { 4832 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; 4833 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); 4834 } 4835 if ((prot & VM_PROT_EXECUTE) == 0) { 4836 mask |= ATTR_S1_XN; 4837 nbits |= ATTR_S1_XN; 4838 } 4839 if (pmap == kernel_pmap) { 4840 mask |= ATTR_KERN_GP; 4841 nbits |= ATTR_KERN_GP; 4842 } 4843 if (mask == 0) 4844 return; 4845 4846 pmap_mask_set(pmap, sva, eva, mask, nbits, true); 4847 } 4848 4849 void 4850 pmap_disable_promotion(vm_offset_t sva, vm_size_t size) 4851 { 4852 4853 MPASS((sva & L3_OFFSET) == 0); 4854 MPASS(((sva + size) & L3_OFFSET) == 0); 4855 4856 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE, 4857 ATTR_SW_NO_PROMOTE, false); 
4858 } 4859 4860 /* 4861 * Inserts the specified page table page into the specified pmap's collection 4862 * of idle page table pages. Each of a pmap's page table pages is responsible 4863 * for mapping a distinct range of virtual addresses. The pmap's collection is 4864 * ordered by this virtual address range. 4865 * 4866 * If "promoted" is false, then the page table page "mpte" must be zero filled; 4867 * "mpte"'s valid field will be set to 0. 4868 * 4869 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must 4870 * contain valid mappings with identical attributes except for ATTR_AF; 4871 * "mpte"'s valid field will be set to 1. 4872 * 4873 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain 4874 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid 4875 * field will be set to VM_PAGE_BITS_ALL. 4876 */ 4877 static __inline int 4878 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 4879 bool all_l3e_AF_set) 4880 { 4881 4882 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4883 KASSERT(promoted || !all_l3e_AF_set, 4884 ("a zero-filled PTP can't have ATTR_AF set in every PTE")); 4885 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0; 4886 return (vm_radix_insert(&pmap->pm_root, mpte)); 4887 } 4888 4889 /* 4890 * Removes the page table page mapping the specified virtual address from the 4891 * specified pmap's collection of idle page table pages, and returns it. 4892 * Otherwise, returns NULL if there is no page table page corresponding to the 4893 * specified virtual address. 4894 */ 4895 static __inline vm_page_t 4896 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4897 { 4898 4899 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4900 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 4901 } 4902 4903 /* 4904 * Performs a break-before-make update of a pmap entry. 
This is needed when 4905 * either promoting or demoting pages to ensure the TLB doesn't get into an 4906 * inconsistent state. 4907 */ 4908 static void 4909 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte, 4910 vm_offset_t va, vm_size_t size) 4911 { 4912 register_t intr; 4913 4914 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4915 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0, 4916 ("%s: Updating non-promote pte", __func__)); 4917 4918 /* 4919 * Ensure we don't get switched out with the page table in an 4920 * inconsistent state. We also need to ensure no interrupts fire 4921 * as they may make use of an address we are about to invalidate. 4922 */ 4923 intr = intr_disable(); 4924 4925 /* 4926 * Clear the old mapping's valid bit, but leave the rest of the entry 4927 * unchanged, so that a lockless, concurrent pmap_kextract() can still 4928 * lookup the physical address. 4929 */ 4930 pmap_clear_bits(ptep, ATTR_DESCR_VALID); 4931 4932 /* 4933 * When promoting, the L{1,2}_TABLE entry that is being replaced might 4934 * be cached, so we invalidate intermediate entries as well as final 4935 * entries. 4936 */ 4937 pmap_s1_invalidate_range(pmap, va, va + size, false); 4938 4939 /* Create the new mapping */ 4940 pmap_store(ptep, newpte); 4941 dsb(ishst); 4942 4943 intr_restore(intr); 4944 } 4945 4946 /* 4947 * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping. 4948 */ 4949 static void __nosanitizecoverage 4950 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end, 4951 pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size) 4952 { 4953 pd_entry_t *lip; 4954 register_t intr; 4955 4956 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4957 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0, 4958 ("%s: Updating non-promote pte", __func__)); 4959 4960 /* 4961 * Ensure we don't get switched out with the page table in an 4962 * inconsistent state. 
We also need to ensure no interrupts fire 4963 * as they may make use of an address we are about to invalidate. 4964 */ 4965 intr = intr_disable(); 4966 4967 /* 4968 * Clear the old mapping's valid bits, but leave the rest of each 4969 * entry unchanged, so that a lockless, concurrent pmap_kextract() can 4970 * still lookup the physical address. 4971 */ 4972 for (lip = ptep; lip < ptep_end; lip++) 4973 pmap_clear_bits(lip, ATTR_DESCR_VALID); 4974 4975 /* Only final entries are changing. */ 4976 pmap_s1_invalidate_strided(pmap, va, va + size, stride, true); 4977 4978 /* Create the new mapping. */ 4979 for (lip = ptep; lip < ptep_end; lip++) { 4980 pmap_store(lip, newpte); 4981 newpte += stride; 4982 } 4983 dsb(ishst); 4984 4985 intr_restore(intr); 4986 } 4987 4988 #if VM_NRESERVLEVEL > 0 4989 /* 4990 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 4991 * replace the many pv entries for the 4KB page mappings by a single pv entry 4992 * for the 2MB page mapping. 4993 */ 4994 static void 4995 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 4996 struct rwlock **lockp) 4997 { 4998 struct md_page *pvh; 4999 pv_entry_t pv; 5000 vm_offset_t va_last; 5001 vm_page_t m; 5002 5003 KASSERT((pa & L2_OFFSET) == 0, 5004 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 5005 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5006 5007 /* 5008 * Transfer the first page's pv entry for this mapping to the 2mpage's 5009 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 5010 * a transfer avoids the possibility that get_pv_entry() calls 5011 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 5012 * mappings that is being promoted. 
5013 */ 5014 m = PHYS_TO_VM_PAGE(pa); 5015 va = va & ~L2_OFFSET; 5016 pv = pmap_pvh_remove(&m->md, pmap, va); 5017 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 5018 pvh = page_to_pvh(m); 5019 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5020 pvh->pv_gen++; 5021 /* Free the remaining NPTEPG - 1 pv entries. */ 5022 va_last = va + L2_SIZE - PAGE_SIZE; 5023 do { 5024 m++; 5025 va += PAGE_SIZE; 5026 pmap_pvh_free(&m->md, pmap, va); 5027 } while (va < va_last); 5028 } 5029 5030 /* 5031 * Tries to promote the 512, contiguous 4KB page mappings that are within a 5032 * single level 2 table entry to a single 2MB page mapping. For promotion 5033 * to occur, two conditions must be met: (1) the 4KB page mappings must map 5034 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 5035 * identical characteristics. 5036 */ 5037 static bool 5038 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte, 5039 struct rwlock **lockp) 5040 { 5041 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa; 5042 5043 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5044 5045 /* 5046 * Currently, this function only supports promotion on stage 1 pmaps 5047 * because it tests stage 1 specific fields and performs a break- 5048 * before-make sequence that is incorrect for stage 2 pmaps. 5049 */ 5050 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap)) 5051 return (false); 5052 5053 /* 5054 * Examine the first L3E in the specified PTP. Abort if this L3E is 5055 * ineligible for promotion... 5056 */ 5057 firstl3 = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 5058 newl2 = pmap_load(firstl3); 5059 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0) 5060 return (false); 5061 /* ... is not the first physical page within an L2 block */ 5062 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 || 5063 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... 
or is invalid */ 5064 counter_u64_add(pmap_l2_p_failures, 1); 5065 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 5066 " in pmap %p", va, pmap); 5067 return (false); 5068 } 5069 5070 /* 5071 * Both here and in the below "for" loop, to allow for repromotion 5072 * after MADV_FREE, conditionally write protect a clean L3E before 5073 * possibly aborting the promotion due to other L3E attributes. Why? 5074 * Suppose that MADV_FREE is applied to a part of a superpage, the 5075 * address range [S, E). pmap_advise() will demote the superpage 5076 * mapping, destroy the 4KB page mapping at the end of [S, E), and 5077 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later, 5078 * imagine that the memory in [S, E) is recycled, but the last 4KB 5079 * page in [S, E) is not the last to be rewritten, or simply accessed. 5080 * In other words, there is still a 4KB page in [S, E), call it P, 5081 * that is writeable but AP_RO is set and AF is clear in P's L3E. 5082 * Unless we write protect P before aborting the promotion, if and 5083 * when P is finally rewritten, there won't be a page fault to trigger 5084 * repromotion. 5085 */ 5086 setl2: 5087 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 5088 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 5089 /* 5090 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 5091 * ATTR_SW_DBM can be cleared without a TLB invalidation. 5092 */ 5093 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) 5094 goto setl2; 5095 newl2 &= ~ATTR_SW_DBM; 5096 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx" 5097 " in pmap %p", va & ~L2_OFFSET, pmap); 5098 } 5099 5100 /* 5101 * Examine each of the other L3Es in the specified PTP. Abort if this 5102 * L3E maps an unexpected 4KB physical page or does not have identical 5103 * characteristics to the first L3E. If ATTR_AF is not set in every 5104 * PTE, then request that the PTP be refilled on demotion. 
5105 */ 5106 all_l3e_AF = newl2 & ATTR_AF; 5107 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK)) 5108 + L2_SIZE - PAGE_SIZE; 5109 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 5110 oldl3 = pmap_load(l3); 5111 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) { 5112 counter_u64_add(pmap_l2_p_failures, 1); 5113 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 5114 " in pmap %p", va, pmap); 5115 return (false); 5116 } 5117 setl3: 5118 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 5119 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 5120 /* 5121 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 5122 * set, ATTR_SW_DBM can be cleared without a TLB 5123 * invalidation. 5124 */ 5125 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 5126 ~ATTR_SW_DBM)) 5127 goto setl3; 5128 oldl3 &= ~ATTR_SW_DBM; 5129 } 5130 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) { 5131 counter_u64_add(pmap_l2_p_failures, 1); 5132 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 5133 " in pmap %p", va, pmap); 5134 return (false); 5135 } 5136 all_l3e_AF &= oldl3; 5137 pa -= PAGE_SIZE; 5138 } 5139 5140 /* 5141 * Unless all PTEs have ATTR_AF set, clear it from the superpage 5142 * mapping, so that promotions triggered by speculative mappings, 5143 * such as pmap_enter_quick(), don't automatically mark the 5144 * underlying pages as referenced. 5145 */ 5146 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF; 5147 5148 /* 5149 * Save the page table page in its current state until the L2 5150 * mapping the superpage is demoted by pmap_demote_l2() or 5151 * destroyed by pmap_remove_l3(). 
5152 */ 5153 if (mpte == NULL) 5154 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 5155 KASSERT(mpte >= vm_page_array && 5156 mpte < &vm_page_array[vm_page_array_size], 5157 ("pmap_promote_l2: page table page is out of range")); 5158 KASSERT(mpte->pindex == pmap_l2_pindex(va), 5159 ("pmap_promote_l2: page table page's pindex is wrong")); 5160 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) { 5161 counter_u64_add(pmap_l2_p_failures, 1); 5162 CTR2(KTR_PMAP, 5163 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 5164 pmap); 5165 return (false); 5166 } 5167 5168 if ((newl2 & ATTR_SW_MANAGED) != 0) 5169 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp); 5170 5171 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE); 5172 5173 counter_u64_add(pmap_l2_promotions, 1); 5174 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 5175 pmap); 5176 return (true); 5177 } 5178 5179 /* 5180 * Tries to promote an aligned, contiguous set of base page mappings to a 5181 * single L3C page mapping. For promotion to occur, two conditions must be 5182 * met: (1) the base page mappings must map aligned, contiguous physical 5183 * memory and (2) the base page mappings must have identical characteristics 5184 * except for the accessed flag. 5185 */ 5186 static bool 5187 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va) 5188 { 5189 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa; 5190 5191 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5192 5193 /* 5194 * Currently, this function only supports promotion on stage 1 pmaps 5195 * because it tests stage 1 specific fields and performs a break- 5196 * before-make sequence that is incorrect for stage 2 pmaps. 5197 */ 5198 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap)) 5199 return (false); 5200 5201 /* 5202 * Compute the address of the first L3 entry in the superpage 5203 * candidate. 
5204 */ 5205 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 5206 sizeof(pt_entry_t)) - 1)); 5207 5208 firstl3c = pmap_load(l3p); 5209 5210 /* 5211 * Examine the first L3 entry. Abort if this L3E is ineligible for 5212 * promotion... 5213 */ 5214 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0) 5215 return (false); 5216 /* ...is not properly aligned... */ 5217 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 || 5218 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */ 5219 counter_u64_add(pmap_l3c_p_failures, 1); 5220 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 5221 " in pmap %p", va, pmap); 5222 return (false); 5223 } 5224 5225 /* 5226 * If the first L3 entry is a clean read-write mapping, convert it 5227 * to a read-only mapping. See pmap_promote_l2() for the rationale. 5228 */ 5229 set_first: 5230 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 5231 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 5232 /* 5233 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 5234 * ATTR_SW_DBM can be cleared without a TLB invalidation. 5235 */ 5236 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM)) 5237 goto set_first; 5238 firstl3c &= ~ATTR_SW_DBM; 5239 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx" 5240 " in pmap %p", va & ~L3C_OFFSET, pmap); 5241 } 5242 5243 /* 5244 * Check that the rest of the L3 entries are compatible with the first, 5245 * and convert clean read-write mappings to read-only mappings. 
5246 */ 5247 all_l3e_AF = firstl3c & ATTR_AF; 5248 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) + 5249 L3C_SIZE - PAGE_SIZE; 5250 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) { 5251 oldl3 = pmap_load(l3); 5252 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) { 5253 counter_u64_add(pmap_l3c_p_failures, 1); 5254 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 5255 " in pmap %p", va, pmap); 5256 return (false); 5257 } 5258 set_l3: 5259 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 5260 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 5261 /* 5262 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 5263 * set, ATTR_SW_DBM can be cleared without a TLB 5264 * invalidation. 5265 */ 5266 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 5267 ~ATTR_SW_DBM)) 5268 goto set_l3; 5269 oldl3 &= ~ATTR_SW_DBM; 5270 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx" 5271 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) | 5272 (va & ~L3C_OFFSET), pmap); 5273 } 5274 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) { 5275 counter_u64_add(pmap_l3c_p_failures, 1); 5276 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 5277 " in pmap %p", va, pmap); 5278 return (false); 5279 } 5280 all_l3e_AF &= oldl3; 5281 pa -= PAGE_SIZE; 5282 } 5283 5284 /* 5285 * Unless all PTEs have ATTR_AF set, clear it from the superpage 5286 * mapping, so that promotions triggered by speculative mappings, 5287 * such as pmap_enter_quick(), don't automatically mark the 5288 * underlying pages as referenced. 5289 */ 5290 firstl3c &= ~ATTR_AF | all_l3e_AF; 5291 5292 /* 5293 * Remake the mappings with the contiguous bit set. 
5294 */ 5295 pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c | 5296 ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE); 5297 5298 counter_u64_add(pmap_l3c_promotions, 1); 5299 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va, 5300 pmap); 5301 return (true); 5302 } 5303 #endif /* VM_NRESERVLEVEL > 0 */ 5304 5305 static int 5306 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags, 5307 int psind) 5308 { 5309 pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p; 5310 vm_page_t mp; 5311 5312 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5313 KASSERT(psind > 0 && psind < MAXPAGESIZES, 5314 ("psind %d unexpected", psind)); 5315 KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0, 5316 ("unaligned phys address %#lx pte %#lx psind %d", 5317 PTE_TO_PHYS(pte), pte, psind)); 5318 5319 restart: 5320 newpte = pte; 5321 if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte)) 5322 return (KERN_PROTECTION_FAILURE); 5323 if (psind == 3) { 5324 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 5325 5326 KASSERT(pagesizes[psind] == L1_SIZE, 5327 ("pagesizes[%d] != L1_SIZE", psind)); 5328 l0p = pmap_l0(pmap, va); 5329 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { 5330 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); 5331 if (mp == NULL) { 5332 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 5333 return (KERN_RESOURCE_SHORTAGE); 5334 PMAP_UNLOCK(pmap); 5335 vm_wait(NULL); 5336 PMAP_LOCK(pmap); 5337 goto restart; 5338 } 5339 l1p = pmap_l0_to_l1(l0p, va); 5340 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 5341 origpte = pmap_load(l1p); 5342 } else { 5343 l1p = pmap_l0_to_l1(l0p, va); 5344 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 5345 origpte = pmap_load(l1p); 5346 if ((origpte & ATTR_DESCR_VALID) == 0) { 5347 mp = PTE_TO_VM_PAGE(pmap_load(l0p)); 5348 mp->ref_count++; 5349 } 5350 } 5351 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) && 5352 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) || 5353 (origpte & ATTR_DESCR_VALID) == 
0, 5354 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", 5355 va, origpte, newpte)); 5356 pmap_store(l1p, newpte); 5357 } else if (psind == 2) { 5358 KASSERT(pagesizes[psind] == L2_SIZE, 5359 ("pagesizes[%d] != L2_SIZE", psind)); 5360 l2p = pmap_l2(pmap, va); 5361 if (l2p == NULL) { 5362 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); 5363 if (mp == NULL) { 5364 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 5365 return (KERN_RESOURCE_SHORTAGE); 5366 PMAP_UNLOCK(pmap); 5367 vm_wait(NULL); 5368 PMAP_LOCK(pmap); 5369 goto restart; 5370 } 5371 l2p = VM_PAGE_TO_DMAP(mp); 5372 l2p = &l2p[pmap_l2_index(va)]; 5373 origpte = pmap_load(l2p); 5374 } else { 5375 l1p = pmap_l1(pmap, va); 5376 origpte = pmap_load(l2p); 5377 if ((origpte & ATTR_DESCR_VALID) == 0) { 5378 mp = PTE_TO_VM_PAGE(pmap_load(l1p)); 5379 mp->ref_count++; 5380 } 5381 } 5382 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 5383 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && 5384 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)), 5385 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", 5386 va, origpte, newpte)); 5387 pmap_store(l2p, newpte); 5388 } else /* (psind == 1) */ { 5389 KASSERT(pagesizes[psind] == L3C_SIZE, 5390 ("pagesizes[%d] != L3C_SIZE", psind)); 5391 l2p = pmap_l2(pmap, va); 5392 if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) { 5393 mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL); 5394 if (mp == NULL) { 5395 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 5396 return (KERN_RESOURCE_SHORTAGE); 5397 PMAP_UNLOCK(pmap); 5398 vm_wait(NULL); 5399 PMAP_LOCK(pmap); 5400 goto restart; 5401 } 5402 mp->ref_count += L3C_ENTRIES - 1; 5403 l3p = VM_PAGE_TO_DMAP(mp); 5404 l3p = &l3p[pmap_l3_index(va)]; 5405 } else { 5406 l3p = pmap_l2_to_l3(l2p, va); 5407 if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) { 5408 mp = PTE_TO_VM_PAGE(pmap_load(l2p)); 5409 mp->ref_count += L3C_ENTRIES; 5410 } 5411 } 5412 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 5413 origpte = pmap_load(tl3p); 5414 KASSERT((origpte & 
ATTR_DESCR_VALID) == 0 || 5415 ((origpte & ATTR_CONTIGUOUS) != 0 && 5416 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)), 5417 ("va %#lx changing 64K phys page l3 %#lx newpte %#lx", 5418 va, origpte, newpte)); 5419 pmap_store(tl3p, newpte); 5420 newpte += L3_SIZE; 5421 } 5422 } 5423 dsb(ishst); 5424 5425 if ((origpte & ATTR_DESCR_VALID) == 0) 5426 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); 5427 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) 5428 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 5429 else if ((newpte & ATTR_SW_WIRED) == 0 && 5430 (origpte & ATTR_SW_WIRED) != 0) 5431 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 5432 5433 return (KERN_SUCCESS); 5434 } 5435 5436 static void 5437 pmap_set_unprotected(pt_entry_t new_l3) 5438 { 5439 vm_paddr_t pa; 5440 5441 pa = PTE_TO_PHYS(new_l3) & ~prot_ns_shared_pa; 5442 5443 rsi_set_addr_range_state(pa, pa + L3_SIZE, RSI_RIPAS_EMPTY, 5444 RSI_CHANGE_DESTROYED, NULL); 5445 } 5446 5447 static void 5448 pmap_set_protected(pt_entry_t old_l3) 5449 { 5450 vm_paddr_t pa; 5451 5452 pa = PTE_TO_PHYS(old_l3) & ~prot_ns_shared_pa; 5453 5454 rsi_set_addr_range_state(pa, pa + L3_SIZE, RSI_RIPAS_RAM, 5455 RSI_CHANGE_DESTROYED, NULL); 5456 } 5457 5458 /* 5459 * Insert the given physical page (p) at 5460 * the specified virtual address (v) in the 5461 * target physical map with the protection requested. 5462 * 5463 * If specified, the page will be wired down, meaning 5464 * that the related pte can not be reclaimed. 5465 * 5466 * NB: This is the only routine which MAY NOT lazy-evaluate 5467 * or lose information. That is, this routine must actually 5468 * insert this page into the given map NOW. 
5469 */ 5470 int 5471 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 5472 u_int flags, int8_t psind) 5473 { 5474 struct rwlock *lock; 5475 pd_entry_t *pde; 5476 pt_entry_t new_l3, orig_l3; 5477 pt_entry_t *l2, *l3; 5478 pv_entry_t pv; 5479 vm_paddr_t opa, pa; 5480 vm_page_t mpte, om; 5481 bool nosleep; 5482 int full_lvl, lvl, rv; 5483 5484 KASSERT(ADDR_IS_CANONICAL(va), 5485 ("%s: Address not in canonical form: %lx", __func__, va)); 5486 5487 va = trunc_page(va); 5488 if ((m->oflags & VPO_UNMANAGED) == 0) 5489 VM_PAGE_OBJECT_BUSY_ASSERT(m); 5490 pa = VM_PAGE_TO_PHYS(m); 5491 if (in_realm() && (flags & PMAP_ENTER_UNPROTECTED) != 0) 5492 pa |= prot_ns_shared_pa; 5493 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr | 5494 L3_PAGE); 5495 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); 5496 new_l3 |= pmap_pte_prot(pmap, prot); 5497 if ((flags & PMAP_ENTER_WIRED) != 0) 5498 new_l3 |= ATTR_SW_WIRED; 5499 if (pmap->pm_stage == PM_STAGE1) { 5500 if (ADDR_IS_USER(va)) 5501 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5502 else 5503 new_l3 |= ATTR_S1_UXN; 5504 if (pmap != kernel_pmap) 5505 new_l3 |= ATTR_S1_nG; 5506 } else { 5507 /* 5508 * Clear the access flag on executable mappings, this will be 5509 * set later when the page is accessed. The fault handler is 5510 * required to invalidate the I-cache. 5511 * 5512 * TODO: Switch to the valid flag to allow hardware management 5513 * of the access flag. Much of the pmap code assumes the 5514 * valid flag is set and fails to destroy the old page tables 5515 * correctly if it is clear. 
5516 */ 5517 if (prot & VM_PROT_EXECUTE) 5518 new_l3 &= ~ATTR_AF; 5519 } 5520 if ((m->oflags & VPO_UNMANAGED) == 0) { 5521 new_l3 |= ATTR_SW_MANAGED; 5522 if ((prot & VM_PROT_WRITE) != 0) { 5523 new_l3 |= ATTR_SW_DBM; 5524 if ((flags & VM_PROT_WRITE) == 0) { 5525 if (pmap->pm_stage == PM_STAGE1) 5526 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 5527 else 5528 new_l3 &= 5529 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 5530 } 5531 } 5532 } 5533 5534 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 5535 5536 lock = NULL; 5537 PMAP_LOCK(pmap); 5538 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 5539 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 5540 ("managed largepage va %#lx flags %#x", va, flags)); 5541 if (psind == 3) { 5542 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 5543 new_l3 &= ~L3_PAGE; 5544 new_l3 |= L1_BLOCK; 5545 } else if (psind == 2) { 5546 new_l3 &= ~L3_PAGE; 5547 new_l3 |= L2_BLOCK; 5548 } else /* (psind == 1) */ 5549 new_l3 |= ATTR_CONTIGUOUS; 5550 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); 5551 goto out; 5552 } 5553 if (psind == 2) { 5554 /* Assert the required virtual and physical alignment. */ 5555 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); 5556 KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind")); 5557 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, 5558 flags, m, &lock); 5559 goto out; 5560 } 5561 mpte = NULL; 5562 if (psind == 1) { 5563 KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned")); 5564 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 5565 rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags, 5566 m, &mpte, &lock); 5567 #if VM_NRESERVLEVEL > 0 5568 /* 5569 * Attempt L2 promotion, if both the PTP and a level 1 5570 * reservation are fully populated. 
5571 */ 5572 if (rv == KERN_SUCCESS && 5573 (mpte == NULL || mpte->ref_count == NL3PG) && 5574 (m->flags & PG_FICTITIOUS) == 0 && 5575 vm_reserv_level_iffullpop(m) == 1) { 5576 pde = pmap_l2(pmap, va); 5577 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock); 5578 } 5579 #endif 5580 goto out; 5581 } 5582 5583 /* 5584 * In the case that a page table page is not 5585 * resident, we are creating it here. 5586 */ 5587 retry: 5588 pde = pmap_pde(pmap, va, &lvl); 5589 if (pde != NULL && lvl == 2) { 5590 l3 = pmap_l2_to_l3(pde, va); 5591 if (ADDR_IS_USER(va) && mpte == NULL) { 5592 mpte = PTE_TO_VM_PAGE(pmap_load(pde)); 5593 mpte->ref_count++; 5594 } 5595 goto havel3; 5596 } else if (pde != NULL && lvl == 1) { 5597 l2 = pmap_l1_to_l2(pde, va); 5598 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && 5599 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { 5600 l3 = &l3[pmap_l3_index(va)]; 5601 if (ADDR_IS_USER(va)) { 5602 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 5603 mpte->ref_count++; 5604 } 5605 goto havel3; 5606 } 5607 /* We need to allocate an L3 table. */ 5608 } 5609 if (ADDR_IS_USER(va)) { 5610 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 5611 5612 /* 5613 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order 5614 * to handle the possibility that a superpage mapping for "va" 5615 * was created while we slept. 5616 */ 5617 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), 5618 nosleep ? NULL : &lock); 5619 if (mpte == NULL && nosleep) { 5620 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 5621 rv = KERN_RESOURCE_SHORTAGE; 5622 goto out; 5623 } 5624 goto retry; 5625 } else 5626 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 5627 5628 havel3: 5629 orig_l3 = pmap_load(l3); 5630 opa = PTE_TO_PHYS(orig_l3); 5631 pv = NULL; 5632 new_l3 |= pmap_pte_bti(pmap, va); 5633 5634 /* 5635 * Is the specified virtual address already mapped? 5636 */ 5637 if (pmap_l3_valid(orig_l3)) { 5638 /* 5639 * Wiring change, just update stats. 
We don't worry about 5640 * wiring PT pages as they remain resident as long as there 5641 * are valid mappings in them. Hence, if a user page is wired, 5642 * the PT page will be also. 5643 */ 5644 if ((flags & PMAP_ENTER_WIRED) != 0 && 5645 (orig_l3 & ATTR_SW_WIRED) == 0) 5646 pmap->pm_stats.wired_count++; 5647 else if ((flags & PMAP_ENTER_WIRED) == 0 && 5648 (orig_l3 & ATTR_SW_WIRED) != 0) 5649 pmap->pm_stats.wired_count--; 5650 5651 /* 5652 * Remove the extra PT page reference. 5653 */ 5654 if (mpte != NULL) { 5655 mpte->ref_count--; 5656 KASSERT(mpte->ref_count > 0, 5657 ("pmap_enter: missing reference to page table page," 5658 " va: 0x%lx", va)); 5659 } 5660 5661 /* 5662 * Has the physical page changed? 5663 */ 5664 if (opa == pa) { 5665 /* 5666 * No, might be a protection or wiring change. 5667 */ 5668 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 5669 (new_l3 & ATTR_SW_DBM) != 0) 5670 vm_page_aflag_set(m, PGA_WRITEABLE); 5671 goto validate; 5672 } 5673 5674 /* 5675 * The physical page has changed. Temporarily invalidate 5676 * the mapping. 5677 */ 5678 if ((orig_l3 & ATTR_CONTIGUOUS) != 0) 5679 (void)pmap_demote_l3c(pmap, l3, va); 5680 orig_l3 = pmap_load_clear(l3); 5681 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 5682 ("pmap_enter: unexpected pa update for %#lx", va)); 5683 if ((orig_l3 & ATTR_SW_MANAGED) != 0) { 5684 om = PHYS_TO_VM_PAGE(opa); 5685 5686 /* 5687 * The pmap lock is sufficient to synchronize with 5688 * concurrent calls to pmap_page_test_mappings() and 5689 * pmap_ts_referenced(). 
5690 */ 5691 if (pmap_pte_dirty(pmap, orig_l3)) 5692 vm_page_dirty(om); 5693 if ((orig_l3 & ATTR_AF) != 0) { 5694 pmap_invalidate_page(pmap, va, true); 5695 vm_page_aflag_set(om, PGA_REFERENCED); 5696 } 5697 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om); 5698 pv = pmap_pvh_remove(&om->md, pmap, va); 5699 if ((m->oflags & VPO_UNMANAGED) != 0) 5700 free_pv_entry(pmap, pv); 5701 if ((om->a.flags & PGA_WRITEABLE) != 0 && 5702 TAILQ_EMPTY(&om->md.pv_list) && 5703 ((om->flags & PG_FICTITIOUS) != 0 || 5704 TAILQ_EMPTY(&page_to_pvh(om)->pv_list))) 5705 vm_page_aflag_clear(om, PGA_WRITEABLE); 5706 } else { 5707 KASSERT((orig_l3 & ATTR_AF) != 0, 5708 ("pmap_enter: unmanaged mapping lacks ATTR_AF")); 5709 pmap_invalidate_page(pmap, va, true); 5710 } 5711 orig_l3 = 0; 5712 } else { 5713 /* 5714 * Increment the counters. 5715 */ 5716 if ((new_l3 & ATTR_SW_WIRED) != 0) 5717 pmap->pm_stats.wired_count++; 5718 pmap_resident_count_inc(pmap, 1); 5719 } 5720 /* 5721 * Enter on the PV list if part of our managed memory. 5722 */ 5723 if ((m->oflags & VPO_UNMANAGED) == 0) { 5724 if (pv == NULL) { 5725 pv = get_pv_entry(pmap, &lock); 5726 pv->pv_va = va; 5727 } 5728 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5729 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5730 m->md.pv_gen++; 5731 if ((new_l3 & ATTR_SW_DBM) != 0) 5732 vm_page_aflag_set(m, PGA_WRITEABLE); 5733 } 5734 5735 validate: 5736 if (pmap->pm_stage == PM_STAGE1) { 5737 /* 5738 * Sync icache if exec permission and attribute 5739 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping 5740 * is stored and made valid for hardware table walk. If done 5741 * later, then other can access this page before caches are 5742 * properly synced. Don't do it for kernel memory which is 5743 * mapped with exec permission even if the memory isn't going 5744 * to hold executable code. The only time when icache sync is 5745 * needed is after kernel module is loaded and the relocation 5746 * info is processed. 
And it's done in elf_cpu_load_file(). 5747 */ 5748 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 5749 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && 5750 (opa != pa || (orig_l3 & ATTR_S1_XN))) { 5751 PMAP_ASSERT_STAGE1(pmap); 5752 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 5753 } 5754 } else { 5755 cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 5756 } 5757 5758 /* 5759 * Update the L3 entry 5760 */ 5761 if (pmap_l3_valid(orig_l3)) { 5762 KASSERT(opa == pa, ("pmap_enter: invalid update")); 5763 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { 5764 /* same PA, different attributes */ 5765 if ((orig_l3 & ATTR_CONTIGUOUS) != 0) 5766 (void)pmap_demote_l3c(pmap, l3, va); 5767 orig_l3 = pmap_load_store(l3, new_l3); 5768 pmap_invalidate_page(pmap, va, true); 5769 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 5770 pmap_pte_dirty(pmap, orig_l3)) 5771 vm_page_dirty(m); 5772 } else { 5773 /* 5774 * orig_l3 == new_l3 5775 * This can happens if multiple threads simultaneously 5776 * access not yet mapped page. This bad for performance 5777 * since this can cause full demotion-NOP-promotion 5778 * cycle. 5779 * Another possible reasons are: 5780 * - VM and pmap memory layout are diverged 5781 * - tlb flush is missing somewhere and CPU doesn't see 5782 * actual mapping. 5783 */ 5784 CTR4(KTR_PMAP, "%s: already mapped page - " 5785 "pmap %p va 0x%#lx pte 0x%lx", 5786 __func__, pmap, va, new_l3); 5787 } 5788 } else { 5789 /* New mapping */ 5790 pmap_store(l3, new_l3); 5791 dsb(ishst); 5792 } 5793 5794 #if VM_NRESERVLEVEL > 0 5795 /* 5796 * First, attempt L3C promotion, if the virtual and physical addresses 5797 * are aligned with each other and an underlying reservation has the 5798 * neighboring L3 pages allocated. The first condition is simply an 5799 * optimization that recognizes some eventual promotion failures early 5800 * at a lower run-time cost. Then, if both a level 1 reservation and 5801 * the PTP are fully populated, attempt L2 promotion. 
5802 */ 5803 if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) && 5804 (m->flags & PG_FICTITIOUS) == 0 && 5805 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 && 5806 pmap_promote_l3c(pmap, l3, va) && 5807 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) 5808 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock); 5809 #endif 5810 5811 rv = KERN_SUCCESS; 5812 5813 if (in_realm() && (flags & PMAP_ENTER_UNPROTECTED) != 0) 5814 pmap_set_unprotected(new_l3); 5815 5816 out: 5817 if (lock != NULL) 5818 rw_wunlock(lock); 5819 PMAP_UNLOCK(pmap); 5820 return (rv); 5821 } 5822 5823 /* 5824 * Tries to create a read- and/or execute-only L2 page mapping. Returns 5825 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 5826 * value. See pmap_enter_l2() for the possible error values when "no sleep", 5827 * "no replace", and "no reclaim" are specified. 5828 */ 5829 static int 5830 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 5831 struct rwlock **lockp) 5832 { 5833 pd_entry_t new_l2; 5834 5835 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5836 PMAP_ASSERT_STAGE1(pmap); 5837 KASSERT(ADDR_IS_CANONICAL(va), 5838 ("%s: Address not in canonical form: %lx", __func__, va)); 5839 5840 new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | pmap_sh_attr | 5841 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 5842 L2_BLOCK); 5843 if ((m->oflags & VPO_UNMANAGED) == 0) 5844 new_l2 |= ATTR_SW_MANAGED; 5845 else 5846 new_l2 |= ATTR_AF; 5847 if ((prot & VM_PROT_EXECUTE) == 0 || 5848 m->md.pv_memattr == VM_MEMATTR_DEVICE) 5849 new_l2 |= ATTR_S1_XN; 5850 if (ADDR_IS_USER(va)) 5851 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5852 else 5853 new_l2 |= ATTR_S1_UXN; 5854 if (pmap != kernel_pmap) 5855 new_l2 |= ATTR_S1_nG; 5856 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 5857 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp)); 5858 } 5859 5860 /* 5861 * Returns true if every page table entry in the specified page table is 5862 * zero. 
5863 */ 5864 static bool 5865 pmap_every_pte_zero(vm_paddr_t pa) 5866 { 5867 pt_entry_t *pt_end, *pte; 5868 5869 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 5870 pte = PHYS_TO_DMAP(pa); 5871 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 5872 if (*pte != 0) 5873 return (false); 5874 } 5875 return (true); 5876 } 5877 5878 /* 5879 * Tries to create the specified L2 page mapping. Returns KERN_SUCCESS if 5880 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 5881 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 5882 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists 5883 * within the L2 virtual address range starting at the specified virtual 5884 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 5885 * L2 page mapping already exists at the specified virtual address. Returns 5886 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 5887 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 5888 * and a PV entry allocation failed. 5889 */ 5890 static int 5891 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 5892 vm_page_t m, struct rwlock **lockp) 5893 { 5894 struct spglist free; 5895 pd_entry_t *l2, old_l2; 5896 vm_page_t l2pg, mt; 5897 vm_page_t uwptpg; 5898 5899 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5900 KASSERT(ADDR_IS_CANONICAL(va), 5901 ("%s: Address not in canonical form: %lx", __func__, va)); 5902 KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) != 5903 PMAP_ENTER_NORECLAIM, 5904 ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE")); 5905 5906 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & 5907 PMAP_ENTER_NOSLEEP) != 0 ? 
NULL : lockp)) == NULL) { 5908 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 5909 va, pmap); 5910 return (KERN_RESOURCE_SHORTAGE); 5911 } 5912 5913 /* 5914 * If bti is not the same for the whole l2 range, return failure 5915 * and let vm_fault() cope. Check after l2 allocation, since 5916 * it could sleep. 5917 */ 5918 if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) { 5919 KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP")); 5920 pmap_abort_ptp(pmap, va, l2pg); 5921 return (KERN_PROTECTION_FAILURE); 5922 } 5923 5924 /* 5925 * If there are existing mappings, either abort or remove them. 5926 */ 5927 if ((old_l2 = pmap_load(l2)) != 0) { 5928 KASSERT(l2pg == NULL || l2pg->ref_count > 1, 5929 ("pmap_enter_l2: l2pg's ref count is too low")); 5930 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 5931 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) { 5932 if (l2pg != NULL) 5933 l2pg->ref_count--; 5934 CTR2(KTR_PMAP, 5935 "pmap_enter_l2: no space for va %#lx" 5936 " in pmap %p", va, pmap); 5937 return (KERN_NO_SPACE); 5938 } else if (ADDR_IS_USER(va) || 5939 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) { 5940 if (l2pg != NULL) 5941 l2pg->ref_count--; 5942 CTR2(KTR_PMAP, 5943 "pmap_enter_l2: failure for va %#lx" 5944 " in pmap %p", va, pmap); 5945 return (KERN_FAILURE); 5946 } 5947 } 5948 SLIST_INIT(&free); 5949 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) { 5950 (void)pmap_remove_l2(pmap, l2, va, 5951 pmap_load(pmap_l1(pmap, va)), false, &free, lockp); 5952 } else { 5953 if (ADDR_IS_KERNEL(va)) { 5954 /* 5955 * Try to save the ptp in the trie 5956 * before any changes to mappings are 5957 * made. Abort on failure. 
5958 */ 5959 mt = PTE_TO_VM_PAGE(old_l2); 5960 if (pmap_insert_pt_page(pmap, mt, false, 5961 false)) { 5962 CTR1(KTR_PMAP, 5963 "pmap_enter_l2: cannot ins kern ptp va %#lx", 5964 va); 5965 return (KERN_RESOURCE_SHORTAGE); 5966 } 5967 /* 5968 * Both pmap_remove_l2() and 5969 * pmap_remove_l3_range() will zero fill 5970 * the L3 kernel page table page. 5971 */ 5972 } 5973 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, 5974 &free, lockp); 5975 if (ADDR_IS_KERNEL(va)) { 5976 /* 5977 * The TLB could have an intermediate 5978 * entry for the L3 kernel page table 5979 * page, so request an invalidation at 5980 * all levels after clearing the 5981 * L2_TABLE entry. 5982 */ 5983 pmap_clear(l2); 5984 pmap_s1_invalidate_page(pmap, va, false); 5985 } 5986 } 5987 KASSERT(pmap_load(l2) == 0, 5988 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 5989 if (ADDR_IS_USER(va)) { 5990 vm_page_free_pages_toq(&free, true); 5991 } else { 5992 KASSERT(SLIST_EMPTY(&free), 5993 ("pmap_enter_l2: freed kernel page table page")); 5994 } 5995 } 5996 5997 /* 5998 * Allocate leaf ptpage for wired userspace pages. 5999 */ 6000 uwptpg = NULL; 6001 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) { 6002 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); 6003 if (uwptpg == NULL) { 6004 pmap_abort_ptp(pmap, va, l2pg); 6005 return (KERN_RESOURCE_SHORTAGE); 6006 } 6007 uwptpg->pindex = pmap_l2_pindex(va); 6008 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 6009 vm_page_unwire_noq(uwptpg); 6010 vm_page_free(uwptpg); 6011 pmap_abort_ptp(pmap, va, l2pg); 6012 return (KERN_RESOURCE_SHORTAGE); 6013 } 6014 pmap_resident_count_inc(pmap, 1); 6015 uwptpg->ref_count = NL3PG; 6016 } 6017 if ((new_l2 & ATTR_SW_MANAGED) != 0) { 6018 /* 6019 * Abort this mapping if its PV entry could not be created. 
6020 */ 6021 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 6022 if (l2pg != NULL) 6023 pmap_abort_ptp(pmap, va, l2pg); 6024 else { 6025 KASSERT(ADDR_IS_KERNEL(va) && 6026 (pmap_load(l2) & ATTR_DESCR_MASK) == 6027 L2_TABLE, 6028 ("pmap_enter_l2: invalid kernel L2E")); 6029 mt = pmap_remove_pt_page(pmap, va); 6030 KASSERT(mt != NULL, 6031 ("pmap_enter_l2: missing kernel PTP")); 6032 } 6033 if (uwptpg != NULL) { 6034 mt = pmap_remove_pt_page(pmap, va); 6035 KASSERT(mt == uwptpg, 6036 ("removed pt page %p, expected %p", mt, 6037 uwptpg)); 6038 pmap_resident_count_dec(pmap, 1); 6039 uwptpg->ref_count = 1; 6040 vm_page_unwire_noq(uwptpg); 6041 vm_page_free(uwptpg); 6042 } 6043 CTR2(KTR_PMAP, 6044 "pmap_enter_l2: failure for va %#lx in pmap %p", 6045 va, pmap); 6046 return (KERN_RESOURCE_SHORTAGE); 6047 } 6048 if ((new_l2 & ATTR_SW_DBM) != 0) 6049 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 6050 vm_page_aflag_set(mt, PGA_WRITEABLE); 6051 } 6052 6053 /* 6054 * Increment counters. 6055 */ 6056 if ((new_l2 & ATTR_SW_WIRED) != 0) 6057 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 6058 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 6059 6060 /* 6061 * Conditionally sync the icache. See pmap_enter() for details. 6062 */ 6063 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) != 6064 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) && 6065 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) { 6066 cpu_icache_sync_range(PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)), 6067 L2_SIZE); 6068 } 6069 6070 /* 6071 * Map the superpage. 6072 */ 6073 pmap_store(l2, new_l2); 6074 dsb(ishst); 6075 6076 counter_u64_add(pmap_l2_mappings, 1); 6077 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 6078 va, pmap); 6079 6080 return (KERN_SUCCESS); 6081 } 6082 6083 /* 6084 * Tries to create a read- and/or execute-only L3C page mapping. Returns 6085 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 6086 * value. 
6087 */ 6088 static int 6089 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p, 6090 vm_prot_t prot, struct rwlock **lockp) 6091 { 6092 pt_entry_t l3e; 6093 6094 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6095 PMAP_ASSERT_STAGE1(pmap); 6096 KASSERT(ADDR_IS_CANONICAL(va), 6097 ("%s: Address not in canonical form: %lx", __func__, va)); 6098 6099 l3e = VM_PAGE_TO_PTE(m) | pmap_sh_attr | 6100 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 6101 ATTR_CONTIGUOUS | L3_PAGE; 6102 if ((m->oflags & VPO_UNMANAGED) == 0) 6103 l3e |= ATTR_SW_MANAGED; 6104 else 6105 l3e |= ATTR_AF; 6106 if ((prot & VM_PROT_EXECUTE) == 0 || 6107 m->md.pv_memattr == VM_MEMATTR_DEVICE) 6108 l3e |= ATTR_S1_XN; 6109 if (ADDR_IS_USER(va)) 6110 l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 6111 else 6112 l3e |= ATTR_S1_UXN; 6113 if (pmap != kernel_pmap) 6114 l3e |= ATTR_S1_nG; 6115 return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP | 6116 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp)); 6117 } 6118 6119 static int 6120 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags, 6121 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp) 6122 { 6123 pd_entry_t *l2p, *pde; 6124 pt_entry_t *l3p, *tl3p; 6125 vm_page_t mt; 6126 vm_paddr_t pa; 6127 vm_pindex_t l2pindex; 6128 int lvl; 6129 6130 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6131 KASSERT((va & L3C_OFFSET) == 0, 6132 ("pmap_enter_l3c: va is not aligned")); 6133 KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0, 6134 ("pmap_enter_l3c: managed mapping within the clean submap")); 6135 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 6136 ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS")); 6137 6138 /* 6139 * If the L3 PTP is not resident, we attempt to create it here. 6140 */ 6141 if (ADDR_IS_USER(va)) { 6142 /* 6143 * Were we given the correct L3 PTP? If so, we can simply 6144 * increment its ref count. 
6145 */ 6146 l2pindex = pmap_l2_pindex(va); 6147 if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) { 6148 (*ml3p)->ref_count += L3C_ENTRIES; 6149 } else { 6150 retry: 6151 /* 6152 * Get the L2 entry. 6153 */ 6154 pde = pmap_pde(pmap, va, &lvl); 6155 6156 /* 6157 * If the L2 entry is a superpage, we either abort or 6158 * demote depending on the given flags. 6159 */ 6160 if (lvl == 1) { 6161 l2p = pmap_l1_to_l2(pde, va); 6162 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == 6163 L2_BLOCK) { 6164 if ((flags & PMAP_ENTER_NOREPLACE) != 0) 6165 return (KERN_FAILURE); 6166 l3p = pmap_demote_l2_locked(pmap, l2p, 6167 va, lockp); 6168 if (l3p != NULL) { 6169 *ml3p = PTE_TO_VM_PAGE( 6170 pmap_load(l2p)); 6171 (*ml3p)->ref_count += 6172 L3C_ENTRIES; 6173 goto have_l3p; 6174 } 6175 } 6176 /* We need to allocate an L3 PTP. */ 6177 } 6178 6179 /* 6180 * If the L3 PTP is mapped, we just increment its ref 6181 * count. Otherwise, we attempt to allocate it. 6182 */ 6183 if (lvl == 2 && pmap_load(pde) != 0) { 6184 *ml3p = PTE_TO_VM_PAGE(pmap_load(pde)); 6185 (*ml3p)->ref_count += L3C_ENTRIES; 6186 } else { 6187 *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags & 6188 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp); 6189 if (*ml3p == NULL) { 6190 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 6191 return (KERN_FAILURE); 6192 6193 /* 6194 * The page table may have changed 6195 * while we slept. 6196 */ 6197 goto retry; 6198 } 6199 (*ml3p)->ref_count += L3C_ENTRIES - 1; 6200 } 6201 } 6202 l3p = VM_PAGE_TO_DMAP(*ml3p); 6203 } else { 6204 *ml3p = NULL; 6205 6206 /* 6207 * If the L2 entry is a superpage, we either abort or demote 6208 * depending on the given flags. 
6209 */ 6210 pde = pmap_pde(kernel_pmap, va, &lvl); 6211 if (lvl == 1) { 6212 l2p = pmap_l1_to_l2(pde, va); 6213 KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK, 6214 ("pmap_enter_l3c: missing L2 block")); 6215 if ((flags & PMAP_ENTER_NOREPLACE) != 0) 6216 return (KERN_FAILURE); 6217 l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp); 6218 } else { 6219 KASSERT(lvl == 2, 6220 ("pmap_enter_l3c: Invalid level %d", lvl)); 6221 l3p = PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(pde))); 6222 } 6223 } 6224 have_l3p: 6225 l3p = &l3p[pmap_l3_index(va)]; 6226 6227 /* 6228 * If bti is not the same for the whole L3C range, return failure 6229 * and let vm_fault() cope. Check after L3 allocation, since 6230 * it could sleep. 6231 */ 6232 if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) { 6233 KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP")); 6234 (*ml3p)->ref_count -= L3C_ENTRIES - 1; 6235 pmap_abort_ptp(pmap, va, *ml3p); 6236 *ml3p = NULL; 6237 return (KERN_PROTECTION_FAILURE); 6238 } 6239 6240 /* 6241 * If there are existing mappings, either abort or remove them. 6242 */ 6243 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 6244 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 6245 if (pmap_load(tl3p) != 0) { 6246 if (*ml3p != NULL) 6247 (*ml3p)->ref_count -= L3C_ENTRIES; 6248 return (KERN_FAILURE); 6249 } 6250 } 6251 } else { 6252 /* 6253 * Because we increment the L3 page's reference count above, 6254 * it is guaranteed not to be freed here and we can pass NULL 6255 * instead of a valid free list. 6256 */ 6257 pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va, 6258 va + L3C_SIZE, NULL, lockp); 6259 } 6260 6261 /* 6262 * Enter on the PV list if part of our managed memory. 
6263 */ 6264 if ((l3e & ATTR_SW_MANAGED) != 0) { 6265 if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) { 6266 if (*ml3p != NULL) { 6267 (*ml3p)->ref_count -= L3C_ENTRIES - 1; 6268 pmap_abort_ptp(pmap, va, *ml3p); 6269 *ml3p = NULL; 6270 } 6271 return (KERN_RESOURCE_SHORTAGE); 6272 } 6273 if ((l3e & ATTR_SW_DBM) != 0) 6274 for (mt = m; mt < &m[L3C_ENTRIES]; mt++) 6275 vm_page_aflag_set(mt, PGA_WRITEABLE); 6276 } 6277 6278 /* 6279 * Increment counters. 6280 */ 6281 if ((l3e & ATTR_SW_WIRED) != 0) 6282 pmap->pm_stats.wired_count += L3C_ENTRIES; 6283 pmap_resident_count_inc(pmap, L3C_ENTRIES); 6284 6285 pa = VM_PAGE_TO_PHYS(m); 6286 KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned")); 6287 6288 /* 6289 * Sync the icache before the mapping is stored. 6290 */ 6291 if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap && 6292 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 6293 cpu_icache_sync_range(PHYS_TO_DMAP(pa), L3C_SIZE); 6294 6295 /* 6296 * Map the superpage. 6297 */ 6298 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 6299 pmap_store(tl3p, l3e); 6300 l3e += L3_SIZE; 6301 } 6302 dsb(ishst); 6303 6304 counter_u64_add(pmap_l3c_mappings, 1); 6305 CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p", 6306 va, pmap); 6307 return (KERN_SUCCESS); 6308 } 6309 6310 /* 6311 * Maps a sequence of resident pages belonging to the same object. 6312 * The sequence begins with the given page m_start. This page is 6313 * mapped at the given virtual address start. Each subsequent page is 6314 * mapped at a virtual address that is offset from start by the same 6315 * amount as the page is offset from m_start within the object. The 6316 * last page in the sequence is the page with the largest offset from 6317 * m_start that can be mapped at a virtual address less than the given 6318 * virtual address end. 
Not every virtual page between start and end 6319 * is mapped; only those for which a resident page exists with the 6320 * corresponding offset from m_start are mapped. 6321 */ 6322 void 6323 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 6324 vm_page_t m_start, vm_prot_t prot) 6325 { 6326 struct pctrie_iter pages; 6327 struct rwlock *lock; 6328 vm_offset_t va; 6329 vm_page_t m, mpte; 6330 int rv; 6331 6332 VM_OBJECT_ASSERT_LOCKED(m_start->object); 6333 6334 mpte = NULL; 6335 vm_page_iter_limit_init(&pages, m_start->object, 6336 m_start->pindex + atop(end - start)); 6337 m = vm_radix_iter_lookup(&pages, m_start->pindex); 6338 lock = NULL; 6339 PMAP_LOCK(pmap); 6340 while (m != NULL) { 6341 va = start + ptoa(m->pindex - m_start->pindex); 6342 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 6343 m->psind == 2 && pmap_ps_enabled(pmap) && 6344 ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) == 6345 KERN_SUCCESS || rv == KERN_NO_SPACE)) { 6346 m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE); 6347 } else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end && 6348 m->psind >= 1 && pmap_ps_enabled(pmap) && 6349 ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot, 6350 &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) { 6351 m = vm_radix_iter_jump(&pages, L3C_ENTRIES); 6352 } else { 6353 /* 6354 * In general, if a superpage mapping were possible, 6355 * it would have been created above. That said, if 6356 * start and end are not superpage aligned, then 6357 * promotion might be possible at the ends of [start, 6358 * end). However, in practice, those promotion 6359 * attempts are so unlikely to succeed that they are 6360 * not worth trying. 6361 */ 6362 mpte = pmap_enter_quick_locked(pmap, va, m, prot | 6363 VM_PROT_NO_PROMOTE, mpte, &lock); 6364 m = vm_radix_iter_step(&pages); 6365 } 6366 } 6367 if (lock != NULL) 6368 rw_wunlock(lock); 6369 PMAP_UNLOCK(pmap); 6370 } 6371 6372 /* 6373 * this code makes some *MAJOR* assumptions: 6374 * 1. 
Current pmap & pmap exists. 6375 * 2. Not wired. 6376 * 3. Read access. 6377 * 4. No page table pages. 6378 * but is *MUCH* faster than pmap_enter... 6379 */ 6380 6381 void 6382 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 6383 { 6384 struct rwlock *lock; 6385 6386 lock = NULL; 6387 PMAP_LOCK(pmap); 6388 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 6389 if (lock != NULL) 6390 rw_wunlock(lock); 6391 PMAP_UNLOCK(pmap); 6392 } 6393 6394 static vm_page_t 6395 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 6396 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 6397 { 6398 pt_entry_t *l1, *l2, *l3, l3_val; 6399 vm_paddr_t pa; 6400 int full_lvl, lvl; 6401 6402 KASSERT(!VA_IS_CLEANMAP(va) || 6403 (m->oflags & VPO_UNMANAGED) != 0, 6404 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 6405 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6406 PMAP_ASSERT_STAGE1(pmap); 6407 KASSERT(ADDR_IS_CANONICAL(va), 6408 ("%s: Address not in canonical form: %lx", __func__, va)); 6409 l2 = NULL; 6410 6411 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 6412 /* 6413 * In the case that a page table page is not 6414 * resident, we are creating it here. 6415 */ 6416 if (ADDR_IS_USER(va)) { 6417 vm_pindex_t l2pindex; 6418 6419 /* 6420 * Calculate pagetable page index 6421 */ 6422 l2pindex = pmap_l2_pindex(va); 6423 if (mpte && (mpte->pindex == l2pindex)) { 6424 mpte->ref_count++; 6425 } else { 6426 /* 6427 * If the page table page is mapped, we just increment 6428 * the hold count, and activate it. Otherwise, we 6429 * attempt to allocate a page table page, passing NULL 6430 * instead of the PV list lock pointer because we don't 6431 * intend to sleep. If this attempt fails, we don't 6432 * retry. Instead, we give up. 
6433 */ 6434 l1 = pmap_l1(pmap, va); 6435 if (l1 != NULL && pmap_load(l1) != 0) { 6436 if ((pmap_load(l1) & ATTR_DESCR_MASK) == 6437 L1_BLOCK) 6438 return (NULL); 6439 l2 = pmap_l1_to_l2(l1, va); 6440 if (pmap_load(l2) != 0) { 6441 if ((pmap_load(l2) & ATTR_DESCR_MASK) == 6442 L2_BLOCK) 6443 return (NULL); 6444 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 6445 mpte->ref_count++; 6446 } else { 6447 mpte = _pmap_alloc_l3(pmap, l2pindex, 6448 NULL); 6449 if (mpte == NULL) 6450 return (mpte); 6451 } 6452 } else { 6453 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 6454 if (mpte == NULL) 6455 return (mpte); 6456 } 6457 } 6458 l3 = VM_PAGE_TO_DMAP(mpte); 6459 l3 = &l3[pmap_l3_index(va)]; 6460 } else { 6461 mpte = NULL; 6462 l2 = pmap_pde(kernel_pmap, va, &lvl); 6463 KASSERT(l2 != NULL, 6464 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", 6465 va)); 6466 KASSERT(lvl == 2, 6467 ("pmap_enter_quick_locked: Invalid level %d", lvl)); 6468 l3 = pmap_l2_to_l3(l2, va); 6469 } 6470 6471 /* 6472 * Abort if a mapping already exists. 6473 */ 6474 if (pmap_load(l3) != 0) { 6475 if (mpte != NULL) 6476 mpte->ref_count--; 6477 return (NULL); 6478 } 6479 6480 /* 6481 * Enter on the PV list if part of our managed memory. 
6482 */ 6483 if ((m->oflags & VPO_UNMANAGED) == 0 && 6484 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 6485 if (mpte != NULL) 6486 pmap_abort_ptp(pmap, va, mpte); 6487 return (NULL); 6488 } 6489 6490 /* 6491 * Increment counters 6492 */ 6493 pmap_resident_count_inc(pmap, 1); 6494 6495 pa = VM_PAGE_TO_PHYS(m); 6496 l3_val = PHYS_TO_PTE(pa) | pmap_sh_attr | 6497 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; 6498 l3_val |= pmap_pte_bti(pmap, va); 6499 if ((prot & VM_PROT_EXECUTE) == 0 || 6500 m->md.pv_memattr == VM_MEMATTR_DEVICE) 6501 l3_val |= ATTR_S1_XN; 6502 if (ADDR_IS_USER(va)) 6503 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 6504 else 6505 l3_val |= ATTR_S1_UXN; 6506 if (pmap != kernel_pmap) 6507 l3_val |= ATTR_S1_nG; 6508 6509 /* 6510 * Now validate mapping with RO protection 6511 */ 6512 if ((m->oflags & VPO_UNMANAGED) == 0) 6513 l3_val |= ATTR_SW_MANAGED; 6514 else 6515 l3_val |= ATTR_AF; 6516 6517 /* Sync icache before the mapping is stored to PTE */ 6518 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 6519 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 6520 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); 6521 6522 pmap_store(l3, l3_val); 6523 dsb(ishst); 6524 6525 #if VM_NRESERVLEVEL > 0 6526 /* 6527 * First, attempt L3C promotion, if the virtual and physical addresses 6528 * are aligned with each other and an underlying reservation has the 6529 * neighboring L3 pages allocated. The first condition is simply an 6530 * optimization that recognizes some eventual promotion failures early 6531 * at a lower run-time cost. Then, attempt L2 promotion, if both a 6532 * level 1 reservation and the PTP are fully populated. 
6533 */ 6534 if ((prot & VM_PROT_NO_PROMOTE) == 0 && 6535 (va & L3C_OFFSET) == (pa & L3C_OFFSET) && 6536 (m->flags & PG_FICTITIOUS) == 0 && 6537 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 && 6538 pmap_promote_l3c(pmap, l3, va) && 6539 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) { 6540 if (l2 == NULL) 6541 l2 = pmap_l2(pmap, va); 6542 6543 /* 6544 * If promotion succeeds, then the next call to this function 6545 * should not be given the unmapped PTP as a hint. 6546 */ 6547 if (pmap_promote_l2(pmap, l2, va, mpte, lockp)) 6548 mpte = NULL; 6549 } 6550 #endif 6551 6552 return (mpte); 6553 } 6554 6555 /* 6556 * This code maps large physical mmap regions into the 6557 * processor address space. Note that some shortcuts 6558 * are taken, but the code works. 6559 */ 6560 void 6561 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 6562 vm_pindex_t pindex, vm_size_t size) 6563 { 6564 6565 VM_OBJECT_ASSERT_WLOCKED(object); 6566 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 6567 ("pmap_object_init_pt: non-device object")); 6568 } 6569 6570 /* 6571 * Clear the wired attribute from the mappings for the specified range of 6572 * addresses in the given pmap. Every valid mapping within that range 6573 * must have the wired attribute set. In contrast, invalid mappings 6574 * cannot have the wired attribute set, so they are ignored. 6575 * 6576 * The wired attribute of the page table entry is not a hardware feature, 6577 * so there is no need to invalidate any TLB entries. 
6578 */ 6579 void 6580 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6581 { 6582 vm_offset_t va_next; 6583 pd_entry_t *l0, *l1, *l2; 6584 pt_entry_t *l3; 6585 bool partial_l3c; 6586 6587 PMAP_LOCK(pmap); 6588 for (; sva < eva; sva = va_next) { 6589 l0 = pmap_l0(pmap, sva); 6590 if (pmap_load(l0) == 0) { 6591 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 6592 if (va_next < sva) 6593 va_next = eva; 6594 continue; 6595 } 6596 6597 l1 = pmap_l0_to_l1(l0, sva); 6598 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 6599 if (va_next < sva) 6600 va_next = eva; 6601 if (pmap_load(l1) == 0) 6602 continue; 6603 6604 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 6605 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6606 KASSERT(va_next <= eva, 6607 ("partial update of non-transparent 1G page " 6608 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 6609 pmap_load(l1), sva, eva, va_next)); 6610 MPASS(pmap != kernel_pmap); 6611 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | 6612 ATTR_SW_WIRED)) == ATTR_SW_WIRED); 6613 pmap_clear_bits(l1, ATTR_SW_WIRED); 6614 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; 6615 continue; 6616 } 6617 6618 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 6619 if (va_next < sva) 6620 va_next = eva; 6621 6622 l2 = pmap_l1_to_l2(l1, sva); 6623 if (pmap_load(l2) == 0) 6624 continue; 6625 6626 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 6627 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) 6628 panic("pmap_unwire: l2 %#jx is missing " 6629 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); 6630 6631 /* 6632 * Are we unwiring the entire large page? If not, 6633 * demote the mapping and fall through. 
6634 */ 6635 if (sva + L2_SIZE == va_next && eva >= va_next) { 6636 pmap_clear_bits(l2, ATTR_SW_WIRED); 6637 pmap->pm_stats.wired_count -= L2_SIZE / 6638 PAGE_SIZE; 6639 continue; 6640 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 6641 panic("pmap_unwire: demotion failed"); 6642 } 6643 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 6644 ("pmap_unwire: Invalid l2 entry after demotion")); 6645 6646 if (va_next > eva) 6647 va_next = eva; 6648 for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva); 6649 sva != va_next; l3++, sva += L3_SIZE) { 6650 if (pmap_load(l3) == 0) 6651 continue; 6652 if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) { 6653 /* 6654 * Avoid demotion for whole-page unwiring. 6655 */ 6656 if ((sva & L3C_OFFSET) == 0) { 6657 /* 6658 * Handle the possibility that 6659 * "va_next" is zero because of 6660 * address wraparound. 6661 */ 6662 partial_l3c = sva + L3C_OFFSET > 6663 va_next - 1; 6664 } 6665 if (partial_l3c) 6666 (void)pmap_demote_l3c(pmap, l3, sva); 6667 } 6668 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) 6669 panic("pmap_unwire: l3 %#jx is missing " 6670 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); 6671 6672 /* 6673 * ATTR_SW_WIRED must be cleared atomically. Although 6674 * the pmap lock synchronizes access to ATTR_SW_WIRED, 6675 * the System MMU may write to the entry concurrently. 6676 */ 6677 pmap_clear_bits(l3, ATTR_SW_WIRED); 6678 pmap->pm_stats.wired_count--; 6679 } 6680 } 6681 PMAP_UNLOCK(pmap); 6682 } 6683 6684 /* 6685 * This function requires that the caller has already added one to ml3's 6686 * ref_count in anticipation of creating a 4KB page mapping. 
6687 */ 6688 static bool 6689 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e, 6690 vm_page_t ml3, struct rwlock **lockp) 6691 { 6692 pt_entry_t *tl3p; 6693 6694 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6695 KASSERT((va & L3C_OFFSET) == 0, 6696 ("pmap_copy_l3c: va is not aligned")); 6697 KASSERT((l3e & ATTR_SW_MANAGED) != 0, 6698 ("pmap_copy_l3c: l3e is not managed")); 6699 6700 /* 6701 * Abort if a mapping already exists. 6702 */ 6703 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) 6704 if (pmap_load(tl3p) != 0) { 6705 if (ml3 != NULL) 6706 ml3->ref_count--; 6707 return (false); 6708 } 6709 6710 if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) { 6711 if (ml3 != NULL) 6712 pmap_abort_ptp(pmap, va, ml3); 6713 return (false); 6714 } 6715 ml3->ref_count += L3C_ENTRIES - 1; 6716 6717 /* 6718 * Clear the wired and accessed bits. However, leave the dirty bit 6719 * unchanged because read/write superpage mappings are required to be 6720 * dirty. 6721 */ 6722 l3e &= ~(ATTR_SW_WIRED | ATTR_AF); 6723 6724 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 6725 pmap_store(tl3p, l3e); 6726 l3e += L3_SIZE; 6727 } 6728 pmap_resident_count_inc(pmap, L3C_ENTRIES); 6729 counter_u64_add(pmap_l3c_mappings, 1); 6730 CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p", 6731 va, pmap); 6732 return (true); 6733 } 6734 6735 /* 6736 * Copy the range specified by src_addr/len 6737 * from the source map to the range dst_addr/len 6738 * in the destination map. 6739 * 6740 * This routine is only advisory and need not do anything. 6741 * 6742 * Because the executable mappings created by this routine are copied, 6743 * it should not have to flush the instruction cache. 
6744 */ 6745 void 6746 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 6747 vm_offset_t src_addr) 6748 { 6749 struct rwlock *lock; 6750 pd_entry_t *l0, *l1, *l2, srcptepaddr; 6751 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; 6752 vm_offset_t addr, end_addr, va_next; 6753 vm_page_t dst_m, dstmpte, srcmpte; 6754 6755 PMAP_ASSERT_STAGE1(dst_pmap); 6756 PMAP_ASSERT_STAGE1(src_pmap); 6757 6758 if (dst_addr != src_addr) 6759 return; 6760 end_addr = src_addr + len; 6761 lock = NULL; 6762 if (dst_pmap < src_pmap) { 6763 PMAP_LOCK(dst_pmap); 6764 PMAP_LOCK(src_pmap); 6765 } else { 6766 PMAP_LOCK(src_pmap); 6767 PMAP_LOCK(dst_pmap); 6768 } 6769 for (addr = src_addr; addr < end_addr; addr = va_next) { 6770 l0 = pmap_l0(src_pmap, addr); 6771 if (pmap_load(l0) == 0) { 6772 va_next = (addr + L0_SIZE) & ~L0_OFFSET; 6773 if (va_next < addr) 6774 va_next = end_addr; 6775 continue; 6776 } 6777 6778 va_next = (addr + L1_SIZE) & ~L1_OFFSET; 6779 if (va_next < addr) 6780 va_next = end_addr; 6781 l1 = pmap_l0_to_l1(l0, addr); 6782 if (pmap_load(l1) == 0) 6783 continue; 6784 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 6785 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6786 KASSERT(va_next <= end_addr, 6787 ("partial update of non-transparent 1G page " 6788 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 6789 pmap_load(l1), addr, end_addr, va_next)); 6790 srcptepaddr = pmap_load(l1); 6791 l1 = pmap_l1(dst_pmap, addr); 6792 if (l1 == NULL) { 6793 if (_pmap_alloc_l3(dst_pmap, 6794 pmap_l0_pindex(addr), NULL) == NULL) 6795 break; 6796 l1 = pmap_l1(dst_pmap, addr); 6797 } else { 6798 l0 = pmap_l0(dst_pmap, addr); 6799 dst_m = PTE_TO_VM_PAGE(pmap_load(l0)); 6800 dst_m->ref_count++; 6801 } 6802 KASSERT(pmap_load(l1) == 0, 6803 ("1G mapping present in dst pmap " 6804 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 6805 pmap_load(l1), addr, end_addr, va_next)); 6806 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); 6807 pmap_resident_count_inc(dst_pmap, L1_SIZE / 
PAGE_SIZE); 6808 continue; 6809 } 6810 6811 va_next = (addr + L2_SIZE) & ~L2_OFFSET; 6812 if (va_next < addr) 6813 va_next = end_addr; 6814 l2 = pmap_l1_to_l2(l1, addr); 6815 srcptepaddr = pmap_load(l2); 6816 if (srcptepaddr == 0) 6817 continue; 6818 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { 6819 /* 6820 * We can only virtual copy whole superpages. 6821 */ 6822 if ((addr & L2_OFFSET) != 0 || 6823 addr + L2_SIZE > end_addr) 6824 continue; 6825 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); 6826 if (l2 == NULL) 6827 break; 6828 if (pmap_load(l2) == 0 && 6829 ((srcptepaddr & ATTR_SW_MANAGED) == 0 || 6830 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, 6831 PMAP_ENTER_NORECLAIM, &lock))) { 6832 /* 6833 * We leave the dirty bit unchanged because 6834 * managed read/write superpage mappings are 6835 * required to be dirty. However, managed 6836 * superpage mappings are not required to 6837 * have their accessed bit set, so we clear 6838 * it because we don't know if this mapping 6839 * will be used. 6840 */ 6841 srcptepaddr &= ~ATTR_SW_WIRED; 6842 if ((srcptepaddr & ATTR_SW_MANAGED) != 0) 6843 srcptepaddr &= ~ATTR_AF; 6844 pmap_store(l2, srcptepaddr); 6845 pmap_resident_count_inc(dst_pmap, L2_SIZE / 6846 PAGE_SIZE); 6847 counter_u64_add(pmap_l2_mappings, 1); 6848 } else 6849 pmap_abort_ptp(dst_pmap, addr, dst_m); 6850 continue; 6851 } 6852 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, 6853 ("pmap_copy: invalid L2 entry")); 6854 srcmpte = PTE_TO_VM_PAGE(srcptepaddr); 6855 KASSERT(srcmpte->ref_count > 0, 6856 ("pmap_copy: source page table page is unused")); 6857 if (va_next > end_addr) 6858 va_next = end_addr; 6859 src_pte = PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr)); 6860 src_pte = &src_pte[pmap_l3_index(addr)]; 6861 dstmpte = NULL; 6862 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 6863 ptetemp = pmap_load(src_pte); 6864 6865 /* 6866 * We only virtual copy managed pages. 
6867 */ 6868 if ((ptetemp & ATTR_SW_MANAGED) == 0) 6869 continue; 6870 6871 if (dstmpte != NULL) { 6872 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), 6873 ("dstmpte pindex/addr mismatch")); 6874 dstmpte->ref_count++; 6875 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, 6876 NULL)) == NULL) 6877 goto out; 6878 dst_pte = VM_PAGE_TO_DMAP(dstmpte); 6879 dst_pte = &dst_pte[pmap_l3_index(addr)]; 6880 if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr & 6881 L3C_OFFSET) == 0 && addr + L3C_OFFSET <= 6882 va_next - 1) { 6883 if (!pmap_copy_l3c(dst_pmap, dst_pte, addr, 6884 ptetemp, dstmpte, &lock)) 6885 goto out; 6886 addr += L3C_SIZE - PAGE_SIZE; 6887 src_pte += L3C_ENTRIES - 1; 6888 } else if (pmap_load(dst_pte) == 0 && 6889 pmap_try_insert_pv_entry(dst_pmap, addr, 6890 PTE_TO_VM_PAGE(ptetemp), &lock)) { 6891 /* 6892 * Clear the wired, contiguous, modified, and 6893 * accessed bits from the destination PTE. 6894 * The contiguous bit is cleared because we 6895 * are not copying the entire L3C superpage. 6896 */ 6897 mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS | 6898 ATTR_AF; 6899 nbits = 0; 6900 if ((ptetemp & ATTR_SW_DBM) != 0) 6901 nbits |= ATTR_S1_AP_RW_BIT; 6902 pmap_store(dst_pte, (ptetemp & ~mask) | nbits); 6903 pmap_resident_count_inc(dst_pmap, 1); 6904 } else { 6905 pmap_abort_ptp(dst_pmap, addr, dstmpte); 6906 goto out; 6907 } 6908 /* Have we copied all of the valid mappings? */ 6909 if (dstmpte->ref_count >= srcmpte->ref_count) 6910 break; 6911 } 6912 } 6913 out: 6914 /* 6915 * XXX This barrier may not be needed because the destination pmap is 6916 * not active. 
6917 */ 6918 dsb(ishst); 6919 6920 if (lock != NULL) 6921 rw_wunlock(lock); 6922 PMAP_UNLOCK(src_pmap); 6923 PMAP_UNLOCK(dst_pmap); 6924 } 6925 6926 int 6927 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 6928 { 6929 int error; 6930 6931 if (dst_pmap->pm_stage != src_pmap->pm_stage) 6932 return (EINVAL); 6933 6934 if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL) 6935 return (0); 6936 6937 for (;;) { 6938 if (dst_pmap < src_pmap) { 6939 PMAP_LOCK(dst_pmap); 6940 PMAP_LOCK(src_pmap); 6941 } else { 6942 PMAP_LOCK(src_pmap); 6943 PMAP_LOCK(dst_pmap); 6944 } 6945 error = pmap_bti_copy(dst_pmap, src_pmap); 6946 /* Clean up partial copy on failure due to no memory. */ 6947 if (error == ENOMEM) 6948 pmap_bti_deassign_all(dst_pmap); 6949 PMAP_UNLOCK(src_pmap); 6950 PMAP_UNLOCK(dst_pmap); 6951 if (error != ENOMEM) 6952 break; 6953 vm_wait(NULL); 6954 } 6955 return (error); 6956 } 6957 6958 /* 6959 * pmap_zero_page zeros the specified hardware page by mapping 6960 * the page into KVM and using bzero to clear its contents. 6961 */ 6962 void 6963 pmap_zero_page(vm_page_t m) 6964 { 6965 void *va = VM_PAGE_TO_DMAP(m); 6966 6967 pagezero(va); 6968 m->md.pv_flags &= ~PV_MTE_TAGGED; 6969 } 6970 6971 /* 6972 * pmap_zero_page_area zeros the specified hardware page by mapping 6973 * the page into KVM and using bzero to clear its contents. 6974 * 6975 * off and size may not cover an area beyond a single hardware page. 6976 */ 6977 void 6978 pmap_zero_page_area(vm_page_t m, int off, int size) 6979 { 6980 void *va = VM_PAGE_TO_DMAP(m); 6981 6982 if (off == 0 && size == PAGE_SIZE) 6983 pagezero(va); 6984 else 6985 bzero((char *)va + off, size); 6986 } 6987 6988 /* 6989 * pmap_copy_page copies the specified (machine independent) 6990 * page by mapping the page into virtual memory and using 6991 * memcpy to copy the page, one machine dependent page at a 6992 * time. 
6993 */ 6994 void 6995 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 6996 { 6997 void *src = VM_PAGE_TO_DMAP(msrc); 6998 void *dst = VM_PAGE_TO_DMAP(mdst); 6999 7000 /* 7001 * On a page copy, check whether the src page is tagged. If it is, 7002 * we must copy the tags before copying the contents of the page. 7003 */ 7004 if ((msrc->md.pv_flags & PV_MTE_TAGGED) != 0) 7005 mte_copy_tags(msrc, mdst, src, dst); 7006 else 7007 mdst->md.pv_flags &= ~PV_MTE_TAGGED; 7008 7009 pagecopy(src, dst); 7010 } 7011 7012 int unmapped_buf_allowed = 1; 7013 7014 void 7015 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 7016 vm_offset_t b_offset, int xfersize) 7017 { 7018 void *a_cp, *b_cp; 7019 vm_page_t m_a, m_b; 7020 vm_paddr_t p_a, p_b; 7021 vm_offset_t a_pg_offset, b_pg_offset; 7022 int cnt; 7023 7024 while (xfersize > 0) { 7025 KASSERT(ADDR_IS_CANONICAL(a_offset), 7026 ("%s: Address not in canonical form: %lx", __func__, a_offset)); 7027 7028 a_pg_offset = a_offset & PAGE_MASK; 7029 m_a = ma[a_offset >> PAGE_SHIFT]; 7030 p_a = m_a->phys_addr; 7031 b_pg_offset = b_offset & PAGE_MASK; 7032 m_b = mb[b_offset >> PAGE_SHIFT]; 7033 p_b = m_b->phys_addr; 7034 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 7035 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 7036 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 7037 panic("!DMAP a %lx", p_a); 7038 } else { 7039 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 7040 } 7041 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 7042 panic("!DMAP b %lx", p_b); 7043 } else { 7044 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 7045 } 7046 memcpy(b_cp, a_cp, cnt); 7047 a_offset += cnt; 7048 b_offset += cnt; 7049 xfersize -= cnt; 7050 } 7051 } 7052 7053 void * 7054 pmap_quick_enter_page(vm_page_t m) 7055 { 7056 7057 return (VM_PAGE_TO_DMAP(m)); 7058 } 7059 7060 void 7061 pmap_quick_remove_page(void *addr) 7062 { 7063 } 7064 7065 /* 7066 * Returns true if the pmap's pv is one of the first 7067 * 16 pvs linked to from this page. 
This count may 7068 * be changed upwards or downwards in the future; it 7069 * is only necessary that true be returned for a small 7070 * subset of pmaps for proper page aging. 7071 */ 7072 bool 7073 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 7074 { 7075 struct md_page *pvh; 7076 struct rwlock *lock; 7077 pv_entry_t pv; 7078 int loops = 0; 7079 bool rv; 7080 7081 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7082 ("pmap_page_exists_quick: page %p is not managed", m)); 7083 rv = false; 7084 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7085 rw_rlock(lock); 7086 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7087 if (PV_PMAP(pv) == pmap) { 7088 rv = true; 7089 break; 7090 } 7091 loops++; 7092 if (loops >= 16) 7093 break; 7094 } 7095 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 7096 pvh = page_to_pvh(m); 7097 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 7098 if (PV_PMAP(pv) == pmap) { 7099 rv = true; 7100 break; 7101 } 7102 loops++; 7103 if (loops >= 16) 7104 break; 7105 } 7106 } 7107 rw_runlock(lock); 7108 return (rv); 7109 } 7110 7111 /* 7112 * pmap_page_wired_mappings: 7113 * 7114 * Return the number of managed mappings to the given physical page 7115 * that are wired. 
7116 */ 7117 int 7118 pmap_page_wired_mappings(vm_page_t m) 7119 { 7120 struct rwlock *lock; 7121 struct md_page *pvh; 7122 pmap_t pmap; 7123 pt_entry_t *pte; 7124 pv_entry_t pv; 7125 int count, md_gen, pvh_gen; 7126 7127 if ((m->oflags & VPO_UNMANAGED) != 0) 7128 return (0); 7129 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7130 rw_rlock(lock); 7131 restart: 7132 count = 0; 7133 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7134 pmap = PV_PMAP(pv); 7135 if (!PMAP_TRYLOCK(pmap)) { 7136 md_gen = m->md.pv_gen; 7137 rw_runlock(lock); 7138 PMAP_LOCK(pmap); 7139 rw_rlock(lock); 7140 if (md_gen != m->md.pv_gen) { 7141 PMAP_UNLOCK(pmap); 7142 goto restart; 7143 } 7144 } 7145 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 7146 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 7147 count++; 7148 PMAP_UNLOCK(pmap); 7149 } 7150 if ((m->flags & PG_FICTITIOUS) == 0) { 7151 pvh = page_to_pvh(m); 7152 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 7153 pmap = PV_PMAP(pv); 7154 if (!PMAP_TRYLOCK(pmap)) { 7155 md_gen = m->md.pv_gen; 7156 pvh_gen = pvh->pv_gen; 7157 rw_runlock(lock); 7158 PMAP_LOCK(pmap); 7159 rw_rlock(lock); 7160 if (md_gen != m->md.pv_gen || 7161 pvh_gen != pvh->pv_gen) { 7162 PMAP_UNLOCK(pmap); 7163 goto restart; 7164 } 7165 } 7166 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 7167 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 7168 count++; 7169 PMAP_UNLOCK(pmap); 7170 } 7171 } 7172 rw_runlock(lock); 7173 return (count); 7174 } 7175 7176 /* 7177 * Returns true if the given page is mapped individually or as part of 7178 * a 2mpage. Otherwise, returns false. 
7179 */ 7180 bool 7181 pmap_page_is_mapped(vm_page_t m) 7182 { 7183 struct rwlock *lock; 7184 bool rv; 7185 7186 if ((m->oflags & VPO_UNMANAGED) != 0) 7187 return (false); 7188 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7189 rw_rlock(lock); 7190 rv = !TAILQ_EMPTY(&m->md.pv_list) || 7191 ((m->flags & PG_FICTITIOUS) == 0 && 7192 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); 7193 rw_runlock(lock); 7194 return (rv); 7195 } 7196 7197 /* 7198 * Destroy all managed, non-wired mappings in the given user-space 7199 * pmap. This pmap cannot be active on any processor besides the 7200 * caller. 7201 * 7202 * This function cannot be applied to the kernel pmap. Moreover, it 7203 * is not intended for general use. It is only to be used during 7204 * process termination. Consequently, it can be implemented in ways 7205 * that make it faster than pmap_remove(). First, it can more quickly 7206 * destroy mappings by iterating over the pmap's collection of PV 7207 * entries, rather than searching the page table. Second, it doesn't 7208 * have to test and clear the page table entries atomically, because 7209 * no processor is currently accessing the user address space. In 7210 * particular, a page table entry's dirty bit won't change state once 7211 * this function starts. 
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;
	struct spglist free;
	struct pv_chunklist free_chunks[PMAP_MEMDOM];
	vm_page_t m, ml3, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct rwlock *lock;
	int64_t bit;
	uint64_t inuse, bitmask;
	int allfree, field, i, idx, lvl;
	int freed __pvused;
	vm_paddr_t pa;

	/*
	 * "lock" is handed to CHANGE_PV_LIST_LOCK_TO_VM_PAGE() below and
	 * whatever it holds at the end is write-unlocked.
	 */
	lock = NULL;

	/* One list of fully freed PV chunks per memory domain. */
	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&free_chunks[i]);
	SLIST_INIT(&free);
	PMAP_LOCK(pmap);
	/* The _SAFE variant is needed: "pc" may be unlinked in the loop body. */
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		allfree = 1;
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/*
			 * A clear bit in pc_map[] (within pc_freemask[]) marks
			 * an allocated PV entry; visit each one, lowest bit
			 * first.
			 */
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = ffsl(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pde = pmap_pde(pmap, pv->pv_va, &lvl);
				KASSERT(pde != NULL,
				    ("Attempting to remove an unmapped page"));

				/*
				 * "lvl" is the level of "pde": level 1 means
				 * the mapping is an L2 block, level 2 means it
				 * is an L3 page.
				 */
				switch(lvl) {
				case 1:
					pte = pmap_l1_to_l2(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L2_BLOCK,
					    ("Attempting to remove an invalid "
					    "block: %lx", tpte));
					break;
				case 2:
					pte = pmap_l2_to_l3(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L3_PAGE,
					    ("Attempting to remove an invalid "
					    "page: %lx", tpte));
					break;
				default:
					panic(
					    "Invalid page directory level: %d",
					    lvl);
				}

				/*
				 * We cannot remove wired mappings at this time.
				 *
				 * For L3C superpages, all of the constituent PTEs
				 * should have the wired bit set, so we don't
				 * check for ATTR_CONTIGUOUS here.
				 */
				if (tpte & ATTR_SW_WIRED) {
					/* This chunk keeps at least one entry. */
					allfree = 0;
					continue;
				}

				/* Mark free */
				pc->pc_map[field] |= bitmask;

				/*
				 * Because this pmap is not active on other
				 * processors, the dirty bit cannot have
				 * changed state since we last loaded pte.
				 */
				pmap_clear(pte);

				pa = PTE_TO_PHYS(tpte);

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad pte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean/reference bits.
				 *
				 * We don't check for ATTR_CONTIGUOUS here
				 * because writeable L3C superpages are expected
				 * to be dirty, i.e., every constituent PTE
				 * should be dirty.
				 */
				if (pmap_pte_dirty(pmap, tpte)) {
					switch (lvl) {
					case 1:
						/* Dirty every 4KB page of the block. */
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
						break;
					case 2:
						vm_page_dirty(m);
						break;
					}
				}

				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				/* Unlink the PV entry and fix up the accounting. */
				switch (lvl) {
				case 1:
					pmap_resident_count_dec(pmap,
					    L2_SIZE / PAGE_SIZE);
					pvh = page_to_pvh(m);
					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
					pvh->pv_gen++;
					/*
					 * Once no superpage mapping remains,
					 * clear PGA_WRITEABLE on each
					 * constituent page that has no 4KB
					 * mappings either.
					 */
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					/*
					 * If pmap_remove_pt_page() returns a
					 * page table page for this address,
					 * queue it on "free" for release
					 * after the pmap lock is dropped.
					 */
					ml3 = pmap_remove_pt_page(pmap,
					    pv->pv_va);
					if (ml3 != NULL) {
						KASSERT(vm_page_any_valid(ml3),
						    ("pmap_remove_pages: l3 page not promoted"));
						pmap_resident_count_dec(pmap,1);
						KASSERT(ml3->ref_count == NL3PG,
						    ("pmap_remove_pages: l3 page ref count error"));
						ml3->ref_count = 0;
						pmap_add_delayed_free_list(ml3,
						    &free, false);
					}
					break;
				case 2:
					pmap_resident_count_dec(pmap, 1);
					TAILQ_REMOVE(&m->md.pv_list, pv,
					    pv_next);
					m->md.pv_gen++;
					/*
					 * Clear PGA_WRITEABLE only when neither
					 * the page nor its pvh has any mapping
					 * left.
					 */
					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = page_to_pvh(m);
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m,
							    PGA_WRITEABLE);
					}
					break;
				}
				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
				    &free);
				freed++;
			}
		}
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		/*
		 * A chunk with no remaining (wired) entries is moved to the
		 * per-domain list so that it can be freed in one batch below.
		 */
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
			    pc_list);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	pmap_invalidate_all(pmap);
	pmap_bti_deassign_all(pmap);
	free_pv_chunk_batch(free_chunks);
	PMAP_UNLOCK(pmap);
	/* Release the queued page table pages after dropping the pmap lock. */
	vm_page_free_pages_toq(&free, true);
}

/*
 * This is used to check if a page has been accessed or modified.
7399 */ 7400 static bool 7401 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) 7402 { 7403 struct rwlock *lock; 7404 pv_entry_t pv; 7405 struct md_page *pvh; 7406 pt_entry_t l3e, mask, *pte, value; 7407 pmap_t pmap; 7408 int md_gen, pvh_gen; 7409 bool rv; 7410 7411 rv = false; 7412 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7413 rw_rlock(lock); 7414 restart: 7415 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7416 pmap = PV_PMAP(pv); 7417 PMAP_ASSERT_STAGE1(pmap); 7418 if (!PMAP_TRYLOCK(pmap)) { 7419 md_gen = m->md.pv_gen; 7420 rw_runlock(lock); 7421 PMAP_LOCK(pmap); 7422 rw_rlock(lock); 7423 if (md_gen != m->md.pv_gen) { 7424 PMAP_UNLOCK(pmap); 7425 goto restart; 7426 } 7427 } 7428 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 7429 mask = 0; 7430 value = 0; 7431 if (modified) { 7432 mask |= ATTR_S1_AP_RW_BIT; 7433 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 7434 } 7435 if (accessed) { 7436 mask |= ATTR_AF | ATTR_DESCR_MASK; 7437 value |= ATTR_AF | L3_PAGE; 7438 } 7439 l3e = pmap_load(pte); 7440 if ((l3e & ATTR_CONTIGUOUS) != 0) 7441 l3e = pmap_load_l3c(pte); 7442 PMAP_UNLOCK(pmap); 7443 rv = (l3e & mask) == value; 7444 if (rv) 7445 goto out; 7446 } 7447 if ((m->flags & PG_FICTITIOUS) == 0) { 7448 pvh = page_to_pvh(m); 7449 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 7450 pmap = PV_PMAP(pv); 7451 PMAP_ASSERT_STAGE1(pmap); 7452 if (!PMAP_TRYLOCK(pmap)) { 7453 md_gen = m->md.pv_gen; 7454 pvh_gen = pvh->pv_gen; 7455 rw_runlock(lock); 7456 PMAP_LOCK(pmap); 7457 rw_rlock(lock); 7458 if (md_gen != m->md.pv_gen || 7459 pvh_gen != pvh->pv_gen) { 7460 PMAP_UNLOCK(pmap); 7461 goto restart; 7462 } 7463 } 7464 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 7465 mask = 0; 7466 value = 0; 7467 if (modified) { 7468 mask |= ATTR_S1_AP_RW_BIT; 7469 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 7470 } 7471 if (accessed) { 7472 mask |= ATTR_AF | ATTR_DESCR_MASK; 7473 value |= ATTR_AF | L2_BLOCK; 7474 } 7475 rv = (pmap_load(pte) & mask) == value; 7476 PMAP_UNLOCK(pmap); 7477 if 
(rv) 7478 goto out; 7479 } 7480 } 7481 out: 7482 rw_runlock(lock); 7483 return (rv); 7484 } 7485 7486 /* 7487 * pmap_is_modified: 7488 * 7489 * Return whether or not the specified physical page was modified 7490 * in any physical maps. 7491 */ 7492 bool 7493 pmap_is_modified(vm_page_t m) 7494 { 7495 7496 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7497 ("pmap_is_modified: page %p is not managed", m)); 7498 7499 /* 7500 * If the page is not busied then this check is racy. 7501 */ 7502 if (!pmap_page_is_write_mapped(m)) 7503 return (false); 7504 return (pmap_page_test_mappings(m, false, true)); 7505 } 7506 7507 /* 7508 * pmap_is_prefaultable: 7509 * 7510 * Return whether or not the specified virtual address is eligible 7511 * for prefault. 7512 */ 7513 bool 7514 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 7515 { 7516 pd_entry_t *pde; 7517 pt_entry_t *pte; 7518 bool rv; 7519 int lvl; 7520 7521 /* 7522 * Return true if and only if the L3 entry for the specified virtual 7523 * address is allocated but invalid. 7524 */ 7525 rv = false; 7526 PMAP_LOCK(pmap); 7527 pde = pmap_pde(pmap, addr, &lvl); 7528 if (pde != NULL && lvl == 2) { 7529 pte = pmap_l2_to_l3(pde, addr); 7530 rv = pmap_load(pte) == 0; 7531 } 7532 PMAP_UNLOCK(pmap); 7533 return (rv); 7534 } 7535 7536 /* 7537 * pmap_is_referenced: 7538 * 7539 * Return whether or not the specified physical page was referenced 7540 * in any physical maps. 7541 */ 7542 bool 7543 pmap_is_referenced(vm_page_t m) 7544 { 7545 7546 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7547 ("pmap_is_referenced: page %p is not managed", m)); 7548 return (pmap_page_test_mappings(m, true, false)); 7549 } 7550 7551 /* 7552 * Clear the write and modified bits in each of the given page's mappings. 
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pt_entry_t oldpte, *pte, set, clear, mask, val;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));
	vm_page_assert_busied(m);

	/* Return early when pmap_page_is_write_mapped() reports false. */
	if (!pmap_page_is_write_mapped(m))
		return;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/*
	 * Pass 1: demote every level-2 mapping on the pvh list that has
	 * ATTR_SW_DBM set, so that pass 2 only has to deal with L3 entries.
	 */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Block on the pmap lock with the PV list lock
			 * dropped; start over if the list changed meanwhile.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	/*
	 * Pass 2: for every level-3 mapping with ATTR_SW_DBM set, atomically
	 * remove write permission and ATTR_SW_DBM.
	 */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		oldpte = pmap_load(pte);
		if ((oldpte & ATTR_SW_DBM) != 0) {
			if ((oldpte & ATTR_CONTIGUOUS) != 0) {
				(void)pmap_demote_l3c(pmap, pte, pv->pv_va);

				/*
				 * The L3 entry's accessed bit may have
				 * changed.
				 */
				oldpte = pmap_load(pte);
			}
			/*
			 * Choose the bits to set/clear for the update, and
			 * the mask/value pair that identifies a writeable
			 * entry, according to the pmap's translation stage.
			 */
			if (pmap->pm_stage == PM_STAGE1) {
				set = ATTR_S1_AP_RW_BIT;
				clear = 0;
				mask = ATTR_S1_AP_RW_BIT;
				val = ATTR_S1_AP(ATTR_S1_AP_RW);
			} else {
				set = 0;
				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
			}
			clear |= ATTR_SW_DBM;
			/* Retry until the compare-and-set succeeds. */
			while (!atomic_fcmpset_64(pte, &oldpte,
			    (oldpte | set) & ~clear))
				cpu_spinwait();

			/* The replaced entry was writeable: record the page as dirty. */
			if ((oldpte & mask) == val)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}

/*
 *	pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, md_gen, not_cleared, pvh_gen;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	/*
	 * NOTE(review): nothing in this function adds pages to "free"; it is
	 * only initialized here and handed to vm_page_free_pages_toq() at
	 * the end.
	 */
	SLIST_INIT(&free);
	cleared = 0;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
retry:
	/* "cleared" survives a retry; "not_cleared" is recounted. */
	not_cleared = 0;
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	/*
	 * Level-2 (superpage) mappings.  Each visited entry is rotated to the
	 * tail of the list, so the walk ends when the original first entry
	 * ("pvf") reaches the head again.
	 */
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Block on the pmap lock with the PV list lock
			 * dropped; start over if the list changed meanwhile.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte)) {
			/*
			 * Although "tpte" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((tpte & ATTR_AF) != 0) {
			pa = VM_PAGE_TO_PHYS(m);

			/*
			 * Since this reference bit is shared by 512 4KB pages,
			 * it should not be cleared every time it is tested.
			 * Apply a simple "hash" function on the physical page
			 * number, the virtual superpage number, and the pmap
			 * address to select one 4KB page out of the 512 on
			 * which testing the reference bit will result in
			 * clearing that reference bit.  This function is
			 * designed to avoid the selection of the same 4KB page
			 * for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, va, true);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	/* Level-3 (4KB) mappings, walked with the same rotation scheme. */
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		if ((tpte & ATTR_AF) != 0) {
			if ((tpte & ATTR_SW_WIRED) == 0) {
				/*
				 * Clear the accessed bit in this L3 entry
				 * regardless of the contiguous bit.
				 */
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, pv->pv_va, true);
				cleared++;
			} else
				not_cleared++;
		} else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
		    (pmap_load_l3c(pte) & ATTR_AF) != 0) {
			/*
			 * An L3C superpage mapping is regarded as accessed
			 * until the accessed bit has been cleared in all
			 * of its constituent entries.
			 */
			not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
	return (cleared + not_cleared);
}

/*
 *	Apply the given advice to the specified range of addresses within the
 *	given pmap.  Depending on the advice, clear the referenced and/or
 *	modified flags in each mapping and set the mapped page's dirty field.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	struct rwlock *lock;
	vm_offset_t va, va_next, dva;
	vm_page_t m;
	pd_entry_t *l0, *l1, *l2, oldl2;
	pt_entry_t *l3, *dl3, oldl3;

	PMAP_ASSERT_STAGE1(pmap);

	/* Any other advice value is a no-op. */
	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;

	PMAP_LOCK(pmap);
	/*
	 * Walk the range one translation-table entry at a time.  "va_next" is
	 * the address at which the next iteration resumes; if rounding up to
	 * the next boundary wraps around, it is clamped to "eva".
	 */
	for (; sva < eva; sva = va_next) {
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		/* L1 block mappings are skipped. */
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l2 = pmap_l1_to_l2(l1, sva);
		oldl2 = pmap_load(l2);
		if (oldl2 == 0)
			continue;
		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
			/* Unmanaged L2 blocks are skipped. */
			if ((oldl2 & ATTR_SW_MANAGED) == 0)
				continue;
			lock = NULL;
			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
				if (lock != NULL)
					rw_wunlock(lock);

				/*
				 * The 2MB page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Choosing the last page
			 * within the address range [sva, min(va_next, eva))
			 * generally results in more repromotions.  Since the
			 * underlying page table page is fully populated, this
			 * removal never frees a page table page.
			 */
			if ((oldl2 & ATTR_SW_WIRED) == 0) {
				va = eva;
				if (va > va_next)
					va = va_next;
				va -= PAGE_SIZE;
				KASSERT(va >= sva,
				    ("pmap_advise: no address gap"));
				l3 = pmap_l2_to_l3(l2, va);
				KASSERT(pmap_load(l3) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
				    NULL, &lock);
			}
			if (lock != NULL)
				rw_wunlock(lock);
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_advise: invalid L2 entry after demotion"));
		if (va_next > eva)
			va_next = eva;
		/*
		 * "va" is the start of a pending run of modified L3 entries
		 * that still needs a TLB invalidation; va == va_next means
		 * that no run is pending.
		 */
		va = va_next;
		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
		    sva += L3_SIZE) {
			oldl3 = pmap_load(l3);
			/* Only managed L3 page entries are modified. */
			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
			    (ATTR_SW_MANAGED | L3_PAGE))
				goto maybe_invlrng;
			else if (pmap_pte_dirty(pmap, oldl3)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PTE_TO_VM_PAGE(oldl3);
					vm_page_dirty(m);
				}
				if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
					/*
					 * Unconditionally demote the L3C
					 * superpage because we do not allow
					 * writeable, clean superpages.
					 */
					(void)pmap_demote_l3c(pmap, l3, sva);

					/*
					 * Destroy the final mapping before the
					 * next L3C boundary or va_next,
					 * whichever comes first, so that a
					 * subsequent access may act as a
					 * repromotion trigger.
					 */
					if ((oldl3 & ATTR_SW_WIRED) == 0) {
						dva = MIN((sva & ~L3C_OFFSET) +
						    L3C_SIZE - PAGE_SIZE,
						    va_next - PAGE_SIZE);
						dl3 = pmap_l2_to_l3(l2, dva);
						KASSERT(pmap_load(dl3) != 0,
						    ("pmap_advise: invalid PTE"));
						lock = NULL;
						pmap_remove_l3(pmap, dl3, dva,
						    pmap_load(l2), NULL, &lock);
						if (lock != NULL)
							rw_wunlock(lock);
					}

					/*
					 * The L3 entry's accessed bit may have
					 * changed.
					 */
					oldl3 = pmap_load(l3);
				}

				/*
				 * Check that we did not just destroy this entry so
				 * we avoid corrupting the page table.
				 */
				if (oldl3 != 0) {
					/*
					 * Atomically clear the accessed bit
					 * and make the entry read-only.
					 */
					while (!atomic_fcmpset_long(l3, &oldl3,
					    (oldl3 & ~ATTR_AF) |
					    ATTR_S1_AP(ATTR_S1_AP_RO)))
						cpu_spinwait();
				}
			} else if ((oldl3 & ATTR_AF) != 0) {
				/*
				 * Clear the accessed bit in this L3 entry
				 * regardless of the contiguous bit.
				 */
				pmap_clear_bits(l3, ATTR_AF);
			} else
				goto maybe_invlrng;
			/* This entry was modified: open a run if none is pending. */
			if (va == va_next)
				va = sva;
			continue;
maybe_invlrng:
			/* An untouched entry ends the pending run; flush it. */
			if (va != va_next) {
				pmap_s1_invalidate_range(pmap, va, sva, true);
				va = va_next;
			}
		}
		/* Flush a run that extends to the end of this L3 table walk. */
		if (va != va_next)
			pmap_s1_invalidate_range(pmap, va, sva, true);
	}
	PMAP_UNLOCK(pmap);
}

/*
 *	Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *l2, oldl2;
	pt_entry_t *l3, oldl3;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	vm_page_assert_busied(m);

	/* Return early when pmap_page_is_write_mapped() reports false. */
	if (!pmap_page_is_write_mapped(m))
		return;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
restart:
	/* Pass 1: level-2 (superpage) mappings found on the pvh list. */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Block on the pmap lock with the PV list lock
			 * dropped; start over if the list changed meanwhile.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		oldl2 = pmap_load(l2);
		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
		if ((oldl2 & ATTR_SW_DBM) != 0 &&
		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
		    (oldl2 & ATTR_SW_WIRED) == 0) {
			/*
			 * Write protect the mapping to a single page so that
			 * a subsequent write access may repromote.
			 */
			/* Advance "va" to the address that maps "m" itself. */
			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
			l3 = pmap_l2_to_l3(l2, va);
			oldl3 = pmap_load(l3);
			while (!atomic_fcmpset_long(l3, &oldl3,
			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
				cpu_spinwait();
			vm_page_dirty(m);
			pmap_s1_invalidate_page(pmap, va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	/* Pass 2: level-3 (4KB) mappings found on the page's own PV list. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);
		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		oldl3 = pmap_load(l3);
		KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
		    (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
		    ("writeable L3C superpage not dirty"));
		/*
		 * Act only on entries that have ATTR_SW_DBM set and
		 * ATTR_S1_AP_RW_BIT clear; such an entry is switched to the
		 * read-only access permission.
		 */
		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
			if ((oldl3 & ATTR_CONTIGUOUS) != 0)
				(void)pmap_demote_l3c(pmap, l3, pv->pv_va);
			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
}

/*
 * Map the physical range [pa, pa + size) into the kernel's address space and
 * return a pointer to the byte that corresponds to "pa".
 *
 * Three strategies are tried in order:
 *  - if the whole range lies in the DMAP and pmap_kmapped_range() accepts it,
 *    the DMAP address is returned and nothing new is mapped;
 *  - before vm_initialized is set, whole L2 blocks are mapped in a run of
 *    free pmap_preinit_mapping[] slots starting at preinit_map_va;
 *  - otherwise KVA is obtained from kva_alloc() and entered with
 *    pmap_kenter().
 *
 * NULL is returned on the early-boot path when "size" is zero or
 * preinit_map_va is zero.  The function panics when no run of free preinit
 * slots is large enough or when kva_alloc() fails.
 */
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t va, offset;
	pd_entry_t old_l2e, *pde;
	pt_entry_t *l2;
	int i, lvl, l2_blocks, free_l2_count, start_idx;

	/* Use the DMAP region if we can */
	if (PHYS_IN_DMAP(pa) && PHYS_IN_DMAP(pa + size - 1) &&
	    pmap_kmapped_range(PHYS_TO_DMAP(pa), size))
		return (PHYS_TO_DMAP(pa));

	if (!vm_initialized) {
		/*
		 * No L3 ptables so map entire L2 blocks where start VA is:
		 * 	preinit_map_va + start_idx * L2_SIZE
		 * There may be duplicate mappings (multiple VA -> same PA) but
		 * ARM64 dcache is always PIPT so that's acceptable.
		 */
		if (size == 0)
			return (NULL);

		/* Calculate how many L2 blocks are needed for the mapping */
		l2_blocks = (roundup2(pa + size, L2_SIZE) -
		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;

		/* Offset of "pa" within its first L2 block. */
		offset = pa & L2_OFFSET;

		if (preinit_map_va == 0)
			return (NULL);

		/* Map 2MiB L2 blocks from reserved VA space */

		free_l2_count = 0;
		start_idx = -1;
		/* Find enough free contiguous VA space */
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (free_l2_count > 0 && ppim->pa != 0) {
				/* Not enough space here */
				free_l2_count = 0;
				start_idx = -1;
				continue;
			}

			if (ppim->pa == 0) {
				/* Free L2 block */
				if (start_idx == -1)
					start_idx = i;
				free_l2_count++;
				if (free_l2_count == l2_blocks)
					break;
			}
		}
		if (free_l2_count != l2_blocks)
			panic("%s: too many preinit mappings", __func__);

		va = preinit_map_va + (start_idx * L2_SIZE);
		/*
		 * Every slot of the run records the same pa/va/size triple;
		 * pmap_unmapbios() matches slots by comparing against that
		 * "va" value.
		 */
		for (i = start_idx; i < start_idx + l2_blocks; i++) {
			/* Mark entries as allocated */
			ppim = pmap_preinit_mapping + i;
			ppim->pa = pa;
			ppim->va = (char *)va + offset;
			ppim->size = size;
		}

		/* Map L2 blocks */
		pa = rounddown2(pa, L2_SIZE);
		/* Accumulates the previous entries to detect any valid one. */
		old_l2e = 0;
		for (i = 0; i < l2_blocks; i++) {
			pde = pmap_pde(kernel_pmap, va, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
			    va));
			KASSERT(lvl == 1,
			    ("pmap_mapbios: Invalid level %d", lvl));

			/* Insert L2_BLOCK */
			l2 = pmap_l1_to_l2(pde, va);
			old_l2e |= pmap_load_store(l2,
			    PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
			    ATTR_S1_XN | ATTR_KERN_GP |
			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);

			va += L2_SIZE;
			pa += L2_SIZE;
		}
		if ((old_l2e & ATTR_DESCR_VALID) != 0)
			pmap_s1_invalidate_all_kernel();
		else {
			/*
			 * Because the old entries were invalid and the new
			 * mappings are not executable, an isb is not required.
			 */
			dsb(ishst);
		}

		/* The loop above advanced "va"; reset it to the run's start. */
		va = preinit_map_va + (start_idx * L2_SIZE);

	} else {
		/* kva_alloc may be used to map the pages */
		offset = pa & PAGE_MASK;
		size = round_page(offset + size);

		va = (vm_offset_t)kva_alloc(size);
		if (va == 0)
			panic("%s: Couldn't allocate KVA", __func__);

		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));

		/* L3 table is linked */
		va = trunc_page(va);
		pa = trunc_page(pa);
		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
	}

	return ((void *)(va + offset));
}

/*
 * Undo a mapping established by pmap_mapbios().  "p" and "size" are compared
 * against the values recorded in pmap_preinit_mapping[], so they are expected
 * to be the pointer returned by, and the size passed to, pmap_mapbios().
 *
 * DMAP addresses are not unmapped; their attributes are reset instead.
 * Otherwise matching preinit slots are released and their L2 blocks cleared.
 * If no preinit slot matches and vm_initialized is set, the range is treated
 * as a kva_alloc() mapping and is removed and freed.
 */
void
pmap_unmapbios(void *p, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	char *va;
	vm_offset_t offset, va_trunc;
	pd_entry_t *pde;
	pt_entry_t *l2;
	int error __diagused, i, lvl, l2_blocks, block;
	bool preinit_map;

	va = p;
	if (VIRT_IN_DMAP(va)) {
		KASSERT(VIRT_IN_DMAP(va + size - 1),
		    ("%s: End address not in DMAP region: %p", __func__,
		    va + size - 1));
		/* Ensure the attributes are as expected for the DMAP region */
		PMAP_LOCK(kernel_pmap);
		error = pmap_change_props_locked(va, size,
		    PROT_READ | PROT_WRITE, VM_MEMATTR_DEFAULT, -1, false);
		PMAP_UNLOCK(kernel_pmap);
		KASSERT(error == 0, ("%s: Failed to reset DMAP attributes: %d",
		    __func__, error));

		return;
	}

	/* Number of L2 blocks spanned by [va, va + size). */
	l2_blocks =
	    (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));

	/* Remove preinit mapping */
	preinit_map = false;
	block = 0;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == va) {
			KASSERT(ppim->size == size,
			    ("pmap_unmapbios: size mismatch"));
			ppim->va = NULL;
			ppim->pa = 0;
			ppim->size = 0;
			preinit_map = true;
			/*
			 * The n-th matching slot corresponds to the n-th L2
			 * block of the mapping.
			 */
			offset = block * L2_SIZE;
			va_trunc = rounddown2((vm_offset_t)va, L2_SIZE) +
			    offset;

			/* Remove L2_BLOCK */
			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
			    va_trunc));
			l2 = pmap_l1_to_l2(pde, va_trunc);
			pmap_clear(l2);

			/* Stop once every block of the mapping is released. */
			if (block == (l2_blocks - 1))
				break;
			block++;
		}
	}
	if (preinit_map) {
		pmap_s1_invalidate_all_kernel();
		return;
	}

	/* Unmap the pages reserved with kva_alloc. */
	if (vm_initialized) {
		offset = (vm_offset_t)va & PAGE_MASK;
		size = round_page(offset + size);
		va = trunc_page(va);

		/* Unmap and invalidate the pages */
		pmap_kremove_device((vm_offset_t)va, size);

		kva_free(va, size);
	}
}

/*
 * Sets the memory attribute for the specified page.
8283 */ 8284 void 8285 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 8286 { 8287 if (m->md.pv_memattr == ma) 8288 return; 8289 8290 m->md.pv_memattr = ma; 8291 8292 /* 8293 * If "m" is a normal page, update its direct mapping. This update 8294 * can be relied upon to perform any cache operations that are 8295 * required for data coherence. 8296 */ 8297 if ((m->flags & PG_FICTITIOUS) == 0 && 8298 pmap_change_attr(VM_PAGE_TO_DMAP(m), PAGE_SIZE, 8299 m->md.pv_memattr) != 0) 8300 panic("memory attribute change on the direct map failed"); 8301 } 8302 8303 /* 8304 * Changes the specified virtual address range's memory type to that given by 8305 * the parameter "mode". The specified virtual address range must be 8306 * completely contained within either the direct map or the kernel map. If 8307 * the virtual address range is contained within the kernel map, then the 8308 * memory type for each of the corresponding ranges of the direct map is also 8309 * changed. (The corresponding ranges of the direct map are those ranges that 8310 * map the same physical pages as the specified virtual address range.) These 8311 * changes to the direct map are necessary because Intel describes the 8312 * behavior of their processors as "undefined" if two or more mappings to the 8313 * same physical page have different memory types. 8314 * 8315 * Returns zero if the change completed successfully, and either EINVAL or 8316 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 8317 * of the virtual address range was not mapped, and ENOMEM is returned if 8318 * there was insufficient memory available to complete the change. In the 8319 * latter case, the memory type may have been changed on some part of the 8320 * virtual address range or the direct map. 
 */
int
pmap_change_attr(void *va, vm_size_t size, int mode)
{
	int error;

	/*
	 * Only the memory attribute is requested here: "old_mode" is -1, so
	 * entries are not filtered on their current attribute, and
	 * "skip_unmapped" is false, so an unmapped address is an error.
	 * NOTE(review): PROT_NONE is assumed to equal VM_PROT_NONE, which
	 * pmap_change_props_locked() treats as "leave protections alone".
	 */
	PMAP_LOCK(kernel_pmap);
	error = pmap_change_props_locked(va, size, PROT_NONE, mode, -1, false);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

/*
 * Changes the memory attribute of the range
 * [DMAP_MIN_ADDRESS, dmap_max_addr) to "mode".  Only entries whose current
 * attribute matches "dmap_attr" are changed, and unmapped addresses are
 * skipped.  On success "dmap_attr" is updated to "mode".  Returns the
 * result of pmap_change_props_locked().
 */
int
pmap_change_dmap_attr(int mode)
{
	int error;

	KASSERT(mode == VM_MEMATTR_WRITE_BACK ||
	    mode == VM_MEMATTR_TAGGED,
	    ("%s: mode %d must be compatible with write-back", __func__, mode));

	PMAP_LOCK(kernel_pmap);
	error = pmap_change_props_locked((void *)DMAP_MIN_ADDRESS,
	    dmap_max_addr - DMAP_MIN_ADDRESS, PROT_NONE, mode, dmap_attr, true);
	if (error == 0)
		dmap_attr = mode;
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

/*
 * Changes the specified virtual address range's protections to those
 * specified by "prot".  Like pmap_change_attr(), protections for aliases
 * in the direct map are updated as well.  Protections on aliasing mappings may
 * be a subset of the requested protections; for example, mappings in the direct
 * map are never executable.
 */
int
pmap_change_prot(void *va, vm_size_t size, vm_prot_t prot)
{
	int error;

	/* Only supported within the kernel map. */
	if ((vm_offset_t)va < VM_MIN_KERNEL_ADDRESS)
		return (EINVAL);

	/* "mode" and "old_mode" are -1: the memory attribute is untouched. */
	PMAP_LOCK(kernel_pmap);
	error = pmap_change_props_locked(va, size, prot, -1, -1, false);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

/*
 * Changes the memory attribute and/or the protections of the kernel
 * mappings covering [addr, addr + size), after rounding the range out to
 * page boundaries.  The kernel pmap lock must be held by the caller.
 *
 * "mode" is the new memory attribute, or -1 to leave it unchanged.
 * "prot" is the new protection, or VM_PROT_NONE to leave it unchanged.
 * "old_mode", if not -1, restricts the update to entries whose current
 * memory attribute index is "old_mode"; other entries are skipped.
 * "skip_unmapped" selects whether an unmapped address is skipped (true)
 * or makes the function fail (false).
 *
 * Block and contiguous mappings that are only partially covered by the
 * remaining range are demoted before being updated.  For an address
 * outside the direct map whose physical address is within the direct map,
 * the direct map alias is updated by a recursive call.
 *
 * Returns 0 on success.  Returns EINVAL if the start of the range is
 * neither in the direct map nor in the kernel address range, if an address
 * is unmapped and "skip_unmapped" is false, or if a demotion fails.
 */
static int
pmap_change_props_locked(void *addr, vm_size_t size, vm_prot_t prot,
    int mode, int old_mode, bool skip_unmapped)
{
	vm_offset_t base, offset, tmpva, va;
	vm_size_t pte_size;
	vm_paddr_t pa;
	pt_entry_t pte, *ptep, *newpte;
	pt_entry_t bits, mask, old_mode_bits, old_mode_mask;
	char *tmpptep;
	int lvl, rv;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	va = (vm_offset_t)addr;
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	if (!VIRT_IN_DMAP(base) &&
	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
		return (EINVAL);

	/*
	 * Build the PTE bits to install ("bits") and the set of PTE bits
	 * that are replaced ("mask").  With both left at zero, no entry
	 * field is selected for change.
	 */
	bits = old_mode_bits = 0;
	mask = old_mode_mask = 0;
	if (mode != -1) {
		bits = ATTR_S1_IDX(mode);
		mask = ATTR_S1_IDX_MASK;
		if (mode == VM_MEMATTR_DEVICE) {
			mask |= ATTR_S1_XN;
			bits |= ATTR_S1_XN;
		}
	}
	if (old_mode != -1) {
		old_mode_bits = ATTR_S1_IDX(old_mode);
		old_mode_mask = ATTR_S1_IDX_MASK;
	}
	if (prot != VM_PROT_NONE) {
		/* Don't mark the DMAP as executable. It never is on arm64. */
		if (VIRT_IN_DMAP(base)) {
			prot &= ~VM_PROT_EXECUTE;
			/*
			 * XXX Mark the DMAP as writable for now. We rely
			 * on this in ddb & dtrace to insert breakpoint
			 * instructions.
			 */
			prot |= VM_PROT_WRITE;
		}

		if ((prot & VM_PROT_WRITE) == 0) {
			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
		}
		if ((prot & VM_PROT_EXECUTE) == 0) {
			bits |= ATTR_S1_PXN;
		}
		bits |= ATTR_S1_UXN;
		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
	}

	/* "tmpva" is advanced by the size of each entry that is handled. */
	for (tmpva = base; tmpva < base + size; ) {
		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
		if (ptep == NULL && !skip_unmapped) {
			return (EINVAL);
		} else if ((ptep == NULL && skip_unmapped) ||
		    (pmap_load(ptep) & mask) == bits ||
		    (pmap_load(ptep) & old_mode_mask) != old_mode_bits) {
			/*
			 * We already have one of the following meaning
			 * we can skip this memory region:
			 *  - No memory mapped at this address
			 *  - The new attributes are already set
			 *  - The expected attributes are incorrect
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
				break;
			case 2:
				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
				break;
			case 3:
				tmpva += PAGE_SIZE;
				break;
			}
		} else {
			/* We can't demote/promote this entry */
			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);

			/*
			 * Find the entry and demote it if the requested change
			 * only applies to part of the address range mapped by
			 * the entry.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
				if ((tmpva & L1_OFFSET) == 0 &&
				    (base + size - tmpva) >= L1_SIZE) {
					pte_size = L1_SIZE;
					break;
				}
				newpte = pmap_demote_l1(kernel_pmap, ptep,
				    tmpva & ~L1_OFFSET);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l1_to_l2(ptep, tmpva);
				/* FALLTHROUGH */
			case 2:
				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
					if ((tmpva & L2C_OFFSET) == 0 &&
					    (base + size - tmpva) >= L2C_SIZE) {
						pte_size = L2C_SIZE;
						break;
					}
					if (!pmap_demote_l2c(kernel_pmap, ptep,
					    tmpva))
						return (EINVAL);
				}
				if ((tmpva & L2_OFFSET) == 0 &&
				    (base + size - tmpva) >= L2_SIZE) {
					pte_size = L2_SIZE;
					break;
				}
				newpte = pmap_demote_l2(kernel_pmap, ptep,
				    tmpva);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l2_to_l3(ptep, tmpva);
				/* FALLTHROUGH */
			case 3:
				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
					if ((tmpva & L3C_OFFSET) == 0 &&
					    (base + size - tmpva) >= L3C_SIZE) {
						pte_size = L3C_SIZE;
						break;
					}
					if (!pmap_demote_l3c(kernel_pmap, ptep,
					    tmpva))
						return (EINVAL);
				}
				pte_size = PAGE_SIZE;
				break;
			}

			/*
			 * If the entry being rewritten lies within the very
			 * range it maps, [tmpva, tmpva + pte_size), access it
			 * through the temporary cmap1 mapping, which is
			 * installed below under cmap_lock, rather than
			 * through the mapping that is being changed.
			 */
			tmpptep = 0;
			if (tmpva <= (vm_offset_t)ptep &&
			    tmpva + pte_size > (vm_offset_t)ptep) {
				vm_paddr_t pte_pa;

				mtx_lock(&cmap_lock);
				tmpptep = cmap1_addr;
				pte_pa = DMAP_TO_PHYS((vm_offset_t)ptep);
				pmap_store(cmap1_pte, ATTR_AF |
				    pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) |
				    ATTR_S1_XN | ATTR_KERN_GP |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    PHYS_TO_PTE(pte_pa &~L3_OFFSET) | L3_PAGE);
				dsb(ishst);
				ptep = (pt_entry_t *)(tmpptep +
				    ((vm_offset_t)ptep & PAGE_MASK));
			}

			/* Update the entry */
			pte = pmap_load(ptep);
			pte &= ~mask;
			pte |= bits;

			switch (pte_size) {
			case L2C_SIZE:
				pmap_update_strided(kernel_pmap, ptep, ptep +
				    L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
				break;
			case L3C_SIZE:
				pmap_update_strided(kernel_pmap, ptep, ptep +
				    L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
				break;
			default:
				/*
				 * We are updating a single block or page entry,
				 * so regardless of pte_size pass PAGE_SIZE in
				 * order that a single TLB invalidation is
				 * performed.
				 */
				pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
				    PAGE_SIZE);
				break;
			}

			/* Tear down the temporary cmap1 mapping, if used. */
			if (tmpptep != 0) {
				pmap_clear(cmap1_pte);
				pmap_s1_invalidate_page(kernel_pmap,
				    (vm_offset_t)tmpptep, true);
				mtx_unlock(&cmap_lock);
			}

			pa = PTE_TO_PHYS(pte);
			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
				int dmap_mode;

				/*
				 * When booting on HW with MTE enabled we may
				 * need to swap to a tagged type for the DMAP
				 * to allow tags to be set through it.
				 */
				if (mode == VM_MEMATTR_WRITE_BACK)
					dmap_mode = dmap_attr;
				else
					dmap_mode = mode;

				/*
				 * Keep the DMAP memory in sync.
				 */
				rv = pmap_change_props_locked(
				    PHYS_TO_DMAP(pa), pte_size,
				    prot, dmap_mode, old_mode, true);
				if (rv != 0)
					return (rv);
			}

			/*
			 * If moving to a non-cacheable entry flush
			 * the cache.
			 */
			if (mode == VM_MEMATTR_UNCACHEABLE)
				cpu_dcache_wbinv_range((void *)tmpva, pte_size);
			tmpva += pte_size;
		}
	}

	return (0);
}

/*
 * Create an L2 table to map all addresses within an L1 mapping.
8610 */ 8611 static pt_entry_t * 8612 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) 8613 { 8614 pt_entry_t *l2, newl2, oldl1; 8615 char *tmpl1; 8616 vm_paddr_t l2phys, phys; 8617 vm_page_t ml2; 8618 int i; 8619 8620 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 8621 oldl1 = pmap_load(l1); 8622 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 8623 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, 8624 ("pmap_demote_l1: Demoting a non-block entry")); 8625 KASSERT((va & L1_OFFSET) == 0, 8626 ("pmap_demote_l1: Invalid virtual address %#lx", va)); 8627 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, 8628 ("pmap_demote_l1: Level 1 table shouldn't be managed")); 8629 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0, 8630 ("pmap_demote_l1: Demoting entry with no-demote flag set")); 8631 8632 tmpl1 = NULL; 8633 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { 8634 tmpl1 = kva_alloc(PAGE_SIZE); 8635 if (tmpl1 == NULL) 8636 return (NULL); 8637 } 8638 8639 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) == 8640 NULL) { 8641 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" 8642 " in pmap %p", va, pmap); 8643 l2 = NULL; 8644 goto fail; 8645 } 8646 8647 l2phys = VM_PAGE_TO_PHYS(ml2); 8648 l2 = PHYS_TO_DMAP(l2phys); 8649 8650 /* Address the range points at */ 8651 phys = PTE_TO_PHYS(oldl1); 8652 /* The attributed from the old l1 table to be copied */ 8653 newl2 = oldl1 & ATTR_MASK; 8654 8655 /* Create the new entries */ 8656 newl2 |= ATTR_CONTIGUOUS; 8657 for (i = 0; i < Ln_ENTRIES; i++) { 8658 l2[i] = newl2 | phys; 8659 phys += L2_SIZE; 8660 } 8661 KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | 8662 L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], 8663 ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); 8664 8665 if (tmpl1 != NULL) { 8666 pmap_kenter((vm_offset_t)tmpl1, PAGE_SIZE, 8667 DMAP_TO_PHYS(l1) & ~L3_OFFSET, 8668 VM_MEMATTR_WRITE_BACK); 8669 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); 8670 } 8671 8672 
pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); 8673 8674 counter_u64_add(pmap_l1_demotions, 1); 8675 fail: 8676 if (tmpl1 != NULL) { 8677 pmap_kremove((vm_offset_t)tmpl1); 8678 kva_free(tmpl1, PAGE_SIZE); 8679 } 8680 8681 return (l2); 8682 } 8683 8684 static void 8685 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) 8686 { 8687 pt_entry_t *l3; 8688 8689 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { 8690 *l3 = newl3; 8691 newl3 += L3_SIZE; 8692 } 8693 } 8694 8695 static void 8696 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused) 8697 { 8698 #ifdef INVARIANTS 8699 #ifdef DIAGNOSTIC 8700 pt_entry_t *xl3p, *yl3p; 8701 8702 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES; 8703 xl3p++, newl3e += PAGE_SIZE) { 8704 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) { 8705 printf("pmap_demote_l2: xl3e %zd and newl3e map " 8706 "different pages: found %#lx, expected %#lx\n", 8707 xl3p - firstl3p, pmap_load(xl3p), newl3e); 8708 printf("page table dump\n"); 8709 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES; 8710 yl3p++) { 8711 printf("%zd %#lx\n", yl3p - firstl3p, 8712 pmap_load(yl3p)); 8713 } 8714 panic("firstpte"); 8715 } 8716 } 8717 #else 8718 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e), 8719 ("pmap_demote_l2: firstl3 and newl3e map different physical" 8720 " addresses")); 8721 #endif 8722 #endif 8723 } 8724 8725 static void 8726 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, 8727 struct rwlock **lockp) 8728 { 8729 struct spglist free; 8730 8731 SLIST_INIT(&free); 8732 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true, 8733 &free, lockp); 8734 vm_page_free_pages_toq(&free, true); 8735 } 8736 8737 /* 8738 * Create an L3 table to map all addresses within an L2 mapping. 
 */
/*
 * "l2" points to the L2 block entry mapping "va" in "pmap", whose lock must
 * be held.  "lockp" is handed on to the PV and removal helpers called here.
 *
 * Returns a pointer to the L3 table that now maps the range.  Returns NULL
 * if temporary KVA could not be allocated, or if the mapping was removed
 * instead of demoted: either it was neither accessed nor wired, or no page
 * table page could be allocated.
 */
static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
    struct rwlock **lockp)
{
	pt_entry_t *l3, newl3, oldl2;
	char *tmpl2;
	vm_paddr_t l3phys;
	vm_page_t ml3;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	/* "l3" stays NULL on every path that reaches "fail" early. */
	l3 = NULL;
	oldl2 = pmap_load(l2);
	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
	    ("pmap_demote_l2: Demoting a non-block entry"));
	KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
	    ("pmap_demote_l2: Demoting entry with no-demote flag set"));
	va &= ~L2_OFFSET;

	/*
	 * If the L2 entry itself lies within the 2MB range that it maps,
	 * reserve a page of KVA so that the entry can be reached through a
	 * temporary mapping while it is being replaced.
	 */
	tmpl2 = NULL;
	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
		tmpl2 = kva_alloc(PAGE_SIZE);
		if (tmpl2 == NULL)
			return (NULL);
	}

	/*
	 * Invalidate the 2MB page mapping and return "failure" if the
	 * mapping was never accessed and not wired.
	 */
	if ((oldl2 & ATTR_AF) == 0) {
		if ((oldl2 & ATTR_SW_WIRED) == 0) {
			pmap_demote_l2_abort(pmap, va, l2, lockp);
			CTR2(KTR_PMAP,
			    "pmap_demote_l2: failure for va %#lx in pmap %p",
			    va, pmap);
			goto fail;
		}
		ml3 = pmap_remove_pt_page(pmap, va);
		/* Fill the PTP with L3Es that have ATTR_AF cleared. */
		ml3->valid = 0;
	} else if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
		    ("pmap_demote_l2: page table page for a wired mapping"
		    " is missing"));

		/*
		 * If the page table page is missing and the mapping
		 * is for a kernel address, the mapping must belong to
		 * either the direct map or the early kernel memory.
		 * Page table pages are preallocated for every other
		 * part of the kernel address space, so the direct map
		 * region and early kernel memory are the only parts of the
		 * kernel address space that must be handled here.
		 */
		KASSERT(ADDR_IS_USER(va) || VIRT_IN_DMAP(va) ||
		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
		    ("pmap_demote_l2: No saved mpte for va %#lx", va));

		/*
		 * If the 2MB page mapping belongs to the direct map
		 * region of the kernel's address space, then the page
		 * allocation request specifies the highest possible
		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
		 * priority is normal.
		 */
		ml3 = vm_page_alloc_noobj(
		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
		    VM_ALLOC_WIRED);

		/*
		 * If the allocation of the new page table page fails,
		 * invalidate the 2MB page mapping and return "failure".
		 */
		if (ml3 == NULL) {
			pmap_demote_l2_abort(pmap, va, l2, lockp);
			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
			    " in pmap %p", va, pmap);
			goto fail;
		}
		ml3->pindex = pmap_l2_pindex(va);

		if (ADDR_IS_USER(va)) {
			ml3->ref_count = NL3PG;
			pmap_resident_count_inc(pmap, 1);
		}
	}
	l3phys = VM_PAGE_TO_PHYS(ml3);
	l3 = PHYS_TO_DMAP(l3phys);
	newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));

	/*
	 * If the PTP is not leftover from an earlier promotion or it does not
	 * have ATTR_AF set in every L3E, then fill it.  The new L3Es will all
	 * have ATTR_AF set, unless this is a wired mapping with ATTR_AF clear.
	 *
	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
	 * performs a dsb().  That dsb() ensures that the stores for filling
	 * "l3" are visible before "l3" is added to the page table.
	 */
	if (!vm_page_all_valid(ml3))
		pmap_fill_l3(l3, newl3);

	pmap_demote_l2_check(l3, newl3);

	/*
	 * If the mapping has changed attributes, update the L3Es.
	 */
	if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
		pmap_fill_l3(l3, newl3);

	/*
	 * Map the temporary page so we don't lose access to the l2 table.
	 */
	if (tmpl2 != NULL) {
		pmap_kenter((vm_offset_t)tmpl2, PAGE_SIZE,
		    DMAP_TO_PHYS(l2) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
	}

	/*
	 * The spare PV entries must be reserved prior to demoting the
	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
	 * of the L2 and the PV lists will be inconsistent, which can result
	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
	 * PV entry for the 2MB page mapping that is being demoted.
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);

	/*
	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
	 * the 2MB page mapping.
	 */
	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);

	/*
	 * Demote the PV entry.
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);

	counter_u64_add(pmap_l2_demotions, 1);
	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
	    " in pmap %p %lx", va, pmap, l3[0]);

fail:
	/* Shared exit: release the temporary KVA, if any was reserved. */
	if (tmpl2 != NULL) {
		pmap_kremove((vm_offset_t)tmpl2);
		kva_free(tmpl2, PAGE_SIZE);
	}

	return (l3);

}

/*
 * Wrapper around pmap_demote_l2_locked() for callers that do not hold a PV
 * list lock: a lock pointer initialized to NULL is passed in, and if the
 * callee left a lock in it, that lock is write-unlocked before returning.
 */
static pt_entry_t *
pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
{
	struct rwlock *lock;
	pt_entry_t *l3;

	lock = NULL;
	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	return (l3);
}

/*
 * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
 */
/*
 * "l2p" points to any one of the L2 entries of the superpage mapping "va".
 * Returns true on success, or false if temporary KVA could not be
 * allocated.  The pmap lock must be held.
 */
static bool
pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
	char *tmpl3;
	register_t intr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	/* Round "l2p" down to the first entry of its L2C_ENTRIES group. */
	l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
	    sizeof(pd_entry_t)) - 1));
	l2c_end = l2c_start + L2C_ENTRIES;
	/*
	 * If the entries being rewritten overlap the virtual range that
	 * they map, access them through a temporary mapping of the page
	 * table page, because the mappings are made invalid below.
	 */
	tmpl3 = NULL;
	if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
	    (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
		tmpl3 = kva_alloc(PAGE_SIZE);
		if (tmpl3 == NULL)
			return (false);
		pmap_kenter((vm_offset_t)tmpl3, PAGE_SIZE,
		    DMAP_TO_PHYS(l2c_start) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l2c_start = (pd_entry_t *)(tmpl3 +
		    ((vm_offset_t)l2c_start & PAGE_MASK));
		l2c_end = (pd_entry_t *)(tmpl3 +
		    ((vm_offset_t)l2c_end & PAGE_MASK));
	}
	/*
	 * "mask" collects the bit to clear when any entry is dirty, and
	 * "nbits" the bits to set in every remade entry.
	 */
	mask = 0;
	nbits = ATTR_DESCR_VALID;
	/* Interrupts stay disabled until the mappings have been remade. */
	intr = intr_disable();

	/*
	 * Break the mappings.
	 */
	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
		/*
		 * Clear the mapping's contiguous and valid bits, but leave
		 * the rest of the entry unchanged, so that a lockless,
		 * concurrent pmap_kextract() can still lookup the physical
		 * address.
		 */
		l2e = pmap_load(tl2p);
		KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
		    ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
		KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
		    ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
		while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
		    ATTR_DESCR_VALID)))
			cpu_spinwait();

		/*
		 * Hardware accessed and dirty bit maintenance might only
		 * update a single L2 entry, so we must combine the accessed
		 * and dirty bits from this entire set of contiguous L2
		 * entries.
		 */
		if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
			mask = ATTR_S1_AP_RW_BIT;
		nbits |= l2e & ATTR_AF;
	}
	/* The TLB is only invalidated if some entry had ATTR_AF set. */
	if ((nbits & ATTR_AF) != 0) {
		pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
		    L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
	}

	/*
	 * Remake the mappings, updating the accessed and dirty bits.
	 */
	l2e = (pmap_load(l2c_start) & ~mask) | nbits;
	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
		pmap_store(tl2p, l2e);
		l2e += L2_SIZE;
	}
	dsb(ishst);

	intr_restore(intr);
	if (tmpl3 != NULL) {
		pmap_kremove((vm_offset_t)tmpl3);
		kva_free(tmpl3, PAGE_SIZE);
	}
	counter_u64_add(pmap_l2c_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
	    va, pmap);
	return (true);
}

/*
 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
 *
 * "l3p" points to any one of the L3 entries of the superpage mapping "va".
 * Returns true on success, or false if temporary KVA could not be
 * allocated.  The pmap lock must be held.
 */
static bool
pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
{
	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
	char *tmpl3;
	register_t intr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Round "l3p" down to the first entry of its L3C_ENTRIES group. */
	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
	    sizeof(pt_entry_t)) - 1));
	l3c_end = l3c_start + L3C_ENTRIES;
	/*
	 * If the entries being rewritten overlap the virtual range that
	 * they map, access them through a temporary mapping of the page
	 * table page, because the mappings are made invalid below.
	 */
	tmpl3 = NULL;
	if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
	    (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
		tmpl3 = kva_alloc(PAGE_SIZE);
		if (tmpl3 == NULL)
			return (false);
		pmap_kenter((vm_offset_t)tmpl3, PAGE_SIZE,
		    DMAP_TO_PHYS(l3c_start) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l3c_start = (pt_entry_t *)(tmpl3 +
		    ((vm_offset_t)l3c_start & PAGE_MASK));
		l3c_end = (pt_entry_t *)(tmpl3 +
		    ((vm_offset_t)l3c_end & PAGE_MASK));
	}
	/*
	 * "mask" collects the bit to clear when any entry is dirty, and
	 * "nbits" the bits to set in every remade entry.
	 */
	mask = 0;
	nbits = ATTR_DESCR_VALID;
	/* Interrupts stay disabled until the mappings have been remade. */
	intr = intr_disable();

	/*
	 * Break the mappings.
	 */
	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
		/*
		 * Clear the mapping's contiguous and valid bits, but leave
		 * the rest of the entry unchanged, so that a lockless,
		 * concurrent pmap_kextract() can still lookup the physical
		 * address.
		 */
		l3e = pmap_load(tl3p);
		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
		    ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
		KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
		    ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
		while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
		    ATTR_DESCR_VALID)))
			cpu_spinwait();

		/*
		 * Hardware accessed and dirty bit maintenance might only
		 * update a single L3 entry, so we must combine the accessed
		 * and dirty bits from this entire set of contiguous L3
		 * entries.
		 */
		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
			mask = ATTR_S1_AP_RW_BIT;
		nbits |= l3e & ATTR_AF;
	}
	/* The TLB is only invalidated if some entry had ATTR_AF set. */
	if ((nbits & ATTR_AF) != 0) {
		pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
		    ~L3C_OFFSET, true);
	}

	/*
	 * Remake the mappings, updating the accessed and dirty bits.
	 */
	l3e = (pmap_load(l3c_start) & ~mask) | nbits;
	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
		pmap_store(tl3p, l3e);
		l3e += L3_SIZE;
	}
	dsb(ishst);

	intr_restore(intr);
	if (tmpl3 != NULL) {
		pmap_kremove((vm_offset_t)tmpl3);
		kva_free(tmpl3, PAGE_SIZE);
	}
	counter_u64_add(pmap_l3c_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
	    va, pmap);
	return (true);
}

/*
 * Accumulate the accessed and dirty bits within a L3C superpage and
 * return the specified PTE with them applied correctly.
9099 */ 9100 static pt_entry_t 9101 pmap_load_l3c(pt_entry_t *l3p) 9102 { 9103 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p; 9104 9105 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 9106 sizeof(pt_entry_t)) - 1)); 9107 l3c_end = l3c_start + L3C_ENTRIES; 9108 mask = 0; 9109 nbits = 0; 9110 /* Iterate over each mapping in the superpage. */ 9111 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) { 9112 l3e = pmap_load(tl3p); 9113 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 9114 ("pmap_load_l3c: missing ATTR_CONTIGUOUS")); 9115 /* Update mask if the current page has its dirty bit set. */ 9116 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 9117 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)) 9118 mask = ATTR_S1_AP_RW_BIT; 9119 /* Update nbits if the accessed bit is set. */ 9120 nbits |= l3e & ATTR_AF; 9121 } 9122 return ((pmap_load(l3p) & ~mask) | nbits); 9123 } 9124 9125 /* 9126 * Perform the pmap work for mincore(2). If the page is not both referenced and 9127 * modified by this pmap, returns its physical address so that the caller can 9128 * find other mappings. 9129 */ 9130 int 9131 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 9132 { 9133 pt_entry_t *pte, tpte; 9134 vm_paddr_t mask, pa; 9135 int lvl, psind, val; 9136 bool managed; 9137 9138 PMAP_ASSERT_STAGE1(pmap); 9139 PMAP_LOCK(pmap); 9140 pte = pmap_pte(pmap, addr, &lvl); 9141 if (pte != NULL) { 9142 tpte = pmap_load(pte); 9143 9144 switch (lvl) { 9145 case 3: 9146 mask = L3_OFFSET; 9147 psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 
1 : 0; 9148 break; 9149 case 2: 9150 mask = L2_OFFSET; 9151 psind = 2; 9152 break; 9153 case 1: 9154 mask = L1_OFFSET; 9155 psind = 3; 9156 break; 9157 default: 9158 panic("pmap_mincore: invalid level %d", lvl); 9159 } 9160 9161 managed = (tpte & ATTR_SW_MANAGED) != 0; 9162 val = MINCORE_INCORE | MINCORE_PSIND(psind); 9163 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && 9164 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) 9165 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 9166 if ((tpte & ATTR_AF) == ATTR_AF) 9167 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 9168 9169 pa = PTE_TO_PHYS(tpte) | (addr & mask); 9170 } else { 9171 managed = false; 9172 val = 0; 9173 } 9174 9175 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 9176 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 9177 *pap = pa; 9178 } 9179 PMAP_UNLOCK(pmap); 9180 return (val); 9181 } 9182 9183 /* 9184 * Garbage collect every ASID that is neither active on a processor nor 9185 * reserved. 9186 */ 9187 static void 9188 pmap_reset_asid_set(pmap_t pmap) 9189 { 9190 pmap_t curpmap; 9191 int asid, cpuid, epoch; 9192 struct asid_set *set; 9193 enum pmap_stage stage; 9194 9195 set = pmap->pm_asid_set; 9196 stage = pmap->pm_stage; 9197 9198 set = pmap->pm_asid_set; 9199 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 9200 mtx_assert(&set->asid_set_mutex, MA_OWNED); 9201 9202 /* 9203 * Ensure that the store to asid_epoch is globally visible before the 9204 * loads from pc_curpmap are performed. 
9205 */ 9206 epoch = set->asid_epoch + 1; 9207 if (epoch == INT_MAX) 9208 epoch = 0; 9209 set->asid_epoch = epoch; 9210 dsb(ishst); 9211 if (stage == PM_STAGE1) { 9212 __asm __volatile("tlbi vmalle1is"); 9213 } else { 9214 KASSERT(pmap_clean_stage2_tlbi != NULL, 9215 ("%s: Unset stage 2 tlb invalidation callback\n", 9216 __func__)); 9217 pmap_clean_stage2_tlbi(); 9218 } 9219 dsb(ish); 9220 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, 9221 set->asid_set_size - 1); 9222 CPU_FOREACH(cpuid) { 9223 if (cpuid == curcpu) 9224 continue; 9225 if (stage == PM_STAGE1) { 9226 curpmap = pcpu_find(cpuid)->pc_curpmap; 9227 PMAP_ASSERT_STAGE1(pmap); 9228 } else { 9229 curpmap = pcpu_find(cpuid)->pc_curvmpmap; 9230 if (curpmap == NULL) 9231 continue; 9232 PMAP_ASSERT_STAGE2(pmap); 9233 } 9234 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); 9235 asid = COOKIE_TO_ASID(curpmap->pm_cookie); 9236 if (asid == -1) 9237 continue; 9238 bit_set(set->asid_set, asid); 9239 curpmap->pm_cookie = COOKIE_FROM(asid, epoch); 9240 } 9241 } 9242 9243 /* 9244 * Allocate a new ASID for the specified pmap. 9245 */ 9246 static void 9247 pmap_alloc_asid(pmap_t pmap) 9248 { 9249 struct asid_set *set; 9250 int new_asid; 9251 9252 set = pmap->pm_asid_set; 9253 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 9254 9255 mtx_lock_spin(&set->asid_set_mutex); 9256 9257 /* 9258 * While this processor was waiting to acquire the asid set mutex, 9259 * pmap_reset_asid_set() running on another processor might have 9260 * updated this pmap's cookie to the current epoch. In which case, we 9261 * don't need to allocate a new ASID. 
9262 */ 9263 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) 9264 goto out; 9265 9266 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, 9267 &new_asid); 9268 if (new_asid == -1) { 9269 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 9270 set->asid_next, &new_asid); 9271 if (new_asid == -1) { 9272 pmap_reset_asid_set(pmap); 9273 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 9274 set->asid_set_size, &new_asid); 9275 KASSERT(new_asid != -1, ("ASID allocation failure")); 9276 } 9277 } 9278 bit_set(set->asid_set, new_asid); 9279 set->asid_next = new_asid + 1; 9280 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); 9281 out: 9282 mtx_unlock_spin(&set->asid_set_mutex); 9283 } 9284 9285 static uint64_t __read_mostly ttbr_flags; 9286 9287 /* 9288 * Compute the value that should be stored in ttbr0 to activate the specified 9289 * pmap. This value may change from time to time. 9290 */ 9291 uint64_t 9292 pmap_to_ttbr0(pmap_t pmap) 9293 { 9294 uint64_t ttbr; 9295 9296 ttbr = pmap->pm_ttbr; 9297 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 9298 ttbr |= ttbr_flags; 9299 9300 return (ttbr); 9301 } 9302 9303 static void 9304 pmap_set_cnp(void *arg) 9305 { 9306 uint64_t ttbr0, ttbr1; 9307 u_int cpuid; 9308 9309 cpuid = *(u_int *)arg; 9310 if (cpuid == curcpu) { 9311 /* 9312 * Set the flags while all CPUs are handling the 9313 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls 9314 * to pmap_to_ttbr0 after this will have the CnP flag set. 9315 * The dsb after invalidating the TLB will act as a barrier 9316 * to ensure all CPUs can observe this change. 
9317 */ 9318 ttbr_flags |= TTBR_CnP; 9319 } 9320 9321 ttbr0 = READ_SPECIALREG(ttbr0_el1); 9322 ttbr0 |= TTBR_CnP; 9323 9324 ttbr1 = READ_SPECIALREG(ttbr1_el1); 9325 ttbr1 |= TTBR_CnP; 9326 9327 /* Update ttbr{0,1}_el1 with the CnP flag */ 9328 WRITE_SPECIALREG(ttbr0_el1, ttbr0); 9329 WRITE_SPECIALREG(ttbr1_el1, ttbr1); 9330 isb(); 9331 __asm __volatile("tlbi vmalle1is"); 9332 dsb(ish); 9333 isb(); 9334 } 9335 9336 /* 9337 * Defer enabling some features until we have read the ID registers to know 9338 * if they are supported on all CPUs. 9339 */ 9340 static void 9341 pmap_init_mp(void *dummy __unused) 9342 { 9343 uint64_t reg; 9344 9345 get_kernel_reg(ID_AA64PFR1_EL1, ®); 9346 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) { 9347 if (bootverbose) 9348 printf("Enabling BTI\n"); 9349 pmap_bti_support = true; 9350 9351 pmap_bti_ranges_zone = uma_zcreate("BTI ranges", 9352 sizeof(struct rs_el), NULL, NULL, NULL, NULL, 9353 UMA_ALIGN_PTR, 0); 9354 } 9355 } 9356 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL); 9357 9358 /* 9359 * Defer enabling CnP until we have read the ID registers to know if it's 9360 * supported on all CPUs. 
9361 */ 9362 static void 9363 pmap_init_cnp(void *dummy __unused) 9364 { 9365 uint64_t reg; 9366 u_int cpuid; 9367 9368 get_kernel_reg(ID_AA64MMFR2_EL1, ®); 9369 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) { 9370 if (bootverbose) 9371 printf("Enabling CnP\n"); 9372 cpuid = curcpu; 9373 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid); 9374 } 9375 9376 } 9377 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL); 9378 9379 static bool 9380 pmap_activate_int(struct thread *td, pmap_t pmap) 9381 { 9382 struct asid_set *set; 9383 int epoch; 9384 9385 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); 9386 KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); 9387 9388 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) || 9389 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) { 9390 /* 9391 * Handle the possibility that the old thread was preempted 9392 * after an "ic" or "tlbi" instruction but before it performed 9393 * a "dsb" instruction. If the old thread migrates to a new 9394 * processor, its completion of a "dsb" instruction on that 9395 * new processor does not guarantee that the "ic" or "tlbi" 9396 * instructions performed on the old processor have completed. 9397 */ 9398 dsb(ish); 9399 return (false); 9400 } 9401 9402 set = pmap->pm_asid_set; 9403 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 9404 9405 /* 9406 * Ensure that the store to curpmap is globally visible before the 9407 * load from asid_epoch is performed. 
9408 */ 9409 if (pmap->pm_stage == PM_STAGE1) 9410 PCPU_SET(curpmap, pmap); 9411 else 9412 PCPU_SET(curvmpmap, pmap); 9413 dsb(ish); 9414 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie); 9415 if (epoch >= 0 && epoch != set->asid_epoch) 9416 pmap_alloc_asid(pmap); 9417 9418 if (pmap->pm_stage == PM_STAGE1) { 9419 uint64_t new_tcr, tcr; 9420 9421 new_tcr = td->td_proc->p_md.md_tcr; 9422 tcr = READ_SPECIALREG(tcr_el1); 9423 if ((tcr & MD_TCR_FIELDS) != new_tcr) { 9424 tcr &= ~MD_TCR_FIELDS; 9425 tcr |= new_tcr; 9426 WRITE_SPECIALREG(tcr_el1, tcr); 9427 } 9428 set_ttbr0(pmap_to_ttbr0(pmap)); 9429 if (PCPU_GET(bcast_tlbi_workaround) != 0) 9430 invalidate_local_icache(); 9431 } 9432 return (true); 9433 } 9434 9435 void 9436 pmap_activate_vm(pmap_t pmap) 9437 { 9438 9439 PMAP_ASSERT_STAGE2(pmap); 9440 9441 (void)pmap_activate_int(NULL, pmap); 9442 } 9443 9444 void 9445 pmap_activate(struct thread *td) 9446 { 9447 pmap_t pmap; 9448 9449 pmap = vmspace_pmap(td->td_proc->p_vmspace); 9450 PMAP_ASSERT_STAGE1(pmap); 9451 critical_enter(); 9452 (void)pmap_activate_int(td, pmap); 9453 critical_exit(); 9454 } 9455 9456 /* 9457 * Activate the thread we are switching to. 9458 * To simplify the assembly in cpu_throw return the new threads pcb. 9459 */ 9460 struct pcb * 9461 pmap_switch(struct thread *new) 9462 { 9463 pcpu_bp_harden bp_harden; 9464 struct pcb *pcb; 9465 uint64_t sctlr; 9466 9467 /* Store the new curthread */ 9468 PCPU_SET(curthread, new); 9469 9470 /* And the new pcb */ 9471 pcb = new->td_pcb; 9472 PCPU_SET(curpcb, pcb); 9473 9474 if ((new->td_proc->p_flag & P_KPROC) == 0) { 9475 sctlr = READ_SPECIALREG(sctlr_el1); 9476 if ((sctlr & SCTLR_USER_MASK) != new->td_md.md_sctlr) { 9477 sctlr &= ~SCTLR_USER_MASK; 9478 sctlr |= new->td_md.md_sctlr; 9479 WRITE_SPECIALREG(sctlr_el1, sctlr); 9480 isb(); 9481 } 9482 } 9483 9484 /* 9485 * TODO: We may need to flush the cache here if switching 9486 * to a user process. 
9487 */ 9488 9489 if (pmap_activate_int(new, vmspace_pmap(new->td_proc->p_vmspace))) { 9490 /* 9491 * Stop userspace from training the branch predictor against 9492 * other processes. This will call into a CPU specific 9493 * function that clears the branch predictor state. 9494 */ 9495 bp_harden = PCPU_GET(bp_harden); 9496 if (bp_harden != NULL) 9497 bp_harden(); 9498 } 9499 9500 return (pcb); 9501 } 9502 9503 void 9504 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 9505 { 9506 9507 PMAP_ASSERT_STAGE1(pmap); 9508 KASSERT(ADDR_IS_CANONICAL(va), 9509 ("%s: Address not in canonical form: %lx", __func__, va)); 9510 9511 if (ADDR_IS_KERNEL(va)) { 9512 cpu_icache_sync_range((void *)va, sz); 9513 } else { 9514 u_int len, offset; 9515 vm_paddr_t pa; 9516 9517 /* Find the length of data in this page to flush */ 9518 offset = va & PAGE_MASK; 9519 len = imin(PAGE_SIZE - offset, sz); 9520 9521 while (sz != 0) { 9522 /* Extract the physical address & find it in the DMAP */ 9523 pa = pmap_extract(pmap, va); 9524 if (pa != 0) 9525 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); 9526 9527 /* Move to the next page */ 9528 sz -= len; 9529 va += len; 9530 /* Set the length for the next iteration */ 9531 len = imin(PAGE_SIZE, sz); 9532 } 9533 } 9534 } 9535 9536 static int 9537 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far) 9538 { 9539 pd_entry_t *pdep; 9540 pt_entry_t *ptep, pte; 9541 int rv, lvl, dfsc; 9542 9543 PMAP_ASSERT_STAGE2(pmap); 9544 rv = KERN_FAILURE; 9545 9546 /* Data and insn aborts use same encoding for FSC field. 
*/ 9547 dfsc = esr & ISS_DATA_DFSC_MASK; 9548 switch (dfsc) { 9549 case ISS_DATA_DFSC_TF_L0: 9550 case ISS_DATA_DFSC_TF_L1: 9551 case ISS_DATA_DFSC_TF_L2: 9552 case ISS_DATA_DFSC_TF_L3: 9553 PMAP_LOCK(pmap); 9554 pdep = pmap_pde(pmap, far, &lvl); 9555 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) { 9556 PMAP_UNLOCK(pmap); 9557 break; 9558 } 9559 9560 switch (lvl) { 9561 case 0: 9562 ptep = pmap_l0_to_l1(pdep, far); 9563 break; 9564 case 1: 9565 ptep = pmap_l1_to_l2(pdep, far); 9566 break; 9567 case 2: 9568 ptep = pmap_l2_to_l3(pdep, far); 9569 break; 9570 default: 9571 panic("%s: Invalid pde level %d", __func__,lvl); 9572 } 9573 goto fault_exec; 9574 9575 case ISS_DATA_DFSC_AFF_L1: 9576 case ISS_DATA_DFSC_AFF_L2: 9577 case ISS_DATA_DFSC_AFF_L3: 9578 PMAP_LOCK(pmap); 9579 ptep = pmap_pte(pmap, far, &lvl); 9580 fault_exec: 9581 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) { 9582 /* 9583 * If accessing an executable page invalidate 9584 * the I-cache so it will be valid when we 9585 * continue execution in the guest. The D-cache 9586 * is assumed to already be clean to the Point 9587 * of Coherency. 9588 */ 9589 if ((pte & ATTR_S2_XN_MASK) != 9590 ATTR_S2_XN(ATTR_S2_XN_NONE)) { 9591 invalidate_icache(); 9592 } 9593 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID); 9594 rv = KERN_SUCCESS; 9595 } 9596 PMAP_UNLOCK(pmap); 9597 break; 9598 } 9599 9600 return (rv); 9601 } 9602 9603 int 9604 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) 9605 { 9606 pt_entry_t pte, *ptep; 9607 register_t intr; 9608 uint64_t ec, par; 9609 int lvl, rv; 9610 9611 rv = KERN_FAILURE; 9612 9613 ec = ESR_ELx_EXCEPTION(esr); 9614 switch (ec) { 9615 case EXCP_INSN_ABORT_L: 9616 case EXCP_INSN_ABORT: 9617 case EXCP_DATA_ABORT_L: 9618 case EXCP_DATA_ABORT: 9619 break; 9620 default: 9621 return (rv); 9622 } 9623 9624 if (pmap->pm_stage == PM_STAGE2) 9625 return (pmap_stage2_fault(pmap, esr, far)); 9626 9627 /* Data and insn aborts use same encoding for FSC field. 
*/ 9628 switch (esr & ISS_DATA_DFSC_MASK) { 9629 case ISS_DATA_DFSC_AFF_L1: 9630 case ISS_DATA_DFSC_AFF_L2: 9631 case ISS_DATA_DFSC_AFF_L3: 9632 PMAP_LOCK(pmap); 9633 ptep = pmap_pte(pmap, far, &lvl); 9634 if (ptep != NULL) { 9635 pmap_set_bits(ptep, ATTR_AF); 9636 rv = KERN_SUCCESS; 9637 /* 9638 * XXXMJ as an optimization we could mark the entry 9639 * dirty if this is a write fault. 9640 */ 9641 } 9642 PMAP_UNLOCK(pmap); 9643 break; 9644 case ISS_DATA_DFSC_PF_L1: 9645 case ISS_DATA_DFSC_PF_L2: 9646 case ISS_DATA_DFSC_PF_L3: 9647 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || 9648 (esr & ISS_DATA_WnR) == 0) 9649 return (rv); 9650 PMAP_LOCK(pmap); 9651 ptep = pmap_pte(pmap, far, &lvl); 9652 if (ptep != NULL && 9653 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { 9654 if ((pte & ATTR_S1_AP_RW_BIT) == 9655 ATTR_S1_AP(ATTR_S1_AP_RO)) { 9656 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT); 9657 pmap_s1_invalidate_page(pmap, far, true); 9658 } 9659 rv = KERN_SUCCESS; 9660 } 9661 PMAP_UNLOCK(pmap); 9662 break; 9663 case ISS_DATA_DFSC_TF_L0: 9664 case ISS_DATA_DFSC_TF_L1: 9665 case ISS_DATA_DFSC_TF_L2: 9666 case ISS_DATA_DFSC_TF_L3: 9667 /* 9668 * Retry the translation. A break-before-make sequence can 9669 * produce a transient fault. 9670 */ 9671 if (pmap == kernel_pmap) { 9672 /* 9673 * The translation fault may have occurred within a 9674 * critical section. Therefore, we must check the 9675 * address without acquiring the kernel pmap's lock. 9676 */ 9677 if (pmap_klookup(far, NULL)) 9678 rv = KERN_SUCCESS; 9679 } else { 9680 bool owned; 9681 9682 /* 9683 * In the EFIRT driver we lock the pmap before 9684 * calling into the runtime service. As the lock 9685 * is already owned by the current thread skip 9686 * locking it again. 9687 */ 9688 owned = PMAP_OWNED(pmap); 9689 if (!owned) 9690 PMAP_LOCK(pmap); 9691 /* Ask the MMU to check the address. 
*/ 9692 intr = intr_disable(); 9693 par = arm64_address_translate_s1e0r(far); 9694 intr_restore(intr); 9695 if (!owned) 9696 PMAP_UNLOCK(pmap); 9697 9698 /* 9699 * If the translation was successful, then we can 9700 * return success to the trap handler. 9701 */ 9702 if (PAR_SUCCESS(par)) 9703 rv = KERN_SUCCESS; 9704 } 9705 break; 9706 } 9707 9708 return (rv); 9709 } 9710 9711 /* 9712 * Increase the starting virtual address of the given mapping if a 9713 * different alignment might result in more superpage mappings. 9714 */ 9715 void 9716 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 9717 vm_offset_t *addr, vm_size_t size) 9718 { 9719 vm_offset_t superpage_offset; 9720 9721 if (size < L3C_SIZE) 9722 return; 9723 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 9724 offset += ptoa(object->pg_color); 9725 9726 /* 9727 * Considering the object's physical alignment, is the mapping large 9728 * enough to encompass an L2 (2MB/32MB) superpage ... 9729 */ 9730 superpage_offset = offset & L2_OFFSET; 9731 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) { 9732 /* 9733 * If the virtual and physical alignments differ, then 9734 * increase the virtual address so that the alignments match. 9735 */ 9736 if ((*addr & L2_OFFSET) < superpage_offset) 9737 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 9738 else if ((*addr & L2_OFFSET) > superpage_offset) 9739 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + 9740 superpage_offset; 9741 return; 9742 } 9743 /* ... or an L3C (64KB/2MB) superpage? */ 9744 superpage_offset = offset & L3C_OFFSET; 9745 if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) { 9746 if ((*addr & L3C_OFFSET) < superpage_offset) 9747 *addr = (*addr & ~L3C_OFFSET) + superpage_offset; 9748 else if ((*addr & L3C_OFFSET) > superpage_offset) 9749 *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) + 9750 superpage_offset; 9751 } 9752 } 9753 9754 /** 9755 * Get the kernel virtual address of a set of physical pages. 
If there are 9756 * physical addresses not covered by the DMAP perform a transient mapping 9757 * that will be removed when calling pmap_unmap_io_transient. 9758 * 9759 * \param page The pages the caller wishes to obtain the virtual 9760 * address on the kernel memory map. 9761 * \param vaddr On return contains the kernel virtual memory address 9762 * of the pages passed in the page parameter. 9763 * \param count Number of pages passed in. 9764 * \param can_fault true if the thread using the mapped pages can take 9765 * page faults, false otherwise. 9766 * 9767 * \returns true if the caller must call pmap_unmap_io_transient when 9768 * finished or false otherwise. 9769 * 9770 */ 9771 bool 9772 pmap_map_io_transient(vm_page_t page[], void *vaddr[], int count, 9773 bool can_fault) 9774 { 9775 vm_paddr_t paddr; 9776 vmem_addr_t addr; 9777 bool needs_mapping; 9778 int error __diagused, i; 9779 9780 /* 9781 * Allocate any KVA space that we need, this is done in a separate 9782 * loop to prevent calling vmem_alloc while pinned. 
9783 */ 9784 needs_mapping = false; 9785 for (i = 0; i < count; i++) { 9786 paddr = VM_PAGE_TO_PHYS(page[i]); 9787 if (__predict_false(!PHYS_IN_DMAP(paddr))) { 9788 error = vmem_alloc(kernel_arena, PAGE_SIZE, 9789 M_BESTFIT | M_WAITOK, &addr); 9790 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 9791 vaddr[i] = (void *)addr; 9792 needs_mapping = true; 9793 } else { 9794 vaddr[i] = PHYS_TO_DMAP(paddr); 9795 } 9796 } 9797 9798 /* Exit early if everything is covered by the DMAP */ 9799 if (!needs_mapping) 9800 return (false); 9801 9802 if (!can_fault) 9803 sched_pin(); 9804 for (i = 0; i < count; i++) { 9805 paddr = VM_PAGE_TO_PHYS(page[i]); 9806 if (!PHYS_IN_DMAP(paddr)) { 9807 panic( 9808 "pmap_map_io_transient: TODO: Map out of DMAP data"); 9809 } 9810 } 9811 9812 return (needs_mapping); 9813 } 9814 9815 void 9816 pmap_unmap_io_transient(vm_page_t page[], void *vaddr[], int count, 9817 bool can_fault) 9818 { 9819 vm_paddr_t paddr; 9820 int i; 9821 9822 if (!can_fault) 9823 sched_unpin(); 9824 for (i = 0; i < count; i++) { 9825 paddr = VM_PAGE_TO_PHYS(page[i]); 9826 if (!PHYS_IN_DMAP(paddr)) { 9827 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); 9828 } 9829 } 9830 } 9831 9832 bool 9833 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 9834 { 9835 9836 return (mode >= 0 && mode < VM_MEMATTR_END); 9837 } 9838 9839 static void * 9840 bti_dup_range(void *ctx __unused, void *data) 9841 { 9842 struct rs_el *node, *new_node; 9843 9844 new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT); 9845 if (new_node == NULL) 9846 return (NULL); 9847 node = data; 9848 memcpy(new_node, node, sizeof(*node)); 9849 return (new_node); 9850 } 9851 9852 static void 9853 bti_free_range(void *ctx __unused, void *node) 9854 { 9855 9856 uma_zfree(pmap_bti_ranges_zone, node); 9857 } 9858 9859 static int 9860 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9861 { 9862 struct rs_el *rs; 9863 int error; 9864 9865 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9866 
PMAP_ASSERT_STAGE1(pmap); 9867 MPASS(pmap->pm_bti != NULL); 9868 rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT); 9869 if (rs == NULL) 9870 return (ENOMEM); 9871 error = rangeset_insert(pmap->pm_bti, sva, eva, rs); 9872 if (error != 0) 9873 uma_zfree(pmap_bti_ranges_zone, rs); 9874 return (error); 9875 } 9876 9877 static void 9878 pmap_bti_deassign_all(pmap_t pmap) 9879 { 9880 9881 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9882 if (pmap->pm_bti != NULL) 9883 rangeset_remove_all(pmap->pm_bti); 9884 } 9885 9886 /* 9887 * Returns true if the BTI setting is the same across the specified address 9888 * range, and false otherwise. When returning true, updates the referenced PTE 9889 * to reflect the BTI setting. 9890 * 9891 * Only stage 1 pmaps support BTI. The kernel pmap is always a stage 1 pmap 9892 * that has the same BTI setting implicitly across its entire address range. 9893 */ 9894 static bool 9895 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte) 9896 { 9897 struct rs_el *rs; 9898 vm_offset_t va; 9899 9900 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9901 KASSERT(ADDR_IS_CANONICAL(sva), 9902 ("%s: Start address not in canonical form: %lx", __func__, sva)); 9903 KASSERT(ADDR_IS_CANONICAL(eva), 9904 ("%s: End address not in canonical form: %lx", __func__, eva)); 9905 KASSERT((*pte & ATTR_S1_GP) == 0, 9906 ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte)); 9907 9908 if (pmap == kernel_pmap) { 9909 *pte |= ATTR_KERN_GP; 9910 return (true); 9911 } 9912 if (pmap->pm_bti == NULL) 9913 return (true); 9914 PMAP_ASSERT_STAGE1(pmap); 9915 rs = rangeset_containing(pmap->pm_bti, sva); 9916 if (rs == NULL) 9917 return (rangeset_empty(pmap->pm_bti, sva, eva)); 9918 while ((va = rs->re_end) < eva) { 9919 if ((rs = rangeset_beginning(pmap->pm_bti, va)) == NULL) 9920 return (false); 9921 } 9922 *pte |= ATTR_S1_GP; 9923 return (true); 9924 } 9925 9926 static pt_entry_t 9927 pmap_pte_bti(pmap_t pmap, vm_offset_t va) 9928 { 9929 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9930 
MPASS(ADDR_IS_CANONICAL(va)); 9931 9932 if (pmap->pm_stage != PM_STAGE1) 9933 return (0); 9934 if (pmap == kernel_pmap) 9935 return (ATTR_KERN_GP); 9936 if (pmap->pm_bti != NULL && 9937 rangeset_containing(pmap->pm_bti, va) != NULL) 9938 return (ATTR_S1_GP); 9939 return (0); 9940 } 9941 9942 static void 9943 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9944 { 9945 9946 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9947 if (pmap->pm_bti != NULL) 9948 rangeset_remove(pmap->pm_bti, sva, eva); 9949 } 9950 9951 static int 9952 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap) 9953 { 9954 9955 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 9956 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 9957 MPASS(src_pmap->pm_stage == dst_pmap->pm_stage); 9958 MPASS(src_pmap->pm_bti != NULL); 9959 MPASS(dst_pmap->pm_bti != NULL); 9960 if (src_pmap->pm_bti->rs_data_ctx == NULL) 9961 return (0); 9962 return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti)); 9963 } 9964 9965 static void 9966 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set) 9967 { 9968 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9969 PMAP_ASSERT_STAGE1(pmap); 9970 9971 pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? 
ATTR_S1_GP : 0, 9972 true); 9973 } 9974 9975 int 9976 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9977 { 9978 int error; 9979 9980 if (pmap->pm_bti == NULL) 9981 return (0); 9982 if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva)) 9983 return (EINVAL); 9984 if (pmap->pm_stage != PM_STAGE1) 9985 return (EINVAL); 9986 if (eva <= sva || ADDR_IS_KERNEL(eva)) 9987 return (EFAULT); 9988 9989 sva = trunc_page(sva); 9990 eva = round_page(eva); 9991 for (;;) { 9992 PMAP_LOCK(pmap); 9993 error = pmap_bti_assign(pmap, sva, eva); 9994 if (error == 0) 9995 pmap_bti_update_range(pmap, sva, eva, true); 9996 PMAP_UNLOCK(pmap); 9997 if (error != ENOMEM) 9998 break; 9999 vm_wait(NULL); 10000 } 10001 return (error); 10002 } 10003 10004 #if defined(KASAN) || defined(KMSAN) 10005 static pd_entry_t *pmap_san_early_l2; 10006 10007 #define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE) 10008 #define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE) 10009 static vm_offset_t __nosanitizeaddress 10010 pmap_san_enter_bootstrap_alloc_l2(void) 10011 { 10012 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE); 10013 static size_t offset = 0; 10014 vm_offset_t addr; 10015 10016 if (offset + L2_SIZE > sizeof(bootstrap_data)) { 10017 panic("%s: out of memory for the bootstrap shadow map L2 entries", 10018 __func__); 10019 } 10020 10021 addr = (uintptr_t)&bootstrap_data[offset]; 10022 offset += L2_SIZE; 10023 return (addr); 10024 } 10025 10026 /* 10027 * SAN L1 + L2 pages, maybe L3 entries later? 
10028 */ 10029 static vm_offset_t __nosanitizeaddress 10030 pmap_san_enter_bootstrap_alloc_pages(int npages) 10031 { 10032 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE); 10033 static size_t offset = 0; 10034 vm_offset_t addr; 10035 10036 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) { 10037 panic("%s: out of memory for the bootstrap shadow map", 10038 __func__); 10039 } 10040 10041 addr = (uintptr_t)&bootstrap_data[offset]; 10042 offset += (npages * PAGE_SIZE); 10043 return (addr); 10044 } 10045 10046 static void __nosanitizeaddress 10047 pmap_san_enter_bootstrap(void) 10048 { 10049 vm_offset_t freemempos; 10050 10051 /* L1, L2 */ 10052 freemempos = pmap_san_enter_bootstrap_alloc_pages(2); 10053 bs_state.freemempos = freemempos; 10054 bs_state.va = KASAN_MIN_ADDRESS; 10055 pmap_bootstrap_l1_table(&bs_state); 10056 pmap_san_early_l2 = bs_state.l2; 10057 } 10058 10059 static vm_page_t 10060 pmap_san_enter_alloc_l3(void) 10061 { 10062 vm_page_t m; 10063 10064 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 10065 VM_ALLOC_ZERO); 10066 if (m == NULL) 10067 panic("%s: no memory to grow shadow map", __func__); 10068 return (m); 10069 } 10070 10071 static vm_page_t 10072 pmap_san_enter_alloc_l2(void) 10073 { 10074 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 10075 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT)); 10076 } 10077 10078 void __nosanitizeaddress __nosanitizememory 10079 pmap_san_enter(vm_offset_t va) 10080 { 10081 pd_entry_t *l1, *l2; 10082 pt_entry_t *l3; 10083 vm_page_t m; 10084 10085 if (virtual_avail == 0) { 10086 vm_offset_t block; 10087 int slot; 10088 bool first; 10089 10090 /* Temporary shadow map prior to pmap_bootstrap(). 
*/ 10091 first = pmap_san_early_l2 == NULL; 10092 if (first) 10093 pmap_san_enter_bootstrap(); 10094 10095 l2 = pmap_san_early_l2; 10096 slot = pmap_l2_index(va); 10097 10098 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) { 10099 MPASS(first); 10100 block = pmap_san_enter_bootstrap_alloc_l2(); 10101 pmap_store(&l2[slot], 10102 PHYS_TO_PTE(pmap_early_vtophys(block)) | 10103 PMAP_SAN_PTE_BITS | L2_BLOCK); 10104 dmb(ishst); 10105 } 10106 10107 return; 10108 } 10109 10110 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 10111 l1 = pmap_l1(kernel_pmap, va); 10112 MPASS(l1 != NULL); 10113 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) { 10114 m = pmap_san_enter_alloc_l3(); 10115 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE); 10116 } 10117 l2 = pmap_l1_to_l2(l1, va); 10118 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) { 10119 m = pmap_san_enter_alloc_l2(); 10120 if (m != NULL) { 10121 pmap_store(l2, VM_PAGE_TO_PTE(m) | 10122 PMAP_SAN_PTE_BITS | L2_BLOCK); 10123 } else { 10124 m = pmap_san_enter_alloc_l3(); 10125 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE); 10126 } 10127 dmb(ishst); 10128 } 10129 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) 10130 return; 10131 l3 = pmap_l2_to_l3(l2, va); 10132 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0) 10133 return; 10134 m = pmap_san_enter_alloc_l3(); 10135 pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE); 10136 dmb(ishst); 10137 } 10138 #endif /* KASAN || KMSAN */ 10139 10140 /* 10141 * Track a range of the kernel's virtual address space that is contiguous 10142 * in various mapping attributes. 
10143 */ 10144 struct pmap_kernel_map_range { 10145 vm_offset_t sva; 10146 pt_entry_t attrs; 10147 int l3pages; 10148 int l3contig; 10149 int l2blocks; 10150 int l2contig; 10151 int l1blocks; 10152 }; 10153 10154 static void 10155 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 10156 vm_offset_t eva) 10157 { 10158 const char *mode; 10159 int index; 10160 10161 if (eva <= range->sva) 10162 return; 10163 10164 index = range->attrs & ATTR_S1_IDX_MASK; 10165 switch (index) { 10166 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP): 10167 mode = "DEV-NP"; 10168 break; 10169 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 10170 mode = "DEV"; 10171 break; 10172 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 10173 mode = "UC"; 10174 break; 10175 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 10176 mode = "WB"; 10177 break; 10178 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 10179 mode = "WT"; 10180 break; 10181 case ATTR_S1_IDX(VM_MEMATTR_TAGGED): 10182 mode = "TAGGED"; 10183 break; 10184 default: 10185 printf( 10186 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 10187 __func__, index, range->sva, eva); 10188 mode = "??"; 10189 break; 10190 } 10191 10192 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n", 10193 range->sva, eva, 10194 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 10195 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 10196 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X', 10197 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's', 10198 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-', 10199 mode, range->l1blocks, range->l2contig, range->l2blocks, 10200 range->l3contig, range->l3pages); 10201 10202 /* Reset to sentinel value. */ 10203 range->sva = 0xfffffffffffffffful; 10204 } 10205 10206 /* 10207 * Determine whether the attributes specified by a page table entry match those 10208 * being tracked by the current range. 
10209 */ 10210 static bool 10211 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 10212 { 10213 10214 return (range->attrs == attrs); 10215 } 10216 10217 static void 10218 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 10219 pt_entry_t attrs) 10220 { 10221 10222 memset(range, 0, sizeof(*range)); 10223 range->sva = va; 10224 range->attrs = attrs; 10225 } 10226 10227 /* Get the block/page attributes that correspond to the table attributes */ 10228 static pt_entry_t 10229 sysctl_kmaps_table_attrs(pd_entry_t table) 10230 { 10231 pt_entry_t attrs; 10232 10233 attrs = 0; 10234 if ((table & TATTR_UXN_TABLE) != 0) 10235 attrs |= ATTR_S1_UXN; 10236 if ((table & TATTR_PXN_TABLE) != 0) 10237 attrs |= ATTR_S1_PXN; 10238 if ((table & TATTR_AP_TABLE_RO) != 0) 10239 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO); 10240 10241 return (attrs); 10242 } 10243 10244 /* Read the block/page attributes we care about */ 10245 static pt_entry_t 10246 sysctl_kmaps_block_attrs(pt_entry_t block) 10247 { 10248 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK | 10249 ATTR_S1_GP)); 10250 } 10251 10252 /* 10253 * Given a leaf PTE, derive the mapping's attributes. If they do not match 10254 * those of the current run, dump the address range and its attributes, and 10255 * begin a new run. 
10256 */ 10257 static void 10258 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 10259 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, 10260 pt_entry_t l3e) 10261 { 10262 pt_entry_t attrs; 10263 10264 attrs = sysctl_kmaps_table_attrs(l0e); 10265 10266 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 10267 attrs |= sysctl_kmaps_block_attrs(l1e); 10268 goto done; 10269 } 10270 attrs |= sysctl_kmaps_table_attrs(l1e); 10271 10272 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 10273 attrs |= sysctl_kmaps_block_attrs(l2e); 10274 goto done; 10275 } 10276 attrs |= sysctl_kmaps_table_attrs(l2e); 10277 attrs |= sysctl_kmaps_block_attrs(l3e); 10278 10279 done: 10280 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 10281 sysctl_kmaps_dump(sb, range, va); 10282 sysctl_kmaps_reinit(range, va, attrs); 10283 } 10284 } 10285 10286 static int 10287 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 10288 { 10289 struct pmap_kernel_map_range range; 10290 struct sbuf sbuf, *sb; 10291 pd_entry_t l0e, *l1, l1e, *l2, l2e; 10292 pt_entry_t *l3, l3e; 10293 vm_offset_t sva; 10294 vm_paddr_t pa; 10295 int error, i, j, k, l; 10296 10297 error = sysctl_wire_old_buffer(req, 0); 10298 if (error != 0) 10299 return (error); 10300 sb = &sbuf; 10301 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 10302 10303 /* Sentinel value. */ 10304 range.sva = 0xfffffffffffffffful; 10305 10306 /* 10307 * Iterate over the kernel page tables without holding the kernel pmap 10308 * lock. Kernel page table pages are never freed, so at worst we will 10309 * observe inconsistencies in the output. 
10310 */ 10311 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; 10312 i++) { 10313 if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) 10314 sbuf_printf(sb, "\nDirect map:\n"); 10315 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) 10316 sbuf_printf(sb, "\nKernel map:\n"); 10317 #ifdef KASAN 10318 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS)) 10319 sbuf_printf(sb, "\nKASAN shadow map:\n"); 10320 #endif 10321 #ifdef KMSAN 10322 else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS)) 10323 sbuf_printf(sb, "\nKMSAN shadow map:\n"); 10324 else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS)) 10325 sbuf_printf(sb, "\nKMSAN origin map:\n"); 10326 #endif 10327 10328 l0e = kernel_pmap->pm_l0[i]; 10329 if ((l0e & ATTR_DESCR_VALID) == 0) { 10330 sysctl_kmaps_dump(sb, &range, sva); 10331 sva += L0_SIZE; 10332 continue; 10333 } 10334 pa = PTE_TO_PHYS(l0e); 10335 l1 = PHYS_TO_DMAP(pa); 10336 10337 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { 10338 l1e = l1[j]; 10339 if ((l1e & ATTR_DESCR_VALID) == 0) { 10340 sysctl_kmaps_dump(sb, &range, sva); 10341 sva += L1_SIZE; 10342 continue; 10343 } 10344 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { 10345 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 10346 sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 10347 0, 0); 10348 range.l1blocks++; 10349 sva += L1_SIZE; 10350 continue; 10351 } 10352 pa = PTE_TO_PHYS(l1e); 10353 l2 = PHYS_TO_DMAP(pa); 10354 10355 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { 10356 l2e = l2[k]; 10357 if ((l2e & ATTR_DESCR_VALID) == 0) { 10358 sysctl_kmaps_dump(sb, &range, sva); 10359 sva += L2_SIZE; 10360 continue; 10361 } 10362 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { 10363 sysctl_kmaps_check(sb, &range, sva, 10364 l0e, l1e, l2e, 0); 10365 if ((l2e & ATTR_CONTIGUOUS) != 0) 10366 range.l2contig += 10367 k % L2C_ENTRIES == 0 ? 
10368 1 : 0; 10369 else 10370 range.l2blocks++; 10371 sva += L2_SIZE; 10372 continue; 10373 } 10374 pa = PTE_TO_PHYS(l2e); 10375 l3 = PHYS_TO_DMAP(pa); 10376 10377 for (l = pmap_l3_index(sva); l < Ln_ENTRIES; 10378 l++, sva += L3_SIZE) { 10379 l3e = l3[l]; 10380 if ((l3e & ATTR_DESCR_VALID) == 0) { 10381 sysctl_kmaps_dump(sb, &range, 10382 sva); 10383 continue; 10384 } 10385 sysctl_kmaps_check(sb, &range, sva, 10386 l0e, l1e, l2e, l3e); 10387 if ((l3e & ATTR_CONTIGUOUS) != 0) 10388 range.l3contig += 10389 l % L3C_ENTRIES == 0 ? 10390 1 : 0; 10391 else 10392 range.l3pages++; 10393 } 10394 } 10395 } 10396 } 10397 10398 error = sbuf_finish(sb); 10399 sbuf_delete(sb); 10400 return (error); 10401 } 10402 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 10403 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 10404 NULL, 0, sysctl_kmaps, "A", 10405 "Dump kernel address layout"); 10406 10407 10408 void pagezero_simple(void *); 10409 void pagezero_cache(void *); 10410 void pagezero_mops(void *); 10411 10412 DEFINE_IFUNC(static, void, pagezero, (void *)) 10413 { 10414 uint32_t dczid_el0; 10415 10416 dczid_el0 = READ_SPECIALREG(dczid_el0); 10417 10418 if (elf_hwcap2 & HWCAP2_MOPS) 10419 return (pagezero_mops); 10420 else if ((dczid_el0 & DCZID_DZP) == 0) 10421 return (pagezero_cache); 10422 else 10423 return (pagezero_simple); 10424 } 10425