1 /*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2003 Peter Wemm 9 * All rights reserved. 10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * Copyright (c) 2014 Andrew Turner 13 * All rights reserved. 14 * Copyright (c) 2014-2016 The FreeBSD Foundation 15 * All rights reserved. 16 * 17 * This code is derived from software contributed to Berkeley by 18 * the Systems Programming Group of the University of Utah Computer 19 * Science Department and William Jolitz of UUNET Technologies Inc. 20 * 21 * This software was developed by Andrew Turner under sponsorship from 22 * the FreeBSD Foundation. 23 * 24 * Redistribution and use in source and binary forms, with or without 25 * modification, are permitted provided that the following conditions 26 * are met: 27 * 1. Redistributions of source code must retain the above copyright 28 * notice, this list of conditions and the following disclaimer. 29 * 2. Redistributions in binary form must reproduce the above copyright 30 * notice, this list of conditions and the following disclaimer in the 31 * documentation and/or other materials provided with the distribution. 32 * 3. All advertising materials mentioning features or use of this software 33 * must display the following acknowledgement: 34 * This product includes software developed by the University of 35 * California, Berkeley and its contributors. 36 * 4. Neither the name of the University nor the names of its contributors 37 * may be used to endorse or promote products derived from this software 38 * without specific prior written permission. 
39 * 40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 50 * SUCH DAMAGE. 51 */ 52 /*- 53 * Copyright (c) 2003 Networks Associates Technology, Inc. 54 * All rights reserved. 55 * 56 * This software was developed for the FreeBSD Project by Jake Burkholder, 57 * Safeport Network Services, and Network Associates Laboratories, the 58 * Security Research Division of Network Associates, Inc. under 59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 60 * CHATS research program. 61 * 62 * Redistribution and use in source and binary forms, with or without 63 * modification, are permitted provided that the following conditions 64 * are met: 65 * 1. Redistributions of source code must retain the above copyright 66 * notice, this list of conditions and the following disclaimer. 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in the 69 * documentation and/or other materials provided with the distribution. 
70 * 71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 81 * SUCH DAMAGE. 82 */ 83 84 #include <sys/cdefs.h> 85 /* 86 * Manages physical address maps. 87 * 88 * Since the information managed by this module is 89 * also stored by the logical address mapping module, 90 * this module may throw away valid virtual-to-physical 91 * mappings at almost any time. However, invalidations 92 * of virtual-to-physical mappings must be done as 93 * requested. 94 * 95 * In order to cope with hardware architectures which 96 * make virtual-to-physical map invalidates expensive, 97 * this module may delay invalidate or reduced protection 98 * operations until such time as they are actually 99 * necessary. This module is given full information as 100 * to which processors are currently using which maps, 101 * and to when physical maps must be made correct. 
102 */ 103 104 #include "opt_vm.h" 105 106 #include <sys/param.h> 107 #include <sys/asan.h> 108 #include <sys/bitstring.h> 109 #include <sys/bus.h> 110 #include <sys/systm.h> 111 #include <sys/kernel.h> 112 #include <sys/ktr.h> 113 #include <sys/limits.h> 114 #include <sys/lock.h> 115 #include <sys/malloc.h> 116 #include <sys/mman.h> 117 #include <sys/msan.h> 118 #include <sys/msgbuf.h> 119 #include <sys/mutex.h> 120 #include <sys/physmem.h> 121 #include <sys/proc.h> 122 #include <sys/rangeset.h> 123 #include <sys/rwlock.h> 124 #include <sys/sbuf.h> 125 #include <sys/sx.h> 126 #include <sys/vmem.h> 127 #include <sys/vmmeter.h> 128 #include <sys/sched.h> 129 #include <sys/sysctl.h> 130 #include <sys/_unrhdr.h> 131 #include <sys/smp.h> 132 133 #include <vm/vm.h> 134 #include <vm/vm_param.h> 135 #include <vm/vm_kern.h> 136 #include <vm/vm_page.h> 137 #include <vm/vm_map.h> 138 #include <vm/vm_object.h> 139 #include <vm/vm_extern.h> 140 #include <vm/vm_pageout.h> 141 #include <vm/vm_pager.h> 142 #include <vm/vm_phys.h> 143 #include <vm/vm_radix.h> 144 #include <vm/vm_reserv.h> 145 #include <vm/vm_dumpset.h> 146 #include <vm/uma.h> 147 148 #include <machine/asan.h> 149 #include <machine/cpu_feat.h> 150 #include <machine/elf.h> 151 #include <machine/ifunc.h> 152 #include <machine/machdep.h> 153 #include <machine/md_var.h> 154 #include <machine/pcb.h> 155 156 #ifdef NUMA 157 #define PMAP_MEMDOM MAXMEMDOM 158 #else 159 #define PMAP_MEMDOM 1 160 #endif 161 162 #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1) 163 #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2) 164 165 #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) 166 #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) 167 #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) 168 #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) 169 170 #define NUL0E L0_ENTRIES 171 #define NUL1E (NUL0E * NL1PG) 172 #define NUL2E (NUL1E * NL2PG) 173 174 #ifdef PV_STATS 175 #define PV_STAT(x) do { x ; } while 
(0) 176 #define __pvused 177 #else 178 #define PV_STAT(x) do { } while (0) 179 #define __pvused __unused 180 #endif 181 182 #define pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT)) 183 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) 184 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 185 186 #ifdef __ARM_FEATURE_BTI_DEFAULT 187 pt_entry_t __read_mostly pmap_gp_attr; 188 #define ATTR_KERN_GP pmap_gp_attr 189 #else 190 #define ATTR_KERN_GP 0 191 #endif 192 #define PMAP_SAN_PTE_BITS (ATTR_AF | ATTR_S1_XN | pmap_sh_attr | \ 193 ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW)) 194 195 static bool __read_mostly pmap_multiple_tlbi = false; 196 197 struct pmap_large_md_page { 198 struct rwlock pv_lock; 199 struct md_page pv_page; 200 /* Pad to a power of 2, see pmap_init_pv_table(). */ 201 int pv_pad[2]; 202 }; 203 204 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 205 #define pv_dummy pv_dummy_large.pv_page 206 __read_mostly static struct pmap_large_md_page *pv_table; 207 208 static struct pmap_large_md_page * 209 _pa_to_pmdp(vm_paddr_t pa) 210 { 211 struct vm_phys_seg *seg; 212 213 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) 214 return ((struct pmap_large_md_page *)seg->md_first + 215 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start)); 216 return (NULL); 217 } 218 219 static struct pmap_large_md_page * 220 pa_to_pmdp(vm_paddr_t pa) 221 { 222 struct pmap_large_md_page *pvd; 223 224 pvd = _pa_to_pmdp(pa); 225 if (pvd == NULL) 226 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa); 227 return (pvd); 228 } 229 230 static struct pmap_large_md_page * 231 page_to_pmdp(vm_page_t m) 232 { 233 struct vm_phys_seg *seg; 234 235 seg = &vm_phys_segs[m->segind]; 236 return ((struct pmap_large_md_page *)seg->md_first + 237 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start)); 238 } 239 240 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) 241 #define page_to_pvh(m) (&(page_to_pmdp(m)->pv_page)) 242 243 
#define PHYS_TO_PV_LIST_LOCK(pa) ({ \ 244 struct pmap_large_md_page *_pvd; \ 245 struct rwlock *_lock; \ 246 _pvd = _pa_to_pmdp(pa); \ 247 if (__predict_false(_pvd == NULL)) \ 248 _lock = &pv_dummy_large.pv_lock; \ 249 else \ 250 _lock = &(_pvd->pv_lock); \ 251 _lock; \ 252 }) 253 254 static struct rwlock * 255 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m) 256 { 257 if ((m->flags & PG_FICTITIOUS) == 0) 258 return (&page_to_pmdp(m)->pv_lock); 259 else 260 return (&pv_dummy_large.pv_lock); 261 } 262 263 #define CHANGE_PV_LIST_LOCK(lockp, new_lock) do { \ 264 struct rwlock **_lockp = (lockp); \ 265 struct rwlock *_new_lock = (new_lock); \ 266 \ 267 if (_new_lock != *_lockp) { \ 268 if (*_lockp != NULL) \ 269 rw_wunlock(*_lockp); \ 270 *_lockp = _new_lock; \ 271 rw_wlock(*_lockp); \ 272 } \ 273 } while (0) 274 275 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) \ 276 CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa)) 277 278 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 279 CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m)) 280 281 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 282 struct rwlock **_lockp = (lockp); \ 283 \ 284 if (*_lockp != NULL) { \ 285 rw_wunlock(*_lockp); \ 286 *_lockp = NULL; \ 287 } \ 288 } while (0) 289 290 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte)) 291 #define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) 292 293 /* 294 * The presence of this flag indicates that the mapping is writeable. 295 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise 296 * it is dirty. This flag may only be set on managed mappings. 297 * 298 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it 299 * as a software managed bit. 
300 */ 301 #define ATTR_SW_DBM ATTR_DBM 302 303 struct pmap kernel_pmap_store; 304 305 /* Used for mapping ACPI memory before VM is initialized */ 306 #define PMAP_PREINIT_MAPPING_COUNT 32 307 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) 308 static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ 309 static int vm_initialized = 0; /* No need to use pre-init maps when set */ 310 311 /* 312 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. 313 * Always map entire L2 block for simplicity. 314 * VA of L2 block = preinit_map_va + i * L2_SIZE 315 */ 316 static struct pmap_preinit_mapping { 317 vm_paddr_t pa; 318 vm_offset_t va; 319 vm_size_t size; 320 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 321 322 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 323 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 324 vm_offset_t kernel_vm_end = 0; 325 326 /* 327 * Data for the pv entry allocation mechanism. 
328 */ 329 #ifdef NUMA 330 static __inline int 331 pc_to_domain(struct pv_chunk *pc) 332 { 333 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); 334 } 335 #else 336 static __inline int 337 pc_to_domain(struct pv_chunk *pc __unused) 338 { 339 return (0); 340 } 341 #endif 342 343 struct pv_chunks_list { 344 struct mtx pvc_lock; 345 TAILQ_HEAD(pch, pv_chunk) pvc_list; 346 int active_reclaims; 347 } __aligned(CACHE_LINE_SIZE); 348 349 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 350 351 vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 352 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 353 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 354 355 extern pt_entry_t pagetable_l0_ttbr1[]; 356 357 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 358 static vm_paddr_t physmap[PHYSMAP_SIZE]; 359 static u_int physmap_idx; 360 361 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 362 "VM/pmap parameters"); 363 364 static int pmap_growkernel_panic = 0; 365 SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN, 366 &pmap_growkernel_panic, 0, 367 "panic on failure to allocate kernel page table page"); 368 369 bool pmap_lpa_enabled __read_mostly = false; 370 pt_entry_t pmap_sh_attr __read_mostly = ATTR_SH(ATTR_SH_IS); 371 372 #if PAGE_SIZE == PAGE_SIZE_4K 373 #define L1_BLOCKS_SUPPORTED 1 374 #else 375 #define L1_BLOCKS_SUPPORTED (pmap_lpa_enabled) 376 #endif 377 378 #define PMAP_ASSERT_L1_BLOCKS_SUPPORTED MPASS(L1_BLOCKS_SUPPORTED) 379 380 static bool pmap_l1_supported __read_mostly = false; 381 382 /* 383 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs 384 * that it has currently allocated to a pmap, a cursor ("asid_next") to 385 * optimize its search for a free ASID in the bit vector, and an epoch number 386 * ("asid_epoch") to indicate when it has reclaimed all previously allocated 387 * ASIDs that are not currently active on a processor. 
388 * 389 * The current epoch number is always in the range [0, INT_MAX). Negative 390 * numbers and INT_MAX are reserved for special cases that are described 391 * below. 392 */ 393 struct asid_set { 394 int asid_bits; 395 bitstr_t *asid_set; 396 int asid_set_size; 397 int asid_next; 398 int asid_epoch; 399 struct mtx asid_set_mutex; 400 }; 401 402 static struct asid_set asids; 403 static struct asid_set vmids; 404 405 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 406 "ASID allocator"); 407 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0, 408 "The number of bits in an ASID"); 409 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0, 410 "The last allocated ASID plus one"); 411 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0, 412 "The current epoch number"); 413 414 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator"); 415 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0, 416 "The number of bits in an VMID"); 417 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0, 418 "The last allocated VMID plus one"); 419 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0, 420 "The current epoch number"); 421 422 void (*pmap_clean_stage2_tlbi)(void); 423 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool); 424 void (*pmap_stage2_invalidate_all)(uint64_t); 425 426 /* 427 * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved 428 * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for 429 * dynamically allocated ASIDs have a non-negative epoch number. 430 * 431 * An invalid ASID is represented by -1. 
432 * 433 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN), 434 * which indicates that an ASID should never be allocated to the pmap, and 435 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be 436 * allocated when the pmap is next activated. 437 */ 438 #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \ 439 ((u_long)(epoch) << 32))) 440 #define COOKIE_TO_ASID(cookie) ((int)(cookie)) 441 #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32)) 442 443 #define TLBI_VA_SHIFT 12 444 #define TLBI_VA_MASK ((1ul << 44) - 1) 445 #define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK) 446 447 static int __read_frequently superpages_enabled = 1; 448 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 449 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, 450 "Are large page mappings enabled?"); 451 452 /* 453 * True when Branch Target Identification should be used by userspace. This 454 * allows pmap to mark pages as guarded with ATTR_S1_GP. 455 */ 456 __read_mostly static bool pmap_bti_support = false; 457 458 /* 459 * Internal flags for pmap_enter()'s helper functions. 460 */ 461 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 462 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. 
*/ 463 464 TAILQ_HEAD(pv_chunklist, pv_chunk); 465 466 static void free_pv_chunk(struct pv_chunk *pc); 467 static void free_pv_chunk_batch(struct pv_chunklist *batch); 468 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 469 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 470 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 471 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 472 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 473 vm_offset_t va); 474 475 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 476 static bool pmap_activate_int(struct thread *td, pmap_t pmap); 477 static void pmap_alloc_asid(pmap_t pmap); 478 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, 479 vm_prot_t prot, int mode, bool skip_unmapped); 480 static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 481 pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp); 482 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); 483 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, 484 vm_offset_t va, struct rwlock **lockp); 485 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); 486 static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va); 487 static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va); 488 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 489 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 490 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 491 u_int flags, vm_page_t m, struct rwlock **lockp); 492 static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags, 493 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp); 494 static bool pmap_every_pte_zero(vm_paddr_t pa); 495 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool 
promoted, 496 bool all_l3e_AF_set); 497 static pt_entry_t pmap_load_l3c(pt_entry_t *l3p); 498 static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 499 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits); 500 static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m, 501 struct rwlock **lockp); 502 static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); 503 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 504 pd_entry_t l1e, bool demote_kl2e, struct spglist *free, 505 struct rwlock **lockp); 506 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 507 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); 508 static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 509 vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free, 510 struct rwlock **lockp); 511 static void pmap_reset_asid_set(pmap_t pmap); 512 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 513 vm_page_t m, struct rwlock **lockp); 514 515 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 516 struct rwlock **lockp); 517 518 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, 519 struct spglist *free); 520 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 521 static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, 522 vm_offset_t va, vm_size_t size); 523 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 524 525 static uma_zone_t pmap_bti_ranges_zone; 526 static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 527 pt_entry_t *pte); 528 static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va); 529 static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 530 static void *bti_dup_range(void *ctx, void *data); 531 static void bti_free_range(void *ctx, void *node); 532 static 
int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap); 533 static void pmap_bti_deassign_all(pmap_t pmap); 534 static void pagezero(void *); 535 536 /* 537 * These load the old table data and store the new value. 538 * They need to be atomic as the System MMU may write to the table at 539 * the same time as the CPU. 540 */ 541 #define pmap_clear(table) atomic_store_64(table, 0) 542 #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) 543 #define pmap_load(table) (*table) 544 #define pmap_load_clear(table) atomic_swap_64(table, 0) 545 #define pmap_load_store(table, entry) atomic_swap_64(table, entry) 546 #define pmap_set_bits(table, bits) atomic_set_64(table, bits) 547 #define pmap_store(table, entry) atomic_store_64(table, entry) 548 549 /********************/ 550 /* Inline functions */ 551 /********************/ 552 553 static __inline void 554 pagecopy(void *s, void *d) 555 { 556 557 memcpy(d, s, PAGE_SIZE); 558 } 559 560 static __inline pd_entry_t * 561 pmap_l0(pmap_t pmap, vm_offset_t va) 562 { 563 564 return (&pmap->pm_l0[pmap_l0_index(va)]); 565 } 566 567 static __inline pd_entry_t * 568 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) 569 { 570 pd_entry_t *l1; 571 572 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0))); 573 return (&l1[pmap_l1_index(va)]); 574 } 575 576 static __inline pd_entry_t * 577 pmap_l1(pmap_t pmap, vm_offset_t va) 578 { 579 pd_entry_t *l0; 580 581 l0 = pmap_l0(pmap, va); 582 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) 583 return (NULL); 584 585 return (pmap_l0_to_l1(l0, va)); 586 } 587 588 static __inline pd_entry_t * 589 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va) 590 { 591 pd_entry_t l1, *l2p; 592 593 l1 = pmap_load(l1p); 594 595 KASSERT(ADDR_IS_CANONICAL(va), 596 ("%s: Address not in canonical form: %lx", __func__, va)); 597 /* 598 * The valid bit may be clear if pmap_update_entry() is concurrently 599 * modifying the entry, so for KVA only the entry type may be checked. 
600 */ 601 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0, 602 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va)); 603 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, 604 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va)); 605 l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1)); 606 return (&l2p[pmap_l2_index(va)]); 607 } 608 609 static __inline pd_entry_t * 610 pmap_l2(pmap_t pmap, vm_offset_t va) 611 { 612 pd_entry_t *l1; 613 614 l1 = pmap_l1(pmap, va); 615 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) 616 return (NULL); 617 618 return (pmap_l1_to_l2(l1, va)); 619 } 620 621 static __inline pt_entry_t * 622 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va) 623 { 624 pd_entry_t l2; 625 pt_entry_t *l3p; 626 627 l2 = pmap_load(l2p); 628 629 KASSERT(ADDR_IS_CANONICAL(va), 630 ("%s: Address not in canonical form: %lx", __func__, va)); 631 /* 632 * The valid bit may be clear if pmap_update_entry() is concurrently 633 * modifying the entry, so for KVA only the entry type may be checked. 634 */ 635 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0, 636 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va)); 637 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, 638 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va)); 639 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2)); 640 return (&l3p[pmap_l3_index(va)]); 641 } 642 643 /* 644 * Returns the lowest valid pde for a given virtual address. 645 * The next level may or may not point to a valid page or block. 
646 */ 647 static __inline pd_entry_t * 648 pmap_pde(pmap_t pmap, vm_offset_t va, int *level) 649 { 650 pd_entry_t *l0, *l1, *l2, desc; 651 652 l0 = pmap_l0(pmap, va); 653 desc = pmap_load(l0) & ATTR_DESCR_MASK; 654 if (desc != L0_TABLE) { 655 *level = -1; 656 return (NULL); 657 } 658 659 l1 = pmap_l0_to_l1(l0, va); 660 desc = pmap_load(l1) & ATTR_DESCR_MASK; 661 if (desc != L1_TABLE) { 662 *level = 0; 663 return (l0); 664 } 665 666 l2 = pmap_l1_to_l2(l1, va); 667 desc = pmap_load(l2) & ATTR_DESCR_MASK; 668 if (desc != L2_TABLE) { 669 *level = 1; 670 return (l1); 671 } 672 673 *level = 2; 674 return (l2); 675 } 676 677 /* 678 * Returns the lowest valid pte block or table entry for a given virtual 679 * address. If there are no valid entries return NULL and set the level to 680 * the first invalid level. 681 */ 682 static __inline pt_entry_t * 683 pmap_pte(pmap_t pmap, vm_offset_t va, int *level) 684 { 685 pd_entry_t *l1, *l2, desc; 686 pt_entry_t *l3; 687 688 l1 = pmap_l1(pmap, va); 689 if (l1 == NULL) { 690 *level = 0; 691 return (NULL); 692 } 693 desc = pmap_load(l1) & ATTR_DESCR_MASK; 694 if (desc == L1_BLOCK) { 695 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 696 *level = 1; 697 return (l1); 698 } 699 700 if (desc != L1_TABLE) { 701 *level = 1; 702 return (NULL); 703 } 704 705 l2 = pmap_l1_to_l2(l1, va); 706 desc = pmap_load(l2) & ATTR_DESCR_MASK; 707 if (desc == L2_BLOCK) { 708 *level = 2; 709 return (l2); 710 } 711 712 if (desc != L2_TABLE) { 713 *level = 2; 714 return (NULL); 715 } 716 717 *level = 3; 718 l3 = pmap_l2_to_l3(l2, va); 719 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) 720 return (NULL); 721 722 return (l3); 723 } 724 725 /* 726 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified 727 * level that maps the specified virtual address, then a pointer to that entry 728 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled 729 * and a diagnostic message is provided, in which case this function panics. 
730 */ 731 static __always_inline pt_entry_t * 732 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag) 733 { 734 pd_entry_t *l0p, *l1p, *l2p; 735 pt_entry_t desc, *l3p; 736 int walk_level __diagused; 737 738 KASSERT(level >= 0 && level < 4, 739 ("%s: %s passed an out-of-range level (%d)", __func__, diag, 740 level)); 741 l0p = pmap_l0(pmap, va); 742 desc = pmap_load(l0p) & ATTR_DESCR_MASK; 743 if (desc == L0_TABLE && level > 0) { 744 l1p = pmap_l0_to_l1(l0p, va); 745 desc = pmap_load(l1p) & ATTR_DESCR_MASK; 746 if (desc == L1_BLOCK && level == 1) { 747 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 748 return (l1p); 749 } 750 if (desc == L1_TABLE && level > 1) { 751 l2p = pmap_l1_to_l2(l1p, va); 752 desc = pmap_load(l2p) & ATTR_DESCR_MASK; 753 if (desc == L2_BLOCK && level == 2) 754 return (l2p); 755 else if (desc == L2_TABLE && level > 2) { 756 l3p = pmap_l2_to_l3(l2p, va); 757 desc = pmap_load(l3p) & ATTR_DESCR_MASK; 758 if (desc == L3_PAGE && level == 3) 759 return (l3p); 760 else 761 walk_level = 3; 762 } else 763 walk_level = 2; 764 } else 765 walk_level = 1; 766 } else 767 walk_level = 0; 768 KASSERT(diag == NULL, 769 ("%s: va %#lx not mapped at level %d, desc %ld at level %d", 770 diag, va, level, desc, walk_level)); 771 return (NULL); 772 } 773 774 bool 775 pmap_ps_enabled(pmap_t pmap) 776 { 777 /* 778 * Promotion requires a hypervisor call when the kernel is running 779 * in EL1. To stop this disable superpage support on non-stage 1 780 * pmaps for now. 781 */ 782 if (pmap->pm_stage != PM_STAGE1) 783 return (false); 784 785 #ifdef KMSAN 786 /* 787 * The break-before-make in pmap_update_entry() results in a situation 788 * where a CPU may call into the KMSAN runtime while the entry is 789 * invalid. If the entry is used to map the current thread structure, 790 * then the runtime will attempt to access unmapped memory. Avoid this 791 * by simply disabling superpage promotion for the kernel map. 
792 */ 793 if (pmap == kernel_pmap) 794 return (false); 795 #endif 796 797 return (superpages_enabled != 0); 798 } 799 800 bool 801 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, 802 pd_entry_t **l2, pt_entry_t **l3) 803 { 804 pd_entry_t *l0p, *l1p, *l2p; 805 806 if (pmap->pm_l0 == NULL) 807 return (false); 808 809 l0p = pmap_l0(pmap, va); 810 *l0 = l0p; 811 812 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) 813 return (false); 814 815 l1p = pmap_l0_to_l1(l0p, va); 816 *l1 = l1p; 817 818 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { 819 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 820 *l2 = NULL; 821 *l3 = NULL; 822 return (true); 823 } 824 825 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) 826 return (false); 827 828 l2p = pmap_l1_to_l2(l1p, va); 829 *l2 = l2p; 830 831 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { 832 *l3 = NULL; 833 return (true); 834 } 835 836 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) 837 return (false); 838 839 *l3 = pmap_l2_to_l3(l2p, va); 840 841 return (true); 842 } 843 844 static __inline int 845 pmap_l3_valid(pt_entry_t l3) 846 { 847 848 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); 849 } 850 851 CTASSERT(L1_BLOCK == L2_BLOCK); 852 853 static pt_entry_t 854 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr) 855 { 856 pt_entry_t val; 857 858 if (pmap->pm_stage == PM_STAGE1) { 859 val = ATTR_S1_IDX(memattr); 860 if (memattr == VM_MEMATTR_DEVICE) 861 val |= ATTR_S1_XN; 862 return (val); 863 } 864 865 val = 0; 866 867 switch (memattr) { 868 case VM_MEMATTR_DEVICE: 869 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) | 870 ATTR_S2_XN(ATTR_S2_XN_ALL)); 871 case VM_MEMATTR_UNCACHEABLE: 872 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC)); 873 case VM_MEMATTR_WRITE_BACK: 874 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB)); 875 case VM_MEMATTR_WRITE_THROUGH: 876 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT)); 877 default: 878 panic("%s: invalid memory attribute %x", __func__, memattr); 879 } 880 } 
881 882 static pt_entry_t 883 pmap_pte_prot(pmap_t pmap, vm_prot_t prot) 884 { 885 pt_entry_t val; 886 887 val = 0; 888 if (pmap->pm_stage == PM_STAGE1) { 889 if ((prot & VM_PROT_EXECUTE) == 0) 890 val |= ATTR_S1_XN; 891 if ((prot & VM_PROT_WRITE) == 0) 892 val |= ATTR_S1_AP(ATTR_S1_AP_RO); 893 } else { 894 if ((prot & VM_PROT_WRITE) != 0) 895 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 896 if ((prot & VM_PROT_READ) != 0) 897 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ); 898 if ((prot & VM_PROT_EXECUTE) == 0) 899 val |= ATTR_S2_XN(ATTR_S2_XN_ALL); 900 } 901 902 return (val); 903 } 904 905 /* 906 * Checks if the PTE is dirty. 907 */ 908 static inline int 909 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte) 910 { 911 912 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); 913 914 if (pmap->pm_stage == PM_STAGE1) { 915 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0, 916 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); 917 918 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 919 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)); 920 } 921 922 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 923 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)); 924 } 925 926 static __inline void 927 pmap_resident_count_inc(pmap_t pmap, int count) 928 { 929 930 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 931 pmap->pm_stats.resident_count += count; 932 } 933 934 static __inline void 935 pmap_resident_count_dec(pmap_t pmap, int count) 936 { 937 938 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 939 KASSERT(pmap->pm_stats.resident_count >= count, 940 ("pmap %p resident count underflow %ld %d", pmap, 941 pmap->pm_stats.resident_count, count)); 942 pmap->pm_stats.resident_count -= count; 943 } 944 945 static vm_paddr_t 946 pmap_early_vtophys(vm_offset_t va) 947 { 948 vm_paddr_t pa_page; 949 950 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK; 951 return (pa_page | (va & PAR_LOW_MASK)); 952 } 953 954 /* State of the bootstrapped DMAP page tables */ 955 struct pmap_bootstrap_state { 956 pt_entry_t 
*l1;		/* currently active L1 table (KVA) */
	pt_entry_t *l2;		/* currently active L2 table (KVA) */
	pt_entry_t *l3;		/* currently active L3 table (KVA) */
	vm_offset_t freemempos;	/* next free page used for new tables */
	vm_offset_t va;		/* current virtual address being mapped */
	vm_paddr_t pa;		/* current physical address being mapped */
	pt_entry_t table_attrs;	/* attributes for new table descriptors */
	u_int l0_slot;		/* index of the cached L0 entry */
	u_int l1_slot;		/* index of the cached L1 entry */
	u_int l2_slot;		/* index of the cached L2 entry */
	bool dmap_valid;	/* true once the DMAP can reach table pages */
};

/* The bootstrap state */
static struct pmap_bootstrap_state bs_state = {
	.l1 = NULL,
	.l2 = NULL,
	.l3 = NULL,
	.table_attrs = TATTR_PXN_TABLE,
	/* Out-of-range slot values force the first lookup to miss */
	.l0_slot = L0_ENTRIES,
	.l1_slot = Ln_ENTRIES,
	.l2_slot = Ln_ENTRIES,
	.dmap_valid = false,
};

/*
 * Make sure state->l1 points at an L1 table covering state->va, creating
 * a new L0 entry (and allocating an L1 table page from freemempos) when
 * needed.
 */
static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l1_pa;
	pd_entry_t l0e;
	u_int l0_slot;

	/* Link the level 0 table to a level 1 table */
	l0_slot = pmap_l0_index(state->va);
	if (l0_slot != state->l0_slot) {
		/*
		 * Make sure we move from a low address to high address
		 * before the DMAP region is ready. This ensures we never
		 * modify an existing mapping until we can map from a
		 * physical address to a virtual address.
		 */
		MPASS(state->l0_slot < l0_slot ||
		    state->l0_slot == L0_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l2 = NULL;
		state->l3 = NULL;
		state->l1_slot = Ln_ENTRIES;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L0 entry */
		state->l0_slot = l0_slot;
		if (state->dmap_valid) {
			l0e = pagetable_l0_ttbr1[l0_slot];
			if ((l0e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the already-installed L1 table */
				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
				l1_pa = PTE_TO_PHYS(l0e);
				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
				return;
			}
		}

		/* Create a new L0 table entry */
		state->l1 = (pt_entry_t *)state->freemempos;
		memset_early(state->l1, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
	}
	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
}

/*
 * Make sure state->l2 points at an L2 table covering state->va, creating
 * a new L1 entry (and allocating an L2 table page) when needed.
 */
static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l2_pa;
	pd_entry_t l1e;
	u_int l1_slot;

	/* Make sure there is a valid L0 -> L1 table */
	pmap_bootstrap_l0_table(state);

	/* Link the level 1 table to a level 2 table */
	l1_slot = pmap_l1_index(state->va);
	if (l1_slot != state->l1_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l1_slot < l1_slot ||
		    state->l1_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l3 = NULL;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L1 entry */
		state->l1_slot = l1_slot;
		if (state->dmap_valid) {
			l1e = state->l1[l1_slot];
			if ((l1e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the already-installed L2 table */
				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
l2_pa = PTE_TO_PHYS(l1e);
				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
				return;
			}
		}

		/* Create a new L1 table entry */
		state->l2 = (pt_entry_t *)state->freemempos;
		memset_early(state->l2, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l1[l1_slot] == 0);
		pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
		    state->table_attrs | L1_TABLE);
	}
	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
}

/*
 * Make sure state->l3 points at an L3 table covering state->va, creating
 * a new L2 entry (and allocating an L3 table page) when needed.
 */
static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l3_pa;
	pd_entry_t l2e;
	u_int l2_slot;

	/* Make sure there is a valid L1 -> L2 table */
	pmap_bootstrap_l1_table(state);

	/* Link the level 2 table to a level 3 table */
	l2_slot = pmap_l2_index(state->va);
	if (l2_slot != state->l2_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l2_slot < l2_slot ||
		    state->l2_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Check the existing L2 entry */
		state->l2_slot = l2_slot;
		if (state->dmap_valid) {
			l2e = state->l2[l2_slot];
			if ((l2e & ATTR_DESCR_VALID) != 0) {
				/* Reuse the already-installed L3 table */
				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
				l3_pa = PTE_TO_PHYS(l2e);
				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
				return;
			}
		}

		/* Create a new L2 table entry */
		state->l3 = (pt_entry_t *)state->freemempos;
		memset_early(state->l3, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
		    state->table_attrs | L2_TABLE);
	}
	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
}

static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
{
	/*
	 * Map as much of physmap region "i" as possible with L2 block
	 * mappings, starting at state->va/state->pa, stopping at the region
	 * end, the DMAP limit, or the end of the current L1 slot's range.
	 */
	pt_entry_t contig;
	u_int l2_slot;
	bool first;

	if ((physmap[i + 1] - state->pa) < L2_SIZE)
		return;

	/* Make sure there is a valid L1 table */
	pmap_bootstrap_l1_table(state);

	MPASS((state->va & L2_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L2_SIZE;
	    state->va += L2_SIZE, state->pa += L2_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L1 slot can address.
		 */
		if (!first && (state->pa & L1_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L2C_ENTRIES
		 * L2 blocks, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 */
		if ((state->pa & L2C_OFFSET) == 0) {
			if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L2C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l2_slot = pmap_l2_index(state->va);
		MPASS((state->pa & L2_OFFSET) == 0);
		MPASS(state->l2[l2_slot] == 0);
		/* Writeback-cacheable, non-executable kernel mapping */
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
	}
	/* va and pa must still agree on the DMAP translation */
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
{
	/*
	 * Map as much of physmap region "i" as possible with 4K/16K L3
	 * pages, analogous to pmap_bootstrap_l2_block() above.
	 */
	pt_entry_t contig;
	u_int l3_slot;
	bool first;

	if (physmap[i + 1] - state->pa < L3_SIZE)
		return;

	/* Make sure there is a valid L2 table */
	pmap_bootstrap_l2_table(state);

	MPASS((state->va & L3_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    physmap[i + 1] - state->pa >= L3_SIZE;
	    state->va += L3_SIZE, state->pa += L3_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L2 slot can address.
		 */
		if (!first && (state->pa & L2_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
		 * L3 pages, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 */
		if ((state->pa & L3C_OFFSET) == 0) {
			if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L3C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l3_slot = pmap_l3_index(state->va);
		MPASS((state->pa & L3_OFFSET) == 0);
		MPASS(state->l3[l3_slot] == 0);
		/* Writeback-cacheable, non-executable kernel mapping */
		pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
	}
	/* va and pa must still agree on the DMAP translation */
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

/*
 * Build the direct map (DMAP) covering every physmap region, using the
 * largest block/page size that the region's alignment allows: L3 pages at
 * unaligned edges, L2 blocks, and L1 blocks (when supported) in the middle.
 * "kernlen" is the size of the kernel image; early page-table pages are
 * carved from the memory immediately after it.
 */
void
pmap_bootstrap_dmap(vm_size_t kernlen)
{
	vm_paddr_t start_pa, pa;
	uint64_t tcr;
	int i;

	tcr = READ_SPECIALREG(tcr_el1);

	/* Verify that the ASID is set through TTBR0. */
	KASSERT((tcr & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0"));

	if ((tcr & TCR_DS) != 0)
		pmap_lpa_enabled = true;

	pmap_l1_supported = L1_BLOCKS_SUPPORTED;

	start_pa = pmap_early_vtophys(KERNBASE);

	/* Early table pages are allocated just past the kernel image */
	bs_state.freemempos = KERNBASE + kernlen;
	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);

	/* Fill in physmap array. */
	physmap_idx = physmem_avail(physmap, nitems(physmap));

	dmap_phys_base = physmap[0] & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;

	for (i = 0; i < physmap_idx; i += 2) {
		bs_state.pa = physmap[i] & ~L3_OFFSET;
		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L3 mappings at the start of the region */
		if ((bs_state.pa & L2_OFFSET) != 0)
			pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa <= physmap[i + 1]);

		if (L1_BLOCKS_SUPPORTED) {
			/* Create L2 mappings at the start of the region */
			if ((bs_state.pa & L1_OFFSET) != 0)
				pmap_bootstrap_l2_block(&bs_state, i);
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create the main L1 block mappings */
			for (; bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
				/* Make sure there is a valid L1 table */
				pmap_bootstrap_l0_table(&bs_state);
				MPASS((bs_state.pa & L1_OFFSET) == 0);
				pmap_store(
				    &bs_state.l1[pmap_l1_index(bs_state.va)],
				    PHYS_TO_PTE(bs_state.pa) | ATTR_AF |
				    pmap_sh_attr |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
			}
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create L2 mappings at the end of the region */
			pmap_bootstrap_l2_block(&bs_state, i);
		} else {
			while (bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
				pmap_bootstrap_l2_block(&bs_state, i);
			}
		}
		MPASS(bs_state.pa <= physmap[i + 1]);

		/* Create L3 mappings at the end of the region */
		pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa == physmap[i + 1]);

		/* Track the highest physical/virtual address mapped */
		if (bs_state.pa > dmap_phys_max) {
			dmap_phys_max = bs_state.pa;
			dmap_max_addr = bs_state.va;
		}
	}

	pmap_s1_invalidate_all_kernel();

	/* Page-table pages are now reachable through the DMAP */
	bs_state.dmap_valid = true;

	/* Exclude the kernel and DMAP region */
	pa = pmap_early_vtophys(bs_state.freemempos);
	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
}

/*
 * Pre-create the L2 tables covering the kernel map from "va" (L1-aligned)
 * up to VM_MAX_KERNEL_ADDRESS.
 */
static void
pmap_bootstrap_l2(vm_offset_t va)
{
	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));

	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
		pmap_bootstrap_l1_table(&bs_state);
}

/*
 * Pre-create the L3 tables covering the kernel map from "va" (L2-aligned)
 * up to VM_MAX_KERNEL_ADDRESS.
 */
static void
pmap_bootstrap_l3(vm_offset_t va)
{
	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
		pmap_bootstrap_l2_table(&bs_state);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(void)
{
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t start_pa, pa;
	size_t largest_phys_size;

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
	mtx_init(&kernel_pmap->pm_mtx, "kernel pmap", NULL, MTX_DEF);
	kernel_pmap->pm_l0_paddr =
	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	vm_radix_init(&kernel_pmap->pm_root);
	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
	kernel_pmap->pm_stage = PM_STAGE1;
	kernel_pmap->pm_levels = 4;
	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
	kernel_pmap->pm_asid_set = &asids;

	/* Reserve some VA space for early BIOS/ACPI mapping */
	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);

	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
	virtual_avail = roundup2(virtual_avail, L1_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE;
	kernel_vm_end = virtual_avail;

	/*
	 * We only use PXN when we know nothing will be executed from it, e.g.
	 * the DMAP region.
	 */
	bs_state.table_attrs &= ~TATTR_PXN_TABLE;

	/*
	 * Find the physical memory we could use. This needs to be after we
	 * exclude any memory that is mapped into the DMAP region but should
	 * not be used by the kernel, e.g. some UEFI memory types.
	 */
	physmap_idx = physmem_avail(physmap, nitems(physmap));

	/*
	 * Find space for early allocations. We search for the largest
	 * region. This is because the user may choose a large msgbuf.
	 * This could be smarter, e.g. to allow multiple regions to be
	 * used & switch to the next when one is full.
	 */
	largest_phys_size = 0;
	for (int i = 0; i < physmap_idx; i += 2) {
		if ((physmap[i + 1] - physmap[i]) > largest_phys_size) {
			largest_phys_size = physmap[i + 1] - physmap[i];
			bs_state.freemempos = PHYS_TO_DMAP(physmap[i]);
		}
	}

	start_pa = pmap_early_vtophys(bs_state.freemempos);

	/*
	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the
	 * loader allocated the first and only l2 page table page used to map
	 * the kernel, preloaded files and module metadata.
	 */
	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
	/* And the l3 tables for the early devmap */
	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));

	pmap_s1_invalidate_all_kernel();

	/* Carve zeroed pages for "var" out of the bootstrap free memory */
#define alloc_pages(var, np)						\
	(var) = bs_state.freemempos;					\
	bs_state.freemempos += (np * PAGE_SIZE);			\
	memset_early((char *)(var), 0, ((np) * PAGE_SIZE));

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	/* Exclude everything the bootstrap allocations consumed */
	pa = pmap_early_vtophys(bs_state.freemempos);

	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
}

#if defined(KASAN) || defined(KMSAN)
/*
 * Map shadow-map VA space starting at *vap using L2 blocks backed by
 * physical memory taken from [start_pa, end_pa), walking backwards from
 * end_pa.  Advances *vap past what was mapped; stops at "eva" or when the
 * physical range is exhausted.
 */
static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
    vm_offset_t *vap, vm_offset_t eva)
{
	vm_paddr_t pa;
	vm_offset_t va;
	pd_entry_t *l2;

	va = *vap;
	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
	for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
		l2 = pmap_l2(kernel_pmap, va);

		/*
		 * KASAN stack checking results in us having already allocated
		 * part of our shadow map, so we can just skip those segments.
		 */
		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
			/* Don't consume a physical page for a skipped slot */
			pa += L2_SIZE;
			continue;
		}

		bzero_early((void *)PHYS_TO_DMAP(pa), L2_SIZE);
		physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
	}
	*vap = va;
}

/*
 * Finish constructing the initial shadow map:
 * - Count how many pages from KERNBASE to virtual_avail (scaled for
 *   shadow map)
 * - Map that entire range using L2 superpages.
 */
static void
pmap_bootstrap_san1(vm_offset_t va, int scale)
{
	vm_offset_t eva;
	vm_paddr_t kernstart;
	int i;

	kernstart = pmap_early_vtophys(KERNBASE);

	/*
	 * Rebuild physmap one more time, we may have excluded more regions from
	 * allocation since pmap_bootstrap().
	 */
	physmap_idx = physmem_avail(physmap, nitems(physmap));

	eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;

	/*
	 * Find a slot in the physmap large enough for what we needed.
 We try to put
	 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
	 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
	 */
	for (i = physmap_idx - 2; i >= 0; i -= 2) {
		vm_paddr_t plow, phigh;

		/* L2 mappings must be backed by memory that is L2-aligned */
		plow = roundup2(physmap[i], L2_SIZE);
		phigh = physmap[i + 1];
		if (plow >= phigh)
			continue;
		/* Never steal memory occupied by the kernel image itself */
		if (kernstart >= plow && kernstart < phigh)
			phigh = kernstart;
		if (phigh - plow >= L2_SIZE) {
			pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
			if (va >= eva)
				break;
		}
	}
	if (i < 0)
		panic("Could not find phys region for shadow map");

	/*
	 * Done. We should now have a valid shadow address mapped for all KVA
	 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
	 * shadow accesses by the sanitizer runtime will succeed for this range.
	 * When the kernel virtual address range is later expanded, as will
	 * happen in vm_mem_init(), the shadow map will be grown as well. This
	 * is handled by pmap_san_enter().
	 */
}

/*
 * Construct the initial sanitizer shadow map(s).  KASAN uses a single
 * scaled shadow; KMSAN uses two unscaled maps (shadow and origin), each
 * seeded here with statically allocated L1/L2 table pages.
 */
void
pmap_bootstrap_san(void)
{
#ifdef KASAN
	pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
#else
	static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	pd_entry_t *l0, *l1;

	/* A single static L1 page can only map L1_SIZE of KVA */
	if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
		panic("initial kernel map is too large");

	l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);

	l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
#endif
}
#endif

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

/*
 * Initialize an ASID/VMID allocator with 2^bits identifiers, marking the
 * identifiers below ASID_FIRST_AVAILABLE as permanently reserved.
 */
static void
pmap_init_asids(struct asid_set *set, int bits)
{
	int i;

	set->asid_bits = bits;

	/*
	 * We may be too early in the overall initialization process to use
	 * bit_alloc().
	 */
	set->asid_set_size = 1 << set->asid_bits;
	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
	    M_WAITOK | M_ZERO);
	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
		bit_set(set->asid_set, i);
	set->asid_next = ASID_FIRST_AVAILABLE;
	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
}

/*
 * Allocate and initialize pv_table, one pmap_large_md_page per L2
 * superpage of each physical segment, with the backing pages allocated
 * from each segment's own NUMA domain.
 */
static void
pmap_init_pv_table(void)
{
	struct vm_phys_seg *seg, *next_seg;
	struct pmap_large_md_page *pvd;
	vm_size_t s;
	int domain, i, j, pages;

	/*
	 * We depend on the size being evenly divisible into a page so
	 * that the pv_table array can be indexed directly while
	 * safely spanning multiple pages from different domains.
	 */
	CTASSERT(PAGE_SIZE % sizeof(*pvd) == 0);

	/*
	 * Calculate the size of the array.
	 */
	s = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		s += round_page(pages * sizeof(*pvd));
	}
	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
	if (pv_table == NULL)
		panic("%s: kva_alloc failed\n", __func__);

	/*
	 * Iterate physical segments to allocate domain-local memory for PV
	 * list headers.
	 */
	pvd = pv_table;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		domain = seg->domain;

		s = round_page(pages * sizeof(*pvd));

		/* Back this segment's slice of the array with domain pages */
		for (j = 0; j < s; j += PAGE_SIZE) {
			vm_page_t m = vm_page_alloc_noobj_domain(domain,
			    VM_ALLOC_ZERO);
			if (m == NULL)
				panic("failed to allocate PV table page");
			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
		}

		/* Initialize each entry's lock and PV list head */
		for (j = 0; j < s / sizeof(*pvd); j++) {
			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
			TAILQ_INIT(&pvd->pv_page.pv_list);
			pvd++;
		}
	}
	pvd = &pv_dummy_large;
	memset(pvd, 0, sizeof(*pvd));
	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
	TAILQ_INIT(&pvd->pv_page.pv_list);

	/*
	 * Set pointers from vm_phys_segs to pv_table.
	 */
	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		seg->md_first = pvd;
		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);

		/*
		 * If there is a following segment, and the final
		 * superpage of this segment and the initial superpage
		 * of the next segment are the same then adjust the
		 * pv_table entry for that next segment down by one so
		 * that the pv_table entries will be shared.
		 */
		if (i + 1 < vm_phys_nsegs) {
			next_seg = &vm_phys_segs[i + 1];
			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
			    pmap_l2_pindex(next_seg->start)) {
				pvd--;
			}
		}
	}
}

/*
 * Enable hardware dirty-bit management (DBM) by default when
 * ID_AA64MMFR1_EL1.HAFDBS reports access flag + dirty state support.
 */
static cpu_feat_en
pmap_dbm_check(const struct cpu_feat *feat __unused, u_int midr __unused)
{
	uint64_t id_aa64mmfr1;

	id_aa64mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
	if (ID_AA64MMFR1_HAFDBS_VAL(id_aa64mmfr1) >=
	    ID_AA64MMFR1_HAFDBS_AF_DBS)
		return (FEAT_DEFAULT_ENABLE);

	return (FEAT_ALWAYS_DISABLE);
}

/*
 * Report CPU errata that make hardware DBM unusable on the current part.
 */
static bool
pmap_dbm_has_errata(const struct cpu_feat *feat __unused, u_int midr,
    u_int **errata_list, u_int *errata_count)
{
	/* Disable on Cortex-A55 for erratum 1024718 - all revisions */
	if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
	    CPU_PART(midr) == CPU_PART_CORTEX_A55) {
		static u_int errata_id = 1024718;

		*errata_list = &errata_id;
		*errata_count = 1;
		return (true);
	}

	/* Disable on Cortex-A510 for erratum 2051678 - r0p0 to r0p2 */
	if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
	    0, 0, 0, 2)) {
		static u_int errata_id = 2051678;

		*errata_list = &errata_id;
		*errata_count = 1;
		return (true);
	}

	return (false);
}

/*
 * Turn on TCR_EL1.HD (hardware dirty state) on this CPU, then flush the
 * local TLB so stale entries without the new behavior are discarded.
 */
static bool
pmap_dbm_enable(const struct cpu_feat *feat __unused,
    cpu_feat_errata errata_status, u_int *errata_list __unused,
    u_int errata_count)
{
	uint64_t tcr;

	/* Skip if there is an erratum affecting DBM */
	if (errata_status != ERRATA_NONE)
		return (false);

	tcr = READ_SPECIALREG(tcr_el1) | TCR_HD;
	WRITE_SPECIALREG(tcr_el1, tcr);
	isb();
	/* Flush the local TLB for the TCR_HD flag change */
	dsb(nshst);
	__asm __volatile("tlbi vmalle1");
	dsb(nsh);
	isb();

	return (true);
}

CPU_FEAT(feat_hafdbs, "Hardware management of the Access flag and dirty state",
    pmap_dbm_check, pmap_dbm_has_errata, pmap_dbm_enable, NULL,
    CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU);

/*
 * Detect parts affected by errata that require the repeated-TLBI
 * workaround (see pmap_multiple_tlbi below).
 */
static cpu_feat_en
pmap_multiple_tlbi_check(const struct cpu_feat *feat __unused, u_int midr)
{
	/*
	 * Cortex-A55 erratum 2441007 (Cat B rare)
	 * Present in all revisions
	 */
	if (CPU_IMPL(midr) == CPU_IMPL_ARM &&
	    CPU_PART(midr) == CPU_PART_CORTEX_A55)
		return (FEAT_DEFAULT_DISABLE);

	/*
	 * Cortex-A76 erratum 1286807 (Cat B rare)
	 * Present in r0p0 - r3p0
	 * Fixed in r3p1
	 */
	if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A76,
	    0, 0, 3, 0))
		return (FEAT_DEFAULT_DISABLE);

	/*
	 * Cortex-A510 erratum 2441009 (Cat B rare)
	 * Present in r0p0 - r1p1
	 * Fixed in r1p2
	 */
	if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510,
	    0, 0, 1, 1))
		return (FEAT_DEFAULT_DISABLE);

	return (FEAT_ALWAYS_DISABLE);
}

/* Arm the workaround flag consulted by the TLB invalidation routines. */
static bool
pmap_multiple_tlbi_enable(const struct cpu_feat *feat __unused,
    cpu_feat_errata errata_status, u_int *errata_list __unused,
    u_int errata_count __unused)
{
	pmap_multiple_tlbi = true;
	return (true);
}

CPU_FEAT(errata_multi_tlbi, "Multiple TLBI errata",
    pmap_multiple_tlbi_check, NULL, pmap_multiple_tlbi_enable, NULL,
    CPU_FEAT_EARLY_BOOT | CPU_FEAT_PER_CPU);

/*
 * Initialize the pmap module.
 *
 * Called by vm_mem_init(), to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	uint64_t mmfr1;
	int i, vmid_bits;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
	if (superpages_enabled) {
		/* Advertise the supported superpage sizes to the VM system */
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = L3C_SIZE;
		KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
		    ("pmap_init: can't assign to pagesizes[2]"));
		pagesizes[2] = L2_SIZE;
		if (L1_BLOCKS_SUPPORTED) {
			KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
			    ("pmap_init: can't assign to pagesizes[3]"));
			pagesizes[3] = L1_SIZE;
		}
	}

	/*
	 * Initialize the ASID allocator.
	 */
	pmap_init_asids(&asids,
	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);

	/* With a hypervisor, also set up the stage 2 VMID allocator */
	if (has_hyp()) {
		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
		vmid_bits = 8;

		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
		    ID_AA64MMFR1_VMIDBits_16)
			vmid_bits = 16;
		pmap_init_asids(&vmids, vmid_bits);
	}

	/*
	 * Initialize pv chunk lists.
	 */
	for (i = 0; i < PMAP_MEMDOM; i++) {
		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
		    MTX_DEF);
		TAILQ_INIT(&pv_chunks[i].pvc_list);
	}
	pmap_init_pv_table();

	vm_initialized = 1;
}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L1 (1GB/64GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");

SYSCTL_BOOL(_vm_pmap_l1, OID_AUTO, supported, CTLFLAG_RD, &pmap_l1_supported,
    0, "L1 blocks are supported");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L2C (32MB/1GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, "L2 (2MB/32MB) page demotions");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, "L2 (2MB/32MB) page mappings");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, "L2 (2MB/32MB) page promotion failures");

static COUNTER_U64_DEFINE_EARLY(pmap_l2_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, "L2 (2MB/32MB) page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L3C (64KB/2MB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");

/*
 * If the given value for "final_only" is false, then any cached intermediate-
 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in
 addition to
 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
 * Otherwise, just the cached final-level entry is invalidated.
 */
static __inline void
pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
{
	/* "aa" variants match any ASID, as required for kernel addresses */
	if (final_only)
		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
	else
		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
}

static __inline void
pmap_s1_invalidate_user(uint64_t r, bool final_only)
{
	/* "r" must carry the ASID in addition to the VA */
	if (final_only)
		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
	else
		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
}

/*
 * Invalidates any cached final- and optionally intermediate-level TLB entries
 * for the specified virtual address in the given virtual address space.
 */
static __inline void
pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
{
	uint64_t r;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	r = TLBI_VA(va);
	if (pmap == kernel_pmap) {
		pmap_s1_invalidate_kernel(r, final_only);
	} else {
		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		pmap_s1_invalidate_user(r, final_only);
	}
	if (pmap_multiple_tlbi) {
		/*
		 * Workaround for the repeated-TLBI errata (see
		 * errata_multi_tlbi): wait for the first invalidation to
		 * complete, then issue a second (dummy) TLBI.
		 */
		dsb(ish);
		__asm __volatile("tlbi vale1is, xzr" ::: "memory");
	}
	dsb(ish);
	isb();
}

/* Stage 2 page invalidation, delegated to the hypervisor-installed hook */
static __inline void
pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
{
	PMAP_ASSERT_STAGE2(pmap);
	MPASS(pmap_stage2_invalidate_range != NULL);
	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
	    final_only);
}

/* Dispatch single-page invalidation on the pmap's translation stage */
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
{
	if (pmap->pm_stage == PM_STAGE1)
		pmap_s1_invalidate_page(pmap, va, final_only);
	else
		pmap_s2_invalidate_page(pmap, va, final_only);
}

/*
 * Use stride L{1,2}_SIZE
 when invalidating the TLB entries for L{1,2}_BLOCK
 * mappings.  Otherwise, use stride L3_SIZE.
 */
static __inline void
pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    vm_offset_t stride, bool final_only)
{
	uint64_t end, r, start;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	if (pmap == kernel_pmap) {
		start = TLBI_VA(sva);
		end = TLBI_VA(eva);
		for (r = start; r < end; r += TLBI_VA(stride))
			pmap_s1_invalidate_kernel(r, final_only);
	} else {
		/* Fold the pmap's ASID into every TLBI operand */
		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		start |= TLBI_VA(sva);
		end |= TLBI_VA(eva);
		for (r = start; r < end; r += TLBI_VA(stride))
			pmap_s1_invalidate_user(r, final_only);
	}
	if (pmap_multiple_tlbi) {
		/*
		 * Workaround for the repeated-TLBI errata (see
		 * errata_multi_tlbi): wait for the first invalidation to
		 * complete, then issue a second (dummy) TLBI.
		 */
		dsb(ish);
		__asm __volatile("tlbi vale1is, xzr" ::: "memory");
	}
	dsb(ish);
	isb();
}

/*
 * Invalidates any cached final- and optionally intermediate-level TLB entries
 * for the specified virtual address range in the given virtual address space.
1995 */ 1996 static __inline void 1997 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1998 bool final_only) 1999 { 2000 pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only); 2001 } 2002 2003 static __inline void 2004 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2005 bool final_only) 2006 { 2007 PMAP_ASSERT_STAGE2(pmap); 2008 MPASS(pmap_stage2_invalidate_range != NULL); 2009 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only); 2010 } 2011 2012 static __inline void 2013 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2014 bool final_only) 2015 { 2016 if (pmap->pm_stage == PM_STAGE1) 2017 pmap_s1_invalidate_range(pmap, sva, eva, final_only); 2018 else 2019 pmap_s2_invalidate_range(pmap, sva, eva, final_only); 2020 } 2021 2022 void 2023 pmap_s1_invalidate_all_kernel(void) 2024 { 2025 dsb(ishst); 2026 __asm __volatile("tlbi vmalle1is"); 2027 if (pmap_multiple_tlbi) { 2028 dsb(ish); 2029 __asm __volatile("tlbi vale1is, xzr" ::: "memory"); 2030 } 2031 dsb(ish); 2032 isb(); 2033 } 2034 2035 /* 2036 * Invalidates all cached intermediate- and final-level TLB entries for the 2037 * given virtual address space. 
 */
static __inline void
pmap_s1_invalidate_all(pmap_t pmap)
{
	uint64_t r;

	PMAP_ASSERT_STAGE1(pmap);

	dsb(ishst);
	if (pmap == kernel_pmap) {
		__asm __volatile("tlbi vmalle1is");
	} else {
		/* Only flush entries tagged with this pmap's ASID. */
		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
	}
	if (pmap_multiple_tlbi) {
		/* See the note in pmap_s1_invalidate_page(). */
		dsb(ish);
		__asm __volatile("tlbi vale1is, xzr" ::: "memory");
	}
	dsb(ish);
	isb();
}

static __inline void
pmap_s2_invalidate_all(pmap_t pmap)
{
	PMAP_ASSERT_STAGE2(pmap);
	MPASS(pmap_stage2_invalidate_all != NULL);
	pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
}

/* Dispatch a full-address-space invalidation to the stage 1 or 2 path. */
static __inline void
pmap_invalidate_all(pmap_t pmap)
{
	if (pmap->pm_stage == PM_STAGE1)
		pmap_s1_invalidate_all(pmap);
	else
		pmap_s2_invalidate_all(pmap);
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 *		Returns 0 if no valid mapping exists.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte, tpte;
	vm_paddr_t pa;
	int lvl;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Find the block or page map for this virtual address. pmap_pte
	 * will return either a valid block/page entry, or NULL.
	 */
	pte = pmap_pte(pmap, va, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);
		pa = PTE_TO_PHYS(tpte);
		/* Add in the offset within the block or page. */
		switch(lvl) {
		case 1:
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
			    ("pmap_extract: Invalid L1 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L1_OFFSET);
			break;
		case 2:
			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
			    ("pmap_extract: Invalid L2 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L2_OFFSET);
			break;
		case 3:
			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
			    ("pmap_extract: Invalid L3 pte found: %lx",
			    tpte & ATTR_DESCR_MASK));
			pa |= (va & L3_OFFSET);
			break;
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *pte, tpte;
	vm_offset_t off;
	vm_page_t m;
	int lvl;
	bool use;

	m = NULL;
	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, va, &lvl);
	if (pte != NULL) {
		tpte = pmap_load(pte);

		KASSERT(lvl > 0 && lvl <= 3,
		    ("pmap_extract_and_hold: Invalid level %d", lvl));
		/*
		 * Check that the pte is either a L3 page, or a L1 or L2 block
		 * entry. We can assume L1_BLOCK == L2_BLOCK.
		 */
		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
		    tpte & ATTR_DESCR_MASK));

		/*
		 * Grant the hold for read-only requests unconditionally;
		 * write requests additionally require the mapping to be
		 * writable for the pmap's stage.
		 */
		use = false;
		if ((prot & VM_PROT_WRITE) == 0)
			use = true;
		else if (pmap->pm_stage == PM_STAGE1 &&
		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
			use = true;
		else if (pmap->pm_stage == PM_STAGE2 &&
		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
			use = true;

		if (use) {
			/* Offset of "va" within the block, 0 for an L3 page. */
			switch (lvl) {
			case 1:
				off = va & L1_OFFSET;
				break;
			case 2:
				off = va & L2_OFFSET;
				break;
			case 3:
			default:
				off = 0;
			}
			m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
			if (m != NULL && !vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

/*
 * Returns true if the entire kernel virtual address range is mapped
 */
static bool
pmap_kmapped_range(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *pte, tpte;
	vm_offset_t eva;

	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS,
	    ("%s: Invalid virtual address: %lx", __func__, sva));
	MPASS(size != 0);
	eva = sva + size - 1;
	KASSERT(eva > sva, ("%s: Size too large: sva %lx, size %lx", __func__,
	    sva, size));

	/* Walk the range, skipping ahead by whatever mapping size is found. */
	while (sva <= eva) {
		pte = pmap_l1(kernel_pmap, sva);
		if (pte == NULL)
			return (false);
		tpte = pmap_load(pte);
		if (tpte == 0)
			return (false);
		if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
			/* L1 block mapping: skip to the next L1 boundary. */
			sva = (sva & ~L1_OFFSET) + L1_SIZE;
			continue;
		}

		pte = pmap_l1_to_l2(&tpte, sva);
		tpte = pmap_load(pte);
		if (tpte == 0)
			return (false);
		if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
			/* L2 block mapping: skip to the next L2 boundary. */
			sva = (sva & ~L2_OFFSET) + L2_SIZE;
			continue;
		}
		pte =
pmap_l2_to_l3(&tpte, sva);
		tpte = pmap_load(pte);
		if (tpte == 0)
			return (false);
		MPASS((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_PAGE);
		/* Contiguous L3 runs can be skipped a whole L3C chunk at a time. */
		if ((tpte & ATTR_CONTIGUOUS) == ATTR_CONTIGUOUS)
			sva = (sva & ~L3C_OFFSET) + L3C_SIZE;
		else
			sva = (sva & ~L3_OFFSET) + L3_SIZE;
	}

	return (true);
}

/*
 * Walks the page tables to translate a kernel virtual address to a
 * physical address. Returns true if the kva is valid and stores the
 * physical address in pa if it is not NULL.
 *
 * See the comment above data_abort() for the rationale for specifying
 * NO_PERTHREAD_SSP here.
 */
bool NO_PERTHREAD_SSP
pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
{
	pt_entry_t *pte, tpte;
	register_t intr;
	uint64_t par;

	/*
	 * Disable interrupts so we don't get interrupted between asking
	 * for address translation, and getting the result back.
	 */
	intr = intr_disable();
	par = arm64_address_translate_s1e1r(va);
	intr_restore(intr);

	/* Fast path: the hardware translated the address for us. */
	if (PAR_SUCCESS(par)) {
		if (pa != NULL)
			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
		return (true);
	}

	/*
	 * Fall back to walking the page table. The address translation
	 * instruction may fail when the page is in a break-before-make
	 * sequence. As we only clear the valid bit in said sequence we
	 * can walk the page table to find the physical address.
	 */

	pte = pmap_l1(kernel_pmap, va);
	if (pte == NULL)
		return (false);

	/*
	 * A concurrent pmap_update_entry() will clear the entry's valid bit
	 * but leave the rest of the entry unchanged. Therefore, we treat a
	 * non-zero entry as being valid, and we ignore the valid bit when
	 * determining whether the entry maps a block, page, or table.
	 */
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		/* L1 block mapping. */
		if (pa != NULL)
			*pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
		return (true);
	}
	pte = pmap_l1_to_l2(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		/* L2 block mapping. */
		if (pa != NULL)
			*pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
		return (true);
	}
	pte = pmap_l2_to_l3(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	/* L3 page mapping. */
	if (pa != NULL)
		*pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
	return (true);
}

/*
 *	Routine:	pmap_kextract
 *	Function:
 *		Extract the physical page address associated with the given
 *		kernel virtual address.  Returns 0 if the address is not
 *		mapped.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	vm_paddr_t pa;

	/* Direct-map addresses translate with simple arithmetic. */
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return (DMAP_TO_PHYS(va));

	if (pmap_klookup(va, &pa) == false)
		return (0);
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
2335 ***************************************************/ 2336 2337 void 2338 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 2339 { 2340 pd_entry_t *pde; 2341 pt_entry_t attr, old_l3e, *pte; 2342 vm_offset_t va; 2343 vm_page_t mpte; 2344 int error, lvl; 2345 2346 KASSERT((pa & L3_OFFSET) == 0, 2347 ("pmap_kenter: Invalid physical address")); 2348 KASSERT((sva & L3_OFFSET) == 0, 2349 ("pmap_kenter: Invalid virtual address")); 2350 KASSERT((size & PAGE_MASK) == 0, 2351 ("pmap_kenter: Mapping is not page-sized")); 2352 2353 attr = ATTR_AF | pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) | 2354 ATTR_S1_XN | ATTR_KERN_GP | ATTR_S1_IDX(mode); 2355 old_l3e = 0; 2356 va = sva; 2357 while (size != 0) { 2358 pde = pmap_pde(kernel_pmap, va, &lvl); 2359 KASSERT(pde != NULL, 2360 ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); 2361 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); 2362 2363 /* 2364 * If we have an aligned, contiguous chunk of L2_SIZE, try 2365 * to create an L2_BLOCK mapping. 2366 */ 2367 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE && 2368 (pa & L2_OFFSET) == 0 && vm_initialized) { 2369 mpte = PTE_TO_VM_PAGE(pmap_load(pde)); 2370 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)), 2371 ("pmap_kenter: Unexpected mapping")); 2372 PMAP_LOCK(kernel_pmap); 2373 error = pmap_insert_pt_page(kernel_pmap, mpte, false, 2374 false); 2375 if (error == 0) { 2376 attr &= ~ATTR_CONTIGUOUS; 2377 2378 /* 2379 * Although the page table page "mpte" should 2380 * be devoid of mappings, the TLB might hold 2381 * intermediate entries that reference it, so 2382 * we perform a single-page invalidation. 
2383 */ 2384 pmap_update_entry(kernel_pmap, pde, 2385 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va, 2386 PAGE_SIZE); 2387 } 2388 PMAP_UNLOCK(kernel_pmap); 2389 if (error == 0) { 2390 va += L2_SIZE; 2391 pa += L2_SIZE; 2392 size -= L2_SIZE; 2393 continue; 2394 } 2395 } 2396 2397 /* 2398 * If we have an aligned, contiguous chunk of L3C_ENTRIES 2399 * L3 pages, set the contiguous bit within each PTE so that 2400 * the chunk can be cached using only one TLB entry. 2401 */ 2402 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) { 2403 if (size >= L3C_SIZE) 2404 attr |= ATTR_CONTIGUOUS; 2405 else 2406 attr &= ~ATTR_CONTIGUOUS; 2407 } 2408 2409 pte = pmap_l2_to_l3(pde, va); 2410 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr | 2411 L3_PAGE); 2412 2413 va += PAGE_SIZE; 2414 pa += PAGE_SIZE; 2415 size -= PAGE_SIZE; 2416 } 2417 if ((old_l3e & ATTR_DESCR_VALID) != 0) 2418 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2419 else { 2420 /* 2421 * Because the old entries were invalid and the new mappings 2422 * are not executable, an isb is not required. 2423 */ 2424 dsb(ishst); 2425 } 2426 } 2427 2428 void 2429 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 2430 { 2431 2432 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 2433 } 2434 2435 /* 2436 * Remove a page from the kernel pagetables. 2437 */ 2438 void 2439 pmap_kremove(vm_offset_t va) 2440 { 2441 pt_entry_t *pte; 2442 2443 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__); 2444 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0, 2445 ("pmap_kremove: unexpected ATTR_CONTIGUOUS")); 2446 pmap_clear(pte); 2447 pmap_s1_invalidate_page(kernel_pmap, va, true); 2448 } 2449 2450 /* 2451 * Remove the specified range of mappings from the kernel address space. 2452 * 2453 * Should only be applied to mappings that were created by pmap_kenter() or 2454 * pmap_kenter_device(). Nothing about this function is actually specific 2455 * to device mappings. 
 */
void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
	pt_entry_t *ptep, *ptep_end;
	vm_offset_t va;
	int lvl;

	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kremove_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kremove_device: Mapping is not page-sized"));

	/* "sva" tracks the start of the not-yet-invalidated cleared range. */
	va = sva;
	while (size != 0) {
		ptep = pmap_pte(kernel_pmap, va, &lvl);
		KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
		switch (lvl) {
		case 2:
			KASSERT((va & L2_OFFSET) == 0,
			    ("Unaligned virtual address"));
			KASSERT(size >= L2_SIZE, ("Insufficient size"));

			/* Flush any pending L3 invalidations first. */
			if (va != sva) {
				pmap_s1_invalidate_range(kernel_pmap, sva, va,
				    true);
			}
			pmap_clear(ptep);
			pmap_s1_invalidate_page(kernel_pmap, va, true);
			PMAP_LOCK(kernel_pmap);
			pmap_remove_kernel_l2(kernel_pmap, ptep, va);
			PMAP_UNLOCK(kernel_pmap);

			va += L2_SIZE;
			sva = va;
			size -= L2_SIZE;
			break;
		case 3:
			if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
				/* Clear a whole ATTR_CONTIGUOUS L3 run. */
				KASSERT((va & L3C_OFFSET) == 0,
				    ("Unaligned L3C virtual address"));
				KASSERT(size >= L3C_SIZE,
				    ("Insufficient L3C size"));

				ptep_end = ptep + L3C_ENTRIES;
				for (; ptep < ptep_end; ptep++)
					pmap_clear(ptep);

				va += L3C_SIZE;
				size -= L3C_SIZE;
				break;
			}
			pmap_clear(ptep);

			va += PAGE_SIZE;
			size -= PAGE_SIZE;
			break;
		default:
			__assert_unreachable();
			break;
		}
	}
	/* Invalidate any remaining tail of cleared L3/L3C mappings. */
	if (va != sva)
		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping. Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged. Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	/* arm64 has a full direct map, so '*virt' is left unchanged. */
	return PHYS_TO_DMAP(start);
}

/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pd_entry_t *pde;
	pt_entry_t attr, old_l3e, *pte;
	vm_offset_t va;
	vm_page_t m;
	int i, lvl;

	old_l3e = 0;
	va = sva;
	for (i = 0; i < count; i++) {
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2,
		    ("pmap_qenter: Invalid level %d", lvl));

		/* Kernel RW, never executable, per-page memory attribute. */
		m = ma[i];
		attr = ATTR_AF | pmap_sh_attr |
		    ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
		    ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
		pte = pmap_l2_to_l3(pde, va);
		old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);

		va += L3_SIZE;
	}
	if ((old_l3e & ATTR_DESCR_VALID) != 0)
		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
	else {
		/*
		 * Because the old entries were invalid and the new mappings
		 * are not executable, an isb is not required.
		 */
		dsb(ishst);
	}
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	pt_entry_t *pte;
	vm_offset_t va;

	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));

	va = sva;
	while (count-- > 0) {
		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
		if (pte != NULL) {
			pmap_clear(pte);
		}

		va += PAGE_SIZE;
	}
	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Decrements a page table page's reference count, which is used to record the
 * number of valid page table entries within the page.  If the reference count
 * drops to zero, then the page table page is unmapped.  Returns true if the
 * page table page was unmapped and false otherwise.
 */
static inline bool
pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	--m->ref_count;
	if (m->ref_count == 0) {
		_pmap_unwire_l3(pmap, va, m, free);
		return (true);
	} else
		return (false);
}

/*
 * Unmap the page table page "m" from its parent table, drop the parent's
 * reference in turn, and queue "m" for deferred freeing.
 */
static void
_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * unmap the page table page
	 */
	if (m->pindex >= (NUL2E + NUL1E)) {
		/* l1 page */
		pd_entry_t *l0;

		l0 = pmap_l0(pmap, va);
		pmap_clear(l0);
	} else if (m->pindex >= NUL2E) {
		/* l2 page */
		pd_entry_t *l1;

		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
	} else {
		/* l3 page */
		pd_entry_t *l2;

		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUL2E) {
		/* We just released an l3, unhold the matching l2 */
		pd_entry_t *l1, tl1;
		vm_page_t l2pg;

		l1 = pmap_l1(pmap, va);
		tl1 = pmap_load(l1);
		l2pg = PTE_TO_VM_PAGE(tl1);
		pmap_unwire_l3(pmap, va, l2pg, free);
	} else if (m->pindex < (NUL2E + NUL1E)) {
		/* We just released an l2, unhold the matching l1 */
		pd_entry_t *l0, tl0;
		vm_page_t l1pg;

		l0 = pmap_l0(pmap, va);
		tl0 = pmap_load(l0);
		l1pg = PTE_TO_VM_PAGE(tl0);
		pmap_unwire_l3(pmap, va, l1pg, free);
	}
	/* Drop intermediate-level TLB entries too (final_only = false). */
	pmap_invalidate_page(pmap, va, false);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, true);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the reference count.
 * Returns nonzero if the page table page was freed.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/* Kernel page table pages are never freed this way. */
	if (ADDR_IS_KERNEL(va))
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PTE_TO_VM_PAGE(ptepde);
	return (pmap_unwire_l3(pmap, va, mpte, free));
}

/*
 * Release a page table page reference after a failed attempt to create a
 * mapping.
 */
static void
pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
{
	struct spglist free;

	SLIST_INIT(&free);
	if (pmap_unwire_l3(pmap, va, mpte, &free))
		vm_page_free_pages_toq(&free, true);
}

/*
 * Initialize the pmap for process 0, reusing the L0 table already
 * installed in ttbr0_el1 by the bootstrap code.
 */
void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
	TAILQ_INIT(&pmap->pm_pvchunk);
	vm_radix_init(&pmap->pm_root);
	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
	pmap->pm_stage = PM_STAGE1;
	pmap->pm_levels = 4;
	pmap->pm_ttbr = pmap->pm_l0_paddr;
	pmap->pm_asid_set = &asids;
	pmap->pm_bti = NULL;

	PCPU_SET(curpmap, pmap);
}

/*
 * Initialize a pmap for the given translation stage (1 for regular
 * process pmaps, 2 for hypervisor guest pmaps) and page-table depth.
 */
int
pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
{
	vm_page_t m;

	/*
	 * allocate the l0 page
	 */
	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);

	TAILQ_INIT(&pmap->pm_pvchunk);
	vm_radix_init(&pmap->pm_root);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);

	MPASS(levels == 3 || levels == 4);

	pmap->pm_levels = levels;
	pmap->pm_stage = stage;
	pmap->pm_bti = NULL;
	switch (stage) {
	case PM_STAGE1:
		pmap->pm_asid_set = &asids;
		if (pmap_bti_support) {
			pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
			    M_ZERO | M_WAITOK);
			rangeset_init(pmap->pm_bti, bti_dup_range,
			    bti_free_range, pmap, M_NOWAIT);
		}
		break;
	case PM_STAGE2:
		pmap->pm_asid_set = &vmids;
		break;
	default:
		panic("%s: Invalid pmap type %d", __func__, stage);
		break;
	}

	/* XXX Temporarily disable deferred ASID allocation. */
	pmap_alloc_asid(pmap);

	/*
	 * Allocate the level 1 entry to use as the root. This will increase
	 * the refcount on the level 1 page so it won't be removed until
	 * pmap_release() is called.
	 */
	if (pmap->pm_levels == 3) {
		PMAP_LOCK(pmap);
		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
		PMAP_UNLOCK(pmap);
	}
	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);

	return (1);
}

int
pmap_pinit(pmap_t pmap)
{

	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
}

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, l1pg, l2pg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			vm_wait(NULL);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	m->pindex = ptepindex;

	/*
	 * Because of AArch64's weak memory consistency model, we must have a
	 * barrier here to ensure that the stores for zeroing "m", whether by
	 * pmap_zero_page() or an earlier function, are visible before adding
	 * "m" to the page table.  Otherwise, a page table walk by another
	 * processor's MMU could see the mapping to "m" and a stale, non-zero
	 * PTE within "m".
	 */
	dmb(ishst);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	if (ptepindex >= (NUL2E + NUL1E)) {
		/* The new page backs an L1 table: hook it into the L0 table. */
		pd_entry_t *l0p, l0e;
		vm_pindex_t l0index;

		l0index = ptepindex - (NUL2E + NUL1E);
		l0p = &pmap->pm_l0[l0index];
		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
		l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;

		/*
		 * Mark all kernel memory as not accessible from userspace
		 * and userspace memory as not executable from the kernel.
		 * This has been done for the bootstrap L0 entries in
		 * locore.S.
		 */
		if (pmap == kernel_pmap)
			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
		else
			l0e |= TATTR_PXN_TABLE;
		pmap_store(l0p, l0e);
	} else if (ptepindex >= NUL2E) {
		/* The new page backs an L2 table: hook it into an L1 table. */
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1;
		pd_entry_t tl0;

		l1index = ptepindex - NUL2E;
		l0index = l1index >> Ln_ENTRIES_SHIFT;

		l0 = &pmap->pm_l0[l0index];
		tl0 = pmap_load(l0);
		if (tl0 == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
			    lockp) == NULL) {
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			/* Existing L1 page: just take another reference. */
			l1pg = PTE_TO_VM_PAGE(tl0);
			l1pg->ref_count++;
		}

		l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
		l1 = &l1[ptepindex & Ln_ADDR_MASK];
		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
	} else {
		/* The new page backs an L3 table: hook it into an L2 table. */
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1, *l2;
		pd_entry_t tl0, tl1;

		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
		l0index = l1index >> Ln_ENTRIES_SHIFT;

		l0 = &pmap->pm_l0[l0index];
		tl0 = pmap_load(l0);
		if (tl0 == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
			    lockp) == NULL) {
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
			tl0 = pmap_load(l0);
			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
			l1 = &l1[l1index & Ln_ADDR_MASK];
		} else {
			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
			l1 = &l1[l1index & Ln_ADDR_MASK];
			tl1 = pmap_load(l1);
			if (tl1 == 0) {
				/* recurse for allocating page dir */
				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
				    lockp) == NULL) {
					vm_page_unwire_noq(m);
					vm_page_free_zero(m);
					return (NULL);
				}
			} else {
				/* Existing L2 page: take another reference. */
				l2pg = PTE_TO_VM_PAGE(tl1);
				l2pg->ref_count++;
			}
		}

		l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
		l2 = &l2[ptepindex & Ln_ADDR_MASK];
		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
		pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}

/*
 * Return the L2 table entry for "va", allocating the containing page table
 * page for user addresses when it does not already exist.  On success,
 * *l2pgp receives the L2 page (with a new reference) for user addresses,
 * or NULL for kernel addresses.
 */
static pd_entry_t *
pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
    struct rwlock **lockp)
{
	pd_entry_t *l1, *l2;
	vm_page_t l2pg;
	vm_pindex_t l2pindex;

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

retry:
	l1 = pmap_l1(pmap, va);
	if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
		l2 = pmap_l1_to_l2(l1, va);
		if (ADDR_IS_USER(va)) {
			/* Add a reference to the L2 page. */
			l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
			l2pg->ref_count++;
		} else
			l2pg = NULL;
	} else if (ADDR_IS_USER(va)) {
		/*
		 * Allocate a L2 page.
		 */
		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
		if (l2pg == NULL) {
			/* With a lock pointer the allocation may have slept. */
			if (lockp != NULL)
				goto retry;
			else
				return (NULL);
		}
		l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
		l2 = &l2[pmap_l2_index(va)];
	} else
		panic("pmap_alloc_l2: missing page table page for va %#lx",
		    va);
	*l2pgp = l2pg;
	return (l2);
}

/*
 * Return, holding a new reference, the page table page that contains (or
 * would contain) the L3 entry for "va", allocating it if necessary.
 */
static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *pde, tpde;
#ifdef INVARIANTS
	pt_entry_t *pte;
#endif
	vm_page_t m;
	int lvl;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_l2_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	pde = pmap_pde(pmap, va, &lvl);

	/*
	 * If the page table page is mapped, we just increment the hold count,
	 * and activate it. If we get a level 2 pde it will point to a level 3
	 * table.
	 */
	switch (lvl) {
	case -1:
		break;
	case 0:
#ifdef INVARIANTS
		pte = pmap_l0_to_l1(pde, va);
		KASSERT(pmap_load(pte) == 0,
		    ("pmap_alloc_l3: TODO: l0 superpages"));
#endif
		break;
	case 1:
#ifdef INVARIANTS
		pte = pmap_l1_to_l2(pde, va);
		KASSERT(pmap_load(pte) == 0,
		    ("pmap_alloc_l3: TODO: l1 superpages"));
#endif
		break;
	case 2:
		tpde = pmap_load(pde);
		if (tpde != 0) {
			m = PTE_TO_VM_PAGE(tpde);
			m->ref_count++;
			return (m);
		}
		break;
	default:
		panic("pmap_alloc_l3: Invalid level %d", lvl);
	}

	/*
	 * Here if the pte page isn't mapped, or if it has been deallocated.
3073 */ 3074 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 3075 if (m == NULL && lockp != NULL) 3076 goto retry; 3077 3078 return (m); 3079 } 3080 3081 /*************************************************** 3082 * Pmap allocation/deallocation routines. 3083 ***************************************************/ 3084 3085 /* 3086 * Release any resources held by the given physical map. 3087 * Called when a pmap initialized by pmap_pinit is being released. 3088 * Should only be called if the map contains no valid mappings. 3089 */ 3090 void 3091 pmap_release(pmap_t pmap) 3092 { 3093 bool rv __diagused; 3094 struct spglist freelist; 3095 struct asid_set *set; 3096 vm_page_t m; 3097 int asid; 3098 3099 if (pmap->pm_levels != 4) { 3100 PMAP_ASSERT_STAGE2(pmap); 3101 KASSERT(pmap->pm_stats.resident_count == 1, 3102 ("pmap_release: pmap resident count %ld != 0", 3103 pmap->pm_stats.resident_count)); 3104 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID, 3105 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0])); 3106 3107 SLIST_INIT(&freelist); 3108 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr); 3109 PMAP_LOCK(pmap); 3110 rv = pmap_unwire_l3(pmap, 0, m, &freelist); 3111 PMAP_UNLOCK(pmap); 3112 MPASS(rv == true); 3113 vm_page_free_pages_toq(&freelist, true); 3114 } 3115 3116 KASSERT(pmap->pm_stats.resident_count == 0, 3117 ("pmap_release: pmap resident count %ld != 0", 3118 pmap->pm_stats.resident_count)); 3119 KASSERT(vm_radix_is_empty(&pmap->pm_root), 3120 ("pmap_release: pmap has reserved page table page(s)")); 3121 3122 set = pmap->pm_asid_set; 3123 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 3124 3125 /* 3126 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate 3127 * the entries when removing them so rely on a later tlb invalidation. 3128 * this will happen when updating the VMID generation. Because of this 3129 * we don't reuse VMIDs within a generation. 
3130 */ 3131 if (pmap->pm_stage == PM_STAGE1) { 3132 mtx_lock_spin(&set->asid_set_mutex); 3133 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) { 3134 asid = COOKIE_TO_ASID(pmap->pm_cookie); 3135 KASSERT(asid >= ASID_FIRST_AVAILABLE && 3136 asid < set->asid_set_size, 3137 ("pmap_release: pmap cookie has out-of-range asid")); 3138 bit_clear(set->asid_set, asid); 3139 } 3140 mtx_unlock_spin(&set->asid_set_mutex); 3141 3142 if (pmap->pm_bti != NULL) { 3143 rangeset_fini(pmap->pm_bti); 3144 free(pmap->pm_bti, M_DEVBUF); 3145 } 3146 } 3147 3148 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); 3149 vm_page_unwire_noq(m); 3150 vm_page_free_zero(m); 3151 } 3152 3153 static int 3154 kvm_size(SYSCTL_HANDLER_ARGS) 3155 { 3156 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 3157 3158 return sysctl_handle_long(oidp, &ksize, 0, req); 3159 } 3160 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 3161 0, 0, kvm_size, "LU", 3162 "Size of KVM"); 3163 3164 static int 3165 kvm_free(SYSCTL_HANDLER_ARGS) 3166 { 3167 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 3168 3169 return sysctl_handle_long(oidp, &kfree, 0, req); 3170 } 3171 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 3172 0, 0, kvm_free, "LU", 3173 "Amount of KVM free"); 3174 3175 /* 3176 * grow the number of kernel page table entries, if needed 3177 */ 3178 static int 3179 pmap_growkernel_nopanic(vm_offset_t addr) 3180 { 3181 vm_page_t nkpg; 3182 pd_entry_t *l0, *l1, *l2; 3183 3184 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 3185 3186 addr = roundup2(addr, L2_SIZE); 3187 if (addr - 1 >= vm_map_max(kernel_map)) 3188 addr = vm_map_max(kernel_map); 3189 if (kernel_vm_end < addr) { 3190 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 3191 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 3192 } 3193 while (kernel_vm_end < addr) { 3194 l0 = pmap_l0(kernel_pmap, kernel_vm_end); 3195 KASSERT(pmap_load(l0) != 0, 3196 
("pmap_growkernel: No level 0 kernel entry")); 3197 3198 l1 = pmap_l0_to_l1(l0, kernel_vm_end); 3199 if (pmap_load(l1) == 0) { 3200 /* We need a new PDP entry */ 3201 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 3202 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3203 if (nkpg == NULL) 3204 return (KERN_RESOURCE_SHORTAGE); 3205 nkpg->pindex = pmap_l1_pindex(kernel_vm_end); 3206 /* See the dmb() in _pmap_alloc_l3(). */ 3207 dmb(ishst); 3208 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE); 3209 continue; /* try again */ 3210 } 3211 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 3212 if (pmap_load(l2) != 0) { 3213 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 3214 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3215 kernel_vm_end = vm_map_max(kernel_map); 3216 break; 3217 } 3218 continue; 3219 } 3220 3221 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 3222 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3223 if (nkpg == NULL) 3224 return (KERN_RESOURCE_SHORTAGE); 3225 nkpg->pindex = pmap_l2_pindex(kernel_vm_end); 3226 /* See the dmb() in _pmap_alloc_l3(). */ 3227 dmb(ishst); 3228 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE); 3229 3230 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 3231 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3232 kernel_vm_end = vm_map_max(kernel_map); 3233 break; 3234 } 3235 } 3236 return (KERN_SUCCESS); 3237 } 3238 3239 int 3240 pmap_growkernel(vm_offset_t addr) 3241 { 3242 int rv; 3243 3244 rv = pmap_growkernel_nopanic(addr); 3245 if (rv != KERN_SUCCESS && pmap_growkernel_panic) 3246 panic("pmap_growkernel: no memory to grow kernel"); 3247 return (rv); 3248 } 3249 3250 /*************************************************** 3251 * page management routines. 3252 ***************************************************/ 3253 3254 static const uint64_t pc_freemask[_NPCM] = { 3255 [0 ... 
_NPCM - 2] = PC_FREEN, 3256 [_NPCM - 1] = PC_FREEL 3257 }; 3258 3259 #ifdef PV_STATS 3260 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 3261 3262 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 3263 "Current number of pv entry chunks"); 3264 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 3265 "Current number of pv entry chunks allocated"); 3266 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 3267 "Current number of pv entry chunks frees"); 3268 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 3269 "Number of times tried to get a chunk page but failed."); 3270 3271 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 3272 static int pv_entry_spare; 3273 3274 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 3275 "Current number of pv entry frees"); 3276 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 3277 "Current number of pv entry allocs"); 3278 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 3279 "Current number of pv entries"); 3280 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 3281 "Current number of spare pv entries"); 3282 #endif 3283 3284 /* 3285 * We are in a serious low memory condition. Resort to 3286 * drastic measures to free some pages so we can allocate 3287 * another pv entry chunk. 3288 * 3289 * Returns NULL if PV entries were reclaimed from the specified pmap. 3290 * 3291 * We do not, however, unmap 2mpages because subsequent accesses will 3292 * allocate per-page pv entries until repromotion occurs, thereby 3293 * exacerbating the shortage of free pv entries. 
3294 */ 3295 static vm_page_t 3296 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 3297 { 3298 struct pv_chunks_list *pvc; 3299 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 3300 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 3301 struct md_page *pvh; 3302 pd_entry_t *pde; 3303 pmap_t next_pmap, pmap; 3304 pt_entry_t *pte, tpte; 3305 pv_entry_t pv; 3306 vm_offset_t va; 3307 vm_page_t m, m_pc; 3308 struct spglist free; 3309 uint64_t inuse; 3310 int bit, field, freed, lvl; 3311 3312 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 3313 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 3314 3315 pmap = NULL; 3316 m_pc = NULL; 3317 SLIST_INIT(&free); 3318 bzero(&pc_marker_b, sizeof(pc_marker_b)); 3319 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 3320 pc_marker = (struct pv_chunk *)&pc_marker_b; 3321 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 3322 3323 pvc = &pv_chunks[domain]; 3324 mtx_lock(&pvc->pvc_lock); 3325 pvc->active_reclaims++; 3326 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 3327 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 3328 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 3329 SLIST_EMPTY(&free)) { 3330 next_pmap = pc->pc_pmap; 3331 if (next_pmap == NULL) { 3332 /* 3333 * The next chunk is a marker. However, it is 3334 * not our marker, so active_reclaims must be 3335 * > 1. Consequently, the next_chunk code 3336 * will not rotate the pv_chunks list. 3337 */ 3338 goto next_chunk; 3339 } 3340 mtx_unlock(&pvc->pvc_lock); 3341 3342 /* 3343 * A pv_chunk can only be removed from the pc_lru list 3344 * when both pvc->pvc_lock is owned and the 3345 * corresponding pmap is locked. 3346 */ 3347 if (pmap != next_pmap) { 3348 if (pmap != NULL && pmap != locked_pmap) 3349 PMAP_UNLOCK(pmap); 3350 pmap = next_pmap; 3351 /* Avoid deadlock and lock recursion. 
*/ 3352 if (pmap > locked_pmap) { 3353 RELEASE_PV_LIST_LOCK(lockp); 3354 PMAP_LOCK(pmap); 3355 mtx_lock(&pvc->pvc_lock); 3356 continue; 3357 } else if (pmap != locked_pmap) { 3358 if (PMAP_TRYLOCK(pmap)) { 3359 mtx_lock(&pvc->pvc_lock); 3360 continue; 3361 } else { 3362 pmap = NULL; /* pmap is not locked */ 3363 mtx_lock(&pvc->pvc_lock); 3364 pc = TAILQ_NEXT(pc_marker, pc_lru); 3365 if (pc == NULL || 3366 pc->pc_pmap != next_pmap) 3367 continue; 3368 goto next_chunk; 3369 } 3370 } 3371 } 3372 3373 /* 3374 * Destroy every non-wired, 4 KB page mapping in the chunk. 3375 */ 3376 freed = 0; 3377 for (field = 0; field < _NPCM; field++) { 3378 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 3379 inuse != 0; inuse &= ~(1UL << bit)) { 3380 bit = ffsl(inuse) - 1; 3381 pv = &pc->pc_pventry[field * 64 + bit]; 3382 va = pv->pv_va; 3383 pde = pmap_pde(pmap, va, &lvl); 3384 if (lvl != 2) 3385 continue; 3386 pte = pmap_l2_to_l3(pde, va); 3387 tpte = pmap_load(pte); 3388 if ((tpte & ATTR_SW_WIRED) != 0) 3389 continue; 3390 if ((tpte & ATTR_CONTIGUOUS) != 0) 3391 (void)pmap_demote_l3c(pmap, pte, va); 3392 tpte = pmap_load_clear(pte); 3393 m = PTE_TO_VM_PAGE(tpte); 3394 if (pmap_pte_dirty(pmap, tpte)) 3395 vm_page_dirty(m); 3396 if ((tpte & ATTR_AF) != 0) { 3397 pmap_s1_invalidate_page(pmap, va, true); 3398 vm_page_aflag_set(m, PGA_REFERENCED); 3399 } 3400 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3401 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3402 m->md.pv_gen++; 3403 if (TAILQ_EMPTY(&m->md.pv_list) && 3404 (m->flags & PG_FICTITIOUS) == 0) { 3405 pvh = page_to_pvh(m); 3406 if (TAILQ_EMPTY(&pvh->pv_list)) { 3407 vm_page_aflag_clear(m, 3408 PGA_WRITEABLE); 3409 } 3410 } 3411 pc->pc_map[field] |= 1UL << bit; 3412 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 3413 freed++; 3414 } 3415 } 3416 if (freed == 0) { 3417 mtx_lock(&pvc->pvc_lock); 3418 goto next_chunk; 3419 } 3420 /* Every freed mapping is for a 4 KB page. 
*/ 3421 pmap_resident_count_dec(pmap, freed); 3422 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3423 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3424 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3425 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3426 if (pc_is_free(pc)) { 3427 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3428 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3429 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3430 /* Entire chunk is free; return it. */ 3431 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3432 dump_drop_page(m_pc->phys_addr); 3433 mtx_lock(&pvc->pvc_lock); 3434 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3435 break; 3436 } 3437 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3438 mtx_lock(&pvc->pvc_lock); 3439 /* One freed pv entry in locked_pmap is sufficient. */ 3440 if (pmap == locked_pmap) 3441 break; 3442 3443 next_chunk: 3444 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 3445 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 3446 if (pvc->active_reclaims == 1 && pmap != NULL) { 3447 /* 3448 * Rotate the pv chunks list so that we do not 3449 * scan the same pv chunks that could not be 3450 * freed (because they contained a wired 3451 * and/or superpage mapping) on every 3452 * invocation of reclaim_pv_chunk(). 3453 */ 3454 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){ 3455 MPASS(pc->pc_pmap != NULL); 3456 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3457 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 3458 } 3459 } 3460 } 3461 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 3462 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 3463 pvc->active_reclaims--; 3464 mtx_unlock(&pvc->pvc_lock); 3465 if (pmap != NULL && pmap != locked_pmap) 3466 PMAP_UNLOCK(pmap); 3467 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 3468 m_pc = SLIST_FIRST(&free); 3469 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3470 /* Recycle a freed page table page. 
*/ 3471 m_pc->ref_count = 1; 3472 } 3473 vm_page_free_pages_toq(&free, true); 3474 return (m_pc); 3475 } 3476 3477 static vm_page_t 3478 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 3479 { 3480 vm_page_t m; 3481 int i, domain; 3482 3483 domain = PCPU_GET(domain); 3484 for (i = 0; i < vm_ndomains; i++) { 3485 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 3486 if (m != NULL) 3487 break; 3488 domain = (domain + 1) % vm_ndomains; 3489 } 3490 3491 return (m); 3492 } 3493 3494 /* 3495 * free the pv_entry back to the free list 3496 */ 3497 static void 3498 free_pv_entry(pmap_t pmap, pv_entry_t pv) 3499 { 3500 struct pv_chunk *pc; 3501 int idx, field, bit; 3502 3503 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3504 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3505 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3506 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3507 pc = pv_to_chunk(pv); 3508 idx = pv - &pc->pc_pventry[0]; 3509 field = idx / 64; 3510 bit = idx % 64; 3511 pc->pc_map[field] |= 1ul << bit; 3512 if (!pc_is_free(pc)) { 3513 /* 98% of the time, pc is already at the head of the list. 
		 */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

/*
 * Free a PV chunk's backing page.  The chunk must already have been removed
 * from its pc_lru list.
 */
static void
free_pv_chunk_dequeued(struct pv_chunk *pc)
{
	vm_page_t m;

	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

/* Remove a PV chunk from its domain's pc_lru list, then free it. */
static void
free_pv_chunk(struct pv_chunk *pc)
{
	struct pv_chunks_list *pvc;

	pvc = &pv_chunks[pc_to_domain(pc)];
	mtx_lock(&pvc->pvc_lock);
	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
	mtx_unlock(&pvc->pvc_lock);
	free_pv_chunk_dequeued(pc);
}

/*
 * Free a batch of PV chunks, one list per memory domain.  All chunks are
 * dequeued under each domain's lock first so each lock is taken only once.
 */
static void
free_pv_chunk_batch(struct pv_chunklist *batch)
{
	struct pv_chunks_list *pvc;
	struct pv_chunk *pc, *npc;
	int i;

	for (i = 0; i < vm_ndomains; i++) {
		if (TAILQ_EMPTY(&batch[i]))
			continue;
		pvc = &pv_chunks[i];
		mtx_lock(&pvc->pvc_lock);
		TAILQ_FOREACH(pc, &batch[i], pc_list) {
			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
		}
		mtx_unlock(&pvc->pvc_lock);
	}

	for (i = 0; i < vm_ndomains; i++) {
		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
			free_pv_chunk_dequeued(pc);
		}
	}
}

/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed. If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	struct pv_chunks_list *pvc;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		/* Find the first free entry in the chunk's bitmap. */
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc_is_full(pc)) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
	pc->pc_map[0] &= ~1ul;		/* preallocated bit 0 */
	pvc = &pv_chunks[vm_page_domain(m)];
	mtx_lock(&pvc->pvc_lock);
	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
	mtx_unlock(&pvc->pvc_lock);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	return (pv);
}

/*
 * Ensure that the
number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pv_chunks_list *pvc;
	struct pch new_tail[PMAP_MEMDOM];
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free, i;
	bool reclaimed;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated. Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks. In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&new_tail[i]);
retry:
	/* Count the free entries already available in this pmap's chunks. */
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		bit_count((bitstr_t *)pc->pc_map, 0,
		    sizeof(pc->pc_map) * NBBY, &free);
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	for (reclaimed = false; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
			reclaimed = true;
		}
		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
		dump_add_page(m->phys_addr);
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));

		/*
		 * The reclaim might have freed a chunk from the current pmap.
		 * If that chunk contained available entries, we need to
		 * re-count the number of available entries.
		 */
		if (reclaimed)
			goto retry;
	}
	/* Publish the private chunks on the per-domain pc_lru lists. */
	for (i = 0; i < vm_ndomains; i++) {
		if (TAILQ_EMPTY(&new_tail[i]))
			continue;
		pvc = &pv_chunks[i];
		mtx_lock(&pvc->pvc_lock);
		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
		mtx_unlock(&pvc->pvc_lock);
	}
}

/*
 * First find and then remove the pv entry for the specified pmap and virtual
 * address from the specified pv list. Returns the pv entry if found and NULL
 * otherwise. This operation can be performed on pv lists for either 4KB or
 * 2MB page mappings.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
			break;
		}
	}
	return (pv);
}

/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 */
static void
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;
	int bit, field;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((va & L2_OFFSET) == 0,
	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
	KASSERT((pa & L2_OFFSET) == 0,
	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list. Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
	va_last = va + L2_SIZE - PAGE_SIZE;
	for (;;) {
		/*
		 * The caller must have called reserve_pv_entries(), so the
		 * head chunk is guaranteed to have a spare entry.
		 */
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
				    ("pmap_pv_demote_l2: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	if (pc_is_full(pc)) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
}

/*
 * First find and then destroy the pv entry for the specified pmap and virtual
 * address. This operation can be performed on pv lists for either 4KB or 2MB
 * page mappings.
3814 */ 3815 static void 3816 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3817 { 3818 pv_entry_t pv; 3819 3820 pv = pmap_pvh_remove(pvh, pmap, va); 3821 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3822 free_pv_entry(pmap, pv); 3823 } 3824 3825 /* 3826 * Conditionally create the PV entry for a 4KB page mapping if the required 3827 * memory can be allocated without resorting to reclamation. 3828 */ 3829 static bool 3830 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3831 struct rwlock **lockp) 3832 { 3833 pv_entry_t pv; 3834 3835 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3836 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3837 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3838 pv->pv_va = va; 3839 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3840 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3841 m->md.pv_gen++; 3842 return (true); 3843 } else 3844 return (false); 3845 } 3846 3847 /* 3848 * Create the PV entry for a 2MB page mapping. Always returns true unless the 3849 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 3850 * false if the PV entry cannot be allocated without resorting to reclamation. 3851 */ 3852 static bool 3853 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 3854 struct rwlock **lockp) 3855 { 3856 struct md_page *pvh; 3857 pv_entry_t pv; 3858 vm_paddr_t pa; 3859 3860 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3861 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3862 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 
3863 NULL : lockp)) == NULL) 3864 return (false); 3865 pv->pv_va = va; 3866 pa = PTE_TO_PHYS(l2e); 3867 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3868 pvh = pa_to_pvh(pa); 3869 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3870 pvh->pv_gen++; 3871 return (true); 3872 } 3873 3874 /* 3875 * Conditionally creates the PV entries for a L3C superpage mapping if 3876 * the required memory can be allocated without resorting to reclamation. 3877 */ 3878 static bool 3879 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m, 3880 struct rwlock **lockp) 3881 { 3882 pv_entry_t pv; 3883 vm_offset_t tva; 3884 vm_paddr_t pa __diagused; 3885 vm_page_t mt; 3886 3887 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3888 KASSERT((va & L3C_OFFSET) == 0, 3889 ("pmap_pv_insert_l3c: va is not aligned")); 3890 pa = VM_PAGE_TO_PHYS(m); 3891 KASSERT((pa & L3C_OFFSET) == 0, 3892 ("pmap_pv_insert_l3c: pa is not aligned")); 3893 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3894 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) { 3895 /* Pass NULL instead of lockp to disable reclamation. 
*/ 3896 pv = get_pv_entry(pmap, NULL); 3897 if (__predict_false(pv == NULL)) { 3898 while (tva > va) { 3899 mt--; 3900 tva -= L3_SIZE; 3901 pmap_pvh_free(&mt->md, pmap, tva); 3902 } 3903 return (false); 3904 } 3905 pv->pv_va = tva; 3906 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next); 3907 mt->md.pv_gen++; 3908 } 3909 return (true); 3910 } 3911 3912 static void 3913 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 3914 { 3915 pt_entry_t newl2, oldl2 __diagused; 3916 vm_page_t ml3; 3917 vm_paddr_t ml3pa; 3918 3919 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 3920 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3921 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3922 3923 ml3 = pmap_remove_pt_page(pmap, va); 3924 KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page")); 3925 3926 ml3pa = VM_PAGE_TO_PHYS(ml3); 3927 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE; 3928 3929 /* 3930 * If this page table page was unmapped by a promotion, then it 3931 * contains valid mappings. Zero it to invalidate those mappings. 3932 */ 3933 if (vm_page_any_valid(ml3)) 3934 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 3935 3936 /* 3937 * Demote the mapping. The caller must have already invalidated the 3938 * mapping (i.e., the "break" in break-before-make). 3939 */ 3940 oldl2 = pmap_load_store(l2, newl2); 3941 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 3942 __func__, l2, oldl2)); 3943 } 3944 3945 /* 3946 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 
3947 */ 3948 static int 3949 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, 3950 bool demote_kl2e, struct spglist *free, struct rwlock **lockp) 3951 { 3952 struct md_page *pvh; 3953 pt_entry_t old_l2; 3954 vm_page_t m, ml3, mt; 3955 3956 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3957 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 3958 old_l2 = pmap_load_clear(l2); 3959 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3960 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); 3961 3962 /* 3963 * Since a promotion must break the 4KB page mappings before making 3964 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 3965 */ 3966 pmap_s1_invalidate_page(pmap, sva, true); 3967 3968 if (old_l2 & ATTR_SW_WIRED) 3969 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 3970 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 3971 if (old_l2 & ATTR_SW_MANAGED) { 3972 m = PTE_TO_VM_PAGE(old_l2); 3973 pvh = page_to_pvh(m); 3974 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3975 pmap_pvh_free(pvh, pmap, sva); 3976 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) { 3977 if (pmap_pte_dirty(pmap, old_l2)) 3978 vm_page_dirty(mt); 3979 if (old_l2 & ATTR_AF) 3980 vm_page_aflag_set(mt, PGA_REFERENCED); 3981 if (TAILQ_EMPTY(&mt->md.pv_list) && 3982 TAILQ_EMPTY(&pvh->pv_list)) 3983 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3984 } 3985 } 3986 if (pmap != kernel_pmap) { 3987 ml3 = pmap_remove_pt_page(pmap, sva); 3988 if (ml3 != NULL) { 3989 KASSERT(vm_page_any_valid(ml3), 3990 ("pmap_remove_l2: l3 page not promoted")); 3991 pmap_resident_count_dec(pmap, 1); 3992 KASSERT(ml3->ref_count == NL3PG, 3993 ("pmap_remove_l2: l3 page ref count error")); 3994 ml3->ref_count = 0; 3995 pmap_add_delayed_free_list(ml3, free, false); 3996 } 3997 } else if (demote_kl2e) { 3998 pmap_remove_kernel_l2(pmap, l2, sva); 3999 } else { 4000 ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva)); 4001 if (vm_page_any_valid(ml3)) { 4002 ml3->valid = 0; 
4003 pmap_zero_page(ml3); 4004 } 4005 } 4006 return (pmap_unuse_pt(pmap, sva, l1e, free)); 4007 } 4008 4009 /* 4010 * pmap_remove_l3: do the things to unmap a page in a process 4011 */ 4012 static int 4013 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 4014 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 4015 { 4016 struct md_page *pvh; 4017 pt_entry_t old_l3; 4018 vm_page_t m; 4019 4020 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4021 old_l3 = pmap_load(l3); 4022 if ((old_l3 & ATTR_CONTIGUOUS) != 0) 4023 (void)pmap_demote_l3c(pmap, l3, va); 4024 old_l3 = pmap_load_clear(l3); 4025 pmap_s1_invalidate_page(pmap, va, true); 4026 if (old_l3 & ATTR_SW_WIRED) 4027 pmap->pm_stats.wired_count -= 1; 4028 pmap_resident_count_dec(pmap, 1); 4029 if (old_l3 & ATTR_SW_MANAGED) { 4030 m = PTE_TO_VM_PAGE(old_l3); 4031 if (pmap_pte_dirty(pmap, old_l3)) 4032 vm_page_dirty(m); 4033 if (old_l3 & ATTR_AF) 4034 vm_page_aflag_set(m, PGA_REFERENCED); 4035 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 4036 pmap_pvh_free(&m->md, pmap, va); 4037 if (TAILQ_EMPTY(&m->md.pv_list) && 4038 (m->flags & PG_FICTITIOUS) == 0) { 4039 pvh = page_to_pvh(m); 4040 if (TAILQ_EMPTY(&pvh->pv_list)) 4041 vm_page_aflag_clear(m, PGA_WRITEABLE); 4042 } 4043 } 4044 return (pmap_unuse_pt(pmap, va, l2e, free)); 4045 } 4046 4047 /* 4048 * Removes the specified L3C superpage mapping. Requests TLB invalidations 4049 * to be performed by the caller through the returned "*vap". Returns true 4050 * if the level 3 table "ml3" was unmapped and added to the spglist "free". 4051 * Otherwise, returns false. 
4052 */ 4053 static bool 4054 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap, 4055 vm_offset_t va_next, vm_page_t ml3, struct spglist *free, 4056 struct rwlock **lockp) 4057 { 4058 struct md_page *pvh; 4059 struct rwlock *new_lock; 4060 pt_entry_t first_l3e, l3e, *tl3p; 4061 vm_offset_t tva; 4062 vm_page_t m, mt; 4063 4064 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4065 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) == 4066 0, ("pmap_remove_l3c: l3p is not aligned")); 4067 KASSERT((va & L3C_OFFSET) == 0, 4068 ("pmap_remove_l3c: va is not aligned")); 4069 4070 /* 4071 * Hardware accessed and dirty bit maintenance might only update a 4072 * single L3 entry, so we must combine the accessed and dirty bits 4073 * from this entire set of contiguous L3 entries. 4074 */ 4075 first_l3e = pmap_load_clear(l3p); 4076 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 4077 l3e = pmap_load_clear(tl3p); 4078 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 4079 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS")); 4080 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) == 4081 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW))) 4082 first_l3e &= ~ATTR_S1_AP_RW_BIT; 4083 first_l3e |= l3e & ATTR_AF; 4084 } 4085 if ((first_l3e & ATTR_SW_WIRED) != 0) 4086 pmap->pm_stats.wired_count -= L3C_ENTRIES; 4087 pmap_resident_count_dec(pmap, L3C_ENTRIES); 4088 if ((first_l3e & ATTR_SW_MANAGED) != 0) { 4089 m = PTE_TO_VM_PAGE(first_l3e); 4090 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4091 if (new_lock != *lockp) { 4092 if (*lockp != NULL) { 4093 /* 4094 * Pending TLB invalidations must be 4095 * performed before the PV list lock is 4096 * released. Otherwise, a concurrent 4097 * pmap_remove_all() on a physical page 4098 * could return while a stale TLB entry 4099 * still provides access to that page. 
4100 */ 4101 if (*vap != va_next) { 4102 pmap_invalidate_range(pmap, *vap, va, 4103 true); 4104 *vap = va_next; 4105 } 4106 rw_wunlock(*lockp); 4107 } 4108 *lockp = new_lock; 4109 rw_wlock(*lockp); 4110 } 4111 pvh = page_to_pvh(m); 4112 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += 4113 L3_SIZE) { 4114 if (pmap_pte_dirty(pmap, first_l3e)) 4115 vm_page_dirty(mt); 4116 if ((first_l3e & ATTR_AF) != 0) 4117 vm_page_aflag_set(mt, PGA_REFERENCED); 4118 pmap_pvh_free(&mt->md, pmap, tva); 4119 if (TAILQ_EMPTY(&mt->md.pv_list) && 4120 TAILQ_EMPTY(&pvh->pv_list)) 4121 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4122 } 4123 } 4124 if (*vap == va_next) 4125 *vap = va; 4126 if (ml3 != NULL) { 4127 ml3->ref_count -= L3C_ENTRIES; 4128 if (ml3->ref_count == 0) { 4129 _pmap_unwire_l3(pmap, va, ml3, free); 4130 return (true); 4131 } 4132 } 4133 return (false); 4134 } 4135 4136 /* 4137 * Remove the specified range of addresses from the L3 page table that is 4138 * identified by the given L2 entry. 4139 */ 4140 static void 4141 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, 4142 vm_offset_t eva, struct spglist *free, struct rwlock **lockp) 4143 { 4144 struct md_page *pvh; 4145 struct rwlock *new_lock; 4146 pt_entry_t *l3, old_l3; 4147 vm_offset_t va; 4148 vm_page_t l3pg, m; 4149 4150 KASSERT(ADDR_IS_CANONICAL(sva), 4151 ("%s: Start address not in canonical form: %lx", __func__, sva)); 4152 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS, 4153 ("%s: End address not in canonical form: %lx", __func__, eva)); 4154 4155 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4156 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), 4157 ("pmap_remove_l3_range: range crosses an L3 page table boundary")); 4158 l3pg = ADDR_IS_USER(sva) ? 
PTE_TO_VM_PAGE(l2e) : NULL; 4159 va = eva; 4160 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 4161 old_l3 = pmap_load(l3); 4162 if (!pmap_l3_valid(old_l3)) { 4163 if (va != eva) { 4164 pmap_invalidate_range(pmap, va, sva, true); 4165 va = eva; 4166 } 4167 continue; 4168 } 4169 if ((old_l3 & ATTR_CONTIGUOUS) != 0) { 4170 /* 4171 * Is this entire set of contiguous L3 entries being 4172 * removed? Handle the possibility that "eva" is zero 4173 * because of address wraparound. 4174 */ 4175 if ((sva & L3C_OFFSET) == 0 && 4176 sva + L3C_OFFSET <= eva - 1) { 4177 if (pmap_remove_l3c(pmap, l3, sva, &va, eva, 4178 l3pg, free, lockp)) { 4179 /* The L3 table was unmapped. */ 4180 sva += L3C_SIZE; 4181 break; 4182 } 4183 l3 += L3C_ENTRIES - 1; 4184 sva += L3C_SIZE - L3_SIZE; 4185 continue; 4186 } 4187 4188 (void)pmap_demote_l3c(pmap, l3, sva); 4189 } 4190 old_l3 = pmap_load_clear(l3); 4191 if ((old_l3 & ATTR_SW_WIRED) != 0) 4192 pmap->pm_stats.wired_count--; 4193 pmap_resident_count_dec(pmap, 1); 4194 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 4195 m = PTE_TO_VM_PAGE(old_l3); 4196 if (pmap_pte_dirty(pmap, old_l3)) 4197 vm_page_dirty(m); 4198 if ((old_l3 & ATTR_AF) != 0) 4199 vm_page_aflag_set(m, PGA_REFERENCED); 4200 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4201 if (new_lock != *lockp) { 4202 if (*lockp != NULL) { 4203 /* 4204 * Pending TLB invalidations must be 4205 * performed before the PV list lock is 4206 * released. Otherwise, a concurrent 4207 * pmap_remove_all() on a physical page 4208 * could return while a stale TLB entry 4209 * still provides access to that page. 
4210 */ 4211 if (va != eva) { 4212 pmap_invalidate_range(pmap, va, 4213 sva, true); 4214 va = eva; 4215 } 4216 rw_wunlock(*lockp); 4217 } 4218 *lockp = new_lock; 4219 rw_wlock(*lockp); 4220 } 4221 pmap_pvh_free(&m->md, pmap, sva); 4222 if (TAILQ_EMPTY(&m->md.pv_list) && 4223 (m->flags & PG_FICTITIOUS) == 0) { 4224 pvh = page_to_pvh(m); 4225 if (TAILQ_EMPTY(&pvh->pv_list)) 4226 vm_page_aflag_clear(m, PGA_WRITEABLE); 4227 } 4228 } 4229 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 4230 /* 4231 * _pmap_unwire_l3() has already invalidated the TLB 4232 * entries at all levels for "sva". So, we need not 4233 * perform "sva += L3_SIZE;" here. Moreover, we need 4234 * not perform "va = sva;" if "sva" is at the start 4235 * of a new valid range consisting of a single page. 4236 */ 4237 break; 4238 } 4239 if (va == eva) 4240 va = sva; 4241 } 4242 if (va != eva) 4243 pmap_invalidate_range(pmap, va, sva, true); 4244 } 4245 4246 static void 4247 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) 4248 { 4249 struct rwlock *lock; 4250 vm_offset_t va_next; 4251 pd_entry_t *l0, *l1, *l2; 4252 pt_entry_t l3_paddr; 4253 struct spglist free; 4254 4255 /* 4256 * Perform an unsynchronized read. This is, however, safe. 
4257 */ 4258 if (pmap->pm_stats.resident_count == 0) 4259 return; 4260 4261 SLIST_INIT(&free); 4262 4263 PMAP_LOCK(pmap); 4264 if (map_delete) 4265 pmap_bti_on_remove(pmap, sva, eva); 4266 4267 lock = NULL; 4268 for (; sva < eva; sva = va_next) { 4269 if (pmap->pm_stats.resident_count == 0) 4270 break; 4271 4272 l0 = pmap_l0(pmap, sva); 4273 if (pmap_load(l0) == 0) { 4274 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4275 if (va_next < sva) 4276 va_next = eva; 4277 continue; 4278 } 4279 4280 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4281 if (va_next < sva) 4282 va_next = eva; 4283 l1 = pmap_l0_to_l1(l0, sva); 4284 if (pmap_load(l1) == 0) 4285 continue; 4286 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4287 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4288 KASSERT(va_next <= eva, 4289 ("partial update of non-transparent 1G page " 4290 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4291 pmap_load(l1), sva, eva, va_next)); 4292 MPASS(pmap != kernel_pmap); 4293 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 4294 pmap_clear(l1); 4295 pmap_s1_invalidate_page(pmap, sva, true); 4296 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); 4297 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); 4298 continue; 4299 } 4300 4301 /* 4302 * Calculate index for next page table. 4303 */ 4304 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4305 if (va_next < sva) 4306 va_next = eva; 4307 4308 l2 = pmap_l1_to_l2(l1, sva); 4309 l3_paddr = pmap_load(l2); 4310 4311 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 4312 if (sva + L2_SIZE == va_next && eva >= va_next) { 4313 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 4314 true, &free, &lock); 4315 continue; 4316 } else if (pmap_demote_l2_locked(pmap, l2, sva, 4317 &lock) == NULL) 4318 continue; 4319 l3_paddr = pmap_load(l2); 4320 } 4321 4322 /* 4323 * Weed out invalid mappings. 
4324 */ 4325 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 4326 continue; 4327 4328 /* 4329 * Limit our scan to either the end of the va represented 4330 * by the current page table page, or to the end of the 4331 * range being removed. 4332 */ 4333 if (va_next > eva) 4334 va_next = eva; 4335 4336 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 4337 &lock); 4338 } 4339 if (lock != NULL) 4340 rw_wunlock(lock); 4341 PMAP_UNLOCK(pmap); 4342 vm_page_free_pages_toq(&free, true); 4343 } 4344 4345 /* 4346 * Remove the given range of addresses from the specified map. 4347 * 4348 * It is assumed that the start and end are properly 4349 * rounded to the page size. 4350 */ 4351 void 4352 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4353 { 4354 pmap_remove1(pmap, sva, eva, false); 4355 } 4356 4357 /* 4358 * Remove the given range of addresses as part of a logical unmap 4359 * operation. This has the effect of calling pmap_remove(), but 4360 * also clears any metadata that should persist for the lifetime 4361 * of a logical mapping. 4362 */ 4363 void 4364 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4365 { 4366 pmap_remove1(pmap, sva, eva, true); 4367 } 4368 4369 /* 4370 * Routine: pmap_remove_all 4371 * Function: 4372 * Removes this physical page from 4373 * all physical maps in which it resides. 4374 * Reflects back modify bits to the pager. 4375 * 4376 * Notes: 4377 * Original versions of this routine were very 4378 * inefficient because they iteratively called 4379 * pmap_remove (slow...) 
 */

void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t *pde, tpde;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	struct spglist free;
	int lvl, pvh_gen, md_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	SLIST_INIT(&free);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/*
	 * First, demote each of the page's 2MB mappings to 4KB mappings, so
	 * that the second loop only has to deal with 4KB mappings.
	 */
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		pmap_demote_l2_locked(pmap, pte, va, &lock);
		PMAP_UNLOCK(pmap);
	}
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pmap_resident_count_dec(pmap, 1);

		pde = pmap_pde(pmap, pv->pv_va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_remove_all: no page directory entry found"));
		KASSERT(lvl == 2,
		    ("pmap_remove_all: invalid pde level %d", lvl));
		tpde = pmap_load(pde);

		pte = pmap_l2_to_l3(pde, pv->pv_va);
		tpte = pmap_load(pte);
		if ((tpte & ATTR_CONTIGUOUS) != 0)
			(void)pmap_demote_l3c(pmap, pte, pv->pv_va);
		tpte = pmap_load_clear(pte);
		if (tpte & ATTR_SW_WIRED)
			pmap->pm_stats.wired_count--;
		if ((tpte & ATTR_AF) != 0) {
			pmap_invalidate_page(pmap, pv->pv_va, true);
			vm_page_aflag_set(m, PGA_REFERENCED);
		}

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
}

/*
 * Masks and sets bits in a level 2 page table entries in the specified pmap
 */
static void
pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
    pt_entry_t nbits)
{
	pd_entry_t old_l2;
	vm_page_t m, mt;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT((sva & L2_OFFSET) == 0,
	    ("pmap_protect_l2: sva is not 2mpage aligned"));
	old_l2 = pmap_load(l2);
	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));

	/*
	 * Return if the L2 entry already has the desired access restrictions
	 * in place.
	 */
	if ((old_l2 & mask) == nbits)
		return;

	while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
		cpu_spinwait();

	/*
	 * When a dirty read/write superpage mapping is write protected,
	 * update the dirty field of each of the superpage's constituent 4KB
	 * pages.
	 */
	if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
	    pmap_pte_dirty(pmap, old_l2)) {
		m = PTE_TO_VM_PAGE(old_l2);
		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
			vm_page_dirty(mt);
	}

	/*
	 * Since a promotion must break the 4KB page mappings before making
	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
	 */
	pmap_s1_invalidate_page(pmap, sva, true);
}

/*
 * Masks and sets bits in the specified L3C superpage mapping.
 *
 * Requests TLB invalidations to be performed by the caller through the
 * returned "*vap".
 */
static void
pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
{
	pt_entry_t l3e, *tl3p;
	vm_page_t m, mt;
	bool dirty;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
	    0, ("pmap_mask_set_l3c: l3p is not aligned"));
	KASSERT((va & L3C_OFFSET) == 0,
	    ("pmap_mask_set_l3c: va is not aligned"));
	dirty = false;
	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
		l3e = pmap_load(tl3p);
		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
		    ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
		while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
			cpu_spinwait();
		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
			dirty = true;
	}

	/*
	 * When a dirty read/write superpage mapping is write protected,
	 * update the dirty field of each of the superpage's constituent 4KB
	 * pages.
	 *
	 * NOTE(review): "l3e" here holds the value of the last entry examined
	 * by the loop above; this presumably relies on ATTR_SW_MANAGED being
	 * identical across all entries of the L3C range -- confirm.
	 */
	if ((l3e & ATTR_SW_MANAGED) != 0 &&
	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
	    dirty) {
		m = PTE_TO_VM_PAGE(pmap_load(l3p));
		for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
			vm_page_dirty(mt);
	}

	if (*vap == va_next)
		*vap = va;
}

/*
 * Masks and sets bits in last level page table entries in the specified
 * pmap and range
 */
static void
pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
    pt_entry_t nbits, bool invalidate)
{
	vm_offset_t va, va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t *l3p, l3;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	for (; sva < eva; sva = va_next) {
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
			if ((pmap_load(l1) & mask) != nbits) {
				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
				if (invalidate)
					pmap_s1_invalidate_page(pmap, sva, true);
			}
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (pmap_load(l2) == 0)
			continue;

		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_protect_l2(pmap, l2, sva, mask, nbits);
				continue;
			} else if ((pmap_load(l2) & mask) == nbits ||
			    pmap_demote_l2(pmap, l2, sva) == NULL)
				continue;
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_protect: Invalid L2 entry after demotion"));

		if (va_next > eva)
			va_next = eva;

		/*
		 * "va" is the start of the pending TLB-invalidation range;
		 * "va == va_next" means that the pending range is empty.
		 */
		va = va_next;
		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
		    sva += L3_SIZE) {
			l3 = pmap_load(l3p);

			/*
			 * Go to the next L3 entry if the current one is
			 * invalid or already has the desired access
			 * restrictions in place.  (The latter case occurs
			 * frequently.  For example, in a "buildworld"
			 * workload, almost 1 out of 4 L3 entries already
			 * have the desired restrictions.)
			 */
			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
				if (va != va_next) {
					if (invalidate)
						pmap_s1_invalidate_range(pmap,
						    va, sva, true);
					va = va_next;
				}
				if ((l3 & ATTR_CONTIGUOUS) != 0) {
					/*
					 * Does this L3C page extend beyond
					 * the requested range?  Handle the
					 * possibility that "va_next" is zero.
					 */
					if ((sva | L3C_OFFSET) > va_next - 1)
						break;

					/*
					 * Skip ahead to the last L3_PAGE
					 * within this L3C page.
					 */
					l3p = (pt_entry_t *)((uintptr_t)l3p |
					    ((L3C_ENTRIES - 1) *
					    sizeof(pt_entry_t)));
					sva |= L3C_SIZE - L3_SIZE;
				}
				continue;
			}

			if ((l3 & ATTR_CONTIGUOUS) != 0) {
				/*
				 * Is this entire set of contiguous L3 entries
				 * being protected?  Handle the possibility
				 * that "va_next" is zero because of address
				 * wraparound.
				 */
				if ((sva & L3C_OFFSET) == 0 &&
				    sva + L3C_OFFSET <= va_next - 1) {
					pmap_mask_set_l3c(pmap, l3p, sva, &va,
					    va_next, mask, nbits);
					l3p += L3C_ENTRIES - 1;
					sva += L3C_SIZE - L3_SIZE;
					continue;
				}

				(void)pmap_demote_l3c(pmap, l3p, sva);

				/*
				 * The L3 entry's accessed bit may have changed.
				 */
				l3 = pmap_load(l3p);
			}
			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
			    nbits))
				cpu_spinwait();

			/*
			 * When a dirty read/write mapping is write protected,
			 * update the page's dirty field.
			 */
			if ((l3 & ATTR_SW_MANAGED) != 0 &&
			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
			    pmap_pte_dirty(pmap, l3))
				vm_page_dirty(PTE_TO_VM_PAGE(l3));

			if (va == va_next)
				va = sva;
		}
		if (va != va_next && invalidate)
			pmap_s1_invalidate_range(pmap, va, sva, true);
	}
}

/*
 * Locked wrapper around pmap_mask_set_locked().
 */
static void
pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
    pt_entry_t nbits, bool invalidate)
{
	PMAP_LOCK(pmap);
	pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
	PMAP_UNLOCK(pmap);
}

/*
 * Set the physical protection on the
 * specified range of this map as requested.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	pt_entry_t mask, nbits;

	PMAP_ASSERT_STAGE1(pmap);
	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
	if (prot == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

	mask = nbits = 0;
	if ((prot & VM_PROT_WRITE) == 0) {
		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
	}
	if ((prot & VM_PROT_EXECUTE) == 0) {
		mask |= ATTR_S1_XN;
		nbits |= ATTR_S1_XN;
	}
	/* For the kernel pmap, force ATTR_KERN_GP to remain set. */
	if (pmap == kernel_pmap) {
		mask |= ATTR_KERN_GP;
		nbits |= ATTR_KERN_GP;
	}
	if (mask == 0)
		return;

	pmap_mask_set(pmap, sva, eva, mask, nbits, true);
}

void
pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
{

	MPASS((sva & L3_OFFSET) == 0);
	MPASS(((sva + size) & L3_OFFSET) == 0);

	pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
	    ATTR_SW_NO_PROMOTE, false);
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 *
 * If "promoted" is false, then the page table page "mpte" must be zero filled;
 * "mpte"'s valid field will be set to 0.
 *
 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
 * contain valid mappings with identical attributes except for ATTR_AF;
 * "mpte"'s valid field will be set to 1.
 *
 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
 * field will be set to VM_PAGE_BITS_ALL.
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
    bool all_l3e_AF_set)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(promoted || !all_l3e_AF_set,
	    ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
	mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
	return (vm_radix_insert(&pmap->pm_root, mpte));
}

/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}

/*
 * Performs a break-before-make update of a pmap entry.
This is needed when 4815 * either promoting or demoting pages to ensure the TLB doesn't get into an 4816 * inconsistent state. 4817 */ 4818 static void 4819 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte, 4820 vm_offset_t va, vm_size_t size) 4821 { 4822 register_t intr; 4823 4824 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4825 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0, 4826 ("%s: Updating non-promote pte", __func__)); 4827 4828 /* 4829 * Ensure we don't get switched out with the page table in an 4830 * inconsistent state. We also need to ensure no interrupts fire 4831 * as they may make use of an address we are about to invalidate. 4832 */ 4833 intr = intr_disable(); 4834 4835 /* 4836 * Clear the old mapping's valid bit, but leave the rest of the entry 4837 * unchanged, so that a lockless, concurrent pmap_kextract() can still 4838 * lookup the physical address. 4839 */ 4840 pmap_clear_bits(ptep, ATTR_DESCR_VALID); 4841 4842 /* 4843 * When promoting, the L{1,2}_TABLE entry that is being replaced might 4844 * be cached, so we invalidate intermediate entries as well as final 4845 * entries. 4846 */ 4847 pmap_s1_invalidate_range(pmap, va, va + size, false); 4848 4849 /* Create the new mapping */ 4850 pmap_store(ptep, newpte); 4851 dsb(ishst); 4852 4853 intr_restore(intr); 4854 } 4855 4856 /* 4857 * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping. 4858 */ 4859 static void __nosanitizecoverage 4860 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end, 4861 pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size) 4862 { 4863 pd_entry_t *lip; 4864 register_t intr; 4865 4866 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4867 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0, 4868 ("%s: Updating non-promote pte", __func__)); 4869 4870 /* 4871 * Ensure we don't get switched out with the page table in an 4872 * inconsistent state. 
We also need to ensure no interrupts fire 4873 * as they may make use of an address we are about to invalidate. 4874 */ 4875 intr = intr_disable(); 4876 4877 /* 4878 * Clear the old mapping's valid bits, but leave the rest of each 4879 * entry unchanged, so that a lockless, concurrent pmap_kextract() can 4880 * still lookup the physical address. 4881 */ 4882 for (lip = ptep; lip < ptep_end; lip++) 4883 pmap_clear_bits(lip, ATTR_DESCR_VALID); 4884 4885 /* Only final entries are changing. */ 4886 pmap_s1_invalidate_strided(pmap, va, va + size, stride, true); 4887 4888 /* Create the new mapping. */ 4889 for (lip = ptep; lip < ptep_end; lip++) { 4890 pmap_store(lip, newpte); 4891 newpte += stride; 4892 } 4893 dsb(ishst); 4894 4895 intr_restore(intr); 4896 } 4897 4898 #if VM_NRESERVLEVEL > 0 4899 /* 4900 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 4901 * replace the many pv entries for the 4KB page mappings by a single pv entry 4902 * for the 2MB page mapping. 4903 */ 4904 static void 4905 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 4906 struct rwlock **lockp) 4907 { 4908 struct md_page *pvh; 4909 pv_entry_t pv; 4910 vm_offset_t va_last; 4911 vm_page_t m; 4912 4913 KASSERT((pa & L2_OFFSET) == 0, 4914 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 4915 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4916 4917 /* 4918 * Transfer the first page's pv entry for this mapping to the 2mpage's 4919 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 4920 * a transfer avoids the possibility that get_pv_entry() calls 4921 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 4922 * mappings that is being promoted. 
4923 */ 4924 m = PHYS_TO_VM_PAGE(pa); 4925 va = va & ~L2_OFFSET; 4926 pv = pmap_pvh_remove(&m->md, pmap, va); 4927 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 4928 pvh = page_to_pvh(m); 4929 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4930 pvh->pv_gen++; 4931 /* Free the remaining NPTEPG - 1 pv entries. */ 4932 va_last = va + L2_SIZE - PAGE_SIZE; 4933 do { 4934 m++; 4935 va += PAGE_SIZE; 4936 pmap_pvh_free(&m->md, pmap, va); 4937 } while (va < va_last); 4938 } 4939 4940 /* 4941 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4942 * single level 2 table entry to a single 2MB page mapping. For promotion 4943 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4944 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4945 * identical characteristics. 4946 */ 4947 static bool 4948 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte, 4949 struct rwlock **lockp) 4950 { 4951 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa; 4952 4953 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4954 4955 /* 4956 * Currently, this function only supports promotion on stage 1 pmaps 4957 * because it tests stage 1 specific fields and performs a break- 4958 * before-make sequence that is incorrect for stage 2 pmaps. 4959 */ 4960 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap)) 4961 return (false); 4962 4963 /* 4964 * Examine the first L3E in the specified PTP. Abort if this L3E is 4965 * ineligible for promotion... 4966 */ 4967 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 4968 newl2 = pmap_load(firstl3); 4969 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0) 4970 return (false); 4971 /* ... is not the first physical page within an L2 block */ 4972 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 || 4973 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... 
or is invalid */ 4974 counter_u64_add(pmap_l2_p_failures, 1); 4975 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4976 " in pmap %p", va, pmap); 4977 return (false); 4978 } 4979 4980 /* 4981 * Both here and in the below "for" loop, to allow for repromotion 4982 * after MADV_FREE, conditionally write protect a clean L3E before 4983 * possibly aborting the promotion due to other L3E attributes. Why? 4984 * Suppose that MADV_FREE is applied to a part of a superpage, the 4985 * address range [S, E). pmap_advise() will demote the superpage 4986 * mapping, destroy the 4KB page mapping at the end of [S, E), and 4987 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later, 4988 * imagine that the memory in [S, E) is recycled, but the last 4KB 4989 * page in [S, E) is not the last to be rewritten, or simply accessed. 4990 * In other words, there is still a 4KB page in [S, E), call it P, 4991 * that is writeable but AP_RO is set and AF is clear in P's L3E. 4992 * Unless we write protect P before aborting the promotion, if and 4993 * when P is finally rewritten, there won't be a page fault to trigger 4994 * repromotion. 4995 */ 4996 setl2: 4997 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4998 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4999 /* 5000 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 5001 * ATTR_SW_DBM can be cleared without a TLB invalidation. 5002 */ 5003 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) 5004 goto setl2; 5005 newl2 &= ~ATTR_SW_DBM; 5006 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx" 5007 " in pmap %p", va & ~L2_OFFSET, pmap); 5008 } 5009 5010 /* 5011 * Examine each of the other L3Es in the specified PTP. Abort if this 5012 * L3E maps an unexpected 4KB physical page or does not have identical 5013 * characteristics to the first L3E. If ATTR_AF is not set in every 5014 * PTE, then request that the PTP be refilled on demotion. 
5015 */ 5016 all_l3e_AF = newl2 & ATTR_AF; 5017 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK)) 5018 + L2_SIZE - PAGE_SIZE; 5019 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 5020 oldl3 = pmap_load(l3); 5021 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) { 5022 counter_u64_add(pmap_l2_p_failures, 1); 5023 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 5024 " in pmap %p", va, pmap); 5025 return (false); 5026 } 5027 setl3: 5028 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 5029 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 5030 /* 5031 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 5032 * set, ATTR_SW_DBM can be cleared without a TLB 5033 * invalidation. 5034 */ 5035 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 5036 ~ATTR_SW_DBM)) 5037 goto setl3; 5038 oldl3 &= ~ATTR_SW_DBM; 5039 } 5040 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) { 5041 counter_u64_add(pmap_l2_p_failures, 1); 5042 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 5043 " in pmap %p", va, pmap); 5044 return (false); 5045 } 5046 all_l3e_AF &= oldl3; 5047 pa -= PAGE_SIZE; 5048 } 5049 5050 /* 5051 * Unless all PTEs have ATTR_AF set, clear it from the superpage 5052 * mapping, so that promotions triggered by speculative mappings, 5053 * such as pmap_enter_quick(), don't automatically mark the 5054 * underlying pages as referenced. 5055 */ 5056 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF; 5057 5058 /* 5059 * Save the page table page in its current state until the L2 5060 * mapping the superpage is demoted by pmap_demote_l2() or 5061 * destroyed by pmap_remove_l3(). 
5062 */ 5063 if (mpte == NULL) 5064 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 5065 KASSERT(mpte >= vm_page_array && 5066 mpte < &vm_page_array[vm_page_array_size], 5067 ("pmap_promote_l2: page table page is out of range")); 5068 KASSERT(mpte->pindex == pmap_l2_pindex(va), 5069 ("pmap_promote_l2: page table page's pindex is wrong")); 5070 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) { 5071 counter_u64_add(pmap_l2_p_failures, 1); 5072 CTR2(KTR_PMAP, 5073 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 5074 pmap); 5075 return (false); 5076 } 5077 5078 if ((newl2 & ATTR_SW_MANAGED) != 0) 5079 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp); 5080 5081 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE); 5082 5083 counter_u64_add(pmap_l2_promotions, 1); 5084 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 5085 pmap); 5086 return (true); 5087 } 5088 5089 /* 5090 * Tries to promote an aligned, contiguous set of base page mappings to a 5091 * single L3C page mapping. For promotion to occur, two conditions must be 5092 * met: (1) the base page mappings must map aligned, contiguous physical 5093 * memory and (2) the base page mappings must have identical characteristics 5094 * except for the accessed flag. 5095 */ 5096 static bool 5097 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va) 5098 { 5099 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa; 5100 5101 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5102 5103 /* 5104 * Currently, this function only supports promotion on stage 1 pmaps 5105 * because it tests stage 1 specific fields and performs a break- 5106 * before-make sequence that is incorrect for stage 2 pmaps. 5107 */ 5108 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap)) 5109 return (false); 5110 5111 /* 5112 * Compute the address of the first L3 entry in the superpage 5113 * candidate. 
5114 */ 5115 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 5116 sizeof(pt_entry_t)) - 1)); 5117 5118 firstl3c = pmap_load(l3p); 5119 5120 /* 5121 * Examine the first L3 entry. Abort if this L3E is ineligible for 5122 * promotion... 5123 */ 5124 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0) 5125 return (false); 5126 /* ...is not properly aligned... */ 5127 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 || 5128 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */ 5129 counter_u64_add(pmap_l3c_p_failures, 1); 5130 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 5131 " in pmap %p", va, pmap); 5132 return (false); 5133 } 5134 5135 /* 5136 * If the first L3 entry is a clean read-write mapping, convert it 5137 * to a read-only mapping. See pmap_promote_l2() for the rationale. 5138 */ 5139 set_first: 5140 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 5141 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 5142 /* 5143 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 5144 * ATTR_SW_DBM can be cleared without a TLB invalidation. 5145 */ 5146 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM)) 5147 goto set_first; 5148 firstl3c &= ~ATTR_SW_DBM; 5149 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx" 5150 " in pmap %p", va & ~L3C_OFFSET, pmap); 5151 } 5152 5153 /* 5154 * Check that the rest of the L3 entries are compatible with the first, 5155 * and convert clean read-write mappings to read-only mappings. 
5156 */ 5157 all_l3e_AF = firstl3c & ATTR_AF; 5158 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) + 5159 L3C_SIZE - PAGE_SIZE; 5160 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) { 5161 oldl3 = pmap_load(l3); 5162 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) { 5163 counter_u64_add(pmap_l3c_p_failures, 1); 5164 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 5165 " in pmap %p", va, pmap); 5166 return (false); 5167 } 5168 set_l3: 5169 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 5170 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 5171 /* 5172 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 5173 * set, ATTR_SW_DBM can be cleared without a TLB 5174 * invalidation. 5175 */ 5176 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 5177 ~ATTR_SW_DBM)) 5178 goto set_l3; 5179 oldl3 &= ~ATTR_SW_DBM; 5180 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx" 5181 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) | 5182 (va & ~L3C_OFFSET), pmap); 5183 } 5184 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) { 5185 counter_u64_add(pmap_l3c_p_failures, 1); 5186 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 5187 " in pmap %p", va, pmap); 5188 return (false); 5189 } 5190 all_l3e_AF &= oldl3; 5191 pa -= PAGE_SIZE; 5192 } 5193 5194 /* 5195 * Unless all PTEs have ATTR_AF set, clear it from the superpage 5196 * mapping, so that promotions triggered by speculative mappings, 5197 * such as pmap_enter_quick(), don't automatically mark the 5198 * underlying pages as referenced. 5199 */ 5200 firstl3c &= ~ATTR_AF | all_l3e_AF; 5201 5202 /* 5203 * Remake the mappings with the contiguous bit set. 
 */
	pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c |
	    ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE);

	counter_u64_add(pmap_l3c_promotions, 1);
	CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
	    pmap);
	return (true);
}
#endif /* VM_NRESERVLEVEL > 0 */

/*
 * Installs a single large-page mapping of size pagesizes[psind] at "va",
 * allocating any intermediate page table pages that are needed.  "pte"
 * supplies the physical address and attributes for the new mapping.
 * Depending on "psind", the mapping is entered as an L1 block (psind == 3),
 * an L2 block (psind == 2), or a run of L3C_ENTRIES contiguous L3 pages
 * (psind == 1).
 *
 * Returns KERN_SUCCESS on success, KERN_PROTECTION_FAILURE if the BTI
 * attributes are not uniform across the range, or KERN_RESOURCE_SHORTAGE
 * if a page table page could not be allocated and PMAP_ENTER_NOSLEEP was
 * specified.
 */
static int
pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
    int psind)
{
	pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p;
	vm_page_t mp;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(psind > 0 && psind < MAXPAGESIZES,
	    ("psind %d unexpected", psind));
	KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0,
	    ("unaligned phys address %#lx pte %#lx psind %d",
	    PTE_TO_PHYS(pte), pte, psind));

restart:
	newpte = pte;
	if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
		return (KERN_PROTECTION_FAILURE);
	if (psind == 3) {
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;

		KASSERT(pagesizes[psind] == L1_SIZE,
		    ("pagesizes[%d] != L1_SIZE", psind));
		l0p = pmap_l0(pmap, va);
		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
			/* The L1 PTP is absent; allocate it, sleeping if
			 * permitted and restarting from the top since the
			 * pmap lock was dropped. */
			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
			if (mp == NULL) {
				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
					return (KERN_RESOURCE_SHORTAGE);
				PMAP_UNLOCK(pmap);
				vm_wait(NULL);
				PMAP_LOCK(pmap);
				goto restart;
			}
			l1p = pmap_l0_to_l1(l0p, va);
			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
			origpte = pmap_load(l1p);
		} else {
			l1p = pmap_l0_to_l1(l0p, va);
			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
			origpte = pmap_load(l1p);
			if ((origpte & ATTR_DESCR_VALID) == 0) {
				/* Account for the new L1 block in the
				 * containing L1 PTP's ref count. */
				mp = PTE_TO_VM_PAGE(pmap_load(l0p));
				mp->ref_count++;
			}
		}
		/* Only an identical remapping or a previously invalid
		 * entry may be overwritten. */
		KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
		    (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
		    (origpte & ATTR_DESCR_VALID) == 0,
		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
		    va, origpte, newpte));
		pmap_store(l1p, newpte);
	} else if (psind == 2) {
		KASSERT(pagesizes[psind] == L2_SIZE,
		    ("pagesizes[%d] != L2_SIZE", psind));
		l2p = pmap_l2(pmap, va);
		if (l2p == NULL) {
			/* The L2 PTP is absent; allocate it. */
			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
			if (mp == NULL) {
				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
					return (KERN_RESOURCE_SHORTAGE);
				PMAP_UNLOCK(pmap);
				vm_wait(NULL);
				PMAP_LOCK(pmap);
				goto restart;
			}
			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
			l2p = &l2p[pmap_l2_index(va)];
			origpte = pmap_load(l2p);
		} else {
			l1p = pmap_l1(pmap, va);
			origpte = pmap_load(l2p);
			if ((origpte & ATTR_DESCR_VALID) == 0) {
				/* Account for the new L2 block in the
				 * containing L2 PTP's ref count. */
				mp = PTE_TO_VM_PAGE(pmap_load(l1p));
				mp->ref_count++;
			}
		}
		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
		    PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
		    va, origpte, newpte));
		pmap_store(l2p, newpte);
	} else /* (psind == 1) */ {
		KASSERT(pagesizes[psind] == L3C_SIZE,
		    ("pagesizes[%d] != L3C_SIZE", psind));
		l2p = pmap_l2(pmap, va);
		if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) {
			/* The L3 PTP is absent; allocate it. */
			mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
			if (mp == NULL) {
				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
					return (KERN_RESOURCE_SHORTAGE);
				PMAP_UNLOCK(pmap);
				vm_wait(NULL);
				PMAP_LOCK(pmap);
				goto restart;
			}
			/* _pmap_alloc_l3() added one reference; the L3C
			 * mapping accounts for L3C_ENTRIES in total. */
			mp->ref_count += L3C_ENTRIES - 1;
			l3p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
			l3p = &l3p[pmap_l3_index(va)];
		} else {
			l3p = pmap_l2_to_l3(l2p, va);
			if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) {
				mp = PTE_TO_VM_PAGE(pmap_load(l2p));
				mp->ref_count += L3C_ENTRIES;
			}
		}
		/* Fill all L3C_ENTRIES PTEs backing the contiguous run,
		 * advancing the physical address by L3_SIZE each step. */
		for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
			origpte = pmap_load(tl3p);
			KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
			    ((origpte & ATTR_CONTIGUOUS) != 0 &&
			    PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
			    ("va %#lx changing 64K phys page l3 %#lx newpte %#lx",
			    va, origpte, newpte));
			pmap_store(tl3p, newpte);
			newpte += L3_SIZE;
		}
	}
	/* Ensure the PTE store(s) are visible to the table walker. */
	dsb(ishst);

	if ((origpte & ATTR_DESCR_VALID) == 0)
		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
	else if ((newpte & ATTR_SW_WIRED) == 0 &&
	    (origpte & ATTR_SW_WIRED) != 0)
		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;

	return (KERN_SUCCESS);
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 */
int
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    u_int flags, int8_t psind)
{
	struct rwlock *lock;
	pd_entry_t *pde;
	pt_entry_t new_l3, orig_l3;
	pt_entry_t *l2, *l3;
	pv_entry_t pv;
	vm_paddr_t opa, pa;
	vm_page_t mpte, om;
	bool nosleep;
	int full_lvl, lvl, rv;

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	va = trunc_page(va);
	if ((m->oflags & VPO_UNMANAGED) == 0)
		VM_PAGE_OBJECT_BUSY_ASSERT(m);
	pa = VM_PAGE_TO_PHYS(m);
	/* Construct the new PTE from the page's physical address, the
	 * memory attributes, and the requested protection. */
	new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
	    L3_PAGE);
	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
	new_l3 |= pmap_pte_prot(pmap, prot);
	if ((flags & PMAP_ENTER_WIRED) != 0)
		new_l3 |= ATTR_SW_WIRED;
	if (pmap->pm_stage == PM_STAGE1) {
		if (ADDR_IS_USER(va))
			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
		else
			new_l3 |= ATTR_S1_UXN;
		if (pmap != kernel_pmap)
			new_l3 |= ATTR_S1_nG;
	} else {
		/*
		 * Clear the access flag on executable mappings, this will be
		 * set later when the page is accessed. The fault handler is
		 * required to invalidate the I-cache.
		 *
		 * TODO: Switch to the valid flag to allow hardware management
		 * of the access flag. Much of the pmap code assumes the
		 * valid flag is set and fails to destroy the old page tables
		 * correctly if it is clear.
		 */
		if (prot & VM_PROT_EXECUTE)
			new_l3 &= ~ATTR_AF;
	}
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		new_l3 |= ATTR_SW_MANAGED;
		if ((prot & VM_PROT_WRITE) != 0) {
			new_l3 |= ATTR_SW_DBM;
			/*
			 * NB: the low bits of "flags" carry the fault access
			 * type (see pmap_enter(9)); if this access is not a
			 * write, enter the mapping read-only so that the
			 * first write faults and dirties the page.
			 */
			if ((flags & VM_PROT_WRITE) == 0) {
				if (pmap->pm_stage == PM_STAGE1)
					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
				else
					new_l3 &=
					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
			}
		}
	}

	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);

	lock = NULL;
	PMAP_LOCK(pmap);
	/* Dispatch to the large-page paths when a superpage was requested. */
	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
		    ("managed largepage va %#lx flags %#x", va, flags));
		if (psind == 3) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			new_l3 &= ~L3_PAGE;
			new_l3 |= L1_BLOCK;
		} else if (psind == 2) {
			new_l3 &= ~L3_PAGE;
			new_l3 |= L2_BLOCK;
		} else /* (psind == 1) */
			new_l3 |= ATTR_CONTIGUOUS;
		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
		goto out;
	}
	if (psind == 2) {
		/* Assert the required virtual and physical alignment. */
		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
		KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind"));
		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
		    flags, m, &lock);
		goto out;
	}
	mpte = NULL;
	if (psind == 1) {
		KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned"));
		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
		rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags,
		    m, &mpte, &lock);
#if VM_NRESERVLEVEL > 0
		/*
		 * Attempt L2 promotion, if both the PTP and a level 1
		 * reservation are fully populated.
		 */
		if (rv == KERN_SUCCESS &&
		    (mpte == NULL || mpte->ref_count == NL3PG) &&
		    (m->flags & PG_FICTITIOUS) == 0 &&
		    vm_reserv_level_iffullpop(m) == 1) {
			pde = pmap_l2(pmap, va);
			(void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
		}
#endif
		goto out;
	}

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
retry:
	pde = pmap_pde(pmap, va, &lvl);
	if (pde != NULL && lvl == 2) {
		l3 = pmap_l2_to_l3(pde, va);
		if (ADDR_IS_USER(va) && mpte == NULL) {
			/* Hold the L3 PTP for the duration of the update. */
			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
			mpte->ref_count++;
		}
		goto havel3;
	} else if (pde != NULL && lvl == 1) {
		/* An L2 block covers "va"; demote it to get an L3 table. */
		l2 = pmap_l1_to_l2(pde, va);
		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
			l3 = &l3[pmap_l3_index(va)];
			if (ADDR_IS_USER(va)) {
				mpte = PTE_TO_VM_PAGE(pmap_load(l2));
				mpte->ref_count++;
			}
			goto havel3;
		}
		/* We need to allocate an L3 table. */
	}
	if (ADDR_IS_USER(va)) {
		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;

		/*
		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
		 * to handle the possibility that a superpage mapping for "va"
		 * was created while we slept.
		 */
		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
		    nosleep ? NULL : &lock);
		if (mpte == NULL && nosleep) {
			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
			rv = KERN_RESOURCE_SHORTAGE;
			goto out;
		}
		goto retry;
	} else
		panic("pmap_enter: missing L3 table for kernel va %#lx", va);

havel3:
	orig_l3 = pmap_load(l3);
	opa = PTE_TO_PHYS(orig_l3);
	pv = NULL;
	new_l3 |= pmap_pte_bti(pmap, va);

	/*
	 * Is the specified virtual address already mapped?
	 */
	if (pmap_l3_valid(orig_l3)) {
		/*
		 * Wiring change, just update stats.  We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them.  Hence, if a user page is
		 * wired, the PT page will be also.
		 */
		if ((flags & PMAP_ENTER_WIRED) != 0 &&
		    (orig_l3 & ATTR_SW_WIRED) == 0)
			pmap->pm_stats.wired_count++;
		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
		    (orig_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count--;

		/*
		 * Remove the extra PT page reference.
		 */
		if (mpte != NULL) {
			mpte->ref_count--;
			KASSERT(mpte->ref_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			    " va: 0x%lx", va));
		}

		/*
		 * Has the physical page changed?
		 */
		if (opa == pa) {
			/*
			 * No, might be a protection or wiring change.
			 */
			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
			    (new_l3 & ATTR_SW_DBM) != 0)
				vm_page_aflag_set(m, PGA_WRITEABLE);
			goto validate;
		}

		/*
		 * The physical page has changed.  Temporarily invalidate
		 * the mapping.  Break any contiguous (L3C) run first, since
		 * only a whole run may carry ATTR_CONTIGUOUS.
		 */
		if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
			(void)pmap_demote_l3c(pmap, l3, va);
		orig_l3 = pmap_load_clear(l3);
		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
		    ("pmap_enter: unexpected pa update for %#lx", va));
		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
			om = PHYS_TO_VM_PAGE(opa);

			/*
			 * The pmap lock is sufficient to synchronize with
			 * concurrent calls to pmap_page_test_mappings() and
			 * pmap_ts_referenced().
			 */
			if (pmap_pte_dirty(pmap, orig_l3))
				vm_page_dirty(om);
			if ((orig_l3 & ATTR_AF) != 0) {
				pmap_invalidate_page(pmap, va, true);
				vm_page_aflag_set(om, PGA_REFERENCED);
			}
			CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
			pv = pmap_pvh_remove(&om->md, pmap, va);
			/* Reuse the PV entry for "m" unless "m" is
			 * unmanaged, in which case free it now. */
			if ((m->oflags & VPO_UNMANAGED) != 0)
				free_pv_entry(pmap, pv);
			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
			    TAILQ_EMPTY(&om->md.pv_list) &&
			    ((om->flags & PG_FICTITIOUS) != 0 ||
			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
				vm_page_aflag_clear(om, PGA_WRITEABLE);
		} else {
			KASSERT((orig_l3 & ATTR_AF) != 0,
			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
			pmap_invalidate_page(pmap, va, true);
		}
		orig_l3 = 0;
	} else {
		/*
		 * Increment the counters.
		 */
		if ((new_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count++;
		pmap_resident_count_inc(pmap, 1);
	}
	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		if (pv == NULL) {
			pv = get_pv_entry(pmap, &lock);
			pv->pv_va = va;
		}
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		if ((new_l3 & ATTR_SW_DBM) != 0)
			vm_page_aflag_set(m, PGA_WRITEABLE);
	}

validate:
	if (pmap->pm_stage == PM_STAGE1) {
		/*
		 * Sync icache if exec permission and attribute
		 * VM_MEMATTR_WRITE_BACK is set.  Do it now, before the
		 * mapping is stored and made valid for hardware table walk.
		 * If done later, then other can access this page before
		 * caches are properly synced.  Don't do it for kernel memory
		 * which is mapped with exec permission even if the memory
		 * isn't going to hold executable code.  The only time when
		 * icache sync is needed is after kernel module is loaded and
		 * the relocation info is processed.  And it's done in
		 * elf_cpu_load_file().
		 */
		if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
			PMAP_ASSERT_STAGE1(pmap);
			cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
			    PAGE_SIZE);
		}
	} else {
		cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
	}

	/*
	 * Update the L3 entry
	 */
	if (pmap_l3_valid(orig_l3)) {
		KASSERT(opa == pa, ("pmap_enter: invalid update"));
		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
			/* same PA, different attributes */
			if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
				(void)pmap_demote_l3c(pmap, l3, va);
			orig_l3 = pmap_load_store(l3, new_l3);
			pmap_invalidate_page(pmap, va, true);
			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
			    pmap_pte_dirty(pmap, orig_l3))
				vm_page_dirty(m);
		} else {
			/*
			 * orig_l3 == new_l3
			 * This can happens if multiple threads simultaneously
			 * access not yet mapped page. This bad for performance
			 * since this can cause full demotion-NOP-promotion
			 * cycle.
			 * Another possible reasons are:
			 * - VM and pmap memory layout are diverged
			 * - tlb flush is missing somewhere and CPU doesn't see
			 *   actual mapping.
			 */
			CTR4(KTR_PMAP, "%s: already mapped page - "
			    "pmap %p va 0x%#lx pte 0x%lx",
			    __func__, pmap, va, new_l3);
		}
	} else {
		/* New mapping */
		pmap_store(l3, new_l3);
		dsb(ishst);
	}

#if VM_NRESERVLEVEL > 0
	/*
	 * First, attempt L3C promotion, if the virtual and physical addresses
	 * are aligned with each other and an underlying reservation has the
	 * neighboring L3 pages allocated.  The first condition is simply an
	 * optimization that recognizes some eventual promotion failures early
	 * at a lower run-time cost.  Then, if both a level 1 reservation and
	 * the PTP are fully populated, attempt L2 promotion.
	 */
	if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
	    (m->flags & PG_FICTITIOUS) == 0 &&
	    (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
	    pmap_promote_l3c(pmap, l3, va) &&
	    full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG))
		(void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
#endif

	rv = KERN_SUCCESS;
out:
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * Tries to create a read- and/or execute-only L2 page mapping.  Returns
 * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
 * value.  See pmap_enter_l2() for the possible error values when "no sleep",
 * "no replace", and "no reclaim" are specified.
 */
static int
pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    struct rwlock **lockp)
{
	pd_entry_t new_l2;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	/* Build a read-only L2 block entry from the page's attributes. */
	new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | pmap_sh_attr |
	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
	    L2_BLOCK);
	if ((m->oflags & VPO_UNMANAGED) == 0)
		new_l2 |= ATTR_SW_MANAGED;
	else
		new_l2 |= ATTR_AF;
	if ((prot & VM_PROT_EXECUTE) == 0 ||
	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
		new_l2 |= ATTR_S1_XN;
	if (ADDR_IS_USER(va))
		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
	else
		new_l2 |= ATTR_S1_UXN;
	if (pmap != kernel_pmap)
		new_l2 |= ATTR_S1_nG;
	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
}

/*
 * Returns true if every page table entry in the specified page table is
 * zero.
 */
static bool
pmap_every_pte_zero(vm_paddr_t pa)
{
	pt_entry_t *pt_end, *pte;

	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
	/* Scan the PTP through the direct map. */
	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
		if (*pte != 0)
			return (false);
	}
	return (true);
}

/*
 * Tries to create the specified L2 page mapping.  Returns KERN_SUCCESS if
 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
 * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
 * within the L2 virtual address range starting at the specified virtual
 * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
 * L2 page mapping already exists at the specified virtual address.  Returns
 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
 * and a PV entry allocation failed.
 */
static int
pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
    vm_page_t m, struct rwlock **lockp)
{
	struct spglist free;
	pd_entry_t *l2, old_l2;
	vm_page_t l2pg, mt;
	vm_page_t uwptpg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
	    PMAP_ENTER_NORECLAIM,
	    ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE"));

	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
		    va, pmap);
		return (KERN_RESOURCE_SHORTAGE);
	}

	/*
	 * If bti is not the same for the whole l2 range, return failure
	 * and let vm_fault() cope.  Check after l2 allocation, since
	 * it could sleep.
	 */
	if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) {
		KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
		pmap_abort_ptp(pmap, va, l2pg);
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If there are existing mappings, either abort or remove them.
	 */
	if ((old_l2 = pmap_load(l2)) != 0) {
		KASSERT(l2pg == NULL || l2pg->ref_count > 1,
		    ("pmap_enter_l2: l2pg's ref count is too low"));
		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
			if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
				/* An L2 mapping already exists; undo the
				 * extra PTP reference and report it. */
				if (l2pg != NULL)
					l2pg->ref_count--;
				CTR2(KTR_PMAP,
				    "pmap_enter_l2: no space for va %#lx"
				    " in pmap %p", va, pmap);
				return (KERN_NO_SPACE);
			} else if (ADDR_IS_USER(va) ||
			    !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
				/* Base-page mappings exist in the range. */
				if (l2pg != NULL)
					l2pg->ref_count--;
				CTR2(KTR_PMAP,
				    "pmap_enter_l2: failure for va %#lx"
				    " in pmap %p", va, pmap);
				return (KERN_FAILURE);
			}
		}
		SLIST_INIT(&free);
		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
			(void)pmap_remove_l2(pmap, l2, va,
			    pmap_load(pmap_l1(pmap, va)), false, &free, lockp);
		} else {
			if (ADDR_IS_KERNEL(va)) {
				/*
				 * Try to save the ptp in the trie
				 * before any changes to mappings are
				 * made.  Abort on failure.
				 */
				mt = PTE_TO_VM_PAGE(old_l2);
				if (pmap_insert_pt_page(pmap, mt, false,
				    false)) {
					CTR1(KTR_PMAP,
			    "pmap_enter_l2: cannot ins kern ptp va %#lx",
					    va);
					return (KERN_RESOURCE_SHORTAGE);
				}
				/*
				 * Both pmap_remove_l2() and
				 * pmap_remove_l3_range() will zero fill
				 * the L3 kernel page table page.
				 */
			}
			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
			    &free, lockp);
			if (ADDR_IS_KERNEL(va)) {
				/*
				 * The TLB could have an intermediate
				 * entry for the L3 kernel page table
				 * page, so request an invalidation at
				 * all levels after clearing the
				 * L2_TABLE entry.
				 */
				pmap_clear(l2);
				pmap_s1_invalidate_page(pmap, va, false);
			}
		}
		KASSERT(pmap_load(l2) == 0,
		    ("pmap_enter_l2: non-zero L2 entry %p", l2));
		if (ADDR_IS_USER(va)) {
			vm_page_free_pages_toq(&free, true);
		} else {
			KASSERT(SLIST_EMPTY(&free),
			    ("pmap_enter_l2: freed kernel page table page"));
		}
	}

	/*
	 * Allocate leaf ptpage for wired userspace pages.
	 */
	uwptpg = NULL;
	if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
		uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
		if (uwptpg == NULL) {
			pmap_abort_ptp(pmap, va, l2pg);
			return (KERN_RESOURCE_SHORTAGE);
		}
		uwptpg->pindex = pmap_l2_pindex(va);
		if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
			vm_page_unwire_noq(uwptpg);
			vm_page_free(uwptpg);
			pmap_abort_ptp(pmap, va, l2pg);
			return (KERN_RESOURCE_SHORTAGE);
		}
		pmap_resident_count_inc(pmap, 1);
		uwptpg->ref_count = NL3PG;
	}
	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
		/*
		 * Abort this mapping if its PV entry could not be created.
		 */
		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
			if (l2pg != NULL)
				pmap_abort_ptp(pmap, va, l2pg);
			else {
				KASSERT(ADDR_IS_KERNEL(va) &&
				    (pmap_load(l2) & ATTR_DESCR_MASK) ==
				    L2_TABLE,
				    ("pmap_enter_l2: invalid kernel L2E"));
				mt = pmap_remove_pt_page(pmap, va);
				KASSERT(mt != NULL,
				    ("pmap_enter_l2: missing kernel PTP"));
			}
			if (uwptpg != NULL) {
				/* Undo the wired-PTP insertion above. */
				mt = pmap_remove_pt_page(pmap, va);
				KASSERT(mt == uwptpg,
				    ("removed pt page %p, expected %p", mt,
				    uwptpg));
				pmap_resident_count_dec(pmap, 1);
				uwptpg->ref_count = 1;
				vm_page_unwire_noq(uwptpg);
				vm_page_free(uwptpg);
			}
			CTR2(KTR_PMAP,
			    "pmap_enter_l2: failure for va %#lx in pmap %p",
			    va, pmap);
			return (KERN_RESOURCE_SHORTAGE);
		}
		if ((new_l2 & ATTR_SW_DBM) != 0)
			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
				vm_page_aflag_set(mt, PGA_WRITEABLE);
	}

	/*
	 * Increment counters.
	 */
	if ((new_l2 & ATTR_SW_WIRED) != 0)
		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;

	/*
	 * Conditionally sync the icache.  See pmap_enter() for details.
	 */
	if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
	    PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
	    pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
		cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
		    L2_SIZE);
	}

	/*
	 * Map the superpage.
	 */
	pmap_store(l2, new_l2);
	dsb(ishst);

	counter_u64_add(pmap_l2_mappings, 1);
	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
	    va, pmap);

	return (KERN_SUCCESS);
}

/*
 * Tries to create a read- and/or execute-only L3C page mapping.  Returns
 * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
 * value.
 */
static int
pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
    vm_prot_t prot, struct rwlock **lockp)
{
	pt_entry_t l3e;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	/* Build a read-only L3 PTE with ATTR_CONTIGUOUS set. */
	l3e = VM_PAGE_TO_PTE(m) | pmap_sh_attr |
	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
	    ATTR_CONTIGUOUS | L3_PAGE;
	if ((m->oflags & VPO_UNMANAGED) == 0)
		l3e |= ATTR_SW_MANAGED;
	else
		l3e |= ATTR_AF;
	if ((prot & VM_PROT_EXECUTE) == 0 ||
	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
		l3e |= ATTR_S1_XN;
	if (ADDR_IS_USER(va))
		l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
	else
		l3e |= ATTR_S1_UXN;
	if (pmap != kernel_pmap)
		l3e |= ATTR_S1_nG;
	return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
}

/*
 * Tries to create the specified L3C page mapping, i.e., a run of
 * L3C_ENTRIES contiguous L3 PTEs with ATTR_CONTIGUOUS set.  "*ml3p"
 * caches the L3 page table page across calls; it is updated here and its
 * ref count is adjusted to cover all L3C_ENTRIES entries.
 *
 * Returns KERN_SUCCESS if the mapping was created, KERN_FAILURE if an
 * existing mapping conflicts and PMAP_ENTER_NOREPLACE was specified (or a
 * PTP allocation failed with PMAP_ENTER_NOSLEEP), KERN_PROTECTION_FAILURE
 * if the BTI attributes are not uniform across the range, and
 * KERN_RESOURCE_SHORTAGE if a PV entry allocation failed.
 */
static int
pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
    vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
{
	pd_entry_t *l2p, *pde;
	pt_entry_t *l3p, *tl3p;
	vm_page_t mt;
	vm_paddr_t pa;
	vm_pindex_t l2pindex;
	int lvl;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((va & L3C_OFFSET) == 0,
	    ("pmap_enter_l3c: va is not aligned"));
	KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
	    ("pmap_enter_l3c: managed mapping within the clean submap"));
	KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
	    ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS"));

	/*
	 * If the L3 PTP is not resident, we attempt to create it here.
	 */
	if (ADDR_IS_USER(va)) {
		/*
		 * Were we given the correct L3 PTP?  If so, we can simply
		 * increment its ref count.
		 */
		l2pindex = pmap_l2_pindex(va);
		if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
			(*ml3p)->ref_count += L3C_ENTRIES;
		} else {
retry:
			/*
			 * Get the L2 entry.
			 */
			pde = pmap_pde(pmap, va, &lvl);

			/*
			 * If the L2 entry is a superpage, we either abort or
			 * demote depending on the given flags.
			 */
			if (lvl == 1) {
				l2p = pmap_l1_to_l2(pde, va);
				if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
				    L2_BLOCK) {
					if ((flags & PMAP_ENTER_NOREPLACE) != 0)
						return (KERN_FAILURE);
					l3p = pmap_demote_l2_locked(pmap, l2p,
					    va, lockp);
					if (l3p != NULL) {
						*ml3p = PTE_TO_VM_PAGE(
						    pmap_load(l2p));
						(*ml3p)->ref_count +=
						    L3C_ENTRIES;
						goto have_l3p;
					}
				}
				/* We need to allocate an L3 PTP. */
			}

			/*
			 * If the L3 PTP is mapped, we just increment its ref
			 * count.  Otherwise, we attempt to allocate it.
			 */
			if (lvl == 2 && pmap_load(pde) != 0) {
				*ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
				(*ml3p)->ref_count += L3C_ENTRIES;
			} else {
				*ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
				    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
				if (*ml3p == NULL) {
					if ((flags & PMAP_ENTER_NOSLEEP) != 0)
						return (KERN_FAILURE);

					/*
					 * The page table may have changed
					 * while we slept.
					 */
					goto retry;
				}
				/* _pmap_alloc_l3() added one reference. */
				(*ml3p)->ref_count += L3C_ENTRIES - 1;
			}
		}
		l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
	} else {
		/* Kernel PTPs are preallocated and never refcounted. */
		*ml3p = NULL;

		/*
		 * If the L2 entry is a superpage, we either abort or demote
		 * depending on the given flags.
		 */
		pde = pmap_pde(kernel_pmap, va, &lvl);
		if (lvl == 1) {
			l2p = pmap_l1_to_l2(pde, va);
			KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
			    ("pmap_enter_l3c: missing L2 block"));
			if ((flags & PMAP_ENTER_NOREPLACE) != 0)
				return (KERN_FAILURE);
			l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
		} else {
			KASSERT(lvl == 2,
			    ("pmap_enter_l3c: Invalid level %d", lvl));
			l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
			    pmap_load(pde)));
		}
	}
have_l3p:
	l3p = &l3p[pmap_l3_index(va)];

	/*
	 * If bti is not the same for the whole L3C range, return failure
	 * and let vm_fault() cope.  Check after L3 allocation, since
	 * it could sleep.
	 */
	if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) {
		KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP"));
		(*ml3p)->ref_count -= L3C_ENTRIES - 1;
		pmap_abort_ptp(pmap, va, *ml3p);
		*ml3p = NULL;
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If there are existing mappings, either abort or remove them.
	 */
	if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
		for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
			if (pmap_load(tl3p) != 0) {
				if (*ml3p != NULL)
					(*ml3p)->ref_count -= L3C_ENTRIES;
				return (KERN_FAILURE);
			}
		}
	} else {
		/*
		 * Because we increment the L3 page's reference count above,
		 * it is guaranteed not to be freed here and we can pass NULL
		 * instead of a valid free list.
		 */
		pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
		    va + L3C_SIZE, NULL, lockp);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((l3e & ATTR_SW_MANAGED) != 0) {
		if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
			if (*ml3p != NULL) {
				(*ml3p)->ref_count -= L3C_ENTRIES - 1;
				pmap_abort_ptp(pmap, va, *ml3p);
				*ml3p = NULL;
			}
			return (KERN_RESOURCE_SHORTAGE);
		}
		if ((l3e & ATTR_SW_DBM) != 0)
			for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
				vm_page_aflag_set(mt, PGA_WRITEABLE);
	}

	/*
	 * Increment counters.
	 */
	if ((l3e & ATTR_SW_WIRED) != 0)
		pmap->pm_stats.wired_count += L3C_ENTRIES;
	pmap_resident_count_inc(pmap, L3C_ENTRIES);

	pa = VM_PAGE_TO_PHYS(m);
	KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));

	/*
	 * Sync the icache before the mapping is stored.
	 */
	if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);

	/*
	 * Map the superpage.
	 */
	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
		pmap_store(tl3p, l3e);
		l3e += L3_SIZE;
	}
	dsb(ishst);

	counter_u64_add(pmap_l3c_mappings, 1);
	CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
	    va, pmap);
	return (KERN_SUCCESS);
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.
Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
 */
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
	struct pctrie_iter pages;
	struct rwlock *lock;
	vm_offset_t va;
	vm_page_t m, mpte;
	int rv;

	VM_OBJECT_ASSERT_LOCKED(m_start->object);

	mpte = NULL;
	/*
	 * Bound the radix iterator so it cannot walk past the object page
	 * backing the last virtual page below "end".
	 */
	vm_page_iter_limit_init(&pages, m_start->object,
	    m_start->pindex + atop(end - start));
	m = vm_radix_iter_lookup(&pages, m_start->pindex);
	lock = NULL;
	PMAP_LOCK(pmap);
	while (m != NULL) {
		va = start + ptoa(m->pindex - m_start->pindex);
		/*
		 * Prefer the largest mapping that fits: first an L2 (2MB)
		 * block, then an L3C (contiguous 64KB) mapping, and finally
		 * a single 4KB page.  KERN_NO_SPACE also advances past the
		 * superpage-sized run, since no smaller mapping is wanted
		 * there either.
		 */
		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
		    m->psind == 2 && pmap_ps_enabled(pmap) &&
		    ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
		    KERN_SUCCESS || rv == KERN_NO_SPACE)) {
			m = vm_radix_iter_jump(&pages, L2_SIZE / PAGE_SIZE);
		} else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
		    m->psind >= 1 && pmap_ps_enabled(pmap) &&
		    ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
		    &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) {
			m = vm_radix_iter_jump(&pages, L3C_ENTRIES);
		} else {
			/*
			 * In general, if a superpage mapping were possible,
			 * it would have been created above.  That said, if
			 * start and end are not superpage aligned, then
			 * promotion might be possible at the ends of [start,
			 * end).  However, in practice, those promotion
			 * attempts are so unlikely to succeed that they are
			 * not worth trying.
			 */
			mpte = pmap_enter_quick_locked(pmap, va, m, prot |
			    VM_PROT_NO_PROMOTE, mpte, &lock);
			m = vm_radix_iter_step(&pages);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}

/*
 * this code makes some *MAJOR* assumptions:
 * 1.
Current pmap & pmap exists.
 * 2. Not wired.
 * 3. Read access.
 * 4. No page table pages.
 * but is *MUCH* faster than pmap_enter...
 */

void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	struct rwlock *lock;

	lock = NULL;
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}

/*
 * Create a read-only (see the ATTR_S1_AP_RO below) 4KB mapping for the
 * given page without sleeping.  "mpte" is a hint naming the PTP that may
 * already cover "va"; the (possibly different) PTP used is returned so the
 * caller can pass it back on the next call.  Returns NULL on any failure
 * (existing mapping, PTP or PV entry allocation failure) — failure here is
 * acceptable because this path is merely an optimization.
 */
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
	pt_entry_t *l1, *l2, *l3, l3_val;
	vm_paddr_t pa;
	int full_lvl, lvl;

	KASSERT(!VA_IS_CLEANMAP(va) ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	l2 = NULL;

	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (ADDR_IS_USER(va)) {
		vm_pindex_t l2pindex;

		/*
		 * Calculate pagetable page index
		 */
		l2pindex = pmap_l2_pindex(va);
		if (mpte && (mpte->pindex == l2pindex)) {
			/* The hinted PTP covers "va"; just take a ref. */
			mpte->ref_count++;
		} else {
			/*
			 * If the page table page is mapped, we just increment
			 * the hold count, and activate it.  Otherwise, we
			 * attempt to allocate a page table page, passing NULL
			 * instead of the PV list lock pointer because we don't
			 * intend to sleep.  If this attempt fails, we don't
			 * retry.  Instead, we give up.
			 */
			l1 = pmap_l1(pmap, va);
			if (l1 != NULL && pmap_load(l1) != 0) {
				/* A 1GB block leaves no room for a 4KB page. */
				if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
				    L1_BLOCK)
					return (NULL);
				l2 = pmap_l1_to_l2(l1, va);
				if (pmap_load(l2) != 0) {
					/* Likewise for a 2MB block. */
					if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
					    L2_BLOCK)
						return (NULL);
					mpte = PTE_TO_VM_PAGE(pmap_load(l2));
					mpte->ref_count++;
				} else {
					mpte = _pmap_alloc_l3(pmap, l2pindex,
					    NULL);
					if (mpte == NULL)
						return (mpte);
				}
			} else {
				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
				if (mpte == NULL)
					return (mpte);
			}
		}
		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
		l3 = &l3[pmap_l3_index(va)];
	} else {
		/* Kernel PTPs are preallocated and never reference counted. */
		mpte = NULL;
		l2 = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(l2 != NULL,
		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
		    va));
		KASSERT(lvl == 2,
		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
		l3 = pmap_l2_to_l3(l2, va);
	}

	/*
	 * Abort if a mapping already exists.
	 */
	if (pmap_load(l3) != 0) {
		if (mpte != NULL)
			mpte->ref_count--;
		return (NULL);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0 &&
	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
		if (mpte != NULL)
			pmap_abort_ptp(pmap, va, mpte);
		return (NULL);
	}

	/*
	 * Increment counters
	 */
	pmap_resident_count_inc(pmap, 1);

	pa = VM_PAGE_TO_PHYS(m);
	l3_val = PHYS_TO_PTE(pa) | pmap_sh_attr |
	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
	l3_val |= pmap_pte_bti(pmap, va);
	if ((prot & VM_PROT_EXECUTE) == 0 ||
	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
		l3_val |= ATTR_S1_XN;
	if (ADDR_IS_USER(va))
		l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
	else
		l3_val |= ATTR_S1_UXN;
	if (pmap != kernel_pmap)
		l3_val |= ATTR_S1_nG;

	/*
	 * Now validate mapping with RO protection
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0)
		l3_val |= ATTR_SW_MANAGED;
	else
		l3_val |= ATTR_AF;

	/* Sync icache before the mapping is stored to PTE */
	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);

	pmap_store(l3, l3_val);
	dsb(ishst);

#if VM_NRESERVLEVEL > 0
	/*
	 * First, attempt L3C promotion, if the virtual and physical addresses
	 * are aligned with each other and an underlying reservation has the
	 * neighboring L3 pages allocated.  The first condition is simply an
	 * optimization that recognizes some eventual promotion failures early
	 * at a lower run-time cost.  Then, attempt L2 promotion, if both a
	 * level 1 reservation and the PTP are fully populated.
	 */
	if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
	    (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
	    (m->flags & PG_FICTITIOUS) == 0 &&
	    (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
	    pmap_promote_l3c(pmap, l3, va) &&
	    full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) {
		if (l2 == NULL)
			l2 = pmap_l2(pmap, va);

		/*
		 * If promotion succeeds, then the next call to this function
		 * should not be given the unmapped PTP as a hint.
		 */
		if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
			mpte = NULL;
	}
#endif

	return (mpte);
}

/*
 * This code maps large physical mmap regions into the
 * processor address space.  Note that some shortcuts
 * are taken, but the code works.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{
	/* Nothing to pre-map on arm64; only assert the expected usage. */
	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
	    ("pmap_object_init_pt: non-device object"));
}

/*
 * Clear the wired attribute from the mappings for the specified range of
 * addresses in the given pmap.  Every valid mapping within that range
 * must have the wired attribute set.  In contrast, invalid mappings
 * cannot have the wired attribute set, so they are ignored.
 *
 * The wired attribute of the page table entry is not a hardware feature,
 * so there is no need to invalidate any TLB entries.
 */
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t *l3;
	bool partial_l3c;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			/* Skip the whole empty L0 region; guard wraparound. */
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		l1 = pmap_l0_to_l1(l0, sva);
		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		if (pmap_load(l1) == 0)
			continue;

		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/*
			 * 1GB blocks are only created for wired, unmanaged
			 * mappings, so they must be unwired in their entirety.
			 */
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS(pmap != kernel_pmap);
			MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
			    ATTR_SW_WIRED)) == ATTR_SW_WIRED);
			pmap_clear_bits(l1, ATTR_SW_WIRED);
			pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (pmap_load(l2) == 0)
			continue;

		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
				panic("pmap_unwire: l2 %#jx is missing "
				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));

			/*
			 * Are we unwiring the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_clear_bits(l2, ATTR_SW_WIRED);
				pmap->pm_stats.wired_count -= L2_SIZE /
				    PAGE_SIZE;
				continue;
			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
				panic("pmap_unwire: demotion failed");
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_unwire: Invalid l2 entry after demotion"));

		if (va_next > eva)
			va_next = eva;
		for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
		    sva != va_next; l3++, sva += L3_SIZE) {
			if (pmap_load(l3) == 0)
				continue;
			if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
				/*
				 * Avoid demotion for whole-page unwiring.
				 */
				if ((sva & L3C_OFFSET) == 0) {
					/*
					 * Handle the possibility that
					 * "va_next" is zero because of
					 * address wraparound.
					 */
					partial_l3c = sva + L3C_OFFSET >
					    va_next - 1;
				}
				if (partial_l3c)
					(void)pmap_demote_l3c(pmap, l3, sva);
			}
			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
				panic("pmap_unwire: l3 %#jx is missing "
				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));

			/*
			 * ATTR_SW_WIRED must be cleared atomically.  Although
			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
			 * the System MMU may write to the entry concurrently.
			 */
			pmap_clear_bits(l3, ATTR_SW_WIRED);
			pmap->pm_stats.wired_count--;
		}
	}
	PMAP_UNLOCK(pmap);
}

/*
 * This function requires that the caller has already added one to ml3's
 * ref_count in anticipation of creating a 4KB page mapping.
 */
static bool
pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
    vm_page_t ml3, struct rwlock **lockp)
{
	pt_entry_t *tl3p;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((va & L3C_OFFSET) == 0,
	    ("pmap_copy_l3c: va is not aligned"));
	KASSERT((l3e & ATTR_SW_MANAGED) != 0,
	    ("pmap_copy_l3c: l3e is not managed"));

	/*
	 * Abort if a mapping already exists.
	 */
	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
		if (pmap_load(tl3p) != 0) {
			/* Give back the reference taken by the caller. */
			if (ml3 != NULL)
				ml3->ref_count--;
			return (false);
		}

	if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
		if (ml3 != NULL)
			pmap_abort_ptp(pmap, va, ml3);
		return (false);
	}
	/* One reference per constituent PTE; the caller already added one. */
	ml3->ref_count += L3C_ENTRIES - 1;

	/*
	 * Clear the wired and accessed bits.  However, leave the dirty bit
	 * unchanged because read/write superpage mappings are required to be
	 * dirty.
	 */
	l3e &= ~(ATTR_SW_WIRED | ATTR_AF);

	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
		pmap_store(tl3p, l3e);
		l3e += L3_SIZE;
	}
	pmap_resident_count_inc(pmap, L3C_ENTRIES);
	counter_u64_add(pmap_l3c_mappings, 1);
	CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
	    va, pmap);
	return (true);
}

/*
 * Copy the range specified by src_addr/len
 * from the source map to the range dst_addr/len
 * in the destination map.
 *
 * This routine is only advisory and need not do anything.
 *
 * Because the executable mappings created by this routine are copied,
 * it should not have to flush the instruction cache.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
	struct rwlock *lock;
	pd_entry_t *l0, *l1, *l2, srcptepaddr;
	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
	vm_offset_t addr, end_addr, va_next;
	vm_page_t dst_m, dstmpte, srcmpte;

	PMAP_ASSERT_STAGE1(dst_pmap);
	PMAP_ASSERT_STAGE1(src_pmap);

	/* Only same-address copies (i.e., fork) are supported. */
	if (dst_addr != src_addr)
		return;
	end_addr = src_addr + len;
	lock = NULL;
	/* Lock both pmaps in a canonical order to avoid deadlock. */
	if (dst_pmap < src_pmap) {
		PMAP_LOCK(dst_pmap);
		PMAP_LOCK(src_pmap);
	} else {
		PMAP_LOCK(src_pmap);
		PMAP_LOCK(dst_pmap);
	}
	for (addr = src_addr; addr < end_addr; addr = va_next) {
		l0 = pmap_l0(src_pmap, addr);
		if (pmap_load(l0) == 0) {
			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
			if (va_next < addr)
				va_next = end_addr;
			continue;
		}

		va_next = (addr + L1_SIZE) & ~L1_OFFSET;
		if (va_next < addr)
			va_next = end_addr;
		l1 = pmap_l0_to_l1(l0, addr);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/* Share the source's 1GB block with the destination. */
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= end_addr,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
			    pmap_load(l1), addr, end_addr, va_next));
			srcptepaddr = pmap_load(l1);
			l1 = pmap_l1(dst_pmap, addr);
			if (l1 == NULL) {
				if (_pmap_alloc_l3(dst_pmap,
				    pmap_l0_pindex(addr), NULL) == NULL)
					break;
				l1 = pmap_l1(dst_pmap, addr);
			} else {
				/* The L0 page gains another L1 user. */
				l0 = pmap_l0(dst_pmap, addr);
				dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
				dst_m->ref_count++;
			}
			KASSERT(pmap_load(l1) == 0,
			    ("1G mapping present in dst pmap "
			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
			    pmap_load(l1), addr, end_addr, va_next));
			pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
			pmap_resident_count_inc(dst_pmap, L1_SIZE /
			    PAGE_SIZE);
			continue;
		}

		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
		if (va_next < addr)
			va_next = end_addr;
		l2 = pmap_l1_to_l2(l1, addr);
		srcptepaddr = pmap_load(l2);
		if (srcptepaddr == 0)
			continue;
		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * We can only virtual copy whole superpages.
			 */
			if ((addr & L2_OFFSET) != 0 ||
			    addr + L2_SIZE > end_addr)
				continue;
			l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
			if (l2 == NULL)
				break;
			if (pmap_load(l2) == 0 &&
			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
			    PMAP_ENTER_NORECLAIM, &lock))) {
				/*
				 * We leave the dirty bit unchanged because
				 * managed read/write superpage mappings are
				 * required to be dirty.  However, managed
				 * superpage mappings are not required to
				 * have their accessed bit set, so we clear
				 * it because we don't know if this mapping
				 * will be used.
				 */
				srcptepaddr &= ~ATTR_SW_WIRED;
				if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
					srcptepaddr &= ~ATTR_AF;
				pmap_store(l2, srcptepaddr);
				pmap_resident_count_inc(dst_pmap, L2_SIZE /
				    PAGE_SIZE);
				counter_u64_add(pmap_l2_mappings, 1);
			} else
				pmap_abort_ptp(dst_pmap, addr, dst_m);
			continue;
		}
		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_copy: invalid L2 entry"));
		srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
		KASSERT(srcmpte->ref_count > 0,
		    ("pmap_copy: source page table page is unused"));
		if (va_next > end_addr)
			va_next = end_addr;
		src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
		src_pte = &src_pte[pmap_l3_index(addr)];
		dstmpte = NULL;
		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
			ptetemp = pmap_load(src_pte);

			/*
			 * We only virtual copy managed pages.
			 */
			if ((ptetemp & ATTR_SW_MANAGED) == 0)
				continue;

			if (dstmpte != NULL) {
				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
				    ("dstmpte pindex/addr mismatch"));
				dstmpte->ref_count++;
			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
			    NULL)) == NULL)
				goto out;
			dst_pte = (pt_entry_t *)
			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
			dst_pte = &dst_pte[pmap_l3_index(addr)];
			/* Copy a whole L3C superpage at once when possible. */
			if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
			    L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
			    va_next - 1) {
				if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
				    ptetemp, dstmpte, &lock))
					goto out;
				addr += L3C_SIZE - PAGE_SIZE;
				src_pte += L3C_ENTRIES - 1;
			} else if (pmap_load(dst_pte) == 0 &&
			    pmap_try_insert_pv_entry(dst_pmap, addr,
			    PTE_TO_VM_PAGE(ptetemp), &lock)) {
				/*
				 * Clear the wired, contiguous, modified, and
				 * accessed bits from the destination PTE.
				 * The contiguous bit is cleared because we
				 * are not copying the entire L3C superpage.
				 */
				mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
				    ATTR_AF;
				nbits = 0;
				if ((ptetemp & ATTR_SW_DBM) != 0)
					nbits |= ATTR_S1_AP_RW_BIT;
				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
				pmap_resident_count_inc(dst_pmap, 1);
			} else {
				pmap_abort_ptp(dst_pmap, addr, dstmpte);
				goto out;
			}
			/* Have we copied all of the valid mappings? */
			if (dstmpte->ref_count >= srcmpte->ref_count)
				break;
		}
	}
out:
	/*
	 * XXX This barrier may not be needed because the destination pmap is
	 * not active.
	 */
	dsb(ishst);

	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(src_pmap);
	PMAP_UNLOCK(dst_pmap);
}

int
pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
{
	int error;

	if (dst_pmap->pm_stage != src_pmap->pm_stage)
		return (EINVAL);

	/* Only stage 1 pmaps with BTI state need anything copied. */
	if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
		return (0);

	for (;;) {
		/* Lock both pmaps in a canonical order to avoid deadlock. */
		if (dst_pmap < src_pmap) {
			PMAP_LOCK(dst_pmap);
			PMAP_LOCK(src_pmap);
		} else {
			PMAP_LOCK(src_pmap);
			PMAP_LOCK(dst_pmap);
		}
		error = pmap_bti_copy(dst_pmap, src_pmap);
		/* Clean up partial copy on failure due to no memory. */
		if (error == ENOMEM)
			pmap_bti_deassign_all(dst_pmap);
		PMAP_UNLOCK(src_pmap);
		PMAP_UNLOCK(dst_pmap);
		if (error != ENOMEM)
			break;
		/* Wait for memory and retry from scratch. */
		vm_wait(NULL);
	}
	return (error);
}

/*
 * pmap_zero_page zeros the specified hardware page by mapping
 * the page into KVM and using bzero to clear its contents.
 */
void
pmap_zero_page(vm_page_t m)
{
	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

	pagezero((void *)va);
}

/*
 * pmap_zero_page_area zeros the specified hardware page by mapping
 * the page into KVM and using bzero to clear its contents.
 *
 * off and size may not cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

	if (off == 0 && size == PAGE_SIZE)
		pagezero((void *)va);
	else
		bzero((char *)va + off, size);
}

/*
 * pmap_copy_page copies the specified (machine independent)
 * page by mapping the page into virtual memory and using
 * bcopy to copy the page, one machine dependent page at a
 * time.
6877 */ 6878 void 6879 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 6880 { 6881 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 6882 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 6883 6884 pagecopy((void *)src, (void *)dst); 6885 } 6886 6887 int unmapped_buf_allowed = 1; 6888 6889 void 6890 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 6891 vm_offset_t b_offset, int xfersize) 6892 { 6893 void *a_cp, *b_cp; 6894 vm_page_t m_a, m_b; 6895 vm_paddr_t p_a, p_b; 6896 vm_offset_t a_pg_offset, b_pg_offset; 6897 int cnt; 6898 6899 while (xfersize > 0) { 6900 a_pg_offset = a_offset & PAGE_MASK; 6901 m_a = ma[a_offset >> PAGE_SHIFT]; 6902 p_a = m_a->phys_addr; 6903 b_pg_offset = b_offset & PAGE_MASK; 6904 m_b = mb[b_offset >> PAGE_SHIFT]; 6905 p_b = m_b->phys_addr; 6906 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 6907 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 6908 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 6909 panic("!DMAP a %lx", p_a); 6910 } else { 6911 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 6912 } 6913 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 6914 panic("!DMAP b %lx", p_b); 6915 } else { 6916 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 6917 } 6918 bcopy(a_cp, b_cp, cnt); 6919 a_offset += cnt; 6920 b_offset += cnt; 6921 xfersize -= cnt; 6922 } 6923 } 6924 6925 vm_offset_t 6926 pmap_quick_enter_page(vm_page_t m) 6927 { 6928 6929 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 6930 } 6931 6932 void 6933 pmap_quick_remove_page(vm_offset_t addr) 6934 { 6935 } 6936 6937 /* 6938 * Returns true if the pmap's pv is one of the first 6939 * 16 pvs linked to from this page. This count may 6940 * be changed upwards or downwards in the future; it 6941 * is only necessary that true be returned for a small 6942 * subset of pmaps for proper page aging. 
 */
bool
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pv_entry_t pv;
	int loops = 0;
	bool rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_page_exists_quick: page %p is not managed", m));
	rv = false;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
	/* Scan the page's own (4KB) PV list first. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		if (PV_PMAP(pv) == pmap) {
			rv = true;
			break;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	/*
	 * If not found within the first 16 entries, also scan the
	 * superpage PV list, continuing the same 16-entry budget.
	 */
	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = page_to_pvh(m);
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			if (PV_PMAP(pv) == pmap) {
				rv = true;
				break;
			}
			loops++;
			if (loops >= 16)
				break;
		}
	}
	rw_runlock(lock);
	return (rv);
}

/*
 * pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
	struct rwlock *lock;
	struct md_page *pvh;
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;
	int count, md_gen, pvh_gen;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (0);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	count = 0;
	/* Count wired 4KB mappings. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the PV list lock to take the pmap lock in the
			 * correct order; restart if the list changed meanwhile.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
			count++;
		PMAP_UNLOCK(pmap);
	}
	/* Count wired 2MB (L2 block) mappings of the containing superpage. */
	if ((m->flags & PG_FICTITIOUS) == 0) {
		pvh = page_to_pvh(m);
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			if (!PMAP_TRYLOCK(pmap)) {
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
			if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
				count++;
			PMAP_UNLOCK(pmap);
		}
	}
	rw_runlock(lock);
	return (count);
}

/*
 * Returns true if the given page is mapped individually or as part of
 * a 2mpage.  Otherwise, returns false.
7051 */ 7052 bool 7053 pmap_page_is_mapped(vm_page_t m) 7054 { 7055 struct rwlock *lock; 7056 bool rv; 7057 7058 if ((m->oflags & VPO_UNMANAGED) != 0) 7059 return (false); 7060 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7061 rw_rlock(lock); 7062 rv = !TAILQ_EMPTY(&m->md.pv_list) || 7063 ((m->flags & PG_FICTITIOUS) == 0 && 7064 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); 7065 rw_runlock(lock); 7066 return (rv); 7067 } 7068 7069 /* 7070 * Destroy all managed, non-wired mappings in the given user-space 7071 * pmap. This pmap cannot be active on any processor besides the 7072 * caller. 7073 * 7074 * This function cannot be applied to the kernel pmap. Moreover, it 7075 * is not intended for general use. It is only to be used during 7076 * process termination. Consequently, it can be implemented in ways 7077 * that make it faster than pmap_remove(). First, it can more quickly 7078 * destroy mappings by iterating over the pmap's collection of PV 7079 * entries, rather than searching the page table. Second, it doesn't 7080 * have to test and clear the page table entries atomically, because 7081 * no processor is currently accessing the user address space. In 7082 * particular, a page table entry's dirty bit won't change state once 7083 * this function starts. 
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;
	struct spglist free;
	struct pv_chunklist free_chunks[PMAP_MEMDOM];
	vm_page_t m, ml3, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct rwlock *lock;
	int64_t bit;
	uint64_t inuse, bitmask;
	int allfree, field, i, idx, lvl;
	int freed __pvused;
	vm_paddr_t pa;

	lock = NULL;

	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&free_chunks[i]);
	SLIST_INIT(&free);
	PMAP_LOCK(pmap);
	/* Walk the pmap's PV chunks rather than its page tables. */
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		allfree = 1;
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/* Each clear bit in pc_map marks a live PV entry. */
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = ffsl(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pde = pmap_pde(pmap, pv->pv_va, &lvl);
				KASSERT(pde != NULL,
				    ("Attempting to remove an unmapped page"));

				switch(lvl) {
				case 1:
					/* PV entry for a 2MB (L2) block. */
					pte = pmap_l1_to_l2(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L2_BLOCK,
					    ("Attempting to remove an invalid "
					    "block: %lx", tpte));
					break;
				case 2:
					/* PV entry for a 4KB (L3) page. */
					pte = pmap_l2_to_l3(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L3_PAGE,
					    ("Attempting to remove an invalid "
					    "page: %lx", tpte));
					break;
				default:
					panic(
					    "Invalid page directory level: %d",
					    lvl);
				}

				/*
				 * We cannot remove wired mappings at this time.
				 *
				 * For L3C superpages, all of the constituent PTEs
				 * should have the wired bit set, so we don't
				 * check for ATTR_CONTIGUOUS here.
				 */
				if (tpte & ATTR_SW_WIRED) {
					allfree = 0;
					continue;
				}

				/* Mark free */
				pc->pc_map[field] |= bitmask;

				/*
				 * Because this pmap is not active on other
				 * processors, the dirty bit cannot have
				 * changed state since we last loaded pte.
				 */
				pmap_clear(pte);

				pa = PTE_TO_PHYS(tpte);

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad pte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean/reference bits.
				 *
				 * We don't check for ATTR_CONTIGUOUS here
				 * because writeable L3C superpages are expected
				 * to be dirty, i.e., every constituent PTE
				 * should be dirty.
				 */
				if (pmap_pte_dirty(pmap, tpte)) {
					switch (lvl) {
					case 1:
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
						break;
					case 2:
						vm_page_dirty(m);
						break;
					}
				}

				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				/* Unlink the PV entry and update PGA_WRITEABLE. */
				switch (lvl) {
				case 1:
					pmap_resident_count_dec(pmap,
					    L2_SIZE / PAGE_SIZE);
					pvh = page_to_pvh(m);
					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
					pvh->pv_gen++;
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					/* Free the PTP that the block replaced. */
					ml3 = pmap_remove_pt_page(pmap,
					    pv->pv_va);
					if (ml3 != NULL) {
						KASSERT(vm_page_any_valid(ml3),
						    ("pmap_remove_pages: l3 page not promoted"));
						pmap_resident_count_dec(pmap, 1);
						KASSERT(ml3->ref_count == NL3PG,
						    ("pmap_remove_pages: l3 page ref count error"));
						ml3->ref_count = 0;
						pmap_add_delayed_free_list(ml3,
						    &free, false);
					}
					break;
				case 2:
					pmap_resident_count_dec(pmap, 1);
					TAILQ_REMOVE(&m->md.pv_list, pv,
					    pv_next);
					m->md.pv_gen++;
					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = page_to_pvh(m);
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m,
							    PGA_WRITEABLE);
					}
					break;
				}
				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
				    &free);
				freed++;
			}
		}
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		if (allfree) {
			/* Every PV entry in this chunk was freed. */
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
			    pc_list);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	pmap_invalidate_all(pmap);
	pmap_bti_deassign_all(pmap);
	free_pv_chunk_batch(free_chunks);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}

/*
 * This is used to check if a page has been accessed or modified.
 */
static bool
pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
{
	struct rwlock *lock;
	pv_entry_t pv;
	struct md_page *pvh;
	pt_entry_t l3e, mask, *pte, value;
	pmap_t pmap;
	int md_gen, pvh_gen;
	bool rv;

	rv = false;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	/* Check the page's 4KB mappings. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Take the pmap lock in the correct order; restart
			 * if the PV list changed while the lock was dropped.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		mask = 0;
		value = 0;
		if (modified) {
			mask |= ATTR_S1_AP_RW_BIT;
			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
		}
		if (accessed) {
			mask |= ATTR_AF | ATTR_DESCR_MASK;
			value |= ATTR_AF | L3_PAGE;
		}
		l3e = pmap_load(pte);
		/* An L3C mapping's attributes span all constituent PTEs. */
		if ((l3e & ATTR_CONTIGUOUS) != 0)
			l3e = pmap_load_l3c(pte);
		PMAP_UNLOCK(pmap);
		rv = (l3e & mask) == value;
		if (rv)
			goto out;
	}
	/* Check 2MB block mappings of the containing superpage. */
	if ((m->flags & PG_FICTITIOUS) == 0) {
		pvh = page_to_pvh(m);
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			PMAP_ASSERT_STAGE1(pmap);
			if (!PMAP_TRYLOCK(pmap)) {
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
			mask = 0;
			value = 0;
			if (modified) {
				mask |= ATTR_S1_AP_RW_BIT;
				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
			}
			if (accessed) {
				mask |= ATTR_AF | ATTR_DESCR_MASK;
				value |= ATTR_AF | L2_BLOCK;
			}
			rv = (pmap_load(pte) & mask) == value;
			PMAP_UNLOCK(pmap);
			if (rv)
				goto out;
		}
	}
out:
	rw_runlock(lock);
	return (rv);
}

/*
 * pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
 */
bool
pmap_is_modified(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_modified: page %p is not managed", m));

	/*
	 * If the page is not busied then this check is racy.
	 */
	if (!pmap_page_is_write_mapped(m))
		return (false);
	return (pmap_page_test_mappings(m, false, true));
}

/*
 * pmap_is_prefaultable:
 *
 *	Return whether or not the specified virtual address is eligible
 *	for prefault.
 */
bool
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	bool rv;
	int lvl;

	/*
	 * Return true if and only if the L3 entry for the specified virtual
	 * address is allocated but invalid.
	 */
	rv = false;
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, addr, &lvl);
	if (pde != NULL && lvl == 2) {
		pte = pmap_l2_to_l3(pde, addr);
		rv = pmap_load(pte) == 0;
	}
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * pmap_is_referenced:
 *
 *	Return whether or not the specified physical page was referenced
 *	in any physical maps.
 */
bool
pmap_is_referenced(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_referenced: page %p is not managed", m));
	return (pmap_page_test_mappings(m, true, false));
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pt_entry_t oldpte, *pte, set, clear, mask, val;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/*
	 * Demote any writeable 2MB mappings of the page to 4KB mappings, so
	 * that the second loop below can write protect the page at 4KB
	 * granularity.
	 */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the pv list lock to take the pmap lock, then
			 * use the generation count to detect concurrent pv
			 * list changes and retry.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		oldpte = pmap_load(pte);
		if ((oldpte & ATTR_SW_DBM) != 0) {
			if ((oldpte & ATTR_CONTIGUOUS) != 0) {
				(void)pmap_demote_l3c(pmap, pte, pv->pv_va);

				/*
				 * The L3 entry's accessed bit may have
				 * changed.
				 */
				oldpte = pmap_load(pte);
			}
			if (pmap->pm_stage == PM_STAGE1) {
				set = ATTR_S1_AP_RW_BIT;
				clear = 0;
				mask = ATTR_S1_AP_RW_BIT;
				val = ATTR_S1_AP(ATTR_S1_AP_RW);
			} else {
				set = 0;
				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
			}
			clear |= ATTR_SW_DBM;
			/*
			 * Atomically clear write access; retry if the entry
			 * changed underneath us (e.g., hardware DBM update).
			 */
			while (!atomic_fcmpset_64(pte, &oldpte,
			    (oldpte | set) & ~clear))
				cpu_spinwait();

			if ((oldpte & mask) == val)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, md_gen, not_cleared, pvh_gen;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	SLIST_INIT(&free);
	cleared = 0;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
retry:
	not_cleared = 0;
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte)) {
			/*
			 * Although "tpte" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((tpte & ATTR_AF) != 0) {
			pa = VM_PAGE_TO_PHYS(m);

			/*
			 * Since this reference bit is shared by 512 4KB pages,
			 * it should not be cleared every time it is tested.
			 * Apply a simple "hash" function on the physical page
			 * number, the virtual superpage number, and the pmap
			 * address to select one 4KB page out of the 512 on
			 * which testing the reference bit will result in
			 * clearing that reference bit.  This function is
			 * designed to avoid the selection of the same 4KB page
			 * for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, va, true);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		if ((tpte & ATTR_AF) != 0) {
			if ((tpte & ATTR_SW_WIRED) == 0) {
				/*
				 * Clear the accessed bit in this L3 entry
				 * regardless of the contiguous bit.
				 */
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, pv->pv_va, true);
				cleared++;
			} else
				not_cleared++;
		} else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
		    (pmap_load_l3c(pte) & ATTR_AF) != 0) {
			/*
			 * An L3C superpage mapping is regarded as accessed
			 * until the accessed bit has been cleared in all
			 * of its constituent entries.
			 */
			not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
	return (cleared + not_cleared);
}

/*
 * Apply the given advice to the specified range of addresses within the
 * given pmap.  Depending on the advice, clear the referenced and/or
 * modified flags in each mapping and set the mapped page's dirty field.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	struct rwlock *lock;
	vm_offset_t va, va_next, dva;
	vm_page_t m;
	pd_entry_t *l0, *l1, *l2, oldl2;
	pt_entry_t *l3, *dl3, oldl3;

	PMAP_ASSERT_STAGE1(pmap);

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l2 = pmap_l1_to_l2(l1, sva);
		oldl2 = pmap_load(l2);
		if (oldl2 == 0)
			continue;
		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
			if ((oldl2 & ATTR_SW_MANAGED) == 0)
				continue;
			lock = NULL;
			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
				if (lock != NULL)
					rw_wunlock(lock);

				/*
				 * The 2MB page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Choosing the last page
			 * within the address range [sva, min(va_next, eva))
			 * generally results in more repromotions.  Since the
			 * underlying page table page is fully populated, this
			 * removal never frees a page table page.
			 */
			if ((oldl2 & ATTR_SW_WIRED) == 0) {
				va = eva;
				if (va > va_next)
					va = va_next;
				va -= PAGE_SIZE;
				KASSERT(va >= sva,
				    ("pmap_advise: no address gap"));
				l3 = pmap_l2_to_l3(l2, va);
				KASSERT(pmap_load(l3) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
				    NULL, &lock);
			}
			if (lock != NULL)
				rw_wunlock(lock);
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_advise: invalid L2 entry after demotion"));
		if (va_next > eva)
			va_next = eva;
		va = va_next;
		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
		    sva += L3_SIZE) {
			oldl3 = pmap_load(l3);
			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
			    (ATTR_SW_MANAGED | L3_PAGE))
				goto maybe_invlrng;
			else if (pmap_pte_dirty(pmap, oldl3)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PTE_TO_VM_PAGE(oldl3);
					vm_page_dirty(m);
				}
				if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
					/*
					 * Unconditionally demote the L3C
					 * superpage because we do not allow
					 * writeable, clean superpages.
					 */
					(void)pmap_demote_l3c(pmap, l3, sva);

					/*
					 * Destroy the final mapping before the
					 * next L3C boundary or va_next,
					 * whichever comes first, so that a
					 * subsequent access may act as a
					 * repromotion trigger.
					 */
					if ((oldl3 & ATTR_SW_WIRED) == 0) {
						dva = MIN((sva & ~L3C_OFFSET) +
						    L3C_SIZE - PAGE_SIZE,
						    va_next - PAGE_SIZE);
						dl3 = pmap_l2_to_l3(l2, dva);
						KASSERT(pmap_load(dl3) != 0,
						    ("pmap_advise: invalid PTE"));
						lock = NULL;
						pmap_remove_l3(pmap, dl3, dva,
						    pmap_load(l2), NULL, &lock);
						if (lock != NULL)
							rw_wunlock(lock);
					}

					/*
					 * The L3 entry's accessed bit may have
					 * changed.
					 */
					oldl3 = pmap_load(l3);
				}

				/*
				 * Check that we did not just destroy this entry so
				 * we avoid corrupting the page table.
				 */
				if (oldl3 != 0) {
					while (!atomic_fcmpset_long(l3, &oldl3,
					    (oldl3 & ~ATTR_AF) |
					    ATTR_S1_AP(ATTR_S1_AP_RO)))
						cpu_spinwait();
				}
			} else if ((oldl3 & ATTR_AF) != 0) {
				/*
				 * Clear the accessed bit in this L3 entry
				 * regardless of the contiguous bit.
				 */
				pmap_clear_bits(l3, ATTR_AF);
			} else
				goto maybe_invlrng;
			if (va == va_next)
				va = sva;
			continue;
maybe_invlrng:
			if (va != va_next) {
				pmap_s1_invalidate_range(pmap, va, sva, true);
				va = va_next;
			}
		}
		if (va != va_next)
			pmap_s1_invalidate_range(pmap, va, sva, true);
	}
	PMAP_UNLOCK(pmap);
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *l2, oldl2;
	pt_entry_t *l3, oldl3;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
restart:
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		oldl2 = pmap_load(l2);
		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
		if ((oldl2 & ATTR_SW_DBM) != 0 &&
		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
		    (oldl2 & ATTR_SW_WIRED) == 0) {
			/*
			 * Write protect the mapping to a single page so that
			 * a subsequent write access may repromote.
			 */
			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
			l3 = pmap_l2_to_l3(l2, va);
			oldl3 = pmap_load(l3);
			while (!atomic_fcmpset_long(l3, &oldl3,
			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
				cpu_spinwait();
			vm_page_dirty(m);
			pmap_s1_invalidate_page(pmap, va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);
		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		oldl3 = pmap_load(l3);
		KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
		    (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
		    ("writeable L3C superpage not dirty"));
		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
			if ((oldl3 & ATTR_CONTIGUOUS) != 0)
				(void)pmap_demote_l3c(pmap, l3, pv->pv_va);
			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
}

/*
 * Map a range of physical addresses for BIOS/firmware access.  Uses the
 * direct map when possible; before the VM is initialized, maps whole 2MiB
 * L2 blocks from a reserved VA range; otherwise allocates KVA and maps at
 * page granularity.
 */
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t va, offset;
	pd_entry_t old_l2e, *pde;
	pt_entry_t *l2;
	int i, lvl, l2_blocks, free_l2_count, start_idx;

	/* Use the DMAP region if we can */
	if (PHYS_IN_DMAP(pa) && PHYS_IN_DMAP(pa + size - 1) &&
	    pmap_kmapped_range(PHYS_TO_DMAP(pa), size))
		return ((void *)PHYS_TO_DMAP(pa));

	if (!vm_initialized) {
		/*
		 * No L3 ptables so map entire L2 blocks where start VA is:
		 *	preinit_map_va + start_idx * L2_SIZE
		 * There may be duplicate mappings (multiple VA -> same PA) but
		 * ARM64 dcache is always PIPT so that's acceptable.
		 */
		if (size == 0)
			return (NULL);

		/* Calculate how many L2 blocks are needed for the mapping */
		l2_blocks = (roundup2(pa + size, L2_SIZE) -
		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;

		offset = pa & L2_OFFSET;

		if (preinit_map_va == 0)
			return (NULL);

		/* Map 2MiB L2 blocks from reserved VA space */

		free_l2_count = 0;
		start_idx = -1;
		/* Find enough free contiguous VA space */
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (free_l2_count > 0 && ppim->pa != 0) {
				/* Not enough space here */
				free_l2_count = 0;
				start_idx = -1;
				continue;
			}

			if (ppim->pa == 0) {
				/* Free L2 block */
				if (start_idx == -1)
					start_idx = i;
				free_l2_count++;
				if (free_l2_count == l2_blocks)
					break;
			}
		}
		if (free_l2_count != l2_blocks)
			panic("%s: too many preinit mappings", __func__);

		va = preinit_map_va + (start_idx * L2_SIZE);
		for (i = start_idx; i < start_idx + l2_blocks; i++) {
			/* Mark entries as allocated */
			ppim = pmap_preinit_mapping + i;
			ppim->pa = pa;
			ppim->va = va + offset;
			ppim->size = size;
		}

		/* Map L2 blocks */
		pa = rounddown2(pa, L2_SIZE);
		old_l2e = 0;
		for (i = 0; i < l2_blocks; i++) {
			pde = pmap_pde(kernel_pmap, va, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
			    va));
			KASSERT(lvl == 1,
			    ("pmap_mapbios: Invalid level %d", lvl));

			/* Insert L2_BLOCK */
			l2 = pmap_l1_to_l2(pde, va);
			old_l2e |= pmap_load_store(l2,
			    PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr |
			    ATTR_S1_XN | ATTR_KERN_GP |
			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);

			va += L2_SIZE;
			pa += L2_SIZE;
		}
		if ((old_l2e & ATTR_DESCR_VALID) != 0)
			pmap_s1_invalidate_all_kernel();
		else {
			/*
			 * Because the old entries were invalid and the new
			 * mappings are not executable, an isb is not required.
			 */
			dsb(ishst);
		}

		va = preinit_map_va + (start_idx * L2_SIZE);

	} else {
		/* kva_alloc may be used to map the pages */
		offset = pa & PAGE_MASK;
		size = round_page(offset + size);

		va = kva_alloc(size);
		if (va == 0)
			panic("%s: Couldn't allocate KVA", __func__);

		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));

		/* L3 table is linked */
		va = trunc_page(va);
		pa = trunc_page(pa);
		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
	}

	return ((void *)(va + offset));
}

/*
 * Undo a mapping created by pmap_mapbios(): restore DMAP attributes, or
 * tear down the preinit L2 block mapping or the kva_alloc'ed pages.
 */
void
pmap_unmapbios(void *p, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t offset, va, va_trunc;
	pd_entry_t *pde;
	pt_entry_t *l2;
	int error __diagused, i, lvl, l2_blocks, block;
	bool preinit_map;

	va = (vm_offset_t)p;
	if (VIRT_IN_DMAP(va)) {
		KASSERT(VIRT_IN_DMAP(va + size - 1),
		    ("%s: End address not in DMAP region: %lx", __func__,
		    va + size - 1));
		/* Ensure the attributes are as expected for the DMAP region */
		PMAP_LOCK(kernel_pmap);
		error = pmap_change_props_locked(va, size,
		    PROT_READ | PROT_WRITE, VM_MEMATTR_DEFAULT, false);
		PMAP_UNLOCK(kernel_pmap);
		KASSERT(error == 0, ("%s: Failed to reset DMAP attributes: %d",
		    __func__, error));

		return;
	}

	l2_blocks =
	    (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));

	/* Remove preinit mapping */
	preinit_map = false;
	block = 0;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == va) {
			KASSERT(ppim->size == size,
			    ("pmap_unmapbios: size mismatch"));
			ppim->va = 0;
			ppim->pa = 0;
			ppim->size = 0;
			preinit_map = true;
			offset = block * L2_SIZE;
			va_trunc = rounddown2(va, L2_SIZE) + offset;

			/* Remove L2_BLOCK */
			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
			    va_trunc));
			l2 = pmap_l1_to_l2(pde, va_trunc);
			pmap_clear(l2);

			if (block == (l2_blocks - 1))
				break;
			block++;
		}
	}
	if (preinit_map) {
		pmap_s1_invalidate_all_kernel();
		return;
	}

	/* Unmap the pages reserved with kva_alloc. */
	if (vm_initialized) {
		offset = va & PAGE_MASK;
		size = round_page(offset + size);
		va = trunc_page(va);

		/* Unmap and invalidate the pages */
		pmap_kremove_device(va, size);

		kva_free(va, size);
	}
}

/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
	if (m->md.pv_memattr == ma)
		return;

	m->md.pv_memattr = ma;

	/*
	 * If "m" is a normal page, update its direct mapping.  This update
	 * can be relied upon to perform any cache operations that are
	 * required for data coherence.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
	    m->md.pv_memattr) != 0)
		panic("memory attribute change on the direct map failed");
}

/*
 * Changes the specified virtual address range's memory type to that given by
 * the parameter "mode".  The specified virtual address range must be
 * completely contained within either the direct map or the kernel map.  If
 * the virtual address range is contained within the kernel map, then the
 * memory type for each of the corresponding ranges of the direct map is also
 * changed.  (The corresponding ranges of the direct map are those ranges that
 * map the same physical pages as the specified virtual address range.)  These
 * changes to the direct map are necessary because Intel describes the
 * behavior of their processors as "undefined" if two or more mappings to the
 * same physical page have different memory types.
 *
 * Returns zero if the change completed successfully, and either EINVAL or
 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 * of the virtual address range was not mapped, and ENOMEM is returned if
 * there was insufficient memory available to complete the change.  In the
 * latter case, the memory type may have been changed on some part of the
 * virtual address range or the direct map.
 */
int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
	int error;

	PMAP_LOCK(kernel_pmap);
	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

/*
 * Changes the specified virtual address range's protections to those
 * specified by "prot".  Like pmap_change_attr(), protections for aliases
 * in the direct map are updated as well.  Protections on aliasing mappings may
 * be a subset of the requested protections; for example, mappings in the direct
 * map are never executable.
 */
int
pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
{
	int error;

	/* Only supported within the kernel map. */
	if (va < VM_MIN_KERNEL_ADDRESS)
		return (EINVAL);

	PMAP_LOCK(kernel_pmap);
	error = pmap_change_props_locked(va, size, prot, -1, false);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

/*
 * Common worker for pmap_change_attr() and pmap_change_prot().  Applies the
 * requested memory attribute ("mode", or -1 for no change) and/or protection
 * ("prot", or VM_PROT_NONE for no change) to the given kernel or DMAP range,
 * demoting block/contiguous mappings as needed so that only the requested
 * range is affected.  When "skip_unmapped" is true, unmapped subranges are
 * skipped instead of returning EINVAL.  Called with the kernel pmap locked.
 */
static int
pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
    int mode, bool skip_unmapped)
{
	vm_offset_t base, offset, tmpva;
	vm_size_t pte_size;
	vm_paddr_t pa;
	pt_entry_t pte, *ptep, *newpte;
	pt_entry_t bits, mask;
	int lvl, rv;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	if (!VIRT_IN_DMAP(base) &&
	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
		return (EINVAL);

	bits = 0;
	mask = 0;
	if (mode != -1) {
		bits = ATTR_S1_IDX(mode);
		mask = ATTR_S1_IDX_MASK;
		if (mode == VM_MEMATTR_DEVICE) {
			mask |= ATTR_S1_XN;
			bits |= ATTR_S1_XN;
		}
	}
	if (prot != VM_PROT_NONE) {
		/* Don't mark the DMAP as executable. It never is on arm64. */
		if (VIRT_IN_DMAP(base)) {
			prot &= ~VM_PROT_EXECUTE;
			/*
			 * XXX Mark the DMAP as writable for now. We rely
			 * on this in ddb & dtrace to insert breakpoint
			 * instructions.
			 */
			prot |= VM_PROT_WRITE;
		}

		if ((prot & VM_PROT_WRITE) == 0) {
			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
		}
		if ((prot & VM_PROT_EXECUTE) == 0) {
			bits |= ATTR_S1_PXN;
		}
		bits |= ATTR_S1_UXN;
		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
	}

	for (tmpva = base; tmpva < base + size; ) {
		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
		if (ptep == NULL && !skip_unmapped) {
			return (EINVAL);
		} else if ((ptep == NULL && skip_unmapped) ||
		    (pmap_load(ptep) & mask) == bits) {
			/*
			 * We already have the correct attribute or there
			 * is no memory mapped at this address and we are
			 * skipping unmapped memory.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
				break;
			case 2:
				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
				break;
			case 3:
				tmpva += PAGE_SIZE;
				break;
			}
		} else {
			/* We can't demote/promote this entry */
			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);

			/*
			 * Find the entry and demote it if the requested change
			 * only applies to part of the address range mapped by
			 * the entry.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
				if ((tmpva & L1_OFFSET) == 0 &&
				    (base + size - tmpva) >= L1_SIZE) {
					pte_size = L1_SIZE;
					break;
				}
				newpte = pmap_demote_l1(kernel_pmap, ptep,
				    tmpva & ~L1_OFFSET);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l1_to_l2(ptep, tmpva);
				/* FALLTHROUGH */
			case 2:
				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
					if ((tmpva & L2C_OFFSET) == 0 &&
					    (base + size - tmpva) >= L2C_SIZE) {
						pte_size = L2C_SIZE;
						break;
					}
					if (!pmap_demote_l2c(kernel_pmap, ptep,
					    tmpva))
						return (EINVAL);
				}
				if ((tmpva & L2_OFFSET) == 0 &&
				    (base + size - tmpva) >= L2_SIZE) {
					pte_size = L2_SIZE;
					break;
				}
				newpte = pmap_demote_l2(kernel_pmap, ptep,
				    tmpva);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l2_to_l3(ptep, tmpva);
				/* FALLTHROUGH */
			case 3:
				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
					if ((tmpva & L3C_OFFSET) == 0 &&
					    (base + size - tmpva) >= L3C_SIZE) {
						pte_size = L3C_SIZE;
						break;
					}
					if (!pmap_demote_l3c(kernel_pmap, ptep,
					    tmpva))
						return (EINVAL);
				}
				pte_size = PAGE_SIZE;
				break;
			}

			/* Update the entry */
			pte = pmap_load(ptep);
			pte &= ~mask;
			pte |= bits;

			switch (pte_size) {
			case L2C_SIZE:
				pmap_update_strided(kernel_pmap, ptep, ptep +
				    L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
				break;
			case L3C_SIZE:
				pmap_update_strided(kernel_pmap, ptep, ptep +
				    L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
				break;
			default:
				/*
				 * We are updating a single block or page entry,
				 * so regardless of pte_size pass PAGE_SIZE in
				 * order that a single TLB invalidation is
				 * performed.
				 */
				pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
				    PAGE_SIZE);
				break;
			}

			pa = PTE_TO_PHYS(pte);
			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
				/*
				 * Keep the DMAP memory in sync.
				 */
				rv = pmap_change_props_locked(
				    PHYS_TO_DMAP(pa), pte_size,
				    prot, mode, true);
				if (rv != 0)
					return (rv);
			}

			/*
			 * If moving to a non-cacheable entry flush
			 * the cache.
			 */
			if (mode == VM_MEMATTR_UNCACHEABLE)
				cpu_dcache_wbinv_range((void *)tmpva, pte_size);
			tmpva += pte_size;
		}
	}

	return (0);
}

/*
 * Create an L2 table to map all addresses within an L1 mapping.
 */
static pt_entry_t *
pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
{
	pt_entry_t *l2, newl2, oldl1;
	vm_offset_t tmpl1;
	vm_paddr_t l2phys, phys;
	vm_page_t ml2;
	int i;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldl1 = pmap_load(l1);
	PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
	    ("pmap_demote_l1: Demoting a non-block entry"));
	KASSERT((va & L1_OFFSET) == 0,
	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
	KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
	    ("pmap_demote_l1: Demoting entry with no-demote flag set"));

	/*
	 * If the L1 entry being demoted maps the page tables themselves,
	 * map that page at a temporary KVA so the update doesn't pull the
	 * table out from under us.
	 */
	tmpl1 = 0;
	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
		tmpl1 = kva_alloc(PAGE_SIZE);
		if (tmpl1 == 0)
			return (NULL);
	}

	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
	    NULL) {
		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
		    " in pmap %p", va, pmap);
		l2 = NULL;
		goto fail;
	}

	l2phys = VM_PAGE_TO_PHYS(ml2);
	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);

	/* Address the range points at */
	phys = PTE_TO_PHYS(oldl1);
	/* The attributes from the old l1 table to be copied */
	newl2 = oldl1 & ATTR_MASK;

	/* Create the new entries */
	newl2 |= ATTR_CONTIGUOUS;
	for (i = 0; i < Ln_ENTRIES; i++) {
		l2[i] = newl2 | phys;
		phys += L2_SIZE;
	}
	KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) |
	    L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0],
	    ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));

	if (tmpl1 != 0) {
		pmap_kenter(tmpl1, PAGE_SIZE,
		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
	}

	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);

	counter_u64_add(pmap_l1_demotions, 1);
fail:
	if (tmpl1 != 0) {
		pmap_kremove(tmpl1);
		kva_free(tmpl1, PAGE_SIZE);
	}

	return (l2);
}

/*
 * Fill all Ln_ENTRIES L3 entries starting at "firstl3" with consecutive
 * mappings derived from "newl3", advancing the physical address by L3_SIZE
 * per entry.
 */
static void
pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
{
	pt_entry_t *l3;

	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
		*l3 = newl3;
		newl3 += L3_SIZE;
	}
}

static void
pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
{
#ifdef INVARIANTS
#ifdef DIAGNOSTIC
	pt_entry_t *xl3p, *yl3p;

	for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
	    xl3p++, newl3e += PAGE_SIZE) {
		if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
			printf("pmap_demote_l2: xl3e %zd and newl3e map "
			    "different pages: found %#lx, expected %#lx\n",
			    xl3p - firstl3p, pmap_load(xl3p), newl3e);
			printf("page table dump\n");
			for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
			    yl3p++) {
				printf("%zd %#lx\n", yl3p - firstl3p,
				    pmap_load(yl3p));
			}
			panic("firstpte");
		}
	}
#else
	KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
	    ("pmap_demote_l2: firstl3
and newl3e map different physical" 8526 " addresses")); 8527 #endif 8528 #endif 8529 } 8530 8531 static void 8532 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, 8533 struct rwlock **lockp) 8534 { 8535 struct spglist free; 8536 8537 SLIST_INIT(&free); 8538 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true, 8539 &free, lockp); 8540 vm_page_free_pages_toq(&free, true); 8541 } 8542 8543 /* 8544 * Create an L3 table to map all addresses within an L2 mapping. 8545 */ 8546 static pt_entry_t * 8547 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, 8548 struct rwlock **lockp) 8549 { 8550 pt_entry_t *l3, newl3, oldl2; 8551 vm_offset_t tmpl2; 8552 vm_paddr_t l3phys; 8553 vm_page_t ml3; 8554 8555 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 8556 PMAP_ASSERT_STAGE1(pmap); 8557 KASSERT(ADDR_IS_CANONICAL(va), 8558 ("%s: Address not in canonical form: %lx", __func__, va)); 8559 8560 l3 = NULL; 8561 oldl2 = pmap_load(l2); 8562 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, 8563 ("pmap_demote_l2: Demoting a non-block entry")); 8564 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0, 8565 ("pmap_demote_l2: Demoting entry with no-demote flag set")); 8566 va &= ~L2_OFFSET; 8567 8568 tmpl2 = 0; 8569 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { 8570 tmpl2 = kva_alloc(PAGE_SIZE); 8571 if (tmpl2 == 0) 8572 return (NULL); 8573 } 8574 8575 /* 8576 * Invalidate the 2MB page mapping and return "failure" if the 8577 * mapping was never accessed and not wired. 8578 */ 8579 if ((oldl2 & ATTR_AF) == 0) { 8580 if ((oldl2 & ATTR_SW_WIRED) == 0) { 8581 pmap_demote_l2_abort(pmap, va, l2, lockp); 8582 CTR2(KTR_PMAP, 8583 "pmap_demote_l2: failure for va %#lx in pmap %p", 8584 va, pmap); 8585 goto fail; 8586 } 8587 ml3 = pmap_remove_pt_page(pmap, va); 8588 /* Fill the PTP with L3Es that have ATTR_AF cleared. 
		 */
		ml3->valid = 0;
	} else if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
		    ("pmap_demote_l2: page table page for a wired mapping"
		    " is missing"));

		/*
		 * If the page table page is missing and the mapping
		 * is for a kernel address, the mapping must belong to
		 * either the direct map or the early kernel memory.
		 * Page table pages are preallocated for every other
		 * part of the kernel address space, so the direct map
		 * region and early kernel memory are the only parts of the
		 * kernel address space that must be handled here.
		 */
		KASSERT(ADDR_IS_USER(va) || VIRT_IN_DMAP(va) ||
		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
		    ("pmap_demote_l2: No saved mpte for va %#lx", va));

		/*
		 * If the 2MB page mapping belongs to the direct map
		 * region of the kernel's address space, then the page
		 * allocation request specifies the highest possible
		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
		 * priority is normal.
		 */
		ml3 = vm_page_alloc_noobj(
		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
		    VM_ALLOC_WIRED);

		/*
		 * If the allocation of the new page table page fails,
		 * invalidate the 2MB page mapping and return "failure".
		 */
		if (ml3 == NULL) {
			pmap_demote_l2_abort(pmap, va, l2, lockp);
			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
			    " in pmap %p", va, pmap);
			goto fail;
		}
		ml3->pindex = pmap_l2_pindex(va);

		if (ADDR_IS_USER(va)) {
			ml3->ref_count = NL3PG;
			pmap_resident_count_inc(pmap, 1);
		}
	}
	l3phys = VM_PAGE_TO_PHYS(ml3);
	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
	/* The new L3Es inherit the old L2 block's attributes. */
	newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));

	/*
	 * If the PTP is not leftover from an earlier promotion or it does not
	 * have ATTR_AF set in every L3E, then fill it.  The new L3Es will all
	 * have ATTR_AF set, unless this is a wired mapping with ATTR_AF clear.
	 *
	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
	 * performs a dsb().  That dsb() ensures that the stores for filling
	 * "l3" are visible before "l3" is added to the page table.
	 */
	if (!vm_page_all_valid(ml3))
		pmap_fill_l3(l3, newl3);

	pmap_demote_l2_check(l3, newl3);

	/*
	 * If the mapping has changed attributes, update the L3Es.
	 */
	if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
		pmap_fill_l3(l3, newl3);

	/*
	 * Map the temporary page so we don't lose access to the l2 table.
	 */
	if (tmpl2 != 0) {
		pmap_kenter(tmpl2, PAGE_SIZE,
		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
	}

	/*
	 * The spare PV entries must be reserved prior to demoting the
	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
	 * of the L2 and the PV lists will be inconsistent, which can result
	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
	 * PV entry for the 2MB page mapping that is being demoted.
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);

	/*
	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
	 * the 2MB page mapping.
	 */
	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);

	/*
	 * Demote the PV entry.
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);

	counter_u64_add(pmap_l2_demotions, 1);
	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
	    " in pmap %p %lx", va, pmap, l3[0]);

fail:
	if (tmpl2 != 0) {
		pmap_kremove(tmpl2);
		kva_free(tmpl2, PAGE_SIZE);
	}

	return (l3);

}

/*
 * Wrapper around pmap_demote_l2_locked() for callers that do not already
 * hold a PV list lock; releases any lock acquired during the demotion.
 */
static pt_entry_t *
pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
{
	struct rwlock *lock;
	pt_entry_t *l3;

	lock = NULL;
	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	return (l3);
}

/*
 * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
 */
static bool
pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
	vm_offset_t tmpl3;
	register_t intr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	/* Round down to the first L2 entry of the contiguous set. */
	l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
	    sizeof(pd_entry_t)) - 1));
	l2c_end = l2c_start + L2C_ENTRIES;
	/*
	 * If the L2 entries lie within the region being demoted, access them
	 * through a temporary KVA alias so they remain reachable while the
	 * mappings are invalid.
	 */
	tmpl3 = 0;
	if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
	    (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
		tmpl3 = kva_alloc(PAGE_SIZE);
		if (tmpl3 == 0)
			return (false);
		pmap_kenter(tmpl3, PAGE_SIZE,
		    DMAP_TO_PHYS((vm_offset_t)l2c_start) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l2c_start = (pd_entry_t *)(tmpl3 +
		    ((vm_offset_t)l2c_start & PAGE_MASK));
		l2c_end = (pd_entry_t *)(tmpl3 +
		    ((vm_offset_t)l2c_end & PAGE_MASK));
	}
	mask = 0;
	nbits = ATTR_DESCR_VALID;
	/* Disable interrupts to bound the window with invalid entries. */
	intr = intr_disable();

	/*
	 * Break the mappings.
	 */
	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
		/*
		 * Clear the mapping's contiguous and valid bits, but leave
		 * the rest of the entry unchanged, so that a lockless,
		 * concurrent pmap_kextract() can still lookup the physical
		 * address.
		 */
		l2e = pmap_load(tl2p);
		KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
		    ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
		KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
		    ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
		while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
		    ATTR_DESCR_VALID)))
			cpu_spinwait();

		/*
		 * Hardware accessed and dirty bit maintenance might only
		 * update a single L2 entry, so we must combine the accessed
		 * and dirty bits from this entire set of contiguous L2
		 * entries.
		 */
		if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
			mask = ATTR_S1_AP_RW_BIT;
		nbits |= l2e & ATTR_AF;
	}
	/* Only invalidate if some entry could have been cached in a TLB. */
	if ((nbits & ATTR_AF) != 0) {
		pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
		    L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
	}

	/*
	 * Remake the mappings, updating the accessed and dirty bits.
	 */
	l2e = (pmap_load(l2c_start) & ~mask) | nbits;
	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
		pmap_store(tl2p, l2e);
		l2e += L2_SIZE;
	}
	dsb(ishst);

	intr_restore(intr);
	if (tmpl3 != 0) {
		pmap_kremove(tmpl3);
		kva_free(tmpl3, PAGE_SIZE);
	}
	counter_u64_add(pmap_l2c_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
	    va, pmap);
	return (true);
}

/*
 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
 */
static bool
pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
{
	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
	vm_offset_t tmpl3;
	register_t intr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Round down to the first L3 entry of the contiguous set. */
	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
	    sizeof(pt_entry_t)) - 1));
	l3c_end = l3c_start + L3C_ENTRIES;
	/* As above, alias the entries if they lie within the demoted range. */
	tmpl3 = 0;
	if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
	    (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
		tmpl3 = kva_alloc(PAGE_SIZE);
		if (tmpl3 == 0)
			return (false);
		pmap_kenter(tmpl3, PAGE_SIZE,
		    DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l3c_start = (pt_entry_t *)(tmpl3 +
		    ((vm_offset_t)l3c_start & PAGE_MASK));
		l3c_end = (pt_entry_t *)(tmpl3 +
		    ((vm_offset_t)l3c_end & PAGE_MASK));
	}
	mask = 0;
	nbits = ATTR_DESCR_VALID;
	intr = intr_disable();

	/*
	 * Break the mappings.
	 */
	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
		/*
		 * Clear the mapping's contiguous and valid bits, but leave
		 * the rest of the entry unchanged, so that a lockless,
		 * concurrent pmap_kextract() can still lookup the physical
		 * address.
		 */
		l3e = pmap_load(tl3p);
		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
		    ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
		KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
		    ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
		while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
		    ATTR_DESCR_VALID)))
			cpu_spinwait();

		/*
		 * Hardware accessed and dirty bit maintenance might only
		 * update a single L3 entry, so we must combine the accessed
		 * and dirty bits from this entire set of contiguous L3
		 * entries.
		 */
		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
			mask = ATTR_S1_AP_RW_BIT;
		nbits |= l3e & ATTR_AF;
	}
	if ((nbits & ATTR_AF) != 0) {
		pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
		    ~L3C_OFFSET, true);
	}

	/*
	 * Remake the mappings, updating the accessed and dirty bits.
	 */
	l3e = (pmap_load(l3c_start) & ~mask) | nbits;
	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
		pmap_store(tl3p, l3e);
		l3e += L3_SIZE;
	}
	dsb(ishst);

	intr_restore(intr);
	if (tmpl3 != 0) {
		pmap_kremove(tmpl3);
		kva_free(tmpl3, PAGE_SIZE);
	}
	counter_u64_add(pmap_l3c_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
	    va, pmap);
	return (true);
}

/*
 * Accumulate the accessed and dirty bits within a L3C superpage and
 * return the specified PTE with them applied correctly.
8905 */ 8906 static pt_entry_t 8907 pmap_load_l3c(pt_entry_t *l3p) 8908 { 8909 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p; 8910 8911 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 8912 sizeof(pt_entry_t)) - 1)); 8913 l3c_end = l3c_start + L3C_ENTRIES; 8914 mask = 0; 8915 nbits = 0; 8916 /* Iterate over each mapping in the superpage. */ 8917 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) { 8918 l3e = pmap_load(tl3p); 8919 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 8920 ("pmap_load_l3c: missing ATTR_CONTIGUOUS")); 8921 /* Update mask if the current page has its dirty bit set. */ 8922 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 8923 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)) 8924 mask = ATTR_S1_AP_RW_BIT; 8925 /* Update nbits if the accessed bit is set. */ 8926 nbits |= l3e & ATTR_AF; 8927 } 8928 return ((pmap_load(l3p) & ~mask) | nbits); 8929 } 8930 8931 /* 8932 * Perform the pmap work for mincore(2). If the page is not both referenced and 8933 * modified by this pmap, returns its physical address so that the caller can 8934 * find other mappings. 8935 */ 8936 int 8937 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 8938 { 8939 pt_entry_t *pte, tpte; 8940 vm_paddr_t mask, pa; 8941 int lvl, psind, val; 8942 bool managed; 8943 8944 PMAP_ASSERT_STAGE1(pmap); 8945 PMAP_LOCK(pmap); 8946 pte = pmap_pte(pmap, addr, &lvl); 8947 if (pte != NULL) { 8948 tpte = pmap_load(pte); 8949 8950 switch (lvl) { 8951 case 3: 8952 mask = L3_OFFSET; 8953 psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 
1 : 0; 8954 break; 8955 case 2: 8956 mask = L2_OFFSET; 8957 psind = 2; 8958 break; 8959 case 1: 8960 mask = L1_OFFSET; 8961 psind = 3; 8962 break; 8963 default: 8964 panic("pmap_mincore: invalid level %d", lvl); 8965 } 8966 8967 managed = (tpte & ATTR_SW_MANAGED) != 0; 8968 val = MINCORE_INCORE | MINCORE_PSIND(psind); 8969 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && 8970 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) 8971 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 8972 if ((tpte & ATTR_AF) == ATTR_AF) 8973 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 8974 8975 pa = PTE_TO_PHYS(tpte) | (addr & mask); 8976 } else { 8977 managed = false; 8978 val = 0; 8979 } 8980 8981 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 8982 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 8983 *pap = pa; 8984 } 8985 PMAP_UNLOCK(pmap); 8986 return (val); 8987 } 8988 8989 /* 8990 * Garbage collect every ASID that is neither active on a processor nor 8991 * reserved. 8992 */ 8993 static void 8994 pmap_reset_asid_set(pmap_t pmap) 8995 { 8996 pmap_t curpmap; 8997 int asid, cpuid, epoch; 8998 struct asid_set *set; 8999 enum pmap_stage stage; 9000 9001 set = pmap->pm_asid_set; 9002 stage = pmap->pm_stage; 9003 9004 set = pmap->pm_asid_set; 9005 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 9006 mtx_assert(&set->asid_set_mutex, MA_OWNED); 9007 9008 /* 9009 * Ensure that the store to asid_epoch is globally visible before the 9010 * loads from pc_curpmap are performed. 
9011 */ 9012 epoch = set->asid_epoch + 1; 9013 if (epoch == INT_MAX) 9014 epoch = 0; 9015 set->asid_epoch = epoch; 9016 dsb(ishst); 9017 if (stage == PM_STAGE1) { 9018 __asm __volatile("tlbi vmalle1is"); 9019 } else { 9020 KASSERT(pmap_clean_stage2_tlbi != NULL, 9021 ("%s: Unset stage 2 tlb invalidation callback\n", 9022 __func__)); 9023 pmap_clean_stage2_tlbi(); 9024 } 9025 dsb(ish); 9026 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, 9027 set->asid_set_size - 1); 9028 CPU_FOREACH(cpuid) { 9029 if (cpuid == curcpu) 9030 continue; 9031 if (stage == PM_STAGE1) { 9032 curpmap = pcpu_find(cpuid)->pc_curpmap; 9033 PMAP_ASSERT_STAGE1(pmap); 9034 } else { 9035 curpmap = pcpu_find(cpuid)->pc_curvmpmap; 9036 if (curpmap == NULL) 9037 continue; 9038 PMAP_ASSERT_STAGE2(pmap); 9039 } 9040 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); 9041 asid = COOKIE_TO_ASID(curpmap->pm_cookie); 9042 if (asid == -1) 9043 continue; 9044 bit_set(set->asid_set, asid); 9045 curpmap->pm_cookie = COOKIE_FROM(asid, epoch); 9046 } 9047 } 9048 9049 /* 9050 * Allocate a new ASID for the specified pmap. 9051 */ 9052 static void 9053 pmap_alloc_asid(pmap_t pmap) 9054 { 9055 struct asid_set *set; 9056 int new_asid; 9057 9058 set = pmap->pm_asid_set; 9059 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 9060 9061 mtx_lock_spin(&set->asid_set_mutex); 9062 9063 /* 9064 * While this processor was waiting to acquire the asid set mutex, 9065 * pmap_reset_asid_set() running on another processor might have 9066 * updated this pmap's cookie to the current epoch. In which case, we 9067 * don't need to allocate a new ASID. 
	 */
	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
		goto out;

	/* Search for a free ASID starting from the last allocation point. */
	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
	    &new_asid);
	if (new_asid == -1) {
		/* Wrap around and search the lower part of the set. */
		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
		    set->asid_next, &new_asid);
		if (new_asid == -1) {
			/* The set is exhausted; reclaim inactive ASIDs. */
			pmap_reset_asid_set(pmap);
			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
			    set->asid_set_size, &new_asid);
			KASSERT(new_asid != -1, ("ASID allocation failure"));
		}
	}
	bit_set(set->asid_set, new_asid);
	set->asid_next = new_asid + 1;
	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
out:
	mtx_unlock_spin(&set->asid_set_mutex);
}

/* Extra bits (e.g. CnP) to set in every ttbr0 value; see pmap_set_cnp(). */
static uint64_t __read_mostly ttbr_flags;

/*
 * Compute the value that should be stored in ttbr0 to activate the specified
 * pmap.  This value may change from time to time.
 */
uint64_t
pmap_to_ttbr0(pmap_t pmap)
{
	uint64_t ttbr;

	ttbr = pmap->pm_ttbr;
	ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
	ttbr |= ttbr_flags;

	return (ttbr);
}

/*
 * smp_rendezvous() callback: set the Common-not-Private (CnP) flag in this
 * CPU's ttbr0/ttbr1 (and, on the initiating CPU, in ttbr_flags).
 */
static void
pmap_set_cnp(void *arg)
{
	uint64_t ttbr0, ttbr1;
	u_int cpuid;

	cpuid = *(u_int *)arg;
	if (cpuid == curcpu) {
		/*
		 * Set the flags while all CPUs are handling the
		 * smp_rendezvous so will not call pmap_to_ttbr0.  Any calls
		 * to pmap_to_ttbr0 after this will have the CnP flag set.
		 * The dsb after invalidating the TLB will act as a barrier
		 * to ensure all CPUs can observe this change.
		 */
		ttbr_flags |= TTBR_CnP;
	}

	ttbr0 = READ_SPECIALREG(ttbr0_el1);
	ttbr0 |= TTBR_CnP;

	ttbr1 = READ_SPECIALREG(ttbr1_el1);
	ttbr1 |= TTBR_CnP;

	/* Update ttbr{0,1}_el1 with the CnP flag */
	WRITE_SPECIALREG(ttbr0_el1, ttbr0);
	WRITE_SPECIALREG(ttbr1_el1, ttbr1);
	isb();
	__asm __volatile("tlbi vmalle1is");
	dsb(ish);
	isb();
}

/*
 * Defer enabling some features until we have read the ID registers to know
 * if they are supported on all CPUs.
 */
static void
pmap_init_mp(void *dummy __unused)
{
	uint64_t reg;

	get_kernel_reg(ID_AA64PFR1_EL1, &reg);
	if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
		if (bootverbose)
			printf("Enabling BTI\n");
		pmap_bti_support = true;

		pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
		    sizeof(struct rs_el), NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
	}
}
SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);

/*
 * Defer enabling CnP until we have read the ID registers to know if it's
 * supported on all CPUs.
 */
static void
pmap_init_cnp(void *dummy __unused)
{
	uint64_t reg;
	u_int cpuid;

	get_kernel_reg(ID_AA64MMFR2_EL1, &reg);
	if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
		if (bootverbose)
			printf("Enabling CnP\n");
		cpuid = curcpu;
		smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
	}

}
SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);

/*
 * Make the given user (stage 1) or guest (stage 2) pmap current on this
 * CPU, allocating a fresh ASID if its epoch is stale.  Returns false if the
 * pmap was already current, true otherwise.
 */
static bool
pmap_activate_int(struct thread *td, pmap_t pmap)
{
	struct asid_set *set;
	int epoch;

	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));

	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
		/*
		 * Handle the possibility that the old thread was preempted
		 * after an "ic" or "tlbi" instruction but before it performed
		 * a "dsb" instruction.  If the old thread migrates to a new
		 * processor, its completion of a "dsb" instruction on that
		 * new processor does not guarantee that the "ic" or "tlbi"
		 * instructions performed on the old processor have completed.
		 */
		dsb(ish);
		return (false);
	}

	set = pmap->pm_asid_set;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));

	/*
	 * Ensure that the store to curpmap is globally visible before the
	 * load from asid_epoch is performed.
	 */
	if (pmap->pm_stage == PM_STAGE1)
		PCPU_SET(curpmap, pmap);
	else
		PCPU_SET(curvmpmap, pmap);
	dsb(ish);
	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
	if (epoch >= 0 && epoch != set->asid_epoch)
		pmap_alloc_asid(pmap);

	if (pmap->pm_stage == PM_STAGE1) {
		uint64_t new_tcr, tcr;

		/* Install the thread's per-process TCR fields if they differ. */
		new_tcr = td->td_proc->p_md.md_tcr;
		tcr = READ_SPECIALREG(tcr_el1);
		if ((tcr & MD_TCR_FIELDS) != new_tcr) {
			tcr &= ~MD_TCR_FIELDS;
			tcr |= new_tcr;
			WRITE_SPECIALREG(tcr_el1, tcr);
		}
		set_ttbr0(pmap_to_ttbr0(pmap));
		if (PCPU_GET(bcast_tlbi_workaround) != 0)
			invalidate_local_icache();
	}
	return (true);
}

/*
 * Activate a guest (stage 2) pmap; no thread is associated with it.
 */
void
pmap_activate_vm(pmap_t pmap)
{

	PMAP_ASSERT_STAGE2(pmap);

	(void)pmap_activate_int(NULL, pmap);
}

/*
 * Activate the given thread's user pmap on this CPU.
 */
void
pmap_activate(struct thread *td)
{
	pmap_t pmap;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	PMAP_ASSERT_STAGE1(pmap);
	critical_enter();
	(void)pmap_activate_int(td, pmap);
	critical_exit();
}

/*
 * Activate the thread we are switching to.
 * To simplify the assembly in cpu_throw return the new threads pcb.
 */
struct pcb *
pmap_switch(struct thread *new)
{
	pcpu_bp_harden bp_harden;
	struct pcb *pcb;

	/* Store the new curthread */
	PCPU_SET(curthread, new);

	/* And the new pcb */
	pcb = new->td_pcb;
	PCPU_SET(curpcb, pcb);

	/*
	 * TODO: We may need to flush the cache here if switching
	 * to a user process.
	 */

	if (pmap_activate_int(new, vmspace_pmap(new->td_proc->p_vmspace))) {
		/*
		 * Stop userspace from training the branch predictor against
		 * other processes.  This will call into a CPU specific
		 * function that clears the branch predictor state.
		 */
		bp_harden = PCPU_GET(bp_harden);
		if (bp_harden != NULL)
			bp_harden();
	}

	return (pcb);
}

/*
 * Synchronize the I-cache with recently written instructions in the range
 * [va, va + sz) of the given pmap.
 */
void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{

	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	if (ADDR_IS_KERNEL(va)) {
		cpu_icache_sync_range((void *)va, sz);
	} else {
		u_int len, offset;
		vm_paddr_t pa;

		/* Find the length of data in this page to flush */
		offset = va & PAGE_MASK;
		len = imin(PAGE_SIZE - offset, sz);

		/* Walk page by page, syncing through the DMAP alias. */
		while (sz != 0) {
			/* Extract the physical address & find it in the DMAP */
			pa = pmap_extract(pmap, va);
			if (pa != 0)
				cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
				    len);

			/* Move to the next page */
			sz -= len;
			va += len;
			/* Set the length for the next iteration */
			len = imin(PAGE_SIZE, sz);
		}
	}
}

/*
 * Handle a stage 2 (guest) translation or access-flag fault.  Returns
 * KERN_SUCCESS if the fault was resolved by setting ATTR_AF/ATTR_DESCR_VALID
 * on the relevant entry, otherwise KERN_FAILURE.
 */
static int
pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pd_entry_t *pdep;
	pt_entry_t *ptep, pte;
	int rv, lvl, dfsc;

	PMAP_ASSERT_STAGE2(pmap);
	rv = KERN_FAILURE;

	/* Data and insn aborts use same encoding for FSC field. */
	dfsc = esr & ISS_DATA_DFSC_MASK;
	switch (dfsc) {
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		PMAP_LOCK(pmap);
		pdep = pmap_pde(pmap, far, &lvl);
		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
			PMAP_UNLOCK(pmap);
			break;
		}

		/* Descend one level to the entry that faulted. */
		switch (lvl) {
		case 0:
			ptep = pmap_l0_to_l1(pdep, far);
			break;
		case 1:
			ptep = pmap_l1_to_l2(pdep, far);
			break;
		case 2:
			ptep = pmap_l2_to_l3(pdep, far);
			break;
		default:
			panic("%s: Invalid pde level %d", __func__,lvl);
		}
		goto fault_exec;

	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
fault_exec:
		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
			/*
			 * If accessing an executable page invalidate
			 * the I-cache so it will be valid when we
			 * continue execution in the guest.  The D-cache
			 * is assumed to already be clean to the Point
			 * of Coherency.
			 */
			if ((pte & ATTR_S2_XN_MASK) !=
			    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
				invalidate_icache();
			}
			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	}

	return (rv);
}

/*
 * Attempt to resolve an instruction or data abort in the given pmap without
 * calling into the VM fault handler.  Handles hardware-assisted faults:
 * access-flag faults, permission (dirty-bit emulation) faults, and transient
 * translation faults caused by break-before-make.  Returns KERN_SUCCESS if
 * resolved, KERN_FAILURE otherwise.
 */
int
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pt_entry_t pte, *ptep;
	register_t intr;
	uint64_t ec, par;
	int lvl, rv;

	rv = KERN_FAILURE;

	ec = ESR_ELx_EXCEPTION(esr);
	switch (ec) {
	case EXCP_INSN_ABORT_L:
	case EXCP_INSN_ABORT:
	case EXCP_DATA_ABORT_L:
	case EXCP_DATA_ABORT:
		break;
	default:
		return (rv);
	}

	if (pmap->pm_stage == PM_STAGE2)
		return (pmap_stage2_fault(pmap, esr, far));

	/* Data and insn aborts use same encoding for FSC field. */
	switch (esr & ISS_DATA_DFSC_MASK) {
	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL) {
			pmap_set_bits(ptep, ATTR_AF);
			rv = KERN_SUCCESS;
			/*
			 * XXXMJ as an optimization we could mark the entry
			 * dirty if this is a write fault.
			 */
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_PF_L1:
	case ISS_DATA_DFSC_PF_L2:
	case ISS_DATA_DFSC_PF_L3:
		/* Only a write to a SW-DBM-writable page can be resolved. */
		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
		    (esr & ISS_DATA_WnR) == 0)
			return (rv);
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL &&
		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
			if ((pte & ATTR_S1_AP_RW_BIT) ==
			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
				pmap_s1_invalidate_page(pmap, far, true);
			}
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		/*
		 * Retry the translation.  A break-before-make sequence can
		 * produce a transient fault.
		 */
		if (pmap == kernel_pmap) {
			/*
			 * The translation fault may have occurred within a
			 * critical section.  Therefore, we must check the
			 * address without acquiring the kernel pmap's lock.
			 */
			if (pmap_klookup(far, NULL))
				rv = KERN_SUCCESS;
		} else {
			bool owned;

			/*
			 * In the EFIRT driver we lock the pmap before
			 * calling into the runtime service.  As the lock
			 * is already owned by the current thread skip
			 * locking it again.
			 */
			owned = PMAP_OWNED(pmap);
			if (!owned)
				PMAP_LOCK(pmap);
			/* Ask the MMU to check the address.
*/ 9488 intr = intr_disable(); 9489 par = arm64_address_translate_s1e0r(far); 9490 intr_restore(intr); 9491 if (!owned) 9492 PMAP_UNLOCK(pmap); 9493 9494 /* 9495 * If the translation was successful, then we can 9496 * return success to the trap handler. 9497 */ 9498 if (PAR_SUCCESS(par)) 9499 rv = KERN_SUCCESS; 9500 } 9501 break; 9502 } 9503 9504 return (rv); 9505 } 9506 9507 /* 9508 * Increase the starting virtual address of the given mapping if a 9509 * different alignment might result in more superpage mappings. 9510 */ 9511 void 9512 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 9513 vm_offset_t *addr, vm_size_t size) 9514 { 9515 vm_offset_t superpage_offset; 9516 9517 if (size < L3C_SIZE) 9518 return; 9519 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 9520 offset += ptoa(object->pg_color); 9521 9522 /* 9523 * Considering the object's physical alignment, is the mapping large 9524 * enough to encompass an L2 (2MB/32MB) superpage ... 9525 */ 9526 superpage_offset = offset & L2_OFFSET; 9527 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) { 9528 /* 9529 * If the virtual and physical alignments differ, then 9530 * increase the virtual address so that the alignments match. 9531 */ 9532 if ((*addr & L2_OFFSET) < superpage_offset) 9533 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 9534 else if ((*addr & L2_OFFSET) > superpage_offset) 9535 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + 9536 superpage_offset; 9537 return; 9538 } 9539 /* ... or an L3C (64KB/2MB) superpage? */ 9540 superpage_offset = offset & L3C_OFFSET; 9541 if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) { 9542 if ((*addr & L3C_OFFSET) < superpage_offset) 9543 *addr = (*addr & ~L3C_OFFSET) + superpage_offset; 9544 else if ((*addr & L3C_OFFSET) > superpage_offset) 9545 *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) + 9546 superpage_offset; 9547 } 9548 } 9549 9550 /** 9551 * Get the kernel virtual address of a set of physical pages. 
 * If there are
 * physical addresses not covered by the DMAP perform a transient mapping
 * that will be removed when calling pmap_unmap_io_transient.
 *
 * \param page The pages the caller wishes to obtain the virtual
 *             address on the kernel memory map.
 * \param vaddr On return contains the kernel virtual memory address
 *              of the pages passed in the page parameter.
 * \param count Number of pages passed in.
 * \param can_fault true if the thread using the mapped pages can take
 *                  page faults, false otherwise.
 *
 * \returns true if the caller must call pmap_unmap_io_transient when
 *          finished or false otherwise.
 *
 */
bool
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    bool can_fault)
{
	vm_paddr_t paddr;
	bool needs_mapping;
	int error __diagused, i;

	/*
	 * Allocate any KVA space that we need, this is done in a separate
	 * loop to prevent calling vmem_alloc while pinned.
	 */
	needs_mapping = false;
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
			error = vmem_alloc(kernel_arena, PAGE_SIZE,
			    M_BESTFIT | M_WAITOK, &vaddr[i]);
			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
			needs_mapping = true;
		} else {
			/* Covered by the DMAP; no transient mapping needed. */
			vaddr[i] = PHYS_TO_DMAP(paddr);
		}
	}

	/* Exit early if everything is covered by the DMAP */
	if (!needs_mapping)
		return (false);

	/*
	 * Mapping of pages outside the DMAP is not yet implemented on
	 * arm64; any such page panics below.
	 */
	if (!can_fault)
		sched_pin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (!PHYS_IN_DMAP(paddr)) {
			panic(
			   "pmap_map_io_transient: TODO: Map out of DMAP data");
		}
	}

	return (needs_mapping);
}

/*
 * Undo pmap_map_io_transient(): unpin the thread if it was pinned and
 * release any transient mappings (the latter is unimplemented on arm64).
 */
void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    bool can_fault)
{
	vm_paddr_t paddr;
	int i;

	if (!can_fault)
		sched_unpin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (!PHYS_IN_DMAP(paddr)) {
			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
		}
	}
}

/*
 * Returns true if the given memory attribute value is a valid mode for
 * this pmap (i.e., within the VM_MEMATTR range).
 */
bool
pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
{

	return (mode >= 0 && mode < VM_MEMATTR_END);
}

/*
 * Rangeset callback: duplicate a BTI range element.  Returns NULL if no
 * memory is available (M_NOWAIT allocation).
 */
static void *
bti_dup_range(void *ctx __unused, void *data)
{
	struct rs_el *node, *new_node;

	new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
	if (new_node == NULL)
		return (NULL);
	node = data;
	memcpy(new_node, node, sizeof(*node));
	return (new_node);
}

/* Rangeset callback: free a BTI range element. */
static void
bti_free_range(void *ctx __unused, void *node)
{

	uma_zfree(pmap_bti_ranges_zone, node);
}

/*
 * Record [sva, eva) in the pmap's BTI rangeset.  Returns ENOMEM if a
 * range element cannot be allocated, or the rangeset_insert() error.
 */
static int
pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rs_el *rs;
	int error;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	MPASS(pmap->pm_bti != NULL);
	rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
	if (rs == NULL)
		return (ENOMEM);
	error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
	if (error != 0)
		uma_zfree(pmap_bti_ranges_zone, rs);
	return (error);
}

/* Remove all BTI ranges from the pmap, if it has a BTI rangeset. */
static void
pmap_bti_deassign_all(pmap_t pmap)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_bti != NULL)
		rangeset_remove_all(pmap->pm_bti);
}

/*
 * Returns true if the BTI setting is the same across the specified address
 * range, and false otherwise.  When returning true, updates the referenced PTE
 * to reflect the BTI setting.
 *
 * Only stage 1 pmaps support BTI.  The kernel pmap is always a stage 1 pmap
 * that has the same BTI setting implicitly across its entire address range.
 */
static bool
pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
{
	struct rs_el *rs;
	vm_offset_t va;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Start address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_CANONICAL(eva),
	    ("%s: End address not in canonical form: %lx", __func__, eva));
	KASSERT((*pte & ATTR_S1_GP) == 0,
	    ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte));

	if (pmap == kernel_pmap) {
		*pte |= ATTR_KERN_GP;
		return (true);
	}
	if (pmap->pm_bti == NULL)
		return (true);
	PMAP_ASSERT_STAGE1(pmap);
	rs = rangeset_containing(pmap->pm_bti, sva);
	if (rs == NULL)
		return (rangeset_empty(pmap->pm_bti, sva, eva));
	/*
	 * sva lies in a BTI range; the whole [sva, eva) span must be
	 * covered by back-to-back ranges for the setting to be uniform.
	 */
	while ((va = rs->re_end) < eva) {
		if ((rs = rangeset_beginning(pmap->pm_bti, va)) == NULL)
			return (false);
	}
	*pte |= ATTR_S1_GP;
	return (true);
}

/*
 * Return the guarded-page PTE bit for the given address: ATTR_KERN_GP for
 * the kernel pmap, ATTR_S1_GP when a user address lies in a BTI range,
 * and 0 otherwise (including all stage 2 pmaps).
 */
static pt_entry_t
pmap_pte_bti(pmap_t pmap, vm_offset_t va)
{
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(ADDR_IS_CANONICAL(va));

	if (pmap->pm_stage != PM_STAGE1)
		return (0);
	if (pmap == kernel_pmap)
		return (ATTR_KERN_GP);
	if (pmap->pm_bti != NULL &&
	    rangeset_containing(pmap->pm_bti, va) != NULL)
		return (ATTR_S1_GP);
	return (0);
}

/* Drop BTI tracking for [sva, eva) when the mappings are removed. */
static void
pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_bti != NULL)
		rangeset_remove(pmap->pm_bti, sva, eva);
}

/*
 * Copy the source pmap's BTI rangeset into the destination pmap.
 * Both pmaps must be locked and have BTI rangesets.
 */
static int
pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
{

	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
	MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
	MPASS(src_pmap->pm_bti != NULL);
	MPASS(dst_pmap->pm_bti != NULL);
	if (src_pmap->pm_bti->rs_data_ctx == NULL)
		return (0);
	return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
}

/* Set or clear ATTR_S1_GP on all existing PTEs in [sva, eva). */
static void
pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
{
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);

	pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
	    true);
}

/*
 * Mark [sva, eva) as BTI-enabled in a stage 1 user pmap: record it in the
 * BTI rangeset and set ATTR_S1_GP on any existing mappings.  Retries
 * after vm_wait() on ENOMEM.
 */
int
pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	int error;

	if (pmap->pm_bti == NULL)
		return (0);
	if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
		return (EINVAL);
	if (pmap->pm_stage != PM_STAGE1)
		return (EINVAL);
	if (eva <= sva || ADDR_IS_KERNEL(eva))
		return (EFAULT);

	sva = trunc_page(sva);
	eva = round_page(eva);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_bti_assign(pmap, sva, eva);
		if (error == 0)
			pmap_bti_update_range(pmap, sva, eva, true);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

#if defined(KASAN) || defined(KMSAN)
/* L2 page table used by the early (pre-pmap_bootstrap) shadow map. */
static pd_entry_t *pmap_san_early_l2;

#define	SAN_BOOTSTRAP_L2_SIZE	(1 * L2_SIZE)
#define	SAN_BOOTSTRAP_SIZE	(2 * PAGE_SIZE)

/*
 * Carve an L2_SIZE chunk out of a static, L2-aligned buffer for use as
 * bootstrap shadow map backing memory.  Panics when the buffer is
 * exhausted.
 */
static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_l2(void)
{
	static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
	static size_t offset = 0;
	vm_offset_t addr;

	if (offset + L2_SIZE > sizeof(bootstrap_data)) {
		panic("%s: out of memory for the bootstrap shadow map L2 entries",
		    __func__);
	}

	addr = (uintptr_t)&bootstrap_data[offset];
	offset += L2_SIZE;
	return (addr);
}

/*
 * SAN L1 + L2 pages, maybe L3 entries later?
 */
static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_pages(int npages)
{
	static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
	static size_t offset = 0;
	vm_offset_t addr;

	if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
		panic("%s: out of memory for the bootstrap shadow map",
		    __func__);
	}

	addr = (uintptr_t)&bootstrap_data[offset];
	offset += (npages * PAGE_SIZE);
	return (addr);
}

/*
 * Build the initial shadow map page tables from statically allocated
 * bootstrap pages, and remember the resulting L2 table in
 * pmap_san_early_l2.
 */
static void __nosanitizeaddress
pmap_san_enter_bootstrap(void)
{
	vm_offset_t freemempos;

	/* L1, L2 */
	freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
	bs_state.freemempos = freemempos;
	bs_state.va = KASAN_MIN_ADDRESS;
	pmap_bootstrap_l1_table(&bs_state);
	pmap_san_early_l2 = bs_state.l2;
}

/*
 * Allocate a wired, zeroed page for the shadow map; panics rather than
 * returning NULL.
 */
static vm_page_t
pmap_san_enter_alloc_l3(void)
{
	vm_page_t m;

	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	if (m == NULL)
		panic("%s: no memory to grow shadow map", __func__);
	return (m);
}

/*
 * Try to allocate a physically contiguous, L2_SIZE-aligned run of pages
 * suitable for an L2 block mapping.  May return NULL.
 */
static vm_page_t
pmap_san_enter_alloc_l2(void)
{
	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
	    Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
}

/*
 * Ensure that the shadow map covers "va", creating any missing
 * intermediate page table entries.  Before pmap_bootstrap() runs
 * (virtual_avail == 0), a temporary bootstrap shadow map is used.
 */
void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)
{
	pd_entry_t *l1, *l2;
	pt_entry_t *l3;
	vm_page_t m;

	if (virtual_avail == 0) {
		vm_offset_t block;
		int slot;
		bool first;

		/* Temporary shadow map prior to pmap_bootstrap(). */
		first = pmap_san_early_l2 == NULL;
		if (first)
			pmap_san_enter_bootstrap();

		l2 = pmap_san_early_l2;
		slot = pmap_l2_index(va);

		if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
			MPASS(first);
			block = pmap_san_enter_bootstrap_alloc_l2();
			pmap_store(&l2[slot],
			    PHYS_TO_PTE(pmap_early_vtophys(block)) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
			/* Make the new entry visible before use. */
			dmb(ishst);
		}

		return;
	}

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	l1 = pmap_l1(kernel_pmap, va);
	MPASS(l1 != NULL);
	if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
		m = pmap_san_enter_alloc_l3();
		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
	}
	l2 = pmap_l1_to_l2(l1, va);
	if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
		/* Prefer an L2 block mapping; fall back to an L3 table. */
		m = pmap_san_enter_alloc_l2();
		if (m != NULL) {
			pmap_store(l2, VM_PAGE_TO_PTE(m) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
		} else {
			m = pmap_san_enter_alloc_l3();
			pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
		}
		dmb(ishst);
	}
	if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
		return;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
		return;
	m = pmap_san_enter_alloc_l3();
	pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
	dmb(ishst);
}
#endif /* KASAN || KMSAN */

/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
9937 */ 9938 struct pmap_kernel_map_range { 9939 vm_offset_t sva; 9940 pt_entry_t attrs; 9941 int l3pages; 9942 int l3contig; 9943 int l2blocks; 9944 int l2contig; 9945 int l1blocks; 9946 }; 9947 9948 static void 9949 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 9950 vm_offset_t eva) 9951 { 9952 const char *mode; 9953 int index; 9954 9955 if (eva <= range->sva) 9956 return; 9957 9958 index = range->attrs & ATTR_S1_IDX_MASK; 9959 switch (index) { 9960 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP): 9961 mode = "DEV-NP"; 9962 break; 9963 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 9964 mode = "DEV"; 9965 break; 9966 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 9967 mode = "UC"; 9968 break; 9969 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 9970 mode = "WB"; 9971 break; 9972 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 9973 mode = "WT"; 9974 break; 9975 default: 9976 printf( 9977 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 9978 __func__, index, range->sva, eva); 9979 mode = "??"; 9980 break; 9981 } 9982 9983 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n", 9984 range->sva, eva, 9985 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 9986 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 9987 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X', 9988 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's', 9989 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-', 9990 mode, range->l1blocks, range->l2contig, range->l2blocks, 9991 range->l3contig, range->l3pages); 9992 9993 /* Reset to sentinel value. */ 9994 range->sva = 0xfffffffffffffffful; 9995 } 9996 9997 /* 9998 * Determine whether the attributes specified by a page table entry match those 9999 * being tracked by the current range. 
10000 */ 10001 static bool 10002 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 10003 { 10004 10005 return (range->attrs == attrs); 10006 } 10007 10008 static void 10009 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 10010 pt_entry_t attrs) 10011 { 10012 10013 memset(range, 0, sizeof(*range)); 10014 range->sva = va; 10015 range->attrs = attrs; 10016 } 10017 10018 /* Get the block/page attributes that correspond to the table attributes */ 10019 static pt_entry_t 10020 sysctl_kmaps_table_attrs(pd_entry_t table) 10021 { 10022 pt_entry_t attrs; 10023 10024 attrs = 0; 10025 if ((table & TATTR_UXN_TABLE) != 0) 10026 attrs |= ATTR_S1_UXN; 10027 if ((table & TATTR_PXN_TABLE) != 0) 10028 attrs |= ATTR_S1_PXN; 10029 if ((table & TATTR_AP_TABLE_RO) != 0) 10030 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO); 10031 10032 return (attrs); 10033 } 10034 10035 /* Read the block/page attributes we care about */ 10036 static pt_entry_t 10037 sysctl_kmaps_block_attrs(pt_entry_t block) 10038 { 10039 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK | 10040 ATTR_S1_GP)); 10041 } 10042 10043 /* 10044 * Given a leaf PTE, derive the mapping's attributes. If they do not match 10045 * those of the current run, dump the address range and its attributes, and 10046 * begin a new run. 
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
    pt_entry_t l3e)
{
	pt_entry_t attrs;

	/* Start with the restrictions imposed by the L0 table entry. */
	attrs = sysctl_kmaps_table_attrs(l0e);

	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		/* L1 block mapping: no lower levels to consult. */
		attrs |= sysctl_kmaps_block_attrs(l1e);
		goto done;
	}
	attrs |= sysctl_kmaps_table_attrs(l1e);

	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		/* L2 block mapping. */
		attrs |= sysctl_kmaps_block_attrs(l2e);
		goto done;
	}
	attrs |= sysctl_kmaps_table_attrs(l2e);
	attrs |= sysctl_kmaps_block_attrs(l3e);

done:
	/* Flush and restart the range on an attribute change or a hole. */
	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

/*
 * Sysctl handler: walk the kernel's page tables and print, for each
 * contiguous range of mappings with identical attributes, the address
 * range, its permissions, memory type, and counts of mappings at each
 * page table level.
 */
static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pd_entry_t l0e, *l1, l1e, *l2, l2e;
	pt_entry_t *l3, l3e;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = 0xfffffffffffffffful;

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock. Kernel page table pages are never freed, so at worst we will
	 * observe inconsistencies in the output.
	 */
	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
	    i++) {
		/* Print a header when entering each well-known region. */
		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
			sbuf_printf(sb, "\nDirect map:\n");
		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
			sbuf_printf(sb, "\nKernel map:\n");
#ifdef KASAN
		else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
			sbuf_printf(sb, "\nKASAN shadow map:\n");
#endif
#ifdef KMSAN
		else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN shadow map:\n");
		else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN origin map:\n");
#endif

		l0e = kernel_pmap->pm_l0[i];
		if ((l0e & ATTR_DESCR_VALID) == 0) {
			/* Hole at the L0 level: flush any pending range. */
			sysctl_kmaps_dump(sb, &range, sva);
			sva += L0_SIZE;
			continue;
		}
		pa = PTE_TO_PHYS(l0e);
		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
			l1e = l1[j];
			if ((l1e & ATTR_DESCR_VALID) == 0) {
				sysctl_kmaps_dump(sb, &range, sva);
				sva += L1_SIZE;
				continue;
			}
			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
				    0, 0);
				range.l1blocks++;
				sva += L1_SIZE;
				continue;
			}
			pa = PTE_TO_PHYS(l1e);
			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
				l2e = l2[k];
				if ((l2e & ATTR_DESCR_VALID) == 0) {
					sysctl_kmaps_dump(sb, &range, sva);
					sva += L2_SIZE;
					continue;
				}
				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, 0);
					/*
					 * Count a contiguous (L2C) run once,
					 * at its first entry.
					 */
					if ((l2e & ATTR_CONTIGUOUS) != 0)
						range.l2contig +=
						    k % L2C_ENTRIES == 0 ?
						    1 : 0;
					else
						range.l2blocks++;
					sva += L2_SIZE;
					continue;
				}
				pa = PTE_TO_PHYS(l2e);
				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
				    l++, sva += L3_SIZE) {
					l3e = l3[l];
					if ((l3e & ATTR_DESCR_VALID) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, l3e);
					/*
					 * Count a contiguous (L3C) run once,
					 * at its first entry.
					 */
					if ((l3e & ATTR_CONTIGUOUS) != 0)
						range.l3contig +=
						    l % L3C_ENTRIES == 0 ?
						    1 : 0;
					else
						range.l3pages++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");

/* Alternative pagezero implementations; one is selected below at boot. */
void pagezero_simple(void *);
void pagezero_cache(void *);
void pagezero_mops(void *);

/*
 * Resolve the "pagezero" ifunc: prefer the FEAT_MOPS implementation when
 * the CPU advertises HWCAP2_MOPS, otherwise the cache-assisted variant
 * when DCZID_EL0 does not prohibit zeroing (DZP clear), otherwise the
 * simple store loop.
 */
DEFINE_IFUNC(static, void, pagezero, (void *))
{
	uint32_t dczid_el0;

	dczid_el0 = READ_SPECIALREG(dczid_el0);

	if (elf_hwcap2 & HWCAP2_MOPS)
		return (pagezero_mops);
	else if ((dczid_el0 & DCZID_DZP) == 0)
		return (pagezero_cache);
	else
		return (pagezero_simple);
}