/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2018 Matthew Macy
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_platform.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bitstring.h>
#include <sys/queue.h>
#include <sys/cpuset.h>
#include <sys/endian.h>
#include <sys/kerneldump.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/syslog.h>
#include <sys/msgbuf.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/smp.h>

#include <sys/kdb.h>

#include <dev/ofw/openfirm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/_inttypes.h>
#include <machine/cpu.h>
#include <machine/platform.h>
#include <machine/frame.h>
#include <machine/md_var.h>
#include <machine/psl.h>
#include <machine/bat.h>
#include <machine/hid.h>
#include <machine/pte.h>
#include <machine/sr.h>
#include <machine/trap.h>
#include <machine/mmuvar.h>

/* For pseries bit. */
#include <powerpc/pseries/phyp-hvcall.h>

#ifdef INVARIANTS
#include <vm/uma_dbg.h>
#endif

#define PPC_BITLSHIFT(bit)	(sizeof(long)*NBBY - 1 - (bit))
#define PPC_BIT(bit)		(1UL << PPC_BITLSHIFT(bit))
#define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit))
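/*
 * These helpers convert IBM (MSB 0) bit numbers into shifts: with a 64-bit
 * long, PPC_BITLSHIFT(0) is 63, so PPC_BIT(0) is 0x8000000000000000ul and
 * PPC_BIT(63) is 0x1ul.
 */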

#include "opt_ddb.h"

#ifdef DDB
static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va);
#endif

#define PG_W	RPTE_WIRED
#define PG_V	RPTE_VALID
#define PG_MANAGED	RPTE_MANAGED
#define PG_PROMOTED	RPTE_PROMOTED
#define PG_M	RPTE_C
#define PG_A	RPTE_R
#define PG_X	RPTE_EAA_X
#define PG_RW	RPTE_EAA_W
#define PG_PTE_CACHE	RPTE_ATTR_MASK

#define RPTE_SHIFT	9
#define NLS_MASK	((1UL<<5)-1)
#define RPTE_ENTRIES	(1UL<<RPTE_SHIFT)
#define RPTE_MASK	(RPTE_ENTRIES-1)

#define NLB_SHIFT	0
#define NLB_MASK	(((1UL<<52)-1) << 8)

extern int nkpt;
extern caddr_t crashdumpmap;

#define RIC_FLUSH_TLB 0
#define RIC_FLUSH_PWC 1
#define RIC_FLUSH_ALL 2

#define POWER9_TLB_SETS_RADIX	128	/* # sets in POWER9 TLB Radix mode */

#define PPC_INST_TLBIE	0x7c000264
#define PPC_INST_TLBIEL	0x7c000224
#define PPC_INST_SLBIA	0x7c0003e4

#define ___PPC_RA(a)	(((a) & 0x1f) << 16)
#define ___PPC_RB(b)	(((b) & 0x1f) << 11)
#define ___PPC_RS(s)	(((s) & 0x1f) << 21)
#define ___PPC_RT(t)	___PPC_RS(t)
#define ___PPC_R(r)	(((r) & 0x1) << 16)
#define ___PPC_PRS(prs)	(((prs) & 0x1) << 17)
#define ___PPC_RIC(ric)	(((ric) & 0x3) << 18)

#define PPC_SLBIA(IH)	__XSTRING(.long PPC_INST_SLBIA | \
				  ((IH & 0x7) << 21))
#define PPC_TLBIE_5(rb,rs,ric,prs,r)			\
	__XSTRING(.long PPC_INST_TLBIE |		\
		  ___PPC_RB(rb) | ___PPC_RS(rs) |	\
		  ___PPC_RIC(ric) | ___PPC_PRS(prs) |	\
		  ___PPC_R(r))

#define PPC_TLBIEL(rb,rs,ric,prs,r)			\
	__XSTRING(.long PPC_INST_TLBIEL |		\
		  ___PPC_RB(rb) | ___PPC_RS(rs) |	\
		  ___PPC_RIC(ric) | ___PPC_PRS(prs) |	\
		  ___PPC_R(r))

#define PPC_INVALIDATE_ERAT	PPC_SLBIA(7)

static __inline void
ttusync(void)
{
	__asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
}

#define TLBIEL_INVAL_SEL_MASK	0xc00	/* invalidation selector */
#define  TLBIEL_INVAL_PAGE	0x000	/* invalidate a single page */
#define  TLBIEL_INVAL_SET_PID	0x400	/* invalidate a set for the current PID */
#define  TLBIEL_INVAL_SET_LPID	0x800	/* invalidate a set for current LPID */
#define  TLBIEL_INVAL_SET	0xc00	/* invalidate a set for all LPIDs */

#define TLBIE_ACTUAL_PAGE_MASK	0xe0
#define  TLBIE_ACTUAL_PAGE_4K	0x00
#define  TLBIE_ACTUAL_PAGE_64K	0xa0
#define  TLBIE_ACTUAL_PAGE_2M	0x20
#define  TLBIE_ACTUAL_PAGE_1G	0x40

#define TLBIE_PRS_PARTITION_SCOPE	0x0
#define TLBIE_PRS_PROCESS_SCOPE	0x1

#define TLBIE_RIC_INVALIDATE_TLB	0x0	/* Invalidate just TLB */
#define TLBIE_RIC_INVALIDATE_PWC	0x1	/* Invalidate just PWC */
#define TLBIE_RIC_INVALIDATE_ALL	0x2	/* Invalidate TLB, PWC,
						 * cached {proc, part}tab entries
						 */
#define TLBIE_RIC_INVALIDATE_SEQ	0x3	/* HPT - only:
						 * Invalidate a range of translations
						 */

static __always_inline void
radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid,
    vm_offset_t va, uint16_t ap)
{
	uint64_t rb, rs;

	MPASS((va & PAGE_MASK) == 0);

	rs = ((uint64_t)pid << 32) | lpid;
	rb = va | is | ap;
	__asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : :
	    "r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory");
}

static __inline void
radix_tlbie_fixup(uint32_t pid, vm_offset_t va, int ap)
{

	__asm __volatile("ptesync" ::: "memory");
	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, 0, 0, va, ap);
	__asm __volatile("ptesync" ::: "memory");
	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, pid, 0, va, ap);
}

static __inline void
radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
	radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_4K);
}

static __inline void
radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M);
	radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_2M);
}

static __inline void
radix_tlbie_invlpwc_user(uint32_t pid)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
}

static __inline void
radix_tlbie_flush_user(uint32_t pid)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
}

static __inline void
radix_tlbie_invlpg_kernel_4k(vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K);
	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_4K);
}

static __inline void
radix_tlbie_invlpg_kernel_2m(vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M);
	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_2M);
}

/* 1GB pages aren't currently supported. */
static __inline __unused void
radix_tlbie_invlpg_kernel_1g(vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G);
	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_1G);
}

static __inline void
radix_tlbie_invlpwc_kernel(void)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
}

static __inline void
radix_tlbie_flush_kernel(void)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
}

static __inline vm_pindex_t
pmap_l3e_pindex(vm_offset_t va)
{
	return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT);
}

static __inline vm_pindex_t
pmap_pml3e_index(vm_offset_t va)
{

	return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK);
}

static __inline vm_pindex_t
pmap_pml2e_index(vm_offset_t va)
{
	return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK);
}

static __inline vm_pindex_t
pmap_pml1e_index(vm_offset_t va)
{
	return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT);
}

/* Return various clipped indexes for a given VA */
static __inline vm_pindex_t
pmap_pte_index(vm_offset_t va)
{

	return ((va >> PAGE_SHIFT) & RPTE_MASK);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va)
{
	pt_entry_t *pte;
	vm_paddr_t ptepa;

	ptepa = (be64toh(*l3e) & NLB_MASK);
	pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va)
{
	pt_entry_t *l3e;
	vm_paddr_t l3pa;

	l3pa = (be64toh(*l2e) & NLB_MASK);
	l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa);
	return (&l3e[pmap_pml3e_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va)
{
	pt_entry_t *l2e;
	vm_paddr_t l2pa;

	l2pa = (be64toh(*l1e) & NLB_MASK);

	l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa);
	return (&l2e[pmap_pml2e_index(va)]);
}

static __inline pml1_entry_t *
pmap_pml1e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml1[pmap_pml1e_index(va)]);
}

static pt_entry_t *
pmap_pml2e(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *l1e;

	l1e = pmap_pml1e(pmap, va);
	if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0)
		return (NULL);
	return (pmap_l1e_to_l2e(l1e, va));
}

static __inline pt_entry_t *
pmap_pml3e(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *l2e;

	l2e = pmap_pml2e(pmap, va);
	if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0)
		return (NULL);
	return (pmap_l2e_to_l3e(l2e, va));
}

static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *l3e;

	l3e = pmap_pml3e(pmap, va);
	if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
		return (NULL);
	return (pmap_l3e_to_pte(l3e, va));
}

int nkpt = 64;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

vm_paddr_t dmaplimit;

SYSCTL_DECL(_vm_pmap);

#ifdef INVARIANTS
#define VERBOSE_PMAP 0
#define VERBOSE_PROTECT 0
static int pmap_logging;
SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN,
    &pmap_logging, 0, "verbose debug logging");
#endif

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */

//static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */

static vm_offset_t qframe = 0;
static struct mtx qframe_mtx;

void mmu_radix_activate(struct thread *);
void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int);
void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
    vm_size_t);
void mmu_radix_clear_modify(vm_page_t);
void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *);
int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t);
void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
    vm_prot_t);
void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va);
vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
void mmu_radix_kenter(vm_offset_t, vm_paddr_t);
vm_paddr_t mmu_radix_kextract(vm_offset_t);
void mmu_radix_kremove(vm_offset_t);
boolean_t mmu_radix_is_modified(vm_page_t);
boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t);
boolean_t mmu_radix_is_referenced(vm_page_t);
void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t,
    vm_pindex_t, vm_size_t);
boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t);
void mmu_radix_page_init(vm_page_t);
boolean_t mmu_radix_page_is_mapped(vm_page_t m);
void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t);
int mmu_radix_page_wired_mappings(vm_page_t);
int mmu_radix_pinit(pmap_t);
void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
bool mmu_radix_ps_enabled(pmap_t);
void mmu_radix_qenter(vm_offset_t, vm_page_t *, int);
void mmu_radix_qremove(vm_offset_t, int);
vm_offset_t mmu_radix_quick_enter_page(vm_page_t);
void mmu_radix_quick_remove_page(vm_offset_t);
boolean_t mmu_radix_ts_referenced(vm_page_t);
void mmu_radix_release(pmap_t);
void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t);
void mmu_radix_remove_all(vm_page_t);
void mmu_radix_remove_pages(pmap_t);
void mmu_radix_remove_write(vm_page_t);
void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t);
void mmu_radix_zero_page(vm_page_t);
void mmu_radix_zero_page_area(vm_page_t, int, int);
int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
void mmu_radix_page_array_startup(long pages);

#include "mmu_oea64.h"

/*
 * Kernel MMU interface
 */

static void mmu_radix_bootstrap(vm_offset_t, vm_offset_t);

static void mmu_radix_copy_page(vm_page_t, vm_page_t);
static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
    vm_page_t *mb, vm_offset_t b_offset, int xfersize);
static void mmu_radix_growkernel(vm_offset_t);
static void mmu_radix_init(void);
static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
static void mmu_radix_pinit0(pmap_t);

static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t);
static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
static void mmu_radix_unmapdev(vm_offset_t, vm_size_t);
static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t);
static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
static void mmu_radix_scan_init(void);
static void mmu_radix_cpu_bootstrap(int ap);
static void mmu_radix_tlbie_all(void);

static struct pmap_funcs mmu_radix_methods = {
	.bootstrap = mmu_radix_bootstrap,
	.copy_page = mmu_radix_copy_page,
	.copy_pages = mmu_radix_copy_pages,
	.cpu_bootstrap = mmu_radix_cpu_bootstrap,
	.growkernel = mmu_radix_growkernel,
	.init = mmu_radix_init,
	.map = mmu_radix_map,
	.mincore = mmu_radix_mincore,
	.pinit = mmu_radix_pinit,
	.pinit0 = mmu_radix_pinit0,

	.mapdev = mmu_radix_mapdev,
	.mapdev_attr = mmu_radix_mapdev_attr,
	.unmapdev = mmu_radix_unmapdev,
	.kenter_attr = mmu_radix_kenter_attr,
	.dev_direct_mapped = mmu_radix_dev_direct_mapped,
	.dumpsys_pa_init = mmu_radix_scan_init,
	.dumpsys_map_chunk = mmu_radix_dumpsys_map,
	.page_is_mapped = mmu_radix_page_is_mapped,
	.ps_enabled = mmu_radix_ps_enabled,
	.align_superpage = mmu_radix_align_superpage,
	.object_init_pt = mmu_radix_object_init_pt,
	.protect = mmu_radix_protect,
	/* pmap dispatcher interface */
	.clear_modify = mmu_radix_clear_modify,
	.copy = mmu_radix_copy,
	.enter = mmu_radix_enter,
	.enter_object = mmu_radix_enter_object,
	.enter_quick = mmu_radix_enter_quick,
	.extract = mmu_radix_extract,
	.extract_and_hold = mmu_radix_extract_and_hold,
	.is_modified = mmu_radix_is_modified,
	.is_prefaultable = mmu_radix_is_prefaultable,
	.is_referenced = mmu_radix_is_referenced,
	.ts_referenced = mmu_radix_ts_referenced,
	.page_exists_quick = mmu_radix_page_exists_quick,
	.page_init = mmu_radix_page_init,
	.page_wired_mappings = mmu_radix_page_wired_mappings,
	.qenter = mmu_radix_qenter,
	.qremove = mmu_radix_qremove,
	.release = mmu_radix_release,
	.remove = mmu_radix_remove,
	.remove_all = mmu_radix_remove_all,
	.remove_write = mmu_radix_remove_write,
	.unwire = mmu_radix_unwire,
	.zero_page = mmu_radix_zero_page,
	.zero_page_area = mmu_radix_zero_page_area,
	.activate = mmu_radix_activate,
	.quick_enter_page = mmu_radix_quick_enter_page,
	.quick_remove_page = mmu_radix_quick_remove_page,
	.page_set_memattr = mmu_radix_page_set_memattr,
	.page_array_startup = mmu_radix_page_array_startup,

	/* Internal interfaces */
	.kenter = mmu_radix_kenter,
	.kextract = mmu_radix_kextract,
	.kremove = mmu_radix_kremove,
	.change_attr = mmu_radix_change_attr,
	.decode_kernel_ptr = mmu_radix_decode_kernel_ptr,

	.tlbie_all = mmu_radix_tlbie_all,
};
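
/*
 * Register the method table above with the MMU framework so that the
 * radix pmap implementation can be selected (as MMU_TYPE_RADIX) and
 * installed as the active MMU during boot.
 */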
MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods);

static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
    struct rwlock **lockp);
static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *);
static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde,
    struct spglist *free);
static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp);

static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e,
    u_int flags, struct rwlock **lockp);
#if VM_NRESERVLEVEL > 0
static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp);
#endif
static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate);

static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, struct rwlock **lockp);
static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde,
    u_int flags, vm_page_t m, struct rwlock **lockp);

static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void free_pv_chunk(struct pv_chunk *pc);
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp);
static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va,
    struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
    struct rwlock **lockp);
static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free);

static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start);
static void pmap_invalidate_all(pmap_t pmap);
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush);

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

#define UNIMPLEMENTED() panic("%s not implemented", __func__)
#define UNTESTED() panic("%s not yet tested", __func__)

/* Number of supported PID bits */
static unsigned int isa3_pid_bits;

/* PID to start allocating from */
static unsigned int isa3_base_pid;

#define PROCTAB_SIZE_SHIFT	(isa3_pid_bits + 4)
#define PROCTAB_ENTRIES	(1ul << isa3_pid_bits)
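
/*
 * Each process-table entry occupies 16 bytes (two doublewords), so the
 * table size is PROCTAB_ENTRIES * 16, i.e. 1 << (isa3_pid_bits + 4),
 * which is where the "+ 4" in PROCTAB_SIZE_SHIFT comes from.
 */
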
/*
 * Map of physical memory regions.
 */
static struct	mem_region *regions, *pregions;
static struct	numa_mem_region *numa_pregions;
static u_int	phys_avail_count;
static int	regions_sz, pregions_sz, numa_pregions_sz;
static struct pate *isa3_parttab;
static struct prte *isa3_proctab;
static vmem_t *asid_arena;

extern void bs_remap_earlyboot(void);

#define RADIX_PGD_SIZE_SHIFT	16
#define RADIX_PGD_SIZE	(1UL << RADIX_PGD_SIZE_SHIFT)

#define RADIX_PGD_INDEX_SHIFT	(RADIX_PGD_SIZE_SHIFT-3)
#define NL2EPG		(PAGE_SIZE/sizeof(pml2_entry_t))
#define NL3EPG		(PAGE_SIZE/sizeof(pml3_entry_t))

#define NUPML1E		(RADIX_PGD_SIZE/sizeof(uint64_t))	/* number of userland PML1 pages */
#define NUPDPE		(NUPML1E * NL2EPG)/* number of userland PDP pages */
#define NUPDE		(NUPDPE * NL3EPG)	/* number of userland PD entries */

/* POWER9 only permits a 64k partition table size. */
#define PARTTAB_SIZE_SHIFT	16
#define PARTTAB_SIZE	(1UL << PARTTAB_SIZE_SHIFT)

#define PARTTAB_HR		(1UL << 63) /* host uses radix */
#define PARTTAB_GR		(1UL << 63) /* guest uses radix must match host */

/* TLB flush actions. Used as argument to tlbiel_all() */
enum {
	TLB_INVAL_SCOPE_LPID = 0,	/* invalidate TLBs for current LPID */
	TLB_INVAL_SCOPE_GLOBAL = 1,	/* invalidate all TLBs */
};

#define NPV_LIST_LOCKS	MAXCPU
static int pmap_initialized;
static vm_paddr_t proctab0pa;
static vm_paddr_t parttab_phys;
CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);

/*
 * Data for the pv entry allocation mechanism.
 * Updates to pv_invl_gen are protected by the pv_list_locks[]
 * elements, but reads are not.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx __exclusive_cache_line pv_chunks_mutex;
static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define pa_radix_index(pa)	((pa) >> L3_PAGE_SIZE_SHIFT)
#define pa_to_pvh(pa)	(&pv_table[pa_radix_index(pa)])

#define PHYS_TO_PV_LIST_LOCK(pa)	\
	(&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS])

#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define VM_PAGE_TO_PV_LIST_LOCK(m)	\
	PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

/*
 * We support 52 bits, hence:
 * bits 52 - 31 = 21, 0b10101
 * RTS encoding details
 * bits 0 - 3 of rts -> bits 6 - 8 unsigned long
 * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long
 */
#define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5))
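/*
 * Spelled out: RTS = 21 = 0b10101 splits into a high part 0b10 (0x2) and a
 * low part 0b101 (0x5); shifting them by 61 and 5 respectively reproduces
 * the ((0x2UL << 61) | (0x5UL << 5)) value used for RTS_SIZE above.
 */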

static int powernv_enabled = 1;

static __always_inline void
tlbiel_radix_set_isa300(uint32_t set, uint32_t is,
    uint32_t pid, uint32_t ric, uint32_t prs)
{
	uint64_t rb;
	uint64_t rs;

	rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53);
	rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31);

	__asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
		     : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
		     : "memory");
}

static void
tlbiel_flush_isa3(uint32_t num_sets, uint32_t is)
{
	uint32_t set;

	__asm __volatile("ptesync": : :"memory");

	/*
	 * Flush the first set of the TLB, and the entire Page Walk Cache
	 * and partition table entries. Then flush the remaining sets of the
	 * TLB.
	 */
	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
	for (set = 1; set < num_sets; set++)
		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);

	/* Do the same for process scoped entries. */
	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
	for (set = 1; set < num_sets; set++)
		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);

	__asm __volatile("ptesync": : :"memory");
}

static void
mmu_radix_tlbiel_flush(int scope)
{
	int is;

	MPASS(scope == TLB_INVAL_SCOPE_LPID ||
		  scope == TLB_INVAL_SCOPE_GLOBAL);
	is = scope + 2;

	tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, is);
	__asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}

static void
mmu_radix_tlbie_all()
{
	if (powernv_enabled)
		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
	else
		mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
}

static void
mmu_radix_init_amor(void)
{
	/*
	* In HV mode, we init AMOR (Authority Mask Override Register) so that
	* the hypervisor and guest can setup IAMR (Instruction Authority Mask
	* Register), enable key 0 and set it to 1.
	*
	* AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	*/
	mtspr(SPR_AMOR, (3ul << 62));
}

static void
mmu_radix_init_iamr(void)
{
	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPR_IAMR, (1ul << 62));
}

static void
mmu_radix_pid_set(pmap_t pmap)
{

	mtspr(SPR_PID, pmap->pm_pid);
	isync();
}

/* Quick sort callout for comparing physical addresses. */
static int
pa_cmp(const void *a, const void *b)
{
	const vm_paddr_t *pa = a, *pb = b;

	if (*pa < *pb)
		return (-1);
	else if (*pa > *pb)
		return (1);
	else
		return (0);
}

#define	pte_load_store(ptep, pte)	atomic_swap_long(ptep, pte)
#define	pte_load_clear(ptep)		atomic_swap_long(ptep, 0)
#define	pte_store(ptep, pte) do {	   \
	MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X));	\
	*(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \
} while (0)
/*
 * NB: should only be used for adding directories - not for direct mappings
 */
#define	pde_store(ptep, pa) do {				\
	*(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \
} while (0)

#define	pte_clear(ptep) do {					\
		*(u_long *)(ptep) = (u_long)(0);		\
} while (0)

#define	PMAP_PDE_SUPERPAGE	(1 << 8)	/* supports 2MB superpages */

/*
 * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
 * (PTE) page mappings have identical settings for the following fields:
 */
#define	PG_PTE_PROMOTE	(PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \
	    PG_M | PG_A | RPTE_EAA_MASK | PG_V)

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static void
pagezero(vm_offset_t va)
{
	va = trunc_page(va);

	bzero((void *)va, PAGE_SIZE);
}

static uint64_t
allocpages(int n)
{
	u_int64_t ret;

	ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE);
	for (int i = 0; i < n; i++)
		pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE));
	return (ret);
}

static pt_entry_t *
kvtopte(vm_offset_t va)
{
	pt_entry_t *l3e;

	l3e = pmap_pml3e(kernel_pmap, va);
	if ((be64toh(*l3e) & RPTE_VALID) == 0)
		return (NULL);
	return (pmap_l3e_to_pte(l3e, va));
}

void
mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = kvtopte(va);
	MPASS(pte != NULL);
	*pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \
	    RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A);
}

bool
mmu_radix_ps_enabled(pmap_t pmap)
{
	return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
}

static pt_entry_t *
pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e)
{
	pml3_entry_t *l3e;
	pt_entry_t *pte;

	va &= PG_PS_FRAME;
	l3e = pmap_pml3e(pmap, va);
	if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0)
		return (NULL);

	if (be64toh(*l3e) & RPTE_LEAF) {
		*is_l3e = 1;
		return (l3e);
	}
	*is_l3e = 0;
	va &= PG_FRAME;
	pte = pmap_l3e_to_pte(l3e, va);
	if (pte == NULL || (be64toh(*pte) & PG_V) == 0)
		return (NULL);
	return (pte);
}
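
/*
 * Resolve a reference/change (R/C) bit fault: look the PTE up once without
 * the pmap lock, then revalidate it under the lock and set PG_A and/or PG_M
 * with an atomic compare-and-swap.  Returns 0 on success,
 * KERN_PROTECTION_FAILURE if the mapping does not permit the access, and
 * KERN_INVALID_ADDRESS or KERN_FAILURE if the update could not be made here.
 */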
int
pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags)
{
	pt_entry_t *pte;
	pt_entry_t startpte, origpte, newpte;
	vm_page_t m;
	int is_l3e;

	startpte = 0;
 retry:
	if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL)
		return (KERN_INVALID_ADDRESS);
	origpte = newpte = be64toh(*pte);
	if (startpte == 0) {
		startpte = origpte;
		if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) ||
		    ((flags & VM_PROT_READ) && (startpte & PG_A))) {
			pmap_invalidate_all(pmap);
#ifdef INVARIANTS
			if (VERBOSE_PMAP || pmap_logging)
				printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n",
				    __func__, pmap, va, flags, origpte);
#endif
			return (KERN_FAILURE);
		}
	}
#ifdef INVARIANTS
	if (VERBOSE_PMAP || pmap_logging)
		printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va,
		    flags, origpte);
#endif
	PMAP_LOCK(pmap);
	if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL ||
	    be64toh(*pte) != origpte) {
		PMAP_UNLOCK(pmap);
		return (KERN_FAILURE);
	}
	m = PHYS_TO_VM_PAGE(newpte & PG_FRAME);
	MPASS(m != NULL);
	switch (flags) {
	case VM_PROT_READ:
		if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0)
			goto protfail;
		newpte |= PG_A;
		vm_page_aflag_set(m, PGA_REFERENCED);
		break;
	case VM_PROT_WRITE:
		if ((newpte & RPTE_EAA_W) == 0)
			goto protfail;
		if (is_l3e)
			goto protfail;
		newpte |= PG_M;
		vm_page_dirty(m);
		break;
	case VM_PROT_EXECUTE:
		if ((newpte & RPTE_EAA_X) == 0)
			goto protfail;
		newpte |= PG_A;
		vm_page_aflag_set(m, PGA_REFERENCED);
		break;
	}

	if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
		goto retry;
	ptesync();
	PMAP_UNLOCK(pmap);
	if (startpte == newpte)
		return (KERN_FAILURE);
	return (0);
 protfail:
	PMAP_UNLOCK(pmap);
	return (KERN_PROTECTION_FAILURE);
}

/*
 * Returns TRUE if the given page is mapped individually or as part of
 * a 2mpage.  Otherwise, returns FALSE.
 */
boolean_t
mmu_radix_page_is_mapped(vm_page_t m)
{
	struct rwlock *lock;
	boolean_t rv;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (FALSE);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
	rw_runlock(lock);
	return (rv);
}

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
static int
pmap_cache_bits(vm_memattr_t ma)
{
	if (ma != VM_MEMATTR_DEFAULT) {
		switch (ma) {
		case VM_MEMATTR_UNCACHEABLE:
			return (RPTE_ATTR_GUARDEDIO);
		case VM_MEMATTR_CACHEABLE:
			return (RPTE_ATTR_MEM);
		case VM_MEMATTR_WRITE_BACK:
		case VM_MEMATTR_PREFETCHABLE:
		case VM_MEMATTR_WRITE_COMBINING:
			return (RPTE_ATTR_UNGUARDEDIO);
		}
	}
	return (0);
}

static void
pmap_invalidate_page(pmap_t pmap, vm_offset_t start)
{
	ptesync();
	if (pmap == kernel_pmap)
		radix_tlbie_invlpg_kernel_4k(start);
	else
		radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
	ttusync();
}

static void
pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start)
{
	ptesync();
	if (pmap == kernel_pmap)
		radix_tlbie_invlpg_kernel_2m(start);
	else
		radix_tlbie_invlpg_user_2m(pmap->pm_pid, start);
	ttusync();
}

static void
pmap_invalidate_pwc(pmap_t pmap)
{
	ptesync();
	if (pmap == kernel_pmap)
		radix_tlbie_invlpwc_kernel();
	else
		radix_tlbie_invlpwc_user(pmap->pm_pid);
	ttusync();
}

static void
pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end)
{
	/* Flush everything rather than iterating over a large range. */
	if (((end - start) >> PAGE_SHIFT) > 8) {
		pmap_invalidate_all(pmap);
		return;
	}
	ptesync();
	if (pmap == kernel_pmap) {
		while (start < end) {
			radix_tlbie_invlpg_kernel_4k(start);
			start += PAGE_SIZE;
		}
	} else {
		while (start < end) {
			radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
			start += PAGE_SIZE;
		}
	}
	ttusync();
}

static void
pmap_invalidate_all(pmap_t pmap)
{
	ptesync();
	if (pmap == kernel_pmap)
		radix_tlbie_flush_kernel();
	else
		radix_tlbie_flush_user(pmap->pm_pid);
	ttusync();
}

static void
pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e)
{

	/*
	 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
	 * by a promotion that did not invalidate the 512 4KB page mappings
	 * that might exist in the TLB.  Consequently, at this point, the TLB
	 * may hold both 4KB and 2MB page mappings for the address range [va,
	 * va + L3_PAGE_SIZE).  Therefore, the entire range must be invalidated
	 * here.  In contrast, when PG_PROMOTED is clear, the TLB will not hold
	 * any 4KB page mappings for the address range [va, va + L3_PAGE_SIZE),
	 * and so a single INVLPG suffices to invalidate the 2MB page mapping
	 * from the TLB.
	 */
	ptesync();
	if ((l3e & PG_PROMOTED) != 0)
		pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1);
	else
		pmap_invalidate_page_2m(pmap, va);

	pmap_invalidate_pwc(pmap);
}

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0x3ffffffffffffffful

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 };
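
/*
 * pc_map[] is a bitmap of free pv_entry slots within a chunk: PC_FREE0 and
 * PC_FREE1 together have one bit set for every entry in a fully free chunk,
 * so a chunk whose maps equal pc_freemask[] contains no allocated entries.
 */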

/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free;
	bool reclaimed;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	TAILQ_INIT(&new_tail);
retry:
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		//		if ((cpu_feature2 & CPUID2_POPCNT) == 0)
		bit_count((bitstr_t *)pc->pc_map, 0,
		    sizeof(pc->pc_map) * NBBY, &free);
#if 0
		free = popcnt_pc_map_pq(pc->pc_map);
#endif
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	for (reclaimed = false; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
			reclaimed = true;
		}
		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
		dump_add_page(m->phys_addr);
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		pc->pc_map[0] = PC_FREE0;
		pc->pc_map[1] = PC_FREE1;
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));

		/*
		 * The reclaim might have freed a chunk from the current pmap.
		 * If that chunk contained available entries, we need to
		 * re-count the number of available entries.
		 */
		if (reclaimed)
			goto retry;
	}
	if (!TAILQ_EMPTY(&new_tail)) {
		mtx_lock(&pv_chunks_mutex);
		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
		mtx_unlock(&pv_chunks_mutex);
	}
}

/*
 * First find and then remove the pv entry for the specified pmap and virtual
 * address from the specified pv list.  Returns the pv entry if found and NULL
 * otherwise.  This operation can be performed on pv lists for either 4KB or
 * 2MB page mappings.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
#ifdef INVARIANTS
		if (PV_PMAP(pv) == NULL) {
			printf("corrupted pv_chunk/pv %p\n", pv);
			printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":");
		}
		MPASS(PV_PMAP(pv) != NULL);
		MPASS(pv->pv_va != 0);
#endif
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
			pvh->pv_gen++;
			break;
		}
	}
	return (pv);
}

/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 */
static void
pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;
	int bit, field;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((pa & L3_PAGE_MASK) == 0,
	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	va = trunc_2mpage(va);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);

	m->md.pv_gen++;
	/* Instantiate the remaining NPTEPG - 1 pv entries. */
	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
	va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
	for (;;) {
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0
		    , ("pmap_pv_demote_pde: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field]) {
				bit = cnttzd(pc->pc_map[field]);
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
			    ("pmap_pv_demote_pde: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);

				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
}

static void
reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap)
{

	if (pmap == NULL)
		return;
	pmap_invalidate_all(pmap);
	if (pmap != locked_pmap)
		PMAP_UNLOCK(pmap);
}

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static int active_reclaims = 0;
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{
	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
	struct md_page *pvh;
	pml3_entry_t *l3e;
	pmap_t next_pmap, pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint64_t inuse;
	int bit, field, freed;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
	pmap = NULL;
	m_pc = NULL;
	SLIST_INIT(&free);
	bzero(&pc_marker_b, sizeof(pc_marker_b));
	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
	pc_marker = (struct pv_chunk *)&pc_marker_b;
	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;

	mtx_lock(&pv_chunks_mutex);
	active_reclaims++;
	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
	    SLIST_EMPTY(&free)) {
		next_pmap = pc->pc_pmap;
		if (next_pmap == NULL) {
			/*
			 * The next chunk is a marker.  However, it is
			 * not our marker, so active_reclaims must be
			 * > 1.  Consequently, the next_chunk code
			 * will not rotate the pv_chunks list.
			 */
			goto next_chunk;
		}
		mtx_unlock(&pv_chunks_mutex);

		/*
		 * A pv_chunk can only be removed from the pc_lru list
		 * when both pc_chunks_mutex is owned and the
		 * corresponding pmap is locked.
		 */
		if (pmap != next_pmap) {
			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
			pmap = next_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap) {
				RELEASE_PV_LIST_LOCK(lockp);
				PMAP_LOCK(pmap);
				mtx_lock(&pv_chunks_mutex);
				continue;
			} else if (pmap != locked_pmap) {
				if (PMAP_TRYLOCK(pmap)) {
					mtx_lock(&pv_chunks_mutex);
					continue;
				} else {
					pmap = NULL; /* pmap is not locked */
					mtx_lock(&pv_chunks_mutex);
					pc = TAILQ_NEXT(pc_marker, pc_lru);
					if (pc == NULL ||
					    pc->pc_pmap != next_pmap)
						continue;
					goto next_chunk;
				}
			}
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = cnttzd(inuse);
				pv = &pc->pc_pventry[field * 64 + bit];
				va = pv->pv_va;
				l3e = pmap_pml3e(pmap, va);
				if ((be64toh(*l3e) & RPTE_LEAF) != 0)
					continue;
				pte = pmap_l3e_to_pte(l3e, va);
				if ((be64toh(*pte) & PG_W) != 0)
					continue;
				tpte = be64toh(pte_load_clear(pte));
				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
					vm_page_dirty(m);
				if ((tpte & PG_A) != 0)
					vm_page_aflag_set(m, PGA_REFERENCED);
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);

				m->md.pv_gen++;
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, be64toh(*l3e), &free);
				freed++;
			}
		}
		if (freed == 0) {
			mtx_lock(&pv_chunks_mutex);
			goto next_chunk;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap_resident_count_dec(pmap, freed);
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) {
			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
			dump_drop_page(m_pc->phys_addr);
			mtx_lock(&pv_chunks_mutex);
			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
			break;
		}
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		mtx_lock(&pv_chunks_mutex);
		/* One freed pv entry in locked_pmap is sufficient. */
		if (pmap == locked_pmap)
			break;
next_chunk:
		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
		if (active_reclaims == 1 && pmap != NULL) {
			/*
			 * Rotate the pv chunks list so that we do not
			 * scan the same pv chunks that could not be
			 * freed (because they contained a wired
			 * and/or superpage mapping) on every
			 * invocation of reclaim_pv_chunk().
			 */
			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
				MPASS(pc->pc_pmap != NULL);
				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
			}
		}
	}
	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
	active_reclaims--;
	mtx_unlock(&pv_chunks_mutex);
	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->ref_count = 1;
	}
	vm_page_free_pages_toq(&free, true);
	return (m_pc);
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

#ifdef VERBOSE_PV
	if (pmap != kernel_pmap)
		printf("%s(%p, %p)\n", __func__, pmap, pv);
#endif
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	mtx_lock(&pv_chunks_mutex);
 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = cnttzd(pc->pc_map[field]);
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			MPASS(PV_PMAP(pv) != NULL);
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
	pc->pc_map[1] = PC_FREE1;
	mtx_lock(&pv_chunks_mutex);
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	MPASS(PV_PMAP(pv) != NULL);
	return (pv);
}

#if VM_NRESERVLEVEL > 0
/*
 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 * replace the many pv entries for the 4KB page mappings by a single pv entry
 * for the 2MB page mapping.
 */
static void
pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	KASSERT((pa & L3_PAGE_MASK) == 0,
	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the first page's pv entry for this mapping to the 2mpage's
	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
	 * a transfer avoids the possibility that get_pv_entry() calls
	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
	 * mappings that is being promoted.
	 */
	m = PHYS_TO_VM_PAGE(pa);
	va = trunc_2mpage(va);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
	pvh->pv_gen++;
	/* Free the remaining NPTEPG - 1 pv entries. */
	va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
		pmap_pvh_free(&m->md, pmap, va);
	} while (va < va_last);
}
#endif /* VM_NRESERVLEVEL > 0 */

/*
 * First find and then destroy the pv entry for the specified pmap and virtual
 * address.  This operation can be performed on pv lists for either 4KB or 2MB
 * page mappings.
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

/*
 * Conditionally create the PV entry for a 4KB page mapping if the required
 * memory can be allocated without resorting to reclamation.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp)
{
	pv_entry_t pv;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Pass NULL instead of the lock pointer to disable reclamation. */
	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
		pv->pv_va = va;
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
		m->md.pv_gen++;
		return (TRUE);
	} else
		return (FALSE);
}

vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX];
#ifdef INVARIANTS
static void
validate_addr(vm_paddr_t addr, vm_size_t size)
{
	vm_paddr_t end = addr + size;
	bool found = false;

	for (int i = 0; i < 2 * phys_avail_count; i += 2) {
		if (addr >= phys_avail_debug[i] &&
		    end <= phys_avail_debug[i + 1]) {
			found = true;
			break;
		}
	}
	KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array",
	    addr, end));
}
#else
static void validate_addr(vm_paddr_t addr, vm_size_t size) {}
#endif
#define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A)

static vm_paddr_t
alloc_pt_page(void)
{
	vm_paddr_t page;

	page = allocpages(1);
	pagezero(PHYS_TO_DMAP(page));
	return (page);
}
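
/*
 * Build direct-map translations for the physical range [start, end),
 * preferring 1GB (L2) and 2MB (L3) leaf entries when the current address is
 * suitably aligned and enough of the range remains, and falling back to 4KB
 * PTEs otherwise.  Intermediate page-table pages are allocated on demand.
 */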
pregions[i].mr_start; 1831 end = start + pregions[i].mr_size; 1832 if (hwphyssz && start >= hwphyssz) 1833 break; 1834 if (hwphyssz && hwphyssz < end) 1835 end = hwphyssz; 1836 mmu_radix_dmap_range(start, end); 1837 } 1838 } 1839 1840 static void 1841 mmu_radix_setup_pagetables(vm_size_t hwphyssz) 1842 { 1843 vm_paddr_t ptpages, pages; 1844 pt_entry_t *pte; 1845 vm_paddr_t l1phys; 1846 1847 bzero(kernel_pmap, sizeof(struct pmap)); 1848 PMAP_LOCK_INIT(kernel_pmap); 1849 1850 ptpages = allocpages(3); 1851 l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE); 1852 validate_addr(l1phys, RADIX_PGD_SIZE); 1853 if (bootverbose) 1854 printf("l1phys=%lx\n", l1phys); 1855 MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0); 1856 for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++) 1857 pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE)); 1858 kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys); 1859 1860 mmu_radix_dmap_populate(hwphyssz); 1861 1862 /* 1863 * Create page tables for first 128MB of KVA 1864 */ 1865 pages = ptpages; 1866 pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS); 1867 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); 1868 pages += PAGE_SIZE; 1869 pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS); 1870 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); 1871 pages += PAGE_SIZE; 1872 pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS); 1873 /* 1874 * the kernel page table pages need to be preserved in 1875 * phys_avail and not overlap with previous allocations 1876 */ 1877 pages = allocpages(nkpt); 1878 if (bootverbose) { 1879 printf("phys_avail after dmap populate and nkpt allocation\n"); 1880 for (int j = 0; j < 2 * phys_avail_count; j+=2) 1881 printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", 1882 j, phys_avail[j], j + 1, phys_avail[j + 1]); 1883 } 1884 KPTphys = pages; 1885 for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE) 1886 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); 1887 kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE; 1888 if (bootverbose) 1889 printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1); 1890 /* 1891 * Add a physical memory segment (vm_phys_seg) corresponding to the 1892 * preallocated kernel page table pages so that vm_page structures 1893 * representing these pages will be created. The vm_page structures 1894 * are required for promotion of the corresponding kernel virtual 1895 * addresses to superpage mappings. 
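 * The initial kernel page-table window created above spans nkpt L3
 * slots, so kernel_vm_end starts at VM_MIN_KERNEL_ADDRESS + nkpt *
 * L3_PAGE_SIZE.  With 2MB L3 pages that matches the "first 128MB of
 * KVA" figure above when nkpt is 64; treat 64 as an illustrative
 * value, since nkpt is defined elsewhere.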
1896 */ 1897 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1898 } 1899 1900 static void 1901 mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end) 1902 { 1903 vm_paddr_t kpstart, kpend; 1904 vm_size_t physsz, hwphyssz; 1905 //uint64_t l2virt; 1906 int rm_pavail, proctab_size; 1907 int i, j; 1908 1909 kpstart = start & ~DMAP_BASE_ADDRESS; 1910 kpend = end & ~DMAP_BASE_ADDRESS; 1911 1912 /* Get physical memory regions from firmware */ 1913 mem_regions(&pregions, &pregions_sz, &regions, &regions_sz); 1914 CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory"); 1915 1916 if (2 * VM_PHYSSEG_MAX < regions_sz) 1917 panic("mmu_radix_early_bootstrap: phys_avail too small"); 1918 1919 if (bootverbose) 1920 for (int i = 0; i < regions_sz; i++) 1921 printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n", 1922 i, regions[i].mr_start, i, regions[i].mr_size); 1923 /* 1924 * XXX workaround a simulator bug 1925 */ 1926 for (int i = 0; i < regions_sz; i++) 1927 if (regions[i].mr_start & PAGE_MASK) { 1928 regions[i].mr_start += PAGE_MASK; 1929 regions[i].mr_start &= ~PAGE_MASK; 1930 regions[i].mr_size &= ~PAGE_MASK; 1931 } 1932 if (bootverbose) 1933 for (int i = 0; i < pregions_sz; i++) 1934 printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n", 1935 i, pregions[i].mr_start, i, pregions[i].mr_size); 1936 1937 phys_avail_count = 0; 1938 physsz = 0; 1939 hwphyssz = 0; 1940 TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); 1941 for (i = 0, j = 0; i < regions_sz; i++) { 1942 if (bootverbose) 1943 printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n", 1944 i, regions[i].mr_start, i, regions[i].mr_size); 1945 1946 if (regions[i].mr_size < PAGE_SIZE) 1947 continue; 1948 1949 if (hwphyssz != 0 && 1950 (physsz + regions[i].mr_size) >= hwphyssz) { 1951 if (physsz < hwphyssz) { 1952 phys_avail[j] = regions[i].mr_start; 1953 phys_avail[j + 1] = regions[i].mr_start + 1954 (hwphyssz - physsz); 1955 physsz = hwphyssz; 1956 phys_avail_count++; 1957 dump_avail[j] = phys_avail[j]; 1958 dump_avail[j + 1] = phys_avail[j + 1]; 1959 } 1960 break; 1961 } 1962 phys_avail[j] = regions[i].mr_start; 1963 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; 1964 dump_avail[j] = phys_avail[j]; 1965 dump_avail[j + 1] = phys_avail[j + 1]; 1966 1967 phys_avail_count++; 1968 physsz += regions[i].mr_size; 1969 j += 2; 1970 } 1971 1972 /* Check for overlap with the kernel and exception vectors */ 1973 rm_pavail = 0; 1974 for (j = 0; j < 2 * phys_avail_count; j+=2) { 1975 if (phys_avail[j] < EXC_LAST) 1976 phys_avail[j] += EXC_LAST; 1977 1978 if (phys_avail[j] >= kpstart && 1979 phys_avail[j + 1] <= kpend) { 1980 phys_avail[j] = phys_avail[j + 1] = ~0; 1981 rm_pavail++; 1982 continue; 1983 } 1984 1985 if (kpstart >= phys_avail[j] && 1986 kpstart < phys_avail[j + 1]) { 1987 if (kpend < phys_avail[j + 1]) { 1988 phys_avail[2 * phys_avail_count] = 1989 (kpend & ~PAGE_MASK) + PAGE_SIZE; 1990 phys_avail[2 * phys_avail_count + 1] = 1991 phys_avail[j + 1]; 1992 phys_avail_count++; 1993 } 1994 1995 phys_avail[j + 1] = kpstart & ~PAGE_MASK; 1996 } 1997 1998 if (kpend >= phys_avail[j] && 1999 kpend < phys_avail[j + 1]) { 2000 if (kpstart > phys_avail[j]) { 2001 phys_avail[2 * phys_avail_count] = phys_avail[j]; 2002 phys_avail[2 * phys_avail_count + 1] = 2003 kpstart & ~PAGE_MASK; 2004 phys_avail_count++; 2005 } 2006 2007 phys_avail[j] = (kpend & ~PAGE_MASK) + 2008 PAGE_SIZE; 2009 } 2010 } 2011 qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp); 2012 for (i = 0; i < 2 * 
phys_avail_count; i++) 2013 phys_avail_debug[i] = phys_avail[i]; 2014 2015 /* Remove physical available regions marked for removal (~0) */ 2016 if (rm_pavail) { 2017 phys_avail_count -= rm_pavail; 2018 for (i = 2 * phys_avail_count; 2019 i < 2*(phys_avail_count + rm_pavail); i+=2) 2020 phys_avail[i] = phys_avail[i + 1] = 0; 2021 } 2022 if (bootverbose) { 2023 printf("phys_avail ranges after filtering:\n"); 2024 for (j = 0; j < 2 * phys_avail_count; j+=2) 2025 printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", 2026 j, phys_avail[j], j + 1, phys_avail[j + 1]); 2027 } 2028 physmem = btoc(physsz); 2029 2030 /* XXX assume we're running non-virtualized and 2031 * we don't support BHYVE 2032 */ 2033 if (isa3_pid_bits == 0) 2034 isa3_pid_bits = 20; 2035 if (powernv_enabled) { 2036 parttab_phys = 2037 moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE); 2038 validate_addr(parttab_phys, PARTTAB_SIZE); 2039 for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++) 2040 pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE)); 2041 2042 } 2043 proctab_size = 1UL << PROCTAB_SIZE_SHIFT; 2044 proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size); 2045 validate_addr(proctab0pa, proctab_size); 2046 for (int i = 0; i < proctab_size/PAGE_SIZE; i++) 2047 pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE)); 2048 2049 mmu_radix_setup_pagetables(hwphyssz); 2050 } 2051 2052 static void 2053 mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end) 2054 { 2055 int i; 2056 vm_paddr_t pa; 2057 void *dpcpu; 2058 vm_offset_t va; 2059 2060 /* 2061 * Set up the Open Firmware pmap and add its mappings if not in real 2062 * mode. 2063 */ 2064 if (bootverbose) 2065 printf("%s enter\n", __func__); 2066 2067 /* 2068 * Calculate the last available physical address, and reserve the 2069 * vm_page_array (upper bound). 2070 */ 2071 Maxmem = 0; 2072 for (i = 0; phys_avail[i + 2] != 0; i += 2) 2073 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); 2074 2075 /* 2076 * Set the start and end of kva. 2077 */ 2078 virtual_avail = VM_MIN_KERNEL_ADDRESS; 2079 virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; 2080 2081 /* 2082 * Remap any early IO mappings (console framebuffer, etc.) 2083 */ 2084 bs_remap_earlyboot(); 2085 2086 /* 2087 * Allocate a kernel stack with a guard page for thread0 and map it 2088 * into the kernel page map. 2089 */ 2090 pa = allocpages(kstack_pages); 2091 va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; 2092 virtual_avail = va + kstack_pages * PAGE_SIZE; 2093 CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); 2094 thread0.td_kstack = va; 2095 for (i = 0; i < kstack_pages; i++) { 2096 mmu_radix_kenter(va, pa); 2097 pa += PAGE_SIZE; 2098 va += PAGE_SIZE; 2099 } 2100 thread0.td_kstack_pages = kstack_pages; 2101 2102 /* 2103 * Allocate virtual address space for the message buffer. 2104 */ 2105 pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT); 2106 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa); 2107 2108 /* 2109 * Allocate virtual address space for the dynamic percpu area. 2110 */ 2111 pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT); 2112 dpcpu = (void *)PHYS_TO_DMAP(pa); 2113 dpcpu_init(dpcpu, curcpu); 2114 2115 crashdumpmap = (caddr_t)virtual_avail; 2116 virtual_avail += MAXDUMPPGS * PAGE_SIZE; 2117 2118 /* 2119 * Reserve some special page table entries/VA space for temporary 2120 * mapping of pages. 
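 * crashdumpmap above is one such reservation: MAXDUMPPGS pages worth of
 * KVA are set aside so that dump code can map arbitrary physical pages
 * while writing a crash dump.  A further single page of KVA (qframe) is
 * reserved later, in mmu_radix_init().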
2121 */ 2122 } 2123 2124 static void 2125 mmu_parttab_init(void) 2126 { 2127 uint64_t ptcr; 2128 2129 isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys); 2130 2131 if (bootverbose) 2132 printf("%s parttab: %p\n", __func__, isa3_parttab); 2133 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); 2134 if (bootverbose) 2135 printf("setting ptcr %lx\n", ptcr); 2136 mtspr(SPR_PTCR, ptcr); 2137 } 2138 2139 static void 2140 mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab) 2141 { 2142 uint64_t prev; 2143 2144 if (bootverbose) 2145 printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab, 2146 lpid, pagetab, proctab); 2147 prev = be64toh(isa3_parttab[lpid].pagetab); 2148 isa3_parttab[lpid].pagetab = htobe64(pagetab); 2149 isa3_parttab[lpid].proctab = htobe64(proctab); 2150 2151 if (prev & PARTTAB_HR) { 2152 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : 2153 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2154 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 2155 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2156 } else { 2157 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : 2158 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2159 } 2160 ttusync(); 2161 } 2162 2163 static void 2164 mmu_radix_parttab_init(void) 2165 { 2166 uint64_t pagetab; 2167 2168 mmu_parttab_init(); 2169 pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \ 2170 RADIX_PGD_INDEX_SHIFT | PARTTAB_HR; 2171 mmu_parttab_update(0, pagetab, 0); 2172 } 2173 2174 static void 2175 mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size) 2176 { 2177 uint64_t pagetab, proctab; 2178 2179 pagetab = be64toh(isa3_parttab[0].pagetab); 2180 proctab = proctabpa | table_size | PARTTAB_GR; 2181 mmu_parttab_update(0, pagetab, proctab); 2182 } 2183 2184 static void 2185 mmu_radix_proctab_init(void) 2186 { 2187 2188 isa3_base_pid = 1; 2189 2190 isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa); 2191 isa3_proctab->proctab0 = 2192 htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | 2193 RADIX_PGD_INDEX_SHIFT); 2194 2195 if (powernv_enabled) { 2196 mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12); 2197 __asm __volatile("ptesync" : : : "memory"); 2198 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 2199 "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); 2200 __asm __volatile("eieio; tlbsync; ptesync" : : : "memory"); 2201 #ifdef PSERIES 2202 } else { 2203 phyp_hcall(H_REGISTER_PROC_TBL, 2204 PROC_TABLE_NEW | PROC_TABLE_RADIX | PROC_TABLE_GTSE, 2205 proctab0pa, 0, PROCTAB_SIZE_SHIFT - 12); 2206 #endif 2207 } 2208 2209 if (bootverbose) 2210 printf("process table %p and kernel radix PDE: %p\n", 2211 isa3_proctab, kernel_pmap->pm_pml1); 2212 mtmsr(mfmsr() | PSL_DR ); 2213 mtmsr(mfmsr() & ~PSL_DR); 2214 kernel_pmap->pm_pid = isa3_base_pid; 2215 isa3_base_pid++; 2216 } 2217 2218 void 2219 mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2220 int advice) 2221 { 2222 struct rwlock *lock; 2223 pml1_entry_t *l1e; 2224 pml2_entry_t *l2e; 2225 pml3_entry_t oldl3e, *l3e; 2226 pt_entry_t *pte; 2227 vm_offset_t va, va_next; 2228 vm_page_t m; 2229 bool anychanged; 2230 2231 if (advice != MADV_DONTNEED && advice != MADV_FREE) 2232 return; 2233 anychanged = false; 2234 PMAP_LOCK(pmap); 2235 for (; sva < eva; sva = va_next) { 2236 l1e = pmap_pml1e(pmap, sva); 2237 if ((be64toh(*l1e) & PG_V) == 0) { 2238 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 2239 if (va_next < sva) 2240 va_next = eva; 2241 continue; 2242 } 2243 l2e = pmap_l1e_to_l2e(l1e, sva); 2244 if ((be64toh(*l2e) & PG_V) == 
0) { 2245 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 2246 if (va_next < sva) 2247 va_next = eva; 2248 continue; 2249 } 2250 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 2251 if (va_next < sva) 2252 va_next = eva; 2253 l3e = pmap_l2e_to_l3e(l2e, sva); 2254 oldl3e = be64toh(*l3e); 2255 if ((oldl3e & PG_V) == 0) 2256 continue; 2257 else if ((oldl3e & RPTE_LEAF) != 0) { 2258 if ((oldl3e & PG_MANAGED) == 0) 2259 continue; 2260 lock = NULL; 2261 if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { 2262 if (lock != NULL) 2263 rw_wunlock(lock); 2264 2265 /* 2266 * The large page mapping was destroyed. 2267 */ 2268 continue; 2269 } 2270 2271 /* 2272 * Unless the page mappings are wired, remove the 2273 * mapping to a single page so that a subsequent 2274 * access may repromote. Choosing the last page 2275 * within the address range [sva, min(va_next, eva)) 2276 * generally results in more repromotions. Since the 2277 * underlying page table page is fully populated, this 2278 * removal never frees a page table page. 2279 */ 2280 if ((oldl3e & PG_W) == 0) { 2281 va = eva; 2282 if (va > va_next) 2283 va = va_next; 2284 va -= PAGE_SIZE; 2285 KASSERT(va >= sva, 2286 ("mmu_radix_advise: no address gap")); 2287 pte = pmap_l3e_to_pte(l3e, va); 2288 KASSERT((be64toh(*pte) & PG_V) != 0, 2289 ("pmap_advise: invalid PTE")); 2290 pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL, 2291 &lock); 2292 anychanged = true; 2293 } 2294 if (lock != NULL) 2295 rw_wunlock(lock); 2296 } 2297 if (va_next > eva) 2298 va_next = eva; 2299 va = va_next; 2300 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; 2301 pte++, sva += PAGE_SIZE) { 2302 MPASS(pte == pmap_pte(pmap, sva)); 2303 2304 if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 2305 goto maybe_invlrng; 2306 else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2307 if (advice == MADV_DONTNEED) { 2308 /* 2309 * Future calls to pmap_is_modified() 2310 * can be avoided by making the page 2311 * dirty now. 2312 */ 2313 m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME); 2314 vm_page_dirty(m); 2315 } 2316 atomic_clear_long(pte, htobe64(PG_M | PG_A)); 2317 } else if ((be64toh(*pte) & PG_A) != 0) 2318 atomic_clear_long(pte, htobe64(PG_A)); 2319 else 2320 goto maybe_invlrng; 2321 anychanged = true; 2322 continue; 2323 maybe_invlrng: 2324 if (va != va_next) { 2325 anychanged = true; 2326 va = va_next; 2327 } 2328 } 2329 if (va != va_next) 2330 anychanged = true; 2331 } 2332 if (anychanged) 2333 pmap_invalidate_all(pmap); 2334 PMAP_UNLOCK(pmap); 2335 } 2336 2337 /* 2338 * Routines used in machine-dependent code 2339 */ 2340 static void 2341 mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end) 2342 { 2343 uint64_t lpcr; 2344 2345 if (bootverbose) 2346 printf("%s\n", __func__); 2347 hw_direct_map = 1; 2348 powernv_enabled = (mfmsr() & PSL_HV) ? 
1 : 0; 2349 mmu_radix_early_bootstrap(start, end); 2350 if (bootverbose) 2351 printf("early bootstrap complete\n"); 2352 if (powernv_enabled) { 2353 lpcr = mfspr(SPR_LPCR); 2354 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 2355 mmu_radix_parttab_init(); 2356 mmu_radix_init_amor(); 2357 if (bootverbose) 2358 printf("powernv init complete\n"); 2359 } 2360 mmu_radix_init_iamr(); 2361 mmu_radix_proctab_init(); 2362 mmu_radix_pid_set(kernel_pmap); 2363 if (powernv_enabled) 2364 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 2365 else 2366 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID); 2367 2368 mmu_radix_late_bootstrap(start, end); 2369 numa_mem_regions(&numa_pregions, &numa_pregions_sz); 2370 if (bootverbose) 2371 printf("%s done\n", __func__); 2372 pmap_bootstrapped = 1; 2373 dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE); 2374 PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS); 2375 } 2376 2377 static void 2378 mmu_radix_cpu_bootstrap(int ap) 2379 { 2380 uint64_t lpcr; 2381 uint64_t ptcr; 2382 2383 if (powernv_enabled) { 2384 lpcr = mfspr(SPR_LPCR); 2385 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 2386 2387 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); 2388 mtspr(SPR_PTCR, ptcr); 2389 mmu_radix_init_amor(); 2390 } 2391 mmu_radix_init_iamr(); 2392 mmu_radix_pid_set(kernel_pmap); 2393 if (powernv_enabled) 2394 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 2395 else 2396 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID); 2397 } 2398 2399 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0, 2400 "2MB page mapping counters"); 2401 2402 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions); 2403 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD, 2404 &pmap_l3e_demotions, "2MB page demotions"); 2405 2406 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings); 2407 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD, 2408 &pmap_l3e_mappings, "2MB page mappings"); 2409 2410 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures); 2411 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD, 2412 &pmap_l3e_p_failures, "2MB page promotion failures"); 2413 2414 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions); 2415 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD, 2416 &pmap_l3e_promotions, "2MB page promotions"); 2417 2418 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0, 2419 "1GB page mapping counters"); 2420 2421 static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions); 2422 SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD, 2423 &pmap_l2e_demotions, "1GB page demotions"); 2424 2425 void 2426 mmu_radix_clear_modify(vm_page_t m) 2427 { 2428 struct md_page *pvh; 2429 pmap_t pmap; 2430 pv_entry_t next_pv, pv; 2431 pml3_entry_t oldl3e, *l3e; 2432 pt_entry_t oldpte, *pte; 2433 struct rwlock *lock; 2434 vm_offset_t va; 2435 int md_gen, pvh_gen; 2436 2437 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2438 ("pmap_clear_modify: page %p is not managed", m)); 2439 vm_page_assert_busied(m); 2440 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 2441 2442 /* 2443 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 2444 * If the object containing the page is locked and the page is not 2445 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 2446 */ 2447 if ((m->a.flags & PGA_WRITEABLE) == 0) 2448 return; 2449 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 2450 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2451 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 2452 rw_wlock(lock); 2453 restart: 2454 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { 2455 pmap = PV_PMAP(pv); 2456 if (!PMAP_TRYLOCK(pmap)) { 2457 pvh_gen = pvh->pv_gen; 2458 rw_wunlock(lock); 2459 PMAP_LOCK(pmap); 2460 rw_wlock(lock); 2461 if (pvh_gen != pvh->pv_gen) { 2462 PMAP_UNLOCK(pmap); 2463 goto restart; 2464 } 2465 } 2466 va = pv->pv_va; 2467 l3e = pmap_pml3e(pmap, va); 2468 oldl3e = be64toh(*l3e); 2469 if ((oldl3e & PG_RW) != 0 && 2470 pmap_demote_l3e_locked(pmap, l3e, va, &lock) && 2471 (oldl3e & PG_W) == 0) { 2472 /* 2473 * Write protect the mapping to a 2474 * single page so that a subsequent 2475 * write access may repromote. 2476 */ 2477 va += VM_PAGE_TO_PHYS(m) - (oldl3e & 2478 PG_PS_FRAME); 2479 pte = pmap_l3e_to_pte(l3e, va); 2480 oldpte = be64toh(*pte); 2481 while (!atomic_cmpset_long(pte, 2482 htobe64(oldpte), 2483 htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW)))) 2484 oldpte = be64toh(*pte); 2485 vm_page_dirty(m); 2486 pmap_invalidate_page(pmap, va); 2487 } 2488 PMAP_UNLOCK(pmap); 2489 } 2490 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 2491 pmap = PV_PMAP(pv); 2492 if (!PMAP_TRYLOCK(pmap)) { 2493 md_gen = m->md.pv_gen; 2494 pvh_gen = pvh->pv_gen; 2495 rw_wunlock(lock); 2496 PMAP_LOCK(pmap); 2497 rw_wlock(lock); 2498 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 2499 PMAP_UNLOCK(pmap); 2500 goto restart; 2501 } 2502 } 2503 l3e = pmap_pml3e(pmap, pv->pv_va); 2504 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found" 2505 " a 2mpage in page %p's pv list", m)); 2506 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 2507 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2508 atomic_clear_long(pte, htobe64(PG_M)); 2509 pmap_invalidate_page(pmap, pv->pv_va); 2510 } 2511 PMAP_UNLOCK(pmap); 2512 } 2513 rw_wunlock(lock); 2514 } 2515 2516 void 2517 mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 2518 vm_size_t len, vm_offset_t src_addr) 2519 { 2520 struct rwlock *lock; 2521 struct spglist free; 2522 vm_offset_t addr; 2523 vm_offset_t end_addr = src_addr + len; 2524 vm_offset_t va_next; 2525 vm_page_t dst_pdpg, dstmpte, srcmpte; 2526 bool invalidate_all; 2527 2528 CTR6(KTR_PMAP, 2529 "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n", 2530 __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); 2531 2532 if (dst_addr != src_addr) 2533 return; 2534 lock = NULL; 2535 invalidate_all = false; 2536 if (dst_pmap < src_pmap) { 2537 PMAP_LOCK(dst_pmap); 2538 PMAP_LOCK(src_pmap); 2539 } else { 2540 PMAP_LOCK(src_pmap); 2541 PMAP_LOCK(dst_pmap); 2542 } 2543 2544 for (addr = src_addr; addr < end_addr; addr = va_next) { 2545 pml1_entry_t *l1e; 2546 pml2_entry_t *l2e; 2547 pml3_entry_t srcptepaddr, *l3e; 2548 pt_entry_t *src_pte, *dst_pte; 2549 2550 l1e = pmap_pml1e(src_pmap, addr); 2551 if ((be64toh(*l1e) & PG_V) == 0) { 2552 va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 2553 if (va_next < addr) 2554 va_next = end_addr; 2555 continue; 2556 } 2557 2558 l2e = pmap_l1e_to_l2e(l1e, addr); 2559 if ((be64toh(*l2e) & PG_V) == 0) { 2560 va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 2561 if (va_next < addr) 2562 va_next = end_addr; 2563 continue; 2564 } 2565 2566 va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 2567 if (va_next < addr) 2568 va_next = end_addr; 2569 2570 l3e = pmap_l2e_to_l3e(l2e, addr); 2571 srcptepaddr = be64toh(*l3e); 2572 if (srcptepaddr == 0) 2573 continue; 2574 2575 if (srcptepaddr & RPTE_LEAF) { 2576 if 
((addr & L3_PAGE_MASK) != 0 || 2577 addr + L3_PAGE_SIZE > end_addr) 2578 continue; 2579 dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL); 2580 if (dst_pdpg == NULL) 2581 break; 2582 l3e = (pml3_entry_t *) 2583 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); 2584 l3e = &l3e[pmap_pml3e_index(addr)]; 2585 if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 2586 pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr, 2587 PMAP_ENTER_NORECLAIM, &lock))) { 2588 *l3e = htobe64(srcptepaddr & ~PG_W); 2589 pmap_resident_count_inc(dst_pmap, 2590 L3_PAGE_SIZE / PAGE_SIZE); 2591 counter_u64_add(pmap_l3e_mappings, 1); 2592 } else 2593 dst_pdpg->ref_count--; 2594 continue; 2595 } 2596 2597 srcptepaddr &= PG_FRAME; 2598 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 2599 KASSERT(srcmpte->ref_count > 0, 2600 ("pmap_copy: source page table page is unused")); 2601 2602 if (va_next > end_addr) 2603 va_next = end_addr; 2604 2605 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 2606 src_pte = &src_pte[pmap_pte_index(addr)]; 2607 dstmpte = NULL; 2608 while (addr < va_next) { 2609 pt_entry_t ptetemp; 2610 ptetemp = be64toh(*src_pte); 2611 /* 2612 * we only virtual copy managed pages 2613 */ 2614 if ((ptetemp & PG_MANAGED) != 0) { 2615 if (dstmpte != NULL && 2616 dstmpte->pindex == pmap_l3e_pindex(addr)) 2617 dstmpte->ref_count++; 2618 else if ((dstmpte = pmap_allocpte(dst_pmap, 2619 addr, NULL)) == NULL) 2620 goto out; 2621 dst_pte = (pt_entry_t *) 2622 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 2623 dst_pte = &dst_pte[pmap_pte_index(addr)]; 2624 if (be64toh(*dst_pte) == 0 && 2625 pmap_try_insert_pv_entry(dst_pmap, addr, 2626 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 2627 &lock)) { 2628 /* 2629 * Clear the wired, modified, and 2630 * accessed (referenced) bits 2631 * during the copy. 2632 */ 2633 *dst_pte = htobe64(ptetemp & ~(PG_W | PG_M | 2634 PG_A)); 2635 pmap_resident_count_inc(dst_pmap, 1); 2636 } else { 2637 SLIST_INIT(&free); 2638 if (pmap_unwire_ptp(dst_pmap, addr, 2639 dstmpte, &free)) { 2640 /* 2641 * Although "addr" is not 2642 * mapped, paging-structure 2643 * caches could nonetheless 2644 * have entries that refer to 2645 * the freed page table pages. 2646 * Invalidate those entries. 
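 * Rather than flushing here, invalidate_all is set and a
 * single pmap_invalidate_all() is issued at the "out:" label
 * once the copy loop has finished, so one flush covers the
 * whole copied range.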
2647 */ 2648 invalidate_all = true; 2649 vm_page_free_pages_toq(&free, 2650 true); 2651 } 2652 goto out; 2653 } 2654 if (dstmpte->ref_count >= srcmpte->ref_count) 2655 break; 2656 } 2657 addr += PAGE_SIZE; 2658 if (__predict_false((addr & L3_PAGE_MASK) == 0)) 2659 src_pte = pmap_pte(src_pmap, addr); 2660 else 2661 src_pte++; 2662 } 2663 } 2664 out: 2665 if (invalidate_all) 2666 pmap_invalidate_all(dst_pmap); 2667 if (lock != NULL) 2668 rw_wunlock(lock); 2669 PMAP_UNLOCK(src_pmap); 2670 PMAP_UNLOCK(dst_pmap); 2671 } 2672 2673 static void 2674 mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst) 2675 { 2676 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 2677 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 2678 2679 CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); 2680 /* 2681 * XXX slow 2682 */ 2683 bcopy((void *)src, (void *)dst, PAGE_SIZE); 2684 } 2685 2686 static void 2687 mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 2688 vm_offset_t b_offset, int xfersize) 2689 { 2690 void *a_cp, *b_cp; 2691 vm_offset_t a_pg_offset, b_pg_offset; 2692 int cnt; 2693 2694 CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, 2695 a_offset, mb, b_offset, xfersize); 2696 2697 while (xfersize > 0) { 2698 a_pg_offset = a_offset & PAGE_MASK; 2699 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 2700 a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( 2701 VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + 2702 a_pg_offset; 2703 b_pg_offset = b_offset & PAGE_MASK; 2704 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 2705 b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( 2706 VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + 2707 b_pg_offset; 2708 bcopy(a_cp, b_cp, cnt); 2709 a_offset += cnt; 2710 b_offset += cnt; 2711 xfersize -= cnt; 2712 } 2713 } 2714 2715 #if VM_NRESERVLEVEL > 0 2716 /* 2717 * Tries to promote the 512, contiguous 4KB page mappings that are within a 2718 * single page table page (PTP) to a single 2MB page mapping. For promotion 2719 * to occur, two conditions must be met: (1) the 4KB page mappings must map 2720 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 2721 * identical characteristics. 2722 */ 2723 static int 2724 pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va, 2725 struct rwlock **lockp) 2726 { 2727 pml3_entry_t newpde; 2728 pt_entry_t *firstpte, oldpte, pa, *pte; 2729 vm_page_t mpte; 2730 2731 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2732 2733 /* 2734 * Examine the first PTE in the specified PTP. Abort if this PTE is 2735 * either invalid, unused, or does not map the first 4KB physical page 2736 * within a 2MB page. 2737 */ 2738 firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME); 2739 setpde: 2740 newpde = be64toh(*firstpte); 2741 if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 2742 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2743 " in pmap %p", va, pmap); 2744 goto fail; 2745 } 2746 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 2747 /* 2748 * When PG_M is already clear, PG_RW can be cleared without 2749 * a TLB invalidation. 2750 */ 2751 if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W))) 2752 goto setpde; 2753 newpde &= ~RPTE_EAA_W; 2754 } 2755 2756 /* 2757 * Examine each of the other PTEs in the specified PTP. Abort if this 2758 * PTE maps an unexpected 4KB physical page or does not have identical 2759 * characteristics to the first PTE. 
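 * The loop below scans from the last PTE in the PTP back toward
 * firstpte.  "pa" is primed with the frame (plus PG_A | PG_V) expected
 * in that final slot and is decremented by PAGE_SIZE per step, so slot
 * i must map (newpde & PG_PS_FRAME) + i * PAGE_SIZE; the remaining
 * attributes are compared via PG_PTE_PROMOTE.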
2760 */ 2761 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE; 2762 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 2763 setpte: 2764 oldpte = be64toh(*pte); 2765 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 2766 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2767 " in pmap %p", va, pmap); 2768 goto fail; 2769 } 2770 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 2771 /* 2772 * When PG_M is already clear, PG_RW can be cleared 2773 * without a TLB invalidation. 2774 */ 2775 if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W))) 2776 goto setpte; 2777 oldpte &= ~RPTE_EAA_W; 2778 CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx" 2779 " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) | 2780 (va & ~L3_PAGE_MASK), pmap); 2781 } 2782 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 2783 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2784 " in pmap %p", va, pmap); 2785 goto fail; 2786 } 2787 pa -= PAGE_SIZE; 2788 } 2789 2790 /* 2791 * Save the page table page in its current state until the PDE 2792 * mapping the superpage is demoted by pmap_demote_pde() or 2793 * destroyed by pmap_remove_pde(). 2794 */ 2795 mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME); 2796 KASSERT(mpte >= vm_page_array && 2797 mpte < &vm_page_array[vm_page_array_size], 2798 ("pmap_promote_l3e: page table page is out of range")); 2799 KASSERT(mpte->pindex == pmap_l3e_pindex(va), 2800 ("pmap_promote_l3e: page table page's pindex is wrong")); 2801 if (pmap_insert_pt_page(pmap, mpte)) { 2802 CTR2(KTR_PMAP, 2803 "pmap_promote_l3e: failure for va %#lx in pmap %p", va, 2804 pmap); 2805 goto fail; 2806 } 2807 2808 /* 2809 * Promote the pv entries. 2810 */ 2811 if ((newpde & PG_MANAGED) != 0) 2812 pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp); 2813 2814 pte_store(pde, PG_PROMOTED | newpde); 2815 ptesync(); 2816 counter_u64_add(pmap_l3e_promotions, 1); 2817 CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx" 2818 " in pmap %p", va, pmap); 2819 return (0); 2820 fail: 2821 counter_u64_add(pmap_l3e_p_failures, 1); 2822 return (KERN_FAILURE); 2823 } 2824 #endif /* VM_NRESERVLEVEL > 0 */ 2825 2826 int 2827 mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, 2828 vm_prot_t prot, u_int flags, int8_t psind) 2829 { 2830 struct rwlock *lock; 2831 pml3_entry_t *l3e; 2832 pt_entry_t *pte; 2833 pt_entry_t newpte, origpte; 2834 pv_entry_t pv; 2835 vm_paddr_t opa, pa; 2836 vm_page_t mpte, om; 2837 int rv, retrycount; 2838 boolean_t nosleep, invalidate_all, invalidate_page; 2839 2840 va = trunc_page(va); 2841 retrycount = 0; 2842 invalidate_page = invalidate_all = false; 2843 CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va, 2844 m, prot, flags, psind); 2845 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 2846 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 2847 ("pmap_enter: managed mapping within the clean submap")); 2848 if ((m->oflags & VPO_UNMANAGED) == 0) 2849 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2850 2851 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 2852 ("pmap_enter: flags %u has reserved bits set", flags)); 2853 pa = VM_PAGE_TO_PHYS(m); 2854 newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF); 2855 if ((flags & VM_PROT_WRITE) != 0) 2856 newpte |= PG_M; 2857 if ((flags & VM_PROT_READ) != 0) 2858 newpte |= PG_A; 2859 if (prot & VM_PROT_READ) 2860 newpte |= RPTE_EAA_R; 2861 if ((prot & VM_PROT_WRITE) != 0) 2862 newpte |= RPTE_EAA_W; 2863 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 
2864 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 2865 2866 if (prot & VM_PROT_EXECUTE) 2867 newpte |= PG_X; 2868 if ((flags & PMAP_ENTER_WIRED) != 0) 2869 newpte |= PG_W; 2870 if (va >= DMAP_MIN_ADDRESS) 2871 newpte |= RPTE_EAA_P; 2872 newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs); 2873 /* 2874 * Set modified bit gratuitously for writeable mappings if 2875 * the page is unmanaged. We do not want to take a fault 2876 * to do the dirty bit accounting for these mappings. 2877 */ 2878 if ((m->oflags & VPO_UNMANAGED) != 0) { 2879 if ((newpte & PG_RW) != 0) 2880 newpte |= PG_M; 2881 } else 2882 newpte |= PG_MANAGED; 2883 2884 lock = NULL; 2885 PMAP_LOCK(pmap); 2886 if (psind == 1) { 2887 /* Assert the required virtual and physical alignment. */ 2888 KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned")); 2889 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2890 rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock); 2891 goto out; 2892 } 2893 mpte = NULL; 2894 2895 /* 2896 * In the case that a page table page is not 2897 * resident, we are creating it here. 2898 */ 2899 retry: 2900 l3e = pmap_pml3e(pmap, va); 2901 if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 || 2902 pmap_demote_l3e_locked(pmap, l3e, va, &lock))) { 2903 pte = pmap_l3e_to_pte(l3e, va); 2904 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 2905 mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); 2906 mpte->ref_count++; 2907 } 2908 } else if (va < VM_MAXUSER_ADDRESS) { 2909 /* 2910 * Here if the pte page isn't mapped, or if it has been 2911 * deallocated. 2912 */ 2913 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2914 mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va), 2915 nosleep ? NULL : &lock); 2916 if (mpte == NULL && nosleep) { 2917 rv = KERN_RESOURCE_SHORTAGE; 2918 goto out; 2919 } 2920 if (__predict_false(retrycount++ == 6)) 2921 panic("too many retries"); 2922 invalidate_all = true; 2923 goto retry; 2924 } else 2925 panic("pmap_enter: invalid page directory va=%#lx", va); 2926 2927 origpte = be64toh(*pte); 2928 pv = NULL; 2929 2930 /* 2931 * Is the specified virtual address already mapped? 2932 */ 2933 if ((origpte & PG_V) != 0) { 2934 #ifdef INVARIANTS 2935 if (VERBOSE_PMAP || pmap_logging) { 2936 printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --" 2937 " asid=%lu curpid=%d name=%s origpte0x%lx\n", 2938 pmap, va, m, prot, flags, psind, pmap->pm_pid, 2939 curproc->p_pid, curproc->p_comm, origpte); 2940 pmap_pte_walk(pmap->pm_pml1, va); 2941 } 2942 #endif 2943 /* 2944 * Wiring change, just update stats. We don't worry about 2945 * wiring PT pages as they remain resident as long as there 2946 * are valid mappings in them. Hence, if a user page is wired, 2947 * the PT page will be also. 2948 */ 2949 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 2950 pmap->pm_stats.wired_count++; 2951 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 2952 pmap->pm_stats.wired_count--; 2953 2954 /* 2955 * Remove the extra PT page reference. 2956 */ 2957 if (mpte != NULL) { 2958 mpte->ref_count--; 2959 KASSERT(mpte->ref_count > 0, 2960 ("pmap_enter: missing reference to page table page," 2961 " va: 0x%lx", va)); 2962 } 2963 2964 /* 2965 * Has the physical page changed? 2966 */ 2967 opa = origpte & PG_FRAME; 2968 if (opa == pa) { 2969 /* 2970 * No, might be a protection or wiring change. 
2971 */ 2972 if ((origpte & PG_MANAGED) != 0 && 2973 (newpte & PG_RW) != 0) 2974 vm_page_aflag_set(m, PGA_WRITEABLE); 2975 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) { 2976 if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) { 2977 if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte))) 2978 goto retry; 2979 if ((newpte & PG_M) != (origpte & PG_M)) 2980 vm_page_dirty(m); 2981 if ((newpte & PG_A) != (origpte & PG_A)) 2982 vm_page_aflag_set(m, PGA_REFERENCED); 2983 ptesync(); 2984 } else 2985 invalidate_all = true; 2986 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 2987 goto unchanged; 2988 } 2989 goto validate; 2990 } 2991 2992 /* 2993 * The physical page has changed. Temporarily invalidate 2994 * the mapping. This ensures that all threads sharing the 2995 * pmap keep a consistent view of the mapping, which is 2996 * necessary for the correct handling of COW faults. It 2997 * also permits reuse of the old mapping's PV entry, 2998 * avoiding an allocation. 2999 * 3000 * For consistency, handle unmanaged mappings the same way. 3001 */ 3002 origpte = be64toh(pte_load_clear(pte)); 3003 KASSERT((origpte & PG_FRAME) == opa, 3004 ("pmap_enter: unexpected pa update for %#lx", va)); 3005 if ((origpte & PG_MANAGED) != 0) { 3006 om = PHYS_TO_VM_PAGE(opa); 3007 3008 /* 3009 * The pmap lock is sufficient to synchronize with 3010 * concurrent calls to pmap_page_test_mappings() and 3011 * pmap_ts_referenced(). 3012 */ 3013 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3014 vm_page_dirty(om); 3015 if ((origpte & PG_A) != 0) 3016 vm_page_aflag_set(om, PGA_REFERENCED); 3017 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3018 pv = pmap_pvh_remove(&om->md, pmap, va); 3019 if ((newpte & PG_MANAGED) == 0) 3020 free_pv_entry(pmap, pv); 3021 #ifdef INVARIANTS 3022 else if (origpte & PG_MANAGED) { 3023 if (pv == NULL) { 3024 pmap_page_print_mappings(om); 3025 MPASS(pv != NULL); 3026 } 3027 } 3028 #endif 3029 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3030 TAILQ_EMPTY(&om->md.pv_list) && 3031 ((om->flags & PG_FICTITIOUS) != 0 || 3032 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3033 vm_page_aflag_clear(om, PGA_WRITEABLE); 3034 } 3035 if ((origpte & PG_A) != 0) 3036 invalidate_page = true; 3037 origpte = 0; 3038 } else { 3039 if (pmap != kernel_pmap) { 3040 #ifdef INVARIANTS 3041 if (VERBOSE_PMAP || pmap_logging) 3042 printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n", 3043 pmap, va, m, prot, flags, psind, 3044 pmap->pm_pid, curproc->p_pid, 3045 curproc->p_comm); 3046 #endif 3047 } 3048 3049 /* 3050 * Increment the counters. 3051 */ 3052 if ((newpte & PG_W) != 0) 3053 pmap->pm_stats.wired_count++; 3054 pmap_resident_count_inc(pmap, 1); 3055 } 3056 3057 /* 3058 * Enter on the PV list if part of our managed memory. 3059 */ 3060 if ((newpte & PG_MANAGED) != 0) { 3061 if (pv == NULL) { 3062 pv = get_pv_entry(pmap, &lock); 3063 pv->pv_va = va; 3064 } 3065 #ifdef VERBOSE_PV 3066 else 3067 printf("reassigning pv: %p to pmap: %p\n", 3068 pv, pmap); 3069 #endif 3070 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3071 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 3072 m->md.pv_gen++; 3073 if ((newpte & PG_RW) != 0) 3074 vm_page_aflag_set(m, PGA_WRITEABLE); 3075 } 3076 3077 /* 3078 * Update the PTE. 
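 * TLB invalidation is deferred here: the code below only sets
 * invalidate_page or invalidate_all, and a single
 * pmap_invalidate_page() or pmap_invalidate_all() is issued near the
 * end of the function, after the optional superpage promotion attempt.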
3079 */ 3080 if ((origpte & PG_V) != 0) { 3081 validate: 3082 origpte = be64toh(pte_load_store(pte, htobe64(newpte))); 3083 KASSERT((origpte & PG_FRAME) == pa, 3084 ("pmap_enter: unexpected pa update for %#lx", va)); 3085 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 3086 (PG_M | PG_RW)) { 3087 if ((origpte & PG_MANAGED) != 0) 3088 vm_page_dirty(m); 3089 invalidate_page = true; 3090 3091 /* 3092 * Although the PTE may still have PG_RW set, TLB 3093 * invalidation may nonetheless be required because 3094 * the PTE no longer has PG_M set. 3095 */ 3096 } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) { 3097 /* 3098 * Removing capabilities requires invalidation on POWER 3099 */ 3100 invalidate_page = true; 3101 goto unchanged; 3102 } 3103 if ((origpte & PG_A) != 0) 3104 invalidate_page = true; 3105 } else { 3106 pte_store(pte, newpte); 3107 ptesync(); 3108 } 3109 unchanged: 3110 3111 #if VM_NRESERVLEVEL > 0 3112 /* 3113 * If both the page table page and the reservation are fully 3114 * populated, then attempt promotion. 3115 */ 3116 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 3117 mmu_radix_ps_enabled(pmap) && 3118 (m->flags & PG_FICTITIOUS) == 0 && 3119 vm_reserv_level_iffullpop(m) == 0 && 3120 pmap_promote_l3e(pmap, l3e, va, &lock) == 0) 3121 invalidate_all = true; 3122 #endif 3123 if (invalidate_all) 3124 pmap_invalidate_all(pmap); 3125 else if (invalidate_page) 3126 pmap_invalidate_page(pmap, va); 3127 3128 rv = KERN_SUCCESS; 3129 out: 3130 if (lock != NULL) 3131 rw_wunlock(lock); 3132 PMAP_UNLOCK(pmap); 3133 3134 return (rv); 3135 } 3136 3137 /* 3138 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 3139 * if successful. Returns false if (1) a page table page cannot be allocated 3140 * without sleeping, (2) a mapping already exists at the specified virtual 3141 * address, or (3) a PV entry cannot be allocated without reclaiming another 3142 * PV entry. 3143 */ 3144 static bool 3145 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3146 struct rwlock **lockp) 3147 { 3148 pml3_entry_t newpde; 3149 3150 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3151 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) | 3152 RPTE_LEAF | PG_V; 3153 if ((m->oflags & VPO_UNMANAGED) == 0) 3154 newpde |= PG_MANAGED; 3155 if (prot & VM_PROT_EXECUTE) 3156 newpde |= PG_X; 3157 if (prot & VM_PROT_READ) 3158 newpde |= RPTE_EAA_R; 3159 if (va >= DMAP_MIN_ADDRESS) 3160 newpde |= RPTE_EAA_P; 3161 return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 3162 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 3163 KERN_SUCCESS); 3164 } 3165 3166 /* 3167 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3168 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 3169 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 3170 * a mapping already exists at the specified virtual address. Returns 3171 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 3172 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 3173 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 3174 * 3175 * The parameter "m" is only used when creating a managed, writeable mapping. 
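 * For example, pmap_enter_2mpage() above passes PMAP_ENTER_NOSLEEP |
 * PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM and a NULL "m" for its
 * read- and/or execute-only mappings, while mmu_radix_enter() passes
 * its caller's flags and page when psind == 1.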
3176 */ 3177 static int 3178 pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, 3179 vm_page_t m, struct rwlock **lockp) 3180 { 3181 struct spglist free; 3182 pml3_entry_t oldl3e, *l3e; 3183 vm_page_t mt, pdpg; 3184 3185 KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, 3186 ("pmap_enter_pde: newpde is missing PG_M")); 3187 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3188 3189 if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 3190 NULL : lockp)) == NULL) { 3191 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3192 " in pmap %p", va, pmap); 3193 return (KERN_RESOURCE_SHORTAGE); 3194 } 3195 l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 3196 l3e = &l3e[pmap_pml3e_index(va)]; 3197 oldl3e = be64toh(*l3e); 3198 if ((oldl3e & PG_V) != 0) { 3199 KASSERT(pdpg->ref_count > 1, 3200 ("pmap_enter_pde: pdpg's wire count is too low")); 3201 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3202 pdpg->ref_count--; 3203 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3204 " in pmap %p", va, pmap); 3205 return (KERN_FAILURE); 3206 } 3207 /* Break the existing mapping(s). */ 3208 SLIST_INIT(&free); 3209 if ((oldl3e & RPTE_LEAF) != 0) { 3210 /* 3211 * The reference to the PD page that was acquired by 3212 * pmap_allocl3e() ensures that it won't be freed. 3213 * However, if the PDE resulted from a promotion, then 3214 * a reserved PT page could be freed. 3215 */ 3216 (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp); 3217 pmap_invalidate_l3e_page(pmap, va, oldl3e); 3218 } else { 3219 if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e, 3220 &free, lockp)) 3221 pmap_invalidate_all(pmap); 3222 } 3223 vm_page_free_pages_toq(&free, true); 3224 if (va >= VM_MAXUSER_ADDRESS) { 3225 mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); 3226 if (pmap_insert_pt_page(pmap, mt)) { 3227 /* 3228 * XXX Currently, this can't happen because 3229 * we do not perform pmap_enter(psind == 1) 3230 * on the kernel pmap. 3231 */ 3232 panic("pmap_enter_pde: trie insert failed"); 3233 } 3234 } else 3235 KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p", 3236 l3e)); 3237 } 3238 if ((newpde & PG_MANAGED) != 0) { 3239 /* 3240 * Abort this mapping if its PV entry could not be created. 3241 */ 3242 if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) { 3243 SLIST_INIT(&free); 3244 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 3245 /* 3246 * Although "va" is not mapped, paging- 3247 * structure caches could nonetheless have 3248 * entries that refer to the freed page table 3249 * pages. Invalidate those entries. 3250 */ 3251 pmap_invalidate_page(pmap, va); 3252 vm_page_free_pages_toq(&free, true); 3253 } 3254 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3255 " in pmap %p", va, pmap); 3256 return (KERN_RESOURCE_SHORTAGE); 3257 } 3258 if ((newpde & PG_RW) != 0) { 3259 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 3260 vm_page_aflag_set(mt, PGA_WRITEABLE); 3261 } 3262 } 3263 3264 /* 3265 * Increment counters. 3266 */ 3267 if ((newpde & PG_W) != 0) 3268 pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE; 3269 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); 3270 3271 /* 3272 * Map the superpage. (This is not a promoted mapping; there will not 3273 * be any lingering 4KB page mappings in the TLB.) 
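 * Contrast this with pmap_promote_l3e(), which stores PG_PROMOTED
 * along with its new PDE: there, 4KB TLB entries for the old mappings
 * may still linger, and the flag presumably lets later teardown know
 * that those entries must be flushed.  Here the PDE is stored without
 * PG_PROMOTED.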
3274 */ 3275 pte_store(l3e, newpde); 3276 ptesync(); 3277 3278 counter_u64_add(pmap_l3e_mappings, 1); 3279 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3280 " in pmap %p", va, pmap); 3281 return (KERN_SUCCESS); 3282 } 3283 3284 void 3285 mmu_radix_enter_object(pmap_t pmap, vm_offset_t start, 3286 vm_offset_t end, vm_page_t m_start, vm_prot_t prot) 3287 { 3288 3289 struct rwlock *lock; 3290 vm_offset_t va; 3291 vm_page_t m, mpte; 3292 vm_pindex_t diff, psize; 3293 bool invalidate; 3294 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3295 3296 CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, 3297 end, m_start, prot); 3298 3299 invalidate = false; 3300 psize = atop(end - start); 3301 mpte = NULL; 3302 m = m_start; 3303 lock = NULL; 3304 PMAP_LOCK(pmap); 3305 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3306 va = start + ptoa(diff); 3307 if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end && 3308 m->psind == 1 && mmu_radix_ps_enabled(pmap) && 3309 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3310 m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1]; 3311 else 3312 mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot, 3313 mpte, &lock, &invalidate); 3314 m = TAILQ_NEXT(m, listq); 3315 } 3316 ptesync(); 3317 if (lock != NULL) 3318 rw_wunlock(lock); 3319 if (invalidate) 3320 pmap_invalidate_all(pmap); 3321 PMAP_UNLOCK(pmap); 3322 } 3323 3324 static vm_page_t 3325 mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3326 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate) 3327 { 3328 struct spglist free; 3329 pt_entry_t *pte; 3330 vm_paddr_t pa; 3331 3332 KASSERT(!VA_IS_CLEANMAP(va) || 3333 (m->oflags & VPO_UNMANAGED) != 0, 3334 ("mmu_radix_enter_quick_locked: managed mapping within the clean submap")); 3335 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3336 3337 /* 3338 * In the case that a page table page is not 3339 * resident, we are creating it here. 3340 */ 3341 if (va < VM_MAXUSER_ADDRESS) { 3342 vm_pindex_t ptepindex; 3343 pml3_entry_t *ptepa; 3344 3345 /* 3346 * Calculate pagetable page index 3347 */ 3348 ptepindex = pmap_l3e_pindex(va); 3349 if (mpte && (mpte->pindex == ptepindex)) { 3350 mpte->ref_count++; 3351 } else { 3352 /* 3353 * Get the page directory entry 3354 */ 3355 ptepa = pmap_pml3e(pmap, va); 3356 3357 /* 3358 * If the page table page is mapped, we just increment 3359 * the hold count, and activate it. Otherwise, we 3360 * attempt to allocate a page table page. If this 3361 * attempt fails, we don't retry. Instead, we give up. 3362 */ 3363 if (ptepa && (be64toh(*ptepa) & PG_V) != 0) { 3364 if (be64toh(*ptepa) & RPTE_LEAF) 3365 return (NULL); 3366 mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME); 3367 mpte->ref_count++; 3368 } else { 3369 /* 3370 * Pass NULL instead of the PV list lock 3371 * pointer, because we don't intend to sleep. 3372 */ 3373 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 3374 if (mpte == NULL) 3375 return (mpte); 3376 } 3377 } 3378 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3379 pte = &pte[pmap_pte_index(va)]; 3380 } else { 3381 mpte = NULL; 3382 pte = pmap_pte(pmap, va); 3383 } 3384 if (be64toh(*pte)) { 3385 if (mpte != NULL) { 3386 mpte->ref_count--; 3387 mpte = NULL; 3388 } 3389 return (mpte); 3390 } 3391 3392 /* 3393 * Enter on the PV list if part of our managed memory. 
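 * A failed pv allocation here is not fatal: this is the opportunistic
 * path used by mmu_radix_enter_object() and mmu_radix_enter_quick(),
 * and pmap_try_insert_pv_entry() deliberately disables reclamation.
 * On failure the code below just unwires the page table page it may
 * have allocated and returns without creating the mapping.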
3394 */ 3395 if ((m->oflags & VPO_UNMANAGED) == 0 && 3396 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3397 if (mpte != NULL) { 3398 SLIST_INIT(&free); 3399 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3400 /* 3401 * Although "va" is not mapped, paging- 3402 * structure caches could nonetheless have 3403 * entries that refer to the freed page table 3404 * pages. Invalidate those entries. 3405 */ 3406 *invalidate = true; 3407 vm_page_free_pages_toq(&free, true); 3408 } 3409 mpte = NULL; 3410 } 3411 return (mpte); 3412 } 3413 3414 /* 3415 * Increment counters 3416 */ 3417 pmap_resident_count_inc(pmap, 1); 3418 3419 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs); 3420 if (prot & VM_PROT_EXECUTE) 3421 pa |= PG_X; 3422 else 3423 pa |= RPTE_EAA_R; 3424 if ((m->oflags & VPO_UNMANAGED) == 0) 3425 pa |= PG_MANAGED; 3426 3427 pte_store(pte, pa); 3428 return (mpte); 3429 } 3430 3431 void 3432 mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, 3433 vm_prot_t prot) 3434 { 3435 struct rwlock *lock; 3436 bool invalidate; 3437 3438 lock = NULL; 3439 invalidate = false; 3440 PMAP_LOCK(pmap); 3441 mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock, 3442 &invalidate); 3443 ptesync(); 3444 if (lock != NULL) 3445 rw_wunlock(lock); 3446 if (invalidate) 3447 pmap_invalidate_all(pmap); 3448 PMAP_UNLOCK(pmap); 3449 } 3450 3451 vm_paddr_t 3452 mmu_radix_extract(pmap_t pmap, vm_offset_t va) 3453 { 3454 pml3_entry_t *l3e; 3455 pt_entry_t *pte; 3456 vm_paddr_t pa; 3457 3458 l3e = pmap_pml3e(pmap, va); 3459 if (__predict_false(l3e == NULL)) 3460 return (0); 3461 if (be64toh(*l3e) & RPTE_LEAF) { 3462 pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK); 3463 pa |= (va & L3_PAGE_MASK); 3464 } else { 3465 /* 3466 * Beware of a concurrent promotion that changes the 3467 * PDE at this point! For example, vtopte() must not 3468 * be used to access the PTE because it would use the 3469 * new PDE. It is, however, safe to use the old PDE 3470 * because the page table page is preserved by the 3471 * promotion. 
3472 */ 3473 pte = pmap_l3e_to_pte(l3e, va); 3474 if (__predict_false(pte == NULL)) 3475 return (0); 3476 pa = be64toh(*pte); 3477 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3478 pa |= (va & PAGE_MASK); 3479 } 3480 return (pa); 3481 } 3482 3483 vm_page_t 3484 mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3485 { 3486 pml3_entry_t l3e, *l3ep; 3487 pt_entry_t pte; 3488 vm_page_t m; 3489 3490 m = NULL; 3491 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); 3492 PMAP_LOCK(pmap); 3493 l3ep = pmap_pml3e(pmap, va); 3494 if (l3ep != NULL && (l3e = be64toh(*l3ep))) { 3495 if (l3e & RPTE_LEAF) { 3496 if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0) 3497 m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) | 3498 (va & L3_PAGE_MASK)); 3499 } else { 3500 /* Native endian PTE, do not pass to pmap functions */ 3501 pte = be64toh(*pmap_l3e_to_pte(l3ep, va)); 3502 if ((pte & PG_V) && 3503 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) 3504 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3505 } 3506 if (m != NULL && !vm_page_wire_mapped(m)) 3507 m = NULL; 3508 } 3509 PMAP_UNLOCK(pmap); 3510 return (m); 3511 } 3512 3513 static void 3514 mmu_radix_growkernel(vm_offset_t addr) 3515 { 3516 vm_paddr_t paddr; 3517 vm_page_t nkpg; 3518 pml3_entry_t *l3e; 3519 pml2_entry_t *l2e; 3520 3521 CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); 3522 if (VM_MIN_KERNEL_ADDRESS < addr && 3523 addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE)) 3524 return; 3525 3526 addr = roundup2(addr, L3_PAGE_SIZE); 3527 if (addr - 1 >= vm_map_max(kernel_map)) 3528 addr = vm_map_max(kernel_map); 3529 while (kernel_vm_end < addr) { 3530 l2e = pmap_pml2e(kernel_pmap, kernel_vm_end); 3531 if ((be64toh(*l2e) & PG_V) == 0) { 3532 /* We need a new PDP entry */ 3533 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT, 3534 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 3535 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3536 if (nkpg == NULL) 3537 panic("pmap_growkernel: no memory to grow kernel"); 3538 if ((nkpg->flags & PG_ZERO) == 0) 3539 mmu_radix_zero_page(nkpg); 3540 paddr = VM_PAGE_TO_PHYS(nkpg); 3541 pde_store(l2e, paddr); 3542 continue; /* try again */ 3543 } 3544 l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end); 3545 if ((be64toh(*l3e) & PG_V) != 0) { 3546 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 3547 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3548 kernel_vm_end = vm_map_max(kernel_map); 3549 break; 3550 } 3551 continue; 3552 } 3553 3554 nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end), 3555 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3556 VM_ALLOC_ZERO); 3557 if (nkpg == NULL) 3558 panic("pmap_growkernel: no memory to grow kernel"); 3559 if ((nkpg->flags & PG_ZERO) == 0) 3560 mmu_radix_zero_page(nkpg); 3561 paddr = VM_PAGE_TO_PHYS(nkpg); 3562 pde_store(l3e, paddr); 3563 3564 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 3565 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3566 kernel_vm_end = vm_map_max(kernel_map); 3567 break; 3568 } 3569 } 3570 ptesync(); 3571 } 3572 3573 static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory"); 3574 static uma_zone_t zone_radix_pgd; 3575 3576 static int 3577 radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused, 3578 int flags) 3579 { 3580 3581 for (int i = 0; i < count; i++) { 3582 vm_page_t m = vm_page_alloc_contig(NULL, 0, 3583 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3584 VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE, 3585 0, (vm_paddr_t)-1, RADIX_PGD_SIZE, 
L1_PAGE_SIZE, 3586 VM_MEMATTR_DEFAULT); 3587 /* XXX zero on alloc here so we don't have to later */ 3588 store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3589 } 3590 return (count); 3591 } 3592 3593 static void 3594 radix_pgd_release(void *arg __unused, void **store, int count) 3595 { 3596 vm_page_t m; 3597 struct spglist free; 3598 int page_count; 3599 3600 SLIST_INIT(&free); 3601 page_count = RADIX_PGD_SIZE/PAGE_SIZE; 3602 3603 for (int i = 0; i < count; i++) { 3604 /* 3605 * XXX selectively remove dmap and KVA entries so we don't 3606 * need to bzero 3607 */ 3608 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); 3609 for (int j = page_count-1; j >= 0; j--) { 3610 vm_page_unwire_noq(&m[j]); 3611 SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss); 3612 } 3613 vm_page_free_pages_toq(&free, false); 3614 } 3615 } 3616 3617 static void 3618 mmu_radix_init() 3619 { 3620 vm_page_t mpte; 3621 vm_size_t s; 3622 int error, i, pv_npg; 3623 3624 /* XXX is this really needed for POWER? */ 3625 /* L1TF, reserve page @0 unconditionally */ 3626 vm_page_blacklist_add(0, bootverbose); 3627 3628 zone_radix_pgd = uma_zcache_create("radix_pgd_cache", 3629 RADIX_PGD_SIZE, NULL, NULL, 3630 #ifdef INVARIANTS 3631 trash_init, trash_fini, 3632 #else 3633 NULL, NULL, 3634 #endif 3635 radix_pgd_import, radix_pgd_release, 3636 NULL, UMA_ZONE_NOBUCKET); 3637 3638 /* 3639 * Initialize the vm page array entries for the kernel pmap's 3640 * page table pages. 3641 */ 3642 PMAP_LOCK(kernel_pmap); 3643 for (i = 0; i < nkpt; i++) { 3644 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 3645 KASSERT(mpte >= vm_page_array && 3646 mpte < &vm_page_array[vm_page_array_size], 3647 ("pmap_init: page table page is out of range size: %lu", 3648 vm_page_array_size)); 3649 mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i; 3650 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 3651 MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte); 3652 //pmap_insert_pt_page(kernel_pmap, mpte); 3653 mpte->ref_count = 1; 3654 } 3655 PMAP_UNLOCK(kernel_pmap); 3656 vm_wire_add(nkpt); 3657 3658 CTR1(KTR_PMAP, "%s()", __func__); 3659 TAILQ_INIT(&pv_dummy.pv_list); 3660 3661 /* 3662 * Are large page mappings enabled? 3663 */ 3664 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); 3665 if (superpages_enabled) { 3666 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 3667 ("pmap_init: can't assign to pagesizes[1]")); 3668 pagesizes[1] = L3_PAGE_SIZE; 3669 } 3670 3671 /* 3672 * Initialize the pv chunk list mutex. 3673 */ 3674 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 3675 3676 /* 3677 * Initialize the pool of pv list locks. 3678 */ 3679 for (i = 0; i < NPV_LIST_LOCKS; i++) 3680 rw_init(&pv_list_locks[i], "pmap pv list"); 3681 3682 /* 3683 * Calculate the size of the pv head table for superpages. 3684 */ 3685 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE); 3686 3687 /* 3688 * Allocate memory for the pv head table for superpages. 
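 * The table is sized as one struct md_page per potential 2MB frame
 * below the end of the highest physical segment: pv_npg =
 * howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE).  For
 * example, a machine whose memory ends at 16GB needs 8192 entries,
 * regardless of how much of that range is actually populated.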
3689 */ 3690 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 3691 s = round_page(s); 3692 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 3693 for (i = 0; i < pv_npg; i++) 3694 TAILQ_INIT(&pv_table[i].pv_list); 3695 TAILQ_INIT(&pv_dummy.pv_list); 3696 3697 pmap_initialized = 1; 3698 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 3699 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3700 (vmem_addr_t *)&qframe); 3701 3702 if (error != 0) 3703 panic("qframe allocation failed"); 3704 asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits), 3705 1, 1, M_WAITOK); 3706 } 3707 3708 static boolean_t 3709 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3710 { 3711 struct rwlock *lock; 3712 pv_entry_t pv; 3713 struct md_page *pvh; 3714 pt_entry_t *pte, mask; 3715 pmap_t pmap; 3716 int md_gen, pvh_gen; 3717 boolean_t rv; 3718 3719 rv = FALSE; 3720 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3721 rw_rlock(lock); 3722 restart: 3723 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 3724 pmap = PV_PMAP(pv); 3725 if (!PMAP_TRYLOCK(pmap)) { 3726 md_gen = m->md.pv_gen; 3727 rw_runlock(lock); 3728 PMAP_LOCK(pmap); 3729 rw_rlock(lock); 3730 if (md_gen != m->md.pv_gen) { 3731 PMAP_UNLOCK(pmap); 3732 goto restart; 3733 } 3734 } 3735 pte = pmap_pte(pmap, pv->pv_va); 3736 mask = 0; 3737 if (modified) 3738 mask |= PG_RW | PG_M; 3739 if (accessed) 3740 mask |= PG_V | PG_A; 3741 rv = (be64toh(*pte) & mask) == mask; 3742 PMAP_UNLOCK(pmap); 3743 if (rv) 3744 goto out; 3745 } 3746 if ((m->flags & PG_FICTITIOUS) == 0) { 3747 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3748 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 3749 pmap = PV_PMAP(pv); 3750 if (!PMAP_TRYLOCK(pmap)) { 3751 md_gen = m->md.pv_gen; 3752 pvh_gen = pvh->pv_gen; 3753 rw_runlock(lock); 3754 PMAP_LOCK(pmap); 3755 rw_rlock(lock); 3756 if (md_gen != m->md.pv_gen || 3757 pvh_gen != pvh->pv_gen) { 3758 PMAP_UNLOCK(pmap); 3759 goto restart; 3760 } 3761 } 3762 pte = pmap_pml3e(pmap, pv->pv_va); 3763 mask = 0; 3764 if (modified) 3765 mask |= PG_RW | PG_M; 3766 if (accessed) 3767 mask |= PG_V | PG_A; 3768 rv = (be64toh(*pte) & mask) == mask; 3769 PMAP_UNLOCK(pmap); 3770 if (rv) 3771 goto out; 3772 } 3773 } 3774 out: 3775 rw_runlock(lock); 3776 return (rv); 3777 } 3778 3779 /* 3780 * pmap_is_modified: 3781 * 3782 * Return whether or not the specified physical page was modified 3783 * in any physical maps. 3784 */ 3785 boolean_t 3786 mmu_radix_is_modified(vm_page_t m) 3787 { 3788 3789 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3790 ("pmap_is_modified: page %p is not managed", m)); 3791 3792 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3793 /* 3794 * If the page is not busied then this check is racy. 
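* A page can only be dirtied through a writable mapping, so if
* PGA_WRITEABLE is clear the page cannot be modified and the scan of
* its mappings below is skipped.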
3795 */ 3796 if (!pmap_page_is_write_mapped(m)) 3797 return (FALSE); 3798 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3799 } 3800 3801 boolean_t 3802 mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3803 { 3804 pml3_entry_t *l3e; 3805 pt_entry_t *pte; 3806 boolean_t rv; 3807 3808 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); 3809 rv = FALSE; 3810 PMAP_LOCK(pmap); 3811 l3e = pmap_pml3e(pmap, addr); 3812 if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) { 3813 pte = pmap_l3e_to_pte(l3e, addr); 3814 rv = (be64toh(*pte) & PG_V) == 0; 3815 } 3816 PMAP_UNLOCK(pmap); 3817 return (rv); 3818 } 3819 3820 boolean_t 3821 mmu_radix_is_referenced(vm_page_t m) 3822 { 3823 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3824 ("pmap_is_referenced: page %p is not managed", m)); 3825 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3826 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3827 } 3828 3829 /* 3830 * pmap_ts_referenced: 3831 * 3832 * Return a count of reference bits for a page, clearing those bits. 3833 * It is not necessary for every reference bit to be cleared, but it 3834 * is necessary that 0 only be returned when there are truly no 3835 * reference bits set. 3836 * 3837 * As an optimization, update the page's dirty field if a modified bit is 3838 * found while counting reference bits. This opportunistic update can be 3839 * performed at low cost and can eliminate the need for some future calls 3840 * to pmap_is_modified(). However, since this function stops after 3841 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3842 * dirty pages. Those dirty pages will only be detected by a future call 3843 * to pmap_is_modified(). 3844 * 3845 * A DI block is not needed within this function, because 3846 * invalidations are performed before the PV list lock is 3847 * released. 3848 */ 3849 boolean_t 3850 mmu_radix_ts_referenced(vm_page_t m) 3851 { 3852 struct md_page *pvh; 3853 pv_entry_t pv, pvf; 3854 pmap_t pmap; 3855 struct rwlock *lock; 3856 pml3_entry_t oldl3e, *l3e; 3857 pt_entry_t *pte; 3858 vm_paddr_t pa; 3859 int cleared, md_gen, not_cleared, pvh_gen; 3860 struct spglist free; 3861 3862 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3863 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3864 ("pmap_ts_referenced: page %p is not managed", m)); 3865 SLIST_INIT(&free); 3866 cleared = 0; 3867 pa = VM_PAGE_TO_PHYS(m); 3868 lock = PHYS_TO_PV_LIST_LOCK(pa); 3869 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 3870 rw_wlock(lock); 3871 retry: 3872 not_cleared = 0; 3873 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 3874 goto small_mappings; 3875 pv = pvf; 3876 do { 3877 if (pvf == NULL) 3878 pvf = pv; 3879 pmap = PV_PMAP(pv); 3880 if (!PMAP_TRYLOCK(pmap)) { 3881 pvh_gen = pvh->pv_gen; 3882 rw_wunlock(lock); 3883 PMAP_LOCK(pmap); 3884 rw_wlock(lock); 3885 if (pvh_gen != pvh->pv_gen) { 3886 PMAP_UNLOCK(pmap); 3887 goto retry; 3888 } 3889 } 3890 l3e = pmap_pml3e(pmap, pv->pv_va); 3891 oldl3e = be64toh(*l3e); 3892 if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3893 /* 3894 * Although "oldpde" is mapping a 2MB page, because 3895 * this function is called at a 4KB page granularity, 3896 * we only update the 4KB page under test. 3897 */ 3898 vm_page_dirty(m); 3899 } 3900 if ((oldl3e & PG_A) != 0) { 3901 /* 3902 * Since this reference bit is shared by 512 4KB 3903 * pages, it should not be cleared every time it is 3904 * tested. 
Apply a simple "hash" function on the 3905 * physical page number, the virtual superpage number, 3906 * and the pmap address to select one 4KB page out of 3907 * the 512 on which testing the reference bit will 3908 * result in clearing that reference bit. This 3909 * function is designed to avoid the selection of the 3910 * same 4KB page for every 2MB page mapping. 3911 * 3912 * On demotion, a mapping that hasn't been referenced 3913 * is simply destroyed. To avoid the possibility of a 3914 * subsequent page fault on a demoted wired mapping, 3915 * always leave its reference bit set. Moreover, 3916 * since the superpage is wired, the current state of 3917 * its reference bit won't affect page replacement. 3918 */ 3919 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ 3920 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 3921 (oldl3e & PG_W) == 0) { 3922 atomic_clear_long(l3e, htobe64(PG_A)); 3923 pmap_invalidate_page(pmap, pv->pv_va); 3924 cleared++; 3925 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3926 ("inconsistent pv lock %p %p for page %p", 3927 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3928 } else 3929 not_cleared++; 3930 } 3931 PMAP_UNLOCK(pmap); 3932 /* Rotate the PV list if it has more than one entry. */ 3933 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { 3934 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 3935 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 3936 pvh->pv_gen++; 3937 } 3938 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 3939 goto out; 3940 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 3941 small_mappings: 3942 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 3943 goto out; 3944 pv = pvf; 3945 do { 3946 if (pvf == NULL) 3947 pvf = pv; 3948 pmap = PV_PMAP(pv); 3949 if (!PMAP_TRYLOCK(pmap)) { 3950 pvh_gen = pvh->pv_gen; 3951 md_gen = m->md.pv_gen; 3952 rw_wunlock(lock); 3953 PMAP_LOCK(pmap); 3954 rw_wlock(lock); 3955 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3956 PMAP_UNLOCK(pmap); 3957 goto retry; 3958 } 3959 } 3960 l3e = pmap_pml3e(pmap, pv->pv_va); 3961 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, 3962 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 3963 m)); 3964 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 3965 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3966 vm_page_dirty(m); 3967 if ((be64toh(*pte) & PG_A) != 0) { 3968 atomic_clear_long(pte, htobe64(PG_A)); 3969 pmap_invalidate_page(pmap, pv->pv_va); 3970 cleared++; 3971 } 3972 PMAP_UNLOCK(pmap); 3973 /* Rotate the PV list if it has more than one entry. 
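* Moving the just-examined entry to the tail keeps successive calls
* from always scanning the same mappings first, since the loop may
* stop early once PMAP_TS_REFERENCED_MAX bits have been counted.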
*/ 3974 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { 3975 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 3976 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 3977 m->md.pv_gen++; 3978 } 3979 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 3980 not_cleared < PMAP_TS_REFERENCED_MAX); 3981 out: 3982 rw_wunlock(lock); 3983 vm_page_free_pages_toq(&free, true); 3984 return (cleared + not_cleared); 3985 } 3986 3987 static vm_offset_t 3988 mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start, 3989 vm_paddr_t end, int prot __unused) 3990 { 3991 3992 CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, 3993 prot); 3994 return (PHYS_TO_DMAP(start)); 3995 } 3996 3997 void 3998 mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr, 3999 vm_object_t object, vm_pindex_t pindex, vm_size_t size) 4000 { 4001 pml3_entry_t *l3e; 4002 vm_paddr_t pa, ptepa; 4003 vm_page_t p, pdpg; 4004 vm_memattr_t ma; 4005 4006 CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, 4007 object, pindex, size); 4008 VM_OBJECT_ASSERT_WLOCKED(object); 4009 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4010 ("pmap_object_init_pt: non-device object")); 4011 /* NB: size can be logically ored with addr here */ 4012 if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) { 4013 if (!mmu_radix_ps_enabled(pmap)) 4014 return; 4015 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4016 return; 4017 p = vm_page_lookup(object, pindex); 4018 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4019 ("pmap_object_init_pt: invalid page %p", p)); 4020 ma = p->md.mdpg_cache_attrs; 4021 4022 /* 4023 * Abort the mapping if the first page is not physically 4024 * aligned to a 2MB page boundary. 4025 */ 4026 ptepa = VM_PAGE_TO_PHYS(p); 4027 if (ptepa & L3_PAGE_MASK) 4028 return; 4029 4030 /* 4031 * Skip the first page. Abort the mapping if the rest of 4032 * the pages are not physically contiguous or have differing 4033 * memory attributes. 4034 */ 4035 p = TAILQ_NEXT(p, listq); 4036 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4037 pa += PAGE_SIZE) { 4038 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4039 ("pmap_object_init_pt: invalid page %p", p)); 4040 if (pa != VM_PAGE_TO_PHYS(p) || 4041 ma != p->md.mdpg_cache_attrs) 4042 return; 4043 p = TAILQ_NEXT(p, listq); 4044 } 4045 4046 PMAP_LOCK(pmap); 4047 for (pa = ptepa | pmap_cache_bits(ma); 4048 pa < ptepa + size; pa += L3_PAGE_SIZE) { 4049 pdpg = pmap_allocl3e(pmap, addr, NULL); 4050 if (pdpg == NULL) { 4051 /* 4052 * The creation of mappings below is only an 4053 * optimization. If a page directory page 4054 * cannot be allocated without blocking, 4055 * continue on to the next mapping rather than 4056 * blocking. 4057 */ 4058 addr += L3_PAGE_SIZE; 4059 continue; 4060 } 4061 l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4062 l3e = &l3e[pmap_pml3e_index(addr)]; 4063 if ((be64toh(*l3e) & PG_V) == 0) { 4064 pa |= PG_M | PG_A | PG_RW; 4065 pte_store(l3e, pa); 4066 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); 4067 counter_u64_add(pmap_l3e_mappings, 1); 4068 } else { 4069 /* Continue on if the PDE is already valid. 
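* Drop the reference that pmap_allocl3e() took on the page
* directory page, since no new mapping was installed here.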
*/ 4070 pdpg->ref_count--; 4071 KASSERT(pdpg->ref_count > 0, 4072 ("pmap_object_init_pt: missing reference " 4073 "to page directory page, va: 0x%lx", addr)); 4074 } 4075 addr += L3_PAGE_SIZE; 4076 } 4077 ptesync(); 4078 PMAP_UNLOCK(pmap); 4079 } 4080 } 4081 4082 boolean_t 4083 mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m) 4084 { 4085 struct md_page *pvh; 4086 struct rwlock *lock; 4087 pv_entry_t pv; 4088 int loops = 0; 4089 boolean_t rv; 4090 4091 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4092 ("pmap_page_exists_quick: page %p is not managed", m)); 4093 CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); 4094 rv = FALSE; 4095 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4096 rw_rlock(lock); 4097 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 4098 if (PV_PMAP(pv) == pmap) { 4099 rv = TRUE; 4100 break; 4101 } 4102 loops++; 4103 if (loops >= 16) 4104 break; 4105 } 4106 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4107 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4108 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 4109 if (PV_PMAP(pv) == pmap) { 4110 rv = TRUE; 4111 break; 4112 } 4113 loops++; 4114 if (loops >= 16) 4115 break; 4116 } 4117 } 4118 rw_runlock(lock); 4119 return (rv); 4120 } 4121 4122 void 4123 mmu_radix_page_init(vm_page_t m) 4124 { 4125 4126 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 4127 TAILQ_INIT(&m->md.pv_list); 4128 m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; 4129 } 4130 4131 int 4132 mmu_radix_page_wired_mappings(vm_page_t m) 4133 { 4134 struct rwlock *lock; 4135 struct md_page *pvh; 4136 pmap_t pmap; 4137 pt_entry_t *pte; 4138 pv_entry_t pv; 4139 int count, md_gen, pvh_gen; 4140 4141 if ((m->oflags & VPO_UNMANAGED) != 0) 4142 return (0); 4143 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 4144 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4145 rw_rlock(lock); 4146 restart: 4147 count = 0; 4148 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 4149 pmap = PV_PMAP(pv); 4150 if (!PMAP_TRYLOCK(pmap)) { 4151 md_gen = m->md.pv_gen; 4152 rw_runlock(lock); 4153 PMAP_LOCK(pmap); 4154 rw_rlock(lock); 4155 if (md_gen != m->md.pv_gen) { 4156 PMAP_UNLOCK(pmap); 4157 goto restart; 4158 } 4159 } 4160 pte = pmap_pte(pmap, pv->pv_va); 4161 if ((be64toh(*pte) & PG_W) != 0) 4162 count++; 4163 PMAP_UNLOCK(pmap); 4164 } 4165 if ((m->flags & PG_FICTITIOUS) == 0) { 4166 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4167 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 4168 pmap = PV_PMAP(pv); 4169 if (!PMAP_TRYLOCK(pmap)) { 4170 md_gen = m->md.pv_gen; 4171 pvh_gen = pvh->pv_gen; 4172 rw_runlock(lock); 4173 PMAP_LOCK(pmap); 4174 rw_rlock(lock); 4175 if (md_gen != m->md.pv_gen || 4176 pvh_gen != pvh->pv_gen) { 4177 PMAP_UNLOCK(pmap); 4178 goto restart; 4179 } 4180 } 4181 pte = pmap_pml3e(pmap, pv->pv_va); 4182 if ((be64toh(*pte) & PG_W) != 0) 4183 count++; 4184 PMAP_UNLOCK(pmap); 4185 } 4186 } 4187 rw_runlock(lock); 4188 return (count); 4189 } 4190 4191 static void 4192 mmu_radix_update_proctab(int pid, pml1_entry_t l1pa) 4193 { 4194 isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT); 4195 } 4196 4197 int 4198 mmu_radix_pinit(pmap_t pmap) 4199 { 4200 vmem_addr_t pid; 4201 vm_paddr_t l1pa; 4202 4203 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4204 4205 /* 4206 * allocate the page directory page 4207 */ 4208 pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK); 4209 4210 for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++) 4211 pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE); 4212 pmap->pm_radix.rt_root = 0; 4213 TAILQ_INIT(&pmap->pm_pvchunk); 4214 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4215 pmap->pm_flags = 
PMAP_PDE_SUPERPAGE; 4216 vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid); 4217 4218 pmap->pm_pid = pid; 4219 l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1); 4220 mmu_radix_update_proctab(pid, l1pa); 4221 __asm __volatile("ptesync;isync" : : : "memory"); 4222 4223 return (1); 4224 } 4225 4226 /* 4227 * This routine is called if the desired page table page does not exist. 4228 * 4229 * If page table page allocation fails, this routine may sleep before 4230 * returning NULL. It sleeps only if a lock pointer was given. 4231 * 4232 * Note: If a page allocation fails at page table level two or three, 4233 * one or two pages may be held during the wait, only to be released 4234 * afterwards. This conservative approach is easily argued to avoid 4235 * race conditions. 4236 */ 4237 static vm_page_t 4238 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 4239 { 4240 vm_page_t m, pdppg, pdpg; 4241 4242 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4243 4244 /* 4245 * Allocate a page table page. 4246 */ 4247 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 4248 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 4249 if (lockp != NULL) { 4250 RELEASE_PV_LIST_LOCK(lockp); 4251 PMAP_UNLOCK(pmap); 4252 vm_wait(NULL); 4253 PMAP_LOCK(pmap); 4254 } 4255 /* 4256 * Indicate the need to retry. While waiting, the page table 4257 * page may have been allocated. 4258 */ 4259 return (NULL); 4260 } 4261 if ((m->flags & PG_ZERO) == 0) 4262 mmu_radix_zero_page(m); 4263 4264 /* 4265 * Map the pagetable page into the process address space, if 4266 * it isn't already there. 4267 */ 4268 4269 if (ptepindex >= (NUPDE + NUPDPE)) { 4270 pml1_entry_t *l1e; 4271 vm_pindex_t pml1index; 4272 4273 /* Wire up a new PDPE page */ 4274 pml1index = ptepindex - (NUPDE + NUPDPE); 4275 l1e = &pmap->pm_pml1[pml1index]; 4276 KASSERT((be64toh(*l1e) & PG_V) == 0, 4277 ("%s: L1 entry %#lx is valid", __func__, *l1e)); 4278 pde_store(l1e, VM_PAGE_TO_PHYS(m)); 4279 } else if (ptepindex >= NUPDE) { 4280 vm_pindex_t pml1index; 4281 vm_pindex_t pdpindex; 4282 pml1_entry_t *l1e; 4283 pml2_entry_t *l2e; 4284 4285 /* Wire up a new l2e page */ 4286 pdpindex = ptepindex - NUPDE; 4287 pml1index = pdpindex >> RPTE_SHIFT; 4288 4289 l1e = &pmap->pm_pml1[pml1index]; 4290 if ((be64toh(*l1e) & PG_V) == 0) { 4291 /* Have to allocate a new pdp, recurse */ 4292 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index, 4293 lockp) == NULL) { 4294 vm_page_unwire_noq(m); 4295 vm_page_free_zero(m); 4296 return (NULL); 4297 } 4298 } else { 4299 /* Add reference to l2e page */ 4300 pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME); 4301 pdppg->ref_count++; 4302 } 4303 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); 4304 4305 /* Now find the pdp page */ 4306 l2e = &l2e[pdpindex & RPTE_MASK]; 4307 KASSERT((be64toh(*l2e) & PG_V) == 0, 4308 ("%s: L2 entry %#lx is valid", __func__, *l2e)); 4309 pde_store(l2e, VM_PAGE_TO_PHYS(m)); 4310 } else { 4311 vm_pindex_t pml1index; 4312 vm_pindex_t pdpindex; 4313 pml1_entry_t *l1e; 4314 pml2_entry_t *l2e; 4315 pml3_entry_t *l3e; 4316 4317 /* Wire up a new PTE page */ 4318 pdpindex = ptepindex >> RPTE_SHIFT; 4319 pml1index = pdpindex >> RPTE_SHIFT; 4320 4321 /* First, find the pdp and check that its valid. 
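* If it is not valid, the missing directory levels are allocated
* by recursion before the new PTE page is installed.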
*/ 4322 l1e = &pmap->pm_pml1[pml1index]; 4323 if ((be64toh(*l1e) & PG_V) == 0) { 4324 /* Have to allocate a new pd, recurse */ 4325 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 4326 lockp) == NULL) { 4327 vm_page_unwire_noq(m); 4328 vm_page_free_zero(m); 4329 return (NULL); 4330 } 4331 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); 4332 l2e = &l2e[pdpindex & RPTE_MASK]; 4333 } else { 4334 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); 4335 l2e = &l2e[pdpindex & RPTE_MASK]; 4336 if ((be64toh(*l2e) & PG_V) == 0) { 4337 /* Have to allocate a new pd, recurse */ 4338 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 4339 lockp) == NULL) { 4340 vm_page_unwire_noq(m); 4341 vm_page_free_zero(m); 4342 return (NULL); 4343 } 4344 } else { 4345 /* Add reference to the pd page */ 4346 pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME); 4347 pdpg->ref_count++; 4348 } 4349 } 4350 l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME); 4351 4352 /* Now we know where the page directory page is */ 4353 l3e = &l3e[ptepindex & RPTE_MASK]; 4354 KASSERT((be64toh(*l3e) & PG_V) == 0, 4355 ("%s: L3 entry %#lx is valid", __func__, *l3e)); 4356 pde_store(l3e, VM_PAGE_TO_PHYS(m)); 4357 } 4358 4359 pmap_resident_count_inc(pmap, 1); 4360 return (m); 4361 } 4362 static vm_page_t 4363 pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4364 { 4365 vm_pindex_t pdpindex, ptepindex; 4366 pml2_entry_t *pdpe; 4367 vm_page_t pdpg; 4368 4369 retry: 4370 pdpe = pmap_pml2e(pmap, va); 4371 if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) { 4372 /* Add a reference to the pd page. */ 4373 pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME); 4374 pdpg->ref_count++; 4375 } else { 4376 /* Allocate a pd page. */ 4377 ptepindex = pmap_l3e_pindex(va); 4378 pdpindex = ptepindex >> RPTE_SHIFT; 4379 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 4380 if (pdpg == NULL && lockp != NULL) 4381 goto retry; 4382 } 4383 return (pdpg); 4384 } 4385 4386 static vm_page_t 4387 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4388 { 4389 vm_pindex_t ptepindex; 4390 pml3_entry_t *pd; 4391 vm_page_t m; 4392 4393 /* 4394 * Calculate pagetable page index 4395 */ 4396 ptepindex = pmap_l3e_pindex(va); 4397 retry: 4398 /* 4399 * Get the page directory entry 4400 */ 4401 pd = pmap_pml3e(pmap, va); 4402 4403 /* 4404 * This supports switching from a 2MB page to a 4405 * normal 4K page. 4406 */ 4407 if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) { 4408 if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) { 4409 /* 4410 * Invalidation of the 2MB page mapping may have caused 4411 * the deallocation of the underlying PD page. 4412 */ 4413 pd = NULL; 4414 } 4415 } 4416 4417 /* 4418 * If the page table page is mapped, we just increment the 4419 * hold count, and activate it. 4420 */ 4421 if (pd != NULL && (be64toh(*pd) & PG_V) != 0) { 4422 m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME); 4423 m->ref_count++; 4424 } else { 4425 /* 4426 * Here if the pte page isn't mapped, or if it has been 4427 * deallocated. 
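* Allocate a new page table page; if _pmap_allocpte() had to sleep,
* retry the lookup from the top since the pmap lock was dropped.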
4428 */ 4429 m = _pmap_allocpte(pmap, ptepindex, lockp); 4430 if (m == NULL && lockp != NULL) 4431 goto retry; 4432 } 4433 return (m); 4434 } 4435 4436 static void 4437 mmu_radix_pinit0(pmap_t pmap) 4438 { 4439 4440 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4441 PMAP_LOCK_INIT(pmap); 4442 pmap->pm_pml1 = kernel_pmap->pm_pml1; 4443 pmap->pm_pid = kernel_pmap->pm_pid; 4444 4445 pmap->pm_radix.rt_root = 0; 4446 TAILQ_INIT(&pmap->pm_pvchunk); 4447 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4448 kernel_pmap->pm_flags = 4449 pmap->pm_flags = PMAP_PDE_SUPERPAGE; 4450 } 4451 /* 4452 * pmap_protect_l3e: do the things to protect a 2mpage in a process 4453 */ 4454 static boolean_t 4455 pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot) 4456 { 4457 pt_entry_t newpde, oldpde; 4458 vm_offset_t eva, va; 4459 vm_page_t m; 4460 boolean_t anychanged; 4461 4462 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4463 KASSERT((sva & L3_PAGE_MASK) == 0, 4464 ("pmap_protect_l3e: sva is not 2mpage aligned")); 4465 anychanged = FALSE; 4466 retry: 4467 oldpde = newpde = be64toh(*l3e); 4468 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 4469 (PG_MANAGED | PG_M | PG_RW)) { 4470 eva = sva + L3_PAGE_SIZE; 4471 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4472 va < eva; va += PAGE_SIZE, m++) 4473 vm_page_dirty(m); 4474 } 4475 if ((prot & VM_PROT_WRITE) == 0) { 4476 newpde &= ~(PG_RW | PG_M); 4477 newpde |= RPTE_EAA_R; 4478 } 4479 if (prot & VM_PROT_EXECUTE) 4480 newpde |= PG_X; 4481 if (newpde != oldpde) { 4482 /* 4483 * As an optimization to future operations on this PDE, clear 4484 * PG_PROMOTED. The impending invalidation will remove any 4485 * lingering 4KB page mappings from the TLB. 4486 */ 4487 if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED))) 4488 goto retry; 4489 anychanged = TRUE; 4490 } 4491 return (anychanged); 4492 } 4493 4494 void 4495 mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 4496 vm_prot_t prot) 4497 { 4498 vm_offset_t va_next; 4499 pml1_entry_t *l1e; 4500 pml2_entry_t *l2e; 4501 pml3_entry_t ptpaddr, *l3e; 4502 pt_entry_t *pte; 4503 boolean_t anychanged; 4504 4505 CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva, 4506 prot); 4507 4508 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4509 if (prot == VM_PROT_NONE) { 4510 mmu_radix_remove(pmap, sva, eva); 4511 return; 4512 } 4513 4514 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 4515 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 4516 return; 4517 4518 #ifdef INVARIANTS 4519 if (VERBOSE_PROTECT || pmap_logging) 4520 printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n", 4521 pmap, sva, eva, prot, pmap->pm_pid); 4522 #endif 4523 anychanged = FALSE; 4524 4525 PMAP_LOCK(pmap); 4526 for (; sva < eva; sva = va_next) { 4527 l1e = pmap_pml1e(pmap, sva); 4528 if ((be64toh(*l1e) & PG_V) == 0) { 4529 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 4530 if (va_next < sva) 4531 va_next = eva; 4532 continue; 4533 } 4534 4535 l2e = pmap_l1e_to_l2e(l1e, sva); 4536 if ((be64toh(*l2e) & PG_V) == 0) { 4537 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 4538 if (va_next < sva) 4539 va_next = eva; 4540 continue; 4541 } 4542 4543 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 4544 if (va_next < sva) 4545 va_next = eva; 4546 4547 l3e = pmap_l2e_to_l3e(l2e, sva); 4548 ptpaddr = be64toh(*l3e); 4549 4550 /* 4551 * Weed out invalid mappings. 4552 */ 4553 if (ptpaddr == 0) 4554 continue; 4555 4556 /* 4557 * Check for large page. 
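* An L3 entry with RPTE_LEAF set maps a 2MB superpage directly
* instead of pointing to a page table page.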
4558 */ 4559 if ((ptpaddr & RPTE_LEAF) != 0) { 4560 /* 4561 * Are we protecting the entire large page? If not, 4562 * demote the mapping and fall through. 4563 */ 4564 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 4565 if (pmap_protect_l3e(pmap, l3e, sva, prot)) 4566 anychanged = TRUE; 4567 continue; 4568 } else if (!pmap_demote_l3e(pmap, l3e, sva)) { 4569 /* 4570 * The large page mapping was destroyed. 4571 */ 4572 continue; 4573 } 4574 } 4575 4576 if (va_next > eva) 4577 va_next = eva; 4578 4579 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, 4580 sva += PAGE_SIZE) { 4581 pt_entry_t obits, pbits; 4582 vm_page_t m; 4583 4584 retry: 4585 MPASS(pte == pmap_pte(pmap, sva)); 4586 obits = pbits = be64toh(*pte); 4587 if ((pbits & PG_V) == 0) 4588 continue; 4589 4590 if ((prot & VM_PROT_WRITE) == 0) { 4591 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4592 (PG_MANAGED | PG_M | PG_RW)) { 4593 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4594 vm_page_dirty(m); 4595 } 4596 pbits &= ~(PG_RW | PG_M); 4597 pbits |= RPTE_EAA_R; 4598 } 4599 if (prot & VM_PROT_EXECUTE) 4600 pbits |= PG_X; 4601 4602 if (pbits != obits) { 4603 if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits))) 4604 goto retry; 4605 if (obits & (PG_A|PG_M)) { 4606 anychanged = TRUE; 4607 #ifdef INVARIANTS 4608 if (VERBOSE_PROTECT || pmap_logging) 4609 printf("%#lx %#lx -> %#lx\n", 4610 sva, obits, pbits); 4611 #endif 4612 } 4613 } 4614 } 4615 } 4616 if (anychanged) 4617 pmap_invalidate_all(pmap); 4618 PMAP_UNLOCK(pmap); 4619 } 4620 4621 void 4622 mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count) 4623 { 4624 4625 CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count); 4626 pt_entry_t oldpte, pa, *pte; 4627 vm_page_t m; 4628 uint64_t cache_bits, attr_bits; 4629 vm_offset_t va; 4630 4631 oldpte = 0; 4632 attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; 4633 va = sva; 4634 pte = kvtopte(va); 4635 while (va < sva + PAGE_SIZE * count) { 4636 if (__predict_false((va & L3_PAGE_MASK) == 0)) 4637 pte = kvtopte(va); 4638 MPASS(pte == pmap_pte(kernel_pmap, va)); 4639 4640 /* 4641 * XXX there has to be a more efficient way than traversing 4642 * the page table every time - but go for correctness for 4643 * today 4644 */ 4645 4646 m = *ma++; 4647 cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs); 4648 pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits; 4649 if (be64toh(*pte) != pa) { 4650 oldpte |= be64toh(*pte); 4651 pte_store(pte, pa); 4652 } 4653 va += PAGE_SIZE; 4654 pte++; 4655 } 4656 if (__predict_false((oldpte & RPTE_VALID) != 0)) 4657 pmap_invalidate_range(kernel_pmap, sva, sva + count * 4658 PAGE_SIZE); 4659 else 4660 ptesync(); 4661 } 4662 4663 void 4664 mmu_radix_qremove(vm_offset_t sva, int count) 4665 { 4666 vm_offset_t va; 4667 pt_entry_t *pte; 4668 4669 CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count); 4670 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva)); 4671 4672 va = sva; 4673 pte = kvtopte(va); 4674 while (va < sva + PAGE_SIZE * count) { 4675 if (__predict_false((va & L3_PAGE_MASK) == 0)) 4676 pte = kvtopte(va); 4677 pte_clear(pte); 4678 pte++; 4679 va += PAGE_SIZE; 4680 } 4681 pmap_invalidate_range(kernel_pmap, sva, va); 4682 } 4683 4684 /*************************************************** 4685 * Page table page management routines..... 4686 ***************************************************/ 4687 /* 4688 * Schedule the specified unused page table page to be freed. 
Specifically, 4689 * add the page to the specified list of pages that will be released to the 4690 * physical memory manager after the TLB has been updated. 4691 */ 4692 static __inline void 4693 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 4694 boolean_t set_PG_ZERO) 4695 { 4696 4697 if (set_PG_ZERO) 4698 m->flags |= PG_ZERO; 4699 else 4700 m->flags &= ~PG_ZERO; 4701 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4702 } 4703 4704 /* 4705 * Inserts the specified page table page into the specified pmap's collection 4706 * of idle page table pages. Each of a pmap's page table pages is responsible 4707 * for mapping a distinct range of virtual addresses. The pmap's collection is 4708 * ordered by this virtual address range. 4709 */ 4710 static __inline int 4711 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 4712 { 4713 4714 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4715 return (vm_radix_insert(&pmap->pm_radix, mpte)); 4716 } 4717 4718 /* 4719 * Removes the page table page mapping the specified virtual address from the 4720 * specified pmap's collection of idle page table pages, and returns it. 4721 * Otherwise, returns NULL if there is no page table page corresponding to the 4722 * specified virtual address. 4723 */ 4724 static __inline vm_page_t 4725 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4726 { 4727 4728 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4729 return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va))); 4730 } 4731 4732 /* 4733 * Decrements a page table page's wire count, which is used to record the 4734 * number of valid page table entries within the page. If the wire count 4735 * drops to zero, then the page table page is unmapped. Returns TRUE if the 4736 * page table page was unmapped and FALSE otherwise. 4737 */ 4738 static inline boolean_t 4739 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4740 { 4741 4742 --m->ref_count; 4743 if (m->ref_count == 0) { 4744 _pmap_unwire_ptp(pmap, va, m, free); 4745 return (TRUE); 4746 } else 4747 return (FALSE); 4748 } 4749 4750 static void 4751 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4752 { 4753 4754 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4755 /* 4756 * unmap the page table page 4757 */ 4758 if (m->pindex >= NUPDE + NUPDPE) { 4759 /* PDP page */ 4760 pml1_entry_t *pml1; 4761 pml1 = pmap_pml1e(pmap, va); 4762 *pml1 = 0; 4763 } else if (m->pindex >= NUPDE) { 4764 /* PD page */ 4765 pml2_entry_t *l2e; 4766 l2e = pmap_pml2e(pmap, va); 4767 *l2e = 0; 4768 } else { 4769 /* PTE page */ 4770 pml3_entry_t *l3e; 4771 l3e = pmap_pml3e(pmap, va); 4772 *l3e = 0; 4773 } 4774 pmap_resident_count_dec(pmap, 1); 4775 if (m->pindex < NUPDE) { 4776 /* We just released a PT, unhold the matching PD */ 4777 vm_page_t pdpg; 4778 4779 pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME); 4780 pmap_unwire_ptp(pmap, va, pdpg, free); 4781 } 4782 else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 4783 /* We just released a PD, unhold the matching PDP */ 4784 vm_page_t pdppg; 4785 4786 pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME); 4787 pmap_unwire_ptp(pmap, va, pdppg, free); 4788 } 4789 4790 /* 4791 * Put page on a list so that it is released after 4792 * *ALL* TLB shootdown is done 4793 */ 4794 pmap_add_delayed_free_list(m, free, TRUE); 4795 } 4796 4797 /* 4798 * After removing a page table entry, this routine is used to 4799 * conditionally free the page, and manage the hold/wire counts. 
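* Kernel page table pages are never freed here; requests for
* addresses at or above VM_MAXUSER_ADDRESS are ignored.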
4800 */ 4801 static int 4802 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde, 4803 struct spglist *free) 4804 { 4805 vm_page_t mpte; 4806 4807 if (va >= VM_MAXUSER_ADDRESS) 4808 return (0); 4809 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4810 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4811 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4812 } 4813 4814 void 4815 mmu_radix_release(pmap_t pmap) 4816 { 4817 4818 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4819 KASSERT(pmap->pm_stats.resident_count == 0, 4820 ("pmap_release: pmap resident count %ld != 0", 4821 pmap->pm_stats.resident_count)); 4822 KASSERT(vm_radix_is_empty(&pmap->pm_radix), 4823 ("pmap_release: pmap has reserved page table page(s)")); 4824 4825 pmap_invalidate_all(pmap); 4826 isa3_proctab[pmap->pm_pid].proctab0 = 0; 4827 uma_zfree(zone_radix_pgd, pmap->pm_pml1); 4828 vmem_free(asid_arena, pmap->pm_pid, 1); 4829 } 4830 4831 /* 4832 * Create the PV entry for a 2MB page mapping. Always returns true unless the 4833 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 4834 * false if the PV entry cannot be allocated without resorting to reclamation. 4835 */ 4836 static bool 4837 pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags, 4838 struct rwlock **lockp) 4839 { 4840 struct md_page *pvh; 4841 pv_entry_t pv; 4842 vm_paddr_t pa; 4843 4844 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4845 /* Pass NULL instead of the lock pointer to disable reclamation. */ 4846 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 4847 NULL : lockp)) == NULL) 4848 return (false); 4849 pv->pv_va = va; 4850 pa = pde & PG_PS_FRAME; 4851 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4852 pvh = pa_to_pvh(pa); 4853 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 4854 pvh->pv_gen++; 4855 return (true); 4856 } 4857 4858 /* 4859 * Fills a page table page with mappings to consecutive physical pages. 4860 */ 4861 static void 4862 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 4863 { 4864 pt_entry_t *pte; 4865 4866 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 4867 *pte = htobe64(newpte); 4868 newpte += PAGE_SIZE; 4869 } 4870 } 4871 4872 static boolean_t 4873 pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va) 4874 { 4875 struct rwlock *lock; 4876 boolean_t rv; 4877 4878 lock = NULL; 4879 rv = pmap_demote_l3e_locked(pmap, pde, va, &lock); 4880 if (lock != NULL) 4881 rw_wunlock(lock); 4882 return (rv); 4883 } 4884 4885 static boolean_t 4886 pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, 4887 struct rwlock **lockp) 4888 { 4889 pml3_entry_t oldpde; 4890 pt_entry_t *firstpte; 4891 vm_paddr_t mptepa; 4892 vm_page_t mpte; 4893 struct spglist free; 4894 vm_offset_t sva; 4895 4896 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4897 oldpde = be64toh(*l3e); 4898 KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), 4899 ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx", 4900 oldpde)); 4901 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 4902 NULL) { 4903 KASSERT((oldpde & PG_W) == 0, 4904 ("pmap_demote_l3e: page table page for a wired mapping" 4905 " is missing")); 4906 4907 /* 4908 * Invalidate the 2MB page mapping and return "failure" if the 4909 * mapping was never accessed or the allocation of the new 4910 * page table page fails. 
If the 2MB page mapping belongs to 4911 * the direct map region of the kernel's address space, then 4912 * the page allocation request specifies the highest possible 4913 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 4914 * normal. Page table pages are preallocated for every other 4915 * part of the kernel address space, so the direct map region 4916 * is the only part of the kernel address space that must be 4917 * handled here. 4918 */ 4919 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 4920 pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 4921 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 4922 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 4923 SLIST_INIT(&free); 4924 sva = trunc_2mpage(va); 4925 pmap_remove_l3e(pmap, l3e, sva, &free, lockp); 4926 pmap_invalidate_l3e_page(pmap, sva, oldpde); 4927 vm_page_free_pages_toq(&free, true); 4928 CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx" 4929 " in pmap %p", va, pmap); 4930 return (FALSE); 4931 } 4932 if (va < VM_MAXUSER_ADDRESS) 4933 pmap_resident_count_inc(pmap, 1); 4934 } 4935 mptepa = VM_PAGE_TO_PHYS(mpte); 4936 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 4937 KASSERT((oldpde & PG_A) != 0, 4938 ("pmap_demote_l3e: oldpde is missing PG_A")); 4939 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 4940 ("pmap_demote_l3e: oldpde is missing PG_M")); 4941 4942 /* 4943 * If the page table page is new, initialize it. 4944 */ 4945 if (mpte->ref_count == 1) { 4946 mpte->ref_count = NPTEPG; 4947 pmap_fill_ptp(firstpte, oldpde); 4948 } 4949 4950 KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME), 4951 ("pmap_demote_l3e: firstpte and newpte map different physical" 4952 " addresses")); 4953 4954 /* 4955 * If the mapping has changed attributes, update the page table 4956 * entries. 4957 */ 4958 if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE)) 4959 pmap_fill_ptp(firstpte, oldpde); 4960 4961 /* 4962 * The spare PV entries must be reserved prior to demoting the 4963 * mapping, that is, prior to changing the PDE. Otherwise, the state 4964 * of the PDE and the PV lists will be inconsistent, which can result 4965 * in reclaim_pv_chunk() attempting to remove a PV entry from the 4966 * wrong PV list and pmap_pv_demote_l3e() failing to find the expected 4967 * PV entry for the 2MB page mapping that is being demoted. 4968 */ 4969 if ((oldpde & PG_MANAGED) != 0) 4970 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 4971 4972 /* 4973 * Demote the mapping. This pmap is locked. The old PDE has 4974 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 4975 * set. Thus, there is no danger of a race with another 4976 * processor changing the setting of PG_A and/or PG_M between 4977 * the read above and the store below. 4978 */ 4979 pde_store(l3e, mptepa); 4980 pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde); 4981 /* 4982 * Demote the PV entry. 4983 */ 4984 if ((oldpde & PG_MANAGED) != 0) 4985 pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp); 4986 4987 counter_u64_add(pmap_l3e_demotions, 1); 4988 CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx" 4989 " in pmap %p", va, pmap); 4990 return (TRUE); 4991 } 4992 4993 /* 4994 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
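* The page table page that previously backed this range is looked up
* with pmap_remove_pt_page(), zeroed, and re-installed in place of the
* superpage so that the kernel range keeps a complete page table.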
4995 */ 4996 static void 4997 pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va) 4998 { 4999 vm_paddr_t mptepa; 5000 vm_page_t mpte; 5001 5002 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 5003 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5004 mpte = pmap_remove_pt_page(pmap, va); 5005 if (mpte == NULL) 5006 panic("pmap_remove_kernel_pde: Missing pt page."); 5007 5008 mptepa = VM_PAGE_TO_PHYS(mpte); 5009 5010 /* 5011 * Initialize the page table page. 5012 */ 5013 pagezero(PHYS_TO_DMAP(mptepa)); 5014 5015 /* 5016 * Demote the mapping. 5017 */ 5018 pde_store(l3e, mptepa); 5019 ptesync(); 5020 } 5021 5022 /* 5023 * pmap_remove_l3e: do the things to unmap a superpage in a process 5024 */ 5025 static int 5026 pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, 5027 struct spglist *free, struct rwlock **lockp) 5028 { 5029 struct md_page *pvh; 5030 pml3_entry_t oldpde; 5031 vm_offset_t eva, va; 5032 vm_page_t m, mpte; 5033 5034 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5035 KASSERT((sva & L3_PAGE_MASK) == 0, 5036 ("pmap_remove_l3e: sva is not 2mpage aligned")); 5037 oldpde = be64toh(pte_load_clear(pdq)); 5038 if (oldpde & PG_W) 5039 pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE); 5040 pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); 5041 if (oldpde & PG_MANAGED) { 5042 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 5043 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 5044 pmap_pvh_free(pvh, pmap, sva); 5045 eva = sva + L3_PAGE_SIZE; 5046 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 5047 va < eva; va += PAGE_SIZE, m++) { 5048 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5049 vm_page_dirty(m); 5050 if (oldpde & PG_A) 5051 vm_page_aflag_set(m, PGA_REFERENCED); 5052 if (TAILQ_EMPTY(&m->md.pv_list) && 5053 TAILQ_EMPTY(&pvh->pv_list)) 5054 vm_page_aflag_clear(m, PGA_WRITEABLE); 5055 } 5056 } 5057 if (pmap == kernel_pmap) { 5058 pmap_remove_kernel_l3e(pmap, pdq, sva); 5059 } else { 5060 mpte = pmap_remove_pt_page(pmap, sva); 5061 if (mpte != NULL) { 5062 pmap_resident_count_dec(pmap, 1); 5063 KASSERT(mpte->ref_count == NPTEPG, 5064 ("pmap_remove_l3e: pte page wire count error")); 5065 mpte->ref_count = 0; 5066 pmap_add_delayed_free_list(mpte, free, FALSE); 5067 } 5068 } 5069 return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free)); 5070 } 5071 5072 /* 5073 * pmap_remove_pte: do the things to unmap a page in a process 5074 */ 5075 static int 5076 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 5077 pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 5078 { 5079 struct md_page *pvh; 5080 pt_entry_t oldpte; 5081 vm_page_t m; 5082 5083 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5084 oldpte = be64toh(pte_load_clear(ptq)); 5085 if (oldpte & RPTE_WIRED) 5086 pmap->pm_stats.wired_count -= 1; 5087 pmap_resident_count_dec(pmap, 1); 5088 if (oldpte & RPTE_MANAGED) { 5089 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 5090 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5091 vm_page_dirty(m); 5092 if (oldpte & PG_A) 5093 vm_page_aflag_set(m, PGA_REFERENCED); 5094 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5095 pmap_pvh_free(&m->md, pmap, va); 5096 if (TAILQ_EMPTY(&m->md.pv_list) && 5097 (m->flags & PG_FICTITIOUS) == 0) { 5098 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5099 if (TAILQ_EMPTY(&pvh->pv_list)) 5100 vm_page_aflag_clear(m, PGA_WRITEABLE); 5101 } 5102 } 5103 return (pmap_unuse_pt(pmap, va, ptepde, free)); 5104 } 5105 5106 /* 5107 * Remove a single page from a process address space 5108 */ 5109 static bool 5110 
pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e, 5111 struct spglist *free) 5112 { 5113 struct rwlock *lock; 5114 pt_entry_t *pte; 5115 bool invalidate_all; 5116 5117 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5118 if ((be64toh(*l3e) & RPTE_VALID) == 0) { 5119 return (false); 5120 } 5121 pte = pmap_l3e_to_pte(l3e, va); 5122 if ((be64toh(*pte) & RPTE_VALID) == 0) { 5123 return (false); 5124 } 5125 lock = NULL; 5126 5127 invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock); 5128 if (lock != NULL) 5129 rw_wunlock(lock); 5130 if (!invalidate_all) 5131 pmap_invalidate_page(pmap, va); 5132 return (invalidate_all); 5133 } 5134 5135 /* 5136 * Removes the specified range of addresses from the page table page. 5137 */ 5138 static bool 5139 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 5140 pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp) 5141 { 5142 pt_entry_t *pte; 5143 vm_offset_t va; 5144 bool anyvalid; 5145 5146 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5147 anyvalid = false; 5148 va = eva; 5149 for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++, 5150 sva += PAGE_SIZE) { 5151 MPASS(pte == pmap_pte(pmap, sva)); 5152 if (*pte == 0) { 5153 if (va != eva) { 5154 anyvalid = true; 5155 va = eva; 5156 } 5157 continue; 5158 } 5159 if (va == eva) 5160 va = sva; 5161 if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) { 5162 anyvalid = true; 5163 sva += PAGE_SIZE; 5164 break; 5165 } 5166 } 5167 if (anyvalid) 5168 pmap_invalidate_all(pmap); 5169 else if (va != eva) 5170 pmap_invalidate_range(pmap, va, sva); 5171 return (anyvalid); 5172 } 5173 5174 void 5175 mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5176 { 5177 struct rwlock *lock; 5178 vm_offset_t va_next; 5179 pml1_entry_t *l1e; 5180 pml2_entry_t *l2e; 5181 pml3_entry_t ptpaddr, *l3e; 5182 struct spglist free; 5183 bool anyvalid; 5184 5185 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); 5186 5187 /* 5188 * Perform an unsynchronized read. This is, however, safe. 5189 */ 5190 if (pmap->pm_stats.resident_count == 0) 5191 return; 5192 5193 anyvalid = false; 5194 SLIST_INIT(&free); 5195 5196 /* XXX something fishy here */ 5197 sva = (sva + PAGE_MASK) & ~PAGE_MASK; 5198 eva = (eva + PAGE_MASK) & ~PAGE_MASK; 5199 5200 PMAP_LOCK(pmap); 5201 5202 /* 5203 * special handling of removing one page. a very 5204 * common operation and easy to short circuit some 5205 * code. 5206 */ 5207 if (sva + PAGE_SIZE == eva) { 5208 l3e = pmap_pml3e(pmap, sva); 5209 if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) { 5210 anyvalid = pmap_remove_page(pmap, sva, l3e, &free); 5211 goto out; 5212 } 5213 } 5214 5215 lock = NULL; 5216 for (; sva < eva; sva = va_next) { 5217 if (pmap->pm_stats.resident_count == 0) 5218 break; 5219 l1e = pmap_pml1e(pmap, sva); 5220 if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) { 5221 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 5222 if (va_next < sva) 5223 va_next = eva; 5224 continue; 5225 } 5226 5227 l2e = pmap_l1e_to_l2e(l1e, sva); 5228 if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) { 5229 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 5230 if (va_next < sva) 5231 va_next = eva; 5232 continue; 5233 } 5234 5235 /* 5236 * Calculate index for next page table. 5237 */ 5238 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 5239 if (va_next < sva) 5240 va_next = eva; 5241 5242 l3e = pmap_l2e_to_l3e(l2e, sva); 5243 ptpaddr = be64toh(*l3e); 5244 5245 /* 5246 * Weed out invalid mappings. 
5247 */ 5248 if (ptpaddr == 0) 5249 continue; 5250 5251 /* 5252 * Check for large page. 5253 */ 5254 if ((ptpaddr & RPTE_LEAF) != 0) { 5255 /* 5256 * Are we removing the entire large page? If not, 5257 * demote the mapping and fall through. 5258 */ 5259 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 5260 pmap_remove_l3e(pmap, l3e, sva, &free, &lock); 5261 anyvalid = true; 5262 continue; 5263 } else if (!pmap_demote_l3e_locked(pmap, l3e, sva, 5264 &lock)) { 5265 /* The large page mapping was destroyed. */ 5266 continue; 5267 } else 5268 ptpaddr = be64toh(*l3e); 5269 } 5270 5271 /* 5272 * Limit our scan to either the end of the va represented 5273 * by the current page table page, or to the end of the 5274 * range being removed. 5275 */ 5276 if (va_next > eva) 5277 va_next = eva; 5278 5279 if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock)) 5280 anyvalid = true; 5281 } 5282 if (lock != NULL) 5283 rw_wunlock(lock); 5284 out: 5285 if (anyvalid) 5286 pmap_invalidate_all(pmap); 5287 PMAP_UNLOCK(pmap); 5288 vm_page_free_pages_toq(&free, true); 5289 } 5290 5291 void 5292 mmu_radix_remove_all(vm_page_t m) 5293 { 5294 struct md_page *pvh; 5295 pv_entry_t pv; 5296 pmap_t pmap; 5297 struct rwlock *lock; 5298 pt_entry_t *pte, tpte; 5299 pml3_entry_t *l3e; 5300 vm_offset_t va; 5301 struct spglist free; 5302 int pvh_gen, md_gen; 5303 5304 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5305 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5306 ("pmap_remove_all: page %p is not managed", m)); 5307 SLIST_INIT(&free); 5308 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5309 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 5310 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5311 retry: 5312 rw_wlock(lock); 5313 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 5314 pmap = PV_PMAP(pv); 5315 if (!PMAP_TRYLOCK(pmap)) { 5316 pvh_gen = pvh->pv_gen; 5317 rw_wunlock(lock); 5318 PMAP_LOCK(pmap); 5319 rw_wlock(lock); 5320 if (pvh_gen != pvh->pv_gen) { 5321 rw_wunlock(lock); 5322 PMAP_UNLOCK(pmap); 5323 goto retry; 5324 } 5325 } 5326 va = pv->pv_va; 5327 l3e = pmap_pml3e(pmap, va); 5328 (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock); 5329 PMAP_UNLOCK(pmap); 5330 } 5331 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 5332 pmap = PV_PMAP(pv); 5333 if (!PMAP_TRYLOCK(pmap)) { 5334 pvh_gen = pvh->pv_gen; 5335 md_gen = m->md.pv_gen; 5336 rw_wunlock(lock); 5337 PMAP_LOCK(pmap); 5338 rw_wlock(lock); 5339 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5340 rw_wunlock(lock); 5341 PMAP_UNLOCK(pmap); 5342 goto retry; 5343 } 5344 } 5345 pmap_resident_count_dec(pmap, 1); 5346 l3e = pmap_pml3e(pmap, pv->pv_va); 5347 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found" 5348 " a 2mpage in page %p's pv list", m)); 5349 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 5350 tpte = be64toh(pte_load_clear(pte)); 5351 if (tpte & PG_W) 5352 pmap->pm_stats.wired_count--; 5353 if (tpte & PG_A) 5354 vm_page_aflag_set(m, PGA_REFERENCED); 5355 5356 /* 5357 * Update the vm_page_t clean and reference bits. 5358 */ 5359 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5360 vm_page_dirty(m); 5361 pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free); 5362 pmap_invalidate_page(pmap, pv->pv_va); 5363 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 5364 m->md.pv_gen++; 5365 free_pv_entry(pmap, pv); 5366 PMAP_UNLOCK(pmap); 5367 } 5368 vm_page_aflag_clear(m, PGA_WRITEABLE); 5369 rw_wunlock(lock); 5370 vm_page_free_pages_toq(&free, true); 5371 } 5372 5373 /* 5374 * Destroy all managed, non-wired mappings in the given user-space 5375 * pmap. 
This pmap cannot be active on any processor besides the
5376 * caller.
5377 *
5378 * This function cannot be applied to the kernel pmap. Moreover, it
5379 * is not intended for general use. It is only to be used during
5380 * process termination. Consequently, it can be implemented in ways
5381 * that make it faster than pmap_remove(). First, it can more quickly
5382 * destroy mappings by iterating over the pmap's collection of PV
5383 * entries, rather than searching the page table. Second, it doesn't
5384 * have to test and clear the page table entries atomically, because
5385 * no processor is currently accessing the user address space. In
5386 * particular, a page table entry's dirty bit won't change state once
5387 * this function starts.
5388 *
5389 * Although this function destroys all of the pmap's managed,
5390 * non-wired mappings, it can delay and batch the invalidation of TLB
5391 * entries without calling pmap_delayed_invl_started() and
5392 * pmap_delayed_invl_finished(). Because the pmap is not active on
5393 * any other processor, none of these TLB entries will ever be used
5394 * before their eventual invalidation. Consequently, there is no need
5395 * for either pmap_remove_all() or pmap_remove_write() to wait for
5396 * that eventual TLB invalidation.
5397 */
5398
5399 void
5400 mmu_radix_remove_pages(pmap_t pmap)
5401 {
5402
5403 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
5404 pml3_entry_t ptel3e;
5405 pt_entry_t *pte, tpte;
5406 struct spglist free;
5407 vm_page_t m, mpte, mt;
5408 pv_entry_t pv;
5409 struct md_page *pvh;
5410 struct pv_chunk *pc, *npc;
5411 struct rwlock *lock;
5412 int64_t bit;
5413 uint64_t inuse, bitmask;
5414 int allfree, field, freed, idx;
5415 boolean_t superpage;
5416 vm_paddr_t pa;
5417
5418 /*
5419 * Assert that the given pmap is only active on the current
5420 * CPU. Unfortunately, we cannot block another CPU from
5421 * activating the pmap while this function is executing.
5422 */
5423 KASSERT(pmap->pm_pid == mfspr(SPR_PID),
5424 ("non-current asid %lu - expected %lu", pmap->pm_pid,
5425 mfspr(SPR_PID)));
5426
5427 lock = NULL;
5428
5429 SLIST_INIT(&free);
5430 PMAP_LOCK(pmap);
5431 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5432 allfree = 1;
5433 freed = 0;
5434 for (field = 0; field < _NPCM; field++) {
5435 inuse = ~pc->pc_map[field] & pc_freemask[field];
5436 while (inuse != 0) {
5437 bit = cnttzd(inuse);
5438 bitmask = 1UL << bit;
5439 idx = field * 64 + bit;
5440 pv = &pc->pc_pventry[idx];
5441 inuse &= ~bitmask;
5442
5443 pte = pmap_pml2e(pmap, pv->pv_va);
5444 ptel3e = be64toh(*pte);
5445 pte = pmap_l2e_to_l3e(pte, pv->pv_va);
5446 tpte = be64toh(*pte);
5447 if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) {
5448 superpage = FALSE;
5449 ptel3e = tpte;
5450 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5451 PG_FRAME);
5452 pte = &pte[pmap_pte_index(pv->pv_va)];
5453 tpte = be64toh(*pte);
5454 } else {
5455 /*
5456 * Keep track of whether 'tpte' is a
5457 * superpage explicitly instead of
5458 * relying on RPTE_LEAF being set.
5459 *
5460 * This is because a valid 4KB leaf
5461 * PTE also has RPTE_LEAF set, so that
5462 * bit alone cannot distinguish a
5463 * regular page from a superpage.
5464 */ 5465 superpage = TRUE; 5466 } 5467 5468 if ((tpte & PG_V) == 0) { 5469 panic("bad pte va %lx pte %lx", 5470 pv->pv_va, tpte); 5471 } 5472 5473 /* 5474 * We cannot remove wired pages from a process' mapping at this time 5475 */ 5476 if (tpte & PG_W) { 5477 allfree = 0; 5478 continue; 5479 } 5480 5481 if (superpage) 5482 pa = tpte & PG_PS_FRAME; 5483 else 5484 pa = tpte & PG_FRAME; 5485 5486 m = PHYS_TO_VM_PAGE(pa); 5487 KASSERT(m->phys_addr == pa, 5488 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5489 m, (uintmax_t)m->phys_addr, 5490 (uintmax_t)tpte)); 5491 5492 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5493 m < &vm_page_array[vm_page_array_size], 5494 ("pmap_remove_pages: bad tpte %#jx", 5495 (uintmax_t)tpte)); 5496 5497 pte_clear(pte); 5498 5499 /* 5500 * Update the vm_page_t clean/reference bits. 5501 */ 5502 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5503 if (superpage) { 5504 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 5505 vm_page_dirty(mt); 5506 } else 5507 vm_page_dirty(m); 5508 } 5509 5510 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5511 5512 /* Mark free */ 5513 pc->pc_map[field] |= bitmask; 5514 if (superpage) { 5515 pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); 5516 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 5517 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 5518 pvh->pv_gen++; 5519 if (TAILQ_EMPTY(&pvh->pv_list)) { 5520 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 5521 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 5522 TAILQ_EMPTY(&mt->md.pv_list)) 5523 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5524 } 5525 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 5526 if (mpte != NULL) { 5527 pmap_resident_count_dec(pmap, 1); 5528 KASSERT(mpte->ref_count == NPTEPG, 5529 ("pmap_remove_pages: pte page wire count error")); 5530 mpte->ref_count = 0; 5531 pmap_add_delayed_free_list(mpte, &free, FALSE); 5532 } 5533 } else { 5534 pmap_resident_count_dec(pmap, 1); 5535 #ifdef VERBOSE_PV 5536 printf("freeing pv (%p, %p)\n", 5537 pmap, pv); 5538 #endif 5539 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 5540 m->md.pv_gen++; 5541 if ((m->a.flags & PGA_WRITEABLE) != 0 && 5542 TAILQ_EMPTY(&m->md.pv_list) && 5543 (m->flags & PG_FICTITIOUS) == 0) { 5544 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5545 if (TAILQ_EMPTY(&pvh->pv_list)) 5546 vm_page_aflag_clear(m, PGA_WRITEABLE); 5547 } 5548 } 5549 pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free); 5550 freed++; 5551 } 5552 } 5553 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5554 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5555 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5556 if (allfree) { 5557 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5558 free_pv_chunk(pc); 5559 } 5560 } 5561 if (lock != NULL) 5562 rw_wunlock(lock); 5563 pmap_invalidate_all(pmap); 5564 PMAP_UNLOCK(pmap); 5565 vm_page_free_pages_toq(&free, true); 5566 } 5567 5568 void 5569 mmu_radix_remove_write(vm_page_t m) 5570 { 5571 struct md_page *pvh; 5572 pmap_t pmap; 5573 struct rwlock *lock; 5574 pv_entry_t next_pv, pv; 5575 pml3_entry_t *l3e; 5576 pt_entry_t oldpte, *pte; 5577 int pvh_gen, md_gen; 5578 5579 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5580 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5581 ("pmap_remove_write: page %p is not managed", m)); 5582 vm_page_assert_busied(m); 5583 5584 if (!pmap_page_is_write_mapped(m)) 5585 return; 5586 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5587 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 5588 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5589 retry_pv_loop: 5590 rw_wlock(lock); 5591 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { 5592 pmap = PV_PMAP(pv); 5593 if (!PMAP_TRYLOCK(pmap)) { 5594 pvh_gen = pvh->pv_gen; 5595 rw_wunlock(lock); 5596 PMAP_LOCK(pmap); 5597 rw_wlock(lock); 5598 if (pvh_gen != pvh->pv_gen) { 5599 PMAP_UNLOCK(pmap); 5600 rw_wunlock(lock); 5601 goto retry_pv_loop; 5602 } 5603 } 5604 l3e = pmap_pml3e(pmap, pv->pv_va); 5605 if ((be64toh(*l3e) & PG_RW) != 0) 5606 (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock); 5607 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5608 ("inconsistent pv lock %p %p for page %p", 5609 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5610 PMAP_UNLOCK(pmap); 5611 } 5612 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 5613 pmap = PV_PMAP(pv); 5614 if (!PMAP_TRYLOCK(pmap)) { 5615 pvh_gen = pvh->pv_gen; 5616 md_gen = m->md.pv_gen; 5617 rw_wunlock(lock); 5618 PMAP_LOCK(pmap); 5619 rw_wlock(lock); 5620 if (pvh_gen != pvh->pv_gen || 5621 md_gen != m->md.pv_gen) { 5622 PMAP_UNLOCK(pmap); 5623 rw_wunlock(lock); 5624 goto retry_pv_loop; 5625 } 5626 } 5627 l3e = pmap_pml3e(pmap, pv->pv_va); 5628 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, 5629 ("pmap_remove_write: found a 2mpage in page %p's pv list", 5630 m)); 5631 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 5632 retry: 5633 oldpte = be64toh(*pte); 5634 if (oldpte & PG_RW) { 5635 if (!atomic_cmpset_long(pte, htobe64(oldpte), 5636 htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M)))) 5637 goto retry; 5638 if ((oldpte & PG_M) != 0) 5639 vm_page_dirty(m); 5640 pmap_invalidate_page(pmap, pv->pv_va); 5641 } 5642 PMAP_UNLOCK(pmap); 5643 } 5644 rw_wunlock(lock); 5645 vm_page_aflag_clear(m, PGA_WRITEABLE); 5646 } 5647 5648 /* 5649 * Clear the wired attribute from the mappings for the specified range of 5650 * addresses in the given pmap. Every valid mapping within that range 5651 * must have the wired attribute set. In contrast, invalid mappings 5652 * cannot have the wired attribute set, so they are ignored. 5653 * 5654 * The wired attribute of the page table entry is not a hardware 5655 * feature, so there is no need to invalidate any TLB entries. 5656 * Since pmap_demote_l3e() for the wired entry must never fail, 5657 * pmap_delayed_invl_started()/finished() calls around the 5658 * function are not needed. 5659 */ 5660 void 5661 mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5662 { 5663 vm_offset_t va_next; 5664 pml1_entry_t *l1e; 5665 pml2_entry_t *l2e; 5666 pml3_entry_t *l3e; 5667 pt_entry_t *pte; 5668 5669 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); 5670 PMAP_LOCK(pmap); 5671 for (; sva < eva; sva = va_next) { 5672 l1e = pmap_pml1e(pmap, sva); 5673 if ((be64toh(*l1e) & PG_V) == 0) { 5674 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 5675 if (va_next < sva) 5676 va_next = eva; 5677 continue; 5678 } 5679 l2e = pmap_l1e_to_l2e(l1e, sva); 5680 if ((be64toh(*l2e) & PG_V) == 0) { 5681 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 5682 if (va_next < sva) 5683 va_next = eva; 5684 continue; 5685 } 5686 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 5687 if (va_next < sva) 5688 va_next = eva; 5689 l3e = pmap_l2e_to_l3e(l2e, sva); 5690 if ((be64toh(*l3e) & PG_V) == 0) 5691 continue; 5692 if ((be64toh(*l3e) & RPTE_LEAF) != 0) { 5693 if ((be64toh(*l3e) & PG_W) == 0) 5694 panic("pmap_unwire: pde %#jx is missing PG_W", 5695 (uintmax_t)(be64toh(*l3e))); 5696 5697 /* 5698 * Are we unwiring the entire large page? If not, 5699 * demote the mapping and fall through. 
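* Demotion of a wired mapping must not fail, because the page
* table page is preserved for wired superpages; hence the panic
* below if it does.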
5700 */ 5701 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 5702 atomic_clear_long(l3e, htobe64(PG_W)); 5703 pmap->pm_stats.wired_count -= L3_PAGE_SIZE / 5704 PAGE_SIZE; 5705 continue; 5706 } else if (!pmap_demote_l3e(pmap, l3e, sva)) 5707 panic("pmap_unwire: demotion failed"); 5708 } 5709 if (va_next > eva) 5710 va_next = eva; 5711 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, 5712 sva += PAGE_SIZE) { 5713 MPASS(pte == pmap_pte(pmap, sva)); 5714 if ((be64toh(*pte) & PG_V) == 0) 5715 continue; 5716 if ((be64toh(*pte) & PG_W) == 0) 5717 panic("pmap_unwire: pte %#jx is missing PG_W", 5718 (uintmax_t)(be64toh(*pte))); 5719 5720 /* 5721 * PG_W must be cleared atomically. Although the pmap 5722 * lock synchronizes access to PG_W, another processor 5723 * could be setting PG_M and/or PG_A concurrently. 5724 */ 5725 atomic_clear_long(pte, htobe64(PG_W)); 5726 pmap->pm_stats.wired_count--; 5727 } 5728 } 5729 PMAP_UNLOCK(pmap); 5730 } 5731 5732 void 5733 mmu_radix_zero_page(vm_page_t m) 5734 { 5735 vm_offset_t addr; 5736 5737 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5738 addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5739 pagezero(addr); 5740 } 5741 5742 void 5743 mmu_radix_zero_page_area(vm_page_t m, int off, int size) 5744 { 5745 caddr_t addr; 5746 5747 CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); 5748 MPASS(off + size <= PAGE_SIZE); 5749 addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5750 memset(addr + off, 0, size); 5751 } 5752 5753 static int 5754 mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5755 { 5756 pml3_entry_t *l3ep; 5757 pt_entry_t pte; 5758 vm_paddr_t pa; 5759 int val; 5760 5761 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); 5762 PMAP_LOCK(pmap); 5763 5764 l3ep = pmap_pml3e(pmap, addr); 5765 if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) { 5766 if (be64toh(*l3ep) & RPTE_LEAF) { 5767 pte = be64toh(*l3ep); 5768 /* Compute the physical address of the 4KB page. */ 5769 pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) & 5770 PG_FRAME; 5771 val = MINCORE_PSIND(1); 5772 } else { 5773 /* Native endian PTE, do not pass to functions */ 5774 pte = be64toh(*pmap_l3e_to_pte(l3ep, addr)); 5775 pa = pte & PG_FRAME; 5776 val = 0; 5777 } 5778 } else { 5779 pte = 0; 5780 pa = 0; 5781 val = 0; 5782 } 5783 if ((pte & PG_V) != 0) { 5784 val |= MINCORE_INCORE; 5785 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5786 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5787 if ((pte & PG_A) != 0) 5788 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5789 } 5790 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5791 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5792 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5793 *locked_pa = pa; 5794 } 5795 PMAP_UNLOCK(pmap); 5796 return (val); 5797 } 5798 5799 void 5800 mmu_radix_activate(struct thread *td) 5801 { 5802 pmap_t pmap; 5803 uint32_t curpid; 5804 5805 CTR2(KTR_PMAP, "%s(%p)", __func__, td); 5806 critical_enter(); 5807 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5808 curpid = mfspr(SPR_PID); 5809 if (pmap->pm_pid > isa3_base_pid && 5810 curpid != pmap->pm_pid) { 5811 mmu_radix_pid_set(pmap); 5812 } 5813 critical_exit(); 5814 } 5815 5816 /* 5817 * Increase the starting virtual address of the given mapping if a 5818 * different alignment might result in more superpage mappings. 
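 * The resulting address is the lowest address >= *addr whose offset
 * within a 2MB (L3_PAGE_SIZE) frame equals the object offset's
 * position within its own 2MB frame, so object pages and their
 * mappings line up on superpage boundaries.  For example (values
 * purely illustrative): with superpage_offset 0x10000, *addr
 * 0x7fff00005000 becomes 0x7fff00010000, while *addr 0x7fff00015000
 * is pushed into the next frame and becomes 0x7fff00210000.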
5819 */ 5820 void 5821 mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset, 5822 vm_offset_t *addr, vm_size_t size) 5823 { 5824 5825 CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr, 5826 size); 5827 vm_offset_t superpage_offset; 5828 5829 if (size < L3_PAGE_SIZE) 5830 return; 5831 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5832 offset += ptoa(object->pg_color); 5833 superpage_offset = offset & L3_PAGE_MASK; 5834 if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE || 5835 (*addr & L3_PAGE_MASK) == superpage_offset) 5836 return; 5837 if ((*addr & L3_PAGE_MASK) < superpage_offset) 5838 *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset; 5839 else 5840 *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset; 5841 } 5842 5843 static void * 5844 mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr) 5845 { 5846 vm_offset_t va, tmpva, ppa, offset; 5847 5848 ppa = trunc_page(pa); 5849 offset = pa & PAGE_MASK; 5850 size = roundup2(offset + size, PAGE_SIZE); 5851 if (pa < powerpc_ptob(Maxmem)) 5852 panic("bad pa: %#lx less than Maxmem %#lx\n", 5853 pa, powerpc_ptob(Maxmem)); 5854 va = kva_alloc(size); 5855 if (bootverbose) 5856 printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr); 5857 KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr)); 5858 5859 if (!va) 5860 panic("%s: Couldn't alloc kernel virtual memory", __func__); 5861 5862 for (tmpva = va; size > 0;) { 5863 mmu_radix_kenter_attr(tmpva, ppa, attr); 5864 size -= PAGE_SIZE; 5865 tmpva += PAGE_SIZE; 5866 ppa += PAGE_SIZE; 5867 } 5868 ptesync(); 5869 5870 return ((void *)(va + offset)); 5871 } 5872 5873 static void * 5874 mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size) 5875 { 5876 5877 CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); 5878 5879 return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT)); 5880 } 5881 5882 void 5883 mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5884 { 5885 5886 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); 5887 m->md.mdpg_cache_attrs = ma; 5888 5889 /* 5890 * If "m" is a normal page, update its direct mapping. This update 5891 * can be relied upon to perform any cache operations that are 5892 * required for data coherence. 5893 */ 5894 if ((m->flags & PG_FICTITIOUS) == 0 && 5895 mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 5896 PAGE_SIZE, m->md.mdpg_cache_attrs)) 5897 panic("memory attribute change on the direct map failed"); 5898 } 5899 5900 static void 5901 mmu_radix_unmapdev(vm_offset_t va, vm_size_t size) 5902 { 5903 vm_offset_t offset; 5904 5905 CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size); 5906 /* If we gave a direct map region in pmap_mapdev, do nothing */ 5907 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 5908 return; 5909 5910 offset = va & PAGE_MASK; 5911 size = round_page(offset + size); 5912 va = trunc_page(va); 5913 5914 if (pmap_initialized) { 5915 mmu_radix_qremove(va, atop(size)); 5916 kva_free(va, size); 5917 } 5918 } 5919 5920 static __inline void 5921 pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask) 5922 { 5923 uint64_t opte, npte; 5924 5925 /* 5926 * The cache mode bits are all in the low 32-bits of the 5927 * PTE, so we can just spin on updating the low 32-bits. 5928 */ 5929 do { 5930 opte = be64toh(*pte); 5931 npte = opte & ~mask; 5932 npte |= cache_bits; 5933 } while (npte != opte && !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte))); 5934 } 5935 5936 /* 5937 * Tries to demote a 1GB page mapping. 
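 * The demotion allocates a page table page, fills it with NPDEPG 2MB
 * leaf entries that inherit the access and attribute bits of the old
 * 1GB entry (the physical address advancing by L3_PAGE_SIZE per
 * entry), re-points the l2e at that page, and then flushes the TLB
 * and page walk cache.  FALSE is returned only if the page table page
 * cannot be allocated.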
5938 */ 5939 static boolean_t 5940 pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va) 5941 { 5942 pml2_entry_t oldpdpe; 5943 pml3_entry_t *firstpde, newpde, *pde; 5944 vm_paddr_t pdpgpa; 5945 vm_page_t pdpg; 5946 5947 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5948 oldpdpe = be64toh(*l2e); 5949 KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), 5950 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 5951 pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT, 5952 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); 5953 if (pdpg == NULL) { 5954 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 5955 " in pmap %p", va, pmap); 5956 return (FALSE); 5957 } 5958 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 5959 firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa); 5960 KASSERT((oldpdpe & PG_A) != 0, 5961 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 5962 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 5963 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 5964 newpde = oldpdpe; 5965 5966 /* 5967 * Initialize the page directory page. 5968 */ 5969 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 5970 *pde = htobe64(newpde); 5971 newpde += L3_PAGE_SIZE; 5972 } 5973 5974 /* 5975 * Demote the mapping. 5976 */ 5977 pde_store(l2e, pdpgpa); 5978 5979 /* 5980 * Flush PWC --- XXX revisit 5981 */ 5982 pmap_invalidate_all(pmap); 5983 5984 counter_u64_add(pmap_l2e_demotions, 1); 5985 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 5986 " in pmap %p", va, pmap); 5987 return (TRUE); 5988 } 5989 5990 vm_paddr_t 5991 mmu_radix_kextract(vm_offset_t va) 5992 { 5993 pml3_entry_t l3e; 5994 vm_paddr_t pa; 5995 5996 CTR2(KTR_PMAP, "%s(%#x)", __func__, va); 5997 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 5998 pa = DMAP_TO_PHYS(va); 5999 } else { 6000 /* Big-endian PTE on stack */ 6001 l3e = *pmap_pml3e(kernel_pmap, va); 6002 if (be64toh(l3e) & RPTE_LEAF) { 6003 pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK); 6004 pa |= (va & L3_PAGE_MASK); 6005 } else { 6006 /* 6007 * Beware of a concurrent promotion that changes the 6008 * PDE at this point! For example, vtopte() must not 6009 * be used to access the PTE because it would use the 6010 * new PDE. It is, however, safe to use the old PDE 6011 * because the page table page is preserved by the 6012 * promotion. 6013 */ 6014 pa = be64toh(*pmap_l3e_to_pte(&l3e, va)); 6015 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 6016 pa |= (va & PAGE_MASK); 6017 } 6018 } 6019 return (pa); 6020 } 6021 6022 static pt_entry_t 6023 mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) 6024 { 6025 6026 if (ma != VM_MEMATTR_DEFAULT) { 6027 return pmap_cache_bits(ma); 6028 } 6029 6030 /* 6031 * Assume the page is cache inhibited and access is guarded unless 6032 * it's in our available memory array. 
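 * In other words, only physical addresses that fall inside one of the
 * memory regions the platform reported (pregions[]) are mapped with
 * the regular cacheable memory attribute (RPTE_ATTR_MEM); everything
 * else is treated as device space and mapped cache-inhibited and
 * guarded (RPTE_ATTR_GUARDEDIO).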
6033 */ 6034 for (int i = 0; i < pregions_sz; i++) { 6035 if ((pa >= pregions[i].mr_start) && 6036 (pa < (pregions[i].mr_start + pregions[i].mr_size))) 6037 return (RPTE_ATTR_MEM); 6038 } 6039 return (RPTE_ATTR_GUARDEDIO); 6040 } 6041 6042 static void 6043 mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) 6044 { 6045 pt_entry_t *pte, pteval; 6046 uint64_t cache_bits; 6047 6048 pte = kvtopte(va); 6049 MPASS(pte != NULL); 6050 pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; 6051 cache_bits = mmu_radix_calc_wimg(pa, ma); 6052 pte_store(pte, pteval | cache_bits); 6053 } 6054 6055 void 6056 mmu_radix_kremove(vm_offset_t va) 6057 { 6058 pt_entry_t *pte; 6059 6060 CTR2(KTR_PMAP, "%s(%#x)", __func__, va); 6061 6062 pte = kvtopte(va); 6063 pte_clear(pte); 6064 } 6065 6066 int 6067 mmu_radix_decode_kernel_ptr(vm_offset_t addr, 6068 int *is_user, vm_offset_t *decoded) 6069 { 6070 6071 CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr); 6072 *decoded = addr; 6073 *is_user = (addr < VM_MAXUSER_ADDRESS); 6074 return (0); 6075 } 6076 6077 static boolean_t 6078 mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) 6079 { 6080 6081 CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); 6082 return (mem_valid(pa, size)); 6083 } 6084 6085 static void 6086 mmu_radix_scan_init() 6087 { 6088 6089 CTR1(KTR_PMAP, "%s()", __func__); 6090 UNIMPLEMENTED(); 6091 } 6092 6093 static void 6094 mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, 6095 void **va) 6096 { 6097 CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va); 6098 UNIMPLEMENTED(); 6099 } 6100 6101 vm_offset_t 6102 mmu_radix_quick_enter_page(vm_page_t m) 6103 { 6104 vm_paddr_t paddr; 6105 6106 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 6107 paddr = VM_PAGE_TO_PHYS(m); 6108 return (PHYS_TO_DMAP(paddr)); 6109 } 6110 6111 void 6112 mmu_radix_quick_remove_page(vm_offset_t addr __unused) 6113 { 6114 /* no work to do here */ 6115 CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); 6116 } 6117 6118 static void 6119 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 6120 { 6121 cpu_flush_dcache((void *)sva, eva - sva); 6122 } 6123 6124 int 6125 mmu_radix_change_attr(vm_offset_t va, vm_size_t size, 6126 vm_memattr_t mode) 6127 { 6128 int error; 6129 6130 CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode); 6131 PMAP_LOCK(kernel_pmap); 6132 error = pmap_change_attr_locked(va, size, mode, true); 6133 PMAP_UNLOCK(kernel_pmap); 6134 return (error); 6135 } 6136 6137 static int 6138 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush) 6139 { 6140 vm_offset_t base, offset, tmpva; 6141 vm_paddr_t pa_start, pa_end, pa_end1; 6142 pml2_entry_t *l2e; 6143 pml3_entry_t *l3e; 6144 pt_entry_t *pte; 6145 int cache_bits, error; 6146 boolean_t changed; 6147 6148 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6149 base = trunc_page(va); 6150 offset = va & PAGE_MASK; 6151 size = round_page(offset + size); 6152 6153 /* 6154 * Only supported on kernel virtual addresses, including the direct 6155 * map but excluding the recursive map. 6156 */ 6157 if (base < DMAP_MIN_ADDRESS) 6158 return (EINVAL); 6159 6160 cache_bits = pmap_cache_bits(mode); 6161 changed = FALSE; 6162 6163 /* 6164 * Pages that aren't mapped aren't supported. Also break down 2MB pages 6165 * into 4KB pages if required. 
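 * This first pass only checks that every page in the range is mapped
 * and demotes any 1GB or 2MB leaf whose attributes differ but which
 * the range does not cover in full; no attribute bits are changed
 * here.  The second pass below rewrites the attribute bits and, for
 * physical ranges that are also covered by the direct map, calls
 * itself recursively on the corresponding DMAP addresses so that both
 * aliases stay consistent.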
6166 */ 6167 for (tmpva = base; tmpva < base + size; ) { 6168 l2e = pmap_pml2e(kernel_pmap, tmpva); 6169 if (l2e == NULL || *l2e == 0) 6170 return (EINVAL); 6171 if (be64toh(*l2e) & RPTE_LEAF) { 6172 /* 6173 * If the current 1GB page already has the required 6174 * memory type, then we need not demote this page. Just 6175 * increment tmpva to the next 1GB page frame. 6176 */ 6177 if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) { 6178 tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; 6179 continue; 6180 } 6181 6182 /* 6183 * If the current offset aligns with a 1GB page frame 6184 * and there is at least 1GB left within the range, then 6185 * we need not break down this page into 2MB pages. 6186 */ 6187 if ((tmpva & L2_PAGE_MASK) == 0 && 6188 tmpva + L2_PAGE_MASK < base + size) { 6189 tmpva += L2_PAGE_MASK; 6190 continue; 6191 } 6192 if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva)) 6193 return (ENOMEM); 6194 } 6195 l3e = pmap_l2e_to_l3e(l2e, tmpva); 6196 KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n", 6197 tmpva, l2e)); 6198 if (*l3e == 0) 6199 return (EINVAL); 6200 if (be64toh(*l3e) & RPTE_LEAF) { 6201 /* 6202 * If the current 2MB page already has the required 6203 * memory type, then we need not demote this page. Just 6204 * increment tmpva to the next 2MB page frame. 6205 */ 6206 if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) { 6207 tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; 6208 continue; 6209 } 6210 6211 /* 6212 * If the current offset aligns with a 2MB page frame 6213 * and there is at least 2MB left within the range, then 6214 * we need not break down this page into 4KB pages. 6215 */ 6216 if ((tmpva & L3_PAGE_MASK) == 0 && 6217 tmpva + L3_PAGE_MASK < base + size) { 6218 tmpva += L3_PAGE_SIZE; 6219 continue; 6220 } 6221 if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva)) 6222 return (ENOMEM); 6223 } 6224 pte = pmap_l3e_to_pte(l3e, tmpva); 6225 if (*pte == 0) 6226 return (EINVAL); 6227 tmpva += PAGE_SIZE; 6228 } 6229 error = 0; 6230 6231 /* 6232 * Ok, all the pages exist, so run through them updating their 6233 * cache mode if required. 6234 */ 6235 pa_start = pa_end = 0; 6236 for (tmpva = base; tmpva < base + size; ) { 6237 l2e = pmap_pml2e(kernel_pmap, tmpva); 6238 if (be64toh(*l2e) & RPTE_LEAF) { 6239 if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) { 6240 pmap_pte_attr(l2e, cache_bits, 6241 RPTE_ATTR_MASK); 6242 changed = TRUE; 6243 } 6244 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6245 (*l2e & PG_PS_FRAME) < dmaplimit) { 6246 if (pa_start == pa_end) { 6247 /* Start physical address run. */ 6248 pa_start = be64toh(*l2e) & PG_PS_FRAME; 6249 pa_end = pa_start + L2_PAGE_SIZE; 6250 } else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME)) 6251 pa_end += L2_PAGE_SIZE; 6252 else { 6253 /* Run ended, update direct map. */ 6254 error = pmap_change_attr_locked( 6255 PHYS_TO_DMAP(pa_start), 6256 pa_end - pa_start, mode, flush); 6257 if (error != 0) 6258 break; 6259 /* Start physical address run. */ 6260 pa_start = be64toh(*l2e) & PG_PS_FRAME; 6261 pa_end = pa_start + L2_PAGE_SIZE; 6262 } 6263 } 6264 tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; 6265 continue; 6266 } 6267 l3e = pmap_l2e_to_l3e(l2e, tmpva); 6268 if (be64toh(*l3e) & RPTE_LEAF) { 6269 if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) { 6270 pmap_pte_attr(l3e, cache_bits, 6271 RPTE_ATTR_MASK); 6272 changed = TRUE; 6273 } 6274 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6275 (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) { 6276 if (pa_start == pa_end) { 6277 /* Start physical address run. 
*/ 6278 pa_start = be64toh(*l3e) & PG_PS_FRAME; 6279 pa_end = pa_start + L3_PAGE_SIZE; 6280 } else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME)) 6281 pa_end += L3_PAGE_SIZE; 6282 else { 6283 /* Run ended, update direct map. */ 6284 error = pmap_change_attr_locked( 6285 PHYS_TO_DMAP(pa_start), 6286 pa_end - pa_start, mode, flush); 6287 if (error != 0) 6288 break; 6289 /* Start physical address run. */ 6290 pa_start = be64toh(*l3e) & PG_PS_FRAME; 6291 pa_end = pa_start + L3_PAGE_SIZE; 6292 } 6293 } 6294 tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; 6295 } else { 6296 pte = pmap_l3e_to_pte(l3e, tmpva); 6297 if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) { 6298 pmap_pte_attr(pte, cache_bits, 6299 RPTE_ATTR_MASK); 6300 changed = TRUE; 6301 } 6302 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 6303 (be64toh(*pte) & PG_FRAME) < dmaplimit) { 6304 if (pa_start == pa_end) { 6305 /* Start physical address run. */ 6306 pa_start = be64toh(*pte) & PG_FRAME; 6307 pa_end = pa_start + PAGE_SIZE; 6308 } else if (pa_end == (be64toh(*pte) & PG_FRAME)) 6309 pa_end += PAGE_SIZE; 6310 else { 6311 /* Run ended, update direct map. */ 6312 error = pmap_change_attr_locked( 6313 PHYS_TO_DMAP(pa_start), 6314 pa_end - pa_start, mode, flush); 6315 if (error != 0) 6316 break; 6317 /* Start physical address run. */ 6318 pa_start = be64toh(*pte) & PG_FRAME; 6319 pa_end = pa_start + PAGE_SIZE; 6320 } 6321 } 6322 tmpva += PAGE_SIZE; 6323 } 6324 } 6325 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 6326 pa_end1 = MIN(pa_end, dmaplimit); 6327 if (pa_start != pa_end1) 6328 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 6329 pa_end1 - pa_start, mode, flush); 6330 } 6331 6332 /* 6333 * Flush CPU caches if required to make sure any data isn't cached that 6334 * shouldn't be, etc. 6335 */ 6336 if (changed) { 6337 pmap_invalidate_all(kernel_pmap); 6338 6339 if (flush) 6340 pmap_invalidate_cache_range(base, tmpva); 6341 } 6342 return (error); 6343 } 6344 6345 /* 6346 * Allocate physical memory for the vm_page array and map it into KVA, 6347 * attempting to back the vm_pages with domain-local memory. 6348 */ 6349 void 6350 mmu_radix_page_array_startup(long pages) 6351 { 6352 #ifdef notyet 6353 pml2_entry_t *l2e; 6354 pml3_entry_t *pde; 6355 pml3_entry_t newl3; 6356 vm_offset_t va; 6357 long pfn; 6358 int domain, i; 6359 #endif 6360 vm_paddr_t pa; 6361 vm_offset_t start, end; 6362 6363 vm_page_array_size = pages; 6364 6365 start = VM_MIN_KERNEL_ADDRESS; 6366 end = start + pages * sizeof(struct vm_page); 6367 6368 pa = vm_phys_early_alloc(0, end - start); 6369 6370 start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT); 6371 #ifdef notyet 6372 /* TODO: NUMA vm_page_array. Blocked out until then (copied from amd64). 
*/ 6373 for (va = start; va < end; va += L3_PAGE_SIZE) { 6374 pfn = first_page + (va - start) / sizeof(struct vm_page); 6375 domain = vm_phys_domain(ptoa(pfn)); 6376 l2e = pmap_pml2e(kernel_pmap, va); 6377 if ((be64toh(*l2e) & PG_V) == 0) { 6378 pa = vm_phys_early_alloc(domain, PAGE_SIZE); 6379 dump_add_page(pa); 6380 pagezero(PHYS_TO_DMAP(pa)); 6381 pde_store(l2e, (pml2_entry_t)pa); 6382 } 6383 pde = pmap_l2e_to_l3e(l2e, va); 6384 if ((be64toh(*pde) & PG_V) != 0) 6385 panic("Unexpected pde %p", pde); 6386 pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE); 6387 for (i = 0; i < NPDEPG; i++) 6388 dump_add_page(pa + i * PAGE_SIZE); 6389 newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W); 6390 pte_store(pde, newl3); 6391 } 6392 #endif 6393 vm_page_array = (vm_page_t)start; 6394 } 6395 6396 #ifdef DDB 6397 #include <sys/kdb.h> 6398 #include <ddb/ddb.h> 6399 6400 static void 6401 pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va) 6402 { 6403 pml1_entry_t *l1e; 6404 pml2_entry_t *l2e; 6405 pml3_entry_t *l3e; 6406 pt_entry_t *pte; 6407 6408 l1e = &l1[pmap_pml1e_index(va)]; 6409 db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e)); 6410 if ((be64toh(*l1e) & PG_V) == 0) { 6411 db_printf("\n"); 6412 return; 6413 } 6414 l2e = pmap_l1e_to_l2e(l1e, va); 6415 db_printf(" l2e %#016lx", be64toh(*l2e)); 6416 if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) { 6417 db_printf("\n"); 6418 return; 6419 } 6420 l3e = pmap_l2e_to_l3e(l2e, va); 6421 db_printf(" l3e %#016lx", be64toh(*l3e)); 6422 if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) { 6423 db_printf("\n"); 6424 return; 6425 } 6426 pte = pmap_l3e_to_pte(l3e, va); 6427 db_printf(" pte %#016lx\n", be64toh(*pte)); 6428 } 6429 6430 void 6431 pmap_page_print_mappings(vm_page_t m) 6432 { 6433 pmap_t pmap; 6434 pv_entry_t pv; 6435 6436 db_printf("page %p(%lx)\n", m, m->phys_addr); 6437 /* need to elide locks if running in ddb */ 6438 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 6439 db_printf("pv: %p ", pv); 6440 db_printf("va: %#016lx ", pv->pv_va); 6441 pmap = PV_PMAP(pv); 6442 db_printf("pmap %p ", pmap); 6443 if (pmap != NULL) { 6444 db_printf("asid: %lu\n", pmap->pm_pid); 6445 pmap_pte_walk(pmap->pm_pml1, pv->pv_va); 6446 } 6447 } 6448 } 6449 6450 DB_SHOW_COMMAND(pte, pmap_print_pte) 6451 { 6452 vm_offset_t va; 6453 pmap_t pmap; 6454 6455 if (!have_addr) { 6456 db_printf("show pte addr\n"); 6457 return; 6458 } 6459 va = (vm_offset_t)addr; 6460 6461 if (va >= DMAP_MIN_ADDRESS) 6462 pmap = kernel_pmap; 6463 else if (kdb_thread != NULL) 6464 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); 6465 else 6466 pmap = vmspace_pmap(curthread->td_proc->p_vmspace); 6467 6468 pmap_pte_walk(pmap->pm_pml1, va); 6469 } 6470 6471 #endif 6472
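/*
 * Example use of the DDB helpers above (illustrative only; the address
 * below is hypothetical and the printed values depend entirely on the
 * live page tables):
 *
 *	db> show pte 0xc000000001000000
 *	VA 0xc000000001000000 l1e 0x... l2e 0x... l3e 0x... pte 0x...
 *
 * "show pte" picks kernel_pmap for addresses at or above
 * DMAP_MIN_ADDRESS and otherwise the pmap of the debugged thread's
 * process (falling back to curthread), then walks the radix tree one
 * level at a time, stopping early at an invalid or leaf entry.
 */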