1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2018 Matthew Macy 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include <sys/param.h> 32 #include <sys/kernel.h> 33 #include <sys/systm.h> 34 #include <sys/conf.h> 35 #include <sys/bitstring.h> 36 #include <sys/queue.h> 37 #include <sys/cpuset.h> 38 #include <sys/endian.h> 39 #include <sys/kerneldump.h> 40 #include <sys/ktr.h> 41 #include <sys/lock.h> 42 #include <sys/syslog.h> 43 #include <sys/msgbuf.h> 44 #include <sys/malloc.h> 45 #include <sys/mman.h> 46 #include <sys/mutex.h> 47 #include <sys/proc.h> 48 #include <sys/rwlock.h> 49 #include <sys/sched.h> 50 #include <sys/sysctl.h> 51 #include <sys/systm.h> 52 #include <sys/vmem.h> 53 #include <sys/vmmeter.h> 54 #include <sys/smp.h> 55 56 #include <sys/kdb.h> 57 58 #include <dev/ofw/openfirm.h> 59 60 #include <vm/vm.h> 61 #include <vm/pmap.h> 62 #include <vm/vm_param.h> 63 #include <vm/vm_kern.h> 64 #include <vm/vm_page.h> 65 #include <vm/vm_map.h> 66 #include <vm/vm_object.h> 67 #include <vm/vm_extern.h> 68 #include <vm/vm_pageout.h> 69 #include <vm/vm_phys.h> 70 #include <vm/vm_reserv.h> 71 #include <vm/vm_dumpset.h> 72 #include <vm/uma.h> 73 74 #include <machine/_inttypes.h> 75 #include <machine/cpu.h> 76 #include <machine/platform.h> 77 #include <machine/frame.h> 78 #include <machine/md_var.h> 79 #include <machine/psl.h> 80 #include <machine/bat.h> 81 #include <machine/hid.h> 82 #include <machine/pte.h> 83 #include <machine/sr.h> 84 #include <machine/trap.h> 85 #include <machine/mmuvar.h> 86 87 #ifdef INVARIANTS 88 #include <vm/uma_dbg.h> 89 #endif 90 91 #define PPC_BITLSHIFT(bit) (sizeof(long)*NBBY - 1 - (bit)) 92 #define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) 93 #define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit)) 94 95 #include "opt_ddb.h" 96 #ifdef DDB 97 static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va); 98 #endif 99 100 #define PG_W RPTE_WIRED 101 #define PG_V RPTE_VALID 102 #define PG_MANAGED RPTE_MANAGED 103 #define PG_PROMOTED RPTE_PROMOTED 104 #define PG_M RPTE_C 105 #define PG_A RPTE_R 106 #define PG_X RPTE_EAA_X 107 #define PG_RW RPTE_EAA_W 108 #define PG_PTE_CACHE RPTE_ATTR_MASK 109 110 #define RPTE_SHIFT 9 111 #define NLS_MASK ((1UL<<5)-1) 
112 #define RPTE_ENTRIES (1UL<<RPTE_SHIFT) 113 #define RPTE_MASK (RPTE_ENTRIES-1) 114 115 #define NLB_SHIFT 0 116 #define NLB_MASK (((1UL<<52)-1) << 8) 117 118 extern int nkpt; 119 extern caddr_t crashdumpmap; 120 121 #define RIC_FLUSH_TLB 0 122 #define RIC_FLUSH_PWC 1 123 #define RIC_FLUSH_ALL 2 124 125 #define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */ 126 127 #define PPC_INST_TLBIE 0x7c000264 128 #define PPC_INST_TLBIEL 0x7c000224 129 #define PPC_INST_SLBIA 0x7c0003e4 130 131 #define ___PPC_RA(a) (((a) & 0x1f) << 16) 132 #define ___PPC_RB(b) (((b) & 0x1f) << 11) 133 #define ___PPC_RS(s) (((s) & 0x1f) << 21) 134 #define ___PPC_RT(t) ___PPC_RS(t) 135 #define ___PPC_R(r) (((r) & 0x1) << 16) 136 #define ___PPC_PRS(prs) (((prs) & 0x1) << 17) 137 #define ___PPC_RIC(ric) (((ric) & 0x3) << 18) 138 139 #define PPC_SLBIA(IH) __XSTRING(.long PPC_INST_SLBIA | \ 140 ((IH & 0x7) << 21)) 141 #define PPC_TLBIE_5(rb,rs,ric,prs,r) \ 142 __XSTRING(.long PPC_INST_TLBIE | \ 143 ___PPC_RB(rb) | ___PPC_RS(rs) | \ 144 ___PPC_RIC(ric) | ___PPC_PRS(prs) | \ 145 ___PPC_R(r)) 146 147 #define PPC_TLBIEL(rb,rs,ric,prs,r) \ 148 __XSTRING(.long PPC_INST_TLBIEL | \ 149 ___PPC_RB(rb) | ___PPC_RS(rs) | \ 150 ___PPC_RIC(ric) | ___PPC_PRS(prs) | \ 151 ___PPC_R(r)) 152 153 #define PPC_INVALIDATE_ERAT PPC_SLBIA(7) 154 155 static __inline void 156 ttusync(void) 157 { 158 __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); 159 } 160 161 #define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */ 162 #define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */ 163 #define TLBIEL_INVAL_SET_PID 0x400 /* invalidate a set for the current PID */ 164 #define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */ 165 #define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */ 166 167 #define TLBIE_ACTUAL_PAGE_MASK 0xe0 168 #define TLBIE_ACTUAL_PAGE_4K 0x00 169 #define TLBIE_ACTUAL_PAGE_64K 0xa0 170 #define TLBIE_ACTUAL_PAGE_2M 0x20 171 #define TLBIE_ACTUAL_PAGE_1G 0x40 172 173 #define TLBIE_PRS_PARTITION_SCOPE 0x0 174 #define TLBIE_PRS_PROCESS_SCOPE 0x1 175 176 #define TLBIE_RIC_INVALIDATE_TLB 0x0 /* Invalidate just TLB */ 177 #define TLBIE_RIC_INVALIDATE_PWC 0x1 /* Invalidate just PWC */ 178 #define TLBIE_RIC_INVALIDATE_ALL 0x2 /* Invalidate TLB, PWC, 179 * cached {proc, part}tab entries 180 */ 181 #define TLBIE_RIC_INVALIDATE_SEQ 0x3 /* HPT - only: 182 * Invalidate a range of translations 183 */ 184 185 static __always_inline void 186 radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid, 187 vm_offset_t va, uint16_t ap) 188 { 189 uint64_t rb, rs; 190 191 MPASS((va & PAGE_MASK) == 0); 192 193 rs = ((uint64_t)pid << 32) | lpid; 194 rb = va | is | ap; 195 __asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : : 196 "r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory"); 197 } 198 199 static __inline void 200 radix_tlbie_fixup(uint32_t pid, vm_offset_t va, int ap) 201 { 202 203 __asm __volatile("ptesync" ::: "memory"); 204 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 205 TLBIEL_INVAL_PAGE, 0, 0, va, ap); 206 __asm __volatile("ptesync" ::: "memory"); 207 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 208 TLBIEL_INVAL_PAGE, pid, 0, va, ap); 209 } 210 211 static __inline void 212 radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va) 213 { 214 215 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 216 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K); 217 radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_4K); 218 } 219 220 
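/*
 * Editor's note (illustrative, not from the original sources): the wrappers
 * above and below all funnel into radix_tlbie(), which packs its operands
 * as follows:
 *
 *	rs = ((uint64_t)pid << 32) | lpid;	- translation identifiers
 *	rb = va | is | ap;			- page-aligned EA, invalidation
 *						  selector and actual page size
 *
 * with RIC (what to invalidate), PRS (process vs. partition scope) and
 * R = 1 (radix) encoded as immediates in the tlbie instruction word.
 * For example, assuming pid = 5 and va = 0x10000000, the call
 * radix_tlbie_invlpg_user_4k(5, 0x10000000) issues a tlbie with
 *
 *	rs = 0x0000000500000000
 *	rb = 0x0000000010000000	(IS = TLBIEL_INVAL_PAGE and
 *				 AP = TLBIE_ACTUAL_PAGE_4K are both 0)
 *
 * and RIC = TLBIE_RIC_INVALIDATE_TLB, PRS = TLBIE_PRS_PROCESS_SCOPE.
 * radix_tlbie_fixup() then repeats the invalidation, first for PID 0 and
 * then for the caller's PID, each preceded by a ptesync, presumably
 * mirroring the extra-tlbie workaround required on some POWER9 revisions.
 */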
static __inline void 221 radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va) 222 { 223 224 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 225 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M); 226 radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_2M); 227 } 228 229 static __inline void 230 radix_tlbie_invlpwc_user(uint32_t pid) 231 { 232 233 radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE, 234 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0); 235 } 236 237 static __inline void 238 radix_tlbie_flush_user(uint32_t pid) 239 { 240 241 radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE, 242 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0); 243 } 244 245 static __inline void 246 radix_tlbie_invlpg_kernel_4k(vm_offset_t va) 247 { 248 249 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 250 TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K); 251 radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_4K); 252 } 253 254 static __inline void 255 radix_tlbie_invlpg_kernel_2m(vm_offset_t va) 256 { 257 258 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 259 TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M); 260 radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_2M); 261 } 262 263 /* 1GB pages aren't currently supported. */ 264 static __inline __unused void 265 radix_tlbie_invlpg_kernel_1g(vm_offset_t va) 266 { 267 268 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 269 TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G); 270 radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_1G); 271 } 272 273 static __inline void 274 radix_tlbie_invlpwc_kernel(void) 275 { 276 277 radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE, 278 TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0); 279 } 280 281 static __inline void 282 radix_tlbie_flush_kernel(void) 283 { 284 285 radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE, 286 TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0); 287 } 288 289 static __inline vm_pindex_t 290 pmap_l3e_pindex(vm_offset_t va) 291 { 292 return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT); 293 } 294 295 static __inline vm_pindex_t 296 pmap_pml3e_index(vm_offset_t va) 297 { 298 299 return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK); 300 } 301 302 static __inline vm_pindex_t 303 pmap_pml2e_index(vm_offset_t va) 304 { 305 return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK); 306 } 307 308 static __inline vm_pindex_t 309 pmap_pml1e_index(vm_offset_t va) 310 { 311 return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT); 312 } 313 314 /* Return various clipped indexes for a given VA */ 315 static __inline vm_pindex_t 316 pmap_pte_index(vm_offset_t va) 317 { 318 319 return ((va >> PAGE_SHIFT) & RPTE_MASK); 320 } 321 322 /* Return a pointer to the PT slot that corresponds to a VA */ 323 static __inline pt_entry_t * 324 pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va) 325 { 326 pt_entry_t *pte; 327 vm_paddr_t ptepa; 328 329 ptepa = (be64toh(*l3e) & NLB_MASK); 330 pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa); 331 return (&pte[pmap_pte_index(va)]); 332 } 333 334 /* Return a pointer to the PD slot that corresponds to a VA */ 335 static __inline pt_entry_t * 336 pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va) 337 { 338 pt_entry_t *l3e; 339 vm_paddr_t l3pa; 340 341 l3pa = (be64toh(*l2e) & NLB_MASK); 342 l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa); 343 return (&l3e[pmap_pml3e_index(va)]); 344 } 345 346 /* Return a pointer to the PD slot that corresponds to a VA */ 347 static __inline pt_entry_t * 348 pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va) 349 { 350 pt_entry_t *l2e; 351 vm_paddr_t l2pa; 352 353 l2pa = 
(be64toh(*l1e) & NLB_MASK); 354 355 l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa); 356 return (&l2e[pmap_pml2e_index(va)]); 357 } 358 359 static __inline pml1_entry_t * 360 pmap_pml1e(pmap_t pmap, vm_offset_t va) 361 { 362 363 return (&pmap->pm_pml1[pmap_pml1e_index(va)]); 364 } 365 366 static pt_entry_t * 367 pmap_pml2e(pmap_t pmap, vm_offset_t va) 368 { 369 pt_entry_t *l1e; 370 371 l1e = pmap_pml1e(pmap, va); 372 if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0) 373 return (NULL); 374 return (pmap_l1e_to_l2e(l1e, va)); 375 } 376 377 static __inline pt_entry_t * 378 pmap_pml3e(pmap_t pmap, vm_offset_t va) 379 { 380 pt_entry_t *l2e; 381 382 l2e = pmap_pml2e(pmap, va); 383 if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0) 384 return (NULL); 385 return (pmap_l2e_to_l3e(l2e, va)); 386 } 387 388 static __inline pt_entry_t * 389 pmap_pte(pmap_t pmap, vm_offset_t va) 390 { 391 pt_entry_t *l3e; 392 393 l3e = pmap_pml3e(pmap, va); 394 if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0) 395 return (NULL); 396 return (pmap_l3e_to_pte(l3e, va)); 397 } 398 399 int nkpt = 64; 400 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 401 "Number of kernel page table pages allocated on bootup"); 402 403 vm_paddr_t dmaplimit; 404 405 SYSCTL_DECL(_vm_pmap); 406 407 #ifdef INVARIANTS 408 #define VERBOSE_PMAP 0 409 #define VERBOSE_PROTECT 0 410 static int pmap_logging; 411 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN, 412 &pmap_logging, 0, "verbose debug logging"); 413 #endif 414 415 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 416 417 //static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ 418 419 static vm_offset_t qframe = 0; 420 static struct mtx qframe_mtx; 421 422 void mmu_radix_activate(struct thread *); 423 void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int); 424 void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *, 425 vm_size_t); 426 void mmu_radix_clear_modify(vm_page_t); 427 void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); 428 int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *); 429 int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); 430 void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, 431 vm_prot_t); 432 void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); 433 vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va); 434 vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); 435 void mmu_radix_kenter(vm_offset_t, vm_paddr_t); 436 vm_paddr_t mmu_radix_kextract(vm_offset_t); 437 void mmu_radix_kremove(vm_offset_t); 438 boolean_t mmu_radix_is_modified(vm_page_t); 439 boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t); 440 boolean_t mmu_radix_is_referenced(vm_page_t); 441 void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t, 442 vm_pindex_t, vm_size_t); 443 boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t); 444 void mmu_radix_page_init(vm_page_t); 445 boolean_t mmu_radix_page_is_mapped(vm_page_t m); 446 void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t); 447 int mmu_radix_page_wired_mappings(vm_page_t); 448 int mmu_radix_pinit(pmap_t); 449 void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); 450 bool mmu_radix_ps_enabled(pmap_t); 451 void mmu_radix_qenter(vm_offset_t, vm_page_t *, int); 452 void mmu_radix_qremove(vm_offset_t, int); 453 vm_offset_t mmu_radix_quick_enter_page(vm_page_t); 454 void mmu_radix_quick_remove_page(vm_offset_t); 455 boolean_t 
mmu_radix_ts_referenced(vm_page_t); 456 void mmu_radix_release(pmap_t); 457 void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t); 458 void mmu_radix_remove_all(vm_page_t); 459 void mmu_radix_remove_pages(pmap_t); 460 void mmu_radix_remove_write(vm_page_t); 461 void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t); 462 void mmu_radix_zero_page(vm_page_t); 463 void mmu_radix_zero_page_area(vm_page_t, int, int); 464 int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t); 465 void mmu_radix_page_array_startup(long pages); 466 467 #include "mmu_oea64.h" 468 469 /* 470 * Kernel MMU interface 471 */ 472 473 static void mmu_radix_bootstrap(vm_offset_t, vm_offset_t); 474 475 static void mmu_radix_copy_page(vm_page_t, vm_page_t); 476 static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset, 477 vm_page_t *mb, vm_offset_t b_offset, int xfersize); 478 static void mmu_radix_growkernel(vm_offset_t); 479 static void mmu_radix_init(void); 480 static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *); 481 static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); 482 static void mmu_radix_pinit0(pmap_t); 483 484 static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t); 485 static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); 486 static void mmu_radix_unmapdev(vm_offset_t, vm_size_t); 487 static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma); 488 static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t); 489 static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va); 490 static void mmu_radix_scan_init(void); 491 static void mmu_radix_cpu_bootstrap(int ap); 492 static void mmu_radix_tlbie_all(void); 493 494 static struct pmap_funcs mmu_radix_methods = { 495 .bootstrap = mmu_radix_bootstrap, 496 .copy_page = mmu_radix_copy_page, 497 .copy_pages = mmu_radix_copy_pages, 498 .cpu_bootstrap = mmu_radix_cpu_bootstrap, 499 .growkernel = mmu_radix_growkernel, 500 .init = mmu_radix_init, 501 .map = mmu_radix_map, 502 .mincore = mmu_radix_mincore, 503 .pinit = mmu_radix_pinit, 504 .pinit0 = mmu_radix_pinit0, 505 506 .mapdev = mmu_radix_mapdev, 507 .mapdev_attr = mmu_radix_mapdev_attr, 508 .unmapdev = mmu_radix_unmapdev, 509 .kenter_attr = mmu_radix_kenter_attr, 510 .dev_direct_mapped = mmu_radix_dev_direct_mapped, 511 .dumpsys_pa_init = mmu_radix_scan_init, 512 .dumpsys_map_chunk = mmu_radix_dumpsys_map, 513 .page_is_mapped = mmu_radix_page_is_mapped, 514 .ps_enabled = mmu_radix_ps_enabled, 515 .align_superpage = mmu_radix_align_superpage, 516 .object_init_pt = mmu_radix_object_init_pt, 517 .protect = mmu_radix_protect, 518 /* pmap dispatcher interface */ 519 .clear_modify = mmu_radix_clear_modify, 520 .copy = mmu_radix_copy, 521 .enter = mmu_radix_enter, 522 .enter_object = mmu_radix_enter_object, 523 .enter_quick = mmu_radix_enter_quick, 524 .extract = mmu_radix_extract, 525 .extract_and_hold = mmu_radix_extract_and_hold, 526 .is_modified = mmu_radix_is_modified, 527 .is_prefaultable = mmu_radix_is_prefaultable, 528 .is_referenced = mmu_radix_is_referenced, 529 .ts_referenced = mmu_radix_ts_referenced, 530 .page_exists_quick = mmu_radix_page_exists_quick, 531 .page_init = mmu_radix_page_init, 532 .page_wired_mappings = mmu_radix_page_wired_mappings, 533 .qenter = mmu_radix_qenter, 534 .qremove = mmu_radix_qremove, 535 .release = mmu_radix_release, 536 .remove = mmu_radix_remove, 537 .remove_all = mmu_radix_remove_all, 538 .remove_write = mmu_radix_remove_write, 539 .unwire = mmu_radix_unwire, 540 
.zero_page = mmu_radix_zero_page, 541 .zero_page_area = mmu_radix_zero_page_area, 542 .activate = mmu_radix_activate, 543 .quick_enter_page = mmu_radix_quick_enter_page, 544 .quick_remove_page = mmu_radix_quick_remove_page, 545 .page_set_memattr = mmu_radix_page_set_memattr, 546 .page_array_startup = mmu_radix_page_array_startup, 547 548 /* Internal interfaces */ 549 .kenter = mmu_radix_kenter, 550 .kextract = mmu_radix_kextract, 551 .kremove = mmu_radix_kremove, 552 .change_attr = mmu_radix_change_attr, 553 .decode_kernel_ptr = mmu_radix_decode_kernel_ptr, 554 555 .tlbie_all = mmu_radix_tlbie_all, 556 }; 557 558 MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods); 559 560 static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, 561 struct rwlock **lockp); 562 static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va); 563 static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *); 564 static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, 565 struct spglist *free, struct rwlock **lockp); 566 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 567 pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 568 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 569 static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde, 570 struct spglist *free); 571 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 572 pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp); 573 574 static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e, 575 u_int flags, struct rwlock **lockp); 576 #if VM_NRESERVLEVEL > 0 577 static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 578 struct rwlock **lockp); 579 #endif 580 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 581 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 582 static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 583 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate); 584 585 static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 586 vm_prot_t prot, struct rwlock **lockp); 587 static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, 588 u_int flags, vm_page_t m, struct rwlock **lockp); 589 590 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 591 static void free_pv_chunk(struct pv_chunk *pc); 592 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); 593 static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va, 594 struct rwlock **lockp); 595 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 596 struct rwlock **lockp); 597 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 598 struct spglist *free); 599 static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); 600 601 static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start); 602 static void pmap_invalidate_all(pmap_t pmap); 603 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush); 604 605 /* 606 * Internal flags for pmap_enter()'s helper functions. 607 */ 608 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 609 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. 
*/ 610 611 #define UNIMPLEMENTED() panic("%s not implemented", __func__) 612 #define UNTESTED() panic("%s not yet tested", __func__) 613 614 /* Number of supported PID bits */ 615 static unsigned int isa3_pid_bits; 616 617 /* PID to start allocating from */ 618 static unsigned int isa3_base_pid; 619 620 #define PROCTAB_SIZE_SHIFT (isa3_pid_bits + 4) 621 #define PROCTAB_ENTRIES (1ul << isa3_pid_bits) 622 623 /* 624 * Map of physical memory regions. 625 */ 626 static struct mem_region *regions, *pregions; 627 static struct numa_mem_region *numa_pregions; 628 static u_int phys_avail_count; 629 static int regions_sz, pregions_sz, numa_pregions_sz; 630 static struct pate *isa3_parttab; 631 static struct prte *isa3_proctab; 632 static vmem_t *asid_arena; 633 634 extern void bs_remap_earlyboot(void); 635 636 #define RADIX_PGD_SIZE_SHIFT 16 637 #define RADIX_PGD_SIZE (1UL << RADIX_PGD_SIZE_SHIFT) 638 639 #define RADIX_PGD_INDEX_SHIFT (RADIX_PGD_SIZE_SHIFT-3) 640 #define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t)) 641 #define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t)) 642 643 #define NUPML1E (RADIX_PGD_SIZE/sizeof(uint64_t)) /* number of userland PML1 pages */ 644 #define NUPDPE (NUPML1E * NL2EPG)/* number of userland PDP pages */ 645 #define NUPDE (NUPDPE * NL3EPG) /* number of userland PD entries */ 646 647 /* POWER9 only permits a 64k partition table size. */ 648 #define PARTTAB_SIZE_SHIFT 16 649 #define PARTTAB_SIZE (1UL << PARTTAB_SIZE_SHIFT) 650 651 #define PARTTAB_HR (1UL << 63) /* host uses radix */ 652 #define PARTTAB_GR (1UL << 63) /* guest uses radix must match host */ 653 654 /* TLB flush actions. Used as argument to tlbiel_all() */ 655 enum { 656 TLB_INVAL_SCOPE_LPID = 0, /* invalidate TLBs for current LPID */ 657 TLB_INVAL_SCOPE_GLOBAL = 1, /* invalidate all TLBs */ 658 }; 659 660 #define NPV_LIST_LOCKS MAXCPU 661 static int pmap_initialized; 662 static vm_paddr_t proctab0pa; 663 static vm_paddr_t parttab_phys; 664 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 665 666 /* 667 * Data for the pv entry allocation mechanism. 668 * Updates to pv_invl_gen are protected by the pv_list_locks[] 669 * elements, but reads are not. 
670 */ 671 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 672 static struct mtx __exclusive_cache_line pv_chunks_mutex; 673 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 674 static struct md_page *pv_table; 675 static struct md_page pv_dummy; 676 677 #ifdef PV_STATS 678 #define PV_STAT(x) do { x ; } while (0) 679 #else 680 #define PV_STAT(x) do { } while (0) 681 #endif 682 683 #define pa_radix_index(pa) ((pa) >> L3_PAGE_SIZE_SHIFT) 684 #define pa_to_pvh(pa) (&pv_table[pa_radix_index(pa)]) 685 686 #define PHYS_TO_PV_LIST_LOCK(pa) \ 687 (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS]) 688 689 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 690 struct rwlock **_lockp = (lockp); \ 691 struct rwlock *_new_lock; \ 692 \ 693 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 694 if (_new_lock != *_lockp) { \ 695 if (*_lockp != NULL) \ 696 rw_wunlock(*_lockp); \ 697 *_lockp = _new_lock; \ 698 rw_wlock(*_lockp); \ 699 } \ 700 } while (0) 701 702 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 703 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 704 705 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 706 struct rwlock **_lockp = (lockp); \ 707 \ 708 if (*_lockp != NULL) { \ 709 rw_wunlock(*_lockp); \ 710 *_lockp = NULL; \ 711 } \ 712 } while (0) 713 714 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 715 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 716 717 /* 718 * We support 52 bits, hence: 719 * bits 52 - 31 = 21, 0b10101 720 * RTS encoding details 721 * bits 0 - 3 of rts -> bits 6 - 8 unsigned long 722 * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long 723 */ 724 #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5)) 725 726 static int powernv_enabled = 1; 727 728 static __always_inline void 729 tlbiel_radix_set_isa300(uint32_t set, uint32_t is, 730 uint32_t pid, uint32_t ric, uint32_t prs) 731 { 732 uint64_t rb; 733 uint64_t rs; 734 735 rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53); 736 rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31); 737 738 __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) 739 : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) 740 : "memory"); 741 } 742 743 static void 744 tlbiel_flush_isa3(uint32_t num_sets, uint32_t is) 745 { 746 uint32_t set; 747 748 __asm __volatile("ptesync": : :"memory"); 749 750 /* 751 * Flush the first set of the TLB, and the entire Page Walk Cache 752 * and partition table entries. Then flush the remaining sets of the 753 * TLB. 754 */ 755 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); 756 for (set = 1; set < num_sets; set++) 757 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); 758 759 /* Do the same for process scoped entries. 
*/ 760 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); 761 for (set = 1; set < num_sets; set++) 762 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); 763 764 __asm __volatile("ptesync": : :"memory"); 765 } 766 767 static void 768 mmu_radix_tlbiel_flush(int scope) 769 { 770 int is; 771 772 MPASS(scope == TLB_INVAL_SCOPE_LPID || 773 scope == TLB_INVAL_SCOPE_GLOBAL); 774 is = scope + 2; 775 776 tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, is); 777 __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); 778 } 779 780 static void 781 mmu_radix_tlbie_all() 782 { 783 /* TODO: LPID invalidate */ 784 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 785 } 786 787 static void 788 mmu_radix_init_amor(void) 789 { 790 /* 791 * In HV mode, we init AMOR (Authority Mask Override Register) so that 792 * the hypervisor and guest can setup IAMR (Instruction Authority Mask 793 * Register), enable key 0 and set it to 1. 794 * 795 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) 796 */ 797 mtspr(SPR_AMOR, (3ul << 62)); 798 } 799 800 static void 801 mmu_radix_init_iamr(void) 802 { 803 /* 804 * Radix always uses key0 of the IAMR to determine if an access is 805 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction 806 * fetch. 807 */ 808 mtspr(SPR_IAMR, (1ul << 62)); 809 } 810 811 static void 812 mmu_radix_pid_set(pmap_t pmap) 813 { 814 815 mtspr(SPR_PID, pmap->pm_pid); 816 isync(); 817 } 818 819 /* Quick sort callout for comparing physical addresses. */ 820 static int 821 pa_cmp(const void *a, const void *b) 822 { 823 const vm_paddr_t *pa = a, *pb = b; 824 825 if (*pa < *pb) 826 return (-1); 827 else if (*pa > *pb) 828 return (1); 829 else 830 return (0); 831 } 832 833 #define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) 834 #define pte_load_clear(ptep) atomic_swap_long(ptep, 0) 835 #define pte_store(ptep, pte) do { \ 836 MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X)); \ 837 *(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \ 838 } while (0) 839 /* 840 * NB: should only be used for adding directories - not for direct mappings 841 */ 842 #define pde_store(ptep, pa) do { \ 843 *(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \ 844 } while (0) 845 846 #define pte_clear(ptep) do { \ 847 *(u_long *)(ptep) = (u_long)(0); \ 848 } while (0) 849 850 #define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ 851 852 /* 853 * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB 854 * (PTE) page mappings have identical settings for the following fields: 855 */ 856 #define PG_PTE_PROMOTE (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \ 857 PG_M | PG_A | RPTE_EAA_MASK | PG_V) 858 859 static __inline void 860 pmap_resident_count_inc(pmap_t pmap, int count) 861 { 862 863 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 864 pmap->pm_stats.resident_count += count; 865 } 866 867 static __inline void 868 pmap_resident_count_dec(pmap_t pmap, int count) 869 { 870 871 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 872 KASSERT(pmap->pm_stats.resident_count >= count, 873 ("pmap %p resident count underflow %ld %d", pmap, 874 pmap->pm_stats.resident_count, count)); 875 pmap->pm_stats.resident_count -= count; 876 } 877 878 static void 879 pagezero(vm_offset_t va) 880 { 881 va = trunc_page(va); 882 883 bzero((void *)va, PAGE_SIZE); 884 } 885 886 static uint64_t 887 allocpages(int n) 888 { 889 u_int64_t ret; 890 891 ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE); 892 for (int i = 0; i < n; i++) 893 pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE)); 894 return (ret); 895 } 
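/*
 * Editor's sketch (not part of the original file): intended use of the
 * store macros defined above.  pte_store() adds PG_V | RPTE_LEAF and
 * byte-swaps the entry to big-endian, so callers pass only the physical
 * address, the EAA permission bits and any attribute bits; pde_store()
 * installs a non-leaf entry whose low bits carry the next-level table
 * size (RPTE_SHIFT == 9, i.e. a 512-entry table), so it is only for
 * adding directories, never leaf mappings.  The helper below, and its
 * name, are illustrative only.
 */
static __inline __unused void
pmap_store_example(pt_entry_t *ptep, pt_entry_t *pdep, vm_paddr_t pa,
    vm_paddr_t ptpa)
{

	/* 4KB read/write leaf mapping of 'pa', marked referenced/modified. */
	pte_store(ptep, pa | RPTE_EAA_R | RPTE_EAA_W | PG_M | PG_A);
	/* Non-leaf entry pointing at the next-level table page at 'ptpa'. */
	pde_store(pdep, ptpa);
}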
896 897 static pt_entry_t * 898 kvtopte(vm_offset_t va) 899 { 900 pt_entry_t *l3e; 901 902 l3e = pmap_pml3e(kernel_pmap, va); 903 if ((be64toh(*l3e) & RPTE_VALID) == 0) 904 return (NULL); 905 return (pmap_l3e_to_pte(l3e, va)); 906 } 907 908 void 909 mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa) 910 { 911 pt_entry_t *pte; 912 913 pte = kvtopte(va); 914 MPASS(pte != NULL); 915 *pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \ 916 RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A); 917 } 918 919 bool 920 mmu_radix_ps_enabled(pmap_t pmap) 921 { 922 return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 923 } 924 925 static pt_entry_t * 926 pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e) 927 { 928 pml3_entry_t *l3e; 929 pt_entry_t *pte; 930 931 va &= PG_PS_FRAME; 932 l3e = pmap_pml3e(pmap, va); 933 if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0) 934 return (NULL); 935 936 if (be64toh(*l3e) & RPTE_LEAF) { 937 *is_l3e = 1; 938 return (l3e); 939 } 940 *is_l3e = 0; 941 va &= PG_FRAME; 942 pte = pmap_l3e_to_pte(l3e, va); 943 if (pte == NULL || (be64toh(*pte) & PG_V) == 0) 944 return (NULL); 945 return (pte); 946 } 947 948 int 949 pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags) 950 { 951 pt_entry_t *pte; 952 pt_entry_t startpte, origpte, newpte; 953 vm_page_t m; 954 int is_l3e; 955 956 startpte = 0; 957 retry: 958 if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL) 959 return (KERN_INVALID_ADDRESS); 960 origpte = newpte = be64toh(*pte); 961 if (startpte == 0) { 962 startpte = origpte; 963 if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) || 964 ((flags & VM_PROT_READ) && (startpte & PG_A))) { 965 pmap_invalidate_all(pmap); 966 #ifdef INVARIANTS 967 if (VERBOSE_PMAP || pmap_logging) 968 printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n", 969 __func__, pmap, va, flags, origpte); 970 #endif 971 return (KERN_FAILURE); 972 } 973 } 974 #ifdef INVARIANTS 975 if (VERBOSE_PMAP || pmap_logging) 976 printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va, 977 flags, origpte); 978 #endif 979 PMAP_LOCK(pmap); 980 if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL || 981 be64toh(*pte) != origpte) { 982 PMAP_UNLOCK(pmap); 983 return (KERN_FAILURE); 984 } 985 m = PHYS_TO_VM_PAGE(newpte & PG_FRAME); 986 MPASS(m != NULL); 987 switch (flags) { 988 case VM_PROT_READ: 989 if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0) 990 goto protfail; 991 newpte |= PG_A; 992 vm_page_aflag_set(m, PGA_REFERENCED); 993 break; 994 case VM_PROT_WRITE: 995 if ((newpte & RPTE_EAA_W) == 0) 996 goto protfail; 997 if (is_l3e) 998 goto protfail; 999 newpte |= PG_M; 1000 vm_page_dirty(m); 1001 break; 1002 case VM_PROT_EXECUTE: 1003 if ((newpte & RPTE_EAA_X) == 0) 1004 goto protfail; 1005 newpte |= PG_A; 1006 vm_page_aflag_set(m, PGA_REFERENCED); 1007 break; 1008 } 1009 1010 if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte))) 1011 goto retry; 1012 ptesync(); 1013 PMAP_UNLOCK(pmap); 1014 if (startpte == newpte) 1015 return (KERN_FAILURE); 1016 return (0); 1017 protfail: 1018 PMAP_UNLOCK(pmap); 1019 return (KERN_PROTECTION_FAILURE); 1020 } 1021 1022 /* 1023 * Returns TRUE if the given page is mapped individually or as part of 1024 * a 2mpage. Otherwise, returns FALSE. 
1025 */ 1026 boolean_t 1027 mmu_radix_page_is_mapped(vm_page_t m) 1028 { 1029 struct rwlock *lock; 1030 boolean_t rv; 1031 1032 if ((m->oflags & VPO_UNMANAGED) != 0) 1033 return (FALSE); 1034 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 1035 rw_rlock(lock); 1036 rv = !TAILQ_EMPTY(&m->md.pv_list) || 1037 ((m->flags & PG_FICTITIOUS) == 0 && 1038 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 1039 rw_runlock(lock); 1040 return (rv); 1041 } 1042 1043 /* 1044 * Determine the appropriate bits to set in a PTE or PDE for a specified 1045 * caching mode. 1046 */ 1047 static int 1048 pmap_cache_bits(vm_memattr_t ma) 1049 { 1050 if (ma != VM_MEMATTR_DEFAULT) { 1051 switch (ma) { 1052 case VM_MEMATTR_UNCACHEABLE: 1053 return (RPTE_ATTR_GUARDEDIO); 1054 case VM_MEMATTR_CACHEABLE: 1055 return (RPTE_ATTR_MEM); 1056 case VM_MEMATTR_WRITE_BACK: 1057 case VM_MEMATTR_PREFETCHABLE: 1058 case VM_MEMATTR_WRITE_COMBINING: 1059 return (RPTE_ATTR_UNGUARDEDIO); 1060 } 1061 } 1062 return (0); 1063 } 1064 1065 static void 1066 pmap_invalidate_page(pmap_t pmap, vm_offset_t start) 1067 { 1068 ptesync(); 1069 if (pmap == kernel_pmap) 1070 radix_tlbie_invlpg_kernel_4k(start); 1071 else 1072 radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); 1073 ttusync(); 1074 } 1075 1076 static void 1077 pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start) 1078 { 1079 ptesync(); 1080 if (pmap == kernel_pmap) 1081 radix_tlbie_invlpg_kernel_2m(start); 1082 else 1083 radix_tlbie_invlpg_user_2m(pmap->pm_pid, start); 1084 ttusync(); 1085 } 1086 1087 static void 1088 pmap_invalidate_pwc(pmap_t pmap) 1089 { 1090 ptesync(); 1091 if (pmap == kernel_pmap) 1092 radix_tlbie_invlpwc_kernel(); 1093 else 1094 radix_tlbie_invlpwc_user(pmap->pm_pid); 1095 ttusync(); 1096 } 1097 1098 static void 1099 pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end) 1100 { 1101 if (((start - end) >> PAGE_SHIFT) > 8) { 1102 pmap_invalidate_all(pmap); 1103 return; 1104 } 1105 ptesync(); 1106 if (pmap == kernel_pmap) { 1107 while (start < end) { 1108 radix_tlbie_invlpg_kernel_4k(start); 1109 start += PAGE_SIZE; 1110 } 1111 } else { 1112 while (start < end) { 1113 radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); 1114 start += PAGE_SIZE; 1115 } 1116 } 1117 ttusync(); 1118 } 1119 1120 static void 1121 pmap_invalidate_all(pmap_t pmap) 1122 { 1123 ptesync(); 1124 if (pmap == kernel_pmap) 1125 radix_tlbie_flush_kernel(); 1126 else 1127 radix_tlbie_flush_user(pmap->pm_pid); 1128 ttusync(); 1129 } 1130 1131 static void 1132 pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e) 1133 { 1134 1135 /* 1136 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 1137 * by a promotion that did not invalidate the 512 4KB page mappings 1138 * that might exist in the TLB. Consequently, at this point, the TLB 1139 * may hold both 4KB and 2MB page mappings for the address range [va, 1140 * va + L3_PAGE_SIZE). Therefore, the entire range must be invalidated here. 1141 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 1142 * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a 1143 * single INVLPG suffices to invalidate the 2MB page mapping from the 1144 * TLB. 
1145 */ 1146 ptesync(); 1147 if ((l3e & PG_PROMOTED) != 0) 1148 pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1); 1149 else 1150 pmap_invalidate_page_2m(pmap, va); 1151 1152 pmap_invalidate_pwc(pmap); 1153 } 1154 1155 static __inline struct pv_chunk * 1156 pv_to_chunk(pv_entry_t pv) 1157 { 1158 1159 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 1160 } 1161 1162 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1163 1164 #define PC_FREE0 0xfffffffffffffffful 1165 #define PC_FREE1 0x3ffffffffffffffful 1166 1167 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 }; 1168 1169 /* 1170 * Ensure that the number of spare PV entries in the specified pmap meets or 1171 * exceeds the given count, "needed". 1172 * 1173 * The given PV list lock may be released. 1174 */ 1175 static void 1176 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 1177 { 1178 struct pch new_tail; 1179 struct pv_chunk *pc; 1180 vm_page_t m; 1181 int avail, free; 1182 bool reclaimed; 1183 1184 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1185 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 1186 1187 /* 1188 * Newly allocated PV chunks must be stored in a private list until 1189 * the required number of PV chunks have been allocated. Otherwise, 1190 * reclaim_pv_chunk() could recycle one of these chunks. In 1191 * contrast, these chunks must be added to the pmap upon allocation. 1192 */ 1193 TAILQ_INIT(&new_tail); 1194 retry: 1195 avail = 0; 1196 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1197 // if ((cpu_feature2 & CPUID2_POPCNT) == 0) 1198 bit_count((bitstr_t *)pc->pc_map, 0, 1199 sizeof(pc->pc_map) * NBBY, &free); 1200 #if 0 1201 free = popcnt_pc_map_pq(pc->pc_map); 1202 #endif 1203 if (free == 0) 1204 break; 1205 avail += free; 1206 if (avail >= needed) 1207 break; 1208 } 1209 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1210 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1211 VM_ALLOC_WIRED); 1212 if (m == NULL) { 1213 m = reclaim_pv_chunk(pmap, lockp); 1214 if (m == NULL) 1215 goto retry; 1216 reclaimed = true; 1217 } 1218 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1219 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1220 dump_add_page(m->phys_addr); 1221 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1222 pc->pc_pmap = pmap; 1223 pc->pc_map[0] = PC_FREE0; 1224 pc->pc_map[1] = PC_FREE1; 1225 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1226 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1227 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 1228 1229 /* 1230 * The reclaim might have freed a chunk from the current pmap. 1231 * If that chunk contained available entries, we need to 1232 * re-count the number of available entries. 1233 */ 1234 if (reclaimed) 1235 goto retry; 1236 } 1237 if (!TAILQ_EMPTY(&new_tail)) { 1238 mtx_lock(&pv_chunks_mutex); 1239 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1240 mtx_unlock(&pv_chunks_mutex); 1241 } 1242 } 1243 1244 /* 1245 * First find and then remove the pv entry for the specified pmap and virtual 1246 * address from the specified pv list. Returns the pv entry if found and NULL 1247 * otherwise. This operation can be performed on pv lists for either 4KB or 1248 * 2MB page mappings. 
1249 */ 1250 static __inline pv_entry_t 1251 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1252 { 1253 pv_entry_t pv; 1254 1255 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 1256 #ifdef INVARIANTS 1257 if (PV_PMAP(pv) == NULL) { 1258 printf("corrupted pv_chunk/pv %p\n", pv); 1259 printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":"); 1260 } 1261 MPASS(PV_PMAP(pv) != NULL); 1262 MPASS(pv->pv_va != 0); 1263 #endif 1264 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1265 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 1266 pvh->pv_gen++; 1267 break; 1268 } 1269 } 1270 return (pv); 1271 } 1272 1273 /* 1274 * After demotion from a 2MB page mapping to 512 4KB page mappings, 1275 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 1276 * entries for each of the 4KB page mappings. 1277 */ 1278 static void 1279 pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1280 struct rwlock **lockp) 1281 { 1282 struct md_page *pvh; 1283 struct pv_chunk *pc; 1284 pv_entry_t pv; 1285 vm_offset_t va_last; 1286 vm_page_t m; 1287 int bit, field; 1288 1289 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1290 KASSERT((pa & L3_PAGE_MASK) == 0, 1291 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 1292 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1293 1294 /* 1295 * Transfer the 2mpage's pv entry for this mapping to the first 1296 * page's pv list. Once this transfer begins, the pv list lock 1297 * must not be released until the last pv entry is reinstantiated. 1298 */ 1299 pvh = pa_to_pvh(pa); 1300 va = trunc_2mpage(va); 1301 pv = pmap_pvh_remove(pvh, pmap, va); 1302 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 1303 m = PHYS_TO_VM_PAGE(pa); 1304 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 1305 1306 m->md.pv_gen++; 1307 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 1308 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 1309 va_last = va + L3_PAGE_SIZE - PAGE_SIZE; 1310 for (;;) { 1311 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1312 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 1313 , ("pmap_pv_demote_pde: missing spare")); 1314 for (field = 0; field < _NPCM; field++) { 1315 while (pc->pc_map[field]) { 1316 bit = cnttzd(pc->pc_map[field]); 1317 pc->pc_map[field] &= ~(1ul << bit); 1318 pv = &pc->pc_pventry[field * 64 + bit]; 1319 va += PAGE_SIZE; 1320 pv->pv_va = va; 1321 m++; 1322 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1323 ("pmap_pv_demote_pde: page %p is not managed", m)); 1324 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 1325 1326 m->md.pv_gen++; 1327 if (va == va_last) 1328 goto out; 1329 } 1330 } 1331 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1332 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1333 } 1334 out: 1335 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { 1336 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1337 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1338 } 1339 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 1340 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 1341 } 1342 1343 static void 1344 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap) 1345 { 1346 1347 if (pmap == NULL) 1348 return; 1349 pmap_invalidate_all(pmap); 1350 if (pmap != locked_pmap) 1351 PMAP_UNLOCK(pmap); 1352 } 1353 1354 /* 1355 * We are in a serious low memory condition. Resort to 1356 * drastic measures to free some pages so we can allocate 1357 * another pv entry chunk. 1358 * 1359 * Returns NULL if PV entries were reclaimed from the specified pmap. 
1360 * 1361 * We do not, however, unmap 2mpages because subsequent accesses will 1362 * allocate per-page pv entries until repromotion occurs, thereby 1363 * exacerbating the shortage of free pv entries. 1364 */ 1365 static int active_reclaims = 0; 1366 static vm_page_t 1367 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 1368 { 1369 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 1370 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 1371 struct md_page *pvh; 1372 pml3_entry_t *l3e; 1373 pmap_t next_pmap, pmap; 1374 pt_entry_t *pte, tpte; 1375 pv_entry_t pv; 1376 vm_offset_t va; 1377 vm_page_t m, m_pc; 1378 struct spglist free; 1379 uint64_t inuse; 1380 int bit, field, freed; 1381 1382 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 1383 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 1384 pmap = NULL; 1385 m_pc = NULL; 1386 SLIST_INIT(&free); 1387 bzero(&pc_marker_b, sizeof(pc_marker_b)); 1388 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 1389 pc_marker = (struct pv_chunk *)&pc_marker_b; 1390 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 1391 1392 mtx_lock(&pv_chunks_mutex); 1393 active_reclaims++; 1394 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 1395 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 1396 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 1397 SLIST_EMPTY(&free)) { 1398 next_pmap = pc->pc_pmap; 1399 if (next_pmap == NULL) { 1400 /* 1401 * The next chunk is a marker. However, it is 1402 * not our marker, so active_reclaims must be 1403 * > 1. Consequently, the next_chunk code 1404 * will not rotate the pv_chunks list. 1405 */ 1406 goto next_chunk; 1407 } 1408 mtx_unlock(&pv_chunks_mutex); 1409 1410 /* 1411 * A pv_chunk can only be removed from the pc_lru list 1412 * when both pc_chunks_mutex is owned and the 1413 * corresponding pmap is locked. 1414 */ 1415 if (pmap != next_pmap) { 1416 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); 1417 pmap = next_pmap; 1418 /* Avoid deadlock and lock recursion. */ 1419 if (pmap > locked_pmap) { 1420 RELEASE_PV_LIST_LOCK(lockp); 1421 PMAP_LOCK(pmap); 1422 mtx_lock(&pv_chunks_mutex); 1423 continue; 1424 } else if (pmap != locked_pmap) { 1425 if (PMAP_TRYLOCK(pmap)) { 1426 mtx_lock(&pv_chunks_mutex); 1427 continue; 1428 } else { 1429 pmap = NULL; /* pmap is not locked */ 1430 mtx_lock(&pv_chunks_mutex); 1431 pc = TAILQ_NEXT(pc_marker, pc_lru); 1432 if (pc == NULL || 1433 pc->pc_pmap != next_pmap) 1434 continue; 1435 goto next_chunk; 1436 } 1437 } 1438 } 1439 1440 /* 1441 * Destroy every non-wired, 4 KB page mapping in the chunk. 
1442 */ 1443 freed = 0; 1444 for (field = 0; field < _NPCM; field++) { 1445 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 1446 inuse != 0; inuse &= ~(1UL << bit)) { 1447 bit = cnttzd(inuse); 1448 pv = &pc->pc_pventry[field * 64 + bit]; 1449 va = pv->pv_va; 1450 l3e = pmap_pml3e(pmap, va); 1451 if ((be64toh(*l3e) & RPTE_LEAF) != 0) 1452 continue; 1453 pte = pmap_l3e_to_pte(l3e, va); 1454 if ((be64toh(*pte) & PG_W) != 0) 1455 continue; 1456 tpte = be64toh(pte_load_clear(pte)); 1457 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 1458 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 1459 vm_page_dirty(m); 1460 if ((tpte & PG_A) != 0) 1461 vm_page_aflag_set(m, PGA_REFERENCED); 1462 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 1463 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 1464 1465 m->md.pv_gen++; 1466 if (TAILQ_EMPTY(&m->md.pv_list) && 1467 (m->flags & PG_FICTITIOUS) == 0) { 1468 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 1469 if (TAILQ_EMPTY(&pvh->pv_list)) { 1470 vm_page_aflag_clear(m, 1471 PGA_WRITEABLE); 1472 } 1473 } 1474 pc->pc_map[field] |= 1UL << bit; 1475 pmap_unuse_pt(pmap, va, be64toh(*l3e), &free); 1476 freed++; 1477 } 1478 } 1479 if (freed == 0) { 1480 mtx_lock(&pv_chunks_mutex); 1481 goto next_chunk; 1482 } 1483 /* Every freed mapping is for a 4 KB page. */ 1484 pmap_resident_count_dec(pmap, freed); 1485 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 1486 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 1487 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 1488 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1489 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) { 1490 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1491 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1492 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1493 /* Entire chunk is free; return it. */ 1494 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1495 dump_drop_page(m_pc->phys_addr); 1496 mtx_lock(&pv_chunks_mutex); 1497 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1498 break; 1499 } 1500 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1501 mtx_lock(&pv_chunks_mutex); 1502 /* One freed pv entry in locked_pmap is sufficient. */ 1503 if (pmap == locked_pmap) 1504 break; 1505 next_chunk: 1506 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 1507 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 1508 if (active_reclaims == 1 && pmap != NULL) { 1509 /* 1510 * Rotate the pv chunks list so that we do not 1511 * scan the same pv chunks that could not be 1512 * freed (because they contained a wired 1513 * and/or superpage mapping) on every 1514 * invocation of reclaim_pv_chunk(). 1515 */ 1516 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 1517 MPASS(pc->pc_pmap != NULL); 1518 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1519 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1520 } 1521 } 1522 } 1523 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 1524 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 1525 active_reclaims--; 1526 mtx_unlock(&pv_chunks_mutex); 1527 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); 1528 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 1529 m_pc = SLIST_FIRST(&free); 1530 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 1531 /* Recycle a freed page table page. 
*/ 1532 m_pc->ref_count = 1; 1533 } 1534 vm_page_free_pages_toq(&free, true); 1535 return (m_pc); 1536 } 1537 1538 /* 1539 * free the pv_entry back to the free list 1540 */ 1541 static void 1542 free_pv_entry(pmap_t pmap, pv_entry_t pv) 1543 { 1544 struct pv_chunk *pc; 1545 int idx, field, bit; 1546 1547 #ifdef VERBOSE_PV 1548 if (pmap != kernel_pmap) 1549 printf("%s(%p, %p)\n", __func__, pmap, pv); 1550 #endif 1551 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1552 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 1553 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 1554 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 1555 pc = pv_to_chunk(pv); 1556 idx = pv - &pc->pc_pventry[0]; 1557 field = idx / 64; 1558 bit = idx % 64; 1559 pc->pc_map[field] |= 1ul << bit; 1560 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) { 1561 /* 98% of the time, pc is already at the head of the list. */ 1562 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 1563 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1564 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1565 } 1566 return; 1567 } 1568 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1569 free_pv_chunk(pc); 1570 } 1571 1572 static void 1573 free_pv_chunk(struct pv_chunk *pc) 1574 { 1575 vm_page_t m; 1576 1577 mtx_lock(&pv_chunks_mutex); 1578 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1579 mtx_unlock(&pv_chunks_mutex); 1580 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1581 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1582 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1583 /* entire chunk is free, return it */ 1584 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1585 dump_drop_page(m->phys_addr); 1586 vm_page_unwire_noq(m); 1587 vm_page_free(m); 1588 } 1589 1590 /* 1591 * Returns a new PV entry, allocating a new PV chunk from the system when 1592 * needed. If this PV chunk allocation fails and a PV list lock pointer was 1593 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 1594 * returned. 1595 * 1596 * The given PV list lock may be released. 
1597 */ 1598 static pv_entry_t 1599 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 1600 { 1601 int bit, field; 1602 pv_entry_t pv; 1603 struct pv_chunk *pc; 1604 vm_page_t m; 1605 1606 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1607 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 1608 retry: 1609 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1610 if (pc != NULL) { 1611 for (field = 0; field < _NPCM; field++) { 1612 if (pc->pc_map[field]) { 1613 bit = cnttzd(pc->pc_map[field]); 1614 break; 1615 } 1616 } 1617 if (field < _NPCM) { 1618 pv = &pc->pc_pventry[field * 64 + bit]; 1619 pc->pc_map[field] &= ~(1ul << bit); 1620 /* If this was the last item, move it to tail */ 1621 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { 1622 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1623 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 1624 pc_list); 1625 } 1626 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1627 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 1628 MPASS(PV_PMAP(pv) != NULL); 1629 return (pv); 1630 } 1631 } 1632 /* No free items, allocate another chunk */ 1633 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1634 VM_ALLOC_WIRED); 1635 if (m == NULL) { 1636 if (lockp == NULL) { 1637 PV_STAT(pc_chunk_tryfail++); 1638 return (NULL); 1639 } 1640 m = reclaim_pv_chunk(pmap, lockp); 1641 if (m == NULL) 1642 goto retry; 1643 } 1644 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1645 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1646 dump_add_page(m->phys_addr); 1647 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1648 pc->pc_pmap = pmap; 1649 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 1650 pc->pc_map[1] = PC_FREE1; 1651 mtx_lock(&pv_chunks_mutex); 1652 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1653 mtx_unlock(&pv_chunks_mutex); 1654 pv = &pc->pc_pventry[0]; 1655 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1656 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1657 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 1658 MPASS(PV_PMAP(pv) != NULL); 1659 return (pv); 1660 } 1661 1662 #if VM_NRESERVLEVEL > 0 1663 /* 1664 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 1665 * replace the many pv entries for the 4KB page mappings by a single pv entry 1666 * for the 2MB page mapping. 1667 */ 1668 static void 1669 pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1670 struct rwlock **lockp) 1671 { 1672 struct md_page *pvh; 1673 pv_entry_t pv; 1674 vm_offset_t va_last; 1675 vm_page_t m; 1676 1677 KASSERT((pa & L3_PAGE_MASK) == 0, 1678 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 1679 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1680 1681 /* 1682 * Transfer the first page's pv entry for this mapping to the 2mpage's 1683 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 1684 * a transfer avoids the possibility that get_pv_entry() calls 1685 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 1686 * mappings that is being promoted. 1687 */ 1688 m = PHYS_TO_VM_PAGE(pa); 1689 va = trunc_2mpage(va); 1690 pv = pmap_pvh_remove(&m->md, pmap, va); 1691 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 1692 pvh = pa_to_pvh(pa); 1693 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 1694 pvh->pv_gen++; 1695 /* Free the remaining NPTEPG - 1 pv entries. 
*/ 1696 va_last = va + L3_PAGE_SIZE - PAGE_SIZE; 1697 do { 1698 m++; 1699 va += PAGE_SIZE; 1700 pmap_pvh_free(&m->md, pmap, va); 1701 } while (va < va_last); 1702 } 1703 #endif /* VM_NRESERVLEVEL > 0 */ 1704 1705 /* 1706 * First find and then destroy the pv entry for the specified pmap and virtual 1707 * address. This operation can be performed on pv lists for either 4KB or 2MB 1708 * page mappings. 1709 */ 1710 static void 1711 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1712 { 1713 pv_entry_t pv; 1714 1715 pv = pmap_pvh_remove(pvh, pmap, va); 1716 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 1717 free_pv_entry(pmap, pv); 1718 } 1719 1720 /* 1721 * Conditionally create the PV entry for a 4KB page mapping if the required 1722 * memory can be allocated without resorting to reclamation. 1723 */ 1724 static boolean_t 1725 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 1726 struct rwlock **lockp) 1727 { 1728 pv_entry_t pv; 1729 1730 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1731 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1732 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 1733 pv->pv_va = va; 1734 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 1735 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 1736 m->md.pv_gen++; 1737 return (TRUE); 1738 } else 1739 return (FALSE); 1740 } 1741 1742 vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX]; 1743 #ifdef INVARIANTS 1744 static void 1745 validate_addr(vm_paddr_t addr, vm_size_t size) 1746 { 1747 vm_paddr_t end = addr + size; 1748 bool found = false; 1749 1750 for (int i = 0; i < 2 * phys_avail_count; i += 2) { 1751 if (addr >= phys_avail_debug[i] && 1752 end <= phys_avail_debug[i + 1]) { 1753 found = true; 1754 break; 1755 } 1756 } 1757 KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array", 1758 addr, end)); 1759 } 1760 #else 1761 static void validate_addr(vm_paddr_t addr, vm_size_t size) {} 1762 #endif 1763 #define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A) 1764 1765 static vm_paddr_t 1766 alloc_pt_page(void) 1767 { 1768 vm_paddr_t page; 1769 1770 page = allocpages(1); 1771 pagezero(PHYS_TO_DMAP(page)); 1772 return (page); 1773 } 1774 1775 static void 1776 mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end) 1777 { 1778 pt_entry_t *pte, pteval; 1779 vm_paddr_t page; 1780 1781 if (bootverbose) 1782 printf("%s %lx -> %lx\n", __func__, start, end); 1783 while (start < end) { 1784 pteval = start | DMAP_PAGE_BITS; 1785 pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start)); 1786 if ((be64toh(*pte) & RPTE_VALID) == 0) { 1787 page = alloc_pt_page(); 1788 pde_store(pte, page); 1789 } 1790 pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start)); 1791 if ((start & L2_PAGE_MASK) == 0 && 1792 end - start >= L2_PAGE_SIZE) { 1793 start += L2_PAGE_SIZE; 1794 goto done; 1795 } else if ((be64toh(*pte) & RPTE_VALID) == 0) { 1796 page = alloc_pt_page(); 1797 pde_store(pte, page); 1798 } 1799 1800 pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start)); 1801 if ((start & L3_PAGE_MASK) == 0 && 1802 end - start >= L3_PAGE_SIZE) { 1803 start += L3_PAGE_SIZE; 1804 goto done; 1805 } else if ((be64toh(*pte) & RPTE_VALID) == 0) { 1806 page = alloc_pt_page(); 1807 pde_store(pte, page); 1808 } 1809 pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start)); 1810 start += PAGE_SIZE; 1811 done: 1812 pte_store(pte, pteval); 1813 } 1814 } 1815 1816 static void 1817 mmu_radix_dmap_populate(vm_size_t hwphyssz) 1818 { 1819 vm_paddr_t start, end; 1820 1821 for (int i = 0; i < pregions_sz; i++) { 1822 start = 
pregions[i].mr_start; 1823 end = start + pregions[i].mr_size; 1824 if (hwphyssz && start >= hwphyssz) 1825 break; 1826 if (hwphyssz && hwphyssz < end) 1827 end = hwphyssz; 1828 mmu_radix_dmap_range(start, end); 1829 } 1830 } 1831 1832 static void 1833 mmu_radix_setup_pagetables(vm_size_t hwphyssz) 1834 { 1835 vm_paddr_t ptpages, pages; 1836 pt_entry_t *pte; 1837 vm_paddr_t l1phys; 1838 1839 bzero(kernel_pmap, sizeof(struct pmap)); 1840 PMAP_LOCK_INIT(kernel_pmap); 1841 1842 ptpages = allocpages(3); 1843 l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE); 1844 validate_addr(l1phys, RADIX_PGD_SIZE); 1845 if (bootverbose) 1846 printf("l1phys=%lx\n", l1phys); 1847 MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0); 1848 for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++) 1849 pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE)); 1850 kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys); 1851 1852 mmu_radix_dmap_populate(hwphyssz); 1853 1854 /* 1855 * Create page tables for first 128MB of KVA 1856 */ 1857 pages = ptpages; 1858 pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS); 1859 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); 1860 pages += PAGE_SIZE; 1861 pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS); 1862 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); 1863 pages += PAGE_SIZE; 1864 pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS); 1865 /* 1866 * the kernel page table pages need to be preserved in 1867 * phys_avail and not overlap with previous allocations 1868 */ 1869 pages = allocpages(nkpt); 1870 if (bootverbose) { 1871 printf("phys_avail after dmap populate and nkpt allocation\n"); 1872 for (int j = 0; j < 2 * phys_avail_count; j+=2) 1873 printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", 1874 j, phys_avail[j], j + 1, phys_avail[j + 1]); 1875 } 1876 KPTphys = pages; 1877 for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE) 1878 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); 1879 kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE; 1880 if (bootverbose) 1881 printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1); 1882 /* 1883 * Add a physical memory segment (vm_phys_seg) corresponding to the 1884 * preallocated kernel page table pages so that vm_page structures 1885 * representing these pages will be created. The vm_page structures 1886 * are required for promotion of the corresponding kernel virtual 1887 * addresses to superpage mappings. 
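	 * The segment added below spans [KPTphys, KPTphys + ptoa(nkpt)),
	 * i.e. exactly the nkpt page table pages allocated above.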
1888 */ 1889 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1890 } 1891 1892 static void 1893 mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end) 1894 { 1895 vm_paddr_t kpstart, kpend; 1896 vm_size_t physsz, hwphyssz; 1897 //uint64_t l2virt; 1898 int rm_pavail, proctab_size; 1899 int i, j; 1900 1901 kpstart = start & ~DMAP_BASE_ADDRESS; 1902 kpend = end & ~DMAP_BASE_ADDRESS; 1903 1904 /* Get physical memory regions from firmware */ 1905 mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); 1906 CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory"); 1907 1908 if (2 * VM_PHYSSEG_MAX < regions_sz) 1909 panic("mmu_radix_early_bootstrap: phys_avail too small"); 1910 1911 if (bootverbose) 1912 for (int i = 0; i < regions_sz; i++) 1913 printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n", 1914 i, regions[i].mr_start, i, regions[i].mr_size); 1915 /* 1916 * XXX workaround a simulator bug 1917 */ 1918 for (int i = 0; i < regions_sz; i++) 1919 if (regions[i].mr_start & PAGE_MASK) { 1920 regions[i].mr_start += PAGE_MASK; 1921 regions[i].mr_start &= ~PAGE_MASK; 1922 regions[i].mr_size &= ~PAGE_MASK; 1923 } 1924 if (bootverbose) 1925 for (int i = 0; i < pregions_sz; i++) 1926 printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n", 1927 i, pregions[i].mr_start, i, pregions[i].mr_size); 1928 1929 phys_avail_count = 0; 1930 physsz = 0; 1931 hwphyssz = 0; 1932 TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); 1933 for (i = 0, j = 0; i < regions_sz; i++) { 1934 if (bootverbose) 1935 printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n", 1936 i, regions[i].mr_start, i, regions[i].mr_size); 1937 1938 if (regions[i].mr_size < PAGE_SIZE) 1939 continue; 1940 1941 if (hwphyssz != 0 && 1942 (physsz + regions[i].mr_size) >= hwphyssz) { 1943 if (physsz < hwphyssz) { 1944 phys_avail[j] = regions[i].mr_start; 1945 phys_avail[j + 1] = regions[i].mr_start + 1946 (hwphyssz - physsz); 1947 physsz = hwphyssz; 1948 phys_avail_count++; 1949 dump_avail[j] = phys_avail[j]; 1950 dump_avail[j + 1] = phys_avail[j + 1]; 1951 } 1952 break; 1953 } 1954 phys_avail[j] = regions[i].mr_start; 1955 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; 1956 dump_avail[j] = phys_avail[j]; 1957 dump_avail[j + 1] = phys_avail[j + 1]; 1958 1959 phys_avail_count++; 1960 physsz += regions[i].mr_size; 1961 j += 2; 1962 } 1963 1964 /* Check for overlap with the kernel and exception vectors */ 1965 rm_pavail = 0; 1966 for (j = 0; j < 2 * phys_avail_count; j+=2) { 1967 if (phys_avail[j] < EXC_LAST) 1968 phys_avail[j] += EXC_LAST; 1969 1970 if (phys_avail[j] >= kpstart && 1971 phys_avail[j + 1] <= kpend) { 1972 phys_avail[j] = phys_avail[j + 1] = ~0; 1973 rm_pavail++; 1974 continue; 1975 } 1976 1977 if (kpstart >= phys_avail[j] && 1978 kpstart < phys_avail[j + 1]) { 1979 if (kpend < phys_avail[j + 1]) { 1980 phys_avail[2 * phys_avail_count] = 1981 (kpend & ~PAGE_MASK) + PAGE_SIZE; 1982 phys_avail[2 * phys_avail_count + 1] = 1983 phys_avail[j + 1]; 1984 phys_avail_count++; 1985 } 1986 1987 phys_avail[j + 1] = kpstart & ~PAGE_MASK; 1988 } 1989 1990 if (kpend >= phys_avail[j] && 1991 kpend < phys_avail[j + 1]) { 1992 if (kpstart > phys_avail[j]) { 1993 phys_avail[2 * phys_avail_count] = phys_avail[j]; 1994 phys_avail[2 * phys_avail_count + 1] = 1995 kpstart & ~PAGE_MASK; 1996 phys_avail_count++; 1997 } 1998 1999 phys_avail[j] = (kpend & ~PAGE_MASK) + 2000 PAGE_SIZE; 2001 } 2002 } 2003 qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp); 2004 for (i = 0; i < 2 * 
phys_avail_count; i++) 2005 phys_avail_debug[i] = phys_avail[i]; 2006 2007 /* Remove physical available regions marked for removal (~0) */ 2008 if (rm_pavail) { 2009 phys_avail_count -= rm_pavail; 2010 for (i = 2 * phys_avail_count; 2011 i < 2*(phys_avail_count + rm_pavail); i+=2) 2012 phys_avail[i] = phys_avail[i + 1] = 0; 2013 } 2014 if (bootverbose) { 2015 printf("phys_avail ranges after filtering:\n"); 2016 for (j = 0; j < 2 * phys_avail_count; j+=2) 2017 printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", 2018 j, phys_avail[j], j + 1, phys_avail[j + 1]); 2019 } 2020 physmem = btoc(physsz); 2021 2022 /* XXX assume we're running non-virtualized and 2023 * we don't support BHYVE 2024 */ 2025 if (isa3_pid_bits == 0) 2026 isa3_pid_bits = 20; 2027 parttab_phys = moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE); 2028 validate_addr(parttab_phys, PARTTAB_SIZE); 2029 for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++) 2030 pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE)); 2031 2032 proctab_size = 1UL << PROCTAB_SIZE_SHIFT; 2033 proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size); 2034 validate_addr(proctab0pa, proctab_size); 2035 for (int i = 0; i < proctab_size/PAGE_SIZE; i++) 2036 pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE)); 2037 2038 mmu_radix_setup_pagetables(hwphyssz); 2039 } 2040 2041 static void 2042 mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end) 2043 { 2044 int i; 2045 vm_paddr_t pa; 2046 void *dpcpu; 2047 vm_offset_t va; 2048 2049 /* 2050 * Set up the Open Firmware pmap and add its mappings if not in real 2051 * mode. 2052 */ 2053 if (bootverbose) 2054 printf("%s enter\n", __func__); 2055 2056 /* 2057 * Calculate the last available physical address, and reserve the 2058 * vm_page_array (upper bound). 2059 */ 2060 Maxmem = 0; 2061 for (i = 0; phys_avail[i + 2] != 0; i += 2) 2062 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); 2063 2064 /* 2065 * Set the start and end of kva. 2066 */ 2067 virtual_avail = VM_MIN_KERNEL_ADDRESS; 2068 virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; 2069 2070 /* 2071 * Remap any early IO mappings (console framebuffer, etc.) 2072 */ 2073 bs_remap_earlyboot(); 2074 2075 /* 2076 * Allocate a kernel stack with a guard page for thread0 and map it 2077 * into the kernel page map. 2078 */ 2079 pa = allocpages(kstack_pages); 2080 va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; 2081 virtual_avail = va + kstack_pages * PAGE_SIZE; 2082 CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); 2083 thread0.td_kstack = va; 2084 for (i = 0; i < kstack_pages; i++) { 2085 mmu_radix_kenter(va, pa); 2086 pa += PAGE_SIZE; 2087 va += PAGE_SIZE; 2088 } 2089 thread0.td_kstack_pages = kstack_pages; 2090 2091 /* 2092 * Allocate virtual address space for the message buffer. 2093 */ 2094 pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT); 2095 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa); 2096 2097 /* 2098 * Allocate virtual address space for the dynamic percpu area. 2099 */ 2100 pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT); 2101 dpcpu = (void *)PHYS_TO_DMAP(pa); 2102 dpcpu_init(dpcpu, curcpu); 2103 2104 crashdumpmap = (caddr_t)virtual_avail; 2105 virtual_avail += MAXDUMPPGS * PAGE_SIZE; 2106 2107 /* 2108 * Reserve some special page table entries/VA space for temporary 2109 * mapping of pages. 
2110 */ 2111 } 2112 2113 static void 2114 mmu_parttab_init(void) 2115 { 2116 uint64_t ptcr; 2117 2118 isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys); 2119 2120 if (bootverbose) 2121 printf("%s parttab: %p\n", __func__, isa3_parttab); 2122 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); 2123 if (bootverbose) 2124 printf("setting ptcr %lx\n", ptcr); 2125 mtspr(SPR_PTCR, ptcr); 2126 } 2127 2128 static void 2129 mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab) 2130 { 2131 uint64_t prev; 2132 2133 if (bootverbose) 2134 printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab, 2135 lpid, pagetab, proctab); 2136 prev = be64toh(isa3_parttab[lpid].pagetab); 2137 isa3_parttab[lpid].pagetab = htobe64(pagetab); 2138 isa3_parttab[lpid].proctab = htobe64(proctab); 2139 2140 if (prev & PARTTAB_HR) { 2141 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : 2142 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2143 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 2144 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2145 } else { 2146 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : 2147 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2148 } 2149 ttusync(); 2150 } 2151 2152 static void 2153 mmu_radix_parttab_init(void) 2154 { 2155 uint64_t pagetab; 2156 2157 mmu_parttab_init(); 2158 pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \ 2159 RADIX_PGD_INDEX_SHIFT | PARTTAB_HR; 2160 mmu_parttab_update(0, pagetab, 0); 2161 } 2162 2163 static void 2164 mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size) 2165 { 2166 uint64_t pagetab, proctab; 2167 2168 pagetab = be64toh(isa3_parttab[0].pagetab); 2169 proctab = proctabpa | table_size | PARTTAB_GR; 2170 mmu_parttab_update(0, pagetab, proctab); 2171 } 2172 2173 static void 2174 mmu_radix_proctab_init(void) 2175 { 2176 2177 isa3_base_pid = 1; 2178 2179 isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa); 2180 isa3_proctab->proctab0 = 2181 htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | 2182 RADIX_PGD_INDEX_SHIFT); 2183 2184 mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12); 2185 2186 __asm __volatile("ptesync" : : : "memory"); 2187 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 2188 "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); 2189 __asm __volatile("eieio; tlbsync; ptesync" : : : "memory"); 2190 if (bootverbose) 2191 printf("process table %p and kernel radix PDE: %p\n", 2192 isa3_proctab, kernel_pmap->pm_pml1); 2193 mtmsr(mfmsr() | PSL_DR ); 2194 mtmsr(mfmsr() & ~PSL_DR); 2195 kernel_pmap->pm_pid = isa3_base_pid; 2196 isa3_base_pid++; 2197 } 2198 2199 void 2200 mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2201 int advice) 2202 { 2203 struct rwlock *lock; 2204 pml1_entry_t *l1e; 2205 pml2_entry_t *l2e; 2206 pml3_entry_t oldl3e, *l3e; 2207 pt_entry_t *pte; 2208 vm_offset_t va, va_next; 2209 vm_page_t m; 2210 bool anychanged; 2211 2212 if (advice != MADV_DONTNEED && advice != MADV_FREE) 2213 return; 2214 anychanged = false; 2215 PMAP_LOCK(pmap); 2216 for (; sva < eva; sva = va_next) { 2217 l1e = pmap_pml1e(pmap, sva); 2218 if ((be64toh(*l1e) & PG_V) == 0) { 2219 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 2220 if (va_next < sva) 2221 va_next = eva; 2222 continue; 2223 } 2224 l2e = pmap_l1e_to_l2e(l1e, sva); 2225 if ((be64toh(*l2e) & PG_V) == 0) { 2226 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 2227 if (va_next < sva) 2228 va_next = eva; 2229 continue; 2230 } 2231 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 2232 if (va_next < sva) 2233 va_next = eva; 
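		/*
		 * va_next is now the start of the next 2MB region; the check
		 * above clamps it to eva if the addition wrapped around the
		 * top of the address space.  For example, with L3_PAGE_SIZE
		 * of 0x200000, an sva of 0x10123000 yields 0x10200000.
		 */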
2234 l3e = pmap_l2e_to_l3e(l2e, sva); 2235 oldl3e = be64toh(*l3e); 2236 if ((oldl3e & PG_V) == 0) 2237 continue; 2238 else if ((oldl3e & RPTE_LEAF) != 0) { 2239 if ((oldl3e & PG_MANAGED) == 0) 2240 continue; 2241 lock = NULL; 2242 if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { 2243 if (lock != NULL) 2244 rw_wunlock(lock); 2245 2246 /* 2247 * The large page mapping was destroyed. 2248 */ 2249 continue; 2250 } 2251 2252 /* 2253 * Unless the page mappings are wired, remove the 2254 * mapping to a single page so that a subsequent 2255 * access may repromote. Choosing the last page 2256 * within the address range [sva, min(va_next, eva)) 2257 * generally results in more repromotions. Since the 2258 * underlying page table page is fully populated, this 2259 * removal never frees a page table page. 2260 */ 2261 if ((oldl3e & PG_W) == 0) { 2262 va = eva; 2263 if (va > va_next) 2264 va = va_next; 2265 va -= PAGE_SIZE; 2266 KASSERT(va >= sva, 2267 ("mmu_radix_advise: no address gap")); 2268 pte = pmap_l3e_to_pte(l3e, va); 2269 KASSERT((be64toh(*pte) & PG_V) != 0, 2270 ("pmap_advise: invalid PTE")); 2271 pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL, 2272 &lock); 2273 anychanged = true; 2274 } 2275 if (lock != NULL) 2276 rw_wunlock(lock); 2277 } 2278 if (va_next > eva) 2279 va_next = eva; 2280 va = va_next; 2281 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; 2282 pte++, sva += PAGE_SIZE) { 2283 MPASS(pte == pmap_pte(pmap, sva)); 2284 2285 if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 2286 goto maybe_invlrng; 2287 else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2288 if (advice == MADV_DONTNEED) { 2289 /* 2290 * Future calls to pmap_is_modified() 2291 * can be avoided by making the page 2292 * dirty now. 2293 */ 2294 m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME); 2295 vm_page_dirty(m); 2296 } 2297 atomic_clear_long(pte, htobe64(PG_M | PG_A)); 2298 } else if ((be64toh(*pte) & PG_A) != 0) 2299 atomic_clear_long(pte, htobe64(PG_A)); 2300 else 2301 goto maybe_invlrng; 2302 anychanged = true; 2303 continue; 2304 maybe_invlrng: 2305 if (va != va_next) { 2306 anychanged = true; 2307 va = va_next; 2308 } 2309 } 2310 if (va != va_next) 2311 anychanged = true; 2312 } 2313 if (anychanged) 2314 pmap_invalidate_all(pmap); 2315 PMAP_UNLOCK(pmap); 2316 } 2317 2318 /* 2319 * Routines used in machine-dependent code 2320 */ 2321 static void 2322 mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end) 2323 { 2324 uint64_t lpcr; 2325 2326 if (bootverbose) 2327 printf("%s\n", __func__); 2328 hw_direct_map = 1; 2329 mmu_radix_early_bootstrap(start, end); 2330 if (bootverbose) 2331 printf("early bootstrap complete\n"); 2332 if (powernv_enabled) { 2333 lpcr = mfspr(SPR_LPCR); 2334 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 2335 mmu_radix_parttab_init(); 2336 mmu_radix_init_amor(); 2337 if (bootverbose) 2338 printf("powernv init complete\n"); 2339 } 2340 mmu_radix_init_iamr(); 2341 mmu_radix_proctab_init(); 2342 mmu_radix_pid_set(kernel_pmap); 2343 /* XXX assume CPU_FTR_HVMODE */ 2344 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 2345 2346 mmu_radix_late_bootstrap(start, end); 2347 numa_mem_regions(&numa_pregions, &numa_pregions_sz); 2348 if (bootverbose) 2349 printf("%s done\n", __func__); 2350 pmap_bootstrapped = 1; 2351 dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE); 2352 PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS); 2353 } 2354 2355 static void 2356 mmu_radix_cpu_bootstrap(int ap) 2357 { 2358 uint64_t lpcr; 2359 uint64_t ptcr; 2360 2361 if 
(powernv_enabled) { 2362 lpcr = mfspr(SPR_LPCR); 2363 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 2364 2365 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); 2366 mtspr(SPR_PTCR, ptcr); 2367 mmu_radix_init_amor(); 2368 } 2369 mmu_radix_init_iamr(); 2370 mmu_radix_pid_set(kernel_pmap); 2371 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 2372 } 2373 2374 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0, 2375 "2MB page mapping counters"); 2376 2377 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions); 2378 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD, 2379 &pmap_l3e_demotions, "2MB page demotions"); 2380 2381 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings); 2382 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD, 2383 &pmap_l3e_mappings, "2MB page mappings"); 2384 2385 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures); 2386 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD, 2387 &pmap_l3e_p_failures, "2MB page promotion failures"); 2388 2389 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions); 2390 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD, 2391 &pmap_l3e_promotions, "2MB page promotions"); 2392 2393 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0, 2394 "1GB page mapping counters"); 2395 2396 static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions); 2397 SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD, 2398 &pmap_l2e_demotions, "1GB page demotions"); 2399 2400 void 2401 mmu_radix_clear_modify(vm_page_t m) 2402 { 2403 struct md_page *pvh; 2404 pmap_t pmap; 2405 pv_entry_t next_pv, pv; 2406 pml3_entry_t oldl3e, *l3e; 2407 pt_entry_t oldpte, *pte; 2408 struct rwlock *lock; 2409 vm_offset_t va; 2410 int md_gen, pvh_gen; 2411 2412 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2413 ("pmap_clear_modify: page %p is not managed", m)); 2414 vm_page_assert_busied(m); 2415 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 2416 2417 /* 2418 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 2419 * If the object containing the page is locked and the page is not 2420 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 2421 */ 2422 if ((m->a.flags & PGA_WRITEABLE) == 0) 2423 return; 2424 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2425 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2426 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 2427 rw_wlock(lock); 2428 restart: 2429 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { 2430 pmap = PV_PMAP(pv); 2431 if (!PMAP_TRYLOCK(pmap)) { 2432 pvh_gen = pvh->pv_gen; 2433 rw_wunlock(lock); 2434 PMAP_LOCK(pmap); 2435 rw_wlock(lock); 2436 if (pvh_gen != pvh->pv_gen) { 2437 PMAP_UNLOCK(pmap); 2438 goto restart; 2439 } 2440 } 2441 va = pv->pv_va; 2442 l3e = pmap_pml3e(pmap, va); 2443 oldl3e = be64toh(*l3e); 2444 if ((oldl3e & PG_RW) != 0 && 2445 pmap_demote_l3e_locked(pmap, l3e, va, &lock) && 2446 (oldl3e & PG_W) == 0) { 2447 /* 2448 * Write protect the mapping to a 2449 * single page so that a subsequent 2450 * write access may repromote. 
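			 * The compare-and-set loop below retries until it
			 * installs (oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW),
			 * i.e. a clean, read-only PTE, re-reading the PTE
			 * whenever a concurrent update wins the race.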
2451 */ 2452 va += VM_PAGE_TO_PHYS(m) - (oldl3e & 2453 PG_PS_FRAME); 2454 pte = pmap_l3e_to_pte(l3e, va); 2455 oldpte = be64toh(*pte); 2456 while (!atomic_cmpset_long(pte, 2457 htobe64(oldpte), 2458 htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW)))) 2459 oldpte = be64toh(*pte); 2460 vm_page_dirty(m); 2461 pmap_invalidate_page(pmap, va); 2462 } 2463 PMAP_UNLOCK(pmap); 2464 } 2465 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 2466 pmap = PV_PMAP(pv); 2467 if (!PMAP_TRYLOCK(pmap)) { 2468 md_gen = m->md.pv_gen; 2469 pvh_gen = pvh->pv_gen; 2470 rw_wunlock(lock); 2471 PMAP_LOCK(pmap); 2472 rw_wlock(lock); 2473 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 2474 PMAP_UNLOCK(pmap); 2475 goto restart; 2476 } 2477 } 2478 l3e = pmap_pml3e(pmap, pv->pv_va); 2479 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found" 2480 " a 2mpage in page %p's pv list", m)); 2481 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 2482 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2483 atomic_clear_long(pte, htobe64(PG_M)); 2484 pmap_invalidate_page(pmap, pv->pv_va); 2485 } 2486 PMAP_UNLOCK(pmap); 2487 } 2488 rw_wunlock(lock); 2489 } 2490 2491 void 2492 mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 2493 vm_size_t len, vm_offset_t src_addr) 2494 { 2495 struct rwlock *lock; 2496 struct spglist free; 2497 vm_offset_t addr; 2498 vm_offset_t end_addr = src_addr + len; 2499 vm_offset_t va_next; 2500 vm_page_t dst_pdpg, dstmpte, srcmpte; 2501 bool invalidate_all; 2502 2503 CTR6(KTR_PMAP, 2504 "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n", 2505 __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); 2506 2507 if (dst_addr != src_addr) 2508 return; 2509 lock = NULL; 2510 invalidate_all = false; 2511 if (dst_pmap < src_pmap) { 2512 PMAP_LOCK(dst_pmap); 2513 PMAP_LOCK(src_pmap); 2514 } else { 2515 PMAP_LOCK(src_pmap); 2516 PMAP_LOCK(dst_pmap); 2517 } 2518 2519 for (addr = src_addr; addr < end_addr; addr = va_next) { 2520 pml1_entry_t *l1e; 2521 pml2_entry_t *l2e; 2522 pml3_entry_t srcptepaddr, *l3e; 2523 pt_entry_t *src_pte, *dst_pte; 2524 2525 l1e = pmap_pml1e(src_pmap, addr); 2526 if ((be64toh(*l1e) & PG_V) == 0) { 2527 va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 2528 if (va_next < addr) 2529 va_next = end_addr; 2530 continue; 2531 } 2532 2533 l2e = pmap_l1e_to_l2e(l1e, addr); 2534 if ((be64toh(*l2e) & PG_V) == 0) { 2535 va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 2536 if (va_next < addr) 2537 va_next = end_addr; 2538 continue; 2539 } 2540 2541 va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 2542 if (va_next < addr) 2543 va_next = end_addr; 2544 2545 l3e = pmap_l2e_to_l3e(l2e, addr); 2546 srcptepaddr = be64toh(*l3e); 2547 if (srcptepaddr == 0) 2548 continue; 2549 2550 if (srcptepaddr & RPTE_LEAF) { 2551 if ((addr & L3_PAGE_MASK) != 0 || 2552 addr + L3_PAGE_SIZE > end_addr) 2553 continue; 2554 dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL); 2555 if (dst_pdpg == NULL) 2556 break; 2557 l3e = (pml3_entry_t *) 2558 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); 2559 l3e = &l3e[pmap_pml3e_index(addr)]; 2560 if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 2561 pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr, 2562 PMAP_ENTER_NORECLAIM, &lock))) { 2563 *l3e = htobe64(srcptepaddr & ~PG_W); 2564 pmap_resident_count_inc(dst_pmap, 2565 L3_PAGE_SIZE / PAGE_SIZE); 2566 counter_u64_add(pmap_l3e_mappings, 1); 2567 } else 2568 dst_pdpg->ref_count--; 2569 continue; 2570 } 2571 2572 srcptepaddr &= PG_FRAME; 2573 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 2574 
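		/*
		 * srcptepaddr now holds the physical address of the source
		 * page table page; it must still be held by at least one
		 * mapping, which the assertion below checks.
		 */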
KASSERT(srcmpte->ref_count > 0, 2575 ("pmap_copy: source page table page is unused")); 2576 2577 if (va_next > end_addr) 2578 va_next = end_addr; 2579 2580 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 2581 src_pte = &src_pte[pmap_pte_index(addr)]; 2582 dstmpte = NULL; 2583 while (addr < va_next) { 2584 pt_entry_t ptetemp; 2585 ptetemp = be64toh(*src_pte); 2586 /* 2587 * we only virtual copy managed pages 2588 */ 2589 if ((ptetemp & PG_MANAGED) != 0) { 2590 if (dstmpte != NULL && 2591 dstmpte->pindex == pmap_l3e_pindex(addr)) 2592 dstmpte->ref_count++; 2593 else if ((dstmpte = pmap_allocpte(dst_pmap, 2594 addr, NULL)) == NULL) 2595 goto out; 2596 dst_pte = (pt_entry_t *) 2597 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 2598 dst_pte = &dst_pte[pmap_pte_index(addr)]; 2599 if (be64toh(*dst_pte) == 0 && 2600 pmap_try_insert_pv_entry(dst_pmap, addr, 2601 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 2602 &lock)) { 2603 /* 2604 * Clear the wired, modified, and 2605 * accessed (referenced) bits 2606 * during the copy. 2607 */ 2608 *dst_pte = htobe64(ptetemp & ~(PG_W | PG_M | 2609 PG_A)); 2610 pmap_resident_count_inc(dst_pmap, 1); 2611 } else { 2612 SLIST_INIT(&free); 2613 if (pmap_unwire_ptp(dst_pmap, addr, 2614 dstmpte, &free)) { 2615 /* 2616 * Although "addr" is not 2617 * mapped, paging-structure 2618 * caches could nonetheless 2619 * have entries that refer to 2620 * the freed page table pages. 2621 * Invalidate those entries. 2622 */ 2623 invalidate_all = true; 2624 vm_page_free_pages_toq(&free, 2625 true); 2626 } 2627 goto out; 2628 } 2629 if (dstmpte->ref_count >= srcmpte->ref_count) 2630 break; 2631 } 2632 addr += PAGE_SIZE; 2633 if (__predict_false((addr & L3_PAGE_MASK) == 0)) 2634 src_pte = pmap_pte(src_pmap, addr); 2635 else 2636 src_pte++; 2637 } 2638 } 2639 out: 2640 if (invalidate_all) 2641 pmap_invalidate_all(dst_pmap); 2642 if (lock != NULL) 2643 rw_wunlock(lock); 2644 PMAP_UNLOCK(src_pmap); 2645 PMAP_UNLOCK(dst_pmap); 2646 } 2647 2648 static void 2649 mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst) 2650 { 2651 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 2652 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 2653 2654 CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); 2655 /* 2656 * XXX slow 2657 */ 2658 bcopy((void *)src, (void *)dst, PAGE_SIZE); 2659 } 2660 2661 static void 2662 mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 2663 vm_offset_t b_offset, int xfersize) 2664 { 2665 void *a_cp, *b_cp; 2666 vm_offset_t a_pg_offset, b_pg_offset; 2667 int cnt; 2668 2669 CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, 2670 a_offset, mb, b_offset, xfersize); 2671 2672 while (xfersize > 0) { 2673 a_pg_offset = a_offset & PAGE_MASK; 2674 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 2675 a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( 2676 VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + 2677 a_pg_offset; 2678 b_pg_offset = b_offset & PAGE_MASK; 2679 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 2680 b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( 2681 VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + 2682 b_pg_offset; 2683 bcopy(a_cp, b_cp, cnt); 2684 a_offset += cnt; 2685 b_offset += cnt; 2686 xfersize -= cnt; 2687 } 2688 } 2689 2690 #if VM_NRESERVLEVEL > 0 2691 /* 2692 * Tries to promote the 512, contiguous 4KB page mappings that are within a 2693 * single page table page (PTP) to a single 2MB page mapping. 
For promotion 2694 * to occur, two conditions must be met: (1) the 4KB page mappings must map 2695 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 2696 * identical characteristics. 2697 */ 2698 static int 2699 pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va, 2700 struct rwlock **lockp) 2701 { 2702 pml3_entry_t newpde; 2703 pt_entry_t *firstpte, oldpte, pa, *pte; 2704 vm_page_t mpte; 2705 2706 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2707 2708 /* 2709 * Examine the first PTE in the specified PTP. Abort if this PTE is 2710 * either invalid, unused, or does not map the first 4KB physical page 2711 * within a 2MB page. 2712 */ 2713 firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME); 2714 setpde: 2715 newpde = be64toh(*firstpte); 2716 if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 2717 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2718 " in pmap %p", va, pmap); 2719 goto fail; 2720 } 2721 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 2722 /* 2723 * When PG_M is already clear, PG_RW can be cleared without 2724 * a TLB invalidation. 2725 */ 2726 if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W))) 2727 goto setpde; 2728 newpde &= ~RPTE_EAA_W; 2729 } 2730 2731 /* 2732 * Examine each of the other PTEs in the specified PTP. Abort if this 2733 * PTE maps an unexpected 4KB physical page or does not have identical 2734 * characteristics to the first PTE. 2735 */ 2736 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE; 2737 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 2738 setpte: 2739 oldpte = be64toh(*pte); 2740 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 2741 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2742 " in pmap %p", va, pmap); 2743 goto fail; 2744 } 2745 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 2746 /* 2747 * When PG_M is already clear, PG_RW can be cleared 2748 * without a TLB invalidation. 2749 */ 2750 if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W))) 2751 goto setpte; 2752 oldpte &= ~RPTE_EAA_W; 2753 CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx" 2754 " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) | 2755 (va & ~L3_PAGE_MASK), pmap); 2756 } 2757 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 2758 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2759 " in pmap %p", va, pmap); 2760 goto fail; 2761 } 2762 pa -= PAGE_SIZE; 2763 } 2764 2765 /* 2766 * Save the page table page in its current state until the PDE 2767 * mapping the superpage is demoted by pmap_demote_pde() or 2768 * destroyed by pmap_remove_pde(). 2769 */ 2770 mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME); 2771 KASSERT(mpte >= vm_page_array && 2772 mpte < &vm_page_array[vm_page_array_size], 2773 ("pmap_promote_l3e: page table page is out of range")); 2774 KASSERT(mpte->pindex == pmap_l3e_pindex(va), 2775 ("pmap_promote_l3e: page table page's pindex is wrong")); 2776 if (pmap_insert_pt_page(pmap, mpte)) { 2777 CTR2(KTR_PMAP, 2778 "pmap_promote_l3e: failure for va %#lx in pmap %p", va, 2779 pmap); 2780 goto fail; 2781 } 2782 2783 /* 2784 * Promote the pv entries. 
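	 * pmap_pv_promote_l3e() replaces the 512 pv entries for the 4KB
	 * mappings with a single pv entry on the 2MB page's pv list.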
2785 */ 2786 if ((newpde & PG_MANAGED) != 0) 2787 pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp); 2788 2789 pte_store(pde, PG_PROMOTED | newpde); 2790 ptesync(); 2791 counter_u64_add(pmap_l3e_promotions, 1); 2792 CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx" 2793 " in pmap %p", va, pmap); 2794 return (0); 2795 fail: 2796 counter_u64_add(pmap_l3e_p_failures, 1); 2797 return (KERN_FAILURE); 2798 } 2799 #endif /* VM_NRESERVLEVEL > 0 */ 2800 2801 int 2802 mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, 2803 vm_prot_t prot, u_int flags, int8_t psind) 2804 { 2805 struct rwlock *lock; 2806 pml3_entry_t *l3e; 2807 pt_entry_t *pte; 2808 pt_entry_t newpte, origpte; 2809 pv_entry_t pv; 2810 vm_paddr_t opa, pa; 2811 vm_page_t mpte, om; 2812 int rv, retrycount; 2813 boolean_t nosleep, invalidate_all, invalidate_page; 2814 2815 va = trunc_page(va); 2816 retrycount = 0; 2817 invalidate_page = invalidate_all = false; 2818 CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va, 2819 m, prot, flags, psind); 2820 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 2821 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 2822 ("pmap_enter: managed mapping within the clean submap")); 2823 if ((m->oflags & VPO_UNMANAGED) == 0) 2824 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2825 2826 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 2827 ("pmap_enter: flags %u has reserved bits set", flags)); 2828 pa = VM_PAGE_TO_PHYS(m); 2829 newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF); 2830 if ((flags & VM_PROT_WRITE) != 0) 2831 newpte |= PG_M; 2832 if ((flags & VM_PROT_READ) != 0) 2833 newpte |= PG_A; 2834 if (prot & VM_PROT_READ) 2835 newpte |= RPTE_EAA_R; 2836 if ((prot & VM_PROT_WRITE) != 0) 2837 newpte |= RPTE_EAA_W; 2838 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 2839 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 2840 2841 if (prot & VM_PROT_EXECUTE) 2842 newpte |= PG_X; 2843 if ((flags & PMAP_ENTER_WIRED) != 0) 2844 newpte |= PG_W; 2845 if (va >= DMAP_MIN_ADDRESS) 2846 newpte |= RPTE_EAA_P; 2847 newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs); 2848 /* 2849 * Set modified bit gratuitously for writeable mappings if 2850 * the page is unmanaged. We do not want to take a fault 2851 * to do the dirty bit accounting for these mappings. 2852 */ 2853 if ((m->oflags & VPO_UNMANAGED) != 0) { 2854 if ((newpte & PG_RW) != 0) 2855 newpte |= PG_M; 2856 } else 2857 newpte |= PG_MANAGED; 2858 2859 lock = NULL; 2860 PMAP_LOCK(pmap); 2861 if (psind == 1) { 2862 /* Assert the required virtual and physical alignment. */ 2863 KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned")); 2864 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2865 rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock); 2866 goto out; 2867 } 2868 mpte = NULL; 2869 2870 /* 2871 * In the case that a page table page is not 2872 * resident, we are creating it here. 2873 */ 2874 retry: 2875 l3e = pmap_pml3e(pmap, va); 2876 if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 || 2877 pmap_demote_l3e_locked(pmap, l3e, va, &lock))) { 2878 pte = pmap_l3e_to_pte(l3e, va); 2879 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 2880 mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); 2881 mpte->ref_count++; 2882 } 2883 } else if (va < VM_MAXUSER_ADDRESS) { 2884 /* 2885 * Here if the pte page isn't mapped, or if it has been 2886 * deallocated. 
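		 * _pmap_allocpte() may drop the pmap lock while it sleeps,
		 * so the page table walk is redone from "retry"; the
		 * retrycount check below bounds how many times that can
		 * happen before panicking.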
2887 */ 2888 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2889 mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va), 2890 nosleep ? NULL : &lock); 2891 if (mpte == NULL && nosleep) { 2892 rv = KERN_RESOURCE_SHORTAGE; 2893 goto out; 2894 } 2895 if (__predict_false(retrycount++ == 6)) 2896 panic("too many retries"); 2897 invalidate_all = true; 2898 goto retry; 2899 } else 2900 panic("pmap_enter: invalid page directory va=%#lx", va); 2901 2902 origpte = be64toh(*pte); 2903 pv = NULL; 2904 2905 /* 2906 * Is the specified virtual address already mapped? 2907 */ 2908 if ((origpte & PG_V) != 0) { 2909 #ifdef INVARIANTS 2910 if (VERBOSE_PMAP || pmap_logging) { 2911 printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --" 2912 " asid=%lu curpid=%d name=%s origpte0x%lx\n", 2913 pmap, va, m, prot, flags, psind, pmap->pm_pid, 2914 curproc->p_pid, curproc->p_comm, origpte); 2915 pmap_pte_walk(pmap->pm_pml1, va); 2916 } 2917 #endif 2918 /* 2919 * Wiring change, just update stats. We don't worry about 2920 * wiring PT pages as they remain resident as long as there 2921 * are valid mappings in them. Hence, if a user page is wired, 2922 * the PT page will be also. 2923 */ 2924 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 2925 pmap->pm_stats.wired_count++; 2926 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 2927 pmap->pm_stats.wired_count--; 2928 2929 /* 2930 * Remove the extra PT page reference. 2931 */ 2932 if (mpte != NULL) { 2933 mpte->ref_count--; 2934 KASSERT(mpte->ref_count > 0, 2935 ("pmap_enter: missing reference to page table page," 2936 " va: 0x%lx", va)); 2937 } 2938 2939 /* 2940 * Has the physical page changed? 2941 */ 2942 opa = origpte & PG_FRAME; 2943 if (opa == pa) { 2944 /* 2945 * No, might be a protection or wiring change. 2946 */ 2947 if ((origpte & PG_MANAGED) != 0 && 2948 (newpte & PG_RW) != 0) 2949 vm_page_aflag_set(m, PGA_WRITEABLE); 2950 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) { 2951 if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) { 2952 if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte))) 2953 goto retry; 2954 if ((newpte & PG_M) != (origpte & PG_M)) 2955 vm_page_dirty(m); 2956 if ((newpte & PG_A) != (origpte & PG_A)) 2957 vm_page_aflag_set(m, PGA_REFERENCED); 2958 ptesync(); 2959 } else 2960 invalidate_all = true; 2961 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 2962 goto unchanged; 2963 } 2964 goto validate; 2965 } 2966 2967 /* 2968 * The physical page has changed. Temporarily invalidate 2969 * the mapping. This ensures that all threads sharing the 2970 * pmap keep a consistent view of the mapping, which is 2971 * necessary for the correct handling of COW faults. It 2972 * also permits reuse of the old mapping's PV entry, 2973 * avoiding an allocation. 2974 * 2975 * For consistency, handle unmanaged mappings the same way. 2976 */ 2977 origpte = be64toh(pte_load_clear(pte)); 2978 KASSERT((origpte & PG_FRAME) == opa, 2979 ("pmap_enter: unexpected pa update for %#lx", va)); 2980 if ((origpte & PG_MANAGED) != 0) { 2981 om = PHYS_TO_VM_PAGE(opa); 2982 2983 /* 2984 * The pmap lock is sufficient to synchronize with 2985 * concurrent calls to pmap_page_test_mappings() and 2986 * pmap_ts_referenced(). 
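			 * Both of those functions take the pmap lock before
			 * they inspect a PTE, so they cannot observe this
			 * mapping in a half-updated state.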
2987 */ 2988 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2989 vm_page_dirty(om); 2990 if ((origpte & PG_A) != 0) 2991 vm_page_aflag_set(om, PGA_REFERENCED); 2992 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2993 pv = pmap_pvh_remove(&om->md, pmap, va); 2994 if ((newpte & PG_MANAGED) == 0) 2995 free_pv_entry(pmap, pv); 2996 #ifdef INVARIANTS 2997 else if (origpte & PG_MANAGED) { 2998 if (pv == NULL) { 2999 pmap_page_print_mappings(om); 3000 MPASS(pv != NULL); 3001 } 3002 } 3003 #endif 3004 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3005 TAILQ_EMPTY(&om->md.pv_list) && 3006 ((om->flags & PG_FICTITIOUS) != 0 || 3007 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3008 vm_page_aflag_clear(om, PGA_WRITEABLE); 3009 } 3010 if ((origpte & PG_A) != 0) 3011 invalidate_page = true; 3012 origpte = 0; 3013 } else { 3014 if (pmap != kernel_pmap) { 3015 #ifdef INVARIANTS 3016 if (VERBOSE_PMAP || pmap_logging) 3017 printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n", 3018 pmap, va, m, prot, flags, psind, 3019 pmap->pm_pid, curproc->p_pid, 3020 curproc->p_comm); 3021 #endif 3022 } 3023 3024 /* 3025 * Increment the counters. 3026 */ 3027 if ((newpte & PG_W) != 0) 3028 pmap->pm_stats.wired_count++; 3029 pmap_resident_count_inc(pmap, 1); 3030 } 3031 3032 /* 3033 * Enter on the PV list if part of our managed memory. 3034 */ 3035 if ((newpte & PG_MANAGED) != 0) { 3036 if (pv == NULL) { 3037 pv = get_pv_entry(pmap, &lock); 3038 pv->pv_va = va; 3039 } 3040 #ifdef VERBOSE_PV 3041 else 3042 printf("reassigning pv: %p to pmap: %p\n", 3043 pv, pmap); 3044 #endif 3045 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3046 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 3047 m->md.pv_gen++; 3048 if ((newpte & PG_RW) != 0) 3049 vm_page_aflag_set(m, PGA_WRITEABLE); 3050 } 3051 3052 /* 3053 * Update the PTE. 3054 */ 3055 if ((origpte & PG_V) != 0) { 3056 validate: 3057 origpte = be64toh(pte_load_store(pte, htobe64(newpte))); 3058 KASSERT((origpte & PG_FRAME) == pa, 3059 ("pmap_enter: unexpected pa update for %#lx", va)); 3060 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 3061 (PG_M | PG_RW)) { 3062 if ((origpte & PG_MANAGED) != 0) 3063 vm_page_dirty(m); 3064 invalidate_page = true; 3065 3066 /* 3067 * Although the PTE may still have PG_RW set, TLB 3068 * invalidation may nonetheless be required because 3069 * the PTE no longer has PG_M set. 3070 */ 3071 } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) { 3072 /* 3073 * Removing capabilities requires invalidation on POWER 3074 */ 3075 invalidate_page = true; 3076 goto unchanged; 3077 } 3078 if ((origpte & PG_A) != 0) 3079 invalidate_page = true; 3080 } else { 3081 pte_store(pte, newpte); 3082 ptesync(); 3083 } 3084 unchanged: 3085 3086 #if VM_NRESERVLEVEL > 0 3087 /* 3088 * If both the page table page and the reservation are fully 3089 * populated, then attempt promotion. 3090 */ 3091 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 3092 mmu_radix_ps_enabled(pmap) && 3093 (m->flags & PG_FICTITIOUS) == 0 && 3094 vm_reserv_level_iffullpop(m) == 0 && 3095 pmap_promote_l3e(pmap, l3e, va, &lock) == 0) 3096 invalidate_all = true; 3097 #endif 3098 if (invalidate_all) 3099 pmap_invalidate_all(pmap); 3100 else if (invalidate_page) 3101 pmap_invalidate_page(pmap, va); 3102 3103 rv = KERN_SUCCESS; 3104 out: 3105 if (lock != NULL) 3106 rw_wunlock(lock); 3107 PMAP_UNLOCK(pmap); 3108 3109 return (rv); 3110 } 3111 3112 /* 3113 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 3114 * if successful. 
Returns false if (1) a page table page cannot be allocated 3115 * without sleeping, (2) a mapping already exists at the specified virtual 3116 * address, or (3) a PV entry cannot be allocated without reclaiming another 3117 * PV entry. 3118 */ 3119 static bool 3120 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3121 struct rwlock **lockp) 3122 { 3123 pml3_entry_t newpde; 3124 3125 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3126 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) | 3127 RPTE_LEAF | PG_V; 3128 if ((m->oflags & VPO_UNMANAGED) == 0) 3129 newpde |= PG_MANAGED; 3130 if (prot & VM_PROT_EXECUTE) 3131 newpde |= PG_X; 3132 if (prot & VM_PROT_READ) 3133 newpde |= RPTE_EAA_R; 3134 if (va >= DMAP_MIN_ADDRESS) 3135 newpde |= RPTE_EAA_P; 3136 return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 3137 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 3138 KERN_SUCCESS); 3139 } 3140 3141 /* 3142 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3143 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 3144 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 3145 * a mapping already exists at the specified virtual address. Returns 3146 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 3147 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 3148 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 3149 * 3150 * The parameter "m" is only used when creating a managed, writeable mapping. 3151 */ 3152 static int 3153 pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, 3154 vm_page_t m, struct rwlock **lockp) 3155 { 3156 struct spglist free; 3157 pml3_entry_t oldl3e, *l3e; 3158 vm_page_t mt, pdpg; 3159 3160 KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, 3161 ("pmap_enter_pde: newpde is missing PG_M")); 3162 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3163 3164 if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 3165 NULL : lockp)) == NULL) { 3166 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3167 " in pmap %p", va, pmap); 3168 return (KERN_RESOURCE_SHORTAGE); 3169 } 3170 l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 3171 l3e = &l3e[pmap_pml3e_index(va)]; 3172 oldl3e = be64toh(*l3e); 3173 if ((oldl3e & PG_V) != 0) { 3174 KASSERT(pdpg->ref_count > 1, 3175 ("pmap_enter_pde: pdpg's wire count is too low")); 3176 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3177 pdpg->ref_count--; 3178 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3179 " in pmap %p", va, pmap); 3180 return (KERN_FAILURE); 3181 } 3182 /* Break the existing mapping(s). */ 3183 SLIST_INIT(&free); 3184 if ((oldl3e & RPTE_LEAF) != 0) { 3185 /* 3186 * The reference to the PD page that was acquired by 3187 * pmap_allocl3e() ensures that it won't be freed. 3188 * However, if the PDE resulted from a promotion, then 3189 * a reserved PT page could be freed. 3190 */ 3191 (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp); 3192 pmap_invalidate_l3e_page(pmap, va, oldl3e); 3193 } else { 3194 if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e, 3195 &free, lockp)) 3196 pmap_invalidate_all(pmap); 3197 } 3198 vm_page_free_pages_toq(&free, true); 3199 if (va >= VM_MAXUSER_ADDRESS) { 3200 mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); 3201 if (pmap_insert_pt_page(pmap, mt)) { 3202 /* 3203 * XXX Currently, this can't happen because 3204 * we do not perform pmap_enter(psind == 1) 3205 * on the kernel pmap. 
3206 */ 3207 panic("pmap_enter_pde: trie insert failed"); 3208 } 3209 } else 3210 KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p", 3211 l3e)); 3212 } 3213 if ((newpde & PG_MANAGED) != 0) { 3214 /* 3215 * Abort this mapping if its PV entry could not be created. 3216 */ 3217 if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) { 3218 SLIST_INIT(&free); 3219 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 3220 /* 3221 * Although "va" is not mapped, paging- 3222 * structure caches could nonetheless have 3223 * entries that refer to the freed page table 3224 * pages. Invalidate those entries. 3225 */ 3226 pmap_invalidate_page(pmap, va); 3227 vm_page_free_pages_toq(&free, true); 3228 } 3229 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3230 " in pmap %p", va, pmap); 3231 return (KERN_RESOURCE_SHORTAGE); 3232 } 3233 if ((newpde & PG_RW) != 0) { 3234 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 3235 vm_page_aflag_set(mt, PGA_WRITEABLE); 3236 } 3237 } 3238 3239 /* 3240 * Increment counters. 3241 */ 3242 if ((newpde & PG_W) != 0) 3243 pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE; 3244 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); 3245 3246 /* 3247 * Map the superpage. (This is not a promoted mapping; there will not 3248 * be any lingering 4KB page mappings in the TLB.) 3249 */ 3250 pte_store(l3e, newpde); 3251 ptesync(); 3252 3253 counter_u64_add(pmap_l3e_mappings, 1); 3254 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3255 " in pmap %p", va, pmap); 3256 return (KERN_SUCCESS); 3257 } 3258 3259 void 3260 mmu_radix_enter_object(pmap_t pmap, vm_offset_t start, 3261 vm_offset_t end, vm_page_t m_start, vm_prot_t prot) 3262 { 3263 3264 struct rwlock *lock; 3265 vm_offset_t va; 3266 vm_page_t m, mpte; 3267 vm_pindex_t diff, psize; 3268 bool invalidate; 3269 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3270 3271 CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, 3272 end, m_start, prot); 3273 3274 invalidate = false; 3275 psize = atop(end - start); 3276 mpte = NULL; 3277 m = m_start; 3278 lock = NULL; 3279 PMAP_LOCK(pmap); 3280 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3281 va = start + ptoa(diff); 3282 if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end && 3283 m->psind == 1 && mmu_radix_ps_enabled(pmap) && 3284 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3285 m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1]; 3286 else 3287 mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot, 3288 mpte, &lock, &invalidate); 3289 m = TAILQ_NEXT(m, listq); 3290 } 3291 ptesync(); 3292 if (lock != NULL) 3293 rw_wunlock(lock); 3294 if (invalidate) 3295 pmap_invalidate_all(pmap); 3296 PMAP_UNLOCK(pmap); 3297 } 3298 3299 static vm_page_t 3300 mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3301 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate) 3302 { 3303 struct spglist free; 3304 pt_entry_t *pte; 3305 vm_paddr_t pa; 3306 3307 KASSERT(!VA_IS_CLEANMAP(va) || 3308 (m->oflags & VPO_UNMANAGED) != 0, 3309 ("mmu_radix_enter_quick_locked: managed mapping within the clean submap")); 3310 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3311 3312 /* 3313 * In the case that a page table page is not 3314 * resident, we are creating it here. 
3315 */ 3316 if (va < VM_MAXUSER_ADDRESS) { 3317 vm_pindex_t ptepindex; 3318 pml3_entry_t *ptepa; 3319 3320 /* 3321 * Calculate pagetable page index 3322 */ 3323 ptepindex = pmap_l3e_pindex(va); 3324 if (mpte && (mpte->pindex == ptepindex)) { 3325 mpte->ref_count++; 3326 } else { 3327 /* 3328 * Get the page directory entry 3329 */ 3330 ptepa = pmap_pml3e(pmap, va); 3331 3332 /* 3333 * If the page table page is mapped, we just increment 3334 * the hold count, and activate it. Otherwise, we 3335 * attempt to allocate a page table page. If this 3336 * attempt fails, we don't retry. Instead, we give up. 3337 */ 3338 if (ptepa && (be64toh(*ptepa) & PG_V) != 0) { 3339 if (be64toh(*ptepa) & RPTE_LEAF) 3340 return (NULL); 3341 mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME); 3342 mpte->ref_count++; 3343 } else { 3344 /* 3345 * Pass NULL instead of the PV list lock 3346 * pointer, because we don't intend to sleep. 3347 */ 3348 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 3349 if (mpte == NULL) 3350 return (mpte); 3351 } 3352 } 3353 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3354 pte = &pte[pmap_pte_index(va)]; 3355 } else { 3356 mpte = NULL; 3357 pte = pmap_pte(pmap, va); 3358 } 3359 if (be64toh(*pte)) { 3360 if (mpte != NULL) { 3361 mpte->ref_count--; 3362 mpte = NULL; 3363 } 3364 return (mpte); 3365 } 3366 3367 /* 3368 * Enter on the PV list if part of our managed memory. 3369 */ 3370 if ((m->oflags & VPO_UNMANAGED) == 0 && 3371 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3372 if (mpte != NULL) { 3373 SLIST_INIT(&free); 3374 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3375 /* 3376 * Although "va" is not mapped, paging- 3377 * structure caches could nonetheless have 3378 * entries that refer to the freed page table 3379 * pages. Invalidate those entries. 3380 */ 3381 *invalidate = true; 3382 vm_page_free_pages_toq(&free, true); 3383 } 3384 mpte = NULL; 3385 } 3386 return (mpte); 3387 } 3388 3389 /* 3390 * Increment counters 3391 */ 3392 pmap_resident_count_inc(pmap, 1); 3393 3394 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs); 3395 if (prot & VM_PROT_EXECUTE) 3396 pa |= PG_X; 3397 else 3398 pa |= RPTE_EAA_R; 3399 if ((m->oflags & VPO_UNMANAGED) == 0) 3400 pa |= PG_MANAGED; 3401 3402 pte_store(pte, pa); 3403 return (mpte); 3404 } 3405 3406 void 3407 mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, 3408 vm_prot_t prot) 3409 { 3410 struct rwlock *lock; 3411 bool invalidate; 3412 3413 lock = NULL; 3414 invalidate = false; 3415 PMAP_LOCK(pmap); 3416 mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock, 3417 &invalidate); 3418 ptesync(); 3419 if (lock != NULL) 3420 rw_wunlock(lock); 3421 if (invalidate) 3422 pmap_invalidate_all(pmap); 3423 PMAP_UNLOCK(pmap); 3424 } 3425 3426 vm_paddr_t 3427 mmu_radix_extract(pmap_t pmap, vm_offset_t va) 3428 { 3429 pml3_entry_t *l3e; 3430 pt_entry_t *pte; 3431 vm_paddr_t pa; 3432 3433 l3e = pmap_pml3e(pmap, va); 3434 if (__predict_false(l3e == NULL)) 3435 return (0); 3436 if (be64toh(*l3e) & RPTE_LEAF) { 3437 pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK); 3438 pa |= (va & L3_PAGE_MASK); 3439 } else { 3440 /* 3441 * Beware of a concurrent promotion that changes the 3442 * PDE at this point! For example, vtopte() must not 3443 * be used to access the PTE because it would use the 3444 * new PDE. It is, however, safe to use the old PDE 3445 * because the page table page is preserved by the 3446 * promotion. 
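		 * (The promotion preserves it by saving the old page table
		 * page with pmap_insert_pt_page() in pmap_promote_l3e().)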
3447 */ 3448 pte = pmap_l3e_to_pte(l3e, va); 3449 if (__predict_false(pte == NULL)) 3450 return (0); 3451 pa = be64toh(*pte); 3452 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3453 pa |= (va & PAGE_MASK); 3454 } 3455 return (pa); 3456 } 3457 3458 vm_page_t 3459 mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3460 { 3461 pml3_entry_t l3e, *l3ep; 3462 pt_entry_t pte; 3463 vm_page_t m; 3464 3465 m = NULL; 3466 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); 3467 PMAP_LOCK(pmap); 3468 l3ep = pmap_pml3e(pmap, va); 3469 if (l3ep != NULL && (l3e = be64toh(*l3ep))) { 3470 if (l3e & RPTE_LEAF) { 3471 if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0) 3472 m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) | 3473 (va & L3_PAGE_MASK)); 3474 } else { 3475 /* Native endian PTE, do not pass to pmap functions */ 3476 pte = be64toh(*pmap_l3e_to_pte(l3ep, va)); 3477 if ((pte & PG_V) && 3478 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) 3479 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3480 } 3481 if (m != NULL && !vm_page_wire_mapped(m)) 3482 m = NULL; 3483 } 3484 PMAP_UNLOCK(pmap); 3485 return (m); 3486 } 3487 3488 static void 3489 mmu_radix_growkernel(vm_offset_t addr) 3490 { 3491 vm_paddr_t paddr; 3492 vm_page_t nkpg; 3493 pml3_entry_t *l3e; 3494 pml2_entry_t *l2e; 3495 3496 CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); 3497 if (VM_MIN_KERNEL_ADDRESS < addr && 3498 addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE)) 3499 return; 3500 3501 addr = roundup2(addr, L3_PAGE_SIZE); 3502 if (addr - 1 >= vm_map_max(kernel_map)) 3503 addr = vm_map_max(kernel_map); 3504 while (kernel_vm_end < addr) { 3505 l2e = pmap_pml2e(kernel_pmap, kernel_vm_end); 3506 if ((be64toh(*l2e) & PG_V) == 0) { 3507 /* We need a new PDP entry */ 3508 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT, 3509 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 3510 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3511 if (nkpg == NULL) 3512 panic("pmap_growkernel: no memory to grow kernel"); 3513 if ((nkpg->flags & PG_ZERO) == 0) 3514 mmu_radix_zero_page(nkpg); 3515 paddr = VM_PAGE_TO_PHYS(nkpg); 3516 pde_store(l2e, paddr); 3517 continue; /* try again */ 3518 } 3519 l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end); 3520 if ((be64toh(*l3e) & PG_V) != 0) { 3521 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 3522 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3523 kernel_vm_end = vm_map_max(kernel_map); 3524 break; 3525 } 3526 continue; 3527 } 3528 3529 nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end), 3530 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3531 VM_ALLOC_ZERO); 3532 if (nkpg == NULL) 3533 panic("pmap_growkernel: no memory to grow kernel"); 3534 if ((nkpg->flags & PG_ZERO) == 0) 3535 mmu_radix_zero_page(nkpg); 3536 paddr = VM_PAGE_TO_PHYS(nkpg); 3537 pde_store(l3e, paddr); 3538 3539 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 3540 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3541 kernel_vm_end = vm_map_max(kernel_map); 3542 break; 3543 } 3544 } 3545 ptesync(); 3546 } 3547 3548 static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory"); 3549 static uma_zone_t zone_radix_pgd; 3550 3551 static int 3552 radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused, 3553 int flags) 3554 { 3555 3556 for (int i = 0; i < count; i++) { 3557 vm_page_t m = vm_page_alloc_contig(NULL, 0, 3558 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3559 VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE, 3560 0, (vm_paddr_t)-1, RADIX_PGD_SIZE, 
L1_PAGE_SIZE, 3561 VM_MEMATTR_DEFAULT); 3562 /* XXX zero on alloc here so we don't have to later */ 3563 store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3564 } 3565 return (count); 3566 } 3567 3568 static void 3569 radix_pgd_release(void *arg __unused, void **store, int count) 3570 { 3571 vm_page_t m; 3572 struct spglist free; 3573 int page_count; 3574 3575 SLIST_INIT(&free); 3576 page_count = RADIX_PGD_SIZE/PAGE_SIZE; 3577 3578 for (int i = 0; i < count; i++) { 3579 /* 3580 * XXX selectively remove dmap and KVA entries so we don't 3581 * need to bzero 3582 */ 3583 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); 3584 for (int j = page_count-1; j >= 0; j--) { 3585 vm_page_unwire_noq(&m[j]); 3586 SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss); 3587 } 3588 vm_page_free_pages_toq(&free, false); 3589 } 3590 } 3591 3592 static void 3593 mmu_radix_init() 3594 { 3595 vm_page_t mpte; 3596 vm_size_t s; 3597 int error, i, pv_npg; 3598 3599 /* XXX is this really needed for POWER? */ 3600 /* L1TF, reserve page @0 unconditionally */ 3601 vm_page_blacklist_add(0, bootverbose); 3602 3603 zone_radix_pgd = uma_zcache_create("radix_pgd_cache", 3604 RADIX_PGD_SIZE, NULL, NULL, 3605 #ifdef INVARIANTS 3606 trash_init, trash_fini, 3607 #else 3608 NULL, NULL, 3609 #endif 3610 radix_pgd_import, radix_pgd_release, 3611 NULL, UMA_ZONE_NOBUCKET); 3612 3613 /* 3614 * Initialize the vm page array entries for the kernel pmap's 3615 * page table pages. 3616 */ 3617 PMAP_LOCK(kernel_pmap); 3618 for (i = 0; i < nkpt; i++) { 3619 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 3620 KASSERT(mpte >= vm_page_array && 3621 mpte < &vm_page_array[vm_page_array_size], 3622 ("pmap_init: page table page is out of range size: %lu", 3623 vm_page_array_size)); 3624 mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i; 3625 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 3626 MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte); 3627 //pmap_insert_pt_page(kernel_pmap, mpte); 3628 mpte->ref_count = 1; 3629 } 3630 PMAP_UNLOCK(kernel_pmap); 3631 vm_wire_add(nkpt); 3632 3633 CTR1(KTR_PMAP, "%s()", __func__); 3634 TAILQ_INIT(&pv_dummy.pv_list); 3635 3636 /* 3637 * Are large page mappings enabled? 3638 */ 3639 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); 3640 if (superpages_enabled) { 3641 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 3642 ("pmap_init: can't assign to pagesizes[1]")); 3643 pagesizes[1] = L3_PAGE_SIZE; 3644 } 3645 3646 /* 3647 * Initialize the pv chunk list mutex. 3648 */ 3649 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 3650 3651 /* 3652 * Initialize the pool of pv list locks. 3653 */ 3654 for (i = 0; i < NPV_LIST_LOCKS; i++) 3655 rw_init(&pv_list_locks[i], "pmap pv list"); 3656 3657 /* 3658 * Calculate the size of the pv head table for superpages. 3659 */ 3660 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE); 3661 3662 /* 3663 * Allocate memory for the pv head table for superpages. 
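	 * One md_page entry is allocated for every 2MB of physical address
	 * space up to the end of the last vm_phys segment, rounded up to
	 * whole pages.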
3664 */ 3665 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 3666 s = round_page(s); 3667 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 3668 for (i = 0; i < pv_npg; i++) 3669 TAILQ_INIT(&pv_table[i].pv_list); 3670 TAILQ_INIT(&pv_dummy.pv_list); 3671 3672 pmap_initialized = 1; 3673 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 3674 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3675 (vmem_addr_t *)&qframe); 3676 3677 if (error != 0) 3678 panic("qframe allocation failed"); 3679 asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits), 3680 1, 1, M_WAITOK); 3681 } 3682 3683 static boolean_t 3684 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3685 { 3686 struct rwlock *lock; 3687 pv_entry_t pv; 3688 struct md_page *pvh; 3689 pt_entry_t *pte, mask; 3690 pmap_t pmap; 3691 int md_gen, pvh_gen; 3692 boolean_t rv; 3693 3694 rv = FALSE; 3695 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3696 rw_rlock(lock); 3697 restart: 3698 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 3699 pmap = PV_PMAP(pv); 3700 if (!PMAP_TRYLOCK(pmap)) { 3701 md_gen = m->md.pv_gen; 3702 rw_runlock(lock); 3703 PMAP_LOCK(pmap); 3704 rw_rlock(lock); 3705 if (md_gen != m->md.pv_gen) { 3706 PMAP_UNLOCK(pmap); 3707 goto restart; 3708 } 3709 } 3710 pte = pmap_pte(pmap, pv->pv_va); 3711 mask = 0; 3712 if (modified) 3713 mask |= PG_RW | PG_M; 3714 if (accessed) 3715 mask |= PG_V | PG_A; 3716 rv = (be64toh(*pte) & mask) == mask; 3717 PMAP_UNLOCK(pmap); 3718 if (rv) 3719 goto out; 3720 } 3721 if ((m->flags & PG_FICTITIOUS) == 0) { 3722 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3723 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 3724 pmap = PV_PMAP(pv); 3725 if (!PMAP_TRYLOCK(pmap)) { 3726 md_gen = m->md.pv_gen; 3727 pvh_gen = pvh->pv_gen; 3728 rw_runlock(lock); 3729 PMAP_LOCK(pmap); 3730 rw_rlock(lock); 3731 if (md_gen != m->md.pv_gen || 3732 pvh_gen != pvh->pv_gen) { 3733 PMAP_UNLOCK(pmap); 3734 goto restart; 3735 } 3736 } 3737 pte = pmap_pml3e(pmap, pv->pv_va); 3738 mask = 0; 3739 if (modified) 3740 mask |= PG_RW | PG_M; 3741 if (accessed) 3742 mask |= PG_V | PG_A; 3743 rv = (be64toh(*pte) & mask) == mask; 3744 PMAP_UNLOCK(pmap); 3745 if (rv) 3746 goto out; 3747 } 3748 } 3749 out: 3750 rw_runlock(lock); 3751 return (rv); 3752 } 3753 3754 /* 3755 * pmap_is_modified: 3756 * 3757 * Return whether or not the specified physical page was modified 3758 * in any physical maps. 3759 */ 3760 boolean_t 3761 mmu_radix_is_modified(vm_page_t m) 3762 { 3763 3764 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3765 ("pmap_is_modified: page %p is not managed", m)); 3766 3767 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3768 /* 3769 * If the page is not busied then this check is racy. 
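	 * A page without writeable mappings (no PGA_WRITEABLE) is rejected
	 * here without walking its pv lists.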
3770 */ 3771 if (!pmap_page_is_write_mapped(m)) 3772 return (FALSE); 3773 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3774 } 3775 3776 boolean_t 3777 mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3778 { 3779 pml3_entry_t *l3e; 3780 pt_entry_t *pte; 3781 boolean_t rv; 3782 3783 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); 3784 rv = FALSE; 3785 PMAP_LOCK(pmap); 3786 l3e = pmap_pml3e(pmap, addr); 3787 if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) { 3788 pte = pmap_l3e_to_pte(l3e, addr); 3789 rv = (be64toh(*pte) & PG_V) == 0; 3790 } 3791 PMAP_UNLOCK(pmap); 3792 return (rv); 3793 } 3794 3795 boolean_t 3796 mmu_radix_is_referenced(vm_page_t m) 3797 { 3798 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3799 ("pmap_is_referenced: page %p is not managed", m)); 3800 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3801 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3802 } 3803 3804 /* 3805 * pmap_ts_referenced: 3806 * 3807 * Return a count of reference bits for a page, clearing those bits. 3808 * It is not necessary for every reference bit to be cleared, but it 3809 * is necessary that 0 only be returned when there are truly no 3810 * reference bits set. 3811 * 3812 * As an optimization, update the page's dirty field if a modified bit is 3813 * found while counting reference bits. This opportunistic update can be 3814 * performed at low cost and can eliminate the need for some future calls 3815 * to pmap_is_modified(). However, since this function stops after 3816 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3817 * dirty pages. Those dirty pages will only be detected by a future call 3818 * to pmap_is_modified(). 3819 * 3820 * A DI block is not needed within this function, because 3821 * invalidations are performed before the PV list lock is 3822 * released. 3823 */ 3824 boolean_t 3825 mmu_radix_ts_referenced(vm_page_t m) 3826 { 3827 struct md_page *pvh; 3828 pv_entry_t pv, pvf; 3829 pmap_t pmap; 3830 struct rwlock *lock; 3831 pml3_entry_t oldl3e, *l3e; 3832 pt_entry_t *pte; 3833 vm_paddr_t pa; 3834 int cleared, md_gen, not_cleared, pvh_gen; 3835 struct spglist free; 3836 3837 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3838 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3839 ("pmap_ts_referenced: page %p is not managed", m)); 3840 SLIST_INIT(&free); 3841 cleared = 0; 3842 pa = VM_PAGE_TO_PHYS(m); 3843 lock = PHYS_TO_PV_LIST_LOCK(pa); 3844 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 3845 rw_wlock(lock); 3846 retry: 3847 not_cleared = 0; 3848 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 3849 goto small_mappings; 3850 pv = pvf; 3851 do { 3852 if (pvf == NULL) 3853 pvf = pv; 3854 pmap = PV_PMAP(pv); 3855 if (!PMAP_TRYLOCK(pmap)) { 3856 pvh_gen = pvh->pv_gen; 3857 rw_wunlock(lock); 3858 PMAP_LOCK(pmap); 3859 rw_wlock(lock); 3860 if (pvh_gen != pvh->pv_gen) { 3861 PMAP_UNLOCK(pmap); 3862 goto retry; 3863 } 3864 } 3865 l3e = pmap_pml3e(pmap, pv->pv_va); 3866 oldl3e = be64toh(*l3e); 3867 if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3868 /* 3869 * Although "oldpde" is mapping a 2MB page, because 3870 * this function is called at a 4KB page granularity, 3871 * we only update the 4KB page under test. 3872 */ 3873 vm_page_dirty(m); 3874 } 3875 if ((oldl3e & PG_A) != 0) { 3876 /* 3877 * Since this reference bit is shared by 512 4KB 3878 * pages, it should not be cleared every time it is 3879 * tested. 
Apply a simple "hash" function on the 3880 * physical page number, the virtual superpage number, 3881 * and the pmap address to select one 4KB page out of 3882 * the 512 on which testing the reference bit will 3883 * result in clearing that reference bit. This 3884 * function is designed to avoid the selection of the 3885 * same 4KB page for every 2MB page mapping. 3886 * 3887 * On demotion, a mapping that hasn't been referenced 3888 * is simply destroyed. To avoid the possibility of a 3889 * subsequent page fault on a demoted wired mapping, 3890 * always leave its reference bit set. Moreover, 3891 * since the superpage is wired, the current state of 3892 * its reference bit won't affect page replacement. 3893 */ 3894 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ 3895 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 3896 (oldl3e & PG_W) == 0) { 3897 atomic_clear_long(l3e, htobe64(PG_A)); 3898 pmap_invalidate_page(pmap, pv->pv_va); 3899 cleared++; 3900 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3901 ("inconsistent pv lock %p %p for page %p", 3902 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3903 } else 3904 not_cleared++; 3905 } 3906 PMAP_UNLOCK(pmap); 3907 /* Rotate the PV list if it has more than one entry. */ 3908 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { 3909 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 3910 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 3911 pvh->pv_gen++; 3912 } 3913 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 3914 goto out; 3915 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 3916 small_mappings: 3917 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 3918 goto out; 3919 pv = pvf; 3920 do { 3921 if (pvf == NULL) 3922 pvf = pv; 3923 pmap = PV_PMAP(pv); 3924 if (!PMAP_TRYLOCK(pmap)) { 3925 pvh_gen = pvh->pv_gen; 3926 md_gen = m->md.pv_gen; 3927 rw_wunlock(lock); 3928 PMAP_LOCK(pmap); 3929 rw_wlock(lock); 3930 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3931 PMAP_UNLOCK(pmap); 3932 goto retry; 3933 } 3934 } 3935 l3e = pmap_pml3e(pmap, pv->pv_va); 3936 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, 3937 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 3938 m)); 3939 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 3940 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3941 vm_page_dirty(m); 3942 if ((be64toh(*pte) & PG_A) != 0) { 3943 atomic_clear_long(pte, htobe64(PG_A)); 3944 pmap_invalidate_page(pmap, pv->pv_va); 3945 cleared++; 3946 } 3947 PMAP_UNLOCK(pmap); 3948 /* Rotate the PV list if it has more than one entry. 
*/ 3949 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { 3950 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 3951 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 3952 m->md.pv_gen++; 3953 } 3954 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 3955 not_cleared < PMAP_TS_REFERENCED_MAX); 3956 out: 3957 rw_wunlock(lock); 3958 vm_page_free_pages_toq(&free, true); 3959 return (cleared + not_cleared); 3960 } 3961 3962 static vm_offset_t 3963 mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start, 3964 vm_paddr_t end, int prot __unused) 3965 { 3966 3967 CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, 3968 prot); 3969 return (PHYS_TO_DMAP(start)); 3970 } 3971 3972 void 3973 mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr, 3974 vm_object_t object, vm_pindex_t pindex, vm_size_t size) 3975 { 3976 pml3_entry_t *l3e; 3977 vm_paddr_t pa, ptepa; 3978 vm_page_t p, pdpg; 3979 vm_memattr_t ma; 3980 3981 CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, 3982 object, pindex, size); 3983 VM_OBJECT_ASSERT_WLOCKED(object); 3984 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3985 ("pmap_object_init_pt: non-device object")); 3986 /* NB: size can be logically ored with addr here */ 3987 if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) { 3988 if (!mmu_radix_ps_enabled(pmap)) 3989 return; 3990 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3991 return; 3992 p = vm_page_lookup(object, pindex); 3993 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3994 ("pmap_object_init_pt: invalid page %p", p)); 3995 ma = p->md.mdpg_cache_attrs; 3996 3997 /* 3998 * Abort the mapping if the first page is not physically 3999 * aligned to a 2MB page boundary. 4000 */ 4001 ptepa = VM_PAGE_TO_PHYS(p); 4002 if (ptepa & L3_PAGE_MASK) 4003 return; 4004 4005 /* 4006 * Skip the first page. Abort the mapping if the rest of 4007 * the pages are not physically contiguous or have differing 4008 * memory attributes. 4009 */ 4010 p = TAILQ_NEXT(p, listq); 4011 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4012 pa += PAGE_SIZE) { 4013 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4014 ("pmap_object_init_pt: invalid page %p", p)); 4015 if (pa != VM_PAGE_TO_PHYS(p) || 4016 ma != p->md.mdpg_cache_attrs) 4017 return; 4018 p = TAILQ_NEXT(p, listq); 4019 } 4020 4021 PMAP_LOCK(pmap); 4022 for (pa = ptepa | pmap_cache_bits(ma); 4023 pa < ptepa + size; pa += L3_PAGE_SIZE) { 4024 pdpg = pmap_allocl3e(pmap, addr, NULL); 4025 if (pdpg == NULL) { 4026 /* 4027 * The creation of mappings below is only an 4028 * optimization. If a page directory page 4029 * cannot be allocated without blocking, 4030 * continue on to the next mapping rather than 4031 * blocking. 4032 */ 4033 addr += L3_PAGE_SIZE; 4034 continue; 4035 } 4036 l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4037 l3e = &l3e[pmap_pml3e_index(addr)]; 4038 if ((be64toh(*l3e) & PG_V) == 0) { 4039 pa |= PG_M | PG_A | PG_RW; 4040 pte_store(l3e, pa); 4041 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); 4042 counter_u64_add(pmap_l3e_mappings, 1); 4043 } else { 4044 /* Continue on if the PDE is already valid. 
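 * Drop the reference taken by pmap_allocl3e() above; no new mapping was created.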
*/ 4045 pdpg->ref_count--; 4046 KASSERT(pdpg->ref_count > 0, 4047 ("pmap_object_init_pt: missing reference " 4048 "to page directory page, va: 0x%lx", addr)); 4049 } 4050 addr += L3_PAGE_SIZE; 4051 } 4052 ptesync(); 4053 PMAP_UNLOCK(pmap); 4054 } 4055 } 4056 4057 boolean_t 4058 mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m) 4059 { 4060 struct md_page *pvh; 4061 struct rwlock *lock; 4062 pv_entry_t pv; 4063 int loops = 0; 4064 boolean_t rv; 4065 4066 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4067 ("pmap_page_exists_quick: page %p is not managed", m)); 4068 CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); 4069 rv = FALSE; 4070 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4071 rw_rlock(lock); 4072 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 4073 if (PV_PMAP(pv) == pmap) { 4074 rv = TRUE; 4075 break; 4076 } 4077 loops++; 4078 if (loops >= 16) 4079 break; 4080 } 4081 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4082 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4083 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 4084 if (PV_PMAP(pv) == pmap) { 4085 rv = TRUE; 4086 break; 4087 } 4088 loops++; 4089 if (loops >= 16) 4090 break; 4091 } 4092 } 4093 rw_runlock(lock); 4094 return (rv); 4095 } 4096 4097 void 4098 mmu_radix_page_init(vm_page_t m) 4099 { 4100 4101 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 4102 TAILQ_INIT(&m->md.pv_list); 4103 m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; 4104 } 4105 4106 int 4107 mmu_radix_page_wired_mappings(vm_page_t m) 4108 { 4109 struct rwlock *lock; 4110 struct md_page *pvh; 4111 pmap_t pmap; 4112 pt_entry_t *pte; 4113 pv_entry_t pv; 4114 int count, md_gen, pvh_gen; 4115 4116 if ((m->oflags & VPO_UNMANAGED) != 0) 4117 return (0); 4118 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 4119 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4120 rw_rlock(lock); 4121 restart: 4122 count = 0; 4123 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 4124 pmap = PV_PMAP(pv); 4125 if (!PMAP_TRYLOCK(pmap)) { 4126 md_gen = m->md.pv_gen; 4127 rw_runlock(lock); 4128 PMAP_LOCK(pmap); 4129 rw_rlock(lock); 4130 if (md_gen != m->md.pv_gen) { 4131 PMAP_UNLOCK(pmap); 4132 goto restart; 4133 } 4134 } 4135 pte = pmap_pte(pmap, pv->pv_va); 4136 if ((be64toh(*pte) & PG_W) != 0) 4137 count++; 4138 PMAP_UNLOCK(pmap); 4139 } 4140 if ((m->flags & PG_FICTITIOUS) == 0) { 4141 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4142 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 4143 pmap = PV_PMAP(pv); 4144 if (!PMAP_TRYLOCK(pmap)) { 4145 md_gen = m->md.pv_gen; 4146 pvh_gen = pvh->pv_gen; 4147 rw_runlock(lock); 4148 PMAP_LOCK(pmap); 4149 rw_rlock(lock); 4150 if (md_gen != m->md.pv_gen || 4151 pvh_gen != pvh->pv_gen) { 4152 PMAP_UNLOCK(pmap); 4153 goto restart; 4154 } 4155 } 4156 pte = pmap_pml3e(pmap, pv->pv_va); 4157 if ((be64toh(*pte) & PG_W) != 0) 4158 count++; 4159 PMAP_UNLOCK(pmap); 4160 } 4161 } 4162 rw_runlock(lock); 4163 return (count); 4164 } 4165 4166 static void 4167 mmu_radix_update_proctab(int pid, pml1_entry_t l1pa) 4168 { 4169 isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT); 4170 } 4171 4172 int 4173 mmu_radix_pinit(pmap_t pmap) 4174 { 4175 vmem_addr_t pid; 4176 vm_paddr_t l1pa; 4177 4178 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4179 4180 /* 4181 * allocate the page directory page 4182 */ 4183 pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK); 4184 4185 for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++) 4186 pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE); 4187 pmap->pm_radix.rt_root = 0; 4188 TAILQ_INIT(&pmap->pm_pvchunk); 4189 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4190 pmap->pm_flags = 
PMAP_PDE_SUPERPAGE; 4191 vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid); 4192 4193 pmap->pm_pid = pid; 4194 l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1); 4195 mmu_radix_update_proctab(pid, l1pa); 4196 __asm __volatile("ptesync;isync" : : : "memory"); 4197 4198 return (1); 4199 } 4200 4201 /* 4202 * This routine is called if the desired page table page does not exist. 4203 * 4204 * If page table page allocation fails, this routine may sleep before 4205 * returning NULL. It sleeps only if a lock pointer was given. 4206 * 4207 * Note: If a page allocation fails at page table level two or three, 4208 * one or two pages may be held during the wait, only to be released 4209 * afterwards. This conservative approach is easily argued to avoid 4210 * race conditions. 4211 */ 4212 static vm_page_t 4213 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 4214 { 4215 vm_page_t m, pdppg, pdpg; 4216 4217 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4218 4219 /* 4220 * Allocate a page table page. 4221 */ 4222 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 4223 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 4224 if (lockp != NULL) { 4225 RELEASE_PV_LIST_LOCK(lockp); 4226 PMAP_UNLOCK(pmap); 4227 vm_wait(NULL); 4228 PMAP_LOCK(pmap); 4229 } 4230 /* 4231 * Indicate the need to retry. While waiting, the page table 4232 * page may have been allocated. 4233 */ 4234 return (NULL); 4235 } 4236 if ((m->flags & PG_ZERO) == 0) 4237 mmu_radix_zero_page(m); 4238 4239 /* 4240 * Map the pagetable page into the process address space, if 4241 * it isn't already there. 4242 */ 4243 4244 if (ptepindex >= (NUPDE + NUPDPE)) { 4245 pml1_entry_t *l1e; 4246 vm_pindex_t pml1index; 4247 4248 /* Wire up a new PDPE page */ 4249 pml1index = ptepindex - (NUPDE + NUPDPE); 4250 l1e = &pmap->pm_pml1[pml1index]; 4251 pde_store(l1e, VM_PAGE_TO_PHYS(m)); 4252 4253 } else if (ptepindex >= NUPDE) { 4254 vm_pindex_t pml1index; 4255 vm_pindex_t pdpindex; 4256 pml1_entry_t *l1e; 4257 pml2_entry_t *l2e; 4258 4259 /* Wire up a new l2e page */ 4260 pdpindex = ptepindex - NUPDE; 4261 pml1index = pdpindex >> RPTE_SHIFT; 4262 4263 l1e = &pmap->pm_pml1[pml1index]; 4264 if ((be64toh(*l1e) & PG_V) == 0) { 4265 /* Have to allocate a new pdp, recurse */ 4266 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index, 4267 lockp) == NULL) { 4268 vm_page_unwire_noq(m); 4269 vm_page_free_zero(m); 4270 return (NULL); 4271 } 4272 } else { 4273 /* Add reference to l2e page */ 4274 pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME); 4275 pdppg->ref_count++; 4276 } 4277 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); 4278 4279 /* Now find the pdp page */ 4280 l2e = &l2e[pdpindex & RPTE_MASK]; 4281 pde_store(l2e, VM_PAGE_TO_PHYS(m)); 4282 4283 } else { 4284 vm_pindex_t pml1index; 4285 vm_pindex_t pdpindex; 4286 pml1_entry_t *l1e; 4287 pml2_entry_t *l2e; 4288 pml3_entry_t *l3e; 4289 4290 /* Wire up a new PTE page */ 4291 pdpindex = ptepindex >> RPTE_SHIFT; 4292 pml1index = pdpindex >> RPTE_SHIFT; 4293 4294 /* First, find the pdp and check that its valid. 
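 * If it is not, allocate the missing intermediate level(s) by recursing on
 * _pmap_allocpte() before wiring up the new PTE page.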
*/ 4295 l1e = &pmap->pm_pml1[pml1index]; 4296 if ((be64toh(*l1e) & PG_V) == 0) { 4297 /* Have to allocate a new pd, recurse */ 4298 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 4299 lockp) == NULL) { 4300 vm_page_unwire_noq(m); 4301 vm_page_free_zero(m); 4302 return (NULL); 4303 } 4304 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); 4305 l2e = &l2e[pdpindex & RPTE_MASK]; 4306 } else { 4307 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); 4308 l2e = &l2e[pdpindex & RPTE_MASK]; 4309 if ((be64toh(*l2e) & PG_V) == 0) { 4310 /* Have to allocate a new pd, recurse */ 4311 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 4312 lockp) == NULL) { 4313 vm_page_unwire_noq(m); 4314 vm_page_free_zero(m); 4315 return (NULL); 4316 } 4317 } else { 4318 /* Add reference to the pd page */ 4319 pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME); 4320 pdpg->ref_count++; 4321 } 4322 } 4323 l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME); 4324 4325 /* Now we know where the page directory page is */ 4326 l3e = &l3e[ptepindex & RPTE_MASK]; 4327 pde_store(l3e, VM_PAGE_TO_PHYS(m)); 4328 } 4329 4330 pmap_resident_count_inc(pmap, 1); 4331 return (m); 4332 } 4333 static vm_page_t 4334 pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4335 { 4336 vm_pindex_t pdpindex, ptepindex; 4337 pml2_entry_t *pdpe; 4338 vm_page_t pdpg; 4339 4340 retry: 4341 pdpe = pmap_pml2e(pmap, va); 4342 if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) { 4343 /* Add a reference to the pd page. */ 4344 pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME); 4345 pdpg->ref_count++; 4346 } else { 4347 /* Allocate a pd page. */ 4348 ptepindex = pmap_l3e_pindex(va); 4349 pdpindex = ptepindex >> RPTE_SHIFT; 4350 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 4351 if (pdpg == NULL && lockp != NULL) 4352 goto retry; 4353 } 4354 return (pdpg); 4355 } 4356 4357 static vm_page_t 4358 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4359 { 4360 vm_pindex_t ptepindex; 4361 pml3_entry_t *pd; 4362 vm_page_t m; 4363 4364 /* 4365 * Calculate pagetable page index 4366 */ 4367 ptepindex = pmap_l3e_pindex(va); 4368 retry: 4369 /* 4370 * Get the page directory entry 4371 */ 4372 pd = pmap_pml3e(pmap, va); 4373 4374 /* 4375 * This supports switching from a 2MB page to a 4376 * normal 4K page. 4377 */ 4378 if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) { 4379 if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) { 4380 /* 4381 * Invalidation of the 2MB page mapping may have caused 4382 * the deallocation of the underlying PD page. 4383 */ 4384 pd = NULL; 4385 } 4386 } 4387 4388 /* 4389 * If the page table page is mapped, we just increment the 4390 * hold count, and activate it. 4391 */ 4392 if (pd != NULL && (be64toh(*pd) & PG_V) != 0) { 4393 m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME); 4394 m->ref_count++; 4395 } else { 4396 /* 4397 * Here if the pte page isn't mapped, or if it has been 4398 * deallocated. 
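 * Allocate a new page table page, retrying the lookup if the allocation had
 * to sleep.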
4399 */ 4400 m = _pmap_allocpte(pmap, ptepindex, lockp); 4401 if (m == NULL && lockp != NULL) 4402 goto retry; 4403 } 4404 return (m); 4405 } 4406 4407 static void 4408 mmu_radix_pinit0(pmap_t pmap) 4409 { 4410 4411 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4412 PMAP_LOCK_INIT(pmap); 4413 pmap->pm_pml1 = kernel_pmap->pm_pml1; 4414 pmap->pm_pid = kernel_pmap->pm_pid; 4415 4416 pmap->pm_radix.rt_root = 0; 4417 TAILQ_INIT(&pmap->pm_pvchunk); 4418 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4419 kernel_pmap->pm_flags = 4420 pmap->pm_flags = PMAP_PDE_SUPERPAGE; 4421 } 4422 /* 4423 * pmap_protect_l3e: do the things to protect a 2mpage in a process 4424 */ 4425 static boolean_t 4426 pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot) 4427 { 4428 pt_entry_t newpde, oldpde; 4429 vm_offset_t eva, va; 4430 vm_page_t m; 4431 boolean_t anychanged; 4432 4433 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4434 KASSERT((sva & L3_PAGE_MASK) == 0, 4435 ("pmap_protect_l3e: sva is not 2mpage aligned")); 4436 anychanged = FALSE; 4437 retry: 4438 oldpde = newpde = be64toh(*l3e); 4439 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 4440 (PG_MANAGED | PG_M | PG_RW)) { 4441 eva = sva + L3_PAGE_SIZE; 4442 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4443 va < eva; va += PAGE_SIZE, m++) 4444 vm_page_dirty(m); 4445 } 4446 if ((prot & VM_PROT_WRITE) == 0) { 4447 newpde &= ~(PG_RW | PG_M); 4448 newpde |= RPTE_EAA_R; 4449 } 4450 if (prot & VM_PROT_EXECUTE) 4451 newpde |= PG_X; 4452 if (newpde != oldpde) { 4453 /* 4454 * As an optimization to future operations on this PDE, clear 4455 * PG_PROMOTED. The impending invalidation will remove any 4456 * lingering 4KB page mappings from the TLB. 4457 */ 4458 if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED))) 4459 goto retry; 4460 anychanged = TRUE; 4461 } 4462 return (anychanged); 4463 } 4464 4465 void 4466 mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 4467 vm_prot_t prot) 4468 { 4469 vm_offset_t va_next; 4470 pml1_entry_t *l1e; 4471 pml2_entry_t *l2e; 4472 pml3_entry_t ptpaddr, *l3e; 4473 pt_entry_t *pte; 4474 boolean_t anychanged; 4475 4476 CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva, 4477 prot); 4478 4479 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4480 if (prot == VM_PROT_NONE) { 4481 mmu_radix_remove(pmap, sva, eva); 4482 return; 4483 } 4484 4485 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 4486 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 4487 return; 4488 4489 #ifdef INVARIANTS 4490 if (VERBOSE_PROTECT || pmap_logging) 4491 printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n", 4492 pmap, sva, eva, prot, pmap->pm_pid); 4493 #endif 4494 anychanged = FALSE; 4495 4496 PMAP_LOCK(pmap); 4497 for (; sva < eva; sva = va_next) { 4498 l1e = pmap_pml1e(pmap, sva); 4499 if ((be64toh(*l1e) & PG_V) == 0) { 4500 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 4501 if (va_next < sva) 4502 va_next = eva; 4503 continue; 4504 } 4505 4506 l2e = pmap_l1e_to_l2e(l1e, sva); 4507 if ((be64toh(*l2e) & PG_V) == 0) { 4508 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 4509 if (va_next < sva) 4510 va_next = eva; 4511 continue; 4512 } 4513 4514 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 4515 if (va_next < sva) 4516 va_next = eva; 4517 4518 l3e = pmap_l2e_to_l3e(l2e, sva); 4519 ptpaddr = be64toh(*l3e); 4520 4521 /* 4522 * Weed out invalid mappings. 4523 */ 4524 if (ptpaddr == 0) 4525 continue; 4526 4527 /* 4528 * Check for large page. 
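 * An L3 entry with RPTE_LEAF set maps a 2MB page directly.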
4529 */ 4530 if ((ptpaddr & RPTE_LEAF) != 0) { 4531 /* 4532 * Are we protecting the entire large page? If not, 4533 * demote the mapping and fall through. 4534 */ 4535 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 4536 if (pmap_protect_l3e(pmap, l3e, sva, prot)) 4537 anychanged = TRUE; 4538 continue; 4539 } else if (!pmap_demote_l3e(pmap, l3e, sva)) { 4540 /* 4541 * The large page mapping was destroyed. 4542 */ 4543 continue; 4544 } 4545 } 4546 4547 if (va_next > eva) 4548 va_next = eva; 4549 4550 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, 4551 sva += PAGE_SIZE) { 4552 pt_entry_t obits, pbits; 4553 vm_page_t m; 4554 4555 retry: 4556 MPASS(pte == pmap_pte(pmap, sva)); 4557 obits = pbits = be64toh(*pte); 4558 if ((pbits & PG_V) == 0) 4559 continue; 4560 4561 if ((prot & VM_PROT_WRITE) == 0) { 4562 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4563 (PG_MANAGED | PG_M | PG_RW)) { 4564 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4565 vm_page_dirty(m); 4566 } 4567 pbits &= ~(PG_RW | PG_M); 4568 pbits |= RPTE_EAA_R; 4569 } 4570 if (prot & VM_PROT_EXECUTE) 4571 pbits |= PG_X; 4572 4573 if (pbits != obits) { 4574 if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits))) 4575 goto retry; 4576 if (obits & (PG_A|PG_M)) { 4577 anychanged = TRUE; 4578 #ifdef INVARIANTS 4579 if (VERBOSE_PROTECT || pmap_logging) 4580 printf("%#lx %#lx -> %#lx\n", 4581 sva, obits, pbits); 4582 #endif 4583 } 4584 } 4585 } 4586 } 4587 if (anychanged) 4588 pmap_invalidate_all(pmap); 4589 PMAP_UNLOCK(pmap); 4590 } 4591 4592 void 4593 mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count) 4594 { 4595 4596 CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count); 4597 pt_entry_t oldpte, pa, *pte; 4598 vm_page_t m; 4599 uint64_t cache_bits, attr_bits; 4600 vm_offset_t va; 4601 4602 oldpte = 0; 4603 attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; 4604 va = sva; 4605 pte = kvtopte(va); 4606 while (va < sva + PAGE_SIZE * count) { 4607 if (__predict_false((va & L3_PAGE_MASK) == 0)) 4608 pte = kvtopte(va); 4609 MPASS(pte == pmap_pte(kernel_pmap, va)); 4610 4611 /* 4612 * XXX there has to be a more efficient way than traversing 4613 * the page table every time - but go for correctness for 4614 * today 4615 */ 4616 4617 m = *ma++; 4618 cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs); 4619 pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits; 4620 if (be64toh(*pte) != pa) { 4621 oldpte |= be64toh(*pte); 4622 pte_store(pte, pa); 4623 } 4624 va += PAGE_SIZE; 4625 pte++; 4626 } 4627 if (__predict_false((oldpte & RPTE_VALID) != 0)) 4628 pmap_invalidate_range(kernel_pmap, sva, sva + count * 4629 PAGE_SIZE); 4630 else 4631 ptesync(); 4632 } 4633 4634 void 4635 mmu_radix_qremove(vm_offset_t sva, int count) 4636 { 4637 vm_offset_t va; 4638 pt_entry_t *pte; 4639 4640 CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count); 4641 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva)); 4642 4643 va = sva; 4644 pte = kvtopte(va); 4645 while (va < sva + PAGE_SIZE * count) { 4646 if (__predict_false((va & L3_PAGE_MASK) == 0)) 4647 pte = kvtopte(va); 4648 pte_clear(pte); 4649 pte++; 4650 va += PAGE_SIZE; 4651 } 4652 pmap_invalidate_range(kernel_pmap, sva, va); 4653 } 4654 4655 /*************************************************** 4656 * Page table page management routines..... 4657 ***************************************************/ 4658 /* 4659 * Schedule the specified unused page table page to be freed. 
Specifically, 4660 * add the page to the specified list of pages that will be released to the 4661 * physical memory manager after the TLB has been updated. 4662 */ 4663 static __inline void 4664 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 4665 boolean_t set_PG_ZERO) 4666 { 4667 4668 if (set_PG_ZERO) 4669 m->flags |= PG_ZERO; 4670 else 4671 m->flags &= ~PG_ZERO; 4672 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4673 } 4674 4675 /* 4676 * Inserts the specified page table page into the specified pmap's collection 4677 * of idle page table pages. Each of a pmap's page table pages is responsible 4678 * for mapping a distinct range of virtual addresses. The pmap's collection is 4679 * ordered by this virtual address range. 4680 */ 4681 static __inline int 4682 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 4683 { 4684 4685 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4686 return (vm_radix_insert(&pmap->pm_radix, mpte)); 4687 } 4688 4689 /* 4690 * Removes the page table page mapping the specified virtual address from the 4691 * specified pmap's collection of idle page table pages, and returns it. 4692 * Otherwise, returns NULL if there is no page table page corresponding to the 4693 * specified virtual address. 4694 */ 4695 static __inline vm_page_t 4696 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4697 { 4698 4699 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4700 return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va))); 4701 } 4702 4703 /* 4704 * Decrements a page table page's wire count, which is used to record the 4705 * number of valid page table entries within the page. If the wire count 4706 * drops to zero, then the page table page is unmapped. Returns TRUE if the 4707 * page table page was unmapped and FALSE otherwise. 4708 */ 4709 static inline boolean_t 4710 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4711 { 4712 4713 --m->ref_count; 4714 if (m->ref_count == 0) { 4715 _pmap_unwire_ptp(pmap, va, m, free); 4716 return (TRUE); 4717 } else 4718 return (FALSE); 4719 } 4720 4721 static void 4722 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4723 { 4724 4725 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4726 /* 4727 * unmap the page table page 4728 */ 4729 if (m->pindex >= NUPDE + NUPDPE) { 4730 /* PDP page */ 4731 pml1_entry_t *pml1; 4732 pml1 = pmap_pml1e(pmap, va); 4733 *pml1 = 0; 4734 } else if (m->pindex >= NUPDE) { 4735 /* PD page */ 4736 pml2_entry_t *l2e; 4737 l2e = pmap_pml2e(pmap, va); 4738 *l2e = 0; 4739 } else { 4740 /* PTE page */ 4741 pml3_entry_t *l3e; 4742 l3e = pmap_pml3e(pmap, va); 4743 *l3e = 0; 4744 } 4745 pmap_resident_count_dec(pmap, 1); 4746 if (m->pindex < NUPDE) { 4747 /* We just released a PT, unhold the matching PD */ 4748 vm_page_t pdpg; 4749 4750 pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME); 4751 pmap_unwire_ptp(pmap, va, pdpg, free); 4752 } 4753 else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 4754 /* We just released a PD, unhold the matching PDP */ 4755 vm_page_t pdppg; 4756 4757 pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME); 4758 pmap_unwire_ptp(pmap, va, pdppg, free); 4759 } 4760 4761 /* 4762 * Put page on a list so that it is released after 4763 * *ALL* TLB shootdown is done 4764 */ 4765 pmap_add_delayed_free_list(m, free, TRUE); 4766 } 4767 4768 /* 4769 * After removing a page table entry, this routine is used to 4770 * conditionally free the page, and manage the hold/wire counts. 
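 * Kernel page table pages are never freed, so this is a no-op for kernel
 * virtual addresses.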
4771 */ 4772 static int 4773 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde, 4774 struct spglist *free) 4775 { 4776 vm_page_t mpte; 4777 4778 if (va >= VM_MAXUSER_ADDRESS) 4779 return (0); 4780 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4781 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4782 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4783 } 4784 4785 void 4786 mmu_radix_release(pmap_t pmap) 4787 { 4788 4789 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4790 KASSERT(pmap->pm_stats.resident_count == 0, 4791 ("pmap_release: pmap resident count %ld != 0", 4792 pmap->pm_stats.resident_count)); 4793 KASSERT(vm_radix_is_empty(&pmap->pm_radix), 4794 ("pmap_release: pmap has reserved page table page(s)")); 4795 4796 pmap_invalidate_all(pmap); 4797 isa3_proctab[pmap->pm_pid].proctab0 = 0; 4798 uma_zfree(zone_radix_pgd, pmap->pm_pml1); 4799 vmem_free(asid_arena, pmap->pm_pid, 1); 4800 } 4801 4802 /* 4803 * Create the PV entry for a 2MB page mapping. Always returns true unless the 4804 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 4805 * false if the PV entry cannot be allocated without resorting to reclamation. 4806 */ 4807 static bool 4808 pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags, 4809 struct rwlock **lockp) 4810 { 4811 struct md_page *pvh; 4812 pv_entry_t pv; 4813 vm_paddr_t pa; 4814 4815 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4816 /* Pass NULL instead of the lock pointer to disable reclamation. */ 4817 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 4818 NULL : lockp)) == NULL) 4819 return (false); 4820 pv->pv_va = va; 4821 pa = pde & PG_PS_FRAME; 4822 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4823 pvh = pa_to_pvh(pa); 4824 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 4825 pvh->pv_gen++; 4826 return (true); 4827 } 4828 4829 /* 4830 * Fills a page table page with mappings to consecutive physical pages. 4831 */ 4832 static void 4833 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 4834 { 4835 pt_entry_t *pte; 4836 4837 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 4838 *pte = htobe64(newpte); 4839 newpte += PAGE_SIZE; 4840 } 4841 } 4842 4843 static boolean_t 4844 pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va) 4845 { 4846 struct rwlock *lock; 4847 boolean_t rv; 4848 4849 lock = NULL; 4850 rv = pmap_demote_l3e_locked(pmap, pde, va, &lock); 4851 if (lock != NULL) 4852 rw_wunlock(lock); 4853 return (rv); 4854 } 4855 4856 static boolean_t 4857 pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, 4858 struct rwlock **lockp) 4859 { 4860 pml3_entry_t oldpde; 4861 pt_entry_t *firstpte; 4862 vm_paddr_t mptepa; 4863 vm_page_t mpte; 4864 struct spglist free; 4865 vm_offset_t sva; 4866 4867 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4868 oldpde = be64toh(*l3e); 4869 KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), 4870 ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx", 4871 oldpde)); 4872 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 4873 NULL) { 4874 KASSERT((oldpde & PG_W) == 0, 4875 ("pmap_demote_l3e: page table page for a wired mapping" 4876 " is missing")); 4877 4878 /* 4879 * Invalidate the 2MB page mapping and return "failure" if the 4880 * mapping was never accessed or the allocation of the new 4881 * page table page fails. 
If the 2MB page mapping belongs to 4882 * the direct map region of the kernel's address space, then 4883 * the page allocation request specifies the highest possible 4884 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 4885 * normal. Page table pages are preallocated for every other 4886 * part of the kernel address space, so the direct map region 4887 * is the only part of the kernel address space that must be 4888 * handled here. 4889 */ 4890 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 4891 pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 4892 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 4893 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 4894 SLIST_INIT(&free); 4895 sva = trunc_2mpage(va); 4896 pmap_remove_l3e(pmap, l3e, sva, &free, lockp); 4897 pmap_invalidate_l3e_page(pmap, sva, oldpde); 4898 vm_page_free_pages_toq(&free, true); 4899 CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx" 4900 " in pmap %p", va, pmap); 4901 return (FALSE); 4902 } 4903 if (va < VM_MAXUSER_ADDRESS) 4904 pmap_resident_count_inc(pmap, 1); 4905 } 4906 mptepa = VM_PAGE_TO_PHYS(mpte); 4907 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 4908 KASSERT((oldpde & PG_A) != 0, 4909 ("pmap_demote_l3e: oldpde is missing PG_A")); 4910 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 4911 ("pmap_demote_l3e: oldpde is missing PG_M")); 4912 4913 /* 4914 * If the page table page is new, initialize it. 4915 */ 4916 if (mpte->ref_count == 1) { 4917 mpte->ref_count = NPTEPG; 4918 pmap_fill_ptp(firstpte, oldpde); 4919 } 4920 4921 KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME), 4922 ("pmap_demote_l3e: firstpte and newpte map different physical" 4923 " addresses")); 4924 4925 /* 4926 * If the mapping has changed attributes, update the page table 4927 * entries. 4928 */ 4929 if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE)) 4930 pmap_fill_ptp(firstpte, oldpde); 4931 4932 /* 4933 * The spare PV entries must be reserved prior to demoting the 4934 * mapping, that is, prior to changing the PDE. Otherwise, the state 4935 * of the PDE and the PV lists will be inconsistent, which can result 4936 * in reclaim_pv_chunk() attempting to remove a PV entry from the 4937 * wrong PV list and pmap_pv_demote_l3e() failing to find the expected 4938 * PV entry for the 2MB page mapping that is being demoted. 4939 */ 4940 if ((oldpde & PG_MANAGED) != 0) 4941 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 4942 4943 /* 4944 * Demote the mapping. This pmap is locked. The old PDE has 4945 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 4946 * set. Thus, there is no danger of a race with another 4947 * processor changing the setting of PG_A and/or PG_M between 4948 * the read above and the store below. 4949 */ 4950 pde_store(l3e, mptepa); 4951 pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde); 4952 /* 4953 * Demote the PV entry. 4954 */ 4955 if ((oldpde & PG_MANAGED) != 0) 4956 pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp); 4957 4958 counter_u64_add(pmap_l3e_demotions, 1); 4959 CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx" 4960 " in pmap %p", va, pmap); 4961 return (TRUE); 4962 } 4963 4964 /* 4965 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
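 * The backing page table page is zeroed and reinstalled rather than freed,
 * since kernel page table pages must always remain resident.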
4966 */ 4967 static void 4968 pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va) 4969 { 4970 vm_paddr_t mptepa; 4971 vm_page_t mpte; 4972 4973 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 4974 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4975 mpte = pmap_remove_pt_page(pmap, va); 4976 if (mpte == NULL) 4977 panic("pmap_remove_kernel_pde: Missing pt page."); 4978 4979 mptepa = VM_PAGE_TO_PHYS(mpte); 4980 4981 /* 4982 * Initialize the page table page. 4983 */ 4984 pagezero(PHYS_TO_DMAP(mptepa)); 4985 4986 /* 4987 * Demote the mapping. 4988 */ 4989 pde_store(l3e, mptepa); 4990 ptesync(); 4991 } 4992 4993 /* 4994 * pmap_remove_l3e: do the things to unmap a superpage in a process 4995 */ 4996 static int 4997 pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, 4998 struct spglist *free, struct rwlock **lockp) 4999 { 5000 struct md_page *pvh; 5001 pml3_entry_t oldpde; 5002 vm_offset_t eva, va; 5003 vm_page_t m, mpte; 5004 5005 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5006 KASSERT((sva & L3_PAGE_MASK) == 0, 5007 ("pmap_remove_l3e: sva is not 2mpage aligned")); 5008 oldpde = be64toh(pte_load_clear(pdq)); 5009 if (oldpde & PG_W) 5010 pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE); 5011 pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); 5012 if (oldpde & PG_MANAGED) { 5013 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 5014 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 5015 pmap_pvh_free(pvh, pmap, sva); 5016 eva = sva + L3_PAGE_SIZE; 5017 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 5018 va < eva; va += PAGE_SIZE, m++) { 5019 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5020 vm_page_dirty(m); 5021 if (oldpde & PG_A) 5022 vm_page_aflag_set(m, PGA_REFERENCED); 5023 if (TAILQ_EMPTY(&m->md.pv_list) && 5024 TAILQ_EMPTY(&pvh->pv_list)) 5025 vm_page_aflag_clear(m, PGA_WRITEABLE); 5026 } 5027 } 5028 if (pmap == kernel_pmap) { 5029 pmap_remove_kernel_l3e(pmap, pdq, sva); 5030 } else { 5031 mpte = pmap_remove_pt_page(pmap, sva); 5032 if (mpte != NULL) { 5033 pmap_resident_count_dec(pmap, 1); 5034 KASSERT(mpte->ref_count == NPTEPG, 5035 ("pmap_remove_l3e: pte page wire count error")); 5036 mpte->ref_count = 0; 5037 pmap_add_delayed_free_list(mpte, free, FALSE); 5038 } 5039 } 5040 return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free)); 5041 } 5042 5043 /* 5044 * pmap_remove_pte: do the things to unmap a page in a process 5045 */ 5046 static int 5047 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 5048 pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 5049 { 5050 struct md_page *pvh; 5051 pt_entry_t oldpte; 5052 vm_page_t m; 5053 5054 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5055 oldpte = be64toh(pte_load_clear(ptq)); 5056 if (oldpte & RPTE_WIRED) 5057 pmap->pm_stats.wired_count -= 1; 5058 pmap_resident_count_dec(pmap, 1); 5059 if (oldpte & RPTE_MANAGED) { 5060 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 5061 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5062 vm_page_dirty(m); 5063 if (oldpte & PG_A) 5064 vm_page_aflag_set(m, PGA_REFERENCED); 5065 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5066 pmap_pvh_free(&m->md, pmap, va); 5067 if (TAILQ_EMPTY(&m->md.pv_list) && 5068 (m->flags & PG_FICTITIOUS) == 0) { 5069 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5070 if (TAILQ_EMPTY(&pvh->pv_list)) 5071 vm_page_aflag_clear(m, PGA_WRITEABLE); 5072 } 5073 } 5074 return (pmap_unuse_pt(pmap, va, ptepde, free)); 5075 } 5076 5077 /* 5078 * Remove a single page from a process address space 5079 */ 5080 static bool 5081 
pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e, 5082 struct spglist *free) 5083 { 5084 struct rwlock *lock; 5085 pt_entry_t *pte; 5086 bool invalidate_all; 5087 5088 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5089 if ((be64toh(*l3e) & RPTE_VALID) == 0) { 5090 return (false); 5091 } 5092 pte = pmap_l3e_to_pte(l3e, va); 5093 if ((be64toh(*pte) & RPTE_VALID) == 0) { 5094 return (false); 5095 } 5096 lock = NULL; 5097 5098 invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock); 5099 if (lock != NULL) 5100 rw_wunlock(lock); 5101 if (!invalidate_all) 5102 pmap_invalidate_page(pmap, va); 5103 return (invalidate_all); 5104 } 5105 5106 /* 5107 * Removes the specified range of addresses from the page table page. 5108 */ 5109 static bool 5110 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 5111 pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp) 5112 { 5113 pt_entry_t *pte; 5114 vm_offset_t va; 5115 bool anyvalid; 5116 5117 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5118 anyvalid = false; 5119 va = eva; 5120 for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++, 5121 sva += PAGE_SIZE) { 5122 MPASS(pte == pmap_pte(pmap, sva)); 5123 if (*pte == 0) { 5124 if (va != eva) { 5125 anyvalid = true; 5126 va = eva; 5127 } 5128 continue; 5129 } 5130 if (va == eva) 5131 va = sva; 5132 if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) { 5133 anyvalid = true; 5134 sva += PAGE_SIZE; 5135 break; 5136 } 5137 } 5138 if (anyvalid) 5139 pmap_invalidate_all(pmap); 5140 else if (va != eva) 5141 pmap_invalidate_range(pmap, va, sva); 5142 return (anyvalid); 5143 } 5144 5145 void 5146 mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5147 { 5148 struct rwlock *lock; 5149 vm_offset_t va_next; 5150 pml1_entry_t *l1e; 5151 pml2_entry_t *l2e; 5152 pml3_entry_t ptpaddr, *l3e; 5153 struct spglist free; 5154 bool anyvalid; 5155 5156 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); 5157 5158 /* 5159 * Perform an unsynchronized read. This is, however, safe. 5160 */ 5161 if (pmap->pm_stats.resident_count == 0) 5162 return; 5163 5164 anyvalid = false; 5165 SLIST_INIT(&free); 5166 5167 /* XXX something fishy here */ 5168 sva = (sva + PAGE_MASK) & ~PAGE_MASK; 5169 eva = (eva + PAGE_MASK) & ~PAGE_MASK; 5170 5171 PMAP_LOCK(pmap); 5172 5173 /* 5174 * special handling of removing one page. a very 5175 * common operation and easy to short circuit some 5176 * code. 5177 */ 5178 if (sva + PAGE_SIZE == eva) { 5179 l3e = pmap_pml3e(pmap, sva); 5180 if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) { 5181 anyvalid = pmap_remove_page(pmap, sva, l3e, &free); 5182 goto out; 5183 } 5184 } 5185 5186 lock = NULL; 5187 for (; sva < eva; sva = va_next) { 5188 if (pmap->pm_stats.resident_count == 0) 5189 break; 5190 l1e = pmap_pml1e(pmap, sva); 5191 if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) { 5192 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 5193 if (va_next < sva) 5194 va_next = eva; 5195 continue; 5196 } 5197 5198 l2e = pmap_l1e_to_l2e(l1e, sva); 5199 if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) { 5200 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 5201 if (va_next < sva) 5202 va_next = eva; 5203 continue; 5204 } 5205 5206 /* 5207 * Calculate index for next page table. 5208 */ 5209 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 5210 if (va_next < sva) 5211 va_next = eva; 5212 5213 l3e = pmap_l2e_to_l3e(l2e, sva); 5214 ptpaddr = be64toh(*l3e); 5215 5216 /* 5217 * Weed out invalid mappings. 
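 * A zero L3 entry means nothing is mapped in this 2MB-aligned region.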
5218 */ 5219 if (ptpaddr == 0) 5220 continue; 5221 5222 /* 5223 * Check for large page. 5224 */ 5225 if ((ptpaddr & RPTE_LEAF) != 0) { 5226 /* 5227 * Are we removing the entire large page? If not, 5228 * demote the mapping and fall through. 5229 */ 5230 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 5231 pmap_remove_l3e(pmap, l3e, sva, &free, &lock); 5232 anyvalid = true; 5233 continue; 5234 } else if (!pmap_demote_l3e_locked(pmap, l3e, sva, 5235 &lock)) { 5236 /* The large page mapping was destroyed. */ 5237 continue; 5238 } else 5239 ptpaddr = be64toh(*l3e); 5240 } 5241 5242 /* 5243 * Limit our scan to either the end of the va represented 5244 * by the current page table page, or to the end of the 5245 * range being removed. 5246 */ 5247 if (va_next > eva) 5248 va_next = eva; 5249 5250 if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock)) 5251 anyvalid = true; 5252 } 5253 if (lock != NULL) 5254 rw_wunlock(lock); 5255 out: 5256 if (anyvalid) 5257 pmap_invalidate_all(pmap); 5258 PMAP_UNLOCK(pmap); 5259 vm_page_free_pages_toq(&free, true); 5260 } 5261 5262 void 5263 mmu_radix_remove_all(vm_page_t m) 5264 { 5265 struct md_page *pvh; 5266 pv_entry_t pv; 5267 pmap_t pmap; 5268 struct rwlock *lock; 5269 pt_entry_t *pte, tpte; 5270 pml3_entry_t *l3e; 5271 vm_offset_t va; 5272 struct spglist free; 5273 int pvh_gen, md_gen; 5274 5275 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5276 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5277 ("pmap_remove_all: page %p is not managed", m)); 5278 SLIST_INIT(&free); 5279 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5280 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 5281 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5282 retry: 5283 rw_wlock(lock); 5284 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 5285 pmap = PV_PMAP(pv); 5286 if (!PMAP_TRYLOCK(pmap)) { 5287 pvh_gen = pvh->pv_gen; 5288 rw_wunlock(lock); 5289 PMAP_LOCK(pmap); 5290 rw_wlock(lock); 5291 if (pvh_gen != pvh->pv_gen) { 5292 rw_wunlock(lock); 5293 PMAP_UNLOCK(pmap); 5294 goto retry; 5295 } 5296 } 5297 va = pv->pv_va; 5298 l3e = pmap_pml3e(pmap, va); 5299 (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock); 5300 PMAP_UNLOCK(pmap); 5301 } 5302 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 5303 pmap = PV_PMAP(pv); 5304 if (!PMAP_TRYLOCK(pmap)) { 5305 pvh_gen = pvh->pv_gen; 5306 md_gen = m->md.pv_gen; 5307 rw_wunlock(lock); 5308 PMAP_LOCK(pmap); 5309 rw_wlock(lock); 5310 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5311 rw_wunlock(lock); 5312 PMAP_UNLOCK(pmap); 5313 goto retry; 5314 } 5315 } 5316 pmap_resident_count_dec(pmap, 1); 5317 l3e = pmap_pml3e(pmap, pv->pv_va); 5318 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found" 5319 " a 2mpage in page %p's pv list", m)); 5320 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 5321 tpte = be64toh(pte_load_clear(pte)); 5322 if (tpte & PG_W) 5323 pmap->pm_stats.wired_count--; 5324 if (tpte & PG_A) 5325 vm_page_aflag_set(m, PGA_REFERENCED); 5326 5327 /* 5328 * Update the vm_page_t clean and reference bits. 5329 */ 5330 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5331 vm_page_dirty(m); 5332 pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free); 5333 pmap_invalidate_page(pmap, pv->pv_va); 5334 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 5335 m->md.pv_gen++; 5336 free_pv_entry(pmap, pv); 5337 PMAP_UNLOCK(pmap); 5338 } 5339 vm_page_aflag_clear(m, PGA_WRITEABLE); 5340 rw_wunlock(lock); 5341 vm_page_free_pages_toq(&free, true); 5342 } 5343 5344 /* 5345 * Destroy all managed, non-wired mappings in the given user-space 5346 * pmap. 
This pmap cannot be active on any processor besides the 5347 * caller. 5348 * 5349 * This function cannot be applied to the kernel pmap. Moreover, it 5350 * is not intended for general use. It is only to be used during 5351 * process termination. Consequently, it can be implemented in ways 5352 * that make it faster than pmap_remove(). First, it can more quickly 5353 * destroy mappings by iterating over the pmap's collection of PV 5354 * entries, rather than searching the page table. Second, it doesn't 5355 * have to test and clear the page table entries atomically, because 5356 * no processor is currently accessing the user address space. In 5357 * particular, a page table entry's dirty bit won't change state once 5358 * this function starts. 5359 * 5360 * Although this function destroys all of the pmap's managed, 5361 * non-wired mappings, it can delay and batch the invalidation of TLB 5362 * entries without calling pmap_delayed_invl_started() and 5363 * pmap_delayed_invl_finished(). Because the pmap is not active on 5364 * any other processor, none of these TLB entries will ever be used 5365 * before their eventual invalidation. Consequently, there is no need 5366 * for either pmap_remove_all() or pmap_remove_write() to wait for 5367 * that eventual TLB invalidation. 5368 */ 5369 5370 void 5371 mmu_radix_remove_pages(pmap_t pmap) 5372 { 5373 5374 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 5375 pml3_entry_t ptel3e; 5376 pt_entry_t *pte, tpte; 5377 struct spglist free; 5378 vm_page_t m, mpte, mt; 5379 pv_entry_t pv; 5380 struct md_page *pvh; 5381 struct pv_chunk *pc, *npc; 5382 struct rwlock *lock; 5383 int64_t bit; 5384 uint64_t inuse, bitmask; 5385 int allfree, field, freed, idx; 5386 boolean_t superpage; 5387 vm_paddr_t pa; 5388 5389 /* 5390 * Assert that the given pmap is only active on the current 5391 * CPU. Unfortunately, we cannot block another CPU from 5392 * activating the pmap while this function is executing. 5393 */ 5394 KASSERT(pmap->pm_pid == mfspr(SPR_PID), 5395 ("non-current asid %lu - expected %lu", pmap->pm_pid, 5396 mfspr(SPR_PID))); 5397 5398 lock = NULL; 5399 5400 SLIST_INIT(&free); 5401 PMAP_LOCK(pmap); 5402 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5403 allfree = 1; 5404 freed = 0; 5405 for (field = 0; field < _NPCM; field++) { 5406 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5407 while (inuse != 0) { 5408 bit = cnttzd(inuse); 5409 bitmask = 1UL << bit; 5410 idx = field * 64 + bit; 5411 pv = &pc->pc_pventry[idx]; 5412 inuse &= ~bitmask; 5413 5414 pte = pmap_pml2e(pmap, pv->pv_va); 5415 ptel3e = be64toh(*pte); 5416 pte = pmap_l2e_to_l3e(pte, pv->pv_va); 5417 tpte = be64toh(*pte); 5418 if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) { 5419 superpage = FALSE; 5420 ptel3e = tpte; 5421 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 5422 PG_FRAME); 5423 pte = &pte[pmap_pte_index(pv->pv_va)]; 5424 tpte = be64toh(*pte); 5425 } else { 5426 /* 5427 * Keep track whether 'tpte' is a 5428 * superpage explicitly instead of 5429 * relying on RPTE_LEAF being set. 5430 * 5431 * This is because RPTE_LEAF is numerically 5432 * identical to PG_PTE_PAT and thus a 5433 * regular page could be mistaken for 5434 * a superpage. 
5435 */ 5436 superpage = TRUE; 5437 } 5438 5439 if ((tpte & PG_V) == 0) { 5440 panic("bad pte va %lx pte %lx", 5441 pv->pv_va, tpte); 5442 } 5443 5444 /* 5445 * We cannot remove wired pages from a process' mapping at this time 5446 */ 5447 if (tpte & PG_W) { 5448 allfree = 0; 5449 continue; 5450 } 5451 5452 if (superpage) 5453 pa = tpte & PG_PS_FRAME; 5454 else 5455 pa = tpte & PG_FRAME; 5456 5457 m = PHYS_TO_VM_PAGE(pa); 5458 KASSERT(m->phys_addr == pa, 5459 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5460 m, (uintmax_t)m->phys_addr, 5461 (uintmax_t)tpte)); 5462 5463 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5464 m < &vm_page_array[vm_page_array_size], 5465 ("pmap_remove_pages: bad tpte %#jx", 5466 (uintmax_t)tpte)); 5467 5468 pte_clear(pte); 5469 5470 /* 5471 * Update the vm_page_t clean/reference bits. 5472 */ 5473 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5474 if (superpage) { 5475 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 5476 vm_page_dirty(mt); 5477 } else 5478 vm_page_dirty(m); 5479 } 5480 5481 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5482 5483 /* Mark free */ 5484 pc->pc_map[field] |= bitmask; 5485 if (superpage) { 5486 pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); 5487 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 5488 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 5489 pvh->pv_gen++; 5490 if (TAILQ_EMPTY(&pvh->pv_list)) { 5491 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 5492 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 5493 TAILQ_EMPTY(&mt->md.pv_list)) 5494 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5495 } 5496 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 5497 if (mpte != NULL) { 5498 pmap_resident_count_dec(pmap, 1); 5499 KASSERT(mpte->ref_count == NPTEPG, 5500 ("pmap_remove_pages: pte page wire count error")); 5501 mpte->ref_count = 0; 5502 pmap_add_delayed_free_list(mpte, &free, FALSE); 5503 } 5504 } else { 5505 pmap_resident_count_dec(pmap, 1); 5506 #ifdef VERBOSE_PV 5507 printf("freeing pv (%p, %p)\n", 5508 pmap, pv); 5509 #endif 5510 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 5511 m->md.pv_gen++; 5512 if ((m->a.flags & PGA_WRITEABLE) != 0 && 5513 TAILQ_EMPTY(&m->md.pv_list) && 5514 (m->flags & PG_FICTITIOUS) == 0) { 5515 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5516 if (TAILQ_EMPTY(&pvh->pv_list)) 5517 vm_page_aflag_clear(m, PGA_WRITEABLE); 5518 } 5519 } 5520 pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free); 5521 freed++; 5522 } 5523 } 5524 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5525 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5526 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5527 if (allfree) { 5528 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5529 free_pv_chunk(pc); 5530 } 5531 } 5532 if (lock != NULL) 5533 rw_wunlock(lock); 5534 pmap_invalidate_all(pmap); 5535 PMAP_UNLOCK(pmap); 5536 vm_page_free_pages_toq(&free, true); 5537 } 5538 5539 void 5540 mmu_radix_remove_write(vm_page_t m) 5541 { 5542 struct md_page *pvh; 5543 pmap_t pmap; 5544 struct rwlock *lock; 5545 pv_entry_t next_pv, pv; 5546 pml3_entry_t *l3e; 5547 pt_entry_t oldpte, *pte; 5548 int pvh_gen, md_gen; 5549 5550 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5551 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5552 ("pmap_remove_write: page %p is not managed", m)); 5553 vm_page_assert_busied(m); 5554 5555 if (!pmap_page_is_write_mapped(m)) 5556 return; 5557 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5558 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 5559 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5560 retry_pv_loop: 5561 rw_wlock(lock); 5562 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { 5563 pmap = PV_PMAP(pv); 5564 if (!PMAP_TRYLOCK(pmap)) { 5565 pvh_gen = pvh->pv_gen; 5566 rw_wunlock(lock); 5567 PMAP_LOCK(pmap); 5568 rw_wlock(lock); 5569 if (pvh_gen != pvh->pv_gen) { 5570 PMAP_UNLOCK(pmap); 5571 rw_wunlock(lock); 5572 goto retry_pv_loop; 5573 } 5574 } 5575 l3e = pmap_pml3e(pmap, pv->pv_va); 5576 if ((be64toh(*l3e) & PG_RW) != 0) 5577 (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock); 5578 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5579 ("inconsistent pv lock %p %p for page %p", 5580 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5581 PMAP_UNLOCK(pmap); 5582 } 5583 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 5584 pmap = PV_PMAP(pv); 5585 if (!PMAP_TRYLOCK(pmap)) { 5586 pvh_gen = pvh->pv_gen; 5587 md_gen = m->md.pv_gen; 5588 rw_wunlock(lock); 5589 PMAP_LOCK(pmap); 5590 rw_wlock(lock); 5591 if (pvh_gen != pvh->pv_gen || 5592 md_gen != m->md.pv_gen) { 5593 PMAP_UNLOCK(pmap); 5594 rw_wunlock(lock); 5595 goto retry_pv_loop; 5596 } 5597 } 5598 l3e = pmap_pml3e(pmap, pv->pv_va); 5599 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, 5600 ("pmap_remove_write: found a 2mpage in page %p's pv list", 5601 m)); 5602 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 5603 retry: 5604 oldpte = be64toh(*pte); 5605 if (oldpte & PG_RW) { 5606 if (!atomic_cmpset_long(pte, htobe64(oldpte), 5607 htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M)))) 5608 goto retry; 5609 if ((oldpte & PG_M) != 0) 5610 vm_page_dirty(m); 5611 pmap_invalidate_page(pmap, pv->pv_va); 5612 } 5613 PMAP_UNLOCK(pmap); 5614 } 5615 rw_wunlock(lock); 5616 vm_page_aflag_clear(m, PGA_WRITEABLE); 5617 } 5618 5619 /* 5620 * Clear the wired attribute from the mappings for the specified range of 5621 * addresses in the given pmap. Every valid mapping within that range 5622 * must have the wired attribute set. In contrast, invalid mappings 5623 * cannot have the wired attribute set, so they are ignored. 5624 * 5625 * The wired attribute of the page table entry is not a hardware 5626 * feature, so there is no need to invalidate any TLB entries. 5627 * Since pmap_demote_l3e() for the wired entry must never fail, 5628 * pmap_delayed_invl_started()/finished() calls around the 5629 * function are not needed. 5630 */ 5631 void 5632 mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5633 { 5634 vm_offset_t va_next; 5635 pml1_entry_t *l1e; 5636 pml2_entry_t *l2e; 5637 pml3_entry_t *l3e; 5638 pt_entry_t *pte; 5639 5640 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); 5641 PMAP_LOCK(pmap); 5642 for (; sva < eva; sva = va_next) { 5643 l1e = pmap_pml1e(pmap, sva); 5644 if ((be64toh(*l1e) & PG_V) == 0) { 5645 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 5646 if (va_next < sva) 5647 va_next = eva; 5648 continue; 5649 } 5650 l2e = pmap_l1e_to_l2e(l1e, sva); 5651 if ((be64toh(*l2e) & PG_V) == 0) { 5652 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 5653 if (va_next < sva) 5654 va_next = eva; 5655 continue; 5656 } 5657 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 5658 if (va_next < sva) 5659 va_next = eva; 5660 l3e = pmap_l2e_to_l3e(l2e, sva); 5661 if ((be64toh(*l3e) & PG_V) == 0) 5662 continue; 5663 if ((be64toh(*l3e) & RPTE_LEAF) != 0) { 5664 if ((be64toh(*l3e) & PG_W) == 0) 5665 panic("pmap_unwire: pde %#jx is missing PG_W", 5666 (uintmax_t)(be64toh(*l3e))); 5667 5668 /* 5669 * Are we unwiring the entire large page? If not, 5670 * demote the mapping and fall through. 
5671 */ 5672 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 5673 atomic_clear_long(l3e, htobe64(PG_W)); 5674 pmap->pm_stats.wired_count -= L3_PAGE_SIZE / 5675 PAGE_SIZE; 5676 continue; 5677 } else if (!pmap_demote_l3e(pmap, l3e, sva)) 5678 panic("pmap_unwire: demotion failed"); 5679 } 5680 if (va_next > eva) 5681 va_next = eva; 5682 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, 5683 sva += PAGE_SIZE) { 5684 MPASS(pte == pmap_pte(pmap, sva)); 5685 if ((be64toh(*pte) & PG_V) == 0) 5686 continue; 5687 if ((be64toh(*pte) & PG_W) == 0) 5688 panic("pmap_unwire: pte %#jx is missing PG_W", 5689 (uintmax_t)(be64toh(*pte))); 5690 5691 /* 5692 * PG_W must be cleared atomically. Although the pmap 5693 * lock synchronizes access to PG_W, another processor 5694 * could be setting PG_M and/or PG_A concurrently. 5695 */ 5696 atomic_clear_long(pte, htobe64(PG_W)); 5697 pmap->pm_stats.wired_count--; 5698 } 5699 } 5700 PMAP_UNLOCK(pmap); 5701 } 5702 5703 void 5704 mmu_radix_zero_page(vm_page_t m) 5705 { 5706 vm_offset_t addr; 5707 5708 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5709 addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5710 pagezero(addr); 5711 } 5712 5713 void 5714 mmu_radix_zero_page_area(vm_page_t m, int off, int size) 5715 { 5716 caddr_t addr; 5717 5718 CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); 5719 MPASS(off + size <= PAGE_SIZE); 5720 addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5721 memset(addr + off, 0, size); 5722 } 5723 5724 static int 5725 mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5726 { 5727 pml3_entry_t *l3ep; 5728 pt_entry_t pte; 5729 vm_paddr_t pa; 5730 int val; 5731 5732 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); 5733 PMAP_LOCK(pmap); 5734 5735 l3ep = pmap_pml3e(pmap, addr); 5736 if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) { 5737 if (be64toh(*l3ep) & RPTE_LEAF) { 5738 pte = be64toh(*l3ep); 5739 /* Compute the physical address of the 4KB page. */ 5740 pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) & 5741 PG_FRAME; 5742 val = MINCORE_PSIND(1); 5743 } else { 5744 /* Native endian PTE, do not pass to functions */ 5745 pte = be64toh(*pmap_l3e_to_pte(l3ep, addr)); 5746 pa = pte & PG_FRAME; 5747 val = 0; 5748 } 5749 } else { 5750 pte = 0; 5751 pa = 0; 5752 val = 0; 5753 } 5754 if ((pte & PG_V) != 0) { 5755 val |= MINCORE_INCORE; 5756 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5757 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5758 if ((pte & PG_A) != 0) 5759 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5760 } 5761 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5762 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5763 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5764 *locked_pa = pa; 5765 } 5766 PMAP_UNLOCK(pmap); 5767 return (val); 5768 } 5769 5770 void 5771 mmu_radix_activate(struct thread *td) 5772 { 5773 pmap_t pmap; 5774 uint32_t curpid; 5775 5776 CTR2(KTR_PMAP, "%s(%p)", __func__, td); 5777 critical_enter(); 5778 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5779 curpid = mfspr(SPR_PID); 5780 if (pmap->pm_pid > isa3_base_pid && 5781 curpid != pmap->pm_pid) { 5782 mmu_radix_pid_set(pmap); 5783 } 5784 critical_exit(); 5785 } 5786 5787 /* 5788 * Increase the starting virtual address of the given mapping if a 5789 * different alignment might result in more superpage mappings. 
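 * The address is adjusted so that its offset within a 2MB superpage matches
 * the (color-adjusted) object offset, which allows the resulting mappings to
 * be promoted to superpages.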
void
mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr,
	    size);
	if (size < L3_PAGE_SIZE)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & L3_PAGE_MASK;
	if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE ||
	    (*addr & L3_PAGE_MASK) == superpage_offset)
		return;
	if ((*addr & L3_PAGE_MASK) < superpage_offset)
		*addr = (*addr & ~L3_PAGE_MASK) + superpage_offset;
	else
		*addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset;
}

static void *
mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr)
{
	vm_offset_t va, tmpva, ppa, offset;

	ppa = trunc_page(pa);
	offset = pa & PAGE_MASK;
	size = roundup2(offset + size, PAGE_SIZE);
	if (pa < powerpc_ptob(Maxmem))
		panic("bad pa: %#lx less than Maxmem %#lx\n",
		    pa, powerpc_ptob(Maxmem));
	va = kva_alloc(size);
	if (bootverbose)
		printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr);
	KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr));

	if (!va)
		panic("%s: Couldn't alloc kernel virtual memory", __func__);

	for (tmpva = va; size > 0;) {
		mmu_radix_kenter_attr(tmpva, ppa, attr);
		size -= PAGE_SIZE;
		tmpva += PAGE_SIZE;
		ppa += PAGE_SIZE;
	}
	ptesync();

	return ((void *)(va + offset));
}

static void *
mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size)
{

	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);

	return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
}

void
mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
	m->md.mdpg_cache_attrs = ma;

	/*
	 * If "m" is a normal page, update its direct mapping.  This update
	 * can be relied upon to perform any cache operations that are
	 * required for data coherence.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
	    PAGE_SIZE, m->md.mdpg_cache_attrs))
		panic("memory attribute change on the direct map failed");
}

static void
mmu_radix_unmapdev(vm_offset_t va, vm_size_t size)
{
	vm_offset_t offset;

	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size);
	/* If we gave a direct map region in pmap_mapdev, do nothing */
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return;

	offset = va & PAGE_MASK;
	size = round_page(offset + size);
	va = trunc_page(va);

	if (pmap_initialized) {
		mmu_radix_qremove(va, atop(size));
		kva_free(va, size);
	}
}

static __inline void
pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask)
{
	uint64_t opte, npte;

	/*
	 * The attribute bits live in the low bits of the PTE.  Spin with a
	 * 64-bit compare-and-swap so that concurrent reference/change bit
	 * updates by the MMU are not lost while the attribute bits are
	 * replaced.
	 */
	do {
		opte = be64toh(*pte);
		npte = opte & ~mask;
		npte |= cache_bits;
	} while (npte != opte &&
	    !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte)));
}

/*
 * Tries to demote a 1GB page mapping.
 */
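/*
 * Outline of the demotion below: a page table page is allocated, each of
 * its 512 L3 entries is seeded with a 2MB leaf that inherits the old 1GB
 * leaf's attributes and points at the corresponding 2MB slice of the same
 * physical range, and the L2 entry is then re-pointed at the new page
 * table page.  The TLB and page-walk cache are flushed afterwards because
 * the translation structure changed.
 */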
5909 */ 5910 static boolean_t 5911 pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va) 5912 { 5913 pml2_entry_t oldpdpe; 5914 pml3_entry_t *firstpde, newpde, *pde; 5915 vm_paddr_t pdpgpa; 5916 vm_page_t pdpg; 5917 5918 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5919 oldpdpe = be64toh(*l2e); 5920 KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), 5921 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 5922 pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT, 5923 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); 5924 if (pdpg == NULL) { 5925 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 5926 " in pmap %p", va, pmap); 5927 return (FALSE); 5928 } 5929 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 5930 firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa); 5931 KASSERT((oldpdpe & PG_A) != 0, 5932 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 5933 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 5934 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 5935 newpde = oldpdpe; 5936 5937 /* 5938 * Initialize the page directory page. 5939 */ 5940 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 5941 *pde = htobe64(newpde); 5942 newpde += L3_PAGE_SIZE; 5943 } 5944 5945 /* 5946 * Demote the mapping. 5947 */ 5948 pde_store(l2e, pdpgpa); 5949 5950 /* 5951 * Flush PWC --- XXX revisit 5952 */ 5953 pmap_invalidate_all(pmap); 5954 5955 counter_u64_add(pmap_l2e_demotions, 1); 5956 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 5957 " in pmap %p", va, pmap); 5958 return (TRUE); 5959 } 5960 5961 vm_paddr_t 5962 mmu_radix_kextract(vm_offset_t va) 5963 { 5964 pml3_entry_t l3e; 5965 vm_paddr_t pa; 5966 5967 CTR2(KTR_PMAP, "%s(%#x)", __func__, va); 5968 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 5969 pa = DMAP_TO_PHYS(va); 5970 } else { 5971 /* Big-endian PTE on stack */ 5972 l3e = *pmap_pml3e(kernel_pmap, va); 5973 if (be64toh(l3e) & RPTE_LEAF) { 5974 pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK); 5975 pa |= (va & L3_PAGE_MASK); 5976 } else { 5977 /* 5978 * Beware of a concurrent promotion that changes the 5979 * PDE at this point! For example, vtopte() must not 5980 * be used to access the PTE because it would use the 5981 * new PDE. It is, however, safe to use the old PDE 5982 * because the page table page is preserved by the 5983 * promotion. 5984 */ 5985 pa = be64toh(*pmap_l3e_to_pte(&l3e, va)); 5986 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 5987 pa |= (va & PAGE_MASK); 5988 } 5989 } 5990 return (pa); 5991 } 5992 5993 static pt_entry_t 5994 mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) 5995 { 5996 5997 if (ma != VM_MEMATTR_DEFAULT) { 5998 return pmap_cache_bits(ma); 5999 } 6000 6001 /* 6002 * Assume the page is cache inhibited and access is guarded unless 6003 * it's in our available memory array. 
6004 */ 6005 for (int i = 0; i < pregions_sz; i++) { 6006 if ((pa >= pregions[i].mr_start) && 6007 (pa < (pregions[i].mr_start + pregions[i].mr_size))) 6008 return (RPTE_ATTR_MEM); 6009 } 6010 return (RPTE_ATTR_GUARDEDIO); 6011 } 6012 6013 static void 6014 mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) 6015 { 6016 pt_entry_t *pte, pteval; 6017 uint64_t cache_bits; 6018 6019 pte = kvtopte(va); 6020 MPASS(pte != NULL); 6021 pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; 6022 cache_bits = mmu_radix_calc_wimg(pa, ma); 6023 pte_store(pte, pteval | cache_bits); 6024 } 6025 6026 void 6027 mmu_radix_kremove(vm_offset_t va) 6028 { 6029 pt_entry_t *pte; 6030 6031 CTR2(KTR_PMAP, "%s(%#x)", __func__, va); 6032 6033 pte = kvtopte(va); 6034 pte_clear(pte); 6035 } 6036 6037 int 6038 mmu_radix_decode_kernel_ptr(vm_offset_t addr, 6039 int *is_user, vm_offset_t *decoded) 6040 { 6041 6042 CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr); 6043 *decoded = addr; 6044 *is_user = (addr < VM_MAXUSER_ADDRESS); 6045 return (0); 6046 } 6047 6048 static boolean_t 6049 mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) 6050 { 6051 6052 CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); 6053 return (mem_valid(pa, size)); 6054 } 6055 6056 static void 6057 mmu_radix_scan_init() 6058 { 6059 6060 CTR1(KTR_PMAP, "%s()", __func__); 6061 UNIMPLEMENTED(); 6062 } 6063 6064 static void 6065 mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, 6066 void **va) 6067 { 6068 CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va); 6069 UNIMPLEMENTED(); 6070 } 6071 6072 vm_offset_t 6073 mmu_radix_quick_enter_page(vm_page_t m) 6074 { 6075 vm_paddr_t paddr; 6076 6077 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 6078 paddr = VM_PAGE_TO_PHYS(m); 6079 return (PHYS_TO_DMAP(paddr)); 6080 } 6081 6082 void 6083 mmu_radix_quick_remove_page(vm_offset_t addr __unused) 6084 { 6085 /* no work to do here */ 6086 CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); 6087 } 6088 6089 static void 6090 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 6091 { 6092 cpu_flush_dcache((void *)sva, eva - sva); 6093 } 6094 6095 int 6096 mmu_radix_change_attr(vm_offset_t va, vm_size_t size, 6097 vm_memattr_t mode) 6098 { 6099 int error; 6100 6101 CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode); 6102 PMAP_LOCK(kernel_pmap); 6103 error = pmap_change_attr_locked(va, size, mode, true); 6104 PMAP_UNLOCK(kernel_pmap); 6105 return (error); 6106 } 6107 6108 static int 6109 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush) 6110 { 6111 vm_offset_t base, offset, tmpva; 6112 vm_paddr_t pa_start, pa_end, pa_end1; 6113 pml2_entry_t *l2e; 6114 pml3_entry_t *l3e; 6115 pt_entry_t *pte; 6116 int cache_bits, error; 6117 boolean_t changed; 6118 6119 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6120 base = trunc_page(va); 6121 offset = va & PAGE_MASK; 6122 size = round_page(offset + size); 6123 6124 /* 6125 * Only supported on kernel virtual addresses, including the direct 6126 * map but excluding the recursive map. 6127 */ 6128 if (base < DMAP_MIN_ADDRESS) 6129 return (EINVAL); 6130 6131 cache_bits = pmap_cache_bits(mode); 6132 changed = FALSE; 6133 6134 /* 6135 * Pages that aren't mapped aren't supported. Also break down 2MB pages 6136 * into 4KB pages if required. 
	for (tmpva = base; tmpva < base + size; ) {
		l2e = pmap_pml2e(kernel_pmap, tmpva);
		if (l2e == NULL || *l2e == 0)
			return (EINVAL);
		if (be64toh(*l2e) & RPTE_LEAF) {
			/*
			 * If the current 1GB page already has the required
			 * memory type, then we need not demote this page.  Just
			 * increment tmpva to the next 1GB page frame.
			 */
			if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) {
				tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
				continue;
			}

			/*
			 * If the current offset aligns with a 1GB page frame
			 * and there is at least 1GB left within the range, then
			 * we need not break down this page into 2MB pages.
			 */
			if ((tmpva & L2_PAGE_MASK) == 0 &&
			    tmpva + L2_PAGE_MASK < base + size) {
				tmpva += L2_PAGE_SIZE;
				continue;
			}
			if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
				return (ENOMEM);
		}
		l3e = pmap_l2e_to_l3e(l2e, tmpva);
		KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
		    tmpva, l2e));
		if (*l3e == 0)
			return (EINVAL);
		if (be64toh(*l3e) & RPTE_LEAF) {
			/*
			 * If the current 2MB page already has the required
			 * memory type, then we need not demote this page.  Just
			 * increment tmpva to the next 2MB page frame.
			 */
			if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) {
				tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
				continue;
			}

			/*
			 * If the current offset aligns with a 2MB page frame
			 * and there is at least 2MB left within the range, then
			 * we need not break down this page into 4KB pages.
			 */
			if ((tmpva & L3_PAGE_MASK) == 0 &&
			    tmpva + L3_PAGE_MASK < base + size) {
				tmpva += L3_PAGE_SIZE;
				continue;
			}
			if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
				return (ENOMEM);
		}
		pte = pmap_l3e_to_pte(l3e, tmpva);
		if (*pte == 0)
			return (EINVAL);
		tmpva += PAGE_SIZE;
	}
	error = 0;

	/*
	 * Ok, all the pages exist, so run through them updating their
	 * cache mode if required.
	 */
	pa_start = pa_end = 0;
	for (tmpva = base; tmpva < base + size; ) {
		l2e = pmap_pml2e(kernel_pmap, tmpva);
		if (be64toh(*l2e) & RPTE_LEAF) {
			if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(l2e, cache_bits,
				    RPTE_ATTR_MASK);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*l2e) & PG_PS_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = be64toh(*l2e) & PG_PS_FRAME;
					pa_end = pa_start + L2_PAGE_SIZE;
				} else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME))
					pa_end += L2_PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = be64toh(*l2e) & PG_PS_FRAME;
					pa_end = pa_start + L2_PAGE_SIZE;
				}
			}
			tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
			continue;
		}
		l3e = pmap_l2e_to_l3e(l2e, tmpva);
		if (be64toh(*l3e) & RPTE_LEAF) {
			if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(l3e, cache_bits,
				    RPTE_ATTR_MASK);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = be64toh(*l3e) & PG_PS_FRAME;
					pa_end = pa_start + L3_PAGE_SIZE;
				} else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME))
					pa_end += L3_PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = be64toh(*l3e) & PG_PS_FRAME;
					pa_end = pa_start + L3_PAGE_SIZE;
				}
			}
			tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
		} else {
			pte = pmap_l3e_to_pte(l3e, tmpva);
			if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(pte, cache_bits,
				    RPTE_ATTR_MASK);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*pte) & PG_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = be64toh(*pte) & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				} else if (pa_end == (be64toh(*pte) & PG_FRAME))
					pa_end += PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = be64toh(*pte) & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				}
			}
			tmpva += PAGE_SIZE;
		}
	}
	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
		pa_end1 = MIN(pa_end, dmaplimit);
		if (pa_start != pa_end1)
			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
			    pa_end1 - pa_start, mode, flush);
	}

	/*
	 * Flush CPU caches if required, to make sure that no data remains
	 * cached with the old memory attributes.
	 */
	if (changed) {
		pmap_invalidate_all(kernel_pmap);

		if (flush)
			pmap_invalidate_cache_range(base, tmpva);
	}
	return (error);
}

/*
 * Allocate physical memory for the vm_page array and map it into KVA,
 * attempting to back the vm_pages with domain-local memory.
 */
void
mmu_radix_page_array_startup(long pages)
{
#ifdef notyet
	pml2_entry_t *l2e;
	pml3_entry_t *pde;
	pml3_entry_t newl3;
	vm_offset_t va;
	long pfn;
	int domain, i;
#endif
	vm_paddr_t pa;
	vm_offset_t start, end;

	vm_page_array_size = pages;

	start = VM_MIN_KERNEL_ADDRESS;
	end = start + pages * sizeof(struct vm_page);

	pa = vm_phys_early_alloc(0, end - start);

	start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
#ifdef notyet
	/*
	 * TODO: NUMA vm_page_array.  Blocked out until then (copied from
	 * amd64).
	 */
	for (va = start; va < end; va += L3_PAGE_SIZE) {
		pfn = first_page + (va - start) / sizeof(struct vm_page);
		domain = vm_phys_domain(ptoa(pfn));
		l2e = pmap_pml2e(kernel_pmap, va);
		if ((be64toh(*l2e) & PG_V) == 0) {
			pa = vm_phys_early_alloc(domain, PAGE_SIZE);
			dump_add_page(pa);
			pagezero(PHYS_TO_DMAP(pa));
			pde_store(l2e, (pml2_entry_t)pa);
		}
		pde = pmap_l2e_to_l3e(l2e, va);
		if ((be64toh(*pde) & PG_V) != 0)
			panic("Unexpected pde %p", pde);
		pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
		for (i = 0; i < NPDEPG; i++)
			dump_add_page(pa + i * PAGE_SIZE);
		newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R |
		    RPTE_EAA_W);
		pte_store(pde, newl3);
	}
#endif
	vm_page_array = (vm_page_t)start;
}

#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>

static void
pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
{
	pml1_entry_t *l1e;
	pml2_entry_t *l2e;
	pml3_entry_t *l3e;
	pt_entry_t *pte;

	l1e = &l1[pmap_pml1e_index(va)];
	db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e));
	if ((be64toh(*l1e) & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	l2e = pmap_l1e_to_l2e(l1e, va);
	db_printf(" l2e %#016lx", be64toh(*l2e));
	if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) {
		db_printf("\n");
		return;
	}
	l3e = pmap_l2e_to_l3e(l2e, va);
	db_printf(" l3e %#016lx", be64toh(*l3e));
	if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) {
		db_printf("\n");
		return;
	}
	pte = pmap_l3e_to_pte(l3e, va);
	db_printf(" pte %#016lx\n", be64toh(*pte));
}

void
pmap_page_print_mappings(vm_page_t m)
{
	pmap_t pmap;
	pv_entry_t pv;

	db_printf("page %p(%lx)\n", m, m->phys_addr);
	/* need to elide locks if running in ddb */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
		db_printf("pv: %p ", pv);
		db_printf("va: %#016lx ", pv->pv_va);
		pmap = PV_PMAP(pv);
		db_printf("pmap %p ", pmap);
		if (pmap != NULL) {
			db_printf("asid: %lu\n", pmap->pm_pid);
			pmap_pte_walk(pmap->pm_pml1, pv->pv_va);
		}
	}
}

DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	vm_offset_t va;
	pmap_t pmap;

	if (!have_addr) {
		db_printf("show pte addr\n");
		return;
	}
	va = (vm_offset_t)addr;

	if (va >= DMAP_MIN_ADDRESS)
		pmap = kernel_pmap;
	else if (kdb_thread != NULL)
		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
	else
		pmap = vmspace_pmap(curthread->td_proc->p_vmspace);

	pmap_pte_walk(pmap->pm_pml1, va);
}

#endif
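/*
 * Example use of the DDB command defined above (the address is
 * hypothetical; the printed values depend on the live page tables):
 *
 *   db> show pte 0xc000000012345000
 *   VA 0xc000000012345000 l1e 0x... l2e 0x... l3e 0x... pte 0x...
 *
 * pmap_pte_walk() prints one entry per level and stops early at the
 * first invalid or leaf entry.
 */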