/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2018 Matthew Macy
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bitstring.h>
#include <sys/queue.h>
#include <sys/cpuset.h>
#include <sys/endian.h>
#include <sys/kerneldump.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/syslog.h>
#include <sys/msgbuf.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/smp.h>

#include <sys/kdb.h>

#include <dev/ofw/openfirm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/_inttypes.h>
#include <machine/cpu.h>
#include <machine/platform.h>
#include <machine/frame.h>
#include <machine/md_var.h>
#include <machine/psl.h>
#include <machine/bat.h>
#include <machine/hid.h>
#include <machine/pte.h>
#include <machine/sr.h>
#include <machine/trap.h>
#include <machine/mmuvar.h>

#ifdef INVARIANTS
#include <vm/uma_dbg.h>
#endif

#define	PPC_BITLSHIFT(bit)	(sizeof(long)*NBBY - 1 - (bit))
#define	PPC_BIT(bit)		(1UL << PPC_BITLSHIFT(bit))
#define	PPC_BITLSHIFT_VAL(val, bit)	((val) << PPC_BITLSHIFT(bit))

#include "opt_ddb.h"
#ifdef DDB
static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va);
#endif

#define	PG_W	RPTE_WIRED
#define	PG_V	RPTE_VALID
#define	PG_MANAGED	RPTE_MANAGED
#define	PG_PROMOTED	RPTE_PROMOTED
#define	PG_M	RPTE_C
#define	PG_A	RPTE_R
#define	PG_X	RPTE_EAA_X
#define	PG_RW	RPTE_EAA_W
#define	PG_PTE_CACHE	RPTE_ATTR_MASK

#define	RPTE_SHIFT	9
#define	NLS_MASK	((1UL<<5)-1)
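/*
 * With RPTE_SHIFT == 9, each directory level below the root holds
 * RPTE_ENTRIES == 512 eight-byte entries, i.e. exactly one 4KB page per
 * table.  NLS_MASK covers the 5-bit "next level size" field of a directory
 * entry (pde_store() below encodes RPTE_SHIFT there) and NLB_MASK covers
 * the "next level base" physical address held in bits 8..59.
 */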
#define	RPTE_ENTRIES	(1UL<<RPTE_SHIFT)
#define	RPTE_MASK	(RPTE_ENTRIES-1)

#define	NLB_SHIFT	0
#define	NLB_MASK	(((1UL<<52)-1) << 8)

extern int nkpt;
extern caddr_t crashdumpmap;

#define	RIC_FLUSH_TLB	0
#define	RIC_FLUSH_PWC	1
#define	RIC_FLUSH_ALL	2

#define	POWER9_TLB_SETS_RADIX	128	/* # sets in POWER9 TLB Radix mode */

#define	PPC_INST_TLBIE	0x7c000264
#define	PPC_INST_TLBIEL	0x7c000224
#define	PPC_INST_SLBIA	0x7c0003e4

#define	___PPC_RA(a)	(((a) & 0x1f) << 16)
#define	___PPC_RB(b)	(((b) & 0x1f) << 11)
#define	___PPC_RS(s)	(((s) & 0x1f) << 21)
#define	___PPC_RT(t)	___PPC_RS(t)
#define	___PPC_R(r)	(((r) & 0x1) << 16)
#define	___PPC_PRS(prs)	(((prs) & 0x1) << 17)
#define	___PPC_RIC(ric)	(((ric) & 0x3) << 18)

#define	PPC_SLBIA(IH)	__XSTRING(.long PPC_INST_SLBIA |	\
				((IH & 0x7) << 21))
#define	PPC_TLBIE_5(rb,rs,ric,prs,r)				\
	__XSTRING(.long PPC_INST_TLBIE |			\
			  ___PPC_RB(rb) | ___PPC_RS(rs) |	\
			  ___PPC_RIC(ric) | ___PPC_PRS(prs) |	\
			  ___PPC_R(r))

#define	PPC_TLBIEL(rb,rs,ric,prs,r)				\
	__XSTRING(.long PPC_INST_TLBIEL |			\
			  ___PPC_RB(rb) | ___PPC_RS(rs) |	\
			  ___PPC_RIC(ric) | ___PPC_PRS(prs) |	\
			  ___PPC_R(r))

#define	PPC_INVALIDATE_ERAT	PPC_SLBIA(7)

static __inline void
ttusync(void)
{
	__asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
}

#define	TLBIEL_INVAL_SEL_MASK	0xc00	/* invalidation selector */
#define	TLBIEL_INVAL_PAGE	0x000	/* invalidate a single page */
#define	TLBIEL_INVAL_SET_PID	0x400	/* invalidate a set for the current PID */
#define	TLBIEL_INVAL_SET_LPID	0x800	/* invalidate a set for current LPID */
#define	TLBIEL_INVAL_SET	0xc00	/* invalidate a set for all LPIDs */

#define	TLBIE_ACTUAL_PAGE_MASK	0xe0
#define	TLBIE_ACTUAL_PAGE_4K	0x00
#define	TLBIE_ACTUAL_PAGE_64K	0xa0
#define	TLBIE_ACTUAL_PAGE_2M	0x20
#define	TLBIE_ACTUAL_PAGE_1G	0x40

#define	TLBIE_PRS_PARTITION_SCOPE	0x0
#define	TLBIE_PRS_PROCESS_SCOPE		0x1

#define	TLBIE_RIC_INVALIDATE_TLB	0x0	/* Invalidate just TLB */
#define	TLBIE_RIC_INVALIDATE_PWC	0x1	/* Invalidate just PWC */
#define	TLBIE_RIC_INVALIDATE_ALL	0x2	/* Invalidate TLB, PWC,
						 * cached {proc, part}tab entries
						 */
#define	TLBIE_RIC_INVALIDATE_SEQ	0x3	/* HPT - only:
						 * Invalidate a range of translations
						 */

static __always_inline void
radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid,
    vm_offset_t va, uint16_t ap)
{
	uint64_t rb, rs;

	MPASS((va & PAGE_MASK) == 0);

	rs = ((uint64_t)pid << 32) | lpid;
	rb = va | is | ap;
	__asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : :
	    "r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory");
}

static __inline void
radix_tlbie_fixup(uint32_t pid, vm_offset_t va, int ap)
{

	__asm __volatile("ptesync" ::: "memory");
	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, 0, 0, va, ap);
	__asm __volatile("ptesync" ::: "memory");
	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, pid, 0, va, ap);
}

static __inline void
radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
	radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_4K);
}

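/*
 * A worked example of the operand encoding used by radix_tlbie() above
 * (illustration only): invalidating the 2MB user page at va 0x10200000 for
 * PID 5 builds rs = (5ULL << 32) | 0 = 0x500000000 and
 * rb = 0x10200000 | TLBIEL_INVAL_PAGE | TLBIE_ACTUAL_PAGE_2M = 0x10200020.
 * Because va must be page aligned (see the MPASS), the low 12 bits of rb
 * are free to carry the IS and AP selector fields.  The extra ptesync/tlbie
 * pair issued by radix_tlbie_fixup() appears to mirror the usual POWER9
 * tlbie errata workaround of repeating the invalidation with PID 0 first.
 */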
static __inline void
radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M);
	radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_2M);
}

static __inline void
radix_tlbie_invlpwc_user(uint32_t pid)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
}

static __inline void
radix_tlbie_flush_user(uint32_t pid)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
}

static __inline void
radix_tlbie_invlpg_kernel_4k(vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K);
	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_4K);
}

static __inline void
radix_tlbie_invlpg_kernel_2m(vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M);
	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_2M);
}

/* 1GB pages aren't currently supported. */
static __inline __unused void
radix_tlbie_invlpg_kernel_1g(vm_offset_t va)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G);
	radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_1G);
}

static __inline void
radix_tlbie_invlpwc_kernel(void)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
}

static __inline void
radix_tlbie_flush_kernel(void)
{

	radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
	    TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
}

static __inline vm_pindex_t
pmap_l3e_pindex(vm_offset_t va)
{
	return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT);
}

static __inline vm_pindex_t
pmap_pml3e_index(vm_offset_t va)
{

	return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK);
}

static __inline vm_pindex_t
pmap_pml2e_index(vm_offset_t va)
{
	return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK);
}

static __inline vm_pindex_t
pmap_pml1e_index(vm_offset_t va)
{
	return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT);
}

/* Return various clipped indexes for a given VA */
static __inline vm_pindex_t
pmap_pte_index(vm_offset_t va)
{

	return ((va >> PAGE_SHIFT) & RPTE_MASK);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va)
{
	pt_entry_t *pte;
	vm_paddr_t ptepa;

	ptepa = (be64toh(*l3e) & NLB_MASK);
	pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va)
{
	pt_entry_t *l3e;
	vm_paddr_t l3pa;

	l3pa = (be64toh(*l2e) & NLB_MASK);
	l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa);
	return (&l3e[pmap_pml3e_index(va)]);
}

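/*
 * The pmap_l?e_to_*() helpers here and the pmap_pml?e()/pmap_pte() wrappers
 * below implement the four-level radix walk in software: pmap_pml1e()
 * indexes the RADIX_PGD_SIZE (64KB) root table, and each subsequent step
 * reads the next-level base (NLB) out of the current entry, converts it to
 * a DMAP address, and indexes it with the 9-bit index for that level.
 * pmap_pte() composes the steps and returns NULL as soon as a level is
 * missing or not valid; a leaf entry (RPTE_LEAF) found at the L3 level
 * instead of a PTE page denotes a 2MB superpage mapping.
 */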
/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va)
{
	pt_entry_t *l2e;
	vm_paddr_t l2pa;

	l2pa = (be64toh(*l1e) & NLB_MASK);

	l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa);
	return (&l2e[pmap_pml2e_index(va)]);
}

static __inline pml1_entry_t *
pmap_pml1e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml1[pmap_pml1e_index(va)]);
}

static pt_entry_t *
pmap_pml2e(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *l1e;

	l1e = pmap_pml1e(pmap, va);
	if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0)
		return (NULL);
	return (pmap_l1e_to_l2e(l1e, va));
}

static __inline pt_entry_t *
pmap_pml3e(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *l2e;

	l2e = pmap_pml2e(pmap, va);
	if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0)
		return (NULL);
	return (pmap_l2e_to_l3e(l2e, va));
}

static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *l3e;

	l3e = pmap_pml3e(pmap, va);
	if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
		return (NULL);
	return (pmap_l3e_to_pte(l3e, va));
}

int nkpt = 64;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

vm_paddr_t dmaplimit;

SYSCTL_DECL(_vm_pmap);

#ifdef INVARIANTS
#define	VERBOSE_PMAP 0
#define	VERBOSE_PROTECT 0
static int pmap_logging;
SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN,
    &pmap_logging, 0, "verbose debug logging");
#endif

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */

//static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */

static vm_offset_t qframe = 0;
static struct mtx qframe_mtx;

void mmu_radix_activate(struct thread *);
void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int);
void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
    vm_size_t);
void mmu_radix_clear_modify(vm_page_t);
void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *);
int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t);
void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
    vm_prot_t);
void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va);
vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
void mmu_radix_kenter(vm_offset_t, vm_paddr_t);
vm_paddr_t mmu_radix_kextract(vm_offset_t);
void mmu_radix_kremove(vm_offset_t);
boolean_t mmu_radix_is_modified(vm_page_t);
boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t);
boolean_t mmu_radix_is_referenced(vm_page_t);
void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t,
    vm_pindex_t, vm_size_t);
boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t);
void mmu_radix_page_init(vm_page_t);
boolean_t mmu_radix_page_is_mapped(vm_page_t m);
void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t);
int mmu_radix_page_wired_mappings(vm_page_t);
int mmu_radix_pinit(pmap_t);
void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
bool mmu_radix_ps_enabled(pmap_t);
void mmu_radix_qenter(vm_offset_t, vm_page_t *, int);
void mmu_radix_qremove(vm_offset_t, int);
vm_offset_t mmu_radix_quick_enter_page(vm_page_t);
void mmu_radix_quick_remove_page(vm_offset_t);
boolean_t mmu_radix_ts_referenced(vm_page_t);
void mmu_radix_release(pmap_t);
void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t);
void mmu_radix_remove_all(vm_page_t);
void mmu_radix_remove_pages(pmap_t);
void mmu_radix_remove_write(vm_page_t);
void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t);
void mmu_radix_zero_page(vm_page_t);
void mmu_radix_zero_page_area(vm_page_t, int, int);
int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
void mmu_radix_page_array_startup(long pages);

#include "mmu_oea64.h"

/*
 * Kernel MMU interface
 */

static void	mmu_radix_bootstrap(vm_offset_t, vm_offset_t);

static void mmu_radix_copy_page(vm_page_t, vm_page_t);
static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
    vm_page_t *mb, vm_offset_t b_offset, int xfersize);
static void mmu_radix_growkernel(vm_offset_t);
static void mmu_radix_init(void);
static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
static void mmu_radix_pinit0(pmap_t);

static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t);
static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
static void mmu_radix_unmapdev(vm_offset_t, vm_size_t);
static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t);
static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
static void mmu_radix_scan_init(void);
static void	mmu_radix_cpu_bootstrap(int ap);
static void	mmu_radix_tlbie_all(void);

static struct pmap_funcs mmu_radix_methods = {
	.bootstrap = mmu_radix_bootstrap,
	.copy_page = mmu_radix_copy_page,
	.copy_pages = mmu_radix_copy_pages,
	.cpu_bootstrap = mmu_radix_cpu_bootstrap,
	.growkernel = mmu_radix_growkernel,
	.init = mmu_radix_init,
	.map = mmu_radix_map,
	.mincore = mmu_radix_mincore,
	.pinit = mmu_radix_pinit,
	.pinit0 = mmu_radix_pinit0,

	.mapdev = mmu_radix_mapdev,
	.mapdev_attr = mmu_radix_mapdev_attr,
	.unmapdev = mmu_radix_unmapdev,
	.kenter_attr = mmu_radix_kenter_attr,
	.dev_direct_mapped = mmu_radix_dev_direct_mapped,
	.dumpsys_pa_init = mmu_radix_scan_init,
	.dumpsys_map_chunk = mmu_radix_dumpsys_map,
	.page_is_mapped = mmu_radix_page_is_mapped,
	.ps_enabled = mmu_radix_ps_enabled,
	.align_superpage = mmu_radix_align_superpage,
	.object_init_pt = mmu_radix_object_init_pt,
	.protect = mmu_radix_protect,
	/* pmap dispatcher interface */
	.clear_modify = mmu_radix_clear_modify,
	.copy = mmu_radix_copy,
	.enter = mmu_radix_enter,
	.enter_object = mmu_radix_enter_object,
	.enter_quick = mmu_radix_enter_quick,
	.extract = mmu_radix_extract,
	.extract_and_hold = mmu_radix_extract_and_hold,
	.is_modified = mmu_radix_is_modified,
	.is_prefaultable = mmu_radix_is_prefaultable,
	.is_referenced = mmu_radix_is_referenced,
	.ts_referenced = mmu_radix_ts_referenced,
	.page_exists_quick = mmu_radix_page_exists_quick,
	.page_init = mmu_radix_page_init,
	.page_wired_mappings = mmu_radix_page_wired_mappings,
	.qenter = mmu_radix_qenter,
	.qremove = mmu_radix_qremove,
	.release = mmu_radix_release,
	.remove = mmu_radix_remove,
	.remove_all = mmu_radix_remove_all,
	.remove_write = mmu_radix_remove_write,
	.unwire = mmu_radix_unwire,
	.zero_page = mmu_radix_zero_page,
	.zero_page_area = mmu_radix_zero_page_area,
	.activate = mmu_radix_activate,
	.quick_enter_page = mmu_radix_quick_enter_page,
	.quick_remove_page = mmu_radix_quick_remove_page,
	.page_set_memattr = mmu_radix_page_set_memattr,
	.page_array_startup = mmu_radix_page_array_startup,

	/* Internal interfaces */
	.kenter = mmu_radix_kenter,
	.kextract = mmu_radix_kextract,
	.kremove = mmu_radix_kremove,
	.change_attr = mmu_radix_change_attr,
	.decode_kernel_ptr = mmu_radix_decode_kernel_ptr,

	.tlbie_all = mmu_radix_tlbie_all,
};

MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods);

static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
	struct rwlock **lockp);
static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *);
static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde,
    struct spglist *free);
static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp);

static bool	pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e,
		    u_int flags, struct rwlock **lockp);
#if VM_NRESERVLEVEL > 0
static void	pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
	struct rwlock **lockp);
#endif
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
	vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate);

static bool	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    vm_prot_t prot, struct rwlock **lockp);
static int	pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde,
		    u_int flags, vm_page_t m, struct rwlock **lockp);

static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void free_pv_chunk(struct pv_chunk *pc);
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp);
static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va,
	struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
	struct rwlock **lockp);
static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free);

static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start);
static void pmap_invalidate_all(pmap_t pmap);
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush);

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
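/*
 * These internal flags share the "flags" argument of pmap_enter() with the
 * MI PMAP_ENTER_* and access-type bits, so they live in high-order bits
 * that are presumably unused by the MI layer.  PMAP_ENTER_NORECLAIM makes
 * the PV allocation paths fail instead of calling reclaim_pv_chunk(), much
 * like passing a NULL lock pointer to get_pv_entry().
 */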

#define	UNIMPLEMENTED()	panic("%s not implemented", __func__)
#define	UNTESTED()	panic("%s not yet tested", __func__)

/* Number of supported PID bits */
static unsigned int isa3_pid_bits;

/* PID to start allocating from */
static unsigned int isa3_base_pid;

#define	PROCTAB_SIZE_SHIFT	(isa3_pid_bits + 4)
#define	PROCTAB_ENTRIES	(1ul << isa3_pid_bits)

/*
 * Map of physical memory regions.
 */
static struct	mem_region *regions, *pregions;
static struct	numa_mem_region *numa_pregions;
static u_int	phys_avail_count;
static int	regions_sz, pregions_sz, numa_pregions_sz;
static struct pate *isa3_parttab;
static struct prte *isa3_proctab;
static vmem_t *asid_arena;

extern void bs_remap_earlyboot(void);

#define	RADIX_PGD_SIZE_SHIFT	16
#define	RADIX_PGD_SIZE	(1UL << RADIX_PGD_SIZE_SHIFT)

#define	RADIX_PGD_INDEX_SHIFT	(RADIX_PGD_SIZE_SHIFT-3)
#define	NL2EPG		(PAGE_SIZE/sizeof(pml2_entry_t))
#define	NL3EPG		(PAGE_SIZE/sizeof(pml3_entry_t))

#define	NUPML1E		(RADIX_PGD_SIZE/sizeof(uint64_t))	/* number of userland PML1 pages */
#define	NUPDPE		(NUPML1E * NL2EPG)	/* number of userland PDP pages */
#define	NUPDE		(NUPDPE * NL3EPG)	/* number of userland PD entries */

/* POWER9 only permits a 64k partition table size. */
#define	PARTTAB_SIZE_SHIFT	16
#define	PARTTAB_SIZE	(1UL << PARTTAB_SIZE_SHIFT)

#define	PARTTAB_HR	(1UL << 63)	/* host uses radix */
#define	PARTTAB_GR	(1UL << 63)	/* guest uses radix must match host */

/* TLB flush actions. Used as argument to tlbiel_all() */
enum {
	TLB_INVAL_SCOPE_LPID = 0,	/* invalidate TLBs for current LPID */
	TLB_INVAL_SCOPE_GLOBAL = 1,	/* invalidate all TLBs */
};

#define	NPV_LIST_LOCKS	MAXCPU
static int pmap_initialized;
static vm_paddr_t proctab0pa;
static vm_paddr_t parttab_phys;
CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);

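/*
 * Each pv_chunk occupies exactly one page (see the CTASSERT above), which
 * is what allows pv_to_chunk() to recover the owning chunk by simply
 * masking a pv_entry pointer with ~PAGE_MASK.  Chunk pages are allocated
 * without a VM object and are reached through the direct map.
 */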
670 */ 671 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 672 static struct mtx __exclusive_cache_line pv_chunks_mutex; 673 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 674 static struct md_page *pv_table; 675 static struct md_page pv_dummy; 676 677 #ifdef PV_STATS 678 #define PV_STAT(x) do { x ; } while (0) 679 #else 680 #define PV_STAT(x) do { } while (0) 681 #endif 682 683 #define pa_radix_index(pa) ((pa) >> L3_PAGE_SIZE_SHIFT) 684 #define pa_to_pvh(pa) (&pv_table[pa_radix_index(pa)]) 685 686 #define PHYS_TO_PV_LIST_LOCK(pa) \ 687 (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS]) 688 689 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 690 struct rwlock **_lockp = (lockp); \ 691 struct rwlock *_new_lock; \ 692 \ 693 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 694 if (_new_lock != *_lockp) { \ 695 if (*_lockp != NULL) \ 696 rw_wunlock(*_lockp); \ 697 *_lockp = _new_lock; \ 698 rw_wlock(*_lockp); \ 699 } \ 700 } while (0) 701 702 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 703 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 704 705 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 706 struct rwlock **_lockp = (lockp); \ 707 \ 708 if (*_lockp != NULL) { \ 709 rw_wunlock(*_lockp); \ 710 *_lockp = NULL; \ 711 } \ 712 } while (0) 713 714 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 715 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 716 717 /* 718 * We support 52 bits, hence: 719 * bits 52 - 31 = 21, 0b10101 720 * RTS encoding details 721 * bits 0 - 3 of rts -> bits 6 - 8 unsigned long 722 * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long 723 */ 724 #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5)) 725 726 static int powernv_enabled = 1; 727 728 static __always_inline void 729 tlbiel_radix_set_isa300(uint32_t set, uint32_t is, 730 uint32_t pid, uint32_t ric, uint32_t prs) 731 { 732 uint64_t rb; 733 uint64_t rs; 734 735 rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53); 736 rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31); 737 738 __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) 739 : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) 740 : "memory"); 741 } 742 743 static void 744 tlbiel_flush_isa3(uint32_t num_sets, uint32_t is) 745 { 746 uint32_t set; 747 748 __asm __volatile("ptesync": : :"memory"); 749 750 /* 751 * Flush the first set of the TLB, and the entire Page Walk Cache 752 * and partition table entries. Then flush the remaining sets of the 753 * TLB. 754 */ 755 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); 756 for (set = 1; set < num_sets; set++) 757 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); 758 759 /* Do the same for process scoped entries. 
	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
	for (set = 1; set < num_sets; set++)
		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);

	__asm __volatile("ptesync": : :"memory");
}

static void
mmu_radix_tlbiel_flush(int scope)
{
	int is;

	MPASS(scope == TLB_INVAL_SCOPE_LPID ||
		  scope == TLB_INVAL_SCOPE_GLOBAL);
	is = scope + 2;

	tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, is);
	__asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}

static void
mmu_radix_tlbie_all()
{
	/* TODO: LPID invalidate */
	mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
}

static void
mmu_radix_init_amor(void)
{
	/*
	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
	 * the hypervisor and guest can setup IAMR (Instruction Authority Mask
	 * Register), enable key 0 and set it to 1.
	 *
	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	 */
	mtspr(SPR_AMOR, (3ul << 62));
}

static void
mmu_radix_init_iamr(void)
{
	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPR_IAMR, (1ul << 62));
}

static void
mmu_radix_pid_set(pmap_t pmap)
{

	mtspr(SPR_PID, pmap->pm_pid);
	isync();
}

/* Quick sort callout for comparing physical addresses. */
static int
pa_cmp(const void *a, const void *b)
{
	const vm_paddr_t *pa = a, *pb = b;

	if (*pa < *pb)
		return (-1);
	else if (*pa > *pb)
		return (1);
	else
		return (0);
}

#define	pte_load_store(ptep, pte)	atomic_swap_long(ptep, pte)
#define	pte_load_clear(ptep)		atomic_swap_long(ptep, 0)
#define	pte_store(ptep, pte) do {	   \
	MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X));	\
	*(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \
} while (0)
/*
 * NB: should only be used for adding directories - not for direct mappings
 */
#define	pde_store(ptep, pa) do {				\
	*(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \
} while (0)

#define	pte_clear(ptep) do {					\
		*(u_long *)(ptep) = (u_long)(0);		\
} while (0)

#define	PMAP_PDE_SUPERPAGE	(1 << 8)	/* supports 2MB superpages */

/*
 * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
 * (PTE) page mappings have identical settings for the following fields:
 */
#define	PG_PTE_PROMOTE	(PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \
	    PG_M | PG_A | RPTE_EAA_MASK | PG_V)

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static void
pagezero(vm_offset_t va)
{
	va = trunc_page(va);

	bzero((void *)va, PAGE_SIZE);
}

static uint64_t
allocpages(int n)
{
	u_int64_t ret;

	ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE);
	for (int i = 0; i < n; i++)
		pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE));
	return (ret);
}
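/*
 * allocpages() is only usable during early boot: the memory is carved out
 * of the physical regions by moea64_bootstrap_alloc() before the VM system
 * is up, and the pages are zeroed through the direct map, so callers get
 * back a physical address rather than a KVA.  mmu_radix_setup_pagetables()
 * below, for example, takes its initial KVA page-table pages this way.
 */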

static pt_entry_t *
kvtopte(vm_offset_t va)
{
	pt_entry_t *l3e;

	l3e = pmap_pml3e(kernel_pmap, va);
	if ((be64toh(*l3e) & RPTE_VALID) == 0)
		return (NULL);
	return (pmap_l3e_to_pte(l3e, va));
}

void
mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = kvtopte(va);
	MPASS(pte != NULL);
	*pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \
	    RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A);
}

bool
mmu_radix_ps_enabled(pmap_t pmap)
{
	return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
}

static pt_entry_t *
pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e)
{
	pml3_entry_t *l3e;
	pt_entry_t *pte;

	va &= PG_PS_FRAME;
	l3e = pmap_pml3e(pmap, va);
	if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0)
		return (NULL);

	if (be64toh(*l3e) & RPTE_LEAF) {
		*is_l3e = 1;
		return (l3e);
	}
	*is_l3e = 0;
	va &= PG_FRAME;
	pte = pmap_l3e_to_pte(l3e, va);
	if (pte == NULL || (be64toh(*pte) & PG_V) == 0)
		return (NULL);
	return (pte);
}

int
pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags)
{
	pt_entry_t *pte;
	pt_entry_t startpte, origpte, newpte;
	vm_page_t m;
	int is_l3e;

	startpte = 0;
 retry:
	if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL)
		return (KERN_INVALID_ADDRESS);
	origpte = newpte = be64toh(*pte);
	if (startpte == 0) {
		startpte = origpte;
		if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) ||
		    ((flags & VM_PROT_READ) && (startpte & PG_A))) {
			pmap_invalidate_all(pmap);
#ifdef INVARIANTS
			if (VERBOSE_PMAP || pmap_logging)
				printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n",
				    __func__, pmap, va, flags, origpte);
#endif
			return (KERN_FAILURE);
		}
	}
#ifdef INVARIANTS
	if (VERBOSE_PMAP || pmap_logging)
		printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va,
		    flags, origpte);
#endif
	PMAP_LOCK(pmap);
	if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL ||
	    be64toh(*pte) != origpte) {
		PMAP_UNLOCK(pmap);
		return (KERN_FAILURE);
	}
	m = PHYS_TO_VM_PAGE(newpte & PG_FRAME);
	MPASS(m != NULL);
	switch (flags) {
	case VM_PROT_READ:
		if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0)
			goto protfail;
		newpte |= PG_A;
		vm_page_aflag_set(m, PGA_REFERENCED);
		break;
	case VM_PROT_WRITE:
		if ((newpte & RPTE_EAA_W) == 0)
			goto protfail;
		if (is_l3e)
			goto protfail;
		newpte |= PG_M;
		vm_page_dirty(m);
		break;
	case VM_PROT_EXECUTE:
		if ((newpte & RPTE_EAA_X) == 0)
			goto protfail;
		newpte |= PG_A;
		vm_page_aflag_set(m, PGA_REFERENCED);
		break;
	}

	if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
		goto retry;
	ptesync();
	PMAP_UNLOCK(pmap);
	if (startpte == newpte)
		return (KERN_FAILURE);
	return (0);
 protfail:
	PMAP_UNLOCK(pmap);
	return (KERN_PROTECTION_FAILURE);
}

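/*
 * pmap_nofault() above provides software emulation of the referenced (R)
 * and changed (C) PTE bits: on a fault it revalidates the mapping and sets
 * PG_A or PG_M with an atomic compare-and-swap on the big-endian PTE image,
 * updating the vm_page's referenced/dirty state to match.  A fault taken
 * while the relevant bit is already set appears to be treated as stale TLB
 * state and is answered with a full invalidation rather than a PTE update.
 */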
/*
 * Returns TRUE if the given page is mapped individually or as part of
 * a 2mpage.  Otherwise, returns FALSE.
 */
boolean_t
mmu_radix_page_is_mapped(vm_page_t m)
{
	struct rwlock *lock;
	boolean_t rv;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (FALSE);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
	    ((m->flags & PG_FICTITIOUS) == 0 &&
	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
	rw_runlock(lock);
	return (rv);
}

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
static int
pmap_cache_bits(vm_memattr_t ma)
{
	if (ma != VM_MEMATTR_DEFAULT) {
		switch (ma) {
		case VM_MEMATTR_UNCACHEABLE:
			return (RPTE_ATTR_GUARDEDIO);
		case VM_MEMATTR_CACHEABLE:
			return (RPTE_ATTR_MEM);
		case VM_MEMATTR_WRITE_BACK:
		case VM_MEMATTR_PREFETCHABLE:
		case VM_MEMATTR_WRITE_COMBINING:
			return (RPTE_ATTR_UNGUARDEDIO);
		}
	}
	return (0);
}

static void
pmap_invalidate_page(pmap_t pmap, vm_offset_t start)
{
	ptesync();
	if (pmap == kernel_pmap)
		radix_tlbie_invlpg_kernel_4k(start);
	else
		radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
	ttusync();
}

static void
pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start)
{
	ptesync();
	if (pmap == kernel_pmap)
		radix_tlbie_invlpg_kernel_2m(start);
	else
		radix_tlbie_invlpg_user_2m(pmap->pm_pid, start);
	ttusync();
}

static void
pmap_invalidate_pwc(pmap_t pmap)
{
	ptesync();
	if (pmap == kernel_pmap)
		radix_tlbie_invlpwc_kernel();
	else
		radix_tlbie_invlpwc_user(pmap->pm_pid);
	ttusync();
}

static void
pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end)
{
	if (((end - start) >> PAGE_SHIFT) > 8) {
		pmap_invalidate_all(pmap);
		return;
	}
	ptesync();
	if (pmap == kernel_pmap) {
		while (start < end) {
			radix_tlbie_invlpg_kernel_4k(start);
			start += PAGE_SIZE;
		}
	} else {
		while (start < end) {
			radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
			start += PAGE_SIZE;
		}
	}
	ttusync();
}

static void
pmap_invalidate_all(pmap_t pmap)
{
	ptesync();
	if (pmap == kernel_pmap)
		radix_tlbie_flush_kernel();
	else
		radix_tlbie_flush_user(pmap->pm_pid);
	ttusync();
}

static void
pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e)
{

	/*
	 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
	 * by a promotion that did not invalidate the 512 4KB page mappings
	 * that might exist in the TLB.  Consequently, at this point, the TLB
	 * may hold both 4KB and 2MB page mappings for the address range [va,
	 * va + L3_PAGE_SIZE).  Therefore, the entire range must be invalidated
	 * here.  In contrast, when PG_PROMOTED is clear, the TLB will not hold
	 * any 4KB page mappings for the address range [va, va + L3_PAGE_SIZE),
	 * and so a single INVLPG suffices to invalidate the 2MB page mapping
	 * from the TLB.
	 */
	ptesync();
	if ((l3e & PG_PROMOTED) != 0)
		pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1);
	else
		pmap_invalidate_page_2m(pmap, va);

	pmap_invalidate_pwc(pmap);
}

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define	PV_PMAP(pv)	(pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0x3ffffffffffffffful

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 };

/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free;
	bool reclaimed;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	TAILQ_INIT(&new_tail);
 retry:
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		//		if ((cpu_feature2 & CPUID2_POPCNT) == 0)
		bit_count((bitstr_t *)pc->pc_map, 0,
		    sizeof(pc->pc_map) * NBBY, &free);
#if 0
		free = popcnt_pc_map_pq(pc->pc_map);
#endif
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	for (reclaimed = false; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
			reclaimed = true;
		}
		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
		dump_add_page(m->phys_addr);
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		pc->pc_map[0] = PC_FREE0;
		pc->pc_map[1] = PC_FREE1;
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));

		/*
		 * The reclaim might have freed a chunk from the current pmap.
		 * If that chunk contained available entries, we need to
		 * re-count the number of available entries.
		 */
		if (reclaimed)
			goto retry;
	}
	if (!TAILQ_EMPTY(&new_tail)) {
		mtx_lock(&pv_chunks_mutex);
		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
		mtx_unlock(&pv_chunks_mutex);
	}
}

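/*
 * A note on the chunk bitmaps used above: a set bit in pc_map[] marks a
 * free pv_entry slot, so a fresh chunk starts out as PC_FREE0/PC_FREE1
 * (64 + 62 set bits), which presumably corresponds to _NPCPV == 126
 * entries per one-page chunk spread over _NPCM == 2 bitmap words.
 */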
/*
 * First find and then remove the pv entry for the specified pmap and virtual
 * address from the specified pv list.  Returns the pv entry if found and NULL
 * otherwise.  This operation can be performed on pv lists for either 4KB or
 * 2MB page mappings.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
#ifdef INVARIANTS
		if (PV_PMAP(pv) == NULL) {
			printf("corrupted pv_chunk/pv %p\n", pv);
			printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":");
		}
		MPASS(PV_PMAP(pv) != NULL);
		MPASS(pv->pv_va != 0);
#endif
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
			pvh->pv_gen++;
			break;
		}
	}
	return (pv);
}

/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 */
static void
pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;
	int bit, field;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((pa & L3_PAGE_MASK) == 0,
	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	va = trunc_2mpage(va);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);

	m->md.pv_gen++;
	/* Instantiate the remaining NPTEPG - 1 pv entries. */
	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
	va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
	for (;;) {
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0,
		    ("pmap_pv_demote_pde: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field]) {
				bit = cnttzd(pc->pc_map[field]);
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
				    ("pmap_pv_demote_pde: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);

				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
}

static void
reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap)
{

	if (pmap == NULL)
		return;
	pmap_invalidate_all(pmap);
	if (pmap != locked_pmap)
		PMAP_UNLOCK(pmap);
}

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static int active_reclaims = 0;
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{
	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
	struct md_page *pvh;
	pml3_entry_t *l3e;
	pmap_t next_pmap, pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint64_t inuse;
	int bit, field, freed;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
	pmap = NULL;
	m_pc = NULL;
	SLIST_INIT(&free);
	bzero(&pc_marker_b, sizeof(pc_marker_b));
	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
	pc_marker = (struct pv_chunk *)&pc_marker_b;
	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;

	mtx_lock(&pv_chunks_mutex);
	active_reclaims++;
	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
	    SLIST_EMPTY(&free)) {
		next_pmap = pc->pc_pmap;
		if (next_pmap == NULL) {
			/*
			 * The next chunk is a marker.  However, it is
			 * not our marker, so active_reclaims must be
			 * > 1.  Consequently, the next_chunk code
			 * will not rotate the pv_chunks list.
			 */
			goto next_chunk;
		}
		mtx_unlock(&pv_chunks_mutex);

		/*
		 * A pv_chunk can only be removed from the pc_lru list
		 * when both pv_chunks_mutex is owned and the
		 * corresponding pmap is locked.
		 */
		if (pmap != next_pmap) {
			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
			pmap = next_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap) {
				RELEASE_PV_LIST_LOCK(lockp);
				PMAP_LOCK(pmap);
				mtx_lock(&pv_chunks_mutex);
				continue;
			} else if (pmap != locked_pmap) {
				if (PMAP_TRYLOCK(pmap)) {
					mtx_lock(&pv_chunks_mutex);
					continue;
				} else {
					pmap = NULL; /* pmap is not locked */
					mtx_lock(&pv_chunks_mutex);
					pc = TAILQ_NEXT(pc_marker, pc_lru);
					if (pc == NULL ||
					    pc->pc_pmap != next_pmap)
						continue;
					goto next_chunk;
				}
			}
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = cnttzd(inuse);
				pv = &pc->pc_pventry[field * 64 + bit];
				va = pv->pv_va;
				l3e = pmap_pml3e(pmap, va);
				if ((be64toh(*l3e) & RPTE_LEAF) != 0)
					continue;
				pte = pmap_l3e_to_pte(l3e, va);
				if ((be64toh(*pte) & PG_W) != 0)
					continue;
				tpte = be64toh(pte_load_clear(pte));
				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
					vm_page_dirty(m);
				if ((tpte & PG_A) != 0)
					vm_page_aflag_set(m, PGA_REFERENCED);
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);

				m->md.pv_gen++;
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, be64toh(*l3e), &free);
				freed++;
			}
		}
		if (freed == 0) {
			mtx_lock(&pv_chunks_mutex);
			goto next_chunk;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap_resident_count_dec(pmap, freed);
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) {
			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
			dump_drop_page(m_pc->phys_addr);
			mtx_lock(&pv_chunks_mutex);
			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
			break;
		}
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		mtx_lock(&pv_chunks_mutex);
		/* One freed pv entry in locked_pmap is sufficient. */
		if (pmap == locked_pmap)
			break;
next_chunk:
		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
		if (active_reclaims == 1 && pmap != NULL) {
			/*
			 * Rotate the pv chunks list so that we do not
			 * scan the same pv chunks that could not be
			 * freed (because they contained a wired
			 * and/or superpage mapping) on every
			 * invocation of reclaim_pv_chunk().
			 */
			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
				MPASS(pc->pc_pmap != NULL);
				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
			}
		}
	}
	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
	active_reclaims--;
	mtx_unlock(&pv_chunks_mutex);
	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->ref_count = 1;
	}
	vm_page_free_pages_toq(&free, true);
	return (m_pc);
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

#ifdef VERBOSE_PV
	if (pmap != kernel_pmap)
		printf("%s(%p, %p)\n", __func__, pmap, pv);
#endif
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	mtx_lock(&pv_chunks_mutex);
	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire_noq(m);
	vm_page_free(m);
}

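/*
 * get_pv_entry() below and free_pv_entry() above are the two ends of the
 * same bitmap protocol: allocation clears the lowest set bit found with
 * cnttzd() (a brand-new chunk is created with bit 0 already cleared and
 * pc_pventry[0] handed straight back to the caller), while freeing sets
 * the bit again and hands a fully free chunk back to free_pv_chunk().
 */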
/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = cnttzd(pc->pc_map[field]);
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			MPASS(PV_PMAP(pv) != NULL);
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED);
	if (m == NULL) {
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
	pc->pc_map[1] = PC_FREE1;
	mtx_lock(&pv_chunks_mutex);
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	MPASS(PV_PMAP(pv) != NULL);
	return (pv);
}

#if VM_NRESERVLEVEL > 0
/*
 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 * replace the many pv entries for the 4KB page mappings by a single pv entry
 * for the 2MB page mapping.
 */
static void
pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	KASSERT((pa & L3_PAGE_MASK) == 0,
	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the first page's pv entry for this mapping to the 2mpage's
	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
	 * a transfer avoids the possibility that get_pv_entry() calls
	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
	 * mappings that is being promoted.
	 */
	m = PHYS_TO_VM_PAGE(pa);
	va = trunc_2mpage(va);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
	pvh->pv_gen++;
	/* Free the remaining NPTEPG - 1 pv entries. */
	va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
		pmap_pvh_free(&m->md, pmap, va);
	} while (va < va_last);
}
#endif /* VM_NRESERVLEVEL > 0 */

/*
 * First find and then destroy the pv entry for the specified pmap and virtual
 * address.  This operation can be performed on pv lists for either 4KB or 2MB
 * page mappings.
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

/*
 * Conditionally create the PV entry for a 4KB page mapping if the required
 * memory can be allocated without resorting to reclamation.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp)
{
	pv_entry_t pv;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Pass NULL instead of the lock pointer to disable reclamation. */
	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
		pv->pv_va = va;
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
		m->md.pv_gen++;
		return (TRUE);
	} else
		return (FALSE);
}

vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX];
#ifdef INVARIANTS
static void
validate_addr(vm_paddr_t addr, vm_size_t size)
{
	vm_paddr_t end = addr + size;
	bool found = false;

	for (int i = 0; i < 2 * phys_avail_count; i += 2) {
		if (addr >= phys_avail_debug[i] &&
		    end <= phys_avail_debug[i + 1]) {
			found = true;
			break;
		}
	}
	KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array",
	    addr, end));
}
#else
static void validate_addr(vm_paddr_t addr, vm_size_t size) {}
#endif
#define	DMAP_PAGE_BITS	(RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A)

static vm_paddr_t
alloc_pt_page(void)
{
	vm_paddr_t page;

	page = allocpages(1);
	pagezero(PHYS_TO_DMAP(page));
	return (page);
}

static void
mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end)
{
	pt_entry_t *pte, pteval;
	vm_paddr_t page;

	if (bootverbose)
		printf("%s %lx -> %lx\n", __func__, start, end);
	while (start < end) {
		pteval = start | DMAP_PAGE_BITS;
		pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start));
		if ((be64toh(*pte) & RPTE_VALID) == 0) {
			page = alloc_pt_page();
			pde_store(pte, page);
		}
		pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start));
		if ((start & L2_PAGE_MASK) == 0 &&
		    end - start >= L2_PAGE_SIZE) {
			start += L2_PAGE_SIZE;
			goto done;
		} else if ((be64toh(*pte) & RPTE_VALID) == 0) {
			page = alloc_pt_page();
			pde_store(pte, page);
		}

		pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start));
		if ((start & L3_PAGE_MASK) == 0 &&
		    end - start >= L3_PAGE_SIZE) {
			start += L3_PAGE_SIZE;
			goto done;
		} else if ((be64toh(*pte) & RPTE_VALID) == 0) {
			page = alloc_pt_page();
			pde_store(pte, page);
		}
		pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start));
		start += PAGE_SIZE;
	done:
		pte_store(pte, pteval);
	}
}

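/*
 * mmu_radix_dmap_range() above maps the direct map with the largest leaf
 * that fits at each step: a 1GB leaf at the L2 slot when the physical
 * address is L2_PAGE_MASK aligned and at least L2_PAGE_SIZE remains, a 2MB
 * leaf at the L3 slot under the analogous condition, and 4KB PTEs otherwise
 * (the 1GB/2MB figures follow from the 512-entry levels).  DMAP_PAGE_BITS
 * sets the full EAA permission mask and marks the mappings referenced and
 * modified up front.
 */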
pregions[i].mr_start; 1823 end = start + pregions[i].mr_size; 1824 if (hwphyssz && start >= hwphyssz) 1825 break; 1826 if (hwphyssz && hwphyssz < end) 1827 end = hwphyssz; 1828 mmu_radix_dmap_range(start, end); 1829 } 1830 } 1831 1832 static void 1833 mmu_radix_setup_pagetables(vm_size_t hwphyssz) 1834 { 1835 vm_paddr_t ptpages, pages; 1836 pt_entry_t *pte; 1837 vm_paddr_t l1phys; 1838 1839 bzero(kernel_pmap, sizeof(struct pmap)); 1840 PMAP_LOCK_INIT(kernel_pmap); 1841 1842 ptpages = allocpages(3); 1843 l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE); 1844 validate_addr(l1phys, RADIX_PGD_SIZE); 1845 if (bootverbose) 1846 printf("l1phys=%lx\n", l1phys); 1847 MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0); 1848 for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++) 1849 pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE)); 1850 kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys); 1851 1852 mmu_radix_dmap_populate(hwphyssz); 1853 1854 /* 1855 * Create page tables for first 128MB of KVA 1856 */ 1857 pages = ptpages; 1858 pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS); 1859 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); 1860 pages += PAGE_SIZE; 1861 pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS); 1862 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); 1863 pages += PAGE_SIZE; 1864 pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS); 1865 /* 1866 * the kernel page table pages need to be preserved in 1867 * phys_avail and not overlap with previous allocations 1868 */ 1869 pages = allocpages(nkpt); 1870 if (bootverbose) { 1871 printf("phys_avail after dmap populate and nkpt allocation\n"); 1872 for (int j = 0; j < 2 * phys_avail_count; j+=2) 1873 printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", 1874 j, phys_avail[j], j + 1, phys_avail[j + 1]); 1875 } 1876 KPTphys = pages; 1877 for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE) 1878 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); 1879 kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE; 1880 if (bootverbose) 1881 printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1); 1882 /* 1883 * Add a physical memory segment (vm_phys_seg) corresponding to the 1884 * preallocated kernel page table pages so that vm_page structures 1885 * representing these pages will be created. The vm_page structures 1886 * are required for promotion of the corresponding kernel virtual 1887 * addresses to superpage mappings. 
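 * The PTPs themselves were carved out by allocpages() above;
 * mmu_radix_init() later initializes their vm_page structures and
 * accounts for them as wired.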
1888 */ 1889 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1890 } 1891 1892 static void 1893 mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end) 1894 { 1895 vm_paddr_t kpstart, kpend; 1896 vm_size_t physsz, hwphyssz; 1897 //uint64_t l2virt; 1898 int rm_pavail, proctab_size; 1899 int i, j; 1900 1901 kpstart = start & ~DMAP_BASE_ADDRESS; 1902 kpend = end & ~DMAP_BASE_ADDRESS; 1903 1904 /* Get physical memory regions from firmware */ 1905 mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); 1906 CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory"); 1907 1908 if (2 * VM_PHYSSEG_MAX < regions_sz) 1909 panic("mmu_radix_early_bootstrap: phys_avail too small"); 1910 1911 if (bootverbose) 1912 for (int i = 0; i < regions_sz; i++) 1913 printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n", 1914 i, regions[i].mr_start, i, regions[i].mr_size); 1915 /* 1916 * XXX workaround a simulator bug 1917 */ 1918 for (int i = 0; i < regions_sz; i++) 1919 if (regions[i].mr_start & PAGE_MASK) { 1920 regions[i].mr_start += PAGE_MASK; 1921 regions[i].mr_start &= ~PAGE_MASK; 1922 regions[i].mr_size &= ~PAGE_MASK; 1923 } 1924 if (bootverbose) 1925 for (int i = 0; i < pregions_sz; i++) 1926 printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n", 1927 i, pregions[i].mr_start, i, pregions[i].mr_size); 1928 1929 phys_avail_count = 0; 1930 physsz = 0; 1931 hwphyssz = 0; 1932 TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); 1933 for (i = 0, j = 0; i < regions_sz; i++) { 1934 if (bootverbose) 1935 printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n", 1936 i, regions[i].mr_start, i, regions[i].mr_size); 1937 1938 if (regions[i].mr_size < PAGE_SIZE) 1939 continue; 1940 1941 if (hwphyssz != 0 && 1942 (physsz + regions[i].mr_size) >= hwphyssz) { 1943 if (physsz < hwphyssz) { 1944 phys_avail[j] = regions[i].mr_start; 1945 phys_avail[j + 1] = regions[i].mr_start + 1946 (hwphyssz - physsz); 1947 physsz = hwphyssz; 1948 phys_avail_count++; 1949 dump_avail[j] = phys_avail[j]; 1950 dump_avail[j + 1] = phys_avail[j + 1]; 1951 } 1952 break; 1953 } 1954 phys_avail[j] = regions[i].mr_start; 1955 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; 1956 dump_avail[j] = phys_avail[j]; 1957 dump_avail[j + 1] = phys_avail[j + 1]; 1958 1959 phys_avail_count++; 1960 physsz += regions[i].mr_size; 1961 j += 2; 1962 } 1963 1964 /* Check for overlap with the kernel and exception vectors */ 1965 rm_pavail = 0; 1966 for (j = 0; j < 2 * phys_avail_count; j+=2) { 1967 if (phys_avail[j] < EXC_LAST) 1968 phys_avail[j] += EXC_LAST; 1969 1970 if (phys_avail[j] >= kpstart && 1971 phys_avail[j + 1] <= kpend) { 1972 phys_avail[j] = phys_avail[j + 1] = ~0; 1973 rm_pavail++; 1974 continue; 1975 } 1976 1977 if (kpstart >= phys_avail[j] && 1978 kpstart < phys_avail[j + 1]) { 1979 if (kpend < phys_avail[j + 1]) { 1980 phys_avail[2 * phys_avail_count] = 1981 (kpend & ~PAGE_MASK) + PAGE_SIZE; 1982 phys_avail[2 * phys_avail_count + 1] = 1983 phys_avail[j + 1]; 1984 phys_avail_count++; 1985 } 1986 1987 phys_avail[j + 1] = kpstart & ~PAGE_MASK; 1988 } 1989 1990 if (kpend >= phys_avail[j] && 1991 kpend < phys_avail[j + 1]) { 1992 if (kpstart > phys_avail[j]) { 1993 phys_avail[2 * phys_avail_count] = phys_avail[j]; 1994 phys_avail[2 * phys_avail_count + 1] = 1995 kpstart & ~PAGE_MASK; 1996 phys_avail_count++; 1997 } 1998 1999 phys_avail[j] = (kpend & ~PAGE_MASK) + 2000 PAGE_SIZE; 2001 } 2002 } 2003 qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp); 2004 for (i = 0; i < 2 * 
phys_avail_count; i++) 2005 phys_avail_debug[i] = phys_avail[i]; 2006 2007 /* Remove physical available regions marked for removal (~0) */ 2008 if (rm_pavail) { 2009 phys_avail_count -= rm_pavail; 2010 for (i = 2 * phys_avail_count; 2011 i < 2*(phys_avail_count + rm_pavail); i+=2) 2012 phys_avail[i] = phys_avail[i + 1] = 0; 2013 } 2014 if (bootverbose) { 2015 printf("phys_avail ranges after filtering:\n"); 2016 for (j = 0; j < 2 * phys_avail_count; j+=2) 2017 printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", 2018 j, phys_avail[j], j + 1, phys_avail[j + 1]); 2019 } 2020 physmem = btoc(physsz); 2021 2022 /* XXX assume we're running non-virtualized and 2023 * we don't support BHYVE 2024 */ 2025 if (isa3_pid_bits == 0) 2026 isa3_pid_bits = 20; 2027 parttab_phys = moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE); 2028 validate_addr(parttab_phys, PARTTAB_SIZE); 2029 for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++) 2030 pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE)); 2031 2032 proctab_size = 1UL << PROCTAB_SIZE_SHIFT; 2033 proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size); 2034 validate_addr(proctab0pa, proctab_size); 2035 for (int i = 0; i < proctab_size/PAGE_SIZE; i++) 2036 pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE)); 2037 2038 mmu_radix_setup_pagetables(hwphyssz); 2039 } 2040 2041 static void 2042 mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end) 2043 { 2044 int i; 2045 vm_paddr_t pa; 2046 void *dpcpu; 2047 vm_offset_t va; 2048 2049 /* 2050 * Set up the Open Firmware pmap and add its mappings if not in real 2051 * mode. 2052 */ 2053 if (bootverbose) 2054 printf("%s enter\n", __func__); 2055 2056 /* 2057 * Calculate the last available physical address, and reserve the 2058 * vm_page_array (upper bound). 2059 */ 2060 Maxmem = 0; 2061 for (i = 0; phys_avail[i + 2] != 0; i += 2) 2062 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); 2063 2064 /* 2065 * Set the start and end of kva. 2066 */ 2067 virtual_avail = VM_MIN_KERNEL_ADDRESS; 2068 virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; 2069 2070 /* 2071 * Remap any early IO mappings (console framebuffer, etc.) 2072 */ 2073 bs_remap_earlyboot(); 2074 2075 /* 2076 * Allocate a kernel stack with a guard page for thread0 and map it 2077 * into the kernel page map. 2078 */ 2079 pa = allocpages(kstack_pages); 2080 va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; 2081 virtual_avail = va + kstack_pages * PAGE_SIZE; 2082 CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); 2083 thread0.td_kstack = va; 2084 for (i = 0; i < kstack_pages; i++) { 2085 mmu_radix_kenter(va, pa); 2086 pa += PAGE_SIZE; 2087 va += PAGE_SIZE; 2088 } 2089 thread0.td_kstack_pages = kstack_pages; 2090 2091 /* 2092 * Allocate virtual address space for the message buffer. 2093 */ 2094 pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT); 2095 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa); 2096 2097 /* 2098 * Allocate virtual address space for the dynamic percpu area. 2099 */ 2100 pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT); 2101 dpcpu = (void *)PHYS_TO_DMAP(pa); 2102 dpcpu_init(dpcpu, curcpu); 2103 2104 crashdumpmap = (caddr_t)virtual_avail; 2105 virtual_avail += MAXDUMPPGS * PAGE_SIZE; 2106 2107 /* 2108 * Reserve some special page table entries/VA space for temporary 2109 * mapping of pages. 
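 * (No additional reservation is made here at present; the
 * crashdumpmap window above already provides MAXDUMPPGS pages of KVA
 * for use while dumping.)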
2110 */ 2111 } 2112 2113 static void 2114 mmu_parttab_init(void) 2115 { 2116 uint64_t ptcr; 2117 2118 isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys); 2119 2120 if (bootverbose) 2121 printf("%s parttab: %p\n", __func__, isa3_parttab); 2122 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); 2123 if (bootverbose) 2124 printf("setting ptcr %lx\n", ptcr); 2125 mtspr(SPR_PTCR, ptcr); 2126 } 2127 2128 static void 2129 mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab) 2130 { 2131 uint64_t prev; 2132 2133 if (bootverbose) 2134 printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab, 2135 lpid, pagetab, proctab); 2136 prev = be64toh(isa3_parttab[lpid].pagetab); 2137 isa3_parttab[lpid].pagetab = htobe64(pagetab); 2138 isa3_parttab[lpid].proctab = htobe64(proctab); 2139 2140 if (prev & PARTTAB_HR) { 2141 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : 2142 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2143 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 2144 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2145 } else { 2146 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : 2147 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2148 } 2149 ttusync(); 2150 } 2151 2152 static void 2153 mmu_radix_parttab_init(void) 2154 { 2155 uint64_t pagetab; 2156 2157 mmu_parttab_init(); 2158 pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \ 2159 RADIX_PGD_INDEX_SHIFT | PARTTAB_HR; 2160 mmu_parttab_update(0, pagetab, 0); 2161 } 2162 2163 static void 2164 mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size) 2165 { 2166 uint64_t pagetab, proctab; 2167 2168 pagetab = be64toh(isa3_parttab[0].pagetab); 2169 proctab = proctabpa | table_size | PARTTAB_GR; 2170 mmu_parttab_update(0, pagetab, proctab); 2171 } 2172 2173 static void 2174 mmu_radix_proctab_init(void) 2175 { 2176 2177 isa3_base_pid = 1; 2178 2179 isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa); 2180 isa3_proctab->proctab0 = 2181 htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | 2182 RADIX_PGD_INDEX_SHIFT); 2183 2184 mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12); 2185 2186 __asm __volatile("ptesync" : : : "memory"); 2187 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 2188 "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); 2189 __asm __volatile("eieio; tlbsync; ptesync" : : : "memory"); 2190 if (bootverbose) 2191 printf("process table %p and kernel radix PDE: %p\n", 2192 isa3_proctab, kernel_pmap->pm_pml1); 2193 mtmsr(mfmsr() | PSL_DR ); 2194 mtmsr(mfmsr() & ~PSL_DR); 2195 kernel_pmap->pm_pid = isa3_base_pid; 2196 isa3_base_pid++; 2197 } 2198 2199 void 2200 mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2201 int advice) 2202 { 2203 struct rwlock *lock; 2204 pml1_entry_t *l1e; 2205 pml2_entry_t *l2e; 2206 pml3_entry_t oldl3e, *l3e; 2207 pt_entry_t *pte; 2208 vm_offset_t va, va_next; 2209 vm_page_t m; 2210 bool anychanged; 2211 2212 if (advice != MADV_DONTNEED && advice != MADV_FREE) 2213 return; 2214 anychanged = false; 2215 PMAP_LOCK(pmap); 2216 for (; sva < eva; sva = va_next) { 2217 l1e = pmap_pml1e(pmap, sva); 2218 if ((be64toh(*l1e) & PG_V) == 0) { 2219 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 2220 if (va_next < sva) 2221 va_next = eva; 2222 continue; 2223 } 2224 l2e = pmap_l1e_to_l2e(l1e, sva); 2225 if ((be64toh(*l2e) & PG_V) == 0) { 2226 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 2227 if (va_next < sva) 2228 va_next = eva; 2229 continue; 2230 } 2231 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 2232 if (va_next < sva) 2233 va_next = eva; 
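/* Both the L1 and L2 entries are valid; examine the 2MB (L3) entry covering sva. */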
2234 l3e = pmap_l2e_to_l3e(l2e, sva); 2235 oldl3e = be64toh(*l3e); 2236 if ((oldl3e & PG_V) == 0) 2237 continue; 2238 else if ((oldl3e & RPTE_LEAF) != 0) { 2239 if ((oldl3e & PG_MANAGED) == 0) 2240 continue; 2241 lock = NULL; 2242 if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { 2243 if (lock != NULL) 2244 rw_wunlock(lock); 2245 2246 /* 2247 * The large page mapping was destroyed. 2248 */ 2249 continue; 2250 } 2251 2252 /* 2253 * Unless the page mappings are wired, remove the 2254 * mapping to a single page so that a subsequent 2255 * access may repromote. Choosing the last page 2256 * within the address range [sva, min(va_next, eva)) 2257 * generally results in more repromotions. Since the 2258 * underlying page table page is fully populated, this 2259 * removal never frees a page table page. 2260 */ 2261 if ((oldl3e & PG_W) == 0) { 2262 va = eva; 2263 if (va > va_next) 2264 va = va_next; 2265 va -= PAGE_SIZE; 2266 KASSERT(va >= sva, 2267 ("mmu_radix_advise: no address gap")); 2268 pte = pmap_l3e_to_pte(l3e, va); 2269 KASSERT((be64toh(*pte) & PG_V) != 0, 2270 ("pmap_advise: invalid PTE")); 2271 pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL, 2272 &lock); 2273 anychanged = true; 2274 } 2275 if (lock != NULL) 2276 rw_wunlock(lock); 2277 } 2278 if (va_next > eva) 2279 va_next = eva; 2280 va = va_next; 2281 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; 2282 pte++, sva += PAGE_SIZE) { 2283 MPASS(pte == pmap_pte(pmap, sva)); 2284 2285 if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 2286 goto maybe_invlrng; 2287 else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2288 if (advice == MADV_DONTNEED) { 2289 /* 2290 * Future calls to pmap_is_modified() 2291 * can be avoided by making the page 2292 * dirty now. 2293 */ 2294 m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME); 2295 vm_page_dirty(m); 2296 } 2297 atomic_clear_long(pte, htobe64(PG_M | PG_A)); 2298 } else if ((be64toh(*pte) & PG_A) != 0) 2299 atomic_clear_long(pte, htobe64(PG_A)); 2300 else 2301 goto maybe_invlrng; 2302 anychanged = true; 2303 continue; 2304 maybe_invlrng: 2305 if (va != va_next) { 2306 anychanged = true; 2307 va = va_next; 2308 } 2309 } 2310 if (va != va_next) 2311 anychanged = true; 2312 } 2313 if (anychanged) 2314 pmap_invalidate_all(pmap); 2315 PMAP_UNLOCK(pmap); 2316 } 2317 2318 /* 2319 * Routines used in machine-dependent code 2320 */ 2321 static void 2322 mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end) 2323 { 2324 uint64_t lpcr; 2325 2326 if (bootverbose) 2327 printf("%s\n", __func__); 2328 hw_direct_map = 1; 2329 mmu_radix_early_bootstrap(start, end); 2330 if (bootverbose) 2331 printf("early bootstrap complete\n"); 2332 if (powernv_enabled) { 2333 lpcr = mfspr(SPR_LPCR); 2334 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 2335 mmu_radix_parttab_init(); 2336 mmu_radix_init_amor(); 2337 if (bootverbose) 2338 printf("powernv init complete\n"); 2339 } 2340 mmu_radix_init_iamr(); 2341 mmu_radix_proctab_init(); 2342 mmu_radix_pid_set(kernel_pmap); 2343 /* XXX assume CPU_FTR_HVMODE */ 2344 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 2345 2346 mmu_radix_late_bootstrap(start, end); 2347 numa_mem_regions(&numa_pregions, &numa_pregions_sz); 2348 if (bootverbose) 2349 printf("%s done\n", __func__); 2350 pmap_bootstrapped = 1; 2351 dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE); 2352 PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS); 2353 } 2354 2355 static void 2356 mmu_radix_cpu_bootstrap(int ap) 2357 { 2358 uint64_t lpcr; 2359 uint64_t ptcr; 2360 2361 if 
(powernv_enabled) { 2362 lpcr = mfspr(SPR_LPCR); 2363 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 2364 2365 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); 2366 mtspr(SPR_PTCR, ptcr); 2367 mmu_radix_init_amor(); 2368 } 2369 mmu_radix_init_iamr(); 2370 mmu_radix_pid_set(kernel_pmap); 2371 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 2372 } 2373 2374 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0, 2375 "2MB page mapping counters"); 2376 2377 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions); 2378 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD, 2379 &pmap_l3e_demotions, "2MB page demotions"); 2380 2381 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings); 2382 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD, 2383 &pmap_l3e_mappings, "2MB page mappings"); 2384 2385 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures); 2386 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD, 2387 &pmap_l3e_p_failures, "2MB page promotion failures"); 2388 2389 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions); 2390 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD, 2391 &pmap_l3e_promotions, "2MB page promotions"); 2392 2393 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0, 2394 "1GB page mapping counters"); 2395 2396 static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions); 2397 SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD, 2398 &pmap_l2e_demotions, "1GB page demotions"); 2399 2400 void 2401 mmu_radix_clear_modify(vm_page_t m) 2402 { 2403 struct md_page *pvh; 2404 pmap_t pmap; 2405 pv_entry_t next_pv, pv; 2406 pml3_entry_t oldl3e, *l3e; 2407 pt_entry_t oldpte, *pte; 2408 struct rwlock *lock; 2409 vm_offset_t va; 2410 int md_gen, pvh_gen; 2411 2412 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2413 ("pmap_clear_modify: page %p is not managed", m)); 2414 vm_page_assert_busied(m); 2415 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 2416 2417 /* 2418 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 2419 * If the object containing the page is locked and the page is not 2420 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 2421 */ 2422 if ((m->a.flags & PGA_WRITEABLE) == 0) 2423 return; 2424 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2425 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2426 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 2427 rw_wlock(lock); 2428 restart: 2429 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { 2430 pmap = PV_PMAP(pv); 2431 if (!PMAP_TRYLOCK(pmap)) { 2432 pvh_gen = pvh->pv_gen; 2433 rw_wunlock(lock); 2434 PMAP_LOCK(pmap); 2435 rw_wlock(lock); 2436 if (pvh_gen != pvh->pv_gen) { 2437 PMAP_UNLOCK(pmap); 2438 goto restart; 2439 } 2440 } 2441 va = pv->pv_va; 2442 l3e = pmap_pml3e(pmap, va); 2443 oldl3e = be64toh(*l3e); 2444 if ((oldl3e & PG_RW) != 0 && 2445 pmap_demote_l3e_locked(pmap, l3e, va, &lock) && 2446 (oldl3e & PG_W) == 0) { 2447 /* 2448 * Write protect the mapping to a 2449 * single page so that a subsequent 2450 * write access may repromote. 
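 * The demotion above left 512 identical 4KB mappings; clearing
 * PG_RW and PG_M on the single PTE that maps this page is enough to
 * catch future modifications.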
2451 */ 2452 va += VM_PAGE_TO_PHYS(m) - (oldl3e & 2453 PG_PS_FRAME); 2454 pte = pmap_l3e_to_pte(l3e, va); 2455 oldpte = be64toh(*pte); 2456 while (!atomic_cmpset_long(pte, 2457 htobe64(oldpte), 2458 htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW)))) 2459 oldpte = be64toh(*pte); 2460 vm_page_dirty(m); 2461 pmap_invalidate_page(pmap, va); 2462 } 2463 PMAP_UNLOCK(pmap); 2464 } 2465 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 2466 pmap = PV_PMAP(pv); 2467 if (!PMAP_TRYLOCK(pmap)) { 2468 md_gen = m->md.pv_gen; 2469 pvh_gen = pvh->pv_gen; 2470 rw_wunlock(lock); 2471 PMAP_LOCK(pmap); 2472 rw_wlock(lock); 2473 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 2474 PMAP_UNLOCK(pmap); 2475 goto restart; 2476 } 2477 } 2478 l3e = pmap_pml3e(pmap, pv->pv_va); 2479 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found" 2480 " a 2mpage in page %p's pv list", m)); 2481 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 2482 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2483 atomic_clear_long(pte, htobe64(PG_M)); 2484 pmap_invalidate_page(pmap, pv->pv_va); 2485 } 2486 PMAP_UNLOCK(pmap); 2487 } 2488 rw_wunlock(lock); 2489 } 2490 2491 void 2492 mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 2493 vm_size_t len, vm_offset_t src_addr) 2494 { 2495 struct rwlock *lock; 2496 struct spglist free; 2497 vm_offset_t addr; 2498 vm_offset_t end_addr = src_addr + len; 2499 vm_offset_t va_next; 2500 vm_page_t dst_pdpg, dstmpte, srcmpte; 2501 bool invalidate_all; 2502 2503 CTR6(KTR_PMAP, 2504 "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n", 2505 __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); 2506 2507 if (dst_addr != src_addr) 2508 return; 2509 lock = NULL; 2510 invalidate_all = false; 2511 if (dst_pmap < src_pmap) { 2512 PMAP_LOCK(dst_pmap); 2513 PMAP_LOCK(src_pmap); 2514 } else { 2515 PMAP_LOCK(src_pmap); 2516 PMAP_LOCK(dst_pmap); 2517 } 2518 2519 for (addr = src_addr; addr < end_addr; addr = va_next) { 2520 pml1_entry_t *l1e; 2521 pml2_entry_t *l2e; 2522 pml3_entry_t srcptepaddr, *l3e; 2523 pt_entry_t *src_pte, *dst_pte; 2524 2525 l1e = pmap_pml1e(src_pmap, addr); 2526 if ((be64toh(*l1e) & PG_V) == 0) { 2527 va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 2528 if (va_next < addr) 2529 va_next = end_addr; 2530 continue; 2531 } 2532 2533 l2e = pmap_l1e_to_l2e(l1e, addr); 2534 if ((be64toh(*l2e) & PG_V) == 0) { 2535 va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 2536 if (va_next < addr) 2537 va_next = end_addr; 2538 continue; 2539 } 2540 2541 va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 2542 if (va_next < addr) 2543 va_next = end_addr; 2544 2545 l3e = pmap_l2e_to_l3e(l2e, addr); 2546 srcptepaddr = be64toh(*l3e); 2547 if (srcptepaddr == 0) 2548 continue; 2549 2550 if (srcptepaddr & RPTE_LEAF) { 2551 if ((addr & L3_PAGE_MASK) != 0 || 2552 addr + L3_PAGE_SIZE > end_addr) 2553 continue; 2554 dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL); 2555 if (dst_pdpg == NULL) 2556 break; 2557 l3e = (pml3_entry_t *) 2558 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); 2559 l3e = &l3e[pmap_pml3e_index(addr)]; 2560 if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 2561 pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr, 2562 PMAP_ENTER_NORECLAIM, &lock))) { 2563 *l3e = htobe64(srcptepaddr & ~PG_W); 2564 pmap_resident_count_inc(dst_pmap, 2565 L3_PAGE_SIZE / PAGE_SIZE); 2566 counter_u64_add(pmap_l3e_mappings, 1); 2567 } else 2568 dst_pdpg->ref_count--; 2569 continue; 2570 } 2571 2572 srcptepaddr &= PG_FRAME; 2573 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 2574 
KASSERT(srcmpte->ref_count > 0, 2575 ("pmap_copy: source page table page is unused")); 2576 2577 if (va_next > end_addr) 2578 va_next = end_addr; 2579 2580 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 2581 src_pte = &src_pte[pmap_pte_index(addr)]; 2582 dstmpte = NULL; 2583 while (addr < va_next) { 2584 pt_entry_t ptetemp; 2585 ptetemp = be64toh(*src_pte); 2586 /* 2587 * we only virtual copy managed pages 2588 */ 2589 if ((ptetemp & PG_MANAGED) != 0) { 2590 if (dstmpte != NULL && 2591 dstmpte->pindex == pmap_l3e_pindex(addr)) 2592 dstmpte->ref_count++; 2593 else if ((dstmpte = pmap_allocpte(dst_pmap, 2594 addr, NULL)) == NULL) 2595 goto out; 2596 dst_pte = (pt_entry_t *) 2597 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 2598 dst_pte = &dst_pte[pmap_pte_index(addr)]; 2599 if (be64toh(*dst_pte) == 0 && 2600 pmap_try_insert_pv_entry(dst_pmap, addr, 2601 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 2602 &lock)) { 2603 /* 2604 * Clear the wired, modified, and 2605 * accessed (referenced) bits 2606 * during the copy. 2607 */ 2608 *dst_pte = htobe64(ptetemp & ~(PG_W | PG_M | 2609 PG_A)); 2610 pmap_resident_count_inc(dst_pmap, 1); 2611 } else { 2612 SLIST_INIT(&free); 2613 if (pmap_unwire_ptp(dst_pmap, addr, 2614 dstmpte, &free)) { 2615 /* 2616 * Although "addr" is not 2617 * mapped, paging-structure 2618 * caches could nonetheless 2619 * have entries that refer to 2620 * the freed page table pages. 2621 * Invalidate those entries. 2622 */ 2623 invalidate_all = true; 2624 vm_page_free_pages_toq(&free, 2625 true); 2626 } 2627 goto out; 2628 } 2629 if (dstmpte->ref_count >= srcmpte->ref_count) 2630 break; 2631 } 2632 addr += PAGE_SIZE; 2633 if (__predict_false((addr & L3_PAGE_MASK) == 0)) 2634 src_pte = pmap_pte(src_pmap, addr); 2635 else 2636 src_pte++; 2637 } 2638 } 2639 out: 2640 if (invalidate_all) 2641 pmap_invalidate_all(dst_pmap); 2642 if (lock != NULL) 2643 rw_wunlock(lock); 2644 PMAP_UNLOCK(src_pmap); 2645 PMAP_UNLOCK(dst_pmap); 2646 } 2647 2648 static void 2649 mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst) 2650 { 2651 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 2652 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 2653 2654 CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); 2655 /* 2656 * XXX slow 2657 */ 2658 bcopy((void *)src, (void *)dst, PAGE_SIZE); 2659 } 2660 2661 static void 2662 mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 2663 vm_offset_t b_offset, int xfersize) 2664 { 2665 void *a_cp, *b_cp; 2666 vm_offset_t a_pg_offset, b_pg_offset; 2667 int cnt; 2668 2669 CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, 2670 a_offset, mb, b_offset, xfersize); 2671 2672 while (xfersize > 0) { 2673 a_pg_offset = a_offset & PAGE_MASK; 2674 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 2675 a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( 2676 VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + 2677 a_pg_offset; 2678 b_pg_offset = b_offset & PAGE_MASK; 2679 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 2680 b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( 2681 VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + 2682 b_pg_offset; 2683 bcopy(a_cp, b_cp, cnt); 2684 a_offset += cnt; 2685 b_offset += cnt; 2686 xfersize -= cnt; 2687 } 2688 } 2689 2690 #if VM_NRESERVLEVEL > 0 2691 /* 2692 * Tries to promote the 512, contiguous 4KB page mappings that are within a 2693 * single page table page (PTP) to a single 2MB page mapping. 
For promotion 2694 * to occur, two conditions must be met: (1) the 4KB page mappings must map 2695 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 2696 * identical characteristics. 2697 */ 2698 static int 2699 pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va, 2700 struct rwlock **lockp) 2701 { 2702 pml3_entry_t newpde; 2703 pt_entry_t *firstpte, oldpte, pa, *pte; 2704 vm_page_t mpte; 2705 2706 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2707 2708 /* 2709 * Examine the first PTE in the specified PTP. Abort if this PTE is 2710 * either invalid, unused, or does not map the first 4KB physical page 2711 * within a 2MB page. 2712 */ 2713 firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME); 2714 setpde: 2715 newpde = be64toh(*firstpte); 2716 if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 2717 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2718 " in pmap %p", va, pmap); 2719 goto fail; 2720 } 2721 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 2722 /* 2723 * When PG_M is already clear, PG_RW can be cleared without 2724 * a TLB invalidation. 2725 */ 2726 if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W))) 2727 goto setpde; 2728 newpde &= ~RPTE_EAA_W; 2729 } 2730 2731 /* 2732 * Examine each of the other PTEs in the specified PTP. Abort if this 2733 * PTE maps an unexpected 4KB physical page or does not have identical 2734 * characteristics to the first PTE. 2735 */ 2736 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE; 2737 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 2738 setpte: 2739 oldpte = be64toh(*pte); 2740 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 2741 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2742 " in pmap %p", va, pmap); 2743 goto fail; 2744 } 2745 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 2746 /* 2747 * When PG_M is already clear, PG_RW can be cleared 2748 * without a TLB invalidation. 2749 */ 2750 if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W))) 2751 goto setpte; 2752 oldpte &= ~RPTE_EAA_W; 2753 CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx" 2754 " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) | 2755 (va & ~L3_PAGE_MASK), pmap); 2756 } 2757 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 2758 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2759 " in pmap %p", va, pmap); 2760 goto fail; 2761 } 2762 pa -= PAGE_SIZE; 2763 } 2764 2765 /* 2766 * Save the page table page in its current state until the PDE 2767 * mapping the superpage is demoted by pmap_demote_pde() or 2768 * destroyed by pmap_remove_pde(). 2769 */ 2770 mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME); 2771 KASSERT(mpte >= vm_page_array && 2772 mpte < &vm_page_array[vm_page_array_size], 2773 ("pmap_promote_l3e: page table page is out of range")); 2774 KASSERT(mpte->pindex == pmap_l3e_pindex(va), 2775 ("pmap_promote_l3e: page table page's pindex is wrong")); 2776 if (pmap_insert_pt_page(pmap, mpte)) { 2777 CTR2(KTR_PMAP, 2778 "pmap_promote_l3e: failure for va %#lx in pmap %p", va, 2779 pmap); 2780 goto fail; 2781 } 2782 2783 /* 2784 * Promote the pv entries. 
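 * For a managed mapping, the 512 pv entries for the 4KB mappings are
 * replaced by a single entry on the 2MB page's pv list; see
 * pmap_pv_promote_l3e().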
2785 */ 2786 if ((newpde & PG_MANAGED) != 0) 2787 pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp); 2788 2789 pte_store(pde, PG_PROMOTED | newpde); 2790 ptesync(); 2791 counter_u64_add(pmap_l3e_promotions, 1); 2792 CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx" 2793 " in pmap %p", va, pmap); 2794 return (0); 2795 fail: 2796 counter_u64_add(pmap_l3e_p_failures, 1); 2797 return (KERN_FAILURE); 2798 } 2799 #endif /* VM_NRESERVLEVEL > 0 */ 2800 2801 int 2802 mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, 2803 vm_prot_t prot, u_int flags, int8_t psind) 2804 { 2805 struct rwlock *lock; 2806 pml3_entry_t *l3e; 2807 pt_entry_t *pte; 2808 pt_entry_t newpte, origpte; 2809 pv_entry_t pv; 2810 vm_paddr_t opa, pa; 2811 vm_page_t mpte, om; 2812 int rv, retrycount; 2813 boolean_t nosleep, invalidate_all, invalidate_page; 2814 2815 va = trunc_page(va); 2816 retrycount = 0; 2817 invalidate_page = invalidate_all = false; 2818 CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va, 2819 m, prot, flags, psind); 2820 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 2821 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 2822 ("pmap_enter: managed mapping within the clean submap")); 2823 if ((m->oflags & VPO_UNMANAGED) == 0) 2824 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2825 2826 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 2827 ("pmap_enter: flags %u has reserved bits set", flags)); 2828 pa = VM_PAGE_TO_PHYS(m); 2829 newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF); 2830 if ((flags & VM_PROT_WRITE) != 0) 2831 newpte |= PG_M; 2832 if ((flags & VM_PROT_READ) != 0) 2833 newpte |= PG_A; 2834 if (prot & VM_PROT_READ) 2835 newpte |= RPTE_EAA_R; 2836 if ((prot & VM_PROT_WRITE) != 0) 2837 newpte |= RPTE_EAA_W; 2838 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 2839 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 2840 2841 if (prot & VM_PROT_EXECUTE) 2842 newpte |= PG_X; 2843 if ((flags & PMAP_ENTER_WIRED) != 0) 2844 newpte |= PG_W; 2845 if (va >= DMAP_MIN_ADDRESS) 2846 newpte |= RPTE_EAA_P; 2847 newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs); 2848 /* 2849 * Set modified bit gratuitously for writeable mappings if 2850 * the page is unmanaged. We do not want to take a fault 2851 * to do the dirty bit accounting for these mappings. 2852 */ 2853 if ((m->oflags & VPO_UNMANAGED) != 0) { 2854 if ((newpte & PG_RW) != 0) 2855 newpte |= PG_M; 2856 } else 2857 newpte |= PG_MANAGED; 2858 2859 lock = NULL; 2860 PMAP_LOCK(pmap); 2861 if (psind == 1) { 2862 /* Assert the required virtual and physical alignment. */ 2863 KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned")); 2864 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2865 rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock); 2866 goto out; 2867 } 2868 mpte = NULL; 2869 2870 /* 2871 * In the case that a page table page is not 2872 * resident, we are creating it here. 2873 */ 2874 retry: 2875 l3e = pmap_pml3e(pmap, va); 2876 if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 || 2877 pmap_demote_l3e_locked(pmap, l3e, va, &lock))) { 2878 pte = pmap_l3e_to_pte(l3e, va); 2879 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 2880 mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); 2881 mpte->ref_count++; 2882 } 2883 } else if (va < VM_MAXUSER_ADDRESS) { 2884 /* 2885 * Here if the pte page isn't mapped, or if it has been 2886 * deallocated. 
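 * Allocate a new page table page. Unless PMAP_ENTER_NOSLEEP was
 * specified, _pmap_allocpte() may drop the locks and sleep, so the
 * L3 entry lookup is retried from the top afterwards.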
2887 */ 2888 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2889 mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va), 2890 nosleep ? NULL : &lock); 2891 if (mpte == NULL && nosleep) { 2892 rv = KERN_RESOURCE_SHORTAGE; 2893 goto out; 2894 } 2895 if (__predict_false(retrycount++ == 6)) 2896 panic("too many retries"); 2897 invalidate_all = true; 2898 goto retry; 2899 } else 2900 panic("pmap_enter: invalid page directory va=%#lx", va); 2901 2902 origpte = be64toh(*pte); 2903 pv = NULL; 2904 2905 /* 2906 * Is the specified virtual address already mapped? 2907 */ 2908 if ((origpte & PG_V) != 0) { 2909 #ifdef INVARIANTS 2910 if (VERBOSE_PMAP || pmap_logging) { 2911 printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --" 2912 " asid=%lu curpid=%d name=%s origpte0x%lx\n", 2913 pmap, va, m, prot, flags, psind, pmap->pm_pid, 2914 curproc->p_pid, curproc->p_comm, origpte); 2915 pmap_pte_walk(pmap->pm_pml1, va); 2916 } 2917 #endif 2918 /* 2919 * Wiring change, just update stats. We don't worry about 2920 * wiring PT pages as they remain resident as long as there 2921 * are valid mappings in them. Hence, if a user page is wired, 2922 * the PT page will be also. 2923 */ 2924 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 2925 pmap->pm_stats.wired_count++; 2926 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 2927 pmap->pm_stats.wired_count--; 2928 2929 /* 2930 * Remove the extra PT page reference. 2931 */ 2932 if (mpte != NULL) { 2933 mpte->ref_count--; 2934 KASSERT(mpte->ref_count > 0, 2935 ("pmap_enter: missing reference to page table page," 2936 " va: 0x%lx", va)); 2937 } 2938 2939 /* 2940 * Has the physical page changed? 2941 */ 2942 opa = origpte & PG_FRAME; 2943 if (opa == pa) { 2944 /* 2945 * No, might be a protection or wiring change. 2946 */ 2947 if ((origpte & PG_MANAGED) != 0 && 2948 (newpte & PG_RW) != 0) 2949 vm_page_aflag_set(m, PGA_WRITEABLE); 2950 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) { 2951 if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) { 2952 if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte))) 2953 goto retry; 2954 if ((newpte & PG_M) != (origpte & PG_M)) 2955 vm_page_dirty(m); 2956 if ((newpte & PG_A) != (origpte & PG_A)) 2957 vm_page_aflag_set(m, PGA_REFERENCED); 2958 ptesync(); 2959 } else 2960 invalidate_all = true; 2961 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 2962 goto unchanged; 2963 } 2964 goto validate; 2965 } 2966 2967 /* 2968 * The physical page has changed. Temporarily invalidate 2969 * the mapping. This ensures that all threads sharing the 2970 * pmap keep a consistent view of the mapping, which is 2971 * necessary for the correct handling of COW faults. It 2972 * also permits reuse of the old mapping's PV entry, 2973 * avoiding an allocation. 2974 * 2975 * For consistency, handle unmanaged mappings the same way. 2976 */ 2977 origpte = be64toh(pte_load_clear(pte)); 2978 KASSERT((origpte & PG_FRAME) == opa, 2979 ("pmap_enter: unexpected pa update for %#lx", va)); 2980 if ((origpte & PG_MANAGED) != 0) { 2981 om = PHYS_TO_VM_PAGE(opa); 2982 2983 /* 2984 * The pmap lock is sufficient to synchronize with 2985 * concurrent calls to pmap_page_test_mappings() and 2986 * pmap_ts_referenced(). 
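 * Consequently, the referenced and modified state harvested from the
 * old PTE just below cannot be lost to a concurrent reader.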
2987 */ 2988 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2989 vm_page_dirty(om); 2990 if ((origpte & PG_A) != 0) 2991 vm_page_aflag_set(om, PGA_REFERENCED); 2992 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2993 pv = pmap_pvh_remove(&om->md, pmap, va); 2994 if ((newpte & PG_MANAGED) == 0) 2995 free_pv_entry(pmap, pv); 2996 #ifdef INVARIANTS 2997 else if (origpte & PG_MANAGED) { 2998 if (pv == NULL) { 2999 pmap_page_print_mappings(om); 3000 MPASS(pv != NULL); 3001 } 3002 } 3003 #endif 3004 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3005 TAILQ_EMPTY(&om->md.pv_list) && 3006 ((om->flags & PG_FICTITIOUS) != 0 || 3007 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3008 vm_page_aflag_clear(om, PGA_WRITEABLE); 3009 } 3010 if ((origpte & PG_A) != 0) 3011 invalidate_page = true; 3012 origpte = 0; 3013 } else { 3014 if (pmap != kernel_pmap) { 3015 #ifdef INVARIANTS 3016 if (VERBOSE_PMAP || pmap_logging) 3017 printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n", 3018 pmap, va, m, prot, flags, psind, 3019 pmap->pm_pid, curproc->p_pid, 3020 curproc->p_comm); 3021 #endif 3022 } 3023 3024 /* 3025 * Increment the counters. 3026 */ 3027 if ((newpte & PG_W) != 0) 3028 pmap->pm_stats.wired_count++; 3029 pmap_resident_count_inc(pmap, 1); 3030 } 3031 3032 /* 3033 * Enter on the PV list if part of our managed memory. 3034 */ 3035 if ((newpte & PG_MANAGED) != 0) { 3036 if (pv == NULL) { 3037 pv = get_pv_entry(pmap, &lock); 3038 pv->pv_va = va; 3039 } 3040 #ifdef VERBOSE_PV 3041 else 3042 printf("reassigning pv: %p to pmap: %p\n", 3043 pv, pmap); 3044 #endif 3045 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3046 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 3047 m->md.pv_gen++; 3048 if ((newpte & PG_RW) != 0) 3049 vm_page_aflag_set(m, PGA_WRITEABLE); 3050 } 3051 3052 /* 3053 * Update the PTE. 3054 */ 3055 if ((origpte & PG_V) != 0) { 3056 validate: 3057 origpte = be64toh(pte_load_store(pte, htobe64(newpte))); 3058 KASSERT((origpte & PG_FRAME) == pa, 3059 ("pmap_enter: unexpected pa update for %#lx", va)); 3060 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 3061 (PG_M | PG_RW)) { 3062 if ((origpte & PG_MANAGED) != 0) 3063 vm_page_dirty(m); 3064 invalidate_page = true; 3065 3066 /* 3067 * Although the PTE may still have PG_RW set, TLB 3068 * invalidation may nonetheless be required because 3069 * the PTE no longer has PG_M set. 3070 */ 3071 } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) { 3072 /* 3073 * Removing capabilities requires invalidation on POWER 3074 */ 3075 invalidate_page = true; 3076 goto unchanged; 3077 } 3078 if ((origpte & PG_A) != 0) 3079 invalidate_page = true; 3080 } else { 3081 pte_store(pte, newpte); 3082 ptesync(); 3083 } 3084 unchanged: 3085 3086 #if VM_NRESERVLEVEL > 0 3087 /* 3088 * If both the page table page and the reservation are fully 3089 * populated, then attempt promotion. 3090 */ 3091 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 3092 mmu_radix_ps_enabled(pmap) && 3093 (m->flags & PG_FICTITIOUS) == 0 && 3094 vm_reserv_level_iffullpop(m) == 0 && 3095 pmap_promote_l3e(pmap, l3e, va, &lock) == 0) 3096 invalidate_all = true; 3097 #endif 3098 if (invalidate_all) 3099 pmap_invalidate_all(pmap); 3100 else if (invalidate_page) 3101 pmap_invalidate_page(pmap, va); 3102 3103 rv = KERN_SUCCESS; 3104 out: 3105 if (lock != NULL) 3106 rw_wunlock(lock); 3107 PMAP_UNLOCK(pmap); 3108 3109 return (rv); 3110 } 3111 3112 /* 3113 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 3114 * if successful. 
Returns false if (1) a page table page cannot be allocated 3115 * without sleeping, (2) a mapping already exists at the specified virtual 3116 * address, or (3) a PV entry cannot be allocated without reclaiming another 3117 * PV entry. 3118 */ 3119 static bool 3120 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3121 struct rwlock **lockp) 3122 { 3123 pml3_entry_t newpde; 3124 3125 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3126 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) | 3127 RPTE_LEAF | PG_V; 3128 if ((m->oflags & VPO_UNMANAGED) == 0) 3129 newpde |= PG_MANAGED; 3130 if (prot & VM_PROT_EXECUTE) 3131 newpde |= PG_X; 3132 if (prot & VM_PROT_READ) 3133 newpde |= RPTE_EAA_R; 3134 if (va >= DMAP_MIN_ADDRESS) 3135 newpde |= RPTE_EAA_P; 3136 return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 3137 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 3138 KERN_SUCCESS); 3139 } 3140 3141 /* 3142 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3143 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 3144 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 3145 * a mapping already exists at the specified virtual address. Returns 3146 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 3147 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 3148 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 3149 * 3150 * The parameter "m" is only used when creating a managed, writeable mapping. 3151 */ 3152 static int 3153 pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, 3154 vm_page_t m, struct rwlock **lockp) 3155 { 3156 struct spglist free; 3157 pml3_entry_t oldl3e, *l3e; 3158 vm_page_t mt, pdpg; 3159 3160 KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, 3161 ("pmap_enter_pde: newpde is missing PG_M")); 3162 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3163 3164 if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 3165 NULL : lockp)) == NULL) { 3166 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3167 " in pmap %p", va, pmap); 3168 return (KERN_RESOURCE_SHORTAGE); 3169 } 3170 l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 3171 l3e = &l3e[pmap_pml3e_index(va)]; 3172 oldl3e = be64toh(*l3e); 3173 if ((oldl3e & PG_V) != 0) { 3174 KASSERT(pdpg->ref_count > 1, 3175 ("pmap_enter_pde: pdpg's wire count is too low")); 3176 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3177 pdpg->ref_count--; 3178 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3179 " in pmap %p", va, pmap); 3180 return (KERN_FAILURE); 3181 } 3182 /* Break the existing mapping(s). */ 3183 SLIST_INIT(&free); 3184 if ((oldl3e & RPTE_LEAF) != 0) { 3185 /* 3186 * The reference to the PD page that was acquired by 3187 * pmap_allocl3e() ensures that it won't be freed. 3188 * However, if the PDE resulted from a promotion, then 3189 * a reserved PT page could be freed. 3190 */ 3191 (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp); 3192 pmap_invalidate_l3e_page(pmap, va, oldl3e); 3193 } else { 3194 if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e, 3195 &free, lockp)) 3196 pmap_invalidate_all(pmap); 3197 } 3198 vm_page_free_pages_toq(&free, true); 3199 if (va >= VM_MAXUSER_ADDRESS) { 3200 mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); 3201 if (pmap_insert_pt_page(pmap, mt)) { 3202 /* 3203 * XXX Currently, this can't happen because 3204 * we do not perform pmap_enter(psind == 1) 3205 * on the kernel pmap. 
3206 */ 3207 panic("pmap_enter_pde: trie insert failed"); 3208 } 3209 } else 3210 KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p", 3211 l3e)); 3212 } 3213 if ((newpde & PG_MANAGED) != 0) { 3214 /* 3215 * Abort this mapping if its PV entry could not be created. 3216 */ 3217 if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) { 3218 SLIST_INIT(&free); 3219 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 3220 /* 3221 * Although "va" is not mapped, paging- 3222 * structure caches could nonetheless have 3223 * entries that refer to the freed page table 3224 * pages. Invalidate those entries. 3225 */ 3226 pmap_invalidate_page(pmap, va); 3227 vm_page_free_pages_toq(&free, true); 3228 } 3229 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3230 " in pmap %p", va, pmap); 3231 return (KERN_RESOURCE_SHORTAGE); 3232 } 3233 if ((newpde & PG_RW) != 0) { 3234 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 3235 vm_page_aflag_set(mt, PGA_WRITEABLE); 3236 } 3237 } 3238 3239 /* 3240 * Increment counters. 3241 */ 3242 if ((newpde & PG_W) != 0) 3243 pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE; 3244 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); 3245 3246 /* 3247 * Map the superpage. (This is not a promoted mapping; there will not 3248 * be any lingering 4KB page mappings in the TLB.) 3249 */ 3250 pte_store(l3e, newpde); 3251 ptesync(); 3252 3253 counter_u64_add(pmap_l3e_mappings, 1); 3254 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3255 " in pmap %p", va, pmap); 3256 return (KERN_SUCCESS); 3257 } 3258 3259 void 3260 mmu_radix_enter_object(pmap_t pmap, vm_offset_t start, 3261 vm_offset_t end, vm_page_t m_start, vm_prot_t prot) 3262 { 3263 3264 struct rwlock *lock; 3265 vm_offset_t va; 3266 vm_page_t m, mpte; 3267 vm_pindex_t diff, psize; 3268 bool invalidate; 3269 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3270 3271 CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, 3272 end, m_start, prot); 3273 3274 invalidate = false; 3275 psize = atop(end - start); 3276 mpte = NULL; 3277 m = m_start; 3278 lock = NULL; 3279 PMAP_LOCK(pmap); 3280 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3281 va = start + ptoa(diff); 3282 if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end && 3283 m->psind == 1 && mmu_radix_ps_enabled(pmap) && 3284 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3285 m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1]; 3286 else 3287 mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot, 3288 mpte, &lock, &invalidate); 3289 m = TAILQ_NEXT(m, listq); 3290 } 3291 ptesync(); 3292 if (lock != NULL) 3293 rw_wunlock(lock); 3294 if (invalidate) 3295 pmap_invalidate_all(pmap); 3296 PMAP_UNLOCK(pmap); 3297 } 3298 3299 static vm_page_t 3300 mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3301 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate) 3302 { 3303 struct spglist free; 3304 pt_entry_t *pte; 3305 vm_paddr_t pa; 3306 3307 KASSERT(!VA_IS_CLEANMAP(va) || 3308 (m->oflags & VPO_UNMANAGED) != 0, 3309 ("mmu_radix_enter_quick_locked: managed mapping within the clean submap")); 3310 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3311 3312 /* 3313 * In the case that a page table page is not 3314 * resident, we are creating it here. 
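 * Only user addresses need a PTP allocated on demand; kernel
 * addresses are served by the preallocated kernel page tables, so
 * pmap_pte() is used directly for them below.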
3315 */ 3316 if (va < VM_MAXUSER_ADDRESS) { 3317 vm_pindex_t ptepindex; 3318 pml3_entry_t *ptepa; 3319 3320 /* 3321 * Calculate pagetable page index 3322 */ 3323 ptepindex = pmap_l3e_pindex(va); 3324 if (mpte && (mpte->pindex == ptepindex)) { 3325 mpte->ref_count++; 3326 } else { 3327 /* 3328 * Get the page directory entry 3329 */ 3330 ptepa = pmap_pml3e(pmap, va); 3331 3332 /* 3333 * If the page table page is mapped, we just increment 3334 * the hold count, and activate it. Otherwise, we 3335 * attempt to allocate a page table page. If this 3336 * attempt fails, we don't retry. Instead, we give up. 3337 */ 3338 if (ptepa && (be64toh(*ptepa) & PG_V) != 0) { 3339 if (be64toh(*ptepa) & RPTE_LEAF) 3340 return (NULL); 3341 mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME); 3342 mpte->ref_count++; 3343 } else { 3344 /* 3345 * Pass NULL instead of the PV list lock 3346 * pointer, because we don't intend to sleep. 3347 */ 3348 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 3349 if (mpte == NULL) 3350 return (mpte); 3351 } 3352 } 3353 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3354 pte = &pte[pmap_pte_index(va)]; 3355 } else { 3356 mpte = NULL; 3357 pte = pmap_pte(pmap, va); 3358 } 3359 if (be64toh(*pte)) { 3360 if (mpte != NULL) { 3361 mpte->ref_count--; 3362 mpte = NULL; 3363 } 3364 return (mpte); 3365 } 3366 3367 /* 3368 * Enter on the PV list if part of our managed memory. 3369 */ 3370 if ((m->oflags & VPO_UNMANAGED) == 0 && 3371 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3372 if (mpte != NULL) { 3373 SLIST_INIT(&free); 3374 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3375 /* 3376 * Although "va" is not mapped, paging- 3377 * structure caches could nonetheless have 3378 * entries that refer to the freed page table 3379 * pages. Invalidate those entries. 3380 */ 3381 *invalidate = true; 3382 vm_page_free_pages_toq(&free, true); 3383 } 3384 mpte = NULL; 3385 } 3386 return (mpte); 3387 } 3388 3389 /* 3390 * Increment counters 3391 */ 3392 pmap_resident_count_inc(pmap, 1); 3393 3394 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs); 3395 if (prot & VM_PROT_EXECUTE) 3396 pa |= PG_X; 3397 else 3398 pa |= RPTE_EAA_R; 3399 if ((m->oflags & VPO_UNMANAGED) == 0) 3400 pa |= PG_MANAGED; 3401 3402 pte_store(pte, pa); 3403 return (mpte); 3404 } 3405 3406 void 3407 mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, 3408 vm_prot_t prot) 3409 { 3410 struct rwlock *lock; 3411 bool invalidate; 3412 3413 lock = NULL; 3414 invalidate = false; 3415 PMAP_LOCK(pmap); 3416 mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock, 3417 &invalidate); 3418 ptesync(); 3419 if (lock != NULL) 3420 rw_wunlock(lock); 3421 if (invalidate) 3422 pmap_invalidate_all(pmap); 3423 PMAP_UNLOCK(pmap); 3424 } 3425 3426 vm_paddr_t 3427 mmu_radix_extract(pmap_t pmap, vm_offset_t va) 3428 { 3429 pml3_entry_t *l3e; 3430 pt_entry_t *pte; 3431 vm_paddr_t pa; 3432 3433 l3e = pmap_pml3e(pmap, va); 3434 if (__predict_false(l3e == NULL)) 3435 return (0); 3436 if (be64toh(*l3e) & RPTE_LEAF) { 3437 pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK); 3438 pa |= (va & L3_PAGE_MASK); 3439 } else { 3440 /* 3441 * Beware of a concurrent promotion that changes the 3442 * PDE at this point! For example, vtopte() must not 3443 * be used to access the PTE because it would use the 3444 * new PDE. It is, however, safe to use the old PDE 3445 * because the page table page is preserved by the 3446 * promotion. 
3447 */ 3448 pte = pmap_l3e_to_pte(l3e, va); 3449 if (__predict_false(pte == NULL)) 3450 return (0); 3451 pa = be64toh(*pte); 3452 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3453 pa |= (va & PAGE_MASK); 3454 } 3455 return (pa); 3456 } 3457 3458 vm_page_t 3459 mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3460 { 3461 pml3_entry_t l3e, *l3ep; 3462 pt_entry_t pte; 3463 vm_page_t m; 3464 3465 m = NULL; 3466 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); 3467 PMAP_LOCK(pmap); 3468 l3ep = pmap_pml3e(pmap, va); 3469 if (l3ep != NULL && (l3e = be64toh(*l3ep))) { 3470 if (l3e & RPTE_LEAF) { 3471 if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0) 3472 m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) | 3473 (va & L3_PAGE_MASK)); 3474 } else { 3475 /* Native endian PTE, do not pass to pmap functions */ 3476 pte = be64toh(*pmap_l3e_to_pte(l3ep, va)); 3477 if ((pte & PG_V) && 3478 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) 3479 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3480 } 3481 if (m != NULL && !vm_page_wire_mapped(m)) 3482 m = NULL; 3483 } 3484 PMAP_UNLOCK(pmap); 3485 return (m); 3486 } 3487 3488 static void 3489 mmu_radix_growkernel(vm_offset_t addr) 3490 { 3491 vm_paddr_t paddr; 3492 vm_page_t nkpg; 3493 pml3_entry_t *l3e; 3494 pml2_entry_t *l2e; 3495 3496 CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); 3497 if (VM_MIN_KERNEL_ADDRESS < addr && 3498 addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE)) 3499 return; 3500 3501 addr = roundup2(addr, L3_PAGE_SIZE); 3502 if (addr - 1 >= vm_map_max(kernel_map)) 3503 addr = vm_map_max(kernel_map); 3504 while (kernel_vm_end < addr) { 3505 l2e = pmap_pml2e(kernel_pmap, kernel_vm_end); 3506 if ((be64toh(*l2e) & PG_V) == 0) { 3507 /* We need a new PDP entry */ 3508 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT, 3509 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 3510 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3511 if (nkpg == NULL) 3512 panic("pmap_growkernel: no memory to grow kernel"); 3513 if ((nkpg->flags & PG_ZERO) == 0) 3514 mmu_radix_zero_page(nkpg); 3515 paddr = VM_PAGE_TO_PHYS(nkpg); 3516 pde_store(l2e, paddr); 3517 continue; /* try again */ 3518 } 3519 l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end); 3520 if ((be64toh(*l3e) & PG_V) != 0) { 3521 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 3522 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3523 kernel_vm_end = vm_map_max(kernel_map); 3524 break; 3525 } 3526 continue; 3527 } 3528 3529 nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end), 3530 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3531 VM_ALLOC_ZERO); 3532 if (nkpg == NULL) 3533 panic("pmap_growkernel: no memory to grow kernel"); 3534 if ((nkpg->flags & PG_ZERO) == 0) 3535 mmu_radix_zero_page(nkpg); 3536 paddr = VM_PAGE_TO_PHYS(nkpg); 3537 pde_store(l3e, paddr); 3538 3539 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 3540 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3541 kernel_vm_end = vm_map_max(kernel_map); 3542 break; 3543 } 3544 } 3545 ptesync(); 3546 } 3547 3548 static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory"); 3549 static uma_zone_t zone_radix_pgd; 3550 3551 static int 3552 radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused, 3553 int flags) 3554 { 3555 3556 for (int i = 0; i < count; i++) { 3557 vm_page_t m = vm_page_alloc_contig(NULL, 0, 3558 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3559 VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE, 3560 0, (vm_paddr_t)-1, RADIX_PGD_SIZE, 
L1_PAGE_SIZE, 3561 VM_MEMATTR_DEFAULT); 3562 /* XXX zero on alloc here so we don't have to later */ 3563 store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3564 } 3565 return (count); 3566 } 3567 3568 static void 3569 radix_pgd_release(void *arg __unused, void **store, int count) 3570 { 3571 vm_page_t m; 3572 struct spglist free; 3573 int page_count; 3574 3575 SLIST_INIT(&free); 3576 page_count = RADIX_PGD_SIZE/PAGE_SIZE; 3577 3578 for (int i = 0; i < count; i++) { 3579 /* 3580 * XXX selectively remove dmap and KVA entries so we don't 3581 * need to bzero 3582 */ 3583 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); 3584 for (int j = page_count-1; j >= 0; j--) { 3585 vm_page_unwire_noq(&m[j]); 3586 SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss); 3587 } 3588 vm_page_free_pages_toq(&free, false); 3589 } 3590 } 3591 3592 static void 3593 mmu_radix_init() 3594 { 3595 vm_page_t mpte; 3596 vm_size_t s; 3597 int error, i, pv_npg; 3598 3599 /* XXX is this really needed for POWER? */ 3600 /* L1TF, reserve page @0 unconditionally */ 3601 vm_page_blacklist_add(0, bootverbose); 3602 3603 zone_radix_pgd = uma_zcache_create("radix_pgd_cache", 3604 RADIX_PGD_SIZE, NULL, NULL, 3605 #ifdef INVARIANTS 3606 trash_init, trash_fini, 3607 #else 3608 NULL, NULL, 3609 #endif 3610 radix_pgd_import, radix_pgd_release, 3611 NULL, UMA_ZONE_NOBUCKET); 3612 3613 /* 3614 * Initialize the vm page array entries for the kernel pmap's 3615 * page table pages. 3616 */ 3617 PMAP_LOCK(kernel_pmap); 3618 for (i = 0; i < nkpt; i++) { 3619 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 3620 KASSERT(mpte >= vm_page_array && 3621 mpte < &vm_page_array[vm_page_array_size], 3622 ("pmap_init: page table page is out of range size: %lu", 3623 vm_page_array_size)); 3624 mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i; 3625 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 3626 MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte); 3627 //pmap_insert_pt_page(kernel_pmap, mpte); 3628 mpte->ref_count = 1; 3629 } 3630 PMAP_UNLOCK(kernel_pmap); 3631 vm_wire_add(nkpt); 3632 3633 CTR1(KTR_PMAP, "%s()", __func__); 3634 TAILQ_INIT(&pv_dummy.pv_list); 3635 3636 /* 3637 * Are large page mappings enabled? 3638 */ 3639 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); 3640 if (superpages_enabled) { 3641 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 3642 ("pmap_init: can't assign to pagesizes[1]")); 3643 pagesizes[1] = L3_PAGE_SIZE; 3644 } 3645 3646 /* 3647 * Initialize the pv chunk list mutex. 3648 */ 3649 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 3650 3651 /* 3652 * Initialize the pool of pv list locks. 3653 */ 3654 for (i = 0; i < NPV_LIST_LOCKS; i++) 3655 rw_init(&pv_list_locks[i], "pmap pv list"); 3656 3657 /* 3658 * Calculate the size of the pv head table for superpages. 3659 */ 3660 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE); 3661 3662 /* 3663 * Allocate memory for the pv head table for superpages. 
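 * One struct md_page is needed for each potential 2MB superpage
 * frame up to the end of the last physical segment (pv_npg entries).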
3664 */ 3665 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 3666 s = round_page(s); 3667 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 3668 for (i = 0; i < pv_npg; i++) 3669 TAILQ_INIT(&pv_table[i].pv_list); 3670 TAILQ_INIT(&pv_dummy.pv_list); 3671 3672 pmap_initialized = 1; 3673 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 3674 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3675 (vmem_addr_t *)&qframe); 3676 3677 if (error != 0) 3678 panic("qframe allocation failed"); 3679 asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits), 3680 1, 1, M_WAITOK); 3681 } 3682 3683 static boolean_t 3684 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3685 { 3686 struct rwlock *lock; 3687 pv_entry_t pv; 3688 struct md_page *pvh; 3689 pt_entry_t *pte, mask; 3690 pmap_t pmap; 3691 int md_gen, pvh_gen; 3692 boolean_t rv; 3693 3694 rv = FALSE; 3695 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3696 rw_rlock(lock); 3697 restart: 3698 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 3699 pmap = PV_PMAP(pv); 3700 if (!PMAP_TRYLOCK(pmap)) { 3701 md_gen = m->md.pv_gen; 3702 rw_runlock(lock); 3703 PMAP_LOCK(pmap); 3704 rw_rlock(lock); 3705 if (md_gen != m->md.pv_gen) { 3706 PMAP_UNLOCK(pmap); 3707 goto restart; 3708 } 3709 } 3710 pte = pmap_pte(pmap, pv->pv_va); 3711 mask = 0; 3712 if (modified) 3713 mask |= PG_RW | PG_M; 3714 if (accessed) 3715 mask |= PG_V | PG_A; 3716 rv = (be64toh(*pte) & mask) == mask; 3717 PMAP_UNLOCK(pmap); 3718 if (rv) 3719 goto out; 3720 } 3721 if ((m->flags & PG_FICTITIOUS) == 0) { 3722 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3723 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 3724 pmap = PV_PMAP(pv); 3725 if (!PMAP_TRYLOCK(pmap)) { 3726 md_gen = m->md.pv_gen; 3727 pvh_gen = pvh->pv_gen; 3728 rw_runlock(lock); 3729 PMAP_LOCK(pmap); 3730 rw_rlock(lock); 3731 if (md_gen != m->md.pv_gen || 3732 pvh_gen != pvh->pv_gen) { 3733 PMAP_UNLOCK(pmap); 3734 goto restart; 3735 } 3736 } 3737 pte = pmap_pml3e(pmap, pv->pv_va); 3738 mask = 0; 3739 if (modified) 3740 mask |= PG_RW | PG_M; 3741 if (accessed) 3742 mask |= PG_V | PG_A; 3743 rv = (be64toh(*pte) & mask) == mask; 3744 PMAP_UNLOCK(pmap); 3745 if (rv) 3746 goto out; 3747 } 3748 } 3749 out: 3750 rw_runlock(lock); 3751 return (rv); 3752 } 3753 3754 /* 3755 * pmap_is_modified: 3756 * 3757 * Return whether or not the specified physical page was modified 3758 * in any physical maps. 3759 */ 3760 boolean_t 3761 mmu_radix_is_modified(vm_page_t m) 3762 { 3763 3764 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3765 ("pmap_is_modified: page %p is not managed", m)); 3766 3767 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3768 /* 3769 * If the page is not busied then this check is racy. 
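 * A page that has never been mapped writeable cannot have PG_M set
 * in any PTE, so the pv list walk can be skipped entirely.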
3770 */ 3771 if (!pmap_page_is_write_mapped(m)) 3772 return (FALSE); 3773 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3774 } 3775 3776 boolean_t 3777 mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3778 { 3779 pml3_entry_t *l3e; 3780 pt_entry_t *pte; 3781 boolean_t rv; 3782 3783 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); 3784 rv = FALSE; 3785 PMAP_LOCK(pmap); 3786 l3e = pmap_pml3e(pmap, addr); 3787 if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) { 3788 pte = pmap_l3e_to_pte(l3e, addr); 3789 rv = (be64toh(*pte) & PG_V) == 0; 3790 } 3791 PMAP_UNLOCK(pmap); 3792 return (rv); 3793 } 3794 3795 boolean_t 3796 mmu_radix_is_referenced(vm_page_t m) 3797 { 3798 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3799 ("pmap_is_referenced: page %p is not managed", m)); 3800 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3801 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3802 } 3803 3804 /* 3805 * pmap_ts_referenced: 3806 * 3807 * Return a count of reference bits for a page, clearing those bits. 3808 * It is not necessary for every reference bit to be cleared, but it 3809 * is necessary that 0 only be returned when there are truly no 3810 * reference bits set. 3811 * 3812 * As an optimization, update the page's dirty field if a modified bit is 3813 * found while counting reference bits. This opportunistic update can be 3814 * performed at low cost and can eliminate the need for some future calls 3815 * to pmap_is_modified(). However, since this function stops after 3816 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3817 * dirty pages. Those dirty pages will only be detected by a future call 3818 * to pmap_is_modified(). 3819 * 3820 * A DI block is not needed within this function, because 3821 * invalidations are performed before the PV list lock is 3822 * released. 3823 */ 3824 boolean_t 3825 mmu_radix_ts_referenced(vm_page_t m) 3826 { 3827 struct md_page *pvh; 3828 pv_entry_t pv, pvf; 3829 pmap_t pmap; 3830 struct rwlock *lock; 3831 pml3_entry_t oldl3e, *l3e; 3832 pt_entry_t *pte; 3833 vm_paddr_t pa; 3834 int cleared, md_gen, not_cleared, pvh_gen; 3835 struct spglist free; 3836 3837 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3838 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3839 ("pmap_ts_referenced: page %p is not managed", m)); 3840 SLIST_INIT(&free); 3841 cleared = 0; 3842 pa = VM_PAGE_TO_PHYS(m); 3843 lock = PHYS_TO_PV_LIST_LOCK(pa); 3844 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 3845 rw_wlock(lock); 3846 retry: 3847 not_cleared = 0; 3848 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 3849 goto small_mappings; 3850 pv = pvf; 3851 do { 3852 if (pvf == NULL) 3853 pvf = pv; 3854 pmap = PV_PMAP(pv); 3855 if (!PMAP_TRYLOCK(pmap)) { 3856 pvh_gen = pvh->pv_gen; 3857 rw_wunlock(lock); 3858 PMAP_LOCK(pmap); 3859 rw_wlock(lock); 3860 if (pvh_gen != pvh->pv_gen) { 3861 PMAP_UNLOCK(pmap); 3862 goto retry; 3863 } 3864 } 3865 l3e = pmap_pml3e(pmap, pv->pv_va); 3866 oldl3e = be64toh(*l3e); 3867 if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3868 /* 3869 * Although "oldpde" is mapping a 2MB page, because 3870 * this function is called at a 4KB page granularity, 3871 * we only update the 4KB page under test. 3872 */ 3873 vm_page_dirty(m); 3874 } 3875 if ((oldl3e & PG_A) != 0) { 3876 /* 3877 * Since this reference bit is shared by 512 4KB 3878 * pages, it should not be cleared every time it is 3879 * tested. 
Apply a simple "hash" function on the 3880 * physical page number, the virtual superpage number, 3881 * and the pmap address to select one 4KB page out of 3882 * the 512 on which testing the reference bit will 3883 * result in clearing that reference bit. This 3884 * function is designed to avoid the selection of the 3885 * same 4KB page for every 2MB page mapping. 3886 * 3887 * On demotion, a mapping that hasn't been referenced 3888 * is simply destroyed. To avoid the possibility of a 3889 * subsequent page fault on a demoted wired mapping, 3890 * always leave its reference bit set. Moreover, 3891 * since the superpage is wired, the current state of 3892 * its reference bit won't affect page replacement. 3893 */ 3894 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ 3895 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 3896 (oldl3e & PG_W) == 0) { 3897 atomic_clear_long(l3e, htobe64(PG_A)); 3898 pmap_invalidate_page(pmap, pv->pv_va); 3899 cleared++; 3900 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3901 ("inconsistent pv lock %p %p for page %p", 3902 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3903 } else 3904 not_cleared++; 3905 } 3906 PMAP_UNLOCK(pmap); 3907 /* Rotate the PV list if it has more than one entry. */ 3908 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { 3909 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 3910 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 3911 pvh->pv_gen++; 3912 } 3913 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 3914 goto out; 3915 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 3916 small_mappings: 3917 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 3918 goto out; 3919 pv = pvf; 3920 do { 3921 if (pvf == NULL) 3922 pvf = pv; 3923 pmap = PV_PMAP(pv); 3924 if (!PMAP_TRYLOCK(pmap)) { 3925 pvh_gen = pvh->pv_gen; 3926 md_gen = m->md.pv_gen; 3927 rw_wunlock(lock); 3928 PMAP_LOCK(pmap); 3929 rw_wlock(lock); 3930 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3931 PMAP_UNLOCK(pmap); 3932 goto retry; 3933 } 3934 } 3935 l3e = pmap_pml3e(pmap, pv->pv_va); 3936 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, 3937 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 3938 m)); 3939 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 3940 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3941 vm_page_dirty(m); 3942 if ((be64toh(*pte) & PG_A) != 0) { 3943 atomic_clear_long(pte, htobe64(PG_A)); 3944 pmap_invalidate_page(pmap, pv->pv_va); 3945 cleared++; 3946 } 3947 PMAP_UNLOCK(pmap); 3948 /* Rotate the PV list if it has more than one entry. 
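 * Rotating the examined entry to the tail means that a later call,
 * which may stop early once PMAP_TS_REFERENCED_MAX bits have been
 * counted, starts its scan with a different mapping.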
*/ 3949 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { 3950 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 3951 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 3952 m->md.pv_gen++; 3953 } 3954 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 3955 not_cleared < PMAP_TS_REFERENCED_MAX); 3956 out: 3957 rw_wunlock(lock); 3958 vm_page_free_pages_toq(&free, true); 3959 return (cleared + not_cleared); 3960 } 3961 3962 static vm_offset_t 3963 mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start, 3964 vm_paddr_t end, int prot __unused) 3965 { 3966 3967 CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, 3968 prot); 3969 return (PHYS_TO_DMAP(start)); 3970 } 3971 3972 void 3973 mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr, 3974 vm_object_t object, vm_pindex_t pindex, vm_size_t size) 3975 { 3976 pml3_entry_t *l3e; 3977 vm_paddr_t pa, ptepa; 3978 vm_page_t p, pdpg; 3979 vm_memattr_t ma; 3980 3981 CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, 3982 object, pindex, size); 3983 VM_OBJECT_ASSERT_WLOCKED(object); 3984 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3985 ("pmap_object_init_pt: non-device object")); 3986 /* NB: size can be logically ored with addr here */ 3987 if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) { 3988 if (!mmu_radix_ps_enabled(pmap)) 3989 return; 3990 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3991 return; 3992 p = vm_page_lookup(object, pindex); 3993 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3994 ("pmap_object_init_pt: invalid page %p", p)); 3995 ma = p->md.mdpg_cache_attrs; 3996 3997 /* 3998 * Abort the mapping if the first page is not physically 3999 * aligned to a 2MB page boundary. 4000 */ 4001 ptepa = VM_PAGE_TO_PHYS(p); 4002 if (ptepa & L3_PAGE_MASK) 4003 return; 4004 4005 /* 4006 * Skip the first page. Abort the mapping if the rest of 4007 * the pages are not physically contiguous or have differing 4008 * memory attributes. 4009 */ 4010 p = TAILQ_NEXT(p, listq); 4011 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4012 pa += PAGE_SIZE) { 4013 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4014 ("pmap_object_init_pt: invalid page %p", p)); 4015 if (pa != VM_PAGE_TO_PHYS(p) || 4016 ma != p->md.mdpg_cache_attrs) 4017 return; 4018 p = TAILQ_NEXT(p, listq); 4019 } 4020 4021 PMAP_LOCK(pmap); 4022 for (pa = ptepa | pmap_cache_bits(ma); 4023 pa < ptepa + size; pa += L3_PAGE_SIZE) { 4024 pdpg = pmap_allocl3e(pmap, addr, NULL); 4025 if (pdpg == NULL) { 4026 /* 4027 * The creation of mappings below is only an 4028 * optimization. If a page directory page 4029 * cannot be allocated without blocking, 4030 * continue on to the next mapping rather than 4031 * blocking. 4032 */ 4033 addr += L3_PAGE_SIZE; 4034 continue; 4035 } 4036 l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4037 l3e = &l3e[pmap_pml3e_index(addr)]; 4038 if ((be64toh(*l3e) & PG_V) == 0) { 4039 pa |= PG_M | PG_A | PG_RW; 4040 pte_store(l3e, pa); 4041 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); 4042 counter_u64_add(pmap_l3e_mappings, 1); 4043 } else { 4044 /* Continue on if the PDE is already valid. 
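 * Drop the reference taken by pmap_allocl3e(), since no new
 * 2MB mapping was installed under this page directory page.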
*/ 4045 pdpg->ref_count--; 4046 KASSERT(pdpg->ref_count > 0, 4047 ("pmap_object_init_pt: missing reference " 4048 "to page directory page, va: 0x%lx", addr)); 4049 } 4050 addr += L3_PAGE_SIZE; 4051 } 4052 ptesync(); 4053 PMAP_UNLOCK(pmap); 4054 } 4055 } 4056 4057 boolean_t 4058 mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m) 4059 { 4060 struct md_page *pvh; 4061 struct rwlock *lock; 4062 pv_entry_t pv; 4063 int loops = 0; 4064 boolean_t rv; 4065 4066 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4067 ("pmap_page_exists_quick: page %p is not managed", m)); 4068 CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); 4069 rv = FALSE; 4070 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4071 rw_rlock(lock); 4072 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 4073 if (PV_PMAP(pv) == pmap) { 4074 rv = TRUE; 4075 break; 4076 } 4077 loops++; 4078 if (loops >= 16) 4079 break; 4080 } 4081 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4082 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4083 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 4084 if (PV_PMAP(pv) == pmap) { 4085 rv = TRUE; 4086 break; 4087 } 4088 loops++; 4089 if (loops >= 16) 4090 break; 4091 } 4092 } 4093 rw_runlock(lock); 4094 return (rv); 4095 } 4096 4097 void 4098 mmu_radix_page_init(vm_page_t m) 4099 { 4100 4101 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 4102 TAILQ_INIT(&m->md.pv_list); 4103 m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; 4104 } 4105 4106 int 4107 mmu_radix_page_wired_mappings(vm_page_t m) 4108 { 4109 struct rwlock *lock; 4110 struct md_page *pvh; 4111 pmap_t pmap; 4112 pt_entry_t *pte; 4113 pv_entry_t pv; 4114 int count, md_gen, pvh_gen; 4115 4116 if ((m->oflags & VPO_UNMANAGED) != 0) 4117 return (0); 4118 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 4119 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4120 rw_rlock(lock); 4121 restart: 4122 count = 0; 4123 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 4124 pmap = PV_PMAP(pv); 4125 if (!PMAP_TRYLOCK(pmap)) { 4126 md_gen = m->md.pv_gen; 4127 rw_runlock(lock); 4128 PMAP_LOCK(pmap); 4129 rw_rlock(lock); 4130 if (md_gen != m->md.pv_gen) { 4131 PMAP_UNLOCK(pmap); 4132 goto restart; 4133 } 4134 } 4135 pte = pmap_pte(pmap, pv->pv_va); 4136 if ((be64toh(*pte) & PG_W) != 0) 4137 count++; 4138 PMAP_UNLOCK(pmap); 4139 } 4140 if ((m->flags & PG_FICTITIOUS) == 0) { 4141 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4142 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 4143 pmap = PV_PMAP(pv); 4144 if (!PMAP_TRYLOCK(pmap)) { 4145 md_gen = m->md.pv_gen; 4146 pvh_gen = pvh->pv_gen; 4147 rw_runlock(lock); 4148 PMAP_LOCK(pmap); 4149 rw_rlock(lock); 4150 if (md_gen != m->md.pv_gen || 4151 pvh_gen != pvh->pv_gen) { 4152 PMAP_UNLOCK(pmap); 4153 goto restart; 4154 } 4155 } 4156 pte = pmap_pml3e(pmap, pv->pv_va); 4157 if ((be64toh(*pte) & PG_W) != 0) 4158 count++; 4159 PMAP_UNLOCK(pmap); 4160 } 4161 } 4162 rw_runlock(lock); 4163 return (count); 4164 } 4165 4166 static void 4167 mmu_radix_update_proctab(int pid, pml1_entry_t l1pa) 4168 { 4169 isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT); 4170 } 4171 4172 int 4173 mmu_radix_pinit(pmap_t pmap) 4174 { 4175 vmem_addr_t pid; 4176 vm_paddr_t l1pa; 4177 4178 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4179 4180 /* 4181 * allocate the page directory page 4182 */ 4183 pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK); 4184 4185 for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++) 4186 pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE); 4187 pmap->pm_radix.rt_root = 0; 4188 TAILQ_INIT(&pmap->pm_pvchunk); 4189 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4190 pmap->pm_flags = 
PMAP_PDE_SUPERPAGE; 4191 vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid); 4192 4193 pmap->pm_pid = pid; 4194 l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1); 4195 mmu_radix_update_proctab(pid, l1pa); 4196 __asm __volatile("ptesync;isync" : : : "memory"); 4197 4198 return (1); 4199 } 4200 4201 /* 4202 * This routine is called if the desired page table page does not exist. 4203 * 4204 * If page table page allocation fails, this routine may sleep before 4205 * returning NULL. It sleeps only if a lock pointer was given. 4206 * 4207 * Note: If a page allocation fails at page table level two or three, 4208 * one or two pages may be held during the wait, only to be released 4209 * afterwards. This conservative approach is easily argued to avoid 4210 * race conditions. 4211 */ 4212 static vm_page_t 4213 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 4214 { 4215 vm_page_t m, pdppg, pdpg; 4216 4217 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4218 4219 /* 4220 * Allocate a page table page. 4221 */ 4222 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 4223 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 4224 if (lockp != NULL) { 4225 RELEASE_PV_LIST_LOCK(lockp); 4226 PMAP_UNLOCK(pmap); 4227 vm_wait(NULL); 4228 PMAP_LOCK(pmap); 4229 } 4230 /* 4231 * Indicate the need to retry. While waiting, the page table 4232 * page may have been allocated. 4233 */ 4234 return (NULL); 4235 } 4236 if ((m->flags & PG_ZERO) == 0) 4237 mmu_radix_zero_page(m); 4238 4239 /* 4240 * Map the pagetable page into the process address space, if 4241 * it isn't already there. 4242 */ 4243 4244 if (ptepindex >= (NUPDE + NUPDPE)) { 4245 pml1_entry_t *l1e; 4246 vm_pindex_t pml1index; 4247 4248 /* Wire up a new PDPE page */ 4249 pml1index = ptepindex - (NUPDE + NUPDPE); 4250 l1e = &pmap->pm_pml1[pml1index]; 4251 KASSERT((be64toh(*l1e) & PG_V) == 0, 4252 ("%s: L1 entry %#lx is valid", __func__, *l1e)); 4253 pde_store(l1e, VM_PAGE_TO_PHYS(m)); 4254 } else if (ptepindex >= NUPDE) { 4255 vm_pindex_t pml1index; 4256 vm_pindex_t pdpindex; 4257 pml1_entry_t *l1e; 4258 pml2_entry_t *l2e; 4259 4260 /* Wire up a new l2e page */ 4261 pdpindex = ptepindex - NUPDE; 4262 pml1index = pdpindex >> RPTE_SHIFT; 4263 4264 l1e = &pmap->pm_pml1[pml1index]; 4265 if ((be64toh(*l1e) & PG_V) == 0) { 4266 /* Have to allocate a new pdp, recurse */ 4267 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index, 4268 lockp) == NULL) { 4269 vm_page_unwire_noq(m); 4270 vm_page_free_zero(m); 4271 return (NULL); 4272 } 4273 } else { 4274 /* Add reference to l2e page */ 4275 pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME); 4276 pdppg->ref_count++; 4277 } 4278 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); 4279 4280 /* Now find the pdp page */ 4281 l2e = &l2e[pdpindex & RPTE_MASK]; 4282 KASSERT((be64toh(*l2e) & PG_V) == 0, 4283 ("%s: L2 entry %#lx is valid", __func__, *l2e)); 4284 pde_store(l2e, VM_PAGE_TO_PHYS(m)); 4285 } else { 4286 vm_pindex_t pml1index; 4287 vm_pindex_t pdpindex; 4288 pml1_entry_t *l1e; 4289 pml2_entry_t *l2e; 4290 pml3_entry_t *l3e; 4291 4292 /* Wire up a new PTE page */ 4293 pdpindex = ptepindex >> RPTE_SHIFT; 4294 pml1index = pdpindex >> RPTE_SHIFT; 4295 4296 /* First, find the pdp and check that its valid. 
*/ 4297 l1e = &pmap->pm_pml1[pml1index]; 4298 if ((be64toh(*l1e) & PG_V) == 0) { 4299 /* Have to allocate a new pd, recurse */ 4300 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 4301 lockp) == NULL) { 4302 vm_page_unwire_noq(m); 4303 vm_page_free_zero(m); 4304 return (NULL); 4305 } 4306 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); 4307 l2e = &l2e[pdpindex & RPTE_MASK]; 4308 } else { 4309 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); 4310 l2e = &l2e[pdpindex & RPTE_MASK]; 4311 if ((be64toh(*l2e) & PG_V) == 0) { 4312 /* Have to allocate a new pd, recurse */ 4313 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 4314 lockp) == NULL) { 4315 vm_page_unwire_noq(m); 4316 vm_page_free_zero(m); 4317 return (NULL); 4318 } 4319 } else { 4320 /* Add reference to the pd page */ 4321 pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME); 4322 pdpg->ref_count++; 4323 } 4324 } 4325 l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME); 4326 4327 /* Now we know where the page directory page is */ 4328 l3e = &l3e[ptepindex & RPTE_MASK]; 4329 KASSERT((be64toh(*l3e) & PG_V) == 0, 4330 ("%s: L3 entry %#lx is valid", __func__, *l3e)); 4331 pde_store(l3e, VM_PAGE_TO_PHYS(m)); 4332 } 4333 4334 pmap_resident_count_inc(pmap, 1); 4335 return (m); 4336 } 4337 static vm_page_t 4338 pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4339 { 4340 vm_pindex_t pdpindex, ptepindex; 4341 pml2_entry_t *pdpe; 4342 vm_page_t pdpg; 4343 4344 retry: 4345 pdpe = pmap_pml2e(pmap, va); 4346 if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) { 4347 /* Add a reference to the pd page. */ 4348 pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME); 4349 pdpg->ref_count++; 4350 } else { 4351 /* Allocate a pd page. */ 4352 ptepindex = pmap_l3e_pindex(va); 4353 pdpindex = ptepindex >> RPTE_SHIFT; 4354 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 4355 if (pdpg == NULL && lockp != NULL) 4356 goto retry; 4357 } 4358 return (pdpg); 4359 } 4360 4361 static vm_page_t 4362 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4363 { 4364 vm_pindex_t ptepindex; 4365 pml3_entry_t *pd; 4366 vm_page_t m; 4367 4368 /* 4369 * Calculate pagetable page index 4370 */ 4371 ptepindex = pmap_l3e_pindex(va); 4372 retry: 4373 /* 4374 * Get the page directory entry 4375 */ 4376 pd = pmap_pml3e(pmap, va); 4377 4378 /* 4379 * This supports switching from a 2MB page to a 4380 * normal 4K page. 4381 */ 4382 if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) { 4383 if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) { 4384 /* 4385 * Invalidation of the 2MB page mapping may have caused 4386 * the deallocation of the underlying PD page. 4387 */ 4388 pd = NULL; 4389 } 4390 } 4391 4392 /* 4393 * If the page table page is mapped, we just increment the 4394 * hold count, and activate it. 4395 */ 4396 if (pd != NULL && (be64toh(*pd) & PG_V) != 0) { 4397 m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME); 4398 m->ref_count++; 4399 } else { 4400 /* 4401 * Here if the pte page isn't mapped, or if it has been 4402 * deallocated. 
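 * _pmap_allocpte() may drop the pmap lock and sleep; when it
 * returns NULL and a lock pointer was supplied, the lookup is
 * simply retried from the top.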
4403 */ 4404 m = _pmap_allocpte(pmap, ptepindex, lockp); 4405 if (m == NULL && lockp != NULL) 4406 goto retry; 4407 } 4408 return (m); 4409 } 4410 4411 static void 4412 mmu_radix_pinit0(pmap_t pmap) 4413 { 4414 4415 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4416 PMAP_LOCK_INIT(pmap); 4417 pmap->pm_pml1 = kernel_pmap->pm_pml1; 4418 pmap->pm_pid = kernel_pmap->pm_pid; 4419 4420 pmap->pm_radix.rt_root = 0; 4421 TAILQ_INIT(&pmap->pm_pvchunk); 4422 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4423 kernel_pmap->pm_flags = 4424 pmap->pm_flags = PMAP_PDE_SUPERPAGE; 4425 } 4426 /* 4427 * pmap_protect_l3e: do the things to protect a 2mpage in a process 4428 */ 4429 static boolean_t 4430 pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot) 4431 { 4432 pt_entry_t newpde, oldpde; 4433 vm_offset_t eva, va; 4434 vm_page_t m; 4435 boolean_t anychanged; 4436 4437 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4438 KASSERT((sva & L3_PAGE_MASK) == 0, 4439 ("pmap_protect_l3e: sva is not 2mpage aligned")); 4440 anychanged = FALSE; 4441 retry: 4442 oldpde = newpde = be64toh(*l3e); 4443 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 4444 (PG_MANAGED | PG_M | PG_RW)) { 4445 eva = sva + L3_PAGE_SIZE; 4446 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4447 va < eva; va += PAGE_SIZE, m++) 4448 vm_page_dirty(m); 4449 } 4450 if ((prot & VM_PROT_WRITE) == 0) { 4451 newpde &= ~(PG_RW | PG_M); 4452 newpde |= RPTE_EAA_R; 4453 } 4454 if (prot & VM_PROT_EXECUTE) 4455 newpde |= PG_X; 4456 if (newpde != oldpde) { 4457 /* 4458 * As an optimization to future operations on this PDE, clear 4459 * PG_PROMOTED. The impending invalidation will remove any 4460 * lingering 4KB page mappings from the TLB. 4461 */ 4462 if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED))) 4463 goto retry; 4464 anychanged = TRUE; 4465 } 4466 return (anychanged); 4467 } 4468 4469 void 4470 mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 4471 vm_prot_t prot) 4472 { 4473 vm_offset_t va_next; 4474 pml1_entry_t *l1e; 4475 pml2_entry_t *l2e; 4476 pml3_entry_t ptpaddr, *l3e; 4477 pt_entry_t *pte; 4478 boolean_t anychanged; 4479 4480 CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva, 4481 prot); 4482 4483 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4484 if (prot == VM_PROT_NONE) { 4485 mmu_radix_remove(pmap, sva, eva); 4486 return; 4487 } 4488 4489 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 4490 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 4491 return; 4492 4493 #ifdef INVARIANTS 4494 if (VERBOSE_PROTECT || pmap_logging) 4495 printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n", 4496 pmap, sva, eva, prot, pmap->pm_pid); 4497 #endif 4498 anychanged = FALSE; 4499 4500 PMAP_LOCK(pmap); 4501 for (; sva < eva; sva = va_next) { 4502 l1e = pmap_pml1e(pmap, sva); 4503 if ((be64toh(*l1e) & PG_V) == 0) { 4504 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 4505 if (va_next < sva) 4506 va_next = eva; 4507 continue; 4508 } 4509 4510 l2e = pmap_l1e_to_l2e(l1e, sva); 4511 if ((be64toh(*l2e) & PG_V) == 0) { 4512 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 4513 if (va_next < sva) 4514 va_next = eva; 4515 continue; 4516 } 4517 4518 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 4519 if (va_next < sva) 4520 va_next = eva; 4521 4522 l3e = pmap_l2e_to_l3e(l2e, sva); 4523 ptpaddr = be64toh(*l3e); 4524 4525 /* 4526 * Weed out invalid mappings. 4527 */ 4528 if (ptpaddr == 0) 4529 continue; 4530 4531 /* 4532 * Check for large page. 
4533 */ 4534 if ((ptpaddr & RPTE_LEAF) != 0) { 4535 /* 4536 * Are we protecting the entire large page? If not, 4537 * demote the mapping and fall through. 4538 */ 4539 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 4540 if (pmap_protect_l3e(pmap, l3e, sva, prot)) 4541 anychanged = TRUE; 4542 continue; 4543 } else if (!pmap_demote_l3e(pmap, l3e, sva)) { 4544 /* 4545 * The large page mapping was destroyed. 4546 */ 4547 continue; 4548 } 4549 } 4550 4551 if (va_next > eva) 4552 va_next = eva; 4553 4554 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, 4555 sva += PAGE_SIZE) { 4556 pt_entry_t obits, pbits; 4557 vm_page_t m; 4558 4559 retry: 4560 MPASS(pte == pmap_pte(pmap, sva)); 4561 obits = pbits = be64toh(*pte); 4562 if ((pbits & PG_V) == 0) 4563 continue; 4564 4565 if ((prot & VM_PROT_WRITE) == 0) { 4566 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4567 (PG_MANAGED | PG_M | PG_RW)) { 4568 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4569 vm_page_dirty(m); 4570 } 4571 pbits &= ~(PG_RW | PG_M); 4572 pbits |= RPTE_EAA_R; 4573 } 4574 if (prot & VM_PROT_EXECUTE) 4575 pbits |= PG_X; 4576 4577 if (pbits != obits) { 4578 if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits))) 4579 goto retry; 4580 if (obits & (PG_A|PG_M)) { 4581 anychanged = TRUE; 4582 #ifdef INVARIANTS 4583 if (VERBOSE_PROTECT || pmap_logging) 4584 printf("%#lx %#lx -> %#lx\n", 4585 sva, obits, pbits); 4586 #endif 4587 } 4588 } 4589 } 4590 } 4591 if (anychanged) 4592 pmap_invalidate_all(pmap); 4593 PMAP_UNLOCK(pmap); 4594 } 4595 4596 void 4597 mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count) 4598 { 4599 4600 CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count); 4601 pt_entry_t oldpte, pa, *pte; 4602 vm_page_t m; 4603 uint64_t cache_bits, attr_bits; 4604 vm_offset_t va; 4605 4606 oldpte = 0; 4607 attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; 4608 va = sva; 4609 pte = kvtopte(va); 4610 while (va < sva + PAGE_SIZE * count) { 4611 if (__predict_false((va & L3_PAGE_MASK) == 0)) 4612 pte = kvtopte(va); 4613 MPASS(pte == pmap_pte(kernel_pmap, va)); 4614 4615 /* 4616 * XXX there has to be a more efficient way than traversing 4617 * the page table every time - but go for correctness for 4618 * today 4619 */ 4620 4621 m = *ma++; 4622 cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs); 4623 pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits; 4624 if (be64toh(*pte) != pa) { 4625 oldpte |= be64toh(*pte); 4626 pte_store(pte, pa); 4627 } 4628 va += PAGE_SIZE; 4629 pte++; 4630 } 4631 if (__predict_false((oldpte & RPTE_VALID) != 0)) 4632 pmap_invalidate_range(kernel_pmap, sva, sva + count * 4633 PAGE_SIZE); 4634 else 4635 ptesync(); 4636 } 4637 4638 void 4639 mmu_radix_qremove(vm_offset_t sva, int count) 4640 { 4641 vm_offset_t va; 4642 pt_entry_t *pte; 4643 4644 CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count); 4645 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva)); 4646 4647 va = sva; 4648 pte = kvtopte(va); 4649 while (va < sva + PAGE_SIZE * count) { 4650 if (__predict_false((va & L3_PAGE_MASK) == 0)) 4651 pte = kvtopte(va); 4652 pte_clear(pte); 4653 pte++; 4654 va += PAGE_SIZE; 4655 } 4656 pmap_invalidate_range(kernel_pmap, sva, va); 4657 } 4658 4659 /*************************************************** 4660 * Page table page management routines..... 4661 ***************************************************/ 4662 /* 4663 * Schedule the specified unused page table page to be freed. 
Specifically, 4664 * add the page to the specified list of pages that will be released to the 4665 * physical memory manager after the TLB has been updated. 4666 */ 4667 static __inline void 4668 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 4669 boolean_t set_PG_ZERO) 4670 { 4671 4672 if (set_PG_ZERO) 4673 m->flags |= PG_ZERO; 4674 else 4675 m->flags &= ~PG_ZERO; 4676 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4677 } 4678 4679 /* 4680 * Inserts the specified page table page into the specified pmap's collection 4681 * of idle page table pages. Each of a pmap's page table pages is responsible 4682 * for mapping a distinct range of virtual addresses. The pmap's collection is 4683 * ordered by this virtual address range. 4684 */ 4685 static __inline int 4686 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 4687 { 4688 4689 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4690 return (vm_radix_insert(&pmap->pm_radix, mpte)); 4691 } 4692 4693 /* 4694 * Removes the page table page mapping the specified virtual address from the 4695 * specified pmap's collection of idle page table pages, and returns it. 4696 * Otherwise, returns NULL if there is no page table page corresponding to the 4697 * specified virtual address. 4698 */ 4699 static __inline vm_page_t 4700 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4701 { 4702 4703 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4704 return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va))); 4705 } 4706 4707 /* 4708 * Decrements a page table page's wire count, which is used to record the 4709 * number of valid page table entries within the page. If the wire count 4710 * drops to zero, then the page table page is unmapped. Returns TRUE if the 4711 * page table page was unmapped and FALSE otherwise. 4712 */ 4713 static inline boolean_t 4714 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4715 { 4716 4717 --m->ref_count; 4718 if (m->ref_count == 0) { 4719 _pmap_unwire_ptp(pmap, va, m, free); 4720 return (TRUE); 4721 } else 4722 return (FALSE); 4723 } 4724 4725 static void 4726 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4727 { 4728 4729 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4730 /* 4731 * unmap the page table page 4732 */ 4733 if (m->pindex >= NUPDE + NUPDPE) { 4734 /* PDP page */ 4735 pml1_entry_t *pml1; 4736 pml1 = pmap_pml1e(pmap, va); 4737 *pml1 = 0; 4738 } else if (m->pindex >= NUPDE) { 4739 /* PD page */ 4740 pml2_entry_t *l2e; 4741 l2e = pmap_pml2e(pmap, va); 4742 *l2e = 0; 4743 } else { 4744 /* PTE page */ 4745 pml3_entry_t *l3e; 4746 l3e = pmap_pml3e(pmap, va); 4747 *l3e = 0; 4748 } 4749 pmap_resident_count_dec(pmap, 1); 4750 if (m->pindex < NUPDE) { 4751 /* We just released a PT, unhold the matching PD */ 4752 vm_page_t pdpg; 4753 4754 pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME); 4755 pmap_unwire_ptp(pmap, va, pdpg, free); 4756 } 4757 else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 4758 /* We just released a PD, unhold the matching PDP */ 4759 vm_page_t pdppg; 4760 4761 pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME); 4762 pmap_unwire_ptp(pmap, va, pdppg, free); 4763 } 4764 4765 /* 4766 * Put page on a list so that it is released after 4767 * *ALL* TLB shootdown is done 4768 */ 4769 pmap_add_delayed_free_list(m, free, TRUE); 4770 } 4771 4772 /* 4773 * After removing a page table entry, this routine is used to 4774 * conditionally free the page, and manage the hold/wire counts. 
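 *
 * Kernel addresses are skipped because kernel page table pages are never
 * freed here.  The return value reports whether the page table page itself
 * was released, which callers such as pmap_remove_page() use to choose a
 * full invalidation over a single-page one.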
4775 */ 4776 static int 4777 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde, 4778 struct spglist *free) 4779 { 4780 vm_page_t mpte; 4781 4782 if (va >= VM_MAXUSER_ADDRESS) 4783 return (0); 4784 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4785 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4786 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4787 } 4788 4789 void 4790 mmu_radix_release(pmap_t pmap) 4791 { 4792 4793 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4794 KASSERT(pmap->pm_stats.resident_count == 0, 4795 ("pmap_release: pmap resident count %ld != 0", 4796 pmap->pm_stats.resident_count)); 4797 KASSERT(vm_radix_is_empty(&pmap->pm_radix), 4798 ("pmap_release: pmap has reserved page table page(s)")); 4799 4800 pmap_invalidate_all(pmap); 4801 isa3_proctab[pmap->pm_pid].proctab0 = 0; 4802 uma_zfree(zone_radix_pgd, pmap->pm_pml1); 4803 vmem_free(asid_arena, pmap->pm_pid, 1); 4804 } 4805 4806 /* 4807 * Create the PV entry for a 2MB page mapping. Always returns true unless the 4808 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 4809 * false if the PV entry cannot be allocated without resorting to reclamation. 4810 */ 4811 static bool 4812 pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags, 4813 struct rwlock **lockp) 4814 { 4815 struct md_page *pvh; 4816 pv_entry_t pv; 4817 vm_paddr_t pa; 4818 4819 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4820 /* Pass NULL instead of the lock pointer to disable reclamation. */ 4821 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 4822 NULL : lockp)) == NULL) 4823 return (false); 4824 pv->pv_va = va; 4825 pa = pde & PG_PS_FRAME; 4826 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4827 pvh = pa_to_pvh(pa); 4828 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 4829 pvh->pv_gen++; 4830 return (true); 4831 } 4832 4833 /* 4834 * Fills a page table page with mappings to consecutive physical pages. 4835 */ 4836 static void 4837 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 4838 { 4839 pt_entry_t *pte; 4840 4841 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 4842 *pte = htobe64(newpte); 4843 newpte += PAGE_SIZE; 4844 } 4845 } 4846 4847 static boolean_t 4848 pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va) 4849 { 4850 struct rwlock *lock; 4851 boolean_t rv; 4852 4853 lock = NULL; 4854 rv = pmap_demote_l3e_locked(pmap, pde, va, &lock); 4855 if (lock != NULL) 4856 rw_wunlock(lock); 4857 return (rv); 4858 } 4859 4860 static boolean_t 4861 pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, 4862 struct rwlock **lockp) 4863 { 4864 pml3_entry_t oldpde; 4865 pt_entry_t *firstpte; 4866 vm_paddr_t mptepa; 4867 vm_page_t mpte; 4868 struct spglist free; 4869 vm_offset_t sva; 4870 4871 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4872 oldpde = be64toh(*l3e); 4873 KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), 4874 ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx", 4875 oldpde)); 4876 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 4877 NULL) { 4878 KASSERT((oldpde & PG_W) == 0, 4879 ("pmap_demote_l3e: page table page for a wired mapping" 4880 " is missing")); 4881 4882 /* 4883 * Invalidate the 2MB page mapping and return "failure" if the 4884 * mapping was never accessed or the allocation of the new 4885 * page table page fails. 
If the 2MB page mapping belongs to 4886 * the direct map region of the kernel's address space, then 4887 * the page allocation request specifies the highest possible 4888 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 4889 * normal. Page table pages are preallocated for every other 4890 * part of the kernel address space, so the direct map region 4891 * is the only part of the kernel address space that must be 4892 * handled here. 4893 */ 4894 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 4895 pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 4896 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 4897 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 4898 SLIST_INIT(&free); 4899 sva = trunc_2mpage(va); 4900 pmap_remove_l3e(pmap, l3e, sva, &free, lockp); 4901 pmap_invalidate_l3e_page(pmap, sva, oldpde); 4902 vm_page_free_pages_toq(&free, true); 4903 CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx" 4904 " in pmap %p", va, pmap); 4905 return (FALSE); 4906 } 4907 if (va < VM_MAXUSER_ADDRESS) 4908 pmap_resident_count_inc(pmap, 1); 4909 } 4910 mptepa = VM_PAGE_TO_PHYS(mpte); 4911 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 4912 KASSERT((oldpde & PG_A) != 0, 4913 ("pmap_demote_l3e: oldpde is missing PG_A")); 4914 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 4915 ("pmap_demote_l3e: oldpde is missing PG_M")); 4916 4917 /* 4918 * If the page table page is new, initialize it. 4919 */ 4920 if (mpte->ref_count == 1) { 4921 mpte->ref_count = NPTEPG; 4922 pmap_fill_ptp(firstpte, oldpde); 4923 } 4924 4925 KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME), 4926 ("pmap_demote_l3e: firstpte and newpte map different physical" 4927 " addresses")); 4928 4929 /* 4930 * If the mapping has changed attributes, update the page table 4931 * entries. 4932 */ 4933 if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE)) 4934 pmap_fill_ptp(firstpte, oldpde); 4935 4936 /* 4937 * The spare PV entries must be reserved prior to demoting the 4938 * mapping, that is, prior to changing the PDE. Otherwise, the state 4939 * of the PDE and the PV lists will be inconsistent, which can result 4940 * in reclaim_pv_chunk() attempting to remove a PV entry from the 4941 * wrong PV list and pmap_pv_demote_l3e() failing to find the expected 4942 * PV entry for the 2MB page mapping that is being demoted. 4943 */ 4944 if ((oldpde & PG_MANAGED) != 0) 4945 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 4946 4947 /* 4948 * Demote the mapping. This pmap is locked. The old PDE has 4949 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 4950 * set. Thus, there is no danger of a race with another 4951 * processor changing the setting of PG_A and/or PG_M between 4952 * the read above and the store below. 4953 */ 4954 pde_store(l3e, mptepa); 4955 pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde); 4956 /* 4957 * Demote the PV entry. 4958 */ 4959 if ((oldpde & PG_MANAGED) != 0) 4960 pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp); 4961 4962 counter_u64_add(pmap_l3e_demotions, 1); 4963 CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx" 4964 " in pmap %p", va, pmap); 4965 return (TRUE); 4966 } 4967 4968 /* 4969 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
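 *
 * The previously saved page table page is looked up in the pmap's
 * collection, zeroed, and reinstalled in place of the 2MB leaf, so the
 * kernel range always retains a valid (now empty) 4KB page table.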
4970 */ 4971 static void 4972 pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va) 4973 { 4974 vm_paddr_t mptepa; 4975 vm_page_t mpte; 4976 4977 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 4978 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4979 mpte = pmap_remove_pt_page(pmap, va); 4980 if (mpte == NULL) 4981 panic("pmap_remove_kernel_pde: Missing pt page."); 4982 4983 mptepa = VM_PAGE_TO_PHYS(mpte); 4984 4985 /* 4986 * Initialize the page table page. 4987 */ 4988 pagezero(PHYS_TO_DMAP(mptepa)); 4989 4990 /* 4991 * Demote the mapping. 4992 */ 4993 pde_store(l3e, mptepa); 4994 ptesync(); 4995 } 4996 4997 /* 4998 * pmap_remove_l3e: do the things to unmap a superpage in a process 4999 */ 5000 static int 5001 pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, 5002 struct spglist *free, struct rwlock **lockp) 5003 { 5004 struct md_page *pvh; 5005 pml3_entry_t oldpde; 5006 vm_offset_t eva, va; 5007 vm_page_t m, mpte; 5008 5009 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5010 KASSERT((sva & L3_PAGE_MASK) == 0, 5011 ("pmap_remove_l3e: sva is not 2mpage aligned")); 5012 oldpde = be64toh(pte_load_clear(pdq)); 5013 if (oldpde & PG_W) 5014 pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE); 5015 pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); 5016 if (oldpde & PG_MANAGED) { 5017 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 5018 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 5019 pmap_pvh_free(pvh, pmap, sva); 5020 eva = sva + L3_PAGE_SIZE; 5021 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 5022 va < eva; va += PAGE_SIZE, m++) { 5023 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5024 vm_page_dirty(m); 5025 if (oldpde & PG_A) 5026 vm_page_aflag_set(m, PGA_REFERENCED); 5027 if (TAILQ_EMPTY(&m->md.pv_list) && 5028 TAILQ_EMPTY(&pvh->pv_list)) 5029 vm_page_aflag_clear(m, PGA_WRITEABLE); 5030 } 5031 } 5032 if (pmap == kernel_pmap) { 5033 pmap_remove_kernel_l3e(pmap, pdq, sva); 5034 } else { 5035 mpte = pmap_remove_pt_page(pmap, sva); 5036 if (mpte != NULL) { 5037 pmap_resident_count_dec(pmap, 1); 5038 KASSERT(mpte->ref_count == NPTEPG, 5039 ("pmap_remove_l3e: pte page wire count error")); 5040 mpte->ref_count = 0; 5041 pmap_add_delayed_free_list(mpte, free, FALSE); 5042 } 5043 } 5044 return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free)); 5045 } 5046 5047 /* 5048 * pmap_remove_pte: do the things to unmap a page in a process 5049 */ 5050 static int 5051 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 5052 pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 5053 { 5054 struct md_page *pvh; 5055 pt_entry_t oldpte; 5056 vm_page_t m; 5057 5058 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5059 oldpte = be64toh(pte_load_clear(ptq)); 5060 if (oldpte & RPTE_WIRED) 5061 pmap->pm_stats.wired_count -= 1; 5062 pmap_resident_count_dec(pmap, 1); 5063 if (oldpte & RPTE_MANAGED) { 5064 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 5065 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5066 vm_page_dirty(m); 5067 if (oldpte & PG_A) 5068 vm_page_aflag_set(m, PGA_REFERENCED); 5069 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5070 pmap_pvh_free(&m->md, pmap, va); 5071 if (TAILQ_EMPTY(&m->md.pv_list) && 5072 (m->flags & PG_FICTITIOUS) == 0) { 5073 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5074 if (TAILQ_EMPTY(&pvh->pv_list)) 5075 vm_page_aflag_clear(m, PGA_WRITEABLE); 5076 } 5077 } 5078 return (pmap_unuse_pt(pmap, va, ptepde, free)); 5079 } 5080 5081 /* 5082 * Remove a single page from a process address space 5083 */ 5084 static bool 5085 
pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e, 5086 struct spglist *free) 5087 { 5088 struct rwlock *lock; 5089 pt_entry_t *pte; 5090 bool invalidate_all; 5091 5092 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5093 if ((be64toh(*l3e) & RPTE_VALID) == 0) { 5094 return (false); 5095 } 5096 pte = pmap_l3e_to_pte(l3e, va); 5097 if ((be64toh(*pte) & RPTE_VALID) == 0) { 5098 return (false); 5099 } 5100 lock = NULL; 5101 5102 invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock); 5103 if (lock != NULL) 5104 rw_wunlock(lock); 5105 if (!invalidate_all) 5106 pmap_invalidate_page(pmap, va); 5107 return (invalidate_all); 5108 } 5109 5110 /* 5111 * Removes the specified range of addresses from the page table page. 5112 */ 5113 static bool 5114 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 5115 pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp) 5116 { 5117 pt_entry_t *pte; 5118 vm_offset_t va; 5119 bool anyvalid; 5120 5121 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5122 anyvalid = false; 5123 va = eva; 5124 for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++, 5125 sva += PAGE_SIZE) { 5126 MPASS(pte == pmap_pte(pmap, sva)); 5127 if (*pte == 0) { 5128 if (va != eva) { 5129 anyvalid = true; 5130 va = eva; 5131 } 5132 continue; 5133 } 5134 if (va == eva) 5135 va = sva; 5136 if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) { 5137 anyvalid = true; 5138 sva += PAGE_SIZE; 5139 break; 5140 } 5141 } 5142 if (anyvalid) 5143 pmap_invalidate_all(pmap); 5144 else if (va != eva) 5145 pmap_invalidate_range(pmap, va, sva); 5146 return (anyvalid); 5147 } 5148 5149 void 5150 mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5151 { 5152 struct rwlock *lock; 5153 vm_offset_t va_next; 5154 pml1_entry_t *l1e; 5155 pml2_entry_t *l2e; 5156 pml3_entry_t ptpaddr, *l3e; 5157 struct spglist free; 5158 bool anyvalid; 5159 5160 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); 5161 5162 /* 5163 * Perform an unsynchronized read. This is, however, safe. 5164 */ 5165 if (pmap->pm_stats.resident_count == 0) 5166 return; 5167 5168 anyvalid = false; 5169 SLIST_INIT(&free); 5170 5171 /* XXX something fishy here */ 5172 sva = (sva + PAGE_MASK) & ~PAGE_MASK; 5173 eva = (eva + PAGE_MASK) & ~PAGE_MASK; 5174 5175 PMAP_LOCK(pmap); 5176 5177 /* 5178 * special handling of removing one page. a very 5179 * common operation and easy to short circuit some 5180 * code. 5181 */ 5182 if (sva + PAGE_SIZE == eva) { 5183 l3e = pmap_pml3e(pmap, sva); 5184 if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) { 5185 anyvalid = pmap_remove_page(pmap, sva, l3e, &free); 5186 goto out; 5187 } 5188 } 5189 5190 lock = NULL; 5191 for (; sva < eva; sva = va_next) { 5192 if (pmap->pm_stats.resident_count == 0) 5193 break; 5194 l1e = pmap_pml1e(pmap, sva); 5195 if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) { 5196 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 5197 if (va_next < sva) 5198 va_next = eva; 5199 continue; 5200 } 5201 5202 l2e = pmap_l1e_to_l2e(l1e, sva); 5203 if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) { 5204 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 5205 if (va_next < sva) 5206 va_next = eva; 5207 continue; 5208 } 5209 5210 /* 5211 * Calculate index for next page table. 5212 */ 5213 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 5214 if (va_next < sva) 5215 va_next = eva; 5216 5217 l3e = pmap_l2e_to_l3e(l2e, sva); 5218 ptpaddr = be64toh(*l3e); 5219 5220 /* 5221 * Weed out invalid mappings. 
5222 */ 5223 if (ptpaddr == 0) 5224 continue; 5225 5226 /* 5227 * Check for large page. 5228 */ 5229 if ((ptpaddr & RPTE_LEAF) != 0) { 5230 /* 5231 * Are we removing the entire large page? If not, 5232 * demote the mapping and fall through. 5233 */ 5234 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 5235 pmap_remove_l3e(pmap, l3e, sva, &free, &lock); 5236 anyvalid = true; 5237 continue; 5238 } else if (!pmap_demote_l3e_locked(pmap, l3e, sva, 5239 &lock)) { 5240 /* The large page mapping was destroyed. */ 5241 continue; 5242 } else 5243 ptpaddr = be64toh(*l3e); 5244 } 5245 5246 /* 5247 * Limit our scan to either the end of the va represented 5248 * by the current page table page, or to the end of the 5249 * range being removed. 5250 */ 5251 if (va_next > eva) 5252 va_next = eva; 5253 5254 if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock)) 5255 anyvalid = true; 5256 } 5257 if (lock != NULL) 5258 rw_wunlock(lock); 5259 out: 5260 if (anyvalid) 5261 pmap_invalidate_all(pmap); 5262 PMAP_UNLOCK(pmap); 5263 vm_page_free_pages_toq(&free, true); 5264 } 5265 5266 void 5267 mmu_radix_remove_all(vm_page_t m) 5268 { 5269 struct md_page *pvh; 5270 pv_entry_t pv; 5271 pmap_t pmap; 5272 struct rwlock *lock; 5273 pt_entry_t *pte, tpte; 5274 pml3_entry_t *l3e; 5275 vm_offset_t va; 5276 struct spglist free; 5277 int pvh_gen, md_gen; 5278 5279 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5280 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5281 ("pmap_remove_all: page %p is not managed", m)); 5282 SLIST_INIT(&free); 5283 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5284 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 5285 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5286 retry: 5287 rw_wlock(lock); 5288 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 5289 pmap = PV_PMAP(pv); 5290 if (!PMAP_TRYLOCK(pmap)) { 5291 pvh_gen = pvh->pv_gen; 5292 rw_wunlock(lock); 5293 PMAP_LOCK(pmap); 5294 rw_wlock(lock); 5295 if (pvh_gen != pvh->pv_gen) { 5296 rw_wunlock(lock); 5297 PMAP_UNLOCK(pmap); 5298 goto retry; 5299 } 5300 } 5301 va = pv->pv_va; 5302 l3e = pmap_pml3e(pmap, va); 5303 (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock); 5304 PMAP_UNLOCK(pmap); 5305 } 5306 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 5307 pmap = PV_PMAP(pv); 5308 if (!PMAP_TRYLOCK(pmap)) { 5309 pvh_gen = pvh->pv_gen; 5310 md_gen = m->md.pv_gen; 5311 rw_wunlock(lock); 5312 PMAP_LOCK(pmap); 5313 rw_wlock(lock); 5314 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5315 rw_wunlock(lock); 5316 PMAP_UNLOCK(pmap); 5317 goto retry; 5318 } 5319 } 5320 pmap_resident_count_dec(pmap, 1); 5321 l3e = pmap_pml3e(pmap, pv->pv_va); 5322 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found" 5323 " a 2mpage in page %p's pv list", m)); 5324 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 5325 tpte = be64toh(pte_load_clear(pte)); 5326 if (tpte & PG_W) 5327 pmap->pm_stats.wired_count--; 5328 if (tpte & PG_A) 5329 vm_page_aflag_set(m, PGA_REFERENCED); 5330 5331 /* 5332 * Update the vm_page_t clean and reference bits. 5333 */ 5334 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5335 vm_page_dirty(m); 5336 pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free); 5337 pmap_invalidate_page(pmap, pv->pv_va); 5338 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 5339 m->md.pv_gen++; 5340 free_pv_entry(pmap, pv); 5341 PMAP_UNLOCK(pmap); 5342 } 5343 vm_page_aflag_clear(m, PGA_WRITEABLE); 5344 rw_wunlock(lock); 5345 vm_page_free_pages_toq(&free, true); 5346 } 5347 5348 /* 5349 * Destroy all managed, non-wired mappings in the given user-space 5350 * pmap. 
This pmap cannot be active on any processor besides the 5351 * caller. 5352 * 5353 * This function cannot be applied to the kernel pmap. Moreover, it 5354 * is not intended for general use. It is only to be used during 5355 * process termination. Consequently, it can be implemented in ways 5356 * that make it faster than pmap_remove(). First, it can more quickly 5357 * destroy mappings by iterating over the pmap's collection of PV 5358 * entries, rather than searching the page table. Second, it doesn't 5359 * have to test and clear the page table entries atomically, because 5360 * no processor is currently accessing the user address space. In 5361 * particular, a page table entry's dirty bit won't change state once 5362 * this function starts. 5363 * 5364 * Although this function destroys all of the pmap's managed, 5365 * non-wired mappings, it can delay and batch the invalidation of TLB 5366 * entries without calling pmap_delayed_invl_started() and 5367 * pmap_delayed_invl_finished(). Because the pmap is not active on 5368 * any other processor, none of these TLB entries will ever be used 5369 * before their eventual invalidation. Consequently, there is no need 5370 * for either pmap_remove_all() or pmap_remove_write() to wait for 5371 * that eventual TLB invalidation. 5372 */ 5373 5374 void 5375 mmu_radix_remove_pages(pmap_t pmap) 5376 { 5377 5378 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 5379 pml3_entry_t ptel3e; 5380 pt_entry_t *pte, tpte; 5381 struct spglist free; 5382 vm_page_t m, mpte, mt; 5383 pv_entry_t pv; 5384 struct md_page *pvh; 5385 struct pv_chunk *pc, *npc; 5386 struct rwlock *lock; 5387 int64_t bit; 5388 uint64_t inuse, bitmask; 5389 int allfree, field, freed, idx; 5390 boolean_t superpage; 5391 vm_paddr_t pa; 5392 5393 /* 5394 * Assert that the given pmap is only active on the current 5395 * CPU. Unfortunately, we cannot block another CPU from 5396 * activating the pmap while this function is executing. 5397 */ 5398 KASSERT(pmap->pm_pid == mfspr(SPR_PID), 5399 ("non-current asid %lu - expected %lu", pmap->pm_pid, 5400 mfspr(SPR_PID))); 5401 5402 lock = NULL; 5403 5404 SLIST_INIT(&free); 5405 PMAP_LOCK(pmap); 5406 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5407 allfree = 1; 5408 freed = 0; 5409 for (field = 0; field < _NPCM; field++) { 5410 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5411 while (inuse != 0) { 5412 bit = cnttzd(inuse); 5413 bitmask = 1UL << bit; 5414 idx = field * 64 + bit; 5415 pv = &pc->pc_pventry[idx]; 5416 inuse &= ~bitmask; 5417 5418 pte = pmap_pml2e(pmap, pv->pv_va); 5419 ptel3e = be64toh(*pte); 5420 pte = pmap_l2e_to_l3e(pte, pv->pv_va); 5421 tpte = be64toh(*pte); 5422 if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) { 5423 superpage = FALSE; 5424 ptel3e = tpte; 5425 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 5426 PG_FRAME); 5427 pte = &pte[pmap_pte_index(pv->pv_va)]; 5428 tpte = be64toh(*pte); 5429 } else { 5430 /* 5431 * Keep track whether 'tpte' is a 5432 * superpage explicitly instead of 5433 * relying on RPTE_LEAF being set. 5434 * 5435 * This is because RPTE_LEAF is numerically 5436 * identical to PG_PTE_PAT and thus a 5437 * regular page could be mistaken for 5438 * a superpage. 
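 *
 * In the superpage case "tpte" is the 2MB L3 leaf itself and
 * "ptel3e" its parent L2 entry; otherwise "ptel3e" holds the L3
 * entry and "tpte" the 4KB PTE beneath it.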
5439 */ 5440 superpage = TRUE; 5441 } 5442 5443 if ((tpte & PG_V) == 0) { 5444 panic("bad pte va %lx pte %lx", 5445 pv->pv_va, tpte); 5446 } 5447 5448 /* 5449 * We cannot remove wired pages from a process' mapping at this time 5450 */ 5451 if (tpte & PG_W) { 5452 allfree = 0; 5453 continue; 5454 } 5455 5456 if (superpage) 5457 pa = tpte & PG_PS_FRAME; 5458 else 5459 pa = tpte & PG_FRAME; 5460 5461 m = PHYS_TO_VM_PAGE(pa); 5462 KASSERT(m->phys_addr == pa, 5463 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5464 m, (uintmax_t)m->phys_addr, 5465 (uintmax_t)tpte)); 5466 5467 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5468 m < &vm_page_array[vm_page_array_size], 5469 ("pmap_remove_pages: bad tpte %#jx", 5470 (uintmax_t)tpte)); 5471 5472 pte_clear(pte); 5473 5474 /* 5475 * Update the vm_page_t clean/reference bits. 5476 */ 5477 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5478 if (superpage) { 5479 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 5480 vm_page_dirty(mt); 5481 } else 5482 vm_page_dirty(m); 5483 } 5484 5485 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5486 5487 /* Mark free */ 5488 pc->pc_map[field] |= bitmask; 5489 if (superpage) { 5490 pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); 5491 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 5492 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 5493 pvh->pv_gen++; 5494 if (TAILQ_EMPTY(&pvh->pv_list)) { 5495 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 5496 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 5497 TAILQ_EMPTY(&mt->md.pv_list)) 5498 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5499 } 5500 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 5501 if (mpte != NULL) { 5502 pmap_resident_count_dec(pmap, 1); 5503 KASSERT(mpte->ref_count == NPTEPG, 5504 ("pmap_remove_pages: pte page wire count error")); 5505 mpte->ref_count = 0; 5506 pmap_add_delayed_free_list(mpte, &free, FALSE); 5507 } 5508 } else { 5509 pmap_resident_count_dec(pmap, 1); 5510 #ifdef VERBOSE_PV 5511 printf("freeing pv (%p, %p)\n", 5512 pmap, pv); 5513 #endif 5514 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 5515 m->md.pv_gen++; 5516 if ((m->a.flags & PGA_WRITEABLE) != 0 && 5517 TAILQ_EMPTY(&m->md.pv_list) && 5518 (m->flags & PG_FICTITIOUS) == 0) { 5519 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5520 if (TAILQ_EMPTY(&pvh->pv_list)) 5521 vm_page_aflag_clear(m, PGA_WRITEABLE); 5522 } 5523 } 5524 pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free); 5525 freed++; 5526 } 5527 } 5528 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5529 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5530 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5531 if (allfree) { 5532 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5533 free_pv_chunk(pc); 5534 } 5535 } 5536 if (lock != NULL) 5537 rw_wunlock(lock); 5538 pmap_invalidate_all(pmap); 5539 PMAP_UNLOCK(pmap); 5540 vm_page_free_pages_toq(&free, true); 5541 } 5542 5543 void 5544 mmu_radix_remove_write(vm_page_t m) 5545 { 5546 struct md_page *pvh; 5547 pmap_t pmap; 5548 struct rwlock *lock; 5549 pv_entry_t next_pv, pv; 5550 pml3_entry_t *l3e; 5551 pt_entry_t oldpte, *pte; 5552 int pvh_gen, md_gen; 5553 5554 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5555 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5556 ("pmap_remove_write: page %p is not managed", m)); 5557 vm_page_assert_busied(m); 5558 5559 if (!pmap_page_is_write_mapped(m)) 5560 return; 5561 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5562 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 5563 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5564 retry_pv_loop: 5565 rw_wlock(lock); 5566 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { 5567 pmap = PV_PMAP(pv); 5568 if (!PMAP_TRYLOCK(pmap)) { 5569 pvh_gen = pvh->pv_gen; 5570 rw_wunlock(lock); 5571 PMAP_LOCK(pmap); 5572 rw_wlock(lock); 5573 if (pvh_gen != pvh->pv_gen) { 5574 PMAP_UNLOCK(pmap); 5575 rw_wunlock(lock); 5576 goto retry_pv_loop; 5577 } 5578 } 5579 l3e = pmap_pml3e(pmap, pv->pv_va); 5580 if ((be64toh(*l3e) & PG_RW) != 0) 5581 (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock); 5582 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5583 ("inconsistent pv lock %p %p for page %p", 5584 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5585 PMAP_UNLOCK(pmap); 5586 } 5587 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 5588 pmap = PV_PMAP(pv); 5589 if (!PMAP_TRYLOCK(pmap)) { 5590 pvh_gen = pvh->pv_gen; 5591 md_gen = m->md.pv_gen; 5592 rw_wunlock(lock); 5593 PMAP_LOCK(pmap); 5594 rw_wlock(lock); 5595 if (pvh_gen != pvh->pv_gen || 5596 md_gen != m->md.pv_gen) { 5597 PMAP_UNLOCK(pmap); 5598 rw_wunlock(lock); 5599 goto retry_pv_loop; 5600 } 5601 } 5602 l3e = pmap_pml3e(pmap, pv->pv_va); 5603 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, 5604 ("pmap_remove_write: found a 2mpage in page %p's pv list", 5605 m)); 5606 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 5607 retry: 5608 oldpte = be64toh(*pte); 5609 if (oldpte & PG_RW) { 5610 if (!atomic_cmpset_long(pte, htobe64(oldpte), 5611 htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M)))) 5612 goto retry; 5613 if ((oldpte & PG_M) != 0) 5614 vm_page_dirty(m); 5615 pmap_invalidate_page(pmap, pv->pv_va); 5616 } 5617 PMAP_UNLOCK(pmap); 5618 } 5619 rw_wunlock(lock); 5620 vm_page_aflag_clear(m, PGA_WRITEABLE); 5621 } 5622 5623 /* 5624 * Clear the wired attribute from the mappings for the specified range of 5625 * addresses in the given pmap. Every valid mapping within that range 5626 * must have the wired attribute set. In contrast, invalid mappings 5627 * cannot have the wired attribute set, so they are ignored. 5628 * 5629 * The wired attribute of the page table entry is not a hardware 5630 * feature, so there is no need to invalidate any TLB entries. 5631 * Since pmap_demote_l3e() for the wired entry must never fail, 5632 * pmap_delayed_invl_started()/finished() calls around the 5633 * function are not needed. 5634 */ 5635 void 5636 mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5637 { 5638 vm_offset_t va_next; 5639 pml1_entry_t *l1e; 5640 pml2_entry_t *l2e; 5641 pml3_entry_t *l3e; 5642 pt_entry_t *pte; 5643 5644 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); 5645 PMAP_LOCK(pmap); 5646 for (; sva < eva; sva = va_next) { 5647 l1e = pmap_pml1e(pmap, sva); 5648 if ((be64toh(*l1e) & PG_V) == 0) { 5649 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 5650 if (va_next < sva) 5651 va_next = eva; 5652 continue; 5653 } 5654 l2e = pmap_l1e_to_l2e(l1e, sva); 5655 if ((be64toh(*l2e) & PG_V) == 0) { 5656 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 5657 if (va_next < sva) 5658 va_next = eva; 5659 continue; 5660 } 5661 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 5662 if (va_next < sva) 5663 va_next = eva; 5664 l3e = pmap_l2e_to_l3e(l2e, sva); 5665 if ((be64toh(*l3e) & PG_V) == 0) 5666 continue; 5667 if ((be64toh(*l3e) & RPTE_LEAF) != 0) { 5668 if ((be64toh(*l3e) & PG_W) == 0) 5669 panic("pmap_unwire: pde %#jx is missing PG_W", 5670 (uintmax_t)(be64toh(*l3e))); 5671 5672 /* 5673 * Are we unwiring the entire large page? If not, 5674 * demote the mapping and fall through. 
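 * Unwiring the whole leaf clears PG_W once and drops wired_count
 * by L3_PAGE_SIZE / PAGE_SIZE (512 with 2MB superpages and 4KB
 * base pages); after a demotion each 4KB PTE is unwired
 * individually below.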
5675 */ 5676 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 5677 atomic_clear_long(l3e, htobe64(PG_W)); 5678 pmap->pm_stats.wired_count -= L3_PAGE_SIZE / 5679 PAGE_SIZE; 5680 continue; 5681 } else if (!pmap_demote_l3e(pmap, l3e, sva)) 5682 panic("pmap_unwire: demotion failed"); 5683 } 5684 if (va_next > eva) 5685 va_next = eva; 5686 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, 5687 sva += PAGE_SIZE) { 5688 MPASS(pte == pmap_pte(pmap, sva)); 5689 if ((be64toh(*pte) & PG_V) == 0) 5690 continue; 5691 if ((be64toh(*pte) & PG_W) == 0) 5692 panic("pmap_unwire: pte %#jx is missing PG_W", 5693 (uintmax_t)(be64toh(*pte))); 5694 5695 /* 5696 * PG_W must be cleared atomically. Although the pmap 5697 * lock synchronizes access to PG_W, another processor 5698 * could be setting PG_M and/or PG_A concurrently. 5699 */ 5700 atomic_clear_long(pte, htobe64(PG_W)); 5701 pmap->pm_stats.wired_count--; 5702 } 5703 } 5704 PMAP_UNLOCK(pmap); 5705 } 5706 5707 void 5708 mmu_radix_zero_page(vm_page_t m) 5709 { 5710 vm_offset_t addr; 5711 5712 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5713 addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5714 pagezero(addr); 5715 } 5716 5717 void 5718 mmu_radix_zero_page_area(vm_page_t m, int off, int size) 5719 { 5720 caddr_t addr; 5721 5722 CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); 5723 MPASS(off + size <= PAGE_SIZE); 5724 addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5725 memset(addr + off, 0, size); 5726 } 5727 5728 static int 5729 mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5730 { 5731 pml3_entry_t *l3ep; 5732 pt_entry_t pte; 5733 vm_paddr_t pa; 5734 int val; 5735 5736 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); 5737 PMAP_LOCK(pmap); 5738 5739 l3ep = pmap_pml3e(pmap, addr); 5740 if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) { 5741 if (be64toh(*l3ep) & RPTE_LEAF) { 5742 pte = be64toh(*l3ep); 5743 /* Compute the physical address of the 4KB page. */ 5744 pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) & 5745 PG_FRAME; 5746 val = MINCORE_PSIND(1); 5747 } else { 5748 /* Native endian PTE, do not pass to functions */ 5749 pte = be64toh(*pmap_l3e_to_pte(l3ep, addr)); 5750 pa = pte & PG_FRAME; 5751 val = 0; 5752 } 5753 } else { 5754 pte = 0; 5755 pa = 0; 5756 val = 0; 5757 } 5758 if ((pte & PG_V) != 0) { 5759 val |= MINCORE_INCORE; 5760 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5761 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5762 if ((pte & PG_A) != 0) 5763 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5764 } 5765 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5766 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5767 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5768 *locked_pa = pa; 5769 } 5770 PMAP_UNLOCK(pmap); 5771 return (val); 5772 } 5773 5774 void 5775 mmu_radix_activate(struct thread *td) 5776 { 5777 pmap_t pmap; 5778 uint32_t curpid; 5779 5780 CTR2(KTR_PMAP, "%s(%p)", __func__, td); 5781 critical_enter(); 5782 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5783 curpid = mfspr(SPR_PID); 5784 if (pmap->pm_pid > isa3_base_pid && 5785 curpid != pmap->pm_pid) { 5786 mmu_radix_pid_set(pmap); 5787 } 5788 critical_exit(); 5789 } 5790 5791 /* 5792 * Increase the starting virtual address of the given mapping if a 5793 * different alignment might result in more superpage mappings. 
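 *
 * The address is adjusted so that (*addr & L3_PAGE_MASK) matches the
 * superpage offset of the backing object.  As an illustrative example
 * (numbers not from this file), an object whose colored offset is 1MB
 * into a 2MB superpage has its start bumped to the next address with
 * that same 1MB offset, allowing the range to be mapped with 2MB
 * entries later.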
void
mmu_radix_activate(struct thread *td)
{
	pmap_t pmap;
	uint32_t curpid;

	CTR2(KTR_PMAP, "%s(%p)", __func__, td);
	critical_enter();
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	curpid = mfspr(SPR_PID);
	if (pmap->pm_pid > isa3_base_pid &&
	    curpid != pmap->pm_pid) {
		mmu_radix_pid_set(pmap);
	}
	critical_exit();
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr,
	    size);
	if (size < L3_PAGE_SIZE)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & L3_PAGE_MASK;
	if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) <
	    L3_PAGE_SIZE || (*addr & L3_PAGE_MASK) == superpage_offset)
		return;
	if ((*addr & L3_PAGE_MASK) < superpage_offset)
		*addr = (*addr & ~L3_PAGE_MASK) + superpage_offset;
	else
		*addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset;
}

static void *
mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr)
{
	vm_offset_t va, tmpva, ppa, offset;

	ppa = trunc_page(pa);
	offset = pa & PAGE_MASK;
	size = roundup2(offset + size, PAGE_SIZE);
	if (pa < powerpc_ptob(Maxmem))
		panic("bad pa: %#lx less than Maxmem %#lx\n",
		    pa, powerpc_ptob(Maxmem));
	va = kva_alloc(size);
	if (bootverbose)
		printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr);
	KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr));

	if (!va)
		panic("%s: Couldn't alloc kernel virtual memory", __func__);

	for (tmpva = va; size > 0;) {
		mmu_radix_kenter_attr(tmpva, ppa, attr);
		size -= PAGE_SIZE;
		tmpva += PAGE_SIZE;
		ppa += PAGE_SIZE;
	}
	ptesync();

	return ((void *)(va + offset));
}

static void *
mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size)
{

	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);

	return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
}

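/*
 * Set the cache attributes for the given page.  For normal (non-fictitious)
 * pages the direct-map entry is updated as well, which also performs any
 * cache maintenance required for coherence.
 */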
void
mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
	m->md.mdpg_cache_attrs = ma;

	/*
	 * If "m" is a normal page, update its direct mapping.  This update
	 * can be relied upon to perform any cache operations that are
	 * required for data coherence.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
	    PAGE_SIZE, m->md.mdpg_cache_attrs))
		panic("memory attribute change on the direct map failed");
}

static void
mmu_radix_unmapdev(vm_offset_t va, vm_size_t size)
{
	vm_offset_t offset;

	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size);
	/* If we gave a direct map region in pmap_mapdev, do nothing */
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return;

	offset = va & PAGE_MASK;
	size = round_page(offset + size);
	va = trunc_page(va);

	if (pmap_initialized) {
		mmu_radix_qremove(va, atop(size));
		kva_free(va, size);
	}
}

static __inline void
pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask)
{
	uint64_t opte, npte;

	/*
	 * Spin until the new attribute bits are installed.  Other bits in
	 * the PTE (e.g. the reference and change bits) may be updated
	 * concurrently, so retry with a fresh copy if the compare-and-set
	 * fails.
	 */
	do {
		opte = be64toh(*pte);
		npte = opte & ~mask;
		npte |= cache_bits;
	} while (npte != opte &&
	    !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte)));
}

/*
 * Tries to demote a 1GB page mapping.
 */
static boolean_t
pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va)
{
	pml2_entry_t oldpdpe;
	pml3_entry_t *firstpde, newpde, *pde;
	vm_paddr_t pdpgpa;
	vm_page_t pdpg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpdpe = be64toh(*l2e);
	KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
	pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT,
	    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
	if (pdpg == NULL) {
		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
	firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa);
	KASSERT((oldpdpe & PG_A) != 0,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
	newpde = oldpdpe;

	/*
	 * Initialize the page directory page.
	 */
	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
		*pde = htobe64(newpde);
		newpde += L3_PAGE_SIZE;
	}

	/*
	 * Demote the mapping.
	 */
	pde_store(l2e, pdpgpa);

	/*
	 * Flush PWC --- XXX revisit
	 */
	pmap_invalidate_all(pmap);

	counter_u64_add(pmap_l2e_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

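/*
 * Extract the physical address backing a kernel virtual address.  Direct-map
 * addresses are translated arithmetically; other kernel addresses are
 * resolved by walking the kernel page table, handling 2MB leaf entries.
 */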
vm_paddr_t
mmu_radix_kextract(vm_offset_t va)
{
	pml3_entry_t l3e;
	vm_paddr_t pa;

	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		/* Big-endian PTE on stack */
		l3e = *pmap_pml3e(kernel_pmap, va);
		if (be64toh(l3e) & RPTE_LEAF) {
			pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
			pa |= (va & L3_PAGE_MASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  For example, vtopte() must not
			 * be used to access the PTE because it would use the
			 * new PDE.  It is, however, safe to use the old PDE
			 * because the page table page is preserved by the
			 * promotion.
			 */
			pa = be64toh(*pmap_l3e_to_pte(&l3e, va));
			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
			pa |= (va & PAGE_MASK);
		}
	}
	return (pa);
}

static pt_entry_t
mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
{

	if (ma != VM_MEMATTR_DEFAULT) {
		return (pmap_cache_bits(ma));
	}

	/*
	 * Assume the page is cache inhibited and access is guarded unless
	 * it's in our available memory array.
	 */
	for (int i = 0; i < pregions_sz; i++) {
		if ((pa >= pregions[i].mr_start) &&
		    (pa < (pregions[i].mr_start + pregions[i].mr_size)))
			return (RPTE_ATTR_MEM);
	}
	return (RPTE_ATTR_GUARDEDIO);
}

static void
mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
{
	pt_entry_t *pte, pteval;
	uint64_t cache_bits;

	pte = kvtopte(va);
	MPASS(pte != NULL);
	pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
	cache_bits = mmu_radix_calc_wimg(pa, ma);
	pte_store(pte, pteval | cache_bits);
}

void
mmu_radix_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);

	pte = kvtopte(va);
	pte_clear(pte);
}

int
mmu_radix_decode_kernel_ptr(vm_offset_t addr,
    int *is_user, vm_offset_t *decoded)
{

	CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr);
	*decoded = addr;
	*is_user = (addr < VM_MAXUSER_ADDRESS);
	return (0);
}

static boolean_t
mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
{

	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
	return (mem_valid(pa, size));
}

static void
mmu_radix_scan_init()
{

	CTR1(KTR_PMAP, "%s()", __func__);
	UNIMPLEMENTED();
}

static void
mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz,
    void **va)
{
	CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va);
	UNIMPLEMENTED();
}

vm_offset_t
mmu_radix_quick_enter_page(vm_page_t m)
{
	vm_paddr_t paddr;

	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
	paddr = VM_PAGE_TO_PHYS(m);
	return (PHYS_TO_DMAP(paddr));
}

void
mmu_radix_quick_remove_page(vm_offset_t addr __unused)
{
	/* no work to do here */
	CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
}

static void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{
	cpu_flush_dcache((void *)sva, eva - sva);
}

int
mmu_radix_change_attr(vm_offset_t va, vm_size_t size,
    vm_memattr_t mode)
{
	int error;

	CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode);
	PMAP_LOCK(kernel_pmap);
	error = pmap_change_attr_locked(va, size, mode, true);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

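/*
 * Worker for mmu_radix_change_attr().  Called with the kernel pmap locked:
 * a first pass demotes any 1GB/2MB leaf mappings that straddle the request
 * or need a different attribute, and a second pass rewrites the attribute
 * bits, recursing to keep the direct map in sync and flushing as needed.
 */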
static int
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush)
{
	vm_offset_t base, offset, tmpva;
	vm_paddr_t pa_start, pa_end, pa_end1;
	pml2_entry_t *l2e;
	pml3_entry_t *l3e;
	pt_entry_t *pte;
	int cache_bits, error;
	boolean_t changed;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	/*
	 * Only supported on kernel virtual addresses, including the direct
	 * map but excluding the recursive map.
	 */
	if (base < DMAP_MIN_ADDRESS)
		return (EINVAL);

	cache_bits = pmap_cache_bits(mode);
	changed = FALSE;

	/*
	 * Pages that aren't mapped aren't supported.  Also break down 2MB
	 * pages into 4KB pages if required.
	 */
	for (tmpva = base; tmpva < base + size; ) {
		l2e = pmap_pml2e(kernel_pmap, tmpva);
		if (l2e == NULL || *l2e == 0)
			return (EINVAL);
		if (be64toh(*l2e) & RPTE_LEAF) {
			/*
			 * If the current 1GB page already has the required
			 * memory type, then we need not demote this page.
			 * Just increment tmpva to the next 1GB page frame.
			 */
			if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) {
				tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
				continue;
			}

			/*
			 * If the current offset aligns with a 1GB page frame
			 * and there is at least 1GB left within the range, then
			 * we need not break down this page into 2MB pages.
			 */
			if ((tmpva & L2_PAGE_MASK) == 0 &&
			    tmpva + L2_PAGE_MASK < base + size) {
				tmpva += L2_PAGE_SIZE;
				continue;
			}
			if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
				return (ENOMEM);
		}
		l3e = pmap_l2e_to_l3e(l2e, tmpva);
		KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
		    tmpva, l2e));
		if (*l3e == 0)
			return (EINVAL);
		if (be64toh(*l3e) & RPTE_LEAF) {
			/*
			 * If the current 2MB page already has the required
			 * memory type, then we need not demote this page.
			 * Just increment tmpva to the next 2MB page frame.
			 */
			if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) {
				tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
				continue;
			}

			/*
			 * If the current offset aligns with a 2MB page frame
			 * and there is at least 2MB left within the range, then
			 * we need not break down this page into 4KB pages.
			 */
			if ((tmpva & L3_PAGE_MASK) == 0 &&
			    tmpva + L3_PAGE_MASK < base + size) {
				tmpva += L3_PAGE_SIZE;
				continue;
			}
			if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
				return (ENOMEM);
		}
		pte = pmap_l3e_to_pte(l3e, tmpva);
		if (*pte == 0)
			return (EINVAL);
		tmpva += PAGE_SIZE;
	}
	error = 0;

	/*
	 * Ok, all the pages exist, so run through them updating their
	 * cache mode if required.
	 */
	pa_start = pa_end = 0;
	for (tmpva = base; tmpva < base + size; ) {
		l2e = pmap_pml2e(kernel_pmap, tmpva);
		if (be64toh(*l2e) & RPTE_LEAF) {
			if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(l2e, cache_bits,
				    RPTE_ATTR_MASK);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*l2e) & PG_PS_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = be64toh(*l2e) & PG_PS_FRAME;
					pa_end = pa_start + L2_PAGE_SIZE;
				} else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME))
					pa_end += L2_PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = be64toh(*l2e) & PG_PS_FRAME;
					pa_end = pa_start + L2_PAGE_SIZE;
				}
			}
			tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
			continue;
		}
		l3e = pmap_l2e_to_l3e(l2e, tmpva);
		if (be64toh(*l3e) & RPTE_LEAF) {
			if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(l3e, cache_bits,
				    RPTE_ATTR_MASK);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = be64toh(*l3e) & PG_PS_FRAME;
					pa_end = pa_start + L3_PAGE_SIZE;
				} else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME))
					pa_end += L3_PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = be64toh(*l3e) & PG_PS_FRAME;
					pa_end = pa_start + L3_PAGE_SIZE;
				}
			}
			tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
		} else {
			pte = pmap_l3e_to_pte(l3e, tmpva);
			if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(pte, cache_bits,
				    RPTE_ATTR_MASK);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*pte) & PG_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = be64toh(*pte) & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				} else if (pa_end == (be64toh(*pte) & PG_FRAME))
					pa_end += PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = be64toh(*pte) & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				}
			}
			tmpva += PAGE_SIZE;
		}
	}
	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
		pa_end1 = MIN(pa_end, dmaplimit);
		if (pa_start != pa_end1)
			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
			    pa_end1 - pa_start, mode, flush);
	}

	/*
	 * Flush CPU caches if required to make sure any data isn't cached that
	 * shouldn't be, etc.
	 */
	if (changed) {
		pmap_invalidate_all(kernel_pmap);

		if (flush)
			pmap_invalidate_cache_range(base, tmpva);
	}
	return (error);
}

/*
 * Allocate physical memory for the vm_page array and map it into KVA,
 * attempting to back the vm_pages with domain-local memory.
 */
void
mmu_radix_page_array_startup(long pages)
{
#ifdef notyet
	pml2_entry_t *l2e;
	pml3_entry_t *pde;
	pml3_entry_t newl3;
	vm_offset_t va;
	long pfn;
	int domain, i;
#endif
	vm_paddr_t pa;
	vm_offset_t start, end;

	vm_page_array_size = pages;

	start = VM_MIN_KERNEL_ADDRESS;
	end = start + pages * sizeof(struct vm_page);

	pa = vm_phys_early_alloc(0, end - start);

	start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
#ifdef notyet
	/*
	 * TODO: NUMA vm_page_array.  Blocked out until then (copied from
	 * amd64).
	 */
	for (va = start; va < end; va += L3_PAGE_SIZE) {
		pfn = first_page + (va - start) / sizeof(struct vm_page);
		domain = vm_phys_domain(ptoa(pfn));
		l2e = pmap_pml2e(kernel_pmap, va);
		if ((be64toh(*l2e) & PG_V) == 0) {
			pa = vm_phys_early_alloc(domain, PAGE_SIZE);
			dump_add_page(pa);
			pagezero(PHYS_TO_DMAP(pa));
			pde_store(l2e, (pml2_entry_t)pa);
		}
		pde = pmap_l2e_to_l3e(l2e, va);
		if ((be64toh(*pde) & PG_V) != 0)
			panic("Unexpected pde %p", pde);
		pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
		for (i = 0; i < NPDEPG; i++)
			dump_add_page(pa + i * PAGE_SIZE);
		newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W);
		pte_store(pde, newl3);
	}
#endif
	vm_page_array = (vm_page_t)start;
}

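/*
 * DDB support: helpers and a "show pte" command for walking and printing
 * radix page-table entries from the kernel debugger.
 */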
#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>

static void
pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
{
	pml1_entry_t *l1e;
	pml2_entry_t *l2e;
	pml3_entry_t *l3e;
	pt_entry_t *pte;

	l1e = &l1[pmap_pml1e_index(va)];
	db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e));
	if ((be64toh(*l1e) & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	l2e = pmap_l1e_to_l2e(l1e, va);
	db_printf(" l2e %#016lx", be64toh(*l2e));
	if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) {
		db_printf("\n");
		return;
	}
	l3e = pmap_l2e_to_l3e(l2e, va);
	db_printf(" l3e %#016lx", be64toh(*l3e));
	if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) {
		db_printf("\n");
		return;
	}
	pte = pmap_l3e_to_pte(l3e, va);
	db_printf(" pte %#016lx\n", be64toh(*pte));
}

void
pmap_page_print_mappings(vm_page_t m)
{
	pmap_t pmap;
	pv_entry_t pv;

	db_printf("page %p(%lx)\n", m, m->phys_addr);
	/* need to elide locks if running in ddb */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
		db_printf("pv: %p ", pv);
		db_printf("va: %#016lx ", pv->pv_va);
		pmap = PV_PMAP(pv);
		db_printf("pmap %p ", pmap);
		if (pmap != NULL) {
			db_printf("asid: %lu\n", pmap->pm_pid);
			pmap_pte_walk(pmap->pm_pml1, pv->pv_va);
		}
	}
}

DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	vm_offset_t va;
	pmap_t pmap;

	if (!have_addr) {
		db_printf("show pte addr\n");
		return;
	}
	va = (vm_offset_t)addr;

	if (va >= DMAP_MIN_ADDRESS)
		pmap = kernel_pmap;
	else if (kdb_thread != NULL)
		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
	else
		pmap = vmspace_pmap(curthread->td_proc->p_vmspace);

	pmap_pte_walk(pmap->pm_pml1, va);
}

#endif