1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * Copyright (c) 1994 John S. Dyson 6 * Copyright (c) 1994 David Greenman 7 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 8 * Copyright (c) 2014-2016 Svatopluk Kraus <skra@FreeBSD.org> 9 * Copyright (c) 2014-2016 Michal Meloun <mmel@FreeBSD.org> 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 */ 40 /*- 41 * Copyright (c) 2003 Networks Associates Technology, Inc. 42 * All rights reserved. 43 * 44 * This software was developed for the FreeBSD Project by Jake Burkholder, 45 * Safeport Network Services, and Network Associates Laboratories, the 46 * Security Research Division of Network Associates, Inc. under 47 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 48 * CHATS research program. 49 * 50 * Redistribution and use in source and binary forms, with or without 51 * modification, are permitted provided that the following conditions 52 * are met: 53 * 1. Redistributions of source code must retain the above copyright 54 * notice, this list of conditions and the following disclaimer. 55 * 2. Redistributions in binary form must reproduce the above copyright 56 * notice, this list of conditions and the following disclaimer in the 57 * documentation and/or other materials provided with the distribution. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 */ 71 72 #include <sys/cdefs.h> 73 /* 74 * Manages physical address maps. 75 * 76 * Since the information managed by this module is 77 * also stored by the logical address mapping module, 78 * this module may throw away valid virtual-to-physical 79 * mappings at almost any time. However, invalidations 80 * of virtual-to-physical mappings must be done as 81 * requested. 82 * 83 * In order to cope with hardware architectures which 84 * make virtual-to-physical map invalidates expensive, 85 * this module may delay invalidate or reduced protection 86 * operations until such time as they are actually 87 * necessary. This module is given full information as 88 * to which processors are currently using which maps, 89 * and to when physical maps must be made correct. 
90 */ 91 92 #include "opt_vm.h" 93 #include "opt_pmap.h" 94 #include "opt_ddb.h" 95 96 #include <sys/param.h> 97 #include <sys/systm.h> 98 #include <sys/kernel.h> 99 #include <sys/ktr.h> 100 #include <sys/lock.h> 101 #include <sys/proc.h> 102 #include <sys/rwlock.h> 103 #include <sys/malloc.h> 104 #include <sys/vmmeter.h> 105 #include <sys/malloc.h> 106 #include <sys/mman.h> 107 #include <sys/sf_buf.h> 108 #include <sys/smp.h> 109 #include <sys/sched.h> 110 #include <sys/sysctl.h> 111 112 #ifdef DDB 113 #include <ddb/ddb.h> 114 #endif 115 116 #include <vm/vm.h> 117 #include <vm/uma.h> 118 #include <vm/pmap.h> 119 #include <vm/vm_param.h> 120 #include <vm/vm_kern.h> 121 #include <vm/vm_object.h> 122 #include <vm/vm_map.h> 123 #include <vm/vm_page.h> 124 #include <vm/vm_pageout.h> 125 #include <vm/vm_phys.h> 126 #include <vm/vm_extern.h> 127 #include <vm/vm_radix.h> 128 #include <vm/vm_reserv.h> 129 #include <sys/lock.h> 130 #include <sys/mutex.h> 131 132 #include <machine/md_var.h> 133 #include <machine/pmap_var.h> 134 #include <machine/cpu.h> 135 #include <machine/pcb.h> 136 #include <machine/sf_buf.h> 137 #ifdef SMP 138 #include <machine/smp.h> 139 #endif 140 #ifndef PMAP_SHPGPERPROC 141 #define PMAP_SHPGPERPROC 200 142 #endif 143 144 #ifndef DIAGNOSTIC 145 #define PMAP_INLINE __inline 146 #else 147 #define PMAP_INLINE 148 #endif 149 150 #ifdef PMAP_DEBUG 151 static void pmap_zero_page_check(vm_page_t m); 152 void pmap_debug(int level); 153 int pmap_pid_dump(int pid); 154 155 #define PDEBUG(_lev_,_stat_) \ 156 if (pmap_debug_level >= (_lev_)) \ 157 ((_stat_)) 158 #define dprintf printf 159 int pmap_debug_level = 1; 160 #else /* PMAP_DEBUG */ 161 #define PDEBUG(_lev_,_stat_) /* Nothing */ 162 #define dprintf(x, arg...) 163 #endif /* PMAP_DEBUG */ 164 165 /* 166 * Level 2 page tables map definion ('max' is excluded). 
167 */ 168 169 #define PT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) 170 #define PT2V_MAX_ADDRESS ((vm_offset_t)PT2MAP + PT2MAP_SIZE) 171 172 #define UPT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) 173 #define UPT2V_MAX_ADDRESS \ 174 ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT))) 175 176 /* 177 * Promotion to a 1MB (PTE1) page mapping requires that the corresponding 178 * 4KB (PTE2) page mappings have identical settings for the following fields: 179 */ 180 #define PTE2_PROMOTE (PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG | \ 181 PTE2_NX | PTE2_RO | PTE2_U | PTE2_W | \ 182 PTE2_ATTR_MASK) 183 184 #define PTE1_PROMOTE (PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG | \ 185 PTE1_NX | PTE1_RO | PTE1_U | PTE1_W | \ 186 PTE1_ATTR_MASK) 187 188 #define ATTR_TO_L1(l2_attr) ((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \ 189 (((l2_attr) & L2_C) ? L1_S_C : 0) | \ 190 (((l2_attr) & L2_B) ? L1_S_B : 0) | \ 191 (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \ 192 (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \ 193 (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \ 194 (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \ 195 (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \ 196 (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \ 197 (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \ 198 (((l2_attr) & PTE2_W) ? PTE1_W : 0)) 199 200 #define ATTR_TO_L2(l1_attr) ((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \ 201 (((l1_attr) & L1_S_C) ? L2_C : 0) | \ 202 (((l1_attr) & L1_S_B) ? L2_B : 0) | \ 203 (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \ 204 (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \ 205 (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \ 206 (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \ 207 (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \ 208 (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \ 209 (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \ 210 (((l1_attr) & PTE1_W) ? PTE2_W : 0)) 211 212 /* 213 * PTE2 descriptors creation macros. 
214 */ 215 #define PTE2_ATTR_DEFAULT vm_memattr_to_pte2(VM_MEMATTR_DEFAULT) 216 #define PTE2_ATTR_PT vm_memattr_to_pte2(pt_memattr) 217 218 #define PTE2_KPT(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT) 219 #define PTE2_KPT_NG(pa) PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT) 220 221 #define PTE2_KRW(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT) 222 #define PTE2_KRO(pa) PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT) 223 224 #define PV_STATS 225 #ifdef PV_STATS 226 #define PV_STAT(x) do { x ; } while (0) 227 #else 228 #define PV_STAT(x) do { } while (0) 229 #endif 230 231 /* 232 * The boot_pt1 is used temporary in very early boot stage as L1 page table. 233 * We can init many things with no memory allocation thanks to its static 234 * allocation and this brings two main advantages: 235 * (1) other cores can be started very simply, 236 * (2) various boot loaders can be supported as its arguments can be processed 237 * in virtual address space and can be moved to safe location before 238 * first allocation happened. 239 * Only disadvantage is that boot_pt1 is used only in very early boot stage. 240 * However, the table is uninitialized and so lays in bss. Therefore kernel 241 * image size is not influenced. 242 * 243 * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and 244 * CPU suspend/resume game. 
245 */ 246 extern pt1_entry_t boot_pt1[]; 247 248 vm_paddr_t base_pt1; 249 pt1_entry_t *kern_pt1; 250 pt2_entry_t *kern_pt2tab; 251 pt2_entry_t *PT2MAP; 252 253 static uint32_t ttb_flags; 254 static vm_memattr_t pt_memattr; 255 ttb_entry_t pmap_kern_ttb; 256 257 struct pmap kernel_pmap_store; 258 LIST_HEAD(pmaplist, pmap); 259 static struct pmaplist allpmaps; 260 static struct mtx allpmaps_lock; 261 262 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 263 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 264 265 static vm_offset_t kernel_vm_end_new; 266 vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; 267 vm_offset_t vm_max_kernel_address; 268 vm_paddr_t kernel_l1pa; 269 270 static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; 271 272 /* 273 * Data for the pv entry allocation mechanism 274 */ 275 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 276 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 277 static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ 278 static int shpgperproc = PMAP_SHPGPERPROC; 279 280 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 281 int pv_maxchunks; /* How many chunks we have KVA for */ 282 vm_offset_t pv_vafree; /* freelist stored in the PTE */ 283 284 vm_paddr_t first_managed_pa; 285 #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) 286 287 /* 288 * All those kernel PT submaps that BSD is so fond of 289 */ 290 caddr_t _tmppt = 0; 291 292 /* 293 * Crashdump maps. 
294 */ 295 static caddr_t crashdumpmap; 296 297 static pt2_entry_t *PMAP1 = NULL, *PMAP2; 298 static pt2_entry_t *PADDR1 = NULL, *PADDR2; 299 #ifdef DDB 300 static pt2_entry_t *PMAP3; 301 static pt2_entry_t *PADDR3; 302 static int PMAP3cpu __unused; /* for SMP only */ 303 #endif 304 #ifdef SMP 305 static int PMAP1cpu; 306 static int PMAP1changedcpu; 307 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 308 &PMAP1changedcpu, 0, 309 "Number of times pmap_pte2_quick changed CPU with same PMAP1"); 310 #endif 311 static int PMAP1changed; 312 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 313 &PMAP1changed, 0, 314 "Number of times pmap_pte2_quick changed PMAP1"); 315 static int PMAP1unchanged; 316 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 317 &PMAP1unchanged, 0, 318 "Number of times pmap_pte2_quick didn't change PMAP1"); 319 static struct mtx PMAP2mutex; 320 321 /* 322 * Internal flags for pmap_enter()'s helper functions. 323 */ 324 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 325 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 326 327 static __inline void pt2_wirecount_init(vm_page_t m); 328 static bool pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, 329 vm_offset_t va); 330 static int pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, 331 u_int flags, vm_page_t m); 332 void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); 333 334 /* 335 * Function to set the debug level of the pmap code. 336 */ 337 #ifdef PMAP_DEBUG 338 void 339 pmap_debug(int level) 340 { 341 342 pmap_debug_level = level; 343 dprintf("pmap_debug: level=%d\n", pmap_debug_level); 344 } 345 #endif /* PMAP_DEBUG */ 346 347 /* 348 * This table must corespond with memory attribute configuration in vm.h. 349 * First entry is used for normal system mapping. 350 * 351 * Device memory is always marked as shared. 352 * Normal memory is shared only in SMP . 
353 * Not outer shareable bits are not used yet. 354 * Class 6 cannot be used on ARM11. 355 */ 356 #define TEXDEF_TYPE_SHIFT 0 357 #define TEXDEF_TYPE_MASK 0x3 358 #define TEXDEF_INNER_SHIFT 2 359 #define TEXDEF_INNER_MASK 0x3 360 #define TEXDEF_OUTER_SHIFT 4 361 #define TEXDEF_OUTER_MASK 0x3 362 #define TEXDEF_NOS_SHIFT 6 363 #define TEXDEF_NOS_MASK 0x1 364 365 #define TEX(t, i, o, s) \ 366 ((t) << TEXDEF_TYPE_SHIFT) | \ 367 ((i) << TEXDEF_INNER_SHIFT) | \ 368 ((o) << TEXDEF_OUTER_SHIFT | \ 369 ((s) << TEXDEF_NOS_SHIFT)) 370 371 static uint32_t tex_class[8] = { 372 /* type inner cache outer cache */ 373 TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ 374 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ 375 TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ 376 TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ 377 TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ 378 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ 379 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ 380 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ 381 }; 382 #undef TEX 383 384 static uint32_t pte2_attr_tab[8] = { 385 PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ 386 PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ 387 PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ 388 PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ 389 PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 390 0, /* 5 - NOT USED YET */ 391 0, /* 6 - NOT USED YET */ 392 0 /* 7 - NOT USED YET */ 393 }; 394 CTASSERT(VM_MEMATTR_WB_WA == 0); 395 CTASSERT(VM_MEMATTR_NOCACHE == 1); 396 CTASSERT(VM_MEMATTR_DEVICE == 2); 397 CTASSERT(VM_MEMATTR_SO == 3); 398 CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); 399 #define VM_MEMATTR_END (VM_MEMATTR_WRITE_THROUGH + 1) 400 401 bool 402 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 403 { 404 405 return (mode >= 0 && mode < VM_MEMATTR_END); 406 } 407 408 static inline uint32_t 409 vm_memattr_to_pte2(vm_memattr_t ma) 410 { 411 412 
KASSERT((u_int)ma < VM_MEMATTR_END, 413 ("%s: bad vm_memattr_t %d", __func__, ma)); 414 return (pte2_attr_tab[(u_int)ma]); 415 } 416 417 static inline uint32_t 418 vm_page_pte2_attr(vm_page_t m) 419 { 420 421 return (vm_memattr_to_pte2(m->md.pat_mode)); 422 } 423 424 /* 425 * Convert TEX definition entry to TTB flags. 426 */ 427 static uint32_t 428 encode_ttb_flags(int idx) 429 { 430 uint32_t inner, outer, nos, reg; 431 432 inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & 433 TEXDEF_INNER_MASK; 434 outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & 435 TEXDEF_OUTER_MASK; 436 nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & 437 TEXDEF_NOS_MASK; 438 439 reg = nos << 5; 440 reg |= outer << 3; 441 if (cpuinfo.coherent_walk) 442 reg |= (inner & 0x1) << 6; 443 reg |= (inner & 0x2) >> 1; 444 #ifdef SMP 445 ARM_SMP_UP( 446 reg |= 1 << 1, 447 ); 448 #endif 449 return reg; 450 } 451 452 /* 453 * Set TEX remapping registers in current CPU. 454 */ 455 void 456 pmap_set_tex(void) 457 { 458 uint32_t prrr, nmrr; 459 uint32_t type, inner, outer, nos; 460 int i; 461 462 #ifdef PMAP_PTE_NOCACHE 463 /* XXX fixme */ 464 if (cpuinfo.coherent_walk) { 465 pt_memattr = VM_MEMATTR_WB_WA; 466 ttb_flags = encode_ttb_flags(0); 467 } 468 else { 469 pt_memattr = VM_MEMATTR_NOCACHE; 470 ttb_flags = encode_ttb_flags(1); 471 } 472 #else 473 pt_memattr = VM_MEMATTR_WB_WA; 474 ttb_flags = encode_ttb_flags(0); 475 #endif 476 477 prrr = 0; 478 nmrr = 0; 479 480 /* Build remapping register from TEX classes. */ 481 for (i = 0; i < 8; i++) { 482 type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & 483 TEXDEF_TYPE_MASK; 484 inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & 485 TEXDEF_INNER_MASK; 486 outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & 487 TEXDEF_OUTER_MASK; 488 nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & 489 TEXDEF_NOS_MASK; 490 491 prrr |= type << (i * 2); 492 prrr |= nos << (i + 24); 493 nmrr |= inner << (i * 2); 494 nmrr |= outer << (i * 2 + 16); 495 } 496 /* Add shareable bits for device memory. 
*/ 497 prrr |= PRRR_DS0 | PRRR_DS1; 498 499 /* Add shareable bits for normal memory in SMP case. */ 500 #ifdef SMP 501 ARM_SMP_UP( 502 prrr |= PRRR_NS1, 503 ); 504 #endif 505 cp15_prrr_set(prrr); 506 cp15_nmrr_set(nmrr); 507 508 /* Caches are disabled, so full TLB flush should be enough. */ 509 tlb_flush_all_local(); 510 } 511 512 /* 513 * Remap one vm_meattr class to another one. This can be useful as 514 * workaround for SOC errata, e.g. if devices must be accessed using 515 * SO memory class. 516 * 517 * !!! Please note that this function is absolutely last resort thing. 518 * It should not be used under normal circumstances. !!! 519 * 520 * Usage rules: 521 * - it shall be called after pmap_bootstrap_prepare() and before 522 * cpu_mp_start() (thus only on boot CPU). In practice, it's expected 523 * to be called from platform_attach() or platform_late_init(). 524 * 525 * - if remapping doesn't change caching mode, or until uncached class 526 * is remapped to any kind of cached one, then no other restriction exists. 527 * 528 * - if pmap_remap_vm_attr() changes caching mode, but both (original and 529 * remapped) remain cached, then caller is resposible for calling 530 * of dcache_wbinv_poc_all(). 531 * 532 * - remapping of any kind of cached class to uncached is not permitted. 533 */ 534 void 535 pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr) 536 { 537 int old_idx, new_idx; 538 539 /* Map VM memattrs to indexes to tex_class table. */ 540 old_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)old_attr]); 541 new_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)new_attr]); 542 543 /* Replace TEX attribute and apply it. */ 544 tex_class[old_idx] = tex_class[new_idx]; 545 pmap_set_tex(); 546 } 547 548 /* 549 * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words, 550 * KERNBASE is mapped by first L2 page table in L2 page table page. It 551 * meets same constrain due to PT2MAP being placed just under KERNBASE. 
552 */ 553 CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0); 554 CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE); 555 556 /* 557 * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general. 558 * For now, anyhow, the following check must be fulfilled. 559 */ 560 CTASSERT(PAGE_SIZE == PTE2_SIZE); 561 /* 562 * We don't want to mess up MI code with all MMU and PMAP definitions, 563 * so some things, which depend on other ones, are defined independently. 564 * Now, it is time to check that we don't screw up something. 565 */ 566 CTASSERT(PDR_SHIFT == PTE1_SHIFT); 567 /* 568 * Check L1 and L2 page table entries definitions consistency. 569 */ 570 CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1)); 571 CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2)); 572 /* 573 * Check L2 page tables page consistency. 574 */ 575 CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2)); 576 CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG); 577 /* 578 * Check PT2TAB consistency. 579 * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG. 580 * This should be done without remainder. 581 */ 582 CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG)); 583 584 /* 585 * A PT2MAP magic. 586 * 587 * All level 2 page tables (PT2s) are mapped continuously and accordingly 588 * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can 589 * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page 590 * must be used together, but not necessary at once. The first PT2 in a page 591 * must map things on correctly aligned address and the others must follow 592 * in right order. 593 */ 594 #define NB_IN_PT2TAB (PT2TAB_ENTRIES * sizeof(pt2_entry_t)) 595 #define NPT2_IN_PT2TAB (NB_IN_PT2TAB / NB_IN_PT2) 596 #define NPG_IN_PT2TAB (NB_IN_PT2TAB / PAGE_SIZE) 597 598 /* 599 * Check PT2TAB consistency. 600 * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2. 
601 * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE. 602 * The both should be done without remainder. 603 */ 604 CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2)); 605 CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE)); 606 /* 607 * The implementation was made general, however, with the assumption 608 * bellow in mind. In case of another value of NPG_IN_PT2TAB, 609 * the code should be once more rechecked. 610 */ 611 CTASSERT(NPG_IN_PT2TAB == 1); 612 613 /* 614 * Get offset of PT2 in a page 615 * associated with given PT1 index. 616 */ 617 static __inline u_int 618 page_pt2off(u_int pt1_idx) 619 { 620 621 return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2); 622 } 623 624 /* 625 * Get physical address of PT2 626 * associated with given PT2s page and PT1 index. 627 */ 628 static __inline vm_paddr_t 629 page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx) 630 { 631 632 return (pgpa + page_pt2off(pt1_idx)); 633 } 634 635 /* 636 * Get first entry of PT2 637 * associated with given PT2s page and PT1 index. 638 */ 639 static __inline pt2_entry_t * 640 page_pt2(vm_offset_t pgva, u_int pt1_idx) 641 { 642 643 return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx))); 644 } 645 646 /* 647 * Get virtual address of PT2s page (mapped in PT2MAP) 648 * which holds PT2 which holds entry which maps given virtual address. 
649 */ 650 static __inline vm_offset_t 651 pt2map_pt2pg(vm_offset_t va) 652 { 653 654 va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); 655 return ((vm_offset_t)pt2map_entry(va)); 656 } 657 658 /***************************************************************************** 659 * 660 * THREE pmap initialization milestones exist: 661 * 662 * locore.S 663 * -> fundamental init (including MMU) in ASM 664 * 665 * initarm() 666 * -> fundamental init continues in C 667 * -> first available physical address is known 668 * 669 * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) 670 * -> basic (safe) interface for physical address allocation is made 671 * -> basic (safe) interface for virtual mapping is made 672 * -> limited not SMP coherent work is possible 673 * 674 * -> more fundamental init continues in C 675 * -> locks and some more things are available 676 * -> all fundamental allocations and mappings are done 677 * 678 * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) 679 * -> phys_avail[] and virtual_avail is set 680 * -> control is passed to vm subsystem 681 * -> physical and virtual address allocation are off limit 682 * -> low level mapping functions, some SMP coherent, 683 * are available, which cannot be used before vm subsystem 684 * is being inited 685 * 686 * mi_startup() 687 * -> vm subsystem is being inited 688 * 689 * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) 690 * -> pmap is fully inited 691 * 692 *****************************************************************************/ 693 694 /***************************************************************************** 695 * 696 * PMAP first stage initialization and utility functions 697 * for pre-bootstrap epoch. 
698 * 699 * After pmap_bootstrap_prepare() is called, the following functions 700 * can be used: 701 * 702 * (1) strictly only for this stage functions for physical page allocations, 703 * virtual space allocations, and mappings: 704 * 705 * vm_paddr_t pmap_preboot_get_pages(u_int num); 706 * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); 707 * vm_offset_t pmap_preboot_reserve_pages(u_int num); 708 * vm_offset_t pmap_preboot_get_vpages(u_int num); 709 * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 710 * vm_prot_t prot, vm_memattr_t attr); 711 * 712 * (2) for all stages: 713 * 714 * vm_paddr_t pmap_kextract(vm_offset_t va); 715 * 716 * NOTE: This is not SMP coherent stage. 717 * 718 *****************************************************************************/ 719 720 #define KERNEL_P2V(pa) \ 721 ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) 722 #define KERNEL_V2P(va) \ 723 ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) 724 725 static vm_paddr_t last_paddr; 726 727 /* 728 * Pre-bootstrap epoch page allocator. 729 */ 730 vm_paddr_t 731 pmap_preboot_get_pages(u_int num) 732 { 733 vm_paddr_t ret; 734 735 ret = last_paddr; 736 last_paddr += num * PAGE_SIZE; 737 738 return (ret); 739 } 740 741 /* 742 * The fundamental initialization of PMAP stuff. 743 * 744 * Some things already happened in locore.S and some things could happen 745 * before pmap_bootstrap_prepare() is called, so let's recall what is done: 746 * 1. Caches are disabled. 747 * 2. We are running on virtual addresses already with 'boot_pt1' 748 * as L1 page table. 749 * 3. So far, all virtual addresses can be converted to physical ones and 750 * vice versa by the following macros: 751 * KERNEL_P2V(pa) .... physical to virtual ones, 752 * KERNEL_V2P(va) .... virtual to physical ones. 753 * 754 * What is done herein: 755 * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. 756 * 2. PT2MAP magic is brought to live. 
757 * 3. Basic preboot functions for page allocations and mappings can be used. 758 * 4. Everything is prepared for L1 cache enabling. 759 * 760 * Variations: 761 * 1. To use second TTB register, so kernel and users page tables will be 762 * separated. This way process forking - pmap_pinit() - could be faster, 763 * it saves physical pages and KVA per a process, and it's simple change. 764 * However, it will lead, due to hardware matter, to the following: 765 * (a) 2G space for kernel and 2G space for users. 766 * (b) 1G space for kernel in low addresses and 3G for users above it. 767 * A question is: Is the case (b) really an option? Note that case (b) 768 * does save neither physical memory and KVA. 769 */ 770 void 771 pmap_bootstrap_prepare(vm_paddr_t last) 772 { 773 vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; 774 vm_offset_t pt2pg_va; 775 pt1_entry_t *pte1p; 776 pt2_entry_t *pte2p; 777 u_int i; 778 uint32_t l1_attr; 779 780 /* 781 * Now, we are going to make real kernel mapping. Note that we are 782 * already running on some mapping made in locore.S and we expect 783 * that it's large enough to ensure nofault access to physical memory 784 * allocated herein before switch. 785 * 786 * As kernel image and everything needed before are and will be mapped 787 * by section mappings, we align last physical address to PTE1_SIZE. 788 */ 789 last_paddr = pte1_roundup(last); 790 791 /* 792 * Allocate and zero page(s) for kernel L1 page table. 793 * 794 * Note that it's first allocation on space which was PTE1_SIZE 795 * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. 796 */ 797 base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); 798 kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); 799 bzero((void*)kern_pt1, NB_IN_PT1); 800 pte1_sync_range(kern_pt1, NB_IN_PT1); 801 802 /* Allocate and zero page(s) for kernel PT2TAB. 
*/ 803 pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); 804 kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); 805 bzero(kern_pt2tab, NB_IN_PT2TAB); 806 pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); 807 808 /* Allocate and zero page(s) for kernel L2 page tables. */ 809 pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); 810 pt2pg_va = KERNEL_P2V(pt2pg_pa); 811 size = NKPT2PG * PAGE_SIZE; 812 bzero((void*)pt2pg_va, size); 813 pte2_sync_range((pt2_entry_t *)pt2pg_va, size); 814 815 /* 816 * Add a physical memory segment (vm_phys_seg) corresponding to the 817 * preallocated pages for kernel L2 page tables so that vm_page 818 * structures representing these pages will be created. The vm_page 819 * structures are required for promotion of the corresponding kernel 820 * virtual addresses to section mappings. 821 */ 822 vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); 823 824 /* 825 * Insert allocated L2 page table pages to PT2TAB and make 826 * link to all PT2s in L1 page table. See how kernel_vm_end 827 * is initialized. 828 * 829 * We play simple and safe. So every KVA will have underlaying 830 * L2 page table, even kernel image mapped by sections. 831 */ 832 pte2p = kern_pt2tab_entry(KERNBASE); 833 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) 834 pt2tab_store(pte2p++, PTE2_KPT(pa)); 835 836 pte1p = kern_pte1(KERNBASE); 837 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) 838 pte1_store(pte1p++, PTE1_LINK(pa)); 839 840 /* Make section mappings for kernel. */ 841 l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); 842 pte1p = kern_pte1(KERNBASE); 843 for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) 844 pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); 845 846 /* 847 * Get free and aligned space for PT2MAP and make L1 page table links 848 * to L2 page tables held in PT2TAB. 849 * 850 * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t 851 * descriptors and PT2TAB page(s) itself is(are) used as PT2s. 
Thus 852 * each entry in PT2TAB maps all PT2s in a page. This implies that 853 * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. 854 */ 855 PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); 856 pte1p = kern_pte1((vm_offset_t)PT2MAP); 857 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 858 pte1_store(pte1p++, PTE1_LINK(pa)); 859 } 860 861 /* 862 * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. 863 * Each pmap will hold own PT2TAB, so the mapping should be not global. 864 */ 865 pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); 866 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 867 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 868 } 869 870 /* 871 * Choose correct L2 page table and make mappings for allocations 872 * made herein which replaces temporary locore.S mappings after a while. 873 * Note that PT2MAP cannot be used until we switch to kern_pt1. 874 * 875 * Note, that these allocations started aligned on 1M section and 876 * kernel PT1 was allocated first. Making of mappings must follow 877 * order of physical allocations as we've used KERNEL_P2V() macro 878 * for virtual addresses resolution. 879 */ 880 pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); 881 pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); 882 883 pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); 884 885 /* Make mapping for kernel L1 page table. */ 886 for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) 887 pte2_store(pte2p++, PTE2_KPT(pa)); 888 889 /* Make mapping for kernel PT2TAB. */ 890 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) 891 pte2_store(pte2p++, PTE2_KPT(pa)); 892 893 /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ 894 pmap_kern_ttb = base_pt1 | ttb_flags; 895 cpuinfo_reinit_mmu(pmap_kern_ttb); 896 /* 897 * Initialize the first available KVA. As kernel image is mapped by 898 * sections, we are leaving some gap behind. 
899 */ 900 virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; 901 } 902 903 /* 904 * Setup L2 page table page for given KVA. 905 * Used in pre-bootstrap epoch. 906 * 907 * Note that we have allocated NKPT2PG pages for L2 page tables in advance 908 * and used them for mapping KVA starting from KERNBASE. However, this is not 909 * enough. Vectors and devices need L2 page tables too. Note that they are 910 * even above VM_MAX_KERNEL_ADDRESS. 911 */ 912 static __inline vm_paddr_t 913 pmap_preboot_pt2pg_setup(vm_offset_t va) 914 { 915 pt2_entry_t *pte2p, pte2; 916 vm_paddr_t pt2pg_pa; 917 918 /* Get associated entry in PT2TAB. */ 919 pte2p = kern_pt2tab_entry(va); 920 921 /* Just return, if PT2s page exists already. */ 922 pte2 = pt2tab_load(pte2p); 923 if (pte2_is_valid(pte2)) 924 return (pte2_pa(pte2)); 925 926 KASSERT(va >= VM_MAX_KERNEL_ADDRESS, 927 ("%s: NKPT2PG too small", __func__)); 928 929 /* 930 * Allocate page for PT2s and insert it to PT2TAB. 931 * In other words, map it into PT2MAP space. 932 */ 933 pt2pg_pa = pmap_preboot_get_pages(1); 934 pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); 935 936 /* Zero all PT2s in allocated page. */ 937 bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); 938 pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); 939 940 return (pt2pg_pa); 941 } 942 943 /* 944 * Setup L2 page table for given KVA. 945 * Used in pre-bootstrap epoch. 946 */ 947 static void 948 pmap_preboot_pt2_setup(vm_offset_t va) 949 { 950 pt1_entry_t *pte1p; 951 vm_paddr_t pt2pg_pa, pt2_pa; 952 953 /* Setup PT2's page. */ 954 pt2pg_pa = pmap_preboot_pt2pg_setup(va); 955 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); 956 957 /* Insert PT2 to PT1. */ 958 pte1p = kern_pte1(va); 959 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 960 } 961 962 /* 963 * Get L2 page entry associated with given KVA. 964 * Used in pre-bootstrap epoch. 
965 */ 966 static __inline pt2_entry_t* 967 pmap_preboot_vtopte2(vm_offset_t va) 968 { 969 pt1_entry_t *pte1p; 970 971 /* Setup PT2 if needed. */ 972 pte1p = kern_pte1(va); 973 if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ 974 pmap_preboot_pt2_setup(va); 975 976 return (pt2map_entry(va)); 977 } 978 979 /* 980 * Pre-bootstrap epoch page(s) mapping(s). 981 */ 982 void 983 pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) 984 { 985 u_int i; 986 pt2_entry_t *pte2p; 987 988 /* Map all the pages. */ 989 for (i = 0; i < num; i++) { 990 pte2p = pmap_preboot_vtopte2(va); 991 pte2_store(pte2p, PTE2_KRW(pa)); 992 va += PAGE_SIZE; 993 pa += PAGE_SIZE; 994 } 995 } 996 997 /* 998 * Pre-bootstrap epoch virtual space alocator. 999 */ 1000 vm_offset_t 1001 pmap_preboot_reserve_pages(u_int num) 1002 { 1003 u_int i; 1004 vm_offset_t start, va; 1005 pt2_entry_t *pte2p; 1006 1007 /* Allocate virtual space. */ 1008 start = va = virtual_avail; 1009 virtual_avail += num * PAGE_SIZE; 1010 1011 /* Zero the mapping. */ 1012 for (i = 0; i < num; i++) { 1013 pte2p = pmap_preboot_vtopte2(va); 1014 pte2_store(pte2p, 0); 1015 va += PAGE_SIZE; 1016 } 1017 1018 return (start); 1019 } 1020 1021 /* 1022 * Pre-bootstrap epoch page(s) allocation and mapping(s). 1023 */ 1024 vm_offset_t 1025 pmap_preboot_get_vpages(u_int num) 1026 { 1027 vm_paddr_t pa; 1028 vm_offset_t va; 1029 1030 /* Allocate physical page(s). */ 1031 pa = pmap_preboot_get_pages(num); 1032 1033 /* Allocate virtual space. */ 1034 va = virtual_avail; 1035 virtual_avail += num * PAGE_SIZE; 1036 1037 /* Map and zero all. */ 1038 pmap_preboot_map_pages(pa, va, num); 1039 bzero((void *)va, num * PAGE_SIZE); 1040 1041 return (va); 1042 } 1043 1044 /* 1045 * Pre-bootstrap epoch page mapping(s) with attributes. 
1046 */ 1047 void 1048 pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 1049 vm_prot_t prot, vm_memattr_t attr) 1050 { 1051 u_int num; 1052 u_int l1_attr, l1_prot, l2_prot, l2_attr; 1053 pt1_entry_t *pte1p; 1054 pt2_entry_t *pte2p; 1055 1056 l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; 1057 l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1058 l2_attr = vm_memattr_to_pte2(attr); 1059 l1_prot = ATTR_TO_L1(l2_prot); 1060 l1_attr = ATTR_TO_L1(l2_attr); 1061 1062 /* Map all the pages. */ 1063 num = round_page(size); 1064 while (num > 0) { 1065 if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { 1066 pte1p = kern_pte1(va); 1067 pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); 1068 va += PTE1_SIZE; 1069 pa += PTE1_SIZE; 1070 num -= PTE1_SIZE; 1071 } else { 1072 pte2p = pmap_preboot_vtopte2(va); 1073 pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); 1074 va += PAGE_SIZE; 1075 pa += PAGE_SIZE; 1076 num -= PAGE_SIZE; 1077 } 1078 } 1079 } 1080 1081 /* 1082 * Extract from the kernel page table the physical address 1083 * that is mapped by the given virtual address "va". 1084 */ 1085 vm_paddr_t 1086 pmap_kextract(vm_offset_t va) 1087 { 1088 vm_paddr_t pa; 1089 pt1_entry_t pte1; 1090 pt2_entry_t pte2; 1091 1092 pte1 = pte1_load(kern_pte1(va)); 1093 if (pte1_is_section(pte1)) { 1094 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1095 } else if (pte1_is_link(pte1)) { 1096 /* 1097 * We should beware of concurrent promotion that changes 1098 * pte1 at this point. However, it's not a problem as PT2 1099 * page is preserved by promotion in PT2TAB. So even if 1100 * it happens, using of PT2MAP is still safe. 1101 * 1102 * QQQ: However, concurrent removing is a problem which 1103 * ends in abort on PT2MAP space. Locking must be used 1104 * to deal with this. 
1105 */ 1106 pte2 = pte2_load(pt2map_entry(va)); 1107 pa = pte2_pa(pte2) | (va & PTE2_OFFSET); 1108 } 1109 else { 1110 panic("%s: va %#x pte1 %#x", __func__, va, pte1); 1111 } 1112 return (pa); 1113 } 1114 1115 /* 1116 * Extract from the kernel page table the physical address 1117 * that is mapped by the given virtual address "va". Also 1118 * return L2 page table entry which maps the address. 1119 * 1120 * This is only intended to be used for panic dumps. 1121 */ 1122 vm_paddr_t 1123 pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) 1124 { 1125 vm_paddr_t pa; 1126 pt1_entry_t pte1; 1127 pt2_entry_t pte2; 1128 1129 pte1 = pte1_load(kern_pte1(va)); 1130 if (pte1_is_section(pte1)) { 1131 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1132 pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; 1133 } else if (pte1_is_link(pte1)) { 1134 pte2 = pte2_load(pt2map_entry(va)); 1135 pa = pte2_pa(pte2); 1136 } else { 1137 pte2 = 0; 1138 pa = 0; 1139 } 1140 if (pte2p != NULL) 1141 *pte2p = pte2; 1142 return (pa); 1143 } 1144 1145 /***************************************************************************** 1146 * 1147 * PMAP second stage initialization and utility functions 1148 * for bootstrap epoch. 1149 * 1150 * After pmap_bootstrap() is called, the following functions for 1151 * mappings can be used: 1152 * 1153 * void pmap_kenter(vm_offset_t va, vm_size_t size, vm_paddr_t pa, int mode); 1154 * void pmap_kremove(vm_offset_t va); 1155 * void *pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, 1156 * int prot); 1157 * 1158 * NOTE: This is not SMP coherent stage. And physical page allocation is not 1159 * allowed during this stage. 1160 * 1161 *****************************************************************************/ 1162 1163 /* 1164 * Initialize kernel PMAP locks and lists, kernel_pmap itself, and 1165 * reserve various virtual spaces for temporary mappings. 
1166 */ 1167 void 1168 pmap_bootstrap(vm_offset_t firstaddr) 1169 { 1170 pt2_entry_t *unused __unused; 1171 struct pcpu *pc; 1172 1173 /* 1174 * Initialize the kernel pmap (which is statically allocated). 1175 */ 1176 mtx_init(&kernel_pmap->pm_mtx, "kernel pmap", NULL, MTX_DEF); 1177 kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ 1178 kernel_pmap->pm_pt1 = kern_pt1; 1179 kernel_pmap->pm_pt2tab = kern_pt2tab; 1180 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1181 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1182 1183 /* 1184 * Initialize the global pv list lock. 1185 */ 1186 rw_init(&pvh_global_lock, "pmap pv global"); 1187 1188 LIST_INIT(&allpmaps); 1189 1190 /* 1191 * Request a spin mutex so that changes to allpmaps cannot be 1192 * preempted by smp_rendezvous_cpus(). 1193 */ 1194 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 1195 mtx_lock_spin(&allpmaps_lock); 1196 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 1197 mtx_unlock_spin(&allpmaps_lock); 1198 1199 /* 1200 * Reserve some special page table entries/VA space for temporary 1201 * mapping of pages. 1202 */ 1203 #define SYSMAP(c, p, v, n) do { \ 1204 v = (c)pmap_preboot_reserve_pages(n); \ 1205 p = pt2map_entry((vm_offset_t)v); \ 1206 } while (0) 1207 1208 /* 1209 * Local CMAP1/CMAP2 are used for zeroing and copying pages. 1210 * Local CMAP2 is also used for data cache cleaning. 1211 */ 1212 pc = get_pcpu(); 1213 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1214 SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); 1215 SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); 1216 SYSMAP(caddr_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); 1217 1218 /* 1219 * Crashdump maps. 1220 */ 1221 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); 1222 1223 /* 1224 * _tmppt is used for reading arbitrary physical pages via /dev/mem. 
1225 */ 1226 SYSMAP(caddr_t, unused, _tmppt, 1); 1227 1228 /* 1229 * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), 1230 * respectively. PADDR3 is used by pmap_pte2_ddb(). 1231 */ 1232 SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); 1233 SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); 1234 #ifdef DDB 1235 SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); 1236 #endif 1237 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 1238 1239 /* 1240 * Note that in very short time in initarm(), we are going to 1241 * initialize phys_avail[] array and no further page allocation 1242 * can happen after that until vm subsystem will be initialized. 1243 */ 1244 kernel_vm_end_new = kernel_vm_end; 1245 virtual_end = vm_max_kernel_address; 1246 } 1247 1248 static void 1249 pmap_init_reserved_pages(void *dummy __unused) 1250 { 1251 struct pcpu *pc; 1252 char *pages; 1253 int i; 1254 1255 CPU_FOREACH(i) { 1256 pc = pcpu_find(i); 1257 /* 1258 * Skip if the mapping has already been initialized, 1259 * i.e. this is the BSP. 1260 */ 1261 if (pc->pc_cmap1_addr != 0) 1262 continue; 1263 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1264 pages = kva_alloc(PAGE_SIZE * 3); 1265 if (pages == NULL) 1266 panic("%s: unable to allocate KVA", __func__); 1267 pc->pc_cmap1_pte2p = pt2map_entry((vm_offset_t)pages); 1268 pc->pc_cmap2_pte2p = pt2map_entry((vm_offset_t)pages + PAGE_SIZE); 1269 pc->pc_qmap_pte2p = pt2map_entry((vm_offset_t)pages + (PAGE_SIZE * 2)); 1270 pc->pc_cmap1_addr = pages; 1271 pc->pc_cmap2_addr = pages + PAGE_SIZE; 1272 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); 1273 } 1274 } 1275 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 1276 1277 /* 1278 * The function can already be use in second initialization stage. 1279 * As such, the function DOES NOT call pmap_growkernel() where PT2 1280 * allocation can happen. So if used, be sure that PT2 for given 1281 * virtual address is allocated already! 1282 * 1283 * Add a wired page to the kva. 
1284 * Note: not SMP coherent. 1285 */ 1286 static __inline void 1287 pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, 1288 uint32_t attr) 1289 { 1290 pt1_entry_t *pte1p; 1291 pt2_entry_t *pte2p; 1292 1293 pte1p = kern_pte1(va); 1294 if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ 1295 /* 1296 * This is a very low level function, so PT2 and particularly 1297 * PT2PG associated with given virtual address must be already 1298 * allocated. It's a pain mainly during pmap initialization 1299 * stage. However, called after pmap initialization with 1300 * virtual address not under kernel_vm_end will lead to 1301 * the same misery. 1302 */ 1303 if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) 1304 panic("%s: kernel PT2 not allocated!", __func__); 1305 } 1306 1307 pte2p = pt2map_entry(va); 1308 pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); 1309 } 1310 1311 static __inline void 1312 pmap_kenter_noflush(vm_offset_t va, vm_size_t size, vm_paddr_t pa, int mode) 1313 { 1314 uint32_t l2attr; 1315 1316 KASSERT((size & PAGE_MASK) == 0, 1317 ("%s: device mapping not page-sized", __func__)); 1318 1319 l2attr = vm_memattr_to_pte2(mode); 1320 while (size != 0) { 1321 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); 1322 va += PAGE_SIZE; 1323 pa += PAGE_SIZE; 1324 size -= PAGE_SIZE; 1325 } 1326 } 1327 1328 PMAP_INLINE void 1329 pmap_kenter(vm_offset_t va, vm_size_t size, vm_paddr_t pa, int mode) 1330 { 1331 pmap_kenter_noflush(va, size, pa, mode); 1332 tlb_flush_range(va, size); 1333 } 1334 1335 /* 1336 * Remove a page from the kernel pagetables. 1337 * Note: not SMP coherent. 1338 */ 1339 PMAP_INLINE void 1340 pmap_kremove(vm_offset_t va) 1341 { 1342 pt1_entry_t *pte1p; 1343 pt2_entry_t *pte2p; 1344 1345 pte1p = kern_pte1(va); 1346 if (pte1_is_section(pte1_load(pte1p))) { 1347 pte1_clear(pte1p); 1348 } else { 1349 pte2p = pt2map_entry(va); 1350 pte2_clear(pte2p); 1351 } 1352 } 1353 1354 /* 1355 * Share new kernel PT2PG with all pmaps. 
1356 * The caller is responsible for maintaining TLB consistency. 1357 */ 1358 static void 1359 pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) 1360 { 1361 pmap_t pmap; 1362 pt2_entry_t *pte2p; 1363 1364 mtx_lock_spin(&allpmaps_lock); 1365 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1366 pte2p = pmap_pt2tab_entry(pmap, va); 1367 pt2tab_store(pte2p, npte2); 1368 } 1369 mtx_unlock_spin(&allpmaps_lock); 1370 } 1371 1372 /* 1373 * Share new kernel PTE1 with all pmaps. 1374 * The caller is responsible for maintaining TLB consistency. 1375 */ 1376 static void 1377 pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) 1378 { 1379 pmap_t pmap; 1380 pt1_entry_t *pte1p; 1381 1382 mtx_lock_spin(&allpmaps_lock); 1383 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1384 pte1p = pmap_pte1(pmap, va); 1385 pte1_store(pte1p, npte1); 1386 } 1387 mtx_unlock_spin(&allpmaps_lock); 1388 } 1389 1390 /* 1391 * Used to map a range of physical addresses into kernel 1392 * virtual address space. 1393 * 1394 * The value passed in '*virt' is a suggested virtual address for 1395 * the mapping. Architectures which can support a direct-mapped 1396 * physical to virtual region can return the appropriate address 1397 * within that region, leaving '*virt' unchanged. Other 1398 * architectures should map the pages starting at '*virt' and 1399 * update '*virt' with the first usable address after the mapped 1400 * region. 1401 * 1402 * NOTE: Read the comments above pmap_kenter_prot_attr() as 1403 * the function is used herein! 1404 */ 1405 void * 1406 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1407 { 1408 vm_offset_t va, sva; 1409 vm_paddr_t pte1_offset; 1410 pt1_entry_t npte1; 1411 uint32_t l1prot, l2prot; 1412 uint32_t l1attr, l2attr; 1413 1414 PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," 1415 " prot = %d\n", __func__, *virt, start, end, end - start, prot)); 1416 1417 l2prot = (prot & VM_PROT_WRITE) ? 
PTE2_AP_KRW : PTE2_AP_KR; 1418 l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1419 l1prot = ATTR_TO_L1(l2prot); 1420 1421 l2attr = PTE2_ATTR_DEFAULT; 1422 l1attr = ATTR_TO_L1(l2attr); 1423 1424 va = *virt; 1425 /* 1426 * Does the physical address range's size and alignment permit at 1427 * least one section mapping to be created? 1428 */ 1429 pte1_offset = start & PTE1_OFFSET; 1430 if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= 1431 PTE1_SIZE) { 1432 /* 1433 * Increase the starting virtual address so that its alignment 1434 * does not preclude the use of section mappings. 1435 */ 1436 if ((va & PTE1_OFFSET) < pte1_offset) 1437 va = pte1_trunc(va) + pte1_offset; 1438 else if ((va & PTE1_OFFSET) > pte1_offset) 1439 va = pte1_roundup(va) + pte1_offset; 1440 } 1441 sva = va; 1442 while (start < end) { 1443 if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { 1444 KASSERT((va & PTE1_OFFSET) == 0, 1445 ("%s: misaligned va %#x", __func__, va)); 1446 npte1 = PTE1_KERN(start, l1prot, l1attr); 1447 pmap_kenter_pte1(va, npte1); 1448 va += PTE1_SIZE; 1449 start += PTE1_SIZE; 1450 } else { 1451 pmap_kenter_prot_attr(va, start, l2prot, l2attr); 1452 va += PAGE_SIZE; 1453 start += PAGE_SIZE; 1454 } 1455 } 1456 tlb_flush_range(sva, va - sva); 1457 *virt = va; 1458 return ((void *)sva); 1459 } 1460 1461 /* 1462 * Make a temporary mapping for a physical address. 1463 * This is only intended to be used for panic dumps. 1464 */ 1465 void * 1466 pmap_kenter_temporary(vm_paddr_t pa, int i) 1467 { 1468 vm_offset_t va; 1469 1470 /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ 1471 1472 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 1473 pmap_kenter_noflush(va, PAGE_SIZE, pa, VM_MEMATTR_DEFAULT); 1474 tlb_flush_local(va); 1475 return ((void *)crashdumpmap); 1476 } 1477 1478 /************************************* 1479 * 1480 * TLB & cache maintenance routines. 
1481 * 1482 *************************************/ 1483 1484 /* 1485 * We inline these within pmap.c for speed. 1486 */ 1487 PMAP_INLINE void 1488 pmap_tlb_flush(pmap_t pmap, vm_offset_t va) 1489 { 1490 1491 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1492 tlb_flush(va); 1493 } 1494 1495 PMAP_INLINE void 1496 pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) 1497 { 1498 1499 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1500 tlb_flush_range(sva, size); 1501 } 1502 1503 /* 1504 * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. 1505 * Requirements: 1506 * - Must deal with pages in order to ensure that none of the PTE2_* bits 1507 * are ever set, PTE2_V in particular. 1508 * - Assumes we can write to pte2s without pte2_store() atomic ops. 1509 * - Assumes nothing will ever test these addresses for 0 to indicate 1510 * no mapping instead of correctly checking PTE2_V. 1511 * - Assumes a vm_offset_t will fit in a pte2 (true for arm). 1512 * Because PTE2_V is never set, there can be no mappings to invalidate. 1513 */ 1514 static vm_offset_t 1515 pmap_pte2list_alloc(vm_offset_t *head) 1516 { 1517 pt2_entry_t *pte2p; 1518 vm_offset_t va; 1519 1520 va = *head; 1521 if (va == 0) 1522 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 1523 pte2p = pt2map_entry(va); 1524 *head = *pte2p; 1525 if (*head & PTE2_V) 1526 panic("%s: va with PTE2_V set!", __func__); 1527 *pte2p = 0; 1528 return (va); 1529 } 1530 1531 static void 1532 pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) 1533 { 1534 pt2_entry_t *pte2p; 1535 1536 if (va & PTE2_V) 1537 panic("%s: freeing va with PTE2_V set!", __func__); 1538 pte2p = pt2map_entry(va); 1539 *pte2p = *head; /* virtual! 
PTE2_V is 0 though */ 1540 *head = va; 1541 } 1542 1543 static void 1544 pmap_pte2list_init(vm_offset_t *head, void *base, int npages) 1545 { 1546 int i; 1547 vm_offset_t va; 1548 1549 *head = 0; 1550 for (i = npages - 1; i >= 0; i--) { 1551 va = (vm_offset_t)base + i * PAGE_SIZE; 1552 pmap_pte2list_free(head, va); 1553 } 1554 } 1555 1556 /***************************************************************************** 1557 * 1558 * PMAP third and final stage initialization. 1559 * 1560 * After pmap_init() is called, PMAP subsystem is fully initialized. 1561 * 1562 *****************************************************************************/ 1563 1564 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1565 "VM/pmap parameters"); 1566 1567 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 1568 "Max number of PV entries"); 1569 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 1570 "Page share factor per proc"); 1571 1572 static u_long nkpt2pg = NKPT2PG; 1573 SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, 1574 &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); 1575 1576 static int sp_enabled = 1; 1577 SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 1578 &sp_enabled, 0, "Are large page mappings enabled?"); 1579 1580 static int pmap_growkernel_panic = 0; 1581 SYSCTL_INT(_vm_pmap, OID_AUTO, growkernel_panic, CTLFLAG_RDTUN, 1582 &pmap_growkernel_panic, 0, 1583 "panic on failure to allocate kernel page table page"); 1584 1585 bool 1586 pmap_ps_enabled(pmap_t pmap __unused) 1587 { 1588 1589 return (sp_enabled != 0); 1590 } 1591 1592 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1593 "1MB page mapping counters"); 1594 1595 static u_long pmap_pte1_demotions; 1596 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, 1597 &pmap_pte1_demotions, 0, "1MB page demotions"); 1598 1599 static u_long pmap_pte1_mappings; 1600 
SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, 1601 &pmap_pte1_mappings, 0, "1MB page mappings"); 1602 1603 static u_long pmap_pte1_p_failures; 1604 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, 1605 &pmap_pte1_p_failures, 0, "1MB page promotion failures"); 1606 1607 static u_long pmap_pte1_promotions; 1608 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, 1609 &pmap_pte1_promotions, 0, "1MB page promotions"); 1610 1611 static u_long pmap_pte1_kern_demotions; 1612 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, 1613 &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); 1614 1615 static u_long pmap_pte1_kern_promotions; 1616 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, 1617 &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); 1618 1619 static __inline ttb_entry_t 1620 pmap_ttb_get(pmap_t pmap) 1621 { 1622 1623 return (vtophys(pmap->pm_pt1) | ttb_flags); 1624 } 1625 1626 /* 1627 * Initialize a vm_page's machine-dependent fields. 1628 * 1629 * Variations: 1630 * 1. Pages for L2 page tables are always not managed. So, pv_list and 1631 * pt2_wirecount can share same physical space. However, proper 1632 * initialization on a page alloc for page tables and reinitialization 1633 * on the page free must be ensured. 1634 */ 1635 void 1636 pmap_page_init(vm_page_t m) 1637 { 1638 1639 TAILQ_INIT(&m->md.pv_list); 1640 pt2_wirecount_init(m); 1641 m->md.pat_mode = VM_MEMATTR_DEFAULT; 1642 } 1643 1644 /* 1645 * Virtualization for faster way how to zero whole page. 1646 */ 1647 static __inline void 1648 pagezero(void *page) 1649 { 1650 1651 bzero(page, PAGE_SIZE); 1652 } 1653 1654 /* 1655 * Zero L2 page table page. 1656 * Use same KVA as in pmap_zero_page(). 
1657 */ 1658 static __inline vm_paddr_t 1659 pmap_pt2pg_zero(vm_page_t m) 1660 { 1661 pt2_entry_t *cmap2_pte2p; 1662 vm_paddr_t pa; 1663 struct pcpu *pc; 1664 1665 pa = VM_PAGE_TO_PHYS(m); 1666 1667 /* 1668 * XXX: For now, we map whole page even if it's already zero, 1669 * to sync it even if the sync is only DSB. 1670 */ 1671 sched_pin(); 1672 pc = get_pcpu(); 1673 cmap2_pte2p = pc->pc_cmap2_pte2p; 1674 mtx_lock(&pc->pc_cmap_lock); 1675 if (pte2_load(cmap2_pte2p) != 0) 1676 panic("%s: CMAP2 busy", __func__); 1677 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 1678 vm_page_pte2_attr(m))); 1679 /* Even VM_ALLOC_ZERO request is only advisory. */ 1680 if ((m->flags & PG_ZERO) == 0) 1681 pagezero(pc->pc_cmap2_addr); 1682 pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); 1683 pte2_clear(cmap2_pte2p); 1684 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 1685 1686 /* 1687 * Unpin the thread before releasing the lock. Otherwise the thread 1688 * could be rescheduled while still bound to the current CPU, only 1689 * to unpin itself immediately upon resuming execution. 1690 */ 1691 sched_unpin(); 1692 mtx_unlock(&pc->pc_cmap_lock); 1693 1694 return (pa); 1695 } 1696 1697 /* 1698 * Init just allocated page as L2 page table(s) holder 1699 * and return its physical address. 1700 */ 1701 static __inline vm_paddr_t 1702 pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) 1703 { 1704 vm_paddr_t pa; 1705 pt2_entry_t *pte2p; 1706 1707 /* Check page attributes. */ 1708 if (m->md.pat_mode != pt_memattr) 1709 pmap_page_set_memattr(m, pt_memattr); 1710 1711 /* Zero page and init wire counts. */ 1712 pa = pmap_pt2pg_zero(m); 1713 pt2_wirecount_init(m); 1714 1715 /* 1716 * Map page to PT2MAP address space for given pmap. 1717 * Note that PT2MAP space is shared with all pmaps. 
1718 */ 1719 if (pmap == kernel_pmap) 1720 pmap_kenter_pt2tab(va, PTE2_KPT(pa)); 1721 else { 1722 pte2p = pmap_pt2tab_entry(pmap, va); 1723 pt2tab_store(pte2p, PTE2_KPT_NG(pa)); 1724 } 1725 1726 return (pa); 1727 } 1728 1729 /* 1730 * Initialize the pmap module. 1731 * 1732 * Called by vm_mem_init(), to initialize any structures that the pmap system 1733 * needs to map virtual memory. 1734 */ 1735 void 1736 pmap_init(void) 1737 { 1738 vm_size_t s; 1739 pt2_entry_t *pte2p, pte2; 1740 u_int i, pte1_idx, pv_npg; 1741 1742 /* 1743 * Initialize the vm page array entries for kernel pmap's 1744 * L2 page table pages allocated in advance. 1745 */ 1746 pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); 1747 pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); 1748 for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { 1749 vm_paddr_t pa; 1750 vm_page_t m; 1751 1752 pte2 = pte2_load(pte2p); 1753 KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); 1754 1755 pa = pte2_pa(pte2); 1756 m = PHYS_TO_VM_PAGE(pa); 1757 KASSERT(m >= vm_page_array && 1758 m < &vm_page_array[vm_page_array_size], 1759 ("%s: L2 page table page is out of range", __func__)); 1760 1761 m->pindex = pte1_idx; 1762 m->phys_addr = pa; 1763 pte1_idx += NPT2_IN_PG; 1764 } 1765 1766 /* 1767 * Initialize the address space (zone) for the pv entries. Set a 1768 * high water mark so that the system can recover from excessive 1769 * numbers of pv entries. 1770 */ 1771 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1772 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 1773 TUNABLE_INT_FETCH("vm.pmap.pv_entry_max", &pv_entry_max); 1774 pv_entry_max = roundup(pv_entry_max, _NPCPV); 1775 pv_entry_high_water = 9 * (pv_entry_max / 10); 1776 1777 /* 1778 * Are large page mappings enabled? 
1779 */ 1780 TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); 1781 if (sp_enabled) { 1782 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1783 ("%s: can't assign to pagesizes[1]", __func__)); 1784 pagesizes[1] = PTE1_SIZE; 1785 } 1786 1787 /* 1788 * Calculate the size of the pv head table for sections. 1789 * Handle the possibility that "vm_phys_segs[...].end" is zero. 1790 * Note that the table is only for sections which could be promoted. 1791 */ 1792 first_managed_pa = pte1_trunc(vm_phys_segs[0].start); 1793 pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) 1794 - first_managed_pa) / PTE1_SIZE + 1; 1795 1796 /* 1797 * Allocate memory for the pv head table for sections. 1798 */ 1799 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1800 s = round_page(s); 1801 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 1802 for (i = 0; i < pv_npg; i++) 1803 TAILQ_INIT(&pv_table[i].pv_list); 1804 1805 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 1806 pv_chunkbase = kva_alloc(PAGE_SIZE * pv_maxchunks); 1807 if (pv_chunkbase == NULL) 1808 panic("%s: not enough kvm for pv chunks", __func__); 1809 pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 1810 } 1811 1812 /* 1813 * Add a list of wired pages to the kva 1814 * this routine is only used for temporary 1815 * kernel mappings that do not need to have 1816 * page modification or references recorded. 1817 * Note that old mappings are simply written 1818 * over. The page *must* be wired. 1819 * Note: SMP coherent. Uses a ranged shootdown IPI. 
1820 */ 1821 void 1822 pmap_qenter(void *va, vm_page_t *ma, int count) 1823 { 1824 vm_offset_t sva = (vm_offset_t)va; 1825 u_int anychanged; 1826 pt2_entry_t *epte2p, *pte2p, pte2; 1827 vm_page_t m; 1828 vm_paddr_t pa; 1829 1830 anychanged = 0; 1831 pte2p = pt2map_entry(sva); 1832 epte2p = pte2p + count; 1833 while (pte2p < epte2p) { 1834 m = *ma++; 1835 pa = VM_PAGE_TO_PHYS(m); 1836 pte2 = pte2_load(pte2p); 1837 if ((pte2_pa(pte2) != pa) || 1838 (pte2_attr(pte2) != vm_page_pte2_attr(m))) { 1839 anychanged++; 1840 pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, 1841 vm_page_pte2_attr(m))); 1842 } 1843 pte2p++; 1844 } 1845 if (__predict_false(anychanged)) 1846 tlb_flush_range(sva, count * PAGE_SIZE); 1847 } 1848 1849 /* 1850 * This routine tears out page mappings from the 1851 * kernel -- it is meant only for temporary mappings. 1852 * Note: SMP coherent. Uses a ranged shootdown IPI. 1853 */ 1854 void 1855 pmap_qremove(void *sva, int count) 1856 { 1857 vm_offset_t va; 1858 1859 va = (vm_offset_t)sva; 1860 while (count-- > 0) { 1861 pmap_kremove(va); 1862 va += PAGE_SIZE; 1863 } 1864 tlb_flush_range((vm_offset_t)sva, va - (vm_offset_t)sva); 1865 } 1866 1867 /* 1868 * Are we current address space or kernel? 1869 */ 1870 static __inline int 1871 pmap_is_current(pmap_t pmap) 1872 { 1873 1874 return (pmap == kernel_pmap || 1875 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); 1876 } 1877 1878 /* 1879 * If the given pmap is not the current or kernel pmap, the returned 1880 * pte2 must be released by passing it to pmap_pte2_release(). 1881 */ 1882 static pt2_entry_t * 1883 pmap_pte2(pmap_t pmap, vm_offset_t va) 1884 { 1885 pt1_entry_t pte1; 1886 vm_paddr_t pt2pg_pa; 1887 1888 pte1 = pte1_load(pmap_pte1(pmap, va)); 1889 if (pte1_is_section(pte1)) 1890 panic("%s: attempt to map PTE1", __func__); 1891 if (pte1_is_link(pte1)) { 1892 /* Are we current address space or kernel? 
*/ 1893 if (pmap_is_current(pmap)) 1894 return (pt2map_entry(va)); 1895 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1896 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1897 mtx_lock(&PMAP2mutex); 1898 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 1899 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 1900 tlb_flush((vm_offset_t)PADDR2); 1901 } 1902 return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1903 } 1904 return (NULL); 1905 } 1906 1907 /* 1908 * Releases a pte2 that was obtained from pmap_pte2(). 1909 * Be prepared for the pte2p being NULL. 1910 */ 1911 static __inline void 1912 pmap_pte2_release(pt2_entry_t *pte2p) 1913 { 1914 1915 if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { 1916 mtx_unlock(&PMAP2mutex); 1917 } 1918 } 1919 1920 /* 1921 * Super fast pmap_pte2 routine best used when scanning 1922 * the pv lists. This eliminates many coarse-grained 1923 * invltlb calls. Note that many of the pv list 1924 * scans are across different pmaps. It is very wasteful 1925 * to do an entire tlb flush for checking a single mapping. 1926 * 1927 * If the given pmap is not the current pmap, pvh_global_lock 1928 * must be held and curthread pinned to a CPU. 1929 */ 1930 static pt2_entry_t * 1931 pmap_pte2_quick(pmap_t pmap, vm_offset_t va) 1932 { 1933 pt1_entry_t pte1; 1934 vm_paddr_t pt2pg_pa; 1935 1936 pte1 = pte1_load(pmap_pte1(pmap, va)); 1937 if (pte1_is_section(pte1)) 1938 panic("%s: attempt to map PTE1", __func__); 1939 if (pte1_is_link(pte1)) { 1940 /* Are we current address space or kernel? */ 1941 if (pmap_is_current(pmap)) 1942 return (pt2map_entry(va)); 1943 rw_assert(&pvh_global_lock, RA_WLOCKED); 1944 KASSERT(curthread->td_pinned > 0, 1945 ("%s: curthread not pinned", __func__)); 1946 /* Note that L2 page table size is not equal to PAGE_SIZE. 
*/ 1947 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1948 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 1949 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 1950 #ifdef SMP 1951 PMAP1cpu = PCPU_GET(cpuid); 1952 #endif 1953 tlb_flush_local((vm_offset_t)PADDR1); 1954 PMAP1changed++; 1955 } else 1956 #ifdef SMP 1957 if (PMAP1cpu != PCPU_GET(cpuid)) { 1958 PMAP1cpu = PCPU_GET(cpuid); 1959 tlb_flush_local((vm_offset_t)PADDR1); 1960 PMAP1changedcpu++; 1961 } else 1962 #endif 1963 PMAP1unchanged++; 1964 return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1965 } 1966 return (NULL); 1967 } 1968 1969 /* 1970 * Routine: pmap_extract 1971 * Function: 1972 * Extract the physical page address associated 1973 * with the given map/virtual_address pair. 1974 */ 1975 vm_paddr_t 1976 pmap_extract(pmap_t pmap, vm_offset_t va) 1977 { 1978 vm_paddr_t pa; 1979 pt1_entry_t pte1; 1980 pt2_entry_t *pte2p; 1981 1982 PMAP_LOCK(pmap); 1983 pte1 = pte1_load(pmap_pte1(pmap, va)); 1984 if (pte1_is_section(pte1)) 1985 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1986 else if (pte1_is_link(pte1)) { 1987 pte2p = pmap_pte2(pmap, va); 1988 pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); 1989 pmap_pte2_release(pte2p); 1990 } else 1991 pa = 0; 1992 PMAP_UNLOCK(pmap); 1993 return (pa); 1994 } 1995 1996 /* 1997 * Routine: pmap_extract_and_hold 1998 * Function: 1999 * Atomically extract and hold the physical page 2000 * with the given pmap and virtual address pair 2001 * if that mapping permits the given protection. 
2002 */ 2003 vm_page_t 2004 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 2005 { 2006 vm_paddr_t pa; 2007 pt1_entry_t pte1; 2008 pt2_entry_t pte2, *pte2p; 2009 vm_page_t m; 2010 2011 m = NULL; 2012 PMAP_LOCK(pmap); 2013 pte1 = pte1_load(pmap_pte1(pmap, va)); 2014 if (pte1_is_section(pte1)) { 2015 if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { 2016 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 2017 m = PHYS_TO_VM_PAGE(pa); 2018 if (!vm_page_wire_mapped(m)) 2019 m = NULL; 2020 } 2021 } else if (pte1_is_link(pte1)) { 2022 pte2p = pmap_pte2(pmap, va); 2023 pte2 = pte2_load(pte2p); 2024 pmap_pte2_release(pte2p); 2025 if (pte2_is_valid(pte2) && 2026 (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { 2027 pa = pte2_pa(pte2); 2028 m = PHYS_TO_VM_PAGE(pa); 2029 if (!vm_page_wire_mapped(m)) 2030 m = NULL; 2031 } 2032 } 2033 PMAP_UNLOCK(pmap); 2034 return (m); 2035 } 2036 2037 /* 2038 * Grow the number of kernel L2 page table entries, if needed. 2039 */ 2040 static int 2041 pmap_growkernel_nopanic(vm_offset_t addr) 2042 { 2043 vm_page_t m; 2044 vm_paddr_t pt2pg_pa, pt2_pa; 2045 pt1_entry_t pte1; 2046 pt2_entry_t pte2; 2047 2048 PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); 2049 /* 2050 * All the time kernel_vm_end is first KVA for which underlying 2051 * L2 page table is either not allocated or linked from L1 page table 2052 * (not considering sections). Except for two possible cases: 2053 * 2054 * (1) in the very beginning as long as pmap_growkernel() was 2055 * not called, it could be first unused KVA (which is not 2056 * rounded up to PTE1_SIZE), 2057 * 2058 * (2) when all KVA space is mapped and vm_map_max(kernel_map) 2059 * address is not rounded up to PTE1_SIZE. (For example, 2060 * it could be 0xFFFFFFFF.) 
2061 */ 2062 kernel_vm_end = pte1_roundup(kernel_vm_end); 2063 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2064 addr = roundup2(addr, PTE1_SIZE); 2065 if (addr - 1 >= vm_map_max(kernel_map)) 2066 addr = vm_map_max(kernel_map); 2067 while (kernel_vm_end < addr) { 2068 pte1 = pte1_load(kern_pte1(kernel_vm_end)); 2069 if (pte1_is_valid(pte1)) { 2070 kernel_vm_end += PTE1_SIZE; 2071 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2072 kernel_vm_end = vm_map_max(kernel_map); 2073 break; 2074 } 2075 continue; 2076 } 2077 2078 /* 2079 * kernel_vm_end_new is used in pmap_pinit() when kernel 2080 * mappings are entered to new pmap all at once to avoid race 2081 * between pmap_kenter_pte1() and kernel_vm_end increase. 2082 * The same aplies to pmap_kenter_pt2tab(). 2083 */ 2084 kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; 2085 2086 pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); 2087 if (!pte2_is_valid(pte2)) { 2088 /* 2089 * Install new PT2s page into kernel PT2TAB. 2090 */ 2091 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 2092 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2093 if (m == NULL) 2094 return (KERN_RESOURCE_SHORTAGE); 2095 m->pindex = pte1_index(kernel_vm_end) & ~PT2PG_MASK; 2096 2097 /* 2098 * QQQ: To link all new L2 page tables from L1 page 2099 * table now and so pmap_kenter_pte1() them 2100 * at once together with pmap_kenter_pt2tab() 2101 * could be nice speed up. However, 2102 * pmap_growkernel() does not happen so often... 2103 * QQQ: The other TTBR is another option. 
2104 */ 2105 pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, 2106 m); 2107 } else 2108 pt2pg_pa = pte2_pa(pte2); 2109 2110 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); 2111 pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); 2112 2113 kernel_vm_end = kernel_vm_end_new; 2114 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2115 kernel_vm_end = vm_map_max(kernel_map); 2116 break; 2117 } 2118 } 2119 return (KERN_SUCCESS); 2120 } 2121 2122 int 2123 pmap_growkernel(vm_offset_t addr) 2124 { 2125 int rv; 2126 2127 rv = pmap_growkernel_nopanic(addr); 2128 if (rv != KERN_SUCCESS && pmap_growkernel_panic) 2129 panic("pmap_growkernel: no memory to grow kernel"); 2130 return (rv); 2131 } 2132 2133 static int 2134 kvm_size(SYSCTL_HANDLER_ARGS) 2135 { 2136 unsigned long ksize = vm_max_kernel_address - KERNBASE; 2137 2138 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2139 } 2140 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, 2141 CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_size, "IU", 2142 "Size of KVM"); 2143 2144 static int 2145 kvm_free(SYSCTL_HANDLER_ARGS) 2146 { 2147 unsigned long kfree = vm_max_kernel_address - kernel_vm_end; 2148 2149 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2150 } 2151 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, 2152 CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_free, "IU", 2153 "Amount of KVM free"); 2154 2155 /*********************************************** 2156 * 2157 * Pmap allocation/deallocation routines. 2158 * 2159 ***********************************************/ 2160 2161 /* 2162 * Initialize the pmap for proc0. 2163 */ 2164 void 2165 pmap_pinit0(pmap_t pmap) 2166 { 2167 PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); 2168 2169 PMAP_LOCK_INIT(pmap); 2170 2171 /* 2172 * Kernel page table directory and pmap stuff around is already 2173 * initialized, we are using it right now and here. So, finish 2174 * only PMAP structures initialization for process0 ... 
2175 * 2176 * Since the L1 page table and PT2TAB is shared with the kernel pmap, 2177 * which is already included in the list "allpmaps", this pmap does 2178 * not need to be inserted into that list. 2179 */ 2180 pmap->pm_pt1 = kern_pt1; 2181 pmap->pm_pt2tab = kern_pt2tab; 2182 CPU_ZERO(&pmap->pm_active); 2183 PCPU_SET(curpmap, pmap); 2184 TAILQ_INIT(&pmap->pm_pvchunk); 2185 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2186 CPU_SET(0, &pmap->pm_active); 2187 } 2188 2189 static __inline void 2190 pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, 2191 vm_offset_t eva) 2192 { 2193 u_int idx, count; 2194 2195 idx = pte1_index(sva); 2196 count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); 2197 memcpy(dpte1p + idx, spte1p + idx, count); 2198 } 2199 2200 static __inline void 2201 pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, 2202 vm_offset_t eva) 2203 { 2204 u_int idx, count; 2205 2206 idx = pt2tab_index(sva); 2207 count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); 2208 memcpy(dpte2p + idx, spte2p + idx, count); 2209 } 2210 2211 /* 2212 * Initialize a preallocated and zeroed pmap structure, 2213 * such as one in a vmspace structure. 2214 */ 2215 int 2216 pmap_pinit(pmap_t pmap) 2217 { 2218 pt1_entry_t *pte1p; 2219 pt2_entry_t *pte2p; 2220 vm_paddr_t pa, pt2tab_pa; 2221 u_int i; 2222 2223 PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, 2224 pmap->pm_pt1)); 2225 2226 /* 2227 * No need to allocate L2 page table space yet but we do need 2228 * a valid L1 page table and PT2TAB table. 2229 * 2230 * Install shared kernel mappings to these tables. It's a little 2231 * tricky as some parts of KVA are reserved for vectors, devices, 2232 * and whatever else. These parts are supposed to be above 2233 * vm_max_kernel_address. Thus two regions should be installed: 2234 * 2235 * (1) <KERNBASE, kernel_vm_end), 2236 * (2) <vm_max_kernel_address, 0xFFFFFFFF>. 
2237 * 2238 * QQQ: The second region should be stable enough to be installed 2239 * only once in time when the tables are allocated. 2240 * QQQ: Maybe copy of both regions at once could be faster ... 2241 * QQQ: Maybe the other TTBR is an option. 2242 * 2243 * Finally, install own PT2TAB table to these tables. 2244 */ 2245 2246 if (pmap->pm_pt1 == NULL) { 2247 pmap->pm_pt1 = kmem_alloc_contig(NB_IN_PT1, 2248 M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr); 2249 if (pmap->pm_pt1 == NULL) 2250 return (0); 2251 } 2252 if (pmap->pm_pt2tab == NULL) { 2253 /* 2254 * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page 2255 * only, what should be the only size for 32 bit systems, 2256 * then we could allocate it with vm_page_alloc() and all 2257 * the stuff needed as other L2 page table pages. 2258 * (2) Note that a process PT2TAB is special L2 page table 2259 * page. Its mapping in kernel_arena is permanent and can 2260 * be used no matter which process is current. Its mapping 2261 * in PT2MAP can be used only for current process. 2262 */ 2263 pmap->pm_pt2tab = kmem_alloc_attr(NB_IN_PT2TAB, 2264 M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); 2265 if (pmap->pm_pt2tab == NULL) { 2266 /* 2267 * QQQ: As struct pmap is allocated from UMA with 2268 * UMA_ZONE_NOFREE flag, it's important to leave 2269 * no allocation in pmap if initialization failed. 2270 */ 2271 kmem_free(pmap->pm_pt1, NB_IN_PT1); 2272 pmap->pm_pt1 = NULL; 2273 return (0); 2274 } 2275 /* 2276 * QQQ: Each L2 page table page vm_page_t has pindex set to 2277 * pte1 index of virtual address mapped by this page. 2278 * It's not valid for non kernel PT2TABs themselves. 2279 * The pindex of these pages can not be altered because 2280 * of the way how they are allocated now. However, it 2281 * should not be a problem. 2282 */ 2283 } 2284 2285 mtx_lock_spin(&allpmaps_lock); 2286 /* 2287 * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), 2288 * kernel_vm_end_new is used here instead of kernel_vm_end. 
2289 */ 2290 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, 2291 kernel_vm_end_new - 1); 2292 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 2293 0xFFFFFFFF); 2294 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, 2295 kernel_vm_end_new - 1); 2296 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 2297 0xFFFFFFFF); 2298 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 2299 mtx_unlock_spin(&allpmaps_lock); 2300 2301 /* 2302 * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. 2303 * I.e. self reference mapping. The PT2TAB is private, however mapped 2304 * into shared PT2MAP space, so the mapping should be not global. 2305 */ 2306 pt2tab_pa = vtophys(pmap->pm_pt2tab); 2307 pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); 2308 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 2309 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 2310 } 2311 2312 /* Insert PT2MAP PT2s into pmap PT1. */ 2313 pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); 2314 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 2315 pte1_store(pte1p++, PTE1_LINK(pa)); 2316 } 2317 2318 /* 2319 * Now synchronize new mapping which was made above. 2320 */ 2321 pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); 2322 pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); 2323 2324 CPU_ZERO(&pmap->pm_active); 2325 TAILQ_INIT(&pmap->pm_pvchunk); 2326 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2327 2328 return (1); 2329 } 2330 2331 #ifdef INVARIANTS 2332 static bool 2333 pt2tab_user_is_empty(pt2_entry_t *tab) 2334 { 2335 u_int i, end; 2336 2337 end = pt2tab_index(VM_MAXUSER_ADDRESS); 2338 for (i = 0; i < end; i++) 2339 if (tab[i] != 0) return (false); 2340 return (true); 2341 } 2342 #endif 2343 /* 2344 * Release any resources held by the given physical map. 2345 * Called when a pmap initialized by pmap_pinit is being released. 2346 * Should only be called if the map contains no valid mappings. 
2347 */ 2348 void 2349 pmap_release(pmap_t pmap) 2350 { 2351 #ifdef INVARIANTS 2352 vm_offset_t start, end; 2353 #endif 2354 KASSERT(pmap->pm_stats.resident_count == 0, 2355 ("%s: pmap resident count %ld != 0", __func__, 2356 pmap->pm_stats.resident_count)); 2357 KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), 2358 ("%s: has allocated user PT2(s)", __func__)); 2359 KASSERT(CPU_EMPTY(&pmap->pm_active), 2360 ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); 2361 2362 mtx_lock_spin(&allpmaps_lock); 2363 LIST_REMOVE(pmap, pm_list); 2364 mtx_unlock_spin(&allpmaps_lock); 2365 2366 #ifdef INVARIANTS 2367 start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); 2368 end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); 2369 bzero((char *)pmap->pm_pt1 + start, end - start); 2370 2371 start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); 2372 end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); 2373 bzero((char *)pmap->pm_pt2tab + start, end - start); 2374 #endif 2375 /* 2376 * We are leaving PT1 and PT2TAB allocated on released pmap, 2377 * so hopefully UMA vmspace_zone will always be inited with 2378 * UMA_ZONE_NOFREE flag. 2379 */ 2380 } 2381 2382 /********************************************************* 2383 * 2384 * L2 table pages and their pages management routines. 2385 * 2386 *********************************************************/ 2387 2388 /* 2389 * Virtual interface for L2 page table wire counting. 2390 * 2391 * Each L2 page table in a page has own counter which counts a number of 2392 * valid mappings in a table. Global page counter counts mappings in all 2393 * tables in a page plus a single itself mapping in PT2TAB. 2394 * 2395 * During a promotion we leave the associated L2 page table counter 2396 * untouched, so the table (strictly speaking a page which holds it) 2397 * is never freed if promoted. 
2398 * 2399 * If a page m->ref_count == 1 then no valid mappings exist in any L2 page 2400 * table in the page and the page itself is only mapped in PT2TAB. 2401 */ 2402 2403 static __inline void 2404 pt2_wirecount_init(vm_page_t m) 2405 { 2406 u_int i; 2407 2408 /* 2409 * Note: A page m is allocated with VM_ALLOC_WIRED flag and 2410 * m->ref_count should be already set correctly. 2411 * So, there is no need to set it again herein. 2412 */ 2413 for (i = 0; i < NPT2_IN_PG; i++) 2414 m->md.pt2_wirecount[i] = 0; 2415 } 2416 2417 static __inline void 2418 pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) 2419 { 2420 2421 /* 2422 * Note: A just modificated pte2 (i.e. already allocated) 2423 * is acquiring one extra reference which must be 2424 * explicitly cleared. It influences the KASSERTs herein. 2425 * All L2 page tables in a page always belong to the same 2426 * pmap, so we allow only one extra reference for the page. 2427 */ 2428 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), 2429 ("%s: PT2 is overflowing ...", __func__)); 2430 KASSERT(m->ref_count <= (NPTE2_IN_PG + 1), 2431 ("%s: PT2PG is overflowing ...", __func__)); 2432 2433 m->ref_count++; 2434 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; 2435 } 2436 2437 static __inline void 2438 pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) 2439 { 2440 2441 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, 2442 ("%s: PT2 is underflowing ...", __func__)); 2443 KASSERT(m->ref_count > 1, 2444 ("%s: PT2PG is underflowing ...", __func__)); 2445 2446 m->ref_count--; 2447 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; 2448 } 2449 2450 static __inline void 2451 pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) 2452 { 2453 2454 KASSERT(count <= NPTE2_IN_PT2, 2455 ("%s: invalid count %u", __func__, count)); 2456 KASSERT(m->ref_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], 2457 ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->ref_count, 2458 m->md.pt2_wirecount[pte1_idx & 
PT2PG_MASK])); 2459 2460 m->ref_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; 2461 m->ref_count += count; 2462 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; 2463 2464 KASSERT(m->ref_count <= (NPTE2_IN_PG + 1), 2465 ("%s: PT2PG is overflowed (%u) ...", __func__, m->ref_count)); 2466 } 2467 2468 static __inline uint32_t 2469 pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) 2470 { 2471 2472 return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); 2473 } 2474 2475 static __inline bool 2476 pt2_is_empty(vm_page_t m, vm_offset_t va) 2477 { 2478 2479 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); 2480 } 2481 2482 static __inline bool 2483 pt2_is_full(vm_page_t m, vm_offset_t va) 2484 { 2485 2486 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 2487 NPTE2_IN_PT2); 2488 } 2489 2490 static __inline bool 2491 pt2pg_is_empty(vm_page_t m) 2492 { 2493 2494 return (m->ref_count == 1); 2495 } 2496 2497 /* 2498 * This routine is called if the L2 page table 2499 * is not mapped correctly. 2500 */ 2501 static vm_page_t 2502 _pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2503 { 2504 uint32_t pte1_idx; 2505 pt1_entry_t *pte1p; 2506 pt2_entry_t pte2; 2507 vm_page_t m; 2508 vm_paddr_t pt2pg_pa, pt2_pa; 2509 2510 pte1_idx = pte1_index(va); 2511 pte1p = pmap->pm_pt1 + pte1_idx; 2512 2513 KASSERT(pte1_load(pte1p) == 0, 2514 ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, 2515 pte1_load(pte1p))); 2516 2517 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); 2518 if (!pte2_is_valid(pte2)) { 2519 /* 2520 * Install new PT2s page into pmap PT2TAB. 2521 */ 2522 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2523 if (m == NULL) { 2524 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 2525 PMAP_UNLOCK(pmap); 2526 rw_wunlock(&pvh_global_lock); 2527 vm_wait(NULL); 2528 rw_wlock(&pvh_global_lock); 2529 PMAP_LOCK(pmap); 2530 } 2531 2532 /* 2533 * Indicate the need to retry. While waiting, 2534 * the L2 page table page may have been allocated. 
2535 */ 2536 return (NULL); 2537 } 2538 m->pindex = pte1_idx & ~PT2PG_MASK; 2539 pmap->pm_stats.resident_count++; 2540 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 2541 } else { 2542 pt2pg_pa = pte2_pa(pte2); 2543 m = PHYS_TO_VM_PAGE(pt2pg_pa); 2544 } 2545 2546 pt2_wirecount_inc(m, pte1_idx); 2547 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 2548 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 2549 2550 return (m); 2551 } 2552 2553 static vm_page_t 2554 pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2555 { 2556 u_int pte1_idx; 2557 pt1_entry_t *pte1p, pte1; 2558 vm_page_t m; 2559 2560 pte1_idx = pte1_index(va); 2561 retry: 2562 pte1p = pmap->pm_pt1 + pte1_idx; 2563 pte1 = pte1_load(pte1p); 2564 2565 /* 2566 * This supports switching from a 1MB page to a 2567 * normal 4K page. 2568 */ 2569 if (pte1_is_section(pte1)) { 2570 (void)pmap_demote_pte1(pmap, pte1p, va); 2571 /* 2572 * Reload pte1 after demotion. 2573 * 2574 * Note: Demotion can even fail as either PT2 is not find for 2575 * the virtual address or PT2PG can not be allocated. 2576 */ 2577 pte1 = pte1_load(pte1p); 2578 } 2579 2580 /* 2581 * If the L2 page table page is mapped, we just increment the 2582 * hold count, and activate it. 2583 */ 2584 if (pte1_is_link(pte1)) { 2585 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2586 pt2_wirecount_inc(m, pte1_idx); 2587 } else { 2588 /* 2589 * Here if the PT2 isn't mapped, or if it has 2590 * been deallocated. 2591 */ 2592 m = _pmap_allocpte2(pmap, va, flags); 2593 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2594 goto retry; 2595 } 2596 2597 return (m); 2598 } 2599 2600 /* 2601 * Schedule the specified unused L2 page table page to be freed. Specifically, 2602 * add the page to the specified list of pages that will be released to the 2603 * physical memory manager after the TLB has been updated. 
2604 */ 2605 static __inline void 2606 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) 2607 { 2608 2609 /* 2610 * Put page on a list so that it is released after 2611 * *ALL* TLB shootdown is done 2612 */ 2613 #ifdef PMAP_DEBUG 2614 pmap_zero_page_check(m); 2615 #endif 2616 m->flags |= PG_ZERO; 2617 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2618 } 2619 2620 /* 2621 * Unwire L2 page tables page. 2622 */ 2623 static void 2624 pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) 2625 { 2626 pt1_entry_t *pte1p, opte1 __unused; 2627 pt2_entry_t *pte2p; 2628 uint32_t i; 2629 2630 KASSERT(pt2pg_is_empty(m), 2631 ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); 2632 2633 /* 2634 * Unmap all L2 page tables in the page from L1 page table. 2635 * 2636 * QQQ: Individual L2 page tables (except the last one) can be unmapped 2637 * earlier. However, we are doing that this way. 2638 */ 2639 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 2640 ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); 2641 pte1p = pmap->pm_pt1 + m->pindex; 2642 for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { 2643 KASSERT(m->md.pt2_wirecount[i] == 0, 2644 ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); 2645 opte1 = pte1_load(pte1p); 2646 if (pte1_is_link(opte1)) { 2647 pte1_clear(pte1p); 2648 /* 2649 * Flush intermediate TLB cache. 2650 */ 2651 pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); 2652 } 2653 #ifdef INVARIANTS 2654 else 2655 KASSERT((opte1 == 0) || pte1_is_section(opte1), 2656 ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, 2657 pmap, va, opte1, i)); 2658 #endif 2659 } 2660 2661 /* 2662 * Unmap the page from PT2TAB. 
2663 */ 2664 pte2p = pmap_pt2tab_entry(pmap, va); 2665 (void)pt2tab_load_clear(pte2p); 2666 pmap_tlb_flush(pmap, pt2map_pt2pg(va)); 2667 2668 m->ref_count = 0; 2669 pmap->pm_stats.resident_count--; 2670 2671 /* 2672 * This barrier is so that the ordinary store unmapping 2673 * the L2 page table page is globally performed before TLB shoot- 2674 * down is begun. 2675 */ 2676 wmb(); 2677 vm_wire_sub(1); 2678 } 2679 2680 /* 2681 * Decrements a L2 page table page's wire count, which is used to record the 2682 * number of valid page table entries within the page. If the wire count 2683 * drops to zero, then the page table page is unmapped. Returns true if the 2684 * page table page was unmapped and false otherwise. 2685 */ 2686 static __inline bool 2687 pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2688 { 2689 pt2_wirecount_dec(m, pte1_index(va)); 2690 if (pt2pg_is_empty(m)) { 2691 /* 2692 * QQQ: Wire count is zero, so whole page should be zero and 2693 * we can set PG_ZERO flag to it. 2694 * Note that when promotion is enabled, it takes some 2695 * more efforts. See pmap_unwire_pt2_all() below. 2696 */ 2697 pmap_unwire_pt2pg(pmap, va, m); 2698 pmap_add_delayed_free_list(m, free); 2699 return (true); 2700 } else 2701 return (false); 2702 } 2703 2704 /* 2705 * Drop a L2 page table page's wire count at once, which is used to record 2706 * the number of valid L2 page table entries within the page. If the wire 2707 * count drops to zero, then the L2 page table page is unmapped. 
2708 */ 2709 static __inline void 2710 pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, 2711 struct spglist *free) 2712 { 2713 u_int pte1_idx = pte1_index(va); 2714 2715 KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), 2716 ("%s: PT2 page's pindex is wrong", __func__)); 2717 KASSERT(m->ref_count > pt2_wirecount_get(m, pte1_idx), 2718 ("%s: bad pt2 wire count %u > %u", __func__, m->ref_count, 2719 pt2_wirecount_get(m, pte1_idx))); 2720 2721 /* 2722 * It's possible that the L2 page table was never used. 2723 * It happened in case that a section was created without promotion. 2724 */ 2725 if (pt2_is_full(m, va)) { 2726 pt2_wirecount_set(m, pte1_idx, 0); 2727 2728 /* 2729 * QQQ: We clear L2 page table now, so when L2 page table page 2730 * is going to be freed, we can set it PG_ZERO flag ... 2731 * This function is called only on section mappings, so 2732 * hopefully it's not to big overload. 2733 * 2734 * XXX: If pmap is current, existing PT2MAP mapping could be 2735 * used for zeroing. 2736 */ 2737 pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); 2738 } 2739 #ifdef INVARIANTS 2740 else 2741 KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", 2742 __func__, pt2_wirecount_get(m, pte1_idx))); 2743 #endif 2744 if (pt2pg_is_empty(m)) { 2745 pmap_unwire_pt2pg(pmap, va, m); 2746 pmap_add_delayed_free_list(m, free); 2747 } 2748 } 2749 2750 /* 2751 * After removing a L2 page table entry, this routine is used to 2752 * conditionally free the page, and manage the hold/wire counts. 2753 */ 2754 static bool 2755 pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) 2756 { 2757 pt1_entry_t pte1; 2758 vm_page_t mpte; 2759 2760 if (va >= VM_MAXUSER_ADDRESS) 2761 return (false); 2762 pte1 = pte1_load(pmap_pte1(pmap, va)); 2763 mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2764 return (pmap_unwire_pt2(pmap, va, mpte, free)); 2765 } 2766 2767 /************************************* 2768 * 2769 * Page management routines. 
2770 * 2771 *************************************/ 2772 2773 static const uint32_t pc_freemask[_NPCM] = { 2774 [0 ... _NPCM - 2] = PC_FREEN, 2775 [_NPCM - 1] = PC_FREEL 2776 }; 2777 2778 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2779 "Current number of pv entries"); 2780 2781 #ifdef PV_STATS 2782 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2783 2784 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2785 "Current number of pv entry chunks"); 2786 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2787 "Current number of pv entry chunks allocated"); 2788 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2789 "Current number of pv entry chunks frees"); 2790 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 2791 0, "Number of times tried to get a chunk page but failed."); 2792 2793 static long pv_entry_frees, pv_entry_allocs; 2794 static int pv_entry_spare; 2795 2796 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2797 "Current number of pv entry frees"); 2798 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 2799 0, "Current number of pv entry allocs"); 2800 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2801 "Current number of spare pv entries"); 2802 #endif 2803 2804 /* 2805 * Is given page managed? 
2806 */ 2807 static __inline bool 2808 is_managed(vm_paddr_t pa) 2809 { 2810 vm_page_t m; 2811 2812 m = PHYS_TO_VM_PAGE(pa); 2813 if (m == NULL) 2814 return (false); 2815 return ((m->oflags & VPO_UNMANAGED) == 0); 2816 } 2817 2818 static __inline bool 2819 pte1_is_managed(pt1_entry_t pte1) 2820 { 2821 2822 return (is_managed(pte1_pa(pte1))); 2823 } 2824 2825 static __inline bool 2826 pte2_is_managed(pt2_entry_t pte2) 2827 { 2828 2829 return (is_managed(pte2_pa(pte2))); 2830 } 2831 2832 /* 2833 * We are in a serious low memory condition. Resort to 2834 * drastic measures to free some pages so we can allocate 2835 * another pv entry chunk. 2836 */ 2837 static vm_page_t 2838 pmap_pv_reclaim(pmap_t locked_pmap) 2839 { 2840 struct pch newtail; 2841 struct pv_chunk *pc; 2842 struct md_page *pvh; 2843 pt1_entry_t *pte1p; 2844 pmap_t pmap; 2845 pt2_entry_t *pte2p, tpte2; 2846 pv_entry_t pv; 2847 vm_offset_t va; 2848 vm_page_t m, m_pc; 2849 struct spglist free; 2850 uint32_t inuse; 2851 int bit, field, freed; 2852 2853 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2854 pmap = NULL; 2855 m_pc = NULL; 2856 SLIST_INIT(&free); 2857 TAILQ_INIT(&newtail); 2858 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2859 SLIST_EMPTY(&free))) { 2860 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2861 if (pmap != pc->pc_pmap) { 2862 if (pmap != NULL) { 2863 if (pmap != locked_pmap) 2864 PMAP_UNLOCK(pmap); 2865 } 2866 pmap = pc->pc_pmap; 2867 /* Avoid deadlock and lock recursion. */ 2868 if (pmap > locked_pmap) 2869 PMAP_LOCK(pmap); 2870 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2871 pmap = NULL; 2872 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2873 continue; 2874 } 2875 } 2876 2877 /* 2878 * Destroy every non-wired, 4 KB page mapping in the chunk. 
2879 */ 2880 freed = 0; 2881 for (field = 0; field < _NPCM; field++) { 2882 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2883 inuse != 0; inuse &= ~(1UL << bit)) { 2884 bit = ffs(inuse) - 1; 2885 pv = &pc->pc_pventry[field * 32 + bit]; 2886 va = pv->pv_va; 2887 pte1p = pmap_pte1(pmap, va); 2888 if (pte1_is_section(pte1_load(pte1p))) 2889 continue; 2890 pte2p = pmap_pte2(pmap, va); 2891 tpte2 = pte2_load(pte2p); 2892 if ((tpte2 & PTE2_W) == 0) 2893 tpte2 = pte2_load_clear(pte2p); 2894 pmap_pte2_release(pte2p); 2895 if ((tpte2 & PTE2_W) != 0) 2896 continue; 2897 KASSERT(tpte2 != 0, 2898 ("pmap_pv_reclaim: pmap %p va %#x zero pte", 2899 pmap, va)); 2900 pmap_tlb_flush(pmap, va); 2901 m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); 2902 if (pte2_is_dirty(tpte2)) 2903 vm_page_dirty(m); 2904 if ((tpte2 & PTE2_A) != 0) 2905 vm_page_aflag_set(m, PGA_REFERENCED); 2906 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2907 if (TAILQ_EMPTY(&m->md.pv_list) && 2908 (m->flags & PG_FICTITIOUS) == 0) { 2909 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2910 if (TAILQ_EMPTY(&pvh->pv_list)) { 2911 vm_page_aflag_clear(m, 2912 PGA_WRITEABLE); 2913 } 2914 } 2915 pc->pc_map[field] |= 1UL << bit; 2916 pmap_unuse_pt2(pmap, va, &free); 2917 freed++; 2918 } 2919 } 2920 if (freed == 0) { 2921 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2922 continue; 2923 } 2924 /* Every freed mapping is for a 4 KB page. */ 2925 pmap->pm_stats.resident_count -= freed; 2926 PV_STAT(pv_entry_frees += freed); 2927 PV_STAT(pv_entry_spare += freed); 2928 pv_entry_count -= freed; 2929 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2930 for (field = 0; field < _NPCM; field++) 2931 if (pc->pc_map[field] != pc_freemask[field]) { 2932 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2933 pc_list); 2934 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2935 2936 /* 2937 * One freed pv entry in locked_pmap is 2938 * sufficient. 
2939 */ 2940 if (pmap == locked_pmap) 2941 goto out; 2942 break; 2943 } 2944 if (field == _NPCM) { 2945 PV_STAT(pv_entry_spare -= _NPCPV); 2946 PV_STAT(pc_chunk_count--); 2947 PV_STAT(pc_chunk_frees++); 2948 /* Entire chunk is free; return it. */ 2949 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2950 pmap_qremove(pc, 1); 2951 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2952 break; 2953 } 2954 } 2955 out: 2956 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2957 if (pmap != NULL) { 2958 if (pmap != locked_pmap) 2959 PMAP_UNLOCK(pmap); 2960 } 2961 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2962 m_pc = SLIST_FIRST(&free); 2963 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2964 /* Recycle a freed page table page. */ 2965 m_pc->ref_count = 1; 2966 vm_wire_add(1); 2967 } 2968 vm_page_free_pages_toq(&free, false); 2969 return (m_pc); 2970 } 2971 2972 static void 2973 free_pv_chunk(struct pv_chunk *pc) 2974 { 2975 vm_page_t m; 2976 2977 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2978 PV_STAT(pv_entry_spare -= _NPCPV); 2979 PV_STAT(pc_chunk_count--); 2980 PV_STAT(pc_chunk_frees++); 2981 /* entire chunk is free, return it */ 2982 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2983 pmap_qremove(pc, 1); 2984 vm_page_unwire_noq(m); 2985 vm_page_free(m); 2986 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2987 } 2988 2989 /* 2990 * Free the pv_entry back to the free list. 
2991 */ 2992 static void 2993 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2994 { 2995 struct pv_chunk *pc; 2996 int idx, field, bit; 2997 2998 rw_assert(&pvh_global_lock, RA_WLOCKED); 2999 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3000 PV_STAT(pv_entry_frees++); 3001 PV_STAT(pv_entry_spare++); 3002 pv_entry_count--; 3003 pc = pv_to_chunk(pv); 3004 idx = pv - &pc->pc_pventry[0]; 3005 field = idx / 32; 3006 bit = idx % 32; 3007 pc->pc_map[field] |= 1ul << bit; 3008 for (idx = 0; idx < _NPCM; idx++) 3009 if (pc->pc_map[idx] != pc_freemask[idx]) { 3010 /* 3011 * 98% of the time, pc is already at the head of the 3012 * list. If it isn't already, move it to the head. 3013 */ 3014 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 3015 pc)) { 3016 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3017 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 3018 pc_list); 3019 } 3020 return; 3021 } 3022 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3023 free_pv_chunk(pc); 3024 } 3025 3026 /* 3027 * Get a new pv_entry, allocating a block from the system 3028 * when needed. 
3029 */ 3030 static pv_entry_t 3031 get_pv_entry(pmap_t pmap, bool try) 3032 { 3033 static const struct timeval printinterval = { 60, 0 }; 3034 static struct timeval lastprint; 3035 int bit, field; 3036 pv_entry_t pv; 3037 struct pv_chunk *pc; 3038 vm_page_t m; 3039 3040 rw_assert(&pvh_global_lock, RA_WLOCKED); 3041 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3042 PV_STAT(pv_entry_allocs++); 3043 pv_entry_count++; 3044 if (pv_entry_count > pv_entry_high_water) 3045 if (ratecheck(&lastprint, &printinterval)) 3046 printf("Approaching the limit on PV entries, consider " 3047 "increasing either the vm.pmap.shpgperproc or the " 3048 "vm.pmap.pv_entry_max tunable.\n"); 3049 retry: 3050 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3051 if (pc != NULL) { 3052 for (field = 0; field < _NPCM; field++) { 3053 if (pc->pc_map[field]) { 3054 bit = ffs(pc->pc_map[field]) - 1; 3055 break; 3056 } 3057 } 3058 if (field < _NPCM) { 3059 pv = &pc->pc_pventry[field * 32 + bit]; 3060 pc->pc_map[field] &= ~(1ul << bit); 3061 /* If this was the last item, move it to tail */ 3062 for (field = 0; field < _NPCM; field++) 3063 if (pc->pc_map[field] != 0) { 3064 PV_STAT(pv_entry_spare--); 3065 return (pv); /* not full, return */ 3066 } 3067 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3068 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3069 PV_STAT(pv_entry_spare--); 3070 return (pv); 3071 } 3072 } 3073 /* 3074 * Access to the pte2list "pv_vafree" is synchronized by the pvh 3075 * global lock. If "pv_vafree" is currently non-empty, it will 3076 * remain non-empty until pmap_pte2list_alloc() completes. 
3077 */ 3078 if (pv_vafree == 0 || 3079 (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { 3080 if (try) { 3081 pv_entry_count--; 3082 PV_STAT(pc_chunk_tryfail++); 3083 return (NULL); 3084 } 3085 m = pmap_pv_reclaim(pmap); 3086 if (m == NULL) 3087 goto retry; 3088 } 3089 PV_STAT(pc_chunk_count++); 3090 PV_STAT(pc_chunk_allocs++); 3091 pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); 3092 pmap_qenter(pc, &m, 1); 3093 pc->pc_pmap = pmap; 3094 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 3095 for (field = 1; field < _NPCM; field++) 3096 pc->pc_map[field] = pc_freemask[field]; 3097 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3098 pv = &pc->pc_pventry[0]; 3099 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3100 PV_STAT(pv_entry_spare += _NPCPV - 1); 3101 return (pv); 3102 } 3103 3104 /* 3105 * Create a pv entry for page at pa for 3106 * (pmap, va). 3107 */ 3108 static void 3109 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3110 { 3111 pv_entry_t pv; 3112 3113 rw_assert(&pvh_global_lock, RA_WLOCKED); 3114 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3115 pv = get_pv_entry(pmap, false); 3116 pv->pv_va = va; 3117 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3118 } 3119 3120 static __inline pv_entry_t 3121 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3122 { 3123 pv_entry_t pv; 3124 3125 rw_assert(&pvh_global_lock, RA_WLOCKED); 3126 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3127 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3128 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3129 break; 3130 } 3131 } 3132 return (pv); 3133 } 3134 3135 static void 3136 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3137 { 3138 pv_entry_t pv; 3139 3140 pv = pmap_pvh_remove(pvh, pmap, va); 3141 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3142 free_pv_entry(pmap, pv); 3143 } 3144 3145 static void 3146 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 3147 { 3148 struct md_page *pvh; 3149 3150 
rw_assert(&pvh_global_lock, RA_WLOCKED); 3151 pmap_pvh_free(&m->md, pmap, va); 3152 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 3153 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3154 if (TAILQ_EMPTY(&pvh->pv_list)) 3155 vm_page_aflag_clear(m, PGA_WRITEABLE); 3156 } 3157 } 3158 3159 static void 3160 pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3161 { 3162 struct md_page *pvh; 3163 pv_entry_t pv; 3164 vm_offset_t va_last; 3165 vm_page_t m; 3166 3167 rw_assert(&pvh_global_lock, RA_WLOCKED); 3168 KASSERT((pa & PTE1_OFFSET) == 0, 3169 ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); 3170 3171 /* 3172 * Transfer the 1mpage's pv entry for this mapping to the first 3173 * page's pv list. 3174 */ 3175 pvh = pa_to_pvh(pa); 3176 va = pte1_trunc(va); 3177 pv = pmap_pvh_remove(pvh, pmap, va); 3178 KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); 3179 m = PHYS_TO_VM_PAGE(pa); 3180 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3181 /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ 3182 va_last = va + PTE1_SIZE - PAGE_SIZE; 3183 do { 3184 m++; 3185 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3186 ("pmap_pv_demote_pte1: page %p is not managed", m)); 3187 va += PAGE_SIZE; 3188 pmap_insert_entry(pmap, va, m); 3189 } while (va < va_last); 3190 } 3191 3192 #if VM_NRESERVLEVEL > 0 3193 static void 3194 pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3195 { 3196 struct md_page *pvh; 3197 pv_entry_t pv; 3198 vm_offset_t va_last; 3199 vm_page_t m; 3200 3201 rw_assert(&pvh_global_lock, RA_WLOCKED); 3202 KASSERT((pa & PTE1_OFFSET) == 0, 3203 ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); 3204 3205 /* 3206 * Transfer the first page's pv entry for this mapping to the 3207 * 1mpage's pv list. 
Aside from avoiding the cost of a call 3208 * to get_pv_entry(), a transfer avoids the possibility that 3209 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() 3210 * removes one of the mappings that is being promoted. 3211 */ 3212 m = PHYS_TO_VM_PAGE(pa); 3213 va = pte1_trunc(va); 3214 pv = pmap_pvh_remove(&m->md, pmap, va); 3215 KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); 3216 pvh = pa_to_pvh(pa); 3217 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3218 /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */ 3219 va_last = va + PTE1_SIZE - PAGE_SIZE; 3220 do { 3221 m++; 3222 va += PAGE_SIZE; 3223 pmap_pvh_free(&m->md, pmap, va); 3224 } while (va < va_last); 3225 } 3226 #endif 3227 3228 /* 3229 * Conditionally create a pv entry. 3230 */ 3231 static bool 3232 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3233 { 3234 pv_entry_t pv; 3235 3236 rw_assert(&pvh_global_lock, RA_WLOCKED); 3237 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3238 if (pv_entry_count < pv_entry_high_water && 3239 (pv = get_pv_entry(pmap, true)) != NULL) { 3240 pv->pv_va = va; 3241 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3242 return (true); 3243 } else 3244 return (false); 3245 } 3246 3247 /* 3248 * Create the pv entries for each of the pages within a section. 
3249 */ 3250 static bool 3251 pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags) 3252 { 3253 struct md_page *pvh; 3254 pv_entry_t pv; 3255 bool noreclaim; 3256 3257 rw_assert(&pvh_global_lock, RA_WLOCKED); 3258 noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; 3259 if ((noreclaim && pv_entry_count >= pv_entry_high_water) || 3260 (pv = get_pv_entry(pmap, noreclaim)) == NULL) 3261 return (false); 3262 pv->pv_va = va; 3263 pvh = pa_to_pvh(pte1_pa(pte1)); 3264 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3265 return (true); 3266 } 3267 3268 static inline void 3269 pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) 3270 { 3271 3272 /* Kill all the small mappings or the big one only. */ 3273 if (pte1_is_section(npte1)) 3274 pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); 3275 else 3276 pmap_tlb_flush(pmap, pte1_trunc(va)); 3277 } 3278 3279 /* 3280 * Update kernel pte1 on all pmaps. 3281 * 3282 * The following function is called only on one cpu with disabled interrupts. 3283 * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way 3284 * nobody can invoke explicit hardware table walk during the update of pte1. 3285 * Unsolicited hardware table walk can still happen, invoked by speculative 3286 * data or instruction prefetch or even by speculative hardware table walk. 3287 * 3288 * The break-before-make approach should be implemented here. However, it's 3289 * not so easy to do that for kernel mappings as it would be unhappy to unmap 3290 * itself unexpectedly but voluntarily. 3291 */ 3292 static void 3293 pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) 3294 { 3295 pmap_t pmap; 3296 pt1_entry_t *pte1p; 3297 3298 /* 3299 * Get current pmap. Interrupts should be disabled here 3300 * so PCPU_GET() is done atomically. 3301 */ 3302 pmap = PCPU_GET(curpmap); 3303 if (pmap == NULL) 3304 pmap = kernel_pmap; 3305 3306 /* 3307 * (1) Change pte1 on current pmap. 
3308 * (2) Flush all obsolete TLB entries on current CPU. 3309 * (3) Change pte1 on all pmaps. 3310 * (4) Flush all obsolete TLB entries on all CPUs in SMP case. 3311 */ 3312 3313 pte1p = pmap_pte1(pmap, va); 3314 pte1_store(pte1p, npte1); 3315 3316 /* Kill all the small mappings or the big one only. */ 3317 if (pte1_is_section(npte1)) { 3318 pmap_pte1_kern_promotions++; 3319 tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); 3320 } else { 3321 pmap_pte1_kern_demotions++; 3322 tlb_flush_local(pte1_trunc(va)); 3323 } 3324 3325 /* 3326 * In SMP case, this function is called when all cpus are at smp 3327 * rendezvous, so there is no need to use 'allpmaps_lock' lock here. 3328 * In UP case, the function is called with this lock locked. 3329 */ 3330 LIST_FOREACH(pmap, &allpmaps, pm_list) { 3331 pte1p = pmap_pte1(pmap, va); 3332 pte1_store(pte1p, npte1); 3333 } 3334 3335 #ifdef SMP 3336 /* Kill all the small mappings or the big one only. */ 3337 if (pte1_is_section(npte1)) 3338 tlb_flush_range(pte1_trunc(va), PTE1_SIZE); 3339 else 3340 tlb_flush(pte1_trunc(va)); 3341 #endif 3342 } 3343 3344 #ifdef SMP 3345 struct pte1_action { 3346 vm_offset_t va; 3347 pt1_entry_t npte1; 3348 u_int update; /* CPU that updates the PTE1 */ 3349 }; 3350 3351 static void 3352 pmap_update_pte1_action(void *arg) 3353 { 3354 struct pte1_action *act = arg; 3355 3356 if (act->update == PCPU_GET(cpuid)) 3357 pmap_update_pte1_kernel(act->va, act->npte1); 3358 } 3359 3360 /* 3361 * Change pte1 on current pmap. 3362 * Note that kernel pte1 must be changed on all pmaps. 3363 * 3364 * According to the architecture reference manual published by ARM, 3365 * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. 3366 * According to this manual, UNPREDICTABLE behaviours must never happen in 3367 * a viable system. 
In contrast, on x86 processors, it is not specified which 3368 * TLB entry mapping the virtual address will be used, but the MMU doesn't 3369 * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone 3370 * Black). 3371 * 3372 * It's a problem when either promotion or demotion is being done. The pte1 3373 * update and appropriate TLB flush must be done atomically in general. 3374 */ 3375 static void 3376 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3377 pt1_entry_t npte1) 3378 { 3379 3380 if (pmap == kernel_pmap) { 3381 struct pte1_action act; 3382 3383 sched_pin(); 3384 act.va = va; 3385 act.npte1 = npte1; 3386 act.update = PCPU_GET(cpuid); 3387 smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier, 3388 pmap_update_pte1_action, NULL, &act); 3389 sched_unpin(); 3390 } else { 3391 register_t cspr; 3392 3393 /* 3394 * Use break-before-make approach for changing userland 3395 * mappings. It can cause L1 translation aborts on other 3396 * cores in SMP case. So, special treatment is implemented 3397 * in pmap_fault(). To reduce the likelihood that another core 3398 * will be affected by the broken mapping, disable interrupts 3399 * until the mapping change is completed. 3400 */ 3401 cspr = disable_interrupts(PSR_I); 3402 pte1_clear(pte1p); 3403 pmap_tlb_flush_pte1(pmap, va, npte1); 3404 pte1_store(pte1p, npte1); 3405 restore_interrupts(cspr); 3406 } 3407 } 3408 #else 3409 static void 3410 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3411 pt1_entry_t npte1) 3412 { 3413 3414 if (pmap == kernel_pmap) { 3415 mtx_lock_spin(&allpmaps_lock); 3416 pmap_update_pte1_kernel(va, npte1); 3417 mtx_unlock_spin(&allpmaps_lock); 3418 } else { 3419 register_t cspr; 3420 3421 /* 3422 * Use break-before-make approach for changing userland 3423 * mappings. It's absolutely safe in UP case when interrupts 3424 * are disabled. 
3425 */ 3426 cspr = disable_interrupts(PSR_I); 3427 pte1_clear(pte1p); 3428 pmap_tlb_flush_pte1(pmap, va, npte1); 3429 pte1_store(pte1p, npte1); 3430 restore_interrupts(cspr); 3431 } 3432 } 3433 #endif 3434 3435 #if VM_NRESERVLEVEL > 0 3436 /* 3437 * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are 3438 * within a single page table page (PT2) to a single 1MB page mapping. 3439 * For promotion to occur, two conditions must be met: (1) the 4KB page 3440 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3441 * mappings must have identical characteristics. 3442 * 3443 * Managed (PG_MANAGED) mappings within the kernel address space are not 3444 * promoted. The reason is that kernel PTE1s are replicated in each pmap but 3445 * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only 3446 * read the PTE1 from the kernel pmap. 3447 */ 3448 static void 3449 pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3450 { 3451 pt1_entry_t npte1; 3452 pt2_entry_t *fpte2p, fpte2, fpte2_fav; 3453 pt2_entry_t *pte2p, pte2; 3454 vm_offset_t pteva __unused; 3455 vm_page_t m __unused; 3456 3457 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3458 pmap, va, pte1_load(pte1p), pte1p)); 3459 3460 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3461 3462 /* 3463 * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is 3464 * either invalid, unused, or does not map the first 4KB physical page 3465 * within a 1MB page. 
3466 */ 3467 fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); 3468 fpte2 = pte2_load(fpte2p); 3469 if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != 3470 (PTE2_A | PTE2_V)) { 3471 pmap_pte1_p_failures++; 3472 CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", 3473 __func__, va, pmap); 3474 return; 3475 } 3476 if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { 3477 pmap_pte1_p_failures++; 3478 CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", 3479 __func__, va, pmap); 3480 return; 3481 } 3482 if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3483 /* 3484 * When page is not modified, PTE2_RO can be set without 3485 * a TLB invalidation. 3486 */ 3487 fpte2 |= PTE2_RO; 3488 pte2_store(fpte2p, fpte2); 3489 } 3490 3491 /* 3492 * Examine each of the other PTE2s in the specified PT2. Abort if this 3493 * PTE2 maps an unexpected 4KB physical page or does not have identical 3494 * characteristics to the first PTE2. 3495 */ 3496 fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); 3497 fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ 3498 for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { 3499 pte2 = pte2_load(pte2p); 3500 if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { 3501 pmap_pte1_p_failures++; 3502 CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", 3503 __func__, va, pmap); 3504 return; 3505 } 3506 if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3507 /* 3508 * When page is not modified, PTE2_RO can be set 3509 * without a TLB invalidation. See note above. 
3510 */ 3511 pte2 |= PTE2_RO; 3512 pte2_store(pte2p, pte2); 3513 pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & 3514 PTE2_FRAME); 3515 CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", 3516 __func__, pteva, pmap); 3517 } 3518 if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { 3519 pmap_pte1_p_failures++; 3520 CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", 3521 __func__, va, pmap); 3522 return; 3523 } 3524 3525 fpte2_fav -= PTE2_SIZE; 3526 } 3527 /* 3528 * The page table page in its current state will stay in PT2TAB 3529 * until the PTE1 mapping the section is demoted by pmap_demote_pte1() 3530 * or destroyed by pmap_remove_pte1(). 3531 * 3532 * Note that L2 page table size is not equal to PAGE_SIZE. 3533 */ 3534 m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); 3535 KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], 3536 ("%s: PT2 page is out of range", __func__)); 3537 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 3538 ("%s: PT2 page's pindex is wrong", __func__)); 3539 3540 /* 3541 * Get pte1 from pte2 format. 3542 */ 3543 npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; 3544 3545 /* 3546 * Promote the pv entries. 3547 */ 3548 if (pte2_is_managed(fpte2)) 3549 pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); 3550 3551 /* 3552 * Promote the mappings. 3553 */ 3554 pmap_change_pte1(pmap, pte1p, va, npte1); 3555 3556 pmap_pte1_promotions++; 3557 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3558 __func__, va, pmap); 3559 3560 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3561 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3562 } 3563 #endif /* VM_NRESERVLEVEL > 0 */ 3564 3565 /* 3566 * Zero L2 page table page. 
3567 */ 3568 static __inline void 3569 pmap_clear_pt2(pt2_entry_t *fpte2p) 3570 { 3571 pt2_entry_t *pte2p; 3572 3573 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) 3574 pte2_clear(pte2p); 3575 3576 } 3577 3578 /* 3579 * Removes a 1MB page mapping from the kernel pmap. 3580 */ 3581 static void 3582 pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3583 { 3584 vm_page_t m; 3585 uint32_t pte1_idx; 3586 pt2_entry_t *fpte2p; 3587 vm_paddr_t pt2_pa; 3588 3589 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3590 m = pmap_pt2_page(pmap, va); 3591 if (m == NULL) 3592 /* 3593 * QQQ: Is this function called only on promoted pte1? 3594 * We certainly do section mappings directly 3595 * (without promotion) in kernel !!! 3596 */ 3597 panic("%s: missing pt2 page", __func__); 3598 3599 pte1_idx = pte1_index(va); 3600 3601 /* 3602 * Initialize the L2 page table. 3603 */ 3604 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3605 pmap_clear_pt2(fpte2p); 3606 3607 /* 3608 * Remove the mapping. 3609 */ 3610 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); 3611 pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); 3612 3613 /* 3614 * QQQ: We do not need to invalidate PT2MAP mapping 3615 * as we did not change it. I.e. the L2 page table page 3616 * was and still is mapped the same way. 3617 */ 3618 } 3619 3620 /* 3621 * Do the things to unmap a section in a process 3622 */ 3623 static void 3624 pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 3625 struct spglist *free) 3626 { 3627 pt1_entry_t opte1; 3628 struct md_page *pvh; 3629 vm_offset_t eva, va; 3630 vm_page_t m; 3631 3632 PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, 3633 pte1_load(pte1p), pte1p)); 3634 3635 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3636 KASSERT((sva & PTE1_OFFSET) == 0, 3637 ("%s: sva is not 1mpage aligned", __func__)); 3638 3639 /* 3640 * Clear and invalidate the mapping. It should occupy one and only TLB 3641 * entry. 
So, pmap_tlb_flush() called with aligned address should be 3642 * sufficient. 3643 */ 3644 opte1 = pte1_load_clear(pte1p); 3645 pmap_tlb_flush(pmap, sva); 3646 3647 if (pte1_is_wired(opte1)) 3648 pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; 3649 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 3650 if (pte1_is_managed(opte1)) { 3651 pvh = pa_to_pvh(pte1_pa(opte1)); 3652 pmap_pvh_free(pvh, pmap, sva); 3653 eva = sva + PTE1_SIZE; 3654 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 3655 va < eva; va += PAGE_SIZE, m++) { 3656 if (pte1_is_dirty(opte1)) 3657 vm_page_dirty(m); 3658 if (opte1 & PTE1_A) 3659 vm_page_aflag_set(m, PGA_REFERENCED); 3660 if (TAILQ_EMPTY(&m->md.pv_list) && 3661 TAILQ_EMPTY(&pvh->pv_list)) 3662 vm_page_aflag_clear(m, PGA_WRITEABLE); 3663 } 3664 } 3665 if (pmap == kernel_pmap) { 3666 /* 3667 * L2 page table(s) can't be removed from kernel map as 3668 * kernel counts on it (stuff around pmap_growkernel()). 3669 */ 3670 pmap_remove_kernel_pte1(pmap, pte1p, sva); 3671 } else { 3672 /* 3673 * Get associated L2 page table page. 3674 * It's possible that the page was never allocated. 3675 */ 3676 m = pmap_pt2_page(pmap, sva); 3677 if (m != NULL) 3678 pmap_unwire_pt2_all(pmap, sva, m, free); 3679 } 3680 } 3681 3682 /* 3683 * Fills L2 page table page with mappings to consecutive physical pages. 3684 */ 3685 static __inline void 3686 pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) 3687 { 3688 pt2_entry_t *pte2p; 3689 3690 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { 3691 pte2_store(pte2p, npte2); 3692 npte2 += PTE2_SIZE; 3693 } 3694 } 3695 3696 /* 3697 * Tries to demote a 1MB page mapping. If demotion fails, the 3698 * 1MB page mapping is invalidated. 
3699 */ 3700 static bool 3701 pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3702 { 3703 pt1_entry_t opte1, npte1; 3704 pt2_entry_t *fpte2p, npte2; 3705 vm_paddr_t pt2pg_pa, pt2_pa; 3706 vm_page_t m; 3707 struct spglist free; 3708 uint32_t pte1_idx, isnew = 0; 3709 3710 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3711 pmap, va, pte1_load(pte1p), pte1p)); 3712 3713 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3714 3715 opte1 = pte1_load(pte1p); 3716 KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); 3717 3718 if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { 3719 KASSERT(!pte1_is_wired(opte1), 3720 ("%s: PT2 page for a wired mapping is missing", __func__)); 3721 3722 /* 3723 * Invalidate the 1MB page mapping and return 3724 * "failure" if the mapping was never accessed or the 3725 * allocation of the new page table page fails. 3726 */ 3727 if ((opte1 & PTE1_A) == 0 || 3728 (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { 3729 SLIST_INIT(&free); 3730 pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); 3731 vm_page_free_pages_toq(&free, false); 3732 CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", 3733 __func__, va, pmap); 3734 return (false); 3735 } 3736 m->pindex = pte1_index(va) & ~PT2PG_MASK; 3737 if (va < VM_MAXUSER_ADDRESS) 3738 pmap->pm_stats.resident_count++; 3739 3740 isnew = 1; 3741 3742 /* 3743 * We init all L2 page tables in the page even if 3744 * we are going to change everything for one L2 page 3745 * table in a while. 3746 */ 3747 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 3748 } else { 3749 if (va < VM_MAXUSER_ADDRESS) { 3750 if (pt2_is_empty(m, va)) 3751 isnew = 1; /* Demoting section w/o promotion. 
*/ 3752 #ifdef INVARIANTS 3753 else 3754 KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" 3755 " count %u", __func__, 3756 pt2_wirecount_get(m, pte1_index(va)))); 3757 #endif 3758 } 3759 } 3760 3761 pt2pg_pa = VM_PAGE_TO_PHYS(m); 3762 pte1_idx = pte1_index(va); 3763 /* 3764 * If the pmap is current, then the PT2MAP can provide access to 3765 * the page table page (promoted L2 page tables are not unmapped). 3766 * Otherwise, temporarily map the L2 page table page (m) into 3767 * the kernel's address space at either PADDR1 or PADDR2. 3768 * 3769 * Note that L2 page table size is not equal to PAGE_SIZE. 3770 */ 3771 if (pmap_is_current(pmap)) 3772 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3773 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 3774 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 3775 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 3776 #ifdef SMP 3777 PMAP1cpu = PCPU_GET(cpuid); 3778 #endif 3779 tlb_flush_local((vm_offset_t)PADDR1); 3780 PMAP1changed++; 3781 } else 3782 #ifdef SMP 3783 if (PMAP1cpu != PCPU_GET(cpuid)) { 3784 PMAP1cpu = PCPU_GET(cpuid); 3785 tlb_flush_local((vm_offset_t)PADDR1); 3786 PMAP1changedcpu++; 3787 } else 3788 #endif 3789 PMAP1unchanged++; 3790 fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); 3791 } else { 3792 mtx_lock(&PMAP2mutex); 3793 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 3794 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 3795 tlb_flush((vm_offset_t)PADDR2); 3796 } 3797 fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); 3798 } 3799 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 3800 npte1 = PTE1_LINK(pt2_pa); 3801 3802 KASSERT((opte1 & PTE1_A) != 0, 3803 ("%s: opte1 is missing PTE1_A", __func__)); 3804 KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, 3805 ("%s: opte1 has PTE1_NM", __func__)); 3806 3807 /* 3808 * Get pte2 from pte1 format. 3809 */ 3810 npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; 3811 3812 /* 3813 * If the L2 page table page is new, initialize it. 
If the mapping 3814 * has changed attributes, update the page table entries. 3815 */ 3816 if (isnew != 0) { 3817 pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); 3818 pmap_fill_pt2(fpte2p, npte2); 3819 } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != 3820 (npte2 & PTE2_PROMOTE)) 3821 pmap_fill_pt2(fpte2p, npte2); 3822 3823 KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), 3824 ("%s: fpte2p and npte2 map different physical addresses", 3825 __func__)); 3826 3827 if (fpte2p == PADDR2) 3828 mtx_unlock(&PMAP2mutex); 3829 3830 /* 3831 * Demote the mapping. This pmap is locked. The old PTE1 has 3832 * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also 3833 * has not PTE1_NM set. Thus, there is no danger of a race with 3834 * another processor changing the setting of PTE1_A and/or PTE1_NM 3835 * between the read above and the store below. 3836 */ 3837 pmap_change_pte1(pmap, pte1p, va, npte1); 3838 3839 /* 3840 * Demote the pv entry. This depends on the earlier demotion 3841 * of the mapping. Specifically, the (re)creation of a per- 3842 * page pv entry might trigger the execution of pmap_pv_reclaim(), 3843 * which might reclaim a newly (re)created per-page pv entry 3844 * and destroy the associated mapping. In order to destroy 3845 * the mapping, the PTE1 must have already changed from mapping 3846 * the 1mpage to referencing the page table page. 3847 */ 3848 if (pte1_is_managed(opte1)) 3849 pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); 3850 3851 pmap_pte1_demotions++; 3852 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3853 __func__, va, pmap); 3854 3855 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3856 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3857 return (true); 3858 } 3859 3860 /* 3861 * Insert the given physical page (p) at 3862 * the specified virtual address (v) in the 3863 * target physical map with the protection requested. 
3864 * 3865 * If specified, the page will be wired down, meaning 3866 * that the related pte can not be reclaimed. 3867 * 3868 * NB: This is the only routine which MAY NOT lazy-evaluate 3869 * or lose information. That is, this routine must actually 3870 * insert this page into the given map NOW. 3871 */ 3872 int 3873 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3874 u_int flags, int8_t psind) 3875 { 3876 pt1_entry_t *pte1p; 3877 pt2_entry_t *pte2p; 3878 pt2_entry_t npte2, opte2; 3879 pv_entry_t pv; 3880 vm_paddr_t opa, pa; 3881 vm_page_t mpte2, om; 3882 int rv; 3883 3884 va = trunc_page(va); 3885 KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); 3886 KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, 3887 ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, 3888 va)); 3889 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 3890 ("%s: managed mapping within the clean submap", __func__)); 3891 if ((m->oflags & VPO_UNMANAGED) == 0) 3892 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3893 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 3894 ("%s: flags %u has reserved bits set", __func__, flags)); 3895 pa = VM_PAGE_TO_PHYS(m); 3896 npte2 = PTE2(pa, PTE2_A, vm_page_pte2_attr(m)); 3897 if ((flags & VM_PROT_WRITE) == 0) 3898 npte2 |= PTE2_NM; 3899 if ((prot & VM_PROT_WRITE) == 0) 3900 npte2 |= PTE2_RO; 3901 KASSERT((npte2 & (PTE2_NM | PTE2_RO)) != PTE2_RO, 3902 ("%s: flags includes VM_PROT_WRITE but prot doesn't", __func__)); 3903 if ((prot & VM_PROT_EXECUTE) == 0) 3904 npte2 |= PTE2_NX; 3905 if ((flags & PMAP_ENTER_WIRED) != 0) 3906 npte2 |= PTE2_W; 3907 if (va < VM_MAXUSER_ADDRESS) 3908 npte2 |= PTE2_U; 3909 if (pmap != kernel_pmap) 3910 npte2 |= PTE2_NG; 3911 3912 rw_wlock(&pvh_global_lock); 3913 PMAP_LOCK(pmap); 3914 sched_pin(); 3915 if (psind == 1) { 3916 /* Assert the required virtual and physical alignment. 
*/ 3917 KASSERT((va & PTE1_OFFSET) == 0, 3918 ("%s: va unaligned", __func__)); 3919 KASSERT(m->psind > 0, ("%s: m->psind < psind", __func__)); 3920 rv = pmap_enter_pte1(pmap, va, PTE1_PA(pa) | ATTR_TO_L1(npte2) | 3921 PTE1_V, flags, m); 3922 goto out; 3923 } 3924 3925 /* 3926 * In the case that a page table page is not 3927 * resident, we are creating it here. 3928 */ 3929 if (va < VM_MAXUSER_ADDRESS) { 3930 mpte2 = pmap_allocpte2(pmap, va, flags); 3931 if (mpte2 == NULL) { 3932 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3933 ("pmap_allocpte2 failed with sleep allowed")); 3934 rv = KERN_RESOURCE_SHORTAGE; 3935 goto out; 3936 } 3937 } else 3938 mpte2 = NULL; 3939 pte1p = pmap_pte1(pmap, va); 3940 if (pte1_is_section(pte1_load(pte1p))) 3941 panic("%s: attempted on 1MB page", __func__); 3942 pte2p = pmap_pte2_quick(pmap, va); 3943 if (pte2p == NULL) 3944 panic("%s: invalid L1 page table entry va=%#x", __func__, va); 3945 3946 om = NULL; 3947 opte2 = pte2_load(pte2p); 3948 opa = pte2_pa(opte2); 3949 /* 3950 * Mapping has not changed, must be protection or wiring change. 3951 */ 3952 if (pte2_is_valid(opte2) && (opa == pa)) { 3953 /* 3954 * Wiring change, just update stats. We don't worry about 3955 * wiring PT2 pages as they remain resident as long as there 3956 * are valid mappings in them. Hence, if a user page is wired, 3957 * the PT2 page will be also. 3958 */ 3959 if (pte2_is_wired(npte2) && !pte2_is_wired(opte2)) 3960 pmap->pm_stats.wired_count++; 3961 else if (!pte2_is_wired(npte2) && pte2_is_wired(opte2)) 3962 pmap->pm_stats.wired_count--; 3963 3964 /* 3965 * Remove extra pte2 reference 3966 */ 3967 if (mpte2) 3968 pt2_wirecount_dec(mpte2, pte1_index(va)); 3969 if ((m->oflags & VPO_UNMANAGED) == 0) 3970 om = m; 3971 goto validate; 3972 } 3973 3974 /* 3975 * QQQ: We think that changing physical address on writeable mapping 3976 * is not safe. Well, maybe on kernel address space with correct 3977 * locking, it can make a sense. 
However, we have no idea why 3978 * anyone should do that on user address space. Are we wrong? 3979 */ 3980 KASSERT((opa == 0) || (opa == pa) || 3981 !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), 3982 ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", 3983 __func__, pmap, va, opte2, opa, pa, flags, prot)); 3984 3985 pv = NULL; 3986 3987 /* 3988 * Mapping has changed, invalidate old range and fall through to 3989 * handle validating new mapping. 3990 */ 3991 if (opa) { 3992 if (pte2_is_wired(opte2)) 3993 pmap->pm_stats.wired_count--; 3994 om = PHYS_TO_VM_PAGE(opa); 3995 if (om != NULL && (om->oflags & VPO_UNMANAGED) != 0) 3996 om = NULL; 3997 if (om != NULL) 3998 pv = pmap_pvh_remove(&om->md, pmap, va); 3999 4000 /* 4001 * Remove extra pte2 reference 4002 */ 4003 if (mpte2 != NULL) 4004 pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); 4005 } else 4006 pmap->pm_stats.resident_count++; 4007 4008 /* 4009 * Enter on the PV list if part of our managed memory. 4010 */ 4011 if ((m->oflags & VPO_UNMANAGED) == 0) { 4012 if (pv == NULL) { 4013 pv = get_pv_entry(pmap, false); 4014 pv->pv_va = va; 4015 } 4016 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4017 } else if (pv != NULL) 4018 free_pv_entry(pmap, pv); 4019 4020 /* 4021 * Increment counters 4022 */ 4023 if (pte2_is_wired(npte2)) 4024 pmap->pm_stats.wired_count++; 4025 4026 validate: 4027 /* 4028 * Now validate mapping with desired protection/wiring. 4029 */ 4030 if (prot & VM_PROT_WRITE) { 4031 if ((m->oflags & VPO_UNMANAGED) == 0) 4032 vm_page_aflag_set(m, PGA_WRITEABLE); 4033 } 4034 4035 /* 4036 * If the mapping or permission bits are different, we need 4037 * to update the pte2. 4038 * 4039 * QQQ: Think again and again what to do 4040 * if the mapping is going to be changed! 4041 */ 4042 if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { 4043 /* 4044 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4045 * is set. 
Do it now, before the mapping is stored and made 4046 * valid for hardware table walk. If done later, there is a race 4047 * for other threads of current process in lazy loading case. 4048 * Don't do it for kernel memory which is mapped with exec 4049 * permission even if the memory isn't going to hold executable 4050 * code. The only time when icache sync is needed is after 4051 * kernel module is loaded and the relocation info is processed. 4052 * And it's done in elf_cpu_load_file(). 4053 * 4054 * QQQ: (1) Does it exist any better way where 4055 * or how to sync icache? 4056 * (2) Now, we do it on a page basis. 4057 */ 4058 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4059 m->md.pat_mode == VM_MEMATTR_WB_WA && 4060 (opa != pa || (opte2 & PTE2_NX))) 4061 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4062 4063 if (opte2 & PTE2_V) { 4064 /* Change mapping with break-before-make approach. */ 4065 opte2 = pte2_load_clear(pte2p); 4066 pmap_tlb_flush(pmap, va); 4067 pte2_store(pte2p, npte2); 4068 if (om != NULL) { 4069 KASSERT((om->oflags & VPO_UNMANAGED) == 0, 4070 ("%s: om %p unmanaged", __func__, om)); 4071 if ((opte2 & PTE2_A) != 0) 4072 vm_page_aflag_set(om, PGA_REFERENCED); 4073 if (pte2_is_dirty(opte2)) 4074 vm_page_dirty(om); 4075 if (TAILQ_EMPTY(&om->md.pv_list) && 4076 ((om->flags & PG_FICTITIOUS) != 0 || 4077 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4078 vm_page_aflag_clear(om, PGA_WRITEABLE); 4079 } 4080 } else 4081 pte2_store(pte2p, npte2); 4082 } 4083 #if 0 4084 else { 4085 /* 4086 * QQQ: In time when both access and not mofified bits are 4087 * emulated by software, this should not happen. Some 4088 * analysis is need, if this really happen. Missing 4089 * tlb flush somewhere could be the reason. 
4090 */ 4091 panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, 4092 va, opte2, npte2); 4093 } 4094 #endif 4095 4096 #if VM_NRESERVLEVEL > 0 4097 /* 4098 * If both the L2 page table page and the reservation are fully 4099 * populated, then attempt promotion. 4100 */ 4101 if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && 4102 sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && 4103 vm_reserv_level_iffullpop(m) == 0) 4104 pmap_promote_pte1(pmap, pte1p, va); 4105 #endif 4106 4107 rv = KERN_SUCCESS; 4108 out: 4109 sched_unpin(); 4110 rw_wunlock(&pvh_global_lock); 4111 PMAP_UNLOCK(pmap); 4112 return (rv); 4113 } 4114 4115 /* 4116 * Do the things to unmap a page in a process. 4117 */ 4118 static int 4119 pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, 4120 struct spglist *free) 4121 { 4122 pt2_entry_t opte2; 4123 vm_page_t m; 4124 4125 rw_assert(&pvh_global_lock, RA_WLOCKED); 4126 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4127 4128 /* Clear and invalidate the mapping. */ 4129 opte2 = pte2_load_clear(pte2p); 4130 pmap_tlb_flush(pmap, va); 4131 4132 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", 4133 __func__, pmap, va, opte2)); 4134 4135 if (opte2 & PTE2_W) 4136 pmap->pm_stats.wired_count -= 1; 4137 pmap->pm_stats.resident_count -= 1; 4138 if (pte2_is_managed(opte2)) { 4139 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 4140 if (pte2_is_dirty(opte2)) 4141 vm_page_dirty(m); 4142 if (opte2 & PTE2_A) 4143 vm_page_aflag_set(m, PGA_REFERENCED); 4144 pmap_remove_entry(pmap, m, va); 4145 } 4146 return (pmap_unuse_pt2(pmap, va, free)); 4147 } 4148 4149 /* 4150 * Remove a single page from a process address space. 
4151 */ 4152 static void 4153 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 4154 { 4155 pt2_entry_t *pte2p; 4156 4157 rw_assert(&pvh_global_lock, RA_WLOCKED); 4158 KASSERT(curthread->td_pinned > 0, 4159 ("%s: curthread not pinned", __func__)); 4160 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4161 if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || 4162 !pte2_is_valid(pte2_load(pte2p))) 4163 return; 4164 pmap_remove_pte2(pmap, pte2p, va, free); 4165 } 4166 4167 /* 4168 * Remove the given range of addresses from the specified map. 4169 * 4170 * It is assumed that the start and end are properly 4171 * rounded to the page size. 4172 */ 4173 void 4174 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4175 { 4176 vm_offset_t nextva; 4177 pt1_entry_t *pte1p, pte1; 4178 pt2_entry_t *pte2p, pte2; 4179 struct spglist free; 4180 4181 /* 4182 * Perform an unsynchronized read. This is, however, safe. 4183 */ 4184 if (pmap->pm_stats.resident_count == 0) 4185 return; 4186 4187 SLIST_INIT(&free); 4188 4189 rw_wlock(&pvh_global_lock); 4190 sched_pin(); 4191 PMAP_LOCK(pmap); 4192 4193 /* 4194 * Special handling of removing one page. A very common 4195 * operation and easy to short circuit some code. 4196 */ 4197 if (sva + PAGE_SIZE == eva) { 4198 pte1 = pte1_load(pmap_pte1(pmap, sva)); 4199 if (pte1_is_link(pte1)) { 4200 pmap_remove_page(pmap, sva, &free); 4201 goto out; 4202 } 4203 } 4204 4205 for (; sva < eva; sva = nextva) { 4206 /* 4207 * Calculate address for next L2 page table. 4208 */ 4209 nextva = pte1_trunc(sva + PTE1_SIZE); 4210 if (nextva < sva) 4211 nextva = eva; 4212 if (pmap->pm_stats.resident_count == 0) 4213 break; 4214 4215 pte1p = pmap_pte1(pmap, sva); 4216 pte1 = pte1_load(pte1p); 4217 4218 /* 4219 * Weed out invalid mappings. Note: we assume that the L1 page 4220 * table is always allocated, and in kernel virtual. 
4221 */ 4222 if (pte1 == 0) 4223 continue; 4224 4225 if (pte1_is_section(pte1)) { 4226 /* 4227 * Are we removing the entire large page? If not, 4228 * demote the mapping and fall through. 4229 */ 4230 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4231 pmap_remove_pte1(pmap, pte1p, sva, &free); 4232 continue; 4233 } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4234 /* The large page mapping was destroyed. */ 4235 continue; 4236 } 4237 #ifdef INVARIANTS 4238 else { 4239 /* Update pte1 after demotion. */ 4240 pte1 = pte1_load(pte1p); 4241 } 4242 #endif 4243 } 4244 4245 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4246 " is not link", __func__, pmap, sva, pte1, pte1p)); 4247 4248 /* 4249 * Limit our scan to either the end of the va represented 4250 * by the current L2 page table page, or to the end of the 4251 * range being removed. 4252 */ 4253 if (nextva > eva) 4254 nextva = eva; 4255 4256 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; 4257 pte2p++, sva += PAGE_SIZE) { 4258 pte2 = pte2_load(pte2p); 4259 if (!pte2_is_valid(pte2)) 4260 continue; 4261 if (pmap_remove_pte2(pmap, pte2p, sva, &free)) 4262 break; 4263 } 4264 } 4265 out: 4266 sched_unpin(); 4267 rw_wunlock(&pvh_global_lock); 4268 PMAP_UNLOCK(pmap); 4269 vm_page_free_pages_toq(&free, false); 4270 } 4271 4272 /* 4273 * Routine: pmap_remove_all 4274 * Function: 4275 * Removes this physical page from 4276 * all physical maps in which it resides. 4277 * Reflects back modify bits to the pager. 4278 * 4279 * Notes: 4280 * Original versions of this routine were very 4281 * inefficient because they iteratively called 4282 * pmap_remove (slow...) 
4283 */ 4284 4285 void 4286 pmap_remove_all(vm_page_t m) 4287 { 4288 struct md_page *pvh; 4289 pv_entry_t pv; 4290 pmap_t pmap; 4291 pt2_entry_t *pte2p, opte2; 4292 pt1_entry_t *pte1p; 4293 vm_offset_t va; 4294 struct spglist free; 4295 4296 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4297 ("%s: page %p is not managed", __func__, m)); 4298 SLIST_INIT(&free); 4299 rw_wlock(&pvh_global_lock); 4300 sched_pin(); 4301 if ((m->flags & PG_FICTITIOUS) != 0) 4302 goto small_mappings; 4303 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4304 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4305 va = pv->pv_va; 4306 pmap = PV_PMAP(pv); 4307 PMAP_LOCK(pmap); 4308 pte1p = pmap_pte1(pmap, va); 4309 (void)pmap_demote_pte1(pmap, pte1p, va); 4310 PMAP_UNLOCK(pmap); 4311 } 4312 small_mappings: 4313 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4314 pmap = PV_PMAP(pv); 4315 PMAP_LOCK(pmap); 4316 pmap->pm_stats.resident_count--; 4317 pte1p = pmap_pte1(pmap, pv->pv_va); 4318 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " 4319 "a 1mpage in page %p's pv list", __func__, m)); 4320 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 4321 opte2 = pte2_load_clear(pte2p); 4322 pmap_tlb_flush(pmap, pv->pv_va); 4323 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", 4324 __func__, pmap, pv->pv_va)); 4325 if (pte2_is_wired(opte2)) 4326 pmap->pm_stats.wired_count--; 4327 if (opte2 & PTE2_A) 4328 vm_page_aflag_set(m, PGA_REFERENCED); 4329 4330 /* 4331 * Update the vm_page_t clean and reference bits. 4332 */ 4333 if (pte2_is_dirty(opte2)) 4334 vm_page_dirty(m); 4335 pmap_unuse_pt2(pmap, pv->pv_va, &free); 4336 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4337 free_pv_entry(pmap, pv); 4338 PMAP_UNLOCK(pmap); 4339 } 4340 vm_page_aflag_clear(m, PGA_WRITEABLE); 4341 sched_unpin(); 4342 rw_wunlock(&pvh_global_lock); 4343 vm_page_free_pages_toq(&free, false); 4344 } 4345 4346 /* 4347 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4348 * good coding style, a.k.a. 
80 character line width limit hell. 4349 */ 4350 static __inline void 4351 pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, 4352 struct spglist *free) 4353 { 4354 vm_paddr_t pa; 4355 vm_page_t m, mt, mpt2pg; 4356 struct md_page *pvh; 4357 4358 pa = pte1_pa(pte1); 4359 m = PHYS_TO_VM_PAGE(pa); 4360 4361 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4362 __func__, m, m->phys_addr, pa)); 4363 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4364 m < &vm_page_array[vm_page_array_size], 4365 ("%s: bad pte1 %#x", __func__, pte1)); 4366 4367 if (pte1_is_dirty(pte1)) { 4368 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4369 vm_page_dirty(mt); 4370 } 4371 4372 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 4373 pvh = pa_to_pvh(pa); 4374 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4375 if (TAILQ_EMPTY(&pvh->pv_list)) { 4376 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4377 if (TAILQ_EMPTY(&mt->md.pv_list)) 4378 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4379 } 4380 mpt2pg = pmap_pt2_page(pmap, pv->pv_va); 4381 if (mpt2pg != NULL) 4382 pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); 4383 } 4384 4385 /* 4386 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4387 * good coding style, a.k.a. 80 character line width limit hell. 
4388 */ 4389 static __inline void 4390 pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, 4391 struct spglist *free) 4392 { 4393 vm_paddr_t pa; 4394 vm_page_t m; 4395 struct md_page *pvh; 4396 4397 pa = pte2_pa(pte2); 4398 m = PHYS_TO_VM_PAGE(pa); 4399 4400 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4401 __func__, m, m->phys_addr, pa)); 4402 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4403 m < &vm_page_array[vm_page_array_size], 4404 ("%s: bad pte2 %#x", __func__, pte2)); 4405 4406 if (pte2_is_dirty(pte2)) 4407 vm_page_dirty(m); 4408 4409 pmap->pm_stats.resident_count--; 4410 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4411 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 4412 pvh = pa_to_pvh(pa); 4413 if (TAILQ_EMPTY(&pvh->pv_list)) 4414 vm_page_aflag_clear(m, PGA_WRITEABLE); 4415 } 4416 pmap_unuse_pt2(pmap, pv->pv_va, free); 4417 } 4418 4419 /* 4420 * Remove all pages from specified address space this aids process 4421 * exit speeds. Also, this code is special cased for current process 4422 * only, but can have the more generic (and slightly slower) mode enabled. 4423 * This is much faster than pmap_remove in the case of running down 4424 * an entire address space. 4425 */ 4426 void 4427 pmap_remove_pages(pmap_t pmap) 4428 { 4429 pt1_entry_t *pte1p, pte1; 4430 pt2_entry_t *pte2p, pte2; 4431 pv_entry_t pv; 4432 struct pv_chunk *pc, *npc; 4433 struct spglist free; 4434 int field, idx; 4435 int32_t bit; 4436 uint32_t inuse, bitmask; 4437 bool allfree; 4438 4439 /* 4440 * Assert that the given pmap is only active on the current 4441 * CPU. Unfortunately, we cannot block another CPU from 4442 * activating the pmap while this function is executing. 
4443 */ 4444 KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), 4445 ("%s: non-current pmap %p", __func__, pmap)); 4446 #if defined(SMP) && defined(INVARIANTS) 4447 { 4448 cpuset_t other_cpus; 4449 4450 sched_pin(); 4451 other_cpus = pmap->pm_active; 4452 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 4453 sched_unpin(); 4454 KASSERT(CPU_EMPTY(&other_cpus), 4455 ("%s: pmap %p active on other cpus", __func__, pmap)); 4456 } 4457 #endif 4458 SLIST_INIT(&free); 4459 rw_wlock(&pvh_global_lock); 4460 PMAP_LOCK(pmap); 4461 sched_pin(); 4462 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4463 KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", 4464 __func__, pmap, pc->pc_pmap)); 4465 allfree = true; 4466 for (field = 0; field < _NPCM; field++) { 4467 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 4468 while (inuse != 0) { 4469 bit = ffs(inuse) - 1; 4470 bitmask = 1UL << bit; 4471 idx = field * 32 + bit; 4472 pv = &pc->pc_pventry[idx]; 4473 inuse &= ~bitmask; 4474 4475 /* 4476 * Note that we cannot remove wired pages 4477 * from a process' mapping at this time 4478 */ 4479 pte1p = pmap_pte1(pmap, pv->pv_va); 4480 pte1 = pte1_load(pte1p); 4481 if (pte1_is_section(pte1)) { 4482 if (pte1_is_wired(pte1)) { 4483 allfree = false; 4484 continue; 4485 } 4486 pte1_clear(pte1p); 4487 pmap_remove_pte1_quick(pmap, pte1, pv, 4488 &free); 4489 } 4490 else if (pte1_is_link(pte1)) { 4491 pte2p = pt2map_entry(pv->pv_va); 4492 pte2 = pte2_load(pte2p); 4493 4494 if (!pte2_is_valid(pte2)) { 4495 printf("%s: pmap %p va %#x " 4496 "pte2 %#x\n", __func__, 4497 pmap, pv->pv_va, pte2); 4498 panic("bad pte2"); 4499 } 4500 4501 if (pte2_is_wired(pte2)) { 4502 allfree = false; 4503 continue; 4504 } 4505 pte2_clear(pte2p); 4506 pmap_remove_pte2_quick(pmap, pte2, pv, 4507 &free); 4508 } else { 4509 printf("%s: pmap %p va %#x pte1 %#x\n", 4510 __func__, pmap, pv->pv_va, pte1); 4511 panic("bad pte1"); 4512 } 4513 4514 /* Mark free */ 4515 PV_STAT(pv_entry_frees++); 4516 
PV_STAT(pv_entry_spare++); 4517 pv_entry_count--; 4518 pc->pc_map[field] |= bitmask; 4519 } 4520 } 4521 if (allfree) { 4522 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4523 free_pv_chunk(pc); 4524 } 4525 } 4526 tlb_flush_all_ng_local(); 4527 sched_unpin(); 4528 rw_wunlock(&pvh_global_lock); 4529 PMAP_UNLOCK(pmap); 4530 vm_page_free_pages_toq(&free, false); 4531 } 4532 4533 /* 4534 * This code makes some *MAJOR* assumptions: 4535 * 1. Current pmap & pmap exists. 4536 * 2. Not wired. 4537 * 3. Read access. 4538 * 4. No L2 page table pages. 4539 * but is *MUCH* faster than pmap_enter... 4540 */ 4541 static vm_page_t 4542 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4543 vm_prot_t prot, vm_page_t mpt2pg) 4544 { 4545 pt2_entry_t *pte2p, pte2; 4546 vm_paddr_t pa; 4547 struct spglist free; 4548 uint32_t l2prot; 4549 4550 KASSERT(!VA_IS_CLEANMAP(va) || 4551 (m->oflags & VPO_UNMANAGED) != 0, 4552 ("%s: managed mapping within the clean submap", __func__)); 4553 rw_assert(&pvh_global_lock, RA_WLOCKED); 4554 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4555 4556 /* 4557 * In the case that a L2 page table page is not 4558 * resident, we are creating it here. 4559 */ 4560 if (va < VM_MAXUSER_ADDRESS) { 4561 u_int pte1_idx; 4562 pt1_entry_t pte1, *pte1p; 4563 vm_paddr_t pt2_pa; 4564 4565 /* 4566 * Get L1 page table things. 4567 */ 4568 pte1_idx = pte1_index(va); 4569 pte1p = pmap_pte1(pmap, va); 4570 pte1 = pte1_load(pte1p); 4571 4572 if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { 4573 /* 4574 * Each of NPT2_IN_PG L2 page tables on the page can 4575 * come here. Make sure that associated L1 page table 4576 * link is established. 4577 * 4578 * QQQ: It comes that we don't establish all links to 4579 * L2 page tables for newly allocated L2 page 4580 * tables page. 
4581 */ 4582 KASSERT(!pte1_is_section(pte1), 4583 ("%s: pte1 %#x is section", __func__, pte1)); 4584 if (!pte1_is_link(pte1)) { 4585 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), 4586 pte1_idx); 4587 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 4588 } 4589 pt2_wirecount_inc(mpt2pg, pte1_idx); 4590 } else { 4591 /* 4592 * If the L2 page table page is mapped, we just 4593 * increment the hold count, and activate it. 4594 */ 4595 if (pte1_is_section(pte1)) { 4596 return (NULL); 4597 } else if (pte1_is_link(pte1)) { 4598 mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 4599 pt2_wirecount_inc(mpt2pg, pte1_idx); 4600 } else { 4601 mpt2pg = _pmap_allocpte2(pmap, va, 4602 PMAP_ENTER_NOSLEEP); 4603 if (mpt2pg == NULL) 4604 return (NULL); 4605 } 4606 } 4607 } else { 4608 mpt2pg = NULL; 4609 } 4610 4611 /* 4612 * This call to pt2map_entry() makes the assumption that we are 4613 * entering the page into the current pmap. In order to support 4614 * quick entry into any pmap, one would likely use pmap_pte2_quick(). 4615 * But that isn't as quick as pt2map_entry(). 4616 */ 4617 pte2p = pt2map_entry(va); 4618 pte2 = pte2_load(pte2p); 4619 if (pte2_is_valid(pte2)) { 4620 if (mpt2pg != NULL) { 4621 /* 4622 * Remove extra pte2 reference 4623 */ 4624 pt2_wirecount_dec(mpt2pg, pte1_index(va)); 4625 mpt2pg = NULL; 4626 } 4627 return (NULL); 4628 } 4629 4630 /* 4631 * Enter on the PV list if part of our managed memory. 
4632 */ 4633 if ((m->oflags & VPO_UNMANAGED) == 0 && 4634 !pmap_try_insert_pv_entry(pmap, va, m)) { 4635 if (mpt2pg != NULL) { 4636 SLIST_INIT(&free); 4637 if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { 4638 pmap_tlb_flush(pmap, va); 4639 vm_page_free_pages_toq(&free, false); 4640 } 4641 4642 mpt2pg = NULL; 4643 } 4644 return (NULL); 4645 } 4646 4647 /* 4648 * Increment counters 4649 */ 4650 pmap->pm_stats.resident_count++; 4651 4652 /* 4653 * Now validate mapping with RO protection 4654 */ 4655 pa = VM_PAGE_TO_PHYS(m); 4656 l2prot = PTE2_RO | PTE2_NM; 4657 if (va < VM_MAXUSER_ADDRESS) 4658 l2prot |= PTE2_U | PTE2_NG; 4659 if ((prot & VM_PROT_EXECUTE) == 0) 4660 l2prot |= PTE2_NX; 4661 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { 4662 /* 4663 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4664 * is set. QQQ: For more info, see comments in pmap_enter(). 4665 */ 4666 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4667 } 4668 pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); 4669 4670 return (mpt2pg); 4671 } 4672 4673 void 4674 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4675 { 4676 4677 rw_wlock(&pvh_global_lock); 4678 PMAP_LOCK(pmap); 4679 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 4680 rw_wunlock(&pvh_global_lock); 4681 PMAP_UNLOCK(pmap); 4682 } 4683 4684 /* 4685 * Tries to create a read- and/or execute-only 1 MB page mapping. Returns 4686 * true if successful. Returns false if (1) a mapping already exists at the 4687 * specified virtual address or (2) a PV entry cannot be allocated without 4688 * reclaiming another PV entry. 
4689 */ 4690 static bool 4691 pmap_enter_1mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4692 { 4693 pt1_entry_t pte1; 4694 vm_paddr_t pa; 4695 4696 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4697 pa = VM_PAGE_TO_PHYS(m); 4698 pte1 = PTE1(pa, PTE1_NM | PTE1_RO, ATTR_TO_L1(vm_page_pte2_attr(m))); 4699 if ((prot & VM_PROT_EXECUTE) == 0) 4700 pte1 |= PTE1_NX; 4701 if (va < VM_MAXUSER_ADDRESS) 4702 pte1 |= PTE1_U; 4703 if (pmap != kernel_pmap) 4704 pte1 |= PTE1_NG; 4705 return (pmap_enter_pte1(pmap, va, pte1, PMAP_ENTER_NOSLEEP | 4706 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m) == KERN_SUCCESS); 4707 } 4708 4709 /* 4710 * Tries to create the specified 1 MB page mapping. Returns KERN_SUCCESS if 4711 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 4712 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 4713 * a mapping already exists at the specified virtual address. Returns 4714 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NORECLAIM was specified and PV entry 4715 * allocation failed. 4716 */ 4717 static int 4718 pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags, 4719 vm_page_t m) 4720 { 4721 struct spglist free; 4722 pt1_entry_t opte1, *pte1p; 4723 pt2_entry_t pte2, *pte2p; 4724 vm_offset_t cur, end; 4725 vm_page_t mt; 4726 4727 rw_assert(&pvh_global_lock, RA_WLOCKED); 4728 KASSERT((pte1 & (PTE1_NM | PTE1_RO)) == 0 || 4729 (pte1 & (PTE1_NM | PTE1_RO)) == (PTE1_NM | PTE1_RO), 4730 ("%s: pte1 has inconsistent NM and RO attributes", __func__)); 4731 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4732 pte1p = pmap_pte1(pmap, va); 4733 opte1 = pte1_load(pte1p); 4734 if (pte1_is_valid(opte1)) { 4735 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 4736 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4737 __func__, va, pmap); 4738 return (KERN_FAILURE); 4739 } 4740 /* Break the existing mapping(s). 
*/ 4741 SLIST_INIT(&free); 4742 if (pte1_is_section(opte1)) { 4743 /* 4744 * If the section resulted from a promotion, then a 4745 * reserved PT page could be freed. 4746 */ 4747 pmap_remove_pte1(pmap, pte1p, va, &free); 4748 } else { 4749 sched_pin(); 4750 end = va + PTE1_SIZE; 4751 for (cur = va, pte2p = pmap_pte2_quick(pmap, va); 4752 cur != end; cur += PAGE_SIZE, pte2p++) { 4753 pte2 = pte2_load(pte2p); 4754 if (!pte2_is_valid(pte2)) 4755 continue; 4756 if (pmap_remove_pte2(pmap, pte2p, cur, &free)) 4757 break; 4758 } 4759 sched_unpin(); 4760 } 4761 vm_page_free_pages_toq(&free, false); 4762 } 4763 if ((m->oflags & VPO_UNMANAGED) == 0) { 4764 /* 4765 * Abort this mapping if its PV entry could not be created. 4766 */ 4767 if (!pmap_pv_insert_pte1(pmap, va, pte1, flags)) { 4768 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4769 __func__, va, pmap); 4770 return (KERN_RESOURCE_SHORTAGE); 4771 } 4772 if ((pte1 & PTE1_RO) == 0) { 4773 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4774 vm_page_aflag_set(mt, PGA_WRITEABLE); 4775 } 4776 } 4777 4778 /* 4779 * Increment counters. 4780 */ 4781 if (pte1_is_wired(pte1)) 4782 pmap->pm_stats.wired_count += PTE1_SIZE / PAGE_SIZE; 4783 pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; 4784 4785 /* 4786 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4787 * is set. QQQ: For more info, see comments in pmap_enter(). 4788 */ 4789 if ((pte1 & PTE1_NX) == 0 && m->md.pat_mode == VM_MEMATTR_WB_WA && 4790 pmap != kernel_pmap && (!pte1_is_section(opte1) || 4791 pte1_pa(opte1) != VM_PAGE_TO_PHYS(m) || (opte1 & PTE2_NX) != 0)) 4792 cache_icache_sync_fresh(va, VM_PAGE_TO_PHYS(m), PTE1_SIZE); 4793 4794 /* 4795 * Map the section. 4796 */ 4797 pte1_store(pte1p, pte1); 4798 4799 pmap_pte1_mappings++; 4800 CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, 4801 pmap); 4802 return (KERN_SUCCESS); 4803 } 4804 4805 /* 4806 * Maps a sequence of resident pages belonging to the same object. 
4807 * The sequence begins with the given page m_start. This page is 4808 * mapped at the given virtual address start. Each subsequent page is 4809 * mapped at a virtual address that is offset from start by the same 4810 * amount as the page is offset from m_start within the object. The 4811 * last page in the sequence is the page with the largest offset from 4812 * m_start that can be mapped at a virtual address less than the given 4813 * virtual address end. Not every virtual page between start and end 4814 * is mapped; only those for which a resident page exists with the 4815 * corresponding offset from m_start are mapped. 4816 */ 4817 void 4818 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4819 vm_page_t m_start, vm_prot_t prot) 4820 { 4821 struct pctrie_iter pages; 4822 vm_offset_t va; 4823 vm_page_t m, mpt2pg; 4824 4825 PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", 4826 __func__, pmap, start, end, m_start, prot)); 4827 4828 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4829 4830 mpt2pg = NULL; 4831 vm_page_iter_limit_init(&pages, m_start->object, 4832 m_start->pindex + atop(end - start)); 4833 m = vm_radix_iter_lookup(&pages, m_start->pindex); 4834 rw_wlock(&pvh_global_lock); 4835 PMAP_LOCK(pmap); 4836 while (m != NULL) { 4837 va = start + ptoa(m->pindex - m_start->pindex); 4838 if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && 4839 m->psind == 1 && sp_enabled && 4840 pmap_enter_1mpage(pmap, va, m, prot)) { 4841 m = vm_radix_iter_jump(&pages, NBPDR / PAGE_SIZE); 4842 } else { 4843 mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, 4844 mpt2pg); 4845 m = vm_radix_iter_step(&pages); 4846 } 4847 } 4848 rw_wunlock(&pvh_global_lock); 4849 PMAP_UNLOCK(pmap); 4850 } 4851 4852 /* 4853 * This code maps large physical mmap regions into the 4854 * processor address space. Note that some shortcuts 4855 * are taken, but the code works. 
4856 */ 4857 void 4858 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4859 vm_pindex_t pindex, vm_size_t size) 4860 { 4861 struct pctrie_iter pages; 4862 pt1_entry_t *pte1p; 4863 vm_paddr_t pa, pte2_pa; 4864 vm_page_t p; 4865 vm_memattr_t pat_mode; 4866 u_int l1attr, l1prot; 4867 4868 VM_OBJECT_ASSERT_WLOCKED(object); 4869 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4870 ("%s: non-device object", __func__)); 4871 if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { 4872 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4873 return; 4874 vm_page_iter_init(&pages, object); 4875 p = vm_radix_iter_lookup(&pages, pindex); 4876 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4877 ("%s: invalid page %p", __func__, p)); 4878 pat_mode = p->md.pat_mode; 4879 4880 /* 4881 * Abort the mapping if the first page is not physically 4882 * aligned to a 1MB page boundary. 4883 */ 4884 pte2_pa = VM_PAGE_TO_PHYS(p); 4885 if (pte2_pa & PTE1_OFFSET) 4886 return; 4887 4888 /* 4889 * Skip the first page. Abort the mapping if the rest of 4890 * the pages are not physically contiguous or have differing 4891 * memory attributes. 4892 */ 4893 for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; 4894 pa += PAGE_SIZE) { 4895 p = vm_radix_iter_next(&pages); 4896 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4897 ("%s: invalid page %p", __func__, p)); 4898 if (pa != VM_PAGE_TO_PHYS(p) || 4899 pat_mode != p->md.pat_mode) 4900 return; 4901 } 4902 4903 /* 4904 * Map using 1MB pages. 4905 * 4906 * QQQ: Well, we are mapping a section, so same condition must 4907 * be hold like during promotion. It looks that only RW mapping 4908 * is done here, so readonly mapping must be done elsewhere. 
4909 */ 4910 l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; 4911 l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); 4912 PMAP_LOCK(pmap); 4913 for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { 4914 pte1p = pmap_pte1(pmap, addr); 4915 if (!pte1_is_valid(pte1_load(pte1p))) { 4916 pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); 4917 pmap->pm_stats.resident_count += PTE1_SIZE / 4918 PAGE_SIZE; 4919 pmap_pte1_mappings++; 4920 } 4921 /* Else continue on if the PTE1 is already valid. */ 4922 addr += PTE1_SIZE; 4923 } 4924 PMAP_UNLOCK(pmap); 4925 } 4926 } 4927 4928 /* 4929 * Do the things to protect a 1mpage in a process. 4930 */ 4931 static void 4932 pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 4933 vm_prot_t prot) 4934 { 4935 pt1_entry_t npte1, opte1; 4936 vm_offset_t eva, va; 4937 vm_page_t m; 4938 4939 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4940 KASSERT((sva & PTE1_OFFSET) == 0, 4941 ("%s: sva is not 1mpage aligned", __func__)); 4942 4943 opte1 = npte1 = pte1_load(pte1p); 4944 if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) { 4945 eva = sva + PTE1_SIZE; 4946 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 4947 va < eva; va += PAGE_SIZE, m++) 4948 vm_page_dirty(m); 4949 } 4950 if ((prot & VM_PROT_WRITE) == 0) 4951 npte1 |= PTE1_RO | PTE1_NM; 4952 if ((prot & VM_PROT_EXECUTE) == 0) 4953 npte1 |= PTE1_NX; 4954 4955 /* 4956 * QQQ: Herein, execute permission is never set. 4957 * It only can be cleared. So, no icache 4958 * syncing is needed. 4959 */ 4960 4961 if (npte1 != opte1) { 4962 pte1_store(pte1p, npte1); 4963 pmap_tlb_flush(pmap, sva); 4964 } 4965 } 4966 4967 /* 4968 * Set the physical protection on the 4969 * specified range of this map as requested. 
4970 */ 4971 void 4972 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4973 { 4974 bool pv_lists_locked; 4975 vm_offset_t nextva; 4976 pt1_entry_t *pte1p, pte1; 4977 pt2_entry_t *pte2p, opte2, npte2; 4978 4979 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4980 if (prot == VM_PROT_NONE) { 4981 pmap_remove(pmap, sva, eva); 4982 return; 4983 } 4984 4985 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 4986 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 4987 return; 4988 4989 if (pmap_is_current(pmap)) 4990 pv_lists_locked = false; 4991 else { 4992 pv_lists_locked = true; 4993 resume: 4994 rw_wlock(&pvh_global_lock); 4995 sched_pin(); 4996 } 4997 4998 PMAP_LOCK(pmap); 4999 for (; sva < eva; sva = nextva) { 5000 /* 5001 * Calculate address for next L2 page table. 5002 */ 5003 nextva = pte1_trunc(sva + PTE1_SIZE); 5004 if (nextva < sva) 5005 nextva = eva; 5006 5007 pte1p = pmap_pte1(pmap, sva); 5008 pte1 = pte1_load(pte1p); 5009 5010 /* 5011 * Weed out invalid mappings. Note: we assume that L1 page 5012 * page table is always allocated, and in kernel virtual. 5013 */ 5014 if (pte1 == 0) 5015 continue; 5016 5017 if (pte1_is_section(pte1)) { 5018 /* 5019 * Are we protecting the entire large page? If not, 5020 * demote the mapping and fall through. 5021 */ 5022 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 5023 pmap_protect_pte1(pmap, pte1p, sva, prot); 5024 continue; 5025 } else { 5026 if (!pv_lists_locked) { 5027 pv_lists_locked = true; 5028 if (!rw_try_wlock(&pvh_global_lock)) { 5029 PMAP_UNLOCK(pmap); 5030 goto resume; 5031 } 5032 sched_pin(); 5033 } 5034 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5035 /* 5036 * The large page mapping 5037 * was destroyed. 
5038 */ 5039 continue; 5040 } 5041 #ifdef INVARIANTS 5042 else { 5043 /* Update pte1 after demotion */ 5044 pte1 = pte1_load(pte1p); 5045 } 5046 #endif 5047 } 5048 } 5049 5050 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 5051 " is not link", __func__, pmap, sva, pte1, pte1p)); 5052 5053 /* 5054 * Limit our scan to either the end of the va represented 5055 * by the current L2 page table page, or to the end of the 5056 * range being protected. 5057 */ 5058 if (nextva > eva) 5059 nextva = eva; 5060 5061 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5062 sva += PAGE_SIZE) { 5063 vm_page_t m; 5064 5065 opte2 = npte2 = pte2_load(pte2p); 5066 if (!pte2_is_valid(opte2)) 5067 continue; 5068 5069 if ((prot & VM_PROT_WRITE) == 0) { 5070 if (pte2_is_managed(opte2) && 5071 pte2_is_dirty(opte2)) { 5072 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 5073 vm_page_dirty(m); 5074 } 5075 npte2 |= PTE2_RO | PTE2_NM; 5076 } 5077 5078 if ((prot & VM_PROT_EXECUTE) == 0) 5079 npte2 |= PTE2_NX; 5080 5081 /* 5082 * QQQ: Herein, execute permission is never set. 5083 * It only can be cleared. So, no icache 5084 * syncing is needed. 5085 */ 5086 5087 if (npte2 != opte2) { 5088 pte2_store(pte2p, npte2); 5089 pmap_tlb_flush(pmap, sva); 5090 } 5091 } 5092 } 5093 if (pv_lists_locked) { 5094 sched_unpin(); 5095 rw_wunlock(&pvh_global_lock); 5096 } 5097 PMAP_UNLOCK(pmap); 5098 } 5099 5100 /* 5101 * pmap_pvh_wired_mappings: 5102 * 5103 * Return the updated number "count" of managed mappings that are wired. 
5104 */ 5105 static int 5106 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 5107 { 5108 pmap_t pmap; 5109 pt1_entry_t pte1; 5110 pt2_entry_t pte2; 5111 pv_entry_t pv; 5112 5113 rw_assert(&pvh_global_lock, RA_WLOCKED); 5114 sched_pin(); 5115 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5116 pmap = PV_PMAP(pv); 5117 PMAP_LOCK(pmap); 5118 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5119 if (pte1_is_section(pte1)) { 5120 if (pte1_is_wired(pte1)) 5121 count++; 5122 } else { 5123 KASSERT(pte1_is_link(pte1), 5124 ("%s: pte1 %#x is not link", __func__, pte1)); 5125 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5126 if (pte2_is_wired(pte2)) 5127 count++; 5128 } 5129 PMAP_UNLOCK(pmap); 5130 } 5131 sched_unpin(); 5132 return (count); 5133 } 5134 5135 /* 5136 * pmap_page_wired_mappings: 5137 * 5138 * Return the number of managed mappings to the given physical page 5139 * that are wired. 5140 */ 5141 int 5142 pmap_page_wired_mappings(vm_page_t m) 5143 { 5144 int count; 5145 5146 count = 0; 5147 if ((m->oflags & VPO_UNMANAGED) != 0) 5148 return (count); 5149 rw_wlock(&pvh_global_lock); 5150 count = pmap_pvh_wired_mappings(&m->md, count); 5151 if ((m->flags & PG_FICTITIOUS) == 0) { 5152 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 5153 count); 5154 } 5155 rw_wunlock(&pvh_global_lock); 5156 return (count); 5157 } 5158 5159 /* 5160 * Returns true if any of the given mappings were used to modify 5161 * physical memory. Otherwise, returns false. Both page and 1mpage 5162 * mappings are supported. 
5163 */ 5164 static bool 5165 pmap_is_modified_pvh(struct md_page *pvh) 5166 { 5167 pv_entry_t pv; 5168 pt1_entry_t pte1; 5169 pt2_entry_t pte2; 5170 pmap_t pmap; 5171 bool rv; 5172 5173 rw_assert(&pvh_global_lock, RA_WLOCKED); 5174 rv = false; 5175 sched_pin(); 5176 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5177 pmap = PV_PMAP(pv); 5178 PMAP_LOCK(pmap); 5179 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5180 if (pte1_is_section(pte1)) { 5181 rv = pte1_is_dirty(pte1); 5182 } else { 5183 KASSERT(pte1_is_link(pte1), 5184 ("%s: pte1 %#x is not link", __func__, pte1)); 5185 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5186 rv = pte2_is_dirty(pte2); 5187 } 5188 PMAP_UNLOCK(pmap); 5189 if (rv) 5190 break; 5191 } 5192 sched_unpin(); 5193 return (rv); 5194 } 5195 5196 /* 5197 * pmap_is_modified: 5198 * 5199 * Return whether or not the specified physical page was modified 5200 * in any physical maps. 5201 */ 5202 bool 5203 pmap_is_modified(vm_page_t m) 5204 { 5205 bool rv; 5206 5207 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5208 ("%s: page %p is not managed", __func__, m)); 5209 5210 /* 5211 * If the page is not busied then this check is racy. 5212 */ 5213 if (!pmap_page_is_write_mapped(m)) 5214 return (false); 5215 rw_wlock(&pvh_global_lock); 5216 rv = pmap_is_modified_pvh(&m->md) || 5217 ((m->flags & PG_FICTITIOUS) == 0 && 5218 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5219 rw_wunlock(&pvh_global_lock); 5220 return (rv); 5221 } 5222 5223 /* 5224 * pmap_is_prefaultable: 5225 * 5226 * Return whether or not the specified virtual address is eligible 5227 * for prefault. 
5228 */ 5229 bool 5230 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5231 { 5232 pt1_entry_t pte1; 5233 pt2_entry_t pte2; 5234 bool rv; 5235 5236 rv = false; 5237 PMAP_LOCK(pmap); 5238 pte1 = pte1_load(pmap_pte1(pmap, addr)); 5239 if (pte1_is_link(pte1)) { 5240 pte2 = pte2_load(pt2map_entry(addr)); 5241 rv = !pte2_is_valid(pte2) ; 5242 } 5243 PMAP_UNLOCK(pmap); 5244 return (rv); 5245 } 5246 5247 /* 5248 * Returns true if any of the given mappings were referenced and false 5249 * otherwise. Both page and 1mpage mappings are supported. 5250 */ 5251 static bool 5252 pmap_is_referenced_pvh(struct md_page *pvh) 5253 { 5254 5255 pv_entry_t pv; 5256 pt1_entry_t pte1; 5257 pt2_entry_t pte2; 5258 pmap_t pmap; 5259 bool rv; 5260 5261 rw_assert(&pvh_global_lock, RA_WLOCKED); 5262 rv = false; 5263 sched_pin(); 5264 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5265 pmap = PV_PMAP(pv); 5266 PMAP_LOCK(pmap); 5267 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5268 if (pte1_is_section(pte1)) { 5269 rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); 5270 } else { 5271 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5272 rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); 5273 } 5274 PMAP_UNLOCK(pmap); 5275 if (rv) 5276 break; 5277 } 5278 sched_unpin(); 5279 return (rv); 5280 } 5281 5282 /* 5283 * pmap_is_referenced: 5284 * 5285 * Return whether or not the specified physical page was referenced 5286 * in any physical maps. 5287 */ 5288 bool 5289 pmap_is_referenced(vm_page_t m) 5290 { 5291 bool rv; 5292 5293 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5294 ("%s: page %p is not managed", __func__, m)); 5295 rw_wlock(&pvh_global_lock); 5296 rv = pmap_is_referenced_pvh(&m->md) || 5297 ((m->flags & PG_FICTITIOUS) == 0 && 5298 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5299 rw_wunlock(&pvh_global_lock); 5300 return (rv); 5301 } 5302 5303 /* 5304 * pmap_ts_referenced: 5305 * 5306 * Return a count of reference bits for a page, clearing those bits. 
*	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 *
 *	Each pv list that is walked is rotated as it is walked: an examined
 *	entry is moved to the tail, so a later call starts with entries that
 *	were not examined when this call stopped early.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	pt1_entry_t *pte1p, opte1;
	pt2_entry_t *pte2p, opte2;
	vm_paddr_t pa;
	int rtval = 0;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("%s: page %p is not managed", __func__, m));
	pa = VM_PAGE_TO_PHYS(m);
	pvh = pa_to_pvh(pa);
	rw_wlock(&pvh_global_lock);
	sched_pin();

	/*
	 * First pass: 1MB section mappings.  It is skipped for fictitious
	 * pages and when the section pv list is empty.  "pvf" remembers the
	 * first entry so that the rotating walk below knows when it has come
	 * full circle.
	 */
	if ((m->flags & PG_FICTITIOUS) != 0 ||
	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte1p = pmap_pte1(pmap, pv->pv_va);
		opte1 = pte1_load(pte1p);
		if (pte1_is_dirty(opte1)) {
			/*
			 * Although "opte1" is mapping a 1MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((opte1 & PTE1_A) != 0) {
			/*
			 * Since this reference bit is shared by 256 4KB pages,
			 * it should not be cleared every time it is tested.
			 * Apply a simple "hash" function on the physical page
			 * number, the virtual section number, and the pmap
			 * address to select one 4KB page out of the 256
			 * on which testing the reference bit will result
			 * in clearing that bit.  This function is designed
			 * to avoid the selection of the same 4KB page
			 * for every 1MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the section is wired, the current state of
			 * its reference bit won't affect page replacement.
			 *
			 * NOTE(review): the text above speaks of one page out
			 * of 256, but the mask below is NPTE2_IN_PG - 1.  If
			 * NPTE2_IN_PG is larger than 256, the extra mask bits
			 * are constant across the 4KB pages of one section,
			 * so for some section mappings no page would ever be
			 * selected.  Confirm against the definition of
			 * NPTE2_IN_PG, which is not visible here.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^
			    (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 &&
			    !pte1_is_wired(opte1)) {
				pte1_clear_bit(pte1p, PTE1_A);
				pmap_tlb_flush(pmap, pv->pv_va);
			}
			/* Counted whether or not the bit was cleared. */
			rtval++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
		}
		if (rtval >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	/* Second pass: the page's own (4KB) mappings. */
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte1p = pmap_pte1(pmap, pv->pv_va);
		KASSERT(pte1_is_link(pte1_load(pte1p)),
		    ("%s: not found a link in page %p's pv list", __func__, m));

		pte2p = pmap_pte2_quick(pmap, pv->pv_va);
		opte2 = pte2_load(pte2p);
		if (pte2_is_dirty(opte2))
			vm_page_dirty(m);
		if ((opte2 & PTE2_A) != 0) {
			/* Here the reference bit is always cleared. */
			pte2_clear_bit(pte2p, PTE2_A);
			pmap_tlb_flush(pmap, pv->pv_va);
			rtval++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
	    PMAP_TS_REFERENCED_MAX);
out:
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	return (rtval);
}

/*
 *	Clear the wired attribute from the mappings for the specified range of
 *	addresses in the given pmap.  Every valid mapping within that range
 *	must have the wired attribute set.  In contrast, invalid mappings
 *	cannot have the wired attribute set, so they are ignored.
 *
 *	The wired attribute of the page table entry is not a hardware feature,
 *	so there is no need to invalidate any TLB entries.
 *
 *	A valid mapping in the range that is not wired causes a panic, as
 *	does a failed demotion of a partially covered section.
 */
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t nextva;
	pt1_entry_t *pte1p, pte1;
	pt2_entry_t *pte2p, pte2;
	bool pv_lists_locked;

	/*
	 * For the current pmap, pvh_global_lock is taken lazily, only if a
	 * section has to be demoted.  For any other pmap it is taken (and
	 * the thread pinned) up front.
	 */
	if (pmap_is_current(pmap))
		pv_lists_locked = false;
	else {
		pv_lists_locked = true;
resume:
		rw_wlock(&pvh_global_lock);
		sched_pin();
	}
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = nextva) {
		/* Start of the next section; on wrap-around stop at eva. */
		nextva = pte1_trunc(sva + PTE1_SIZE);
		if (nextva < sva)
			nextva = eva;

		pte1p = pmap_pte1(pmap, sva);
		pte1 = pte1_load(pte1p);

		/*
		 * Weed out invalid mappings.  Note: we assume that the L1
		 * page table is always allocated, and in kernel virtual.
		 */
		if (pte1 == 0)
			continue;

		if (pte1_is_section(pte1)) {
			if (!pte1_is_wired(pte1))
				panic("%s: pte1 %#x not wired", __func__, pte1);

			/*
			 * Are we unwiring the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + PTE1_SIZE == nextva && eva >= nextva) {
				pte1_clear_bit(pte1p, PTE1_W);
				pmap->pm_stats.wired_count -= PTE1_SIZE /
				    PAGE_SIZE;
				continue;
			} else {
				if (!pv_lists_locked) {
					pv_lists_locked = true;
					/*
					 * If the lock cannot be taken without
					 * blocking, drop the pmap lock and
					 * restart at "resume", which takes
					 * pvh_global_lock before the pmap
					 * lock.
					 */
					if (!rw_try_wlock(&pvh_global_lock)) {
						PMAP_UNLOCK(pmap);
						/* Repeat sva. */
						goto resume;
					}
					sched_pin();
				}
				if (!pmap_demote_pte1(pmap, pte1p, sva))
					panic("%s: demotion failed", __func__);
#ifdef INVARIANTS
				else {
					/*
					 * Update pte1 after demotion; it is
					 * only read again by the KASSERT
					 * below.
					 */
					pte1 = pte1_load(pte1p);
				}
#endif
			}
		}

		KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p"
		    " is not link", __func__, pmap, sva, pte1, pte1p));

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current L2 page table page, or to the end of the
		 * range being unwired.
		 */
		if (nextva > eva)
			nextva = eva;

		for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++,
		    sva += PAGE_SIZE) {
			pte2 = pte2_load(pte2p);
			if (!pte2_is_valid(pte2))
				continue;
			if (!pte2_is_wired(pte2))
				panic("%s: pte2 %#x is missing PTE2_W",
				    __func__, pte2);

			/*
			 * PTE2_W must be cleared atomically.  Although the pmap
			 * lock synchronizes access to PTE2_W, another processor
			 * could be changing PTE2_NM and/or PTE2_A concurrently.
			 */
			pte2_clear_bit(pte2p, PTE2_W);
			pmap->pm_stats.wired_count--;
		}
	}
	if (pv_lists_locked) {
		sched_unpin();
		rw_wunlock(&pvh_global_lock);
	}
	PMAP_UNLOCK(pmap);
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
5536 */ 5537 void 5538 pmap_remove_write(vm_page_t m) 5539 { 5540 struct md_page *pvh; 5541 pv_entry_t next_pv, pv; 5542 pmap_t pmap; 5543 pt1_entry_t *pte1p; 5544 pt2_entry_t *pte2p, opte2; 5545 vm_offset_t va; 5546 5547 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5548 ("%s: page %p is not managed", __func__, m)); 5549 vm_page_assert_busied(m); 5550 5551 if (!pmap_page_is_write_mapped(m)) 5552 return; 5553 rw_wlock(&pvh_global_lock); 5554 sched_pin(); 5555 if ((m->flags & PG_FICTITIOUS) != 0) 5556 goto small_mappings; 5557 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5558 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5559 va = pv->pv_va; 5560 pmap = PV_PMAP(pv); 5561 PMAP_LOCK(pmap); 5562 pte1p = pmap_pte1(pmap, va); 5563 if (!(pte1_load(pte1p) & PTE1_RO)) 5564 (void)pmap_demote_pte1(pmap, pte1p, va); 5565 PMAP_UNLOCK(pmap); 5566 } 5567 small_mappings: 5568 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5569 pmap = PV_PMAP(pv); 5570 PMAP_LOCK(pmap); 5571 pte1p = pmap_pte1(pmap, pv->pv_va); 5572 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5573 " a section in page %p's pv list", __func__, m)); 5574 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5575 opte2 = pte2_load(pte2p); 5576 if (!(opte2 & PTE2_RO)) { 5577 pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); 5578 if (pte2_is_dirty(opte2)) 5579 vm_page_dirty(m); 5580 pmap_tlb_flush(pmap, pv->pv_va); 5581 } 5582 PMAP_UNLOCK(pmap); 5583 } 5584 vm_page_aflag_clear(m, PGA_WRITEABLE); 5585 sched_unpin(); 5586 rw_wunlock(&pvh_global_lock); 5587 } 5588 5589 /* 5590 * Apply the given advice to the specified range of addresses within the 5591 * given pmap. Depending on the advice, clear the referenced and/or 5592 * modified flags in each mapping and set the mapped page's dirty field. 
*/
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	pt1_entry_t *pte1p, opte1;
	pt2_entry_t *pte2p, pte2;
	vm_offset_t pdnxt;
	vm_page_t m;
	bool pv_lists_locked;

	/* Only these two kinds of advice are acted upon. */
	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;

	/*
	 * For the current pmap, pvh_global_lock is taken lazily, only when
	 * a managed section has to be demoted.  Otherwise it is taken (and
	 * the thread pinned) before the pmap lock.
	 */
	if (pmap_is_current(pmap))
		pv_lists_locked = false;
	else {
		pv_lists_locked = true;
resume:
		rw_wlock(&pvh_global_lock);
		sched_pin();
	}
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = pdnxt) {
		/* Start of the next section; on wrap-around stop at eva. */
		pdnxt = pte1_trunc(sva + PTE1_SIZE);
		if (pdnxt < sva)
			pdnxt = eva;
		pte1p = pmap_pte1(pmap, sva);
		opte1 = pte1_load(pte1p);
		if (!pte1_is_valid(opte1)) /* XXX */
			continue;
		else if (pte1_is_section(opte1)) {
			/* Unmanaged sections are left alone. */
			if (!pte1_is_managed(opte1))
				continue;
			if (!pv_lists_locked) {
				pv_lists_locked = true;
				/*
				 * If the lock cannot be taken without
				 * blocking, drop the pmap lock and restart
				 * at "resume" with the current sva.
				 */
				if (!rw_try_wlock(&pvh_global_lock)) {
					PMAP_UNLOCK(pmap);
					goto resume;
				}
				sched_pin();
			}
			if (!pmap_demote_pte1(pmap, pte1p, sva)) {
				/*
				 * The large page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Since the underlying L2 page
			 * table is fully populated, this removal never
			 * frees a L2 page table page.
			 */
			if (!pte1_is_wired(opte1)) {
				pte2p = pmap_pte2_quick(pmap, sva);
				KASSERT(pte2_is_valid(pte2_load(pte2p)),
				    ("%s: invalid PTE2", __func__));
				pmap_remove_pte2(pmap, pte2p, sva, NULL);
			}
		}
		if (pdnxt > eva)
			pdnxt = eva;

		/* Walk the L2 entries of this section that lie in range. */
		for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++,
		    sva += PAGE_SIZE) {
			pte2 = pte2_load(pte2p);
			if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2))
				continue;
			else if (pte2_is_dirty(pte2)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PHYS_TO_VM_PAGE(pte2_pa(pte2));
					vm_page_dirty(m);
				}
				/* Mark not-modified and not-referenced. */
				pte2_set_bit(pte2p, PTE2_NM);
				pte2_clear_bit(pte2p, PTE2_A);
			} else if ((pte2 & PTE2_A) != 0)
				pte2_clear_bit(pte2p, PTE2_A);
			else
				continue;
			/* Reached only when the entry was changed above. */
			pmap_tlb_flush(pmap, sva);
		}
	}
	if (pv_lists_locked) {
		sched_unpin();
		rw_wunlock(&pvh_global_lock);
	}
	PMAP_UNLOCK(pmap);
}

/*
 *	Clear the modify bits on the specified physical page.
*/
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t next_pv, pv;
	pmap_t pmap;
	pt1_entry_t *pte1p, opte1;
	pt2_entry_t *pte2p, opte2;
	vm_offset_t va;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("%s: page %p is not managed", __func__, m));
	vm_page_assert_busied(m);

	/* Nothing to do for a page that is not write mapped. */
	if (!pmap_page_is_write_mapped(m))
		return;
	rw_wlock(&pvh_global_lock);
	sched_pin();

	/* The section pv list is not consulted for fictitious pages. */
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte1p = pmap_pte1(pmap, va);
		opte1 = pte1_load(pte1p);
		/* Only section mappings that are not read-only matter. */
		if (!(opte1 & PTE1_RO)) {
			if (pmap_demote_pte1(pmap, pte1p, va) &&
			    !pte1_is_wired(opte1)) {
				/*
				 * Write protect the mapping to a
				 * single page so that a subsequent
				 * write access may repromote.
				 */
				/* Advance va to this page within the section. */
				va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1);
				pte2p = pmap_pte2_quick(pmap, va);
				opte2 = pte2_load(pte2p);
				if ((opte2 & PTE2_V)) {
					pte2_set_bit(pte2p, PTE2_NM | PTE2_RO);
					vm_page_dirty(m);
					pmap_tlb_flush(pmap, va);
				}
			}
		}
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	/* For 4KB mappings, set PTE2_NM on every entry that is dirty. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte1p = pmap_pte1(pmap, pv->pv_va);
		KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found"
		    " a section in page %p's pv list", __func__, m));
		pte2p = pmap_pte2_quick(pmap, pv->pv_va);
		if (pte2_is_dirty(pte2_load(pte2p))) {
			pte2_set_bit(pte2p, PTE2_NM);
			pmap_tlb_flush(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
}

/*
 *	Sets the memory attribute for the specified page.
5757 */ 5758 void 5759 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5760 { 5761 pt2_entry_t *cmap2_pte2p; 5762 vm_memattr_t oma; 5763 vm_paddr_t pa; 5764 struct pcpu *pc; 5765 5766 oma = m->md.pat_mode; 5767 m->md.pat_mode = ma; 5768 5769 CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, 5770 VM_PAGE_TO_PHYS(m), oma, ma); 5771 if (ma == oma || (m->flags & PG_FICTITIOUS) != 0) 5772 return; 5773 #if 0 5774 /* 5775 * If "m" is a normal page, flush it from the cache. 5776 * 5777 * First, try to find an existing mapping of the page by sf 5778 * buffer. sf_buf_invalidate_cache() modifies mapping and 5779 * flushes the cache. 5780 */ 5781 if (sf_buf_invalidate_cache(m, oma)) 5782 return; 5783 #endif 5784 /* 5785 * If page is not mapped by sf buffer, map the page 5786 * transient and do invalidation. 5787 */ 5788 pa = VM_PAGE_TO_PHYS(m); 5789 sched_pin(); 5790 pc = get_pcpu(); 5791 cmap2_pte2p = pc->pc_cmap2_pte2p; 5792 mtx_lock(&pc->pc_cmap_lock); 5793 if (pte2_load(cmap2_pte2p) != 0) 5794 panic("%s: CMAP2 busy", __func__); 5795 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 5796 vm_memattr_to_pte2(ma))); 5797 dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); 5798 pte2_clear(cmap2_pte2p); 5799 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5800 sched_unpin(); 5801 mtx_unlock(&pc->pc_cmap_lock); 5802 } 5803 5804 /* 5805 * Miscellaneous support routines follow 5806 */ 5807 5808 /* 5809 * Returns true if the given page is mapped individually or as part of 5810 * a 1mpage. Otherwise, returns false. 
5811 */ 5812 bool 5813 pmap_page_is_mapped(vm_page_t m) 5814 { 5815 bool rv; 5816 5817 if ((m->oflags & VPO_UNMANAGED) != 0) 5818 return (false); 5819 rw_wlock(&pvh_global_lock); 5820 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5821 ((m->flags & PG_FICTITIOUS) == 0 && 5822 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5823 rw_wunlock(&pvh_global_lock); 5824 return (rv); 5825 } 5826 5827 /* 5828 * Returns true if the pmap's pv is one of the first 5829 * 16 pvs linked to from this page. This count may 5830 * be changed upwards or downwards in the future; it 5831 * is only necessary that true be returned for a small 5832 * subset of pmaps for proper page aging. 5833 */ 5834 bool 5835 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5836 { 5837 struct md_page *pvh; 5838 pv_entry_t pv; 5839 int loops = 0; 5840 bool rv; 5841 5842 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5843 ("%s: page %p is not managed", __func__, m)); 5844 rv = false; 5845 rw_wlock(&pvh_global_lock); 5846 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5847 if (PV_PMAP(pv) == pmap) { 5848 rv = true; 5849 break; 5850 } 5851 loops++; 5852 if (loops >= 16) 5853 break; 5854 } 5855 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5856 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5857 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5858 if (PV_PMAP(pv) == pmap) { 5859 rv = true; 5860 break; 5861 } 5862 loops++; 5863 if (loops >= 16) 5864 break; 5865 } 5866 } 5867 rw_wunlock(&pvh_global_lock); 5868 return (rv); 5869 } 5870 5871 /* 5872 * pmap_zero_page zeros the specified hardware page by mapping 5873 * the page into KVM and using bzero to clear its contents. 
5874 */ 5875 void 5876 pmap_zero_page(vm_page_t m) 5877 { 5878 pt2_entry_t *cmap2_pte2p; 5879 struct pcpu *pc; 5880 5881 sched_pin(); 5882 pc = get_pcpu(); 5883 cmap2_pte2p = pc->pc_cmap2_pte2p; 5884 mtx_lock(&pc->pc_cmap_lock); 5885 if (pte2_load(cmap2_pte2p) != 0) 5886 panic("%s: CMAP2 busy", __func__); 5887 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5888 vm_page_pte2_attr(m))); 5889 pagezero(pc->pc_cmap2_addr); 5890 pte2_clear(cmap2_pte2p); 5891 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5892 sched_unpin(); 5893 mtx_unlock(&pc->pc_cmap_lock); 5894 } 5895 5896 /* 5897 * pmap_zero_page_area zeros the specified hardware page by mapping 5898 * the page into KVM and using bzero to clear its contents. 5899 * 5900 * off and size may not cover an area beyond a single hardware page. 5901 */ 5902 void 5903 pmap_zero_page_area(vm_page_t m, int off, int size) 5904 { 5905 pt2_entry_t *cmap2_pte2p; 5906 struct pcpu *pc; 5907 5908 sched_pin(); 5909 pc = get_pcpu(); 5910 cmap2_pte2p = pc->pc_cmap2_pte2p; 5911 mtx_lock(&pc->pc_cmap_lock); 5912 if (pte2_load(cmap2_pte2p) != 0) 5913 panic("%s: CMAP2 busy", __func__); 5914 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5915 vm_page_pte2_attr(m))); 5916 if (off == 0 && size == PAGE_SIZE) 5917 pagezero(pc->pc_cmap2_addr); 5918 else 5919 bzero(pc->pc_cmap2_addr + off, size); 5920 pte2_clear(cmap2_pte2p); 5921 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5922 sched_unpin(); 5923 mtx_unlock(&pc->pc_cmap_lock); 5924 } 5925 5926 /* 5927 * pmap_copy_page copies the specified (machine independent) 5928 * page by mapping the page into virtual memory and using 5929 * memcpy to copy the page, one machine dependent page at a 5930 * time. 
5931 */ 5932 void 5933 pmap_copy_page(vm_page_t src, vm_page_t dst) 5934 { 5935 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5936 struct pcpu *pc; 5937 5938 sched_pin(); 5939 pc = get_pcpu(); 5940 cmap1_pte2p = pc->pc_cmap1_pte2p; 5941 cmap2_pte2p = pc->pc_cmap2_pte2p; 5942 mtx_lock(&pc->pc_cmap_lock); 5943 if (pte2_load(cmap1_pte2p) != 0) 5944 panic("%s: CMAP1 busy", __func__); 5945 if (pte2_load(cmap2_pte2p) != 0) 5946 panic("%s: CMAP2 busy", __func__); 5947 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), 5948 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); 5949 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), 5950 PTE2_AP_KRW, vm_page_pte2_attr(dst))); 5951 memcpy(pc->pc_cmap2_addr, pc->pc_cmap1_addr, PAGE_SIZE); 5952 pte2_clear(cmap1_pte2p); 5953 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5954 pte2_clear(cmap2_pte2p); 5955 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5956 sched_unpin(); 5957 mtx_unlock(&pc->pc_cmap_lock); 5958 } 5959 5960 int unmapped_buf_allowed = 1; 5961 5962 void 5963 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5964 vm_offset_t b_offset, int xfersize) 5965 { 5966 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5967 vm_page_t a_pg, b_pg; 5968 char *a_cp, *b_cp; 5969 vm_offset_t a_pg_offset, b_pg_offset; 5970 struct pcpu *pc; 5971 int cnt; 5972 5973 sched_pin(); 5974 pc = get_pcpu(); 5975 cmap1_pte2p = pc->pc_cmap1_pte2p; 5976 cmap2_pte2p = pc->pc_cmap2_pte2p; 5977 mtx_lock(&pc->pc_cmap_lock); 5978 if (pte2_load(cmap1_pte2p) != 0) 5979 panic("pmap_copy_pages: CMAP1 busy"); 5980 if (pte2_load(cmap2_pte2p) != 0) 5981 panic("pmap_copy_pages: CMAP2 busy"); 5982 while (xfersize > 0) { 5983 a_pg = ma[a_offset >> PAGE_SHIFT]; 5984 a_pg_offset = a_offset & PAGE_MASK; 5985 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5986 b_pg = mb[b_offset >> PAGE_SHIFT]; 5987 b_pg_offset = b_offset & PAGE_MASK; 5988 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5989 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), 5990 
PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); 5991 tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); 5992 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), 5993 PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); 5994 tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); 5995 a_cp = pc->pc_cmap1_addr + a_pg_offset; 5996 b_cp = pc->pc_cmap2_addr + b_pg_offset; 5997 memcpy(b_cp, a_cp, cnt); 5998 a_offset += cnt; 5999 b_offset += cnt; 6000 xfersize -= cnt; 6001 } 6002 pte2_clear(cmap1_pte2p); 6003 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 6004 pte2_clear(cmap2_pte2p); 6005 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6006 sched_unpin(); 6007 mtx_unlock(&pc->pc_cmap_lock); 6008 } 6009 6010 void * 6011 pmap_quick_enter_page(vm_page_t m) 6012 { 6013 struct pcpu *pc; 6014 pt2_entry_t *pte2p; 6015 6016 critical_enter(); 6017 pc = get_pcpu(); 6018 pte2p = pc->pc_qmap_pte2p; 6019 6020 KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); 6021 6022 pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6023 vm_page_pte2_attr(m))); 6024 return (pc->pc_qmap_addr); 6025 } 6026 6027 void 6028 pmap_quick_remove_page(void *addr) 6029 { 6030 struct pcpu *pc; 6031 pt2_entry_t *pte2p; 6032 6033 pc = get_pcpu(); 6034 pte2p = pc->pc_qmap_pte2p; 6035 6036 KASSERT(addr == pc->pc_qmap_addr, 6037 ("%s: invalid address", __func__)); 6038 KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); 6039 6040 pte2_clear(pte2p); 6041 tlb_flush((vm_offset_t)pc->pc_qmap_addr); 6042 critical_exit(); 6043 } 6044 6045 /* 6046 * Copy the range specified by src_addr/len 6047 * from the source map to the range dst_addr/len 6048 * in the destination map. 6049 * 6050 * This routine is only advisory and need not do anything. 
*/
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
	struct spglist free;
	vm_offset_t addr;
	vm_offset_t end_addr = src_addr + len;
	vm_offset_t nextva;

	/* Only same-address copies are attempted. */
	if (dst_addr != src_addr)
		return;

	/* The source page tables are read through pt2map_entry() below. */
	if (!pmap_is_current(src_pmap))
		return;

	rw_wlock(&pvh_global_lock);
	/* Lock the two pmaps in address order. */
	if (dst_pmap < src_pmap) {
		PMAP_LOCK(dst_pmap);
		PMAP_LOCK(src_pmap);
	} else {
		PMAP_LOCK(src_pmap);
		PMAP_LOCK(dst_pmap);
	}
	sched_pin();
	for (addr = src_addr; addr < end_addr; addr = nextva) {
		pt2_entry_t *src_pte2p, *dst_pte2p;
		vm_page_t dst_mpt2pg, src_mpt2pg;
		pt1_entry_t src_pte1;
		u_int pte1_idx;

		KASSERT(addr < VM_MAXUSER_ADDRESS,
		    ("%s: invalid to pmap_copy page tables", __func__));

		/* Start of the next section; on wrap-around stop at end. */
		nextva = pte1_trunc(addr + PTE1_SIZE);
		if (nextva < addr)
			nextva = end_addr;

		pte1_idx = pte1_index(addr);
		src_pte1 = src_pmap->pm_pt1[pte1_idx];
		if (pte1_is_section(src_pte1)) {
			/* Only sections lying entirely in range are copied. */
			if ((addr & PTE1_OFFSET) != 0 ||
			    (addr + PTE1_SIZE) > end_addr)
				continue;
			/*
			 * Copy the section if the destination slot is empty
			 * and, for a managed section, a pv entry could be
			 * inserted.  The wired bit is not inherited.
			 */
			if (dst_pmap->pm_pt1[pte1_idx] == 0 &&
			    (!pte1_is_managed(src_pte1) ||
			    pmap_pv_insert_pte1(dst_pmap, addr, src_pte1,
			    PMAP_ENTER_NORECLAIM))) {
				dst_pmap->pm_pt1[pte1_idx] = src_pte1 &
				    ~PTE1_W;
				dst_pmap->pm_stats.resident_count +=
				    PTE1_SIZE / PAGE_SIZE;
				pmap_pte1_mappings++;
			}
			continue;
		} else if (!pte1_is_link(src_pte1))
			continue;

		src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1));

		/*
		 * We leave PT2s to be linked from PT1 even if they are not
		 * referenced until all PT2s in a page are without reference.
		 *
		 * QQQ: It could be changed ...
		 */
#if 0 /* single_pt2_link_is_cleared */
		KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0,
		    ("%s: source page table page is unused", __func__));
#else
		if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0)
			continue;
#endif
		if (nextva > end_addr)
			nextva = end_addr;

		src_pte2p = pt2map_entry(addr);
		while (addr < nextva) {
			pt2_entry_t temp_pte2;
			temp_pte2 = pte2_load(src_pte2p);
			/*
			 * we only virtual copy managed pages
			 */
			if (pte2_is_managed(temp_pte2)) {
				/* Stop copying altogether on any failure. */
				dst_mpt2pg = pmap_allocpte2(dst_pmap, addr,
				    PMAP_ENTER_NOSLEEP);
				if (dst_mpt2pg == NULL)
					goto out;
				dst_pte2p = pmap_pte2_quick(dst_pmap, addr);
				if (!pte2_is_valid(pte2_load(dst_pte2p)) &&
				    pmap_try_insert_pv_entry(dst_pmap, addr,
				    PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) {
					/*
					 * Clear the wired, modified, and
					 * accessed (referenced) bits
					 * during the copy.
					 */
					temp_pte2 &=  ~(PTE2_W | PTE2_A);
					temp_pte2 |= PTE2_NM;
					pte2_store(dst_pte2p, temp_pte2);
					dst_pmap->pm_stats.resident_count++;
				} else {
					/*
					 * Release the page table reference
					 * taken by pmap_allocpte2() above,
					 * freeing whatever that releases.
					 */
					SLIST_INIT(&free);
					if (pmap_unwire_pt2(dst_pmap, addr,
					    dst_mpt2pg, &free)) {
						pmap_tlb_flush(dst_pmap, addr);
						vm_page_free_pages_toq(&free,
						    false);
					}
					goto out;
				}
				/*
				 * Leave this L2 table once the destination
				 * wire count has caught up with the source.
				 */
				if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >=
				    pt2_wirecount_get(src_mpt2pg, pte1_idx))
					break;
			}
			addr += PAGE_SIZE;
			src_pte2p++;
		}
	}
out:
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(src_pmap);
	PMAP_UNLOCK(dst_pmap);
}

/*
 *	Increase the starting virtual address of the given mapping if a
 *	different alignment might result in more section mappings.
6180 */ 6181 void 6182 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6183 vm_offset_t *addr, vm_size_t size) 6184 { 6185 vm_offset_t pte1_offset; 6186 6187 if (size < PTE1_SIZE) 6188 return; 6189 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6190 offset += ptoa(object->pg_color); 6191 pte1_offset = offset & PTE1_OFFSET; 6192 if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || 6193 (*addr & PTE1_OFFSET) == pte1_offset) 6194 return; 6195 if ((*addr & PTE1_OFFSET) < pte1_offset) 6196 *addr = pte1_trunc(*addr) + pte1_offset; 6197 else 6198 *addr = pte1_roundup(*addr) + pte1_offset; 6199 } 6200 6201 void 6202 pmap_activate(struct thread *td) 6203 { 6204 pmap_t pmap, oldpmap; 6205 u_int cpuid, ttb; 6206 6207 PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); 6208 6209 critical_enter(); 6210 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6211 oldpmap = PCPU_GET(curpmap); 6212 cpuid = PCPU_GET(cpuid); 6213 6214 #if defined(SMP) 6215 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6216 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6217 #else 6218 CPU_CLR(cpuid, &oldpmap->pm_active); 6219 CPU_SET(cpuid, &pmap->pm_active); 6220 #endif 6221 6222 ttb = pmap_ttb_get(pmap); 6223 6224 /* 6225 * pmap_activate is for the current thread on the current cpu 6226 */ 6227 td->td_pcb->pcb_pagedir = ttb; 6228 cp15_ttbr_set(ttb); 6229 PCPU_SET(curpmap, pmap); 6230 critical_exit(); 6231 } 6232 6233 void 6234 pmap_active_cpus(pmap_t pmap, cpuset_t *res) 6235 { 6236 *res = pmap->pm_active; 6237 } 6238 6239 /* 6240 * Perform the pmap work for mincore(2). If the page is not both referenced and 6241 * modified by this pmap, returns its physical address so that the caller can 6242 * find other mappings. 
6243 */ 6244 int 6245 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 6246 { 6247 pt1_entry_t *pte1p, pte1; 6248 pt2_entry_t *pte2p, pte2; 6249 vm_paddr_t pa; 6250 bool managed; 6251 int val; 6252 6253 PMAP_LOCK(pmap); 6254 pte1p = pmap_pte1(pmap, addr); 6255 pte1 = pte1_load(pte1p); 6256 if (pte1_is_section(pte1)) { 6257 pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); 6258 managed = pte1_is_managed(pte1); 6259 val = MINCORE_PSIND(1) | MINCORE_INCORE; 6260 if (pte1_is_dirty(pte1)) 6261 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6262 if (pte1 & PTE1_A) 6263 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6264 } else if (pte1_is_link(pte1)) { 6265 pte2p = pmap_pte2(pmap, addr); 6266 pte2 = pte2_load(pte2p); 6267 pmap_pte2_release(pte2p); 6268 pa = pte2_pa(pte2); 6269 managed = pte2_is_managed(pte2); 6270 val = MINCORE_INCORE; 6271 if (pte2_is_dirty(pte2)) 6272 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6273 if (pte2 & PTE2_A) 6274 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6275 } else { 6276 managed = false; 6277 val = 0; 6278 } 6279 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6280 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 6281 *pap = pa; 6282 } 6283 PMAP_UNLOCK(pmap); 6284 return (val); 6285 } 6286 6287 void 6288 pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) 6289 { 6290 pmap_kenter(va, size, pa, VM_MEMATTR_DEVICE); 6291 } 6292 6293 void 6294 pmap_kremove_device(vm_offset_t va, vm_size_t size) 6295 { 6296 vm_offset_t sva; 6297 6298 KASSERT((size & PAGE_MASK) == 0, 6299 ("%s: device mapping not page-sized", __func__)); 6300 6301 sva = va; 6302 while (size != 0) { 6303 pmap_kremove(va); 6304 va += PAGE_SIZE; 6305 size -= PAGE_SIZE; 6306 } 6307 tlb_flush_range(sva, va - sva); 6308 } 6309 6310 void 6311 pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) 6312 { 6313 6314 pcb->pcb_pagedir = pmap_ttb_get(pmap); 6315 } 6316 6317 /* 6318 * Clean L1 data cache 
* range by physical address.
 * The range must be within a single page.
 *
 * The page containing "pa" is mapped transiently at this CPU's CMAP2
 * slot with the PTE2 attributes given in "attr"; dcache_wb_pou() is then
 * applied to the corresponding "size" bytes of that mapping.
 */
static void
pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr)
{
	pt2_entry_t *cmap2_pte2p;
	struct pcpu *pc;

	KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE,
	    ("%s: not on single page", __func__));

	sched_pin();
	pc = get_pcpu();
	cmap2_pte2p = pc->pc_cmap2_pte2p;
	mtx_lock(&pc->pc_cmap_lock);
	if (pte2_load(cmap2_pte2p) != 0)
		panic("%s: CMAP2 busy", __func__);
	pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr));
	/* Same offset within the page as "pa" has. */
	dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size);
	pte2_clear(cmap2_pte2p);
	tlb_flush((vm_offset_t)pc->pc_cmap2_addr);
	sched_unpin();
	mtx_unlock(&pc->pc_cmap_lock);
}

/*
 * Sync instruction cache range which is not mapped yet.
 *
 * The d-cache is written back page by page through pmap_dcache_wb_pou()
 * for the physical range [pa, pa + size), after which the whole i-cache
 * is invalidated.  The "va" argument is not used by this function.
 */
void
cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size)
{
	uint32_t len, offset;
	vm_page_t m;

	/* Write back d-cache on given address range. */
	offset = pa & PAGE_MASK;
	/* Only the first chunk can start at a non-zero page offset. */
	for ( ; size != 0; size -= len, pa += len, offset = 0) {
		len = min(PAGE_SIZE - offset, size);
		m = PHYS_TO_VM_PAGE(pa);
		KASSERT(m != NULL, ("%s: vm_page_t is null for %#x",
		    __func__, pa));
		pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m));
	}
	/*
	 * I-cache is VIPT. Only way how to flush all virtual mappings
	 * on given physical address is to invalidate all i-cache.
	 */
	icache_inv_all();
}

void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size)
{

	/* Write back d-cache on given address range.
*/ 6374 if (va >= VM_MIN_KERNEL_ADDRESS) { 6375 dcache_wb_pou(va, size); 6376 } else { 6377 uint32_t len, offset; 6378 vm_paddr_t pa; 6379 vm_page_t m; 6380 6381 offset = va & PAGE_MASK; 6382 for ( ; size != 0; size -= len, va += len, offset = 0) { 6383 pa = pmap_extract(pmap, va); /* offset is preserved */ 6384 len = min(PAGE_SIZE - offset, size); 6385 m = PHYS_TO_VM_PAGE(pa); 6386 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", 6387 __func__, pa)); 6388 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); 6389 } 6390 } 6391 /* 6392 * I-cache is VIPT. Only way how to flush all virtual mappings 6393 * on given physical address is to invalidate all i-cache. 6394 */ 6395 icache_inv_all(); 6396 } 6397 6398 /* 6399 * The implementation of pmap_fault() uses IN_RANGE2() macro which 6400 * depends on the fact that given range size is a power of 2. 6401 */ 6402 CTASSERT(powerof2(NB_IN_PT1)); 6403 CTASSERT(powerof2(PT2MAP_SIZE)); 6404 6405 #define IN_RANGE2(addr, start, size) \ 6406 ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) 6407 6408 /* 6409 * Handle access and R/W emulation faults. 6410 */ 6411 int 6412 pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode) 6413 { 6414 pt1_entry_t *pte1p, pte1; 6415 pt2_entry_t *pte2p, pte2; 6416 6417 if (pmap == NULL) 6418 pmap = kernel_pmap; 6419 6420 /* 6421 * In kernel, we should never get abort with FAR which is in range of 6422 * pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here 6423 * and print out a useful abort message and even get to the debugger 6424 * otherwise it likely ends with never ending loop of aborts. 6425 */ 6426 if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { 6427 /* 6428 * All L1 tables should always be mapped and present. 6429 * However, we check only current one herein. For user mode, 6430 * only permission abort from malicious user is not fatal. 6431 * And alignment abort as it may have higher priority. 
6432 */ 6433 if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) { 6434 CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", 6435 __func__, pmap, pmap->pm_pt1, far); 6436 panic("%s: pm_pt1 abort", __func__); 6437 } 6438 return (KERN_INVALID_ADDRESS); 6439 } 6440 if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { 6441 /* 6442 * PT2MAP should be always mapped and present in current 6443 * L1 table. However, only existing L2 tables are mapped 6444 * in PT2MAP. For user mode, only L2 translation abort and 6445 * permission abort from malicious user is not fatal. 6446 * And alignment abort as it may have higher priority. 6447 */ 6448 if (!usermode || (idx != FAULT_ALIGN && 6449 idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { 6450 CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", 6451 __func__, pmap, PT2MAP, far); 6452 panic("%s: PT2MAP abort", __func__); 6453 } 6454 return (KERN_INVALID_ADDRESS); 6455 } 6456 6457 /* 6458 * A pmap lock is used below for handling of access and R/W emulation 6459 * aborts. They were handled by atomic operations before so some 6460 * analysis of new situation is needed to answer the following question: 6461 * Is it safe to use the lock even for these aborts? 6462 * 6463 * There may happen two cases in general: 6464 * 6465 * (1) Aborts while the pmap lock is locked already - this should not 6466 * happen as pmap lock is not recursive. However, under pmap lock only 6467 * internal kernel data should be accessed and such data should be 6468 * mapped with A bit set and NM bit cleared. If double abort happens, 6469 * then a mapping of data which has caused it must be fixed. Further, 6470 * all new mappings are always made with A bit set and the bit can be 6471 * cleared only on managed mappings. 6472 * 6473 * (2) Aborts while another lock(s) is/are locked - this already can 6474 * happen. However, there is no difference here if it's either access or 6475 * R/W emulation abort, or if it's some other abort. 
6476 */ 6477 6478 PMAP_LOCK(pmap); 6479 #ifdef INVARIANTS 6480 pte1 = pte1_load(pmap_pte1(pmap, far)); 6481 if (pte1_is_link(pte1)) { 6482 /* 6483 * Check in advance that associated L2 page table is mapped into 6484 * PT2MAP space. Note that faulty access to not mapped L2 page 6485 * table is caught in more general check above where "far" is 6486 * checked that it does not lay in PT2MAP space. Note also that 6487 * L1 page table and PT2TAB always exist and are mapped. 6488 */ 6489 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); 6490 if (!pte2_is_valid(pte2)) 6491 panic("%s: missing L2 page table (%p, %#x)", 6492 __func__, pmap, far); 6493 } 6494 #endif 6495 #ifdef SMP 6496 /* 6497 * Special treatment is due to break-before-make approach done when 6498 * pte1 is updated for userland mapping during section promotion or 6499 * demotion. If not caught here, pmap_enter() can find a section 6500 * mapping on faulting address. That is not allowed. 6501 */ 6502 if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) { 6503 PMAP_UNLOCK(pmap); 6504 return (KERN_SUCCESS); 6505 } 6506 #endif 6507 /* 6508 * Access bits for page and section. Note that the entry 6509 * is not in TLB yet, so TLB flush is not necessary. 6510 * 6511 * QQQ: This is hardware emulation, we do not call userret() 6512 * for aborts from user mode. 6513 */ 6514 if (idx == FAULT_ACCESS_L2) { 6515 pte1 = pte1_load(pmap_pte1(pmap, far)); 6516 if (pte1_is_link(pte1)) { 6517 /* L2 page table should exist and be mapped. */ 6518 pte2p = pt2map_entry(far); 6519 pte2 = pte2_load(pte2p); 6520 if (pte2_is_valid(pte2)) { 6521 pte2_store(pte2p, pte2 | PTE2_A); 6522 PMAP_UNLOCK(pmap); 6523 return (KERN_SUCCESS); 6524 } 6525 } else { 6526 /* 6527 * We got L2 access fault but PTE1 is not a link. 6528 * Probably some race happened, do nothing. 
6529 */ 6530 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L2 - pmap %#x far %#x", 6531 __func__, pmap, far); 6532 PMAP_UNLOCK(pmap); 6533 return (KERN_SUCCESS); 6534 } 6535 } 6536 if (idx == FAULT_ACCESS_L1) { 6537 pte1p = pmap_pte1(pmap, far); 6538 pte1 = pte1_load(pte1p); 6539 if (pte1_is_section(pte1)) { 6540 pte1_store(pte1p, pte1 | PTE1_A); 6541 PMAP_UNLOCK(pmap); 6542 return (KERN_SUCCESS); 6543 } else { 6544 /* 6545 * We got L1 access fault but PTE1 is not section 6546 * mapping. Probably some race happened, do nothing. 6547 */ 6548 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L1 - pmap %#x far %#x", 6549 __func__, pmap, far); 6550 PMAP_UNLOCK(pmap); 6551 return (KERN_SUCCESS); 6552 } 6553 } 6554 6555 /* 6556 * Handle modify bits for page and section. Note that the modify 6557 * bit is emulated by software. So PTEx_RO is software read only 6558 * bit and PTEx_NM flag is real hardware read only bit. 6559 * 6560 * QQQ: This is hardware emulation, we do not call userret() 6561 * for aborts from user mode. 6562 */ 6563 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { 6564 pte1 = pte1_load(pmap_pte1(pmap, far)); 6565 if (pte1_is_link(pte1)) { 6566 /* L2 page table should exist and be mapped. */ 6567 pte2p = pt2map_entry(far); 6568 pte2 = pte2_load(pte2p); 6569 if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && 6570 (pte2 & PTE2_NM)) { 6571 pte2_store(pte2p, pte2 & ~PTE2_NM); 6572 tlb_flush(trunc_page(far)); 6573 PMAP_UNLOCK(pmap); 6574 return (KERN_SUCCESS); 6575 } 6576 } else { 6577 /* 6578 * We got L2 permission fault but PTE1 is not a link. 6579 * Probably some race happened, do nothing. 
6580 */ 6581 CTR3(KTR_PMAP, "%s: FAULT_PERM_L2 - pmap %#x far %#x", 6582 __func__, pmap, far); 6583 PMAP_UNLOCK(pmap); 6584 return (KERN_SUCCESS); 6585 } 6586 } 6587 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { 6588 pte1p = pmap_pte1(pmap, far); 6589 pte1 = pte1_load(pte1p); 6590 if (pte1_is_section(pte1)) { 6591 if (!(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) { 6592 pte1_store(pte1p, pte1 & ~PTE1_NM); 6593 tlb_flush(pte1_trunc(far)); 6594 PMAP_UNLOCK(pmap); 6595 return (KERN_SUCCESS); 6596 } 6597 } else { 6598 /* 6599 * We got L1 permission fault but PTE1 is not section 6600 * mapping. Probably some race happened, do nothing. 6601 */ 6602 CTR3(KTR_PMAP, "%s: FAULT_PERM_L1 - pmap %#x far %#x", 6603 __func__, pmap, far); 6604 PMAP_UNLOCK(pmap); 6605 return (KERN_SUCCESS); 6606 } 6607 } 6608 6609 /* 6610 * QQQ: The previous code, mainly fast handling of access and 6611 * modify bits aborts, could be moved to ASM. Now we are 6612 * starting to deal with not fast aborts. 6613 */ 6614 PMAP_UNLOCK(pmap); 6615 return (KERN_FAILURE); 6616 } 6617 6618 #if defined(PMAP_DEBUG) 6619 /* 6620 * Reusing of KVA used in pmap_zero_page function !!! 
6621 */ 6622 static void 6623 pmap_zero_page_check(vm_page_t m) 6624 { 6625 pt2_entry_t *cmap2_pte2p; 6626 uint32_t *p, *end; 6627 struct pcpu *pc; 6628 6629 sched_pin(); 6630 pc = get_pcpu(); 6631 cmap2_pte2p = pc->pc_cmap2_pte2p; 6632 mtx_lock(&pc->pc_cmap_lock); 6633 if (pte2_load(cmap2_pte2p) != 0) 6634 panic("%s: CMAP2 busy", __func__); 6635 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6636 vm_page_pte2_attr(m))); 6637 end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); 6638 for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) 6639 if (*p != 0) 6640 panic("%s: page %p not zero, va: %p", __func__, m, 6641 pc->pc_cmap2_addr); 6642 pte2_clear(cmap2_pte2p); 6643 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6644 sched_unpin(); 6645 mtx_unlock(&pc->pc_cmap_lock); 6646 } 6647 6648 int 6649 pmap_pid_dump(int pid) 6650 { 6651 pmap_t pmap; 6652 struct proc *p; 6653 int npte2 = 0; 6654 int i, j, index; 6655 6656 sx_slock(&allproc_lock); 6657 FOREACH_PROC_IN_SYSTEM(p) { 6658 if (p->p_pid != pid || p->p_vmspace == NULL) 6659 continue; 6660 index = 0; 6661 pmap = vmspace_pmap(p->p_vmspace); 6662 for (i = 0; i < NPTE1_IN_PT1; i++) { 6663 pt1_entry_t pte1; 6664 pt2_entry_t *pte2p, pte2; 6665 vm_offset_t base, va; 6666 vm_paddr_t pa; 6667 vm_page_t m; 6668 6669 base = i << PTE1_SHIFT; 6670 pte1 = pte1_load(&pmap->pm_pt1[i]); 6671 6672 if (pte1_is_section(pte1)) { 6673 /* 6674 * QQQ: Do something here! 
6675 */ 6676 } else if (pte1_is_link(pte1)) { 6677 for (j = 0; j < NPTE2_IN_PT2; j++) { 6678 va = base + (j << PAGE_SHIFT); 6679 if (va >= VM_MIN_KERNEL_ADDRESS) { 6680 if (index) { 6681 index = 0; 6682 printf("\n"); 6683 } 6684 sx_sunlock(&allproc_lock); 6685 return (npte2); 6686 } 6687 pte2p = pmap_pte2(pmap, va); 6688 pte2 = pte2_load(pte2p); 6689 pmap_pte2_release(pte2p); 6690 if (!pte2_is_valid(pte2)) 6691 continue; 6692 6693 pa = pte2_pa(pte2); 6694 m = PHYS_TO_VM_PAGE(pa); 6695 printf("va: 0x%x, pa: 0x%x, w: %d, " 6696 "f: 0x%x", va, pa, 6697 m->ref_count, m->flags); 6698 npte2++; 6699 index++; 6700 if (index >= 2) { 6701 index = 0; 6702 printf("\n"); 6703 } else { 6704 printf(" "); 6705 } 6706 } 6707 } 6708 } 6709 } 6710 sx_sunlock(&allproc_lock); 6711 return (npte2); 6712 } 6713 6714 #endif 6715 6716 #ifdef DDB 6717 static pt2_entry_t * 6718 pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) 6719 { 6720 pt1_entry_t pte1; 6721 vm_paddr_t pt2pg_pa; 6722 6723 pte1 = pte1_load(pmap_pte1(pmap, va)); 6724 if (!pte1_is_link(pte1)) 6725 return (NULL); 6726 6727 if (pmap_is_current(pmap)) 6728 return (pt2map_entry(va)); 6729 6730 /* Note that L2 page table size is not equal to PAGE_SIZE. 
*/ 6731 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 6732 if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { 6733 pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); 6734 #ifdef SMP 6735 PMAP3cpu = PCPU_GET(cpuid); 6736 #endif 6737 tlb_flush_local((vm_offset_t)PADDR3); 6738 } 6739 #ifdef SMP 6740 else if (PMAP3cpu != PCPU_GET(cpuid)) { 6741 PMAP3cpu = PCPU_GET(cpuid); 6742 tlb_flush_local((vm_offset_t)PADDR3); 6743 } 6744 #endif 6745 return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 6746 } 6747 6748 static void 6749 dump_pmap(pmap_t pmap) 6750 { 6751 6752 printf("pmap %p\n", pmap); 6753 printf(" pm_pt1: %p\n", pmap->pm_pt1); 6754 printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); 6755 printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); 6756 } 6757 6758 DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) 6759 { 6760 6761 pmap_t pmap; 6762 LIST_FOREACH(pmap, &allpmaps, pm_list) { 6763 dump_pmap(pmap); 6764 } 6765 } 6766 6767 static int 6768 pte2_class(pt2_entry_t pte2) 6769 { 6770 int cls; 6771 6772 cls = (pte2 >> 2) & 0x03; 6773 cls |= (pte2 >> 4) & 0x04; 6774 return (cls); 6775 } 6776 6777 static void 6778 dump_section(pmap_t pmap, uint32_t pte1_idx) 6779 { 6780 } 6781 6782 static void 6783 dump_link(pmap_t pmap, uint32_t pte1_idx, bool invalid_ok) 6784 { 6785 uint32_t i; 6786 vm_offset_t va; 6787 pt2_entry_t *pte2p, pte2; 6788 vm_page_t m; 6789 6790 va = pte1_idx << PTE1_SHIFT; 6791 pte2p = pmap_pte2_ddb(pmap, va); 6792 for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { 6793 pte2 = pte2_load(pte2p); 6794 if (pte2 == 0) 6795 continue; 6796 if (!pte2_is_valid(pte2)) { 6797 printf(" 0x%08X: 0x%08X", va, pte2); 6798 if (!invalid_ok) 6799 printf(" - not valid !!!"); 6800 printf("\n"); 6801 continue; 6802 } 6803 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 6804 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, 6805 pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); 6806 if (m != NULL) { 6807 printf(" v:%d w:%d f:0x%04X\n", m->valid, 6808 m->ref_count, m->flags); 6809 } 
else { 6810 printf("\n"); 6811 } 6812 } 6813 } 6814 6815 static __inline bool 6816 is_pv_chunk_space(vm_offset_t va) 6817 { 6818 6819 if ((((vm_offset_t)pv_chunkbase) <= va) && 6820 (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) 6821 return (true); 6822 return (false); 6823 } 6824 6825 DB_SHOW_COMMAND(pmap, pmap_pmap_print) 6826 { 6827 /* XXX convert args. */ 6828 pmap_t pmap = (pmap_t)addr; 6829 pt1_entry_t pte1; 6830 pt2_entry_t pte2; 6831 vm_offset_t va, eva; 6832 vm_page_t m; 6833 uint32_t i; 6834 bool invalid_ok, dump_link_ok, dump_pv_chunk; 6835 6836 if (have_addr) { 6837 pmap_t pm; 6838 6839 LIST_FOREACH(pm, &allpmaps, pm_list) 6840 if (pm == pmap) break; 6841 if (pm == NULL) { 6842 printf("given pmap %p is not in allpmaps list\n", pmap); 6843 return; 6844 } 6845 } else 6846 pmap = PCPU_GET(curpmap); 6847 6848 eva = (modif[0] == 'u') ? VM_MAXUSER_ADDRESS : 0xFFFFFFFF; 6849 dump_pv_chunk = false; /* XXX evaluate from modif[] */ 6850 6851 printf("pmap: 0x%08X\n", (uint32_t)pmap); 6852 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6853 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6854 6855 for(i = 0; i < NPTE1_IN_PT1; i++) { 6856 pte1 = pte1_load(&pmap->pm_pt1[i]); 6857 if (pte1 == 0) 6858 continue; 6859 va = i << PTE1_SHIFT; 6860 if (va >= eva) 6861 break; 6862 6863 if (pte1_is_section(pte1)) { 6864 printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, 6865 !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); 6866 dump_section(pmap, i); 6867 } else if (pte1_is_link(pte1)) { 6868 dump_link_ok = true; 6869 invalid_ok = false; 6870 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6871 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 6872 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", 6873 va, pte1, pte2, m); 6874 if (is_pv_chunk_space(va)) { 6875 printf(" - pv_chunk space"); 6876 if (dump_pv_chunk) 6877 invalid_ok = true; 6878 else 6879 dump_link_ok = false; 6880 } 6881 else if (m != NULL) 6882 printf(" w:%d w2:%u", m->ref_count, 6883 
pt2_wirecount_get(m, pte1_index(va))); 6884 if (pte2 == 0) 6885 printf(" !!! pt2tab entry is ZERO"); 6886 else if (pte2_pa(pte1) != pte2_pa(pte2)) 6887 printf(" !!! pt2tab entry is DIFFERENT - m: %p", 6888 PHYS_TO_VM_PAGE(pte2_pa(pte2))); 6889 printf("\n"); 6890 if (dump_link_ok) 6891 dump_link(pmap, i, invalid_ok); 6892 } else 6893 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6894 } 6895 } 6896 6897 static void 6898 dump_pt2tab(pmap_t pmap) 6899 { 6900 uint32_t i; 6901 pt2_entry_t pte2; 6902 vm_offset_t va; 6903 vm_paddr_t pa; 6904 vm_page_t m; 6905 6906 printf("PT2TAB:\n"); 6907 for (i = 0; i < PT2TAB_ENTRIES; i++) { 6908 pte2 = pte2_load(&pmap->pm_pt2tab[i]); 6909 if (!pte2_is_valid(pte2)) 6910 continue; 6911 va = i << PT2TAB_SHIFT; 6912 pa = pte2_pa(pte2); 6913 m = PHYS_TO_VM_PAGE(pa); 6914 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, 6915 pte2_class(pte2), !!(pte2 & PTE2_S), m); 6916 if (m != NULL) 6917 printf(" , w: %d, f: 0x%04X pidx: %lld", 6918 m->ref_count, m->flags, m->pindex); 6919 printf("\n"); 6920 } 6921 } 6922 6923 DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) 6924 { 6925 /* XXX convert args. 
*/ 6926 pmap_t pmap = (pmap_t)addr; 6927 pt1_entry_t pte1; 6928 pt2_entry_t pte2; 6929 vm_offset_t va; 6930 uint32_t i, start; 6931 6932 if (have_addr) { 6933 printf("supported only on current pmap\n"); 6934 return; 6935 } 6936 6937 pmap = PCPU_GET(curpmap); 6938 printf("curpmap: 0x%08X\n", (uint32_t)pmap); 6939 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6940 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6941 6942 start = pte1_index((vm_offset_t)PT2MAP); 6943 for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { 6944 pte1 = pte1_load(&pmap->pm_pt1[i]); 6945 if (pte1 == 0) 6946 continue; 6947 va = i << PTE1_SHIFT; 6948 if (pte1_is_section(pte1)) { 6949 printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, 6950 !!(pte1 & PTE1_S)); 6951 dump_section(pmap, i); 6952 } else if (pte1_is_link(pte1)) { 6953 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6954 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, 6955 pte1, pte2); 6956 if (pte2 == 0) 6957 printf(" !!! pt2tab entry is ZERO\n"); 6958 } else 6959 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6960 } 6961 dump_pt2tab(pmap); 6962 } 6963 #endif 6964