/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2018 Joyent, Inc. All rights reserved.
 * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
 */

/*
 * VM - Hardware Address Translation management for i386 and amd64
 *
 * Implementation of the interfaces described in <common/vm/hat.h>
 *
 * Nearly all the details of how the hardware is managed should not be
 * visible outside this layer except for misc. machine specific functions
 * that work in conjunction with this code.
 *
 * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
 */

/*
 * amd64 HAT Design
 *
 * ----------
 * Background
 * ----------
 *
 * On x86, the address space is shared between a user process and the kernel.
 * This is different from SPARC. Conventionally, the kernel lives at the top
 * of the address space and the user process gets to enjoy the rest of it. If
 * you look at the image of the address map in uts/i86pc/os/startup.c, you'll
 * get a rough sense of how the address space is laid out and used.
 *
 * Every unique address space is represented by an instance of a HAT structure
 * called a 'hat_t'. In addition to a hat_t structure for each process, there
 * is also one that is used for the kernel (kas.a_hat), and each CPU
 * ultimately also has a HAT.
 *
 * Each HAT contains a pointer to its root page table. This root page table is
 * what we call an L3 page table in illumos and Intel calls the PML4. It is
 * the physical address of the L3 table that we place in the %cr3 register,
 * which the processor uses.
 *
 * Each of the many layers of the page table is represented by a structure
 * called an htable_t. The htable_t manages a set of 512 8-byte entries. The
 * number of entries in a given page table is constant across all different
 * level page tables. Note, this is only true on amd64. This has not always
 * been the case on x86.
 *
 * Each entry in a page table, generally referred to as a PTE, may refer to
 * another page table or a memory location, depending on the level of the page
 * table and the use of large pages. Importantly, the top-level L3 page table
 * (PML4) only supports linking to further page tables. This is also true on
 * systems which support a 5th level page table (which we do not currently
 * support).
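 *
 * As a rough illustration (not authoritative; mmu_init() below sets the
 * actual level_shift values of 12, 21, 30 and 39), a 64-bit virtual address
 * decomposes into per-level table indices like this:
 *
 *	l3 (PML4) index:  (va >> 39) & 0x1ff
 *	l2 index:         (va >> 30) & 0x1ff
 *	l1 index:         (va >> 21) & 0x1ff
 *	l0 index:         (va >> 12) & 0x1ff
 *	page offset:      va & 0xfff
 *
 * Each index selects one of the 512 entries managed by the htable_t at that
 * level; htable_va2entry() performs this computation for a given htable.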
 *
 * Historically, on x86, when a process was running on a CPU, the root of its
 * page table was inserted into %cr3 on each CPU on which it was currently
 * running. When processes would switch (by calling hat_switch()), then the
 * value in %cr3 on that CPU would change to that of the new HAT. While this
 * behavior is still maintained in the xpv kernel, this is not what is done
 * today.
 *
 * -------------------
 * Per-CPU Page Tables
 * -------------------
 *
 * Throughout the system the 64-bit kernel has a notion of what it calls a
 * per-CPU page table or PCP. The notion of a per-CPU page table was
 * introduced as part of the original work to support x86 PAE. On the 64-bit
 * kernel, it was originally used for 32-bit processes. The rationale behind
 * this was that each 32-bit process could have all of its memory represented
 * in a single L2 page table, as each L2 page table entry represents 1 GiB of
 * memory.
 *
 * Following on from this, the idea was that given that all of the L3 page
 * table entries for 32-bit processes are basically going to be identical with
 * the exception of the first entry in the page table, why not share those
 * page table entries. This gave rise to the idea of a per-CPU page table.
 *
 * The way this works is that we have a member in the machcpu_t called the
 * mcpu_hat_info. That structure contains two different 4k pages: one that
 * represents the L3 page table and one that represents an L2 page table. When
 * the CPU starts up, the L3 page table entries are copied in from the
 * kernel's page table. The L3 kernel entries do not change throughout the
 * lifetime of the kernel. The kernel portion of these L3 pages is identical
 * on every CPU, meaning that they point to the same L2 page tables and thus
 * see a consistent view of the world.
 *
 * When a 32-bit process is loaded into this world, we copy the 32-bit
 * process's four top-level page table entries into the CPU's L2 page table
 * and then set the CPU's first L3 page table entry to point to the CPU's L2
 * page. Specifically, in hat_pcp_update(), we're copying from the process's
 * HAT_COPIED_32 HAT into the page tables specific to this CPU.
 *
 * As part of the implementation of kernel page table isolation, this was also
 * extended to 64-bit processes. When a 64-bit process runs, we'll copy their
 * L3 PTEs across into the current CPU's L3 page table. (As we can't do the
 * first-L3-entry trick for 64-bit processes, ->hci_pcp_l2ptes is unused in
 * this case.)
 *
 * The use of per-CPU page tables has a lot of implementation ramifications. A
 * HAT that runs a user process will be flagged with the HAT_COPIED flag to
 * indicate that it is using the per-CPU page table functionality. In tandem
 * with the HAT, the top-level htable_t will be flagged with the HTABLE_COPIED
 * flag. If the HAT represents a 32-bit process, then we will also set the
 * HAT_COPIED_32 flag on that hat_t.
 *
 * These two flags work together. The top-level htable_t when using per-CPU
 * page tables is 'virtual'. We never allocate a ptable for this htable_t
 * (i.e. ht->ht_pfn is PFN_INVALID). Instead, when we need to modify a PTE in
 * an HTABLE_COPIED ptable, x86pte_access_pagetable() will redirect any
 * accesses to ht_hat->hat_copied_ptes.
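 *
 * A minimal sketch of that redirection (illustrative only; the real logic
 * lives in x86pte_access_pagetable() and must also map in the physical
 * page table for the normal case):
 *
 *	if (ht->ht_flags & HTABLE_COPIED) {
 *		ASSERT(ht->ht_pfn == PFN_INVALID);
 *		ptep = &ht->ht_hat->hat_copied_ptes[entry];
 *	} else {
 *		ptep = <mapping of ht->ht_pfn's page table> + entry;
 *	}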
 *
 * Of course, such a modification won't actually modify the HAT_PCP page
 * tables that were copied from the HAT_COPIED htable. When we change the top
 * level page table entries (L2 PTEs for a 32-bit process and L3 PTEs for a
 * 64-bit process), we need to make sure to trigger hat_pcp_update() on all
 * CPUs that are currently tied to this HAT (including the current CPU).
 *
 * To do this, PCP piggy-backs on TLB invalidation, specifically via the
 * hat_tlb_inval() path from link_ptp() and unlink_ptp().
 *
 * (Importantly, in all such cases, when this is in operation, the top-level
 * entry should not be able to refer to an actual page table entry that can be
 * changed and consolidated into a large page. If large page consolidation is
 * required here, then there will be much that needs to be reconsidered.)
 *
 * -----------------------------------------------
 * Kernel Page Table Isolation and the Per-CPU HAT
 * -----------------------------------------------
 *
 * All Intel CPUs that support speculative execution and paging are subject to
 * a series of bugs that have been termed 'Meltdown'. These exploits allow a
 * user process to read kernel memory through cache side channels and
 * speculative execution. To mitigate this on vulnerable CPUs, we need to use
 * a technique called kernel page table isolation. What this requires is that
 * we have two different page table roots. When executing in kernel mode, we
 * will use a %cr3 value that has both the user and kernel pages. However,
 * when executing in user mode, we will use a %cr3 that has all of the user
 * pages but only the subset of kernel pages required to operate.
 *
 * These kernel pages that we need mapped are:
 *
 *  o Kernel text that allows us to switch between the cr3 values.
 *  o The current global descriptor table (GDT)
 *  o The current interrupt descriptor table (IDT)
 *  o The current task switching state (TSS)
 *  o The current local descriptor table (LDT)
 *  o Stacks and scratch space used by the interrupt handlers
 *
 * For more information on the stack switching techniques, construction of the
 * trampolines, and more, please see i86pc/ml/kpti_trampolines.s. The most
 * important aspects of these mappings are the following two constraints:
 *
 *  o The mappings are all per-CPU (except for read-only text)
 *  o The mappings are static. They are all established before the CPU is
 *    started (with the exception of the boot CPU).
 *
 * To facilitate the kernel page table isolation we employ our per-CPU page
 * tables discussed in the previous section and add the notion of a per-CPU
 * HAT. Fundamentally we have a second page table root. There is both a kernel
 * L3 page table (hci_pcp_l3ptes) and a user L3 page table (hci_user_l3ptes).
 * Both will have the user page table entries copied into them, the same way
 * that we discussed in the section 'Per-CPU Page Tables'.
 *
 * The complex part of this is how we construct the set of kernel mappings
 * that should be present when running with the user page table. To answer
 * that, we add the notion of a per-CPU HAT. This HAT functions like a normal
 * HAT, except that it's not really associated with an address space the same
 * way that other HATs are.
 *
 * This HAT is the hci_user_hat member of the 'struct hat_cpu_info' that hangs
 * off of the machcpu.
 * We use this per-CPU HAT to create the set of kernel mappings that should be
 * present on this CPU. The kernel mappings are added to the per-CPU HAT
 * through the function hati_cpu_punchin(). Once a mapping has been punched
 * in, it may not be punched out. The reason that we opt to leverage a HAT
 * structure is that it knows how to allocate and manage all of the lower
 * level page tables as required.
 *
 * Because all of the mappings are present at the beginning of time for this
 * CPU and none of the mappings are in the kernel pageable segment, we don't
 * have to worry about faulting on these HAT structures, and thus the notion
 * of the current HAT that we're using is always the appropriate HAT for the
 * process (usually a user HAT or the kernel's HAT).
 *
 * A further constraint we place on the system with these per-CPU HATs is that
 * they are not subject to htable_steal(). Because each CPU will have a rather
 * fixed number of page tables, the same way that we don't steal from the
 * kernel's HAT, it was determined that we should not steal from this HAT due
 * to the complications involved and the somewhat criminal nature of
 * htable_steal().
 *
 * The per-CPU HAT is initialized in hat_pcp_setup() which is called as part
 * of onlining the CPU, but before the CPU is actually started. The per-CPU
 * HAT is removed in hat_pcp_teardown() which is called when a CPU is being
 * offlined to be removed from the system (which is different from what psradm
 * usually does).
 *
 * Finally, once the CPU has been onlined, the set of mappings in the per-CPU
 * HAT must not change. The HAT related functions that we call are not meant
 * to be called when we're switching between processes. For example, it is
 * quite possible that if they were, they would try to grab an htable mutex
 * which another thread might have. One needs to treat hat_switch() as though
 * it were running above LOCK_LEVEL and therefore it _must not_ block under
 * any circumstance.
 */

#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/disp.h>
#include <sys/shm.h>
#include <sys/sysmacros.h>
#include <sys/vmem.h>
#include <sys/vmsystm.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/x86_archext.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/controlregs.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <sys/archsystm.h>

#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>
#ifdef __xpv
#include <sys/hypervisor.h>
#endif
#include <vm/kboot_mmu.h>
#include <vm/seg_spt.h>

#include <sys/cmn_err.h>

/*
 * Basic parameters for hat operation.
 */
struct hat_mmu_info mmu;

/*
 * The page that is the kernel's top level pagetable.
 *
 * For 32 bit PAE support on i86pc, the kernel hat will use the 1st 4 entries
 * on this 4K page for its top level page table.
The remaining groups of 284 * 4 entries are used for per processor copies of user PCP pagetables for 285 * running threads. See hat_switch() and reload_pae32() for details. 286 * 287 * pcp_page[0..3] - level==2 PTEs for kernel HAT 288 * pcp_page[4..7] - level==2 PTEs for user thread on cpu 0 289 * pcp_page[8..11] - level==2 PTE for user thread on cpu 1 290 * etc... 291 * 292 * On the 64-bit kernel, this is the normal root of the page table and there is 293 * nothing special about it when used for other CPUs. 294 */ 295 static x86pte_t *pcp_page; 296 297 /* 298 * forward declaration of internal utility routines 299 */ 300 static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected, 301 x86pte_t new); 302 303 /* 304 * The kernel address space exists in all non-HAT_COPIED HATs. To implement this 305 * the kernel reserves a fixed number of entries in the topmost level(s) of page 306 * tables. The values are setup during startup and then copied to every user hat 307 * created by hat_alloc(). This means that kernelbase must be: 308 * 309 * 4Meg aligned for 32 bit kernels 310 * 512Gig aligned for x86_64 64 bit kernel 311 * 312 * The hat_kernel_range_ts describe what needs to be copied from kernel hat 313 * to each user hat. 314 */ 315 typedef struct hat_kernel_range { 316 level_t hkr_level; 317 uintptr_t hkr_start_va; 318 uintptr_t hkr_end_va; /* zero means to end of memory */ 319 } hat_kernel_range_t; 320 #define NUM_KERNEL_RANGE 2 321 static hat_kernel_range_t kernel_ranges[NUM_KERNEL_RANGE]; 322 static int num_kernel_ranges; 323 324 uint_t use_boot_reserve = 1; /* cleared after early boot process */ 325 uint_t can_steal_post_boot = 0; /* set late in boot to enable stealing */ 326 327 /* 328 * enable_1gpg: controls 1g page support for user applications. 329 * By default, 1g pages are exported to user applications. enable_1gpg can 330 * be set to 0 to not export. 331 */ 332 int enable_1gpg = 1; 333 334 /* 335 * AMD shanghai processors provide better management of 1gb ptes in its tlb. 336 * By default, 1g page support will be disabled for pre-shanghai AMD 337 * processors that don't have optimal tlb support for the 1g page size. 338 * chk_optimal_1gtlb can be set to 0 to force 1g page support on sub-optimal 339 * processors. 340 */ 341 int chk_optimal_1gtlb = 1; 342 343 344 #ifdef DEBUG 345 uint_t map1gcnt; 346 #endif 347 348 349 /* 350 * A cpuset for all cpus. This is used for kernel address cross calls, since 351 * the kernel addresses apply to all cpus. 352 */ 353 cpuset_t khat_cpuset; 354 355 /* 356 * management stuff for hat structures 357 */ 358 kmutex_t hat_list_lock; 359 kcondvar_t hat_list_cv; 360 kmem_cache_t *hat_cache; 361 kmem_cache_t *hat_hash_cache; 362 kmem_cache_t *hat32_hash_cache; 363 364 /* 365 * Simple statistics 366 */ 367 struct hatstats hatstat; 368 369 /* 370 * Some earlier hypervisor versions do not emulate cmpxchg of PTEs 371 * correctly. For such hypervisors we must set PT_USER for kernel 372 * entries ourselves (normally the emulation would set PT_USER for 373 * kernel entries and PT_USER|PT_GLOBAL for user entries). pt_kern is 374 * thus set appropriately. Note that dboot/kbm is OK, as only the full 375 * HAT uses cmpxchg() and the other paths (hypercall etc.) were never 376 * incorrect. 
377 */ 378 int pt_kern; 379 380 #ifndef __xpv 381 extern pfn_t memseg_get_start(struct memseg *); 382 #endif 383 384 #define PP_GETRM(pp, rmmask) (pp->p_nrm & rmmask) 385 #define PP_ISMOD(pp) PP_GETRM(pp, P_MOD) 386 #define PP_ISREF(pp) PP_GETRM(pp, P_REF) 387 #define PP_ISRO(pp) PP_GETRM(pp, P_RO) 388 389 #define PP_SETRM(pp, rm) atomic_orb(&(pp->p_nrm), rm) 390 #define PP_SETMOD(pp) PP_SETRM(pp, P_MOD) 391 #define PP_SETREF(pp) PP_SETRM(pp, P_REF) 392 #define PP_SETRO(pp) PP_SETRM(pp, P_RO) 393 394 #define PP_CLRRM(pp, rm) atomic_andb(&(pp->p_nrm), ~(rm)) 395 #define PP_CLRMOD(pp) PP_CLRRM(pp, P_MOD) 396 #define PP_CLRREF(pp) PP_CLRRM(pp, P_REF) 397 #define PP_CLRRO(pp) PP_CLRRM(pp, P_RO) 398 #define PP_CLRALL(pp) PP_CLRRM(pp, P_MOD | P_REF | P_RO) 399 400 /* 401 * kmem cache constructor for struct hat 402 */ 403 /*ARGSUSED*/ 404 static int 405 hati_constructor(void *buf, void *handle, int kmflags) 406 { 407 hat_t *hat = buf; 408 409 mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL); 410 bzero(hat->hat_pages_mapped, 411 sizeof (pgcnt_t) * (mmu.max_page_level + 1)); 412 hat->hat_ism_pgcnt = 0; 413 hat->hat_stats = 0; 414 hat->hat_flags = 0; 415 CPUSET_ZERO(hat->hat_cpus); 416 hat->hat_htable = NULL; 417 hat->hat_ht_hash = NULL; 418 return (0); 419 } 420 421 /* 422 * Put it at the start of the global list of all hats (used by stealing) 423 * 424 * kas.a_hat is not in the list but is instead used to find the 425 * first and last items in the list. 426 * 427 * - kas.a_hat->hat_next points to the start of the user hats. 428 * The list ends where hat->hat_next == NULL 429 * 430 * - kas.a_hat->hat_prev points to the last of the user hats. 431 * The list begins where hat->hat_prev == NULL 432 */ 433 static void 434 hat_list_append(hat_t *hat) 435 { 436 mutex_enter(&hat_list_lock); 437 hat->hat_prev = NULL; 438 hat->hat_next = kas.a_hat->hat_next; 439 if (hat->hat_next) 440 hat->hat_next->hat_prev = hat; 441 else 442 kas.a_hat->hat_prev = hat; 443 kas.a_hat->hat_next = hat; 444 mutex_exit(&hat_list_lock); 445 } 446 447 /* 448 * Allocate a hat structure for as. We also create the top level 449 * htable and initialize it to contain the kernel hat entries. 450 */ 451 hat_t * 452 hat_alloc(struct as *as) 453 { 454 hat_t *hat; 455 htable_t *ht; /* top level htable */ 456 uint_t use_copied; 457 uint_t r; 458 hat_kernel_range_t *rp; 459 uintptr_t va; 460 uintptr_t eva; 461 uint_t start; 462 uint_t cnt; 463 htable_t *src; 464 boolean_t use_hat32_cache; 465 466 /* 467 * Once we start creating user process HATs we can enable 468 * the htable_steal() code. 469 */ 470 if (can_steal_post_boot == 0) 471 can_steal_post_boot = 1; 472 473 ASSERT(AS_WRITE_HELD(as)); 474 hat = kmem_cache_alloc(hat_cache, KM_SLEEP); 475 hat->hat_as = as; 476 mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL); 477 ASSERT(hat->hat_flags == 0); 478 479 #if defined(__xpv) 480 /* 481 * No PCP stuff on the hypervisor due to the 64-bit split top level 482 * page tables. On 32-bit it's not needed as the hypervisor takes 483 * care of copying the top level PTEs to a below 4Gig page. 484 */ 485 use_copied = 0; 486 use_hat32_cache = B_FALSE; 487 hat->hat_max_level = mmu.max_level; 488 hat->hat_num_copied = 0; 489 hat->hat_flags = 0; 490 #else /* __xpv */ 491 492 /* 493 * All processes use HAT_COPIED on the 64-bit kernel if KPTI is 494 * turned on. 
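 *
 * To summarize the cases handled just below:
 *
 *	32-bit process:           HAT_COPIED | HAT_COPIED_32, max_level32,
 *	                          num_copied_ents32, hat32 hash cache
 *	64-bit process, KPTI on:  HAT_COPIED, max_level, num_copied_ents
 *	64-bit process, KPTI off: no HAT_COPIED; a normal root pagetable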
495 */ 496 if (ttoproc(curthread)->p_model == DATAMODEL_ILP32) { 497 use_copied = 1; 498 hat->hat_max_level = mmu.max_level32; 499 hat->hat_num_copied = mmu.num_copied_ents32; 500 use_hat32_cache = B_TRUE; 501 hat->hat_flags |= HAT_COPIED_32; 502 HATSTAT_INC(hs_hat_copied32); 503 } else if (kpti_enable == 1) { 504 use_copied = 1; 505 hat->hat_max_level = mmu.max_level; 506 hat->hat_num_copied = mmu.num_copied_ents; 507 use_hat32_cache = B_FALSE; 508 HATSTAT_INC(hs_hat_copied64); 509 } else { 510 use_copied = 0; 511 use_hat32_cache = B_FALSE; 512 hat->hat_max_level = mmu.max_level; 513 hat->hat_num_copied = 0; 514 hat->hat_flags = 0; 515 HATSTAT_INC(hs_hat_normal64); 516 } 517 #endif /* __xpv */ 518 if (use_copied) { 519 hat->hat_flags |= HAT_COPIED; 520 bzero(hat->hat_copied_ptes, sizeof (hat->hat_copied_ptes)); 521 } 522 523 /* 524 * Allocate the htable hash. For 32-bit PCP processes we use the 525 * hat32_hash_cache. However, for 64-bit PCP processes we do not as the 526 * number of entries that they have to handle is closer to 527 * hat_hash_cache in count (though there will be more wastage when we 528 * have more DRAM in the system and thus push down the user address 529 * range). 530 */ 531 if (use_hat32_cache) { 532 hat->hat_num_hash = mmu.hat32_hash_cnt; 533 hat->hat_ht_hash = kmem_cache_alloc(hat32_hash_cache, KM_SLEEP); 534 } else { 535 hat->hat_num_hash = mmu.hash_cnt; 536 hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP); 537 } 538 bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *)); 539 540 /* 541 * Initialize Kernel HAT entries at the top of the top level page 542 * tables for the new hat. 543 */ 544 hat->hat_htable = NULL; 545 hat->hat_ht_cached = NULL; 546 XPV_DISALLOW_MIGRATE(); 547 ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL); 548 hat->hat_htable = ht; 549 550 if (hat->hat_flags & HAT_COPIED) 551 goto init_done; 552 553 for (r = 0; r < num_kernel_ranges; ++r) { 554 rp = &kernel_ranges[r]; 555 for (va = rp->hkr_start_va; va != rp->hkr_end_va; 556 va += cnt * LEVEL_SIZE(rp->hkr_level)) { 557 558 if (rp->hkr_level == TOP_LEVEL(hat)) 559 ht = hat->hat_htable; 560 else 561 ht = htable_create(hat, va, rp->hkr_level, 562 NULL); 563 564 start = htable_va2entry(va, ht); 565 cnt = HTABLE_NUM_PTES(ht) - start; 566 eva = va + 567 ((uintptr_t)cnt << LEVEL_SHIFT(rp->hkr_level)); 568 if (rp->hkr_end_va != 0 && 569 (eva > rp->hkr_end_va || eva == 0)) 570 cnt = htable_va2entry(rp->hkr_end_va, ht) - 571 start; 572 573 src = htable_lookup(kas.a_hat, va, rp->hkr_level); 574 ASSERT(src != NULL); 575 x86pte_copy(src, ht, start, cnt); 576 htable_release(src); 577 } 578 } 579 580 init_done: 581 582 #if defined(__xpv) 583 /* 584 * Pin top level page tables after initializing them 585 */ 586 xen_pin(hat->hat_htable->ht_pfn, mmu.max_level); 587 xen_pin(hat->hat_user_ptable, mmu.max_level); 588 #endif 589 XPV_ALLOW_MIGRATE(); 590 591 hat_list_append(hat); 592 593 return (hat); 594 } 595 596 #if !defined(__xpv) 597 /* 598 * Cons up a HAT for a CPU. This represents the user mappings. This will have 599 * various kernel pages punched into it manually. Importantly, this hat is 600 * ineligible for stealing. We really don't want to deal with this ever 601 * faulting and figuring out that this is happening, much like we don't with 602 * kas. 
603 */ 604 static hat_t * 605 hat_cpu_alloc(cpu_t *cpu) 606 { 607 hat_t *hat; 608 htable_t *ht; 609 610 hat = kmem_cache_alloc(hat_cache, KM_SLEEP); 611 hat->hat_as = NULL; 612 mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL); 613 hat->hat_max_level = mmu.max_level; 614 hat->hat_num_copied = 0; 615 hat->hat_flags = HAT_PCP; 616 617 hat->hat_num_hash = mmu.hash_cnt; 618 hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP); 619 bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *)); 620 621 hat->hat_next = hat->hat_prev = NULL; 622 623 /* 624 * Because this HAT will only ever be used by the current CPU, we'll go 625 * ahead and set the CPUSET up to only point to the CPU in question. 626 */ 627 CPUSET_ADD(hat->hat_cpus, cpu->cpu_id); 628 629 hat->hat_htable = NULL; 630 hat->hat_ht_cached = NULL; 631 ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL); 632 hat->hat_htable = ht; 633 634 hat_list_append(hat); 635 636 return (hat); 637 } 638 #endif /* !__xpv */ 639 640 /* 641 * process has finished executing but as has not been cleaned up yet. 642 */ 643 /*ARGSUSED*/ 644 void 645 hat_free_start(hat_t *hat) 646 { 647 ASSERT(AS_WRITE_HELD(hat->hat_as)); 648 649 /* 650 * If the hat is currently a stealing victim, wait for the stealing 651 * to finish. Once we mark it as HAT_FREEING, htable_steal() 652 * won't look at its pagetables anymore. 653 */ 654 mutex_enter(&hat_list_lock); 655 while (hat->hat_flags & HAT_VICTIM) 656 cv_wait(&hat_list_cv, &hat_list_lock); 657 hat->hat_flags |= HAT_FREEING; 658 mutex_exit(&hat_list_lock); 659 } 660 661 /* 662 * An address space is being destroyed, so we destroy the associated hat. 663 */ 664 void 665 hat_free_end(hat_t *hat) 666 { 667 kmem_cache_t *cache; 668 669 ASSERT(hat->hat_flags & HAT_FREEING); 670 671 /* 672 * must not be running on the given hat 673 */ 674 ASSERT(CPU->cpu_current_hat != hat); 675 676 /* 677 * Remove it from the list of HATs 678 */ 679 mutex_enter(&hat_list_lock); 680 if (hat->hat_prev) 681 hat->hat_prev->hat_next = hat->hat_next; 682 else 683 kas.a_hat->hat_next = hat->hat_next; 684 if (hat->hat_next) 685 hat->hat_next->hat_prev = hat->hat_prev; 686 else 687 kas.a_hat->hat_prev = hat->hat_prev; 688 mutex_exit(&hat_list_lock); 689 hat->hat_next = hat->hat_prev = NULL; 690 691 #if defined(__xpv) 692 /* 693 * On the hypervisor, unpin top level page table(s) 694 */ 695 VERIFY3U(hat->hat_flags & HAT_PCP, ==, 0); 696 xen_unpin(hat->hat_htable->ht_pfn); 697 xen_unpin(hat->hat_user_ptable); 698 #endif 699 700 /* 701 * Make a pass through the htables freeing them all up. 702 */ 703 htable_purge_hat(hat); 704 705 /* 706 * Decide which kmem cache the hash table came from, then free it. 707 */ 708 if (hat->hat_flags & HAT_COPIED) { 709 if (hat->hat_flags & HAT_COPIED_32) { 710 cache = hat32_hash_cache; 711 } else { 712 cache = hat_hash_cache; 713 } 714 } else { 715 cache = hat_hash_cache; 716 } 717 kmem_cache_free(cache, hat->hat_ht_hash); 718 hat->hat_ht_hash = NULL; 719 720 hat->hat_flags = 0; 721 hat->hat_max_level = 0; 722 hat->hat_num_copied = 0; 723 kmem_cache_free(hat_cache, hat); 724 } 725 726 /* 727 * round kernelbase down to a supported value to use for _userlimit 728 * 729 * userlimit must be aligned down to an entry in the top level htable. 730 * The one exception is for 32 bit HAT's running PAE. 
731 */ 732 uintptr_t 733 hat_kernelbase(uintptr_t va) 734 { 735 if (IN_VA_HOLE(va)) 736 panic("_userlimit %p will fall in VA hole\n", (void *)va); 737 return (va); 738 } 739 740 /* 741 * 742 */ 743 static void 744 set_max_page_level() 745 { 746 level_t lvl; 747 748 if (!kbm_largepage_support) { 749 lvl = 0; 750 } else { 751 if (is_x86_feature(x86_featureset, X86FSET_1GPG)) { 752 lvl = 2; 753 if (chk_optimal_1gtlb && 754 cpuid_opteron_erratum(CPU, 6671130)) { 755 lvl = 1; 756 } 757 if (plat_mnode_xcheck(LEVEL_SIZE(2) >> 758 LEVEL_SHIFT(0))) { 759 lvl = 1; 760 } 761 } else { 762 lvl = 1; 763 } 764 } 765 mmu.max_page_level = lvl; 766 767 if ((lvl == 2) && (enable_1gpg == 0)) 768 mmu.umax_page_level = 1; 769 else 770 mmu.umax_page_level = lvl; 771 } 772 773 /* 774 * Determine the number of slots that are in used in the top-most level page 775 * table for user memory. This is based on _userlimit. In effect this is similar 776 * to htable_va2entry, but without the convenience of having an htable. 777 */ 778 void 779 mmu_calc_user_slots(void) 780 { 781 uint_t ent, nptes; 782 uintptr_t shift; 783 784 nptes = mmu.top_level_count; 785 shift = _userlimit >> mmu.level_shift[mmu.max_level]; 786 ent = shift & (nptes - 1); 787 788 /* 789 * Ent tells us the slot that the page for _userlimit would fit in. We 790 * need to add one to this to cover the total number of entries. 791 */ 792 mmu.top_level_uslots = ent + 1; 793 794 /* 795 * When running 32-bit compatability processes on a 64-bit kernel, we 796 * will only need to use one slot. 797 */ 798 mmu.top_level_uslots32 = 1; 799 800 /* 801 * Record the number of PCP page table entries that we'll need to copy 802 * around. For 64-bit processes this is the number of user slots. For 803 * 32-bit proceses, this is 4 1 GiB pages. 804 */ 805 mmu.num_copied_ents = mmu.top_level_uslots; 806 mmu.num_copied_ents32 = 4; 807 } 808 809 /* 810 * Initialize hat data structures based on processor MMU information. 811 */ 812 void 813 mmu_init(void) 814 { 815 uint_t max_htables; 816 uint_t pa_bits; 817 uint_t va_bits; 818 int i; 819 820 /* 821 * If CPU enabled the page table global bit, use it for the kernel 822 * This is bit 7 in CR4 (PGE - Page Global Enable). 823 */ 824 if (is_x86_feature(x86_featureset, X86FSET_PGE) && 825 (getcr4() & CR4_PGE) != 0) 826 mmu.pt_global = PT_GLOBAL; 827 828 #if !defined(__xpv) 829 /* 830 * The 64-bit x86 kernel has split user/kernel page tables. As such we 831 * cannot have the global bit set. The simplest way for us to deal with 832 * this is to just say that pt_global is zero, so the global bit isn't 833 * present. 834 */ 835 if (kpti_enable == 1) 836 mmu.pt_global = 0; 837 #endif 838 839 /* 840 * Detect NX and PAE usage. 841 */ 842 mmu.pae_hat = kbm_pae_support; 843 if (kbm_nx_support) 844 mmu.pt_nx = PT_NX; 845 else 846 mmu.pt_nx = 0; 847 848 /* 849 * Use CPU info to set various MMU parameters 850 */ 851 cpuid_get_addrsize(CPU, &pa_bits, &va_bits); 852 853 if (va_bits < sizeof (void *) * NBBY) { 854 mmu.hole_start = (1ul << (va_bits - 1)); 855 mmu.hole_end = 0ul - mmu.hole_start - 1; 856 } else { 857 mmu.hole_end = 0; 858 mmu.hole_start = mmu.hole_end - 1; 859 } 860 #if defined(OPTERON_ERRATUM_121) 861 /* 862 * If erratum 121 has already been detected at this time, hole_start 863 * contains the value to be subtracted from mmu.hole_start. 
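 *
 * For reference, with va_bits == 48 the unadjusted values computed just
 * above work out to:
 *
 *	mmu.hole_start = 1ull << 47          = 0x0000800000000000
 *	mmu.hole_end   = -mmu.hole_start - 1 = 0xffff7fffffffffff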
864 */ 865 ASSERT(hole_start == 0 || opteron_erratum_121 != 0); 866 hole_start = mmu.hole_start - hole_start; 867 #else 868 hole_start = mmu.hole_start; 869 #endif 870 hole_end = mmu.hole_end; 871 872 mmu.highest_pfn = mmu_btop((1ull << pa_bits) - 1); 873 if (mmu.pae_hat == 0 && pa_bits > 32) 874 mmu.highest_pfn = PFN_4G - 1; 875 876 if (mmu.pae_hat) { 877 mmu.pte_size = 8; /* 8 byte PTEs */ 878 mmu.pte_size_shift = 3; 879 } else { 880 mmu.pte_size = 4; /* 4 byte PTEs */ 881 mmu.pte_size_shift = 2; 882 } 883 884 if (mmu.pae_hat && !is_x86_feature(x86_featureset, X86FSET_PAE)) 885 panic("Processor does not support PAE"); 886 887 if (!is_x86_feature(x86_featureset, X86FSET_CX8)) 888 panic("Processor does not support cmpxchg8b instruction"); 889 890 891 mmu.num_level = 4; 892 mmu.max_level = 3; 893 mmu.ptes_per_table = 512; 894 mmu.top_level_count = 512; 895 896 /* 897 * 32-bit processes only use 1 GB ptes. 898 */ 899 mmu.max_level32 = 2; 900 901 mmu.level_shift[0] = 12; 902 mmu.level_shift[1] = 21; 903 mmu.level_shift[2] = 30; 904 mmu.level_shift[3] = 39; 905 906 907 for (i = 0; i < mmu.num_level; ++i) { 908 mmu.level_size[i] = 1UL << mmu.level_shift[i]; 909 mmu.level_offset[i] = mmu.level_size[i] - 1; 910 mmu.level_mask[i] = ~mmu.level_offset[i]; 911 } 912 913 set_max_page_level(); 914 mmu_calc_user_slots(); 915 916 mmu_page_sizes = mmu.max_page_level + 1; 917 mmu_exported_page_sizes = mmu.umax_page_level + 1; 918 919 /* restrict legacy applications from using pagesizes 1g and above */ 920 mmu_legacy_page_sizes = 921 (mmu_exported_page_sizes > 2) ? 2 : mmu_exported_page_sizes; 922 923 924 for (i = 0; i <= mmu.max_page_level; ++i) { 925 mmu.pte_bits[i] = PT_VALID | pt_kern; 926 if (i > 0) 927 mmu.pte_bits[i] |= PT_PAGESIZE; 928 } 929 930 /* 931 * NOTE Legacy 32 bit PAE mode only has the P_VALID bit at top level. 932 */ 933 for (i = 1; i < mmu.num_level; ++i) 934 mmu.ptp_bits[i] = PT_PTPBITS; 935 936 /* 937 * Compute how many hash table entries to have per process for htables. 938 * We start with 1 page's worth of entries. 939 * 940 * If physical memory is small, reduce the amount need to cover it. 941 */ 942 max_htables = physmax / mmu.ptes_per_table; 943 mmu.hash_cnt = MMU_PAGESIZE / sizeof (htable_t *); 944 while (mmu.hash_cnt > 16 && mmu.hash_cnt >= max_htables) 945 mmu.hash_cnt >>= 1; 946 mmu.hat32_hash_cnt = mmu.hash_cnt; 947 948 /* 949 * If running in 64 bits and physical memory is large, 950 * increase the size of the cache to cover all of memory for 951 * a 64 bit process. 
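 *
 * A worked example (assuming 8-byte pointers and 4K pages, so the initial
 * hash_cnt above is 4096 / 8 = 512): with roughly 64 GiB of physical
 * memory, physmax is about 16M pages and max_htables = 16M / 512 = 32768.
 * The loop below keeps doubling hash_cnt while hash_cnt * HASH_MAX_LENGTH
 * is still below 32768, leaving 8192 hash buckets.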
952 */ 953 #define HASH_MAX_LENGTH 4 954 while (mmu.hash_cnt * HASH_MAX_LENGTH < max_htables) 955 mmu.hash_cnt <<= 1; 956 } 957 958 959 /* 960 * initialize hat data structures 961 */ 962 void 963 hat_init() 964 { 965 cv_init(&hat_list_cv, NULL, CV_DEFAULT, NULL); 966 967 /* 968 * initialize kmem caches 969 */ 970 htable_init(); 971 hment_init(); 972 973 hat_cache = kmem_cache_create("hat_t", 974 sizeof (hat_t), 0, hati_constructor, NULL, NULL, 975 NULL, 0, 0); 976 977 hat_hash_cache = kmem_cache_create("HatHash", 978 mmu.hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL, 979 NULL, 0, 0); 980 981 /* 982 * 32-bit PCP hats can use a smaller hash table size on large memory 983 * machines 984 */ 985 if (mmu.hash_cnt == mmu.hat32_hash_cnt) { 986 hat32_hash_cache = hat_hash_cache; 987 } else { 988 hat32_hash_cache = kmem_cache_create("Hat32Hash", 989 mmu.hat32_hash_cnt * sizeof (htable_t *), 0, NULL, NULL, 990 NULL, NULL, 0, 0); 991 } 992 993 /* 994 * Set up the kernel's hat 995 */ 996 AS_LOCK_ENTER(&kas, RW_WRITER); 997 kas.a_hat = kmem_cache_alloc(hat_cache, KM_NOSLEEP); 998 mutex_init(&kas.a_hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL); 999 kas.a_hat->hat_as = &kas; 1000 kas.a_hat->hat_flags = 0; 1001 AS_LOCK_EXIT(&kas); 1002 1003 CPUSET_ZERO(khat_cpuset); 1004 CPUSET_ADD(khat_cpuset, CPU->cpu_id); 1005 1006 /* 1007 * The kernel HAT doesn't use PCP regardless of architectures. 1008 */ 1009 ASSERT3U(mmu.max_level, >, 0); 1010 kas.a_hat->hat_max_level = mmu.max_level; 1011 kas.a_hat->hat_num_copied = 0; 1012 1013 /* 1014 * The kernel hat's next pointer serves as the head of the hat list . 1015 * The kernel hat's prev pointer tracks the last hat on the list for 1016 * htable_steal() to use. 1017 */ 1018 kas.a_hat->hat_next = NULL; 1019 kas.a_hat->hat_prev = NULL; 1020 1021 /* 1022 * Allocate an htable hash bucket for the kernel 1023 * XX64 - tune for 64 bit procs 1024 */ 1025 kas.a_hat->hat_num_hash = mmu.hash_cnt; 1026 kas.a_hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_NOSLEEP); 1027 bzero(kas.a_hat->hat_ht_hash, mmu.hash_cnt * sizeof (htable_t *)); 1028 1029 /* 1030 * zero out the top level and cached htable pointers 1031 */ 1032 kas.a_hat->hat_ht_cached = NULL; 1033 kas.a_hat->hat_htable = NULL; 1034 1035 /* 1036 * Pre-allocate hrm_hashtab before enabling the collection of 1037 * refmod statistics. Allocating on the fly would mean us 1038 * running the risk of suffering recursive mutex enters or 1039 * deadlocks. 1040 */ 1041 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *), 1042 KM_SLEEP); 1043 } 1044 1045 1046 extern void kpti_tramp_start(); 1047 extern void kpti_tramp_end(); 1048 1049 extern void kdi_isr_start(); 1050 extern void kdi_isr_end(); 1051 1052 extern gate_desc_t kdi_idt[NIDT]; 1053 1054 /* 1055 * Prepare per-CPU pagetables for all processes on the 64 bit kernel. 1056 * 1057 * Each CPU has a set of 2 pagetables that are reused for any 32 bit 1058 * process it runs. They are the top level pagetable, hci_pcp_l3ptes, and 1059 * the next to top level table for the bottom 512 Gig, hci_pcp_l2ptes. 
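 *
 * An illustrative (not authoritative) view of hci_pcp_l3ptes once a 32-bit
 * process is resident, based on hat_pcp_update() below:
 *
 *	hci_pcp_l3ptes[0]             = MAKEPTP(hci_pcp_l2pfn, 2)
 *	hci_pcp_l3ptes[1 .. user max] = 0 (unused user slots)
 *	hci_pcp_l3ptes[kernel slots]  = kernel entries copied from pcp_page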
1060 */ 1061 /*ARGSUSED*/ 1062 static void 1063 hat_pcp_setup(struct cpu *cpu) 1064 { 1065 #if !defined(__xpv) 1066 struct hat_cpu_info *hci = cpu->cpu_hat_info; 1067 uintptr_t va; 1068 size_t len; 1069 1070 /* 1071 * allocate the level==2 page table for the bottom most 1072 * 512Gig of address space (this is where 32 bit apps live) 1073 */ 1074 ASSERT(hci != NULL); 1075 hci->hci_pcp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP); 1076 1077 /* 1078 * Allocate a top level pagetable and copy the kernel's 1079 * entries into it. Then link in hci_pcp_l2ptes in the 1st entry. 1080 */ 1081 hci->hci_pcp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP); 1082 hci->hci_pcp_l3pfn = 1083 hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l3ptes); 1084 ASSERT3U(hci->hci_pcp_l3pfn, !=, PFN_INVALID); 1085 bcopy(pcp_page, hci->hci_pcp_l3ptes, MMU_PAGESIZE); 1086 1087 hci->hci_pcp_l2pfn = 1088 hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l2ptes); 1089 ASSERT3U(hci->hci_pcp_l2pfn, !=, PFN_INVALID); 1090 1091 /* 1092 * Now go through and allocate the user version of these structures. 1093 * Unlike with the kernel version, we allocate a hat to represent the 1094 * top-level page table as that will make it much simpler when we need 1095 * to patch through user entries. 1096 */ 1097 hci->hci_user_hat = hat_cpu_alloc(cpu); 1098 hci->hci_user_l3pfn = hci->hci_user_hat->hat_htable->ht_pfn; 1099 ASSERT3U(hci->hci_user_l3pfn, !=, PFN_INVALID); 1100 hci->hci_user_l3ptes = 1101 (x86pte_t *)hat_kpm_mapin_pfn(hci->hci_user_l3pfn); 1102 1103 /* Skip the rest of this if KPTI is switched off at boot. */ 1104 if (kpti_enable != 1) 1105 return; 1106 1107 /* 1108 * OK, now that we have this we need to go through and punch the normal 1109 * holes in the CPU's hat for this. At this point we'll punch in the 1110 * following: 1111 * 1112 * o GDT 1113 * o IDT 1114 * o LDT 1115 * o Trampoline Code 1116 * o machcpu KPTI page 1117 * o kmdb ISR code page (just trampolines) 1118 * 1119 * If this is cpu0, then we also can initialize the following because 1120 * they'll have already been allocated. 1121 * 1122 * o TSS for CPU 0 1123 * o Double Fault for CPU 0 1124 * 1125 * The following items have yet to be allocated and have not been 1126 * punched in yet. They will be punched in later: 1127 * 1128 * o TSS (mach_cpucontext_alloc_tables()) 1129 * o Double Fault Stack (mach_cpucontext_alloc_tables()) 1130 */ 1131 hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_gdt, PROT_READ); 1132 hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_idt, PROT_READ); 1133 1134 /* 1135 * As the KDI IDT is only active during kmdb sessions (including single 1136 * stepping), typically we don't actually need this punched in (we 1137 * consider the routines that switch to the user cr3 to be toxic). But 1138 * if we ever accidentally end up on the user cr3 while on this IDT, 1139 * we'd prefer not to triple fault. 
1140 */ 1141 hati_cpu_punchin(cpu, (uintptr_t)&kdi_idt, PROT_READ); 1142 1143 CTASSERT(((uintptr_t)&kpti_tramp_start % MMU_PAGESIZE) == 0); 1144 CTASSERT(((uintptr_t)&kpti_tramp_end % MMU_PAGESIZE) == 0); 1145 for (va = (uintptr_t)&kpti_tramp_start; 1146 va < (uintptr_t)&kpti_tramp_end; va += MMU_PAGESIZE) { 1147 hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC); 1148 } 1149 1150 VERIFY3U(((uintptr_t)cpu->cpu_m.mcpu_ldt) % MMU_PAGESIZE, ==, 0); 1151 for (va = (uintptr_t)cpu->cpu_m.mcpu_ldt, len = LDT_CPU_SIZE; 1152 len >= MMU_PAGESIZE; va += MMU_PAGESIZE, len -= MMU_PAGESIZE) { 1153 hati_cpu_punchin(cpu, va, PROT_READ); 1154 } 1155 1156 /* mcpu_pad2 is the start of the page containing the kpti_frames. */ 1157 hati_cpu_punchin(cpu, (uintptr_t)&cpu->cpu_m.mcpu_pad2[0], 1158 PROT_READ | PROT_WRITE); 1159 1160 if (cpu == &cpus[0]) { 1161 /* 1162 * CPU0 uses a global for its double fault stack to deal with 1163 * the chicken and egg problem. We need to punch it into its 1164 * user HAT. 1165 */ 1166 extern char dblfault_stack0[]; 1167 1168 hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_m.mcpu_tss, 1169 PROT_READ); 1170 1171 for (va = (uintptr_t)dblfault_stack0, 1172 len = DEFAULTSTKSZ; len >= MMU_PAGESIZE; 1173 va += MMU_PAGESIZE, len -= MMU_PAGESIZE) { 1174 hati_cpu_punchin(cpu, va, PROT_READ | PROT_WRITE); 1175 } 1176 } 1177 1178 CTASSERT(((uintptr_t)&kdi_isr_start % MMU_PAGESIZE) == 0); 1179 CTASSERT(((uintptr_t)&kdi_isr_end % MMU_PAGESIZE) == 0); 1180 for (va = (uintptr_t)&kdi_isr_start; 1181 va < (uintptr_t)&kdi_isr_end; va += MMU_PAGESIZE) { 1182 hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC); 1183 } 1184 #endif /* !__xpv */ 1185 } 1186 1187 /*ARGSUSED*/ 1188 static void 1189 hat_pcp_teardown(cpu_t *cpu) 1190 { 1191 #if !defined(__xpv) 1192 struct hat_cpu_info *hci; 1193 1194 if ((hci = cpu->cpu_hat_info) == NULL) 1195 return; 1196 if (hci->hci_pcp_l2ptes != NULL) 1197 kmem_free(hci->hci_pcp_l2ptes, MMU_PAGESIZE); 1198 if (hci->hci_pcp_l3ptes != NULL) 1199 kmem_free(hci->hci_pcp_l3ptes, MMU_PAGESIZE); 1200 if (hci->hci_user_hat != NULL) { 1201 hat_free_start(hci->hci_user_hat); 1202 hat_free_end(hci->hci_user_hat); 1203 } 1204 #endif 1205 } 1206 1207 #define NEXT_HKR(r, l, s, e) { \ 1208 kernel_ranges[r].hkr_level = l; \ 1209 kernel_ranges[r].hkr_start_va = s; \ 1210 kernel_ranges[r].hkr_end_va = e; \ 1211 ++r; \ 1212 } 1213 1214 /* 1215 * Finish filling in the kernel hat. 1216 * Pre fill in all top level kernel page table entries for the kernel's 1217 * part of the address range. From this point on we can't use any new 1218 * kernel large pages if they need PTE's at max_level 1219 * 1220 * create the kmap mappings. 1221 */ 1222 void 1223 hat_init_finish(void) 1224 { 1225 size_t size; 1226 uint_t r = 0; 1227 uintptr_t va; 1228 hat_kernel_range_t *rp; 1229 1230 1231 /* 1232 * We are now effectively running on the kernel hat. 1233 * Clearing use_boot_reserve shuts off using the pre-allocated boot 1234 * reserve for all HAT allocations. From here on, the reserves are 1235 * only used when avoiding recursion in kmem_alloc(). 1236 */ 1237 use_boot_reserve = 0; 1238 htable_adjust_reserve(); 1239 1240 /* 1241 * User HATs are initialized with copies of all kernel mappings in 1242 * higher level page tables. Ensure that those entries exist. 
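 *
 * On a non-xpv 64-bit kernel the NEXT_HKR() invocations below describe a
 * single range, roughly:
 *
 *	{ hkr_level = 3, hkr_start_va = kernelbase, hkr_end_va = 0 }
 *
 * where an hkr_end_va of zero means "to end of memory" (see the
 * hat_kernel_range_t definition above).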
1243 */ 1244 1245 NEXT_HKR(r, 3, kernelbase, 0); 1246 #if defined(__xpv) 1247 NEXT_HKR(r, 3, HYPERVISOR_VIRT_START, HYPERVISOR_VIRT_END); 1248 #endif 1249 1250 num_kernel_ranges = r; 1251 1252 /* 1253 * Create all the kernel pagetables that will have entries 1254 * shared to user HATs. 1255 */ 1256 for (r = 0; r < num_kernel_ranges; ++r) { 1257 rp = &kernel_ranges[r]; 1258 for (va = rp->hkr_start_va; va != rp->hkr_end_va; 1259 va += LEVEL_SIZE(rp->hkr_level)) { 1260 htable_t *ht; 1261 1262 if (IN_HYPERVISOR_VA(va)) 1263 continue; 1264 1265 /* can/must skip if a page mapping already exists */ 1266 if (rp->hkr_level <= mmu.max_page_level && 1267 (ht = htable_getpage(kas.a_hat, va, NULL)) != 1268 NULL) { 1269 htable_release(ht); 1270 continue; 1271 } 1272 1273 (void) htable_create(kas.a_hat, va, rp->hkr_level - 1, 1274 NULL); 1275 } 1276 } 1277 1278 /* 1279 * 32 bit PAE metal kernels use only 4 of the 512 entries in the 1280 * page holding the top level pagetable. We use the remainder for 1281 * the "per CPU" page tables for PCP processes. 1282 * Map the top level kernel pagetable into the kernel to make 1283 * it easy to use bcopy access these tables. 1284 * 1285 * PAE is required for the 64-bit kernel which uses this as well to 1286 * perform the per-CPU pagetables. See the big theory statement. 1287 */ 1288 if (mmu.pae_hat) { 1289 pcp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP); 1290 hat_devload(kas.a_hat, (caddr_t)pcp_page, MMU_PAGESIZE, 1291 kas.a_hat->hat_htable->ht_pfn, 1292 #if !defined(__xpv) 1293 PROT_WRITE | 1294 #endif 1295 PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK, 1296 HAT_LOAD | HAT_LOAD_NOCONSIST); 1297 } 1298 hat_pcp_setup(CPU); 1299 1300 /* 1301 * Create kmap (cached mappings of kernel PTEs) 1302 * for 32 bit we map from segmap_start .. ekernelheap 1303 * for 64 bit we map from segmap_start .. segmap_start + segmapsize; 1304 */ 1305 size = segmapsize; 1306 hat_kmap_init((uintptr_t)segmap_start, size); 1307 1308 #if !defined(__xpv) 1309 ASSERT3U(kas.a_hat->hat_htable->ht_pfn, !=, PFN_INVALID); 1310 ASSERT3U(kpti_safe_cr3, ==, 1311 MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_KERNEL)); 1312 #endif 1313 } 1314 1315 /* 1316 * Update the PCP data on the CPU cpu to the one on the hat. If this is a 32-bit 1317 * process, then we must update the L2 pages and then the L3. If this is a 1318 * 64-bit process then we must update the L3 entries. 1319 */ 1320 static void 1321 hat_pcp_update(cpu_t *cpu, const hat_t *hat) 1322 { 1323 ASSERT3U(hat->hat_flags & HAT_COPIED, !=, 0); 1324 1325 if ((hat->hat_flags & HAT_COPIED_32) != 0) { 1326 const x86pte_t *l2src; 1327 x86pte_t *l2dst, *l3ptes, *l3uptes; 1328 /* 1329 * This is a 32-bit process. To set this up, we need to do the 1330 * following: 1331 * 1332 * - Copy the 4 L2 PTEs into the dedicated L2 table 1333 * - Zero the user L3 PTEs in the user and kernel page table 1334 * - Set the first L3 PTE to point to the CPU L2 table 1335 */ 1336 l2src = hat->hat_copied_ptes; 1337 l2dst = cpu->cpu_hat_info->hci_pcp_l2ptes; 1338 l3ptes = cpu->cpu_hat_info->hci_pcp_l3ptes; 1339 l3uptes = cpu->cpu_hat_info->hci_user_l3ptes; 1340 1341 l2dst[0] = l2src[0]; 1342 l2dst[1] = l2src[1]; 1343 l2dst[2] = l2src[2]; 1344 l2dst[3] = l2src[3]; 1345 1346 /* 1347 * Make sure to use the mmu to get the number of slots. The 1348 * number of PLP entries that this has will always be less as 1349 * it's a 32-bit process. 
1350 */ 1351 bzero(l3ptes, sizeof (x86pte_t) * mmu.top_level_uslots); 1352 l3ptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2); 1353 bzero(l3uptes, sizeof (x86pte_t) * mmu.top_level_uslots); 1354 l3uptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2); 1355 } else { 1356 /* 1357 * This is a 64-bit process. To set this up, we need to do the 1358 * following: 1359 * 1360 * - Zero the 4 L2 PTEs in the CPU structure for safety 1361 * - Copy over the new user L3 PTEs into the kernel page table 1362 * - Copy over the new user L3 PTEs into the user page table 1363 */ 1364 ASSERT3S(kpti_enable, ==, 1); 1365 bzero(cpu->cpu_hat_info->hci_pcp_l2ptes, sizeof (x86pte_t) * 4); 1366 bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_pcp_l3ptes, 1367 sizeof (x86pte_t) * mmu.top_level_uslots); 1368 bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_user_l3ptes, 1369 sizeof (x86pte_t) * mmu.top_level_uslots); 1370 } 1371 } 1372 1373 static void 1374 reset_kpti(struct kpti_frame *fr, uint64_t kcr3, uint64_t ucr3) 1375 { 1376 ASSERT3U(fr->kf_tr_flag, ==, 0); 1377 #if DEBUG 1378 if (fr->kf_kernel_cr3 != 0) { 1379 ASSERT3U(fr->kf_lower_redzone, ==, 0xdeadbeefdeadbeef); 1380 ASSERT3U(fr->kf_middle_redzone, ==, 0xdeadbeefdeadbeef); 1381 ASSERT3U(fr->kf_upper_redzone, ==, 0xdeadbeefdeadbeef); 1382 } 1383 #endif 1384 1385 bzero(fr, offsetof(struct kpti_frame, kf_kernel_cr3)); 1386 bzero(&fr->kf_unused, sizeof (struct kpti_frame) - 1387 offsetof(struct kpti_frame, kf_unused)); 1388 1389 fr->kf_kernel_cr3 = kcr3; 1390 fr->kf_user_cr3 = ucr3; 1391 fr->kf_tr_ret_rsp = (uintptr_t)&fr->kf_tr_rsp; 1392 1393 fr->kf_lower_redzone = 0xdeadbeefdeadbeef; 1394 fr->kf_middle_redzone = 0xdeadbeefdeadbeef; 1395 fr->kf_upper_redzone = 0xdeadbeefdeadbeef; 1396 } 1397 1398 #ifdef __xpv 1399 static void 1400 hat_switch_xen(hat_t *hat) 1401 { 1402 struct mmuext_op t[2]; 1403 uint_t retcnt; 1404 uint_t opcnt = 1; 1405 uint64_t newcr3; 1406 1407 ASSERT(!(hat->hat_flags & HAT_COPIED)); 1408 ASSERT(!(getcr4() & CR4_PCIDE)); 1409 1410 newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn, PCID_NONE); 1411 1412 t[0].cmd = MMUEXT_NEW_BASEPTR; 1413 t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3)); 1414 1415 /* 1416 * There's an interesting problem here, as to what to actually specify 1417 * when switching to the kernel hat. For now we'll reuse the kernel hat 1418 * again. 1419 */ 1420 t[1].cmd = MMUEXT_NEW_USER_BASEPTR; 1421 if (hat == kas.a_hat) 1422 t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3)); 1423 else 1424 t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable); 1425 ++opcnt; 1426 1427 if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0) 1428 panic("HYPERVISOR_mmu_update() failed"); 1429 ASSERT(retcnt == opcnt); 1430 } 1431 #endif /* __xpv */ 1432 1433 /* 1434 * Switch to a new active hat, maintaining bit masks to track active CPUs. 1435 * 1436 * With KPTI, all our HATs except kas should be using PCP. Thus, to switch 1437 * HATs, we need to copy over the new user PTEs, then set our trampoline context 1438 * as appropriate. 1439 * 1440 * If lacking PCID, we then load our new cr3, which will flush the TLB: we may 1441 * have established userspace TLB entries via kernel accesses, and these are no 1442 * longer valid. We have to do this eagerly, as we just deleted this CPU from 1443 * ->hat_cpus, so would no longer see any TLB shootdowns. 1444 * 1445 * With PCID enabled, things get a little more complicated. 
We would like to 1446 * keep TLB context around when entering and exiting the kernel, and to do this, 1447 * we partition the TLB into two different spaces: 1448 * 1449 * PCID_KERNEL is defined as zero, and used both by kas and all other address 1450 * spaces while in the kernel (post-trampoline). 1451 * 1452 * PCID_USER is used while in userspace. Therefore, userspace cannot use any 1453 * lingering PCID_KERNEL entries to kernel addresses it should not be able to 1454 * read. 1455 * 1456 * The trampoline cr3s are set not to invalidate on a mov to %cr3. This means if 1457 * we take a journey through the kernel without switching HATs, we have some 1458 * hope of keeping our TLB state around. 1459 * 1460 * On a hat switch, rather than deal with any necessary flushes on the way out 1461 * of the trampolines, we do them upfront here. If we're switching from kas, we 1462 * shouldn't need any invalidation. 1463 * 1464 * Otherwise, we can have stale userspace entries for both PCID_USER (what 1465 * happened before we move onto the kcr3) and PCID_KERNEL (any subsequent 1466 * userspace accesses such as ddi_copyin()). Since setcr3() won't do these 1467 * flushes on its own in PCIDE, we'll do a non-flushing load and then 1468 * invalidate everything. 1469 */ 1470 void 1471 hat_switch(hat_t *hat) 1472 { 1473 cpu_t *cpu = CPU; 1474 hat_t *old = cpu->cpu_current_hat; 1475 1476 /* 1477 * set up this information first, so we don't miss any cross calls 1478 */ 1479 if (old != NULL) { 1480 if (old == hat) 1481 return; 1482 if (old != kas.a_hat) 1483 CPUSET_ATOMIC_DEL(old->hat_cpus, cpu->cpu_id); 1484 } 1485 1486 /* 1487 * Add this CPU to the active set for this HAT. 1488 */ 1489 if (hat != kas.a_hat) { 1490 CPUSET_ATOMIC_ADD(hat->hat_cpus, cpu->cpu_id); 1491 } 1492 cpu->cpu_current_hat = hat; 1493 1494 #if defined(__xpv) 1495 hat_switch_xen(hat); 1496 #else 1497 struct hat_cpu_info *info = cpu->cpu_m.mcpu_hat_info; 1498 uint64_t pcide = getcr4() & CR4_PCIDE; 1499 uint64_t kcr3, ucr3; 1500 pfn_t tl_kpfn; 1501 ulong_t flag; 1502 1503 EQUIV(kpti_enable, !mmu.pt_global); 1504 1505 if (hat->hat_flags & HAT_COPIED) { 1506 hat_pcp_update(cpu, hat); 1507 tl_kpfn = info->hci_pcp_l3pfn; 1508 } else { 1509 IMPLY(kpti_enable, hat == kas.a_hat); 1510 tl_kpfn = hat->hat_htable->ht_pfn; 1511 } 1512 1513 if (pcide) { 1514 ASSERT(kpti_enable); 1515 1516 kcr3 = MAKECR3(tl_kpfn, PCID_KERNEL) | CR3_NOINVL_BIT; 1517 ucr3 = MAKECR3(info->hci_user_l3pfn, PCID_USER) | 1518 CR3_NOINVL_BIT; 1519 1520 setcr3(kcr3); 1521 if (old != kas.a_hat) 1522 mmu_flush_tlb(FLUSH_TLB_ALL, NULL); 1523 } else { 1524 kcr3 = MAKECR3(tl_kpfn, PCID_NONE); 1525 ucr3 = kpti_enable ? 1526 MAKECR3(info->hci_user_l3pfn, PCID_NONE) : 1527 0; 1528 1529 setcr3(kcr3); 1530 } 1531 1532 /* 1533 * We will already be taking shootdowns for our new HAT, and as KPTI 1534 * invpcid emulation needs to use kf_user_cr3, make sure we don't get 1535 * any cross calls while we're inconsistent. Note that it's harmless to 1536 * have a *stale* kf_user_cr3 (we just did a FLUSH_TLB_ALL), but a 1537 * *zero* kf_user_cr3 is not going to go very well. 
1538 */ 1539 if (pcide) 1540 flag = intr_clear(); 1541 1542 reset_kpti(&cpu->cpu_m.mcpu_kpti, kcr3, ucr3); 1543 reset_kpti(&cpu->cpu_m.mcpu_kpti_flt, kcr3, ucr3); 1544 reset_kpti(&cpu->cpu_m.mcpu_kpti_dbg, kcr3, ucr3); 1545 1546 if (pcide) 1547 intr_restore(flag); 1548 1549 #endif /* !__xpv */ 1550 1551 ASSERT(cpu == CPU); 1552 } 1553 1554 /* 1555 * Utility to return a valid x86pte_t from protections, pfn, and level number 1556 */ 1557 static x86pte_t 1558 hati_mkpte(pfn_t pfn, uint_t attr, level_t level, uint_t flags) 1559 { 1560 x86pte_t pte; 1561 uint_t cache_attr = attr & HAT_ORDER_MASK; 1562 1563 pte = MAKEPTE(pfn, level); 1564 1565 if (attr & PROT_WRITE) 1566 PTE_SET(pte, PT_WRITABLE); 1567 1568 if (attr & PROT_USER) 1569 PTE_SET(pte, PT_USER); 1570 1571 if (!(attr & PROT_EXEC)) 1572 PTE_SET(pte, mmu.pt_nx); 1573 1574 /* 1575 * Set the software bits used track ref/mod sync's and hments. 1576 * If not using REF/MOD, set them to avoid h/w rewriting PTEs. 1577 */ 1578 if (flags & HAT_LOAD_NOCONSIST) 1579 PTE_SET(pte, PT_NOCONSIST | PT_REF | PT_MOD); 1580 else if (attr & HAT_NOSYNC) 1581 PTE_SET(pte, PT_NOSYNC | PT_REF | PT_MOD); 1582 1583 /* 1584 * Set the caching attributes in the PTE. The combination 1585 * of attributes are poorly defined, so we pay attention 1586 * to them in the given order. 1587 * 1588 * The test for HAT_STRICTORDER is different because it's defined 1589 * as "0" - which was a stupid thing to do, but is too late to change! 1590 */ 1591 if (cache_attr == HAT_STRICTORDER) { 1592 PTE_SET(pte, PT_NOCACHE); 1593 /*LINTED [Lint hates empty ifs, but it's the obvious way to do this] */ 1594 } else if (cache_attr & (HAT_UNORDERED_OK | HAT_STORECACHING_OK)) { 1595 /* nothing to set */; 1596 } else if (cache_attr & (HAT_MERGING_OK | HAT_LOADCACHING_OK)) { 1597 PTE_SET(pte, PT_NOCACHE); 1598 if (is_x86_feature(x86_featureset, X86FSET_PAT)) 1599 PTE_SET(pte, (level == 0) ? PT_PAT_4K : PT_PAT_LARGE); 1600 else 1601 PTE_SET(pte, PT_WRITETHRU); 1602 } else { 1603 panic("hati_mkpte(): bad caching attributes: %x\n", cache_attr); 1604 } 1605 1606 return (pte); 1607 } 1608 1609 /* 1610 * Duplicate address translations of the parent to the child. 1611 * This function really isn't used anymore. 1612 */ 1613 /*ARGSUSED*/ 1614 int 1615 hat_dup(hat_t *old, hat_t *new, caddr_t addr, size_t len, uint_t flag) 1616 { 1617 ASSERT((uintptr_t)addr < kernelbase); 1618 ASSERT(new != kas.a_hat); 1619 ASSERT(old != kas.a_hat); 1620 return (0); 1621 } 1622 1623 /* 1624 * Allocate any hat resources required for a process being swapped in. 1625 */ 1626 /*ARGSUSED*/ 1627 void 1628 hat_swapin(hat_t *hat) 1629 { 1630 /* do nothing - we let everything fault back in */ 1631 } 1632 1633 /* 1634 * Unload all translations associated with an address space of a process 1635 * that is being swapped out. 1636 */ 1637 void 1638 hat_swapout(hat_t *hat) 1639 { 1640 uintptr_t vaddr = (uintptr_t)0; 1641 uintptr_t eaddr = _userlimit; 1642 htable_t *ht = NULL; 1643 level_t l; 1644 1645 XPV_DISALLOW_MIGRATE(); 1646 /* 1647 * We can't just call hat_unload(hat, 0, _userlimit...) here, because 1648 * seg_spt and shared pagetables can't be swapped out. 1649 * Take a look at segspt_shmswapout() - it's a big no-op. 1650 * 1651 * Instead we'll walk through all the address space and unload 1652 * any mappings which we are sure are not shared, not locked. 
1653 */ 1654 ASSERT(IS_PAGEALIGNED(vaddr)); 1655 ASSERT(IS_PAGEALIGNED(eaddr)); 1656 ASSERT(AS_LOCK_HELD(hat->hat_as)); 1657 if ((uintptr_t)hat->hat_as->a_userlimit < eaddr) 1658 eaddr = (uintptr_t)hat->hat_as->a_userlimit; 1659 1660 while (vaddr < eaddr) { 1661 (void) htable_walk(hat, &ht, &vaddr, eaddr); 1662 if (ht == NULL) 1663 break; 1664 1665 ASSERT(!IN_VA_HOLE(vaddr)); 1666 1667 /* 1668 * If the page table is shared skip its entire range. 1669 */ 1670 l = ht->ht_level; 1671 if (ht->ht_flags & HTABLE_SHARED_PFN) { 1672 vaddr = ht->ht_vaddr + LEVEL_SIZE(l + 1); 1673 htable_release(ht); 1674 ht = NULL; 1675 continue; 1676 } 1677 1678 /* 1679 * If the page table has no locked entries, unload this one. 1680 */ 1681 if (ht->ht_lock_cnt == 0) 1682 hat_unload(hat, (caddr_t)vaddr, LEVEL_SIZE(l), 1683 HAT_UNLOAD_UNMAP); 1684 1685 /* 1686 * If we have a level 0 page table with locked entries, 1687 * skip the entire page table, otherwise skip just one entry. 1688 */ 1689 if (ht->ht_lock_cnt > 0 && l == 0) 1690 vaddr = ht->ht_vaddr + LEVEL_SIZE(1); 1691 else 1692 vaddr += LEVEL_SIZE(l); 1693 } 1694 if (ht) 1695 htable_release(ht); 1696 1697 /* 1698 * We're in swapout because the system is low on memory, so 1699 * go back and flush all the htables off the cached list. 1700 */ 1701 htable_purge_hat(hat); 1702 XPV_ALLOW_MIGRATE(); 1703 } 1704 1705 /* 1706 * returns number of bytes that have valid mappings in hat. 1707 */ 1708 size_t 1709 hat_get_mapped_size(hat_t *hat) 1710 { 1711 size_t total = 0; 1712 int l; 1713 1714 for (l = 0; l <= mmu.max_page_level; l++) 1715 total += (hat->hat_pages_mapped[l] << LEVEL_SHIFT(l)); 1716 total += hat->hat_ism_pgcnt; 1717 1718 return (total); 1719 } 1720 1721 /* 1722 * enable/disable collection of stats for hat. 1723 */ 1724 int 1725 hat_stats_enable(hat_t *hat) 1726 { 1727 atomic_inc_32(&hat->hat_stats); 1728 return (1); 1729 } 1730 1731 void 1732 hat_stats_disable(hat_t *hat) 1733 { 1734 atomic_dec_32(&hat->hat_stats); 1735 } 1736 1737 /* 1738 * Utility to sync the ref/mod bits from a page table entry to the page_t 1739 * We must be holding the mapping list lock when this is called. 1740 */ 1741 static void 1742 hati_sync_pte_to_page(page_t *pp, x86pte_t pte, level_t level) 1743 { 1744 uint_t rm = 0; 1745 pgcnt_t pgcnt; 1746 1747 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC) 1748 return; 1749 1750 if (PTE_GET(pte, PT_REF)) 1751 rm |= P_REF; 1752 1753 if (PTE_GET(pte, PT_MOD)) 1754 rm |= P_MOD; 1755 1756 if (rm == 0) 1757 return; 1758 1759 /* 1760 * sync to all constituent pages of a large page 1761 */ 1762 ASSERT(x86_hm_held(pp)); 1763 pgcnt = page_get_pagecnt(level); 1764 ASSERT(IS_P2ALIGNED(pp->p_pagenum, pgcnt)); 1765 for (; pgcnt > 0; --pgcnt) { 1766 /* 1767 * hat_page_demote() can't decrease 1768 * pszc below this mapping size 1769 * since this large mapping existed after we 1770 * took mlist lock. 1771 */ 1772 ASSERT(pp->p_szc >= level); 1773 hat_page_setattr(pp, rm); 1774 ++pp; 1775 } 1776 } 1777 1778 /* 1779 * This the set of PTE bits for PFN, permissions and caching 1780 * that are allowed to change on a HAT_LOAD_REMAP 1781 */ 1782 #define PT_REMAP_BITS \ 1783 (PT_PADDR | PT_NX | PT_WRITABLE | PT_WRITETHRU | \ 1784 PT_NOCACHE | PT_PAT_4K | PT_PAT_LARGE | PT_IGNORE | PT_REF | PT_MOD) 1785 1786 #define REMAPASSERT(EX) if (!(EX)) panic("hati_pte_map: " #EX) 1787 /* 1788 * Do the low-level work to get a mapping entered into a HAT's pagetables 1789 * and in the mapping list of the associated page_t. 
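 *
 * In outline (a summary of the code below, not a contract):
 *
 *	1. Bump the htable lock count for a locked, non-kernel load.
 *	2. For a consistent mapping, take the page's mapping list lock and
 *	   reserve an hment via hment_prepare().
 *	3. Install the PTE with x86pte_set(), which returns the old PTE.
 *	4. Return -1 on a large page / page table collision (LPAGE_ERROR).
 *	5. For a brand new mapping, assign the hment and bump ht_valid_cnt
 *	   and the hat's mapped page counts; for a remap, verify that only
 *	   PT_REMAP_BITS changed.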
1790 */ 1791 static int 1792 hati_pte_map( 1793 htable_t *ht, 1794 uint_t entry, 1795 page_t *pp, 1796 x86pte_t pte, 1797 int flags, 1798 void *pte_ptr) 1799 { 1800 hat_t *hat = ht->ht_hat; 1801 x86pte_t old_pte; 1802 level_t l = ht->ht_level; 1803 hment_t *hm; 1804 uint_t is_consist; 1805 uint_t is_locked; 1806 int rv = 0; 1807 1808 /* 1809 * Is this a consistent (ie. need mapping list lock) mapping? 1810 */ 1811 is_consist = (pp != NULL && (flags & HAT_LOAD_NOCONSIST) == 0); 1812 1813 /* 1814 * Track locked mapping count in the htable. Do this first, 1815 * as we track locking even if there already is a mapping present. 1816 */ 1817 is_locked = (flags & HAT_LOAD_LOCK) != 0 && hat != kas.a_hat; 1818 if (is_locked) 1819 HTABLE_LOCK_INC(ht); 1820 1821 /* 1822 * Acquire the page's mapping list lock and get an hment to use. 1823 * Note that hment_prepare() might return NULL. 1824 */ 1825 if (is_consist) { 1826 x86_hm_enter(pp); 1827 hm = hment_prepare(ht, entry, pp); 1828 } 1829 1830 /* 1831 * Set the new pte, retrieving the old one at the same time. 1832 */ 1833 old_pte = x86pte_set(ht, entry, pte, pte_ptr); 1834 1835 /* 1836 * Did we get a large page / page table collision? 1837 */ 1838 if (old_pte == LPAGE_ERROR) { 1839 if (is_locked) 1840 HTABLE_LOCK_DEC(ht); 1841 rv = -1; 1842 goto done; 1843 } 1844 1845 /* 1846 * If the mapping didn't change there is nothing more to do. 1847 */ 1848 if (PTE_EQUIV(pte, old_pte)) 1849 goto done; 1850 1851 /* 1852 * Install a new mapping in the page's mapping list 1853 */ 1854 if (!PTE_ISVALID(old_pte)) { 1855 if (is_consist) { 1856 hment_assign(ht, entry, pp, hm); 1857 x86_hm_exit(pp); 1858 } else { 1859 ASSERT(flags & HAT_LOAD_NOCONSIST); 1860 } 1861 if (ht->ht_flags & HTABLE_COPIED) { 1862 cpu_t *cpu = CPU; 1863 hat_pcp_update(cpu, hat); 1864 } 1865 HTABLE_INC(ht->ht_valid_cnt); 1866 PGCNT_INC(hat, l); 1867 return (rv); 1868 } 1869 1870 /* 1871 * Remap's are more complicated: 1872 * - HAT_LOAD_REMAP must be specified if changing the pfn. 1873 * We also require that NOCONSIST be specified. 1874 * - Otherwise only permission or caching bits may change. 1875 */ 1876 if (!PTE_ISPAGE(old_pte, l)) 1877 panic("non-null/page mapping pte=" FMT_PTE, old_pte); 1878 1879 if (PTE2PFN(old_pte, l) != PTE2PFN(pte, l)) { 1880 REMAPASSERT(flags & HAT_LOAD_REMAP); 1881 REMAPASSERT(flags & HAT_LOAD_NOCONSIST); 1882 REMAPASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST); 1883 REMAPASSERT(pf_is_memory(PTE2PFN(old_pte, l)) == 1884 pf_is_memory(PTE2PFN(pte, l))); 1885 REMAPASSERT(!is_consist); 1886 } 1887 1888 /* 1889 * We only let remaps change the certain bits in the PTE. 1890 */ 1891 if (PTE_GET(old_pte, ~PT_REMAP_BITS) != PTE_GET(pte, ~PT_REMAP_BITS)) 1892 panic("remap bits changed: old_pte="FMT_PTE", pte="FMT_PTE"\n", 1893 old_pte, pte); 1894 1895 /* 1896 * We don't create any mapping list entries on a remap, so release 1897 * any allocated hment after we drop the mapping list lock. 1898 */ 1899 done: 1900 if (is_consist) { 1901 x86_hm_exit(pp); 1902 if (hm != NULL) 1903 hment_free(hm); 1904 } 1905 return (rv); 1906 } 1907 1908 /* 1909 * Internal routine to load a single page table entry. This only fails if 1910 * we attempt to overwrite a page table link with a large page. 
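 *
 * Callers that try large pages handle a nonzero return by retrying at the
 * next smaller level; see hat_memload_array() below, which does
 * essentially this:
 *
 *	while (hati_load_common(hat, va, pp, attr, flags, level, pfn) != 0) {
 *		if (level == 0)
 *			panic("unexpected hati_load_common() failure");
 *		--level;
 *		pgsize = LEVEL_SIZE(level);
 *	}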
1911 */ 1912 static int 1913 hati_load_common( 1914 hat_t *hat, 1915 uintptr_t va, 1916 page_t *pp, 1917 uint_t attr, 1918 uint_t flags, 1919 level_t level, 1920 pfn_t pfn) 1921 { 1922 htable_t *ht; 1923 uint_t entry; 1924 x86pte_t pte; 1925 int rv = 0; 1926 1927 /* 1928 * The number 16 is arbitrary and here to catch a recursion problem 1929 * early before we blow out the kernel stack. 1930 */ 1931 ++curthread->t_hatdepth; 1932 ASSERT(curthread->t_hatdepth < 16); 1933 1934 ASSERT(hat == kas.a_hat || (hat->hat_flags & HAT_PCP) != 0 || 1935 AS_LOCK_HELD(hat->hat_as)); 1936 1937 if (flags & HAT_LOAD_SHARE) 1938 hat->hat_flags |= HAT_SHARED; 1939 1940 /* 1941 * Find the page table that maps this page if it already exists. 1942 */ 1943 ht = htable_lookup(hat, va, level); 1944 1945 /* 1946 * We must have HAT_LOAD_NOCONSIST if page_t is NULL. 1947 */ 1948 if (pp == NULL) 1949 flags |= HAT_LOAD_NOCONSIST; 1950 1951 if (ht == NULL) { 1952 ht = htable_create(hat, va, level, NULL); 1953 ASSERT(ht != NULL); 1954 } 1955 /* 1956 * htable_va2entry checks this condition as well, but it won't include 1957 * much useful info in the panic. So we do it in advance here to include 1958 * all the context. 1959 */ 1960 if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht)) { 1961 panic("hati_load_common: bad htable: va=%p, last page=%p, " 1962 "ht->ht_vaddr=%p, ht->ht_level=%d", (void *)va, 1963 (void *)HTABLE_LAST_PAGE(ht), (void *)ht->ht_vaddr, 1964 (int)ht->ht_level); 1965 } 1966 entry = htable_va2entry(va, ht); 1967 1968 /* 1969 * a bunch of paranoid error checking 1970 */ 1971 ASSERT(ht->ht_busy > 0); 1972 ASSERT(ht->ht_level == level); 1973 1974 /* 1975 * construct the new PTE 1976 */ 1977 if (hat == kas.a_hat) 1978 attr &= ~PROT_USER; 1979 pte = hati_mkpte(pfn, attr, level, flags); 1980 if (hat == kas.a_hat && va >= kernelbase) 1981 PTE_SET(pte, mmu.pt_global); 1982 1983 /* 1984 * establish the mapping 1985 */ 1986 rv = hati_pte_map(ht, entry, pp, pte, flags, NULL); 1987 1988 /* 1989 * release the htable and any reserves 1990 */ 1991 htable_release(ht); 1992 --curthread->t_hatdepth; 1993 return (rv); 1994 } 1995 1996 /* 1997 * special case of hat_memload to deal with some kernel addrs for performance 1998 */ 1999 static void 2000 hat_kmap_load( 2001 caddr_t addr, 2002 page_t *pp, 2003 uint_t attr, 2004 uint_t flags) 2005 { 2006 uintptr_t va = (uintptr_t)addr; 2007 x86pte_t pte; 2008 pfn_t pfn = page_pptonum(pp); 2009 pgcnt_t pg_off = mmu_btop(va - mmu.kmap_addr); 2010 htable_t *ht; 2011 uint_t entry; 2012 void *pte_ptr; 2013 2014 /* 2015 * construct the requested PTE 2016 */ 2017 attr &= ~PROT_USER; 2018 attr |= HAT_STORECACHING_OK; 2019 pte = hati_mkpte(pfn, attr, 0, flags); 2020 PTE_SET(pte, mmu.pt_global); 2021 2022 /* 2023 * Figure out the pte_ptr and htable and use common code to finish up 2024 */ 2025 if (mmu.pae_hat) 2026 pte_ptr = mmu.kmap_ptes + pg_off; 2027 else 2028 pte_ptr = (x86pte32_t *)mmu.kmap_ptes + pg_off; 2029 ht = mmu.kmap_htables[(va - mmu.kmap_htables[0]->ht_vaddr) >> 2030 LEVEL_SHIFT(1)]; 2031 entry = htable_va2entry(va, ht); 2032 ++curthread->t_hatdepth; 2033 ASSERT(curthread->t_hatdepth < 16); 2034 (void) hati_pte_map(ht, entry, pp, pte, flags, pte_ptr); 2035 --curthread->t_hatdepth; 2036 } 2037 2038 /* 2039 * hat_memload() - load a translation to the given page struct 2040 * 2041 * Flags for hat_memload/hat_devload/hat_*attr. 2042 * 2043 * HAT_LOAD Default flags to load a translation to the page. 
2044 * 2045 * HAT_LOAD_LOCK Lock down mapping resources; hat_map(), hat_memload(), 2046 * and hat_devload(). 2047 * 2048 * HAT_LOAD_NOCONSIST Do not add mapping to page_t mapping list. 2049 * sets PT_NOCONSIST 2050 * 2051 * HAT_LOAD_SHARE A flag to hat_memload() to indicate h/w page tables 2052 * that map some user pages (not kas) is shared by more 2053 * than one process (eg. ISM). 2054 * 2055 * HAT_LOAD_REMAP Reload a valid pte with a different page frame. 2056 * 2057 * HAT_NO_KALLOC Do not kmem_alloc while creating the mapping; at this 2058 * point, it's setting up mapping to allocate internal 2059 * hat layer data structures. This flag forces hat layer 2060 * to tap its reserves in order to prevent infinite 2061 * recursion. 2062 * 2063 * The following is a protection attribute (like PROT_READ, etc.) 2064 * 2065 * HAT_NOSYNC set PT_NOSYNC - this mapping's ref/mod bits 2066 * are never cleared. 2067 * 2068 * Installing new valid PTE's and creation of the mapping list 2069 * entry are controlled under the same lock. It's derived from the 2070 * page_t being mapped. 2071 */ 2072 static uint_t supported_memload_flags = 2073 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_ADV | HAT_LOAD_NOCONSIST | 2074 HAT_LOAD_SHARE | HAT_NO_KALLOC | HAT_LOAD_REMAP | HAT_LOAD_TEXT; 2075 2076 void 2077 hat_memload( 2078 hat_t *hat, 2079 caddr_t addr, 2080 page_t *pp, 2081 uint_t attr, 2082 uint_t flags) 2083 { 2084 uintptr_t va = (uintptr_t)addr; 2085 level_t level = 0; 2086 pfn_t pfn = page_pptonum(pp); 2087 2088 XPV_DISALLOW_MIGRATE(); 2089 ASSERT(IS_PAGEALIGNED(va)); 2090 ASSERT(hat == kas.a_hat || va < _userlimit); 2091 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 2092 ASSERT((flags & supported_memload_flags) == flags); 2093 2094 ASSERT(!IN_VA_HOLE(va)); 2095 ASSERT(!PP_ISFREE(pp)); 2096 2097 /* 2098 * kernel address special case for performance. 2099 */ 2100 if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) { 2101 ASSERT(hat == kas.a_hat); 2102 hat_kmap_load(addr, pp, attr, flags); 2103 XPV_ALLOW_MIGRATE(); 2104 return; 2105 } 2106 2107 /* 2108 * This is used for memory with normal caching enabled, so 2109 * always set HAT_STORECACHING_OK. 2110 */ 2111 attr |= HAT_STORECACHING_OK; 2112 if (hati_load_common(hat, va, pp, attr, flags, level, pfn) != 0) 2113 panic("unexpected hati_load_common() failure"); 2114 XPV_ALLOW_MIGRATE(); 2115 } 2116 2117 /* ARGSUSED */ 2118 void 2119 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp, 2120 uint_t attr, uint_t flags, hat_region_cookie_t rcookie) 2121 { 2122 hat_memload(hat, addr, pp, attr, flags); 2123 } 2124 2125 /* 2126 * Load the given array of page structs using large pages when possible 2127 */ 2128 void 2129 hat_memload_array( 2130 hat_t *hat, 2131 caddr_t addr, 2132 size_t len, 2133 page_t **pages, 2134 uint_t attr, 2135 uint_t flags) 2136 { 2137 uintptr_t va = (uintptr_t)addr; 2138 uintptr_t eaddr = va + len; 2139 level_t level; 2140 size_t pgsize; 2141 pgcnt_t pgindx = 0; 2142 pfn_t pfn; 2143 pgcnt_t i; 2144 2145 XPV_DISALLOW_MIGRATE(); 2146 ASSERT(IS_PAGEALIGNED(va)); 2147 ASSERT(hat == kas.a_hat || va + len <= _userlimit); 2148 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 2149 ASSERT((flags & supported_memload_flags) == flags); 2150 2151 /* 2152 * memload is used for memory with full caching enabled, so 2153 * set HAT_STORECACHING_OK. 2154 */ 2155 attr |= HAT_STORECACHING_OK; 2156 2157 /* 2158 * handle all pages using largest possible pagesize 2159 */ 2160 while (va < eaddr) { 2161 /* 2162 * decide what level mapping to use (ie. 
pagesize) 2163 */ 2164 pfn = page_pptonum(pages[pgindx]); 2165 for (level = mmu.max_page_level; ; --level) { 2166 pgsize = LEVEL_SIZE(level); 2167 if (level == 0) 2168 break; 2169 2170 if (!IS_P2ALIGNED(va, pgsize) || 2171 (eaddr - va) < pgsize || 2172 !IS_P2ALIGNED(pfn_to_pa(pfn), pgsize)) 2173 continue; 2174 2175 /* 2176 * To use a large mapping of this size, all the 2177 * pages we are passed must be sequential subpages 2178 * of the large page. 2179 * hat_page_demote() can't change p_szc because 2180 * all pages are locked. 2181 */ 2182 if (pages[pgindx]->p_szc >= level) { 2183 for (i = 0; i < mmu_btop(pgsize); ++i) { 2184 if (pfn + i != 2185 page_pptonum(pages[pgindx + i])) 2186 break; 2187 ASSERT(pages[pgindx + i]->p_szc >= 2188 level); 2189 ASSERT(pages[pgindx] + i == 2190 pages[pgindx + i]); 2191 } 2192 if (i == mmu_btop(pgsize)) { 2193 #ifdef DEBUG 2194 if (level == 2) 2195 map1gcnt++; 2196 #endif 2197 break; 2198 } 2199 } 2200 } 2201 2202 /* 2203 * Load this page mapping. If the load fails, try a smaller 2204 * pagesize. 2205 */ 2206 ASSERT(!IN_VA_HOLE(va)); 2207 while (hati_load_common(hat, va, pages[pgindx], attr, 2208 flags, level, pfn) != 0) { 2209 if (level == 0) 2210 panic("unexpected hati_load_common() failure"); 2211 --level; 2212 pgsize = LEVEL_SIZE(level); 2213 } 2214 2215 /* 2216 * move to next page 2217 */ 2218 va += pgsize; 2219 pgindx += mmu_btop(pgsize); 2220 } 2221 XPV_ALLOW_MIGRATE(); 2222 } 2223 2224 /* ARGSUSED */ 2225 void 2226 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len, 2227 struct page **pps, uint_t attr, uint_t flags, 2228 hat_region_cookie_t rcookie) 2229 { 2230 hat_memload_array(hat, addr, len, pps, attr, flags); 2231 } 2232 2233 /* 2234 * void hat_devload(hat, addr, len, pf, attr, flags) 2235 * load/lock the given page frame number 2236 * 2237 * Advisory ordering attributes. Apply only to device mappings. 2238 * 2239 * HAT_STRICTORDER: the CPU must issue the references in order, as the 2240 * programmer specified. This is the default. 2241 * HAT_UNORDERED_OK: the CPU may reorder the references (this is all kinds 2242 * of reordering; store or load with store or load). 2243 * HAT_MERGING_OK: merging and batching: the CPU may merge individual stores 2244 * to consecutive locations (for example, turn two consecutive byte 2245 * stores into one halfword store), and it may batch individual loads 2246 * (for example, turn two consecutive byte loads into one halfword load). 2247 * This also implies re-ordering. 2248 * HAT_LOADCACHING_OK: the CPU may cache the data it fetches and reuse it 2249 * until another store occurs. The default is to fetch new data 2250 * on every load. This also implies merging. 2251 * HAT_STORECACHING_OK: the CPU may keep the data in the cache and push it to 2252 * the device (perhaps with other data) at a later time. The default is 2253 * to push the data right away. This also implies load caching. 2254 * 2255 * Equivalent of hat_memload(), but can be used for device memory where 2256 * there are no page_t's and we support additional flags (write merging, etc). 2257 * Note that we can have large page mappings with this interface. 
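 *
 * A minimal usage sketch (hypothetical driver code, illustrative only):
 * map npages of device registers at kva with strict ordering and keep the
 * mapping locked:
 *
 *	hat_devload(kas.a_hat, kva, ptob(npages), mmio_pfn,
 *	    PROT_READ | PROT_WRITE | HAT_STRICTORDER,
 *	    HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);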
2258 */ 2259 int supported_devload_flags = HAT_LOAD | HAT_LOAD_LOCK | 2260 HAT_LOAD_NOCONSIST | HAT_STRICTORDER | HAT_UNORDERED_OK | 2261 HAT_MERGING_OK | HAT_LOADCACHING_OK | HAT_STORECACHING_OK; 2262 2263 void 2264 hat_devload( 2265 hat_t *hat, 2266 caddr_t addr, 2267 size_t len, 2268 pfn_t pfn, 2269 uint_t attr, 2270 int flags) 2271 { 2272 uintptr_t va = ALIGN2PAGE(addr); 2273 uintptr_t eva = va + len; 2274 level_t level; 2275 size_t pgsize; 2276 page_t *pp; 2277 int f; /* per PTE copy of flags - maybe modified */ 2278 uint_t a; /* per PTE copy of attr */ 2279 2280 XPV_DISALLOW_MIGRATE(); 2281 ASSERT(IS_PAGEALIGNED(va)); 2282 ASSERT(hat == kas.a_hat || eva <= _userlimit); 2283 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 2284 ASSERT((flags & supported_devload_flags) == flags); 2285 2286 /* 2287 * handle all pages 2288 */ 2289 while (va < eva) { 2290 2291 /* 2292 * decide what level mapping to use (ie. pagesize) 2293 */ 2294 for (level = mmu.max_page_level; ; --level) { 2295 pgsize = LEVEL_SIZE(level); 2296 if (level == 0) 2297 break; 2298 if (IS_P2ALIGNED(va, pgsize) && 2299 (eva - va) >= pgsize && 2300 IS_P2ALIGNED(pfn, mmu_btop(pgsize))) { 2301 #ifdef DEBUG 2302 if (level == 2) 2303 map1gcnt++; 2304 #endif 2305 break; 2306 } 2307 } 2308 2309 /* 2310 * If this is just memory then allow caching (this happens 2311 * for the nucleus pages) - though HAT_PLAT_NOCACHE can be used 2312 * to override that. If we don't have a page_t then make sure 2313 * NOCONSIST is set. 2314 */ 2315 a = attr; 2316 f = flags; 2317 if (!pf_is_memory(pfn)) 2318 f |= HAT_LOAD_NOCONSIST; 2319 else if (!(a & HAT_PLAT_NOCACHE)) 2320 a |= HAT_STORECACHING_OK; 2321 2322 if (f & HAT_LOAD_NOCONSIST) 2323 pp = NULL; 2324 else 2325 pp = page_numtopp_nolock(pfn); 2326 2327 /* 2328 * Check to make sure we are really trying to map a valid 2329 * memory page. The caller wishing to intentionally map 2330 * free memory pages will have passed the HAT_LOAD_NOCONSIST 2331 * flag, then pp will be NULL. 2332 */ 2333 if (pp != NULL) { 2334 if (PP_ISFREE(pp)) { 2335 panic("hat_devload: loading " 2336 "a mapping to free page %p", (void *)pp); 2337 } 2338 2339 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 2340 panic("hat_devload: loading a mapping " 2341 "to an unlocked page %p", 2342 (void *)pp); 2343 } 2344 } 2345 2346 /* 2347 * load this page mapping 2348 */ 2349 ASSERT(!IN_VA_HOLE(va)); 2350 while (hati_load_common(hat, va, pp, a, f, level, pfn) != 0) { 2351 if (level == 0) 2352 panic("unexpected hati_load_common() failure"); 2353 --level; 2354 pgsize = LEVEL_SIZE(level); 2355 } 2356 2357 /* 2358 * move to next page 2359 */ 2360 va += pgsize; 2361 pfn += mmu_btop(pgsize); 2362 } 2363 XPV_ALLOW_MIGRATE(); 2364 } 2365 2366 /* 2367 * void hat_unlock(hat, addr, len) 2368 * unlock the mappings to a given range of addresses 2369 * 2370 * Locks are tracked by ht_lock_cnt in the htable. 
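 *
 * A typical pairing looks like this sketch (hypothetical caller,
 * illustrative only):
 *
 *	hat_memload(as->a_hat, addr, pp, PROT_READ, HAT_LOAD_LOCK);
 *	...
 *	hat_unlock(as->a_hat, addr, MMU_PAGESIZE);
 *
 * Each locked load increments ht_lock_cnt on the page table involved;
 * each hat_unlock() over the same range decrements it.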
2371 */ 2372 void 2373 hat_unlock(hat_t *hat, caddr_t addr, size_t len) 2374 { 2375 uintptr_t vaddr = (uintptr_t)addr; 2376 uintptr_t eaddr = vaddr + len; 2377 htable_t *ht = NULL; 2378 2379 /* 2380 * kernel entries are always locked, we don't track lock counts 2381 */ 2382 ASSERT(hat == kas.a_hat || eaddr <= _userlimit); 2383 ASSERT(IS_PAGEALIGNED(vaddr)); 2384 ASSERT(IS_PAGEALIGNED(eaddr)); 2385 if (hat == kas.a_hat) 2386 return; 2387 if (eaddr > _userlimit) 2388 panic("hat_unlock() address out of range - above _userlimit"); 2389 2390 XPV_DISALLOW_MIGRATE(); 2391 ASSERT(AS_LOCK_HELD(hat->hat_as)); 2392 while (vaddr < eaddr) { 2393 (void) htable_walk(hat, &ht, &vaddr, eaddr); 2394 if (ht == NULL) 2395 break; 2396 2397 ASSERT(!IN_VA_HOLE(vaddr)); 2398 2399 if (ht->ht_lock_cnt < 1) 2400 panic("hat_unlock(): lock_cnt < 1, " 2401 "htable=%p, vaddr=%p\n", (void *)ht, (void *)vaddr); 2402 HTABLE_LOCK_DEC(ht); 2403 2404 vaddr += LEVEL_SIZE(ht->ht_level); 2405 } 2406 if (ht) 2407 htable_release(ht); 2408 XPV_ALLOW_MIGRATE(); 2409 } 2410 2411 /* ARGSUSED */ 2412 void 2413 hat_unlock_region(struct hat *hat, caddr_t addr, size_t len, 2414 hat_region_cookie_t rcookie) 2415 { 2416 panic("No shared region support on x86"); 2417 } 2418 2419 #if !defined(__xpv) 2420 /* 2421 * Cross call service routine to demap a range of virtual 2422 * pages on the current CPU or flush all mappings in TLB. 2423 */ 2424 static int 2425 hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) 2426 { 2427 _NOTE(ARGUNUSED(a3)); 2428 hat_t *hat = (hat_t *)a1; 2429 tlb_range_t *range = (tlb_range_t *)a2; 2430 2431 /* 2432 * If the target hat isn't the kernel and this CPU isn't operating 2433 * in the target hat, we can ignore the cross call. 2434 */ 2435 if (hat != kas.a_hat && hat != CPU->cpu_current_hat) 2436 return (0); 2437 2438 if (range->tr_va != DEMAP_ALL_ADDR) { 2439 mmu_flush_tlb(FLUSH_TLB_RANGE, range); 2440 return (0); 2441 } 2442 2443 /* 2444 * We are flushing all of userspace. 2445 * 2446 * When using PCP, we first need to update this CPU's idea of the PCP 2447 * PTEs. 2448 */ 2449 if (hat->hat_flags & HAT_COPIED) { 2450 hat_pcp_update(CPU, hat); 2451 } 2452 2453 mmu_flush_tlb(FLUSH_TLB_NONGLOBAL, NULL); 2454 return (0); 2455 } 2456 2457 #define TLBIDLE_CPU_HALTED (0x1UL) 2458 #define TLBIDLE_INVAL_ALL (0x2UL) 2459 #define CAS_TLB_INFO(cpu, old, new) \ 2460 atomic_cas_ulong((ulong_t *)&(cpu)->cpu_m.mcpu_tlb_info, (old), (new)) 2461 2462 /* 2463 * Record that a CPU is going idle 2464 */ 2465 void 2466 tlb_going_idle(void) 2467 { 2468 atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info, 2469 TLBIDLE_CPU_HALTED); 2470 } 2471 2472 /* 2473 * Service a delayed TLB flush if coming out of being idle. 2474 * It will be called from cpu idle notification with interrupt disabled. 2475 */ 2476 void 2477 tlb_service(void) 2478 { 2479 ulong_t tlb_info; 2480 ulong_t found; 2481 2482 /* 2483 * We only have to do something if coming out of being idle. 2484 */ 2485 tlb_info = CPU->cpu_m.mcpu_tlb_info; 2486 if (tlb_info & TLBIDLE_CPU_HALTED) { 2487 ASSERT(CPU->cpu_current_hat == kas.a_hat); 2488 2489 /* 2490 * Atomic clear and fetch of old state. 
2491 */ 2492 while ((found = CAS_TLB_INFO(CPU, tlb_info, 0)) != tlb_info) { 2493 ASSERT(found & TLBIDLE_CPU_HALTED); 2494 tlb_info = found; 2495 SMT_PAUSE(); 2496 } 2497 if (tlb_info & TLBIDLE_INVAL_ALL) 2498 mmu_flush_tlb(FLUSH_TLB_ALL, NULL); 2499 } 2500 } 2501 #endif /* !__xpv */ 2502 2503 /* 2504 * Internal routine to do cross calls to invalidate a range of pages on 2505 * all CPUs using a given hat. 2506 */ 2507 void 2508 hat_tlb_inval_range(hat_t *hat, tlb_range_t *in_range) 2509 { 2510 extern int flushes_require_xcalls; /* from mp_startup.c */ 2511 cpuset_t justme; 2512 cpuset_t cpus_to_shootdown; 2513 tlb_range_t range = *in_range; 2514 #ifndef __xpv 2515 cpuset_t check_cpus; 2516 cpu_t *cpup; 2517 int c; 2518 #endif 2519 2520 /* 2521 * If the hat is being destroyed, there are no more users, so 2522 * demap need not do anything. 2523 */ 2524 if (hat->hat_flags & HAT_FREEING) 2525 return; 2526 2527 /* 2528 * If demapping from a shared pagetable, we best demap the 2529 * entire set of user TLBs, since we don't know what addresses 2530 * these were shared at. 2531 */ 2532 if (hat->hat_flags & HAT_SHARED) { 2533 hat = kas.a_hat; 2534 range.tr_va = DEMAP_ALL_ADDR; 2535 } 2536 2537 /* 2538 * if not running with multiple CPUs, don't use cross calls 2539 */ 2540 if (panicstr || !flushes_require_xcalls) { 2541 #ifdef __xpv 2542 if (range.tr_va == DEMAP_ALL_ADDR) { 2543 xen_flush_tlb(); 2544 } else { 2545 for (size_t i = 0; i < TLB_RANGE_LEN(&range); 2546 i += MMU_PAGESIZE) { 2547 xen_flush_va((caddr_t)(range.tr_va + i)); 2548 } 2549 } 2550 #else 2551 (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0); 2552 #endif 2553 return; 2554 } 2555 2556 2557 /* 2558 * Determine CPUs to shootdown. Kernel changes always do all CPUs. 2559 * Otherwise it's just CPUs currently executing in this hat. 2560 */ 2561 kpreempt_disable(); 2562 CPUSET_ONLY(justme, CPU->cpu_id); 2563 if (hat == kas.a_hat) 2564 cpus_to_shootdown = khat_cpuset; 2565 else 2566 cpus_to_shootdown = hat->hat_cpus; 2567 2568 #ifndef __xpv 2569 /* 2570 * If any CPUs in the set are idle, just request a delayed flush 2571 * and avoid waking them up. 
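 *
 * The handshake with an idle CPU is, informally:
 *
 *	idle CPU:  tlb_going_idle() sets TLBIDLE_CPU_HALTED
 *	here:      CAS in TLBIDLE_INVAL_ALL while the CPU remains halted
 *	idle CPU:  tlb_service() sees TLBIDLE_INVAL_ALL on wakeup and does
 *	           mmu_flush_tlb(FLUSH_TLB_ALL, NULL)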
2572 */ 2573 check_cpus = cpus_to_shootdown; 2574 for (c = 0; c < NCPU && !CPUSET_ISNULL(check_cpus); ++c) { 2575 ulong_t tlb_info; 2576 2577 if (!CPU_IN_SET(check_cpus, c)) 2578 continue; 2579 CPUSET_DEL(check_cpus, c); 2580 cpup = cpu[c]; 2581 if (cpup == NULL) 2582 continue; 2583 2584 tlb_info = cpup->cpu_m.mcpu_tlb_info; 2585 while (tlb_info == TLBIDLE_CPU_HALTED) { 2586 (void) CAS_TLB_INFO(cpup, TLBIDLE_CPU_HALTED, 2587 TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL); 2588 SMT_PAUSE(); 2589 tlb_info = cpup->cpu_m.mcpu_tlb_info; 2590 } 2591 if (tlb_info == (TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL)) { 2592 HATSTAT_INC(hs_tlb_inval_delayed); 2593 CPUSET_DEL(cpus_to_shootdown, c); 2594 } 2595 } 2596 #endif 2597 2598 if (CPUSET_ISNULL(cpus_to_shootdown) || 2599 CPUSET_ISEQUAL(cpus_to_shootdown, justme)) { 2600 2601 #ifdef __xpv 2602 if (range.tr_va == DEMAP_ALL_ADDR) { 2603 xen_flush_tlb(); 2604 } else { 2605 for (size_t i = 0; i < TLB_RANGE_LEN(&range); 2606 i += MMU_PAGESIZE) { 2607 xen_flush_va((caddr_t)(range.tr_va + i)); 2608 } 2609 } 2610 #else 2611 (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0); 2612 #endif 2613 2614 } else { 2615 2616 CPUSET_ADD(cpus_to_shootdown, CPU->cpu_id); 2617 #ifdef __xpv 2618 if (range.tr_va == DEMAP_ALL_ADDR) { 2619 xen_gflush_tlb(cpus_to_shootdown); 2620 } else { 2621 for (size_t i = 0; i < TLB_RANGE_LEN(&range); 2622 i += MMU_PAGESIZE) { 2623 xen_gflush_va((caddr_t)(range.tr_va + i), 2624 cpus_to_shootdown); 2625 } 2626 } 2627 #else 2628 xc_call((xc_arg_t)hat, (xc_arg_t)&range, 0, 2629 CPUSET2BV(cpus_to_shootdown), hati_demap_func); 2630 #endif 2631 2632 } 2633 kpreempt_enable(); 2634 } 2635 2636 void 2637 hat_tlb_inval(hat_t *hat, uintptr_t va) 2638 { 2639 /* 2640 * Create range for a single page. 2641 */ 2642 tlb_range_t range; 2643 range.tr_va = va; 2644 range.tr_cnt = 1; /* one page */ 2645 range.tr_level = MIN_PAGE_LEVEL; /* pages are MMU_PAGESIZE */ 2646 2647 hat_tlb_inval_range(hat, &range); 2648 } 2649 2650 /* 2651 * Interior routine for HAT_UNLOADs from hat_unload_callback(), 2652 * hat_kmap_unload() OR from hat_steal() code. This routine doesn't 2653 * handle releasing of the htables. 2654 */ 2655 void 2656 hat_pte_unmap( 2657 htable_t *ht, 2658 uint_t entry, 2659 uint_t flags, 2660 x86pte_t old_pte, 2661 void *pte_ptr, 2662 boolean_t tlb) 2663 { 2664 hat_t *hat = ht->ht_hat; 2665 hment_t *hm = NULL; 2666 page_t *pp = NULL; 2667 level_t l = ht->ht_level; 2668 pfn_t pfn; 2669 2670 /* 2671 * We always track the locking counts, even if nothing is unmapped 2672 */ 2673 if ((flags & HAT_UNLOAD_UNLOCK) != 0 && hat != kas.a_hat) { 2674 ASSERT(ht->ht_lock_cnt > 0); 2675 HTABLE_LOCK_DEC(ht); 2676 } 2677 2678 /* 2679 * Figure out which page's mapping list lock to acquire using the PFN 2680 * passed in "old" PTE. We then attempt to invalidate the PTE. 2681 * If another thread, probably a hat_pageunload, has asynchronously 2682 * unmapped/remapped this address we'll loop here. 
2683 */ 2684 ASSERT(ht->ht_busy > 0); 2685 while (PTE_ISVALID(old_pte)) { 2686 pfn = PTE2PFN(old_pte, l); 2687 if (PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST) { 2688 pp = NULL; 2689 } else { 2690 #ifdef __xpv 2691 if (pfn == PFN_INVALID) 2692 panic("Invalid PFN, but not PT_NOCONSIST"); 2693 #endif 2694 pp = page_numtopp_nolock(pfn); 2695 if (pp == NULL) { 2696 panic("no page_t, not NOCONSIST: old_pte=" 2697 FMT_PTE " ht=%lx entry=0x%x pte_ptr=%lx", 2698 old_pte, (uintptr_t)ht, entry, 2699 (uintptr_t)pte_ptr); 2700 } 2701 x86_hm_enter(pp); 2702 } 2703 2704 old_pte = x86pte_inval(ht, entry, old_pte, pte_ptr, tlb); 2705 2706 /* 2707 * If the page hadn't changed we've unmapped it and can proceed 2708 */ 2709 if (PTE_ISVALID(old_pte) && PTE2PFN(old_pte, l) == pfn) 2710 break; 2711 2712 /* 2713 * Otherwise, we'll have to retry with the current old_pte. 2714 * Drop the hment lock, since the pfn may have changed. 2715 */ 2716 if (pp != NULL) { 2717 x86_hm_exit(pp); 2718 pp = NULL; 2719 } else { 2720 ASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST); 2721 } 2722 } 2723 2724 /* 2725 * If the old mapping wasn't valid, there's nothing more to do 2726 */ 2727 if (!PTE_ISVALID(old_pte)) { 2728 if (pp != NULL) 2729 x86_hm_exit(pp); 2730 return; 2731 } 2732 2733 /* 2734 * Take care of syncing any MOD/REF bits and removing the hment. 2735 */ 2736 if (pp != NULL) { 2737 if (!(flags & HAT_UNLOAD_NOSYNC)) 2738 hati_sync_pte_to_page(pp, old_pte, l); 2739 hm = hment_remove(pp, ht, entry); 2740 x86_hm_exit(pp); 2741 if (hm != NULL) 2742 hment_free(hm); 2743 } 2744 2745 /* 2746 * Handle book keeping in the htable and hat 2747 */ 2748 ASSERT(ht->ht_valid_cnt > 0); 2749 HTABLE_DEC(ht->ht_valid_cnt); 2750 PGCNT_DEC(hat, l); 2751 } 2752 2753 /* 2754 * very cheap unload implementation to special case some kernel addresses 2755 */ 2756 static void 2757 hat_kmap_unload(caddr_t addr, size_t len, uint_t flags) 2758 { 2759 uintptr_t va = (uintptr_t)addr; 2760 uintptr_t eva = va + len; 2761 pgcnt_t pg_index; 2762 htable_t *ht; 2763 uint_t entry; 2764 x86pte_t *pte_ptr; 2765 x86pte_t old_pte; 2766 2767 for (; va < eva; va += MMU_PAGESIZE) { 2768 /* 2769 * Get the PTE 2770 */ 2771 pg_index = mmu_btop(va - mmu.kmap_addr); 2772 pte_ptr = PT_INDEX_PTR(mmu.kmap_ptes, pg_index); 2773 old_pte = GET_PTE(pte_ptr); 2774 2775 /* 2776 * get the htable / entry 2777 */ 2778 ht = mmu.kmap_htables[(va - mmu.kmap_htables[0]->ht_vaddr) 2779 >> LEVEL_SHIFT(1)]; 2780 entry = htable_va2entry(va, ht); 2781 2782 /* 2783 * use mostly common code to unmap it. 2784 */ 2785 hat_pte_unmap(ht, entry, flags, old_pte, pte_ptr, B_TRUE); 2786 } 2787 } 2788 2789 2790 /* 2791 * unload a range of virtual address space (no callback) 2792 */ 2793 void 2794 hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags) 2795 { 2796 uintptr_t va = (uintptr_t)addr; 2797 2798 XPV_DISALLOW_MIGRATE(); 2799 ASSERT(hat == kas.a_hat || va + len <= _userlimit); 2800 2801 /* 2802 * special case for performance. 2803 */ 2804 if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) { 2805 ASSERT(hat == kas.a_hat); 2806 hat_kmap_unload(addr, len, flags); 2807 } else { 2808 hat_unload_callback(hat, addr, len, flags, NULL); 2809 } 2810 XPV_ALLOW_MIGRATE(); 2811 } 2812 2813 /* 2814 * Invalidate the TLB, and perform the callback to the upper level VM system, 2815 * for the specified ranges of contiguous pages. 
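 *
 * For example (illustrative numbers): a range with tr_cnt == 3 at
 * tr_level == 1 (2MB pages on amd64) yields a callback covering
 * 3 << LEVEL_SHIFT(1), i.e. 6MB, starting at tr_va.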
2816 */ 2817 static void 2818 handle_ranges(hat_t *hat, hat_callback_t *cb, uint_t cnt, tlb_range_t *range) 2819 { 2820 while (cnt > 0) { 2821 --cnt; 2822 hat_tlb_inval_range(hat, &range[cnt]); 2823 2824 if (cb != NULL) { 2825 cb->hcb_start_addr = (caddr_t)range[cnt].tr_va; 2826 cb->hcb_end_addr = cb->hcb_start_addr; 2827 cb->hcb_end_addr += range[cnt].tr_cnt << 2828 LEVEL_SHIFT(range[cnt].tr_level); 2829 cb->hcb_function(cb); 2830 } 2831 } 2832 } 2833 2834 /* 2835 * Unload a given range of addresses (has optional callback) 2836 * 2837 * Flags: 2838 * define HAT_UNLOAD 0x00 2839 * define HAT_UNLOAD_NOSYNC 0x02 2840 * define HAT_UNLOAD_UNLOCK 0x04 2841 * define HAT_UNLOAD_OTHER 0x08 - not used 2842 * define HAT_UNLOAD_UNMAP 0x10 - same as HAT_UNLOAD 2843 */ 2844 #define MAX_UNLOAD_CNT (8) 2845 void 2846 hat_unload_callback( 2847 hat_t *hat, 2848 caddr_t addr, 2849 size_t len, 2850 uint_t flags, 2851 hat_callback_t *cb) 2852 { 2853 uintptr_t vaddr = (uintptr_t)addr; 2854 uintptr_t eaddr = vaddr + len; 2855 htable_t *ht = NULL; 2856 uint_t entry; 2857 uintptr_t contig_va = (uintptr_t)-1L; 2858 tlb_range_t r[MAX_UNLOAD_CNT]; 2859 uint_t r_cnt = 0; 2860 x86pte_t old_pte; 2861 2862 XPV_DISALLOW_MIGRATE(); 2863 ASSERT(hat == kas.a_hat || eaddr <= _userlimit); 2864 ASSERT(IS_PAGEALIGNED(vaddr)); 2865 ASSERT(IS_PAGEALIGNED(eaddr)); 2866 2867 /* 2868 * Special case a single page being unloaded for speed. This happens 2869 * quite frequently, COW faults after a fork() for example. 2870 */ 2871 if (cb == NULL && len == MMU_PAGESIZE) { 2872 ht = htable_getpte(hat, vaddr, &entry, &old_pte, 0); 2873 if (ht != NULL) { 2874 if (PTE_ISVALID(old_pte)) { 2875 hat_pte_unmap(ht, entry, flags, old_pte, 2876 NULL, B_TRUE); 2877 } 2878 htable_release(ht); 2879 } 2880 XPV_ALLOW_MIGRATE(); 2881 return; 2882 } 2883 2884 while (vaddr < eaddr) { 2885 old_pte = htable_walk(hat, &ht, &vaddr, eaddr); 2886 if (ht == NULL) 2887 break; 2888 2889 ASSERT(!IN_VA_HOLE(vaddr)); 2890 2891 if (vaddr < (uintptr_t)addr) 2892 panic("hat_unload_callback(): unmap inside large page"); 2893 2894 /* 2895 * We'll do the call backs for contiguous ranges 2896 */ 2897 if (vaddr != contig_va || 2898 (r_cnt > 0 && r[r_cnt - 1].tr_level != ht->ht_level)) { 2899 if (r_cnt == MAX_UNLOAD_CNT) { 2900 handle_ranges(hat, cb, r_cnt, r); 2901 r_cnt = 0; 2902 } 2903 r[r_cnt].tr_va = vaddr; 2904 r[r_cnt].tr_cnt = 0; 2905 r[r_cnt].tr_level = ht->ht_level; 2906 ++r_cnt; 2907 } 2908 2909 /* 2910 * Unload one mapping (for a single page) from the page tables. 2911 * Note that we do not remove the mapping from the TLB yet, 2912 * as indicated by the tlb=FALSE argument to hat_pte_unmap(). 2913 * handle_ranges() will clear the TLB entries with one call to 2914 * hat_tlb_inval_range() per contiguous range. This is 2915 * safe because the page can not be reused until the 2916 * callback is made (or we return). 2917 */ 2918 entry = htable_va2entry(vaddr, ht); 2919 hat_pte_unmap(ht, entry, flags, old_pte, NULL, B_FALSE); 2920 ASSERT(ht->ht_level <= mmu.max_page_level); 2921 vaddr += LEVEL_SIZE(ht->ht_level); 2922 contig_va = vaddr; 2923 ++r[r_cnt - 1].tr_cnt; 2924 } 2925 if (ht) 2926 htable_release(ht); 2927 2928 /* 2929 * handle last range for callbacks 2930 */ 2931 if (r_cnt > 0) 2932 handle_ranges(hat, cb, r_cnt, r); 2933 XPV_ALLOW_MIGRATE(); 2934 } 2935 2936 /* 2937 * Invalidate a virtual address translation on a slave CPU during 2938 * panic() dumps. 
2939 */ 2940 void 2941 hat_flush_range(hat_t *hat, caddr_t va, size_t size) 2942 { 2943 ssize_t sz; 2944 caddr_t endva = va + size; 2945 2946 while (va < endva) { 2947 sz = hat_getpagesize(hat, va); 2948 if (sz < 0) { 2949 #ifdef __xpv 2950 xen_flush_tlb(); 2951 #else 2952 mmu_flush_tlb(FLUSH_TLB_ALL, NULL); 2953 #endif 2954 break; 2955 } 2956 #ifdef __xpv 2957 xen_flush_va(va); 2958 #else 2959 mmu_flush_tlb_kpage((uintptr_t)va); 2960 #endif 2961 va += sz; 2962 } 2963 } 2964 2965 /* 2966 * synchronize mapping with software data structures 2967 * 2968 * This interface is currently only used by the working set monitor 2969 * driver. 2970 */ 2971 /*ARGSUSED*/ 2972 void 2973 hat_sync(hat_t *hat, caddr_t addr, size_t len, uint_t flags) 2974 { 2975 uintptr_t vaddr = (uintptr_t)addr; 2976 uintptr_t eaddr = vaddr + len; 2977 htable_t *ht = NULL; 2978 uint_t entry; 2979 x86pte_t pte; 2980 x86pte_t save_pte; 2981 x86pte_t new; 2982 page_t *pp; 2983 2984 ASSERT(!IN_VA_HOLE(vaddr)); 2985 ASSERT(IS_PAGEALIGNED(vaddr)); 2986 ASSERT(IS_PAGEALIGNED(eaddr)); 2987 ASSERT(hat == kas.a_hat || eaddr <= _userlimit); 2988 2989 XPV_DISALLOW_MIGRATE(); 2990 for (; vaddr < eaddr; vaddr += LEVEL_SIZE(ht->ht_level)) { 2991 try_again: 2992 pte = htable_walk(hat, &ht, &vaddr, eaddr); 2993 if (ht == NULL) 2994 break; 2995 entry = htable_va2entry(vaddr, ht); 2996 2997 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC || 2998 PTE_GET(pte, PT_REF | PT_MOD) == 0) 2999 continue; 3000 3001 /* 3002 * We need to acquire the mapping list lock to protect 3003 * against hat_pageunload(), hat_unload(), etc. 3004 */ 3005 pp = page_numtopp_nolock(PTE2PFN(pte, ht->ht_level)); 3006 if (pp == NULL) 3007 break; 3008 x86_hm_enter(pp); 3009 save_pte = pte; 3010 pte = x86pte_get(ht, entry); 3011 if (pte != save_pte) { 3012 x86_hm_exit(pp); 3013 goto try_again; 3014 } 3015 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC || 3016 PTE_GET(pte, PT_REF | PT_MOD) == 0) { 3017 x86_hm_exit(pp); 3018 continue; 3019 } 3020 3021 /* 3022 * Need to clear ref or mod bits. We may compete with 3023 * hardware updating the R/M bits and have to try again. 3024 */ 3025 if (flags == HAT_SYNC_ZERORM) { 3026 new = pte; 3027 PTE_CLR(new, PT_REF | PT_MOD); 3028 pte = hati_update_pte(ht, entry, pte, new); 3029 if (pte != 0) { 3030 x86_hm_exit(pp); 3031 goto try_again; 3032 } 3033 } else { 3034 /* 3035 * sync the PTE to the page_t 3036 */ 3037 hati_sync_pte_to_page(pp, save_pte, ht->ht_level); 3038 } 3039 x86_hm_exit(pp); 3040 } 3041 if (ht) 3042 htable_release(ht); 3043 XPV_ALLOW_MIGRATE(); 3044 } 3045 3046 /* 3047 * void hat_map(hat, addr, len, flags) 3048 */ 3049 /*ARGSUSED*/ 3050 void 3051 hat_map(hat_t *hat, caddr_t addr, size_t len, uint_t flags) 3052 { 3053 /* does nothing */ 3054 } 3055 3056 /* 3057 * uint_t hat_getattr(hat, addr, *attr) 3058 * returns attr for <hat,addr> in *attr. returns 0 if there was a 3059 * mapping and *attr is valid, nonzero if there was no mapping and 3060 * *attr is not valid. 
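 *
 * A typical call looks like this sketch (hypothetical caller,
 * illustrative only):
 *
 *	uint_t attr;
 *
 *	if (hat_getattr(as->a_hat, addr, &attr) == 0 &&
 *	    (attr & PROT_WRITE) != 0)
 *		...the mapping at addr is currently writable...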
3061 */ 3062 uint_t 3063 hat_getattr(hat_t *hat, caddr_t addr, uint_t *attr) 3064 { 3065 uintptr_t vaddr = ALIGN2PAGE(addr); 3066 htable_t *ht = NULL; 3067 x86pte_t pte; 3068 3069 ASSERT(hat == kas.a_hat || vaddr <= _userlimit); 3070 3071 if (IN_VA_HOLE(vaddr)) 3072 return ((uint_t)-1); 3073 3074 ht = htable_getpte(hat, vaddr, NULL, &pte, mmu.max_page_level); 3075 if (ht == NULL) 3076 return ((uint_t)-1); 3077 3078 if (!PTE_ISVALID(pte) || !PTE_ISPAGE(pte, ht->ht_level)) { 3079 htable_release(ht); 3080 return ((uint_t)-1); 3081 } 3082 3083 *attr = PROT_READ; 3084 if (PTE_GET(pte, PT_WRITABLE)) 3085 *attr |= PROT_WRITE; 3086 if (PTE_GET(pte, PT_USER)) 3087 *attr |= PROT_USER; 3088 if (!PTE_GET(pte, mmu.pt_nx)) 3089 *attr |= PROT_EXEC; 3090 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC) 3091 *attr |= HAT_NOSYNC; 3092 htable_release(ht); 3093 return (0); 3094 } 3095 3096 /* 3097 * hat_updateattr() applies the given attribute change to an existing mapping 3098 */ 3099 #define HAT_LOAD_ATTR 1 3100 #define HAT_SET_ATTR 2 3101 #define HAT_CLR_ATTR 3 3102 3103 static void 3104 hat_updateattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr, int what) 3105 { 3106 uintptr_t vaddr = (uintptr_t)addr; 3107 uintptr_t eaddr = (uintptr_t)addr + len; 3108 htable_t *ht = NULL; 3109 uint_t entry; 3110 x86pte_t oldpte, newpte; 3111 page_t *pp; 3112 3113 XPV_DISALLOW_MIGRATE(); 3114 ASSERT(IS_PAGEALIGNED(vaddr)); 3115 ASSERT(IS_PAGEALIGNED(eaddr)); 3116 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 3117 for (; vaddr < eaddr; vaddr += LEVEL_SIZE(ht->ht_level)) { 3118 try_again: 3119 oldpte = htable_walk(hat, &ht, &vaddr, eaddr); 3120 if (ht == NULL) 3121 break; 3122 if (PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOCONSIST) 3123 continue; 3124 3125 pp = page_numtopp_nolock(PTE2PFN(oldpte, ht->ht_level)); 3126 if (pp == NULL) 3127 continue; 3128 x86_hm_enter(pp); 3129 3130 newpte = oldpte; 3131 /* 3132 * We found a page table entry in the desired range, 3133 * figure out the new attributes. 3134 */ 3135 if (what == HAT_SET_ATTR || what == HAT_LOAD_ATTR) { 3136 if ((attr & PROT_WRITE) && 3137 !PTE_GET(oldpte, PT_WRITABLE)) 3138 newpte |= PT_WRITABLE; 3139 3140 if ((attr & HAT_NOSYNC) && 3141 PTE_GET(oldpte, PT_SOFTWARE) < PT_NOSYNC) 3142 newpte |= PT_NOSYNC; 3143 3144 if ((attr & PROT_EXEC) && PTE_GET(oldpte, mmu.pt_nx)) 3145 newpte &= ~mmu.pt_nx; 3146 } 3147 3148 if (what == HAT_LOAD_ATTR) { 3149 if (!(attr & PROT_WRITE) && 3150 PTE_GET(oldpte, PT_WRITABLE)) 3151 newpte &= ~PT_WRITABLE; 3152 3153 if (!(attr & HAT_NOSYNC) && 3154 PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOSYNC) 3155 newpte &= ~PT_SOFTWARE; 3156 3157 if (!(attr & PROT_EXEC) && !PTE_GET(oldpte, mmu.pt_nx)) 3158 newpte |= mmu.pt_nx; 3159 } 3160 3161 if (what == HAT_CLR_ATTR) { 3162 if ((attr & PROT_WRITE) && PTE_GET(oldpte, PT_WRITABLE)) 3163 newpte &= ~PT_WRITABLE; 3164 3165 if ((attr & HAT_NOSYNC) && 3166 PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOSYNC) 3167 newpte &= ~PT_SOFTWARE; 3168 3169 if ((attr & PROT_EXEC) && !PTE_GET(oldpte, mmu.pt_nx)) 3170 newpte |= mmu.pt_nx; 3171 } 3172 3173 /* 3174 * Ensure NOSYNC/NOCONSIST mappings have REF and MOD set. 3175 * x86pte_set() depends on this. 3176 */ 3177 if (PTE_GET(newpte, PT_SOFTWARE) >= PT_NOSYNC) 3178 newpte |= PT_REF | PT_MOD; 3179 3180 /* 3181 * what about PROT_READ or others? this code only handles: 3182 * EXEC, WRITE, NOSYNC 3183 */ 3184 3185 /* 3186 * If new PTE really changed, update the table. 
3187 */ 3188 if (newpte != oldpte) { 3189 entry = htable_va2entry(vaddr, ht); 3190 oldpte = hati_update_pte(ht, entry, oldpte, newpte); 3191 if (oldpte != 0) { 3192 x86_hm_exit(pp); 3193 goto try_again; 3194 } 3195 } 3196 x86_hm_exit(pp); 3197 } 3198 if (ht) 3199 htable_release(ht); 3200 XPV_ALLOW_MIGRATE(); 3201 } 3202 3203 /* 3204 * Various wrappers for hat_updateattr() 3205 */ 3206 void 3207 hat_setattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr) 3208 { 3209 ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit); 3210 hat_updateattr(hat, addr, len, attr, HAT_SET_ATTR); 3211 } 3212 3213 void 3214 hat_clrattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr) 3215 { 3216 ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit); 3217 hat_updateattr(hat, addr, len, attr, HAT_CLR_ATTR); 3218 } 3219 3220 void 3221 hat_chgattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr) 3222 { 3223 ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit); 3224 hat_updateattr(hat, addr, len, attr, HAT_LOAD_ATTR); 3225 } 3226 3227 void 3228 hat_chgprot(hat_t *hat, caddr_t addr, size_t len, uint_t vprot) 3229 { 3230 ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit); 3231 hat_updateattr(hat, addr, len, vprot & HAT_PROT_MASK, HAT_LOAD_ATTR); 3232 } 3233 3234 /* 3235 * size_t hat_getpagesize(hat, addr) 3236 * returns pagesize in bytes for <hat, addr>. returns -1 of there is 3237 * no mapping. This is an advisory call. 3238 */ 3239 ssize_t 3240 hat_getpagesize(hat_t *hat, caddr_t addr) 3241 { 3242 uintptr_t vaddr = ALIGN2PAGE(addr); 3243 htable_t *ht; 3244 size_t pagesize; 3245 3246 ASSERT(hat == kas.a_hat || vaddr <= _userlimit); 3247 if (IN_VA_HOLE(vaddr)) 3248 return (-1); 3249 ht = htable_getpage(hat, vaddr, NULL); 3250 if (ht == NULL) 3251 return (-1); 3252 pagesize = LEVEL_SIZE(ht->ht_level); 3253 htable_release(ht); 3254 return (pagesize); 3255 } 3256 3257 3258 3259 /* 3260 * pfn_t hat_getpfnum(hat, addr) 3261 * returns pfn for <hat, addr> or PFN_INVALID if mapping is invalid. 3262 */ 3263 pfn_t 3264 hat_getpfnum(hat_t *hat, caddr_t addr) 3265 { 3266 uintptr_t vaddr = ALIGN2PAGE(addr); 3267 htable_t *ht; 3268 uint_t entry; 3269 pfn_t pfn = PFN_INVALID; 3270 3271 ASSERT(hat == kas.a_hat || vaddr <= _userlimit); 3272 if (khat_running == 0) 3273 return (PFN_INVALID); 3274 3275 if (IN_VA_HOLE(vaddr)) 3276 return (PFN_INVALID); 3277 3278 XPV_DISALLOW_MIGRATE(); 3279 /* 3280 * A very common use of hat_getpfnum() is from the DDI for kernel pages. 3281 * Use the kmap_ptes (which also covers the 32 bit heap) to speed 3282 * this up. 3283 */ 3284 if (mmu.kmap_addr <= vaddr && vaddr < mmu.kmap_eaddr) { 3285 x86pte_t pte; 3286 pgcnt_t pg_index; 3287 3288 pg_index = mmu_btop(vaddr - mmu.kmap_addr); 3289 pte = GET_PTE(PT_INDEX_PTR(mmu.kmap_ptes, pg_index)); 3290 if (PTE_ISVALID(pte)) 3291 /*LINTED [use of constant 0 causes a lint warning] */ 3292 pfn = PTE2PFN(pte, 0); 3293 XPV_ALLOW_MIGRATE(); 3294 return (pfn); 3295 } 3296 3297 ht = htable_getpage(hat, vaddr, &entry); 3298 if (ht == NULL) { 3299 XPV_ALLOW_MIGRATE(); 3300 return (PFN_INVALID); 3301 } 3302 ASSERT(vaddr >= ht->ht_vaddr); 3303 ASSERT(vaddr <= HTABLE_LAST_PAGE(ht)); 3304 pfn = PTE2PFN(x86pte_get(ht, entry), ht->ht_level); 3305 if (ht->ht_level > 0) 3306 pfn += mmu_btop(vaddr & LEVEL_OFFSET(ht->ht_level)); 3307 htable_release(ht); 3308 XPV_ALLOW_MIGRATE(); 3309 return (pfn); 3310 } 3311 3312 /* 3313 * int hat_probe(hat, addr) 3314 * return 0 if no valid mapping is present. 
Faster version 3315 * of hat_getattr in certain architectures. 3316 */ 3317 int 3318 hat_probe(hat_t *hat, caddr_t addr) 3319 { 3320 uintptr_t vaddr = ALIGN2PAGE(addr); 3321 uint_t entry; 3322 htable_t *ht; 3323 pgcnt_t pg_off; 3324 3325 ASSERT(hat == kas.a_hat || vaddr <= _userlimit); 3326 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 3327 if (IN_VA_HOLE(vaddr)) 3328 return (0); 3329 3330 /* 3331 * Most common use of hat_probe is from segmap. We special case it 3332 * for performance. 3333 */ 3334 if (mmu.kmap_addr <= vaddr && vaddr < mmu.kmap_eaddr) { 3335 pg_off = mmu_btop(vaddr - mmu.kmap_addr); 3336 if (mmu.pae_hat) 3337 return (PTE_ISVALID(mmu.kmap_ptes[pg_off])); 3338 else 3339 return (PTE_ISVALID( 3340 ((x86pte32_t *)mmu.kmap_ptes)[pg_off])); 3341 } 3342 3343 ht = htable_getpage(hat, vaddr, &entry); 3344 htable_release(ht); 3345 return (ht != NULL); 3346 } 3347 3348 /* 3349 * Find out if the segment for hat_share()/hat_unshare() is DISM or locked ISM. 3350 */ 3351 static int 3352 is_it_dism(hat_t *hat, caddr_t va) 3353 { 3354 struct seg *seg; 3355 struct shm_data *shmd; 3356 struct spt_data *sptd; 3357 3358 seg = as_findseg(hat->hat_as, va, 0); 3359 ASSERT(seg != NULL); 3360 ASSERT(seg->s_base <= va); 3361 shmd = (struct shm_data *)seg->s_data; 3362 ASSERT(shmd != NULL); 3363 sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 3364 ASSERT(sptd != NULL); 3365 if (sptd->spt_flags & SHM_PAGEABLE) 3366 return (1); 3367 return (0); 3368 } 3369 3370 /* 3371 * Simple implementation of ISM. hat_share() is similar to hat_memload_array(), 3372 * except that we use the ism_hat's existing mappings to determine the pages 3373 * and protections to use for this hat. If we find a full properly aligned 3374 * and sized pagetable, we will attempt to share the pagetable itself. 3375 */ 3376 /*ARGSUSED*/ 3377 int 3378 hat_share( 3379 hat_t *hat, 3380 caddr_t addr, 3381 hat_t *ism_hat, 3382 caddr_t src_addr, 3383 size_t len, /* almost useless value, see below.. */ 3384 uint_t ismszc) 3385 { 3386 uintptr_t vaddr_start = (uintptr_t)addr; 3387 uintptr_t vaddr; 3388 uintptr_t eaddr = vaddr_start + len; 3389 uintptr_t ism_addr_start = (uintptr_t)src_addr; 3390 uintptr_t ism_addr = ism_addr_start; 3391 uintptr_t e_ism_addr = ism_addr + len; 3392 htable_t *ism_ht = NULL; 3393 htable_t *ht; 3394 x86pte_t pte; 3395 page_t *pp; 3396 pfn_t pfn; 3397 level_t l; 3398 pgcnt_t pgcnt; 3399 uint_t prot; 3400 int is_dism; 3401 int flags; 3402 3403 /* 3404 * We might be asked to share an empty DISM hat by as_dup() 3405 */ 3406 ASSERT(hat != kas.a_hat); 3407 ASSERT(eaddr <= _userlimit); 3408 if (!(ism_hat->hat_flags & HAT_SHARED)) { 3409 ASSERT(hat_get_mapped_size(ism_hat) == 0); 3410 return (0); 3411 } 3412 XPV_DISALLOW_MIGRATE(); 3413 3414 /* 3415 * The SPT segment driver often passes us a size larger than there are 3416 * valid mappings. That's because it rounds the segment size up to a 3417 * large pagesize, even if the actual memory mapped by ism_hat is less. 3418 */ 3419 ASSERT(IS_PAGEALIGNED(vaddr_start)); 3420 ASSERT(IS_PAGEALIGNED(ism_addr_start)); 3421 ASSERT(ism_hat->hat_flags & HAT_SHARED); 3422 is_dism = is_it_dism(hat, addr); 3423 while (ism_addr < e_ism_addr) { 3424 /* 3425 * use htable_walk to get the next valid ISM mapping 3426 */ 3427 pte = htable_walk(ism_hat, &ism_ht, &ism_addr, e_ism_addr); 3428 if (ism_ht == NULL) 3429 break; 3430 3431 /* 3432 * First check to see if we already share the page table. 
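 *
 * Informal summary of the checks that follow (the code below is the
 * authoritative version) - the ISM pagetable itself is shared only when
 * all of these hold:
 *
 *	l < mmu.max_level			never the top level table
 *	!(is_dism && l > 0)			avoid DISM level mismatches
 *	ism_addr == ism_ht->ht_vaddr		aligned source
 *	(vaddr & LEVEL_OFFSET(l + 1)) == 0	aligned destination
 *	e_ism_addr - ism_addr >= LEVEL_SIZE(l + 1)	covers a full table
 *	every PTE in ism_ht maps a page (leaf)	no lower level page tables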
3433 */ 3434 l = ism_ht->ht_level; 3435 vaddr = vaddr_start + (ism_addr - ism_addr_start); 3436 ht = htable_lookup(hat, vaddr, l); 3437 if (ht != NULL) { 3438 if (ht->ht_flags & HTABLE_SHARED_PFN) 3439 goto shared; 3440 htable_release(ht); 3441 goto not_shared; 3442 } 3443 3444 /* 3445 * Can't ever share top table. 3446 */ 3447 if (l == mmu.max_level) 3448 goto not_shared; 3449 3450 /* 3451 * Avoid level mismatches later due to DISM faults. 3452 */ 3453 if (is_dism && l > 0) 3454 goto not_shared; 3455 3456 /* 3457 * addresses and lengths must align 3458 * table must be fully populated 3459 * no lower level page tables 3460 */ 3461 if (ism_addr != ism_ht->ht_vaddr || 3462 (vaddr & LEVEL_OFFSET(l + 1)) != 0) 3463 goto not_shared; 3464 3465 /* 3466 * The range of address space must cover a full table. 3467 */ 3468 if (e_ism_addr - ism_addr < LEVEL_SIZE(l + 1)) 3469 goto not_shared; 3470 3471 /* 3472 * All entries in the ISM page table must be leaf PTEs. 3473 */ 3474 if (l > 0) { 3475 int e; 3476 3477 /* 3478 * We know the 0th is from htable_walk() above. 3479 */ 3480 for (e = 1; e < HTABLE_NUM_PTES(ism_ht); ++e) { 3481 x86pte_t pte; 3482 pte = x86pte_get(ism_ht, e); 3483 if (!PTE_ISPAGE(pte, l)) 3484 goto not_shared; 3485 } 3486 } 3487 3488 /* 3489 * share the page table 3490 */ 3491 ht = htable_create(hat, vaddr, l, ism_ht); 3492 shared: 3493 ASSERT(ht->ht_flags & HTABLE_SHARED_PFN); 3494 ASSERT(ht->ht_shares == ism_ht); 3495 hat->hat_ism_pgcnt += 3496 (ism_ht->ht_valid_cnt - ht->ht_valid_cnt) << 3497 (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT); 3498 ht->ht_valid_cnt = ism_ht->ht_valid_cnt; 3499 htable_release(ht); 3500 ism_addr = ism_ht->ht_vaddr + LEVEL_SIZE(l + 1); 3501 htable_release(ism_ht); 3502 ism_ht = NULL; 3503 continue; 3504 3505 not_shared: 3506 /* 3507 * Unable to share the page table. Instead we will 3508 * create new mappings from the values in the ISM mappings. 3509 * Figure out what level size mappings to use; 3510 */ 3511 for (l = ism_ht->ht_level; l > 0; --l) { 3512 if (LEVEL_SIZE(l) <= eaddr - vaddr && 3513 (vaddr & LEVEL_OFFSET(l)) == 0) 3514 break; 3515 } 3516 3517 /* 3518 * The ISM mapping might be larger than the share area, 3519 * be careful to truncate it if needed. 3520 */ 3521 if (eaddr - vaddr >= LEVEL_SIZE(ism_ht->ht_level)) { 3522 pgcnt = mmu_btop(LEVEL_SIZE(ism_ht->ht_level)); 3523 } else { 3524 pgcnt = mmu_btop(eaddr - vaddr); 3525 l = 0; 3526 } 3527 3528 pfn = PTE2PFN(pte, ism_ht->ht_level); 3529 ASSERT(pfn != PFN_INVALID); 3530 while (pgcnt > 0) { 3531 /* 3532 * Make a new pte for the PFN for this level. 3533 * Copy protections for the pte from the ISM pte. 3534 */ 3535 pp = page_numtopp_nolock(pfn); 3536 ASSERT(pp != NULL); 3537 3538 prot = PROT_USER | PROT_READ | HAT_UNORDERED_OK; 3539 if (PTE_GET(pte, PT_WRITABLE)) 3540 prot |= PROT_WRITE; 3541 if (!PTE_GET(pte, PT_NX)) 3542 prot |= PROT_EXEC; 3543 3544 flags = HAT_LOAD; 3545 if (!is_dism) 3546 flags |= HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST; 3547 while (hati_load_common(hat, vaddr, pp, prot, flags, 3548 l, pfn) != 0) { 3549 if (l == 0) 3550 panic("hati_load_common() failure"); 3551 --l; 3552 } 3553 3554 vaddr += LEVEL_SIZE(l); 3555 ism_addr += LEVEL_SIZE(l); 3556 pfn += mmu_btop(LEVEL_SIZE(l)); 3557 pgcnt -= mmu_btop(LEVEL_SIZE(l)); 3558 } 3559 } 3560 if (ism_ht != NULL) 3561 htable_release(ism_ht); 3562 XPV_ALLOW_MIGRATE(); 3563 return (0); 3564 } 3565 3566 3567 /* 3568 * hat_unshare() is similar to hat_unload_callback(), but 3569 * we have to look for empty shared pagetables. 
Note that 3570 * hat_unshare() is always invoked against an entire segment. 3571 */ 3572 /*ARGSUSED*/ 3573 void 3574 hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc) 3575 { 3576 uint64_t vaddr = (uintptr_t)addr; 3577 uintptr_t eaddr = vaddr + len; 3578 htable_t *ht = NULL; 3579 uint_t need_demaps = 0; 3580 int flags = HAT_UNLOAD_UNMAP; 3581 level_t l; 3582 3583 ASSERT(hat != kas.a_hat); 3584 ASSERT(eaddr <= _userlimit); 3585 ASSERT(IS_PAGEALIGNED(vaddr)); 3586 ASSERT(IS_PAGEALIGNED(eaddr)); 3587 XPV_DISALLOW_MIGRATE(); 3588 3589 /* 3590 * First go through and remove any shared pagetables. 3591 * 3592 * Note that it's ok to delay the TLB shootdown till the entire range is 3593 * finished, because if hat_pageunload() were to unload a shared 3594 * pagetable page, its hat_tlb_inval() will do a global TLB invalidate. 3595 */ 3596 l = mmu.max_page_level; 3597 if (l == mmu.max_level) 3598 --l; 3599 for (; l >= 0; --l) { 3600 for (vaddr = (uintptr_t)addr; vaddr < eaddr; 3601 vaddr = (vaddr & LEVEL_MASK(l + 1)) + LEVEL_SIZE(l + 1)) { 3602 ASSERT(!IN_VA_HOLE(vaddr)); 3603 /* 3604 * find a pagetable that maps the current address 3605 */ 3606 ht = htable_lookup(hat, vaddr, l); 3607 if (ht == NULL) 3608 continue; 3609 if (ht->ht_flags & HTABLE_SHARED_PFN) { 3610 /* 3611 * clear page count, set valid_cnt to 0, 3612 * let htable_release() finish the job 3613 */ 3614 hat->hat_ism_pgcnt -= ht->ht_valid_cnt << 3615 (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT); 3616 ht->ht_valid_cnt = 0; 3617 need_demaps = 1; 3618 } 3619 htable_release(ht); 3620 } 3621 } 3622 3623 /* 3624 * flush the TLBs - since we're probably dealing with MANY mappings 3625 * we just do a full invalidation. 3626 */ 3627 if (!(hat->hat_flags & HAT_FREEING) && need_demaps) 3628 hat_tlb_inval(hat, DEMAP_ALL_ADDR); 3629 3630 /* 3631 * Now go back and clean up any unaligned mappings that 3632 * couldn't share pagetables. 3633 */ 3634 if (!is_it_dism(hat, addr)) 3635 flags |= HAT_UNLOAD_UNLOCK; 3636 hat_unload(hat, addr, len, flags); 3637 XPV_ALLOW_MIGRATE(); 3638 } 3639 3640 3641 /* 3642 * hat_reserve() does nothing 3643 */ 3644 /*ARGSUSED*/ 3645 void 3646 hat_reserve(struct as *as, caddr_t addr, size_t len) 3647 { 3648 } 3649 3650 3651 /* 3652 * Called when all mappings to a page should have write permission removed. 3653 * Mostly stolen from hat_pagesync() 3654 */ 3655 static void 3656 hati_page_clrwrt(struct page *pp) 3657 { 3658 hment_t *hm = NULL; 3659 htable_t *ht; 3660 uint_t entry; 3661 x86pte_t old; 3662 x86pte_t new; 3663 uint_t pszc = 0; 3664 3665 XPV_DISALLOW_MIGRATE(); 3666 next_size: 3667 /* 3668 * walk thru the mapping list clearing write permission 3669 */ 3670 x86_hm_enter(pp); 3671 while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL) { 3672 if (ht->ht_level < pszc) 3673 continue; 3674 old = x86pte_get(ht, entry); 3675 3676 for (;;) { 3677 /* 3678 * Is this mapping of interest? 3679 */ 3680 if (PTE2PFN(old, ht->ht_level) != pp->p_pagenum || 3681 PTE_GET(old, PT_WRITABLE) == 0) 3682 break; 3683 3684 /* 3685 * Clear ref/mod writable bits. This requires cross 3686 * calls to ensure any executing TLBs see cleared bits. 
3687 */ 3688 new = old; 3689 PTE_CLR(new, PT_REF | PT_MOD | PT_WRITABLE); 3690 old = hati_update_pte(ht, entry, old, new); 3691 if (old != 0) 3692 continue; 3693 3694 break; 3695 } 3696 } 3697 x86_hm_exit(pp); 3698 while (pszc < pp->p_szc) { 3699 page_t *tpp; 3700 pszc++; 3701 tpp = PP_GROUPLEADER(pp, pszc); 3702 if (pp != tpp) { 3703 pp = tpp; 3704 goto next_size; 3705 } 3706 } 3707 XPV_ALLOW_MIGRATE(); 3708 } 3709 3710 /* 3711 * void hat_page_setattr(pp, flag) 3712 * void hat_page_clrattr(pp, flag) 3713 * used to set/clr ref/mod bits. 3714 */ 3715 void 3716 hat_page_setattr(struct page *pp, uint_t flag) 3717 { 3718 vnode_t *vp = pp->p_vnode; 3719 kmutex_t *vphm = NULL; 3720 page_t **listp; 3721 int noshuffle; 3722 3723 noshuffle = flag & P_NSH; 3724 flag &= ~P_NSH; 3725 3726 if (PP_GETRM(pp, flag) == flag) 3727 return; 3728 3729 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) && 3730 !noshuffle) { 3731 vphm = page_vnode_mutex(vp); 3732 mutex_enter(vphm); 3733 } 3734 3735 PP_SETRM(pp, flag); 3736 3737 if (vphm != NULL) { 3738 3739 /* 3740 * Some File Systems examine v_pages for NULL w/o 3741 * grabbing the vphm mutex. Must not let it become NULL when 3742 * pp is the only page on the list. 3743 */ 3744 if (pp->p_vpnext != pp) { 3745 page_vpsub(&vp->v_pages, pp); 3746 if (vp->v_pages != NULL) 3747 listp = &vp->v_pages->p_vpprev->p_vpnext; 3748 else 3749 listp = &vp->v_pages; 3750 page_vpadd(listp, pp); 3751 } 3752 mutex_exit(vphm); 3753 } 3754 } 3755 3756 void 3757 hat_page_clrattr(struct page *pp, uint_t flag) 3758 { 3759 vnode_t *vp = pp->p_vnode; 3760 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 3761 3762 /* 3763 * Caller is expected to hold page's io lock for VMODSORT to work 3764 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod 3765 * bit is cleared. 3766 * We don't have assert to avoid tripping some existing third party 3767 * code. The dirty page is moved back to top of the v_page list 3768 * after IO is done in pvn_write_done(). 3769 */ 3770 PP_CLRRM(pp, flag); 3771 3772 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 3773 3774 /* 3775 * VMODSORT works by removing write permissions and getting 3776 * a fault when a page is made dirty. At this point 3777 * we need to remove write permission from all mappings 3778 * to this page. 3779 */ 3780 hati_page_clrwrt(pp); 3781 } 3782 } 3783 3784 /* 3785 * If flag is specified, returns 0 if attribute is disabled 3786 * and non zero if enabled. If flag specifes multiple attributes 3787 * then returns 0 if ALL attributes are disabled. This is an advisory 3788 * call. 3789 */ 3790 uint_t 3791 hat_page_getattr(struct page *pp, uint_t flag) 3792 { 3793 return (PP_GETRM(pp, flag)); 3794 } 3795 3796 3797 /* 3798 * common code used by hat_pageunload() and hment_steal() 3799 */ 3800 hment_t * 3801 hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry) 3802 { 3803 x86pte_t old_pte; 3804 pfn_t pfn = pp->p_pagenum; 3805 hment_t *hm; 3806 3807 /* 3808 * We need to acquire a hold on the htable in order to 3809 * do the invalidate. We know the htable must exist, since 3810 * unmap's don't release the htable until after removing any 3811 * hment. Having x86_hm_enter() keeps that from proceeding. 3812 */ 3813 htable_acquire(ht); 3814 3815 /* 3816 * Invalidate the PTE and remove the hment. 
3817 */ 3818 old_pte = x86pte_inval(ht, entry, 0, NULL, B_TRUE); 3819 if (PTE2PFN(old_pte, ht->ht_level) != pfn) { 3820 panic("x86pte_inval() failure found PTE = " FMT_PTE 3821 " pfn being unmapped is %lx ht=0x%lx entry=0x%x", 3822 old_pte, pfn, (uintptr_t)ht, entry); 3823 } 3824 3825 /* 3826 * Clean up all the htable information for this mapping 3827 */ 3828 ASSERT(ht->ht_valid_cnt > 0); 3829 HTABLE_DEC(ht->ht_valid_cnt); 3830 PGCNT_DEC(ht->ht_hat, ht->ht_level); 3831 3832 /* 3833 * sync ref/mod bits to the page_t 3834 */ 3835 if (PTE_GET(old_pte, PT_SOFTWARE) < PT_NOSYNC) 3836 hati_sync_pte_to_page(pp, old_pte, ht->ht_level); 3837 3838 /* 3839 * Remove the mapping list entry for this page. 3840 */ 3841 hm = hment_remove(pp, ht, entry); 3842 3843 /* 3844 * drop the mapping list lock so that we might free the 3845 * hment and htable. 3846 */ 3847 x86_hm_exit(pp); 3848 htable_release(ht); 3849 return (hm); 3850 } 3851 3852 extern int vpm_enable; 3853 /* 3854 * Unload all translations to a page. If the page is a subpage of a large 3855 * page, the large page mappings are also removed. 3856 * 3857 * The forceflags are unused. 3858 */ 3859 3860 /*ARGSUSED*/ 3861 static int 3862 hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) 3863 { 3864 page_t *cur_pp = pp; 3865 hment_t *hm; 3866 hment_t *prev; 3867 htable_t *ht; 3868 uint_t entry; 3869 level_t level; 3870 3871 XPV_DISALLOW_MIGRATE(); 3872 3873 /* 3874 * prevent recursion due to kmem_free() 3875 */ 3876 ++curthread->t_hatdepth; 3877 ASSERT(curthread->t_hatdepth < 16); 3878 3879 /* 3880 * clear the vpm ref. 3881 */ 3882 if (vpm_enable) { 3883 pp->p_vpmref = 0; 3884 } 3885 /* 3886 * The loop with next_size handles pages with multiple pagesize mappings 3887 */ 3888 next_size: 3889 for (;;) { 3890 3891 /* 3892 * Get a mapping list entry 3893 */ 3894 x86_hm_enter(cur_pp); 3895 for (prev = NULL; ; prev = hm) { 3896 hm = hment_walk(cur_pp, &ht, &entry, prev); 3897 if (hm == NULL) { 3898 x86_hm_exit(cur_pp); 3899 3900 /* 3901 * If not part of a larger page, we're done. 3902 */ 3903 if (cur_pp->p_szc <= pg_szcd) { 3904 ASSERT(curthread->t_hatdepth > 0); 3905 --curthread->t_hatdepth; 3906 XPV_ALLOW_MIGRATE(); 3907 return (0); 3908 } 3909 3910 /* 3911 * Else check the next larger page size. 3912 * hat_page_demote() may decrease p_szc 3913 * but that's ok we'll just take an extra 3914 * trip discover there're no larger mappings 3915 * and return. 3916 */ 3917 ++pg_szcd; 3918 cur_pp = PP_GROUPLEADER(cur_pp, pg_szcd); 3919 goto next_size; 3920 } 3921 3922 /* 3923 * If this mapping size matches, remove it. 3924 */ 3925 level = ht->ht_level; 3926 if (level == pg_szcd) 3927 break; 3928 } 3929 3930 /* 3931 * Remove the mapping list entry for this page. 3932 * Note this does the x86_hm_exit() for us. 3933 */ 3934 hm = hati_page_unmap(cur_pp, ht, entry); 3935 if (hm != NULL) 3936 hment_free(hm); 3937 } 3938 } 3939 3940 int 3941 hat_pageunload(struct page *pp, uint_t forceflag) 3942 { 3943 ASSERT(PAGE_EXCL(pp)); 3944 return (hati_pageunload(pp, 0, forceflag)); 3945 } 3946 3947 /* 3948 * Unload all large mappings to pp and reduce by 1 p_szc field of every large 3949 * page level that included pp. 3950 * 3951 * pp must be locked EXCL. Even though no other constituent pages are locked 3952 * it's legal to unload large mappings to pp because all constituent pages of 3953 * large locked mappings have to be locked SHARED. therefore if we have EXCL 3954 * lock on one of constituent pages none of the large mappings to pp are 3955 * locked. 
3956 * 3957 * Change (always decrease) the p_szc field starting from the last constituent 3958 * page and ending with the root constituent page, so that the root's pszc 3959 * always shows the area where hat_page_demote() may be active. 3960 * 3961 * This mechanism is only used for file system pages where it's not always 3962 * possible to get EXCL locks on all constituent pages to demote the size code 3963 * (as is done for anonymous or kernel large pages). 3964 */ 3965 void 3966 hat_page_demote(page_t *pp) 3967 { 3968 uint_t pszc; 3969 uint_t rszc; 3970 uint_t szc; 3971 page_t *rootpp; 3972 page_t *firstpp; 3973 page_t *lastpp; 3974 pgcnt_t pgcnt; 3975 3976 ASSERT(PAGE_EXCL(pp)); 3977 ASSERT(!PP_ISFREE(pp)); 3978 ASSERT(page_szc_lock_assert(pp)); 3979 3980 if (pp->p_szc == 0) 3981 return; 3982 3983 rootpp = PP_GROUPLEADER(pp, 1); 3984 (void) hati_pageunload(rootpp, 1, HAT_FORCE_PGUNLOAD); 3985 3986 /* 3987 * All large mappings to pp are gone 3988 * and no new ones can be set up since pp is locked exclusively. 3989 * 3990 * Lock the root to make sure there's only one hat_page_demote() 3991 * outstanding within the area of this root's pszc. 3992 * 3993 * A second potential hat_page_demote() is already eliminated by the upper 3994 * VM layer via page_szc_lock(), but we don't rely on it and use our 3995 * own locking (so that the upper layer locking can be changed without 3996 * the assumption that the hat depends on the upper layer VM to prevent 3997 * multiple hat_page_demote() calls from being issued simultaneously to the 3998 * same large page). 3999 */ 4000 again: 4001 pszc = pp->p_szc; 4002 if (pszc == 0) 4003 return; 4004 rootpp = PP_GROUPLEADER(pp, pszc); 4005 x86_hm_enter(rootpp); 4006 /* 4007 * If the root's p_szc is different from pszc we raced with another 4008 * hat_page_demote(). Drop the lock and try to find the root again. 4009 * If the root's p_szc is greater than pszc the previous hat_page_demote() 4010 * is not done yet. Take and release the mlist lock of the root's root to 4011 * wait for the previous hat_page_demote() to complete. 4012 */ 4013 if ((rszc = rootpp->p_szc) != pszc) { 4014 x86_hm_exit(rootpp); 4015 if (rszc > pszc) { 4016 /* p_szc of a locked non-free page can't increase */ 4017 ASSERT(pp != rootpp); 4018 4019 rootpp = PP_GROUPLEADER(rootpp, rszc); 4020 x86_hm_enter(rootpp); 4021 x86_hm_exit(rootpp); 4022 } 4023 goto again; 4024 } 4025 ASSERT(pp->p_szc == pszc); 4026 4027 /* 4028 * Decrement by 1 the p_szc of every constituent page of a region that 4029 * covered pp. For example, if the original szc is 3 it gets changed to 2 4030 * everywhere except in the region 2 that covered pp. The region 2 that 4031 * covered pp gets demoted to 1 everywhere except in the region 1 that 4032 * covered pp. The region 1 that covered pp is demoted to region 4033 * 0. It's done this way because from region 3 we removed level 3 4034 * mappings, from the region 2 that covered pp we removed level 2 mappings, 4035 * and from the region 1 that covered pp we removed level 1 mappings. All 4036 * changes are done from high pfns to low pfns so that roots 4037 * are changed last, allowing one to know the largest region where 4038 * hat_page_demote() is still active by only looking at the root page. 4039 * 4040 * This algorithm is implemented in two while loops. The first loop changes 4041 * the p_szc of pages to the right of pp's level 1 region and the second 4042 * loop changes the p_szc of pages of the level 1 region that covers pp 4043 * and of all pages to the left of the level 1 region that covers pp.
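 *
 * (Worked illustration, assuming pszc == 2 with amd64 page sizes, i.e.
 * 4K/2M/1G: the first loop sets p_szc to 1 on every page of the 1G region
 * that lies to the right of pp's 2M region; the second loop then sets
 * p_szc to 0 across pp's own 2M region and p_szc to 1 on the pages to its
 * left, updating the root page last.)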
4044 * In the first loop p_szc keeps dropping with every iteration, 4045 * and in the second loop it keeps increasing with every iteration. 4046 * 4047 * First loop description: Demote pages to the right of pp outside of the 4048 * level 1 region that covers pp. In every iteration of the while 4049 * loop below, find the last page of the szc region and the first page of 4050 * the (szc - 1) region that is immediately to the right of the (szc - 1) 4051 * region that covers pp. From the last such page to the first such page 4052 * change every page's szc to szc - 1. Decrement szc and continue 4053 * looping until szc is 1. If pp belongs to the last (szc - 1) region 4054 * of the szc region, skip to the next iteration. 4055 */ 4056 szc = pszc; 4057 while (szc > 1) { 4058 lastpp = PP_GROUPLEADER(pp, szc); 4059 pgcnt = page_get_pagecnt(szc); 4060 lastpp += pgcnt - 1; 4061 firstpp = PP_GROUPLEADER(pp, (szc - 1)); 4062 pgcnt = page_get_pagecnt(szc - 1); 4063 if (lastpp - firstpp < pgcnt) { 4064 szc--; 4065 continue; 4066 } 4067 firstpp += pgcnt; 4068 while (lastpp != firstpp) { 4069 ASSERT(lastpp->p_szc == pszc); 4070 lastpp->p_szc = szc - 1; 4071 lastpp--; 4072 } 4073 firstpp->p_szc = szc - 1; 4074 szc--; 4075 } 4076 4077 /* 4078 * Second loop description: 4079 * The first iteration changes p_szc to 0 for every 4080 * page of the level 1 region that covers pp. 4081 * Subsequent iterations find the last page of the szc region 4082 * immediately to the left of the szc region that covered pp 4083 * and the first page of the (szc + 1) region that covers pp. 4084 * From the last to the first page change the p_szc of every page to szc. 4085 * Increment szc and continue looping until szc is pszc. 4086 * If pp belongs to the first szc region of the (szc + 1) region, 4087 * skip to the next iteration. 4088 * 4089 */ 4090 szc = 0; 4091 while (szc < pszc) { 4092 firstpp = PP_GROUPLEADER(pp, (szc + 1)); 4093 if (szc == 0) { 4094 pgcnt = page_get_pagecnt(1); 4095 lastpp = firstpp + (pgcnt - 1); 4096 } else { 4097 lastpp = PP_GROUPLEADER(pp, szc); 4098 if (firstpp == lastpp) { 4099 szc++; 4100 continue; 4101 } 4102 lastpp--; 4103 pgcnt = page_get_pagecnt(szc); 4104 } 4105 while (lastpp != firstpp) { 4106 ASSERT(lastpp->p_szc == pszc); 4107 lastpp->p_szc = szc; 4108 lastpp--; 4109 } 4110 firstpp->p_szc = szc; 4111 if (firstpp == rootpp) 4112 break; 4113 szc++; 4114 } 4115 x86_hm_exit(rootpp); 4116 } 4117 4118 /* 4119 * Get hw stats from the hardware into the page struct and reset the hw 4120 * stats. Returns the attributes of the page. 4121 * Flags for hat_pagesync, hat_getstat, hat_sync 4122 * 4123 * define HAT_SYNC_ZERORM 0x01 4124 * 4125 * Additional flags for hat_pagesync 4126 * 4127 * define HAT_SYNC_STOPON_REF 0x02 4128 * define HAT_SYNC_STOPON_MOD 0x04 4129 * define HAT_SYNC_STOPON_RM 0x06 4130 * define HAT_SYNC_STOPON_SHARED 0x08 4131 */ 4132 uint_t 4133 hat_pagesync(struct page *pp, uint_t flags) 4134 { 4135 hment_t *hm = NULL; 4136 htable_t *ht; 4137 uint_t entry; 4138 x86pte_t old, save_old; 4139 x86pte_t new; 4140 uchar_t nrmbits = P_REF|P_MOD|P_RO; 4141 extern ulong_t po_share; 4142 page_t *save_pp = pp; 4143 uint_t pszc = 0; 4144 4145 ASSERT(PAGE_LOCKED(pp) || panicstr); 4146 4147 if (PP_ISRO(pp) && (flags & HAT_SYNC_STOPON_MOD)) 4148 return (pp->p_nrm & nrmbits); 4149 4150 if ((flags & HAT_SYNC_ZERORM) == 0) { 4151 4152 if ((flags & HAT_SYNC_STOPON_REF) != 0 && PP_ISREF(pp)) 4153 return (pp->p_nrm & nrmbits); 4154 4155 if ((flags & HAT_SYNC_STOPON_MOD) != 0 && PP_ISMOD(pp)) 4156 return (pp->p_nrm & nrmbits); 4157 4158 if ((flags & HAT_SYNC_STOPON_SHARED) != 0 && 4159 hat_page_getshare(pp) >
po_share) { 4160 if (PP_ISRO(pp)) 4161 PP_SETREF(pp); 4162 return (pp->p_nrm & nrmbits); 4163 } 4164 } 4165 4166 XPV_DISALLOW_MIGRATE(); 4167 next_size: 4168 /* 4169 * Walk through the mapping list syncing (and clearing) ref/mod bits. 4170 */ 4171 x86_hm_enter(pp); 4172 while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL) { 4173 if (ht->ht_level < pszc) 4174 continue; 4175 old = x86pte_get(ht, entry); 4176 try_again: 4177 4178 ASSERT(PTE2PFN(old, ht->ht_level) == pp->p_pagenum); 4179 4180 if (PTE_GET(old, PT_REF | PT_MOD) == 0) 4181 continue; 4182 4183 save_old = old; 4184 if ((flags & HAT_SYNC_ZERORM) != 0) { 4185 4186 /* 4187 * Need to clear ref or mod bits. Need to demap 4188 * to make sure any executing TLBs see cleared bits. 4189 */ 4190 new = old; 4191 PTE_CLR(new, PT_REF | PT_MOD); 4192 old = hati_update_pte(ht, entry, old, new); 4193 if (old != 0) 4194 goto try_again; 4195 4196 old = save_old; 4197 } 4198 4199 /* 4200 * Sync the PTE 4201 */ 4202 if (!(flags & HAT_SYNC_ZERORM) && 4203 PTE_GET(old, PT_SOFTWARE) <= PT_NOSYNC) 4204 hati_sync_pte_to_page(pp, old, ht->ht_level); 4205 4206 /* 4207 * We can stop short if we found a ref'd or mod'd page 4208 */ 4209 if ((flags & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp) || 4210 (flags & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)) { 4211 x86_hm_exit(pp); 4212 goto done; 4213 } 4214 } 4215 x86_hm_exit(pp); 4216 while (pszc < pp->p_szc) { 4217 page_t *tpp; 4218 pszc++; 4219 tpp = PP_GROUPLEADER(pp, pszc); 4220 if (pp != tpp) { 4221 pp = tpp; 4222 goto next_size; 4223 } 4224 } 4225 done: 4226 XPV_ALLOW_MIGRATE(); 4227 return (save_pp->p_nrm & nrmbits); 4228 } 4229 4230 /* 4231 * Returns the approximate number of mappings to this pp. A return of 0 4232 * implies there are no mappings to the page. 4233 */ 4234 ulong_t 4235 hat_page_getshare(page_t *pp) 4236 { 4237 uint_t cnt; 4238 cnt = hment_mapcnt(pp); 4239 if (vpm_enable && pp->p_vpmref) { 4240 cnt += 1; 4241 } 4242 return (cnt); 4243 } 4244 4245 /* 4246 * Return 1 if the number of mappings exceeds sh_thresh. Return 0 4247 * otherwise. 4248 */ 4249 int 4250 hat_page_checkshare(page_t *pp, ulong_t sh_thresh) 4251 { 4252 return (hat_page_getshare(pp) > sh_thresh); 4253 } 4254 4255 /* 4256 * hat_softlock isn't supported anymore 4257 */ 4258 /*ARGSUSED*/ 4259 faultcode_t 4260 hat_softlock( 4261 hat_t *hat, 4262 caddr_t addr, 4263 size_t *len, 4264 struct page **page_array, 4265 uint_t flags) 4266 { 4267 return (FC_NOSUPPORT); 4268 } 4269 4270 4271 4272 /* 4273 * Routine to expose supported HAT features to platform independent code. 4274 */ 4275 /*ARGSUSED*/ 4276 int 4277 hat_supported(enum hat_features feature, void *arg) 4278 { 4279 switch (feature) { 4280 4281 case HAT_SHARED_PT: /* this is really ISM */ 4282 return (1); 4283 4284 case HAT_DYNAMIC_ISM_UNMAP: 4285 return (0); 4286 4287 case HAT_VMODSORT: 4288 return (1); 4289 4290 case HAT_SHARED_REGIONS: 4291 return (0); 4292 4293 default: 4294 panic("hat_supported() - unknown feature"); 4295 } 4296 return (0); 4297 } 4298 4299 /* 4300 * Called when a thread is exiting and has been switched to the kernel AS 4301 */ 4302 void 4303 hat_thread_exit(kthread_t *thd) 4304 { 4305 ASSERT(thd->t_procp->p_as == &kas); 4306 XPV_DISALLOW_MIGRATE(); 4307 hat_switch(thd->t_procp->p_as->a_hat); 4308 XPV_ALLOW_MIGRATE(); 4309 } 4310 4311 /* 4312 * Set up the given brand new hat structure as the new HAT on this cpu's mmu.
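 * All of the real work is done by hat_switch(); preemption is disabled
 * around the switch so the thread cannot migrate to another CPU while the
 * new page tables are being installed.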
4313 */ 4314 /*ARGSUSED*/ 4315 void 4316 hat_setup(hat_t *hat, int flags) 4317 { 4318 XPV_DISALLOW_MIGRATE(); 4319 kpreempt_disable(); 4320 4321 hat_switch(hat); 4322 4323 kpreempt_enable(); 4324 XPV_ALLOW_MIGRATE(); 4325 } 4326 4327 /* 4328 * Prepare for a CPU private mapping for the given address. 4329 * 4330 * The address can only be used from a single CPU and can be remapped 4331 * using hat_mempte_remap(). Return the address of the PTE. 4332 * 4333 * We do the htable_create() if necessary and increment the valid count so 4334 * the htable can't disappear. We also hat_devload() the page table into 4335 * kernel so that the PTE is quickly accessed. 4336 */ 4337 hat_mempte_t 4338 hat_mempte_setup(caddr_t addr) 4339 { 4340 uintptr_t va = (uintptr_t)addr; 4341 htable_t *ht; 4342 uint_t entry; 4343 x86pte_t oldpte; 4344 hat_mempte_t p; 4345 4346 ASSERT(IS_PAGEALIGNED(va)); 4347 ASSERT(!IN_VA_HOLE(va)); 4348 ++curthread->t_hatdepth; 4349 XPV_DISALLOW_MIGRATE(); 4350 ht = htable_getpte(kas.a_hat, va, &entry, &oldpte, 0); 4351 if (ht == NULL) { 4352 ht = htable_create(kas.a_hat, va, 0, NULL); 4353 entry = htable_va2entry(va, ht); 4354 ASSERT(ht->ht_level == 0); 4355 oldpte = x86pte_get(ht, entry); 4356 } 4357 if (PTE_ISVALID(oldpte)) 4358 panic("hat_mempte_setup(): address already mapped" 4359 "ht=%p, entry=%d, pte=" FMT_PTE, (void *)ht, entry, oldpte); 4360 4361 /* 4362 * increment ht_valid_cnt so that the pagetable can't disappear 4363 */ 4364 HTABLE_INC(ht->ht_valid_cnt); 4365 4366 /* 4367 * return the PTE physical address to the caller. 4368 */ 4369 htable_release(ht); 4370 XPV_ALLOW_MIGRATE(); 4371 p = PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry); 4372 --curthread->t_hatdepth; 4373 return (p); 4374 } 4375 4376 /* 4377 * Release a CPU private mapping for the given address. 4378 * We decrement the htable valid count so it might be destroyed. 4379 */ 4380 /*ARGSUSED1*/ 4381 void 4382 hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa) 4383 { 4384 htable_t *ht; 4385 4386 XPV_DISALLOW_MIGRATE(); 4387 /* 4388 * invalidate any left over mapping and decrement the htable valid count 4389 */ 4390 #ifdef __xpv 4391 if (HYPERVISOR_update_va_mapping((uintptr_t)addr, 0, 4392 UVMF_INVLPG | UVMF_LOCAL)) 4393 panic("HYPERVISOR_update_va_mapping() failed"); 4394 #else 4395 { 4396 x86pte_t *pteptr; 4397 4398 pteptr = x86pte_mapin(mmu_btop(pte_pa), 4399 (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL); 4400 if (mmu.pae_hat) 4401 *pteptr = 0; 4402 else 4403 *(x86pte32_t *)pteptr = 0; 4404 mmu_flush_tlb_kpage((uintptr_t)addr); 4405 x86pte_mapout(); 4406 } 4407 #endif 4408 4409 ht = htable_getpte(kas.a_hat, ALIGN2PAGE(addr), NULL, NULL, 0); 4410 if (ht == NULL) 4411 panic("hat_mempte_release(): invalid address"); 4412 ASSERT(ht->ht_level == 0); 4413 HTABLE_DEC(ht->ht_valid_cnt); 4414 htable_release(ht); 4415 XPV_ALLOW_MIGRATE(); 4416 } 4417 4418 /* 4419 * Apply a temporary CPU private mapping to a page. We flush the TLB only 4420 * on this CPU, so this ought to have been called with preemption disabled. 4421 */ 4422 void 4423 hat_mempte_remap( 4424 pfn_t pfn, 4425 caddr_t addr, 4426 hat_mempte_t pte_pa, 4427 uint_t attr, 4428 uint_t flags) 4429 { 4430 uintptr_t va = (uintptr_t)addr; 4431 x86pte_t pte; 4432 4433 /* 4434 * Remap the given PTE to the new page's PFN. Invalidate only 4435 * on this CPU. 
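 *
 * A minimal usage sketch of the hat_mempte_*() interfaces, for illustration
 * only ("priv_va" is a hypothetical page-aligned, CPU-private address and
 * the attributes shown are just an example):
 *
 *	hat_mempte_t pte_pa = hat_mempte_setup(priv_va);
 *	...
 *	kpreempt_disable();
 *	hat_mempte_remap(pfn, priv_va, pte_pa,
 *	    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
 *	    HAT_LOAD_NOCONSIST);
 *	(access the page through priv_va on this CPU)
 *	kpreempt_enable();
 *	...
 *	hat_mempte_release(priv_va, pte_pa);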
4436 */ 4437 #ifdef DEBUG 4438 htable_t *ht; 4439 uint_t entry; 4440 4441 ASSERT(IS_PAGEALIGNED(va)); 4442 ASSERT(!IN_VA_HOLE(va)); 4443 ht = htable_getpte(kas.a_hat, va, &entry, NULL, 0); 4444 ASSERT(ht != NULL); 4445 ASSERT(ht->ht_level == 0); 4446 ASSERT(ht->ht_valid_cnt > 0); 4447 ASSERT(ht->ht_pfn == mmu_btop(pte_pa)); 4448 htable_release(ht); 4449 #endif 4450 XPV_DISALLOW_MIGRATE(); 4451 pte = hati_mkpte(pfn, attr, 0, flags); 4452 #ifdef __xpv 4453 if (HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG | UVMF_LOCAL)) 4454 panic("HYPERVISOR_update_va_mapping() failed"); 4455 #else 4456 { 4457 x86pte_t *pteptr; 4458 4459 pteptr = x86pte_mapin(mmu_btop(pte_pa), 4460 (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL); 4461 if (mmu.pae_hat) 4462 *(x86pte_t *)pteptr = pte; 4463 else 4464 *(x86pte32_t *)pteptr = (x86pte32_t)pte; 4465 mmu_flush_tlb_kpage((uintptr_t)addr); 4466 x86pte_mapout(); 4467 } 4468 #endif 4469 XPV_ALLOW_MIGRATE(); 4470 } 4471 4472 4473 4474 /* 4475 * Hat locking functions 4476 * XXX - these two functions are currently being used by hatstats 4477 * they can be removed by using a per-as mutex for hatstats. 4478 */ 4479 void 4480 hat_enter(hat_t *hat) 4481 { 4482 mutex_enter(&hat->hat_mutex); 4483 } 4484 4485 void 4486 hat_exit(hat_t *hat) 4487 { 4488 mutex_exit(&hat->hat_mutex); 4489 } 4490 4491 /* 4492 * HAT part of cpu initialization. 4493 */ 4494 void 4495 hat_cpu_online(struct cpu *cpup) 4496 { 4497 if (cpup != CPU) { 4498 x86pte_cpu_init(cpup); 4499 hat_pcp_setup(cpup); 4500 } 4501 CPUSET_ATOMIC_ADD(khat_cpuset, cpup->cpu_id); 4502 } 4503 4504 /* 4505 * HAT part of cpu deletion. 4506 * (currently, we only call this after the cpu is safely passivated.) 4507 */ 4508 void 4509 hat_cpu_offline(struct cpu *cpup) 4510 { 4511 ASSERT(cpup != CPU); 4512 4513 CPUSET_ATOMIC_DEL(khat_cpuset, cpup->cpu_id); 4514 hat_pcp_teardown(cpup); 4515 x86pte_cpu_fini(cpup); 4516 } 4517 4518 /* 4519 * Function called after all CPUs are brought online. 4520 * Used to remove low address boot mappings. 4521 */ 4522 void 4523 clear_boot_mappings(uintptr_t low, uintptr_t high) 4524 { 4525 uintptr_t vaddr = low; 4526 htable_t *ht = NULL; 4527 level_t level; 4528 uint_t entry; 4529 x86pte_t pte; 4530 4531 /* 4532 * On 1st CPU we can unload the prom mappings, basically we blow away 4533 * all virtual mappings under _userlimit. 4534 */ 4535 while (vaddr < high) { 4536 pte = htable_walk(kas.a_hat, &ht, &vaddr, high); 4537 if (ht == NULL) 4538 break; 4539 4540 level = ht->ht_level; 4541 entry = htable_va2entry(vaddr, ht); 4542 ASSERT(level <= mmu.max_page_level); 4543 ASSERT(PTE_ISPAGE(pte, level)); 4544 4545 /* 4546 * Unload the mapping from the page tables. 4547 */ 4548 (void) x86pte_inval(ht, entry, 0, NULL, B_TRUE); 4549 ASSERT(ht->ht_valid_cnt > 0); 4550 HTABLE_DEC(ht->ht_valid_cnt); 4551 PGCNT_DEC(ht->ht_hat, ht->ht_level); 4552 4553 vaddr += LEVEL_SIZE(ht->ht_level); 4554 } 4555 if (ht) 4556 htable_release(ht); 4557 } 4558 4559 /* 4560 * Atomically update a new translation for a single page. If the 4561 * currently installed PTE doesn't match the value we expect to find, 4562 * it's not updated and we return the PTE we found. 4563 * 4564 * If activating nosync or NOWRITE and the page was modified we need to sync 4565 * with the page_t. Also sync with page_t if clearing ref/mod bits. 
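 *
 * Returns 0 if the PTE was updated (i.e. the installed PTE matched
 * 'expected'); otherwise returns the PTE actually found, and the caller is
 * expected to re-evaluate and retry with that value.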
4566 */ 4567 static x86pte_t 4568 hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected, x86pte_t new) 4569 { 4570 page_t *pp; 4571 uint_t rm = 0; 4572 x86pte_t replaced; 4573 4574 if (PTE_GET(expected, PT_SOFTWARE) < PT_NOSYNC && 4575 PTE_GET(expected, PT_MOD | PT_REF) && 4576 (PTE_GET(new, PT_NOSYNC) || !PTE_GET(new, PT_WRITABLE) || 4577 !PTE_GET(new, PT_MOD | PT_REF))) { 4578 4579 ASSERT(!pfn_is_foreign(PTE2PFN(expected, ht->ht_level))); 4580 pp = page_numtopp_nolock(PTE2PFN(expected, ht->ht_level)); 4581 ASSERT(pp != NULL); 4582 if (PTE_GET(expected, PT_MOD)) 4583 rm |= P_MOD; 4584 if (PTE_GET(expected, PT_REF)) 4585 rm |= P_REF; 4586 PTE_CLR(new, PT_MOD | PT_REF); 4587 } 4588 4589 replaced = x86pte_update(ht, entry, expected, new); 4590 if (replaced != expected) 4591 return (replaced); 4592 4593 if (rm) { 4594 /* 4595 * sync to all constituent pages of a large page 4596 */ 4597 pgcnt_t pgcnt = page_get_pagecnt(ht->ht_level); 4598 ASSERT(IS_P2ALIGNED(pp->p_pagenum, pgcnt)); 4599 while (pgcnt-- > 0) { 4600 /* 4601 * hat_page_demote() can't decrease 4602 * pszc below this mapping size 4603 * since large mapping existed after we 4604 * took mlist lock. 4605 */ 4606 ASSERT(pp->p_szc >= ht->ht_level); 4607 hat_page_setattr(pp, rm); 4608 ++pp; 4609 } 4610 } 4611 4612 return (0); 4613 } 4614 4615 /* ARGSUSED */ 4616 void 4617 hat_join_srd(struct hat *hat, vnode_t *evp) 4618 { 4619 } 4620 4621 /* ARGSUSED */ 4622 hat_region_cookie_t 4623 hat_join_region(struct hat *hat, 4624 caddr_t r_saddr, 4625 size_t r_size, 4626 void *r_obj, 4627 u_offset_t r_objoff, 4628 uchar_t r_perm, 4629 uchar_t r_pgszc, 4630 hat_rgn_cb_func_t r_cb_function, 4631 uint_t flags) 4632 { 4633 panic("No shared region support on x86"); 4634 return (HAT_INVALID_REGION_COOKIE); 4635 } 4636 4637 /* ARGSUSED */ 4638 void 4639 hat_leave_region(struct hat *hat, hat_region_cookie_t rcookie, uint_t flags) 4640 { 4641 panic("No shared region support on x86"); 4642 } 4643 4644 /* ARGSUSED */ 4645 void 4646 hat_dup_region(struct hat *hat, hat_region_cookie_t rcookie) 4647 { 4648 panic("No shared region support on x86"); 4649 } 4650 4651 4652 /* 4653 * Kernel Physical Mapping (kpm) facility 4654 * 4655 * Most of the routines needed to support segkpm are almost no-ops on the 4656 * x86 platform. We map in the entire segment when it is created and leave 4657 * it mapped in, so there is no additional work required to set up and tear 4658 * down individual mappings. All of these routines were created to support 4659 * SPARC platforms that have to avoid aliasing in their virtually indexed 4660 * caches. 4661 * 4662 * Most of the routines have sanity checks in them (e.g. verifying that the 4663 * passed-in page is locked). We don't actually care about most of these 4664 * checks on x86, but we leave them in place to identify problems in the 4665 * upper levels. 4666 */ 4667 4668 /* 4669 * Map in a locked page and return the vaddr. 4670 */ 4671 /*ARGSUSED*/ 4672 caddr_t 4673 hat_kpm_mapin(struct page *pp, struct kpme *kpme) 4674 { 4675 caddr_t vaddr; 4676 4677 #ifdef DEBUG 4678 if (kpm_enable == 0) { 4679 cmn_err(CE_WARN, "hat_kpm_mapin: kpm_enable not set\n"); 4680 return ((caddr_t)NULL); 4681 } 4682 4683 if (pp == NULL || PAGE_LOCKED(pp) == 0) { 4684 cmn_err(CE_WARN, "hat_kpm_mapin: pp zero or not locked\n"); 4685 return ((caddr_t)NULL); 4686 } 4687 #endif 4688 4689 vaddr = hat_kpm_page2va(pp, 1); 4690 4691 return (vaddr); 4692 } 4693 4694 /* 4695 * Mapout a locked page. 
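 * On x86 this is a no-op aside from the DEBUG sanity checks below; segkpm
 * mappings are established when the segment is created and are never torn
 * down per page.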
4696 */ 4697 /*ARGSUSED*/ 4698 void 4699 hat_kpm_mapout(struct page *pp, struct kpme *kpme, caddr_t vaddr) 4700 { 4701 #ifdef DEBUG 4702 if (kpm_enable == 0) { 4703 cmn_err(CE_WARN, "hat_kpm_mapout: kpm_enable not set\n"); 4704 return; 4705 } 4706 4707 if (IS_KPM_ADDR(vaddr) == 0) { 4708 cmn_err(CE_WARN, "hat_kpm_mapout: no kpm address\n"); 4709 return; 4710 } 4711 4712 if (pp == NULL || PAGE_LOCKED(pp) == 0) { 4713 cmn_err(CE_WARN, "hat_kpm_mapout: page zero or not locked\n"); 4714 return; 4715 } 4716 #endif 4717 } 4718 4719 /* 4720 * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical 4721 * memory addresses that are not described by a page_t. It can 4722 * also be used for normal pages that are not locked, but beware 4723 * this is dangerous - no locking is performed, so the identity of 4724 * the page could change. hat_kpm_mapin_pfn is not supported when 4725 * vac_colors > 1, because the chosen va depends on the page identity, 4726 * which could change. 4727 * The caller must only pass pfn's for valid physical addresses; violation 4728 * of this rule will cause panic. 4729 */ 4730 caddr_t 4731 hat_kpm_mapin_pfn(pfn_t pfn) 4732 { 4733 caddr_t paddr, vaddr; 4734 4735 if (kpm_enable == 0) 4736 return ((caddr_t)NULL); 4737 4738 paddr = (caddr_t)ptob(pfn); 4739 vaddr = (uintptr_t)kpm_vbase + paddr; 4740 4741 return ((caddr_t)vaddr); 4742 } 4743 4744 /*ARGSUSED*/ 4745 void 4746 hat_kpm_mapout_pfn(pfn_t pfn) 4747 { 4748 /* empty */ 4749 } 4750 4751 /* 4752 * Return the kpm virtual address for a specific pfn 4753 */ 4754 caddr_t 4755 hat_kpm_pfn2va(pfn_t pfn) 4756 { 4757 uintptr_t vaddr = (uintptr_t)kpm_vbase + mmu_ptob(pfn); 4758 4759 ASSERT(!pfn_is_foreign(pfn)); 4760 return ((caddr_t)vaddr); 4761 } 4762 4763 /* 4764 * Return the kpm virtual address for the page at pp. 4765 */ 4766 /*ARGSUSED*/ 4767 caddr_t 4768 hat_kpm_page2va(struct page *pp, int checkswap) 4769 { 4770 return (hat_kpm_pfn2va(pp->p_pagenum)); 4771 } 4772 4773 /* 4774 * Return the page frame number for the kpm virtual address vaddr. 4775 */ 4776 pfn_t 4777 hat_kpm_va2pfn(caddr_t vaddr) 4778 { 4779 pfn_t pfn; 4780 4781 ASSERT(IS_KPM_ADDR(vaddr)); 4782 4783 pfn = (pfn_t)btop(vaddr - kpm_vbase); 4784 4785 return (pfn); 4786 } 4787 4788 4789 /* 4790 * Return the page for the kpm virtual address vaddr. 4791 */ 4792 page_t * 4793 hat_kpm_vaddr2page(caddr_t vaddr) 4794 { 4795 pfn_t pfn; 4796 4797 ASSERT(IS_KPM_ADDR(vaddr)); 4798 4799 pfn = hat_kpm_va2pfn(vaddr); 4800 4801 return (page_numtopp_nolock(pfn)); 4802 } 4803 4804 /* 4805 * hat_kpm_fault is called from segkpm_fault when we take a page fault on a 4806 * KPM page. This should never happen on x86 4807 */ 4808 int 4809 hat_kpm_fault(hat_t *hat, caddr_t vaddr) 4810 { 4811 panic("pagefault in seg_kpm. hat: 0x%p vaddr: 0x%p", 4812 (void *)hat, (void *)vaddr); 4813 4814 return (0); 4815 } 4816 4817 /*ARGSUSED*/ 4818 void 4819 hat_kpm_mseghash_clear(int nentries) 4820 {} 4821 4822 /*ARGSUSED*/ 4823 void 4824 hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp) 4825 {} 4826 4827 #ifndef __xpv 4828 void 4829 hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs, 4830 offset_t kpm_pages_off) 4831 { 4832 _NOTE(ARGUNUSED(nkpmpgs, kpm_pages_off)); 4833 pfn_t base, end; 4834 4835 /* 4836 * kphysm_add_memory_dynamic() does not set nkpmpgs 4837 * when page_t memory is externally allocated. That 4838 * code must properly calculate nkpmpgs in all cases 4839 * if nkpmpgs needs to be used at some point. 
4840 */ 4841 4842 /* 4843 * The meta (page_t) pages for dynamically added memory are allocated 4844 * either from the incoming memory itself or from existing memory. 4845 * In the former case the base of the incoming pages will be different 4846 * than the base of the dynamic segment so call memseg_get_start() to 4847 * get the actual base of the incoming memory for each case. 4848 */ 4849 4850 base = memseg_get_start(msp); 4851 end = msp->pages_end; 4852 4853 hat_devload(kas.a_hat, kpm_vbase + mmu_ptob(base), 4854 mmu_ptob(end - base), base, PROT_READ | PROT_WRITE, 4855 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 4856 } 4857 4858 void 4859 hat_kpm_addmem_mseg_insert(struct memseg *msp) 4860 { 4861 _NOTE(ARGUNUSED(msp)); 4862 } 4863 4864 void 4865 hat_kpm_addmem_memsegs_update(struct memseg *msp) 4866 { 4867 _NOTE(ARGUNUSED(msp)); 4868 } 4869 4870 /* 4871 * Return end of metadata for an already setup memseg. 4872 * X86 platforms don't need per-page meta data to support kpm. 4873 */ 4874 caddr_t 4875 hat_kpm_mseg_reuse(struct memseg *msp) 4876 { 4877 return ((caddr_t)msp->epages); 4878 } 4879 4880 void 4881 hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp) 4882 { 4883 _NOTE(ARGUNUSED(msp, mspp)); 4884 ASSERT(0); 4885 } 4886 4887 void 4888 hat_kpm_split_mseg_update(struct memseg *msp, struct memseg **mspp, 4889 struct memseg *lo, struct memseg *mid, struct memseg *hi) 4890 { 4891 _NOTE(ARGUNUSED(msp, mspp, lo, mid, hi)); 4892 ASSERT(0); 4893 } 4894 4895 /* 4896 * Walk the memsegs chain, applying func to each memseg span. 4897 */ 4898 void 4899 hat_kpm_walk(void (*func)(void *, void *, size_t), void *arg) 4900 { 4901 pfn_t pbase, pend; 4902 void *base; 4903 size_t size; 4904 struct memseg *msp; 4905 4906 for (msp = memsegs; msp; msp = msp->next) { 4907 pbase = msp->pages_base; 4908 pend = msp->pages_end; 4909 base = ptob(pbase) + kpm_vbase; 4910 size = ptob(pend - pbase); 4911 func(arg, base, size); 4912 } 4913 } 4914 4915 #else /* __xpv */ 4916 4917 /* 4918 * There are specific Hypervisor calls to establish and remove mappings 4919 * to grant table references and the privcmd driver. We have to ensure 4920 * that a page table actually exists. 4921 */ 4922 void 4923 hat_prepare_mapping(hat_t *hat, caddr_t addr, uint64_t *pte_ma) 4924 { 4925 maddr_t base_ma; 4926 htable_t *ht; 4927 uint_t entry; 4928 4929 ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE)); 4930 XPV_DISALLOW_MIGRATE(); 4931 ht = htable_create(hat, (uintptr_t)addr, 0, NULL); 4932 4933 /* 4934 * if an address for pte_ma is passed in, return the MA of the pte 4935 * for this specific address. This address is only valid as long 4936 * as the htable stays locked. 4937 */ 4938 if (pte_ma != NULL) { 4939 entry = htable_va2entry((uintptr_t)addr, ht); 4940 base_ma = pa_to_ma(ptob(ht->ht_pfn)); 4941 *pte_ma = base_ma + (entry << mmu.pte_size_shift); 4942 } 4943 XPV_ALLOW_MIGRATE(); 4944 } 4945 4946 void 4947 hat_release_mapping(hat_t *hat, caddr_t addr) 4948 { 4949 htable_t *ht; 4950 4951 ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE)); 4952 XPV_DISALLOW_MIGRATE(); 4953 ht = htable_lookup(hat, (uintptr_t)addr, 0); 4954 ASSERT(ht != NULL); 4955 ASSERT(ht->ht_busy >= 2); 4956 htable_release(ht); 4957 htable_release(ht); 4958 XPV_ALLOW_MIGRATE(); 4959 } 4960 #endif /* __xpv */ 4961 4962 /* 4963 * Helper function to punch in a mapping that we need with the specified 4964 * attributes. 
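 * This is only used when kpti is enabled: the PFN backing va is looked up
 * in the kernel's HAT and a mapping for it is loaded into this CPU's user
 * (PCP) hat, so the page stays reachable while running on the user page
 * tables.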
4965 */ 4966 void 4967 hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs) 4968 { 4969 int ret; 4970 pfn_t pfn; 4971 hat_t *cpu_hat = cpu->cpu_hat_info->hci_user_hat; 4972 4973 ASSERT3S(kpti_enable, ==, 1); 4974 ASSERT3P(cpu_hat, !=, NULL); 4975 ASSERT3U(cpu_hat->hat_flags & HAT_PCP, ==, HAT_PCP); 4976 ASSERT3U(va & MMU_PAGEOFFSET, ==, 0); 4977 4978 pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); 4979 VERIFY3U(pfn, !=, PFN_INVALID); 4980 4981 /* 4982 * We purposefully don't try to find the page_t. This means that this 4983 * will be marked PT_NOCONSIST; however, given that this is pretty much 4984 * a static mapping that we're using we should be relatively OK. 4985 */ 4986 attrs |= HAT_STORECACHING_OK; 4987 ret = hati_load_common(cpu_hat, va, NULL, attrs, 0, 0, pfn); 4988 VERIFY3S(ret, ==, 0); 4989 } 4990