1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* 25 * Copyright (c) 2010, Intel Corporation. 26 * All rights reserved. 27 */ 28 /* 29 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 30 * Copyright 2018 Joyent, Inc. All rights reserved. 31 * Copyright (c) 2014, 2015 by Delphix. All rights reserved. 32 */ 33 34 /* 35 * VM - Hardware Address Translation management for i386 and amd64 36 * 37 * Implementation of the interfaces described in <common/vm/hat.h> 38 * 39 * Nearly all the details of how the hardware is managed should not be 40 * visible outside this layer except for misc. machine specific functions 41 * that work in conjunction with this code. 42 * 43 * Routines used only inside of i86pc/vm start with hati_ for HAT Internal. 44 */ 45 46 /* 47 * amd64 HAT Design 48 * 49 * ---------- 50 * Background 51 * ---------- 52 * 53 * On x86, the address space is shared between a user process and the kernel. 54 * This is different from SPARC. Conventionally, the kernel lives at the top of 55 * the address space and the user process gets to enjoy the rest of it. If you 56 * look at the image of the address map in uts/i86pc/os/startup.c, you'll get a 57 * rough sense of how the address space is laid out and used. 58 * 59 * Every unique address space is represented by an instance of a HAT structure 60 * called a 'hat_t'. In addition to a hat_t structure for each process, there is 61 * also one that is used for the kernel (kas.a_hat), and each CPU ultimately 62 * also has a HAT. 63 * 64 * Each HAT contains a pointer to its root page table. This root page table is 65 * what we call an L3 page table in illumos and Intel calls the PML4. It is the 66 * physical address of the L3 table that we place in the %cr3 register which the 67 * processor uses. 68 * 69 * Each of the many layers of the page table is represented by a structure 70 * called an htable_t. The htable_t manages a set of 512 8-byte entries. The 71 * number of entries in a given page table is constant across all different 72 * level page tables. Note, this is only true on amd64. This has not always been 73 * the case on x86. 74 * 75 * Each entry in a page table, generally referred to as a PTE, may refer to 76 * another page table or a memory location, depending on the level of the page 77 * table and the use of large pages. Importantly, the top-level L3 page table 78 * (PML4) only supports linking to further page tables. This is also true on 79 * systems which support a 5th level page table (which we do not currently 80 * support). 
81 * 82 * Historically, on x86, when a process was running on CPU, the root of the page 83 * table was inserted into %cr3 on each CPU on which it was currently running. 84 * When processes would switch (by calling hat_switch()), then the value in %cr3 85 * on that CPU would change to that of the new HAT. While this behavior is still 86 * maintained in the xpv kernel, this is not what is done today. 87 * 88 * ------------------- 89 * Per-CPU Page Tables 90 * ------------------- 91 * 92 * Throughout the system the 64-bit kernel has a notion of what it calls a 93 * per-CPU page table or PCP. The notion of a per-CPU page table was originally 94 * introduced as part of the original work to support x86 PAE. On the 64-bit 95 * kernel, it was originally used for 32-bit processes running on the 64-bit 96 * kernel. The rationale behind this was that each 32-bit process could have all 97 * of its memory represented in a single L2 page table as each L2 page table 98 * entry represents 1 GbE of memory. 99 * 100 * Following on from this, the idea was that given that all of the L3 page table 101 * entries for 32-bit processes are basically going to be identical with the 102 * exception of the first entry in the page table, why not share those page 103 * table entries. This gave rise to the idea of a per-CPU page table. 104 * 105 * The way this works is that we have a member in the machcpu_t called the 106 * mcpu_hat_info. That structure contains two different 4k pages: one that 107 * represents the L3 page table and one that represents an L2 page table. When 108 * the CPU starts up, the L3 page table entries are copied in from the kernel's 109 * page table. The L3 kernel entries do not change throughout the lifetime of 110 * the kernel. The kernel portion of these L3 pages for each CPU have the same 111 * records, meaning that they point to the same L2 page tables and thus see a 112 * consistent view of the world. 113 * 114 * When a 32-bit process is loaded into this world, we copy the 32-bit process's 115 * four top-level page table entries into the CPU's L2 page table and then set 116 * the CPU's first L3 page table entry to point to the CPU's L2 page. 117 * Specifically, in hat_pcp_update(), we're copying from the process's 118 * HAT_COPIED_32 HAT into the page tables specific to this CPU. 119 * 120 * As part of the implementation of kernel page table isolation, this was also 121 * extended to 64-bit processes. When a 64-bit process runs, we'll copy their L3 122 * PTEs across into the current CPU's L3 page table. (As we can't do the 123 * first-L3-entry trick for 64-bit processes, ->hci_pcp_l2ptes is unused in this 124 * case.) 125 * 126 * The use of per-CPU page tables has a lot of implementation ramifications. A 127 * HAT that runs a user process will be flagged with the HAT_COPIED flag to 128 * indicate that it is using the per-CPU page table functionality. In tandem 129 * with the HAT, the top-level htable_t will be flagged with the HTABLE_COPIED 130 * flag. If the HAT represents a 32-bit process, then we will also set the 131 * HAT_COPIED_32 flag on that hat_t. 132 * 133 * These two flags work together. The top-level htable_t when using per-CPU page 134 * tables is 'virtual'. We never allocate a ptable for this htable_t (i.e. 135 * ht->ht_pfn is PFN_INVALID). Instead, when we need to modify a PTE in an 136 * HTABLE_COPIED ptable, x86pte_access_pagetable() will redirect any accesses to 137 * ht_hat->hat_copied_ptes. 
138 * 139 * Of course, such a modification won't actually modify the HAT_PCP page tables 140 * that were copied from the HAT_COPIED htable. When we change the top level 141 * page table entries (L2 PTEs for a 32-bit process and L3 PTEs for a 64-bit 142 * process), we need to make sure to trigger hat_pcp_update() on all CPUs that 143 * are currently tied to this HAT (including the current CPU). 144 * 145 * To do this, PCP piggy-backs on TLB invalidation, specifically via the 146 * hat_tlb_inval() path from link_ptp() and unlink_ptp(). 147 * 148 * (Importantly, in all such cases, when this is in operation, the top-level 149 * entry should not be able to refer to an actual page table entry that can be 150 * changed and consolidated into a large page. If large page consolidation is 151 * required here, then there will be much that needs to be reconsidered.) 152 * 153 * ----------------------------------------------- 154 * Kernel Page Table Isolation and the Per-CPU HAT 155 * ----------------------------------------------- 156 * 157 * All Intel CPUs that support speculative execution and paging are subject to a 158 * series of bugs that have been termed 'Meltdown'. These exploits allow a user 159 * process to read kernel memory through cache side channels and speculative 160 * execution. To mitigate this on vulnerable CPUs, we need to use a technique 161 * called kernel page table isolation. What this requires is that we have two 162 * different page table roots. When executing in kernel mode, we will use a %cr3 163 * value that has both the user and kernel pages. However when executing in user 164 * mode, we will need to have a %cr3 that has all of the user pages; however, 165 * only a subset of the kernel pages required to operate. 166 * 167 * These kernel pages that we need mapped are: 168 * 169 * o Kernel Text that allows us to switch between the cr3 values. 170 * o The current global descriptor table (GDT) 171 * o The current interrupt descriptor table (IDT) 172 * o The current task switching state (TSS) 173 * o The current local descriptor table (LDT) 174 * o Stacks and scratch space used by the interrupt handlers 175 * 176 * For more information on the stack switching techniques, construction of the 177 * trampolines, and more, please see i86pc/ml/kpti_trampolines.s. The most 178 * important part of these mappings are the following two constraints: 179 * 180 * o The mappings are all per-CPU (except for read-only text) 181 * o The mappings are static. They are all established before the CPU is 182 * started (with the exception of the boot CPU). 183 * 184 * To facilitate the kernel page table isolation we employ our per-CPU 185 * page tables discussed in the previous section and add the notion of a per-CPU 186 * HAT. Fundamentally we have a second page table root. There is both a kernel 187 * page table (hci_pcp_l3ptes), and a user L3 page table (hci_user_l3ptes). 188 * Both will have the user page table entries copied into them, the same way 189 * that we discussed in the section 'Per-CPU Page Tables'. 190 * 191 * The complex part of this is how do we construct the set of kernel mappings 192 * that should be present when running with the user page table. To answer that, 193 * we add the notion of a per-CPU HAT. This HAT functions like a normal HAT, 194 * except that it's not really associated with an address space the same way 195 * that other HATs are. 196 * 197 * This HAT lives off of the 'struct hat_cpu_info' which is a member of the 198 * machcpu in the member hci_user_hat. 
We use this per-CPU HAT to create the set 199 * of kernel mappings that should be present on this CPU. The kernel mappings 200 * are added to the per-CPU HAT through the function hati_cpu_punchin(). Once a 201 * mapping has been punched in, it may not be punched out. The reason that we 202 * opt to leverage a HAT structure is that it knows how to allocate and manage 203 * all of the lower level page tables as required. 204 * 205 * Because all of the mappings are present at the beginning of time for this CPU 206 * and none of the mappings are in the kernel pageable segment, we don't have to 207 * worry about faulting on these HAT structures and thus the notion of the 208 * current HAT that we're using is always the appropriate HAT for the process 209 * (usually a user HAT or the kernel's HAT). 210 * 211 * A further constraint we place on the system with these per-CPU HATs is that 212 * they are not subject to htable_steal(). Because each CPU will have a rather 213 * fixed number of page tables, the same way that we don't steal from the 214 * kernel's HAT, it was determined that we should not steal from this HAT due to 215 * the complications involved and somewhat criminal nature of htable_steal(). 216 * 217 * The per-CPU HAT is initialized in hat_pcp_setup() which is called as part of 218 * onlining the CPU, but before the CPU is actually started. The per-CPU HAT is 219 * removed in hat_pcp_teardown() which is called when a CPU is being offlined to 220 * be removed from the system (which is different from what psradm usually 221 * does). 222 * 223 * Finally, once the CPU has been onlined, the set of mappings in the per-CPU 224 * HAT must not change. The HAT related functions that we call are not meant to 225 * be called when we're switching between processes. For example, it is quite 226 * possible that if they were, they would try to grab an htable mutex which 227 * another thread might have. One needs to treat hat_switch() as though they 228 * were above LOCK_LEVEL and therefore _must not_ block under any circumstance. 229 */ 230 231 #include <sys/machparam.h> 232 #include <sys/machsystm.h> 233 #include <sys/mman.h> 234 #include <sys/types.h> 235 #include <sys/systm.h> 236 #include <sys/cpuvar.h> 237 #include <sys/thread.h> 238 #include <sys/proc.h> 239 #include <sys/cpu.h> 240 #include <sys/kmem.h> 241 #include <sys/disp.h> 242 #include <sys/shm.h> 243 #include <sys/sysmacros.h> 244 #include <sys/machparam.h> 245 #include <sys/vmem.h> 246 #include <sys/vmsystm.h> 247 #include <sys/promif.h> 248 #include <sys/var.h> 249 #include <sys/x86_archext.h> 250 #include <sys/atomic.h> 251 #include <sys/bitmap.h> 252 #include <sys/controlregs.h> 253 #include <sys/bootconf.h> 254 #include <sys/bootsvcs.h> 255 #include <sys/bootinfo.h> 256 #include <sys/archsystm.h> 257 258 #include <vm/seg_kmem.h> 259 #include <vm/hat_i86.h> 260 #include <vm/as.h> 261 #include <vm/seg.h> 262 #include <vm/page.h> 263 #include <vm/seg_kp.h> 264 #include <vm/seg_kpm.h> 265 #include <vm/vm_dep.h> 266 #ifdef __xpv 267 #include <sys/hypervisor.h> 268 #endif 269 #include <vm/kboot_mmu.h> 270 #include <vm/seg_spt.h> 271 272 #include <sys/cmn_err.h> 273 274 /* 275 * Basic parameters for hat operation. 276 */ 277 struct hat_mmu_info mmu; 278 279 /* 280 * The page that is the kernel's top level pagetable. 281 * 282 * For 32 bit PAE support on i86pc, the kernel hat will use the 1st 4 entries 283 * on this 4K page for its top level page table. 
The remaining groups of 284 * 4 entries are used for per processor copies of user PCP pagetables for 285 * running threads. See hat_switch() and reload_pae32() for details. 286 * 287 * pcp_page[0..3] - level==2 PTEs for kernel HAT 288 * pcp_page[4..7] - level==2 PTEs for user thread on cpu 0 289 * pcp_page[8..11] - level==2 PTE for user thread on cpu 1 290 * etc... 291 * 292 * On the 64-bit kernel, this is the normal root of the page table and there is 293 * nothing special about it when used for other CPUs. 294 */ 295 static x86pte_t *pcp_page; 296 297 /* 298 * forward declaration of internal utility routines 299 */ 300 static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected, 301 x86pte_t new); 302 303 /* 304 * The kernel address space exists in all non-HAT_COPIED HATs. To implement this 305 * the kernel reserves a fixed number of entries in the topmost level(s) of page 306 * tables. The values are setup during startup and then copied to every user hat 307 * created by hat_alloc(). This means that kernelbase must be: 308 * 309 * 4Meg aligned for 32 bit kernels 310 * 512Gig aligned for x86_64 64 bit kernel 311 * 312 * The hat_kernel_range_ts describe what needs to be copied from kernel hat 313 * to each user hat. 314 */ 315 typedef struct hat_kernel_range { 316 level_t hkr_level; 317 uintptr_t hkr_start_va; 318 uintptr_t hkr_end_va; /* zero means to end of memory */ 319 } hat_kernel_range_t; 320 #define NUM_KERNEL_RANGE 2 321 static hat_kernel_range_t kernel_ranges[NUM_KERNEL_RANGE]; 322 static int num_kernel_ranges; 323 324 uint_t use_boot_reserve = 1; /* cleared after early boot process */ 325 uint_t can_steal_post_boot = 0; /* set late in boot to enable stealing */ 326 327 /* 328 * enable_1gpg: controls 1g page support for user applications. 329 * By default, 1g pages are exported to user applications. enable_1gpg can 330 * be set to 0 to not export. 331 */ 332 int enable_1gpg = 1; 333 334 /* 335 * AMD shanghai processors provide better management of 1gb ptes in its tlb. 336 * By default, 1g page support will be disabled for pre-shanghai AMD 337 * processors that don't have optimal tlb support for the 1g page size. 338 * chk_optimal_1gtlb can be set to 0 to force 1g page support on sub-optimal 339 * processors. 340 */ 341 int chk_optimal_1gtlb = 1; 342 343 344 #ifdef DEBUG 345 uint_t map1gcnt; 346 #endif 347 348 349 /* 350 * A cpuset for all cpus. This is used for kernel address cross calls, since 351 * the kernel addresses apply to all cpus. 352 */ 353 cpuset_t khat_cpuset; 354 355 /* 356 * management stuff for hat structures 357 */ 358 kmutex_t hat_list_lock; 359 kcondvar_t hat_list_cv; 360 kmem_cache_t *hat_cache; 361 kmem_cache_t *hat_hash_cache; 362 kmem_cache_t *hat32_hash_cache; 363 364 /* 365 * Simple statistics 366 */ 367 struct hatstats hatstat; 368 369 /* 370 * Some earlier hypervisor versions do not emulate cmpxchg of PTEs 371 * correctly. For such hypervisors we must set PT_USER for kernel 372 * entries ourselves (normally the emulation would set PT_USER for 373 * kernel entries and PT_USER|PT_GLOBAL for user entries). pt_kern is 374 * thus set appropriately. Note that dboot/kbm is OK, as only the full 375 * HAT uses cmpxchg() and the other paths (hypercall etc.) were never 376 * incorrect. 
377 */ 378 int pt_kern; 379 380 #ifndef __xpv 381 extern pfn_t memseg_get_start(struct memseg *); 382 #endif 383 384 #define PP_GETRM(pp, rmmask) (pp->p_nrm & rmmask) 385 #define PP_ISMOD(pp) PP_GETRM(pp, P_MOD) 386 #define PP_ISREF(pp) PP_GETRM(pp, P_REF) 387 #define PP_ISRO(pp) PP_GETRM(pp, P_RO) 388 389 #define PP_SETRM(pp, rm) atomic_orb(&(pp->p_nrm), rm) 390 #define PP_SETMOD(pp) PP_SETRM(pp, P_MOD) 391 #define PP_SETREF(pp) PP_SETRM(pp, P_REF) 392 #define PP_SETRO(pp) PP_SETRM(pp, P_RO) 393 394 #define PP_CLRRM(pp, rm) atomic_andb(&(pp->p_nrm), ~(rm)) 395 #define PP_CLRMOD(pp) PP_CLRRM(pp, P_MOD) 396 #define PP_CLRREF(pp) PP_CLRRM(pp, P_REF) 397 #define PP_CLRRO(pp) PP_CLRRM(pp, P_RO) 398 #define PP_CLRALL(pp) PP_CLRRM(pp, P_MOD | P_REF | P_RO) 399 400 /* 401 * kmem cache constructor for struct hat 402 */ 403 /*ARGSUSED*/ 404 static int 405 hati_constructor(void *buf, void *handle, int kmflags) 406 { 407 hat_t *hat = buf; 408 409 mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL); 410 bzero(hat->hat_pages_mapped, 411 sizeof (pgcnt_t) * (mmu.max_page_level + 1)); 412 hat->hat_ism_pgcnt = 0; 413 hat->hat_stats = 0; 414 hat->hat_flags = 0; 415 CPUSET_ZERO(hat->hat_cpus); 416 hat->hat_htable = NULL; 417 hat->hat_ht_hash = NULL; 418 return (0); 419 } 420 421 /* 422 * Put it at the start of the global list of all hats (used by stealing) 423 * 424 * kas.a_hat is not in the list but is instead used to find the 425 * first and last items in the list. 426 * 427 * - kas.a_hat->hat_next points to the start of the user hats. 428 * The list ends where hat->hat_next == NULL 429 * 430 * - kas.a_hat->hat_prev points to the last of the user hats. 431 * The list begins where hat->hat_prev == NULL 432 */ 433 static void 434 hat_list_append(hat_t *hat) 435 { 436 mutex_enter(&hat_list_lock); 437 hat->hat_prev = NULL; 438 hat->hat_next = kas.a_hat->hat_next; 439 if (hat->hat_next) 440 hat->hat_next->hat_prev = hat; 441 else 442 kas.a_hat->hat_prev = hat; 443 kas.a_hat->hat_next = hat; 444 mutex_exit(&hat_list_lock); 445 } 446 447 /* 448 * Allocate a hat structure for as. We also create the top level 449 * htable and initialize it to contain the kernel hat entries. 450 */ 451 hat_t * 452 hat_alloc(struct as *as) 453 { 454 hat_t *hat; 455 htable_t *ht; /* top level htable */ 456 uint_t use_copied; 457 uint_t r; 458 hat_kernel_range_t *rp; 459 uintptr_t va; 460 uintptr_t eva; 461 uint_t start; 462 uint_t cnt; 463 htable_t *src; 464 boolean_t use_hat32_cache; 465 466 /* 467 * Once we start creating user process HATs we can enable 468 * the htable_steal() code. 469 */ 470 if (can_steal_post_boot == 0) 471 can_steal_post_boot = 1; 472 473 ASSERT(AS_WRITE_HELD(as)); 474 hat = kmem_cache_alloc(hat_cache, KM_SLEEP); 475 hat->hat_as = as; 476 mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL); 477 ASSERT(hat->hat_flags == 0); 478 479 #if defined(__xpv) 480 /* 481 * No PCP stuff on the hypervisor due to the 64-bit split top level 482 * page tables. On 32-bit it's not needed as the hypervisor takes 483 * care of copying the top level PTEs to a below 4Gig page. 484 */ 485 use_copied = 0; 486 use_hat32_cache = B_FALSE; 487 hat->hat_max_level = mmu.max_level; 488 hat->hat_num_copied = 0; 489 hat->hat_flags = 0; 490 #else /* __xpv */ 491 492 /* 493 * All processes use HAT_COPIED on the 64-bit kernel if KPTI is 494 * turned on. 
495 */ 496 if (ttoproc(curthread)->p_model == DATAMODEL_ILP32) { 497 use_copied = 1; 498 hat->hat_max_level = mmu.max_level32; 499 hat->hat_num_copied = mmu.num_copied_ents32; 500 use_hat32_cache = B_TRUE; 501 hat->hat_flags |= HAT_COPIED_32; 502 HATSTAT_INC(hs_hat_copied32); 503 } else if (kpti_enable == 1) { 504 use_copied = 1; 505 hat->hat_max_level = mmu.max_level; 506 hat->hat_num_copied = mmu.num_copied_ents; 507 use_hat32_cache = B_FALSE; 508 HATSTAT_INC(hs_hat_copied64); 509 } else { 510 use_copied = 0; 511 use_hat32_cache = B_FALSE; 512 hat->hat_max_level = mmu.max_level; 513 hat->hat_num_copied = 0; 514 hat->hat_flags = 0; 515 HATSTAT_INC(hs_hat_normal64); 516 } 517 #endif /* __xpv */ 518 if (use_copied) { 519 hat->hat_flags |= HAT_COPIED; 520 bzero(hat->hat_copied_ptes, sizeof (hat->hat_copied_ptes)); 521 } 522 523 /* 524 * Allocate the htable hash. For 32-bit PCP processes we use the 525 * hat32_hash_cache. However, for 64-bit PCP processes we do not as the 526 * number of entries that they have to handle is closer to 527 * hat_hash_cache in count (though there will be more wastage when we 528 * have more DRAM in the system and thus push down the user address 529 * range). 530 */ 531 if (use_hat32_cache) { 532 hat->hat_num_hash = mmu.hat32_hash_cnt; 533 hat->hat_ht_hash = kmem_cache_alloc(hat32_hash_cache, KM_SLEEP); 534 } else { 535 hat->hat_num_hash = mmu.hash_cnt; 536 hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP); 537 } 538 bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *)); 539 540 /* 541 * Initialize Kernel HAT entries at the top of the top level page 542 * tables for the new hat. 543 */ 544 hat->hat_htable = NULL; 545 hat->hat_ht_cached = NULL; 546 XPV_DISALLOW_MIGRATE(); 547 ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL); 548 hat->hat_htable = ht; 549 550 #if defined(__amd64) 551 if (hat->hat_flags & HAT_COPIED) 552 goto init_done; 553 #endif 554 555 for (r = 0; r < num_kernel_ranges; ++r) { 556 rp = &kernel_ranges[r]; 557 for (va = rp->hkr_start_va; va != rp->hkr_end_va; 558 va += cnt * LEVEL_SIZE(rp->hkr_level)) { 559 560 if (rp->hkr_level == TOP_LEVEL(hat)) 561 ht = hat->hat_htable; 562 else 563 ht = htable_create(hat, va, rp->hkr_level, 564 NULL); 565 566 start = htable_va2entry(va, ht); 567 cnt = HTABLE_NUM_PTES(ht) - start; 568 eva = va + 569 ((uintptr_t)cnt << LEVEL_SHIFT(rp->hkr_level)); 570 if (rp->hkr_end_va != 0 && 571 (eva > rp->hkr_end_va || eva == 0)) 572 cnt = htable_va2entry(rp->hkr_end_va, ht) - 573 start; 574 575 #if defined(__i386) && !defined(__xpv) 576 if (ht->ht_flags & HTABLE_COPIED) { 577 bcopy(&pcp_page[start], 578 &hat->hat_copied_ptes[start], 579 cnt * sizeof (x86pte_t)); 580 continue; 581 } 582 #endif 583 src = htable_lookup(kas.a_hat, va, rp->hkr_level); 584 ASSERT(src != NULL); 585 x86pte_copy(src, ht, start, cnt); 586 htable_release(src); 587 } 588 } 589 590 init_done: 591 592 #if defined(__xpv) 593 /* 594 * Pin top level page tables after initializing them 595 */ 596 xen_pin(hat->hat_htable->ht_pfn, mmu.max_level); 597 #if defined(__amd64) 598 xen_pin(hat->hat_user_ptable, mmu.max_level); 599 #endif 600 #endif 601 XPV_ALLOW_MIGRATE(); 602 603 hat_list_append(hat); 604 605 return (hat); 606 } 607 608 #if !defined(__xpv) 609 /* 610 * Cons up a HAT for a CPU. This represents the user mappings. This will have 611 * various kernel pages punched into it manually. Importantly, this hat is 612 * ineligible for stealing. 
We really don't want to deal with this ever 613 * faulting and figuring out that this is happening, much like we don't with 614 * kas. 615 */ 616 static hat_t * 617 hat_cpu_alloc(cpu_t *cpu) 618 { 619 hat_t *hat; 620 htable_t *ht; 621 622 hat = kmem_cache_alloc(hat_cache, KM_SLEEP); 623 hat->hat_as = NULL; 624 mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL); 625 hat->hat_max_level = mmu.max_level; 626 hat->hat_num_copied = 0; 627 hat->hat_flags = HAT_PCP; 628 629 hat->hat_num_hash = mmu.hash_cnt; 630 hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP); 631 bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *)); 632 633 hat->hat_next = hat->hat_prev = NULL; 634 635 /* 636 * Because this HAT will only ever be used by the current CPU, we'll go 637 * ahead and set the CPUSET up to only point to the CPU in question. 638 */ 639 CPUSET_ADD(hat->hat_cpus, cpu->cpu_id); 640 641 hat->hat_htable = NULL; 642 hat->hat_ht_cached = NULL; 643 ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL); 644 hat->hat_htable = ht; 645 646 hat_list_append(hat); 647 648 return (hat); 649 } 650 #endif /* !__xpv */ 651 652 /* 653 * process has finished executing but as has not been cleaned up yet. 654 */ 655 /*ARGSUSED*/ 656 void 657 hat_free_start(hat_t *hat) 658 { 659 ASSERT(AS_WRITE_HELD(hat->hat_as)); 660 661 /* 662 * If the hat is currently a stealing victim, wait for the stealing 663 * to finish. Once we mark it as HAT_FREEING, htable_steal() 664 * won't look at its pagetables anymore. 665 */ 666 mutex_enter(&hat_list_lock); 667 while (hat->hat_flags & HAT_VICTIM) 668 cv_wait(&hat_list_cv, &hat_list_lock); 669 hat->hat_flags |= HAT_FREEING; 670 mutex_exit(&hat_list_lock); 671 } 672 673 /* 674 * An address space is being destroyed, so we destroy the associated hat. 675 */ 676 void 677 hat_free_end(hat_t *hat) 678 { 679 kmem_cache_t *cache; 680 681 ASSERT(hat->hat_flags & HAT_FREEING); 682 683 /* 684 * must not be running on the given hat 685 */ 686 ASSERT(CPU->cpu_current_hat != hat); 687 688 /* 689 * Remove it from the list of HATs 690 */ 691 mutex_enter(&hat_list_lock); 692 if (hat->hat_prev) 693 hat->hat_prev->hat_next = hat->hat_next; 694 else 695 kas.a_hat->hat_next = hat->hat_next; 696 if (hat->hat_next) 697 hat->hat_next->hat_prev = hat->hat_prev; 698 else 699 kas.a_hat->hat_prev = hat->hat_prev; 700 mutex_exit(&hat_list_lock); 701 hat->hat_next = hat->hat_prev = NULL; 702 703 #if defined(__xpv) 704 /* 705 * On the hypervisor, unpin top level page table(s) 706 */ 707 VERIFY3U(hat->hat_flags & HAT_PCP, ==, 0); 708 xen_unpin(hat->hat_htable->ht_pfn); 709 #if defined(__amd64) 710 xen_unpin(hat->hat_user_ptable); 711 #endif 712 #endif 713 714 /* 715 * Make a pass through the htables freeing them all up. 716 */ 717 htable_purge_hat(hat); 718 719 /* 720 * Decide which kmem cache the hash table came from, then free it. 721 */ 722 if (hat->hat_flags & HAT_COPIED) { 723 #if defined(__amd64) 724 if (hat->hat_flags & HAT_COPIED_32) { 725 cache = hat32_hash_cache; 726 } else { 727 cache = hat_hash_cache; 728 } 729 #else 730 cache = hat32_hash_cache; 731 #endif 732 } else { 733 cache = hat_hash_cache; 734 } 735 kmem_cache_free(cache, hat->hat_ht_hash); 736 hat->hat_ht_hash = NULL; 737 738 hat->hat_flags = 0; 739 hat->hat_max_level = 0; 740 hat->hat_num_copied = 0; 741 kmem_cache_free(hat_cache, hat); 742 } 743 744 /* 745 * round kernelbase down to a supported value to use for _userlimit 746 * 747 * userlimit must be aligned down to an entry in the top level htable. 
748 * The one exception is for 32 bit HAT's running PAE. 749 */ 750 uintptr_t 751 hat_kernelbase(uintptr_t va) 752 { 753 #if defined(__i386) 754 va &= LEVEL_MASK(1); 755 #endif 756 if (IN_VA_HOLE(va)) 757 panic("_userlimit %p will fall in VA hole\n", (void *)va); 758 return (va); 759 } 760 761 /* 762 * 763 */ 764 static void 765 set_max_page_level() 766 { 767 level_t lvl; 768 769 if (!kbm_largepage_support) { 770 lvl = 0; 771 } else { 772 if (is_x86_feature(x86_featureset, X86FSET_1GPG)) { 773 lvl = 2; 774 if (chk_optimal_1gtlb && 775 cpuid_opteron_erratum(CPU, 6671130)) { 776 lvl = 1; 777 } 778 if (plat_mnode_xcheck(LEVEL_SIZE(2) >> 779 LEVEL_SHIFT(0))) { 780 lvl = 1; 781 } 782 } else { 783 lvl = 1; 784 } 785 } 786 mmu.max_page_level = lvl; 787 788 if ((lvl == 2) && (enable_1gpg == 0)) 789 mmu.umax_page_level = 1; 790 else 791 mmu.umax_page_level = lvl; 792 } 793 794 /* 795 * Determine the number of slots that are in used in the top-most level page 796 * table for user memory. This is based on _userlimit. In effect this is similar 797 * to htable_va2entry, but without the convenience of having an htable. 798 */ 799 void 800 mmu_calc_user_slots(void) 801 { 802 uint_t ent, nptes; 803 uintptr_t shift; 804 805 nptes = mmu.top_level_count; 806 shift = _userlimit >> mmu.level_shift[mmu.max_level]; 807 ent = shift & (nptes - 1); 808 809 /* 810 * Ent tells us the slot that the page for _userlimit would fit in. We 811 * need to add one to this to cover the total number of entries. 812 */ 813 mmu.top_level_uslots = ent + 1; 814 815 /* 816 * When running 32-bit compatability processes on a 64-bit kernel, we 817 * will only need to use one slot. 818 */ 819 mmu.top_level_uslots32 = 1; 820 821 /* 822 * Record the number of PCP page table entries that we'll need to copy 823 * around. For 64-bit processes this is the number of user slots. For 824 * 32-bit proceses, this is 4 1 GiB pages. 825 */ 826 mmu.num_copied_ents = mmu.top_level_uslots; 827 mmu.num_copied_ents32 = 4; 828 } 829 830 /* 831 * Initialize hat data structures based on processor MMU information. 832 */ 833 void 834 mmu_init(void) 835 { 836 uint_t max_htables; 837 uint_t pa_bits; 838 uint_t va_bits; 839 int i; 840 841 /* 842 * If CPU enabled the page table global bit, use it for the kernel 843 * This is bit 7 in CR4 (PGE - Page Global Enable). 844 */ 845 if (is_x86_feature(x86_featureset, X86FSET_PGE) && 846 (getcr4() & CR4_PGE) != 0) 847 mmu.pt_global = PT_GLOBAL; 848 849 #if !defined(__xpv) 850 /* 851 * The 64-bit x86 kernel has split user/kernel page tables. As such we 852 * cannot have the global bit set. The simplest way for us to deal with 853 * this is to just say that pt_global is zero, so the global bit isn't 854 * present. 855 */ 856 if (kpti_enable == 1) 857 mmu.pt_global = 0; 858 #endif 859 860 /* 861 * Detect NX and PAE usage. 862 */ 863 mmu.pae_hat = kbm_pae_support; 864 if (kbm_nx_support) 865 mmu.pt_nx = PT_NX; 866 else 867 mmu.pt_nx = 0; 868 869 /* 870 * Use CPU info to set various MMU parameters 871 */ 872 cpuid_get_addrsize(CPU, &pa_bits, &va_bits); 873 874 if (va_bits < sizeof (void *) * NBBY) { 875 mmu.hole_start = (1ul << (va_bits - 1)); 876 mmu.hole_end = 0ul - mmu.hole_start - 1; 877 } else { 878 mmu.hole_end = 0; 879 mmu.hole_start = mmu.hole_end - 1; 880 } 881 #if defined(OPTERON_ERRATUM_121) 882 /* 883 * If erratum 121 has already been detected at this time, hole_start 884 * contains the value to be subtracted from mmu.hole_start. 
885 */ 886 ASSERT(hole_start == 0 || opteron_erratum_121 != 0); 887 hole_start = mmu.hole_start - hole_start; 888 #else 889 hole_start = mmu.hole_start; 890 #endif 891 hole_end = mmu.hole_end; 892 893 mmu.highest_pfn = mmu_btop((1ull << pa_bits) - 1); 894 if (mmu.pae_hat == 0 && pa_bits > 32) 895 mmu.highest_pfn = PFN_4G - 1; 896 897 if (mmu.pae_hat) { 898 mmu.pte_size = 8; /* 8 byte PTEs */ 899 mmu.pte_size_shift = 3; 900 } else { 901 mmu.pte_size = 4; /* 4 byte PTEs */ 902 mmu.pte_size_shift = 2; 903 } 904 905 if (mmu.pae_hat && !is_x86_feature(x86_featureset, X86FSET_PAE)) 906 panic("Processor does not support PAE"); 907 908 if (!is_x86_feature(x86_featureset, X86FSET_CX8)) 909 panic("Processor does not support cmpxchg8b instruction"); 910 911 #if defined(__amd64) 912 913 mmu.num_level = 4; 914 mmu.max_level = 3; 915 mmu.ptes_per_table = 512; 916 mmu.top_level_count = 512; 917 918 /* 919 * 32-bit processes only use 1 GB ptes. 920 */ 921 mmu.max_level32 = 2; 922 923 mmu.level_shift[0] = 12; 924 mmu.level_shift[1] = 21; 925 mmu.level_shift[2] = 30; 926 mmu.level_shift[3] = 39; 927 928 #elif defined(__i386) 929 930 if (mmu.pae_hat) { 931 mmu.num_level = 3; 932 mmu.max_level = 2; 933 mmu.ptes_per_table = 512; 934 mmu.top_level_count = 4; 935 936 mmu.level_shift[0] = 12; 937 mmu.level_shift[1] = 21; 938 mmu.level_shift[2] = 30; 939 940 } else { 941 mmu.num_level = 2; 942 mmu.max_level = 1; 943 mmu.ptes_per_table = 1024; 944 mmu.top_level_count = 1024; 945 946 mmu.level_shift[0] = 12; 947 mmu.level_shift[1] = 22; 948 } 949 950 #endif /* __i386 */ 951 952 for (i = 0; i < mmu.num_level; ++i) { 953 mmu.level_size[i] = 1UL << mmu.level_shift[i]; 954 mmu.level_offset[i] = mmu.level_size[i] - 1; 955 mmu.level_mask[i] = ~mmu.level_offset[i]; 956 } 957 958 set_max_page_level(); 959 mmu_calc_user_slots(); 960 961 mmu_page_sizes = mmu.max_page_level + 1; 962 mmu_exported_page_sizes = mmu.umax_page_level + 1; 963 964 /* restrict legacy applications from using pagesizes 1g and above */ 965 mmu_legacy_page_sizes = 966 (mmu_exported_page_sizes > 2) ? 2 : mmu_exported_page_sizes; 967 968 969 for (i = 0; i <= mmu.max_page_level; ++i) { 970 mmu.pte_bits[i] = PT_VALID | pt_kern; 971 if (i > 0) 972 mmu.pte_bits[i] |= PT_PAGESIZE; 973 } 974 975 /* 976 * NOTE Legacy 32 bit PAE mode only has the P_VALID bit at top level. 977 */ 978 for (i = 1; i < mmu.num_level; ++i) 979 mmu.ptp_bits[i] = PT_PTPBITS; 980 981 #if defined(__i386) 982 mmu.ptp_bits[2] = PT_VALID; 983 #endif 984 985 /* 986 * Compute how many hash table entries to have per process for htables. 987 * We start with 1 page's worth of entries. 988 * 989 * If physical memory is small, reduce the amount need to cover it. 990 */ 991 max_htables = physmax / mmu.ptes_per_table; 992 mmu.hash_cnt = MMU_PAGESIZE / sizeof (htable_t *); 993 while (mmu.hash_cnt > 16 && mmu.hash_cnt >= max_htables) 994 mmu.hash_cnt >>= 1; 995 mmu.hat32_hash_cnt = mmu.hash_cnt; 996 997 #if defined(__amd64) 998 /* 999 * If running in 64 bits and physical memory is large, 1000 * increase the size of the cache to cover all of memory for 1001 * a 64 bit process. 
1002 */ 1003 #define HASH_MAX_LENGTH 4 1004 while (mmu.hash_cnt * HASH_MAX_LENGTH < max_htables) 1005 mmu.hash_cnt <<= 1; 1006 #endif 1007 } 1008 1009 1010 /* 1011 * initialize hat data structures 1012 */ 1013 void 1014 hat_init() 1015 { 1016 #if defined(__i386) 1017 /* 1018 * _userlimit must be aligned correctly 1019 */ 1020 if ((_userlimit & LEVEL_MASK(1)) != _userlimit) { 1021 prom_printf("hat_init(): _userlimit=%p, not aligned at %p\n", 1022 (void *)_userlimit, (void *)LEVEL_SIZE(1)); 1023 halt("hat_init(): Unable to continue"); 1024 } 1025 #endif 1026 1027 cv_init(&hat_list_cv, NULL, CV_DEFAULT, NULL); 1028 1029 /* 1030 * initialize kmem caches 1031 */ 1032 htable_init(); 1033 hment_init(); 1034 1035 hat_cache = kmem_cache_create("hat_t", 1036 sizeof (hat_t), 0, hati_constructor, NULL, NULL, 1037 NULL, 0, 0); 1038 1039 hat_hash_cache = kmem_cache_create("HatHash", 1040 mmu.hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL, 1041 NULL, 0, 0); 1042 1043 /* 1044 * 32-bit PCP hats can use a smaller hash table size on large memory 1045 * machines 1046 */ 1047 if (mmu.hash_cnt == mmu.hat32_hash_cnt) { 1048 hat32_hash_cache = hat_hash_cache; 1049 } else { 1050 hat32_hash_cache = kmem_cache_create("Hat32Hash", 1051 mmu.hat32_hash_cnt * sizeof (htable_t *), 0, NULL, NULL, 1052 NULL, NULL, 0, 0); 1053 } 1054 1055 /* 1056 * Set up the kernel's hat 1057 */ 1058 AS_LOCK_ENTER(&kas, RW_WRITER); 1059 kas.a_hat = kmem_cache_alloc(hat_cache, KM_NOSLEEP); 1060 mutex_init(&kas.a_hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL); 1061 kas.a_hat->hat_as = &kas; 1062 kas.a_hat->hat_flags = 0; 1063 AS_LOCK_EXIT(&kas); 1064 1065 CPUSET_ZERO(khat_cpuset); 1066 CPUSET_ADD(khat_cpuset, CPU->cpu_id); 1067 1068 /* 1069 * The kernel HAT doesn't use PCP regardless of architectures. 1070 */ 1071 ASSERT3U(mmu.max_level, >, 0); 1072 kas.a_hat->hat_max_level = mmu.max_level; 1073 kas.a_hat->hat_num_copied = 0; 1074 1075 /* 1076 * The kernel hat's next pointer serves as the head of the hat list . 1077 * The kernel hat's prev pointer tracks the last hat on the list for 1078 * htable_steal() to use. 1079 */ 1080 kas.a_hat->hat_next = NULL; 1081 kas.a_hat->hat_prev = NULL; 1082 1083 /* 1084 * Allocate an htable hash bucket for the kernel 1085 * XX64 - tune for 64 bit procs 1086 */ 1087 kas.a_hat->hat_num_hash = mmu.hash_cnt; 1088 kas.a_hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_NOSLEEP); 1089 bzero(kas.a_hat->hat_ht_hash, mmu.hash_cnt * sizeof (htable_t *)); 1090 1091 /* 1092 * zero out the top level and cached htable pointers 1093 */ 1094 kas.a_hat->hat_ht_cached = NULL; 1095 kas.a_hat->hat_htable = NULL; 1096 1097 /* 1098 * Pre-allocate hrm_hashtab before enabling the collection of 1099 * refmod statistics. Allocating on the fly would mean us 1100 * running the risk of suffering recursive mutex enters or 1101 * deadlocks. 1102 */ 1103 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *), 1104 KM_SLEEP); 1105 } 1106 1107 1108 extern void kpti_tramp_start(); 1109 extern void kpti_tramp_end(); 1110 1111 extern void kdi_isr_start(); 1112 extern void kdi_isr_end(); 1113 1114 extern gate_desc_t kdi_idt[NIDT]; 1115 1116 /* 1117 * Prepare per-CPU pagetables for all processes on the 64 bit kernel. 1118 * 1119 * Each CPU has a set of 2 pagetables that are reused for any 32 bit 1120 * process it runs. They are the top level pagetable, hci_pcp_l3ptes, and 1121 * the next to top level table for the bottom 512 Gig, hci_pcp_l2ptes. 
1122 */ 1123 /*ARGSUSED*/ 1124 static void 1125 hat_pcp_setup(struct cpu *cpu) 1126 { 1127 #if !defined(__xpv) 1128 struct hat_cpu_info *hci = cpu->cpu_hat_info; 1129 uintptr_t va; 1130 size_t len; 1131 1132 /* 1133 * allocate the level==2 page table for the bottom most 1134 * 512Gig of address space (this is where 32 bit apps live) 1135 */ 1136 ASSERT(hci != NULL); 1137 hci->hci_pcp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP); 1138 1139 /* 1140 * Allocate a top level pagetable and copy the kernel's 1141 * entries into it. Then link in hci_pcp_l2ptes in the 1st entry. 1142 */ 1143 hci->hci_pcp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP); 1144 hci->hci_pcp_l3pfn = 1145 hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l3ptes); 1146 ASSERT3U(hci->hci_pcp_l3pfn, !=, PFN_INVALID); 1147 bcopy(pcp_page, hci->hci_pcp_l3ptes, MMU_PAGESIZE); 1148 1149 hci->hci_pcp_l2pfn = 1150 hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l2ptes); 1151 ASSERT3U(hci->hci_pcp_l2pfn, !=, PFN_INVALID); 1152 1153 /* 1154 * Now go through and allocate the user version of these structures. 1155 * Unlike with the kernel version, we allocate a hat to represent the 1156 * top-level page table as that will make it much simpler when we need 1157 * to patch through user entries. 1158 */ 1159 hci->hci_user_hat = hat_cpu_alloc(cpu); 1160 hci->hci_user_l3pfn = hci->hci_user_hat->hat_htable->ht_pfn; 1161 ASSERT3U(hci->hci_user_l3pfn, !=, PFN_INVALID); 1162 hci->hci_user_l3ptes = 1163 (x86pte_t *)hat_kpm_mapin_pfn(hci->hci_user_l3pfn); 1164 1165 /* Skip the rest of this if KPTI is switched off at boot. */ 1166 if (kpti_enable != 1) 1167 return; 1168 1169 /* 1170 * OK, now that we have this we need to go through and punch the normal 1171 * holes in the CPU's hat for this. At this point we'll punch in the 1172 * following: 1173 * 1174 * o GDT 1175 * o IDT 1176 * o LDT 1177 * o Trampoline Code 1178 * o machcpu KPTI page 1179 * o kmdb ISR code page (just trampolines) 1180 * 1181 * If this is cpu0, then we also can initialize the following because 1182 * they'll have already been allocated. 1183 * 1184 * o TSS for CPU 0 1185 * o Double Fault for CPU 0 1186 * 1187 * The following items have yet to be allocated and have not been 1188 * punched in yet. They will be punched in later: 1189 * 1190 * o TSS (mach_cpucontext_alloc_tables()) 1191 * o Double Fault Stack (mach_cpucontext_alloc_tables()) 1192 */ 1193 hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_gdt, PROT_READ); 1194 hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_idt, PROT_READ); 1195 1196 /* 1197 * As the KDI IDT is only active during kmdb sessions (including single 1198 * stepping), typically we don't actually need this punched in (we 1199 * consider the routines that switch to the user cr3 to be toxic). But 1200 * if we ever accidentally end up on the user cr3 while on this IDT, 1201 * we'd prefer not to triple fault. 
1202 */ 1203 hati_cpu_punchin(cpu, (uintptr_t)&kdi_idt, PROT_READ); 1204 1205 CTASSERT(((uintptr_t)&kpti_tramp_start % MMU_PAGESIZE) == 0); 1206 CTASSERT(((uintptr_t)&kpti_tramp_end % MMU_PAGESIZE) == 0); 1207 for (va = (uintptr_t)&kpti_tramp_start; 1208 va < (uintptr_t)&kpti_tramp_end; va += MMU_PAGESIZE) { 1209 hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC); 1210 } 1211 1212 VERIFY3U(((uintptr_t)cpu->cpu_m.mcpu_ldt) % MMU_PAGESIZE, ==, 0); 1213 for (va = (uintptr_t)cpu->cpu_m.mcpu_ldt, len = LDT_CPU_SIZE; 1214 len >= MMU_PAGESIZE; va += MMU_PAGESIZE, len -= MMU_PAGESIZE) { 1215 hati_cpu_punchin(cpu, va, PROT_READ); 1216 } 1217 1218 /* mcpu_pad2 is the start of the page containing the kpti_frames. */ 1219 hati_cpu_punchin(cpu, (uintptr_t)&cpu->cpu_m.mcpu_pad2[0], 1220 PROT_READ | PROT_WRITE); 1221 1222 if (cpu == &cpus[0]) { 1223 /* 1224 * CPU0 uses a global for its double fault stack to deal with 1225 * the chicken and egg problem. We need to punch it into its 1226 * user HAT. 1227 */ 1228 extern char dblfault_stack0[]; 1229 1230 hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_m.mcpu_tss, 1231 PROT_READ); 1232 1233 for (va = (uintptr_t)dblfault_stack0, 1234 len = DEFAULTSTKSZ; len >= MMU_PAGESIZE; 1235 va += MMU_PAGESIZE, len -= MMU_PAGESIZE) { 1236 hati_cpu_punchin(cpu, va, PROT_READ | PROT_WRITE); 1237 } 1238 } 1239 1240 CTASSERT(((uintptr_t)&kdi_isr_start % MMU_PAGESIZE) == 0); 1241 CTASSERT(((uintptr_t)&kdi_isr_end % MMU_PAGESIZE) == 0); 1242 for (va = (uintptr_t)&kdi_isr_start; 1243 va < (uintptr_t)&kdi_isr_end; va += MMU_PAGESIZE) { 1244 hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC); 1245 } 1246 #endif /* !__xpv */ 1247 } 1248 1249 /*ARGSUSED*/ 1250 static void 1251 hat_pcp_teardown(cpu_t *cpu) 1252 { 1253 #if !defined(__xpv) 1254 struct hat_cpu_info *hci; 1255 1256 if ((hci = cpu->cpu_hat_info) == NULL) 1257 return; 1258 if (hci->hci_pcp_l2ptes != NULL) 1259 kmem_free(hci->hci_pcp_l2ptes, MMU_PAGESIZE); 1260 if (hci->hci_pcp_l3ptes != NULL) 1261 kmem_free(hci->hci_pcp_l3ptes, MMU_PAGESIZE); 1262 if (hci->hci_user_hat != NULL) { 1263 hat_free_start(hci->hci_user_hat); 1264 hat_free_end(hci->hci_user_hat); 1265 } 1266 #endif 1267 } 1268 1269 #define NEXT_HKR(r, l, s, e) { \ 1270 kernel_ranges[r].hkr_level = l; \ 1271 kernel_ranges[r].hkr_start_va = s; \ 1272 kernel_ranges[r].hkr_end_va = e; \ 1273 ++r; \ 1274 } 1275 1276 /* 1277 * Finish filling in the kernel hat. 1278 * Pre fill in all top level kernel page table entries for the kernel's 1279 * part of the address range. From this point on we can't use any new 1280 * kernel large pages if they need PTE's at max_level 1281 * 1282 * create the kmap mappings. 1283 */ 1284 void 1285 hat_init_finish(void) 1286 { 1287 size_t size; 1288 uint_t r = 0; 1289 uintptr_t va; 1290 hat_kernel_range_t *rp; 1291 1292 1293 /* 1294 * We are now effectively running on the kernel hat. 1295 * Clearing use_boot_reserve shuts off using the pre-allocated boot 1296 * reserve for all HAT allocations. From here on, the reserves are 1297 * only used when avoiding recursion in kmem_alloc(). 1298 */ 1299 use_boot_reserve = 0; 1300 htable_adjust_reserve(); 1301 1302 /* 1303 * User HATs are initialized with copies of all kernel mappings in 1304 * higher level page tables. Ensure that those entries exist. 
1305 */ 1306 #if defined(__amd64) 1307 1308 NEXT_HKR(r, 3, kernelbase, 0); 1309 #if defined(__xpv) 1310 NEXT_HKR(r, 3, HYPERVISOR_VIRT_START, HYPERVISOR_VIRT_END); 1311 #endif 1312 1313 #elif defined(__i386) 1314 1315 #if !defined(__xpv) 1316 if (mmu.pae_hat) { 1317 va = kernelbase; 1318 if ((va & LEVEL_MASK(2)) != va) { 1319 va = P2ROUNDUP(va, LEVEL_SIZE(2)); 1320 NEXT_HKR(r, 1, kernelbase, va); 1321 } 1322 if (va != 0) 1323 NEXT_HKR(r, 2, va, 0); 1324 } else 1325 #endif /* __xpv */ 1326 NEXT_HKR(r, 1, kernelbase, 0); 1327 1328 #endif /* __i386 */ 1329 1330 num_kernel_ranges = r; 1331 1332 /* 1333 * Create all the kernel pagetables that will have entries 1334 * shared to user HATs. 1335 */ 1336 for (r = 0; r < num_kernel_ranges; ++r) { 1337 rp = &kernel_ranges[r]; 1338 for (va = rp->hkr_start_va; va != rp->hkr_end_va; 1339 va += LEVEL_SIZE(rp->hkr_level)) { 1340 htable_t *ht; 1341 1342 if (IN_HYPERVISOR_VA(va)) 1343 continue; 1344 1345 /* can/must skip if a page mapping already exists */ 1346 if (rp->hkr_level <= mmu.max_page_level && 1347 (ht = htable_getpage(kas.a_hat, va, NULL)) != 1348 NULL) { 1349 htable_release(ht); 1350 continue; 1351 } 1352 1353 (void) htable_create(kas.a_hat, va, rp->hkr_level - 1, 1354 NULL); 1355 } 1356 } 1357 1358 /* 1359 * 32 bit PAE metal kernels use only 4 of the 512 entries in the 1360 * page holding the top level pagetable. We use the remainder for 1361 * the "per CPU" page tables for PCP processes. 1362 * Map the top level kernel pagetable into the kernel to make 1363 * it easy to use bcopy access these tables. 1364 * 1365 * PAE is required for the 64-bit kernel which uses this as well to 1366 * perform the per-CPU pagetables. See the big theory statement. 1367 */ 1368 if (mmu.pae_hat) { 1369 pcp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP); 1370 hat_devload(kas.a_hat, (caddr_t)pcp_page, MMU_PAGESIZE, 1371 kas.a_hat->hat_htable->ht_pfn, 1372 #if !defined(__xpv) 1373 PROT_WRITE | 1374 #endif 1375 PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK, 1376 HAT_LOAD | HAT_LOAD_NOCONSIST); 1377 } 1378 hat_pcp_setup(CPU); 1379 1380 /* 1381 * Create kmap (cached mappings of kernel PTEs) 1382 * for 32 bit we map from segmap_start .. ekernelheap 1383 * for 64 bit we map from segmap_start .. segmap_start + segmapsize; 1384 */ 1385 #if defined(__i386) 1386 size = (uintptr_t)ekernelheap - segmap_start; 1387 #elif defined(__amd64) 1388 size = segmapsize; 1389 #endif 1390 hat_kmap_init((uintptr_t)segmap_start, size); 1391 1392 #if !defined(__xpv) 1393 ASSERT3U(kas.a_hat->hat_htable->ht_pfn, !=, PFN_INVALID); 1394 ASSERT3U(kpti_safe_cr3, ==, 1395 MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_KERNEL)); 1396 #endif 1397 } 1398 1399 /* 1400 * On 32 bit PAE mode, PTE's are 64 bits, but ordinary atomic memory references 1401 * are 32 bit, so for safety we must use atomic_cas_64() to install these. 1402 */ 1403 #ifdef __i386 1404 static void 1405 reload_pae32(hat_t *hat, cpu_t *cpu) 1406 { 1407 x86pte_t *src; 1408 x86pte_t *dest; 1409 x86pte_t pte; 1410 int i; 1411 1412 /* 1413 * Load the 4 entries of the level 2 page table into this 1414 * cpu's range of the pcp_page and point cr3 at them. 
1415 */ 1416 ASSERT(mmu.pae_hat); 1417 src = hat->hat_copied_ptes; 1418 dest = pcp_page + (cpu->cpu_id + 1) * MAX_COPIED_PTES; 1419 for (i = 0; i < MAX_COPIED_PTES; ++i) { 1420 for (;;) { 1421 pte = dest[i]; 1422 if (pte == src[i]) 1423 break; 1424 if (atomic_cas_64(dest + i, pte, src[i]) != src[i]) 1425 break; 1426 } 1427 } 1428 } 1429 #endif 1430 1431 /* 1432 * Update the PCP data on the CPU cpu to the one on the hat. If this is a 32-bit 1433 * process, then we must update the L2 pages and then the L3. If this is a 1434 * 64-bit process then we must update the L3 entries. 1435 */ 1436 static void 1437 hat_pcp_update(cpu_t *cpu, const hat_t *hat) 1438 { 1439 ASSERT3U(hat->hat_flags & HAT_COPIED, !=, 0); 1440 1441 if ((hat->hat_flags & HAT_COPIED_32) != 0) { 1442 const x86pte_t *l2src; 1443 x86pte_t *l2dst, *l3ptes, *l3uptes; 1444 /* 1445 * This is a 32-bit process. To set this up, we need to do the 1446 * following: 1447 * 1448 * - Copy the 4 L2 PTEs into the dedicated L2 table 1449 * - Zero the user L3 PTEs in the user and kernel page table 1450 * - Set the first L3 PTE to point to the CPU L2 table 1451 */ 1452 l2src = hat->hat_copied_ptes; 1453 l2dst = cpu->cpu_hat_info->hci_pcp_l2ptes; 1454 l3ptes = cpu->cpu_hat_info->hci_pcp_l3ptes; 1455 l3uptes = cpu->cpu_hat_info->hci_user_l3ptes; 1456 1457 l2dst[0] = l2src[0]; 1458 l2dst[1] = l2src[1]; 1459 l2dst[2] = l2src[2]; 1460 l2dst[3] = l2src[3]; 1461 1462 /* 1463 * Make sure to use the mmu to get the number of slots. The 1464 * number of PLP entries that this has will always be less as 1465 * it's a 32-bit process. 1466 */ 1467 bzero(l3ptes, sizeof (x86pte_t) * mmu.top_level_uslots); 1468 l3ptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2); 1469 bzero(l3uptes, sizeof (x86pte_t) * mmu.top_level_uslots); 1470 l3uptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2); 1471 } else { 1472 /* 1473 * This is a 64-bit process. 
To set this up, we need to do the 1474 * following: 1475 * 1476 * - Zero the 4 L2 PTEs in the CPU structure for safety 1477 * - Copy over the new user L3 PTEs into the kernel page table 1478 * - Copy over the new user L3 PTEs into the user page table 1479 */ 1480 ASSERT3S(kpti_enable, ==, 1); 1481 bzero(cpu->cpu_hat_info->hci_pcp_l2ptes, sizeof (x86pte_t) * 4); 1482 bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_pcp_l3ptes, 1483 sizeof (x86pte_t) * mmu.top_level_uslots); 1484 bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_user_l3ptes, 1485 sizeof (x86pte_t) * mmu.top_level_uslots); 1486 } 1487 } 1488 1489 static void 1490 reset_kpti(struct kpti_frame *fr, uint64_t kcr3, uint64_t ucr3) 1491 { 1492 ASSERT3U(fr->kf_tr_flag, ==, 0); 1493 #if DEBUG 1494 if (fr->kf_kernel_cr3 != 0) { 1495 ASSERT3U(fr->kf_lower_redzone, ==, 0xdeadbeefdeadbeef); 1496 ASSERT3U(fr->kf_middle_redzone, ==, 0xdeadbeefdeadbeef); 1497 ASSERT3U(fr->kf_upper_redzone, ==, 0xdeadbeefdeadbeef); 1498 } 1499 #endif 1500 1501 bzero(fr, offsetof(struct kpti_frame, kf_kernel_cr3)); 1502 bzero(&fr->kf_unused, sizeof (struct kpti_frame) - 1503 offsetof(struct kpti_frame, kf_unused)); 1504 1505 fr->kf_kernel_cr3 = kcr3; 1506 fr->kf_user_cr3 = ucr3; 1507 fr->kf_tr_ret_rsp = (uintptr_t)&fr->kf_tr_rsp; 1508 1509 fr->kf_lower_redzone = 0xdeadbeefdeadbeef; 1510 fr->kf_middle_redzone = 0xdeadbeefdeadbeef; 1511 fr->kf_upper_redzone = 0xdeadbeefdeadbeef; 1512 } 1513 1514 #ifdef __xpv 1515 static void 1516 hat_switch_xen(hat_t *hat) 1517 { 1518 struct mmuext_op t[2]; 1519 uint_t retcnt; 1520 uint_t opcnt = 1; 1521 uint64_t newcr3; 1522 1523 ASSERT(!(hat->hat_flags & HAT_COPIED)); 1524 ASSERT(!(getcr4() & CR4_PCIDE)); 1525 1526 newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn, PCID_NONE); 1527 1528 t[0].cmd = MMUEXT_NEW_BASEPTR; 1529 t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3)); 1530 1531 /* 1532 * There's an interesting problem here, as to what to actually specify 1533 * when switching to the kernel hat. For now we'll reuse the kernel hat 1534 * again. 1535 */ 1536 t[1].cmd = MMUEXT_NEW_USER_BASEPTR; 1537 if (hat == kas.a_hat) 1538 t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3)); 1539 else 1540 t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable); 1541 ++opcnt; 1542 1543 if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0) 1544 panic("HYPERVISOR_mmu_update() failed"); 1545 ASSERT(retcnt == opcnt); 1546 } 1547 #endif /* __xpv */ 1548 1549 /* 1550 * Switch to a new active hat, maintaining bit masks to track active CPUs. 1551 * 1552 * With KPTI, all our HATs except kas should be using PCP. Thus, to switch 1553 * HATs, we need to copy over the new user PTEs, then set our trampoline context 1554 * as appropriate. 1555 * 1556 * If lacking PCID, we then load our new cr3, which will flush the TLB: we may 1557 * have established userspace TLB entries via kernel accesses, and these are no 1558 * longer valid. We have to do this eagerly, as we just deleted this CPU from 1559 * ->hat_cpus, so would no longer see any TLB shootdowns. 1560 * 1561 * With PCID enabled, things get a little more complicated. We would like to 1562 * keep TLB context around when entering and exiting the kernel, and to do this, 1563 * we partition the TLB into two different spaces: 1564 * 1565 * PCID_KERNEL is defined as zero, and used both by kas and all other address 1566 * spaces while in the kernel (post-trampoline). 1567 * 1568 * PCID_USER is used while in userspace. 
Therefore, userspace cannot use any 1569 * lingering PCID_KERNEL entries to kernel addresses it should not be able to 1570 * read. 1571 * 1572 * The trampoline cr3s are set not to invalidate on a mov to %cr3. This means if 1573 * we take a journey through the kernel without switching HATs, we have some 1574 * hope of keeping our TLB state around. 1575 * 1576 * On a hat switch, rather than deal with any necessary flushes on the way out 1577 * of the trampolines, we do them upfront here. If we're switching from kas, we 1578 * shouldn't need any invalidation. 1579 * 1580 * Otherwise, we can have stale userspace entries for both PCID_USER (what 1581 * happened before we move onto the kcr3) and PCID_KERNEL (any subsequent 1582 * userspace accesses such as ddi_copyin()). Since setcr3() won't do these 1583 * flushes on its own in PCIDE, we'll do a non-flushing load and then 1584 * invalidate everything. 1585 */ 1586 void 1587 hat_switch(hat_t *hat) 1588 { 1589 cpu_t *cpu = CPU; 1590 hat_t *old = cpu->cpu_current_hat; 1591 1592 /* 1593 * set up this information first, so we don't miss any cross calls 1594 */ 1595 if (old != NULL) { 1596 if (old == hat) 1597 return; 1598 if (old != kas.a_hat) 1599 CPUSET_ATOMIC_DEL(old->hat_cpus, cpu->cpu_id); 1600 } 1601 1602 /* 1603 * Add this CPU to the active set for this HAT. 1604 */ 1605 if (hat != kas.a_hat) { 1606 CPUSET_ATOMIC_ADD(hat->hat_cpus, cpu->cpu_id); 1607 } 1608 cpu->cpu_current_hat = hat; 1609 1610 #if defined(__xpv) 1611 hat_switch_xen(hat); 1612 #else 1613 struct hat_cpu_info *info = cpu->cpu_m.mcpu_hat_info; 1614 uint64_t pcide = getcr4() & CR4_PCIDE; 1615 uint64_t kcr3, ucr3; 1616 pfn_t tl_kpfn; 1617 ulong_t flag; 1618 1619 EQUIV(kpti_enable, !mmu.pt_global); 1620 1621 if (hat->hat_flags & HAT_COPIED) { 1622 hat_pcp_update(cpu, hat); 1623 tl_kpfn = info->hci_pcp_l3pfn; 1624 } else { 1625 IMPLY(kpti_enable, hat == kas.a_hat); 1626 tl_kpfn = hat->hat_htable->ht_pfn; 1627 } 1628 1629 if (pcide) { 1630 ASSERT(kpti_enable); 1631 1632 kcr3 = MAKECR3(tl_kpfn, PCID_KERNEL) | CR3_NOINVL_BIT; 1633 ucr3 = MAKECR3(info->hci_user_l3pfn, PCID_USER) | 1634 CR3_NOINVL_BIT; 1635 1636 setcr3(kcr3); 1637 if (old != kas.a_hat) 1638 mmu_flush_tlb(FLUSH_TLB_ALL, NULL); 1639 } else { 1640 kcr3 = MAKECR3(tl_kpfn, PCID_NONE); 1641 ucr3 = kpti_enable ? 1642 MAKECR3(info->hci_user_l3pfn, PCID_NONE) : 1643 0; 1644 1645 setcr3(kcr3); 1646 } 1647 1648 /* 1649 * We will already be taking shootdowns for our new HAT, and as KPTI 1650 * invpcid emulation needs to use kf_user_cr3, make sure we don't get 1651 * any cross calls while we're inconsistent. Note that it's harmless to 1652 * have a *stale* kf_user_cr3 (we just did a FLUSH_TLB_ALL), but a 1653 * *zero* kf_user_cr3 is not going to go very well. 
1654 */ 1655 if (pcide) 1656 flag = intr_clear(); 1657 1658 reset_kpti(&cpu->cpu_m.mcpu_kpti, kcr3, ucr3); 1659 reset_kpti(&cpu->cpu_m.mcpu_kpti_flt, kcr3, ucr3); 1660 reset_kpti(&cpu->cpu_m.mcpu_kpti_dbg, kcr3, ucr3); 1661 1662 if (pcide) 1663 intr_restore(flag); 1664 1665 #endif /* !__xpv */ 1666 1667 ASSERT(cpu == CPU); 1668 } 1669 1670 /* 1671 * Utility to return a valid x86pte_t from protections, pfn, and level number 1672 */ 1673 static x86pte_t 1674 hati_mkpte(pfn_t pfn, uint_t attr, level_t level, uint_t flags) 1675 { 1676 x86pte_t pte; 1677 uint_t cache_attr = attr & HAT_ORDER_MASK; 1678 1679 pte = MAKEPTE(pfn, level); 1680 1681 if (attr & PROT_WRITE) 1682 PTE_SET(pte, PT_WRITABLE); 1683 1684 if (attr & PROT_USER) 1685 PTE_SET(pte, PT_USER); 1686 1687 if (!(attr & PROT_EXEC)) 1688 PTE_SET(pte, mmu.pt_nx); 1689 1690 /* 1691 * Set the software bits used track ref/mod sync's and hments. 1692 * If not using REF/MOD, set them to avoid h/w rewriting PTEs. 1693 */ 1694 if (flags & HAT_LOAD_NOCONSIST) 1695 PTE_SET(pte, PT_NOCONSIST | PT_REF | PT_MOD); 1696 else if (attr & HAT_NOSYNC) 1697 PTE_SET(pte, PT_NOSYNC | PT_REF | PT_MOD); 1698 1699 /* 1700 * Set the caching attributes in the PTE. The combination 1701 * of attributes are poorly defined, so we pay attention 1702 * to them in the given order. 1703 * 1704 * The test for HAT_STRICTORDER is different because it's defined 1705 * as "0" - which was a stupid thing to do, but is too late to change! 1706 */ 1707 if (cache_attr == HAT_STRICTORDER) { 1708 PTE_SET(pte, PT_NOCACHE); 1709 /*LINTED [Lint hates empty ifs, but it's the obvious way to do this] */ 1710 } else if (cache_attr & (HAT_UNORDERED_OK | HAT_STORECACHING_OK)) { 1711 /* nothing to set */; 1712 } else if (cache_attr & (HAT_MERGING_OK | HAT_LOADCACHING_OK)) { 1713 PTE_SET(pte, PT_NOCACHE); 1714 if (is_x86_feature(x86_featureset, X86FSET_PAT)) 1715 PTE_SET(pte, (level == 0) ? PT_PAT_4K : PT_PAT_LARGE); 1716 else 1717 PTE_SET(pte, PT_WRITETHRU); 1718 } else { 1719 panic("hati_mkpte(): bad caching attributes: %x\n", cache_attr); 1720 } 1721 1722 return (pte); 1723 } 1724 1725 /* 1726 * Duplicate address translations of the parent to the child. 1727 * This function really isn't used anymore. 1728 */ 1729 /*ARGSUSED*/ 1730 int 1731 hat_dup(hat_t *old, hat_t *new, caddr_t addr, size_t len, uint_t flag) 1732 { 1733 ASSERT((uintptr_t)addr < kernelbase); 1734 ASSERT(new != kas.a_hat); 1735 ASSERT(old != kas.a_hat); 1736 return (0); 1737 } 1738 1739 /* 1740 * Allocate any hat resources required for a process being swapped in. 1741 */ 1742 /*ARGSUSED*/ 1743 void 1744 hat_swapin(hat_t *hat) 1745 { 1746 /* do nothing - we let everything fault back in */ 1747 } 1748 1749 /* 1750 * Unload all translations associated with an address space of a process 1751 * that is being swapped out. 1752 */ 1753 void 1754 hat_swapout(hat_t *hat) 1755 { 1756 uintptr_t vaddr = (uintptr_t)0; 1757 uintptr_t eaddr = _userlimit; 1758 htable_t *ht = NULL; 1759 level_t l; 1760 1761 XPV_DISALLOW_MIGRATE(); 1762 /* 1763 * We can't just call hat_unload(hat, 0, _userlimit...) here, because 1764 * seg_spt and shared pagetables can't be swapped out. 1765 * Take a look at segspt_shmswapout() - it's a big no-op. 1766 * 1767 * Instead we'll walk through all the address space and unload 1768 * any mappings which we are sure are not shared, not locked. 
1769 */ 1770 ASSERT(IS_PAGEALIGNED(vaddr)); 1771 ASSERT(IS_PAGEALIGNED(eaddr)); 1772 ASSERT(AS_LOCK_HELD(hat->hat_as)); 1773 if ((uintptr_t)hat->hat_as->a_userlimit < eaddr) 1774 eaddr = (uintptr_t)hat->hat_as->a_userlimit; 1775 1776 while (vaddr < eaddr) { 1777 (void) htable_walk(hat, &ht, &vaddr, eaddr); 1778 if (ht == NULL) 1779 break; 1780 1781 ASSERT(!IN_VA_HOLE(vaddr)); 1782 1783 /* 1784 * If the page table is shared skip its entire range. 1785 */ 1786 l = ht->ht_level; 1787 if (ht->ht_flags & HTABLE_SHARED_PFN) { 1788 vaddr = ht->ht_vaddr + LEVEL_SIZE(l + 1); 1789 htable_release(ht); 1790 ht = NULL; 1791 continue; 1792 } 1793 1794 /* 1795 * If the page table has no locked entries, unload this one. 1796 */ 1797 if (ht->ht_lock_cnt == 0) 1798 hat_unload(hat, (caddr_t)vaddr, LEVEL_SIZE(l), 1799 HAT_UNLOAD_UNMAP); 1800 1801 /* 1802 * If we have a level 0 page table with locked entries, 1803 * skip the entire page table, otherwise skip just one entry. 1804 */ 1805 if (ht->ht_lock_cnt > 0 && l == 0) 1806 vaddr = ht->ht_vaddr + LEVEL_SIZE(1); 1807 else 1808 vaddr += LEVEL_SIZE(l); 1809 } 1810 if (ht) 1811 htable_release(ht); 1812 1813 /* 1814 * We're in swapout because the system is low on memory, so 1815 * go back and flush all the htables off the cached list. 1816 */ 1817 htable_purge_hat(hat); 1818 XPV_ALLOW_MIGRATE(); 1819 } 1820 1821 /* 1822 * returns number of bytes that have valid mappings in hat. 1823 */ 1824 size_t 1825 hat_get_mapped_size(hat_t *hat) 1826 { 1827 size_t total = 0; 1828 int l; 1829 1830 for (l = 0; l <= mmu.max_page_level; l++) 1831 total += (hat->hat_pages_mapped[l] << LEVEL_SHIFT(l)); 1832 total += hat->hat_ism_pgcnt; 1833 1834 return (total); 1835 } 1836 1837 /* 1838 * enable/disable collection of stats for hat. 1839 */ 1840 int 1841 hat_stats_enable(hat_t *hat) 1842 { 1843 atomic_inc_32(&hat->hat_stats); 1844 return (1); 1845 } 1846 1847 void 1848 hat_stats_disable(hat_t *hat) 1849 { 1850 atomic_dec_32(&hat->hat_stats); 1851 } 1852 1853 /* 1854 * Utility to sync the ref/mod bits from a page table entry to the page_t 1855 * We must be holding the mapping list lock when this is called. 1856 */ 1857 static void 1858 hati_sync_pte_to_page(page_t *pp, x86pte_t pte, level_t level) 1859 { 1860 uint_t rm = 0; 1861 pgcnt_t pgcnt; 1862 1863 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC) 1864 return; 1865 1866 if (PTE_GET(pte, PT_REF)) 1867 rm |= P_REF; 1868 1869 if (PTE_GET(pte, PT_MOD)) 1870 rm |= P_MOD; 1871 1872 if (rm == 0) 1873 return; 1874 1875 /* 1876 * sync to all constituent pages of a large page 1877 */ 1878 ASSERT(x86_hm_held(pp)); 1879 pgcnt = page_get_pagecnt(level); 1880 ASSERT(IS_P2ALIGNED(pp->p_pagenum, pgcnt)); 1881 for (; pgcnt > 0; --pgcnt) { 1882 /* 1883 * hat_page_demote() can't decrease 1884 * pszc below this mapping size 1885 * since this large mapping existed after we 1886 * took mlist lock. 1887 */ 1888 ASSERT(pp->p_szc >= level); 1889 hat_page_setattr(pp, rm); 1890 ++pp; 1891 } 1892 } 1893 1894 /* 1895 * This the set of PTE bits for PFN, permissions and caching 1896 * that are allowed to change on a HAT_LOAD_REMAP 1897 */ 1898 #define PT_REMAP_BITS \ 1899 (PT_PADDR | PT_NX | PT_WRITABLE | PT_WRITETHRU | \ 1900 PT_NOCACHE | PT_PAT_4K | PT_PAT_LARGE | PT_IGNORE | PT_REF | PT_MOD) 1901 1902 #define REMAPASSERT(EX) if (!(EX)) panic("hati_pte_map: " #EX) 1903 /* 1904 * Do the low-level work to get a mapping entered into a HAT's pagetables 1905 * and in the mapping list of the associated page_t. 
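 *
 * (Clarifying note on the contract implemented below: the return value
 * is 0 on success, including the case where an equivalent mapping was
 * already present, and -1 only when x86pte_set() reports LPAGE_ERROR,
 * i.e. a large page / page table collision. Callers react to -1 by
 * retrying the load at a smaller page size.)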
1906 */ 1907 static int 1908 hati_pte_map( 1909 htable_t *ht, 1910 uint_t entry, 1911 page_t *pp, 1912 x86pte_t pte, 1913 int flags, 1914 void *pte_ptr) 1915 { 1916 hat_t *hat = ht->ht_hat; 1917 x86pte_t old_pte; 1918 level_t l = ht->ht_level; 1919 hment_t *hm; 1920 uint_t is_consist; 1921 uint_t is_locked; 1922 int rv = 0; 1923 1924 /* 1925 * Is this a consistent (ie. need mapping list lock) mapping? 1926 */ 1927 is_consist = (pp != NULL && (flags & HAT_LOAD_NOCONSIST) == 0); 1928 1929 /* 1930 * Track locked mapping count in the htable. Do this first, 1931 * as we track locking even if there already is a mapping present. 1932 */ 1933 is_locked = (flags & HAT_LOAD_LOCK) != 0 && hat != kas.a_hat; 1934 if (is_locked) 1935 HTABLE_LOCK_INC(ht); 1936 1937 /* 1938 * Acquire the page's mapping list lock and get an hment to use. 1939 * Note that hment_prepare() might return NULL. 1940 */ 1941 if (is_consist) { 1942 x86_hm_enter(pp); 1943 hm = hment_prepare(ht, entry, pp); 1944 } 1945 1946 /* 1947 * Set the new pte, retrieving the old one at the same time. 1948 */ 1949 old_pte = x86pte_set(ht, entry, pte, pte_ptr); 1950 1951 /* 1952 * Did we get a large page / page table collision? 1953 */ 1954 if (old_pte == LPAGE_ERROR) { 1955 if (is_locked) 1956 HTABLE_LOCK_DEC(ht); 1957 rv = -1; 1958 goto done; 1959 } 1960 1961 /* 1962 * If the mapping didn't change there is nothing more to do. 1963 */ 1964 if (PTE_EQUIV(pte, old_pte)) 1965 goto done; 1966 1967 /* 1968 * Install a new mapping in the page's mapping list 1969 */ 1970 if (!PTE_ISVALID(old_pte)) { 1971 if (is_consist) { 1972 hment_assign(ht, entry, pp, hm); 1973 x86_hm_exit(pp); 1974 } else { 1975 ASSERT(flags & HAT_LOAD_NOCONSIST); 1976 } 1977 #if defined(__amd64) 1978 if (ht->ht_flags & HTABLE_COPIED) { 1979 cpu_t *cpu = CPU; 1980 hat_pcp_update(cpu, hat); 1981 } 1982 #endif 1983 HTABLE_INC(ht->ht_valid_cnt); 1984 PGCNT_INC(hat, l); 1985 return (rv); 1986 } 1987 1988 /* 1989 * Remap's are more complicated: 1990 * - HAT_LOAD_REMAP must be specified if changing the pfn. 1991 * We also require that NOCONSIST be specified. 1992 * - Otherwise only permission or caching bits may change. 1993 */ 1994 if (!PTE_ISPAGE(old_pte, l)) 1995 panic("non-null/page mapping pte=" FMT_PTE, old_pte); 1996 1997 if (PTE2PFN(old_pte, l) != PTE2PFN(pte, l)) { 1998 REMAPASSERT(flags & HAT_LOAD_REMAP); 1999 REMAPASSERT(flags & HAT_LOAD_NOCONSIST); 2000 REMAPASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST); 2001 REMAPASSERT(pf_is_memory(PTE2PFN(old_pte, l)) == 2002 pf_is_memory(PTE2PFN(pte, l))); 2003 REMAPASSERT(!is_consist); 2004 } 2005 2006 /* 2007 * We only let remaps change the certain bits in the PTE. 2008 */ 2009 if (PTE_GET(old_pte, ~PT_REMAP_BITS) != PTE_GET(pte, ~PT_REMAP_BITS)) 2010 panic("remap bits changed: old_pte="FMT_PTE", pte="FMT_PTE"\n", 2011 old_pte, pte); 2012 2013 /* 2014 * We don't create any mapping list entries on a remap, so release 2015 * any allocated hment after we drop the mapping list lock. 2016 */ 2017 done: 2018 if (is_consist) { 2019 x86_hm_exit(pp); 2020 if (hm != NULL) 2021 hment_free(hm); 2022 } 2023 return (rv); 2024 } 2025 2026 /* 2027 * Internal routine to load a single page table entry. This only fails if 2028 * we attempt to overwrite a page table link with a large page. 
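 *
 * An illustrative sketch of how callers handle that failure; this
 * mirrors the retry loops in hat_memload_array() and hat_devload()
 * later in this file:
 *
 *	while (hati_load_common(hat, va, pp, attr, flags, level, pfn) != 0) {
 *		if (level == 0)
 *			panic("unexpected hati_load_common() failure");
 *		--level;
 *		pgsize = LEVEL_SIZE(level);
 *	}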
2029 */ 2030 static int 2031 hati_load_common( 2032 hat_t *hat, 2033 uintptr_t va, 2034 page_t *pp, 2035 uint_t attr, 2036 uint_t flags, 2037 level_t level, 2038 pfn_t pfn) 2039 { 2040 htable_t *ht; 2041 uint_t entry; 2042 x86pte_t pte; 2043 int rv = 0; 2044 2045 /* 2046 * The number 16 is arbitrary and here to catch a recursion problem 2047 * early before we blow out the kernel stack. 2048 */ 2049 ++curthread->t_hatdepth; 2050 ASSERT(curthread->t_hatdepth < 16); 2051 2052 ASSERT(hat == kas.a_hat || (hat->hat_flags & HAT_PCP) != 0 || 2053 AS_LOCK_HELD(hat->hat_as)); 2054 2055 if (flags & HAT_LOAD_SHARE) 2056 hat->hat_flags |= HAT_SHARED; 2057 2058 /* 2059 * Find the page table that maps this page if it already exists. 2060 */ 2061 ht = htable_lookup(hat, va, level); 2062 2063 /* 2064 * We must have HAT_LOAD_NOCONSIST if page_t is NULL. 2065 */ 2066 if (pp == NULL) 2067 flags |= HAT_LOAD_NOCONSIST; 2068 2069 if (ht == NULL) { 2070 ht = htable_create(hat, va, level, NULL); 2071 ASSERT(ht != NULL); 2072 } 2073 /* 2074 * htable_va2entry checks this condition as well, but it won't include 2075 * much useful info in the panic. So we do it in advance here to include 2076 * all the context. 2077 */ 2078 if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht)) { 2079 panic("hati_load_common: bad htable: va=%p, last page=%p, " 2080 "ht->ht_vaddr=%p, ht->ht_level=%d", (void *)va, 2081 (void *)HTABLE_LAST_PAGE(ht), (void *)ht->ht_vaddr, 2082 (int)ht->ht_level); 2083 } 2084 entry = htable_va2entry(va, ht); 2085 2086 /* 2087 * a bunch of paranoid error checking 2088 */ 2089 ASSERT(ht->ht_busy > 0); 2090 ASSERT(ht->ht_level == level); 2091 2092 /* 2093 * construct the new PTE 2094 */ 2095 if (hat == kas.a_hat) 2096 attr &= ~PROT_USER; 2097 pte = hati_mkpte(pfn, attr, level, flags); 2098 if (hat == kas.a_hat && va >= kernelbase) 2099 PTE_SET(pte, mmu.pt_global); 2100 2101 /* 2102 * establish the mapping 2103 */ 2104 rv = hati_pte_map(ht, entry, pp, pte, flags, NULL); 2105 2106 /* 2107 * release the htable and any reserves 2108 */ 2109 htable_release(ht); 2110 --curthread->t_hatdepth; 2111 return (rv); 2112 } 2113 2114 /* 2115 * special case of hat_memload to deal with some kernel addrs for performance 2116 */ 2117 static void 2118 hat_kmap_load( 2119 caddr_t addr, 2120 page_t *pp, 2121 uint_t attr, 2122 uint_t flags) 2123 { 2124 uintptr_t va = (uintptr_t)addr; 2125 x86pte_t pte; 2126 pfn_t pfn = page_pptonum(pp); 2127 pgcnt_t pg_off = mmu_btop(va - mmu.kmap_addr); 2128 htable_t *ht; 2129 uint_t entry; 2130 void *pte_ptr; 2131 2132 /* 2133 * construct the requested PTE 2134 */ 2135 attr &= ~PROT_USER; 2136 attr |= HAT_STORECACHING_OK; 2137 pte = hati_mkpte(pfn, attr, 0, flags); 2138 PTE_SET(pte, mmu.pt_global); 2139 2140 /* 2141 * Figure out the pte_ptr and htable and use common code to finish up 2142 */ 2143 if (mmu.pae_hat) 2144 pte_ptr = mmu.kmap_ptes + pg_off; 2145 else 2146 pte_ptr = (x86pte32_t *)mmu.kmap_ptes + pg_off; 2147 ht = mmu.kmap_htables[(va - mmu.kmap_htables[0]->ht_vaddr) >> 2148 LEVEL_SHIFT(1)]; 2149 entry = htable_va2entry(va, ht); 2150 ++curthread->t_hatdepth; 2151 ASSERT(curthread->t_hatdepth < 16); 2152 (void) hati_pte_map(ht, entry, pp, pte, flags, pte_ptr); 2153 --curthread->t_hatdepth; 2154 } 2155 2156 /* 2157 * hat_memload() - load a translation to the given page struct 2158 * 2159 * Flags for hat_memload/hat_devload/hat_*attr. 2160 * 2161 * HAT_LOAD Default flags to load a translation to the page. 
2162 * 2163 * HAT_LOAD_LOCK Lock down mapping resources; hat_map(), hat_memload(), 2164 * and hat_devload(). 2165 * 2166 * HAT_LOAD_NOCONSIST Do not add mapping to page_t mapping list. 2167 * sets PT_NOCONSIST 2168 * 2169 * HAT_LOAD_SHARE A flag to hat_memload() to indicate h/w page tables 2170 * that map some user pages (not kas) is shared by more 2171 * than one process (eg. ISM). 2172 * 2173 * HAT_LOAD_REMAP Reload a valid pte with a different page frame. 2174 * 2175 * HAT_NO_KALLOC Do not kmem_alloc while creating the mapping; at this 2176 * point, it's setting up mapping to allocate internal 2177 * hat layer data structures. This flag forces hat layer 2178 * to tap its reserves in order to prevent infinite 2179 * recursion. 2180 * 2181 * The following is a protection attribute (like PROT_READ, etc.) 2182 * 2183 * HAT_NOSYNC set PT_NOSYNC - this mapping's ref/mod bits 2184 * are never cleared. 2185 * 2186 * Installing new valid PTE's and creation of the mapping list 2187 * entry are controlled under the same lock. It's derived from the 2188 * page_t being mapped. 2189 */ 2190 static uint_t supported_memload_flags = 2191 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_ADV | HAT_LOAD_NOCONSIST | 2192 HAT_LOAD_SHARE | HAT_NO_KALLOC | HAT_LOAD_REMAP | HAT_LOAD_TEXT; 2193 2194 void 2195 hat_memload( 2196 hat_t *hat, 2197 caddr_t addr, 2198 page_t *pp, 2199 uint_t attr, 2200 uint_t flags) 2201 { 2202 uintptr_t va = (uintptr_t)addr; 2203 level_t level = 0; 2204 pfn_t pfn = page_pptonum(pp); 2205 2206 XPV_DISALLOW_MIGRATE(); 2207 ASSERT(IS_PAGEALIGNED(va)); 2208 ASSERT(hat == kas.a_hat || va < _userlimit); 2209 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 2210 ASSERT((flags & supported_memload_flags) == flags); 2211 2212 ASSERT(!IN_VA_HOLE(va)); 2213 ASSERT(!PP_ISFREE(pp)); 2214 2215 /* 2216 * kernel address special case for performance. 2217 */ 2218 if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) { 2219 ASSERT(hat == kas.a_hat); 2220 hat_kmap_load(addr, pp, attr, flags); 2221 XPV_ALLOW_MIGRATE(); 2222 return; 2223 } 2224 2225 /* 2226 * This is used for memory with normal caching enabled, so 2227 * always set HAT_STORECACHING_OK. 2228 */ 2229 attr |= HAT_STORECACHING_OK; 2230 if (hati_load_common(hat, va, pp, attr, flags, level, pfn) != 0) 2231 panic("unexpected hati_load_common() failure"); 2232 XPV_ALLOW_MIGRATE(); 2233 } 2234 2235 /* ARGSUSED */ 2236 void 2237 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp, 2238 uint_t attr, uint_t flags, hat_region_cookie_t rcookie) 2239 { 2240 hat_memload(hat, addr, pp, attr, flags); 2241 } 2242 2243 /* 2244 * Load the given array of page structs using large pages when possible 2245 */ 2246 void 2247 hat_memload_array( 2248 hat_t *hat, 2249 caddr_t addr, 2250 size_t len, 2251 page_t **pages, 2252 uint_t attr, 2253 uint_t flags) 2254 { 2255 uintptr_t va = (uintptr_t)addr; 2256 uintptr_t eaddr = va + len; 2257 level_t level; 2258 size_t pgsize; 2259 pgcnt_t pgindx = 0; 2260 pfn_t pfn; 2261 pgcnt_t i; 2262 2263 XPV_DISALLOW_MIGRATE(); 2264 ASSERT(IS_PAGEALIGNED(va)); 2265 ASSERT(hat == kas.a_hat || va + len <= _userlimit); 2266 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 2267 ASSERT((flags & supported_memload_flags) == flags); 2268 2269 /* 2270 * memload is used for memory with full caching enabled, so 2271 * set HAT_STORECACHING_OK. 2272 */ 2273 attr |= HAT_STORECACHING_OK; 2274 2275 /* 2276 * handle all pages using largest possible pagesize 2277 */ 2278 while (va < eaddr) { 2279 /* 2280 * decide what level mapping to use (ie. 
pagesize) 2281 */ 2282 pfn = page_pptonum(pages[pgindx]); 2283 for (level = mmu.max_page_level; ; --level) { 2284 pgsize = LEVEL_SIZE(level); 2285 if (level == 0) 2286 break; 2287 2288 if (!IS_P2ALIGNED(va, pgsize) || 2289 (eaddr - va) < pgsize || 2290 !IS_P2ALIGNED(pfn_to_pa(pfn), pgsize)) 2291 continue; 2292 2293 /* 2294 * To use a large mapping of this size, all the 2295 * pages we are passed must be sequential subpages 2296 * of the large page. 2297 * hat_page_demote() can't change p_szc because 2298 * all pages are locked. 2299 */ 2300 if (pages[pgindx]->p_szc >= level) { 2301 for (i = 0; i < mmu_btop(pgsize); ++i) { 2302 if (pfn + i != 2303 page_pptonum(pages[pgindx + i])) 2304 break; 2305 ASSERT(pages[pgindx + i]->p_szc >= 2306 level); 2307 ASSERT(pages[pgindx] + i == 2308 pages[pgindx + i]); 2309 } 2310 if (i == mmu_btop(pgsize)) { 2311 #ifdef DEBUG 2312 if (level == 2) 2313 map1gcnt++; 2314 #endif 2315 break; 2316 } 2317 } 2318 } 2319 2320 /* 2321 * Load this page mapping. If the load fails, try a smaller 2322 * pagesize. 2323 */ 2324 ASSERT(!IN_VA_HOLE(va)); 2325 while (hati_load_common(hat, va, pages[pgindx], attr, 2326 flags, level, pfn) != 0) { 2327 if (level == 0) 2328 panic("unexpected hati_load_common() failure"); 2329 --level; 2330 pgsize = LEVEL_SIZE(level); 2331 } 2332 2333 /* 2334 * move to next page 2335 */ 2336 va += pgsize; 2337 pgindx += mmu_btop(pgsize); 2338 } 2339 XPV_ALLOW_MIGRATE(); 2340 } 2341 2342 /* ARGSUSED */ 2343 void 2344 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len, 2345 struct page **pps, uint_t attr, uint_t flags, 2346 hat_region_cookie_t rcookie) 2347 { 2348 hat_memload_array(hat, addr, len, pps, attr, flags); 2349 } 2350 2351 /* 2352 * void hat_devload(hat, addr, len, pf, attr, flags) 2353 * load/lock the given page frame number 2354 * 2355 * Advisory ordering attributes. Apply only to device mappings. 2356 * 2357 * HAT_STRICTORDER: the CPU must issue the references in order, as the 2358 * programmer specified. This is the default. 2359 * HAT_UNORDERED_OK: the CPU may reorder the references (this is all kinds 2360 * of reordering; store or load with store or load). 2361 * HAT_MERGING_OK: merging and batching: the CPU may merge individual stores 2362 * to consecutive locations (for example, turn two consecutive byte 2363 * stores into one halfword store), and it may batch individual loads 2364 * (for example, turn two consecutive byte loads into one halfword load). 2365 * This also implies re-ordering. 2366 * HAT_LOADCACHING_OK: the CPU may cache the data it fetches and reuse it 2367 * until another store occurs. The default is to fetch new data 2368 * on every load. This also implies merging. 2369 * HAT_STORECACHING_OK: the CPU may keep the data in the cache and push it to 2370 * the device (perhaps with other data) at a later time. The default is 2371 * to push the data right away. This also implies load caching. 2372 * 2373 * Equivalent of hat_memload(), but can be used for device memory where 2374 * there are no page_t's and we support additional flags (write merging, etc). 2375 * Note that we can have large page mappings with this interface. 
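 *
 * An illustrative (hypothetical) call mapping one strictly ordered,
 * locked device page into the kernel; va and pfn here are stand-ins,
 * not values taken from this file. HAT_STRICTORDER is the default
 * (value 0), so naming it is purely documentary:
 *
 *	hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn,
 *	    PROT_READ | PROT_WRITE | HAT_STRICTORDER, HAT_LOAD_LOCK);
 *
 * Because a device pfn is not memory (pf_is_memory() fails), the code
 * below forces HAT_LOAD_NOCONSIST, so no mapping list entry is created.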
2376 */ 2377 int supported_devload_flags = HAT_LOAD | HAT_LOAD_LOCK | 2378 HAT_LOAD_NOCONSIST | HAT_STRICTORDER | HAT_UNORDERED_OK | 2379 HAT_MERGING_OK | HAT_LOADCACHING_OK | HAT_STORECACHING_OK; 2380 2381 void 2382 hat_devload( 2383 hat_t *hat, 2384 caddr_t addr, 2385 size_t len, 2386 pfn_t pfn, 2387 uint_t attr, 2388 int flags) 2389 { 2390 uintptr_t va = ALIGN2PAGE(addr); 2391 uintptr_t eva = va + len; 2392 level_t level; 2393 size_t pgsize; 2394 page_t *pp; 2395 int f; /* per PTE copy of flags - maybe modified */ 2396 uint_t a; /* per PTE copy of attr */ 2397 2398 XPV_DISALLOW_MIGRATE(); 2399 ASSERT(IS_PAGEALIGNED(va)); 2400 ASSERT(hat == kas.a_hat || eva <= _userlimit); 2401 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 2402 ASSERT((flags & supported_devload_flags) == flags); 2403 2404 /* 2405 * handle all pages 2406 */ 2407 while (va < eva) { 2408 2409 /* 2410 * decide what level mapping to use (ie. pagesize) 2411 */ 2412 for (level = mmu.max_page_level; ; --level) { 2413 pgsize = LEVEL_SIZE(level); 2414 if (level == 0) 2415 break; 2416 if (IS_P2ALIGNED(va, pgsize) && 2417 (eva - va) >= pgsize && 2418 IS_P2ALIGNED(pfn, mmu_btop(pgsize))) { 2419 #ifdef DEBUG 2420 if (level == 2) 2421 map1gcnt++; 2422 #endif 2423 break; 2424 } 2425 } 2426 2427 /* 2428 * If this is just memory then allow caching (this happens 2429 * for the nucleus pages) - though HAT_PLAT_NOCACHE can be used 2430 * to override that. If we don't have a page_t then make sure 2431 * NOCONSIST is set. 2432 */ 2433 a = attr; 2434 f = flags; 2435 if (!pf_is_memory(pfn)) 2436 f |= HAT_LOAD_NOCONSIST; 2437 else if (!(a & HAT_PLAT_NOCACHE)) 2438 a |= HAT_STORECACHING_OK; 2439 2440 if (f & HAT_LOAD_NOCONSIST) 2441 pp = NULL; 2442 else 2443 pp = page_numtopp_nolock(pfn); 2444 2445 /* 2446 * Check to make sure we are really trying to map a valid 2447 * memory page. The caller wishing to intentionally map 2448 * free memory pages will have passed the HAT_LOAD_NOCONSIST 2449 * flag, then pp will be NULL. 2450 */ 2451 if (pp != NULL) { 2452 if (PP_ISFREE(pp)) { 2453 panic("hat_devload: loading " 2454 "a mapping to free page %p", (void *)pp); 2455 } 2456 2457 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 2458 panic("hat_devload: loading a mapping " 2459 "to an unlocked page %p", 2460 (void *)pp); 2461 } 2462 } 2463 2464 /* 2465 * load this page mapping 2466 */ 2467 ASSERT(!IN_VA_HOLE(va)); 2468 while (hati_load_common(hat, va, pp, a, f, level, pfn) != 0) { 2469 if (level == 0) 2470 panic("unexpected hati_load_common() failure"); 2471 --level; 2472 pgsize = LEVEL_SIZE(level); 2473 } 2474 2475 /* 2476 * move to next page 2477 */ 2478 va += pgsize; 2479 pfn += mmu_btop(pgsize); 2480 } 2481 XPV_ALLOW_MIGRATE(); 2482 } 2483 2484 /* 2485 * void hat_unlock(hat, addr, len) 2486 * unlock the mappings to a given range of addresses 2487 * 2488 * Locks are tracked by ht_lock_cnt in the htable. 
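 *
 * Illustrative pairing (as, addr and pp are hypothetical stand-ins, not
 * taken from this file): a translation loaded with HAT_LOAD_LOCK is
 * later released over the same range, which drops ht_lock_cnt back down:
 *
 *	hat_memload(as->a_hat, addr, pp, PROT_READ | PROT_WRITE,
 *	    HAT_LOAD_LOCK);
 *	...
 *	hat_unlock(as->a_hat, addr, MMU_PAGESIZE);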
2489 */ 2490 void 2491 hat_unlock(hat_t *hat, caddr_t addr, size_t len) 2492 { 2493 uintptr_t vaddr = (uintptr_t)addr; 2494 uintptr_t eaddr = vaddr + len; 2495 htable_t *ht = NULL; 2496 2497 /* 2498 * kernel entries are always locked, we don't track lock counts 2499 */ 2500 ASSERT(hat == kas.a_hat || eaddr <= _userlimit); 2501 ASSERT(IS_PAGEALIGNED(vaddr)); 2502 ASSERT(IS_PAGEALIGNED(eaddr)); 2503 if (hat == kas.a_hat) 2504 return; 2505 if (eaddr > _userlimit) 2506 panic("hat_unlock() address out of range - above _userlimit"); 2507 2508 XPV_DISALLOW_MIGRATE(); 2509 ASSERT(AS_LOCK_HELD(hat->hat_as)); 2510 while (vaddr < eaddr) { 2511 (void) htable_walk(hat, &ht, &vaddr, eaddr); 2512 if (ht == NULL) 2513 break; 2514 2515 ASSERT(!IN_VA_HOLE(vaddr)); 2516 2517 if (ht->ht_lock_cnt < 1) 2518 panic("hat_unlock(): lock_cnt < 1, " 2519 "htable=%p, vaddr=%p\n", (void *)ht, (void *)vaddr); 2520 HTABLE_LOCK_DEC(ht); 2521 2522 vaddr += LEVEL_SIZE(ht->ht_level); 2523 } 2524 if (ht) 2525 htable_release(ht); 2526 XPV_ALLOW_MIGRATE(); 2527 } 2528 2529 /* ARGSUSED */ 2530 void 2531 hat_unlock_region(struct hat *hat, caddr_t addr, size_t len, 2532 hat_region_cookie_t rcookie) 2533 { 2534 panic("No shared region support on x86"); 2535 } 2536 2537 #if !defined(__xpv) 2538 /* 2539 * Cross call service routine to demap a range of virtual 2540 * pages on the current CPU or flush all mappings in TLB. 2541 */ 2542 static int 2543 hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) 2544 { 2545 _NOTE(ARGUNUSED(a3)); 2546 hat_t *hat = (hat_t *)a1; 2547 tlb_range_t *range = (tlb_range_t *)a2; 2548 2549 /* 2550 * If the target hat isn't the kernel and this CPU isn't operating 2551 * in the target hat, we can ignore the cross call. 2552 */ 2553 if (hat != kas.a_hat && hat != CPU->cpu_current_hat) 2554 return (0); 2555 2556 if (range->tr_va != DEMAP_ALL_ADDR) { 2557 mmu_flush_tlb(FLUSH_TLB_RANGE, range); 2558 return (0); 2559 } 2560 2561 /* 2562 * We are flushing all of userspace. 2563 * 2564 * When using PCP, we first need to update this CPU's idea of the PCP 2565 * PTEs. 2566 */ 2567 if (hat->hat_flags & HAT_COPIED) { 2568 #if defined(__amd64) 2569 hat_pcp_update(CPU, hat); 2570 #elif defined(__i386) 2571 reload_pae32(hat, CPU); 2572 #endif 2573 } 2574 2575 mmu_flush_tlb(FLUSH_TLB_NONGLOBAL, NULL); 2576 return (0); 2577 } 2578 2579 #define TLBIDLE_CPU_HALTED (0x1UL) 2580 #define TLBIDLE_INVAL_ALL (0x2UL) 2581 #define CAS_TLB_INFO(cpu, old, new) \ 2582 atomic_cas_ulong((ulong_t *)&(cpu)->cpu_m.mcpu_tlb_info, (old), (new)) 2583 2584 /* 2585 * Record that a CPU is going idle 2586 */ 2587 void 2588 tlb_going_idle(void) 2589 { 2590 atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info, 2591 TLBIDLE_CPU_HALTED); 2592 } 2593 2594 /* 2595 * Service a delayed TLB flush if coming out of being idle. 2596 * It will be called from cpu idle notification with interrupt disabled. 2597 */ 2598 void 2599 tlb_service(void) 2600 { 2601 ulong_t tlb_info; 2602 ulong_t found; 2603 2604 /* 2605 * We only have to do something if coming out of being idle. 2606 */ 2607 tlb_info = CPU->cpu_m.mcpu_tlb_info; 2608 if (tlb_info & TLBIDLE_CPU_HALTED) { 2609 ASSERT(CPU->cpu_current_hat == kas.a_hat); 2610 2611 /* 2612 * Atomic clear and fetch of old state. 
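 *
 * (Descriptive note: the loop below retries because another CPU can set
 * TLBIDLE_INVAL_ALL between our read of mcpu_tlb_info and the CAS; on a
 * failed CAS we adopt the value that was found and try again until we
 * have atomically fetched and cleared the state.)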
2613 */ 2614 while ((found = CAS_TLB_INFO(CPU, tlb_info, 0)) != tlb_info) { 2615 ASSERT(found & TLBIDLE_CPU_HALTED); 2616 tlb_info = found; 2617 SMT_PAUSE(); 2618 } 2619 if (tlb_info & TLBIDLE_INVAL_ALL) 2620 mmu_flush_tlb(FLUSH_TLB_ALL, NULL); 2621 } 2622 } 2623 #endif /* !__xpv */ 2624 2625 /* 2626 * Internal routine to do cross calls to invalidate a range of pages on 2627 * all CPUs using a given hat. 2628 */ 2629 void 2630 hat_tlb_inval_range(hat_t *hat, tlb_range_t *in_range) 2631 { 2632 extern int flushes_require_xcalls; /* from mp_startup.c */ 2633 cpuset_t justme; 2634 cpuset_t cpus_to_shootdown; 2635 tlb_range_t range = *in_range; 2636 #ifndef __xpv 2637 cpuset_t check_cpus; 2638 cpu_t *cpup; 2639 int c; 2640 #endif 2641 2642 /* 2643 * If the hat is being destroyed, there are no more users, so 2644 * demap need not do anything. 2645 */ 2646 if (hat->hat_flags & HAT_FREEING) 2647 return; 2648 2649 /* 2650 * If demapping from a shared pagetable, we best demap the 2651 * entire set of user TLBs, since we don't know what addresses 2652 * these were shared at. 2653 */ 2654 if (hat->hat_flags & HAT_SHARED) { 2655 hat = kas.a_hat; 2656 range.tr_va = DEMAP_ALL_ADDR; 2657 } 2658 2659 /* 2660 * if not running with multiple CPUs, don't use cross calls 2661 */ 2662 if (panicstr || !flushes_require_xcalls) { 2663 #ifdef __xpv 2664 if (range.tr_va == DEMAP_ALL_ADDR) { 2665 xen_flush_tlb(); 2666 } else { 2667 for (size_t i = 0; i < TLB_RANGE_LEN(&range); 2668 i += MMU_PAGESIZE) { 2669 xen_flush_va((caddr_t)(range.tr_va + i)); 2670 } 2671 } 2672 #else 2673 (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0); 2674 #endif 2675 return; 2676 } 2677 2678 2679 /* 2680 * Determine CPUs to shootdown. Kernel changes always do all CPUs. 2681 * Otherwise it's just CPUs currently executing in this hat. 2682 */ 2683 kpreempt_disable(); 2684 CPUSET_ONLY(justme, CPU->cpu_id); 2685 if (hat == kas.a_hat) 2686 cpus_to_shootdown = khat_cpuset; 2687 else 2688 cpus_to_shootdown = hat->hat_cpus; 2689 2690 #ifndef __xpv 2691 /* 2692 * If any CPUs in the set are idle, just request a delayed flush 2693 * and avoid waking them up. 
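 *
 * (Descriptive note on the handshake below: for each CPU whose
 * mcpu_tlb_info shows only TLBIDLE_CPU_HALTED, we CAS in
 * TLBIDLE_INVAL_ALL so that tlb_service() flushes when that CPU leaves
 * idle, and then drop it from cpus_to_shootdown. If the CPU is no
 * longer simply halted, the CAS loop exits and it is shot down
 * normally.)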
2694 */ 2695 check_cpus = cpus_to_shootdown; 2696 for (c = 0; c < NCPU && !CPUSET_ISNULL(check_cpus); ++c) { 2697 ulong_t tlb_info; 2698 2699 if (!CPU_IN_SET(check_cpus, c)) 2700 continue; 2701 CPUSET_DEL(check_cpus, c); 2702 cpup = cpu[c]; 2703 if (cpup == NULL) 2704 continue; 2705 2706 tlb_info = cpup->cpu_m.mcpu_tlb_info; 2707 while (tlb_info == TLBIDLE_CPU_HALTED) { 2708 (void) CAS_TLB_INFO(cpup, TLBIDLE_CPU_HALTED, 2709 TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL); 2710 SMT_PAUSE(); 2711 tlb_info = cpup->cpu_m.mcpu_tlb_info; 2712 } 2713 if (tlb_info == (TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL)) { 2714 HATSTAT_INC(hs_tlb_inval_delayed); 2715 CPUSET_DEL(cpus_to_shootdown, c); 2716 } 2717 } 2718 #endif 2719 2720 if (CPUSET_ISNULL(cpus_to_shootdown) || 2721 CPUSET_ISEQUAL(cpus_to_shootdown, justme)) { 2722 2723 #ifdef __xpv 2724 if (range.tr_va == DEMAP_ALL_ADDR) { 2725 xen_flush_tlb(); 2726 } else { 2727 for (size_t i = 0; i < TLB_RANGE_LEN(&range); 2728 i += MMU_PAGESIZE) { 2729 xen_flush_va((caddr_t)(range.tr_va + i)); 2730 } 2731 } 2732 #else 2733 (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0); 2734 #endif 2735 2736 } else { 2737 2738 CPUSET_ADD(cpus_to_shootdown, CPU->cpu_id); 2739 #ifdef __xpv 2740 if (range.tr_va == DEMAP_ALL_ADDR) { 2741 xen_gflush_tlb(cpus_to_shootdown); 2742 } else { 2743 for (size_t i = 0; i < TLB_RANGE_LEN(&range); 2744 i += MMU_PAGESIZE) { 2745 xen_gflush_va((caddr_t)(range.tr_va + i), 2746 cpus_to_shootdown); 2747 } 2748 } 2749 #else 2750 xc_call((xc_arg_t)hat, (xc_arg_t)&range, 0, 2751 CPUSET2BV(cpus_to_shootdown), hati_demap_func); 2752 #endif 2753 2754 } 2755 kpreempt_enable(); 2756 } 2757 2758 void 2759 hat_tlb_inval(hat_t *hat, uintptr_t va) 2760 { 2761 /* 2762 * Create range for a single page. 2763 */ 2764 tlb_range_t range; 2765 range.tr_va = va; 2766 range.tr_cnt = 1; /* one page */ 2767 range.tr_level = MIN_PAGE_LEVEL; /* pages are MMU_PAGESIZE */ 2768 2769 hat_tlb_inval_range(hat, &range); 2770 } 2771 2772 /* 2773 * Interior routine for HAT_UNLOADs from hat_unload_callback(), 2774 * hat_kmap_unload() OR from hat_steal() code. This routine doesn't 2775 * handle releasing of the htables. 2776 */ 2777 void 2778 hat_pte_unmap( 2779 htable_t *ht, 2780 uint_t entry, 2781 uint_t flags, 2782 x86pte_t old_pte, 2783 void *pte_ptr, 2784 boolean_t tlb) 2785 { 2786 hat_t *hat = ht->ht_hat; 2787 hment_t *hm = NULL; 2788 page_t *pp = NULL; 2789 level_t l = ht->ht_level; 2790 pfn_t pfn; 2791 2792 /* 2793 * We always track the locking counts, even if nothing is unmapped 2794 */ 2795 if ((flags & HAT_UNLOAD_UNLOCK) != 0 && hat != kas.a_hat) { 2796 ASSERT(ht->ht_lock_cnt > 0); 2797 HTABLE_LOCK_DEC(ht); 2798 } 2799 2800 /* 2801 * Figure out which page's mapping list lock to acquire using the PFN 2802 * passed in "old" PTE. We then attempt to invalidate the PTE. 2803 * If another thread, probably a hat_pageunload, has asynchronously 2804 * unmapped/remapped this address we'll loop here. 
2805 */ 2806 ASSERT(ht->ht_busy > 0); 2807 while (PTE_ISVALID(old_pte)) { 2808 pfn = PTE2PFN(old_pte, l); 2809 if (PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST) { 2810 pp = NULL; 2811 } else { 2812 #ifdef __xpv 2813 if (pfn == PFN_INVALID) 2814 panic("Invalid PFN, but not PT_NOCONSIST"); 2815 #endif 2816 pp = page_numtopp_nolock(pfn); 2817 if (pp == NULL) { 2818 panic("no page_t, not NOCONSIST: old_pte=" 2819 FMT_PTE " ht=%lx entry=0x%x pte_ptr=%lx", 2820 old_pte, (uintptr_t)ht, entry, 2821 (uintptr_t)pte_ptr); 2822 } 2823 x86_hm_enter(pp); 2824 } 2825 2826 old_pte = x86pte_inval(ht, entry, old_pte, pte_ptr, tlb); 2827 2828 /* 2829 * If the page hadn't changed we've unmapped it and can proceed 2830 */ 2831 if (PTE_ISVALID(old_pte) && PTE2PFN(old_pte, l) == pfn) 2832 break; 2833 2834 /* 2835 * Otherwise, we'll have to retry with the current old_pte. 2836 * Drop the hment lock, since the pfn may have changed. 2837 */ 2838 if (pp != NULL) { 2839 x86_hm_exit(pp); 2840 pp = NULL; 2841 } else { 2842 ASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST); 2843 } 2844 } 2845 2846 /* 2847 * If the old mapping wasn't valid, there's nothing more to do 2848 */ 2849 if (!PTE_ISVALID(old_pte)) { 2850 if (pp != NULL) 2851 x86_hm_exit(pp); 2852 return; 2853 } 2854 2855 /* 2856 * Take care of syncing any MOD/REF bits and removing the hment. 2857 */ 2858 if (pp != NULL) { 2859 if (!(flags & HAT_UNLOAD_NOSYNC)) 2860 hati_sync_pte_to_page(pp, old_pte, l); 2861 hm = hment_remove(pp, ht, entry); 2862 x86_hm_exit(pp); 2863 if (hm != NULL) 2864 hment_free(hm); 2865 } 2866 2867 /* 2868 * Handle book keeping in the htable and hat 2869 */ 2870 ASSERT(ht->ht_valid_cnt > 0); 2871 HTABLE_DEC(ht->ht_valid_cnt); 2872 PGCNT_DEC(hat, l); 2873 } 2874 2875 /* 2876 * very cheap unload implementation to special case some kernel addresses 2877 */ 2878 static void 2879 hat_kmap_unload(caddr_t addr, size_t len, uint_t flags) 2880 { 2881 uintptr_t va = (uintptr_t)addr; 2882 uintptr_t eva = va + len; 2883 pgcnt_t pg_index; 2884 htable_t *ht; 2885 uint_t entry; 2886 x86pte_t *pte_ptr; 2887 x86pte_t old_pte; 2888 2889 for (; va < eva; va += MMU_PAGESIZE) { 2890 /* 2891 * Get the PTE 2892 */ 2893 pg_index = mmu_btop(va - mmu.kmap_addr); 2894 pte_ptr = PT_INDEX_PTR(mmu.kmap_ptes, pg_index); 2895 old_pte = GET_PTE(pte_ptr); 2896 2897 /* 2898 * get the htable / entry 2899 */ 2900 ht = mmu.kmap_htables[(va - mmu.kmap_htables[0]->ht_vaddr) 2901 >> LEVEL_SHIFT(1)]; 2902 entry = htable_va2entry(va, ht); 2903 2904 /* 2905 * use mostly common code to unmap it. 2906 */ 2907 hat_pte_unmap(ht, entry, flags, old_pte, pte_ptr, B_TRUE); 2908 } 2909 } 2910 2911 2912 /* 2913 * unload a range of virtual address space (no callback) 2914 */ 2915 void 2916 hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags) 2917 { 2918 uintptr_t va = (uintptr_t)addr; 2919 2920 XPV_DISALLOW_MIGRATE(); 2921 ASSERT(hat == kas.a_hat || va + len <= _userlimit); 2922 2923 /* 2924 * special case for performance. 2925 */ 2926 if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) { 2927 ASSERT(hat == kas.a_hat); 2928 hat_kmap_unload(addr, len, flags); 2929 } else { 2930 hat_unload_callback(hat, addr, len, flags, NULL); 2931 } 2932 XPV_ALLOW_MIGRATE(); 2933 } 2934 2935 /* 2936 * Invalidate the TLB, and perform the callback to the upper level VM system, 2937 * for the specified ranges of contiguous pages. 
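 *
 * For reference, a range is built the way hat_tlb_inval() and
 * hat_unload_callback() build one (illustrative sketch; vaddr and
 * npages are stand-ins):
 *
 *	tlb_range_t r;
 *
 *	r.tr_va = vaddr;
 *	r.tr_cnt = npages;
 *	r.tr_level = ht->ht_level;
 *	hat_tlb_inval_range(hat, &r);
 *
 * so each callback below covers tr_cnt << LEVEL_SHIFT(tr_level) bytes
 * starting at tr_va.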
2938 */ 2939 static void 2940 handle_ranges(hat_t *hat, hat_callback_t *cb, uint_t cnt, tlb_range_t *range) 2941 { 2942 while (cnt > 0) { 2943 --cnt; 2944 hat_tlb_inval_range(hat, &range[cnt]); 2945 2946 if (cb != NULL) { 2947 cb->hcb_start_addr = (caddr_t)range[cnt].tr_va; 2948 cb->hcb_end_addr = cb->hcb_start_addr; 2949 cb->hcb_end_addr += range[cnt].tr_cnt << 2950 LEVEL_SHIFT(range[cnt].tr_level); 2951 cb->hcb_function(cb); 2952 } 2953 } 2954 } 2955 2956 /* 2957 * Unload a given range of addresses (has optional callback) 2958 * 2959 * Flags: 2960 * define HAT_UNLOAD 0x00 2961 * define HAT_UNLOAD_NOSYNC 0x02 2962 * define HAT_UNLOAD_UNLOCK 0x04 2963 * define HAT_UNLOAD_OTHER 0x08 - not used 2964 * define HAT_UNLOAD_UNMAP 0x10 - same as HAT_UNLOAD 2965 */ 2966 #define MAX_UNLOAD_CNT (8) 2967 void 2968 hat_unload_callback( 2969 hat_t *hat, 2970 caddr_t addr, 2971 size_t len, 2972 uint_t flags, 2973 hat_callback_t *cb) 2974 { 2975 uintptr_t vaddr = (uintptr_t)addr; 2976 uintptr_t eaddr = vaddr + len; 2977 htable_t *ht = NULL; 2978 uint_t entry; 2979 uintptr_t contig_va = (uintptr_t)-1L; 2980 tlb_range_t r[MAX_UNLOAD_CNT]; 2981 uint_t r_cnt = 0; 2982 x86pte_t old_pte; 2983 2984 XPV_DISALLOW_MIGRATE(); 2985 ASSERT(hat == kas.a_hat || eaddr <= _userlimit); 2986 ASSERT(IS_PAGEALIGNED(vaddr)); 2987 ASSERT(IS_PAGEALIGNED(eaddr)); 2988 2989 /* 2990 * Special case a single page being unloaded for speed. This happens 2991 * quite frequently, COW faults after a fork() for example. 2992 */ 2993 if (cb == NULL && len == MMU_PAGESIZE) { 2994 ht = htable_getpte(hat, vaddr, &entry, &old_pte, 0); 2995 if (ht != NULL) { 2996 if (PTE_ISVALID(old_pte)) { 2997 hat_pte_unmap(ht, entry, flags, old_pte, 2998 NULL, B_TRUE); 2999 } 3000 htable_release(ht); 3001 } 3002 XPV_ALLOW_MIGRATE(); 3003 return; 3004 } 3005 3006 while (vaddr < eaddr) { 3007 old_pte = htable_walk(hat, &ht, &vaddr, eaddr); 3008 if (ht == NULL) 3009 break; 3010 3011 ASSERT(!IN_VA_HOLE(vaddr)); 3012 3013 if (vaddr < (uintptr_t)addr) 3014 panic("hat_unload_callback(): unmap inside large page"); 3015 3016 /* 3017 * We'll do the call backs for contiguous ranges 3018 */ 3019 if (vaddr != contig_va || 3020 (r_cnt > 0 && r[r_cnt - 1].tr_level != ht->ht_level)) { 3021 if (r_cnt == MAX_UNLOAD_CNT) { 3022 handle_ranges(hat, cb, r_cnt, r); 3023 r_cnt = 0; 3024 } 3025 r[r_cnt].tr_va = vaddr; 3026 r[r_cnt].tr_cnt = 0; 3027 r[r_cnt].tr_level = ht->ht_level; 3028 ++r_cnt; 3029 } 3030 3031 /* 3032 * Unload one mapping (for a single page) from the page tables. 3033 * Note that we do not remove the mapping from the TLB yet, 3034 * as indicated by the tlb=FALSE argument to hat_pte_unmap(). 3035 * handle_ranges() will clear the TLB entries with one call to 3036 * hat_tlb_inval_range() per contiguous range. This is 3037 * safe because the page can not be reused until the 3038 * callback is made (or we return). 3039 */ 3040 entry = htable_va2entry(vaddr, ht); 3041 hat_pte_unmap(ht, entry, flags, old_pte, NULL, B_FALSE); 3042 ASSERT(ht->ht_level <= mmu.max_page_level); 3043 vaddr += LEVEL_SIZE(ht->ht_level); 3044 contig_va = vaddr; 3045 ++r[r_cnt - 1].tr_cnt; 3046 } 3047 if (ht) 3048 htable_release(ht); 3049 3050 /* 3051 * handle last range for callbacks 3052 */ 3053 if (r_cnt > 0) 3054 handle_ranges(hat, cb, r_cnt, r); 3055 XPV_ALLOW_MIGRATE(); 3056 } 3057 3058 /* 3059 * Invalidate a virtual address translation on a slave CPU during 3060 * panic() dumps. 
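 *
 * (Descriptive note: this runs on a CPU servicing a panic dump, so it
 * flushes only the local TLB and never cross calls; if the size of a
 * mapping cannot be determined (hat_getpagesize() returns -1) it falls
 * back to flushing everything and stops.)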
3061 */ 3062 void 3063 hat_flush_range(hat_t *hat, caddr_t va, size_t size) 3064 { 3065 ssize_t sz; 3066 caddr_t endva = va + size; 3067 3068 while (va < endva) { 3069 sz = hat_getpagesize(hat, va); 3070 if (sz < 0) { 3071 #ifdef __xpv 3072 xen_flush_tlb(); 3073 #else 3074 mmu_flush_tlb(FLUSH_TLB_ALL, NULL); 3075 #endif 3076 break; 3077 } 3078 #ifdef __xpv 3079 xen_flush_va(va); 3080 #else 3081 mmu_flush_tlb_kpage((uintptr_t)va); 3082 #endif 3083 va += sz; 3084 } 3085 } 3086 3087 /* 3088 * synchronize mapping with software data structures 3089 * 3090 * This interface is currently only used by the working set monitor 3091 * driver. 3092 */ 3093 /*ARGSUSED*/ 3094 void 3095 hat_sync(hat_t *hat, caddr_t addr, size_t len, uint_t flags) 3096 { 3097 uintptr_t vaddr = (uintptr_t)addr; 3098 uintptr_t eaddr = vaddr + len; 3099 htable_t *ht = NULL; 3100 uint_t entry; 3101 x86pte_t pte; 3102 x86pte_t save_pte; 3103 x86pte_t new; 3104 page_t *pp; 3105 3106 ASSERT(!IN_VA_HOLE(vaddr)); 3107 ASSERT(IS_PAGEALIGNED(vaddr)); 3108 ASSERT(IS_PAGEALIGNED(eaddr)); 3109 ASSERT(hat == kas.a_hat || eaddr <= _userlimit); 3110 3111 XPV_DISALLOW_MIGRATE(); 3112 for (; vaddr < eaddr; vaddr += LEVEL_SIZE(ht->ht_level)) { 3113 try_again: 3114 pte = htable_walk(hat, &ht, &vaddr, eaddr); 3115 if (ht == NULL) 3116 break; 3117 entry = htable_va2entry(vaddr, ht); 3118 3119 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC || 3120 PTE_GET(pte, PT_REF | PT_MOD) == 0) 3121 continue; 3122 3123 /* 3124 * We need to acquire the mapping list lock to protect 3125 * against hat_pageunload(), hat_unload(), etc. 3126 */ 3127 pp = page_numtopp_nolock(PTE2PFN(pte, ht->ht_level)); 3128 if (pp == NULL) 3129 break; 3130 x86_hm_enter(pp); 3131 save_pte = pte; 3132 pte = x86pte_get(ht, entry); 3133 if (pte != save_pte) { 3134 x86_hm_exit(pp); 3135 goto try_again; 3136 } 3137 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC || 3138 PTE_GET(pte, PT_REF | PT_MOD) == 0) { 3139 x86_hm_exit(pp); 3140 continue; 3141 } 3142 3143 /* 3144 * Need to clear ref or mod bits. We may compete with 3145 * hardware updating the R/M bits and have to try again. 3146 */ 3147 if (flags == HAT_SYNC_ZERORM) { 3148 new = pte; 3149 PTE_CLR(new, PT_REF | PT_MOD); 3150 pte = hati_update_pte(ht, entry, pte, new); 3151 if (pte != 0) { 3152 x86_hm_exit(pp); 3153 goto try_again; 3154 } 3155 } else { 3156 /* 3157 * sync the PTE to the page_t 3158 */ 3159 hati_sync_pte_to_page(pp, save_pte, ht->ht_level); 3160 } 3161 x86_hm_exit(pp); 3162 } 3163 if (ht) 3164 htable_release(ht); 3165 XPV_ALLOW_MIGRATE(); 3166 } 3167 3168 /* 3169 * void hat_map(hat, addr, len, flags) 3170 */ 3171 /*ARGSUSED*/ 3172 void 3173 hat_map(hat_t *hat, caddr_t addr, size_t len, uint_t flags) 3174 { 3175 /* does nothing */ 3176 } 3177 3178 /* 3179 * uint_t hat_getattr(hat, addr, *attr) 3180 * returns attr for <hat,addr> in *attr. returns 0 if there was a 3181 * mapping and *attr is valid, nonzero if there was no mapping and 3182 * *attr is not valid. 
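 *
 * Illustrative use by a hypothetical caller (not from this file):
 *
 *	uint_t attr;
 *
 *	if (hat_getattr(as->a_hat, addr, &attr) == 0 &&
 *	    (attr & PROT_WRITE))
 *		... the address is mapped writable ...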
3183 */ 3184 uint_t 3185 hat_getattr(hat_t *hat, caddr_t addr, uint_t *attr) 3186 { 3187 uintptr_t vaddr = ALIGN2PAGE(addr); 3188 htable_t *ht = NULL; 3189 x86pte_t pte; 3190 3191 ASSERT(hat == kas.a_hat || vaddr <= _userlimit); 3192 3193 if (IN_VA_HOLE(vaddr)) 3194 return ((uint_t)-1); 3195 3196 ht = htable_getpte(hat, vaddr, NULL, &pte, mmu.max_page_level); 3197 if (ht == NULL) 3198 return ((uint_t)-1); 3199 3200 if (!PTE_ISVALID(pte) || !PTE_ISPAGE(pte, ht->ht_level)) { 3201 htable_release(ht); 3202 return ((uint_t)-1); 3203 } 3204 3205 *attr = PROT_READ; 3206 if (PTE_GET(pte, PT_WRITABLE)) 3207 *attr |= PROT_WRITE; 3208 if (PTE_GET(pte, PT_USER)) 3209 *attr |= PROT_USER; 3210 if (!PTE_GET(pte, mmu.pt_nx)) 3211 *attr |= PROT_EXEC; 3212 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC) 3213 *attr |= HAT_NOSYNC; 3214 htable_release(ht); 3215 return (0); 3216 } 3217 3218 /* 3219 * hat_updateattr() applies the given attribute change to an existing mapping 3220 */ 3221 #define HAT_LOAD_ATTR 1 3222 #define HAT_SET_ATTR 2 3223 #define HAT_CLR_ATTR 3 3224 3225 static void 3226 hat_updateattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr, int what) 3227 { 3228 uintptr_t vaddr = (uintptr_t)addr; 3229 uintptr_t eaddr = (uintptr_t)addr + len; 3230 htable_t *ht = NULL; 3231 uint_t entry; 3232 x86pte_t oldpte, newpte; 3233 page_t *pp; 3234 3235 XPV_DISALLOW_MIGRATE(); 3236 ASSERT(IS_PAGEALIGNED(vaddr)); 3237 ASSERT(IS_PAGEALIGNED(eaddr)); 3238 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 3239 for (; vaddr < eaddr; vaddr += LEVEL_SIZE(ht->ht_level)) { 3240 try_again: 3241 oldpte = htable_walk(hat, &ht, &vaddr, eaddr); 3242 if (ht == NULL) 3243 break; 3244 if (PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOCONSIST) 3245 continue; 3246 3247 pp = page_numtopp_nolock(PTE2PFN(oldpte, ht->ht_level)); 3248 if (pp == NULL) 3249 continue; 3250 x86_hm_enter(pp); 3251 3252 newpte = oldpte; 3253 /* 3254 * We found a page table entry in the desired range, 3255 * figure out the new attributes. 3256 */ 3257 if (what == HAT_SET_ATTR || what == HAT_LOAD_ATTR) { 3258 if ((attr & PROT_WRITE) && 3259 !PTE_GET(oldpte, PT_WRITABLE)) 3260 newpte |= PT_WRITABLE; 3261 3262 if ((attr & HAT_NOSYNC) && 3263 PTE_GET(oldpte, PT_SOFTWARE) < PT_NOSYNC) 3264 newpte |= PT_NOSYNC; 3265 3266 if ((attr & PROT_EXEC) && PTE_GET(oldpte, mmu.pt_nx)) 3267 newpte &= ~mmu.pt_nx; 3268 } 3269 3270 if (what == HAT_LOAD_ATTR) { 3271 if (!(attr & PROT_WRITE) && 3272 PTE_GET(oldpte, PT_WRITABLE)) 3273 newpte &= ~PT_WRITABLE; 3274 3275 if (!(attr & HAT_NOSYNC) && 3276 PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOSYNC) 3277 newpte &= ~PT_SOFTWARE; 3278 3279 if (!(attr & PROT_EXEC) && !PTE_GET(oldpte, mmu.pt_nx)) 3280 newpte |= mmu.pt_nx; 3281 } 3282 3283 if (what == HAT_CLR_ATTR) { 3284 if ((attr & PROT_WRITE) && PTE_GET(oldpte, PT_WRITABLE)) 3285 newpte &= ~PT_WRITABLE; 3286 3287 if ((attr & HAT_NOSYNC) && 3288 PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOSYNC) 3289 newpte &= ~PT_SOFTWARE; 3290 3291 if ((attr & PROT_EXEC) && !PTE_GET(oldpte, mmu.pt_nx)) 3292 newpte |= mmu.pt_nx; 3293 } 3294 3295 /* 3296 * Ensure NOSYNC/NOCONSIST mappings have REF and MOD set. 3297 * x86pte_set() depends on this. 3298 */ 3299 if (PTE_GET(newpte, PT_SOFTWARE) >= PT_NOSYNC) 3300 newpte |= PT_REF | PT_MOD; 3301 3302 /* 3303 * what about PROT_READ or others? this code only handles: 3304 * EXEC, WRITE, NOSYNC 3305 */ 3306 3307 /* 3308 * If new PTE really changed, update the table. 
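 *
 * (Descriptive note: hati_update_pte() returns 0 when newpte was
 * installed; a nonzero return means the PTE changed underneath us, for
 * example the hardware setting REF/MOD, so we drop the mapping list
 * lock and start this address over at try_again.)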
3309 */ 3310 if (newpte != oldpte) { 3311 entry = htable_va2entry(vaddr, ht); 3312 oldpte = hati_update_pte(ht, entry, oldpte, newpte); 3313 if (oldpte != 0) { 3314 x86_hm_exit(pp); 3315 goto try_again; 3316 } 3317 } 3318 x86_hm_exit(pp); 3319 } 3320 if (ht) 3321 htable_release(ht); 3322 XPV_ALLOW_MIGRATE(); 3323 } 3324 3325 /* 3326 * Various wrappers for hat_updateattr() 3327 */ 3328 void 3329 hat_setattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr) 3330 { 3331 ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit); 3332 hat_updateattr(hat, addr, len, attr, HAT_SET_ATTR); 3333 } 3334 3335 void 3336 hat_clrattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr) 3337 { 3338 ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit); 3339 hat_updateattr(hat, addr, len, attr, HAT_CLR_ATTR); 3340 } 3341 3342 void 3343 hat_chgattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr) 3344 { 3345 ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit); 3346 hat_updateattr(hat, addr, len, attr, HAT_LOAD_ATTR); 3347 } 3348 3349 void 3350 hat_chgprot(hat_t *hat, caddr_t addr, size_t len, uint_t vprot) 3351 { 3352 ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit); 3353 hat_updateattr(hat, addr, len, vprot & HAT_PROT_MASK, HAT_LOAD_ATTR); 3354 } 3355 3356 /* 3357 * size_t hat_getpagesize(hat, addr) 3358 * returns pagesize in bytes for <hat, addr>. returns -1 of there is 3359 * no mapping. This is an advisory call. 3360 */ 3361 ssize_t 3362 hat_getpagesize(hat_t *hat, caddr_t addr) 3363 { 3364 uintptr_t vaddr = ALIGN2PAGE(addr); 3365 htable_t *ht; 3366 size_t pagesize; 3367 3368 ASSERT(hat == kas.a_hat || vaddr <= _userlimit); 3369 if (IN_VA_HOLE(vaddr)) 3370 return (-1); 3371 ht = htable_getpage(hat, vaddr, NULL); 3372 if (ht == NULL) 3373 return (-1); 3374 pagesize = LEVEL_SIZE(ht->ht_level); 3375 htable_release(ht); 3376 return (pagesize); 3377 } 3378 3379 3380 3381 /* 3382 * pfn_t hat_getpfnum(hat, addr) 3383 * returns pfn for <hat, addr> or PFN_INVALID if mapping is invalid. 3384 */ 3385 pfn_t 3386 hat_getpfnum(hat_t *hat, caddr_t addr) 3387 { 3388 uintptr_t vaddr = ALIGN2PAGE(addr); 3389 htable_t *ht; 3390 uint_t entry; 3391 pfn_t pfn = PFN_INVALID; 3392 3393 ASSERT(hat == kas.a_hat || vaddr <= _userlimit); 3394 if (khat_running == 0) 3395 return (PFN_INVALID); 3396 3397 if (IN_VA_HOLE(vaddr)) 3398 return (PFN_INVALID); 3399 3400 XPV_DISALLOW_MIGRATE(); 3401 /* 3402 * A very common use of hat_getpfnum() is from the DDI for kernel pages. 3403 * Use the kmap_ptes (which also covers the 32 bit heap) to speed 3404 * this up. 3405 */ 3406 if (mmu.kmap_addr <= vaddr && vaddr < mmu.kmap_eaddr) { 3407 x86pte_t pte; 3408 pgcnt_t pg_index; 3409 3410 pg_index = mmu_btop(vaddr - mmu.kmap_addr); 3411 pte = GET_PTE(PT_INDEX_PTR(mmu.kmap_ptes, pg_index)); 3412 if (PTE_ISVALID(pte)) 3413 /*LINTED [use of constant 0 causes a lint warning] */ 3414 pfn = PTE2PFN(pte, 0); 3415 XPV_ALLOW_MIGRATE(); 3416 return (pfn); 3417 } 3418 3419 ht = htable_getpage(hat, vaddr, &entry); 3420 if (ht == NULL) { 3421 XPV_ALLOW_MIGRATE(); 3422 return (PFN_INVALID); 3423 } 3424 ASSERT(vaddr >= ht->ht_vaddr); 3425 ASSERT(vaddr <= HTABLE_LAST_PAGE(ht)); 3426 pfn = PTE2PFN(x86pte_get(ht, entry), ht->ht_level); 3427 if (ht->ht_level > 0) 3428 pfn += mmu_btop(vaddr & LEVEL_OFFSET(ht->ht_level)); 3429 htable_release(ht); 3430 XPV_ALLOW_MIGRATE(); 3431 return (pfn); 3432 } 3433 3434 /* 3435 * int hat_probe(hat, addr) 3436 * return 0 if no valid mapping is present. 
Faster version 3437 * of hat_getattr in certain architectures. 3438 */ 3439 int 3440 hat_probe(hat_t *hat, caddr_t addr) 3441 { 3442 uintptr_t vaddr = ALIGN2PAGE(addr); 3443 uint_t entry; 3444 htable_t *ht; 3445 pgcnt_t pg_off; 3446 3447 ASSERT(hat == kas.a_hat || vaddr <= _userlimit); 3448 ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); 3449 if (IN_VA_HOLE(vaddr)) 3450 return (0); 3451 3452 /* 3453 * Most common use of hat_probe is from segmap. We special case it 3454 * for performance. 3455 */ 3456 if (mmu.kmap_addr <= vaddr && vaddr < mmu.kmap_eaddr) { 3457 pg_off = mmu_btop(vaddr - mmu.kmap_addr); 3458 if (mmu.pae_hat) 3459 return (PTE_ISVALID(mmu.kmap_ptes[pg_off])); 3460 else 3461 return (PTE_ISVALID( 3462 ((x86pte32_t *)mmu.kmap_ptes)[pg_off])); 3463 } 3464 3465 ht = htable_getpage(hat, vaddr, &entry); 3466 htable_release(ht); 3467 return (ht != NULL); 3468 } 3469 3470 /* 3471 * Find out if the segment for hat_share()/hat_unshare() is DISM or locked ISM. 3472 */ 3473 static int 3474 is_it_dism(hat_t *hat, caddr_t va) 3475 { 3476 struct seg *seg; 3477 struct shm_data *shmd; 3478 struct spt_data *sptd; 3479 3480 seg = as_findseg(hat->hat_as, va, 0); 3481 ASSERT(seg != NULL); 3482 ASSERT(seg->s_base <= va); 3483 shmd = (struct shm_data *)seg->s_data; 3484 ASSERT(shmd != NULL); 3485 sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 3486 ASSERT(sptd != NULL); 3487 if (sptd->spt_flags & SHM_PAGEABLE) 3488 return (1); 3489 return (0); 3490 } 3491 3492 /* 3493 * Simple implementation of ISM. hat_share() is similar to hat_memload_array(), 3494 * except that we use the ism_hat's existing mappings to determine the pages 3495 * and protections to use for this hat. If we find a full properly aligned 3496 * and sized pagetable, we will attempt to share the pagetable itself. 3497 */ 3498 /*ARGSUSED*/ 3499 int 3500 hat_share( 3501 hat_t *hat, 3502 caddr_t addr, 3503 hat_t *ism_hat, 3504 caddr_t src_addr, 3505 size_t len, /* almost useless value, see below.. */ 3506 uint_t ismszc) 3507 { 3508 uintptr_t vaddr_start = (uintptr_t)addr; 3509 uintptr_t vaddr; 3510 uintptr_t eaddr = vaddr_start + len; 3511 uintptr_t ism_addr_start = (uintptr_t)src_addr; 3512 uintptr_t ism_addr = ism_addr_start; 3513 uintptr_t e_ism_addr = ism_addr + len; 3514 htable_t *ism_ht = NULL; 3515 htable_t *ht; 3516 x86pte_t pte; 3517 page_t *pp; 3518 pfn_t pfn; 3519 level_t l; 3520 pgcnt_t pgcnt; 3521 uint_t prot; 3522 int is_dism; 3523 int flags; 3524 3525 /* 3526 * We might be asked to share an empty DISM hat by as_dup() 3527 */ 3528 ASSERT(hat != kas.a_hat); 3529 ASSERT(eaddr <= _userlimit); 3530 if (!(ism_hat->hat_flags & HAT_SHARED)) { 3531 ASSERT(hat_get_mapped_size(ism_hat) == 0); 3532 return (0); 3533 } 3534 XPV_DISALLOW_MIGRATE(); 3535 3536 /* 3537 * The SPT segment driver often passes us a size larger than there are 3538 * valid mappings. That's because it rounds the segment size up to a 3539 * large pagesize, even if the actual memory mapped by ism_hat is less. 3540 */ 3541 ASSERT(IS_PAGEALIGNED(vaddr_start)); 3542 ASSERT(IS_PAGEALIGNED(ism_addr_start)); 3543 ASSERT(ism_hat->hat_flags & HAT_SHARED); 3544 is_dism = is_it_dism(hat, addr); 3545 while (ism_addr < e_ism_addr) { 3546 /* 3547 * use htable_walk to get the next valid ISM mapping 3548 */ 3549 pte = htable_walk(ism_hat, &ism_ht, &ism_addr, e_ism_addr); 3550 if (ism_ht == NULL) 3551 break; 3552 3553 /* 3554 * First check to see if we already share the page table. 
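 *
 * (Summary of the checks made below before sharing: the table must not
 * be the top level table, must be level 0 if this is DISM, the ISM
 * address must be at the start of its table and the target address
 * aligned to the next level's size, the remaining range must cover the
 * whole table, and every entry of the ISM table must be a leaf PTE. If
 * any check fails we fall through to not_shared and copy the mappings
 * individually.)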
3555 */ 3556 l = ism_ht->ht_level; 3557 vaddr = vaddr_start + (ism_addr - ism_addr_start); 3558 ht = htable_lookup(hat, vaddr, l); 3559 if (ht != NULL) { 3560 if (ht->ht_flags & HTABLE_SHARED_PFN) 3561 goto shared; 3562 htable_release(ht); 3563 goto not_shared; 3564 } 3565 3566 /* 3567 * Can't ever share top table. 3568 */ 3569 if (l == mmu.max_level) 3570 goto not_shared; 3571 3572 /* 3573 * Avoid level mismatches later due to DISM faults. 3574 */ 3575 if (is_dism && l > 0) 3576 goto not_shared; 3577 3578 /* 3579 * addresses and lengths must align 3580 * table must be fully populated 3581 * no lower level page tables 3582 */ 3583 if (ism_addr != ism_ht->ht_vaddr || 3584 (vaddr & LEVEL_OFFSET(l + 1)) != 0) 3585 goto not_shared; 3586 3587 /* 3588 * The range of address space must cover a full table. 3589 */ 3590 if (e_ism_addr - ism_addr < LEVEL_SIZE(l + 1)) 3591 goto not_shared; 3592 3593 /* 3594 * All entries in the ISM page table must be leaf PTEs. 3595 */ 3596 if (l > 0) { 3597 int e; 3598 3599 /* 3600 * We know the 0th is from htable_walk() above. 3601 */ 3602 for (e = 1; e < HTABLE_NUM_PTES(ism_ht); ++e) { 3603 x86pte_t pte; 3604 pte = x86pte_get(ism_ht, e); 3605 if (!PTE_ISPAGE(pte, l)) 3606 goto not_shared; 3607 } 3608 } 3609 3610 /* 3611 * share the page table 3612 */ 3613 ht = htable_create(hat, vaddr, l, ism_ht); 3614 shared: 3615 ASSERT(ht->ht_flags & HTABLE_SHARED_PFN); 3616 ASSERT(ht->ht_shares == ism_ht); 3617 hat->hat_ism_pgcnt += 3618 (ism_ht->ht_valid_cnt - ht->ht_valid_cnt) << 3619 (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT); 3620 ht->ht_valid_cnt = ism_ht->ht_valid_cnt; 3621 htable_release(ht); 3622 ism_addr = ism_ht->ht_vaddr + LEVEL_SIZE(l + 1); 3623 htable_release(ism_ht); 3624 ism_ht = NULL; 3625 continue; 3626 3627 not_shared: 3628 /* 3629 * Unable to share the page table. Instead we will 3630 * create new mappings from the values in the ISM mappings. 3631 * Figure out what level size mappings to use; 3632 */ 3633 for (l = ism_ht->ht_level; l > 0; --l) { 3634 if (LEVEL_SIZE(l) <= eaddr - vaddr && 3635 (vaddr & LEVEL_OFFSET(l)) == 0) 3636 break; 3637 } 3638 3639 /* 3640 * The ISM mapping might be larger than the share area, 3641 * be careful to truncate it if needed. 3642 */ 3643 if (eaddr - vaddr >= LEVEL_SIZE(ism_ht->ht_level)) { 3644 pgcnt = mmu_btop(LEVEL_SIZE(ism_ht->ht_level)); 3645 } else { 3646 pgcnt = mmu_btop(eaddr - vaddr); 3647 l = 0; 3648 } 3649 3650 pfn = PTE2PFN(pte, ism_ht->ht_level); 3651 ASSERT(pfn != PFN_INVALID); 3652 while (pgcnt > 0) { 3653 /* 3654 * Make a new pte for the PFN for this level. 3655 * Copy protections for the pte from the ISM pte. 3656 */ 3657 pp = page_numtopp_nolock(pfn); 3658 ASSERT(pp != NULL); 3659 3660 prot = PROT_USER | PROT_READ | HAT_UNORDERED_OK; 3661 if (PTE_GET(pte, PT_WRITABLE)) 3662 prot |= PROT_WRITE; 3663 if (!PTE_GET(pte, PT_NX)) 3664 prot |= PROT_EXEC; 3665 3666 flags = HAT_LOAD; 3667 if (!is_dism) 3668 flags |= HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST; 3669 while (hati_load_common(hat, vaddr, pp, prot, flags, 3670 l, pfn) != 0) { 3671 if (l == 0) 3672 panic("hati_load_common() failure"); 3673 --l; 3674 } 3675 3676 vaddr += LEVEL_SIZE(l); 3677 ism_addr += LEVEL_SIZE(l); 3678 pfn += mmu_btop(LEVEL_SIZE(l)); 3679 pgcnt -= mmu_btop(LEVEL_SIZE(l)); 3680 } 3681 } 3682 if (ism_ht != NULL) 3683 htable_release(ism_ht); 3684 XPV_ALLOW_MIGRATE(); 3685 return (0); 3686 } 3687 3688 3689 /* 3690 * hat_unshare() is similar to hat_unload_callback(), but 3691 * we have to look for empty shared pagetables. 
Note that 3692 * hat_unshare() is always invoked against an entire segment. 3693 */ 3694 /*ARGSUSED*/ 3695 void 3696 hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc) 3697 { 3698 uint64_t vaddr = (uintptr_t)addr; 3699 uintptr_t eaddr = vaddr + len; 3700 htable_t *ht = NULL; 3701 uint_t need_demaps = 0; 3702 int flags = HAT_UNLOAD_UNMAP; 3703 level_t l; 3704 3705 ASSERT(hat != kas.a_hat); 3706 ASSERT(eaddr <= _userlimit); 3707 ASSERT(IS_PAGEALIGNED(vaddr)); 3708 ASSERT(IS_PAGEALIGNED(eaddr)); 3709 XPV_DISALLOW_MIGRATE(); 3710 3711 /* 3712 * First go through and remove any shared pagetables. 3713 * 3714 * Note that it's ok to delay the TLB shootdown till the entire range is 3715 * finished, because if hat_pageunload() were to unload a shared 3716 * pagetable page, its hat_tlb_inval() will do a global TLB invalidate. 3717 */ 3718 l = mmu.max_page_level; 3719 if (l == mmu.max_level) 3720 --l; 3721 for (; l >= 0; --l) { 3722 for (vaddr = (uintptr_t)addr; vaddr < eaddr; 3723 vaddr = (vaddr & LEVEL_MASK(l + 1)) + LEVEL_SIZE(l + 1)) { 3724 ASSERT(!IN_VA_HOLE(vaddr)); 3725 /* 3726 * find a pagetable that maps the current address 3727 */ 3728 ht = htable_lookup(hat, vaddr, l); 3729 if (ht == NULL) 3730 continue; 3731 if (ht->ht_flags & HTABLE_SHARED_PFN) { 3732 /* 3733 * clear page count, set valid_cnt to 0, 3734 * let htable_release() finish the job 3735 */ 3736 hat->hat_ism_pgcnt -= ht->ht_valid_cnt << 3737 (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT); 3738 ht->ht_valid_cnt = 0; 3739 need_demaps = 1; 3740 } 3741 htable_release(ht); 3742 } 3743 } 3744 3745 /* 3746 * flush the TLBs - since we're probably dealing with MANY mappings 3747 * we just do a full invalidation. 3748 */ 3749 if (!(hat->hat_flags & HAT_FREEING) && need_demaps) 3750 hat_tlb_inval(hat, DEMAP_ALL_ADDR); 3751 3752 /* 3753 * Now go back and clean up any unaligned mappings that 3754 * couldn't share pagetables. 3755 */ 3756 if (!is_it_dism(hat, addr)) 3757 flags |= HAT_UNLOAD_UNLOCK; 3758 hat_unload(hat, addr, len, flags); 3759 XPV_ALLOW_MIGRATE(); 3760 } 3761 3762 3763 /* 3764 * hat_reserve() does nothing 3765 */ 3766 /*ARGSUSED*/ 3767 void 3768 hat_reserve(struct as *as, caddr_t addr, size_t len) 3769 { 3770 } 3771 3772 3773 /* 3774 * Called when all mappings to a page should have write permission removed. 3775 * Mostly stolen from hat_pagesync() 3776 */ 3777 static void 3778 hati_page_clrwrt(struct page *pp) 3779 { 3780 hment_t *hm = NULL; 3781 htable_t *ht; 3782 uint_t entry; 3783 x86pte_t old; 3784 x86pte_t new; 3785 uint_t pszc = 0; 3786 3787 XPV_DISALLOW_MIGRATE(); 3788 next_size: 3789 /* 3790 * walk thru the mapping list clearing write permission 3791 */ 3792 x86_hm_enter(pp); 3793 while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL) { 3794 if (ht->ht_level < pszc) 3795 continue; 3796 old = x86pte_get(ht, entry); 3797 3798 for (;;) { 3799 /* 3800 * Is this mapping of interest? 3801 */ 3802 if (PTE2PFN(old, ht->ht_level) != pp->p_pagenum || 3803 PTE_GET(old, PT_WRITABLE) == 0) 3804 break; 3805 3806 /* 3807 * Clear ref/mod writable bits. This requires cross 3808 * calls to ensure any executing TLBs see cleared bits. 
3809 */ 3810 new = old; 3811 PTE_CLR(new, PT_REF | PT_MOD | PT_WRITABLE); 3812 old = hati_update_pte(ht, entry, old, new); 3813 if (old != 0) 3814 continue; 3815 3816 break; 3817 } 3818 } 3819 x86_hm_exit(pp); 3820 while (pszc < pp->p_szc) { 3821 page_t *tpp; 3822 pszc++; 3823 tpp = PP_GROUPLEADER(pp, pszc); 3824 if (pp != tpp) { 3825 pp = tpp; 3826 goto next_size; 3827 } 3828 } 3829 XPV_ALLOW_MIGRATE(); 3830 } 3831 3832 /* 3833 * void hat_page_setattr(pp, flag) 3834 * void hat_page_clrattr(pp, flag) 3835 * used to set/clr ref/mod bits. 3836 */ 3837 void 3838 hat_page_setattr(struct page *pp, uint_t flag) 3839 { 3840 vnode_t *vp = pp->p_vnode; 3841 kmutex_t *vphm = NULL; 3842 page_t **listp; 3843 int noshuffle; 3844 3845 noshuffle = flag & P_NSH; 3846 flag &= ~P_NSH; 3847 3848 if (PP_GETRM(pp, flag) == flag) 3849 return; 3850 3851 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) && 3852 !noshuffle) { 3853 vphm = page_vnode_mutex(vp); 3854 mutex_enter(vphm); 3855 } 3856 3857 PP_SETRM(pp, flag); 3858 3859 if (vphm != NULL) { 3860 3861 /* 3862 * Some File Systems examine v_pages for NULL w/o 3863 * grabbing the vphm mutex. Must not let it become NULL when 3864 * pp is the only page on the list. 3865 */ 3866 if (pp->p_vpnext != pp) { 3867 page_vpsub(&vp->v_pages, pp); 3868 if (vp->v_pages != NULL) 3869 listp = &vp->v_pages->p_vpprev->p_vpnext; 3870 else 3871 listp = &vp->v_pages; 3872 page_vpadd(listp, pp); 3873 } 3874 mutex_exit(vphm); 3875 } 3876 } 3877 3878 void 3879 hat_page_clrattr(struct page *pp, uint_t flag) 3880 { 3881 vnode_t *vp = pp->p_vnode; 3882 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 3883 3884 /* 3885 * Caller is expected to hold page's io lock for VMODSORT to work 3886 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod 3887 * bit is cleared. 3888 * We don't have an assert here, to avoid tripping some existing third party 3889 * code. The dirty page is moved back to the top of the v_page list 3890 * after IO is done in pvn_write_done(). 3891 */ 3892 PP_CLRRM(pp, flag); 3893 3894 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 3895 3896 /* 3897 * VMODSORT works by removing write permissions and getting 3898 * a fault when a page is made dirty. At this point 3899 * we need to remove write permission from all mappings 3900 * to this page. 3901 */ 3902 hati_page_clrwrt(pp); 3903 } 3904 } 3905 3906 /* 3907 * If flag is specified, returns 0 if the attribute is disabled 3908 * and nonzero if enabled. If flag specifies multiple attributes 3909 * then returns 0 if ALL attributes are disabled. This is an advisory 3910 * call. 3911 */ 3912 uint_t 3913 hat_page_getattr(struct page *pp, uint_t flag) 3914 { 3915 return (PP_GETRM(pp, flag)); 3916 } 3917 3918 3919 /* 3920 * common code used by hat_pageunload() and hment_steal() 3921 */ 3922 hment_t * 3923 hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry) 3924 { 3925 x86pte_t old_pte; 3926 pfn_t pfn = pp->p_pagenum; 3927 hment_t *hm; 3928 3929 /* 3930 * We need to acquire a hold on the htable in order to 3931 * do the invalidate. We know the htable must exist, since 3932 * unmaps don't release the htable until after removing any 3933 * hment. Having x86_hm_enter() keeps that from proceeding. 3934 */ 3935 htable_acquire(ht); 3936 3937 /* 3938 * Invalidate the PTE and remove the hment.
3939 */ 3940 old_pte = x86pte_inval(ht, entry, 0, NULL, B_TRUE); 3941 if (PTE2PFN(old_pte, ht->ht_level) != pfn) { 3942 panic("x86pte_inval() failure found PTE = " FMT_PTE 3943 " pfn being unmapped is %lx ht=0x%lx entry=0x%x", 3944 old_pte, pfn, (uintptr_t)ht, entry); 3945 } 3946 3947 /* 3948 * Clean up all the htable information for this mapping 3949 */ 3950 ASSERT(ht->ht_valid_cnt > 0); 3951 HTABLE_DEC(ht->ht_valid_cnt); 3952 PGCNT_DEC(ht->ht_hat, ht->ht_level); 3953 3954 /* 3955 * sync ref/mod bits to the page_t 3956 */ 3957 if (PTE_GET(old_pte, PT_SOFTWARE) < PT_NOSYNC) 3958 hati_sync_pte_to_page(pp, old_pte, ht->ht_level); 3959 3960 /* 3961 * Remove the mapping list entry for this page. 3962 */ 3963 hm = hment_remove(pp, ht, entry); 3964 3965 /* 3966 * drop the mapping list lock so that we might free the 3967 * hment and htable. 3968 */ 3969 x86_hm_exit(pp); 3970 htable_release(ht); 3971 return (hm); 3972 } 3973 3974 extern int vpm_enable; 3975 /* 3976 * Unload all translations to a page. If the page is a subpage of a large 3977 * page, the large page mappings are also removed. 3978 * 3979 * The forceflags are unused. 3980 */ 3981 3982 /*ARGSUSED*/ 3983 static int 3984 hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) 3985 { 3986 page_t *cur_pp = pp; 3987 hment_t *hm; 3988 hment_t *prev; 3989 htable_t *ht; 3990 uint_t entry; 3991 level_t level; 3992 3993 XPV_DISALLOW_MIGRATE(); 3994 3995 /* 3996 * prevent recursion due to kmem_free() 3997 */ 3998 ++curthread->t_hatdepth; 3999 ASSERT(curthread->t_hatdepth < 16); 4000 4001 #if defined(__amd64) 4002 /* 4003 * clear the vpm ref. 4004 */ 4005 if (vpm_enable) { 4006 pp->p_vpmref = 0; 4007 } 4008 #endif 4009 /* 4010 * The loop with next_size handles pages with multiple pagesize mappings 4011 */ 4012 next_size: 4013 for (;;) { 4014 4015 /* 4016 * Get a mapping list entry 4017 */ 4018 x86_hm_enter(cur_pp); 4019 for (prev = NULL; ; prev = hm) { 4020 hm = hment_walk(cur_pp, &ht, &entry, prev); 4021 if (hm == NULL) { 4022 x86_hm_exit(cur_pp); 4023 4024 /* 4025 * If not part of a larger page, we're done. 4026 */ 4027 if (cur_pp->p_szc <= pg_szcd) { 4028 ASSERT(curthread->t_hatdepth > 0); 4029 --curthread->t_hatdepth; 4030 XPV_ALLOW_MIGRATE(); 4031 return (0); 4032 } 4033 4034 /* 4035 * Else check the next larger page size. 4036 * hat_page_demote() may decrease p_szc 4037 * but that's ok we'll just take an extra 4038 * trip discover there're no larger mappings 4039 * and return. 4040 */ 4041 ++pg_szcd; 4042 cur_pp = PP_GROUPLEADER(cur_pp, pg_szcd); 4043 goto next_size; 4044 } 4045 4046 /* 4047 * If this mapping size matches, remove it. 4048 */ 4049 level = ht->ht_level; 4050 if (level == pg_szcd) 4051 break; 4052 } 4053 4054 /* 4055 * Remove the mapping list entry for this page. 4056 * Note this does the x86_hm_exit() for us. 4057 */ 4058 hm = hati_page_unmap(cur_pp, ht, entry); 4059 if (hm != NULL) 4060 hment_free(hm); 4061 } 4062 } 4063 4064 int 4065 hat_pageunload(struct page *pp, uint_t forceflag) 4066 { 4067 ASSERT(PAGE_EXCL(pp)); 4068 return (hati_pageunload(pp, 0, forceflag)); 4069 } 4070 4071 /* 4072 * Unload all large mappings to pp and reduce by 1 p_szc field of every large 4073 * page level that included pp. 4074 * 4075 * pp must be locked EXCL. Even though no other constituent pages are locked 4076 * it's legal to unload large mappings to pp because all constituent pages of 4077 * large locked mappings have to be locked SHARED. 
Therefore, if we have an EXCL
4078  * lock on one of the constituent pages, none of the large mappings to pp are
4079  * locked.
4080  *
4081  * Change (always decrease) the p_szc field starting from the last constituent
4082  * page and ending with the root constituent page, so that the root's pszc
4083  * always shows the area where hat_page_demote() may be active.
4084  *
4085  * This mechanism is only used for file system pages where it's not always
4086  * possible to get EXCL locks on all constituent pages to demote the size code
4087  * (as is done for anonymous or kernel large pages).
4088  */
4089 void
4090 hat_page_demote(page_t *pp)
4091 {
4092 	uint_t pszc;
4093 	uint_t rszc;
4094 	uint_t szc;
4095 	page_t *rootpp;
4096 	page_t *firstpp;
4097 	page_t *lastpp;
4098 	pgcnt_t pgcnt;
4099
4100 	ASSERT(PAGE_EXCL(pp));
4101 	ASSERT(!PP_ISFREE(pp));
4102 	ASSERT(page_szc_lock_assert(pp));
4103
4104 	if (pp->p_szc == 0)
4105 		return;
4106
4107 	rootpp = PP_GROUPLEADER(pp, 1);
4108 	(void) hati_pageunload(rootpp, 1, HAT_FORCE_PGUNLOAD);
4109
4110 	/*
4111 	 * All large mappings to pp are gone,
4112 	 * and no new ones can be set up since pp is locked exclusively.
4113 	 *
4114 	 * Lock the root to make sure there's only one hat_page_demote()
4115 	 * outstanding within the area of this root's pszc.
4116 	 *
4117 	 * A second potential hat_page_demote() is already eliminated by the upper
4118 	 * VM layer via page_szc_lock(), but we don't rely on that and use our
4119 	 * own locking (so that the upper layer locking can be changed without
4120 	 * assuming that the HAT depends on the upper VM layer to prevent multiple
4121 	 * hat_page_demote() calls from being issued simultaneously against the
4122 	 * same large page).
4123 	 */
4124 again:
4125 	pszc = pp->p_szc;
4126 	if (pszc == 0)
4127 		return;
4128 	rootpp = PP_GROUPLEADER(pp, pszc);
4129 	x86_hm_enter(rootpp);
4130 	/*
4131 	 * If the root's p_szc is different from pszc, we raced with another
4132 	 * hat_page_demote(). Drop the lock and try to find the root again.
4133 	 * If the root's p_szc is greater than pszc, the previous hat_page_demote()
4134 	 * is not done yet. Take and release the mlist lock of the root's root
4135 	 * to wait for the previous hat_page_demote() to complete.
4136 	 */
4137 	if ((rszc = rootpp->p_szc) != pszc) {
4138 		x86_hm_exit(rootpp);
4139 		if (rszc > pszc) {
4140 			/* p_szc of a locked non-free page can't increase */
4141 			ASSERT(pp != rootpp);
4142
4143 			rootpp = PP_GROUPLEADER(rootpp, rszc);
4144 			x86_hm_enter(rootpp);
4145 			x86_hm_exit(rootpp);
4146 		}
4147 		goto again;
4148 	}
4149 	ASSERT(pp->p_szc == pszc);
4150
4151 	/*
4152 	 * Decrement by 1 the p_szc of every constituent page of a region that
4153 	 * covered pp. For example, if the original szc is 3, it gets changed to 2
4154 	 * everywhere except in the region 2 that covered pp. The region 2 that
4155 	 * covered pp gets demoted to 1 everywhere except in the region 1 that
4156 	 * covered pp. The region 1 that covered pp is demoted to region
4157 	 * 0. It's done this way because from region 3 we removed level 3
4158 	 * mappings, from the region 2 that covered pp we removed level 2 mappings,
4159 	 * and from the region 1 that covered pp we removed level 1 mappings. All
4160 	 * changes are done from high pfns to low pfns so that roots
4161 	 * are changed last, allowing one to know the largest region where
4162 	 * hat_page_demote() is still active by looking only at the root page.
4163 	 *
4164 	 * This algorithm is implemented in 2 while loops.
The first loop changes the
4165 	 * p_szc of the pages to the right of pp's level 1 region, and the second
4166 	 * loop changes the p_szc of the pages of the level 1 region that covers pp
4167 	 * and of all pages to the left of the level 1 region that covers pp.
4168 	 * In the first loop p_szc keeps dropping with every iteration,
4169 	 * and in the second loop it keeps increasing with every iteration.
4170 	 *
4171 	 * First loop description: Demote pages to the right of pp, outside of the
4172 	 * level 1 region that covers pp. In every iteration of the while
4173 	 * loop below, find the last page of the szc region and the first page of
4174 	 * the (szc - 1) region that is immediately to the right of the (szc - 1)
4175 	 * region that covers pp. From the last such page to the first such page,
4176 	 * change every page's szc to szc - 1. Decrement szc and continue
4177 	 * looping until szc is 1. If pp belongs to the last (szc - 1) region
4178 	 * of the szc region, skip to the next iteration.
4179 	 */
4180 	szc = pszc;
4181 	while (szc > 1) {
4182 		lastpp = PP_GROUPLEADER(pp, szc);
4183 		pgcnt = page_get_pagecnt(szc);
4184 		lastpp += pgcnt - 1;
4185 		firstpp = PP_GROUPLEADER(pp, (szc - 1));
4186 		pgcnt = page_get_pagecnt(szc - 1);
4187 		if (lastpp - firstpp < pgcnt) {
4188 			szc--;
4189 			continue;
4190 		}
4191 		firstpp += pgcnt;
4192 		while (lastpp != firstpp) {
4193 			ASSERT(lastpp->p_szc == pszc);
4194 			lastpp->p_szc = szc - 1;
4195 			lastpp--;
4196 		}
4197 		firstpp->p_szc = szc - 1;
4198 		szc--;
4199 	}
4200
4201 	/*
4202 	 * Second loop description:
4203 	 * The first iteration changes p_szc to 0 for every
4204 	 * page of the level 1 region that covers pp.
4205 	 * Subsequent iterations find the last page of the szc region
4206 	 * immediately to the left of the szc region that covered pp
4207 	 * and the first page of the (szc + 1) region that covers pp.
4208 	 * From the last to the first page, change the p_szc of every page to szc.
4209 	 * Increment szc and continue looping until szc is pszc.
4210 	 * If pp belongs to the first szc region of the (szc + 1) region,
4211 	 * skip to the next iteration.
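	 *
	 * As a concrete illustration, consider pszc == 2 with pp somewhere in
	 * the middle of its level 2 region. The first loop runs only for
	 * szc == 2 and sets p_szc to 1 for every page to the right of the
	 * level 1 region that covers pp, working from the last page down.
	 * The second loop then sets p_szc to 0 across the level 1 region that
	 * covers pp, and finally (szc == 1) sets p_szc to 1 for the pages to
	 * the left of that level 1 region, ending with rootpp so that the
	 * root is updated last.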
4212 * 4213 */ 4214 szc = 0; 4215 while (szc < pszc) { 4216 firstpp = PP_GROUPLEADER(pp, (szc + 1)); 4217 if (szc == 0) { 4218 pgcnt = page_get_pagecnt(1); 4219 lastpp = firstpp + (pgcnt - 1); 4220 } else { 4221 lastpp = PP_GROUPLEADER(pp, szc); 4222 if (firstpp == lastpp) { 4223 szc++; 4224 continue; 4225 } 4226 lastpp--; 4227 pgcnt = page_get_pagecnt(szc); 4228 } 4229 while (lastpp != firstpp) { 4230 ASSERT(lastpp->p_szc == pszc); 4231 lastpp->p_szc = szc; 4232 lastpp--; 4233 } 4234 firstpp->p_szc = szc; 4235 if (firstpp == rootpp) 4236 break; 4237 szc++; 4238 } 4239 x86_hm_exit(rootpp); 4240 } 4241 4242 /* 4243 * get hw stats from hardware into page struct and reset hw stats 4244 * returns attributes of page 4245 * Flags for hat_pagesync, hat_getstat, hat_sync 4246 * 4247 * define HAT_SYNC_ZERORM 0x01 4248 * 4249 * Additional flags for hat_pagesync 4250 * 4251 * define HAT_SYNC_STOPON_REF 0x02 4252 * define HAT_SYNC_STOPON_MOD 0x04 4253 * define HAT_SYNC_STOPON_RM 0x06 4254 * define HAT_SYNC_STOPON_SHARED 0x08 4255 */ 4256 uint_t 4257 hat_pagesync(struct page *pp, uint_t flags) 4258 { 4259 hment_t *hm = NULL; 4260 htable_t *ht; 4261 uint_t entry; 4262 x86pte_t old, save_old; 4263 x86pte_t new; 4264 uchar_t nrmbits = P_REF|P_MOD|P_RO; 4265 extern ulong_t po_share; 4266 page_t *save_pp = pp; 4267 uint_t pszc = 0; 4268 4269 ASSERT(PAGE_LOCKED(pp) || panicstr); 4270 4271 if (PP_ISRO(pp) && (flags & HAT_SYNC_STOPON_MOD)) 4272 return (pp->p_nrm & nrmbits); 4273 4274 if ((flags & HAT_SYNC_ZERORM) == 0) { 4275 4276 if ((flags & HAT_SYNC_STOPON_REF) != 0 && PP_ISREF(pp)) 4277 return (pp->p_nrm & nrmbits); 4278 4279 if ((flags & HAT_SYNC_STOPON_MOD) != 0 && PP_ISMOD(pp)) 4280 return (pp->p_nrm & nrmbits); 4281 4282 if ((flags & HAT_SYNC_STOPON_SHARED) != 0 && 4283 hat_page_getshare(pp) > po_share) { 4284 if (PP_ISRO(pp)) 4285 PP_SETREF(pp); 4286 return (pp->p_nrm & nrmbits); 4287 } 4288 } 4289 4290 XPV_DISALLOW_MIGRATE(); 4291 next_size: 4292 /* 4293 * walk thru the mapping list syncing (and clearing) ref/mod bits. 4294 */ 4295 x86_hm_enter(pp); 4296 while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL) { 4297 if (ht->ht_level < pszc) 4298 continue; 4299 old = x86pte_get(ht, entry); 4300 try_again: 4301 4302 ASSERT(PTE2PFN(old, ht->ht_level) == pp->p_pagenum); 4303 4304 if (PTE_GET(old, PT_REF | PT_MOD) == 0) 4305 continue; 4306 4307 save_old = old; 4308 if ((flags & HAT_SYNC_ZERORM) != 0) { 4309 4310 /* 4311 * Need to clear ref or mod bits. Need to demap 4312 * to make sure any executing TLBs see cleared bits. 4313 */ 4314 new = old; 4315 PTE_CLR(new, PT_REF | PT_MOD); 4316 old = hati_update_pte(ht, entry, old, new); 4317 if (old != 0) 4318 goto try_again; 4319 4320 old = save_old; 4321 } 4322 4323 /* 4324 * Sync the PTE 4325 */ 4326 if (!(flags & HAT_SYNC_ZERORM) && 4327 PTE_GET(old, PT_SOFTWARE) <= PT_NOSYNC) 4328 hati_sync_pte_to_page(pp, old, ht->ht_level); 4329 4330 /* 4331 * can stop short if we found a ref'd or mod'd page 4332 */ 4333 if ((flags & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp) || 4334 (flags & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)) { 4335 x86_hm_exit(pp); 4336 goto done; 4337 } 4338 } 4339 x86_hm_exit(pp); 4340 while (pszc < pp->p_szc) { 4341 page_t *tpp; 4342 pszc++; 4343 tpp = PP_GROUPLEADER(pp, pszc); 4344 if (pp != tpp) { 4345 pp = tpp; 4346 goto next_size; 4347 } 4348 } 4349 done: 4350 XPV_ALLOW_MIGRATE(); 4351 return (save_pp->p_nrm & nrmbits); 4352 } 4353 4354 /* 4355 * returns approx number of mappings to this pp. 
A return of 0 implies 4356 * there are no mappings to the page. 4357 */ 4358 ulong_t 4359 hat_page_getshare(page_t *pp) 4360 { 4361 uint_t cnt; 4362 cnt = hment_mapcnt(pp); 4363 #if defined(__amd64) 4364 if (vpm_enable && pp->p_vpmref) { 4365 cnt += 1; 4366 } 4367 #endif 4368 return (cnt); 4369 } 4370 4371 /* 4372 * Return 1 the number of mappings exceeds sh_thresh. Return 0 4373 * otherwise. 4374 */ 4375 int 4376 hat_page_checkshare(page_t *pp, ulong_t sh_thresh) 4377 { 4378 return (hat_page_getshare(pp) > sh_thresh); 4379 } 4380 4381 /* 4382 * hat_softlock isn't supported anymore 4383 */ 4384 /*ARGSUSED*/ 4385 faultcode_t 4386 hat_softlock( 4387 hat_t *hat, 4388 caddr_t addr, 4389 size_t *len, 4390 struct page **page_array, 4391 uint_t flags) 4392 { 4393 return (FC_NOSUPPORT); 4394 } 4395 4396 4397 4398 /* 4399 * Routine to expose supported HAT features to platform independent code. 4400 */ 4401 /*ARGSUSED*/ 4402 int 4403 hat_supported(enum hat_features feature, void *arg) 4404 { 4405 switch (feature) { 4406 4407 case HAT_SHARED_PT: /* this is really ISM */ 4408 return (1); 4409 4410 case HAT_DYNAMIC_ISM_UNMAP: 4411 return (0); 4412 4413 case HAT_VMODSORT: 4414 return (1); 4415 4416 case HAT_SHARED_REGIONS: 4417 return (0); 4418 4419 default: 4420 panic("hat_supported() - unknown feature"); 4421 } 4422 return (0); 4423 } 4424 4425 /* 4426 * Called when a thread is exiting and has been switched to the kernel AS 4427 */ 4428 void 4429 hat_thread_exit(kthread_t *thd) 4430 { 4431 ASSERT(thd->t_procp->p_as == &kas); 4432 XPV_DISALLOW_MIGRATE(); 4433 hat_switch(thd->t_procp->p_as->a_hat); 4434 XPV_ALLOW_MIGRATE(); 4435 } 4436 4437 /* 4438 * Setup the given brand new hat structure as the new HAT on this cpu's mmu. 4439 */ 4440 /*ARGSUSED*/ 4441 void 4442 hat_setup(hat_t *hat, int flags) 4443 { 4444 XPV_DISALLOW_MIGRATE(); 4445 kpreempt_disable(); 4446 4447 hat_switch(hat); 4448 4449 kpreempt_enable(); 4450 XPV_ALLOW_MIGRATE(); 4451 } 4452 4453 /* 4454 * Prepare for a CPU private mapping for the given address. 4455 * 4456 * The address can only be used from a single CPU and can be remapped 4457 * using hat_mempte_remap(). Return the address of the PTE. 4458 * 4459 * We do the htable_create() if necessary and increment the valid count so 4460 * the htable can't disappear. We also hat_devload() the page table into 4461 * kernel so that the PTE is quickly accessed. 4462 */ 4463 hat_mempte_t 4464 hat_mempte_setup(caddr_t addr) 4465 { 4466 uintptr_t va = (uintptr_t)addr; 4467 htable_t *ht; 4468 uint_t entry; 4469 x86pte_t oldpte; 4470 hat_mempte_t p; 4471 4472 ASSERT(IS_PAGEALIGNED(va)); 4473 ASSERT(!IN_VA_HOLE(va)); 4474 ++curthread->t_hatdepth; 4475 XPV_DISALLOW_MIGRATE(); 4476 ht = htable_getpte(kas.a_hat, va, &entry, &oldpte, 0); 4477 if (ht == NULL) { 4478 ht = htable_create(kas.a_hat, va, 0, NULL); 4479 entry = htable_va2entry(va, ht); 4480 ASSERT(ht->ht_level == 0); 4481 oldpte = x86pte_get(ht, entry); 4482 } 4483 if (PTE_ISVALID(oldpte)) 4484 panic("hat_mempte_setup(): address already mapped" 4485 "ht=%p, entry=%d, pte=" FMT_PTE, (void *)ht, entry, oldpte); 4486 4487 /* 4488 * increment ht_valid_cnt so that the pagetable can't disappear 4489 */ 4490 HTABLE_INC(ht->ht_valid_cnt); 4491 4492 /* 4493 * return the PTE physical address to the caller. 4494 */ 4495 htable_release(ht); 4496 XPV_ALLOW_MIGRATE(); 4497 p = PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry); 4498 --curthread->t_hatdepth; 4499 return (p); 4500 } 4501 4502 /* 4503 * Release a CPU private mapping for the given address. 
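 *
 * For illustration, a caller pairs these interfaces roughly as follows (a
 * sketch only; "va", "pfn" and the protection bits are placeholders, not
 * taken from any particular caller):
 *
 *	hat_mempte_t pte_pa = hat_mempte_setup(va);
 *	...
 *	kpreempt_disable();
 *	hat_mempte_remap(pfn, va, pte_pa, PROT_READ | PROT_WRITE, 0);
 *	... access the page through va ...
 *	kpreempt_enable();
 *	...
 *	hat_mempte_release(va, pte_pa);
 *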
4504 * We decrement the htable valid count so it might be destroyed. 4505 */ 4506 /*ARGSUSED1*/ 4507 void 4508 hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa) 4509 { 4510 htable_t *ht; 4511 4512 XPV_DISALLOW_MIGRATE(); 4513 /* 4514 * invalidate any left over mapping and decrement the htable valid count 4515 */ 4516 #ifdef __xpv 4517 if (HYPERVISOR_update_va_mapping((uintptr_t)addr, 0, 4518 UVMF_INVLPG | UVMF_LOCAL)) 4519 panic("HYPERVISOR_update_va_mapping() failed"); 4520 #else 4521 { 4522 x86pte_t *pteptr; 4523 4524 pteptr = x86pte_mapin(mmu_btop(pte_pa), 4525 (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL); 4526 if (mmu.pae_hat) 4527 *pteptr = 0; 4528 else 4529 *(x86pte32_t *)pteptr = 0; 4530 mmu_flush_tlb_kpage((uintptr_t)addr); 4531 x86pte_mapout(); 4532 } 4533 #endif 4534 4535 ht = htable_getpte(kas.a_hat, ALIGN2PAGE(addr), NULL, NULL, 0); 4536 if (ht == NULL) 4537 panic("hat_mempte_release(): invalid address"); 4538 ASSERT(ht->ht_level == 0); 4539 HTABLE_DEC(ht->ht_valid_cnt); 4540 htable_release(ht); 4541 XPV_ALLOW_MIGRATE(); 4542 } 4543 4544 /* 4545 * Apply a temporary CPU private mapping to a page. We flush the TLB only 4546 * on this CPU, so this ought to have been called with preemption disabled. 4547 */ 4548 void 4549 hat_mempte_remap( 4550 pfn_t pfn, 4551 caddr_t addr, 4552 hat_mempte_t pte_pa, 4553 uint_t attr, 4554 uint_t flags) 4555 { 4556 uintptr_t va = (uintptr_t)addr; 4557 x86pte_t pte; 4558 4559 /* 4560 * Remap the given PTE to the new page's PFN. Invalidate only 4561 * on this CPU. 4562 */ 4563 #ifdef DEBUG 4564 htable_t *ht; 4565 uint_t entry; 4566 4567 ASSERT(IS_PAGEALIGNED(va)); 4568 ASSERT(!IN_VA_HOLE(va)); 4569 ht = htable_getpte(kas.a_hat, va, &entry, NULL, 0); 4570 ASSERT(ht != NULL); 4571 ASSERT(ht->ht_level == 0); 4572 ASSERT(ht->ht_valid_cnt > 0); 4573 ASSERT(ht->ht_pfn == mmu_btop(pte_pa)); 4574 htable_release(ht); 4575 #endif 4576 XPV_DISALLOW_MIGRATE(); 4577 pte = hati_mkpte(pfn, attr, 0, flags); 4578 #ifdef __xpv 4579 if (HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG | UVMF_LOCAL)) 4580 panic("HYPERVISOR_update_va_mapping() failed"); 4581 #else 4582 { 4583 x86pte_t *pteptr; 4584 4585 pteptr = x86pte_mapin(mmu_btop(pte_pa), 4586 (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL); 4587 if (mmu.pae_hat) 4588 *(x86pte_t *)pteptr = pte; 4589 else 4590 *(x86pte32_t *)pteptr = (x86pte32_t)pte; 4591 mmu_flush_tlb_kpage((uintptr_t)addr); 4592 x86pte_mapout(); 4593 } 4594 #endif 4595 XPV_ALLOW_MIGRATE(); 4596 } 4597 4598 4599 4600 /* 4601 * Hat locking functions 4602 * XXX - these two functions are currently being used by hatstats 4603 * they can be removed by using a per-as mutex for hatstats. 4604 */ 4605 void 4606 hat_enter(hat_t *hat) 4607 { 4608 mutex_enter(&hat->hat_mutex); 4609 } 4610 4611 void 4612 hat_exit(hat_t *hat) 4613 { 4614 mutex_exit(&hat->hat_mutex); 4615 } 4616 4617 /* 4618 * HAT part of cpu initialization. 4619 */ 4620 void 4621 hat_cpu_online(struct cpu *cpup) 4622 { 4623 if (cpup != CPU) { 4624 x86pte_cpu_init(cpup); 4625 hat_pcp_setup(cpup); 4626 } 4627 CPUSET_ATOMIC_ADD(khat_cpuset, cpup->cpu_id); 4628 } 4629 4630 /* 4631 * HAT part of cpu deletion. 4632 * (currently, we only call this after the cpu is safely passivated.) 4633 */ 4634 void 4635 hat_cpu_offline(struct cpu *cpup) 4636 { 4637 ASSERT(cpup != CPU); 4638 4639 CPUSET_ATOMIC_DEL(khat_cpuset, cpup->cpu_id); 4640 hat_pcp_teardown(cpup); 4641 x86pte_cpu_fini(cpup); 4642 } 4643 4644 /* 4645 * Function called after all CPUs are brought online. 
4646 * Used to remove low address boot mappings. 4647 */ 4648 void 4649 clear_boot_mappings(uintptr_t low, uintptr_t high) 4650 { 4651 uintptr_t vaddr = low; 4652 htable_t *ht = NULL; 4653 level_t level; 4654 uint_t entry; 4655 x86pte_t pte; 4656 4657 /* 4658 * On 1st CPU we can unload the prom mappings, basically we blow away 4659 * all virtual mappings under _userlimit. 4660 */ 4661 while (vaddr < high) { 4662 pte = htable_walk(kas.a_hat, &ht, &vaddr, high); 4663 if (ht == NULL) 4664 break; 4665 4666 level = ht->ht_level; 4667 entry = htable_va2entry(vaddr, ht); 4668 ASSERT(level <= mmu.max_page_level); 4669 ASSERT(PTE_ISPAGE(pte, level)); 4670 4671 /* 4672 * Unload the mapping from the page tables. 4673 */ 4674 (void) x86pte_inval(ht, entry, 0, NULL, B_TRUE); 4675 ASSERT(ht->ht_valid_cnt > 0); 4676 HTABLE_DEC(ht->ht_valid_cnt); 4677 PGCNT_DEC(ht->ht_hat, ht->ht_level); 4678 4679 vaddr += LEVEL_SIZE(ht->ht_level); 4680 } 4681 if (ht) 4682 htable_release(ht); 4683 } 4684 4685 /* 4686 * Atomically update a new translation for a single page. If the 4687 * currently installed PTE doesn't match the value we expect to find, 4688 * it's not updated and we return the PTE we found. 4689 * 4690 * If activating nosync or NOWRITE and the page was modified we need to sync 4691 * with the page_t. Also sync with page_t if clearing ref/mod bits. 4692 */ 4693 static x86pte_t 4694 hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected, x86pte_t new) 4695 { 4696 page_t *pp; 4697 uint_t rm = 0; 4698 x86pte_t replaced; 4699 4700 if (PTE_GET(expected, PT_SOFTWARE) < PT_NOSYNC && 4701 PTE_GET(expected, PT_MOD | PT_REF) && 4702 (PTE_GET(new, PT_NOSYNC) || !PTE_GET(new, PT_WRITABLE) || 4703 !PTE_GET(new, PT_MOD | PT_REF))) { 4704 4705 ASSERT(!pfn_is_foreign(PTE2PFN(expected, ht->ht_level))); 4706 pp = page_numtopp_nolock(PTE2PFN(expected, ht->ht_level)); 4707 ASSERT(pp != NULL); 4708 if (PTE_GET(expected, PT_MOD)) 4709 rm |= P_MOD; 4710 if (PTE_GET(expected, PT_REF)) 4711 rm |= P_REF; 4712 PTE_CLR(new, PT_MOD | PT_REF); 4713 } 4714 4715 replaced = x86pte_update(ht, entry, expected, new); 4716 if (replaced != expected) 4717 return (replaced); 4718 4719 if (rm) { 4720 /* 4721 * sync to all constituent pages of a large page 4722 */ 4723 pgcnt_t pgcnt = page_get_pagecnt(ht->ht_level); 4724 ASSERT(IS_P2ALIGNED(pp->p_pagenum, pgcnt)); 4725 while (pgcnt-- > 0) { 4726 /* 4727 * hat_page_demote() can't decrease 4728 * pszc below this mapping size 4729 * since large mapping existed after we 4730 * took mlist lock. 
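 * (On amd64, a level 1 mapping spans 512 constituent 4K pages, so the
 * ref/mod bits observed here are propagated to all 512 page_t's.)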
4731 */ 4732 ASSERT(pp->p_szc >= ht->ht_level); 4733 hat_page_setattr(pp, rm); 4734 ++pp; 4735 } 4736 } 4737 4738 return (0); 4739 } 4740 4741 /* ARGSUSED */ 4742 void 4743 hat_join_srd(struct hat *hat, vnode_t *evp) 4744 { 4745 } 4746 4747 /* ARGSUSED */ 4748 hat_region_cookie_t 4749 hat_join_region(struct hat *hat, 4750 caddr_t r_saddr, 4751 size_t r_size, 4752 void *r_obj, 4753 u_offset_t r_objoff, 4754 uchar_t r_perm, 4755 uchar_t r_pgszc, 4756 hat_rgn_cb_func_t r_cb_function, 4757 uint_t flags) 4758 { 4759 panic("No shared region support on x86"); 4760 return (HAT_INVALID_REGION_COOKIE); 4761 } 4762 4763 /* ARGSUSED */ 4764 void 4765 hat_leave_region(struct hat *hat, hat_region_cookie_t rcookie, uint_t flags) 4766 { 4767 panic("No shared region support on x86"); 4768 } 4769 4770 /* ARGSUSED */ 4771 void 4772 hat_dup_region(struct hat *hat, hat_region_cookie_t rcookie) 4773 { 4774 panic("No shared region support on x86"); 4775 } 4776 4777 4778 /* 4779 * Kernel Physical Mapping (kpm) facility 4780 * 4781 * Most of the routines needed to support segkpm are almost no-ops on the 4782 * x86 platform. We map in the entire segment when it is created and leave 4783 * it mapped in, so there is no additional work required to set up and tear 4784 * down individual mappings. All of these routines were created to support 4785 * SPARC platforms that have to avoid aliasing in their virtually indexed 4786 * caches. 4787 * 4788 * Most of the routines have sanity checks in them (e.g. verifying that the 4789 * passed-in page is locked). We don't actually care about most of these 4790 * checks on x86, but we leave them in place to identify problems in the 4791 * upper levels. 4792 */ 4793 4794 /* 4795 * Map in a locked page and return the vaddr. 4796 */ 4797 /*ARGSUSED*/ 4798 caddr_t 4799 hat_kpm_mapin(struct page *pp, struct kpme *kpme) 4800 { 4801 caddr_t vaddr; 4802 4803 #ifdef DEBUG 4804 if (kpm_enable == 0) { 4805 cmn_err(CE_WARN, "hat_kpm_mapin: kpm_enable not set\n"); 4806 return ((caddr_t)NULL); 4807 } 4808 4809 if (pp == NULL || PAGE_LOCKED(pp) == 0) { 4810 cmn_err(CE_WARN, "hat_kpm_mapin: pp zero or not locked\n"); 4811 return ((caddr_t)NULL); 4812 } 4813 #endif 4814 4815 vaddr = hat_kpm_page2va(pp, 1); 4816 4817 return (vaddr); 4818 } 4819 4820 /* 4821 * Mapout a locked page. 4822 */ 4823 /*ARGSUSED*/ 4824 void 4825 hat_kpm_mapout(struct page *pp, struct kpme *kpme, caddr_t vaddr) 4826 { 4827 #ifdef DEBUG 4828 if (kpm_enable == 0) { 4829 cmn_err(CE_WARN, "hat_kpm_mapout: kpm_enable not set\n"); 4830 return; 4831 } 4832 4833 if (IS_KPM_ADDR(vaddr) == 0) { 4834 cmn_err(CE_WARN, "hat_kpm_mapout: no kpm address\n"); 4835 return; 4836 } 4837 4838 if (pp == NULL || PAGE_LOCKED(pp) == 0) { 4839 cmn_err(CE_WARN, "hat_kpm_mapout: page zero or not locked\n"); 4840 return; 4841 } 4842 #endif 4843 } 4844 4845 /* 4846 * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical 4847 * memory addresses that are not described by a page_t. It can 4848 * also be used for normal pages that are not locked, but beware 4849 * this is dangerous - no locking is performed, so the identity of 4850 * the page could change. hat_kpm_mapin_pfn is not supported when 4851 * vac_colors > 1, because the chosen va depends on the page identity, 4852 * which could change. 4853 * The caller must only pass pfn's for valid physical addresses; violation 4854 * of this rule will cause panic. 
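 *
 * On x86 the address handed back is simply kpm_vbase + ptob(pfn). For
 * illustration (a sketch only; "pfn" and "val" are placeholders), a caller
 * that wants to peek at a known-good frame does roughly:
 *
 *	caddr_t va = hat_kpm_mapin_pfn(pfn);
 *	uint64_t val = (va != NULL) ? *(uint64_t *)va : 0;
 *	hat_kpm_mapout_pfn(pfn);
 *
 * hat_kpm_mapout_pfn() is a no-op here and exists for symmetry with
 * platforms that have to manage these mappings explicitly.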
4855 */ 4856 caddr_t 4857 hat_kpm_mapin_pfn(pfn_t pfn) 4858 { 4859 caddr_t paddr, vaddr; 4860 4861 if (kpm_enable == 0) 4862 return ((caddr_t)NULL); 4863 4864 paddr = (caddr_t)ptob(pfn); 4865 vaddr = (uintptr_t)kpm_vbase + paddr; 4866 4867 return ((caddr_t)vaddr); 4868 } 4869 4870 /*ARGSUSED*/ 4871 void 4872 hat_kpm_mapout_pfn(pfn_t pfn) 4873 { 4874 /* empty */ 4875 } 4876 4877 /* 4878 * Return the kpm virtual address for a specific pfn 4879 */ 4880 caddr_t 4881 hat_kpm_pfn2va(pfn_t pfn) 4882 { 4883 uintptr_t vaddr = (uintptr_t)kpm_vbase + mmu_ptob(pfn); 4884 4885 ASSERT(!pfn_is_foreign(pfn)); 4886 return ((caddr_t)vaddr); 4887 } 4888 4889 /* 4890 * Return the kpm virtual address for the page at pp. 4891 */ 4892 /*ARGSUSED*/ 4893 caddr_t 4894 hat_kpm_page2va(struct page *pp, int checkswap) 4895 { 4896 return (hat_kpm_pfn2va(pp->p_pagenum)); 4897 } 4898 4899 /* 4900 * Return the page frame number for the kpm virtual address vaddr. 4901 */ 4902 pfn_t 4903 hat_kpm_va2pfn(caddr_t vaddr) 4904 { 4905 pfn_t pfn; 4906 4907 ASSERT(IS_KPM_ADDR(vaddr)); 4908 4909 pfn = (pfn_t)btop(vaddr - kpm_vbase); 4910 4911 return (pfn); 4912 } 4913 4914 4915 /* 4916 * Return the page for the kpm virtual address vaddr. 4917 */ 4918 page_t * 4919 hat_kpm_vaddr2page(caddr_t vaddr) 4920 { 4921 pfn_t pfn; 4922 4923 ASSERT(IS_KPM_ADDR(vaddr)); 4924 4925 pfn = hat_kpm_va2pfn(vaddr); 4926 4927 return (page_numtopp_nolock(pfn)); 4928 } 4929 4930 /* 4931 * hat_kpm_fault is called from segkpm_fault when we take a page fault on a 4932 * KPM page. This should never happen on x86 4933 */ 4934 int 4935 hat_kpm_fault(hat_t *hat, caddr_t vaddr) 4936 { 4937 panic("pagefault in seg_kpm. hat: 0x%p vaddr: 0x%p", 4938 (void *)hat, (void *)vaddr); 4939 4940 return (0); 4941 } 4942 4943 /*ARGSUSED*/ 4944 void 4945 hat_kpm_mseghash_clear(int nentries) 4946 {} 4947 4948 /*ARGSUSED*/ 4949 void 4950 hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp) 4951 {} 4952 4953 #ifndef __xpv 4954 void 4955 hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs, 4956 offset_t kpm_pages_off) 4957 { 4958 _NOTE(ARGUNUSED(nkpmpgs, kpm_pages_off)); 4959 pfn_t base, end; 4960 4961 /* 4962 * kphysm_add_memory_dynamic() does not set nkpmpgs 4963 * when page_t memory is externally allocated. That 4964 * code must properly calculate nkpmpgs in all cases 4965 * if nkpmpgs needs to be used at some point. 4966 */ 4967 4968 /* 4969 * The meta (page_t) pages for dynamically added memory are allocated 4970 * either from the incoming memory itself or from existing memory. 4971 * In the former case the base of the incoming pages will be different 4972 * than the base of the dynamic segment so call memseg_get_start() to 4973 * get the actual base of the incoming memory for each case. 4974 */ 4975 4976 base = memseg_get_start(msp); 4977 end = msp->pages_end; 4978 4979 hat_devload(kas.a_hat, kpm_vbase + mmu_ptob(base), 4980 mmu_ptob(end - base), base, PROT_READ | PROT_WRITE, 4981 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 4982 } 4983 4984 void 4985 hat_kpm_addmem_mseg_insert(struct memseg *msp) 4986 { 4987 _NOTE(ARGUNUSED(msp)); 4988 } 4989 4990 void 4991 hat_kpm_addmem_memsegs_update(struct memseg *msp) 4992 { 4993 _NOTE(ARGUNUSED(msp)); 4994 } 4995 4996 /* 4997 * Return end of metadata for an already setup memseg. 4998 * X86 platforms don't need per-page meta data to support kpm. 
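 * We therefore simply hand back msp->epages, the end of the memseg's
 * page_t array.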
4999 */ 5000 caddr_t 5001 hat_kpm_mseg_reuse(struct memseg *msp) 5002 { 5003 return ((caddr_t)msp->epages); 5004 } 5005 5006 void 5007 hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp) 5008 { 5009 _NOTE(ARGUNUSED(msp, mspp)); 5010 ASSERT(0); 5011 } 5012 5013 void 5014 hat_kpm_split_mseg_update(struct memseg *msp, struct memseg **mspp, 5015 struct memseg *lo, struct memseg *mid, struct memseg *hi) 5016 { 5017 _NOTE(ARGUNUSED(msp, mspp, lo, mid, hi)); 5018 ASSERT(0); 5019 } 5020 5021 /* 5022 * Walk the memsegs chain, applying func to each memseg span. 5023 */ 5024 void 5025 hat_kpm_walk(void (*func)(void *, void *, size_t), void *arg) 5026 { 5027 pfn_t pbase, pend; 5028 void *base; 5029 size_t size; 5030 struct memseg *msp; 5031 5032 for (msp = memsegs; msp; msp = msp->next) { 5033 pbase = msp->pages_base; 5034 pend = msp->pages_end; 5035 base = ptob(pbase) + kpm_vbase; 5036 size = ptob(pend - pbase); 5037 func(arg, base, size); 5038 } 5039 } 5040 5041 #else /* __xpv */ 5042 5043 /* 5044 * There are specific Hypervisor calls to establish and remove mappings 5045 * to grant table references and the privcmd driver. We have to ensure 5046 * that a page table actually exists. 5047 */ 5048 void 5049 hat_prepare_mapping(hat_t *hat, caddr_t addr, uint64_t *pte_ma) 5050 { 5051 maddr_t base_ma; 5052 htable_t *ht; 5053 uint_t entry; 5054 5055 ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE)); 5056 XPV_DISALLOW_MIGRATE(); 5057 ht = htable_create(hat, (uintptr_t)addr, 0, NULL); 5058 5059 /* 5060 * if an address for pte_ma is passed in, return the MA of the pte 5061 * for this specific address. This address is only valid as long 5062 * as the htable stays locked. 5063 */ 5064 if (pte_ma != NULL) { 5065 entry = htable_va2entry((uintptr_t)addr, ht); 5066 base_ma = pa_to_ma(ptob(ht->ht_pfn)); 5067 *pte_ma = base_ma + (entry << mmu.pte_size_shift); 5068 } 5069 XPV_ALLOW_MIGRATE(); 5070 } 5071 5072 void 5073 hat_release_mapping(hat_t *hat, caddr_t addr) 5074 { 5075 htable_t *ht; 5076 5077 ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE)); 5078 XPV_DISALLOW_MIGRATE(); 5079 ht = htable_lookup(hat, (uintptr_t)addr, 0); 5080 ASSERT(ht != NULL); 5081 ASSERT(ht->ht_busy >= 2); 5082 htable_release(ht); 5083 htable_release(ht); 5084 XPV_ALLOW_MIGRATE(); 5085 } 5086 #endif /* __xpv */ 5087 5088 /* 5089 * Helper function to punch in a mapping that we need with the specified 5090 * attributes. 5091 */ 5092 void 5093 hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs) 5094 { 5095 int ret; 5096 pfn_t pfn; 5097 hat_t *cpu_hat = cpu->cpu_hat_info->hci_user_hat; 5098 5099 ASSERT3S(kpti_enable, ==, 1); 5100 ASSERT3P(cpu_hat, !=, NULL); 5101 ASSERT3U(cpu_hat->hat_flags & HAT_PCP, ==, HAT_PCP); 5102 ASSERT3U(va & MMU_PAGEOFFSET, ==, 0); 5103 5104 pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); 5105 VERIFY3U(pfn, !=, PFN_INVALID); 5106 5107 /* 5108 * We purposefully don't try to find the page_t. This means that this 5109 * will be marked PT_NOCONSIST; however, given that this is pretty much 5110 * a static mapping that we're using we should be relatively OK. 5111 */ 5112 attrs |= HAT_STORECACHING_OK; 5113 ret = hati_load_common(cpu_hat, va, NULL, attrs, 0, 0, pfn); 5114 VERIFY3S(ret, ==, 0); 5115 } 5116
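
/*
 * For illustration only (this caller is hypothetical, not taken from the
 * code above): a KPTI consumer that needs one page of per-CPU kernel data
 * visible in the user page tables would punch it in roughly as
 *
 *	hati_cpu_punchin(cpu, P2ALIGN((uintptr_t)ptr, MMU_PAGESIZE),
 *	    PROT_READ | PROT_WRITE);
 *
 * i.e. a page aligned kernel VA plus the protections it needs; the PFN is
 * looked up in the kernel's HAT and the mapping is entered into the CPU's
 * user HAT (hci_user_hat).
 */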