/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2010 Andreas Tobler
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vmmeter.h>

#include <dev/ofw/openfirm.h>
#include <machine/ofw_machdep.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/uma.h>

#include <powerpc/aim/mmu_oea64.h>

#include "phyp-hvcall.h"

#define	MMU_PHYP_DEBUG 0
#define	MMU_PHYP_ID "mmu_phyp: "
#if MMU_PHYP_DEBUG
#define	dprintf(fmt, ...) printf(fmt, ## __VA_ARGS__)
#define	dprintf0(fmt, ...) dprintf(MMU_PHYP_ID fmt, ## __VA_ARGS__)
#else
#define	dprintf(fmt, args...) do { ; } while(0)
#define	dprintf0(fmt, args...) do { ; } while(0)
#endif
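
/*
 * Eviction of hypervisor-managed PTEs is coordinated with this
 * read-mostly lock: the common paths (mphyp_pte_clear() and the
 * H_ENTER attempts in mphyp_pte_insert()) take it shared, while
 * picking and removing a victim PTE takes it exclusive.
 */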
static struct rmlock mphyp_eviction_lock;

/*
 * Kernel MMU interface
 */

static void	mphyp_install(void);
static void	mphyp_bootstrap(vm_offset_t kernelstart,
		    vm_offset_t kernelend);
static void	mphyp_cpu_bootstrap(int ap);
static void	*mphyp_dump_pmap(void *ctx, void *buf,
		    u_long *nbytes);
static int64_t	mphyp_pte_synch(struct pvo_entry *pvo);
static int64_t	mphyp_pte_clear(struct pvo_entry *pvo, uint64_t ptebit);
static int64_t	mphyp_pte_unset(struct pvo_entry *pvo);
static int64_t	mphyp_pte_insert(struct pvo_entry *pvo);

static struct pmap_funcs mphyp_methods = {
	.install = mphyp_install,
	.bootstrap = mphyp_bootstrap,
	.cpu_bootstrap = mphyp_cpu_bootstrap,
	.dumpsys_dump_pmap = mphyp_dump_pmap,
};

static struct moea64_funcs mmu_phyp_funcs = {
	.pte_synch = mphyp_pte_synch,
	.pte_clear = mphyp_pte_clear,
	.pte_unset = mphyp_pte_unset,
	.pte_insert = mphyp_pte_insert,
};

MMU_DEF_INHERIT(pseries_mmu, "mmu_phyp", mphyp_methods, oea64_mmu);

static int brokenkvm = 0;

static void
print_kvm_bug_warning(void *data)
{

	if (brokenkvm)
		printf("WARNING: Running on a broken hypervisor that does "
		    "not support mandatory H_CLEAR_MOD and H_CLEAR_REF "
		    "hypercalls. Performance will be suboptimal.\n");
}

SYSINIT(kvmbugwarn1, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
    print_kvm_bug_warning, NULL);
SYSINIT(kvmbugwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_kvm_bug_warning,
    NULL);

static void
mphyp_install()
{

	moea64_ops = &mmu_phyp_funcs;
}

static void
mphyp_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
{
	uint64_t final_pteg_count = 0;
	char buf[8];
	uint32_t prop[2];
	uint32_t nptlp, shift = 0, slb_encoding = 0;
	uint32_t lp_size, lp_encoding;
	struct lpte old;
	uint64_t vsid;
	phandle_t dev, node, root;
	int idx, len, res;

	rm_init(&mphyp_eviction_lock, "pte eviction");

	moea64_early_bootstrap(kernelstart, kernelend);

	root = OF_peer(0);

	dev = OF_child(root);
	while (dev != 0) {
		res = OF_getprop(dev, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		dev = OF_peer(dev);
	}

	node = OF_child(dev);

	while (node != 0) {
		res = OF_getprop(node, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0)
			break;
		node = OF_peer(node);
	}

	res = OF_getencprop(node, "ibm,pft-size", prop, sizeof(prop));
	if (res <= 0)
		panic("mmu_phyp: unknown PFT size");
	final_pteg_count = 1 << prop[1];
	res = OF_getencprop(node, "ibm,slb-size", prop, sizeof(prop[0]));
	if (res > 0)
		n_slbs = prop[0];
	dprintf0("slb-size=%i\n", n_slbs);

	moea64_pteg_count = final_pteg_count / sizeof(struct lpteg);

	/* Clear any old page table entries */
	for (idx = 0; idx < moea64_pteg_count*8; idx++) {
		phyp_pft_hcall(H_READ, 0, idx, 0, 0, &old.pte_hi,
		    &old.pte_lo, &old.pte_lo);
		vsid = (old.pte_hi << (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) >> 28;
		if (vsid == VSID_VRMA || vsid == 0 /* Older VRMA */)
			continue;

		if (old.pte_hi & LPTE_VALID)
			phyp_hcall(H_REMOVE, 0, idx, 0);
	}

	/*
	 * Scan the large page size property for PAPR compatible machines.
	 * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties'
	 * for the encoding of the property.
	 */
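	/*
	 * As parsed below, each entry of the property is
	 * { segment page size shift, SLB encoding, nptlp,
	 *   { actual page size shift, PTE encoding } x nptlp }.
	 */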

	len = OF_getproplen(node, "ibm,segment-page-sizes");
	if (len > 0) {
		/*
		 * We have to use a variable length array on the stack
		 * since we have very limited stack space.
		 */
		pcell_t arr[len/sizeof(cell_t)];
		res = OF_getencprop(node, "ibm,segment-page-sizes", arr,
		    sizeof(arr));
		len /= 4;
		idx = 0;
		while (len > 0) {
			shift = arr[idx];
			slb_encoding = arr[idx + 1];
			nptlp = arr[idx + 2];

			dprintf0("Segment Page Size: "
			    "%uKB, slb_enc=0x%X: {size, encoding}[%u] =",
			    shift > 10? 1 << (shift-10) : 0,
			    slb_encoding, nptlp);

			idx += 3;
			len -= 3;
			while (len > 0 && nptlp) {
				lp_size = arr[idx];
				lp_encoding = arr[idx+1];

				dprintf(" {%uKB, 0x%X}",
				    lp_size > 10? 1 << (lp_size-10) : 0,
				    lp_encoding);

				if (slb_encoding == SLBV_L && lp_encoding == 0)
					break;

				idx += 2;
				len -= 2;
				nptlp--;
			}
			dprintf("\n");
			if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0)
				break;
		}

		if (len > 0) {
			moea64_large_page_shift = shift;
			moea64_large_page_size = 1ULL << lp_size;
			moea64_large_page_mask = moea64_large_page_size - 1;
			hw_direct_map = 1;
			printf(MMU_PHYP_ID
			    "Support for hugepages of %uKB detected\n",
			    moea64_large_page_shift > 10?
				1 << (moea64_large_page_shift-10) : 0);
		} else {
			moea64_large_page_size = 0;
			moea64_large_page_shift = 0;
			moea64_large_page_mask = 0;
			hw_direct_map = 0;
			printf(MMU_PHYP_ID
			    "Support for hugepages not found\n");
		}
	}

	moea64_mid_bootstrap(kernelstart, kernelend);
	moea64_late_bootstrap(kernelstart, kernelend);

	/* Test for broken versions of KVM that don't conform to the spec */
	if (phyp_hcall(H_CLEAR_MOD, 0, 0) == H_FUNCTION)
		brokenkvm = 1;
}

static void
mphyp_cpu_bootstrap(int ap)
{
	struct slb *slb = PCPU_GET(aim.slb);
	register_t seg0;
	int i;

	/*
	 * Install kernel SLB entries
	 */

	__asm __volatile ("slbia");
	__asm __volatile ("slbmfee %0,%1; slbie %0;" : "=r"(seg0) : "r"(0));
	for (i = 0; i < 64; i++) {
		if (!(slb[i].slbe & SLBE_VALID))
			continue;

		__asm __volatile ("slbmte %0, %1" ::
		    "r"(slb[i].slbv), "r"(slb[i].slbe));
	}
}

static int64_t
mphyp_pte_synch(struct pvo_entry *pvo)
{
	struct lpte pte;
	uint64_t junk;

	__asm __volatile("ptesync");
	phyp_pft_hcall(H_READ, 0, pvo->pvo_pte.slot, 0, 0, &pte.pte_hi,
	    &pte.pte_lo, &junk);
	if ((pte.pte_hi & LPTE_AVPN_MASK) !=
	    ((pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) &
	    LPTE_AVPN_MASK))
		return (-1);
	if (!(pte.pte_hi & LPTE_VALID))
		return (-1);

	return (pte.pte_lo & (LPTE_CHG | LPTE_REF));
}

static int64_t
mphyp_pte_clear(struct pvo_entry *pvo, uint64_t ptebit)
{
	struct rm_priotracker track;
	int64_t refchg;
	uint64_t ptelo, junk;
	int err;

	/*
	 * This involves two steps (synch and clear) so we need the entry
	 * not to change in the middle. We are protected against deliberate
	 * unset by virtue of holding the pmap lock. Protection against
	 * incidental unset (page table eviction) comes from holding the
	 * shared eviction lock.
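	 * Once synched, the requested bits are cleared via H_CLEAR_MOD and
	 * H_CLEAR_REF; each hcall hands back the prior pte_lo, so the
	 * pre-clear REF/CHG state is folded into the returned value.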
	 */
	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
	rm_rlock(&mphyp_eviction_lock, &track);

	refchg = mphyp_pte_synch(pvo);
	if (refchg < 0) {
		rm_runlock(&mphyp_eviction_lock, &track);
		return (refchg);
	}

	if (brokenkvm) {
		/*
		 * No way to clear either bit, which is total madness.
		 * Pessimistically claim that, once modified, it stays so
		 * forever and that it is never referenced.
		 */
		rm_runlock(&mphyp_eviction_lock, &track);
		return (refchg & ~LPTE_REF);
	}

	if (ptebit & LPTE_CHG) {
		err = phyp_pft_hcall(H_CLEAR_MOD, 0, pvo->pvo_pte.slot, 0, 0,
		    &ptelo, &junk, &junk);
		KASSERT(err == H_SUCCESS,
		    ("Error clearing page change bit: %d", err));
		refchg |= (ptelo & LPTE_CHG);
	}
	if (ptebit & LPTE_REF) {
		err = phyp_pft_hcall(H_CLEAR_REF, 0, pvo->pvo_pte.slot, 0, 0,
		    &ptelo, &junk, &junk);
		KASSERT(err == H_SUCCESS,
		    ("Error clearing page reference bit: %d", err));
		refchg |= (ptelo & LPTE_REF);
	}

	rm_runlock(&mphyp_eviction_lock, &track);

	return (refchg);
}

static int64_t
mphyp_pte_unset(struct pvo_entry *pvo)
{
	struct lpte pte;
	uint64_t junk;
	int err;

	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);

	moea64_pte_from_pvo(pvo, &pte);

	err = phyp_pft_hcall(H_REMOVE, H_AVPN, pvo->pvo_pte.slot,
	    pte.pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo,
	    &junk);
	KASSERT(err == H_SUCCESS || err == H_NOT_FOUND,
	    ("Error removing page: %d", err));

	if (err == H_NOT_FOUND) {
		STAT_MOEA64(moea64_pte_overflow--);
		return (-1);
	}

	return (pte.pte_lo & (LPTE_REF | LPTE_CHG));
}

static uintptr_t
mphyp_pte_spillable_ident(uintptr_t ptegbase, struct lpte *to_evict)
{
	uint64_t slot, junk, k;
	struct lpte pt;
	int i, j;

	/* Start at a random slot */
	i = mftb() % 8;
	k = -1;
	for (j = 0; j < 8; j++) {
		slot = ptegbase + (i + j) % 8;
		phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi,
		    &pt.pte_lo, &junk);

		if (pt.pte_hi & LPTE_WIRED)
			continue;

		/* This is a candidate, so remember it */
		k = slot;

		/* Try to get a page that has not been used lately */
		if (!(pt.pte_hi & LPTE_VALID) || !(pt.pte_lo & LPTE_REF)) {
			memcpy(to_evict, &pt, sizeof(struct lpte));
			return (k);
		}
	}

	if (k == -1)
		return (k);

	phyp_pft_hcall(H_READ, 0, k, 0, 0, &to_evict->pte_hi,
	    &to_evict->pte_lo, &junk);
	return (k);
}

static int64_t
mphyp_pte_insert(struct pvo_entry *pvo)
{
	struct rm_priotracker track;
	int64_t result;
	struct lpte evicted, pte;
	uint64_t index, junk, lastptelo;

	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);

	/* Initialize PTE */
	moea64_pte_from_pvo(pvo, &pte);
	evicted.pte_hi = 0;

	/* Make sure further insertion is locked out during evictions */
	rm_rlock(&mphyp_eviction_lock, &track);

	/*
	 * First try primary hash.
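	 * Without the H_EXACT flag, H_ENTER may place the mapping in any
	 * free slot of the 8-entry PTEG addressed by the base slot below;
	 * the slot actually chosen comes back in 'index'.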
	 */
	pvo->pvo_pte.slot &= ~7UL; /* Base slot address */
	result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte.pte_hi,
	    pte.pte_lo, &index, &evicted.pte_lo, &junk);
	if (result == H_SUCCESS) {
		rm_runlock(&mphyp_eviction_lock, &track);
		pvo->pvo_pte.slot = index;
		return (0);
	}
	KASSERT(result == H_PTEG_FULL, ("Page insertion error: %ld "
	    "(ptegidx: %#zx/%#lx, PTE %#lx/%#lx)", result, pvo->pvo_pte.slot,
	    moea64_pteg_count, pte.pte_hi, pte.pte_lo));

	/*
	 * Next try secondary hash.
	 */
	pvo->pvo_vaddr ^= PVO_HID;
	pte.pte_hi ^= LPTE_HID;
	pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);

	result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot,
	    pte.pte_hi, pte.pte_lo, &index, &evicted.pte_lo, &junk);
	if (result == H_SUCCESS) {
		rm_runlock(&mphyp_eviction_lock, &track);
		pvo->pvo_pte.slot = index;
		return (0);
	}
	KASSERT(result == H_PTEG_FULL, ("Secondary page insertion error: %ld",
	    result));

	/*
	 * Out of luck. Find a PTE to sacrifice.
	 */

	/* Lock out all insertions for a bit */
	rm_runlock(&mphyp_eviction_lock, &track);
	rm_wlock(&mphyp_eviction_lock);

	index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
	if (index == -1L) {
		/* Try other hash table? */
		pvo->pvo_vaddr ^= PVO_HID;
		pte.pte_hi ^= LPTE_HID;
		pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);
		index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
	}

	if (index == -1L) {
		/* No freeable slots in either PTEG? We're hosed. */
		rm_wunlock(&mphyp_eviction_lock);
		panic("mphyp_pte_insert: overflow");
		return (-1);
	}

	/* Victim acquired: update page before waving goodbye */
	if (evicted.pte_hi & LPTE_VALID) {
		result = phyp_pft_hcall(H_REMOVE, H_AVPN, index,
		    evicted.pte_hi & LPTE_AVPN_MASK, 0, &junk, &lastptelo,
		    &junk);
		STAT_MOEA64(moea64_pte_overflow++);
		KASSERT(result == H_SUCCESS || result == H_NOT_FOUND,
		    ("Error evicting page: %d", (int)result));
	}

	/*
	 * Set the new PTE.
	 */
	result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte.pte_hi,
	    pte.pte_lo, &index, &evicted.pte_lo, &junk);
	rm_wunlock(&mphyp_eviction_lock); /* All clear */

	pvo->pvo_pte.slot = index;
	if (result == H_SUCCESS)
		return (0);

	panic("Page replacement error: %ld", result);
	return (result);
}

static void *
mphyp_dump_pmap(void *ctx, void *buf, u_long *nbytes)
{
	struct dump_context *dctx;
	struct lpte p, *pbuf;
	int bufidx;
	uint64_t junk;
	u_long ptex, ptex_end;

	dctx = (struct dump_context *)ctx;
	pbuf = (struct lpte *)buf;
	bufidx = 0;
	ptex = dctx->ptex;
	ptex_end = ptex + dctx->blksz / sizeof(struct lpte);
	ptex_end = MIN(ptex_end, dctx->ptex_end);
	*nbytes = (ptex_end - ptex) * sizeof(struct lpte);

	if (*nbytes == 0)
		return (NULL);

	for (; ptex < ptex_end; ptex++) {
		phyp_pft_hcall(H_READ, 0, ptex, 0, 0,
		    &p.pte_hi, &p.pte_lo, &junk);
		pbuf[bufidx++] = p;
	}

	dctx->ptex = ptex;
	return (buf);
}