/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2010 Andreas Tobler
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vmmeter.h>

#include <dev/ofw/openfirm.h>
#include <machine/ofw_machdep.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/uma.h>

#include <powerpc/aim/mmu_oea64.h>

#include "mmu_if.h"
#include "moea64_if.h"

#include "phyp-hvcall.h"

#define MMU_PHYP_DEBUG 0
#define MMU_PHYP_ID "mmu_phyp: "
#if MMU_PHYP_DEBUG
#define dprintf(fmt, ...) printf(fmt, ## __VA_ARGS__)
#define dprintf0(fmt, ...) dprintf(MMU_PHYP_ID fmt, ## __VA_ARGS__)
#else
#define dprintf(fmt, args...) do { ; } while(0)
#define dprintf0(fmt, args...) do { ; } while(0)
#endif
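
/*
 * Read-mostly lock guarding against PTE eviction.  Routines that only
 * read or update an existing slot take it as readers; mphyp_pte_insert()
 * takes it exclusively while it selects and replaces a victim PTE, so
 * that no slot can be evicted out from under a concurrent synch/clear.
 */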
static struct rmlock mphyp_eviction_lock;

/*
 * Kernel MMU interface
 */

static void	mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart,
		    vm_offset_t kernelend);
static void	mphyp_cpu_bootstrap(mmu_t mmup, int ap);
static void	*mphyp_dump_pmap(mmu_t mmu, void *ctx, void *buf,
		    u_long *nbytes);
static int64_t	mphyp_pte_synch(mmu_t, struct pvo_entry *pvo);
static int64_t	mphyp_pte_clear(mmu_t, struct pvo_entry *pvo, uint64_t ptebit);
static int64_t	mphyp_pte_unset(mmu_t, struct pvo_entry *pvo);
static int	mphyp_pte_insert(mmu_t, struct pvo_entry *pvo);

static mmu_method_t mphyp_methods[] = {
	MMUMETHOD(mmu_bootstrap,	mphyp_bootstrap),
	MMUMETHOD(mmu_cpu_bootstrap,	mphyp_cpu_bootstrap),
	MMUMETHOD(mmu_dump_pmap,	mphyp_dump_pmap),

	MMUMETHOD(moea64_pte_synch,	mphyp_pte_synch),
	MMUMETHOD(moea64_pte_clear,	mphyp_pte_clear),
	MMUMETHOD(moea64_pte_unset,	mphyp_pte_unset),
	MMUMETHOD(moea64_pte_insert,	mphyp_pte_insert),

	/* XXX: pmap_copy_page, pmap_init_page with H_PAGE_INIT */

	{ 0, 0 }
};

MMU_DEF_INHERIT(pseries_mmu, "mmu_phyp", mphyp_methods, 0, oea64_mmu);

static int brokenkvm = 0;

static void
print_kvm_bug_warning(void *data)
{

	if (brokenkvm)
		printf("WARNING: Running on a broken hypervisor that does "
		    "not support mandatory H_CLEAR_MOD and H_CLEAR_REF "
		    "hypercalls. Performance will be suboptimal.\n");
}

SYSINIT(kvmbugwarn1, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
    print_kvm_bug_warning, NULL);
SYSINIT(kvmbugwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_kvm_bug_warning,
    NULL);

static void
mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
{
	uint64_t final_pteg_count = 0;
	char buf[8];
	uint32_t prop[2];
	uint32_t nptlp, shift = 0, slb_encoding = 0;
	uint32_t lp_size, lp_encoding;
	struct lpte old;
	uint64_t vsid;
	phandle_t dev, node, root;
	int idx, len, res;

	rm_init(&mphyp_eviction_lock, "pte eviction");

	moea64_early_bootstrap(mmup, kernelstart, kernelend);

	root = OF_peer(0);

	dev = OF_child(root);
	while (dev != 0) {
		res = OF_getprop(dev, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		dev = OF_peer(dev);
	}

	node = OF_child(dev);

	while (node != 0) {
		res = OF_getprop(node, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0)
			break;
		node = OF_peer(node);
	}

	res = OF_getencprop(node, "ibm,pft-size", prop, sizeof(prop));
	if (res <= 0)
		panic("mmu_phyp: unknown PFT size");
	/* The second cell is log2 of the PFT size in bytes. */
	final_pteg_count = 1UL << prop[1];
	res = OF_getencprop(node, "ibm,slb-size", prop, sizeof(prop[0]));
	if (res > 0)
		n_slbs = prop[0];
	dprintf0("slb-size=%i\n", n_slbs);

	moea64_pteg_count = final_pteg_count / sizeof(struct lpteg);

	/* Clear any old page table entries */
	for (idx = 0; idx < moea64_pteg_count*8; idx++) {
		phyp_pft_hcall(H_READ, 0, idx, 0, 0, &old.pte_hi,
		    &old.pte_lo, &old.pte_lo);
		vsid = (old.pte_hi << (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) >> 28;
		if (vsid == VSID_VRMA || vsid == 0 /* Older VRMA */)
			continue;

		if (old.pte_hi & LPTE_VALID)
			phyp_hcall(H_REMOVE, 0, idx, 0);
	}
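
	/*
	 * "ibm,segment-page-sizes" is a flat array of cells, one record
	 * per supported base page size:
	 *	{ shift, slb_encoding, nptlp, { penc_shift, penc } * nptlp }
	 * For example, a 16MB large-page record would typically read
	 * { 24, 0x100, 1, 24, 0 }.  The scan below looks for a record
	 * with the SLB L bit set (SLBV_L) and an actual-page encoding
	 * of zero.
	 */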
	/*
	 * Scan the large page size property for PAPR compatible machines.
	 * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties'
	 * for the encoding of the property.
	 */

	len = OF_getproplen(node, "ibm,segment-page-sizes");
	if (len > 0) {
		/*
		 * We have to use a variable length array on the stack
		 * since we have very limited stack space.
		 */
		pcell_t arr[len/sizeof(cell_t)];
		res = OF_getencprop(node, "ibm,segment-page-sizes", arr,
		    sizeof(arr));
		len /= 4;
		idx = 0;
		while (len > 0) {
			shift = arr[idx];
			slb_encoding = arr[idx + 1];
			nptlp = arr[idx + 2];

			dprintf0("Segment Page Size: "
			    "%uKB, slb_enc=0x%X: {size, encoding}[%u] =",
			    shift > 10? 1 << (shift-10) : 0,
			    slb_encoding, nptlp);

			idx += 3;
			len -= 3;
			while (len > 0 && nptlp) {
				lp_size = arr[idx];
				lp_encoding = arr[idx+1];

				dprintf(" {%uKB, 0x%X}",
				    lp_size > 10? 1 << (lp_size-10) : 0,
				    lp_encoding);

				if (slb_encoding == SLBV_L && lp_encoding == 0)
					break;

				idx += 2;
				len -= 2;
				nptlp--;
			}
			dprintf("\n");
			if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0)
				break;
		}

		if (len > 0) {
			moea64_large_page_shift = shift;
			moea64_large_page_size = 1ULL << lp_size;
			moea64_large_page_mask = moea64_large_page_size - 1;
			hw_direct_map = 1;
			printf(MMU_PHYP_ID
			    "Support for hugepages of %uKB detected\n",
			    moea64_large_page_shift > 10?
			    1 << (moea64_large_page_shift-10) : 0);
		} else {
			moea64_large_page_size = 0;
			moea64_large_page_shift = 0;
			moea64_large_page_mask = 0;
			hw_direct_map = 0;
			printf(MMU_PHYP_ID
			    "Support for hugepages not found\n");
		}
	}

	moea64_mid_bootstrap(mmup, kernelstart, kernelend);
	moea64_late_bootstrap(mmup, kernelstart, kernelend);

	/* Test for broken versions of KVM that don't conform to the spec */
	if (phyp_hcall(H_CLEAR_MOD, 0, 0) == H_FUNCTION)
		brokenkvm = 1;
}

static void
mphyp_cpu_bootstrap(mmu_t mmup, int ap)
{
	struct slb *slb = PCPU_GET(aim.slb);
	register_t seg0;
	int i;

	/*
	 * Install kernel SLB entries
	 */

	__asm __volatile ("slbia");
	__asm __volatile ("slbmfee %0,%1; slbie %0;" : "=r"(seg0) : "r"(0));
	for (i = 0; i < 64; i++) {
		if (!(slb[i].slbe & SLBE_VALID))
			continue;

		__asm __volatile ("slbmte %0, %1" ::
		    "r"(slb[i].slbv), "r"(slb[i].slbe));
	}
}

static int64_t
mphyp_pte_synch(mmu_t mmu, struct pvo_entry *pvo)
{
	struct lpte pte;
	uint64_t junk;

	__asm __volatile("ptesync");
	phyp_pft_hcall(H_READ, 0, pvo->pvo_pte.slot, 0, 0, &pte.pte_hi,
	    &pte.pte_lo, &junk);
	if ((pte.pte_hi & LPTE_AVPN_MASK) !=
	    ((pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) &
	    LPTE_AVPN_MASK))
		return (-1);
	if (!(pte.pte_hi & LPTE_VALID))
		return (-1);

	return (pte.pte_lo & (LPTE_CHG | LPTE_REF));
}
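
/*
 * Clear the requested reference/change bits in a PVO's PTE and return
 * their previous values.  A negative return value means the PVO's PTE
 * is no longer in the page table (e.g. it has been evicted).
 */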
static int64_t
mphyp_pte_clear(mmu_t mmu, struct pvo_entry *pvo, uint64_t ptebit)
{
	struct rm_priotracker track;
	int64_t refchg;
	uint64_t ptelo, junk;
	int err;

	/*
	 * This involves two steps (synch and clear) so we need the entry
	 * not to change in the middle. We are protected against deliberate
	 * unset by virtue of holding the pmap lock. Protection against
	 * incidental unset (page table eviction) comes from holding the
	 * shared eviction lock.
	 */
	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
	rm_rlock(&mphyp_eviction_lock, &track);

	refchg = mphyp_pte_synch(mmu, pvo);
	if (refchg < 0) {
		rm_runlock(&mphyp_eviction_lock, &track);
		return (refchg);
	}

	if (brokenkvm) {
		/*
		 * No way to clear either bit, which is total madness.
		 * Pessimistically claim that, once modified, it stays so
		 * forever and that it is never referenced.
		 */
		rm_runlock(&mphyp_eviction_lock, &track);
		return (refchg & ~LPTE_REF);
	}

	if (ptebit & LPTE_CHG) {
		err = phyp_pft_hcall(H_CLEAR_MOD, 0, pvo->pvo_pte.slot, 0, 0,
		    &ptelo, &junk, &junk);
		KASSERT(err == H_SUCCESS,
		    ("Error clearing page change bit: %d", err));
		refchg |= (ptelo & LPTE_CHG);
	}
	if (ptebit & LPTE_REF) {
		err = phyp_pft_hcall(H_CLEAR_REF, 0, pvo->pvo_pte.slot, 0, 0,
		    &ptelo, &junk, &junk);
		KASSERT(err == H_SUCCESS,
		    ("Error clearing page reference bit: %d", err));
		refchg |= (ptelo & LPTE_REF);
	}

	rm_runlock(&mphyp_eviction_lock, &track);

	return (refchg);
}

static int64_t
mphyp_pte_unset(mmu_t mmu, struct pvo_entry *pvo)
{
	struct lpte pte;
	uint64_t junk;
	int err;

	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);

	moea64_pte_from_pvo(pvo, &pte);

	err = phyp_pft_hcall(H_REMOVE, H_AVPN, pvo->pvo_pte.slot,
	    pte.pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo,
	    &junk);
	KASSERT(err == H_SUCCESS || err == H_NOT_FOUND,
	    ("Error removing page: %d", err));

	if (err == H_NOT_FOUND) {
		STAT_MOEA64(moea64_pte_overflow--);
		return (-1);
	}

	return (pte.pte_lo & (LPTE_REF | LPTE_CHG));
}

static uintptr_t
mphyp_pte_spillable_ident(uintptr_t ptegbase, struct lpte *to_evict)
{
	uint64_t slot, junk, k;
	struct lpte pt;
	int i, j;

	/* Start at a random slot */
	i = mftb() % 8;
	k = -1;
	for (j = 0; j < 8; j++) {
		slot = ptegbase + (i + j) % 8;
		phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi,
		    &pt.pte_lo, &junk);

		if (pt.pte_hi & LPTE_WIRED)
			continue;

		/* This is a candidate, so remember it */
		k = slot;

		/* Try to get a page that has not been used lately */
		if (!(pt.pte_hi & LPTE_VALID) || !(pt.pte_lo & LPTE_REF)) {
			memcpy(to_evict, &pt, sizeof(struct lpte));
			return (k);
		}
	}

	if (k == -1)
		return (k);

	phyp_pft_hcall(H_READ, 0, k, 0, 0, &to_evict->pte_hi,
	    &to_evict->pte_lo, &junk);
	return (k);
}
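
/*
 * Insert the PTE for a PVO: first into its primary PTEG, then into the
 * secondary one, and, if both hash groups are full, by evicting a
 * spillable (unwired, preferably unreferenced) victim while holding
 * the eviction lock exclusively.
 */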
static int
mphyp_pte_insert(mmu_t mmu, struct pvo_entry *pvo)
{
	struct rm_priotracker track;
	int64_t result;
	struct lpte evicted, pte;
	uint64_t index, junk, lastptelo;

	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);

	/* Initialize PTE */
	moea64_pte_from_pvo(pvo, &pte);
	evicted.pte_hi = 0;

	/* Make sure further insertion is locked out during evictions */
	rm_rlock(&mphyp_eviction_lock, &track);

	/*
	 * First try primary hash.
	 */
	pvo->pvo_pte.slot &= ~7UL; /* Base slot address */
	result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte.pte_hi,
	    pte.pte_lo, &index, &evicted.pte_lo, &junk);
	if (result == H_SUCCESS) {
		rm_runlock(&mphyp_eviction_lock, &track);
		pvo->pvo_pte.slot = index;
		return (0);
	}
	KASSERT(result == H_PTEG_FULL, ("Page insertion error: %ld "
	    "(ptegidx: %#zx/%#lx, PTE %#lx/%#lx)", result, pvo->pvo_pte.slot,
	    moea64_pteg_count, pte.pte_hi, pte.pte_lo));

	/*
	 * Next try secondary hash.
	 */
	pvo->pvo_vaddr ^= PVO_HID;
	pte.pte_hi ^= LPTE_HID;
	pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);

	result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot,
	    pte.pte_hi, pte.pte_lo, &index, &evicted.pte_lo, &junk);
	if (result == H_SUCCESS) {
		rm_runlock(&mphyp_eviction_lock, &track);
		pvo->pvo_pte.slot = index;
		return (0);
	}
	KASSERT(result == H_PTEG_FULL, ("Secondary page insertion error: %ld",
	    result));

	/*
	 * Out of luck. Find a PTE to sacrifice.
	 */

	/* Lock out all insertions for a bit */
	rm_runlock(&mphyp_eviction_lock, &track);
	rm_wlock(&mphyp_eviction_lock);

	index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
	if (index == -1L) {
		/* Try other hash table? */
		pvo->pvo_vaddr ^= PVO_HID;
		pte.pte_hi ^= LPTE_HID;
		pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);
		index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
	}

	if (index == -1L) {
		/* No freeable slots in either PTEG? We're hosed. */
		rm_wunlock(&mphyp_eviction_lock);
		panic("mphyp_pte_insert: overflow");
		return (-1);
	}

	/* Victim acquired: update page before waving goodbye */
	if (evicted.pte_hi & LPTE_VALID) {
		result = phyp_pft_hcall(H_REMOVE, H_AVPN, index,
		    evicted.pte_hi & LPTE_AVPN_MASK, 0, &junk, &lastptelo,
		    &junk);
		STAT_MOEA64(moea64_pte_overflow++);
		KASSERT(result == H_SUCCESS || result == H_NOT_FOUND,
		    ("Error evicting page: %d", (int)result));
	}

	/*
	 * Set the new PTE.
	 */
	result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte.pte_hi,
	    pte.pte_lo, &index, &evicted.pte_lo, &junk);
	rm_wunlock(&mphyp_eviction_lock); /* All clear */

	pvo->pvo_pte.slot = index;
	if (result == H_SUCCESS)
		return (0);

	panic("Page replacement error: %ld", result);
	return (result);
}

static void *
mphyp_dump_pmap(mmu_t mmu, void *ctx, void *buf, u_long *nbytes)
{
	struct dump_context *dctx;
	struct lpte p, *pbuf;
	int bufidx;
	uint64_t junk;
	u_long ptex, ptex_end;

	dctx = (struct dump_context *)ctx;
	pbuf = (struct lpte *)buf;
	bufidx = 0;
	ptex = dctx->ptex;
	ptex_end = ptex + dctx->blksz / sizeof(struct lpte);
	ptex_end = MIN(ptex_end, dctx->ptex_end);
	*nbytes = (ptex_end - ptex) * sizeof(struct lpte);

	if (*nbytes == 0)
		return (NULL);

	for (; ptex < ptex_end; ptex++) {
		phyp_pft_hcall(H_READ, 0, ptex, 0, 0,
		    &p.pte_hi, &p.pte_lo, &junk);
		pbuf[bufidx++] = p;
	}

	dctx->ptex = ptex;
	return (buf);
}