/*
 * Copyright (C) 2010 Andreas Tobler
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vmmeter.h>

#include <dev/ofw/openfirm.h>
#include <machine/ofw_machdep.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/uma.h>

#include <powerpc/aim/mmu_oea64.h>

#include "mmu_if.h"
#include "moea64_if.h"

#include "phyp-hvcall.h"

extern int n_slbs;

static struct rmlock mphyp_eviction_lock;

/*
 * Kernel MMU interface
 */

static void     mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart,
                    vm_offset_t kernelend);
static void     mphyp_cpu_bootstrap(mmu_t mmup, int ap);
static int64_t  mphyp_pte_synch(mmu_t, struct pvo_entry *pvo);
static int64_t  mphyp_pte_clear(mmu_t, struct pvo_entry *pvo, uint64_t ptebit);
static int64_t  mphyp_pte_unset(mmu_t, struct pvo_entry *pvo);
static int      mphyp_pte_insert(mmu_t, struct pvo_entry *pvo);

static mmu_method_t mphyp_methods[] = {
        MMUMETHOD(mmu_bootstrap,        mphyp_bootstrap),
        MMUMETHOD(mmu_cpu_bootstrap,    mphyp_cpu_bootstrap),

        MMUMETHOD(moea64_pte_synch,     mphyp_pte_synch),
        MMUMETHOD(moea64_pte_clear,     mphyp_pte_clear),
        MMUMETHOD(moea64_pte_unset,     mphyp_pte_unset),
        MMUMETHOD(moea64_pte_insert,    mphyp_pte_insert),

        /* XXX: pmap_copy_page, pmap_init_page with H_PAGE_INIT */

        { 0, 0 }
};

MMU_DEF_INHERIT(pseries_mmu, "mmu_phyp", mphyp_methods, 0, oea64_mmu);

static int brokenkvm = 0;

static void
print_kvm_bug_warning(void *data)
{

        if (brokenkvm)
                printf("WARNING: Running on a broken hypervisor that does "
                    "not support mandatory H_CLEAR_MOD and H_CLEAR_REF "
                    "hypercalls. Performance will be suboptimal.\n");
}
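
/*
 * The warning is registered twice so that it shows up both near the top
 * of the boot messages (right after the copyright notice) and again at
 * the very end of boot, where it is harder to miss.  brokenkvm is set
 * in mphyp_bootstrap(), which runs before either SYSINIT fires.
 */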

SYSINIT(kvmbugwarn1, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
    print_kvm_bug_warning, NULL);
SYSINIT(kvmbugwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_kvm_bug_warning,
    NULL);

static void
mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
{
        uint64_t final_pteg_count = 0;
        char buf[8];
        uint32_t prop[2];
        uint32_t nptlp, shift = 0, slb_encoding = 0;
        uint32_t lp_size, lp_encoding;
        struct lpte old;
        uint64_t vsid;
        phandle_t dev, node, root;
        int idx, len, res;

        rm_init(&mphyp_eviction_lock, "pte eviction");

        moea64_early_bootstrap(mmup, kernelstart, kernelend);

        root = OF_peer(0);

        dev = OF_child(root);
        while (dev != 0) {
                res = OF_getprop(dev, "name", buf, sizeof(buf));
                if (res > 0 && strcmp(buf, "cpus") == 0)
                        break;
                dev = OF_peer(dev);
        }

        node = OF_child(dev);

        while (node != 0) {
                res = OF_getprop(node, "device_type", buf, sizeof(buf));
                if (res > 0 && strcmp(buf, "cpu") == 0)
                        break;
                node = OF_peer(node);
        }

        res = OF_getprop(node, "ibm,pft-size", prop, sizeof(prop));
        if (res <= 0)
                panic("mmu_phyp: unknown PFT size");
        final_pteg_count = 1 << prop[1];
        res = OF_getprop(node, "ibm,slb-size", prop, sizeof(prop[0]));
        if (res > 0)
                n_slbs = prop[0];

        moea64_pteg_count = final_pteg_count / sizeof(struct lpteg);

        /*
         * Clear any old page table entries.  Only pte_hi is examined
         * below, so pte_lo doubles as the don't-care third output.
         */
        for (idx = 0; idx < moea64_pteg_count*8; idx++) {
                phyp_pft_hcall(H_READ, 0, idx, 0, 0, &old.pte_hi,
                    &old.pte_lo, &old.pte_lo);
                vsid = (old.pte_hi << (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) >> 28;
                if (vsid == VSID_VRMA || vsid == 0 /* Older VRMA */)
                        continue;

                if (old.pte_hi & LPTE_VALID)
                        phyp_hcall(H_REMOVE, 0, idx, 0);
        }

        /*
         * Scan the large page size property for PAPR compatible machines.
         * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties'
         * for the encoding of the property.
         */

        len = OF_getproplen(node, "ibm,segment-page-sizes");
        if (len > 0) {
                /*
                 * We have to use a variable length array on the stack
                 * since we have very limited stack space.
                 */
                pcell_t arr[len/sizeof(cell_t)];
                res = OF_getencprop(node, "ibm,segment-page-sizes", arr,
                    sizeof(arr));
                len /= 4;       /* convert length from bytes to cells */
                idx = 0;
                while (len > 0) {
                        shift = arr[idx];
                        slb_encoding = arr[idx + 1];
                        nptlp = arr[idx + 2];
                        idx += 3;
                        len -= 3;
                        while (len > 0 && nptlp) {
                                lp_size = arr[idx];
                                lp_encoding = arr[idx+1];
                                if (slb_encoding == SLBV_L && lp_encoding == 0)
                                        break;

                                idx += 2;
                                len -= 2;
                                nptlp--;
                        }
                        if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0)
                                break;
                }

                if (len == 0)
                        panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) "
                            "not supported by this system. Please enable huge "
                            "page backing if running under PowerKVM.");

                moea64_large_page_shift = shift;
                moea64_large_page_size = 1ULL << lp_size;
        }

        moea64_mid_bootstrap(mmup, kernelstart, kernelend);
        moea64_late_bootstrap(mmup, kernelstart, kernelend);

        /* Test for broken versions of KVM that don't conform to the spec */
        if (phyp_hcall(H_CLEAR_MOD, 0, 0) == H_FUNCTION)
                brokenkvm = 1;
}
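
/*
 * Per-CPU MMU setup: invalidate the SLB, then reinstall the kernel SLB
 * entries cached in the PCPU data.  slbia leaves SLB entry 0 in place,
 * so that one is invalidated explicitly with slbmfee/slbie.
 */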
static void
mphyp_cpu_bootstrap(mmu_t mmup, int ap)
{
        struct slb *slb = PCPU_GET(slb);
        register_t seg0;
        int i;

        /*
         * Install kernel SLB entries
         */

        __asm __volatile ("slbia");
        __asm __volatile ("slbmfee %0,%1; slbie %0;" : "=r"(seg0) : "r"(0));
        for (i = 0; i < 64; i++) {
                if (!(slb[i].slbe & SLBE_VALID))
                        continue;

                __asm __volatile ("slbmte %0, %1" ::
                    "r"(slb[i].slbv), "r"(slb[i].slbe));
        }
}

static int64_t
mphyp_pte_synch(mmu_t mmu, struct pvo_entry *pvo)
{
        struct lpte pte;
        uint64_t junk;

        __asm __volatile("ptesync");
        phyp_pft_hcall(H_READ, 0, pvo->pvo_pte.slot, 0, 0, &pte.pte_hi,
            &pte.pte_lo, &junk);
        if ((pte.pte_hi & LPTE_AVPN_MASK) !=
            ((pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) &
            LPTE_AVPN_MASK))
                return (-1);
        if (!(pte.pte_hi & LPTE_VALID))
                return (-1);

        return (pte.pte_lo & (LPTE_CHG | LPTE_REF));
}

static int64_t
mphyp_pte_clear(mmu_t mmu, struct pvo_entry *pvo, uint64_t ptebit)
{
        struct rm_priotracker track;
        int64_t refchg;
        uint64_t ptelo, junk;
        int err;

        /*
         * This involves two steps (synch and clear) so we need the entry
         * not to change in the middle. We are protected against deliberate
         * unset by virtue of holding the pmap lock. Protection against
         * incidental unset (page table eviction) comes from holding the
         * shared eviction lock.
         */
        PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
        rm_rlock(&mphyp_eviction_lock, &track);

        refchg = mphyp_pte_synch(mmu, pvo);
        if (refchg < 0) {
                rm_runlock(&mphyp_eviction_lock, &track);
                return (refchg);
        }

        if (brokenkvm) {
                /*
                 * No way to clear either bit, which is total madness.
                 * Pessimistically claim that, once modified, it stays so
                 * forever and that it is never referenced.
                 */
                rm_runlock(&mphyp_eviction_lock, &track);
                return (refchg & ~LPTE_REF);
        }

        if (ptebit & LPTE_CHG) {
                err = phyp_pft_hcall(H_CLEAR_MOD, 0, pvo->pvo_pte.slot, 0, 0,
                    &ptelo, &junk, &junk);
                KASSERT(err == H_SUCCESS,
                    ("Error clearing page change bit: %d", err));
                refchg |= (ptelo & LPTE_CHG);
        }
        if (ptebit & LPTE_REF) {
                err = phyp_pft_hcall(H_CLEAR_REF, 0, pvo->pvo_pte.slot, 0, 0,
                    &ptelo, &junk, &junk);
                KASSERT(err == H_SUCCESS,
                    ("Error clearing page reference bit: %d", err));
                refchg |= (ptelo & LPTE_REF);
        }

        rm_runlock(&mphyp_eviction_lock, &track);

        return (refchg);
}
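
/*
 * Remove the hypervisor page table entry backing this PVO.  Passing
 * H_AVPN makes H_REMOVE verify the abbreviated VPN first, so a slot
 * that was meanwhile evicted and reused for a different mapping is
 * left alone; H_NOT_FOUND then means our entry already fell victim to
 * eviction, which is accounted for in moea64_pte_overflow.
 */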
static int64_t
mphyp_pte_unset(mmu_t mmu, struct pvo_entry *pvo)
{
        struct lpte pte;
        uint64_t junk;
        int err;

        PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);

        moea64_pte_from_pvo(pvo, &pte);

        err = phyp_pft_hcall(H_REMOVE, H_AVPN, pvo->pvo_pte.slot,
            pte.pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo,
            &junk);
        KASSERT(err == H_SUCCESS || err == H_NOT_FOUND,
            ("Error removing page: %d", err));

        if (err == H_NOT_FOUND) {
                moea64_pte_overflow--;
                return (-1);
        }

        return (pte.pte_lo & (LPTE_REF | LPTE_CHG));
}

static uintptr_t
mphyp_pte_spillable_ident(uintptr_t ptegbase, struct lpte *to_evict)
{
        uint64_t slot, junk, k;
        struct lpte pt;
        int i, j;

        /* Start at a random slot */
        i = mftb() % 8;
        k = -1;
        for (j = 0; j < 8; j++) {
                slot = ptegbase + (i + j) % 8;
                phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi,
                    &pt.pte_lo, &junk);

                if (pt.pte_hi & LPTE_WIRED)
                        continue;

                /* This is a candidate, so remember it */
                k = slot;

                /* Try to get a page that has not been used lately */
                if (!(pt.pte_hi & LPTE_VALID) || !(pt.pte_lo & LPTE_REF)) {
                        memcpy(to_evict, &pt, sizeof(struct lpte));
                        return (k);
                }
        }

        if (k == -1)
                return (k);

        phyp_pft_hcall(H_READ, 0, k, 0, 0, &to_evict->pte_hi,
            &to_evict->pte_lo, &junk);
        return (k);
}
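
/*
 * Insert the PTE for a PVO: try the primary hash group first, then the
 * secondary, and if both PTEGs are full take the eviction lock
 * exclusively and displace a spillable (unwired, preferably
 * unreferenced) entry found by mphyp_pte_spillable_ident().
 */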
static int
mphyp_pte_insert(mmu_t mmu, struct pvo_entry *pvo)
{
        struct rm_priotracker track;
        int64_t result;
        struct lpte evicted, pte;
        uint64_t index, junk, lastptelo;

        PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);

        /* Initialize PTE */
        moea64_pte_from_pvo(pvo, &pte);
        evicted.pte_hi = 0;

        /* Make sure further insertion is locked out during evictions */
        rm_rlock(&mphyp_eviction_lock, &track);

        /*
         * First try primary hash.
         */
        pvo->pvo_pte.slot &= ~7UL; /* Base slot address */
        result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte.pte_hi,
            pte.pte_lo, &index, &evicted.pte_lo, &junk);
        if (result == H_SUCCESS) {
                rm_runlock(&mphyp_eviction_lock, &track);
                pvo->pvo_pte.slot = index;
                return (0);
        }
        KASSERT(result == H_PTEG_FULL, ("Page insertion error: %ld "
            "(ptegidx: %#zx/%#x, PTE %#lx/%#lx)", result, pvo->pvo_pte.slot,
            moea64_pteg_count, pte.pte_hi, pte.pte_lo));

        /*
         * Next try secondary hash.
         */
        pvo->pvo_vaddr ^= PVO_HID;
        pte.pte_hi ^= LPTE_HID;
        pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);

        result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot,
            pte.pte_hi, pte.pte_lo, &index, &evicted.pte_lo, &junk);
        if (result == H_SUCCESS) {
                rm_runlock(&mphyp_eviction_lock, &track);
                pvo->pvo_pte.slot = index;
                return (0);
        }
        KASSERT(result == H_PTEG_FULL, ("Secondary page insertion error: %ld",
            result));

        /*
         * Out of luck. Find a PTE to sacrifice.
         */

        /* Lock out all insertions for a bit */
        rm_runlock(&mphyp_eviction_lock, &track);
        rm_wlock(&mphyp_eviction_lock);

        index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
        if (index == -1L) {
                /* Try other hash table? */
                pvo->pvo_vaddr ^= PVO_HID;
                pte.pte_hi ^= LPTE_HID;
                pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);
                index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
        }

        if (index == -1L) {
                /* No freeable slots in either PTEG? We're hosed. */
                rm_wunlock(&mphyp_eviction_lock);
                panic("mphyp_pte_insert: overflow");
                return (-1);
        }

        /* Victim acquired: update page before waving goodbye */
        if (evicted.pte_hi & LPTE_VALID) {
                result = phyp_pft_hcall(H_REMOVE, H_AVPN, index,
                    evicted.pte_hi & LPTE_AVPN_MASK, 0, &junk, &lastptelo,
                    &junk);
                moea64_pte_overflow++;
                KASSERT(result == H_SUCCESS,
                    ("Error evicting page: %d", (int)result));
        }

        /*
         * Set the new PTE.
         */
        result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte.pte_hi,
            pte.pte_lo, &index, &evicted.pte_lo, &junk);
        rm_wunlock(&mphyp_eviction_lock); /* All clear */

        pvo->pvo_pte.slot = index;
        if (result == H_SUCCESS)
                return (0);

        panic("Page replacement error: %ld", result);
        return (result);
}