1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Guest memory management for KVM/s390 4 * 5 * Copyright IBM Corp. 2008, 2020, 2024 6 * 7 * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> 8 * Martin Schwidefsky <schwidefsky@de.ibm.com> 9 * David Hildenbrand <david@redhat.com> 10 * Janosch Frank <frankja@linux.ibm.com> 11 */ 12 13 #include <linux/compiler.h> 14 #include <linux/kvm.h> 15 #include <linux/kvm_host.h> 16 #include <linux/pgtable.h> 17 #include <linux/pagemap.h> 18 #include <asm/lowcore.h> 19 #include <asm/uv.h> 20 #include <asm/gmap_helpers.h> 21 22 #include "dat.h" 23 #include "gmap.h" 24 #include "kvm-s390.h" 25 #include "faultin.h" 26 27 static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu) 28 { 29 return vcpu->arch.sie_block->prog0c & PROG_IN_SIE; 30 } 31 32 static int gmap_limit_to_type(gfn_t limit) 33 { 34 if (!limit) 35 return TABLE_TYPE_REGION1; 36 if (limit <= _REGION3_SIZE >> PAGE_SHIFT) 37 return TABLE_TYPE_SEGMENT; 38 if (limit <= _REGION2_SIZE >> PAGE_SHIFT) 39 return TABLE_TYPE_REGION3; 40 if (limit <= _REGION1_SIZE >> PAGE_SHIFT) 41 return TABLE_TYPE_REGION2; 42 return TABLE_TYPE_REGION1; 43 } 44 45 /** 46 * gmap_new() - Allocate and initialize a guest address space. 47 * @kvm: The kvm owning the guest. 48 * @limit: Maximum address of the gmap address space. 49 * 50 * Return: A guest address space structure. 51 */ 52 struct gmap *gmap_new(struct kvm *kvm, gfn_t limit) 53 { 54 struct crst_table *table; 55 struct gmap *gmap; 56 int type; 57 58 type = gmap_limit_to_type(limit); 59 60 gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT); 61 if (!gmap) 62 return NULL; 63 INIT_LIST_HEAD(&gmap->children); 64 INIT_LIST_HEAD(&gmap->list); 65 INIT_LIST_HEAD(&gmap->scb_users); 66 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE); 67 spin_lock_init(&gmap->children_lock); 68 spin_lock_init(&gmap->host_to_rmap_lock); 69 refcount_set(&gmap->refcount, 1); 70 71 table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val); 72 if (!table) { 73 kfree(gmap); 74 return NULL; 75 } 76 77 gmap->asce.val = __pa(table); 78 gmap->asce.dt = type; 79 gmap->asce.tl = _ASCE_TABLE_LENGTH; 80 gmap->asce.x = 1; 81 gmap->asce.p = 1; 82 gmap->asce.s = 1; 83 gmap->kvm = kvm; 84 set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); 85 86 return gmap; 87 } 88 89 static void gmap_add_child(struct gmap *parent, struct gmap *child) 90 { 91 KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm); 92 KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm); 93 KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm); 94 lockdep_assert_held(&parent->children_lock); 95 96 child->parent = parent; 97 98 if (is_ucontrol(parent)) 99 set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); 100 else 101 clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); 102 103 if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags)) 104 set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); 105 else 106 clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); 107 108 if (test_bit(GMAP_FLAG_ALLOW_HPAGE_2G, &parent->flags)) 109 set_bit(GMAP_FLAG_ALLOW_HPAGE_2G, &child->flags); 110 else 111 clear_bit(GMAP_FLAG_ALLOW_HPAGE_2G, &child->flags); 112 113 if (kvm_is_ucontrol(parent->kvm)) 114 clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags); 115 list_add(&child->list, &parent->children); 116 } 117 118 struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit) 119 { 120 struct gmap *res; 121 122 lockdep_assert_not_held(&parent->children_lock); 123 res = gmap_new(parent->kvm, limit); 124 if (res) { 125 scoped_guard(spinlock, &parent->children_lock) 126 gmap_add_child(parent, res); 127 } 128 return res; 129 } 130 131 int gmap_set_limit(struct gmap *gmap, gfn_t limit) 132 { 133 struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; 134 int rc, type; 135 136 type = gmap_limit_to_type(limit); 137 138 mc = kvm_s390_new_mmu_cache(); 139 if (!mc) 140 return -ENOMEM; 141 142 do { 143 rc = kvm_s390_mmu_cache_topup(mc); 144 if (rc) 145 return rc; 146 scoped_guard(write_lock, &gmap->kvm->mmu_lock) 147 rc = dat_set_asce_limit(mc, &gmap->asce, type); 148 } while (rc == -ENOMEM); 149 150 return 0; 151 } 152 153 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) 154 { 155 struct vsie_rmap *rmap, *rnext, *head; 156 struct radix_tree_iter iter; 157 unsigned long indices[16]; 158 unsigned long index; 159 void __rcu **slot; 160 int i, nr; 161 162 /* A radix tree is freed by deleting all of its entries */ 163 index = 0; 164 do { 165 nr = 0; 166 radix_tree_for_each_slot(slot, root, &iter, index) { 167 indices[nr] = iter.index; 168 if (++nr == 16) 169 break; 170 } 171 for (i = 0; i < nr; i++) { 172 index = indices[i]; 173 head = radix_tree_delete(root, index); 174 gmap_for_each_rmap_safe(rmap, rnext, head) 175 kfree(rmap); 176 } 177 } while (nr > 0); 178 } 179 180 void gmap_remove_child(struct gmap *child) 181 { 182 if (KVM_BUG_ON(!child->parent, child->kvm)) 183 return; 184 lockdep_assert_held(&child->parent->children_lock); 185 186 list_del(&child->list); 187 child->parent = NULL; 188 child->invalidated = true; 189 } 190 191 /** 192 * gmap_dispose() - Remove and free a guest address space and its children. 193 * @gmap: Pointer to the guest address space structure. 194 */ 195 void gmap_dispose(struct gmap *gmap) 196 { 197 /* The gmap must have been removed from the parent beforehands */ 198 KVM_BUG_ON(gmap->parent, gmap->kvm); 199 /* All children of this gmap must have been removed beforehands */ 200 KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm); 201 /* No VSIE shadow block is allowed to use this gmap */ 202 KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm); 203 /* The ASCE must be valid */ 204 KVM_BUG_ON(!gmap->asce.val, gmap->kvm); 205 /* The refcount must be 0 */ 206 KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm); 207 208 /* Flush tlb of all gmaps */ 209 asce_flush_tlb(gmap->asce); 210 211 /* Free all DAT tables. */ 212 dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap)); 213 214 /* Free additional data for a shadow gmap */ 215 if (is_shadow(gmap)) 216 gmap_rmap_radix_tree_free(&gmap->host_to_rmap); 217 218 kfree(gmap); 219 } 220 221 /** 222 * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy. 223 * @gmap: The gmap whose ASCE needs to be replaced. 224 * 225 * If the ASCE is a SEGMENT type then this function will return -EINVAL, 226 * otherwise the pointers in the host_to_guest radix tree will keep pointing 227 * to the wrong pages, causing use-after-free and memory corruption. 228 * If the allocation of the new top level page table fails, the ASCE is not 229 * replaced. 230 * In any case, the old ASCE is always removed from the gmap CRST list. 231 * Therefore the caller has to make sure to save a pointer to it 232 * beforehand, unless a leak is actually intended. 233 * 234 * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE, 235 * -ENOMEM if runinng out of memory. 236 */ 237 int s390_replace_asce(struct gmap *gmap) 238 { 239 struct crst_table *table; 240 union asce asce; 241 242 /* Replacing segment type ASCEs would cause serious issues */ 243 if (gmap->asce.dt == ASCE_TYPE_SEGMENT) 244 return -EINVAL; 245 246 table = dat_alloc_crst_sleepable(0); 247 if (!table) 248 return -ENOMEM; 249 memcpy(table, dereference_asce(gmap->asce), sizeof(*table)); 250 251 /* Set new table origin while preserving existing ASCE control bits */ 252 asce = gmap->asce; 253 asce.rsto = virt_to_pfn(table); 254 WRITE_ONCE(gmap->asce, asce); 255 256 return 0; 257 } 258 259 bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint) 260 { 261 struct kvm *kvm = gmap->kvm; 262 struct kvm_vcpu *vcpu; 263 gfn_t prefix_gfn; 264 unsigned long i; 265 266 if (is_shadow(gmap)) 267 return false; 268 kvm_for_each_vcpu(i, vcpu, kvm) { 269 /* Match against both prefix pages */ 270 prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu)); 271 if (prefix_gfn < end && gfn <= prefix_gfn + 1) { 272 if (hint && kvm_s390_is_in_sie(vcpu)) 273 return false; 274 VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx", 275 gfn_to_gpa(gfn), gfn_to_gpa(end)); 276 kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); 277 } 278 } 279 return true; 280 } 281 282 struct clear_young_pte_priv { 283 struct gmap *gmap; 284 bool young; 285 }; 286 287 static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk) 288 { 289 struct clear_young_pte_priv *p = walk->priv; 290 union pgste pgste; 291 union pte pte, new; 292 293 pte = READ_ONCE(*ptep); 294 295 if (!pte.s.pr || (!pte.s.y && pte.h.i)) 296 return 0; 297 298 pgste = pgste_get_lock(ptep); 299 if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) { 300 new = pte; 301 new.h.i = 1; 302 new.s.y = 0; 303 if ((new.s.d || !new.h.p) && !new.s.s) 304 folio_set_dirty(pfn_folio(pte.h.pfra)); 305 new.s.d = 0; 306 new.h.p = 1; 307 308 pgste.prefix_notif = 0; 309 pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap)); 310 } 311 p->young = 1; 312 pgste_set_unlock(ptep, pgste); 313 return 0; 314 } 315 316 static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk) 317 { 318 struct clear_young_pte_priv *priv = walk->priv; 319 union crste crste, new; 320 321 do { 322 crste = READ_ONCE(*crstep); 323 324 if (!crste.h.fc) 325 return 0; 326 if (!crste.s.fc1.y && crste.h.i) 327 return 0; 328 if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end)) 329 break; 330 331 new = crste; 332 new.h.i = 1; 333 new.s.fc1.y = 0; 334 new.s.fc1.prefix_notif = 0; 335 if (new.s.fc1.d || !new.h.p) 336 folio_set_dirty(phys_to_folio(crste_origin_large(crste))); 337 new.s.fc1.d = 0; 338 new.h.p = 1; 339 } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce)); 340 341 priv->young = 1; 342 return 0; 343 } 344 345 /** 346 * gmap_age_gfn() - Clear young. 347 * @gmap: The guest gmap. 348 * @start: The first gfn to test. 349 * @end: The gfn after the last one to test. 350 * 351 * Context: Called with the kvm mmu write lock held. 352 * Return: 1 if any page in the given range was young, otherwise 0. 353 */ 354 bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end) 355 { 356 const struct dat_walk_ops ops = { 357 .pte_entry = gmap_clear_young_pte, 358 .pmd_entry = gmap_clear_young_crste, 359 .pud_entry = gmap_clear_young_crste, 360 }; 361 struct clear_young_pte_priv priv = { 362 .gmap = gmap, 363 .young = false, 364 }; 365 366 _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); 367 368 return priv.young; 369 } 370 371 struct gmap_unmap_priv { 372 struct gmap *gmap; 373 struct kvm_memory_slot *slot; 374 }; 375 376 static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w) 377 { 378 struct gmap_unmap_priv *priv = w->priv; 379 struct folio *folio = NULL; 380 unsigned long vmaddr; 381 union pgste pgste; 382 383 pgste = pgste_get_lock(ptep); 384 if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) { 385 vmaddr = __gfn_to_hva_memslot(priv->slot, gfn); 386 gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr); 387 } 388 if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) 389 folio = pfn_folio(ptep->h.pfra); 390 pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn); 391 pgste_set_unlock(ptep, pgste); 392 if (folio) 393 uv_convert_from_secure_folio(folio); 394 395 return 0; 396 } 397 398 static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 399 { 400 struct gmap_unmap_priv *priv = walk->priv; 401 struct folio *folio = NULL; 402 union crste old = *crstep; 403 bool ok; 404 405 if (!old.h.fc) 406 return 0; 407 408 if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) 409 folio = phys_to_folio(crste_origin_large(old)); 410 /* 411 * No races should happen because kvm->mmu_lock is held in write mode, 412 * but the unmap operation could have triggered an unshadow, which 413 * causes gmap_crstep_xchg_atomic() to return false and clear the 414 * vsie_notif bit. Allow the operation to fail once, if the old crste 415 * had the vsie_notif bit set. A second failure is not allowed, for 416 * the reasons above. 417 */ 418 ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); 419 if (!ok) { 420 KVM_BUG_ON(!old.s.fc1.vsie_notif, priv->gmap->kvm); 421 old.s.fc1.vsie_notif = 0; 422 ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); 423 KVM_BUG_ON(!ok, priv->gmap->kvm); 424 } 425 if (folio) 426 uv_convert_from_secure_folio(folio); 427 428 return 0; 429 } 430 431 /** 432 * gmap_unmap_gfn_range() - Unmap a range of guest addresses. 433 * @gmap: The gmap to act on. 434 * @slot: The memslot in which the range is located. 435 * @start: The first gfn to unmap. 436 * @end: The gfn after the last one to unmap. 437 * 438 * Context: Called with the kvm mmu write lock held. 439 * Return: false 440 */ 441 bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end) 442 { 443 const struct dat_walk_ops ops = { 444 .pte_entry = _gmap_unmap_pte, 445 .pmd_entry = _gmap_unmap_crste, 446 .pud_entry = _gmap_unmap_crste, 447 }; 448 struct gmap_unmap_priv priv = { 449 .gmap = gmap, 450 .slot = slot, 451 }; 452 453 lockdep_assert_held_write(&gmap->kvm->mmu_lock); 454 455 _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); 456 return false; 457 } 458 459 static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn, 460 struct gmap *gmap) 461 { 462 union pte pte = READ_ONCE(*ptep); 463 464 if (!pte.s.pr || (pte.h.p && !pte.s.sd)) 465 return pgste; 466 467 /* 468 * If this page contains one or more prefixes of vCPUS that are currently 469 * running, do not reset the protection, leave it marked as dirty. 470 */ 471 if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) { 472 pte.h.p = 1; 473 pte.s.sd = 0; 474 pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn); 475 } 476 477 mark_page_dirty(gmap->kvm, gfn); 478 479 return pgste; 480 } 481 482 static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end, 483 struct dat_walk *walk) 484 { 485 struct gmap *gmap = walk->priv; 486 union pgste pgste; 487 488 pgste = pgste_get_lock(ptep); 489 pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap); 490 pgste_set_unlock(ptep, pgste); 491 return 0; 492 } 493 494 static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end, 495 struct dat_walk *walk) 496 { 497 struct gmap *gmap = walk->priv; 498 union crste crste, new; 499 500 if (fatal_signal_pending(current)) 501 return 1; 502 do { 503 crste = READ_ONCE(*table); 504 if (!crste.h.fc) 505 return 0; 506 if (crste.h.p && !crste.s.fc1.sd) 507 return 0; 508 509 /* 510 * If this large page contains one or more prefixes of vCPUs that are 511 * currently running, do not reset the protection, leave it marked as 512 * dirty. 513 */ 514 if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end)) 515 break; 516 new = crste; 517 new.h.p = 1; 518 new.s.fc1.sd = 0; 519 } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn)); 520 521 for ( ; gfn < end; gfn++) 522 mark_page_dirty(gmap->kvm, gfn); 523 524 return 0; 525 } 526 527 void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end) 528 { 529 const struct dat_walk_ops walk_ops = { 530 .pte_entry = _pte_test_and_clear_softdirty, 531 .pmd_entry = _crste_test_and_clear_softdirty, 532 .pud_entry = _crste_test_and_clear_softdirty, 533 }; 534 535 lockdep_assert_held(&gmap->kvm->mmu_lock); 536 537 _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap); 538 } 539 540 static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f) 541 { 542 union crste newcrste, oldcrste = READ_ONCE(*f->crstep); 543 544 /* Somehow the crste is not large anymore, let the slow path deal with it. */ 545 if (!oldcrste.h.fc) 546 return 1; 547 548 f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn)); 549 f->writable = oldcrste.s.fc1.w; 550 551 f->crste_region3 = is_pud(oldcrste); 552 /* Appropriate permissions already (race with another handler), nothing to do. */ 553 if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p)) 554 return 0; 555 556 if (!f->write_attempt || oldcrste.s.fc1.w) { 557 f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d; 558 newcrste = oldcrste; 559 newcrste.h.i = 0; 560 newcrste.s.fc1.y = 1; 561 if (f->write_attempt) { 562 newcrste.h.p = 0; 563 newcrste.s.fc1.d = 1; 564 newcrste.s.fc1.sd = 1; 565 } 566 /* In case of races, let the slow path deal with it. */ 567 return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn); 568 } 569 /* Trying to write on a read-only page, let the slow path deal with it. */ 570 return 1; 571 } 572 573 static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste, 574 struct guest_fault *f) 575 { 576 union pte newpte, oldpte = READ_ONCE(*f->ptep); 577 578 f->pfn = oldpte.h.pfra; 579 f->writable = oldpte.s.w; 580 581 /* Appropriate permissions already (race with another handler), nothing to do. */ 582 if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p)) 583 return 0; 584 /* Trying to write on a read-only page, let the slow path deal with it. */ 585 if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w)) 586 return 1; 587 588 newpte = oldpte; 589 newpte.h.i = 0; 590 newpte.s.y = 1; 591 if (f->write_attempt) { 592 newpte.h.p = 0; 593 newpte.s.d = 1; 594 newpte.s.sd = 1; 595 } 596 *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn); 597 598 return 0; 599 } 600 601 /** 602 * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault. 603 * @gmap: The gmap whose fault needs to be resolved. 604 * @fault: Describes the fault that is being resolved. 605 * 606 * A minor fault is a fault that can be resolved quickly within gmap. 607 * The page is already mapped, the fault is only due to dirty/young tracking. 608 * 609 * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could 610 * not be resolved and needs to go through the slow path. 611 */ 612 int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault) 613 { 614 union pgste pgste; 615 int rc; 616 617 lockdep_assert_held(&gmap->kvm->mmu_lock); 618 619 rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE, 620 &fault->crstep, &fault->ptep); 621 /* If a PTE or a leaf CRSTE could not be reached, slow path. */ 622 if (rc) 623 return 1; 624 625 if (fault->ptep) { 626 pgste = pgste_get_lock(fault->ptep); 627 rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault); 628 if (!rc && fault->callback) 629 fault->callback(fault); 630 pgste_set_unlock(fault->ptep, pgste); 631 } else { 632 rc = gmap_handle_minor_crste_fault(gmap, fault); 633 if (!rc && fault->callback) 634 fault->callback(fault); 635 } 636 return rc; 637 } 638 639 /** 640 * gmap_2g_allowed() - Check whether a 2G hugepage is allowed. 641 * @gmap: The gmap of the guest. 642 * @f: Describes the fault that is being resolved. 643 * @slot: The memslot the faulting address belongs to. 644 * 645 * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_2G flag is set for 646 * @gmap, whether the offset of the address in the 2G virtual frame is the 647 * same as the offset in the physical 2G frame, and finally whether the whole 648 * 2G page would fit in the given memslot. 649 * 650 * Return: true if a 2G hugepage is allowed to back the faulting address, false 651 * otherwise. 652 */ 653 static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f, 654 struct kvm_memory_slot *slot) 655 { 656 return test_bit(GMAP_FLAG_ALLOW_HPAGE_2G, &gmap->flags) && 657 !((f->gfn ^ f->pfn) & ~_REGION3_FR_MASK) && 658 slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_REGION3) && 659 slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_REGION3); 660 } 661 662 /** 663 * gmap_1m_allowed() - Check whether a 1M hugepage is allowed. 664 * @gmap: The gmap of the guest. 665 * @f: Describes the fault that is being resolved. 666 * @slot: The memslot the faulting address belongs to. 667 * 668 * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_1M flag is set for 669 * @gmap, whether the offset of the address in the 1M virtual frame is the 670 * same as the offset in the physical 1M frame, and finally whether the whole 671 * 1M page would fit in the given memslot. 672 * 673 * Return: true if a 1M hugepage is allowed to back the faulting address, false 674 * otherwise. 675 */ 676 static inline bool gmap_1m_allowed(struct gmap *gmap, struct guest_fault *f, 677 struct kvm_memory_slot *slot) 678 { 679 return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags) && 680 !((f->gfn ^ f->pfn) & ~_SEGMENT_FR_MASK) && 681 slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_SEGMENT) && 682 slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_SEGMENT); 683 } 684 685 static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level, 686 struct guest_fault *f) 687 { 688 union crste oldval, newval; 689 union pte newpte, oldpte; 690 union pgste pgste; 691 int rc = 0; 692 693 rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level, 694 &f->crstep, &f->ptep); 695 if (rc == -ENOMEM) 696 return rc; 697 if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm)) 698 return rc; 699 if (rc) 700 return -EAGAIN; 701 if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm)) 702 return -EINVAL; 703 704 if (f->ptep) { 705 pgste = pgste_get_lock(f->ptep); 706 oldpte = *f->ptep; 707 newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page); 708 newpte.s.sd = oldpte.s.sd; 709 oldpte.s.sd = 0; 710 if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) { 711 pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn); 712 if (f->callback) 713 f->callback(f); 714 } else { 715 rc = -EAGAIN; 716 } 717 pgste_set_unlock(f->ptep, pgste); 718 } else { 719 do { 720 oldval = READ_ONCE(*f->crstep); 721 newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable, 722 f->write_attempt | oldval.s.fc1.d); 723 newval.s.fc1.s = !f->page; 724 newval.s.fc1.sd = oldval.s.fc1.sd; 725 if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val && 726 crste_origin_large(oldval) != crste_origin_large(newval)) 727 return -EAGAIN; 728 f->crste_region3 = is_pud(newval); 729 } while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn)); 730 if (f->callback) 731 f->callback(f); 732 } 733 734 return rc; 735 } 736 737 int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f, 738 struct kvm_memory_slot *slot) 739 { 740 unsigned int order; 741 int level; 742 743 lockdep_assert_held(&gmap->kvm->mmu_lock); 744 745 level = TABLE_TYPE_PAGE_TABLE; 746 if (f->page) { 747 order = folio_order(page_folio(f->page)); 748 if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f, slot)) 749 level = TABLE_TYPE_REGION3; 750 else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f, slot)) 751 level = TABLE_TYPE_SEGMENT; 752 } 753 return _gmap_link(mc, gmap, level, f); 754 } 755 756 static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, 757 gfn_t p_gfn, gfn_t c_gfn, bool force_alloc) 758 { 759 union crste newcrste, oldcrste; 760 struct page_table *pt; 761 union crste *crstep; 762 union pte *ptep; 763 int rc; 764 765 if (force_alloc) 766 rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC, 767 TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); 768 else 769 rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE, 770 TABLE_TYPE_SEGMENT, &crstep, &ptep); 771 if (rc) 772 return rc; 773 if (!ptep) { 774 newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT); 775 newcrste.h.i = 1; 776 newcrste.h.fc0.tl = 1; 777 } else { 778 pt = pte_table_start(ptep); 779 dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT)); 780 newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT); 781 } 782 rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT, 783 &crstep, &ptep); 784 if (rc) 785 return rc; 786 do { 787 oldcrste = READ_ONCE(*crstep); 788 if (oldcrste.val == newcrste.val) 789 break; 790 } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce)); 791 return 0; 792 } 793 794 static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp) 795 { 796 union pte *ptep; 797 int rc; 798 799 rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE, 800 TABLE_TYPE_SEGMENT, crstepp, &ptep); 801 if (rc || (!ptep && !crste_is_ucas(**crstepp))) 802 return -EREMOTE; 803 if (!ptep) 804 return 1; 805 *gaddr &= ~_SEGMENT_MASK; 806 *gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT; 807 return 0; 808 } 809 810 /** 811 * gmap_ucas_translate() - Translate a vcpu address into a host gmap address 812 * @mc: The memory cache to be used for allocations. 813 * @gmap: The per-cpu gmap. 814 * @gaddr: Pointer to the address to be translated, will get overwritten with 815 * the translated address in case of success. 816 * Translates the per-vCPU guest address into a fake guest address, which can 817 * then be used with the fake memslots that are identity mapping userspace. 818 * This allows ucontrol VMs to use the normal fault resolution path, like 819 * normal VMs. 820 * 821 * Return: %0 in case of success, otherwise %-EREMOTE. 822 */ 823 int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr) 824 { 825 gpa_t translated_address; 826 union crste *crstep; 827 gfn_t gfn; 828 int rc; 829 830 gfn = gpa_to_gfn(*gaddr); 831 832 scoped_guard(read_lock, &gmap->kvm->mmu_lock) { 833 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); 834 if (rc <= 0) 835 return rc; 836 } 837 do { 838 scoped_guard(write_lock, &gmap->kvm->mmu_lock) { 839 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); 840 if (rc <= 0) 841 return rc; 842 translated_address = (*gaddr & ~_SEGMENT_MASK) | 843 (crstep->val & _SEGMENT_MASK); 844 rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true); 845 } 846 if (!rc) { 847 *gaddr = translated_address; 848 return 0; 849 } 850 if (rc != -ENOMEM) 851 return -EREMOTE; 852 rc = kvm_s390_mmu_cache_topup(mc); 853 if (rc) 854 return rc; 855 } while (1); 856 return 0; 857 } 858 859 int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count) 860 { 861 struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; 862 int rc = 0; 863 864 mc = kvm_s390_new_mmu_cache(); 865 if (!mc) 866 return -ENOMEM; 867 868 while (count) { 869 scoped_guard(write_lock, &gmap->kvm->mmu_lock) 870 rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false); 871 if (rc == -ENOMEM) { 872 rc = kvm_s390_mmu_cache_topup(mc); 873 if (rc) 874 return rc; 875 continue; 876 } 877 if (rc) 878 return rc; 879 880 count--; 881 c_gfn += _PAGE_ENTRIES; 882 p_gfn += _PAGE_ENTRIES; 883 } 884 return rc; 885 } 886 887 static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn) 888 { 889 union crste *crstep; 890 union pte *ptep; 891 int rc; 892 893 rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); 894 if (rc) 895 return; 896 while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce)) 897 ; 898 } 899 900 void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) 901 { 902 guard(read_lock)(&gmap->kvm->mmu_lock); 903 904 for ( ; count; count--, c_gfn += _PAGE_ENTRIES) 905 gmap_ucas_unmap_one(gmap, c_gfn); 906 } 907 908 static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 909 { 910 struct gmap *gmap = walk->priv; 911 union crste crste, newcrste; 912 913 crste = READ_ONCE(*crstep); 914 newcrste = _CRSTE_EMPTY(crste.h.tt); 915 916 while (crste_leaf(crste)) { 917 if (crste_prefix(crste)) 918 gmap_unmap_prefix(gmap, gfn, next); 919 if (crste.s.fc1.vsie_notif) 920 gmap_handle_vsie_unshadow_event(gmap, gfn); 921 if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce)) 922 break; 923 crste = READ_ONCE(*crstep); 924 } 925 926 if (need_resched()) 927 return next; 928 929 return 0; 930 } 931 932 void gmap_split_huge_pages(struct gmap *gmap) 933 { 934 const struct dat_walk_ops ops = { 935 .pmd_entry = _gmap_split_crste, 936 .pud_entry = _gmap_split_crste, 937 }; 938 gfn_t start = 0; 939 940 do { 941 scoped_guard(read_lock, &gmap->kvm->mmu_lock) 942 start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce, 943 &ops, DAT_WALK_IGN_HOLES, gmap); 944 cond_resched(); 945 } while (start); 946 } 947 948 static int _gmap_enable_skeys(struct gmap *gmap) 949 { 950 gfn_t start = 0; 951 int rc; 952 953 if (uses_skeys(gmap)) 954 return 0; 955 956 set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); 957 rc = gmap_helper_disable_cow_sharing(); 958 if (rc) { 959 clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); 960 return rc; 961 } 962 963 do { 964 scoped_guard(write_lock, &gmap->kvm->mmu_lock) 965 start = dat_reset_skeys(gmap->asce, start); 966 cond_resched(); 967 } while (start); 968 return 0; 969 } 970 971 int gmap_enable_skeys(struct gmap *gmap) 972 { 973 int rc; 974 975 mmap_write_lock(gmap->kvm->mm); 976 rc = _gmap_enable_skeys(gmap); 977 mmap_write_unlock(gmap->kvm->mm); 978 return rc; 979 } 980 981 static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 982 { 983 if (!ptep->s.pr) 984 return 0; 985 __kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep))); 986 if (need_resched()) 987 return next; 988 return 0; 989 } 990 991 static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 992 { 993 phys_addr_t origin, cur, end; 994 995 if (!crstep->h.fc || !crstep->s.fc1.pr) 996 return 0; 997 998 origin = crste_origin_large(*crstep); 999 cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; 1000 end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin; 1001 for ( ; cur < end; cur += PAGE_SIZE) 1002 __kvm_s390_pv_destroy_page(phys_to_page(cur)); 1003 if (need_resched()) 1004 return next; 1005 return 0; 1006 } 1007 1008 int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible) 1009 { 1010 const struct dat_walk_ops ops = { 1011 .pte_entry = _destroy_pages_pte, 1012 .pmd_entry = _destroy_pages_crste, 1013 .pud_entry = _destroy_pages_crste, 1014 }; 1015 1016 do { 1017 scoped_guard(read_lock, &gmap->kvm->mmu_lock) 1018 start = _dat_walk_gfn_range(start, end, gmap->asce, &ops, 1019 DAT_WALK_IGN_HOLES, NULL); 1020 if (interruptible && fatal_signal_pending(current)) 1021 return -EINTR; 1022 cond_resched(); 1023 } while (start && start < end); 1024 return 0; 1025 } 1026 1027 int gmap_insert_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, 1028 gfn_t r_gfn, int level) 1029 { 1030 struct vsie_rmap *rmap __free(kvfree) = NULL; 1031 struct vsie_rmap *temp; 1032 void __rcu **slot; 1033 int rc = 0; 1034 1035 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 1036 lockdep_assert_held(&sg->host_to_rmap_lock); 1037 1038 rmap = kvm_s390_mmu_cache_alloc_rmap(mc); 1039 if (!rmap) 1040 return -ENOMEM; 1041 1042 rmap->r_gfn = r_gfn; 1043 rmap->level = level; 1044 slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn); 1045 if (slot) { 1046 rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock); 1047 for (temp = rmap->next; temp; temp = temp->next) { 1048 if (temp->val == rmap->val) 1049 return 0; 1050 } 1051 radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); 1052 } else { 1053 rmap->next = NULL; 1054 rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap); 1055 if (rc) 1056 return rc; 1057 } 1058 rmap = NULL; 1059 1060 return 0; 1061 } 1062 1063 int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, 1064 kvm_pfn_t pfn, int level, bool wr) 1065 { 1066 unsigned long bitmask; 1067 union crste *crstep; 1068 union pgste pgste; 1069 union pte *ptep; 1070 union pte pte; 1071 int flags, rc; 1072 1073 if (KVM_BUG_ON(!is_shadow(sg) || level <= TABLE_TYPE_PAGE_TABLE, sg->kvm)) 1074 return -EINVAL; 1075 lockdep_assert_held(&sg->parent->children_lock); 1076 1077 flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0); 1078 rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags, 1079 TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); 1080 if (rc) 1081 return rc; 1082 if (level <= TABLE_TYPE_REGION1) { 1083 bitmask = -1UL << (8 + 11 * level); 1084 scoped_guard(spinlock, &sg->host_to_rmap_lock) 1085 rc = gmap_insert_rmap(mc, sg, p_gfn, r_gfn & bitmask, level); 1086 } 1087 if (rc) 1088 return rc; 1089 1090 if (!pgste_get_trylock(ptep, &pgste)) 1091 return -EAGAIN; 1092 pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false); 1093 pte.h.p = 1; 1094 pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false); 1095 pgste.vsie_notif = 1; 1096 pgste_set_unlock(ptep, pgste); 1097 1098 return 0; 1099 } 1100 1101 static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 1102 { 1103 __atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val); 1104 if (need_resched()) 1105 return next; 1106 return 0; 1107 } 1108 1109 void gmap_set_cmma_all_dirty(struct gmap *gmap) 1110 { 1111 const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, }; 1112 gfn_t gfn = 0; 1113 1114 do { 1115 scoped_guard(read_lock, &gmap->kvm->mmu_lock) 1116 gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops, 1117 DAT_WALK_IGN_HOLES, NULL); 1118 cond_resched(); 1119 } while (gfn); 1120 } 1121 1122 static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level) 1123 { 1124 unsigned long align = PAGE_SIZE; 1125 gpa_t gaddr = gfn_to_gpa(r_gfn); 1126 union crste *crstep; 1127 union crste crste; 1128 union pte *ptep; 1129 1130 if (level > TABLE_TYPE_PAGE_TABLE) 1131 align = 1UL << (11 * level + _SEGMENT_SHIFT); 1132 kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align)); 1133 sg->invalidated = true; 1134 if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep)) 1135 return; 1136 if (ptep) { 1137 if (READ_ONCE(*ptep).val != _PTE_EMPTY.val) 1138 dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg)); 1139 return; 1140 } 1141 1142 crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce); 1143 if (crste_leaf(crste) || crste.h.i) 1144 return; 1145 if (is_pmd(crste)) 1146 dat_free_pt(dereference_pmd(crste.pmd)); 1147 else 1148 dat_free_level(dereference_crste(crste), true); 1149 } 1150 1151 static void gmap_unshadow(struct gmap *sg) 1152 { 1153 struct gmap_cache *gmap_cache, *next; 1154 1155 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 1156 KVM_BUG_ON(!sg->parent, sg->kvm); 1157 1158 lockdep_assert_held(&sg->parent->children_lock); 1159 1160 gmap_remove_child(sg); 1161 kvm_s390_vsie_gmap_notifier(sg, 0, -1UL); 1162 1163 list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) { 1164 gmap_cache->gmap = NULL; 1165 list_del(&gmap_cache->list); 1166 } 1167 1168 gmap_put(sg); 1169 } 1170 1171 void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) 1172 { 1173 struct vsie_rmap *rmap, *rnext, *head; 1174 struct gmap *sg, *next; 1175 gfn_t start, end; 1176 1177 list_for_each_entry_safe(sg, next, &parent->children, list) { 1178 start = sg->guest_asce.rsto; 1179 end = start + sg->guest_asce.tl + 1; 1180 if (!sg->guest_asce.r && gfn >= start && gfn < end) { 1181 gmap_unshadow(sg); 1182 continue; 1183 } 1184 scoped_guard(spinlock, &sg->host_to_rmap_lock) 1185 head = radix_tree_delete(&sg->host_to_rmap, gfn); 1186 gmap_for_each_rmap_safe(rmap, rnext, head) { 1187 gmap_unshadow_level(sg, rmap->r_gfn, rmap->level); 1188 kfree(rmap); 1189 } 1190 } 1191 } 1192 1193 /** 1194 * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables. 1195 * @parent: Pointer to the parent gmap. 1196 * @asce: ASCE for which the shadow table is created. 1197 * @edat_level: Edat level to be used for the shadow translation. 1198 * 1199 * Context: Called with parent->children_lock held. 1200 * 1201 * Return: The pointer to a gmap if a shadow table with the given asce is 1202 * already available, ERR_PTR(-EAGAIN) if another one is just being created, 1203 * otherwise NULL. 1204 */ 1205 static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level) 1206 { 1207 struct gmap *sg; 1208 1209 lockdep_assert_held(&parent->children_lock); 1210 list_for_each_entry(sg, &parent->children, list) { 1211 if (!gmap_is_shadow_valid(sg, asce, edat_level)) 1212 continue; 1213 return sg; 1214 } 1215 return NULL; 1216 } 1217 1218 #define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE) 1219 struct gmap_protect_asce_top_level { 1220 unsigned long seq; 1221 struct guest_fault f[CRST_TABLE_PAGES]; 1222 }; 1223 1224 static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, 1225 struct gmap_protect_asce_top_level *context) 1226 { 1227 struct gmap *parent; 1228 int rc, i; 1229 1230 guard(write_lock)(&sg->kvm->mmu_lock); 1231 1232 if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) 1233 return -EAGAIN; 1234 1235 parent = READ_ONCE(sg->parent); 1236 if (!parent) 1237 return -EAGAIN; 1238 scoped_guard(spinlock, &parent->children_lock) { 1239 if (READ_ONCE(sg->parent) != parent) 1240 return -EAGAIN; 1241 sg->invalidated = false; 1242 for (i = 0; i < CRST_TABLE_PAGES; i++) { 1243 if (!context->f[i].valid) 1244 continue; 1245 rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn, 1246 TABLE_TYPE_REGION1 + 1, context->f[i].writable); 1247 if (rc) 1248 return rc; 1249 } 1250 gmap_add_child(sg->parent, sg); 1251 } 1252 1253 kvm_s390_release_faultin_array(sg->kvm, context->f, false); 1254 return 0; 1255 } 1256 1257 static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, 1258 struct gmap_protect_asce_top_level *context) 1259 { 1260 int rc; 1261 1262 if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f)) 1263 return -EAGAIN; 1264 do { 1265 rc = kvm_s390_mmu_cache_topup(mc); 1266 if (rc) 1267 return rc; 1268 rc = radix_tree_preload(GFP_KERNEL); 1269 if (rc) 1270 return rc; 1271 rc = __gmap_protect_asce_top_level(mc, sg, context); 1272 radix_tree_preload_end(); 1273 } while (rc == -ENOMEM); 1274 1275 return rc; 1276 } 1277 1278 static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg) 1279 { 1280 struct gmap_protect_asce_top_level context = {}; 1281 union asce asce = sg->guest_asce; 1282 int rc; 1283 1284 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 1285 1286 context.seq = sg->kvm->mmu_invalidate_seq; 1287 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ 1288 smp_rmb(); 1289 1290 rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false); 1291 if (rc > 0) 1292 rc = -EFAULT; 1293 if (!rc) 1294 rc = _gmap_protect_asce_top_level(mc, sg, &context); 1295 if (rc) 1296 kvm_s390_release_faultin_array(sg->kvm, context.f, true); 1297 return rc; 1298 } 1299 1300 /** 1301 * gmap_create_shadow() - Create/find a shadow guest address space. 1302 * @mc: The cache to use to allocate dat tables. 1303 * @parent: Pointer to the parent gmap. 1304 * @asce: ASCE for which the shadow table is created. 1305 * @edat_level: Edat level to be used for the shadow translation. 1306 * 1307 * The pages of the top level page table referred by the asce parameter 1308 * will be set to read-only and marked in the PGSTEs of the kvm process. 1309 * The shadow table will be removed automatically on any change to the 1310 * PTE mapping for the source table. 1311 * 1312 * The returned shadow gmap will be returned with one extra reference. 1313 * 1314 * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory, 1315 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the 1316 * parent gmap table could not be protected. 1317 */ 1318 struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent, 1319 union asce asce, int edat_level) 1320 { 1321 struct gmap *sg, *new; 1322 int rc; 1323 1324 if (WARN_ON(!parent)) 1325 return ERR_PTR(-EINVAL); 1326 1327 scoped_guard(spinlock, &parent->children_lock) { 1328 sg = gmap_find_shadow(parent, asce, edat_level); 1329 if (sg) { 1330 gmap_get(sg); 1331 return sg; 1332 } 1333 } 1334 /* Create a new shadow gmap. */ 1335 new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce)); 1336 if (!new) 1337 return ERR_PTR(-ENOMEM); 1338 new->guest_asce = asce; 1339 new->edat_level = edat_level; 1340 set_bit(GMAP_FLAG_SHADOW, &new->flags); 1341 1342 scoped_guard(spinlock, &parent->children_lock) { 1343 /* Recheck if another CPU created the same shadow. */ 1344 sg = gmap_find_shadow(parent, asce, edat_level); 1345 if (sg) { 1346 gmap_put(new); 1347 gmap_get(sg); 1348 return sg; 1349 } 1350 if (asce.r) { 1351 /* Only allow one real-space gmap shadow. */ 1352 list_for_each_entry(sg, &parent->children, list) { 1353 if (sg->guest_asce.r) { 1354 scoped_guard(write_lock, &parent->kvm->mmu_lock) 1355 gmap_unshadow(sg); 1356 break; 1357 } 1358 } 1359 gmap_add_child(parent, new); 1360 /* Nothing to protect, return right away. */ 1361 gmap_get(new); 1362 return new; 1363 } 1364 } 1365 1366 gmap_get(new); 1367 new->parent = parent; 1368 /* Protect while inserting, protects against invalidation races. */ 1369 rc = gmap_protect_asce_top_level(mc, new); 1370 if (rc) { 1371 new->parent = NULL; 1372 gmap_put(new); 1373 gmap_put(new); 1374 return ERR_PTR(rc); 1375 } 1376 return new; 1377 } 1378