1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Guest memory management for KVM/s390 4 * 5 * Copyright IBM Corp. 2008, 2020, 2024 6 * 7 * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> 8 * Martin Schwidefsky <schwidefsky@de.ibm.com> 9 * David Hildenbrand <david@redhat.com> 10 * Janosch Frank <frankja@linux.ibm.com> 11 */ 12 13 #include <linux/compiler.h> 14 #include <linux/kvm.h> 15 #include <linux/kvm_host.h> 16 #include <linux/pgtable.h> 17 #include <linux/pagemap.h> 18 #include <asm/lowcore.h> 19 #include <asm/uv.h> 20 #include <asm/gmap_helpers.h> 21 22 #include "dat.h" 23 #include "gmap.h" 24 #include "kvm-s390.h" 25 #include "faultin.h" 26 27 static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu) 28 { 29 return vcpu->arch.sie_block->prog0c & PROG_IN_SIE; 30 } 31 32 static int gmap_limit_to_type(gfn_t limit) 33 { 34 if (!limit) 35 return TABLE_TYPE_REGION1; 36 if (limit <= _REGION3_SIZE >> PAGE_SHIFT) 37 return TABLE_TYPE_SEGMENT; 38 if (limit <= _REGION2_SIZE >> PAGE_SHIFT) 39 return TABLE_TYPE_REGION3; 40 if (limit <= _REGION1_SIZE >> PAGE_SHIFT) 41 return TABLE_TYPE_REGION2; 42 return TABLE_TYPE_REGION1; 43 } 44 45 /** 46 * gmap_new() - Allocate and initialize a guest address space. 47 * @kvm: The kvm owning the guest. 48 * @limit: Maximum address of the gmap address space. 49 * 50 * Return: A guest address space structure. 
51 */ 52 struct gmap *gmap_new(struct kvm *kvm, gfn_t limit) 53 { 54 struct crst_table *table; 55 struct gmap *gmap; 56 int type; 57 58 type = gmap_limit_to_type(limit); 59 60 gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT); 61 if (!gmap) 62 return NULL; 63 INIT_LIST_HEAD(&gmap->children); 64 INIT_LIST_HEAD(&gmap->list); 65 INIT_LIST_HEAD(&gmap->scb_users); 66 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE); 67 spin_lock_init(&gmap->children_lock); 68 spin_lock_init(&gmap->host_to_rmap_lock); 69 refcount_set(&gmap->refcount, 1); 70 71 table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val); 72 if (!table) { 73 kfree(gmap); 74 return NULL; 75 } 76 77 gmap->asce.val = __pa(table); 78 gmap->asce.dt = type; 79 gmap->asce.tl = _ASCE_TABLE_LENGTH; 80 gmap->asce.x = 1; 81 gmap->asce.p = 1; 82 gmap->asce.s = 1; 83 gmap->kvm = kvm; 84 set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); 85 86 return gmap; 87 } 88 89 static void gmap_add_child(struct gmap *parent, struct gmap *child) 90 { 91 KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm); 92 KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm); 93 KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm); 94 lockdep_assert_held(&parent->children_lock); 95 96 child->parent = parent; 97 98 if (is_ucontrol(parent)) 99 set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); 100 else 101 clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); 102 103 if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags)) 104 set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); 105 else 106 clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); 107 108 if (kvm_is_ucontrol(parent->kvm)) 109 clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags); 110 list_add(&child->list, &parent->children); 111 } 112 113 struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit) 114 { 115 struct gmap *res; 116 117 lockdep_assert_not_held(&parent->children_lock); 118 res = gmap_new(parent->kvm, limit); 119 if (res) { 120 
scoped_guard(spinlock, &parent->children_lock) 121 gmap_add_child(parent, res); 122 } 123 return res; 124 } 125 126 int gmap_set_limit(struct gmap *gmap, gfn_t limit) 127 { 128 struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; 129 int rc, type; 130 131 type = gmap_limit_to_type(limit); 132 133 mc = kvm_s390_new_mmu_cache(); 134 if (!mc) 135 return -ENOMEM; 136 137 do { 138 rc = kvm_s390_mmu_cache_topup(mc); 139 if (rc) 140 return rc; 141 scoped_guard(write_lock, &gmap->kvm->mmu_lock) 142 rc = dat_set_asce_limit(mc, &gmap->asce, type); 143 } while (rc == -ENOMEM); 144 145 return 0; 146 } 147 148 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) 149 { 150 struct vsie_rmap *rmap, *rnext, *head; 151 struct radix_tree_iter iter; 152 unsigned long indices[16]; 153 unsigned long index; 154 void __rcu **slot; 155 int i, nr; 156 157 /* A radix tree is freed by deleting all of its entries */ 158 index = 0; 159 do { 160 nr = 0; 161 radix_tree_for_each_slot(slot, root, &iter, index) { 162 indices[nr] = iter.index; 163 if (++nr == 16) 164 break; 165 } 166 for (i = 0; i < nr; i++) { 167 index = indices[i]; 168 head = radix_tree_delete(root, index); 169 gmap_for_each_rmap_safe(rmap, rnext, head) 170 kfree(rmap); 171 } 172 } while (nr > 0); 173 } 174 175 void gmap_remove_child(struct gmap *child) 176 { 177 if (KVM_BUG_ON(!child->parent, child->kvm)) 178 return; 179 lockdep_assert_held(&child->parent->children_lock); 180 181 list_del(&child->list); 182 child->parent = NULL; 183 child->invalidated = true; 184 } 185 186 /** 187 * gmap_dispose() - Remove and free a guest address space and its children. 188 * @gmap: Pointer to the guest address space structure. 
/**
 * gmap_dispose() - Free a guest address space structure and its DAT tables.
 * @gmap: Pointer to the guest address space structure.
 *
 * The gmap is expected to be fully detached already: the checks below flag
 * a KVM bug if it still has a parent, children or VSIE users, if its ASCE
 * is zero, or if its refcount is not zero.
 */
void gmap_dispose(struct gmap *gmap)
{
	/* The gmap must have been removed from the parent beforehand */
	KVM_BUG_ON(gmap->parent, gmap->kvm);
	/* All children of this gmap must have been removed beforehand */
	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
	/* No VSIE shadow block is allowed to use this gmap */
	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
	/* The ASCE must be valid */
	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
	/* The refcount must be 0 */
	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);

	/* Flush the TLB for this ASCE before its tables are freed */
	asce_flush_tlb(gmap->asce);

	/* Free all DAT tables. */
	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));

	/* Free additional data for a shadow gmap */
	if (is_shadow(gmap))
		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);

	kfree(gmap);
}
/**
 * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
 * @gmap: The gmap whose ASCE needs to be replaced.
 *
 * A new top-level table is allocated, the contents of the current top-level
 * table are copied into it, and the table origin in the gmap's ASCE is
 * switched to the copy. Segment type ASCEs are refused.
 *
 * If the allocation of the new top-level table fails, the ASCE is not
 * replaced. The old top-level table is not freed by this function, so a
 * caller that wants to free it later has to save a pointer to it beforehand.
 *
 * Return: 0 in case of success, -EINVAL if the ASCE is a segment type ASCE,
 * -ENOMEM if running out of memory.
 */
int s390_replace_asce(struct gmap *gmap)
{
	struct crst_table *table;
	union asce asce;

	/* Replacing segment type ASCEs would cause serious issues */
	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
		return -EINVAL;

	table = dat_alloc_crst_sleepable(0);
	if (!table)
		return -ENOMEM;
	/* Start from an exact copy of the current top-level table */
	memcpy(table, dereference_asce(gmap->asce), sizeof(*table));

	/* Set new table origin while preserving existing ASCE control bits */
	asce = gmap->asce;
	asce.rsto = virt_to_pfn(table);
	/* Publish the new ASCE with a single store */
	WRITE_ONCE(gmap->asce, asce);

	return 0;
}
/*
 * Walk callback for gmap_age_gfn(): age one region/segment table entry.
 *
 * Only large (fc set) entries are handled; other entries are skipped.
 * Always returns 0. walk->priv must point to a struct clear_young_pte_priv;
 * its young field is set whenever the entry was found young, even if it
 * could not be made old.
 */
static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *priv = walk->priv;
	union crste crste, new;

	do {
		/* Re-read the entry on every attempt, it may change under us */
		crste = READ_ONCE(*crstep);

		/* Not a large entry, nothing to do at this level */
		if (!crste.h.fc)
			return 0;
		/* Already old: y clear and entry invalid */
		if (!crste.s.fc1.y && crste.h.i)
			return 0;
		/* A prefix that cannot be made old: report young, leave entry alone */
		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
			break;

		new = crste;
		new.h.i = 1;
		new.s.fc1.y = 0;
		new.s.fc1.prefix_notif = 0;
		/* Transfer dirtiness to the folio before the d bit is cleared */
		if (new.s.fc1.d || !new.h.p)
			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
		new.s.fc1.d = 0;
		new.h.p = 1;
		/* Retry if the entry changed between the read and the exchange */
	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));

	priv->young = 1;
	return 0;
}
348 */ 349 bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end) 350 { 351 const struct dat_walk_ops ops = { 352 .pte_entry = gmap_clear_young_pte, 353 .pmd_entry = gmap_clear_young_crste, 354 .pud_entry = gmap_clear_young_crste, 355 }; 356 struct clear_young_pte_priv priv = { 357 .gmap = gmap, 358 .young = false, 359 }; 360 361 _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); 362 363 return priv.young; 364 } 365 366 struct gmap_unmap_priv { 367 struct gmap *gmap; 368 struct kvm_memory_slot *slot; 369 }; 370 371 static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w) 372 { 373 struct gmap_unmap_priv *priv = w->priv; 374 struct folio *folio = NULL; 375 unsigned long vmaddr; 376 union pgste pgste; 377 378 pgste = pgste_get_lock(ptep); 379 if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) { 380 vmaddr = __gfn_to_hva_memslot(priv->slot, gfn); 381 gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr); 382 } 383 if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) 384 folio = pfn_folio(ptep->h.pfra); 385 pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn); 386 pgste_set_unlock(ptep, pgste); 387 if (folio) 388 uv_convert_from_secure_folio(folio); 389 390 return 0; 391 } 392 393 static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 394 { 395 struct gmap_unmap_priv *priv = walk->priv; 396 struct folio *folio = NULL; 397 union crste old = *crstep; 398 bool ok; 399 400 if (!old.h.fc) 401 return 0; 402 403 if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) 404 folio = phys_to_folio(crste_origin_large(old)); 405 /* 406 * No races should happen because kvm->mmu_lock is held in write mode, 407 * but the unmap operation could have triggered an unshadow, which 408 * causes gmap_crstep_xchg_atomic() to return false and clear the 409 * vsie_notif bit. 
Allow the operation to fail once, if the old crste 410 * had the vsie_notif bit set. A second failure is not allowed, for 411 * the reasons above. 412 */ 413 ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); 414 if (!ok) { 415 KVM_BUG_ON(!old.s.fc1.vsie_notif, priv->gmap->kvm); 416 old.s.fc1.vsie_notif = 0; 417 ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); 418 KVM_BUG_ON(!ok, priv->gmap->kvm); 419 } 420 if (folio) 421 uv_convert_from_secure_folio(folio); 422 423 return 0; 424 } 425 426 /** 427 * gmap_unmap_gfn_range() - Unmap a range of guest addresses. 428 * @gmap: The gmap to act on. 429 * @slot: The memslot in which the range is located. 430 * @start: The first gfn to unmap. 431 * @end: The gfn after the last one to unmap. 432 * 433 * Context: Called with the kvm mmu write lock held. 434 * Return: false 435 */ 436 bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end) 437 { 438 const struct dat_walk_ops ops = { 439 .pte_entry = _gmap_unmap_pte, 440 .pmd_entry = _gmap_unmap_crste, 441 .pud_entry = _gmap_unmap_crste, 442 }; 443 struct gmap_unmap_priv priv = { 444 .gmap = gmap, 445 .slot = slot, 446 }; 447 448 lockdep_assert_held_write(&gmap->kvm->mmu_lock); 449 450 _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); 451 return false; 452 } 453 454 static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn, 455 struct gmap *gmap) 456 { 457 union pte pte = READ_ONCE(*ptep); 458 459 if (!pte.s.pr || (pte.h.p && !pte.s.sd)) 460 return pgste; 461 462 /* 463 * If this page contains one or more prefixes of vCPUS that are currently 464 * running, do not reset the protection, leave it marked as dirty. 
465 */ 466 if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) { 467 pte.h.p = 1; 468 pte.s.sd = 0; 469 pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn); 470 } 471 472 mark_page_dirty(gmap->kvm, gfn); 473 474 return pgste; 475 } 476 477 static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end, 478 struct dat_walk *walk) 479 { 480 struct gmap *gmap = walk->priv; 481 union pgste pgste; 482 483 pgste = pgste_get_lock(ptep); 484 pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap); 485 pgste_set_unlock(ptep, pgste); 486 return 0; 487 } 488 489 static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end, 490 struct dat_walk *walk) 491 { 492 struct gmap *gmap = walk->priv; 493 union crste crste, new; 494 495 if (fatal_signal_pending(current)) 496 return 1; 497 do { 498 crste = READ_ONCE(*table); 499 if (!crste.h.fc) 500 return 0; 501 if (crste.h.p && !crste.s.fc1.sd) 502 return 0; 503 504 /* 505 * If this large page contains one or more prefixes of vCPUs that are 506 * currently running, do not reset the protection, leave it marked as 507 * dirty. 
508 */ 509 if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end)) 510 break; 511 new = crste; 512 new.h.p = 1; 513 new.s.fc1.sd = 0; 514 } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn)); 515 516 for ( ; gfn < end; gfn++) 517 mark_page_dirty(gmap->kvm, gfn); 518 519 return 0; 520 } 521 522 void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end) 523 { 524 const struct dat_walk_ops walk_ops = { 525 .pte_entry = _pte_test_and_clear_softdirty, 526 .pmd_entry = _crste_test_and_clear_softdirty, 527 .pud_entry = _crste_test_and_clear_softdirty, 528 }; 529 530 lockdep_assert_held(&gmap->kvm->mmu_lock); 531 532 _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap); 533 } 534 535 static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f) 536 { 537 union crste newcrste, oldcrste = READ_ONCE(*f->crstep); 538 539 /* Somehow the crste is not large anymore, let the slow path deal with it. */ 540 if (!oldcrste.h.fc) 541 return 1; 542 543 f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn)); 544 f->writable = oldcrste.s.fc1.w; 545 546 /* Appropriate permissions already (race with another handler), nothing to do. */ 547 if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p)) 548 return 0; 549 550 if (!f->write_attempt || oldcrste.s.fc1.w) { 551 f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d; 552 newcrste = oldcrste; 553 newcrste.h.i = 0; 554 newcrste.s.fc1.y = 1; 555 if (f->write_attempt) { 556 newcrste.h.p = 0; 557 newcrste.s.fc1.d = 1; 558 newcrste.s.fc1.sd = 1; 559 } 560 /* In case of races, let the slow path deal with it. */ 561 return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn); 562 } 563 /* Trying to write on a read-only page, let the slow path deal with it. 
/*
 * Try to resolve a minor fault on a PTE by updating its young/dirty and
 * invalid/protect bits only.
 *
 * @pgste is the PGSTE value obtained when locking the PTE; it is updated
 * with the value returned by the exchange. f->pfn and f->writable are filled
 * in from the current PTE in all cases.
 *
 * Returns 0 if the fault is resolved (or there was nothing to do), 1 if the
 * slow path has to handle it.
 */
static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
					struct guest_fault *f)
{
	union pte newpte, oldpte = READ_ONCE(*f->ptep);

	f->pfn = oldpte.h.pfra;
	f->writable = oldpte.s.w;

	/* Appropriate permissions already (race with another handler), nothing to do. */
	if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
		return 0;
	/*
	 * The pr bit is clear, or this is a write attempt on an entry whose
	 * w bit is clear: let the slow path deal with it.
	 */
	if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
		return 1;

	/* Make the entry valid and young; for writes also unprotect and dirty it */
	newpte = oldpte;
	newpte.h.i = 0;
	newpte.s.y = 1;
	if (f->write_attempt) {
		newpte.h.p = 0;
		newpte.s.d = 1;
		newpte.s.sd = 1;
	}
	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);

	return 0;
}
/*
 * Check whether a 2G hugepage may back the faulting address.
 *
 * Currently a stub: none of the arguments are inspected and the answer is
 * always false.
 */
static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f,
				   struct kvm_memory_slot *slot)
{
	return false;
}
/*
 * Install the mapping described by @f into @gmap at table level @level
 * (or at a lower level, if that is where the walk ends up).
 *
 * An existing entry is only overwritten if it is empty or already points to
 * the same frame; otherwise -EAGAIN is returned. f->callback, if set, is
 * invoked after the entry has been written.
 *
 * Returns 0 on success, -ENOMEM or -EINVAL as reported by the table walk,
 * -EINVAL if the walk ended above @level, and -EAGAIN if the operation
 * needs to be retried.
 */
static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
		      struct guest_fault *f)
{
	union crste oldval, newval;
	union pte newpte, oldpte;
	union pgste pgste;
	int rc = 0;

	rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
			    &f->crstep, &f->ptep);
	if (rc == -ENOMEM)
		return rc;
	/* -EINVAL from the walk is not expected here and is treated as a bug */
	if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
		return rc;
	/* Any other walk failure: ask the caller to retry */
	if (rc)
		return -EAGAIN;
	if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
		return -EINVAL;

	if (f->ptep) {
		pgste = pgste_get_lock(f->ptep);
		oldpte = *f->ptep;
		/* Carry over an already set dirty bit and the soft-dirty state */
		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
		newpte.s.sd = oldpte.s.sd;
		/* Ignore the soft-dirty bit in the emptiness comparison below */
		oldpte.s.sd = 0;
		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
			pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
			/* The callback runs while the PGSTE lock is still held */
			if (f->callback)
				f->callback(f);
		} else {
			/* A different frame is mapped here already */
			rc = -EAGAIN;
		}
		pgste_set_unlock(f->ptep, pgste);
	} else {
		do {
			oldval = READ_ONCE(*f->crstep);
			newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
					    f->write_attempt | oldval.s.fc1.d);
			newval.s.fc1.s = !f->page;
			newval.s.fc1.sd = oldval.s.fc1.sd;
			/* Only replace an empty entry or one with the same origin */
			if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
			    crste_origin_large(oldval) != crste_origin_large(newval))
				return -EAGAIN;
			/* Retry if the entry changed between the read and the exchange */
		} while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
		if (f->callback)
			f->callback(f);
	}

	return rc;
}
/*
 * Map one segment of a ucontrol child gmap: make the segment table entry
 * for @c_gfn in @gmap refer to what is found for @p_gfn in gmap->parent.
 *
 * With @force_alloc the parent is walked down to page table level with
 * DAT_WALK_ALLOC; otherwise the walk targets segment level with
 * DAT_WALK_ALLOC_CONTINUE.
 *
 * Returns 0 on success or the error returned by dat_entry_walk().
 */
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
	union crste newcrste, oldcrste;
	struct page_table *pt;
	union crste *crstep;
	union pte *ptep;
	int rc;

	/* First walk: look up (and possibly allocate) the entry in the parent */
	if (force_alloc)
		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
				    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	else
		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
				    TABLE_TYPE_SEGMENT, &crstep, &ptep);
	if (rc)
		return rc;
	if (!ptep) {
		/*
		 * No page table in the parent: build an invalid entry that
		 * only records @p_gfn (presumably the form crste_is_ucas()
		 * recognises -- TODO confirm).
		 */
		newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
		newcrste.h.i = 1;
		newcrste.h.fc0.tl = 1;
	} else {
		/* Record the segment index in the page table and point the entry at it */
		pt = pte_table_start(ptep);
		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
		newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
	}
	/* Second walk: find the segment table entry in the child gmap */
	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
			    &crstep, &ptep);
	if (rc)
		return rc;
	do {
		oldcrste = READ_ONCE(*crstep);
		/* Already as desired, no exchange needed */
		if (oldcrste.val == newcrste.val)
			break;
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
	return 0;
}
&ptep); 777 if (rc || (!ptep && !crste_is_ucas(**crstepp))) 778 return -EREMOTE; 779 if (!ptep) 780 return 1; 781 *gaddr &= ~_SEGMENT_MASK; 782 *gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT; 783 return 0; 784 } 785 786 /** 787 * gmap_ucas_translate() - Translate a vcpu address into a host gmap address 788 * @mc: The memory cache to be used for allocations. 789 * @gmap: The per-cpu gmap. 790 * @gaddr: Pointer to the address to be translated, will get overwritten with 791 * the translated address in case of success. 792 * Translates the per-vCPU guest address into a fake guest address, which can 793 * then be used with the fake memslots that are identity mapping userspace. 794 * This allows ucontrol VMs to use the normal fault resolution path, like 795 * normal VMs. 796 * 797 * Return: %0 in case of success, otherwise %-EREMOTE. 798 */ 799 int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr) 800 { 801 gpa_t translated_address; 802 union crste *crstep; 803 gfn_t gfn; 804 int rc; 805 806 gfn = gpa_to_gfn(*gaddr); 807 808 scoped_guard(read_lock, &gmap->kvm->mmu_lock) { 809 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); 810 if (rc <= 0) 811 return rc; 812 } 813 do { 814 scoped_guard(write_lock, &gmap->kvm->mmu_lock) { 815 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); 816 if (rc <= 0) 817 return rc; 818 translated_address = (*gaddr & ~_SEGMENT_MASK) | 819 (crstep->val & _SEGMENT_MASK); 820 rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true); 821 } 822 if (!rc) { 823 *gaddr = translated_address; 824 return 0; 825 } 826 if (rc != -ENOMEM) 827 return -EREMOTE; 828 rc = kvm_s390_mmu_cache_topup(mc); 829 if (rc) 830 return rc; 831 } while (1); 832 return 0; 833 } 834 835 int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count) 836 { 837 struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; 838 int rc = 0; 839 840 mc = 
/*
 * Reset the segment table entry for @c_gfn in @gmap to _PMD_EMPTY.
 *
 * Nothing is done if the walk to the segment table entry fails.
 */
static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
{
	union crste *crstep;
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
	if (rc)
		return;
	/* Re-read the entry and retry until the atomic exchange succeeds */
	while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
		;
}
/*
 * Switch @gmap to storage key usage.
 *
 * Does nothing if the gmap already uses storage keys. Otherwise the
 * GMAP_FLAG_USES_SKEYS flag is set, COW sharing is disabled and the storage
 * keys are reset in chunks, with kvm->mmu_lock held for writing per chunk.
 *
 * Returns 0 on success or the error from gmap_helper_disable_cow_sharing(),
 * in which case the flag is cleared again.
 */
static int _gmap_enable_skeys(struct gmap *gmap)
{
	gfn_t start = 0;
	int rc;

	if (uses_skeys(gmap))
		return 0;

	/* The flag is set before COW sharing is disabled and rolled back on failure */
	set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
	rc = gmap_helper_disable_cow_sharing();
	if (rc) {
		clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
		return rc;
	}

	do {
		/* dat_reset_skeys() returns where to continue, 0 when done */
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			start = dat_reset_skeys(gmap->asce, start);
		/* Give others a chance to run between chunks, lock is dropped here */
		cond_resched();
	} while (start);
	return 0;
}
NULL); 996 if (interruptible && fatal_signal_pending(current)) 997 return -EINTR; 998 cond_resched(); 999 } while (start && start < end); 1000 return 0; 1001 } 1002 1003 int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level) 1004 { 1005 struct vsie_rmap *rmap __free(kvfree) = NULL; 1006 struct vsie_rmap *temp; 1007 void __rcu **slot; 1008 int rc = 0; 1009 1010 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 1011 lockdep_assert_held(&sg->host_to_rmap_lock); 1012 1013 rmap = kzalloc_obj(*rmap, GFP_ATOMIC); 1014 if (!rmap) 1015 return -ENOMEM; 1016 1017 rmap->r_gfn = r_gfn; 1018 rmap->level = level; 1019 slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn); 1020 if (slot) { 1021 rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock); 1022 for (temp = rmap->next; temp; temp = temp->next) { 1023 if (temp->val == rmap->val) 1024 return 0; 1025 } 1026 radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); 1027 } else { 1028 rmap->next = NULL; 1029 rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap); 1030 if (rc) 1031 return rc; 1032 } 1033 rmap = NULL; 1034 1035 return 0; 1036 } 1037 1038 int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, 1039 kvm_pfn_t pfn, int level, bool wr) 1040 { 1041 unsigned long bitmask; 1042 union crste *crstep; 1043 union pgste pgste; 1044 union pte *ptep; 1045 union pte pte; 1046 int flags, rc; 1047 1048 if (KVM_BUG_ON(!is_shadow(sg) || level <= TABLE_TYPE_PAGE_TABLE, sg->kvm)) 1049 return -EINVAL; 1050 lockdep_assert_held(&sg->parent->children_lock); 1051 1052 flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? 
DAT_WALK_USES_SKEYS : 0); 1053 rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags, 1054 TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); 1055 if (rc) 1056 return rc; 1057 if (level <= TABLE_TYPE_REGION1) { 1058 bitmask = -1UL << (8 + 11 * level); 1059 scoped_guard(spinlock, &sg->host_to_rmap_lock) 1060 rc = gmap_insert_rmap(sg, p_gfn, r_gfn & bitmask, level); 1061 } 1062 if (rc) 1063 return rc; 1064 1065 if (!pgste_get_trylock(ptep, &pgste)) 1066 return -EAGAIN; 1067 pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false); 1068 pte.h.p = 1; 1069 pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false); 1070 pgste.vsie_notif = 1; 1071 pgste_set_unlock(ptep, pgste); 1072 1073 return 0; 1074 } 1075 1076 static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 1077 { 1078 __atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val); 1079 if (need_resched()) 1080 return next; 1081 return 0; 1082 } 1083 1084 void gmap_set_cmma_all_dirty(struct gmap *gmap) 1085 { 1086 const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, }; 1087 gfn_t gfn = 0; 1088 1089 do { 1090 scoped_guard(read_lock, &gmap->kvm->mmu_lock) 1091 gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops, 1092 DAT_WALK_IGN_HOLES, NULL); 1093 cond_resched(); 1094 } while (gfn); 1095 } 1096 1097 static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level) 1098 { 1099 unsigned long align = PAGE_SIZE; 1100 gpa_t gaddr = gfn_to_gpa(r_gfn); 1101 union crste *crstep; 1102 union crste crste; 1103 union pte *ptep; 1104 1105 if (level > TABLE_TYPE_PAGE_TABLE) 1106 align = 1UL << (11 * level + _SEGMENT_SHIFT); 1107 kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align)); 1108 sg->invalidated = true; 1109 if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep)) 1110 return; 1111 if (ptep) { 1112 if (READ_ONCE(*ptep).val != _PTE_EMPTY.val) 1113 dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, 
uses_skeys(sg)); 1114 return; 1115 } 1116 1117 crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce); 1118 if (crste_leaf(crste) || crste.h.i) 1119 return; 1120 if (is_pmd(crste)) 1121 dat_free_pt(dereference_pmd(crste.pmd)); 1122 else 1123 dat_free_level(dereference_crste(crste), true); 1124 } 1125 1126 static void gmap_unshadow(struct gmap *sg) 1127 { 1128 struct gmap_cache *gmap_cache, *next; 1129 1130 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 1131 KVM_BUG_ON(!sg->parent, sg->kvm); 1132 1133 lockdep_assert_held(&sg->parent->children_lock); 1134 1135 gmap_remove_child(sg); 1136 kvm_s390_vsie_gmap_notifier(sg, 0, -1UL); 1137 1138 list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) { 1139 gmap_cache->gmap = NULL; 1140 list_del(&gmap_cache->list); 1141 } 1142 1143 gmap_put(sg); 1144 } 1145 1146 void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) 1147 { 1148 struct vsie_rmap *rmap, *rnext, *head; 1149 struct gmap *sg, *next; 1150 gfn_t start, end; 1151 1152 list_for_each_entry_safe(sg, next, &parent->children, list) { 1153 start = sg->guest_asce.rsto; 1154 end = start + sg->guest_asce.tl + 1; 1155 if (!sg->guest_asce.r && gfn >= start && gfn < end) { 1156 gmap_unshadow(sg); 1157 continue; 1158 } 1159 scoped_guard(spinlock, &sg->host_to_rmap_lock) 1160 head = radix_tree_delete(&sg->host_to_rmap, gfn); 1161 gmap_for_each_rmap_safe(rmap, rnext, head) { 1162 gmap_unshadow_level(sg, rmap->r_gfn, rmap->level); 1163 kfree(rmap); 1164 } 1165 } 1166 } 1167 1168 /** 1169 * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables. 1170 * @parent: Pointer to the parent gmap. 1171 * @asce: ASCE for which the shadow table is created. 1172 * @edat_level: Edat level to be used for the shadow translation. 1173 * 1174 * Context: Called with parent->children_lock held. 
1175 * 1176 * Return: The pointer to a gmap if a shadow table with the given asce is 1177 * already available, ERR_PTR(-EAGAIN) if another one is just being created, 1178 * otherwise NULL. 1179 */ 1180 static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level) 1181 { 1182 struct gmap *sg; 1183 1184 lockdep_assert_held(&parent->children_lock); 1185 list_for_each_entry(sg, &parent->children, list) { 1186 if (!gmap_is_shadow_valid(sg, asce, edat_level)) 1187 continue; 1188 return sg; 1189 } 1190 return NULL; 1191 } 1192 1193 #define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE) 1194 struct gmap_protect_asce_top_level { 1195 unsigned long seq; 1196 struct guest_fault f[CRST_TABLE_PAGES]; 1197 }; 1198 1199 static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, 1200 struct gmap_protect_asce_top_level *context) 1201 { 1202 struct gmap *parent; 1203 int rc, i; 1204 1205 guard(write_lock)(&sg->kvm->mmu_lock); 1206 1207 if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) 1208 return -EAGAIN; 1209 1210 parent = READ_ONCE(sg->parent); 1211 if (!parent) 1212 return -EAGAIN; 1213 scoped_guard(spinlock, &parent->children_lock) { 1214 if (READ_ONCE(sg->parent) != parent) 1215 return -EAGAIN; 1216 sg->invalidated = false; 1217 for (i = 0; i < CRST_TABLE_PAGES; i++) { 1218 if (!context->f[i].valid) 1219 continue; 1220 rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn, 1221 TABLE_TYPE_REGION1 + 1, context->f[i].writable); 1222 if (rc) 1223 return rc; 1224 } 1225 gmap_add_child(sg->parent, sg); 1226 } 1227 1228 kvm_s390_release_faultin_array(sg->kvm, context->f, false); 1229 return 0; 1230 } 1231 1232 static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, 1233 struct gmap_protect_asce_top_level *context) 1234 { 1235 int rc; 1236 1237 if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f)) 1238 return -EAGAIN; 1239 
do { 1240 rc = kvm_s390_mmu_cache_topup(mc); 1241 if (rc) 1242 return rc; 1243 rc = radix_tree_preload(GFP_KERNEL); 1244 if (rc) 1245 return rc; 1246 rc = __gmap_protect_asce_top_level(mc, sg, context); 1247 radix_tree_preload_end(); 1248 } while (rc == -ENOMEM); 1249 1250 return rc; 1251 } 1252 1253 static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg) 1254 { 1255 struct gmap_protect_asce_top_level context = {}; 1256 union asce asce = sg->guest_asce; 1257 int rc; 1258 1259 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 1260 1261 context.seq = sg->kvm->mmu_invalidate_seq; 1262 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ 1263 smp_rmb(); 1264 1265 rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false); 1266 if (rc > 0) 1267 rc = -EFAULT; 1268 if (!rc) 1269 rc = _gmap_protect_asce_top_level(mc, sg, &context); 1270 if (rc) 1271 kvm_s390_release_faultin_array(sg->kvm, context.f, true); 1272 return rc; 1273 } 1274 1275 /** 1276 * gmap_create_shadow() - Create/find a shadow guest address space. 1277 * @mc: The cache to use to allocate dat tables. 1278 * @parent: Pointer to the parent gmap. 1279 * @asce: ASCE for which the shadow table is created. 1280 * @edat_level: Edat level to be used for the shadow translation. 1281 * 1282 * The pages of the top level page table referred by the asce parameter 1283 * will be set to read-only and marked in the PGSTEs of the kvm process. 1284 * The shadow table will be removed automatically on any change to the 1285 * PTE mapping for the source table. 1286 * 1287 * The returned shadow gmap will be returned with one extra reference. 1288 * 1289 * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory, 1290 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the 1291 * parent gmap table could not be protected. 
1292 */ 1293 struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent, 1294 union asce asce, int edat_level) 1295 { 1296 struct gmap *sg, *new; 1297 int rc; 1298 1299 if (WARN_ON(!parent)) 1300 return ERR_PTR(-EINVAL); 1301 1302 scoped_guard(spinlock, &parent->children_lock) { 1303 sg = gmap_find_shadow(parent, asce, edat_level); 1304 if (sg) { 1305 gmap_get(sg); 1306 return sg; 1307 } 1308 } 1309 /* Create a new shadow gmap. */ 1310 new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce)); 1311 if (!new) 1312 return ERR_PTR(-ENOMEM); 1313 new->guest_asce = asce; 1314 new->edat_level = edat_level; 1315 set_bit(GMAP_FLAG_SHADOW, &new->flags); 1316 1317 scoped_guard(spinlock, &parent->children_lock) { 1318 /* Recheck if another CPU created the same shadow. */ 1319 sg = gmap_find_shadow(parent, asce, edat_level); 1320 if (sg) { 1321 gmap_put(new); 1322 gmap_get(sg); 1323 return sg; 1324 } 1325 if (asce.r) { 1326 /* Only allow one real-space gmap shadow. */ 1327 list_for_each_entry(sg, &parent->children, list) { 1328 if (sg->guest_asce.r) { 1329 scoped_guard(write_lock, &parent->kvm->mmu_lock) 1330 gmap_unshadow(sg); 1331 break; 1332 } 1333 } 1334 gmap_add_child(parent, new); 1335 /* Nothing to protect, return right away. */ 1336 gmap_get(new); 1337 return new; 1338 } 1339 } 1340 1341 gmap_get(new); 1342 new->parent = parent; 1343 /* Protect while inserting, protects against invalidation races. */ 1344 rc = gmap_protect_asce_top_level(mc, new); 1345 if (rc) { 1346 new->parent = NULL; 1347 gmap_put(new); 1348 gmap_put(new); 1349 return ERR_PTR(rc); 1350 } 1351 return new; 1352 } 1353