// SPDX-License-Identifier: GPL-2.0
/*
 * Guest memory management for KVM/s390
 *
 * Copyright IBM Corp. 2008, 2020, 2024
 *
 *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
 *               Martin Schwidefsky <schwidefsky@de.ibm.com>
 *               David Hildenbrand <david@redhat.com>
 *               Janosch Frank <frankja@linux.ibm.com>
 */

#include <linux/compiler.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/pgtable.h>
#include <linux/pagemap.h>
#include <asm/lowcore.h>
#include <asm/uv.h>
#include <asm/gmap_helpers.h>

#include "dat.h"
#include "gmap.h"
#include "kvm-s390.h"
#include "faultin.h"

/* Check whether @vcpu is currently executing inside SIE. */
static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.sie_block->prog0c & PROG_IN_SIE;
}

/*
 * Map an upper limit (in guest page frames) to the smallest DAT table type
 * that can still address the whole range.  A @limit of 0 means "no limit"
 * and yields the largest (region-1) table type.
 */
static int gmap_limit_to_type(gfn_t limit)
{
	if (!limit)
		return TABLE_TYPE_REGION1;
	if (limit <= _REGION3_SIZE >> PAGE_SHIFT)
		return TABLE_TYPE_SEGMENT;
	if (limit <= _REGION2_SIZE >> PAGE_SHIFT)
		return TABLE_TYPE_REGION3;
	if (limit <= _REGION1_SIZE >> PAGE_SHIFT)
		return TABLE_TYPE_REGION2;
	return TABLE_TYPE_REGION1;
}

/**
 * gmap_new() - Allocate and initialize a guest address space.
 * @kvm: The kvm owning the guest.
 * @limit: Maximum address of the gmap address space.
 *
 * Return: A guest address space structure.
 */
struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
{
	struct crst_table *table;
	struct gmap *gmap;
	int type;

	/* Size the top-level table type to the requested limit */
	type = gmap_limit_to_type(limit);

	gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
	if (!gmap)
		return NULL;
	INIT_LIST_HEAD(&gmap->children);
	INIT_LIST_HEAD(&gmap->list);
	INIT_LIST_HEAD(&gmap->scb_users);
	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
	spin_lock_init(&gmap->children_lock);
	spin_lock_init(&gmap->host_to_rmap_lock);
	refcount_set(&gmap->refcount, 1);

	table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
	if (!table) {
		kfree(gmap);
		return NULL;
	}

	gmap->asce.val = __pa(table);
	gmap->asce.dt = type;
	gmap->asce.tl = _ASCE_TABLE_LENGTH;
	gmap->asce.x = 1;
	gmap->asce.p = 1;
	gmap->asce.s = 1;
	gmap->kvm = kvm;
	/* A freshly created gmap owns its page tables */
	set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);

	return gmap;
}

/*
 * Link @child into @parent's list of children and propagate the
 * ucontrol and 1M-hugepage properties of the parent to the child.
 * Caller must hold parent->children_lock.
 */
static void gmap_add_child(struct gmap *parent, struct gmap *child)
{
	KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
	KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
	KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
	lockdep_assert_held(&parent->children_lock);

	child->parent = parent;

	if (is_ucontrol(parent))
		set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
	else
		clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);

	if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
		set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
	else
		clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);

	/* For ucontrol VMs the page tables do not belong to the child gmap */
	if (kvm_is_ucontrol(parent->kvm))
		clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
	list_add(&child->list, &parent->children);
}

/* Allocate a new gmap and register it as a child of @parent. */
struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
{
	struct gmap *res;

	lockdep_assert_not_held(&parent->children_lock);
	res = gmap_new(parent->kvm, limit);
	if (res) {
scoped_guard(spinlock, &parent->children_lock) 121 gmap_add_child(parent, res); 122 } 123 return res; 124 } 125 126 int gmap_set_limit(struct gmap *gmap, gfn_t limit) 127 { 128 struct kvm_s390_mmu_cache *mc; 129 int rc, type; 130 131 type = gmap_limit_to_type(limit); 132 133 mc = kvm_s390_new_mmu_cache(); 134 if (!mc) 135 return -ENOMEM; 136 137 do { 138 rc = kvm_s390_mmu_cache_topup(mc); 139 if (rc) 140 return rc; 141 scoped_guard(write_lock, &gmap->kvm->mmu_lock) 142 rc = dat_set_asce_limit(mc, &gmap->asce, type); 143 } while (rc == -ENOMEM); 144 145 kvm_s390_free_mmu_cache(mc); 146 return 0; 147 } 148 149 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) 150 { 151 struct vsie_rmap *rmap, *rnext, *head; 152 struct radix_tree_iter iter; 153 unsigned long indices[16]; 154 unsigned long index; 155 void __rcu **slot; 156 int i, nr; 157 158 /* A radix tree is freed by deleting all of its entries */ 159 index = 0; 160 do { 161 nr = 0; 162 radix_tree_for_each_slot(slot, root, &iter, index) { 163 indices[nr] = iter.index; 164 if (++nr == 16) 165 break; 166 } 167 for (i = 0; i < nr; i++) { 168 index = indices[i]; 169 head = radix_tree_delete(root, index); 170 gmap_for_each_rmap_safe(rmap, rnext, head) 171 kfree(rmap); 172 } 173 } while (nr > 0); 174 } 175 176 void gmap_remove_child(struct gmap *child) 177 { 178 if (KVM_BUG_ON(!child->parent, child->kvm)) 179 return; 180 lockdep_assert_held(&child->parent->children_lock); 181 182 list_del(&child->list); 183 child->parent = NULL; 184 child->invalidated = true; 185 } 186 187 /** 188 * gmap_dispose() - Remove and free a guest address space and its children. 189 * @gmap: Pointer to the guest address space structure. 
 */
void gmap_dispose(struct gmap *gmap)
{
	/* The gmap must have been removed from the parent beforehand */
	KVM_BUG_ON(gmap->parent, gmap->kvm);
	/* All children of this gmap must have been removed beforehand */
	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
	/* No VSIE shadow block is allowed to use this gmap */
	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
	/* The ASCE must be valid */
	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
	/* The refcount must be 0 */
	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);

	/* Flush tlb of all gmaps */
	asce_flush_tlb(gmap->asce);

	/* Free all DAT tables. */
	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));

	/* Free additional data for a shadow gmap */
	if (is_shadow(gmap))
		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);

	kfree(gmap);
}

/**
 * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
 * @gmap: The gmap whose ASCE needs to be replaced.
 *
 * If the ASCE is a SEGMENT type then this function will return -EINVAL,
 * otherwise the pointers in the host_to_guest radix tree will keep pointing
 * to the wrong pages, causing use-after-free and memory corruption.
 * If the allocation of the new top level page table fails, the ASCE is not
 * replaced.
 * In any case, the old ASCE is always removed from the gmap CRST list.
 * Therefore the caller has to make sure to save a pointer to it
 * beforehand, unless a leak is actually intended.
 *
 * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE,
 * -ENOMEM if running out of memory.
 */
int s390_replace_asce(struct gmap *gmap)
{
	struct crst_table *table;
	union asce asce;

	/* Replacing segment type ASCEs would cause serious issues */
	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
		return -EINVAL;

	table = dat_alloc_crst_sleepable(0);
	if (!table)
		return -ENOMEM;
	/* Duplicate the current top-level table into the new one */
	memcpy(table, dereference_asce(gmap->asce), sizeof(*table));

	/* Set new table origin while preserving existing ASCE control bits */
	asce = gmap->asce;
	asce.rsto = virt_to_pfn(table);
	WRITE_ONCE(gmap->asce, asce);

	return 0;
}

/*
 * Request a prefix refresh for every vCPU whose prefix pages intersect the
 * range [@gfn, @end).  If @hint is true and a matching vCPU is currently
 * running in SIE, bail out so the caller can retry later.
 *
 * Return: false if the request could not be delivered (shadow gmap, or a
 * matching vCPU is in SIE while @hint is set), true otherwise.
 */
bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
{
	struct kvm *kvm = gmap->kvm;
	struct kvm_vcpu *vcpu;
	gfn_t prefix_gfn;
	unsigned long i;

	if (is_shadow(gmap))
		return false;
	kvm_for_each_vcpu(i, vcpu, kvm) {
		/* Match against both prefix pages */
		prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu));
		if (prefix_gfn < end && gfn <= prefix_gfn + 1) {
			if (hint && kvm_s390_is_in_sie(vcpu))
				return false;
			VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
				   gfn_to_gpa(gfn), gfn_to_gpa(end));
			kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
		}
	}
	return true;
}

/* Private state shared by the "clear young" walk callbacks below. */
struct clear_young_pte_priv {
	struct gmap *gmap;
	bool young;
};

/* Walk callback: test and clear the young state of a single PTE. */
static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *p = walk->priv;
	union pgste pgste;
	union pte pte, new;

	pte = READ_ONCE(*ptep);

	/* Skip entries that are not present, or neither young nor valid */
	if (!pte.s.pr || (!pte.s.y && pte.h.i))
		return 0;

	pgste = pgste_get_lock(ptep);
	/* Leave prefix pages of running vCPUs untouched */
	if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
		new = pte;
		new.h.i = 1;
		new.s.y = 0;
		/* Transfer dirtiness to the backing folio before invalidating */
		if ((new.s.d || !new.h.p) && !new.s.s)
			folio_set_dirty(pfn_folio(pte.h.pfra));
		new.s.d = 0;
		new.h.p = 1;

		pgste.prefix_notif = 0;
		pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
	}
	p->young = 1;
	pgste_set_unlock(ptep, pgste);
	return 0;
}

/* Walk callback: test and clear the young state of a large (leaf) CRSTE. */
static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *priv = walk->priv;
	union crste crste, new;

	do {
		crste = READ_ONCE(*crstep);

		if (!crste.h.fc)
			return 0;
		if (!crste.s.fc1.y && crste.h.i)
			return 0;
		/* Leave prefix pages of running vCPUs untouched */
		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
			break;

		new = crste;
		new.h.i = 1;
		new.s.fc1.y = 0;
		new.s.fc1.prefix_notif = 0;
		/* Transfer dirtiness to the backing folio before invalidating */
		if (new.s.fc1.d || !new.h.p)
			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
		new.s.fc1.d = 0;
		new.h.p = 1;
	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));

	priv->young = 1;
	return 0;
}

/**
 * gmap_age_gfn() - Clear young.
 * @gmap: The guest gmap.
 * @start: The first gfn to test.
 * @end: The gfn after the last one to test.
 *
 * Context: Called with the kvm mmu write lock held.
 * Return: 1 if any page in the given range was young, otherwise 0.
 */
bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops ops = {
		.pte_entry = gmap_clear_young_pte,
		.pmd_entry = gmap_clear_young_crste,
		.pud_entry = gmap_clear_young_crste,
	};
	struct clear_young_pte_priv priv = {
		.gmap = gmap,
		.young = false,
	};

	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);

	return priv.young;
}

/* Private state for the unmap walk callbacks below. */
struct gmap_unmap_priv {
	struct gmap *gmap;
	struct kvm_memory_slot *slot;
};

/* Walk callback: unmap one PTE, honoring CMMA "unused" state and UV export. */
static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
{
	struct gmap_unmap_priv *priv = w->priv;
	struct folio *folio = NULL;
	unsigned long vmaddr;
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	/* Tell the host mm that unused guest pages do not need to be preserved */
	if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
		vmaddr = __gfn_to_hva_memslot(priv->slot, gfn);
		gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr);
	}
	if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
		folio = pfn_folio(ptep->h.pfra);
	pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn);
	pgste_set_unlock(ptep, pgste);
	/* Convert secure pages back only after the mapping is gone */
	if (folio)
		uv_convert_from_secure_folio(folio);

	return 0;
}

/* Walk callback: unmap one large (leaf) CRSTE. */
static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct gmap_unmap_priv *priv = walk->priv;
	struct folio *folio = NULL;
	union crste old = *crstep;

	if (!old.h.fc)
		return 0;

	if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
		folio = phys_to_folio(crste_origin_large(old));
	/* No races should happen because kvm->mmu_lock is held in write mode */
	KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
		   priv->gmap->kvm);
	if (folio)
		uv_convert_from_secure_folio(folio);

	return 0;
}

/**
 * gmap_unmap_gfn_range() - Unmap
 a range of guest addresses.
 * @gmap: The gmap to act on.
 * @slot: The memslot in which the range is located.
 * @start: The first gfn to unmap.
 * @end: The gfn after the last one to unmap.
 *
 * Context: Called with the kvm mmu write lock held.
 * Return: false
 */
bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops ops = {
		.pte_entry = _gmap_unmap_pte,
		.pmd_entry = _gmap_unmap_crste,
		.pud_entry = _gmap_unmap_crste,
	};
	struct gmap_unmap_priv priv = {
		.gmap = gmap,
		.slot = slot,
	};

	lockdep_assert_held_write(&gmap->kvm->mmu_lock);

	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
	return false;
}

/*
 * Test and clear the softdirty state of one PTE and report the page to
 * KVM's dirty log.  Must be called with the PGSTE lock held for @ptep;
 * returns the (possibly updated) PGSTE.
 */
static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
						  struct gmap *gmap)
{
	union pte pte = READ_ONCE(*ptep);

	/* Nothing to do if not present, or protected and not softdirty */
	if (!pte.s.pr || (pte.h.p && !pte.s.sd))
		return pgste;

	/*
	 * If this page contains one or more prefixes of vCPUS that are currently
	 * running, do not reset the protection, leave it marked as dirty.
	 */
	if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
		pte.h.p = 1;
		pte.s.sd = 0;
		pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn);
	}

	mark_page_dirty(gmap->kvm, gfn);

	return pgste;
}

/* Walk callback: clear softdirty on a single PTE. */
static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
					  struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap);
	pgste_set_unlock(ptep, pgste);
	return 0;
}

/* Walk callback: clear softdirty on a large (leaf) CRSTE. */
static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
					    struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union crste crste, new;

	/* Returning non-zero aborts the walk */
	if (fatal_signal_pending(current))
		return 1;
	do {
		crste = READ_ONCE(*table);
		if (!crste.h.fc)
			return 0;
		if (crste.h.p && !crste.s.fc1.sd)
			return 0;

		/*
		 * If this large page contains one or more prefixes of vCPUs that are
		 * currently running, do not reset the protection, leave it marked as
		 * dirty.
		 */
		if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
			break;
		new = crste;
		new.h.p = 1;
		new.s.fc1.sd = 0;
	} while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));

	/* Report every page frame covered by the large page as dirty */
	for ( ; gfn < end; gfn++)
		mark_page_dirty(gmap->kvm, gfn);

	return 0;
}

/* Harvest softdirty state for a range and feed it into KVM's dirty log. */
void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops walk_ops = {
		.pte_entry = _pte_test_and_clear_softdirty,
		.pmd_entry = _crste_test_and_clear_softdirty,
		.pud_entry = _crste_test_and_clear_softdirty,
	};

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	_dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
}

/*
 * Try to resolve a minor fault on a large (leaf) CRSTE.
 *
 * Return: 0 if the fault was resolved, 1 if the slow path is needed.
 */
static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
{
	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);

	/* Somehow the crste is not large anymore, let the slow path deal with it. */
	if (!oldcrste.h.fc)
		return 1;

	f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
	f->writable = oldcrste.s.fc1.w;

	/* Appropriate permissions already (race with another handler), nothing to do. */
	if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
		return 0;

	if (!f->write_attempt || oldcrste.s.fc1.w) {
		f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d;
		newcrste = oldcrste;
		newcrste.h.i = 0;
		newcrste.s.fc1.y = 1;
		if (f->write_attempt) {
			newcrste.h.p = 0;
			newcrste.s.fc1.d = 1;
			newcrste.s.fc1.sd = 1;
		}
		/* In case of races, let the slow path deal with it. */
		return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
	}
	/* Trying to write on a read-only page, let the slow path deal with it.
	 */
	return 1;
}

/*
 * Try to resolve a minor fault on a PTE.  Must be called with the PGSTE
 * lock held for @f->ptep; *@pgste is updated on success.
 *
 * Return: 0 if the fault was resolved, 1 if the slow path is needed.
 */
static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
					struct guest_fault *f)
{
	union pte newpte, oldpte = READ_ONCE(*f->ptep);

	f->pfn = oldpte.h.pfra;
	f->writable = oldpte.s.w;

	/* Appropriate permissions already (race with another handler), nothing to do. */
	if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
		return 0;
	/* Trying to write on a read-only page, let the slow path deal with it. */
	if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
		return 1;

	newpte = oldpte;
	newpte.h.i = 0;
	newpte.s.y = 1;
	if (f->write_attempt) {
		newpte.h.p = 0;
		newpte.s.d = 1;
		newpte.s.sd = 1;
	}
	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);

	return 0;
}

/**
 * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault.
 * @gmap: The gmap whose fault needs to be resolved.
 * @fault: Describes the fault that is being resolved.
 *
 * A minor fault is a fault that can be resolved quickly within gmap.
 * The page is already mapped, the fault is only due to dirty/young tracking.
 *
 * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
 * not be resolved and needs to go through the slow path.
 */
int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
{
	union pgste pgste;
	int rc;

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
			    &fault->crstep, &fault->ptep);
	/* If a PTE or a leaf CRSTE could not be reached, slow path.
	 */
	if (rc)
		return 1;

	if (fault->ptep) {
		pgste = pgste_get_lock(fault->ptep);
		rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
		/* Run the completion callback before dropping the PGSTE lock */
		if (!rc && fault->callback)
			fault->callback(fault);
		pgste_set_unlock(fault->ptep, pgste);
	} else {
		rc = gmap_handle_minor_crste_fault(gmap, fault);
		if (!rc && fault->callback)
			fault->callback(fault);
	}
	return rc;
}

/*
 * Check whether a 2G hugepage is allowed to back the faulting address.
 * 2G mappings are never used for now; this stub always returns false.
 */
static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f,
				   struct kvm_memory_slot *slot)
{
	return false;
}

/**
 * gmap_1m_allowed() - Check whether a 1M hugepage is allowed.
 * @gmap: The gmap of the guest.
 * @f: Describes the fault that is being resolved.
 * @slot: The memslot the faulting address belongs to.
 *
 * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_1M flag is set for
 * @gmap, whether the offset of the address in the 1M virtual frame is the
 * same as the offset in the physical 1M frame, and finally whether the whole
 * 1M page would fit in the given memslot.
 *
 * Return: true if a 1M hugepage is allowed to back the faulting address, false
 * otherwise.
 */
static inline bool gmap_1m_allowed(struct gmap *gmap, struct guest_fault *f,
				   struct kvm_memory_slot *slot)
{
	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags) &&
	       !((f->gfn ^ f->pfn) & ~_SEGMENT_FR_MASK) &&
	       slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_SEGMENT) &&
	       slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_SEGMENT);
}

/*
 * Map the faulted-in page into the gmap at the given level, either as a
 * PTE or as a large (leaf) CRSTE.
 *
 * Return: 0 on success, -EAGAIN if the entry changed concurrently and the
 * fault must be retried, -ENOMEM or -EINVAL otherwise.
 */
static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
		      struct guest_fault *f)
{
	union crste oldval, newval;
	union pte newpte, oldpte;
	union pgste pgste;
	int rc = 0;

	rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
			    &f->crstep, &f->ptep);
	if (rc == -ENOMEM)
		return rc;
	if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
		return rc;
	if (rc)
		return -EAGAIN;
	if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
		return -EINVAL;

	if (f->ptep) {
		pgste = pgste_get_lock(f->ptep);
		oldpte = *f->ptep;
		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
		newpte.s.sd = oldpte.s.sd;
		oldpte.s.sd = 0;
		/* Only install over an empty entry or over the same page frame */
		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
			pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
			if (f->callback)
				f->callback(f);
		} else {
			rc = -EAGAIN;
		}
		pgste_set_unlock(f->ptep, pgste);
	} else {
		do {
			oldval = READ_ONCE(*f->crstep);
			newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
					    f->write_attempt | oldval.s.fc1.d);
			newval.s.fc1.s = !f->page;
			newval.s.fc1.sd = oldval.s.fc1.sd;
			/* Only install over an empty entry or over the same large frame */
			if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
			    crste_origin_large(oldval) != crste_origin_large(newval))
				return -EAGAIN;
		} while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
		if (f->callback)
			f->callback(f);
	}

	return rc;
}

int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
	      struct guest_fault *f,
	      struct kvm_memory_slot *slot)
{
	unsigned int order;
	int level;

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	/* Use the largest mapping size permitted by the backing folio and slot */
	level = TABLE_TYPE_PAGE_TABLE;
	if (f->page) {
		order = folio_order(page_folio(f->page));
		if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f, slot))
			level = TABLE_TYPE_REGION3;
		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f, slot))
			level = TABLE_TYPE_SEGMENT;
	}
	return _gmap_link(mc, gmap, level, f);
}

/*
 * Map one segment of a ucontrol child gmap: share the page table that backs
 * @p_gfn in the parent with the child at @c_gfn.  With @force_alloc the
 * parent page table is allocated if missing; otherwise a placeholder
 * (invalid) segment entry is installed when no page table exists yet.
 *
 * Return: 0 on success, a negative error value otherwise.
 */
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
	union crste newcrste, oldcrste;
	struct page_table *pt;
	union crste *crstep;
	union pte *ptep;
	int rc;

	if (force_alloc)
		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
				    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	else
		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
				    TABLE_TYPE_SEGMENT, &crstep, &ptep);
	if (rc)
		return rc;
	if (!ptep) {
		/* No page table in the parent yet: install an invalid entry */
		newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
		newcrste.h.i = 1;
		newcrste.h.fc0.tl = 1;
	} else {
		/* Record the parent address in the shared page table */
		pt = pte_table_start(ptep);
		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
		newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
	}
	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
			    &crstep, &ptep);
	if (rc)
		return rc;
	do {
		oldcrste = READ_ONCE(*crstep);
		if (oldcrste.val == newcrste.val)
			break;
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
	return 0;
}

/*
 * Translate a child (per-vCPU) address into a parent gmap address using the
 * PTVAL_VMADDR cookie stored in the shared page table.
 *
 * Return: 0 on success (*gaddr updated), 1 if a ucas segment entry exists
 * but has no page table attached yet (caller must map it first), -EREMOTE
 * if the address cannot be translated.
 */
static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
{
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
			    TABLE_TYPE_SEGMENT, crstepp,
&ptep); 765 if (rc || (!ptep && !crste_is_ucas(**crstepp))) 766 return -EREMOTE; 767 if (!ptep) 768 return 1; 769 *gaddr &= ~_SEGMENT_MASK; 770 *gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT; 771 return 0; 772 } 773 774 /** 775 * gmap_ucas_translate() - Translate a vcpu address into a host gmap address 776 * @mc: The memory cache to be used for allocations. 777 * @gmap: The per-cpu gmap. 778 * @gaddr: Pointer to the address to be translated, will get overwritten with 779 * the translated address in case of success. 780 * Translates the per-vCPU guest address into a fake guest address, which can 781 * then be used with the fake memslots that are identity mapping userspace. 782 * This allows ucontrol VMs to use the normal fault resolution path, like 783 * normal VMs. 784 * 785 * Return: %0 in case of success, otherwise %-EREMOTE. 786 */ 787 int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr) 788 { 789 gpa_t translated_address; 790 union crste *crstep; 791 gfn_t gfn; 792 int rc; 793 794 gfn = gpa_to_gfn(*gaddr); 795 796 scoped_guard(read_lock, &gmap->kvm->mmu_lock) { 797 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); 798 if (rc <= 0) 799 return rc; 800 } 801 do { 802 scoped_guard(write_lock, &gmap->kvm->mmu_lock) { 803 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); 804 if (rc <= 0) 805 return rc; 806 translated_address = (*gaddr & ~_SEGMENT_MASK) | 807 (crstep->val & _SEGMENT_MASK); 808 rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true); 809 } 810 if (!rc) { 811 *gaddr = translated_address; 812 return 0; 813 } 814 if (rc != -ENOMEM) 815 return -EREMOTE; 816 rc = kvm_s390_mmu_cache_topup(mc); 817 if (rc) 818 return rc; 819 } while (1); 820 return 0; 821 } 822 823 int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count) 824 { 825 struct kvm_s390_mmu_cache *mc; 826 int rc; 827 828 mc = kvm_s390_new_mmu_cache(); 829 if (!mc) 830 return 
-ENOMEM; 831 832 while (count) { 833 scoped_guard(write_lock, &gmap->kvm->mmu_lock) 834 rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false); 835 if (rc == -ENOMEM) { 836 rc = kvm_s390_mmu_cache_topup(mc); 837 if (rc) 838 return rc; 839 continue; 840 } 841 if (rc) 842 return rc; 843 844 count--; 845 c_gfn += _PAGE_ENTRIES; 846 p_gfn += _PAGE_ENTRIES; 847 } 848 return rc; 849 } 850 851 static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn) 852 { 853 union crste *crstep; 854 union pte *ptep; 855 int rc; 856 857 rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); 858 if (rc) 859 return; 860 while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce)) 861 ; 862 } 863 864 void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) 865 { 866 guard(read_lock)(&gmap->kvm->mmu_lock); 867 868 for ( ; count; count--, c_gfn += _PAGE_ENTRIES) 869 gmap_ucas_unmap_one(gmap, c_gfn); 870 } 871 872 static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 873 { 874 struct gmap *gmap = walk->priv; 875 union crste crste, newcrste; 876 877 crste = READ_ONCE(*crstep); 878 newcrste = _CRSTE_EMPTY(crste.h.tt); 879 880 while (crste_leaf(crste)) { 881 if (crste_prefix(crste)) 882 gmap_unmap_prefix(gmap, gfn, next); 883 if (crste.s.fc1.vsie_notif) 884 gmap_handle_vsie_unshadow_event(gmap, gfn); 885 if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce)) 886 break; 887 crste = READ_ONCE(*crstep); 888 } 889 890 if (need_resched()) 891 return next; 892 893 return 0; 894 } 895 896 void gmap_split_huge_pages(struct gmap *gmap) 897 { 898 const struct dat_walk_ops ops = { 899 .pmd_entry = _gmap_split_crste, 900 .pud_entry = _gmap_split_crste, 901 }; 902 gfn_t start = 0; 903 904 do { 905 scoped_guard(read_lock, &gmap->kvm->mmu_lock) 906 start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce, 907 &ops, DAT_WALK_IGN_HOLES, gmap); 908 cond_resched(); 
	} while (start);
}

/*
 * Enable storage-key handling for the guest: disable COW sharing and reset
 * all storage keys in chunks.  Caller must hold the kvm->mm mmap write lock.
 */
static int _gmap_enable_skeys(struct gmap *gmap)
{
	gfn_t start = 0;
	int rc;

	if (uses_skeys(gmap))
		return 0;

	set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
	rc = gmap_helper_disable_cow_sharing();
	if (rc) {
		/* Roll back the flag if COW sharing could not be disabled */
		clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
		return rc;
	}

	do {
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			start = dat_reset_skeys(gmap->asce, start);
		cond_resched();
	} while (start);
	return 0;
}

/* Enable storage-key handling, taking the mmap write lock as needed. */
int gmap_enable_skeys(struct gmap *gmap)
{
	int rc;

	mmap_write_lock(gmap->kvm->mm);
	rc = _gmap_enable_skeys(gmap);
	mmap_write_unlock(gmap->kvm->mm);
	return rc;
}

/* Walk callback: securely destroy the page mapped by one present PTE. */
static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	if (!ptep->s.pr)
		return 0;
	__kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));
	/* Returning a non-zero gfn interrupts the walk to allow rescheduling */
	if (need_resched())
		return next;
	return 0;
}

/* Walk callback: securely destroy all pages backed by one large CRSTE. */
static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	phys_addr_t origin, cur, end;

	if (!crstep->h.fc || !crstep->s.fc1.pr)
		return 0;

	/* Clamp the physical range to the part covered by the walk */
	origin = crste_origin_large(*crstep);
	cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
	for ( ; cur < end; cur += PAGE_SIZE)
		__kvm_s390_pv_destroy_page(phys_to_page(cur));
	if (need_resched())
		return next;
	return 0;
}

/*
 * gmap_pv_destroy_range() - Securely destroy all guest pages in a range.
 * @gmap: The gmap to act on.
 * @start: The first gfn of the range.
 * @end: The gfn after the last one of the range.
 * @interruptible: If true, abort with -EINTR on a pending fatal signal.
 *
 * Return: 0 on success, -EINTR if interrupted by a fatal signal.
 */
int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
{
	const struct dat_walk_ops ops = {
		.pte_entry = _destroy_pages_pte,
		.pmd_entry = _destroy_pages_crste,
		.pud_entry = _destroy_pages_crste,
	};

	do {
		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
			start = _dat_walk_gfn_range(start, end, gmap->asce, &ops,
						    DAT_WALK_IGN_HOLES, NULL);
		if (interruptible &&
		    fatal_signal_pending(current))
			return -EINTR;
		cond_resched();
	} while (start && start < end);
	return 0;
}

/*
 * Insert an rmap entry for shadow frame @r_gfn of shadow gmap @sg, keyed by
 * the parent frame @p_gfn.  Duplicate entries are silently ignored.
 * Caller must hold sg->host_to_rmap_lock.
 *
 * Return: 0 on success, -ENOMEM if no memory is available.
 */
int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
{
	struct vsie_rmap *rmap __free(kvfree) = NULL;
	struct vsie_rmap *temp;
	void __rcu **slot;
	int rc = 0;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	lockdep_assert_held(&sg->host_to_rmap_lock);

	rmap = kzalloc_obj(*rmap, GFP_ATOMIC);
	if (!rmap)
		return -ENOMEM;

	rmap->r_gfn = r_gfn;
	rmap->level = level;
	slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
	if (slot) {
		/* Prepend to the existing chain, unless already present */
		rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
		for (temp = rmap->next; temp; temp = temp->next) {
			if (temp->val == rmap->val)
				return 0;
		}
		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
	} else {
		rmap->next = NULL;
		rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
		if (rc)
			return rc;
	}
	/* Ownership transferred to the tree; disarm the __free() cleanup */
	rmap = NULL;

	return 0;
}

/*
 * Write-protect a parent page backing a shadow DAT table and record an rmap
 * so the shadow can be invalidated when the parent mapping changes.
 *
 * Return: 0 on success, -EAGAIN if the PGSTE lock could not be taken,
 * another negative error value otherwise.
 */
int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
		      kvm_pfn_t pfn, int level, bool wr)
{
	union crste *crstep;
	union pgste pgste;
	union pte *ptep;
	union pte pte;
	int flags, rc;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	lockdep_assert_held(&sg->parent->children_lock);

	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
	rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags,
			    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	if (rc)
		return rc;
	if (level <= TABLE_TYPE_REGION1) {
		scoped_guard(spinlock, &sg->host_to_rmap_lock)
			rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level);
	}
	if (rc)
		return rc;

	if (!pgste_get_trylock(ptep, &pgste))
		return -EAGAIN;
	pte = ptep->s.pr ?
	      *ptep : _pte(pfn, wr, false, false);
	pte.h.p = 1;	/* write-protect the parent page */
	pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
	pgste.vsie_notif = 1;
	pgste_set_unlock(ptep, pgste);

	return 0;
}

/* Walk callback: mark the CMMA state of one page as dirty in its PGSTE. */
static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);
	/* Returning a non-zero gfn interrupts the walk to allow rescheduling */
	if (need_resched())
		return next;
	return 0;
}

/* Mark the CMMA state of every mapped guest page as dirty. */
void gmap_set_cmma_all_dirty(struct gmap *gmap)
{
	const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
	gfn_t gfn = 0;

	do {
		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
			gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
						  DAT_WALK_IGN_HOLES, NULL);
		cond_resched();
	} while (gfn);
}

/*
 * Remove the shadow DAT entry of the given level that maps @r_gfn, free any
 * lower-level tables it pointed to, and notify the VSIE code.
 */
static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
{
	unsigned long align = PAGE_SIZE;
	gpa_t gaddr = gfn_to_gpa(r_gfn);
	union crste *crstep;
	union crste crste;
	union pte *ptep;

	/* Compute the alignment of the address range covered by this level */
	if (level > TABLE_TYPE_PAGE_TABLE)
		align = 1UL << (11 * level + _SEGMENT_SHIFT);
	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align));
	sg->invalidated = true;
	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
		return;
	if (ptep) {
		if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
		return;
	}

	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
	if (crste_leaf(crste) || crste.h.i)
		return;
	/* Free the now-unreachable lower-level tables */
	if (is_pmd(crste))
		dat_free_pt(dereference_pmd(crste.pmd));
	else
		dat_free_level(dereference_crste(crste), true);
}

/* Tear down a whole shadow gmap and drop its reference. */
static void gmap_unshadow(struct gmap *sg)
{
	struct gmap_cache *gmap_cache, *next;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	KVM_BUG_ON(!sg->parent, sg->kvm);

lockdep_assert_held(&sg->parent->children_lock); 1119 1120 gmap_remove_child(sg); 1121 kvm_s390_vsie_gmap_notifier(sg, 0, -1UL); 1122 1123 list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) { 1124 gmap_cache->gmap = NULL; 1125 list_del(&gmap_cache->list); 1126 } 1127 1128 gmap_put(sg); 1129 } 1130 1131 void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) 1132 { 1133 struct vsie_rmap *rmap, *rnext, *head; 1134 struct gmap *sg, *next; 1135 gfn_t start, end; 1136 1137 list_for_each_entry_safe(sg, next, &parent->children, list) { 1138 start = sg->guest_asce.rsto; 1139 end = start + sg->guest_asce.tl + 1; 1140 if (!sg->guest_asce.r && gfn >= start && gfn < end) { 1141 gmap_unshadow(sg); 1142 continue; 1143 } 1144 scoped_guard(spinlock, &sg->host_to_rmap_lock) 1145 head = radix_tree_delete(&sg->host_to_rmap, gfn); 1146 gmap_for_each_rmap_safe(rmap, rnext, head) 1147 gmap_unshadow_level(sg, rmap->r_gfn, rmap->level); 1148 } 1149 } 1150 1151 /** 1152 * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables. 1153 * @parent: Pointer to the parent gmap. 1154 * @asce: ASCE for which the shadow table is created. 1155 * @edat_level: Edat level to be used for the shadow translation. 1156 * 1157 * Context: Called with parent->children_lock held. 1158 * 1159 * Return: The pointer to a gmap if a shadow table with the given asce is 1160 * already available, ERR_PTR(-EAGAIN) if another one is just being created, 1161 * otherwise NULL. 
1162 */ 1163 static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level) 1164 { 1165 struct gmap *sg; 1166 1167 lockdep_assert_held(&parent->children_lock); 1168 list_for_each_entry(sg, &parent->children, list) { 1169 if (!gmap_is_shadow_valid(sg, asce, edat_level)) 1170 continue; 1171 return sg; 1172 } 1173 return NULL; 1174 } 1175 1176 #define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE) 1177 struct gmap_protect_asce_top_level { 1178 unsigned long seq; 1179 struct guest_fault f[CRST_TABLE_PAGES]; 1180 }; 1181 1182 static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, 1183 struct gmap_protect_asce_top_level *context) 1184 { 1185 struct gmap *parent; 1186 int rc, i; 1187 1188 guard(write_lock)(&sg->kvm->mmu_lock); 1189 1190 if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) 1191 return -EAGAIN; 1192 1193 parent = READ_ONCE(sg->parent); 1194 if (!parent) 1195 return -EAGAIN; 1196 scoped_guard(spinlock, &parent->children_lock) { 1197 if (READ_ONCE(sg->parent) != parent) 1198 return -EAGAIN; 1199 sg->invalidated = false; 1200 for (i = 0; i < CRST_TABLE_PAGES; i++) { 1201 if (!context->f[i].valid) 1202 continue; 1203 rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn, 1204 TABLE_TYPE_REGION1 + 1, context->f[i].writable); 1205 if (rc) 1206 return rc; 1207 } 1208 gmap_add_child(sg->parent, sg); 1209 } 1210 1211 kvm_s390_release_faultin_array(sg->kvm, context->f, false); 1212 return 0; 1213 } 1214 1215 static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, 1216 struct gmap_protect_asce_top_level *context) 1217 { 1218 int rc; 1219 1220 if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f)) 1221 return -EAGAIN; 1222 do { 1223 rc = kvm_s390_mmu_cache_topup(mc); 1224 if (rc) 1225 return rc; 1226 rc = radix_tree_preload(GFP_KERNEL); 1227 if (rc) 1228 return rc; 1229 rc = __gmap_protect_asce_top_level(mc, 
sg, context); 1230 radix_tree_preload_end(); 1231 } while (rc == -ENOMEM); 1232 1233 return rc; 1234 } 1235 1236 static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg) 1237 { 1238 struct gmap_protect_asce_top_level context = {}; 1239 union asce asce = sg->guest_asce; 1240 int rc; 1241 1242 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 1243 1244 context.seq = sg->kvm->mmu_invalidate_seq; 1245 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ 1246 smp_rmb(); 1247 1248 rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false); 1249 if (rc > 0) 1250 rc = -EFAULT; 1251 if (!rc) 1252 rc = _gmap_protect_asce_top_level(mc, sg, &context); 1253 if (rc) 1254 kvm_s390_release_faultin_array(sg->kvm, context.f, true); 1255 return rc; 1256 } 1257 1258 /** 1259 * gmap_create_shadow() - Create/find a shadow guest address space. 1260 * @mc: The cache to use to allocate dat tables. 1261 * @parent: Pointer to the parent gmap. 1262 * @asce: ASCE for which the shadow table is created. 1263 * @edat_level: Edat level to be used for the shadow translation. 1264 * 1265 * The pages of the top level page table referred by the asce parameter 1266 * will be set to read-only and marked in the PGSTEs of the kvm process. 1267 * The shadow table will be removed automatically on any change to the 1268 * PTE mapping for the source table. 1269 * 1270 * The returned shadow gmap will be returned with one extra reference. 1271 * 1272 * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory, 1273 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the 1274 * parent gmap table could not be protected. 
1275 */ 1276 struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent, 1277 union asce asce, int edat_level) 1278 { 1279 struct gmap *sg, *new; 1280 int rc; 1281 1282 if (WARN_ON(!parent)) 1283 return ERR_PTR(-EINVAL); 1284 1285 scoped_guard(spinlock, &parent->children_lock) { 1286 sg = gmap_find_shadow(parent, asce, edat_level); 1287 if (sg) { 1288 gmap_get(sg); 1289 return sg; 1290 } 1291 } 1292 /* Create a new shadow gmap. */ 1293 new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce)); 1294 if (!new) 1295 return ERR_PTR(-ENOMEM); 1296 new->guest_asce = asce; 1297 new->edat_level = edat_level; 1298 set_bit(GMAP_FLAG_SHADOW, &new->flags); 1299 1300 scoped_guard(spinlock, &parent->children_lock) { 1301 /* Recheck if another CPU created the same shadow. */ 1302 sg = gmap_find_shadow(parent, asce, edat_level); 1303 if (sg) { 1304 gmap_put(new); 1305 gmap_get(sg); 1306 return sg; 1307 } 1308 if (asce.r) { 1309 /* Only allow one real-space gmap shadow. */ 1310 list_for_each_entry(sg, &parent->children, list) { 1311 if (sg->guest_asce.r) { 1312 scoped_guard(write_lock, &parent->kvm->mmu_lock) 1313 gmap_unshadow(sg); 1314 break; 1315 } 1316 } 1317 gmap_add_child(parent, new); 1318 /* Nothing to protect, return right away. */ 1319 gmap_get(new); 1320 return new; 1321 } 1322 } 1323 1324 gmap_get(new); 1325 new->parent = parent; 1326 /* Protect while inserting, protects against invalidation races. */ 1327 rc = gmap_protect_asce_top_level(mc, new); 1328 if (rc) { 1329 new->parent = NULL; 1330 gmap_put(new); 1331 gmap_put(new); 1332 return ERR_PTR(rc); 1333 } 1334 return new; 1335 } 1336