1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Guest memory management for KVM/s390 4 * 5 * Copyright IBM Corp. 2008, 2020, 2024 6 * 7 * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> 8 * Martin Schwidefsky <schwidefsky@de.ibm.com> 9 * David Hildenbrand <david@redhat.com> 10 * Janosch Frank <frankja@linux.ibm.com> 11 */ 12 13 #include <linux/compiler.h> 14 #include <linux/kvm.h> 15 #include <linux/kvm_host.h> 16 #include <linux/pgtable.h> 17 #include <linux/pagemap.h> 18 #include <asm/lowcore.h> 19 #include <asm/uv.h> 20 #include <asm/gmap_helpers.h> 21 22 #include "dat.h" 23 #include "gmap.h" 24 #include "kvm-s390.h" 25 #include "faultin.h" 26 27 static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu) 28 { 29 return vcpu->arch.sie_block->prog0c & PROG_IN_SIE; 30 } 31 32 static int gmap_limit_to_type(gfn_t limit) 33 { 34 if (!limit) 35 return TABLE_TYPE_REGION1; 36 if (limit <= _REGION3_SIZE >> PAGE_SHIFT) 37 return TABLE_TYPE_SEGMENT; 38 if (limit <= _REGION2_SIZE >> PAGE_SHIFT) 39 return TABLE_TYPE_REGION3; 40 if (limit <= _REGION1_SIZE >> PAGE_SHIFT) 41 return TABLE_TYPE_REGION2; 42 return TABLE_TYPE_REGION1; 43 } 44 45 /** 46 * gmap_new() - Allocate and initialize a guest address space. 47 * @kvm: The kvm owning the guest. 48 * @limit: Maximum address of the gmap address space. 49 * 50 * Return: A guest address space structure. 51 */ 52 struct gmap *gmap_new(struct kvm *kvm, gfn_t limit) 53 { 54 struct crst_table *table; 55 struct gmap *gmap; 56 int type; 57 58 type = gmap_limit_to_type(limit); 59 60 gmap = kzalloc(sizeof(*gmap), GFP_KERNEL_ACCOUNT); 61 if (!gmap) 62 return NULL; 63 INIT_LIST_HEAD(&gmap->children); 64 INIT_LIST_HEAD(&gmap->list); 65 INIT_LIST_HEAD(&gmap->scb_users); 66 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE); 67 spin_lock_init(&gmap->children_lock); 68 spin_lock_init(&gmap->host_to_rmap_lock); 69 refcount_set(&gmap->refcount, 1); 70 71 table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val); 72 if (!table) { 73 kfree(gmap); 74 return NULL; 75 } 76 77 gmap->asce.val = __pa(table); 78 gmap->asce.dt = type; 79 gmap->asce.tl = _ASCE_TABLE_LENGTH; 80 gmap->asce.x = 1; 81 gmap->asce.p = 1; 82 gmap->asce.s = 1; 83 gmap->kvm = kvm; 84 set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); 85 86 return gmap; 87 } 88 89 static void gmap_add_child(struct gmap *parent, struct gmap *child) 90 { 91 KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm); 92 KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm); 93 KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm); 94 lockdep_assert_held(&parent->children_lock); 95 96 child->parent = parent; 97 98 if (is_ucontrol(parent)) 99 set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); 100 else 101 clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); 102 103 if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags)) 104 set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); 105 else 106 clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); 107 108 if (kvm_is_ucontrol(parent->kvm)) 109 clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags); 110 list_add(&child->list, &parent->children); 111 } 112 113 struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit) 114 { 115 struct gmap *res; 116 117 lockdep_assert_not_held(&parent->children_lock); 118 res = gmap_new(parent->kvm, limit); 119 if (res) { 120 scoped_guard(spinlock, &parent->children_lock) 121 gmap_add_child(parent, res); 122 } 123 return res; 124 } 125 126 int gmap_set_limit(struct gmap *gmap, gfn_t limit) 127 { 128 struct kvm_s390_mmu_cache *mc; 129 int rc, type; 130 131 type = gmap_limit_to_type(limit); 132 133 mc = kvm_s390_new_mmu_cache(); 134 if (!mc) 135 return -ENOMEM; 136 137 do { 138 rc = kvm_s390_mmu_cache_topup(mc); 139 if (rc) 140 return rc; 141 scoped_guard(write_lock, &gmap->kvm->mmu_lock) 142 rc = dat_set_asce_limit(mc, &gmap->asce, type); 143 } while (rc == -ENOMEM); 144 145 kvm_s390_free_mmu_cache(mc); 146 return 0; 147 } 148 149 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) 150 { 151 struct vsie_rmap *rmap, *rnext, *head; 152 struct radix_tree_iter iter; 153 unsigned long indices[16]; 154 unsigned long index; 155 void __rcu **slot; 156 int i, nr; 157 158 /* A radix tree is freed by deleting all of its entries */ 159 index = 0; 160 do { 161 nr = 0; 162 radix_tree_for_each_slot(slot, root, &iter, index) { 163 indices[nr] = iter.index; 164 if (++nr == 16) 165 break; 166 } 167 for (i = 0; i < nr; i++) { 168 index = indices[i]; 169 head = radix_tree_delete(root, index); 170 gmap_for_each_rmap_safe(rmap, rnext, head) 171 kfree(rmap); 172 } 173 } while (nr > 0); 174 } 175 176 void gmap_remove_child(struct gmap *child) 177 { 178 if (KVM_BUG_ON(!child->parent, child->kvm)) 179 return; 180 lockdep_assert_held(&child->parent->children_lock); 181 182 list_del(&child->list); 183 child->parent = NULL; 184 } 185 186 /** 187 * gmap_dispose() - Remove and free a guest address space and its children. 188 * @gmap: Pointer to the guest address space structure. 189 */ 190 void gmap_dispose(struct gmap *gmap) 191 { 192 /* The gmap must have been removed from the parent beforehands */ 193 KVM_BUG_ON(gmap->parent, gmap->kvm); 194 /* All children of this gmap must have been removed beforehands */ 195 KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm); 196 /* No VSIE shadow block is allowed to use this gmap */ 197 KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm); 198 /* The ASCE must be valid */ 199 KVM_BUG_ON(!gmap->asce.val, gmap->kvm); 200 /* The refcount must be 0 */ 201 KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm); 202 203 /* Flush tlb of all gmaps */ 204 asce_flush_tlb(gmap->asce); 205 206 /* Free all DAT tables. */ 207 dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap)); 208 209 /* Free additional data for a shadow gmap */ 210 if (is_shadow(gmap)) 211 gmap_rmap_radix_tree_free(&gmap->host_to_rmap); 212 213 kfree(gmap); 214 } 215 216 /** 217 * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy. 218 * @gmap: The gmap whose ASCE needs to be replaced. 219 * 220 * If the ASCE is a SEGMENT type then this function will return -EINVAL, 221 * otherwise the pointers in the host_to_guest radix tree will keep pointing 222 * to the wrong pages, causing use-after-free and memory corruption. 223 * If the allocation of the new top level page table fails, the ASCE is not 224 * replaced. 225 * In any case, the old ASCE is always removed from the gmap CRST list. 226 * Therefore the caller has to make sure to save a pointer to it 227 * beforehand, unless a leak is actually intended. 228 * 229 * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE, 230 * -ENOMEM if runinng out of memory. 231 */ 232 int s390_replace_asce(struct gmap *gmap) 233 { 234 struct crst_table *table; 235 union asce asce; 236 237 /* Replacing segment type ASCEs would cause serious issues */ 238 if (gmap->asce.dt == ASCE_TYPE_SEGMENT) 239 return -EINVAL; 240 241 table = dat_alloc_crst_sleepable(0); 242 if (!table) 243 return -ENOMEM; 244 memcpy(table, dereference_asce(gmap->asce), sizeof(*table)); 245 246 /* Set new table origin while preserving existing ASCE control bits */ 247 asce = gmap->asce; 248 asce.rsto = virt_to_pfn(table); 249 WRITE_ONCE(gmap->asce, asce); 250 251 return 0; 252 } 253 254 bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint) 255 { 256 struct kvm *kvm = gmap->kvm; 257 struct kvm_vcpu *vcpu; 258 gfn_t prefix_gfn; 259 unsigned long i; 260 261 if (is_shadow(gmap)) 262 return false; 263 kvm_for_each_vcpu(i, vcpu, kvm) { 264 /* Match against both prefix pages */ 265 prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu)); 266 if (prefix_gfn < end && gfn <= prefix_gfn + 1) { 267 if (hint && kvm_s390_is_in_sie(vcpu)) 268 return false; 269 VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx", 270 gfn_to_gpa(gfn), gfn_to_gpa(end)); 271 kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); 272 } 273 } 274 return true; 275 } 276 277 struct clear_young_pte_priv { 278 struct gmap *gmap; 279 bool young; 280 }; 281 282 static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk) 283 { 284 struct clear_young_pte_priv *p = walk->priv; 285 union pgste pgste; 286 union pte pte, new; 287 288 pte = READ_ONCE(*ptep); 289 290 if (!pte.s.pr || (!pte.s.y && pte.h.i)) 291 return 0; 292 293 pgste = pgste_get_lock(ptep); 294 if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) { 295 new = pte; 296 new.h.i = 1; 297 new.s.y = 0; 298 if ((new.s.d || !new.h.p) && !new.s.s) 299 folio_set_dirty(pfn_folio(pte.h.pfra)); 300 new.s.d = 0; 301 new.h.p = 1; 302 303 pgste.prefix_notif = 0; 304 pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap)); 305 } 306 p->young = 1; 307 pgste_set_unlock(ptep, pgste); 308 return 0; 309 } 310 311 static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk) 312 { 313 struct clear_young_pte_priv *priv = walk->priv; 314 union crste crste, new; 315 316 crste = READ_ONCE(*crstep); 317 318 if (!crste.h.fc) 319 return 0; 320 if (!crste.s.fc1.y && crste.h.i) 321 return 0; 322 if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) { 323 new = crste; 324 new.h.i = 1; 325 new.s.fc1.y = 0; 326 new.s.fc1.prefix_notif = 0; 327 if (new.s.fc1.d || !new.h.p) 328 folio_set_dirty(phys_to_folio(crste_origin_large(crste))); 329 new.s.fc1.d = 0; 330 new.h.p = 1; 331 dat_crstep_xchg(crstep, new, gfn, walk->asce); 332 } 333 priv->young = 1; 334 return 0; 335 } 336 337 /** 338 * gmap_age_gfn() - Clear young. 339 * @gmap: The guest gmap. 340 * @start: The first gfn to test. 341 * @end: The gfn after the last one to test. 342 * 343 * Context: Called with the kvm mmu write lock held. 344 * Return: 1 if any page in the given range was young, otherwise 0. 345 */ 346 bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end) 347 { 348 const struct dat_walk_ops ops = { 349 .pte_entry = gmap_clear_young_pte, 350 .pmd_entry = gmap_clear_young_crste, 351 .pud_entry = gmap_clear_young_crste, 352 }; 353 struct clear_young_pte_priv priv = { 354 .gmap = gmap, 355 .young = false, 356 }; 357 358 _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); 359 360 return priv.young; 361 } 362 363 struct gmap_unmap_priv { 364 struct gmap *gmap; 365 struct kvm_memory_slot *slot; 366 }; 367 368 static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w) 369 { 370 struct gmap_unmap_priv *priv = w->priv; 371 struct folio *folio = NULL; 372 unsigned long vmaddr; 373 union pgste pgste; 374 375 pgste = pgste_get_lock(ptep); 376 if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) { 377 vmaddr = __gfn_to_hva_memslot(priv->slot, gfn); 378 gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr); 379 } 380 if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) 381 folio = pfn_folio(ptep->h.pfra); 382 pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn); 383 pgste_set_unlock(ptep, pgste); 384 if (folio) 385 uv_convert_from_secure_folio(folio); 386 387 return 0; 388 } 389 390 static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 391 { 392 struct gmap_unmap_priv *priv = walk->priv; 393 struct folio *folio = NULL; 394 395 if (crstep->h.fc) { 396 if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) 397 folio = phys_to_folio(crste_origin_large(*crstep)); 398 gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn); 399 if (folio) 400 uv_convert_from_secure_folio(folio); 401 } 402 403 return 0; 404 } 405 406 /** 407 * gmap_unmap_gfn_range() - Unmap a range of guest addresses. 408 * @gmap: The gmap to act on. 409 * @slot: The memslot in which the range is located. 410 * @start: The first gfn to unmap. 411 * @end: The gfn after the last one to unmap. 412 * 413 * Context: Called with the kvm mmu write lock held. 414 * Return: false 415 */ 416 bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end) 417 { 418 const struct dat_walk_ops ops = { 419 .pte_entry = _gmap_unmap_pte, 420 .pmd_entry = _gmap_unmap_crste, 421 .pud_entry = _gmap_unmap_crste, 422 }; 423 struct gmap_unmap_priv priv = { 424 .gmap = gmap, 425 .slot = slot, 426 }; 427 428 lockdep_assert_held_write(&gmap->kvm->mmu_lock); 429 430 _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); 431 return false; 432 } 433 434 static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn, 435 struct gmap *gmap) 436 { 437 union pte pte = READ_ONCE(*ptep); 438 439 if (!pte.s.pr || (pte.h.p && !pte.s.sd)) 440 return pgste; 441 442 /* 443 * If this page contains one or more prefixes of vCPUS that are currently 444 * running, do not reset the protection, leave it marked as dirty. 445 */ 446 if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) { 447 pte.h.p = 1; 448 pte.s.sd = 0; 449 pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn); 450 } 451 452 mark_page_dirty(gmap->kvm, gfn); 453 454 return pgste; 455 } 456 457 static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end, 458 struct dat_walk *walk) 459 { 460 struct gmap *gmap = walk->priv; 461 union pgste pgste; 462 463 pgste = pgste_get_lock(ptep); 464 pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap); 465 pgste_set_unlock(ptep, pgste); 466 return 0; 467 } 468 469 static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end, 470 struct dat_walk *walk) 471 { 472 struct gmap *gmap = walk->priv; 473 union crste crste, new; 474 475 if (fatal_signal_pending(current)) 476 return 1; 477 crste = READ_ONCE(*table); 478 if (!crste.h.fc) 479 return 0; 480 if (crste.h.p && !crste.s.fc1.sd) 481 return 0; 482 483 /* 484 * If this large page contains one or more prefixes of vCPUs that are 485 * currently running, do not reset the protection, leave it marked as 486 * dirty. 487 */ 488 if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) { 489 new = crste; 490 new.h.p = 1; 491 new.s.fc1.sd = 0; 492 gmap_crstep_xchg(gmap, table, new, gfn); 493 } 494 495 for ( ; gfn < end; gfn++) 496 mark_page_dirty(gmap->kvm, gfn); 497 498 return 0; 499 } 500 501 void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end) 502 { 503 const struct dat_walk_ops walk_ops = { 504 .pte_entry = _pte_test_and_clear_softdirty, 505 .pmd_entry = _crste_test_and_clear_softdirty, 506 .pud_entry = _crste_test_and_clear_softdirty, 507 }; 508 509 lockdep_assert_held(&gmap->kvm->mmu_lock); 510 511 _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap); 512 } 513 514 static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f) 515 { 516 union crste newcrste, oldcrste = READ_ONCE(*f->crstep); 517 518 /* Somehow the crste is not large anymore, let the slow path deal with it. */ 519 if (!oldcrste.h.fc) 520 return 1; 521 522 f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn)); 523 f->writable = oldcrste.s.fc1.w; 524 525 /* Appropriate permissions already (race with another handler), nothing to do. */ 526 if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p)) 527 return 0; 528 529 if (!f->write_attempt || oldcrste.s.fc1.w) { 530 f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d; 531 newcrste = oldcrste; 532 newcrste.h.i = 0; 533 newcrste.s.fc1.y = 1; 534 if (f->write_attempt) { 535 newcrste.h.p = 0; 536 newcrste.s.fc1.d = 1; 537 newcrste.s.fc1.sd = 1; 538 } 539 if (!oldcrste.s.fc1.d && newcrste.s.fc1.d) 540 SetPageDirty(phys_to_page(crste_origin_large(newcrste))); 541 /* In case of races, let the slow path deal with it. */ 542 return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce); 543 } 544 /* Trying to write on a read-only page, let the slow path deal with it. */ 545 return 1; 546 } 547 548 static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste, 549 struct guest_fault *f) 550 { 551 union pte newpte, oldpte = READ_ONCE(*f->ptep); 552 553 f->pfn = oldpte.h.pfra; 554 f->writable = oldpte.s.w; 555 556 /* Appropriate permissions already (race with another handler), nothing to do. */ 557 if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p)) 558 return 0; 559 /* Trying to write on a read-only page, let the slow path deal with it. */ 560 if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w)) 561 return 1; 562 563 newpte = oldpte; 564 newpte.h.i = 0; 565 newpte.s.y = 1; 566 if (f->write_attempt) { 567 newpte.h.p = 0; 568 newpte.s.d = 1; 569 newpte.s.sd = 1; 570 } 571 if (!oldpte.s.d && newpte.s.d) 572 SetPageDirty(pfn_to_page(newpte.h.pfra)); 573 *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn); 574 575 return 0; 576 } 577 578 /** 579 * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault. 580 * @gmap: The gmap whose fault needs to be resolved. 581 * @fault: Describes the fault that is being resolved. 582 * 583 * A minor fault is a fault that can be resolved quickly within gmap. 584 * The page is already mapped, the fault is only due to dirty/young tracking. 585 * 586 * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could 587 * not be resolved and needs to go through the slow path. 588 */ 589 int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault) 590 { 591 union pgste pgste; 592 int rc; 593 594 lockdep_assert_held(&gmap->kvm->mmu_lock); 595 596 rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE, 597 &fault->crstep, &fault->ptep); 598 /* If a PTE or a leaf CRSTE could not be reached, slow path. */ 599 if (rc) 600 return 1; 601 602 if (fault->ptep) { 603 pgste = pgste_get_lock(fault->ptep); 604 rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault); 605 if (!rc && fault->callback) 606 fault->callback(fault); 607 pgste_set_unlock(fault->ptep, pgste); 608 } else { 609 rc = gmap_handle_minor_crste_fault(gmap->asce, fault); 610 if (!rc && fault->callback) 611 fault->callback(fault); 612 } 613 return rc; 614 } 615 616 static inline bool gmap_2g_allowed(struct gmap *gmap, gfn_t gfn) 617 { 618 return false; 619 } 620 621 static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn) 622 { 623 return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags); 624 } 625 626 int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f) 627 { 628 unsigned int order; 629 int rc, level; 630 631 lockdep_assert_held(&gmap->kvm->mmu_lock); 632 633 level = TABLE_TYPE_PAGE_TABLE; 634 if (f->page) { 635 order = folio_order(page_folio(f->page)); 636 if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f->gfn)) 637 level = TABLE_TYPE_REGION3; 638 else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn)) 639 level = TABLE_TYPE_SEGMENT; 640 } 641 rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f); 642 KVM_BUG_ON(rc == -EINVAL, gmap->kvm); 643 return rc; 644 } 645 646 static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, 647 gfn_t p_gfn, gfn_t c_gfn, bool force_alloc) 648 { 649 struct page_table *pt; 650 union crste newcrste; 651 union crste *crstep; 652 union pte *ptep; 653 int rc; 654 655 if (force_alloc) 656 rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC, 657 TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); 658 else 659 rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE, 660 TABLE_TYPE_SEGMENT, &crstep, &ptep); 661 if (rc) 662 return rc; 663 if (!ptep) { 664 newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT); 665 newcrste.h.i = 1; 666 newcrste.h.fc0.tl = 1; 667 } else { 668 pt = pte_table_start(ptep); 669 dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT)); 670 newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT); 671 } 672 rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT, 673 &crstep, &ptep); 674 if (rc) 675 return rc; 676 dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce); 677 return 0; 678 } 679 680 static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp) 681 { 682 union pte *ptep; 683 int rc; 684 685 rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE, 686 TABLE_TYPE_SEGMENT, crstepp, &ptep); 687 if (rc || (!ptep && !crste_is_ucas(**crstepp))) 688 return -EREMOTE; 689 if (!ptep) 690 return 1; 691 *gaddr &= ~_SEGMENT_MASK; 692 *gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT; 693 return 0; 694 } 695 696 /** 697 * gmap_ucas_translate() - Translate a vcpu address into a host gmap address 698 * @mc: The memory cache to be used for allocations. 699 * @gmap: The per-cpu gmap. 700 * @gaddr: Pointer to the address to be translated, will get overwritten with 701 * the translated address in case of success. 702 * Translates the per-vCPU guest address into a fake guest address, which can 703 * then be used with the fake memslots that are identity mapping userspace. 704 * This allows ucontrol VMs to use the normal fault resolution path, like 705 * normal VMs. 706 * 707 * Return: %0 in case of success, otherwise %-EREMOTE. 708 */ 709 int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr) 710 { 711 gpa_t translated_address; 712 union crste *crstep; 713 gfn_t gfn; 714 int rc; 715 716 gfn = gpa_to_gfn(*gaddr); 717 718 scoped_guard(read_lock, &gmap->kvm->mmu_lock) { 719 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); 720 if (rc <= 0) 721 return rc; 722 } 723 do { 724 scoped_guard(write_lock, &gmap->kvm->mmu_lock) { 725 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); 726 if (rc <= 0) 727 return rc; 728 translated_address = (*gaddr & ~_SEGMENT_MASK) | 729 (crstep->val & _SEGMENT_MASK); 730 rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true); 731 } 732 if (!rc) { 733 *gaddr = translated_address; 734 return 0; 735 } 736 if (rc != -ENOMEM) 737 return -EREMOTE; 738 rc = kvm_s390_mmu_cache_topup(mc); 739 if (rc) 740 return rc; 741 } while (1); 742 return 0; 743 } 744 745 int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count) 746 { 747 struct kvm_s390_mmu_cache *mc; 748 int rc; 749 750 mc = kvm_s390_new_mmu_cache(); 751 if (!mc) 752 return -ENOMEM; 753 754 while (count) { 755 scoped_guard(write_lock, &gmap->kvm->mmu_lock) 756 rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false); 757 if (rc == -ENOMEM) { 758 rc = kvm_s390_mmu_cache_topup(mc); 759 if (rc) 760 return rc; 761 continue; 762 } 763 if (rc) 764 return rc; 765 766 count--; 767 c_gfn += _PAGE_ENTRIES; 768 p_gfn += _PAGE_ENTRIES; 769 } 770 return rc; 771 } 772 773 static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn) 774 { 775 union crste *crstep; 776 union pte *ptep; 777 int rc; 778 779 rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); 780 if (!rc) 781 dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce); 782 } 783 784 void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) 785 { 786 guard(read_lock)(&gmap->kvm->mmu_lock); 787 788 for ( ; count; count--, c_gfn += _PAGE_ENTRIES) 789 gmap_ucas_unmap_one(gmap, c_gfn); 790 } 791 792 static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 793 { 794 struct gmap *gmap = walk->priv; 795 union crste crste, newcrste; 796 797 crste = READ_ONCE(*crstep); 798 newcrste = _CRSTE_EMPTY(crste.h.tt); 799 800 while (crste_leaf(crste)) { 801 if (crste_prefix(crste)) 802 gmap_unmap_prefix(gmap, gfn, next); 803 if (crste.s.fc1.vsie_notif) 804 gmap_handle_vsie_unshadow_event(gmap, gfn); 805 if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce)) 806 break; 807 crste = READ_ONCE(*crstep); 808 } 809 810 if (need_resched()) 811 return next; 812 813 return 0; 814 } 815 816 void gmap_split_huge_pages(struct gmap *gmap) 817 { 818 const struct dat_walk_ops ops = { 819 .pmd_entry = _gmap_split_crste, 820 .pud_entry = _gmap_split_crste, 821 }; 822 gfn_t start = 0; 823 824 do { 825 scoped_guard(read_lock, &gmap->kvm->mmu_lock) 826 start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce, 827 &ops, DAT_WALK_IGN_HOLES, gmap); 828 cond_resched(); 829 } while (start); 830 } 831 832 static int _gmap_enable_skeys(struct gmap *gmap) 833 { 834 gfn_t start = 0; 835 int rc; 836 837 if (uses_skeys(gmap)) 838 return 0; 839 840 set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); 841 rc = gmap_helper_disable_cow_sharing(); 842 if (rc) { 843 clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); 844 return rc; 845 } 846 847 do { 848 scoped_guard(write_lock, &gmap->kvm->mmu_lock) 849 start = dat_reset_skeys(gmap->asce, start); 850 cond_resched(); 851 } while (start); 852 return 0; 853 } 854 855 int gmap_enable_skeys(struct gmap *gmap) 856 { 857 int rc; 858 859 mmap_write_lock(gmap->kvm->mm); 860 rc = _gmap_enable_skeys(gmap); 861 mmap_write_unlock(gmap->kvm->mm); 862 return rc; 863 } 864 865 static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 866 { 867 if (!ptep->s.pr) 868 return 0; 869 __kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep))); 870 if (need_resched()) 871 return next; 872 return 0; 873 } 874 875 static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 876 { 877 phys_addr_t origin, cur, end; 878 879 if (!crstep->h.fc || !crstep->s.fc1.pr) 880 return 0; 881 882 origin = crste_origin_large(*crstep); 883 cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; 884 end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin; 885 for ( ; cur < end; cur += PAGE_SIZE) 886 __kvm_s390_pv_destroy_page(phys_to_page(cur)); 887 if (need_resched()) 888 return next; 889 return 0; 890 } 891 892 int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible) 893 { 894 const struct dat_walk_ops ops = { 895 .pte_entry = _destroy_pages_pte, 896 .pmd_entry = _destroy_pages_crste, 897 .pud_entry = _destroy_pages_crste, 898 }; 899 900 do { 901 scoped_guard(read_lock, &gmap->kvm->mmu_lock) 902 start = _dat_walk_gfn_range(start, end, gmap->asce, &ops, 903 DAT_WALK_IGN_HOLES, NULL); 904 if (interruptible && fatal_signal_pending(current)) 905 return -EINTR; 906 cond_resched(); 907 } while (start && start < end); 908 return 0; 909 } 910 911 int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level) 912 { 913 struct vsie_rmap *rmap __free(kvfree) = NULL; 914 struct vsie_rmap *temp; 915 void __rcu **slot; 916 int rc = 0; 917 918 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 919 lockdep_assert_held(&sg->host_to_rmap_lock); 920 921 rmap = kzalloc(sizeof(*rmap), GFP_ATOMIC); 922 if (!rmap) 923 return -ENOMEM; 924 925 rmap->r_gfn = r_gfn; 926 rmap->level = level; 927 slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn); 928 if (slot) { 929 rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock); 930 for (temp = rmap->next; temp; temp = temp->next) { 931 if (temp->val == rmap->val) 932 return 0; 933 } 934 radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); 935 } else { 936 rmap->next = NULL; 937 rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap); 938 if (rc) 939 return rc; 940 } 941 rmap = NULL; 942 943 return 0; 944 } 945 946 int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, 947 kvm_pfn_t pfn, int level, bool wr) 948 { 949 union crste *crstep; 950 union pgste pgste; 951 union pte *ptep; 952 union pte pte; 953 int flags, rc; 954 955 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 956 lockdep_assert_held(&sg->parent->children_lock); 957 958 flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0); 959 rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags, 960 TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); 961 if (rc) 962 return rc; 963 if (level <= TABLE_TYPE_REGION1) { 964 scoped_guard(spinlock, &sg->host_to_rmap_lock) 965 rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level); 966 } 967 if (rc) 968 return rc; 969 970 if (!pgste_get_trylock(ptep, &pgste)) 971 return -EAGAIN; 972 pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false); 973 pte.h.p = 1; 974 pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false); 975 pgste.vsie_notif = 1; 976 pgste_set_unlock(ptep, pgste); 977 978 return 0; 979 } 980 981 static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 982 { 983 __atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val); 984 if (need_resched()) 985 return next; 986 return 0; 987 } 988 989 void gmap_set_cmma_all_dirty(struct gmap *gmap) 990 { 991 const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, }; 992 gfn_t gfn = 0; 993 994 do { 995 scoped_guard(read_lock, &gmap->kvm->mmu_lock) 996 gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops, 997 DAT_WALK_IGN_HOLES, NULL); 998 cond_resched(); 999 } while (gfn); 1000 } 1001 1002 static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level) 1003 { 1004 unsigned long align = PAGE_SIZE; 1005 gpa_t gaddr = gfn_to_gpa(r_gfn); 1006 union crste *crstep; 1007 union crste crste; 1008 union pte *ptep; 1009 1010 if (level > TABLE_TYPE_PAGE_TABLE) 1011 align = 1UL << (11 * level + _SEGMENT_SHIFT); 1012 kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align)); 1013 if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep)) 1014 return; 1015 if (ptep) { 1016 if (READ_ONCE(*ptep).val != _PTE_EMPTY.val) 1017 dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg)); 1018 return; 1019 } 1020 crste = READ_ONCE(*crstep); 1021 dat_crstep_clear(crstep, r_gfn, sg->asce); 1022 if (crste_leaf(crste) || crste.h.i) 1023 return; 1024 if (is_pmd(crste)) 1025 dat_free_pt(dereference_pmd(crste.pmd)); 1026 else 1027 dat_free_level(dereference_crste(crste), true); 1028 } 1029 1030 static void gmap_unshadow(struct gmap *sg) 1031 { 1032 struct gmap_cache *gmap_cache, *next; 1033 1034 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 1035 KVM_BUG_ON(!sg->parent, sg->kvm); 1036 1037 lockdep_assert_held(&sg->parent->children_lock); 1038 1039 gmap_remove_child(sg); 1040 kvm_s390_vsie_gmap_notifier(sg, 0, -1UL); 1041 1042 list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) { 1043 gmap_cache->gmap = NULL; 1044 list_del(&gmap_cache->list); 1045 } 1046 1047 gmap_put(sg); 1048 } 1049 1050 void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) 1051 { 1052 struct vsie_rmap *rmap, *rnext, *head; 1053 struct gmap *sg, *next; 1054 gfn_t start, end; 1055 1056 list_for_each_entry_safe(sg, next, &parent->children, list) { 1057 start = sg->guest_asce.rsto; 1058 end = start + sg->guest_asce.tl + 1; 1059 if (!sg->guest_asce.r && gfn >= start && gfn < end) { 1060 gmap_unshadow(sg); 1061 continue; 1062 } 1063 scoped_guard(spinlock, &sg->host_to_rmap_lock) 1064 head = radix_tree_delete(&sg->host_to_rmap, gfn); 1065 gmap_for_each_rmap_safe(rmap, rnext, head) 1066 gmap_unshadow_level(sg, rmap->r_gfn, rmap->level); 1067 } 1068 } 1069 1070 /** 1071 * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables. 1072 * @parent: Pointer to the parent gmap. 1073 * @asce: ASCE for which the shadow table is created. 1074 * @edat_level: Edat level to be used for the shadow translation. 1075 * 1076 * Context: Called with parent->children_lock held. 1077 * 1078 * Return: The pointer to a gmap if a shadow table with the given asce is 1079 * already available, ERR_PTR(-EAGAIN) if another one is just being created, 1080 * otherwise NULL. 1081 */ 1082 static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level) 1083 { 1084 struct gmap *sg; 1085 1086 lockdep_assert_held(&parent->children_lock); 1087 list_for_each_entry(sg, &parent->children, list) { 1088 if (!gmap_is_shadow_valid(sg, asce, edat_level)) 1089 continue; 1090 return sg; 1091 } 1092 return NULL; 1093 } 1094 1095 #define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE) 1096 struct gmap_protect_asce_top_level { 1097 unsigned long seq; 1098 struct guest_fault f[CRST_TABLE_PAGES]; 1099 }; 1100 1101 static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, 1102 struct gmap_protect_asce_top_level *context) 1103 { 1104 int rc, i; 1105 1106 guard(write_lock)(&sg->kvm->mmu_lock); 1107 1108 if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) 1109 return -EAGAIN; 1110 1111 scoped_guard(spinlock, &sg->parent->children_lock) { 1112 for (i = 0; i < CRST_TABLE_PAGES; i++) { 1113 if (!context->f[i].valid) 1114 continue; 1115 rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn, 1116 TABLE_TYPE_REGION1 + 1, context->f[i].writable); 1117 if (rc) 1118 return rc; 1119 } 1120 gmap_add_child(sg->parent, sg); 1121 } 1122 1123 kvm_s390_release_faultin_array(sg->kvm, context->f, false); 1124 return 0; 1125 } 1126 1127 static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, 1128 struct gmap_protect_asce_top_level *context) 1129 { 1130 int rc; 1131 1132 if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f)) 1133 return -EAGAIN; 1134 do { 1135 rc = kvm_s390_mmu_cache_topup(mc); 1136 if (rc) 1137 return rc; 1138 rc = radix_tree_preload(GFP_KERNEL); 1139 if (rc) 1140 return rc; 1141 rc = __gmap_protect_asce_top_level(mc, sg, context); 1142 radix_tree_preload_end(); 1143 } while (rc == -ENOMEM); 1144 1145 return rc; 1146 } 1147 1148 static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg) 1149 { 1150 struct gmap_protect_asce_top_level context = {}; 1151 union asce asce = sg->guest_asce; 1152 int rc; 1153 1154 KVM_BUG_ON(!is_shadow(sg), sg->kvm); 1155 1156 context.seq = sg->kvm->mmu_invalidate_seq; 1157 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ 1158 smp_rmb(); 1159 1160 rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false); 1161 if (rc > 0) 1162 rc = -EFAULT; 1163 if (!rc) 1164 rc = _gmap_protect_asce_top_level(mc, sg, &context); 1165 if (rc) 1166 kvm_s390_release_faultin_array(sg->kvm, context.f, true); 1167 return rc; 1168 } 1169 1170 /** 1171 * gmap_create_shadow() - Create/find a shadow guest address space. 1172 * @mc: The cache to use to allocate dat tables. 1173 * @parent: Pointer to the parent gmap. 1174 * @asce: ASCE for which the shadow table is created. 1175 * @edat_level: Edat level to be used for the shadow translation. 1176 * 1177 * The pages of the top level page table referred by the asce parameter 1178 * will be set to read-only and marked in the PGSTEs of the kvm process. 1179 * The shadow table will be removed automatically on any change to the 1180 * PTE mapping for the source table. 1181 * 1182 * The returned shadow gmap will be returned with one extra reference. 1183 * 1184 * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory, 1185 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the 1186 * parent gmap table could not be protected. 1187 */ 1188 struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent, 1189 union asce asce, int edat_level) 1190 { 1191 struct gmap *sg, *new; 1192 int rc; 1193 1194 scoped_guard(spinlock, &parent->children_lock) { 1195 sg = gmap_find_shadow(parent, asce, edat_level); 1196 if (sg) { 1197 gmap_get(sg); 1198 return sg; 1199 } 1200 } 1201 /* Create a new shadow gmap. */ 1202 new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce)); 1203 if (!new) 1204 return ERR_PTR(-ENOMEM); 1205 new->guest_asce = asce; 1206 new->edat_level = edat_level; 1207 set_bit(GMAP_FLAG_SHADOW, &new->flags); 1208 1209 scoped_guard(spinlock, &parent->children_lock) { 1210 /* Recheck if another CPU created the same shadow. */ 1211 sg = gmap_find_shadow(parent, asce, edat_level); 1212 if (sg) { 1213 gmap_put(new); 1214 gmap_get(sg); 1215 return sg; 1216 } 1217 if (asce.r) { 1218 /* Only allow one real-space gmap shadow. */ 1219 list_for_each_entry(sg, &parent->children, list) { 1220 if (sg->guest_asce.r) { 1221 scoped_guard(write_lock, &parent->kvm->mmu_lock) 1222 gmap_unshadow(sg); 1223 break; 1224 } 1225 } 1226 gmap_add_child(parent, new); 1227 /* Nothing to protect, return right away. */ 1228 gmap_get(new); 1229 return new; 1230 } 1231 } 1232 1233 gmap_get(new); 1234 new->parent = parent; 1235 /* Protect while inserting, protects against invalidation races. */ 1236 rc = gmap_protect_asce_top_level(mc, new); 1237 if (rc) { 1238 new->parent = NULL; 1239 gmap_put(new); 1240 gmap_put(new); 1241 return ERR_PTR(rc); 1242 } 1243 return new; 1244 } 1245