xref: /linux/arch/s390/kvm/gmap.c (revision 4dffb0a5d1c29163cd4ab8f1a259a7278c94716a)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Guest memory management for KVM/s390
4  *
5  * Copyright IBM Corp. 2008, 2020, 2024
6  *
7  *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
8  *               Martin Schwidefsky <schwidefsky@de.ibm.com>
9  *               David Hildenbrand <david@redhat.com>
10  *               Janosch Frank <frankja@linux.ibm.com>
11  */
12 
13 #include <linux/compiler.h>
14 #include <linux/kvm.h>
15 #include <linux/kvm_host.h>
16 #include <linux/pgtable.h>
17 #include <linux/pagemap.h>
18 #include <asm/lowcore.h>
19 #include <asm/uv.h>
20 #include <asm/gmap_helpers.h>
21 
22 #include "dat.h"
23 #include "gmap.h"
24 #include "kvm-s390.h"
25 #include "faultin.h"
26 
27 static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
28 {
29 	return vcpu->arch.sie_block->prog0c & PROG_IN_SIE;
30 }
31 
32 static int gmap_limit_to_type(gfn_t limit)
33 {
34 	if (!limit)
35 		return TABLE_TYPE_REGION1;
36 	if (limit <= _REGION3_SIZE >> PAGE_SHIFT)
37 		return TABLE_TYPE_SEGMENT;
38 	if (limit <= _REGION2_SIZE >> PAGE_SHIFT)
39 		return TABLE_TYPE_REGION3;
40 	if (limit <= _REGION1_SIZE >> PAGE_SHIFT)
41 		return TABLE_TYPE_REGION2;
42 	return TABLE_TYPE_REGION1;
43 }
44 
45 /**
46  * gmap_new() - Allocate and initialize a guest address space.
47  * @kvm: The kvm owning the guest.
48  * @limit: Maximum address of the gmap address space.
49  *
50  * Return: A guest address space structure.
51  */
52 struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
53 {
54 	struct crst_table *table;
55 	struct gmap *gmap;
56 	int type;
57 
58 	type = gmap_limit_to_type(limit);
59 
60 	gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
61 	if (!gmap)
62 		return NULL;
63 	INIT_LIST_HEAD(&gmap->children);
64 	INIT_LIST_HEAD(&gmap->list);
65 	INIT_LIST_HEAD(&gmap->scb_users);
66 	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
67 	spin_lock_init(&gmap->children_lock);
68 	spin_lock_init(&gmap->host_to_rmap_lock);
69 	refcount_set(&gmap->refcount, 1);
70 
71 	table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
72 	if (!table) {
73 		kfree(gmap);
74 		return NULL;
75 	}
76 
77 	gmap->asce.val = __pa(table);
78 	gmap->asce.dt = type;
79 	gmap->asce.tl = _ASCE_TABLE_LENGTH;
80 	gmap->asce.x = 1;
81 	gmap->asce.p = 1;
82 	gmap->asce.s = 1;
83 	gmap->kvm = kvm;
84 	set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
85 
86 	return gmap;
87 }
88 
89 static void gmap_add_child(struct gmap *parent, struct gmap *child)
90 {
91 	KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
92 	KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
93 	KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
94 	lockdep_assert_held(&parent->children_lock);
95 
96 	child->parent = parent;
97 
98 	if (is_ucontrol(parent))
99 		set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
100 	else
101 		clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
102 
103 	if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
104 		set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
105 	else
106 		clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
107 
108 	if (test_bit(GMAP_FLAG_ALLOW_HPAGE_2G, &parent->flags))
109 		set_bit(GMAP_FLAG_ALLOW_HPAGE_2G, &child->flags);
110 	else
111 		clear_bit(GMAP_FLAG_ALLOW_HPAGE_2G, &child->flags);
112 
113 	if (kvm_is_ucontrol(parent->kvm))
114 		clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
115 	list_add(&child->list, &parent->children);
116 }
117 
118 struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
119 {
120 	struct gmap *res;
121 
122 	lockdep_assert_not_held(&parent->children_lock);
123 	res = gmap_new(parent->kvm, limit);
124 	if (res) {
125 		scoped_guard(spinlock, &parent->children_lock)
126 			gmap_add_child(parent, res);
127 	}
128 	return res;
129 }
130 
131 int gmap_set_limit(struct gmap *gmap, gfn_t limit)
132 {
133 	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
134 	int rc, type;
135 
136 	type = gmap_limit_to_type(limit);
137 
138 	mc = kvm_s390_new_mmu_cache();
139 	if (!mc)
140 		return -ENOMEM;
141 
142 	do {
143 		rc = kvm_s390_mmu_cache_topup(mc);
144 		if (rc)
145 			return rc;
146 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
147 			rc = dat_set_asce_limit(mc, &gmap->asce, type);
148 	} while (rc == -ENOMEM);
149 
150 	return 0;
151 }
152 
153 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
154 {
155 	struct vsie_rmap *rmap, *rnext, *head;
156 	struct radix_tree_iter iter;
157 	unsigned long indices[16];
158 	unsigned long index;
159 	void __rcu **slot;
160 	int i, nr;
161 
162 	/* A radix tree is freed by deleting all of its entries */
163 	index = 0;
164 	do {
165 		nr = 0;
166 		radix_tree_for_each_slot(slot, root, &iter, index) {
167 			indices[nr] = iter.index;
168 			if (++nr == 16)
169 				break;
170 		}
171 		for (i = 0; i < nr; i++) {
172 			index = indices[i];
173 			head = radix_tree_delete(root, index);
174 			gmap_for_each_rmap_safe(rmap, rnext, head)
175 				kfree(rmap);
176 		}
177 	} while (nr > 0);
178 }
179 
180 void gmap_remove_child(struct gmap *child)
181 {
182 	if (KVM_BUG_ON(!child->parent, child->kvm))
183 		return;
184 	lockdep_assert_held(&child->parent->children_lock);
185 
186 	list_del(&child->list);
187 	child->parent = NULL;
188 	child->invalidated = true;
189 }
190 
191 /**
192  * gmap_dispose() - Remove and free a guest address space and its children.
193  * @gmap: Pointer to the guest address space structure.
194  */
195 void gmap_dispose(struct gmap *gmap)
196 {
197 	/* The gmap must have been removed from the parent beforehands */
198 	KVM_BUG_ON(gmap->parent, gmap->kvm);
199 	/* All children of this gmap must have been removed beforehands */
200 	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
201 	/* No VSIE shadow block is allowed to use this gmap */
202 	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
203 	/* The ASCE must be valid */
204 	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
205 	/* The refcount must be 0 */
206 	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);
207 
208 	/* Flush tlb of all gmaps */
209 	asce_flush_tlb(gmap->asce);
210 
211 	/* Free all DAT tables. */
212 	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));
213 
214 	/* Free additional data for a shadow gmap */
215 	if (is_shadow(gmap))
216 		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
217 
218 	kfree(gmap);
219 }
220 
221 /**
222  * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
223  * @gmap: The gmap whose ASCE needs to be replaced.
224  *
225  * If the ASCE is a SEGMENT type then this function will return -EINVAL,
226  * otherwise the pointers in the host_to_guest radix tree will keep pointing
227  * to the wrong pages, causing use-after-free and memory corruption.
228  * If the allocation of the new top level page table fails, the ASCE is not
229  * replaced.
230  * In any case, the old ASCE is always removed from the gmap CRST list.
231  * Therefore the caller has to make sure to save a pointer to it
232  * beforehand, unless a leak is actually intended.
233  *
234  * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE,
235  *         -ENOMEM if runinng out of memory.
236  */
237 int s390_replace_asce(struct gmap *gmap)
238 {
239 	struct crst_table *table;
240 	union asce asce;
241 
242 	/* Replacing segment type ASCEs would cause serious issues */
243 	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
244 		return -EINVAL;
245 
246 	table = dat_alloc_crst_sleepable(0);
247 	if (!table)
248 		return -ENOMEM;
249 	memcpy(table, dereference_asce(gmap->asce), sizeof(*table));
250 
251 	/* Set new table origin while preserving existing ASCE control bits */
252 	asce = gmap->asce;
253 	asce.rsto = virt_to_pfn(table);
254 	WRITE_ONCE(gmap->asce, asce);
255 
256 	return 0;
257 }
258 
259 bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
260 {
261 	struct kvm *kvm = gmap->kvm;
262 	struct kvm_vcpu *vcpu;
263 	gfn_t prefix_gfn;
264 	unsigned long i;
265 
266 	if (is_shadow(gmap))
267 		return false;
268 	kvm_for_each_vcpu(i, vcpu, kvm) {
269 		/* Match against both prefix pages */
270 		prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu));
271 		if (prefix_gfn < end && gfn <= prefix_gfn + 1) {
272 			if (hint && kvm_s390_is_in_sie(vcpu))
273 				return false;
274 			VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
275 				   gfn_to_gpa(gfn), gfn_to_gpa(end));
276 			kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
277 		}
278 	}
279 	return true;
280 }
281 
/* Walk state shared between gmap_age_gfn() and its per-entry callbacks. */
struct clear_young_pte_priv {
	struct gmap *gmap;	/* the gmap being walked */
	bool young;		/* set by the callbacks, reported back by gmap_age_gfn() */
};
286 
287 static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
288 {
289 	struct clear_young_pte_priv *p = walk->priv;
290 	union pgste pgste;
291 	union pte pte, new;
292 
293 	pte = READ_ONCE(*ptep);
294 
295 	if (!pte.s.pr || (!pte.s.y && pte.h.i))
296 		return 0;
297 
298 	pgste = pgste_get_lock(ptep);
299 	if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
300 		new = pte;
301 		new.h.i = 1;
302 		new.s.y = 0;
303 		if ((new.s.d || !new.h.p) && !new.s.s)
304 			folio_set_dirty(pfn_folio(pte.h.pfra));
305 		new.s.d = 0;
306 		new.h.p = 1;
307 
308 		pgste.prefix_notif = 0;
309 		pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
310 	}
311 	p->young = 1;
312 	pgste_set_unlock(ptep, pgste);
313 	return 0;
314 }
315 
316 static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
317 {
318 	struct clear_young_pte_priv *priv = walk->priv;
319 	union crste crste, new;
320 
321 	do {
322 		crste = READ_ONCE(*crstep);
323 
324 		if (!crste.h.fc)
325 			return 0;
326 		if (!crste.s.fc1.y && crste.h.i)
327 			return 0;
328 		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
329 			break;
330 
331 		new = crste;
332 		new.h.i = 1;
333 		new.s.fc1.y = 0;
334 		new.s.fc1.prefix_notif = 0;
335 		if (new.s.fc1.d || !new.h.p)
336 			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
337 		new.s.fc1.d = 0;
338 		new.h.p = 1;
339 	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));
340 
341 	priv->young = 1;
342 	return 0;
343 }
344 
345 /**
346  * gmap_age_gfn() - Clear young.
347  * @gmap: The guest gmap.
348  * @start: The first gfn to test.
349  * @end: The gfn after the last one to test.
350  *
351  * Context: Called with the kvm mmu write lock held.
352  * Return: 1 if any page in the given range was young, otherwise 0.
353  */
354 bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
355 {
356 	const struct dat_walk_ops ops = {
357 		.pte_entry = gmap_clear_young_pte,
358 		.pmd_entry = gmap_clear_young_crste,
359 		.pud_entry = gmap_clear_young_crste,
360 	};
361 	struct clear_young_pte_priv priv = {
362 		.gmap = gmap,
363 		.young = false,
364 	};
365 
366 	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
367 
368 	return priv.young;
369 }
370 
/* Walk state shared between gmap_unmap_gfn_range() and its per-entry callbacks. */
struct gmap_unmap_priv {
	struct gmap *gmap;		/* the gmap being unmapped from */
	struct kvm_memory_slot *slot;	/* memslot used to turn a gfn into a host address */
};
375 
376 static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
377 {
378 	struct gmap_unmap_priv *priv = w->priv;
379 	struct folio *folio = NULL;
380 	unsigned long vmaddr;
381 	union pgste pgste;
382 
383 	pgste = pgste_get_lock(ptep);
384 	if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
385 		vmaddr = __gfn_to_hva_memslot(priv->slot, gfn);
386 		gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr);
387 	}
388 	if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
389 		folio = pfn_folio(ptep->h.pfra);
390 	pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn);
391 	pgste_set_unlock(ptep, pgste);
392 	if (folio)
393 		uv_convert_from_secure_folio(folio);
394 
395 	return 0;
396 }
397 
398 static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
399 {
400 	struct gmap_unmap_priv *priv = walk->priv;
401 	struct folio *folio = NULL;
402 	union crste old = *crstep;
403 	bool ok;
404 
405 	if (!old.h.fc)
406 		return 0;
407 
408 	if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
409 		folio = phys_to_folio(crste_origin_large(old));
410 	/*
411 	 * No races should happen because kvm->mmu_lock is held in write mode,
412 	 * but the unmap operation could have triggered an unshadow, which
413 	 * causes gmap_crstep_xchg_atomic() to return false and clear the
414 	 * vsie_notif bit. Allow the operation to fail once, if the old crste
415 	 * had the vsie_notif bit set. A second failure is not allowed, for
416 	 * the reasons above.
417 	 */
418 	ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn);
419 	if (!ok) {
420 		KVM_BUG_ON(!old.s.fc1.vsie_notif, priv->gmap->kvm);
421 		old.s.fc1.vsie_notif = 0;
422 		ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn);
423 		KVM_BUG_ON(!ok, priv->gmap->kvm);
424 	}
425 	if (folio)
426 		uv_convert_from_secure_folio(folio);
427 
428 	return 0;
429 }
430 
431 /**
432  * gmap_unmap_gfn_range() - Unmap a range of guest addresses.
433  * @gmap: The gmap to act on.
434  * @slot: The memslot in which the range is located.
435  * @start: The first gfn to unmap.
436  * @end: The gfn after the last one to unmap.
437  *
438  * Context: Called with the kvm mmu write lock held.
439  * Return: false
440  */
441 bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
442 {
443 	const struct dat_walk_ops ops = {
444 		.pte_entry = _gmap_unmap_pte,
445 		.pmd_entry = _gmap_unmap_crste,
446 		.pud_entry = _gmap_unmap_crste,
447 	};
448 	struct gmap_unmap_priv priv = {
449 		.gmap = gmap,
450 		.slot = slot,
451 	};
452 
453 	lockdep_assert_held_write(&gmap->kvm->mmu_lock);
454 
455 	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
456 	return false;
457 }
458 
459 static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
460 						  struct gmap *gmap)
461 {
462 	union pte pte = READ_ONCE(*ptep);
463 
464 	if (!pte.s.pr || (pte.h.p && !pte.s.sd))
465 		return pgste;
466 
467 	/*
468 	 * If this page contains one or more prefixes of vCPUS that are currently
469 	 * running, do not reset the protection, leave it marked as dirty.
470 	 */
471 	if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
472 		pte.h.p = 1;
473 		pte.s.sd = 0;
474 		pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn);
475 	}
476 
477 	mark_page_dirty(gmap->kvm, gfn);
478 
479 	return pgste;
480 }
481 
482 static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
483 					  struct dat_walk *walk)
484 {
485 	struct gmap *gmap = walk->priv;
486 	union pgste pgste;
487 
488 	pgste = pgste_get_lock(ptep);
489 	pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap);
490 	pgste_set_unlock(ptep, pgste);
491 	return 0;
492 }
493 
494 static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
495 					    struct dat_walk *walk)
496 {
497 	struct gmap *gmap = walk->priv;
498 	union crste crste, new;
499 
500 	if (fatal_signal_pending(current))
501 		return 1;
502 	do {
503 		crste = READ_ONCE(*table);
504 		if (!crste.h.fc)
505 			return 0;
506 		if (crste.h.p && !crste.s.fc1.sd)
507 			return 0;
508 
509 		/*
510 		 * If this large page contains one or more prefixes of vCPUs that are
511 		 * currently running, do not reset the protection, leave it marked as
512 		 * dirty.
513 		 */
514 		if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
515 			break;
516 		new = crste;
517 		new.h.p = 1;
518 		new.s.fc1.sd = 0;
519 	} while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));
520 
521 	for ( ; gfn < end; gfn++)
522 		mark_page_dirty(gmap->kvm, gfn);
523 
524 	return 0;
525 }
526 
527 void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
528 {
529 	const struct dat_walk_ops walk_ops = {
530 		.pte_entry = _pte_test_and_clear_softdirty,
531 		.pmd_entry = _crste_test_and_clear_softdirty,
532 		.pud_entry = _crste_test_and_clear_softdirty,
533 	};
534 
535 	lockdep_assert_held(&gmap->kvm->mmu_lock);
536 
537 	_dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
538 }
539 
540 static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
541 {
542 	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);
543 
544 	/* Somehow the crste is not large anymore, let the slow path deal with it. */
545 	if (!oldcrste.h.fc)
546 		return 1;
547 
548 	f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
549 	f->writable = oldcrste.s.fc1.w;
550 
551 	f->crste_region3 = is_pud(oldcrste);
552 	/* Appropriate permissions already (race with another handler), nothing to do. */
553 	if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
554 		return 0;
555 
556 	if (!f->write_attempt || oldcrste.s.fc1.w) {
557 		f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d;
558 		newcrste = oldcrste;
559 		newcrste.h.i = 0;
560 		newcrste.s.fc1.y = 1;
561 		if (f->write_attempt) {
562 			newcrste.h.p = 0;
563 			newcrste.s.fc1.d = 1;
564 			newcrste.s.fc1.sd = 1;
565 		}
566 		/* In case of races, let the slow path deal with it. */
567 		return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
568 	}
569 	/* Trying to write on a read-only page, let the slow path deal with it. */
570 	return 1;
571 }
572 
573 static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
574 					struct guest_fault *f)
575 {
576 	union pte newpte, oldpte = READ_ONCE(*f->ptep);
577 
578 	f->pfn = oldpte.h.pfra;
579 	f->writable = oldpte.s.w;
580 
581 	/* Appropriate permissions already (race with another handler), nothing to do. */
582 	if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
583 		return 0;
584 	/* Trying to write on a read-only page, let the slow path deal with it. */
585 	if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
586 		return 1;
587 
588 	newpte = oldpte;
589 	newpte.h.i = 0;
590 	newpte.s.y = 1;
591 	if (f->write_attempt) {
592 		newpte.h.p = 0;
593 		newpte.s.d = 1;
594 		newpte.s.sd = 1;
595 	}
596 	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
597 
598 	return 0;
599 }
600 
601 /**
602  * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault.
603  * @gmap: The gmap whose fault needs to be resolved.
604  * @fault: Describes the fault that is being resolved.
605  *
606  * A minor fault is a fault that can be resolved quickly within gmap.
607  * The page is already mapped, the fault is only due to dirty/young tracking.
608  *
609  * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
610  *         not be resolved and needs to go through the slow path.
611  */
612 int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
613 {
614 	union pgste pgste;
615 	int rc;
616 
617 	lockdep_assert_held(&gmap->kvm->mmu_lock);
618 
619 	rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
620 			    &fault->crstep, &fault->ptep);
621 	/* If a PTE or a leaf CRSTE could not be reached, slow path. */
622 	if (rc)
623 		return 1;
624 
625 	if (fault->ptep) {
626 		pgste = pgste_get_lock(fault->ptep);
627 		rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
628 		if (!rc && fault->callback)
629 			fault->callback(fault);
630 		pgste_set_unlock(fault->ptep, pgste);
631 	} else {
632 		rc = gmap_handle_minor_crste_fault(gmap, fault);
633 		if (!rc && fault->callback)
634 			fault->callback(fault);
635 	}
636 	return rc;
637 }
638 
639 /**
640  * gmap_2g_allowed() - Check whether a 2G hugepage is allowed.
641  * @gmap: The gmap of the guest.
642  * @f: Describes the fault that is being resolved.
643  * @slot: The memslot the faulting address belongs to.
644  *
645  * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_2G flag is set for
646  * @gmap, whether the offset of the address in the 2G virtual frame is the
647  * same as the offset in the physical 2G frame, and finally whether the whole
648  * 2G page would fit in the given memslot.
649  *
650  * Return: true if a 2G hugepage is allowed to back the faulting address, false
651  *         otherwise.
652  */
653 static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f,
654 				   struct kvm_memory_slot *slot)
655 {
656 	return test_bit(GMAP_FLAG_ALLOW_HPAGE_2G, &gmap->flags) &&
657 	       !((f->gfn ^ f->pfn) & ~_REGION3_FR_MASK) &&
658 	       slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_REGION3) &&
659 	       slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_REGION3);
660 }
661 
662 /**
663  * gmap_1m_allowed() - Check whether a 1M hugepage is allowed.
664  * @gmap: The gmap of the guest.
665  * @f: Describes the fault that is being resolved.
666  * @slot: The memslot the faulting address belongs to.
667  *
668  * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_1M flag is set for
669  * @gmap, whether the offset of the address in the 1M virtual frame is the
670  * same as the offset in the physical 1M frame, and finally whether the whole
671  * 1M page would fit in the given memslot.
672  *
673  * Return: true if a 1M hugepage is allowed to back the faulting address, false
674  *         otherwise.
675  */
676 static inline bool gmap_1m_allowed(struct gmap *gmap, struct guest_fault *f,
677 				   struct kvm_memory_slot *slot)
678 {
679 	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags) &&
680 	       !((f->gfn ^ f->pfn) & ~_SEGMENT_FR_MASK) &&
681 	       slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_SEGMENT) &&
682 	       slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_SEGMENT);
683 }
684 
685 static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
686 		      struct guest_fault *f)
687 {
688 	union crste oldval, newval;
689 	union pte newpte, oldpte;
690 	union pgste pgste;
691 	int rc = 0;
692 
693 	rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
694 			    &f->crstep, &f->ptep);
695 	if (rc == -ENOMEM)
696 		return rc;
697 	if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
698 		return rc;
699 	if (rc)
700 		return -EAGAIN;
701 	if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
702 		return -EINVAL;
703 
704 	if (f->ptep) {
705 		pgste = pgste_get_lock(f->ptep);
706 		oldpte = *f->ptep;
707 		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
708 		newpte.s.sd = oldpte.s.sd;
709 		oldpte.s.sd = 0;
710 		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
711 			pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
712 			if (f->callback)
713 				f->callback(f);
714 		} else {
715 			rc = -EAGAIN;
716 		}
717 		pgste_set_unlock(f->ptep, pgste);
718 	} else {
719 		do {
720 			oldval = READ_ONCE(*f->crstep);
721 			newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
722 					    f->write_attempt | oldval.s.fc1.d);
723 			newval.s.fc1.s = !f->page;
724 			newval.s.fc1.sd = oldval.s.fc1.sd;
725 			if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
726 			    crste_origin_large(oldval) != crste_origin_large(newval))
727 				return -EAGAIN;
728 			f->crste_region3 = is_pud(newval);
729 		} while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
730 		if (f->callback)
731 			f->callback(f);
732 	}
733 
734 	return rc;
735 }
736 
737 int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f,
738 	      struct kvm_memory_slot *slot)
739 {
740 	unsigned int order;
741 	int level;
742 
743 	lockdep_assert_held(&gmap->kvm->mmu_lock);
744 
745 	level = TABLE_TYPE_PAGE_TABLE;
746 	if (f->page) {
747 		order = folio_order(page_folio(f->page));
748 		if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f, slot))
749 			level = TABLE_TYPE_REGION3;
750 		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f, slot))
751 			level = TABLE_TYPE_SEGMENT;
752 	}
753 	return _gmap_link(mc, gmap, level, f);
754 }
755 
756 static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
757 			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
758 {
759 	union crste newcrste, oldcrste;
760 	struct page_table *pt;
761 	union crste *crstep;
762 	union pte *ptep;
763 	int rc;
764 
765 	if (force_alloc)
766 		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
767 				    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
768 	else
769 		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
770 				    TABLE_TYPE_SEGMENT, &crstep, &ptep);
771 	if (rc)
772 		return rc;
773 	if (!ptep) {
774 		newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
775 		newcrste.h.i = 1;
776 		newcrste.h.fc0.tl = 1;
777 	} else {
778 		pt = pte_table_start(ptep);
779 		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
780 		newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
781 	}
782 	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
783 			    &crstep, &ptep);
784 	if (rc)
785 		return rc;
786 	do {
787 		oldcrste = READ_ONCE(*crstep);
788 		if (oldcrste.val == newcrste.val)
789 			break;
790 	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
791 	return 0;
792 }
793 
794 static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
795 {
796 	union pte *ptep;
797 	int rc;
798 
799 	rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
800 			    TABLE_TYPE_SEGMENT, crstepp, &ptep);
801 	if (rc || (!ptep && !crste_is_ucas(**crstepp)))
802 		return -EREMOTE;
803 	if (!ptep)
804 		return 1;
805 	*gaddr &= ~_SEGMENT_MASK;
806 	*gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT;
807 	return 0;
808 }
809 
810 /**
811  * gmap_ucas_translate() - Translate a vcpu address into a host gmap address
812  * @mc: The memory cache to be used for allocations.
813  * @gmap: The per-cpu gmap.
814  * @gaddr: Pointer to the address to be translated, will get overwritten with
815  *         the translated address in case of success.
816  * Translates the per-vCPU guest address into a fake guest address, which can
817  * then be used with the fake memslots that are identity mapping userspace.
818  * This allows ucontrol VMs to use the normal fault resolution path, like
819  * normal VMs.
820  *
821  * Return: %0 in case of success, otherwise %-EREMOTE.
822  */
823 int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr)
824 {
825 	gpa_t translated_address;
826 	union crste *crstep;
827 	gfn_t gfn;
828 	int rc;
829 
830 	gfn = gpa_to_gfn(*gaddr);
831 
832 	scoped_guard(read_lock, &gmap->kvm->mmu_lock) {
833 		rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
834 		if (rc <= 0)
835 			return rc;
836 	}
837 	do {
838 		scoped_guard(write_lock, &gmap->kvm->mmu_lock) {
839 			rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
840 			if (rc <= 0)
841 				return rc;
842 			translated_address = (*gaddr & ~_SEGMENT_MASK) |
843 					     (crstep->val & _SEGMENT_MASK);
844 			rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true);
845 		}
846 		if (!rc) {
847 			*gaddr = translated_address;
848 			return 0;
849 		}
850 		if (rc != -ENOMEM)
851 			return -EREMOTE;
852 		rc = kvm_s390_mmu_cache_topup(mc);
853 		if (rc)
854 			return rc;
855 	} while (1);
856 	return 0;
857 }
858 
859 int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
860 {
861 	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
862 	int rc = 0;
863 
864 	mc = kvm_s390_new_mmu_cache();
865 	if (!mc)
866 		return -ENOMEM;
867 
868 	while (count) {
869 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
870 			rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false);
871 		if (rc == -ENOMEM) {
872 			rc = kvm_s390_mmu_cache_topup(mc);
873 			if (rc)
874 				return rc;
875 			continue;
876 		}
877 		if (rc)
878 			return rc;
879 
880 		count--;
881 		c_gfn += _PAGE_ENTRIES;
882 		p_gfn += _PAGE_ENTRIES;
883 	}
884 	return rc;
885 }
886 
887 static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
888 {
889 	union crste *crstep;
890 	union pte *ptep;
891 	int rc;
892 
893 	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
894 	if (rc)
895 		return;
896 	while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
897 		;
898 }
899 
900 void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
901 {
902 	guard(read_lock)(&gmap->kvm->mmu_lock);
903 
904 	for ( ; count; count--, c_gfn += _PAGE_ENTRIES)
905 		gmap_ucas_unmap_one(gmap, c_gfn);
906 }
907 
908 static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
909 {
910 	struct gmap *gmap = walk->priv;
911 	union crste crste, newcrste;
912 
913 	crste = READ_ONCE(*crstep);
914 	newcrste = _CRSTE_EMPTY(crste.h.tt);
915 
916 	while (crste_leaf(crste)) {
917 		if (crste_prefix(crste))
918 			gmap_unmap_prefix(gmap, gfn, next);
919 		if (crste.s.fc1.vsie_notif)
920 			gmap_handle_vsie_unshadow_event(gmap, gfn);
921 		if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce))
922 			break;
923 		crste = READ_ONCE(*crstep);
924 	}
925 
926 	if (need_resched())
927 		return next;
928 
929 	return 0;
930 }
931 
932 void gmap_split_huge_pages(struct gmap *gmap)
933 {
934 	const struct dat_walk_ops ops = {
935 		.pmd_entry = _gmap_split_crste,
936 		.pud_entry = _gmap_split_crste,
937 	};
938 	gfn_t start = 0;
939 
940 	do {
941 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
942 			start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce,
943 						    &ops, DAT_WALK_IGN_HOLES, gmap);
944 		cond_resched();
945 	} while (start);
946 }
947 
948 static int _gmap_enable_skeys(struct gmap *gmap)
949 {
950 	gfn_t start = 0;
951 	int rc;
952 
953 	if (uses_skeys(gmap))
954 		return 0;
955 
956 	set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
957 	rc = gmap_helper_disable_cow_sharing();
958 	if (rc) {
959 		clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
960 		return rc;
961 	}
962 
963 	do {
964 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
965 			start = dat_reset_skeys(gmap->asce, start);
966 		cond_resched();
967 	} while (start);
968 	return 0;
969 }
970 
971 int gmap_enable_skeys(struct gmap *gmap)
972 {
973 	int rc;
974 
975 	mmap_write_lock(gmap->kvm->mm);
976 	rc = _gmap_enable_skeys(gmap);
977 	mmap_write_unlock(gmap->kvm->mm);
978 	return rc;
979 }
980 
981 static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
982 {
983 	if (!ptep->s.pr)
984 		return 0;
985 	__kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));
986 	if (need_resched())
987 		return next;
988 	return 0;
989 }
990 
991 static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
992 {
993 	phys_addr_t origin, cur, end;
994 
995 	if (!crstep->h.fc || !crstep->s.fc1.pr)
996 		return 0;
997 
998 	origin = crste_origin_large(*crstep);
999 	cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
1000 	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
1001 	for ( ; cur < end; cur += PAGE_SIZE)
1002 		__kvm_s390_pv_destroy_page(phys_to_page(cur));
1003 	if (need_resched())
1004 		return next;
1005 	return 0;
1006 }
1007 
1008 int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
1009 {
1010 	const struct dat_walk_ops ops = {
1011 		.pte_entry = _destroy_pages_pte,
1012 		.pmd_entry = _destroy_pages_crste,
1013 		.pud_entry = _destroy_pages_crste,
1014 	};
1015 
1016 	do {
1017 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
1018 			start = _dat_walk_gfn_range(start, end, gmap->asce, &ops,
1019 						    DAT_WALK_IGN_HOLES, NULL);
1020 		if (interruptible && fatal_signal_pending(current))
1021 			return -EINTR;
1022 		cond_resched();
1023 	} while (start && start < end);
1024 	return 0;
1025 }
1026 
1027 int gmap_insert_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn,
1028 		     gfn_t r_gfn, int level)
1029 {
1030 	struct vsie_rmap *rmap __free(kvfree) = NULL;
1031 	struct vsie_rmap *temp;
1032 	void __rcu **slot;
1033 	int rc = 0;
1034 
1035 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1036 	lockdep_assert_held(&sg->host_to_rmap_lock);
1037 
1038 	rmap = kvm_s390_mmu_cache_alloc_rmap(mc);
1039 	if (!rmap)
1040 		return -ENOMEM;
1041 
1042 	rmap->r_gfn = r_gfn;
1043 	rmap->level = level;
1044 	slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
1045 	if (slot) {
1046 		rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
1047 		for (temp = rmap->next; temp; temp = temp->next) {
1048 			if (temp->val == rmap->val)
1049 				return 0;
1050 		}
1051 		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1052 	} else {
1053 		rmap->next = NULL;
1054 		rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
1055 		if (rc)
1056 			return rc;
1057 	}
1058 	rmap = NULL;
1059 
1060 	return 0;
1061 }
1062 
1063 int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
1064 		      kvm_pfn_t pfn, int level, bool wr)
1065 {
1066 	unsigned long bitmask;
1067 	union crste *crstep;
1068 	union pgste pgste;
1069 	union pte *ptep;
1070 	union pte pte;
1071 	int flags, rc;
1072 
1073 	if (KVM_BUG_ON(!is_shadow(sg) || level <= TABLE_TYPE_PAGE_TABLE, sg->kvm))
1074 		return -EINVAL;
1075 	lockdep_assert_held(&sg->parent->children_lock);
1076 
1077 	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
1078 	rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags,
1079 			    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
1080 	if (rc)
1081 		return rc;
1082 	if (level <= TABLE_TYPE_REGION1) {
1083 		bitmask = -1UL << (8 + 11 * level);
1084 		scoped_guard(spinlock, &sg->host_to_rmap_lock)
1085 			rc = gmap_insert_rmap(mc, sg, p_gfn, r_gfn & bitmask, level);
1086 	}
1087 	if (rc)
1088 		return rc;
1089 
1090 	if (!pgste_get_trylock(ptep, &pgste))
1091 		return -EAGAIN;
1092 	pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false);
1093 	pte.h.p = 1;
1094 	pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
1095 	pgste.vsie_notif = 1;
1096 	pgste_set_unlock(ptep, pgste);
1097 
1098 	return 0;
1099 }
1100 
1101 static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
1102 {
1103 	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);
1104 	if (need_resched())
1105 		return next;
1106 	return 0;
1107 }
1108 
1109 void gmap_set_cmma_all_dirty(struct gmap *gmap)
1110 {
1111 	const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
1112 	gfn_t gfn = 0;
1113 
1114 	do {
1115 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
1116 			gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
1117 						  DAT_WALK_IGN_HOLES, NULL);
1118 		cond_resched();
1119 	} while (gfn);
1120 }
1121 
1122 static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
1123 {
1124 	unsigned long align = PAGE_SIZE;
1125 	gpa_t gaddr = gfn_to_gpa(r_gfn);
1126 	union crste *crstep;
1127 	union crste crste;
1128 	union pte *ptep;
1129 
1130 	if (level > TABLE_TYPE_PAGE_TABLE)
1131 		align = 1UL << (11 * level + _SEGMENT_SHIFT);
1132 	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align));
1133 	sg->invalidated = true;
1134 	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
1135 		return;
1136 	if (ptep) {
1137 		if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
1138 			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
1139 		return;
1140 	}
1141 
1142 	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
1143 	if (crste_leaf(crste) || crste.h.i)
1144 		return;
1145 	if (is_pmd(crste))
1146 		dat_free_pt(dereference_pmd(crste.pmd));
1147 	else
1148 		dat_free_level(dereference_crste(crste), true);
1149 }
1150 
1151 static void gmap_unshadow(struct gmap *sg)
1152 {
1153 	struct gmap_cache *gmap_cache, *next;
1154 
1155 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1156 	KVM_BUG_ON(!sg->parent, sg->kvm);
1157 
1158 	lockdep_assert_held(&sg->parent->children_lock);
1159 
1160 	gmap_remove_child(sg);
1161 	kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);
1162 
1163 	list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
1164 		gmap_cache->gmap = NULL;
1165 		list_del(&gmap_cache->list);
1166 	}
1167 
1168 	gmap_put(sg);
1169 }
1170 
1171 void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
1172 {
1173 	struct vsie_rmap *rmap, *rnext, *head;
1174 	struct gmap *sg, *next;
1175 	gfn_t start, end;
1176 
1177 	list_for_each_entry_safe(sg, next, &parent->children, list) {
1178 		start = sg->guest_asce.rsto;
1179 		end = start + sg->guest_asce.tl + 1;
1180 		if (!sg->guest_asce.r && gfn >= start && gfn < end) {
1181 			gmap_unshadow(sg);
1182 			continue;
1183 		}
1184 		scoped_guard(spinlock, &sg->host_to_rmap_lock)
1185 			head = radix_tree_delete(&sg->host_to_rmap, gfn);
1186 		gmap_for_each_rmap_safe(rmap, rnext, head) {
1187 			gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
1188 			kfree(rmap);
1189 		}
1190 	}
1191 }
1192 
1193 /**
1194  * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
1195  * @parent: Pointer to the parent gmap.
1196  * @asce: ASCE for which the shadow table is created.
1197  * @edat_level: Edat level to be used for the shadow translation.
1198  *
1199  * Context: Called with parent->children_lock held.
1200  *
1201  * Return: The pointer to a gmap if a shadow table with the given asce is
1202  * already available, ERR_PTR(-EAGAIN) if another one is just being created,
1203  * otherwise NULL.
1204  */
1205 static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level)
1206 {
1207 	struct gmap *sg;
1208 
1209 	lockdep_assert_held(&parent->children_lock);
1210 	list_for_each_entry(sg, &parent->children, list) {
1211 		if (!gmap_is_shadow_valid(sg, asce, edat_level))
1212 			continue;
1213 		return sg;
1214 	}
1215 	return NULL;
1216 }
1217 
/* Number of pages occupied by one CRST table. */
#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE)
/*
 * Context shared by the gmap_protect_asce_top_level() helpers.
 * @seq: kvm->mmu_invalidate_seq as sampled before the pages were faulted in,
 *       used to detect whether the operation needs to be retried.
 * @f:   Fault-in state, one slot per page a CRST table can occupy; only
 *       slots marked valid are protected.
 */
struct gmap_protect_asce_top_level {
	unsigned long seq;
	struct guest_fault f[CRST_TABLE_PAGES];
};
1223 
1224 static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
1225 						struct gmap_protect_asce_top_level *context)
1226 {
1227 	struct gmap *parent;
1228 	int rc, i;
1229 
1230 	guard(write_lock)(&sg->kvm->mmu_lock);
1231 
1232 	if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
1233 		return -EAGAIN;
1234 
1235 	parent = READ_ONCE(sg->parent);
1236 	if (!parent)
1237 		return -EAGAIN;
1238 	scoped_guard(spinlock, &parent->children_lock) {
1239 		if (READ_ONCE(sg->parent) != parent)
1240 			return -EAGAIN;
1241 		sg->invalidated = false;
1242 		for (i = 0; i < CRST_TABLE_PAGES; i++) {
1243 			if (!context->f[i].valid)
1244 				continue;
1245 			rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn,
1246 					       TABLE_TYPE_REGION1 + 1, context->f[i].writable);
1247 			if (rc)
1248 				return rc;
1249 		}
1250 		gmap_add_child(sg->parent, sg);
1251 	}
1252 
1253 	kvm_s390_release_faultin_array(sg->kvm, context->f, false);
1254 	return 0;
1255 }
1256 
1257 static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
1258 					       struct gmap_protect_asce_top_level *context)
1259 {
1260 	int rc;
1261 
1262 	if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f))
1263 		return -EAGAIN;
1264 	do {
1265 		rc = kvm_s390_mmu_cache_topup(mc);
1266 		if (rc)
1267 			return rc;
1268 		rc = radix_tree_preload(GFP_KERNEL);
1269 		if (rc)
1270 			return rc;
1271 		rc = __gmap_protect_asce_top_level(mc, sg, context);
1272 		radix_tree_preload_end();
1273 	} while (rc == -ENOMEM);
1274 
1275 	return rc;
1276 }
1277 
1278 static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg)
1279 {
1280 	struct gmap_protect_asce_top_level context = {};
1281 	union asce asce = sg->guest_asce;
1282 	int rc;
1283 
1284 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1285 
1286 	context.seq = sg->kvm->mmu_invalidate_seq;
1287 	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
1288 	smp_rmb();
1289 
1290 	rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false);
1291 	if (rc > 0)
1292 		rc = -EFAULT;
1293 	if (!rc)
1294 		rc = _gmap_protect_asce_top_level(mc, sg, &context);
1295 	if (rc)
1296 		kvm_s390_release_faultin_array(sg->kvm, context.f, true);
1297 	return rc;
1298 }
1299 
1300 /**
1301  * gmap_create_shadow() - Create/find a shadow guest address space.
1302  * @mc: The cache to use to allocate dat tables.
1303  * @parent: Pointer to the parent gmap.
1304  * @asce: ASCE for which the shadow table is created.
1305  * @edat_level: Edat level to be used for the shadow translation.
1306  *
1307  * The pages of the top level page table referred by the asce parameter
1308  * will be set to read-only and marked in the PGSTEs of the kvm process.
1309  * The shadow table will be removed automatically on any change to the
1310  * PTE mapping for the source table.
1311  *
1312  * The returned shadow gmap will be returned with one extra reference.
1313  *
1314  * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
1315  * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
1316  * parent gmap table could not be protected.
1317  */
1318 struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
1319 				union asce asce, int edat_level)
1320 {
1321 	struct gmap *sg, *new;
1322 	int rc;
1323 
1324 	if (WARN_ON(!parent))
1325 		return ERR_PTR(-EINVAL);
1326 
1327 	scoped_guard(spinlock, &parent->children_lock) {
1328 		sg = gmap_find_shadow(parent, asce, edat_level);
1329 		if (sg) {
1330 			gmap_get(sg);
1331 			return sg;
1332 		}
1333 	}
1334 	/* Create a new shadow gmap. */
1335 	new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
1336 	if (!new)
1337 		return ERR_PTR(-ENOMEM);
1338 	new->guest_asce = asce;
1339 	new->edat_level = edat_level;
1340 	set_bit(GMAP_FLAG_SHADOW, &new->flags);
1341 
1342 	scoped_guard(spinlock, &parent->children_lock) {
1343 		/* Recheck if another CPU created the same shadow. */
1344 		sg = gmap_find_shadow(parent, asce, edat_level);
1345 		if (sg) {
1346 			gmap_put(new);
1347 			gmap_get(sg);
1348 			return sg;
1349 		}
1350 		if (asce.r) {
1351 			/* Only allow one real-space gmap shadow. */
1352 			list_for_each_entry(sg, &parent->children, list) {
1353 				if (sg->guest_asce.r) {
1354 					scoped_guard(write_lock, &parent->kvm->mmu_lock)
1355 						gmap_unshadow(sg);
1356 					break;
1357 				}
1358 			}
1359 			gmap_add_child(parent, new);
1360 			/* Nothing to protect, return right away. */
1361 			gmap_get(new);
1362 			return new;
1363 		}
1364 	}
1365 
1366 	gmap_get(new);
1367 	new->parent = parent;
1368 	/* Protect while inserting, protects against invalidation races. */
1369 	rc = gmap_protect_asce_top_level(mc, new);
1370 	if (rc) {
1371 		new->parent = NULL;
1372 		gmap_put(new);
1373 		gmap_put(new);
1374 		return ERR_PTR(rc);
1375 	}
1376 	return new;
1377 }
1378