xref: /linux/arch/s390/kvm/gmap.c (revision 29e8751c1dd278262fb4cd234e8909287d4189d4)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Guest memory management for KVM/s390
4  *
5  * Copyright IBM Corp. 2008, 2020, 2024
6  *
7  *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
8  *               Martin Schwidefsky <schwidefsky@de.ibm.com>
9  *               David Hildenbrand <david@redhat.com>
10  *               Janosch Frank <frankja@linux.ibm.com>
11  */
12 
13 #include <linux/compiler.h>
14 #include <linux/kvm.h>
15 #include <linux/kvm_host.h>
16 #include <linux/pgtable.h>
17 #include <linux/pagemap.h>
18 #include <asm/lowcore.h>
19 #include <asm/uv.h>
20 #include <asm/gmap_helpers.h>
21 
22 #include "dat.h"
23 #include "gmap.h"
24 #include "kvm-s390.h"
25 #include "faultin.h"
26 
27 static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
28 {
29 	return vcpu->arch.sie_block->prog0c & PROG_IN_SIE;
30 }
31 
32 static int gmap_limit_to_type(gfn_t limit)
33 {
34 	if (!limit)
35 		return TABLE_TYPE_REGION1;
36 	if (limit <= _REGION3_SIZE >> PAGE_SHIFT)
37 		return TABLE_TYPE_SEGMENT;
38 	if (limit <= _REGION2_SIZE >> PAGE_SHIFT)
39 		return TABLE_TYPE_REGION3;
40 	if (limit <= _REGION1_SIZE >> PAGE_SHIFT)
41 		return TABLE_TYPE_REGION2;
42 	return TABLE_TYPE_REGION1;
43 }
44 
45 /**
46  * gmap_new() - Allocate and initialize a guest address space.
47  * @kvm: The kvm owning the guest.
48  * @limit: Maximum address of the gmap address space.
49  *
50  * Return: A guest address space structure.
51  */
52 struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
53 {
54 	struct crst_table *table;
55 	struct gmap *gmap;
56 	int type;
57 
58 	type = gmap_limit_to_type(limit);
59 
60 	gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
61 	if (!gmap)
62 		return NULL;
63 	INIT_LIST_HEAD(&gmap->children);
64 	INIT_LIST_HEAD(&gmap->list);
65 	INIT_LIST_HEAD(&gmap->scb_users);
66 	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
67 	spin_lock_init(&gmap->children_lock);
68 	spin_lock_init(&gmap->host_to_rmap_lock);
69 	refcount_set(&gmap->refcount, 1);
70 
71 	table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
72 	if (!table) {
73 		kfree(gmap);
74 		return NULL;
75 	}
76 
77 	gmap->asce.val = __pa(table);
78 	gmap->asce.dt = type;
79 	gmap->asce.tl = _ASCE_TABLE_LENGTH;
80 	gmap->asce.x = 1;
81 	gmap->asce.p = 1;
82 	gmap->asce.s = 1;
83 	gmap->kvm = kvm;
84 	set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
85 
86 	return gmap;
87 }
88 
89 static void gmap_add_child(struct gmap *parent, struct gmap *child)
90 {
91 	KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
92 	KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
93 	KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
94 	lockdep_assert_held(&parent->children_lock);
95 
96 	child->parent = parent;
97 
98 	if (is_ucontrol(parent))
99 		set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
100 	else
101 		clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
102 
103 	if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
104 		set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
105 	else
106 		clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
107 
108 	if (kvm_is_ucontrol(parent->kvm))
109 		clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
110 	list_add(&child->list, &parent->children);
111 }
112 
113 struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
114 {
115 	struct gmap *res;
116 
117 	lockdep_assert_not_held(&parent->children_lock);
118 	res = gmap_new(parent->kvm, limit);
119 	if (res) {
120 		scoped_guard(spinlock, &parent->children_lock)
121 			gmap_add_child(parent, res);
122 	}
123 	return res;
124 }
125 
126 int gmap_set_limit(struct gmap *gmap, gfn_t limit)
127 {
128 	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
129 	int rc, type;
130 
131 	type = gmap_limit_to_type(limit);
132 
133 	mc = kvm_s390_new_mmu_cache();
134 	if (!mc)
135 		return -ENOMEM;
136 
137 	do {
138 		rc = kvm_s390_mmu_cache_topup(mc);
139 		if (rc)
140 			return rc;
141 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
142 			rc = dat_set_asce_limit(mc, &gmap->asce, type);
143 	} while (rc == -ENOMEM);
144 
145 	return 0;
146 }
147 
148 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
149 {
150 	struct vsie_rmap *rmap, *rnext, *head;
151 	struct radix_tree_iter iter;
152 	unsigned long indices[16];
153 	unsigned long index;
154 	void __rcu **slot;
155 	int i, nr;
156 
157 	/* A radix tree is freed by deleting all of its entries */
158 	index = 0;
159 	do {
160 		nr = 0;
161 		radix_tree_for_each_slot(slot, root, &iter, index) {
162 			indices[nr] = iter.index;
163 			if (++nr == 16)
164 				break;
165 		}
166 		for (i = 0; i < nr; i++) {
167 			index = indices[i];
168 			head = radix_tree_delete(root, index);
169 			gmap_for_each_rmap_safe(rmap, rnext, head)
170 				kfree(rmap);
171 		}
172 	} while (nr > 0);
173 }
174 
175 void gmap_remove_child(struct gmap *child)
176 {
177 	if (KVM_BUG_ON(!child->parent, child->kvm))
178 		return;
179 	lockdep_assert_held(&child->parent->children_lock);
180 
181 	list_del(&child->list);
182 	child->parent = NULL;
183 	child->invalidated = true;
184 }
185 
186 /**
187  * gmap_dispose() - Remove and free a guest address space and its children.
188  * @gmap: Pointer to the guest address space structure.
189  */
190 void gmap_dispose(struct gmap *gmap)
191 {
192 	/* The gmap must have been removed from the parent beforehands */
193 	KVM_BUG_ON(gmap->parent, gmap->kvm);
194 	/* All children of this gmap must have been removed beforehands */
195 	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
196 	/* No VSIE shadow block is allowed to use this gmap */
197 	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
198 	/* The ASCE must be valid */
199 	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
200 	/* The refcount must be 0 */
201 	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);
202 
203 	/* Flush tlb of all gmaps */
204 	asce_flush_tlb(gmap->asce);
205 
206 	/* Free all DAT tables. */
207 	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));
208 
209 	/* Free additional data for a shadow gmap */
210 	if (is_shadow(gmap))
211 		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
212 
213 	kfree(gmap);
214 }
215 
216 /**
217  * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
218  * @gmap: The gmap whose ASCE needs to be replaced.
219  *
220  * If the ASCE is a SEGMENT type then this function will return -EINVAL,
221  * otherwise the pointers in the host_to_guest radix tree will keep pointing
222  * to the wrong pages, causing use-after-free and memory corruption.
223  * If the allocation of the new top level page table fails, the ASCE is not
224  * replaced.
225  * In any case, the old ASCE is always removed from the gmap CRST list.
226  * Therefore the caller has to make sure to save a pointer to it
227  * beforehand, unless a leak is actually intended.
228  *
229  * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE,
230  *         -ENOMEM if runinng out of memory.
231  */
232 int s390_replace_asce(struct gmap *gmap)
233 {
234 	struct crst_table *table;
235 	union asce asce;
236 
237 	/* Replacing segment type ASCEs would cause serious issues */
238 	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
239 		return -EINVAL;
240 
241 	table = dat_alloc_crst_sleepable(0);
242 	if (!table)
243 		return -ENOMEM;
244 	memcpy(table, dereference_asce(gmap->asce), sizeof(*table));
245 
246 	/* Set new table origin while preserving existing ASCE control bits */
247 	asce = gmap->asce;
248 	asce.rsto = virt_to_pfn(table);
249 	WRITE_ONCE(gmap->asce, asce);
250 
251 	return 0;
252 }
253 
254 bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
255 {
256 	struct kvm *kvm = gmap->kvm;
257 	struct kvm_vcpu *vcpu;
258 	gfn_t prefix_gfn;
259 	unsigned long i;
260 
261 	if (is_shadow(gmap))
262 		return false;
263 	kvm_for_each_vcpu(i, vcpu, kvm) {
264 		/* Match against both prefix pages */
265 		prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu));
266 		if (prefix_gfn < end && gfn <= prefix_gfn + 1) {
267 			if (hint && kvm_s390_is_in_sie(vcpu))
268 				return false;
269 			VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
270 				   gfn_to_gpa(gfn), gfn_to_gpa(end));
271 			kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
272 		}
273 	}
274 	return true;
275 }
276 
/* Private data passed to the walk callbacks used by gmap_age_gfn(). */
struct clear_young_pte_priv {
	/* The gmap being walked */
	struct gmap *gmap;
	/* Set by the callbacks once an entry passes their early-exit checks */
	bool young;
};
281 
282 static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
283 {
284 	struct clear_young_pte_priv *p = walk->priv;
285 	union pgste pgste;
286 	union pte pte, new;
287 
288 	pte = READ_ONCE(*ptep);
289 
290 	if (!pte.s.pr || (!pte.s.y && pte.h.i))
291 		return 0;
292 
293 	pgste = pgste_get_lock(ptep);
294 	if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
295 		new = pte;
296 		new.h.i = 1;
297 		new.s.y = 0;
298 		if ((new.s.d || !new.h.p) && !new.s.s)
299 			folio_set_dirty(pfn_folio(pte.h.pfra));
300 		new.s.d = 0;
301 		new.h.p = 1;
302 
303 		pgste.prefix_notif = 0;
304 		pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
305 	}
306 	p->young = 1;
307 	pgste_set_unlock(ptep, pgste);
308 	return 0;
309 }
310 
311 static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
312 {
313 	struct clear_young_pte_priv *priv = walk->priv;
314 	union crste crste, new;
315 
316 	do {
317 		crste = READ_ONCE(*crstep);
318 
319 		if (!crste.h.fc)
320 			return 0;
321 		if (!crste.s.fc1.y && crste.h.i)
322 			return 0;
323 		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
324 			break;
325 
326 		new = crste;
327 		new.h.i = 1;
328 		new.s.fc1.y = 0;
329 		new.s.fc1.prefix_notif = 0;
330 		if (new.s.fc1.d || !new.h.p)
331 			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
332 		new.s.fc1.d = 0;
333 		new.h.p = 1;
334 	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));
335 
336 	priv->young = 1;
337 	return 0;
338 }
339 
340 /**
341  * gmap_age_gfn() - Clear young.
342  * @gmap: The guest gmap.
343  * @start: The first gfn to test.
344  * @end: The gfn after the last one to test.
345  *
346  * Context: Called with the kvm mmu write lock held.
347  * Return: 1 if any page in the given range was young, otherwise 0.
348  */
349 bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
350 {
351 	const struct dat_walk_ops ops = {
352 		.pte_entry = gmap_clear_young_pte,
353 		.pmd_entry = gmap_clear_young_crste,
354 		.pud_entry = gmap_clear_young_crste,
355 	};
356 	struct clear_young_pte_priv priv = {
357 		.gmap = gmap,
358 		.young = false,
359 	};
360 
361 	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
362 
363 	return priv.young;
364 }
365 
/* Private data passed to the walk callbacks used by gmap_unmap_gfn_range(). */
struct gmap_unmap_priv {
	/* The gmap whose entries are being cleared */
	struct gmap *gmap;
	/* The memslot used to turn a gfn into a host virtual address */
	struct kvm_memory_slot *slot;
};
370 
371 static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
372 {
373 	struct gmap_unmap_priv *priv = w->priv;
374 	struct folio *folio = NULL;
375 	unsigned long vmaddr;
376 	union pgste pgste;
377 
378 	pgste = pgste_get_lock(ptep);
379 	if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
380 		vmaddr = __gfn_to_hva_memslot(priv->slot, gfn);
381 		gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr);
382 	}
383 	if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
384 		folio = pfn_folio(ptep->h.pfra);
385 	pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn);
386 	pgste_set_unlock(ptep, pgste);
387 	if (folio)
388 		uv_convert_from_secure_folio(folio);
389 
390 	return 0;
391 }
392 
393 static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
394 {
395 	struct gmap_unmap_priv *priv = walk->priv;
396 	struct folio *folio = NULL;
397 	union crste old = *crstep;
398 	bool ok;
399 
400 	if (!old.h.fc)
401 		return 0;
402 
403 	if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
404 		folio = phys_to_folio(crste_origin_large(old));
405 	/*
406 	 * No races should happen because kvm->mmu_lock is held in write mode,
407 	 * but the unmap operation could have triggered an unshadow, which
408 	 * causes gmap_crstep_xchg_atomic() to return false and clear the
409 	 * vsie_notif bit. Allow the operation to fail once, if the old crste
410 	 * had the vsie_notif bit set. A second failure is not allowed, for
411 	 * the reasons above.
412 	 */
413 	ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn);
414 	if (!ok) {
415 		KVM_BUG_ON(!old.s.fc1.vsie_notif, priv->gmap->kvm);
416 		old.s.fc1.vsie_notif = 0;
417 		ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn);
418 		KVM_BUG_ON(!ok, priv->gmap->kvm);
419 	}
420 	if (folio)
421 		uv_convert_from_secure_folio(folio);
422 
423 	return 0;
424 }
425 
426 /**
427  * gmap_unmap_gfn_range() - Unmap a range of guest addresses.
428  * @gmap: The gmap to act on.
429  * @slot: The memslot in which the range is located.
430  * @start: The first gfn to unmap.
431  * @end: The gfn after the last one to unmap.
432  *
433  * Context: Called with the kvm mmu write lock held.
434  * Return: false
435  */
436 bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
437 {
438 	const struct dat_walk_ops ops = {
439 		.pte_entry = _gmap_unmap_pte,
440 		.pmd_entry = _gmap_unmap_crste,
441 		.pud_entry = _gmap_unmap_crste,
442 	};
443 	struct gmap_unmap_priv priv = {
444 		.gmap = gmap,
445 		.slot = slot,
446 	};
447 
448 	lockdep_assert_held_write(&gmap->kvm->mmu_lock);
449 
450 	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
451 	return false;
452 }
453 
454 static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
455 						  struct gmap *gmap)
456 {
457 	union pte pte = READ_ONCE(*ptep);
458 
459 	if (!pte.s.pr || (pte.h.p && !pte.s.sd))
460 		return pgste;
461 
462 	/*
463 	 * If this page contains one or more prefixes of vCPUS that are currently
464 	 * running, do not reset the protection, leave it marked as dirty.
465 	 */
466 	if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
467 		pte.h.p = 1;
468 		pte.s.sd = 0;
469 		pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn);
470 	}
471 
472 	mark_page_dirty(gmap->kvm, gfn);
473 
474 	return pgste;
475 }
476 
477 static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
478 					  struct dat_walk *walk)
479 {
480 	struct gmap *gmap = walk->priv;
481 	union pgste pgste;
482 
483 	pgste = pgste_get_lock(ptep);
484 	pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap);
485 	pgste_set_unlock(ptep, pgste);
486 	return 0;
487 }
488 
489 static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
490 					    struct dat_walk *walk)
491 {
492 	struct gmap *gmap = walk->priv;
493 	union crste crste, new;
494 
495 	if (fatal_signal_pending(current))
496 		return 1;
497 	do {
498 		crste = READ_ONCE(*table);
499 		if (!crste.h.fc)
500 			return 0;
501 		if (crste.h.p && !crste.s.fc1.sd)
502 			return 0;
503 
504 		/*
505 		 * If this large page contains one or more prefixes of vCPUs that are
506 		 * currently running, do not reset the protection, leave it marked as
507 		 * dirty.
508 		 */
509 		if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
510 			break;
511 		new = crste;
512 		new.h.p = 1;
513 		new.s.fc1.sd = 0;
514 	} while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));
515 
516 	for ( ; gfn < end; gfn++)
517 		mark_page_dirty(gmap->kvm, gfn);
518 
519 	return 0;
520 }
521 
522 void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
523 {
524 	const struct dat_walk_ops walk_ops = {
525 		.pte_entry = _pte_test_and_clear_softdirty,
526 		.pmd_entry = _crste_test_and_clear_softdirty,
527 		.pud_entry = _crste_test_and_clear_softdirty,
528 	};
529 
530 	lockdep_assert_held(&gmap->kvm->mmu_lock);
531 
532 	_dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
533 }
534 
535 static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
536 {
537 	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);
538 
539 	/* Somehow the crste is not large anymore, let the slow path deal with it. */
540 	if (!oldcrste.h.fc)
541 		return 1;
542 
543 	f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
544 	f->writable = oldcrste.s.fc1.w;
545 
546 	/* Appropriate permissions already (race with another handler), nothing to do. */
547 	if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
548 		return 0;
549 
550 	if (!f->write_attempt || oldcrste.s.fc1.w) {
551 		f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d;
552 		newcrste = oldcrste;
553 		newcrste.h.i = 0;
554 		newcrste.s.fc1.y = 1;
555 		if (f->write_attempt) {
556 			newcrste.h.p = 0;
557 			newcrste.s.fc1.d = 1;
558 			newcrste.s.fc1.sd = 1;
559 		}
560 		/* In case of races, let the slow path deal with it. */
561 		return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
562 	}
563 	/* Trying to write on a read-only page, let the slow path deal with it. */
564 	return 1;
565 }
566 
567 static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
568 					struct guest_fault *f)
569 {
570 	union pte newpte, oldpte = READ_ONCE(*f->ptep);
571 
572 	f->pfn = oldpte.h.pfra;
573 	f->writable = oldpte.s.w;
574 
575 	/* Appropriate permissions already (race with another handler), nothing to do. */
576 	if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
577 		return 0;
578 	/* Trying to write on a read-only page, let the slow path deal with it. */
579 	if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
580 		return 1;
581 
582 	newpte = oldpte;
583 	newpte.h.i = 0;
584 	newpte.s.y = 1;
585 	if (f->write_attempt) {
586 		newpte.h.p = 0;
587 		newpte.s.d = 1;
588 		newpte.s.sd = 1;
589 	}
590 	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
591 
592 	return 0;
593 }
594 
595 /**
596  * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault.
597  * @gmap: The gmap whose fault needs to be resolved.
598  * @fault: Describes the fault that is being resolved.
599  *
600  * A minor fault is a fault that can be resolved quickly within gmap.
601  * The page is already mapped, the fault is only due to dirty/young tracking.
602  *
603  * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
604  *         not be resolved and needs to go through the slow path.
605  */
606 int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
607 {
608 	union pgste pgste;
609 	int rc;
610 
611 	lockdep_assert_held(&gmap->kvm->mmu_lock);
612 
613 	rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
614 			    &fault->crstep, &fault->ptep);
615 	/* If a PTE or a leaf CRSTE could not be reached, slow path. */
616 	if (rc)
617 		return 1;
618 
619 	if (fault->ptep) {
620 		pgste = pgste_get_lock(fault->ptep);
621 		rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
622 		if (!rc && fault->callback)
623 			fault->callback(fault);
624 		pgste_set_unlock(fault->ptep, pgste);
625 	} else {
626 		rc = gmap_handle_minor_crste_fault(gmap, fault);
627 		if (!rc && fault->callback)
628 			fault->callback(fault);
629 	}
630 	return rc;
631 }
632 
/*
 * Check whether a 2G hugepage is allowed to back the faulting address.
 * This always returns false; none of the parameters are looked at.
 */
static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f,
				   struct kvm_memory_slot *slot)
{
	return false;
}
638 
639 /**
640  * gmap_1m_allowed() - Check whether a 1M hugepage is allowed.
641  * @gmap: The gmap of the guest.
642  * @f: Describes the fault that is being resolved.
643  * @slot: The memslot the faulting address belongs to.
644  *
645  * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_1M flag is set for
646  * @gmap, whether the offset of the address in the 1M virtual frame is the
647  * same as the offset in the physical 1M frame, and finally whether the whole
648  * 1M page would fit in the given memslot.
649  *
650  * Return: true if a 1M hugepage is allowed to back the faulting address, false
651  *         otherwise.
652  */
653 static inline bool gmap_1m_allowed(struct gmap *gmap, struct guest_fault *f,
654 				   struct kvm_memory_slot *slot)
655 {
656 	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags) &&
657 	       !((f->gfn ^ f->pfn) & ~_SEGMENT_FR_MASK) &&
658 	       slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_SEGMENT) &&
659 	       slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_SEGMENT);
660 }
661 
/*
 * Install the mapping described by @f into @gmap, at table level @level or
 * below.
 *
 * The tables are walked with DAT_WALK_ALLOC_CONTINUE, using @mc for
 * allocations; the resulting entry pointers are stored in @f. If the walk
 * ends on a PTE, the new PTE is installed under the PGSTE lock; otherwise a
 * large entry is installed with an atomic exchange that is retried until it
 * succeeds. In both cases an existing entry is only replaced if it is empty
 * or already points to the same frame, and the callback of @f, if any, is
 * invoked after a successful update.
 *
 * Return: 0 on success; -ENOMEM if the walk ran out of memory; -EINVAL
 *         (together with KVM_BUG_ON) if the walk returned -EINVAL or ended
 *         on a level above @level; -EAGAIN if the walk failed for any other
 *         reason or the existing entry maps a different frame.
 */
static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
		      struct guest_fault *f)
{
	union crste oldval, newval;
	union pte newpte, oldpte;
	union pgste pgste;
	int rc = 0;

	rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
			    &f->crstep, &f->ptep);
	if (rc == -ENOMEM)
		return rc;
	if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
		return rc;
	if (rc)
		return -EAGAIN;
	if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
		return -EINVAL;

	if (f->ptep) {
		pgste = pgste_get_lock(f->ptep);
		oldpte = *f->ptep;
		/* The last argument marks the PTE as special when there is no struct page */
		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
		/* Carry the soft-dirty bit over to the new PTE ... */
		newpte.s.sd = oldpte.s.sd;
		/* ... and ignore it in the comparison against the empty PTE below */
		oldpte.s.sd = 0;
		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
			pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
			/* The callback runs while the PGSTE lock is still held */
			if (f->callback)
				f->callback(f);
		} else {
			rc = -EAGAIN;
		}
		pgste_set_unlock(f->ptep, pgste);
	} else {
		do {
			oldval = READ_ONCE(*f->crstep);
			newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
					    f->write_attempt | oldval.s.fc1.d);
			newval.s.fc1.s = !f->page;
			newval.s.fc1.sd = oldval.s.fc1.sd;
			/* A non-empty entry that maps a different frame is not replaced */
			if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
			    crste_origin_large(oldval) != crste_origin_large(newval))
				return -EAGAIN;
		} while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
		if (f->callback)
			f->callback(f);
	}

	return rc;
}
712 
713 int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f,
714 	      struct kvm_memory_slot *slot)
715 {
716 	unsigned int order;
717 	int level;
718 
719 	lockdep_assert_held(&gmap->kvm->mmu_lock);
720 
721 	level = TABLE_TYPE_PAGE_TABLE;
722 	if (f->page) {
723 		order = folio_order(page_folio(f->page));
724 		if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f, slot))
725 			level = TABLE_TYPE_REGION3;
726 		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f, slot))
727 			level = TABLE_TYPE_SEGMENT;
728 	}
729 	return _gmap_link(mc, gmap, level, f);
730 }
731 
732 static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
733 			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
734 {
735 	union crste newcrste, oldcrste;
736 	struct page_table *pt;
737 	union crste *crstep;
738 	union pte *ptep;
739 	int rc;
740 
741 	if (force_alloc)
742 		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
743 				    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
744 	else
745 		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
746 				    TABLE_TYPE_SEGMENT, &crstep, &ptep);
747 	if (rc)
748 		return rc;
749 	if (!ptep) {
750 		newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
751 		newcrste.h.i = 1;
752 		newcrste.h.fc0.tl = 1;
753 	} else {
754 		pt = pte_table_start(ptep);
755 		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
756 		newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
757 	}
758 	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
759 			    &crstep, &ptep);
760 	if (rc)
761 		return rc;
762 	do {
763 		oldcrste = READ_ONCE(*crstep);
764 		if (oldcrste.val == newcrste.val)
765 			break;
766 	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
767 	return 0;
768 }
769 
770 static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
771 {
772 	union pte *ptep;
773 	int rc;
774 
775 	rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
776 			    TABLE_TYPE_SEGMENT, crstepp, &ptep);
777 	if (rc || (!ptep && !crste_is_ucas(**crstepp)))
778 		return -EREMOTE;
779 	if (!ptep)
780 		return 1;
781 	*gaddr &= ~_SEGMENT_MASK;
782 	*gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT;
783 	return 0;
784 }
785 
/**
 * gmap_ucas_translate() - Translate a vcpu address into a host gmap address
 * @mc: The memory cache to be used for allocations.
 * @gmap: The per-cpu gmap.
 * @gaddr: Pointer to the address to be translated, will get overwritten with
 *         the translated address in case of success.
 *
 * Translates the per-vCPU guest address into a fake guest address, which can
 * then be used with the fake memslots that are identity mapping userspace.
 * This allows ucontrol VMs to use the normal fault resolution path, like
 * normal VMs.
 *
 * A first attempt is made with kvm->mmu_lock held for reading. If that
 * attempt finds a segment entry without a page table, the lookup is repeated
 * with the lock held for writing and the missing mapping is created; this is
 * retried, topping up @mc in between, for as long as memory is the only
 * problem.
 *
 * Return: %0 in case of success, %-EREMOTE if the address cannot be
 *         translated, or the error returned by kvm_s390_mmu_cache_topup().
 */
int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr)
{
	gpa_t translated_address;
	union crste *crstep;
	gfn_t gfn;
	int rc;

	/* Remember the original gfn; *gaddr may be rewritten by the helper */
	gfn = gpa_to_gfn(*gaddr);

	/* Fast path: a result <= 0 is final, only > 0 needs the write lock */
	scoped_guard(read_lock, &gmap->kvm->mmu_lock) {
		rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
		if (rc <= 0)
			return rc;
	}
	do {
		scoped_guard(write_lock, &gmap->kvm->mmu_lock) {
			/* Look again: things may have changed while the lock was dropped */
			rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
			if (rc <= 0)
				return rc;
			/* Offset within the segment from *gaddr, segment address from the entry */
			translated_address = (*gaddr & ~_SEGMENT_MASK) |
					     (crstep->val & _SEGMENT_MASK);
			rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true);
		}
		if (!rc) {
			*gaddr = translated_address;
			return 0;
		}
		if (rc != -ENOMEM)
			return -EREMOTE;
		/* Out of memory: refill the cache outside of the lock and retry */
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			return rc;
	} while (1);
	return 0;
}
834 
835 int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
836 {
837 	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
838 	int rc = 0;
839 
840 	mc = kvm_s390_new_mmu_cache();
841 	if (!mc)
842 		return -ENOMEM;
843 
844 	while (count) {
845 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
846 			rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false);
847 		if (rc == -ENOMEM) {
848 			rc = kvm_s390_mmu_cache_topup(mc);
849 			if (rc)
850 				return rc;
851 			continue;
852 		}
853 		if (rc)
854 			return rc;
855 
856 		count--;
857 		c_gfn += _PAGE_ENTRIES;
858 		p_gfn += _PAGE_ENTRIES;
859 	}
860 	return rc;
861 }
862 
863 static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
864 {
865 	union crste *crstep;
866 	union pte *ptep;
867 	int rc;
868 
869 	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
870 	if (rc)
871 		return;
872 	while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
873 		;
874 }
875 
876 void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
877 {
878 	guard(read_lock)(&gmap->kvm->mmu_lock);
879 
880 	for ( ; count; count--, c_gfn += _PAGE_ENTRIES)
881 		gmap_ucas_unmap_one(gmap, c_gfn);
882 }
883 
884 static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
885 {
886 	struct gmap *gmap = walk->priv;
887 	union crste crste, newcrste;
888 
889 	crste = READ_ONCE(*crstep);
890 	newcrste = _CRSTE_EMPTY(crste.h.tt);
891 
892 	while (crste_leaf(crste)) {
893 		if (crste_prefix(crste))
894 			gmap_unmap_prefix(gmap, gfn, next);
895 		if (crste.s.fc1.vsie_notif)
896 			gmap_handle_vsie_unshadow_event(gmap, gfn);
897 		if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce))
898 			break;
899 		crste = READ_ONCE(*crstep);
900 	}
901 
902 	if (need_resched())
903 		return next;
904 
905 	return 0;
906 }
907 
908 void gmap_split_huge_pages(struct gmap *gmap)
909 {
910 	const struct dat_walk_ops ops = {
911 		.pmd_entry = _gmap_split_crste,
912 		.pud_entry = _gmap_split_crste,
913 	};
914 	gfn_t start = 0;
915 
916 	do {
917 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
918 			start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce,
919 						    &ops, DAT_WALK_IGN_HOLES, gmap);
920 		cond_resched();
921 	} while (start);
922 }
923 
924 static int _gmap_enable_skeys(struct gmap *gmap)
925 {
926 	gfn_t start = 0;
927 	int rc;
928 
929 	if (uses_skeys(gmap))
930 		return 0;
931 
932 	set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
933 	rc = gmap_helper_disable_cow_sharing();
934 	if (rc) {
935 		clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
936 		return rc;
937 	}
938 
939 	do {
940 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
941 			start = dat_reset_skeys(gmap->asce, start);
942 		cond_resched();
943 	} while (start);
944 	return 0;
945 }
946 
947 int gmap_enable_skeys(struct gmap *gmap)
948 {
949 	int rc;
950 
951 	mmap_write_lock(gmap->kvm->mm);
952 	rc = _gmap_enable_skeys(gmap);
953 	mmap_write_unlock(gmap->kvm->mm);
954 	return rc;
955 }
956 
957 static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
958 {
959 	if (!ptep->s.pr)
960 		return 0;
961 	__kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));
962 	if (need_resched())
963 		return next;
964 	return 0;
965 }
966 
967 static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
968 {
969 	phys_addr_t origin, cur, end;
970 
971 	if (!crstep->h.fc || !crstep->s.fc1.pr)
972 		return 0;
973 
974 	origin = crste_origin_large(*crstep);
975 	cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
976 	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
977 	for ( ; cur < end; cur += PAGE_SIZE)
978 		__kvm_s390_pv_destroy_page(phys_to_page(cur));
979 	if (need_resched())
980 		return next;
981 	return 0;
982 }
983 
984 int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
985 {
986 	const struct dat_walk_ops ops = {
987 		.pte_entry = _destroy_pages_pte,
988 		.pmd_entry = _destroy_pages_crste,
989 		.pud_entry = _destroy_pages_crste,
990 	};
991 
992 	do {
993 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
994 			start = _dat_walk_gfn_range(start, end, gmap->asce, &ops,
995 						    DAT_WALK_IGN_HOLES, NULL);
996 		if (interruptible && fatal_signal_pending(current))
997 			return -EINTR;
998 		cond_resched();
999 	} while (start && start < end);
1000 	return 0;
1001 }
1002 
1003 int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
1004 {
1005 	struct vsie_rmap *rmap __free(kvfree) = NULL;
1006 	struct vsie_rmap *temp;
1007 	void __rcu **slot;
1008 	int rc = 0;
1009 
1010 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1011 	lockdep_assert_held(&sg->host_to_rmap_lock);
1012 
1013 	rmap = kzalloc_obj(*rmap, GFP_ATOMIC);
1014 	if (!rmap)
1015 		return -ENOMEM;
1016 
1017 	rmap->r_gfn = r_gfn;
1018 	rmap->level = level;
1019 	slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
1020 	if (slot) {
1021 		rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
1022 		for (temp = rmap->next; temp; temp = temp->next) {
1023 			if (temp->val == rmap->val)
1024 				return 0;
1025 		}
1026 		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1027 	} else {
1028 		rmap->next = NULL;
1029 		rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
1030 		if (rc)
1031 			return rc;
1032 	}
1033 	rmap = NULL;
1034 
1035 	return 0;
1036 }
1037 
1038 int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
1039 		      kvm_pfn_t pfn, int level, bool wr)
1040 {
1041 	unsigned long bitmask;
1042 	union crste *crstep;
1043 	union pgste pgste;
1044 	union pte *ptep;
1045 	union pte pte;
1046 	int flags, rc;
1047 
1048 	if (KVM_BUG_ON(!is_shadow(sg) || level <= TABLE_TYPE_PAGE_TABLE, sg->kvm))
1049 		return -EINVAL;
1050 	lockdep_assert_held(&sg->parent->children_lock);
1051 
1052 	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
1053 	rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags,
1054 			    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
1055 	if (rc)
1056 		return rc;
1057 	if (level <= TABLE_TYPE_REGION1) {
1058 		bitmask = -1UL << (8 + 11 * level);
1059 		scoped_guard(spinlock, &sg->host_to_rmap_lock)
1060 			rc = gmap_insert_rmap(sg, p_gfn, r_gfn & bitmask, level);
1061 	}
1062 	if (rc)
1063 		return rc;
1064 
1065 	if (!pgste_get_trylock(ptep, &pgste))
1066 		return -EAGAIN;
1067 	pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false);
1068 	pte.h.p = 1;
1069 	pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
1070 	pgste.vsie_notif = 1;
1071 	pgste_set_unlock(ptep, pgste);
1072 
1073 	return 0;
1074 }
1075 
1076 static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
1077 {
1078 	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);
1079 	if (need_resched())
1080 		return next;
1081 	return 0;
1082 }
1083 
1084 void gmap_set_cmma_all_dirty(struct gmap *gmap)
1085 {
1086 	const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
1087 	gfn_t gfn = 0;
1088 
1089 	do {
1090 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
1091 			gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
1092 						  DAT_WALK_IGN_HOLES, NULL);
1093 		cond_resched();
1094 	} while (gfn);
1095 }
1096 
1097 static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
1098 {
1099 	unsigned long align = PAGE_SIZE;
1100 	gpa_t gaddr = gfn_to_gpa(r_gfn);
1101 	union crste *crstep;
1102 	union crste crste;
1103 	union pte *ptep;
1104 
1105 	if (level > TABLE_TYPE_PAGE_TABLE)
1106 		align = 1UL << (11 * level + _SEGMENT_SHIFT);
1107 	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align));
1108 	sg->invalidated = true;
1109 	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
1110 		return;
1111 	if (ptep) {
1112 		if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
1113 			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
1114 		return;
1115 	}
1116 
1117 	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
1118 	if (crste_leaf(crste) || crste.h.i)
1119 		return;
1120 	if (is_pmd(crste))
1121 		dat_free_pt(dereference_pmd(crste.pmd));
1122 	else
1123 		dat_free_level(dereference_crste(crste), true);
1124 }
1125 
1126 static void gmap_unshadow(struct gmap *sg)
1127 {
1128 	struct gmap_cache *gmap_cache, *next;
1129 
1130 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1131 	KVM_BUG_ON(!sg->parent, sg->kvm);
1132 
1133 	lockdep_assert_held(&sg->parent->children_lock);
1134 
1135 	gmap_remove_child(sg);
1136 	kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);
1137 
1138 	list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
1139 		gmap_cache->gmap = NULL;
1140 		list_del(&gmap_cache->list);
1141 	}
1142 
1143 	gmap_put(sg);
1144 }
1145 
1146 void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
1147 {
1148 	struct vsie_rmap *rmap, *rnext, *head;
1149 	struct gmap *sg, *next;
1150 	gfn_t start, end;
1151 
1152 	list_for_each_entry_safe(sg, next, &parent->children, list) {
1153 		start = sg->guest_asce.rsto;
1154 		end = start + sg->guest_asce.tl + 1;
1155 		if (!sg->guest_asce.r && gfn >= start && gfn < end) {
1156 			gmap_unshadow(sg);
1157 			continue;
1158 		}
1159 		scoped_guard(spinlock, &sg->host_to_rmap_lock)
1160 			head = radix_tree_delete(&sg->host_to_rmap, gfn);
1161 		gmap_for_each_rmap_safe(rmap, rnext, head) {
1162 			gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
1163 			kfree(rmap);
1164 		}
1165 	}
1166 }
1167 
1168 /**
1169  * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
1170  * @parent: Pointer to the parent gmap.
1171  * @asce: ASCE for which the shadow table is created.
1172  * @edat_level: Edat level to be used for the shadow translation.
1173  *
1174  * Context: Called with parent->children_lock held.
1175  *
1176  * Return: The pointer to a gmap if a shadow table with the given asce is
1177  * already available, ERR_PTR(-EAGAIN) if another one is just being created,
1178  * otherwise NULL.
1179  */
1180 static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level)
1181 {
1182 	struct gmap *sg;
1183 
1184 	lockdep_assert_held(&parent->children_lock);
1185 	list_for_each_entry(sg, &parent->children, list) {
1186 		if (!gmap_is_shadow_valid(sg, asce, edat_level))
1187 			continue;
1188 		return sg;
1189 	}
1190 	return NULL;
1191 }
1192 
/* Number of pages occupied by one CRST (region/segment) table. */
#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE)
/*
 * State carried through the steps that protect the pages of a guest
 * top-level table (see gmap_protect_asce_top_level()).
 */
struct gmap_protect_asce_top_level {
	/* Snapshot of kvm->mmu_invalidate_seq, taken before the pages are faulted in. */
	unsigned long seq;
	/* Fault-in state, one slot per page of the table; unused slots stay invalid. */
	struct guest_fault f[CRST_TABLE_PAGES];
};
1198 
/*
 * Locked part of protecting the guest top-level table of shadow gmap @sg.
 *
 * Runs with kvm->mmu_lock held for writing and, nested inside it, the
 * parent's children_lock. Every valid page in @context is write-protected in
 * the parent via gmap_protect_rmap(), then @sg is added to the parent's list
 * of children and the fault-in array is released.
 *
 * Return: 0 on success, -EAGAIN if the faulted-in pages need to be retried or
 * @sg lost its parent in the meantime, or the error returned by
 * gmap_protect_rmap(). On error the fault-in array is NOT released here.
 */
static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
						struct gmap_protect_asce_top_level *context)
{
	struct gmap *parent;
	int rc, i;

	/* Held until the function returns, on every path. */
	guard(write_lock)(&sg->kvm->mmu_lock);

	/* Presumably detects an invalidation since context->seq was sampled. */
	if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
		return -EAGAIN;

	parent = READ_ONCE(sg->parent);
	if (!parent)
		return -EAGAIN;
	scoped_guard(spinlock, &parent->children_lock) {
		/* Recheck under the lock that the parent did not change. */
		if (READ_ONCE(sg->parent) != parent)
			return -EAGAIN;
		sg->invalidated = false;
		for (i = 0; i < CRST_TABLE_PAGES; i++) {
			if (!context->f[i].valid)
				continue;
			/*
			 * The level is above TABLE_TYPE_REGION1, so
			 * gmap_protect_rmap() only protects the page and does
			 * not record an rmap.
			 */
			rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn,
					       TABLE_TYPE_REGION1 + 1, context->f[i].writable);
			if (rc)
				return rc;
		}
		gmap_add_child(sg->parent, sg);
	}

	kvm_s390_release_faultin_array(sg->kvm, context->f, false);
	return 0;
}
1231 
1232 static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
1233 					       struct gmap_protect_asce_top_level *context)
1234 {
1235 	int rc;
1236 
1237 	if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f))
1238 		return -EAGAIN;
1239 	do {
1240 		rc = kvm_s390_mmu_cache_topup(mc);
1241 		if (rc)
1242 			return rc;
1243 		rc = radix_tree_preload(GFP_KERNEL);
1244 		if (rc)
1245 			return rc;
1246 		rc = __gmap_protect_asce_top_level(mc, sg, context);
1247 		radix_tree_preload_end();
1248 	} while (rc == -ENOMEM);
1249 
1250 	return rc;
1251 }
1252 
1253 static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg)
1254 {
1255 	struct gmap_protect_asce_top_level context = {};
1256 	union asce asce = sg->guest_asce;
1257 	int rc;
1258 
1259 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1260 
1261 	context.seq = sg->kvm->mmu_invalidate_seq;
1262 	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
1263 	smp_rmb();
1264 
1265 	rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false);
1266 	if (rc > 0)
1267 		rc = -EFAULT;
1268 	if (!rc)
1269 		rc = _gmap_protect_asce_top_level(mc, sg, &context);
1270 	if (rc)
1271 		kvm_s390_release_faultin_array(sg->kvm, context.f, true);
1272 	return rc;
1273 }
1274 
/**
 * gmap_create_shadow() - Create/find a shadow guest address space.
 * @mc: The cache to use to allocate dat tables.
 * @parent: Pointer to the parent gmap.
 * @asce: ASCE for which the shadow table is created.
 * @edat_level: Edat level to be used for the shadow translation.
 *
 * The pages of the top level page table referred by the asce parameter
 * will be set to read-only and marked in the PGSTEs of the kvm process.
 * The shadow table will be removed automatically on any change to the
 * PTE mapping for the source table.
 *
 * For a real-space ASCE there is no top level table to protect; any existing
 * real-space shadow of @parent is unshadowed and replaced by the new one.
 *
 * The returned shadow gmap will be returned with one extra reference.
 *
 * Return: A guest address space structure, ERR_PTR(-EINVAL) if @parent is
 * NULL, ERR_PTR(-ENOMEM) if out of memory, ERR_PTR(-EAGAIN) if the caller has
 * to retry and ERR_PTR(-EFAULT) if the parent gmap table could not be
 * protected. Any other error from gmap_protect_asce_top_level() is passed on
 * as an ERR_PTR() as well.
 */
struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
				union asce asce, int edat_level)
{
	struct gmap *sg, *new;
	int rc;

	if (WARN_ON(!parent))
		return ERR_PTR(-EINVAL);

	/* Fast path: reuse an existing matching shadow. */
	scoped_guard(spinlock, &parent->children_lock) {
		sg = gmap_find_shadow(parent, asce, edat_level);
		if (sg) {
			gmap_get(sg);
			return sg;
		}
	}
	/* Create a new shadow gmap. */
	new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
	if (!new)
		return ERR_PTR(-ENOMEM);
	new->guest_asce = asce;
	new->edat_level = edat_level;
	set_bit(GMAP_FLAG_SHADOW, &new->flags);

	scoped_guard(spinlock, &parent->children_lock) {
		/* Recheck if another CPU created the same shadow. */
		sg = gmap_find_shadow(parent, asce, edat_level);
		if (sg) {
			/* Lost the race: discard our copy and use the existing one. */
			gmap_put(new);
			gmap_get(sg);
			return sg;
		}
		if (asce.r) {
			/* Only allow one real-space gmap shadow. */
			list_for_each_entry(sg, &parent->children, list) {
				if (sg->guest_asce.r) {
					/*
					 * NOTE(review): mmu_lock is taken here while
					 * children_lock is held, whereas
					 * __gmap_protect_asce_top_level() takes
					 * mmu_lock first and children_lock second.
					 * Looks like a lock order inversion --
					 * confirm against the intended lock hierarchy.
					 */
					scoped_guard(write_lock, &parent->kvm->mmu_lock)
						gmap_unshadow(sg);
					/* The list was modified; stop iterating. */
					break;
				}
			}
			gmap_add_child(parent, new);
			/* Nothing to protect, return right away. */
			gmap_get(new);
			return new;
		}
	}

	/* Extra reference for the caller, see the function description. */
	gmap_get(new);
	new->parent = parent;
	/* Protect while inserting, protects against invalidation races. */
	rc = gmap_protect_asce_top_level(mc, new);
	if (rc) {
		new->parent = NULL;
		/* Drop both the caller's reference and, presumably, the initial one from gmap_new(). */
		gmap_put(new);
		gmap_put(new);
		return ERR_PTR(rc);
	}
	return new;
}
1353