xref: /linux/arch/s390/kvm/gmap.c (revision 8934827db5403eae57d4537114a9ff88b0a8460f)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Guest memory management for KVM/s390
4  *
5  * Copyright IBM Corp. 2008, 2020, 2024
6  *
7  *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
8  *               Martin Schwidefsky <schwidefsky@de.ibm.com>
9  *               David Hildenbrand <david@redhat.com>
10  *               Janosch Frank <frankja@linux.ibm.com>
11  */
12 
13 #include <linux/compiler.h>
14 #include <linux/kvm.h>
15 #include <linux/kvm_host.h>
16 #include <linux/pgtable.h>
17 #include <linux/pagemap.h>
18 #include <asm/lowcore.h>
19 #include <asm/uv.h>
20 #include <asm/gmap_helpers.h>
21 
22 #include "dat.h"
23 #include "gmap.h"
24 #include "kvm-s390.h"
25 #include "faultin.h"
26 
kvm_s390_is_in_sie(struct kvm_vcpu * vcpu)27 static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
28 {
29 	return vcpu->arch.sie_block->prog0c & PROG_IN_SIE;
30 }
31 
/*
 * Select the top-level table type for an address space limit.
 *
 * The limit is compared against the region sizes converted to page frames.
 * A limit of 0, like any limit above _REGION1_SIZE, selects a region-first
 * table.
 */
static int gmap_limit_to_type(gfn_t limit)
{
	int type = TABLE_TYPE_REGION1;

	if (limit) {
		if (limit <= _REGION3_SIZE >> PAGE_SHIFT)
			type = TABLE_TYPE_SEGMENT;
		else if (limit <= _REGION2_SIZE >> PAGE_SHIFT)
			type = TABLE_TYPE_REGION3;
		else if (limit <= _REGION1_SIZE >> PAGE_SHIFT)
			type = TABLE_TYPE_REGION2;
	}
	return type;
}
44 
45 /**
46  * gmap_new() - Allocate and initialize a guest address space.
47  * @kvm: The kvm owning the guest.
48  * @limit: Maximum address of the gmap address space.
49  *
50  * Return: A guest address space structure.
51  */
gmap_new(struct kvm * kvm,gfn_t limit)52 struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
53 {
54 	struct crst_table *table;
55 	struct gmap *gmap;
56 	int type;
57 
58 	type = gmap_limit_to_type(limit);
59 
60 	gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
61 	if (!gmap)
62 		return NULL;
63 	INIT_LIST_HEAD(&gmap->children);
64 	INIT_LIST_HEAD(&gmap->list);
65 	INIT_LIST_HEAD(&gmap->scb_users);
66 	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
67 	spin_lock_init(&gmap->children_lock);
68 	spin_lock_init(&gmap->host_to_rmap_lock);
69 	refcount_set(&gmap->refcount, 1);
70 
71 	table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
72 	if (!table) {
73 		kfree(gmap);
74 		return NULL;
75 	}
76 
77 	gmap->asce.val = __pa(table);
78 	gmap->asce.dt = type;
79 	gmap->asce.tl = _ASCE_TABLE_LENGTH;
80 	gmap->asce.x = 1;
81 	gmap->asce.p = 1;
82 	gmap->asce.s = 1;
83 	gmap->kvm = kvm;
84 	set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
85 
86 	return gmap;
87 }
88 
gmap_add_child(struct gmap * parent,struct gmap * child)89 static void gmap_add_child(struct gmap *parent, struct gmap *child)
90 {
91 	KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
92 	KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
93 	KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
94 	lockdep_assert_held(&parent->children_lock);
95 
96 	child->parent = parent;
97 
98 	if (is_ucontrol(parent))
99 		set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
100 	else
101 		clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
102 
103 	if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
104 		set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
105 	else
106 		clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
107 
108 	if (kvm_is_ucontrol(parent->kvm))
109 		clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
110 	list_add(&child->list, &parent->children);
111 }
112 
/*
 * Allocate a new gmap with the given limit and attach it to @parent.
 * Takes the parent's children_lock, which therefore must not be held by
 * the caller. Returns the new gmap, or NULL if gmap_new() failed.
 */
struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
{
	struct gmap *child;

	lockdep_assert_not_held(&parent->children_lock);

	child = gmap_new(parent->kvm, limit);
	if (!child)
		return NULL;

	scoped_guard(spinlock, &parent->children_lock)
		gmap_add_child(parent, child);

	return child;
}
125 
/*
 * Change the address space limit of a gmap.
 *
 * The table type matching @limit is computed and handed to
 * dat_set_asce_limit(), which is called with the kvm mmu_lock held for
 * writing. The call is retried, after refilling the memory cache, for as
 * long as it returns -ENOMEM.
 *
 * Return: 0 on success, -ENOMEM if the memory cache could not be allocated,
 *         or the error returned by kvm_s390_mmu_cache_topup().
 */
int gmap_set_limit(struct gmap *gmap, gfn_t limit)
{
	struct kvm_s390_mmu_cache *mc;
	int rc, type;

	type = gmap_limit_to_type(limit);

	mc = kvm_s390_new_mmu_cache();
	if (!mc)
		return -ENOMEM;

	for (;;) {
		rc = kvm_s390_mmu_cache_topup(mc);
		/* Report the topup failure, but fall through to free the cache */
		if (rc)
			break;
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			rc = dat_set_asce_limit(mc, &gmap->asce, type);
		if (rc != -ENOMEM) {
			/*
			 * NOTE(review): any result other than -ENOMEM has always
			 * been reported as success here; confirm that
			 * dat_set_asce_limit() cannot fail in other ways.
			 */
			rc = 0;
			break;
		}
	}

	kvm_s390_free_mmu_cache(mc);
	return rc;
}
148 
gmap_rmap_radix_tree_free(struct radix_tree_root * root)149 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
150 {
151 	struct vsie_rmap *rmap, *rnext, *head;
152 	struct radix_tree_iter iter;
153 	unsigned long indices[16];
154 	unsigned long index;
155 	void __rcu **slot;
156 	int i, nr;
157 
158 	/* A radix tree is freed by deleting all of its entries */
159 	index = 0;
160 	do {
161 		nr = 0;
162 		radix_tree_for_each_slot(slot, root, &iter, index) {
163 			indices[nr] = iter.index;
164 			if (++nr == 16)
165 				break;
166 		}
167 		for (i = 0; i < nr; i++) {
168 			index = indices[i];
169 			head = radix_tree_delete(root, index);
170 			gmap_for_each_rmap_safe(rmap, rnext, head)
171 				kfree(rmap);
172 		}
173 	} while (nr > 0);
174 }
175 
gmap_remove_child(struct gmap * child)176 void gmap_remove_child(struct gmap *child)
177 {
178 	if (KVM_BUG_ON(!child->parent, child->kvm))
179 		return;
180 	lockdep_assert_held(&child->parent->children_lock);
181 
182 	list_del(&child->list);
183 	child->parent = NULL;
184 }
185 
186 /**
187  * gmap_dispose() - Remove and free a guest address space and its children.
188  * @gmap: Pointer to the guest address space structure.
189  */
gmap_dispose(struct gmap * gmap)190 void gmap_dispose(struct gmap *gmap)
191 {
192 	/* The gmap must have been removed from the parent beforehands */
193 	KVM_BUG_ON(gmap->parent, gmap->kvm);
194 	/* All children of this gmap must have been removed beforehands */
195 	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
196 	/* No VSIE shadow block is allowed to use this gmap */
197 	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
198 	/* The ASCE must be valid */
199 	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
200 	/* The refcount must be 0 */
201 	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);
202 
203 	/* Flush tlb of all gmaps */
204 	asce_flush_tlb(gmap->asce);
205 
206 	/* Free all DAT tables. */
207 	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));
208 
209 	/* Free additional data for a shadow gmap */
210 	if (is_shadow(gmap))
211 		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
212 
213 	kfree(gmap);
214 }
215 
216 /**
217  * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
218  * @gmap: The gmap whose ASCE needs to be replaced.
219  *
220  * If the ASCE is a SEGMENT type then this function will return -EINVAL,
221  * otherwise the pointers in the host_to_guest radix tree will keep pointing
222  * to the wrong pages, causing use-after-free and memory corruption.
223  * If the allocation of the new top level page table fails, the ASCE is not
224  * replaced.
225  * In any case, the old ASCE is always removed from the gmap CRST list.
226  * Therefore the caller has to make sure to save a pointer to it
227  * beforehand, unless a leak is actually intended.
228  *
229  * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE,
230  *         -ENOMEM if runinng out of memory.
231  */
s390_replace_asce(struct gmap * gmap)232 int s390_replace_asce(struct gmap *gmap)
233 {
234 	struct crst_table *table;
235 	union asce asce;
236 
237 	/* Replacing segment type ASCEs would cause serious issues */
238 	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
239 		return -EINVAL;
240 
241 	table = dat_alloc_crst_sleepable(0);
242 	if (!table)
243 		return -ENOMEM;
244 	memcpy(table, dereference_asce(gmap->asce), sizeof(*table));
245 
246 	/* Set new table origin while preserving existing ASCE control bits */
247 	asce = gmap->asce;
248 	asce.rsto = virt_to_pfn(table);
249 	WRITE_ONCE(gmap->asce, asce);
250 
251 	return 0;
252 }
253 
/*
 * Request a prefix refresh on every vCPU whose prefix pages overlap the
 * range [gfn, end).
 *
 * Shadow gmaps are not handled and yield false. If @hint is set and an
 * affected vCPU is currently in SIE, the function stops and returns false.
 * Otherwise true is returned once all vCPUs have been looked at.
 */
bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
{
	struct kvm *kvm = gmap->kvm;
	struct kvm_vcpu *vcpu;
	unsigned long idx;
	gfn_t first;

	if (is_shadow(gmap))
		return false;

	kvm_for_each_vcpu(idx, vcpu, kvm) {
		first = gpa_to_gfn(kvm_s390_get_prefix(vcpu));

		/* Neither of the two prefix pages is inside the range */
		if (first >= end || gfn > first + 1)
			continue;

		if (hint && kvm_s390_is_in_sie(vcpu))
			return false;

		VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
			   gfn_to_gpa(gfn), gfn_to_gpa(end));
		kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
	}

	return true;
}
276 
/* Private data handed to the clear-young walk callbacks through walk->priv. */
struct clear_young_pte_priv {
	/* The gmap being walked */
	struct gmap *gmap;
	/* Set by the callbacks for every entry that passes their early-return checks */
	bool young;
};
281 
/*
 * PTE callback of gmap_age_gfn(): make a page table entry old.
 *
 * Entries that are not present, or that are already old and invalid, are
 * skipped. For all other entries the young flag in the private data is
 * set, even when the entry itself is left untouched because of a prefix
 * notification that could not be resolved.
 *
 * Always returns 0.
 */
static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *p = walk->priv;
	union pgste pgste;
	union pte pte, new;

	/* Snapshot taken before the PGSTE lock is acquired */
	pte = READ_ONCE(*ptep);

	if (!pte.s.pr || (!pte.s.y && pte.h.i))
		return 0;

	pgste = pgste_get_lock(ptep);
	/* Proceed if there is no prefix notification, or if gmap_mkold_prefix() allows it */
	if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
		new = pte;
		new.h.i = 1;
		new.s.y = 0;
		/*
		 * The dirty and protection bits are about to be reset, so
		 * propagate the dirty state to the folio first, unless s.s is set.
		 * NOTE(review): presumably s.s marks a special mapping without a
		 * folio to dirty -- confirm against the pte definition.
		 */
		if ((new.s.d || !new.h.p) && !new.s.s)
			folio_set_dirty(pfn_folio(pte.h.pfra));
		new.s.d = 0;
		new.h.p = 1;

		pgste.prefix_notif = 0;
		pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
	}
	p->young = 1;
	pgste_set_unlock(ptep, pgste);
	return 0;
}
310 
/*
 * CRSTE callback of gmap_age_gfn(): make a large (leaf) entry old.
 *
 * Non-leaf entries and leaf entries that are already old and invalid are
 * skipped. For every other leaf the young flag in the private data is set,
 * even if the entry is left as it is because gmap_mkold_prefix() refused.
 *
 * Always returns 0.
 */
static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *p = walk->priv;
	union crste cur = READ_ONCE(*crstep);
	union crste aged;

	if (!cur.h.fc || (!cur.s.fc1.y && cur.h.i))
		return 0;

	if (!crste_prefix(cur) || gmap_mkold_prefix(p->gmap, gfn, end)) {
		aged = cur;
		aged.h.i = 1;
		aged.s.fc1.y = 0;
		aged.s.fc1.prefix_notif = 0;
		/* Hand the dirty state over to the folio before it is reset */
		if (aged.s.fc1.d || !aged.h.p)
			folio_set_dirty(phys_to_folio(crste_origin_large(cur)));
		aged.s.fc1.d = 0;
		aged.h.p = 1;
		dat_crstep_xchg(crstep, aged, gfn, walk->asce);
	}

	p->young = 1;
	return 0;
}
336 
337 /**
338  * gmap_age_gfn() - Clear young.
339  * @gmap: The guest gmap.
340  * @start: The first gfn to test.
341  * @end: The gfn after the last one to test.
342  *
343  * Context: Called with the kvm mmu write lock held.
344  * Return: 1 if any page in the given range was young, otherwise 0.
345  */
gmap_age_gfn(struct gmap * gmap,gfn_t start,gfn_t end)346 bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
347 {
348 	const struct dat_walk_ops ops = {
349 		.pte_entry = gmap_clear_young_pte,
350 		.pmd_entry = gmap_clear_young_crste,
351 		.pud_entry = gmap_clear_young_crste,
352 	};
353 	struct clear_young_pte_priv priv = {
354 		.gmap = gmap,
355 		.young = false,
356 	};
357 
358 	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
359 
360 	return priv.young;
361 }
362 
/* Private data handed to the unmap walk callbacks through walk->priv. */
struct gmap_unmap_priv {
	/* The gmap being unmapped from */
	struct gmap *gmap;
	/* The memslot used to translate a gfn into a host virtual address */
	struct kvm_memory_slot *slot;
};
367 
/*
 * PTE callback of gmap_unmap_gfn_range(): replace the entry with an empty
 * one while holding the PGSTE lock.
 *
 * For a present entry whose PGSTE usage state is "unused", the matching
 * host address is passed to gmap_helper_try_set_pte_unused(). If the gmap
 * has GMAP_FLAG_EXPORT_ON_UNMAP set, the folio of a present entry is
 * converted from secure after the lock has been dropped.
 *
 * Always returns 0.
 */
static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
{
	struct gmap_unmap_priv *priv = w->priv;
	struct gmap *gmap = priv->gmap;
	struct folio *to_export = NULL;
	unsigned long hva;
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	if (ptep->s.pr) {
		if (pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
			hva = __gfn_to_hva_memslot(priv->slot, gfn);
			gmap_helper_try_set_pte_unused(gmap->kvm->mm, hva);
		}
		if (test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &gmap->flags))
			to_export = pfn_folio(ptep->h.pfra);
	}
	pgste = gmap_ptep_xchg(gmap, ptep, _PTE_EMPTY, pgste, gfn);
	pgste_set_unlock(ptep, pgste);

	if (to_export)
		uv_convert_from_secure_folio(to_export);

	return 0;
}
389 
/*
 * CRSTE callback of gmap_unmap_gfn_range(): replace a large (leaf) entry
 * with an empty entry of the same table type. Non-leaf entries are left
 * alone. If the gmap has GMAP_FLAG_EXPORT_ON_UNMAP set, the folio of a
 * present entry is converted from secure afterwards.
 *
 * Always returns 0.
 */
static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct gmap_unmap_priv *priv = walk->priv;
	struct gmap *gmap = priv->gmap;
	struct folio *to_export = NULL;

	if (!crstep->h.fc)
		return 0;

	if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &gmap->flags))
		to_export = phys_to_folio(crste_origin_large(*crstep));

	gmap_crstep_xchg(gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn);

	if (to_export)
		uv_convert_from_secure_folio(to_export);

	return 0;
}
405 
406 /**
407  * gmap_unmap_gfn_range() - Unmap a range of guest addresses.
408  * @gmap: The gmap to act on.
409  * @slot: The memslot in which the range is located.
410  * @start: The first gfn to unmap.
411  * @end: The gfn after the last one to unmap.
412  *
413  * Context: Called with the kvm mmu write lock held.
414  * Return: false
415  */
gmap_unmap_gfn_range(struct gmap * gmap,struct kvm_memory_slot * slot,gfn_t start,gfn_t end)416 bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
417 {
418 	const struct dat_walk_ops ops = {
419 		.pte_entry = _gmap_unmap_pte,
420 		.pmd_entry = _gmap_unmap_crste,
421 		.pud_entry = _gmap_unmap_crste,
422 	};
423 	struct gmap_unmap_priv priv = {
424 		.gmap = gmap,
425 		.slot = slot,
426 	};
427 
428 	lockdep_assert_held_write(&gmap->kvm->mmu_lock);
429 
430 	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
431 	return false;
432 }
433 
/*
 * Transfer the soft-dirty state of one page table entry to the KVM dirty
 * log. The caller holds the PGSTE lock and passes in the current PGSTE
 * value; the possibly updated value is returned.
 *
 * Entries that are not present, or that are protected and not soft-dirty,
 * are left alone and not reported.
 */
static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
						  struct gmap *gmap)
{
	union pte cur = READ_ONCE(*ptep);
	bool clean = cur.h.p && !cur.s.sd;

	if (!cur.s.pr || clean)
		return pgste;

	/*
	 * Pages that hold the prefix of a currently running vCPU keep their
	 * protection state and simply stay dirty.
	 */
	if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
		cur.h.p = 1;
		cur.s.sd = 0;
		pgste = gmap_ptep_xchg(gmap, ptep, cur, pgste, gfn);
	}

	mark_page_dirty(gmap->kvm, gfn);

	return pgste;
}
456 
/*
 * PTE callback of gmap_sync_dirty_log(): run the soft-dirty handling for
 * one entry with its PGSTE locked. Always returns 0.
 */
static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
					  struct dat_walk *walk)
{
	union pgste locked = pgste_get_lock(ptep);

	locked = __pte_test_and_clear_softdirty(ptep, locked, gfn, walk->priv);
	pgste_set_unlock(ptep, locked);

	return 0;
}
468 
/*
 * CRSTE callback of gmap_sync_dirty_log(): transfer the soft-dirty state
 * of a large (leaf) entry to the KVM dirty log, one gfn at a time.
 *
 * Returns 1 if a fatal signal is pending for the current task, 0 otherwise.
 */
static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
					    struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union crste cur, prot;
	gfn_t g;

	if (fatal_signal_pending(current))
		return 1;

	cur = READ_ONCE(*table);
	/* Skip non-leaf entries, and leaves that are protected and not soft-dirty */
	if (!cur.h.fc || (cur.h.p && !cur.s.fc1.sd))
		return 0;

	/*
	 * Large pages that hold the prefix of a currently running vCPU keep
	 * their protection state and simply stay dirty.
	 */
	if (!cur.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) {
		prot = cur;
		prot.h.p = 1;
		prot.s.fc1.sd = 0;
		gmap_crstep_xchg(gmap, table, prot, gfn);
	}

	for (g = gfn; g < end; g++)
		mark_page_dirty(gmap->kvm, g);

	return 0;
}
500 
/*
 * Walk the range [start, end) of the gmap and transfer the soft-dirty
 * state of its entries to the KVM dirty log. The kvm mmu_lock must be held.
 */
void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops softdirty_ops = {
		.pte_entry = _pte_test_and_clear_softdirty,
		.pmd_entry = _crste_test_and_clear_softdirty,
		.pud_entry = _crste_test_and_clear_softdirty,
	};

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	_dat_walk_gfn_range(start, end, gmap->asce, &softdirty_ops, 0, gmap);
}
513 
/*
 * Try to resolve a minor fault on a large (leaf) CRSTE.
 *
 * On the way, f->pfn and f->writable are filled in from the entry, and
 * f->write_attempt may be upgraded if the entry is writable and already
 * dirty.
 *
 * Returns 0 if the fault has been resolved (or needed no action), and a
 * non-zero value if the slow path has to deal with it.
 */
static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
{
	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);

	/* Somehow the crste is not large anymore, let the slow path deal with it. */
	if (!oldcrste.h.fc)
		return 1;

	f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
	f->writable = oldcrste.s.fc1.w;

	/* Appropriate permissions already (race with another handler), nothing to do. */
	if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
		return 0;

	if (!f->write_attempt || oldcrste.s.fc1.w) {
		/* A writable entry that is already dirty is handled like a write */
		f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d;
		newcrste = oldcrste;
		/* Make the entry valid and young */
		newcrste.h.i = 0;
		newcrste.s.fc1.y = 1;
		if (f->write_attempt) {
			/* Lift the protection and mark the entry dirty and soft-dirty */
			newcrste.h.p = 0;
			newcrste.s.fc1.d = 1;
			newcrste.s.fc1.sd = 1;
		}
		/* Propagate a clean -> dirty transition to the page */
		if (!oldcrste.s.fc1.d && newcrste.s.fc1.d)
			SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
		/* In case of races, let the slow path deal with it. */
		return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce);
	}
	/* Trying to write on a read-only page, let the slow path deal with it. */
	return 1;
}
547 
_gmap_handle_minor_pte_fault(struct gmap * gmap,union pgste * pgste,struct guest_fault * f)548 static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
549 					struct guest_fault *f)
550 {
551 	union pte newpte, oldpte = READ_ONCE(*f->ptep);
552 
553 	f->pfn = oldpte.h.pfra;
554 	f->writable = oldpte.s.w;
555 
556 	/* Appropriate permissions already (race with another handler), nothing to do. */
557 	if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
558 		return 0;
559 	/* Trying to write on a read-only page, let the slow path deal with it. */
560 	if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
561 		return 1;
562 
563 	newpte = oldpte;
564 	newpte.h.i = 0;
565 	newpte.s.y = 1;
566 	if (f->write_attempt) {
567 		newpte.h.p = 0;
568 		newpte.s.d = 1;
569 		newpte.s.sd = 1;
570 	}
571 	if (!oldpte.s.d && newpte.s.d)
572 		SetPageDirty(pfn_to_page(newpte.h.pfra));
573 	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
574 
575 	return 0;
576 }
577 
578 /**
579  * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault.
580  * @gmap: The gmap whose fault needs to be resolved.
581  * @fault: Describes the fault that is being resolved.
582  *
583  * A minor fault is a fault that can be resolved quickly within gmap.
584  * The page is already mapped, the fault is only due to dirty/young tracking.
585  *
586  * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
587  *         not be resolved and needs to go through the slow path.
588  */
gmap_try_fixup_minor(struct gmap * gmap,struct guest_fault * fault)589 int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
590 {
591 	union pgste pgste;
592 	int rc;
593 
594 	lockdep_assert_held(&gmap->kvm->mmu_lock);
595 
596 	rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
597 			    &fault->crstep, &fault->ptep);
598 	/* If a PTE or a leaf CRSTE could not be reached, slow path. */
599 	if (rc)
600 		return 1;
601 
602 	if (fault->ptep) {
603 		pgste = pgste_get_lock(fault->ptep);
604 		rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
605 		if (!rc && fault->callback)
606 			fault->callback(fault);
607 		pgste_set_unlock(fault->ptep, pgste);
608 	} else {
609 		rc = gmap_handle_minor_crste_fault(gmap->asce, fault);
610 		if (!rc && fault->callback)
611 			fault->callback(fault);
612 	}
613 	return rc;
614 }
615 
/* Tell whether a 2G mapping may be used for @gfn: never, neither argument is looked at. */
static inline bool gmap_2g_allowed(struct gmap *gmap, gfn_t gfn)
{
	return false;
}
620 
gmap_1m_allowed(struct gmap * gmap,gfn_t gfn)621 static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn)
622 {
623 	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags);
624 }
625 
gmap_link(struct kvm_s390_mmu_cache * mc,struct gmap * gmap,struct guest_fault * f)626 int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f)
627 {
628 	unsigned int order;
629 	int rc, level;
630 
631 	lockdep_assert_held(&gmap->kvm->mmu_lock);
632 
633 	level = TABLE_TYPE_PAGE_TABLE;
634 	if (f->page) {
635 		order = folio_order(page_folio(f->page));
636 		if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f->gfn))
637 			level = TABLE_TYPE_REGION3;
638 		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn))
639 			level = TABLE_TYPE_SEGMENT;
640 	}
641 	rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f);
642 	KVM_BUG_ON(rc == -EINVAL, gmap->kvm);
643 	return rc;
644 }
645 
/*
 * Map one segment of the parent gmap (at @p_gfn) into this gmap (at @c_gfn).
 *
 * The parent is walked first. With @force_alloc the walk goes down to the
 * page table level and allocates as needed; without it, the walk targets
 * the segment level with DAT_WALK_ALLOC_CONTINUE. If the walk yields a page
 * table, @p_gfn is recorded in it as PTVAL_VMADDR and the new segment entry
 * points to that page table. Otherwise an invalid segment entry is built
 * from @p_gfn. The entry is then installed in this gmap at @c_gfn.
 *
 * Returns 0 on success, or the error of the failing dat_entry_walk().
 */
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
	int walk_flags = force_alloc ? DAT_WALK_ALLOC : DAT_WALK_ALLOC_CONTINUE;
	int walk_level = force_alloc ? TABLE_TYPE_PAGE_TABLE : TABLE_TYPE_SEGMENT;
	struct page_table *pt;
	union crste *crstep;
	union crste entry;
	union pte *ptep;
	int rc;

	/* Look at the parent first */
	rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, walk_flags, walk_level,
			    &crstep, &ptep);
	if (rc)
		return rc;

	if (ptep) {
		pt = pte_table_start(ptep);
		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
		entry = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
	} else {
		entry = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
		entry.h.i = 1;
		entry.h.fc0.tl = 1;
	}

	/* Then install the entry in this gmap */
	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
			    &crstep, &ptep);
	if (rc)
		return rc;

	dat_crstep_xchg(crstep, entry, c_gfn, gmap->asce);
	return 0;
}
679 
/*
 * Translate *@gaddr using only what is already present in the gmap.
 *
 * Returns 0 if a page table was found, in which case the segment part of
 * *@gaddr has been replaced with the PTVAL_VMADDR value stored in that page
 * table. Returns 1 if no page table was found but the segment entry is a
 * ucas entry; *@crstepp then points to that entry and *@gaddr is unchanged.
 * Returns -EREMOTE in all other cases.
 */
static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
{
	union pte *ptep;

	if (dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
			   TABLE_TYPE_SEGMENT, crstepp, &ptep))
		return -EREMOTE;

	if (!ptep)
		return crste_is_ucas(**crstepp) ? 1 : -EREMOTE;

	*gaddr = (*gaddr & ~_SEGMENT_MASK) |
		 (dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT);
	return 0;
}
695 
696 /**
697  * gmap_ucas_translate() - Translate a vcpu address into a host gmap address
698  * @mc: The memory cache to be used for allocations.
699  * @gmap: The per-cpu gmap.
700  * @gaddr: Pointer to the address to be translated, will get overwritten with
701  *         the translated address in case of success.
702  * Translates the per-vCPU guest address into a fake guest address, which can
703  * then be used with the fake memslots that are identity mapping userspace.
704  * This allows ucontrol VMs to use the normal fault resolution path, like
705  * normal VMs.
706  *
707  * Return: %0 in case of success, otherwise %-EREMOTE.
708  */
gmap_ucas_translate(struct kvm_s390_mmu_cache * mc,struct gmap * gmap,gpa_t * gaddr)709 int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr)
710 {
711 	gpa_t translated_address;
712 	union crste *crstep;
713 	gfn_t gfn;
714 	int rc;
715 
716 	gfn = gpa_to_gfn(*gaddr);
717 
718 	scoped_guard(read_lock, &gmap->kvm->mmu_lock) {
719 		rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
720 		if (rc <= 0)
721 			return rc;
722 	}
723 	do {
724 		scoped_guard(write_lock, &gmap->kvm->mmu_lock) {
725 			rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
726 			if (rc <= 0)
727 				return rc;
728 			translated_address = (*gaddr & ~_SEGMENT_MASK) |
729 					     (crstep->val & _SEGMENT_MASK);
730 			rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true);
731 		}
732 		if (!rc) {
733 			*gaddr = translated_address;
734 			return 0;
735 		}
736 		if (rc != -ENOMEM)
737 			return -EREMOTE;
738 		rc = kvm_s390_mmu_cache_topup(mc);
739 		if (rc)
740 			return rc;
741 	} while (1);
742 	return 0;
743 }
744 
/*
 * Map @count consecutive segments of the parent gmap, starting at @p_gfn,
 * into this gmap, starting at @c_gfn.
 *
 * Each segment is mapped with the kvm mmu_lock held for writing. When the
 * mapping runs out of memory, the memory cache is refilled outside of the
 * lock and the same segment is tried again.
 *
 * Return: 0 on success (including @count == 0), -ENOMEM if the memory cache
 *         could not be allocated, otherwise the error returned by
 *         gmap_ucas_map_one() or kvm_s390_mmu_cache_topup().
 */
int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
{
	struct kvm_s390_mmu_cache *mc;
	/* Must be initialized: the loop body never runs if count is 0 */
	int rc = 0;

	mc = kvm_s390_new_mmu_cache();
	if (!mc)
		return -ENOMEM;

	while (count) {
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false);
		if (rc == -ENOMEM) {
			rc = kvm_s390_mmu_cache_topup(mc);
			if (rc)
				break;
			/* Retry the same segment with a refilled cache */
			continue;
		}
		if (rc)
			break;

		count--;
		c_gfn += _PAGE_ENTRIES;
		p_gfn += _PAGE_ENTRIES;
	}

	/* The cache is only needed in here, release it on every path */
	kvm_s390_free_mmu_cache(mc);
	return rc;
}
772 
/*
 * Replace the segment entry for @c_gfn with an empty one. Nothing happens
 * if the segment entry cannot be reached.
 */
static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
{
	union crste *crstep;
	union pte *ptep;

	if (dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep))
		return;

	dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce);
}
783 
/*
 * Unmap @count consecutive segments from the gmap, starting at @c_gfn.
 * The kvm mmu_lock is held for reading for the whole operation.
 *
 * NOTE(review): gmap_ucas_map() takes the mmu_lock for writing, while the
 * entries are replaced under the read lock here -- confirm this is intended.
 */
void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
{
	guard(read_lock)(&gmap->kvm->mmu_lock);

	while (count) {
		gmap_ucas_unmap_one(gmap, c_gfn);
		count--;
		c_gfn += _PAGE_ENTRIES;
	}
}
791 
/*
 * CRSTE callback of gmap_split_huge_pages(): replace a large (leaf) entry
 * with an empty entry of the same table type.
 *
 * Prefix and VSIE notifications attached to the entry are handled before
 * each attempt. The replacement is atomic and is retried for as long as it
 * fails and the entry is still a leaf.
 *
 * Returns @next if a reschedule is needed, 0 otherwise.
 */
static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union crste cur = READ_ONCE(*crstep);
	const union crste empty = _CRSTE_EMPTY(cur.h.tt);

	for ( ; crste_leaf(cur); cur = READ_ONCE(*crstep)) {
		if (crste_prefix(cur))
			gmap_unmap_prefix(gmap, gfn, next);
		if (cur.s.fc1.vsie_notif)
			gmap_handle_vsie_unshadow_event(gmap, gfn);
		if (dat_crstep_xchg_atomic(crstep, cur, empty, gfn, walk->asce))
			break;
	}

	return need_resched() ? next : 0;
}
815 
gmap_split_huge_pages(struct gmap * gmap)816 void gmap_split_huge_pages(struct gmap *gmap)
817 {
818 	const struct dat_walk_ops ops = {
819 		.pmd_entry = _gmap_split_crste,
820 		.pud_entry = _gmap_split_crste,
821 	};
822 	gfn_t start = 0;
823 
824 	do {
825 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
826 			start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce,
827 						    &ops, DAT_WALK_IGN_HOLES, gmap);
828 		cond_resched();
829 	} while (start);
830 }
831 
_gmap_enable_skeys(struct gmap * gmap)832 static int _gmap_enable_skeys(struct gmap *gmap)
833 {
834 	gfn_t start = 0;
835 	int rc;
836 
837 	if (uses_skeys(gmap))
838 		return 0;
839 
840 	set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
841 	rc = gmap_helper_disable_cow_sharing();
842 	if (rc) {
843 		clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
844 		return rc;
845 	}
846 
847 	do {
848 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
849 			start = dat_reset_skeys(gmap->asce, start);
850 		cond_resched();
851 	} while (start);
852 	return 0;
853 }
854 
gmap_enable_skeys(struct gmap * gmap)855 int gmap_enable_skeys(struct gmap *gmap)
856 {
857 	int rc;
858 
859 	mmap_write_lock(gmap->kvm->mm);
860 	rc = _gmap_enable_skeys(gmap);
861 	mmap_write_unlock(gmap->kvm->mm);
862 	return rc;
863 }
864 
/*
 * PTE callback of gmap_pv_destroy_range(): destroy the page behind a
 * present entry. Returns @next if a page was handled and a reschedule is
 * needed, 0 otherwise.
 */
static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	if (ptep->s.pr) {
		__kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));
		if (need_resched())
			return next;
	}
	return 0;
}
874 
/*
 * CRSTE callback of gmap_pv_destroy_range(): destroy the pages behind a
 * present large (leaf) entry, limited to the part of the entry that lies
 * within the range of the walk. Returns @next if pages were handled and a
 * reschedule is needed, 0 otherwise.
 */
static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	phys_addr_t base, pos, stop;

	if (!crstep->h.fc || !crstep->s.fc1.pr)
		return 0;

	base = crste_origin_large(*crstep);
	/* Clamp to the walk range, then turn gfn offsets into physical addresses */
	pos = base + ((max(gfn, walk->start) - gfn) << PAGE_SHIFT);
	stop = base + ((min(next, walk->end) - gfn) << PAGE_SHIFT);

	while (pos < stop) {
		__kvm_s390_pv_destroy_page(phys_to_page(pos));
		pos += PAGE_SIZE;
	}

	return need_resched() ? next : 0;
}
891 
/*
 * Destroy all pages mapped in the range [start, end) of the gmap.
 *
 * The range is walked with the kvm mmu_lock held for reading. Whenever the
 * walk stops early, the lock is dropped, the task may reschedule, and the
 * walk resumes from the gfn it returned.
 *
 * Returns 0 when done, or -EINTR if @interruptible is set and a fatal
 * signal is pending for the current task.
 */
int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
{
	const struct dat_walk_ops destroy_ops = {
		.pte_entry = _destroy_pages_pte,
		.pmd_entry = _destroy_pages_crste,
		.pud_entry = _destroy_pages_crste,
	};

	for (;;) {
		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
			start = _dat_walk_gfn_range(start, end, gmap->asce, &destroy_ops,
						    DAT_WALK_IGN_HOLES, NULL);
		if (interruptible && fatal_signal_pending(current))
			return -EINTR;
		cond_resched();

		/* Done once the walk completed or moved past the range */
		if (!start || start >= end)
			break;
	}

	return 0;
}
910 
/*
 * Record in the shadow gmap @sg that @r_gfn, at the given table level,
 * depends on the parent page @p_gfn.
 *
 * The new rmap is put at the head of the chain stored for @p_gfn in the
 * host_to_rmap radix tree. If the chain already has an rmap with the same
 * value, nothing is added. The host_to_rmap_lock of @sg must be held.
 *
 * Returns 0 on success (including the duplicate case), -ENOMEM if the rmap
 * could not be allocated, or the error of radix_tree_insert().
 */
int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
{
	/* Freed automatically on every return, unless set to NULL first */
	struct vsie_rmap *rmap __free(kvfree) = NULL;
	struct vsie_rmap *pos;
	void __rcu **slot;
	int rc;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	lockdep_assert_held(&sg->host_to_rmap_lock);

	rmap = kzalloc_obj(*rmap, GFP_ATOMIC);
	if (!rmap)
		return -ENOMEM;

	rmap->r_gfn = r_gfn;
	rmap->level = level;

	slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
	if (!slot) {
		/* First rmap for this parent page */
		rmap->next = NULL;
		rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
		if (rc)
			return rc;
	} else {
		/* Prepend to the existing chain, unless it is a duplicate */
		rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
		for (pos = rmap->next; pos; pos = pos->next) {
			if (pos->val == rmap->val)
				return 0;
		}
		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
	}

	/* The tree holds the rmap now, keep it from being freed */
	rmap = NULL;

	return 0;
}
945 
/*
 * Protect the parent page @p_gfn on behalf of the shadow gmap @sg and mark
 * it for VSIE notification.
 *
 * The parent's tables are walked down to the page table level. For levels
 * up to TABLE_TYPE_REGION1 an rmap from @p_gfn to @r_gfn is recorded first.
 * The PTE of the parent is then protected; if it is not present yet, it is
 * built from @pfn and @wr.
 *
 * The children_lock of the parent must be held.
 *
 * Returns 0 on success, -EAGAIN if the PGSTE lock could not be taken
 * without waiting, or the error of the table walk or of the rmap insertion.
 */
int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
		      kvm_pfn_t pfn, int level, bool wr)
{
	union crste *crstep;
	union pgste pgste;
	union pte *ptep;
	union pte pte;
	int flags, rc;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	lockdep_assert_held(&sg->parent->children_lock);

	/* Walk the parent, honoring its storage key setting */
	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
	rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags,
			    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	if (rc)
		return rc;
	if (level <= TABLE_TYPE_REGION1) {
		scoped_guard(spinlock, &sg->host_to_rmap_lock)
			rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level);
	}
	/* rc is still 0 here if no rmap had to be inserted */
	if (rc)
		return rc;

	/* Only try to lock; the caller sees -EAGAIN if the lock is contended */
	if (!pgste_get_trylock(ptep, &pgste))
		return -EAGAIN;
	/* Keep a present PTE, otherwise build a fresh one */
	pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false);
	pte.h.p = 1;
	pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
	pgste.vsie_notif = 1;
	pgste_set_unlock(ptep, pgste);

	return 0;
}
980 
/*
 * PTE callback for gmap_set_cmma_all_dirty(): atomically set the CMMA dirty
 * bit in the PGSTE belonging to @ptep. Returns @next when a reschedule is
 * pending and 0 otherwise; @gfn and @walk are unused.
 */
static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);

	return need_resched() ? next : 0;
}
988 
gmap_set_cmma_all_dirty(struct gmap * gmap)989 void gmap_set_cmma_all_dirty(struct gmap *gmap)
990 {
991 	const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
992 	gfn_t gfn = 0;
993 
994 	do {
995 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
996 			gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
997 						  DAT_WALK_IGN_HOLES, NULL);
998 		cond_resched();
999 	} while (gfn);
1000 }
1001 
/*
 * Remove one shadowed entry from the shadow gmap @sg.
 *
 * The vsie notifier is called for the naturally aligned range around @r_gfn
 * that an entry of @level covers (a single page for page-table level). The
 * entry found by walking sg's tables down to @level is then cleared. If a
 * cleared table entry was neither a leaf nor invalid, the lower-level table
 * it referenced is freed as well.
 */
static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
{
	gpa_t gaddr = gfn_to_gpa(r_gfn);
	unsigned long span;
	union crste *crstep;
	union crste old;
	union pte *ptep;

	if (level > TABLE_TYPE_PAGE_TABLE)
		span = 1UL << (11 * level + _SEGMENT_SHIFT);
	else
		span = PAGE_SIZE;

	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, span), ALIGN(gaddr + 1, span));

	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
		return;

	if (ptep) {
		/* Page-table level: only touch the PTE if it is not already empty. */
		if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
		return;
	}

	/* Snapshot the entry before clearing it, to know what it pointed to. */
	old = READ_ONCE(*crstep);
	dat_crstep_clear(crstep, r_gfn, sg->asce);

	if (!crste_leaf(old) && !old.h.i) {
		if (is_pmd(old))
			dat_free_pt(dereference_pmd(old.pmd));
		else
			dat_free_level(dereference_crste(old), true);
	}
}
1029 
gmap_unshadow(struct gmap * sg)1030 static void gmap_unshadow(struct gmap *sg)
1031 {
1032 	struct gmap_cache *gmap_cache, *next;
1033 
1034 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1035 	KVM_BUG_ON(!sg->parent, sg->kvm);
1036 
1037 	lockdep_assert_held(&sg->parent->children_lock);
1038 
1039 	gmap_remove_child(sg);
1040 	kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);
1041 
1042 	list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
1043 		gmap_cache->gmap = NULL;
1044 		list_del(&gmap_cache->list);
1045 	}
1046 
1047 	gmap_put(sg);
1048 }
1049 
/**
 * _gmap_handle_vsie_unshadow_event() - Drop shadow state depending on a parent frame.
 * @parent: The parent gmap whose children are inspected.
 * @gfn: The frame in the parent gmap that triggered the event.
 *
 * For every shadow gmap of @parent:
 * - if it is not a real-space shadow and @gfn lies within its top-level
 *   table (guest_asce.rsto up to and including rsto + tl), the whole shadow
 *   is unshadowed;
 * - otherwise the reverse-mapping chain stored for @gfn is removed from the
 *   shadow's host_to_rmap tree, each recorded entry is unshadowed at its
 *   level, and the chain nodes are freed.
 *
 * Context: gmap_unshadow() asserts that parent->children_lock is held.
 */
void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
{
	struct vsie_rmap *rmap, *rnext, *head;
	struct gmap *sg, *next;
	gfn_t start, end;

	/* gmap_unshadow() removes sg from the list, hence the _safe variant. */
	list_for_each_entry_safe(sg, next, &parent->children, list) {
		start = sg->guest_asce.rsto;
		end = start + sg->guest_asce.tl + 1;
		if (!sg->guest_asce.r && gfn >= start && gfn < end) {
			gmap_unshadow(sg);
			continue;
		}
		scoped_guard(spinlock, &sg->host_to_rmap_lock)
			head = radix_tree_delete(&sg->host_to_rmap, gfn);
		gmap_for_each_rmap_safe(rmap, rnext, head) {
			gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
			/*
			 * The chain was unlinked from the tree above and is only
			 * reachable through head; free each node allocated by
			 * gmap_insert_rmap() so it is not leaked.
			 */
			kfree(rmap);
		}
	}
}
1069 
1070 /**
1071  * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
1072  * @parent: Pointer to the parent gmap.
1073  * @asce: ASCE for which the shadow table is created.
1074  * @edat_level: Edat level to be used for the shadow translation.
1075  *
1076  * Context: Called with parent->children_lock held.
1077  *
1078  * Return: The pointer to a gmap if a shadow table with the given asce is
1079  * already available, ERR_PTR(-EAGAIN) if another one is just being created,
1080  * otherwise NULL.
1081  */
gmap_find_shadow(struct gmap * parent,union asce asce,int edat_level)1082 static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level)
1083 {
1084 	struct gmap *sg;
1085 
1086 	lockdep_assert_held(&parent->children_lock);
1087 	list_for_each_entry(sg, &parent->children, list) {
1088 		if (!gmap_is_shadow_valid(sg, asce, edat_level))
1089 			continue;
1090 		return sg;
1091 	}
1092 	return NULL;
1093 }
1094 
/* Number of pages that make up one CRST table. */
#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE)
/*
 * State shared by the gmap_protect_asce_top_level() call chain.
 * @seq: snapshot of kvm->mmu_invalidate_seq taken before the guest pages
 *       were looked up; used by the needs-retry checks.
 * @f: one fault descriptor per possible page of the top-level table, filled
 *     in by kvm_s390_get_guest_pages().
 */
struct gmap_protect_asce_top_level {
	unsigned long seq;
	struct guest_fault f[CRST_TABLE_PAGES];
};
1100 
__gmap_protect_asce_top_level(struct kvm_s390_mmu_cache * mc,struct gmap * sg,struct gmap_protect_asce_top_level * context)1101 static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
1102 						struct gmap_protect_asce_top_level *context)
1103 {
1104 	int rc, i;
1105 
1106 	guard(write_lock)(&sg->kvm->mmu_lock);
1107 
1108 	if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
1109 		return -EAGAIN;
1110 
1111 	scoped_guard(spinlock, &sg->parent->children_lock) {
1112 		for (i = 0; i < CRST_TABLE_PAGES; i++) {
1113 			if (!context->f[i].valid)
1114 				continue;
1115 			rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn,
1116 					       TABLE_TYPE_REGION1 + 1, context->f[i].writable);
1117 			if (rc)
1118 				return rc;
1119 		}
1120 		gmap_add_child(sg->parent, sg);
1121 	}
1122 
1123 	kvm_s390_release_faultin_array(sg->kvm, context->f, false);
1124 	return 0;
1125 }
1126 
_gmap_protect_asce_top_level(struct kvm_s390_mmu_cache * mc,struct gmap * sg,struct gmap_protect_asce_top_level * context)1127 static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
1128 					       struct gmap_protect_asce_top_level *context)
1129 {
1130 	int rc;
1131 
1132 	if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f))
1133 		return -EAGAIN;
1134 	do {
1135 		rc = kvm_s390_mmu_cache_topup(mc);
1136 		if (rc)
1137 			return rc;
1138 		rc = radix_tree_preload(GFP_KERNEL);
1139 		if (rc)
1140 			return rc;
1141 		rc = __gmap_protect_asce_top_level(mc, sg, context);
1142 		radix_tree_preload_end();
1143 	} while (rc == -ENOMEM);
1144 
1145 	return rc;
1146 }
1147 
gmap_protect_asce_top_level(struct kvm_s390_mmu_cache * mc,struct gmap * sg)1148 static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg)
1149 {
1150 	struct gmap_protect_asce_top_level context = {};
1151 	union asce asce = sg->guest_asce;
1152 	int rc;
1153 
1154 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1155 
1156 	context.seq = sg->kvm->mmu_invalidate_seq;
1157 	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
1158 	smp_rmb();
1159 
1160 	rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false);
1161 	if (rc > 0)
1162 		rc = -EFAULT;
1163 	if (!rc)
1164 		rc = _gmap_protect_asce_top_level(mc, sg, &context);
1165 	if (rc)
1166 		kvm_s390_release_faultin_array(sg->kvm, context.f, true);
1167 	return rc;
1168 }
1169 
1170 /**
1171  * gmap_create_shadow() - Create/find a shadow guest address space.
1172  * @mc: The cache to use to allocate dat tables.
1173  * @parent: Pointer to the parent gmap.
1174  * @asce: ASCE for which the shadow table is created.
1175  * @edat_level: Edat level to be used for the shadow translation.
1176  *
1177  * The pages of the top level page table referred by the asce parameter
1178  * will be set to read-only and marked in the PGSTEs of the kvm process.
1179  * The shadow table will be removed automatically on any change to the
1180  * PTE mapping for the source table.
1181  *
1182  * The returned shadow gmap will be returned with one extra reference.
1183  *
1184  * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
1185  * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
1186  * parent gmap table could not be protected.
1187  */
gmap_create_shadow(struct kvm_s390_mmu_cache * mc,struct gmap * parent,union asce asce,int edat_level)1188 struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
1189 				union asce asce, int edat_level)
1190 {
1191 	struct gmap *sg, *new;
1192 	int rc;
1193 
1194 	scoped_guard(spinlock, &parent->children_lock) {
1195 		sg = gmap_find_shadow(parent, asce, edat_level);
1196 		if (sg) {
1197 			gmap_get(sg);
1198 			return sg;
1199 		}
1200 	}
1201 	/* Create a new shadow gmap. */
1202 	new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
1203 	if (!new)
1204 		return ERR_PTR(-ENOMEM);
1205 	new->guest_asce = asce;
1206 	new->edat_level = edat_level;
1207 	set_bit(GMAP_FLAG_SHADOW, &new->flags);
1208 
1209 	scoped_guard(spinlock, &parent->children_lock) {
1210 		/* Recheck if another CPU created the same shadow. */
1211 		sg = gmap_find_shadow(parent, asce, edat_level);
1212 		if (sg) {
1213 			gmap_put(new);
1214 			gmap_get(sg);
1215 			return sg;
1216 		}
1217 		if (asce.r) {
1218 			/* Only allow one real-space gmap shadow. */
1219 			list_for_each_entry(sg, &parent->children, list) {
1220 				if (sg->guest_asce.r) {
1221 					scoped_guard(write_lock, &parent->kvm->mmu_lock)
1222 						gmap_unshadow(sg);
1223 					break;
1224 				}
1225 			}
1226 			gmap_add_child(parent, new);
1227 			/* Nothing to protect, return right away. */
1228 			gmap_get(new);
1229 			return new;
1230 		}
1231 	}
1232 
1233 	gmap_get(new);
1234 	new->parent = parent;
1235 	/* Protect while inserting, protects against invalidation races. */
1236 	rc = gmap_protect_asce_top_level(mc, new);
1237 	if (rc) {
1238 		new->parent = NULL;
1239 		gmap_put(new);
1240 		gmap_put(new);
1241 		return ERR_PTR(rc);
1242 	}
1243 	return new;
1244 }
1245