// SPDX-License-Identifier: GPL-2.0
/*
 * Guest memory management for KVM/s390
 *
 * Copyright IBM Corp. 2008, 2020, 2024
 *
 * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
 *            Martin Schwidefsky <schwidefsky@de.ibm.com>
 *            David Hildenbrand <david@redhat.com>
 *            Janosch Frank <frankja@linux.ibm.com>
 */

#include <linux/compiler.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/pgtable.h>
#include <linux/pagemap.h>
#include <asm/lowcore.h>
#include <asm/uv.h>
#include <asm/gmap_helpers.h>

#include "dat.h"
#include "gmap.h"
#include "kvm-s390.h"
#include "faultin.h"

static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.sie_block->prog0c & PROG_IN_SIE;
}
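/*
 * Determine the smallest DAT table type whose reach covers an address space
 * of @limit guest page frames; a @limit of 0 selects a full region-first
 * table.
 */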
static int gmap_limit_to_type(gfn_t limit)
{
	if (!limit)
		return TABLE_TYPE_REGION1;
	if (limit <= _REGION3_SIZE >> PAGE_SHIFT)
		return TABLE_TYPE_SEGMENT;
	if (limit <= _REGION2_SIZE >> PAGE_SHIFT)
		return TABLE_TYPE_REGION3;
	if (limit <= _REGION1_SIZE >> PAGE_SHIFT)
		return TABLE_TYPE_REGION2;
	return TABLE_TYPE_REGION1;
}

/**
 * gmap_new() - Allocate and initialize a guest address space.
 * @kvm: The kvm owning the guest.
 * @limit: Maximum address of the gmap address space.
 *
 * Return: A guest address space structure, or NULL in case of error.
 */
struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
{
	struct crst_table *table;
	struct gmap *gmap;
	int type;

	type = gmap_limit_to_type(limit);

	gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
	if (!gmap)
		return NULL;
	INIT_LIST_HEAD(&gmap->children);
	INIT_LIST_HEAD(&gmap->list);
	INIT_LIST_HEAD(&gmap->scb_users);
	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
	spin_lock_init(&gmap->children_lock);
	spin_lock_init(&gmap->host_to_rmap_lock);
	refcount_set(&gmap->refcount, 1);

	table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
	if (!table) {
		kfree(gmap);
		return NULL;
	}

	gmap->asce.val = __pa(table);
	gmap->asce.dt = type;
	gmap->asce.tl = _ASCE_TABLE_LENGTH;
	gmap->asce.x = 1;
	gmap->asce.p = 1;
	gmap->asce.s = 1;
	gmap->kvm = kvm;
	set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);

	return gmap;
}
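/*
 * Register @child as a child of @parent and propagate the parent's ucontrol
 * and 1M hugepage properties to it. Children of ucontrol VMs do not own
 * their page tables, since they link page tables owned by the parent gmap.
 */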
static void gmap_add_child(struct gmap *parent, struct gmap *child)
{
	KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
	KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
	KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
	lockdep_assert_held(&parent->children_lock);

	child->parent = parent;

	if (is_ucontrol(parent))
		set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
	else
		clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);

	if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
		set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
	else
		clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);

	if (kvm_is_ucontrol(parent->kvm))
		clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
	list_add(&child->list, &parent->children);
}
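/**
 * gmap_new_child() - Allocate a new gmap and add it as a child of a parent.
 * @parent: The parent gmap.
 * @limit: Maximum address of the new gmap address space.
 *
 * Return: The new guest address space structure, or NULL in case of error.
 */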
struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
{
	struct gmap *res;

	lockdep_assert_not_held(&parent->children_lock);
	res = gmap_new(parent->kvm, limit);
	if (res) {
		scoped_guard(spinlock, &parent->children_lock)
			gmap_add_child(parent, res);
	}
	return res;
}
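/**
 * gmap_set_limit() - Change the maximum address of a guest address space.
 * @gmap: The gmap to act on.
 * @limit: The new maximum address of the gmap address space, in page frames.
 *
 * The number of DAT table levels is adjusted to match the new limit.
 *
 * Return: 0 in case of success, a negative error code otherwise.
 */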
int gmap_set_limit(struct gmap *gmap, gfn_t limit)
{
	struct kvm_s390_mmu_cache *mc;
	int rc, type;

	type = gmap_limit_to_type(limit);

	mc = kvm_s390_new_mmu_cache();
	if (!mc)
		return -ENOMEM;

	do {
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			break;
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			rc = dat_set_asce_limit(mc, &gmap->asce, type);
	} while (rc == -ENOMEM);

	kvm_s390_free_mmu_cache(mc);
	return rc;
}

static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
{
	struct vsie_rmap *rmap, *rnext, *head;
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void __rcu **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			head = radix_tree_delete(root, index);
			gmap_for_each_rmap_safe(rmap, rnext, head)
				kfree(rmap);
		}
	} while (nr > 0);
}
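/*
 * Unlink @child from its parent and mark it as invalidated; the child itself
 * is not freed, and its reference count is left untouched.
 */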
void gmap_remove_child(struct gmap *child)
{
	if (KVM_BUG_ON(!child->parent, child->kvm))
		return;
	lockdep_assert_held(&child->parent->children_lock);

	list_del(&child->list);
	child->parent = NULL;
	child->invalidated = true;
}

/**
 * gmap_dispose() - Free a guest address space.
 * @gmap: Pointer to the guest address space structure.
 */
void gmap_dispose(struct gmap *gmap)
{
	/* The gmap must have been removed from the parent beforehand */
	KVM_BUG_ON(gmap->parent, gmap->kvm);
	/* All children of this gmap must have been removed beforehand */
	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
	/* No VSIE shadow block is allowed to use this gmap */
	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
	/* The ASCE must be valid */
	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
	/* The refcount must be 0 */
	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);

	/* Flush all TLB entries of this gmap's ASCE */
	asce_flush_tlb(gmap->asce);

	/* Free all DAT tables. */
	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));

	/* Free additional data for a shadow gmap */
	if (is_shadow(gmap))
		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);

	kfree(gmap);
}

/**
 * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
 * @gmap: The gmap whose ASCE needs to be replaced.
 *
 * If the ASCE is a SEGMENT type ASCE, this function returns -EINVAL;
 * replacing such an ASCE would leave the pointers in the host_to_guest radix
 * tree pointing to the wrong pages, causing use-after-free and memory
 * corruption.
 * If the allocation of the new top level page table fails, the ASCE is not
 * replaced.
 * In any case, the old top level page table is not freed here. The caller
 * therefore has to make sure to save a pointer to it beforehand, unless a
 * leak is actually intended.
 *
 * Return: 0 in case of success, -EINVAL if the ASCE is a segment type ASCE,
 * -ENOMEM when running out of memory.
 */
int s390_replace_asce(struct gmap *gmap)
{
	struct crst_table *table;
	union asce asce;

	/* Replacing segment type ASCEs would cause serious issues */
	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
		return -EINVAL;

	table = dat_alloc_crst_sleepable(0);
	if (!table)
		return -ENOMEM;
	memcpy(table, dereference_asce(gmap->asce), sizeof(*table));

	/* Set new table origin while preserving existing ASCE control bits */
	asce = gmap->asce;
	asce.rsto = virt_to_pfn(table);
	WRITE_ONCE(gmap->asce, asce);

	return 0;
}
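/*
 * Request a prefix refresh for every vCPU whose prefix pages intersect the
 * range [@gfn, @end). With @hint set, the function gives up and returns
 * false as soon as an affected vCPU is found running in SIE. Shadow gmaps
 * have no prefix pages and always return false.
 */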
bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
{
	struct kvm *kvm = gmap->kvm;
	struct kvm_vcpu *vcpu;
	gfn_t prefix_gfn;
	unsigned long i;

	if (is_shadow(gmap))
		return false;
	kvm_for_each_vcpu(i, vcpu, kvm) {
		/* Match against both prefix pages */
		prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu));
		if (prefix_gfn < end && gfn <= prefix_gfn + 1) {
			if (hint && kvm_s390_is_in_sie(vcpu))
				return false;
			VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
				   gfn_to_gpa(gfn), gfn_to_gpa(end));
			kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
		}
	}
	return true;
}

struct clear_young_pte_priv {
	struct gmap *gmap;
	bool young;
};

static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *p = walk->priv;
	union pgste pgste;
	union pte pte, new;

	pte = READ_ONCE(*ptep);

	if (!pte.s.pr || (!pte.s.y && pte.h.i))
		return 0;

	pgste = pgste_get_lock(ptep);
	if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
		new = pte;
		new.h.i = 1;
		new.s.y = 0;
		if ((new.s.d || !new.h.p) && !new.s.s)
			folio_set_dirty(pfn_folio(pte.h.pfra));
		new.s.d = 0;
		new.h.p = 1;

		pgste.prefix_notif = 0;
		pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
	}
	p->young = 1;
	pgste_set_unlock(ptep, pgste);
	return 0;
}

static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *priv = walk->priv;
	union crste crste, new;

	do {
		crste = READ_ONCE(*crstep);

		if (!crste.h.fc)
			return 0;
		if (!crste.s.fc1.y && crste.h.i)
			return 0;
		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
			break;

		new = crste;
		new.h.i = 1;
		new.s.fc1.y = 0;
		new.s.fc1.prefix_notif = 0;
		if (new.s.fc1.d || !new.h.p)
			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
		new.s.fc1.d = 0;
		new.h.p = 1;
	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));

	priv->young = 1;
	return 0;
}
/**
 * gmap_age_gfn() - Clear the young (referenced) state of a range of pages.
 * @gmap: The guest gmap.
 * @start: The first gfn to test.
 * @end: The gfn after the last one to test.
 *
 * Context: Called with the kvm mmu write lock held.
 * Return: true if any page in the given range was young, otherwise false.
 */
bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops ops = {
		.pte_entry = gmap_clear_young_pte,
		.pmd_entry = gmap_clear_young_crste,
		.pud_entry = gmap_clear_young_crste,
	};
	struct clear_young_pte_priv priv = {
		.gmap = gmap,
		.young = false,
	};

	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);

	return priv.young;
}

struct gmap_unmap_priv {
	struct gmap *gmap;
	struct kvm_memory_slot *slot;
};

static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
{
	struct gmap_unmap_priv *priv = w->priv;
	struct folio *folio = NULL;
	unsigned long vmaddr;
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
		vmaddr = __gfn_to_hva_memslot(priv->slot, gfn);
		gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr);
	}
	if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
		folio = pfn_folio(ptep->h.pfra);
	pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn);
	pgste_set_unlock(ptep, pgste);
	if (folio)
		uv_convert_from_secure_folio(folio);

	return 0;
}

static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct gmap_unmap_priv *priv = walk->priv;
	struct folio *folio = NULL;
	union crste old = *crstep;

	if (!old.h.fc)
		return 0;

	if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
		folio = phys_to_folio(crste_origin_large(old));
	/* No races should happen because kvm->mmu_lock is held in write mode */
	KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
		   priv->gmap->kvm);
	if (folio)
		uv_convert_from_secure_folio(folio);

	return 0;
}

/**
 * gmap_unmap_gfn_range() - Unmap a range of guest addresses.
 * @gmap: The gmap to act on.
 * @slot: The memslot in which the range is located.
 * @start: The first gfn to unmap.
 * @end: The gfn after the last one to unmap.
 *
 * Context: Called with the kvm mmu write lock held.
 * Return: false
 */
bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops ops = {
		.pte_entry = _gmap_unmap_pte,
		.pmd_entry = _gmap_unmap_crste,
		.pud_entry = _gmap_unmap_crste,
	};
	struct gmap_unmap_priv priv = {
		.gmap = gmap,
		.slot = slot,
	};

	lockdep_assert_held_write(&gmap->kvm->mmu_lock);

	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
	return false;
}
static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
						  struct gmap *gmap)
{
	union pte pte = READ_ONCE(*ptep);

	if (!pte.s.pr || (pte.h.p && !pte.s.sd))
		return pgste;

	/*
	 * If this page contains one or more prefixes of vCPUs that are
	 * currently running, do not reset the protection, leave it marked
	 * as dirty.
	 */
	if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
		pte.h.p = 1;
		pte.s.sd = 0;
		pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn);
	}

	mark_page_dirty(gmap->kvm, gfn);

	return pgste;
}

static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
					  struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap);
	pgste_set_unlock(ptep, pgste);
	return 0;
}

static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
					    struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union crste crste, new;

	if (fatal_signal_pending(current))
		return 1;
	do {
		crste = READ_ONCE(*table);
		if (!crste.h.fc)
			return 0;
		if (crste.h.p && !crste.s.fc1.sd)
			return 0;

		/*
		 * If this large page contains one or more prefixes of vCPUs that are
		 * currently running, do not reset the protection, leave it marked as
		 * dirty.
		 */
		if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
			break;
		new = crste;
		new.h.p = 1;
		new.s.fc1.sd = 0;
	} while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));

	for ( ; gfn < end; gfn++)
		mark_page_dirty(gmap->kvm, gfn);

	return 0;
}
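/**
 * gmap_sync_dirty_log() - Transfer softdirty state into the KVM dirty log.
 * @gmap: The gmap to act on.
 * @start: The first gfn to synchronize.
 * @end: The gfn after the last one to synchronize.
 *
 * Pages that were written to since the last synchronization are marked dirty
 * for the KVM dirty log, and write protection is re-established so that
 * further writes can be caught again.
 *
 * Context: Called with the kvm mmu lock held.
 */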
void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops walk_ops = {
		.pte_entry = _pte_test_and_clear_softdirty,
		.pmd_entry = _crste_test_and_clear_softdirty,
		.pud_entry = _crste_test_and_clear_softdirty,
	};

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	_dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
}

static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
{
	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);

	/* Somehow the crste is not large anymore, let the slow path deal with it. */
	if (!oldcrste.h.fc)
		return 1;

	f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
	f->writable = oldcrste.s.fc1.w;

	/* Appropriate permissions already (race with another handler), nothing to do. */
	if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
		return 0;

	if (!f->write_attempt || oldcrste.s.fc1.w) {
		f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d;
		newcrste = oldcrste;
		newcrste.h.i = 0;
		newcrste.s.fc1.y = 1;
		if (f->write_attempt) {
			newcrste.h.p = 0;
			newcrste.s.fc1.d = 1;
			newcrste.s.fc1.sd = 1;
		}
		/* In case of races, let the slow path deal with it. */
		return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
	}
	/* Trying to write on a read-only page, let the slow path deal with it. */
	return 1;
}
static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
					struct guest_fault *f)
{
	union pte newpte, oldpte = READ_ONCE(*f->ptep);

	f->pfn = oldpte.h.pfra;
	f->writable = oldpte.s.w;

	/* Appropriate permissions already (race with another handler), nothing to do. */
	if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
		return 0;
	/* Trying to write on a read-only page, let the slow path deal with it. */
	if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
		return 1;

	newpte = oldpte;
	newpte.h.i = 0;
	newpte.s.y = 1;
	if (f->write_attempt) {
		newpte.h.p = 0;
		newpte.s.d = 1;
		newpte.s.sd = 1;
	}
	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);

	return 0;
}

/**
 * gmap_try_fixup_minor() - Try to fix up a minor gmap fault.
 * @gmap: The gmap whose fault needs to be resolved.
 * @fault: Describes the fault that is being resolved.
 *
 * A minor fault is a fault that can be resolved quickly within the gmap:
 * the page is already mapped, and the fault is only due to dirty/young
 * tracking.
 *
 * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
 * not be resolved and needs to go through the slow path.
 */
int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
{
	union pgste pgste;
	int rc;

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
			    &fault->crstep, &fault->ptep);
	/* If a PTE or a leaf CRSTE could not be reached, slow path. */
	if (rc)
		return 1;

	if (fault->ptep) {
		pgste = pgste_get_lock(fault->ptep);
		rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
		if (!rc && fault->callback)
			fault->callback(fault);
		pgste_set_unlock(fault->ptep, pgste);
	} else {
		rc = gmap_handle_minor_crste_fault(gmap, fault);
		if (!rc && fault->callback)
			fault->callback(fault);
	}
	return rc;
}
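/*
 * Check whether a 2G hugepage may back the faulting address. Mapping guest
 * memory with 2G frames is currently never done, hence this always returns
 * false.
 */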
static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f,
				   struct kvm_memory_slot *slot)
{
	return false;
}

/**
 * gmap_1m_allowed() - Check whether a 1M hugepage is allowed.
 * @gmap: The gmap of the guest.
 * @f: Describes the fault that is being resolved.
 * @slot: The memslot the faulting address belongs to.
 *
 * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_1M flag is set for
 * @gmap, whether the offset of the address in the 1M virtual frame is the
 * same as the offset in the physical 1M frame, and finally whether the whole
 * 1M page would fit in the given memslot.
 *
 * Return: true if a 1M hugepage is allowed to back the faulting address, false
 * otherwise.
 */
static inline bool gmap_1m_allowed(struct gmap *gmap, struct guest_fault *f,
				   struct kvm_memory_slot *slot)
{
	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags) &&
	       !((f->gfn ^ f->pfn) & ~_SEGMENT_FR_MASK) &&
	       slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_SEGMENT) &&
	       slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_SEGMENT);
}

static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
		      struct guest_fault *f)
{
	union crste oldval, newval;
	union pte newpte, oldpte;
	union pgste pgste;
	int rc = 0;

	rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
			    &f->crstep, &f->ptep);
	if (rc == -ENOMEM)
		return rc;
	if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
		return rc;
	if (rc)
		return -EAGAIN;
	if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
		return -EINVAL;

	if (f->ptep) {
		pgste = pgste_get_lock(f->ptep);
		oldpte = *f->ptep;
		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
		newpte.s.sd = oldpte.s.sd;
		oldpte.s.sd = 0;
		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
			pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
			if (f->callback)
				f->callback(f);
		} else {
			rc = -EAGAIN;
		}
		pgste_set_unlock(f->ptep, pgste);
	} else {
		do {
			oldval = READ_ONCE(*f->crstep);
			newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
					    f->write_attempt | oldval.s.fc1.d);
			newval.s.fc1.s = !f->page;
			newval.s.fc1.sd = oldval.s.fc1.sd;
			if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
			    crste_origin_large(oldval) != crste_origin_large(newval))
				return -EAGAIN;
		} while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
		if (f->callback)
			f->callback(f);
	}

	return rc;
}
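/**
 * gmap_link() - Map an already resolved guest fault into the gmap DAT tables.
 * @mc: The memory cache to use for page table allocations.
 * @gmap: The gmap to act on.
 * @f: Describes the fault that is being resolved.
 * @slot: The memslot the faulting address belongs to.
 *
 * The largest possible mapping level is used: a 1M (or, in principle, 2G)
 * mapping is installed if the backing folio is large enough and the
 * corresponding hugepage size is allowed for this gmap and memslot.
 *
 * Context: Called with the kvm mmu lock held.
 * Return: 0 in case of success, -EAGAIN if the caller needs to retry,
 * -ENOMEM when running out of memory, or another negative error code in
 * case of error.
 */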
int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f,
	      struct kvm_memory_slot *slot)
{
	unsigned int order;
	int level;

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	level = TABLE_TYPE_PAGE_TABLE;
	if (f->page) {
		order = folio_order(page_folio(f->page));
		if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f, slot))
			level = TABLE_TYPE_REGION3;
		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f, slot))
			level = TABLE_TYPE_SEGMENT;
	}
	return _gmap_link(mc, gmap, level, f);
}
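/*
 * Make one segment (1M) of a ucontrol child gmap at @c_gfn share the page
 * table that backs @p_gfn in the parent gmap. If the parent has no page
 * table there and @force_alloc is not set, an invalid segment entry encoding
 * @p_gfn is installed in the child instead.
 */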
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
	union crste newcrste, oldcrste;
	struct page_table *pt;
	union crste *crstep;
	union pte *ptep;
	int rc;

	if (force_alloc)
		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
				    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	else
		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
				    TABLE_TYPE_SEGMENT, &crstep, &ptep);
	if (rc)
		return rc;
	if (!ptep) {
		newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
		newcrste.h.i = 1;
		newcrste.h.fc0.tl = 1;
	} else {
		pt = pte_table_start(ptep);
		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
		newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
	}
	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
			    &crstep, &ptep);
	if (rc)
		return rc;
	do {
		oldcrste = READ_ONCE(*crstep);
		if (oldcrste.val == newcrste.val)
			break;
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
	return 0;
}
static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
{
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
			    TABLE_TYPE_SEGMENT, crstepp, &ptep);
	if (rc || (!ptep && !crste_is_ucas(**crstepp)))
		return -EREMOTE;
	if (!ptep)
		return 1;
	*gaddr &= ~_SEGMENT_MASK;
	*gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT;
	return 0;
}

/**
 * gmap_ucas_translate() - Translate a vCPU address into a host gmap address.
 * @mc: The memory cache to be used for allocations.
 * @gmap: The per-vCPU gmap.
 * @gaddr: Pointer to the address to be translated, will get overwritten with
 *         the translated address in case of success.
 *
 * Translates the per-vCPU guest address into a fake guest address, which can
 * then be used with the fake memslots that identity-map userspace.
 * This allows ucontrol VMs to use the normal fault resolution path, like
 * normal VMs.
 *
 * Return: %0 in case of success, %-ENOMEM when running out of memory,
 * otherwise %-EREMOTE.
 */
int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr)
{
	gpa_t translated_address;
	union crste *crstep;
	gfn_t gfn;
	int rc;

	gfn = gpa_to_gfn(*gaddr);

	scoped_guard(read_lock, &gmap->kvm->mmu_lock) {
		rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
		if (rc <= 0)
			return rc;
	}
	do {
		scoped_guard(write_lock, &gmap->kvm->mmu_lock) {
			rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
			if (rc <= 0)
				return rc;
			translated_address = (*gaddr & ~_SEGMENT_MASK) |
					     (crstep->val & _SEGMENT_MASK);
			rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true);
		}
		if (!rc) {
			*gaddr = translated_address;
			return 0;
		}
		if (rc != -ENOMEM)
			return -EREMOTE;
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			return rc;
	} while (1);
	return 0;
}
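/**
 * gmap_ucas_map() - Map segments of the parent gmap into a ucontrol gmap.
 * @gmap: The per-vCPU (child) ucontrol gmap.
 * @p_gfn: The first gfn in the parent gmap.
 * @c_gfn: The first gfn in the child gmap.
 * @count: The number of segments (1M blocks) to map.
 *
 * Return: 0 in case of success, a negative error code otherwise.
 */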
int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
{
	struct kvm_s390_mmu_cache *mc;
	int rc = 0;

	mc = kvm_s390_new_mmu_cache();
	if (!mc)
		return -ENOMEM;

	while (count) {
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false);
		if (rc == -ENOMEM) {
			rc = kvm_s390_mmu_cache_topup(mc);
			if (rc)
				break;
			continue;
		}
		if (rc)
			break;

		count--;
		c_gfn += _PAGE_ENTRIES;
		p_gfn += _PAGE_ENTRIES;
	}
	kvm_s390_free_mmu_cache(mc);
	return rc;
}
static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
{
	union crste *crstep;
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
	if (rc)
		return;
	while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
		;
}

void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
{
	guard(read_lock)(&gmap->kvm->mmu_lock);

	for ( ; count; count--, c_gfn += _PAGE_ENTRIES)
		gmap_ucas_unmap_one(gmap, c_gfn);
}

static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union crste crste, newcrste;

	crste = READ_ONCE(*crstep);
	newcrste = _CRSTE_EMPTY(crste.h.tt);

	while (crste_leaf(crste)) {
		if (crste_prefix(crste))
			gmap_unmap_prefix(gmap, gfn, next);
		if (crste.s.fc1.vsie_notif)
			gmap_handle_vsie_unshadow_event(gmap, gfn);
		if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce))
			break;
		crste = READ_ONCE(*crstep);
	}

	if (need_resched())
		return next;

	return 0;
}
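/**
 * gmap_split_huge_pages() - Remove all large-page mappings from a gmap.
 * @gmap: The gmap to act on.
 *
 * Walks the whole address space and invalidates every leaf segment and
 * region table entry, notifying prefix and vsie users where needed.
 */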
void gmap_split_huge_pages(struct gmap *gmap)
{
	const struct dat_walk_ops ops = {
		.pmd_entry = _gmap_split_crste,
		.pud_entry = _gmap_split_crste,
	};
	gfn_t start = 0;

	do {
		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
			start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce,
						    &ops, DAT_WALK_IGN_HOLES, gmap);
		cond_resched();
	} while (start);
}

static int _gmap_enable_skeys(struct gmap *gmap)
{
	gfn_t start = 0;
	int rc;

	if (uses_skeys(gmap))
		return 0;

	set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
	rc = gmap_helper_disable_cow_sharing();
	if (rc) {
		clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
		return rc;
	}

	do {
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			start = dat_reset_skeys(gmap->asce, start);
		cond_resched();
	} while (start);
	return 0;
}
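/**
 * gmap_enable_skeys() - Enable storage key handling for a guest.
 * @gmap: The gmap of the guest.
 *
 * Disables COW sharing of pages in the host mm and resets the storage keys
 * of all guest pages. Does nothing if storage keys are already in use.
 *
 * Return: 0 in case of success, a negative error code otherwise.
 */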
int gmap_enable_skeys(struct gmap *gmap)
{
	int rc;

	mmap_write_lock(gmap->kvm->mm);
	rc = _gmap_enable_skeys(gmap);
	mmap_write_unlock(gmap->kvm->mm);
	return rc;
}

static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	if (!ptep->s.pr)
		return 0;
	__kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));
	if (need_resched())
		return next;
	return 0;
}

static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	phys_addr_t origin, cur, end;

	if (!crstep->h.fc || !crstep->s.fc1.pr)
		return 0;

	origin = crste_origin_large(*crstep);
	cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
	for ( ; cur < end; cur += PAGE_SIZE)
		__kvm_s390_pv_destroy_page(phys_to_page(cur));
	if (need_resched())
		return next;
	return 0;
}
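/**
 * gmap_pv_destroy_range() - Destroy the secure pages in a range of a gmap.
 * @gmap: The gmap to act on.
 * @start: The first gfn of the range.
 * @end: The gfn after the last one of the range.
 * @interruptible: If true, stop when a fatal signal is pending.
 *
 * Only pages that are currently mapped in the gmap are processed.
 *
 * Return: 0 in case of success, -EINTR if interrupted by a fatal signal.
 */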
int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
{
	const struct dat_walk_ops ops = {
		.pte_entry = _destroy_pages_pte,
		.pmd_entry = _destroy_pages_crste,
		.pud_entry = _destroy_pages_crste,
	};

	do {
		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
			start = _dat_walk_gfn_range(start, end, gmap->asce, &ops,
						    DAT_WALK_IGN_HOLES, NULL);
		if (interruptible && fatal_signal_pending(current))
			return -EINTR;
		cond_resched();
	} while (start && start < end);
	return 0;
}
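/**
 * gmap_insert_rmap() - Record a reverse mapping entry for a shadow gmap.
 * @sg: The shadow gmap.
 * @p_gfn: The gfn in the parent gmap that is being tracked.
 * @r_gfn: The gfn in the shadow address space whose translation depends on it.
 * @level: The DAT table level of the shadow entry.
 *
 * Duplicate entries are silently ignored.
 *
 * Context: Called with the host_to_rmap lock held.
 * Return: 0 in case of success, -ENOMEM when running out of memory.
 */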
int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
{
	struct vsie_rmap *rmap __free(kvfree) = NULL;
	struct vsie_rmap *temp;
	void __rcu **slot;
	int rc = 0;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	lockdep_assert_held(&sg->host_to_rmap_lock);

	rmap = kzalloc_obj(*rmap, GFP_ATOMIC);
	if (!rmap)
		return -ENOMEM;

	rmap->r_gfn = r_gfn;
	rmap->level = level;
	slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
	if (slot) {
		rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
		for (temp = rmap->next; temp; temp = temp->next) {
			if (temp->val == rmap->val)
				return 0;
		}
		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
	} else {
		rmap->next = NULL;
		rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
		if (rc)
			return rc;
	}
	rmap = NULL;

	return 0;
}
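/**
 * gmap_protect_rmap() - Write-protect a parent gmap page used by a shadow.
 * @mc: The memory cache to use for page table allocations.
 * @sg: The shadow gmap.
 * @p_gfn: The gfn in the parent gmap that needs to be protected.
 * @r_gfn: The gfn in the shadow address space that depends on it.
 * @pfn: The host pfn backing @p_gfn, used if the page is not mapped yet.
 * @level: The DAT table level of the shadow entry.
 * @wr: Whether the new mapping may be writable.
 *
 * The page is mapped write-protected in the parent gmap and marked for vsie
 * notification, so that the shadow structures can be invalidated when the
 * parent mapping changes.
 *
 * Context: Called with the parent's children_lock held.
 * Return: 0 in case of success, -EAGAIN if the caller should retry, or
 * another negative error code otherwise.
 */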
int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
		      kvm_pfn_t pfn, int level, bool wr)
{
	union crste *crstep;
	union pgste pgste;
	union pte *ptep;
	union pte pte;
	int flags, rc;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	lockdep_assert_held(&sg->parent->children_lock);

	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
	rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags,
			    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	if (rc)
		return rc;
	if (level <= TABLE_TYPE_REGION1) {
		scoped_guard(spinlock, &sg->host_to_rmap_lock)
			rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level);
	}
	if (rc)
		return rc;

	if (!pgste_get_trylock(ptep, &pgste))
		return -EAGAIN;
	pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false);
	pte.h.p = 1;
	pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
	pgste.vsie_notif = 1;
	pgste_set_unlock(ptep, pgste);

	return 0;
}

static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);
	if (need_resched())
		return next;
	return 0;
}
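/**
 * gmap_set_cmma_all_dirty() - Mark the CMMA state of all pages as dirty.
 * @gmap: The gmap to act on.
 *
 * Sets the CMMA dirty bit in the PGSTE of every mapped page, e.g. when
 * dirty tracking of CMMA values is started.
 */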
void gmap_set_cmma_all_dirty(struct gmap *gmap)
{
	const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
	gfn_t gfn = 0;

	do {
		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
			gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
						  DAT_WALK_IGN_HOLES, NULL);
		cond_resched();
	} while (gfn);
}

static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
{
	unsigned long align = PAGE_SIZE;
	gpa_t gaddr = gfn_to_gpa(r_gfn);
	union crste *crstep;
	union crste crste;
	union pte *ptep;

	if (level > TABLE_TYPE_PAGE_TABLE)
		align = 1UL << (11 * level + _SEGMENT_SHIFT);
	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align));
	sg->invalidated = true;
	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
		return;
	if (ptep) {
		if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
		return;
	}

	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
	if (crste_leaf(crste) || crste.h.i)
		return;
	if (is_pmd(crste))
		dat_free_pt(dereference_pmd(crste.pmd));
	else
		dat_free_level(dereference_crste(crste), true);
}

static void gmap_unshadow(struct gmap *sg)
{
	struct gmap_cache *gmap_cache, *next;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	KVM_BUG_ON(!sg->parent, sg->kvm);

	lockdep_assert_held(&sg->parent->children_lock);

	gmap_remove_child(sg);
	kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);

	list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
		gmap_cache->gmap = NULL;
		list_del(&gmap_cache->list);
	}

	gmap_put(sg);
}
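/**
 * _gmap_handle_vsie_unshadow_event() - React to a change of a parent gmap page.
 * @parent: The parent gmap.
 * @gfn: The gfn in the parent gmap that is being changed.
 *
 * Shadow gmaps whose guest ASCE top level table overlaps @gfn are removed
 * completely; for all other shadows, only the shadow table entries recorded
 * in the rmap for @gfn are invalidated.
 *
 * Context: Called with the parent's children_lock held.
 */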
void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
{
	struct vsie_rmap *rmap, *rnext, *head;
	struct gmap *sg, *next;
	gfn_t start, end;

	list_for_each_entry_safe(sg, next, &parent->children, list) {
		start = sg->guest_asce.rsto;
		end = start + sg->guest_asce.tl + 1;
		if (!sg->guest_asce.r && gfn >= start && gfn < end) {
			gmap_unshadow(sg);
			continue;
		}
		scoped_guard(spinlock, &sg->host_to_rmap_lock)
			head = radix_tree_delete(&sg->host_to_rmap, gfn);
		gmap_for_each_rmap_safe(rmap, rnext, head)
			gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
	}
}

/**
 * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
 * @parent: Pointer to the parent gmap.
 * @asce: ASCE for which the shadow table is created.
 * @edat_level: Edat level to be used for the shadow translation.
 *
 * Context: Called with parent->children_lock held.
 *
 * Return: The pointer to the gmap if a shadow table with the given asce and
 * edat level is already available, otherwise NULL.
 */
static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level)
{
	struct gmap *sg;

	lockdep_assert_held(&parent->children_lock);
	list_for_each_entry(sg, &parent->children, list) {
		if (!gmap_is_shadow_valid(sg, asce, edat_level))
			continue;
		return sg;
	}
	return NULL;
}
#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE)

struct gmap_protect_asce_top_level {
	unsigned long seq;
	struct guest_fault f[CRST_TABLE_PAGES];
};

static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
						struct gmap_protect_asce_top_level *context)
{
	struct gmap *parent;
	int rc, i;

	guard(write_lock)(&sg->kvm->mmu_lock);

	if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
		return -EAGAIN;

	parent = READ_ONCE(sg->parent);
	if (!parent)
		return -EAGAIN;
	scoped_guard(spinlock, &parent->children_lock) {
		if (READ_ONCE(sg->parent) != parent)
			return -EAGAIN;
		sg->invalidated = false;
		for (i = 0; i < CRST_TABLE_PAGES; i++) {
			if (!context->f[i].valid)
				continue;
			rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn,
					       TABLE_TYPE_REGION1 + 1, context->f[i].writable);
			if (rc)
				return rc;
		}
		gmap_add_child(sg->parent, sg);
	}

	kvm_s390_release_faultin_array(sg->kvm, context->f, false);
	return 0;
}

static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
					       struct gmap_protect_asce_top_level *context)
{
	int rc;

	if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f))
		return -EAGAIN;
	do {
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			return rc;
		rc = radix_tree_preload(GFP_KERNEL);
		if (rc)
			return rc;
		rc = __gmap_protect_asce_top_level(mc, sg, context);
		radix_tree_preload_end();
	} while (rc == -ENOMEM);

	return rc;
}
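/*
 * Pin and write-protect the pages holding the top level DAT table of the
 * guest ASCE that is to be shadowed, and register the shadow gmap as a child
 * of its parent. Protection and registration happen under the same lock, so
 * that no invalidation of the involved pages can be missed.
 */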
static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg)
{
	struct gmap_protect_asce_top_level context = {};
	union asce asce = sg->guest_asce;
	int rc;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);

	context.seq = sg->kvm->mmu_invalidate_seq;
	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
	smp_rmb();

	rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false);
	if (rc > 0)
		rc = -EFAULT;
	if (!rc)
		rc = _gmap_protect_asce_top_level(mc, sg, &context);
	if (rc)
		kvm_s390_release_faultin_array(sg->kvm, context.f, true);
	return rc;
}
/**
 * gmap_create_shadow() - Create/find a shadow guest address space.
 * @mc: The cache to use to allocate dat tables.
 * @parent: Pointer to the parent gmap.
 * @asce: ASCE for which the shadow table is created.
 * @edat_level: Edat level to be used for the shadow translation.
 *
 * The pages of the top level page table referred to by the @asce parameter
 * will be set to read-only and marked in the PGSTEs of the kvm process.
 * The shadow table will be removed automatically on any change to the
 * PTE mapping for the source table.
 *
 * The shadow gmap is returned with one extra reference held.
 *
 * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
 * parent gmap table could not be protected.
 */
struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
				union asce asce, int edat_level)
{
	struct gmap *sg, *new;
	int rc;

	if (WARN_ON(!parent))
		return ERR_PTR(-EINVAL);

	scoped_guard(spinlock, &parent->children_lock) {
		sg = gmap_find_shadow(parent, asce, edat_level);
		if (sg) {
			gmap_get(sg);
			return sg;
		}
	}
	/* Create a new shadow gmap. */
	new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
	if (!new)
		return ERR_PTR(-ENOMEM);
	new->guest_asce = asce;
	new->edat_level = edat_level;
	set_bit(GMAP_FLAG_SHADOW, &new->flags);

	scoped_guard(spinlock, &parent->children_lock) {
		/* Recheck if another CPU created the same shadow. */
		sg = gmap_find_shadow(parent, asce, edat_level);
		if (sg) {
			gmap_put(new);
			gmap_get(sg);
			return sg;
		}
		if (asce.r) {
			/* Only allow one real-space gmap shadow. */
			list_for_each_entry(sg, &parent->children, list) {
				if (sg->guest_asce.r) {
					scoped_guard(write_lock, &parent->kvm->mmu_lock)
						gmap_unshadow(sg);
					break;
				}
			}
			gmap_add_child(parent, new);
			/* Nothing to protect, return right away. */
			gmap_get(new);
			return new;
		}
	}

	gmap_get(new);
	new->parent = parent;
	/* Protect while inserting, protects against invalidation races. */
	rc = gmap_protect_asce_top_level(mc, new);
	if (rc) {
		new->parent = NULL;
		gmap_put(new);
		gmap_put(new);
		return ERR_PTR(rc);
	}
	return new;
}