1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Guest memory management for KVM/s390
4 *
5 * Copyright IBM Corp. 2008, 2020, 2024
6 *
7 * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
8 * Martin Schwidefsky <schwidefsky@de.ibm.com>
9 * David Hildenbrand <david@redhat.com>
10 * Janosch Frank <frankja@linux.ibm.com>
11 */
12
13 #include <linux/compiler.h>
14 #include <linux/kvm.h>
15 #include <linux/kvm_host.h>
16 #include <linux/pgtable.h>
17 #include <linux/pagemap.h>
18 #include <asm/lowcore.h>
19 #include <asm/uv.h>
20 #include <asm/gmap_helpers.h>
21
22 #include "dat.h"
23 #include "gmap.h"
24 #include "kvm-s390.h"
25 #include "faultin.h"
26
kvm_s390_is_in_sie(struct kvm_vcpu * vcpu)27 static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
28 {
29 return vcpu->arch.sie_block->prog0c & PROG_IN_SIE;
30 }
31
/*
 * Pick the top level table type for an address space limited to @limit
 * guest frames: the smallest type whose size (in pages) is not exceeded by
 * @limit. A @limit of 0, or one larger than _REGION1_SIZE, selects
 * TABLE_TYPE_REGION1.
 */
static int gmap_limit_to_type(gfn_t limit)
{
	if (!limit || limit > (_REGION1_SIZE >> PAGE_SHIFT))
		return TABLE_TYPE_REGION1;
	if (limit > (_REGION2_SIZE >> PAGE_SHIFT))
		return TABLE_TYPE_REGION2;
	if (limit > (_REGION3_SIZE >> PAGE_SHIFT))
		return TABLE_TYPE_REGION3;
	return TABLE_TYPE_SEGMENT;
}
44
45 /**
46 * gmap_new() - Allocate and initialize a guest address space.
47 * @kvm: The kvm owning the guest.
48 * @limit: Maximum address of the gmap address space.
49 *
50 * Return: A guest address space structure.
51 */
gmap_new(struct kvm * kvm,gfn_t limit)52 struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
53 {
54 struct crst_table *table;
55 struct gmap *gmap;
56 int type;
57
58 type = gmap_limit_to_type(limit);
59
60 gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
61 if (!gmap)
62 return NULL;
63 INIT_LIST_HEAD(&gmap->children);
64 INIT_LIST_HEAD(&gmap->list);
65 INIT_LIST_HEAD(&gmap->scb_users);
66 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
67 spin_lock_init(&gmap->children_lock);
68 spin_lock_init(&gmap->host_to_rmap_lock);
69 refcount_set(&gmap->refcount, 1);
70
71 table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
72 if (!table) {
73 kfree(gmap);
74 return NULL;
75 }
76
77 gmap->asce.val = __pa(table);
78 gmap->asce.dt = type;
79 gmap->asce.tl = _ASCE_TABLE_LENGTH;
80 gmap->asce.x = 1;
81 gmap->asce.p = 1;
82 gmap->asce.s = 1;
83 gmap->kvm = kvm;
84 set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
85
86 return gmap;
87 }
88
gmap_add_child(struct gmap * parent,struct gmap * child)89 static void gmap_add_child(struct gmap *parent, struct gmap *child)
90 {
91 KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
92 KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
93 KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
94 lockdep_assert_held(&parent->children_lock);
95
96 child->parent = parent;
97
98 if (is_ucontrol(parent))
99 set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
100 else
101 clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
102
103 if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
104 set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
105 else
106 clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
107
108 if (kvm_is_ucontrol(parent->kvm))
109 clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
110 list_add(&child->list, &parent->children);
111 }
112
/**
 * gmap_new_child() - Allocate a new gmap and attach it to a parent.
 * @parent: The gmap that becomes the parent of the new gmap.
 * @limit: Maximum address of the new gmap, passed on to gmap_new().
 *
 * Context: The children_lock of @parent must not be held; it is taken here
 * while the child is attached.
 * Return: The new child gmap, or NULL if gmap_new() failed.
 */
struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
{
	struct gmap *child;

	lockdep_assert_not_held(&parent->children_lock);

	child = gmap_new(parent->kvm, limit);
	if (!child)
		return NULL;

	scoped_guard(spinlock, &parent->children_lock)
		gmap_add_child(parent, child);

	return child;
}
125
/**
 * gmap_set_limit() - Change the table type of the ASCE of a gmap.
 * @gmap: The gmap whose ASCE is to be adjusted.
 * @limit: The new maximum address; converted to a table type.
 *
 * dat_set_asce_limit() is called with the kvm mmu lock held in write mode.
 * Whenever it reports -ENOMEM, the mmu cache is topped up (without the lock)
 * and the operation is retried.
 *
 * Return: 0 once dat_set_asce_limit() returns anything but -ENOMEM, -ENOMEM
 * if the mmu cache cannot be allocated, otherwise the error returned by
 * kvm_s390_mmu_cache_topup().
 */
int gmap_set_limit(struct gmap *gmap, gfn_t limit)
{
	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
	int type = gmap_limit_to_type(limit);
	int rc;

	mc = kvm_s390_new_mmu_cache();
	if (!mc)
		return -ENOMEM;

	for (;;) {
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			return rc;

		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			rc = dat_set_asce_limit(mc, &gmap->asce, type);

		if (rc != -ENOMEM)
			break;
	}

	/*
	 * NOTE(review): a non-zero result other than -ENOMEM is dropped here;
	 * confirm that dat_set_asce_limit() can only return 0 or -ENOMEM.
	 */
	return 0;
}
147
gmap_rmap_radix_tree_free(struct radix_tree_root * root)148 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
149 {
150 struct vsie_rmap *rmap, *rnext, *head;
151 struct radix_tree_iter iter;
152 unsigned long indices[16];
153 unsigned long index;
154 void __rcu **slot;
155 int i, nr;
156
157 /* A radix tree is freed by deleting all of its entries */
158 index = 0;
159 do {
160 nr = 0;
161 radix_tree_for_each_slot(slot, root, &iter, index) {
162 indices[nr] = iter.index;
163 if (++nr == 16)
164 break;
165 }
166 for (i = 0; i < nr; i++) {
167 index = indices[i];
168 head = radix_tree_delete(root, index);
169 gmap_for_each_rmap_safe(rmap, rnext, head)
170 kfree(rmap);
171 }
172 } while (nr > 0);
173 }
174
gmap_remove_child(struct gmap * child)175 void gmap_remove_child(struct gmap *child)
176 {
177 if (KVM_BUG_ON(!child->parent, child->kvm))
178 return;
179 lockdep_assert_held(&child->parent->children_lock);
180
181 list_del(&child->list);
182 child->parent = NULL;
183 child->invalidated = true;
184 }
185
186 /**
187 * gmap_dispose() - Remove and free a guest address space and its children.
188 * @gmap: Pointer to the guest address space structure.
189 */
gmap_dispose(struct gmap * gmap)190 void gmap_dispose(struct gmap *gmap)
191 {
192 /* The gmap must have been removed from the parent beforehands */
193 KVM_BUG_ON(gmap->parent, gmap->kvm);
194 /* All children of this gmap must have been removed beforehands */
195 KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
196 /* No VSIE shadow block is allowed to use this gmap */
197 KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
198 /* The ASCE must be valid */
199 KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
200 /* The refcount must be 0 */
201 KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);
202
203 /* Flush tlb of all gmaps */
204 asce_flush_tlb(gmap->asce);
205
206 /* Free all DAT tables. */
207 dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));
208
209 /* Free additional data for a shadow gmap */
210 if (is_shadow(gmap))
211 gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
212
213 kfree(gmap);
214 }
215
216 /**
217 * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
218 * @gmap: The gmap whose ASCE needs to be replaced.
219 *
220 * If the ASCE is a SEGMENT type then this function will return -EINVAL,
221 * otherwise the pointers in the host_to_guest radix tree will keep pointing
222 * to the wrong pages, causing use-after-free and memory corruption.
223 * If the allocation of the new top level page table fails, the ASCE is not
224 * replaced.
225 * In any case, the old ASCE is always removed from the gmap CRST list.
226 * Therefore the caller has to make sure to save a pointer to it
227 * beforehand, unless a leak is actually intended.
228 *
229 * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE,
230 * -ENOMEM if runinng out of memory.
231 */
s390_replace_asce(struct gmap * gmap)232 int s390_replace_asce(struct gmap *gmap)
233 {
234 struct crst_table *table;
235 union asce asce;
236
237 /* Replacing segment type ASCEs would cause serious issues */
238 if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
239 return -EINVAL;
240
241 table = dat_alloc_crst_sleepable(0);
242 if (!table)
243 return -ENOMEM;
244 memcpy(table, dereference_asce(gmap->asce), sizeof(*table));
245
246 /* Set new table origin while preserving existing ASCE control bits */
247 asce = gmap->asce;
248 asce.rsto = virt_to_pfn(table);
249 WRITE_ONCE(gmap->asce, asce);
250
251 return 0;
252 }
253
/**
 * _gmap_unmap_prefix() - Notify vCPUs whose prefix lies in a gfn range.
 * @gmap: The gmap the range belongs to.
 * @gfn: The first gfn of the range.
 * @end: The gfn after the last one of the range.
 * @hint: If true, give up as soon as an affected vCPU is found to be in SIE.
 *
 * For every vCPU with at least one of its two prefix pages inside the range,
 * a synchronous KVM_REQ_REFRESH_GUEST_PREFIX request is made. vCPUs visited
 * before giving up because of @hint have already received their request.
 *
 * Return: false if @gmap is a shadow gmap, or if @hint is set and an affected
 * vCPU is in SIE; true otherwise.
 */
bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
{
	struct kvm *kvm = gmap->kvm;
	struct kvm_vcpu *vcpu;
	unsigned long idx;
	gfn_t prefix;

	if (is_shadow(gmap))
		return false;

	kvm_for_each_vcpu(idx, vcpu, kvm) {
		/* Match against both prefix pages */
		prefix = gpa_to_gfn(kvm_s390_get_prefix(vcpu));
		if (prefix >= end || gfn > prefix + 1)
			continue;

		if (hint && kvm_s390_is_in_sie(vcpu))
			return false;
		VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
			   gfn_to_gpa(gfn), gfn_to_gpa(end));
		kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
	}
	return true;
}
276
277 struct clear_young_pte_priv {
278 struct gmap *gmap;
279 bool young;
280 };
281
/*
 * PTE callback of gmap_age_gfn().
 *
 * Entries with s.pr clear, and invalid entries that are already not young,
 * are skipped. Otherwise the entry is reported as young. Unless it is marked
 * with prefix_notif and gmap_mkold_prefix() refuses, the entry is also made
 * invalid, protected, not young and not dirty; the dirty state is handed over
 * to the folio first.
 *
 * Always returns 0.
 */
static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *priv = walk->priv;
	union pte old, aged;
	union pgste pgste;

	old = READ_ONCE(*ptep);
	if (!old.s.pr)
		return 0;
	if (!old.s.y && old.h.i)
		return 0;

	pgste = pgste_get_lock(ptep);
	if (!pgste.prefix_notif || gmap_mkold_prefix(priv->gmap, gfn, end)) {
		aged = old;
		aged.h.i = 1;
		aged.s.y = 0;
		/* Do not lose the dirty state when the entry gets protected. */
		if (!aged.s.s && (aged.s.d || !aged.h.p))
			folio_set_dirty(pfn_folio(old.h.pfra));
		aged.s.d = 0;
		aged.h.p = 1;

		pgste.prefix_notif = 0;
		pgste = __dat_ptep_xchg(ptep, pgste, aged, gfn, walk->asce,
					uses_skeys(priv->gmap));
	}
	priv->young = 1;
	pgste_set_unlock(ptep, pgste);

	return 0;
}
310
/*
 * Segment / region callback of gmap_age_gfn().
 *
 * Only large (fc set) entries are handled; invalid entries that are already
 * not young are skipped. Otherwise the entry is reported as young. Unless it
 * is a prefix entry and gmap_mkold_prefix() refuses, the entry is also
 * atomically made invalid, protected, not young and not dirty, retrying if
 * the exchange fails.
 *
 * Always returns 0.
 */
static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *priv = walk->priv;
	union crste old, aged;

	for (;;) {
		old = READ_ONCE(*crstep);

		if (!old.h.fc)
			return 0;
		if (!old.s.fc1.y && old.h.i)
			return 0;
		/* The prefix may not be aged now: report young, leave the entry. */
		if (crste_prefix(old) && !gmap_mkold_prefix(priv->gmap, gfn, end))
			break;

		aged = old;
		aged.h.i = 1;
		aged.s.fc1.y = 0;
		aged.s.fc1.prefix_notif = 0;
		/* Do not lose the dirty state when the entry gets protected. */
		if (aged.s.fc1.d || !aged.h.p)
			folio_set_dirty(phys_to_folio(crste_origin_large(old)));
		aged.s.fc1.d = 0;
		aged.h.p = 1;

		if (dat_crstep_xchg_atomic(crstep, old, aged, gfn, walk->asce))
			break;
	}

	priv->young = 1;
	return 0;
}
339
340 /**
341 * gmap_age_gfn() - Clear young.
342 * @gmap: The guest gmap.
343 * @start: The first gfn to test.
344 * @end: The gfn after the last one to test.
345 *
346 * Context: Called with the kvm mmu write lock held.
347 * Return: 1 if any page in the given range was young, otherwise 0.
348 */
gmap_age_gfn(struct gmap * gmap,gfn_t start,gfn_t end)349 bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
350 {
351 const struct dat_walk_ops ops = {
352 .pte_entry = gmap_clear_young_pte,
353 .pmd_entry = gmap_clear_young_crste,
354 .pud_entry = gmap_clear_young_crste,
355 };
356 struct clear_young_pte_priv priv = {
357 .gmap = gmap,
358 .young = false,
359 };
360
361 _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
362
363 return priv.young;
364 }
365
/* Private walk state of gmap_unmap_gfn_range() and its entry callbacks. */
struct gmap_unmap_priv {
	/* The gmap being unmapped from. */
	struct gmap *gmap;
	/* The memslot used to translate a gfn into a host virtual address. */
	struct kvm_memory_slot *slot;
};
370
/*
 * PTE callback of gmap_unmap_gfn_range(): replace the PTE with an empty one.
 *
 * For a present entry whose PGSTE usage state is PGSTE_GPS_USAGE_UNUSED, the
 * corresponding host address is handed to gmap_helper_try_set_pte_unused().
 * If GMAP_FLAG_EXPORT_ON_UNMAP is set, the folio of a present entry is
 * converted from secure after the PGSTE lock has been dropped.
 *
 * Always returns 0.
 */
static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
{
	struct gmap_unmap_priv *priv = w->priv;
	struct gmap *gmap = priv->gmap;
	struct folio *folio = NULL;
	unsigned long hva;
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	if (ptep->s.pr) {
		if (pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
			hva = __gfn_to_hva_memslot(priv->slot, gfn);
			gmap_helper_try_set_pte_unused(gmap->kvm->mm, hva);
		}
		/* Remember the folio; the entry is about to be cleared. */
		if (test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &gmap->flags))
			folio = pfn_folio(ptep->h.pfra);
	}
	pgste = gmap_ptep_xchg(gmap, ptep, _PTE_EMPTY, pgste, gfn);
	pgste_set_unlock(ptep, pgste);

	if (folio)
		uv_convert_from_secure_folio(folio);

	return 0;
}
392
/*
 * Segment / region callback of gmap_unmap_gfn_range(): replace a large entry
 * with an empty entry of the same table type. Entries that are not large are
 * left untouched.
 *
 * If GMAP_FLAG_EXPORT_ON_UNMAP is set, the folio of a present entry is
 * converted from secure after the entry has been replaced.
 *
 * Always returns 0.
 */
static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct gmap_unmap_priv *priv = walk->priv;
	struct gmap *gmap = priv->gmap;
	struct folio *folio = NULL;
	union crste cur = *crstep;

	if (!cur.h.fc)
		return 0;

	/* Remember the folio; the entry is about to be cleared. */
	if (cur.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &gmap->flags))
		folio = phys_to_folio(crste_origin_large(cur));

	/* No races should happen because kvm->mmu_lock is held in write mode */
	KVM_BUG_ON(!gmap_crstep_xchg_atomic(gmap, crstep, cur, _CRSTE_EMPTY(cur.h.tt), gfn),
		   gmap->kvm);

	if (folio)
		uv_convert_from_secure_folio(folio);

	return 0;
}
412
413 /**
414 * gmap_unmap_gfn_range() - Unmap a range of guest addresses.
415 * @gmap: The gmap to act on.
416 * @slot: The memslot in which the range is located.
417 * @start: The first gfn to unmap.
418 * @end: The gfn after the last one to unmap.
419 *
420 * Context: Called with the kvm mmu write lock held.
421 * Return: false
422 */
gmap_unmap_gfn_range(struct gmap * gmap,struct kvm_memory_slot * slot,gfn_t start,gfn_t end)423 bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
424 {
425 const struct dat_walk_ops ops = {
426 .pte_entry = _gmap_unmap_pte,
427 .pmd_entry = _gmap_unmap_crste,
428 .pud_entry = _gmap_unmap_crste,
429 };
430 struct gmap_unmap_priv priv = {
431 .gmap = gmap,
432 .slot = slot,
433 };
434
435 lockdep_assert_held_write(&gmap->kvm->mmu_lock);
436
437 _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
438 return false;
439 }
440
/*
 * Transfer the soft-dirty state of one PTE to the KVM dirty log.
 *
 * Nothing is done for entries with s.pr clear, or for entries that are
 * protected and not soft-dirty. Otherwise @gfn is marked dirty in the KVM
 * dirty log and, where permitted, the entry is protected again and its
 * soft-dirty bit is cleared.
 *
 * The caller holds the PGSTE lock of @ptep; the possibly updated PGSTE is
 * returned.
 */
static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
						  struct gmap *gmap)
{
	union pte pte = READ_ONCE(*ptep);

	if (!pte.s.pr)
		return pgste;
	if (pte.h.p && !pte.s.sd)
		return pgste;

	/*
	 * If this page contains one or more prefixes of vCPUs that are currently
	 * running, do not reset the protection, leave it marked as dirty.
	 */
	if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
		pte.h.p = 1;
		pte.s.sd = 0;
		pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn);
	}

	mark_page_dirty(gmap->kvm, gfn);

	return pgste;
}
463
/*
 * PTE callback of gmap_sync_dirty_log(): run
 * __pte_test_and_clear_softdirty() on the entry with its PGSTE locked.
 *
 * Always returns 0.
 */
static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
					  struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union pgste locked, updated;

	locked = pgste_get_lock(ptep);
	updated = __pte_test_and_clear_softdirty(ptep, locked, gfn, gmap);
	pgste_set_unlock(ptep, updated);

	return 0;
}
475
/*
 * Segment / region callback of gmap_sync_dirty_log().
 *
 * Only large entries that are writable or soft-dirty are handled. Where
 * permitted, such an entry is protected again and its soft-dirty bit is
 * cleared; every gfn in [@gfn, @end) is then marked dirty in the KVM dirty
 * log.
 *
 * Returns 1 if a fatal signal is pending for the current task, 0 otherwise.
 */
static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
					    struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union crste old, clean;

	if (fatal_signal_pending(current))
		return 1;

	for (;;) {
		old = READ_ONCE(*table);
		if (!old.h.fc)
			return 0;
		if (old.h.p && !old.s.fc1.sd)
			return 0;

		/*
		 * If this large page contains one or more prefixes of vCPUs that are
		 * currently running, do not reset the protection, leave it marked as
		 * dirty.
		 */
		if (old.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
			break;

		clean = old;
		clean.h.p = 1;
		clean.s.fc1.sd = 0;
		if (gmap_crstep_xchg_atomic(gmap, table, old, clean, gfn))
			break;
	}

	while (gfn < end) {
		mark_page_dirty(gmap->kvm, gfn);
		gfn++;
	}

	return 0;
}
508
/**
 * gmap_sync_dirty_log() - Transfer soft-dirty state to the KVM dirty log.
 * @gmap: The gmap to act on.
 * @start: The first gfn to process.
 * @end: The gfn after the last one to process.
 *
 * Context: Called with the kvm mmu lock held.
 */
void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops softdirty_ops = {
		.pte_entry = _pte_test_and_clear_softdirty,
		.pmd_entry = _crste_test_and_clear_softdirty,
		.pud_entry = _crste_test_and_clear_softdirty,
	};

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	_dat_walk_gfn_range(start, end, gmap->asce, &softdirty_ops, 0, gmap);
}
521
gmap_handle_minor_crste_fault(struct gmap * gmap,struct guest_fault * f)522 static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
523 {
524 union crste newcrste, oldcrste = READ_ONCE(*f->crstep);
525
526 /* Somehow the crste is not large anymore, let the slow path deal with it. */
527 if (!oldcrste.h.fc)
528 return 1;
529
530 f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
531 f->writable = oldcrste.s.fc1.w;
532
533 /* Appropriate permissions already (race with another handler), nothing to do. */
534 if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
535 return 0;
536
537 if (!f->write_attempt || oldcrste.s.fc1.w) {
538 f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d;
539 newcrste = oldcrste;
540 newcrste.h.i = 0;
541 newcrste.s.fc1.y = 1;
542 if (f->write_attempt) {
543 newcrste.h.p = 0;
544 newcrste.s.fc1.d = 1;
545 newcrste.s.fc1.sd = 1;
546 }
547 /* In case of races, let the slow path deal with it. */
548 return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
549 }
550 /* Trying to write on a read-only page, let the slow path deal with it. */
551 return 1;
552 }
553
_gmap_handle_minor_pte_fault(struct gmap * gmap,union pgste * pgste,struct guest_fault * f)554 static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
555 struct guest_fault *f)
556 {
557 union pte newpte, oldpte = READ_ONCE(*f->ptep);
558
559 f->pfn = oldpte.h.pfra;
560 f->writable = oldpte.s.w;
561
562 /* Appropriate permissions already (race with another handler), nothing to do. */
563 if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
564 return 0;
565 /* Trying to write on a read-only page, let the slow path deal with it. */
566 if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
567 return 1;
568
569 newpte = oldpte;
570 newpte.h.i = 0;
571 newpte.s.y = 1;
572 if (f->write_attempt) {
573 newpte.h.p = 0;
574 newpte.s.d = 1;
575 newpte.s.sd = 1;
576 }
577 *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
578
579 return 0;
580 }
581
582 /**
583 * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault.
584 * @gmap: The gmap whose fault needs to be resolved.
585 * @fault: Describes the fault that is being resolved.
586 *
587 * A minor fault is a fault that can be resolved quickly within gmap.
588 * The page is already mapped, the fault is only due to dirty/young tracking.
589 *
590 * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
591 * not be resolved and needs to go through the slow path.
592 */
gmap_try_fixup_minor(struct gmap * gmap,struct guest_fault * fault)593 int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
594 {
595 union pgste pgste;
596 int rc;
597
598 lockdep_assert_held(&gmap->kvm->mmu_lock);
599
600 rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
601 &fault->crstep, &fault->ptep);
602 /* If a PTE or a leaf CRSTE could not be reached, slow path. */
603 if (rc)
604 return 1;
605
606 if (fault->ptep) {
607 pgste = pgste_get_lock(fault->ptep);
608 rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
609 if (!rc && fault->callback)
610 fault->callback(fault);
611 pgste_set_unlock(fault->ptep, pgste);
612 } else {
613 rc = gmap_handle_minor_crste_fault(gmap, fault);
614 if (!rc && fault->callback)
615 fault->callback(fault);
616 }
617 return rc;
618 }
619
gmap_2g_allowed(struct gmap * gmap,struct guest_fault * f,struct kvm_memory_slot * slot)620 static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f,
621 struct kvm_memory_slot *slot)
622 {
623 return false;
624 }
625
626 /**
627 * gmap_1m_allowed() - Check whether a 1M hugepage is allowed.
628 * @gmap: The gmap of the guest.
629 * @f: Describes the fault that is being resolved.
630 * @slot: The memslot the faulting address belongs to.
631 *
632 * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_1M flag is set for
633 * @gmap, whether the offset of the address in the 1M virtual frame is the
634 * same as the offset in the physical 1M frame, and finally whether the whole
635 * 1M page would fit in the given memslot.
636 *
637 * Return: true if a 1M hugepage is allowed to back the faulting address, false
638 * otherwise.
639 */
gmap_1m_allowed(struct gmap * gmap,struct guest_fault * f,struct kvm_memory_slot * slot)640 static inline bool gmap_1m_allowed(struct gmap *gmap, struct guest_fault *f,
641 struct kvm_memory_slot *slot)
642 {
643 return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags) &&
644 !((f->gfn ^ f->pfn) & ~_SEGMENT_FR_MASK) &&
645 slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_SEGMENT) &&
646 slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_SEGMENT);
647 }
648
_gmap_link(struct kvm_s390_mmu_cache * mc,struct gmap * gmap,int level,struct guest_fault * f)649 static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
650 struct guest_fault *f)
651 {
652 union crste oldval, newval;
653 union pte newpte, oldpte;
654 union pgste pgste;
655 int rc = 0;
656
657 rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
658 &f->crstep, &f->ptep);
659 if (rc == -ENOMEM)
660 return rc;
661 if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
662 return rc;
663 if (rc)
664 return -EAGAIN;
665 if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
666 return -EINVAL;
667
668 if (f->ptep) {
669 pgste = pgste_get_lock(f->ptep);
670 oldpte = *f->ptep;
671 newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
672 newpte.s.sd = oldpte.s.sd;
673 oldpte.s.sd = 0;
674 if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
675 pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
676 if (f->callback)
677 f->callback(f);
678 } else {
679 rc = -EAGAIN;
680 }
681 pgste_set_unlock(f->ptep, pgste);
682 } else {
683 do {
684 oldval = READ_ONCE(*f->crstep);
685 newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
686 f->write_attempt | oldval.s.fc1.d);
687 newval.s.fc1.s = !f->page;
688 newval.s.fc1.sd = oldval.s.fc1.sd;
689 if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
690 crste_origin_large(oldval) != crste_origin_large(newval))
691 return -EAGAIN;
692 } while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
693 if (f->callback)
694 f->callback(f);
695 }
696
697 return rc;
698 }
699
gmap_link(struct kvm_s390_mmu_cache * mc,struct gmap * gmap,struct guest_fault * f,struct kvm_memory_slot * slot)700 int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f,
701 struct kvm_memory_slot *slot)
702 {
703 unsigned int order;
704 int level;
705
706 lockdep_assert_held(&gmap->kvm->mmu_lock);
707
708 level = TABLE_TYPE_PAGE_TABLE;
709 if (f->page) {
710 order = folio_order(page_folio(f->page));
711 if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f, slot))
712 level = TABLE_TYPE_REGION3;
713 else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f, slot))
714 level = TABLE_TYPE_SEGMENT;
715 }
716 return _gmap_link(mc, gmap, level, f);
717 }
718
/*
 * Map one segment of the parent gmap, at @p_gfn, into @gmap at @c_gfn.
 *
 * The parent is walked first: with @force_alloc down to a page table, which
 * is allocated if needed, otherwise only down to the segment level. If a page
 * table is found, the segment entry of @gmap is made to point to it and the
 * parent segment number is recorded in the page table as PTVAL_VMADDR. If
 * not, an invalid segment entry carrying @p_gfn is installed instead.
 *
 * Returns 0 on success, otherwise the error returned by dat_entry_walk().
 */
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
	int flags = force_alloc ? DAT_WALK_ALLOC : DAT_WALK_ALLOC_CONTINUE;
	int level = force_alloc ? TABLE_TYPE_PAGE_TABLE : TABLE_TYPE_SEGMENT;
	union crste wanted, cur;
	struct page_table *pt;
	union crste *crstep;
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, flags, level, &crstep, &ptep);
	if (rc)
		return rc;

	if (ptep) {
		pt = pte_table_start(ptep);
		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
		wanted = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
	} else {
		wanted = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
		wanted.h.i = 1;
		wanted.h.fc0.tl = 1;
	}

	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
			    &crstep, &ptep);
	if (rc)
		return rc;

	/* Install the entry, unless it is already in place. */
	for (;;) {
		cur = READ_ONCE(*crstep);
		if (cur.val == wanted.val)
			break;
		if (dat_crstep_xchg_atomic(crstep, cur, wanted, c_gfn, gmap->asce))
			break;
	}

	return 0;
}
756
/*
 * Translate *@gaddr using only what is currently present in @gmap.
 *
 * Returns 0 if a page table was found; the segment part of *@gaddr is then
 * replaced with the PTVAL_VMADDR value stored in that page table. Returns 1
 * if there is no page table but the segment entry, returned in *@crstepp, is
 * a ucas entry; *@gaddr is left untouched. Returns -EREMOTE in all other
 * cases.
 */
static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
{
	union pte *ptep;

	if (dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
			   TABLE_TYPE_SEGMENT, crstepp, &ptep))
		return -EREMOTE;

	if (!ptep)
		return crste_is_ucas(**crstepp) ? 1 : -EREMOTE;

	/* Keep the offset within the segment, replace the segment itself. */
	*gaddr = (*gaddr & ~_SEGMENT_MASK) |
		 (dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT);

	return 0;
}
772
773 /**
774 * gmap_ucas_translate() - Translate a vcpu address into a host gmap address
775 * @mc: The memory cache to be used for allocations.
776 * @gmap: The per-cpu gmap.
777 * @gaddr: Pointer to the address to be translated, will get overwritten with
778 * the translated address in case of success.
779 * Translates the per-vCPU guest address into a fake guest address, which can
780 * then be used with the fake memslots that are identity mapping userspace.
781 * This allows ucontrol VMs to use the normal fault resolution path, like
782 * normal VMs.
783 *
784 * Return: %0 in case of success, otherwise %-EREMOTE.
785 */
gmap_ucas_translate(struct kvm_s390_mmu_cache * mc,struct gmap * gmap,gpa_t * gaddr)786 int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr)
787 {
788 gpa_t translated_address;
789 union crste *crstep;
790 gfn_t gfn;
791 int rc;
792
793 gfn = gpa_to_gfn(*gaddr);
794
795 scoped_guard(read_lock, &gmap->kvm->mmu_lock) {
796 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
797 if (rc <= 0)
798 return rc;
799 }
800 do {
801 scoped_guard(write_lock, &gmap->kvm->mmu_lock) {
802 rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
803 if (rc <= 0)
804 return rc;
805 translated_address = (*gaddr & ~_SEGMENT_MASK) |
806 (crstep->val & _SEGMENT_MASK);
807 rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true);
808 }
809 if (!rc) {
810 *gaddr = translated_address;
811 return 0;
812 }
813 if (rc != -ENOMEM)
814 return -EREMOTE;
815 rc = kvm_s390_mmu_cache_topup(mc);
816 if (rc)
817 return rc;
818 } while (1);
819 return 0;
820 }
821
/**
 * gmap_ucas_map() - Map a range of parent segments into a ucontrol gmap.
 * @gmap: The gmap to map into.
 * @p_gfn: The first gfn in the parent gmap.
 * @c_gfn: The first gfn in @gmap.
 * @count: The number of mappings to create; both gfns advance by
 *         _PAGE_ENTRIES for each of them.
 *
 * Each mapping is created with the kvm mmu lock held in write mode. When a
 * mapping runs out of memory, the mmu cache is topped up without the lock
 * and the same mapping is retried.
 *
 * Return: 0 on success, -ENOMEM if the mmu cache cannot be allocated,
 * otherwise the error returned by gmap_ucas_map_one() or
 * kvm_s390_mmu_cache_topup().
 */
int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
{
	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
	int rc;

	mc = kvm_s390_new_mmu_cache();
	if (!mc)
		return -ENOMEM;

	while (count) {
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false);

		if (!rc) {
			/* Mapped, move on to the next one. */
			count--;
			c_gfn += _PAGE_ENTRIES;
			p_gfn += _PAGE_ENTRIES;
			continue;
		}
		if (rc != -ENOMEM)
			return rc;

		/* Out of memory: refill the cache and retry the same mapping. */
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			return rc;
	}

	return 0;
}
849
/*
 * Replace the segment entry for @c_gfn in @gmap with _PMD_EMPTY, retrying
 * until the atomic exchange succeeds. Nothing is done if the segment entry
 * cannot be reached.
 */
static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
{
	union crste *crstep;
	union crste cur;
	union pte *ptep;

	if (dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep))
		return;

	do {
		cur = READ_ONCE(*crstep);
	} while (!dat_crstep_xchg_atomic(crstep, cur, _PMD_EMPTY, c_gfn, gmap->asce));
}
862
/**
 * gmap_ucas_unmap() - Remove a range of mappings from a ucontrol gmap.
 * @gmap: The gmap to act on.
 * @c_gfn: The first gfn to unmap.
 * @count: The number of mappings to remove; the gfn advances by
 *         _PAGE_ENTRIES for each of them.
 *
 * The kvm mmu lock is held in read mode for the whole operation.
 */
void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
{
	guard(read_lock)(&gmap->kvm->mmu_lock);

	while (count) {
		gmap_ucas_unmap_one(gmap, c_gfn);
		count--;
		c_gfn += _PAGE_ENTRIES;
	}
}
870
/*
 * Segment / region callback of gmap_split_huge_pages(): replace a leaf entry
 * with an empty entry of the same table type.
 *
 * Before every attempt, prefix and vsie notifications are issued as required
 * by the entry. The exchange is retried for as long as the entry is a leaf.
 *
 * Returns @next if a reschedule is needed, 0 otherwise.
 */
static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union crste cur, empty;

	cur = READ_ONCE(*crstep);
	empty = _CRSTE_EMPTY(cur.h.tt);

	for (; crste_leaf(cur); cur = READ_ONCE(*crstep)) {
		if (crste_prefix(cur))
			gmap_unmap_prefix(gmap, gfn, next);
		if (cur.s.fc1.vsie_notif)
			gmap_handle_vsie_unshadow_event(gmap, gfn);
		if (dat_crstep_xchg_atomic(crstep, cur, empty, gfn, walk->asce))
			break;
	}

	return need_resched() ? next : 0;
}
894
gmap_split_huge_pages(struct gmap * gmap)895 void gmap_split_huge_pages(struct gmap *gmap)
896 {
897 const struct dat_walk_ops ops = {
898 .pmd_entry = _gmap_split_crste,
899 .pud_entry = _gmap_split_crste,
900 };
901 gfn_t start = 0;
902
903 do {
904 scoped_guard(read_lock, &gmap->kvm->mmu_lock)
905 start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce,
906 &ops, DAT_WALK_IGN_HOLES, gmap);
907 cond_resched();
908 } while (start);
909 }
910
_gmap_enable_skeys(struct gmap * gmap)911 static int _gmap_enable_skeys(struct gmap *gmap)
912 {
913 gfn_t start = 0;
914 int rc;
915
916 if (uses_skeys(gmap))
917 return 0;
918
919 set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
920 rc = gmap_helper_disable_cow_sharing();
921 if (rc) {
922 clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
923 return rc;
924 }
925
926 do {
927 scoped_guard(write_lock, &gmap->kvm->mmu_lock)
928 start = dat_reset_skeys(gmap->asce, start);
929 cond_resched();
930 } while (start);
931 return 0;
932 }
933
gmap_enable_skeys(struct gmap * gmap)934 int gmap_enable_skeys(struct gmap *gmap)
935 {
936 int rc;
937
938 mmap_write_lock(gmap->kvm->mm);
939 rc = _gmap_enable_skeys(gmap);
940 mmap_write_unlock(gmap->kvm->mm);
941 return rc;
942 }
943
/*
 * PTE callback of gmap_pv_destroy_range(): destroy the page mapped by the
 * entry, if s.pr is set.
 *
 * Returns @next if a page was destroyed and a reschedule is needed, 0
 * otherwise.
 */
static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	if (!ptep->s.pr)
		return 0;

	__kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));

	return need_resched() ? next : 0;
}
953
/*
 * Segment / region callback of gmap_pv_destroy_range(): destroy every page
 * of a present large mapping that lies inside the range being walked.
 *
 * Returns @next if pages were processed and a reschedule is needed, 0
 * otherwise.
 */
static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	phys_addr_t base, pos, limit;

	if (!crstep->h.fc)
		return 0;
	if (!crstep->s.fc1.pr)
		return 0;

	/* Clamp the large mapping to the boundaries of the walk. */
	base = crste_origin_large(*crstep);
	pos = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + base;
	limit = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + base;

	while (pos < limit) {
		__kvm_s390_pv_destroy_page(phys_to_page(pos));
		pos += PAGE_SIZE;
	}

	return need_resched() ? next : 0;
}
970
/**
 * gmap_pv_destroy_range() - Destroy all pages mapped in a range of a gmap.
 * @gmap: The gmap to act on.
 * @start: The first gfn of the range.
 * @end: The gfn after the last one of the range.
 * @interruptible: Whether a pending fatal signal aborts the operation.
 *
 * The range is walked piecewise, ignoring holes, with the kvm mmu lock held
 * in read mode. Between two pieces the lock is dropped and cond_resched() is
 * called.
 *
 * Return: 0 once the range has been processed, -EINTR if @interruptible is
 * set and a fatal signal is pending.
 */
int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
{
	const struct dat_walk_ops destroy_ops = {
		.pte_entry = _destroy_pages_pte,
		.pmd_entry = _destroy_pages_crste,
		.pud_entry = _destroy_pages_crste,
	};

	for (;;) {
		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
			start = _dat_walk_gfn_range(start, end, gmap->asce, &destroy_ops,
						    DAT_WALK_IGN_HOLES, NULL);
		if (interruptible && fatal_signal_pending(current))
			return -EINTR;
		cond_resched();

		/* The walk returns where to resume, or 0 when it is done. */
		if (!start || start >= end)
			break;
	}

	return 0;
}
989
/**
 * gmap_insert_rmap() - Add a reverse mapping to a shadow gmap.
 * @sg: The shadow gmap.
 * @p_gfn: The gfn used as index into the host_to_rmap radix tree.
 * @r_gfn: The gfn stored in the new rmap.
 * @level: The level stored in the new rmap.
 *
 * The new rmap is put at the head of the list of rmaps stored at @p_gfn. If
 * that list already has an rmap with the same value, nothing is added and the
 * new rmap is freed again.
 *
 * Context: Called with the host_to_rmap_lock of @sg held.
 * Return: 0 on success or if an equal rmap already exists, -ENOMEM if the
 * rmap cannot be allocated, otherwise the error returned by
 * radix_tree_insert().
 */
int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
{
	struct vsie_rmap *rmap __free(kvfree) = NULL;
	struct vsie_rmap *cur;
	void __rcu **slot;
	int rc;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	lockdep_assert_held(&sg->host_to_rmap_lock);

	rmap = kzalloc_obj(*rmap, GFP_ATOMIC);
	if (!rmap)
		return -ENOMEM;

	rmap->r_gfn = r_gfn;
	rmap->level = level;

	slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
	if (!slot) {
		/* First rmap for this index. */
		rmap->next = NULL;
		rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
		if (rc)
			return rc;
	} else {
		rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
		/* Avoid duplicates; the new rmap is freed on return. */
		cur = rmap->next;
		while (cur) {
			if (cur->val == rmap->val)
				return 0;
			cur = cur->next;
		}
		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
	}

	/* The tree owns the rmap now, prevent the automatic free. */
	rmap = NULL;

	return 0;
}
1024
/*
 * Write-protect the parent PTE for @p_gfn on behalf of the shadow gmap @sg and
 * mark its PGSTE with vsie_notif.
 *
 * For levels up to TABLE_TYPE_REGION1 an rmap from @p_gfn to @r_gfn (masked
 * according to @level) is inserted first. If the PTE does not have its pr bit
 * set, a new PTE is built from @pfn and @wr.
 *
 * Context: Called with sg->parent->children_lock held.
 *
 * Return: 0 on success, -EINVAL if @sg is not a shadow gmap or @level is not
 * above TABLE_TYPE_PAGE_TABLE, -EAGAIN if the PGSTE lock could not be taken,
 * or the error returned by dat_entry_walk() or gmap_insert_rmap().
 */
int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
		      kvm_pfn_t pfn, int level, bool wr)
{
	int walk_flags = DAT_WALK_SPLIT_ALLOC;
	union crste *crstep;
	union pgste pgste;
	union pte new_pte;
	union pte *ptep;
	int rc;

	if (KVM_BUG_ON(!is_shadow(sg) || level <= TABLE_TYPE_PAGE_TABLE, sg->kvm))
		return -EINVAL;
	lockdep_assert_held(&sg->parent->children_lock);

	if (uses_skeys(sg->parent))
		walk_flags |= DAT_WALK_USES_SKEYS;
	rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, walk_flags,
			    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	if (rc)
		return rc;

	if (level <= TABLE_TYPE_REGION1) {
		/* Clear the low 8 + 11 * level bits of the rmap target frame. */
		unsigned long mask = -1UL << (8 + 11 * level);

		scoped_guard(spinlock, &sg->host_to_rmap_lock)
			rc = gmap_insert_rmap(sg, p_gfn, r_gfn & mask, level);
		if (rc)
			return rc;
	}

	if (!pgste_get_trylock(ptep, &pgste))
		return -EAGAIN;
	if (ptep->s.pr)
		new_pte = *ptep;
	else
		new_pte = _pte(pfn, wr, false, false);
	new_pte.h.p = 1;
	pgste = _gmap_ptep_xchg(sg->parent, ptep, new_pte, pgste, p_gfn, false);
	pgste.vsie_notif = 1;
	pgste_set_unlock(ptep, pgste);

	return 0;
}
1062
/*
 * PTE callback used by gmap_set_cmma_all_dirty(): atomically sets
 * PGSTE_CMMA_D_BIT in the PGSTE belonging to @ptep.
 *
 * Return: @next if a reschedule is pending (presumably this makes the walker
 * stop and report @next to its caller), 0 otherwise.
 */
static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);

	return need_resched() ? next : 0;
}
1070
gmap_set_cmma_all_dirty(struct gmap * gmap)1071 void gmap_set_cmma_all_dirty(struct gmap *gmap)
1072 {
1073 const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
1074 gfn_t gfn = 0;
1075
1076 do {
1077 scoped_guard(read_lock, &gmap->kvm->mmu_lock)
1078 gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
1079 DAT_WALK_IGN_HOLES, NULL);
1080 cond_resched();
1081 } while (gfn);
1082 }
1083
/*
 * Remove the shadow table entry of @sg that covers @r_gfn at table level
 * @level.
 *
 * The vsie notifier is called for the naturally aligned range covered by one
 * entry at that level (a single page for TABLE_TYPE_PAGE_TABLE) and @sg is
 * flagged as invalidated. A PTE is replaced with _PTE_EMPTY; a higher level
 * entry is cleared and, if it pointed to a lower level table, that table is
 * freed.
 */
static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
{
	gpa_t gaddr = gfn_to_gpa(r_gfn);
	unsigned long granule;
	union crste *crstep;
	union crste old;
	union pte *ptep;

	granule = level > TABLE_TYPE_PAGE_TABLE ? 1UL << (11 * level + _SEGMENT_SHIFT) : PAGE_SIZE;
	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, granule), ALIGN(gaddr + 1, granule));
	sg->invalidated = true;

	/* Nothing to remove if the entry cannot be reached. */
	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
		return;

	if (ptep) {
		if (READ_ONCE(*ptep).val == _PTE_EMPTY.val)
			return;
		dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
		return;
	}

	old = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
	/* Leaf and invalid entries have no lower level table to free. */
	if (crste_leaf(old) || old.h.i)
		return;
	if (!is_pmd(old)) {
		dat_free_level(dereference_crste(old), true);
		return;
	}
	dat_free_pt(dereference_pmd(old.pmd));
}
1112
gmap_unshadow(struct gmap * sg)1113 static void gmap_unshadow(struct gmap *sg)
1114 {
1115 struct gmap_cache *gmap_cache, *next;
1116
1117 KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1118 KVM_BUG_ON(!sg->parent, sg->kvm);
1119
1120 lockdep_assert_held(&sg->parent->children_lock);
1121
1122 gmap_remove_child(sg);
1123 kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);
1124
1125 list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
1126 gmap_cache->gmap = NULL;
1127 list_del(&gmap_cache->list);
1128 }
1129
1130 gmap_put(sg);
1131 }
1132
/*
 * Handle an unshadow event for frame @gfn of the @parent gmap.
 *
 * For every shadow gmap of @parent:
 *  - if it is not a real-space shadow and @gfn lies within the tl + 1 frames
 *    starting at the origin in its guest ASCE, the whole shadow is removed;
 *  - otherwise all rmaps recorded for @gfn are taken out of the shadow's
 *    host_to_rmap tree, the shadow entries they refer to are removed and the
 *    rmaps are freed.
 */
void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
{
	struct vsie_rmap *rmap, *rmap_next, *head;
	struct gmap *sg, *sg_next;
	gfn_t first, last;

	list_for_each_entry_safe(sg, sg_next, &parent->children, list) {
		first = sg->guest_asce.rsto;
		last = first + sg->guest_asce.tl + 1;
		if (sg->guest_asce.r || gfn < first || gfn >= last) {
			/* Detach the whole chain under the lock, process it afterwards. */
			scoped_guard(spinlock, &sg->host_to_rmap_lock)
				head = radix_tree_delete(&sg->host_to_rmap, gfn);
			gmap_for_each_rmap_safe(rmap, rmap_next, head) {
				gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
				kfree(rmap);
			}
		} else {
			gmap_unshadow(sg);
		}
	}
}
1154
1155 /**
1156 * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
1157 * @parent: Pointer to the parent gmap.
1158 * @asce: ASCE for which the shadow table is created.
1159 * @edat_level: Edat level to be used for the shadow translation.
1160 *
1161 * Context: Called with parent->children_lock held.
1162 *
1163 * Return: The pointer to a gmap if a shadow table with the given asce is
1164 * already available, ERR_PTR(-EAGAIN) if another one is just being created,
1165 * otherwise NULL.
1166 */
gmap_find_shadow(struct gmap * parent,union asce asce,int edat_level)1167 static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level)
1168 {
1169 struct gmap *sg;
1170
1171 lockdep_assert_held(&parent->children_lock);
1172 list_for_each_entry(sg, &parent->children, list) {
1173 if (!gmap_is_shadow_valid(sg, asce, edat_level))
1174 continue;
1175 return sg;
1176 }
1177 return NULL;
1178 }
1179
/* Number of pages spanned by a table of _CRST_TABLE_SIZE bytes. */
#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE)
/*
 * State shared by the helpers that protect the top-level table of a shadow
 * gmap.
 */
struct gmap_protect_asce_top_level {
	/* Snapshot of kvm->mmu_invalidate_seq, used for the retry checks. */
	unsigned long seq;
	/* One fault-in descriptor per page of the top-level table. */
	struct guest_fault f[CRST_TABLE_PAGES];
};
1185
__gmap_protect_asce_top_level(struct kvm_s390_mmu_cache * mc,struct gmap * sg,struct gmap_protect_asce_top_level * context)1186 static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
1187 struct gmap_protect_asce_top_level *context)
1188 {
1189 struct gmap *parent;
1190 int rc, i;
1191
1192 guard(write_lock)(&sg->kvm->mmu_lock);
1193
1194 if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
1195 return -EAGAIN;
1196
1197 parent = READ_ONCE(sg->parent);
1198 if (!parent)
1199 return -EAGAIN;
1200 scoped_guard(spinlock, &parent->children_lock) {
1201 if (READ_ONCE(sg->parent) != parent)
1202 return -EAGAIN;
1203 sg->invalidated = false;
1204 for (i = 0; i < CRST_TABLE_PAGES; i++) {
1205 if (!context->f[i].valid)
1206 continue;
1207 rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn,
1208 TABLE_TYPE_REGION1 + 1, context->f[i].writable);
1209 if (rc)
1210 return rc;
1211 }
1212 gmap_add_child(sg->parent, sg);
1213 }
1214
1215 kvm_s390_release_faultin_array(sg->kvm, context->f, false);
1216 return 0;
1217 }
1218
_gmap_protect_asce_top_level(struct kvm_s390_mmu_cache * mc,struct gmap * sg,struct gmap_protect_asce_top_level * context)1219 static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
1220 struct gmap_protect_asce_top_level *context)
1221 {
1222 int rc;
1223
1224 if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f))
1225 return -EAGAIN;
1226 do {
1227 rc = kvm_s390_mmu_cache_topup(mc);
1228 if (rc)
1229 return rc;
1230 rc = radix_tree_preload(GFP_KERNEL);
1231 if (rc)
1232 return rc;
1233 rc = __gmap_protect_asce_top_level(mc, sg, context);
1234 radix_tree_preload_end();
1235 } while (rc == -ENOMEM);
1236
1237 return rc;
1238 }
1239
gmap_protect_asce_top_level(struct kvm_s390_mmu_cache * mc,struct gmap * sg)1240 static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg)
1241 {
1242 struct gmap_protect_asce_top_level context = {};
1243 union asce asce = sg->guest_asce;
1244 int rc;
1245
1246 KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1247
1248 context.seq = sg->kvm->mmu_invalidate_seq;
1249 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
1250 smp_rmb();
1251
1252 rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false);
1253 if (rc > 0)
1254 rc = -EFAULT;
1255 if (!rc)
1256 rc = _gmap_protect_asce_top_level(mc, sg, &context);
1257 if (rc)
1258 kvm_s390_release_faultin_array(sg->kvm, context.f, true);
1259 return rc;
1260 }
1261
/**
 * gmap_create_shadow() - Create/find a shadow guest address space.
 * @mc: The cache to use to allocate dat tables.
 * @parent: Pointer to the parent gmap.
 * @asce: ASCE for which the shadow table is created.
 * @edat_level: Edat level to be used for the shadow translation.
 *
 * The pages of the top level page table referred by the asce parameter
 * will be set to read-only and marked in the PGSTEs of the kvm process.
 * The shadow table will be removed automatically on any change to the
 * PTE mapping for the source table.
 *
 * For a real-space @asce there is no top level table to protect; any existing
 * real-space shadow of @parent is removed and the new one is added directly.
 *
 * The returned shadow gmap will be returned with one extra reference.
 *
 * Return: A guest address space structure, ERR_PTR(-EINVAL) if @parent is
 * NULL, ERR_PTR(-ENOMEM) if out of memory, ERR_PTR(-EAGAIN) if the caller has
 * to retry and ERR_PTR(-EFAULT) if the parent gmap table could not be
 * protected.
 */
struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
				union asce asce, int edat_level)
{
	struct gmap *sg, *new;
	int rc;

	if (WARN_ON(!parent))
		return ERR_PTR(-EINVAL);

	/* Fast path: reuse an existing shadow, taking the extra reference. */
	scoped_guard(spinlock, &parent->children_lock) {
		sg = gmap_find_shadow(parent, asce, edat_level);
		if (sg) {
			gmap_get(sg);
			return sg;
		}
	}
	/* Create a new shadow gmap. */
	new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
	if (!new)
		return ERR_PTR(-ENOMEM);
	new->guest_asce = asce;
	new->edat_level = edat_level;
	set_bit(GMAP_FLAG_SHADOW, &new->flags);

	scoped_guard(spinlock, &parent->children_lock) {
		/* Recheck if another CPU created the same shadow. */
		sg = gmap_find_shadow(parent, asce, edat_level);
		if (sg) {
			/* Drop the unused new gmap and hand out the existing one. */
			gmap_put(new);
			gmap_get(sg);
			return sg;
		}
		if (asce.r) {
			/* Only allow one real-space gmap shadow. */
			list_for_each_entry(sg, &parent->children, list) {
				if (sg->guest_asce.r) {
					/*
					 * NOTE(review): mmu_lock is taken here while
					 * children_lock is held, whereas
					 * __gmap_protect_asce_top_level() takes
					 * mmu_lock first and children_lock second.
					 * Confirm that this nesting is safe.
					 */
					scoped_guard(write_lock, &parent->kvm->mmu_lock)
						gmap_unshadow(sg);
					break;
				}
			}
			gmap_add_child(parent, new);
			/* Nothing to protect, return right away. */
			gmap_get(new);
			return new;
		}
	}

	/* Extra reference for the caller, on top of the one set by gmap_new(). */
	gmap_get(new);
	new->parent = parent;
	/* Protect while inserting, protects against invalidation races. */
	rc = gmap_protect_asce_top_level(mc, new);
	if (rc) {
		new->parent = NULL;
		/* Drop both the caller's reference and the initial one. */
		gmap_put(new);
		gmap_put(new);
		return ERR_PTR(rc);
	}
	return new;
}
1340