xref: /linux/arch/s390/kvm/dat.c (revision 8934827db5403eae57d4537114a9ff88b0a8460f)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  KVM guest address space mapping code
4  *
5  *    Copyright IBM Corp. 2007, 2020, 2024
6  *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
7  *		 Martin Schwidefsky <schwidefsky@de.ibm.com>
8  *		 David Hildenbrand <david@redhat.com>
9  *		 Janosch Frank <frankja@linux.ibm.com>
10  */
11 
12 #include <linux/kernel.h>
13 #include <linux/pagewalk.h>
14 #include <linux/swap.h>
15 #include <linux/smp.h>
16 #include <linux/spinlock.h>
17 #include <linux/slab.h>
18 #include <linux/swapops.h>
19 #include <linux/ksm.h>
20 #include <linux/mm.h>
21 #include <linux/mman.h>
22 #include <linux/pgtable.h>
23 #include <linux/kvm_types.h>
24 #include <linux/kvm_host.h>
25 #include <linux/pgalloc.h>
26 
27 #include <asm/page-states.h>
28 #include <asm/tlb.h>
29 #include "dat.h"
30 
kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache * mc)31 int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc)
32 {
33 	void *o;
34 
35 	for ( ; mc->n_crsts < KVM_S390_MMU_CACHE_N_CRSTS; mc->n_crsts++) {
36 		o = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER);
37 		if (!o)
38 			return -ENOMEM;
39 		mc->crsts[mc->n_crsts] = o;
40 	}
41 	for ( ; mc->n_pts < KVM_S390_MMU_CACHE_N_PTS; mc->n_pts++) {
42 		o = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
43 		if (!o)
44 			return -ENOMEM;
45 		mc->pts[mc->n_pts] = o;
46 	}
47 	for ( ; mc->n_rmaps < KVM_S390_MMU_CACHE_N_RMAPS; mc->n_rmaps++) {
48 		o = kzalloc_obj(*mc->rmaps[0], GFP_KERNEL_ACCOUNT);
49 		if (!o)
50 			return -ENOMEM;
51 		mc->rmaps[mc->n_rmaps] = o;
52 	}
53 	return 0;
54 }
55 
/*
 * Take a page table from the cache and mark its page as guest DAT memory.
 * The table contents are NOT initialized. Returns NULL if the cache is empty.
 */
static inline struct page_table *dat_alloc_pt_noinit(struct kvm_s390_mmu_cache *mc)
{
	struct page_table *pt = kvm_s390_mmu_cache_alloc_pt(mc);

	if (pt)
		__arch_set_page_dat(pt, 1);
	return pt;
}
65 
dat_alloc_crst_noinit(struct kvm_s390_mmu_cache * mc)66 static inline struct crst_table *dat_alloc_crst_noinit(struct kvm_s390_mmu_cache *mc)
67 {
68 	struct crst_table *res;
69 
70 	res = kvm_s390_mmu_cache_alloc_crst(mc);
71 	if (res)
72 		__arch_set_page_dat(res, 1UL << CRST_ALLOC_ORDER);
73 	return res;
74 }
75 
dat_alloc_crst_sleepable(unsigned long init)76 struct crst_table *dat_alloc_crst_sleepable(unsigned long init)
77 {
78 	struct page *page;
79 	void *virt;
80 
81 	page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER);
82 	if (!page)
83 		return NULL;
84 	virt = page_to_virt(page);
85 	__arch_set_page_dat(virt, 1UL << CRST_ALLOC_ORDER);
86 	crst_table_init(virt, init);
87 	return virt;
88 }
89 
/*
 * Recursively free a CRST table and every lower-level table reachable from it.
 * @owns_ptes: if true, page tables hanging off segment entries are freed too.
 */
void dat_free_level(struct crst_table *table, bool owns_ptes)
{
	union crste entry;
	unsigned int i;

	for (i = 0; i < _CRST_ENTRIES; i++) {
		entry = table->crstes[i];
		/* Large (fc) or invalid entries have no lower-level table. */
		if (entry.h.fc || entry.h.i)
			continue;
		if (!is_pmd(entry))
			dat_free_level(dereference_crste(entry), owns_ptes);
		else if (owns_ptes)
			dat_free_pt(dereference_pmd(entry.pmd));
	}
	dat_free_crst(table);
}
104 
dat_set_asce_limit(struct kvm_s390_mmu_cache * mc,union asce * asce,int newtype)105 int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype)
106 {
107 	struct crst_table *table;
108 	union crste crste;
109 
110 	while (asce->dt > newtype) {
111 		table = dereference_asce(*asce);
112 		crste = table->crstes[0];
113 		if (crste.h.fc)
114 			return 0;
115 		if (!crste.h.i) {
116 			asce->rsto = crste.h.fc0.to;
117 			dat_free_crst(table);
118 		} else {
119 			crste.h.tt--;
120 			crst_table_init((void *)table, crste.val);
121 		}
122 		asce->dt--;
123 	}
124 	while (asce->dt < newtype) {
125 		crste = _crste_fc0(asce->rsto, asce->dt + 1);
126 		table = dat_alloc_crst_noinit(mc);
127 		if (!table)
128 			return -ENOMEM;
129 		crst_table_init((void *)table, _CRSTE_HOLE(crste.h.tt).val);
130 		table->crstes[0] = crste;
131 		asce->rsto = __pa(table) >> PAGE_SHIFT;
132 		asce->dt++;
133 	}
134 	return 0;
135 }
136 
137 /**
138  * dat_crstep_xchg() - Exchange a gmap CRSTE with another.
139  * @crstep: Pointer to the CRST entry
140  * @new: Replacement entry.
141  * @gfn: The affected guest address.
142  * @asce: The ASCE of the address space.
143  *
144  * Context: This function is assumed to be called with kvm->mmu_lock held.
145  */
dat_crstep_xchg(union crste * crstep,union crste new,gfn_t gfn,union asce asce)146 void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce)
147 {
148 	if (crstep->h.i) {
149 		WRITE_ONCE(*crstep, new);
150 		return;
151 	} else if (cpu_has_edat2()) {
152 		crdte_crste(crstep, *crstep, new, gfn, asce);
153 		return;
154 	}
155 
156 	if (machine_has_tlb_guest())
157 		idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL);
158 	else
159 		idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL);
160 	WRITE_ONCE(*crstep, new);
161 }
162 
163 /**
164  * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
165  * @crstep: Pointer to the CRST entry.
166  * @old: Expected old value.
167  * @new: Replacement entry.
168  * @gfn: The affected guest address.
169  * @asce: The asce of the address space.
170  *
171  * This function is needed to atomically exchange a CRSTE that potentially
172  * maps a prefix area, without having to invalidate it inbetween.
173  *
174  * Context: This function is assumed to be called with kvm->mmu_lock held.
175  *
176  * Return: %true if the exchange was successful.
177  */
dat_crstep_xchg_atomic(union crste * crstep,union crste old,union crste new,gfn_t gfn,union asce asce)178 bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
179 			    union asce asce)
180 {
181 	if (old.h.i)
182 		return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
183 	if (cpu_has_edat2())
184 		return crdte_crste(crstep, old, new, gfn, asce);
185 	return cspg_crste(crstep, old, new);
186 }
187 
dat_set_storage_key_from_pgste(union pte pte,union pgste pgste)188 static void dat_set_storage_key_from_pgste(union pte pte, union pgste pgste)
189 {
190 	union skey nkey = { .acc = pgste.acc, .fp = pgste.fp };
191 
192 	page_set_storage_key(pte_origin(pte), nkey.skey, 0);
193 }
194 
/* Copy the hardware storage key from the page behind @old to that of @new. */
static void dat_move_storage_key(union pte old, union pte new)
{
	unsigned char key = page_get_storage_key(pte_origin(old));

	page_set_storage_key(pte_origin(new), key, 1);
}
199 
dat_save_storage_key_into_pgste(union pte pte,union pgste pgste)200 static union pgste dat_save_storage_key_into_pgste(union pte pte, union pgste pgste)
201 {
202 	union skey skey;
203 
204 	skey.skey = page_get_storage_key(pte_origin(pte));
205 
206 	pgste.acc = skey.acc;
207 	pgste.fp = skey.fp;
208 	pgste.gr |= skey.r;
209 	pgste.gc |= skey.c;
210 
211 	return pgste;
212 }
213 
/**
 * __dat_ptep_xchg() - Replace a PTE, flushing the TLB and moving storage keys.
 * @ptep: Pointer to the PTE to replace.
 * @pgste: The PGSTE belonging to @ptep (the PGSTE lock is held by the caller).
 * @new: The replacement PTE value.
 * @gfn: The affected guest frame.
 * @asce: The ASCE of the address space.
 * @uses_skeys: Whether the guest uses storage keys.
 *
 * Context: Called with the PGSTE lock held.
 * Return: The (possibly updated) PGSTE, still locked.
 */
union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, gfn_t gfn,
			    union asce asce, bool uses_skeys)
{
	union pte old = READ_ONCE(*ptep);

	/* Updating only the software bits while holding the pgste lock. */
	if (!((ptep->val ^ new.val) & ~_PAGE_SW_BITS)) {
		WRITE_ONCE(ptep->swbyte, new.swbyte);
		return pgste;
	}

	/* The old entry was valid: invalidate it and flush the TLB with IPTE. */
	if (!old.h.i) {
		unsigned long opts = IPTE_GUEST_ASCE | (pgste.nodat ? IPTE_NODAT : 0);

		if (machine_has_tlb_guest())
			__ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, opts, asce.val, IPTE_GLOBAL);
		else
			__ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, 0, 0, IPTE_GLOBAL);
	}

	if (uses_skeys) {
		if (old.h.i && !new.h.i)
			/* Invalid to valid: restore storage keys from PGSTE. */
			dat_set_storage_key_from_pgste(new, pgste);
		else if (!old.h.i && new.h.i)
			/* Valid to invalid: save storage keys to PGSTE. */
			pgste = dat_save_storage_key_into_pgste(old, pgste);
		else if (!old.h.i && !new.h.i)
			/* Valid to valid: move storage keys if the frame changed. */
			if (old.h.pfra != new.h.pfra)
				dat_move_storage_key(old, new);
		/* Invalid to invalid: nothing to do. */
	}

	WRITE_ONCE(*ptep, new);
	return pgste;
}
251 
252 /*
253  * dat_split_ste() - Split a segment table entry into page table entries.
254  *
255  * Context: This function is assumed to be called with kvm->mmu_lock held.
256  *
257  * Return: 0 in case of success, -ENOMEM if running out of memory.
258  */
dat_split_ste(struct kvm_s390_mmu_cache * mc,union pmd * pmdp,gfn_t gfn,union asce asce,bool uses_skeys)259 static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t gfn,
260 			 union asce asce, bool uses_skeys)
261 {
262 	union pgste pgste_init;
263 	struct page_table *pt;
264 	union pmd new, old;
265 	union pte init;
266 	int i;
267 
268 	BUG_ON(!mc);
269 	old = READ_ONCE(*pmdp);
270 
271 	/* Already split, nothing to do. */
272 	if (!old.h.i && !old.h.fc)
273 		return 0;
274 
275 	pt = dat_alloc_pt_noinit(mc);
276 	if (!pt)
277 		return -ENOMEM;
278 	new.val = virt_to_phys(pt);
279 
280 	while (old.h.i || old.h.fc) {
281 		init.val = pmd_origin_large(old);
282 		init.h.p = old.h.p;
283 		init.h.i = old.h.i;
284 		init.s.d = old.s.fc1.d;
285 		init.s.w = old.s.fc1.w;
286 		init.s.y = old.s.fc1.y;
287 		init.s.sd = old.s.fc1.sd;
288 		init.s.pr = old.s.fc1.pr;
289 		pgste_init.val = 0;
290 		if (old.h.fc) {
291 			for (i = 0; i < _PAGE_ENTRIES; i++)
292 				pt->ptes[i].val = init.val | i * PAGE_SIZE;
293 			/* No need to take locks as the page table is not installed yet. */
294 			pgste_init.prefix_notif = old.s.fc1.prefix_notif;
295 			pgste_init.pcl = uses_skeys && init.h.i;
296 			dat_init_pgstes(pt, pgste_init.val);
297 		} else {
298 			dat_init_page_table(pt, init.val, 0);
299 		}
300 
301 		if (dat_pmdp_xchg_atomic(pmdp, old, new, gfn, asce)) {
302 			if (!pgste_init.pcl)
303 				return 0;
304 			for (i = 0; i < _PAGE_ENTRIES; i++) {
305 				union pgste pgste = pt->pgstes[i];
306 
307 				pgste = dat_save_storage_key_into_pgste(pt->ptes[i], pgste);
308 				pgste_set_unlock(pt->ptes + i, pgste);
309 			}
310 			return 0;
311 		}
312 		old = READ_ONCE(*pmdp);
313 	}
314 
315 	dat_free_pt(pt);
316 	return 0;
317 }
318 
319 /*
320  * dat_split_crste() - Split a crste into smaller crstes.
321  *
322  * Context: This function is assumed to be called with kvm->mmu_lock held.
323  *
324  * Return: %0 in case of success, %-ENOMEM if running out of memory.
325  */
dat_split_crste(struct kvm_s390_mmu_cache * mc,union crste * crstep,gfn_t gfn,union asce asce,bool uses_skeys)326 static int dat_split_crste(struct kvm_s390_mmu_cache *mc, union crste *crstep,
327 			   gfn_t gfn, union asce asce, bool uses_skeys)
328 {
329 	struct crst_table *table;
330 	union crste old, new, init;
331 	int i;
332 
333 	old = READ_ONCE(*crstep);
334 	if (is_pmd(old))
335 		return dat_split_ste(mc, &crstep->pmd, gfn, asce, uses_skeys);
336 
337 	BUG_ON(!mc);
338 
339 	/* Already split, nothing to do. */
340 	if (!old.h.i && !old.h.fc)
341 		return 0;
342 
343 	table = dat_alloc_crst_noinit(mc);
344 	if (!table)
345 		return -ENOMEM;
346 
347 	new.val = virt_to_phys(table);
348 	new.h.tt = old.h.tt;
349 	new.h.fc0.tl = _REGION_ENTRY_LENGTH;
350 
351 	while (old.h.i || old.h.fc) {
352 		init = old;
353 		init.h.tt--;
354 		if (old.h.fc) {
355 			for (i = 0; i < _CRST_ENTRIES; i++)
356 				table->crstes[i].val = init.val | i * HPAGE_SIZE;
357 		} else {
358 			crst_table_init((void *)table, init.val);
359 		}
360 		if (dat_crstep_xchg_atomic(crstep, old, new, gfn, asce))
361 			return 0;
362 		old = READ_ONCE(*crstep);
363 	}
364 
365 	dat_free_crst(table);
366 	return 0;
367 }
368 
369 /**
370  * dat_entry_walk() - Walk the gmap page tables.
371  * @mc: Cache to use to allocate dat tables, if needed; can be NULL if neither
372  *      %DAT_WALK_SPLIT or %DAT_WALK_ALLOC is specified in @flags.
373  * @gfn: Guest frame.
374  * @asce: The ASCE of the address space.
375  * @flags: Flags from WALK_* macros.
376  * @walk_level: Level to walk to, from LEVEL_* macros.
377  * @last: Will be filled the last visited non-pte DAT entry.
378  * @ptepp: Will be filled the last visited pte entry, if any, otherwise NULL.
379  *
380  * Returns a table entry pointer for the given guest address and @walk_level.
381  *
382  * The @flags have the following meanings:
383  * * %DAT_WALK_IGN_HOLES: consider holes as normal table entries
384  * * %DAT_WALK_ALLOC: allocate new tables to reach the requested level, if needed
385  * * %DAT_WALK_SPLIT: split existing large pages to reach the requested level, if needed
386  * * %DAT_WALK_LEAF: return successfully whenever a large page is encountered
387  * * %DAT_WALK_ANY: return successfully even if the requested level could not be reached
388  * * %DAT_WALK_CONTINUE: walk to the requested level with the specified flags, and then try to
389  *                       continue walking to ptes with only DAT_WALK_ANY
390  * * %DAT_WALK_USES_SKEYS: storage keys are in use
391  *
392  * Context: called with kvm->mmu_lock held.
393  *
394  * Return:
395  * * %PGM_ADDRESSING if the requested address lies outside memory
396  * * a PIC number if the requested address lies in a memory hole of type _DAT_TOKEN_PIC
397  * * %-EFAULT if the requested address lies inside a memory hole of a different type
398  * * %-EINVAL if the given ASCE is not compatible with the requested level
399  * * %-EFBIG if the requested level could not be reached because a larger frame was found
400  * * %-ENOENT if the requested level could not be reached for other reasons
401  * * %-ENOMEM if running out of memory while allocating or splitting a table
402  */
dat_entry_walk(struct kvm_s390_mmu_cache * mc,gfn_t gfn,union asce asce,int flags,int walk_level,union crste ** last,union pte ** ptepp)403 int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags,
404 		   int walk_level, union crste **last, union pte **ptepp)
405 {
406 	union vaddress vaddr = { .addr = gfn_to_gpa(gfn) };
407 	bool continue_anyway = flags & DAT_WALK_CONTINUE;
408 	bool uses_skeys = flags & DAT_WALK_USES_SKEYS;
409 	bool ign_holes = flags & DAT_WALK_IGN_HOLES;
410 	bool allocate = flags & DAT_WALK_ALLOC;
411 	bool split = flags & DAT_WALK_SPLIT;
412 	bool leaf = flags & DAT_WALK_LEAF;
413 	bool any = flags & DAT_WALK_ANY;
414 	struct page_table *pgtable;
415 	struct crst_table *table;
416 	union crste entry;
417 	int rc;
418 
419 	*last = NULL;
420 	*ptepp = NULL;
421 	if (WARN_ON_ONCE(unlikely(!asce.val)))
422 		return -EINVAL;
423 	if (WARN_ON_ONCE(unlikely(walk_level > asce.dt)))
424 		return -EINVAL;
425 	if (!asce_contains_gfn(asce, gfn))
426 		return PGM_ADDRESSING;
427 
428 	table = dereference_asce(asce);
429 	if (asce.dt >= ASCE_TYPE_REGION1) {
430 		*last = table->crstes + vaddr.rfx;
431 		entry = READ_ONCE(**last);
432 		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION1))
433 			return -EINVAL;
434 		if (crste_hole(entry) && !ign_holes)
435 			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
436 		if (walk_level == TABLE_TYPE_REGION1)
437 			return 0;
438 		if (entry.pgd.h.i) {
439 			if (!allocate)
440 				return any ? 0 : -ENOENT;
441 			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
442 			if (rc)
443 				return rc;
444 			entry = READ_ONCE(**last);
445 		}
446 		table = dereference_crste(entry.pgd);
447 	}
448 
449 	if (asce.dt >= ASCE_TYPE_REGION2) {
450 		*last = table->crstes + vaddr.rsx;
451 		entry = READ_ONCE(**last);
452 		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION2))
453 			return -EINVAL;
454 		if (crste_hole(entry) && !ign_holes)
455 			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
456 		if (walk_level == TABLE_TYPE_REGION2)
457 			return 0;
458 		if (entry.p4d.h.i) {
459 			if (!allocate)
460 				return any ? 0 : -ENOENT;
461 			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
462 			if (rc)
463 				return rc;
464 			entry = READ_ONCE(**last);
465 		}
466 		table = dereference_crste(entry.p4d);
467 	}
468 
469 	if (asce.dt >= ASCE_TYPE_REGION3) {
470 		*last = table->crstes + vaddr.rtx;
471 		entry = READ_ONCE(**last);
472 		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION3))
473 			return -EINVAL;
474 		if (crste_hole(entry) && !ign_holes)
475 			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
476 		if (walk_level == TABLE_TYPE_REGION3 &&
477 		    continue_anyway && !entry.pud.h.fc && !entry.h.i) {
478 			walk_level = TABLE_TYPE_PAGE_TABLE;
479 			allocate = false;
480 		}
481 		if (walk_level == TABLE_TYPE_REGION3 || ((leaf || any) && entry.pud.h.fc))
482 			return 0;
483 		if (entry.pud.h.i && !entry.pud.h.fc) {
484 			if (!allocate)
485 				return any ? 0 : -ENOENT;
486 			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
487 			if (rc)
488 				return rc;
489 			entry = READ_ONCE(**last);
490 		}
491 		if (walk_level <= TABLE_TYPE_SEGMENT && entry.pud.h.fc) {
492 			if (!split)
493 				return -EFBIG;
494 			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
495 			if (rc)
496 				return rc;
497 			entry = READ_ONCE(**last);
498 		}
499 		table = dereference_crste(entry.pud);
500 	}
501 
502 	*last = table->crstes + vaddr.sx;
503 	entry = READ_ONCE(**last);
504 	if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_SEGMENT))
505 		return -EINVAL;
506 	if (crste_hole(entry) && !ign_holes)
507 		return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
508 	if (continue_anyway && !entry.pmd.h.fc && !entry.h.i) {
509 		walk_level = TABLE_TYPE_PAGE_TABLE;
510 		allocate = false;
511 	}
512 	if (walk_level == TABLE_TYPE_SEGMENT || ((leaf || any) && entry.pmd.h.fc))
513 		return 0;
514 
515 	if (entry.pmd.h.i && !entry.pmd.h.fc) {
516 		if (!allocate)
517 			return any ? 0 : -ENOENT;
518 		rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys);
519 		if (rc)
520 			return rc;
521 		entry = READ_ONCE(**last);
522 	}
523 	if (walk_level <= TABLE_TYPE_PAGE_TABLE && entry.pmd.h.fc) {
524 		if (!split)
525 			return -EFBIG;
526 		rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys);
527 		if (rc)
528 			return rc;
529 		entry = READ_ONCE(**last);
530 	}
531 	pgtable = dereference_pmd(entry.pmd);
532 	*ptepp = pgtable->ptes + vaddr.px;
533 	if (pte_hole(**ptepp) && !ign_holes)
534 		return (*ptepp)->tok.type == _DAT_TOKEN_PIC ? (*ptepp)->tok.par : -EFAULT;
535 	return 0;
536 }
537 
/*
 * Invoke the pte_entry callback for every PTE of one page table that falls
 * into [gfn, end). Holes are fatal (-EFAULT) unless DAT_WALK_IGN_HOLES is
 * set; with DAT_WALK_IGN_HOLES but without DAT_WALK_ANY they are skipped.
 * A non-zero callback return value ends the walk and is passed through.
 */
static long dat_pte_walk_range(gfn_t gfn, gfn_t end, struct page_table *table, struct dat_walk *w)
{
	unsigned int slot = gfn & (_PAGE_ENTRIES - 1);
	long rc = 0;

	for ( ; gfn < end; slot++, gfn++) {
		if (pte_hole(READ_ONCE(table->ptes[slot]))) {
			if (!(w->flags & DAT_WALK_IGN_HOLES))
				return -EFAULT;
			if (!(w->flags & DAT_WALK_ANY))
				continue;
		}

		rc = w->ops->pte_entry(table->ptes + slot, gfn, gfn + 1, w);
		if (rc)
			break;
	}
	return rc;
}
557 
/*
 * Walk one CRST table over [start, end), invoking the per-type callback on
 * every entry and recursing into lower-level tables (or the PTE walk).
 *
 * A non-zero value returned by any callback — including callbacks invoked
 * from a nested table — stops the walk immediately and is returned.
 */
static long dat_crste_walk_range(gfn_t start, gfn_t end, struct crst_table *table,
				 struct dat_walk *walk)
{
	unsigned long idx, cur_shift, cur_size;
	dat_walk_op the_op;
	union crste crste;
	gfn_t cur, next;
	long rc = 0;

	/* Pages cover 8 address bits above PAGE_SHIFT; each level adds 11. */
	cur_shift = 8 + table->crstes[0].h.tt * 11;
	idx = (start >> cur_shift) & (_CRST_ENTRIES - 1);
	cur_size = 1UL << cur_shift;

	for (cur = ALIGN_DOWN(start, cur_size); cur < end; idx++, cur = next) {
		next = cur + cur_size;
		walk->last = table->crstes + idx;
		crste = READ_ONCE(*walk->last);

		if (crste_hole(crste)) {
			if (!(walk->flags & DAT_WALK_IGN_HOLES))
				return -EFAULT;
			if (!(walk->flags & DAT_WALK_ANY))
				continue;
		}

		the_op = walk->ops->crste_ops[crste.h.tt];
		if (the_op) {
			rc = the_op(walk->last, cur, next, walk);
			/* The callback may have modified the entry: re-read. */
			crste = READ_ONCE(*walk->last);
		}
		if (rc)
			break;
		if (!crste.h.i && !crste.h.fc) {
			if (!is_pmd(crste))
				rc = dat_crste_walk_range(max(start, cur), min(end, next),
							  _dereference_crste(crste), walk);
			else if (walk->ops->pte_entry)
				rc = dat_pte_walk_range(max(start, cur), min(end, next),
							dereference_pmd(crste.pmd), walk);
			/*
			 * Stop immediately on a non-zero rc from the nested
			 * walk; without this check the next iteration's
			 * callback would overwrite rc and silently swallow
			 * the early exit (e.g. the need_resched() signal
			 * from dat_reset_skeys_pte()).
			 */
			if (rc)
				break;
		}
	}
	return rc;
}
601 
602 /**
603  * _dat_walk_gfn_range() - Walk DAT tables.
604  * @start: The first guest page frame to walk.
605  * @end: The guest page frame immediately after the last one to walk.
606  * @asce: The ASCE of the guest mapping.
607  * @ops: The gmap_walk_ops that will be used to perform the walk.
608  * @flags: Flags from WALK_* (currently only WALK_IGN_HOLES is supported).
609  * @priv: Will be passed as-is to the callbacks.
610  *
611  * Any callback returning non-zero causes the walk to stop immediately.
612  *
613  * Return: %-EINVAL in case of error, %-EFAULT if @start is too high for the
614  *         given ASCE unless the DAT_WALK_IGN_HOLES flag is specified,
615  *         otherwise it returns whatever the callbacks return.
616  */
_dat_walk_gfn_range(gfn_t start,gfn_t end,union asce asce,const struct dat_walk_ops * ops,int flags,void * priv)617 long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
618 			 const struct dat_walk_ops *ops, int flags, void *priv)
619 {
620 	struct crst_table *table = dereference_asce(asce);
621 	struct dat_walk walk = {
622 		.ops	= ops,
623 		.asce	= asce,
624 		.priv	= priv,
625 		.flags	= flags,
626 		.start	= start,
627 		.end	= end,
628 	};
629 
630 	if (WARN_ON_ONCE(unlikely(!asce.val)))
631 		return -EINVAL;
632 	if (!asce_contains_gfn(asce, start))
633 		return (flags & DAT_WALK_IGN_HOLES) ? 0 : -EFAULT;
634 
635 	return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk);
636 }
637 
/*
 * Read the guest-visible storage key of @gfn into @skey.
 * Returns 0 on success (with *skey possibly 0 if nothing is mapped),
 * or a dat_entry_walk() error code.
 */
int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey)
{
	union crste *crstep;
	union pgste pgste;
	union pte *ptep;
	int rc;

	skey->skey = 0;
	rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	if (rc)
		return rc;

	/* Large mapping: read the hardware key of the backing page. */
	if (!ptep) {
		union crste crste = READ_ONCE(*crstep);

		if (!crste.h.fc || !crste.s.fc1.pr)
			return 0;
		skey->skey = page_get_storage_key(large_crste_to_phys(crste, gfn));
		return 0;
	}
	pgste = pgste_get_lock(ptep);
	if (ptep->h.i) {
		/* Invalid PTE: the key is kept in the PGSTE. */
		skey->acc = pgste.acc;
		skey->fp = pgste.fp;
	} else {
		skey->skey = page_get_storage_key(pte_origin(*ptep));
	}
	/* Reference and change state accumulates in the PGSTE. */
	skey->r |= pgste.gr;
	skey->c |= pgste.gc;
	pgste_set_unlock(ptep, pgste);
	return 0;
}
671 
dat_update_ptep_sd(union pgste old,union pgste pgste,union pte * ptep)672 static void dat_update_ptep_sd(union pgste old, union pgste pgste, union pte *ptep)
673 {
674 	if (pgste.acc != old.acc || pgste.fp != old.fp || pgste.gr != old.gr || pgste.gc != old.gc)
675 		__atomic64_or(_PAGE_SD, &ptep->val);
676 }
677 
/*
 * Set the guest-visible storage key of @gfn to @skey.
 * @nq: do not enforce quiescing when setting the hardware key.
 *
 * For large mappings the hardware key of the backing page is set directly;
 * otherwise ACC/FP/R/C are recorded in the PGSTE and, for a valid PTE, the
 * hardware key (without R and C, which are tracked in the PGSTE) is updated.
 *
 * Return: 0 on success, or a dat_entry_walk() error code.
 */
int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
			union skey skey, bool nq)
{
	union pgste pgste, old;
	union crste *crstep;
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE,
			    &crstep, &ptep);
	if (rc)
		return rc;

	/* Large mapping: set the key of the backing page directly. */
	if (!ptep) {
		page_set_storage_key(large_crste_to_phys(*crstep, gfn), skey.skey, !nq);
		return 0;
	}

	old = pgste_get_lock(ptep);
	pgste = old;

	pgste.acc = skey.acc;
	pgste.fp = skey.fp;
	pgste.gc = skey.c;
	pgste.gr = skey.r;

	if (!ptep->h.i) {
		union skey old_skey;

		/* Fold the old hardware R/C bits into the PGSTE history. */
		old_skey.skey = page_get_storage_key(pte_origin(*ptep));
		pgste.hc |= old_skey.c;
		pgste.hr |= old_skey.r;
		/*
		 * R and C of the new key live in the PGSTE only; clear them
		 * before programming the hardware key.
		 * (The former dead stores old_skey.c/.r = old.gc/.gr were
		 * removed: old_skey was never read again.)
		 */
		skey.r = 0;
		skey.c = 0;
		page_set_storage_key(pte_origin(*ptep), skey.skey, !nq);
	}

	dat_update_ptep_sd(old, pgste, ptep);
	pgste_set_unlock(ptep, pgste);
	return 0;
}
721 
/*
 * Set the storage key of the page at @paddr unless it already matches @skey.
 * @mr / @mc: tolerate a mismatch of the reference / change bit.
 * The previous key is always returned through @oldkey.
 * Return: true if the key was changed, false if it already matched.
 */
static bool page_cond_set_storage_key(phys_addr_t paddr, union skey skey, union skey *oldkey,
				      bool nq, bool mr, bool mc)
{
	bool matches;

	oldkey->skey = page_get_storage_key(paddr);
	matches = oldkey->acc == skey.acc && oldkey->fp == skey.fp &&
		  (mr || oldkey->r == skey.r) && (mc || oldkey->c == skey.c);
	if (matches)
		return false;
	page_set_storage_key(paddr, skey.skey, !nq);
	return true;
}
732 
/*
 * Conditionally set the storage key of @gfn (SSKE with MR/MC semantics).
 * The previous guest-visible key is returned through @oldkey if non-NULL.
 *
 * Return: 1 if the key was (conceptually) changed, 0 if it already matched
 *         (only possible for large mappings or valid PTEs), or a negative
 *         dat_entry_walk() error code.
 */
int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn,
			     union skey skey, union skey *oldkey, bool nq, bool mr, bool mc)
{
	union pgste pgste, old;
	union crste *crstep;
	union skey prev;
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(mmc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE,
			    &crstep, &ptep);
	if (rc)
		return rc;

	/* Large mapping: operate on the hardware key of the backing page. */
	if (!ptep)
		return page_cond_set_storage_key(large_crste_to_phys(*crstep, gfn), skey, oldkey,
						 nq, mr, mc);

	old = pgste_get_lock(ptep);
	pgste = old;

	rc = 1;
	pgste.acc = skey.acc;
	pgste.fp = skey.fp;
	pgste.gc = skey.c;
	pgste.gr = skey.r;

	if (!ptep->h.i) {
		rc = page_cond_set_storage_key(pte_origin(*ptep), skey, &prev, nq, mr, mc);
		/* Fold the old hardware R/C bits into the PGSTE history. */
		pgste.hc |= prev.c;
		pgste.hr |= prev.r;
		/* The guest-visible R/C also include the PGSTE state. */
		prev.c |= old.gc;
		prev.r |= old.gr;
	} else {
		/* Invalid PTE: the previous key is whatever the PGSTE holds. */
		prev.acc = old.acc;
		prev.fp = old.fp;
		prev.c = old.gc;
		prev.r = old.gr;
	}
	if (oldkey)
		*oldkey = prev;

	dat_update_ptep_sd(old, pgste, ptep);
	pgste_set_unlock(ptep, pgste);
	return rc;
}
779 
/*
 * Reset the reference bit of @gfn (RRBE semantics).
 * Return: the combined previous reference/change state, or a negative
 *         dat_entry_walk() error code.
 */
int dat_reset_reference_bit(union asce asce, gfn_t gfn)
{
	union pgste pgste, old;
	union crste *crstep;
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	if (rc)
		return rc;

	/* Large mapping: reset the reference bit of the backing page. */
	if (!ptep) {
		union crste crste = READ_ONCE(*crstep);

		if (!crste.h.fc || !crste.s.fc1.pr)
			return 0;
		return page_reset_referenced(large_crste_to_phys(*crstep, gfn));
	}
	old = pgste_get_lock(ptep);
	pgste = old;

	if (!ptep->h.i) {
		/* Reset the hardware bit, keep the host R history in the PGSTE. */
		rc = page_reset_referenced(pte_origin(*ptep));
		pgste.hr = rc >> 1;
	}
	/* Merge in the guest-visible reference and change state. */
	rc |= (pgste.gr << 1) | pgste.gc;
	pgste.gr = 0;

	dat_update_ptep_sd(old, pgste, ptep);
	pgste_set_unlock(ptep, pgste);
	return rc;
}
812 
/* Walk callback: reset the storage key state of a single PTE/PGSTE pair. */
static long dat_reset_skeys_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	union pgste pgste = pgste_get_lock(ptep);

	/* Clear the guest key and its reference/change state in the PGSTE. */
	pgste.acc = 0;
	pgste.fp = 0;
	pgste.gr = 0;
	pgste.gc = 0;
	/* If a page is present, reset its hardware key as well. */
	if (ptep->s.pr)
		page_set_storage_key(pte_origin(*ptep), PAGE_DEFAULT_KEY, 1);
	pgste_set_unlock(ptep, pgste);

	/* A non-zero return stops the walk so the caller can reschedule. */
	return need_resched() ? next : 0;
}
830 
/* Walk callback: reset the storage keys backing a large CRSTE mapping. */
static long dat_reset_skeys_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	phys_addr_t addr, end, origin = crste_origin_large(*crstep);

	/* Only large, present (pr) mappings carry hardware keys. */
	if (!crstep->h.fc || !crstep->s.fc1.pr)
		return 0;

	/* Clip the mapped physical range to the part covered by the walk. */
	addr = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
	/* Use SSKE on whole segments where possible, single pages otherwise. */
	while (ALIGN(addr + 1, _SEGMENT_SIZE) <= end)
		addr = sske_frame(addr, PAGE_DEFAULT_KEY);
	for ( ; addr < end; addr += PAGE_SIZE)
		page_set_storage_key(addr, PAGE_DEFAULT_KEY, 1);

	/* A non-zero return stops the walk so the caller can reschedule. */
	return need_resched() ? next : 0;
}
849 
/*
 * Reset all storage keys of the address space starting at @start.
 * Return: 0 when done, the next gfn to continue from if the walk stopped
 *         for rescheduling, or a negative error code.
 */
long dat_reset_skeys(union asce asce, gfn_t start)
{
	const struct dat_walk_ops ops = {
		.pte_entry = dat_reset_skeys_pte,
		.pmd_entry = dat_reset_skeys_crste,
		.pud_entry = dat_reset_skeys_crste,
	};

	return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, NULL);
}
860 
/* Private state for the memslot walk: the token to install, and the cache. */
struct slot_priv {
	unsigned long token;
	struct kvm_s390_mmu_cache *mc;
};
865 
/* Walk callback: replace a PTE with the slot token, unless already set. */
static long _dat_slot_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct slot_priv *p = walk->priv;
	union crste tok = { .val = p->token };
	union pte cur = READ_ONCE(*ptep);
	union pte new_pte = _PTE_TOK(tok.tok.type, tok.tok.par);

	/* Table entry already in the desired state. */
	if (cur.val == new_pte.val)
		return 0;

	dat_ptep_xchg(ptep, new_pte, gfn, walk->asce, false);
	return 0;
}
881 
/* Walk callback: install the slot token at this CRSTE, splitting if needed. */
static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	union crste new_crste, crste = READ_ONCE(*crstep);
	struct slot_priv *p = walk->priv;

	new_crste.val = p->token;
	new_crste.h.tt = crste.h.tt;

	/* Table entry already in the desired state. */
	if (crste.val == new_crste.val)
		return 0;

	/* The entry is fully covered by the range: replace it wholesale. */
	if (walk->start <= gfn && walk->end >= next) {
		dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce);
		/* A lower level table was present, needs to be freed. */
		if (!crste.h.fc && !crste.h.i) {
			if (is_pmd(crste))
				dat_free_pt(dereference_pmd(crste.pmd));
			else
				dat_free_level(dereference_crste(crste), true);
		}
		return 0;
	}

	/* A lower level table is present, things will be handled there. */
	if (!crste.h.fc && !crste.h.i)
		return 0;
	/* Split (install a lower level table), and handle things there. */
	return dat_split_crste(p->mc, crstep, gfn, walk->asce, false);
}
913 
914 static const struct dat_walk_ops dat_slot_ops = {
915 	.pte_entry = _dat_slot_pte,
916 	.crste_ops = { _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, },
917 };
918 
/*
 * Mark the range [start, end) with a memslot token of the given @type and
 * @param, replacing any existing mappings.
 */
int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end,
		 u16 type, u16 param)
{
	struct slot_priv priv = {
		.token = _CRSTE_TOK(0, type, param).val,
		.mc = mc,
	};

	return _dat_walk_gfn_range(start, end, asce, &dat_slot_ops,
				   DAT_WALK_IGN_HOLES | DAT_WALK_ANY, &priv);
}
930 
pgste_set_unlock_multiple(union pte * first,int n,union pgste * pgstes)931 static void pgste_set_unlock_multiple(union pte *first, int n, union pgste *pgstes)
932 {
933 	int i;
934 
935 	for (i = 0; i < n; i++) {
936 		if (!pgstes[i].pcl)
937 			break;
938 		pgste_set_unlock(first + i, pgstes[i]);
939 	}
940 }
941 
pgste_get_trylock_multiple(union pte * first,int n,union pgste * pgstes)942 static bool pgste_get_trylock_multiple(union pte *first, int n, union pgste *pgstes)
943 {
944 	int i;
945 
946 	for (i = 0; i < n; i++) {
947 		if (!pgste_get_trylock(first + i, pgstes + i))
948 			break;
949 	}
950 	if (i == n)
951 		return true;
952 	pgste_set_unlock_multiple(first, n, pgstes);
953 	return false;
954 }
955 
dat_get_ptval(struct page_table * table,struct ptval_param param)956 unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param)
957 {
958 	union pgste pgstes[4] = {};
959 	unsigned long res = 0;
960 	int i, n;
961 
962 	n = param.len + 1;
963 
964 	while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes))
965 		cpu_relax();
966 
967 	for (i = 0; i < n; i++)
968 		res = res << 16 | pgstes[i].val16;
969 
970 	pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes);
971 	return res;
972 }
973 
dat_set_ptval(struct page_table * table,struct ptval_param param,unsigned long val)974 void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val)
975 {
976 	union pgste pgstes[4] = {};
977 	int i, n;
978 
979 	n = param.len + 1;
980 
981 	while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes))
982 		cpu_relax();
983 
984 	for (i = param.len; i >= 0; i--) {
985 		pgstes[i].val16 = val;
986 		val = val >> 16;
987 	}
988 
989 	pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes);
990 }
991 
/* Walk callback: report whether the young bit of this PTE is set. */
static long _dat_test_young_pte(union pte *ptep, gfn_t start, gfn_t end, struct dat_walk *walk)
{
	return ptep->s.y;
}
996 
/*
 * Walk callback: report whether this CRSTE maps a large page (FC set) whose
 * young bit is set; non-leaf entries always report 0.
 */
static long _dat_test_young_crste(union crste *crstep, gfn_t start, gfn_t end,
				  struct dat_walk *walk)
{
	return crstep->h.fc && crstep->s.fc1.y;
}
1002 
/* Walk ops for dat_test_age_gfn(). */
static const struct dat_walk_ops test_age_ops = {
	.pte_entry = _dat_test_young_pte,
	.pmd_entry = _dat_test_young_crste,
	.pud_entry = _dat_test_young_crste,
};
1008 
1009 /**
1010  * dat_test_age_gfn() - Test young.
1011  * @asce: The ASCE whose address range is to be tested.
1012  * @start: The first guest frame of the range to check.
1013  * @end: The guest frame after the last in the range.
1014  *
1015  * Context: called by KVM common code with the kvm mmu write lock held.
1016  *
1017  * Return: %true if any page in the given range is young, otherwise %false.
1018  */
dat_test_age_gfn(union asce asce,gfn_t start,gfn_t end)1019 bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end)
1020 {
1021 	return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0;
1022 }
1023 
/**
 * dat_link() - establish a mapping for a resolved guest fault.
 * @mc: the cache to use for page table allocations.
 * @asce: the ASCE of the address space the fault belongs to.
 * @level: the table level at which the mapping should be established.
 * @uses_skeys: whether the guest uses storage keys.
 * @f: the guest fault to map; pfn, protection bits and the optional callback
 *	are taken from here, the crste/pte pointers are returned through it.
 *
 * Return: %0 on success, %-EINVAL or %-ENOMEM if the walk fails hard or the
 *	walk stopped above the requested level, %-EAGAIN if the entry changed
 *	concurrently or is otherwise unsuitable and the fault should be retried.
 */
int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
	     bool uses_skeys, struct guest_fault *f)
{
	union crste oldval, newval;
	union pte newpte, oldpte;
	union pgste pgste;
	int rc = 0;

	rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep);
	if (rc == -EINVAL || rc == -ENOMEM)
		return rc;
	/* Any other walk failure is worth retrying the fault. */
	if (rc)
		return -EAGAIN;

	/* The walk must not have stopped at a level above the requested one. */
	if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level)))
		return -EINVAL;

	if (f->ptep) {
		/* Map with page granularity, under the PGSTE lock. */
		pgste = pgste_get_lock(f->ptep);
		oldpte = *f->ptep;
		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
		/* Carry over the existing sd bit into the new entry. */
		newpte.s.sd = oldpte.s.sd;
		oldpte.s.sd = 0;
		/* Only install if the entry is empty or maps the same frame. */
		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
			pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys);
			if (f->callback)
				f->callback(f);
		} else {
			rc = -EAGAIN;
		}
		pgste_set_unlock(f->ptep, pgste);
	} else {
		/* Map with a large page (FC=1 CRSTE) at the requested level. */
		oldval = READ_ONCE(*f->crstep);
		newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
				    f->write_attempt | oldval.s.fc1.d);
		/* Carry over the existing sd bit into the new entry. */
		newval.s.fc1.sd = oldval.s.fc1.sd;
		/* A different frame is already mapped here: retry the fault. */
		if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
		    crste_origin_large(oldval) != crste_origin_large(newval))
			return -EAGAIN;
		/* The entry changed concurrently: retry the fault. */
		if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce))
			return -EAGAIN;
		if (f->callback)
			f->callback(f);
	}

	return rc;
}
1071 
/*
 * Walk callback: set the prefix notification bit on a mapped large CRSTE.
 * The counter behind walk->priv tracks how many prefix pages were covered.
 */
static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	union crste crste = READ_ONCE(*crstep);
	int *n = walk->priv;

	/* Only valid, unprotected large-page entries can be marked. */
	if (!crste.h.fc || crste.h.i || crste.h.p)
		return 0;

	/* A single large page covers both pages of the prefix. */
	*n = 2;
	if (crste.s.fc1.prefix_notif)
		return 0;
	crste.s.fc1.prefix_notif = 1;
	dat_crstep_xchg(crstep, crste, gfn, walk->asce);
	return 0;
}
1087 
/*
 * Walk callback: set the prefix notification bit in the PGSTE of one PTE.
 * Each valid, unprotected page increments the counter behind walk->priv.
 */
static long dat_set_pn_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	int *n = walk->priv;
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	if (!ptep->h.i && !ptep->h.p) {
		pgste.prefix_notif = 1;
		*n += 1;
	}
	pgste_set_unlock(ptep, pgste);
	return 0;
}
1101 
/*
 * dat_set_prefix_notif_bit() - mark the two prefix pages for notification.
 * @asce: the ASCE of the address space to operate on.
 * @gfn: the first of the two guest frames making up the prefix.
 *
 * Return: %0 if both prefix pages could be marked, %-EAGAIN otherwise.
 */
int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn)
{
	static const struct dat_walk_ops ops = {
		.pte_entry = dat_set_pn_pte,
		.pmd_entry = dat_set_pn_crste,
		.pud_entry = dat_set_pn_crste,
	};
	int n = 0;

	_dat_walk_gfn_range(gfn, gfn + 2, asce, &ops, DAT_WALK_IGN_HOLES, &n);
	return n == 2 ? 0 : -EAGAIN;
}
1117 
1118 /**
1119  * dat_perform_essa() - Perform ESSA actions on the PGSTE.
1120  * @asce: The asce to operate on.
1121  * @gfn: The guest page frame to operate on.
1122  * @orc: The specific action to perform, see the ESSA_SET_* macros.
1123  * @state: The storage attributes to be returned to the guest.
1124  * @dirty: Returns whether the function dirtied a previously clean entry.
1125  *
1126  * Context: Called with kvm->mmu_lock held.
1127  *
1128  * Return:
1129  * * %1 if the page state has been altered and the page is to be added to the CBRL
1130  * * %0 if the page state has been altered, but the page is not to be added to the CBRL
1131  * * %-1 if the page state has not been altered and the page is not to be added to the CBRL
1132  */
int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty)
{
	union crste *crstep;
	union pgste pgste;
	union pte *ptep;
	int res = 0;

	/* The page table must already exist, otherwise report an exception. */
	if (dat_entry_walk(NULL, gfn, asce, 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) {
		*state = (union essa_state) { .exception = 1 };
		return -1;
	}

	pgste = pgste_get_lock(ptep);

	/* Capture the current page state, to be returned to the guest. */
	*state = (union essa_state) {
		.content = (ptep->h.i << 1) + (ptep->h.i && pgste.zero),
		.nodat = pgste.nodat,
		.usage = pgste.usage,
		};

	switch (orc) {
	case ESSA_GET_STATE:
		/* Query only: nothing altered, nothing for the CBRL. */
		res = -1;
		break;
	case ESSA_SET_STABLE:
		pgste.usage = PGSTE_GPS_USAGE_STABLE;
		pgste.nodat = 0;
		break;
	case ESSA_SET_UNUSED:
		pgste.usage = PGSTE_GPS_USAGE_UNUSED;
		/* An invalid entry can be reported in the CBRL. */
		if (ptep->h.i)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
		/* An invalid entry can be reported in the CBRL. */
		if (ptep->h.i)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		if (!ptep->h.i) {
			pgste.usage = PGSTE_GPS_USAGE_POT_VOLATILE;
		} else if (pgste.zero) {
			pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
		} else if (!pgste.gc) {
			pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgste.usage = PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!ptep->h.i)
			pgste.usage = PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_STABLE_NODAT:
		pgste.usage = PGSTE_GPS_USAGE_STABLE;
		pgste.nodat = 1;
		break;
	default:
		WARN_ONCE(1, "Invalid ORC!");
		res = -1;
		break;
	}
	/* If we are discarding a page, set it to logical zero. */
	pgste.zero = res == 1;
	/* Any state-altering ORC marks the entry dirty for migration tracking. */
	if (orc > 0) {
		*dirty = !pgste.cmma_d;
		pgste.cmma_d = 1;
	}

	pgste_set_unlock(ptep, pgste);

	return res;
}
1213 
/* Walk callback: clear the CMMA state of one PTE. */
static long dat_reset_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	pgste.usage = 0;
	pgste.nodat = 0;
	pgste.cmma_d = 0;
	pgste_set_unlock(ptep, pgste);

	/* Bail out (returning the next gfn) when a reschedule is pending. */
	return need_resched() ? next : 0;
}
1227 
/*
 * dat_reset_cmma() - clear the CMMA state of all PTEs, starting at @start.
 * @asce: the ASCE of the address space to process.
 * @start: the guest frame to start from.
 *
 * Return: the result of the walk; 0 when the whole range was processed,
 *	otherwise the value returned by dat_reset_cmma_pte().
 */
long dat_reset_cmma(union asce asce, gfn_t start)
{
	const struct dat_walk_ops ops = {
		.pte_entry = dat_reset_cmma_pte,
	};

	return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops,
				   DAT_WALK_IGN_HOLES, NULL);
}
1237 
/* Shared state for the CMMA peek/get walk callbacks. */
struct dat_get_cmma_state {
	gfn_t start;		/* First saved gfn; -1 until one is found (get only) */
	gfn_t end;		/* One past the last gfn processed so far */
	unsigned int count;	/* Capacity of @values (get only) */
	u8 *values;		/* Output buffer for the per-page CMMA values */
	atomic64_t *remaining;	/* Count of still-dirty entries (get only) */
};
1245 
/* Walk callback: save the CMMA state of one PTE without modifying it. */
static long __dat_peek_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct dat_get_cmma_state *state = walk->priv;
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	/* Usage state in the low bits, the nodat bit on top. */
	state->values[gfn - walk->start] = pgste.usage | (pgste.nodat << 6);
	pgste_set_unlock(ptep, pgste);
	/* Remember how far values have been saved. */
	state->end = next;

	return 0;
}
1258 
/* Walk callback: an invalid CRSTE ends the range that can be peeked. */
static long __dat_peek_cmma_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct dat_get_cmma_state *state = walk->priv;

	if (!crstep->h.i)
		return 0;
	state->end = min(walk->end, next);
	return 0;
}
1267 
/*
 * dat_peek_cmma() - read CMMA values of a range of pages without altering them.
 * @start: the first guest frame to process.
 * @asce: the ASCE of the address space to process.
 * @count: input: how many pages to process; output: how many values were saved.
 * @values: the output buffer.
 *
 * Return: 0 if at least one value was saved, otherwise a negative error value.
 */
int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values)
{
	const struct dat_walk_ops ops = {
		.pte_entry = __dat_peek_cmma_pte,
		.pmd_entry = __dat_peek_cmma_crste,
		.pud_entry = __dat_peek_cmma_crste,
		.p4d_entry = __dat_peek_cmma_crste,
		.pgd_entry = __dat_peek_cmma_crste,
	};
	struct dat_get_cmma_state state = { .values = values, };
	int rc;

	rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state);
	*count = state.end - start;

	/* Hitting a hole (-EFAULT) is fine as long as something was saved. */
	if (rc == -EFAULT && *count > 0)
		return 0;
	return rc;
}
1285 
/*
 * Walk callback: save and clear the CMMA state of one dirty PTE.
 *
 * The first dirty entry found fixes the start of the result buffer; after
 * that, the walk is stopped (by returning 1) once the buffer is full or the
 * gap to the previously saved entry grows too large.
 */
static long __dat_get_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct dat_get_cmma_state *state = walk->priv;
	union pgste pgste;

	if (state->start != -1) {
		/* Stop if the gap to the last saved entry is too big ... */
		if ((gfn - state->end) > KVM_S390_MAX_BIT_DISTANCE)
			return 1;
		/* ... or if the output buffer is full. */
		if (gfn - state->start >= state->count)
			return 1;
	}

	/* Cheap lockless pre-check: skip entries that are not dirty. */
	if (!READ_ONCE(*pgste_of(ptep)).cmma_d)
		return 0;

	pgste = pgste_get_lock(ptep);
	/* Recheck under the PGSTE lock before clearing the dirty bit. */
	if (pgste.cmma_d) {
		if (state->start == -1)
			state->start = gfn;
		pgste.cmma_d = 0;
		atomic64_dec(state->remaining);
		state->values[gfn - state->start] = pgste.usage | pgste.nodat << 6;
		state->end = next;
	}
	pgste_set_unlock(ptep, pgste);
	return 0;
}
1313 
/*
 * dat_get_cmma() - collect and clear dirty CMMA values of a range of pages.
 * @asce: the ASCE of the address space to process.
 * @start: input: where to start the search; output: the first saved gfn.
 * @count: input: the buffer capacity; output: how many values were saved.
 * @values: the output buffer.
 * @rem: counter of still-dirty entries, decremented for each cleared one.
 *
 * Return: always 0.
 */
int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem)
{
	const struct dat_walk_ops ops = { .pte_entry = __dat_get_cmma_pte, };
	struct dat_get_cmma_state state = {
		.remaining = rem,
		.values = values,
		.count = *count,
		.start = -1,
	};

	_dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state);

	if (state.start == -1) {
		/* No dirty entry was found at all. */
		*count = 0;
		return 0;
	}

	*start = state.start;
	*count = state.end - state.start;
	return 0;
}
1335 
/* Shared state for __dat_set_cmma_pte(). */
struct dat_set_cmma_state {
	unsigned long mask;	/* Mask of PGSTE bits that may be set */
	const u8 *bits;		/* New attribute values, one byte per page */
};
1340 
/*
 * Walk callback: apply the guest-supplied CMMA attributes to one PTE.
 * Only the usage and nodat fields are taken over from the new value.
 */
static long __dat_set_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct dat_set_cmma_state *state = walk->priv;
	union pgste pgste, tmp;

	/* Shift the byte into place for PGSTE bits 32-39, then apply the mask. */
	tmp.val = (state->bits[gfn - walk->start] << 24) & state->mask;

	pgste = pgste_get_lock(ptep);
	pgste.usage = tmp.usage;
	pgste.nodat = tmp.nodat;
	pgste_set_unlock(ptep, pgste);

	return 0;
}
1355 
1356 /**
1357  * dat_set_cmma_bits() - Set CMMA bits for a range of guest pages.
1358  * @mc: Cache used for allocations.
1359  * @asce: The ASCE of the guest.
 * @gfn: The guest frame of the first page whose CMMA bits are to be set.
1361  * @count: How many pages need to be processed.
1362  * @mask: Which PGSTE bits should be set.
1363  * @bits: Points to an array with the CMMA attributes.
1364  *
1365  * This function sets the CMMA attributes for the given pages. If the input
1366  * buffer has zero length, no action is taken, otherwise the attributes are
1367  * set and the mm->context.uses_cmm flag is set.
1368  *
1369  * Each byte in @bits contains new values for bits 32-39 of the PGSTE.
1370  * Currently, only the fields NT and US are applied.
1371  *
1372  * Return: %0 in case of success, a negative error value otherwise.
1373  */
int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
		      unsigned long count, unsigned long mask, const uint8_t *bits)
{
	const struct dat_walk_ops ops = { .pte_entry = __dat_set_cmma_pte, };
	struct dat_set_cmma_state state = { .mask = mask, .bits = bits, };
	union crste *crstep;
	union pte *ptep;
	gfn_t cur;
	int rc;

	/* Make sure every page table in the range exists before walking it. */
	cur = ALIGN_DOWN(gfn, _PAGE_ENTRIES);
	while (cur < gfn + count) {
		rc = dat_entry_walk(mc, cur, asce, DAT_WALK_ALLOC, TABLE_TYPE_PAGE_TABLE,
				    &crstep, &ptep);
		if (rc)
			return rc;
		cur += _PAGE_ENTRIES;
	}

	return _dat_walk_gfn_range(gfn, gfn + count, asce, &ops, DAT_WALK_IGN_HOLES, &state);
}
1392