xref: /linux/arch/s390/mm/gmap.c (revision bc46b7cbc58c4cb562b6a45a1fbc7b8e7b23df58)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  KVM guest address space mapping code
4  *
5  *    Copyright IBM Corp. 2007, 2020
6  *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
7  *		 David Hildenbrand <david@redhat.com>
8  *		 Janosch Frank <frankja@linux.vnet.ibm.com>
9  */
10 
11 #include <linux/cpufeature.h>
12 #include <linux/export.h>
13 #include <linux/kernel.h>
14 #include <linux/pagewalk.h>
15 #include <linux/swap.h>
16 #include <linux/smp.h>
17 #include <linux/spinlock.h>
18 #include <linux/slab.h>
19 #include <linux/swapops.h>
20 #include <linux/ksm.h>
21 #include <linux/mman.h>
22 #include <linux/pgtable.h>
23 #include <asm/page-states.h>
24 #include <asm/pgalloc.h>
25 #include <asm/machine.h>
26 #include <asm/gmap_helpers.h>
27 #include <asm/gmap.h>
28 #include <asm/page.h>
29 
30 /*
31  * The address is saved in a radix tree directly; NULL would be ambiguous,
32  * since 0 is a valid address, and NULL is returned when nothing was found.
33  * The lower bits are ignored by all users of the macro, so the flag bit can
34  * be used to distinguish a valid address 0 from a NULL lookup result.
35  */
36 #define VALID_GADDR_FLAG 1
37 #define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
38 #define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)
39 
40 #define GMAP_SHADOW_FAKE_TABLE 1ULL
41 
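/*
 * Illustrative sketch (added by the editor, not part of the original file):
 * because MAKE_VALID_GADDR() always sets the flag bit, the value stored in
 * the radix tree is never 0, so a lookup can distinguish "guest address 0
 * is present" from "no entry" (radix_tree_lookup() returning NULL).
 * The helper name is hypothetical.
 */
static inline unsigned long gaddr_from_entry_sketch(void *entry)
{
	unsigned long val = (unsigned long)entry;

	if (!IS_GADDR_VALID(val))
		return -1UL;		/* nothing stored at this index */
	return val & HPAGE_MASK;	/* strip the flag; segment-aligned gaddr */
}
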
42 static struct page *gmap_alloc_crst(void)
43 {
44 	struct page *page;
45 
46 	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
47 	if (!page)
48 		return NULL;
49 	__arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
50 	return page;
51 }
52 
53 /**
54  * gmap_alloc - allocate and initialize a guest address space
55  * @limit: maximum address of the gmap address space
56  *
57  * Returns a guest address space structure, or NULL if out of memory.
58  */
59 struct gmap *gmap_alloc(unsigned long limit)
60 {
61 	struct gmap *gmap;
62 	struct page *page;
63 	unsigned long *table;
64 	unsigned long etype, atype;
65 
66 	if (limit < _REGION3_SIZE) {
67 		limit = _REGION3_SIZE - 1;
68 		atype = _ASCE_TYPE_SEGMENT;
69 		etype = _SEGMENT_ENTRY_EMPTY;
70 	} else if (limit < _REGION2_SIZE) {
71 		limit = _REGION2_SIZE - 1;
72 		atype = _ASCE_TYPE_REGION3;
73 		etype = _REGION3_ENTRY_EMPTY;
74 	} else if (limit < _REGION1_SIZE) {
75 		limit = _REGION1_SIZE - 1;
76 		atype = _ASCE_TYPE_REGION2;
77 		etype = _REGION2_ENTRY_EMPTY;
78 	} else {
79 		limit = -1UL;
80 		atype = _ASCE_TYPE_REGION1;
81 		etype = _REGION1_ENTRY_EMPTY;
82 	}
83 	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
84 	if (!gmap)
85 		goto out;
86 	INIT_LIST_HEAD(&gmap->children);
87 	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
88 	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
89 	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
90 	spin_lock_init(&gmap->guest_table_lock);
91 	spin_lock_init(&gmap->shadow_lock);
92 	refcount_set(&gmap->ref_count, 1);
93 	page = gmap_alloc_crst();
94 	if (!page)
95 		goto out_free;
96 	table = page_to_virt(page);
97 	crst_table_init(table, etype);
98 	gmap->table = table;
99 	gmap->asce = atype | _ASCE_TABLE_LENGTH |
100 		_ASCE_USER_BITS | __pa(table);
101 	gmap->asce_end = limit;
102 	return gmap;
103 
104 out_free:
105 	kfree(gmap);
106 out:
107 	return NULL;
108 }
109 EXPORT_SYMBOL_GPL(gmap_alloc);
110 
111 /**
112  * gmap_create - create a guest address space
113  * @mm: pointer to the parent mm_struct
114  * @limit: maximum address of the gmap address space
115  *
116  * Returns a guest address space structure, or NULL if out of memory.
117  */
118 struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
119 {
120 	struct gmap *gmap;
121 	unsigned long gmap_asce;
122 
123 	gmap = gmap_alloc(limit);
124 	if (!gmap)
125 		return NULL;
126 	gmap->mm = mm;
127 	spin_lock(&mm->context.lock);
128 	list_add_rcu(&gmap->list, &mm->context.gmap_list);
129 	if (list_is_singular(&mm->context.gmap_list))
130 		gmap_asce = gmap->asce;
131 	else
132 		gmap_asce = -1UL;
133 	WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
134 	spin_unlock(&mm->context.lock);
135 	return gmap;
136 }
137 EXPORT_SYMBOL_GPL(gmap_create);
138 
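/*
 * Usage sketch (added by the editor, not part of the original file):
 * a hypothetical caller sets up a guest address space for an mm and tears
 * it down again.  The 4 TB limit is an arbitrary example value.
 */
static inline int gmap_lifecycle_sketch(struct mm_struct *mm)
{
	struct gmap *gmap;

	gmap = gmap_create(mm, (1UL << 42) - 1);	/* up to 4 TB of guest addresses */
	if (!gmap)
		return -ENOMEM;
	/* ... map memory and run the guest ... */
	gmap_remove(gmap);	/* unlink from the mm and drop the initial reference */
	return 0;
}
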
139 static void gmap_flush_tlb(struct gmap *gmap)
140 {
141 	if (cpu_has_idte())
142 		__tlb_flush_idte(gmap->asce);
143 	else
144 		__tlb_flush_global();
145 }
146 
147 static void gmap_radix_tree_free(struct radix_tree_root *root)
148 {
149 	struct radix_tree_iter iter;
150 	unsigned long indices[16];
151 	unsigned long index;
152 	void __rcu **slot;
153 	int i, nr;
154 
155 	/* A radix tree is freed by deleting all of its entries */
156 	index = 0;
157 	do {
158 		nr = 0;
159 		radix_tree_for_each_slot(slot, root, &iter, index) {
160 			indices[nr] = iter.index;
161 			if (++nr == 16)
162 				break;
163 		}
164 		for (i = 0; i < nr; i++) {
165 			index = indices[i];
166 			radix_tree_delete(root, index);
167 		}
168 	} while (nr > 0);
169 }
170 
171 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
172 {
173 	struct gmap_rmap *rmap, *rnext, *head;
174 	struct radix_tree_iter iter;
175 	unsigned long indices[16];
176 	unsigned long index;
177 	void __rcu **slot;
178 	int i, nr;
179 
180 	/* A radix tree is freed by deleting all of its entries */
181 	index = 0;
182 	do {
183 		nr = 0;
184 		radix_tree_for_each_slot(slot, root, &iter, index) {
185 			indices[nr] = iter.index;
186 			if (++nr == 16)
187 				break;
188 		}
189 		for (i = 0; i < nr; i++) {
190 			index = indices[i];
191 			head = radix_tree_delete(root, index);
192 			gmap_for_each_rmap_safe(rmap, rnext, head)
193 				kfree(rmap);
194 		}
195 	} while (nr > 0);
196 }
197 
198 static void gmap_free_crst(unsigned long *table, bool free_ptes)
199 {
200 	bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
201 	int i;
202 
203 	if (is_segment) {
204 		if (!free_ptes)
205 			goto out;
206 		for (i = 0; i < _CRST_ENTRIES; i++)
207 			if (!(table[i] & _SEGMENT_ENTRY_INVALID))
208 				page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
209 	} else {
210 		for (i = 0; i < _CRST_ENTRIES; i++)
211 			if (!(table[i] & _REGION_ENTRY_INVALID))
212 				gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
213 	}
214 
215 out:
216 	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
217 }
218 
219 /**
220  * gmap_free - free a guest address space
221  * @gmap: pointer to the guest address space structure
222  *
223  * No locks required. There are no references to this gmap anymore.
224  */
225 void gmap_free(struct gmap *gmap)
226 {
227 	/* Flush tlb of all gmaps (if not already done for shadows) */
228 	if (!(gmap_is_shadow(gmap) && gmap->removed))
229 		gmap_flush_tlb(gmap);
230 	/* Free all segment & region tables. */
231 	gmap_free_crst(gmap->table, gmap_is_shadow(gmap));
232 
233 	gmap_radix_tree_free(&gmap->guest_to_host);
234 	gmap_radix_tree_free(&gmap->host_to_guest);
235 
236 	/* Free additional data for a shadow gmap */
237 	if (gmap_is_shadow(gmap)) {
238 		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
239 		/* Release reference to the parent */
240 		gmap_put(gmap->parent);
241 	}
242 
243 	kfree(gmap);
244 }
245 EXPORT_SYMBOL_GPL(gmap_free);
246 
247 /**
248  * gmap_get - increase reference counter for guest address space
249  * @gmap: pointer to the guest address space structure
250  *
251  * Returns the gmap pointer
252  */
253 struct gmap *gmap_get(struct gmap *gmap)
254 {
255 	refcount_inc(&gmap->ref_count);
256 	return gmap;
257 }
258 EXPORT_SYMBOL_GPL(gmap_get);
259 
260 /**
261  * gmap_put - decrease reference counter for guest address space
262  * @gmap: pointer to the guest address space structure
263  *
264  * If the reference counter reaches zero the guest address space is freed.
265  */
266 void gmap_put(struct gmap *gmap)
267 {
268 	if (refcount_dec_and_test(&gmap->ref_count))
269 		gmap_free(gmap);
270 }
271 EXPORT_SYMBOL_GPL(gmap_put);
272 
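/*
 * Usage sketch (added by the editor, not part of the original file):
 * hold an extra reference while working on a gmap that might be removed
 * concurrently.  The helper name is hypothetical.
 */
static inline void gmap_ref_sketch(struct gmap *gmap)
{
	gmap = gmap_get(gmap);	/* returns the same pointer, refcount + 1 */
	/* ... the gmap cannot be freed while the reference is held ... */
	gmap_put(gmap);		/* frees the gmap if this was the last reference */
}
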
273 /**
274  * gmap_remove - remove a guest address space but do not free it yet
275  * @gmap: pointer to the guest address space structure
276  */
277 void gmap_remove(struct gmap *gmap)
278 {
279 	struct gmap *sg, *next;
280 	unsigned long gmap_asce;
281 
282 	/* Remove all shadow gmaps linked to this gmap */
283 	if (!list_empty(&gmap->children)) {
284 		spin_lock(&gmap->shadow_lock);
285 		list_for_each_entry_safe(sg, next, &gmap->children, list) {
286 			list_del(&sg->list);
287 			gmap_put(sg);
288 		}
289 		spin_unlock(&gmap->shadow_lock);
290 	}
291 	/* Remove gmap from the per-mm list */
292 	spin_lock(&gmap->mm->context.lock);
293 	list_del_rcu(&gmap->list);
294 	if (list_empty(&gmap->mm->context.gmap_list))
295 		gmap_asce = 0;
296 	else if (list_is_singular(&gmap->mm->context.gmap_list))
297 		gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
298 					     struct gmap, list)->asce;
299 	else
300 		gmap_asce = -1UL;
301 	WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
302 	spin_unlock(&gmap->mm->context.lock);
303 	synchronize_rcu();
304 	/* Put reference */
305 	gmap_put(gmap);
306 }
307 EXPORT_SYMBOL_GPL(gmap_remove);
308 
309 /*
310  * gmap_alloc_table is assumed to be called with mmap_lock held
311  */
312 static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
313 			    unsigned long init, unsigned long gaddr)
314 {
315 	struct page *page;
316 	unsigned long *new;
317 
318 	/* since we don't free the gmap table until gmap_free we can unlock */
319 	page = gmap_alloc_crst();
320 	if (!page)
321 		return -ENOMEM;
322 	new = page_to_virt(page);
323 	crst_table_init(new, init);
324 	spin_lock(&gmap->guest_table_lock);
325 	if (*table & _REGION_ENTRY_INVALID) {
326 		*table = __pa(new) | _REGION_ENTRY_LENGTH |
327 			(*table & _REGION_ENTRY_TYPE_MASK);
328 		page = NULL;
329 	}
330 	spin_unlock(&gmap->guest_table_lock);
331 	if (page)
332 		__free_pages(page, CRST_ALLOC_ORDER);
333 	return 0;
334 }
335 
336 static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
337 {
338 	return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
339 }
340 
341 static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
342 {
343 	return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
344 }
345 
346 static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
347 				       unsigned long *gaddr)
348 {
349 	*gaddr = host_to_guest_delete(gmap, vmaddr);
350 	if (IS_GADDR_VALID(*gaddr))
351 		return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
352 	return NULL;
353 }
354 
355 /**
356  * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
357  * @gmap: pointer to the guest address space structure
358  * @vmaddr: address in the host process address space
359  *
360  * Returns 1 if a TLB flush is required
361  */
362 static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
363 {
364 	unsigned long gaddr;
365 	int flush = 0;
366 	pmd_t *pmdp;
367 
368 	BUG_ON(gmap_is_shadow(gmap));
369 	spin_lock(&gmap->guest_table_lock);
370 
371 	pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
372 	if (pmdp) {
373 		flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
374 		*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
375 	}
376 
377 	spin_unlock(&gmap->guest_table_lock);
378 	return flush;
379 }
380 
381 /**
382  * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
383  * @gmap: pointer to the guest address space structure
384  * @gaddr: address in the guest address space
385  *
386  * Returns 1 if a TLB flush is required
387  */
388 static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
389 {
390 	unsigned long vmaddr;
391 
392 	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
393 						   gaddr >> PMD_SHIFT);
394 	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
395 }
396 
397 /**
398  * gmap_unmap_segment - unmap segment from the guest address space
399  * @gmap: pointer to the guest address space structure
400  * @to: address in the guest address space
401  * @len: length of the memory area to unmap
402  *
403  * Returns 0 if the unmap succeeded, -EINVAL if not.
404  */
405 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
406 {
407 	unsigned long off;
408 	int flush;
409 
410 	BUG_ON(gmap_is_shadow(gmap));
411 	if ((to | len) & (PMD_SIZE - 1))
412 		return -EINVAL;
413 	if (len == 0 || to + len < to)
414 		return -EINVAL;
415 
416 	flush = 0;
417 	mmap_write_lock(gmap->mm);
418 	for (off = 0; off < len; off += PMD_SIZE)
419 		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
420 	mmap_write_unlock(gmap->mm);
421 	if (flush)
422 		gmap_flush_tlb(gmap);
423 	return 0;
424 }
425 EXPORT_SYMBOL_GPL(gmap_unmap_segment);
426 
427 /**
428  * gmap_map_segment - map a segment to the guest address space
429  * @gmap: pointer to the guest address space structure
430  * @from: source address in the parent address space
431  * @to: target address in the guest address space
432  * @len: length of the memory area to map
433  *
434  * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
435  */
436 int gmap_map_segment(struct gmap *gmap, unsigned long from,
437 		     unsigned long to, unsigned long len)
438 {
439 	unsigned long off;
440 	int flush;
441 
442 	BUG_ON(gmap_is_shadow(gmap));
443 	if ((from | to | len) & (PMD_SIZE - 1))
444 		return -EINVAL;
445 	if (len == 0 || from + len < from || to + len < to ||
446 	    from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
447 		return -EINVAL;
448 
449 	flush = 0;
450 	mmap_write_lock(gmap->mm);
451 	for (off = 0; off < len; off += PMD_SIZE) {
452 		/* Remove old translation */
453 		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
454 		/* Store new translation */
455 		if (radix_tree_insert(&gmap->guest_to_host,
456 				      (to + off) >> PMD_SHIFT,
457 				      (void *) from + off))
458 			break;
459 	}
460 	mmap_write_unlock(gmap->mm);
461 	if (flush)
462 		gmap_flush_tlb(gmap);
463 	if (off >= len)
464 		return 0;
465 	gmap_unmap_segment(gmap, to, len);
466 	return -ENOMEM;
467 }
468 EXPORT_SYMBOL_GPL(gmap_map_segment);
469 
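/*
 * Usage sketch (added by the editor, not part of the original file):
 * map one segment (PMD_SIZE, 1 MB) of the host address space at guest
 * address 0 and unmap it again.  @host_start is a hypothetical, segment
 * aligned host address; both calls take the mmap_lock internally.
 */
static inline int gmap_segment_sketch(struct gmap *gmap, unsigned long host_start)
{
	int rc;

	rc = gmap_map_segment(gmap, host_start, 0x0, PMD_SIZE);
	if (rc)
		return rc;	/* -EINVAL or -ENOMEM */
	/* the guest segment is now known; page tables are linked on demand */
	return gmap_unmap_segment(gmap, 0x0, PMD_SIZE);
}
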
470 /**
471  * __gmap_translate - translate a guest address to a user space address
472  * @gmap: pointer to guest mapping meta data structure
473  * @gaddr: guest address
474  *
475  * Returns user space address which corresponds to the guest address or
476  * -EFAULT if no such mapping exists.
477  * This function does not establish potentially missing page table entries.
478  * The mmap_lock of the mm that belongs to the address space must be held
479  * when this function gets called.
480  *
481  * Note: Can also be called for shadow gmaps.
482  */
483 unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
484 {
485 	unsigned long vmaddr;
486 
487 	vmaddr = (unsigned long)
488 		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
489 	/* Note: guest_to_host is empty for a shadow gmap */
490 	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
491 }
492 EXPORT_SYMBOL_GPL(__gmap_translate);
493 
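/*
 * Usage sketch (added by the editor, not part of the original file):
 * translate a guest address to the backing host address.  The offset
 * within the segment is preserved (vmaddr | (gaddr & ~PMD_MASK)); the
 * helper name is hypothetical.
 */
static inline unsigned long gmap_translate_sketch(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	mmap_read_lock(gmap->mm);
	vmaddr = __gmap_translate(gmap, gaddr);
	mmap_read_unlock(gmap->mm);
	return vmaddr;	/* host address, or -EFAULT if nothing is mapped */
}
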
494 /**
495  * gmap_unlink - disconnect a page table from the gmap shadow tables
496  * @mm: pointer to the parent mm_struct
497  * @table: pointer to the host page table
498  * @vmaddr: vm address associated with the host page table
499  */
500 void gmap_unlink(struct mm_struct *mm, unsigned long *table,
501 		 unsigned long vmaddr)
502 {
503 	struct gmap *gmap;
504 	int flush;
505 
506 	rcu_read_lock();
507 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
508 		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
509 		if (flush)
510 			gmap_flush_tlb(gmap);
511 	}
512 	rcu_read_unlock();
513 }
514 
515 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
516 			   unsigned long gaddr);
517 
518 /**
519  * __gmap_link - set up shadow page tables to connect a host to a guest address
520  * @gmap: pointer to guest mapping meta data structure
521  * @gaddr: guest address
522  * @vmaddr: vm address
523  *
524  * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
525  * if the vm address is already mapped to a different guest segment.
526  * The mmap_lock of the mm that belongs to the address space must be held
527  * when this function gets called.
528  */
529 int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
530 {
531 	struct mm_struct *mm;
532 	unsigned long *table;
533 	spinlock_t *ptl;
534 	pgd_t *pgd;
535 	p4d_t *p4d;
536 	pud_t *pud;
537 	pmd_t *pmd;
538 	u64 unprot;
539 	int rc;
540 
541 	BUG_ON(gmap_is_shadow(gmap));
542 	/* Create higher level tables in the gmap page table */
543 	table = gmap->table;
544 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
545 		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
546 		if ((*table & _REGION_ENTRY_INVALID) &&
547 		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
548 				     gaddr & _REGION1_MASK))
549 			return -ENOMEM;
550 		table = __va(*table & _REGION_ENTRY_ORIGIN);
551 	}
552 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
553 		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
554 		if ((*table & _REGION_ENTRY_INVALID) &&
555 		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
556 				     gaddr & _REGION2_MASK))
557 			return -ENOMEM;
558 		table = __va(*table & _REGION_ENTRY_ORIGIN);
559 	}
560 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
561 		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
562 		if ((*table & _REGION_ENTRY_INVALID) &&
563 		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
564 				     gaddr & _REGION3_MASK))
565 			return -ENOMEM;
566 		table = __va(*table & _REGION_ENTRY_ORIGIN);
567 	}
568 	table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
569 	/* Walk the parent mm page table */
570 	mm = gmap->mm;
571 	pgd = pgd_offset(mm, vmaddr);
572 	VM_BUG_ON(pgd_none(*pgd));
573 	p4d = p4d_offset(pgd, vmaddr);
574 	VM_BUG_ON(p4d_none(*p4d));
575 	pud = pud_offset(p4d, vmaddr);
576 	VM_BUG_ON(pud_none(*pud));
577 	/* large puds cannot yet be handled */
578 	if (pud_leaf(*pud))
579 		return -EFAULT;
580 	pmd = pmd_offset(pud, vmaddr);
581 	VM_BUG_ON(pmd_none(*pmd));
582 	/* Are we allowed to use huge pages? */
583 	if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
584 		return -EFAULT;
585 	/* Link gmap segment table entry location to page table. */
586 	rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
587 	if (rc)
588 		return rc;
589 	ptl = pmd_lock(mm, pmd);
590 	spin_lock(&gmap->guest_table_lock);
591 	if (*table == _SEGMENT_ENTRY_EMPTY) {
592 		rc = radix_tree_insert(&gmap->host_to_guest,
593 				       vmaddr >> PMD_SHIFT,
594 				       (void *)MAKE_VALID_GADDR(gaddr));
595 		if (!rc) {
596 			if (pmd_leaf(*pmd)) {
597 				*table = (pmd_val(*pmd) &
598 					  _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
599 					| _SEGMENT_ENTRY_GMAP_UC
600 					| _SEGMENT_ENTRY;
601 			} else
602 				*table = pmd_val(*pmd) &
603 					_SEGMENT_ENTRY_HARDWARE_BITS;
604 		}
605 	} else if (*table & _SEGMENT_ENTRY_PROTECT &&
606 		   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
607 		unprot = (u64)*table;
608 		unprot &= ~_SEGMENT_ENTRY_PROTECT;
609 		unprot |= _SEGMENT_ENTRY_GMAP_UC;
610 		gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
611 	}
612 	spin_unlock(&gmap->guest_table_lock);
613 	spin_unlock(ptl);
614 	radix_tree_preload_end();
615 	return rc;
616 }
617 EXPORT_SYMBOL(__gmap_link);
618 
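/*
 * Usage sketch (added by the editor, not part of the original file):
 * the typical fault handling sequence: translate the guest address, make
 * sure the host page is present, then connect the gmap tables.  Real
 * callers retry when fixup_user_fault() dropped the mmap_lock, because
 * the translation may have changed in the meantime; this sketch does not.
 */
static inline int gmap_fault_sketch(struct gmap *gmap, unsigned long gaddr)
{
	bool unlocked = false;
	unsigned long vmaddr;
	int rc;

	mmap_read_lock(gmap->mm);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;		/* -EFAULT: guest address not mapped */
		goto out;
	}
	/* fault the host page in (write access assumed for this example) */
	rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
	if (!rc)
		rc = __gmap_link(gmap, gaddr, vmaddr);
out:
	mmap_read_unlock(gmap->mm);
	return rc;
}
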
619 /*
620  * this function is assumed to be called with mmap_lock held
621  */
622 void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
623 {
624 	unsigned long vmaddr;
625 
626 	mmap_assert_locked(gmap->mm);
627 
628 	/* Find the vm address for the guest address */
629 	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
630 						   gaddr >> PMD_SHIFT);
631 	if (vmaddr) {
632 		vmaddr |= gaddr & ~PMD_MASK;
633 		gmap_helper_zap_one_page(gmap->mm, vmaddr);
634 	}
635 }
636 EXPORT_SYMBOL_GPL(__gmap_zap);
637 
638 static LIST_HEAD(gmap_notifier_list);
639 static DEFINE_SPINLOCK(gmap_notifier_lock);
640 
641 /**
642  * gmap_register_pte_notifier - register a pte invalidation callback
643  * @nb: pointer to the gmap notifier block
644  */
645 void gmap_register_pte_notifier(struct gmap_notifier *nb)
646 {
647 	spin_lock(&gmap_notifier_lock);
648 	list_add_rcu(&nb->list, &gmap_notifier_list);
649 	spin_unlock(&gmap_notifier_lock);
650 }
651 EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
652 
653 /**
654  * gmap_unregister_pte_notifier - remove a pte invalidation callback
655  * @nb: pointer to the gmap notifier block
656  */
657 void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
658 {
659 	spin_lock(&gmap_notifier_lock);
660 	list_del_rcu(&nb->list);
661 	spin_unlock(&gmap_notifier_lock);
662 	synchronize_rcu();
663 }
664 EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
665 
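/*
 * Usage sketch (added by the editor, not part of the original file):
 * registering a pte invalidation callback.  The callback is invoked with
 * the guest address range that has been invalidated; the notifier and
 * callback names are hypothetical.
 */
static void sketch_notifier_call(struct gmap *gmap, unsigned long start,
				 unsigned long end)
{
	/* e.g. invalidate shadow structures or kick vcpus for [start, end] */
}

static struct gmap_notifier sketch_notifier = {
	.notifier_call = sketch_notifier_call,
};

static inline void gmap_notifier_sketch(void)
{
	gmap_register_pte_notifier(&sketch_notifier);
	/* ... callbacks may fire from now on ... */
	gmap_unregister_pte_notifier(&sketch_notifier);
}
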
666 /**
667  * gmap_call_notifier - call all registered invalidation callbacks
668  * @gmap: pointer to guest mapping meta data structure
669  * @start: start virtual address in the guest address space
670  * @end: end virtual address in the guest address space
671  */
672 static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
673 			       unsigned long end)
674 {
675 	struct gmap_notifier *nb;
676 
677 	list_for_each_entry(nb, &gmap_notifier_list, list)
678 		nb->notifier_call(gmap, start, end);
679 }
680 
681 /**
682  * gmap_table_walk - walk the gmap page tables
683  * @gmap: pointer to guest mapping meta data structure
684  * @gaddr: virtual address in the guest address space
685  * @level: page table level to stop at
686  *
687  * Returns a table entry pointer for the given guest address and @level
688  * @level=0 : returns a pointer to a page table entry (or NULL)
689  * @level=1 : returns a pointer to a segment table entry (or NULL)
690  * @level=2 : returns a pointer to a region-3 table entry (or NULL)
691  * @level=3 : returns a pointer to a region-2 table entry (or NULL)
692  * @level=4 : returns a pointer to a region-1 table entry (or NULL)
693  *
694  * Returns NULL if the gmap page tables could not be walked to the
695  * requested level.
696  *
697  * Note: Can also be called for shadow gmaps.
698  */
699 unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
700 {
701 	const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
702 	unsigned long *table = gmap->table;
703 
704 	if (gmap_is_shadow(gmap) && gmap->removed)
705 		return NULL;
706 
707 	if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
708 		return NULL;
709 
710 	if (asce_type != _ASCE_TYPE_REGION1 &&
711 	    gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
712 		return NULL;
713 
714 	switch (asce_type) {
715 	case _ASCE_TYPE_REGION1:
716 		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
717 		if (level == 4)
718 			break;
719 		if (*table & _REGION_ENTRY_INVALID)
720 			return NULL;
721 		table = __va(*table & _REGION_ENTRY_ORIGIN);
722 		fallthrough;
723 	case _ASCE_TYPE_REGION2:
724 		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
725 		if (level == 3)
726 			break;
727 		if (*table & _REGION_ENTRY_INVALID)
728 			return NULL;
729 		table = __va(*table & _REGION_ENTRY_ORIGIN);
730 		fallthrough;
731 	case _ASCE_TYPE_REGION3:
732 		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
733 		if (level == 2)
734 			break;
735 		if (*table & _REGION_ENTRY_INVALID)
736 			return NULL;
737 		table = __va(*table & _REGION_ENTRY_ORIGIN);
738 		fallthrough;
739 	case _ASCE_TYPE_SEGMENT:
740 		table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
741 		if (level == 1)
742 			break;
743 		if (*table & _REGION_ENTRY_INVALID)
744 			return NULL;
745 		table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
746 		table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT;
747 	}
748 	return table;
749 }
750 EXPORT_SYMBOL(gmap_table_walk);
751 
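/*
 * Usage sketch (added by the editor, not part of the original file):
 * walk the gmap tables down to the segment level for a guest address.
 * Level 1 yields a segment table entry pointer, level 0 a page table
 * entry pointer; NULL means the tables do not reach that level yet.
 * Appropriate locking is up to the caller (see the callers in this file).
 */
static inline pmd_t *gmap_segment_entry_sketch(struct gmap *gmap, unsigned long gaddr)
{
	return (pmd_t *)gmap_table_walk(gmap, gaddr, 1);
}
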
752 /**
753  * gmap_pte_op_walk - walk the gmap page table, get the page table lock
754  *		      and return the pte pointer
755  * @gmap: pointer to guest mapping meta data structure
756  * @gaddr: virtual address in the guest address space
757  * @ptl: pointer to the spinlock pointer
758  *
759  * Returns a pointer to the locked pte for a guest address, or NULL
760  */
761 static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
762 			       spinlock_t **ptl)
763 {
764 	unsigned long *table;
765 
766 	BUG_ON(gmap_is_shadow(gmap));
767 	/* Walk the gmap page table, lock and get pte pointer */
768 	table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
769 	if (!table || *table & _SEGMENT_ENTRY_INVALID)
770 		return NULL;
771 	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
772 }
773 
774 /**
775  * gmap_pte_op_fixup - force a page in and connect the gmap page table
776  * @gmap: pointer to guest mapping meta data structure
777  * @gaddr: virtual address in the guest address space
778  * @vmaddr: address in the host process address space
779  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
780  *
781  * Returns 0 if the caller can retry __gmap_translate (might fail again),
782  * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
783  * up or connecting the gmap page table.
784  */
785 static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
786 			     unsigned long vmaddr, int prot)
787 {
788 	struct mm_struct *mm = gmap->mm;
789 	unsigned int fault_flags;
790 	bool unlocked = false;
791 
792 	BUG_ON(gmap_is_shadow(gmap));
793 	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
794 	if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
795 		return -EFAULT;
796 	if (unlocked)
797 		/* lost mmap_lock, caller has to retry __gmap_translate */
798 		return 0;
799 	/* Connect the page tables */
800 	return __gmap_link(gmap, gaddr, vmaddr);
801 }
802 
803 /**
804  * gmap_pte_op_end - release the page table lock
805  * @ptep: pointer to the locked pte
806  * @ptl: pointer to the page table spinlock
807  */
808 static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
809 {
810 	pte_unmap_unlock(ptep, ptl);
811 }
812 
813 /**
814  * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
815  *		      and return the pmd pointer
816  * @gmap: pointer to guest mapping meta data structure
817  * @gaddr: virtual address in the guest address space
818  *
819  * Returns a pointer to the pmd for a guest address, or NULL
820  */
821 static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
822 {
823 	pmd_t *pmdp;
824 
825 	BUG_ON(gmap_is_shadow(gmap));
826 	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
827 	if (!pmdp)
828 		return NULL;
829 
830 	/* without huge pages, there is no need to take the table lock */
831 	if (!gmap->mm->context.allow_gmap_hpage_1m)
832 		return pmd_none(*pmdp) ? NULL : pmdp;
833 
834 	spin_lock(&gmap->guest_table_lock);
835 	if (pmd_none(*pmdp)) {
836 		spin_unlock(&gmap->guest_table_lock);
837 		return NULL;
838 	}
839 
840 	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
841 	if (!pmd_leaf(*pmdp))
842 		spin_unlock(&gmap->guest_table_lock);
843 	return pmdp;
844 }
845 
846 /**
847  * gmap_pmd_op_end - release the guest_table_lock if needed
848  * @gmap: pointer to the guest mapping meta data structure
849  * @pmdp: pointer to the pmd
850  */
851 static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
852 {
853 	if (pmd_leaf(*pmdp))
854 		spin_unlock(&gmap->guest_table_lock);
855 }
856 
857 /*
858  * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
859  * @pmdp: pointer to the pmd to be protected
860  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
861  * @bits: notification bits to set
862  *
863  * Returns:
864  * 0 if successfully protected
865  * -EAGAIN if a fixup is needed
866  * -EINVAL if unsupported notifier bits have been specified
867  *
868  * Expected to be called with gmap->mm->mmap_lock in read and
869  * guest_table_lock held.
870  */
871 static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
872 			    pmd_t *pmdp, int prot, unsigned long bits)
873 {
874 	int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
875 	int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
876 	pmd_t new = *pmdp;
877 
878 	/* Fixup needed */
879 	if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
880 		return -EAGAIN;
881 
882 	if (prot == PROT_NONE && !pmd_i) {
883 		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
884 		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
885 	}
886 
887 	if (prot == PROT_READ && !pmd_p) {
888 		new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
889 		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
890 		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
891 	}
892 
893 	if (bits & GMAP_NOTIFY_MPROT)
894 		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
895 
896 	/* Shadow GMAP protection needs split PMDs */
897 	if (bits & GMAP_NOTIFY_SHADOW)
898 		return -EINVAL;
899 
900 	return 0;
901 }
902 
903 /*
904  * gmap_protect_pte - remove access rights to memory and set pgste bits
905  * @gmap: pointer to guest mapping meta data structure
906  * @gaddr: virtual address in the guest address space
907  * @pmdp: pointer to the pmd associated with the pte
908  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
909  * @bits: notification bits to set
910  *
911  * Returns 0 if successfully protected, -ENOMEM if out of memory and
912  * -EAGAIN if a fixup is needed.
913  *
914  * Expected to be called with gmap->mm->mmap_lock in read
915  */
916 static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
917 			    pmd_t *pmdp, int prot, unsigned long bits)
918 {
919 	int rc;
920 	pte_t *ptep;
921 	spinlock_t *ptl;
922 	unsigned long pbits = 0;
923 
924 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
925 		return -EAGAIN;
926 
927 	ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
928 	if (!ptep)
929 		return -ENOMEM;
930 
931 	pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
932 	pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
933 	/* Protect and unlock. */
934 	rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
935 	gmap_pte_op_end(ptep, ptl);
936 	return rc;
937 }
938 
939 /*
940  * gmap_protect_one - remove access rights to memory and set pgste bits
941  * @gmap: pointer to guest mapping meta data structure
942  * @gaddr: virtual address in the guest address space
944  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
945  * @bits: pgste notification bits to set
946  *
947  * Returns:
948  *   PAGE_SIZE if a small page was successfully protected;
949  *   HPAGE_SIZE if a large page was successfully protected;
950  *   -ENOMEM if out of memory;
951  *   -EFAULT if gaddr is invalid (or mapping for shadows is missing);
952  *   -EAGAIN if the guest mapping is missing and should be fixed by the caller.
953  *
954  * Context: Called with gmap->mm->mmap_lock in read.
955  */
956 int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
957 {
958 	pmd_t *pmdp;
959 	int rc = 0;
960 
961 	BUG_ON(gmap_is_shadow(gmap));
962 
963 	pmdp = gmap_pmd_op_walk(gmap, gaddr);
964 	if (!pmdp)
965 		return -EAGAIN;
966 
967 	if (!pmd_leaf(*pmdp)) {
968 		rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
969 		if (!rc)
970 			rc = PAGE_SIZE;
971 	} else {
972 		rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
973 		if (!rc)
974 			rc = HPAGE_SIZE;
975 	}
976 	gmap_pmd_op_end(gmap, pmdp);
977 
978 	return rc;
979 }
980 EXPORT_SYMBOL_GPL(gmap_protect_one);
981 
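/*
 * Usage sketch (added by the editor, not part of the original file):
 * protect a whole range by calling gmap_protect_one() repeatedly.  A
 * positive return value is the size that was protected (PAGE_SIZE or
 * HPAGE_SIZE); on -EAGAIN a real caller would fault the mapping in
 * (__gmap_translate() + gmap_pte_op_fixup()) and retry, this sketch
 * simply gives up.  Caller holds gmap->mm->mmap_lock in read.
 */
static inline int gmap_protect_range_sketch(struct gmap *gmap, unsigned long gaddr,
					    unsigned long len, int prot, unsigned long bits)
{
	int rc;

	while (len) {
		rc = gmap_protect_one(gmap, gaddr, prot, bits);
		if (rc < 0)
			return rc;	/* -EAGAIN, -ENOMEM, -EINVAL */
		gaddr += rc;
		len -= min(len, (unsigned long)rc);
	}
	return 0;
}
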
982 /**
983  * gmap_read_table - get an unsigned long value from a guest page table using
984  *                   absolute addressing, without marking the page referenced.
985  * @gmap: pointer to guest mapping meta data structure
986  * @gaddr: virtual address in the guest address space
987  * @val: pointer to the unsigned long value to return
988  *
989  * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
990  * if reading using the virtual address failed. -EINVAL if called on a gmap
991  * shadow.
992  *
993  * Called with gmap->mm->mmap_lock in read.
994  */
995 int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
996 {
997 	unsigned long address, vmaddr;
998 	spinlock_t *ptl;
999 	pte_t *ptep, pte;
1000 	int rc;
1001 
1002 	if (gmap_is_shadow(gmap))
1003 		return -EINVAL;
1004 
1005 	while (1) {
1006 		rc = -EAGAIN;
1007 		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
1008 		if (ptep) {
1009 			pte = *ptep;
1010 			if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
1011 				address = pte_val(pte) & PAGE_MASK;
1012 				address += gaddr & ~PAGE_MASK;
1013 				*val = *(unsigned long *)__va(address);
1014 				set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
1015 				/* Do *NOT* clear the _PAGE_INVALID bit! */
1016 				rc = 0;
1017 			}
1018 			gmap_pte_op_end(ptep, ptl);
1019 		}
1020 		if (!rc)
1021 			break;
1022 		vmaddr = __gmap_translate(gmap, gaddr);
1023 		if (IS_ERR_VALUE(vmaddr)) {
1024 			rc = vmaddr;
1025 			break;
1026 		}
1027 		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
1028 		if (rc)
1029 			break;
1030 	}
1031 	return rc;
1032 }
1033 EXPORT_SYMBOL_GPL(gmap_read_table);
1034 
1035 /**
1036  * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
1037  * @sg: pointer to the shadow guest address space structure
1038  * @vmaddr: vm address associated with the rmap
1039  * @rmap: pointer to the rmap structure
1040  *
1041  * Called with the sg->guest_table_lock
1042  */
1043 static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1044 				    struct gmap_rmap *rmap)
1045 {
1046 	struct gmap_rmap *temp;
1047 	void __rcu **slot;
1048 
1049 	BUG_ON(!gmap_is_shadow(sg));
1050 	slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1051 	if (slot) {
1052 		rmap->next = radix_tree_deref_slot_protected(slot,
1053 							&sg->guest_table_lock);
1054 		for (temp = rmap->next; temp; temp = temp->next) {
1055 			if (temp->raddr == rmap->raddr) {
1056 				kfree(rmap);
1057 				return;
1058 			}
1059 		}
1060 		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1061 	} else {
1062 		rmap->next = NULL;
1063 		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
1064 				  rmap);
1065 	}
1066 }
1067 
1068 /**
1069  * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
1070  * @sg: pointer to the shadow guest address space structure
1071  * @raddr: rmap address in the shadow gmap
1072  * @paddr: address in the parent guest address space
1073  * @len: length of the memory area to protect
1074  *
1075  * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1076  * if out of memory and -EFAULT if paddr is invalid.
1077  */
1078 static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1079 			     unsigned long paddr, unsigned long len)
1080 {
1081 	struct gmap *parent;
1082 	struct gmap_rmap *rmap;
1083 	unsigned long vmaddr;
1084 	spinlock_t *ptl;
1085 	pte_t *ptep;
1086 	int rc;
1087 
1088 	BUG_ON(!gmap_is_shadow(sg));
1089 	parent = sg->parent;
1090 	while (len) {
1091 		vmaddr = __gmap_translate(parent, paddr);
1092 		if (IS_ERR_VALUE(vmaddr))
1093 			return vmaddr;
1094 		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1095 		if (!rmap)
1096 			return -ENOMEM;
1097 		rmap->raddr = raddr;
1098 		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1099 		if (rc) {
1100 			kfree(rmap);
1101 			return rc;
1102 		}
1103 		rc = -EAGAIN;
1104 		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
1105 		if (ptep) {
1106 			spin_lock(&sg->guest_table_lock);
1107 			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
1108 					     PGSTE_VSIE_BIT);
1109 			if (!rc)
1110 				gmap_insert_rmap(sg, vmaddr, rmap);
1111 			spin_unlock(&sg->guest_table_lock);
1112 			gmap_pte_op_end(ptep, ptl);
1113 		}
1114 		radix_tree_preload_end();
1115 		if (rc) {
1116 			kfree(rmap);
1117 			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
1118 			if (rc)
1119 				return rc;
1120 			continue;
1121 		}
1122 		paddr += PAGE_SIZE;
1123 		len -= PAGE_SIZE;
1124 	}
1125 	return 0;
1126 }
1127 
1128 #define _SHADOW_RMAP_MASK	0x7
1129 #define _SHADOW_RMAP_REGION1	0x5
1130 #define _SHADOW_RMAP_REGION2	0x4
1131 #define _SHADOW_RMAP_REGION3	0x3
1132 #define _SHADOW_RMAP_SEGMENT	0x2
1133 #define _SHADOW_RMAP_PGTABLE	0x1
1134 
1135 /**
1136  * gmap_idte_one - invalidate a single region or segment table entry
1137  * @asce: region or segment table *origin* + table-type bits
1138  * @vaddr: virtual address to identify the table entry to flush
1139  *
1140  * The invalid bit of a single region or segment table entry is set
1141  * and the associated TLB entries depending on the entry are flushed.
1142  * The table-type of the @asce identifies the portion of the @vaddr
1143  * that is used as the invalidation index.
1144  */
1145 static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
1146 {
1147 	asm volatile(
1148 		"	idte	%0,0,%1"
1149 		: : "a" (asce), "a" (vaddr) : "cc", "memory");
1150 }
1151 
1152 /**
1153  * gmap_unshadow_page - remove a page from a shadow page table
1154  * @sg: pointer to the shadow guest address space structure
1155  * @raddr: rmap address in the shadow guest address space
1156  *
1157  * Called with the sg->guest_table_lock
1158  */
1159 static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1160 {
1161 	unsigned long *table;
1162 
1163 	BUG_ON(!gmap_is_shadow(sg));
1164 	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1165 	if (!table || *table & _PAGE_INVALID)
1166 		return;
1167 	gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1);
1168 	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1169 }
1170 
1171 /**
1172  * __gmap_unshadow_pgt - remove all entries from a shadow page table
1173  * @sg: pointer to the shadow guest address space structure
1174  * @raddr: rmap address in the shadow guest address space
1175  * @pgt: pointer to the start of a shadow page table
1176  *
1177  * Called with the sg->guest_table_lock
1178  */
1179 static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1180 				unsigned long *pgt)
1181 {
1182 	int i;
1183 
1184 	BUG_ON(!gmap_is_shadow(sg));
1185 	for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE)
1186 		pgt[i] = _PAGE_INVALID;
1187 }
1188 
1189 /**
1190  * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1191  * @sg: pointer to the shadow guest address space structure
1192  * @raddr: address in the shadow guest address space
1193  *
1194  * Called with the sg->guest_table_lock
1195  */
1196 static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1197 {
1198 	unsigned long *ste;
1199 	phys_addr_t sto, pgt;
1200 	struct ptdesc *ptdesc;
1201 
1202 	BUG_ON(!gmap_is_shadow(sg));
1203 	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1204 	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
1205 		return;
1206 	gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
1207 	sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
1208 	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1209 	pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
1210 	*ste = _SEGMENT_ENTRY_EMPTY;
1211 	__gmap_unshadow_pgt(sg, raddr, __va(pgt));
1212 	/* Free page table */
1213 	ptdesc = page_ptdesc(phys_to_page(pgt));
1214 	page_table_free_pgste(ptdesc);
1215 }
1216 
1217 /**
1218  * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1219  * @sg: pointer to the shadow guest address space structure
1220  * @raddr: rmap address in the shadow guest address space
1221  * @sgt: pointer to the start of a shadow segment table
1222  *
1223  * Called with the sg->guest_table_lock
1224  */
1225 static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1226 				unsigned long *sgt)
1227 {
1228 	struct ptdesc *ptdesc;
1229 	phys_addr_t pgt;
1230 	int i;
1231 
1232 	BUG_ON(!gmap_is_shadow(sg));
1233 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
1234 		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
1235 			continue;
1236 		pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
1237 		sgt[i] = _SEGMENT_ENTRY_EMPTY;
1238 		__gmap_unshadow_pgt(sg, raddr, __va(pgt));
1239 		/* Free page table */
1240 		ptdesc = page_ptdesc(phys_to_page(pgt));
1241 		page_table_free_pgste(ptdesc);
1242 	}
1243 }
1244 
1245 /**
1246  * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1247  * @sg: pointer to the shadow guest address space structure
1248  * @raddr: rmap address in the shadow guest address space
1249  *
1250  * Called with the shadow->guest_table_lock
1251  */
1252 static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1253 {
1254 	unsigned long r3o, *r3e;
1255 	phys_addr_t sgt;
1256 	struct page *page;
1257 
1258 	BUG_ON(!gmap_is_shadow(sg));
1259 	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1260 	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
1261 		return;
1262 	gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
1263 	r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
1264 	gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
1265 	sgt = *r3e & _REGION_ENTRY_ORIGIN;
1266 	*r3e = _REGION3_ENTRY_EMPTY;
1267 	__gmap_unshadow_sgt(sg, raddr, __va(sgt));
1268 	/* Free segment table */
1269 	page = phys_to_page(sgt);
1270 	__free_pages(page, CRST_ALLOC_ORDER);
1271 }
1272 
1273 /**
1274  * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1275  * @sg: pointer to the shadow guest address space structure
1276  * @raddr: address in the shadow guest address space
1277  * @r3t: pointer to the start of a shadow region-3 table
1278  *
1279  * Called with the sg->guest_table_lock
1280  */
1281 static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1282 				unsigned long *r3t)
1283 {
1284 	struct page *page;
1285 	phys_addr_t sgt;
1286 	int i;
1287 
1288 	BUG_ON(!gmap_is_shadow(sg));
1289 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
1290 		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
1291 			continue;
1292 		sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
1293 		r3t[i] = _REGION3_ENTRY_EMPTY;
1294 		__gmap_unshadow_sgt(sg, raddr, __va(sgt));
1295 		/* Free segment table */
1296 		page = phys_to_page(sgt);
1297 		__free_pages(page, CRST_ALLOC_ORDER);
1298 	}
1299 }
1300 
1301 /**
1302  * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1303  * @sg: pointer to the shadow guest address space structure
1304  * @raddr: rmap address in the shadow guest address space
1305  *
1306  * Called with the sg->guest_table_lock
1307  */
1308 static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1309 {
1310 	unsigned long r2o, *r2e;
1311 	phys_addr_t r3t;
1312 	struct page *page;
1313 
1314 	BUG_ON(!gmap_is_shadow(sg));
1315 	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1316 	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
1317 		return;
1318 	gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
1319 	r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
1320 	gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
1321 	r3t = *r2e & _REGION_ENTRY_ORIGIN;
1322 	*r2e = _REGION2_ENTRY_EMPTY;
1323 	__gmap_unshadow_r3t(sg, raddr, __va(r3t));
1324 	/* Free region 3 table */
1325 	page = phys_to_page(r3t);
1326 	__free_pages(page, CRST_ALLOC_ORDER);
1327 }
1328 
1329 /**
1330  * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1331  * @sg: pointer to the shadow guest address space structure
1332  * @raddr: rmap address in the shadow guest address space
1333  * @r2t: pointer to the start of a shadow region-2 table
1334  *
1335  * Called with the sg->guest_table_lock
1336  */
1337 static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1338 				unsigned long *r2t)
1339 {
1340 	phys_addr_t r3t;
1341 	struct page *page;
1342 	int i;
1343 
1344 	BUG_ON(!gmap_is_shadow(sg));
1345 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
1346 		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
1347 			continue;
1348 		r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
1349 		r2t[i] = _REGION2_ENTRY_EMPTY;
1350 		__gmap_unshadow_r3t(sg, raddr, __va(r3t));
1351 		/* Free region 3 table */
1352 		page = phys_to_page(r3t);
1353 		__free_pages(page, CRST_ALLOC_ORDER);
1354 	}
1355 }
1356 
1357 /**
1358  * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1359  * @sg: pointer to the shadow guest address space structure
1360  * @raddr: rmap address in the shadow guest address space
1361  *
1362  * Called with the sg->guest_table_lock
1363  */
1364 static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1365 {
1366 	unsigned long r1o, *r1e;
1367 	struct page *page;
1368 	phys_addr_t r2t;
1369 
1370 	BUG_ON(!gmap_is_shadow(sg));
1371 	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1372 	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
1373 		return;
1374 	gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
1375 	r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
1376 	gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
1377 	r2t = *r1e & _REGION_ENTRY_ORIGIN;
1378 	*r1e = _REGION1_ENTRY_EMPTY;
1379 	__gmap_unshadow_r2t(sg, raddr, __va(r2t));
1380 	/* Free region 2 table */
1381 	page = phys_to_page(r2t);
1382 	__free_pages(page, CRST_ALLOC_ORDER);
1383 }
1384 
1385 /**
1386  * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1387  * @sg: pointer to the shadow guest address space structure
1388  * @raddr: rmap address in the shadow guest address space
1389  * @r1t: pointer to the start of a shadow region-1 table
1390  *
1391  * Called with the shadow->guest_table_lock
1392  */
1393 static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1394 				unsigned long *r1t)
1395 {
1396 	unsigned long asce;
1397 	struct page *page;
1398 	phys_addr_t r2t;
1399 	int i;
1400 
1401 	BUG_ON(!gmap_is_shadow(sg));
1402 	asce = __pa(r1t) | _ASCE_TYPE_REGION1;
1403 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
1404 		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
1405 			continue;
1406 		r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
1407 		__gmap_unshadow_r2t(sg, raddr, __va(r2t));
1408 		/* Clear entry and flush translation r1t -> r2t */
1409 		gmap_idte_one(asce, raddr);
1410 		r1t[i] = _REGION1_ENTRY_EMPTY;
1411 		/* Free region 2 table */
1412 		page = phys_to_page(r2t);
1413 		__free_pages(page, CRST_ALLOC_ORDER);
1414 	}
1415 }
1416 
1417 /**
1418  * gmap_unshadow - remove a shadow page table completely
1419  * @sg: pointer to the shadow guest address space structure
1420  *
1421  * Called with sg->guest_table_lock
1422  */
1423 void gmap_unshadow(struct gmap *sg)
1424 {
1425 	unsigned long *table;
1426 
1427 	BUG_ON(!gmap_is_shadow(sg));
1428 	if (sg->removed)
1429 		return;
1430 	sg->removed = 1;
1431 	gmap_call_notifier(sg, 0, -1UL);
1432 	gmap_flush_tlb(sg);
1433 	table = __va(sg->asce & _ASCE_ORIGIN);
1434 	switch (sg->asce & _ASCE_TYPE_MASK) {
1435 	case _ASCE_TYPE_REGION1:
1436 		__gmap_unshadow_r1t(sg, 0, table);
1437 		break;
1438 	case _ASCE_TYPE_REGION2:
1439 		__gmap_unshadow_r2t(sg, 0, table);
1440 		break;
1441 	case _ASCE_TYPE_REGION3:
1442 		__gmap_unshadow_r3t(sg, 0, table);
1443 		break;
1444 	case _ASCE_TYPE_SEGMENT:
1445 		__gmap_unshadow_sgt(sg, 0, table);
1446 		break;
1447 	}
1448 }
1449 EXPORT_SYMBOL(gmap_unshadow);
1450 
1451 /**
1452  * gmap_shadow_r2t - create an empty shadow region 2 table
1453  * @sg: pointer to the shadow guest address space structure
1454  * @saddr: faulting address in the shadow gmap
1455  * @r2t: parent gmap address of the region 2 table to get shadowed
1456  * @fake: r2t references contiguous guest memory block, not a r2t
1457  *
1458  * The r2t parameter specifies the address of the source table. The
1459  * four pages of the source table are made read-only in the parent gmap
1460  * address space. A write to the source table area @r2t will automatically
1461  * remove the shadow r2 table and all of its descendants.
1462  *
1463  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1464  * shadow table structure is incomplete, -ENOMEM if out of memory and
1465  * -EFAULT if an address in the parent gmap could not be resolved.
1466  *
1467  * Called with sg->mm->mmap_lock in read.
1468  */
1469 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
1470 		    int fake)
1471 {
1472 	unsigned long raddr, origin, offset, len;
1473 	unsigned long *table;
1474 	phys_addr_t s_r2t;
1475 	struct page *page;
1476 	int rc;
1477 
1478 	BUG_ON(!gmap_is_shadow(sg));
1479 	/* Allocate a shadow region second table */
1480 	page = gmap_alloc_crst();
1481 	if (!page)
1482 		return -ENOMEM;
1483 	s_r2t = page_to_phys(page);
1484 	/* Install shadow region second table */
1485 	spin_lock(&sg->guest_table_lock);
1486 	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1487 	if (!table) {
1488 		rc = -EAGAIN;		/* Race with unshadow */
1489 		goto out_free;
1490 	}
1491 	if (!(*table & _REGION_ENTRY_INVALID)) {
1492 		rc = 0;			/* Already established */
1493 		goto out_free;
1494 	} else if (*table & _REGION_ENTRY_ORIGIN) {
1495 		rc = -EAGAIN;		/* Race with shadow */
1496 		goto out_free;
1497 	}
1498 	crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
1499 	/* mark as invalid as long as the parent table is not protected */
1500 	*table = s_r2t | _REGION_ENTRY_LENGTH |
1501 		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
1502 	if (sg->edat_level >= 1)
1503 		*table |= (r2t & _REGION_ENTRY_PROTECT);
1504 	if (fake) {
1505 		/* nothing to protect for fake tables */
1506 		*table &= ~_REGION_ENTRY_INVALID;
1507 		spin_unlock(&sg->guest_table_lock);
1508 		return 0;
1509 	}
1510 	spin_unlock(&sg->guest_table_lock);
1511 	/* Make r2t read-only in parent gmap page table */
1512 	raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
1513 	origin = r2t & _REGION_ENTRY_ORIGIN;
1514 	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1515 	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1516 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1517 	spin_lock(&sg->guest_table_lock);
1518 	if (!rc) {
1519 		table = gmap_table_walk(sg, saddr, 4);
1520 		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
1521 			rc = -EAGAIN;		/* Race with unshadow */
1522 		else
1523 			*table &= ~_REGION_ENTRY_INVALID;
1524 	} else {
1525 		gmap_unshadow_r2t(sg, raddr);
1526 	}
1527 	spin_unlock(&sg->guest_table_lock);
1528 	return rc;
1529 out_free:
1530 	spin_unlock(&sg->guest_table_lock);
1531 	__free_pages(page, CRST_ALLOC_ORDER);
1532 	return rc;
1533 }
1534 EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
1535 
1536 /**
1537  * gmap_shadow_r3t - create a shadow region 3 table
1538  * @sg: pointer to the shadow guest address space structure
1539  * @saddr: faulting address in the shadow gmap
1540  * @r3t: parent gmap address of the region 3 table to get shadowed
1541  * @fake: r3t references contiguous guest memory block, not a r3t
1542  *
1543  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1544  * shadow table structure is incomplete, -ENOMEM if out of memory and
1545  * -EFAULT if an address in the parent gmap could not be resolved.
1546  *
1547  * Called with sg->mm->mmap_lock in read.
1548  */
1549 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
1550 		    int fake)
1551 {
1552 	unsigned long raddr, origin, offset, len;
1553 	unsigned long *table;
1554 	phys_addr_t s_r3t;
1555 	struct page *page;
1556 	int rc;
1557 
1558 	BUG_ON(!gmap_is_shadow(sg));
1559 	/* Allocate a shadow region third table */
1560 	page = gmap_alloc_crst();
1561 	if (!page)
1562 		return -ENOMEM;
1563 	s_r3t = page_to_phys(page);
1564 	/* Install shadow region third table */
1565 	spin_lock(&sg->guest_table_lock);
1566 	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1567 	if (!table) {
1568 		rc = -EAGAIN;		/* Race with unshadow */
1569 		goto out_free;
1570 	}
1571 	if (!(*table & _REGION_ENTRY_INVALID)) {
1572 		rc = 0;			/* Already established */
1573 		goto out_free;
1574 	} else if (*table & _REGION_ENTRY_ORIGIN) {
1575 		rc = -EAGAIN;		/* Race with shadow */
1576 		goto out_free;
1577 	}
1578 	crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
1579 	/* mark as invalid as long as the parent table is not protected */
1580 	*table = s_r3t | _REGION_ENTRY_LENGTH |
1581 		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
1582 	if (sg->edat_level >= 1)
1583 		*table |= (r3t & _REGION_ENTRY_PROTECT);
1584 	if (fake) {
1585 		/* nothing to protect for fake tables */
1586 		*table &= ~_REGION_ENTRY_INVALID;
1587 		spin_unlock(&sg->guest_table_lock);
1588 		return 0;
1589 	}
1590 	spin_unlock(&sg->guest_table_lock);
1591 	/* Make r3t read-only in parent gmap page table */
1592 	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
1593 	origin = r3t & _REGION_ENTRY_ORIGIN;
1594 	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1595 	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1596 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1597 	spin_lock(&sg->guest_table_lock);
1598 	if (!rc) {
1599 		table = gmap_table_walk(sg, saddr, 3);
1600 		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
1601 			rc = -EAGAIN;		/* Race with unshadow */
1602 		else
1603 			*table &= ~_REGION_ENTRY_INVALID;
1604 	} else {
1605 		gmap_unshadow_r3t(sg, raddr);
1606 	}
1607 	spin_unlock(&sg->guest_table_lock);
1608 	return rc;
1609 out_free:
1610 	spin_unlock(&sg->guest_table_lock);
1611 	__free_pages(page, CRST_ALLOC_ORDER);
1612 	return rc;
1613 }
1614 EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1615 
1616 /**
1617  * gmap_shadow_sgt - create a shadow segment table
1618  * @sg: pointer to the shadow guest address space structure
1619  * @saddr: faulting address in the shadow gmap
1620  * @sgt: parent gmap address of the segment table to get shadowed
1621  * @fake: sgt references contiguous guest memory block, not an sgt
1622  *
1623  * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1624  * shadow table structure is incomplete, -ENOMEM if out of memory and
1625  * -EFAULT if an address in the parent gmap could not be resolved.
1626  *
1627  * Called with sg->mm->mmap_lock in read.
1628  */
1629 int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
1630 		    int fake)
1631 {
1632 	unsigned long raddr, origin, offset, len;
1633 	unsigned long *table;
1634 	phys_addr_t s_sgt;
1635 	struct page *page;
1636 	int rc;
1637 
1638 	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
1639 	/* Allocate a shadow segment table */
1640 	page = gmap_alloc_crst();
1641 	if (!page)
1642 		return -ENOMEM;
1643 	s_sgt = page_to_phys(page);
1644 	/* Install shadow segment table */
1645 	spin_lock(&sg->guest_table_lock);
1646 	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1647 	if (!table) {
1648 		rc = -EAGAIN;		/* Race with unshadow */
1649 		goto out_free;
1650 	}
1651 	if (!(*table & _REGION_ENTRY_INVALID)) {
1652 		rc = 0;			/* Already established */
1653 		goto out_free;
1654 	} else if (*table & _REGION_ENTRY_ORIGIN) {
1655 		rc = -EAGAIN;		/* Race with shadow */
1656 		goto out_free;
1657 	}
1658 	crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
1659 	/* mark as invalid as long as the parent table is not protected */
1660 	*table = s_sgt | _REGION_ENTRY_LENGTH |
1661 		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
1662 	if (sg->edat_level >= 1)
1663 		*table |= sgt & _REGION_ENTRY_PROTECT;
1664 	if (fake) {
1665 		/* nothing to protect for fake tables */
1666 		*table &= ~_REGION_ENTRY_INVALID;
1667 		spin_unlock(&sg->guest_table_lock);
1668 		return 0;
1669 	}
1670 	spin_unlock(&sg->guest_table_lock);
1671 	/* Make sgt read-only in parent gmap page table */
1672 	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
1673 	origin = sgt & _REGION_ENTRY_ORIGIN;
1674 	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1675 	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1676 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1677 	spin_lock(&sg->guest_table_lock);
1678 	if (!rc) {
1679 		table = gmap_table_walk(sg, saddr, 2);
1680 		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
1681 			rc = -EAGAIN;		/* Race with unshadow */
1682 		else
1683 			*table &= ~_REGION_ENTRY_INVALID;
1684 	} else {
1685 		gmap_unshadow_sgt(sg, raddr);
1686 	}
1687 	spin_unlock(&sg->guest_table_lock);
1688 	return rc;
1689 out_free:
1690 	spin_unlock(&sg->guest_table_lock);
1691 	__free_pages(page, CRST_ALLOC_ORDER);
1692 	return rc;
1693 }
1694 EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1695 
1696 static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
1697 {
1698 	unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));
1699 
1700 	pgstes += _PAGE_ENTRIES;
1701 
1702 	pgstes[0] &= ~PGSTE_ST2_MASK;
1703 	pgstes[1] &= ~PGSTE_ST2_MASK;
1704 	pgstes[2] &= ~PGSTE_ST2_MASK;
1705 	pgstes[3] &= ~PGSTE_ST2_MASK;
1706 
1707 	pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
1708 	pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
1709 	pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
1710 	pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
1711 }
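/*
 * Editorial aside (illustrative sketch, not part of the kernel source):
 * gmap_pgste_set_pgt_addr() above spreads the parent page-table address
 * over the ST2 software field of the first four PGSTEs, 16 bits per
 * entry (assuming the usual PGSTE_ST2_MASK layout). Inverting the shifts
 * would reassemble it; the function name below is hypothetical.
 */
#if 0	/* illustrative only */
static unsigned long example_pgste_get_pgt_addr(struct ptdesc *ptdesc)
{
	unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));

	pgstes += _PAGE_ENTRIES;
	return ((pgstes[0] & PGSTE_ST2_MASK) << 16) |
	       (pgstes[1] & PGSTE_ST2_MASK) |
	       ((pgstes[2] & PGSTE_ST2_MASK) >> 16) |
	       ((pgstes[3] & PGSTE_ST2_MASK) >> 32);
}
#endif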
1712 
1713 /**
1714  * gmap_shadow_pgt - instantiate a shadow page table
1715  * @sg: pointer to the shadow guest address space structure
1716  * @saddr: faulting address in the shadow gmap
1717  * @pgt: parent gmap address of the page table to get shadowed
1718  * @fake: pgt references contiguous guest memory block, not a pgtable
1719  *
1720  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1721  * shadow table structure is incomplete, -ENOMEM if out of memory and
1722  * -EFAULT if an address in the parent gmap could not be resolved.
1723  *
1724  * Called with sg->mm->mmap_lock in read.
1725  */
1726 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
1727 		    int fake)
1728 {
1729 	unsigned long raddr, origin;
1730 	unsigned long *table;
1731 	struct ptdesc *ptdesc;
1732 	phys_addr_t s_pgt;
1733 	int rc;
1734 
1735 	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
1736 	/* Allocate a shadow page table */
1737 	ptdesc = page_table_alloc_pgste(sg->mm);
1738 	if (!ptdesc)
1739 		return -ENOMEM;
1740 	origin = pgt & _SEGMENT_ENTRY_ORIGIN;
1741 	if (fake)
1742 		origin |= GMAP_SHADOW_FAKE_TABLE;
1743 	gmap_pgste_set_pgt_addr(ptdesc, origin);
1744 	s_pgt = page_to_phys(ptdesc_page(ptdesc));
1745 	/* Install shadow page table */
1746 	spin_lock(&sg->guest_table_lock);
1747 	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1748 	if (!table) {
1749 		rc = -EAGAIN;		/* Race with unshadow */
1750 		goto out_free;
1751 	}
1752 	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
1753 		rc = 0;			/* Already established */
1754 		goto out_free;
1755 	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
1756 		rc = -EAGAIN;		/* Race with shadow */
1757 		goto out_free;
1758 	}
1759 	/* mark as invalid as long as the parent table is not protected */
1760 	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
1761 		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
1762 	if (fake) {
1763 		/* nothing to protect for fake tables */
1764 		*table &= ~_SEGMENT_ENTRY_INVALID;
1765 		spin_unlock(&sg->guest_table_lock);
1766 		return 0;
1767 	}
1768 	spin_unlock(&sg->guest_table_lock);
1769 	/* Make pgt read-only in parent gmap page table (not the pgste) */
1770 	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
1771 	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
1772 	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
1773 	spin_lock(&sg->guest_table_lock);
1774 	if (!rc) {
1775 		table = gmap_table_walk(sg, saddr, 1);
1776 		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
1777 			rc = -EAGAIN;		/* Race with unshadow */
1778 		else
1779 			*table &= ~_SEGMENT_ENTRY_INVALID;
1780 	} else {
1781 		gmap_unshadow_pgt(sg, raddr);
1782 	}
1783 	spin_unlock(&sg->guest_table_lock);
1784 	return rc;
1785 out_free:
1786 	spin_unlock(&sg->guest_table_lock);
1787 	page_table_free_pgste(ptdesc);
1788 	return rc;
1789 
1790 }
1791 EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
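/*
 * Editorial summary (not part of the kernel source): the gmap_shadow_*()
 * functions above fill the shadow DAT tree one level at a time as the
 * shadow gmap faults: gmap_shadow_r2t() installs a region-1 entry (walk
 * level 4), gmap_shadow_r3t() a region-2 entry (level 3),
 * gmap_shadow_sgt() a region-3 entry (level 2), gmap_shadow_pgt() a
 * segment entry (level 1), and gmap_shadow_page() below a pte (level 0).
 * Each level installs its entry as invalid, write-protects the parent
 * table via gmap_protect_rmap(), and only then clears the invalid bit,
 * re-checking under guest_table_lock that no unshadow raced in between.
 */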
1792 
1793 /**
1794  * gmap_shadow_page - create a shadow page mapping
1795  * @sg: pointer to the shadow guest address space structure
1796  * @saddr: faulting address in the shadow gmap
1797  * @pte: pte in parent gmap address space to get shadowed
1798  *
1799  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1800  * shadow table structure is incomplete, -ENOMEM if out of memory and
1801  * -EFAULT if an address in the parent gmap could not be resolved.
1802  *
1803  * Called with sg->mm->mmap_lock in read.
1804  */
1805 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
1806 {
1807 	struct gmap *parent;
1808 	struct gmap_rmap *rmap;
1809 	unsigned long vmaddr, paddr;
1810 	spinlock_t *ptl;
1811 	pte_t *sptep, *tptep;
1812 	int prot;
1813 	int rc;
1814 
1815 	BUG_ON(!gmap_is_shadow(sg));
1816 	parent = sg->parent;
1817 	prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
1818 
1819 	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1820 	if (!rmap)
1821 		return -ENOMEM;
1822 	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
1823 
1824 	while (1) {
1825 		paddr = pte_val(pte) & PAGE_MASK;
1826 		vmaddr = __gmap_translate(parent, paddr);
1827 		if (IS_ERR_VALUE(vmaddr)) {
1828 			rc = vmaddr;
1829 			break;
1830 		}
1831 		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1832 		if (rc)
1833 			break;
1834 		rc = -EAGAIN;
1835 		sptep = gmap_pte_op_walk(parent, paddr, &ptl);
1836 		if (sptep) {
1837 			spin_lock(&sg->guest_table_lock);
1838 			/* Get page table pointer */
1839 			tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
1840 			if (!tptep) {
1841 				spin_unlock(&sg->guest_table_lock);
1842 				gmap_pte_op_end(sptep, ptl);
1843 				radix_tree_preload_end();
1844 				break;
1845 			}
1846 			rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
1847 			if (rc > 0) {
1848 				/* Success and a new mapping */
1849 				gmap_insert_rmap(sg, vmaddr, rmap);
1850 				rmap = NULL;
1851 				rc = 0;
1852 			}
1853 			gmap_pte_op_end(sptep, ptl);
1854 			spin_unlock(&sg->guest_table_lock);
1855 		}
1856 		radix_tree_preload_end();
1857 		if (!rc)
1858 			break;
1859 		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
1860 		if (rc)
1861 			break;
1862 	}
1863 	kfree(rmap);
1864 	return rc;
1865 }
1866 EXPORT_SYMBOL_GPL(gmap_shadow_page);
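/*
 * Editorial note (not part of the kernel source): gmap_shadow_page()
 * follows the optimistic retry pattern used throughout this file:
 * resolve the parent pte under its page table lock, try to shadow it,
 * and if that is not possible yet let gmap_pte_op_fixup() fault the
 * parent page in with the required protection and retry the step.
 */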
1867 
1868 /*
1869  * gmap_shadow_notify - handle notifications for shadow gmap
1870  *
1871  * Called with sg->parent->shadow_lock.
1872  */
1873 static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
1874 			       unsigned long gaddr)
1875 {
1876 	struct gmap_rmap *rmap, *rnext, *head;
1877 	unsigned long start, end, bits, raddr;
1878 
1879 	BUG_ON(!gmap_is_shadow(sg));
1880 
1881 	spin_lock(&sg->guest_table_lock);
1882 	if (sg->removed) {
1883 		spin_unlock(&sg->guest_table_lock);
1884 		return;
1885 	}
1886 	/* Check for top level table */
1887 	start = sg->orig_asce & _ASCE_ORIGIN;
1888 	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
1889 	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
1890 	    gaddr < end) {
1891 		/* The complete shadow table has to go */
1892 		gmap_unshadow(sg);
1893 		spin_unlock(&sg->guest_table_lock);
1894 		list_del(&sg->list);
1895 		gmap_put(sg);
1896 		return;
1897 	}
1898 	/* Remove the page table tree from one specific entry */
1899 	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1900 	gmap_for_each_rmap_safe(rmap, rnext, head) {
1901 		bits = rmap->raddr & _SHADOW_RMAP_MASK;
1902 		raddr = rmap->raddr ^ bits;
1903 		switch (bits) {
1904 		case _SHADOW_RMAP_REGION1:
1905 			gmap_unshadow_r2t(sg, raddr);
1906 			break;
1907 		case _SHADOW_RMAP_REGION2:
1908 			gmap_unshadow_r3t(sg, raddr);
1909 			break;
1910 		case _SHADOW_RMAP_REGION3:
1911 			gmap_unshadow_sgt(sg, raddr);
1912 			break;
1913 		case _SHADOW_RMAP_SEGMENT:
1914 			gmap_unshadow_pgt(sg, raddr);
1915 			break;
1916 		case _SHADOW_RMAP_PGTABLE:
1917 			gmap_unshadow_page(sg, raddr);
1918 			break;
1919 		}
1920 		kfree(rmap);
1921 	}
1922 	spin_unlock(&sg->guest_table_lock);
1923 }
1924 
1925 /**
1926  * ptep_notify - call all invalidation callbacks for a specific pte.
1927  * @mm: pointer to the process mm_struct
1928  * @vmaddr: virtual address in the process address space
1929  * @pte: pointer to the page table entry
1930  * @bits: bits from the pgste that caused the notify call
1931  *
1932  * This function is assumed to be called with the page table lock held
1933  * for the pte to notify.
1934  */
1935 void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
1936 		 pte_t *pte, unsigned long bits)
1937 {
1938 	unsigned long offset, gaddr = 0;
1939 	struct gmap *gmap, *sg, *next;
1940 
1941 	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
1942 	offset = offset * (PAGE_SIZE / sizeof(pte_t));
1943 	rcu_read_lock();
1944 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
1945 		spin_lock(&gmap->guest_table_lock);
1946 		gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
1947 		spin_unlock(&gmap->guest_table_lock);
1948 		if (!IS_GADDR_VALID(gaddr))
1949 			continue;
1950 
1951 		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
1952 			spin_lock(&gmap->shadow_lock);
1953 			list_for_each_entry_safe(sg, next,
1954 						 &gmap->children, list)
1955 				gmap_shadow_notify(sg, vmaddr, gaddr);
1956 			spin_unlock(&gmap->shadow_lock);
1957 		}
1958 		if (bits & PGSTE_IN_BIT)
1959 			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
1960 	}
1961 	rcu_read_unlock();
1962 }
1963 EXPORT_SYMBOL_GPL(ptep_notify);
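/*
 * Editorial note (worked example, not part of the kernel source): the
 * offset computed at the top of ptep_notify() is the byte offset of the
 * notified page within its 1 MB segment, derived from the position of
 * the pte in its page table. With PAGE_SIZE = 4096 and sizeof(pte_t) = 8,
 * a pte at index 42 of its 256-entry table sits at byte offset
 * 42 * 8 = 336, and 336 * (4096 / 8) = 172032 = 42 * PAGE_SIZE, which is
 * then added to the segment-aligned guest address from the
 * host_to_guest tree.
 */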
1964 
1965 static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
1966 			     unsigned long gaddr)
1967 {
1968 	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
1969 	gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
1970 }
1971 
1972 /**
1973  * gmap_pmdp_xchg - exchange a gmap pmd with another
1974  * @gmap: pointer to the guest address space structure
1975  * @pmdp: pointer to the pmd entry
1976  * @new: replacement entry
1977  * @gaddr: the affected guest address
1978  *
1979  * This function is assumed to be called with the guest_table_lock
1980  * held.
1981  */
1982 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
1983 			   unsigned long gaddr)
1984 {
1985 	gaddr &= HPAGE_MASK;
1986 	pmdp_notify_gmap(gmap, pmdp, gaddr);
1987 	new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
1988 	if (machine_has_tlb_guest())
1989 		__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
1990 			    IDTE_GLOBAL);
1991 	else if (cpu_has_idte())
1992 		__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
1993 	else
1994 		__pmdp_csp(pmdp);
1995 	set_pmd(pmdp, new);
1996 }
1997 
1998 static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
1999 			    int purge)
2000 {
2001 	pmd_t *pmdp;
2002 	struct gmap *gmap;
2003 	unsigned long gaddr;
2004 
2005 	rcu_read_lock();
2006 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2007 		spin_lock(&gmap->guest_table_lock);
2008 		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2009 		if (pmdp) {
2010 			pmdp_notify_gmap(gmap, pmdp, gaddr);
2011 			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2012 						   _SEGMENT_ENTRY_GMAP_UC |
2013 						   _SEGMENT_ENTRY));
2014 			if (purge)
2015 				__pmdp_csp(pmdp);
2016 			set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
2017 		}
2018 		spin_unlock(&gmap->guest_table_lock);
2019 	}
2020 	rcu_read_unlock();
2021 }
2022 
2023 /**
2024  * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
2025  *                        flushing
2026  * @mm: pointer to the process mm_struct
2027  * @vmaddr: virtual address in the process address space
2028  */
2029 void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
2030 {
2031 	gmap_pmdp_clear(mm, vmaddr, 0);
2032 }
2033 EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
2034 
2035 /**
2036  * gmap_pmdp_csp - csp all affected guest pmd entries
2037  * @mm: pointer to the process mm_struct
2038  * @vmaddr: virtual address in the process address space
2039  */
2040 void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
2041 {
2042 	gmap_pmdp_clear(mm, vmaddr, 1);
2043 }
2044 EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
2045 
2046 /**
2047  * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
2048  * @mm: pointer to the process mm_struct
2049  * @vmaddr: virtual address in the process address space
2050  */
2051 void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
2052 {
2053 	unsigned long gaddr;
2054 	struct gmap *gmap;
2055 	pmd_t *pmdp;
2056 
2057 	rcu_read_lock();
2058 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2059 		spin_lock(&gmap->guest_table_lock);
2060 		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2061 		if (pmdp) {
2062 			pmdp_notify_gmap(gmap, pmdp, gaddr);
2063 			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2064 						   _SEGMENT_ENTRY_GMAP_UC |
2065 						   _SEGMENT_ENTRY));
2066 			if (machine_has_tlb_guest())
2067 				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2068 					    gmap->asce, IDTE_LOCAL);
2069 			else if (cpu_has_idte())
2070 				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
2071 			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2072 		}
2073 		spin_unlock(&gmap->guest_table_lock);
2074 	}
2075 	rcu_read_unlock();
2076 }
2077 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
2078 
2079 /**
2080  * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
2081  * @mm: pointer to the process mm_struct
2082  * @vmaddr: virtual address in the process address space
2083  */
2084 void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
2085 {
2086 	unsigned long gaddr;
2087 	struct gmap *gmap;
2088 	pmd_t *pmdp;
2089 
2090 	rcu_read_lock();
2091 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2092 		spin_lock(&gmap->guest_table_lock);
2093 		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2094 		if (pmdp) {
2095 			pmdp_notify_gmap(gmap, pmdp, gaddr);
2096 			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2097 						   _SEGMENT_ENTRY_GMAP_UC |
2098 						   _SEGMENT_ENTRY));
2099 			if (machine_has_tlb_guest())
2100 				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2101 					    gmap->asce, IDTE_GLOBAL);
2102 			else if (cpu_has_idte())
2103 				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
2104 			else
2105 				__pmdp_csp(pmdp);
2106 			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2107 		}
2108 		spin_unlock(&gmap->guest_table_lock);
2109 	}
2110 	rcu_read_unlock();
2111 }
2112 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
2113 
2114 /**
2115  * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
2116  * @gmap: pointer to guest address space
2117  * @pmdp: pointer to the pmd to be tested
2118  * @gaddr: virtual address in the guest address space
2119  *
2120  * This function is assumed to be called with the guest_table_lock
2121  * held.
2122  */
2123 static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
2124 					  unsigned long gaddr)
2125 {
2126 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
2127 		return false;
2128 
2129 	/* Already protected memory that did not change is clean */
2130 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
2131 	    !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
2132 		return false;
2133 
2134 	/* Clear UC indication and reset protection */
2135 	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
2136 	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
2137 	return true;
2138 }
2139 
2140 /**
2141  * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
2142  * @gmap: pointer to guest address space
2143  * @bitmap: dirty bitmap for this pmd
2144  * @gaddr: virtual address in the guest address space
2145  * @vmaddr: virtual address in the host address space
2146  *
2147  * This function is assumed to be called with the guest_table_lock
2148  * held.
2149  */
2150 void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
2151 			     unsigned long gaddr, unsigned long vmaddr)
2152 {
2153 	int i;
2154 	pmd_t *pmdp;
2155 	pte_t *ptep;
2156 	spinlock_t *ptl;
2157 
2158 	pmdp = gmap_pmd_op_walk(gmap, gaddr);
2159 	if (!pmdp)
2160 		return;
2161 
2162 	if (pmd_leaf(*pmdp)) {
2163 		if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
2164 			bitmap_fill(bitmap, _PAGE_ENTRIES);
2165 	} else {
2166 		for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
2167 			ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
2168 			if (!ptep)
2169 				continue;
2170 			if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
2171 				set_bit(i, bitmap);
2172 			pte_unmap_unlock(ptep, ptl);
2173 		}
2174 	}
2175 	gmap_pmd_op_end(gmap, pmdp);
2176 }
2177 EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
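/*
 * Editorial sketch (not part of the kernel source): a hypothetical
 * caller collecting the dirty state of one 1 MB segment. The bitmap
 * covers the 256 pages of the segment; example_mark_page_dirty() is a
 * placeholder, not a real kernel API, and the locking requirements of
 * the surrounding code are omitted.
 */
#if 0	/* illustrative only */
static void example_collect_segment_dirty(struct gmap *gmap,
					  unsigned long gaddr,
					  unsigned long vmaddr)
{
	DECLARE_BITMAP(bitmap, _PAGE_ENTRIES);
	unsigned int i;

	bitmap_zero(bitmap, _PAGE_ENTRIES);
	gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
	for_each_set_bit(i, bitmap, _PAGE_ENTRIES)
		example_mark_page_dirty(gaddr + i * PAGE_SIZE);
}
#endif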
2178 
2179 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2180 static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
2181 				    unsigned long end, struct mm_walk *walk)
2182 {
2183 	struct vm_area_struct *vma = walk->vma;
2184 
2185 	split_huge_pmd(vma, pmd, addr);
2186 	return 0;
2187 }
2188 
2189 static const struct mm_walk_ops thp_split_walk_ops = {
2190 	.pmd_entry	= thp_split_walk_pmd_entry,
2191 	.walk_lock	= PGWALK_WRLOCK_VERIFY,
2192 };
2193 
2194 static inline void thp_split_mm(struct mm_struct *mm)
2195 {
2196 	struct vm_area_struct *vma;
2197 	VMA_ITERATOR(vmi, mm, 0);
2198 
2199 	for_each_vma(vmi, vma) {
2200 		vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
2201 		walk_page_vma(vma, &thp_split_walk_ops, NULL);
2202 	}
2203 	mm->def_flags |= VM_NOHUGEPAGE;
2204 }
2205 #else
2206 static inline void thp_split_mm(struct mm_struct *mm)
2207 {
2208 }
2209 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2210 
2211 /*
2212  * switch on pgstes for the current userspace process (for kvm)
2213  */
2214 int s390_enable_sie(void)
2215 {
2216 	struct mm_struct *mm = current->mm;
2217 
2218 	/* Do we have pgstes? if yes, we are done */
2219 	if (mm_has_pgste(mm))
2220 		return 0;
2221 	mmap_write_lock(mm);
2222 	mm->context.has_pgste = 1;
2223 	/* split thp mappings and disable thp for future mappings */
2224 	thp_split_mm(mm);
2225 	mmap_write_unlock(mm);
2226 	return 0;
2227 }
2228 EXPORT_SYMBOL_GPL(s390_enable_sie);
2229 
2230 /*
2231  * Enable storage key handling from now on and initialize the storage
2232  * keys with the default key.
2233  */
2234 static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
2235 				  unsigned long next, struct mm_walk *walk)
2236 {
2237 	/* Clear storage key */
2238 	ptep_zap_key(walk->mm, addr, pte);
2239 	return 0;
2240 }
2241 
2242 /*
2243  * Give a chance to schedule after setting the storage key for 256 pages.
2244  * We only hold the mmap lock, which is an rwsem, and the kvm srcu.
2245  * Both can sleep.
2246  */
2247 static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
2248 				  unsigned long next, struct mm_walk *walk)
2249 {
2250 	cond_resched();
2251 	return 0;
2252 }
2253 
2254 static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
2255 				      unsigned long hmask, unsigned long next,
2256 				      struct mm_walk *walk)
2257 {
2258 	pmd_t *pmd = (pmd_t *)pte;
2259 	unsigned long start, end;
2260 	struct folio *folio = page_folio(pmd_page(*pmd));
2261 
2262 	/*
2263 	 * The write check makes sure we do not set a key on shared
2264 	 * memory. This is needed as the walker does not differentiate
2265 	 * between actual guest memory and the process executable or
2266 	 * shared libraries.
2267 	 */
2268 	if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
2269 	    !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
2270 		return 0;
2271 
2272 	start = pmd_val(*pmd) & HPAGE_MASK;
2273 	end = start + HPAGE_SIZE;
2274 	__storage_key_init_range(start, end);
2275 	set_bit(PG_arch_1, &folio->flags);
2276 	cond_resched();
2277 	return 0;
2278 }
2279 
2280 static const struct mm_walk_ops enable_skey_walk_ops = {
2281 	.hugetlb_entry		= __s390_enable_skey_hugetlb,
2282 	.pte_entry		= __s390_enable_skey_pte,
2283 	.pmd_entry		= __s390_enable_skey_pmd,
2284 	.walk_lock		= PGWALK_WRLOCK,
2285 };
2286 
2287 int s390_enable_skey(void)
2288 {
2289 	struct mm_struct *mm = current->mm;
2290 	int rc = 0;
2291 
2292 	mmap_write_lock(mm);
2293 	if (mm_uses_skeys(mm))
2294 		goto out_up;
2295 
2296 	mm->context.uses_skeys = 1;
2297 	rc = gmap_helper_disable_cow_sharing();
2298 	if (rc) {
2299 		mm->context.uses_skeys = 0;
2300 		goto out_up;
2301 	}
2302 	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
2303 
2304 out_up:
2305 	mmap_write_unlock(mm);
2306 	return rc;
2307 }
2308 EXPORT_SYMBOL_GPL(s390_enable_skey);
2309 
2310 /*
2311  * Reset CMMA state, make all pages stable again.
2312  */
2313 static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2314 			     unsigned long next, struct mm_walk *walk)
2315 {
2316 	ptep_zap_unused(walk->mm, addr, pte, 1);
2317 	return 0;
2318 }
2319 
2320 static const struct mm_walk_ops reset_cmma_walk_ops = {
2321 	.pte_entry		= __s390_reset_cmma,
2322 	.walk_lock		= PGWALK_WRLOCK,
2323 };
2324 
2325 void s390_reset_cmma(struct mm_struct *mm)
2326 {
2327 	mmap_write_lock(mm);
2328 	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
2329 	mmap_write_unlock(mm);
2330 }
2331 EXPORT_SYMBOL_GPL(s390_reset_cmma);
2332 
2333 #define GATHER_GET_PAGES 32
2334 
2335 struct reset_walk_state {
2336 	unsigned long next;
2337 	unsigned long count;
2338 	unsigned long pfns[GATHER_GET_PAGES];
2339 };
2340 
2341 static int s390_gather_pages(pte_t *ptep, unsigned long addr,
2342 			     unsigned long next, struct mm_walk *walk)
2343 {
2344 	struct reset_walk_state *p = walk->private;
2345 	pte_t pte = READ_ONCE(*ptep);
2346 
2347 	if (pte_present(pte)) {
2348 		/* we have a reference from the mapping, take an extra one */
2349 		get_page(phys_to_page(pte_val(pte)));
2350 		p->pfns[p->count] = phys_to_pfn(pte_val(pte));
2351 		p->next = next;
2352 		p->count++;
2353 	}
2354 	return p->count >= GATHER_GET_PAGES;
2355 }
2356 
2357 static const struct mm_walk_ops gather_pages_ops = {
2358 	.pte_entry = s390_gather_pages,
2359 	.walk_lock = PGWALK_RDLOCK,
2360 };
2361 
2362 /*
2363  * Call the Destroy secure page UVC on each page in the given array of PFNs.
2364  * Each page needs to have an extra reference, which will be released here.
2365  */
2366 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
2367 {
2368 	struct folio *folio;
2369 	unsigned long i;
2370 
2371 	for (i = 0; i < count; i++) {
2372 		folio = pfn_folio(pfns[i]);
2373 		/* we always have an extra reference */
2374 		uv_destroy_folio(folio);
2375 		/* get rid of the extra reference */
2376 		folio_put(folio);
2377 		cond_resched();
2378 	}
2379 }
2380 EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
2381 
2382 /**
2383  * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
2384  * in the given range of the given address space.
2385  * @mm: the mm to operate on
2386  * @start: the start of the range
2387  * @end: the end of the range
2388  * @interruptible: if not 0, stop when a fatal signal is received
2389  *
2390  * Walk the given range of the given address space and call the destroy
2391  * secure page UVC on each page. Optionally exit early if a fatal signal is
2392  * pending.
2393  *
2394  * Return: 0 on success, -EINTR if the function stopped before completing
2395  */
2396 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
2397 			    unsigned long end, bool interruptible)
2398 {
2399 	struct reset_walk_state state = { .next = start };
2400 	int r = 1;
2401 
2402 	while (r > 0) {
2403 		state.count = 0;
2404 		mmap_read_lock(mm);
2405 		r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
2406 		mmap_read_unlock(mm);
2407 		cond_resched();
2408 		s390_uv_destroy_pfns(state.count, state.pfns);
2409 		if (interruptible && fatal_signal_pending(current))
2410 			return -EINTR;
2411 	}
2412 	return 0;
2413 }
2414 EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
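/*
 * Editorial note (not part of the kernel source): __s390_uv_destroy_range()
 * gathers at most GATHER_GET_PAGES (32) pages per mmap_read_lock section;
 * the extra references taken in s390_gather_pages() keep those pages alive
 * so the Destroy Secure Page UVCs can run after the lock is dropped,
 * bounding lock hold times on large ranges.
 */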
2415 
2416 /**
2417  * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
2418  * @gmap: the gmap whose ASCE needs to be replaced
2419  *
2420  * If the ASCE is a SEGMENT type then this function will return -EINVAL;
2421  * replacing it would otherwise leave the pointers in the host_to_guest radix
2422  * tree pointing to the wrong pages, causing use-after-free and memory corruption.
2423  * If the allocation of the new top level page table fails, the ASCE is not
2424  * replaced.
2425  * In any case, the old ASCE is always removed from the gmap CRST list.
2426  * Therefore the caller has to make sure to save a pointer to it
2427  * beforehand, unless a leak is actually intended.
2428  */
2429 int s390_replace_asce(struct gmap *gmap)
2430 {
2431 	unsigned long asce;
2432 	struct page *page;
2433 	void *table;
2434 
2435 	/* Replacing segment type ASCEs would cause serious issues */
2436 	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
2437 		return -EINVAL;
2438 
2439 	page = gmap_alloc_crst();
2440 	if (!page)
2441 		return -ENOMEM;
2442 	table = page_to_virt(page);
2443 	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
2444 
2445 	/* Set new table origin while preserving existing ASCE control bits */
2446 	asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
2447 	WRITE_ONCE(gmap->asce, asce);
2448 	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
2449 	WRITE_ONCE(gmap->table, table);
2450 
2451 	return 0;
2452 }
2453 EXPORT_SYMBOL_GPL(s390_replace_asce);
2454