1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * KVM guest address space mapping code
4 *
5 * Copyright IBM Corp. 2007, 2020
6 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
7 * David Hildenbrand <david@redhat.com>
8 * Janosch Frank <frankja@linux.vnet.ibm.com>
9 */
10
11 #include <linux/cpufeature.h>
12 #include <linux/export.h>
13 #include <linux/kernel.h>
14 #include <linux/pagewalk.h>
15 #include <linux/swap.h>
16 #include <linux/smp.h>
17 #include <linux/spinlock.h>
18 #include <linux/slab.h>
19 #include <linux/swapops.h>
20 #include <linux/ksm.h>
21 #include <linux/mman.h>
22 #include <linux/pgtable.h>
23 #include <asm/page-states.h>
24 #include <asm/pgalloc.h>
25 #include <asm/machine.h>
26 #include <asm/gmap_helpers.h>
27 #include <asm/gmap.h>
28 #include <asm/page.h>
29
30 /*
31 * The address is saved in a radix tree directly; NULL would be ambiguous,
32 * since 0 is a valid address, and NULL is returned when nothing was found.
33 * The lower bits are ignored by all users of the macro, so it can be used
34 * to distinguish a valid address 0 from a NULL.
35 */
36 #define VALID_GADDR_FLAG 1
37 #define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
38 #define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)
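
/*
 * Illustrative sketch (not part of the original file): how a guest address
 * of 0 survives a round trip through the radix tree.  The stored value is
 * 0x1 rather than NULL, so a successful lookup of address 0 can be told
 * apart from "no entry":
 *
 *	void *entry = (void *)MAKE_VALID_GADDR(0x0);	// == 0x1, not NULL
 *	unsigned long gaddr = (unsigned long)entry;
 *
 *	if (IS_GADDR_VALID(gaddr))
 *		gaddr &= HPAGE_MASK;	// recover the segment-aligned address
 */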
39
40 #define GMAP_SHADOW_FAKE_TABLE 1ULL
41
42 static struct page *gmap_alloc_crst(void)
43 {
44 struct page *page;
45
46 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
47 if (!page)
48 return NULL;
49 __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
50 return page;
51 }
52
53 /**
54 * gmap_alloc - allocate and initialize a guest address space
55 * @limit: maximum address of the gmap address space
56 *
57 * Returns a guest address space structure.
58 */
59 struct gmap *gmap_alloc(unsigned long limit)
60 {
61 struct gmap *gmap;
62 struct page *page;
63 unsigned long *table;
64 unsigned long etype, atype;
65
66 if (limit < _REGION3_SIZE) {
67 limit = _REGION3_SIZE - 1;
68 atype = _ASCE_TYPE_SEGMENT;
69 etype = _SEGMENT_ENTRY_EMPTY;
70 } else if (limit < _REGION2_SIZE) {
71 limit = _REGION2_SIZE - 1;
72 atype = _ASCE_TYPE_REGION3;
73 etype = _REGION3_ENTRY_EMPTY;
74 } else if (limit < _REGION1_SIZE) {
75 limit = _REGION1_SIZE - 1;
76 atype = _ASCE_TYPE_REGION2;
77 etype = _REGION2_ENTRY_EMPTY;
78 } else {
79 limit = -1UL;
80 atype = _ASCE_TYPE_REGION1;
81 etype = _REGION1_ENTRY_EMPTY;
82 }
83 gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
84 if (!gmap)
85 goto out;
86 INIT_LIST_HEAD(&gmap->children);
87 INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
88 INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
89 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
90 spin_lock_init(&gmap->guest_table_lock);
91 spin_lock_init(&gmap->shadow_lock);
92 refcount_set(&gmap->ref_count, 1);
93 page = gmap_alloc_crst();
94 if (!page)
95 goto out_free;
96 table = page_to_virt(page);
97 crst_table_init(table, etype);
98 gmap->table = table;
99 gmap->asce = atype | _ASCE_TABLE_LENGTH |
100 _ASCE_USER_BITS | __pa(table);
101 gmap->asce_end = limit;
102 return gmap;
103
104 out_free:
105 kfree(gmap);
106 out:
107 return NULL;
108 }
109 EXPORT_SYMBOL_GPL(gmap_alloc);
110
111 /**
112 * gmap_create - create a guest address space
113 * @mm: pointer to the parent mm_struct
114 * @limit: maximum size of the gmap address space
115 *
116 * Returns a guest address space structure.
117 */
118 struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
119 {
120 struct gmap *gmap;
121 unsigned long gmap_asce;
122
123 gmap = gmap_alloc(limit);
124 if (!gmap)
125 return NULL;
126 gmap->mm = mm;
127 spin_lock(&mm->context.lock);
128 list_add_rcu(&gmap->list, &mm->context.gmap_list);
129 if (list_is_singular(&mm->context.gmap_list))
130 gmap_asce = gmap->asce;
131 else
132 gmap_asce = -1UL;
133 WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
134 spin_unlock(&mm->context.lock);
135 return gmap;
136 }
137 EXPORT_SYMBOL_GPL(gmap_create);
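
/*
 * Example usage (hedged sketch, not taken from this file): a hypothetical
 * caller, e.g. a KVM backend, creates a guest address space for its mm and
 * later tears it down again.  "limit" is a placeholder; gmap_alloc() picks
 * the smallest table type that covers it and sets asce_end accordingly.
 *
 *	struct gmap *gmap;
 *
 *	gmap = gmap_create(current->mm, limit);
 *	if (!gmap)
 *		return -ENOMEM;
 *	...
 *	gmap_remove(gmap);	// unlink from the mm and drop the initial reference
 */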
138
139 static void gmap_flush_tlb(struct gmap *gmap)
140 {
141 __tlb_flush_idte(gmap->asce);
142 }
143
144 static void gmap_radix_tree_free(struct radix_tree_root *root)
145 {
146 struct radix_tree_iter iter;
147 unsigned long indices[16];
148 unsigned long index;
149 void __rcu **slot;
150 int i, nr;
151
152 /* A radix tree is freed by deleting all of its entries */
153 index = 0;
154 do {
155 nr = 0;
156 radix_tree_for_each_slot(slot, root, &iter, index) {
157 indices[nr] = iter.index;
158 if (++nr == 16)
159 break;
160 }
161 for (i = 0; i < nr; i++) {
162 index = indices[i];
163 radix_tree_delete(root, index);
164 }
165 } while (nr > 0);
166 }
167
168 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
169 {
170 struct gmap_rmap *rmap, *rnext, *head;
171 struct radix_tree_iter iter;
172 unsigned long indices[16];
173 unsigned long index;
174 void __rcu **slot;
175 int i, nr;
176
177 /* A radix tree is freed by deleting all of its entries */
178 index = 0;
179 do {
180 nr = 0;
181 radix_tree_for_each_slot(slot, root, &iter, index) {
182 indices[nr] = iter.index;
183 if (++nr == 16)
184 break;
185 }
186 for (i = 0; i < nr; i++) {
187 index = indices[i];
188 head = radix_tree_delete(root, index);
189 gmap_for_each_rmap_safe(rmap, rnext, head)
190 kfree(rmap);
191 }
192 } while (nr > 0);
193 }
194
195 static void gmap_free_crst(unsigned long *table, bool free_ptes)
196 {
197 bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
198 int i;
199
200 if (is_segment) {
201 if (!free_ptes)
202 goto out;
203 for (i = 0; i < _CRST_ENTRIES; i++)
204 if (!(table[i] & _SEGMENT_ENTRY_INVALID))
205 page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
206 } else {
207 for (i = 0; i < _CRST_ENTRIES; i++)
208 if (!(table[i] & _REGION_ENTRY_INVALID))
209 gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
210 }
211
212 out:
213 free_pages((unsigned long)table, CRST_ALLOC_ORDER);
214 }
215
216 /**
217 * gmap_free - free a guest address space
218 * @gmap: pointer to the guest address space structure
219 *
220 * No locks required. There are no references to this gmap anymore.
221 */
222 void gmap_free(struct gmap *gmap)
223 {
224 /* Flush tlb of all gmaps (if not already done for shadows) */
225 if (!(gmap_is_shadow(gmap) && gmap->removed))
226 gmap_flush_tlb(gmap);
227 /* Free all segment & region tables. */
228 gmap_free_crst(gmap->table, gmap_is_shadow(gmap));
229
230 gmap_radix_tree_free(&gmap->guest_to_host);
231 gmap_radix_tree_free(&gmap->host_to_guest);
232
233 /* Free additional data for a shadow gmap */
234 if (gmap_is_shadow(gmap)) {
235 gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
236 /* Release reference to the parent */
237 gmap_put(gmap->parent);
238 }
239
240 kfree(gmap);
241 }
242 EXPORT_SYMBOL_GPL(gmap_free);
243
244 /**
245 * gmap_get - increase reference counter for guest address space
246 * @gmap: pointer to the guest address space structure
247 *
248 * Returns the gmap pointer
249 */
250 struct gmap *gmap_get(struct gmap *gmap)
251 {
252 refcount_inc(&gmap->ref_count);
253 return gmap;
254 }
255 EXPORT_SYMBOL_GPL(gmap_get);
256
257 /**
258 * gmap_put - decrease reference counter for guest address space
259 * @gmap: pointer to the guest address space structure
260 *
261 * If the reference counter reaches zero the guest address space is freed.
262 */
263 void gmap_put(struct gmap *gmap)
264 {
265 if (refcount_dec_and_test(&gmap->ref_count))
266 gmap_free(gmap);
267 }
268 EXPORT_SYMBOL_GPL(gmap_put);
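
/*
 * Illustrative note: gmap_get()/gmap_put() follow the usual refcounting
 * pattern.  A hypothetical user keeping a gmap alive across a sleeping
 * operation would pair them like this:
 *
 *	gmap_get(gmap);
 *	... use gmap ...
 *	gmap_put(gmap);	// may call gmap_free() on the last reference
 */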
269
270 /**
271 * gmap_remove - remove a guest address space but do not free it yet
272 * @gmap: pointer to the guest address space structure
273 */
274 void gmap_remove(struct gmap *gmap)
275 {
276 struct gmap *sg, *next;
277 unsigned long gmap_asce;
278
279 /* Remove all shadow gmaps linked to this gmap */
280 if (!list_empty(&gmap->children)) {
281 spin_lock(&gmap->shadow_lock);
282 list_for_each_entry_safe(sg, next, &gmap->children, list) {
283 list_del(&sg->list);
284 gmap_put(sg);
285 }
286 spin_unlock(&gmap->shadow_lock);
287 }
288 /* Remove gmap from the per-mm list */
289 spin_lock(&gmap->mm->context.lock);
290 list_del_rcu(&gmap->list);
291 if (list_empty(&gmap->mm->context.gmap_list))
292 gmap_asce = 0;
293 else if (list_is_singular(&gmap->mm->context.gmap_list))
294 gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
295 struct gmap, list)->asce;
296 else
297 gmap_asce = -1UL;
298 WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
299 spin_unlock(&gmap->mm->context.lock);
300 synchronize_rcu();
301 /* Put reference */
302 gmap_put(gmap);
303 }
304 EXPORT_SYMBOL_GPL(gmap_remove);
305
306 /*
307 * gmap_alloc_table is assumed to be called with mmap_lock held
308 */
309 static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
310 unsigned long init, unsigned long gaddr)
311 {
312 struct page *page;
313 unsigned long *new;
314
315 /* since we don't free the gmap table until gmap_free we can allocate outside the lock */
316 page = gmap_alloc_crst();
317 if (!page)
318 return -ENOMEM;
319 new = page_to_virt(page);
320 crst_table_init(new, init);
321 spin_lock(&gmap->guest_table_lock);
322 if (*table & _REGION_ENTRY_INVALID) {
323 *table = __pa(new) | _REGION_ENTRY_LENGTH |
324 (*table & _REGION_ENTRY_TYPE_MASK);
325 page = NULL;
326 }
327 spin_unlock(&gmap->guest_table_lock);
328 if (page)
329 __free_pages(page, CRST_ALLOC_ORDER);
330 return 0;
331 }
332
333 static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
334 {
335 return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
336 }
337
338 static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
339 {
340 return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
341 }
342
343 static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
344 unsigned long *gaddr)
345 {
346 *gaddr = host_to_guest_delete(gmap, vmaddr);
347 if (IS_GADDR_VALID(*gaddr))
348 return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
349 return NULL;
350 }
351
352 /**
353 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
354 * @gmap: pointer to the guest address space structure
355 * @vmaddr: address in the host process address space
356 *
357 * Returns 1 if a TLB flush is required
358 */
359 static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
360 {
361 unsigned long gaddr;
362 int flush = 0;
363 pmd_t *pmdp;
364
365 BUG_ON(gmap_is_shadow(gmap));
366 spin_lock(&gmap->guest_table_lock);
367
368 pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
369 if (pmdp) {
370 flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
371 *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
372 }
373
374 spin_unlock(&gmap->guest_table_lock);
375 return flush;
376 }
377
378 /**
379 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
380 * @gmap: pointer to the guest address space structure
381 * @gaddr: address in the guest address space
382 *
383 * Returns 1 if a TLB flush is required
384 */
385 static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
386 {
387 unsigned long vmaddr;
388
389 vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
390 gaddr >> PMD_SHIFT);
391 return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
392 }
393
394 /**
395 * gmap_unmap_segment - unmap segment from the guest address space
396 * @gmap: pointer to the guest address space structure
397 * @to: address in the guest address space
398 * @len: length of the memory area to unmap
399 *
400 * Returns 0 if the unmap succeeded, -EINVAL if not.
401 */
402 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
403 {
404 unsigned long off;
405 int flush;
406
407 BUG_ON(gmap_is_shadow(gmap));
408 if ((to | len) & (PMD_SIZE - 1))
409 return -EINVAL;
410 if (len == 0 || to + len < to)
411 return -EINVAL;
412
413 flush = 0;
414 mmap_write_lock(gmap->mm);
415 for (off = 0; off < len; off += PMD_SIZE)
416 flush |= __gmap_unmap_by_gaddr(gmap, to + off);
417 mmap_write_unlock(gmap->mm);
418 if (flush)
419 gmap_flush_tlb(gmap);
420 return 0;
421 }
422 EXPORT_SYMBOL_GPL(gmap_unmap_segment);
423
424 /**
425 * gmap_map_segment - map a segment to the guest address space
426 * @gmap: pointer to the guest address space structure
427 * @from: source address in the parent address space
428 * @to: target address in the guest address space
429 * @len: length of the memory area to map
430 *
431 * Returns 0 if the mapping succeeded, -EINVAL or -ENOMEM if not.
432 */
433 int gmap_map_segment(struct gmap *gmap, unsigned long from,
434 unsigned long to, unsigned long len)
435 {
436 unsigned long off;
437 int flush;
438
439 BUG_ON(gmap_is_shadow(gmap));
440 if ((from | to | len) & (PMD_SIZE - 1))
441 return -EINVAL;
442 if (len == 0 || from + len < from || to + len < to ||
443 from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
444 return -EINVAL;
445
446 flush = 0;
447 mmap_write_lock(gmap->mm);
448 for (off = 0; off < len; off += PMD_SIZE) {
449 /* Remove old translation */
450 flush |= __gmap_unmap_by_gaddr(gmap, to + off);
451 /* Store new translation */
452 if (radix_tree_insert(&gmap->guest_to_host,
453 (to + off) >> PMD_SHIFT,
454 (void *) from + off))
455 break;
456 }
457 mmap_write_unlock(gmap->mm);
458 if (flush)
459 gmap_flush_tlb(gmap);
460 if (off >= len)
461 return 0;
462 gmap_unmap_segment(gmap, to, len);
463 return -ENOMEM;
464 }
465 EXPORT_SYMBOL_GPL(gmap_map_segment);
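
/*
 * Example (sketch with placeholder values, not from the original source):
 * mapping a single segment of the parent address space at guest address 0
 * and tearing the mapping down again.  Source, target and length must be
 * PMD (segment) aligned; "userspace_addr" stands in for such an address in
 * gmap->mm.
 *
 *	rc = gmap_map_segment(gmap, userspace_addr, 0x0, PMD_SIZE);
 *	if (rc)
 *		return rc;
 *	...
 *	gmap_unmap_segment(gmap, 0x0, PMD_SIZE);
 */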
466
467 /**
468 * __gmap_translate - translate a guest address to a user space address
469 * @gmap: pointer to guest mapping meta data structure
470 * @gaddr: guest address
471 *
472 * Returns user space address which corresponds to the guest address or
473 * -EFAULT if no such mapping exists.
474 * This function does not establish potentially missing page table entries.
475 * The mmap_lock of the mm that belongs to the address space must be held
476 * when this function gets called.
477 *
478 * Note: Can also be called for shadow gmaps.
479 */
480 unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
481 {
482 unsigned long vmaddr;
483
484 vmaddr = (unsigned long)
485 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
486 /* Note: guest_to_host is empty for a shadow gmap */
487 return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
488 }
489 EXPORT_SYMBOL_GPL(__gmap_translate);
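
/*
 * Illustrative sketch: __gmap_translate() only consults the guest_to_host
 * radix tree, so a hypothetical caller must hold the mmap_lock and handle
 * -EFAULT for guest segments that were never mapped:
 *
 *	mmap_read_lock(gmap->mm);
 *	vmaddr = __gmap_translate(gmap, gaddr);
 *	mmap_read_unlock(gmap->mm);
 *	if (IS_ERR_VALUE(vmaddr))
 *		return vmaddr;	// -EFAULT: no mapping for this guest segment
 */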
490
491 /**
492 * gmap_unlink - disconnect a page table from the gmap shadow tables
493 * @mm: pointer to the parent mm_struct
494 * @table: pointer to the host page table
495 * @vmaddr: vm address associated with the host page table
496 */
497 void gmap_unlink(struct mm_struct *mm, unsigned long *table,
498 unsigned long vmaddr)
499 {
500 struct gmap *gmap;
501 int flush;
502
503 rcu_read_lock();
504 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
505 flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
506 if (flush)
507 gmap_flush_tlb(gmap);
508 }
509 rcu_read_unlock();
510 }
511
512 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
513 unsigned long gaddr);
514
515 /**
516 * __gmap_link - set up shadow page tables to connect a host to a guest address
517 * @gmap: pointer to guest mapping meta data structure
518 * @gaddr: guest address
519 * @vmaddr: vm address
520 *
521 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
522 * if the vm address is already mapped to a different guest segment.
523 * The mmap_lock of the mm that belongs to the address space must be held
524 * when this function gets called.
525 */
526 int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
527 {
528 struct mm_struct *mm;
529 unsigned long *table;
530 spinlock_t *ptl;
531 pgd_t *pgd;
532 p4d_t *p4d;
533 pud_t *pud;
534 pmd_t *pmd;
535 u64 unprot;
536 int rc;
537
538 BUG_ON(gmap_is_shadow(gmap));
539 /* Create higher level tables in the gmap page table */
540 table = gmap->table;
541 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
542 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
543 if ((*table & _REGION_ENTRY_INVALID) &&
544 gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
545 gaddr & _REGION1_MASK))
546 return -ENOMEM;
547 table = __va(*table & _REGION_ENTRY_ORIGIN);
548 }
549 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
550 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
551 if ((*table & _REGION_ENTRY_INVALID) &&
552 gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
553 gaddr & _REGION2_MASK))
554 return -ENOMEM;
555 table = __va(*table & _REGION_ENTRY_ORIGIN);
556 }
557 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
558 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
559 if ((*table & _REGION_ENTRY_INVALID) &&
560 gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
561 gaddr & _REGION3_MASK))
562 return -ENOMEM;
563 table = __va(*table & _REGION_ENTRY_ORIGIN);
564 }
565 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
566 /* Walk the parent mm page table */
567 mm = gmap->mm;
568 pgd = pgd_offset(mm, vmaddr);
569 VM_BUG_ON(pgd_none(*pgd));
570 p4d = p4d_offset(pgd, vmaddr);
571 VM_BUG_ON(p4d_none(*p4d));
572 pud = pud_offset(p4d, vmaddr);
573 VM_BUG_ON(pud_none(*pud));
574 /* large puds cannot yet be handled */
575 if (pud_leaf(*pud))
576 return -EFAULT;
577 pmd = pmd_offset(pud, vmaddr);
578 VM_BUG_ON(pmd_none(*pmd));
579 /* Are we allowed to use huge pages? */
580 if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
581 return -EFAULT;
582 /* Link gmap segment table entry location to page table. */
583 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
584 if (rc)
585 return rc;
586 ptl = pmd_lock(mm, pmd);
587 spin_lock(&gmap->guest_table_lock);
588 if (*table == _SEGMENT_ENTRY_EMPTY) {
589 rc = radix_tree_insert(&gmap->host_to_guest,
590 vmaddr >> PMD_SHIFT,
591 (void *)MAKE_VALID_GADDR(gaddr));
592 if (!rc) {
593 if (pmd_leaf(*pmd)) {
594 *table = (pmd_val(*pmd) &
595 _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
596 | _SEGMENT_ENTRY_GMAP_UC
597 | _SEGMENT_ENTRY;
598 } else
599 *table = (pmd_val(*pmd) &
600 _SEGMENT_ENTRY_HARDWARE_BITS)
601 | _SEGMENT_ENTRY;
602 }
603 } else if (*table & _SEGMENT_ENTRY_PROTECT &&
604 !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
605 unprot = (u64)*table;
606 unprot &= ~_SEGMENT_ENTRY_PROTECT;
607 unprot |= _SEGMENT_ENTRY_GMAP_UC;
608 gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
609 }
610 spin_unlock(&gmap->guest_table_lock);
611 spin_unlock(ptl);
612 radix_tree_preload_end();
613 return rc;
614 }
615 EXPORT_SYMBOL(__gmap_link);
616
617 /*
618 * this function is assumed to be called with mmap_lock held
619 */
620 void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
621 {
622 unsigned long vmaddr;
623
624 mmap_assert_locked(gmap->mm);
625
626 /* Find the vm address for the guest address */
627 vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
628 gaddr >> PMD_SHIFT);
629 if (vmaddr) {
630 vmaddr |= gaddr & ~PMD_MASK;
631 gmap_helper_zap_one_page(gmap->mm, vmaddr);
632 }
633 }
634 EXPORT_SYMBOL_GPL(__gmap_zap);
635
636 static LIST_HEAD(gmap_notifier_list);
637 static DEFINE_SPINLOCK(gmap_notifier_lock);
638
639 /**
640 * gmap_register_pte_notifier - register a pte invalidation callback
641 * @nb: pointer to the gmap notifier block
642 */
643 void gmap_register_pte_notifier(struct gmap_notifier *nb)
644 {
645 spin_lock(&gmap_notifier_lock);
646 list_add_rcu(&nb->list, &gmap_notifier_list);
647 spin_unlock(&gmap_notifier_lock);
648 }
649 EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
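
/*
 * Example (assumption-laden sketch): a consumer provides a struct
 * gmap_notifier whose notifier_call(gmap, start, end) matches the
 * invocation in gmap_call_notifier() below.  The names my_gmap_notifier
 * and my_nb are hypothetical and not part of this file.
 *
 *	static void my_gmap_notifier(struct gmap *gmap, unsigned long start,
 *				     unsigned long end)
 *	{
 *		// react to the invalidated guest range [start, end]
 *	}
 *
 *	static struct gmap_notifier my_nb = { .notifier_call = my_gmap_notifier };
 *
 *	gmap_register_pte_notifier(&my_nb);
 *	...
 *	gmap_unregister_pte_notifier(&my_nb);
 */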
650
651 /**
652 * gmap_unregister_pte_notifier - remove a pte invalidation callback
653 * @nb: pointer to the gmap notifier block
654 */
655 void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
656 {
657 spin_lock(&gmap_notifier_lock);
658 list_del_rcu(&nb->list);
659 spin_unlock(&gmap_notifier_lock);
660 synchronize_rcu();
661 }
662 EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
663
664 /**
665 * gmap_call_notifier - call all registered invalidation callbacks
666 * @gmap: pointer to guest mapping meta data structure
667 * @start: start virtual address in the guest address space
668 * @end: end virtual address in the guest address space
669 */
670 static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
671 unsigned long end)
672 {
673 struct gmap_notifier *nb;
674
675 list_for_each_entry(nb, &gmap_notifier_list, list)
676 nb->notifier_call(gmap, start, end);
677 }
678
679 /**
680 * gmap_table_walk - walk the gmap page tables
681 * @gmap: pointer to guest mapping meta data structure
682 * @gaddr: virtual address in the guest address space
683 * @level: page table level to stop at
684 *
685 * Returns a table entry pointer for the given guest address and @level
686 * @level=0 : returns a pointer to a page table entry (or NULL)
687 * @level=1 : returns a pointer to a segment table entry (or NULL)
688 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
689 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
690 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
691 *
692 * Returns NULL if the gmap page tables could not be walked to the
693 * requested level.
694 *
695 * Note: Can also be called for shadow gmaps.
696 */
697 unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
698 {
699 const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
700 unsigned long *table = gmap->table;
701
702 if (gmap_is_shadow(gmap) && gmap->removed)
703 return NULL;
704
705 if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
706 return NULL;
707
708 if (asce_type != _ASCE_TYPE_REGION1 &&
709 gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
710 return NULL;
711
712 switch (asce_type) {
713 case _ASCE_TYPE_REGION1:
714 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
715 if (level == 4)
716 break;
717 if (*table & _REGION_ENTRY_INVALID)
718 return NULL;
719 table = __va(*table & _REGION_ENTRY_ORIGIN);
720 fallthrough;
721 case _ASCE_TYPE_REGION2:
722 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
723 if (level == 3)
724 break;
725 if (*table & _REGION_ENTRY_INVALID)
726 return NULL;
727 table = __va(*table & _REGION_ENTRY_ORIGIN);
728 fallthrough;
729 case _ASCE_TYPE_REGION3:
730 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
731 if (level == 2)
732 break;
733 if (*table & _REGION_ENTRY_INVALID)
734 return NULL;
735 table = __va(*table & _REGION_ENTRY_ORIGIN);
736 fallthrough;
737 case _ASCE_TYPE_SEGMENT:
738 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
739 if (level == 1)
740 break;
741 if (*table & _REGION_ENTRY_INVALID)
742 return NULL;
743 table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
744 table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT;
745 }
746 return table;
747 }
748 EXPORT_SYMBOL(gmap_table_walk);
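
/*
 * Example (sketch): fetching the segment table entry that backs a guest
 * address, the way several helpers in this file do it:
 *
 *	pmd_t *pmdp = (pmd_t *)gmap_table_walk(gmap, gaddr, 1);
 *
 *	if (!pmdp)
 *		return -EAGAIN;	// tables not populated down to segment level
 *
 * Level 0 would return the pte pointer instead, levels 2-4 the region
 * table entries, mirroring the table in the comment above.
 */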
749
750 /**
751 * gmap_pte_op_walk - walk the gmap page table, get the page table lock
752 * and return the pte pointer
753 * @gmap: pointer to guest mapping meta data structure
754 * @gaddr: virtual address in the guest address space
755 * @ptl: pointer to the spinlock pointer
756 *
757 * Returns a pointer to the locked pte for a guest address, or NULL
758 */
759 static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
760 spinlock_t **ptl)
761 {
762 unsigned long *table;
763
764 BUG_ON(gmap_is_shadow(gmap));
765 /* Walk the gmap page table, lock and get pte pointer */
766 table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
767 if (!table || *table & _SEGMENT_ENTRY_INVALID)
768 return NULL;
769 return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
770 }
771
772 /**
773 * gmap_pte_op_fixup - force a page in and connect the gmap page table
774 * @gmap: pointer to guest mapping meta data structure
775 * @gaddr: virtual address in the guest address space
776 * @vmaddr: address in the host process address space
777 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
778 *
779 * Returns 0 if the caller can retry __gmap_translate (might fail again),
780 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
781 * up or connecting the gmap page table.
782 */
783 static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
784 unsigned long vmaddr, int prot)
785 {
786 struct mm_struct *mm = gmap->mm;
787 unsigned int fault_flags;
788 bool unlocked = false;
789
790 BUG_ON(gmap_is_shadow(gmap));
791 fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
792 if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
793 return -EFAULT;
794 if (unlocked)
795 /* lost mmap_lock, caller has to retry __gmap_translate */
796 return 0;
797 /* Connect the page tables */
798 return __gmap_link(gmap, gaddr, vmaddr);
799 }
800
801 /**
802 * gmap_pte_op_end - release the page table lock
803 * @ptep: pointer to the locked pte
804 * @ptl: pointer to the page table spinlock
805 */
806 static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
807 {
808 pte_unmap_unlock(ptep, ptl);
809 }
810
811 /**
812 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
813 * and return the pmd pointer
814 * @gmap: pointer to guest mapping meta data structure
815 * @gaddr: virtual address in the guest address space
816 *
817 * Returns a pointer to the pmd for a guest address, or NULL
818 */
819 static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
820 {
821 pmd_t *pmdp;
822
823 BUG_ON(gmap_is_shadow(gmap));
824 pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
825 if (!pmdp)
826 return NULL;
827
828 /* without huge pages, there is no need to take the table lock */
829 if (!gmap->mm->context.allow_gmap_hpage_1m)
830 return pmd_none(*pmdp) ? NULL : pmdp;
831
832 spin_lock(&gmap->guest_table_lock);
833 if (pmd_none(*pmdp)) {
834 spin_unlock(&gmap->guest_table_lock);
835 return NULL;
836 }
837
838 /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
839 if (!pmd_leaf(*pmdp))
840 spin_unlock(&gmap->guest_table_lock);
841 return pmdp;
842 }
843
844 /**
845 * gmap_pmd_op_end - release the guest_table_lock if needed
846 * @gmap: pointer to the guest mapping meta data structure
847 * @pmdp: pointer to the pmd
848 */
849 static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
850 {
851 if (pmd_leaf(*pmdp))
852 spin_unlock(&gmap->guest_table_lock);
853 }
854
855 /*
856 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
857 * @pmdp: pointer to the pmd to be protected
858 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
859 * @bits: notification bits to set
860 *
861 * Returns:
862 * 0 if successfully protected
863 * -EAGAIN if a fixup is needed
864 * -EINVAL if unsupported notifier bits have been specified
865 *
866 * Expected to be called with sg->mm->mmap_lock in read and
867 * guest_table_lock held.
868 */
869 static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
870 pmd_t *pmdp, int prot, unsigned long bits)
871 {
872 int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
873 int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
874 pmd_t new = *pmdp;
875
876 /* Fixup needed */
877 if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
878 return -EAGAIN;
879
880 if (prot == PROT_NONE && !pmd_i) {
881 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
882 gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
883 }
884
885 if (prot == PROT_READ && !pmd_p) {
886 new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
887 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
888 gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
889 }
890
891 if (bits & GMAP_NOTIFY_MPROT)
892 set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
893
894 /* Shadow GMAP protection needs split PMDs */
895 if (bits & GMAP_NOTIFY_SHADOW)
896 return -EINVAL;
897
898 return 0;
899 }
900
901 /*
902 * gmap_protect_pte - remove access rights to memory and set pgste bits
903 * @gmap: pointer to guest mapping meta data structure
904 * @gaddr: virtual address in the guest address space
905 * @pmdp: pointer to the pmd associated with the pte
906 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
907 * @bits: notification bits to set
908 *
909 * Returns 0 if successfully protected, -ENOMEM if out of memory and
910 * -EAGAIN if a fixup is needed.
911 *
912 * Expected to be called with sg->mm->mmap_lock in read
913 */
914 static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
915 pmd_t *pmdp, int prot, unsigned long bits)
916 {
917 int rc;
918 pte_t *ptep;
919 spinlock_t *ptl;
920 unsigned long pbits = 0;
921
922 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
923 return -EAGAIN;
924
925 ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
926 if (!ptep)
927 return -ENOMEM;
928
929 pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
930 pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
931 /* Protect and unlock. */
932 rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
933 gmap_pte_op_end(ptep, ptl);
934 return rc;
935 }
936
937 /*
938 * gmap_protect_one - remove access rights to memory and set pgste bits
939 * @gmap: pointer to guest mapping meta data structure
940 * @gaddr: virtual address in the guest address space
942 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
943 * @bits: pgste notification bits to set
944 *
945 * Returns:
946 * PAGE_SIZE if a small page was successfully protected;
947 * HPAGE_SIZE if a large page was successfully protected;
948 * -ENOMEM if out of memory;
949 * -EFAULT if gaddr is invalid (or mapping for shadows is missing);
950 * -EAGAIN if the guest mapping is missing and should be fixed by the caller.
951 *
952 * Context: Called with sg->mm->mmap_lock in read.
953 */
954 int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
955 {
956 pmd_t *pmdp;
957 int rc = 0;
958
959 BUG_ON(gmap_is_shadow(gmap));
960
961 pmdp = gmap_pmd_op_walk(gmap, gaddr);
962 if (!pmdp)
963 return -EAGAIN;
964
965 if (!pmd_leaf(*pmdp)) {
966 rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
967 if (!rc)
968 rc = PAGE_SIZE;
969 } else {
970 rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
971 if (!rc)
972 rc = HPAGE_SIZE;
973 }
974 gmap_pmd_op_end(gmap, pmdp);
975
976 return rc;
977 }
978 EXPORT_SYMBOL_GPL(gmap_protect_one);
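
/*
 * Illustrative caller sketch: gmap_protect_one() reports the protected
 * granule size on success, so a hypothetical range-protect loop advances
 * by the returned value and resolves -EAGAIN via a fixup before retrying:
 *
 *	while (len) {
 *		rc = gmap_protect_one(gmap, gaddr, prot, bits);
 *		if (rc == -EAGAIN) {
 *			... __gmap_translate() + gmap_pte_op_fixup() ...
 *			continue;
 *		}
 *		if (rc < 0)
 *			return rc;
 *		gaddr += rc;	// PAGE_SIZE or HPAGE_SIZE
 *		len -= min(len, (unsigned long)rc);
 *	}
 */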
979
980 /**
981 * gmap_read_table - get an unsigned long value from a guest page table using
982 * absolute addressing, without marking the page referenced.
983 * @gmap: pointer to guest mapping meta data structure
984 * @gaddr: virtual address in the guest address space
985 * @val: pointer to the unsigned long value to return
986 *
987 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
988 * if reading using the virtual address failed. -EINVAL if called on a gmap
989 * shadow.
990 *
991 * Called with gmap->mm->mmap_lock in read.
992 */
993 int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
994 {
995 unsigned long address, vmaddr;
996 spinlock_t *ptl;
997 pte_t *ptep, pte;
998 int rc;
999
1000 if (gmap_is_shadow(gmap))
1001 return -EINVAL;
1002
1003 while (1) {
1004 rc = -EAGAIN;
1005 ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
1006 if (ptep) {
1007 pte = *ptep;
1008 if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
1009 address = pte_val(pte) & PAGE_MASK;
1010 address += gaddr & ~PAGE_MASK;
1011 *val = *(unsigned long *)__va(address);
1012 set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
1013 /* Do *NOT* clear the _PAGE_INVALID bit! */
1014 rc = 0;
1015 }
1016 gmap_pte_op_end(ptep, ptl);
1017 }
1018 if (!rc)
1019 break;
1020 vmaddr = __gmap_translate(gmap, gaddr);
1021 if (IS_ERR_VALUE(vmaddr)) {
1022 rc = vmaddr;
1023 break;
1024 }
1025 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
1026 if (rc)
1027 break;
1028 }
1029 return rc;
1030 }
1031 EXPORT_SYMBOL_GPL(gmap_read_table);
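
/*
 * The loop in gmap_read_table() is the canonical fault-and-retry pattern
 * used throughout this file: walk the gmap, and if the walk fails,
 * translate the guest address, let gmap_pte_op_fixup() fault the page in
 * and re-link it, then retry.  Condensed sketch (illustrative only):
 *
 *	do {
 *		rc = -EAGAIN;
 *		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
 *		if (ptep) {
 *			... operate on *ptep, set rc ...
 *			gmap_pte_op_end(ptep, ptl);
 *		}
 *		if (rc != -EAGAIN)
 *			break;
 *		vmaddr = __gmap_translate(gmap, gaddr);
 *		if (IS_ERR_VALUE(vmaddr))
 *			return vmaddr;
 *		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
 *	} while (!rc);
 */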
1032
1033 /**
1034 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
1035 * @sg: pointer to the shadow guest address space structure
1036 * @vmaddr: vm address associated with the rmap
1037 * @rmap: pointer to the rmap structure
1038 *
1039 * Called with the sg->guest_table_lock
1040 */
1041 static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1042 struct gmap_rmap *rmap)
1043 {
1044 struct gmap_rmap *temp;
1045 void __rcu **slot;
1046
1047 BUG_ON(!gmap_is_shadow(sg));
1048 slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1049 if (slot) {
1050 rmap->next = radix_tree_deref_slot_protected(slot,
1051 &sg->guest_table_lock);
1052 for (temp = rmap->next; temp; temp = temp->next) {
1053 if (temp->raddr == rmap->raddr) {
1054 kfree(rmap);
1055 return;
1056 }
1057 }
1058 radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1059 } else {
1060 rmap->next = NULL;
1061 radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
1062 rmap);
1063 }
1064 }
1065
1066 /**
1067 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
1068 * @sg: pointer to the shadow guest address space structure
1069 * @raddr: rmap address in the shadow gmap
1070 * @paddr: address in the parent guest address space
1071 * @len: length of the memory area to protect
1072 *
1073 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1074 * if out of memory and -EFAULT if paddr is invalid.
1075 */
1076 static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1077 unsigned long paddr, unsigned long len)
1078 {
1079 struct gmap *parent;
1080 struct gmap_rmap *rmap;
1081 unsigned long vmaddr;
1082 spinlock_t *ptl;
1083 pte_t *ptep;
1084 int rc;
1085
1086 BUG_ON(!gmap_is_shadow(sg));
1087 parent = sg->parent;
1088 while (len) {
1089 vmaddr = __gmap_translate(parent, paddr);
1090 if (IS_ERR_VALUE(vmaddr))
1091 return vmaddr;
1092 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1093 if (!rmap)
1094 return -ENOMEM;
1095 rmap->raddr = raddr;
1096 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1097 if (rc) {
1098 kfree(rmap);
1099 return rc;
1100 }
1101 rc = -EAGAIN;
1102 ptep = gmap_pte_op_walk(parent, paddr, &ptl);
1103 if (ptep) {
1104 spin_lock(&sg->guest_table_lock);
1105 rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
1106 PGSTE_VSIE_BIT);
1107 if (!rc)
1108 gmap_insert_rmap(sg, vmaddr, rmap);
1109 spin_unlock(&sg->guest_table_lock);
1110 gmap_pte_op_end(ptep, ptl);
1111 }
1112 radix_tree_preload_end();
1113 if (rc) {
1114 kfree(rmap);
1115 rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
1116 if (rc)
1117 return rc;
1118 continue;
1119 }
1120 paddr += PAGE_SIZE;
1121 len -= PAGE_SIZE;
1122 }
1123 return 0;
1124 }
1125
1126 #define _SHADOW_RMAP_MASK 0x7
1127 #define _SHADOW_RMAP_REGION1 0x5
1128 #define _SHADOW_RMAP_REGION2 0x4
1129 #define _SHADOW_RMAP_REGION3 0x3
1130 #define _SHADOW_RMAP_SEGMENT 0x2
1131 #define _SHADOW_RMAP_PGTABLE 0x1
1132
1133 /**
1134 * gmap_idte_one - invalidate a single region or segment table entry
1135 * @asce: region or segment table *origin* + table-type bits
1136 * @vaddr: virtual address to identify the table entry to flush
1137 *
1138 * The invalid bit of a single region or segment table entry is set
1139 * and the associated TLB entries depending on the entry are flushed.
1140 * The table-type of the @asce identifies the portion of the @vaddr
1141 * that is used as the invalidation index.
1142 */
1143 static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
1144 {
1145 asm volatile(
1146 " idte %0,0,%1"
1147 : : "a" (asce), "a" (vaddr) : "cc", "memory");
1148 }
1149
1150 /**
1151 * gmap_unshadow_page - remove a page from a shadow page table
1152 * @sg: pointer to the shadow guest address space structure
1153 * @raddr: rmap address in the shadow guest address space
1154 *
1155 * Called with the sg->guest_table_lock
1156 */
1157 static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1158 {
1159 unsigned long *table;
1160
1161 BUG_ON(!gmap_is_shadow(sg));
1162 table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1163 if (!table || *table & _PAGE_INVALID)
1164 return;
1165 gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1);
1166 ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1167 }
1168
1169 /**
1170 * __gmap_unshadow_pgt - remove all entries from a shadow page table
1171 * @sg: pointer to the shadow guest address space structure
1172 * @raddr: rmap address in the shadow guest address space
1173 * @pgt: pointer to the start of a shadow page table
1174 *
1175 * Called with the sg->guest_table_lock
1176 */
1177 static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1178 unsigned long *pgt)
1179 {
1180 int i;
1181
1182 BUG_ON(!gmap_is_shadow(sg));
1183 for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE)
1184 pgt[i] = _PAGE_INVALID;
1185 }
1186
1187 /**
1188 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1189 * @sg: pointer to the shadow guest address space structure
1190 * @raddr: address in the shadow guest address space
1191 *
1192 * Called with the sg->guest_table_lock
1193 */
1194 static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1195 {
1196 unsigned long *ste;
1197 phys_addr_t sto, pgt;
1198 struct ptdesc *ptdesc;
1199
1200 BUG_ON(!gmap_is_shadow(sg));
1201 ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1202 if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
1203 return;
1204 gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
1205 sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
1206 gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1207 pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
1208 *ste = _SEGMENT_ENTRY_EMPTY;
1209 __gmap_unshadow_pgt(sg, raddr, __va(pgt));
1210 /* Free page table */
1211 ptdesc = page_ptdesc(phys_to_page(pgt));
1212 page_table_free_pgste(ptdesc);
1213 }
1214
1215 /**
1216 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1217 * @sg: pointer to the shadow guest address space structure
1218 * @raddr: rmap address in the shadow guest address space
1219 * @sgt: pointer to the start of a shadow segment table
1220 *
1221 * Called with the sg->guest_table_lock
1222 */
1223 static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1224 unsigned long *sgt)
1225 {
1226 struct ptdesc *ptdesc;
1227 phys_addr_t pgt;
1228 int i;
1229
1230 BUG_ON(!gmap_is_shadow(sg));
1231 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
1232 if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
1233 continue;
1234 pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
1235 sgt[i] = _SEGMENT_ENTRY_EMPTY;
1236 __gmap_unshadow_pgt(sg, raddr, __va(pgt));
1237 /* Free page table */
1238 ptdesc = page_ptdesc(phys_to_page(pgt));
1239 page_table_free_pgste(ptdesc);
1240 }
1241 }
1242
1243 /**
1244 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1245 * @sg: pointer to the shadow guest address space structure
1246 * @raddr: rmap address in the shadow guest address space
1247 *
1248 * Called with the shadow->guest_table_lock
1249 */
1250 static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1251 {
1252 unsigned long r3o, *r3e;
1253 phys_addr_t sgt;
1254 struct page *page;
1255
1256 BUG_ON(!gmap_is_shadow(sg));
1257 r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1258 if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
1259 return;
1260 gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
1261 r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
1262 gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
1263 sgt = *r3e & _REGION_ENTRY_ORIGIN;
1264 *r3e = _REGION3_ENTRY_EMPTY;
1265 __gmap_unshadow_sgt(sg, raddr, __va(sgt));
1266 /* Free segment table */
1267 page = phys_to_page(sgt);
1268 __free_pages(page, CRST_ALLOC_ORDER);
1269 }
1270
1271 /**
1272 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1273 * @sg: pointer to the shadow guest address space structure
1274 * @raddr: address in the shadow guest address space
1275 * @r3t: pointer to the start of a shadow region-3 table
1276 *
1277 * Called with the sg->guest_table_lock
1278 */
1279 static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1280 unsigned long *r3t)
1281 {
1282 struct page *page;
1283 phys_addr_t sgt;
1284 int i;
1285
1286 BUG_ON(!gmap_is_shadow(sg));
1287 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
1288 if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
1289 continue;
1290 sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
1291 r3t[i] = _REGION3_ENTRY_EMPTY;
1292 __gmap_unshadow_sgt(sg, raddr, __va(sgt));
1293 /* Free segment table */
1294 page = phys_to_page(sgt);
1295 __free_pages(page, CRST_ALLOC_ORDER);
1296 }
1297 }
1298
1299 /**
1300 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1301 * @sg: pointer to the shadow guest address space structure
1302 * @raddr: rmap address in the shadow guest address space
1303 *
1304 * Called with the sg->guest_table_lock
1305 */
1306 static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1307 {
1308 unsigned long r2o, *r2e;
1309 phys_addr_t r3t;
1310 struct page *page;
1311
1312 BUG_ON(!gmap_is_shadow(sg));
1313 r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1314 if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
1315 return;
1316 gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
1317 r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
1318 gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
1319 r3t = *r2e & _REGION_ENTRY_ORIGIN;
1320 *r2e = _REGION2_ENTRY_EMPTY;
1321 __gmap_unshadow_r3t(sg, raddr, __va(r3t));
1322 /* Free region 3 table */
1323 page = phys_to_page(r3t);
1324 __free_pages(page, CRST_ALLOC_ORDER);
1325 }
1326
1327 /**
1328 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1329 * @sg: pointer to the shadow guest address space structure
1330 * @raddr: rmap address in the shadow guest address space
1331 * @r2t: pointer to the start of a shadow region-2 table
1332 *
1333 * Called with the sg->guest_table_lock
1334 */
1335 static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1336 unsigned long *r2t)
1337 {
1338 phys_addr_t r3t;
1339 struct page *page;
1340 int i;
1341
1342 BUG_ON(!gmap_is_shadow(sg));
1343 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
1344 if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
1345 continue;
1346 r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
1347 r2t[i] = _REGION2_ENTRY_EMPTY;
1348 __gmap_unshadow_r3t(sg, raddr, __va(r3t));
1349 /* Free region 3 table */
1350 page = phys_to_page(r3t);
1351 __free_pages(page, CRST_ALLOC_ORDER);
1352 }
1353 }
1354
1355 /**
1356 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1357 * @sg: pointer to the shadow guest address space structure
1358 * @raddr: rmap address in the shadow guest address space
1359 *
1360 * Called with the sg->guest_table_lock
1361 */
1362 static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1363 {
1364 unsigned long r1o, *r1e;
1365 struct page *page;
1366 phys_addr_t r2t;
1367
1368 BUG_ON(!gmap_is_shadow(sg));
1369 r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1370 if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
1371 return;
1372 gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
1373 r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
1374 gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
1375 r2t = *r1e & _REGION_ENTRY_ORIGIN;
1376 *r1e = _REGION1_ENTRY_EMPTY;
1377 __gmap_unshadow_r2t(sg, raddr, __va(r2t));
1378 /* Free region 2 table */
1379 page = phys_to_page(r2t);
1380 __free_pages(page, CRST_ALLOC_ORDER);
1381 }
1382
1383 /**
1384 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1385 * @sg: pointer to the shadow guest address space structure
1386 * @raddr: rmap address in the shadow guest address space
1387 * @r1t: pointer to the start of a shadow region-1 table
1388 *
1389 * Called with the shadow->guest_table_lock
1390 */
1391 static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1392 unsigned long *r1t)
1393 {
1394 unsigned long asce;
1395 struct page *page;
1396 phys_addr_t r2t;
1397 int i;
1398
1399 BUG_ON(!gmap_is_shadow(sg));
1400 asce = __pa(r1t) | _ASCE_TYPE_REGION1;
1401 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
1402 if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
1403 continue;
1404 r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
1405 __gmap_unshadow_r2t(sg, raddr, __va(r2t));
1406 /* Clear entry and flush translation r1t -> r2t */
1407 gmap_idte_one(asce, raddr);
1408 r1t[i] = _REGION1_ENTRY_EMPTY;
1409 /* Free region 2 table */
1410 page = phys_to_page(r2t);
1411 __free_pages(page, CRST_ALLOC_ORDER);
1412 }
1413 }
1414
1415 /**
1416 * gmap_unshadow - remove a shadow page table completely
1417 * @sg: pointer to the shadow guest address space structure
1418 *
1419 * Called with sg->guest_table_lock
1420 */
1421 void gmap_unshadow(struct gmap *sg)
1422 {
1423 unsigned long *table;
1424
1425 BUG_ON(!gmap_is_shadow(sg));
1426 if (sg->removed)
1427 return;
1428 sg->removed = 1;
1429 gmap_call_notifier(sg, 0, -1UL);
1430 gmap_flush_tlb(sg);
1431 table = __va(sg->asce & _ASCE_ORIGIN);
1432 switch (sg->asce & _ASCE_TYPE_MASK) {
1433 case _ASCE_TYPE_REGION1:
1434 __gmap_unshadow_r1t(sg, 0, table);
1435 break;
1436 case _ASCE_TYPE_REGION2:
1437 __gmap_unshadow_r2t(sg, 0, table);
1438 break;
1439 case _ASCE_TYPE_REGION3:
1440 __gmap_unshadow_r3t(sg, 0, table);
1441 break;
1442 case _ASCE_TYPE_SEGMENT:
1443 __gmap_unshadow_sgt(sg, 0, table);
1444 break;
1445 }
1446 }
1447 EXPORT_SYMBOL(gmap_unshadow);
1448
1449 /**
1450 * gmap_shadow_r2t - create an empty shadow region 2 table
1451 * @sg: pointer to the shadow guest address space structure
1452 * @saddr: faulting address in the shadow gmap
1453 * @r2t: parent gmap address of the region 2 table to get shadowed
1454 * @fake: r2t references contiguous guest memory block, not a r2t
1455 *
1456 * The r2t parameter specifies the address of the source table. The
1457 * four pages of the source table are made read-only in the parent gmap
1458 * address space. A write to the source table area @r2t will automatically
1459 * remove the shadow r2 table and all of its descendants.
1460 *
1461 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1462 * shadow table structure is incomplete, -ENOMEM if out of memory and
1463 * -EFAULT if an address in the parent gmap could not be resolved.
1464 *
1465 * Called with sg->mm->mmap_lock in read.
1466 */
1467 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
1468 int fake)
1469 {
1470 unsigned long raddr, origin, offset, len;
1471 unsigned long *table;
1472 phys_addr_t s_r2t;
1473 struct page *page;
1474 int rc;
1475
1476 BUG_ON(!gmap_is_shadow(sg));
1477 /* Allocate a shadow region second table */
1478 page = gmap_alloc_crst();
1479 if (!page)
1480 return -ENOMEM;
1481 s_r2t = page_to_phys(page);
1482 /* Install shadow region second table */
1483 spin_lock(&sg->guest_table_lock);
1484 table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1485 if (!table) {
1486 rc = -EAGAIN; /* Race with unshadow */
1487 goto out_free;
1488 }
1489 if (!(*table & _REGION_ENTRY_INVALID)) {
1490 rc = 0; /* Already established */
1491 goto out_free;
1492 } else if (*table & _REGION_ENTRY_ORIGIN) {
1493 rc = -EAGAIN; /* Race with shadow */
1494 goto out_free;
1495 }
1496 crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
1497 /* mark as invalid as long as the parent table is not protected */
1498 *table = s_r2t | _REGION_ENTRY_LENGTH |
1499 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
1500 if (sg->edat_level >= 1)
1501 *table |= (r2t & _REGION_ENTRY_PROTECT);
1502 if (fake) {
1503 /* nothing to protect for fake tables */
1504 *table &= ~_REGION_ENTRY_INVALID;
1505 spin_unlock(&sg->guest_table_lock);
1506 return 0;
1507 }
1508 spin_unlock(&sg->guest_table_lock);
1509 /* Make r2t read-only in parent gmap page table */
1510 raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
1511 origin = r2t & _REGION_ENTRY_ORIGIN;
1512 offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1513 len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1514 rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1515 spin_lock(&sg->guest_table_lock);
1516 if (!rc) {
1517 table = gmap_table_walk(sg, saddr, 4);
1518 if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
1519 rc = -EAGAIN; /* Race with unshadow */
1520 else
1521 *table &= ~_REGION_ENTRY_INVALID;
1522 } else {
1523 gmap_unshadow_r2t(sg, raddr);
1524 }
1525 spin_unlock(&sg->guest_table_lock);
1526 return rc;
1527 out_free:
1528 spin_unlock(&sg->guest_table_lock);
1529 __free_pages(page, CRST_ALLOC_ORDER);
1530 return rc;
1531 }
1532 EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
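
/*
 * Note on the install protocol above (gmap_shadow_r3t() and
 * gmap_shadow_sgt() below follow the same scheme): the new shadow table is
 * first linked with _REGION_ENTRY_INVALID set, the guest_table_lock is
 * dropped while the parent table is write-protected via gmap_protect_rmap(),
 * and the invalid bit is only cleared if the entry still points to our
 * table afterwards.  A concurrent unshadow therefore surfaces as -EAGAIN
 * instead of leaving a dangling shadow table.
 */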
1533
1534 /**
1535 * gmap_shadow_r3t - create a shadow region 3 table
1536 * @sg: pointer to the shadow guest address space structure
1537 * @saddr: faulting address in the shadow gmap
1538 * @r3t: parent gmap address of the region 3 table to get shadowed
1539 * @fake: r3t references contiguous guest memory block, not a r3t
1540 *
1541 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1542 * shadow table structure is incomplete, -ENOMEM if out of memory and
1543 * -EFAULT if an address in the parent gmap could not be resolved.
1544 *
1545 * Called with sg->mm->mmap_lock in read.
1546 */
1547 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
1548 int fake)
1549 {
1550 unsigned long raddr, origin, offset, len;
1551 unsigned long *table;
1552 phys_addr_t s_r3t;
1553 struct page *page;
1554 int rc;
1555
1556 BUG_ON(!gmap_is_shadow(sg));
1557 /* Allocate a shadow region third table */
1558 page = gmap_alloc_crst();
1559 if (!page)
1560 return -ENOMEM;
1561 s_r3t = page_to_phys(page);
1562 /* Install shadow region third table */
1563 spin_lock(&sg->guest_table_lock);
1564 table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1565 if (!table) {
1566 rc = -EAGAIN; /* Race with unshadow */
1567 goto out_free;
1568 }
1569 if (!(*table & _REGION_ENTRY_INVALID)) {
1570 rc = 0; /* Already established */
1571 goto out_free;
1572 } else if (*table & _REGION_ENTRY_ORIGIN) {
1573 rc = -EAGAIN; /* Race with shadow */
1574 goto out_free;
1575 }
1576 crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
1577 /* mark as invalid as long as the parent table is not protected */
1578 *table = s_r3t | _REGION_ENTRY_LENGTH |
1579 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
1580 if (sg->edat_level >= 1)
1581 *table |= (r3t & _REGION_ENTRY_PROTECT);
1582 if (fake) {
1583 /* nothing to protect for fake tables */
1584 *table &= ~_REGION_ENTRY_INVALID;
1585 spin_unlock(&sg->guest_table_lock);
1586 return 0;
1587 }
1588 spin_unlock(&sg->guest_table_lock);
1589 /* Make r3t read-only in parent gmap page table */
1590 raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
1591 origin = r3t & _REGION_ENTRY_ORIGIN;
1592 offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1593 len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1594 rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1595 spin_lock(&sg->guest_table_lock);
1596 if (!rc) {
1597 table = gmap_table_walk(sg, saddr, 3);
1598 if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
1599 rc = -EAGAIN; /* Race with unshadow */
1600 else
1601 *table &= ~_REGION_ENTRY_INVALID;
1602 } else {
1603 gmap_unshadow_r3t(sg, raddr);
1604 }
1605 spin_unlock(&sg->guest_table_lock);
1606 return rc;
1607 out_free:
1608 spin_unlock(&sg->guest_table_lock);
1609 __free_pages(page, CRST_ALLOC_ORDER);
1610 return rc;
1611 }
1612 EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1613
1614 /**
1615 * gmap_shadow_sgt - create a shadow segment table
1616 * @sg: pointer to the shadow guest address space structure
1617 * @saddr: faulting address in the shadow gmap
1618 * @sgt: parent gmap address of the segment table to get shadowed
1619 * @fake: sgt references contiguous guest memory block, not a sgt
1620 *
1621 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1622 * shadow table structure is incomplete, -ENOMEM if out of memory and
1623 * -EFAULT if an address in the parent gmap could not be resolved.
1624 *
1625 * Called with sg->mm->mmap_lock in read.
1626 */
1627 int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
1628 int fake)
1629 {
1630 unsigned long raddr, origin, offset, len;
1631 unsigned long *table;
1632 phys_addr_t s_sgt;
1633 struct page *page;
1634 int rc;
1635
1636 BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
1637 /* Allocate a shadow segment table */
1638 page = gmap_alloc_crst();
1639 if (!page)
1640 return -ENOMEM;
1641 s_sgt = page_to_phys(page);
1642 /* Install shadow segment table */
1643 spin_lock(&sg->guest_table_lock);
1644 table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1645 if (!table) {
1646 rc = -EAGAIN; /* Race with unshadow */
1647 goto out_free;
1648 }
1649 if (!(*table & _REGION_ENTRY_INVALID)) {
1650 rc = 0; /* Already established */
1651 goto out_free;
1652 } else if (*table & _REGION_ENTRY_ORIGIN) {
1653 rc = -EAGAIN; /* Race with shadow */
1654 goto out_free;
1655 }
1656 crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
1657 /* mark as invalid as long as the parent table is not protected */
1658 *table = s_sgt | _REGION_ENTRY_LENGTH |
1659 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
1660 if (sg->edat_level >= 1)
1661 *table |= sgt & _REGION_ENTRY_PROTECT;
1662 if (fake) {
1663 /* nothing to protect for fake tables */
1664 *table &= ~_REGION_ENTRY_INVALID;
1665 spin_unlock(&sg->guest_table_lock);
1666 return 0;
1667 }
1668 spin_unlock(&sg->guest_table_lock);
1669 /* Make sgt read-only in parent gmap page table */
1670 raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
1671 origin = sgt & _REGION_ENTRY_ORIGIN;
1672 offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1673 len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1674 rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1675 spin_lock(&sg->guest_table_lock);
1676 if (!rc) {
1677 table = gmap_table_walk(sg, saddr, 2);
1678 if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
1679 rc = -EAGAIN; /* Race with unshadow */
1680 else
1681 *table &= ~_REGION_ENTRY_INVALID;
1682 } else {
1683 gmap_unshadow_sgt(sg, raddr);
1684 }
1685 spin_unlock(&sg->guest_table_lock);
1686 return rc;
1687 out_free:
1688 spin_unlock(&sg->guest_table_lock);
1689 __free_pages(page, CRST_ALLOC_ORDER);
1690 return rc;
1691 }
1692 EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1693
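/*
 * Remember the parent page table address of a shadow page table by storing
 * it in the PGSTE_ST2_MASK software fields of the first four PGSTEs: the
 * 64-bit value (which may carry the GMAP_SHADOW_FAKE_TABLE marker in its
 * low bits) is split into four 16-bit pieces, most significant piece first.
 */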
1694 static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
1695 {
1696 unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));
1697
1698 pgstes += _PAGE_ENTRIES;
1699
1700 pgstes[0] &= ~PGSTE_ST2_MASK;
1701 pgstes[1] &= ~PGSTE_ST2_MASK;
1702 pgstes[2] &= ~PGSTE_ST2_MASK;
1703 pgstes[3] &= ~PGSTE_ST2_MASK;
1704
1705 pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
1706 pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
1707 pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
1708 pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
1709 }
1710
1711 /**
1712 * gmap_shadow_pgt - instantiate a shadow page table
1713 * @sg: pointer to the shadow guest address space structure
1714 * @saddr: faulting address in the shadow gmap
1715 * @pgt: parent gmap address of the page table to get shadowed
1716 * @fake: pgt references contiguous guest memory block, not a pgtable
1717 *
1718  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1719  * shadow table structure is incomplete, -ENOMEM if out of memory and
1720  * -EFAULT if an address in the parent gmap could not be resolved.
1721  *
1722  * Called with sg->mm->mmap_lock in read.
1723 */
1724 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
1725 int fake)
1726 {
1727 unsigned long raddr, origin;
1728 unsigned long *table;
1729 struct ptdesc *ptdesc;
1730 phys_addr_t s_pgt;
1731 int rc;
1732
1733 BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
1734 /* Allocate a shadow page table */
1735 ptdesc = page_table_alloc_pgste(sg->mm);
1736 if (!ptdesc)
1737 return -ENOMEM;
1738 origin = pgt & _SEGMENT_ENTRY_ORIGIN;
1739 if (fake)
1740 origin |= GMAP_SHADOW_FAKE_TABLE;
1741 gmap_pgste_set_pgt_addr(ptdesc, origin);
1742 s_pgt = page_to_phys(ptdesc_page(ptdesc));
1743 /* Install shadow page table */
1744 spin_lock(&sg->guest_table_lock);
1745 table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1746 if (!table) {
1747 rc = -EAGAIN; /* Race with unshadow */
1748 goto out_free;
1749 }
1750 if (!(*table & _SEGMENT_ENTRY_INVALID)) {
1751 rc = 0; /* Already established */
1752 goto out_free;
1753 } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
1754 rc = -EAGAIN; /* Race with shadow */
1755 goto out_free;
1756 }
1757 /* mark as invalid as long as the parent table is not protected */
1758 *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
1759 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
1760 if (fake) {
1761 /* nothing to protect for fake tables */
1762 *table &= ~_SEGMENT_ENTRY_INVALID;
1763 spin_unlock(&sg->guest_table_lock);
1764 return 0;
1765 }
1766 spin_unlock(&sg->guest_table_lock);
1767 /* Make pgt read-only in parent gmap page table (not the pgste) */
1768 raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
1769 origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
1770 rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
1771 spin_lock(&sg->guest_table_lock);
1772 if (!rc) {
1773 table = gmap_table_walk(sg, saddr, 1);
1774 if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
1775 rc = -EAGAIN; /* Race with unshadow */
1776 else
1777 *table &= ~_SEGMENT_ENTRY_INVALID;
1778 } else {
1779 gmap_unshadow_pgt(sg, raddr);
1780 }
1781 spin_unlock(&sg->guest_table_lock);
1782 return rc;
1783 out_free:
1784 spin_unlock(&sg->guest_table_lock);
1785 page_table_free_pgste(ptdesc);
1786 return rc;
1788 }
1789 EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
1790
1791 /**
1792 * gmap_shadow_page - create a shadow page mapping
1793 * @sg: pointer to the shadow guest address space structure
1794 * @saddr: faulting address in the shadow gmap
1795 * @pte: pte in parent gmap address space to get shadowed
1796 *
1797 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1798 * shadow table structure is incomplete, -ENOMEM if out of memory and
1799 * -EFAULT if an address in the parent gmap could not be resolved.
1800 *
1801 * Called with sg->mm->mmap_lock in read.
1802 */
1803 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
1804 {
1805 struct gmap *parent;
1806 struct gmap_rmap *rmap;
1807 unsigned long vmaddr, paddr;
1808 spinlock_t *ptl;
1809 pte_t *sptep, *tptep;
1810 int prot;
1811 int rc;
1812
1813 BUG_ON(!gmap_is_shadow(sg));
1814 parent = sg->parent;
1815 prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
1816
1817 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1818 if (!rmap)
1819 return -ENOMEM;
1820 rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
1821
1822 while (1) {
1823 paddr = pte_val(pte) & PAGE_MASK;
1824 vmaddr = __gmap_translate(parent, paddr);
1825 if (IS_ERR_VALUE(vmaddr)) {
1826 rc = vmaddr;
1827 break;
1828 }
1829 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1830 if (rc)
1831 break;
1832 rc = -EAGAIN;
1833 sptep = gmap_pte_op_walk(parent, paddr, &ptl);
1834 if (sptep) {
1835 spin_lock(&sg->guest_table_lock);
1836 /* Get page table pointer */
1837 tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
1838 if (!tptep) {
1839 spin_unlock(&sg->guest_table_lock);
1840 gmap_pte_op_end(sptep, ptl);
1841 radix_tree_preload_end();
1842 break;
1843 }
1844 rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
1845 if (rc > 0) {
1846 /* Success and a new mapping */
1847 gmap_insert_rmap(sg, vmaddr, rmap);
1848 rmap = NULL;
1849 rc = 0;
1850 }
1851 gmap_pte_op_end(sptep, ptl);
1852 spin_unlock(&sg->guest_table_lock);
1853 }
1854 radix_tree_preload_end();
1855 if (!rc)
1856 break;
1857 rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
1858 if (rc)
1859 break;
1860 }
1861 kfree(rmap);
1862 return rc;
1863 }
1864 EXPORT_SYMBOL_GPL(gmap_shadow_page);
1865
1866 /*
1867 * gmap_shadow_notify - handle notifications for shadow gmap
1868 *
1869  * Called with sg->parent->shadow_lock held.
1870 */
1871 static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
1872 unsigned long gaddr)
1873 {
1874 struct gmap_rmap *rmap, *rnext, *head;
1875 unsigned long start, end, bits, raddr;
1876
1877 BUG_ON(!gmap_is_shadow(sg));
1878
1879 spin_lock(&sg->guest_table_lock);
1880 if (sg->removed) {
1881 spin_unlock(&sg->guest_table_lock);
1882 return;
1883 }
1884 /* Check for top level table */
1885 start = sg->orig_asce & _ASCE_ORIGIN;
1886 end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
1887 if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
1888 gaddr < end) {
1889 /* The complete shadow table has to go */
1890 gmap_unshadow(sg);
1891 spin_unlock(&sg->guest_table_lock);
1892 list_del(&sg->list);
1893 gmap_put(sg);
1894 return;
1895 }
1896 	/* Remove the page table tree starting from one specific entry */
1897 head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1898 gmap_for_each_rmap_safe(rmap, rnext, head) {
1899 bits = rmap->raddr & _SHADOW_RMAP_MASK;
1900 raddr = rmap->raddr ^ bits;
1901 switch (bits) {
1902 case _SHADOW_RMAP_REGION1:
1903 gmap_unshadow_r2t(sg, raddr);
1904 break;
1905 case _SHADOW_RMAP_REGION2:
1906 gmap_unshadow_r3t(sg, raddr);
1907 break;
1908 case _SHADOW_RMAP_REGION3:
1909 gmap_unshadow_sgt(sg, raddr);
1910 break;
1911 case _SHADOW_RMAP_SEGMENT:
1912 gmap_unshadow_pgt(sg, raddr);
1913 break;
1914 case _SHADOW_RMAP_PGTABLE:
1915 gmap_unshadow_page(sg, raddr);
1916 break;
1917 }
1918 kfree(rmap);
1919 }
1920 spin_unlock(&sg->guest_table_lock);
1921 }
1922
1923 /**
1924 * ptep_notify - call all invalidation callbacks for a specific pte.
1925 * @mm: pointer to the process mm_struct
1926 * @vmaddr: virtual address in the process address space
1927 * @pte: pointer to the page table entry
1928 * @bits: bits from the pgste that caused the notify call
1929 *
1930 * This function is assumed to be called with the page table lock held
1931 * for the pte to notify.
1932 */
1933 void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
1934 pte_t *pte, unsigned long bits)
1935 {
1936 unsigned long offset, gaddr = 0;
1937 struct gmap *gmap, *sg, *next;
1938
1939 offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
1940 offset = offset * (PAGE_SIZE / sizeof(pte_t));
1941 rcu_read_lock();
1942 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
1943 spin_lock(&gmap->guest_table_lock);
1944 gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
1945 spin_unlock(&gmap->guest_table_lock);
1946 if (!IS_GADDR_VALID(gaddr))
1947 continue;
1948
1949 if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
1950 spin_lock(&gmap->shadow_lock);
1951 list_for_each_entry_safe(sg, next,
1952 &gmap->children, list)
1953 gmap_shadow_notify(sg, vmaddr, gaddr);
1954 spin_unlock(&gmap->shadow_lock);
1955 }
1956 if (bits & PGSTE_IN_BIT)
1957 gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
1958 }
1959 rcu_read_unlock();
1960 }
1961 EXPORT_SYMBOL_GPL(ptep_notify);
1962
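/*
 * Clear the _SEGMENT_ENTRY_GMAP_IN notification bit in the pmd and call the
 * invalidation notifiers for the huge page starting at @gaddr.
 */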
1963 static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
1964 unsigned long gaddr)
1965 {
1966 set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
1967 gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
1968 }
1969
1970 /**
1971 * gmap_pmdp_xchg - exchange a gmap pmd with another
1972 * @gmap: pointer to the guest address space structure
1973 * @pmdp: pointer to the pmd entry
1974 * @new: replacement entry
1975 * @gaddr: the affected guest address
1976 *
1977 * This function is assumed to be called with the guest_table_lock
1978 * held.
1979 */
1980 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
1981 unsigned long gaddr)
1982 {
1983 gaddr &= HPAGE_MASK;
1984 pmdp_notify_gmap(gmap, pmdp, gaddr);
1985 new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
1986 if (machine_has_tlb_guest())
1987 __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
1988 IDTE_GLOBAL);
1989 else
1990 __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
1991 set_pmd(pmdp, new);
1992 }
1993
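/*
 * Remove the host-to-guest translation of @vmaddr from every gmap attached
 * to @mm, notify, and clear the corresponding pmd. If @purge is set, the old
 * entry is additionally invalidated with a compare-and-swap-and-purge before
 * being replaced.
 */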
1994 static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
1995 int purge)
1996 {
1997 pmd_t *pmdp;
1998 struct gmap *gmap;
1999 unsigned long gaddr;
2000
2001 rcu_read_lock();
2002 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2003 spin_lock(&gmap->guest_table_lock);
2004 pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2005 if (pmdp) {
2006 pmdp_notify_gmap(gmap, pmdp, gaddr);
2007 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2008 _SEGMENT_ENTRY_GMAP_UC |
2009 _SEGMENT_ENTRY));
2010 if (purge)
2011 				__pmdp_csp(pmdp);
2012 set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
2013 }
2014 spin_unlock(&gmap->guest_table_lock);
2015 }
2016 rcu_read_unlock();
2017 }
2018
2019 /**
2020 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
2021 * flushing
2022 * @mm: pointer to the process mm_struct
2023 * @vmaddr: virtual address in the process address space
2024 */
2025 void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
2026 {
2027 gmap_pmdp_clear(mm, vmaddr, 0);
2028 }
2029 EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
2030
2031 /**
2032 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
2033 * @mm: pointer to the process mm_struct
2034 * @vmaddr: virtual address in the process address space
2035 */
2036 void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
2037 {
2038 unsigned long gaddr;
2039 struct gmap *gmap;
2040 pmd_t *pmdp;
2041
2042 rcu_read_lock();
2043 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2044 spin_lock(&gmap->guest_table_lock);
2045 pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2046 if (pmdp) {
2047 pmdp_notify_gmap(gmap, pmdp, gaddr);
2048 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2049 _SEGMENT_ENTRY_GMAP_UC |
2050 _SEGMENT_ENTRY));
2051 if (machine_has_tlb_guest())
2052 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2053 gmap->asce, IDTE_LOCAL);
2054 else
2055 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
2056 *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2057 }
2058 spin_unlock(&gmap->guest_table_lock);
2059 }
2060 rcu_read_unlock();
2061 }
2062 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
2063
2064 /**
2065 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
2066 * @mm: pointer to the process mm_struct
2067 * @vmaddr: virtual address in the process address space
2068 */
2069 void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
2070 {
2071 unsigned long gaddr;
2072 struct gmap *gmap;
2073 pmd_t *pmdp;
2074
2075 rcu_read_lock();
2076 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2077 spin_lock(&gmap->guest_table_lock);
2078 pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2079 if (pmdp) {
2080 pmdp_notify_gmap(gmap, pmdp, gaddr);
2081 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2082 _SEGMENT_ENTRY_GMAP_UC |
2083 _SEGMENT_ENTRY));
2084 if (machine_has_tlb_guest())
2085 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2086 gmap->asce, IDTE_GLOBAL);
2087 else
2088 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
2089 *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2090 }
2091 spin_unlock(&gmap->guest_table_lock);
2092 }
2093 rcu_read_unlock();
2094 }
2095 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
2096
2097 /**
2098 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
2099 * @gmap: pointer to guest address space
2100 * @pmdp: pointer to the pmd to be tested
2101 * @gaddr: virtual address in the guest address space
2102 *
2103 * This function is assumed to be called with the guest_table_lock
2104 * held.
2105 */
2106 static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
2107 unsigned long gaddr)
2108 {
2109 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
2110 return false;
2111
2112 	/* Already protected memory that did not change is clean */
2113 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
2114 !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
2115 return false;
2116
2117 /* Clear UC indication and reset protection */
2118 set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
2119 gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
2120 return true;
2121 }
2122
2123 /**
2124 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
2125 * @gmap: pointer to guest address space
2126 * @bitmap: dirty bitmap for this pmd
2127 * @gaddr: virtual address in the guest address space
2128 * @vmaddr: virtual address in the host address space
2129 *
2130 * This function is assumed to be called with the guest_table_lock
2131 * held.
2132 */
2133 void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
2134 unsigned long gaddr, unsigned long vmaddr)
2135 {
2136 int i;
2137 pmd_t *pmdp;
2138 pte_t *ptep;
2139 spinlock_t *ptl;
2140
2141 pmdp = gmap_pmd_op_walk(gmap, gaddr);
2142 if (!pmdp)
2143 return;
2144
2145 if (pmd_leaf(*pmdp)) {
2146 if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
2147 bitmap_fill(bitmap, _PAGE_ENTRIES);
2148 } else {
2149 for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
2150 ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
2151 if (!ptep)
2152 continue;
2153 if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
2154 set_bit(i, bitmap);
2155 pte_unmap_unlock(ptep, ptl);
2156 }
2157 }
2158 gmap_pmd_op_end(gmap, pmdp);
2159 }
2160 EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
2161
2162 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2163 static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
2164 unsigned long end, struct mm_walk *walk)
2165 {
2166 struct vm_area_struct *vma = walk->vma;
2167
2168 split_huge_pmd(vma, pmd, addr);
2169 return 0;
2170 }
2171
2172 static const struct mm_walk_ops thp_split_walk_ops = {
2173 .pmd_entry = thp_split_walk_pmd_entry,
2174 .walk_lock = PGWALK_WRLOCK_VERIFY,
2175 };
2176
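/*
 * Split all existing THP mappings of the address space and set VM_NOHUGEPAGE
 * on every VMA as well as in mm->def_flags, so that no transparent huge
 * pages are created for future mappings either.
 */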
2177 static inline void thp_split_mm(struct mm_struct *mm)
2178 {
2179 struct vm_area_struct *vma;
2180 VMA_ITERATOR(vmi, mm, 0);
2181
2182 for_each_vma(vmi, vma) {
2183 vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
2184 walk_page_vma(vma, &thp_split_walk_ops, NULL);
2185 }
2186 mm->def_flags |= VM_NOHUGEPAGE;
2187 }
2188 #else
2189 static inline void thp_split_mm(struct mm_struct *mm)
2190 {
2191 }
2192 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2193
2194 /*
2195  * Switch on pgstes for the current userspace process (for KVM).
2196  */
2197 int s390_enable_sie(void)
2198 {
2199 struct mm_struct *mm = current->mm;
2200
2201 	/* Do we have pgstes? If yes, we are done */
2202 if (mm_has_pgste(mm))
2203 return 0;
2204 mmap_write_lock(mm);
2205 mm->context.has_pgste = 1;
2206 /* split thp mappings and disable thp for future mappings */
2207 thp_split_mm(mm);
2208 mmap_write_unlock(mm);
2209 return 0;
2210 }
2211 EXPORT_SYMBOL_GPL(s390_enable_sie);
2212
2213 /*
2214 * Enable storage key handling from now on and initialize the storage
2215 * keys with the default key.
2216 */
2217 static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
2218 unsigned long next, struct mm_walk *walk)
2219 {
2220 /* Clear storage key */
2221 ptep_zap_key(walk->mm, addr, pte);
2222 return 0;
2223 }
2224
2225 /*
2226  * Give a chance to schedule after processing the storage keys of 256 pages.
2227  * We only hold the mm lock, which is a rwsem, and the kvm srcu.
2228  * Both can sleep.
2229  */
2230 static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
2231 unsigned long next, struct mm_walk *walk)
2232 {
2233 cond_resched();
2234 return 0;
2235 }
2236
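/*
 * hugetlb entry of the page walk: initialize the storage keys of a writable
 * large page and set PG_arch_1 on the folio, which is used to mark the
 * storage keys of the folio as initialized.
 */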
2237 static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
2238 unsigned long hmask, unsigned long next,
2239 struct mm_walk *walk)
2240 {
2241 pmd_t *pmd = (pmd_t *)pte;
2242 unsigned long start, end;
2243 struct folio *folio = page_folio(pmd_page(*pmd));
2244
2245 /*
2246 * The write check makes sure we do not set a key on shared
2247 * memory. This is needed as the walker does not differentiate
2248 * between actual guest memory and the process executable or
2249 * shared libraries.
2250 */
2251 if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
2252 !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
2253 return 0;
2254
2255 start = pmd_val(*pmd) & HPAGE_MASK;
2256 end = start + HPAGE_SIZE;
2257 __storage_key_init_range(start, end);
2258 set_bit(PG_arch_1, &folio->flags.f);
2259 cond_resched();
2260 return 0;
2261 }
2262
2263 static const struct mm_walk_ops enable_skey_walk_ops = {
2264 .hugetlb_entry = __s390_enable_skey_hugetlb,
2265 .pte_entry = __s390_enable_skey_pte,
2266 .pmd_entry = __s390_enable_skey_pmd,
2267 .walk_lock = PGWALK_WRLOCK,
2268 };
2269
2270 int s390_enable_skey(void)
2271 {
2272 struct mm_struct *mm = current->mm;
2273 int rc = 0;
2274
2275 mmap_write_lock(mm);
2276 if (mm_uses_skeys(mm))
2277 goto out_up;
2278
2279 mm->context.uses_skeys = 1;
2280 rc = gmap_helper_disable_cow_sharing();
2281 if (rc) {
2282 mm->context.uses_skeys = 0;
2283 goto out_up;
2284 }
2285 walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
2286
2287 out_up:
2288 mmap_write_unlock(mm);
2289 return rc;
2290 }
2291 EXPORT_SYMBOL_GPL(s390_enable_skey);
2292
2293 /*
2294 * Reset CMMA state, make all pages stable again.
2295 */
2296 static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2297 unsigned long next, struct mm_walk *walk)
2298 {
2299 ptep_zap_unused(walk->mm, addr, pte, 1);
2300 return 0;
2301 }
2302
2303 static const struct mm_walk_ops reset_cmma_walk_ops = {
2304 .pte_entry = __s390_reset_cmma,
2305 .walk_lock = PGWALK_WRLOCK,
2306 };
2307
2308 void s390_reset_cmma(struct mm_struct *mm)
2309 {
2310 mmap_write_lock(mm);
2311 walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
2312 mmap_write_unlock(mm);
2313 }
2314 EXPORT_SYMBOL_GPL(s390_reset_cmma);
2315
2316 #define GATHER_GET_PAGES 32
2317
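/*
 * State for gathering present pages during a page table walk:
 * @next:  address to continue the walk from
 * @count: number of pfns gathered so far
 * @pfns:  pfns of the gathered pages, each with an extra page reference
 */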
2318 struct reset_walk_state {
2319 unsigned long next;
2320 unsigned long count;
2321 unsigned long pfns[GATHER_GET_PAGES];
2322 };
2323
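/*
 * pte_entry callback: take an extra reference on each present page and
 * record its pfn in the reset_walk_state. The non-zero return value once
 * GATHER_GET_PAGES pfns have been gathered stops the page table walk, so
 * that the gathered pages can be processed in batches.
 */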
2324 static int s390_gather_pages(pte_t *ptep, unsigned long addr,
2325 unsigned long next, struct mm_walk *walk)
2326 {
2327 struct reset_walk_state *p = walk->private;
2328 pte_t pte = READ_ONCE(*ptep);
2329
2330 if (pte_present(pte)) {
2331 /* we have a reference from the mapping, take an extra one */
2332 get_page(phys_to_page(pte_val(pte)));
2333 p->pfns[p->count] = phys_to_pfn(pte_val(pte));
2334 p->next = next;
2335 p->count++;
2336 }
2337 return p->count >= GATHER_GET_PAGES;
2338 }
2339
2340 static const struct mm_walk_ops gather_pages_ops = {
2341 .pte_entry = s390_gather_pages,
2342 .walk_lock = PGWALK_RDLOCK,
2343 };
2344
2345 /*
2346 * Call the Destroy secure page UVC on each page in the given array of PFNs.
2347 * Each page needs to have an extra reference, which will be released here.
2348 */
2349 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
2350 {
2351 struct folio *folio;
2352 unsigned long i;
2353
2354 for (i = 0; i < count; i++) {
2355 folio = pfn_folio(pfns[i]);
2356 /* we always have an extra reference */
2357 uv_destroy_folio(folio);
2358 /* get rid of the extra reference */
2359 folio_put(folio);
2360 cond_resched();
2361 }
2362 }
2363 EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
2364
2365 /**
2366 * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
2367 * in the given range of the given address space.
2368 * @mm: the mm to operate on
2369 * @start: the start of the range
2370 * @end: the end of the range
2371 * @interruptible: if not 0, stop when a fatal signal is received
2372 *
2373 * Walk the given range of the given address space and call the destroy
2374 * secure page UVC on each page. Optionally exit early if a fatal signal is
2375 * pending.
2376 *
2377 * Return: 0 on success, -EINTR if the function stopped before completing
2378 */
2379 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
2380 unsigned long end, bool interruptible)
2381 {
2382 struct reset_walk_state state = { .next = start };
2383 int r = 1;
2384
2385 while (r > 0) {
2386 state.count = 0;
2387 mmap_read_lock(mm);
2388 r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
2389 mmap_read_unlock(mm);
2390 cond_resched();
2391 s390_uv_destroy_pfns(state.count, state.pfns);
2392 if (interruptible && fatal_signal_pending(current))
2393 return -EINTR;
2394 }
2395 return 0;
2396 }
2397 EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
2398
2399 /**
2400 * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
2401 * @gmap: the gmap whose ASCE needs to be replaced
2402 *
2403  * If the ASCE is of SEGMENT type, this function returns -EINVAL; replacing
2404  * such an ASCE would leave the pointers in the host_to_guest radix tree
2405  * pointing to the wrong pages, causing use-after-free and memory corruption.
2406 * If the allocation of the new top level page table fails, the ASCE is not
2407 * replaced.
2408 * In any case, the old ASCE is always removed from the gmap CRST list.
2409 * Therefore the caller has to make sure to save a pointer to it
2410 * beforehand, unless a leak is actually intended.
2411 */
2412 int s390_replace_asce(struct gmap *gmap)
2413 {
2414 unsigned long asce;
2415 struct page *page;
2416 void *table;
2417
2418 /* Replacing segment type ASCEs would cause serious issues */
2419 if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
2420 return -EINVAL;
2421
2422 page = gmap_alloc_crst();
2423 if (!page)
2424 return -ENOMEM;
2425 table = page_to_virt(page);
2426 memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
2427
2428 /* Set new table origin while preserving existing ASCE control bits */
2429 asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
2430 WRITE_ONCE(gmap->asce, asce);
2431 WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
2432 WRITE_ONCE(gmap->table, table);
2433
2434 return 0;
2435 }
2436 EXPORT_SYMBOL_GPL(s390_replace_asce);
2437