// SPDX-License-Identifier: GPL-2.0
/*
 * KVM guest address space mapping code
 *
 * Copyright IBM Corp. 2007, 2020
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 *	      David Hildenbrand <david@redhat.com>
 *	      Janosch Frank <frankja@linux.vnet.ibm.com>
 */

#include <linux/cpufeature.h>
#include <linux/kernel.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/pgtable.h>
#include <asm/page-states.h>
#include <asm/pgalloc.h>
#include <asm/machine.h>
#include <asm/gmap.h>
#include <asm/page.h>
#include <asm/tlb.h>

/*
 * The address is saved in a radix tree directly; NULL would be ambiguous,
 * since 0 is a valid address, and NULL is returned when nothing was found.
 * The lower bits are ignored by all users of the macro, so it can be used
 * to distinguish a valid address 0 from a NULL.
 */
#define VALID_GADDR_FLAG 1
#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)

#define GMAP_SHADOW_FAKE_TABLE 1ULL

static struct page *gmap_alloc_crst(void)
{
	struct page *page;

	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
	if (!page)
		return NULL;
	__arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
	return page;
}

/**
 * gmap_alloc - allocate and initialize a guest address space
 * @limit: maximum address of the gmap address space
 *
 * Returns a guest address space structure or NULL if out of memory.
 */
struct gmap *gmap_alloc(unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < _REGION3_SIZE) {
		limit = _REGION3_SIZE - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < _REGION2_SIZE) {
		limit = _REGION2_SIZE - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < _REGION1_SIZE) {
		limit = _REGION1_SIZE - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->children);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
	spin_lock_init(&gmap->guest_table_lock);
	spin_lock_init(&gmap->shadow_lock);
	refcount_set(&gmap->ref_count, 1);
	page = gmap_alloc_crst();
	if (!page)
		goto out_free;
	table = page_to_virt(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

/**
 * gmap_create - create a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure or NULL if out of memory.
 */
struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	unsigned long gmap_asce;

	gmap = gmap_alloc(limit);
	if (!gmap)
		return NULL;
	gmap->mm = mm;
	spin_lock(&mm->context.lock);
	list_add_rcu(&gmap->list, &mm->context.gmap_list);
	if (list_is_singular(&mm->context.gmap_list))
		gmap_asce = gmap->asce;
	else
		gmap_asce = -1UL;
	WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
	spin_unlock(&mm->context.lock);
	return gmap;
}
EXPORT_SYMBOL_GPL(gmap_create);
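
/*
 * A minimal usage sketch for the two functions above, assuming a KVM-like
 * caller and an arbitrarily chosen address space limit (both assumptions,
 * not taken from this file):
 *
 *	struct gmap *gmap;
 *
 *	gmap = gmap_create(current->mm, (1UL << 53) - 1);
 *	if (!gmap)
 *		return -ENOMEM;
 *
 *	// ... gmap_map_segment(), guest runs, gmap_unmap_segment() ...
 *
 *	// Unlink from the mm and drop the initial reference; gmap_free()
 *	// runs once the last gmap_get() reference has been put.
 *	gmap_remove(gmap);
 */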

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (cpu_has_idte())
		__tlb_flush_idte(gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void __rcu **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
{
	struct gmap_rmap *rmap, *rnext, *head;
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void __rcu **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			head = radix_tree_delete(root, index);
			gmap_for_each_rmap_safe(rmap, rnext, head)
				kfree(rmap);
		}
	} while (nr > 0);
}

static void gmap_free_crst(unsigned long *table, bool free_ptes)
{
	bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
	int i;

	if (is_segment) {
		if (!free_ptes)
			goto out;
		for (i = 0; i < _CRST_ENTRIES; i++)
			if (!(table[i] & _SEGMENT_ENTRY_INVALID))
				page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
	} else {
		for (i = 0; i < _CRST_ENTRIES; i++)
			if (!(table[i] & _REGION_ENTRY_INVALID))
				gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
	}

out:
	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 *
 * No locks required. There are no references to this gmap anymore.
 */
void gmap_free(struct gmap *gmap)
{
	/* Flush tlb of all gmaps (if not already done for shadows) */
	if (!(gmap_is_shadow(gmap) && gmap->removed))
		gmap_flush_tlb(gmap);
	/* Free all segment & region tables. */
	gmap_free_crst(gmap->table, gmap_is_shadow(gmap));

	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);

	/* Free additional data for a shadow gmap */
	if (gmap_is_shadow(gmap)) {
		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
		/* Release reference to the parent */
		gmap_put(gmap->parent);
	}

	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_get - increase reference counter for guest address space
 * @gmap: pointer to the guest address space structure
 *
 * Returns the gmap pointer
 */
struct gmap *gmap_get(struct gmap *gmap)
{
	refcount_inc(&gmap->ref_count);
	return gmap;
}
EXPORT_SYMBOL_GPL(gmap_get);

/**
 * gmap_put - decrease reference counter for guest address space
 * @gmap: pointer to the guest address space structure
 *
 * If the reference counter reaches zero the guest address space is freed.
 */
void gmap_put(struct gmap *gmap)
{
	if (refcount_dec_and_test(&gmap->ref_count))
		gmap_free(gmap);
}
EXPORT_SYMBOL_GPL(gmap_put);

/**
 * gmap_remove - remove a guest address space but do not free it yet
 * @gmap: pointer to the guest address space structure
 */
void gmap_remove(struct gmap *gmap)
{
	struct gmap *sg, *next;
	unsigned long gmap_asce;

	/* Remove all shadow gmaps linked to this gmap */
	if (!list_empty(&gmap->children)) {
		spin_lock(&gmap->shadow_lock);
		list_for_each_entry_safe(sg, next, &gmap->children, list) {
			list_del(&sg->list);
			gmap_put(sg);
		}
		spin_unlock(&gmap->shadow_lock);
	}
	/* Remove gmap from the per-mm list */
	spin_lock(&gmap->mm->context.lock);
	list_del_rcu(&gmap->list);
	if (list_empty(&gmap->mm->context.gmap_list))
		gmap_asce = 0;
	else if (list_is_singular(&gmap->mm->context.gmap_list))
		gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
					     struct gmap, list)->asce;
	else
		gmap_asce = -1UL;
	WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
	spin_unlock(&gmap->mm->context.lock);
	synchronize_rcu();
	/* Put reference */
	gmap_put(gmap);
}
EXPORT_SYMBOL_GPL(gmap_remove);

/*
 * gmap_alloc_table is assumed to be called with mmap_lock held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = gmap_alloc_crst();
	if (!page)
		return -ENOMEM;
	new = page_to_virt(page);
	crst_table_init(new, init);
	spin_lock(&gmap->guest_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		*table = __pa(new) | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page = NULL;
	}
	spin_unlock(&gmap->guest_table_lock);
	if (page)
		__free_pages(page, CRST_ALLOC_ORDER);
	return 0;
}

static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
{
	return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
}

static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
{
	return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
}

static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
				       unsigned long *gaddr)
{
	*gaddr = host_to_guest_delete(gmap, vmaddr);
	if (IS_GADDR_VALID(*gaddr))
		return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
	return NULL;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long gaddr;
	int flush = 0;
	pmd_t *pmdp;

	BUG_ON(gmap_is_shadow(gmap));
	spin_lock(&gmap->guest_table_lock);

	pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
	if (pmdp) {
		flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
		*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
	}

	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	BUG_ON(gmap_is_shadow(gmap));
	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	mmap_write_lock(gmap->mm);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	mmap_write_unlock(gmap->mm);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	BUG_ON(gmap_is_shadow(gmap));
	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	mmap_write_lock(gmap->mm);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	mmap_write_unlock(gmap->mm);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
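
/*
 * A minimal usage sketch for gmap_map_segment()/gmap_unmap_segment(). The
 * names vma_start and guest_base are assumptions for illustration; all
 * three arguments must be PMD_SIZE (1 MB segment) aligned, as checked
 * above:
 *
 *	unsigned long vma_start = ...;		// segment aligned host address
 *	unsigned long guest_base = 0;		// segment aligned guest address
 *	unsigned long size = 64UL << 20;	// 64 MB
 *	int rc;
 *
 *	rc = gmap_map_segment(gmap, vma_start, guest_base, size);
 *	if (rc)		// -EINVAL or -ENOMEM
 *		return rc;
 *	...
 *	rc = gmap_unmap_segment(gmap, guest_base, size);
 */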

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_lock of the mm that belongs to the address space must be held
 * when this function gets called.
 *
 * Note: Can also be called for shadow gmaps.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	/* Note: guest_to_host is empty for a shadow gmap */
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
void gmap_unlink(struct mm_struct *mm, unsigned long *table,
		 unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
	rcu_read_unlock();
}

static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
			   unsigned long gaddr);

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_lock of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	u64 unprot;
	int rc;

	BUG_ON(gmap_is_shadow(gmap));
	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & _REGION1_MASK))
			return -ENOMEM;
		table = __va(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & _REGION2_MASK))
			return -ENOMEM;
		table = __va(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & _REGION3_MASK))
			return -ENOMEM;
		table = __va(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	p4d = p4d_offset(pgd, vmaddr);
	VM_BUG_ON(p4d_none(*p4d));
	pud = pud_offset(p4d, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	/* large puds cannot yet be handled */
	if (pud_leaf(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* Are we allowed to use huge pages? */
	if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
		return -EFAULT;
	/* Link gmap segment table entry location to page table. */
	rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_EMPTY) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT,
				       (void *)MAKE_VALID_GADDR(gaddr));
		if (!rc) {
			if (pmd_leaf(*pmd)) {
				*table = (pmd_val(*pmd) &
					  _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
					 | _SEGMENT_ENTRY_GMAP_UC
					 | _SEGMENT_ENTRY;
			} else
				*table = pmd_val(*pmd) &
					 _SEGMENT_ENTRY_HARDWARE_BITS;
		}
	} else if (*table & _SEGMENT_ENTRY_PROTECT &&
		   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
		unprot = (u64)*table;
		unprot &= ~_SEGMENT_ENTRY_PROTECT;
		unprot |= _SEGMENT_ENTRY_GMAP_UC;
		gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
	}
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}
EXPORT_SYMBOL(__gmap_link);
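
/*
 * A sketch of the translate/fault-in/link pattern that the fault handling
 * code in this file builds on top of __gmap_link() (see gmap_pte_op_fixup()
 * and gmap_read_table() below); the straight-line error handling is a
 * simplification:
 *
 *	unsigned long vmaddr;
 *	bool unlocked = false;
 *	int rc;
 *
 *	mmap_read_lock(gmap->mm);
 *	vmaddr = __gmap_translate(gmap, gaddr);
 *	if (IS_ERR_VALUE(vmaddr))
 *		rc = vmaddr;				// not mapped: -EFAULT
 *	else if (fixup_user_fault(gmap->mm, vmaddr, 0, &unlocked))
 *		rc = -EFAULT;				// could not fault in
 *	else
 *		rc = __gmap_link(gmap, gaddr, vmaddr);	// connect page tables
 *	mmap_read_unlock(gmap->mm);
 */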

/*
 * this function is assumed to be called with mmap_lock held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	struct vm_area_struct *vma;
	unsigned long vmaddr;
	spinlock_t *ptl;
	pte_t *ptep;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (vmaddr) {
		vmaddr |= gaddr & ~PMD_MASK;

		vma = vma_lookup(gmap->mm, vmaddr);
		if (!vma || is_vm_hugetlb_page(vma))
			return;

		/* Get pointer to the page table entry */
		ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
		if (likely(ptep)) {
			ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
			pte_unmap_unlock(ptep, ptl);
		}
	}
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	mmap_read_lock(gmap->mm);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		if (!vma)
			continue;
		/*
		 * We do not discard pages that are backed by
		 * hugetlbfs, so we don't have to refault them.
		 */
		if (is_vm_hugetlb_page(vma))
			continue;
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
		zap_page_range_single(vma, vmaddr, size, NULL);
	}
	mmap_read_unlock(gmap->mm);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_pte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_pte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add_rcu(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);

/**
 * gmap_unregister_pte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_rcu(&nb->list);
	spin_unlock(&gmap_notifier_lock);
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);

/**
 * gmap_call_notifier - call all registered invalidation callbacks
 * @gmap: pointer to guest mapping meta data structure
 * @start: start virtual address in the guest address space
 * @end: end virtual address in the guest address space
 */
static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
			       unsigned long end)
{
	struct gmap_notifier *nb;

	list_for_each_entry(nb, &gmap_notifier_list, list)
		nb->notifier_call(gmap, start, end);
}

/**
 * gmap_table_walk - walk the gmap page tables
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @level: page table level to stop at
 *
 * Returns a table entry pointer for the given guest address and @level
 * @level=0 : returns a pointer to a page table entry (or NULL)
 * @level=1 : returns a pointer to a segment table entry (or NULL)
 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
 *
 * Returns NULL if the gmap page tables could not be walked to the
 * requested level.
 *
 * Note: Can also be called for shadow gmaps.
 */
unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
{
	const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
	unsigned long *table = gmap->table;

	if (gmap_is_shadow(gmap) && gmap->removed)
		return NULL;

	if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
		return NULL;

	if (asce_type != _ASCE_TYPE_REGION1 &&
	    gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
		return NULL;

	switch (asce_type) {
	case _ASCE_TYPE_REGION1:
		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
		if (level == 4)
			break;
		if (*table & _REGION_ENTRY_INVALID)
			return NULL;
		table = __va(*table & _REGION_ENTRY_ORIGIN);
		fallthrough;
	case _ASCE_TYPE_REGION2:
		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
		if (level == 3)
			break;
		if (*table & _REGION_ENTRY_INVALID)
			return NULL;
		table = __va(*table & _REGION_ENTRY_ORIGIN);
		fallthrough;
	case _ASCE_TYPE_REGION3:
		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
		if (level == 2)
			break;
		if (*table & _REGION_ENTRY_INVALID)
			return NULL;
		table = __va(*table & _REGION_ENTRY_ORIGIN);
		fallthrough;
	case _ASCE_TYPE_SEGMENT:
		table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
		if (level == 1)
			break;
		if (*table & _REGION_ENTRY_INVALID)
			return NULL;
		table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
		table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT;
	}
	return table;
}
EXPORT_SYMBOL(gmap_table_walk);
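
/*
 * For illustration, how the @level argument is used by the helpers below:
 * level 1 stops at the segment table entry (usable as a pmd_t), level 0
 * descends into the linked host page table:
 *
 *	pmd_t *pmdp;
 *	pte_t *ptep;
 *
 *	pmdp = (pmd_t *)gmap_table_walk(gmap, gaddr, 1);	// segment entry
 *	if (pmdp && !(pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID))
 *		ptep = (pte_t *)gmap_table_walk(gmap, gaddr, 0);	// pte
 */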

/**
 * gmap_pte_op_walk - walk the gmap page table, get the page table lock
 *		      and return the pte pointer
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @ptl: pointer to the spinlock pointer
 *
 * Returns a pointer to the locked pte for a guest address, or NULL
 */
static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
			       spinlock_t **ptl)
{
	unsigned long *table;

	BUG_ON(gmap_is_shadow(gmap));
	/* Walk the gmap page table, lock and get pte pointer */
	table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
	if (!table || *table & _SEGMENT_ENTRY_INVALID)
		return NULL;
	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
}

/**
 * gmap_pte_op_fixup - force a page in and connect the gmap page table
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @vmaddr: address in the host process address space
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 *
 * Returns 0 if the caller can retry __gmap_translate (might fail again),
 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
 * up or connecting the gmap page table.
 */
static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
			     unsigned long vmaddr, int prot)
{
	struct mm_struct *mm = gmap->mm;
	unsigned int fault_flags;
	bool unlocked = false;

	BUG_ON(gmap_is_shadow(gmap));
	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
	if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
		return -EFAULT;
	if (unlocked)
		/* lost mmap_lock, caller has to retry __gmap_translate */
		return 0;
	/* Connect the page tables */
	return __gmap_link(gmap, gaddr, vmaddr);
}

/**
 * gmap_pte_op_end - release the page table lock
 * @ptep: pointer to the locked pte
 * @ptl: pointer to the page table spinlock
 */
static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
{
	pte_unmap_unlock(ptep, ptl);
}

/**
 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
 *		      and return the pmd pointer
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 *
 * Returns a pointer to the pmd for a guest address, or NULL
 */
static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
{
	pmd_t *pmdp;

	BUG_ON(gmap_is_shadow(gmap));
	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
	if (!pmdp)
		return NULL;

	/* without huge pages, there is no need to take the table lock */
	if (!gmap->mm->context.allow_gmap_hpage_1m)
		return pmd_none(*pmdp) ? NULL : pmdp;

	spin_lock(&gmap->guest_table_lock);
	if (pmd_none(*pmdp)) {
		spin_unlock(&gmap->guest_table_lock);
		return NULL;
	}

	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
	if (!pmd_leaf(*pmdp))
		spin_unlock(&gmap->guest_table_lock);
	return pmdp;
}

/**
 * gmap_pmd_op_end - release the guest_table_lock if needed
 * @gmap: pointer to the guest mapping meta data structure
 * @pmdp: pointer to the pmd
 */
static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
{
	if (pmd_leaf(*pmdp))
		spin_unlock(&gmap->guest_table_lock);
}

/*
 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @pmdp: pointer to the pmd to be protected
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: notification bits to set
 *
 * Returns:
 * 0 if successfully protected
 * -EAGAIN if a fixup is needed
 * -EINVAL if unsupported notifier bits have been specified
 *
 * Expected to be called with sg->mm->mmap_lock in read and
 * guest_table_lock held.
 */
static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
			    pmd_t *pmdp, int prot, unsigned long bits)
{
	int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
	int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
	pmd_t new = *pmdp;

	/* Fixup needed */
	if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
		return -EAGAIN;

	if (prot == PROT_NONE && !pmd_i) {
		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
	}

	if (prot == PROT_READ && !pmd_p) {
		new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
	}

	if (bits & GMAP_NOTIFY_MPROT)
		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));

	/* Shadow GMAP protection needs split PMDs */
	if (bits & GMAP_NOTIFY_SHADOW)
		return -EINVAL;

	return 0;
}

/*
 * gmap_protect_pte - remove access rights to memory and set pgste bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @pmdp: pointer to the pmd associated with the pte
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: notification bits to set
 *
 * Returns 0 if successfully protected, -ENOMEM if out of memory and
 * -EAGAIN if a fixup is needed.
 *
 * Expected to be called with sg->mm->mmap_lock in read
 */
static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
			    pmd_t *pmdp, int prot, unsigned long bits)
{
	int rc;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long pbits = 0;

	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
		return -EAGAIN;

	ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
	if (!ptep)
		return -ENOMEM;

	pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
	pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
	/* Protect and unlock. */
	rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
	gmap_pte_op_end(ptep, ptl);
	return rc;
}

/*
 * gmap_protect_one - remove access rights to memory and set pgste bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: pgste notification bits to set
 *
 * Returns:
 *   PAGE_SIZE if a small page was successfully protected;
 *   HPAGE_SIZE if a large page was successfully protected;
 *   -ENOMEM if out of memory;
 *   -EFAULT if gaddr is invalid (or mapping for shadows is missing);
 *   -EAGAIN if the guest mapping is missing and should be fixed by the caller.
 *
 * Context: Called with sg->mm->mmap_lock in read.
 */
int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
{
	pmd_t *pmdp;
	int rc = 0;

	BUG_ON(gmap_is_shadow(gmap));

	pmdp = gmap_pmd_op_walk(gmap, gaddr);
	if (!pmdp)
		return -EAGAIN;

	if (!pmd_leaf(*pmdp)) {
		rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
		if (!rc)
			rc = PAGE_SIZE;
	} else {
		rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
		if (!rc)
			rc = HPAGE_SIZE;
	}
	gmap_pmd_op_end(gmap, pmdp);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_protect_one);
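
/*
 * A usage sketch for gmap_protect_one(), assuming a caller that already
 * holds mmap_lock in read; the retry loop shape is an assumption, modelled
 * on the fixup pattern used elsewhere in this file:
 *
 *	unsigned long vmaddr;
 *	int rc;
 *
 *	do {
 *		rc = gmap_protect_one(gmap, gaddr, PROT_READ, GMAP_NOTIFY_MPROT);
 *		if (rc != -EAGAIN)
 *			break;
 *		vmaddr = __gmap_translate(gmap, gaddr);
 *		if (IS_ERR_VALUE(vmaddr))
 *			return vmaddr;
 *		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
 *	} while (!rc);
 *	// on success rc is PAGE_SIZE or HPAGE_SIZE, otherwise negative
 */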

/**
 * gmap_read_table - get an unsigned long value from a guest page table using
 *                   absolute addressing, without marking the page referenced.
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @val: pointer to the unsigned long value to return
 *
 * Returns 0 if the value was read, -ENOMEM if out of memory, -EFAULT if
 * reading using the virtual address failed and -EINVAL if called on a gmap
 * shadow.
 *
 * Called with gmap->mm->mmap_lock in read.
 */
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
{
	unsigned long address, vmaddr;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int rc;

	if (gmap_is_shadow(gmap))
		return -EINVAL;

	while (1) {
		rc = -EAGAIN;
		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
		if (ptep) {
			pte = *ptep;
			if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
				address = pte_val(pte) & PAGE_MASK;
				address += gaddr & ~PAGE_MASK;
				*val = *(unsigned long *)__va(address);
				set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
				/* Do *NOT* clear the _PAGE_INVALID bit! */
				rc = 0;
			}
			gmap_pte_op_end(ptep, ptl);
		}
		if (!rc)
			break;
		vmaddr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(vmaddr)) {
			rc = vmaddr;
			break;
		}
		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
		if (rc)
			break;
	}
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_read_table);

/**
 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
 * @sg: pointer to the shadow guest address space structure
 * @vmaddr: vm address associated with the rmap
 * @rmap: pointer to the rmap structure
 *
 * Called with the sg->guest_table_lock
 */
static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
				    struct gmap_rmap *rmap)
{
	struct gmap_rmap *temp;
	void __rcu **slot;

	BUG_ON(!gmap_is_shadow(sg));
	slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
	if (slot) {
		rmap->next = radix_tree_deref_slot_protected(slot,
							&sg->guest_table_lock);
		for (temp = rmap->next; temp; temp = temp->next) {
			if (temp->raddr == rmap->raddr) {
				kfree(rmap);
				return;
			}
		}
		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
	} else {
		rmap->next = NULL;
		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
				  rmap);
	}
}

/**
 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow gmap
 * @paddr: address in the parent guest address space
 * @len: length of the memory area to protect
 *
 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
 * if out of memory and -EFAULT if paddr is invalid.
 */
static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
			     unsigned long paddr, unsigned long len)
{
	struct gmap *parent;
	struct gmap_rmap *rmap;
	unsigned long vmaddr;
	spinlock_t *ptl;
	pte_t *ptep;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	parent = sg->parent;
	while (len) {
		vmaddr = __gmap_translate(parent, paddr);
		if (IS_ERR_VALUE(vmaddr))
			return vmaddr;
		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
		if (!rmap)
			return -ENOMEM;
		rmap->raddr = raddr;
		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
		if (rc) {
			kfree(rmap);
			return rc;
		}
		rc = -EAGAIN;
		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
		if (ptep) {
			spin_lock(&sg->guest_table_lock);
			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
					     PGSTE_VSIE_BIT);
			if (!rc)
				gmap_insert_rmap(sg, vmaddr, rmap);
			spin_unlock(&sg->guest_table_lock);
			gmap_pte_op_end(ptep, ptl);
		}
		radix_tree_preload_end();
		if (rc) {
			kfree(rmap);
			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
			if (rc)
				return rc;
			continue;
		}
		paddr += PAGE_SIZE;
		len -= PAGE_SIZE;
	}
	return 0;
}

#define _SHADOW_RMAP_MASK	0x7
#define _SHADOW_RMAP_REGION1	0x5
#define _SHADOW_RMAP_REGION2	0x4
#define _SHADOW_RMAP_REGION3	0x3
#define _SHADOW_RMAP_SEGMENT	0x2
#define _SHADOW_RMAP_PGTABLE	0x1
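
/*
 * For illustration: the _SHADOW_RMAP_* values above are stored in the low
 * bits of gmap_rmap->raddr, which is always at least page aligned, so the
 * shadow address and the table level share one word. gmap_shadow_notify()
 * splits them again:
 *
 *	rmap->raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
 *	...
 *	bits  = rmap->raddr & _SHADOW_RMAP_MASK;	// _SHADOW_RMAP_SEGMENT
 *	raddr = rmap->raddr ^ bits;			// saddr & _SEGMENT_MASK
 */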

/**
 * gmap_idte_one - invalidate a single region or segment table entry
 * @asce: region or segment table *origin* + table-type bits
 * @vaddr: virtual address to identify the table entry to flush
 *
 * The invalid bit of a single region or segment table entry is set
 * and the TLB entries associated with that entry are flushed.
 * The table-type of the @asce identifies the portion of the @vaddr
 * that is used as the invalidation index.
 */
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
	asm volatile(
		"	idte	%0,0,%1"
		: : "a" (asce), "a" (vaddr) : "cc", "memory");
}

/**
 * gmap_unshadow_page - remove a page from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
{
	unsigned long *table;

	BUG_ON(!gmap_is_shadow(sg));
	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
	if (!table || *table & _PAGE_INVALID)
		return;
	gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1);
	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
}

/**
 * __gmap_unshadow_pgt - remove all entries from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @pgt: pointer to the start of a shadow page table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
				unsigned long *pgt)
{
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE)
		pgt[i] = _PAGE_INVALID;
}

/**
 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
{
	unsigned long *ste;
	phys_addr_t sto, pgt;
	struct ptdesc *ptdesc;

	BUG_ON(!gmap_is_shadow(sg));
	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
	sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
	pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
	*ste = _SEGMENT_ENTRY_EMPTY;
	__gmap_unshadow_pgt(sg, raddr, __va(pgt));
	/* Free page table */
	ptdesc = page_ptdesc(phys_to_page(pgt));
	page_table_free_pgste(ptdesc);
}

/**
 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @sgt: pointer to the start of a shadow segment table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
				unsigned long *sgt)
{
	struct ptdesc *ptdesc;
	phys_addr_t pgt;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
			continue;
		pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
		sgt[i] = _SEGMENT_ENTRY_EMPTY;
		__gmap_unshadow_pgt(sg, raddr, __va(pgt));
		/* Free page table */
		ptdesc = page_ptdesc(phys_to_page(pgt));
		page_table_free_pgste(ptdesc);
	}
}

/**
 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the shadow->guest_table_lock
 */
static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
{
	unsigned long r3o, *r3e;
	phys_addr_t sgt;
	struct page *page;

	BUG_ON(!gmap_is_shadow(sg));
	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
	r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
	gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
	sgt = *r3e & _REGION_ENTRY_ORIGIN;
	*r3e = _REGION3_ENTRY_EMPTY;
	__gmap_unshadow_sgt(sg, raddr, __va(sgt));
	/* Free segment table */
	page = phys_to_page(sgt);
	__free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 * @r3t: pointer to the start of a shadow region-3 table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
				unsigned long *r3t)
{
	struct page *page;
	phys_addr_t sgt;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
			continue;
		sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
		r3t[i] = _REGION3_ENTRY_EMPTY;
		__gmap_unshadow_sgt(sg, raddr, __va(sgt));
		/* Free segment table */
		page = phys_to_page(sgt);
		__free_pages(page, CRST_ALLOC_ORDER);
	}
}

/**
 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
{
	unsigned long r2o, *r2e;
	phys_addr_t r3t;
	struct page *page;

	BUG_ON(!gmap_is_shadow(sg));
	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
	r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
	gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
	r3t = *r2e & _REGION_ENTRY_ORIGIN;
	*r2e = _REGION2_ENTRY_EMPTY;
	__gmap_unshadow_r3t(sg, raddr, __va(r3t));
	/* Free region 3 table */
	page = phys_to_page(r3t);
	__free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @r2t: pointer to the start of a shadow region-2 table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
				unsigned long *r2t)
{
	phys_addr_t r3t;
	struct page *page;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
			continue;
		r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
		r2t[i] = _REGION2_ENTRY_EMPTY;
		__gmap_unshadow_r3t(sg, raddr, __va(r3t));
		/* Free region 3 table */
		page = phys_to_page(r3t);
		__free_pages(page, CRST_ALLOC_ORDER);
	}
}

/**
 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
{
	unsigned long r1o, *r1e;
	struct page *page;
	phys_addr_t r2t;

	BUG_ON(!gmap_is_shadow(sg));
	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
	r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
	gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
	r2t = *r1e & _REGION_ENTRY_ORIGIN;
	*r1e = _REGION1_ENTRY_EMPTY;
	__gmap_unshadow_r2t(sg, raddr, __va(r2t));
	/* Free region 2 table */
	page = phys_to_page(r2t);
	__free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @r1t: pointer to the start of a shadow region-1 table
 *
 * Called with the shadow->guest_table_lock
 */
static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
				unsigned long *r1t)
{
	unsigned long asce;
	struct page *page;
	phys_addr_t r2t;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	asce = __pa(r1t) | _ASCE_TYPE_REGION1;
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
			continue;
		r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
		__gmap_unshadow_r2t(sg, raddr, __va(r2t));
		/* Clear entry and flush translation r1t -> r2t */
		gmap_idte_one(asce, raddr);
		r1t[i] = _REGION1_ENTRY_EMPTY;
		/* Free region 2 table */
		page = phys_to_page(r2t);
		__free_pages(page, CRST_ALLOC_ORDER);
	}
}

/**
 * gmap_unshadow - remove a shadow page table completely
 * @sg: pointer to the shadow guest address space structure
 *
 * Called with sg->guest_table_lock
 */
void gmap_unshadow(struct gmap *sg)
{
	unsigned long *table;

	BUG_ON(!gmap_is_shadow(sg));
	if (sg->removed)
		return;
	sg->removed = 1;
	gmap_call_notifier(sg, 0, -1UL);
	gmap_flush_tlb(sg);
	table = __va(sg->asce & _ASCE_ORIGIN);
	switch (sg->asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_REGION1:
		__gmap_unshadow_r1t(sg, 0, table);
		break;
	case _ASCE_TYPE_REGION2:
		__gmap_unshadow_r2t(sg, 0, table);
		break;
	case _ASCE_TYPE_REGION3:
		__gmap_unshadow_r3t(sg, 0, table);
		break;
	case _ASCE_TYPE_SEGMENT:
		__gmap_unshadow_sgt(sg, 0, table);
		break;
	}
}
EXPORT_SYMBOL(gmap_unshadow);

/**
 * gmap_shadow_r2t - create an empty shadow region 2 table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @r2t: parent gmap address of the region 2 table to get shadowed
 * @fake: r2t references contiguous guest memory block, not a r2t
 *
 * The r2t parameter specifies the address of the source table. The
 * four pages of the source table are made read-only in the parent gmap
 * address space. A write to the source table area @r2t will automatically
 * remove the shadow r2 table and all of its descendants.
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *table;
	phys_addr_t s_r2t;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	/* Allocate a shadow region second table */
	page = gmap_alloc_crst();
	if (!page)
		return -ENOMEM;
	s_r2t = page_to_phys(page);
	/* Install shadow region second table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = s_r2t | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
	if (sg->edat_level >= 1)
		*table |= (r2t & _REGION_ENTRY_PROTECT);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make r2t read-only in parent gmap page table */
	raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
	origin = r2t & _REGION_ENTRY_ORIGIN;
	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		table = gmap_table_walk(sg, saddr, 4);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		gmap_unshadow_r2t(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
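
/*
 * For illustration: the gmap_shadow_*() functions (gmap_shadow_r2t() above,
 * gmap_shadow_r3t(), gmap_shadow_sgt(), gmap_shadow_pgt() and
 * gmap_shadow_page() below) each shadow one level of the guest's DAT
 * tables and are, roughly, called top-down while resolving a VSIE fault:
 *
 *	gmap_shadow_r2t(sg, saddr, r2t, fake);	// region-1 entry -> r2t
 *	gmap_shadow_r3t(sg, saddr, r3t, fake);	// region-2 entry -> r3t
 *	gmap_shadow_sgt(sg, saddr, sgt, fake);	// region-3 entry -> sgt
 *	gmap_shadow_pgt(sg, saddr, pgt, fake);	// segment entry  -> pgt
 *	gmap_shadow_page(sg, saddr, pte);	// page table entry
 *
 * Each step may return -EAGAIN when racing with unshadowing, in which case
 * the caller is expected to redo the walk.
 */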

/**
 * gmap_shadow_r3t - create a shadow region 3 table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @r3t: parent gmap address of the region 3 table to get shadowed
 * @fake: r3t references contiguous guest memory block, not a r3t
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *table;
	phys_addr_t s_r3t;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	/* Allocate a shadow region third table */
	page = gmap_alloc_crst();
	if (!page)
		return -ENOMEM;
	s_r3t = page_to_phys(page);
	/* Install shadow region third table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = s_r3t | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
	if (sg->edat_level >= 1)
		*table |= (r3t & _REGION_ENTRY_PROTECT);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make r3t read-only in parent gmap page table */
	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
	origin = r3t & _REGION_ENTRY_ORIGIN;
	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		table = gmap_table_walk(sg, saddr, 3);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		gmap_unshadow_r3t(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_r3t);

/**
 * gmap_shadow_sgt - create a shadow segment table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @sgt: parent gmap address of the segment table to get shadowed
 * @fake: sgt references contiguous guest memory block, not a sgt
 *
 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *table;
	phys_addr_t s_sgt;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
	/* Allocate a shadow segment table */
	page = gmap_alloc_crst();
	if (!page)
		return -ENOMEM;
	s_sgt = page_to_phys(page);
	/* Install shadow segment table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = s_sgt | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
	if (sg->edat_level >= 1)
		*table |= sgt & _REGION_ENTRY_PROTECT;
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make sgt read-only in parent gmap page table */
	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
	origin = sgt & _REGION_ENTRY_ORIGIN;
	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		table = gmap_table_walk(sg, saddr, 2);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		gmap_unshadow_sgt(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);

static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
{
	unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));

	pgstes += _PAGE_ENTRIES;

	pgstes[0] &= ~PGSTE_ST2_MASK;
	pgstes[1] &= ~PGSTE_ST2_MASK;
	pgstes[2] &= ~PGSTE_ST2_MASK;
	pgstes[3] &= ~PGSTE_ST2_MASK;

	pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
	pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
	pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
	pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
}
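
/*
 * A worked example for the helper above, assuming PGSTE_ST2_MASK selects a
 * 16-bit software field in each pgste: the guest page table origin is
 * spread over the ST2 fields of the first four pgstes, most significant
 * 16 bits first. For pgt_addr = 0x0123456789abc000:
 *
 *	pgstes[0] ST2 = 0x0123		// pgt_addr >> 16, top 16 bits
 *	pgstes[1] ST2 = 0x4567
 *	pgstes[2] ST2 = 0x89ab
 *	pgstes[3] ST2 = 0xc000		// pgt_addr << 32, low 16 bits
 */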
/**
 * gmap_shadow_pgt - instantiate a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @pgt: parent gmap address of the page table to get shadowed
 * @fake: pgt references contiguous guest memory block, not a pgtable
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
		    int fake)
{
	unsigned long raddr, origin;
	unsigned long *table;
	struct ptdesc *ptdesc;
	phys_addr_t s_pgt;
	int rc;

	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
	/* Allocate a shadow page table */
	ptdesc = page_table_alloc_pgste(sg->mm);
	if (!ptdesc)
		return -ENOMEM;
	origin = pgt & _SEGMENT_ENTRY_ORIGIN;
	if (fake)
		origin |= GMAP_SHADOW_FAKE_TABLE;
	gmap_pgste_set_pgt_addr(ptdesc, origin);
	s_pgt = page_to_phys(ptdesc_page(ptdesc));
	/* Install shadow page table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	/* mark as invalid as long as the parent table is not protected */
	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_SEGMENT_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make pgt read-only in parent gmap page table (not the pgste) */
	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		table = gmap_table_walk(sg, saddr, 1);
		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
			rc = -EAGAIN;	/* Race with unshadow */
		else
			*table &= ~_SEGMENT_ENTRY_INVALID;
	} else {
		gmap_unshadow_pgt(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	spin_unlock(&sg->guest_table_lock);
	page_table_free_pgste(ptdesc);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
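
/*
 * Illustrative sketch, not part of this file: gmap_shadow_pgt() above
 * records the parent page table origin with GMAP_SHADOW_FAKE_TABLE or'ed
 * in when @fake was set. A hypothetical consumer of such a stored value
 * could split it back into origin and fake flag like this.
 */
static void __maybe_unused example_decode_stored_pgt(unsigned long stored,
						     unsigned long *origin,
						     int *fake)
{
	*fake = !!(stored & GMAP_SHADOW_FAKE_TABLE);
	*origin = stored & _SEGMENT_ENTRY_ORIGIN;
}
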
/**
 * gmap_shadow_page - create a shadow page mapping
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @pte: pte in parent gmap address space to get shadowed
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
{
	struct gmap *parent;
	struct gmap_rmap *rmap;
	unsigned long vmaddr, paddr;
	spinlock_t *ptl;
	pte_t *sptep, *tptep;
	int prot;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	parent = sg->parent;
	prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;

	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
	if (!rmap)
		return -ENOMEM;
	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;

	while (1) {
		paddr = pte_val(pte) & PAGE_MASK;
		vmaddr = __gmap_translate(parent, paddr);
		if (IS_ERR_VALUE(vmaddr)) {
			rc = vmaddr;
			break;
		}
		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
		if (rc)
			break;
		rc = -EAGAIN;
		sptep = gmap_pte_op_walk(parent, paddr, &ptl);
		if (sptep) {
			spin_lock(&sg->guest_table_lock);
			/* Get page table pointer */
			tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
			if (!tptep) {
				spin_unlock(&sg->guest_table_lock);
				gmap_pte_op_end(sptep, ptl);
				radix_tree_preload_end();
				break;
			}
			rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
			if (rc > 0) {
				/* Success and a new mapping */
				gmap_insert_rmap(sg, vmaddr, rmap);
				rmap = NULL;
				rc = 0;
			}
			gmap_pte_op_end(sptep, ptl);
			spin_unlock(&sg->guest_table_lock);
		}
		radix_tree_preload_end();
		if (!rc)
			break;
		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
		if (rc)
			break;
	}
	kfree(rmap);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_page);

/*
 * gmap_shadow_notify - handle notifications for shadow gmap
 *
 * Called with sg->parent->shadow_lock.
 */
static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
			       unsigned long gaddr)
{
	struct gmap_rmap *rmap, *rnext, *head;
	unsigned long start, end, bits, raddr;

	BUG_ON(!gmap_is_shadow(sg));

	spin_lock(&sg->guest_table_lock);
	if (sg->removed) {
		spin_unlock(&sg->guest_table_lock);
		return;
	}
	/* Check for top level table */
	start = sg->orig_asce & _ASCE_ORIGIN;
	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
	    gaddr < end) {
		/* The complete shadow table has to go */
		gmap_unshadow(sg);
		spin_unlock(&sg->guest_table_lock);
		list_del(&sg->list);
		gmap_put(sg);
		return;
	}
	/* Remove the page table tree for one specific entry */
	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
	gmap_for_each_rmap_safe(rmap, rnext, head) {
		bits = rmap->raddr & _SHADOW_RMAP_MASK;
		raddr = rmap->raddr ^ bits;
		switch (bits) {
		case _SHADOW_RMAP_REGION1:
			gmap_unshadow_r2t(sg, raddr);
			break;
		case _SHADOW_RMAP_REGION2:
			gmap_unshadow_r3t(sg, raddr);
			break;
		case _SHADOW_RMAP_REGION3:
			gmap_unshadow_sgt(sg, raddr);
			break;
		case _SHADOW_RMAP_SEGMENT:
			gmap_unshadow_pgt(sg, raddr);
			break;
		case _SHADOW_RMAP_PGTABLE:
			gmap_unshadow_page(sg, raddr);
			break;
		}
		kfree(rmap);
	}
	spin_unlock(&sg->guest_table_lock);
}

/**
 * ptep_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 * @bits: bits from the pgste that caused the notify call
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
		 pte_t *pte, unsigned long bits)
{
	unsigned long offset, gaddr = 0;
	struct gmap *gmap, *sg, *next;

	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (PAGE_SIZE / sizeof(pte_t));
	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		spin_lock(&gmap->guest_table_lock);
		gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
		spin_unlock(&gmap->guest_table_lock);
		if (!IS_GADDR_VALID(gaddr))
			continue;

		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
			spin_lock(&gmap->shadow_lock);
			list_for_each_entry_safe(sg, next,
						 &gmap->children, list)
				gmap_shadow_notify(sg, vmaddr, gaddr);
			spin_unlock(&gmap->shadow_lock);
		}
		if (bits & PGSTE_IN_BIT)
			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ptep_notify);
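
/*
 * Worked example (illustrative only, not used by the code): the offset
 * computation in ptep_notify() above. An s390 page table has _PAGE_ENTRIES
 * (256) eight-byte entries, so the low bits of the pte pointer give
 * index * sizeof(pte_t); multiplying by PAGE_SIZE / sizeof(pte_t) (512)
 * turns that into index * PAGE_SIZE, the offset of the notified page
 * within its segment. For index 5: 5 * 8 = 40, 40 * 512 = 5 * PAGE_SIZE.
 */
static unsigned long __maybe_unused example_pte_to_segment_offset(pte_t *pte)
{
	unsigned long offset;

	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	return offset * (PAGE_SIZE / sizeof(pte_t));
}
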
static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
			     unsigned long gaddr)
{
	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
	gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
}

/**
 * gmap_pmdp_xchg - exchange a gmap pmd with another
 * @gmap: pointer to the guest address space structure
 * @pmdp: pointer to the pmd entry
 * @new: replacement entry
 * @gaddr: the affected guest address
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
 */
static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
			   unsigned long gaddr)
{
	gaddr &= HPAGE_MASK;
	pmdp_notify_gmap(gmap, pmdp, gaddr);
	new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
	if (machine_has_tlb_guest())
		__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
			    IDTE_GLOBAL);
	else if (cpu_has_idte())
		__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
	else
		__pmdp_csp(pmdp);
	set_pmd(pmdp, new);
}

static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
			    int purge)
{
	pmd_t *pmdp;
	struct gmap *gmap;
	unsigned long gaddr;

	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		spin_lock(&gmap->guest_table_lock);
		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
		if (pmdp) {
			pmdp_notify_gmap(gmap, pmdp, gaddr);
			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
						   _SEGMENT_ENTRY_GMAP_UC |
						   _SEGMENT_ENTRY));
			if (purge)
				__pmdp_csp(pmdp);
			set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
		}
		spin_unlock(&gmap->guest_table_lock);
	}
	rcu_read_unlock();
}

/**
 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
 *                        flushing
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
{
	gmap_pmdp_clear(mm, vmaddr, 0);
}
EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);

/**
 * gmap_pmdp_csp - csp all affected guest pmd entries
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
{
	gmap_pmdp_clear(mm, vmaddr, 1);
}
EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
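
/*
 * Minimal sketch (illustrative only) of the TLB flush selection used by
 * gmap_pmdp_xchg() above and the gmap_pmdp_idte_*() helpers below: prefer
 * a guest-ASCE IDTE, fall back to a plain IDTE, and finally to CSP on
 * machines without IDTE. The real callers additionally choose between
 * local and global scope.
 */
static void __maybe_unused example_flush_gmap_pmd(struct gmap *gmap, pmd_t *pmdp,
						  unsigned long gaddr)
{
	if (machine_has_tlb_guest())
		__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL);
	else if (cpu_has_idte())
		__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
	else
		__pmdp_csp(pmdp);
}
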
/**
 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long gaddr;
	struct gmap *gmap;
	pmd_t *pmdp;

	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		spin_lock(&gmap->guest_table_lock);
		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
		if (pmdp) {
			pmdp_notify_gmap(gmap, pmdp, gaddr);
			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
						   _SEGMENT_ENTRY_GMAP_UC |
						   _SEGMENT_ENTRY));
			if (machine_has_tlb_guest())
				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
					    gmap->asce, IDTE_LOCAL);
			else if (cpu_has_idte())
				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
		}
		spin_unlock(&gmap->guest_table_lock);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);

/**
 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long gaddr;
	struct gmap *gmap;
	pmd_t *pmdp;

	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		spin_lock(&gmap->guest_table_lock);
		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
		if (pmdp) {
			pmdp_notify_gmap(gmap, pmdp, gaddr);
			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
						   _SEGMENT_ENTRY_GMAP_UC |
						   _SEGMENT_ENTRY));
			if (machine_has_tlb_guest())
				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
					    gmap->asce, IDTE_GLOBAL);
			else if (cpu_has_idte())
				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
			else
				__pmdp_csp(pmdp);
			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
		}
		spin_unlock(&gmap->guest_table_lock);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);

/**
 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
 * @gmap: pointer to guest address space
 * @pmdp: pointer to the pmd to be tested
 * @gaddr: virtual address in the guest address space
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
 */
static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
					  unsigned long gaddr)
{
	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
		return false;

	/* Already protected memory, which did not change, is clean */
	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
	    !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
		return false;

	/* Clear UC indication and reset protection */
	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
	return true;
}

/**
 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
 * @gmap: pointer to guest address space
 * @bitmap: dirty bitmap for this pmd
 * @gaddr: virtual address in the guest address space
 * @vmaddr: virtual address in the host address space
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
 */
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
			     unsigned long gaddr, unsigned long vmaddr)
{
	int i;
	pmd_t *pmdp;
	pte_t *ptep;
	spinlock_t *ptl;

	pmdp = gmap_pmd_op_walk(gmap, gaddr);
	if (!pmdp)
		return;

	if (pmd_leaf(*pmdp)) {
		if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
			bitmap_fill(bitmap, _PAGE_ENTRIES);
	} else {
		for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
			ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
			if (!ptep)
				continue;
			if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
				set_bit(i, bitmap);
			pte_unmap_unlock(ptep, ptl);
		}
	}
	gmap_pmd_op_end(gmap, pmdp);
}
EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
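
/*
 * Illustrative sketch, not part of this file: consuming the dirty bitmap
 * filled by gmap_sync_dirty_log_pmd() above. The bitmap covers one segment,
 * i.e. _PAGE_ENTRIES (256) 4K pages starting at gaddr; bit i corresponds
 * to the guest page at gaddr + i * PAGE_SIZE.
 */
static void __maybe_unused example_walk_dirty_bitmap(unsigned long bitmap[4],
						     unsigned long gaddr)
{
	unsigned int i;

	for_each_set_bit(i, bitmap, _PAGE_ENTRIES)
		pr_debug("dirty guest page at %lx\n", gaddr + i * PAGE_SIZE);
}
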
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
				    unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	split_huge_pmd(vma, pmd, addr);
	return 0;
}

static const struct mm_walk_ops thp_split_walk_ops = {
	.pmd_entry	= thp_split_walk_pmd_entry,
	.walk_lock	= PGWALK_WRLOCK_VERIFY,
};

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	for_each_vma(vmi, vma) {
		vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
		walk_page_vma(vma, &thp_split_walk_ops, NULL);
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Switch on pgstes for the userspace process (for kvm).
 */
int s390_enable_sie(void)
{
	struct mm_struct *mm = current->mm;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(mm))
		return 0;
	mmap_write_lock(mm);
	mm->context.has_pgste = 1;
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	mmap_write_unlock(mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
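
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * KVM-like user switches the process to pgste mode once and only then
 * creates its guest address space. s390_enable_sie() and gmap_create()
 * are the real APIs from this file; the wrapper and the 4 TiB limit are
 * made up for the example.
 */
static struct gmap * __maybe_unused example_create_vm_gmap(void)
{
	if (s390_enable_sie())
		return NULL;
	return gmap_create(current->mm, (1UL << 42) - 1);
}
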
static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long end, struct mm_walk *walk)
{
	unsigned long *found_addr = walk->private;

	/* Return 1 if the page is a zeropage. */
	if (is_zero_pfn(pte_pfn(*pte))) {
		/*
		 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
		 * right thing and likely don't care: FAULT_FLAG_UNSHARE
		 * currently only works in COW mappings, which is also where
		 * mm_forbids_zeropage() is checked.
		 */
		if (!is_cow_mapping(walk->vma->vm_flags))
			return -EFAULT;

		*found_addr = addr;
		return 1;
	}
	return 0;
}

static const struct mm_walk_ops find_zeropage_ops = {
	.pte_entry	= find_zeropage_pte_entry,
	.walk_lock	= PGWALK_WRLOCK,
};

/*
 * Unshare all shared zeropages, replacing them by anonymous pages. Note that
 * we cannot simply zap all shared zeropages, because this could later
 * trigger unexpected userfaultfd missing events.
 *
 * This must be called after mm->context.allow_cow_sharing was
 * set to 0, to avoid future mappings of shared zeropages.
 *
 * mm contracts with s390 that, even if mm were to remove a page table
 * (so that a racing walk_page_range_vma() call to pte_offset_map_lock()
 * would fail), it will never insert a page table containing empty zero
 * pages once mm_forbids_zeropage(mm), i.e. mm->context.allow_cow_sharing,
 * is set to 0.
 */
static int __s390_unshare_zeropages(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);
	unsigned long addr;
	vm_fault_t fault;
	int rc;

	for_each_vma(vmi, vma) {
		/*
		 * We could only look at COW mappings, but it's more future
		 * proof to catch unexpected zeropages in other mappings and
		 * fail.
		 */
		if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
			continue;
		addr = vma->vm_start;

retry:
		rc = walk_page_range_vma(vma, addr, vma->vm_end,
					 &find_zeropage_ops, &addr);
		if (rc < 0)
			return rc;
		else if (!rc)
			continue;

		/* addr was updated by find_zeropage_pte_entry() */
		fault = handle_mm_fault(vma, addr,
					FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
					NULL);
		if (fault & VM_FAULT_OOM)
			return -ENOMEM;
		/*
		 * See break_ksm(): even after handle_mm_fault() returned 0, we
		 * must start the lookup from the current address, because
		 * handle_mm_fault() may back out if there's any difficulty.
		 *
		 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
		 * maybe they could trigger in the future on concurrent
		 * truncation. In that case, the shared zeropage would be gone
		 * and we can simply retry and make progress.
		 */
		cond_resched();
		goto retry;
	}

	return 0;
}

static int __s390_disable_cow_sharing(struct mm_struct *mm)
{
	int rc;

	if (!mm->context.allow_cow_sharing)
		return 0;

	mm->context.allow_cow_sharing = 0;

	/* Replace all shared zeropages by anonymous pages. */
	rc = __s390_unshare_zeropages(mm);
	/*
	 * Make sure to disable KSM (if enabled for the whole process or
	 * individual VMAs). Note that nothing currently hinders user space
	 * from re-enabling it.
	 */
	if (!rc)
		rc = ksm_disable(mm);
	if (rc)
		mm->context.allow_cow_sharing = 1;
	return rc;
}

/*
 * Disable most COW-sharing of memory pages for the whole process:
 * (1) Disable KSM and unmerge/unshare any KSM pages.
 * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
 *
 * Note that we currently don't bother with COW-shared pages that are shared
 * with parent/child processes due to fork().
 */
int s390_disable_cow_sharing(void)
{
	int rc;

	mmap_write_lock(current->mm);
	rc = __s390_disable_cow_sharing(current->mm);
	mmap_write_unlock(current->mm);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
{
	/* Clear storage key */
	ptep_zap_key(walk->mm, addr, pte);
	return 0;
}

/*
 * Give a chance to schedule after setting a key to 256 pages.
 * We only hold the mm lock, which is an rwsem, and the kvm srcu.
 * Both can sleep.
 */
static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
{
	cond_resched();
	return 0;
}

static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
				      unsigned long hmask, unsigned long next,
				      struct mm_walk *walk)
{
	pmd_t *pmd = (pmd_t *)pte;
	unsigned long start, end;
	struct folio *folio = page_folio(pmd_page(*pmd));

	/*
	 * The write check makes sure we do not set a key on shared
	 * memory. This is needed as the walker does not differentiate
	 * between actual guest memory and the process executable or
	 * shared libraries.
	 */
	if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
	    !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
		return 0;

	start = pmd_val(*pmd) & HPAGE_MASK;
	end = start + HPAGE_SIZE;
	__storage_key_init_range(start, end);
	set_bit(PG_arch_1, &folio->flags);
	cond_resched();
	return 0;
}

static const struct mm_walk_ops enable_skey_walk_ops = {
	.hugetlb_entry	= __s390_enable_skey_hugetlb,
	.pte_entry	= __s390_enable_skey_pte,
	.pmd_entry	= __s390_enable_skey_pmd,
	.walk_lock	= PGWALK_WRLOCK,
};

int s390_enable_skey(void)
{
	struct mm_struct *mm = current->mm;
	int rc = 0;

	mmap_write_lock(mm);
	if (mm_uses_skeys(mm))
		goto out_up;

	mm->context.uses_skeys = 1;
	rc = __s390_disable_cow_sharing(mm);
	if (rc) {
		mm->context.uses_skeys = 0;
		goto out_up;
	}
	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);

out_up:
	mmap_write_unlock(mm);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);
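
/*
 * Minimal sketch (illustrative only) of the pagewalk pattern used by
 * s390_enable_skey() above and s390_reset_cmma() below: a pte_entry
 * callback plus a const mm_walk_ops, driven by walk_page_range() with the
 * mmap lock held. This example merely counts present ptes.
 */
static int __maybe_unused example_count_pte(pte_t *pte, unsigned long addr,
					    unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops example_count_walk_ops __maybe_unused = {
	.pte_entry	= example_count_pte,
	.walk_lock	= PGWALK_RDLOCK,
};

static unsigned long __maybe_unused example_count_present_ptes(struct mm_struct *mm)
{
	unsigned long count = 0;

	mmap_read_lock(mm);
	walk_page_range(mm, 0, TASK_SIZE, &example_count_walk_ops, &count);
	mmap_read_unlock(mm);
	return count;
}
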
/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	ptep_zap_unused(walk->mm, addr, pte, 1);
	return 0;
}

static const struct mm_walk_ops reset_cmma_walk_ops = {
	.pte_entry	= __s390_reset_cmma,
	.walk_lock	= PGWALK_WRLOCK,
};

void s390_reset_cmma(struct mm_struct *mm)
{
	mmap_write_lock(mm);
	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
	mmap_write_unlock(mm);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

#define GATHER_GET_PAGES 32

struct reset_walk_state {
	unsigned long next;
	unsigned long count;
	unsigned long pfns[GATHER_GET_PAGES];
};

static int s390_gather_pages(pte_t *ptep, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct reset_walk_state *p = walk->private;
	pte_t pte = READ_ONCE(*ptep);

	if (pte_present(pte)) {
		/* we have a reference from the mapping, take an extra one */
		get_page(phys_to_page(pte_val(pte)));
		p->pfns[p->count] = phys_to_pfn(pte_val(pte));
		p->next = next;
		p->count++;
	}
	return p->count >= GATHER_GET_PAGES;
}

static const struct mm_walk_ops gather_pages_ops = {
	.pte_entry	= s390_gather_pages,
	.walk_lock	= PGWALK_RDLOCK,
};

/*
 * Call the Destroy secure page UVC on each page in the given array of PFNs.
 * Each page needs to have an extra reference, which will be released here.
 */
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
{
	struct folio *folio;
	unsigned long i;

	for (i = 0; i < count; i++) {
		folio = pfn_folio(pfns[i]);
		/* we always have an extra reference */
		uv_destroy_folio(folio);
		/* get rid of the extra reference */
		folio_put(folio);
		cond_resched();
	}
}
EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);

/**
 * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
 * in the given range of the given address space.
 * @mm: the mm to operate on
 * @start: the start of the range
 * @end: the end of the range
 * @interruptible: if not 0, stop when a fatal signal is received
 *
 * Walk the given range of the given address space and call the destroy
 * secure page UVC on each page. Optionally exit early if a fatal signal is
 * pending.
 *
 * Return: 0 on success, -EINTR if the function stopped before completing
 */
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
			    unsigned long end, bool interruptible)
{
	struct reset_walk_state state = { .next = start };
	int r = 1;

	while (r > 0) {
		state.count = 0;
		mmap_read_lock(mm);
		r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
		mmap_read_unlock(mm);
		cond_resched();
		s390_uv_destroy_pfns(state.count, state.pfns);
		if (interruptible && fatal_signal_pending(current))
			return -EINTR;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);

/**
 * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
 * @gmap: the gmap whose ASCE needs to be replaced
 *
 * If the ASCE is of SEGMENT type, this function returns -EINVAL; replacing
 * such an ASCE would leave the pointers in the host_to_guest radix tree
 * pointing to the wrong pages, causing use-after-free and memory corruption.
 * If the allocation of the new top level page table fails, the ASCE is not
 * replaced.
 * In any case, the old ASCE is always removed from the gmap CRST list.
 * Therefore the caller has to make sure to save a pointer to it
 * beforehand, unless a leak is actually intended.
 */
int s390_replace_asce(struct gmap *gmap)
{
	unsigned long asce;
	struct page *page;
	void *table;

	/* Replacing segment type ASCEs would cause serious issues */
	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
		return -EINVAL;

	page = gmap_alloc_crst();
	if (!page)
		return -ENOMEM;
	table = page_to_virt(page);
	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));

	/* Set new table origin while preserving existing ASCE control bits */
	asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
	WRITE_ONCE(gmap->asce, asce);
	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
	WRITE_ONCE(gmap->table, table);

	return 0;
}
EXPORT_SYMBOL_GPL(s390_replace_asce);
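
/*
 * Illustrative sketch, not part of this file: honouring the caveat in the
 * s390_replace_asce() documentation above by saving the old top-level
 * table pointer before it gets overwritten, so the caller can still free
 * it (or leak it intentionally) afterwards.
 */
static int __maybe_unused example_replace_asce(struct gmap *gmap,
					       unsigned long **old_table)
{
	*old_table = gmap->table;	/* save before the pointer is replaced */
	return s390_replace_asce(gmap);
}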