// SPDX-License-Identifier: GPL-2.0
/*
 * KVM guest address space mapping code
 *
 * Copyright IBM Corp. 2007, 2020
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 *            David Hildenbrand <david@redhat.com>
 *            Janosch Frank <frankja@linux.vnet.ibm.com>
 */

#include <linux/kernel.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/pgtable.h>
#include <asm/page-states.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/page.h>
#include <asm/tlb.h>

#define GMAP_SHADOW_FAKE_TABLE 1ULL

static struct page *gmap_alloc_crst(void)
{
        struct page *page;

        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
        if (!page)
                return NULL;
        __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
        return page;
}

/**
 * gmap_alloc - allocate and initialize a guest address space
 * @limit: maximum address of the gmap address space
 *
 * Returns a guest address space structure.
 */
static struct gmap *gmap_alloc(unsigned long limit)
{
        struct gmap *gmap;
        struct page *page;
        unsigned long *table;
        unsigned long etype, atype;

        if (limit < _REGION3_SIZE) {
                limit = _REGION3_SIZE - 1;
                atype = _ASCE_TYPE_SEGMENT;
                etype = _SEGMENT_ENTRY_EMPTY;
        } else if (limit < _REGION2_SIZE) {
                limit = _REGION2_SIZE - 1;
                atype = _ASCE_TYPE_REGION3;
                etype = _REGION3_ENTRY_EMPTY;
        } else if (limit < _REGION1_SIZE) {
                limit = _REGION1_SIZE - 1;
                atype = _ASCE_TYPE_REGION2;
                etype = _REGION2_ENTRY_EMPTY;
        } else {
                limit = -1UL;
                atype = _ASCE_TYPE_REGION1;
                etype = _REGION1_ENTRY_EMPTY;
        }
        gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
        if (!gmap)
                goto out;
        INIT_LIST_HEAD(&gmap->crst_list);
        INIT_LIST_HEAD(&gmap->children);
        INIT_LIST_HEAD(&gmap->pt_list);
        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
        INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
        spin_lock_init(&gmap->guest_table_lock);
        spin_lock_init(&gmap->shadow_lock);
        refcount_set(&gmap->ref_count, 1);
        page = gmap_alloc_crst();
        if (!page)
                goto out_free;
        page->index = 0;
        list_add(&page->lru, &gmap->crst_list);
        table = page_to_virt(page);
        crst_table_init(table, etype);
        gmap->table = table;
        gmap->asce = atype | _ASCE_TABLE_LENGTH |
                _ASCE_USER_BITS | __pa(table);
        gmap->asce_end = limit;
        return gmap;

out_free:
        kfree(gmap);
out:
        return NULL;
}
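
/*
 * A worked example of the limit handling in gmap_alloc() above, assuming
 * the usual s390 table sizes (_REGION3_SIZE = 2 GB, _REGION2_SIZE = 4 TB,
 * _REGION1_SIZE = 8 PB): a request with limit = 2 GB - 1 is served by a
 * segment table ASCE with asce_end = 2 GB - 1, a request with
 * limit = 4 TB - 1 by a region-third table ASCE with asce_end = 4 TB - 1,
 * and anything of 8 PB or more by a region-first table ASCE with
 * asce_end = -1UL.
 */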

/**
 * gmap_create - create a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
{
        struct gmap *gmap;
        unsigned long gmap_asce;

        gmap = gmap_alloc(limit);
        if (!gmap)
                return NULL;
        gmap->mm = mm;
        spin_lock(&mm->context.lock);
        list_add_rcu(&gmap->list, &mm->context.gmap_list);
        if (list_is_singular(&mm->context.gmap_list))
                gmap_asce = gmap->asce;
        else
                gmap_asce = -1UL;
        WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
        spin_unlock(&mm->context.lock);
        return gmap;
}
EXPORT_SYMBOL_GPL(gmap_create);

static void gmap_flush_tlb(struct gmap *gmap)
{
        if (MACHINE_HAS_IDTE)
                __tlb_flush_idte(gmap->asce);
        else
                __tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
        struct radix_tree_iter iter;
        unsigned long indices[16];
        unsigned long index;
        void __rcu **slot;
        int i, nr;

        /* A radix tree is freed by deleting all of its entries */
        index = 0;
        do {
                nr = 0;
                radix_tree_for_each_slot(slot, root, &iter, index) {
                        indices[nr] = iter.index;
                        if (++nr == 16)
                                break;
                }
                for (i = 0; i < nr; i++) {
                        index = indices[i];
                        radix_tree_delete(root, index);
                }
        } while (nr > 0);
}

static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
{
        struct gmap_rmap *rmap, *rnext, *head;
        struct radix_tree_iter iter;
        unsigned long indices[16];
        unsigned long index;
        void __rcu **slot;
        int i, nr;

        /* A radix tree is freed by deleting all of its entries */
        index = 0;
        do {
                nr = 0;
                radix_tree_for_each_slot(slot, root, &iter, index) {
                        indices[nr] = iter.index;
                        if (++nr == 16)
                                break;
                }
                for (i = 0; i < nr; i++) {
                        index = indices[i];
                        head = radix_tree_delete(root, index);
                        gmap_for_each_rmap_safe(rmap, rnext, head)
                                kfree(rmap);
                }
        } while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 *
 * No locks required. There are no references to this gmap anymore.
 */
static void gmap_free(struct gmap *gmap)
{
        struct page *page, *next;

        /* Flush tlb of all gmaps (if not already done for shadows) */
        if (!(gmap_is_shadow(gmap) && gmap->removed))
                gmap_flush_tlb(gmap);
        /* Free all segment & region tables. */
        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
                __free_pages(page, CRST_ALLOC_ORDER);
        gmap_radix_tree_free(&gmap->guest_to_host);
        gmap_radix_tree_free(&gmap->host_to_guest);

        /* Free additional data for a shadow gmap */
        if (gmap_is_shadow(gmap)) {
                struct ptdesc *ptdesc, *n;

                /* Free all page tables. */
                list_for_each_entry_safe(ptdesc, n, &gmap->pt_list, pt_list)
                        page_table_free_pgste(ptdesc);
                gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
                /* Release reference to the parent */
                gmap_put(gmap->parent);
        }

        kfree(gmap);
}

/**
 * gmap_get - increase reference counter for guest address space
 * @gmap: pointer to the guest address space structure
 *
 * Returns the gmap pointer
 */
struct gmap *gmap_get(struct gmap *gmap)
{
        refcount_inc(&gmap->ref_count);
        return gmap;
}
EXPORT_SYMBOL_GPL(gmap_get);

/**
 * gmap_put - decrease reference counter for guest address space
 * @gmap: pointer to the guest address space structure
 *
 * If the reference counter reaches zero the guest address space is freed.
 */
void gmap_put(struct gmap *gmap)
{
        if (refcount_dec_and_test(&gmap->ref_count))
                gmap_free(gmap);
}
EXPORT_SYMBOL_GPL(gmap_put);

/**
 * gmap_remove - remove a guest address space but do not free it yet
 * @gmap: pointer to the guest address space structure
 */
void gmap_remove(struct gmap *gmap)
{
        struct gmap *sg, *next;
        unsigned long gmap_asce;

        /* Remove all shadow gmaps linked to this gmap */
        if (!list_empty(&gmap->children)) {
                spin_lock(&gmap->shadow_lock);
                list_for_each_entry_safe(sg, next, &gmap->children, list) {
                        list_del(&sg->list);
                        gmap_put(sg);
                }
                spin_unlock(&gmap->shadow_lock);
        }
        /* Remove gmap from the per-mm list */
        spin_lock(&gmap->mm->context.lock);
        list_del_rcu(&gmap->list);
        if (list_empty(&gmap->mm->context.gmap_list))
                gmap_asce = 0;
        else if (list_is_singular(&gmap->mm->context.gmap_list))
                gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
                                             struct gmap, list)->asce;
        else
                gmap_asce = -1UL;
        WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
        spin_unlock(&gmap->mm->context.lock);
        synchronize_rcu();
        /* Put reference */
        gmap_put(gmap);
}
EXPORT_SYMBOL_GPL(gmap_remove);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
        get_lowcore()->gmap = (unsigned long)gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
        get_lowcore()->gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/**
 * gmap_get_enabled - get a pointer to the currently enabled gmap
 *
 * Returns a pointer to the currently enabled gmap. NULL if none is enabled.
 */
struct gmap *gmap_get_enabled(void)
{
        return (struct gmap *)get_lowcore()->gmap;
}
EXPORT_SYMBOL_GPL(gmap_get_enabled);
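
/*
 * Sketch of how the enable/disable interface above is meant to be used by
 * a host such as KVM (the surrounding code is illustrative only): the gmap
 * is enabled on a CPU before guest context is entered and disabled again
 * afterwards, so that low-level fault handling can find the active gmap
 * via gmap_get_enabled().
 *
 *	gmap_enable(gmap);
 *	... enter guest context on this CPU ...
 *	gmap_disable(gmap);
 */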

/*
 * gmap_alloc_table is assumed to be called with mmap_lock held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
                            unsigned long init, unsigned long gaddr)
{
        struct page *page;
        unsigned long *new;

        /* since we don't free the gmap table until gmap_free we can unlock */
        page = gmap_alloc_crst();
        if (!page)
                return -ENOMEM;
        new = page_to_virt(page);
        crst_table_init(new, init);
        spin_lock(&gmap->guest_table_lock);
        if (*table & _REGION_ENTRY_INVALID) {
                list_add(&page->lru, &gmap->crst_list);
                *table = __pa(new) | _REGION_ENTRY_LENGTH |
                        (*table & _REGION_ENTRY_TYPE_MASK);
                page->index = gaddr;
                page = NULL;
        }
        spin_unlock(&gmap->guest_table_lock);
        if (page)
                __free_pages(page, CRST_ALLOC_ORDER);
        return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
        struct page *page;
        unsigned long offset;

        offset = (unsigned long) entry / sizeof(unsigned long);
        offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
        page = pmd_pgtable_page((pmd_t *) entry);
        return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
        unsigned long *entry;
        int flush = 0;

        BUG_ON(gmap_is_shadow(gmap));
        spin_lock(&gmap->guest_table_lock);
        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
        if (entry) {
                flush = (*entry != _SEGMENT_ENTRY_EMPTY);
                *entry = _SEGMENT_ENTRY_EMPTY;
        }
        spin_unlock(&gmap->guest_table_lock);
        return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long vmaddr;

        vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
                                                   gaddr >> PMD_SHIFT);
        return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}
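
/*
 * Both radix trees used above are keyed by the 1 MB segment index, i.e. by
 * address >> PMD_SHIFT: guest_to_host maps a guest segment to the host (vm)
 * address it was linked to, while host_to_guest maps a host segment back to
 * the gmap segment table entry.  As an illustration, guest addresses
 * 0x123456 and 0x1fffff both have the key 0x1 and therefore share one
 * segment mapping.
 */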

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
        unsigned long off;
        int flush;

        BUG_ON(gmap_is_shadow(gmap));
        if ((to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || to + len < to)
                return -EINVAL;

        flush = 0;
        mmap_write_lock(gmap->mm);
        for (off = 0; off < len; off += PMD_SIZE)
                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
        mmap_write_unlock(gmap->mm);
        if (flush)
                gmap_flush_tlb(gmap);
        return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
                     unsigned long to, unsigned long len)
{
        unsigned long off;
        int flush;

        BUG_ON(gmap_is_shadow(gmap));
        if ((from | to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || from + len < from || to + len < to ||
            from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
                return -EINVAL;

        flush = 0;
        mmap_write_lock(gmap->mm);
        for (off = 0; off < len; off += PMD_SIZE) {
                /* Remove old translation */
                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
                /* Store new translation */
                if (radix_tree_insert(&gmap->guest_to_host,
                                      (to + off) >> PMD_SHIFT,
                                      (void *) from + off))
                        break;
        }
        mmap_write_unlock(gmap->mm);
        if (flush)
                gmap_flush_tlb(gmap);
        if (off >= len)
                return 0;
        gmap_unmap_segment(gmap, to, len);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_lock of the mm that belongs to the address space must be held
 * when this function gets called.
 *
 * Note: Can also be called for shadow gmaps.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long vmaddr;

        vmaddr = (unsigned long)
                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
        /* Note: guest_to_host is empty for a shadow gmap */
        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);
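
/*
 * Illustrative example (not taken from a caller in this file) of how a
 * mapped segment and __gmap_translate() fit together; "g" and the addresses
 * are made up:
 *
 *	gmap_map_segment(g, 0x10000000, 0x20000000, PMD_SIZE);
 *	mmap_read_lock(g->mm);
 *	vmaddr = __gmap_translate(g, 0x20000000 + 0x1234);
 *	mmap_read_unlock(g->mm);
 *	// vmaddr == 0x10000000 + 0x1234, or -EFAULT if the segment is gone
 */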

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long rc;

        mmap_read_lock(gmap->mm);
        rc = __gmap_translate(gmap, gaddr);
        mmap_read_unlock(gmap->mm);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
void gmap_unlink(struct mm_struct *mm, unsigned long *table,
                 unsigned long vmaddr)
{
        struct gmap *gmap;
        int flush;

        rcu_read_lock();
        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
                if (flush)
                        gmap_flush_tlb(gmap);
        }
        rcu_read_unlock();
}

static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
                           unsigned long gaddr);

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_lock of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
        struct mm_struct *mm;
        unsigned long *table;
        spinlock_t *ptl;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        u64 unprot;
        int rc;

        BUG_ON(gmap_is_shadow(gmap));
        /* Create higher level tables in the gmap page table */
        table = gmap->table;
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
                table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
                if ((*table & _REGION_ENTRY_INVALID) &&
                    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
                                     gaddr & _REGION1_MASK))
                        return -ENOMEM;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
        }
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
                if ((*table & _REGION_ENTRY_INVALID) &&
                    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
                                     gaddr & _REGION2_MASK))
                        return -ENOMEM;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
        }
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
                if ((*table & _REGION_ENTRY_INVALID) &&
                    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
                                     gaddr & _REGION3_MASK))
                        return -ENOMEM;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
        }
        table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
        /* Walk the parent mm page table */
        mm = gmap->mm;
        pgd = pgd_offset(mm, vmaddr);
        VM_BUG_ON(pgd_none(*pgd));
        p4d = p4d_offset(pgd, vmaddr);
        VM_BUG_ON(p4d_none(*p4d));
        pud = pud_offset(p4d, vmaddr);
        VM_BUG_ON(pud_none(*pud));
        /* large puds cannot yet be handled */
        if (pud_leaf(*pud))
                return -EFAULT;
        pmd = pmd_offset(pud, vmaddr);
        VM_BUG_ON(pmd_none(*pmd));
        /* Are we allowed to use huge pages? */
        if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
                return -EFAULT;
        /* Link gmap segment table entry location to page table. */
        rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
        if (rc)
                return rc;
        ptl = pmd_lock(mm, pmd);
        spin_lock(&gmap->guest_table_lock);
        if (*table == _SEGMENT_ENTRY_EMPTY) {
                rc = radix_tree_insert(&gmap->host_to_guest,
                                       vmaddr >> PMD_SHIFT, table);
                if (!rc) {
                        if (pmd_leaf(*pmd)) {
                                *table = (pmd_val(*pmd) &
                                          _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
                                        | _SEGMENT_ENTRY_GMAP_UC;
                        } else
                                *table = pmd_val(*pmd) &
                                        _SEGMENT_ENTRY_HARDWARE_BITS;
                }
        } else if (*table & _SEGMENT_ENTRY_PROTECT &&
                   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
                unprot = (u64)*table;
                unprot &= ~_SEGMENT_ENTRY_PROTECT;
                unprot |= _SEGMENT_ENTRY_GMAP_UC;
                gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
        }
        spin_unlock(&gmap->guest_table_lock);
        spin_unlock(ptl);
        radix_tree_preload_end();
        return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
               unsigned int fault_flags)
{
        unsigned long vmaddr;
        int rc;
        bool unlocked;

        mmap_read_lock(gmap->mm);

retry:
        unlocked = false;
        vmaddr = __gmap_translate(gmap, gaddr);
        if (IS_ERR_VALUE(vmaddr)) {
                rc = vmaddr;
                goto out_up;
        }
        if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
                             &unlocked)) {
                rc = -EFAULT;
                goto out_up;
        }
        /*
         * In the case that fixup_user_fault unlocked the mmap_lock during
         * fault-in, redo __gmap_translate to not race with a map/unmap_segment.
         */
        if (unlocked)
                goto retry;

        rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
        mmap_read_unlock(gmap->mm);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

/*
 * this function is assumed to be called with mmap_lock held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
        struct vm_area_struct *vma;
        unsigned long vmaddr;
        spinlock_t *ptl;
        pte_t *ptep;

        /* Find the vm address for the guest address */
        vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
                                                   gaddr >> PMD_SHIFT);
        if (vmaddr) {
                vmaddr |= gaddr & ~PMD_MASK;

                vma = vma_lookup(gmap->mm, vmaddr);
                if (!vma || is_vm_hugetlb_page(vma))
                        return;

                /* Get pointer to the page table entry */
                ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
                if (likely(ptep)) {
                        ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
                        pte_unmap_unlock(ptep, ptl);
                }
        }
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
        unsigned long gaddr, vmaddr, size;
        struct vm_area_struct *vma;

        mmap_read_lock(gmap->mm);
        for (gaddr = from; gaddr < to;
             gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
                /* Find the vm address for the guest address */
                vmaddr = (unsigned long)
                        radix_tree_lookup(&gmap->guest_to_host,
                                          gaddr >> PMD_SHIFT);
                if (!vmaddr)
                        continue;
                vmaddr |= gaddr & ~PMD_MASK;
                /* Find vma in the parent mm */
                vma = find_vma(gmap->mm, vmaddr);
                if (!vma)
                        continue;
                /*
                 * We do not discard pages that are backed by
                 * hugetlbfs, so we don't have to refault them.
                 */
                if (is_vm_hugetlb_page(vma))
                        continue;
                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
                zap_page_range_single(vma, vmaddr, size, NULL);
        }
        mmap_read_unlock(gmap->mm);
}
EXPORT_SYMBOL_GPL(gmap_discard);
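
/*
 * Sketch of how the pte notifier interface below is meant to be consumed;
 * the callback name and its body are illustrative only:
 *
 *	static void my_notifier_call(struct gmap *gmap, unsigned long start,
 *				     unsigned long end)
 *	{
 *		// react to the invalidation of [start, end] in this gmap
 *	}
 *
 *	static struct gmap_notifier my_notifier = {
 *		.notifier_call = my_notifier_call,
 *	};
 *
 *	gmap_register_pte_notifier(&my_notifier);
 *	...
 *	gmap_unregister_pte_notifier(&my_notifier);
 */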

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_pte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_pte_notifier(struct gmap_notifier *nb)
{
        spin_lock(&gmap_notifier_lock);
        list_add_rcu(&nb->list, &gmap_notifier_list);
        spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);

/**
 * gmap_unregister_pte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
{
        spin_lock(&gmap_notifier_lock);
        list_del_rcu(&nb->list);
        spin_unlock(&gmap_notifier_lock);
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);

/**
 * gmap_call_notifier - call all registered invalidation callbacks
 * @gmap: pointer to guest mapping meta data structure
 * @start: start virtual address in the guest address space
 * @end: end virtual address in the guest address space
 */
static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
                               unsigned long end)
{
        struct gmap_notifier *nb;

        list_for_each_entry(nb, &gmap_notifier_list, list)
                nb->notifier_call(gmap, start, end);
}

/**
 * gmap_table_walk - walk the gmap page tables
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @level: page table level to stop at
 *
 * Returns a table entry pointer for the given guest address and @level
 * @level=0 : returns a pointer to a page table entry (or NULL)
 * @level=1 : returns a pointer to a segment table entry (or NULL)
 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
 *
 * Returns NULL if the gmap page tables could not be walked to the
 * requested level.
 *
 * Note: Can also be called for shadow gmaps.
 */
static inline unsigned long *gmap_table_walk(struct gmap *gmap,
                                             unsigned long gaddr, int level)
{
        const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
        unsigned long *table = gmap->table;

        if (gmap_is_shadow(gmap) && gmap->removed)
                return NULL;

        if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
                return NULL;

        if (asce_type != _ASCE_TYPE_REGION1 &&
            gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
                return NULL;

        switch (asce_type) {
        case _ASCE_TYPE_REGION1:
                table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
                if (level == 4)
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION2:
                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
                if (level == 3)
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION3:
                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
                if (level == 2)
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_SEGMENT:
                table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
                if (level == 1)
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
                table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
        }
        return table;
}

/**
 * gmap_pte_op_walk - walk the gmap page table, get the page table lock
 *		      and return the pte pointer
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @ptl: pointer to the spinlock pointer
 *
 * Returns a pointer to the locked pte for a guest address, or NULL
 */
static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
                               spinlock_t **ptl)
{
        unsigned long *table;

        BUG_ON(gmap_is_shadow(gmap));
        /* Walk the gmap page table, lock and get pte pointer */
        table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
        if (!table || *table & _SEGMENT_ENTRY_INVALID)
                return NULL;
        return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
}

/**
 * gmap_pte_op_fixup - force a page in and connect the gmap page table
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @vmaddr: address in the host process address space
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 *
 * Returns 0 if the caller can retry __gmap_translate (might fail again),
 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
 * up or connecting the gmap page table.
 */
static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
                             unsigned long vmaddr, int prot)
{
        struct mm_struct *mm = gmap->mm;
        unsigned int fault_flags;
        bool unlocked = false;

        BUG_ON(gmap_is_shadow(gmap));
        fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
        if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
                return -EFAULT;
        if (unlocked)
                /* lost mmap_lock, caller has to retry __gmap_translate */
                return 0;
        /* Connect the page tables */
        return __gmap_link(gmap, gaddr, vmaddr);
}

/**
 * gmap_pte_op_end - release the page table lock
 * @ptep: pointer to the locked pte
 * @ptl: pointer to the page table spinlock
 */
static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
{
        pte_unmap_unlock(ptep, ptl);
}

/**
 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
 *		      and return the pmd pointer
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 *
 * Returns a pointer to the pmd for a guest address, or NULL
 */
static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
{
        pmd_t *pmdp;

        BUG_ON(gmap_is_shadow(gmap));
        pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
        if (!pmdp)
                return NULL;

        /* without huge pages, there is no need to take the table lock */
        if (!gmap->mm->context.allow_gmap_hpage_1m)
                return pmd_none(*pmdp) ? NULL : pmdp;

        spin_lock(&gmap->guest_table_lock);
        if (pmd_none(*pmdp)) {
                spin_unlock(&gmap->guest_table_lock);
                return NULL;
        }

        /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
        if (!pmd_leaf(*pmdp))
                spin_unlock(&gmap->guest_table_lock);
        return pmdp;
}

/**
 * gmap_pmd_op_end - release the guest_table_lock if needed
 * @gmap: pointer to the guest mapping meta data structure
 * @pmdp: pointer to the pmd
 */
static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
{
        if (pmd_leaf(*pmdp))
                spin_unlock(&gmap->guest_table_lock);
}

/*
 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
 * @pmdp: pointer to the pmd to be protected
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: notification bits to set
 *
 * Returns:
 * 0 if successfully protected
 * -EAGAIN if a fixup is needed
 * -EINVAL if unsupported notifier bits have been specified
 *
 * Expected to be called with sg->mm->mmap_lock in read and
 * guest_table_lock held.
 */
static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
                            pmd_t *pmdp, int prot, unsigned long bits)
{
        int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
        int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
        pmd_t new = *pmdp;

        /* Fixup needed */
        if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
                return -EAGAIN;

        if (prot == PROT_NONE && !pmd_i) {
                new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
                gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
        }

        if (prot == PROT_READ && !pmd_p) {
                new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
                new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
                gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
        }

        if (bits & GMAP_NOTIFY_MPROT)
                set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));

        /* Shadow GMAP protection needs split PMDs */
        if (bits & GMAP_NOTIFY_SHADOW)
                return -EINVAL;

        return 0;
}

/*
 * gmap_protect_pte - remove access rights to memory and set pgste bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @pmdp: pointer to the pmd associated with the pte
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: notification bits to set
 *
 * Returns 0 if successfully protected, -ENOMEM if out of memory and
 * -EAGAIN if a fixup is needed.
 *
 * Expected to be called with sg->mm->mmap_lock in read
 */
static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
                            pmd_t *pmdp, int prot, unsigned long bits)
{
        int rc;
        pte_t *ptep;
        spinlock_t *ptl;
        unsigned long pbits = 0;

        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
                return -EAGAIN;

        ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
        if (!ptep)
                return -ENOMEM;

        pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
        pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
        /* Protect and unlock. */
        rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
        gmap_pte_op_end(ptep, ptl);
        return rc;
}

/*
 * gmap_protect_range - remove access rights to memory and set pgste bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: pgste notification bits to set
 *
 * Returns 0 if successfully protected, -ENOMEM if out of memory and
 * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
 *
 * Called with sg->mm->mmap_lock in read.
 */
static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
                              unsigned long len, int prot, unsigned long bits)
{
        unsigned long vmaddr, dist;
        pmd_t *pmdp;
        int rc;

        BUG_ON(gmap_is_shadow(gmap));
        while (len) {
                rc = -EAGAIN;
                pmdp = gmap_pmd_op_walk(gmap, gaddr);
                if (pmdp) {
                        if (!pmd_leaf(*pmdp)) {
                                rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
                                                      bits);
                                if (!rc) {
                                        len -= PAGE_SIZE;
                                        gaddr += PAGE_SIZE;
                                }
                        } else {
                                rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
                                                      bits);
                                if (!rc) {
                                        dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
                                        len = len < dist ? 0 : len - dist;
                                        gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
                                }
                        }
                        gmap_pmd_op_end(gmap, pmdp);
                }
                if (rc) {
                        if (rc == -EINVAL)
                                return rc;

                        /* -EAGAIN, fixup of userspace mm and gmap */
                        vmaddr = __gmap_translate(gmap, gaddr);
                        if (IS_ERR_VALUE(vmaddr))
                                return vmaddr;
                        rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
                        if (rc)
                                return rc;
                }
        }
        return 0;
}

/**
 * gmap_mprotect_notify - change access rights for a range of ptes and
 *			  call the notifier if any pte changes again
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 *
 * Returns 0 if for each page in the given range a gmap mapping exists,
 * the new access rights could be set and the notifier could be armed.
 * If the gmap mapping is missing for one or more pages -EFAULT is
 * returned. If no memory could be allocated -ENOMEM is returned.
 * This function establishes missing page table entries.
 */
int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
                         unsigned long len, int prot)
{
        int rc;

        if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
                return -EINVAL;
        if (!MACHINE_HAS_ESOP && prot == PROT_READ)
                return -EINVAL;
        mmap_read_lock(gmap->mm);
        rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
        mmap_read_unlock(gmap->mm);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_mprotect_notify);

/**
 * gmap_read_table - get an unsigned long value from a guest page table using
 *		     absolute addressing, without marking the page referenced.
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @val: pointer to the unsigned long value to return
 *
 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
 * if reading using the virtual address failed. -EINVAL if called on a gmap
 * shadow.
 *
 * Called with gmap->mm->mmap_lock in read.
 */
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
{
        unsigned long address, vmaddr;
        spinlock_t *ptl;
        pte_t *ptep, pte;
        int rc;

        if (gmap_is_shadow(gmap))
                return -EINVAL;

        while (1) {
                rc = -EAGAIN;
                ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
                if (ptep) {
                        pte = *ptep;
                        if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
                                address = pte_val(pte) & PAGE_MASK;
                                address += gaddr & ~PAGE_MASK;
                                *val = *(unsigned long *)__va(address);
                                set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
                                /* Do *NOT* clear the _PAGE_INVALID bit! */
                                rc = 0;
                        }
                        gmap_pte_op_end(ptep, ptl);
                }
                if (!rc)
                        break;
                vmaddr = __gmap_translate(gmap, gaddr);
                if (IS_ERR_VALUE(vmaddr)) {
                        rc = vmaddr;
                        break;
                }
                rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
                if (rc)
                        break;
        }
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_read_table);
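
/*
 * For the shadow gmap code below: host_to_rmap is a radix tree keyed by the
 * host page index (vmaddr >> PAGE_SHIFT).  Each slot holds a singly linked
 * list of gmap_rmap entries, and rmap->raddr records which shadow table
 * entry depends on that host page, with the table level encoded in the low
 * bits (see the _SHADOW_RMAP_* defines further down).
 */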

/**
 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
 * @sg: pointer to the shadow guest address space structure
 * @vmaddr: vm address associated with the rmap
 * @rmap: pointer to the rmap structure
 *
 * Called with the sg->guest_table_lock
 */
static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
                                    struct gmap_rmap *rmap)
{
        struct gmap_rmap *temp;
        void __rcu **slot;

        BUG_ON(!gmap_is_shadow(sg));
        slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
        if (slot) {
                rmap->next = radix_tree_deref_slot_protected(slot,
                                                             &sg->guest_table_lock);
                for (temp = rmap->next; temp; temp = temp->next) {
                        if (temp->raddr == rmap->raddr) {
                                kfree(rmap);
                                return;
                        }
                }
                radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
        } else {
                rmap->next = NULL;
                radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
                                  rmap);
        }
}

/**
 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow gmap
 * @paddr: address in the parent guest address space
 * @len: length of the memory area to protect
 *
 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
 * if out of memory and -EFAULT if paddr is invalid.
 */
static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
                             unsigned long paddr, unsigned long len)
{
        struct gmap *parent;
        struct gmap_rmap *rmap;
        unsigned long vmaddr;
        spinlock_t *ptl;
        pte_t *ptep;
        int rc;

        BUG_ON(!gmap_is_shadow(sg));
        parent = sg->parent;
        while (len) {
                vmaddr = __gmap_translate(parent, paddr);
                if (IS_ERR_VALUE(vmaddr))
                        return vmaddr;
                rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
                if (!rmap)
                        return -ENOMEM;
                rmap->raddr = raddr;
                rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
                if (rc) {
                        kfree(rmap);
                        return rc;
                }
                rc = -EAGAIN;
                ptep = gmap_pte_op_walk(parent, paddr, &ptl);
                if (ptep) {
                        spin_lock(&sg->guest_table_lock);
                        rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
                                             PGSTE_VSIE_BIT);
                        if (!rc)
                                gmap_insert_rmap(sg, vmaddr, rmap);
                        spin_unlock(&sg->guest_table_lock);
                        gmap_pte_op_end(ptep, ptl);
                }
                radix_tree_preload_end();
                if (rc) {
                        kfree(rmap);
                        rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
                        if (rc)
                                return rc;
                        continue;
                }
                paddr += PAGE_SIZE;
                len -= PAGE_SIZE;
        }
        return 0;
}

#define _SHADOW_RMAP_MASK	0x7
#define _SHADOW_RMAP_REGION1	0x5
#define _SHADOW_RMAP_REGION2	0x4
#define _SHADOW_RMAP_REGION3	0x3
#define _SHADOW_RMAP_SEGMENT	0x2
#define _SHADOW_RMAP_PGTABLE	0x1
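
/*
 * Example of the rmap address encoding used with the defines above: when
 * gmap_shadow_r2t() protects a region-first table entry it stores
 * raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1, so the low bits
 * of raddr later tell the unshadow code at which level the shadow tables
 * have to be torn down.
 */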

/**
 * gmap_idte_one - invalidate a single region or segment table entry
 * @asce: region or segment table *origin* + table-type bits
 * @vaddr: virtual address to identify the table entry to flush
 *
 * The invalid bit of a single region or segment table entry is set
 * and the associated TLB entries depending on the entry are flushed.
 * The table-type of the @asce identifies the portion of the @vaddr
 * that is used as the invalidation index.
 */
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
        asm volatile(
                " idte %0,0,%1"
                : : "a" (asce), "a" (vaddr) : "cc", "memory");
}

/**
 * gmap_unshadow_page - remove a page from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
{
        unsigned long *table;

        BUG_ON(!gmap_is_shadow(sg));
        table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
        if (!table || *table & _PAGE_INVALID)
                return;
        gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
        ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
}

/**
 * __gmap_unshadow_pgt - remove all entries from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @pgt: pointer to the start of a shadow page table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
                                unsigned long *pgt)
{
        int i;

        BUG_ON(!gmap_is_shadow(sg));
        for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
                pgt[i] = _PAGE_INVALID;
}

/**
 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
{
        unsigned long *ste;
        phys_addr_t sto, pgt;
        struct ptdesc *ptdesc;

        BUG_ON(!gmap_is_shadow(sg));
        ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
        if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
                return;
        gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
        sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
        gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
        pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
        *ste = _SEGMENT_ENTRY_EMPTY;
        __gmap_unshadow_pgt(sg, raddr, __va(pgt));
        /* Free page table */
        ptdesc = page_ptdesc(phys_to_page(pgt));
        list_del(&ptdesc->pt_list);
        page_table_free_pgste(ptdesc);
}

/**
 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @sgt: pointer to the start of a shadow segment table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
                                unsigned long *sgt)
{
        struct ptdesc *ptdesc;
        phys_addr_t pgt;
        int i;

        BUG_ON(!gmap_is_shadow(sg));
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
                if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
                        continue;
                pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
                sgt[i] = _SEGMENT_ENTRY_EMPTY;
                __gmap_unshadow_pgt(sg, raddr, __va(pgt));
                /* Free page table */
                ptdesc = page_ptdesc(phys_to_page(pgt));
                list_del(&ptdesc->pt_list);
                page_table_free_pgste(ptdesc);
        }
}

/**
 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the shadow->guest_table_lock
 */
static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
{
        unsigned long r3o, *r3e;
        phys_addr_t sgt;
        struct page *page;

        BUG_ON(!gmap_is_shadow(sg));
        r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
        if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
                return;
        gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
        r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
        gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
        sgt = *r3e & _REGION_ENTRY_ORIGIN;
        *r3e = _REGION3_ENTRY_EMPTY;
        __gmap_unshadow_sgt(sg, raddr, __va(sgt));
        /* Free segment table */
        page = phys_to_page(sgt);
        list_del(&page->lru);
        __free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 * @r3t: pointer to the start of a shadow region-3 table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
                                unsigned long *r3t)
{
        struct page *page;
        phys_addr_t sgt;
        int i;

        BUG_ON(!gmap_is_shadow(sg));
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
                if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
                        continue;
                sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
                r3t[i] = _REGION3_ENTRY_EMPTY;
                __gmap_unshadow_sgt(sg, raddr, __va(sgt));
                /* Free segment table */
                page = phys_to_page(sgt);
                list_del(&page->lru);
                __free_pages(page, CRST_ALLOC_ORDER);
        }
}

/**
 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
{
        unsigned long r2o, *r2e;
        phys_addr_t r3t;
        struct page *page;

        BUG_ON(!gmap_is_shadow(sg));
        r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
        if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
                return;
        gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
        r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
        gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
        r3t = *r2e & _REGION_ENTRY_ORIGIN;
        *r2e = _REGION2_ENTRY_EMPTY;
        __gmap_unshadow_r3t(sg, raddr, __va(r3t));
        /* Free region 3 table */
        page = phys_to_page(r3t);
        list_del(&page->lru);
        __free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @r2t: pointer to the start of a shadow region-2 table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
                                unsigned long *r2t)
{
        phys_addr_t r3t;
        struct page *page;
        int i;

        BUG_ON(!gmap_is_shadow(sg));
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
                if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
                        continue;
                r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
                r2t[i] = _REGION2_ENTRY_EMPTY;
                __gmap_unshadow_r3t(sg, raddr, __va(r3t));
                /* Free region 3 table */
                page = phys_to_page(r3t);
                list_del(&page->lru);
                __free_pages(page, CRST_ALLOC_ORDER);
        }
}

/**
 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
{
        unsigned long r1o, *r1e;
        struct page *page;
        phys_addr_t r2t;

        BUG_ON(!gmap_is_shadow(sg));
        r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
        if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
                return;
        gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
        r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
        gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
        r2t = *r1e & _REGION_ENTRY_ORIGIN;
        *r1e = _REGION1_ENTRY_EMPTY;
        __gmap_unshadow_r2t(sg, raddr, __va(r2t));
        /* Free region 2 table */
        page = phys_to_page(r2t);
        list_del(&page->lru);
        __free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @r1t: pointer to the start of a shadow region-1 table
 *
 * Called with the shadow->guest_table_lock
 */
static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
                                unsigned long *r1t)
{
        unsigned long asce;
        struct page *page;
        phys_addr_t r2t;
        int i;

        BUG_ON(!gmap_is_shadow(sg));
        asce = __pa(r1t) | _ASCE_TYPE_REGION1;
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
                if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
                        continue;
                r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
                __gmap_unshadow_r2t(sg, raddr, __va(r2t));
                /* Clear entry and flush translation r1t -> r2t */
                gmap_idte_one(asce, raddr);
                r1t[i] = _REGION1_ENTRY_EMPTY;
                /* Free region 2 table */
                page = phys_to_page(r2t);
                list_del(&page->lru);
                __free_pages(page, CRST_ALLOC_ORDER);
        }
}

/**
 * gmap_unshadow - remove a shadow page table completely
 * @sg: pointer to the shadow guest address space structure
 *
 * Called with sg->guest_table_lock
 */
static void gmap_unshadow(struct gmap *sg)
{
        unsigned long *table;

        BUG_ON(!gmap_is_shadow(sg));
        if (sg->removed)
                return;
        sg->removed = 1;
        gmap_call_notifier(sg, 0, -1UL);
        gmap_flush_tlb(sg);
        table = __va(sg->asce & _ASCE_ORIGIN);
        switch (sg->asce & _ASCE_TYPE_MASK) {
        case _ASCE_TYPE_REGION1:
                __gmap_unshadow_r1t(sg, 0, table);
                break;
        case _ASCE_TYPE_REGION2:
                __gmap_unshadow_r2t(sg, 0, table);
                break;
        case _ASCE_TYPE_REGION3:
                __gmap_unshadow_r3t(sg, 0, table);
                break;
        case _ASCE_TYPE_SEGMENT:
                __gmap_unshadow_sgt(sg, 0, table);
                break;
        }
}

/**
 * gmap_find_shadow - find a specific asce in the list of shadow tables
 * @parent: pointer to the parent gmap
 * @asce: ASCE for which the shadow table is created
 * @edat_level: edat level to be used for the shadow translation
 *
 * Returns the pointer to a gmap if a shadow table with the given asce is
 * already available, ERR_PTR(-EAGAIN) if another one is just being created,
 * otherwise NULL
 */
static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
                                     int edat_level)
{
        struct gmap *sg;

        list_for_each_entry(sg, &parent->children, list) {
                if (sg->orig_asce != asce || sg->edat_level != edat_level ||
                    sg->removed)
                        continue;
                if (!sg->initialized)
                        return ERR_PTR(-EAGAIN);
                refcount_inc(&sg->ref_count);
                return sg;
        }
        return NULL;
}

/**
 * gmap_shadow_valid - check if a shadow guest address space matches the
 *		       given properties and is still valid
 * @sg: pointer to the shadow guest address space structure
 * @asce: ASCE for which the shadow table is requested
 * @edat_level: edat level to be used for the shadow translation
 *
 * Returns 1 if the gmap shadow is still valid and matches the given
 * properties, the caller can continue using it. Returns 0 otherwise, the
 * caller has to request a new shadow gmap in this case.
 */
int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
{
        if (sg->removed)
                return 0;
        return sg->orig_asce == asce && sg->edat_level == edat_level;
}
EXPORT_SYMBOL_GPL(gmap_shadow_valid);
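
/*
 * Sketched use of gmap_shadow() below by a nested virtualization (VSIE)
 * style caller; the retry loop and variable names are illustrative only:
 *
 *	do {
 *		sg = gmap_shadow(parent, guest_asce, edat_level);
 *	} while (PTR_ERR_OR_ZERO(sg) == -EAGAIN);
 *	if (IS_ERR(sg))
 *		return PTR_ERR(sg);
 *	...
 *	gmap_put(sg);	// drop the reference returned by gmap_shadow()
 */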

/**
 * gmap_shadow - create/find a shadow guest address space
 * @parent: pointer to the parent gmap
 * @asce: ASCE for which the shadow table is created
 * @edat_level: edat level to be used for the shadow translation
 *
 * The pages of the top level page table referred by the asce parameter
 * will be set to read-only and marked in the PGSTEs of the kvm process.
 * The shadow table will be removed automatically on any change to the
 * PTE mapping for the source table.
 *
 * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
 * parent gmap table could not be protected.
 */
struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
                         int edat_level)
{
        struct gmap *sg, *new;
        unsigned long limit;
        int rc;

        BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
        BUG_ON(gmap_is_shadow(parent));
        spin_lock(&parent->shadow_lock);
        sg = gmap_find_shadow(parent, asce, edat_level);
        spin_unlock(&parent->shadow_lock);
        if (sg)
                return sg;
        /* Create a new shadow gmap */
        limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
        if (asce & _ASCE_REAL_SPACE)
                limit = -1UL;
        new = gmap_alloc(limit);
        if (!new)
                return ERR_PTR(-ENOMEM);
        new->mm = parent->mm;
        new->parent = gmap_get(parent);
        new->private = parent->private;
        new->orig_asce = asce;
        new->edat_level = edat_level;
        new->initialized = false;
        spin_lock(&parent->shadow_lock);
        /* Recheck if another CPU created the same shadow */
        sg = gmap_find_shadow(parent, asce, edat_level);
        if (sg) {
                spin_unlock(&parent->shadow_lock);
                gmap_free(new);
                return sg;
        }
        if (asce & _ASCE_REAL_SPACE) {
                /* only allow one real-space gmap shadow */
                list_for_each_entry(sg, &parent->children, list) {
                        if (sg->orig_asce & _ASCE_REAL_SPACE) {
                                spin_lock(&sg->guest_table_lock);
                                gmap_unshadow(sg);
                                spin_unlock(&sg->guest_table_lock);
                                list_del(&sg->list);
                                gmap_put(sg);
                                break;
                        }
                }
        }
        refcount_set(&new->ref_count, 2);
        list_add(&new->list, &parent->children);
        if (asce & _ASCE_REAL_SPACE) {
                /* nothing to protect, return right away */
                new->initialized = true;
                spin_unlock(&parent->shadow_lock);
                return new;
        }
        spin_unlock(&parent->shadow_lock);
        /* protect after insertion, so it will get properly invalidated */
        mmap_read_lock(parent->mm);
        rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
                                ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
                                PROT_READ, GMAP_NOTIFY_SHADOW);
        mmap_read_unlock(parent->mm);
        spin_lock(&parent->shadow_lock);
        new->initialized = true;
        if (rc) {
                list_del(&new->list);
                gmap_free(new);
                new = ERR_PTR(rc);
        }
        spin_unlock(&parent->shadow_lock);
        return new;
}
EXPORT_SYMBOL_GPL(gmap_shadow);
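
/*
 * gmap_shadow_r2t(), gmap_shadow_r3t() and gmap_shadow_sgt() below all
 * follow the same pattern: allocate the shadow table, hook it into the next
 * higher shadow table with the invalid bit still set, make the parent's
 * table read-only via gmap_protect_rmap() (skipped for @fake tables), and
 * only then clear the invalid bit, or tear the shadow table down again if
 * the protection failed or the entry changed in the meantime.
 */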
1765 */ 1766 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, 1767 int fake) 1768 { 1769 unsigned long raddr, origin, offset, len; 1770 unsigned long *table; 1771 phys_addr_t s_r2t; 1772 struct page *page; 1773 int rc; 1774 1775 BUG_ON(!gmap_is_shadow(sg)); 1776 /* Allocate a shadow region second table */ 1777 page = gmap_alloc_crst(); 1778 if (!page) 1779 return -ENOMEM; 1780 page->index = r2t & _REGION_ENTRY_ORIGIN; 1781 if (fake) 1782 page->index |= GMAP_SHADOW_FAKE_TABLE; 1783 s_r2t = page_to_phys(page); 1784 /* Install shadow region second table */ 1785 spin_lock(&sg->guest_table_lock); 1786 table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ 1787 if (!table) { 1788 rc = -EAGAIN; /* Race with unshadow */ 1789 goto out_free; 1790 } 1791 if (!(*table & _REGION_ENTRY_INVALID)) { 1792 rc = 0; /* Already established */ 1793 goto out_free; 1794 } else if (*table & _REGION_ENTRY_ORIGIN) { 1795 rc = -EAGAIN; /* Race with shadow */ 1796 goto out_free; 1797 } 1798 crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); 1799 /* mark as invalid as long as the parent table is not protected */ 1800 *table = s_r2t | _REGION_ENTRY_LENGTH | 1801 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; 1802 if (sg->edat_level >= 1) 1803 *table |= (r2t & _REGION_ENTRY_PROTECT); 1804 list_add(&page->lru, &sg->crst_list); 1805 if (fake) { 1806 /* nothing to protect for fake tables */ 1807 *table &= ~_REGION_ENTRY_INVALID; 1808 spin_unlock(&sg->guest_table_lock); 1809 return 0; 1810 } 1811 spin_unlock(&sg->guest_table_lock); 1812 /* Make r2t read-only in parent gmap page table */ 1813 raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; 1814 origin = r2t & _REGION_ENTRY_ORIGIN; 1815 offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; 1816 len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; 1817 rc = gmap_protect_rmap(sg, raddr, origin + offset, len); 1818 spin_lock(&sg->guest_table_lock); 1819 if (!rc) { 1820 table = gmap_table_walk(sg, saddr, 4); 1821 if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) 1822 rc = -EAGAIN; /* Race with unshadow */ 1823 else 1824 *table &= ~_REGION_ENTRY_INVALID; 1825 } else { 1826 gmap_unshadow_r2t(sg, raddr); 1827 } 1828 spin_unlock(&sg->guest_table_lock); 1829 return rc; 1830 out_free: 1831 spin_unlock(&sg->guest_table_lock); 1832 __free_pages(page, CRST_ALLOC_ORDER); 1833 return rc; 1834 } 1835 EXPORT_SYMBOL_GPL(gmap_shadow_r2t); 1836 1837 /** 1838 * gmap_shadow_r3t - create a shadow region 3 table 1839 * @sg: pointer to the shadow guest address space structure 1840 * @saddr: faulting address in the shadow gmap 1841 * @r3t: parent gmap address of the region 3 table to get shadowed 1842 * @fake: r3t references contiguous guest memory block, not a r3t 1843 * 1844 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the 1845 * shadow table structure is incomplete, -ENOMEM if out of memory and 1846 * -EFAULT if an address in the parent gmap could not be resolved. 1847 * 1848 * Called with sg->mm->mmap_lock in read. 
 */
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *table;
	phys_addr_t s_r3t;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	/* Allocate a shadow region third table */
	page = gmap_alloc_crst();
	if (!page)
		return -ENOMEM;
	page->index = r3t & _REGION_ENTRY_ORIGIN;
	if (fake)
		page->index |= GMAP_SHADOW_FAKE_TABLE;
	s_r3t = page_to_phys(page);
	/* Install shadow region third table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = s_r3t | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
	if (sg->edat_level >= 1)
		*table |= (r3t & _REGION_ENTRY_PROTECT);
	list_add(&page->lru, &sg->crst_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make r3t read-only in parent gmap page table */
	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
	origin = r3t & _REGION_ENTRY_ORIGIN;
	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		table = gmap_table_walk(sg, saddr, 3);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		gmap_unshadow_r3t(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_r3t);

/**
 * gmap_shadow_sgt - create a shadow segment table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @sgt: parent gmap address of the segment table to get shadowed
 * @fake: sgt references contiguous guest memory block, not a sgt
 *
 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
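 *
 * Editorial note: @sgt must not reference a huge (large region-3) mapping;
 * the BUG_ON() below enforces this. With @fake set, @sgt describes a
 * contiguous block of guest memory rather than a real segment table, so
 * there is nothing to protect in the parent gmap.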
 */
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *table;
	phys_addr_t s_sgt;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
	/* Allocate a shadow segment table */
	page = gmap_alloc_crst();
	if (!page)
		return -ENOMEM;
	page->index = sgt & _REGION_ENTRY_ORIGIN;
	if (fake)
		page->index |= GMAP_SHADOW_FAKE_TABLE;
	s_sgt = page_to_phys(page);
	/* Install shadow segment table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = s_sgt | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
	if (sg->edat_level >= 1)
		*table |= sgt & _REGION_ENTRY_PROTECT;
	list_add(&page->lru, &sg->crst_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make sgt read-only in parent gmap page table */
	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
	origin = sgt & _REGION_ENTRY_ORIGIN;
	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		table = gmap_table_walk(sg, saddr, 2);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		gmap_unshadow_sgt(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);

/**
 * gmap_shadow_pgt_lookup - find a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: the address in the shadow guest address space
 * @pgt: parent gmap address of the page table to get shadowed
 * @dat_protection: if the pgtable is marked as protected by dat
 * @fake: pgt references contiguous guest memory block, not a pgtable
 *
 * Returns 0 if the shadow page table was found and -EAGAIN if the page
 * table was not found.
 *
 * Called with sg->mm->mmap_lock in read.
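 *
 * Illustrative use (hypothetical caller, not taken from this file):
 *
 *	unsigned long pgt;
 *	int dat_protection, fake;
 *	int rc;
 *
 *	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
 *
 * On success (rc == 0), @pgt holds the parent gmap address of the guest
 * page table, @dat_protection reflects the DAT protection bit and @fake
 * tells whether the "table" only marks a contiguous block of guest memory.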
 */
int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
			   unsigned long *pgt, int *dat_protection,
			   int *fake)
{
	unsigned long *table;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
	if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
		/* Shadow page tables are full pages (pte+pgste) */
		page = pfn_to_page(*table >> PAGE_SHIFT);
		*pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
		*dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
		*fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
		rc = 0;
	} else {
		rc = -EAGAIN;
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;

}
EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);

/**
 * gmap_shadow_pgt - instantiate a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @pgt: parent gmap address of the page table to get shadowed
 * @fake: pgt references contiguous guest memory block, not a pgtable
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
		    int fake)
{
	unsigned long raddr, origin;
	unsigned long *table;
	struct ptdesc *ptdesc;
	phys_addr_t s_pgt;
	int rc;

	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
	/* Allocate a shadow page table */
	ptdesc = page_table_alloc_pgste(sg->mm);
	if (!ptdesc)
		return -ENOMEM;
	ptdesc->pt_index = pgt & _SEGMENT_ENTRY_ORIGIN;
	if (fake)
		ptdesc->pt_index |= GMAP_SHADOW_FAKE_TABLE;
	s_pgt = page_to_phys(ptdesc_page(ptdesc));
	/* Install shadow page table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	/* mark as invalid as long as the parent table is not protected */
	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
	list_add(&ptdesc->pt_list, &sg->pt_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_SEGMENT_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make pgt read-only in parent gmap page table (not the pgste) */
	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		table = gmap_table_walk(sg, saddr, 1);
		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_SEGMENT_ENTRY_INVALID;
	} else {
		gmap_unshadow_pgt(sg, raddr);
	}
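	/*
	 * Editorial note: at this point the segment entry has either been
	 * validated (gmap_protect_rmap() succeeded and no concurrent unshadow
	 * replaced it) or the partially built shadow page table has been
	 * removed again via gmap_unshadow_pgt().
	 */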
spin_unlock(&sg->guest_table_lock); 2116 return rc; 2117 out_free: 2118 spin_unlock(&sg->guest_table_lock); 2119 page_table_free_pgste(ptdesc); 2120 return rc; 2121 2122 } 2123 EXPORT_SYMBOL_GPL(gmap_shadow_pgt); 2124 2125 /** 2126 * gmap_shadow_page - create a shadow page mapping 2127 * @sg: pointer to the shadow guest address space structure 2128 * @saddr: faulting address in the shadow gmap 2129 * @pte: pte in parent gmap address space to get shadowed 2130 * 2131 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the 2132 * shadow table structure is incomplete, -ENOMEM if out of memory and 2133 * -EFAULT if an address in the parent gmap could not be resolved. 2134 * 2135 * Called with sg->mm->mmap_lock in read. 2136 */ 2137 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) 2138 { 2139 struct gmap *parent; 2140 struct gmap_rmap *rmap; 2141 unsigned long vmaddr, paddr; 2142 spinlock_t *ptl; 2143 pte_t *sptep, *tptep; 2144 int prot; 2145 int rc; 2146 2147 BUG_ON(!gmap_is_shadow(sg)); 2148 parent = sg->parent; 2149 prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; 2150 2151 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); 2152 if (!rmap) 2153 return -ENOMEM; 2154 rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; 2155 2156 while (1) { 2157 paddr = pte_val(pte) & PAGE_MASK; 2158 vmaddr = __gmap_translate(parent, paddr); 2159 if (IS_ERR_VALUE(vmaddr)) { 2160 rc = vmaddr; 2161 break; 2162 } 2163 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); 2164 if (rc) 2165 break; 2166 rc = -EAGAIN; 2167 sptep = gmap_pte_op_walk(parent, paddr, &ptl); 2168 if (sptep) { 2169 spin_lock(&sg->guest_table_lock); 2170 /* Get page table pointer */ 2171 tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); 2172 if (!tptep) { 2173 spin_unlock(&sg->guest_table_lock); 2174 gmap_pte_op_end(sptep, ptl); 2175 radix_tree_preload_end(); 2176 break; 2177 } 2178 rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); 2179 if (rc > 0) { 2180 /* Success and a new mapping */ 2181 gmap_insert_rmap(sg, vmaddr, rmap); 2182 rmap = NULL; 2183 rc = 0; 2184 } 2185 gmap_pte_op_end(sptep, ptl); 2186 spin_unlock(&sg->guest_table_lock); 2187 } 2188 radix_tree_preload_end(); 2189 if (!rc) 2190 break; 2191 rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); 2192 if (rc) 2193 break; 2194 } 2195 kfree(rmap); 2196 return rc; 2197 } 2198 EXPORT_SYMBOL_GPL(gmap_shadow_page); 2199 2200 /* 2201 * gmap_shadow_notify - handle notifications for shadow gmap 2202 * 2203 * Called with sg->parent->shadow_lock. 
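 *
 * Editorial note: the notification either tears down the complete shadow
 * gmap (when the write hit the top level table that the shadow ASCE points
 * to) or only the subtrees recorded in the host_to_rmap tree for @vmaddr.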
 */
static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
			       unsigned long gaddr)
{
	struct gmap_rmap *rmap, *rnext, *head;
	unsigned long start, end, bits, raddr;

	BUG_ON(!gmap_is_shadow(sg));

	spin_lock(&sg->guest_table_lock);
	if (sg->removed) {
		spin_unlock(&sg->guest_table_lock);
		return;
	}
	/* Check for top level table */
	start = sg->orig_asce & _ASCE_ORIGIN;
	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
	    gaddr < end) {
		/* The complete shadow table has to go */
		gmap_unshadow(sg);
		spin_unlock(&sg->guest_table_lock);
		list_del(&sg->list);
		gmap_put(sg);
		return;
	}
	/* Remove the page table tree for one specific entry */
	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
	gmap_for_each_rmap_safe(rmap, rnext, head) {
		bits = rmap->raddr & _SHADOW_RMAP_MASK;
		raddr = rmap->raddr ^ bits;
		switch (bits) {
		case _SHADOW_RMAP_REGION1:
			gmap_unshadow_r2t(sg, raddr);
			break;
		case _SHADOW_RMAP_REGION2:
			gmap_unshadow_r3t(sg, raddr);
			break;
		case _SHADOW_RMAP_REGION3:
			gmap_unshadow_sgt(sg, raddr);
			break;
		case _SHADOW_RMAP_SEGMENT:
			gmap_unshadow_pgt(sg, raddr);
			break;
		case _SHADOW_RMAP_PGTABLE:
			gmap_unshadow_page(sg, raddr);
			break;
		}
		kfree(rmap);
	}
	spin_unlock(&sg->guest_table_lock);
}

/**
 * ptep_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 * @bits: bits from the pgste that caused the notify call
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
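 *
 * Worked example (editorial illustration): the pte pointer's byte offset
 * within its page table is index * sizeof(pte_t); multiplying by
 * PAGE_SIZE / sizeof(pte_t) turns this into index * PAGE_SIZE, the offset
 * of the page within its segment. For the pte at index 5 this yields
 * 5 * 8 * 512 = 5 * PAGE_SIZE, which is added to the segment guest address.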
2266 */ 2267 void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, 2268 pte_t *pte, unsigned long bits) 2269 { 2270 unsigned long offset, gaddr = 0; 2271 unsigned long *table; 2272 struct gmap *gmap, *sg, *next; 2273 2274 offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); 2275 offset = offset * (PAGE_SIZE / sizeof(pte_t)); 2276 rcu_read_lock(); 2277 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2278 spin_lock(&gmap->guest_table_lock); 2279 table = radix_tree_lookup(&gmap->host_to_guest, 2280 vmaddr >> PMD_SHIFT); 2281 if (table) 2282 gaddr = __gmap_segment_gaddr(table) + offset; 2283 spin_unlock(&gmap->guest_table_lock); 2284 if (!table) 2285 continue; 2286 2287 if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { 2288 spin_lock(&gmap->shadow_lock); 2289 list_for_each_entry_safe(sg, next, 2290 &gmap->children, list) 2291 gmap_shadow_notify(sg, vmaddr, gaddr); 2292 spin_unlock(&gmap->shadow_lock); 2293 } 2294 if (bits & PGSTE_IN_BIT) 2295 gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); 2296 } 2297 rcu_read_unlock(); 2298 } 2299 EXPORT_SYMBOL_GPL(ptep_notify); 2300 2301 static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, 2302 unsigned long gaddr) 2303 { 2304 set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); 2305 gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); 2306 } 2307 2308 /** 2309 * gmap_pmdp_xchg - exchange a gmap pmd with another 2310 * @gmap: pointer to the guest address space structure 2311 * @pmdp: pointer to the pmd entry 2312 * @new: replacement entry 2313 * @gaddr: the affected guest address 2314 * 2315 * This function is assumed to be called with the guest_table_lock 2316 * held. 2317 */ 2318 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, 2319 unsigned long gaddr) 2320 { 2321 gaddr &= HPAGE_MASK; 2322 pmdp_notify_gmap(gmap, pmdp, gaddr); 2323 new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); 2324 if (MACHINE_HAS_TLB_GUEST) 2325 __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, 2326 IDTE_GLOBAL); 2327 else if (MACHINE_HAS_IDTE) 2328 __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); 2329 else 2330 __pmdp_csp(pmdp); 2331 set_pmd(pmdp, new); 2332 } 2333 2334 static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, 2335 int purge) 2336 { 2337 pmd_t *pmdp; 2338 struct gmap *gmap; 2339 unsigned long gaddr; 2340 2341 rcu_read_lock(); 2342 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2343 spin_lock(&gmap->guest_table_lock); 2344 pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, 2345 vmaddr >> PMD_SHIFT); 2346 if (pmdp) { 2347 gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); 2348 pmdp_notify_gmap(gmap, pmdp, gaddr); 2349 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2350 _SEGMENT_ENTRY_GMAP_UC)); 2351 if (purge) 2352 __pmdp_csp(pmdp); 2353 set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); 2354 } 2355 spin_unlock(&gmap->guest_table_lock); 2356 } 2357 rcu_read_unlock(); 2358 } 2359 2360 /** 2361 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without 2362 * flushing 2363 * @mm: pointer to the process mm_struct 2364 * @vmaddr: virtual address in the process address space 2365 */ 2366 void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) 2367 { 2368 gmap_pmdp_clear(mm, vmaddr, 0); 2369 } 2370 EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); 2371 2372 /** 2373 * gmap_pmdp_csp - csp all affected guest pmd entries 2374 * @mm: pointer to the process mm_struct 2375 * @vmaddr: virtual 
address in the process address space 2376 */ 2377 void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) 2378 { 2379 gmap_pmdp_clear(mm, vmaddr, 1); 2380 } 2381 EXPORT_SYMBOL_GPL(gmap_pmdp_csp); 2382 2383 /** 2384 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry 2385 * @mm: pointer to the process mm_struct 2386 * @vmaddr: virtual address in the process address space 2387 */ 2388 void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) 2389 { 2390 unsigned long *entry, gaddr; 2391 struct gmap *gmap; 2392 pmd_t *pmdp; 2393 2394 rcu_read_lock(); 2395 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2396 spin_lock(&gmap->guest_table_lock); 2397 entry = radix_tree_delete(&gmap->host_to_guest, 2398 vmaddr >> PMD_SHIFT); 2399 if (entry) { 2400 pmdp = (pmd_t *)entry; 2401 gaddr = __gmap_segment_gaddr(entry); 2402 pmdp_notify_gmap(gmap, pmdp, gaddr); 2403 WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2404 _SEGMENT_ENTRY_GMAP_UC)); 2405 if (MACHINE_HAS_TLB_GUEST) 2406 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, 2407 gmap->asce, IDTE_LOCAL); 2408 else if (MACHINE_HAS_IDTE) 2409 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); 2410 *entry = _SEGMENT_ENTRY_EMPTY; 2411 } 2412 spin_unlock(&gmap->guest_table_lock); 2413 } 2414 rcu_read_unlock(); 2415 } 2416 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); 2417 2418 /** 2419 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry 2420 * @mm: pointer to the process mm_struct 2421 * @vmaddr: virtual address in the process address space 2422 */ 2423 void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) 2424 { 2425 unsigned long *entry, gaddr; 2426 struct gmap *gmap; 2427 pmd_t *pmdp; 2428 2429 rcu_read_lock(); 2430 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2431 spin_lock(&gmap->guest_table_lock); 2432 entry = radix_tree_delete(&gmap->host_to_guest, 2433 vmaddr >> PMD_SHIFT); 2434 if (entry) { 2435 pmdp = (pmd_t *)entry; 2436 gaddr = __gmap_segment_gaddr(entry); 2437 pmdp_notify_gmap(gmap, pmdp, gaddr); 2438 WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2439 _SEGMENT_ENTRY_GMAP_UC)); 2440 if (MACHINE_HAS_TLB_GUEST) 2441 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, 2442 gmap->asce, IDTE_GLOBAL); 2443 else if (MACHINE_HAS_IDTE) 2444 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); 2445 else 2446 __pmdp_csp(pmdp); 2447 *entry = _SEGMENT_ENTRY_EMPTY; 2448 } 2449 spin_unlock(&gmap->guest_table_lock); 2450 } 2451 rcu_read_unlock(); 2452 } 2453 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); 2454 2455 /** 2456 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status 2457 * @gmap: pointer to guest address space 2458 * @pmdp: pointer to the pmd to be tested 2459 * @gaddr: virtual address in the guest address space 2460 * 2461 * This function is assumed to be called with the guest_table_lock 2462 * held. 
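 *
 * Editorial note: a huge segment counts as dirty when it is still writable
 * or carries the _SEGMENT_ENTRY_GMAP_UC software bit; clearing that bit and
 * re-protecting the segment for PROT_READ ensures the next guest write is
 * caught again.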
2463 */ 2464 static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, 2465 unsigned long gaddr) 2466 { 2467 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) 2468 return false; 2469 2470 /* Already protected memory, which did not change is clean */ 2471 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && 2472 !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) 2473 return false; 2474 2475 /* Clear UC indication and reset protection */ 2476 set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); 2477 gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); 2478 return true; 2479 } 2480 2481 /** 2482 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment 2483 * @gmap: pointer to guest address space 2484 * @bitmap: dirty bitmap for this pmd 2485 * @gaddr: virtual address in the guest address space 2486 * @vmaddr: virtual address in the host address space 2487 * 2488 * This function is assumed to be called with the guest_table_lock 2489 * held. 2490 */ 2491 void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], 2492 unsigned long gaddr, unsigned long vmaddr) 2493 { 2494 int i; 2495 pmd_t *pmdp; 2496 pte_t *ptep; 2497 spinlock_t *ptl; 2498 2499 pmdp = gmap_pmd_op_walk(gmap, gaddr); 2500 if (!pmdp) 2501 return; 2502 2503 if (pmd_leaf(*pmdp)) { 2504 if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) 2505 bitmap_fill(bitmap, _PAGE_ENTRIES); 2506 } else { 2507 for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { 2508 ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); 2509 if (!ptep) 2510 continue; 2511 if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) 2512 set_bit(i, bitmap); 2513 pte_unmap_unlock(ptep, ptl); 2514 } 2515 } 2516 gmap_pmd_op_end(gmap, pmdp); 2517 } 2518 EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); 2519 2520 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2521 static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, 2522 unsigned long end, struct mm_walk *walk) 2523 { 2524 struct vm_area_struct *vma = walk->vma; 2525 2526 split_huge_pmd(vma, pmd, addr); 2527 return 0; 2528 } 2529 2530 static const struct mm_walk_ops thp_split_walk_ops = { 2531 .pmd_entry = thp_split_walk_pmd_entry, 2532 .walk_lock = PGWALK_WRLOCK_VERIFY, 2533 }; 2534 2535 static inline void thp_split_mm(struct mm_struct *mm) 2536 { 2537 struct vm_area_struct *vma; 2538 VMA_ITERATOR(vmi, mm, 0); 2539 2540 for_each_vma(vmi, vma) { 2541 vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); 2542 walk_page_vma(vma, &thp_split_walk_ops, NULL); 2543 } 2544 mm->def_flags |= VM_NOHUGEPAGE; 2545 } 2546 #else 2547 static inline void thp_split_mm(struct mm_struct *mm) 2548 { 2549 } 2550 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2551 2552 /* 2553 * switch on pgstes for its userspace process (for kvm) 2554 */ 2555 int s390_enable_sie(void) 2556 { 2557 struct mm_struct *mm = current->mm; 2558 2559 /* Do we have pgstes? if yes, we are done */ 2560 if (mm_has_pgste(mm)) 2561 return 0; 2562 /* Fail if the page tables are 2K */ 2563 if (!mm_alloc_pgste(mm)) 2564 return -EINVAL; 2565 mmap_write_lock(mm); 2566 mm->context.has_pgste = 1; 2567 /* split thp mappings and disable thp for future mappings */ 2568 thp_split_mm(mm); 2569 mmap_write_unlock(mm); 2570 return 0; 2571 } 2572 EXPORT_SYMBOL_GPL(s390_enable_sie); 2573 2574 static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, 2575 unsigned long end, struct mm_walk *walk) 2576 { 2577 unsigned long *found_addr = walk->private; 2578 2579 /* Return 1 of the page is a zeropage. 
*/ 2580 if (is_zero_pfn(pte_pfn(*pte))) { 2581 /* 2582 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the 2583 * right thing and likely don't care: FAULT_FLAG_UNSHARE 2584 * currently only works in COW mappings, which is also where 2585 * mm_forbids_zeropage() is checked. 2586 */ 2587 if (!is_cow_mapping(walk->vma->vm_flags)) 2588 return -EFAULT; 2589 2590 *found_addr = addr; 2591 return 1; 2592 } 2593 return 0; 2594 } 2595 2596 static const struct mm_walk_ops find_zeropage_ops = { 2597 .pte_entry = find_zeropage_pte_entry, 2598 .walk_lock = PGWALK_WRLOCK, 2599 }; 2600 2601 /* 2602 * Unshare all shared zeropages, replacing them by anonymous pages. Note that 2603 * we cannot simply zap all shared zeropages, because this could later 2604 * trigger unexpected userfaultfd missing events. 2605 * 2606 * This must be called after mm->context.allow_cow_sharing was 2607 * set to 0, to avoid future mappings of shared zeropages. 2608 * 2609 * mm contracts with s390, that even if mm were to remove a page table, 2610 * and racing with walk_page_range_vma() calling pte_offset_map_lock() 2611 * would fail, it will never insert a page table containing empty zero 2612 * pages once mm_forbids_zeropage(mm) i.e. 2613 * mm->context.allow_cow_sharing is set to 0. 2614 */ 2615 static int __s390_unshare_zeropages(struct mm_struct *mm) 2616 { 2617 struct vm_area_struct *vma; 2618 VMA_ITERATOR(vmi, mm, 0); 2619 unsigned long addr; 2620 vm_fault_t fault; 2621 int rc; 2622 2623 for_each_vma(vmi, vma) { 2624 /* 2625 * We could only look at COW mappings, but it's more future 2626 * proof to catch unexpected zeropages in other mappings and 2627 * fail. 2628 */ 2629 if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma)) 2630 continue; 2631 addr = vma->vm_start; 2632 2633 retry: 2634 rc = walk_page_range_vma(vma, addr, vma->vm_end, 2635 &find_zeropage_ops, &addr); 2636 if (rc < 0) 2637 return rc; 2638 else if (!rc) 2639 continue; 2640 2641 /* addr was updated by find_zeropage_pte_entry() */ 2642 fault = handle_mm_fault(vma, addr, 2643 FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, 2644 NULL); 2645 if (fault & VM_FAULT_OOM) 2646 return -ENOMEM; 2647 /* 2648 * See break_ksm(): even after handle_mm_fault() returned 0, we 2649 * must start the lookup from the current address, because 2650 * handle_mm_fault() may back out if there's any difficulty. 2651 * 2652 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but 2653 * maybe they could trigger in the future on concurrent 2654 * truncation. In that case, the shared zeropage would be gone 2655 * and we can simply retry and make progress. 2656 */ 2657 cond_resched(); 2658 goto retry; 2659 } 2660 2661 return 0; 2662 } 2663 2664 static int __s390_disable_cow_sharing(struct mm_struct *mm) 2665 { 2666 int rc; 2667 2668 if (!mm->context.allow_cow_sharing) 2669 return 0; 2670 2671 mm->context.allow_cow_sharing = 0; 2672 2673 /* Replace all shared zeropages by anonymous pages. */ 2674 rc = __s390_unshare_zeropages(mm); 2675 /* 2676 * Make sure to disable KSM (if enabled for the whole process or 2677 * individual VMAs). Note that nothing currently hinders user space 2678 * from re-enabling it. 2679 */ 2680 if (!rc) 2681 rc = ksm_disable(mm); 2682 if (rc) 2683 mm->context.allow_cow_sharing = 1; 2684 return rc; 2685 } 2686 2687 /* 2688 * Disable most COW-sharing of memory pages for the whole process: 2689 * (1) Disable KSM and unmerge/unshare any KSM pages. 2690 * (2) Disallow shared zeropages and unshare any zerpages that are mapped. 
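 *     (shared zeropages that are already mapped are replaced by anonymous
 *     pages through FAULT_FLAG_UNSHARE faults, see __s390_unshare_zeropages())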
2691 * 2692 * Not that we currently don't bother with COW-shared pages that are shared 2693 * with parent/child processes due to fork(). 2694 */ 2695 int s390_disable_cow_sharing(void) 2696 { 2697 int rc; 2698 2699 mmap_write_lock(current->mm); 2700 rc = __s390_disable_cow_sharing(current->mm); 2701 mmap_write_unlock(current->mm); 2702 return rc; 2703 } 2704 EXPORT_SYMBOL_GPL(s390_disable_cow_sharing); 2705 2706 /* 2707 * Enable storage key handling from now on and initialize the storage 2708 * keys with the default key. 2709 */ 2710 static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, 2711 unsigned long next, struct mm_walk *walk) 2712 { 2713 /* Clear storage key */ 2714 ptep_zap_key(walk->mm, addr, pte); 2715 return 0; 2716 } 2717 2718 /* 2719 * Give a chance to schedule after setting a key to 256 pages. 2720 * We only hold the mm lock, which is a rwsem and the kvm srcu. 2721 * Both can sleep. 2722 */ 2723 static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, 2724 unsigned long next, struct mm_walk *walk) 2725 { 2726 cond_resched(); 2727 return 0; 2728 } 2729 2730 static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, 2731 unsigned long hmask, unsigned long next, 2732 struct mm_walk *walk) 2733 { 2734 pmd_t *pmd = (pmd_t *)pte; 2735 unsigned long start, end; 2736 struct folio *folio = page_folio(pmd_page(*pmd)); 2737 2738 /* 2739 * The write check makes sure we do not set a key on shared 2740 * memory. This is needed as the walker does not differentiate 2741 * between actual guest memory and the process executable or 2742 * shared libraries. 2743 */ 2744 if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || 2745 !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) 2746 return 0; 2747 2748 start = pmd_val(*pmd) & HPAGE_MASK; 2749 end = start + HPAGE_SIZE; 2750 __storage_key_init_range(start, end); 2751 set_bit(PG_arch_1, &folio->flags); 2752 cond_resched(); 2753 return 0; 2754 } 2755 2756 static const struct mm_walk_ops enable_skey_walk_ops = { 2757 .hugetlb_entry = __s390_enable_skey_hugetlb, 2758 .pte_entry = __s390_enable_skey_pte, 2759 .pmd_entry = __s390_enable_skey_pmd, 2760 .walk_lock = PGWALK_WRLOCK, 2761 }; 2762 2763 int s390_enable_skey(void) 2764 { 2765 struct mm_struct *mm = current->mm; 2766 int rc = 0; 2767 2768 mmap_write_lock(mm); 2769 if (mm_uses_skeys(mm)) 2770 goto out_up; 2771 2772 mm->context.uses_skeys = 1; 2773 rc = __s390_disable_cow_sharing(mm); 2774 if (rc) { 2775 mm->context.uses_skeys = 0; 2776 goto out_up; 2777 } 2778 walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); 2779 2780 out_up: 2781 mmap_write_unlock(mm); 2782 return rc; 2783 } 2784 EXPORT_SYMBOL_GPL(s390_enable_skey); 2785 2786 /* 2787 * Reset CMMA state, make all pages stable again. 
2788 */ 2789 static int __s390_reset_cmma(pte_t *pte, unsigned long addr, 2790 unsigned long next, struct mm_walk *walk) 2791 { 2792 ptep_zap_unused(walk->mm, addr, pte, 1); 2793 return 0; 2794 } 2795 2796 static const struct mm_walk_ops reset_cmma_walk_ops = { 2797 .pte_entry = __s390_reset_cmma, 2798 .walk_lock = PGWALK_WRLOCK, 2799 }; 2800 2801 void s390_reset_cmma(struct mm_struct *mm) 2802 { 2803 mmap_write_lock(mm); 2804 walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); 2805 mmap_write_unlock(mm); 2806 } 2807 EXPORT_SYMBOL_GPL(s390_reset_cmma); 2808 2809 #define GATHER_GET_PAGES 32 2810 2811 struct reset_walk_state { 2812 unsigned long next; 2813 unsigned long count; 2814 unsigned long pfns[GATHER_GET_PAGES]; 2815 }; 2816 2817 static int s390_gather_pages(pte_t *ptep, unsigned long addr, 2818 unsigned long next, struct mm_walk *walk) 2819 { 2820 struct reset_walk_state *p = walk->private; 2821 pte_t pte = READ_ONCE(*ptep); 2822 2823 if (pte_present(pte)) { 2824 /* we have a reference from the mapping, take an extra one */ 2825 get_page(phys_to_page(pte_val(pte))); 2826 p->pfns[p->count] = phys_to_pfn(pte_val(pte)); 2827 p->next = next; 2828 p->count++; 2829 } 2830 return p->count >= GATHER_GET_PAGES; 2831 } 2832 2833 static const struct mm_walk_ops gather_pages_ops = { 2834 .pte_entry = s390_gather_pages, 2835 .walk_lock = PGWALK_RDLOCK, 2836 }; 2837 2838 /* 2839 * Call the Destroy secure page UVC on each page in the given array of PFNs. 2840 * Each page needs to have an extra reference, which will be released here. 2841 */ 2842 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) 2843 { 2844 struct folio *folio; 2845 unsigned long i; 2846 2847 for (i = 0; i < count; i++) { 2848 folio = pfn_folio(pfns[i]); 2849 /* we always have an extra reference */ 2850 uv_destroy_folio(folio); 2851 /* get rid of the extra reference */ 2852 folio_put(folio); 2853 cond_resched(); 2854 } 2855 } 2856 EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); 2857 2858 /** 2859 * __s390_uv_destroy_range - Call the destroy secure page UVC on each page 2860 * in the given range of the given address space. 2861 * @mm: the mm to operate on 2862 * @start: the start of the range 2863 * @end: the end of the range 2864 * @interruptible: if not 0, stop when a fatal signal is received 2865 * 2866 * Walk the given range of the given address space and call the destroy 2867 * secure page UVC on each page. Optionally exit early if a fatal signal is 2868 * pending. 2869 * 2870 * Return: 0 on success, -EINTR if the function stopped before completing 2871 */ 2872 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, 2873 unsigned long end, bool interruptible) 2874 { 2875 struct reset_walk_state state = { .next = start }; 2876 int r = 1; 2877 2878 while (r > 0) { 2879 state.count = 0; 2880 mmap_read_lock(mm); 2881 r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); 2882 mmap_read_unlock(mm); 2883 cond_resched(); 2884 s390_uv_destroy_pfns(state.count, state.pfns); 2885 if (interruptible && fatal_signal_pending(current)) 2886 return -EINTR; 2887 } 2888 return 0; 2889 } 2890 EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); 2891 2892 /** 2893 * s390_unlist_old_asce - Remove the topmost level of page tables from the 2894 * list of page tables of the gmap. 2895 * @gmap: the gmap whose table is to be removed 2896 * 2897 * On s390x, KVM keeps a list of all pages containing the page tables of the 2898 * gmap (the CRST list). 
This list is used at tear down time to free all 2899 * pages that are now not needed anymore. 2900 * 2901 * This function removes the topmost page of the tree (the one pointed to by 2902 * the ASCE) from the CRST list. 2903 * 2904 * This means that it will not be freed when the VM is torn down, and needs 2905 * to be handled separately by the caller, unless a leak is actually 2906 * intended. Notice that this function will only remove the page from the 2907 * list, the page will still be used as a top level page table (and ASCE). 2908 */ 2909 void s390_unlist_old_asce(struct gmap *gmap) 2910 { 2911 struct page *old; 2912 2913 old = virt_to_page(gmap->table); 2914 spin_lock(&gmap->guest_table_lock); 2915 list_del(&old->lru); 2916 /* 2917 * Sometimes the topmost page might need to be "removed" multiple 2918 * times, for example if the VM is rebooted into secure mode several 2919 * times concurrently, or if s390_replace_asce fails after calling 2920 * s390_remove_old_asce and is attempted again later. In that case 2921 * the old asce has been removed from the list, and therefore it 2922 * will not be freed when the VM terminates, but the ASCE is still 2923 * in use and still pointed to. 2924 * A subsequent call to replace_asce will follow the pointer and try 2925 * to remove the same page from the list again. 2926 * Therefore it's necessary that the page of the ASCE has valid 2927 * pointers, so list_del can work (and do nothing) without 2928 * dereferencing stale or invalid pointers. 2929 */ 2930 INIT_LIST_HEAD(&old->lru); 2931 spin_unlock(&gmap->guest_table_lock); 2932 } 2933 EXPORT_SYMBOL_GPL(s390_unlist_old_asce); 2934 2935 /** 2936 * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy 2937 * @gmap: the gmap whose ASCE needs to be replaced 2938 * 2939 * If the ASCE is a SEGMENT type then this function will return -EINVAL, 2940 * otherwise the pointers in the host_to_guest radix tree will keep pointing 2941 * to the wrong pages, causing use-after-free and memory corruption. 2942 * If the allocation of the new top level page table fails, the ASCE is not 2943 * replaced. 2944 * In any case, the old ASCE is always removed from the gmap CRST list. 2945 * Therefore the caller has to make sure to save a pointer to it 2946 * beforehand, unless a leak is actually intended. 2947 */ 2948 int s390_replace_asce(struct gmap *gmap) 2949 { 2950 unsigned long asce; 2951 struct page *page; 2952 void *table; 2953 2954 s390_unlist_old_asce(gmap); 2955 2956 /* Replacing segment type ASCEs would cause serious issues */ 2957 if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) 2958 return -EINVAL; 2959 2960 page = gmap_alloc_crst(); 2961 if (!page) 2962 return -ENOMEM; 2963 page->index = 0; 2964 table = page_to_virt(page); 2965 memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); 2966 2967 /* 2968 * The caller has to deal with the old ASCE, but here we make sure 2969 * the new one is properly added to the CRST list, so that 2970 * it will be freed when the VM is torn down. 2971 */ 2972 spin_lock(&gmap->guest_table_lock); 2973 list_add(&page->lru, &gmap->crst_list); 2974 spin_unlock(&gmap->guest_table_lock); 2975 2976 /* Set new table origin while preserving existing ASCE control bits */ 2977 asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); 2978 WRITE_ONCE(gmap->asce, asce); 2979 WRITE_ONCE(gmap->mm->context.gmap_asce, asce); 2980 WRITE_ONCE(gmap->table, table); 2981 2982 return 0; 2983 } 2984 EXPORT_SYMBOL_GPL(s390_replace_asce); 2985