1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * KVM guest address space mapping code 4 * 5 * Copyright IBM Corp. 2007, 2020 6 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> 7 * David Hildenbrand <david@redhat.com> 8 * Janosch Frank <frankja@linux.vnet.ibm.com> 9 */ 10 11 #include <linux/kernel.h> 12 #include <linux/pagewalk.h> 13 #include <linux/swap.h> 14 #include <linux/smp.h> 15 #include <linux/spinlock.h> 16 #include <linux/slab.h> 17 #include <linux/swapops.h> 18 #include <linux/ksm.h> 19 #include <linux/mman.h> 20 #include <linux/pgtable.h> 21 #include <asm/page-states.h> 22 #include <asm/pgalloc.h> 23 #include <asm/gmap.h> 24 #include <asm/page.h> 25 #include <asm/tlb.h> 26 27 /* 28 * The address is saved in a radix tree directly; NULL would be ambiguous, 29 * since 0 is a valid address, and NULL is returned when nothing was found. 30 * The lower bits are ignored by all users of the macro, so it can be used 31 * to distinguish a valid address 0 from a NULL. 32 */ 33 #define VALID_GADDR_FLAG 1 34 #define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) 35 #define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) 36 37 #define GMAP_SHADOW_FAKE_TABLE 1ULL 38 39 static struct page *gmap_alloc_crst(void) 40 { 41 struct page *page; 42 43 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); 44 if (!page) 45 return NULL; 46 __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); 47 return page; 48 } 49 50 /** 51 * gmap_alloc - allocate and initialize a guest address space 52 * @limit: maximum address of the gmap address space 53 * 54 * Returns a guest address space structure. 55 */ 56 struct gmap *gmap_alloc(unsigned long limit) 57 { 58 struct gmap *gmap; 59 struct page *page; 60 unsigned long *table; 61 unsigned long etype, atype; 62 63 if (limit < _REGION3_SIZE) { 64 limit = _REGION3_SIZE - 1; 65 atype = _ASCE_TYPE_SEGMENT; 66 etype = _SEGMENT_ENTRY_EMPTY; 67 } else if (limit < _REGION2_SIZE) { 68 limit = _REGION2_SIZE - 1; 69 atype = _ASCE_TYPE_REGION3; 70 etype = _REGION3_ENTRY_EMPTY; 71 } else if (limit < _REGION1_SIZE) { 72 limit = _REGION1_SIZE - 1; 73 atype = _ASCE_TYPE_REGION2; 74 etype = _REGION2_ENTRY_EMPTY; 75 } else { 76 limit = -1UL; 77 atype = _ASCE_TYPE_REGION1; 78 etype = _REGION1_ENTRY_EMPTY; 79 } 80 gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); 81 if (!gmap) 82 goto out; 83 INIT_LIST_HEAD(&gmap->children); 84 INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); 85 INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); 86 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); 87 spin_lock_init(&gmap->guest_table_lock); 88 spin_lock_init(&gmap->shadow_lock); 89 refcount_set(&gmap->ref_count, 1); 90 page = gmap_alloc_crst(); 91 if (!page) 92 goto out_free; 93 table = page_to_virt(page); 94 crst_table_init(table, etype); 95 gmap->table = table; 96 gmap->asce = atype | _ASCE_TABLE_LENGTH | 97 _ASCE_USER_BITS | __pa(table); 98 gmap->asce_end = limit; 99 return gmap; 100 101 out_free: 102 kfree(gmap); 103 out: 104 return NULL; 105 } 106 EXPORT_SYMBOL_GPL(gmap_alloc); 107 108 /** 109 * gmap_create - create a guest address space 110 * @mm: pointer to the parent mm_struct 111 * @limit: maximum size of the gmap address space 112 * 113 * Returns a guest address space structure. 
114 */ 115 struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) 116 { 117 struct gmap *gmap; 118 unsigned long gmap_asce; 119 120 gmap = gmap_alloc(limit); 121 if (!gmap) 122 return NULL; 123 gmap->mm = mm; 124 spin_lock(&mm->context.lock); 125 list_add_rcu(&gmap->list, &mm->context.gmap_list); 126 if (list_is_singular(&mm->context.gmap_list)) 127 gmap_asce = gmap->asce; 128 else 129 gmap_asce = -1UL; 130 WRITE_ONCE(mm->context.gmap_asce, gmap_asce); 131 spin_unlock(&mm->context.lock); 132 return gmap; 133 } 134 EXPORT_SYMBOL_GPL(gmap_create); 135 136 static void gmap_flush_tlb(struct gmap *gmap) 137 { 138 if (MACHINE_HAS_IDTE) 139 __tlb_flush_idte(gmap->asce); 140 else 141 __tlb_flush_global(); 142 } 143 144 static void gmap_radix_tree_free(struct radix_tree_root *root) 145 { 146 struct radix_tree_iter iter; 147 unsigned long indices[16]; 148 unsigned long index; 149 void __rcu **slot; 150 int i, nr; 151 152 /* A radix tree is freed by deleting all of its entries */ 153 index = 0; 154 do { 155 nr = 0; 156 radix_tree_for_each_slot(slot, root, &iter, index) { 157 indices[nr] = iter.index; 158 if (++nr == 16) 159 break; 160 } 161 for (i = 0; i < nr; i++) { 162 index = indices[i]; 163 radix_tree_delete(root, index); 164 } 165 } while (nr > 0); 166 } 167 168 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) 169 { 170 struct gmap_rmap *rmap, *rnext, *head; 171 struct radix_tree_iter iter; 172 unsigned long indices[16]; 173 unsigned long index; 174 void __rcu **slot; 175 int i, nr; 176 177 /* A radix tree is freed by deleting all of its entries */ 178 index = 0; 179 do { 180 nr = 0; 181 radix_tree_for_each_slot(slot, root, &iter, index) { 182 indices[nr] = iter.index; 183 if (++nr == 16) 184 break; 185 } 186 for (i = 0; i < nr; i++) { 187 index = indices[i]; 188 head = radix_tree_delete(root, index); 189 gmap_for_each_rmap_safe(rmap, rnext, head) 190 kfree(rmap); 191 } 192 } while (nr > 0); 193 } 194 195 static void gmap_free_crst(unsigned long *table, bool free_ptes) 196 { 197 bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; 198 int i; 199 200 if (is_segment) { 201 if (!free_ptes) 202 goto out; 203 for (i = 0; i < _CRST_ENTRIES; i++) 204 if (!(table[i] & _SEGMENT_ENTRY_INVALID)) 205 page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); 206 } else { 207 for (i = 0; i < _CRST_ENTRIES; i++) 208 if (!(table[i] & _REGION_ENTRY_INVALID)) 209 gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); 210 } 211 212 out: 213 free_pages((unsigned long)table, CRST_ALLOC_ORDER); 214 } 215 216 /** 217 * gmap_free - free a guest address space 218 * @gmap: pointer to the guest address space structure 219 * 220 * No locks required. There are no references to this gmap anymore. 221 */ 222 void gmap_free(struct gmap *gmap) 223 { 224 /* Flush tlb of all gmaps (if not already done for shadows) */ 225 if (!(gmap_is_shadow(gmap) && gmap->removed)) 226 gmap_flush_tlb(gmap); 227 /* Free all segment & region tables. 
*/ 228 gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); 229 230 gmap_radix_tree_free(&gmap->guest_to_host); 231 gmap_radix_tree_free(&gmap->host_to_guest); 232 233 /* Free additional data for a shadow gmap */ 234 if (gmap_is_shadow(gmap)) { 235 gmap_rmap_radix_tree_free(&gmap->host_to_rmap); 236 /* Release reference to the parent */ 237 gmap_put(gmap->parent); 238 } 239 240 kfree(gmap); 241 } 242 EXPORT_SYMBOL_GPL(gmap_free); 243 244 /** 245 * gmap_get - increase reference counter for guest address space 246 * @gmap: pointer to the guest address space structure 247 * 248 * Returns the gmap pointer 249 */ 250 struct gmap *gmap_get(struct gmap *gmap) 251 { 252 refcount_inc(&gmap->ref_count); 253 return gmap; 254 } 255 EXPORT_SYMBOL_GPL(gmap_get); 256 257 /** 258 * gmap_put - decrease reference counter for guest address space 259 * @gmap: pointer to the guest address space structure 260 * 261 * If the reference counter reaches zero the guest address space is freed. 262 */ 263 void gmap_put(struct gmap *gmap) 264 { 265 if (refcount_dec_and_test(&gmap->ref_count)) 266 gmap_free(gmap); 267 } 268 EXPORT_SYMBOL_GPL(gmap_put); 269 270 /** 271 * gmap_remove - remove a guest address space but do not free it yet 272 * @gmap: pointer to the guest address space structure 273 */ 274 void gmap_remove(struct gmap *gmap) 275 { 276 struct gmap *sg, *next; 277 unsigned long gmap_asce; 278 279 /* Remove all shadow gmaps linked to this gmap */ 280 if (!list_empty(&gmap->children)) { 281 spin_lock(&gmap->shadow_lock); 282 list_for_each_entry_safe(sg, next, &gmap->children, list) { 283 list_del(&sg->list); 284 gmap_put(sg); 285 } 286 spin_unlock(&gmap->shadow_lock); 287 } 288 /* Remove gmap from the pre-mm list */ 289 spin_lock(&gmap->mm->context.lock); 290 list_del_rcu(&gmap->list); 291 if (list_empty(&gmap->mm->context.gmap_list)) 292 gmap_asce = 0; 293 else if (list_is_singular(&gmap->mm->context.gmap_list)) 294 gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, 295 struct gmap, list)->asce; 296 else 297 gmap_asce = -1UL; 298 WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); 299 spin_unlock(&gmap->mm->context.lock); 300 synchronize_rcu(); 301 /* Put reference */ 302 gmap_put(gmap); 303 } 304 EXPORT_SYMBOL_GPL(gmap_remove); 305 306 /* 307 * gmap_alloc_table is assumed to be called with mmap_lock held 308 */ 309 static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, 310 unsigned long init, unsigned long gaddr) 311 { 312 struct page *page; 313 unsigned long *new; 314 315 /* since we dont free the gmap table until gmap_free we can unlock */ 316 page = gmap_alloc_crst(); 317 if (!page) 318 return -ENOMEM; 319 new = page_to_virt(page); 320 crst_table_init(new, init); 321 spin_lock(&gmap->guest_table_lock); 322 if (*table & _REGION_ENTRY_INVALID) { 323 *table = __pa(new) | _REGION_ENTRY_LENGTH | 324 (*table & _REGION_ENTRY_TYPE_MASK); 325 page = NULL; 326 } 327 spin_unlock(&gmap->guest_table_lock); 328 if (page) 329 __free_pages(page, CRST_ALLOC_ORDER); 330 return 0; 331 } 332 333 static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) 334 { 335 return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); 336 } 337 338 static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) 339 { 340 return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); 341 } 342 343 static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, 344 unsigned long *gaddr) 345 { 346 *gaddr = 
host_to_guest_delete(gmap, vmaddr); 347 if (IS_GADDR_VALID(*gaddr)) 348 return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); 349 return NULL; 350 } 351 352 /** 353 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address 354 * @gmap: pointer to the guest address space structure 355 * @vmaddr: address in the host process address space 356 * 357 * Returns 1 if a TLB flush is required 358 */ 359 static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) 360 { 361 unsigned long gaddr; 362 int flush = 0; 363 pmd_t *pmdp; 364 365 BUG_ON(gmap_is_shadow(gmap)); 366 spin_lock(&gmap->guest_table_lock); 367 368 pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); 369 if (pmdp) { 370 flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); 371 *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); 372 } 373 374 spin_unlock(&gmap->guest_table_lock); 375 return flush; 376 } 377 378 /** 379 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address 380 * @gmap: pointer to the guest address space structure 381 * @gaddr: address in the guest address space 382 * 383 * Returns 1 if a TLB flush is required 384 */ 385 static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) 386 { 387 unsigned long vmaddr; 388 389 vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, 390 gaddr >> PMD_SHIFT); 391 return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; 392 } 393 394 /** 395 * gmap_unmap_segment - unmap segment from the guest address space 396 * @gmap: pointer to the guest address space structure 397 * @to: address in the guest address space 398 * @len: length of the memory area to unmap 399 * 400 * Returns 0 if the unmap succeeded, -EINVAL if not. 401 */ 402 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) 403 { 404 unsigned long off; 405 int flush; 406 407 BUG_ON(gmap_is_shadow(gmap)); 408 if ((to | len) & (PMD_SIZE - 1)) 409 return -EINVAL; 410 if (len == 0 || to + len < to) 411 return -EINVAL; 412 413 flush = 0; 414 mmap_write_lock(gmap->mm); 415 for (off = 0; off < len; off += PMD_SIZE) 416 flush |= __gmap_unmap_by_gaddr(gmap, to + off); 417 mmap_write_unlock(gmap->mm); 418 if (flush) 419 gmap_flush_tlb(gmap); 420 return 0; 421 } 422 EXPORT_SYMBOL_GPL(gmap_unmap_segment); 423 424 /** 425 * gmap_map_segment - map a segment to the guest address space 426 * @gmap: pointer to the guest address space structure 427 * @from: source address in the parent address space 428 * @to: target address in the guest address space 429 * @len: length of the memory area to map 430 * 431 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. 
432 */ 433 int gmap_map_segment(struct gmap *gmap, unsigned long from, 434 unsigned long to, unsigned long len) 435 { 436 unsigned long off; 437 int flush; 438 439 BUG_ON(gmap_is_shadow(gmap)); 440 if ((from | to | len) & (PMD_SIZE - 1)) 441 return -EINVAL; 442 if (len == 0 || from + len < from || to + len < to || 443 from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) 444 return -EINVAL; 445 446 flush = 0; 447 mmap_write_lock(gmap->mm); 448 for (off = 0; off < len; off += PMD_SIZE) { 449 /* Remove old translation */ 450 flush |= __gmap_unmap_by_gaddr(gmap, to + off); 451 /* Store new translation */ 452 if (radix_tree_insert(&gmap->guest_to_host, 453 (to + off) >> PMD_SHIFT, 454 (void *) from + off)) 455 break; 456 } 457 mmap_write_unlock(gmap->mm); 458 if (flush) 459 gmap_flush_tlb(gmap); 460 if (off >= len) 461 return 0; 462 gmap_unmap_segment(gmap, to, len); 463 return -ENOMEM; 464 } 465 EXPORT_SYMBOL_GPL(gmap_map_segment); 466 467 /** 468 * __gmap_translate - translate a guest address to a user space address 469 * @gmap: pointer to guest mapping meta data structure 470 * @gaddr: guest address 471 * 472 * Returns user space address which corresponds to the guest address or 473 * -EFAULT if no such mapping exists. 474 * This function does not establish potentially missing page table entries. 475 * The mmap_lock of the mm that belongs to the address space must be held 476 * when this function gets called. 477 * 478 * Note: Can also be called for shadow gmaps. 479 */ 480 unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) 481 { 482 unsigned long vmaddr; 483 484 vmaddr = (unsigned long) 485 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); 486 /* Note: guest_to_host is empty for a shadow gmap */ 487 return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; 488 } 489 EXPORT_SYMBOL_GPL(__gmap_translate); 490 491 /** 492 * gmap_unlink - disconnect a page table from the gmap shadow tables 493 * @mm: pointer to the parent mm_struct 494 * @table: pointer to the host page table 495 * @vmaddr: vm address associated with the host page table 496 */ 497 void gmap_unlink(struct mm_struct *mm, unsigned long *table, 498 unsigned long vmaddr) 499 { 500 struct gmap *gmap; 501 int flush; 502 503 rcu_read_lock(); 504 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 505 flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); 506 if (flush) 507 gmap_flush_tlb(gmap); 508 } 509 rcu_read_unlock(); 510 } 511 512 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, 513 unsigned long gaddr); 514 515 /** 516 * __gmap_link - set up shadow page tables to connect a host to a guest address 517 * @gmap: pointer to guest mapping meta data structure 518 * @gaddr: guest address 519 * @vmaddr: vm address 520 * 521 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT 522 * if the vm address is already mapped to a different guest segment. 523 * The mmap_lock of the mm that belongs to the address space must be held 524 * when this function gets called. 
525 */ 526 int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) 527 { 528 struct mm_struct *mm; 529 unsigned long *table; 530 spinlock_t *ptl; 531 pgd_t *pgd; 532 p4d_t *p4d; 533 pud_t *pud; 534 pmd_t *pmd; 535 u64 unprot; 536 int rc; 537 538 BUG_ON(gmap_is_shadow(gmap)); 539 /* Create higher level tables in the gmap page table */ 540 table = gmap->table; 541 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { 542 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; 543 if ((*table & _REGION_ENTRY_INVALID) && 544 gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, 545 gaddr & _REGION1_MASK)) 546 return -ENOMEM; 547 table = __va(*table & _REGION_ENTRY_ORIGIN); 548 } 549 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { 550 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; 551 if ((*table & _REGION_ENTRY_INVALID) && 552 gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, 553 gaddr & _REGION2_MASK)) 554 return -ENOMEM; 555 table = __va(*table & _REGION_ENTRY_ORIGIN); 556 } 557 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { 558 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; 559 if ((*table & _REGION_ENTRY_INVALID) && 560 gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, 561 gaddr & _REGION3_MASK)) 562 return -ENOMEM; 563 table = __va(*table & _REGION_ENTRY_ORIGIN); 564 } 565 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; 566 /* Walk the parent mm page table */ 567 mm = gmap->mm; 568 pgd = pgd_offset(mm, vmaddr); 569 VM_BUG_ON(pgd_none(*pgd)); 570 p4d = p4d_offset(pgd, vmaddr); 571 VM_BUG_ON(p4d_none(*p4d)); 572 pud = pud_offset(p4d, vmaddr); 573 VM_BUG_ON(pud_none(*pud)); 574 /* large puds cannot yet be handled */ 575 if (pud_leaf(*pud)) 576 return -EFAULT; 577 pmd = pmd_offset(pud, vmaddr); 578 VM_BUG_ON(pmd_none(*pmd)); 579 /* Are we allowed to use huge pages? */ 580 if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) 581 return -EFAULT; 582 /* Link gmap segment table entry location to page table. 
*/ 583 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); 584 if (rc) 585 return rc; 586 ptl = pmd_lock(mm, pmd); 587 spin_lock(&gmap->guest_table_lock); 588 if (*table == _SEGMENT_ENTRY_EMPTY) { 589 rc = radix_tree_insert(&gmap->host_to_guest, 590 vmaddr >> PMD_SHIFT, 591 (void *)MAKE_VALID_GADDR(gaddr)); 592 if (!rc) { 593 if (pmd_leaf(*pmd)) { 594 *table = (pmd_val(*pmd) & 595 _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) 596 | _SEGMENT_ENTRY_GMAP_UC 597 | _SEGMENT_ENTRY; 598 } else 599 *table = pmd_val(*pmd) & 600 _SEGMENT_ENTRY_HARDWARE_BITS; 601 } 602 } else if (*table & _SEGMENT_ENTRY_PROTECT && 603 !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { 604 unprot = (u64)*table; 605 unprot &= ~_SEGMENT_ENTRY_PROTECT; 606 unprot |= _SEGMENT_ENTRY_GMAP_UC; 607 gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); 608 } 609 spin_unlock(&gmap->guest_table_lock); 610 spin_unlock(ptl); 611 radix_tree_preload_end(); 612 return rc; 613 } 614 EXPORT_SYMBOL(__gmap_link); 615 616 /* 617 * this function is assumed to be called with mmap_lock held 618 */ 619 void __gmap_zap(struct gmap *gmap, unsigned long gaddr) 620 { 621 struct vm_area_struct *vma; 622 unsigned long vmaddr; 623 spinlock_t *ptl; 624 pte_t *ptep; 625 626 /* Find the vm address for the guest address */ 627 vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, 628 gaddr >> PMD_SHIFT); 629 if (vmaddr) { 630 vmaddr |= gaddr & ~PMD_MASK; 631 632 vma = vma_lookup(gmap->mm, vmaddr); 633 if (!vma || is_vm_hugetlb_page(vma)) 634 return; 635 636 /* Get pointer to the page table entry */ 637 ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); 638 if (likely(ptep)) { 639 ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); 640 pte_unmap_unlock(ptep, ptl); 641 } 642 } 643 } 644 EXPORT_SYMBOL_GPL(__gmap_zap); 645 646 void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) 647 { 648 unsigned long gaddr, vmaddr, size; 649 struct vm_area_struct *vma; 650 651 mmap_read_lock(gmap->mm); 652 for (gaddr = from; gaddr < to; 653 gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { 654 /* Find the vm address for the guest address */ 655 vmaddr = (unsigned long) 656 radix_tree_lookup(&gmap->guest_to_host, 657 gaddr >> PMD_SHIFT); 658 if (!vmaddr) 659 continue; 660 vmaddr |= gaddr & ~PMD_MASK; 661 /* Find vma in the parent mm */ 662 vma = find_vma(gmap->mm, vmaddr); 663 if (!vma) 664 continue; 665 /* 666 * We do not discard pages that are backed by 667 * hugetlbfs, so we don't have to refault them. 
668 */ 669 if (is_vm_hugetlb_page(vma)) 670 continue; 671 size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); 672 zap_page_range_single(vma, vmaddr, size, NULL); 673 } 674 mmap_read_unlock(gmap->mm); 675 } 676 EXPORT_SYMBOL_GPL(gmap_discard); 677 678 static LIST_HEAD(gmap_notifier_list); 679 static DEFINE_SPINLOCK(gmap_notifier_lock); 680 681 /** 682 * gmap_register_pte_notifier - register a pte invalidation callback 683 * @nb: pointer to the gmap notifier block 684 */ 685 void gmap_register_pte_notifier(struct gmap_notifier *nb) 686 { 687 spin_lock(&gmap_notifier_lock); 688 list_add_rcu(&nb->list, &gmap_notifier_list); 689 spin_unlock(&gmap_notifier_lock); 690 } 691 EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); 692 693 /** 694 * gmap_unregister_pte_notifier - remove a pte invalidation callback 695 * @nb: pointer to the gmap notifier block 696 */ 697 void gmap_unregister_pte_notifier(struct gmap_notifier *nb) 698 { 699 spin_lock(&gmap_notifier_lock); 700 list_del_rcu(&nb->list); 701 spin_unlock(&gmap_notifier_lock); 702 synchronize_rcu(); 703 } 704 EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); 705 706 /** 707 * gmap_call_notifier - call all registered invalidation callbacks 708 * @gmap: pointer to guest mapping meta data structure 709 * @start: start virtual address in the guest address space 710 * @end: end virtual address in the guest address space 711 */ 712 static void gmap_call_notifier(struct gmap *gmap, unsigned long start, 713 unsigned long end) 714 { 715 struct gmap_notifier *nb; 716 717 list_for_each_entry(nb, &gmap_notifier_list, list) 718 nb->notifier_call(gmap, start, end); 719 } 720 721 /** 722 * gmap_table_walk - walk the gmap page tables 723 * @gmap: pointer to guest mapping meta data structure 724 * @gaddr: virtual address in the guest address space 725 * @level: page table level to stop at 726 * 727 * Returns a table entry pointer for the given guest address and @level 728 * @level=0 : returns a pointer to a page table table entry (or NULL) 729 * @level=1 : returns a pointer to a segment table entry (or NULL) 730 * @level=2 : returns a pointer to a region-3 table entry (or NULL) 731 * @level=3 : returns a pointer to a region-2 table entry (or NULL) 732 * @level=4 : returns a pointer to a region-1 table entry (or NULL) 733 * 734 * Returns NULL if the gmap page tables could not be walked to the 735 * requested level. 736 * 737 * Note: Can also be called for shadow gmaps. 
738 */ 739 unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) 740 { 741 const int asce_type = gmap->asce & _ASCE_TYPE_MASK; 742 unsigned long *table = gmap->table; 743 744 if (gmap_is_shadow(gmap) && gmap->removed) 745 return NULL; 746 747 if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) 748 return NULL; 749 750 if (asce_type != _ASCE_TYPE_REGION1 && 751 gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) 752 return NULL; 753 754 switch (asce_type) { 755 case _ASCE_TYPE_REGION1: 756 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; 757 if (level == 4) 758 break; 759 if (*table & _REGION_ENTRY_INVALID) 760 return NULL; 761 table = __va(*table & _REGION_ENTRY_ORIGIN); 762 fallthrough; 763 case _ASCE_TYPE_REGION2: 764 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; 765 if (level == 3) 766 break; 767 if (*table & _REGION_ENTRY_INVALID) 768 return NULL; 769 table = __va(*table & _REGION_ENTRY_ORIGIN); 770 fallthrough; 771 case _ASCE_TYPE_REGION3: 772 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; 773 if (level == 2) 774 break; 775 if (*table & _REGION_ENTRY_INVALID) 776 return NULL; 777 table = __va(*table & _REGION_ENTRY_ORIGIN); 778 fallthrough; 779 case _ASCE_TYPE_SEGMENT: 780 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; 781 if (level == 1) 782 break; 783 if (*table & _REGION_ENTRY_INVALID) 784 return NULL; 785 table = __va(*table & _SEGMENT_ENTRY_ORIGIN); 786 table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; 787 } 788 return table; 789 } 790 EXPORT_SYMBOL(gmap_table_walk); 791 792 /** 793 * gmap_pte_op_walk - walk the gmap page table, get the page table lock 794 * and return the pte pointer 795 * @gmap: pointer to guest mapping meta data structure 796 * @gaddr: virtual address in the guest address space 797 * @ptl: pointer to the spinlock pointer 798 * 799 * Returns a pointer to the locked pte for a guest address, or NULL 800 */ 801 static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, 802 spinlock_t **ptl) 803 { 804 unsigned long *table; 805 806 BUG_ON(gmap_is_shadow(gmap)); 807 /* Walk the gmap page table, lock and get pte pointer */ 808 table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ 809 if (!table || *table & _SEGMENT_ENTRY_INVALID) 810 return NULL; 811 return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); 812 } 813 814 /** 815 * gmap_pte_op_fixup - force a page in and connect the gmap page table 816 * @gmap: pointer to guest mapping meta data structure 817 * @gaddr: virtual address in the guest address space 818 * @vmaddr: address in the host process address space 819 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 820 * 821 * Returns 0 if the caller can retry __gmap_translate (might fail again), 822 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing 823 * up or connecting the gmap page table. 824 */ 825 static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, 826 unsigned long vmaddr, int prot) 827 { 828 struct mm_struct *mm = gmap->mm; 829 unsigned int fault_flags; 830 bool unlocked = false; 831 832 BUG_ON(gmap_is_shadow(gmap)); 833 fault_flags = (prot == PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; 834 if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) 835 return -EFAULT; 836 if (unlocked) 837 /* lost mmap_lock, caller has to retry __gmap_translate */ 838 return 0; 839 /* Connect the page tables */ 840 return __gmap_link(gmap, gaddr, vmaddr); 841 } 842 843 /** 844 * gmap_pte_op_end - release the page table lock 845 * @ptep: pointer to the locked pte 846 * @ptl: pointer to the page table spinlock 847 */ 848 static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) 849 { 850 pte_unmap_unlock(ptep, ptl); 851 } 852 853 /** 854 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock 855 * and return the pmd pointer 856 * @gmap: pointer to guest mapping meta data structure 857 * @gaddr: virtual address in the guest address space 858 * 859 * Returns a pointer to the pmd for a guest address, or NULL 860 */ 861 static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) 862 { 863 pmd_t *pmdp; 864 865 BUG_ON(gmap_is_shadow(gmap)); 866 pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); 867 if (!pmdp) 868 return NULL; 869 870 /* without huge pages, there is no need to take the table lock */ 871 if (!gmap->mm->context.allow_gmap_hpage_1m) 872 return pmd_none(*pmdp) ? NULL : pmdp; 873 874 spin_lock(&gmap->guest_table_lock); 875 if (pmd_none(*pmdp)) { 876 spin_unlock(&gmap->guest_table_lock); 877 return NULL; 878 } 879 880 /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ 881 if (!pmd_leaf(*pmdp)) 882 spin_unlock(&gmap->guest_table_lock); 883 return pmdp; 884 } 885 886 /** 887 * gmap_pmd_op_end - release the guest_table_lock if needed 888 * @gmap: pointer to the guest mapping meta data structure 889 * @pmdp: pointer to the pmd 890 */ 891 static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) 892 { 893 if (pmd_leaf(*pmdp)) 894 spin_unlock(&gmap->guest_table_lock); 895 } 896 897 /* 898 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits 899 * @pmdp: pointer to the pmd to be protected 900 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 901 * @bits: notification bits to set 902 * 903 * Returns: 904 * 0 if successfully protected 905 * -EAGAIN if a fixup is needed 906 * -EINVAL if unsupported notifier bits have been specified 907 * 908 * Expected to be called with sg->mm->mmap_lock in read and 909 * guest_table_lock held. 
910 */ 911 static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, 912 pmd_t *pmdp, int prot, unsigned long bits) 913 { 914 int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; 915 int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; 916 pmd_t new = *pmdp; 917 918 /* Fixup needed */ 919 if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) 920 return -EAGAIN; 921 922 if (prot == PROT_NONE && !pmd_i) { 923 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); 924 gmap_pmdp_xchg(gmap, pmdp, new, gaddr); 925 } 926 927 if (prot == PROT_READ && !pmd_p) { 928 new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); 929 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); 930 gmap_pmdp_xchg(gmap, pmdp, new, gaddr); 931 } 932 933 if (bits & GMAP_NOTIFY_MPROT) 934 set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); 935 936 /* Shadow GMAP protection needs split PMDs */ 937 if (bits & GMAP_NOTIFY_SHADOW) 938 return -EINVAL; 939 940 return 0; 941 } 942 943 /* 944 * gmap_protect_pte - remove access rights to memory and set pgste bits 945 * @gmap: pointer to guest mapping meta data structure 946 * @gaddr: virtual address in the guest address space 947 * @pmdp: pointer to the pmd associated with the pte 948 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 949 * @bits: notification bits to set 950 * 951 * Returns 0 if successfully protected, -ENOMEM if out of memory and 952 * -EAGAIN if a fixup is needed. 953 * 954 * Expected to be called with sg->mm->mmap_lock in read 955 */ 956 static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, 957 pmd_t *pmdp, int prot, unsigned long bits) 958 { 959 int rc; 960 pte_t *ptep; 961 spinlock_t *ptl; 962 unsigned long pbits = 0; 963 964 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) 965 return -EAGAIN; 966 967 ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); 968 if (!ptep) 969 return -ENOMEM; 970 971 pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; 972 pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; 973 /* Protect and unlock. */ 974 rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); 975 gmap_pte_op_end(ptep, ptl); 976 return rc; 977 } 978 979 /* 980 * gmap_protect_range - remove access rights to memory and set pgste bits 981 * @gmap: pointer to guest mapping meta data structure 982 * @gaddr: virtual address in the guest address space 983 * @len: size of area 984 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 985 * @bits: pgste notification bits to set 986 * 987 * Returns: 988 * PAGE_SIZE if a small page was successfully protected; 989 * HPAGE_SIZE if a large page was successfully protected; 990 * -ENOMEM if out of memory; 991 * -EFAULT if gaddr is invalid (or mapping for shadows is missing); 992 * -EAGAIN if the guest mapping is missing and should be fixed by the caller. 993 * 994 * Context: Called with sg->mm->mmap_lock in read. 
995 */ 996 int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) 997 { 998 pmd_t *pmdp; 999 int rc = 0; 1000 1001 BUG_ON(gmap_is_shadow(gmap)); 1002 1003 pmdp = gmap_pmd_op_walk(gmap, gaddr); 1004 if (!pmdp) 1005 return -EAGAIN; 1006 1007 if (!pmd_leaf(*pmdp)) { 1008 rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); 1009 if (!rc) 1010 rc = PAGE_SIZE; 1011 } else { 1012 rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); 1013 if (!rc) 1014 rc = HPAGE_SIZE; 1015 } 1016 gmap_pmd_op_end(gmap, pmdp); 1017 1018 return rc; 1019 } 1020 EXPORT_SYMBOL_GPL(gmap_protect_one); 1021 1022 /** 1023 * gmap_read_table - get an unsigned long value from a guest page table using 1024 * absolute addressing, without marking the page referenced. 1025 * @gmap: pointer to guest mapping meta data structure 1026 * @gaddr: virtual address in the guest address space 1027 * @val: pointer to the unsigned long value to return 1028 * 1029 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT 1030 * if reading using the virtual address failed. -EINVAL if called on a gmap 1031 * shadow. 1032 * 1033 * Called with gmap->mm->mmap_lock in read. 1034 */ 1035 int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) 1036 { 1037 unsigned long address, vmaddr; 1038 spinlock_t *ptl; 1039 pte_t *ptep, pte; 1040 int rc; 1041 1042 if (gmap_is_shadow(gmap)) 1043 return -EINVAL; 1044 1045 while (1) { 1046 rc = -EAGAIN; 1047 ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); 1048 if (ptep) { 1049 pte = *ptep; 1050 if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { 1051 address = pte_val(pte) & PAGE_MASK; 1052 address += gaddr & ~PAGE_MASK; 1053 *val = *(unsigned long *)__va(address); 1054 set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); 1055 /* Do *NOT* clear the _PAGE_INVALID bit! 
*/ 1056 rc = 0; 1057 } 1058 gmap_pte_op_end(ptep, ptl); 1059 } 1060 if (!rc) 1061 break; 1062 vmaddr = __gmap_translate(gmap, gaddr); 1063 if (IS_ERR_VALUE(vmaddr)) { 1064 rc = vmaddr; 1065 break; 1066 } 1067 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); 1068 if (rc) 1069 break; 1070 } 1071 return rc; 1072 } 1073 EXPORT_SYMBOL_GPL(gmap_read_table); 1074 1075 /** 1076 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree 1077 * @sg: pointer to the shadow guest address space structure 1078 * @vmaddr: vm address associated with the rmap 1079 * @rmap: pointer to the rmap structure 1080 * 1081 * Called with the sg->guest_table_lock 1082 */ 1083 static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, 1084 struct gmap_rmap *rmap) 1085 { 1086 struct gmap_rmap *temp; 1087 void __rcu **slot; 1088 1089 BUG_ON(!gmap_is_shadow(sg)); 1090 slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); 1091 if (slot) { 1092 rmap->next = radix_tree_deref_slot_protected(slot, 1093 &sg->guest_table_lock); 1094 for (temp = rmap->next; temp; temp = temp->next) { 1095 if (temp->raddr == rmap->raddr) { 1096 kfree(rmap); 1097 return; 1098 } 1099 } 1100 radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); 1101 } else { 1102 rmap->next = NULL; 1103 radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, 1104 rmap); 1105 } 1106 } 1107 1108 /** 1109 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap 1110 * @sg: pointer to the shadow guest address space structure 1111 * @raddr: rmap address in the shadow gmap 1112 * @paddr: address in the parent guest address space 1113 * @len: length of the memory area to protect 1114 * 1115 * Returns 0 if successfully protected and the rmap was created, -ENOMEM 1116 * if out of memory and -EFAULT if paddr is invalid. 
1117 */ 1118 static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, 1119 unsigned long paddr, unsigned long len) 1120 { 1121 struct gmap *parent; 1122 struct gmap_rmap *rmap; 1123 unsigned long vmaddr; 1124 spinlock_t *ptl; 1125 pte_t *ptep; 1126 int rc; 1127 1128 BUG_ON(!gmap_is_shadow(sg)); 1129 parent = sg->parent; 1130 while (len) { 1131 vmaddr = __gmap_translate(parent, paddr); 1132 if (IS_ERR_VALUE(vmaddr)) 1133 return vmaddr; 1134 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); 1135 if (!rmap) 1136 return -ENOMEM; 1137 rmap->raddr = raddr; 1138 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); 1139 if (rc) { 1140 kfree(rmap); 1141 return rc; 1142 } 1143 rc = -EAGAIN; 1144 ptep = gmap_pte_op_walk(parent, paddr, &ptl); 1145 if (ptep) { 1146 spin_lock(&sg->guest_table_lock); 1147 rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, 1148 PGSTE_VSIE_BIT); 1149 if (!rc) 1150 gmap_insert_rmap(sg, vmaddr, rmap); 1151 spin_unlock(&sg->guest_table_lock); 1152 gmap_pte_op_end(ptep, ptl); 1153 } 1154 radix_tree_preload_end(); 1155 if (rc) { 1156 kfree(rmap); 1157 rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); 1158 if (rc) 1159 return rc; 1160 continue; 1161 } 1162 paddr += PAGE_SIZE; 1163 len -= PAGE_SIZE; 1164 } 1165 return 0; 1166 } 1167 1168 #define _SHADOW_RMAP_MASK 0x7 1169 #define _SHADOW_RMAP_REGION1 0x5 1170 #define _SHADOW_RMAP_REGION2 0x4 1171 #define _SHADOW_RMAP_REGION3 0x3 1172 #define _SHADOW_RMAP_SEGMENT 0x2 1173 #define _SHADOW_RMAP_PGTABLE 0x1 1174 1175 /** 1176 * gmap_idte_one - invalidate a single region or segment table entry 1177 * @asce: region or segment table *origin* + table-type bits 1178 * @vaddr: virtual address to identify the table entry to flush 1179 * 1180 * The invalid bit of a single region or segment table entry is set 1181 * and the associated TLB entries depending on the entry are flushed. 1182 * The table-type of the @asce identifies the portion of the @vaddr 1183 * that is used as the invalidation index. 
1184 */ 1185 static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) 1186 { 1187 asm volatile( 1188 " idte %0,0,%1" 1189 : : "a" (asce), "a" (vaddr) : "cc", "memory"); 1190 } 1191 1192 /** 1193 * gmap_unshadow_page - remove a page from a shadow page table 1194 * @sg: pointer to the shadow guest address space structure 1195 * @raddr: rmap address in the shadow guest address space 1196 * 1197 * Called with the sg->guest_table_lock 1198 */ 1199 static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) 1200 { 1201 unsigned long *table; 1202 1203 BUG_ON(!gmap_is_shadow(sg)); 1204 table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ 1205 if (!table || *table & _PAGE_INVALID) 1206 return; 1207 gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); 1208 ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); 1209 } 1210 1211 /** 1212 * __gmap_unshadow_pgt - remove all entries from a shadow page table 1213 * @sg: pointer to the shadow guest address space structure 1214 * @raddr: rmap address in the shadow guest address space 1215 * @pgt: pointer to the start of a shadow page table 1216 * 1217 * Called with the sg->guest_table_lock 1218 */ 1219 static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, 1220 unsigned long *pgt) 1221 { 1222 int i; 1223 1224 BUG_ON(!gmap_is_shadow(sg)); 1225 for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) 1226 pgt[i] = _PAGE_INVALID; 1227 } 1228 1229 /** 1230 * gmap_unshadow_pgt - remove a shadow page table from a segment entry 1231 * @sg: pointer to the shadow guest address space structure 1232 * @raddr: address in the shadow guest address space 1233 * 1234 * Called with the sg->guest_table_lock 1235 */ 1236 static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) 1237 { 1238 unsigned long *ste; 1239 phys_addr_t sto, pgt; 1240 struct ptdesc *ptdesc; 1241 1242 BUG_ON(!gmap_is_shadow(sg)); 1243 ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ 1244 if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) 1245 return; 1246 gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); 1247 sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); 1248 gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); 1249 pgt = *ste & _SEGMENT_ENTRY_ORIGIN; 1250 *ste = _SEGMENT_ENTRY_EMPTY; 1251 __gmap_unshadow_pgt(sg, raddr, __va(pgt)); 1252 /* Free page table */ 1253 ptdesc = page_ptdesc(phys_to_page(pgt)); 1254 page_table_free_pgste(ptdesc); 1255 } 1256 1257 /** 1258 * __gmap_unshadow_sgt - remove all entries from a shadow segment table 1259 * @sg: pointer to the shadow guest address space structure 1260 * @raddr: rmap address in the shadow guest address space 1261 * @sgt: pointer to the start of a shadow segment table 1262 * 1263 * Called with the sg->guest_table_lock 1264 */ 1265 static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, 1266 unsigned long *sgt) 1267 { 1268 struct ptdesc *ptdesc; 1269 phys_addr_t pgt; 1270 int i; 1271 1272 BUG_ON(!gmap_is_shadow(sg)); 1273 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { 1274 if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) 1275 continue; 1276 pgt = sgt[i] & _REGION_ENTRY_ORIGIN; 1277 sgt[i] = _SEGMENT_ENTRY_EMPTY; 1278 __gmap_unshadow_pgt(sg, raddr, __va(pgt)); 1279 /* Free page table */ 1280 ptdesc = page_ptdesc(phys_to_page(pgt)); 1281 page_table_free_pgste(ptdesc); 1282 } 1283 } 1284 1285 /** 1286 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry 1287 * @sg: pointer to the shadow guest address space structure 1288 * @raddr: 
rmap address in the shadow guest address space 1289 * 1290 * Called with the shadow->guest_table_lock 1291 */ 1292 static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) 1293 { 1294 unsigned long r3o, *r3e; 1295 phys_addr_t sgt; 1296 struct page *page; 1297 1298 BUG_ON(!gmap_is_shadow(sg)); 1299 r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ 1300 if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) 1301 return; 1302 gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); 1303 r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); 1304 gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); 1305 sgt = *r3e & _REGION_ENTRY_ORIGIN; 1306 *r3e = _REGION3_ENTRY_EMPTY; 1307 __gmap_unshadow_sgt(sg, raddr, __va(sgt)); 1308 /* Free segment table */ 1309 page = phys_to_page(sgt); 1310 __free_pages(page, CRST_ALLOC_ORDER); 1311 } 1312 1313 /** 1314 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table 1315 * @sg: pointer to the shadow guest address space structure 1316 * @raddr: address in the shadow guest address space 1317 * @r3t: pointer to the start of a shadow region-3 table 1318 * 1319 * Called with the sg->guest_table_lock 1320 */ 1321 static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, 1322 unsigned long *r3t) 1323 { 1324 struct page *page; 1325 phys_addr_t sgt; 1326 int i; 1327 1328 BUG_ON(!gmap_is_shadow(sg)); 1329 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { 1330 if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) 1331 continue; 1332 sgt = r3t[i] & _REGION_ENTRY_ORIGIN; 1333 r3t[i] = _REGION3_ENTRY_EMPTY; 1334 __gmap_unshadow_sgt(sg, raddr, __va(sgt)); 1335 /* Free segment table */ 1336 page = phys_to_page(sgt); 1337 __free_pages(page, CRST_ALLOC_ORDER); 1338 } 1339 } 1340 1341 /** 1342 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry 1343 * @sg: pointer to the shadow guest address space structure 1344 * @raddr: rmap address in the shadow guest address space 1345 * 1346 * Called with the sg->guest_table_lock 1347 */ 1348 static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) 1349 { 1350 unsigned long r2o, *r2e; 1351 phys_addr_t r3t; 1352 struct page *page; 1353 1354 BUG_ON(!gmap_is_shadow(sg)); 1355 r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ 1356 if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) 1357 return; 1358 gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); 1359 r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); 1360 gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); 1361 r3t = *r2e & _REGION_ENTRY_ORIGIN; 1362 *r2e = _REGION2_ENTRY_EMPTY; 1363 __gmap_unshadow_r3t(sg, raddr, __va(r3t)); 1364 /* Free region 3 table */ 1365 page = phys_to_page(r3t); 1366 __free_pages(page, CRST_ALLOC_ORDER); 1367 } 1368 1369 /** 1370 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table 1371 * @sg: pointer to the shadow guest address space structure 1372 * @raddr: rmap address in the shadow guest address space 1373 * @r2t: pointer to the start of a shadow region-2 table 1374 * 1375 * Called with the sg->guest_table_lock 1376 */ 1377 static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, 1378 unsigned long *r2t) 1379 { 1380 phys_addr_t r3t; 1381 struct page *page; 1382 int i; 1383 1384 BUG_ON(!gmap_is_shadow(sg)); 1385 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { 1386 if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) 1387 continue; 1388 r3t = r2t[i] & _REGION_ENTRY_ORIGIN; 1389 r2t[i] = 
_REGION2_ENTRY_EMPTY; 1390 __gmap_unshadow_r3t(sg, raddr, __va(r3t)); 1391 /* Free region 3 table */ 1392 page = phys_to_page(r3t); 1393 __free_pages(page, CRST_ALLOC_ORDER); 1394 } 1395 } 1396 1397 /** 1398 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry 1399 * @sg: pointer to the shadow guest address space structure 1400 * @raddr: rmap address in the shadow guest address space 1401 * 1402 * Called with the sg->guest_table_lock 1403 */ 1404 static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) 1405 { 1406 unsigned long r1o, *r1e; 1407 struct page *page; 1408 phys_addr_t r2t; 1409 1410 BUG_ON(!gmap_is_shadow(sg)); 1411 r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ 1412 if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) 1413 return; 1414 gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); 1415 r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); 1416 gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); 1417 r2t = *r1e & _REGION_ENTRY_ORIGIN; 1418 *r1e = _REGION1_ENTRY_EMPTY; 1419 __gmap_unshadow_r2t(sg, raddr, __va(r2t)); 1420 /* Free region 2 table */ 1421 page = phys_to_page(r2t); 1422 __free_pages(page, CRST_ALLOC_ORDER); 1423 } 1424 1425 /** 1426 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table 1427 * @sg: pointer to the shadow guest address space structure 1428 * @raddr: rmap address in the shadow guest address space 1429 * @r1t: pointer to the start of a shadow region-1 table 1430 * 1431 * Called with the shadow->guest_table_lock 1432 */ 1433 static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, 1434 unsigned long *r1t) 1435 { 1436 unsigned long asce; 1437 struct page *page; 1438 phys_addr_t r2t; 1439 int i; 1440 1441 BUG_ON(!gmap_is_shadow(sg)); 1442 asce = __pa(r1t) | _ASCE_TYPE_REGION1; 1443 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { 1444 if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) 1445 continue; 1446 r2t = r1t[i] & _REGION_ENTRY_ORIGIN; 1447 __gmap_unshadow_r2t(sg, raddr, __va(r2t)); 1448 /* Clear entry and flush translation r1t -> r2t */ 1449 gmap_idte_one(asce, raddr); 1450 r1t[i] = _REGION1_ENTRY_EMPTY; 1451 /* Free region 2 table */ 1452 page = phys_to_page(r2t); 1453 __free_pages(page, CRST_ALLOC_ORDER); 1454 } 1455 } 1456 1457 /** 1458 * gmap_unshadow - remove a shadow page table completely 1459 * @sg: pointer to the shadow guest address space structure 1460 * 1461 * Called with sg->guest_table_lock 1462 */ 1463 void gmap_unshadow(struct gmap *sg) 1464 { 1465 unsigned long *table; 1466 1467 BUG_ON(!gmap_is_shadow(sg)); 1468 if (sg->removed) 1469 return; 1470 sg->removed = 1; 1471 gmap_call_notifier(sg, 0, -1UL); 1472 gmap_flush_tlb(sg); 1473 table = __va(sg->asce & _ASCE_ORIGIN); 1474 switch (sg->asce & _ASCE_TYPE_MASK) { 1475 case _ASCE_TYPE_REGION1: 1476 __gmap_unshadow_r1t(sg, 0, table); 1477 break; 1478 case _ASCE_TYPE_REGION2: 1479 __gmap_unshadow_r2t(sg, 0, table); 1480 break; 1481 case _ASCE_TYPE_REGION3: 1482 __gmap_unshadow_r3t(sg, 0, table); 1483 break; 1484 case _ASCE_TYPE_SEGMENT: 1485 __gmap_unshadow_sgt(sg, 0, table); 1486 break; 1487 } 1488 } 1489 EXPORT_SYMBOL(gmap_unshadow); 1490 1491 /** 1492 * gmap_shadow_r2t - create an empty shadow region 2 table 1493 * @sg: pointer to the shadow guest address space structure 1494 * @saddr: faulting address in the shadow gmap 1495 * @r2t: parent gmap address of the region 2 table to get shadowed 1496 * @fake: r2t references contiguous guest memory block, not a r2t 1497 * 1498 * The r2t 
parameter specifies the address of the source table. The 1499 * four pages of the source table are made read-only in the parent gmap 1500 * address space. A write to the source table area @r2t will automatically 1501 * remove the shadow r2 table and all of its descendants. 1502 * 1503 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the 1504 * shadow table structure is incomplete, -ENOMEM if out of memory and 1505 * -EFAULT if an address in the parent gmap could not be resolved. 1506 * 1507 * Called with sg->mm->mmap_lock in read. 1508 */ 1509 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, 1510 int fake) 1511 { 1512 unsigned long raddr, origin, offset, len; 1513 unsigned long *table; 1514 phys_addr_t s_r2t; 1515 struct page *page; 1516 int rc; 1517 1518 BUG_ON(!gmap_is_shadow(sg)); 1519 /* Allocate a shadow region second table */ 1520 page = gmap_alloc_crst(); 1521 if (!page) 1522 return -ENOMEM; 1523 s_r2t = page_to_phys(page); 1524 /* Install shadow region second table */ 1525 spin_lock(&sg->guest_table_lock); 1526 table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ 1527 if (!table) { 1528 rc = -EAGAIN; /* Race with unshadow */ 1529 goto out_free; 1530 } 1531 if (!(*table & _REGION_ENTRY_INVALID)) { 1532 rc = 0; /* Already established */ 1533 goto out_free; 1534 } else if (*table & _REGION_ENTRY_ORIGIN) { 1535 rc = -EAGAIN; /* Race with shadow */ 1536 goto out_free; 1537 } 1538 crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); 1539 /* mark as invalid as long as the parent table is not protected */ 1540 *table = s_r2t | _REGION_ENTRY_LENGTH | 1541 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; 1542 if (sg->edat_level >= 1) 1543 *table |= (r2t & _REGION_ENTRY_PROTECT); 1544 if (fake) { 1545 /* nothing to protect for fake tables */ 1546 *table &= ~_REGION_ENTRY_INVALID; 1547 spin_unlock(&sg->guest_table_lock); 1548 return 0; 1549 } 1550 spin_unlock(&sg->guest_table_lock); 1551 /* Make r2t read-only in parent gmap page table */ 1552 raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; 1553 origin = r2t & _REGION_ENTRY_ORIGIN; 1554 offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; 1555 len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; 1556 rc = gmap_protect_rmap(sg, raddr, origin + offset, len); 1557 spin_lock(&sg->guest_table_lock); 1558 if (!rc) { 1559 table = gmap_table_walk(sg, saddr, 4); 1560 if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) 1561 rc = -EAGAIN; /* Race with unshadow */ 1562 else 1563 *table &= ~_REGION_ENTRY_INVALID; 1564 } else { 1565 gmap_unshadow_r2t(sg, raddr); 1566 } 1567 spin_unlock(&sg->guest_table_lock); 1568 return rc; 1569 out_free: 1570 spin_unlock(&sg->guest_table_lock); 1571 __free_pages(page, CRST_ALLOC_ORDER); 1572 return rc; 1573 } 1574 EXPORT_SYMBOL_GPL(gmap_shadow_r2t); 1575 1576 /** 1577 * gmap_shadow_r3t - create a shadow region 3 table 1578 * @sg: pointer to the shadow guest address space structure 1579 * @saddr: faulting address in the shadow gmap 1580 * @r3t: parent gmap address of the region 3 table to get shadowed 1581 * @fake: r3t references contiguous guest memory block, not a r3t 1582 * 1583 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the 1584 * shadow table structure is incomplete, -ENOMEM if out of memory and 1585 * -EFAULT if an address in the parent gmap could not be resolved. 1586 * 1587 * Called with sg->mm->mmap_lock in read. 
1588 */ 1589 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, 1590 int fake) 1591 { 1592 unsigned long raddr, origin, offset, len; 1593 unsigned long *table; 1594 phys_addr_t s_r3t; 1595 struct page *page; 1596 int rc; 1597 1598 BUG_ON(!gmap_is_shadow(sg)); 1599 /* Allocate a shadow region second table */ 1600 page = gmap_alloc_crst(); 1601 if (!page) 1602 return -ENOMEM; 1603 s_r3t = page_to_phys(page); 1604 /* Install shadow region second table */ 1605 spin_lock(&sg->guest_table_lock); 1606 table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ 1607 if (!table) { 1608 rc = -EAGAIN; /* Race with unshadow */ 1609 goto out_free; 1610 } 1611 if (!(*table & _REGION_ENTRY_INVALID)) { 1612 rc = 0; /* Already established */ 1613 goto out_free; 1614 } else if (*table & _REGION_ENTRY_ORIGIN) { 1615 rc = -EAGAIN; /* Race with shadow */ 1616 goto out_free; 1617 } 1618 crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); 1619 /* mark as invalid as long as the parent table is not protected */ 1620 *table = s_r3t | _REGION_ENTRY_LENGTH | 1621 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; 1622 if (sg->edat_level >= 1) 1623 *table |= (r3t & _REGION_ENTRY_PROTECT); 1624 if (fake) { 1625 /* nothing to protect for fake tables */ 1626 *table &= ~_REGION_ENTRY_INVALID; 1627 spin_unlock(&sg->guest_table_lock); 1628 return 0; 1629 } 1630 spin_unlock(&sg->guest_table_lock); 1631 /* Make r3t read-only in parent gmap page table */ 1632 raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; 1633 origin = r3t & _REGION_ENTRY_ORIGIN; 1634 offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; 1635 len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; 1636 rc = gmap_protect_rmap(sg, raddr, origin + offset, len); 1637 spin_lock(&sg->guest_table_lock); 1638 if (!rc) { 1639 table = gmap_table_walk(sg, saddr, 3); 1640 if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) 1641 rc = -EAGAIN; /* Race with unshadow */ 1642 else 1643 *table &= ~_REGION_ENTRY_INVALID; 1644 } else { 1645 gmap_unshadow_r3t(sg, raddr); 1646 } 1647 spin_unlock(&sg->guest_table_lock); 1648 return rc; 1649 out_free: 1650 spin_unlock(&sg->guest_table_lock); 1651 __free_pages(page, CRST_ALLOC_ORDER); 1652 return rc; 1653 } 1654 EXPORT_SYMBOL_GPL(gmap_shadow_r3t); 1655 1656 /** 1657 * gmap_shadow_sgt - create a shadow segment table 1658 * @sg: pointer to the shadow guest address space structure 1659 * @saddr: faulting address in the shadow gmap 1660 * @sgt: parent gmap address of the segment table to get shadowed 1661 * @fake: sgt references contiguous guest memory block, not a sgt 1662 * 1663 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the 1664 * shadow table structure is incomplete, -ENOMEM if out of memory and 1665 * -EFAULT if an address in the parent gmap could not be resolved. 1666 * 1667 * Called with sg->mm->mmap_lock in read. 
 */
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
                    int fake)
{
        unsigned long raddr, origin, offset, len;
        unsigned long *table;
        phys_addr_t s_sgt;
        struct page *page;
        int rc;

        BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
        /* Allocate a shadow segment table */
        page = gmap_alloc_crst();
        if (!page)
                return -ENOMEM;
        s_sgt = page_to_phys(page);
        /* Install shadow segment table */
        spin_lock(&sg->guest_table_lock);
        table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
        if (!table) {
                rc = -EAGAIN;           /* Race with unshadow */
                goto out_free;
        }
        if (!(*table & _REGION_ENTRY_INVALID)) {
                rc = 0;                 /* Already established */
                goto out_free;
        } else if (*table & _REGION_ENTRY_ORIGIN) {
                rc = -EAGAIN;           /* Race with shadow */
                goto out_free;
        }
        crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
        /* mark as invalid as long as the parent table is not protected */
        *table = s_sgt | _REGION_ENTRY_LENGTH |
                 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
        if (sg->edat_level >= 1)
                *table |= sgt & _REGION_ENTRY_PROTECT;
        if (fake) {
                /* nothing to protect for fake tables */
                *table &= ~_REGION_ENTRY_INVALID;
                spin_unlock(&sg->guest_table_lock);
                return 0;
        }
        spin_unlock(&sg->guest_table_lock);
        /* Make sgt read-only in parent gmap page table */
        raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
        origin = sgt & _REGION_ENTRY_ORIGIN;
        offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
        len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
        rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 2);
                if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
                        rc = -EAGAIN;           /* Race with unshadow */
                else
                        *table &= ~_REGION_ENTRY_INVALID;
        } else {
                gmap_unshadow_sgt(sg, raddr);
        }
        spin_unlock(&sg->guest_table_lock);
        return rc;
out_free:
        spin_unlock(&sg->guest_table_lock);
        __free_pages(page, CRST_ALLOC_ORDER);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);

static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
{
        unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));

        pgstes += _PAGE_ENTRIES;

        pgstes[0] &= ~PGSTE_ST2_MASK;
        pgstes[1] &= ~PGSTE_ST2_MASK;
        pgstes[2] &= ~PGSTE_ST2_MASK;
        pgstes[3] &= ~PGSTE_ST2_MASK;

        pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
        pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
        pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
        pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
}

/**
 * gmap_shadow_pgt - instantiate a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @pgt: parent gmap address of the page table to get shadowed
 * @fake: pgt references contiguous guest memory block, not a pgtable
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
                    int fake)
{
        unsigned long raddr, origin;
        unsigned long *table;
        struct ptdesc *ptdesc;
        phys_addr_t s_pgt;
        int rc;

        BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
        /* Allocate a shadow page table */
        ptdesc = page_table_alloc_pgste(sg->mm);
        if (!ptdesc)
                return -ENOMEM;
        origin = pgt & _SEGMENT_ENTRY_ORIGIN;
        if (fake)
                origin |= GMAP_SHADOW_FAKE_TABLE;
        gmap_pgste_set_pgt_addr(ptdesc, origin);
        s_pgt = page_to_phys(ptdesc_page(ptdesc));
        /* Install shadow page table */
        spin_lock(&sg->guest_table_lock);
        table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
        if (!table) {
                rc = -EAGAIN;           /* Race with unshadow */
                goto out_free;
        }
        if (!(*table & _SEGMENT_ENTRY_INVALID)) {
                rc = 0;                 /* Already established */
                goto out_free;
        } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
                rc = -EAGAIN;           /* Race with shadow */
                goto out_free;
        }
        /* mark as invalid as long as the parent table is not protected */
        *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
                 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
        if (fake) {
                /* nothing to protect for fake tables */
                *table &= ~_SEGMENT_ENTRY_INVALID;
                spin_unlock(&sg->guest_table_lock);
                return 0;
        }
        spin_unlock(&sg->guest_table_lock);
        /* Make pgt read-only in parent gmap page table (not the pgste) */
        raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
        origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
        rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 1);
                if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
                        rc = -EAGAIN;           /* Race with unshadow */
                else
                        *table &= ~_SEGMENT_ENTRY_INVALID;
        } else {
                gmap_unshadow_pgt(sg, raddr);
        }
        spin_unlock(&sg->guest_table_lock);
        return rc;
out_free:
        spin_unlock(&sg->guest_table_lock);
        page_table_free_pgste(ptdesc);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_pgt);

/**
 * gmap_shadow_page - create a shadow page mapping
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @pte: pte in parent gmap address space to get shadowed
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
{
        struct gmap *parent;
        struct gmap_rmap *rmap;
        unsigned long vmaddr, paddr;
        spinlock_t *ptl;
        pte_t *sptep, *tptep;
        int prot;
        int rc;

        BUG_ON(!gmap_is_shadow(sg));
        parent = sg->parent;
        prot = (pte_val(pte) & _PAGE_PROTECT) ?
                PROT_READ : PROT_WRITE;

        rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
        if (!rmap)
                return -ENOMEM;
        rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;

        while (1) {
                paddr = pte_val(pte) & PAGE_MASK;
                vmaddr = __gmap_translate(parent, paddr);
                if (IS_ERR_VALUE(vmaddr)) {
                        rc = vmaddr;
                        break;
                }
                rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
                if (rc)
                        break;
                rc = -EAGAIN;
                sptep = gmap_pte_op_walk(parent, paddr, &ptl);
                if (sptep) {
                        spin_lock(&sg->guest_table_lock);
                        /* Get page table pointer */
                        tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
                        if (!tptep) {
                                spin_unlock(&sg->guest_table_lock);
                                gmap_pte_op_end(sptep, ptl);
                                radix_tree_preload_end();
                                break;
                        }
                        rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
                        if (rc > 0) {
                                /* Success and a new mapping */
                                gmap_insert_rmap(sg, vmaddr, rmap);
                                rmap = NULL;
                                rc = 0;
                        }
                        gmap_pte_op_end(sptep, ptl);
                        spin_unlock(&sg->guest_table_lock);
                }
                radix_tree_preload_end();
                if (!rc)
                        break;
                rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
                if (rc)
                        break;
        }
        kfree(rmap);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_page);

/*
 * gmap_shadow_notify - handle notifications for shadow gmap
 *
 * Called with sg->parent->shadow_lock.
 */
static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
                               unsigned long gaddr)
{
        struct gmap_rmap *rmap, *rnext, *head;
        unsigned long start, end, bits, raddr;

        BUG_ON(!gmap_is_shadow(sg));

        spin_lock(&sg->guest_table_lock);
        if (sg->removed) {
                spin_unlock(&sg->guest_table_lock);
                return;
        }
        /* Check for top level table */
        start = sg->orig_asce & _ASCE_ORIGIN;
        end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
        if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
            gaddr < end) {
                /* The complete shadow table has to go */
                gmap_unshadow(sg);
                spin_unlock(&sg->guest_table_lock);
                list_del(&sg->list);
                gmap_put(sg);
                return;
        }
        /* Remove the page table tree for one specific entry */
        head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
        gmap_for_each_rmap_safe(rmap, rnext, head) {
                bits = rmap->raddr & _SHADOW_RMAP_MASK;
                raddr = rmap->raddr ^ bits;
                switch (bits) {
                case _SHADOW_RMAP_REGION1:
                        gmap_unshadow_r2t(sg, raddr);
                        break;
                case _SHADOW_RMAP_REGION2:
                        gmap_unshadow_r3t(sg, raddr);
                        break;
                case _SHADOW_RMAP_REGION3:
                        gmap_unshadow_sgt(sg, raddr);
                        break;
                case _SHADOW_RMAP_SEGMENT:
                        gmap_unshadow_pgt(sg, raddr);
                        break;
                case _SHADOW_RMAP_PGTABLE:
                        gmap_unshadow_page(sg, raddr);
                        break;
                }
                kfree(rmap);
        }
        spin_unlock(&sg->guest_table_lock);
}

/**
 * ptep_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 * @bits: bits from the pgste that caused the notify call
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
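 *
 * For orientation only (an informal sketch of the dispatch in the function
 * below, not an additional interface): a pgste notification fans out to every
 * gmap attached to @mm, roughly as
 *
 *	bits & PGSTE_VSIE_BIT  ->  gmap_shadow_notify() for each child gmap
 *	bits & PGSTE_IN_BIT    ->  gmap_call_notifier() for the 4K guest range
 *
 * where the guest address is recovered from the host_to_guest radix tree plus
 * the pte's index within its 2K page table.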
 */
void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
                 pte_t *pte, unsigned long bits)
{
        unsigned long offset, gaddr = 0;
        struct gmap *gmap, *sg, *next;

        offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
        offset = offset * (PAGE_SIZE / sizeof(pte_t));
        rcu_read_lock();
        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                spin_lock(&gmap->guest_table_lock);
                gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
                spin_unlock(&gmap->guest_table_lock);
                if (!IS_GADDR_VALID(gaddr))
                        continue;

                if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
                        spin_lock(&gmap->shadow_lock);
                        list_for_each_entry_safe(sg, next,
                                                 &gmap->children, list)
                                gmap_shadow_notify(sg, vmaddr, gaddr);
                        spin_unlock(&gmap->shadow_lock);
                }
                if (bits & PGSTE_IN_BIT)
                        gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ptep_notify);

static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
                             unsigned long gaddr)
{
        set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
        gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
}

/**
 * gmap_pmdp_xchg - exchange a gmap pmd with another
 * @gmap: pointer to the guest address space structure
 * @pmdp: pointer to the pmd entry
 * @new: replacement entry
 * @gaddr: the affected guest address
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
 */
static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
                           unsigned long gaddr)
{
        gaddr &= HPAGE_MASK;
        pmdp_notify_gmap(gmap, pmdp, gaddr);
        new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
        if (MACHINE_HAS_TLB_GUEST)
                __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
                            IDTE_GLOBAL);
        else if (MACHINE_HAS_IDTE)
                __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
        else
                __pmdp_csp(pmdp);
        set_pmd(pmdp, new);
}

static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
                            int purge)
{
        pmd_t *pmdp;
        struct gmap *gmap;
        unsigned long gaddr;

        rcu_read_lock();
        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                spin_lock(&gmap->guest_table_lock);
                pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
                if (pmdp) {
                        pmdp_notify_gmap(gmap, pmdp, gaddr);
                        WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
                                                   _SEGMENT_ENTRY_GMAP_UC |
                                                   _SEGMENT_ENTRY));
                        if (purge)
                                __pmdp_csp(pmdp);
                        set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
                }
                spin_unlock(&gmap->guest_table_lock);
        }
        rcu_read_unlock();
}

/**
 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
 *                        flushing
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
{
        gmap_pmdp_clear(mm, vmaddr, 0);
}
EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);

/**
 * gmap_pmdp_csp - csp all affected guest pmd entries
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
{
        gmap_pmdp_clear(mm, vmaddr, 1);
}
EXPORT_SYMBOL_GPL(gmap_pmdp_csp);

/**
 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
{
        unsigned long gaddr;
        struct gmap *gmap;
        pmd_t *pmdp;

        rcu_read_lock();
        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                spin_lock(&gmap->guest_table_lock);
                pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
                if (pmdp) {
                        pmdp_notify_gmap(gmap, pmdp, gaddr);
                        WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
                                                   _SEGMENT_ENTRY_GMAP_UC |
                                                   _SEGMENT_ENTRY));
                        if (MACHINE_HAS_TLB_GUEST)
                                __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
                                            gmap->asce, IDTE_LOCAL);
                        else if (MACHINE_HAS_IDTE)
                                __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
                        *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
                }
                spin_unlock(&gmap->guest_table_lock);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);

/**
 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
{
        unsigned long gaddr;
        struct gmap *gmap;
        pmd_t *pmdp;

        rcu_read_lock();
        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                spin_lock(&gmap->guest_table_lock);
                pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
                if (pmdp) {
                        pmdp_notify_gmap(gmap, pmdp, gaddr);
                        WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
                                                   _SEGMENT_ENTRY_GMAP_UC |
                                                   _SEGMENT_ENTRY));
                        if (MACHINE_HAS_TLB_GUEST)
                                __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
                                            gmap->asce, IDTE_GLOBAL);
                        else if (MACHINE_HAS_IDTE)
                                __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
                        else
                                __pmdp_csp(pmdp);
                        *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
                }
                spin_unlock(&gmap->guest_table_lock);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);

/**
 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
 * @gmap: pointer to guest address space
 * @pmdp: pointer to the pmd to be tested
 * @gaddr: virtual address in the guest address space
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
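 *
 * Informal restatement of the checks below (no new semantics intended): an
 * invalid segment is never dirty; a segment that is still write-protected and
 * carries no UC indication is clean; anything else is reported dirty, its UC
 * bit is cleared and the segment is write-protected again so that the next
 * guest store marks it dirty anew.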
 */
static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
                                          unsigned long gaddr)
{
        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
                return false;

        /* Already protected memory, which did not change is clean */
        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
            !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
                return false;

        /* Clear UC indication and reset protection */
        set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
        gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
        return true;
}

/**
 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
 * @gmap: pointer to guest address space
 * @bitmap: dirty bitmap for this pmd
 * @gaddr: virtual address in the guest address space
 * @vmaddr: virtual address in the host address space
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
 */
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
                             unsigned long gaddr, unsigned long vmaddr)
{
        int i;
        pmd_t *pmdp;
        pte_t *ptep;
        spinlock_t *ptl;

        pmdp = gmap_pmd_op_walk(gmap, gaddr);
        if (!pmdp)
                return;

        if (pmd_leaf(*pmdp)) {
                if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
                        bitmap_fill(bitmap, _PAGE_ENTRIES);
        } else {
                for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
                        ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
                        if (!ptep)
                                continue;
                        if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
                                set_bit(i, bitmap);
                        pte_unmap_unlock(ptep, ptl);
                }
        }
        gmap_pmd_op_end(gmap, pmdp);
}
EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
                                    unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;

        split_huge_pmd(vma, pmd, addr);
        return 0;
}

static const struct mm_walk_ops thp_split_walk_ops = {
        .pmd_entry      = thp_split_walk_pmd_entry,
        .walk_lock      = PGWALK_WRLOCK_VERIFY,
};

static inline void thp_split_mm(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, 0);

        for_each_vma(vmi, vma) {
                vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
                walk_page_vma(vma, &thp_split_walk_ops, NULL);
        }
        mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Switch on pgstes for the current userspace process (for KVM).
 */
int s390_enable_sie(void)
{
        struct mm_struct *mm = current->mm;

        /* Do we have pgstes? If yes, we are done */
        if (mm_has_pgste(mm))
                return 0;
        /* Fail if the page tables are 2K */
        if (!mm_alloc_pgste(mm))
                return -EINVAL;
        mmap_write_lock(mm);
        mm->context.has_pgste = 1;
        /* split thp mappings and disable thp for future mappings */
        thp_split_mm(mm);
        mmap_write_unlock(mm);
        return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
                                   unsigned long end, struct mm_walk *walk)
{
        unsigned long *found_addr = walk->private;

        /* Return 1 if the page is a zeropage. */
        if (is_zero_pfn(pte_pfn(*pte))) {
                /*
                 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
                 * right thing and likely don't care: FAULT_FLAG_UNSHARE
                 * currently only works in COW mappings, which is also where
                 * mm_forbids_zeropage() is checked.
                 */
                if (!is_cow_mapping(walk->vma->vm_flags))
                        return -EFAULT;

                *found_addr = addr;
                return 1;
        }
        return 0;
}

static const struct mm_walk_ops find_zeropage_ops = {
        .pte_entry      = find_zeropage_pte_entry,
        .walk_lock      = PGWALK_WRLOCK,
};

/*
 * Unshare all shared zeropages, replacing them by anonymous pages. Note that
 * we cannot simply zap all shared zeropages, because this could later
 * trigger unexpected userfaultfd missing events.
 *
 * This must be called after mm->context.allow_cow_sharing was
 * set to 0, to avoid future mappings of shared zeropages.
 *
 * The core mm guarantees to s390 that, even if it were to remove a page table
 * (so that a racing walk_page_range_vma() calling pte_offset_map_lock() would
 * fail), it will never insert a page table containing empty zero pages once
 * mm_forbids_zeropage(mm), i.e. mm->context.allow_cow_sharing, is set to 0.
 */
static int __s390_unshare_zeropages(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, 0);
        unsigned long addr;
        vm_fault_t fault;
        int rc;

        for_each_vma(vmi, vma) {
                /*
                 * We could only look at COW mappings, but it's more future
                 * proof to catch unexpected zeropages in other mappings and
                 * fail.
                 */
                if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
                        continue;
                addr = vma->vm_start;

retry:
                rc = walk_page_range_vma(vma, addr, vma->vm_end,
                                         &find_zeropage_ops, &addr);
                if (rc < 0)
                        return rc;
                else if (!rc)
                        continue;

                /* addr was updated by find_zeropage_pte_entry() */
                fault = handle_mm_fault(vma, addr,
                                        FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
                                        NULL);
                if (fault & VM_FAULT_OOM)
                        return -ENOMEM;
                /*
                 * See break_ksm(): even after handle_mm_fault() returned 0, we
                 * must start the lookup from the current address, because
                 * handle_mm_fault() may back out if there's any difficulty.
                 *
                 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
                 * maybe they could trigger in the future on concurrent
                 * truncation. In that case, the shared zeropage would be gone
                 * and we can simply retry and make progress.
                 */
                cond_resched();
                goto retry;
        }

        return 0;
}

static int __s390_disable_cow_sharing(struct mm_struct *mm)
{
        int rc;

        if (!mm->context.allow_cow_sharing)
                return 0;

        mm->context.allow_cow_sharing = 0;

        /* Replace all shared zeropages by anonymous pages. */
        rc = __s390_unshare_zeropages(mm);
        /*
         * Make sure to disable KSM (if enabled for the whole process or
         * individual VMAs). Note that nothing currently hinders user space
         * from re-enabling it.
         */
        if (!rc)
                rc = ksm_disable(mm);
        if (rc)
                mm->context.allow_cow_sharing = 1;
        return rc;
}

/*
 * Disable most COW-sharing of memory pages for the whole process:
 * (1) Disable KSM and unmerge/unshare any KSM pages.
 * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
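 *
 * A minimal caller sketch (hypothetical user, not one of the actual in-kernel
 * call sites), simply propagating the error:
 *
 *	rc = s390_disable_cow_sharing();
 *	if (rc)
 *		return rc;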
 *
 * Note that we currently don't bother with COW-shared pages that are shared
 * with parent/child processes due to fork().
 */
int s390_disable_cow_sharing(void)
{
        int rc;

        mmap_write_lock(current->mm);
        rc = __s390_disable_cow_sharing(current->mm);
        mmap_write_unlock(current->mm);
        return rc;
}
EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
                                  unsigned long next, struct mm_walk *walk)
{
        /* Clear storage key */
        ptep_zap_key(walk->mm, addr, pte);
        return 0;
}

/*
 * Give a chance to schedule after setting a key on 256 pages.
 * We only hold the mmap lock (an rwsem) and the kvm srcu, both of
 * which can sleep.
 */
static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
                                  unsigned long next, struct mm_walk *walk)
{
        cond_resched();
        return 0;
}

static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
                                      unsigned long hmask, unsigned long next,
                                      struct mm_walk *walk)
{
        pmd_t *pmd = (pmd_t *)pte;
        unsigned long start, end;
        struct folio *folio = page_folio(pmd_page(*pmd));

        /*
         * The write check makes sure we do not set a key on shared
         * memory. This is needed as the walker does not differentiate
         * between actual guest memory and the process executable or
         * shared libraries.
         */
        if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
            !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
                return 0;

        start = pmd_val(*pmd) & HPAGE_MASK;
        end = start + HPAGE_SIZE;
        __storage_key_init_range(start, end);
        set_bit(PG_arch_1, &folio->flags);
        cond_resched();
        return 0;
}

static const struct mm_walk_ops enable_skey_walk_ops = {
        .hugetlb_entry  = __s390_enable_skey_hugetlb,
        .pte_entry      = __s390_enable_skey_pte,
        .pmd_entry      = __s390_enable_skey_pmd,
        .walk_lock      = PGWALK_WRLOCK,
};

int s390_enable_skey(void)
{
        struct mm_struct *mm = current->mm;
        int rc = 0;

        mmap_write_lock(mm);
        if (mm_uses_skeys(mm))
                goto out_up;

        mm->context.uses_skeys = 1;
        rc = __s390_disable_cow_sharing(mm);
        if (rc) {
                mm->context.uses_skeys = 0;
                goto out_up;
        }
        walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);

out_up:
        mmap_write_unlock(mm);
        return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Reset CMMA state, make all pages stable again.
2487 */ 2488 static int __s390_reset_cmma(pte_t *pte, unsigned long addr, 2489 unsigned long next, struct mm_walk *walk) 2490 { 2491 ptep_zap_unused(walk->mm, addr, pte, 1); 2492 return 0; 2493 } 2494 2495 static const struct mm_walk_ops reset_cmma_walk_ops = { 2496 .pte_entry = __s390_reset_cmma, 2497 .walk_lock = PGWALK_WRLOCK, 2498 }; 2499 2500 void s390_reset_cmma(struct mm_struct *mm) 2501 { 2502 mmap_write_lock(mm); 2503 walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); 2504 mmap_write_unlock(mm); 2505 } 2506 EXPORT_SYMBOL_GPL(s390_reset_cmma); 2507 2508 #define GATHER_GET_PAGES 32 2509 2510 struct reset_walk_state { 2511 unsigned long next; 2512 unsigned long count; 2513 unsigned long pfns[GATHER_GET_PAGES]; 2514 }; 2515 2516 static int s390_gather_pages(pte_t *ptep, unsigned long addr, 2517 unsigned long next, struct mm_walk *walk) 2518 { 2519 struct reset_walk_state *p = walk->private; 2520 pte_t pte = READ_ONCE(*ptep); 2521 2522 if (pte_present(pte)) { 2523 /* we have a reference from the mapping, take an extra one */ 2524 get_page(phys_to_page(pte_val(pte))); 2525 p->pfns[p->count] = phys_to_pfn(pte_val(pte)); 2526 p->next = next; 2527 p->count++; 2528 } 2529 return p->count >= GATHER_GET_PAGES; 2530 } 2531 2532 static const struct mm_walk_ops gather_pages_ops = { 2533 .pte_entry = s390_gather_pages, 2534 .walk_lock = PGWALK_RDLOCK, 2535 }; 2536 2537 /* 2538 * Call the Destroy secure page UVC on each page in the given array of PFNs. 2539 * Each page needs to have an extra reference, which will be released here. 2540 */ 2541 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) 2542 { 2543 struct folio *folio; 2544 unsigned long i; 2545 2546 for (i = 0; i < count; i++) { 2547 folio = pfn_folio(pfns[i]); 2548 /* we always have an extra reference */ 2549 uv_destroy_folio(folio); 2550 /* get rid of the extra reference */ 2551 folio_put(folio); 2552 cond_resched(); 2553 } 2554 } 2555 EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); 2556 2557 /** 2558 * __s390_uv_destroy_range - Call the destroy secure page UVC on each page 2559 * in the given range of the given address space. 2560 * @mm: the mm to operate on 2561 * @start: the start of the range 2562 * @end: the end of the range 2563 * @interruptible: if not 0, stop when a fatal signal is received 2564 * 2565 * Walk the given range of the given address space and call the destroy 2566 * secure page UVC on each page. Optionally exit early if a fatal signal is 2567 * pending. 
 *
 * Return: 0 on success, -EINTR if the function stopped before completing
 */
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
                            unsigned long end, bool interruptible)
{
        struct reset_walk_state state = { .next = start };
        int r = 1;

        while (r > 0) {
                state.count = 0;
                mmap_read_lock(mm);
                r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
                mmap_read_unlock(mm);
                cond_resched();
                s390_uv_destroy_pfns(state.count, state.pfns);
                if (interruptible && fatal_signal_pending(current))
                        return -EINTR;
        }
        return 0;
}
EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);

/**
 * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
 * @gmap: the gmap whose ASCE needs to be replaced
 *
 * If the ASCE is of SEGMENT type, this function returns -EINVAL; replacing
 * such an ASCE would leave the pointers in the host_to_guest radix tree
 * pointing to the wrong pages, causing use-after-free and memory corruption.
 * If the allocation of the new top level page table fails, the ASCE is not
 * replaced.
 * In any case, the old ASCE is always removed from the gmap CRST list.
 * Therefore the caller has to make sure to save a pointer to it
 * beforehand, unless a leak is actually intended.
 */
int s390_replace_asce(struct gmap *gmap)
{
        unsigned long asce;
        struct page *page;
        void *table;

        /* Replacing segment type ASCEs would cause serious issues */
        if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
                return -EINVAL;

        page = gmap_alloc_crst();
        if (!page)
                return -ENOMEM;
        table = page_to_virt(page);
        memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));

        /* Set new table origin while preserving existing ASCE control bits */
        asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
        WRITE_ONCE(gmap->asce, asce);
        WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
        WRITE_ONCE(gmap->table, table);

        return 0;
}
EXPORT_SYMBOL_GPL(s390_replace_asce);
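
/*
 * A minimal usage sketch for s390_replace_asce() (hypothetical caller, not a
 * description of the actual in-kernel users): per the kerneldoc above, a
 * caller that still needs the old top level table must save it before the
 * replacement, since the function itself does not keep track of it.
 *
 *	unsigned long old_asce = gmap->asce;	(save before replacing)
 *	int rc = s390_replace_asce(gmap);
 *
 *	if (rc)
 *		return rc;	(-EINVAL for segment type, -ENOMEM otherwise;
 *				 the ASCE is left untouched on failure)
 *	(the table behind old_asce remains the caller's responsibility)
 */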