/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

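/*
 * Summary of the dynamic page table levels managed by crst_table_upgrade()
 * above and crst_table_downgrade() below (taken from the constants used in
 * those functions):
 *
 *	asce_limit	top-level table			ASCE type
 *	1UL << 31	segment table			_ASCE_TYPE_SEGMENT
 *	1UL << 42	region-third table		_ASCE_TYPE_REGION3
 *	1UL << 53	region-second table		_ASCE_TYPE_REGION2
 *
 * An upgrade allocates a new top-level crst (combined region and segment)
 * table, lets its lowest entry point to the old top level and repeats this
 * until the requested limit is reached; a downgrade strips the extra levels
 * off again, one at a time.
 */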

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		set_user_asce(mm);
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

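/*
 * Illustrative life cycle of a gmap as a hypothetical hypervisor-style
 * caller might use it (a sketch only, not code taken from this file; the
 * variable names are made up):
 *
 *	gmap = gmap_alloc(current->mm, guest_size - 1);
 *	if (!gmap)
 *		return -ENOMEM;
 *	gmap_map_segment(gmap, host_addr, guest_addr, size);
 *	gmap_enable(gmap);
 *	...			resolve faults with gmap_fault() ...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 *
 * host_addr, guest_addr and size must be multiples of PMD_SIZE, i.e. of
 * the 1 MB segment size.
 */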

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, ALLOC_ORDER);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	page = pmd_to_page((pmd_t *) entry);
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

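/*
 * Bookkeeping used by the map/unmap functions below: gmap->guest_to_host
 * maps a guest segment index (gaddr >> PMD_SHIFT) to the host address the
 * segment was mapped to with gmap_map_segment(), while gmap->host_to_guest
 * maps a host segment index (vmaddr >> PMD_SHIFT) to the gmap segment table
 * entry that was connected to it by __gmap_link().  Both radix trees work
 * on 1 MB (PMD_SIZE) segment granularity.
 */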

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

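/*
 * Note that __gmap_translate() and gmap_translate() return either a user
 * space address or a negative error code stored in an unsigned long;
 * callers in this file, e.g. gmap_fault() and gmap_ipte_notify(), check
 * the result with IS_ERR_VALUE() before using it as an address.
 */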

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the process mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
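	/*
	 * The shifts above mirror the hardware translation: gaddr >> 53,
	 * >> 42 and >> 31 index the region-first, region-second and
	 * region-third table, gaddr >> 20 the segment table; each crst
	 * table has 2048 entries, hence the 0x7ff mask.  At this point
	 * 'table' refers to the gmap segment table entry for gaddr.
	 */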
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table. */
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the guest address could not be resolved or the vm address is already
 * mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;

	down_read(&gmap->mm->mmap_sem);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
		rc = -EFAULT;
		goto out_up;
	}
	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			 radix_tree_lookup(&gmap->guest_to_host,
					   gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

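/*
 * Invalidation notification: gmap_ipte_notify() below sets PGSTE_IN_BIT in
 * the pgste of every pte in a guest range.  When such a pte is invalidated
 * later on, the pte primitives in asm/pgtable.h call gmap_do_ipte_notify(),
 * which looks up the guest address and invokes every notifier block that
 * has been registered on gmap_notifier_list.
 */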

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		VM_BUG_ON(!ptep);
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		pte_unmap_unlock(ptep, ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

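	/*
	 * 'pte' points into a 256-entry page table that maps one 1 MB
	 * segment.  The masked pointer bits below give the byte offset of
	 * the entry within that table; scaling by 4096 / sizeof(pte_t)
	 * turns this into the offset of the 4K page within the segment.
	 */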
	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
retry:
	ptep = get_locked_pte(current->mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	    (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			       unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

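/*
 * Page table fragment bookkeeping used below: a user page table occupies
 * only a fragment of a 4K page (2K with 64 bit, 1K with 31 bit), so several
 * page tables share one page.  The lower bits of page->_mapcount (FRAG_MASK)
 * record which fragments are in use, the bits above them (FRAG_MASK << 4)
 * mark fragments that have been handed to the TLB gather code and wait for
 * the grace period in page_table_free_rcu().  Page tables with pgstes always
 * use a full 4K page, 2K of ptes followed by 2K of pgstes (see
 * page_table_alloc_pgste()), which is signalled by _mapcount == 0.
 */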

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page))
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

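/*
 * page_table_free_rcu() below encodes the type of a table in the low bits
 * of the pointer that is handed to tlb_remove_table(): FRAG_MASK means a
 * full pgste page table page, any other non-zero value is the pending-free
 * marker of a fragment (the fragment bit shifted left by four), and a
 * pointer without tag bits is a crst table that is simply freed with
 * free_pages(..., ALLOC_ORDER).  __tlb_remove_table() decodes the tag once
 * it is safe to free the table.
 */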

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

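/*
 * The page_table_realloc*() helpers below walk the complete address space
 * and replace every "short" fragment page table with a full 4K table that
 * has room for pgstes.  Transparent huge pages must have been split first
 * (see thp_split_mm() above and __gmap_link(), which refuses large pmds),
 * because a huge pmd has no page table that could be converted.
 */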

static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
					    struct mm_struct *mm, pud_t *pud,
					    unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	spinlock_t *ptl;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm);
		if (!new)
			return -ENOMEM;

		ptl = pmd_lock(mm, pmd);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table, addr);
			new = NULL;
		}
		spin_unlock(ptl);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
					    struct mm_struct *mm, pgd_t *pgd,
					    unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

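/*
 * Illustrative only: a hypervisor module would typically switch on pgstes
 * once per mm before it builds any guest mappings, e.g.
 *
 *	if (s390_enable_sie())
 *		return -ENOMEM;
 *	gmap = gmap_alloc(current->mm, guest_size - 1);
 *
 * (hypothetical caller, not taken from this file).
 */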

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	unsigned long ptev;
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	/*
	 * Remove all zero page mappings; after establishing a policy to
	 * forbid zero page mappings, subsequent faults for such a page
	 * will get fresh anonymous pages.
	 */
	if (is_zero_pfn(pte_pfn(*pte))) {
		ptep_flush_direct(walk->mm, addr, pte);
		pte_val(*pte) = _PAGE_INVALID;
	}
	/* Clear storage key */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
			      PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*pte);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
	pgste_set_unlock(pte, pgste);
	return 0;
}

int s390_enable_skey(void)
{
	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc = 0;

	down_write(&mm->mmap_sem);
	if (mm_use_skey(mm))
		goto out_up;

	mm->context.use_skey = 1;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
				MADV_UNMERGEABLE, &vma->vm_flags)) {
			mm->context.use_skey = 0;
			rc = -ENOMEM;
			goto out_up;
		}
	}
	mm->def_flags &= ~VM_MERGEABLE;

	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);

out_up:
	up_write(&mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(pte, pgste);
	return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

	down_write(&mm->mmap_sem);
	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

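/*
 * pgtable_trans_huge_deposit() and pgtable_trans_huge_withdraw() below keep
 * the page tables deposited for a huge pmd on a list whose list_head lives
 * in the first two pte slots of each deposited table; this is why withdraw
 * resets those two slots to _PAGE_INVALID before handing the table back.
 */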

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */