/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
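	/* The old ASCE was detached at the top of this function; activate the
	 * downgraded one again for the current task. */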
	if (current->active_mm == mm)
		set_user_asce(mm);
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables.
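	 * All of them hang off gmap->crst_list, so a plain list walk is
	 * enough; the pte tables themselves belong to the parent mm.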
	 */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, ALLOC_ORDER);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset, mask;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
	page = virt_to_page((void *)((unsigned long) entry & mask));
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

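	/* Dropping the guest_to_host entry yields the host address that was
	 * mapped at gaddr; use it to remove the host_to_guest link and to
	 * invalidate the shadow segment table entry as well. */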
	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
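 * The mmap_sem of the parent mm is acquired and released by this function.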
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table.
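	 * radix_tree_preload() is used so that the insertion below cannot
	 * fail while the pmd lock of the parent mm and the guest_table_lock
	 * are held; the two locks keep the host pmd and the shadow entry in
	 * sync.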
	 */
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;

	down_read(&gmap->mm->mmap_sem);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
		rc = -EFAULT;
		goto out_up;
	}
	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
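		/* Do not zap beyond the end of the range or the segment. */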
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		VM_BUG_ON(!ptep);
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		pte_unmap_unlock(ptep, ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
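 * The guest address is recovered from the host_to_guest radix tree of each
 * gmap attached to the mm, and every registered notifier is called for it.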
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
retry:
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	    (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	uint64_t physaddr;
	unsigned long key = 0;

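	/*
	 * For a valid pte the ACC/F bits are read from the real storage key
	 * of the backing page and the guest view of the referenced/changed
	 * bits is merged in from the pgste; for an invalid pte the whole key
	 * is reconstructed from the pgste alone.
	 */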
	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	pgste = pgste_get_lock(ptep);

	if (pte_val(*ptep) & _PAGE_INVALID) {
		key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
	} else {
		physaddr = pte_val(*ptep) & PAGE_MASK;
		key = page_get_storage_key(physaddr);

		/* Reflect guest's logical view, not physical */
		if (pgste_val(pgste) & PGSTE_GR_BIT)
			key |= _PAGE_REFERENCED;
		if (pgste_val(pgste) & PGSTE_GC_BIT)
			key |= _PAGE_CHANGED;
	}

	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return key;
}
EXPORT_SYMBOL(get_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page))
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
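	/*
	 * Each 1K/2K fragment of the 4K page owns one bit of the FRAG_MASK
	 * part of page->_mapcount; once all fragment bits are cleared the
	 * page can be returned to the page allocator.
	 */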
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
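	 * The IPI issued below returns only after every other CPU has left
	 * its critical section, so the table can be freed right away.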
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	spinlock_t *ptl;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm);
		if (!new)
			return -ENOMEM;

		ptl = pmd_lock(mm, pmd);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker!
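			 * Lockless walkers (gup-fast) may still be looking at
			 * the old table, so it must not be reused before a
			 * grace period has passed.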
			 */
			page_table_free_rcu(tlb, table, addr);
			new = NULL;
		}
		spin_unlock(ptl);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				   struct mm_struct *mm, pgd_t *pgd,
				   unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * switch on pgstes for the userspace process of the current task (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
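 * Mappings of the shared zero page are removed first, because a page that
 * is shared between address spaces cannot carry per-guest storage keys.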
 */
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	unsigned long ptev;
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	/*
	 * Remove all zero page mappings; with a policy in place that forbids
	 * zero page mappings, subsequent faults on these pages will get
	 * fresh anonymous pages.
	 */
	if (is_zero_pfn(pte_pfn(*pte))) {
		ptep_flush_direct(walk->mm, addr, pte);
		pte_val(*pte) = _PAGE_INVALID;
	}
	/* Clear storage key */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
			      PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*pte);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
	pgste_set_unlock(pte, pgste);
	return 0;
}

int s390_enable_skey(void)
{
	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc = 0;

	down_write(&mm->mmap_sem);
	if (mm_use_skey(mm))
		goto out_up;

	mm->context.use_skey = 1;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
				MADV_UNMERGEABLE, &vma->vm_flags)) {
			mm->context.use_skey = 0;
			rc = -ENOMEM;
			goto out_up;
		}
	}
	mm->def_flags &= ~VM_MERGEABLE;

	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);

out_up:
	up_write(&mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(pte, pgste);
	return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

	down_write(&mm->mmap_sem);
	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/*
	 * No need to flush the TLB: on s390 the reference bits are kept in
	 * the storage key and never in the TLB.
	 */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */