1 /* 2 * Copyright IBM Corp. 2007, 2011 3 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/kernel.h> 8 #include <linux/errno.h> 9 #include <linux/gfp.h> 10 #include <linux/mm.h> 11 #include <linux/swap.h> 12 #include <linux/smp.h> 13 #include <linux/highmem.h> 14 #include <linux/pagemap.h> 15 #include <linux/spinlock.h> 16 #include <linux/module.h> 17 #include <linux/quicklist.h> 18 #include <linux/rcupdate.h> 19 #include <linux/slab.h> 20 21 #include <asm/pgtable.h> 22 #include <asm/pgalloc.h> 23 #include <asm/tlb.h> 24 #include <asm/tlbflush.h> 25 #include <asm/mmu_context.h> 26 27 #ifndef CONFIG_64BIT 28 #define ALLOC_ORDER 1 29 #define FRAG_MASK 0x0f 30 #else 31 #define ALLOC_ORDER 2 32 #define FRAG_MASK 0x03 33 #endif 34 35 36 unsigned long *crst_table_alloc(struct mm_struct *mm) 37 { 38 struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); 39 40 if (!page) 41 return NULL; 42 return (unsigned long *) page_to_phys(page); 43 } 44 45 void crst_table_free(struct mm_struct *mm, unsigned long *table) 46 { 47 free_pages((unsigned long) table, ALLOC_ORDER); 48 } 49 50 #ifdef CONFIG_64BIT 51 int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) 52 { 53 unsigned long *table, *pgd; 54 unsigned long entry; 55 56 BUG_ON(limit > (1UL << 53)); 57 repeat: 58 table = crst_table_alloc(mm); 59 if (!table) 60 return -ENOMEM; 61 spin_lock_bh(&mm->page_table_lock); 62 if (mm->context.asce_limit < limit) { 63 pgd = (unsigned long *) mm->pgd; 64 if (mm->context.asce_limit <= (1UL << 31)) { 65 entry = _REGION3_ENTRY_EMPTY; 66 mm->context.asce_limit = 1UL << 42; 67 mm->context.asce_bits = _ASCE_TABLE_LENGTH | 68 _ASCE_USER_BITS | 69 _ASCE_TYPE_REGION3; 70 } else { 71 entry = _REGION2_ENTRY_EMPTY; 72 mm->context.asce_limit = 1UL << 53; 73 mm->context.asce_bits = _ASCE_TABLE_LENGTH | 74 _ASCE_USER_BITS | 75 _ASCE_TYPE_REGION2; 76 } 77 crst_table_init(table, entry); 78 pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd); 79 mm->pgd = (pgd_t *) table; 80 mm->task_size = mm->context.asce_limit; 81 table = NULL; 82 } 83 spin_unlock_bh(&mm->page_table_lock); 84 if (table) 85 crst_table_free(mm, table); 86 if (mm->context.asce_limit < limit) 87 goto repeat; 88 return 0; 89 } 90 91 void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) 92 { 93 pgd_t *pgd; 94 95 while (mm->context.asce_limit > limit) { 96 pgd = mm->pgd; 97 switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { 98 case _REGION_ENTRY_TYPE_R2: 99 mm->context.asce_limit = 1UL << 42; 100 mm->context.asce_bits = _ASCE_TABLE_LENGTH | 101 _ASCE_USER_BITS | 102 _ASCE_TYPE_REGION3; 103 break; 104 case _REGION_ENTRY_TYPE_R3: 105 mm->context.asce_limit = 1UL << 31; 106 mm->context.asce_bits = _ASCE_TABLE_LENGTH | 107 _ASCE_USER_BITS | 108 _ASCE_TYPE_SEGMENT; 109 break; 110 default: 111 BUG(); 112 } 113 mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); 114 mm->task_size = mm->context.asce_limit; 115 crst_table_free(mm, (unsigned long *) pgd); 116 } 117 } 118 #endif 119 120 #ifdef CONFIG_PGSTE 121 122 /** 123 * gmap_alloc - allocate a guest address space 124 * @mm: pointer to the parent mm_struct 125 * 126 * Returns a guest address space structure. 127 */ 128 struct gmap *gmap_alloc(struct mm_struct *mm) 129 { 130 struct gmap *gmap; 131 struct page *page; 132 unsigned long *table; 133 134 gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); 135 if (!gmap) 136 goto out; 137 INIT_LIST_HEAD(&gmap->crst_list); 138 gmap->mm = mm; 139 page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); 140 if (!page) 141 goto out_free; 142 list_add(&page->lru, &gmap->crst_list); 143 table = (unsigned long *) page_to_phys(page); 144 crst_table_init(table, _REGION1_ENTRY_EMPTY); 145 gmap->table = table; 146 gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH | 147 _ASCE_USER_BITS | __pa(table); 148 list_add(&gmap->list, &mm->context.gmap_list); 149 return gmap; 150 151 out_free: 152 kfree(gmap); 153 out: 154 return NULL; 155 } 156 EXPORT_SYMBOL_GPL(gmap_alloc); 157 158 static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table) 159 { 160 struct gmap_pgtable *mp; 161 struct gmap_rmap *rmap; 162 struct page *page; 163 164 if (*table & _SEGMENT_ENTRY_INVALID) 165 return 0; 166 page = pfn_to_page(*table >> PAGE_SHIFT); 167 mp = (struct gmap_pgtable *) page->index; 168 list_for_each_entry(rmap, &mp->mapper, list) { 169 if (rmap->entry != table) 170 continue; 171 list_del(&rmap->list); 172 kfree(rmap); 173 break; 174 } 175 *table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT; 176 return 1; 177 } 178 179 static void gmap_flush_tlb(struct gmap *gmap) 180 { 181 if (MACHINE_HAS_IDTE) 182 __tlb_flush_idte((unsigned long) gmap->table | 183 _ASCE_TYPE_REGION1); 184 else 185 __tlb_flush_global(); 186 } 187 188 /** 189 * gmap_free - free a guest address space 190 * @gmap: pointer to the guest address space structure 191 */ 192 void gmap_free(struct gmap *gmap) 193 { 194 struct page *page, *next; 195 unsigned long *table; 196 int i; 197 198 199 /* Flush tlb. */ 200 if (MACHINE_HAS_IDTE) 201 __tlb_flush_idte((unsigned long) gmap->table | 202 _ASCE_TYPE_REGION1); 203 else 204 __tlb_flush_global(); 205 206 /* Free all segment & region tables. */ 207 down_read(&gmap->mm->mmap_sem); 208 spin_lock(&gmap->mm->page_table_lock); 209 list_for_each_entry_safe(page, next, &gmap->crst_list, lru) { 210 table = (unsigned long *) page_to_phys(page); 211 if ((*table & _REGION_ENTRY_TYPE_MASK) == 0) 212 /* Remove gmap rmap structures for segment table. */ 213 for (i = 0; i < PTRS_PER_PMD; i++, table++) 214 gmap_unlink_segment(gmap, table); 215 __free_pages(page, ALLOC_ORDER); 216 } 217 spin_unlock(&gmap->mm->page_table_lock); 218 up_read(&gmap->mm->mmap_sem); 219 list_del(&gmap->list); 220 kfree(gmap); 221 } 222 EXPORT_SYMBOL_GPL(gmap_free); 223 224 /** 225 * gmap_enable - switch primary space to the guest address space 226 * @gmap: pointer to the guest address space structure 227 */ 228 void gmap_enable(struct gmap *gmap) 229 { 230 S390_lowcore.gmap = (unsigned long) gmap; 231 } 232 EXPORT_SYMBOL_GPL(gmap_enable); 233 234 /** 235 * gmap_disable - switch back to the standard primary address space 236 * @gmap: pointer to the guest address space structure 237 */ 238 void gmap_disable(struct gmap *gmap) 239 { 240 S390_lowcore.gmap = 0UL; 241 } 242 EXPORT_SYMBOL_GPL(gmap_disable); 243 244 /* 245 * gmap_alloc_table is assumed to be called with mmap_sem held 246 */ 247 static int gmap_alloc_table(struct gmap *gmap, 248 unsigned long *table, unsigned long init) 249 { 250 struct page *page; 251 unsigned long *new; 252 253 /* since we dont free the gmap table until gmap_free we can unlock */ 254 spin_unlock(&gmap->mm->page_table_lock); 255 page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); 256 spin_lock(&gmap->mm->page_table_lock); 257 if (!page) 258 return -ENOMEM; 259 new = (unsigned long *) page_to_phys(page); 260 crst_table_init(new, init); 261 if (*table & _REGION_ENTRY_INVALID) { 262 list_add(&page->lru, &gmap->crst_list); 263 *table = (unsigned long) new | _REGION_ENTRY_LENGTH | 264 (*table & _REGION_ENTRY_TYPE_MASK); 265 } else 266 __free_pages(page, ALLOC_ORDER); 267 return 0; 268 } 269 270 /** 271 * gmap_unmap_segment - unmap segment from the guest address space 272 * @gmap: pointer to the guest address space structure 273 * @addr: address in the guest address space 274 * @len: length of the memory area to unmap 275 * 276 * Returns 0 if the unmap succeded, -EINVAL if not. 277 */ 278 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) 279 { 280 unsigned long *table; 281 unsigned long off; 282 int flush; 283 284 if ((to | len) & (PMD_SIZE - 1)) 285 return -EINVAL; 286 if (len == 0 || to + len < to) 287 return -EINVAL; 288 289 flush = 0; 290 down_read(&gmap->mm->mmap_sem); 291 spin_lock(&gmap->mm->page_table_lock); 292 for (off = 0; off < len; off += PMD_SIZE) { 293 /* Walk the guest addr space page table */ 294 table = gmap->table + (((to + off) >> 53) & 0x7ff); 295 if (*table & _REGION_ENTRY_INVALID) 296 goto out; 297 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 298 table = table + (((to + off) >> 42) & 0x7ff); 299 if (*table & _REGION_ENTRY_INVALID) 300 goto out; 301 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 302 table = table + (((to + off) >> 31) & 0x7ff); 303 if (*table & _REGION_ENTRY_INVALID) 304 goto out; 305 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 306 table = table + (((to + off) >> 20) & 0x7ff); 307 308 /* Clear segment table entry in guest address space. */ 309 flush |= gmap_unlink_segment(gmap, table); 310 *table = _SEGMENT_ENTRY_INVALID; 311 } 312 out: 313 spin_unlock(&gmap->mm->page_table_lock); 314 up_read(&gmap->mm->mmap_sem); 315 if (flush) 316 gmap_flush_tlb(gmap); 317 return 0; 318 } 319 EXPORT_SYMBOL_GPL(gmap_unmap_segment); 320 321 /** 322 * gmap_mmap_segment - map a segment to the guest address space 323 * @gmap: pointer to the guest address space structure 324 * @from: source address in the parent address space 325 * @to: target address in the guest address space 326 * 327 * Returns 0 if the mmap succeded, -EINVAL or -ENOMEM if not. 328 */ 329 int gmap_map_segment(struct gmap *gmap, unsigned long from, 330 unsigned long to, unsigned long len) 331 { 332 unsigned long *table; 333 unsigned long off; 334 int flush; 335 336 if ((from | to | len) & (PMD_SIZE - 1)) 337 return -EINVAL; 338 if (len == 0 || from + len > TASK_MAX_SIZE || 339 from + len < from || to + len < to) 340 return -EINVAL; 341 342 flush = 0; 343 down_read(&gmap->mm->mmap_sem); 344 spin_lock(&gmap->mm->page_table_lock); 345 for (off = 0; off < len; off += PMD_SIZE) { 346 /* Walk the gmap address space page table */ 347 table = gmap->table + (((to + off) >> 53) & 0x7ff); 348 if ((*table & _REGION_ENTRY_INVALID) && 349 gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY)) 350 goto out_unmap; 351 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 352 table = table + (((to + off) >> 42) & 0x7ff); 353 if ((*table & _REGION_ENTRY_INVALID) && 354 gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY)) 355 goto out_unmap; 356 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 357 table = table + (((to + off) >> 31) & 0x7ff); 358 if ((*table & _REGION_ENTRY_INVALID) && 359 gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY)) 360 goto out_unmap; 361 table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN); 362 table = table + (((to + off) >> 20) & 0x7ff); 363 364 /* Store 'from' address in an invalid segment table entry. */ 365 flush |= gmap_unlink_segment(gmap, table); 366 *table = (from + off) | (_SEGMENT_ENTRY_INVALID | 367 _SEGMENT_ENTRY_PROTECT); 368 } 369 spin_unlock(&gmap->mm->page_table_lock); 370 up_read(&gmap->mm->mmap_sem); 371 if (flush) 372 gmap_flush_tlb(gmap); 373 return 0; 374 375 out_unmap: 376 spin_unlock(&gmap->mm->page_table_lock); 377 up_read(&gmap->mm->mmap_sem); 378 gmap_unmap_segment(gmap, to, len); 379 return -ENOMEM; 380 } 381 EXPORT_SYMBOL_GPL(gmap_map_segment); 382 383 static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap) 384 { 385 unsigned long *table; 386 387 table = gmap->table + ((address >> 53) & 0x7ff); 388 if (unlikely(*table & _REGION_ENTRY_INVALID)) 389 return ERR_PTR(-EFAULT); 390 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 391 table = table + ((address >> 42) & 0x7ff); 392 if (unlikely(*table & _REGION_ENTRY_INVALID)) 393 return ERR_PTR(-EFAULT); 394 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 395 table = table + ((address >> 31) & 0x7ff); 396 if (unlikely(*table & _REGION_ENTRY_INVALID)) 397 return ERR_PTR(-EFAULT); 398 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 399 table = table + ((address >> 20) & 0x7ff); 400 return table; 401 } 402 403 /** 404 * __gmap_translate - translate a guest address to a user space address 405 * @address: guest address 406 * @gmap: pointer to guest mapping meta data structure 407 * 408 * Returns user space address which corresponds to the guest address or 409 * -EFAULT if no such mapping exists. 410 * This function does not establish potentially missing page table entries. 411 * The mmap_sem of the mm that belongs to the address space must be held 412 * when this function gets called. 413 */ 414 unsigned long __gmap_translate(unsigned long address, struct gmap *gmap) 415 { 416 unsigned long *segment_ptr, vmaddr, segment; 417 struct gmap_pgtable *mp; 418 struct page *page; 419 420 current->thread.gmap_addr = address; 421 segment_ptr = gmap_table_walk(address, gmap); 422 if (IS_ERR(segment_ptr)) 423 return PTR_ERR(segment_ptr); 424 /* Convert the gmap address to an mm address. */ 425 segment = *segment_ptr; 426 if (!(segment & _SEGMENT_ENTRY_INVALID)) { 427 page = pfn_to_page(segment >> PAGE_SHIFT); 428 mp = (struct gmap_pgtable *) page->index; 429 return mp->vmaddr | (address & ~PMD_MASK); 430 } else if (segment & _SEGMENT_ENTRY_PROTECT) { 431 vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; 432 return vmaddr | (address & ~PMD_MASK); 433 } 434 return -EFAULT; 435 } 436 EXPORT_SYMBOL_GPL(__gmap_translate); 437 438 /** 439 * gmap_translate - translate a guest address to a user space address 440 * @address: guest address 441 * @gmap: pointer to guest mapping meta data structure 442 * 443 * Returns user space address which corresponds to the guest address or 444 * -EFAULT if no such mapping exists. 445 * This function does not establish potentially missing page table entries. 446 */ 447 unsigned long gmap_translate(unsigned long address, struct gmap *gmap) 448 { 449 unsigned long rc; 450 451 down_read(&gmap->mm->mmap_sem); 452 rc = __gmap_translate(address, gmap); 453 up_read(&gmap->mm->mmap_sem); 454 return rc; 455 } 456 EXPORT_SYMBOL_GPL(gmap_translate); 457 458 static int gmap_connect_pgtable(unsigned long address, unsigned long segment, 459 unsigned long *segment_ptr, struct gmap *gmap) 460 { 461 unsigned long vmaddr; 462 struct vm_area_struct *vma; 463 struct gmap_pgtable *mp; 464 struct gmap_rmap *rmap; 465 struct mm_struct *mm; 466 struct page *page; 467 pgd_t *pgd; 468 pud_t *pud; 469 pmd_t *pmd; 470 471 mm = gmap->mm; 472 vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; 473 vma = find_vma(mm, vmaddr); 474 if (!vma || vma->vm_start > vmaddr) 475 return -EFAULT; 476 /* Walk the parent mm page table */ 477 pgd = pgd_offset(mm, vmaddr); 478 pud = pud_alloc(mm, pgd, vmaddr); 479 if (!pud) 480 return -ENOMEM; 481 pmd = pmd_alloc(mm, pud, vmaddr); 482 if (!pmd) 483 return -ENOMEM; 484 if (!pmd_present(*pmd) && 485 __pte_alloc(mm, vma, pmd, vmaddr)) 486 return -ENOMEM; 487 /* pmd now points to a valid segment table entry. */ 488 rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT); 489 if (!rmap) 490 return -ENOMEM; 491 /* Link gmap segment table entry location to page table. */ 492 page = pmd_page(*pmd); 493 mp = (struct gmap_pgtable *) page->index; 494 rmap->gmap = gmap; 495 rmap->entry = segment_ptr; 496 rmap->vmaddr = address & PMD_MASK; 497 spin_lock(&mm->page_table_lock); 498 if (*segment_ptr == segment) { 499 list_add(&rmap->list, &mp->mapper); 500 /* Set gmap segment table entry to page table. */ 501 *segment_ptr = pmd_val(*pmd) & PAGE_MASK; 502 rmap = NULL; 503 } 504 spin_unlock(&mm->page_table_lock); 505 kfree(rmap); 506 return 0; 507 } 508 509 static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table) 510 { 511 struct gmap_rmap *rmap, *next; 512 struct gmap_pgtable *mp; 513 struct page *page; 514 int flush; 515 516 flush = 0; 517 spin_lock(&mm->page_table_lock); 518 page = pfn_to_page(__pa(table) >> PAGE_SHIFT); 519 mp = (struct gmap_pgtable *) page->index; 520 list_for_each_entry_safe(rmap, next, &mp->mapper, list) { 521 *rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID | 522 _SEGMENT_ENTRY_PROTECT); 523 list_del(&rmap->list); 524 kfree(rmap); 525 flush = 1; 526 } 527 spin_unlock(&mm->page_table_lock); 528 if (flush) 529 __tlb_flush_global(); 530 } 531 532 /* 533 * this function is assumed to be called with mmap_sem held 534 */ 535 unsigned long __gmap_fault(unsigned long address, struct gmap *gmap) 536 { 537 unsigned long *segment_ptr, segment; 538 struct gmap_pgtable *mp; 539 struct page *page; 540 int rc; 541 542 current->thread.gmap_addr = address; 543 segment_ptr = gmap_table_walk(address, gmap); 544 if (IS_ERR(segment_ptr)) 545 return -EFAULT; 546 /* Convert the gmap address to an mm address. */ 547 while (1) { 548 segment = *segment_ptr; 549 if (!(segment & _SEGMENT_ENTRY_INVALID)) { 550 /* Page table is present */ 551 page = pfn_to_page(segment >> PAGE_SHIFT); 552 mp = (struct gmap_pgtable *) page->index; 553 return mp->vmaddr | (address & ~PMD_MASK); 554 } 555 if (!(segment & _SEGMENT_ENTRY_PROTECT)) 556 /* Nothing mapped in the gmap address space. */ 557 break; 558 rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap); 559 if (rc) 560 return rc; 561 } 562 return -EFAULT; 563 } 564 565 unsigned long gmap_fault(unsigned long address, struct gmap *gmap) 566 { 567 unsigned long rc; 568 569 down_read(&gmap->mm->mmap_sem); 570 rc = __gmap_fault(address, gmap); 571 up_read(&gmap->mm->mmap_sem); 572 573 return rc; 574 } 575 EXPORT_SYMBOL_GPL(gmap_fault); 576 577 void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap) 578 { 579 580 unsigned long *table, address, size; 581 struct vm_area_struct *vma; 582 struct gmap_pgtable *mp; 583 struct page *page; 584 585 down_read(&gmap->mm->mmap_sem); 586 address = from; 587 while (address < to) { 588 /* Walk the gmap address space page table */ 589 table = gmap->table + ((address >> 53) & 0x7ff); 590 if (unlikely(*table & _REGION_ENTRY_INVALID)) { 591 address = (address + PMD_SIZE) & PMD_MASK; 592 continue; 593 } 594 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 595 table = table + ((address >> 42) & 0x7ff); 596 if (unlikely(*table & _REGION_ENTRY_INVALID)) { 597 address = (address + PMD_SIZE) & PMD_MASK; 598 continue; 599 } 600 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 601 table = table + ((address >> 31) & 0x7ff); 602 if (unlikely(*table & _REGION_ENTRY_INVALID)) { 603 address = (address + PMD_SIZE) & PMD_MASK; 604 continue; 605 } 606 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 607 table = table + ((address >> 20) & 0x7ff); 608 if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) { 609 address = (address + PMD_SIZE) & PMD_MASK; 610 continue; 611 } 612 page = pfn_to_page(*table >> PAGE_SHIFT); 613 mp = (struct gmap_pgtable *) page->index; 614 vma = find_vma(gmap->mm, mp->vmaddr); 615 size = min(to - address, PMD_SIZE - (address & ~PMD_MASK)); 616 zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK), 617 size, NULL); 618 address = (address + PMD_SIZE) & PMD_MASK; 619 } 620 up_read(&gmap->mm->mmap_sem); 621 } 622 EXPORT_SYMBOL_GPL(gmap_discard); 623 624 static LIST_HEAD(gmap_notifier_list); 625 static DEFINE_SPINLOCK(gmap_notifier_lock); 626 627 /** 628 * gmap_register_ipte_notifier - register a pte invalidation callback 629 * @nb: pointer to the gmap notifier block 630 */ 631 void gmap_register_ipte_notifier(struct gmap_notifier *nb) 632 { 633 spin_lock(&gmap_notifier_lock); 634 list_add(&nb->list, &gmap_notifier_list); 635 spin_unlock(&gmap_notifier_lock); 636 } 637 EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); 638 639 /** 640 * gmap_unregister_ipte_notifier - remove a pte invalidation callback 641 * @nb: pointer to the gmap notifier block 642 */ 643 void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) 644 { 645 spin_lock(&gmap_notifier_lock); 646 list_del_init(&nb->list); 647 spin_unlock(&gmap_notifier_lock); 648 } 649 EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); 650 651 /** 652 * gmap_ipte_notify - mark a range of ptes for invalidation notification 653 * @gmap: pointer to guest mapping meta data structure 654 * @address: virtual address in the guest address space 655 * @len: size of area 656 * 657 * Returns 0 if for each page in the given range a gmap mapping exists and 658 * the invalidation notification could be set. If the gmap mapping is missing 659 * for one or more pages -EFAULT is returned. If no memory could be allocated 660 * -ENOMEM is returned. This function establishes missing page table entries. 661 */ 662 int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) 663 { 664 unsigned long addr; 665 spinlock_t *ptl; 666 pte_t *ptep, entry; 667 pgste_t pgste; 668 int rc = 0; 669 670 if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK)) 671 return -EINVAL; 672 down_read(&gmap->mm->mmap_sem); 673 while (len) { 674 /* Convert gmap address and connect the page tables */ 675 addr = __gmap_fault(start, gmap); 676 if (IS_ERR_VALUE(addr)) { 677 rc = addr; 678 break; 679 } 680 /* Get the page mapped */ 681 if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) { 682 rc = -EFAULT; 683 break; 684 } 685 /* Walk the process page table, lock and get pte pointer */ 686 ptep = get_locked_pte(gmap->mm, addr, &ptl); 687 if (unlikely(!ptep)) 688 continue; 689 /* Set notification bit in the pgste of the pte */ 690 entry = *ptep; 691 if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { 692 pgste = pgste_get_lock(ptep); 693 pgste_val(pgste) |= PGSTE_IN_BIT; 694 pgste_set_unlock(ptep, pgste); 695 start += PAGE_SIZE; 696 len -= PAGE_SIZE; 697 } 698 spin_unlock(ptl); 699 } 700 up_read(&gmap->mm->mmap_sem); 701 return rc; 702 } 703 EXPORT_SYMBOL_GPL(gmap_ipte_notify); 704 705 /** 706 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte. 707 * @mm: pointer to the process mm_struct 708 * @addr: virtual address in the process address space 709 * @pte: pointer to the page table entry 710 * 711 * This function is assumed to be called with the page table lock held 712 * for the pte to notify. 713 */ 714 void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte) 715 { 716 unsigned long segment_offset; 717 struct gmap_notifier *nb; 718 struct gmap_pgtable *mp; 719 struct gmap_rmap *rmap; 720 struct page *page; 721 722 segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); 723 segment_offset = segment_offset * (4096 / sizeof(pte_t)); 724 page = pfn_to_page(__pa(pte) >> PAGE_SHIFT); 725 mp = (struct gmap_pgtable *) page->index; 726 spin_lock(&gmap_notifier_lock); 727 list_for_each_entry(rmap, &mp->mapper, list) { 728 list_for_each_entry(nb, &gmap_notifier_list, list) 729 nb->notifier_call(rmap->gmap, 730 rmap->vmaddr + segment_offset); 731 } 732 spin_unlock(&gmap_notifier_lock); 733 } 734 735 static inline int page_table_with_pgste(struct page *page) 736 { 737 return atomic_read(&page->_mapcount) == 0; 738 } 739 740 static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, 741 unsigned long vmaddr) 742 { 743 struct page *page; 744 unsigned long *table; 745 struct gmap_pgtable *mp; 746 747 page = alloc_page(GFP_KERNEL|__GFP_REPEAT); 748 if (!page) 749 return NULL; 750 mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT); 751 if (!mp) { 752 __free_page(page); 753 return NULL; 754 } 755 pgtable_page_ctor(page); 756 mp->vmaddr = vmaddr & PMD_MASK; 757 INIT_LIST_HEAD(&mp->mapper); 758 page->index = (unsigned long) mp; 759 atomic_set(&page->_mapcount, 0); 760 table = (unsigned long *) page_to_phys(page); 761 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); 762 clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT, 763 PAGE_SIZE/2); 764 return table; 765 } 766 767 static inline void page_table_free_pgste(unsigned long *table) 768 { 769 struct page *page; 770 struct gmap_pgtable *mp; 771 772 page = pfn_to_page(__pa(table) >> PAGE_SHIFT); 773 mp = (struct gmap_pgtable *) page->index; 774 BUG_ON(!list_empty(&mp->mapper)); 775 pgtable_page_dtor(page); 776 atomic_set(&page->_mapcount, -1); 777 kfree(mp); 778 __free_page(page); 779 } 780 781 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, 782 unsigned long key, bool nq) 783 { 784 spinlock_t *ptl; 785 pgste_t old, new; 786 pte_t *ptep; 787 788 down_read(&mm->mmap_sem); 789 ptep = get_locked_pte(current->mm, addr, &ptl); 790 if (unlikely(!ptep)) { 791 up_read(&mm->mmap_sem); 792 return -EFAULT; 793 } 794 795 new = old = pgste_get_lock(ptep); 796 pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | 797 PGSTE_ACC_BITS | PGSTE_FP_BIT); 798 pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; 799 pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; 800 if (!(pte_val(*ptep) & _PAGE_INVALID)) { 801 unsigned long address, bits, skey; 802 803 address = pte_val(*ptep) & PAGE_MASK; 804 skey = (unsigned long) page_get_storage_key(address); 805 bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); 806 skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); 807 /* Set storage key ACC and FP */ 808 page_set_storage_key(address, skey, !nq); 809 /* Merge host changed & referenced into pgste */ 810 pgste_val(new) |= bits << 52; 811 } 812 /* changing the guest storage key is considered a change of the page */ 813 if ((pgste_val(new) ^ pgste_val(old)) & 814 (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) 815 pgste_val(new) |= PGSTE_HC_BIT; 816 817 pgste_set_unlock(ptep, new); 818 pte_unmap_unlock(*ptep, ptl); 819 up_read(&mm->mmap_sem); 820 return 0; 821 } 822 EXPORT_SYMBOL(set_guest_storage_key); 823 824 #else /* CONFIG_PGSTE */ 825 826 static inline int page_table_with_pgste(struct page *page) 827 { 828 return 0; 829 } 830 831 static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, 832 unsigned long vmaddr) 833 { 834 return NULL; 835 } 836 837 static inline void page_table_free_pgste(unsigned long *table) 838 { 839 } 840 841 static inline void gmap_disconnect_pgtable(struct mm_struct *mm, 842 unsigned long *table) 843 { 844 } 845 846 #endif /* CONFIG_PGSTE */ 847 848 static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) 849 { 850 unsigned int old, new; 851 852 do { 853 old = atomic_read(v); 854 new = old ^ bits; 855 } while (atomic_cmpxchg(v, old, new) != old); 856 return new; 857 } 858 859 /* 860 * page table entry allocation/free routines. 861 */ 862 unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr) 863 { 864 unsigned long *uninitialized_var(table); 865 struct page *uninitialized_var(page); 866 unsigned int mask, bit; 867 868 if (mm_has_pgste(mm)) 869 return page_table_alloc_pgste(mm, vmaddr); 870 /* Allocate fragments of a 4K page as 1K/2K page table */ 871 spin_lock_bh(&mm->context.list_lock); 872 mask = FRAG_MASK; 873 if (!list_empty(&mm->context.pgtable_list)) { 874 page = list_first_entry(&mm->context.pgtable_list, 875 struct page, lru); 876 table = (unsigned long *) page_to_phys(page); 877 mask = atomic_read(&page->_mapcount); 878 mask = mask | (mask >> 4); 879 } 880 if ((mask & FRAG_MASK) == FRAG_MASK) { 881 spin_unlock_bh(&mm->context.list_lock); 882 page = alloc_page(GFP_KERNEL|__GFP_REPEAT); 883 if (!page) 884 return NULL; 885 pgtable_page_ctor(page); 886 atomic_set(&page->_mapcount, 1); 887 table = (unsigned long *) page_to_phys(page); 888 clear_table(table, _PAGE_INVALID, PAGE_SIZE); 889 spin_lock_bh(&mm->context.list_lock); 890 list_add(&page->lru, &mm->context.pgtable_list); 891 } else { 892 for (bit = 1; mask & bit; bit <<= 1) 893 table += PTRS_PER_PTE; 894 mask = atomic_xor_bits(&page->_mapcount, bit); 895 if ((mask & FRAG_MASK) == FRAG_MASK) 896 list_del(&page->lru); 897 } 898 spin_unlock_bh(&mm->context.list_lock); 899 return table; 900 } 901 902 void page_table_free(struct mm_struct *mm, unsigned long *table) 903 { 904 struct page *page; 905 unsigned int bit, mask; 906 907 page = pfn_to_page(__pa(table) >> PAGE_SHIFT); 908 if (page_table_with_pgste(page)) { 909 gmap_disconnect_pgtable(mm, table); 910 return page_table_free_pgste(table); 911 } 912 /* Free 1K/2K page table fragment of a 4K page */ 913 bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); 914 spin_lock_bh(&mm->context.list_lock); 915 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) 916 list_del(&page->lru); 917 mask = atomic_xor_bits(&page->_mapcount, bit); 918 if (mask & FRAG_MASK) 919 list_add(&page->lru, &mm->context.pgtable_list); 920 spin_unlock_bh(&mm->context.list_lock); 921 if (mask == 0) { 922 pgtable_page_dtor(page); 923 atomic_set(&page->_mapcount, -1); 924 __free_page(page); 925 } 926 } 927 928 static void __page_table_free_rcu(void *table, unsigned bit) 929 { 930 struct page *page; 931 932 if (bit == FRAG_MASK) 933 return page_table_free_pgste(table); 934 /* Free 1K/2K page table fragment of a 4K page */ 935 page = pfn_to_page(__pa(table) >> PAGE_SHIFT); 936 if (atomic_xor_bits(&page->_mapcount, bit) == 0) { 937 pgtable_page_dtor(page); 938 atomic_set(&page->_mapcount, -1); 939 __free_page(page); 940 } 941 } 942 943 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table) 944 { 945 struct mm_struct *mm; 946 struct page *page; 947 unsigned int bit, mask; 948 949 mm = tlb->mm; 950 page = pfn_to_page(__pa(table) >> PAGE_SHIFT); 951 if (page_table_with_pgste(page)) { 952 gmap_disconnect_pgtable(mm, table); 953 table = (unsigned long *) (__pa(table) | FRAG_MASK); 954 tlb_remove_table(tlb, table); 955 return; 956 } 957 bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t))); 958 spin_lock_bh(&mm->context.list_lock); 959 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) 960 list_del(&page->lru); 961 mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4)); 962 if (mask & FRAG_MASK) 963 list_add_tail(&page->lru, &mm->context.pgtable_list); 964 spin_unlock_bh(&mm->context.list_lock); 965 table = (unsigned long *) (__pa(table) | (bit << 4)); 966 tlb_remove_table(tlb, table); 967 } 968 969 void __tlb_remove_table(void *_table) 970 { 971 const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK; 972 void *table = (void *)((unsigned long) _table & ~mask); 973 unsigned type = (unsigned long) _table & mask; 974 975 if (type) 976 __page_table_free_rcu(table, type); 977 else 978 free_pages((unsigned long) table, ALLOC_ORDER); 979 } 980 981 static void tlb_remove_table_smp_sync(void *arg) 982 { 983 /* Simply deliver the interrupt */ 984 } 985 986 static void tlb_remove_table_one(void *table) 987 { 988 /* 989 * This isn't an RCU grace period and hence the page-tables cannot be 990 * assumed to be actually RCU-freed. 991 * 992 * It is however sufficient for software page-table walkers that rely 993 * on IRQ disabling. See the comment near struct mmu_table_batch. 994 */ 995 smp_call_function(tlb_remove_table_smp_sync, NULL, 1); 996 __tlb_remove_table(table); 997 } 998 999 static void tlb_remove_table_rcu(struct rcu_head *head) 1000 { 1001 struct mmu_table_batch *batch; 1002 int i; 1003 1004 batch = container_of(head, struct mmu_table_batch, rcu); 1005 1006 for (i = 0; i < batch->nr; i++) 1007 __tlb_remove_table(batch->tables[i]); 1008 1009 free_page((unsigned long)batch); 1010 } 1011 1012 void tlb_table_flush(struct mmu_gather *tlb) 1013 { 1014 struct mmu_table_batch **batch = &tlb->batch; 1015 1016 if (*batch) { 1017 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); 1018 *batch = NULL; 1019 } 1020 } 1021 1022 void tlb_remove_table(struct mmu_gather *tlb, void *table) 1023 { 1024 struct mmu_table_batch **batch = &tlb->batch; 1025 1026 tlb->mm->context.flush_mm = 1; 1027 if (*batch == NULL) { 1028 *batch = (struct mmu_table_batch *) 1029 __get_free_page(GFP_NOWAIT | __GFP_NOWARN); 1030 if (*batch == NULL) { 1031 __tlb_flush_mm_lazy(tlb->mm); 1032 tlb_remove_table_one(table); 1033 return; 1034 } 1035 (*batch)->nr = 0; 1036 } 1037 (*batch)->tables[(*batch)->nr++] = table; 1038 if ((*batch)->nr == MAX_TABLE_BATCH) 1039 tlb_flush_mmu(tlb); 1040 } 1041 1042 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1043 static inline void thp_split_vma(struct vm_area_struct *vma) 1044 { 1045 unsigned long addr; 1046 1047 for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) 1048 follow_page(vma, addr, FOLL_SPLIT); 1049 } 1050 1051 static inline void thp_split_mm(struct mm_struct *mm) 1052 { 1053 struct vm_area_struct *vma; 1054 1055 for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { 1056 thp_split_vma(vma); 1057 vma->vm_flags &= ~VM_HUGEPAGE; 1058 vma->vm_flags |= VM_NOHUGEPAGE; 1059 } 1060 mm->def_flags |= VM_NOHUGEPAGE; 1061 } 1062 #else 1063 static inline void thp_split_mm(struct mm_struct *mm) 1064 { 1065 } 1066 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1067 1068 static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb, 1069 struct mm_struct *mm, pud_t *pud, 1070 unsigned long addr, unsigned long end) 1071 { 1072 unsigned long next, *table, *new; 1073 struct page *page; 1074 pmd_t *pmd; 1075 1076 pmd = pmd_offset(pud, addr); 1077 do { 1078 next = pmd_addr_end(addr, end); 1079 again: 1080 if (pmd_none_or_clear_bad(pmd)) 1081 continue; 1082 table = (unsigned long *) pmd_deref(*pmd); 1083 page = pfn_to_page(__pa(table) >> PAGE_SHIFT); 1084 if (page_table_with_pgste(page)) 1085 continue; 1086 /* Allocate new page table with pgstes */ 1087 new = page_table_alloc_pgste(mm, addr); 1088 if (!new) { 1089 mm->context.has_pgste = 0; 1090 continue; 1091 } 1092 spin_lock(&mm->page_table_lock); 1093 if (likely((unsigned long *) pmd_deref(*pmd) == table)) { 1094 /* Nuke pmd entry pointing to the "short" page table */ 1095 pmdp_flush_lazy(mm, addr, pmd); 1096 pmd_clear(pmd); 1097 /* Copy ptes from old table to new table */ 1098 memcpy(new, table, PAGE_SIZE/2); 1099 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); 1100 /* Establish new table */ 1101 pmd_populate(mm, pmd, (pte_t *) new); 1102 /* Free old table with rcu, there might be a walker! */ 1103 page_table_free_rcu(tlb, table); 1104 new = NULL; 1105 } 1106 spin_unlock(&mm->page_table_lock); 1107 if (new) { 1108 page_table_free_pgste(new); 1109 goto again; 1110 } 1111 } while (pmd++, addr = next, addr != end); 1112 1113 return addr; 1114 } 1115 1116 static unsigned long page_table_realloc_pud(struct mmu_gather *tlb, 1117 struct mm_struct *mm, pgd_t *pgd, 1118 unsigned long addr, unsigned long end) 1119 { 1120 unsigned long next; 1121 pud_t *pud; 1122 1123 pud = pud_offset(pgd, addr); 1124 do { 1125 next = pud_addr_end(addr, end); 1126 if (pud_none_or_clear_bad(pud)) 1127 continue; 1128 next = page_table_realloc_pmd(tlb, mm, pud, addr, next); 1129 } while (pud++, addr = next, addr != end); 1130 1131 return addr; 1132 } 1133 1134 static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm, 1135 unsigned long addr, unsigned long end) 1136 { 1137 unsigned long next; 1138 pgd_t *pgd; 1139 1140 pgd = pgd_offset(mm, addr); 1141 do { 1142 next = pgd_addr_end(addr, end); 1143 if (pgd_none_or_clear_bad(pgd)) 1144 continue; 1145 next = page_table_realloc_pud(tlb, mm, pgd, addr, next); 1146 } while (pgd++, addr = next, addr != end); 1147 } 1148 1149 /* 1150 * switch on pgstes for its userspace process (for kvm) 1151 */ 1152 int s390_enable_sie(void) 1153 { 1154 struct task_struct *tsk = current; 1155 struct mm_struct *mm = tsk->mm; 1156 struct mmu_gather tlb; 1157 1158 /* Do we have switched amode? If no, we cannot do sie */ 1159 if (s390_user_mode == HOME_SPACE_MODE) 1160 return -EINVAL; 1161 1162 /* Do we have pgstes? if yes, we are done */ 1163 if (mm_has_pgste(tsk->mm)) 1164 return 0; 1165 1166 down_write(&mm->mmap_sem); 1167 /* split thp mappings and disable thp for future mappings */ 1168 thp_split_mm(mm); 1169 /* Reallocate the page tables with pgstes */ 1170 mm->context.has_pgste = 1; 1171 tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE); 1172 page_table_realloc(&tlb, mm, 0, TASK_SIZE); 1173 tlb_finish_mmu(&tlb, 0, TASK_SIZE); 1174 up_write(&mm->mmap_sem); 1175 return mm->context.has_pgste ? 0 : -ENOMEM; 1176 } 1177 EXPORT_SYMBOL_GPL(s390_enable_sie); 1178 1179 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1180 int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, 1181 pmd_t *pmdp) 1182 { 1183 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1184 /* No need to flush TLB 1185 * On s390 reference bits are in storage key and never in TLB */ 1186 return pmdp_test_and_clear_young(vma, address, pmdp); 1187 } 1188 1189 int pmdp_set_access_flags(struct vm_area_struct *vma, 1190 unsigned long address, pmd_t *pmdp, 1191 pmd_t entry, int dirty) 1192 { 1193 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1194 1195 if (pmd_same(*pmdp, entry)) 1196 return 0; 1197 pmdp_invalidate(vma, address, pmdp); 1198 set_pmd_at(vma->vm_mm, address, pmdp, entry); 1199 return 1; 1200 } 1201 1202 static void pmdp_splitting_flush_sync(void *arg) 1203 { 1204 /* Simply deliver the interrupt */ 1205 } 1206 1207 void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, 1208 pmd_t *pmdp) 1209 { 1210 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1211 if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT, 1212 (unsigned long *) pmdp)) { 1213 /* need to serialize against gup-fast (IRQ disabled) */ 1214 smp_call_function(pmdp_splitting_flush_sync, NULL, 1); 1215 } 1216 } 1217 1218 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 1219 pgtable_t pgtable) 1220 { 1221 struct list_head *lh = (struct list_head *) pgtable; 1222 1223 assert_spin_locked(&mm->page_table_lock); 1224 1225 /* FIFO */ 1226 if (!mm->pmd_huge_pte) 1227 INIT_LIST_HEAD(lh); 1228 else 1229 list_add(lh, (struct list_head *) mm->pmd_huge_pte); 1230 mm->pmd_huge_pte = pgtable; 1231 } 1232 1233 pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) 1234 { 1235 struct list_head *lh; 1236 pgtable_t pgtable; 1237 pte_t *ptep; 1238 1239 assert_spin_locked(&mm->page_table_lock); 1240 1241 /* FIFO */ 1242 pgtable = mm->pmd_huge_pte; 1243 lh = (struct list_head *) pgtable; 1244 if (list_empty(lh)) 1245 mm->pmd_huge_pte = NULL; 1246 else { 1247 mm->pmd_huge_pte = (pgtable_t) lh->next; 1248 list_del(lh); 1249 } 1250 ptep = (pte_t *) pgtable; 1251 pte_val(*ptep) = _PAGE_INVALID; 1252 ptep++; 1253 pte_val(*ptep) = _PAGE_INVALID; 1254 return pgtable; 1255 } 1256 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1257