/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
}
#endif

#ifdef CONFIG_PGSTE
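/*
 * Guest address space (gmap) support.
 *
 * A gmap owns a private region-1 table; guest addresses are resolved
 * through the usual four table levels (region-1 index at bit 53,
 * region-2 at bit 42, region-3 at bit 31, segment at bit 20, each
 * masked with 0x7ff).  A valid guest segment entry points at a page
 * table of the parent mm; an entry created by gmap_map_segment() is
 * kept invalid (_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO) and carries
 * the parent address until __gmap_fault() resolves it on first use.
 *
 * A rough usage sketch (hypothetical addresses, error handling left
 * out):
 *
 *	struct gmap *gmap = gmap_alloc(current->mm);
 *	gmap_map_segment(gmap, parent_addr, guest_addr, PMD_SIZE);
 *	gmap_enable(gmap);
 *	... resolve guest faults via gmap_fault(guest_addr, gmap) ...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 */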
/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
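/*
 * Guest fault resolution: a valid guest segment entry already points
 * at a parent page table; its page->index refers to a gmap_pgtable
 * holding the parent address, so the fault can be answered directly.
 * An invalid entry created by gmap_map_segment() carries the parent
 * address instead; in that case the parent page tables are walked
 * (and allocated if necessary), a gmap_rmap link is recorded in the
 * page table's mapper list, and the guest segment entry is switched
 * over to point at the parent page table.
 */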
/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, vmaddr, segment;
	struct mm_struct *mm;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct vm_area_struct *vma;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	current->thread.gmap_addr = address;
	mm = gmap->mm;
	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);

	/* Convert the gmap address to an mm address. */
	segment = *table;
	if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		vma = find_vma(mm, vmaddr);
		if (!vma || vma->vm_start > vmaddr)
			return -EFAULT;

		/* Walk the parent mm page table */
		pgd = pgd_offset(mm, vmaddr);
		pud = pud_alloc(mm, pgd, vmaddr);
		if (!pud)
			return -ENOMEM;
		pmd = pmd_alloc(mm, pud, vmaddr);
		if (!pmd)
			return -ENOMEM;
		if (!pmd_present(*pmd) &&
		    __pte_alloc(mm, vma, pmd, vmaddr))
			return -ENOMEM;
		/* pmd now points to a valid segment table entry. */
		rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
		if (!rmap)
			return -ENOMEM;
		/* Link gmap segment table entry location to page table. */
		page = pmd_page(*pmd);
		mp = (struct gmap_pgtable *) page->index;
		rmap->entry = table;
		spin_lock(&mm->page_table_lock);
		list_add(&rmap->list, &mp->mapper);
		spin_unlock(&mm->page_table_lock);
		/* Set gmap segment table entry to page table. */
		*table = pmd_val(*pmd) & PAGE_MASK;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
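/**
 * gmap_discard - zap the parent pages backing a guest address range
 * @from: start address in the guest address space
 * @to: end address in the guest address space
 * @gmap: pointer to the guest address space structure
 *
 * Guest segments that are not mapped are simply skipped.
 */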
void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}
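/*
 * A pgste page table occupies a full 4K page: the lower half holds
 * the 256 pte entries, the upper half the corresponding pgste slots.
 * page->index points to a gmap_pgtable carrying the parent address
 * and the mapper list of gmap segment entries that reference this
 * page table; page->_mapcount is non-zero while the table is in use
 * and reset to -1 when it is freed again.
 */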
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
				       unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}
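/*
 * page->_mapcount keeps one bit per 1K/2K fragment in the low nibble
 * (allocated) and one in the next nibble (pending delayed free).  When
 * a table is handed to tlb_remove_table() the fragment bit, shifted by
 * four, is encoded into the low bits of the table address so that
 * __tlb_remove_table() can tell page table fragments (non-zero type)
 * apart from full CRST tables (type zero); pgste page tables are
 * tagged with FRAG_MASK itself.
 */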
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		__tlb_flush_mm(tlb->mm);
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}
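/*
 * The existing page tables of the process were allocated without
 * pgstes and are not converted in place.  Instead the mm is duplicated
 * with alloc_pgste set, so that dup_mm() rebuilds all page tables in
 * the pgste format, and the (single-threaded) task is switched over to
 * the copy; the checks below make sure nobody else is using the old mm.
 */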
/*
 * switch on pgstes for the userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have a switched amode? If not, we cannot do sie */
	if (s390_user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* Let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* We copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	/* make sure that both mms have a correct rss state */
	sync_mm_rss(tsk->mm);
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_DEBUG_PAGEALLOC && CONFIG_HIBERNATION */