1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright IBM Corp. 2007, 2011 4 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/kernel.h> 9 #include <linux/errno.h> 10 #include <linux/gfp.h> 11 #include <linux/mm.h> 12 #include <linux/swap.h> 13 #include <linux/smp.h> 14 #include <linux/spinlock.h> 15 #include <linux/rcupdate.h> 16 #include <linux/slab.h> 17 #include <linux/swapops.h> 18 #include <linux/sysctl.h> 19 #include <linux/ksm.h> 20 #include <linux/mman.h> 21 22 #include <asm/tlb.h> 23 #include <asm/tlbflush.h> 24 #include <asm/mmu_context.h> 25 #include <asm/page-states.h> 26 27 pgprot_t pgprot_writecombine(pgprot_t prot) 28 { 29 /* 30 * mio_wb_bit_mask may be set on a different CPU, but it is only set 31 * once at init and only read afterwards. 32 */ 33 return __pgprot(pgprot_val(prot) | mio_wb_bit_mask); 34 } 35 EXPORT_SYMBOL_GPL(pgprot_writecombine); 36 37 pgprot_t pgprot_writethrough(pgprot_t prot) 38 { 39 /* 40 * mio_wb_bit_mask may be set on a different CPU, but it is only set 41 * once at init and only read afterwards. 42 */ 43 return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); 44 } 45 EXPORT_SYMBOL_GPL(pgprot_writethrough); 46 47 static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, 48 pte_t *ptep, int nodat) 49 { 50 unsigned long opt, asce; 51 52 if (MACHINE_HAS_TLB_GUEST) { 53 opt = 0; 54 asce = READ_ONCE(mm->context.gmap_asce); 55 if (asce == 0UL || nodat) 56 opt |= IPTE_NODAT; 57 if (asce != -1UL) { 58 asce = asce ? : mm->context.asce; 59 opt |= IPTE_GUEST_ASCE; 60 } 61 __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); 62 } else { 63 __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); 64 } 65 } 66 67 static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, 68 pte_t *ptep, int nodat) 69 { 70 unsigned long opt, asce; 71 72 if (MACHINE_HAS_TLB_GUEST) { 73 opt = 0; 74 asce = READ_ONCE(mm->context.gmap_asce); 75 if (asce == 0UL || nodat) 76 opt |= IPTE_NODAT; 77 if (asce != -1UL) { 78 asce = asce ? : mm->context.asce; 79 opt |= IPTE_GUEST_ASCE; 80 } 81 __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); 82 } else { 83 __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); 84 } 85 } 86 87 static inline pte_t ptep_flush_direct(struct mm_struct *mm, 88 unsigned long addr, pte_t *ptep, 89 int nodat) 90 { 91 pte_t old; 92 93 old = *ptep; 94 if (unlikely(pte_val(old) & _PAGE_INVALID)) 95 return old; 96 atomic_inc(&mm->context.flush_count); 97 if (MACHINE_HAS_TLB_LC && 98 cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) 99 ptep_ipte_local(mm, addr, ptep, nodat); 100 else 101 ptep_ipte_global(mm, addr, ptep, nodat); 102 atomic_dec(&mm->context.flush_count); 103 return old; 104 } 105 106 static inline pte_t ptep_flush_lazy(struct mm_struct *mm, 107 unsigned long addr, pte_t *ptep, 108 int nodat) 109 { 110 pte_t old; 111 112 old = *ptep; 113 if (unlikely(pte_val(old) & _PAGE_INVALID)) 114 return old; 115 atomic_inc(&mm->context.flush_count); 116 if (cpumask_equal(&mm->context.cpu_attach_mask, 117 cpumask_of(smp_processor_id()))) { 118 set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID))); 119 mm->context.flush_mm = 1; 120 } else 121 ptep_ipte_global(mm, addr, ptep, nodat); 122 atomic_dec(&mm->context.flush_count); 123 return old; 124 } 125 126 static inline pgste_t pgste_get_lock(pte_t *ptep) 127 { 128 unsigned long value = 0; 129 #ifdef CONFIG_PGSTE 130 unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); 131 132 do { 133 value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); 134 } while (value & PGSTE_PCL_BIT); 135 value |= PGSTE_PCL_BIT; 136 #endif 137 return __pgste(value); 138 } 139 140 static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) 141 { 142 #ifdef CONFIG_PGSTE 143 barrier(); 144 WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); 145 #endif 146 } 147 148 static inline pgste_t pgste_get(pte_t *ptep) 149 { 150 unsigned long pgste = 0; 151 #ifdef CONFIG_PGSTE 152 pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); 153 #endif 154 return __pgste(pgste); 155 } 156 157 static inline void pgste_set(pte_t *ptep, pgste_t pgste) 158 { 159 #ifdef CONFIG_PGSTE 160 *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; 161 #endif 162 } 163 164 static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, 165 struct mm_struct *mm) 166 { 167 #ifdef CONFIG_PGSTE 168 unsigned long address, bits, skey; 169 170 if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID) 171 return pgste; 172 address = pte_val(pte) & PAGE_MASK; 173 skey = (unsigned long) page_get_storage_key(address); 174 bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); 175 /* Transfer page changed & referenced bit to guest bits in pgste */ 176 pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ 177 /* Copy page access key and fetch protection bit to pgste */ 178 pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); 179 pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; 180 #endif 181 return pgste; 182 183 } 184 185 static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, 186 struct mm_struct *mm) 187 { 188 #ifdef CONFIG_PGSTE 189 unsigned long address; 190 unsigned long nkey; 191 192 if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID) 193 return; 194 VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); 195 address = pte_val(entry) & PAGE_MASK; 196 /* 197 * Set page access key and fetch protection bit from pgste. 198 * The guest C/R information is still in the PGSTE, set real 199 * key C/R to 0. 200 */ 201 nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; 202 nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; 203 page_set_storage_key(address, nkey, 0); 204 #endif 205 } 206 207 static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) 208 { 209 #ifdef CONFIG_PGSTE 210 if ((pte_val(entry) & _PAGE_PRESENT) && 211 (pte_val(entry) & _PAGE_WRITE) && 212 !(pte_val(entry) & _PAGE_INVALID)) { 213 if (!MACHINE_HAS_ESOP) { 214 /* 215 * Without enhanced suppression-on-protection force 216 * the dirty bit on for all writable ptes. 217 */ 218 entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); 219 entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); 220 } 221 if (!(pte_val(entry) & _PAGE_PROTECT)) 222 /* This pte allows write access, set user-dirty */ 223 pgste_val(pgste) |= PGSTE_UC_BIT; 224 } 225 #endif 226 set_pte(ptep, entry); 227 return pgste; 228 } 229 230 static inline pgste_t pgste_pte_notify(struct mm_struct *mm, 231 unsigned long addr, 232 pte_t *ptep, pgste_t pgste) 233 { 234 #ifdef CONFIG_PGSTE 235 unsigned long bits; 236 237 bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); 238 if (bits) { 239 pgste_val(pgste) ^= bits; 240 ptep_notify(mm, addr, ptep, bits); 241 } 242 #endif 243 return pgste; 244 } 245 246 static inline pgste_t ptep_xchg_start(struct mm_struct *mm, 247 unsigned long addr, pte_t *ptep) 248 { 249 pgste_t pgste = __pgste(0); 250 251 if (mm_has_pgste(mm)) { 252 pgste = pgste_get_lock(ptep); 253 pgste = pgste_pte_notify(mm, addr, ptep, pgste); 254 } 255 return pgste; 256 } 257 258 static inline pte_t ptep_xchg_commit(struct mm_struct *mm, 259 unsigned long addr, pte_t *ptep, 260 pgste_t pgste, pte_t old, pte_t new) 261 { 262 if (mm_has_pgste(mm)) { 263 if (pte_val(old) & _PAGE_INVALID) 264 pgste_set_key(ptep, pgste, new, mm); 265 if (pte_val(new) & _PAGE_INVALID) { 266 pgste = pgste_update_all(old, pgste, mm); 267 if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == 268 _PGSTE_GPS_USAGE_UNUSED) 269 old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); 270 } 271 pgste = pgste_set_pte(ptep, pgste, new); 272 pgste_set_unlock(ptep, pgste); 273 } else { 274 set_pte(ptep, new); 275 } 276 return old; 277 } 278 279 pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, 280 pte_t *ptep, pte_t new) 281 { 282 pgste_t pgste; 283 pte_t old; 284 int nodat; 285 286 preempt_disable(); 287 pgste = ptep_xchg_start(mm, addr, ptep); 288 nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); 289 old = ptep_flush_direct(mm, addr, ptep, nodat); 290 old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); 291 preempt_enable(); 292 return old; 293 } 294 EXPORT_SYMBOL(ptep_xchg_direct); 295 296 /* 297 * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that 298 * RDP can be used instead of IPTE. See also comments at pte_allow_rdp(). 299 */ 300 void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, 301 pte_t new) 302 { 303 preempt_disable(); 304 atomic_inc(&mm->context.flush_count); 305 if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) 306 __ptep_rdp(addr, ptep, 0, 0, 1); 307 else 308 __ptep_rdp(addr, ptep, 0, 0, 0); 309 /* 310 * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That 311 * means it is still valid and active, and must not be changed according 312 * to the architecture. But writing a new value that only differs in SW 313 * bits is allowed. 314 */ 315 set_pte(ptep, new); 316 atomic_dec(&mm->context.flush_count); 317 preempt_enable(); 318 } 319 EXPORT_SYMBOL(ptep_reset_dat_prot); 320 321 pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, 322 pte_t *ptep, pte_t new) 323 { 324 pgste_t pgste; 325 pte_t old; 326 int nodat; 327 328 preempt_disable(); 329 pgste = ptep_xchg_start(mm, addr, ptep); 330 nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); 331 old = ptep_flush_lazy(mm, addr, ptep, nodat); 332 old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); 333 preempt_enable(); 334 return old; 335 } 336 EXPORT_SYMBOL(ptep_xchg_lazy); 337 338 pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, 339 pte_t *ptep) 340 { 341 pgste_t pgste; 342 pte_t old; 343 int nodat; 344 struct mm_struct *mm = vma->vm_mm; 345 346 preempt_disable(); 347 pgste = ptep_xchg_start(mm, addr, ptep); 348 nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); 349 old = ptep_flush_lazy(mm, addr, ptep, nodat); 350 if (mm_has_pgste(mm)) { 351 pgste = pgste_update_all(old, pgste, mm); 352 pgste_set(ptep, pgste); 353 } 354 return old; 355 } 356 357 void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, 358 pte_t *ptep, pte_t old_pte, pte_t pte) 359 { 360 pgste_t pgste; 361 struct mm_struct *mm = vma->vm_mm; 362 363 if (mm_has_pgste(mm)) { 364 pgste = pgste_get(ptep); 365 pgste_set_key(ptep, pgste, pte, mm); 366 pgste = pgste_set_pte(ptep, pgste, pte); 367 pgste_set_unlock(ptep, pgste); 368 } else { 369 set_pte(ptep, pte); 370 } 371 preempt_enable(); 372 } 373 374 static inline void pmdp_idte_local(struct mm_struct *mm, 375 unsigned long addr, pmd_t *pmdp) 376 { 377 if (MACHINE_HAS_TLB_GUEST) 378 __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, 379 mm->context.asce, IDTE_LOCAL); 380 else 381 __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); 382 if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) 383 gmap_pmdp_idte_local(mm, addr); 384 } 385 386 static inline void pmdp_idte_global(struct mm_struct *mm, 387 unsigned long addr, pmd_t *pmdp) 388 { 389 if (MACHINE_HAS_TLB_GUEST) { 390 __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, 391 mm->context.asce, IDTE_GLOBAL); 392 if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) 393 gmap_pmdp_idte_global(mm, addr); 394 } else if (MACHINE_HAS_IDTE) { 395 __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); 396 if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) 397 gmap_pmdp_idte_global(mm, addr); 398 } else { 399 __pmdp_csp(pmdp); 400 if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) 401 gmap_pmdp_csp(mm, addr); 402 } 403 } 404 405 static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, 406 unsigned long addr, pmd_t *pmdp) 407 { 408 pmd_t old; 409 410 old = *pmdp; 411 if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) 412 return old; 413 atomic_inc(&mm->context.flush_count); 414 if (MACHINE_HAS_TLB_LC && 415 cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) 416 pmdp_idte_local(mm, addr, pmdp); 417 else 418 pmdp_idte_global(mm, addr, pmdp); 419 atomic_dec(&mm->context.flush_count); 420 return old; 421 } 422 423 static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, 424 unsigned long addr, pmd_t *pmdp) 425 { 426 pmd_t old; 427 428 old = *pmdp; 429 if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) 430 return old; 431 atomic_inc(&mm->context.flush_count); 432 if (cpumask_equal(&mm->context.cpu_attach_mask, 433 cpumask_of(smp_processor_id()))) { 434 set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); 435 mm->context.flush_mm = 1; 436 if (mm_has_pgste(mm)) 437 gmap_pmdp_invalidate(mm, addr); 438 } else { 439 pmdp_idte_global(mm, addr, pmdp); 440 } 441 atomic_dec(&mm->context.flush_count); 442 return old; 443 } 444 445 #ifdef CONFIG_PGSTE 446 static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) 447 { 448 struct vm_area_struct *vma; 449 pgd_t *pgd; 450 p4d_t *p4d; 451 pud_t *pud; 452 453 /* We need a valid VMA, otherwise this is clearly a fault. */ 454 vma = vma_lookup(mm, addr); 455 if (!vma) 456 return -EFAULT; 457 458 pgd = pgd_offset(mm, addr); 459 if (!pgd_present(*pgd)) 460 return -ENOENT; 461 462 p4d = p4d_offset(pgd, addr); 463 if (!p4d_present(*p4d)) 464 return -ENOENT; 465 466 pud = pud_offset(p4d, addr); 467 if (!pud_present(*pud)) 468 return -ENOENT; 469 470 /* Large PUDs are not supported yet. */ 471 if (pud_leaf(*pud)) 472 return -EFAULT; 473 474 *pmdp = pmd_offset(pud, addr); 475 return 0; 476 } 477 #endif 478 479 pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, 480 pmd_t *pmdp, pmd_t new) 481 { 482 pmd_t old; 483 484 preempt_disable(); 485 old = pmdp_flush_direct(mm, addr, pmdp); 486 set_pmd(pmdp, new); 487 preempt_enable(); 488 return old; 489 } 490 EXPORT_SYMBOL(pmdp_xchg_direct); 491 492 pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, 493 pmd_t *pmdp, pmd_t new) 494 { 495 pmd_t old; 496 497 preempt_disable(); 498 old = pmdp_flush_lazy(mm, addr, pmdp); 499 set_pmd(pmdp, new); 500 preempt_enable(); 501 return old; 502 } 503 EXPORT_SYMBOL(pmdp_xchg_lazy); 504 505 static inline void pudp_idte_local(struct mm_struct *mm, 506 unsigned long addr, pud_t *pudp) 507 { 508 if (MACHINE_HAS_TLB_GUEST) 509 __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, 510 mm->context.asce, IDTE_LOCAL); 511 else 512 __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); 513 } 514 515 static inline void pudp_idte_global(struct mm_struct *mm, 516 unsigned long addr, pud_t *pudp) 517 { 518 if (MACHINE_HAS_TLB_GUEST) 519 __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, 520 mm->context.asce, IDTE_GLOBAL); 521 else if (MACHINE_HAS_IDTE) 522 __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); 523 else 524 /* 525 * Invalid bit position is the same for pmd and pud, so we can 526 * reuse _pmd_csp() here 527 */ 528 __pmdp_csp((pmd_t *) pudp); 529 } 530 531 static inline pud_t pudp_flush_direct(struct mm_struct *mm, 532 unsigned long addr, pud_t *pudp) 533 { 534 pud_t old; 535 536 old = *pudp; 537 if (pud_val(old) & _REGION_ENTRY_INVALID) 538 return old; 539 atomic_inc(&mm->context.flush_count); 540 if (MACHINE_HAS_TLB_LC && 541 cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) 542 pudp_idte_local(mm, addr, pudp); 543 else 544 pudp_idte_global(mm, addr, pudp); 545 atomic_dec(&mm->context.flush_count); 546 return old; 547 } 548 549 pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, 550 pud_t *pudp, pud_t new) 551 { 552 pud_t old; 553 554 preempt_disable(); 555 old = pudp_flush_direct(mm, addr, pudp); 556 set_pud(pudp, new); 557 preempt_enable(); 558 return old; 559 } 560 EXPORT_SYMBOL(pudp_xchg_direct); 561 562 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 563 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 564 pgtable_t pgtable) 565 { 566 struct list_head *lh = (struct list_head *) pgtable; 567 568 assert_spin_locked(pmd_lockptr(mm, pmdp)); 569 570 /* FIFO */ 571 if (!pmd_huge_pte(mm, pmdp)) 572 INIT_LIST_HEAD(lh); 573 else 574 list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); 575 pmd_huge_pte(mm, pmdp) = pgtable; 576 } 577 578 pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) 579 { 580 struct list_head *lh; 581 pgtable_t pgtable; 582 pte_t *ptep; 583 584 assert_spin_locked(pmd_lockptr(mm, pmdp)); 585 586 /* FIFO */ 587 pgtable = pmd_huge_pte(mm, pmdp); 588 lh = (struct list_head *) pgtable; 589 if (list_empty(lh)) 590 pmd_huge_pte(mm, pmdp) = NULL; 591 else { 592 pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; 593 list_del(lh); 594 } 595 ptep = (pte_t *) pgtable; 596 set_pte(ptep, __pte(_PAGE_INVALID)); 597 ptep++; 598 set_pte(ptep, __pte(_PAGE_INVALID)); 599 return pgtable; 600 } 601 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 602 603 #ifdef CONFIG_PGSTE 604 void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, 605 pte_t *ptep, pte_t entry) 606 { 607 pgste_t pgste; 608 609 /* the mm_has_pgste() check is done in set_pte_at() */ 610 preempt_disable(); 611 pgste = pgste_get_lock(ptep); 612 pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; 613 pgste_set_key(ptep, pgste, entry, mm); 614 pgste = pgste_set_pte(ptep, pgste, entry); 615 pgste_set_unlock(ptep, pgste); 616 preempt_enable(); 617 } 618 619 void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 620 { 621 pgste_t pgste; 622 623 preempt_disable(); 624 pgste = pgste_get_lock(ptep); 625 pgste_val(pgste) |= PGSTE_IN_BIT; 626 pgste_set_unlock(ptep, pgste); 627 preempt_enable(); 628 } 629 630 /** 631 * ptep_force_prot - change access rights of a locked pte 632 * @mm: pointer to the process mm_struct 633 * @addr: virtual address in the guest address space 634 * @ptep: pointer to the page table entry 635 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE 636 * @bit: pgste bit to set (e.g. for notification) 637 * 638 * Returns 0 if the access rights were changed and -EAGAIN if the current 639 * and requested access rights are incompatible. 640 */ 641 int ptep_force_prot(struct mm_struct *mm, unsigned long addr, 642 pte_t *ptep, int prot, unsigned long bit) 643 { 644 pte_t entry; 645 pgste_t pgste; 646 int pte_i, pte_p, nodat; 647 648 pgste = pgste_get_lock(ptep); 649 entry = *ptep; 650 /* Check pte entry after all locks have been acquired */ 651 pte_i = pte_val(entry) & _PAGE_INVALID; 652 pte_p = pte_val(entry) & _PAGE_PROTECT; 653 if ((pte_i && (prot != PROT_NONE)) || 654 (pte_p && (prot & PROT_WRITE))) { 655 pgste_set_unlock(ptep, pgste); 656 return -EAGAIN; 657 } 658 /* Change access rights and set pgste bit */ 659 nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); 660 if (prot == PROT_NONE && !pte_i) { 661 ptep_flush_direct(mm, addr, ptep, nodat); 662 pgste = pgste_update_all(entry, pgste, mm); 663 entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); 664 } 665 if (prot == PROT_READ && !pte_p) { 666 ptep_flush_direct(mm, addr, ptep, nodat); 667 entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); 668 entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); 669 } 670 pgste_val(pgste) |= bit; 671 pgste = pgste_set_pte(ptep, pgste, entry); 672 pgste_set_unlock(ptep, pgste); 673 return 0; 674 } 675 676 int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, 677 pte_t *sptep, pte_t *tptep, pte_t pte) 678 { 679 pgste_t spgste, tpgste; 680 pte_t spte, tpte; 681 int rc = -EAGAIN; 682 683 if (!(pte_val(*tptep) & _PAGE_INVALID)) 684 return 0; /* already shadowed */ 685 spgste = pgste_get_lock(sptep); 686 spte = *sptep; 687 if (!(pte_val(spte) & _PAGE_INVALID) && 688 !((pte_val(spte) & _PAGE_PROTECT) && 689 !(pte_val(pte) & _PAGE_PROTECT))) { 690 pgste_val(spgste) |= PGSTE_VSIE_BIT; 691 tpgste = pgste_get_lock(tptep); 692 tpte = __pte((pte_val(spte) & PAGE_MASK) | 693 (pte_val(pte) & _PAGE_PROTECT)); 694 /* don't touch the storage key - it belongs to parent pgste */ 695 tpgste = pgste_set_pte(tptep, tpgste, tpte); 696 pgste_set_unlock(tptep, tpgste); 697 rc = 1; 698 } 699 pgste_set_unlock(sptep, spgste); 700 return rc; 701 } 702 703 void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) 704 { 705 pgste_t pgste; 706 int nodat; 707 708 pgste = pgste_get_lock(ptep); 709 /* notifier is called by the caller */ 710 nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); 711 ptep_flush_direct(mm, saddr, ptep, nodat); 712 /* don't touch the storage key - it belongs to parent pgste */ 713 pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); 714 pgste_set_unlock(ptep, pgste); 715 } 716 717 static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) 718 { 719 if (!non_swap_entry(entry)) 720 dec_mm_counter(mm, MM_SWAPENTS); 721 else if (is_migration_entry(entry)) { 722 struct folio *folio = pfn_swap_entry_folio(entry); 723 724 dec_mm_counter(mm, mm_counter(folio)); 725 } 726 free_swap_and_cache(entry); 727 } 728 729 void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, 730 pte_t *ptep, int reset) 731 { 732 unsigned long pgstev; 733 pgste_t pgste; 734 pte_t pte; 735 736 /* Zap unused and logically-zero pages */ 737 preempt_disable(); 738 pgste = pgste_get_lock(ptep); 739 pgstev = pgste_val(pgste); 740 pte = *ptep; 741 if (!reset && pte_swap(pte) && 742 ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || 743 (pgstev & _PGSTE_GPS_ZERO))) { 744 ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); 745 pte_clear(mm, addr, ptep); 746 } 747 if (reset) 748 pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); 749 pgste_set_unlock(ptep, pgste); 750 preempt_enable(); 751 } 752 753 void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 754 { 755 unsigned long ptev; 756 pgste_t pgste; 757 758 /* Clear storage key ACC and F, but set R/C */ 759 preempt_disable(); 760 pgste = pgste_get_lock(ptep); 761 pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); 762 pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; 763 ptev = pte_val(*ptep); 764 if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) 765 page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); 766 pgste_set_unlock(ptep, pgste); 767 preempt_enable(); 768 } 769 770 /* 771 * Test and reset if a guest page is dirty 772 */ 773 bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, 774 pte_t *ptep) 775 { 776 pgste_t pgste; 777 pte_t pte; 778 bool dirty; 779 int nodat; 780 781 pgste = pgste_get_lock(ptep); 782 dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); 783 pgste_val(pgste) &= ~PGSTE_UC_BIT; 784 pte = *ptep; 785 if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { 786 pgste = pgste_pte_notify(mm, addr, ptep, pgste); 787 nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); 788 ptep_ipte_global(mm, addr, ptep, nodat); 789 if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) 790 pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); 791 else 792 pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); 793 set_pte(ptep, pte); 794 } 795 pgste_set_unlock(ptep, pgste); 796 return dirty; 797 } 798 EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); 799 800 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, 801 unsigned char key, bool nq) 802 { 803 unsigned long keyul, paddr; 804 spinlock_t *ptl; 805 pgste_t old, new; 806 pmd_t *pmdp; 807 pte_t *ptep; 808 809 /* 810 * If we don't have a PTE table and if there is no huge page mapped, 811 * we can ignore attempts to set the key to 0, because it already is 0. 812 */ 813 switch (pmd_lookup(mm, addr, &pmdp)) { 814 case -ENOENT: 815 return key ? -EFAULT : 0; 816 case 0: 817 break; 818 default: 819 return -EFAULT; 820 } 821 again: 822 ptl = pmd_lock(mm, pmdp); 823 if (!pmd_present(*pmdp)) { 824 spin_unlock(ptl); 825 return key ? -EFAULT : 0; 826 } 827 828 if (pmd_leaf(*pmdp)) { 829 paddr = pmd_val(*pmdp) & HPAGE_MASK; 830 paddr |= addr & ~HPAGE_MASK; 831 /* 832 * Huge pmds need quiescing operations, they are 833 * always mapped. 834 */ 835 page_set_storage_key(paddr, key, 1); 836 spin_unlock(ptl); 837 return 0; 838 } 839 spin_unlock(ptl); 840 841 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 842 if (!ptep) 843 goto again; 844 new = old = pgste_get_lock(ptep); 845 pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | 846 PGSTE_ACC_BITS | PGSTE_FP_BIT); 847 keyul = (unsigned long) key; 848 pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; 849 pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; 850 if (!(pte_val(*ptep) & _PAGE_INVALID)) { 851 unsigned long bits, skey; 852 853 paddr = pte_val(*ptep) & PAGE_MASK; 854 skey = (unsigned long) page_get_storage_key(paddr); 855 bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); 856 skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); 857 /* Set storage key ACC and FP */ 858 page_set_storage_key(paddr, skey, !nq); 859 /* Merge host changed & referenced into pgste */ 860 pgste_val(new) |= bits << 52; 861 } 862 /* changing the guest storage key is considered a change of the page */ 863 if ((pgste_val(new) ^ pgste_val(old)) & 864 (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) 865 pgste_val(new) |= PGSTE_UC_BIT; 866 867 pgste_set_unlock(ptep, new); 868 pte_unmap_unlock(ptep, ptl); 869 return 0; 870 } 871 EXPORT_SYMBOL(set_guest_storage_key); 872 873 /* 874 * Conditionally set a guest storage key (handling csske). 875 * oldkey will be updated when either mr or mc is set and a pointer is given. 876 * 877 * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest 878 * storage key was updated and -EFAULT on access errors. 879 */ 880 int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, 881 unsigned char key, unsigned char *oldkey, 882 bool nq, bool mr, bool mc) 883 { 884 unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; 885 int rc; 886 887 /* we can drop the pgste lock between getting and setting the key */ 888 if (mr | mc) { 889 rc = get_guest_storage_key(current->mm, addr, &tmp); 890 if (rc) 891 return rc; 892 if (oldkey) 893 *oldkey = tmp; 894 if (!mr) 895 mask |= _PAGE_REFERENCED; 896 if (!mc) 897 mask |= _PAGE_CHANGED; 898 if (!((tmp ^ key) & mask)) 899 return 0; 900 } 901 rc = set_guest_storage_key(current->mm, addr, key, nq); 902 return rc < 0 ? rc : 1; 903 } 904 EXPORT_SYMBOL(cond_set_guest_storage_key); 905 906 /* 907 * Reset a guest reference bit (rrbe), returning the reference and changed bit. 908 * 909 * Returns < 0 in case of error, otherwise the cc to be reported to the guest. 910 */ 911 int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) 912 { 913 spinlock_t *ptl; 914 unsigned long paddr; 915 pgste_t old, new; 916 pmd_t *pmdp; 917 pte_t *ptep; 918 int cc = 0; 919 920 /* 921 * If we don't have a PTE table and if there is no huge page mapped, 922 * the storage key is 0 and there is nothing for us to do. 923 */ 924 switch (pmd_lookup(mm, addr, &pmdp)) { 925 case -ENOENT: 926 return 0; 927 case 0: 928 break; 929 default: 930 return -EFAULT; 931 } 932 again: 933 ptl = pmd_lock(mm, pmdp); 934 if (!pmd_present(*pmdp)) { 935 spin_unlock(ptl); 936 return 0; 937 } 938 939 if (pmd_leaf(*pmdp)) { 940 paddr = pmd_val(*pmdp) & HPAGE_MASK; 941 paddr |= addr & ~HPAGE_MASK; 942 cc = page_reset_referenced(paddr); 943 spin_unlock(ptl); 944 return cc; 945 } 946 spin_unlock(ptl); 947 948 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 949 if (!ptep) 950 goto again; 951 new = old = pgste_get_lock(ptep); 952 /* Reset guest reference bit only */ 953 pgste_val(new) &= ~PGSTE_GR_BIT; 954 955 if (!(pte_val(*ptep) & _PAGE_INVALID)) { 956 paddr = pte_val(*ptep) & PAGE_MASK; 957 cc = page_reset_referenced(paddr); 958 /* Merge real referenced bit into host-set */ 959 pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; 960 } 961 /* Reflect guest's logical view, not physical */ 962 cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; 963 /* Changing the guest storage key is considered a change of the page */ 964 if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) 965 pgste_val(new) |= PGSTE_UC_BIT; 966 967 pgste_set_unlock(ptep, new); 968 pte_unmap_unlock(ptep, ptl); 969 return cc; 970 } 971 EXPORT_SYMBOL(reset_guest_reference_bit); 972 973 int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, 974 unsigned char *key) 975 { 976 unsigned long paddr; 977 spinlock_t *ptl; 978 pgste_t pgste; 979 pmd_t *pmdp; 980 pte_t *ptep; 981 982 /* 983 * If we don't have a PTE table and if there is no huge page mapped, 984 * the storage key is 0. 985 */ 986 *key = 0; 987 988 switch (pmd_lookup(mm, addr, &pmdp)) { 989 case -ENOENT: 990 return 0; 991 case 0: 992 break; 993 default: 994 return -EFAULT; 995 } 996 again: 997 ptl = pmd_lock(mm, pmdp); 998 if (!pmd_present(*pmdp)) { 999 spin_unlock(ptl); 1000 return 0; 1001 } 1002 1003 if (pmd_leaf(*pmdp)) { 1004 paddr = pmd_val(*pmdp) & HPAGE_MASK; 1005 paddr |= addr & ~HPAGE_MASK; 1006 *key = page_get_storage_key(paddr); 1007 spin_unlock(ptl); 1008 return 0; 1009 } 1010 spin_unlock(ptl); 1011 1012 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 1013 if (!ptep) 1014 goto again; 1015 pgste = pgste_get_lock(ptep); 1016 *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; 1017 paddr = pte_val(*ptep) & PAGE_MASK; 1018 if (!(pte_val(*ptep) & _PAGE_INVALID)) 1019 *key = page_get_storage_key(paddr); 1020 /* Reflect guest's logical view, not physical */ 1021 *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; 1022 pgste_set_unlock(ptep, pgste); 1023 pte_unmap_unlock(ptep, ptl); 1024 return 0; 1025 } 1026 EXPORT_SYMBOL(get_guest_storage_key); 1027 1028 /** 1029 * pgste_perform_essa - perform ESSA actions on the PGSTE. 1030 * @mm: the memory context. It must have PGSTEs, no check is performed here! 1031 * @hva: the host virtual address of the page whose PGSTE is to be processed 1032 * @orc: the specific action to perform, see the ESSA_SET_* macros. 1033 * @oldpte: the PTE will be saved there if the pointer is not NULL. 1034 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL. 1035 * 1036 * Return: 1 if the page is to be added to the CBRL, otherwise 0, 1037 * or < 0 in case of error. -EINVAL is returned for invalid values 1038 * of orc, -EFAULT for invalid addresses. 1039 */ 1040 int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, 1041 unsigned long *oldpte, unsigned long *oldpgste) 1042 { 1043 struct vm_area_struct *vma; 1044 unsigned long pgstev; 1045 spinlock_t *ptl; 1046 pgste_t pgste; 1047 pte_t *ptep; 1048 int res = 0; 1049 1050 WARN_ON_ONCE(orc > ESSA_MAX); 1051 if (unlikely(orc > ESSA_MAX)) 1052 return -EINVAL; 1053 1054 vma = vma_lookup(mm, hva); 1055 if (!vma || is_vm_hugetlb_page(vma)) 1056 return -EFAULT; 1057 ptep = get_locked_pte(mm, hva, &ptl); 1058 if (unlikely(!ptep)) 1059 return -EFAULT; 1060 pgste = pgste_get_lock(ptep); 1061 pgstev = pgste_val(pgste); 1062 if (oldpte) 1063 *oldpte = pte_val(*ptep); 1064 if (oldpgste) 1065 *oldpgste = pgstev; 1066 1067 switch (orc) { 1068 case ESSA_GET_STATE: 1069 break; 1070 case ESSA_SET_STABLE: 1071 pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); 1072 pgstev |= _PGSTE_GPS_USAGE_STABLE; 1073 break; 1074 case ESSA_SET_UNUSED: 1075 pgstev &= ~_PGSTE_GPS_USAGE_MASK; 1076 pgstev |= _PGSTE_GPS_USAGE_UNUSED; 1077 if (pte_val(*ptep) & _PAGE_INVALID) 1078 res = 1; 1079 break; 1080 case ESSA_SET_VOLATILE: 1081 pgstev &= ~_PGSTE_GPS_USAGE_MASK; 1082 pgstev |= _PGSTE_GPS_USAGE_VOLATILE; 1083 if (pte_val(*ptep) & _PAGE_INVALID) 1084 res = 1; 1085 break; 1086 case ESSA_SET_POT_VOLATILE: 1087 pgstev &= ~_PGSTE_GPS_USAGE_MASK; 1088 if (!(pte_val(*ptep) & _PAGE_INVALID)) { 1089 pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE; 1090 break; 1091 } 1092 if (pgstev & _PGSTE_GPS_ZERO) { 1093 pgstev |= _PGSTE_GPS_USAGE_VOLATILE; 1094 break; 1095 } 1096 if (!(pgstev & PGSTE_GC_BIT)) { 1097 pgstev |= _PGSTE_GPS_USAGE_VOLATILE; 1098 res = 1; 1099 break; 1100 } 1101 break; 1102 case ESSA_SET_STABLE_RESIDENT: 1103 pgstev &= ~_PGSTE_GPS_USAGE_MASK; 1104 pgstev |= _PGSTE_GPS_USAGE_STABLE; 1105 /* 1106 * Since the resident state can go away any time after this 1107 * call, we will not make this page resident. We can revisit 1108 * this decision if a guest will ever start using this. 1109 */ 1110 break; 1111 case ESSA_SET_STABLE_IF_RESIDENT: 1112 if (!(pte_val(*ptep) & _PAGE_INVALID)) { 1113 pgstev &= ~_PGSTE_GPS_USAGE_MASK; 1114 pgstev |= _PGSTE_GPS_USAGE_STABLE; 1115 } 1116 break; 1117 case ESSA_SET_STABLE_NODAT: 1118 pgstev &= ~_PGSTE_GPS_USAGE_MASK; 1119 pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT; 1120 break; 1121 default: 1122 /* we should never get here! */ 1123 break; 1124 } 1125 /* If we are discarding a page, set it to logical zero */ 1126 if (res) 1127 pgstev |= _PGSTE_GPS_ZERO; 1128 1129 pgste_val(pgste) = pgstev; 1130 pgste_set_unlock(ptep, pgste); 1131 pte_unmap_unlock(ptep, ptl); 1132 return res; 1133 } 1134 EXPORT_SYMBOL(pgste_perform_essa); 1135 1136 /** 1137 * set_pgste_bits - set specific PGSTE bits. 1138 * @mm: the memory context. It must have PGSTEs, no check is performed here! 1139 * @hva: the host virtual address of the page whose PGSTE is to be processed 1140 * @bits: a bitmask representing the bits that will be touched 1141 * @value: the values of the bits to be written. Only the bits in the mask 1142 * will be written. 1143 * 1144 * Return: 0 on success, < 0 in case of error. 1145 */ 1146 int set_pgste_bits(struct mm_struct *mm, unsigned long hva, 1147 unsigned long bits, unsigned long value) 1148 { 1149 struct vm_area_struct *vma; 1150 spinlock_t *ptl; 1151 pgste_t new; 1152 pte_t *ptep; 1153 1154 vma = vma_lookup(mm, hva); 1155 if (!vma || is_vm_hugetlb_page(vma)) 1156 return -EFAULT; 1157 ptep = get_locked_pte(mm, hva, &ptl); 1158 if (unlikely(!ptep)) 1159 return -EFAULT; 1160 new = pgste_get_lock(ptep); 1161 1162 pgste_val(new) &= ~bits; 1163 pgste_val(new) |= value & bits; 1164 1165 pgste_set_unlock(ptep, new); 1166 pte_unmap_unlock(ptep, ptl); 1167 return 0; 1168 } 1169 EXPORT_SYMBOL(set_pgste_bits); 1170 1171 /** 1172 * get_pgste - get the current PGSTE for the given address. 1173 * @mm: the memory context. It must have PGSTEs, no check is performed here! 1174 * @hva: the host virtual address of the page whose PGSTE is to be processed 1175 * @pgstep: will be written with the current PGSTE for the given address. 1176 * 1177 * Return: 0 on success, < 0 in case of error. 1178 */ 1179 int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) 1180 { 1181 struct vm_area_struct *vma; 1182 spinlock_t *ptl; 1183 pte_t *ptep; 1184 1185 vma = vma_lookup(mm, hva); 1186 if (!vma || is_vm_hugetlb_page(vma)) 1187 return -EFAULT; 1188 ptep = get_locked_pte(mm, hva, &ptl); 1189 if (unlikely(!ptep)) 1190 return -EFAULT; 1191 *pgstep = pgste_val(pgste_get(ptep)); 1192 pte_unmap_unlock(ptep, ptl); 1193 return 0; 1194 } 1195 EXPORT_SYMBOL(get_pgste); 1196 #endif 1197