// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/pgtable.h>
#include <asm/machine.h>

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL);
	}
}

static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	}
}

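/*
 * Flush the TLB entry for a single pte. ptep_flush_direct() always performs
 * the flush, using a CPU-local IPTE when the facility is available and the
 * mm is attached to this CPU only, and a global IPTE otherwise.
 * ptep_flush_lazy() defers the flush while only the current CPU has the mm
 * attached: it merely marks the pte invalid and sets flush_mm. An already
 * invalid pte is returned unchanged without flushing.
 */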
static inline pte_t ptep_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep,
				      int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		ptep_ipte_local(mm, addr, ptep, nodat);
	else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pte_t *ptep,
				    int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
		mm->context.flush_mm = 1;
	} else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pgste_t pgste_get(pte_t *ptep)
{
	unsigned long pgste = 0;
#ifdef CONFIG_PGSTE
	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
#endif
	return __pgste(pgste);
}

static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
#endif
}

static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
				       struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address, bits, skey;

	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
		return pgste;
	address = pte_val(pte) & PAGE_MASK;
	skey = (unsigned long) page_get_storage_key(address);
	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
	/* Transfer page changed & referenced bit to guest bits in pgste */
	pgste = set_pgste_bit(pgste, bits << 48);	/* GR bit & GC bit */
	/* Copy page access key and fetch protection bit to pgste */
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
#endif
	return pgste;
}

static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
				 struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address;
	unsigned long nkey;

	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
		return;
	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
	address = pte_val(entry) & PAGE_MASK;
	/*
	 * Set page access key and fetch protection bit from pgste.
	 * The guest C/R information is still in the PGSTE, set real
	 * key C/R to 0.
	 */
	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	page_set_storage_key(address, nkey, 0);
#endif
}

static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
{
#ifdef CONFIG_PGSTE
	if ((pte_val(entry) & _PAGE_PRESENT) &&
	    (pte_val(entry) & _PAGE_WRITE) &&
	    !(pte_val(entry) & _PAGE_INVALID)) {
		if (!machine_has_esop()) {
			/*
			 * Without enhanced suppression-on-protection force
			 * the dirty bit on for all writable ptes.
			 */
			entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
			entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
		}
		if (!(pte_val(entry) & _PAGE_PROTECT))
			/* This pte allows write access, set user-dirty */
			pgste = set_pgste_bit(pgste, PGSTE_UC_BIT);
	}
#endif
	set_pte(ptep, entry);
	return pgste;
}

static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	unsigned long bits;

	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
	if (bits) {
		pgste = __pgste(pgste_val(pgste) ^ bits);
		ptep_notify(mm, addr, ptep, bits);
	}
#endif
	return pgste;
}

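/*
 * ptep_xchg_start() and ptep_xchg_commit() bracket the exchange of a pte.
 * For mms with PGSTEs, start takes the pgste lock and delivers pending
 * notification bits, while commit propagates storage key and unused state
 * and installs the new pte under that lock.
 */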
static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	pgste_t pgste = __pgste(0);

	if (mm_has_pgste(mm)) {
		pgste = pgste_get_lock(ptep);
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
	}
	return pgste;
}

static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
				     unsigned long addr, pte_t *ptep,
				     pgste_t pgste, pte_t old, pte_t new)
{
	if (mm_has_pgste(mm)) {
		if (pte_val(old) & _PAGE_INVALID)
			pgste_set_key(ptep, pgste, new, mm);
		if (pte_val(new) & _PAGE_INVALID) {
			pgste = pgste_update_all(old, pgste, mm);
			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
			    _PGSTE_GPS_USAGE_UNUSED)
				old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
		}
		pgste = pgste_set_pte(ptep, pgste, new);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, new);
	}
	return old;
}

pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_direct(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);

/*
 * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that
 * RDP can be used instead of IPTE. See also comments at pte_allow_rdp().
 */
void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
			 pte_t new)
{
	preempt_disable();
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		__ptep_rdp(addr, ptep, 0, 0, 1);
	else
		__ptep_rdp(addr, ptep, 0, 0, 0);
	/*
	 * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That
	 * means it is still valid and active, and must not be changed according
	 * to the architecture. But writing a new value that only differs in SW
	 * bits is allowed.
	 */
	set_pte(ptep, new);
	atomic_dec(&mm->context.flush_count);
	preempt_enable();
}
EXPORT_SYMBOL(ptep_reset_dat_prot);

pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);

pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	pgste_t pgste;
	pte_t old;
	int nodat;
	struct mm_struct *mm = vma->vm_mm;

	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	if (mm_has_pgste(mm)) {
		pgste = pgste_update_all(old, pgste, mm);
		pgste_set(ptep, pgste);
	}
	return old;
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	pgste_t pgste;
	struct mm_struct *mm = vma->vm_mm;

	if (mm_has_pgste(mm)) {
		pgste = pgste_get(ptep);
		pgste_set_key(ptep, pgste, pte, mm);
		pgste = pgste_set_pte(ptep, pgste, pte);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, pte);
	}
}

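/*
 * pmd invalidation helpers: IDTE is used where the facility is available,
 * the global variant falls back to CSP on machines without IDTE. If the mm
 * has PGSTEs and 1M huge pages may be mapped into a guest, the gmap shadow
 * pmds are kept in sync as well.
 */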
static inline void pmdp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest())
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
	if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
		gmap_pmdp_idte_local(mm, addr);
}

static inline void pmdp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest()) {
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else if (cpu_has_idte()) {
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else {
		__pmdp_csp(pmdp);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_csp(mm, addr);
	}
}

static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pmdp_idte_local(mm, addr, pmdp);
	else
		pmdp_idte_global(mm, addr, pmdp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
		mm->context.flush_mm = 1;
		if (mm_has_pgste(mm))
			gmap_pmdp_invalidate(mm, addr);
	} else {
		pmdp_idte_global(mm, addr, pmdp);
	}
	atomic_dec(&mm->context.flush_count);
	return old;
}

#ifdef CONFIG_PGSTE
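/*
 * Walk the page tables down to the pmd for @addr. Returns -EFAULT if there
 * is no VMA or if a large PUD maps the address, -ENOENT if an upper level
 * table entry is not present, and 0 with *pmdp set otherwise.
 */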
static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
	struct vm_area_struct *vma;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	/* We need a valid VMA, otherwise this is clearly a fault. */
	vma = vma_lookup(mm, addr);
	if (!vma)
		return -EFAULT;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return -ENOENT;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return -ENOENT;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return -ENOENT;

	/* Large PUDs are not supported yet. */
	if (pud_leaf(*pud))
		return -EFAULT;

	*pmdp = pmd_offset(pud, addr);
	return 0;
}
#endif

pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_direct(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);

pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_lazy(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);

static inline void pudp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL);
}

static inline void pudp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
	else if (cpu_has_idte())
		__pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
	else
		/*
		 * Invalid bit position is the same for pmd and pud, so we can
		 * reuse __pmdp_csp() here
		 */
		__pmdp_csp((pmd_t *) pudp);
}

static inline pud_t pudp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pud_t *pudp)
{
	pud_t old;

	old = *pudp;
	if (pud_val(old) & _REGION_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pudp_idte_local(mm, addr, pudp);
	else
		pudp_idte_global(mm, addr, pudp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pud_t *pudp, pud_t new)
{
	pud_t old;

	preempt_disable();
	old = pudp_flush_direct(mm, addr, pudp);
	set_pud(pudp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pudp_xchg_direct);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
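/*
 * The pte page table preallocated for a huge pmd is kept on a list; the
 * list_head is stored in the deposited page table itself. Withdraw returns
 * the table at the head of the list and marks the two pte slots that held
 * the list_head as invalid again.
 */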
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	set_pte(ptep, __pte(_PAGE_INVALID));
	ptep++;
	set_pte(ptep, __pte(_PAGE_INVALID));
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_PGSTE
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry)
{
	pgste_t pgste;

	/* the mm_has_pgste() check is done in set_pte_at() */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO);
	pgste_set_key(ptep, pgste, entry, mm);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pgste_t pgste;

	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = set_pgste_bit(pgste, PGSTE_IN_BIT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/**
 * ptep_force_prot - change access rights of a locked pte
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the guest address space
 * @ptep: pointer to the page table entry
 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bit: pgste bit to set (e.g. for notification)
 *
 * Returns 0 if the access rights were changed and -EAGAIN if the current
 * and requested access rights are incompatible.
 */
int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, int prot, unsigned long bit)
{
	pte_t entry;
	pgste_t pgste;
	int pte_i, pte_p, nodat;

	pgste = pgste_get_lock(ptep);
	entry = *ptep;
	/* Check pte entry after all locks have been acquired */
	pte_i = pte_val(entry) & _PAGE_INVALID;
	pte_p = pte_val(entry) & _PAGE_PROTECT;
	if ((pte_i && (prot != PROT_NONE)) ||
	    (pte_p && (prot & PROT_WRITE))) {
		pgste_set_unlock(ptep, pgste);
		return -EAGAIN;
	}
	/* Change access rights and set pgste bit */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	if (prot == PROT_NONE && !pte_i) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		pgste = pgste_update_all(entry, pgste, mm);
		entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
	}
	if (prot == PROT_READ && !pte_p) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
		entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
	}
	pgste = set_pgste_bit(pgste, bit);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	return 0;
}

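/*
 * Create a shadow pte for VSIE: the page frame is taken from the parent pte,
 * the protection bit from the requested pte (the parent must not be more
 * restrictive), and the parent pgste is marked with PGSTE_VSIE_BIT so that
 * later changes are notified. Returns 1 if a shadow pte was created, 0 if
 * the target pte is already valid, and -EAGAIN otherwise.
 */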
int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
		    pte_t *sptep, pte_t *tptep, pte_t pte)
{
	pgste_t spgste, tpgste;
	pte_t spte, tpte;
	int rc = -EAGAIN;

	if (!(pte_val(*tptep) & _PAGE_INVALID))
		return 0;	/* already shadowed */
	spgste = pgste_get_lock(sptep);
	spte = *sptep;
	if (!(pte_val(spte) & _PAGE_INVALID) &&
	    !((pte_val(spte) & _PAGE_PROTECT) &&
	      !(pte_val(pte) & _PAGE_PROTECT))) {
		spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT);
		tpgste = pgste_get_lock(tptep);
		tpte = __pte((pte_val(spte) & PAGE_MASK) |
			     (pte_val(pte) & _PAGE_PROTECT));
		/* don't touch the storage key - it belongs to parent pgste */
		tpgste = pgste_set_pte(tptep, tpgste, tpte);
		pgste_set_unlock(tptep, tpgste);
		rc = 1;
	}
	pgste_set_unlock(sptep, spgste);
	return rc;
}

void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
{
	pgste_t pgste;
	int nodat;

	pgste = pgste_get_lock(ptep);
	/* notifier is called by the caller */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	ptep_flush_direct(mm, saddr, ptep, nodat);
	/* don't touch the storage key - it belongs to parent pgste */
	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
	pgste_set_unlock(ptep, pgste);
}

static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct folio *folio = pfn_swap_entry_folio(entry);

		dec_mm_counter(mm, mm_counter(folio));
	}
	free_swap_and_cache(entry);
}

void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, int reset)
{
	unsigned long pgstev;
	pgste_t pgste;
	pte_t pte;

	/* Zap unused and logically-zero pages */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	pte = *ptep;
	if (!reset && pte_swap(pte) &&
	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
	     (pgstev & _PGSTE_GPS_ZERO))) {
		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
		pte_clear(mm, addr, ptep);
	}
	if (reset)
		pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long ptev;
	pgste_t pgste;

	/* Clear storage key ACC and F, but set R/C */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*ptep);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/*
 * Test and reset if a guest page is dirty
 */
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep)
{
	pgste_t pgste;
	pte_t pte;
	bool dirty;
	int nodat;

	pgste = pgste_get_lock(ptep);
	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
	pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT);
	pte = *ptep;
	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
		ptep_ipte_global(mm, addr, ptep, nodat);
		if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE))
			pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
		else
			pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
		set_pte(ptep, pte);
	}
	pgste_set_unlock(ptep, pgste);
	return dirty;
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);

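/*
 * Set the guest storage key for the page at @addr. A huge pmd mapping gets
 * the key set on the backing page directly; for a pte mapping the key is
 * merged into the PGSTE and, if the pte is valid, also written to the real
 * storage key. Setting key 0 is a nop when neither a pte table nor a huge
 * page exists for @addr.
 */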
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char key, bool nq)
{
	unsigned long keyul, paddr;
	spinlock_t *ptl;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * we can ignore attempts to set the key to 0, because it already is 0.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return key ? -EFAULT : 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return key ? -EFAULT : 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		/*
		 * Huge pmds need quiescing operations, they are
		 * always mapped.
		 */
		page_set_storage_key(paddr, key, 1);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT |
			      PGSTE_ACC_BITS | PGSTE_FP_BIT);
	keyul = (unsigned long) key;
	new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48);
	new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long bits, skey;

		paddr = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(paddr);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(paddr, skey, !nq);
		/* Merge host changed & referenced into pgste */
		new = set_pgste_bit(new, bits << 52);
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

/*
 * Conditionally set a guest storage key (handling csske).
 * oldkey will be updated when either mr or mc is set and a pointer is given.
 *
 * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
 * storage key was updated and -EFAULT on access errors.
 */
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			       unsigned char key, unsigned char *oldkey,
			       bool nq, bool mr, bool mc)
{
	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
	int rc;

	/* we can drop the pgste lock between getting and setting the key */
	if (mr | mc) {
		rc = get_guest_storage_key(current->mm, addr, &tmp);
		if (rc)
			return rc;
		if (oldkey)
			*oldkey = tmp;
		if (!mr)
			mask |= _PAGE_REFERENCED;
		if (!mc)
			mask |= _PAGE_CHANGED;
		if (!((tmp ^ key) & mask))
			return 0;
	}
	rc = set_guest_storage_key(current->mm, addr, key, nq);
	return rc < 0 ? rc : 1;
}
EXPORT_SYMBOL(cond_set_guest_storage_key);

/*
 * Reset a guest reference bit (rrbe), returning the reference and changed bit.
 *
 * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
 */
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	unsigned long paddr;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;
	int cc = 0;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0 and there is nothing for us to do.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		cc = page_reset_referenced(paddr);
		spin_unlock(ptl);
		return cc;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	/* Reset guest reference bit only */
	new = clear_pgste_bit(new, PGSTE_GR_BIT);

	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		paddr = pte_val(*ptep) & PAGE_MASK;
		cc = page_reset_referenced(paddr);
		/* Merge real referenced bit into host-set */
		new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT);
	}
	/* Reflect guest's logical view, not physical */
	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
	/* Changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);

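/*
 * Read the storage key as seen by the guest for the page at @addr. For an
 * invalid pte the access bits and fetch protection come from the PGSTE;
 * the guest's R/C bits kept in the PGSTE are merged in either way.
 * Returns 0, or -EFAULT if the address cannot be resolved.
 */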
int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char *key)
{
	unsigned long paddr;
	spinlock_t *ptl;
	pgste_t pgste;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0.
	 */
	*key = 0;

	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		*key = page_get_storage_key(paddr);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	pgste = pgste_get_lock(ptep);
	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	paddr = pte_val(*ptep) & PAGE_MASK;
	if (!(pte_val(*ptep) & _PAGE_INVALID))
		*key = page_get_storage_key(paddr);
	/* Reflect guest's logical view, not physical */
	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);

/**
 * pgste_perform_essa - perform ESSA actions on the PGSTE.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @orc: the specific action to perform, see the ESSA_SET_* macros.
 * @oldpte: the PTE will be saved there if the pointer is not NULL.
 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
 *
 * Return: 1 if the page is to be added to the CBRL, otherwise 0,
 *	   or < 0 in case of error. -EINVAL is returned for invalid values
 *	   of orc, -EFAULT for invalid addresses.
 */
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
		       unsigned long *oldpte, unsigned long *oldpgste)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	int res = 0;

	WARN_ON_ONCE(orc > ESSA_MAX);
	if (unlikely(orc > ESSA_MAX))
		return -EINVAL;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	if (oldpte)
		*oldpte = pte_val(*ptep);
	if (oldpgste)
		*oldpgste = pgstev;

	switch (orc) {
	case ESSA_GET_STATE:
		break;
	case ESSA_SET_STABLE:
		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_UNUSED:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
			break;
		}
		if (pgstev & _PGSTE_GPS_ZERO) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			break;
		}
		if (!(pgstev & PGSTE_GC_BIT)) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
			break;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
			pgstev |= _PGSTE_GPS_USAGE_STABLE;
		}
		break;
	case ESSA_SET_STABLE_NODAT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
		break;
	default:
		/* we should never get here! */
		break;
	}
	/* If we are discarding a page, set it to logical zero */
	if (res)
		pgstev |= _PGSTE_GPS_ZERO;

	pgste = __pgste(pgstev);
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return res;
}
EXPORT_SYMBOL(pgste_perform_essa);

/**
 * set_pgste_bits - set specific PGSTE bits.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @bits: a bitmask representing the bits that will be touched
 * @value: the values of the bits to be written. Only the bits in the mask
 *	   will be written.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
		   unsigned long bits, unsigned long value)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pgste_t new;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	new = pgste_get_lock(ptep);

	new = clear_pgste_bit(new, bits);
	new = set_pgste_bit(new, value & bits);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_pgste_bits);

/**
 * get_pgste - get the current PGSTE for the given address.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @pgstep: will be written with the current PGSTE for the given address.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	*pgstep = pgste_val(pgste_get(ptep));
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_pgste);
#endif