// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/pgtable.h>
#include <asm/machine.h>

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL);
	}
}

static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	}
}

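/*
 * The flush helpers below choose between the IPTE flavors above: a
 * CPU-local invalidation is enough if the machine supports local TLB
 * clearing (cpu_has_tlb_lc()) and the mm is attached to the current
 * CPU only; otherwise the invalidation is broadcast with IPTE_GLOBAL
 * so that all CPUs drop the stale translation.
 */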

static inline pte_t ptep_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep,
				      int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		ptep_ipte_local(mm, addr, ptep, nodat);
	else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pte_t *ptep,
				    int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
		mm->context.flush_mm = 1;
	} else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pgste_t pgste_get(pte_t *ptep)
{
	unsigned long pgste = 0;
#ifdef CONFIG_PGSTE
	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
#endif
	return __pgste(pgste);
}

static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
#endif
}

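/*
 * With CONFIG_PGSTE the page table occupies a full 4K page: the lower
 * 2K hold the 256 ptes, the upper 2K the corresponding page status
 * table entries (PGSTEs). That is why the PGSTE that belongs to a pte
 * is found at ptep + PTRS_PER_PTE.
 */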

static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
				       struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address, bits, skey;

	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
		return pgste;
	address = pte_val(pte) & PAGE_MASK;
	skey = (unsigned long) page_get_storage_key(address);
	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
	/* Transfer page changed & referenced bit to guest bits in pgste */
	pgste = set_pgste_bit(pgste, bits << 48);	/* GR bit & GC bit */
	/* Copy page access key and fetch protection bit to pgste */
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
#endif
	return pgste;
}

static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
				 struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address;
	unsigned long nkey;

	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
		return;
	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
	address = pte_val(entry) & PAGE_MASK;
	/*
	 * Set page access key and fetch protection bit from pgste.
	 * The guest C/R information is still in the PGSTE, set real
	 * key C/R to 0.
	 */
	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	page_set_storage_key(address, nkey, 0);
#endif
}

static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
{
#ifdef CONFIG_PGSTE
	if ((pte_val(entry) & _PAGE_PRESENT) &&
	    (pte_val(entry) & _PAGE_WRITE) &&
	    !(pte_val(entry) & _PAGE_INVALID)) {
		if (!machine_has_esop()) {
			/*
			 * Without enhanced suppression-on-protection force
			 * the dirty bit on for all writable ptes.
			 */
			entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
			entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
		}
		if (!(pte_val(entry) & _PAGE_PROTECT))
			/* This pte allows write access, set user-dirty */
			pgste = set_pgste_bit(pgste, PGSTE_UC_BIT);
	}
#endif
	set_pte(ptep, entry);
	return pgste;
}

static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	unsigned long bits;

	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
	if (bits) {
		pgste = __pgste(pgste_val(pgste) ^ bits);
		ptep_notify(mm, addr, ptep, bits);
	}
#endif
	return pgste;
}

static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	pgste_t pgste = __pgste(0);

	if (mm_has_pgste(mm)) {
		pgste = pgste_get_lock(ptep);
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
	}
	return pgste;
}

static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
				     unsigned long addr, pte_t *ptep,
				     pgste_t pgste, pte_t old, pte_t new)
{
	if (mm_has_pgste(mm)) {
		if (pte_val(old) & _PAGE_INVALID)
			pgste_set_key(ptep, pgste, new, mm);
		if (pte_val(new) & _PAGE_INVALID) {
			pgste = pgste_update_all(old, pgste, mm);
			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
			    _PGSTE_GPS_USAGE_UNUSED)
				old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
		}
		pgste = pgste_set_pte(ptep, pgste, new);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, new);
	}
	return old;
}

pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_direct(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);

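/*
 * Illustrative use of the xchg helpers (hypothetical caller, sketch only):
 * exchange a pte under the page table lock and get the old value back
 * after the TLB entry has been flushed on all CPUs the mm is attached to:
 *
 *	pte_t old, new;
 *
 *	new = pte_wrprotect(*ptep);
 *	old = ptep_xchg_direct(mm, addr, ptep, new);
 *	if (pte_dirty(old))
 *		...
 *
 * ptep_xchg_lazy() below does the same, but may defer the hardware flush
 * to a later flush_tlb_mm() when only the current CPU has the mm attached.
 */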

/*
 * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that
 * RDP can be used instead of IPTE. See also comments at pte_allow_rdp().
 */
void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
			 pte_t new)
{
	preempt_disable();
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		__ptep_rdp(addr, ptep, 1);
	else
		__ptep_rdp(addr, ptep, 0);
	/*
	 * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That
	 * means it is still valid and active, and must not be changed according
	 * to the architecture. But writing a new value that only differs in SW
	 * bits is allowed.
	 */
	set_pte(ptep, new);
	atomic_dec(&mm->context.flush_count);
	preempt_enable();
}
EXPORT_SYMBOL(ptep_reset_dat_prot);

pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);

pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	pgste_t pgste;
	pte_t old;
	int nodat;
	struct mm_struct *mm = vma->vm_mm;

	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	if (mm_has_pgste(mm)) {
		pgste = pgste_update_all(old, pgste, mm);
		pgste_set(ptep, pgste);
	}
	return old;
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	pgste_t pgste;
	struct mm_struct *mm = vma->vm_mm;

	if (mm_has_pgste(mm)) {
		pgste = pgste_get(ptep);
		pgste_set_key(ptep, pgste, pte, mm);
		pgste = pgste_set_pte(ptep, pgste, pte);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, pte);
	}
}

static inline void pmdp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest())
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
	if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
		gmap_pmdp_idte_local(mm, addr);
}

static inline void pmdp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest()) {
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else {
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	}
}

static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pmdp_idte_local(mm, addr, pmdp);
	else
		pmdp_idte_global(mm, addr, pmdp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
		mm->context.flush_mm = 1;
		if (mm_has_pgste(mm))
			gmap_pmdp_invalidate(mm, addr);
	} else {
		pmdp_idte_global(mm, addr, pmdp);
	}
	atomic_dec(&mm->context.flush_count);
	return old;
}

#ifdef CONFIG_PGSTE
static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
	struct vm_area_struct *vma;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	/* We need a valid VMA, otherwise this is clearly a fault. */
	vma = vma_lookup(mm, addr);
	if (!vma)
		return -EFAULT;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return -ENOENT;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return -ENOENT;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return -ENOENT;

	/* Large PUDs are not supported yet. */
	if (pud_leaf(*pud))
		return -EFAULT;

	*pmdp = pmd_offset(pud, addr);
	return 0;
}
#endif

pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_direct(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);

pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_lazy(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);

static inline void pudp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL);
}

static inline void pudp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
	else
		__pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
}

static inline pud_t pudp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pud_t *pudp)
{
	pud_t old;

	old = *pudp;
	if (pud_val(old) & _REGION_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pudp_idte_local(mm, addr, pudp);
	else
		pudp_idte_global(mm, addr, pudp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pud_t *pudp, pud_t new)
{
	pud_t old;

	preempt_disable();
	old = pudp_flush_direct(mm, addr, pudp);
	set_pud(pudp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pudp_xchg_direct);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

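/*
 * The deposited page table itself provides the storage for the list
 * linkage: its first 16 bytes (two pte slots) are reused as a
 * struct list_head while the table sits on the deposit list. The
 * withdraw path therefore resets those two slots to _PAGE_INVALID
 * before handing the page table back to the caller.
 */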

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	set_pte(ptep, __pte(_PAGE_INVALID));
	ptep++;
	set_pte(ptep, __pte(_PAGE_INVALID));
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_PGSTE
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry)
{
	pgste_t pgste;

	/* the mm_has_pgste() check is done in set_pte_at() */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO);
	pgste_set_key(ptep, pgste, entry, mm);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pgste_t pgste;

	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = set_pgste_bit(pgste, PGSTE_IN_BIT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/**
 * ptep_force_prot - change access rights of a locked pte
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the guest address space
 * @ptep: pointer to the page table entry
 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bit: pgste bit to set (e.g. for notification)
 *
 * Returns 0 if the access rights were changed and -EAGAIN if the current
 * and requested access rights are incompatible.
 */
int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, int prot, unsigned long bit)
{
	pte_t entry;
	pgste_t pgste;
	int pte_i, pte_p, nodat;

	pgste = pgste_get_lock(ptep);
	entry = *ptep;
	/* Check pte entry after all locks have been acquired */
	pte_i = pte_val(entry) & _PAGE_INVALID;
	pte_p = pte_val(entry) & _PAGE_PROTECT;
	if ((pte_i && (prot != PROT_NONE)) ||
	    (pte_p && (prot & PROT_WRITE))) {
		pgste_set_unlock(ptep, pgste);
		return -EAGAIN;
	}
	/* Change access rights and set pgste bit */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	if (prot == PROT_NONE && !pte_i) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		pgste = pgste_update_all(entry, pgste, mm);
		entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
	}
	if (prot == PROT_READ && !pte_p) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
		entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
	}
	pgste = set_pgste_bit(pgste, bit);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	return 0;
}

int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
		    pte_t *sptep, pte_t *tptep, pte_t pte)
{
	pgste_t spgste, tpgste;
	pte_t spte, tpte;
	int rc = -EAGAIN;

	if (!(pte_val(*tptep) & _PAGE_INVALID))
		return 0;	/* already shadowed */
	spgste = pgste_get_lock(sptep);
	spte = *sptep;
	if (!(pte_val(spte) & _PAGE_INVALID) &&
	    !((pte_val(spte) & _PAGE_PROTECT) &&
	      !(pte_val(pte) & _PAGE_PROTECT))) {
		spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT);
		tpgste = pgste_get_lock(tptep);
		tpte = __pte((pte_val(spte) & PAGE_MASK) |
			     (pte_val(pte) & _PAGE_PROTECT));
		/* don't touch the storage key - it belongs to parent pgste */
		tpgste = pgste_set_pte(tptep, tpgste, tpte);
		pgste_set_unlock(tptep, tpgste);
		rc = 1;
	}
	pgste_set_unlock(sptep, spgste);
	return rc;
}

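/*
 * ptep_shadow_pte() above builds a shadow pte for a nested (VSIE) guest
 * from the parent's pte: the shadow inherits the parent's page frame and
 * is write-protected if either the parent pte or the requested mapping
 * is protected. PGSTE_VSIE_BIT is set in the parent's pgste so that
 * invalidations of the parent mapping are propagated to the shadow;
 * ptep_unshadow_pte() below tears the shadow mapping down again.
 */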

void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
{
	pgste_t pgste;
	int nodat;

	pgste = pgste_get_lock(ptep);
	/* notifier is called by the caller */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	ptep_flush_direct(mm, saddr, ptep, nodat);
	/* don't touch the storage key - it belongs to parent pgste */
	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
	pgste_set_unlock(ptep, pgste);
}

static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct folio *folio = pfn_swap_entry_folio(entry);

		dec_mm_counter(mm, mm_counter(folio));
	}
	free_swap_and_cache(entry);
}

void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, int reset)
{
	unsigned long pgstev;
	pgste_t pgste;
	pte_t pte;

	/* Zap unused and logically-zero pages */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	pte = *ptep;
	if (!reset && pte_swap(pte) &&
	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
	     (pgstev & _PGSTE_GPS_ZERO))) {
		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
		pte_clear(mm, addr, ptep);
	}
	if (reset)
		pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long ptev;
	pgste_t pgste;

	/* Clear storage key ACC and F, but set R/C */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*ptep);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/*
 * Test and reset if a guest page is dirty
 */
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep)
{
	pgste_t pgste;
	pte_t pte;
	bool dirty;
	int nodat;

	pgste = pgste_get_lock(ptep);
	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
	pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT);
	pte = *ptep;
	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
		ptep_ipte_global(mm, addr, ptep, nodat);
		if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE))
			pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
		else
			pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
		set_pte(ptep, pte);
	}
	pgste_set_unlock(ptep, pgste);
	return dirty;
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);

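/*
 * The helpers below manipulate the guest view of the s390 storage keys
 * (access-control bits, fetch protection, reference and change bit).
 * They operate on the real storage key while the page is mapped and keep
 * the guest's logical view in the PGSTE while it is not, which is what
 * allows KVM to emulate the key instructions (e.g. SSKE, ISKE, RRBE)
 * for its guests.
 */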

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char key, bool nq)
{
	unsigned long keyul, paddr;
	spinlock_t *ptl;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * we can ignore attempts to set the key to 0, because it already is 0.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return key ? -EFAULT : 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return key ? -EFAULT : 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		/*
		 * Huge pmds need quiescing operations, they are
		 * always mapped.
		 */
		page_set_storage_key(paddr, key, 1);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT |
			      PGSTE_ACC_BITS | PGSTE_FP_BIT);
	keyul = (unsigned long) key;
	new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48);
	new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long bits, skey;

		paddr = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(paddr);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(paddr, skey, !nq);
		/* Merge host changed & referenced into pgste */
		new = set_pgste_bit(new, bits << 52);
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

/*
 * Conditionally set a guest storage key (handling csske).
 * oldkey will be updated when either mr or mc is set and a pointer is given.
 *
 * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
 * storage key was updated and -EFAULT on access errors.
 */
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			       unsigned char key, unsigned char *oldkey,
			       bool nq, bool mr, bool mc)
{
	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
	int rc;

	/* we can drop the pgste lock between getting and setting the key */
	if (mr | mc) {
		rc = get_guest_storage_key(current->mm, addr, &tmp);
		if (rc)
			return rc;
		if (oldkey)
			*oldkey = tmp;
		if (!mr)
			mask |= _PAGE_REFERENCED;
		if (!mc)
			mask |= _PAGE_CHANGED;
		if (!((tmp ^ key) & mask))
			return 0;
	}
	rc = set_guest_storage_key(current->mm, addr, key, nq);
	return rc < 0 ? rc : 1;
}
EXPORT_SYMBOL(cond_set_guest_storage_key);

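/*
 * Illustrative caller (sketch only): emulating a conditional SSKE with
 * the MR and MC bits set, i.e. the key is only written when the ACC or
 * FP fields actually change:
 *
 *	unsigned char oldkey;
 *	int rc;
 *
 *	rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
 *					false, true, true);
 *
 * A return value of 0 means the key already matched, 1 means it was
 * updated, and -EFAULT signals an access error.
 */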

/*
 * Reset a guest reference bit (rrbe), returning the reference and changed bit.
 *
 * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
 */
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	unsigned long paddr;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;
	int cc = 0;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0 and there is nothing for us to do.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		cc = page_reset_referenced(paddr);
		spin_unlock(ptl);
		return cc;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	/* Reset guest reference bit only */
	new = clear_pgste_bit(new, PGSTE_GR_BIT);

	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		paddr = pte_val(*ptep) & PAGE_MASK;
		cc = page_reset_referenced(paddr);
		/* Merge real referenced bit into host-set */
		new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT);
	}
	/* Reflect guest's logical view, not physical */
	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
	/* Changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);

int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char *key)
{
	unsigned long paddr;
	spinlock_t *ptl;
	pgste_t pgste;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0.
	 */
	*key = 0;

	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		*key = page_get_storage_key(paddr);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	pgste = pgste_get_lock(ptep);
	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	paddr = pte_val(*ptep) & PAGE_MASK;
	if (!(pte_val(*ptep) & _PAGE_INVALID))
		*key = page_get_storage_key(paddr);
	/* Reflect guest's logical view, not physical */
	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);

/**
 * pgste_perform_essa - perform ESSA actions on the PGSTE.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @orc: the specific action to perform, see the ESSA_SET_* macros.
 * @oldpte: the PTE will be saved there if the pointer is not NULL.
 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
 *
 * Return: 1 if the page is to be added to the CBRL, otherwise 0,
 *	   or < 0 in case of error. -EINVAL is returned for invalid values
 *	   of orc, -EFAULT for invalid addresses.
 */
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
		       unsigned long *oldpte, unsigned long *oldpgste)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	int res = 0;

	WARN_ON_ONCE(orc > ESSA_MAX);
	if (unlikely(orc > ESSA_MAX))
		return -EINVAL;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	if (oldpte)
		*oldpte = pte_val(*ptep);
	if (oldpgste)
		*oldpgste = pgstev;

	switch (orc) {
	case ESSA_GET_STATE:
		break;
	case ESSA_SET_STABLE:
		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_UNUSED:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
			break;
		}
		if (pgstev & _PGSTE_GPS_ZERO) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			break;
		}
		if (!(pgstev & PGSTE_GC_BIT)) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
			break;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
			pgstev |= _PGSTE_GPS_USAGE_STABLE;
		}
		break;
	case ESSA_SET_STABLE_NODAT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
		break;
	default:
		/* we should never get here! */
		break;
	}
	/* If we are discarding a page, set it to logical zero */
	if (res)
		pgstev |= _PGSTE_GPS_ZERO;

	pgste = __pgste(pgstev);
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return res;
}
EXPORT_SYMBOL(pgste_perform_essa);

/**
 * set_pgste_bits - set specific PGSTE bits.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @bits: a bitmask representing the bits that will be touched
 * @value: the values of the bits to be written. Only the bits in the mask
 *	   will be written.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
		   unsigned long bits, unsigned long value)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pgste_t new;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	new = pgste_get_lock(ptep);

	new = clear_pgste_bit(new, bits);
	new = set_pgste_bit(new, value & bits);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_pgste_bits);

/**
 * get_pgste - get the current PGSTE for the given address.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @pgstep: will be written with the current PGSTE for the given address.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	*pgstep = pgste_val(pgste_get(ptep));
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_pgste);
#endif