// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/machine.h>

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
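
/*
 * Illustrative sketch, not part of the original file: drivers normally do not
 * call pgprot_writecombine() directly but reach it via helpers such as
 * ioremap_wc(), which apply a write-combine pgprot before mapping MMIO space.
 * The resource start/size names below are made up for the example:
 *
 *	void __iomem *regs = ioremap_wc(res_start, res_size);
 *
 *	if (regs) {
 *		...			// write-combined MMIO accesses
 *		iounmap(regs);
 *	}
 */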

static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL);
	}
}

static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	}
}

static inline pte_t ptep_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep,
				      int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		ptep_ipte_local(mm, addr, ptep, nodat);
	else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pte_t *ptep,
				    int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
		mm->context.flush_mm = 1;
	} else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pgste_t pgste_get_lock(pte_t *ptep)
{
	unsigned long value = 0;
#ifdef CONFIG_PGSTE
	unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE);

	do {
		value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr);
	} while (value & PGSTE_PCL_BIT);
	value |= PGSTE_PCL_BIT;
#endif
	return __pgste(value);
}

static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	barrier();
	WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT);
#endif
}

static inline pgste_t pgste_get(pte_t *ptep)
{
	unsigned long pgste = 0;
#ifdef CONFIG_PGSTE
	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
#endif
	return __pgste(pgste);
}

static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
#endif
}

static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
				       struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address, bits, skey;

	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
		return pgste;
	address = pte_val(pte) & PAGE_MASK;
	skey = (unsigned long) page_get_storage_key(address);
	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
	/* Transfer page changed & referenced bit to guest bits in pgste */
	pgste = set_pgste_bit(pgste, bits << 48);	/* GR bit & GC bit */
	/* Copy page access key and fetch protection bit to pgste */
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
#endif
	return pgste;
}

static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
				 struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address;
	unsigned long nkey;

	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
		return;
	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
	address = pte_val(entry) & PAGE_MASK;
	/*
	 * Set page access key and fetch protection bit from pgste.
	 * The guest C/R information is still in the PGSTE, set real
	 * key C/R to 0.
	 */
	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	page_set_storage_key(address, nkey, 0);
#endif
}

static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
{
#ifdef CONFIG_PGSTE
	if ((pte_val(entry) & _PAGE_PRESENT) &&
	    (pte_val(entry) & _PAGE_WRITE) &&
	    !(pte_val(entry) & _PAGE_INVALID)) {
		if (!machine_has_esop()) {
			/*
			 * Without enhanced suppression-on-protection force
			 * the dirty bit on for all writable ptes.
			 */
			entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
			entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
		}
		if (!(pte_val(entry) & _PAGE_PROTECT))
			/* This pte allows write access, set user-dirty */
			pgste = set_pgste_bit(pgste, PGSTE_UC_BIT);
	}
#endif
	set_pte(ptep, entry);
	return pgste;
}

static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	unsigned long bits;

	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
	if (bits) {
		pgste = __pgste(pgste_val(pgste) ^ bits);
		ptep_notify(mm, addr, ptep, bits);
	}
#endif
	return pgste;
}

static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	pgste_t pgste = __pgste(0);

	if (mm_has_pgste(mm)) {
		pgste = pgste_get_lock(ptep);
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
	}
	return pgste;
}

static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
				     unsigned long addr, pte_t *ptep,
				     pgste_t pgste, pte_t old, pte_t new)
{
	if (mm_has_pgste(mm)) {
		if (pte_val(old) & _PAGE_INVALID)
			pgste_set_key(ptep, pgste, new, mm);
		if (pte_val(new) & _PAGE_INVALID) {
			pgste = pgste_update_all(old, pgste, mm);
			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
			    _PGSTE_GPS_USAGE_UNUSED)
				old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
		}
		pgste = pgste_set_pte(ptep, pgste, new);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, new);
	}
	return old;
}

pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_direct(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);

/*
 * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that
 * RDP can be used instead of IPTE. See also comments at pte_allow_rdp().
 */
void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
			 pte_t new)
{
	preempt_disable();
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		__ptep_rdp(addr, ptep, 0, 0, 1);
	else
		__ptep_rdp(addr, ptep, 0, 0, 0);
	/*
	 * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That
	 * means it is still valid and active, and must not be changed according
	 * to the architecture. But writing a new value that only differs in SW
	 * bits is allowed.
	 */
	set_pte(ptep, new);
	atomic_dec(&mm->context.flush_count);
	preempt_enable();
}
EXPORT_SYMBOL(ptep_reset_dat_prot);

pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);

pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	pgste_t pgste;
	pte_t old;
	int nodat;
	struct mm_struct *mm = vma->vm_mm;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	if (mm_has_pgste(mm)) {
		pgste = pgste_update_all(old, pgste, mm);
		pgste_set(ptep, pgste);
	}
	return old;
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	pgste_t pgste;
	struct mm_struct *mm = vma->vm_mm;

	if (mm_has_pgste(mm)) {
		pgste = pgste_get(ptep);
		pgste_set_key(ptep, pgste, pte, mm);
		pgste = pgste_set_pte(ptep, pgste, pte);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, pte);
	}
	preempt_enable();
}
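
/*
 * Illustrative sketch, not part of the original file: generic mm code pairs
 * the two helpers above roughly as follows (newprot is a made-up name), so
 * the preempt_disable() in ptep_modify_prot_start() is always matched by the
 * preempt_enable() in ptep_modify_prot_commit():
 *
 *	oldpte = ptep_modify_prot_start(vma, addr, ptep);
 *	newpte = pte_modify(oldpte, newprot);
 *	ptep_modify_prot_commit(vma, addr, ptep, oldpte, newpte);
 */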

static inline void pmdp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest())
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
	if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
		gmap_pmdp_idte_local(mm, addr);
}

static inline void pmdp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest()) {
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else if (cpu_has_idte()) {
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else {
		__pmdp_csp(pmdp);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_csp(mm, addr);
	}
}

static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pmdp_idte_local(mm, addr, pmdp);
	else
		pmdp_idte_global(mm, addr, pmdp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
		mm->context.flush_mm = 1;
		if (mm_has_pgste(mm))
			gmap_pmdp_invalidate(mm, addr);
	} else {
		pmdp_idte_global(mm, addr, pmdp);
	}
	atomic_dec(&mm->context.flush_count);
	return old;
}

#ifdef CONFIG_PGSTE
static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
	struct vm_area_struct *vma;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	/* We need a valid VMA, otherwise this is clearly a fault. */
	vma = vma_lookup(mm, addr);
	if (!vma)
		return -EFAULT;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return -ENOENT;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return -ENOENT;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return -ENOENT;

	/* Large PUDs are not supported yet. */
	if (pud_leaf(*pud))
		return -EFAULT;

	*pmdp = pmd_offset(pud, addr);
	return 0;
}
#endif

pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_direct(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);

pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_lazy(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);

static inline void pudp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL);
}

static inline void pudp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
	else if (cpu_has_idte())
		__pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
	else
		/*
		 * Invalid bit position is the same for pmd and pud, so we can
		 * reuse __pmdp_csp() here.
		 */
		__pmdp_csp((pmd_t *) pudp);
}

static inline pud_t pudp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pud_t *pudp)
{
	pud_t old;

	old = *pudp;
	if (pud_val(old) & _REGION_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pudp_idte_local(mm, addr, pudp);
	else
		pudp_idte_global(mm, addr, pudp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pud_t *pudp, pud_t new)
{
	pud_t old;

	preempt_disable();
	old = pudp_flush_direct(mm, addr, pudp);
	set_pud(pudp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pudp_xchg_direct);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	set_pte(ptep, __pte(_PAGE_INVALID));
	ptep++;
	set_pte(ptep, __pte(_PAGE_INVALID));
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
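
/*
 * Illustrative sketch, not part of the original file: the generic THP code
 * uses the two helpers above as a deposit/withdraw pair under the pmd lock,
 * roughly:
 *
 *	pgtable_trans_huge_deposit(mm, pmdp, pgtable);	 // huge pmd installed
 *	...
 *	pgtable = pgtable_trans_huge_withdraw(mm, pmdp); // huge pmd split/zapped
 */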

#ifdef CONFIG_PGSTE
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry)
{
	pgste_t pgste;

	/* the mm_has_pgste() check is done in set_pte_at() */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO);
	pgste_set_key(ptep, pgste, entry, mm);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pgste_t pgste;

	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = set_pgste_bit(pgste, PGSTE_IN_BIT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/**
 * ptep_force_prot - change access rights of a locked pte
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the guest address space
 * @ptep: pointer to the page table entry
 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bit: pgste bit to set (e.g. for notification)
 *
 * Returns 0 if the access rights were changed and -EAGAIN if the current
 * and requested access rights are incompatible.
 */
int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, int prot, unsigned long bit)
{
	pte_t entry;
	pgste_t pgste;
	int pte_i, pte_p, nodat;

	pgste = pgste_get_lock(ptep);
	entry = *ptep;
	/* Check pte entry after all locks have been acquired */
	pte_i = pte_val(entry) & _PAGE_INVALID;
	pte_p = pte_val(entry) & _PAGE_PROTECT;
	if ((pte_i && (prot != PROT_NONE)) ||
	    (pte_p && (prot & PROT_WRITE))) {
		pgste_set_unlock(ptep, pgste);
		return -EAGAIN;
	}
	/* Change access rights and set pgste bit */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	if (prot == PROT_NONE && !pte_i) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		pgste = pgste_update_all(entry, pgste, mm);
		entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
	}
	if (prot == PROT_READ && !pte_p) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
		entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
	}
	pgste = set_pgste_bit(pgste, bit);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	return 0;
}

int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
		    pte_t *sptep, pte_t *tptep, pte_t pte)
{
	pgste_t spgste, tpgste;
	pte_t spte, tpte;
	int rc = -EAGAIN;

	if (!(pte_val(*tptep) & _PAGE_INVALID))
		return 0;	/* already shadowed */
	spgste = pgste_get_lock(sptep);
	spte = *sptep;
	if (!(pte_val(spte) & _PAGE_INVALID) &&
	    !((pte_val(spte) & _PAGE_PROTECT) &&
	      !(pte_val(pte) & _PAGE_PROTECT))) {
		spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT);
		tpgste = pgste_get_lock(tptep);
		tpte = __pte((pte_val(spte) & PAGE_MASK) |
			     (pte_val(pte) & _PAGE_PROTECT));
		/* don't touch the storage key - it belongs to parent pgste */
		tpgste = pgste_set_pte(tptep, tpgste, tpte);
		pgste_set_unlock(tptep, tpgste);
		rc = 1;
	}
	pgste_set_unlock(sptep, spgste);
	return rc;
}

void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
{
	pgste_t pgste;
	int nodat;

	pgste = pgste_get_lock(ptep);
	/* notifier is called by the caller */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	ptep_flush_direct(mm, saddr, ptep, nodat);
	/* don't touch the storage key - it belongs to parent pgste */
	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
	pgste_set_unlock(ptep, pgste);
}

static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct folio *folio = pfn_swap_entry_folio(entry);

		dec_mm_counter(mm, mm_counter(folio));
	}
	free_swap_and_cache(entry);
}

void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, int reset)
{
	unsigned long pgstev;
	pgste_t pgste;
	pte_t pte;

	/* Zap unused and logically-zero pages */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	pte = *ptep;
	if (!reset && pte_swap(pte) &&
	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
	     (pgstev & _PGSTE_GPS_ZERO))) {
		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
		pte_clear(mm, addr, ptep);
	}
	if (reset)
		pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long ptev;
	pgste_t pgste;

	/* Clear storage key ACC and F, but set R/C */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*ptep);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/*
 * Test and reset if a guest page is dirty
 */
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep)
{
	pgste_t pgste;
	pte_t pte;
	bool dirty;
	int nodat;

	pgste = pgste_get_lock(ptep);
	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
	pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT);
	pte = *ptep;
	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
		ptep_ipte_global(mm, addr, ptep, nodat);
		if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE))
			pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
		else
			pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
		set_pte(ptep, pte);
	}
	pgste_set_unlock(ptep, pgste);
	return dirty;
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);
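
/*
 * Illustrative sketch, not part of the original file: a dirty-log scan on
 * behalf of the guest could use the helper above per 4K page (dirty_bitmap
 * and bit_nr are made-up names):
 *
 *	if (ptep_test_and_clear_uc(mm, vmaddr, ptep))
 *		set_bit(bit_nr, dirty_bitmap);
 */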

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char key, bool nq)
{
	unsigned long keyul, paddr;
	spinlock_t *ptl;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * we can ignore attempts to set the key to 0, because it already is 0.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return key ? -EFAULT : 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return key ? -EFAULT : 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		/*
		 * Huge pmds need quiescing operations; they are
		 * always mapped.
		 */
		page_set_storage_key(paddr, key, 1);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT |
			      PGSTE_ACC_BITS | PGSTE_FP_BIT);
	keyul = (unsigned long) key;
	new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48);
	new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long bits, skey;

		paddr = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(paddr);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(paddr, skey, !nq);
		/* Merge host changed & referenced into pgste */
		new = set_pgste_bit(new, bits << 52);
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

/*
 * Conditionally set a guest storage key (handling csske).
 * oldkey will be updated when either mr or mc is set and a pointer is given.
 *
 * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
 * storage key was updated and -EFAULT on access errors.
 */
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			       unsigned char key, unsigned char *oldkey,
			       bool nq, bool mr, bool mc)
{
	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
	int rc;

	/* we can drop the pgste lock between getting and setting the key */
	if (mr | mc) {
		rc = get_guest_storage_key(current->mm, addr, &tmp);
		if (rc)
			return rc;
		if (oldkey)
			*oldkey = tmp;
		if (!mr)
			mask |= _PAGE_REFERENCED;
		if (!mc)
			mask |= _PAGE_CHANGED;
		if (!((tmp ^ key) & mask))
			return 0;
	}
	rc = set_guest_storage_key(current->mm, addr, key, nq);
	return rc < 0 ? rc : 1;
}
EXPORT_SYMBOL(cond_set_guest_storage_key);
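
/*
 * Illustrative sketch, not part of the original file: an SSKE-style handler
 * could drive the conditional variant above roughly like this (vmaddr, key,
 * oldkey and the nq/mr/mc flags would come from the instruction being
 * emulated):
 *
 *	rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey,
 *					nq, mr, mc);
 *	if (rc < 0)
 *		return rc;	// -EFAULT: address not mapped
 *	// rc == 0: key already matched, rc == 1: key was updated
 */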

/*
 * Reset a guest reference bit (rrbe), returning the reference and changed bit.
 *
 * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
 */
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	unsigned long paddr;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;
	int cc = 0;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0 and there is nothing for us to do.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		cc = page_reset_referenced(paddr);
		spin_unlock(ptl);
		return cc;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	/* Reset guest reference bit only */
	new = clear_pgste_bit(new, PGSTE_GR_BIT);

	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		paddr = pte_val(*ptep) & PAGE_MASK;
		cc = page_reset_referenced(paddr);
		/* Merge real referenced bit into host-set */
		new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT);
	}
	/* Reflect guest's logical view, not physical */
	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
	/* Changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);

int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char *key)
{
	unsigned long paddr;
	spinlock_t *ptl;
	pgste_t pgste;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0.
	 */
	*key = 0;

	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		*key = page_get_storage_key(paddr);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	pgste = pgste_get_lock(ptep);
	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	paddr = pte_val(*ptep) & PAGE_MASK;
	if (!(pte_val(*ptep) & _PAGE_INVALID))
		*key = page_get_storage_key(paddr);
	/* Reflect guest's logical view, not physical */
	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);

/**
 * pgste_perform_essa - perform ESSA actions on the PGSTE.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @orc: the specific action to perform, see the ESSA_SET_* macros.
 * @oldpte: the PTE will be saved there if the pointer is not NULL.
 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
 *
 * Return: 1 if the page is to be added to the CBRL, otherwise 0,
 *	   or < 0 in case of error. -EINVAL is returned for invalid values
 *	   of orc, -EFAULT for invalid addresses.
 */
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
		       unsigned long *oldpte, unsigned long *oldpgste)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	int res = 0;

	WARN_ON_ONCE(orc > ESSA_MAX);
	if (unlikely(orc > ESSA_MAX))
		return -EINVAL;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	if (oldpte)
		*oldpte = pte_val(*ptep);
	if (oldpgste)
		*oldpgste = pgstev;

	switch (orc) {
	case ESSA_GET_STATE:
		break;
	case ESSA_SET_STABLE:
		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_UNUSED:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
			break;
		}
		if (pgstev & _PGSTE_GPS_ZERO) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			break;
		}
		if (!(pgstev & PGSTE_GC_BIT)) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
			break;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
			pgstev |= _PGSTE_GPS_USAGE_STABLE;
		}
		break;
	case ESSA_SET_STABLE_NODAT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
		break;
	default:
		/* we should never get here! */
		break;
	}
	/* If we are discarding a page, set it to logical zero */
	if (res)
		pgstev |= _PGSTE_GPS_ZERO;

	pgste = __pgste(pgstev);
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return res;
}
EXPORT_SYMBOL(pgste_perform_essa);
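
/*
 * Illustrative sketch, not part of the original file: a CMMA-aware caller
 * could apply an ESSA action and react to the CBRL indication roughly like
 * this (hva is the host virtual address of the guest page):
 *
 *	res = pgste_perform_essa(mm, hva, ESSA_SET_VOLATILE, &oldpte, &oldpgste);
 *	if (res < 0)
 *		return res;	// -EINVAL or -EFAULT
 *	if (res == 1)
 *		...		// report the page in the CBRL
 */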

/**
 * set_pgste_bits - set specific PGSTE bits.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @bits: a bitmask representing the bits that will be touched
 * @value: the values of the bits to be written. Only the bits in the mask
 *	   will be written.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
		   unsigned long bits, unsigned long value)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pgste_t new;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	new = pgste_get_lock(ptep);

	new = clear_pgste_bit(new, bits);
	new = set_pgste_bit(new, value & bits);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_pgste_bits);

/**
 * get_pgste - get the current PGSTE for the given address.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @pgstep: will be written with the current PGSTE for the given address.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	*pgstep = pgste_val(pgste_get(ptep));
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_pgste);
#endif