// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/machine.h>

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL);
	}
}

static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	}
}

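/*
 * The two flush helpers below differ only in how eagerly they get rid of
 * stale TLB entries: ptep_flush_direct() always issues an IPTE (the local
 * variant if only the current CPU has the mm attached and local TLB
 * clearing is available, the global one otherwise), while ptep_flush_lazy()
 * merely marks the PTE invalid and sets mm->context.flush_mm when no other
 * CPU is attached, deferring the actual flush.
 */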
static inline pte_t ptep_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep,
				      int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		ptep_ipte_local(mm, addr, ptep, nodat);
	else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pte_t *ptep,
				    int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
		mm->context.flush_mm = 1;
	} else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pgste_t pgste_get_lock(pte_t *ptep)
{
	unsigned long value = 0;
#ifdef CONFIG_PGSTE
	unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE);

	do {
		value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr);
	} while (value & PGSTE_PCL_BIT);
	value |= PGSTE_PCL_BIT;
#endif
	return __pgste(value);
}

static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	barrier();
	WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT);
#endif
}

static inline pgste_t pgste_get(pte_t *ptep)
{
	unsigned long pgste = 0;
#ifdef CONFIG_PGSTE
	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
#endif
	return __pgste(pgste);
}

static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
#endif
}

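/*
 * Typical PGSTE update sequence in this file (illustrative only):
 *
 *	preempt_disable();
 *	pgste = pgste_get_lock(ptep);
 *	... modify the pte and/or the pgste ...
 *	pgste_set_unlock(ptep, pgste);
 *	preempt_enable();
 *
 * pgste_get_lock() spins until PGSTE_PCL_BIT could be set, and
 * pgste_set_unlock() clears the bit again when writing back the new value.
 * pgste_get()/pgste_set() are the unlocked accessors; they are only used
 * while the lock bit is already held, see ptep_modify_prot_start() and
 * ptep_modify_prot_commit().
 */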
static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
				       struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address, bits, skey;

	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
		return pgste;
	address = pte_val(pte) & PAGE_MASK;
	skey = (unsigned long) page_get_storage_key(address);
	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
	/* Transfer page changed & referenced bit to guest bits in pgste */
	pgste = set_pgste_bit(pgste, bits << 48);	/* GR bit & GC bit */
	/* Copy page access key and fetch protection bit to pgste */
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
#endif
	return pgste;
}

static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
				 struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address;
	unsigned long nkey;

	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
		return;
	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
	address = pte_val(entry) & PAGE_MASK;
	/*
	 * Set page access key and fetch protection bit from pgste.
	 * The guest C/R information is still in the PGSTE, set real
	 * key C/R to 0.
	 */
	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	page_set_storage_key(address, nkey, 0);
#endif
}

static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
{
#ifdef CONFIG_PGSTE
	if ((pte_val(entry) & _PAGE_PRESENT) &&
	    (pte_val(entry) & _PAGE_WRITE) &&
	    !(pte_val(entry) & _PAGE_INVALID)) {
		if (!machine_has_esop()) {
			/*
			 * Without enhanced suppression-on-protection force
			 * the dirty bit on for all writable ptes.
			 */
			entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
			entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
		}
		if (!(pte_val(entry) & _PAGE_PROTECT))
			/* This pte allows write access, set user-dirty */
			pgste = set_pgste_bit(pgste, PGSTE_UC_BIT);
	}
#endif
	set_pte(ptep, entry);
	return pgste;
}

static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	unsigned long bits;

	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
	if (bits) {
		pgste = __pgste(pgste_val(pgste) ^ bits);
		ptep_notify(mm, addr, ptep, bits);
	}
#endif
	return pgste;
}

static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	pgste_t pgste = __pgste(0);

	if (mm_has_pgste(mm)) {
		pgste = pgste_get_lock(ptep);
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
	}
	return pgste;
}

static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
				     unsigned long addr, pte_t *ptep,
				     pgste_t pgste, pte_t old, pte_t new)
{
	if (mm_has_pgste(mm)) {
		if (pte_val(old) & _PAGE_INVALID)
			pgste_set_key(ptep, pgste, new, mm);
		if (pte_val(new) & _PAGE_INVALID) {
			pgste = pgste_update_all(old, pgste, mm);
			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
			    _PGSTE_GPS_USAGE_UNUSED)
				old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
		}
		pgste = pgste_set_pte(ptep, pgste, new);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, new);
	}
	return old;
}

pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_direct(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);

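/*
 * Illustrative use of the exchange helpers (not taken from a real caller):
 * replacing a PTE with the invalid PTE and flushing it right away would
 * look roughly like
 *
 *	pte_t old = ptep_xchg_direct(mm, addr, ptep, __pte(_PAGE_INVALID));
 *
 * ptep_xchg_lazy() further below has the same interface but uses
 * ptep_flush_lazy(), so the TLB flush may be deferred.
 */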
/*
 * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that
 * RDP can be used instead of IPTE. See also comments at pte_allow_rdp().
 */
void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
			 pte_t new)
{
	preempt_disable();
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		__ptep_rdp(addr, ptep, 0, 0, 1);
	else
		__ptep_rdp(addr, ptep, 0, 0, 0);
	/*
	 * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That
	 * means it is still valid and active, and must not be changed according
	 * to the architecture. But writing a new value that only differs in SW
	 * bits is allowed.
	 */
	set_pte(ptep, new);
	atomic_dec(&mm->context.flush_count);
	preempt_enable();
}
EXPORT_SYMBOL(ptep_reset_dat_prot);

pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);

pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	pgste_t pgste;
	pte_t old;
	int nodat;
	struct mm_struct *mm = vma->vm_mm;

	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	if (mm_has_pgste(mm)) {
		pgste = pgste_update_all(old, pgste, mm);
		pgste_set(ptep, pgste);
	}
	return old;
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	pgste_t pgste;
	struct mm_struct *mm = vma->vm_mm;

	if (mm_has_pgste(mm)) {
		pgste = pgste_get(ptep);
		pgste_set_key(ptep, pgste, pte, mm);
		pgste = pgste_set_pte(ptep, pgste, pte);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, pte);
	}
}

static inline void pmdp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest())
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
	if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
		gmap_pmdp_idte_local(mm, addr);
}

static inline void pmdp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest()) {
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else if (cpu_has_idte()) {
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else {
		__pmdp_csp(pmdp);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_csp(mm, addr);
	}
}

static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pmdp_idte_local(mm, addr, pmdp);
	else
		pmdp_idte_global(mm, addr, pmdp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
		mm->context.flush_mm = 1;
		if (mm_has_pgste(mm))
			gmap_pmdp_invalidate(mm, addr);
	} else {
		pmdp_idte_global(mm, addr, pmdp);
	}
	atomic_dec(&mm->context.flush_count);
	return old;
}

#ifdef CONFIG_PGSTE
static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
	struct vm_area_struct *vma;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	/* We need a valid VMA, otherwise this is clearly a fault. */
	vma = vma_lookup(mm, addr);
	if (!vma)
		return -EFAULT;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return -ENOENT;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return -ENOENT;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return -ENOENT;

	/* Large PUDs are not supported yet. */
	if (pud_leaf(*pud))
		return -EFAULT;

	*pmdp = pmd_offset(pud, addr);
	return 0;
}
#endif

pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_direct(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);

pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_lazy(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);

static inline void pudp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL);
}

static inline void pudp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
	else if (cpu_has_idte())
		__pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
	else
		/*
		 * Invalid bit position is the same for pmd and pud, so we can
		 * reuse _pmd_csp() here
		 */
		__pmdp_csp((pmd_t *) pudp);
}

static inline pud_t pudp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pud_t *pudp)
{
	pud_t old;

	old = *pudp;
	if (pud_val(old) & _REGION_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pudp_idte_local(mm, addr, pudp);
	else
		pudp_idte_global(mm, addr, pudp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pud_t *pudp, pud_t new)
{
	pud_t old;

	preempt_disable();
	old = pudp_flush_direct(mm, addr, pudp);
	set_pud(pudp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pudp_xchg_direct);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

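/*
 * Counterpart to pgtable_trans_huge_deposit() above: a deposited page table
 * is taken off the list and returned. While deposited, the first two words
 * of the page table are reused as a list_head, which is why the first two
 * PTEs are explicitly reset to _PAGE_INVALID before the page table is
 * handed back.
 */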
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	set_pte(ptep, __pte(_PAGE_INVALID));
	ptep++;
	set_pte(ptep, __pte(_PAGE_INVALID));
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_PGSTE
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry)
{
	pgste_t pgste;

	/* the mm_has_pgste() check is done in set_pte_at() */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO);
	pgste_set_key(ptep, pgste, entry, mm);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pgste_t pgste;

	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = set_pgste_bit(pgste, PGSTE_IN_BIT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

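/*
 * ptep_set_notify() only arms the notification: PGSTE_IN_BIT (and
 * PGSTE_VSIE_BIT, set by ptep_shadow_pte()) is consumed by
 * pgste_pte_notify(), which clears the bits and calls ptep_notify() the
 * next time the PTE is exchanged or its dirty state is harvested.
 */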
/**
 * ptep_force_prot - change access rights of a locked pte
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the guest address space
 * @ptep: pointer to the page table entry
 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bit: pgste bit to set (e.g. for notification)
 *
 * Returns 0 if the access rights were changed and -EAGAIN if the current
 * and requested access rights are incompatible.
 */
int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, int prot, unsigned long bit)
{
	pte_t entry;
	pgste_t pgste;
	int pte_i, pte_p, nodat;

	pgste = pgste_get_lock(ptep);
	entry = *ptep;
	/* Check pte entry after all locks have been acquired */
	pte_i = pte_val(entry) & _PAGE_INVALID;
	pte_p = pte_val(entry) & _PAGE_PROTECT;
	if ((pte_i && (prot != PROT_NONE)) ||
	    (pte_p && (prot & PROT_WRITE))) {
		pgste_set_unlock(ptep, pgste);
		return -EAGAIN;
	}
	/* Change access rights and set pgste bit */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	if (prot == PROT_NONE && !pte_i) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		pgste = pgste_update_all(entry, pgste, mm);
		entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
	}
	if (prot == PROT_READ && !pte_p) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
		entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
	}
	pgste = set_pgste_bit(pgste, bit);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	return 0;
}

int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
		    pte_t *sptep, pte_t *tptep, pte_t pte)
{
	pgste_t spgste, tpgste;
	pte_t spte, tpte;
	int rc = -EAGAIN;

	if (!(pte_val(*tptep) & _PAGE_INVALID))
		return 0;	/* already shadowed */
	spgste = pgste_get_lock(sptep);
	spte = *sptep;
	if (!(pte_val(spte) & _PAGE_INVALID) &&
	    !((pte_val(spte) & _PAGE_PROTECT) &&
	      !(pte_val(pte) & _PAGE_PROTECT))) {
		spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT);
		tpgste = pgste_get_lock(tptep);
		tpte = __pte((pte_val(spte) & PAGE_MASK) |
			     (pte_val(pte) & _PAGE_PROTECT));
		/* don't touch the storage key - it belongs to parent pgste */
		tpgste = pgste_set_pte(tptep, tpgste, tpte);
		pgste_set_unlock(tptep, tpgste);
		rc = 1;
	}
	pgste_set_unlock(sptep, spgste);
	return rc;
}

void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
{
	pgste_t pgste;
	int nodat;

	pgste = pgste_get_lock(ptep);
	/* notifier is called by the caller */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	ptep_flush_direct(mm, saddr, ptep, nodat);
	/* don't touch the storage key - it belongs to parent pgste */
	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
	pgste_set_unlock(ptep, pgste);
}

static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct folio *folio = pfn_swap_entry_folio(entry);

		dec_mm_counter(mm, mm_counter(folio));
	}
	free_swap_and_cache(entry);
}

void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, int reset)
{
	unsigned long pgstev;
	pgste_t pgste;
	pte_t pte;

	/* Zap unused and logically-zero pages */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	pte = *ptep;
	if (!reset && pte_swap(pte) &&
	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
	     (pgstev & _PGSTE_GPS_ZERO))) {
		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
		pte_clear(mm, addr, ptep);
	}
	if (reset)
		pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

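/*
 * Unlike ptep_zap_unused() above, which may drop the mapping itself,
 * ptep_zap_key() only resets the key state of a page: ACC and F are
 * cleared in the PGSTE, the guest view of R and C is forced on, and the
 * real storage key is reset to PAGE_DEFAULT_KEY for valid writable
 * mappings.
 */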
void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long ptev;
	pgste_t pgste;

	/* Clear storage key ACC and F, but set R/C */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*ptep);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/*
 * Test and reset if a guest page is dirty
 */
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep)
{
	pgste_t pgste;
	pte_t pte;
	bool dirty;
	int nodat;

	pgste = pgste_get_lock(ptep);
	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
	pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT);
	pte = *ptep;
	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
		ptep_ipte_global(mm, addr, ptep, nodat);
		if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE))
			pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
		else
			pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
		set_pte(ptep, pte);
	}
	pgste_set_unlock(ptep, pgste);
	return dirty;
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);

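/*
 * The helpers below implement guest storage key handling. A key lives in
 * two places: the real storage key of the backing host page and the PGSTE
 * softbits (PGSTE_ACC_BITS, PGSTE_FP_BIT, PGSTE_GR_BIT, PGSTE_GC_BIT), so
 * that the guest's key content survives while a page is unmapped or
 * swapped out.
 */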
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char key, bool nq)
{
	unsigned long keyul, paddr;
	spinlock_t *ptl;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * we can ignore attempts to set the key to 0, because it already is 0.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return key ? -EFAULT : 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return key ? -EFAULT : 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		/*
		 * Huge pmds need quiescing operations, they are
		 * always mapped.
		 */
		page_set_storage_key(paddr, key, 1);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT |
			      PGSTE_ACC_BITS | PGSTE_FP_BIT);
	keyul = (unsigned long) key;
	new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48);
	new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long bits, skey;

		paddr = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(paddr);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(paddr, skey, !nq);
		/* Merge host changed & referenced into pgste */
		new = set_pgste_bit(new, bits << 52);
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

/*
 * Conditionally set a guest storage key (handling csske).
 * oldkey will be updated when either mr or mc is set and a pointer is given.
 *
 * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
 * storage key was updated and -EFAULT on access errors.
 */
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			       unsigned char key, unsigned char *oldkey,
			       bool nq, bool mr, bool mc)
{
	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
	int rc;

	/* we can drop the pgste lock between getting and setting the key */
	if (mr | mc) {
		rc = get_guest_storage_key(current->mm, addr, &tmp);
		if (rc)
			return rc;
		if (oldkey)
			*oldkey = tmp;
		if (!mr)
			mask |= _PAGE_REFERENCED;
		if (!mc)
			mask |= _PAGE_CHANGED;
		if (!((tmp ^ key) & mask))
			return 0;
	}
	rc = set_guest_storage_key(current->mm, addr, key, nq);
	return rc < 0 ? rc : 1;
}
EXPORT_SYMBOL(cond_set_guest_storage_key);

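/*
 * Illustrative only: an instruction emulator handling SSKE for a guest
 * would translate the guest absolute address to a host virtual address
 * and then call roughly
 *
 *	rc = cond_set_guest_storage_key(current->mm, hva, key, &oldkey,
 *					nq, mr, mc);
 *
 * with the nq/mr/mc flags derived from the instruction.
 */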
/*
 * Reset a guest reference bit (rrbe), returning the reference and changed bit.
 *
 * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
 */
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	unsigned long paddr;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;
	int cc = 0;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0 and there is nothing for us to do.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		cc = page_reset_referenced(paddr);
		spin_unlock(ptl);
		return cc;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	/* Reset guest reference bit only */
	new = clear_pgste_bit(new, PGSTE_GR_BIT);

	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		paddr = pte_val(*ptep) & PAGE_MASK;
		cc = page_reset_referenced(paddr);
		/* Merge real referenced bit into host-set */
		new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT);
	}
	/* Reflect guest's logical view, not physical */
	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
	/* Changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);

int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char *key)
{
	unsigned long paddr;
	spinlock_t *ptl;
	pgste_t pgste;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0.
	 */
	*key = 0;

	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		*key = page_get_storage_key(paddr);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	pgste = pgste_get_lock(ptep);
	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	paddr = pte_val(*ptep) & PAGE_MASK;
	if (!(pte_val(*ptep) & _PAGE_INVALID))
		*key = page_get_storage_key(paddr);
	/* Reflect guest's logical view, not physical */
	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);

/**
 * pgste_perform_essa - perform ESSA actions on the PGSTE.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @orc: the specific action to perform, see the ESSA_SET_* macros.
 * @oldpte: the PTE will be saved there if the pointer is not NULL.
 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
 *
 * Return: 1 if the page is to be added to the CBRL, otherwise 0,
 *	   or < 0 in case of error. -EINVAL is returned for invalid values
 *	   of orc, -EFAULT for invalid addresses.
 */
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
		       unsigned long *oldpte, unsigned long *oldpgste)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	int res = 0;

	WARN_ON_ONCE(orc > ESSA_MAX);
	if (unlikely(orc > ESSA_MAX))
		return -EINVAL;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	if (oldpte)
		*oldpte = pte_val(*ptep);
	if (oldpgste)
		*oldpgste = pgstev;

	switch (orc) {
	case ESSA_GET_STATE:
		break;
	case ESSA_SET_STABLE:
		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_UNUSED:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
			break;
		}
		if (pgstev & _PGSTE_GPS_ZERO) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			break;
		}
		if (!(pgstev & PGSTE_GC_BIT)) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
			break;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
			pgstev |= _PGSTE_GPS_USAGE_STABLE;
		}
		break;
	case ESSA_SET_STABLE_NODAT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
		break;
	default:
		/* we should never get here! */
		break;
	}
	/* If we are discarding a page, set it to logical zero */
	if (res)
		pgstev |= _PGSTE_GPS_ZERO;

	pgste = __pgste(pgstev);
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return res;
}
EXPORT_SYMBOL(pgste_perform_essa);

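/*
 * Illustrative only: to set the usage state to stable together with the
 * NODAT indication (the same combination ESSA_SET_STABLE_NODAT applies
 * above), a caller of the helper below could do
 *
 *	rc = set_pgste_bits(mm, hva,
 *			    _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT,
 *			    _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT);
 *
 * Only the bits selected by the mask argument are modified.
 */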
/**
 * set_pgste_bits - set specific PGSTE bits.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @bits: a bitmask representing the bits that will be touched
 * @value: the values of the bits to be written. Only the bits in the mask
 *	   will be written.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
		   unsigned long bits, unsigned long value)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pgste_t new;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	new = pgste_get_lock(ptep);

	new = clear_pgste_bit(new, bits);
	new = set_pgste_bit(new, value & bits);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_pgste_bits);

/**
 * get_pgste - get the current PGSTE for the given address.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @pgstep: will be written with the current PGSTE for the given address.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	*pgstep = pgste_val(pgste_get(ptep));
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_pgste);
#endif