// SPDX-License-Identifier: GPL-2.0
/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

pgprot_t pgprot_writethrough(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);

static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (MACHINE_HAS_TLB_GUEST) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL);
	}
}

static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (MACHINE_HAS_TLB_GUEST) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	}
}

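/*
 * Flush helpers for a single pte: ptep_flush_direct() invalidates the pte
 * in hardware right away, using a CPU-local IPTE when only the current CPU
 * has the mm attached and a global IPTE otherwise. ptep_flush_lazy() avoids
 * the IPTE if the mm is attached to the current CPU only: it marks the pte
 * invalid in software and sets mm->context.flush_mm so that the TLB flush
 * can be done later.
 */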
static inline pte_t ptep_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep,
				      int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (MACHINE_HAS_TLB_LC &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		ptep_ipte_local(mm, addr, ptep, nodat);
	else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pte_t *ptep,
				    int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
		mm->context.flush_mm = 1;
	} else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

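/*
 * The PGSTE of a pte is located PTRS_PER_PTE entries behind the pte in the
 * same page table page. PGSTE_PCL_BIT serves as a bit lock for the PGSTE:
 * pgste_get_lock() spins until it has set the bit and returns the current
 * value, pgste_set_unlock() stores the new value with the lock bit cleared.
 */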
static inline pgste_t pgste_get_lock(pte_t *ptep)
{
	unsigned long value = 0;
#ifdef CONFIG_PGSTE
	unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE);

	do {
		value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr);
	} while (value & PGSTE_PCL_BIT);
	value |= PGSTE_PCL_BIT;
#endif
	return __pgste(value);
}

static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	barrier();
	WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT);
#endif
}

static inline pgste_t pgste_get(pte_t *ptep)
{
	unsigned long pgste = 0;
#ifdef CONFIG_PGSTE
	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
#endif
	return __pgste(pgste);
}

static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
#endif
}

static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
				       struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address, bits, skey;

	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
		return pgste;
	address = pte_val(pte) & PAGE_MASK;
	skey = (unsigned long) page_get_storage_key(address);
	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
	/* Transfer page changed & referenced bit to guest bits in pgste */
	pgste_val(pgste) |= bits << 48;		/* GR bit & GC bit */
	/* Copy page access key and fetch protection bit to pgste */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
#endif
	return pgste;
}

static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
				 struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address;
	unsigned long nkey;

	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
		return;
	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
	address = pte_val(entry) & PAGE_MASK;
	/*
	 * Set page access key and fetch protection bit from pgste.
	 * The guest C/R information is still in the PGSTE, set real
	 * key C/R to 0.
	 */
	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	page_set_storage_key(address, nkey, 0);
#endif
}

static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
{
#ifdef CONFIG_PGSTE
	if ((pte_val(entry) & _PAGE_PRESENT) &&
	    (pte_val(entry) & _PAGE_WRITE) &&
	    !(pte_val(entry) & _PAGE_INVALID)) {
		if (!MACHINE_HAS_ESOP) {
			/*
			 * Without enhanced suppression-on-protection force
			 * the dirty bit on for all writable ptes.
			 */
			entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
			entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
		}
		if (!(pte_val(entry) & _PAGE_PROTECT))
			/* This pte allows write access, set user-dirty */
			pgste_val(pgste) |= PGSTE_UC_BIT;
	}
#endif
	set_pte(ptep, entry);
	return pgste;
}

static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	unsigned long bits;

	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
	if (bits) {
		pgste_val(pgste) ^= bits;
		ptep_notify(mm, addr, ptep, bits);
	}
#endif
	return pgste;
}

static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	pgste_t pgste = __pgste(0);

	if (mm_has_pgste(mm)) {
		pgste = pgste_get_lock(ptep);
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
	}
	return pgste;
}

static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
				     unsigned long addr, pte_t *ptep,
				     pgste_t pgste, pte_t old, pte_t new)
{
	if (mm_has_pgste(mm)) {
		if (pte_val(old) & _PAGE_INVALID)
			pgste_set_key(ptep, pgste, new, mm);
		if (pte_val(new) & _PAGE_INVALID) {
			pgste = pgste_update_all(old, pgste, mm);
			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
			    _PGSTE_GPS_USAGE_UNUSED)
				old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
		}
		pgste = pgste_set_pte(ptep, pgste, new);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, new);
	}
	return old;
}

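/*
 * Exchange a pte for a new value and flush the TLB entry. The _direct
 * variant flushes immediately, the _lazy variant may defer the flush (see
 * ptep_flush_lazy()). For an mm with PGSTEs the exchange is done under the
 * PGSTE lock, pending notification bits are delivered and the storage key
 * state is transferred between the pte and its PGSTE.
 */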
pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_direct(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);

/*
 * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that
 * RDP can be used instead of IPTE. See also comments at pte_allow_rdp().
 */
void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
			 pte_t new)
{
	preempt_disable();
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		__ptep_rdp(addr, ptep, 0, 0, 1);
	else
		__ptep_rdp(addr, ptep, 0, 0, 0);
	/*
	 * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That
	 * means it is still valid and active, and must not be changed according
	 * to the architecture. But writing a new value that only differs in SW
	 * bits is allowed.
	 */
	set_pte(ptep, new);
	atomic_dec(&mm->context.flush_count);
	preempt_enable();
}
EXPORT_SYMBOL(ptep_reset_dat_prot);

pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);

pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	pgste_t pgste;
	pte_t old;
	int nodat;
	struct mm_struct *mm = vma->vm_mm;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	if (mm_has_pgste(mm)) {
		pgste = pgste_update_all(old, pgste, mm);
		pgste_set(ptep, pgste);
	}
	return old;
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	pgste_t pgste;
	struct mm_struct *mm = vma->vm_mm;

	if (!MACHINE_HAS_NX)
		pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC));
	if (mm_has_pgste(mm)) {
		pgste = pgste_get(ptep);
		pgste_set_key(ptep, pgste, pte, mm);
		pgste = pgste_set_pte(ptep, pgste, pte);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, pte);
	}
	preempt_enable();
}

static inline void pmdp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pmd_t *pmdp)
{
	if (MACHINE_HAS_TLB_GUEST)
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
	if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
		gmap_pmdp_idte_local(mm, addr);
}

static inline void pmdp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	if (MACHINE_HAS_TLB_GUEST) {
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else if (MACHINE_HAS_IDTE) {
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else {
		__pmdp_csp(pmdp);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_csp(mm, addr);
	}
}

static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (MACHINE_HAS_TLB_LC &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pmdp_idte_local(mm, addr, pmdp);
	else
		pmdp_idte_global(mm, addr, pmdp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
		mm->context.flush_mm = 1;
		if (mm_has_pgste(mm))
			gmap_pmdp_invalidate(mm, addr);
	} else {
		pmdp_idte_global(mm, addr, pmdp);
	}
	atomic_dec(&mm->context.flush_count);
	return old;
}

#ifdef CONFIG_PGSTE
static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
	struct vm_area_struct *vma;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	/* We need a valid VMA, otherwise this is clearly a fault. */
	vma = vma_lookup(mm, addr);
	if (!vma)
		return -EFAULT;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return -ENOENT;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return -ENOENT;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return -ENOENT;

	/* Large PUDs are not supported yet. */
	if (pud_leaf(*pud))
		return -EFAULT;

	*pmdp = pmd_offset(pud, addr);
	return 0;
}
#endif

pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_direct(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);

pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_lazy(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);

static inline void pudp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pud_t *pudp)
{
	if (MACHINE_HAS_TLB_GUEST)
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL);
}

static inline void pudp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pud_t *pudp)
{
	if (MACHINE_HAS_TLB_GUEST)
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
	else if (MACHINE_HAS_IDTE)
		__pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
	else
		/*
		 * Invalid bit position is the same for pmd and pud, so we can
		 * re-use _pmd_csp() here
		 */
		__pmdp_csp((pmd_t *) pudp);
}

static inline pud_t pudp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pud_t *pudp)
{
	pud_t old;

	old = *pudp;
	if (pud_val(old) & _REGION_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (MACHINE_HAS_TLB_LC &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pudp_idte_local(mm, addr, pudp);
	else
		pudp_idte_global(mm, addr, pudp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pud_t *pudp, pud_t new)
{
	pud_t old;

	preempt_disable();
	old = pudp_flush_direct(mm, addr, pudp);
	set_pud(pudp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pudp_xchg_direct);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
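/*
 * pgtable_trans_huge_deposit() stashes a preallocated pte page table for a
 * huge pmd on a list anchored at pmd_huge_pte(); pgtable_trans_huge_withdraw()
 * takes one back, e.g. when the huge pmd is split again. Both must be called
 * with the pmd lock held.
 */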
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	set_pte(ptep, __pte(_PAGE_INVALID));
	ptep++;
	set_pte(ptep, __pte(_PAGE_INVALID));
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_PGSTE
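/*
 * Install a pte in an mm that has PGSTEs: the access key and fetch
 * protection bit are transferred from the PGSTE to the storage key and the
 * pte is set under the PGSTE lock. Called from set_pte_at() for such mms.
 */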
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry)
{
	pgste_t pgste;

	/* the mm_has_pgste() check is done in set_pte_at() */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
	pgste_set_key(ptep, pgste, entry, mm);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pgste_t pgste;

	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste_val(pgste) |= PGSTE_IN_BIT;
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/**
 * ptep_force_prot - change access rights of a locked pte
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the guest address space
 * @ptep: pointer to the page table entry
 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bit: pgste bit to set (e.g. for notification)
 *
 * Returns 0 if the access rights were changed and -EAGAIN if the current
 * and requested access rights are incompatible.
 */
int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, int prot, unsigned long bit)
{
	pte_t entry;
	pgste_t pgste;
	int pte_i, pte_p, nodat;

	pgste = pgste_get_lock(ptep);
	entry = *ptep;
	/* Check pte entry after all locks have been acquired */
	pte_i = pte_val(entry) & _PAGE_INVALID;
	pte_p = pte_val(entry) & _PAGE_PROTECT;
	if ((pte_i && (prot != PROT_NONE)) ||
	    (pte_p && (prot & PROT_WRITE))) {
		pgste_set_unlock(ptep, pgste);
		return -EAGAIN;
	}
	/* Change access rights and set pgste bit */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	if (prot == PROT_NONE && !pte_i) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		pgste = pgste_update_all(entry, pgste, mm);
		entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
	}
	if (prot == PROT_READ && !pte_p) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
		entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
	}
	pgste_val(pgste) |= bit;
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	return 0;
}

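/*
 * Create a shadow pte for a vsie (nested) guest mapping: the page frame of
 * the source pte at @sptep is copied to the target pte at @tptep with the
 * protection bit of @pte applied, and the source PGSTE is marked with
 * PGSTE_VSIE_BIT so that later changes are notified. Returns 1 if a shadow
 * pte was written, 0 if the target pte was already valid, and -EAGAIN if
 * the source pte does not allow the requested access.
 */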
int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
		    pte_t *sptep, pte_t *tptep, pte_t pte)
{
	pgste_t spgste, tpgste;
	pte_t spte, tpte;
	int rc = -EAGAIN;

	if (!(pte_val(*tptep) & _PAGE_INVALID))
		return 0;	/* already shadowed */
	spgste = pgste_get_lock(sptep);
	spte = *sptep;
	if (!(pte_val(spte) & _PAGE_INVALID) &&
	    !((pte_val(spte) & _PAGE_PROTECT) &&
	      !(pte_val(pte) & _PAGE_PROTECT))) {
		pgste_val(spgste) |= PGSTE_VSIE_BIT;
		tpgste = pgste_get_lock(tptep);
		tpte = __pte((pte_val(spte) & PAGE_MASK) |
			     (pte_val(pte) & _PAGE_PROTECT));
		/* don't touch the storage key - it belongs to parent pgste */
		tpgste = pgste_set_pte(tptep, tpgste, tpte);
		pgste_set_unlock(tptep, tpgste);
		rc = 1;
	}
	pgste_set_unlock(sptep, spgste);
	return rc;
}

void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
{
	pgste_t pgste;
	int nodat;

	pgste = pgste_get_lock(ptep);
	/* notifier is called by the caller */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	ptep_flush_direct(mm, saddr, ptep, nodat);
	/* don't touch the storage key - it belongs to parent pgste */
	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
	pgste_set_unlock(ptep, pgste);
}

static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct folio *folio = pfn_swap_entry_folio(entry);

		dec_mm_counter(mm, mm_counter(folio));
	}
	free_swap_and_cache(entry);
}

void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, int reset)
{
	unsigned long pgstev;
	pgste_t pgste;
	pte_t pte;

	/* Zap unused and logically-zero pages */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	pte = *ptep;
	if (!reset && pte_swap(pte) &&
	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
	     (pgstev & _PGSTE_GPS_ZERO))) {
		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
		pte_clear(mm, addr, ptep);
	}
	if (reset)
		pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long ptev;
	pgste_t pgste;

	/* Clear storage key ACC and F, but set R/C */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
	ptev = pte_val(*ptep);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/*
 * Test and reset if a guest page is dirty
 */
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep)
{
	pgste_t pgste;
	pte_t pte;
	bool dirty;
	int nodat;

	pgste = pgste_get_lock(ptep);
	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
	pgste_val(pgste) &= ~PGSTE_UC_BIT;
	pte = *ptep;
	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
		ptep_ipte_global(mm, addr, ptep, nodat);
		if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
			pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
		else
			pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
		set_pte(ptep, pte);
	}
	pgste_set_unlock(ptep, pgste);
	return dirty;
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);

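/*
 * Guest storage key handling: set_guest_storage_key() and
 * get_guest_storage_key() maintain the guest view of a page's storage key.
 * The key is kept in the PGSTE and, while the page is mapped, also in the
 * real storage key of the page; the guest referenced/changed state is
 * merged between the two.
 */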
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char key, bool nq)
{
	unsigned long keyul, paddr;
	spinlock_t *ptl;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * we can ignore attempts to set the key to 0, because it already is 0.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return key ? -EFAULT : 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return key ? -EFAULT : 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		/*
		 * Huge pmds need quiescing operations, they are
		 * always mapped.
		 */
		page_set_storage_key(paddr, key, 1);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	keyul = (unsigned long) key;
	pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long bits, skey;

		paddr = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(paddr);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(paddr, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

/*
 * Conditionally set a guest storage key (handling csske).
 * oldkey will be updated when either mr or mc is set and a pointer is given.
 *
 * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
 * storage key was updated and -EFAULT on access errors.
 */
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			       unsigned char key, unsigned char *oldkey,
			       bool nq, bool mr, bool mc)
{
	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
	int rc;

	/* we can drop the pgste lock between getting and setting the key */
	if (mr | mc) {
		rc = get_guest_storage_key(current->mm, addr, &tmp);
		if (rc)
			return rc;
		if (oldkey)
			*oldkey = tmp;
		if (!mr)
			mask |= _PAGE_REFERENCED;
		if (!mc)
			mask |= _PAGE_CHANGED;
		if (!((tmp ^ key) & mask))
			return 0;
	}
	rc = set_guest_storage_key(current->mm, addr, key, nq);
	return rc < 0 ? rc : 1;
}
EXPORT_SYMBOL(cond_set_guest_storage_key);

/*
 * Reset a guest reference bit (rrbe), returning the reference and changed bit.
 *
 * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
 */
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	unsigned long paddr;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;
	int cc = 0;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0 and there is nothing for us to do.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		cc = page_reset_referenced(paddr);
		spin_unlock(ptl);
		return cc;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	/* Reset guest reference bit only */
	pgste_val(new) &= ~PGSTE_GR_BIT;

	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		paddr = pte_val(*ptep) & PAGE_MASK;
		cc = page_reset_referenced(paddr);
		/* Merge real referenced bit into host-set */
		pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
	}
	/* Reflect guest's logical view, not physical */
	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
	/* Changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);

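/*
 * Read the guest view of the storage key for the page at @addr: the key is
 * taken from the real storage key while the page is mapped and from the
 * PGSTE otherwise, with the guest referenced and changed bits merged in
 * from the PGSTE. Returns 0 on success and -EFAULT on a lookup failure.
 */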
int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char *key)
{
	unsigned long paddr;
	spinlock_t *ptl;
	pgste_t pgste;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0.
	 */
	*key = 0;

	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		*key = page_get_storage_key(paddr);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	pgste = pgste_get_lock(ptep);
	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	paddr = pte_val(*ptep) & PAGE_MASK;
	if (!(pte_val(*ptep) & _PAGE_INVALID))
		*key = page_get_storage_key(paddr);
	/* Reflect guest's logical view, not physical */
	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);

/**
 * pgste_perform_essa - perform ESSA actions on the PGSTE.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @orc: the specific action to perform, see the ESSA_SET_* macros.
 * @oldpte: the PTE will be saved there if the pointer is not NULL.
 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
 *
 * Return: 1 if the page is to be added to the CBRL, otherwise 0,
 *         or < 0 in case of error. -EINVAL is returned for invalid values
 *         of orc, -EFAULT for invalid addresses.
 */
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
		       unsigned long *oldpte, unsigned long *oldpgste)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	int res = 0;

	WARN_ON_ONCE(orc > ESSA_MAX);
	if (unlikely(orc > ESSA_MAX))
		return -EINVAL;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	if (oldpte)
		*oldpte = pte_val(*ptep);
	if (oldpgste)
		*oldpgste = pgstev;

	switch (orc) {
	case ESSA_GET_STATE:
		break;
	case ESSA_SET_STABLE:
		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_UNUSED:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
			break;
		}
		if (pgstev & _PGSTE_GPS_ZERO) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			break;
		}
		if (!(pgstev & PGSTE_GC_BIT)) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
			break;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
			pgstev |= _PGSTE_GPS_USAGE_STABLE;
		}
		break;
	case ESSA_SET_STABLE_NODAT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
		break;
	default:
		/* we should never get here! */
		break;
	}
	/* If we are discarding a page, set it to logical zero */
	if (res)
		pgstev |= _PGSTE_GPS_ZERO;

	pgste_val(pgste) = pgstev;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return res;
}
EXPORT_SYMBOL(pgste_perform_essa);

/**
 * set_pgste_bits - set specific PGSTE bits.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @bits: a bitmask representing the bits that will be touched
 * @value: the values of the bits to be written. Only the bits in the mask
 *	   will be written.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
		   unsigned long bits, unsigned long value)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pgste_t new;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	new = pgste_get_lock(ptep);

	pgste_val(new) &= ~bits;
	pgste_val(new) |= value & bits;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_pgste_bits);

/**
 * get_pgste - get the current PGSTE for the given address.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @pgstep: will be written with the current PGSTE for the given address.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	*pgstep = pgste_val(pgste_get(ptep));
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_pgste);
#endif