// SPDX-License-Identifier: GPL-2.0
/*
 * Page table allocation functions
 *
 * Copyright IBM Corp. 2016
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
	unsigned long *table;

	if (!ptdesc)
		return NULL;
	table = ptdesc_to_virt(ptdesc);
	__arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
	return table;
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	if (!table)
		return;
	pagetable_free(virt_to_ptdesc(table));
}

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;
	struct ctlreg asce;

	/* change all active ASCEs to avoid the creation of new TLBs */
	if (current->active_mm == mm) {
		asce.val = mm->context.asce;
		get_lowcore()->user_asce = asce;
		local_ctl_load(7, &asce);
		if (!test_thread_flag(TIF_ASCE_PRIMARY))
			local_ctl_load(1, &asce);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
	unsigned long asce_limit = mm->context.asce_limit;

	mmap_assert_write_locked(mm);

	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
	VM_BUG_ON(asce_limit < _REGION2_SIZE);

	if (end <= asce_limit)
		return 0;

	if (asce_limit == _REGION2_SIZE) {
		p4d = crst_table_alloc(mm);
		if (unlikely(!p4d))
			goto err_p4d;
		crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
		pagetable_p4d_ctor(virt_to_ptdesc(p4d));
	}
	if (end > _REGION1_SIZE) {
		pgd = crst_table_alloc(mm);
		if (unlikely(!pgd))
			goto err_pgd;
		crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
		pagetable_pgd_ctor(virt_to_ptdesc(pgd));
	}

	spin_lock_bh(&mm->page_table_lock);

	if (p4d) {
		__pgd = (unsigned long *) mm->pgd;
		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
		mm->pgd = (pgd_t *) p4d;
		mm->context.asce_limit = _REGION1_SIZE;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		mm_inc_nr_puds(mm);
	}
	if (pgd) {
		__pgd = (unsigned long *) mm->pgd;
		pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
		mm->pgd = (pgd_t *) pgd;
		mm->context.asce_limit = TASK_SIZE_MAX;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
	}

	spin_unlock_bh(&mm->page_table_lock);

	on_each_cpu(__crst_table_upgrade, mm, 0);

	return 0;

err_pgd:
	pagetable_dtor(virt_to_ptdesc(p4d));
	crst_table_free(mm, p4d);
err_p4d:
	return -ENOMEM;
}

#ifdef CONFIG_PGSTE

struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct ptdesc *ptdesc;
	u64 *table;

	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
	if (ptdesc) {
		table = (u64 *)ptdesc_to_virt(ptdesc);
		__arch_set_page_dat(table, 1);
		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	}
	return ptdesc;
}

void page_table_free_pgste(struct ptdesc *ptdesc)
{
	pagetable_free(ptdesc);
}

#endif /* CONFIG_PGSTE */

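/*
 * A pte table allocated by page_table_alloc() or page_table_alloc_pgste()
 * occupies a full page: the lower half holds the PTRS_PER_PTE page table
 * entries and is initialized to _PAGE_INVALID, the upper half is cleared.
 * With CONFIG_PGSTE the upper half holds the page status table entries
 * (pgstes) used by KVM; otherwise it stays unused.
 */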
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct ptdesc *ptdesc;
	unsigned long *table;

	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
	if (!ptdesc)
		return NULL;
	if (!pagetable_pte_ctor(mm, ptdesc)) {
		pagetable_free(ptdesc);
		return NULL;
	}
	table = ptdesc_to_virt(ptdesc);
	__arch_set_page_dat(table, 1);
	memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
	memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(table);

	pagetable_dtor_free(ptdesc);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pte_free_now(struct rcu_head *head)
{
	struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);

	pagetable_dtor_free(ptdesc);
}

void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);

	call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Base infrastructure required to generate basic asces, region, segment,
 * and page tables that do not make use of enhanced features like EDAT1.
 */

static struct kmem_cache *base_pgt_cache;

static unsigned long *base_pgt_alloc(void)
{
	unsigned long *table;

	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
	if (table)
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
	return table;
}

static void base_pgt_free(unsigned long *table)
{
	kmem_cache_free(base_pgt_cache, table);
}

static unsigned long *base_crst_alloc(unsigned long val)
{
	unsigned long *table;
	struct ptdesc *ptdesc;

	ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
	if (!ptdesc)
		return NULL;
	table = ptdesc_address(ptdesc);
	crst_table_init(table, val);
	return table;
}

static void base_crst_free(unsigned long *table)
{
	if (!table)
		return;
	pagetable_free(virt_to_ptdesc(table));
}

#define BASE_ADDR_END_FUNC(NAME, SIZE)					\
static inline unsigned long base_##NAME##_addr_end(unsigned long addr,	\
						   unsigned long end)	\
{									\
	unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);		\
									\
	return (next - 1) < (end - 1) ? next : end;			\
}

BASE_ADDR_END_FUNC(page,    PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)

static inline unsigned long base_lra(unsigned long address)
{
	unsigned long real;

	asm volatile(
		"	lra	%0,0(%1)\n"
		: "=d" (real) : "a" (address) : "cc");
	return real;
}

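/*
 * The base_*_walk() functions below serve two purposes, selected by the
 * alloc argument: with alloc != 0 they populate the base tables for the
 * given address range, allocating lower level tables as needed and filling
 * each pte with the real address that base_lra() reports for the virtual
 * address (i.e. the translation established by the regular page tables);
 * with alloc == 0 they walk the same range again and free all lower level
 * tables.
 */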
static int base_page_walk(unsigned long *origin, unsigned long addr,
			  unsigned long end, int alloc)
{
	unsigned long *pte, next;

	if (!alloc)
		return 0;
	pte = origin;
	pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT;
	do {
		next = base_page_addr_end(addr, end);
		*pte = base_lra(addr);
	} while (pte++, addr = next, addr < end);
	return 0;
}

static int base_segment_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *ste, next, *table;
	int rc;

	ste = origin;
	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
	do {
		next = base_segment_addr_end(addr, end);
		if (*ste & _SEGMENT_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_pgt_alloc();
			if (!table)
				return -ENOMEM;
			*ste = __pa(table) | _SEGMENT_ENTRY;
		}
		table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
		rc = base_page_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_pgt_free(table);
		cond_resched();
	} while (ste++, addr = next, addr < end);
	return 0;
}

static int base_region3_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rtte, next, *table;
	int rc;

	rtte = origin;
	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
	do {
		next = base_region3_addr_end(addr, end);
		if (*rtte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rtte = __pa(table) | _REGION3_ENTRY;
		}
		table = __va(*rtte & _REGION_ENTRY_ORIGIN);
		rc = base_segment_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rtte++, addr = next, addr < end);
	return 0;
}

static int base_region2_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rste, next, *table;
	int rc;

	rste = origin;
	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
	do {
		next = base_region2_addr_end(addr, end);
		if (*rste & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rste = __pa(table) | _REGION2_ENTRY;
		}
		table = __va(*rste & _REGION_ENTRY_ORIGIN);
		rc = base_region3_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rste++, addr = next, addr < end);
	return 0;
}

static int base_region1_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rfte, next, *table;
	int rc;

	rfte = origin;
	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
	do {
		next = base_region1_addr_end(addr, end);
		if (*rfte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rfte = __pa(table) | _REGION1_ENTRY;
		}
		table = __va(*rfte & _REGION_ENTRY_ORIGIN);
		rc = base_region2_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rfte++, addr = next, addr < end);
	return 0;
}

/**
 * base_asce_free - free asce and tables returned from base_asce_alloc()
 * @asce: asce to be freed
 *
 * Frees all region, segment, and page tables that were allocated with a
 * corresponding base_asce_alloc() call.
 */
void base_asce_free(unsigned long asce)
{
	unsigned long *table = __va(asce & _ASCE_ORIGIN);

	if (!asce)
		return;
	switch (asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_SEGMENT:
		base_segment_walk(table, 0, _REGION3_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION3:
		base_region3_walk(table, 0, _REGION2_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION2:
		base_region2_walk(table, 0, _REGION1_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION1:
		base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
		break;
	}
	base_crst_free(table);
}

static int base_pgt_cache_init(void)
{
	static DEFINE_MUTEX(base_pgt_cache_mutex);
	unsigned long sz = _PAGE_TABLE_SIZE;

	if (base_pgt_cache)
		return 0;
	mutex_lock(&base_pgt_cache_mutex);
	if (!base_pgt_cache)
		base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
	mutex_unlock(&base_pgt_cache_mutex);
	return base_pgt_cache ? 0 : -ENOMEM;
}

/**
 * base_asce_alloc - create kernel mapping without enhanced DAT features
 * @addr: virtual start address of kernel mapping
 * @num_pages: number of consecutive pages
 *
 * Generate an asce, including all required region, segment and page tables,
 * that can be used to access the virtual kernel mapping. The difference is
 * that the returned asce does not make use of any enhanced DAT features like
 * e.g. large pages. This is required for some I/O functions that pass an
 * asce, like e.g. some service call requests.
 *
 * Note: the returned asce may NEVER be attached to any cpu. It may only be
 *	 used for I/O requests. tlb entries that might result because the
 *	 asce was attached to a cpu won't be cleared.
 */
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
	unsigned long asce, *table, end;
	int rc;

	if (base_pgt_cache_init())
		return 0;
	end = addr + num_pages * PAGE_SIZE;
	if (end <= _REGION3_SIZE) {
		table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_segment_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION2_SIZE) {
		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region3_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION1_SIZE) {
		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region2_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
	} else {
		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region1_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
	}
	if (rc) {
		base_asce_free(asce);
		asce = 0;
	}
	return asce;
}
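
/*
 * Illustrative usage sketch, not taken from an in-tree caller; "buf" and
 * "num_pages" are made-up names for the example. A base asce is created
 * for a virtually contiguous kernel buffer, passed to a request that
 * expects an asce, and freed again afterwards. As noted above, the asce
 * must never be attached to a cpu.
 *
 *	unsigned long asce;
 *
 *	asce = base_asce_alloc((unsigned long)buf, num_pages);
 *	if (!asce)
 *		return -ENOMEM;
 *	... pass asce in the I/O or service call request ...
 *	base_asce_free(asce);
 */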