1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ 3 #include <linux/bpf.h> 4 #include <linux/btf.h> 5 #include <linux/cacheflush.h> 6 #include <linux/err.h> 7 #include <linux/irq_work.h> 8 #include "linux/filter.h" 9 #include <linux/llist.h> 10 #include <linux/btf_ids.h> 11 #include <linux/vmalloc.h> 12 #include <linux/pagemap.h> 13 #include <asm/tlbflush.h> 14 #include "range_tree.h" 15 16 /* 17 * bpf_arena is a sparsely populated shared memory region between bpf program and 18 * user space process. 19 * 20 * For example on x86-64 the values could be: 21 * user_vm_start 7f7d26200000 // picked by mmap() 22 * kern_vm_start ffffc90001e69000 // picked by get_vm_area() 23 * For user space all pointers within the arena are normal 8-byte addresses. 24 * In this example 7f7d26200000 is the address of the first page (pgoff=0). 25 * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr 26 * (u32)7f7d26200000 -> 26200000 27 * hence 28 * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb 29 * kernel memory region. 30 * 31 * BPF JITs generate the following code to access arena: 32 * mov eax, eax // eax has lower 32-bit of user pointer 33 * mov word ptr [rax + r12 + off], bx 34 * where r12 == kern_vm_start and off is s16. 35 * Hence allocate 4Gb + GUARD_SZ/2 on each side. 36 * 37 * Initially kernel vm_area and user vma are not populated. 38 * User space can fault-in any address which will insert the page 39 * into kernel and user vma. 40 * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc 41 * which will insert it into kernel vm_area. 42 * The later fault-in from user space will populate that page into user vma. 43 */ 44 45 /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ 46 #define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) 47 #define KERN_VM_SZ (SZ_4G + GUARD_SZ) 48 49 static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable); 50 51 struct bpf_arena { 52 struct bpf_map map; 53 u64 user_vm_start; 54 u64 user_vm_end; 55 struct vm_struct *kern_vm; 56 struct page *scratch_page; 57 struct range_tree rt; 58 /* protects rt */ 59 rqspinlock_t spinlock; 60 struct list_head vma_list; 61 /* protects vma_list */ 62 struct mutex lock; 63 u64 zap_gen; 64 struct mutex zap_mutex; 65 struct irq_work free_irq; 66 struct work_struct free_work; 67 struct llist_head free_spans; 68 }; 69 70 static void arena_free_worker(struct work_struct *work); 71 static void arena_free_irq(struct irq_work *iw); 72 73 struct arena_free_span { 74 struct llist_node node; 75 unsigned long uaddr; 76 u32 page_cnt; 77 }; 78 79 u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) 80 { 81 return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0; 82 } 83 84 u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) 85 { 86 return arena ? arena->user_vm_start : 0; 87 } 88 89 /** 90 * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map * 91 * @map: a BPF_MAP_TYPE_ARENA map 92 * 93 * Return @map's kern_vm_start. 94 */ 95 u64 bpf_arena_map_kern_vm_start(struct bpf_map *map) 96 { 97 return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map)); 98 } 99 100 /** 101 * bpf_prog_arena - return the bpf_map of the arena referenced by @prog 102 * @prog: a loaded BPF program 103 * 104 * The verifier enforces at most one arena per program and stores it in 105 * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if 106 * @prog does not reference an arena. 107 */ 108 struct bpf_map *bpf_prog_arena(struct bpf_prog *prog) 109 { 110 struct bpf_arena *arena = prog->aux->arena; 111 112 return arena ? &arena->map : NULL; 113 } 114 115 static long arena_map_peek_elem(struct bpf_map *map, void *value) 116 { 117 return -EOPNOTSUPP; 118 } 119 120 static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags) 121 { 122 return -EOPNOTSUPP; 123 } 124 125 static long arena_map_pop_elem(struct bpf_map *map, void *value) 126 { 127 return -EOPNOTSUPP; 128 } 129 130 static long arena_map_delete_elem(struct bpf_map *map, void *value) 131 { 132 return -EOPNOTSUPP; 133 } 134 135 static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 136 { 137 return -EOPNOTSUPP; 138 } 139 140 static long compute_pgoff(struct bpf_arena *arena, long uaddr) 141 { 142 return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT; 143 } 144 145 struct apply_range_data { 146 struct page **pages; 147 struct page *scratch_page; 148 int i; 149 }; 150 151 struct clear_range_data { 152 struct llist_head *free_pages; 153 struct page *scratch_page; 154 }; 155 156 static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) 157 { 158 struct apply_range_data *d = data; 159 struct page *page; 160 pte_t pteval; 161 162 if (!data) 163 return 0; 164 165 page = d->pages[d->i]; 166 /* paranoia, similar to vmap_pages_pte_range() */ 167 if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) 168 return -EINVAL; 169 170 pteval = mk_pte(page, PAGE_KERNEL); 171 #ifdef ptep_try_set 172 /* 173 * Kernel-fault recovery may have installed the scratch page here, and 174 * some architectures (arm64) prohibit valid->valid PTE transitions. 175 * Install atomically into a none slot. If scratch is present, clear it 176 * and flush_tlb_before_set() (break-before-make) before retrying. 177 */ 178 while (!ptep_try_set(pte, pteval)) { 179 pte_t old = ptep_get(pte); 180 181 if (pte_none(old)) 182 continue; 183 if (WARN_ON_ONCE(pte_page(old) != d->scratch_page)) 184 return -EBUSY; 185 ptep_get_and_clear(&init_mm, addr, pte); 186 flush_tlb_before_set(addr); 187 } 188 #else 189 /* 190 * Without ptep_try_set() there is no atomic installer, but such arches 191 * also do not wire up bpf_arena_handle_page_fault(), so no scratch page 192 * is ever installed and the slot is always none here. 193 */ 194 if (unlikely(!pte_none(ptep_get(pte)))) 195 return -EBUSY; 196 set_pte_at(&init_mm, addr, pte, pteval); 197 #endif 198 d->i++; 199 return 0; 200 } 201 202 static void flush_vmap_cache(unsigned long start, unsigned long size) 203 { 204 flush_cache_vmap(start, start + size); 205 } 206 207 static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) 208 { 209 struct clear_range_data *d = data; 210 pte_t old_pte; 211 struct page *page; 212 213 /* 214 * Pairs with ptep_try_set() in the kernel-fault scratch installer. 215 * Both sides must be atomic. 216 */ 217 old_pte = ptep_get_and_clear(&init_mm, addr, pte); 218 if (pte_none(old_pte) || !pte_present(old_pte)) 219 return 0; 220 221 page = pte_page(old_pte); 222 if (WARN_ON_ONCE(!page)) 223 return -EINVAL; 224 225 /* 226 * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr 227 * scratches its PTE. A later bpf_arena_free_pages() over that range walks 228 * here. Without the skip, scratch_page would be freed. 229 */ 230 if (page == d->scratch_page) 231 return 0; 232 233 __llist_add(&page->pcp_llist, d->free_pages); 234 return 0; 235 } 236 237 static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data) 238 { 239 struct page *scratch_page = data; 240 241 if (!pte_none(ptep_get(pte))) 242 return 0; 243 /* 244 * Best-effort install. ptep_try_set() returns false only if another 245 * installer (real allocation or concurrent fault) won the cmpxchg. 246 * Their PTE is already valid, so the access retry succeeds. 247 * 248 * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just 249 * cause one extra re-fault through this same path. 250 */ 251 ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL)); 252 return 0; 253 } 254 255 static int populate_pgtable_except_pte(struct bpf_arena *arena) 256 { 257 /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */ 258 return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), 259 SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL); 260 } 261 262 static struct bpf_map *arena_map_alloc(union bpf_attr *attr) 263 { 264 struct vm_struct *kern_vm; 265 int numa_node = bpf_map_attr_numa_node(attr); 266 struct bpf_arena *arena; 267 u64 vm_range; 268 int err = -ENOMEM; 269 270 if (!bpf_jit_supports_arena()) 271 return ERR_PTR(-EOPNOTSUPP); 272 273 if (attr->key_size || attr->value_size || attr->max_entries == 0 || 274 /* BPF_F_MMAPABLE must be set */ 275 !(attr->map_flags & BPF_F_MMAPABLE) || 276 /* No unsupported flags present */ 277 (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV))) 278 return ERR_PTR(-EINVAL); 279 280 if (attr->map_extra & ~PAGE_MASK) 281 /* If non-zero the map_extra is an expected user VMA start address */ 282 return ERR_PTR(-EINVAL); 283 284 vm_range = (u64)attr->max_entries * PAGE_SIZE; 285 if (vm_range > SZ_4G) 286 return ERR_PTR(-E2BIG); 287 288 if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32)) 289 /* user vma must not cross 32-bit boundary */ 290 return ERR_PTR(-ERANGE); 291 292 kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP); 293 if (!kern_vm) 294 return ERR_PTR(-ENOMEM); 295 296 arena = bpf_map_area_alloc(sizeof(*arena), numa_node); 297 if (!arena) 298 goto err; 299 300 arena->kern_vm = kern_vm; 301 arena->user_vm_start = attr->map_extra; 302 if (arena->user_vm_start) 303 arena->user_vm_end = arena->user_vm_start + vm_range; 304 305 INIT_LIST_HEAD(&arena->vma_list); 306 init_llist_head(&arena->free_spans); 307 init_irq_work(&arena->free_irq, arena_free_irq); 308 INIT_WORK(&arena->free_work, arena_free_worker); 309 bpf_map_init_from_attr(&arena->map, attr); 310 311 err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page); 312 if (err) 313 goto err_free_arena; 314 315 range_tree_init(&arena->rt); 316 err = range_tree_set(&arena->rt, 0, attr->max_entries); 317 if (err) 318 goto err_free_scratch; 319 mutex_init(&arena->lock); 320 mutex_init(&arena->zap_mutex); 321 raw_res_spin_lock_init(&arena->spinlock); 322 err = populate_pgtable_except_pte(arena); 323 if (err) 324 goto err_destroy_rt; 325 326 return &arena->map; 327 328 err_destroy_rt: 329 range_tree_destroy(&arena->rt); 330 err_free_scratch: 331 __free_page(arena->scratch_page); 332 err_free_arena: 333 bpf_map_area_free(arena); 334 err: 335 free_vm_area(kern_vm); 336 return ERR_PTR(err); 337 } 338 339 static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) 340 { 341 struct bpf_arena *arena = data; 342 struct page *page; 343 pte_t pte; 344 345 pte = ptep_get(ptep); 346 if (!pte_present(pte)) /* sanity check */ 347 return 0; 348 page = pte_page(pte); 349 /* 350 * Skip the scratch page. The walk is page-table-driven, not range-tree-driven, 351 * so it can visit scratch PTEs at uaddrs the BPF program never allocated. 352 */ 353 if (page == arena->scratch_page) 354 return 0; 355 /* 356 * We do not update pte here: 357 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug 358 * 2. TLB flushing is batched or deferred. Even if we clear pte, 359 * the TLB entries can stick around and continue to permit access to 360 * the freed page. So it all relies on 1. 361 */ 362 __free_page(page); 363 return 0; 364 } 365 366 static void arena_map_free(struct bpf_map *map) 367 { 368 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 369 370 /* 371 * Check that user vma-s are not around when bpf map is freed. 372 * mmap() holds vm_file which holds bpf_map refcnt. 373 * munmap() must have happened on vma followed by arena_vm_close() 374 * which would clear arena->vma_list. 375 */ 376 if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) 377 return; 378 379 /* Ensure no pending deferred frees */ 380 irq_work_sync(&arena->free_irq); 381 flush_work(&arena->free_work); 382 383 /* 384 * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). 385 * It unmaps everything from vmalloc area and clears pgtables. 386 * Call apply_to_existing_page_range() first to find populated ptes and 387 * free those pages. 388 */ 389 apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), 390 SZ_4G + GUARD_SZ / 2, existing_page_cb, arena); 391 free_vm_area(arena->kern_vm); 392 range_tree_destroy(&arena->rt); 393 __free_page(arena->scratch_page); 394 bpf_map_area_free(arena); 395 } 396 397 static void *arena_map_lookup_elem(struct bpf_map *map, void *key) 398 { 399 return ERR_PTR(-EINVAL); 400 } 401 402 static long arena_map_update_elem(struct bpf_map *map, void *key, 403 void *value, u64 flags) 404 { 405 return -EOPNOTSUPP; 406 } 407 408 static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf, 409 const struct btf_type *key_type, const struct btf_type *value_type) 410 { 411 return 0; 412 } 413 414 static u64 arena_map_mem_usage(const struct bpf_map *map) 415 { 416 return 0; 417 } 418 419 struct vma_list { 420 struct vm_area_struct *vma; 421 struct list_head head; 422 refcount_t mmap_count; 423 u64 zap_gen; 424 }; 425 426 static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) 427 { 428 struct vma_list *vml; 429 430 vml = kmalloc_obj(*vml); 431 if (!vml) 432 return -ENOMEM; 433 refcount_set(&vml->mmap_count, 1); 434 vma->vm_private_data = vml; 435 vml->vma = vma; 436 vml->zap_gen = 0; 437 list_add(&vml->head, &arena->vma_list); 438 return 0; 439 } 440 441 static void arena_vm_open(struct vm_area_struct *vma) 442 { 443 struct vma_list *vml = vma->vm_private_data; 444 445 refcount_inc(&vml->mmap_count); 446 } 447 448 static int arena_vm_may_split(struct vm_area_struct *vma, unsigned long addr) 449 { 450 return -EINVAL; 451 } 452 453 static int arena_vm_mremap(struct vm_area_struct *vma) 454 { 455 return -EINVAL; 456 } 457 458 static void arena_vm_close(struct vm_area_struct *vma) 459 { 460 struct bpf_map *map = vma->vm_file->private_data; 461 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 462 struct vma_list *vml = vma->vm_private_data; 463 464 if (!refcount_dec_and_test(&vml->mmap_count)) 465 return; 466 guard(mutex)(&arena->lock); 467 /* update link list under lock */ 468 list_del(&vml->head); 469 vma->vm_private_data = NULL; 470 kfree(vml); 471 } 472 473 static vm_fault_t arena_vm_fault(struct vm_fault *vmf) 474 { 475 struct bpf_map *map = vmf->vma->vm_file->private_data; 476 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 477 struct mem_cgroup *new_memcg, *old_memcg; 478 struct page *page; 479 long kbase, kaddr; 480 unsigned long flags; 481 int ret; 482 483 kbase = bpf_arena_get_kern_vm_start(arena); 484 kaddr = kbase + (u32)(vmf->address); 485 486 if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) 487 /* Make a reasonable effort to address impossible case */ 488 return VM_FAULT_RETRY; 489 490 page = vmalloc_to_page((void *)kaddr); 491 if (page) { 492 if (page == arena->scratch_page) 493 /* BPF triggered scratch here; don't lazy-alloc over it */ 494 goto out_sigsegv; 495 /* already have a page vmap-ed */ 496 goto out; 497 } 498 499 bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); 500 501 if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) 502 /* User space requested to segfault when page is not allocated by bpf prog */ 503 goto out_sigsegv_memcg; 504 505 ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); 506 if (ret) 507 goto out_sigsegv_memcg; 508 509 struct apply_range_data data = { .pages = &page, .i = 0, 510 .scratch_page = arena->scratch_page }; 511 /* Account into memcg of the process that created bpf_arena */ 512 ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); 513 if (ret) { 514 range_tree_set(&arena->rt, vmf->pgoff, 1); 515 goto out_sigsegv_memcg; 516 } 517 518 ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); 519 if (ret) { 520 range_tree_set(&arena->rt, vmf->pgoff, 1); 521 free_pages_nolock(page, 0); 522 goto out_sigsegv_memcg; 523 } 524 flush_vmap_cache(kaddr, PAGE_SIZE); 525 bpf_map_memcg_exit(old_memcg, new_memcg); 526 out: 527 page_ref_add(page, 1); 528 raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); 529 vmf->page = page; 530 return 0; 531 out_sigsegv_memcg: 532 bpf_map_memcg_exit(old_memcg, new_memcg); 533 out_sigsegv: 534 raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); 535 return VM_FAULT_SIGSEGV; 536 } 537 538 static const struct vm_operations_struct arena_vm_ops = { 539 .open = arena_vm_open, 540 .may_split = arena_vm_may_split, 541 .mremap = arena_vm_mremap, 542 .close = arena_vm_close, 543 .fault = arena_vm_fault, 544 }; 545 546 static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr, 547 unsigned long len, unsigned long pgoff, 548 unsigned long flags) 549 { 550 struct bpf_map *map = filp->private_data; 551 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 552 long ret; 553 554 if (pgoff) 555 return -EINVAL; 556 if (len > SZ_4G) 557 return -E2BIG; 558 559 /* if user_vm_start was specified at arena creation time */ 560 if (arena->user_vm_start) { 561 if (len > arena->user_vm_end - arena->user_vm_start) 562 return -E2BIG; 563 if (len != arena->user_vm_end - arena->user_vm_start) 564 return -EINVAL; 565 if (addr != arena->user_vm_start) 566 return -EINVAL; 567 } 568 569 ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags); 570 if (IS_ERR_VALUE(ret)) 571 return ret; 572 if ((ret >> 32) == ((ret + len - 1) >> 32)) 573 return ret; 574 if (WARN_ON_ONCE(arena->user_vm_start)) 575 /* checks at map creation time should prevent this */ 576 return -EFAULT; 577 return round_up(ret, SZ_4G); 578 } 579 580 static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) 581 { 582 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 583 584 guard(mutex)(&arena->lock); 585 if (arena->user_vm_start && arena->user_vm_start != vma->vm_start) 586 /* 587 * If map_extra was not specified at arena creation time then 588 * 1st user process can do mmap(NULL, ...) to pick user_vm_start 589 * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..); 590 * or 591 * specify addr in map_extra and 592 * use the same addr later with mmap(addr, MAP_FIXED..); 593 */ 594 return -EBUSY; 595 596 if (arena->user_vm_end && arena->user_vm_end != vma->vm_end) 597 /* all user processes must have the same size of mmap-ed region */ 598 return -EBUSY; 599 600 /* Earlier checks should prevent this */ 601 if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff)) 602 return -EFAULT; 603 604 if (remember_vma(arena, vma)) 605 return -ENOMEM; 606 607 arena->user_vm_start = vma->vm_start; 608 arena->user_vm_end = vma->vm_end; 609 /* 610 * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and 611 * clears VM_MAYEXEC. Set VM_DONTEXPAND to avoid potential change 612 * of user_vm_start. Set VM_DONTCOPY to prevent arena VMA from 613 * being copied into the child process on fork. 614 */ 615 vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY); 616 vma->vm_ops = &arena_vm_ops; 617 return 0; 618 } 619 620 static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) 621 { 622 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 623 624 if ((u64)off >= arena->user_vm_end - arena->user_vm_start) 625 return -ERANGE; 626 *imm = (unsigned long)arena->user_vm_start; 627 return 0; 628 } 629 630 BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena) 631 const struct bpf_map_ops arena_map_ops = { 632 .map_meta_equal = bpf_map_meta_equal, 633 .map_alloc = arena_map_alloc, 634 .map_free = arena_map_free, 635 .map_direct_value_addr = arena_map_direct_value_addr, 636 .map_mmap = arena_map_mmap, 637 .map_get_unmapped_area = arena_get_unmapped_area, 638 .map_get_next_key = arena_map_get_next_key, 639 .map_push_elem = arena_map_push_elem, 640 .map_peek_elem = arena_map_peek_elem, 641 .map_pop_elem = arena_map_pop_elem, 642 .map_lookup_elem = arena_map_lookup_elem, 643 .map_update_elem = arena_map_update_elem, 644 .map_delete_elem = arena_map_delete_elem, 645 .map_check_btf = arena_map_check_btf, 646 .map_mem_usage = arena_map_mem_usage, 647 .map_btf_id = &bpf_arena_map_btf_ids[0], 648 }; 649 650 static u64 clear_lo32(u64 val) 651 { 652 return val & ~(u64)~0U; 653 } 654 655 /* 656 * Allocate pages and vmap them into kernel vmalloc area. 657 * Later the pages will be mmaped into user space vma. 658 */ 659 static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id, 660 bool sleepable) 661 { 662 /* user_vm_end/start are fixed before bpf prog runs */ 663 long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; 664 u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); 665 struct mem_cgroup *new_memcg, *old_memcg; 666 struct apply_range_data data; 667 struct page **pages = NULL; 668 long remaining, mapped = 0; 669 long alloc_pages; 670 unsigned long flags; 671 long pgoff = 0; 672 u32 uaddr32; 673 int ret, i; 674 675 if (node_id != NUMA_NO_NODE && 676 ((unsigned int)node_id >= nr_node_ids || !node_online(node_id))) 677 return 0; 678 679 if (page_cnt > page_cnt_max) 680 return 0; 681 682 if (uaddr) { 683 if (uaddr & ~PAGE_MASK) 684 return 0; 685 pgoff = compute_pgoff(arena, uaddr); 686 if (pgoff > page_cnt_max - page_cnt) 687 /* requested address will be outside of user VMA */ 688 return 0; 689 } 690 691 bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); 692 /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */ 693 alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *)); 694 pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE); 695 if (!pages) { 696 bpf_map_memcg_exit(old_memcg, new_memcg); 697 return 0; 698 } 699 data.pages = pages; 700 data.scratch_page = arena->scratch_page; 701 702 if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) 703 goto out_free_pages; 704 705 if (uaddr) { 706 ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); 707 if (ret) 708 goto out_unlock_free_pages; 709 ret = range_tree_clear(&arena->rt, pgoff, page_cnt); 710 } else { 711 ret = pgoff = range_tree_find(&arena->rt, page_cnt); 712 if (pgoff >= 0) 713 ret = range_tree_clear(&arena->rt, pgoff, page_cnt); 714 } 715 if (ret) 716 goto out_unlock_free_pages; 717 718 remaining = page_cnt; 719 uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); 720 721 while (remaining) { 722 long this_batch = min(remaining, alloc_pages); 723 724 /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ 725 memset(pages, 0, this_batch * sizeof(struct page *)); 726 727 ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages); 728 if (ret) 729 goto out; 730 731 /* 732 * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 733 * will not overflow 32-bit. Lower 32-bit need to represent 734 * contiguous user address range. 735 * Map these pages at kern_vm_start base. 736 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow 737 * lower 32-bit and it's ok. 738 */ 739 data.i = 0; 740 ret = apply_to_page_range(&init_mm, 741 kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT), 742 this_batch << PAGE_SHIFT, apply_range_set_cb, &data); 743 if (ret) { 744 /* data.i pages were mapped, account them and free the remaining */ 745 mapped += data.i; 746 for (i = data.i; i < this_batch; i++) 747 free_pages_nolock(pages[i], 0); 748 goto out; 749 } 750 751 mapped += this_batch; 752 remaining -= this_batch; 753 } 754 flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); 755 raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); 756 kfree_nolock(pages); 757 bpf_map_memcg_exit(old_memcg, new_memcg); 758 return clear_lo32(arena->user_vm_start) + uaddr32; 759 out: 760 range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); 761 raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); 762 if (mapped) { 763 flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); 764 arena_free_pages(arena, uaddr32, mapped, sleepable); 765 } 766 goto out_free_pages; 767 out_unlock_free_pages: 768 raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); 769 out_free_pages: 770 kfree_nolock(pages); 771 bpf_map_memcg_exit(old_memcg, new_memcg); 772 return 0; 773 } 774 775 /* 776 * If page is present in vmalloc area, unmap it from vmalloc area, 777 * unmap it from all user space vma-s, 778 * and free it. 779 */ 780 static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) 781 { 782 unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT; 783 struct vm_area_struct *vma; 784 struct mm_struct *mm; 785 struct vma_list *vml; 786 unsigned long vm_start; 787 u64 my_gen; 788 789 /* 790 * Taking mmap_read_lock() under arena->lock would deadlock against 791 * arena_vm_close(), which runs with mmap_write_lock held and then 792 * acquires arena->lock. Drop arena->lock for mmap_read_lock(). 793 * 794 * Use per-call my_gen, recorded in vml->zap_gen, to remember which 795 * vmls this invocation has already processed across the lock drop. 796 * Hold zap_mutex around the whole walk so concurrent zap_pages() 797 * callers cannot overwrite each other's marks on shared vmls -- 798 * otherwise call B's mark would make call A skip a vml that A has 799 * not yet zapped for A's uaddr range. 800 */ 801 mutex_lock(&arena->zap_mutex); 802 mutex_lock(&arena->lock); 803 my_gen = ++arena->zap_gen; 804 for (;;) { 805 mm = NULL; 806 list_for_each_entry(vml, &arena->vma_list, head) { 807 if (vml->zap_gen >= my_gen) 808 continue; 809 vml->zap_gen = my_gen; 810 if (!mmget_not_zero(vml->vma->vm_mm)) 811 continue; 812 mm = vml->vma->vm_mm; 813 vm_start = vml->vma->vm_start; 814 break; 815 } 816 if (!mm) 817 break; 818 mutex_unlock(&arena->lock); 819 820 mmap_read_lock(mm); 821 /* 822 * Re-resolve: while we waited the VMA could have been unmapped 823 * and a different mapping installed at the same address. 824 */ 825 vma = find_vma(mm, vm_start); 826 if (vma && vma->vm_start == vm_start && 827 vma->vm_file && vma->vm_file->private_data == &arena->map) 828 zap_vma_range(vma, uaddr, size); 829 mmap_read_unlock(mm); 830 mmput(mm); 831 832 mutex_lock(&arena->lock); 833 } 834 mutex_unlock(&arena->lock); 835 mutex_unlock(&arena->zap_mutex); 836 } 837 838 static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) 839 { 840 struct mem_cgroup *new_memcg, *old_memcg; 841 u64 full_uaddr, uaddr_end; 842 long kaddr, pgoff; 843 struct page *page; 844 struct llist_head free_pages; 845 struct llist_node *pos, *t; 846 struct arena_free_span *s; 847 struct clear_range_data cdata; 848 unsigned long flags; 849 int ret = 0; 850 851 /* only aligned lower 32-bit are relevant */ 852 uaddr = (u32)uaddr; 853 uaddr &= PAGE_MASK; 854 kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; 855 full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; 856 uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); 857 if (full_uaddr >= uaddr_end) 858 return; 859 860 page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; 861 pgoff = compute_pgoff(arena, uaddr); 862 bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); 863 864 if (!sleepable) 865 goto defer; 866 867 ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags); 868 869 /* Can't proceed without holding the spinlock so defer the free */ 870 if (ret) 871 goto defer; 872 873 range_tree_set(&arena->rt, pgoff, page_cnt); 874 875 init_llist_head(&free_pages); 876 cdata.free_pages = &free_pages; 877 cdata.scratch_page = arena->scratch_page; 878 /* clear ptes and collect struct pages */ 879 apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, 880 apply_range_clear_cb, &cdata); 881 882 /* drop the lock to do the tlb flush and zap pages */ 883 raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); 884 885 /* ensure no stale TLB entries */ 886 flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); 887 888 if (page_cnt > 1) 889 /* bulk zap if multiple pages being freed */ 890 zap_pages(arena, full_uaddr, page_cnt); 891 892 llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { 893 page = llist_entry(pos, struct page, pcp_llist); 894 if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ 895 /* Optimization for the common case of page_cnt==1: 896 * If page wasn't mapped into some user vma there 897 * is no need to call zap_pages which is slow. When 898 * page_cnt is big it's faster to do the batched zap. 899 */ 900 zap_pages(arena, full_uaddr, 1); 901 __free_page(page); 902 } 903 bpf_map_memcg_exit(old_memcg, new_memcg); 904 905 return; 906 907 defer: 908 s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1); 909 bpf_map_memcg_exit(old_memcg, new_memcg); 910 if (!s) 911 /* 912 * If allocation fails in non-sleepable context, pages are intentionally left 913 * inaccessible (leaked) until the arena is destroyed. Cleanup or retries are not 914 * possible here, so we intentionally omit them for safety. 915 */ 916 return; 917 918 s->page_cnt = page_cnt; 919 s->uaddr = uaddr; 920 llist_add(&s->node, &arena->free_spans); 921 irq_work_queue(&arena->free_irq); 922 } 923 924 /* 925 * Reserve an arena virtual address range without populating it. This call stops 926 * bpf_arena_alloc_pages from adding pages to this range. 927 */ 928 static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) 929 { 930 long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; 931 struct mem_cgroup *new_memcg, *old_memcg; 932 unsigned long flags; 933 long pgoff; 934 int ret; 935 936 if (uaddr & ~PAGE_MASK) 937 return 0; 938 939 pgoff = compute_pgoff(arena, uaddr); 940 if (pgoff + page_cnt > page_cnt_max) 941 return -EINVAL; 942 943 if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) 944 return -EBUSY; 945 946 /* Cannot guard already allocated pages. */ 947 ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); 948 if (ret) { 949 ret = -EBUSY; 950 goto out; 951 } 952 953 /* "Allocate" the region to prevent it from being allocated. */ 954 bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); 955 ret = range_tree_clear(&arena->rt, pgoff, page_cnt); 956 bpf_map_memcg_exit(old_memcg, new_memcg); 957 out: 958 raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); 959 return ret; 960 } 961 962 static void arena_free_worker(struct work_struct *work) 963 { 964 struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work); 965 struct mem_cgroup *new_memcg, *old_memcg; 966 struct llist_node *list, *pos, *t; 967 struct arena_free_span *s; 968 u64 arena_vm_start, user_vm_start; 969 struct llist_head free_pages; 970 struct clear_range_data cdata; 971 struct page *page; 972 unsigned long full_uaddr; 973 long kaddr, page_cnt, pgoff; 974 unsigned long flags; 975 976 if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) { 977 schedule_work(work); 978 return; 979 } 980 981 bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); 982 983 init_llist_head(&free_pages); 984 cdata.free_pages = &free_pages; 985 cdata.scratch_page = arena->scratch_page; 986 arena_vm_start = bpf_arena_get_kern_vm_start(arena); 987 user_vm_start = bpf_arena_get_user_vm_start(arena); 988 989 list = llist_del_all(&arena->free_spans); 990 llist_for_each(pos, list) { 991 s = llist_entry(pos, struct arena_free_span, node); 992 page_cnt = s->page_cnt; 993 kaddr = arena_vm_start + s->uaddr; 994 pgoff = compute_pgoff(arena, s->uaddr); 995 996 /* clear ptes and collect pages in free_pages llist */ 997 apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, 998 apply_range_clear_cb, &cdata); 999 1000 range_tree_set(&arena->rt, pgoff, page_cnt); 1001 } 1002 raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); 1003 1004 /* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */ 1005 llist_for_each_safe(pos, t, list) { 1006 s = llist_entry(pos, struct arena_free_span, node); 1007 page_cnt = s->page_cnt; 1008 full_uaddr = clear_lo32(user_vm_start) + s->uaddr; 1009 kaddr = arena_vm_start + s->uaddr; 1010 1011 /* ensure no stale TLB entries */ 1012 flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); 1013 1014 /* remove pages from user vmas */ 1015 zap_pages(arena, full_uaddr, page_cnt); 1016 1017 kfree_nolock(s); 1018 } 1019 1020 /* free all pages collected by apply_to_existing_page_range() in the first loop */ 1021 llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { 1022 page = llist_entry(pos, struct page, pcp_llist); 1023 __free_page(page); 1024 } 1025 1026 bpf_map_memcg_exit(old_memcg, new_memcg); 1027 } 1028 1029 static void arena_free_irq(struct irq_work *iw) 1030 { 1031 struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq); 1032 1033 schedule_work(&arena->free_work); 1034 } 1035 1036 __bpf_kfunc_start_defs(); 1037 1038 __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt, 1039 int node_id, u64 flags) 1040 { 1041 struct bpf_map *map = p__map; 1042 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 1043 1044 if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) 1045 return NULL; 1046 1047 return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); 1048 } 1049 1050 void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, 1051 int node_id, u64 flags) 1052 { 1053 struct bpf_map *map = p__map; 1054 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 1055 1056 if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) 1057 return NULL; 1058 1059 return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); 1060 } 1061 1062 void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, 1063 int node_id, u64 flags) 1064 { 1065 struct bpf_map *map = p__map; 1066 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 1067 1068 if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) 1069 return NULL; 1070 1071 return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); 1072 } 1073 1074 __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) 1075 { 1076 struct bpf_map *map = p__map; 1077 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 1078 1079 if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) 1080 return; 1081 arena_free_pages(arena, (long)ptr__ign, page_cnt, true); 1082 } 1083 1084 void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) 1085 { 1086 struct bpf_map *map = p__map; 1087 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 1088 1089 if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) 1090 return; 1091 arena_free_pages(arena, (long)ptr__ign, page_cnt, false); 1092 } 1093 1094 __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt) 1095 { 1096 struct bpf_map *map = p__map; 1097 struct bpf_arena *arena = container_of(map, struct bpf_arena, map); 1098 1099 if (map->map_type != BPF_MAP_TYPE_ARENA) 1100 return -EINVAL; 1101 1102 if (!page_cnt) 1103 return 0; 1104 1105 return arena_reserve_pages(arena, (long)ptr__ign, page_cnt); 1106 } 1107 __bpf_kfunc_end_defs(); 1108 1109 BTF_KFUNCS_START(arena_kfuncs) 1110 BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2) 1111 BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2) 1112 BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2) 1113 BTF_KFUNCS_END(arena_kfuncs) 1114 1115 static const struct btf_kfunc_id_set common_kfunc_set = { 1116 .owner = THIS_MODULE, 1117 .set = &arena_kfuncs, 1118 }; 1119 1120 static int __init kfunc_init(void) 1121 { 1122 return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); 1123 } 1124 late_initcall(kfunc_init); 1125 1126 static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write, 1127 unsigned long addr, unsigned long fault_ip) 1128 { 1129 struct bpf_stream_stage ss; 1130 u64 user_vm_start; 1131 1132 /* Use main prog for stream access */ 1133 prog = prog->aux->main_prog_aux->prog; 1134 1135 user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); 1136 addr += clear_lo32(user_vm_start); 1137 1138 bpf_stream_stage(ss, prog, BPF_STDERR, ({ 1139 bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n", 1140 write ? "WRITE" : "READ", addr); 1141 bpf_stream_dump_stack(ss); 1142 })); 1143 } 1144 1145 bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip) 1146 { 1147 struct bpf_arena *arena; 1148 struct bpf_prog *prog; 1149 unsigned long kbase; 1150 unsigned long page_addr = addr & PAGE_MASK; 1151 1152 prog = bpf_prog_find_from_stack(); 1153 if (!prog) 1154 return false; 1155 1156 arena = prog->aux->arena; 1157 /* a prog not using arena may be on stack, so arena can be NULL */ 1158 if (!arena) 1159 return false; 1160 1161 kbase = bpf_arena_get_kern_vm_start(arena); 1162 1163 /* 1164 * Recovery covers the 4 GiB mappable band plus the upper half-guard. 1165 * Lower guard is unreachable from kfuncs; an address there indicates 1166 * a different bug class - leave it to the regular kernel oops path. 1167 */ 1168 if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2) 1169 return false; 1170 1171 apply_to_page_range(&init_mm, page_addr, PAGE_SIZE, 1172 apply_range_set_scratch_cb, arena->scratch_page); 1173 flush_vmap_cache(page_addr, PAGE_SIZE); 1174 __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip); 1175 return true; 1176 } 1177 1178 void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) 1179 { 1180 struct bpf_prog *prog; 1181 1182 /* 1183 * The RCU read lock is held to safely traverse the latch tree, but we 1184 * don't need its protection when accessing the prog, since it will not 1185 * disappear while we are handling the fault. 1186 */ 1187 rcu_read_lock(); 1188 prog = bpf_prog_ksym_find(fault_ip); 1189 rcu_read_unlock(); 1190 if (!prog) 1191 return; 1192 __bpf_prog_report_arena_violation(prog, write, addr, fault_ip); 1193 } 1194