// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/cacheflush.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include "linux/filter.h"
#include <linux/llist.h>
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <asm/tlbflush.h>
#include "range_tree.h"

/*
 * bpf_arena is a sparsely populated shared memory region between bpf program and
 * user space process.
 *
 * For example on x86-64 the values could be:
 * user_vm_start 7f7d26200000     // picked by mmap()
 * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
 * For user space all pointers within the arena are normal 8-byte addresses.
 * In this example 7f7d26200000 is the address of the first page (pgoff=0).
 * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
 * (u32)7f7d26200000 -> 26200000
 * hence
 * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
 * kernel memory region.
 *
 * BPF JITs generate the following code to access arena:
 *   mov eax, eax  // eax has lower 32-bit of user pointer
 *   mov word ptr [rax + r12 + off], bx
 * where r12 == kern_vm_start and off is s16.
 * Hence allocate 4Gb + GUARD_SZ/2 on each side.
 *
 * Initially kernel vm_area and user vma are not populated.
 * User space can fault-in any address which will insert the page
 * into kernel and user vma.
 * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
 * which will insert it into kernel vm_area.
 * The later fault-in from user space will populate that page into user vma.
 */

/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)

static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable);

struct bpf_arena {
	struct bpf_map map;
	u64 user_vm_start;
	u64 user_vm_end;
	struct vm_struct *kern_vm;
	struct range_tree rt;
	/* protects rt */
	rqspinlock_t spinlock;
	struct list_head vma_list;
	/* protects vma_list */
	struct mutex lock;
	struct irq_work free_irq;
	struct work_struct free_work;
	struct llist_head free_spans;
};

static void arena_free_worker(struct work_struct *work);
static void arena_free_irq(struct irq_work *iw);

struct arena_free_span {
	struct llist_node node;
	unsigned long uaddr;
	u32 page_cnt;
};

u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
	return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
}

u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
	return arena ? arena->user_vm_start : 0;
}
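
/*
 * bpf_arena has no per-element interface. The standard map callbacks below are
 * stubs; memory is managed via the bpf_arena_alloc_pages/free_pages kfuncs from
 * bpf programs and via mmap()/page faults from user space.
 */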
static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
	return -EOPNOTSUPP;
}

static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
	return -EOPNOTSUPP;
}

static long arena_map_pop_elem(struct bpf_map *map, void *value)
{
	return -EOPNOTSUPP;
}

static long arena_map_delete_elem(struct bpf_map *map, void *value)
{
	return -EOPNOTSUPP;
}

static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	return -EOPNOTSUPP;
}

/*
 * Return the page offset of uaddr within the arena. Only the lower 32 bits
 * matter: the arena never spans a 4Gb boundary, so the low 32 bits of an
 * address uniquely identify a page within it.
 */
static long compute_pgoff(struct bpf_arena *arena, long uaddr)
{
	return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}

struct apply_range_data {
	struct page **pages;
	int i;
};

static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
	struct apply_range_data *d = data;
	struct page *page;

	if (!data)
		return 0;
	/* sanity check */
	if (unlikely(!pte_none(ptep_get(pte))))
		return -EBUSY;

	page = d->pages[d->i];
	/* paranoia, similar to vmap_pages_pte_range() */
	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
		return -EINVAL;

	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
	d->i++;
	return 0;
}

static void flush_vmap_cache(unsigned long start, unsigned long size)
{
	flush_cache_vmap(start, start + size);
}

static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
{
	pte_t old_pte;
	struct page *page;

	/* sanity check */
	old_pte = ptep_get(pte);
	if (pte_none(old_pte) || !pte_present(old_pte))
		return 0; /* nothing to do */

	page = pte_page(old_pte);
	if (WARN_ON_ONCE(!page))
		return -EINVAL;

	pte_clear(&init_mm, addr, pte);

	/* Add page to the list so it is freed later */
	if (free_pages)
		__llist_add(&page->pcp_llist, free_pages);

	return 0;
}

static int populate_pgtable_except_pte(struct bpf_arena *arena)
{
	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
}

static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
	struct vm_struct *kern_vm;
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_arena *arena;
	u64 vm_range;
	int err = -ENOMEM;

	if (!bpf_jit_supports_arena())
		return ERR_PTR(-EOPNOTSUPP);

	if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
	    /* BPF_F_MMAPABLE must be set */
	    !(attr->map_flags & BPF_F_MMAPABLE) ||
	    /* No unsupported flags present */
	    (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
		return ERR_PTR(-EINVAL);

	if (attr->map_extra & ~PAGE_MASK)
		/* If non-zero the map_extra is an expected user VMA start address */
		return ERR_PTR(-EINVAL);

	vm_range = (u64)attr->max_entries * PAGE_SIZE;
	if (vm_range > SZ_4G)
		return ERR_PTR(-E2BIG);

	if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
		/* user vma must not cross 32-bit boundary */
		return ERR_PTR(-ERANGE);

	kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
	if (!kern_vm)
		return ERR_PTR(-ENOMEM);

	arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
	if (!arena)
		goto err;

	arena->kern_vm = kern_vm;
	arena->user_vm_start = attr->map_extra;
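	/*
	 * A non-zero map_extra pins the user VMA placement. It was validated
	 * above to be page aligned and to keep the whole vm_range within a
	 * single 32-bit region.
	 */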
	if (arena->user_vm_start)
		arena->user_vm_end = arena->user_vm_start + vm_range;

	INIT_LIST_HEAD(&arena->vma_list);
	init_llist_head(&arena->free_spans);
	init_irq_work(&arena->free_irq, arena_free_irq);
	INIT_WORK(&arena->free_work, arena_free_worker);
	bpf_map_init_from_attr(&arena->map, attr);
	range_tree_init(&arena->rt);
	err = range_tree_set(&arena->rt, 0, attr->max_entries);
	if (err) {
		bpf_map_area_free(arena);
		goto err;
	}
	mutex_init(&arena->lock);
	raw_res_spin_lock_init(&arena->spinlock);
	err = populate_pgtable_except_pte(arena);
	if (err) {
		range_tree_destroy(&arena->rt);
		bpf_map_area_free(arena);
		goto err;
	}

	return &arena->map;
err:
	free_vm_area(kern_vm);
	return ERR_PTR(err);
}

static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
	struct page *page;
	pte_t pte;

	pte = ptep_get(ptep);
	if (!pte_present(pte)) /* sanity check */
		return 0;
	page = pte_page(pte);
	/*
	 * We do not update pte here:
	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
	 * 2. TLB flushing is batched or deferred. Even if we clear pte,
	 *    the TLB entries can stick around and continue to permit access to
	 *    the freed page. So it all relies on 1.
	 */
	__free_page(page);
	return 0;
}

static void arena_map_free(struct bpf_map *map)
{
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	/*
	 * Check that user vma-s are not around when bpf map is freed.
	 * mmap() holds vm_file which holds bpf_map refcnt.
	 * munmap() must have happened on vma followed by arena_vm_close()
	 * which would clear arena->vma_list.
	 */
	if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
		return;

	/* Ensure no pending deferred frees */
	irq_work_sync(&arena->free_irq);
	flush_work(&arena->free_work);

	/*
	 * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
	 * It unmaps everything from vmalloc area and clears pgtables.
	 * Call apply_to_existing_page_range() first to find populated ptes and
	 * free those pages.
	 */
	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
	free_vm_area(arena->kern_vm);
	range_tree_destroy(&arena->rt);
	bpf_map_area_free(arena);
}

static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}

static long arena_map_update_elem(struct bpf_map *map, void *key,
				  void *value, u64 flags)
{
	return -EOPNOTSUPP;
}

static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf,
			       const struct btf_type *key_type, const struct btf_type *value_type)
{
	return 0;
}

static u64 arena_map_mem_usage(const struct bpf_map *map)
{
	return 0;
}

struct vma_list {
	struct vm_area_struct *vma;
	struct list_head head;
	refcount_t mmap_count;
};

static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
{
	struct vma_list *vml;

	vml = kmalloc_obj(*vml);
	if (!vml)
		return -ENOMEM;
	refcount_set(&vml->mmap_count, 1);
	vma->vm_private_data = vml;
	vml->vma = vma;
	list_add(&vml->head, &arena->vma_list);
	return 0;
}

static void arena_vm_open(struct vm_area_struct *vma)
{
	struct vma_list *vml = vma->vm_private_data;

	refcount_inc(&vml->mmap_count);
}

static int arena_vm_may_split(struct vm_area_struct *vma, unsigned long addr)
{
	return -EINVAL;
}

static int arena_vm_mremap(struct vm_area_struct *vma)
{
	return -EINVAL;
}

static void arena_vm_close(struct vm_area_struct *vma)
{
	struct bpf_map *map = vma->vm_file->private_data;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
	struct vma_list *vml = vma->vm_private_data;

	if (!refcount_dec_and_test(&vml->mmap_count))
		return;
	guard(mutex)(&arena->lock);
	/* update link list under lock */
	list_del(&vml->head);
	vma->vm_private_data = NULL;
	kfree(vml);
}

static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
	struct bpf_map *map = vmf->vma->vm_file->private_data;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
	struct mem_cgroup *new_memcg, *old_memcg;
	struct page *page;
	long kbase, kaddr;
	unsigned long flags;
	int ret;

	kbase = bpf_arena_get_kern_vm_start(arena);
	kaddr = kbase + (u32)(vmf->address);

	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
		/* Make a reasonable effort to address impossible case */
		return VM_FAULT_RETRY;

	page = vmalloc_to_page((void *)kaddr);
	if (page)
		/* already have a page vmap-ed */
		goto out;

	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);

	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
		/* User space requested to segfault when page is not allocated by bpf prog */
		goto out_unlock_sigsegv;

	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
	if (ret)
		goto out_unlock_sigsegv;

	struct apply_range_data data = { .pages = &page, .i = 0 };
	/* Account into memcg of the process that created bpf_arena */
	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
	if (ret) {
		range_tree_set(&arena->rt, vmf->pgoff, 1);
		goto out_unlock_sigsegv;
	}

	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
	if (ret) {
		range_tree_set(&arena->rt, vmf->pgoff, 1);
		free_pages_nolock(page, 0);
		goto out_unlock_sigsegv;
	}
	flush_vmap_cache(kaddr, PAGE_SIZE);
	bpf_map_memcg_exit(old_memcg, new_memcg);
out:
	page_ref_add(page, 1);
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	vmf->page = page;
	return 0;
out_unlock_sigsegv:
	bpf_map_memcg_exit(old_memcg, new_memcg);
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	return VM_FAULT_SIGSEGV;
}

static const struct vm_operations_struct arena_vm_ops = {
	.open = arena_vm_open,
	.may_split = arena_vm_may_split,
	.mremap = arena_vm_mremap,
	.close = arena_vm_close,
	.fault = arena_vm_fault,
};
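
/*
 * Pick a user address range for the arena. mm_get_unmapped_area() is asked for
 * twice the requested length so that, if the found range straddles a 32-bit
 * boundary, the start can be rounded up to the next 4Gb boundary while still
 * staying inside the gap that was found.
 */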
static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
					     unsigned long len, unsigned long pgoff,
					     unsigned long flags)
{
	struct bpf_map *map = filp->private_data;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
	long ret;

	if (pgoff)
		return -EINVAL;
	if (len > SZ_4G)
		return -E2BIG;

	/* if user_vm_start was specified at arena creation time */
	if (arena->user_vm_start) {
		if (len > arena->user_vm_end - arena->user_vm_start)
			return -E2BIG;
		if (len != arena->user_vm_end - arena->user_vm_start)
			return -EINVAL;
		if (addr != arena->user_vm_start)
			return -EINVAL;
	}

	ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
	if (IS_ERR_VALUE(ret))
		return ret;
	if ((ret >> 32) == ((ret + len - 1) >> 32))
		return ret;
	if (WARN_ON_ONCE(arena->user_vm_start))
		/* checks at map creation time should prevent this */
		return -EFAULT;
	return round_up(ret, SZ_4G);
}

static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	guard(mutex)(&arena->lock);
	if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
		/*
		 * If map_extra was not specified at arena creation time then
		 * 1st user process can do mmap(NULL, ...) to pick user_vm_start
		 * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
		 *   or
		 * specify addr in map_extra and
		 * use the same addr later with mmap(addr, MAP_FIXED..);
		 */
		return -EBUSY;

	if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
		/* all user processes must have the same size of mmap-ed region */
		return -EBUSY;

	/* Earlier checks should prevent this */
	if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
		return -EFAULT;

	if (remember_vma(arena, vma))
		return -ENOMEM;

	arena->user_vm_start = vma->vm_start;
	arena->user_vm_end = vma->vm_end;
	/*
	 * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
	 * clears VM_MAYEXEC. Set VM_DONTEXPAND to avoid potential change
	 * of user_vm_start. Set VM_DONTCOPY to prevent arena VMA from
	 * being copied into the child process on fork.
	 */
	vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY);
	vma->vm_ops = &arena_vm_ops;
	return 0;
}

static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
{
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if ((u64)off > arena->user_vm_end - arena->user_vm_start)
		return -ERANGE;
	*imm = (unsigned long)arena->user_vm_start;
	return 0;
}

BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
const struct bpf_map_ops arena_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = arena_map_alloc,
	.map_free = arena_map_free,
	.map_direct_value_addr = arena_map_direct_value_addr,
	.map_mmap = arena_map_mmap,
	.map_get_unmapped_area = arena_get_unmapped_area,
	.map_get_next_key = arena_map_get_next_key,
	.map_push_elem = arena_map_push_elem,
	.map_peek_elem = arena_map_peek_elem,
	.map_pop_elem = arena_map_pop_elem,
	.map_lookup_elem = arena_map_lookup_elem,
	.map_update_elem = arena_map_update_elem,
	.map_delete_elem = arena_map_delete_elem,
	.map_check_btf = arena_map_check_btf,
	.map_mem_usage = arena_map_mem_usage,
	.map_btf_id = &bpf_arena_map_btf_ids[0],
};
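
/*
 * Full user space pointers are reconstructed as
 * clear_lo32(user_vm_start) + lower 32 bits of the arena offset, matching the
 * addressing scheme described at the top of this file.
 */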
static u64 clear_lo32(u64 val)
{
	return val & ~(u64)~0U;
}

/*
 * Allocate pages and vmap them into kernel vmalloc area.
 * Later the pages will be mmaped into user space vma.
 */
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id,
			      bool sleepable)
{
	/* user_vm_end/start are fixed before bpf prog runs */
	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
	struct mem_cgroup *new_memcg, *old_memcg;
	struct apply_range_data data;
	struct page **pages = NULL;
	long remaining, mapped = 0;
	long alloc_pages;
	unsigned long flags;
	long pgoff = 0;
	u32 uaddr32;
	int ret, i;

	if (node_id != NUMA_NO_NODE &&
	    ((unsigned int)node_id >= nr_node_ids || !node_online(node_id)))
		return 0;

	if (page_cnt > page_cnt_max)
		return 0;

	if (uaddr) {
		if (uaddr & ~PAGE_MASK)
			return 0;
		pgoff = compute_pgoff(arena, uaddr);
		if (pgoff > page_cnt_max - page_cnt)
			/* requested address will be outside of user VMA */
			return 0;
	}
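
	/* Page and metadata allocations below are charged to the arena creator's memcg */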
	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
	/* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed */
	alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
	pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
	if (!pages) {
		bpf_map_memcg_exit(old_memcg, new_memcg);
		return 0;
	}
	data.pages = pages;

	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
		goto out_free_pages;

	if (uaddr) {
		ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
		if (ret)
			goto out_unlock_free_pages;
		ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
	} else {
		ret = pgoff = range_tree_find(&arena->rt, page_cnt);
		if (pgoff >= 0)
			ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
	}
	if (ret)
		goto out_unlock_free_pages;

	remaining = page_cnt;
	uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);

	while (remaining) {
		long this_batch = min(remaining, alloc_pages);

		/* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
		memset(pages, 0, this_batch * sizeof(struct page *));

		ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages);
		if (ret)
			goto out;

		/*
		 * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
		 * will not overflow 32-bit. Lower 32-bit need to represent
		 * contiguous user address range.
		 * Map these pages at kern_vm_start base.
		 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
		 * lower 32-bit and it's ok.
		 */
		data.i = 0;
		ret = apply_to_page_range(&init_mm,
					  kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT),
					  this_batch << PAGE_SHIFT, apply_range_set_cb, &data);
		if (ret) {
			/* data.i pages were mapped, account them and free the remaining */
			mapped += data.i;
			for (i = data.i; i < this_batch; i++)
				free_pages_nolock(pages[i], 0);
			goto out;
		}

		mapped += this_batch;
		remaining -= this_batch;
	}
	flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	kfree_nolock(pages);
	bpf_map_memcg_exit(old_memcg, new_memcg);
	return clear_lo32(arena->user_vm_start) + uaddr32;
out:
	range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	if (mapped) {
		flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
		arena_free_pages(arena, uaddr32, mapped, sleepable);
	}
	goto out_free_pages;
out_unlock_free_pages:
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
out_free_pages:
	kfree_nolock(pages);
	bpf_map_memcg_exit(old_memcg, new_memcg);
	return 0;
}

/* Unmap the range [uaddr, uaddr + page_cnt * PAGE_SIZE) from all user space vma-s */
static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
	struct vma_list *vml;

	guard(mutex)(&arena->lock);
	/* iterate link list under lock */
	list_for_each_entry(vml, &arena->vma_list, head)
		zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt);
}
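
/*
 * Free pages in the [uaddr, uaddr + page_cnt * PAGE_SIZE) range:
 * if a page is present in the vmalloc area, unmap it from the vmalloc area,
 * unmap it from all user space vma-s, and free it.
 * In sleepable context this is done inline: ptes are cleared and pages are
 * collected under the spinlock, then the TLB flush and user vma zap happen
 * after the lock is dropped. In non-sleepable context (or if the lock cannot
 * be taken) the span is queued on free_spans and handled by
 * arena_free_worker() via irq_work.
 */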
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
{
	struct mem_cgroup *new_memcg, *old_memcg;
	u64 full_uaddr, uaddr_end;
	long kaddr, pgoff;
	struct page *page;
	struct llist_head free_pages;
	struct llist_node *pos, *t;
	struct arena_free_span *s;
	unsigned long flags;
	int ret = 0;

	/* only aligned lower 32-bit are relevant */
	uaddr = (u32)uaddr;
	uaddr &= PAGE_MASK;
	kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
	full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
	uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
	if (full_uaddr >= uaddr_end)
		return;

	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
	pgoff = compute_pgoff(arena, uaddr);
	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);

	if (!sleepable)
		goto defer;

	ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags);
	/* Can't proceed without holding the spinlock so defer the free */
	if (ret)
		goto defer;

	range_tree_set(&arena->rt, pgoff, page_cnt);

	init_llist_head(&free_pages);
	/* clear ptes and collect struct pages */
	apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
				     apply_range_clear_cb, &free_pages);

	/* drop the lock to do the tlb flush and zap pages */
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);

	/* ensure no stale TLB entries */
	flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));

	if (page_cnt > 1)
		/* bulk zap if multiple pages being freed */
		zap_pages(arena, full_uaddr, page_cnt);

	llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
		page = llist_entry(pos, struct page, pcp_llist);
		/*
		 * Optimization for the common case of page_cnt==1:
		 * If page wasn't mapped into some user vma there
		 * is no need to call zap_pages which is slow. When
		 * page_cnt is big it's faster to do the batched zap.
		 */
		if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
			zap_pages(arena, full_uaddr, 1);
		__free_page(page);
	}
	bpf_map_memcg_exit(old_memcg, new_memcg);

	return;

defer:
	s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1);
	bpf_map_memcg_exit(old_memcg, new_memcg);
	if (!s)
		/*
		 * If the allocation fails, the free request is dropped and the
		 * pages are leaked until the arena is destroyed. Cleanup or
		 * retries are not possible in this context.
		 */
		return;

	s->page_cnt = page_cnt;
	s->uaddr = uaddr;
	llist_add(&s->node, &arena->free_spans);
	irq_work_queue(&arena->free_irq);
}

/*
 * Reserve an arena virtual address range without populating it. This call stops
 * bpf_arena_alloc_pages from adding pages to this range.
 */
static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
{
	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
	struct mem_cgroup *new_memcg, *old_memcg;
	unsigned long flags;
	long pgoff;
	int ret;

	if (uaddr & ~PAGE_MASK)
		return 0;

	pgoff = compute_pgoff(arena, uaddr);
	if (pgoff + page_cnt > page_cnt_max)
		return -EINVAL;

	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
		return -EBUSY;

	/* Cannot guard already allocated pages. */
	ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
	if (ret) {
		ret = -EBUSY;
		goto out;
	}

	/* "Allocate" the region to prevent it from being allocated. */
	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
	ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
	bpf_map_memcg_exit(old_memcg, new_memcg);
out:
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	return ret;
}

static void arena_free_worker(struct work_struct *work)
{
	struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work);
	struct mem_cgroup *new_memcg, *old_memcg;
	struct llist_node *list, *pos, *t;
	struct arena_free_span *s;
	u64 arena_vm_start, user_vm_start;
	struct llist_head free_pages;
	struct page *page;
	unsigned long full_uaddr;
	long kaddr, page_cnt, pgoff;
	unsigned long flags;

	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) {
		schedule_work(work);
		return;
	}

	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);

	init_llist_head(&free_pages);
	arena_vm_start = bpf_arena_get_kern_vm_start(arena);
	user_vm_start = bpf_arena_get_user_vm_start(arena);

	list = llist_del_all(&arena->free_spans);
	llist_for_each(pos, list) {
		s = llist_entry(pos, struct arena_free_span, node);
		page_cnt = s->page_cnt;
		kaddr = arena_vm_start + s->uaddr;
		pgoff = compute_pgoff(arena, s->uaddr);

		/* clear ptes and collect pages in free_pages llist */
		apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
					     apply_range_clear_cb, &free_pages);

		range_tree_set(&arena->rt, pgoff, page_cnt);
	}
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);

	/* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */
	llist_for_each_safe(pos, t, list) {
		s = llist_entry(pos, struct arena_free_span, node);
		page_cnt = s->page_cnt;
		full_uaddr = clear_lo32(user_vm_start) + s->uaddr;
		kaddr = arena_vm_start + s->uaddr;

		/* ensure no stale TLB entries */
		flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));

		/* remove pages from user vmas */
		zap_pages(arena, full_uaddr, page_cnt);

		kfree_nolock(s);
	}

	/* free all pages collected by apply_to_existing_page_range() in the first loop */
	llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
		page = llist_entry(pos, struct page, pcp_llist);
		__free_page(page);
	}

	bpf_map_memcg_exit(old_memcg, new_memcg);
}

static void arena_free_irq(struct irq_work *iw)
{
	struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq);

	schedule_work(&arena->free_work);
}
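
/*
 * The kfuncs below are callable from bpf programs. A rough usage sketch from a
 * bpf program's point of view (the map name "arena" is illustrative only):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_ARENA);
 *		__uint(map_flags, BPF_F_MMAPABLE);
 *		__uint(max_entries, 100);	// number of pages
 *	} arena SEC(".maps");
 *
 *	void *page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
 *	if (page)
 *		bpf_arena_free_pages(&arena, page, 1);
 */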
__bpf_kfunc_start_defs();

__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
					int node_id, u64 flags)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
		return NULL;

	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
}

void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
					  int node_id, u64 flags)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
		return NULL;

	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
}

__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
		return;
	arena_free_pages(arena, (long)ptr__ign, page_cnt, true);
}

void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
		return;
	arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
}

__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA)
		return -EINVAL;

	if (!page_cnt)
		return 0;

	return arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)

static const struct btf_kfunc_id_set common_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &arena_kfuncs,
};

static int __init kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);

void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
{
	struct bpf_stream_stage ss;
	struct bpf_prog *prog;
	u64 user_vm_start;

	/*
	 * The RCU read lock is held to safely traverse the latch tree, but we
	 * don't need its protection when accessing the prog, since it will not
	 * disappear while we are handling the fault.
	 */
	rcu_read_lock();
	prog = bpf_prog_ksym_find(fault_ip);
	rcu_read_unlock();
	if (!prog)
		return;

	/* Use main prog for stream access */
	prog = prog->aux->main_prog_aux->prog;

	user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
	addr += clear_lo32(user_vm_start);

	bpf_stream_stage(ss, prog, BPF_STDERR, ({
		bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n",
				  write ? "WRITE" : "READ", addr);
		bpf_stream_dump_stack(ss);
	}));
}