// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/cacheflush.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include "linux/filter.h"
#include <linux/llist.h>
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <asm/tlbflush.h>
#include "range_tree.h"

/*
 * bpf_arena is a sparsely populated shared memory region between bpf program and
 * user space process.
 *
 * For example on x86-64 the values could be:
 * user_vm_start 7f7d26200000     // picked by mmap()
 * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
 * For user space all pointers within the arena are normal 8-byte addresses.
 * In this example 7f7d26200000 is the address of the first page (pgoff=0).
 * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
 * (u32)7f7d26200000 -> 26200000
 * hence
 * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
 * kernel memory region.
 *
 * BPF JITs generate the following code to access arena:
 *   mov eax, eax  // eax has lower 32-bit of user pointer
 *   mov word ptr [rax + r12 + off], bx
 * where r12 == kern_vm_start and off is s16.
 * Hence allocate 4Gb + GUARD_SZ/2 on each side.
 *
 * Initially kernel vm_area and user vma are not populated.
 * User space can fault-in any address which will insert the page
 * into kernel and user vma.
 * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
 * which will insert it into kernel vm_area.
 * The later fault-in from user space will populate that page into user vma.
 */

/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
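
/*
 * Illustrative sketch (not part of the implementation): the address folding
 * described above, written out as plain C, using the example values from the
 * comment. 'arena_user_to_kern' is a hypothetical name used only here.
 *
 *	static inline u64 arena_user_to_kern(u64 kern_vm_start, u64 user_ptr)
 *	{
 *		return kern_vm_start + (u32)user_ptr;	// keep lower 32 bits only
 *	}
 *
 *	arena_user_to_kern(0xffffc90001e69000, 0x7f7d26200000) == 0xffffc90028069000
 *
 * GUARD_SZ/2 of unmapped space on each side of the 4Gb region absorbs the
 * worst case s16 'off' (-32768..32767) that the JIT adds to the folded address.
 */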

static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable);

struct bpf_arena {
	struct bpf_map map;
	u64 user_vm_start;
	u64 user_vm_end;
	struct vm_struct *kern_vm;
	struct range_tree rt;
	/* protects rt */
	rqspinlock_t spinlock;
	struct list_head vma_list;
	/* protects vma_list */
	struct mutex lock;
	struct irq_work free_irq;
	struct work_struct free_work;
	struct llist_head free_spans;
};

static void arena_free_worker(struct work_struct *work);
static void arena_free_irq(struct irq_work *iw);

struct arena_free_span {
	struct llist_node node;
	unsigned long uaddr;
	u32 page_cnt;
};

u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
	return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
}

u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
	return arena ? arena->user_vm_start : 0;
}

static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
	return -EOPNOTSUPP;
}

static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
	return -EOPNOTSUPP;
}

static long arena_map_pop_elem(struct bpf_map *map, void *value)
{
	return -EOPNOTSUPP;
}

static long arena_map_delete_elem(struct bpf_map *map, void *value)
{
	return -EOPNOTSUPP;
}

static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	return -EOPNOTSUPP;
}

static long compute_pgoff(struct bpf_arena *arena, long uaddr)
{
	return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}

struct apply_range_data {
	struct page **pages;
	int i;
};

static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
	struct apply_range_data *d = data;
	struct page *page;

	if (!data)
		return 0;
	/* sanity check */
	if (unlikely(!pte_none(ptep_get(pte))))
		return -EBUSY;

	page = d->pages[d->i];
	/* paranoia, similar to vmap_pages_pte_range() */
	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
		return -EINVAL;

	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
	d->i++;
	return 0;
}

static void flush_vmap_cache(unsigned long start, unsigned long size)
{
	flush_cache_vmap(start, start + size);
}

static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
{
	pte_t old_pte;
	struct page *page;

	/* sanity check */
	old_pte = ptep_get(pte);
	if (pte_none(old_pte) || !pte_present(old_pte))
		return 0; /* nothing to do */

	page = pte_page(old_pte);
	if (WARN_ON_ONCE(!page))
		return -EINVAL;

	pte_clear(&init_mm, addr, pte);

	/* Add page to the list so it is freed later */
	if (free_pages)
		__llist_add(&page->pcp_llist, free_pages);

	return 0;
}

static int populate_pgtable_except_pte(struct bpf_arena *arena)
{
	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
}

static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
	struct vm_struct *kern_vm;
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_arena *arena;
	u64 vm_range;
	int err = -ENOMEM;

	if (!bpf_jit_supports_arena())
		return ERR_PTR(-EOPNOTSUPP);

	if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
	    /* BPF_F_MMAPABLE must be set */
	    !(attr->map_flags & BPF_F_MMAPABLE) ||
	    /* No unsupported flags present */
	    (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
		return ERR_PTR(-EINVAL);

	if (attr->map_extra & ~PAGE_MASK)
		/* If non-zero the map_extra is an expected user VMA start address */
		return ERR_PTR(-EINVAL);

	vm_range = (u64)attr->max_entries * PAGE_SIZE;
	if (vm_range > SZ_4G)
		return ERR_PTR(-E2BIG);

	if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
		/* user vma must not cross 32-bit boundary */
		return ERR_PTR(-ERANGE);

	kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
	if (!kern_vm)
		return ERR_PTR(-ENOMEM);

	arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
	if (!arena)
		goto err;

	arena->kern_vm = kern_vm;
	arena->user_vm_start = attr->map_extra;
	if (arena->user_vm_start)
		arena->user_vm_end = arena->user_vm_start + vm_range;

	INIT_LIST_HEAD(&arena->vma_list);
	init_llist_head(&arena->free_spans);
	init_irq_work(&arena->free_irq, arena_free_irq);
	INIT_WORK(&arena->free_work, arena_free_worker);
	bpf_map_init_from_attr(&arena->map, attr);
	range_tree_init(&arena->rt);
	err = range_tree_set(&arena->rt, 0, attr->max_entries);
	if (err) {
		bpf_map_area_free(arena);
		goto err;
	}
	mutex_init(&arena->lock);
	raw_res_spin_lock_init(&arena->spinlock);
	err = populate_pgtable_except_pte(arena);
	if (err) {
		range_tree_destroy(&arena->rt);
		bpf_map_area_free(arena);
		goto err;
	}

	return &arena->map;
err:
	free_vm_area(kern_vm);
	return ERR_PTR(err);
}
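
/*
 * Illustrative user space sketch (assumes libbpf; not part of this file):
 * creating an arena map roughly as arena_map_alloc() above expects it.
 * max_entries is the arena size in pages; map_extra, when non-zero, is the
 * fixed user VMA start address and must be page aligned and must not make the
 * arena cross a 32-bit boundary.
 *
 *	LIBBPF_OPTS(bpf_map_create_opts, opts,
 *		    .map_flags = BPF_F_MMAPABLE,
 *		    .map_extra = 0);	// or a fixed, page aligned address
 *	int map_fd = bpf_map_create(BPF_MAP_TYPE_ARENA, "arena",
 *				    0, 0,	// key_size and value_size must be 0
 *				    100,	// arena size in pages (max_entries)
 *				    &opts);
 */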

static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
	struct page *page;
	pte_t pte;

	pte = ptep_get(ptep);
	if (!pte_present(pte)) /* sanity check */
		return 0;
	page = pte_page(pte);
	/*
	 * We do not update pte here:
	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
	 * 2. TLB flushing is batched or deferred. Even if we clear pte,
	 *    the TLB entries can stick around and continue to permit access to
	 *    the freed page. So it all relies on 1.
	 */
	__free_page(page);
	return 0;
}

static void arena_map_free(struct bpf_map *map)
{
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	/*
	 * Check that user vma-s are not around when bpf map is freed.
	 * mmap() holds vm_file which holds bpf_map refcnt.
	 * munmap() must have happened on vma followed by arena_vm_close()
	 * which would clear arena->vma_list.
	 */
	if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
		return;

	/* Ensure no pending deferred frees */
	irq_work_sync(&arena->free_irq);
	flush_work(&arena->free_work);

	/*
	 * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
	 * It unmaps everything from vmalloc area and clears pgtables.
	 * Call apply_to_existing_page_range() first to find populated ptes and
	 * free those pages.
	 */
	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
	free_vm_area(arena->kern_vm);
	range_tree_destroy(&arena->rt);
	bpf_map_area_free(arena);
}

static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}

static long arena_map_update_elem(struct bpf_map *map, void *key,
				  void *value, u64 flags)
{
	return -EOPNOTSUPP;
}

static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
			       const struct btf_type *key_type, const struct btf_type *value_type)
{
	return 0;
}

static u64 arena_map_mem_usage(const struct bpf_map *map)
{
	return 0;
}

struct vma_list {
	struct vm_area_struct *vma;
	struct list_head head;
	refcount_t mmap_count;
};

static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
{
	struct vma_list *vml;

	vml = kmalloc(sizeof(*vml), GFP_KERNEL);
	if (!vml)
		return -ENOMEM;
	refcount_set(&vml->mmap_count, 1);
	vma->vm_private_data = vml;
	vml->vma = vma;
	list_add(&vml->head, &arena->vma_list);
	return 0;
}
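
/*
 * Each mmap() of the arena is tracked via a vma_list entry so that freed pages
 * can later be zapped from every user mapping. mmap_count is bumped in
 * arena_vm_open() when the vma is duplicated or split (e.g. fork or partial
 * munmap) and the entry is removed from arena->vma_list only when the last
 * reference goes away in arena_vm_close().
 */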

static void arena_vm_open(struct vm_area_struct *vma)
{
	struct vma_list *vml = vma->vm_private_data;

	refcount_inc(&vml->mmap_count);
}

static void arena_vm_close(struct vm_area_struct *vma)
{
	struct bpf_map *map = vma->vm_file->private_data;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
	struct vma_list *vml = vma->vm_private_data;

	if (!refcount_dec_and_test(&vml->mmap_count))
		return;
	guard(mutex)(&arena->lock);
	/* update link list under lock */
	list_del(&vml->head);
	vma->vm_private_data = NULL;
	kfree(vml);
}

static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
	struct bpf_map *map = vmf->vma->vm_file->private_data;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
	struct mem_cgroup *new_memcg, *old_memcg;
	struct page *page;
	long kbase, kaddr;
	unsigned long flags;
	int ret;

	kbase = bpf_arena_get_kern_vm_start(arena);
	kaddr = kbase + (u32)(vmf->address);

	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
		/* Make a reasonable effort to address impossible case */
		return VM_FAULT_RETRY;

	page = vmalloc_to_page((void *)kaddr);
	if (page)
		/* already have a page vmap-ed */
		goto out;

	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);

	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
		/* User space requested to segfault when page is not allocated by bpf prog */
		goto out_unlock_sigsegv;

	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
	if (ret)
		goto out_unlock_sigsegv;

	struct apply_range_data data = { .pages = &page, .i = 0 };
	/* Account into memcg of the process that created bpf_arena */
	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
	if (ret) {
		range_tree_set(&arena->rt, vmf->pgoff, 1);
		goto out_unlock_sigsegv;
	}

	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
	if (ret) {
		range_tree_set(&arena->rt, vmf->pgoff, 1);
		free_pages_nolock(page, 0);
		goto out_unlock_sigsegv;
	}
	flush_vmap_cache(kaddr, PAGE_SIZE);
	bpf_map_memcg_exit(old_memcg, new_memcg);
out:
	page_ref_add(page, 1);
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	vmf->page = page;
	return 0;
out_unlock_sigsegv:
	bpf_map_memcg_exit(old_memcg, new_memcg);
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	return VM_FAULT_SIGSEGV;
}

static const struct vm_operations_struct arena_vm_ops = {
	.open = arena_vm_open,
	.close = arena_vm_close,
	.fault = arena_vm_fault,
};
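
/*
 * Fault handling summary: arena_vm_fault() first checks whether the bpf
 * program already vmap-ed a page at this offset and reuses it if so.
 * Otherwise, unless the map was created with BPF_F_SEGV_ON_FAULT, it allocates
 * one page charged to the memcg of the arena's creator, installs it in the
 * kernel vm_area and returns it so the core mm inserts it into the faulting
 * user vma.
 */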

static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
					     unsigned long len, unsigned long pgoff,
					     unsigned long flags)
{
	struct bpf_map *map = filp->private_data;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
	long ret;

	if (pgoff)
		return -EINVAL;
	if (len > SZ_4G)
		return -E2BIG;

	/* if user_vm_start was specified at arena creation time */
	if (arena->user_vm_start) {
		if (len > arena->user_vm_end - arena->user_vm_start)
			return -E2BIG;
		if (len != arena->user_vm_end - arena->user_vm_start)
			return -EINVAL;
		if (addr != arena->user_vm_start)
			return -EINVAL;
	}

	ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
	if (IS_ERR_VALUE(ret))
		return ret;
	if ((ret >> 32) == ((ret + len - 1) >> 32))
		return ret;
	if (WARN_ON_ONCE(arena->user_vm_start))
		/* checks at map creation time should prevent this */
		return -EFAULT;
	return round_up(ret, SZ_4G);
}

static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	guard(mutex)(&arena->lock);
	if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
		/*
		 * If map_extra was not specified at arena creation time then
		 * 1st user process can do mmap(NULL, ...) to pick user_vm_start
		 * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
		 *   or
		 * specify addr in map_extra and
		 * use the same addr later with mmap(addr, MAP_FIXED..);
		 */
		return -EBUSY;

	if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
		/* all user processes must have the same size of mmap-ed region */
		return -EBUSY;

	/* Earlier checks should prevent this */
	if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
		return -EFAULT;

	if (remember_vma(arena, vma))
		return -ENOMEM;

	arena->user_vm_start = vma->vm_start;
	arena->user_vm_end = vma->vm_end;
	/*
	 * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
	 * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
	 * potential change of user_vm_start.
	 */
	vm_flags_set(vma, VM_DONTEXPAND);
	vma->vm_ops = &arena_vm_ops;
	return 0;
}

static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
{
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if ((u64)off > arena->user_vm_end - arena->user_vm_start)
		return -ERANGE;
	*imm = (unsigned long)arena->user_vm_start;
	return 0;
}
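
/*
 * Illustrative user space sketch (not part of this file): mapping the arena.
 * All processes must map the same size, and when map_extra was set at creation
 * time the same address must be passed with MAP_FIXED. 'pages' and 'map_fd'
 * are placeholders of this sketch.
 *
 *	size_t len = pages * sysconf(_SC_PAGESIZE);
 *	char *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *			  map_fd, 0);
 *	base[0] = 1;	// faults a page in via arena_vm_fault()
 */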

BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
const struct bpf_map_ops arena_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = arena_map_alloc,
	.map_free = arena_map_free,
	.map_direct_value_addr = arena_map_direct_value_addr,
	.map_mmap = arena_map_mmap,
	.map_get_unmapped_area = arena_get_unmapped_area,
	.map_get_next_key = arena_map_get_next_key,
	.map_push_elem = arena_map_push_elem,
	.map_peek_elem = arena_map_peek_elem,
	.map_pop_elem = arena_map_pop_elem,
	.map_lookup_elem = arena_map_lookup_elem,
	.map_update_elem = arena_map_update_elem,
	.map_delete_elem = arena_map_delete_elem,
	.map_check_btf = arena_map_check_btf,
	.map_mem_usage = arena_map_mem_usage,
	.map_btf_id = &bpf_arena_map_btf_ids[0],
};

static u64 clear_lo32(u64 val)
{
	return val & ~(u64)~0U;
}
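
/* e.g. clear_lo32(0x7f7d26201000) == 0x7f7d00000000: keep only the upper 32 bits */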

/*
 * Allocate pages and vmap them into kernel vmalloc area.
 * Later the pages will be mmaped into user space vma.
 */
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id,
			      bool sleepable)
{
	/* user_vm_end/start are fixed before bpf prog runs */
	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
	struct mem_cgroup *new_memcg, *old_memcg;
	struct apply_range_data data;
	struct page **pages = NULL;
	long remaining, mapped = 0;
	long alloc_pages;
	unsigned long flags;
	long pgoff = 0;
	u32 uaddr32;
	int ret, i;

	if (page_cnt > page_cnt_max)
		return 0;

	if (uaddr) {
		if (uaddr & ~PAGE_MASK)
			return 0;
		pgoff = compute_pgoff(arena, uaddr);
		if (pgoff > page_cnt_max - page_cnt)
			/* requested address will be outside of user VMA */
			return 0;
	}

	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
	/* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */
	alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
	pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
	if (!pages) {
		bpf_map_memcg_exit(old_memcg, new_memcg);
		return 0;
	}
	data.pages = pages;

	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
		goto out_free_pages;

	if (uaddr) {
		ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
		if (ret)
			goto out_unlock_free_pages;
		ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
	} else {
		ret = pgoff = range_tree_find(&arena->rt, page_cnt);
		if (pgoff >= 0)
			ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
	}
	if (ret)
		goto out_unlock_free_pages;

	remaining = page_cnt;
	uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);

	while (remaining) {
		long this_batch = min(remaining, alloc_pages);

		/* zeroing is needed, since alloc_pages_bulk() only fills in NULL entries */
		memset(pages, 0, this_batch * sizeof(struct page *));

		ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages);
		if (ret)
			goto out;

		/*
		 * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
		 * will not overflow 32-bit. Lower 32-bit need to represent
		 * contiguous user address range.
		 * Map these pages at kern_vm_start base.
		 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
		 * lower 32-bit and it's ok.
		 */
		data.i = 0;
		ret = apply_to_page_range(&init_mm,
					  kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT),
					  this_batch << PAGE_SHIFT, apply_range_set_cb, &data);
		if (ret) {
			/* data.i pages were mapped, account them and free the remaining */
			mapped += data.i;
			for (i = data.i; i < this_batch; i++)
				free_pages_nolock(pages[i], 0);
			goto out;
		}

		mapped += this_batch;
		remaining -= this_batch;
	}
	flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	kfree_nolock(pages);
	bpf_map_memcg_exit(old_memcg, new_memcg);
	return clear_lo32(arena->user_vm_start) + uaddr32;
out:
	range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	if (mapped) {
		flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
		arena_free_pages(arena, uaddr32, mapped, sleepable);
	}
	goto out_free_pages;
out_unlock_free_pages:
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
out_free_pages:
	kfree_nolock(pages);
	bpf_map_memcg_exit(old_memcg, new_memcg);
	return 0;
}
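
/*
 * Illustrative bpf program side sketch (not part of this file): allocating and
 * freeing arena pages via the kfuncs defined at the bottom of this file.
 * Assumes the usual arena map definition used by bpf programs; names are
 * placeholders of this sketch.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_ARENA);
 *		__uint(map_flags, BPF_F_MMAPABLE);
 *		__uint(max_entries, 100);	// arena size in pages
 *	} arena SEC(".maps");
 *
 *	void *page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
 *	if (page)
 *		bpf_arena_free_pages(&arena, page, 1);
 */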

/*
 * If page is present in vmalloc area, unmap it from vmalloc area,
 * unmap it from all user space vma-s,
 * and free it.
 */
static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
	struct vma_list *vml;

	guard(mutex)(&arena->lock);
	/* iterate link list under lock */
	list_for_each_entry(vml, &arena->vma_list, head)
		zap_page_range_single(vml->vma, uaddr,
				      PAGE_SIZE * page_cnt, NULL);
}

static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
{
	struct mem_cgroup *new_memcg, *old_memcg;
	u64 full_uaddr, uaddr_end;
	long kaddr, pgoff;
	struct page *page;
	struct llist_head free_pages;
	struct llist_node *pos, *t;
	struct arena_free_span *s;
	unsigned long flags;
	int ret = 0;

	/* only aligned lower 32-bit are relevant */
	uaddr = (u32)uaddr;
	uaddr &= PAGE_MASK;
	kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
	full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
	uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
	if (full_uaddr >= uaddr_end)
		return;

	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
	pgoff = compute_pgoff(arena, uaddr);
	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);

	if (!sleepable)
		goto defer;

	ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags);

	/* Can't proceed without holding the spinlock so defer the free */
	if (ret)
		goto defer;

	range_tree_set(&arena->rt, pgoff, page_cnt);

	init_llist_head(&free_pages);
	/* clear ptes and collect struct pages */
	apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
				     apply_range_clear_cb, &free_pages);

	/* drop the lock to do the tlb flush and zap pages */
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);

	/* ensure no stale TLB entries */
	flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));

	if (page_cnt > 1)
		/* bulk zap if multiple pages being freed */
		zap_pages(arena, full_uaddr, page_cnt);

	llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
		page = llist_entry(pos, struct page, pcp_llist);
		if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
			/*
			 * Optimization for the common case of page_cnt==1:
			 * If page wasn't mapped into some user vma there
			 * is no need to call zap_pages which is slow. When
			 * page_cnt is big it's faster to do the batched zap.
			 */
			zap_pages(arena, full_uaddr, 1);
		__free_page(page);
	}
	bpf_map_memcg_exit(old_memcg, new_memcg);

	return;

defer:
	s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, NUMA_NO_NODE);
	bpf_map_memcg_exit(old_memcg, new_memcg);
	if (!s)
		/*
		 * If the allocation fails in non-sleepable context, the pages are
		 * intentionally left inaccessible (leaked) until the arena is
		 * destroyed; there is no safe way to clean up or retry here.
		 */
		return;

	s->page_cnt = page_cnt;
	s->uaddr = uaddr;
	llist_add(&s->node, &arena->free_spans);
	irq_work_queue(&arena->free_irq);
}

/*
 * Reserve an arena virtual address range without populating it. This call stops
 * bpf_arena_alloc_pages from adding pages to this range.
 */
static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
{
	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
	struct mem_cgroup *new_memcg, *old_memcg;
	unsigned long flags;
	long pgoff;
	int ret;

	if (uaddr & ~PAGE_MASK)
		return 0;

	pgoff = compute_pgoff(arena, uaddr);
	if (pgoff + page_cnt > page_cnt_max)
		return -EINVAL;

	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
		return -EBUSY;

	/* Cannot guard already allocated pages. */
	ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
	if (ret) {
		ret = -EBUSY;
		goto out;
	}

	/* "Allocate" the region to prevent it from being allocated. */
	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
	ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
	bpf_map_memcg_exit(old_memcg, new_memcg);
out:
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
	return ret;
}

static void arena_free_worker(struct work_struct *work)
{
	struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work);
	struct mem_cgroup *new_memcg, *old_memcg;
	struct llist_node *list, *pos, *t;
	struct arena_free_span *s;
	u64 arena_vm_start, user_vm_start;
	struct llist_head free_pages;
	struct page *page;
	unsigned long full_uaddr;
	long kaddr, page_cnt, pgoff;
	unsigned long flags;

	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) {
		schedule_work(work);
		return;
	}

	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);

	init_llist_head(&free_pages);
	arena_vm_start = bpf_arena_get_kern_vm_start(arena);
	user_vm_start = bpf_arena_get_user_vm_start(arena);

	list = llist_del_all(&arena->free_spans);
	llist_for_each(pos, list) {
		s = llist_entry(pos, struct arena_free_span, node);
		page_cnt = s->page_cnt;
		kaddr = arena_vm_start + s->uaddr;
		pgoff = compute_pgoff(arena, s->uaddr);

		/* clear ptes and collect pages in free_pages llist */
		apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
					     apply_range_clear_cb, &free_pages);

		range_tree_set(&arena->rt, pgoff, page_cnt);
	}
	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);

	/* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */
	llist_for_each_safe(pos, t, list) {
		s = llist_entry(pos, struct arena_free_span, node);
		page_cnt = s->page_cnt;
		full_uaddr = clear_lo32(user_vm_start) + s->uaddr;
		kaddr = arena_vm_start + s->uaddr;

		/* ensure no stale TLB entries */
		flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));

		/* remove pages from user vmas */
		zap_pages(arena, full_uaddr, page_cnt);

		kfree_nolock(s);
	}

	/* free all pages collected by apply_to_existing_page_range() in the first loop */
	llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
		page = llist_entry(pos, struct page, pcp_llist);
		__free_page(page);
	}

	bpf_map_memcg_exit(old_memcg, new_memcg);
}

static void arena_free_irq(struct irq_work *iw)
{
	struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq);

	schedule_work(&arena->free_work);
}

__bpf_kfunc_start_defs();

__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
					int node_id, u64 flags)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
		return NULL;

	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
}

void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
					  int node_id, u64 flags)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
		return NULL;

	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
}

__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
		return;
	arena_free_pages(arena, (long)ptr__ign, page_cnt, true);
}

void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
		return;
	arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
}

__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA)
		return -EINVAL;

	if (!page_cnt)
		return 0;

	return arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)

static const struct btf_kfunc_id_set common_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &arena_kfuncs,
};

static int __init kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);

void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
{
	struct bpf_stream_stage ss;
	struct bpf_prog *prog;
	u64 user_vm_start;

	/*
	 * The RCU read lock is held to safely traverse the latch tree, but we
	 * don't need its protection when accessing the prog, since it will not
	 * disappear while we are handling the fault.
	 */
	rcu_read_lock();
	prog = bpf_prog_ksym_find(fault_ip);
	rcu_read_unlock();
	if (!prog)
		return;

	/* Use main prog for stream access */
	prog = prog->aux->main_prog_aux->prog;

	user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
	addr += clear_lo32(user_vm_start);

	bpf_stream_stage(ss, prog, BPF_STDERR, ({
		bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n",
				  write ? "WRITE" : "READ", addr);
		bpf_stream_dump_stack(ss);
	}));
}