1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 3 */ 4 #include <crypto/sha2.h> 5 #include <linux/bpf.h> 6 #include <linux/bpf-cgroup.h> 7 #include <linux/bpf_trace.h> 8 #include <linux/bpf_lirc.h> 9 #include <linux/bpf_verifier.h> 10 #include <linux/bsearch.h> 11 #include <linux/btf.h> 12 #include <linux/hex.h> 13 #include <linux/syscalls.h> 14 #include <linux/slab.h> 15 #include <linux/sched/signal.h> 16 #include <linux/vmalloc.h> 17 #include <linux/mmzone.h> 18 #include <linux/anon_inodes.h> 19 #include <linux/fdtable.h> 20 #include <linux/file.h> 21 #include <linux/fs.h> 22 #include <linux/license.h> 23 #include <linux/filter.h> 24 #include <linux/kernel.h> 25 #include <linux/idr.h> 26 #include <linux/cred.h> 27 #include <linux/timekeeping.h> 28 #include <linux/ctype.h> 29 #include <linux/nospec.h> 30 #include <linux/audit.h> 31 #include <uapi/linux/btf.h> 32 #include <linux/pgtable.h> 33 #include <linux/bpf_lsm.h> 34 #include <linux/poll.h> 35 #include <linux/sort.h> 36 #include <linux/bpf-netns.h> 37 #include <linux/rcupdate_trace.h> 38 #include <linux/memcontrol.h> 39 #include <linux/trace_events.h> 40 #include <linux/tracepoint.h> 41 #include <linux/overflow.h> 42 #include <linux/cookie.h> 43 #include <linux/verification.h> 44 45 #include <net/netfilter/nf_bpf_link.h> 46 #include <net/netkit.h> 47 #include <net/tcx.h> 48 49 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ 50 (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ 51 (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) 52 #define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) 53 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) 54 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ 55 IS_FD_HASH(map)) 56 57 #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) 58 59 DEFINE_PER_CPU(int, bpf_prog_active); 60 DEFINE_COOKIE(bpf_map_cookie); 61 static 
DEFINE_IDR(prog_idr); 62 static DEFINE_SPINLOCK(prog_idr_lock); 63 static DEFINE_IDR(map_idr); 64 static DEFINE_SPINLOCK(map_idr_lock); 65 static DEFINE_IDR(link_idr); 66 static DEFINE_SPINLOCK(link_idr_lock); 67 68 int sysctl_unprivileged_bpf_disabled __read_mostly = 69 IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; 70 71 static const struct bpf_map_ops * const bpf_map_types[] = { 72 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) 73 #define BPF_MAP_TYPE(_id, _ops) \ 74 [_id] = &_ops, 75 #define BPF_LINK_TYPE(_id, _name) 76 #include <linux/bpf_types.h> 77 #undef BPF_PROG_TYPE 78 #undef BPF_MAP_TYPE 79 #undef BPF_LINK_TYPE 80 }; 81 82 /* 83 * If we're handed a bigger struct than we know of, ensure all the unknown bits 84 * are 0 - i.e. new user-space does not rely on any kernel feature extensions 85 * we don't know about yet. 86 * 87 * There is a ToCToU between this function call and the following 88 * copy_from_user() call. However, this is not a concern since this function is 89 * meant to be a future-proofing of bits. 90 */ 91 int bpf_check_uarg_tail_zero(bpfptr_t uaddr, 92 size_t expected_size, 93 size_t actual_size) 94 { 95 int res; 96 97 if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ 98 return -E2BIG; 99 100 if (actual_size <= expected_size) 101 return 0; 102 103 if (uaddr.is_kernel) 104 res = memchr_inv(uaddr.kernel + expected_size, 0, 105 actual_size - expected_size) == NULL; 106 else 107 res = check_zeroed_user(uaddr.user + expected_size, 108 actual_size - expected_size); 109 if (res < 0) 110 return res; 111 return res ? 
0 : -E2BIG; 112 } 113 114 const struct bpf_map_ops bpf_map_offload_ops = { 115 .map_meta_equal = bpf_map_meta_equal, 116 .map_alloc = bpf_map_offload_map_alloc, 117 .map_free = bpf_map_offload_map_free, 118 .map_check_btf = map_check_no_btf, 119 .map_mem_usage = bpf_map_offload_map_mem_usage, 120 }; 121 122 static void bpf_map_write_active_inc(struct bpf_map *map) 123 { 124 atomic64_inc(&map->writecnt); 125 } 126 127 static void bpf_map_write_active_dec(struct bpf_map *map) 128 { 129 atomic64_dec(&map->writecnt); 130 } 131 132 bool bpf_map_write_active(const struct bpf_map *map) 133 { 134 return atomic64_read(&map->writecnt) != 0; 135 } 136 137 static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) 138 { 139 if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) 140 return map->value_size; 141 else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 142 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 143 map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || 144 map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 145 return round_up(map->value_size, 8) * num_possible_cpus(); 146 else if (IS_FD_MAP(map)) 147 return sizeof(u32); 148 else 149 return map->value_size; 150 } 151 152 static void maybe_wait_bpf_programs(struct bpf_map *map) 153 { 154 /* Wait for any running non-sleepable BPF programs to complete so that 155 * userspace, when we return to it, knows that all non-sleepable 156 * programs that could be running use the new map value. For sleepable 157 * BPF programs, synchronize_rcu_tasks_trace() should be used to wait 158 * for the completions of these programs, but considering the waiting 159 * time can be very long and userspace may think it will hang forever, 160 * so don't handle sleepable BPF programs now. 
161 */ 162 if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || 163 map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) 164 synchronize_rcu_expedited(); 165 } 166 167 static void unpin_uptr_kaddr(void *kaddr) 168 { 169 if (kaddr) 170 unpin_user_page(virt_to_page(kaddr)); 171 } 172 173 static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) 174 { 175 const struct btf_field *field; 176 void **uptr_addr; 177 int i; 178 179 for (i = 0, field = rec->fields; i < cnt; i++, field++) { 180 if (field->type != BPF_UPTR) 181 continue; 182 183 uptr_addr = obj + field->offset; 184 unpin_uptr_kaddr(*uptr_addr); 185 } 186 } 187 188 static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) 189 { 190 if (!btf_record_has_field(rec, BPF_UPTR)) 191 return; 192 193 __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); 194 } 195 196 static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) 197 { 198 const struct btf_field *field; 199 const struct btf_type *t; 200 unsigned long start, end; 201 struct page *page; 202 void **uptr_addr; 203 int i, err; 204 205 if (!btf_record_has_field(rec, BPF_UPTR)) 206 return 0; 207 208 for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { 209 if (field->type != BPF_UPTR) 210 continue; 211 212 uptr_addr = obj + field->offset; 213 start = *(unsigned long *)uptr_addr; 214 if (!start) 215 continue; 216 217 t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); 218 /* t->size was checked for zero before */ 219 if (check_add_overflow(start, t->size - 1, &end)) { 220 err = -EFAULT; 221 goto unpin_all; 222 } 223 224 /* The uptr's struct cannot span across two pages */ 225 if ((start & PAGE_MASK) != (end & PAGE_MASK)) { 226 err = -EOPNOTSUPP; 227 goto unpin_all; 228 } 229 230 err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); 231 if (err != 1) 232 goto unpin_all; 233 234 if (PageHighMem(page)) { 235 err = -EOPNOTSUPP; 236 unpin_user_page(page); 237 goto unpin_all; 238 } 239 240 *uptr_addr = page_address(page) + 
offset_in_page(start); 241 } 242 243 return 0; 244 245 unpin_all: 246 __bpf_obj_unpin_uptrs(rec, i, obj); 247 return err; 248 } 249 250 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, 251 void *key, void *value, __u64 flags) 252 { 253 int err; 254 255 /* Need to create a kthread, thus must support schedule */ 256 if (bpf_map_is_offloaded(map)) { 257 return bpf_map_offload_update_elem(map, key, value, flags); 258 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || 259 map->map_type == BPF_MAP_TYPE_ARENA || 260 map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 261 return map->ops->map_update_elem(map, key, value, flags); 262 } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || 263 map->map_type == BPF_MAP_TYPE_SOCKMAP) { 264 return sock_map_update_elem_sys(map, key, value, flags); 265 } else if (IS_FD_PROG_ARRAY(map)) { 266 return bpf_fd_array_map_update_elem(map, map_file, key, value, 267 flags); 268 } 269 270 bpf_disable_instrumentation(); 271 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 272 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 273 err = bpf_percpu_hash_update(map, key, value, flags); 274 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 275 err = bpf_percpu_array_update(map, key, value, flags); 276 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { 277 err = bpf_percpu_cgroup_storage_update(map, key, value, 278 flags); 279 } else if (IS_FD_ARRAY(map)) { 280 err = bpf_fd_array_map_update_elem(map, map_file, key, value, 281 flags); 282 } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { 283 err = bpf_fd_htab_map_update_elem(map, map_file, key, value, 284 flags); 285 } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 286 /* rcu_read_lock() is not needed */ 287 err = bpf_fd_reuseport_array_update_elem(map, key, value, 288 flags); 289 } else if (map->map_type == BPF_MAP_TYPE_QUEUE || 290 map->map_type == BPF_MAP_TYPE_STACK || 291 map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { 292 err = 
map->ops->map_push_elem(map, value, flags); 293 } else { 294 err = bpf_obj_pin_uptrs(map->record, value); 295 if (!err) { 296 rcu_read_lock(); 297 err = map->ops->map_update_elem(map, key, value, flags); 298 rcu_read_unlock(); 299 if (err) 300 bpf_obj_unpin_uptrs(map->record, value); 301 } 302 } 303 bpf_enable_instrumentation(); 304 305 return err; 306 } 307 308 static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, 309 __u64 flags) 310 { 311 void *ptr; 312 int err; 313 314 if (bpf_map_is_offloaded(map)) 315 return bpf_map_offload_lookup_elem(map, key, value); 316 317 bpf_disable_instrumentation(); 318 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 319 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 320 err = bpf_percpu_hash_copy(map, key, value, flags); 321 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 322 err = bpf_percpu_array_copy(map, key, value, flags); 323 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { 324 err = bpf_percpu_cgroup_storage_copy(map, key, value, flags); 325 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 326 err = bpf_stackmap_extract(map, key, value, false); 327 } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { 328 err = bpf_fd_array_map_lookup_elem(map, key, value); 329 } else if (IS_FD_HASH(map)) { 330 err = bpf_fd_htab_map_lookup_elem(map, key, value); 331 } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 332 err = bpf_fd_reuseport_array_lookup_elem(map, key, value); 333 } else if (map->map_type == BPF_MAP_TYPE_QUEUE || 334 map->map_type == BPF_MAP_TYPE_STACK || 335 map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { 336 err = map->ops->map_peek_elem(map, value); 337 } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 338 /* struct_ops map requires directly updating "value" */ 339 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); 340 } else { 341 rcu_read_lock(); 342 if (map->ops->map_lookup_elem_sys_only) 343 ptr = map->ops->map_lookup_elem_sys_only(map, 
key); 344 else 345 ptr = map->ops->map_lookup_elem(map, key); 346 if (IS_ERR(ptr)) { 347 err = PTR_ERR(ptr); 348 } else if (!ptr) { 349 err = -ENOENT; 350 } else { 351 err = 0; 352 if (flags & BPF_F_LOCK) 353 /* lock 'ptr' and copy everything but lock */ 354 copy_map_value_locked(map, value, ptr, true); 355 else 356 copy_map_value(map, value, ptr); 357 /* mask lock and timer, since value wasn't zero inited */ 358 check_and_init_map_value(map, value); 359 } 360 rcu_read_unlock(); 361 } 362 363 bpf_enable_instrumentation(); 364 365 return err; 366 } 367 368 /* Please, do not use this function outside from the map creation path 369 * (e.g. in map update path) without taking care of setting the active 370 * memory cgroup (see at bpf_map_kmalloc_node() for example). 371 */ 372 static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) 373 { 374 /* We really just want to fail instead of triggering OOM killer 375 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, 376 * which is used for lower order allocation requests. 377 * 378 * It has been observed that higher order allocation requests done by 379 * vmalloc with __GFP_NORETRY being set might fail due to not trying 380 * to reclaim memory from the page cache, thus we set 381 * __GFP_RETRY_MAYFAIL to avoid such situations. 
382 */ 383 384 gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO); 385 unsigned int flags = 0; 386 unsigned long align = 1; 387 void *area; 388 389 if (size >= SIZE_MAX) 390 return NULL; 391 392 /* kmalloc()'ed memory can't be mmap()'ed */ 393 if (mmapable) { 394 BUG_ON(!PAGE_ALIGNED(size)); 395 align = SHMLBA; 396 flags = VM_USERMAP; 397 } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 398 area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, 399 numa_node); 400 if (area != NULL) 401 return area; 402 } 403 404 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 405 gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, 406 flags, numa_node, __builtin_return_address(0)); 407 } 408 409 void *bpf_map_area_alloc(u64 size, int numa_node) 410 { 411 return __bpf_map_area_alloc(size, numa_node, false); 412 } 413 414 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) 415 { 416 return __bpf_map_area_alloc(size, numa_node, true); 417 } 418 419 void bpf_map_area_free(void *area) 420 { 421 kvfree(area); 422 } 423 424 static u32 bpf_map_flags_retain_permanent(u32 flags) 425 { 426 /* Some map creation flags are not tied to the map object but 427 * rather to the map fd instead, so they have no meaning upon 428 * map object inspection since multiple file descriptors with 429 * different (access) properties can exist here. Thus, given 430 * this has zero meaning for the map itself, lets clear these 431 * from here. 
432 */ 433 return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); 434 } 435 436 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) 437 { 438 map->map_type = attr->map_type; 439 map->key_size = attr->key_size; 440 map->value_size = attr->value_size; 441 map->max_entries = attr->max_entries; 442 map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); 443 map->numa_node = bpf_map_attr_numa_node(attr); 444 map->map_extra = attr->map_extra; 445 } 446 447 static int bpf_map_alloc_id(struct bpf_map *map) 448 { 449 int id; 450 451 idr_preload(GFP_KERNEL); 452 spin_lock_bh(&map_idr_lock); 453 id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); 454 if (id > 0) 455 map->id = id; 456 spin_unlock_bh(&map_idr_lock); 457 idr_preload_end(); 458 459 if (WARN_ON_ONCE(!id)) 460 return -ENOSPC; 461 462 return id > 0 ? 0 : id; 463 } 464 465 void bpf_map_free_id(struct bpf_map *map) 466 { 467 unsigned long flags; 468 469 /* Offloaded maps are removed from the IDR store when their device 470 * disappears - even if someone holds an fd to them they are unusable, 471 * the memory is gone, all ops will fail; they are simply waiting for 472 * refcnt to drop to be freed. 473 */ 474 if (!map->id) 475 return; 476 477 spin_lock_irqsave(&map_idr_lock, flags); 478 479 idr_remove(&map_idr, map->id); 480 map->id = 0; 481 482 spin_unlock_irqrestore(&map_idr_lock, flags); 483 } 484 485 #ifdef CONFIG_MEMCG 486 static void bpf_map_save_memcg(struct bpf_map *map) 487 { 488 /* Currently if a map is created by a process belonging to the root 489 * memory cgroup, get_obj_cgroup_from_current() will return NULL. 490 * So we have to check map->objcg for being NULL each time it's 491 * being used. 
492 */ 493 if (memcg_bpf_enabled()) 494 map->objcg = get_obj_cgroup_from_current(); 495 } 496 497 static void bpf_map_release_memcg(struct bpf_map *map) 498 { 499 if (map->objcg) 500 obj_cgroup_put(map->objcg); 501 } 502 503 static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) 504 { 505 if (map->objcg) 506 return get_mem_cgroup_from_objcg(map->objcg); 507 508 return root_mem_cgroup; 509 } 510 511 void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, 512 struct mem_cgroup **new_memcg) 513 { 514 *new_memcg = bpf_map_get_memcg(map); 515 *old_memcg = set_active_memcg(*new_memcg); 516 } 517 518 void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, 519 struct mem_cgroup *new_memcg) 520 { 521 set_active_memcg(old_memcg); 522 mem_cgroup_put(new_memcg); 523 } 524 525 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, 526 int node) 527 { 528 struct mem_cgroup *memcg, *old_memcg; 529 void *ptr; 530 531 bpf_map_memcg_enter(map, &old_memcg, &memcg); 532 ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); 533 bpf_map_memcg_exit(old_memcg, memcg); 534 535 return ptr; 536 } 537 538 void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, 539 int node) 540 { 541 struct mem_cgroup *memcg, *old_memcg; 542 void *ptr; 543 544 bpf_map_memcg_enter(map, &old_memcg, &memcg); 545 ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); 546 bpf_map_memcg_exit(old_memcg, memcg); 547 548 return ptr; 549 } 550 551 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) 552 { 553 struct mem_cgroup *memcg, *old_memcg; 554 void *ptr; 555 556 bpf_map_memcg_enter(map, &old_memcg, &memcg); 557 ptr = kzalloc(size, flags | __GFP_ACCOUNT); 558 bpf_map_memcg_exit(old_memcg, memcg); 559 560 return ptr; 561 } 562 563 void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, 564 gfp_t flags) 565 { 566 struct mem_cgroup *memcg, *old_memcg; 567 void *ptr; 568 569 
bpf_map_memcg_enter(map, &old_memcg, &memcg); 570 ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); 571 bpf_map_memcg_exit(old_memcg, memcg); 572 573 return ptr; 574 } 575 576 void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, 577 size_t align, gfp_t flags) 578 { 579 struct mem_cgroup *memcg, *old_memcg; 580 void __percpu *ptr; 581 582 bpf_map_memcg_enter(map, &old_memcg, &memcg); 583 ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); 584 bpf_map_memcg_exit(old_memcg, memcg); 585 586 return ptr; 587 } 588 589 #else 590 static void bpf_map_save_memcg(struct bpf_map *map) 591 { 592 } 593 594 static void bpf_map_release_memcg(struct bpf_map *map) 595 { 596 } 597 #endif 598 599 static bool can_alloc_pages(void) 600 { 601 return preempt_count() == 0 && !irqs_disabled() && 602 !IS_ENABLED(CONFIG_PREEMPT_RT); 603 } 604 605 static struct page *__bpf_alloc_page(int nid) 606 { 607 if (!can_alloc_pages()) 608 return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); 609 610 return alloc_pages_node(nid, 611 GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT 612 | __GFP_NOWARN, 613 0); 614 } 615 616 int bpf_map_alloc_pages(const struct bpf_map *map, int nid, 617 unsigned long nr_pages, struct page **pages) 618 { 619 unsigned long i, j; 620 struct page *pg; 621 int ret = 0; 622 623 for (i = 0; i < nr_pages; i++) { 624 pg = __bpf_alloc_page(nid); 625 626 if (pg) { 627 pages[i] = pg; 628 continue; 629 } 630 for (j = 0; j < i; j++) 631 free_pages_nolock(pages[j], 0); 632 ret = -ENOMEM; 633 break; 634 } 635 636 return ret; 637 } 638 639 640 static int btf_field_cmp(const void *a, const void *b) 641 { 642 const struct btf_field *f1 = a, *f2 = b; 643 644 if (f1->offset < f2->offset) 645 return -1; 646 else if (f1->offset > f2->offset) 647 return 1; 648 return 0; 649 } 650 651 struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, 652 u32 field_mask) 653 { 654 struct btf_field *field; 655 656 if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & 
field_mask)) 657 return NULL; 658 field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); 659 if (!field || !(field->type & field_mask)) 660 return NULL; 661 return field; 662 } 663 664 void btf_record_free(struct btf_record *rec) 665 { 666 int i; 667 668 if (IS_ERR_OR_NULL(rec)) 669 return; 670 for (i = 0; i < rec->cnt; i++) { 671 switch (rec->fields[i].type) { 672 case BPF_KPTR_UNREF: 673 case BPF_KPTR_REF: 674 case BPF_KPTR_PERCPU: 675 case BPF_UPTR: 676 if (rec->fields[i].kptr.module) 677 module_put(rec->fields[i].kptr.module); 678 if (btf_is_kernel(rec->fields[i].kptr.btf)) 679 btf_put(rec->fields[i].kptr.btf); 680 break; 681 case BPF_LIST_HEAD: 682 case BPF_LIST_NODE: 683 case BPF_RB_ROOT: 684 case BPF_RB_NODE: 685 case BPF_SPIN_LOCK: 686 case BPF_RES_SPIN_LOCK: 687 case BPF_TIMER: 688 case BPF_REFCOUNT: 689 case BPF_WORKQUEUE: 690 case BPF_TASK_WORK: 691 /* Nothing to release */ 692 break; 693 default: 694 WARN_ON_ONCE(1); 695 continue; 696 } 697 } 698 kfree(rec); 699 } 700 701 void bpf_map_free_record(struct bpf_map *map) 702 { 703 btf_record_free(map->record); 704 map->record = NULL; 705 } 706 707 struct btf_record *btf_record_dup(const struct btf_record *rec) 708 { 709 const struct btf_field *fields; 710 struct btf_record *new_rec; 711 int ret, size, i; 712 713 if (IS_ERR_OR_NULL(rec)) 714 return NULL; 715 size = struct_size(rec, fields, rec->cnt); 716 new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); 717 if (!new_rec) 718 return ERR_PTR(-ENOMEM); 719 /* Do a deep copy of the btf_record */ 720 fields = rec->fields; 721 new_rec->cnt = 0; 722 for (i = 0; i < rec->cnt; i++) { 723 switch (fields[i].type) { 724 case BPF_KPTR_UNREF: 725 case BPF_KPTR_REF: 726 case BPF_KPTR_PERCPU: 727 case BPF_UPTR: 728 if (btf_is_kernel(fields[i].kptr.btf)) 729 btf_get(fields[i].kptr.btf); 730 if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { 731 ret = -ENXIO; 732 goto free; 733 } 734 break; 735 case BPF_LIST_HEAD: 
736 case BPF_LIST_NODE: 737 case BPF_RB_ROOT: 738 case BPF_RB_NODE: 739 case BPF_SPIN_LOCK: 740 case BPF_RES_SPIN_LOCK: 741 case BPF_TIMER: 742 case BPF_REFCOUNT: 743 case BPF_WORKQUEUE: 744 case BPF_TASK_WORK: 745 /* Nothing to acquire */ 746 break; 747 default: 748 ret = -EFAULT; 749 WARN_ON_ONCE(1); 750 goto free; 751 } 752 new_rec->cnt++; 753 } 754 return new_rec; 755 free: 756 btf_record_free(new_rec); 757 return ERR_PTR(ret); 758 } 759 760 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) 761 { 762 bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); 763 int size; 764 765 if (!a_has_fields && !b_has_fields) 766 return true; 767 if (a_has_fields != b_has_fields) 768 return false; 769 if (rec_a->cnt != rec_b->cnt) 770 return false; 771 size = struct_size(rec_a, fields, rec_a->cnt); 772 /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused 773 * members are zeroed out. So memcmp is safe to do without worrying 774 * about padding/unused fields. 775 * 776 * While spin_lock, timer, and kptr have no relation to map BTF, 777 * list_head metadata is specific to map BTF, the btf and value_rec 778 * members in particular. btf is the map BTF, while value_rec points to 779 * btf_record in that map BTF. 780 * 781 * So while by default, we don't rely on the map BTF (which the records 782 * were parsed from) matching for both records, which is not backwards 783 * compatible, in case list_head is part of it, we implicitly rely on 784 * that by way of depending on memcmp succeeding for it. 
785 */ 786 return !memcmp(rec_a, rec_b, size); 787 } 788 789 void bpf_obj_free_timer(const struct btf_record *rec, void *obj) 790 { 791 if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) 792 return; 793 bpf_timer_cancel_and_free(obj + rec->timer_off); 794 } 795 796 void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) 797 { 798 if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) 799 return; 800 bpf_wq_cancel_and_free(obj + rec->wq_off); 801 } 802 803 void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) 804 { 805 if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) 806 return; 807 bpf_task_work_cancel_and_free(obj + rec->task_work_off); 808 } 809 810 void bpf_obj_free_fields(const struct btf_record *rec, void *obj) 811 { 812 const struct btf_field *fields; 813 int i; 814 815 if (IS_ERR_OR_NULL(rec)) 816 return; 817 fields = rec->fields; 818 for (i = 0; i < rec->cnt; i++) { 819 struct btf_struct_meta *pointee_struct_meta; 820 const struct btf_field *field = &fields[i]; 821 void *field_ptr = obj + field->offset; 822 void *xchgd_field; 823 824 switch (fields[i].type) { 825 case BPF_SPIN_LOCK: 826 case BPF_RES_SPIN_LOCK: 827 break; 828 case BPF_TIMER: 829 bpf_timer_cancel_and_free(field_ptr); 830 break; 831 case BPF_WORKQUEUE: 832 bpf_wq_cancel_and_free(field_ptr); 833 break; 834 case BPF_TASK_WORK: 835 bpf_task_work_cancel_and_free(field_ptr); 836 break; 837 case BPF_KPTR_UNREF: 838 WRITE_ONCE(*(u64 *)field_ptr, 0); 839 break; 840 case BPF_KPTR_REF: 841 case BPF_KPTR_PERCPU: 842 xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); 843 if (!xchgd_field) 844 break; 845 846 if (!btf_is_kernel(field->kptr.btf)) { 847 pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, 848 field->kptr.btf_id); 849 __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 
850 pointee_struct_meta->record : NULL, 851 fields[i].type == BPF_KPTR_PERCPU); 852 } else { 853 field->kptr.dtor(xchgd_field); 854 } 855 break; 856 case BPF_UPTR: 857 /* The caller ensured that no one is using the uptr */ 858 unpin_uptr_kaddr(*(void **)field_ptr); 859 break; 860 case BPF_LIST_HEAD: 861 if (WARN_ON_ONCE(rec->spin_lock_off < 0)) 862 continue; 863 bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); 864 break; 865 case BPF_RB_ROOT: 866 if (WARN_ON_ONCE(rec->spin_lock_off < 0)) 867 continue; 868 bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); 869 break; 870 case BPF_LIST_NODE: 871 case BPF_RB_NODE: 872 case BPF_REFCOUNT: 873 break; 874 default: 875 WARN_ON_ONCE(1); 876 continue; 877 } 878 } 879 } 880 881 static void bpf_map_free(struct bpf_map *map) 882 { 883 struct btf_record *rec = map->record; 884 struct btf *btf = map->btf; 885 886 /* implementation dependent freeing. Disabling migration to simplify 887 * the free of values or special fields allocated from bpf memory 888 * allocator. 889 */ 890 kfree(map->excl_prog_sha); 891 migrate_disable(); 892 map->ops->map_free(map); 893 migrate_enable(); 894 895 /* Delay freeing of btf_record for maps, as map_free 896 * callback usually needs access to them. It is better to do it here 897 * than require each callback to do the free itself manually. 898 * 899 * Note that the btf_record stashed in map->inner_map_meta->record was 900 * already freed using the map_free callback for map in map case which 901 * eventually calls bpf_map_free_meta, since inner_map_meta is only a 902 * template bpf_map struct used during verification. 903 */ 904 btf_record_free(rec); 905 /* Delay freeing of btf for maps, as map_free callback may need 906 * struct_meta info which will be freed with btf_put(). 
907 */ 908 btf_put(btf); 909 } 910 911 /* called from workqueue */ 912 static void bpf_map_free_deferred(struct work_struct *work) 913 { 914 struct bpf_map *map = container_of(work, struct bpf_map, work); 915 916 security_bpf_map_free(map); 917 bpf_map_release_memcg(map); 918 bpf_map_owner_free(map); 919 bpf_map_free(map); 920 } 921 922 static void bpf_map_put_uref(struct bpf_map *map) 923 { 924 if (atomic64_dec_and_test(&map->usercnt)) { 925 if (map->ops->map_release_uref) 926 map->ops->map_release_uref(map); 927 } 928 } 929 930 static void bpf_map_free_in_work(struct bpf_map *map) 931 { 932 INIT_WORK(&map->work, bpf_map_free_deferred); 933 /* Avoid spawning kworkers, since they all might contend 934 * for the same mutex like slab_mutex. 935 */ 936 queue_work(system_dfl_wq, &map->work); 937 } 938 939 static void bpf_map_free_rcu_gp(struct rcu_head *rcu) 940 { 941 bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); 942 } 943 944 /* decrement map refcnt and schedule it for freeing via workqueue 945 * (underlying map implementation ops->map_free() might sleep) 946 */ 947 void bpf_map_put(struct bpf_map *map) 948 { 949 if (atomic64_dec_and_test(&map->refcnt)) { 950 /* bpf_map_free_id() must be called first */ 951 bpf_map_free_id(map); 952 953 WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); 954 /* RCU tasks trace grace period implies RCU grace period. 
*/ 955 if (READ_ONCE(map->free_after_mult_rcu_gp)) 956 call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_gp); 957 else if (READ_ONCE(map->free_after_rcu_gp)) 958 call_rcu(&map->rcu, bpf_map_free_rcu_gp); 959 else 960 bpf_map_free_in_work(map); 961 } 962 } 963 EXPORT_SYMBOL_GPL(bpf_map_put); 964 965 void bpf_map_put_with_uref(struct bpf_map *map) 966 { 967 bpf_map_put_uref(map); 968 bpf_map_put(map); 969 } 970 971 static int bpf_map_release(struct inode *inode, struct file *filp) 972 { 973 struct bpf_map *map = filp->private_data; 974 975 if (map->ops->map_release) 976 map->ops->map_release(map, filp); 977 978 bpf_map_put_with_uref(map); 979 return 0; 980 } 981 982 static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) 983 { 984 fmode_t mode = fd_file(f)->f_mode; 985 986 /* Our file permissions may have been overridden by global 987 * map permissions facing syscall side. 988 */ 989 if (READ_ONCE(map->frozen)) 990 mode &= ~FMODE_CAN_WRITE; 991 return mode; 992 } 993 994 #ifdef CONFIG_PROC_FS 995 /* Show the memory usage of a bpf map */ 996 static u64 bpf_map_memory_usage(const struct bpf_map *map) 997 { 998 return map->ops->map_mem_usage(map); 999 } 1000 1001 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) 1002 { 1003 struct bpf_map *map = filp->private_data; 1004 u32 type = 0, jited = 0; 1005 1006 spin_lock(&map->owner_lock); 1007 if (map->owner) { 1008 type = map->owner->type; 1009 jited = map->owner->jited; 1010 } 1011 spin_unlock(&map->owner_lock); 1012 1013 seq_printf(m, 1014 "map_type:\t%u\n" 1015 "key_size:\t%u\n" 1016 "value_size:\t%u\n" 1017 "max_entries:\t%u\n" 1018 "map_flags:\t%#x\n" 1019 "map_extra:\t%#llx\n" 1020 "memlock:\t%llu\n" 1021 "map_id:\t%u\n" 1022 "frozen:\t%u\n", 1023 map->map_type, 1024 map->key_size, 1025 map->value_size, 1026 map->max_entries, 1027 map->map_flags, 1028 (unsigned long long)map->map_extra, 1029 bpf_map_memory_usage(map), 1030 map->id, 1031 READ_ONCE(map->frozen)); 1032 if (type) { 1033 
seq_printf(m, "owner_prog_type:\t%u\n", type); 1034 seq_printf(m, "owner_jited:\t%u\n", jited); 1035 } 1036 } 1037 #endif 1038 1039 static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, 1040 loff_t *ppos) 1041 { 1042 /* We need this handler such that alloc_file() enables 1043 * f_mode with FMODE_CAN_READ. 1044 */ 1045 return -EINVAL; 1046 } 1047 1048 static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, 1049 size_t siz, loff_t *ppos) 1050 { 1051 /* We need this handler such that alloc_file() enables 1052 * f_mode with FMODE_CAN_WRITE. 1053 */ 1054 return -EINVAL; 1055 } 1056 1057 /* called for any extra memory-mapped regions (except initial) */ 1058 static void bpf_map_mmap_open(struct vm_area_struct *vma) 1059 { 1060 struct bpf_map *map = vma->vm_file->private_data; 1061 1062 if (vma->vm_flags & VM_MAYWRITE) 1063 bpf_map_write_active_inc(map); 1064 } 1065 1066 /* called for all unmapped memory region (including initial) */ 1067 static void bpf_map_mmap_close(struct vm_area_struct *vma) 1068 { 1069 struct bpf_map *map = vma->vm_file->private_data; 1070 1071 if (vma->vm_flags & VM_MAYWRITE) 1072 bpf_map_write_active_dec(map); 1073 } 1074 1075 static const struct vm_operations_struct bpf_map_default_vmops = { 1076 .open = bpf_map_mmap_open, 1077 .close = bpf_map_mmap_close, 1078 }; 1079 1080 static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) 1081 { 1082 struct bpf_map *map = filp->private_data; 1083 int err = 0; 1084 1085 if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) 1086 return -ENOTSUPP; 1087 1088 if (!(vma->vm_flags & VM_SHARED)) 1089 return -EINVAL; 1090 1091 mutex_lock(&map->freeze_mutex); 1092 1093 if (vma->vm_flags & VM_WRITE) { 1094 if (map->frozen) { 1095 err = -EPERM; 1096 goto out; 1097 } 1098 /* map is meant to be read-only, so do not allow mapping as 1099 * writable, because it's possible to leak a writable page 1100 * reference and allows user-space to still modify it after 1101 
* freezing, while verifier will assume contents do not change 1102 */ 1103 if (map->map_flags & BPF_F_RDONLY_PROG) { 1104 err = -EACCES; 1105 goto out; 1106 } 1107 bpf_map_write_active_inc(map); 1108 } 1109 out: 1110 mutex_unlock(&map->freeze_mutex); 1111 if (err) 1112 return err; 1113 1114 /* set default open/close callbacks */ 1115 vma->vm_ops = &bpf_map_default_vmops; 1116 vma->vm_private_data = map; 1117 vm_flags_clear(vma, VM_MAYEXEC); 1118 /* If mapping is read-only, then disallow potentially re-mapping with 1119 * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing 1120 * means that as far as BPF map's memory-mapped VMAs are concerned, 1121 * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set, 1122 * both should be set, so we can forget about VM_MAYWRITE and always 1123 * check just VM_WRITE 1124 */ 1125 if (!(vma->vm_flags & VM_WRITE)) 1126 vm_flags_clear(vma, VM_MAYWRITE); 1127 1128 err = map->ops->map_mmap(map, vma); 1129 if (err) { 1130 if (vma->vm_flags & VM_WRITE) 1131 bpf_map_write_active_dec(map); 1132 } 1133 1134 return err; 1135 } 1136 1137 static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) 1138 { 1139 struct bpf_map *map = filp->private_data; 1140 1141 if (map->ops->map_poll) 1142 return map->ops->map_poll(map, filp, pts); 1143 1144 return EPOLLERR; 1145 } 1146 1147 static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr, 1148 unsigned long len, unsigned long pgoff, 1149 unsigned long flags) 1150 { 1151 struct bpf_map *map = filp->private_data; 1152 1153 if (map->ops->map_get_unmapped_area) 1154 return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); 1155 #ifdef CONFIG_MMU 1156 return mm_get_unmapped_area(filp, addr, len, pgoff, flags); 1157 #else 1158 return addr; 1159 #endif 1160 } 1161 1162 const struct file_operations bpf_map_fops = { 1163 #ifdef CONFIG_PROC_FS 1164 .show_fdinfo = bpf_map_show_fdinfo, 1165 #endif 1166 .release = bpf_map_release, 1167 
/*
 * Copy an object name from @src into @dst, validating it on the way.
 *
 * Both buffers must hold at least @size bytes. @dst is zero-filled first, so
 * a successful copy is always NUL-terminated and zero-padded. Only
 * alphanumeric characters, '_' and '.' are accepted.
 *
 * Returns the length of the name (excluding the terminator) on success, or
 * -EINVAL when an illegal character is found or when no '\0' occurs within
 * the first @size bytes of @src.
 */
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
	unsigned int len;

	memset(dst, 0, size);

	for (len = 0; len < size && src[len] != '\0'; len++) {
		char c = src[len];

		if (c != '_' && c != '.' && !isalnum(c))
			return -EINVAL;
		dst[len] = c;
	}

	/* The whole buffer was consumed without meeting a terminator. */
	if (len == size)
		return -EINVAL;

	return len;
}
BPF_MAP_TYPE_HASH && 1283 map->map_type != BPF_MAP_TYPE_ARRAY && 1284 map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && 1285 map->map_type != BPF_MAP_TYPE_SK_STORAGE && 1286 map->map_type != BPF_MAP_TYPE_INODE_STORAGE && 1287 map->map_type != BPF_MAP_TYPE_TASK_STORAGE && 1288 map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { 1289 ret = -EOPNOTSUPP; 1290 goto free_map_tab; 1291 } 1292 break; 1293 case BPF_TIMER: 1294 case BPF_WORKQUEUE: 1295 case BPF_TASK_WORK: 1296 if (map->map_type != BPF_MAP_TYPE_HASH && 1297 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1298 map->map_type != BPF_MAP_TYPE_ARRAY) { 1299 ret = -EOPNOTSUPP; 1300 goto free_map_tab; 1301 } 1302 break; 1303 case BPF_KPTR_UNREF: 1304 case BPF_KPTR_REF: 1305 case BPF_KPTR_PERCPU: 1306 case BPF_REFCOUNT: 1307 if (map->map_type != BPF_MAP_TYPE_HASH && 1308 map->map_type != BPF_MAP_TYPE_PERCPU_HASH && 1309 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1310 map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && 1311 map->map_type != BPF_MAP_TYPE_ARRAY && 1312 map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && 1313 map->map_type != BPF_MAP_TYPE_SK_STORAGE && 1314 map->map_type != BPF_MAP_TYPE_INODE_STORAGE && 1315 map->map_type != BPF_MAP_TYPE_TASK_STORAGE && 1316 map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { 1317 ret = -EOPNOTSUPP; 1318 goto free_map_tab; 1319 } 1320 break; 1321 case BPF_UPTR: 1322 if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { 1323 ret = -EOPNOTSUPP; 1324 goto free_map_tab; 1325 } 1326 break; 1327 case BPF_LIST_HEAD: 1328 case BPF_RB_ROOT: 1329 if (map->map_type != BPF_MAP_TYPE_HASH && 1330 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1331 map->map_type != BPF_MAP_TYPE_ARRAY) { 1332 ret = -EOPNOTSUPP; 1333 goto free_map_tab; 1334 } 1335 break; 1336 default: 1337 /* Fail if map_type checks are missing for a field type */ 1338 ret = -EOPNOTSUPP; 1339 goto free_map_tab; 1340 } 1341 } 1342 } 1343 1344 ret = btf_check_and_fixup_fields(btf, map->record); 1345 if (ret < 0) 1346 goto free_map_tab; 1347 1348 if 
/*
 * Core of BPF_MAP_CREATE; called via syscall.
 *
 * @attr:  kernel copy of the user's union bpf_attr. Note that the
 *         BPF_F_TOKEN_FD bit is cleared from attr->map_flags in place.
 * @uattr: original attribute pointer; only its is_kernel bit is used here,
 *         to interpret attr->excl_prog_hash and for the LSM hook.
 * @log:   log that receives a short reason for most validation failures.
 *
 * The function validates the attributes, resolves the map ops for the
 * requested type, applies token/capability checks, allocates and initializes
 * the map, optionally attaches BTF and an exclusive program hash, runs the
 * LSM hook, allocates a map ID and finally installs a file descriptor.
 *
 * Returns the value of bpf_map_new_fd() (a non-negative fd on success) or a
 * negative errno.
 */
static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log)
{
	const struct bpf_map_ops *ops;
	struct bpf_token *token = NULL;
	int numa_node = bpf_map_attr_numa_node(attr);
	u32 map_type = attr->map_type;
	struct bpf_map *map;
	bool token_flag;
	int f_flags;
	int err;

	/* All bytes of *attr past the last field this command uses must be 0. */
	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err) {
		bpf_log(log, "Invalid attr.\n");
		return -EINVAL;
	}

	/* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
	 * to avoid per-map type checks tripping on unknown flag
	 */
	token_flag = attr->map_flags & BPF_F_TOKEN_FD;
	attr->map_flags &= ~BPF_F_TOKEN_FD;

	if (attr->btf_vmlinux_value_type_id) {
		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
			bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n");
			return -EINVAL;
		}
		if (attr->btf_key_type_id || attr->btf_value_type_id) {
			bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n");
			return -EINVAL;
		}
	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
		/* A key type without a value type is not a valid combination. */
		bpf_log(log, "Invalid btf_value_type_id.\n");
		return -EINVAL;
	}

	/* map_extra is only accepted for bloom filter and arena maps. */
	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
	    attr->map_type != BPF_MAP_TYPE_ARENA &&
	    attr->map_extra != 0) {
		bpf_log(log, "Invalid map_extra.\n");
		return -EINVAL;
	}

	/* Translate BPF_F_RDONLY/BPF_F_WRONLY into O_* flags for the fd. */
	f_flags = bpf_get_file_flag(attr->map_flags);
	if (f_flags < 0)
		return f_flags;

	if (numa_node != NUMA_NO_NODE &&
	    ((unsigned int)numa_node >= nr_node_ids ||
	     !node_online(numa_node))) {
		bpf_log(log, "Invalid numa_node.\n");
		return -EINVAL;
	}

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	/* NOTE(review): map_type was already initialized from attr->map_type
	 * at its declaration; this second read looks redundant.
	 */
	map_type = attr->map_type;
	if (map_type >= ARRAY_SIZE(bpf_map_types)) {
		bpf_log(log, "Invalid map_type.\n");
		return -EINVAL;
	}
	/* Bound the index again after the range check before using it. */
	map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
	ops = bpf_map_types[map_type];
	if (!ops)
		return -EINVAL;

	if (ops->map_alloc_check) {
		err = ops->map_alloc_check(attr);
		if (err)
			return err;
	}
	/* A map bound to a netdev ifindex uses the offload ops instead. */
	if (attr->map_ifindex)
		ops = &bpf_map_offload_ops;
	if (!ops->map_mem_usage)
		return -EINVAL;

	if (token_flag) {
		token = bpf_token_get_from_fd(attr->map_token_fd);
		if (IS_ERR(token)) {
			bpf_log(log, "Invalid map_token_fd.\n");
			return PTR_ERR(token);
		}

		/* if current token doesn't grant map creation permissions,
		 * then we can't use this token, so ignore it and rely on
		 * system-wide capabilities checks
		 */
		if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
		    !bpf_token_allow_map_type(token, attr->map_type)) {
			bpf_token_put(token);
			token = NULL;
		}
	}

	/* Preset so every failed permission check below can jump straight
	 * to put_token.
	 */
	err = -EPERM;

	/* Intent here is for unprivileged_bpf_disabled to block BPF map
	 * creation for unprivileged users; other actions depend
	 * on fd availability and access to bpffs, so are dependent on
	 * object creation success. Even with unprivileged BPF disabled,
	 * capability checks are still carried out.
	 */
	if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
		goto put_token;

	/* check privileged map type permissions */
	switch (map_type) {
	case BPF_MAP_TYPE_ARRAY:
	case BPF_MAP_TYPE_PERCPU_ARRAY:
	case BPF_MAP_TYPE_PROG_ARRAY:
	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
	case BPF_MAP_TYPE_CGROUP_ARRAY:
	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
	case BPF_MAP_TYPE_HASH:
	case BPF_MAP_TYPE_PERCPU_HASH:
	case BPF_MAP_TYPE_HASH_OF_MAPS:
	case BPF_MAP_TYPE_RINGBUF:
	case BPF_MAP_TYPE_USER_RINGBUF:
	case BPF_MAP_TYPE_CGROUP_STORAGE:
	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
		/* unprivileged */
		break;
	case BPF_MAP_TYPE_SK_STORAGE:
	case BPF_MAP_TYPE_INODE_STORAGE:
	case BPF_MAP_TYPE_TASK_STORAGE:
	case BPF_MAP_TYPE_CGRP_STORAGE:
	case BPF_MAP_TYPE_BLOOM_FILTER:
	case BPF_MAP_TYPE_LPM_TRIE:
	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
	case BPF_MAP_TYPE_STACK_TRACE:
	case BPF_MAP_TYPE_QUEUE:
	case BPF_MAP_TYPE_STACK:
	case BPF_MAP_TYPE_LRU_HASH:
	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
	case BPF_MAP_TYPE_STRUCT_OPS:
	case BPF_MAP_TYPE_CPUMAP:
	case BPF_MAP_TYPE_ARENA:
	case BPF_MAP_TYPE_INSN_ARRAY:
		/* These types require CAP_BPF (possibly via the token). */
		if (!bpf_token_capable(token, CAP_BPF))
			goto put_token;
		break;
	case BPF_MAP_TYPE_SOCKMAP:
	case BPF_MAP_TYPE_SOCKHASH:
	case BPF_MAP_TYPE_DEVMAP:
	case BPF_MAP_TYPE_DEVMAP_HASH:
	case BPF_MAP_TYPE_XSKMAP:
		/* These types require CAP_NET_ADMIN (possibly via the token). */
		if (!bpf_token_capable(token, CAP_NET_ADMIN))
			goto put_token;
		break;
	default:
		/* A registered map type without a permission class above. */
		WARN(1, "unsupported map type %d", map_type);
		goto put_token;
	}

	map = ops->map_alloc(attr);
	if (IS_ERR(map)) {
		err = PTR_ERR(map);
		goto put_token;
	}
	map->ops = ops;
	map->map_type = map_type;

	/* bpf_obj_name_cpy() returns the name length on success, so only a
	 * negative value is an error here.
	 */
	err = bpf_obj_name_cpy(map->name, attr->map_name,
			       sizeof(attr->map_name));
	if (err < 0) {
		bpf_log(log, "Invalid map_name.\n");
		goto free_map;
	}

	preempt_disable();
	map->cookie = gen_cookie_next(&bpf_map_cookie);
	preempt_enable();

	/* The creator starts with one kernel reference and one user reference. */
	atomic64_set(&map->refcnt, 1);
	atomic64_set(&map->usercnt, 1);
	mutex_init(&map->freeze_mutex);
	spin_lock_init(&map->owner_lock);

	if (attr->btf_key_type_id || attr->btf_value_type_id ||
	    /* Even the map's value is a kernel's struct,
	     * the bpf_prog.o must have BTF to begin with
	     * to figure out the corresponding kernel's
	     * counter part. Thus, attr->btf_fd has
	     * to be valid also.
	     */
	    attr->btf_vmlinux_value_type_id) {
		struct btf *btf;

		btf = btf_get_by_fd(attr->btf_fd);
		if (IS_ERR(btf)) {
			bpf_log(log, "Invalid btf_fd.\n");
			err = PTR_ERR(btf);
			goto free_map;
		}
		/* Kernel BTF objects are rejected here; the reference taken
		 * by btf_get_by_fd() is dropped before bailing out.
		 */
		if (btf_is_kernel(btf)) {
			btf_put(btf);
			err = -EACCES;
			goto free_map;
		}
		map->btf = btf;

		if (attr->btf_value_type_id) {
			err = map_check_btf(map, token, btf, attr->btf_key_type_id,
					    attr->btf_value_type_id);
			if (err)
				goto free_map;
		}

		map->btf_key_type_id = attr->btf_key_type_id;
		map->btf_value_type_id = attr->btf_value_type_id;
		map->btf_vmlinux_value_type_id =
			attr->btf_vmlinux_value_type_id;
	}

	/* Optional exclusive-program hash: pointer and size must both be
	 * given, and the size must be exactly one SHA-256 digest.
	 */
	if (attr->excl_prog_hash) {
		bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);

		if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
			bpf_log(log, "Invalid excl_prog_hash_size.\n");
			err = -EINVAL;
			goto free_map;
		}

		map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
		if (!map->excl_prog_sha) {
			err = -ENOMEM;
			goto free_map;
		}

		if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) {
			err = -EFAULT;
			goto free_map;
		}
	} else if (attr->excl_prog_hash_size) {
		bpf_log(log, "Invalid excl_prog_hash_size.\n");
		err = -EINVAL;
		goto free_map;
	}

	err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
	if (err)
		goto free_map_sec;

	err = bpf_map_alloc_id(map);
	if (err)
		goto free_map_sec;

	bpf_map_save_memcg(map);
	/* The token is no longer needed once the checks above are done. */
	bpf_token_put(token);

	err = bpf_map_new_fd(map, f_flags);
	if (err < 0) {
		/* failed to allocate fd.
		 * bpf_map_put_with_uref() is needed because the above
		 * bpf_map_alloc_id() has published the map
		 * to the userspace and the userspace may
		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
		 */
		bpf_map_put_with_uref(map);
		return err;
	}

	/* On success err holds the value returned by bpf_map_new_fd(). */
	return err;

free_map_sec:
	security_bpf_map_free(map);
free_map:
	bpf_map_free(map);
put_token:
	bpf_token_put(token);
	return err;
}
1681 return map; 1682 } 1683 EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL"); 1684 1685 struct bpf_map *bpf_map_get_with_uref(u32 ufd) 1686 { 1687 CLASS(fd, f)(ufd); 1688 struct bpf_map *map = __bpf_map_get(f); 1689 1690 if (!IS_ERR(map)) 1691 bpf_map_inc_with_uref(map); 1692 1693 return map; 1694 } 1695 1696 /* map_idr_lock should have been held or the map should have been 1697 * protected by rcu read lock. 1698 */ 1699 struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) 1700 { 1701 int refold; 1702 1703 refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); 1704 if (!refold) 1705 return ERR_PTR(-ENOENT); 1706 if (uref) 1707 atomic64_inc(&map->usercnt); 1708 1709 return map; 1710 } 1711 1712 struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) 1713 { 1714 lockdep_assert(rcu_read_lock_held()); 1715 return __bpf_map_inc_not_zero(map, false); 1716 } 1717 EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); 1718 1719 int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, 1720 bool delete) 1721 { 1722 return -ENOTSUPP; 1723 } 1724 1725 static void *__bpf_copy_key(void __user *ukey, u64 key_size) 1726 { 1727 if (key_size) 1728 return vmemdup_user(ukey, key_size); 1729 1730 if (ukey) 1731 return ERR_PTR(-EINVAL); 1732 1733 return NULL; 1734 } 1735 1736 static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) 1737 { 1738 if (key_size) 1739 return kvmemdup_bpfptr(ukey, key_size); 1740 1741 if (!bpfptr_is_null(ukey)) 1742 return ERR_PTR(-EINVAL); 1743 1744 return NULL; 1745 } 1746 1747 /* last field in 'union bpf_attr' used by this command */ 1748 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags 1749 1750 static int map_lookup_elem(union bpf_attr *attr) 1751 { 1752 void __user *ukey = u64_to_user_ptr(attr->key); 1753 void __user *uvalue = u64_to_user_ptr(attr->value); 1754 struct bpf_map *map; 1755 void *key, *value; 1756 u32 value_size; 1757 int err; 1758 1759 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) 1760 return -EINVAL; 1761 1762 CLASS(fd, 
f)(attr->map_fd); 1763 map = __bpf_map_get(f); 1764 if (IS_ERR(map)) 1765 return PTR_ERR(map); 1766 if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) 1767 return -EPERM; 1768 1769 err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); 1770 if (err) 1771 return err; 1772 1773 key = __bpf_copy_key(ukey, map->key_size); 1774 if (IS_ERR(key)) 1775 return PTR_ERR(key); 1776 1777 value_size = bpf_map_value_size(map, attr->flags); 1778 1779 err = -ENOMEM; 1780 value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); 1781 if (!value) 1782 goto free_key; 1783 1784 if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { 1785 if (copy_from_user(value, uvalue, value_size)) 1786 err = -EFAULT; 1787 else 1788 err = bpf_map_copy_value(map, key, value, attr->flags); 1789 goto free_value; 1790 } 1791 1792 err = bpf_map_copy_value(map, key, value, attr->flags); 1793 if (err) 1794 goto free_value; 1795 1796 err = -EFAULT; 1797 if (copy_to_user(uvalue, value, value_size) != 0) 1798 goto free_value; 1799 1800 err = 0; 1801 1802 free_value: 1803 kvfree(value); 1804 free_key: 1805 kvfree(key); 1806 return err; 1807 } 1808 1809 1810 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags 1811 1812 static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) 1813 { 1814 bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); 1815 bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); 1816 struct bpf_map *map; 1817 void *key, *value; 1818 u32 value_size; 1819 int err; 1820 1821 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) 1822 return -EINVAL; 1823 1824 CLASS(fd, f)(attr->map_fd); 1825 map = __bpf_map_get(f); 1826 if (IS_ERR(map)) 1827 return PTR_ERR(map); 1828 bpf_map_write_active_inc(map); 1829 if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 1830 err = -EPERM; 1831 goto err_put; 1832 } 1833 1834 err = bpf_map_check_op_flags(map, attr->flags, ~0); 1835 if (err) 1836 goto err_put; 1837 1838 key = ___bpf_copy_key(ukey, map->key_size); 1839 if (IS_ERR(key)) { 1840 err = PTR_ERR(key); 
1841 goto err_put; 1842 } 1843 1844 value_size = bpf_map_value_size(map, attr->flags); 1845 value = kvmemdup_bpfptr(uvalue, value_size); 1846 if (IS_ERR(value)) { 1847 err = PTR_ERR(value); 1848 goto free_key; 1849 } 1850 1851 err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags); 1852 if (!err) 1853 maybe_wait_bpf_programs(map); 1854 1855 kvfree(value); 1856 free_key: 1857 kvfree(key); 1858 err_put: 1859 bpf_map_write_active_dec(map); 1860 return err; 1861 } 1862 1863 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key 1864 1865 static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) 1866 { 1867 bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); 1868 struct bpf_map *map; 1869 void *key; 1870 int err; 1871 1872 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) 1873 return -EINVAL; 1874 1875 CLASS(fd, f)(attr->map_fd); 1876 map = __bpf_map_get(f); 1877 if (IS_ERR(map)) 1878 return PTR_ERR(map); 1879 bpf_map_write_active_inc(map); 1880 if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 1881 err = -EPERM; 1882 goto err_put; 1883 } 1884 1885 key = ___bpf_copy_key(ukey, map->key_size); 1886 if (IS_ERR(key)) { 1887 err = PTR_ERR(key); 1888 goto err_put; 1889 } 1890 1891 if (bpf_map_is_offloaded(map)) { 1892 err = bpf_map_offload_delete_elem(map, key); 1893 goto out; 1894 } else if (IS_FD_PROG_ARRAY(map) || 1895 map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 1896 /* These maps require sleepable context */ 1897 err = map->ops->map_delete_elem(map, key); 1898 goto out; 1899 } 1900 1901 bpf_disable_instrumentation(); 1902 rcu_read_lock(); 1903 err = map->ops->map_delete_elem(map, key); 1904 rcu_read_unlock(); 1905 bpf_enable_instrumentation(); 1906 if (!err) 1907 maybe_wait_bpf_programs(map); 1908 out: 1909 kvfree(key); 1910 err_put: 1911 bpf_map_write_active_dec(map); 1912 return err; 1913 } 1914 1915 /* last field in 'union bpf_attr' used by this command */ 1916 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key 1917 1918 static int map_get_next_key(union bpf_attr 
*attr) 1919 { 1920 void __user *ukey = u64_to_user_ptr(attr->key); 1921 void __user *unext_key = u64_to_user_ptr(attr->next_key); 1922 struct bpf_map *map; 1923 void *key, *next_key; 1924 int err; 1925 1926 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) 1927 return -EINVAL; 1928 1929 CLASS(fd, f)(attr->map_fd); 1930 map = __bpf_map_get(f); 1931 if (IS_ERR(map)) 1932 return PTR_ERR(map); 1933 if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) 1934 return -EPERM; 1935 1936 if (ukey) { 1937 key = __bpf_copy_key(ukey, map->key_size); 1938 if (IS_ERR(key)) 1939 return PTR_ERR(key); 1940 } else { 1941 key = NULL; 1942 } 1943 1944 err = -ENOMEM; 1945 next_key = kvmalloc(map->key_size, GFP_USER); 1946 if (!next_key) 1947 goto free_key; 1948 1949 if (bpf_map_is_offloaded(map)) { 1950 err = bpf_map_offload_get_next_key(map, key, next_key); 1951 goto out; 1952 } 1953 1954 rcu_read_lock(); 1955 err = map->ops->map_get_next_key(map, key, next_key); 1956 rcu_read_unlock(); 1957 out: 1958 if (err) 1959 goto free_next_key; 1960 1961 err = -EFAULT; 1962 if (copy_to_user(unext_key, next_key, map->key_size) != 0) 1963 goto free_next_key; 1964 1965 err = 0; 1966 1967 free_next_key: 1968 kvfree(next_key); 1969 free_key: 1970 kvfree(key); 1971 return err; 1972 } 1973 1974 int generic_map_delete_batch(struct bpf_map *map, 1975 const union bpf_attr *attr, 1976 union bpf_attr __user *uattr) 1977 { 1978 void __user *keys = u64_to_user_ptr(attr->batch.keys); 1979 u32 cp, max_count; 1980 int err = 0; 1981 void *key; 1982 1983 if (attr->batch.elem_flags & ~BPF_F_LOCK) 1984 return -EINVAL; 1985 1986 if ((attr->batch.elem_flags & BPF_F_LOCK) && 1987 !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { 1988 return -EINVAL; 1989 } 1990 1991 max_count = attr->batch.count; 1992 if (!max_count) 1993 return 0; 1994 1995 if (put_user(0, &uattr->batch.count)) 1996 return -EFAULT; 1997 1998 key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); 1999 if (!key) 2000 return -ENOMEM; 2001 2002 for (cp = 0; cp < 
max_count; cp++) { 2003 err = -EFAULT; 2004 if (copy_from_user(key, keys + cp * map->key_size, 2005 map->key_size)) 2006 break; 2007 2008 if (bpf_map_is_offloaded(map)) { 2009 err = bpf_map_offload_delete_elem(map, key); 2010 break; 2011 } 2012 2013 bpf_disable_instrumentation(); 2014 rcu_read_lock(); 2015 err = map->ops->map_delete_elem(map, key); 2016 rcu_read_unlock(); 2017 bpf_enable_instrumentation(); 2018 if (err) 2019 break; 2020 cond_resched(); 2021 } 2022 if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) 2023 err = -EFAULT; 2024 2025 kvfree(key); 2026 2027 return err; 2028 } 2029 2030 int generic_map_update_batch(struct bpf_map *map, struct file *map_file, 2031 const union bpf_attr *attr, 2032 union bpf_attr __user *uattr) 2033 { 2034 void __user *values = u64_to_user_ptr(attr->batch.values); 2035 void __user *keys = u64_to_user_ptr(attr->batch.keys); 2036 u32 value_size, cp, max_count; 2037 void *key, *value; 2038 int err = 0; 2039 2040 err = bpf_map_check_op_flags(map, attr->batch.elem_flags, 2041 BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); 2042 if (err) 2043 return err; 2044 2045 value_size = bpf_map_value_size(map, attr->batch.elem_flags); 2046 2047 max_count = attr->batch.count; 2048 if (!max_count) 2049 return 0; 2050 2051 if (put_user(0, &uattr->batch.count)) 2052 return -EFAULT; 2053 2054 key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); 2055 if (!key) 2056 return -ENOMEM; 2057 2058 value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); 2059 if (!value) { 2060 kvfree(key); 2061 return -ENOMEM; 2062 } 2063 2064 for (cp = 0; cp < max_count; cp++) { 2065 err = -EFAULT; 2066 if (copy_from_user(key, keys + cp * map->key_size, 2067 map->key_size) || 2068 copy_from_user(value, values + cp * value_size, value_size)) 2069 break; 2070 2071 err = bpf_map_update_value(map, map_file, key, value, 2072 attr->batch.elem_flags); 2073 2074 if (err) 2075 break; 2076 cond_resched(); 2077 } 2078 2079 if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) 
2080 err = -EFAULT; 2081 2082 kvfree(value); 2083 kvfree(key); 2084 2085 return err; 2086 } 2087 2088 int generic_map_lookup_batch(struct bpf_map *map, 2089 const union bpf_attr *attr, 2090 union bpf_attr __user *uattr) 2091 { 2092 void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); 2093 void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); 2094 void __user *values = u64_to_user_ptr(attr->batch.values); 2095 void __user *keys = u64_to_user_ptr(attr->batch.keys); 2096 void *buf, *buf_prevkey, *prev_key, *key, *value; 2097 u32 value_size, cp, max_count; 2098 int err; 2099 2100 err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); 2101 if (err) 2102 return err; 2103 2104 value_size = bpf_map_value_size(map, attr->batch.elem_flags); 2105 2106 max_count = attr->batch.count; 2107 if (!max_count) 2108 return 0; 2109 2110 if (put_user(0, &uattr->batch.count)) 2111 return -EFAULT; 2112 2113 buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); 2114 if (!buf_prevkey) 2115 return -ENOMEM; 2116 2117 buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); 2118 if (!buf) { 2119 kvfree(buf_prevkey); 2120 return -ENOMEM; 2121 } 2122 2123 err = -EFAULT; 2124 prev_key = NULL; 2125 if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) 2126 goto free_buf; 2127 key = buf; 2128 value = key + map->key_size; 2129 if (ubatch) 2130 prev_key = buf_prevkey; 2131 2132 for (cp = 0; cp < max_count;) { 2133 rcu_read_lock(); 2134 err = map->ops->map_get_next_key(map, prev_key, key); 2135 rcu_read_unlock(); 2136 if (err) 2137 break; 2138 err = bpf_map_copy_value(map, key, value, 2139 attr->batch.elem_flags); 2140 2141 if (err == -ENOENT) 2142 goto next_key; 2143 2144 if (err) 2145 goto free_buf; 2146 2147 if (copy_to_user(keys + cp * map->key_size, key, 2148 map->key_size)) { 2149 err = -EFAULT; 2150 goto free_buf; 2151 } 2152 if (copy_to_user(values + cp * value_size, value, value_size)) { 2153 err = -EFAULT; 2154 
goto free_buf; 2155 } 2156 2157 cp++; 2158 next_key: 2159 if (!prev_key) 2160 prev_key = buf_prevkey; 2161 2162 swap(prev_key, key); 2163 cond_resched(); 2164 } 2165 2166 if (err == -EFAULT) 2167 goto free_buf; 2168 2169 if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || 2170 (cp && copy_to_user(uobatch, prev_key, map->key_size)))) 2171 err = -EFAULT; 2172 2173 free_buf: 2174 kvfree(buf_prevkey); 2175 kvfree(buf); 2176 return err; 2177 } 2178 2179 #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags 2180 2181 static int map_lookup_and_delete_elem(union bpf_attr *attr) 2182 { 2183 void __user *ukey = u64_to_user_ptr(attr->key); 2184 void __user *uvalue = u64_to_user_ptr(attr->value); 2185 struct bpf_map *map; 2186 void *key, *value; 2187 u32 value_size; 2188 int err; 2189 2190 if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) 2191 return -EINVAL; 2192 2193 if (attr->flags & ~BPF_F_LOCK) 2194 return -EINVAL; 2195 2196 CLASS(fd, f)(attr->map_fd); 2197 map = __bpf_map_get(f); 2198 if (IS_ERR(map)) 2199 return PTR_ERR(map); 2200 bpf_map_write_active_inc(map); 2201 if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || 2202 !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 2203 err = -EPERM; 2204 goto err_put; 2205 } 2206 2207 if (attr->flags && 2208 (map->map_type == BPF_MAP_TYPE_QUEUE || 2209 map->map_type == BPF_MAP_TYPE_STACK)) { 2210 err = -EINVAL; 2211 goto err_put; 2212 } 2213 2214 if ((attr->flags & BPF_F_LOCK) && 2215 !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { 2216 err = -EINVAL; 2217 goto err_put; 2218 } 2219 2220 key = __bpf_copy_key(ukey, map->key_size); 2221 if (IS_ERR(key)) { 2222 err = PTR_ERR(key); 2223 goto err_put; 2224 } 2225 2226 value_size = bpf_map_value_size(map, 0); 2227 2228 err = -ENOMEM; 2229 value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); 2230 if (!value) 2231 goto free_key; 2232 2233 err = -ENOTSUPP; 2234 if (map->map_type == BPF_MAP_TYPE_QUEUE || 2235 map->map_type == BPF_MAP_TYPE_STACK) { 2236 err = 
map->ops->map_pop_elem(map, value); 2237 } else if (map->map_type == BPF_MAP_TYPE_HASH || 2238 map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 2239 map->map_type == BPF_MAP_TYPE_LRU_HASH || 2240 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 2241 map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 2242 if (!bpf_map_is_offloaded(map)) { 2243 bpf_disable_instrumentation(); 2244 rcu_read_lock(); 2245 err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); 2246 rcu_read_unlock(); 2247 bpf_enable_instrumentation(); 2248 } 2249 } 2250 2251 if (err) 2252 goto free_value; 2253 2254 if (copy_to_user(uvalue, value, value_size) != 0) { 2255 err = -EFAULT; 2256 goto free_value; 2257 } 2258 2259 err = 0; 2260 2261 free_value: 2262 kvfree(value); 2263 free_key: 2264 kvfree(key); 2265 err_put: 2266 bpf_map_write_active_dec(map); 2267 return err; 2268 } 2269 2270 #define BPF_MAP_FREEZE_LAST_FIELD map_fd 2271 2272 static int map_freeze(const union bpf_attr *attr) 2273 { 2274 int err = 0; 2275 struct bpf_map *map; 2276 2277 if (CHECK_ATTR(BPF_MAP_FREEZE)) 2278 return -EINVAL; 2279 2280 CLASS(fd, f)(attr->map_fd); 2281 map = __bpf_map_get(f); 2282 if (IS_ERR(map)) 2283 return PTR_ERR(map); 2284 2285 if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) 2286 return -ENOTSUPP; 2287 2288 if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) 2289 return -EPERM; 2290 2291 mutex_lock(&map->freeze_mutex); 2292 if (bpf_map_write_active(map)) { 2293 err = -EBUSY; 2294 goto err_put; 2295 } 2296 if (READ_ONCE(map->frozen)) { 2297 err = -EBUSY; 2298 goto err_put; 2299 } 2300 2301 WRITE_ONCE(map->frozen, true); 2302 err_put: 2303 mutex_unlock(&map->freeze_mutex); 2304 return err; 2305 } 2306 2307 static const struct bpf_prog_ops * const bpf_prog_types[] = { 2308 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ 2309 [_id] = & _name ## _prog_ops, 2310 #define BPF_MAP_TYPE(_id, _ops) 2311 #define BPF_LINK_TYPE(_id, _name) 2312 #include 
<linux/bpf_types.h> 2313 #undef BPF_PROG_TYPE 2314 #undef BPF_MAP_TYPE 2315 #undef BPF_LINK_TYPE 2316 }; 2317 2318 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) 2319 { 2320 const struct bpf_prog_ops *ops; 2321 2322 if (type >= ARRAY_SIZE(bpf_prog_types)) 2323 return -EINVAL; 2324 type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); 2325 ops = bpf_prog_types[type]; 2326 if (!ops) 2327 return -EINVAL; 2328 2329 if (!bpf_prog_is_offloaded(prog->aux)) 2330 prog->aux->ops = ops; 2331 else 2332 prog->aux->ops = &bpf_offload_prog_ops; 2333 prog->type = type; 2334 return 0; 2335 } 2336 2337 enum bpf_audit { 2338 BPF_AUDIT_LOAD, 2339 BPF_AUDIT_UNLOAD, 2340 BPF_AUDIT_MAX, 2341 }; 2342 2343 static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { 2344 [BPF_AUDIT_LOAD] = "LOAD", 2345 [BPF_AUDIT_UNLOAD] = "UNLOAD", 2346 }; 2347 2348 static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) 2349 { 2350 struct audit_context *ctx = NULL; 2351 struct audit_buffer *ab; 2352 2353 if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) 2354 return; 2355 if (audit_enabled == AUDIT_OFF) 2356 return; 2357 if (!in_hardirq() && !irqs_disabled()) 2358 ctx = audit_context(); 2359 ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); 2360 if (unlikely(!ab)) 2361 return; 2362 audit_log_format(ab, "prog-id=%u op=%s", 2363 prog->aux->id, bpf_audit_str[op]); 2364 audit_log_end(ab); 2365 } 2366 2367 static int bpf_prog_alloc_id(struct bpf_prog *prog) 2368 { 2369 int id; 2370 2371 idr_preload(GFP_KERNEL); 2372 spin_lock_bh(&prog_idr_lock); 2373 id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); 2374 if (id > 0) 2375 prog->aux->id = id; 2376 spin_unlock_bh(&prog_idr_lock); 2377 idr_preload_end(); 2378 2379 /* id is in [1, INT_MAX) */ 2380 if (WARN_ON_ONCE(!id)) 2381 return -ENOSPC; 2382 2383 return id > 0 ? 
0 : id; 2384 } 2385 2386 void bpf_prog_free_id(struct bpf_prog *prog) 2387 { 2388 unsigned long flags; 2389 2390 /* cBPF to eBPF migrations are currently not in the idr store. 2391 * Offloaded programs are removed from the store when their device 2392 * disappears - even if someone grabs an fd to them they are unusable, 2393 * simply waiting for refcnt to drop to be freed. 2394 */ 2395 if (!prog->aux->id) 2396 return; 2397 2398 spin_lock_irqsave(&prog_idr_lock, flags); 2399 idr_remove(&prog_idr, prog->aux->id); 2400 prog->aux->id = 0; 2401 spin_unlock_irqrestore(&prog_idr_lock, flags); 2402 } 2403 2404 static void __bpf_prog_put_rcu(struct rcu_head *rcu) 2405 { 2406 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 2407 2408 kvfree(aux->func_info); 2409 kfree(aux->func_info_aux); 2410 free_uid(aux->user); 2411 security_bpf_prog_free(aux->prog); 2412 bpf_prog_free(aux->prog); 2413 } 2414 2415 static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) 2416 { 2417 bpf_prog_kallsyms_del_all(prog); 2418 btf_put(prog->aux->btf); 2419 module_put(prog->aux->mod); 2420 kvfree(prog->aux->jited_linfo); 2421 kvfree(prog->aux->linfo); 2422 kfree(prog->aux->kfunc_tab); 2423 kfree(prog->aux->ctx_arg_info); 2424 if (prog->aux->attach_btf) 2425 btf_put(prog->aux->attach_btf); 2426 2427 if (deferred) { 2428 if (prog->sleepable) 2429 call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); 2430 else 2431 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); 2432 } else { 2433 __bpf_prog_put_rcu(&prog->aux->rcu); 2434 } 2435 } 2436 2437 static void bpf_prog_put_deferred(struct work_struct *work) 2438 { 2439 struct bpf_prog_aux *aux; 2440 struct bpf_prog *prog; 2441 2442 aux = container_of(work, struct bpf_prog_aux, work); 2443 prog = aux->prog; 2444 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); 2445 bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); 2446 bpf_prog_free_id(prog); 2447 __bpf_prog_put_noref(prog, true); 2448 } 2449 2450 static void 
__bpf_prog_put(struct bpf_prog *prog) 2451 { 2452 struct bpf_prog_aux *aux = prog->aux; 2453 2454 if (atomic64_dec_and_test(&aux->refcnt)) { 2455 if (in_hardirq() || irqs_disabled()) { 2456 INIT_WORK(&aux->work, bpf_prog_put_deferred); 2457 schedule_work(&aux->work); 2458 } else { 2459 bpf_prog_put_deferred(&aux->work); 2460 } 2461 } 2462 } 2463 2464 void bpf_prog_put(struct bpf_prog *prog) 2465 { 2466 __bpf_prog_put(prog); 2467 } 2468 EXPORT_SYMBOL_GPL(bpf_prog_put); 2469 2470 static int bpf_prog_release(struct inode *inode, struct file *filp) 2471 { 2472 struct bpf_prog *prog = filp->private_data; 2473 2474 bpf_prog_put(prog); 2475 return 0; 2476 } 2477 2478 struct bpf_prog_kstats { 2479 u64 nsecs; 2480 u64 cnt; 2481 u64 misses; 2482 }; 2483 2484 void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) 2485 { 2486 struct bpf_prog_stats *stats; 2487 unsigned int flags; 2488 2489 if (unlikely(!prog->stats)) 2490 return; 2491 2492 stats = this_cpu_ptr(prog->stats); 2493 flags = u64_stats_update_begin_irqsave(&stats->syncp); 2494 u64_stats_inc(&stats->misses); 2495 u64_stats_update_end_irqrestore(&stats->syncp, flags); 2496 } 2497 2498 static void bpf_prog_get_stats(const struct bpf_prog *prog, 2499 struct bpf_prog_kstats *stats) 2500 { 2501 u64 nsecs = 0, cnt = 0, misses = 0; 2502 int cpu; 2503 2504 for_each_possible_cpu(cpu) { 2505 const struct bpf_prog_stats *st; 2506 unsigned int start; 2507 u64 tnsecs, tcnt, tmisses; 2508 2509 st = per_cpu_ptr(prog->stats, cpu); 2510 do { 2511 start = u64_stats_fetch_begin(&st->syncp); 2512 tnsecs = u64_stats_read(&st->nsecs); 2513 tcnt = u64_stats_read(&st->cnt); 2514 tmisses = u64_stats_read(&st->misses); 2515 } while (u64_stats_fetch_retry(&st->syncp, start)); 2516 nsecs += tnsecs; 2517 cnt += tcnt; 2518 misses += tmisses; 2519 } 2520 stats->nsecs = nsecs; 2521 stats->cnt = cnt; 2522 stats->misses = misses; 2523 } 2524 2525 #ifdef CONFIG_PROC_FS 2526 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file 
*filp) 2527 { 2528 const struct bpf_prog *prog = filp->private_data; 2529 char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; 2530 struct bpf_prog_kstats stats; 2531 2532 bpf_prog_get_stats(prog, &stats); 2533 bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); 2534 seq_printf(m, 2535 "prog_type:\t%u\n" 2536 "prog_jited:\t%u\n" 2537 "prog_tag:\t%s\n" 2538 "memlock:\t%llu\n" 2539 "prog_id:\t%u\n" 2540 "run_time_ns:\t%llu\n" 2541 "run_cnt:\t%llu\n" 2542 "recursion_misses:\t%llu\n" 2543 "verified_insns:\t%u\n", 2544 prog->type, 2545 prog->jited, 2546 prog_tag, 2547 prog->pages * 1ULL << PAGE_SHIFT, 2548 prog->aux->id, 2549 stats.nsecs, 2550 stats.cnt, 2551 stats.misses, 2552 prog->aux->verified_insns); 2553 } 2554 #endif 2555 2556 const struct file_operations bpf_prog_fops = { 2557 #ifdef CONFIG_PROC_FS 2558 .show_fdinfo = bpf_prog_show_fdinfo, 2559 #endif 2560 .release = bpf_prog_release, 2561 .read = bpf_dummy_read, 2562 .write = bpf_dummy_write, 2563 }; 2564 2565 int bpf_prog_new_fd(struct bpf_prog *prog) 2566 { 2567 int ret; 2568 2569 ret = security_bpf_prog(prog); 2570 if (ret < 0) 2571 return ret; 2572 2573 return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, 2574 O_RDWR | O_CLOEXEC); 2575 } 2576 2577 void bpf_prog_add(struct bpf_prog *prog, int i) 2578 { 2579 atomic64_add(i, &prog->aux->refcnt); 2580 } 2581 EXPORT_SYMBOL_GPL(bpf_prog_add); 2582 2583 void bpf_prog_sub(struct bpf_prog *prog, int i) 2584 { 2585 /* Only to be used for undoing previous bpf_prog_add() in some 2586 * error path. We still know that another entity in our call 2587 * path holds a reference to the program, thus atomic_sub() can 2588 * be safely used in such cases! 
2589 */ 2590 WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); 2591 } 2592 EXPORT_SYMBOL_GPL(bpf_prog_sub); 2593 2594 void bpf_prog_inc(struct bpf_prog *prog) 2595 { 2596 atomic64_inc(&prog->aux->refcnt); 2597 } 2598 EXPORT_SYMBOL_GPL(bpf_prog_inc); 2599 2600 /* prog_idr_lock should have been held */ 2601 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) 2602 { 2603 int refold; 2604 2605 refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); 2606 2607 if (!refold) 2608 return ERR_PTR(-ENOENT); 2609 2610 return prog; 2611 } 2612 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); 2613 2614 bool bpf_prog_get_ok(struct bpf_prog *prog, 2615 enum bpf_prog_type *attach_type, bool attach_drv) 2616 { 2617 /* not an attachment, just a refcount inc, always allow */ 2618 if (!attach_type) 2619 return true; 2620 2621 if (prog->type != *attach_type) 2622 return false; 2623 if (bpf_prog_is_offloaded(prog->aux) && !attach_drv) 2624 return false; 2625 2626 return true; 2627 } 2628 2629 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, 2630 bool attach_drv) 2631 { 2632 CLASS(fd, f)(ufd); 2633 struct bpf_prog *prog; 2634 2635 if (fd_empty(f)) 2636 return ERR_PTR(-EBADF); 2637 if (fd_file(f)->f_op != &bpf_prog_fops) 2638 return ERR_PTR(-EINVAL); 2639 2640 prog = fd_file(f)->private_data; 2641 if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) 2642 return ERR_PTR(-EINVAL); 2643 2644 bpf_prog_inc(prog); 2645 return prog; 2646 } 2647 2648 struct bpf_prog *bpf_prog_get(u32 ufd) 2649 { 2650 return __bpf_prog_get(ufd, NULL, false); 2651 } 2652 2653 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, 2654 bool attach_drv) 2655 { 2656 return __bpf_prog_get(ufd, &type, attach_drv); 2657 } 2658 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); 2659 2660 /* Initially all BPF programs could be loaded w/o specifying 2661 * expected_attach_type. 
Later for some of them specifying expected_attach_type 2662 * at load time became required so that program could be validated properly. 2663 * Programs of types that are allowed to be loaded both w/ and w/o (for 2664 * backward compatibility) expected_attach_type, should have the default attach 2665 * type assigned to expected_attach_type for the latter case, so that it can be 2666 * validated later at attach time. 2667 * 2668 * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if 2669 * prog type requires it but has some attach types that have to be backward 2670 * compatible. 2671 */ 2672 static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) 2673 { 2674 switch (attr->prog_type) { 2675 case BPF_PROG_TYPE_CGROUP_SOCK: 2676 /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't 2677 * exist so checking for non-zero is the way to go here. 2678 */ 2679 if (!attr->expected_attach_type) 2680 attr->expected_attach_type = 2681 BPF_CGROUP_INET_SOCK_CREATE; 2682 break; 2683 case BPF_PROG_TYPE_SK_REUSEPORT: 2684 if (!attr->expected_attach_type) 2685 attr->expected_attach_type = 2686 BPF_SK_REUSEPORT_SELECT; 2687 break; 2688 } 2689 } 2690 2691 static int 2692 bpf_prog_load_check_attach(enum bpf_prog_type prog_type, 2693 enum bpf_attach_type expected_attach_type, 2694 struct btf *attach_btf, u32 btf_id, 2695 struct bpf_prog *dst_prog) 2696 { 2697 if (btf_id) { 2698 if (btf_id > BTF_MAX_TYPE) 2699 return -EINVAL; 2700 2701 if (!attach_btf && !dst_prog) 2702 return -EINVAL; 2703 2704 switch (prog_type) { 2705 case BPF_PROG_TYPE_TRACING: 2706 case BPF_PROG_TYPE_LSM: 2707 case BPF_PROG_TYPE_STRUCT_OPS: 2708 case BPF_PROG_TYPE_EXT: 2709 break; 2710 default: 2711 return -EINVAL; 2712 } 2713 } 2714 2715 if (attach_btf && (!btf_id || dst_prog)) 2716 return -EINVAL; 2717 2718 if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && 2719 prog_type != BPF_PROG_TYPE_EXT) 2720 return -EINVAL; 2721 2722 switch (prog_type) { 2723 case 
BPF_PROG_TYPE_CGROUP_SOCK: 2724 switch (expected_attach_type) { 2725 case BPF_CGROUP_INET_SOCK_CREATE: 2726 case BPF_CGROUP_INET_SOCK_RELEASE: 2727 case BPF_CGROUP_INET4_POST_BIND: 2728 case BPF_CGROUP_INET6_POST_BIND: 2729 return 0; 2730 default: 2731 return -EINVAL; 2732 } 2733 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 2734 switch (expected_attach_type) { 2735 case BPF_CGROUP_INET4_BIND: 2736 case BPF_CGROUP_INET6_BIND: 2737 case BPF_CGROUP_INET4_CONNECT: 2738 case BPF_CGROUP_INET6_CONNECT: 2739 case BPF_CGROUP_UNIX_CONNECT: 2740 case BPF_CGROUP_INET4_GETPEERNAME: 2741 case BPF_CGROUP_INET6_GETPEERNAME: 2742 case BPF_CGROUP_UNIX_GETPEERNAME: 2743 case BPF_CGROUP_INET4_GETSOCKNAME: 2744 case BPF_CGROUP_INET6_GETSOCKNAME: 2745 case BPF_CGROUP_UNIX_GETSOCKNAME: 2746 case BPF_CGROUP_UDP4_SENDMSG: 2747 case BPF_CGROUP_UDP6_SENDMSG: 2748 case BPF_CGROUP_UNIX_SENDMSG: 2749 case BPF_CGROUP_UDP4_RECVMSG: 2750 case BPF_CGROUP_UDP6_RECVMSG: 2751 case BPF_CGROUP_UNIX_RECVMSG: 2752 return 0; 2753 default: 2754 return -EINVAL; 2755 } 2756 case BPF_PROG_TYPE_CGROUP_SKB: 2757 switch (expected_attach_type) { 2758 case BPF_CGROUP_INET_INGRESS: 2759 case BPF_CGROUP_INET_EGRESS: 2760 return 0; 2761 default: 2762 return -EINVAL; 2763 } 2764 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 2765 switch (expected_attach_type) { 2766 case BPF_CGROUP_SETSOCKOPT: 2767 case BPF_CGROUP_GETSOCKOPT: 2768 return 0; 2769 default: 2770 return -EINVAL; 2771 } 2772 case BPF_PROG_TYPE_SK_LOOKUP: 2773 if (expected_attach_type == BPF_SK_LOOKUP) 2774 return 0; 2775 return -EINVAL; 2776 case BPF_PROG_TYPE_SK_REUSEPORT: 2777 switch (expected_attach_type) { 2778 case BPF_SK_REUSEPORT_SELECT: 2779 case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: 2780 return 0; 2781 default: 2782 return -EINVAL; 2783 } 2784 case BPF_PROG_TYPE_NETFILTER: 2785 if (expected_attach_type == BPF_NETFILTER) 2786 return 0; 2787 return -EINVAL; 2788 case BPF_PROG_TYPE_SYSCALL: 2789 case BPF_PROG_TYPE_EXT: 2790 if (expected_attach_type) 2791 return -EINVAL; 
2792 fallthrough; 2793 default: 2794 return 0; 2795 } 2796 } 2797 2798 static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) 2799 { 2800 switch (prog_type) { 2801 case BPF_PROG_TYPE_SCHED_CLS: 2802 case BPF_PROG_TYPE_SCHED_ACT: 2803 case BPF_PROG_TYPE_XDP: 2804 case BPF_PROG_TYPE_LWT_IN: 2805 case BPF_PROG_TYPE_LWT_OUT: 2806 case BPF_PROG_TYPE_LWT_XMIT: 2807 case BPF_PROG_TYPE_LWT_SEG6LOCAL: 2808 case BPF_PROG_TYPE_SK_SKB: 2809 case BPF_PROG_TYPE_SK_MSG: 2810 case BPF_PROG_TYPE_FLOW_DISSECTOR: 2811 case BPF_PROG_TYPE_CGROUP_DEVICE: 2812 case BPF_PROG_TYPE_CGROUP_SOCK: 2813 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 2814 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 2815 case BPF_PROG_TYPE_CGROUP_SYSCTL: 2816 case BPF_PROG_TYPE_SOCK_OPS: 2817 case BPF_PROG_TYPE_EXT: /* extends any prog */ 2818 case BPF_PROG_TYPE_NETFILTER: 2819 return true; 2820 case BPF_PROG_TYPE_CGROUP_SKB: 2821 /* always unpriv */ 2822 case BPF_PROG_TYPE_SK_REUSEPORT: 2823 /* equivalent to SOCKET_FILTER. need CAP_BPF only */ 2824 default: 2825 return false; 2826 } 2827 } 2828 2829 static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) 2830 { 2831 switch (prog_type) { 2832 case BPF_PROG_TYPE_KPROBE: 2833 case BPF_PROG_TYPE_TRACEPOINT: 2834 case BPF_PROG_TYPE_PERF_EVENT: 2835 case BPF_PROG_TYPE_RAW_TRACEPOINT: 2836 case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: 2837 case BPF_PROG_TYPE_TRACING: 2838 case BPF_PROG_TYPE_LSM: 2839 case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ 2840 case BPF_PROG_TYPE_EXT: /* extends any prog */ 2841 return true; 2842 default: 2843 return false; 2844 } 2845 } 2846 2847 static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, 2848 bool is_kernel) 2849 { 2850 bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); 2851 struct bpf_dynptr_kern sig_ptr, insns_ptr; 2852 struct bpf_key *key = NULL; 2853 void *sig; 2854 int err = 0; 2855 2856 /* 2857 * Don't attempt to use kmalloc_large or vmalloc for signatures. 
2858 * Practical signature for BPF program should be below this limit. 2859 */ 2860 if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE) 2861 return -EINVAL; 2862 2863 if (system_keyring_id_check(attr->keyring_id) == 0) 2864 key = bpf_lookup_system_key(attr->keyring_id); 2865 else 2866 key = bpf_lookup_user_key(attr->keyring_id, 0); 2867 2868 if (!key) 2869 return -EINVAL; 2870 2871 sig = kvmemdup_bpfptr(usig, attr->signature_size); 2872 if (IS_ERR(sig)) { 2873 bpf_key_put(key); 2874 return PTR_ERR(sig); 2875 } 2876 2877 bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, 2878 attr->signature_size); 2879 bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, 2880 prog->len * sizeof(struct bpf_insn)); 2881 2882 err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, 2883 (struct bpf_dynptr *)&sig_ptr, key); 2884 2885 bpf_key_put(key); 2886 kvfree(sig); 2887 return err; 2888 } 2889 2890 static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) 2891 { 2892 int err; 2893 int i; 2894 2895 for (i = 0; i < prog->aux->used_map_cnt; i++) { 2896 if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) 2897 continue; 2898 2899 err = bpf_insn_array_ready(prog->aux->used_maps[i]); 2900 if (err) 2901 return err; 2902 } 2903 2904 return 0; 2905 } 2906 2907 /* last field in 'union bpf_attr' used by this command */ 2908 #define BPF_PROG_LOAD_LAST_FIELD keyring_id 2909 2910 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) 2911 { 2912 enum bpf_prog_type type = attr->prog_type; 2913 struct bpf_prog *prog, *dst_prog = NULL; 2914 struct btf *attach_btf = NULL; 2915 struct bpf_token *token = NULL; 2916 bool bpf_cap; 2917 int err; 2918 char license[128]; 2919 2920 if (CHECK_ATTR(BPF_PROG_LOAD)) 2921 return -EINVAL; 2922 2923 if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | 2924 BPF_F_ANY_ALIGNMENT | 2925 BPF_F_TEST_STATE_FREQ | 2926 BPF_F_SLEEPABLE | 2927 BPF_F_TEST_RND_HI32 | 2928 BPF_F_XDP_HAS_FRAGS 
| 2929 BPF_F_XDP_DEV_BOUND_ONLY | 2930 BPF_F_TEST_REG_INVARIANTS | 2931 BPF_F_TOKEN_FD)) 2932 return -EINVAL; 2933 2934 bpf_prog_load_fixup_attach_type(attr); 2935 2936 if (attr->prog_flags & BPF_F_TOKEN_FD) { 2937 token = bpf_token_get_from_fd(attr->prog_token_fd); 2938 if (IS_ERR(token)) 2939 return PTR_ERR(token); 2940 /* if current token doesn't grant prog loading permissions, 2941 * then we can't use this token, so ignore it and rely on 2942 * system-wide capabilities checks 2943 */ 2944 if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) || 2945 !bpf_token_allow_prog_type(token, attr->prog_type, 2946 attr->expected_attach_type)) { 2947 bpf_token_put(token); 2948 token = NULL; 2949 } 2950 } 2951 2952 bpf_cap = bpf_token_capable(token, CAP_BPF); 2953 err = -EPERM; 2954 2955 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && 2956 (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && 2957 !bpf_cap) 2958 goto put_token; 2959 2960 /* Intent here is for unprivileged_bpf_disabled to block BPF program 2961 * creation for unprivileged users; other actions depend 2962 * on fd availability and access to bpffs, so are dependent on 2963 * object creation success. Even with unprivileged BPF disabled, 2964 * capability checks are still carried out for these 2965 * and other operations. 2966 */ 2967 if (sysctl_unprivileged_bpf_disabled && !bpf_cap) 2968 goto put_token; 2969 2970 if (attr->insn_cnt == 0 || 2971 attr->insn_cnt > (bpf_cap ? 
BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) { 2972 err = -E2BIG; 2973 goto put_token; 2974 } 2975 if (type != BPF_PROG_TYPE_SOCKET_FILTER && 2976 type != BPF_PROG_TYPE_CGROUP_SKB && 2977 !bpf_cap) 2978 goto put_token; 2979 2980 if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN)) 2981 goto put_token; 2982 if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) 2983 goto put_token; 2984 2985 /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog 2986 * or btf, we need to check which one it is 2987 */ 2988 if (attr->attach_prog_fd) { 2989 dst_prog = bpf_prog_get(attr->attach_prog_fd); 2990 if (IS_ERR(dst_prog)) { 2991 dst_prog = NULL; 2992 attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); 2993 if (IS_ERR(attach_btf)) { 2994 err = -EINVAL; 2995 goto put_token; 2996 } 2997 if (!btf_is_kernel(attach_btf)) { 2998 /* attaching through specifying bpf_prog's BTF 2999 * objects directly might be supported eventually 3000 */ 3001 btf_put(attach_btf); 3002 err = -ENOTSUPP; 3003 goto put_token; 3004 } 3005 } 3006 } else if (attr->attach_btf_id) { 3007 /* fall back to vmlinux BTF, if BTF type ID is specified */ 3008 attach_btf = bpf_get_btf_vmlinux(); 3009 if (IS_ERR(attach_btf)) { 3010 err = PTR_ERR(attach_btf); 3011 goto put_token; 3012 } 3013 if (!attach_btf) { 3014 err = -EINVAL; 3015 goto put_token; 3016 } 3017 btf_get(attach_btf); 3018 } 3019 3020 if (bpf_prog_load_check_attach(type, attr->expected_attach_type, 3021 attach_btf, attr->attach_btf_id, 3022 dst_prog)) { 3023 if (dst_prog) 3024 bpf_prog_put(dst_prog); 3025 if (attach_btf) 3026 btf_put(attach_btf); 3027 err = -EINVAL; 3028 goto put_token; 3029 } 3030 3031 /* plain bpf_prog allocation */ 3032 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); 3033 if (!prog) { 3034 if (dst_prog) 3035 bpf_prog_put(dst_prog); 3036 if (attach_btf) 3037 btf_put(attach_btf); 3038 err = -EINVAL; 3039 goto put_token; 3040 } 3041 3042 prog->expected_attach_type = 
attr->expected_attach_type; 3043 prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); 3044 prog->aux->attach_btf = attach_btf; 3045 prog->aux->attach_btf_id = attr->attach_btf_id; 3046 prog->aux->dst_prog = dst_prog; 3047 prog->aux->dev_bound = !!attr->prog_ifindex; 3048 prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; 3049 3050 /* move token into prog->aux, reuse taken refcnt */ 3051 prog->aux->token = token; 3052 token = NULL; 3053 3054 prog->aux->user = get_current_user(); 3055 prog->len = attr->insn_cnt; 3056 3057 err = -EFAULT; 3058 if (copy_from_bpfptr(prog->insns, 3059 make_bpfptr(attr->insns, uattr.is_kernel), 3060 bpf_prog_insn_size(prog)) != 0) 3061 goto free_prog; 3062 /* copy eBPF program license from user space */ 3063 if (strncpy_from_bpfptr(license, 3064 make_bpfptr(attr->license, uattr.is_kernel), 3065 sizeof(license) - 1) < 0) 3066 goto free_prog; 3067 license[sizeof(license) - 1] = 0; 3068 3069 /* eBPF programs must be GPL compatible to use GPL-ed functions */ 3070 prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; 3071 3072 if (attr->signature) { 3073 err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); 3074 if (err) 3075 goto free_prog; 3076 } 3077 3078 prog->orig_prog = NULL; 3079 prog->jited = 0; 3080 3081 atomic64_set(&prog->aux->refcnt, 1); 3082 3083 if (bpf_prog_is_dev_bound(prog->aux)) { 3084 err = bpf_prog_dev_bound_init(prog, attr); 3085 if (err) 3086 goto free_prog; 3087 } 3088 3089 if (type == BPF_PROG_TYPE_EXT && dst_prog && 3090 bpf_prog_is_dev_bound(dst_prog->aux)) { 3091 err = bpf_prog_dev_bound_inherit(prog, dst_prog); 3092 if (err) 3093 goto free_prog; 3094 } 3095 3096 /* 3097 * Bookkeeping for managing the program attachment chain. 3098 * 3099 * It might be tempting to set attach_tracing_prog flag at the attachment 3100 * time, but this will not prevent from loading bunch of tracing prog 3101 * first, then attach them one to another. 
3102 * 3103 * The flag attach_tracing_prog is set for the whole program lifecycle, and 3104 * doesn't have to be cleared in bpf_tracing_link_release, since tracing 3105 * programs cannot change attachment target. 3106 */ 3107 if (type == BPF_PROG_TYPE_TRACING && dst_prog && 3108 dst_prog->type == BPF_PROG_TYPE_TRACING) { 3109 prog->aux->attach_tracing_prog = true; 3110 } 3111 3112 /* find program type: socket_filter vs tracing_filter */ 3113 err = find_prog_type(type, prog); 3114 if (err < 0) 3115 goto free_prog; 3116 3117 prog->aux->load_time = ktime_get_boottime_ns(); 3118 err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, 3119 sizeof(attr->prog_name)); 3120 if (err < 0) 3121 goto free_prog; 3122 3123 err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); 3124 if (err) 3125 goto free_prog_sec; 3126 3127 /* run eBPF verifier */ 3128 err = bpf_check(&prog, attr, uattr, attr_log); 3129 if (err < 0) 3130 goto free_used_maps; 3131 3132 err = bpf_prog_mark_insn_arrays_ready(prog); 3133 if (err < 0) 3134 goto free_used_maps; 3135 3136 err = bpf_prog_alloc_id(prog); 3137 if (err) 3138 goto free_used_maps; 3139 3140 /* Upon success of bpf_prog_alloc_id(), the BPF prog is 3141 * effectively publicly exposed. However, retrieving via 3142 * bpf_prog_get_fd_by_id() will take another reference, 3143 * therefore it cannot be gone underneath us. 3144 * 3145 * Only for the time /after/ successful bpf_prog_new_fd() 3146 * and before returning to userspace, we might just hold 3147 * one reference and any parallel close on that fd could 3148 * rip everything out. Hence, below notifications must 3149 * happen before bpf_prog_new_fd(). 3150 * 3151 * Also, any failure handling from this point onwards must 3152 * be using bpf_prog_put() given the program is exposed. 
3153 */ 3154 bpf_prog_kallsyms_add(prog); 3155 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); 3156 bpf_audit_prog(prog, BPF_AUDIT_LOAD); 3157 3158 err = bpf_prog_new_fd(prog); 3159 if (err < 0) 3160 bpf_prog_put(prog); 3161 return err; 3162 3163 free_used_maps: 3164 /* In case we have subprogs, we need to wait for a grace 3165 * period before we can tear down JIT memory since symbols 3166 * are already exposed under kallsyms. 3167 */ 3168 __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); 3169 return err; 3170 3171 free_prog_sec: 3172 security_bpf_prog_free(prog); 3173 free_prog: 3174 free_uid(prog->aux->user); 3175 if (prog->aux->attach_btf) 3176 btf_put(prog->aux->attach_btf); 3177 bpf_prog_free(prog); 3178 put_token: 3179 bpf_token_put(token); 3180 return err; 3181 } 3182 3183 #define BPF_OBJ_LAST_FIELD path_fd 3184 3185 static int bpf_obj_pin(const union bpf_attr *attr) 3186 { 3187 int path_fd; 3188 3189 if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD) 3190 return -EINVAL; 3191 3192 /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ 3193 if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) 3194 return -EINVAL; 3195 3196 path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; 3197 return bpf_obj_pin_user(attr->bpf_fd, path_fd, 3198 u64_to_user_ptr(attr->pathname)); 3199 } 3200 3201 static int bpf_obj_get(const union bpf_attr *attr) 3202 { 3203 int path_fd; 3204 3205 if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || 3206 attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD)) 3207 return -EINVAL; 3208 3209 /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ 3210 if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) 3211 return -EINVAL; 3212 3213 path_fd = attr->file_flags & BPF_F_PATH_FD ? 
attr->path_fd : AT_FDCWD; 3214 return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname), 3215 attr->file_flags); 3216 } 3217 3218 /* bpf_link_init_sleepable() allows to specify whether BPF link itself has 3219 * "sleepable" semantics, which normally would mean that BPF link's attach 3220 * hook can dereference link or link's underlying program for some time after 3221 * detachment due to RCU Tasks Trace-based lifetime protection scheme. 3222 * BPF program itself can be non-sleepable, yet, because it's transitively 3223 * reachable through BPF link, its freeing has to be delayed until after RCU 3224 * Tasks Trace GP. 3225 */ 3226 void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, 3227 const struct bpf_link_ops *ops, struct bpf_prog *prog, 3228 enum bpf_attach_type attach_type, bool sleepable) 3229 { 3230 WARN_ON(ops->dealloc && ops->dealloc_deferred); 3231 atomic64_set(&link->refcnt, 1); 3232 link->type = type; 3233 link->sleepable = sleepable; 3234 link->id = 0; 3235 link->ops = ops; 3236 link->prog = prog; 3237 link->attach_type = attach_type; 3238 } 3239 3240 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, 3241 const struct bpf_link_ops *ops, struct bpf_prog *prog, 3242 enum bpf_attach_type attach_type) 3243 { 3244 bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); 3245 } 3246 3247 static void bpf_link_free_id(int id) 3248 { 3249 if (!id) 3250 return; 3251 3252 spin_lock_bh(&link_idr_lock); 3253 idr_remove(&link_idr, id); 3254 spin_unlock_bh(&link_idr_lock); 3255 } 3256 3257 /* Clean up bpf_link and corresponding anon_inode file and FD. After 3258 * anon_inode is created, bpf_link can't be just kfree()'d due to deferred 3259 * anon_inode's release() call. This helper marks bpf_link as 3260 * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt 3261 * is not decremented, it's the responsibility of a calling code that failed 3262 * to complete bpf_link initialization. 
* This helper eventually calls link's dealloc callback, but does not call
 * link's release callback.
 */
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
	/* with prog cleared, bpf_link_free() skips ->release() and
	 * bpf_link_dealloc() skips bpf_prog_put()
	 */
	primer->link->prog = NULL;
	bpf_link_free_id(primer->id);
	fput(primer->file);
	put_unused_fd(primer->fd);
}

/* Take an additional reference on @link. */
void bpf_link_inc(struct bpf_link *link)
{
	atomic64_inc(&link->refcnt);
}

/* Put the link's program (if still set) and free the link through whichever
 * dealloc callback its ops provide.
 */
static void bpf_link_dealloc(struct bpf_link *link)
{
	/* now that we know that bpf_link itself can't be reached, put underlying BPF program */
	if (link->prog)
		bpf_prog_put(link->prog);

	/* free bpf_link and its containing memory */
	if (link->ops->dealloc_deferred)
		link->ops->dealloc_deferred(link);
	else
		link->ops->dealloc(link);
}

/* Grace-period callback used by bpf_link_free(): deallocate the link. */
static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
{
	struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);

	bpf_link_dealloc(link);
}

static bool bpf_link_is_tracepoint(struct bpf_link *link)
{
	/*
	 * Only these combinations support a tracepoint bpf_link.
	 * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use
	 * bpf_raw_tp_link_lops and thus dealloc_deferred(), see
	 * bpf_raw_tp_link_attach().
	 */
	return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT ||
	       (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP);
}

/* Release the link id, detach the program via ->release() (only if the link
 * still has a program), then deallocate the link: behind a grace period
 * when ops provide dealloc_deferred, immediately when they provide dealloc.
 *
 * bpf_link_free is guaranteed to be called from process context
 */
static void bpf_link_free(struct bpf_link *link)
{
	const struct bpf_link_ops *ops = link->ops;

	bpf_link_free_id(link->id);
	/* detach BPF program, clean up used resources */
	if (link->prog)
		ops->release(link);
	if (ops->dealloc_deferred) {
		/*
		 * Schedule BPF link deallocation, which will only then
		 * trigger putting BPF program refcount.
		 * If underlying BPF program is sleepable or BPF link's target
		 * attach hookpoint is sleepable or otherwise requires RCU GPs
		 * to ensure link and its underlying BPF program is not
		 * reachable anymore, we need to first wait for RCU tasks
		 * trace sync, and then go through "classic" RCU grace period.
		 *
		 * For tracepoint BPF links, we need to go through SRCU grace
		 * period wait instead when non-faultable tracepoint is used. We
		 * don't need to chain SRCU grace period waits, however, for the
		 * faultable case, since it exclusively uses RCU Tasks Trace.
		 */
		if (link->sleepable || (link->prog && link->prog->sleepable))
			/* RCU Tasks Trace grace period implies RCU grace period. */
			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
		/* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links. */
		else if (bpf_link_is_tracepoint(link))
			call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
		else
			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
	} else if (ops->dealloc) {
		bpf_link_dealloc(link);
	}
}

/* Work item scheduled by bpf_link_put(): free the link. */
static void bpf_link_put_deferred(struct work_struct *work)
{
	struct bpf_link *link = container_of(work, struct bpf_link, work);

	bpf_link_free(link);
}

/* bpf_link_put might be called from atomic context. It needs to be called
 * from sleepable context in order to acquire sleeping locks during the process.
3357 */ 3358 void bpf_link_put(struct bpf_link *link) 3359 { 3360 if (!atomic64_dec_and_test(&link->refcnt)) 3361 return; 3362 3363 INIT_WORK(&link->work, bpf_link_put_deferred); 3364 schedule_work(&link->work); 3365 } 3366 EXPORT_SYMBOL(bpf_link_put); 3367 3368 static void bpf_link_put_direct(struct bpf_link *link) 3369 { 3370 if (!atomic64_dec_and_test(&link->refcnt)) 3371 return; 3372 bpf_link_free(link); 3373 } 3374 3375 static int bpf_link_release(struct inode *inode, struct file *filp) 3376 { 3377 struct bpf_link *link = filp->private_data; 3378 3379 bpf_link_put_direct(link); 3380 return 0; 3381 } 3382 3383 #ifdef CONFIG_PROC_FS 3384 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) 3385 #define BPF_MAP_TYPE(_id, _ops) 3386 #define BPF_LINK_TYPE(_id, _name) [_id] = #_name, 3387 static const char *bpf_link_type_strs[] = { 3388 [BPF_LINK_TYPE_UNSPEC] = "<invalid>", 3389 #include <linux/bpf_types.h> 3390 }; 3391 #undef BPF_PROG_TYPE 3392 #undef BPF_MAP_TYPE 3393 #undef BPF_LINK_TYPE 3394 3395 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) 3396 { 3397 const struct bpf_link *link = filp->private_data; 3398 const struct bpf_prog *prog = link->prog; 3399 enum bpf_link_type type = link->type; 3400 char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; 3401 3402 if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { 3403 if (link->type == BPF_LINK_TYPE_KPROBE_MULTI) 3404 seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? 3405 "kretprobe_multi" : "kprobe_multi"); 3406 else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) 3407 seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? 3408 "uretprobe_multi" : "uprobe_multi"); 3409 else 3410 seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); 3411 } else { 3412 WARN_ONCE(1, "missing BPF_LINK_TYPE(...) 
for link type %u\n", type); 3413 seq_printf(m, "link_type:\t<%u>\n", type); 3414 } 3415 seq_printf(m, "link_id:\t%u\n", link->id); 3416 3417 if (prog) { 3418 bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); 3419 seq_printf(m, 3420 "prog_tag:\t%s\n" 3421 "prog_id:\t%u\n", 3422 prog_tag, 3423 prog->aux->id); 3424 } 3425 if (link->ops->show_fdinfo) 3426 link->ops->show_fdinfo(link, m); 3427 } 3428 #endif 3429 3430 static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) 3431 { 3432 struct bpf_link *link = file->private_data; 3433 3434 return link->ops->poll(file, pts); 3435 } 3436 3437 static const struct file_operations bpf_link_fops = { 3438 #ifdef CONFIG_PROC_FS 3439 .show_fdinfo = bpf_link_show_fdinfo, 3440 #endif 3441 .release = bpf_link_release, 3442 .read = bpf_dummy_read, 3443 .write = bpf_dummy_write, 3444 }; 3445 3446 static const struct file_operations bpf_link_fops_poll = { 3447 #ifdef CONFIG_PROC_FS 3448 .show_fdinfo = bpf_link_show_fdinfo, 3449 #endif 3450 .release = bpf_link_release, 3451 .read = bpf_dummy_read, 3452 .write = bpf_dummy_write, 3453 .poll = bpf_link_poll, 3454 }; 3455 3456 static int bpf_link_alloc_id(struct bpf_link *link) 3457 { 3458 int id; 3459 3460 idr_preload(GFP_KERNEL); 3461 spin_lock_bh(&link_idr_lock); 3462 id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); 3463 spin_unlock_bh(&link_idr_lock); 3464 idr_preload_end(); 3465 3466 return id; 3467 } 3468 3469 /* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, 3470 * reserving unused FD and allocating ID from link_idr. This is to be paired 3471 * with bpf_link_settle() to install FD and ID and expose bpf_link to 3472 * user-space, if bpf_link is successfully attached. If not, bpf_link and 3473 * pre-allocated resources are to be freed with bpf_cleanup() call. All the 3474 * transient state is passed around in struct bpf_link_primer. 
3475 * This is preferred way to create and initialize bpf_link, especially when 3476 * there are complicated and expensive operations in between creating bpf_link 3477 * itself and attaching it to BPF hook. By using bpf_link_prime() and 3478 * bpf_link_settle() kernel code using bpf_link doesn't have to perform 3479 * expensive (and potentially failing) roll back operations in a rare case 3480 * that file, FD, or ID can't be allocated. 3481 */ 3482 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) 3483 { 3484 struct file *file; 3485 int fd, id; 3486 3487 fd = get_unused_fd_flags(O_CLOEXEC); 3488 if (fd < 0) 3489 return fd; 3490 3491 3492 id = bpf_link_alloc_id(link); 3493 if (id < 0) { 3494 put_unused_fd(fd); 3495 return id; 3496 } 3497 3498 file = anon_inode_getfile("bpf_link", 3499 link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, 3500 link, O_CLOEXEC); 3501 if (IS_ERR(file)) { 3502 bpf_link_free_id(id); 3503 put_unused_fd(fd); 3504 return PTR_ERR(file); 3505 } 3506 3507 primer->link = link; 3508 primer->file = file; 3509 primer->fd = fd; 3510 primer->id = id; 3511 return 0; 3512 } 3513 3514 int bpf_link_settle(struct bpf_link_primer *primer) 3515 { 3516 /* make bpf_link fetchable by ID */ 3517 spin_lock_bh(&link_idr_lock); 3518 primer->link->id = primer->id; 3519 spin_unlock_bh(&link_idr_lock); 3520 /* make bpf_link fetchable by FD */ 3521 fd_install(primer->fd, primer->file); 3522 /* pass through installed FD */ 3523 return primer->fd; 3524 } 3525 3526 int bpf_link_new_fd(struct bpf_link *link) 3527 { 3528 return anon_inode_getfd("bpf-link", 3529 link->ops->poll ? 
&bpf_link_fops_poll : &bpf_link_fops, 3530 link, O_CLOEXEC); 3531 } 3532 3533 struct bpf_link *bpf_link_get_from_fd(u32 ufd) 3534 { 3535 CLASS(fd, f)(ufd); 3536 struct bpf_link *link; 3537 3538 if (fd_empty(f)) 3539 return ERR_PTR(-EBADF); 3540 if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) 3541 return ERR_PTR(-EINVAL); 3542 3543 link = fd_file(f)->private_data; 3544 bpf_link_inc(link); 3545 return link; 3546 } 3547 EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL"); 3548 3549 static void bpf_tracing_link_release(struct bpf_link *link) 3550 { 3551 struct bpf_tracing_link *tr_link = 3552 container_of(link, struct bpf_tracing_link, link.link); 3553 3554 WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, 3555 tr_link->trampoline, 3556 tr_link->tgt_prog)); 3557 3558 bpf_trampoline_put(tr_link->trampoline); 3559 3560 /* tgt_prog is NULL if target is a kernel function */ 3561 if (tr_link->tgt_prog) 3562 bpf_prog_put(tr_link->tgt_prog); 3563 } 3564 3565 static void bpf_tracing_link_dealloc(struct bpf_link *link) 3566 { 3567 struct bpf_tracing_link *tr_link = 3568 container_of(link, struct bpf_tracing_link, link.link); 3569 3570 kfree(tr_link); 3571 } 3572 3573 static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, 3574 struct seq_file *seq) 3575 { 3576 struct bpf_tracing_link *tr_link = 3577 container_of(link, struct bpf_tracing_link, link.link); 3578 u32 target_btf_id, target_obj_id; 3579 3580 bpf_trampoline_unpack_key(tr_link->trampoline->key, 3581 &target_obj_id, &target_btf_id); 3582 seq_printf(seq, 3583 "attach_type:\t%d\n" 3584 "target_obj_id:\t%u\n" 3585 "target_btf_id:\t%u\n" 3586 "cookie:\t%llu\n", 3587 link->attach_type, 3588 target_obj_id, 3589 target_btf_id, 3590 tr_link->link.cookie); 3591 } 3592 3593 static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, 3594 struct bpf_link_info *info) 3595 { 3596 struct bpf_tracing_link *tr_link = 3597 container_of(link, struct bpf_tracing_link, 
link.link); 3598 3599 info->tracing.attach_type = link->attach_type; 3600 info->tracing.cookie = tr_link->link.cookie; 3601 bpf_trampoline_unpack_key(tr_link->trampoline->key, 3602 &info->tracing.target_obj_id, 3603 &info->tracing.target_btf_id); 3604 3605 return 0; 3606 } 3607 3608 static const struct bpf_link_ops bpf_tracing_link_lops = { 3609 .release = bpf_tracing_link_release, 3610 .dealloc = bpf_tracing_link_dealloc, 3611 .show_fdinfo = bpf_tracing_link_show_fdinfo, 3612 .fill_link_info = bpf_tracing_link_fill_link_info, 3613 }; 3614 3615 static int bpf_tracing_prog_attach(struct bpf_prog *prog, 3616 int tgt_prog_fd, 3617 u32 btf_id, 3618 u64 bpf_cookie, 3619 enum bpf_attach_type attach_type) 3620 { 3621 struct bpf_link_primer link_primer; 3622 struct bpf_prog *tgt_prog = NULL; 3623 struct bpf_trampoline *tr = NULL; 3624 struct bpf_tracing_link *link; 3625 u64 key = 0; 3626 int err; 3627 3628 switch (prog->type) { 3629 case BPF_PROG_TYPE_TRACING: 3630 if (prog->expected_attach_type != BPF_TRACE_FENTRY && 3631 prog->expected_attach_type != BPF_TRACE_FEXIT && 3632 prog->expected_attach_type != BPF_TRACE_FSESSION && 3633 prog->expected_attach_type != BPF_MODIFY_RETURN) { 3634 err = -EINVAL; 3635 goto out_put_prog; 3636 } 3637 break; 3638 case BPF_PROG_TYPE_EXT: 3639 if (prog->expected_attach_type != 0) { 3640 err = -EINVAL; 3641 goto out_put_prog; 3642 } 3643 break; 3644 case BPF_PROG_TYPE_LSM: 3645 if (prog->expected_attach_type != BPF_LSM_MAC) { 3646 err = -EINVAL; 3647 goto out_put_prog; 3648 } 3649 break; 3650 default: 3651 err = -EINVAL; 3652 goto out_put_prog; 3653 } 3654 3655 if (!!tgt_prog_fd != !!btf_id) { 3656 err = -EINVAL; 3657 goto out_put_prog; 3658 } 3659 3660 if (tgt_prog_fd) { 3661 /* 3662 * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this 3663 * part would be changed to implement the same for 3664 * BPF_PROG_TYPE_TRACING, do not forget to update the way how 3665 * attach_tracing_prog flag is set. 
3666 */ 3667 if (prog->type != BPF_PROG_TYPE_EXT) { 3668 err = -EINVAL; 3669 goto out_put_prog; 3670 } 3671 3672 tgt_prog = bpf_prog_get(tgt_prog_fd); 3673 if (IS_ERR(tgt_prog)) { 3674 err = PTR_ERR(tgt_prog); 3675 tgt_prog = NULL; 3676 goto out_put_prog; 3677 } 3678 3679 key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); 3680 } 3681 3682 if (prog->expected_attach_type == BPF_TRACE_FSESSION) { 3683 struct bpf_fsession_link *fslink; 3684 3685 fslink = kzalloc_obj(*fslink, GFP_USER); 3686 if (fslink) { 3687 bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, 3688 &bpf_tracing_link_lops, prog, attach_type); 3689 fslink->fexit.cookie = bpf_cookie; 3690 link = &fslink->link; 3691 } else { 3692 link = NULL; 3693 } 3694 } else { 3695 link = kzalloc_obj(*link, GFP_USER); 3696 } 3697 if (!link) { 3698 err = -ENOMEM; 3699 goto out_put_prog; 3700 } 3701 bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, 3702 &bpf_tracing_link_lops, prog, attach_type); 3703 3704 link->link.cookie = bpf_cookie; 3705 3706 mutex_lock(&prog->aux->dst_mutex); 3707 3708 /* There are a few possible cases here: 3709 * 3710 * - if prog->aux->dst_trampoline is set, the program was just loaded 3711 * and not yet attached to anything, so we can use the values stored 3712 * in prog->aux 3713 * 3714 * - if prog->aux->dst_trampoline is NULL, the program has already been 3715 * attached to a target and its initial target was cleared (below) 3716 * 3717 * - if tgt_prog != NULL, the caller specified tgt_prog_fd + 3718 * target_btf_id using the link_create API. 3719 * 3720 * - if tgt_prog == NULL when this function was called using the old 3721 * raw_tracepoint_open API, and we need a target from prog->aux 3722 * 3723 * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program 3724 * was detached and is going for re-attachment. 
3725 * 3726 * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf 3727 * are NULL, then program was already attached and user did not provide 3728 * tgt_prog_fd so we have no way to find out or create trampoline 3729 */ 3730 if (!prog->aux->dst_trampoline && !tgt_prog) { 3731 /* 3732 * Allow re-attach for TRACING and LSM programs. If it's 3733 * currently linked, bpf_trampoline_link_prog will fail. 3734 * EXT programs need to specify tgt_prog_fd, so they 3735 * re-attach in separate code path. 3736 */ 3737 if (prog->type != BPF_PROG_TYPE_TRACING && 3738 prog->type != BPF_PROG_TYPE_LSM) { 3739 err = -EINVAL; 3740 goto out_unlock; 3741 } 3742 /* We can allow re-attach only if we have valid attach_btf. */ 3743 if (!prog->aux->attach_btf) { 3744 err = -EINVAL; 3745 goto out_unlock; 3746 } 3747 btf_id = prog->aux->attach_btf_id; 3748 key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); 3749 } 3750 3751 if (!prog->aux->dst_trampoline || 3752 (key && key != prog->aux->dst_trampoline->key)) { 3753 /* If there is no saved target, or the specified target is 3754 * different from the destination specified at load time, we 3755 * need a new trampoline and a check for compatibility 3756 */ 3757 struct bpf_attach_target_info tgt_info = {}; 3758 3759 err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, 3760 &tgt_info); 3761 if (err) 3762 goto out_unlock; 3763 3764 if (tgt_info.tgt_mod) { 3765 module_put(prog->aux->mod); 3766 prog->aux->mod = tgt_info.tgt_mod; 3767 } 3768 3769 tr = bpf_trampoline_get(key, &tgt_info); 3770 if (!tr) { 3771 err = -ENOMEM; 3772 goto out_unlock; 3773 } 3774 } else { 3775 /* The caller didn't specify a target, or the target was the 3776 * same as the destination supplied during program load. This 3777 * means we can reuse the trampoline and reference from program 3778 * load time, and there is no need to allocate a new one. 
This 3779 * can only happen once for any program, as the saved values in 3780 * prog->aux are cleared below. 3781 */ 3782 tr = prog->aux->dst_trampoline; 3783 tgt_prog = prog->aux->dst_prog; 3784 } 3785 /* 3786 * It is to prevent modifying struct pt_regs via kprobe_write_ctx=true 3787 * freplace prog. Without this check, kprobe_write_ctx=true freplace 3788 * prog is allowed to attach to kprobe_write_ctx=false kprobe prog, and 3789 * then modify the registers of the kprobe prog's target kernel 3790 * function. 3791 * 3792 * This also blocks the combination of uprobe+freplace, because it is 3793 * unable to recognize the use of the tgt_prog as an uprobe or a kprobe 3794 * by tgt_prog itself. At attach time, uprobe/kprobe is recognized by 3795 * the target perf event flags in __perf_event_set_bpf_prog(). 3796 */ 3797 if (prog->type == BPF_PROG_TYPE_EXT && 3798 prog->aux->kprobe_write_ctx != tgt_prog->aux->kprobe_write_ctx) { 3799 err = -EINVAL; 3800 goto out_unlock; 3801 } 3802 3803 err = bpf_link_prime(&link->link.link, &link_primer); 3804 if (err) 3805 goto out_unlock; 3806 3807 err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); 3808 if (err) { 3809 bpf_link_cleanup(&link_primer); 3810 link = NULL; 3811 goto out_unlock; 3812 } 3813 3814 link->tgt_prog = tgt_prog; 3815 link->trampoline = tr; 3816 3817 /* Always clear the trampoline and target prog from prog->aux to make 3818 * sure the original attach destination is not kept alive after a 3819 * program is (re-)attached to another target. 
3820 */ 3821 if (prog->aux->dst_prog && 3822 (tgt_prog_fd || tr != prog->aux->dst_trampoline)) 3823 /* got extra prog ref from syscall, or attaching to different prog */ 3824 bpf_prog_put(prog->aux->dst_prog); 3825 if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) 3826 /* we allocated a new trampoline, so free the old one */ 3827 bpf_trampoline_put(prog->aux->dst_trampoline); 3828 3829 prog->aux->dst_prog = NULL; 3830 prog->aux->dst_trampoline = NULL; 3831 mutex_unlock(&prog->aux->dst_mutex); 3832 3833 return bpf_link_settle(&link_primer); 3834 out_unlock: 3835 if (tr && tr != prog->aux->dst_trampoline) 3836 bpf_trampoline_put(tr); 3837 mutex_unlock(&prog->aux->dst_mutex); 3838 kfree(link); 3839 out_put_prog: 3840 if (tgt_prog_fd && tgt_prog) 3841 bpf_prog_put(tgt_prog); 3842 return err; 3843 } 3844 3845 static void bpf_raw_tp_link_release(struct bpf_link *link) 3846 { 3847 struct bpf_raw_tp_link *raw_tp = 3848 container_of(link, struct bpf_raw_tp_link, link); 3849 3850 bpf_probe_unregister(raw_tp->btp, raw_tp); 3851 bpf_put_raw_tracepoint(raw_tp->btp); 3852 } 3853 3854 static void bpf_raw_tp_link_dealloc(struct bpf_link *link) 3855 { 3856 struct bpf_raw_tp_link *raw_tp = 3857 container_of(link, struct bpf_raw_tp_link, link); 3858 3859 kfree(raw_tp); 3860 } 3861 3862 static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, 3863 struct seq_file *seq) 3864 { 3865 struct bpf_raw_tp_link *raw_tp_link = 3866 container_of(link, struct bpf_raw_tp_link, link); 3867 3868 seq_printf(seq, 3869 "tp_name:\t%s\n" 3870 "cookie:\t%llu\n", 3871 raw_tp_link->btp->tp->name, 3872 raw_tp_link->cookie); 3873 } 3874 3875 static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, 3876 u32 len) 3877 { 3878 if (ulen >= len + 1) { 3879 if (copy_to_user(ubuf, buf, len + 1)) 3880 return -EFAULT; 3881 } else { 3882 char zero = '\0'; 3883 3884 if (copy_to_user(ubuf, buf, ulen - 1)) 3885 return -EFAULT; 3886 if (put_user(zero, ubuf + ulen - 1)) 3887 
return -EFAULT; 3888 return -ENOSPC; 3889 } 3890 3891 return 0; 3892 } 3893 3894 static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, 3895 struct bpf_link_info *info) 3896 { 3897 struct bpf_raw_tp_link *raw_tp_link = 3898 container_of(link, struct bpf_raw_tp_link, link); 3899 char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); 3900 const char *tp_name = raw_tp_link->btp->tp->name; 3901 u32 ulen = info->raw_tracepoint.tp_name_len; 3902 size_t tp_len = strlen(tp_name); 3903 3904 if (!ulen ^ !ubuf) 3905 return -EINVAL; 3906 3907 info->raw_tracepoint.tp_name_len = tp_len + 1; 3908 info->raw_tracepoint.cookie = raw_tp_link->cookie; 3909 3910 if (!ubuf) 3911 return 0; 3912 3913 return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len); 3914 } 3915 3916 static const struct bpf_link_ops bpf_raw_tp_link_lops = { 3917 .release = bpf_raw_tp_link_release, 3918 .dealloc_deferred = bpf_raw_tp_link_dealloc, 3919 .show_fdinfo = bpf_raw_tp_link_show_fdinfo, 3920 .fill_link_info = bpf_raw_tp_link_fill_link_info, 3921 }; 3922 3923 #ifdef CONFIG_PERF_EVENTS 3924 struct bpf_perf_link { 3925 struct bpf_link link; 3926 struct file *perf_file; 3927 }; 3928 3929 static void bpf_perf_link_release(struct bpf_link *link) 3930 { 3931 struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); 3932 struct perf_event *event = perf_link->perf_file->private_data; 3933 3934 perf_event_free_bpf_prog(event); 3935 fput(perf_link->perf_file); 3936 } 3937 3938 static void bpf_perf_link_dealloc(struct bpf_link *link) 3939 { 3940 struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); 3941 3942 kfree(perf_link); 3943 } 3944 3945 static int bpf_perf_link_fill_common(const struct perf_event *event, 3946 char __user *uname, u32 *ulenp, 3947 u64 *probe_offset, u64 *probe_addr, 3948 u32 *fd_type, unsigned long *missed) 3949 { 3950 const char *buf; 3951 u32 prog_id, ulen; 3952 size_t len; 3953 int err; 3954 3955 ulen = *ulenp; 3956 if 
(!ulen ^ !uname) 3957 return -EINVAL; 3958 3959 err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, 3960 probe_offset, probe_addr, missed); 3961 if (err) 3962 return err; 3963 3964 if (buf) { 3965 len = strlen(buf); 3966 *ulenp = len + 1; 3967 } else { 3968 *ulenp = 1; 3969 } 3970 if (!uname) 3971 return 0; 3972 3973 if (buf) { 3974 err = bpf_copy_to_user(uname, buf, ulen, len); 3975 if (err) 3976 return err; 3977 } else { 3978 char zero = '\0'; 3979 3980 if (put_user(zero, uname)) 3981 return -EFAULT; 3982 } 3983 return 0; 3984 } 3985 3986 #ifdef CONFIG_KPROBE_EVENTS 3987 static int bpf_perf_link_fill_kprobe(const struct perf_event *event, 3988 struct bpf_link_info *info) 3989 { 3990 unsigned long missed; 3991 char __user *uname; 3992 u64 addr, offset; 3993 u32 ulen, type; 3994 int err; 3995 3996 uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); 3997 ulen = info->perf_event.kprobe.name_len; 3998 err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, 3999 &type, &missed); 4000 if (err) 4001 return err; 4002 if (type == BPF_FD_TYPE_KRETPROBE) 4003 info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; 4004 else 4005 info->perf_event.type = BPF_PERF_EVENT_KPROBE; 4006 info->perf_event.kprobe.name_len = ulen; 4007 info->perf_event.kprobe.offset = offset; 4008 info->perf_event.kprobe.missed = missed; 4009 if (!kallsyms_show_value(current_cred())) 4010 addr = 0; 4011 info->perf_event.kprobe.addr = addr; 4012 info->perf_event.kprobe.cookie = event->bpf_cookie; 4013 return 0; 4014 } 4015 4016 static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event, 4017 struct seq_file *seq) 4018 { 4019 const char *name; 4020 int err; 4021 u32 prog_id, type; 4022 u64 offset, addr; 4023 unsigned long missed; 4024 4025 err = bpf_get_perf_event_info(event, &prog_id, &type, &name, 4026 &offset, &addr, &missed); 4027 if (err) 4028 return; 4029 4030 seq_printf(seq, 4031 "name:\t%s\n" 4032 "offset:\t%#llx\n" 4033 "missed:\t%lu\n" 4034 "addr:\t%#llx\n" 
4035 "event_type:\t%s\n" 4036 "cookie:\t%llu\n", 4037 name, offset, missed, addr, 4038 type == BPF_FD_TYPE_KRETPROBE ? "kretprobe" : "kprobe", 4039 event->bpf_cookie); 4040 } 4041 #endif 4042 4043 #ifdef CONFIG_UPROBE_EVENTS 4044 static int bpf_perf_link_fill_uprobe(const struct perf_event *event, 4045 struct bpf_link_info *info) 4046 { 4047 u64 ref_ctr_offset, offset; 4048 char __user *uname; 4049 u32 ulen, type; 4050 int err; 4051 4052 uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); 4053 ulen = info->perf_event.uprobe.name_len; 4054 err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset, 4055 &type, NULL); 4056 if (err) 4057 return err; 4058 4059 if (type == BPF_FD_TYPE_URETPROBE) 4060 info->perf_event.type = BPF_PERF_EVENT_URETPROBE; 4061 else 4062 info->perf_event.type = BPF_PERF_EVENT_UPROBE; 4063 info->perf_event.uprobe.name_len = ulen; 4064 info->perf_event.uprobe.offset = offset; 4065 info->perf_event.uprobe.cookie = event->bpf_cookie; 4066 info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; 4067 return 0; 4068 } 4069 4070 static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event, 4071 struct seq_file *seq) 4072 { 4073 const char *name; 4074 int err; 4075 u32 prog_id, type; 4076 u64 offset, ref_ctr_offset; 4077 unsigned long missed; 4078 4079 err = bpf_get_perf_event_info(event, &prog_id, &type, &name, 4080 &offset, &ref_ctr_offset, &missed); 4081 if (err) 4082 return; 4083 4084 seq_printf(seq, 4085 "name:\t%s\n" 4086 "offset:\t%#llx\n" 4087 "ref_ctr_offset:\t%#llx\n" 4088 "event_type:\t%s\n" 4089 "cookie:\t%llu\n", 4090 name, offset, ref_ctr_offset, 4091 type == BPF_FD_TYPE_URETPROBE ? 
"uretprobe" : "uprobe", 4092 event->bpf_cookie); 4093 } 4094 #endif 4095 4096 static int bpf_perf_link_fill_probe(const struct perf_event *event, 4097 struct bpf_link_info *info) 4098 { 4099 #ifdef CONFIG_KPROBE_EVENTS 4100 if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) 4101 return bpf_perf_link_fill_kprobe(event, info); 4102 #endif 4103 #ifdef CONFIG_UPROBE_EVENTS 4104 if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) 4105 return bpf_perf_link_fill_uprobe(event, info); 4106 #endif 4107 return -EOPNOTSUPP; 4108 } 4109 4110 static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, 4111 struct bpf_link_info *info) 4112 { 4113 char __user *uname; 4114 u32 ulen; 4115 int err; 4116 4117 uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); 4118 ulen = info->perf_event.tracepoint.name_len; 4119 err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); 4120 if (err) 4121 return err; 4122 4123 info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; 4124 info->perf_event.tracepoint.name_len = ulen; 4125 info->perf_event.tracepoint.cookie = event->bpf_cookie; 4126 return 0; 4127 } 4128 4129 static int bpf_perf_link_fill_perf_event(const struct perf_event *event, 4130 struct bpf_link_info *info) 4131 { 4132 info->perf_event.event.type = event->attr.type; 4133 info->perf_event.event.config = event->attr.config; 4134 info->perf_event.event.cookie = event->bpf_cookie; 4135 info->perf_event.type = BPF_PERF_EVENT_EVENT; 4136 return 0; 4137 } 4138 4139 static int bpf_perf_link_fill_link_info(const struct bpf_link *link, 4140 struct bpf_link_info *info) 4141 { 4142 struct bpf_perf_link *perf_link; 4143 const struct perf_event *event; 4144 4145 perf_link = container_of(link, struct bpf_perf_link, link); 4146 event = perf_get_event(perf_link->perf_file); 4147 if (IS_ERR(event)) 4148 return PTR_ERR(event); 4149 4150 switch (event->prog->type) { 4151 case BPF_PROG_TYPE_PERF_EVENT: 4152 return bpf_perf_link_fill_perf_event(event, info); 4153 
case BPF_PROG_TYPE_TRACEPOINT: 4154 return bpf_perf_link_fill_tracepoint(event, info); 4155 case BPF_PROG_TYPE_KPROBE: 4156 return bpf_perf_link_fill_probe(event, info); 4157 default: 4158 return -EOPNOTSUPP; 4159 } 4160 } 4161 4162 static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event, 4163 struct seq_file *seq) 4164 { 4165 seq_printf(seq, 4166 "type:\t%u\n" 4167 "config:\t%llu\n" 4168 "event_type:\t%s\n" 4169 "cookie:\t%llu\n", 4170 event->attr.type, event->attr.config, 4171 "event", event->bpf_cookie); 4172 } 4173 4174 static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event, 4175 struct seq_file *seq) 4176 { 4177 int err; 4178 const char *name; 4179 u32 prog_id; 4180 4181 err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL, 4182 NULL, NULL); 4183 if (err) 4184 return; 4185 4186 seq_printf(seq, 4187 "tp_name:\t%s\n" 4188 "event_type:\t%s\n" 4189 "cookie:\t%llu\n", 4190 name, "tracepoint", event->bpf_cookie); 4191 } 4192 4193 static void bpf_probe_link_show_fdinfo(const struct perf_event *event, 4194 struct seq_file *seq) 4195 { 4196 #ifdef CONFIG_KPROBE_EVENTS 4197 if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) 4198 return bpf_perf_link_fdinfo_kprobe(event, seq); 4199 #endif 4200 4201 #ifdef CONFIG_UPROBE_EVENTS 4202 if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) 4203 return bpf_perf_link_fdinfo_uprobe(event, seq); 4204 #endif 4205 } 4206 4207 static void bpf_perf_link_show_fdinfo(const struct bpf_link *link, 4208 struct seq_file *seq) 4209 { 4210 struct bpf_perf_link *perf_link; 4211 const struct perf_event *event; 4212 4213 perf_link = container_of(link, struct bpf_perf_link, link); 4214 event = perf_get_event(perf_link->perf_file); 4215 if (IS_ERR(event)) 4216 return; 4217 4218 switch (event->prog->type) { 4219 case BPF_PROG_TYPE_PERF_EVENT: 4220 return bpf_perf_event_link_show_fdinfo(event, seq); 4221 case BPF_PROG_TYPE_TRACEPOINT: 4222 return bpf_tracepoint_link_show_fdinfo(event, seq); 4223 case 
BPF_PROG_TYPE_KPROBE: 4224 return bpf_probe_link_show_fdinfo(event, seq); 4225 default: 4226 return; 4227 } 4228 } 4229 4230 static const struct bpf_link_ops bpf_perf_link_lops = { 4231 .release = bpf_perf_link_release, 4232 .dealloc = bpf_perf_link_dealloc, 4233 .fill_link_info = bpf_perf_link_fill_link_info, 4234 .show_fdinfo = bpf_perf_link_show_fdinfo, 4235 }; 4236 4237 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 4238 { 4239 struct bpf_link_primer link_primer; 4240 struct bpf_perf_link *link; 4241 struct perf_event *event; 4242 struct file *perf_file; 4243 int err; 4244 4245 if (attr->link_create.flags) 4246 return -EINVAL; 4247 4248 perf_file = perf_event_get(attr->link_create.target_fd); 4249 if (IS_ERR(perf_file)) 4250 return PTR_ERR(perf_file); 4251 4252 link = kzalloc_obj(*link, GFP_USER); 4253 if (!link) { 4254 err = -ENOMEM; 4255 goto out_put_file; 4256 } 4257 bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog, 4258 attr->link_create.attach_type); 4259 link->perf_file = perf_file; 4260 4261 err = bpf_link_prime(&link->link, &link_primer); 4262 if (err) { 4263 kfree(link); 4264 goto out_put_file; 4265 } 4266 4267 event = perf_file->private_data; 4268 err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); 4269 if (err) { 4270 bpf_link_cleanup(&link_primer); 4271 goto out_put_file; 4272 } 4273 /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ 4274 bpf_prog_inc(prog); 4275 4276 return bpf_link_settle(&link_primer); 4277 4278 out_put_file: 4279 fput(perf_file); 4280 return err; 4281 } 4282 #else 4283 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 4284 { 4285 return -EOPNOTSUPP; 4286 } 4287 #endif /* CONFIG_PERF_EVENTS */ 4288 4289 static int bpf_raw_tp_link_attach(struct bpf_prog *prog, 4290 const char __user *user_tp_name, u64 cookie, 4291 enum bpf_attach_type attach_type) 4292 { 4293 struct bpf_link_primer 
link_primer; 4294 struct bpf_raw_tp_link *link; 4295 struct bpf_raw_event_map *btp; 4296 const char *tp_name; 4297 char buf[128]; 4298 int err; 4299 4300 switch (prog->type) { 4301 case BPF_PROG_TYPE_TRACING: 4302 case BPF_PROG_TYPE_EXT: 4303 case BPF_PROG_TYPE_LSM: 4304 if (user_tp_name) 4305 /* The attach point for this category of programs 4306 * should be specified via btf_id during program load. 4307 */ 4308 return -EINVAL; 4309 if (prog->type == BPF_PROG_TYPE_TRACING && 4310 prog->expected_attach_type == BPF_TRACE_RAW_TP) { 4311 tp_name = prog->aux->attach_func_name; 4312 break; 4313 } 4314 return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type); 4315 case BPF_PROG_TYPE_RAW_TRACEPOINT: 4316 case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: 4317 if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) 4318 return -EFAULT; 4319 buf[sizeof(buf) - 1] = 0; 4320 tp_name = buf; 4321 break; 4322 default: 4323 return -EINVAL; 4324 } 4325 4326 btp = bpf_get_raw_tracepoint(tp_name); 4327 if (!btp) 4328 return -ENOENT; 4329 4330 if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { 4331 bpf_put_raw_tracepoint(btp); 4332 return -EINVAL; 4333 } 4334 4335 link = kzalloc_obj(*link, GFP_USER); 4336 if (!link) { 4337 err = -ENOMEM; 4338 goto out_put_btp; 4339 } 4340 bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, 4341 &bpf_raw_tp_link_lops, prog, attach_type, 4342 tracepoint_is_faultable(btp->tp)); 4343 link->btp = btp; 4344 link->cookie = cookie; 4345 4346 err = bpf_link_prime(&link->link, &link_primer); 4347 if (err) { 4348 kfree(link); 4349 goto out_put_btp; 4350 } 4351 4352 err = bpf_probe_register(link->btp, link); 4353 if (err) { 4354 bpf_link_cleanup(&link_primer); 4355 goto out_put_btp; 4356 } 4357 4358 return bpf_link_settle(&link_primer); 4359 4360 out_put_btp: 4361 bpf_put_raw_tracepoint(btp); 4362 return err; 4363 } 4364 4365 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie 4366 4367 static int 
bpf_raw_tracepoint_open(const union bpf_attr *attr) 4368 { 4369 struct bpf_prog *prog; 4370 void __user *tp_name; 4371 __u64 cookie; 4372 int fd; 4373 4374 if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) 4375 return -EINVAL; 4376 4377 prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); 4378 if (IS_ERR(prog)) 4379 return PTR_ERR(prog); 4380 4381 tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); 4382 cookie = attr->raw_tracepoint.cookie; 4383 fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type); 4384 if (fd < 0) 4385 bpf_prog_put(prog); 4386 return fd; 4387 } 4388 4389 static enum bpf_prog_type 4390 attach_type_to_prog_type(enum bpf_attach_type attach_type) 4391 { 4392 switch (attach_type) { 4393 case BPF_CGROUP_INET_INGRESS: 4394 case BPF_CGROUP_INET_EGRESS: 4395 return BPF_PROG_TYPE_CGROUP_SKB; 4396 case BPF_CGROUP_INET_SOCK_CREATE: 4397 case BPF_CGROUP_INET_SOCK_RELEASE: 4398 case BPF_CGROUP_INET4_POST_BIND: 4399 case BPF_CGROUP_INET6_POST_BIND: 4400 return BPF_PROG_TYPE_CGROUP_SOCK; 4401 case BPF_CGROUP_INET4_BIND: 4402 case BPF_CGROUP_INET6_BIND: 4403 case BPF_CGROUP_INET4_CONNECT: 4404 case BPF_CGROUP_INET6_CONNECT: 4405 case BPF_CGROUP_UNIX_CONNECT: 4406 case BPF_CGROUP_INET4_GETPEERNAME: 4407 case BPF_CGROUP_INET6_GETPEERNAME: 4408 case BPF_CGROUP_UNIX_GETPEERNAME: 4409 case BPF_CGROUP_INET4_GETSOCKNAME: 4410 case BPF_CGROUP_INET6_GETSOCKNAME: 4411 case BPF_CGROUP_UNIX_GETSOCKNAME: 4412 case BPF_CGROUP_UDP4_SENDMSG: 4413 case BPF_CGROUP_UDP6_SENDMSG: 4414 case BPF_CGROUP_UNIX_SENDMSG: 4415 case BPF_CGROUP_UDP4_RECVMSG: 4416 case BPF_CGROUP_UDP6_RECVMSG: 4417 case BPF_CGROUP_UNIX_RECVMSG: 4418 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; 4419 case BPF_CGROUP_SOCK_OPS: 4420 return BPF_PROG_TYPE_SOCK_OPS; 4421 case BPF_CGROUP_DEVICE: 4422 return BPF_PROG_TYPE_CGROUP_DEVICE; 4423 case BPF_SK_MSG_VERDICT: 4424 return BPF_PROG_TYPE_SK_MSG; 4425 case BPF_SK_SKB_STREAM_PARSER: 4426 case BPF_SK_SKB_STREAM_VERDICT: 4427 case BPF_SK_SKB_VERDICT: 4428 
return BPF_PROG_TYPE_SK_SKB; 4429 case BPF_LIRC_MODE2: 4430 return BPF_PROG_TYPE_LIRC_MODE2; 4431 case BPF_FLOW_DISSECTOR: 4432 return BPF_PROG_TYPE_FLOW_DISSECTOR; 4433 case BPF_CGROUP_SYSCTL: 4434 return BPF_PROG_TYPE_CGROUP_SYSCTL; 4435 case BPF_CGROUP_GETSOCKOPT: 4436 case BPF_CGROUP_SETSOCKOPT: 4437 return BPF_PROG_TYPE_CGROUP_SOCKOPT; 4438 case BPF_TRACE_ITER: 4439 case BPF_TRACE_RAW_TP: 4440 case BPF_TRACE_FENTRY: 4441 case BPF_TRACE_FEXIT: 4442 case BPF_TRACE_FSESSION: 4443 case BPF_MODIFY_RETURN: 4444 return BPF_PROG_TYPE_TRACING; 4445 case BPF_LSM_MAC: 4446 return BPF_PROG_TYPE_LSM; 4447 case BPF_SK_LOOKUP: 4448 return BPF_PROG_TYPE_SK_LOOKUP; 4449 case BPF_XDP: 4450 return BPF_PROG_TYPE_XDP; 4451 case BPF_LSM_CGROUP: 4452 return BPF_PROG_TYPE_LSM; 4453 case BPF_TCX_INGRESS: 4454 case BPF_TCX_EGRESS: 4455 case BPF_NETKIT_PRIMARY: 4456 case BPF_NETKIT_PEER: 4457 return BPF_PROG_TYPE_SCHED_CLS; 4458 default: 4459 return BPF_PROG_TYPE_UNSPEC; 4460 } 4461 } 4462 4463 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, 4464 enum bpf_attach_type attach_type) 4465 { 4466 enum bpf_prog_type ptype; 4467 4468 switch (prog->type) { 4469 case BPF_PROG_TYPE_CGROUP_SOCK: 4470 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 4471 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 4472 case BPF_PROG_TYPE_SK_LOOKUP: 4473 return attach_type == prog->expected_attach_type ? 0 : -EINVAL; 4474 case BPF_PROG_TYPE_CGROUP_SKB: 4475 if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN)) 4476 /* cg-skb progs can be loaded by unpriv user. 4477 * check permissions at attach time. 4478 */ 4479 return -EPERM; 4480 4481 ptype = attach_type_to_prog_type(attach_type); 4482 if (prog->type != ptype) 4483 return -EINVAL; 4484 4485 return prog->enforce_expected_attach_type && 4486 prog->expected_attach_type != attach_type ? 
4487 -EINVAL : 0; 4488 case BPF_PROG_TYPE_EXT: 4489 return 0; 4490 case BPF_PROG_TYPE_NETFILTER: 4491 if (attach_type != BPF_NETFILTER) 4492 return -EINVAL; 4493 return 0; 4494 case BPF_PROG_TYPE_PERF_EVENT: 4495 case BPF_PROG_TYPE_TRACEPOINT: 4496 if (attach_type != BPF_PERF_EVENT) 4497 return -EINVAL; 4498 return 0; 4499 case BPF_PROG_TYPE_KPROBE: 4500 if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && 4501 attach_type != BPF_TRACE_KPROBE_MULTI) 4502 return -EINVAL; 4503 if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && 4504 attach_type != BPF_TRACE_KPROBE_SESSION) 4505 return -EINVAL; 4506 if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && 4507 attach_type != BPF_TRACE_UPROBE_MULTI) 4508 return -EINVAL; 4509 if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && 4510 attach_type != BPF_TRACE_UPROBE_SESSION) 4511 return -EINVAL; 4512 if (attach_type != BPF_PERF_EVENT && 4513 attach_type != BPF_TRACE_KPROBE_MULTI && 4514 attach_type != BPF_TRACE_KPROBE_SESSION && 4515 attach_type != BPF_TRACE_UPROBE_MULTI && 4516 attach_type != BPF_TRACE_UPROBE_SESSION) 4517 return -EINVAL; 4518 return 0; 4519 case BPF_PROG_TYPE_SCHED_CLS: 4520 if (attach_type != BPF_TCX_INGRESS && 4521 attach_type != BPF_TCX_EGRESS && 4522 attach_type != BPF_NETKIT_PRIMARY && 4523 attach_type != BPF_NETKIT_PEER) 4524 return -EINVAL; 4525 return 0; 4526 default: 4527 ptype = attach_type_to_prog_type(attach_type); 4528 if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) 4529 return -EINVAL; 4530 return 0; 4531 } 4532 } 4533 4534 static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype, 4535 bool check_atype) 4536 { 4537 switch (ptype) { 4538 case BPF_PROG_TYPE_CGROUP_DEVICE: 4539 case BPF_PROG_TYPE_CGROUP_SKB: 4540 case BPF_PROG_TYPE_CGROUP_SOCK: 4541 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 4542 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 4543 case BPF_PROG_TYPE_CGROUP_SYSCTL: 4544 case BPF_PROG_TYPE_SOCK_OPS: 4545 return true; 4546 case 
BPF_PROG_TYPE_LSM: 4547 return check_atype ? atype == BPF_LSM_CGROUP : true; 4548 default: 4549 return false; 4550 } 4551 } 4552 4553 #define BPF_PROG_ATTACH_LAST_FIELD expected_revision 4554 4555 #define BPF_F_ATTACH_MASK_BASE \ 4556 (BPF_F_ALLOW_OVERRIDE | \ 4557 BPF_F_ALLOW_MULTI | \ 4558 BPF_F_REPLACE | \ 4559 BPF_F_PREORDER) 4560 4561 #define BPF_F_ATTACH_MASK_MPROG \ 4562 (BPF_F_REPLACE | \ 4563 BPF_F_BEFORE | \ 4564 BPF_F_AFTER | \ 4565 BPF_F_ID | \ 4566 BPF_F_LINK) 4567 4568 static int bpf_prog_attach(const union bpf_attr *attr) 4569 { 4570 enum bpf_prog_type ptype; 4571 struct bpf_prog *prog; 4572 int ret; 4573 4574 if (CHECK_ATTR(BPF_PROG_ATTACH)) 4575 return -EINVAL; 4576 4577 ptype = attach_type_to_prog_type(attr->attach_type); 4578 if (ptype == BPF_PROG_TYPE_UNSPEC) 4579 return -EINVAL; 4580 if (bpf_mprog_supported(ptype)) { 4581 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) 4582 return -EINVAL; 4583 } else if (is_cgroup_prog_type(ptype, 0, false)) { 4584 if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG)) 4585 return -EINVAL; 4586 } else { 4587 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) 4588 return -EINVAL; 4589 if (attr->relative_fd || 4590 attr->expected_revision) 4591 return -EINVAL; 4592 } 4593 4594 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 4595 if (IS_ERR(prog)) 4596 return PTR_ERR(prog); 4597 4598 if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { 4599 bpf_prog_put(prog); 4600 return -EINVAL; 4601 } 4602 4603 if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) { 4604 ret = cgroup_bpf_prog_attach(attr, ptype, prog); 4605 goto out; 4606 } 4607 4608 switch (ptype) { 4609 case BPF_PROG_TYPE_SK_SKB: 4610 case BPF_PROG_TYPE_SK_MSG: 4611 ret = sock_map_get_from_fd(attr, prog); 4612 break; 4613 case BPF_PROG_TYPE_LIRC_MODE2: 4614 ret = lirc_prog_attach(attr, prog); 4615 break; 4616 case BPF_PROG_TYPE_FLOW_DISSECTOR: 4617 ret = netns_bpf_prog_attach(attr, prog); 4618 break; 
4619 case BPF_PROG_TYPE_SCHED_CLS: 4620 if (attr->attach_type == BPF_TCX_INGRESS || 4621 attr->attach_type == BPF_TCX_EGRESS) 4622 ret = tcx_prog_attach(attr, prog); 4623 else 4624 ret = netkit_prog_attach(attr, prog); 4625 break; 4626 default: 4627 ret = -EINVAL; 4628 } 4629 out: 4630 if (ret) 4631 bpf_prog_put(prog); 4632 return ret; 4633 } 4634 4635 #define BPF_PROG_DETACH_LAST_FIELD expected_revision 4636 4637 static int bpf_prog_detach(const union bpf_attr *attr) 4638 { 4639 struct bpf_prog *prog = NULL; 4640 enum bpf_prog_type ptype; 4641 int ret; 4642 4643 if (CHECK_ATTR(BPF_PROG_DETACH)) 4644 return -EINVAL; 4645 4646 ptype = attach_type_to_prog_type(attr->attach_type); 4647 if (bpf_mprog_supported(ptype)) { 4648 if (ptype == BPF_PROG_TYPE_UNSPEC) 4649 return -EINVAL; 4650 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) 4651 return -EINVAL; 4652 if (attr->attach_bpf_fd) { 4653 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 4654 if (IS_ERR(prog)) 4655 return PTR_ERR(prog); 4656 } else if (!bpf_mprog_detach_empty(ptype)) { 4657 return -EPERM; 4658 } 4659 } else if (is_cgroup_prog_type(ptype, 0, false)) { 4660 if (attr->attach_flags || attr->relative_fd) 4661 return -EINVAL; 4662 } else if (attr->attach_flags || 4663 attr->relative_fd || 4664 attr->expected_revision) { 4665 return -EINVAL; 4666 } 4667 4668 switch (ptype) { 4669 case BPF_PROG_TYPE_SK_MSG: 4670 case BPF_PROG_TYPE_SK_SKB: 4671 ret = sock_map_prog_detach(attr, ptype); 4672 break; 4673 case BPF_PROG_TYPE_LIRC_MODE2: 4674 ret = lirc_prog_detach(attr); 4675 break; 4676 case BPF_PROG_TYPE_FLOW_DISSECTOR: 4677 ret = netns_bpf_prog_detach(attr, ptype); 4678 break; 4679 case BPF_PROG_TYPE_CGROUP_DEVICE: 4680 case BPF_PROG_TYPE_CGROUP_SKB: 4681 case BPF_PROG_TYPE_CGROUP_SOCK: 4682 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 4683 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 4684 case BPF_PROG_TYPE_CGROUP_SYSCTL: 4685 case BPF_PROG_TYPE_SOCK_OPS: 4686 case BPF_PROG_TYPE_LSM: 4687 ret = cgroup_bpf_prog_detach(attr, 
ptype); 4688 break; 4689 case BPF_PROG_TYPE_SCHED_CLS: 4690 if (attr->attach_type == BPF_TCX_INGRESS || 4691 attr->attach_type == BPF_TCX_EGRESS) 4692 ret = tcx_prog_detach(attr, prog); 4693 else 4694 ret = netkit_prog_detach(attr, prog); 4695 break; 4696 default: 4697 ret = -EINVAL; 4698 } 4699 4700 if (prog) 4701 bpf_prog_put(prog); 4702 return ret; 4703 } 4704 4705 #define BPF_PROG_QUERY_LAST_FIELD query.revision 4706 4707 static int bpf_prog_query(const union bpf_attr *attr, 4708 union bpf_attr __user *uattr) 4709 { 4710 if (!bpf_net_capable()) 4711 return -EPERM; 4712 if (CHECK_ATTR(BPF_PROG_QUERY)) 4713 return -EINVAL; 4714 if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) 4715 return -EINVAL; 4716 4717 switch (attr->query.attach_type) { 4718 case BPF_CGROUP_INET_INGRESS: 4719 case BPF_CGROUP_INET_EGRESS: 4720 case BPF_CGROUP_INET_SOCK_CREATE: 4721 case BPF_CGROUP_INET_SOCK_RELEASE: 4722 case BPF_CGROUP_INET4_BIND: 4723 case BPF_CGROUP_INET6_BIND: 4724 case BPF_CGROUP_INET4_POST_BIND: 4725 case BPF_CGROUP_INET6_POST_BIND: 4726 case BPF_CGROUP_INET4_CONNECT: 4727 case BPF_CGROUP_INET6_CONNECT: 4728 case BPF_CGROUP_UNIX_CONNECT: 4729 case BPF_CGROUP_INET4_GETPEERNAME: 4730 case BPF_CGROUP_INET6_GETPEERNAME: 4731 case BPF_CGROUP_UNIX_GETPEERNAME: 4732 case BPF_CGROUP_INET4_GETSOCKNAME: 4733 case BPF_CGROUP_INET6_GETSOCKNAME: 4734 case BPF_CGROUP_UNIX_GETSOCKNAME: 4735 case BPF_CGROUP_UDP4_SENDMSG: 4736 case BPF_CGROUP_UDP6_SENDMSG: 4737 case BPF_CGROUP_UNIX_SENDMSG: 4738 case BPF_CGROUP_UDP4_RECVMSG: 4739 case BPF_CGROUP_UDP6_RECVMSG: 4740 case BPF_CGROUP_UNIX_RECVMSG: 4741 case BPF_CGROUP_SOCK_OPS: 4742 case BPF_CGROUP_DEVICE: 4743 case BPF_CGROUP_SYSCTL: 4744 case BPF_CGROUP_GETSOCKOPT: 4745 case BPF_CGROUP_SETSOCKOPT: 4746 case BPF_LSM_CGROUP: 4747 return cgroup_bpf_prog_query(attr, uattr); 4748 case BPF_LIRC_MODE2: 4749 return lirc_prog_query(attr, uattr); 4750 case BPF_FLOW_DISSECTOR: 4751 case BPF_SK_LOOKUP: 4752 return netns_bpf_prog_query(attr, 
uattr); 4753 case BPF_SK_SKB_STREAM_PARSER: 4754 case BPF_SK_SKB_STREAM_VERDICT: 4755 case BPF_SK_MSG_VERDICT: 4756 case BPF_SK_SKB_VERDICT: 4757 return sock_map_bpf_prog_query(attr, uattr); 4758 case BPF_TCX_INGRESS: 4759 case BPF_TCX_EGRESS: 4760 return tcx_prog_query(attr, uattr); 4761 case BPF_NETKIT_PRIMARY: 4762 case BPF_NETKIT_PEER: 4763 return netkit_prog_query(attr, uattr); 4764 default: 4765 return -EINVAL; 4766 } 4767 } 4768 4769 #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size 4770 4771 static int bpf_prog_test_run(const union bpf_attr *attr, 4772 union bpf_attr __user *uattr) 4773 { 4774 struct bpf_prog *prog; 4775 int ret = -ENOTSUPP; 4776 4777 if (CHECK_ATTR(BPF_PROG_TEST_RUN)) 4778 return -EINVAL; 4779 4780 if ((attr->test.ctx_size_in && !attr->test.ctx_in) || 4781 (!attr->test.ctx_size_in && attr->test.ctx_in)) 4782 return -EINVAL; 4783 4784 if ((attr->test.ctx_size_out && !attr->test.ctx_out) || 4785 (!attr->test.ctx_size_out && attr->test.ctx_out)) 4786 return -EINVAL; 4787 4788 prog = bpf_prog_get(attr->test.prog_fd); 4789 if (IS_ERR(prog)) 4790 return PTR_ERR(prog); 4791 4792 if (prog->aux->ops->test_run) 4793 ret = prog->aux->ops->test_run(prog, attr, uattr); 4794 4795 bpf_prog_put(prog); 4796 return ret; 4797 } 4798 4799 #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id 4800 4801 static int bpf_obj_get_next_id(const union bpf_attr *attr, 4802 union bpf_attr __user *uattr, 4803 struct idr *idr, 4804 spinlock_t *lock) 4805 { 4806 u32 next_id = attr->start_id; 4807 int err = 0; 4808 4809 if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) 4810 return -EINVAL; 4811 4812 if (!capable(CAP_SYS_ADMIN)) 4813 return -EPERM; 4814 4815 next_id++; 4816 spin_lock_bh(lock); 4817 if (!idr_get_next(idr, &next_id)) 4818 err = -ENOENT; 4819 spin_unlock_bh(lock); 4820 4821 if (!err) 4822 err = put_user(next_id, &uattr->next_id); 4823 4824 return err; 4825 } 4826 4827 struct bpf_map *bpf_map_get_curr_or_next(u32 *id) 4828 { 4829 struct bpf_map *map; 4830 
4831 spin_lock_bh(&map_idr_lock); 4832 again: 4833 map = idr_get_next(&map_idr, id); 4834 if (map) { 4835 map = __bpf_map_inc_not_zero(map, false); 4836 if (IS_ERR(map)) { 4837 (*id)++; 4838 goto again; 4839 } 4840 } 4841 spin_unlock_bh(&map_idr_lock); 4842 4843 return map; 4844 } 4845 4846 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) 4847 { 4848 struct bpf_prog *prog; 4849 4850 spin_lock_bh(&prog_idr_lock); 4851 again: 4852 prog = idr_get_next(&prog_idr, id); 4853 if (prog) { 4854 prog = bpf_prog_inc_not_zero(prog); 4855 if (IS_ERR(prog)) { 4856 (*id)++; 4857 goto again; 4858 } 4859 } 4860 spin_unlock_bh(&prog_idr_lock); 4861 4862 return prog; 4863 } 4864 4865 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id 4866 4867 struct bpf_prog *bpf_prog_by_id(u32 id) 4868 { 4869 struct bpf_prog *prog; 4870 4871 if (!id) 4872 return ERR_PTR(-ENOENT); 4873 4874 spin_lock_bh(&prog_idr_lock); 4875 prog = idr_find(&prog_idr, id); 4876 if (prog) 4877 prog = bpf_prog_inc_not_zero(prog); 4878 else 4879 prog = ERR_PTR(-ENOENT); 4880 spin_unlock_bh(&prog_idr_lock); 4881 return prog; 4882 } 4883 4884 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) 4885 { 4886 struct bpf_prog *prog; 4887 u32 id = attr->prog_id; 4888 int fd; 4889 4890 if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) 4891 return -EINVAL; 4892 4893 if (!capable(CAP_SYS_ADMIN)) 4894 return -EPERM; 4895 4896 prog = bpf_prog_by_id(id); 4897 if (IS_ERR(prog)) 4898 return PTR_ERR(prog); 4899 4900 fd = bpf_prog_new_fd(prog); 4901 if (fd < 0) 4902 bpf_prog_put(prog); 4903 4904 return fd; 4905 } 4906 4907 #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags 4908 4909 static int bpf_map_get_fd_by_id(const union bpf_attr *attr) 4910 { 4911 struct bpf_map *map; 4912 u32 id = attr->map_id; 4913 int f_flags; 4914 int fd; 4915 4916 if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || 4917 attr->open_flags & ~BPF_OBJ_FLAG_MASK) 4918 return -EINVAL; 4919 4920 if (!capable(CAP_SYS_ADMIN)) 4921 return -EPERM; 4922 4923 f_flags = 
bpf_get_file_flag(attr->open_flags); 4924 if (f_flags < 0) 4925 return f_flags; 4926 4927 spin_lock_bh(&map_idr_lock); 4928 map = idr_find(&map_idr, id); 4929 if (map) 4930 map = __bpf_map_inc_not_zero(map, true); 4931 else 4932 map = ERR_PTR(-ENOENT); 4933 spin_unlock_bh(&map_idr_lock); 4934 4935 if (IS_ERR(map)) 4936 return PTR_ERR(map); 4937 4938 fd = bpf_map_new_fd(map, f_flags); 4939 if (fd < 0) 4940 bpf_map_put_with_uref(map); 4941 4942 return fd; 4943 } 4944 4945 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, 4946 unsigned long addr, u32 *off, 4947 u32 *type) 4948 { 4949 const struct bpf_map *map; 4950 int i; 4951 4952 mutex_lock(&prog->aux->used_maps_mutex); 4953 for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { 4954 map = prog->aux->used_maps[i]; 4955 if (map == (void *)addr) { 4956 *type = BPF_PSEUDO_MAP_FD; 4957 goto out; 4958 } 4959 if (!map->ops->map_direct_value_meta) 4960 continue; 4961 if (!map->ops->map_direct_value_meta(map, addr, off)) { 4962 *type = BPF_PSEUDO_MAP_VALUE; 4963 goto out; 4964 } 4965 } 4966 map = NULL; 4967 4968 out: 4969 mutex_unlock(&prog->aux->used_maps_mutex); 4970 return map; 4971 } 4972 4973 static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, 4974 const struct cred *f_cred) 4975 { 4976 const struct bpf_map *map; 4977 struct bpf_insn *insns; 4978 u32 off, type; 4979 u64 imm; 4980 u8 code; 4981 int i; 4982 4983 insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), 4984 GFP_USER); 4985 if (!insns) 4986 return insns; 4987 4988 for (i = 0; i < prog->len; i++) { 4989 code = insns[i].code; 4990 4991 if (code == (BPF_JMP | BPF_TAIL_CALL)) { 4992 insns[i].code = BPF_JMP | BPF_CALL; 4993 insns[i].imm = BPF_FUNC_tail_call; 4994 /* fall-through */ 4995 } 4996 if (code == (BPF_JMP | BPF_CALL) || 4997 code == (BPF_JMP | BPF_CALL_ARGS)) { 4998 if (code == (BPF_JMP | BPF_CALL_ARGS)) 4999 insns[i].code = BPF_JMP | BPF_CALL; 5000 if (!bpf_dump_raw_ok(f_cred)) 5001 insns[i].imm = 0; 
5002 continue; 5003 } 5004 if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { 5005 insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; 5006 continue; 5007 } 5008 5009 if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX || 5010 BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) { 5011 insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM; 5012 continue; 5013 } 5014 5015 if (code != (BPF_LD | BPF_IMM | BPF_DW)) 5016 continue; 5017 5018 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; 5019 map = bpf_map_from_imm(prog, imm, &off, &type); 5020 if (map) { 5021 insns[i].src_reg = type; 5022 insns[i].imm = map->id; 5023 insns[i + 1].imm = off; 5024 continue; 5025 } 5026 } 5027 5028 return insns; 5029 } 5030 5031 static int set_info_rec_size(struct bpf_prog_info *info) 5032 { 5033 /* 5034 * Ensure info.*_rec_size is the same as kernel expected size 5035 * 5036 * or 5037 * 5038 * Only allow zero *_rec_size if both _rec_size and _cnt are 5039 * zero. In this case, the kernel will set the expected 5040 * _rec_size back to the info. 
/*
 * Fill a struct bpf_prog_info for @prog and copy it to the user buffer
 * named by attr->info.info.
 *
 * The user's copy of the struct is read in first. For each array-style
 * field, the count/length the user supplied bounds how much is copied out
 * to the user's array pointer, while the count field itself is overwritten
 * with the kernel-side total. Finally the first info_len bytes of the
 * struct are written back and that length is stored in
 * uattr->info.info_len.
 *
 * @file supplies the credentials (file->f_cred) used for the
 * bpf_dump_raw_ok() decisions.
 *
 * Returns 0 on success, -EFAULT on a failed user copy, -ENOMEM if the
 * instruction dump cannot be allocated, or the error returned by
 * bpf_check_uarg_tail_zero(), set_info_rec_size() or
 * bpf_prog_offload_info_fill().
 */
static int bpf_prog_get_info_by_fd(struct file *file,
				   struct bpf_prog *prog,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr)
{
	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
	struct btf *attach_btf = bpf_prog_get_target_btf(prog);
	struct bpf_prog_info info;
	u32 info_len = attr->info.info_len;
	struct bpf_prog_kstats stats;
	char __user *uinsns;
	u32 ulen;
	int err;

	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
	if (err)
		return err;
	/* never read or write more than the kernel's view of the struct */
	info_len = min_t(u32, sizeof(info), info_len);

	memset(&info, 0, sizeof(info));
	if (copy_from_user(&info, uinfo, info_len))
		return -EFAULT;

	info.type = prog->type;
	info.id = prog->aux->id;
	info.load_time = prog->aux->load_time;
	info.created_by_uid = from_kuid_munged(current_user_ns(),
					       prog->aux->user->uid);
	info.gpl_compatible = prog->gpl_compatible;

	memcpy(info.tag, prog->tag, sizeof(prog->tag));
	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));

	/* map ids: report the real count, copy out at most what the user asked for */
	mutex_lock(&prog->aux->used_maps_mutex);
	ulen = info.nr_map_ids;
	info.nr_map_ids = prog->aux->used_map_cnt;
	ulen = min_t(u32, info.nr_map_ids, ulen);
	if (ulen) {
		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
		u32 i;

		for (i = 0; i < ulen; i++)
			if (put_user(prog->aux->used_maps[i]->id,
				     &user_map_ids[i])) {
				mutex_unlock(&prog->aux->used_maps_mutex);
				return -EFAULT;
			}
	}
	mutex_unlock(&prog->aux->used_maps_mutex);

	err = set_info_rec_size(&info);
	if (err)
		return err;

	bpf_prog_get_stats(prog, &stats);
	info.run_time_ns = stats.nsecs;
	info.run_cnt = stats.cnt;
	info.recursion_misses = stats.misses;

	info.verified_insns = prog->aux->verified_insns;
	if (prog->aux->btf)
		info.btf_id = btf_obj_id(prog->aux->btf);

	/*
	 * Callers failing bpf_capable() only get the fields filled in so
	 * far; the counts below are reported as zero and nothing further
	 * is copied out.
	 */
	if (!bpf_capable()) {
		info.jited_prog_len = 0;
		info.xlated_prog_len = 0;
		info.nr_jited_ksyms = 0;
		info.nr_jited_func_lens = 0;
		info.nr_func_info = 0;
		info.nr_line_info = 0;
		info.nr_jited_line_info = 0;
		goto done;
	}

	/* translated instructions */
	ulen = info.xlated_prog_len;
	info.xlated_prog_len = bpf_prog_insn_size(prog);
	if (info.xlated_prog_len && ulen) {
		struct bpf_insn *insns_sanitized;
		bool fault;

		/* a blinded program is dumped only if raw dumps are allowed */
		if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) {
			insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
			if (!insns_sanitized)
				return -ENOMEM;
			uinsns = u64_to_user_ptr(info.xlated_prog_insns);
			ulen = min_t(u32, info.xlated_prog_len, ulen);
			fault = copy_to_user(uinsns, insns_sanitized, ulen);
			/* free the temporary dump before acting on the copy result */
			kfree(insns_sanitized);
			if (fault)
				return -EFAULT;
		} else {
			info.xlated_prog_insns = 0;
		}
	}

	if (bpf_prog_is_offloaded(prog->aux)) {
		err = bpf_prog_offload_info_fill(&info, prog);
		if (err)
			return err;
		goto done;
	}

	/* NOTE: the following code is supposed to be skipped for offload.
	 * bpf_prog_offload_info_fill() is the place to fill similar fields
	 * for offload.
	 */
	ulen = info.jited_prog_len;
	if (prog->aux->func_cnt) {
		u32 i;

		/* total JITed length is the sum over all functions */
		info.jited_prog_len = 0;
		for (i = 0; i < prog->aux->func_cnt; i++)
			info.jited_prog_len += prog->aux->func[i]->jited_len;
	} else {
		info.jited_prog_len = prog->jited_len;
	}

	if (info.jited_prog_len && ulen) {
		if (bpf_dump_raw_ok(file->f_cred)) {
			uinsns = u64_to_user_ptr(info.jited_prog_insns);
			ulen = min_t(u32, info.jited_prog_len, ulen);

			/* for multi-function programs, copy the JITed
			 * instructions for all the functions
			 */
			if (prog->aux->func_cnt) {
				u32 len, free, i;
				u8 *img;

				/* free = bytes still available in the user buffer */
				free = ulen;
				for (i = 0; i < prog->aux->func_cnt; i++) {
					len = prog->aux->func[i]->jited_len;
					len = min_t(u32, len, free);
					img = (u8 *) prog->aux->func[i]->bpf_func;
					if (copy_to_user(uinsns, img, len))
						return -EFAULT;
					uinsns += len;
					free -= len;
					if (!free)
						break;
				}
			} else {
				if (copy_to_user(uinsns, prog->bpf_func, ulen))
					return -EFAULT;
			}
		} else {
			info.jited_prog_insns = 0;
		}
	}

	/* one ksym per function, or a single one when func_cnt is zero */
	ulen = info.nr_jited_ksyms;
	info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
	if (ulen) {
		if (bpf_dump_raw_ok(file->f_cred)) {
			unsigned long ksym_addr;
			u64 __user *user_ksyms;
			u32 i;

			/* copy the address of the kernel symbol
			 * corresponding to each function
			 */
			ulen = min_t(u32, info.nr_jited_ksyms, ulen);
			user_ksyms = u64_to_user_ptr(info.jited_ksyms);
			if (prog->aux->func_cnt) {
				for (i = 0; i < ulen; i++) {
					ksym_addr = (unsigned long)
						prog->aux->func[i]->bpf_func;
					if (put_user((u64) ksym_addr,
						     &user_ksyms[i]))
						return -EFAULT;
				}
			} else {
				ksym_addr = (unsigned long) prog->bpf_func;
				if (put_user((u64) ksym_addr, &user_ksyms[0]))
					return -EFAULT;
			}
		} else {
			info.jited_ksyms = 0;
		}
	}

	/* per-function JITed lengths, same counting rule as the ksyms above */
	ulen = info.nr_jited_func_lens;
	info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
	if (ulen) {
		if (bpf_dump_raw_ok(file->f_cred)) {
			u32 __user *user_lens;
			u32 func_len, i;

			/* copy the JITed image lengths for each function */
			ulen = min_t(u32, info.nr_jited_func_lens, ulen);
			user_lens = u64_to_user_ptr(info.jited_func_lens);
			if (prog->aux->func_cnt) {
				for (i = 0; i < ulen; i++) {
					func_len =
						prog->aux->func[i]->jited_len;
					if (put_user(func_len, &user_lens[i]))
						return -EFAULT;
				}
			} else {
				func_len = prog->jited_len;
				if (put_user(func_len, &user_lens[0]))
					return -EFAULT;
			}
		} else {
			info.jited_func_lens = 0;
		}
	}

	info.attach_btf_id = prog->aux->attach_btf_id;
	if (attach_btf)
		info.attach_btf_obj_id = btf_obj_id(attach_btf);

	/* func_info records; the record size was validated by set_info_rec_size() */
	ulen = info.nr_func_info;
	info.nr_func_info = prog->aux->func_info_cnt;
	if (info.nr_func_info && ulen) {
		char __user *user_finfo;

		user_finfo = u64_to_user_ptr(info.func_info);
		ulen = min_t(u32, info.nr_func_info, ulen);
		if (copy_to_user(user_finfo, prog->aux->func_info,
				 info.func_info_rec_size * ulen))
			return -EFAULT;
	}

	/* line_info records */
	ulen = info.nr_line_info;
	info.nr_line_info = prog->aux->nr_linfo;
	if (info.nr_line_info && ulen) {
		__u8 __user *user_linfo;

		user_linfo = u64_to_user_ptr(info.line_info);
		ulen = min_t(u32, info.nr_line_info, ulen);
		if (copy_to_user(user_linfo, prog->aux->linfo,
				 info.line_info_rec_size * ulen))
			return -EFAULT;
	}

	/* JITed line info is reported only when prog->aux->jited_linfo is set */
	ulen = info.nr_jited_line_info;
	if (prog->aux->jited_linfo)
		info.nr_jited_line_info = prog->aux->nr_linfo;
	else
		info.nr_jited_line_info = 0;
	if (info.nr_jited_line_info && ulen) {
		if (bpf_dump_raw_ok(file->f_cred)) {
			unsigned long line_addr;
			__u64 __user *user_linfo;
			u32 i;

			user_linfo = u64_to_user_ptr(info.jited_line_info);
			ulen = min_t(u32, info.nr_jited_line_info, ulen);
			for (i = 0; i < ulen; i++) {
				line_addr = (unsigned long)prog->aux->jited_linfo[i];
				if (put_user((__u64)line_addr, &user_linfo[i]))
					return -EFAULT;
			}
		} else {
			info.jited_line_info = 0;
		}
	}

	/* one tag per function, or the program's own tag when func_cnt is zero */
	ulen = info.nr_prog_tags;
	info.nr_prog_tags = prog->aux->func_cnt ? : 1;
	if (ulen) {
		__u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
		u32 i;

		user_prog_tags = u64_to_user_ptr(info.prog_tags);
		ulen = min_t(u32, info.nr_prog_tags, ulen);
		if (prog->aux->func_cnt) {
			for (i = 0; i < ulen; i++) {
				if (copy_to_user(user_prog_tags[i],
						 prog->aux->func[i]->tag,
						 BPF_TAG_SIZE))
					return -EFAULT;
			}
		} else {
			if (copy_to_user(user_prog_tags[0],
					 prog->tag, BPF_TAG_SIZE))
				return -EFAULT;
		}
	}

done:
	/* write the struct back and tell the caller how many bytes were used */
	if (copy_to_user(uinfo, &info, info_len) ||
	    put_user(info_len, &uattr->info.info_len))
		return -EFAULT;

	return 0;
}
bpf_map_offload_info_fill(&info, map); 5390 if (err) 5391 return err; 5392 } 5393 5394 if (info.hash) { 5395 char __user *uhash = u64_to_user_ptr(info.hash); 5396 5397 if (!map->ops->map_get_hash) 5398 return -EINVAL; 5399 5400 if (info.hash_size != SHA256_DIGEST_SIZE) 5401 return -EINVAL; 5402 5403 if (!READ_ONCE(map->frozen)) 5404 return -EPERM; 5405 5406 err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); 5407 if (err != 0) 5408 return err; 5409 5410 if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) 5411 return -EFAULT; 5412 } else if (info.hash_size) { 5413 return -EINVAL; 5414 } 5415 5416 if (copy_to_user(uinfo, &info, info_len) || 5417 put_user(info_len, &uattr->info.info_len)) 5418 return -EFAULT; 5419 5420 return 0; 5421 } 5422 5423 static int bpf_btf_get_info_by_fd(struct file *file, 5424 struct btf *btf, 5425 const union bpf_attr *attr, 5426 union bpf_attr __user *uattr) 5427 { 5428 struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5429 u32 info_len = attr->info.info_len; 5430 int err; 5431 5432 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); 5433 if (err) 5434 return err; 5435 5436 return btf_get_info_by_fd(btf, attr, uattr); 5437 } 5438 5439 static int bpf_link_get_info_by_fd(struct file *file, 5440 struct bpf_link *link, 5441 const union bpf_attr *attr, 5442 union bpf_attr __user *uattr) 5443 { 5444 struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5445 struct bpf_link_info info; 5446 u32 info_len = attr->info.info_len; 5447 int err; 5448 5449 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); 5450 if (err) 5451 return err; 5452 info_len = min_t(u32, sizeof(info), info_len); 5453 5454 memset(&info, 0, sizeof(info)); 5455 if (copy_from_user(&info, uinfo, info_len)) 5456 return -EFAULT; 5457 5458 info.type = link->type; 5459 info.id = link->id; 5460 if (link->prog) 5461 info.prog_id = link->prog->aux->id; 5462 5463 if 
(link->ops->fill_link_info) { 5464 err = link->ops->fill_link_info(link, &info); 5465 if (err) 5466 return err; 5467 } 5468 5469 if (copy_to_user(uinfo, &info, info_len) || 5470 put_user(info_len, &uattr->info.info_len)) 5471 return -EFAULT; 5472 5473 return 0; 5474 } 5475 5476 5477 static int token_get_info_by_fd(struct file *file, 5478 struct bpf_token *token, 5479 const union bpf_attr *attr, 5480 union bpf_attr __user *uattr) 5481 { 5482 struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5483 u32 info_len = attr->info.info_len; 5484 int err; 5485 5486 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); 5487 if (err) 5488 return err; 5489 return bpf_token_get_info_by_fd(token, attr, uattr); 5490 } 5491 5492 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info 5493 5494 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, 5495 union bpf_attr __user *uattr) 5496 { 5497 if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) 5498 return -EINVAL; 5499 5500 CLASS(fd, f)(attr->info.bpf_fd); 5501 if (fd_empty(f)) 5502 return -EBADFD; 5503 5504 if (fd_file(f)->f_op == &bpf_prog_fops) 5505 return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, 5506 uattr); 5507 else if (fd_file(f)->f_op == &bpf_map_fops) 5508 return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, 5509 uattr); 5510 else if (fd_file(f)->f_op == &btf_fops) 5511 return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); 5512 else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) 5513 return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, 5514 attr, uattr); 5515 else if (fd_file(f)->f_op == &bpf_token_fops) 5516 return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data, 5517 attr, uattr); 5518 return -EINVAL; 5519 } 5520 5521 #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd 5522 5523 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, struct 
bpf_log_attr *attr_log) 5524 { 5525 struct bpf_token *token = NULL; 5526 5527 if (CHECK_ATTR(BPF_BTF_LOAD)) 5528 return -EINVAL; 5529 5530 if (attr->btf_flags & ~BPF_F_TOKEN_FD) 5531 return -EINVAL; 5532 5533 if (attr->btf_flags & BPF_F_TOKEN_FD) { 5534 token = bpf_token_get_from_fd(attr->btf_token_fd); 5535 if (IS_ERR(token)) 5536 return PTR_ERR(token); 5537 if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) { 5538 bpf_token_put(token); 5539 token = NULL; 5540 } 5541 } 5542 5543 if (!bpf_token_capable(token, CAP_BPF)) { 5544 bpf_token_put(token); 5545 return -EPERM; 5546 } 5547 5548 bpf_token_put(token); 5549 5550 return btf_new_fd(attr, uattr, attr_log); 5551 } 5552 5553 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd 5554 5555 static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) 5556 { 5557 struct bpf_token *token = NULL; 5558 5559 if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) 5560 return -EINVAL; 5561 5562 if (attr->open_flags & ~BPF_F_TOKEN_FD) 5563 return -EINVAL; 5564 5565 if (attr->open_flags & BPF_F_TOKEN_FD) { 5566 token = bpf_token_get_from_fd(attr->fd_by_id_token_fd); 5567 if (IS_ERR(token)) 5568 return PTR_ERR(token); 5569 if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) { 5570 bpf_token_put(token); 5571 token = NULL; 5572 } 5573 } 5574 5575 if (!bpf_token_capable(token, CAP_SYS_ADMIN)) { 5576 bpf_token_put(token); 5577 return -EPERM; 5578 } 5579 5580 bpf_token_put(token); 5581 5582 return btf_get_fd_by_id(attr->btf_id); 5583 } 5584 5585 static int bpf_task_fd_query_copy(const union bpf_attr *attr, 5586 union bpf_attr __user *uattr, 5587 u32 prog_id, u32 fd_type, 5588 const char *buf, u64 probe_offset, 5589 u64 probe_addr) 5590 { 5591 char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); 5592 u32 len = buf ? 
strlen(buf) : 0, input_len; 5593 int err = 0; 5594 5595 if (put_user(len, &uattr->task_fd_query.buf_len)) 5596 return -EFAULT; 5597 input_len = attr->task_fd_query.buf_len; 5598 if (input_len && ubuf) { 5599 if (!len) { 5600 /* nothing to copy, just make ubuf NULL terminated */ 5601 char zero = '\0'; 5602 5603 if (put_user(zero, ubuf)) 5604 return -EFAULT; 5605 } else { 5606 err = bpf_copy_to_user(ubuf, buf, input_len, len); 5607 if (err == -EFAULT) 5608 return err; 5609 } 5610 } 5611 5612 if (put_user(prog_id, &uattr->task_fd_query.prog_id) || 5613 put_user(fd_type, &uattr->task_fd_query.fd_type) || 5614 put_user(probe_offset, &uattr->task_fd_query.probe_offset) || 5615 put_user(probe_addr, &uattr->task_fd_query.probe_addr)) 5616 return -EFAULT; 5617 5618 return err; 5619 } 5620 5621 #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr 5622 5623 static int bpf_task_fd_query(const union bpf_attr *attr, 5624 union bpf_attr __user *uattr) 5625 { 5626 pid_t pid = attr->task_fd_query.pid; 5627 u32 fd = attr->task_fd_query.fd; 5628 const struct perf_event *event; 5629 struct task_struct *task; 5630 struct file *file; 5631 int err; 5632 5633 if (CHECK_ATTR(BPF_TASK_FD_QUERY)) 5634 return -EINVAL; 5635 5636 if (!capable(CAP_SYS_ADMIN)) 5637 return -EPERM; 5638 5639 if (attr->task_fd_query.flags != 0) 5640 return -EINVAL; 5641 5642 rcu_read_lock(); 5643 task = get_pid_task(find_vpid(pid), PIDTYPE_PID); 5644 rcu_read_unlock(); 5645 if (!task) 5646 return -ENOENT; 5647 5648 err = 0; 5649 file = fget_task(task, fd); 5650 put_task_struct(task); 5651 if (!file) 5652 return -EBADF; 5653 5654 if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) { 5655 struct bpf_link *link = file->private_data; 5656 5657 if (link->ops == &bpf_raw_tp_link_lops) { 5658 struct bpf_raw_tp_link *raw_tp = 5659 container_of(link, struct bpf_raw_tp_link, link); 5660 struct bpf_raw_event_map *btp = raw_tp->btp; 5661 5662 err = bpf_task_fd_query_copy(attr, uattr, 5663 
raw_tp->link.prog->aux->id, 5664 BPF_FD_TYPE_RAW_TRACEPOINT, 5665 btp->tp->name, 0, 0); 5666 goto put_file; 5667 } 5668 goto out_not_supp; 5669 } 5670 5671 event = perf_get_event(file); 5672 if (!IS_ERR(event)) { 5673 u64 probe_offset, probe_addr; 5674 u32 prog_id, fd_type; 5675 const char *buf; 5676 5677 err = bpf_get_perf_event_info(event, &prog_id, &fd_type, 5678 &buf, &probe_offset, 5679 &probe_addr, NULL); 5680 if (!err) 5681 err = bpf_task_fd_query_copy(attr, uattr, prog_id, 5682 fd_type, buf, 5683 probe_offset, 5684 probe_addr); 5685 goto put_file; 5686 } 5687 5688 out_not_supp: 5689 err = -ENOTSUPP; 5690 put_file: 5691 fput(file); 5692 return err; 5693 } 5694 5695 #define BPF_MAP_BATCH_LAST_FIELD batch.flags 5696 5697 #define BPF_DO_BATCH(fn, ...) \ 5698 do { \ 5699 if (!fn) { \ 5700 err = -ENOTSUPP; \ 5701 goto err_put; \ 5702 } \ 5703 err = fn(__VA_ARGS__); \ 5704 } while (0) 5705 5706 static int bpf_map_do_batch(const union bpf_attr *attr, 5707 union bpf_attr __user *uattr, 5708 int cmd) 5709 { 5710 bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || 5711 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; 5712 bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; 5713 struct bpf_map *map; 5714 int err; 5715 5716 if (CHECK_ATTR(BPF_MAP_BATCH)) 5717 return -EINVAL; 5718 5719 CLASS(fd, f)(attr->batch.map_fd); 5720 5721 map = __bpf_map_get(f); 5722 if (IS_ERR(map)) 5723 return PTR_ERR(map); 5724 if (has_write) 5725 bpf_map_write_active_inc(map); 5726 if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { 5727 err = -EPERM; 5728 goto err_put; 5729 } 5730 if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 5731 err = -EPERM; 5732 goto err_put; 5733 } 5734 5735 if (cmd == BPF_MAP_LOOKUP_BATCH) 5736 BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); 5737 else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) 5738 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); 5739 else if (cmd == BPF_MAP_UPDATE_BATCH) 5740 
BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr); 5741 else 5742 BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); 5743 err_put: 5744 if (has_write) { 5745 maybe_wait_bpf_programs(map); 5746 bpf_map_write_active_dec(map); 5747 } 5748 return err; 5749 } 5750 5751 #define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid 5752 static int link_create(union bpf_attr *attr, bpfptr_t uattr) 5753 { 5754 struct bpf_prog *prog; 5755 int ret; 5756 5757 if (CHECK_ATTR(BPF_LINK_CREATE)) 5758 return -EINVAL; 5759 5760 if (attr->link_create.attach_type == BPF_STRUCT_OPS) 5761 return bpf_struct_ops_link_create(attr); 5762 5763 prog = bpf_prog_get(attr->link_create.prog_fd); 5764 if (IS_ERR(prog)) 5765 return PTR_ERR(prog); 5766 5767 ret = bpf_prog_attach_check_attach_type(prog, 5768 attr->link_create.attach_type); 5769 if (ret) 5770 goto out; 5771 5772 switch (prog->type) { 5773 case BPF_PROG_TYPE_CGROUP_SKB: 5774 case BPF_PROG_TYPE_CGROUP_SOCK: 5775 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 5776 case BPF_PROG_TYPE_SOCK_OPS: 5777 case BPF_PROG_TYPE_CGROUP_DEVICE: 5778 case BPF_PROG_TYPE_CGROUP_SYSCTL: 5779 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 5780 ret = cgroup_bpf_link_attach(attr, prog); 5781 break; 5782 case BPF_PROG_TYPE_EXT: 5783 ret = bpf_tracing_prog_attach(prog, 5784 attr->link_create.target_fd, 5785 attr->link_create.target_btf_id, 5786 attr->link_create.tracing.cookie, 5787 attr->link_create.attach_type); 5788 break; 5789 case BPF_PROG_TYPE_LSM: 5790 case BPF_PROG_TYPE_TRACING: 5791 if (attr->link_create.attach_type != prog->expected_attach_type) { 5792 ret = -EINVAL; 5793 goto out; 5794 } 5795 if (prog->expected_attach_type == BPF_TRACE_RAW_TP) 5796 ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie, 5797 attr->link_create.attach_type); 5798 else if (prog->expected_attach_type == BPF_TRACE_ITER) 5799 ret = bpf_iter_link_attach(attr, uattr, prog); 5800 else if (prog->expected_attach_type == BPF_LSM_CGROUP) 5801 ret = 
cgroup_bpf_link_attach(attr, prog); 5802 else 5803 ret = bpf_tracing_prog_attach(prog, 5804 attr->link_create.target_fd, 5805 attr->link_create.target_btf_id, 5806 attr->link_create.tracing.cookie, 5807 attr->link_create.attach_type); 5808 break; 5809 case BPF_PROG_TYPE_FLOW_DISSECTOR: 5810 case BPF_PROG_TYPE_SK_LOOKUP: 5811 ret = netns_bpf_link_create(attr, prog); 5812 break; 5813 case BPF_PROG_TYPE_SK_MSG: 5814 case BPF_PROG_TYPE_SK_SKB: 5815 ret = sock_map_link_create(attr, prog); 5816 break; 5817 #ifdef CONFIG_NET 5818 case BPF_PROG_TYPE_XDP: 5819 ret = bpf_xdp_link_attach(attr, prog); 5820 break; 5821 case BPF_PROG_TYPE_SCHED_CLS: 5822 if (attr->link_create.attach_type == BPF_TCX_INGRESS || 5823 attr->link_create.attach_type == BPF_TCX_EGRESS) 5824 ret = tcx_link_attach(attr, prog); 5825 else 5826 ret = netkit_link_attach(attr, prog); 5827 break; 5828 case BPF_PROG_TYPE_NETFILTER: 5829 ret = bpf_nf_link_attach(attr, prog); 5830 break; 5831 #endif 5832 case BPF_PROG_TYPE_PERF_EVENT: 5833 case BPF_PROG_TYPE_TRACEPOINT: 5834 ret = bpf_perf_link_attach(attr, prog); 5835 break; 5836 case BPF_PROG_TYPE_KPROBE: 5837 if (attr->link_create.attach_type == BPF_PERF_EVENT) 5838 ret = bpf_perf_link_attach(attr, prog); 5839 else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || 5840 attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) 5841 ret = bpf_kprobe_multi_link_attach(attr, prog); 5842 else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || 5843 attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) 5844 ret = bpf_uprobe_multi_link_attach(attr, prog); 5845 break; 5846 default: 5847 ret = -EINVAL; 5848 } 5849 5850 out: 5851 if (ret < 0) 5852 bpf_prog_put(prog); 5853 return ret; 5854 } 5855 5856 static int link_update_map(struct bpf_link *link, union bpf_attr *attr) 5857 { 5858 struct bpf_map *new_map, *old_map = NULL; 5859 int ret; 5860 5861 new_map = bpf_map_get(attr->link_update.new_map_fd); 5862 if (IS_ERR(new_map)) 5863 return 
PTR_ERR(new_map); 5864 5865 if (attr->link_update.flags & BPF_F_REPLACE) { 5866 old_map = bpf_map_get(attr->link_update.old_map_fd); 5867 if (IS_ERR(old_map)) { 5868 ret = PTR_ERR(old_map); 5869 goto out_put; 5870 } 5871 } else if (attr->link_update.old_map_fd) { 5872 ret = -EINVAL; 5873 goto out_put; 5874 } 5875 5876 ret = link->ops->update_map(link, new_map, old_map); 5877 5878 if (old_map) 5879 bpf_map_put(old_map); 5880 out_put: 5881 bpf_map_put(new_map); 5882 return ret; 5883 } 5884 5885 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd 5886 5887 static int link_update(union bpf_attr *attr) 5888 { 5889 struct bpf_prog *old_prog = NULL, *new_prog; 5890 struct bpf_link *link; 5891 u32 flags; 5892 int ret; 5893 5894 if (CHECK_ATTR(BPF_LINK_UPDATE)) 5895 return -EINVAL; 5896 5897 flags = attr->link_update.flags; 5898 if (flags & ~BPF_F_REPLACE) 5899 return -EINVAL; 5900 5901 link = bpf_link_get_from_fd(attr->link_update.link_fd); 5902 if (IS_ERR(link)) 5903 return PTR_ERR(link); 5904 5905 if (link->ops->update_map) { 5906 ret = link_update_map(link, attr); 5907 goto out_put_link; 5908 } 5909 5910 new_prog = bpf_prog_get(attr->link_update.new_prog_fd); 5911 if (IS_ERR(new_prog)) { 5912 ret = PTR_ERR(new_prog); 5913 goto out_put_link; 5914 } 5915 5916 if (flags & BPF_F_REPLACE) { 5917 old_prog = bpf_prog_get(attr->link_update.old_prog_fd); 5918 if (IS_ERR(old_prog)) { 5919 ret = PTR_ERR(old_prog); 5920 old_prog = NULL; 5921 goto out_put_progs; 5922 } 5923 } else if (attr->link_update.old_prog_fd) { 5924 ret = -EINVAL; 5925 goto out_put_progs; 5926 } 5927 5928 if (link->ops->update_prog) 5929 ret = link->ops->update_prog(link, new_prog, old_prog); 5930 else 5931 ret = -EINVAL; 5932 5933 out_put_progs: 5934 if (old_prog) 5935 bpf_prog_put(old_prog); 5936 if (ret) 5937 bpf_prog_put(new_prog); 5938 out_put_link: 5939 bpf_link_put_direct(link); 5940 return ret; 5941 } 5942 5943 #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd 5944 5945 static int 
link_detach(union bpf_attr *attr) 5946 { 5947 struct bpf_link *link; 5948 int ret; 5949 5950 if (CHECK_ATTR(BPF_LINK_DETACH)) 5951 return -EINVAL; 5952 5953 link = bpf_link_get_from_fd(attr->link_detach.link_fd); 5954 if (IS_ERR(link)) 5955 return PTR_ERR(link); 5956 5957 if (link->ops->detach) 5958 ret = link->ops->detach(link); 5959 else 5960 ret = -EOPNOTSUPP; 5961 5962 bpf_link_put_direct(link); 5963 return ret; 5964 } 5965 5966 struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) 5967 { 5968 return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); 5969 } 5970 EXPORT_SYMBOL(bpf_link_inc_not_zero); 5971 5972 struct bpf_link *bpf_link_by_id(u32 id) 5973 { 5974 struct bpf_link *link; 5975 5976 if (!id) 5977 return ERR_PTR(-ENOENT); 5978 5979 spin_lock_bh(&link_idr_lock); 5980 /* before link is "settled", ID is 0, pretend it doesn't exist yet */ 5981 link = idr_find(&link_idr, id); 5982 if (link) { 5983 if (link->id) 5984 link = bpf_link_inc_not_zero(link); 5985 else 5986 link = ERR_PTR(-EAGAIN); 5987 } else { 5988 link = ERR_PTR(-ENOENT); 5989 } 5990 spin_unlock_bh(&link_idr_lock); 5991 return link; 5992 } 5993 5994 struct bpf_link *bpf_link_get_curr_or_next(u32 *id) 5995 { 5996 struct bpf_link *link; 5997 5998 spin_lock_bh(&link_idr_lock); 5999 again: 6000 link = idr_get_next(&link_idr, id); 6001 if (link) { 6002 link = bpf_link_inc_not_zero(link); 6003 if (IS_ERR(link)) { 6004 (*id)++; 6005 goto again; 6006 } 6007 } 6008 spin_unlock_bh(&link_idr_lock); 6009 6010 return link; 6011 } 6012 6013 #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id 6014 6015 static int bpf_link_get_fd_by_id(const union bpf_attr *attr) 6016 { 6017 struct bpf_link *link; 6018 u32 id = attr->link_id; 6019 int fd; 6020 6021 if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) 6022 return -EINVAL; 6023 6024 if (!capable(CAP_SYS_ADMIN)) 6025 return -EPERM; 6026 6027 link = bpf_link_by_id(id); 6028 if (IS_ERR(link)) 6029 return PTR_ERR(link); 6030 6031 fd = 
bpf_link_new_fd(link); 6032 if (fd < 0) 6033 bpf_link_put_direct(link); 6034 6035 return fd; 6036 } 6037 6038 DEFINE_MUTEX(bpf_stats_enabled_mutex); 6039 6040 static int bpf_stats_release(struct inode *inode, struct file *file) 6041 { 6042 mutex_lock(&bpf_stats_enabled_mutex); 6043 static_key_slow_dec(&bpf_stats_enabled_key.key); 6044 mutex_unlock(&bpf_stats_enabled_mutex); 6045 return 0; 6046 } 6047 6048 static const struct file_operations bpf_stats_fops = { 6049 .release = bpf_stats_release, 6050 }; 6051 6052 static int bpf_enable_runtime_stats(void) 6053 { 6054 int fd; 6055 6056 mutex_lock(&bpf_stats_enabled_mutex); 6057 6058 /* Set a very high limit to avoid overflow */ 6059 if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { 6060 mutex_unlock(&bpf_stats_enabled_mutex); 6061 return -EBUSY; 6062 } 6063 6064 fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); 6065 if (fd >= 0) 6066 static_key_slow_inc(&bpf_stats_enabled_key.key); 6067 6068 mutex_unlock(&bpf_stats_enabled_mutex); 6069 return fd; 6070 } 6071 6072 #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type 6073 6074 static int bpf_enable_stats(union bpf_attr *attr) 6075 { 6076 6077 if (CHECK_ATTR(BPF_ENABLE_STATS)) 6078 return -EINVAL; 6079 6080 if (!capable(CAP_SYS_ADMIN)) 6081 return -EPERM; 6082 6083 switch (attr->enable_stats.type) { 6084 case BPF_STATS_RUN_TIME: 6085 return bpf_enable_runtime_stats(); 6086 default: 6087 break; 6088 } 6089 return -EINVAL; 6090 } 6091 6092 #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags 6093 6094 static int bpf_iter_create(union bpf_attr *attr) 6095 { 6096 struct bpf_link *link; 6097 int err; 6098 6099 if (CHECK_ATTR(BPF_ITER_CREATE)) 6100 return -EINVAL; 6101 6102 if (attr->iter_create.flags) 6103 return -EINVAL; 6104 6105 link = bpf_link_get_from_fd(attr->iter_create.link_fd); 6106 if (IS_ERR(link)) 6107 return PTR_ERR(link); 6108 6109 err = bpf_iter_new_fd(link); 6110 bpf_link_put_direct(link); 6111 6112 return err; 6113 } 6114 
6115 #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags 6116 6117 static int bpf_prog_bind_map(union bpf_attr *attr) 6118 { 6119 struct bpf_prog *prog; 6120 struct bpf_map *map; 6121 struct bpf_map **used_maps_old, **used_maps_new; 6122 int i, ret = 0; 6123 6124 if (CHECK_ATTR(BPF_PROG_BIND_MAP)) 6125 return -EINVAL; 6126 6127 if (attr->prog_bind_map.flags) 6128 return -EINVAL; 6129 6130 prog = bpf_prog_get(attr->prog_bind_map.prog_fd); 6131 if (IS_ERR(prog)) 6132 return PTR_ERR(prog); 6133 6134 map = bpf_map_get(attr->prog_bind_map.map_fd); 6135 if (IS_ERR(map)) { 6136 ret = PTR_ERR(map); 6137 goto out_prog_put; 6138 } 6139 6140 mutex_lock(&prog->aux->used_maps_mutex); 6141 6142 used_maps_old = prog->aux->used_maps; 6143 6144 for (i = 0; i < prog->aux->used_map_cnt; i++) 6145 if (used_maps_old[i] == map) { 6146 bpf_map_put(map); 6147 goto out_unlock; 6148 } 6149 6150 used_maps_new = kmalloc_objs(used_maps_new[0], 6151 prog->aux->used_map_cnt + 1); 6152 if (!used_maps_new) { 6153 ret = -ENOMEM; 6154 goto out_unlock; 6155 } 6156 6157 /* The bpf program will not access the bpf map, but for the sake of 6158 * simplicity, increase sleepable_refcnt for sleepable program as well. 
6159 */ 6160 if (prog->sleepable) 6161 atomic64_inc(&map->sleepable_refcnt); 6162 memcpy(used_maps_new, used_maps_old, 6163 sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); 6164 used_maps_new[prog->aux->used_map_cnt] = map; 6165 6166 prog->aux->used_map_cnt++; 6167 prog->aux->used_maps = used_maps_new; 6168 6169 kfree(used_maps_old); 6170 6171 out_unlock: 6172 mutex_unlock(&prog->aux->used_maps_mutex); 6173 6174 if (ret) 6175 bpf_map_put(map); 6176 out_prog_put: 6177 bpf_prog_put(prog); 6178 return ret; 6179 } 6180 6181 #define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd 6182 6183 static int token_create(union bpf_attr *attr) 6184 { 6185 if (CHECK_ATTR(BPF_TOKEN_CREATE)) 6186 return -EINVAL; 6187 6188 /* no flags are supported yet */ 6189 if (attr->token_create.flags) 6190 return -EINVAL; 6191 6192 return bpf_token_create(attr); 6193 } 6194 6195 #define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd 6196 6197 static int prog_stream_read(union bpf_attr *attr) 6198 { 6199 char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf); 6200 u32 len = attr->prog_stream_read.stream_buf_len; 6201 struct bpf_prog *prog; 6202 int ret; 6203 6204 if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD)) 6205 return -EINVAL; 6206 6207 prog = bpf_prog_get(attr->prog_stream_read.prog_fd); 6208 if (IS_ERR(prog)) 6209 return PTR_ERR(prog); 6210 6211 ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len); 6212 bpf_prog_put(prog); 6213 6214 return ret; 6215 } 6216 6217 #define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd 6218 6219 static int prog_assoc_struct_ops(union bpf_attr *attr) 6220 { 6221 struct bpf_prog *prog; 6222 struct bpf_map *map; 6223 int ret; 6224 6225 if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) 6226 return -EINVAL; 6227 6228 if (attr->prog_assoc_struct_ops.flags) 6229 return -EINVAL; 6230 6231 prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); 6232 if (IS_ERR(prog)) 6233 return PTR_ERR(prog); 6234 
6235 if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { 6236 ret = -EINVAL; 6237 goto put_prog; 6238 } 6239 6240 map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); 6241 if (IS_ERR(map)) { 6242 ret = PTR_ERR(map); 6243 goto put_prog; 6244 } 6245 6246 if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { 6247 ret = -EINVAL; 6248 goto put_map; 6249 } 6250 6251 ret = bpf_prog_assoc_struct_ops(prog, map); 6252 6253 put_map: 6254 bpf_map_put(map); 6255 put_prog: 6256 bpf_prog_put(prog); 6257 return ret; 6258 } 6259 6260 static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, 6261 bpfptr_t uattr_common, unsigned int size_common) 6262 { 6263 struct bpf_common_attr attr_common; 6264 u32 offsetof_log_true_size = 0; 6265 struct bpf_log_attr attr_log; 6266 union bpf_attr attr; 6267 int err; 6268 6269 err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); 6270 if (err) 6271 return err; 6272 size = min_t(u32, size, sizeof(attr)); 6273 6274 /* copy attributes from user space, may be less than sizeof(bpf_attr) */ 6275 memset(&attr, 0, sizeof(attr)); 6276 if (copy_from_bpfptr(&attr, uattr, size) != 0) 6277 return -EFAULT; 6278 6279 memset(&attr_common, 0, sizeof(attr_common)); 6280 if (cmd & BPF_COMMON_ATTRS) { 6281 err = bpf_check_uarg_tail_zero(uattr_common, 6282 offsetofend(struct bpf_common_attr, log_true_size), 6283 size_common); 6284 if (err) 6285 return err; 6286 6287 cmd &= ~BPF_COMMON_ATTRS; 6288 size_common = min_t(u32, size_common, sizeof(attr_common)); 6289 if (copy_from_bpfptr(&attr_common, uattr_common, size_common) != 0) 6290 return -EFAULT; 6291 } else { 6292 size_common = 0; 6293 } 6294 6295 err = security_bpf(cmd, &attr, size, uattr.is_kernel); 6296 if (err < 0) 6297 return err; 6298 6299 switch (cmd) { 6300 case BPF_MAP_CREATE: 6301 err = map_create(&attr, uattr, &attr_common, uattr_common, size_common); 6302 break; 6303 case BPF_MAP_LOOKUP_ELEM: 6304 err = map_lookup_elem(&attr); 6305 break; 6306 case BPF_MAP_UPDATE_ELEM: 6307 err = 
map_update_elem(&attr, uattr); 6308 break; 6309 case BPF_MAP_DELETE_ELEM: 6310 err = map_delete_elem(&attr, uattr); 6311 break; 6312 case BPF_MAP_GET_NEXT_KEY: 6313 err = map_get_next_key(&attr); 6314 break; 6315 case BPF_MAP_FREEZE: 6316 err = map_freeze(&attr); 6317 break; 6318 case BPF_PROG_LOAD: 6319 if (size >= offsetofend(union bpf_attr, log_true_size)) 6320 offsetof_log_true_size = offsetof(union bpf_attr, log_true_size); 6321 err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level, 6322 offsetof_log_true_size, uattr, &attr_common, uattr_common, 6323 size_common); 6324 err = err ?: bpf_prog_load(&attr, uattr, &attr_log); 6325 break; 6326 case BPF_OBJ_PIN: 6327 err = bpf_obj_pin(&attr); 6328 break; 6329 case BPF_OBJ_GET: 6330 err = bpf_obj_get(&attr); 6331 break; 6332 case BPF_PROG_ATTACH: 6333 err = bpf_prog_attach(&attr); 6334 break; 6335 case BPF_PROG_DETACH: 6336 err = bpf_prog_detach(&attr); 6337 break; 6338 case BPF_PROG_QUERY: 6339 err = bpf_prog_query(&attr, uattr.user); 6340 break; 6341 case BPF_PROG_TEST_RUN: 6342 err = bpf_prog_test_run(&attr, uattr.user); 6343 break; 6344 case BPF_PROG_GET_NEXT_ID: 6345 err = bpf_obj_get_next_id(&attr, uattr.user, 6346 &prog_idr, &prog_idr_lock); 6347 break; 6348 case BPF_MAP_GET_NEXT_ID: 6349 err = bpf_obj_get_next_id(&attr, uattr.user, 6350 &map_idr, &map_idr_lock); 6351 break; 6352 case BPF_BTF_GET_NEXT_ID: 6353 err = bpf_obj_get_next_id(&attr, uattr.user, 6354 &btf_idr, &btf_idr_lock); 6355 break; 6356 case BPF_PROG_GET_FD_BY_ID: 6357 err = bpf_prog_get_fd_by_id(&attr); 6358 break; 6359 case BPF_MAP_GET_FD_BY_ID: 6360 err = bpf_map_get_fd_by_id(&attr); 6361 break; 6362 case BPF_OBJ_GET_INFO_BY_FD: 6363 err = bpf_obj_get_info_by_fd(&attr, uattr.user); 6364 break; 6365 case BPF_RAW_TRACEPOINT_OPEN: 6366 err = bpf_raw_tracepoint_open(&attr); 6367 break; 6368 case BPF_BTF_LOAD: 6369 if (size >= offsetofend(union bpf_attr, btf_log_true_size)) 6370 offsetof_log_true_size = offsetof(union 
bpf_attr, btf_log_true_size); 6371 err = bpf_log_attr_init(&attr_log, attr.btf_log_buf, attr.btf_log_size, 6372 attr.btf_log_level, offsetof_log_true_size, uattr, 6373 &attr_common, uattr_common, size_common); 6374 err = err ?: bpf_btf_load(&attr, uattr, &attr_log); 6375 break; 6376 case BPF_BTF_GET_FD_BY_ID: 6377 err = bpf_btf_get_fd_by_id(&attr); 6378 break; 6379 case BPF_TASK_FD_QUERY: 6380 err = bpf_task_fd_query(&attr, uattr.user); 6381 break; 6382 case BPF_MAP_LOOKUP_AND_DELETE_ELEM: 6383 err = map_lookup_and_delete_elem(&attr); 6384 break; 6385 case BPF_MAP_LOOKUP_BATCH: 6386 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); 6387 break; 6388 case BPF_MAP_LOOKUP_AND_DELETE_BATCH: 6389 err = bpf_map_do_batch(&attr, uattr.user, 6390 BPF_MAP_LOOKUP_AND_DELETE_BATCH); 6391 break; 6392 case BPF_MAP_UPDATE_BATCH: 6393 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); 6394 break; 6395 case BPF_MAP_DELETE_BATCH: 6396 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); 6397 break; 6398 case BPF_LINK_CREATE: 6399 err = link_create(&attr, uattr); 6400 break; 6401 case BPF_LINK_UPDATE: 6402 err = link_update(&attr); 6403 break; 6404 case BPF_LINK_GET_FD_BY_ID: 6405 err = bpf_link_get_fd_by_id(&attr); 6406 break; 6407 case BPF_LINK_GET_NEXT_ID: 6408 err = bpf_obj_get_next_id(&attr, uattr.user, 6409 &link_idr, &link_idr_lock); 6410 break; 6411 case BPF_ENABLE_STATS: 6412 err = bpf_enable_stats(&attr); 6413 break; 6414 case BPF_ITER_CREATE: 6415 err = bpf_iter_create(&attr); 6416 break; 6417 case BPF_LINK_DETACH: 6418 err = link_detach(&attr); 6419 break; 6420 case BPF_PROG_BIND_MAP: 6421 err = bpf_prog_bind_map(&attr); 6422 break; 6423 case BPF_TOKEN_CREATE: 6424 err = token_create(&attr); 6425 break; 6426 case BPF_PROG_STREAM_READ_BY_FD: 6427 err = prog_stream_read(&attr); 6428 break; 6429 case BPF_PROG_ASSOC_STRUCT_OPS: 6430 err = prog_assoc_struct_ops(&attr); 6431 break; 6432 default: 6433 err = -EINVAL; 6434 break; 6435 } 6436 
6437 return err; 6438 } 6439 6440 SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size, 6441 struct bpf_common_attr __user *, uattr_common, unsigned int, size_common) 6442 { 6443 return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common); 6444 } 6445 6446 static bool syscall_prog_is_valid_access(int off, int size, 6447 enum bpf_access_type type, 6448 const struct bpf_prog *prog, 6449 struct bpf_insn_access_aux *info) 6450 { 6451 if (off < 0 || off >= U16_MAX) 6452 return false; 6453 /* No alignment requirements for syscall ctx accesses. */ 6454 return true; 6455 } 6456 6457 BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) 6458 { 6459 switch (cmd) { 6460 case BPF_MAP_CREATE: 6461 case BPF_MAP_DELETE_ELEM: 6462 case BPF_MAP_UPDATE_ELEM: 6463 case BPF_MAP_FREEZE: 6464 case BPF_MAP_GET_FD_BY_ID: 6465 case BPF_PROG_LOAD: 6466 case BPF_BTF_LOAD: 6467 case BPF_LINK_CREATE: 6468 case BPF_RAW_TRACEPOINT_OPEN: 6469 break; 6470 default: 6471 return -EINVAL; 6472 } 6473 return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0); 6474 } 6475 6476 6477 /* To shut up -Wmissing-prototypes. 6478 * This function is used by the kernel light skeleton 6479 * to load bpf programs when modules are loaded or during kernel boot. 
6480 * See tools/lib/bpf/skel_internal.h 6481 */ 6482 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); 6483 6484 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) 6485 { 6486 struct bpf_prog * __maybe_unused prog; 6487 struct bpf_tramp_run_ctx __maybe_unused run_ctx; 6488 6489 switch (cmd) { 6490 #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ 6491 case BPF_PROG_TEST_RUN: 6492 if (attr->test.data_in || attr->test.data_out || 6493 attr->test.ctx_out || attr->test.duration || 6494 attr->test.repeat || attr->test.flags) 6495 return -EINVAL; 6496 6497 prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); 6498 if (IS_ERR(prog)) 6499 return PTR_ERR(prog); 6500 6501 if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || 6502 attr->test.ctx_size_in > U16_MAX) { 6503 bpf_prog_put(prog); 6504 return -EINVAL; 6505 } 6506 6507 run_ctx.bpf_cookie = 0; 6508 if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { 6509 /* recursion detected */ 6510 __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); 6511 bpf_prog_put(prog); 6512 return -EBUSY; 6513 } 6514 attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); 6515 __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, 6516 &run_ctx); 6517 bpf_prog_put(prog); 6518 return 0; 6519 #endif 6520 default: 6521 return ____bpf_sys_bpf(cmd, attr, size); 6522 } 6523 } 6524 EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL"); 6525 6526 static const struct bpf_func_proto bpf_sys_bpf_proto = { 6527 .func = bpf_sys_bpf, 6528 .gpl_only = false, 6529 .ret_type = RET_INTEGER, 6530 .arg1_type = ARG_ANYTHING, 6531 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6532 .arg3_type = ARG_CONST_SIZE, 6533 }; 6534 6535 const struct bpf_func_proto * __weak 6536 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 6537 { 6538 return bpf_base_func_proto(func_id, prog); 6539 } 6540 6541 BPF_CALL_1(bpf_sys_close, 
u32, fd) 6542 { 6543 /* When bpf program calls this helper there should not be 6544 * an fdget() without matching completed fdput(). 6545 * This helper is allowed in the following callchain only: 6546 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close 6547 */ 6548 return close_fd(fd); 6549 } 6550 6551 static const struct bpf_func_proto bpf_sys_close_proto = { 6552 .func = bpf_sys_close, 6553 .gpl_only = false, 6554 .ret_type = RET_INTEGER, 6555 .arg1_type = ARG_ANYTHING, 6556 }; 6557 6558 BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res) 6559 { 6560 *res = 0; 6561 if (flags) 6562 return -EINVAL; 6563 6564 if (name_sz <= 1 || name[name_sz - 1]) 6565 return -EINVAL; 6566 6567 if (!bpf_dump_raw_ok(current_cred())) 6568 return -EPERM; 6569 6570 *res = kallsyms_lookup_name(name); 6571 return *res ? 0 : -ENOENT; 6572 } 6573 6574 static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { 6575 .func = bpf_kallsyms_lookup_name, 6576 .gpl_only = false, 6577 .ret_type = RET_INTEGER, 6578 .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6579 .arg2_type = ARG_CONST_SIZE_OR_ZERO, 6580 .arg3_type = ARG_ANYTHING, 6581 .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, 6582 .arg4_size = sizeof(u64), 6583 }; 6584 6585 static const struct bpf_func_proto * 6586 syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 6587 { 6588 switch (func_id) { 6589 case BPF_FUNC_sys_bpf: 6590 return !bpf_token_capable(prog->aux->token, CAP_PERFMON) 6591 ? 
NULL : &bpf_sys_bpf_proto; 6592 case BPF_FUNC_btf_find_by_name_kind: 6593 return &bpf_btf_find_by_name_kind_proto; 6594 case BPF_FUNC_sys_close: 6595 return &bpf_sys_close_proto; 6596 case BPF_FUNC_kallsyms_lookup_name: 6597 return &bpf_kallsyms_lookup_name_proto; 6598 default: 6599 return tracing_prog_func_proto(func_id, prog); 6600 } 6601 } 6602 6603 const struct bpf_verifier_ops bpf_syscall_verifier_ops = { 6604 .get_func_proto = syscall_prog_func_proto, 6605 .is_valid_access = syscall_prog_is_valid_access, 6606 }; 6607 6608 const struct bpf_prog_ops bpf_syscall_prog_ops = { 6609 .test_run = bpf_prog_test_run_syscall, 6610 }; 6611 6612 #ifdef CONFIG_SYSCTL 6613 static int bpf_stats_handler(const struct ctl_table *table, int write, 6614 void *buffer, size_t *lenp, loff_t *ppos) 6615 { 6616 struct static_key *key = (struct static_key *)table->data; 6617 static int saved_val; 6618 int val, ret; 6619 struct ctl_table tmp = { 6620 .data = &val, 6621 .maxlen = sizeof(val), 6622 .mode = table->mode, 6623 .extra1 = SYSCTL_ZERO, 6624 .extra2 = SYSCTL_ONE, 6625 }; 6626 6627 if (write && !capable(CAP_SYS_ADMIN)) 6628 return -EPERM; 6629 6630 mutex_lock(&bpf_stats_enabled_mutex); 6631 val = saved_val; 6632 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 6633 if (write && !ret && val != saved_val) { 6634 if (val) 6635 static_key_slow_inc(key); 6636 else 6637 static_key_slow_dec(key); 6638 saved_val = val; 6639 } 6640 mutex_unlock(&bpf_stats_enabled_mutex); 6641 return ret; 6642 } 6643 6644 void __weak unpriv_ebpf_notify(int new_state) 6645 { 6646 } 6647 6648 static int bpf_unpriv_handler(const struct ctl_table *table, int write, 6649 void *buffer, size_t *lenp, loff_t *ppos) 6650 { 6651 int ret, unpriv_enable = *(int *)table->data; 6652 bool locked_state = unpriv_enable == 1; 6653 struct ctl_table tmp = *table; 6654 6655 if (write && !capable(CAP_SYS_ADMIN)) 6656 return -EPERM; 6657 6658 tmp.data = &unpriv_enable; 6659 ret = proc_dointvec_minmax(&tmp, write, 
buffer, lenp, ppos); 6660 if (write && !ret) { 6661 if (locked_state && unpriv_enable != 1) 6662 return -EPERM; 6663 *(int *)table->data = unpriv_enable; 6664 } 6665 6666 if (write) 6667 unpriv_ebpf_notify(unpriv_enable); 6668 6669 return ret; 6670 } 6671 6672 static const struct ctl_table bpf_syscall_table[] = { 6673 { 6674 .procname = "unprivileged_bpf_disabled", 6675 .data = &sysctl_unprivileged_bpf_disabled, 6676 .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), 6677 .mode = 0644, 6678 .proc_handler = bpf_unpriv_handler, 6679 .extra1 = SYSCTL_ZERO, 6680 .extra2 = SYSCTL_TWO, 6681 }, 6682 { 6683 .procname = "bpf_stats_enabled", 6684 .data = &bpf_stats_enabled_key.key, 6685 .mode = 0644, 6686 .proc_handler = bpf_stats_handler, 6687 }, 6688 }; 6689 6690 static int __init bpf_syscall_sysctl_init(void) 6691 { 6692 register_sysctl_init("kernel", bpf_syscall_table); 6693 return 0; 6694 } 6695 late_initcall(bpf_syscall_sysctl_init); 6696 #endif /* CONFIG_SYSCTL */ 6697