1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 3 */ 4 #include <crypto/sha2.h> 5 #include <linux/bpf.h> 6 #include <linux/bpf-cgroup.h> 7 #include <linux/bpf_trace.h> 8 #include <linux/bpf_lirc.h> 9 #include <linux/bpf_verifier.h> 10 #include <linux/bsearch.h> 11 #include <linux/btf.h> 12 #include <linux/hex.h> 13 #include <linux/syscalls.h> 14 #include <linux/slab.h> 15 #include <linux/sched/signal.h> 16 #include <linux/vmalloc.h> 17 #include <linux/mmzone.h> 18 #include <linux/anon_inodes.h> 19 #include <linux/fdtable.h> 20 #include <linux/file.h> 21 #include <linux/fs.h> 22 #include <linux/license.h> 23 #include <linux/filter.h> 24 #include <linux/kernel.h> 25 #include <linux/idr.h> 26 #include <linux/cred.h> 27 #include <linux/timekeeping.h> 28 #include <linux/ctype.h> 29 #include <linux/nospec.h> 30 #include <linux/audit.h> 31 #include <uapi/linux/btf.h> 32 #include <linux/pgtable.h> 33 #include <linux/bpf_lsm.h> 34 #include <linux/poll.h> 35 #include <linux/sort.h> 36 #include <linux/bpf-netns.h> 37 #include <linux/rcupdate_trace.h> 38 #include <linux/memcontrol.h> 39 #include <linux/trace_events.h> 40 #include <linux/tracepoint.h> 41 #include <linux/overflow.h> 42 #include <linux/cookie.h> 43 #include <linux/verification.h> 44 #include <linux/btf_ids.h> 45 46 #include <net/netfilter/nf_bpf_link.h> 47 #include <net/netkit.h> 48 #include <net/tcx.h> 49 50 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ 51 (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ 52 (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) 53 #define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) 54 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) 55 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ 56 IS_FD_HASH(map)) 57 58 #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) 59 60 DEFINE_PER_CPU(int, bpf_prog_active); 61 DEFINE_COOKIE(bpf_map_cookie); 62 static DEFINE_IDR(prog_idr); 63 static DEFINE_SPINLOCK(prog_idr_lock); 64 static DEFINE_IDR(map_idr); 65 static DEFINE_SPINLOCK(map_idr_lock); 66 static DEFINE_IDR(link_idr); 67 static DEFINE_SPINLOCK(link_idr_lock); 68 69 int sysctl_unprivileged_bpf_disabled __read_mostly = 70 IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; 71 72 static const struct bpf_map_ops * const bpf_map_types[] = { 73 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) 74 #define BPF_MAP_TYPE(_id, _ops) \ 75 [_id] = &_ops, 76 #define BPF_LINK_TYPE(_id, _name) 77 #include <linux/bpf_types.h> 78 #undef BPF_PROG_TYPE 79 #undef BPF_MAP_TYPE 80 #undef BPF_LINK_TYPE 81 }; 82 83 /* 84 * If we're handed a bigger struct than we know of, ensure all the unknown bits 85 * are 0 - i.e. new user-space does not rely on any kernel feature extensions 86 * we don't know about yet. 87 * 88 * There is a ToCToU between this function call and the following 89 * copy_from_user() call. However, this is not a concern since this function is 90 * meant to be a future-proofing of bits. 91 */ 92 int bpf_check_uarg_tail_zero(bpfptr_t uaddr, 93 size_t expected_size, 94 size_t actual_size) 95 { 96 int res; 97 98 if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ 99 return -E2BIG; 100 101 if (actual_size <= expected_size) 102 return 0; 103 104 if (uaddr.is_kernel) 105 res = memchr_inv(uaddr.kernel + expected_size, 0, 106 actual_size - expected_size) == NULL; 107 else 108 res = check_zeroed_user(uaddr.user + expected_size, 109 actual_size - expected_size); 110 if (res < 0) 111 return res; 112 return res ? 0 : -E2BIG; 113 } 114 115 const struct bpf_map_ops bpf_map_offload_ops = { 116 .map_meta_equal = bpf_map_meta_equal, 117 .map_alloc = bpf_map_offload_map_alloc, 118 .map_free = bpf_map_offload_map_free, 119 .map_check_btf = map_check_no_btf, 120 .map_mem_usage = bpf_map_offload_map_mem_usage, 121 }; 122 123 static void bpf_map_write_active_inc(struct bpf_map *map) 124 { 125 atomic64_inc(&map->writecnt); 126 } 127 128 static void bpf_map_write_active_dec(struct bpf_map *map) 129 { 130 atomic64_dec(&map->writecnt); 131 } 132 133 bool bpf_map_write_active(const struct bpf_map *map) 134 { 135 return atomic64_read(&map->writecnt) != 0; 136 } 137 138 static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) 139 { 140 if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) 141 return map->value_size; 142 else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 143 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 144 map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || 145 map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 146 return round_up(map->value_size, 8) * num_possible_cpus(); 147 else if (IS_FD_MAP(map)) 148 return sizeof(u32); 149 else 150 return map->value_size; 151 } 152 153 static void maybe_wait_bpf_programs(struct bpf_map *map) 154 { 155 /* Wait for any running non-sleepable BPF programs to complete so that 156 * userspace, when we return to it, knows that all non-sleepable 157 * programs that could be running use the new map value. For sleepable 158 * BPF programs, synchronize_rcu_tasks_trace() should be used to wait 159 * for the completions of these programs, but considering the waiting 160 * time can be very long and userspace may think it will hang forever, 161 * so don't handle sleepable BPF programs now. 162 */ 163 if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || 164 map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) 165 synchronize_rcu_expedited(); 166 } 167 168 static void unpin_uptr_kaddr(void *kaddr) 169 { 170 if (kaddr) 171 unpin_user_page(virt_to_page(kaddr)); 172 } 173 174 static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) 175 { 176 const struct btf_field *field; 177 void **uptr_addr; 178 int i; 179 180 for (i = 0, field = rec->fields; i < cnt; i++, field++) { 181 if (field->type != BPF_UPTR) 182 continue; 183 184 uptr_addr = obj + field->offset; 185 unpin_uptr_kaddr(*uptr_addr); 186 } 187 } 188 189 static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) 190 { 191 if (!btf_record_has_field(rec, BPF_UPTR)) 192 return; 193 194 __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); 195 } 196 197 static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) 198 { 199 const struct btf_field *field; 200 const struct btf_type *t; 201 unsigned long start, end; 202 struct page *page; 203 void **uptr_addr; 204 int i, err; 205 206 if (!btf_record_has_field(rec, BPF_UPTR)) 207 return 0; 208 209 for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { 210 if (field->type != BPF_UPTR) 211 continue; 212 213 uptr_addr = obj + field->offset; 214 start = *(unsigned long *)uptr_addr; 215 if (!start) 216 continue; 217 218 t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); 219 /* t->size was checked for zero before */ 220 if (check_add_overflow(start, t->size - 1, &end)) { 221 err = -EFAULT; 222 goto unpin_all; 223 } 224 225 /* The uptr's struct cannot span across two pages */ 226 if ((start & PAGE_MASK) != (end & PAGE_MASK)) { 227 err = -EOPNOTSUPP; 228 goto unpin_all; 229 } 230 231 err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); 232 if (err != 1) 233 goto unpin_all; 234 235 if (PageHighMem(page)) { 236 err = -EOPNOTSUPP; 237 unpin_user_page(page); 238 goto unpin_all; 239 } 240 241 *uptr_addr = page_address(page) + offset_in_page(start); 242 } 243 244 return 0; 245 246 unpin_all: 247 __bpf_obj_unpin_uptrs(rec, i, obj); 248 return err; 249 } 250 251 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, 252 void *key, void *value, __u64 flags) 253 { 254 int err; 255 256 /* Need to create a kthread, thus must support schedule */ 257 if (bpf_map_is_offloaded(map)) { 258 return bpf_map_offload_update_elem(map, key, value, flags); 259 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || 260 map->map_type == BPF_MAP_TYPE_ARENA || 261 map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 262 return map->ops->map_update_elem(map, key, value, flags); 263 } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || 264 map->map_type == BPF_MAP_TYPE_SOCKMAP) { 265 return sock_map_update_elem_sys(map, key, value, flags); 266 } else if (IS_FD_PROG_ARRAY(map)) { 267 return bpf_fd_array_map_update_elem(map, map_file, key, value, 268 flags); 269 } 270 271 bpf_disable_instrumentation(); 272 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 273 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 274 err = bpf_percpu_hash_update(map, key, value, flags); 275 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 276 err = bpf_percpu_array_update(map, key, value, flags); 277 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { 278 err = bpf_percpu_cgroup_storage_update(map, key, value, 279 flags); 280 } else if (IS_FD_ARRAY(map)) { 281 err = bpf_fd_array_map_update_elem(map, map_file, key, value, 282 flags); 283 } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { 284 err = bpf_fd_htab_map_update_elem(map, map_file, key, value, 285 flags); 286 } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 287 /* rcu_read_lock() is not needed */ 288 err = bpf_fd_reuseport_array_update_elem(map, key, value, 289 flags); 290 } else if (map->map_type == BPF_MAP_TYPE_QUEUE || 291 map->map_type == BPF_MAP_TYPE_STACK || 292 map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { 293 err = map->ops->map_push_elem(map, value, flags); 294 } else { 295 err = bpf_obj_pin_uptrs(map->record, value); 296 if (!err) { 297 rcu_read_lock(); 298 err = map->ops->map_update_elem(map, key, value, flags); 299 rcu_read_unlock(); 300 if (err) 301 bpf_obj_unpin_uptrs(map->record, value); 302 } 303 } 304 bpf_enable_instrumentation(); 305 306 return err; 307 } 308 309 static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, 310 __u64 flags) 311 { 312 void *ptr; 313 int err; 314 315 if (bpf_map_is_offloaded(map)) 316 return bpf_map_offload_lookup_elem(map, key, value); 317 318 bpf_disable_instrumentation(); 319 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 320 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 321 err = bpf_percpu_hash_copy(map, key, value, flags); 322 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 323 err = bpf_percpu_array_copy(map, key, value, flags); 324 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { 325 err = bpf_percpu_cgroup_storage_copy(map, key, value, flags); 326 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 327 err = bpf_stackmap_extract(map, key, value, false); 328 } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { 329 err = bpf_fd_array_map_lookup_elem(map, key, value); 330 } else if (IS_FD_HASH(map)) { 331 err = bpf_fd_htab_map_lookup_elem(map, key, value); 332 } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 333 err = bpf_fd_reuseport_array_lookup_elem(map, key, value); 334 } else if (map->map_type == BPF_MAP_TYPE_QUEUE || 335 map->map_type == BPF_MAP_TYPE_STACK || 336 map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { 337 err = map->ops->map_peek_elem(map, value); 338 } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 339 /* struct_ops map requires directly updating "value" */ 340 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); 341 } else { 342 rcu_read_lock(); 343 if (map->ops->map_lookup_elem_sys_only) 344 ptr = map->ops->map_lookup_elem_sys_only(map, key); 345 else 346 ptr = map->ops->map_lookup_elem(map, key); 347 if (IS_ERR(ptr)) { 348 err = PTR_ERR(ptr); 349 } else if (!ptr) { 350 err = -ENOENT; 351 } else { 352 err = 0; 353 if (flags & BPF_F_LOCK) 354 /* lock 'ptr' and copy everything but lock */ 355 copy_map_value_locked(map, value, ptr, true); 356 else 357 copy_map_value(map, value, ptr); 358 /* mask lock and timer, since value wasn't zero inited */ 359 check_and_init_map_value(map, value); 360 } 361 rcu_read_unlock(); 362 } 363 364 bpf_enable_instrumentation(); 365 366 return err; 367 } 368 369 /* Please, do not use this function outside from the map creation path 370 * (e.g. in map update path) without taking care of setting the active 371 * memory cgroup (see at bpf_map_kmalloc_node() for example). 372 */ 373 static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) 374 { 375 /* We really just want to fail instead of triggering OOM killer 376 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, 377 * which is used for lower order allocation requests. 378 * 379 * It has been observed that higher order allocation requests done by 380 * vmalloc with __GFP_NORETRY being set might fail due to not trying 381 * to reclaim memory from the page cache, thus we set 382 * __GFP_RETRY_MAYFAIL to avoid such situations. 383 */ 384 385 gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO); 386 unsigned int flags = 0; 387 unsigned long align = 1; 388 void *area; 389 390 if (size >= SIZE_MAX) 391 return NULL; 392 393 /* kmalloc()'ed memory can't be mmap()'ed */ 394 if (mmapable) { 395 BUG_ON(!PAGE_ALIGNED(size)); 396 align = SHMLBA; 397 flags = VM_USERMAP; 398 } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 399 area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, 400 numa_node); 401 if (area != NULL) 402 return area; 403 } 404 405 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 406 gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, 407 flags, numa_node, __builtin_return_address(0)); 408 } 409 410 void *bpf_map_area_alloc(u64 size, int numa_node) 411 { 412 return __bpf_map_area_alloc(size, numa_node, false); 413 } 414 415 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) 416 { 417 return __bpf_map_area_alloc(size, numa_node, true); 418 } 419 420 void bpf_map_area_free(void *area) 421 { 422 kvfree(area); 423 } 424 425 static u32 bpf_map_flags_retain_permanent(u32 flags) 426 { 427 /* Some map creation flags are not tied to the map object but 428 * rather to the map fd instead, so they have no meaning upon 429 * map object inspection since multiple file descriptors with 430 * different (access) properties can exist here. Thus, given 431 * this has zero meaning for the map itself, lets clear these 432 * from here. 433 */ 434 return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); 435 } 436 437 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) 438 { 439 map->map_type = attr->map_type; 440 map->key_size = attr->key_size; 441 map->value_size = attr->value_size; 442 map->max_entries = attr->max_entries; 443 map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); 444 map->numa_node = bpf_map_attr_numa_node(attr); 445 map->map_extra = attr->map_extra; 446 } 447 448 static int bpf_map_alloc_id(struct bpf_map *map) 449 { 450 int id; 451 452 idr_preload(GFP_KERNEL); 453 spin_lock_bh(&map_idr_lock); 454 id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); 455 if (id > 0) 456 map->id = id; 457 spin_unlock_bh(&map_idr_lock); 458 idr_preload_end(); 459 460 if (WARN_ON_ONCE(!id)) 461 return -ENOSPC; 462 463 return id > 0 ? 0 : id; 464 } 465 466 void bpf_map_free_id(struct bpf_map *map) 467 { 468 unsigned long flags; 469 470 /* Offloaded maps are removed from the IDR store when their device 471 * disappears - even if someone holds an fd to them they are unusable, 472 * the memory is gone, all ops will fail; they are simply waiting for 473 * refcnt to drop to be freed. 474 */ 475 if (!map->id) 476 return; 477 478 spin_lock_irqsave(&map_idr_lock, flags); 479 480 idr_remove(&map_idr, map->id); 481 map->id = 0; 482 483 spin_unlock_irqrestore(&map_idr_lock, flags); 484 } 485 486 #ifdef CONFIG_MEMCG 487 static void bpf_map_save_memcg(struct bpf_map *map) 488 { 489 /* Currently if a map is created by a process belonging to the root 490 * memory cgroup, get_obj_cgroup_from_current() will return NULL. 491 * So we have to check map->objcg for being NULL each time it's 492 * being used. 493 */ 494 if (memcg_bpf_enabled()) 495 map->objcg = get_obj_cgroup_from_current(); 496 } 497 498 static void bpf_map_release_memcg(struct bpf_map *map) 499 { 500 if (map->objcg) 501 obj_cgroup_put(map->objcg); 502 } 503 504 static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) 505 { 506 if (map->objcg) 507 return get_mem_cgroup_from_objcg(map->objcg); 508 509 return root_mem_cgroup; 510 } 511 512 void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, 513 struct mem_cgroup **new_memcg) 514 { 515 *new_memcg = bpf_map_get_memcg(map); 516 *old_memcg = set_active_memcg(*new_memcg); 517 } 518 519 void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, 520 struct mem_cgroup *new_memcg) 521 { 522 set_active_memcg(old_memcg); 523 mem_cgroup_put(new_memcg); 524 } 525 526 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, 527 int node) 528 { 529 struct mem_cgroup *memcg, *old_memcg; 530 void *ptr; 531 532 bpf_map_memcg_enter(map, &old_memcg, &memcg); 533 ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); 534 bpf_map_memcg_exit(old_memcg, memcg); 535 536 return ptr; 537 } 538 539 void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, 540 int node) 541 { 542 struct mem_cgroup *memcg, *old_memcg; 543 void *ptr; 544 545 bpf_map_memcg_enter(map, &old_memcg, &memcg); 546 ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); 547 bpf_map_memcg_exit(old_memcg, memcg); 548 549 return ptr; 550 } 551 552 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) 553 { 554 struct mem_cgroup *memcg, *old_memcg; 555 void *ptr; 556 557 bpf_map_memcg_enter(map, &old_memcg, &memcg); 558 ptr = kzalloc(size, flags | __GFP_ACCOUNT); 559 bpf_map_memcg_exit(old_memcg, memcg); 560 561 return ptr; 562 } 563 564 void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, 565 gfp_t flags) 566 { 567 struct mem_cgroup *memcg, *old_memcg; 568 void *ptr; 569 570 bpf_map_memcg_enter(map, &old_memcg, &memcg); 571 ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); 572 bpf_map_memcg_exit(old_memcg, memcg); 573 574 return ptr; 575 } 576 577 void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, 578 size_t align, gfp_t flags) 579 { 580 struct mem_cgroup *memcg, *old_memcg; 581 void __percpu *ptr; 582 583 bpf_map_memcg_enter(map, &old_memcg, &memcg); 584 ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); 585 bpf_map_memcg_exit(old_memcg, memcg); 586 587 return ptr; 588 } 589 590 #else 591 static void bpf_map_save_memcg(struct bpf_map *map) 592 { 593 } 594 595 static void bpf_map_release_memcg(struct bpf_map *map) 596 { 597 } 598 #endif 599 600 static bool can_alloc_pages(void) 601 { 602 return preempt_count() == 0 && !irqs_disabled() && 603 !IS_ENABLED(CONFIG_PREEMPT_RT); 604 } 605 606 static struct page *__bpf_alloc_page(int nid) 607 { 608 if (!can_alloc_pages()) 609 return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); 610 611 return alloc_pages_node(nid, 612 GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT 613 | __GFP_NOWARN, 614 0); 615 } 616 617 int bpf_map_alloc_pages(const struct bpf_map *map, int nid, 618 unsigned long nr_pages, struct page **pages) 619 { 620 unsigned long i, j; 621 struct page *pg; 622 int ret = 0; 623 624 for (i = 0; i < nr_pages; i++) { 625 pg = __bpf_alloc_page(nid); 626 627 if (pg) { 628 pages[i] = pg; 629 continue; 630 } 631 for (j = 0; j < i; j++) 632 free_pages_nolock(pages[j], 0); 633 ret = -ENOMEM; 634 break; 635 } 636 637 return ret; 638 } 639 640 641 static int btf_field_cmp(const void *a, const void *b) 642 { 643 const struct btf_field *f1 = a, *f2 = b; 644 645 if (f1->offset < f2->offset) 646 return -1; 647 else if (f1->offset > f2->offset) 648 return 1; 649 return 0; 650 } 651 652 struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, 653 u32 field_mask) 654 { 655 struct btf_field *field; 656 657 if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask)) 658 return NULL; 659 field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); 660 if (!field || !(field->type & field_mask)) 661 return NULL; 662 return field; 663 } 664 665 void btf_record_free(struct btf_record *rec) 666 { 667 int i; 668 669 if (IS_ERR_OR_NULL(rec)) 670 return; 671 for (i = 0; i < rec->cnt; i++) { 672 switch (rec->fields[i].type) { 673 case BPF_KPTR_UNREF: 674 case BPF_KPTR_REF: 675 case BPF_KPTR_PERCPU: 676 case BPF_UPTR: 677 if (rec->fields[i].kptr.module) 678 module_put(rec->fields[i].kptr.module); 679 if (btf_is_kernel(rec->fields[i].kptr.btf)) 680 btf_put(rec->fields[i].kptr.btf); 681 break; 682 case BPF_LIST_HEAD: 683 case BPF_LIST_NODE: 684 case BPF_RB_ROOT: 685 case BPF_RB_NODE: 686 case BPF_SPIN_LOCK: 687 case BPF_RES_SPIN_LOCK: 688 case BPF_TIMER: 689 case BPF_REFCOUNT: 690 case BPF_WORKQUEUE: 691 case BPF_TASK_WORK: 692 /* Nothing to release */ 693 break; 694 default: 695 WARN_ON_ONCE(1); 696 continue; 697 } 698 } 699 kfree(rec); 700 } 701 702 void bpf_map_free_record(struct bpf_map *map) 703 { 704 btf_record_free(map->record); 705 map->record = NULL; 706 } 707 708 struct btf_record *btf_record_dup(const struct btf_record *rec) 709 { 710 const struct btf_field *fields; 711 struct btf_record *new_rec; 712 int ret, size, i; 713 714 if (IS_ERR_OR_NULL(rec)) 715 return NULL; 716 size = struct_size(rec, fields, rec->cnt); 717 new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); 718 if (!new_rec) 719 return ERR_PTR(-ENOMEM); 720 /* Do a deep copy of the btf_record */ 721 fields = rec->fields; 722 new_rec->cnt = 0; 723 for (i = 0; i < rec->cnt; i++) { 724 switch (fields[i].type) { 725 case BPF_KPTR_UNREF: 726 case BPF_KPTR_REF: 727 case BPF_KPTR_PERCPU: 728 case BPF_UPTR: 729 if (btf_is_kernel(fields[i].kptr.btf)) 730 btf_get(fields[i].kptr.btf); 731 if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { 732 ret = -ENXIO; 733 goto free; 734 } 735 break; 736 case BPF_LIST_HEAD: 737 case BPF_LIST_NODE: 738 case BPF_RB_ROOT: 739 case BPF_RB_NODE: 740 case BPF_SPIN_LOCK: 741 case BPF_RES_SPIN_LOCK: 742 case BPF_TIMER: 743 case BPF_REFCOUNT: 744 case BPF_WORKQUEUE: 745 case BPF_TASK_WORK: 746 /* Nothing to acquire */ 747 break; 748 default: 749 ret = -EFAULT; 750 WARN_ON_ONCE(1); 751 goto free; 752 } 753 new_rec->cnt++; 754 } 755 return new_rec; 756 free: 757 btf_record_free(new_rec); 758 return ERR_PTR(ret); 759 } 760 761 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) 762 { 763 bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); 764 int size; 765 766 if (!a_has_fields && !b_has_fields) 767 return true; 768 if (a_has_fields != b_has_fields) 769 return false; 770 if (rec_a->cnt != rec_b->cnt) 771 return false; 772 size = struct_size(rec_a, fields, rec_a->cnt); 773 /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused 774 * members are zeroed out. So memcmp is safe to do without worrying 775 * about padding/unused fields. 776 * 777 * While spin_lock, timer, and kptr have no relation to map BTF, 778 * list_head metadata is specific to map BTF, the btf and value_rec 779 * members in particular. btf is the map BTF, while value_rec points to 780 * btf_record in that map BTF. 781 * 782 * So while by default, we don't rely on the map BTF (which the records 783 * were parsed from) matching for both records, which is not backwards 784 * compatible, in case list_head is part of it, we implicitly rely on 785 * that by way of depending on memcmp succeeding for it. 786 */ 787 return !memcmp(rec_a, rec_b, size); 788 } 789 790 void bpf_obj_free_timer(const struct btf_record *rec, void *obj) 791 { 792 if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) 793 return; 794 bpf_timer_cancel_and_free(obj + rec->timer_off); 795 } 796 797 void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) 798 { 799 if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) 800 return; 801 bpf_wq_cancel_and_free(obj + rec->wq_off); 802 } 803 804 void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) 805 { 806 if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) 807 return; 808 bpf_task_work_cancel_and_free(obj + rec->task_work_off); 809 } 810 811 void bpf_obj_cancel_fields(struct bpf_map *map, void *obj) 812 { 813 bpf_map_free_internal_structs(map, obj); 814 } 815 816 void bpf_obj_free_fields(const struct btf_record *rec, void *obj) 817 { 818 const struct btf_field *fields; 819 int i; 820 821 if (IS_ERR_OR_NULL(rec)) 822 return; 823 fields = rec->fields; 824 for (i = 0; i < rec->cnt; i++) { 825 struct btf_struct_meta *pointee_struct_meta; 826 const struct btf_field *field = &fields[i]; 827 void *field_ptr = obj + field->offset; 828 void *xchgd_field; 829 830 switch (fields[i].type) { 831 case BPF_SPIN_LOCK: 832 case BPF_RES_SPIN_LOCK: 833 break; 834 case BPF_TIMER: 835 bpf_timer_cancel_and_free(field_ptr); 836 break; 837 case BPF_WORKQUEUE: 838 bpf_wq_cancel_and_free(field_ptr); 839 break; 840 case BPF_TASK_WORK: 841 bpf_task_work_cancel_and_free(field_ptr); 842 break; 843 case BPF_KPTR_UNREF: 844 WRITE_ONCE(*(u64 *)field_ptr, 0); 845 break; 846 case BPF_KPTR_REF: 847 case BPF_KPTR_PERCPU: 848 xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); 849 if (!xchgd_field) 850 break; 851 852 if (!btf_is_kernel(field->kptr.btf)) { 853 pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, 854 field->kptr.btf_id); 855 __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 856 pointee_struct_meta->record : NULL, 857 fields[i].type == BPF_KPTR_PERCPU); 858 } else { 859 field->kptr.dtor(xchgd_field); 860 } 861 break; 862 case BPF_UPTR: 863 /* The caller ensured that no one is using the uptr */ 864 unpin_uptr_kaddr(*(void **)field_ptr); 865 break; 866 case BPF_LIST_HEAD: 867 if (WARN_ON_ONCE(rec->spin_lock_off < 0)) 868 continue; 869 bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); 870 break; 871 case BPF_RB_ROOT: 872 if (WARN_ON_ONCE(rec->spin_lock_off < 0)) 873 continue; 874 bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); 875 break; 876 case BPF_LIST_NODE: 877 case BPF_RB_NODE: 878 case BPF_REFCOUNT: 879 break; 880 default: 881 WARN_ON_ONCE(1); 882 continue; 883 } 884 } 885 } 886 887 static void bpf_map_free(struct bpf_map *map) 888 { 889 struct btf_record *rec = map->record; 890 struct btf *btf = map->btf; 891 892 /* implementation dependent freeing. Disabling migration to simplify 893 * the free of values or special fields allocated from bpf memory 894 * allocator. 895 */ 896 kfree(map->excl_prog_sha); 897 migrate_disable(); 898 map->ops->map_free(map); 899 migrate_enable(); 900 901 /* Delay freeing of btf_record for maps, as map_free 902 * callback usually needs access to them. It is better to do it here 903 * than require each callback to do the free itself manually. 904 * 905 * Note that the btf_record stashed in map->inner_map_meta->record was 906 * already freed using the map_free callback for map in map case which 907 * eventually calls bpf_map_free_meta, since inner_map_meta is only a 908 * template bpf_map struct used during verification. 909 */ 910 btf_record_free(rec); 911 /* Delay freeing of btf for maps, as map_free callback may need 912 * struct_meta info which will be freed with btf_put(). 913 */ 914 btf_put(btf); 915 } 916 917 /* called from workqueue */ 918 static void bpf_map_free_deferred(struct work_struct *work) 919 { 920 struct bpf_map *map = container_of(work, struct bpf_map, work); 921 922 security_bpf_map_free(map); 923 bpf_map_release_memcg(map); 924 bpf_map_owner_free(map); 925 bpf_map_free(map); 926 } 927 928 static void bpf_map_put_uref(struct bpf_map *map) 929 { 930 if (atomic64_dec_and_test(&map->usercnt)) { 931 if (map->ops->map_release_uref) 932 map->ops->map_release_uref(map); 933 } 934 } 935 936 static void bpf_map_free_in_work(struct bpf_map *map) 937 { 938 INIT_WORK(&map->work, bpf_map_free_deferred); 939 /* Avoid spawning kworkers, since they all might contend 940 * for the same mutex like slab_mutex. 941 */ 942 queue_work(system_dfl_wq, &map->work); 943 } 944 945 static void bpf_map_free_rcu_gp(struct rcu_head *rcu) 946 { 947 bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); 948 } 949 950 /* decrement map refcnt and schedule it for freeing via workqueue 951 * (underlying map implementation ops->map_free() might sleep) 952 */ 953 void bpf_map_put(struct bpf_map *map) 954 { 955 if (atomic64_dec_and_test(&map->refcnt)) { 956 /* bpf_map_free_id() must be called first */ 957 bpf_map_free_id(map); 958 959 WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); 960 /* RCU tasks trace grace period implies RCU grace period. */ 961 if (READ_ONCE(map->free_after_mult_rcu_gp)) 962 call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_gp); 963 else if (READ_ONCE(map->free_after_rcu_gp)) 964 call_rcu(&map->rcu, bpf_map_free_rcu_gp); 965 else 966 bpf_map_free_in_work(map); 967 } 968 } 969 EXPORT_SYMBOL_GPL(bpf_map_put); 970 971 void bpf_map_put_with_uref(struct bpf_map *map) 972 { 973 bpf_map_put_uref(map); 974 bpf_map_put(map); 975 } 976 977 static int bpf_map_release(struct inode *inode, struct file *filp) 978 { 979 struct bpf_map *map = filp->private_data; 980 981 if (map->ops->map_release) 982 map->ops->map_release(map, filp); 983 984 bpf_map_put_with_uref(map); 985 return 0; 986 } 987 988 static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) 989 { 990 fmode_t mode = fd_file(f)->f_mode; 991 992 /* Our file permissions may have been overridden by global 993 * map permissions facing syscall side. 994 */ 995 if (READ_ONCE(map->frozen)) 996 mode &= ~FMODE_CAN_WRITE; 997 return mode; 998 } 999 1000 #ifdef CONFIG_PROC_FS 1001 /* Show the memory usage of a bpf map */ 1002 static u64 bpf_map_memory_usage(const struct bpf_map *map) 1003 { 1004 return map->ops->map_mem_usage(map); 1005 } 1006 1007 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) 1008 { 1009 struct bpf_map *map = filp->private_data; 1010 u32 type = 0, jited = 0; 1011 1012 spin_lock(&map->owner_lock); 1013 if (map->owner) { 1014 type = map->owner->type; 1015 jited = map->owner->jited; 1016 } 1017 spin_unlock(&map->owner_lock); 1018 1019 seq_printf(m, 1020 "map_type:\t%u\n" 1021 "key_size:\t%u\n" 1022 "value_size:\t%u\n" 1023 "max_entries:\t%u\n" 1024 "map_flags:\t%#x\n" 1025 "map_extra:\t%#llx\n" 1026 "memlock:\t%llu\n" 1027 "map_id:\t%u\n" 1028 "frozen:\t%u\n", 1029 map->map_type, 1030 map->key_size, 1031 map->value_size, 1032 map->max_entries, 1033 map->map_flags, 1034 (unsigned long long)map->map_extra, 1035 bpf_map_memory_usage(map), 1036 map->id, 1037 READ_ONCE(map->frozen)); 1038 if (type) { 1039 seq_printf(m, "owner_prog_type:\t%u\n", type); 1040 seq_printf(m, "owner_jited:\t%u\n", jited); 1041 } 1042 } 1043 #endif 1044 1045 static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, 1046 loff_t *ppos) 1047 { 1048 /* We need this handler such that alloc_file() enables 1049 * f_mode with FMODE_CAN_READ. 1050 */ 1051 return -EINVAL; 1052 } 1053 1054 static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, 1055 size_t siz, loff_t *ppos) 1056 { 1057 /* We need this handler such that alloc_file() enables 1058 * f_mode with FMODE_CAN_WRITE. 1059 */ 1060 return -EINVAL; 1061 } 1062 1063 /* called for any extra memory-mapped regions (except initial) */ 1064 static void bpf_map_mmap_open(struct vm_area_struct *vma) 1065 { 1066 struct bpf_map *map = vma->vm_file->private_data; 1067 1068 if (vma->vm_flags & VM_MAYWRITE) 1069 bpf_map_write_active_inc(map); 1070 } 1071 1072 /* called for all unmapped memory region (including initial) */ 1073 static void bpf_map_mmap_close(struct vm_area_struct *vma) 1074 { 1075 struct bpf_map *map = vma->vm_file->private_data; 1076 1077 if (vma->vm_flags & VM_MAYWRITE) 1078 bpf_map_write_active_dec(map); 1079 } 1080 1081 static const struct vm_operations_struct bpf_map_default_vmops = { 1082 .open = bpf_map_mmap_open, 1083 .close = bpf_map_mmap_close, 1084 }; 1085 1086 static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) 1087 { 1088 struct bpf_map *map = filp->private_data; 1089 int err = 0; 1090 1091 if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) 1092 return -ENOTSUPP; 1093 1094 if (!(vma->vm_flags & VM_SHARED)) 1095 return -EINVAL; 1096 1097 mutex_lock(&map->freeze_mutex); 1098 1099 if (vma->vm_flags & VM_WRITE) { 1100 if (map->frozen) { 1101 err = -EPERM; 1102 goto out; 1103 } 1104 /* map is meant to be read-only, so do not allow mapping as 1105 * writable, because it's possible to leak a writable page 1106 * reference and allows user-space to still modify it after 1107 * freezing, while verifier will assume contents do not change 1108 */ 1109 if (map->map_flags & BPF_F_RDONLY_PROG) { 1110 err = -EACCES; 1111 goto out; 1112 } 1113 bpf_map_write_active_inc(map); 1114 } 1115 out: 1116 mutex_unlock(&map->freeze_mutex); 1117 if (err) 1118 return err; 1119 1120 /* set default open/close callbacks */ 1121 vma->vm_ops = &bpf_map_default_vmops; 1122 vma->vm_private_data = map; 1123 vm_flags_clear(vma, VM_MAYEXEC); 1124 /* If mapping is read-only, then disallow potentially re-mapping with 1125 * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing 1126 * means that as far as BPF map's memory-mapped VMAs are concerned, 1127 * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set, 1128 * both should be set, so we can forget about VM_MAYWRITE and always 1129 * check just VM_WRITE 1130 */ 1131 if (!(vma->vm_flags & VM_WRITE)) 1132 vm_flags_clear(vma, VM_MAYWRITE); 1133 1134 err = map->ops->map_mmap(map, vma); 1135 if (err) { 1136 if (vma->vm_flags & VM_WRITE) 1137 bpf_map_write_active_dec(map); 1138 } 1139 1140 return err; 1141 } 1142 1143 static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) 1144 { 1145 struct bpf_map *map = filp->private_data; 1146 1147 if (map->ops->map_poll) 1148 return map->ops->map_poll(map, filp, pts); 1149 1150 return EPOLLERR; 1151 } 1152 1153 static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr, 1154 unsigned long len, unsigned long pgoff, 1155 unsigned long flags) 1156 { 1157 struct bpf_map *map = filp->private_data; 1158 1159 if (map->ops->map_get_unmapped_area) 1160 return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); 1161 #ifdef CONFIG_MMU 1162 return mm_get_unmapped_area(filp, addr, len, pgoff, flags); 1163 #else 1164 return addr; 1165 #endif 1166 } 1167 1168 const struct file_operations bpf_map_fops = { 1169 #ifdef CONFIG_PROC_FS 1170 .show_fdinfo = bpf_map_show_fdinfo, 1171 #endif 1172 .release = bpf_map_release, 1173 .read = bpf_dummy_read, 1174 .write = bpf_dummy_write, 1175 .mmap = bpf_map_mmap, 1176 .poll = bpf_map_poll, 1177 .get_unmapped_area = bpf_get_unmapped_area, 1178 }; 1179 1180 int bpf_map_new_fd(struct bpf_map *map, int flags) 1181 { 1182 int ret; 1183 1184 ret = security_bpf_map(map, OPEN_FMODE(flags)); 1185 if (ret < 0) 1186 return ret; 1187 1188 return anon_inode_getfd("bpf-map", &bpf_map_fops, map, 1189 flags | O_CLOEXEC); 1190 } 1191 1192 int bpf_get_file_flag(int flags) 1193 { 1194 if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) 1195 return -EINVAL; 1196 if (flags & BPF_F_RDONLY) 1197 return O_RDONLY; 1198 if (flags & BPF_F_WRONLY) 1199 return O_WRONLY; 1200 return O_RDWR; 1201 } 1202 1203 /* helper macro to check that unused fields 'union bpf_attr' are zero */ 1204 #define CHECK_ATTR(CMD) \ 1205 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ 1206 sizeof(attr->CMD##_LAST_FIELD), 0, \ 1207 sizeof(*attr) - \ 1208 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ 1209 sizeof(attr->CMD##_LAST_FIELD)) != NULL 1210 1211 /* dst and src must have at least "size" number of bytes. 1212 * Return strlen on success and < 0 on error. 1213 */ 1214 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) 1215 { 1216 const char *end = src + size; 1217 const char *orig_src = src; 1218 1219 memset(dst, 0, size); 1220 /* Copy all isalnum(), '_' and '.' chars. */ 1221 while (src < end && *src) { 1222 if (!isalnum(*src) && 1223 *src != '_' && *src != '.') 1224 return -EINVAL; 1225 *dst++ = *src++; 1226 } 1227 1228 /* No '\0' found in "size" number of bytes */ 1229 if (src == end) 1230 return -EINVAL; 1231 1232 return src - orig_src; 1233 } 1234 EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); 1235 1236 int map_check_no_btf(struct bpf_map *map, 1237 const struct btf *btf, 1238 const struct btf_type *key_type, 1239 const struct btf_type *value_type) 1240 { 1241 return -ENOTSUPP; 1242 } 1243 1244 static int map_check_btf(struct bpf_map *map, struct bpf_token *token, 1245 const struct btf *btf, u32 btf_key_id, u32 btf_value_id) 1246 { 1247 const struct btf_type *key_type, *value_type; 1248 u32 key_size, value_size; 1249 int ret = 0; 1250 1251 /* Some maps allow key to be unspecified. */ 1252 if (btf_key_id) { 1253 key_type = btf_type_id_size(btf, &btf_key_id, &key_size); 1254 if (!key_type || key_size != map->key_size) 1255 return -EINVAL; 1256 } else { 1257 key_type = btf_type_by_id(btf, 0); 1258 if (!map->ops->map_check_btf) 1259 return -EINVAL; 1260 } 1261 1262 value_type = btf_type_id_size(btf, &btf_value_id, &value_size); 1263 if (!value_type || value_size != map->value_size) 1264 return -EINVAL; 1265 1266 map->record = btf_parse_fields(btf, value_type, 1267 BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1268 BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | 1269 BPF_TASK_WORK, 1270 map->value_size); 1271 if (!IS_ERR_OR_NULL(map->record)) { 1272 int i; 1273 1274 if (!bpf_token_capable(token, CAP_BPF)) { 1275 ret = -EPERM; 1276 goto free_map_tab; 1277 } 1278 if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { 1279 ret = -EACCES; 1280 goto free_map_tab; 1281 } 1282 for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { 1283 switch (map->record->field_mask & (1 << i)) { 1284 case 0: 1285 continue; 1286 case BPF_SPIN_LOCK: 1287 case BPF_RES_SPIN_LOCK: 1288 if (map->map_type != BPF_MAP_TYPE_HASH && 1289 map->map_type != BPF_MAP_TYPE_RHASH && 1290 map->map_type != BPF_MAP_TYPE_ARRAY && 1291 map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && 1292 map->map_type != BPF_MAP_TYPE_SK_STORAGE && 1293 map->map_type != BPF_MAP_TYPE_INODE_STORAGE && 1294 map->map_type != BPF_MAP_TYPE_TASK_STORAGE && 1295 map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { 1296 ret = -EOPNOTSUPP; 1297 goto free_map_tab; 1298 } 1299 break; 1300 case BPF_TIMER: 1301 case BPF_WORKQUEUE: 1302 case BPF_TASK_WORK: 1303 if (map->map_type != BPF_MAP_TYPE_HASH && 1304 map->map_type != BPF_MAP_TYPE_RHASH && 1305 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1306 map->map_type != BPF_MAP_TYPE_ARRAY) { 1307 ret = -EOPNOTSUPP; 1308 goto free_map_tab; 1309 } 1310 break; 1311 case BPF_KPTR_UNREF: 1312 case BPF_KPTR_REF: 1313 case BPF_KPTR_PERCPU: 1314 case BPF_REFCOUNT: 1315 if (map->map_type != BPF_MAP_TYPE_HASH && 1316 map->map_type != BPF_MAP_TYPE_RHASH && 1317 map->map_type != BPF_MAP_TYPE_PERCPU_HASH && 1318 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1319 map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && 1320 map->map_type != BPF_MAP_TYPE_ARRAY && 1321 map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && 1322 map->map_type != BPF_MAP_TYPE_SK_STORAGE && 1323 map->map_type != BPF_MAP_TYPE_INODE_STORAGE && 1324 map->map_type != BPF_MAP_TYPE_TASK_STORAGE && 1325 map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { 1326 ret = -EOPNOTSUPP; 1327 goto free_map_tab; 1328 } 1329 break; 1330 case BPF_UPTR: 1331 if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { 1332 ret = -EOPNOTSUPP; 1333 goto free_map_tab; 1334 } 1335 break; 1336 case BPF_LIST_HEAD: 1337 case BPF_RB_ROOT: 1338 if (map->map_type != BPF_MAP_TYPE_HASH && 1339 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1340 map->map_type != BPF_MAP_TYPE_ARRAY) { 1341 ret = -EOPNOTSUPP; 1342 goto free_map_tab; 1343 } 1344 break; 1345 default: 1346 /* Fail if map_type checks are missing for a field type */ 1347 ret = -EOPNOTSUPP; 1348 goto free_map_tab; 1349 } 1350 } 1351 } 1352 1353 ret = btf_check_and_fixup_fields(btf, map->record); 1354 if (ret < 0) 1355 goto free_map_tab; 1356 1357 if (map->ops->map_check_btf) { 1358 ret = map->ops->map_check_btf(map, btf, key_type, value_type); 1359 if (ret < 0) 1360 goto free_map_tab; 1361 } 1362 1363 return ret; 1364 free_map_tab: 1365 bpf_map_free_record(map); 1366 return ret; 1367 } 1368 1369 #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size 1370 /* called via syscall */ 1371 static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log, 1372 struct bpf_map **mapp, struct bpf_token **tokenp) 1373 { 1374 const struct bpf_map_ops *ops; 1375 struct bpf_token *token = NULL; 1376 int numa_node = bpf_map_attr_numa_node(attr); 1377 u32 map_type = attr->map_type; 1378 struct bpf_map *map; 1379 bool token_flag; 1380 int err; 1381 1382 err = CHECK_ATTR(BPF_MAP_CREATE); 1383 if (err) { 1384 bpf_log(log, "Invalid attr.\n"); 1385 return -EINVAL; 1386 } 1387 1388 /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it 1389 * to avoid per-map type checks tripping on unknown flag 1390 */ 1391 token_flag = attr->map_flags & BPF_F_TOKEN_FD; 1392 attr->map_flags &= ~BPF_F_TOKEN_FD; 1393 1394 if (attr->btf_vmlinux_value_type_id) { 1395 if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) { 1396 bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n"); 1397 return -EINVAL; 1398 } 1399 if (attr->btf_key_type_id || attr->btf_value_type_id) { 1400 bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n"); 1401 return -EINVAL; 1402 } 1403 } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { 1404 bpf_log(log, "Invalid btf_value_type_id.\n"); 1405 return -EINVAL; 1406 } 1407 1408 if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && 1409 attr->map_type != BPF_MAP_TYPE_ARENA && 1410 attr->map_type != BPF_MAP_TYPE_RHASH && 1411 attr->map_extra != 0) { 1412 bpf_log(log, "Invalid map_extra.\n"); 1413 return -EINVAL; 1414 } 1415 1416 if (numa_node != NUMA_NO_NODE && 1417 ((unsigned int)numa_node >= nr_node_ids || 1418 !node_online(numa_node))) { 1419 bpf_log(log, "Invalid numa_node.\n"); 1420 return -EINVAL; 1421 } 1422 1423 /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ 1424 map_type = attr->map_type; 1425 if (map_type >= ARRAY_SIZE(bpf_map_types)) { 1426 bpf_log(log, "Invalid map_type.\n"); 1427 return -EINVAL; 1428 } 1429 map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); 1430 ops = bpf_map_types[map_type]; 1431 if (!ops) 1432 return -EINVAL; 1433 1434 if (ops->map_alloc_check) { 1435 err = ops->map_alloc_check(attr); 1436 if (err) 1437 return err; 1438 } 1439 if (attr->map_ifindex) 1440 ops = &bpf_map_offload_ops; 1441 if (!ops->map_mem_usage) 1442 return -EINVAL; 1443 1444 if (token_flag) { 1445 token = bpf_token_get_from_fd(attr->map_token_fd); 1446 if (IS_ERR(token)) { 1447 bpf_log(log, "Invalid map_token_fd.\n"); 1448 return PTR_ERR(token); 1449 } 1450 1451 /* if current token doesn't grant map creation permissions, 1452 * then we can't use this token, so ignore it and rely on 1453 * system-wide capabilities checks 1454 */ 1455 if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) || 1456 !bpf_token_allow_map_type(token, attr->map_type)) { 1457 bpf_token_put(token); 1458 token = NULL; 1459 } 1460 } 1461 1462 err = -EPERM; 1463 1464 /* Intent here is for unprivileged_bpf_disabled to block BPF map 1465 * creation for unprivileged users; other actions depend 1466 * on fd availability and access to bpffs, so are dependent on 1467 * object creation success. Even with unprivileged BPF disabled, 1468 * capability checks are still carried out. 1469 */ 1470 if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF)) 1471 goto put_token; 1472 1473 /* check privileged map type permissions */ 1474 switch (map_type) { 1475 case BPF_MAP_TYPE_ARRAY: 1476 case BPF_MAP_TYPE_PERCPU_ARRAY: 1477 case BPF_MAP_TYPE_PROG_ARRAY: 1478 case BPF_MAP_TYPE_PERF_EVENT_ARRAY: 1479 case BPF_MAP_TYPE_CGROUP_ARRAY: 1480 case BPF_MAP_TYPE_ARRAY_OF_MAPS: 1481 case BPF_MAP_TYPE_HASH: 1482 case BPF_MAP_TYPE_RHASH: 1483 case BPF_MAP_TYPE_PERCPU_HASH: 1484 case BPF_MAP_TYPE_HASH_OF_MAPS: 1485 case BPF_MAP_TYPE_RINGBUF: 1486 case BPF_MAP_TYPE_USER_RINGBUF: 1487 case BPF_MAP_TYPE_CGROUP_STORAGE: 1488 case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: 1489 /* unprivileged */ 1490 break; 1491 case BPF_MAP_TYPE_SK_STORAGE: 1492 case BPF_MAP_TYPE_INODE_STORAGE: 1493 case BPF_MAP_TYPE_TASK_STORAGE: 1494 case BPF_MAP_TYPE_CGRP_STORAGE: 1495 case BPF_MAP_TYPE_BLOOM_FILTER: 1496 case BPF_MAP_TYPE_LPM_TRIE: 1497 case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: 1498 case BPF_MAP_TYPE_STACK_TRACE: 1499 case BPF_MAP_TYPE_QUEUE: 1500 case BPF_MAP_TYPE_STACK: 1501 case BPF_MAP_TYPE_LRU_HASH: 1502 case BPF_MAP_TYPE_LRU_PERCPU_HASH: 1503 case BPF_MAP_TYPE_STRUCT_OPS: 1504 case BPF_MAP_TYPE_CPUMAP: 1505 case BPF_MAP_TYPE_ARENA: 1506 case BPF_MAP_TYPE_INSN_ARRAY: 1507 if (!bpf_token_capable(token, CAP_BPF)) 1508 goto put_token; 1509 break; 1510 case BPF_MAP_TYPE_SOCKMAP: 1511 case BPF_MAP_TYPE_SOCKHASH: 1512 case BPF_MAP_TYPE_DEVMAP: 1513 case BPF_MAP_TYPE_DEVMAP_HASH: 1514 case BPF_MAP_TYPE_XSKMAP: 1515 if (!bpf_token_capable(token, CAP_NET_ADMIN)) 1516 goto put_token; 1517 break; 1518 default: 1519 WARN(1, "unsupported map type %d", map_type); 1520 goto put_token; 1521 } 1522 1523 map = ops->map_alloc(attr); 1524 if (IS_ERR(map)) { 1525 err = PTR_ERR(map); 1526 goto put_token; 1527 } 1528 map->ops = ops; 1529 map->map_type = map_type; 1530 1531 err = bpf_obj_name_cpy(map->name, attr->map_name, 1532 sizeof(attr->map_name)); 1533 if (err < 0) { 1534 bpf_log(log, "Invalid map_name.\n"); 1535 goto free_map; 1536 } 1537 1538 preempt_disable(); 1539 map->cookie = gen_cookie_next(&bpf_map_cookie); 1540 preempt_enable(); 1541 1542 atomic64_set(&map->refcnt, 1); 1543 atomic64_set(&map->usercnt, 1); 1544 mutex_init(&map->freeze_mutex); 1545 spin_lock_init(&map->owner_lock); 1546 1547 if (attr->btf_key_type_id || attr->btf_value_type_id || 1548 /* Even the map's value is a kernel's struct, 1549 * the bpf_prog.o must have BTF to begin with 1550 * to figure out the corresponding kernel's 1551 * counter part. Thus, attr->btf_fd has 1552 * to be valid also. 1553 */ 1554 attr->btf_vmlinux_value_type_id) { 1555 struct btf *btf; 1556 1557 btf = btf_get_by_fd(attr->btf_fd); 1558 if (IS_ERR(btf)) { 1559 bpf_log(log, "Invalid btf_fd.\n"); 1560 err = PTR_ERR(btf); 1561 goto free_map; 1562 } 1563 if (btf_is_kernel(btf)) { 1564 btf_put(btf); 1565 err = -EACCES; 1566 goto free_map; 1567 } 1568 map->btf = btf; 1569 1570 if (attr->btf_value_type_id) { 1571 err = map_check_btf(map, token, btf, attr->btf_key_type_id, 1572 attr->btf_value_type_id); 1573 if (err) 1574 goto free_map; 1575 } 1576 1577 map->btf_key_type_id = attr->btf_key_type_id; 1578 map->btf_value_type_id = attr->btf_value_type_id; 1579 map->btf_vmlinux_value_type_id = 1580 attr->btf_vmlinux_value_type_id; 1581 } 1582 1583 if (attr->excl_prog_hash) { 1584 bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); 1585 1586 if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { 1587 bpf_log(log, "Invalid excl_prog_hash_size.\n"); 1588 err = -EINVAL; 1589 goto free_map; 1590 } 1591 1592 map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); 1593 if (!map->excl_prog_sha) { 1594 err = -ENOMEM; 1595 goto free_map; 1596 } 1597 1598 if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) { 1599 err = -EFAULT; 1600 goto free_map; 1601 } 1602 1603 /* See libbpf: emit_signature_match() */ 1604 BUILD_BUG_ON(offsetof(struct bpf_map, excl) != SHA256_DIGEST_SIZE); 1605 BUILD_BUG_ON(!__same_type(map->excl, u32)); 1606 BUILD_BUG_ON(offsetof(struct bpf_map, sha) != 0); 1607 BUILD_BUG_ON(!__same_type(map->sha, u8[SHA256_DIGEST_SIZE])); 1608 map->excl = 1; 1609 } else if (attr->excl_prog_hash_size) { 1610 bpf_log(log, "Invalid excl_prog_hash_size.\n"); 1611 err = -EINVAL; 1612 goto free_map; 1613 } 1614 1615 *mapp = map; 1616 *tokenp = token; 1617 return 0; 1618 1619 free_map: 1620 bpf_map_free(map); 1621 put_token: 1622 bpf_token_put(token); 1623 return err; 1624 } 1625 1626 static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, 1627 bpfptr_t uattr_common, u32 size_common) 1628 { 1629 struct bpf_token *token = NULL; 1630 struct bpf_verifier_log *log; 1631 struct bpf_log_attr attr_log; 1632 struct bpf_map *map = NULL; 1633 int err, ret; 1634 int f_flags; 1635 1636 log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); 1637 if (IS_ERR(log)) 1638 return PTR_ERR(log); 1639 1640 err = map_create_alloc(attr, uattr, log, &map, &token); 1641 1642 /* preserve original error even if log finalization is successful */ 1643 ret = bpf_log_attr_finalize(&attr_log, log); 1644 if (ret) 1645 err = ret; 1646 1647 kfree(log); 1648 1649 if (err) 1650 goto free_map; 1651 1652 f_flags = bpf_get_file_flag(attr->map_flags); 1653 if (f_flags < 0) { 1654 err = f_flags; 1655 goto free_map; 1656 } 1657 1658 err = security_bpf_map_create(map, attr, token, uattr.is_kernel); 1659 if (err) 1660 goto free_map_sec; 1661 1662 err = bpf_map_alloc_id(map); 1663 if (err) 1664 goto free_map_sec; 1665 1666 bpf_map_save_memcg(map); 1667 bpf_token_put(token); 1668 1669 err = bpf_map_new_fd(map, f_flags); 1670 if (err < 0) { 1671 /* failed to allocate fd. 1672 * bpf_map_put_with_uref() is needed because the above 1673 * bpf_map_alloc_id() has published the map 1674 * to the userspace and the userspace may 1675 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. 1676 */ 1677 bpf_map_put_with_uref(map); 1678 return err; 1679 } 1680 1681 return err; 1682 1683 free_map_sec: 1684 security_bpf_map_free(map); 1685 free_map: 1686 if (map) 1687 bpf_map_free(map); 1688 bpf_token_put(token); 1689 return err; 1690 } 1691 1692 void bpf_map_inc(struct bpf_map *map) 1693 { 1694 atomic64_inc(&map->refcnt); 1695 } 1696 EXPORT_SYMBOL_GPL(bpf_map_inc); 1697 1698 void bpf_map_inc_with_uref(struct bpf_map *map) 1699 { 1700 atomic64_inc(&map->refcnt); 1701 atomic64_inc(&map->usercnt); 1702 } 1703 EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); 1704 1705 struct bpf_map *bpf_map_get(u32 ufd) 1706 { 1707 CLASS(fd, f)(ufd); 1708 struct bpf_map *map = __bpf_map_get(f); 1709 1710 if (!IS_ERR(map)) 1711 bpf_map_inc(map); 1712 1713 return map; 1714 } 1715 EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL"); 1716 1717 struct bpf_map *bpf_map_get_with_uref(u32 ufd) 1718 { 1719 CLASS(fd, f)(ufd); 1720 struct bpf_map *map = __bpf_map_get(f); 1721 1722 if (!IS_ERR(map)) 1723 bpf_map_inc_with_uref(map); 1724 1725 return map; 1726 } 1727 1728 /* map_idr_lock should have been held or the map should have been 1729 * protected by rcu read lock. 1730 */ 1731 struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) 1732 { 1733 int refold; 1734 1735 refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); 1736 if (!refold) 1737 return ERR_PTR(-ENOENT); 1738 if (uref) 1739 atomic64_inc(&map->usercnt); 1740 1741 return map; 1742 } 1743 1744 struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) 1745 { 1746 lockdep_assert(rcu_read_lock_held()); 1747 return __bpf_map_inc_not_zero(map, false); 1748 } 1749 EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); 1750 1751 int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, 1752 bool delete) 1753 { 1754 return -ENOTSUPP; 1755 } 1756 1757 static void *__bpf_copy_key(void __user *ukey, u64 key_size) 1758 { 1759 if (key_size) 1760 return vmemdup_user(ukey, key_size); 1761 1762 if (ukey) 1763 return ERR_PTR(-EINVAL); 1764 1765 return NULL; 1766 } 1767 1768 static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) 1769 { 1770 if (key_size) 1771 return kvmemdup_bpfptr(ukey, key_size); 1772 1773 if (!bpfptr_is_null(ukey)) 1774 return ERR_PTR(-EINVAL); 1775 1776 return NULL; 1777 } 1778 1779 /* last field in 'union bpf_attr' used by this command */ 1780 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags 1781 1782 static int map_lookup_elem(union bpf_attr *attr) 1783 { 1784 void __user *ukey = u64_to_user_ptr(attr->key); 1785 void __user *uvalue = u64_to_user_ptr(attr->value); 1786 struct bpf_map *map; 1787 void *key, *value; 1788 u32 value_size; 1789 int err; 1790 1791 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) 1792 return -EINVAL; 1793 1794 CLASS(fd, f)(attr->map_fd); 1795 map = __bpf_map_get(f); 1796 if (IS_ERR(map)) 1797 return PTR_ERR(map); 1798 if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) 1799 return -EPERM; 1800 1801 err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); 1802 if (err) 1803 return err; 1804 1805 key = __bpf_copy_key(ukey, map->key_size); 1806 if (IS_ERR(key)) 1807 return PTR_ERR(key); 1808 1809 value_size = bpf_map_value_size(map, attr->flags); 1810 1811 err = -ENOMEM; 1812 value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); 1813 if (!value) 1814 goto free_key; 1815 1816 if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { 1817 if (copy_from_user(value, uvalue, value_size)) 1818 err = -EFAULT; 1819 else 1820 err = bpf_map_copy_value(map, key, value, attr->flags); 1821 goto free_value; 1822 } 1823 1824 err = bpf_map_copy_value(map, key, value, attr->flags); 1825 if (err) 1826 goto free_value; 1827 1828 err = -EFAULT; 1829 if (copy_to_user(uvalue, value, value_size) != 0) 1830 goto free_value; 1831 1832 err = 0; 1833 1834 free_value: 1835 kvfree(value); 1836 free_key: 1837 kvfree(key); 1838 return err; 1839 } 1840 1841 1842 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags 1843 1844 static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) 1845 { 1846 bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); 1847 bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); 1848 struct bpf_map *map; 1849 void *key, *value; 1850 u32 value_size; 1851 int err; 1852 1853 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) 1854 return -EINVAL; 1855 1856 CLASS(fd, f)(attr->map_fd); 1857 map = __bpf_map_get(f); 1858 if (IS_ERR(map)) 1859 return PTR_ERR(map); 1860 bpf_map_write_active_inc(map); 1861 if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 1862 err = -EPERM; 1863 goto err_put; 1864 } 1865 1866 err = bpf_map_check_op_flags(map, attr->flags, ~0); 1867 if (err) 1868 goto err_put; 1869 1870 key = ___bpf_copy_key(ukey, map->key_size); 1871 if (IS_ERR(key)) { 1872 err = PTR_ERR(key); 1873 goto err_put; 1874 } 1875 1876 value_size = bpf_map_value_size(map, attr->flags); 1877 value = kvmemdup_bpfptr(uvalue, value_size); 1878 if (IS_ERR(value)) { 1879 err = PTR_ERR(value); 1880 goto free_key; 1881 } 1882 1883 err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags); 1884 if (!err) 1885 maybe_wait_bpf_programs(map); 1886 1887 kvfree(value); 1888 free_key: 1889 kvfree(key); 1890 err_put: 1891 bpf_map_write_active_dec(map); 1892 return err; 1893 } 1894 1895 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key 1896 1897 static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) 1898 { 1899 bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); 1900 struct bpf_map *map; 1901 void *key; 1902 int err; 1903 1904 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) 1905 return -EINVAL; 1906 1907 CLASS(fd, f)(attr->map_fd); 1908 map = __bpf_map_get(f); 1909 if (IS_ERR(map)) 1910 return PTR_ERR(map); 1911 bpf_map_write_active_inc(map); 1912 if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 1913 err = -EPERM; 1914 goto err_put; 1915 } 1916 1917 key = ___bpf_copy_key(ukey, map->key_size); 1918 if (IS_ERR(key)) { 1919 err = PTR_ERR(key); 1920 goto err_put; 1921 } 1922 1923 if (bpf_map_is_offloaded(map)) { 1924 err = bpf_map_offload_delete_elem(map, key); 1925 goto out; 1926 } else if (IS_FD_PROG_ARRAY(map) || 1927 map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 1928 /* These maps require sleepable context */ 1929 err = map->ops->map_delete_elem(map, key); 1930 goto out; 1931 } 1932 1933 bpf_disable_instrumentation(); 1934 rcu_read_lock(); 1935 err = map->ops->map_delete_elem(map, key); 1936 rcu_read_unlock(); 1937 bpf_enable_instrumentation(); 1938 if (!err) 1939 maybe_wait_bpf_programs(map); 1940 out: 1941 kvfree(key); 1942 err_put: 1943 bpf_map_write_active_dec(map); 1944 return err; 1945 } 1946 1947 /* last field in 'union bpf_attr' used by this command */ 1948 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key 1949 1950 static int map_get_next_key(union bpf_attr *attr) 1951 { 1952 void __user *ukey = u64_to_user_ptr(attr->key); 1953 void __user *unext_key = u64_to_user_ptr(attr->next_key); 1954 struct bpf_map *map; 1955 void *key, *next_key; 1956 int err; 1957 1958 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) 1959 return -EINVAL; 1960 1961 CLASS(fd, f)(attr->map_fd); 1962 map = __bpf_map_get(f); 1963 if (IS_ERR(map)) 1964 return PTR_ERR(map); 1965 if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) 1966 return -EPERM; 1967 1968 if (ukey) { 1969 key = __bpf_copy_key(ukey, map->key_size); 1970 if (IS_ERR(key)) 1971 return PTR_ERR(key); 1972 } else { 1973 key = NULL; 1974 } 1975 1976 err = -ENOMEM; 1977 next_key = kvmalloc(map->key_size, GFP_USER); 1978 if (!next_key) 1979 goto free_key; 1980 1981 if (bpf_map_is_offloaded(map)) { 1982 err = bpf_map_offload_get_next_key(map, key, next_key); 1983 goto out; 1984 } 1985 1986 rcu_read_lock(); 1987 err = map->ops->map_get_next_key(map, key, next_key); 1988 rcu_read_unlock(); 1989 out: 1990 if (err) 1991 goto free_next_key; 1992 1993 err = -EFAULT; 1994 if (copy_to_user(unext_key, next_key, map->key_size) != 0) 1995 goto free_next_key; 1996 1997 err = 0; 1998 1999 free_next_key: 2000 kvfree(next_key); 2001 free_key: 2002 kvfree(key); 2003 return err; 2004 } 2005 2006 int generic_map_delete_batch(struct bpf_map *map, 2007 const union bpf_attr *attr, 2008 union bpf_attr __user *uattr) 2009 { 2010 void __user *keys = u64_to_user_ptr(attr->batch.keys); 2011 u32 cp, max_count; 2012 int err = 0; 2013 void *key; 2014 2015 if (attr->batch.elem_flags & ~BPF_F_LOCK) 2016 return -EINVAL; 2017 2018 if ((attr->batch.elem_flags & BPF_F_LOCK) && 2019 !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { 2020 return -EINVAL; 2021 } 2022 2023 max_count = attr->batch.count; 2024 if (!max_count) 2025 return 0; 2026 2027 if (put_user(0, &uattr->batch.count)) 2028 return -EFAULT; 2029 2030 key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); 2031 if (!key) 2032 return -ENOMEM; 2033 2034 for (cp = 0; cp < max_count; cp++) { 2035 err = -EFAULT; 2036 if (copy_from_user(key, keys + cp * map->key_size, 2037 map->key_size)) 2038 break; 2039 2040 if (bpf_map_is_offloaded(map)) { 2041 err = bpf_map_offload_delete_elem(map, key); 2042 break; 2043 } 2044 2045 bpf_disable_instrumentation(); 2046 rcu_read_lock(); 2047 err = map->ops->map_delete_elem(map, key); 2048 rcu_read_unlock(); 2049 bpf_enable_instrumentation(); 2050 if (err) 2051 break; 2052 cond_resched(); 2053 } 2054 if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) 2055 err = -EFAULT; 2056 2057 kvfree(key); 2058 2059 return err; 2060 } 2061 2062 int generic_map_update_batch(struct bpf_map *map, struct file *map_file, 2063 const union bpf_attr *attr, 2064 union bpf_attr __user *uattr) 2065 { 2066 void __user *values = u64_to_user_ptr(attr->batch.values); 2067 void __user *keys = u64_to_user_ptr(attr->batch.keys); 2068 u32 value_size, cp, max_count; 2069 void *key, *value; 2070 int err = 0; 2071 2072 err = bpf_map_check_op_flags(map, attr->batch.elem_flags, 2073 BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); 2074 if (err) 2075 return err; 2076 2077 value_size = bpf_map_value_size(map, attr->batch.elem_flags); 2078 2079 max_count = attr->batch.count; 2080 if (!max_count) 2081 return 0; 2082 2083 if (put_user(0, &uattr->batch.count)) 2084 return -EFAULT; 2085 2086 key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); 2087 if (!key) 2088 return -ENOMEM; 2089 2090 value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); 2091 if (!value) { 2092 kvfree(key); 2093 return -ENOMEM; 2094 } 2095 2096 for (cp = 0; cp < max_count; cp++) { 2097 err = -EFAULT; 2098 if (copy_from_user(key, keys + cp * map->key_size, 2099 map->key_size) || 2100 copy_from_user(value, values + cp * value_size, value_size)) 2101 break; 2102 2103 err = bpf_map_update_value(map, map_file, key, value, 2104 attr->batch.elem_flags); 2105 2106 if (err) 2107 break; 2108 cond_resched(); 2109 } 2110 2111 if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) 2112 err = -EFAULT; 2113 2114 kvfree(value); 2115 kvfree(key); 2116 2117 return err; 2118 } 2119 2120 int generic_map_lookup_batch(struct bpf_map *map, 2121 const union bpf_attr *attr, 2122 union bpf_attr __user *uattr) 2123 { 2124 void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); 2125 void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); 2126 void __user *values = u64_to_user_ptr(attr->batch.values); 2127 void __user *keys = u64_to_user_ptr(attr->batch.keys); 2128 void *buf, *buf_prevkey, *prev_key, *key, *value; 2129 u32 value_size, cp, max_count; 2130 int err; 2131 2132 err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); 2133 if (err) 2134 return err; 2135 2136 value_size = bpf_map_value_size(map, attr->batch.elem_flags); 2137 2138 max_count = attr->batch.count; 2139 if (!max_count) 2140 return 0; 2141 2142 if (put_user(0, &uattr->batch.count)) 2143 return -EFAULT; 2144 2145 buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); 2146 if (!buf_prevkey) 2147 return -ENOMEM; 2148 2149 buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); 2150 if (!buf) { 2151 kvfree(buf_prevkey); 2152 return -ENOMEM; 2153 } 2154 2155 err = -EFAULT; 2156 prev_key = NULL; 2157 if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) 2158 goto free_buf; 2159 key = buf; 2160 value = key + map->key_size; 2161 if (ubatch) 2162 prev_key = buf_prevkey; 2163 2164 for (cp = 0; cp < max_count;) { 2165 rcu_read_lock(); 2166 err = map->ops->map_get_next_key(map, prev_key, key); 2167 rcu_read_unlock(); 2168 if (err) 2169 break; 2170 err = bpf_map_copy_value(map, key, value, 2171 attr->batch.elem_flags); 2172 2173 if (err == -ENOENT) 2174 goto next_key; 2175 2176 if (err) 2177 goto free_buf; 2178 2179 if (copy_to_user(keys + cp * map->key_size, key, 2180 map->key_size)) { 2181 err = -EFAULT; 2182 goto free_buf; 2183 } 2184 if (copy_to_user(values + cp * value_size, value, value_size)) { 2185 err = -EFAULT; 2186 goto free_buf; 2187 } 2188 2189 cp++; 2190 next_key: 2191 if (!prev_key) 2192 prev_key = buf_prevkey; 2193 2194 swap(prev_key, key); 2195 cond_resched(); 2196 } 2197 2198 if (err == -EFAULT) 2199 goto free_buf; 2200 2201 if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || 2202 (cp && copy_to_user(uobatch, prev_key, map->key_size)))) 2203 err = -EFAULT; 2204 2205 free_buf: 2206 kvfree(buf_prevkey); 2207 kvfree(buf); 2208 return err; 2209 } 2210 2211 #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags 2212 2213 static int map_lookup_and_delete_elem(union bpf_attr *attr) 2214 { 2215 void __user *ukey = u64_to_user_ptr(attr->key); 2216 void __user *uvalue = u64_to_user_ptr(attr->value); 2217 struct bpf_map *map; 2218 void *key, *value; 2219 u32 value_size; 2220 int err; 2221 2222 if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) 2223 return -EINVAL; 2224 2225 if (attr->flags & ~BPF_F_LOCK) 2226 return -EINVAL; 2227 2228 CLASS(fd, f)(attr->map_fd); 2229 map = __bpf_map_get(f); 2230 if (IS_ERR(map)) 2231 return PTR_ERR(map); 2232 bpf_map_write_active_inc(map); 2233 if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || 2234 !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 2235 err = -EPERM; 2236 goto err_put; 2237 } 2238 2239 if (attr->flags && 2240 (map->map_type == BPF_MAP_TYPE_QUEUE || 2241 map->map_type == BPF_MAP_TYPE_STACK)) { 2242 err = -EINVAL; 2243 goto err_put; 2244 } 2245 2246 if ((attr->flags & BPF_F_LOCK) && 2247 !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { 2248 err = -EINVAL; 2249 goto err_put; 2250 } 2251 2252 key = __bpf_copy_key(ukey, map->key_size); 2253 if (IS_ERR(key)) { 2254 err = PTR_ERR(key); 2255 goto err_put; 2256 } 2257 2258 value_size = bpf_map_value_size(map, 0); 2259 2260 err = -ENOMEM; 2261 value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); 2262 if (!value) 2263 goto free_key; 2264 2265 err = -ENOTSUPP; 2266 if (map->map_type == BPF_MAP_TYPE_QUEUE || 2267 map->map_type == BPF_MAP_TYPE_STACK) { 2268 err = map->ops->map_pop_elem(map, value); 2269 } else if (map->map_type == BPF_MAP_TYPE_HASH || 2270 map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 2271 map->map_type == BPF_MAP_TYPE_LRU_HASH || 2272 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 2273 map->map_type == BPF_MAP_TYPE_RHASH || 2274 map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 2275 if (!bpf_map_is_offloaded(map)) { 2276 bpf_disable_instrumentation(); 2277 rcu_read_lock(); 2278 err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); 2279 rcu_read_unlock(); 2280 bpf_enable_instrumentation(); 2281 } 2282 } 2283 2284 if (err) 2285 goto free_value; 2286 2287 if (copy_to_user(uvalue, value, value_size) != 0) { 2288 err = -EFAULT; 2289 goto free_value; 2290 } 2291 2292 err = 0; 2293 2294 free_value: 2295 kvfree(value); 2296 free_key: 2297 kvfree(key); 2298 err_put: 2299 bpf_map_write_active_dec(map); 2300 return err; 2301 } 2302 2303 #define BPF_MAP_FREEZE_LAST_FIELD map_fd 2304 2305 static int map_freeze(const union bpf_attr *attr) 2306 { 2307 int err = 0; 2308 struct bpf_map *map; 2309 2310 if (CHECK_ATTR(BPF_MAP_FREEZE)) 2311 return -EINVAL; 2312 2313 CLASS(fd, f)(attr->map_fd); 2314 map = __bpf_map_get(f); 2315 if (IS_ERR(map)) 2316 return PTR_ERR(map); 2317 2318 if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) 2319 return -ENOTSUPP; 2320 2321 if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) 2322 return -EPERM; 2323 2324 mutex_lock(&map->freeze_mutex); 2325 if (bpf_map_write_active(map)) { 2326 err = -EBUSY; 2327 goto err_put; 2328 } 2329 if (READ_ONCE(map->frozen)) { 2330 err = -EBUSY; 2331 goto err_put; 2332 } 2333 2334 WRITE_ONCE(map->frozen, true); 2335 err_put: 2336 mutex_unlock(&map->freeze_mutex); 2337 return err; 2338 } 2339 2340 static const struct bpf_prog_ops * const bpf_prog_types[] = { 2341 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ 2342 [_id] = & _name ## _prog_ops, 2343 #define BPF_MAP_TYPE(_id, _ops) 2344 #define BPF_LINK_TYPE(_id, _name) 2345 #include <linux/bpf_types.h> 2346 #undef BPF_PROG_TYPE 2347 #undef BPF_MAP_TYPE 2348 #undef BPF_LINK_TYPE 2349 }; 2350 2351 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) 2352 { 2353 const struct bpf_prog_ops *ops; 2354 2355 if (type >= ARRAY_SIZE(bpf_prog_types)) 2356 return -EINVAL; 2357 type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); 2358 ops = bpf_prog_types[type]; 2359 if (!ops) 2360 return -EINVAL; 2361 2362 if (!bpf_prog_is_offloaded(prog->aux)) 2363 prog->aux->ops = ops; 2364 else 2365 prog->aux->ops = &bpf_offload_prog_ops; 2366 prog->type = type; 2367 return 0; 2368 } 2369 2370 enum bpf_audit { 2371 BPF_AUDIT_LOAD, 2372 BPF_AUDIT_UNLOAD, 2373 BPF_AUDIT_MAX, 2374 }; 2375 2376 static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { 2377 [BPF_AUDIT_LOAD] = "LOAD", 2378 [BPF_AUDIT_UNLOAD] = "UNLOAD", 2379 }; 2380 2381 static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) 2382 { 2383 struct audit_context *ctx = NULL; 2384 struct audit_buffer *ab; 2385 2386 if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) 2387 return; 2388 if (audit_enabled == AUDIT_OFF) 2389 return; 2390 if (!in_hardirq() && !irqs_disabled()) 2391 ctx = audit_context(); 2392 ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); 2393 if (unlikely(!ab)) 2394 return; 2395 audit_log_format(ab, "prog-id=%u op=%s", 2396 prog->aux->id, bpf_audit_str[op]); 2397 audit_log_end(ab); 2398 } 2399 2400 static int bpf_prog_alloc_id(struct bpf_prog *prog) 2401 { 2402 int id; 2403 2404 idr_preload(GFP_KERNEL); 2405 spin_lock_bh(&prog_idr_lock); 2406 id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); 2407 if (id > 0) 2408 prog->aux->id = id; 2409 spin_unlock_bh(&prog_idr_lock); 2410 idr_preload_end(); 2411 2412 /* id is in [1, INT_MAX) */ 2413 if (WARN_ON_ONCE(!id)) 2414 return -ENOSPC; 2415 2416 return id > 0 ? 0 : id; 2417 } 2418 2419 void bpf_prog_free_id(struct bpf_prog *prog) 2420 { 2421 unsigned long flags; 2422 2423 /* cBPF to eBPF migrations are currently not in the idr store. 2424 * Offloaded programs are removed from the store when their device 2425 * disappears - even if someone grabs an fd to them they are unusable, 2426 * simply waiting for refcnt to drop to be freed. 2427 */ 2428 if (!prog->aux->id) 2429 return; 2430 2431 spin_lock_irqsave(&prog_idr_lock, flags); 2432 idr_remove(&prog_idr, prog->aux->id); 2433 prog->aux->id = 0; 2434 spin_unlock_irqrestore(&prog_idr_lock, flags); 2435 } 2436 2437 static void __bpf_prog_put_rcu(struct rcu_head *rcu) 2438 { 2439 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 2440 2441 kvfree(aux->func_info); 2442 kfree(aux->func_info_aux); 2443 free_uid(aux->user); 2444 security_bpf_prog_free(aux->prog); 2445 bpf_prog_free(aux->prog); 2446 } 2447 2448 static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) 2449 { 2450 bpf_prog_kallsyms_del_all(prog); 2451 btf_put(prog->aux->btf); 2452 module_put(prog->aux->mod); 2453 kvfree(prog->aux->jited_linfo); 2454 kvfree(prog->aux->linfo); 2455 kfree(prog->aux->kfunc_tab); 2456 kfree(prog->aux->ctx_arg_info); 2457 if (prog->aux->attach_btf) 2458 btf_put(prog->aux->attach_btf); 2459 2460 if (deferred) { 2461 if (prog->sleepable) 2462 call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); 2463 else 2464 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); 2465 } else { 2466 __bpf_prog_put_rcu(&prog->aux->rcu); 2467 } 2468 } 2469 2470 static void bpf_prog_put_deferred(struct work_struct *work) 2471 { 2472 struct bpf_prog_aux *aux; 2473 struct bpf_prog *prog; 2474 2475 aux = container_of(work, struct bpf_prog_aux, work); 2476 prog = aux->prog; 2477 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); 2478 bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); 2479 bpf_prog_free_id(prog); 2480 __bpf_prog_put_noref(prog, true); 2481 } 2482 2483 static void __bpf_prog_put(struct bpf_prog *prog) 2484 { 2485 struct bpf_prog_aux *aux = prog->aux; 2486 2487 if (atomic64_dec_and_test(&aux->refcnt)) { 2488 if (in_hardirq() || irqs_disabled()) { 2489 INIT_WORK(&aux->work, bpf_prog_put_deferred); 2490 schedule_work(&aux->work); 2491 } else { 2492 bpf_prog_put_deferred(&aux->work); 2493 } 2494 } 2495 } 2496 2497 void bpf_prog_put(struct bpf_prog *prog) 2498 { 2499 __bpf_prog_put(prog); 2500 } 2501 EXPORT_SYMBOL_GPL(bpf_prog_put); 2502 2503 static int bpf_prog_release(struct inode *inode, struct file *filp) 2504 { 2505 struct bpf_prog *prog = filp->private_data; 2506 2507 bpf_prog_put(prog); 2508 return 0; 2509 } 2510 2511 struct bpf_prog_kstats { 2512 u64 nsecs; 2513 u64 cnt; 2514 u64 misses; 2515 }; 2516 2517 void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) 2518 { 2519 struct bpf_prog_stats *stats; 2520 unsigned int flags; 2521 2522 if (unlikely(!prog->stats)) 2523 return; 2524 2525 stats = this_cpu_ptr(prog->stats); 2526 flags = u64_stats_update_begin_irqsave(&stats->syncp); 2527 u64_stats_inc(&stats->misses); 2528 u64_stats_update_end_irqrestore(&stats->syncp, flags); 2529 } 2530 2531 static void bpf_prog_get_stats(const struct bpf_prog *prog, 2532 struct bpf_prog_kstats *stats) 2533 { 2534 u64 nsecs = 0, cnt = 0, misses = 0; 2535 int cpu; 2536 2537 for_each_possible_cpu(cpu) { 2538 const struct bpf_prog_stats *st; 2539 unsigned int start; 2540 u64 tnsecs, tcnt, tmisses; 2541 2542 st = per_cpu_ptr(prog->stats, cpu); 2543 do { 2544 start = u64_stats_fetch_begin(&st->syncp); 2545 tnsecs = u64_stats_read(&st->nsecs); 2546 tcnt = u64_stats_read(&st->cnt); 2547 tmisses = u64_stats_read(&st->misses); 2548 } while (u64_stats_fetch_retry(&st->syncp, start)); 2549 nsecs += tnsecs; 2550 cnt += tcnt; 2551 misses += tmisses; 2552 } 2553 stats->nsecs = nsecs; 2554 stats->cnt = cnt; 2555 stats->misses = misses; 2556 } 2557 2558 #ifdef CONFIG_PROC_FS 2559 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) 2560 { 2561 const struct bpf_prog *prog = filp->private_data; 2562 char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; 2563 struct bpf_prog_kstats stats; 2564 2565 bpf_prog_get_stats(prog, &stats); 2566 bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); 2567 seq_printf(m, 2568 "prog_type:\t%u\n" 2569 "prog_jited:\t%u\n" 2570 "prog_tag:\t%s\n" 2571 "memlock:\t%llu\n" 2572 "prog_id:\t%u\n" 2573 "run_time_ns:\t%llu\n" 2574 "run_cnt:\t%llu\n" 2575 "recursion_misses:\t%llu\n" 2576 "verified_insns:\t%u\n", 2577 prog->type, 2578 prog->jited, 2579 prog_tag, 2580 prog->pages * 1ULL << PAGE_SHIFT, 2581 prog->aux->id, 2582 stats.nsecs, 2583 stats.cnt, 2584 stats.misses, 2585 prog->aux->verified_insns); 2586 } 2587 #endif 2588 2589 const struct file_operations bpf_prog_fops = { 2590 #ifdef CONFIG_PROC_FS 2591 .show_fdinfo = bpf_prog_show_fdinfo, 2592 #endif 2593 .release = bpf_prog_release, 2594 .read = bpf_dummy_read, 2595 .write = bpf_dummy_write, 2596 }; 2597 2598 int bpf_prog_new_fd(struct bpf_prog *prog) 2599 { 2600 int ret; 2601 2602 ret = security_bpf_prog(prog); 2603 if (ret < 0) 2604 return ret; 2605 2606 return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, 2607 O_RDWR | O_CLOEXEC); 2608 } 2609 2610 void bpf_prog_add(struct bpf_prog *prog, int i) 2611 { 2612 atomic64_add(i, &prog->aux->refcnt); 2613 } 2614 EXPORT_SYMBOL_GPL(bpf_prog_add); 2615 2616 void bpf_prog_sub(struct bpf_prog *prog, int i) 2617 { 2618 /* Only to be used for undoing previous bpf_prog_add() in some 2619 * error path. We still know that another entity in our call 2620 * path holds a reference to the program, thus atomic_sub() can 2621 * be safely used in such cases! 2622 */ 2623 WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); 2624 } 2625 EXPORT_SYMBOL_GPL(bpf_prog_sub); 2626 2627 void bpf_prog_inc(struct bpf_prog *prog) 2628 { 2629 atomic64_inc(&prog->aux->refcnt); 2630 } 2631 EXPORT_SYMBOL_GPL(bpf_prog_inc); 2632 2633 /* prog_idr_lock should have been held */ 2634 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) 2635 { 2636 int refold; 2637 2638 refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); 2639 2640 if (!refold) 2641 return ERR_PTR(-ENOENT); 2642 2643 return prog; 2644 } 2645 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); 2646 2647 bool bpf_prog_get_ok(struct bpf_prog *prog, 2648 enum bpf_prog_type *attach_type, bool attach_drv) 2649 { 2650 /* not an attachment, just a refcount inc, always allow */ 2651 if (!attach_type) 2652 return true; 2653 2654 if (prog->type != *attach_type) 2655 return false; 2656 if (bpf_prog_is_offloaded(prog->aux) && !attach_drv) 2657 return false; 2658 2659 return true; 2660 } 2661 2662 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, 2663 bool attach_drv) 2664 { 2665 CLASS(fd, f)(ufd); 2666 struct bpf_prog *prog; 2667 2668 if (fd_empty(f)) 2669 return ERR_PTR(-EBADF); 2670 if (fd_file(f)->f_op != &bpf_prog_fops) 2671 return ERR_PTR(-EINVAL); 2672 2673 prog = fd_file(f)->private_data; 2674 if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) 2675 return ERR_PTR(-EINVAL); 2676 2677 bpf_prog_inc(prog); 2678 return prog; 2679 } 2680 2681 struct bpf_prog *bpf_prog_get(u32 ufd) 2682 { 2683 return __bpf_prog_get(ufd, NULL, false); 2684 } 2685 2686 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, 2687 bool attach_drv) 2688 { 2689 return __bpf_prog_get(ufd, &type, attach_drv); 2690 } 2691 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); 2692 2693 /* Initially all BPF programs could be loaded w/o specifying 2694 * expected_attach_type. Later for some of them specifying expected_attach_type 2695 * at load time became required so that program could be validated properly. 2696 * Programs of types that are allowed to be loaded both w/ and w/o (for 2697 * backward compatibility) expected_attach_type, should have the default attach 2698 * type assigned to expected_attach_type for the latter case, so that it can be 2699 * validated later at attach time. 2700 * 2701 * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if 2702 * prog type requires it but has some attach types that have to be backward 2703 * compatible. 2704 */ 2705 static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) 2706 { 2707 switch (attr->prog_type) { 2708 case BPF_PROG_TYPE_CGROUP_SOCK: 2709 /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't 2710 * exist so checking for non-zero is the way to go here. 2711 */ 2712 if (!attr->expected_attach_type) 2713 attr->expected_attach_type = 2714 BPF_CGROUP_INET_SOCK_CREATE; 2715 break; 2716 case BPF_PROG_TYPE_SK_REUSEPORT: 2717 if (!attr->expected_attach_type) 2718 attr->expected_attach_type = 2719 BPF_SK_REUSEPORT_SELECT; 2720 break; 2721 } 2722 } 2723 2724 static int 2725 bpf_prog_load_check_attach(enum bpf_prog_type prog_type, 2726 enum bpf_attach_type expected_attach_type, 2727 struct btf *attach_btf, u32 btf_id, 2728 struct bpf_prog *dst_prog, 2729 bool multi_func) 2730 { 2731 if (btf_id) { 2732 if (btf_id > BTF_MAX_TYPE) 2733 return -EINVAL; 2734 2735 if (!attach_btf && !dst_prog) 2736 return -EINVAL; 2737 2738 switch (prog_type) { 2739 case BPF_PROG_TYPE_TRACING: 2740 case BPF_PROG_TYPE_LSM: 2741 case BPF_PROG_TYPE_STRUCT_OPS: 2742 case BPF_PROG_TYPE_EXT: 2743 break; 2744 default: 2745 return -EINVAL; 2746 } 2747 } 2748 2749 if (multi_func) { 2750 if (prog_type != BPF_PROG_TYPE_TRACING) 2751 return -EINVAL; 2752 if (!attach_btf || btf_id) 2753 return -EINVAL; 2754 return 0; 2755 } 2756 2757 if (attach_btf && (!btf_id || dst_prog)) 2758 return -EINVAL; 2759 2760 if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && 2761 prog_type != BPF_PROG_TYPE_EXT) 2762 return -EINVAL; 2763 2764 switch (prog_type) { 2765 case BPF_PROG_TYPE_CGROUP_SOCK: 2766 switch (expected_attach_type) { 2767 case BPF_CGROUP_INET_SOCK_CREATE: 2768 case BPF_CGROUP_INET_SOCK_RELEASE: 2769 case BPF_CGROUP_INET4_POST_BIND: 2770 case BPF_CGROUP_INET6_POST_BIND: 2771 return 0; 2772 default: 2773 return -EINVAL; 2774 } 2775 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 2776 switch (expected_attach_type) { 2777 case BPF_CGROUP_INET4_BIND: 2778 case BPF_CGROUP_INET6_BIND: 2779 case BPF_CGROUP_INET4_CONNECT: 2780 case BPF_CGROUP_INET6_CONNECT: 2781 case BPF_CGROUP_UNIX_CONNECT: 2782 case BPF_CGROUP_INET4_GETPEERNAME: 2783 case BPF_CGROUP_INET6_GETPEERNAME: 2784 case BPF_CGROUP_UNIX_GETPEERNAME: 2785 case BPF_CGROUP_INET4_GETSOCKNAME: 2786 case BPF_CGROUP_INET6_GETSOCKNAME: 2787 case BPF_CGROUP_UNIX_GETSOCKNAME: 2788 case BPF_CGROUP_UDP4_SENDMSG: 2789 case BPF_CGROUP_UDP6_SENDMSG: 2790 case BPF_CGROUP_UNIX_SENDMSG: 2791 case BPF_CGROUP_UDP4_RECVMSG: 2792 case BPF_CGROUP_UDP6_RECVMSG: 2793 case BPF_CGROUP_UNIX_RECVMSG: 2794 return 0; 2795 default: 2796 return -EINVAL; 2797 } 2798 case BPF_PROG_TYPE_CGROUP_SKB: 2799 switch (expected_attach_type) { 2800 case BPF_CGROUP_INET_INGRESS: 2801 case BPF_CGROUP_INET_EGRESS: 2802 return 0; 2803 default: 2804 return -EINVAL; 2805 } 2806 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 2807 switch (expected_attach_type) { 2808 case BPF_CGROUP_SETSOCKOPT: 2809 case BPF_CGROUP_GETSOCKOPT: 2810 return 0; 2811 default: 2812 return -EINVAL; 2813 } 2814 case BPF_PROG_TYPE_SK_LOOKUP: 2815 if (expected_attach_type == BPF_SK_LOOKUP) 2816 return 0; 2817 return -EINVAL; 2818 case BPF_PROG_TYPE_SK_REUSEPORT: 2819 switch (expected_attach_type) { 2820 case BPF_SK_REUSEPORT_SELECT: 2821 case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: 2822 return 0; 2823 default: 2824 return -EINVAL; 2825 } 2826 case BPF_PROG_TYPE_NETFILTER: 2827 if (expected_attach_type == BPF_NETFILTER) 2828 return 0; 2829 return -EINVAL; 2830 case BPF_PROG_TYPE_SYSCALL: 2831 case BPF_PROG_TYPE_EXT: 2832 if (expected_attach_type) 2833 return -EINVAL; 2834 fallthrough; 2835 default: 2836 return 0; 2837 } 2838 } 2839 2840 static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) 2841 { 2842 switch (prog_type) { 2843 case BPF_PROG_TYPE_SCHED_CLS: 2844 case BPF_PROG_TYPE_SCHED_ACT: 2845 case BPF_PROG_TYPE_XDP: 2846 case BPF_PROG_TYPE_LWT_IN: 2847 case BPF_PROG_TYPE_LWT_OUT: 2848 case BPF_PROG_TYPE_LWT_XMIT: 2849 case BPF_PROG_TYPE_LWT_SEG6LOCAL: 2850 case BPF_PROG_TYPE_SK_SKB: 2851 case BPF_PROG_TYPE_SK_MSG: 2852 case BPF_PROG_TYPE_FLOW_DISSECTOR: 2853 case BPF_PROG_TYPE_CGROUP_DEVICE: 2854 case BPF_PROG_TYPE_CGROUP_SOCK: 2855 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 2856 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 2857 case BPF_PROG_TYPE_CGROUP_SYSCTL: 2858 case BPF_PROG_TYPE_SOCK_OPS: 2859 case BPF_PROG_TYPE_EXT: /* extends any prog */ 2860 case BPF_PROG_TYPE_NETFILTER: 2861 return true; 2862 case BPF_PROG_TYPE_CGROUP_SKB: 2863 /* always unpriv */ 2864 case BPF_PROG_TYPE_SK_REUSEPORT: 2865 /* equivalent to SOCKET_FILTER. need CAP_BPF only */ 2866 default: 2867 return false; 2868 } 2869 } 2870 2871 static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) 2872 { 2873 switch (prog_type) { 2874 case BPF_PROG_TYPE_KPROBE: 2875 case BPF_PROG_TYPE_TRACEPOINT: 2876 case BPF_PROG_TYPE_PERF_EVENT: 2877 case BPF_PROG_TYPE_RAW_TRACEPOINT: 2878 case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: 2879 case BPF_PROG_TYPE_TRACING: 2880 case BPF_PROG_TYPE_LSM: 2881 case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ 2882 case BPF_PROG_TYPE_EXT: /* extends any prog */ 2883 return true; 2884 default: 2885 return false; 2886 } 2887 } 2888 2889 static enum bpf_sig_keyring bpf_classify_keyring(s32 keyring_id) 2890 { 2891 switch (keyring_id) { 2892 case 0: 2893 return BPF_SIG_KEYRING_BUILTIN; 2894 case (s32)(unsigned long)VERIFY_USE_SECONDARY_KEYRING: 2895 return BPF_SIG_KEYRING_SECONDARY; 2896 case (s32)(unsigned long)VERIFY_USE_PLATFORM_KEYRING: 2897 return BPF_SIG_KEYRING_PLATFORM; 2898 default: 2899 return BPF_SIG_KEYRING_USER; 2900 } 2901 } 2902 2903 static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, 2904 bool is_kernel, s32 *keyring_serial) 2905 { 2906 bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); 2907 struct bpf_dynptr_kern sig_ptr, insns_ptr; 2908 struct bpf_key *key = NULL; 2909 void *sig; 2910 int err = 0; 2911 2912 /* 2913 * Don't attempt to use kmalloc_large or vmalloc for signatures. 2914 * Practical signature for BPF program should be below this limit. 2915 */ 2916 if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE) 2917 return -EINVAL; 2918 2919 if (system_keyring_id_check(attr->keyring_id) == 0) 2920 key = bpf_lookup_system_key(attr->keyring_id); 2921 else 2922 key = bpf_lookup_user_key(attr->keyring_id, 0); 2923 2924 if (!key) 2925 return -EINVAL; 2926 2927 sig = kvmemdup_bpfptr(usig, attr->signature_size); 2928 if (IS_ERR(sig)) { 2929 bpf_key_put(key); 2930 return PTR_ERR(sig); 2931 } 2932 2933 bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, 2934 attr->signature_size); 2935 bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, 2936 prog->len * sizeof(struct bpf_insn)); 2937 2938 err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, 2939 (struct bpf_dynptr *)&sig_ptr, key); 2940 if (!err) 2941 *keyring_serial = bpf_key_serial(key); 2942 bpf_key_put(key); 2943 kvfree(sig); 2944 return err; 2945 } 2946 2947 static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) 2948 { 2949 int err; 2950 int i; 2951 2952 for (i = 0; i < prog->aux->used_map_cnt; i++) { 2953 if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) 2954 continue; 2955 2956 err = bpf_insn_array_ready(prog->aux->used_maps[i]); 2957 if (err) 2958 return err; 2959 } 2960 2961 return 0; 2962 } 2963 2964 extern int bpf_multi_func(void); 2965 int __init __used bpf_multi_func(void) { return 0; } 2966 2967 BTF_ID_LIST_GLOBAL_SINGLE(bpf_multi_func_btf_id, func, bpf_multi_func) 2968 2969 /* last field in 'union bpf_attr' used by this command */ 2970 #define BPF_PROG_LOAD_LAST_FIELD keyring_id 2971 2972 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) 2973 { 2974 enum bpf_prog_type type = attr->prog_type; 2975 struct bpf_prog *prog, *dst_prog = NULL; 2976 struct btf *attach_btf = NULL; 2977 struct bpf_token *token = NULL; 2978 bool bpf_cap; 2979 int err; 2980 char license[128]; 2981 bool multi_func; 2982 2983 if (CHECK_ATTR(BPF_PROG_LOAD)) 2984 return -EINVAL; 2985 2986 if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | 2987 BPF_F_ANY_ALIGNMENT | 2988 BPF_F_TEST_STATE_FREQ | 2989 BPF_F_SLEEPABLE | 2990 BPF_F_TEST_RND_HI32 | 2991 BPF_F_XDP_HAS_FRAGS | 2992 BPF_F_XDP_DEV_BOUND_ONLY | 2993 BPF_F_TEST_REG_INVARIANTS | 2994 BPF_F_TOKEN_FD)) 2995 return -EINVAL; 2996 2997 bpf_prog_load_fixup_attach_type(attr); 2998 2999 if (attr->prog_flags & BPF_F_TOKEN_FD) { 3000 token = bpf_token_get_from_fd(attr->prog_token_fd); 3001 if (IS_ERR(token)) 3002 return PTR_ERR(token); 3003 /* if current token doesn't grant prog loading permissions, 3004 * then we can't use this token, so ignore it and rely on 3005 * system-wide capabilities checks 3006 */ 3007 if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) || 3008 !bpf_token_allow_prog_type(token, attr->prog_type, 3009 attr->expected_attach_type)) { 3010 bpf_token_put(token); 3011 token = NULL; 3012 } 3013 } 3014 3015 bpf_cap = bpf_token_capable(token, CAP_BPF); 3016 err = -EPERM; 3017 3018 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && 3019 (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && 3020 !bpf_cap) 3021 goto put_token; 3022 3023 /* Intent here is for unprivileged_bpf_disabled to block BPF program 3024 * creation for unprivileged users; other actions depend 3025 * on fd availability and access to bpffs, so are dependent on 3026 * object creation success. Even with unprivileged BPF disabled, 3027 * capability checks are still carried out for these 3028 * and other operations. 3029 */ 3030 if (sysctl_unprivileged_bpf_disabled && !bpf_cap) 3031 goto put_token; 3032 3033 if (attr->insn_cnt == 0 || 3034 attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) { 3035 err = -E2BIG; 3036 goto put_token; 3037 } 3038 if (type != BPF_PROG_TYPE_SOCKET_FILTER && 3039 type != BPF_PROG_TYPE_CGROUP_SKB && 3040 !bpf_cap) 3041 goto put_token; 3042 3043 if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN)) 3044 goto put_token; 3045 if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) 3046 goto put_token; 3047 3048 multi_func = is_tracing_multi(attr->expected_attach_type); 3049 3050 /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog 3051 * or btf, we need to check which one it is 3052 */ 3053 if (attr->attach_prog_fd) { 3054 dst_prog = bpf_prog_get(attr->attach_prog_fd); 3055 if (IS_ERR(dst_prog)) { 3056 dst_prog = NULL; 3057 attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); 3058 if (IS_ERR(attach_btf)) { 3059 err = -EINVAL; 3060 goto put_token; 3061 } 3062 if (!btf_is_kernel(attach_btf)) { 3063 /* attaching through specifying bpf_prog's BTF 3064 * objects directly might be supported eventually 3065 */ 3066 btf_put(attach_btf); 3067 err = -ENOTSUPP; 3068 goto put_token; 3069 } 3070 } 3071 } else if (attr->attach_btf_id || multi_func) { 3072 /* fall back to vmlinux BTF, if BTF type ID is specified */ 3073 attach_btf = bpf_get_btf_vmlinux(); 3074 if (IS_ERR(attach_btf)) { 3075 err = PTR_ERR(attach_btf); 3076 goto put_token; 3077 } 3078 if (!attach_btf) { 3079 err = -EINVAL; 3080 goto put_token; 3081 } 3082 btf_get(attach_btf); 3083 } 3084 3085 if (bpf_prog_load_check_attach(type, attr->expected_attach_type, 3086 attach_btf, attr->attach_btf_id, 3087 dst_prog, multi_func)) { 3088 if (dst_prog) 3089 bpf_prog_put(dst_prog); 3090 if (attach_btf) 3091 btf_put(attach_btf); 3092 err = -EINVAL; 3093 goto put_token; 3094 } 3095 3096 /* plain bpf_prog allocation */ 3097 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); 3098 if (!prog) { 3099 if (dst_prog) 3100 bpf_prog_put(dst_prog); 3101 if (attach_btf) 3102 btf_put(attach_btf); 3103 err = -EINVAL; 3104 goto put_token; 3105 } 3106 3107 prog->expected_attach_type = attr->expected_attach_type; 3108 prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); 3109 prog->aux->attach_btf = attach_btf; 3110 prog->aux->attach_btf_id = multi_func ? bpf_multi_func_btf_id[0] : attr->attach_btf_id; 3111 prog->aux->dst_prog = dst_prog; 3112 prog->aux->dev_bound = !!attr->prog_ifindex; 3113 prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; 3114 3115 /* move token into prog->aux, reuse taken refcnt */ 3116 prog->aux->token = token; 3117 token = NULL; 3118 3119 prog->aux->user = get_current_user(); 3120 prog->len = attr->insn_cnt; 3121 3122 err = -EFAULT; 3123 if (copy_from_bpfptr(prog->insns, 3124 make_bpfptr(attr->insns, uattr.is_kernel), 3125 bpf_prog_insn_size(prog)) != 0) 3126 goto free_prog; 3127 /* copy eBPF program license from user space */ 3128 if (strncpy_from_bpfptr(license, 3129 make_bpfptr(attr->license, uattr.is_kernel), 3130 sizeof(license) - 1) < 0) 3131 goto free_prog; 3132 license[sizeof(license) - 1] = 0; 3133 3134 /* eBPF programs must be GPL compatible to use GPL-ed functions */ 3135 prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; 3136 if (attr->signature) { 3137 err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel, 3138 &prog->aux->sig.keyring_serial); 3139 if (err) 3140 goto free_prog; 3141 prog->aux->sig.keyring_type = bpf_classify_keyring(attr->keyring_id); 3142 prog->aux->sig.verdict = BPF_SIG_VERIFIED; 3143 } else { 3144 prog->aux->sig.keyring_type = BPF_SIG_KEYRING_NONE; 3145 prog->aux->sig.verdict = BPF_SIG_UNSIGNED; 3146 } 3147 prog->orig_prog = NULL; 3148 prog->jited = 0; 3149 3150 atomic64_set(&prog->aux->refcnt, 1); 3151 3152 if (bpf_prog_is_dev_bound(prog->aux)) { 3153 err = bpf_prog_dev_bound_init(prog, attr); 3154 if (err) 3155 goto free_prog; 3156 } 3157 3158 if (type == BPF_PROG_TYPE_EXT && dst_prog && 3159 bpf_prog_is_dev_bound(dst_prog->aux)) { 3160 err = bpf_prog_dev_bound_inherit(prog, dst_prog); 3161 if (err) 3162 goto free_prog; 3163 } 3164 3165 /* 3166 * Bookkeeping for managing the program attachment chain. 3167 * 3168 * It might be tempting to set attach_tracing_prog flag at the attachment 3169 * time, but this will not prevent from loading bunch of tracing prog 3170 * first, then attach them one to another. 3171 * 3172 * The flag attach_tracing_prog is set for the whole program lifecycle, and 3173 * doesn't have to be cleared in bpf_tracing_link_release, since tracing 3174 * programs cannot change attachment target. 3175 */ 3176 if (type == BPF_PROG_TYPE_TRACING && dst_prog && 3177 dst_prog->type == BPF_PROG_TYPE_TRACING) { 3178 prog->aux->attach_tracing_prog = true; 3179 } 3180 3181 /* find program type: socket_filter vs tracing_filter */ 3182 err = find_prog_type(type, prog); 3183 if (err < 0) 3184 goto free_prog; 3185 3186 prog->aux->load_time = ktime_get_boottime_ns(); 3187 err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, 3188 sizeof(attr->prog_name)); 3189 if (err < 0) 3190 goto free_prog; 3191 3192 err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); 3193 if (err) 3194 goto free_prog; 3195 3196 /* run eBPF verifier */ 3197 err = bpf_check(&prog, attr, uattr, attr_log); 3198 if (err < 0) 3199 goto free_used_maps; 3200 3201 err = bpf_prog_mark_insn_arrays_ready(prog); 3202 if (err < 0) 3203 goto free_used_maps; 3204 3205 err = bpf_prog_alloc_id(prog); 3206 if (err) 3207 goto free_used_maps; 3208 3209 /* Upon success of bpf_prog_alloc_id(), the BPF prog is 3210 * effectively publicly exposed. However, retrieving via 3211 * bpf_prog_get_fd_by_id() will take another reference, 3212 * therefore it cannot be gone underneath us. 3213 * 3214 * Only for the time /after/ successful bpf_prog_new_fd() 3215 * and before returning to userspace, we might just hold 3216 * one reference and any parallel close on that fd could 3217 * rip everything out. Hence, below notifications must 3218 * happen before bpf_prog_new_fd(). 3219 * 3220 * Also, any failure handling from this point onwards must 3221 * be using bpf_prog_put() given the program is exposed. 3222 */ 3223 bpf_prog_kallsyms_add(prog); 3224 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); 3225 bpf_audit_prog(prog, BPF_AUDIT_LOAD); 3226 3227 err = bpf_prog_new_fd(prog); 3228 if (err < 0) 3229 bpf_prog_put(prog); 3230 return err; 3231 3232 free_used_maps: 3233 /* In case we have subprogs, we need to wait for a grace 3234 * period before we can tear down JIT memory since symbols 3235 * are already exposed under kallsyms. 3236 */ 3237 __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); 3238 return err; 3239 3240 free_prog: 3241 free_uid(prog->aux->user); 3242 if (prog->aux->attach_btf) 3243 btf_put(prog->aux->attach_btf); 3244 bpf_prog_free(prog); 3245 put_token: 3246 bpf_token_put(token); 3247 return err; 3248 } 3249 3250 #define BPF_OBJ_LAST_FIELD path_fd 3251 3252 static int bpf_obj_pin(const union bpf_attr *attr) 3253 { 3254 int path_fd; 3255 3256 if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD) 3257 return -EINVAL; 3258 3259 /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ 3260 if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) 3261 return -EINVAL; 3262 3263 path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; 3264 return bpf_obj_pin_user(attr->bpf_fd, path_fd, 3265 u64_to_user_ptr(attr->pathname)); 3266 } 3267 3268 static int bpf_obj_get(const union bpf_attr *attr) 3269 { 3270 int path_fd; 3271 3272 if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || 3273 attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD)) 3274 return -EINVAL; 3275 3276 /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ 3277 if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) 3278 return -EINVAL; 3279 3280 path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; 3281 return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname), 3282 attr->file_flags); 3283 } 3284 3285 /* bpf_link_init_sleepable() allows to specify whether BPF link itself has 3286 * "sleepable" semantics, which normally would mean that BPF link's attach 3287 * hook can dereference link or link's underlying program for some time after 3288 * detachment due to RCU Tasks Trace-based lifetime protection scheme. 3289 * BPF program itself can be non-sleepable, yet, because it's transitively 3290 * reachable through BPF link, its freeing has to be delayed until after RCU 3291 * Tasks Trace GP. 3292 */ 3293 void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, 3294 const struct bpf_link_ops *ops, struct bpf_prog *prog, 3295 enum bpf_attach_type attach_type, bool sleepable) 3296 { 3297 WARN_ON(ops->dealloc && ops->dealloc_deferred); 3298 atomic64_set(&link->refcnt, 1); 3299 link->type = type; 3300 link->sleepable = sleepable; 3301 link->id = 0; 3302 link->ops = ops; 3303 link->prog = prog; 3304 link->attach_type = attach_type; 3305 } 3306 3307 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, 3308 const struct bpf_link_ops *ops, struct bpf_prog *prog, 3309 enum bpf_attach_type attach_type) 3310 { 3311 bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); 3312 } 3313 3314 void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, 3315 const struct bpf_link_ops *ops, struct bpf_prog *prog, 3316 enum bpf_attach_type attach_type, u64 cookie) 3317 { 3318 bpf_link_init(&link->link, type, ops, prog, attach_type); 3319 link->node.link = &link->link; 3320 link->node.cookie = cookie; 3321 } 3322 3323 static void bpf_link_free_id(int id) 3324 { 3325 if (!id) 3326 return; 3327 3328 spin_lock_bh(&link_idr_lock); 3329 idr_remove(&link_idr, id); 3330 spin_unlock_bh(&link_idr_lock); 3331 } 3332 3333 /* Clean up bpf_link and corresponding anon_inode file and FD. After 3334 * anon_inode is created, bpf_link can't be just kfree()'d due to deferred 3335 * anon_inode's release() call. This helper marks bpf_link as 3336 * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt 3337 * is not decremented, it's the responsibility of a calling code that failed 3338 * to complete bpf_link initialization. 3339 * This helper eventually calls link's dealloc callback, but does not call 3340 * link's release callback. 3341 */ 3342 void bpf_link_cleanup(struct bpf_link_primer *primer) 3343 { 3344 primer->link->prog = NULL; 3345 bpf_link_free_id(primer->id); 3346 fput(primer->file); 3347 put_unused_fd(primer->fd); 3348 } 3349 3350 void bpf_link_inc(struct bpf_link *link) 3351 { 3352 atomic64_inc(&link->refcnt); 3353 } 3354 3355 static void bpf_link_dealloc(struct bpf_link *link) 3356 { 3357 /* now that we know that bpf_link itself can't be reached, put underlying BPF program */ 3358 if (link->prog) 3359 bpf_prog_put(link->prog); 3360 3361 /* free bpf_link and its containing memory */ 3362 if (link->ops->dealloc_deferred) 3363 link->ops->dealloc_deferred(link); 3364 else 3365 link->ops->dealloc(link); 3366 } 3367 3368 static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) 3369 { 3370 struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); 3371 3372 bpf_link_dealloc(link); 3373 } 3374 3375 static bool bpf_link_is_tracepoint(struct bpf_link *link) 3376 { 3377 /* 3378 * Only these combinations support a tracepoint bpf_link. 3379 * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use 3380 * bpf_raw_tp_link_lops and thus dealloc_deferred(), see 3381 * bpf_raw_tp_link_attach(). 3382 */ 3383 return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT || 3384 (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP); 3385 } 3386 3387 /* bpf_link_free is guaranteed to be called from process context */ 3388 static void bpf_link_free(struct bpf_link *link) 3389 { 3390 const struct bpf_link_ops *ops = link->ops; 3391 3392 bpf_link_free_id(link->id); 3393 /* detach BPF program, clean up used resources */ 3394 if (link->prog) 3395 ops->release(link); 3396 if (ops->dealloc_deferred) { 3397 /* 3398 * Schedule BPF link deallocation, which will only then 3399 * trigger putting BPF program refcount. 3400 * If underlying BPF program is sleepable or BPF link's target 3401 * attach hookpoint is sleepable or otherwise requires RCU GPs 3402 * to ensure link and its underlying BPF program is not 3403 * reachable anymore, we need to first wait for RCU tasks 3404 * trace sync, and then go through "classic" RCU grace period. 3405 * 3406 * For tracepoint BPF links, we need to go through SRCU grace 3407 * period wait instead when non-faultable tracepoint is used. We 3408 * don't need to chain SRCU grace period waits, however, for the 3409 * faultable case, since it exclusively uses RCU Tasks Trace. 3410 */ 3411 if (link->sleepable || (link->prog && link->prog->sleepable)) 3412 /* RCU Tasks Trace grace period implies RCU grace period. */ 3413 call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_rcu_gp); 3414 /* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links. */ 3415 else if (bpf_link_is_tracepoint(link)) 3416 call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp); 3417 else 3418 call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); 3419 } else if (ops->dealloc) { 3420 bpf_link_dealloc(link); 3421 } 3422 } 3423 3424 static void bpf_link_put_deferred(struct work_struct *work) 3425 { 3426 struct bpf_link *link = container_of(work, struct bpf_link, work); 3427 3428 bpf_link_free(link); 3429 } 3430 3431 /* bpf_link_put might be called from atomic context. It needs to be called 3432 * from sleepable context in order to acquire sleeping locks during the process. 3433 */ 3434 void bpf_link_put(struct bpf_link *link) 3435 { 3436 if (!atomic64_dec_and_test(&link->refcnt)) 3437 return; 3438 3439 INIT_WORK(&link->work, bpf_link_put_deferred); 3440 schedule_work(&link->work); 3441 } 3442 EXPORT_SYMBOL(bpf_link_put); 3443 3444 static void bpf_link_put_direct(struct bpf_link *link) 3445 { 3446 if (!atomic64_dec_and_test(&link->refcnt)) 3447 return; 3448 bpf_link_free(link); 3449 } 3450 3451 static int bpf_link_release(struct inode *inode, struct file *filp) 3452 { 3453 struct bpf_link *link = filp->private_data; 3454 3455 bpf_link_put_direct(link); 3456 return 0; 3457 } 3458 3459 #ifdef CONFIG_PROC_FS 3460 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) 3461 #define BPF_MAP_TYPE(_id, _ops) 3462 #define BPF_LINK_TYPE(_id, _name) [_id] = #_name, 3463 static const char *bpf_link_type_strs[] = { 3464 [BPF_LINK_TYPE_UNSPEC] = "<invalid>", 3465 #include <linux/bpf_types.h> 3466 }; 3467 #undef BPF_PROG_TYPE 3468 #undef BPF_MAP_TYPE 3469 #undef BPF_LINK_TYPE 3470 3471 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) 3472 { 3473 const struct bpf_link *link = filp->private_data; 3474 const struct bpf_prog *prog = link->prog; 3475 enum bpf_link_type type = link->type; 3476 char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; 3477 3478 if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { 3479 if (link->type == BPF_LINK_TYPE_KPROBE_MULTI) 3480 seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? 3481 "kretprobe_multi" : "kprobe_multi"); 3482 else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) 3483 seq_printf(m, "link_type:\t%s\n", link->flags & BPF_F_UPROBE_MULTI_RETURN ? 3484 "uretprobe_multi" : "uprobe_multi"); 3485 else 3486 seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); 3487 } else { 3488 WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); 3489 seq_printf(m, "link_type:\t<%u>\n", type); 3490 } 3491 seq_printf(m, "link_id:\t%u\n", link->id); 3492 3493 if (prog) { 3494 bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); 3495 seq_printf(m, 3496 "prog_tag:\t%s\n" 3497 "prog_id:\t%u\n", 3498 prog_tag, 3499 prog->aux->id); 3500 } 3501 if (link->ops->show_fdinfo) 3502 link->ops->show_fdinfo(link, m); 3503 } 3504 #endif 3505 3506 static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) 3507 { 3508 struct bpf_link *link = file->private_data; 3509 3510 return link->ops->poll(file, pts); 3511 } 3512 3513 static const struct file_operations bpf_link_fops = { 3514 #ifdef CONFIG_PROC_FS 3515 .show_fdinfo = bpf_link_show_fdinfo, 3516 #endif 3517 .release = bpf_link_release, 3518 .read = bpf_dummy_read, 3519 .write = bpf_dummy_write, 3520 }; 3521 3522 static const struct file_operations bpf_link_fops_poll = { 3523 #ifdef CONFIG_PROC_FS 3524 .show_fdinfo = bpf_link_show_fdinfo, 3525 #endif 3526 .release = bpf_link_release, 3527 .read = bpf_dummy_read, 3528 .write = bpf_dummy_write, 3529 .poll = bpf_link_poll, 3530 }; 3531 3532 static int bpf_link_alloc_id(struct bpf_link *link) 3533 { 3534 int id; 3535 3536 idr_preload(GFP_KERNEL); 3537 spin_lock_bh(&link_idr_lock); 3538 id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); 3539 spin_unlock_bh(&link_idr_lock); 3540 idr_preload_end(); 3541 3542 return id; 3543 } 3544 3545 /* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, 3546 * reserving unused FD and allocating ID from link_idr. This is to be paired 3547 * with bpf_link_settle() to install FD and ID and expose bpf_link to 3548 * user-space, if bpf_link is successfully attached. If not, bpf_link and 3549 * pre-allocated resources are to be freed with bpf_cleanup() call. All the 3550 * transient state is passed around in struct bpf_link_primer. 3551 * This is preferred way to create and initialize bpf_link, especially when 3552 * there are complicated and expensive operations in between creating bpf_link 3553 * itself and attaching it to BPF hook. By using bpf_link_prime() and 3554 * bpf_link_settle() kernel code using bpf_link doesn't have to perform 3555 * expensive (and potentially failing) roll back operations in a rare case 3556 * that file, FD, or ID can't be allocated. 3557 */ 3558 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) 3559 { 3560 struct file *file; 3561 int fd, id; 3562 3563 fd = get_unused_fd_flags(O_CLOEXEC); 3564 if (fd < 0) 3565 return fd; 3566 3567 3568 id = bpf_link_alloc_id(link); 3569 if (id < 0) { 3570 put_unused_fd(fd); 3571 return id; 3572 } 3573 3574 file = anon_inode_getfile("bpf_link", 3575 link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, 3576 link, O_CLOEXEC); 3577 if (IS_ERR(file)) { 3578 bpf_link_free_id(id); 3579 put_unused_fd(fd); 3580 return PTR_ERR(file); 3581 } 3582 3583 primer->link = link; 3584 primer->file = file; 3585 primer->fd = fd; 3586 primer->id = id; 3587 return 0; 3588 } 3589 3590 int bpf_link_settle(struct bpf_link_primer *primer) 3591 { 3592 /* make bpf_link fetchable by ID */ 3593 spin_lock_bh(&link_idr_lock); 3594 primer->link->id = primer->id; 3595 spin_unlock_bh(&link_idr_lock); 3596 /* make bpf_link fetchable by FD */ 3597 fd_install(primer->fd, primer->file); 3598 /* pass through installed FD */ 3599 return primer->fd; 3600 } 3601 3602 int bpf_link_new_fd(struct bpf_link *link) 3603 { 3604 return anon_inode_getfd("bpf-link", 3605 link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, 3606 link, O_CLOEXEC); 3607 } 3608 3609 struct bpf_link *bpf_link_get_from_fd(u32 ufd) 3610 { 3611 CLASS(fd, f)(ufd); 3612 struct bpf_link *link; 3613 3614 if (fd_empty(f)) 3615 return ERR_PTR(-EBADF); 3616 if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) 3617 return ERR_PTR(-EINVAL); 3618 3619 link = fd_file(f)->private_data; 3620 bpf_link_inc(link); 3621 return link; 3622 } 3623 EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL"); 3624 3625 static void bpf_tracing_link_release(struct bpf_link *link) 3626 { 3627 struct bpf_tracing_link *tr_link = 3628 container_of(link, struct bpf_tracing_link, link.link); 3629 3630 WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link.node, 3631 tr_link->trampoline, 3632 tr_link->tgt_prog)); 3633 3634 bpf_trampoline_put(tr_link->trampoline); 3635 3636 /* tgt_prog is NULL if target is a kernel function */ 3637 if (tr_link->tgt_prog) 3638 bpf_prog_put(tr_link->tgt_prog); 3639 } 3640 3641 static void bpf_tracing_link_dealloc(struct bpf_link *link) 3642 { 3643 struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); 3644 3645 kfree(tr_link); 3646 } 3647 3648 static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, 3649 struct seq_file *seq) 3650 { 3651 struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); 3652 3653 u32 target_btf_id, target_obj_id; 3654 3655 bpf_trampoline_unpack_key(tr_link->trampoline->key, 3656 &target_obj_id, &target_btf_id); 3657 seq_printf(seq, 3658 "attach_type:\t%d\n" 3659 "target_obj_id:\t%u\n" 3660 "target_btf_id:\t%u\n" 3661 "cookie:\t%llu\n", 3662 link->attach_type, 3663 target_obj_id, 3664 target_btf_id, 3665 tr_link->link.node.cookie); 3666 } 3667 3668 static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, 3669 struct bpf_link_info *info) 3670 { 3671 struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); 3672 3673 info->tracing.attach_type = link->attach_type; 3674 info->tracing.cookie = tr_link->link.node.cookie; 3675 bpf_trampoline_unpack_key(tr_link->trampoline->key, 3676 &info->tracing.target_obj_id, 3677 &info->tracing.target_btf_id); 3678 3679 return 0; 3680 } 3681 3682 static const struct bpf_link_ops bpf_tracing_link_lops = { 3683 .release = bpf_tracing_link_release, 3684 .dealloc = bpf_tracing_link_dealloc, 3685 .show_fdinfo = bpf_tracing_link_show_fdinfo, 3686 .fill_link_info = bpf_tracing_link_fill_link_info, 3687 }; 3688 3689 static int bpf_tracing_prog_attach(struct bpf_prog *prog, 3690 int tgt_prog_fd, 3691 u32 btf_id, 3692 u64 bpf_cookie, 3693 enum bpf_attach_type attach_type) 3694 { 3695 struct bpf_link_primer link_primer; 3696 struct bpf_prog *tgt_prog = NULL; 3697 struct bpf_trampoline *tr = NULL; 3698 struct bpf_tracing_link *link; 3699 u64 key = 0; 3700 int err; 3701 3702 switch (prog->type) { 3703 case BPF_PROG_TYPE_TRACING: 3704 if (prog->expected_attach_type != BPF_TRACE_FENTRY && 3705 prog->expected_attach_type != BPF_TRACE_FEXIT && 3706 prog->expected_attach_type != BPF_TRACE_FSESSION && 3707 prog->expected_attach_type != BPF_MODIFY_RETURN) { 3708 err = -EINVAL; 3709 goto out_put_prog; 3710 } 3711 break; 3712 case BPF_PROG_TYPE_EXT: 3713 if (prog->expected_attach_type != 0) { 3714 err = -EINVAL; 3715 goto out_put_prog; 3716 } 3717 break; 3718 case BPF_PROG_TYPE_LSM: 3719 if (prog->expected_attach_type != BPF_LSM_MAC) { 3720 err = -EINVAL; 3721 goto out_put_prog; 3722 } 3723 break; 3724 default: 3725 err = -EINVAL; 3726 goto out_put_prog; 3727 } 3728 3729 if (!!tgt_prog_fd != !!btf_id) { 3730 err = -EINVAL; 3731 goto out_put_prog; 3732 } 3733 3734 if (tgt_prog_fd) { 3735 /* 3736 * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this 3737 * part would be changed to implement the same for 3738 * BPF_PROG_TYPE_TRACING, do not forget to update the way how 3739 * attach_tracing_prog flag is set. 3740 */ 3741 if (prog->type != BPF_PROG_TYPE_EXT) { 3742 err = -EINVAL; 3743 goto out_put_prog; 3744 } 3745 3746 tgt_prog = bpf_prog_get(tgt_prog_fd); 3747 if (IS_ERR(tgt_prog)) { 3748 err = PTR_ERR(tgt_prog); 3749 tgt_prog = NULL; 3750 goto out_put_prog; 3751 } 3752 3753 key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); 3754 } 3755 3756 link = kzalloc_obj(*link, GFP_USER); 3757 if (!link) { 3758 err = -ENOMEM; 3759 goto out_put_prog; 3760 } 3761 bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING, 3762 &bpf_tracing_link_lops, prog, attach_type, bpf_cookie); 3763 3764 if (prog->expected_attach_type == BPF_TRACE_FSESSION) { 3765 link->fexit.link = &link->link.link; 3766 link->fexit.cookie = bpf_cookie; 3767 } 3768 3769 mutex_lock(&prog->aux->dst_mutex); 3770 3771 /* There are a few possible cases here: 3772 * 3773 * - if prog->aux->dst_trampoline is set, the program was just loaded 3774 * and not yet attached to anything, so we can use the values stored 3775 * in prog->aux 3776 * 3777 * - if prog->aux->dst_trampoline is NULL, the program has already been 3778 * attached to a target and its initial target was cleared (below) 3779 * 3780 * - if tgt_prog != NULL, the caller specified tgt_prog_fd + 3781 * target_btf_id using the link_create API. 3782 * 3783 * - if tgt_prog == NULL when this function was called using the old 3784 * raw_tracepoint_open API, and we need a target from prog->aux 3785 * 3786 * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program 3787 * was detached and is going for re-attachment. 3788 * 3789 * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf 3790 * are NULL, then program was already attached and user did not provide 3791 * tgt_prog_fd so we have no way to find out or create trampoline 3792 */ 3793 if (!prog->aux->dst_trampoline && !tgt_prog) { 3794 /* 3795 * Allow re-attach for TRACING and LSM programs. If it's 3796 * currently linked, bpf_trampoline_link_prog will fail. 3797 * EXT programs need to specify tgt_prog_fd, so they 3798 * re-attach in separate code path. 3799 */ 3800 if (prog->type != BPF_PROG_TYPE_TRACING && 3801 prog->type != BPF_PROG_TYPE_LSM) { 3802 err = -EINVAL; 3803 goto out_unlock; 3804 } 3805 /* We can allow re-attach only if we have valid attach_btf. */ 3806 if (!prog->aux->attach_btf) { 3807 err = -EINVAL; 3808 goto out_unlock; 3809 } 3810 btf_id = prog->aux->attach_btf_id; 3811 key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); 3812 } 3813 3814 if (!prog->aux->dst_trampoline || 3815 (key && key != prog->aux->dst_trampoline->key)) { 3816 /* If there is no saved target, or the specified target is 3817 * different from the destination specified at load time, we 3818 * need a new trampoline and a check for compatibility 3819 */ 3820 struct bpf_attach_target_info tgt_info = {}; 3821 3822 err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, 3823 &tgt_info); 3824 if (err) 3825 goto out_unlock; 3826 3827 if (tgt_info.tgt_mod) { 3828 module_put(prog->aux->mod); 3829 prog->aux->mod = tgt_info.tgt_mod; 3830 } 3831 3832 tr = bpf_trampoline_get(key, &tgt_info); 3833 if (!tr) { 3834 err = -ENOMEM; 3835 goto out_unlock; 3836 } 3837 } else { 3838 /* The caller didn't specify a target, or the target was the 3839 * same as the destination supplied during program load. This 3840 * means we can reuse the trampoline and reference from program 3841 * load time, and there is no need to allocate a new one. This 3842 * can only happen once for any program, as the saved values in 3843 * prog->aux are cleared below. 3844 */ 3845 tr = prog->aux->dst_trampoline; 3846 tgt_prog = prog->aux->dst_prog; 3847 } 3848 /* 3849 * It is to prevent modifying struct pt_regs via kprobe_write_ctx=true 3850 * freplace prog. Without this check, kprobe_write_ctx=true freplace 3851 * prog is allowed to attach to kprobe_write_ctx=false kprobe prog, and 3852 * then modify the registers of the kprobe prog's target kernel 3853 * function. 3854 * 3855 * This also blocks the combination of uprobe+freplace, because it is 3856 * unable to recognize the use of the tgt_prog as an uprobe or a kprobe 3857 * by tgt_prog itself. At attach time, uprobe/kprobe is recognized by 3858 * the target perf event flags in __perf_event_set_bpf_prog(). 3859 */ 3860 if (prog->type == BPF_PROG_TYPE_EXT && 3861 prog->aux->kprobe_write_ctx != tgt_prog->aux->kprobe_write_ctx) { 3862 err = -EINVAL; 3863 goto out_unlock; 3864 } 3865 3866 err = bpf_link_prime(&link->link.link, &link_primer); 3867 if (err) 3868 goto out_unlock; 3869 3870 err = bpf_trampoline_link_prog(&link->link.node, tr, tgt_prog); 3871 if (err) { 3872 bpf_link_cleanup(&link_primer); 3873 link = NULL; 3874 goto out_unlock; 3875 } 3876 3877 link->tgt_prog = tgt_prog; 3878 link->trampoline = tr; 3879 3880 /* Always clear the trampoline and target prog from prog->aux to make 3881 * sure the original attach destination is not kept alive after a 3882 * program is (re-)attached to another target. 3883 */ 3884 if (prog->aux->dst_prog && 3885 (tgt_prog_fd || tr != prog->aux->dst_trampoline)) 3886 /* got extra prog ref from syscall, or attaching to different prog */ 3887 bpf_prog_put(prog->aux->dst_prog); 3888 if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) 3889 /* we allocated a new trampoline, so free the old one */ 3890 bpf_trampoline_put(prog->aux->dst_trampoline); 3891 3892 prog->aux->dst_prog = NULL; 3893 prog->aux->dst_trampoline = NULL; 3894 mutex_unlock(&prog->aux->dst_mutex); 3895 3896 return bpf_link_settle(&link_primer); 3897 out_unlock: 3898 if (tr && tr != prog->aux->dst_trampoline) 3899 bpf_trampoline_put(tr); 3900 mutex_unlock(&prog->aux->dst_mutex); 3901 kfree(link); 3902 out_put_prog: 3903 if (tgt_prog_fd && tgt_prog) 3904 bpf_prog_put(tgt_prog); 3905 return err; 3906 } 3907 3908 static void bpf_raw_tp_link_release(struct bpf_link *link) 3909 { 3910 struct bpf_raw_tp_link *raw_tp = 3911 container_of(link, struct bpf_raw_tp_link, link); 3912 3913 bpf_probe_unregister(raw_tp->btp, raw_tp); 3914 bpf_put_raw_tracepoint(raw_tp->btp); 3915 } 3916 3917 static void bpf_raw_tp_link_dealloc(struct bpf_link *link) 3918 { 3919 struct bpf_raw_tp_link *raw_tp = 3920 container_of(link, struct bpf_raw_tp_link, link); 3921 3922 kfree(raw_tp); 3923 } 3924 3925 static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, 3926 struct seq_file *seq) 3927 { 3928 struct bpf_raw_tp_link *raw_tp_link = 3929 container_of(link, struct bpf_raw_tp_link, link); 3930 3931 seq_printf(seq, 3932 "tp_name:\t%s\n" 3933 "cookie:\t%llu\n", 3934 raw_tp_link->btp->tp->name, 3935 raw_tp_link->cookie); 3936 } 3937 3938 static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, 3939 u32 len) 3940 { 3941 if (ulen >= len + 1) { 3942 if (copy_to_user(ubuf, buf, len + 1)) 3943 return -EFAULT; 3944 } else { 3945 char zero = '\0'; 3946 3947 if (copy_to_user(ubuf, buf, ulen - 1)) 3948 return -EFAULT; 3949 if (put_user(zero, ubuf + ulen - 1)) 3950 return -EFAULT; 3951 return -ENOSPC; 3952 } 3953 3954 return 0; 3955 } 3956 3957 static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, 3958 struct bpf_link_info *info) 3959 { 3960 struct bpf_raw_tp_link *raw_tp_link = 3961 container_of(link, struct bpf_raw_tp_link, link); 3962 char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); 3963 const char *tp_name = raw_tp_link->btp->tp->name; 3964 u32 ulen = info->raw_tracepoint.tp_name_len; 3965 size_t tp_len = strlen(tp_name); 3966 3967 if (!ulen ^ !ubuf) 3968 return -EINVAL; 3969 3970 info->raw_tracepoint.tp_name_len = tp_len + 1; 3971 info->raw_tracepoint.cookie = raw_tp_link->cookie; 3972 3973 if (!ubuf) 3974 return 0; 3975 3976 return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len); 3977 } 3978 3979 static const struct bpf_link_ops bpf_raw_tp_link_lops = { 3980 .release = bpf_raw_tp_link_release, 3981 .dealloc_deferred = bpf_raw_tp_link_dealloc, 3982 .show_fdinfo = bpf_raw_tp_link_show_fdinfo, 3983 .fill_link_info = bpf_raw_tp_link_fill_link_info, 3984 }; 3985 3986 #ifdef CONFIG_PERF_EVENTS 3987 struct bpf_perf_link { 3988 struct bpf_link link; 3989 struct file *perf_file; 3990 }; 3991 3992 static void bpf_perf_link_release(struct bpf_link *link) 3993 { 3994 struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); 3995 struct perf_event *event = perf_link->perf_file->private_data; 3996 3997 perf_event_free_bpf_prog(event); 3998 fput(perf_link->perf_file); 3999 } 4000 4001 static void bpf_perf_link_dealloc(struct bpf_link *link) 4002 { 4003 struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); 4004 4005 kfree(perf_link); 4006 } 4007 4008 static int bpf_perf_link_fill_common(const struct perf_event *event, 4009 char __user *uname, u32 *ulenp, 4010 u64 *probe_offset, u64 *probe_addr, 4011 u32 *fd_type, unsigned long *missed) 4012 { 4013 const char *buf; 4014 u32 prog_id, ulen; 4015 size_t len; 4016 int err; 4017 4018 ulen = *ulenp; 4019 if (!ulen ^ !uname) 4020 return -EINVAL; 4021 4022 err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, 4023 probe_offset, probe_addr, missed); 4024 if (err) 4025 return err; 4026 4027 if (buf) { 4028 len = strlen(buf); 4029 *ulenp = len + 1; 4030 } else { 4031 *ulenp = 1; 4032 } 4033 if (!uname) 4034 return 0; 4035 4036 if (buf) { 4037 err = bpf_copy_to_user(uname, buf, ulen, len); 4038 if (err) 4039 return err; 4040 } else { 4041 char zero = '\0'; 4042 4043 if (put_user(zero, uname)) 4044 return -EFAULT; 4045 } 4046 return 0; 4047 } 4048 4049 #ifdef CONFIG_KPROBE_EVENTS 4050 static int bpf_perf_link_fill_kprobe(const struct perf_event *event, 4051 struct bpf_link_info *info) 4052 { 4053 unsigned long missed; 4054 char __user *uname; 4055 u64 addr, offset; 4056 u32 ulen, type; 4057 int err; 4058 4059 uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); 4060 ulen = info->perf_event.kprobe.name_len; 4061 err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, 4062 &type, &missed); 4063 if (err) 4064 return err; 4065 if (type == BPF_FD_TYPE_KRETPROBE) 4066 info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; 4067 else 4068 info->perf_event.type = BPF_PERF_EVENT_KPROBE; 4069 info->perf_event.kprobe.name_len = ulen; 4070 info->perf_event.kprobe.offset = offset; 4071 info->perf_event.kprobe.missed = missed; 4072 if (!kallsyms_show_value(current_cred())) 4073 addr = 0; 4074 info->perf_event.kprobe.addr = addr; 4075 info->perf_event.kprobe.cookie = event->bpf_cookie; 4076 return 0; 4077 } 4078 4079 static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event, 4080 struct seq_file *seq) 4081 { 4082 const char *name; 4083 int err; 4084 u32 prog_id, type; 4085 u64 offset, addr; 4086 unsigned long missed; 4087 4088 err = bpf_get_perf_event_info(event, &prog_id, &type, &name, 4089 &offset, &addr, &missed); 4090 if (err) 4091 return; 4092 4093 seq_printf(seq, 4094 "name:\t%s\n" 4095 "offset:\t%#llx\n" 4096 "missed:\t%lu\n" 4097 "addr:\t%#llx\n" 4098 "event_type:\t%s\n" 4099 "cookie:\t%llu\n", 4100 name, offset, missed, addr, 4101 type == BPF_FD_TYPE_KRETPROBE ? "kretprobe" : "kprobe", 4102 event->bpf_cookie); 4103 } 4104 #endif 4105 4106 #ifdef CONFIG_UPROBE_EVENTS 4107 static int bpf_perf_link_fill_uprobe(const struct perf_event *event, 4108 struct bpf_link_info *info) 4109 { 4110 u64 ref_ctr_offset, offset; 4111 char __user *uname; 4112 u32 ulen, type; 4113 int err; 4114 4115 uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); 4116 ulen = info->perf_event.uprobe.name_len; 4117 err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset, 4118 &type, NULL); 4119 if (err) 4120 return err; 4121 4122 if (type == BPF_FD_TYPE_URETPROBE) 4123 info->perf_event.type = BPF_PERF_EVENT_URETPROBE; 4124 else 4125 info->perf_event.type = BPF_PERF_EVENT_UPROBE; 4126 info->perf_event.uprobe.name_len = ulen; 4127 info->perf_event.uprobe.offset = offset; 4128 info->perf_event.uprobe.cookie = event->bpf_cookie; 4129 info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; 4130 return 0; 4131 } 4132 4133 static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event, 4134 struct seq_file *seq) 4135 { 4136 const char *name; 4137 int err; 4138 u32 prog_id, type; 4139 u64 offset, ref_ctr_offset; 4140 unsigned long missed; 4141 4142 err = bpf_get_perf_event_info(event, &prog_id, &type, &name, 4143 &offset, &ref_ctr_offset, &missed); 4144 if (err) 4145 return; 4146 4147 seq_printf(seq, 4148 "name:\t%s\n" 4149 "offset:\t%#llx\n" 4150 "ref_ctr_offset:\t%#llx\n" 4151 "event_type:\t%s\n" 4152 "cookie:\t%llu\n", 4153 name, offset, ref_ctr_offset, 4154 type == BPF_FD_TYPE_URETPROBE ? "uretprobe" : "uprobe", 4155 event->bpf_cookie); 4156 } 4157 #endif 4158 4159 static int bpf_perf_link_fill_probe(const struct perf_event *event, 4160 struct bpf_link_info *info) 4161 { 4162 #ifdef CONFIG_KPROBE_EVENTS 4163 if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) 4164 return bpf_perf_link_fill_kprobe(event, info); 4165 #endif 4166 #ifdef CONFIG_UPROBE_EVENTS 4167 if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) 4168 return bpf_perf_link_fill_uprobe(event, info); 4169 #endif 4170 return -EOPNOTSUPP; 4171 } 4172 4173 static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, 4174 struct bpf_link_info *info) 4175 { 4176 char __user *uname; 4177 u32 ulen; 4178 int err; 4179 4180 uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); 4181 ulen = info->perf_event.tracepoint.name_len; 4182 err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); 4183 if (err) 4184 return err; 4185 4186 info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; 4187 info->perf_event.tracepoint.name_len = ulen; 4188 info->perf_event.tracepoint.cookie = event->bpf_cookie; 4189 return 0; 4190 } 4191 4192 static int bpf_perf_link_fill_perf_event(const struct perf_event *event, 4193 struct bpf_link_info *info) 4194 { 4195 info->perf_event.event.type = event->attr.type; 4196 info->perf_event.event.config = event->attr.config; 4197 info->perf_event.event.cookie = event->bpf_cookie; 4198 info->perf_event.type = BPF_PERF_EVENT_EVENT; 4199 return 0; 4200 } 4201 4202 static int bpf_perf_link_fill_link_info(const struct bpf_link *link, 4203 struct bpf_link_info *info) 4204 { 4205 struct bpf_perf_link *perf_link; 4206 const struct perf_event *event; 4207 4208 perf_link = container_of(link, struct bpf_perf_link, link); 4209 event = perf_get_event(perf_link->perf_file); 4210 if (IS_ERR(event)) 4211 return PTR_ERR(event); 4212 4213 switch (event->prog->type) { 4214 case BPF_PROG_TYPE_PERF_EVENT: 4215 return bpf_perf_link_fill_perf_event(event, info); 4216 case BPF_PROG_TYPE_TRACEPOINT: 4217 return bpf_perf_link_fill_tracepoint(event, info); 4218 case BPF_PROG_TYPE_KPROBE: 4219 return bpf_perf_link_fill_probe(event, info); 4220 default: 4221 return -EOPNOTSUPP; 4222 } 4223 } 4224 4225 static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event, 4226 struct seq_file *seq) 4227 { 4228 seq_printf(seq, 4229 "type:\t%u\n" 4230 "config:\t%llu\n" 4231 "event_type:\t%s\n" 4232 "cookie:\t%llu\n", 4233 event->attr.type, event->attr.config, 4234 "event", event->bpf_cookie); 4235 } 4236 4237 static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event, 4238 struct seq_file *seq) 4239 { 4240 int err; 4241 const char *name; 4242 u32 prog_id; 4243 4244 err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL, 4245 NULL, NULL); 4246 if (err) 4247 return; 4248 4249 seq_printf(seq, 4250 "tp_name:\t%s\n" 4251 "event_type:\t%s\n" 4252 "cookie:\t%llu\n", 4253 name, "tracepoint", event->bpf_cookie); 4254 } 4255 4256 static void bpf_probe_link_show_fdinfo(const struct perf_event *event, 4257 struct seq_file *seq) 4258 { 4259 #ifdef CONFIG_KPROBE_EVENTS 4260 if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) 4261 return bpf_perf_link_fdinfo_kprobe(event, seq); 4262 #endif 4263 4264 #ifdef CONFIG_UPROBE_EVENTS 4265 if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) 4266 return bpf_perf_link_fdinfo_uprobe(event, seq); 4267 #endif 4268 } 4269 4270 static void bpf_perf_link_show_fdinfo(const struct bpf_link *link, 4271 struct seq_file *seq) 4272 { 4273 struct bpf_perf_link *perf_link; 4274 const struct perf_event *event; 4275 4276 perf_link = container_of(link, struct bpf_perf_link, link); 4277 event = perf_get_event(perf_link->perf_file); 4278 if (IS_ERR(event)) 4279 return; 4280 4281 switch (event->prog->type) { 4282 case BPF_PROG_TYPE_PERF_EVENT: 4283 return bpf_perf_event_link_show_fdinfo(event, seq); 4284 case BPF_PROG_TYPE_TRACEPOINT: 4285 return bpf_tracepoint_link_show_fdinfo(event, seq); 4286 case BPF_PROG_TYPE_KPROBE: 4287 return bpf_probe_link_show_fdinfo(event, seq); 4288 default: 4289 return; 4290 } 4291 } 4292 4293 static const struct bpf_link_ops bpf_perf_link_lops = { 4294 .release = bpf_perf_link_release, 4295 .dealloc = bpf_perf_link_dealloc, 4296 .fill_link_info = bpf_perf_link_fill_link_info, 4297 .show_fdinfo = bpf_perf_link_show_fdinfo, 4298 }; 4299 4300 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 4301 { 4302 struct bpf_link_primer link_primer; 4303 struct bpf_perf_link *link; 4304 struct perf_event *event; 4305 struct file *perf_file; 4306 int err; 4307 4308 if (attr->link_create.flags) 4309 return -EINVAL; 4310 4311 perf_file = perf_event_get(attr->link_create.target_fd); 4312 if (IS_ERR(perf_file)) 4313 return PTR_ERR(perf_file); 4314 4315 link = kzalloc_obj(*link, GFP_USER); 4316 if (!link) { 4317 err = -ENOMEM; 4318 goto out_put_file; 4319 } 4320 bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog, 4321 attr->link_create.attach_type); 4322 link->perf_file = perf_file; 4323 4324 err = bpf_link_prime(&link->link, &link_primer); 4325 if (err) { 4326 kfree(link); 4327 goto out_put_file; 4328 } 4329 4330 event = perf_file->private_data; 4331 err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); 4332 if (err) { 4333 bpf_link_cleanup(&link_primer); 4334 goto out_put_file; 4335 } 4336 /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ 4337 bpf_prog_inc(prog); 4338 4339 return bpf_link_settle(&link_primer); 4340 4341 out_put_file: 4342 fput(perf_file); 4343 return err; 4344 } 4345 #else 4346 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 4347 { 4348 return -EOPNOTSUPP; 4349 } 4350 #endif /* CONFIG_PERF_EVENTS */ 4351 4352 static int bpf_raw_tp_link_attach(struct bpf_prog *prog, 4353 const char __user *user_tp_name, u64 cookie, 4354 enum bpf_attach_type attach_type) 4355 { 4356 struct bpf_link_primer link_primer; 4357 struct bpf_raw_tp_link *link; 4358 struct bpf_raw_event_map *btp; 4359 const char *tp_name; 4360 char buf[128]; 4361 int err; 4362 4363 switch (prog->type) { 4364 case BPF_PROG_TYPE_TRACING: 4365 case BPF_PROG_TYPE_EXT: 4366 case BPF_PROG_TYPE_LSM: 4367 if (user_tp_name) 4368 /* The attach point for this category of programs 4369 * should be specified via btf_id during program load. 4370 */ 4371 return -EINVAL; 4372 if (prog->type == BPF_PROG_TYPE_TRACING && 4373 prog->expected_attach_type == BPF_TRACE_RAW_TP) { 4374 tp_name = prog->aux->attach_func_name; 4375 break; 4376 } 4377 return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type); 4378 case BPF_PROG_TYPE_RAW_TRACEPOINT: 4379 case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: 4380 if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) 4381 return -EFAULT; 4382 buf[sizeof(buf) - 1] = 0; 4383 tp_name = buf; 4384 break; 4385 default: 4386 return -EINVAL; 4387 } 4388 4389 btp = bpf_get_raw_tracepoint(tp_name); 4390 if (!btp) 4391 return -ENOENT; 4392 4393 if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { 4394 bpf_put_raw_tracepoint(btp); 4395 return -EINVAL; 4396 } 4397 4398 link = kzalloc_obj(*link, GFP_USER); 4399 if (!link) { 4400 err = -ENOMEM; 4401 goto out_put_btp; 4402 } 4403 bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, 4404 &bpf_raw_tp_link_lops, prog, attach_type, 4405 tracepoint_is_faultable(btp->tp)); 4406 link->btp = btp; 4407 link->cookie = cookie; 4408 4409 err = bpf_link_prime(&link->link, &link_primer); 4410 if (err) { 4411 kfree(link); 4412 goto out_put_btp; 4413 } 4414 4415 err = bpf_probe_register(link->btp, link); 4416 if (err) { 4417 bpf_link_cleanup(&link_primer); 4418 goto out_put_btp; 4419 } 4420 4421 return bpf_link_settle(&link_primer); 4422 4423 out_put_btp: 4424 bpf_put_raw_tracepoint(btp); 4425 return err; 4426 } 4427 4428 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie 4429 4430 static int bpf_raw_tracepoint_open(const union bpf_attr *attr) 4431 { 4432 struct bpf_prog *prog; 4433 void __user *tp_name; 4434 __u64 cookie; 4435 int fd; 4436 4437 if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) 4438 return -EINVAL; 4439 4440 prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); 4441 if (IS_ERR(prog)) 4442 return PTR_ERR(prog); 4443 4444 tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); 4445 cookie = attr->raw_tracepoint.cookie; 4446 fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type); 4447 if (fd < 0) 4448 bpf_prog_put(prog); 4449 return fd; 4450 } 4451 4452 static enum bpf_prog_type 4453 attach_type_to_prog_type(enum bpf_attach_type attach_type) 4454 { 4455 switch (attach_type) { 4456 case BPF_CGROUP_INET_INGRESS: 4457 case BPF_CGROUP_INET_EGRESS: 4458 return BPF_PROG_TYPE_CGROUP_SKB; 4459 case BPF_CGROUP_INET_SOCK_CREATE: 4460 case BPF_CGROUP_INET_SOCK_RELEASE: 4461 case BPF_CGROUP_INET4_POST_BIND: 4462 case BPF_CGROUP_INET6_POST_BIND: 4463 return BPF_PROG_TYPE_CGROUP_SOCK; 4464 case BPF_CGROUP_INET4_BIND: 4465 case BPF_CGROUP_INET6_BIND: 4466 case BPF_CGROUP_INET4_CONNECT: 4467 case BPF_CGROUP_INET6_CONNECT: 4468 case BPF_CGROUP_UNIX_CONNECT: 4469 case BPF_CGROUP_INET4_GETPEERNAME: 4470 case BPF_CGROUP_INET6_GETPEERNAME: 4471 case BPF_CGROUP_UNIX_GETPEERNAME: 4472 case BPF_CGROUP_INET4_GETSOCKNAME: 4473 case BPF_CGROUP_INET6_GETSOCKNAME: 4474 case BPF_CGROUP_UNIX_GETSOCKNAME: 4475 case BPF_CGROUP_UDP4_SENDMSG: 4476 case BPF_CGROUP_UDP6_SENDMSG: 4477 case BPF_CGROUP_UNIX_SENDMSG: 4478 case BPF_CGROUP_UDP4_RECVMSG: 4479 case BPF_CGROUP_UDP6_RECVMSG: 4480 case BPF_CGROUP_UNIX_RECVMSG: 4481 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; 4482 case BPF_CGROUP_SOCK_OPS: 4483 return BPF_PROG_TYPE_SOCK_OPS; 4484 case BPF_CGROUP_DEVICE: 4485 return BPF_PROG_TYPE_CGROUP_DEVICE; 4486 case BPF_SK_MSG_VERDICT: 4487 return BPF_PROG_TYPE_SK_MSG; 4488 case BPF_SK_SKB_STREAM_PARSER: 4489 case BPF_SK_SKB_STREAM_VERDICT: 4490 case BPF_SK_SKB_VERDICT: 4491 return BPF_PROG_TYPE_SK_SKB; 4492 case BPF_LIRC_MODE2: 4493 return BPF_PROG_TYPE_LIRC_MODE2; 4494 case BPF_FLOW_DISSECTOR: 4495 return BPF_PROG_TYPE_FLOW_DISSECTOR; 4496 case BPF_CGROUP_SYSCTL: 4497 return BPF_PROG_TYPE_CGROUP_SYSCTL; 4498 case BPF_CGROUP_GETSOCKOPT: 4499 case BPF_CGROUP_SETSOCKOPT: 4500 return BPF_PROG_TYPE_CGROUP_SOCKOPT; 4501 case BPF_TRACE_ITER: 4502 case BPF_TRACE_RAW_TP: 4503 case BPF_TRACE_FENTRY: 4504 case BPF_TRACE_FEXIT: 4505 case BPF_TRACE_FSESSION: 4506 case BPF_TRACE_FSESSION_MULTI: 4507 case BPF_TRACE_FENTRY_MULTI: 4508 case BPF_TRACE_FEXIT_MULTI: 4509 case BPF_MODIFY_RETURN: 4510 return BPF_PROG_TYPE_TRACING; 4511 case BPF_LSM_MAC: 4512 return BPF_PROG_TYPE_LSM; 4513 case BPF_SK_LOOKUP: 4514 return BPF_PROG_TYPE_SK_LOOKUP; 4515 case BPF_XDP: 4516 return BPF_PROG_TYPE_XDP; 4517 case BPF_LSM_CGROUP: 4518 return BPF_PROG_TYPE_LSM; 4519 case BPF_TCX_INGRESS: 4520 case BPF_TCX_EGRESS: 4521 case BPF_NETKIT_PRIMARY: 4522 case BPF_NETKIT_PEER: 4523 return BPF_PROG_TYPE_SCHED_CLS; 4524 default: 4525 return BPF_PROG_TYPE_UNSPEC; 4526 } 4527 } 4528 4529 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, 4530 enum bpf_attach_type attach_type) 4531 { 4532 enum bpf_prog_type ptype; 4533 4534 switch (prog->type) { 4535 case BPF_PROG_TYPE_CGROUP_SOCK: 4536 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 4537 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 4538 case BPF_PROG_TYPE_SK_LOOKUP: 4539 return attach_type == prog->expected_attach_type ? 0 : -EINVAL; 4540 case BPF_PROG_TYPE_CGROUP_SKB: 4541 if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN)) 4542 /* cg-skb progs can be loaded by unpriv user. 4543 * check permissions at attach time. 4544 */ 4545 return -EPERM; 4546 4547 ptype = attach_type_to_prog_type(attach_type); 4548 if (prog->type != ptype) 4549 return -EINVAL; 4550 4551 return prog->enforce_expected_attach_type && 4552 prog->expected_attach_type != attach_type ? 4553 -EINVAL : 0; 4554 case BPF_PROG_TYPE_EXT: 4555 return 0; 4556 case BPF_PROG_TYPE_NETFILTER: 4557 if (attach_type != BPF_NETFILTER) 4558 return -EINVAL; 4559 return 0; 4560 case BPF_PROG_TYPE_PERF_EVENT: 4561 case BPF_PROG_TYPE_TRACEPOINT: 4562 if (attach_type != BPF_PERF_EVENT) 4563 return -EINVAL; 4564 return 0; 4565 case BPF_PROG_TYPE_KPROBE: 4566 if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && 4567 attach_type != BPF_TRACE_KPROBE_MULTI) 4568 return -EINVAL; 4569 if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && 4570 attach_type != BPF_TRACE_KPROBE_SESSION) 4571 return -EINVAL; 4572 if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && 4573 attach_type != BPF_TRACE_UPROBE_MULTI) 4574 return -EINVAL; 4575 if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && 4576 attach_type != BPF_TRACE_UPROBE_SESSION) 4577 return -EINVAL; 4578 if (attach_type != BPF_PERF_EVENT && 4579 attach_type != BPF_TRACE_KPROBE_MULTI && 4580 attach_type != BPF_TRACE_KPROBE_SESSION && 4581 attach_type != BPF_TRACE_UPROBE_MULTI && 4582 attach_type != BPF_TRACE_UPROBE_SESSION) 4583 return -EINVAL; 4584 return 0; 4585 case BPF_PROG_TYPE_SCHED_CLS: 4586 if (attach_type != BPF_TCX_INGRESS && 4587 attach_type != BPF_TCX_EGRESS && 4588 attach_type != BPF_NETKIT_PRIMARY && 4589 attach_type != BPF_NETKIT_PEER) 4590 return -EINVAL; 4591 return 0; 4592 default: 4593 ptype = attach_type_to_prog_type(attach_type); 4594 if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) 4595 return -EINVAL; 4596 return 0; 4597 } 4598 } 4599 4600 static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype, 4601 bool check_atype) 4602 { 4603 switch (ptype) { 4604 case BPF_PROG_TYPE_CGROUP_DEVICE: 4605 case BPF_PROG_TYPE_CGROUP_SKB: 4606 case BPF_PROG_TYPE_CGROUP_SOCK: 4607 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 4608 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 4609 case BPF_PROG_TYPE_CGROUP_SYSCTL: 4610 case BPF_PROG_TYPE_SOCK_OPS: 4611 return true; 4612 case BPF_PROG_TYPE_LSM: 4613 return check_atype ? atype == BPF_LSM_CGROUP : true; 4614 default: 4615 return false; 4616 } 4617 } 4618 4619 #define BPF_PROG_ATTACH_LAST_FIELD expected_revision 4620 4621 #define BPF_F_ATTACH_MASK_BASE \ 4622 (BPF_F_ALLOW_OVERRIDE | \ 4623 BPF_F_ALLOW_MULTI | \ 4624 BPF_F_REPLACE | \ 4625 BPF_F_PREORDER) 4626 4627 #define BPF_F_ATTACH_MASK_MPROG \ 4628 (BPF_F_REPLACE | \ 4629 BPF_F_BEFORE | \ 4630 BPF_F_AFTER | \ 4631 BPF_F_ID | \ 4632 BPF_F_LINK) 4633 4634 static int bpf_prog_attach(const union bpf_attr *attr) 4635 { 4636 enum bpf_prog_type ptype; 4637 struct bpf_prog *prog; 4638 int ret; 4639 4640 if (CHECK_ATTR(BPF_PROG_ATTACH)) 4641 return -EINVAL; 4642 4643 ptype = attach_type_to_prog_type(attr->attach_type); 4644 if (ptype == BPF_PROG_TYPE_UNSPEC) 4645 return -EINVAL; 4646 if (bpf_mprog_supported(ptype)) { 4647 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) 4648 return -EINVAL; 4649 } else if (is_cgroup_prog_type(ptype, 0, false)) { 4650 if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG)) 4651 return -EINVAL; 4652 } else { 4653 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) 4654 return -EINVAL; 4655 if (attr->relative_fd || 4656 attr->expected_revision) 4657 return -EINVAL; 4658 } 4659 4660 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 4661 if (IS_ERR(prog)) 4662 return PTR_ERR(prog); 4663 4664 if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { 4665 bpf_prog_put(prog); 4666 return -EINVAL; 4667 } 4668 4669 if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) { 4670 ret = cgroup_bpf_prog_attach(attr, ptype, prog); 4671 goto out; 4672 } 4673 4674 switch (ptype) { 4675 case BPF_PROG_TYPE_SK_SKB: 4676 case BPF_PROG_TYPE_SK_MSG: 4677 ret = sock_map_get_from_fd(attr, prog); 4678 break; 4679 case BPF_PROG_TYPE_LIRC_MODE2: 4680 ret = lirc_prog_attach(attr, prog); 4681 break; 4682 case BPF_PROG_TYPE_FLOW_DISSECTOR: 4683 ret = netns_bpf_prog_attach(attr, prog); 4684 break; 4685 case BPF_PROG_TYPE_SCHED_CLS: 4686 if (attr->attach_type == BPF_TCX_INGRESS || 4687 attr->attach_type == BPF_TCX_EGRESS) 4688 ret = tcx_prog_attach(attr, prog); 4689 else 4690 ret = netkit_prog_attach(attr, prog); 4691 break; 4692 default: 4693 ret = -EINVAL; 4694 } 4695 out: 4696 if (ret) 4697 bpf_prog_put(prog); 4698 return ret; 4699 } 4700 4701 #define BPF_PROG_DETACH_LAST_FIELD expected_revision 4702 4703 static int bpf_prog_detach(const union bpf_attr *attr) 4704 { 4705 struct bpf_prog *prog = NULL; 4706 enum bpf_prog_type ptype; 4707 int ret; 4708 4709 if (CHECK_ATTR(BPF_PROG_DETACH)) 4710 return -EINVAL; 4711 4712 ptype = attach_type_to_prog_type(attr->attach_type); 4713 if (bpf_mprog_supported(ptype)) { 4714 if (ptype == BPF_PROG_TYPE_UNSPEC) 4715 return -EINVAL; 4716 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) 4717 return -EINVAL; 4718 if (attr->attach_bpf_fd) { 4719 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 4720 if (IS_ERR(prog)) 4721 return PTR_ERR(prog); 4722 } else if (!bpf_mprog_detach_empty(ptype)) { 4723 return -EPERM; 4724 } 4725 } else if (is_cgroup_prog_type(ptype, 0, false)) { 4726 if (attr->attach_flags || attr->relative_fd) 4727 return -EINVAL; 4728 } else if (attr->attach_flags || 4729 attr->relative_fd || 4730 attr->expected_revision) { 4731 return -EINVAL; 4732 } 4733 4734 switch (ptype) { 4735 case BPF_PROG_TYPE_SK_MSG: 4736 case BPF_PROG_TYPE_SK_SKB: 4737 ret = sock_map_prog_detach(attr, ptype); 4738 break; 4739 case BPF_PROG_TYPE_LIRC_MODE2: 4740 ret = lirc_prog_detach(attr); 4741 break; 4742 case BPF_PROG_TYPE_FLOW_DISSECTOR: 4743 ret = netns_bpf_prog_detach(attr, ptype); 4744 break; 4745 case BPF_PROG_TYPE_CGROUP_DEVICE: 4746 case BPF_PROG_TYPE_CGROUP_SKB: 4747 case BPF_PROG_TYPE_CGROUP_SOCK: 4748 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 4749 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 4750 case BPF_PROG_TYPE_CGROUP_SYSCTL: 4751 case BPF_PROG_TYPE_SOCK_OPS: 4752 case BPF_PROG_TYPE_LSM: 4753 ret = cgroup_bpf_prog_detach(attr, ptype); 4754 break; 4755 case BPF_PROG_TYPE_SCHED_CLS: 4756 if (attr->attach_type == BPF_TCX_INGRESS || 4757 attr->attach_type == BPF_TCX_EGRESS) 4758 ret = tcx_prog_detach(attr, prog); 4759 else 4760 ret = netkit_prog_detach(attr, prog); 4761 break; 4762 default: 4763 ret = -EINVAL; 4764 } 4765 4766 if (prog) 4767 bpf_prog_put(prog); 4768 return ret; 4769 } 4770 4771 #define BPF_PROG_QUERY_LAST_FIELD query.revision 4772 4773 static int bpf_prog_query(const union bpf_attr *attr, 4774 union bpf_attr __user *uattr, u32 uattr_size) 4775 { 4776 if (!bpf_net_capable()) 4777 return -EPERM; 4778 if (CHECK_ATTR(BPF_PROG_QUERY)) 4779 return -EINVAL; 4780 if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) 4781 return -EINVAL; 4782 4783 switch (attr->query.attach_type) { 4784 case BPF_CGROUP_INET_INGRESS: 4785 case BPF_CGROUP_INET_EGRESS: 4786 case BPF_CGROUP_INET_SOCK_CREATE: 4787 case BPF_CGROUP_INET_SOCK_RELEASE: 4788 case BPF_CGROUP_INET4_BIND: 4789 case BPF_CGROUP_INET6_BIND: 4790 case BPF_CGROUP_INET4_POST_BIND: 4791 case BPF_CGROUP_INET6_POST_BIND: 4792 case BPF_CGROUP_INET4_CONNECT: 4793 case BPF_CGROUP_INET6_CONNECT: 4794 case BPF_CGROUP_UNIX_CONNECT: 4795 case BPF_CGROUP_INET4_GETPEERNAME: 4796 case BPF_CGROUP_INET6_GETPEERNAME: 4797 case BPF_CGROUP_UNIX_GETPEERNAME: 4798 case BPF_CGROUP_INET4_GETSOCKNAME: 4799 case BPF_CGROUP_INET6_GETSOCKNAME: 4800 case BPF_CGROUP_UNIX_GETSOCKNAME: 4801 case BPF_CGROUP_UDP4_SENDMSG: 4802 case BPF_CGROUP_UDP6_SENDMSG: 4803 case BPF_CGROUP_UNIX_SENDMSG: 4804 case BPF_CGROUP_UDP4_RECVMSG: 4805 case BPF_CGROUP_UDP6_RECVMSG: 4806 case BPF_CGROUP_UNIX_RECVMSG: 4807 case BPF_CGROUP_SOCK_OPS: 4808 case BPF_CGROUP_DEVICE: 4809 case BPF_CGROUP_SYSCTL: 4810 case BPF_CGROUP_GETSOCKOPT: 4811 case BPF_CGROUP_SETSOCKOPT: 4812 case BPF_LSM_CGROUP: 4813 return cgroup_bpf_prog_query(attr, uattr, uattr_size); 4814 case BPF_LIRC_MODE2: 4815 return lirc_prog_query(attr, uattr); 4816 case BPF_FLOW_DISSECTOR: 4817 case BPF_SK_LOOKUP: 4818 return netns_bpf_prog_query(attr, uattr); 4819 case BPF_SK_SKB_STREAM_PARSER: 4820 case BPF_SK_SKB_STREAM_VERDICT: 4821 case BPF_SK_MSG_VERDICT: 4822 case BPF_SK_SKB_VERDICT: 4823 return sock_map_bpf_prog_query(attr, uattr); 4824 case BPF_TCX_INGRESS: 4825 case BPF_TCX_EGRESS: 4826 return tcx_prog_query(attr, uattr); 4827 case BPF_NETKIT_PRIMARY: 4828 case BPF_NETKIT_PEER: 4829 return netkit_prog_query(attr, uattr); 4830 default: 4831 return -EINVAL; 4832 } 4833 } 4834 4835 #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size 4836 4837 static int bpf_prog_test_run(const union bpf_attr *attr, 4838 union bpf_attr __user *uattr) 4839 { 4840 struct bpf_prog *prog; 4841 int ret = -ENOTSUPP; 4842 4843 if (CHECK_ATTR(BPF_PROG_TEST_RUN)) 4844 return -EINVAL; 4845 4846 if ((attr->test.ctx_size_in && !attr->test.ctx_in) || 4847 (!attr->test.ctx_size_in && attr->test.ctx_in)) 4848 return -EINVAL; 4849 4850 if ((attr->test.ctx_size_out && !attr->test.ctx_out) || 4851 (!attr->test.ctx_size_out && attr->test.ctx_out)) 4852 return -EINVAL; 4853 4854 prog = bpf_prog_get(attr->test.prog_fd); 4855 if (IS_ERR(prog)) 4856 return PTR_ERR(prog); 4857 4858 if (prog->aux->ops->test_run) 4859 ret = prog->aux->ops->test_run(prog, attr, uattr); 4860 4861 bpf_prog_put(prog); 4862 return ret; 4863 } 4864 4865 #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id 4866 4867 static int bpf_obj_get_next_id(const union bpf_attr *attr, 4868 union bpf_attr __user *uattr, 4869 struct idr *idr, 4870 spinlock_t *lock) 4871 { 4872 u32 next_id = attr->start_id; 4873 int err = 0; 4874 4875 if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) 4876 return -EINVAL; 4877 4878 if (!capable(CAP_SYS_ADMIN)) 4879 return -EPERM; 4880 4881 next_id++; 4882 spin_lock_bh(lock); 4883 if (!idr_get_next(idr, &next_id)) 4884 err = -ENOENT; 4885 spin_unlock_bh(lock); 4886 4887 if (!err) 4888 err = put_user(next_id, &uattr->next_id); 4889 4890 return err; 4891 } 4892 4893 struct bpf_map *bpf_map_get_curr_or_next(u32 *id) 4894 { 4895 struct bpf_map *map; 4896 4897 spin_lock_bh(&map_idr_lock); 4898 again: 4899 map = idr_get_next(&map_idr, id); 4900 if (map) { 4901 map = __bpf_map_inc_not_zero(map, false); 4902 if (IS_ERR(map)) { 4903 (*id)++; 4904 goto again; 4905 } 4906 } 4907 spin_unlock_bh(&map_idr_lock); 4908 4909 return map; 4910 } 4911 4912 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) 4913 { 4914 struct bpf_prog *prog; 4915 4916 spin_lock_bh(&prog_idr_lock); 4917 again: 4918 prog = idr_get_next(&prog_idr, id); 4919 if (prog) { 4920 prog = bpf_prog_inc_not_zero(prog); 4921 if (IS_ERR(prog)) { 4922 (*id)++; 4923 goto again; 4924 } 4925 } 4926 spin_unlock_bh(&prog_idr_lock); 4927 4928 return prog; 4929 } 4930 4931 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id 4932 4933 struct bpf_prog *bpf_prog_by_id(u32 id) 4934 { 4935 struct bpf_prog *prog; 4936 4937 if (!id) 4938 return ERR_PTR(-ENOENT); 4939 4940 spin_lock_bh(&prog_idr_lock); 4941 prog = idr_find(&prog_idr, id); 4942 if (prog) 4943 prog = bpf_prog_inc_not_zero(prog); 4944 else 4945 prog = ERR_PTR(-ENOENT); 4946 spin_unlock_bh(&prog_idr_lock); 4947 return prog; 4948 } 4949 4950 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) 4951 { 4952 struct bpf_prog *prog; 4953 u32 id = attr->prog_id; 4954 int fd; 4955 4956 if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) 4957 return -EINVAL; 4958 4959 if (!capable(CAP_SYS_ADMIN)) 4960 return -EPERM; 4961 4962 prog = bpf_prog_by_id(id); 4963 if (IS_ERR(prog)) 4964 return PTR_ERR(prog); 4965 4966 fd = bpf_prog_new_fd(prog); 4967 if (fd < 0) 4968 bpf_prog_put(prog); 4969 4970 return fd; 4971 } 4972 4973 #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags 4974 4975 static int bpf_map_get_fd_by_id(const union bpf_attr *attr) 4976 { 4977 struct bpf_map *map; 4978 u32 id = attr->map_id; 4979 int f_flags; 4980 int fd; 4981 4982 if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || 4983 attr->open_flags & ~BPF_OBJ_FLAG_MASK) 4984 return -EINVAL; 4985 4986 if (!capable(CAP_SYS_ADMIN)) 4987 return -EPERM; 4988 4989 f_flags = bpf_get_file_flag(attr->open_flags); 4990 if (f_flags < 0) 4991 return f_flags; 4992 4993 spin_lock_bh(&map_idr_lock); 4994 map = idr_find(&map_idr, id); 4995 if (map) 4996 map = __bpf_map_inc_not_zero(map, true); 4997 else 4998 map = ERR_PTR(-ENOENT); 4999 spin_unlock_bh(&map_idr_lock); 5000 5001 if (IS_ERR(map)) 5002 return PTR_ERR(map); 5003 5004 fd = bpf_map_new_fd(map, f_flags); 5005 if (fd < 0) 5006 bpf_map_put_with_uref(map); 5007 5008 return fd; 5009 } 5010 5011 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, 5012 unsigned long addr, u32 *off, 5013 u32 *type) 5014 { 5015 const struct bpf_map *map; 5016 int i; 5017 5018 mutex_lock(&prog->aux->used_maps_mutex); 5019 for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { 5020 map = prog->aux->used_maps[i]; 5021 if (map == (void *)addr) { 5022 *type = BPF_PSEUDO_MAP_FD; 5023 goto out; 5024 } 5025 if (!map->ops->map_direct_value_meta) 5026 continue; 5027 if (!map->ops->map_direct_value_meta(map, addr, off)) { 5028 *type = BPF_PSEUDO_MAP_VALUE; 5029 goto out; 5030 } 5031 } 5032 map = NULL; 5033 5034 out: 5035 mutex_unlock(&prog->aux->used_maps_mutex); 5036 return map; 5037 } 5038 5039 static void prepare_dump_pseudo_call(struct bpf_insn *insn) 5040 { 5041 s32 call_off = insn->imm; 5042 5043 /* 5044 * BPF_CALL_ARGS only exists for interpreter fallback. 5045 * 1. For interpreter (BPF_CALL_ARGS): insn->off is the index of 5046 * interpreters_args array, so here using bpf_call_args_imm() 5047 * to get the real address offset. 5048 * 2. For JIT (BPF_CALL): insn->off is the subprog id. 5049 */ 5050 if (insn->code == (BPF_JMP | BPF_CALL_ARGS)) 5051 insn->imm = bpf_call_args_imm(insn->off); 5052 else 5053 insn->imm = insn->off; 5054 5055 /* Avoid dumping a truncated and misleading pc-relative offset. */ 5056 if (call_off > S16_MAX || call_off < S16_MIN) 5057 insn->off = 0; 5058 else 5059 insn->off = call_off; 5060 } 5061 5062 static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, 5063 const struct cred *f_cred) 5064 { 5065 const struct bpf_map *map; 5066 struct bpf_insn *insns; 5067 u32 off, type; 5068 u64 imm; 5069 u8 code; 5070 int i; 5071 5072 insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), 5073 GFP_USER); 5074 if (!insns) 5075 return insns; 5076 5077 for (i = 0; i < prog->len; i++) { 5078 code = insns[i].code; 5079 5080 if (code == (BPF_JMP | BPF_TAIL_CALL)) { 5081 insns[i].code = BPF_JMP | BPF_CALL; 5082 insns[i].imm = BPF_FUNC_tail_call; 5083 /* fall-through */ 5084 } 5085 if (code == (BPF_JMP | BPF_CALL) || 5086 code == (BPF_JMP | BPF_CALL_ARGS)) { 5087 /* Restore the legacy xlated dump layout. */ 5088 if (insns[i].src_reg == BPF_PSEUDO_CALL) 5089 prepare_dump_pseudo_call(&insns[i]); 5090 if (code == (BPF_JMP | BPF_CALL_ARGS)) 5091 insns[i].code = BPF_JMP | BPF_CALL; 5092 if (!bpf_dump_raw_ok(f_cred)) 5093 insns[i].imm = 0; 5094 continue; 5095 } 5096 if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { 5097 insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; 5098 continue; 5099 } 5100 5101 if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX || 5102 BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) { 5103 insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM; 5104 continue; 5105 } 5106 5107 if (code != (BPF_LD | BPF_IMM | BPF_DW)) 5108 continue; 5109 5110 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; 5111 map = bpf_map_from_imm(prog, imm, &off, &type); 5112 if (map) { 5113 insns[i].src_reg = type; 5114 insns[i].imm = map->id; 5115 insns[i + 1].imm = off; 5116 continue; 5117 } 5118 } 5119 5120 return insns; 5121 } 5122 5123 static int set_info_rec_size(struct bpf_prog_info *info) 5124 { 5125 /* 5126 * Ensure info.*_rec_size is the same as kernel expected size 5127 * 5128 * or 5129 * 5130 * Only allow zero *_rec_size if both _rec_size and _cnt are 5131 * zero. In this case, the kernel will set the expected 5132 * _rec_size back to the info. 5133 */ 5134 5135 if ((info->nr_func_info || info->func_info_rec_size) && 5136 info->func_info_rec_size != sizeof(struct bpf_func_info)) 5137 return -EINVAL; 5138 5139 if ((info->nr_line_info || info->line_info_rec_size) && 5140 info->line_info_rec_size != sizeof(struct bpf_line_info)) 5141 return -EINVAL; 5142 5143 if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && 5144 info->jited_line_info_rec_size != sizeof(__u64)) 5145 return -EINVAL; 5146 5147 info->func_info_rec_size = sizeof(struct bpf_func_info); 5148 info->line_info_rec_size = sizeof(struct bpf_line_info); 5149 info->jited_line_info_rec_size = sizeof(__u64); 5150 5151 return 0; 5152 } 5153 5154 static int bpf_prog_get_info_by_fd(struct file *file, 5155 struct bpf_prog *prog, 5156 const union bpf_attr *attr, 5157 union bpf_attr __user *uattr) 5158 { 5159 struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5160 struct btf *attach_btf = bpf_prog_get_target_btf(prog); 5161 struct bpf_prog_info info; 5162 u32 info_len = attr->info.info_len; 5163 struct bpf_prog_kstats stats; 5164 char __user *uinsns; 5165 u32 ulen, len; 5166 int err; 5167 5168 len = offsetofend(struct bpf_prog_info, attach_btf_id); 5169 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); 5170 if (err) 5171 return err; 5172 info_len = min_t(u32, sizeof(info), info_len); 5173 5174 memset(&info, 0, sizeof(info)); 5175 if (copy_from_user(&info, uinfo, info_len)) 5176 return -EFAULT; 5177 5178 info.type = prog->type; 5179 info.id = prog->aux->id; 5180 info.load_time = prog->aux->load_time; 5181 info.created_by_uid = from_kuid_munged(current_user_ns(), 5182 prog->aux->user->uid); 5183 info.gpl_compatible = prog->gpl_compatible; 5184 5185 memcpy(info.tag, prog->tag, sizeof(prog->tag)); 5186 memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); 5187 5188 mutex_lock(&prog->aux->used_maps_mutex); 5189 ulen = info.nr_map_ids; 5190 info.nr_map_ids = prog->aux->used_map_cnt; 5191 ulen = min_t(u32, info.nr_map_ids, ulen); 5192 if (ulen) { 5193 u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); 5194 u32 i; 5195 5196 for (i = 0; i < ulen; i++) 5197 if (put_user(prog->aux->used_maps[i]->id, 5198 &user_map_ids[i])) { 5199 mutex_unlock(&prog->aux->used_maps_mutex); 5200 return -EFAULT; 5201 } 5202 } 5203 mutex_unlock(&prog->aux->used_maps_mutex); 5204 5205 err = set_info_rec_size(&info); 5206 if (err) 5207 return err; 5208 5209 bpf_prog_get_stats(prog, &stats); 5210 info.run_time_ns = stats.nsecs; 5211 info.run_cnt = stats.cnt; 5212 info.recursion_misses = stats.misses; 5213 5214 info.verified_insns = prog->aux->verified_insns; 5215 if (prog->aux->btf) 5216 info.btf_id = btf_obj_id(prog->aux->btf); 5217 5218 if (!bpf_capable()) { 5219 info.jited_prog_len = 0; 5220 info.xlated_prog_len = 0; 5221 info.nr_jited_ksyms = 0; 5222 info.nr_jited_func_lens = 0; 5223 info.nr_func_info = 0; 5224 info.nr_line_info = 0; 5225 info.nr_jited_line_info = 0; 5226 goto done; 5227 } 5228 5229 ulen = info.xlated_prog_len; 5230 info.xlated_prog_len = bpf_prog_insn_size(prog); 5231 if (info.xlated_prog_len && ulen) { 5232 struct bpf_insn *insns_sanitized; 5233 bool fault; 5234 5235 if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) { 5236 insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); 5237 if (!insns_sanitized) 5238 return -ENOMEM; 5239 uinsns = u64_to_user_ptr(info.xlated_prog_insns); 5240 ulen = min_t(u32, info.xlated_prog_len, ulen); 5241 fault = copy_to_user(uinsns, insns_sanitized, ulen); 5242 kfree(insns_sanitized); 5243 if (fault) 5244 return -EFAULT; 5245 } else { 5246 info.xlated_prog_insns = 0; 5247 } 5248 } 5249 5250 if (bpf_prog_is_offloaded(prog->aux)) { 5251 err = bpf_prog_offload_info_fill(&info, prog); 5252 if (err) 5253 return err; 5254 goto done; 5255 } 5256 5257 /* NOTE: the following code is supposed to be skipped for offload. 5258 * bpf_prog_offload_info_fill() is the place to fill similar fields 5259 * for offload. 5260 */ 5261 ulen = info.jited_prog_len; 5262 if (prog->aux->func_cnt) { 5263 u32 i; 5264 5265 info.jited_prog_len = 0; 5266 for (i = 0; i < prog->aux->func_cnt; i++) 5267 info.jited_prog_len += prog->aux->func[i]->jited_len; 5268 } else { 5269 info.jited_prog_len = prog->jited_len; 5270 } 5271 5272 if (info.jited_prog_len && ulen) { 5273 if (bpf_dump_raw_ok(file->f_cred)) { 5274 uinsns = u64_to_user_ptr(info.jited_prog_insns); 5275 ulen = min_t(u32, info.jited_prog_len, ulen); 5276 5277 /* for multi-function programs, copy the JITed 5278 * instructions for all the functions 5279 */ 5280 if (prog->aux->func_cnt) { 5281 u32 len, free, i; 5282 u8 *img; 5283 5284 free = ulen; 5285 for (i = 0; i < prog->aux->func_cnt; i++) { 5286 len = prog->aux->func[i]->jited_len; 5287 len = min_t(u32, len, free); 5288 img = (u8 *) prog->aux->func[i]->bpf_func; 5289 if (copy_to_user(uinsns, img, len)) 5290 return -EFAULT; 5291 uinsns += len; 5292 free -= len; 5293 if (!free) 5294 break; 5295 } 5296 } else { 5297 if (copy_to_user(uinsns, prog->bpf_func, ulen)) 5298 return -EFAULT; 5299 } 5300 } else { 5301 info.jited_prog_insns = 0; 5302 } 5303 } 5304 5305 ulen = info.nr_jited_ksyms; 5306 info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; 5307 if (ulen) { 5308 if (bpf_dump_raw_ok(file->f_cred)) { 5309 unsigned long ksym_addr; 5310 u64 __user *user_ksyms; 5311 u32 i; 5312 5313 /* copy the address of the kernel symbol 5314 * corresponding to each function 5315 */ 5316 ulen = min_t(u32, info.nr_jited_ksyms, ulen); 5317 user_ksyms = u64_to_user_ptr(info.jited_ksyms); 5318 if (prog->aux->func_cnt) { 5319 for (i = 0; i < ulen; i++) { 5320 ksym_addr = (unsigned long) 5321 prog->aux->func[i]->bpf_func; 5322 if (put_user((u64) ksym_addr, 5323 &user_ksyms[i])) 5324 return -EFAULT; 5325 } 5326 } else { 5327 ksym_addr = (unsigned long) prog->bpf_func; 5328 if (put_user((u64) ksym_addr, &user_ksyms[0])) 5329 return -EFAULT; 5330 } 5331 } else { 5332 info.jited_ksyms = 0; 5333 } 5334 } 5335 5336 ulen = info.nr_jited_func_lens; 5337 info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; 5338 if (ulen) { 5339 if (bpf_dump_raw_ok(file->f_cred)) { 5340 u32 __user *user_lens; 5341 u32 func_len, i; 5342 5343 /* copy the JITed image lengths for each function */ 5344 ulen = min_t(u32, info.nr_jited_func_lens, ulen); 5345 user_lens = u64_to_user_ptr(info.jited_func_lens); 5346 if (prog->aux->func_cnt) { 5347 for (i = 0; i < ulen; i++) { 5348 func_len = 5349 prog->aux->func[i]->jited_len; 5350 if (put_user(func_len, &user_lens[i])) 5351 return -EFAULT; 5352 } 5353 } else { 5354 func_len = prog->jited_len; 5355 if (put_user(func_len, &user_lens[0])) 5356 return -EFAULT; 5357 } 5358 } else { 5359 info.jited_func_lens = 0; 5360 } 5361 } 5362 5363 info.attach_btf_id = prog->aux->attach_btf_id; 5364 if (attach_btf) 5365 info.attach_btf_obj_id = btf_obj_id(attach_btf); 5366 5367 ulen = info.nr_func_info; 5368 info.nr_func_info = prog->aux->func_info_cnt; 5369 if (info.nr_func_info && ulen) { 5370 char __user *user_finfo; 5371 5372 user_finfo = u64_to_user_ptr(info.func_info); 5373 ulen = min_t(u32, info.nr_func_info, ulen); 5374 if (copy_to_user(user_finfo, prog->aux->func_info, 5375 info.func_info_rec_size * ulen)) 5376 return -EFAULT; 5377 } 5378 5379 ulen = info.nr_line_info; 5380 info.nr_line_info = prog->aux->nr_linfo; 5381 if (info.nr_line_info && ulen) { 5382 __u8 __user *user_linfo; 5383 5384 user_linfo = u64_to_user_ptr(info.line_info); 5385 ulen = min_t(u32, info.nr_line_info, ulen); 5386 if (copy_to_user(user_linfo, prog->aux->linfo, 5387 info.line_info_rec_size * ulen)) 5388 return -EFAULT; 5389 } 5390 5391 ulen = info.nr_jited_line_info; 5392 if (prog->aux->jited_linfo) 5393 info.nr_jited_line_info = prog->aux->nr_linfo; 5394 else 5395 info.nr_jited_line_info = 0; 5396 if (info.nr_jited_line_info && ulen) { 5397 if (bpf_dump_raw_ok(file->f_cred)) { 5398 unsigned long line_addr; 5399 __u64 __user *user_linfo; 5400 u32 i; 5401 5402 user_linfo = u64_to_user_ptr(info.jited_line_info); 5403 ulen = min_t(u32, info.nr_jited_line_info, ulen); 5404 for (i = 0; i < ulen; i++) { 5405 line_addr = (unsigned long)prog->aux->jited_linfo[i]; 5406 if (put_user((__u64)line_addr, &user_linfo[i])) 5407 return -EFAULT; 5408 } 5409 } else { 5410 info.jited_line_info = 0; 5411 } 5412 } 5413 5414 ulen = info.nr_prog_tags; 5415 info.nr_prog_tags = prog->aux->func_cnt ? : 1; 5416 if (ulen) { 5417 __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; 5418 u32 i; 5419 5420 user_prog_tags = u64_to_user_ptr(info.prog_tags); 5421 ulen = min_t(u32, info.nr_prog_tags, ulen); 5422 if (prog->aux->func_cnt) { 5423 for (i = 0; i < ulen; i++) { 5424 if (copy_to_user(user_prog_tags[i], 5425 prog->aux->func[i]->tag, 5426 BPF_TAG_SIZE)) 5427 return -EFAULT; 5428 } 5429 } else { 5430 if (copy_to_user(user_prog_tags[0], 5431 prog->tag, BPF_TAG_SIZE)) 5432 return -EFAULT; 5433 } 5434 } 5435 5436 done: 5437 if (copy_to_user(uinfo, &info, info_len) || 5438 put_user(info_len, &uattr->info.info_len)) 5439 return -EFAULT; 5440 5441 return 0; 5442 } 5443 5444 static int bpf_map_get_info_by_fd(struct file *file, 5445 struct bpf_map *map, 5446 const union bpf_attr *attr, 5447 union bpf_attr __user *uattr) 5448 { 5449 struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5450 struct bpf_map_info info; 5451 u32 info_len = attr->info.info_len, len; 5452 int err; 5453 5454 len = offsetofend(struct bpf_map_info, hash_size); 5455 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); 5456 if (err) 5457 return err; 5458 info_len = min_t(u32, sizeof(info), info_len); 5459 5460 memset(&info, 0, sizeof(info)); 5461 if (copy_from_user(&info, uinfo, info_len)) 5462 return -EFAULT; 5463 5464 info.type = map->map_type; 5465 info.id = map->id; 5466 info.key_size = map->key_size; 5467 info.value_size = map->value_size; 5468 info.max_entries = map->max_entries; 5469 info.map_flags = map->map_flags; 5470 info.map_extra = map->map_extra; 5471 memcpy(info.name, map->name, sizeof(map->name)); 5472 5473 if (map->btf) { 5474 info.btf_id = btf_obj_id(map->btf); 5475 info.btf_key_type_id = map->btf_key_type_id; 5476 info.btf_value_type_id = map->btf_value_type_id; 5477 } 5478 info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; 5479 if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) 5480 bpf_map_struct_ops_info_fill(&info, map); 5481 5482 if (bpf_map_is_offloaded(map)) { 5483 err = bpf_map_offload_info_fill(&info, map); 5484 if (err) 5485 return err; 5486 } 5487 5488 if (info.hash) { 5489 char __user *uhash = u64_to_user_ptr(info.hash); 5490 5491 if (!map->ops->map_get_hash) 5492 return -EINVAL; 5493 if (info.hash_size != sizeof(map->sha)) 5494 return -EINVAL; 5495 if (!READ_ONCE(map->frozen)) 5496 return -EPERM; 5497 5498 err = map->ops->map_get_hash(map); 5499 if (err != 0) 5500 return err; 5501 5502 if (copy_to_user(uhash, map->sha, sizeof(map->sha)) != 0) 5503 return -EFAULT; 5504 } else if (info.hash_size) { 5505 return -EINVAL; 5506 } 5507 5508 if (copy_to_user(uinfo, &info, info_len) || 5509 put_user(info_len, &uattr->info.info_len)) 5510 return -EFAULT; 5511 5512 return 0; 5513 } 5514 5515 static int bpf_btf_get_info_by_fd(struct file *file, 5516 struct btf *btf, 5517 const union bpf_attr *attr, 5518 union bpf_attr __user *uattr) 5519 { 5520 struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5521 u32 info_len = attr->info.info_len; 5522 int err; 5523 5524 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); 5525 if (err) 5526 return err; 5527 5528 return btf_get_info_by_fd(btf, attr, uattr); 5529 } 5530 5531 static int bpf_link_get_info_by_fd(struct file *file, 5532 struct bpf_link *link, 5533 const union bpf_attr *attr, 5534 union bpf_attr __user *uattr) 5535 { 5536 struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5537 struct bpf_link_info info; 5538 u32 info_len = attr->info.info_len; 5539 int err; 5540 5541 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); 5542 if (err) 5543 return err; 5544 info_len = min_t(u32, sizeof(info), info_len); 5545 5546 memset(&info, 0, sizeof(info)); 5547 if (copy_from_user(&info, uinfo, info_len)) 5548 return -EFAULT; 5549 5550 info.type = link->type; 5551 info.id = link->id; 5552 if (link->prog) 5553 info.prog_id = link->prog->aux->id; 5554 5555 if (link->ops->fill_link_info) { 5556 err = link->ops->fill_link_info(link, &info); 5557 if (err) 5558 return err; 5559 } 5560 5561 if (copy_to_user(uinfo, &info, info_len) || 5562 put_user(info_len, &uattr->info.info_len)) 5563 return -EFAULT; 5564 5565 return 0; 5566 } 5567 5568 5569 static int token_get_info_by_fd(struct file *file, 5570 struct bpf_token *token, 5571 const union bpf_attr *attr, 5572 union bpf_attr __user *uattr) 5573 { 5574 struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5575 u32 info_len = attr->info.info_len; 5576 int err; 5577 5578 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); 5579 if (err) 5580 return err; 5581 return bpf_token_get_info_by_fd(token, attr, uattr); 5582 } 5583 5584 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info 5585 5586 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, 5587 union bpf_attr __user *uattr) 5588 { 5589 if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) 5590 return -EINVAL; 5591 5592 CLASS(fd, f)(attr->info.bpf_fd); 5593 if (fd_empty(f)) 5594 return -EBADFD; 5595 5596 if (fd_file(f)->f_op == &bpf_prog_fops) 5597 return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, 5598 uattr); 5599 else if (fd_file(f)->f_op == &bpf_map_fops) 5600 return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, 5601 uattr); 5602 else if (fd_file(f)->f_op == &btf_fops) 5603 return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); 5604 else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) 5605 return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, 5606 attr, uattr); 5607 else if (fd_file(f)->f_op == &bpf_token_fops) 5608 return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data, 5609 attr, uattr); 5610 return -EINVAL; 5611 } 5612 5613 #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd 5614 5615 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) 5616 { 5617 struct bpf_token *token = NULL; 5618 5619 if (CHECK_ATTR(BPF_BTF_LOAD)) 5620 return -EINVAL; 5621 5622 if (attr->btf_flags & ~BPF_F_TOKEN_FD) 5623 return -EINVAL; 5624 5625 if (attr->btf_flags & BPF_F_TOKEN_FD) { 5626 token = bpf_token_get_from_fd(attr->btf_token_fd); 5627 if (IS_ERR(token)) 5628 return PTR_ERR(token); 5629 if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) { 5630 bpf_token_put(token); 5631 token = NULL; 5632 } 5633 } 5634 5635 if (!bpf_token_capable(token, CAP_BPF)) { 5636 bpf_token_put(token); 5637 return -EPERM; 5638 } 5639 5640 bpf_token_put(token); 5641 5642 return btf_new_fd(attr, uattr, attr_log); 5643 } 5644 5645 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd 5646 5647 static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) 5648 { 5649 struct bpf_token *token = NULL; 5650 5651 if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) 5652 return -EINVAL; 5653 5654 if (attr->open_flags & ~BPF_F_TOKEN_FD) 5655 return -EINVAL; 5656 5657 if (attr->open_flags & BPF_F_TOKEN_FD) { 5658 token = bpf_token_get_from_fd(attr->fd_by_id_token_fd); 5659 if (IS_ERR(token)) 5660 return PTR_ERR(token); 5661 if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) { 5662 bpf_token_put(token); 5663 token = NULL; 5664 } 5665 } 5666 5667 if (!bpf_token_capable(token, CAP_SYS_ADMIN)) { 5668 bpf_token_put(token); 5669 return -EPERM; 5670 } 5671 5672 bpf_token_put(token); 5673 5674 return btf_get_fd_by_id(attr->btf_id); 5675 } 5676 5677 static int bpf_task_fd_query_copy(const union bpf_attr *attr, 5678 union bpf_attr __user *uattr, 5679 u32 prog_id, u32 fd_type, 5680 const char *buf, u64 probe_offset, 5681 u64 probe_addr) 5682 { 5683 char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); 5684 u32 len = buf ? strlen(buf) : 0, input_len; 5685 int err = 0; 5686 5687 if (put_user(len, &uattr->task_fd_query.buf_len)) 5688 return -EFAULT; 5689 input_len = attr->task_fd_query.buf_len; 5690 if (input_len && ubuf) { 5691 if (!len) { 5692 /* nothing to copy, just make ubuf NULL terminated */ 5693 char zero = '\0'; 5694 5695 if (put_user(zero, ubuf)) 5696 return -EFAULT; 5697 } else { 5698 err = bpf_copy_to_user(ubuf, buf, input_len, len); 5699 if (err == -EFAULT) 5700 return err; 5701 } 5702 } 5703 5704 if (put_user(prog_id, &uattr->task_fd_query.prog_id) || 5705 put_user(fd_type, &uattr->task_fd_query.fd_type) || 5706 put_user(probe_offset, &uattr->task_fd_query.probe_offset) || 5707 put_user(probe_addr, &uattr->task_fd_query.probe_addr)) 5708 return -EFAULT; 5709 5710 return err; 5711 } 5712 5713 #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr 5714 5715 static int bpf_task_fd_query(const union bpf_attr *attr, 5716 union bpf_attr __user *uattr) 5717 { 5718 pid_t pid = attr->task_fd_query.pid; 5719 u32 fd = attr->task_fd_query.fd; 5720 const struct perf_event *event; 5721 struct task_struct *task; 5722 struct file *file; 5723 int err; 5724 5725 if (CHECK_ATTR(BPF_TASK_FD_QUERY)) 5726 return -EINVAL; 5727 5728 if (!capable(CAP_SYS_ADMIN)) 5729 return -EPERM; 5730 5731 if (attr->task_fd_query.flags != 0) 5732 return -EINVAL; 5733 5734 rcu_read_lock(); 5735 task = get_pid_task(find_vpid(pid), PIDTYPE_PID); 5736 rcu_read_unlock(); 5737 if (!task) 5738 return -ENOENT; 5739 5740 err = 0; 5741 file = fget_task(task, fd); 5742 put_task_struct(task); 5743 if (!file) 5744 return -EBADF; 5745 5746 if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) { 5747 struct bpf_link *link = file->private_data; 5748 5749 if (link->ops == &bpf_raw_tp_link_lops) { 5750 struct bpf_raw_tp_link *raw_tp = 5751 container_of(link, struct bpf_raw_tp_link, link); 5752 struct bpf_raw_event_map *btp = raw_tp->btp; 5753 5754 err = bpf_task_fd_query_copy(attr, uattr, 5755 raw_tp->link.prog->aux->id, 5756 BPF_FD_TYPE_RAW_TRACEPOINT, 5757 btp->tp->name, 0, 0); 5758 goto put_file; 5759 } 5760 goto out_not_supp; 5761 } 5762 5763 event = perf_get_event(file); 5764 if (!IS_ERR(event)) { 5765 u64 probe_offset, probe_addr; 5766 u32 prog_id, fd_type; 5767 const char *buf; 5768 5769 err = bpf_get_perf_event_info(event, &prog_id, &fd_type, 5770 &buf, &probe_offset, 5771 &probe_addr, NULL); 5772 if (!err) 5773 err = bpf_task_fd_query_copy(attr, uattr, prog_id, 5774 fd_type, buf, 5775 probe_offset, 5776 probe_addr); 5777 goto put_file; 5778 } 5779 5780 out_not_supp: 5781 err = -ENOTSUPP; 5782 put_file: 5783 fput(file); 5784 return err; 5785 } 5786 5787 #define BPF_MAP_BATCH_LAST_FIELD batch.flags 5788 5789 #define BPF_DO_BATCH(fn, ...) \ 5790 do { \ 5791 if (!fn) { \ 5792 err = -ENOTSUPP; \ 5793 goto err_put; \ 5794 } \ 5795 err = fn(__VA_ARGS__); \ 5796 } while (0) 5797 5798 static int bpf_map_do_batch(const union bpf_attr *attr, 5799 union bpf_attr __user *uattr, 5800 int cmd) 5801 { 5802 bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || 5803 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; 5804 bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; 5805 struct bpf_map *map; 5806 int err; 5807 5808 if (CHECK_ATTR(BPF_MAP_BATCH)) 5809 return -EINVAL; 5810 5811 CLASS(fd, f)(attr->batch.map_fd); 5812 5813 map = __bpf_map_get(f); 5814 if (IS_ERR(map)) 5815 return PTR_ERR(map); 5816 if (has_write) 5817 bpf_map_write_active_inc(map); 5818 if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { 5819 err = -EPERM; 5820 goto err_put; 5821 } 5822 if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 5823 err = -EPERM; 5824 goto err_put; 5825 } 5826 5827 if (cmd == BPF_MAP_LOOKUP_BATCH) 5828 BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); 5829 else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) 5830 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); 5831 else if (cmd == BPF_MAP_UPDATE_BATCH) 5832 BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr); 5833 else 5834 BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); 5835 err_put: 5836 if (has_write) { 5837 maybe_wait_bpf_programs(map); 5838 bpf_map_write_active_dec(map); 5839 } 5840 return err; 5841 } 5842 5843 #define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.path_fd 5844 static int link_create(union bpf_attr *attr, bpfptr_t uattr) 5845 { 5846 struct bpf_prog *prog; 5847 int ret; 5848 5849 if (CHECK_ATTR(BPF_LINK_CREATE)) 5850 return -EINVAL; 5851 5852 if (attr->link_create.attach_type == BPF_STRUCT_OPS) 5853 return bpf_struct_ops_link_create(attr); 5854 5855 prog = bpf_prog_get(attr->link_create.prog_fd); 5856 if (IS_ERR(prog)) 5857 return PTR_ERR(prog); 5858 5859 ret = bpf_prog_attach_check_attach_type(prog, 5860 attr->link_create.attach_type); 5861 if (ret) 5862 goto out; 5863 5864 switch (prog->type) { 5865 case BPF_PROG_TYPE_CGROUP_SKB: 5866 case BPF_PROG_TYPE_CGROUP_SOCK: 5867 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 5868 case BPF_PROG_TYPE_SOCK_OPS: 5869 case BPF_PROG_TYPE_CGROUP_DEVICE: 5870 case BPF_PROG_TYPE_CGROUP_SYSCTL: 5871 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 5872 ret = cgroup_bpf_link_attach(attr, prog); 5873 break; 5874 case BPF_PROG_TYPE_EXT: 5875 ret = bpf_tracing_prog_attach(prog, 5876 attr->link_create.target_fd, 5877 attr->link_create.target_btf_id, 5878 attr->link_create.tracing.cookie, 5879 attr->link_create.attach_type); 5880 break; 5881 case BPF_PROG_TYPE_LSM: 5882 case BPF_PROG_TYPE_TRACING: 5883 if (attr->link_create.attach_type != prog->expected_attach_type) { 5884 ret = -EINVAL; 5885 goto out; 5886 } 5887 if (prog->expected_attach_type == BPF_TRACE_RAW_TP) 5888 ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie, 5889 attr->link_create.attach_type); 5890 else if (prog->expected_attach_type == BPF_TRACE_ITER) 5891 ret = bpf_iter_link_attach(attr, uattr, prog); 5892 else if (prog->expected_attach_type == BPF_LSM_CGROUP) 5893 ret = cgroup_bpf_link_attach(attr, prog); 5894 else if (is_tracing_multi(prog->expected_attach_type)) 5895 ret = bpf_tracing_multi_attach(prog, attr); 5896 else 5897 ret = bpf_tracing_prog_attach(prog, 5898 attr->link_create.target_fd, 5899 attr->link_create.target_btf_id, 5900 attr->link_create.tracing.cookie, 5901 attr->link_create.attach_type); 5902 break; 5903 case BPF_PROG_TYPE_FLOW_DISSECTOR: 5904 case BPF_PROG_TYPE_SK_LOOKUP: 5905 ret = netns_bpf_link_create(attr, prog); 5906 break; 5907 case BPF_PROG_TYPE_SK_MSG: 5908 case BPF_PROG_TYPE_SK_SKB: 5909 ret = sock_map_link_create(attr, prog); 5910 break; 5911 #ifdef CONFIG_NET 5912 case BPF_PROG_TYPE_XDP: 5913 ret = bpf_xdp_link_attach(attr, prog); 5914 break; 5915 case BPF_PROG_TYPE_SCHED_CLS: 5916 if (attr->link_create.attach_type == BPF_TCX_INGRESS || 5917 attr->link_create.attach_type == BPF_TCX_EGRESS) 5918 ret = tcx_link_attach(attr, prog); 5919 else 5920 ret = netkit_link_attach(attr, prog); 5921 break; 5922 case BPF_PROG_TYPE_NETFILTER: 5923 ret = bpf_nf_link_attach(attr, prog); 5924 break; 5925 #endif 5926 case BPF_PROG_TYPE_PERF_EVENT: 5927 case BPF_PROG_TYPE_TRACEPOINT: 5928 ret = bpf_perf_link_attach(attr, prog); 5929 break; 5930 case BPF_PROG_TYPE_KPROBE: 5931 if (attr->link_create.attach_type == BPF_PERF_EVENT) 5932 ret = bpf_perf_link_attach(attr, prog); 5933 else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || 5934 attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) 5935 ret = bpf_kprobe_multi_link_attach(attr, prog); 5936 else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || 5937 attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) 5938 ret = bpf_uprobe_multi_link_attach(attr, prog); 5939 break; 5940 default: 5941 ret = -EINVAL; 5942 } 5943 5944 out: 5945 if (ret < 0) 5946 bpf_prog_put(prog); 5947 return ret; 5948 } 5949 5950 static int link_update_map(struct bpf_link *link, union bpf_attr *attr) 5951 { 5952 struct bpf_map *new_map, *old_map = NULL; 5953 int ret; 5954 5955 new_map = bpf_map_get(attr->link_update.new_map_fd); 5956 if (IS_ERR(new_map)) 5957 return PTR_ERR(new_map); 5958 5959 if (attr->link_update.flags & BPF_F_REPLACE) { 5960 old_map = bpf_map_get(attr->link_update.old_map_fd); 5961 if (IS_ERR(old_map)) { 5962 ret = PTR_ERR(old_map); 5963 goto out_put; 5964 } 5965 } else if (attr->link_update.old_map_fd) { 5966 ret = -EINVAL; 5967 goto out_put; 5968 } 5969 5970 ret = link->ops->update_map(link, new_map, old_map); 5971 5972 if (old_map) 5973 bpf_map_put(old_map); 5974 out_put: 5975 bpf_map_put(new_map); 5976 return ret; 5977 } 5978 5979 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd 5980 5981 static int link_update(union bpf_attr *attr) 5982 { 5983 struct bpf_prog *old_prog = NULL, *new_prog; 5984 struct bpf_link *link; 5985 u32 flags; 5986 int ret; 5987 5988 if (CHECK_ATTR(BPF_LINK_UPDATE)) 5989 return -EINVAL; 5990 5991 flags = attr->link_update.flags; 5992 if (flags & ~BPF_F_REPLACE) 5993 return -EINVAL; 5994 5995 link = bpf_link_get_from_fd(attr->link_update.link_fd); 5996 if (IS_ERR(link)) 5997 return PTR_ERR(link); 5998 5999 if (link->ops->update_map) { 6000 ret = link_update_map(link, attr); 6001 goto out_put_link; 6002 } 6003 6004 new_prog = bpf_prog_get(attr->link_update.new_prog_fd); 6005 if (IS_ERR(new_prog)) { 6006 ret = PTR_ERR(new_prog); 6007 goto out_put_link; 6008 } 6009 6010 if (flags & BPF_F_REPLACE) { 6011 old_prog = bpf_prog_get(attr->link_update.old_prog_fd); 6012 if (IS_ERR(old_prog)) { 6013 ret = PTR_ERR(old_prog); 6014 old_prog = NULL; 6015 goto out_put_progs; 6016 } 6017 } else if (attr->link_update.old_prog_fd) { 6018 ret = -EINVAL; 6019 goto out_put_progs; 6020 } 6021 6022 if (link->ops->update_prog) 6023 ret = link->ops->update_prog(link, new_prog, old_prog); 6024 else 6025 ret = -EINVAL; 6026 6027 out_put_progs: 6028 if (old_prog) 6029 bpf_prog_put(old_prog); 6030 if (ret) 6031 bpf_prog_put(new_prog); 6032 out_put_link: 6033 bpf_link_put_direct(link); 6034 return ret; 6035 } 6036 6037 #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd 6038 6039 static int link_detach(union bpf_attr *attr) 6040 { 6041 struct bpf_link *link; 6042 int ret; 6043 6044 if (CHECK_ATTR(BPF_LINK_DETACH)) 6045 return -EINVAL; 6046 6047 link = bpf_link_get_from_fd(attr->link_detach.link_fd); 6048 if (IS_ERR(link)) 6049 return PTR_ERR(link); 6050 6051 if (link->ops->detach) 6052 ret = link->ops->detach(link); 6053 else 6054 ret = -EOPNOTSUPP; 6055 6056 bpf_link_put_direct(link); 6057 return ret; 6058 } 6059 6060 struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) 6061 { 6062 return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); 6063 } 6064 EXPORT_SYMBOL(bpf_link_inc_not_zero); 6065 6066 struct bpf_link *bpf_link_by_id(u32 id) 6067 { 6068 struct bpf_link *link; 6069 6070 if (!id) 6071 return ERR_PTR(-ENOENT); 6072 6073 spin_lock_bh(&link_idr_lock); 6074 /* before link is "settled", ID is 0, pretend it doesn't exist yet */ 6075 link = idr_find(&link_idr, id); 6076 if (link) { 6077 if (link->id) 6078 link = bpf_link_inc_not_zero(link); 6079 else 6080 link = ERR_PTR(-EAGAIN); 6081 } else { 6082 link = ERR_PTR(-ENOENT); 6083 } 6084 spin_unlock_bh(&link_idr_lock); 6085 return link; 6086 } 6087 6088 struct bpf_link *bpf_link_get_curr_or_next(u32 *id) 6089 { 6090 struct bpf_link *link; 6091 6092 spin_lock_bh(&link_idr_lock); 6093 again: 6094 link = idr_get_next(&link_idr, id); 6095 if (link) { 6096 link = bpf_link_inc_not_zero(link); 6097 if (IS_ERR(link)) { 6098 (*id)++; 6099 goto again; 6100 } 6101 } 6102 spin_unlock_bh(&link_idr_lock); 6103 6104 return link; 6105 } 6106 6107 #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id 6108 6109 static int bpf_link_get_fd_by_id(const union bpf_attr *attr) 6110 { 6111 struct bpf_link *link; 6112 u32 id = attr->link_id; 6113 int fd; 6114 6115 if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) 6116 return -EINVAL; 6117 6118 if (!capable(CAP_SYS_ADMIN)) 6119 return -EPERM; 6120 6121 link = bpf_link_by_id(id); 6122 if (IS_ERR(link)) 6123 return PTR_ERR(link); 6124 6125 fd = bpf_link_new_fd(link); 6126 if (fd < 0) 6127 bpf_link_put_direct(link); 6128 6129 return fd; 6130 } 6131 6132 DEFINE_MUTEX(bpf_stats_enabled_mutex); 6133 6134 static int bpf_stats_release(struct inode *inode, struct file *file) 6135 { 6136 mutex_lock(&bpf_stats_enabled_mutex); 6137 static_key_slow_dec(&bpf_stats_enabled_key.key); 6138 mutex_unlock(&bpf_stats_enabled_mutex); 6139 return 0; 6140 } 6141 6142 static const struct file_operations bpf_stats_fops = { 6143 .release = bpf_stats_release, 6144 }; 6145 6146 static int bpf_enable_runtime_stats(void) 6147 { 6148 int fd; 6149 6150 mutex_lock(&bpf_stats_enabled_mutex); 6151 6152 /* Set a very high limit to avoid overflow */ 6153 if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { 6154 mutex_unlock(&bpf_stats_enabled_mutex); 6155 return -EBUSY; 6156 } 6157 6158 fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); 6159 if (fd >= 0) 6160 static_key_slow_inc(&bpf_stats_enabled_key.key); 6161 6162 mutex_unlock(&bpf_stats_enabled_mutex); 6163 return fd; 6164 } 6165 6166 #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type 6167 6168 static int bpf_enable_stats(union bpf_attr *attr) 6169 { 6170 6171 if (CHECK_ATTR(BPF_ENABLE_STATS)) 6172 return -EINVAL; 6173 6174 if (!capable(CAP_SYS_ADMIN)) 6175 return -EPERM; 6176 6177 switch (attr->enable_stats.type) { 6178 case BPF_STATS_RUN_TIME: 6179 return bpf_enable_runtime_stats(); 6180 default: 6181 break; 6182 } 6183 return -EINVAL; 6184 } 6185 6186 #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags 6187 6188 static int bpf_iter_create(union bpf_attr *attr) 6189 { 6190 struct bpf_link *link; 6191 int err; 6192 6193 if (CHECK_ATTR(BPF_ITER_CREATE)) 6194 return -EINVAL; 6195 6196 if (attr->iter_create.flags) 6197 return -EINVAL; 6198 6199 link = bpf_link_get_from_fd(attr->iter_create.link_fd); 6200 if (IS_ERR(link)) 6201 return PTR_ERR(link); 6202 6203 err = bpf_iter_new_fd(link); 6204 bpf_link_put_direct(link); 6205 6206 return err; 6207 } 6208 6209 #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags 6210 6211 static int bpf_prog_bind_map(union bpf_attr *attr) 6212 { 6213 struct bpf_prog *prog; 6214 struct bpf_map *map; 6215 struct bpf_map **used_maps_old, **used_maps_new; 6216 int i, ret = 0; 6217 6218 if (CHECK_ATTR(BPF_PROG_BIND_MAP)) 6219 return -EINVAL; 6220 6221 if (attr->prog_bind_map.flags) 6222 return -EINVAL; 6223 6224 prog = bpf_prog_get(attr->prog_bind_map.prog_fd); 6225 if (IS_ERR(prog)) 6226 return PTR_ERR(prog); 6227 6228 map = bpf_map_get(attr->prog_bind_map.map_fd); 6229 if (IS_ERR(map)) { 6230 ret = PTR_ERR(map); 6231 goto out_prog_put; 6232 } 6233 6234 mutex_lock(&prog->aux->used_maps_mutex); 6235 6236 used_maps_old = prog->aux->used_maps; 6237 6238 for (i = 0; i < prog->aux->used_map_cnt; i++) 6239 if (used_maps_old[i] == map) { 6240 bpf_map_put(map); 6241 goto out_unlock; 6242 } 6243 6244 used_maps_new = kmalloc_objs(used_maps_new[0], 6245 prog->aux->used_map_cnt + 1); 6246 if (!used_maps_new) { 6247 ret = -ENOMEM; 6248 goto out_unlock; 6249 } 6250 6251 /* The bpf program will not access the bpf map, but for the sake of 6252 * simplicity, increase sleepable_refcnt for sleepable program as well. 6253 */ 6254 if (prog->sleepable) 6255 atomic64_inc(&map->sleepable_refcnt); 6256 memcpy(used_maps_new, used_maps_old, 6257 sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); 6258 used_maps_new[prog->aux->used_map_cnt] = map; 6259 6260 prog->aux->used_map_cnt++; 6261 prog->aux->used_maps = used_maps_new; 6262 6263 kfree(used_maps_old); 6264 6265 out_unlock: 6266 mutex_unlock(&prog->aux->used_maps_mutex); 6267 6268 if (ret) 6269 bpf_map_put(map); 6270 out_prog_put: 6271 bpf_prog_put(prog); 6272 return ret; 6273 } 6274 6275 #define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd 6276 6277 static int token_create(union bpf_attr *attr) 6278 { 6279 if (CHECK_ATTR(BPF_TOKEN_CREATE)) 6280 return -EINVAL; 6281 6282 /* no flags are supported yet */ 6283 if (attr->token_create.flags) 6284 return -EINVAL; 6285 6286 return bpf_token_create(attr); 6287 } 6288 6289 #define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd 6290 6291 static int prog_stream_read(union bpf_attr *attr) 6292 { 6293 char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf); 6294 u32 len = attr->prog_stream_read.stream_buf_len; 6295 struct bpf_prog *prog; 6296 int ret; 6297 6298 if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD)) 6299 return -EINVAL; 6300 6301 prog = bpf_prog_get(attr->prog_stream_read.prog_fd); 6302 if (IS_ERR(prog)) 6303 return PTR_ERR(prog); 6304 6305 ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len); 6306 bpf_prog_put(prog); 6307 6308 return ret; 6309 } 6310 6311 #define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd 6312 6313 static int prog_assoc_struct_ops(union bpf_attr *attr) 6314 { 6315 struct bpf_prog *prog; 6316 struct bpf_map *map; 6317 int ret; 6318 6319 if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) 6320 return -EINVAL; 6321 6322 if (attr->prog_assoc_struct_ops.flags) 6323 return -EINVAL; 6324 6325 prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); 6326 if (IS_ERR(prog)) 6327 return PTR_ERR(prog); 6328 6329 if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { 6330 ret = -EINVAL; 6331 goto put_prog; 6332 } 6333 6334 map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); 6335 if (IS_ERR(map)) { 6336 ret = PTR_ERR(map); 6337 goto put_prog; 6338 } 6339 6340 if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { 6341 ret = -EINVAL; 6342 goto put_map; 6343 } 6344 6345 ret = bpf_prog_assoc_struct_ops(prog, map); 6346 6347 put_map: 6348 bpf_map_put(map); 6349 put_prog: 6350 bpf_prog_put(prog); 6351 return ret; 6352 } 6353 6354 static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, 6355 bpfptr_t uattr_common, unsigned int size_common) 6356 { 6357 struct bpf_common_attr attr_common; 6358 u32 offsetof_log_true_size = 0; 6359 struct bpf_log_attr attr_log; 6360 union bpf_attr attr; 6361 int err; 6362 6363 err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); 6364 if (err) 6365 return err; 6366 size = min_t(u32, size, sizeof(attr)); 6367 6368 /* copy attributes from user space, may be less than sizeof(bpf_attr) */ 6369 memset(&attr, 0, sizeof(attr)); 6370 if (copy_from_bpfptr(&attr, uattr, size) != 0) 6371 return -EFAULT; 6372 6373 memset(&attr_common, 0, sizeof(attr_common)); 6374 if (cmd & BPF_COMMON_ATTRS) { 6375 err = bpf_check_uarg_tail_zero(uattr_common, 6376 offsetofend(struct bpf_common_attr, log_true_size), 6377 size_common); 6378 if (err) 6379 return err; 6380 6381 cmd &= ~BPF_COMMON_ATTRS; 6382 size_common = min_t(u32, size_common, sizeof(attr_common)); 6383 if (copy_from_bpfptr(&attr_common, uattr_common, size_common) != 0) 6384 return -EFAULT; 6385 } else { 6386 size_common = 0; 6387 } 6388 6389 err = security_bpf(cmd, &attr, size, uattr.is_kernel); 6390 if (err < 0) 6391 return err; 6392 6393 switch (cmd) { 6394 case BPF_MAP_CREATE: 6395 err = map_create(&attr, uattr, &attr_common, uattr_common, size_common); 6396 break; 6397 case BPF_MAP_LOOKUP_ELEM: 6398 err = map_lookup_elem(&attr); 6399 break; 6400 case BPF_MAP_UPDATE_ELEM: 6401 err = map_update_elem(&attr, uattr); 6402 break; 6403 case BPF_MAP_DELETE_ELEM: 6404 err = map_delete_elem(&attr, uattr); 6405 break; 6406 case BPF_MAP_GET_NEXT_KEY: 6407 err = map_get_next_key(&attr); 6408 break; 6409 case BPF_MAP_FREEZE: 6410 err = map_freeze(&attr); 6411 break; 6412 case BPF_PROG_LOAD: 6413 if (size >= offsetofend(union bpf_attr, log_true_size)) 6414 offsetof_log_true_size = offsetof(union bpf_attr, log_true_size); 6415 err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level, 6416 offsetof_log_true_size, uattr, &attr_common, uattr_common, 6417 size_common); 6418 err = err ?: bpf_prog_load(&attr, uattr, &attr_log); 6419 break; 6420 case BPF_OBJ_PIN: 6421 err = bpf_obj_pin(&attr); 6422 break; 6423 case BPF_OBJ_GET: 6424 err = bpf_obj_get(&attr); 6425 break; 6426 case BPF_PROG_ATTACH: 6427 err = bpf_prog_attach(&attr); 6428 break; 6429 case BPF_PROG_DETACH: 6430 err = bpf_prog_detach(&attr); 6431 break; 6432 case BPF_PROG_QUERY: 6433 err = bpf_prog_query(&attr, uattr.user, size); 6434 break; 6435 case BPF_PROG_TEST_RUN: 6436 err = bpf_prog_test_run(&attr, uattr.user); 6437 break; 6438 case BPF_PROG_GET_NEXT_ID: 6439 err = bpf_obj_get_next_id(&attr, uattr.user, 6440 &prog_idr, &prog_idr_lock); 6441 break; 6442 case BPF_MAP_GET_NEXT_ID: 6443 err = bpf_obj_get_next_id(&attr, uattr.user, 6444 &map_idr, &map_idr_lock); 6445 break; 6446 case BPF_BTF_GET_NEXT_ID: 6447 err = bpf_obj_get_next_id(&attr, uattr.user, 6448 &btf_idr, &btf_idr_lock); 6449 break; 6450 case BPF_PROG_GET_FD_BY_ID: 6451 err = bpf_prog_get_fd_by_id(&attr); 6452 break; 6453 case BPF_MAP_GET_FD_BY_ID: 6454 err = bpf_map_get_fd_by_id(&attr); 6455 break; 6456 case BPF_OBJ_GET_INFO_BY_FD: 6457 err = bpf_obj_get_info_by_fd(&attr, uattr.user); 6458 break; 6459 case BPF_RAW_TRACEPOINT_OPEN: 6460 err = bpf_raw_tracepoint_open(&attr); 6461 break; 6462 case BPF_BTF_LOAD: 6463 if (size >= offsetofend(union bpf_attr, btf_log_true_size)) 6464 offsetof_log_true_size = offsetof(union bpf_attr, btf_log_true_size); 6465 err = bpf_log_attr_init(&attr_log, attr.btf_log_buf, attr.btf_log_size, 6466 attr.btf_log_level, offsetof_log_true_size, uattr, 6467 &attr_common, uattr_common, size_common); 6468 err = err ?: bpf_btf_load(&attr, uattr, &attr_log); 6469 break; 6470 case BPF_BTF_GET_FD_BY_ID: 6471 err = bpf_btf_get_fd_by_id(&attr); 6472 break; 6473 case BPF_TASK_FD_QUERY: 6474 err = bpf_task_fd_query(&attr, uattr.user); 6475 break; 6476 case BPF_MAP_LOOKUP_AND_DELETE_ELEM: 6477 err = map_lookup_and_delete_elem(&attr); 6478 break; 6479 case BPF_MAP_LOOKUP_BATCH: 6480 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); 6481 break; 6482 case BPF_MAP_LOOKUP_AND_DELETE_BATCH: 6483 err = bpf_map_do_batch(&attr, uattr.user, 6484 BPF_MAP_LOOKUP_AND_DELETE_BATCH); 6485 break; 6486 case BPF_MAP_UPDATE_BATCH: 6487 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); 6488 break; 6489 case BPF_MAP_DELETE_BATCH: 6490 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); 6491 break; 6492 case BPF_LINK_CREATE: 6493 err = link_create(&attr, uattr); 6494 break; 6495 case BPF_LINK_UPDATE: 6496 err = link_update(&attr); 6497 break; 6498 case BPF_LINK_GET_FD_BY_ID: 6499 err = bpf_link_get_fd_by_id(&attr); 6500 break; 6501 case BPF_LINK_GET_NEXT_ID: 6502 err = bpf_obj_get_next_id(&attr, uattr.user, 6503 &link_idr, &link_idr_lock); 6504 break; 6505 case BPF_ENABLE_STATS: 6506 err = bpf_enable_stats(&attr); 6507 break; 6508 case BPF_ITER_CREATE: 6509 err = bpf_iter_create(&attr); 6510 break; 6511 case BPF_LINK_DETACH: 6512 err = link_detach(&attr); 6513 break; 6514 case BPF_PROG_BIND_MAP: 6515 err = bpf_prog_bind_map(&attr); 6516 break; 6517 case BPF_TOKEN_CREATE: 6518 err = token_create(&attr); 6519 break; 6520 case BPF_PROG_STREAM_READ_BY_FD: 6521 err = prog_stream_read(&attr); 6522 break; 6523 case BPF_PROG_ASSOC_STRUCT_OPS: 6524 err = prog_assoc_struct_ops(&attr); 6525 break; 6526 default: 6527 err = -EINVAL; 6528 break; 6529 } 6530 6531 return err; 6532 } 6533 6534 SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size, 6535 struct bpf_common_attr __user *, uattr_common, unsigned int, size_common) 6536 { 6537 return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common); 6538 } 6539 6540 static bool syscall_prog_is_valid_access(int off, int size, 6541 enum bpf_access_type type, 6542 const struct bpf_prog *prog, 6543 struct bpf_insn_access_aux *info) 6544 { 6545 if (off < 0 || off >= U16_MAX) 6546 return false; 6547 /* No alignment requirements for syscall ctx accesses. */ 6548 return true; 6549 } 6550 6551 BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) 6552 { 6553 switch (cmd) { 6554 case BPF_MAP_CREATE: 6555 case BPF_MAP_DELETE_ELEM: 6556 case BPF_MAP_UPDATE_ELEM: 6557 case BPF_MAP_FREEZE: 6558 case BPF_MAP_GET_FD_BY_ID: 6559 case BPF_PROG_LOAD: 6560 case BPF_BTF_LOAD: 6561 case BPF_LINK_CREATE: 6562 case BPF_RAW_TRACEPOINT_OPEN: 6563 break; 6564 default: 6565 return -EINVAL; 6566 } 6567 return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0); 6568 } 6569 6570 6571 /* To shut up -Wmissing-prototypes. 6572 * This function is used by the kernel light skeleton 6573 * to load bpf programs when modules are loaded or during kernel boot. 6574 * See tools/lib/bpf/skel_internal.h 6575 */ 6576 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); 6577 6578 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) 6579 { 6580 struct bpf_prog * __maybe_unused prog; 6581 struct bpf_tramp_run_ctx __maybe_unused run_ctx; 6582 6583 switch (cmd) { 6584 #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ 6585 case BPF_PROG_TEST_RUN: 6586 if (attr->test.data_in || attr->test.data_out || 6587 attr->test.ctx_out || attr->test.duration || 6588 attr->test.repeat || attr->test.flags) 6589 return -EINVAL; 6590 6591 prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); 6592 if (IS_ERR(prog)) 6593 return PTR_ERR(prog); 6594 6595 if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || 6596 attr->test.ctx_size_in > U16_MAX) { 6597 bpf_prog_put(prog); 6598 return -EINVAL; 6599 } 6600 6601 run_ctx.bpf_cookie = 0; 6602 if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { 6603 /* recursion detected */ 6604 __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); 6605 bpf_prog_put(prog); 6606 return -EBUSY; 6607 } 6608 attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); 6609 __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, 6610 &run_ctx); 6611 bpf_prog_put(prog); 6612 return 0; 6613 #endif 6614 default: 6615 return ____bpf_sys_bpf(cmd, attr, size); 6616 } 6617 } 6618 EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL"); 6619 6620 static const struct bpf_func_proto bpf_sys_bpf_proto = { 6621 .func = bpf_sys_bpf, 6622 .gpl_only = false, 6623 .ret_type = RET_INTEGER, 6624 .arg1_type = ARG_ANYTHING, 6625 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6626 .arg3_type = ARG_CONST_SIZE, 6627 }; 6628 6629 const struct bpf_func_proto * __weak 6630 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 6631 { 6632 return bpf_base_func_proto(func_id, prog); 6633 } 6634 6635 BPF_CALL_1(bpf_sys_close, u32, fd) 6636 { 6637 /* When bpf program calls this helper there should not be 6638 * an fdget() without matching completed fdput(). 6639 * This helper is allowed in the following callchain only: 6640 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close 6641 */ 6642 return close_fd(fd); 6643 } 6644 6645 static const struct bpf_func_proto bpf_sys_close_proto = { 6646 .func = bpf_sys_close, 6647 .gpl_only = false, 6648 .ret_type = RET_INTEGER, 6649 .arg1_type = ARG_ANYTHING, 6650 }; 6651 6652 BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res) 6653 { 6654 *res = 0; 6655 if (flags) 6656 return -EINVAL; 6657 6658 if (name_sz <= 1 || name[name_sz - 1]) 6659 return -EINVAL; 6660 6661 if (!bpf_dump_raw_ok(current_cred())) 6662 return -EPERM; 6663 6664 *res = kallsyms_lookup_name(name); 6665 return *res ? 0 : -ENOENT; 6666 } 6667 6668 static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { 6669 .func = bpf_kallsyms_lookup_name, 6670 .gpl_only = false, 6671 .ret_type = RET_INTEGER, 6672 .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6673 .arg2_type = ARG_CONST_SIZE_OR_ZERO, 6674 .arg3_type = ARG_ANYTHING, 6675 .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, 6676 .arg4_size = sizeof(u64), 6677 }; 6678 6679 static const struct bpf_func_proto * 6680 syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 6681 { 6682 switch (func_id) { 6683 case BPF_FUNC_sys_bpf: 6684 return !bpf_token_capable(prog->aux->token, CAP_PERFMON) 6685 ? NULL : &bpf_sys_bpf_proto; 6686 case BPF_FUNC_btf_find_by_name_kind: 6687 return &bpf_btf_find_by_name_kind_proto; 6688 case BPF_FUNC_sys_close: 6689 return &bpf_sys_close_proto; 6690 case BPF_FUNC_kallsyms_lookup_name: 6691 return &bpf_kallsyms_lookup_name_proto; 6692 default: 6693 return tracing_prog_func_proto(func_id, prog); 6694 } 6695 } 6696 6697 const struct bpf_verifier_ops bpf_syscall_verifier_ops = { 6698 .get_func_proto = syscall_prog_func_proto, 6699 .is_valid_access = syscall_prog_is_valid_access, 6700 }; 6701 6702 const struct bpf_prog_ops bpf_syscall_prog_ops = { 6703 .test_run = bpf_prog_test_run_syscall, 6704 }; 6705 6706 #ifdef CONFIG_SYSCTL 6707 static int bpf_stats_handler(const struct ctl_table *table, int write, 6708 void *buffer, size_t *lenp, loff_t *ppos) 6709 { 6710 struct static_key *key = (struct static_key *)table->data; 6711 static int saved_val; 6712 int val, ret; 6713 struct ctl_table tmp = { 6714 .data = &val, 6715 .maxlen = sizeof(val), 6716 .mode = table->mode, 6717 .extra1 = SYSCTL_ZERO, 6718 .extra2 = SYSCTL_ONE, 6719 }; 6720 6721 if (write && !capable(CAP_SYS_ADMIN)) 6722 return -EPERM; 6723 6724 mutex_lock(&bpf_stats_enabled_mutex); 6725 val = saved_val; 6726 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 6727 if (write && !ret && val != saved_val) { 6728 if (val) 6729 static_key_slow_inc(key); 6730 else 6731 static_key_slow_dec(key); 6732 saved_val = val; 6733 } 6734 mutex_unlock(&bpf_stats_enabled_mutex); 6735 return ret; 6736 } 6737 6738 void __weak unpriv_ebpf_notify(int new_state) 6739 { 6740 } 6741 6742 static int bpf_unpriv_handler(const struct ctl_table *table, int write, 6743 void *buffer, size_t *lenp, loff_t *ppos) 6744 { 6745 int ret, unpriv_enable = *(int *)table->data; 6746 bool locked_state = unpriv_enable == 1; 6747 struct ctl_table tmp = *table; 6748 6749 if (write && !capable(CAP_SYS_ADMIN)) 6750 return -EPERM; 6751 6752 tmp.data = &unpriv_enable; 6753 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 6754 if (write && !ret) { 6755 if (locked_state && unpriv_enable != 1) 6756 return -EPERM; 6757 *(int *)table->data = unpriv_enable; 6758 } 6759 6760 if (write) 6761 unpriv_ebpf_notify(unpriv_enable); 6762 6763 return ret; 6764 } 6765 6766 static const struct ctl_table bpf_syscall_table[] = { 6767 { 6768 .procname = "unprivileged_bpf_disabled", 6769 .data = &sysctl_unprivileged_bpf_disabled, 6770 .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), 6771 .mode = 0644, 6772 .proc_handler = bpf_unpriv_handler, 6773 .extra1 = SYSCTL_ZERO, 6774 .extra2 = SYSCTL_TWO, 6775 }, 6776 { 6777 .procname = "bpf_stats_enabled", 6778 .data = &bpf_stats_enabled_key.key, 6779 .mode = 0644, 6780 .proc_handler = bpf_stats_handler, 6781 }, 6782 }; 6783 6784 static int __init bpf_syscall_sysctl_init(void) 6785 { 6786 register_sysctl_init("kernel", bpf_syscall_table); 6787 return 0; 6788 } 6789 late_initcall(bpf_syscall_sysctl_init); 6790 #endif /* CONFIG_SYSCTL */ 6791