1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 3 */ 4 #include <crypto/sha2.h> 5 #include <linux/bpf.h> 6 #include <linux/bpf-cgroup.h> 7 #include <linux/bpf_trace.h> 8 #include <linux/bpf_lirc.h> 9 #include <linux/bpf_verifier.h> 10 #include <linux/bsearch.h> 11 #include <linux/btf.h> 12 #include <linux/hex.h> 13 #include <linux/syscalls.h> 14 #include <linux/slab.h> 15 #include <linux/sched/signal.h> 16 #include <linux/vmalloc.h> 17 #include <linux/mmzone.h> 18 #include <linux/anon_inodes.h> 19 #include <linux/fdtable.h> 20 #include <linux/file.h> 21 #include <linux/fs.h> 22 #include <linux/license.h> 23 #include <linux/filter.h> 24 #include <linux/kernel.h> 25 #include <linux/idr.h> 26 #include <linux/cred.h> 27 #include <linux/timekeeping.h> 28 #include <linux/ctype.h> 29 #include <linux/nospec.h> 30 #include <linux/audit.h> 31 #include <uapi/linux/btf.h> 32 #include <linux/pgtable.h> 33 #include <linux/bpf_lsm.h> 34 #include <linux/poll.h> 35 #include <linux/sort.h> 36 #include <linux/bpf-netns.h> 37 #include <linux/rcupdate_trace.h> 38 #include <linux/memcontrol.h> 39 #include <linux/trace_events.h> 40 #include <linux/tracepoint.h> 41 #include <linux/overflow.h> 42 #include <linux/cookie.h> 43 #include <linux/verification.h> 44 #include <linux/btf_ids.h> 45 46 #include <net/netfilter/nf_bpf_link.h> 47 #include <net/netkit.h> 48 #include <net/tcx.h> 49 50 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ 51 (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ 52 (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) 53 #define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) 54 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) 55 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ 56 IS_FD_HASH(map)) 57 58 #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) 59 60 DEFINE_PER_CPU(int, bpf_prog_active); 61 
DEFINE_COOKIE(bpf_map_cookie); 62 static DEFINE_IDR(prog_idr); 63 static DEFINE_SPINLOCK(prog_idr_lock); 64 static DEFINE_IDR(map_idr); 65 static DEFINE_SPINLOCK(map_idr_lock); 66 static DEFINE_IDR(link_idr); 67 static DEFINE_SPINLOCK(link_idr_lock); 68 69 int sysctl_unprivileged_bpf_disabled __read_mostly = 70 IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; 71 72 static const struct bpf_map_ops * const bpf_map_types[] = { 73 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) 74 #define BPF_MAP_TYPE(_id, _ops) \ 75 [_id] = &_ops, 76 #define BPF_LINK_TYPE(_id, _name) 77 #include <linux/bpf_types.h> 78 #undef BPF_PROG_TYPE 79 #undef BPF_MAP_TYPE 80 #undef BPF_LINK_TYPE 81 }; 82 83 /* 84 * If we're handed a bigger struct than we know of, ensure all the unknown bits 85 * are 0 - i.e. new user-space does not rely on any kernel feature extensions 86 * we don't know about yet. 87 * 88 * There is a ToCToU between this function call and the following 89 * copy_from_user() call. However, this is not a concern since this function is 90 * meant to be a future-proofing of bits. 91 */ 92 int bpf_check_uarg_tail_zero(bpfptr_t uaddr, 93 size_t expected_size, 94 size_t actual_size) 95 { 96 int res; 97 98 if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ 99 return -E2BIG; 100 101 if (actual_size <= expected_size) 102 return 0; 103 104 if (uaddr.is_kernel) 105 res = memchr_inv(uaddr.kernel + expected_size, 0, 106 actual_size - expected_size) == NULL; 107 else 108 res = check_zeroed_user(uaddr.user + expected_size, 109 actual_size - expected_size); 110 if (res < 0) 111 return res; 112 return res ? 
0 : -E2BIG; 113 } 114 115 const struct bpf_map_ops bpf_map_offload_ops = { 116 .map_meta_equal = bpf_map_meta_equal, 117 .map_alloc = bpf_map_offload_map_alloc, 118 .map_free = bpf_map_offload_map_free, 119 .map_check_btf = map_check_no_btf, 120 .map_mem_usage = bpf_map_offload_map_mem_usage, 121 }; 122 123 static void bpf_map_write_active_inc(struct bpf_map *map) 124 { 125 atomic64_inc(&map->writecnt); 126 } 127 128 static void bpf_map_write_active_dec(struct bpf_map *map) 129 { 130 atomic64_dec(&map->writecnt); 131 } 132 133 bool bpf_map_write_active(const struct bpf_map *map) 134 { 135 return atomic64_read(&map->writecnt) != 0; 136 } 137 138 static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) 139 { 140 if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) 141 return map->value_size; 142 else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 143 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 144 map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || 145 map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 146 return round_up(map->value_size, 8) * num_possible_cpus(); 147 else if (IS_FD_MAP(map)) 148 return sizeof(u32); 149 else 150 return map->value_size; 151 } 152 153 static void maybe_wait_bpf_programs(struct bpf_map *map) 154 { 155 /* Wait for any running non-sleepable BPF programs to complete so that 156 * userspace, when we return to it, knows that all non-sleepable 157 * programs that could be running use the new map value. For sleepable 158 * BPF programs, synchronize_rcu_tasks_trace() should be used to wait 159 * for the completions of these programs, but considering the waiting 160 * time can be very long and userspace may think it will hang forever, 161 * so don't handle sleepable BPF programs now. 
162 */ 163 if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || 164 map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) 165 synchronize_rcu_expedited(); 166 } 167 168 static void unpin_uptr_kaddr(void *kaddr) 169 { 170 if (kaddr) 171 unpin_user_page(virt_to_page(kaddr)); 172 } 173 174 static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) 175 { 176 const struct btf_field *field; 177 void **uptr_addr; 178 int i; 179 180 for (i = 0, field = rec->fields; i < cnt; i++, field++) { 181 if (field->type != BPF_UPTR) 182 continue; 183 184 uptr_addr = obj + field->offset; 185 unpin_uptr_kaddr(*uptr_addr); 186 } 187 } 188 189 static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) 190 { 191 if (!btf_record_has_field(rec, BPF_UPTR)) 192 return; 193 194 __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); 195 } 196 197 static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) 198 { 199 const struct btf_field *field; 200 const struct btf_type *t; 201 unsigned long start, end; 202 struct page *page; 203 void **uptr_addr; 204 int i, err; 205 206 if (!btf_record_has_field(rec, BPF_UPTR)) 207 return 0; 208 209 for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { 210 if (field->type != BPF_UPTR) 211 continue; 212 213 uptr_addr = obj + field->offset; 214 start = *(unsigned long *)uptr_addr; 215 if (!start) 216 continue; 217 218 t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); 219 /* t->size was checked for zero before */ 220 if (check_add_overflow(start, t->size - 1, &end)) { 221 err = -EFAULT; 222 goto unpin_all; 223 } 224 225 /* The uptr's struct cannot span across two pages */ 226 if ((start & PAGE_MASK) != (end & PAGE_MASK)) { 227 err = -EOPNOTSUPP; 228 goto unpin_all; 229 } 230 231 err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); 232 if (err != 1) 233 goto unpin_all; 234 235 if (PageHighMem(page)) { 236 err = -EOPNOTSUPP; 237 unpin_user_page(page); 238 goto unpin_all; 239 } 240 241 *uptr_addr = page_address(page) + 
offset_in_page(start); 242 } 243 244 return 0; 245 246 unpin_all: 247 __bpf_obj_unpin_uptrs(rec, i, obj); 248 return err; 249 } 250 251 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, 252 void *key, void *value, __u64 flags) 253 { 254 int err; 255 256 /* Need to create a kthread, thus must support schedule */ 257 if (bpf_map_is_offloaded(map)) { 258 return bpf_map_offload_update_elem(map, key, value, flags); 259 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || 260 map->map_type == BPF_MAP_TYPE_ARENA || 261 map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 262 return map->ops->map_update_elem(map, key, value, flags); 263 } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || 264 map->map_type == BPF_MAP_TYPE_SOCKMAP) { 265 return sock_map_update_elem_sys(map, key, value, flags); 266 } else if (IS_FD_PROG_ARRAY(map)) { 267 return bpf_fd_array_map_update_elem(map, map_file, key, value, 268 flags); 269 } 270 271 bpf_disable_instrumentation(); 272 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 273 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 274 err = bpf_percpu_hash_update(map, key, value, flags); 275 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 276 err = bpf_percpu_array_update(map, key, value, flags); 277 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { 278 err = bpf_percpu_cgroup_storage_update(map, key, value, 279 flags); 280 } else if (IS_FD_ARRAY(map)) { 281 err = bpf_fd_array_map_update_elem(map, map_file, key, value, 282 flags); 283 } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { 284 err = bpf_fd_htab_map_update_elem(map, map_file, key, value, 285 flags); 286 } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 287 /* rcu_read_lock() is not needed */ 288 err = bpf_fd_reuseport_array_update_elem(map, key, value, 289 flags); 290 } else if (map->map_type == BPF_MAP_TYPE_QUEUE || 291 map->map_type == BPF_MAP_TYPE_STACK || 292 map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { 293 err = 
map->ops->map_push_elem(map, value, flags); 294 } else { 295 err = bpf_obj_pin_uptrs(map->record, value); 296 if (!err) { 297 rcu_read_lock(); 298 err = map->ops->map_update_elem(map, key, value, flags); 299 rcu_read_unlock(); 300 if (err) 301 bpf_obj_unpin_uptrs(map->record, value); 302 } 303 } 304 bpf_enable_instrumentation(); 305 306 return err; 307 } 308 309 static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, 310 __u64 flags) 311 { 312 void *ptr; 313 int err; 314 315 if (bpf_map_is_offloaded(map)) 316 return bpf_map_offload_lookup_elem(map, key, value); 317 318 bpf_disable_instrumentation(); 319 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 320 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 321 err = bpf_percpu_hash_copy(map, key, value, flags); 322 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 323 err = bpf_percpu_array_copy(map, key, value, flags); 324 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { 325 err = bpf_percpu_cgroup_storage_copy(map, key, value, flags); 326 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 327 err = bpf_stackmap_extract(map, key, value, false); 328 } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { 329 err = bpf_fd_array_map_lookup_elem(map, key, value); 330 } else if (IS_FD_HASH(map)) { 331 err = bpf_fd_htab_map_lookup_elem(map, key, value); 332 } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 333 err = bpf_fd_reuseport_array_lookup_elem(map, key, value); 334 } else if (map->map_type == BPF_MAP_TYPE_QUEUE || 335 map->map_type == BPF_MAP_TYPE_STACK || 336 map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { 337 err = map->ops->map_peek_elem(map, value); 338 } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 339 /* struct_ops map requires directly updating "value" */ 340 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); 341 } else { 342 rcu_read_lock(); 343 if (map->ops->map_lookup_elem_sys_only) 344 ptr = map->ops->map_lookup_elem_sys_only(map, 
key); 345 else 346 ptr = map->ops->map_lookup_elem(map, key); 347 if (IS_ERR(ptr)) { 348 err = PTR_ERR(ptr); 349 } else if (!ptr) { 350 err = -ENOENT; 351 } else { 352 err = 0; 353 if (flags & BPF_F_LOCK) 354 /* lock 'ptr' and copy everything but lock */ 355 copy_map_value_locked(map, value, ptr, true); 356 else 357 copy_map_value(map, value, ptr); 358 /* mask lock and timer, since value wasn't zero inited */ 359 check_and_init_map_value(map, value); 360 } 361 rcu_read_unlock(); 362 } 363 364 bpf_enable_instrumentation(); 365 366 return err; 367 } 368 369 /* Please, do not use this function outside from the map creation path 370 * (e.g. in map update path) without taking care of setting the active 371 * memory cgroup (see at bpf_map_kmalloc_node() for example). 372 */ 373 static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) 374 { 375 /* We really just want to fail instead of triggering OOM killer 376 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, 377 * which is used for lower order allocation requests. 378 * 379 * It has been observed that higher order allocation requests done by 380 * vmalloc with __GFP_NORETRY being set might fail due to not trying 381 * to reclaim memory from the page cache, thus we set 382 * __GFP_RETRY_MAYFAIL to avoid such situations. 
383 */ 384 385 gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO); 386 unsigned int flags = 0; 387 unsigned long align = 1; 388 void *area; 389 390 if (size >= SIZE_MAX) 391 return NULL; 392 393 /* kmalloc()'ed memory can't be mmap()'ed */ 394 if (mmapable) { 395 BUG_ON(!PAGE_ALIGNED(size)); 396 align = SHMLBA; 397 flags = VM_USERMAP; 398 } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 399 area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, 400 numa_node); 401 if (area != NULL) 402 return area; 403 } 404 405 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 406 gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, 407 flags, numa_node, __builtin_return_address(0)); 408 } 409 410 void *bpf_map_area_alloc(u64 size, int numa_node) 411 { 412 return __bpf_map_area_alloc(size, numa_node, false); 413 } 414 415 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) 416 { 417 return __bpf_map_area_alloc(size, numa_node, true); 418 } 419 420 void bpf_map_area_free(void *area) 421 { 422 kvfree(area); 423 } 424 425 static u32 bpf_map_flags_retain_permanent(u32 flags) 426 { 427 /* Some map creation flags are not tied to the map object but 428 * rather to the map fd instead, so they have no meaning upon 429 * map object inspection since multiple file descriptors with 430 * different (access) properties can exist here. Thus, given 431 * this has zero meaning for the map itself, lets clear these 432 * from here. 
433 */ 434 return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); 435 } 436 437 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) 438 { 439 map->map_type = attr->map_type; 440 map->key_size = attr->key_size; 441 map->value_size = attr->value_size; 442 map->max_entries = attr->max_entries; 443 map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); 444 map->numa_node = bpf_map_attr_numa_node(attr); 445 map->map_extra = attr->map_extra; 446 } 447 448 static int bpf_map_alloc_id(struct bpf_map *map) 449 { 450 int id; 451 452 idr_preload(GFP_KERNEL); 453 spin_lock_bh(&map_idr_lock); 454 id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); 455 if (id > 0) 456 map->id = id; 457 spin_unlock_bh(&map_idr_lock); 458 idr_preload_end(); 459 460 if (WARN_ON_ONCE(!id)) 461 return -ENOSPC; 462 463 return id > 0 ? 0 : id; 464 } 465 466 void bpf_map_free_id(struct bpf_map *map) 467 { 468 unsigned long flags; 469 470 /* Offloaded maps are removed from the IDR store when their device 471 * disappears - even if someone holds an fd to them they are unusable, 472 * the memory is gone, all ops will fail; they are simply waiting for 473 * refcnt to drop to be freed. 474 */ 475 if (!map->id) 476 return; 477 478 spin_lock_irqsave(&map_idr_lock, flags); 479 480 idr_remove(&map_idr, map->id); 481 map->id = 0; 482 483 spin_unlock_irqrestore(&map_idr_lock, flags); 484 } 485 486 #ifdef CONFIG_MEMCG 487 static void bpf_map_save_memcg(struct bpf_map *map) 488 { 489 /* Currently if a map is created by a process belonging to the root 490 * memory cgroup, get_obj_cgroup_from_current() will return NULL. 491 * So we have to check map->objcg for being NULL each time it's 492 * being used. 
493 */ 494 if (memcg_bpf_enabled()) 495 map->objcg = get_obj_cgroup_from_current(); 496 } 497 498 static void bpf_map_release_memcg(struct bpf_map *map) 499 { 500 if (map->objcg) 501 obj_cgroup_put(map->objcg); 502 } 503 504 static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) 505 { 506 if (map->objcg) 507 return get_mem_cgroup_from_objcg(map->objcg); 508 509 return root_mem_cgroup; 510 } 511 512 void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, 513 struct mem_cgroup **new_memcg) 514 { 515 *new_memcg = bpf_map_get_memcg(map); 516 *old_memcg = set_active_memcg(*new_memcg); 517 } 518 519 void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, 520 struct mem_cgroup *new_memcg) 521 { 522 set_active_memcg(old_memcg); 523 mem_cgroup_put(new_memcg); 524 } 525 526 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, 527 int node) 528 { 529 struct mem_cgroup *memcg, *old_memcg; 530 void *ptr; 531 532 bpf_map_memcg_enter(map, &old_memcg, &memcg); 533 ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); 534 bpf_map_memcg_exit(old_memcg, memcg); 535 536 return ptr; 537 } 538 539 void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, 540 int node) 541 { 542 struct mem_cgroup *memcg, *old_memcg; 543 void *ptr; 544 545 bpf_map_memcg_enter(map, &old_memcg, &memcg); 546 ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); 547 bpf_map_memcg_exit(old_memcg, memcg); 548 549 return ptr; 550 } 551 552 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) 553 { 554 struct mem_cgroup *memcg, *old_memcg; 555 void *ptr; 556 557 bpf_map_memcg_enter(map, &old_memcg, &memcg); 558 ptr = kzalloc(size, flags | __GFP_ACCOUNT); 559 bpf_map_memcg_exit(old_memcg, memcg); 560 561 return ptr; 562 } 563 564 void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, 565 gfp_t flags) 566 { 567 struct mem_cgroup *memcg, *old_memcg; 568 void *ptr; 569 570 
bpf_map_memcg_enter(map, &old_memcg, &memcg); 571 ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); 572 bpf_map_memcg_exit(old_memcg, memcg); 573 574 return ptr; 575 } 576 577 void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, 578 size_t align, gfp_t flags) 579 { 580 struct mem_cgroup *memcg, *old_memcg; 581 void __percpu *ptr; 582 583 bpf_map_memcg_enter(map, &old_memcg, &memcg); 584 ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); 585 bpf_map_memcg_exit(old_memcg, memcg); 586 587 return ptr; 588 } 589 590 #else 591 static void bpf_map_save_memcg(struct bpf_map *map) 592 { 593 } 594 595 static void bpf_map_release_memcg(struct bpf_map *map) 596 { 597 } 598 #endif 599 600 static bool can_alloc_pages(void) 601 { 602 return preempt_count() == 0 && !irqs_disabled() && 603 !IS_ENABLED(CONFIG_PREEMPT_RT); 604 } 605 606 static struct page *__bpf_alloc_page(int nid) 607 { 608 if (!can_alloc_pages()) 609 return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); 610 611 return alloc_pages_node(nid, 612 GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT 613 | __GFP_NOWARN, 614 0); 615 } 616 617 int bpf_map_alloc_pages(const struct bpf_map *map, int nid, 618 unsigned long nr_pages, struct page **pages) 619 { 620 unsigned long i, j; 621 struct page *pg; 622 int ret = 0; 623 624 for (i = 0; i < nr_pages; i++) { 625 pg = __bpf_alloc_page(nid); 626 627 if (pg) { 628 pages[i] = pg; 629 continue; 630 } 631 for (j = 0; j < i; j++) 632 free_pages_nolock(pages[j], 0); 633 ret = -ENOMEM; 634 break; 635 } 636 637 return ret; 638 } 639 640 641 static int btf_field_cmp(const void *a, const void *b) 642 { 643 const struct btf_field *f1 = a, *f2 = b; 644 645 if (f1->offset < f2->offset) 646 return -1; 647 else if (f1->offset > f2->offset) 648 return 1; 649 return 0; 650 } 651 652 struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, 653 u32 field_mask) 654 { 655 struct btf_field *field; 656 657 if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & 
field_mask)) 658 return NULL; 659 field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); 660 if (!field || !(field->type & field_mask)) 661 return NULL; 662 return field; 663 } 664 665 void btf_record_free(struct btf_record *rec) 666 { 667 int i; 668 669 if (IS_ERR_OR_NULL(rec)) 670 return; 671 for (i = 0; i < rec->cnt; i++) { 672 switch (rec->fields[i].type) { 673 case BPF_KPTR_UNREF: 674 case BPF_KPTR_REF: 675 case BPF_KPTR_PERCPU: 676 case BPF_UPTR: 677 if (rec->fields[i].kptr.module) 678 module_put(rec->fields[i].kptr.module); 679 if (btf_is_kernel(rec->fields[i].kptr.btf)) 680 btf_put(rec->fields[i].kptr.btf); 681 break; 682 case BPF_LIST_HEAD: 683 case BPF_LIST_NODE: 684 case BPF_RB_ROOT: 685 case BPF_RB_NODE: 686 case BPF_SPIN_LOCK: 687 case BPF_RES_SPIN_LOCK: 688 case BPF_TIMER: 689 case BPF_REFCOUNT: 690 case BPF_WORKQUEUE: 691 case BPF_TASK_WORK: 692 /* Nothing to release */ 693 break; 694 default: 695 WARN_ON_ONCE(1); 696 continue; 697 } 698 } 699 kfree(rec); 700 } 701 702 void bpf_map_free_record(struct bpf_map *map) 703 { 704 btf_record_free(map->record); 705 map->record = NULL; 706 } 707 708 struct btf_record *btf_record_dup(const struct btf_record *rec) 709 { 710 const struct btf_field *fields; 711 struct btf_record *new_rec; 712 int ret, size, i; 713 714 if (IS_ERR_OR_NULL(rec)) 715 return NULL; 716 size = struct_size(rec, fields, rec->cnt); 717 new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); 718 if (!new_rec) 719 return ERR_PTR(-ENOMEM); 720 /* Do a deep copy of the btf_record */ 721 fields = rec->fields; 722 new_rec->cnt = 0; 723 for (i = 0; i < rec->cnt; i++) { 724 switch (fields[i].type) { 725 case BPF_KPTR_UNREF: 726 case BPF_KPTR_REF: 727 case BPF_KPTR_PERCPU: 728 case BPF_UPTR: 729 if (btf_is_kernel(fields[i].kptr.btf)) 730 btf_get(fields[i].kptr.btf); 731 if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { 732 ret = -ENXIO; 733 goto free; 734 } 735 break; 736 case BPF_LIST_HEAD: 
737 case BPF_LIST_NODE: 738 case BPF_RB_ROOT: 739 case BPF_RB_NODE: 740 case BPF_SPIN_LOCK: 741 case BPF_RES_SPIN_LOCK: 742 case BPF_TIMER: 743 case BPF_REFCOUNT: 744 case BPF_WORKQUEUE: 745 case BPF_TASK_WORK: 746 /* Nothing to acquire */ 747 break; 748 default: 749 ret = -EFAULT; 750 WARN_ON_ONCE(1); 751 goto free; 752 } 753 new_rec->cnt++; 754 } 755 return new_rec; 756 free: 757 btf_record_free(new_rec); 758 return ERR_PTR(ret); 759 } 760 761 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) 762 { 763 bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); 764 int size; 765 766 if (!a_has_fields && !b_has_fields) 767 return true; 768 if (a_has_fields != b_has_fields) 769 return false; 770 if (rec_a->cnt != rec_b->cnt) 771 return false; 772 size = struct_size(rec_a, fields, rec_a->cnt); 773 /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused 774 * members are zeroed out. So memcmp is safe to do without worrying 775 * about padding/unused fields. 776 * 777 * While spin_lock, timer, and kptr have no relation to map BTF, 778 * list_head metadata is specific to map BTF, the btf and value_rec 779 * members in particular. btf is the map BTF, while value_rec points to 780 * btf_record in that map BTF. 781 * 782 * So while by default, we don't rely on the map BTF (which the records 783 * were parsed from) matching for both records, which is not backwards 784 * compatible, in case list_head is part of it, we implicitly rely on 785 * that by way of depending on memcmp succeeding for it. 
786 */ 787 return !memcmp(rec_a, rec_b, size); 788 } 789 790 void bpf_obj_free_timer(const struct btf_record *rec, void *obj) 791 { 792 if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) 793 return; 794 bpf_timer_cancel_and_free(obj + rec->timer_off); 795 } 796 797 void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) 798 { 799 if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) 800 return; 801 bpf_wq_cancel_and_free(obj + rec->wq_off); 802 } 803 804 void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) 805 { 806 if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) 807 return; 808 bpf_task_work_cancel_and_free(obj + rec->task_work_off); 809 } 810 811 void bpf_obj_free_fields(const struct btf_record *rec, void *obj) 812 { 813 const struct btf_field *fields; 814 int i; 815 816 if (IS_ERR_OR_NULL(rec)) 817 return; 818 fields = rec->fields; 819 for (i = 0; i < rec->cnt; i++) { 820 struct btf_struct_meta *pointee_struct_meta; 821 const struct btf_field *field = &fields[i]; 822 void *field_ptr = obj + field->offset; 823 void *xchgd_field; 824 825 switch (fields[i].type) { 826 case BPF_SPIN_LOCK: 827 case BPF_RES_SPIN_LOCK: 828 break; 829 case BPF_TIMER: 830 bpf_timer_cancel_and_free(field_ptr); 831 break; 832 case BPF_WORKQUEUE: 833 bpf_wq_cancel_and_free(field_ptr); 834 break; 835 case BPF_TASK_WORK: 836 bpf_task_work_cancel_and_free(field_ptr); 837 break; 838 case BPF_KPTR_UNREF: 839 WRITE_ONCE(*(u64 *)field_ptr, 0); 840 break; 841 case BPF_KPTR_REF: 842 case BPF_KPTR_PERCPU: 843 xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); 844 if (!xchgd_field) 845 break; 846 847 if (!btf_is_kernel(field->kptr.btf)) { 848 pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, 849 field->kptr.btf_id); 850 __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 
851 pointee_struct_meta->record : NULL, 852 fields[i].type == BPF_KPTR_PERCPU); 853 } else { 854 field->kptr.dtor(xchgd_field); 855 } 856 break; 857 case BPF_UPTR: 858 /* The caller ensured that no one is using the uptr */ 859 unpin_uptr_kaddr(*(void **)field_ptr); 860 break; 861 case BPF_LIST_HEAD: 862 if (WARN_ON_ONCE(rec->spin_lock_off < 0)) 863 continue; 864 bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); 865 break; 866 case BPF_RB_ROOT: 867 if (WARN_ON_ONCE(rec->spin_lock_off < 0)) 868 continue; 869 bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); 870 break; 871 case BPF_LIST_NODE: 872 case BPF_RB_NODE: 873 case BPF_REFCOUNT: 874 break; 875 default: 876 WARN_ON_ONCE(1); 877 continue; 878 } 879 } 880 } 881 882 static void bpf_map_free(struct bpf_map *map) 883 { 884 struct btf_record *rec = map->record; 885 struct btf *btf = map->btf; 886 887 /* implementation dependent freeing. Disabling migration to simplify 888 * the free of values or special fields allocated from bpf memory 889 * allocator. 890 */ 891 kfree(map->excl_prog_sha); 892 migrate_disable(); 893 map->ops->map_free(map); 894 migrate_enable(); 895 896 /* Delay freeing of btf_record for maps, as map_free 897 * callback usually needs access to them. It is better to do it here 898 * than require each callback to do the free itself manually. 899 * 900 * Note that the btf_record stashed in map->inner_map_meta->record was 901 * already freed using the map_free callback for map in map case which 902 * eventually calls bpf_map_free_meta, since inner_map_meta is only a 903 * template bpf_map struct used during verification. 904 */ 905 btf_record_free(rec); 906 /* Delay freeing of btf for maps, as map_free callback may need 907 * struct_meta info which will be freed with btf_put(). 
908 */ 909 btf_put(btf); 910 } 911 912 /* called from workqueue */ 913 static void bpf_map_free_deferred(struct work_struct *work) 914 { 915 struct bpf_map *map = container_of(work, struct bpf_map, work); 916 917 security_bpf_map_free(map); 918 bpf_map_release_memcg(map); 919 bpf_map_owner_free(map); 920 bpf_map_free(map); 921 } 922 923 static void bpf_map_put_uref(struct bpf_map *map) 924 { 925 if (atomic64_dec_and_test(&map->usercnt)) { 926 if (map->ops->map_release_uref) 927 map->ops->map_release_uref(map); 928 } 929 } 930 931 static void bpf_map_free_in_work(struct bpf_map *map) 932 { 933 INIT_WORK(&map->work, bpf_map_free_deferred); 934 /* Avoid spawning kworkers, since they all might contend 935 * for the same mutex like slab_mutex. 936 */ 937 queue_work(system_dfl_wq, &map->work); 938 } 939 940 static void bpf_map_free_rcu_gp(struct rcu_head *rcu) 941 { 942 bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); 943 } 944 945 /* decrement map refcnt and schedule it for freeing via workqueue 946 * (underlying map implementation ops->map_free() might sleep) 947 */ 948 void bpf_map_put(struct bpf_map *map) 949 { 950 if (atomic64_dec_and_test(&map->refcnt)) { 951 /* bpf_map_free_id() must be called first */ 952 bpf_map_free_id(map); 953 954 WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); 955 /* RCU tasks trace grace period implies RCU grace period. 
*/ 956 if (READ_ONCE(map->free_after_mult_rcu_gp)) 957 call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_gp); 958 else if (READ_ONCE(map->free_after_rcu_gp)) 959 call_rcu(&map->rcu, bpf_map_free_rcu_gp); 960 else 961 bpf_map_free_in_work(map); 962 } 963 } 964 EXPORT_SYMBOL_GPL(bpf_map_put); 965 966 void bpf_map_put_with_uref(struct bpf_map *map) 967 { 968 bpf_map_put_uref(map); 969 bpf_map_put(map); 970 } 971 972 static int bpf_map_release(struct inode *inode, struct file *filp) 973 { 974 struct bpf_map *map = filp->private_data; 975 976 if (map->ops->map_release) 977 map->ops->map_release(map, filp); 978 979 bpf_map_put_with_uref(map); 980 return 0; 981 } 982 983 static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) 984 { 985 fmode_t mode = fd_file(f)->f_mode; 986 987 /* Our file permissions may have been overridden by global 988 * map permissions facing syscall side. 989 */ 990 if (READ_ONCE(map->frozen)) 991 mode &= ~FMODE_CAN_WRITE; 992 return mode; 993 } 994 995 #ifdef CONFIG_PROC_FS 996 /* Show the memory usage of a bpf map */ 997 static u64 bpf_map_memory_usage(const struct bpf_map *map) 998 { 999 return map->ops->map_mem_usage(map); 1000 } 1001 1002 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) 1003 { 1004 struct bpf_map *map = filp->private_data; 1005 u32 type = 0, jited = 0; 1006 1007 spin_lock(&map->owner_lock); 1008 if (map->owner) { 1009 type = map->owner->type; 1010 jited = map->owner->jited; 1011 } 1012 spin_unlock(&map->owner_lock); 1013 1014 seq_printf(m, 1015 "map_type:\t%u\n" 1016 "key_size:\t%u\n" 1017 "value_size:\t%u\n" 1018 "max_entries:\t%u\n" 1019 "map_flags:\t%#x\n" 1020 "map_extra:\t%#llx\n" 1021 "memlock:\t%llu\n" 1022 "map_id:\t%u\n" 1023 "frozen:\t%u\n", 1024 map->map_type, 1025 map->key_size, 1026 map->value_size, 1027 map->max_entries, 1028 map->map_flags, 1029 (unsigned long long)map->map_extra, 1030 bpf_map_memory_usage(map), 1031 map->id, 1032 READ_ONCE(map->frozen)); 1033 if (type) { 1034 
seq_printf(m, "owner_prog_type:\t%u\n", type); 1035 seq_printf(m, "owner_jited:\t%u\n", jited); 1036 } 1037 } 1038 #endif 1039 1040 static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, 1041 loff_t *ppos) 1042 { 1043 /* We need this handler such that alloc_file() enables 1044 * f_mode with FMODE_CAN_READ. 1045 */ 1046 return -EINVAL; 1047 } 1048 1049 static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, 1050 size_t siz, loff_t *ppos) 1051 { 1052 /* We need this handler such that alloc_file() enables 1053 * f_mode with FMODE_CAN_WRITE. 1054 */ 1055 return -EINVAL; 1056 } 1057 1058 /* called for any extra memory-mapped regions (except initial) */ 1059 static void bpf_map_mmap_open(struct vm_area_struct *vma) 1060 { 1061 struct bpf_map *map = vma->vm_file->private_data; 1062 1063 if (vma->vm_flags & VM_MAYWRITE) 1064 bpf_map_write_active_inc(map); 1065 } 1066 1067 /* called for all unmapped memory region (including initial) */ 1068 static void bpf_map_mmap_close(struct vm_area_struct *vma) 1069 { 1070 struct bpf_map *map = vma->vm_file->private_data; 1071 1072 if (vma->vm_flags & VM_MAYWRITE) 1073 bpf_map_write_active_dec(map); 1074 } 1075 1076 static const struct vm_operations_struct bpf_map_default_vmops = { 1077 .open = bpf_map_mmap_open, 1078 .close = bpf_map_mmap_close, 1079 }; 1080 1081 static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) 1082 { 1083 struct bpf_map *map = filp->private_data; 1084 int err = 0; 1085 1086 if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) 1087 return -ENOTSUPP; 1088 1089 if (!(vma->vm_flags & VM_SHARED)) 1090 return -EINVAL; 1091 1092 mutex_lock(&map->freeze_mutex); 1093 1094 if (vma->vm_flags & VM_WRITE) { 1095 if (map->frozen) { 1096 err = -EPERM; 1097 goto out; 1098 } 1099 /* map is meant to be read-only, so do not allow mapping as 1100 * writable, because it's possible to leak a writable page 1101 * reference and allows user-space to still modify it after 1102 
* freezing, while verifier will assume contents do not change 1103 */ 1104 if (map->map_flags & BPF_F_RDONLY_PROG) { 1105 err = -EACCES; 1106 goto out; 1107 } 1108 bpf_map_write_active_inc(map); 1109 } 1110 out: 1111 mutex_unlock(&map->freeze_mutex); 1112 if (err) 1113 return err; 1114 1115 /* set default open/close callbacks */ 1116 vma->vm_ops = &bpf_map_default_vmops; 1117 vma->vm_private_data = map; 1118 vm_flags_clear(vma, VM_MAYEXEC); 1119 /* If mapping is read-only, then disallow potentially re-mapping with 1120 * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing 1121 * means that as far as BPF map's memory-mapped VMAs are concerned, 1122 * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set, 1123 * both should be set, so we can forget about VM_MAYWRITE and always 1124 * check just VM_WRITE 1125 */ 1126 if (!(vma->vm_flags & VM_WRITE)) 1127 vm_flags_clear(vma, VM_MAYWRITE); 1128 1129 err = map->ops->map_mmap(map, vma); 1130 if (err) { 1131 if (vma->vm_flags & VM_WRITE) 1132 bpf_map_write_active_dec(map); 1133 } 1134 1135 return err; 1136 } 1137 1138 static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) 1139 { 1140 struct bpf_map *map = filp->private_data; 1141 1142 if (map->ops->map_poll) 1143 return map->ops->map_poll(map, filp, pts); 1144 1145 return EPOLLERR; 1146 } 1147 1148 static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr, 1149 unsigned long len, unsigned long pgoff, 1150 unsigned long flags) 1151 { 1152 struct bpf_map *map = filp->private_data; 1153 1154 if (map->ops->map_get_unmapped_area) 1155 return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); 1156 #ifdef CONFIG_MMU 1157 return mm_get_unmapped_area(filp, addr, len, pgoff, flags); 1158 #else 1159 return addr; 1160 #endif 1161 } 1162 1163 const struct file_operations bpf_map_fops = { 1164 #ifdef CONFIG_PROC_FS 1165 .show_fdinfo = bpf_map_show_fdinfo, 1166 #endif 1167 .release = bpf_map_release, 1168 
.read = bpf_dummy_read, 1169 .write = bpf_dummy_write, 1170 .mmap = bpf_map_mmap, 1171 .poll = bpf_map_poll, 1172 .get_unmapped_area = bpf_get_unmapped_area, 1173 }; 1174 1175 int bpf_map_new_fd(struct bpf_map *map, int flags) 1176 { 1177 int ret; 1178 1179 ret = security_bpf_map(map, OPEN_FMODE(flags)); 1180 if (ret < 0) 1181 return ret; 1182 1183 return anon_inode_getfd("bpf-map", &bpf_map_fops, map, 1184 flags | O_CLOEXEC); 1185 } 1186 1187 int bpf_get_file_flag(int flags) 1188 { 1189 if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) 1190 return -EINVAL; 1191 if (flags & BPF_F_RDONLY) 1192 return O_RDONLY; 1193 if (flags & BPF_F_WRONLY) 1194 return O_WRONLY; 1195 return O_RDWR; 1196 } 1197 1198 /* helper macro to check that unused fields 'union bpf_attr' are zero */ 1199 #define CHECK_ATTR(CMD) \ 1200 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ 1201 sizeof(attr->CMD##_LAST_FIELD), 0, \ 1202 sizeof(*attr) - \ 1203 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ 1204 sizeof(attr->CMD##_LAST_FIELD)) != NULL 1205 1206 /* dst and src must have at least "size" number of bytes. 1207 * Return strlen on success and < 0 on error. 1208 */ 1209 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) 1210 { 1211 const char *end = src + size; 1212 const char *orig_src = src; 1213 1214 memset(dst, 0, size); 1215 /* Copy all isalnum(), '_' and '.' chars. 
*/ 1216 while (src < end && *src) { 1217 if (!isalnum(*src) && 1218 *src != '_' && *src != '.') 1219 return -EINVAL; 1220 *dst++ = *src++; 1221 } 1222 1223 /* No '\0' found in "size" number of bytes */ 1224 if (src == end) 1225 return -EINVAL; 1226 1227 return src - orig_src; 1228 } 1229 EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); 1230 1231 int map_check_no_btf(struct bpf_map *map, 1232 const struct btf *btf, 1233 const struct btf_type *key_type, 1234 const struct btf_type *value_type) 1235 { 1236 return -ENOTSUPP; 1237 } 1238 1239 static int map_check_btf(struct bpf_map *map, struct bpf_token *token, 1240 const struct btf *btf, u32 btf_key_id, u32 btf_value_id) 1241 { 1242 const struct btf_type *key_type, *value_type; 1243 u32 key_size, value_size; 1244 int ret = 0; 1245 1246 /* Some maps allow key to be unspecified. */ 1247 if (btf_key_id) { 1248 key_type = btf_type_id_size(btf, &btf_key_id, &key_size); 1249 if (!key_type || key_size != map->key_size) 1250 return -EINVAL; 1251 } else { 1252 key_type = btf_type_by_id(btf, 0); 1253 if (!map->ops->map_check_btf) 1254 return -EINVAL; 1255 } 1256 1257 value_type = btf_type_id_size(btf, &btf_value_id, &value_size); 1258 if (!value_type || value_size != map->value_size) 1259 return -EINVAL; 1260 1261 map->record = btf_parse_fields(btf, value_type, 1262 BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1263 BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | 1264 BPF_TASK_WORK, 1265 map->value_size); 1266 if (!IS_ERR_OR_NULL(map->record)) { 1267 int i; 1268 1269 if (!bpf_token_capable(token, CAP_BPF)) { 1270 ret = -EPERM; 1271 goto free_map_tab; 1272 } 1273 if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { 1274 ret = -EACCES; 1275 goto free_map_tab; 1276 } 1277 for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { 1278 switch (map->record->field_mask & (1 << i)) { 1279 case 0: 1280 continue; 1281 case BPF_SPIN_LOCK: 1282 case BPF_RES_SPIN_LOCK: 1283 if (map->map_type != 
BPF_MAP_TYPE_HASH && 1284 map->map_type != BPF_MAP_TYPE_RHASH && 1285 map->map_type != BPF_MAP_TYPE_ARRAY && 1286 map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && 1287 map->map_type != BPF_MAP_TYPE_SK_STORAGE && 1288 map->map_type != BPF_MAP_TYPE_INODE_STORAGE && 1289 map->map_type != BPF_MAP_TYPE_TASK_STORAGE && 1290 map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { 1291 ret = -EOPNOTSUPP; 1292 goto free_map_tab; 1293 } 1294 break; 1295 case BPF_TIMER: 1296 case BPF_WORKQUEUE: 1297 case BPF_TASK_WORK: 1298 if (map->map_type != BPF_MAP_TYPE_HASH && 1299 map->map_type != BPF_MAP_TYPE_RHASH && 1300 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1301 map->map_type != BPF_MAP_TYPE_ARRAY) { 1302 ret = -EOPNOTSUPP; 1303 goto free_map_tab; 1304 } 1305 break; 1306 case BPF_KPTR_UNREF: 1307 case BPF_KPTR_REF: 1308 case BPF_KPTR_PERCPU: 1309 case BPF_REFCOUNT: 1310 if (map->map_type != BPF_MAP_TYPE_HASH && 1311 map->map_type != BPF_MAP_TYPE_RHASH && 1312 map->map_type != BPF_MAP_TYPE_PERCPU_HASH && 1313 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1314 map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && 1315 map->map_type != BPF_MAP_TYPE_ARRAY && 1316 map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && 1317 map->map_type != BPF_MAP_TYPE_SK_STORAGE && 1318 map->map_type != BPF_MAP_TYPE_INODE_STORAGE && 1319 map->map_type != BPF_MAP_TYPE_TASK_STORAGE && 1320 map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { 1321 ret = -EOPNOTSUPP; 1322 goto free_map_tab; 1323 } 1324 break; 1325 case BPF_UPTR: 1326 if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { 1327 ret = -EOPNOTSUPP; 1328 goto free_map_tab; 1329 } 1330 break; 1331 case BPF_LIST_HEAD: 1332 case BPF_RB_ROOT: 1333 if (map->map_type != BPF_MAP_TYPE_HASH && 1334 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1335 map->map_type != BPF_MAP_TYPE_ARRAY) { 1336 ret = -EOPNOTSUPP; 1337 goto free_map_tab; 1338 } 1339 break; 1340 default: 1341 /* Fail if map_type checks are missing for a field type */ 1342 ret = -EOPNOTSUPP; 1343 goto free_map_tab; 1344 } 1345 } 1346 
} 1347 1348 ret = btf_check_and_fixup_fields(btf, map->record); 1349 if (ret < 0) 1350 goto free_map_tab; 1351 1352 if (map->ops->map_check_btf) { 1353 ret = map->ops->map_check_btf(map, btf, key_type, value_type); 1354 if (ret < 0) 1355 goto free_map_tab; 1356 } 1357 1358 return ret; 1359 free_map_tab: 1360 bpf_map_free_record(map); 1361 return ret; 1362 } 1363 1364 #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size 1365 /* called via syscall */ 1366 static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log, 1367 struct bpf_map **mapp, struct bpf_token **tokenp) 1368 { 1369 const struct bpf_map_ops *ops; 1370 struct bpf_token *token = NULL; 1371 int numa_node = bpf_map_attr_numa_node(attr); 1372 u32 map_type = attr->map_type; 1373 struct bpf_map *map; 1374 bool token_flag; 1375 int err; 1376 1377 err = CHECK_ATTR(BPF_MAP_CREATE); 1378 if (err) { 1379 bpf_log(log, "Invalid attr.\n"); 1380 return -EINVAL; 1381 } 1382 1383 /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it 1384 * to avoid per-map type checks tripping on unknown flag 1385 */ 1386 token_flag = attr->map_flags & BPF_F_TOKEN_FD; 1387 attr->map_flags &= ~BPF_F_TOKEN_FD; 1388 1389 if (attr->btf_vmlinux_value_type_id) { 1390 if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) { 1391 bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n"); 1392 return -EINVAL; 1393 } 1394 if (attr->btf_key_type_id || attr->btf_value_type_id) { 1395 bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n"); 1396 return -EINVAL; 1397 } 1398 } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { 1399 bpf_log(log, "Invalid btf_value_type_id.\n"); 1400 return -EINVAL; 1401 } 1402 1403 if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && 1404 attr->map_type != BPF_MAP_TYPE_ARENA && 1405 attr->map_type != BPF_MAP_TYPE_RHASH && 1406 attr->map_extra != 0) { 1407 bpf_log(log, "Invalid 
map_extra.\n"); 1408 return -EINVAL; 1409 } 1410 1411 if (numa_node != NUMA_NO_NODE && 1412 ((unsigned int)numa_node >= nr_node_ids || 1413 !node_online(numa_node))) { 1414 bpf_log(log, "Invalid numa_node.\n"); 1415 return -EINVAL; 1416 } 1417 1418 /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ 1419 map_type = attr->map_type; 1420 if (map_type >= ARRAY_SIZE(bpf_map_types)) { 1421 bpf_log(log, "Invalid map_type.\n"); 1422 return -EINVAL; 1423 } 1424 map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); 1425 ops = bpf_map_types[map_type]; 1426 if (!ops) 1427 return -EINVAL; 1428 1429 if (ops->map_alloc_check) { 1430 err = ops->map_alloc_check(attr); 1431 if (err) 1432 return err; 1433 } 1434 if (attr->map_ifindex) 1435 ops = &bpf_map_offload_ops; 1436 if (!ops->map_mem_usage) 1437 return -EINVAL; 1438 1439 if (token_flag) { 1440 token = bpf_token_get_from_fd(attr->map_token_fd); 1441 if (IS_ERR(token)) { 1442 bpf_log(log, "Invalid map_token_fd.\n"); 1443 return PTR_ERR(token); 1444 } 1445 1446 /* if current token doesn't grant map creation permissions, 1447 * then we can't use this token, so ignore it and rely on 1448 * system-wide capabilities checks 1449 */ 1450 if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) || 1451 !bpf_token_allow_map_type(token, attr->map_type)) { 1452 bpf_token_put(token); 1453 token = NULL; 1454 } 1455 } 1456 1457 err = -EPERM; 1458 1459 /* Intent here is for unprivileged_bpf_disabled to block BPF map 1460 * creation for unprivileged users; other actions depend 1461 * on fd availability and access to bpffs, so are dependent on 1462 * object creation success. Even with unprivileged BPF disabled, 1463 * capability checks are still carried out. 
1464 */ 1465 if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF)) 1466 goto put_token; 1467 1468 /* check privileged map type permissions */ 1469 switch (map_type) { 1470 case BPF_MAP_TYPE_ARRAY: 1471 case BPF_MAP_TYPE_PERCPU_ARRAY: 1472 case BPF_MAP_TYPE_PROG_ARRAY: 1473 case BPF_MAP_TYPE_PERF_EVENT_ARRAY: 1474 case BPF_MAP_TYPE_CGROUP_ARRAY: 1475 case BPF_MAP_TYPE_ARRAY_OF_MAPS: 1476 case BPF_MAP_TYPE_HASH: 1477 case BPF_MAP_TYPE_RHASH: 1478 case BPF_MAP_TYPE_PERCPU_HASH: 1479 case BPF_MAP_TYPE_HASH_OF_MAPS: 1480 case BPF_MAP_TYPE_RINGBUF: 1481 case BPF_MAP_TYPE_USER_RINGBUF: 1482 case BPF_MAP_TYPE_CGROUP_STORAGE: 1483 case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: 1484 /* unprivileged */ 1485 break; 1486 case BPF_MAP_TYPE_SK_STORAGE: 1487 case BPF_MAP_TYPE_INODE_STORAGE: 1488 case BPF_MAP_TYPE_TASK_STORAGE: 1489 case BPF_MAP_TYPE_CGRP_STORAGE: 1490 case BPF_MAP_TYPE_BLOOM_FILTER: 1491 case BPF_MAP_TYPE_LPM_TRIE: 1492 case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: 1493 case BPF_MAP_TYPE_STACK_TRACE: 1494 case BPF_MAP_TYPE_QUEUE: 1495 case BPF_MAP_TYPE_STACK: 1496 case BPF_MAP_TYPE_LRU_HASH: 1497 case BPF_MAP_TYPE_LRU_PERCPU_HASH: 1498 case BPF_MAP_TYPE_STRUCT_OPS: 1499 case BPF_MAP_TYPE_CPUMAP: 1500 case BPF_MAP_TYPE_ARENA: 1501 case BPF_MAP_TYPE_INSN_ARRAY: 1502 if (!bpf_token_capable(token, CAP_BPF)) 1503 goto put_token; 1504 break; 1505 case BPF_MAP_TYPE_SOCKMAP: 1506 case BPF_MAP_TYPE_SOCKHASH: 1507 case BPF_MAP_TYPE_DEVMAP: 1508 case BPF_MAP_TYPE_DEVMAP_HASH: 1509 case BPF_MAP_TYPE_XSKMAP: 1510 if (!bpf_token_capable(token, CAP_NET_ADMIN)) 1511 goto put_token; 1512 break; 1513 default: 1514 WARN(1, "unsupported map type %d", map_type); 1515 goto put_token; 1516 } 1517 1518 map = ops->map_alloc(attr); 1519 if (IS_ERR(map)) { 1520 err = PTR_ERR(map); 1521 goto put_token; 1522 } 1523 map->ops = ops; 1524 map->map_type = map_type; 1525 1526 err = bpf_obj_name_cpy(map->name, attr->map_name, 1527 sizeof(attr->map_name)); 1528 if (err < 0) { 1529 
bpf_log(log, "Invalid map_name.\n"); 1530 goto free_map; 1531 } 1532 1533 preempt_disable(); 1534 map->cookie = gen_cookie_next(&bpf_map_cookie); 1535 preempt_enable(); 1536 1537 atomic64_set(&map->refcnt, 1); 1538 atomic64_set(&map->usercnt, 1); 1539 mutex_init(&map->freeze_mutex); 1540 spin_lock_init(&map->owner_lock); 1541 1542 if (attr->btf_key_type_id || attr->btf_value_type_id || 1543 /* Even the map's value is a kernel's struct, 1544 * the bpf_prog.o must have BTF to begin with 1545 * to figure out the corresponding kernel's 1546 * counter part. Thus, attr->btf_fd has 1547 * to be valid also. 1548 */ 1549 attr->btf_vmlinux_value_type_id) { 1550 struct btf *btf; 1551 1552 btf = btf_get_by_fd(attr->btf_fd); 1553 if (IS_ERR(btf)) { 1554 bpf_log(log, "Invalid btf_fd.\n"); 1555 err = PTR_ERR(btf); 1556 goto free_map; 1557 } 1558 if (btf_is_kernel(btf)) { 1559 btf_put(btf); 1560 err = -EACCES; 1561 goto free_map; 1562 } 1563 map->btf = btf; 1564 1565 if (attr->btf_value_type_id) { 1566 err = map_check_btf(map, token, btf, attr->btf_key_type_id, 1567 attr->btf_value_type_id); 1568 if (err) 1569 goto free_map; 1570 } 1571 1572 map->btf_key_type_id = attr->btf_key_type_id; 1573 map->btf_value_type_id = attr->btf_value_type_id; 1574 map->btf_vmlinux_value_type_id = 1575 attr->btf_vmlinux_value_type_id; 1576 } 1577 1578 if (attr->excl_prog_hash) { 1579 bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); 1580 1581 if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { 1582 bpf_log(log, "Invalid excl_prog_hash_size.\n"); 1583 err = -EINVAL; 1584 goto free_map; 1585 } 1586 1587 map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); 1588 if (!map->excl_prog_sha) { 1589 err = -ENOMEM; 1590 goto free_map; 1591 } 1592 1593 if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) { 1594 err = -EFAULT; 1595 goto free_map; 1596 } 1597 1598 /* See libbpf: emit_signature_match() */ 1599 BUILD_BUG_ON(offsetof(struct bpf_map, excl) != 
SHA256_DIGEST_SIZE); 1600 BUILD_BUG_ON(!__same_type(map->excl, u32)); 1601 BUILD_BUG_ON(offsetof(struct bpf_map, sha) != 0); 1602 BUILD_BUG_ON(!__same_type(map->sha, u8[SHA256_DIGEST_SIZE])); 1603 map->excl = 1; 1604 } else if (attr->excl_prog_hash_size) { 1605 bpf_log(log, "Invalid excl_prog_hash_size.\n"); 1606 err = -EINVAL; 1607 goto free_map; 1608 } 1609 1610 *mapp = map; 1611 *tokenp = token; 1612 return 0; 1613 1614 free_map: 1615 bpf_map_free(map); 1616 put_token: 1617 bpf_token_put(token); 1618 return err; 1619 } 1620 1621 static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, 1622 bpfptr_t uattr_common, u32 size_common) 1623 { 1624 struct bpf_token *token = NULL; 1625 struct bpf_verifier_log *log; 1626 struct bpf_log_attr attr_log; 1627 struct bpf_map *map = NULL; 1628 int err, ret; 1629 int f_flags; 1630 1631 log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); 1632 if (IS_ERR(log)) 1633 return PTR_ERR(log); 1634 1635 err = map_create_alloc(attr, uattr, log, &map, &token); 1636 1637 /* preserve original error even if log finalization is successful */ 1638 ret = bpf_log_attr_finalize(&attr_log, log); 1639 if (ret) 1640 err = ret; 1641 1642 kfree(log); 1643 1644 if (err) 1645 goto free_map; 1646 1647 f_flags = bpf_get_file_flag(attr->map_flags); 1648 if (f_flags < 0) { 1649 err = f_flags; 1650 goto free_map; 1651 } 1652 1653 err = security_bpf_map_create(map, attr, token, uattr.is_kernel); 1654 if (err) 1655 goto free_map_sec; 1656 1657 err = bpf_map_alloc_id(map); 1658 if (err) 1659 goto free_map_sec; 1660 1661 bpf_map_save_memcg(map); 1662 bpf_token_put(token); 1663 1664 err = bpf_map_new_fd(map, f_flags); 1665 if (err < 0) { 1666 /* failed to allocate fd. 1667 * bpf_map_put_with_uref() is needed because the above 1668 * bpf_map_alloc_id() has published the map 1669 * to the userspace and the userspace may 1670 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. 
1671 */ 1672 bpf_map_put_with_uref(map); 1673 return err; 1674 } 1675 1676 return err; 1677 1678 free_map_sec: 1679 security_bpf_map_free(map); 1680 free_map: 1681 if (map) 1682 bpf_map_free(map); 1683 bpf_token_put(token); 1684 return err; 1685 } 1686 1687 void bpf_map_inc(struct bpf_map *map) 1688 { 1689 atomic64_inc(&map->refcnt); 1690 } 1691 EXPORT_SYMBOL_GPL(bpf_map_inc); 1692 1693 void bpf_map_inc_with_uref(struct bpf_map *map) 1694 { 1695 atomic64_inc(&map->refcnt); 1696 atomic64_inc(&map->usercnt); 1697 } 1698 EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); 1699 1700 struct bpf_map *bpf_map_get(u32 ufd) 1701 { 1702 CLASS(fd, f)(ufd); 1703 struct bpf_map *map = __bpf_map_get(f); 1704 1705 if (!IS_ERR(map)) 1706 bpf_map_inc(map); 1707 1708 return map; 1709 } 1710 EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL"); 1711 1712 struct bpf_map *bpf_map_get_with_uref(u32 ufd) 1713 { 1714 CLASS(fd, f)(ufd); 1715 struct bpf_map *map = __bpf_map_get(f); 1716 1717 if (!IS_ERR(map)) 1718 bpf_map_inc_with_uref(map); 1719 1720 return map; 1721 } 1722 1723 /* map_idr_lock should have been held or the map should have been 1724 * protected by rcu read lock. 
1725 */ 1726 struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) 1727 { 1728 int refold; 1729 1730 refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); 1731 if (!refold) 1732 return ERR_PTR(-ENOENT); 1733 if (uref) 1734 atomic64_inc(&map->usercnt); 1735 1736 return map; 1737 } 1738 1739 struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) 1740 { 1741 lockdep_assert(rcu_read_lock_held()); 1742 return __bpf_map_inc_not_zero(map, false); 1743 } 1744 EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); 1745 1746 int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, 1747 bool delete) 1748 { 1749 return -ENOTSUPP; 1750 } 1751 1752 static void *__bpf_copy_key(void __user *ukey, u64 key_size) 1753 { 1754 if (key_size) 1755 return vmemdup_user(ukey, key_size); 1756 1757 if (ukey) 1758 return ERR_PTR(-EINVAL); 1759 1760 return NULL; 1761 } 1762 1763 static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) 1764 { 1765 if (key_size) 1766 return kvmemdup_bpfptr(ukey, key_size); 1767 1768 if (!bpfptr_is_null(ukey)) 1769 return ERR_PTR(-EINVAL); 1770 1771 return NULL; 1772 } 1773 1774 /* last field in 'union bpf_attr' used by this command */ 1775 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags 1776 1777 static int map_lookup_elem(union bpf_attr *attr) 1778 { 1779 void __user *ukey = u64_to_user_ptr(attr->key); 1780 void __user *uvalue = u64_to_user_ptr(attr->value); 1781 struct bpf_map *map; 1782 void *key, *value; 1783 u32 value_size; 1784 int err; 1785 1786 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) 1787 return -EINVAL; 1788 1789 CLASS(fd, f)(attr->map_fd); 1790 map = __bpf_map_get(f); 1791 if (IS_ERR(map)) 1792 return PTR_ERR(map); 1793 if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) 1794 return -EPERM; 1795 1796 err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); 1797 if (err) 1798 return err; 1799 1800 key = __bpf_copy_key(ukey, map->key_size); 1801 if (IS_ERR(key)) 1802 return PTR_ERR(key); 1803 1804 value_size = 
bpf_map_value_size(map, attr->flags); 1805 1806 err = -ENOMEM; 1807 value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); 1808 if (!value) 1809 goto free_key; 1810 1811 if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { 1812 if (copy_from_user(value, uvalue, value_size)) 1813 err = -EFAULT; 1814 else 1815 err = bpf_map_copy_value(map, key, value, attr->flags); 1816 goto free_value; 1817 } 1818 1819 err = bpf_map_copy_value(map, key, value, attr->flags); 1820 if (err) 1821 goto free_value; 1822 1823 err = -EFAULT; 1824 if (copy_to_user(uvalue, value, value_size) != 0) 1825 goto free_value; 1826 1827 err = 0; 1828 1829 free_value: 1830 kvfree(value); 1831 free_key: 1832 kvfree(key); 1833 return err; 1834 } 1835 1836 1837 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags 1838 1839 static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) 1840 { 1841 bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); 1842 bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); 1843 struct bpf_map *map; 1844 void *key, *value; 1845 u32 value_size; 1846 int err; 1847 1848 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) 1849 return -EINVAL; 1850 1851 CLASS(fd, f)(attr->map_fd); 1852 map = __bpf_map_get(f); 1853 if (IS_ERR(map)) 1854 return PTR_ERR(map); 1855 bpf_map_write_active_inc(map); 1856 if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 1857 err = -EPERM; 1858 goto err_put; 1859 } 1860 1861 err = bpf_map_check_op_flags(map, attr->flags, ~0); 1862 if (err) 1863 goto err_put; 1864 1865 key = ___bpf_copy_key(ukey, map->key_size); 1866 if (IS_ERR(key)) { 1867 err = PTR_ERR(key); 1868 goto err_put; 1869 } 1870 1871 value_size = bpf_map_value_size(map, attr->flags); 1872 value = kvmemdup_bpfptr(uvalue, value_size); 1873 if (IS_ERR(value)) { 1874 err = PTR_ERR(value); 1875 goto free_key; 1876 } 1877 1878 err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags); 1879 if (!err) 1880 maybe_wait_bpf_programs(map); 1881 1882 kvfree(value); 1883 free_key: 1884 kvfree(key); 1885 
err_put: 1886 bpf_map_write_active_dec(map); 1887 return err; 1888 } 1889 1890 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key 1891 1892 static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) 1893 { 1894 bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); 1895 struct bpf_map *map; 1896 void *key; 1897 int err; 1898 1899 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) 1900 return -EINVAL; 1901 1902 CLASS(fd, f)(attr->map_fd); 1903 map = __bpf_map_get(f); 1904 if (IS_ERR(map)) 1905 return PTR_ERR(map); 1906 bpf_map_write_active_inc(map); 1907 if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 1908 err = -EPERM; 1909 goto err_put; 1910 } 1911 1912 key = ___bpf_copy_key(ukey, map->key_size); 1913 if (IS_ERR(key)) { 1914 err = PTR_ERR(key); 1915 goto err_put; 1916 } 1917 1918 if (bpf_map_is_offloaded(map)) { 1919 err = bpf_map_offload_delete_elem(map, key); 1920 goto out; 1921 } else if (IS_FD_PROG_ARRAY(map) || 1922 map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 1923 /* These maps require sleepable context */ 1924 err = map->ops->map_delete_elem(map, key); 1925 goto out; 1926 } 1927 1928 bpf_disable_instrumentation(); 1929 rcu_read_lock(); 1930 err = map->ops->map_delete_elem(map, key); 1931 rcu_read_unlock(); 1932 bpf_enable_instrumentation(); 1933 if (!err) 1934 maybe_wait_bpf_programs(map); 1935 out: 1936 kvfree(key); 1937 err_put: 1938 bpf_map_write_active_dec(map); 1939 return err; 1940 } 1941 1942 /* last field in 'union bpf_attr' used by this command */ 1943 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key 1944 1945 static int map_get_next_key(union bpf_attr *attr) 1946 { 1947 void __user *ukey = u64_to_user_ptr(attr->key); 1948 void __user *unext_key = u64_to_user_ptr(attr->next_key); 1949 struct bpf_map *map; 1950 void *key, *next_key; 1951 int err; 1952 1953 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) 1954 return -EINVAL; 1955 1956 CLASS(fd, f)(attr->map_fd); 1957 map = __bpf_map_get(f); 1958 if (IS_ERR(map)) 1959 return PTR_ERR(map); 1960 if 
(!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) 1961 return -EPERM; 1962 1963 if (ukey) { 1964 key = __bpf_copy_key(ukey, map->key_size); 1965 if (IS_ERR(key)) 1966 return PTR_ERR(key); 1967 } else { 1968 key = NULL; 1969 } 1970 1971 err = -ENOMEM; 1972 next_key = kvmalloc(map->key_size, GFP_USER); 1973 if (!next_key) 1974 goto free_key; 1975 1976 if (bpf_map_is_offloaded(map)) { 1977 err = bpf_map_offload_get_next_key(map, key, next_key); 1978 goto out; 1979 } 1980 1981 rcu_read_lock(); 1982 err = map->ops->map_get_next_key(map, key, next_key); 1983 rcu_read_unlock(); 1984 out: 1985 if (err) 1986 goto free_next_key; 1987 1988 err = -EFAULT; 1989 if (copy_to_user(unext_key, next_key, map->key_size) != 0) 1990 goto free_next_key; 1991 1992 err = 0; 1993 1994 free_next_key: 1995 kvfree(next_key); 1996 free_key: 1997 kvfree(key); 1998 return err; 1999 } 2000 2001 int generic_map_delete_batch(struct bpf_map *map, 2002 const union bpf_attr *attr, 2003 union bpf_attr __user *uattr) 2004 { 2005 void __user *keys = u64_to_user_ptr(attr->batch.keys); 2006 u32 cp, max_count; 2007 int err = 0; 2008 void *key; 2009 2010 if (attr->batch.elem_flags & ~BPF_F_LOCK) 2011 return -EINVAL; 2012 2013 if ((attr->batch.elem_flags & BPF_F_LOCK) && 2014 !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { 2015 return -EINVAL; 2016 } 2017 2018 max_count = attr->batch.count; 2019 if (!max_count) 2020 return 0; 2021 2022 if (put_user(0, &uattr->batch.count)) 2023 return -EFAULT; 2024 2025 key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); 2026 if (!key) 2027 return -ENOMEM; 2028 2029 for (cp = 0; cp < max_count; cp++) { 2030 err = -EFAULT; 2031 if (copy_from_user(key, keys + cp * map->key_size, 2032 map->key_size)) 2033 break; 2034 2035 if (bpf_map_is_offloaded(map)) { 2036 err = bpf_map_offload_delete_elem(map, key); 2037 break; 2038 } 2039 2040 bpf_disable_instrumentation(); 2041 rcu_read_lock(); 2042 err = map->ops->map_delete_elem(map, key); 2043 rcu_read_unlock(); 2044 
bpf_enable_instrumentation(); 2045 if (err) 2046 break; 2047 cond_resched(); 2048 } 2049 if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) 2050 err = -EFAULT; 2051 2052 kvfree(key); 2053 2054 return err; 2055 } 2056 2057 int generic_map_update_batch(struct bpf_map *map, struct file *map_file, 2058 const union bpf_attr *attr, 2059 union bpf_attr __user *uattr) 2060 { 2061 void __user *values = u64_to_user_ptr(attr->batch.values); 2062 void __user *keys = u64_to_user_ptr(attr->batch.keys); 2063 u32 value_size, cp, max_count; 2064 void *key, *value; 2065 int err = 0; 2066 2067 err = bpf_map_check_op_flags(map, attr->batch.elem_flags, 2068 BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); 2069 if (err) 2070 return err; 2071 2072 value_size = bpf_map_value_size(map, attr->batch.elem_flags); 2073 2074 max_count = attr->batch.count; 2075 if (!max_count) 2076 return 0; 2077 2078 if (put_user(0, &uattr->batch.count)) 2079 return -EFAULT; 2080 2081 key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); 2082 if (!key) 2083 return -ENOMEM; 2084 2085 value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); 2086 if (!value) { 2087 kvfree(key); 2088 return -ENOMEM; 2089 } 2090 2091 for (cp = 0; cp < max_count; cp++) { 2092 err = -EFAULT; 2093 if (copy_from_user(key, keys + cp * map->key_size, 2094 map->key_size) || 2095 copy_from_user(value, values + cp * value_size, value_size)) 2096 break; 2097 2098 err = bpf_map_update_value(map, map_file, key, value, 2099 attr->batch.elem_flags); 2100 2101 if (err) 2102 break; 2103 cond_resched(); 2104 } 2105 2106 if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) 2107 err = -EFAULT; 2108 2109 kvfree(value); 2110 kvfree(key); 2111 2112 return err; 2113 } 2114 2115 int generic_map_lookup_batch(struct bpf_map *map, 2116 const union bpf_attr *attr, 2117 union bpf_attr __user *uattr) 2118 { 2119 void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); 2120 void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); 2121 void __user 
*values = u64_to_user_ptr(attr->batch.values); 2122 void __user *keys = u64_to_user_ptr(attr->batch.keys); 2123 void *buf, *buf_prevkey, *prev_key, *key, *value; 2124 u32 value_size, cp, max_count; 2125 int err; 2126 2127 err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); 2128 if (err) 2129 return err; 2130 2131 value_size = bpf_map_value_size(map, attr->batch.elem_flags); 2132 2133 max_count = attr->batch.count; 2134 if (!max_count) 2135 return 0; 2136 2137 if (put_user(0, &uattr->batch.count)) 2138 return -EFAULT; 2139 2140 buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); 2141 if (!buf_prevkey) 2142 return -ENOMEM; 2143 2144 buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); 2145 if (!buf) { 2146 kvfree(buf_prevkey); 2147 return -ENOMEM; 2148 } 2149 2150 err = -EFAULT; 2151 prev_key = NULL; 2152 if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) 2153 goto free_buf; 2154 key = buf; 2155 value = key + map->key_size; 2156 if (ubatch) 2157 prev_key = buf_prevkey; 2158 2159 for (cp = 0; cp < max_count;) { 2160 rcu_read_lock(); 2161 err = map->ops->map_get_next_key(map, prev_key, key); 2162 rcu_read_unlock(); 2163 if (err) 2164 break; 2165 err = bpf_map_copy_value(map, key, value, 2166 attr->batch.elem_flags); 2167 2168 if (err == -ENOENT) 2169 goto next_key; 2170 2171 if (err) 2172 goto free_buf; 2173 2174 if (copy_to_user(keys + cp * map->key_size, key, 2175 map->key_size)) { 2176 err = -EFAULT; 2177 goto free_buf; 2178 } 2179 if (copy_to_user(values + cp * value_size, value, value_size)) { 2180 err = -EFAULT; 2181 goto free_buf; 2182 } 2183 2184 cp++; 2185 next_key: 2186 if (!prev_key) 2187 prev_key = buf_prevkey; 2188 2189 swap(prev_key, key); 2190 cond_resched(); 2191 } 2192 2193 if (err == -EFAULT) 2194 goto free_buf; 2195 2196 if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || 2197 (cp && copy_to_user(uobatch, prev_key, map->key_size)))) 2198 err = -EFAULT; 2199 2200 
free_buf:
	kvfree(buf_prevkey);
	kvfree(buf);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags

/*
 * BPF_MAP_LOOKUP_AND_DELETE_ELEM command handler.
 *
 * Copies the key from user space, removes the matching element from the map
 * and copies its value back to attr->value:
 *  - queue/stack maps are popped via ->map_pop_elem() (flags must be zero);
 *  - hash-family and stack-trace maps use ->map_lookup_and_delete_elem().
 *
 * Returns 0 on success or a negative errno. -ENOTSUPP is returned for any
 * other map type and for offloaded maps, since err is left at its preset
 * value when no handler runs.
 */
static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *uvalue = u64_to_user_ptr(attr->value);
	struct bpf_map *map;
	void *key, *value;
	u32 value_size;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
		return -EINVAL;

	/* BPF_F_LOCK is the only flag this command understands */
	if (attr->flags & ~BPF_F_LOCK)
		return -EINVAL;

	CLASS(fd, f)(attr->map_fd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);
	/* Every exit below goes through err_put, which undoes this increment.
	 * NOTE(review): presumably this is the count map_freeze() inspects via
	 * bpf_map_write_active() - confirm.
	 */
	bpf_map_write_active_inc(map);
	/* the operation both reads and modifies the map: need both permissions */
	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
		err = -EPERM;
		goto err_put;
	}

	/* no flags at all are accepted for queue/stack maps */
	if (attr->flags &&
	    (map->map_type == BPF_MAP_TYPE_QUEUE ||
	     map->map_type == BPF_MAP_TYPE_STACK)) {
		err = -EINVAL;
		goto err_put;
	}

	/* BPF_F_LOCK only makes sense if the map value has a bpf_spin_lock */
	if ((attr->flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
		err = -EINVAL;
		goto err_put;
	}

	key = __bpf_copy_key(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	value_size = bpf_map_value_size(map, 0);

	err = -ENOMEM;
	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		goto free_key;

	/* stays -ENOTSUPP unless one of the branches below runs a handler */
	err = -ENOTSUPP;
	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
	    map->map_type == BPF_MAP_TYPE_STACK) {
		err = map->ops->map_pop_elem(map, value);
	} else if (map->map_type == BPF_MAP_TYPE_HASH ||
		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
		   map->map_type == BPF_MAP_TYPE_RHASH ||
		   map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
		if (!bpf_map_is_offloaded(map)) {
			bpf_disable_instrumentation();
			rcu_read_lock();
			err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
			rcu_read_unlock();
			bpf_enable_instrumentation();
		}
	}

	if (err)
		goto free_value;

	if (copy_to_user(uvalue, value, value_size) != 0) {
		err = -EFAULT;
		goto free_value;
	}

	err = 0;

free_value:
	kvfree(value);
free_key:
	kvfree(key);
err_put:
	bpf_map_write_active_dec(map);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_FREEZE_LAST_FIELD map_fd

/*
 * BPF_MAP_FREEZE command handler: set map->frozen under map->freeze_mutex.
 *
 * Returns 0 on success, -ENOTSUPP for struct_ops maps and maps that carry a
 * BTF record, -EPERM if the fd lacks write permission, and -EBUSY if the map
 * has active writers or is already frozen.
 */
static int map_freeze(const union bpf_attr *attr)
{
	int err = 0;
	struct bpf_map *map;

	if (CHECK_ATTR(BPF_MAP_FREEZE))
		return -EINVAL;

	CLASS(fd, f)(attr->map_fd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record))
		return -ENOTSUPP;

	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
		return -EPERM;

	mutex_lock(&map->freeze_mutex);
	if (bpf_map_write_active(map)) {
		err = -EBUSY;
		goto err_put;
	}
	/* freezing is one-way; a second attempt is reported as busy */
	if (READ_ONCE(map->frozen)) {
		err = -EBUSY;
		goto err_put;
	}

	WRITE_ONCE(map->frozen, true);
err_put:
	mutex_unlock(&map->freeze_mutex);
	return err;
}

/* Program-type -> bpf_prog_ops table, generated from <linux/bpf_types.h>.
 * Only BPF_PROG_TYPE() entries expand to initializers here.
 */
static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
	[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

/*
 * Look up the ops for @type and record both ops and type in @prog.
 * Offloaded programs always get bpf_offload_prog_ops instead of the
 * per-type ops.
 *
 * Returns 0 on success, -EINVAL if @type is out of range or has no ops.
 */
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	const struct bpf_prog_ops *ops;

	if (type >= ARRAY_SIZE(bpf_prog_types))
		return -EINVAL;
	/* clamp the index under speculation before using it on the table */
	type = array_index_nospec(type,
				  ARRAY_SIZE(bpf_prog_types));
	ops = bpf_prog_types[type];
	if (!ops)
		return -EINVAL;

	if (!bpf_prog_is_offloaded(prog->aux))
		prog->aux->ops = ops;
	else
		prog->aux->ops = &bpf_offload_prog_ops;
	prog->type = type;
	return 0;
}

/* Operations reported in AUDIT_BPF records; indexes bpf_audit_str[] */
enum bpf_audit {
	BPF_AUDIT_LOAD,
	BPF_AUDIT_UNLOAD,
	BPF_AUDIT_MAX,
};

static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
	[BPF_AUDIT_LOAD]   = "LOAD",
	[BPF_AUDIT_UNLOAD] = "UNLOAD",
};

/*
 * Emit an AUDIT_BPF record "prog-id=<id> op=<LOAD|UNLOAD>" for @prog.
 * Does nothing when auditing is off or no audit buffer can be obtained.
 */
static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
{
	struct audit_context *ctx = NULL;
	struct audit_buffer *ab;

	if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
		return;
	if (audit_enabled == AUDIT_OFF)
		return;
	/* only pick up the task's audit context outside hardirq/irqs-off */
	if (!in_hardirq() && !irqs_disabled())
		ctx = audit_context();
	ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
	if (unlikely(!ab))
		return;
	audit_log_format(ab, "prog-id=%u op=%s",
			 prog->aux->id, bpf_audit_str[op]);
	audit_log_end(ab);
}

/*
 * Allocate a cyclic ID for @prog in prog_idr and store it in prog->aux->id.
 *
 * Returns 0 on success or the negative errno from idr_alloc_cyclic().
 */
static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
	int id;

	/* preload with GFP_KERNEL so the allocation under the spinlock can be atomic */
	idr_preload(GFP_KERNEL);
	spin_lock_bh(&prog_idr_lock);
	id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
	if (id > 0)
		prog->aux->id = id;
	spin_unlock_bh(&prog_idr_lock);
	idr_preload_end();

	/* id is in [1, INT_MAX) */
	if (WARN_ON_ONCE(!id))
		return -ENOSPC;

	return id > 0 ? 0 : id;
}

/* Remove @prog from prog_idr and clear prog->aux->id; no-op if it has no ID. */
void bpf_prog_free_id(struct bpf_prog *prog)
{
	unsigned long flags;

	/* cBPF to eBPF migrations are currently not in the idr store.
	 * Offloaded programs are removed from the store when their device
	 * disappears - even if someone grabs an fd to them they are unusable,
	 * simply waiting for refcnt to drop to be freed.
2422 */ 2423 if (!prog->aux->id) 2424 return; 2425 2426 spin_lock_irqsave(&prog_idr_lock, flags); 2427 idr_remove(&prog_idr, prog->aux->id); 2428 prog->aux->id = 0; 2429 spin_unlock_irqrestore(&prog_idr_lock, flags); 2430 } 2431 2432 static void __bpf_prog_put_rcu(struct rcu_head *rcu) 2433 { 2434 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 2435 2436 kvfree(aux->func_info); 2437 kfree(aux->func_info_aux); 2438 free_uid(aux->user); 2439 security_bpf_prog_free(aux->prog); 2440 bpf_prog_free(aux->prog); 2441 } 2442 2443 static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) 2444 { 2445 bpf_prog_kallsyms_del_all(prog); 2446 btf_put(prog->aux->btf); 2447 module_put(prog->aux->mod); 2448 kvfree(prog->aux->jited_linfo); 2449 kvfree(prog->aux->linfo); 2450 kfree(prog->aux->kfunc_tab); 2451 kfree(prog->aux->ctx_arg_info); 2452 if (prog->aux->attach_btf) 2453 btf_put(prog->aux->attach_btf); 2454 2455 if (deferred) { 2456 if (prog->sleepable) 2457 call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); 2458 else 2459 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); 2460 } else { 2461 __bpf_prog_put_rcu(&prog->aux->rcu); 2462 } 2463 } 2464 2465 static void bpf_prog_put_deferred(struct work_struct *work) 2466 { 2467 struct bpf_prog_aux *aux; 2468 struct bpf_prog *prog; 2469 2470 aux = container_of(work, struct bpf_prog_aux, work); 2471 prog = aux->prog; 2472 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); 2473 bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); 2474 bpf_prog_free_id(prog); 2475 __bpf_prog_put_noref(prog, true); 2476 } 2477 2478 static void __bpf_prog_put(struct bpf_prog *prog) 2479 { 2480 struct bpf_prog_aux *aux = prog->aux; 2481 2482 if (atomic64_dec_and_test(&aux->refcnt)) { 2483 if (in_hardirq() || irqs_disabled()) { 2484 INIT_WORK(&aux->work, bpf_prog_put_deferred); 2485 schedule_work(&aux->work); 2486 } else { 2487 bpf_prog_put_deferred(&aux->work); 2488 } 2489 } 2490 } 2491 2492 void bpf_prog_put(struct 
bpf_prog *prog) 2493 { 2494 __bpf_prog_put(prog); 2495 } 2496 EXPORT_SYMBOL_GPL(bpf_prog_put); 2497 2498 static int bpf_prog_release(struct inode *inode, struct file *filp) 2499 { 2500 struct bpf_prog *prog = filp->private_data; 2501 2502 bpf_prog_put(prog); 2503 return 0; 2504 } 2505 2506 struct bpf_prog_kstats { 2507 u64 nsecs; 2508 u64 cnt; 2509 u64 misses; 2510 }; 2511 2512 void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) 2513 { 2514 struct bpf_prog_stats *stats; 2515 unsigned int flags; 2516 2517 if (unlikely(!prog->stats)) 2518 return; 2519 2520 stats = this_cpu_ptr(prog->stats); 2521 flags = u64_stats_update_begin_irqsave(&stats->syncp); 2522 u64_stats_inc(&stats->misses); 2523 u64_stats_update_end_irqrestore(&stats->syncp, flags); 2524 } 2525 2526 static void bpf_prog_get_stats(const struct bpf_prog *prog, 2527 struct bpf_prog_kstats *stats) 2528 { 2529 u64 nsecs = 0, cnt = 0, misses = 0; 2530 int cpu; 2531 2532 for_each_possible_cpu(cpu) { 2533 const struct bpf_prog_stats *st; 2534 unsigned int start; 2535 u64 tnsecs, tcnt, tmisses; 2536 2537 st = per_cpu_ptr(prog->stats, cpu); 2538 do { 2539 start = u64_stats_fetch_begin(&st->syncp); 2540 tnsecs = u64_stats_read(&st->nsecs); 2541 tcnt = u64_stats_read(&st->cnt); 2542 tmisses = u64_stats_read(&st->misses); 2543 } while (u64_stats_fetch_retry(&st->syncp, start)); 2544 nsecs += tnsecs; 2545 cnt += tcnt; 2546 misses += tmisses; 2547 } 2548 stats->nsecs = nsecs; 2549 stats->cnt = cnt; 2550 stats->misses = misses; 2551 } 2552 2553 #ifdef CONFIG_PROC_FS 2554 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) 2555 { 2556 const struct bpf_prog *prog = filp->private_data; 2557 char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; 2558 struct bpf_prog_kstats stats; 2559 2560 bpf_prog_get_stats(prog, &stats); 2561 bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); 2562 seq_printf(m, 2563 "prog_type:\t%u\n" 2564 "prog_jited:\t%u\n" 2565 "prog_tag:\t%s\n" 2566 "memlock:\t%llu\n" 2567 
"prog_id:\t%u\n" 2568 "run_time_ns:\t%llu\n" 2569 "run_cnt:\t%llu\n" 2570 "recursion_misses:\t%llu\n" 2571 "verified_insns:\t%u\n", 2572 prog->type, 2573 prog->jited, 2574 prog_tag, 2575 prog->pages * 1ULL << PAGE_SHIFT, 2576 prog->aux->id, 2577 stats.nsecs, 2578 stats.cnt, 2579 stats.misses, 2580 prog->aux->verified_insns); 2581 } 2582 #endif 2583 2584 const struct file_operations bpf_prog_fops = { 2585 #ifdef CONFIG_PROC_FS 2586 .show_fdinfo = bpf_prog_show_fdinfo, 2587 #endif 2588 .release = bpf_prog_release, 2589 .read = bpf_dummy_read, 2590 .write = bpf_dummy_write, 2591 }; 2592 2593 int bpf_prog_new_fd(struct bpf_prog *prog) 2594 { 2595 int ret; 2596 2597 ret = security_bpf_prog(prog); 2598 if (ret < 0) 2599 return ret; 2600 2601 return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, 2602 O_RDWR | O_CLOEXEC); 2603 } 2604 2605 void bpf_prog_add(struct bpf_prog *prog, int i) 2606 { 2607 atomic64_add(i, &prog->aux->refcnt); 2608 } 2609 EXPORT_SYMBOL_GPL(bpf_prog_add); 2610 2611 void bpf_prog_sub(struct bpf_prog *prog, int i) 2612 { 2613 /* Only to be used for undoing previous bpf_prog_add() in some 2614 * error path. We still know that another entity in our call 2615 * path holds a reference to the program, thus atomic_sub() can 2616 * be safely used in such cases! 
2617 */ 2618 WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); 2619 } 2620 EXPORT_SYMBOL_GPL(bpf_prog_sub); 2621 2622 void bpf_prog_inc(struct bpf_prog *prog) 2623 { 2624 atomic64_inc(&prog->aux->refcnt); 2625 } 2626 EXPORT_SYMBOL_GPL(bpf_prog_inc); 2627 2628 /* prog_idr_lock should have been held */ 2629 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) 2630 { 2631 int refold; 2632 2633 refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); 2634 2635 if (!refold) 2636 return ERR_PTR(-ENOENT); 2637 2638 return prog; 2639 } 2640 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); 2641 2642 bool bpf_prog_get_ok(struct bpf_prog *prog, 2643 enum bpf_prog_type *attach_type, bool attach_drv) 2644 { 2645 /* not an attachment, just a refcount inc, always allow */ 2646 if (!attach_type) 2647 return true; 2648 2649 if (prog->type != *attach_type) 2650 return false; 2651 if (bpf_prog_is_offloaded(prog->aux) && !attach_drv) 2652 return false; 2653 2654 return true; 2655 } 2656 2657 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, 2658 bool attach_drv) 2659 { 2660 CLASS(fd, f)(ufd); 2661 struct bpf_prog *prog; 2662 2663 if (fd_empty(f)) 2664 return ERR_PTR(-EBADF); 2665 if (fd_file(f)->f_op != &bpf_prog_fops) 2666 return ERR_PTR(-EINVAL); 2667 2668 prog = fd_file(f)->private_data; 2669 if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) 2670 return ERR_PTR(-EINVAL); 2671 2672 bpf_prog_inc(prog); 2673 return prog; 2674 } 2675 2676 struct bpf_prog *bpf_prog_get(u32 ufd) 2677 { 2678 return __bpf_prog_get(ufd, NULL, false); 2679 } 2680 2681 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, 2682 bool attach_drv) 2683 { 2684 return __bpf_prog_get(ufd, &type, attach_drv); 2685 } 2686 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); 2687 2688 /* Initially all BPF programs could be loaded w/o specifying 2689 * expected_attach_type. 
Later for some of them specifying expected_attach_type 2690 * at load time became required so that program could be validated properly. 2691 * Programs of types that are allowed to be loaded both w/ and w/o (for 2692 * backward compatibility) expected_attach_type, should have the default attach 2693 * type assigned to expected_attach_type for the latter case, so that it can be 2694 * validated later at attach time. 2695 * 2696 * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if 2697 * prog type requires it but has some attach types that have to be backward 2698 * compatible. 2699 */ 2700 static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) 2701 { 2702 switch (attr->prog_type) { 2703 case BPF_PROG_TYPE_CGROUP_SOCK: 2704 /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't 2705 * exist so checking for non-zero is the way to go here. 2706 */ 2707 if (!attr->expected_attach_type) 2708 attr->expected_attach_type = 2709 BPF_CGROUP_INET_SOCK_CREATE; 2710 break; 2711 case BPF_PROG_TYPE_SK_REUSEPORT: 2712 if (!attr->expected_attach_type) 2713 attr->expected_attach_type = 2714 BPF_SK_REUSEPORT_SELECT; 2715 break; 2716 } 2717 } 2718 2719 static int 2720 bpf_prog_load_check_attach(enum bpf_prog_type prog_type, 2721 enum bpf_attach_type expected_attach_type, 2722 struct btf *attach_btf, u32 btf_id, 2723 struct bpf_prog *dst_prog, 2724 bool multi_func) 2725 { 2726 if (btf_id) { 2727 if (btf_id > BTF_MAX_TYPE) 2728 return -EINVAL; 2729 2730 if (!attach_btf && !dst_prog) 2731 return -EINVAL; 2732 2733 switch (prog_type) { 2734 case BPF_PROG_TYPE_TRACING: 2735 case BPF_PROG_TYPE_LSM: 2736 case BPF_PROG_TYPE_STRUCT_OPS: 2737 case BPF_PROG_TYPE_EXT: 2738 break; 2739 default: 2740 return -EINVAL; 2741 } 2742 } 2743 2744 if (multi_func) { 2745 if (prog_type != BPF_PROG_TYPE_TRACING) 2746 return -EINVAL; 2747 if (!attach_btf || btf_id) 2748 return -EINVAL; 2749 return 0; 2750 } 2751 2752 if (attach_btf && (!btf_id || dst_prog)) 2753 return 
-EINVAL; 2754 2755 if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && 2756 prog_type != BPF_PROG_TYPE_EXT) 2757 return -EINVAL; 2758 2759 switch (prog_type) { 2760 case BPF_PROG_TYPE_CGROUP_SOCK: 2761 switch (expected_attach_type) { 2762 case BPF_CGROUP_INET_SOCK_CREATE: 2763 case BPF_CGROUP_INET_SOCK_RELEASE: 2764 case BPF_CGROUP_INET4_POST_BIND: 2765 case BPF_CGROUP_INET6_POST_BIND: 2766 return 0; 2767 default: 2768 return -EINVAL; 2769 } 2770 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 2771 switch (expected_attach_type) { 2772 case BPF_CGROUP_INET4_BIND: 2773 case BPF_CGROUP_INET6_BIND: 2774 case BPF_CGROUP_INET4_CONNECT: 2775 case BPF_CGROUP_INET6_CONNECT: 2776 case BPF_CGROUP_UNIX_CONNECT: 2777 case BPF_CGROUP_INET4_GETPEERNAME: 2778 case BPF_CGROUP_INET6_GETPEERNAME: 2779 case BPF_CGROUP_UNIX_GETPEERNAME: 2780 case BPF_CGROUP_INET4_GETSOCKNAME: 2781 case BPF_CGROUP_INET6_GETSOCKNAME: 2782 case BPF_CGROUP_UNIX_GETSOCKNAME: 2783 case BPF_CGROUP_UDP4_SENDMSG: 2784 case BPF_CGROUP_UDP6_SENDMSG: 2785 case BPF_CGROUP_UNIX_SENDMSG: 2786 case BPF_CGROUP_UDP4_RECVMSG: 2787 case BPF_CGROUP_UDP6_RECVMSG: 2788 case BPF_CGROUP_UNIX_RECVMSG: 2789 return 0; 2790 default: 2791 return -EINVAL; 2792 } 2793 case BPF_PROG_TYPE_CGROUP_SKB: 2794 switch (expected_attach_type) { 2795 case BPF_CGROUP_INET_INGRESS: 2796 case BPF_CGROUP_INET_EGRESS: 2797 return 0; 2798 default: 2799 return -EINVAL; 2800 } 2801 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 2802 switch (expected_attach_type) { 2803 case BPF_CGROUP_SETSOCKOPT: 2804 case BPF_CGROUP_GETSOCKOPT: 2805 return 0; 2806 default: 2807 return -EINVAL; 2808 } 2809 case BPF_PROG_TYPE_SK_LOOKUP: 2810 if (expected_attach_type == BPF_SK_LOOKUP) 2811 return 0; 2812 return -EINVAL; 2813 case BPF_PROG_TYPE_SK_REUSEPORT: 2814 switch (expected_attach_type) { 2815 case BPF_SK_REUSEPORT_SELECT: 2816 case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: 2817 return 0; 2818 default: 2819 return -EINVAL; 2820 } 2821 case BPF_PROG_TYPE_NETFILTER: 2822 if 
(expected_attach_type == BPF_NETFILTER) 2823 return 0; 2824 return -EINVAL; 2825 case BPF_PROG_TYPE_SYSCALL: 2826 case BPF_PROG_TYPE_EXT: 2827 if (expected_attach_type) 2828 return -EINVAL; 2829 fallthrough; 2830 default: 2831 return 0; 2832 } 2833 } 2834 2835 static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) 2836 { 2837 switch (prog_type) { 2838 case BPF_PROG_TYPE_SCHED_CLS: 2839 case BPF_PROG_TYPE_SCHED_ACT: 2840 case BPF_PROG_TYPE_XDP: 2841 case BPF_PROG_TYPE_LWT_IN: 2842 case BPF_PROG_TYPE_LWT_OUT: 2843 case BPF_PROG_TYPE_LWT_XMIT: 2844 case BPF_PROG_TYPE_LWT_SEG6LOCAL: 2845 case BPF_PROG_TYPE_SK_SKB: 2846 case BPF_PROG_TYPE_SK_MSG: 2847 case BPF_PROG_TYPE_FLOW_DISSECTOR: 2848 case BPF_PROG_TYPE_CGROUP_DEVICE: 2849 case BPF_PROG_TYPE_CGROUP_SOCK: 2850 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 2851 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 2852 case BPF_PROG_TYPE_CGROUP_SYSCTL: 2853 case BPF_PROG_TYPE_SOCK_OPS: 2854 case BPF_PROG_TYPE_EXT: /* extends any prog */ 2855 case BPF_PROG_TYPE_NETFILTER: 2856 return true; 2857 case BPF_PROG_TYPE_CGROUP_SKB: 2858 /* always unpriv */ 2859 case BPF_PROG_TYPE_SK_REUSEPORT: 2860 /* equivalent to SOCKET_FILTER. 
need CAP_BPF only */ 2861 default: 2862 return false; 2863 } 2864 } 2865 2866 static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) 2867 { 2868 switch (prog_type) { 2869 case BPF_PROG_TYPE_KPROBE: 2870 case BPF_PROG_TYPE_TRACEPOINT: 2871 case BPF_PROG_TYPE_PERF_EVENT: 2872 case BPF_PROG_TYPE_RAW_TRACEPOINT: 2873 case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: 2874 case BPF_PROG_TYPE_TRACING: 2875 case BPF_PROG_TYPE_LSM: 2876 case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ 2877 case BPF_PROG_TYPE_EXT: /* extends any prog */ 2878 return true; 2879 default: 2880 return false; 2881 } 2882 } 2883 2884 static enum bpf_sig_keyring bpf_classify_keyring(s32 keyring_id) 2885 { 2886 switch (keyring_id) { 2887 case 0: 2888 return BPF_SIG_KEYRING_BUILTIN; 2889 case (s32)(unsigned long)VERIFY_USE_SECONDARY_KEYRING: 2890 return BPF_SIG_KEYRING_SECONDARY; 2891 case (s32)(unsigned long)VERIFY_USE_PLATFORM_KEYRING: 2892 return BPF_SIG_KEYRING_PLATFORM; 2893 default: 2894 return BPF_SIG_KEYRING_USER; 2895 } 2896 } 2897 2898 static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, 2899 bool is_kernel, s32 *keyring_serial) 2900 { 2901 bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); 2902 struct bpf_dynptr_kern sig_ptr, insns_ptr; 2903 struct bpf_key *key = NULL; 2904 void *sig; 2905 int err = 0; 2906 2907 /* 2908 * Don't attempt to use kmalloc_large or vmalloc for signatures. 2909 * Practical signature for BPF program should be below this limit. 
2910 */ 2911 if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE) 2912 return -EINVAL; 2913 2914 if (system_keyring_id_check(attr->keyring_id) == 0) 2915 key = bpf_lookup_system_key(attr->keyring_id); 2916 else 2917 key = bpf_lookup_user_key(attr->keyring_id, 0); 2918 2919 if (!key) 2920 return -EINVAL; 2921 2922 sig = kvmemdup_bpfptr(usig, attr->signature_size); 2923 if (IS_ERR(sig)) { 2924 bpf_key_put(key); 2925 return PTR_ERR(sig); 2926 } 2927 2928 bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, 2929 attr->signature_size); 2930 bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, 2931 prog->len * sizeof(struct bpf_insn)); 2932 2933 err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, 2934 (struct bpf_dynptr *)&sig_ptr, key); 2935 if (!err) 2936 *keyring_serial = bpf_key_serial(key); 2937 bpf_key_put(key); 2938 kvfree(sig); 2939 return err; 2940 } 2941 2942 static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) 2943 { 2944 int err; 2945 int i; 2946 2947 for (i = 0; i < prog->aux->used_map_cnt; i++) { 2948 if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) 2949 continue; 2950 2951 err = bpf_insn_array_ready(prog->aux->used_maps[i]); 2952 if (err) 2953 return err; 2954 } 2955 2956 return 0; 2957 } 2958 2959 extern int bpf_multi_func(void); 2960 int __init __used bpf_multi_func(void) { return 0; } 2961 2962 BTF_ID_LIST_GLOBAL_SINGLE(bpf_multi_func_btf_id, func, bpf_multi_func) 2963 2964 /* last field in 'union bpf_attr' used by this command */ 2965 #define BPF_PROG_LOAD_LAST_FIELD keyring_id 2966 2967 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) 2968 { 2969 enum bpf_prog_type type = attr->prog_type; 2970 struct bpf_prog *prog, *dst_prog = NULL; 2971 struct btf *attach_btf = NULL; 2972 struct bpf_token *token = NULL; 2973 bool bpf_cap; 2974 int err; 2975 char license[128]; 2976 bool multi_func; 2977 2978 if (CHECK_ATTR(BPF_PROG_LOAD)) 2979 return -EINVAL; 
2980 2981 if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | 2982 BPF_F_ANY_ALIGNMENT | 2983 BPF_F_TEST_STATE_FREQ | 2984 BPF_F_SLEEPABLE | 2985 BPF_F_TEST_RND_HI32 | 2986 BPF_F_XDP_HAS_FRAGS | 2987 BPF_F_XDP_DEV_BOUND_ONLY | 2988 BPF_F_TEST_REG_INVARIANTS | 2989 BPF_F_TOKEN_FD)) 2990 return -EINVAL; 2991 2992 bpf_prog_load_fixup_attach_type(attr); 2993 2994 if (attr->prog_flags & BPF_F_TOKEN_FD) { 2995 token = bpf_token_get_from_fd(attr->prog_token_fd); 2996 if (IS_ERR(token)) 2997 return PTR_ERR(token); 2998 /* if current token doesn't grant prog loading permissions, 2999 * then we can't use this token, so ignore it and rely on 3000 * system-wide capabilities checks 3001 */ 3002 if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) || 3003 !bpf_token_allow_prog_type(token, attr->prog_type, 3004 attr->expected_attach_type)) { 3005 bpf_token_put(token); 3006 token = NULL; 3007 } 3008 } 3009 3010 bpf_cap = bpf_token_capable(token, CAP_BPF); 3011 err = -EPERM; 3012 3013 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && 3014 (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && 3015 !bpf_cap) 3016 goto put_token; 3017 3018 /* Intent here is for unprivileged_bpf_disabled to block BPF program 3019 * creation for unprivileged users; other actions depend 3020 * on fd availability and access to bpffs, so are dependent on 3021 * object creation success. Even with unprivileged BPF disabled, 3022 * capability checks are still carried out for these 3023 * and other operations. 3024 */ 3025 if (sysctl_unprivileged_bpf_disabled && !bpf_cap) 3026 goto put_token; 3027 3028 if (attr->insn_cnt == 0 || 3029 attr->insn_cnt > (bpf_cap ? 
BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) { 3030 err = -E2BIG; 3031 goto put_token; 3032 } 3033 if (type != BPF_PROG_TYPE_SOCKET_FILTER && 3034 type != BPF_PROG_TYPE_CGROUP_SKB && 3035 !bpf_cap) 3036 goto put_token; 3037 3038 if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN)) 3039 goto put_token; 3040 if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) 3041 goto put_token; 3042 3043 multi_func = is_tracing_multi(attr->expected_attach_type); 3044 3045 /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog 3046 * or btf, we need to check which one it is 3047 */ 3048 if (attr->attach_prog_fd) { 3049 dst_prog = bpf_prog_get(attr->attach_prog_fd); 3050 if (IS_ERR(dst_prog)) { 3051 dst_prog = NULL; 3052 attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); 3053 if (IS_ERR(attach_btf)) { 3054 err = -EINVAL; 3055 goto put_token; 3056 } 3057 if (!btf_is_kernel(attach_btf)) { 3058 /* attaching through specifying bpf_prog's BTF 3059 * objects directly might be supported eventually 3060 */ 3061 btf_put(attach_btf); 3062 err = -ENOTSUPP; 3063 goto put_token; 3064 } 3065 } 3066 } else if (attr->attach_btf_id || multi_func) { 3067 /* fall back to vmlinux BTF, if BTF type ID is specified */ 3068 attach_btf = bpf_get_btf_vmlinux(); 3069 if (IS_ERR(attach_btf)) { 3070 err = PTR_ERR(attach_btf); 3071 goto put_token; 3072 } 3073 if (!attach_btf) { 3074 err = -EINVAL; 3075 goto put_token; 3076 } 3077 btf_get(attach_btf); 3078 } 3079 3080 if (bpf_prog_load_check_attach(type, attr->expected_attach_type, 3081 attach_btf, attr->attach_btf_id, 3082 dst_prog, multi_func)) { 3083 if (dst_prog) 3084 bpf_prog_put(dst_prog); 3085 if (attach_btf) 3086 btf_put(attach_btf); 3087 err = -EINVAL; 3088 goto put_token; 3089 } 3090 3091 /* plain bpf_prog allocation */ 3092 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); 3093 if (!prog) { 3094 if (dst_prog) 3095 bpf_prog_put(dst_prog); 3096 if (attach_btf) 3097 
btf_put(attach_btf); 3098 err = -EINVAL; 3099 goto put_token; 3100 } 3101 3102 prog->expected_attach_type = attr->expected_attach_type; 3103 prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); 3104 prog->aux->attach_btf = attach_btf; 3105 prog->aux->attach_btf_id = multi_func ? bpf_multi_func_btf_id[0] : attr->attach_btf_id; 3106 prog->aux->dst_prog = dst_prog; 3107 prog->aux->dev_bound = !!attr->prog_ifindex; 3108 prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; 3109 3110 /* move token into prog->aux, reuse taken refcnt */ 3111 prog->aux->token = token; 3112 token = NULL; 3113 3114 prog->aux->user = get_current_user(); 3115 prog->len = attr->insn_cnt; 3116 3117 err = -EFAULT; 3118 if (copy_from_bpfptr(prog->insns, 3119 make_bpfptr(attr->insns, uattr.is_kernel), 3120 bpf_prog_insn_size(prog)) != 0) 3121 goto free_prog; 3122 /* copy eBPF program license from user space */ 3123 if (strncpy_from_bpfptr(license, 3124 make_bpfptr(attr->license, uattr.is_kernel), 3125 sizeof(license) - 1) < 0) 3126 goto free_prog; 3127 license[sizeof(license) - 1] = 0; 3128 3129 /* eBPF programs must be GPL compatible to use GPL-ed functions */ 3130 prog->gpl_compatible = license_is_gpl_compatible(license) ? 
1 : 0; 3131 if (attr->signature) { 3132 err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel, 3133 &prog->aux->sig.keyring_serial); 3134 if (err) 3135 goto free_prog; 3136 prog->aux->sig.keyring_type = bpf_classify_keyring(attr->keyring_id); 3137 prog->aux->sig.verdict = BPF_SIG_VERIFIED; 3138 } else { 3139 prog->aux->sig.keyring_type = BPF_SIG_KEYRING_NONE; 3140 prog->aux->sig.verdict = BPF_SIG_UNSIGNED; 3141 } 3142 prog->orig_prog = NULL; 3143 prog->jited = 0; 3144 3145 atomic64_set(&prog->aux->refcnt, 1); 3146 3147 if (bpf_prog_is_dev_bound(prog->aux)) { 3148 err = bpf_prog_dev_bound_init(prog, attr); 3149 if (err) 3150 goto free_prog; 3151 } 3152 3153 if (type == BPF_PROG_TYPE_EXT && dst_prog && 3154 bpf_prog_is_dev_bound(dst_prog->aux)) { 3155 err = bpf_prog_dev_bound_inherit(prog, dst_prog); 3156 if (err) 3157 goto free_prog; 3158 } 3159 3160 /* 3161 * Bookkeeping for managing the program attachment chain. 3162 * 3163 * It might be tempting to set attach_tracing_prog flag at the attachment 3164 * time, but this will not prevent from loading bunch of tracing prog 3165 * first, then attach them one to another. 3166 * 3167 * The flag attach_tracing_prog is set for the whole program lifecycle, and 3168 * doesn't have to be cleared in bpf_tracing_link_release, since tracing 3169 * programs cannot change attachment target. 
3170 */ 3171 if (type == BPF_PROG_TYPE_TRACING && dst_prog && 3172 dst_prog->type == BPF_PROG_TYPE_TRACING) { 3173 prog->aux->attach_tracing_prog = true; 3174 } 3175 3176 /* find program type: socket_filter vs tracing_filter */ 3177 err = find_prog_type(type, prog); 3178 if (err < 0) 3179 goto free_prog; 3180 3181 prog->aux->load_time = ktime_get_boottime_ns(); 3182 err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, 3183 sizeof(attr->prog_name)); 3184 if (err < 0) 3185 goto free_prog; 3186 3187 err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); 3188 if (err) 3189 goto free_prog; 3190 3191 /* run eBPF verifier */ 3192 err = bpf_check(&prog, attr, uattr, attr_log); 3193 if (err < 0) 3194 goto free_used_maps; 3195 3196 err = bpf_prog_mark_insn_arrays_ready(prog); 3197 if (err < 0) 3198 goto free_used_maps; 3199 3200 err = bpf_prog_alloc_id(prog); 3201 if (err) 3202 goto free_used_maps; 3203 3204 /* Upon success of bpf_prog_alloc_id(), the BPF prog is 3205 * effectively publicly exposed. However, retrieving via 3206 * bpf_prog_get_fd_by_id() will take another reference, 3207 * therefore it cannot be gone underneath us. 3208 * 3209 * Only for the time /after/ successful bpf_prog_new_fd() 3210 * and before returning to userspace, we might just hold 3211 * one reference and any parallel close on that fd could 3212 * rip everything out. Hence, below notifications must 3213 * happen before bpf_prog_new_fd(). 3214 * 3215 * Also, any failure handling from this point onwards must 3216 * be using bpf_prog_put() given the program is exposed. 
3217 */ 3218 bpf_prog_kallsyms_add(prog); 3219 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); 3220 bpf_audit_prog(prog, BPF_AUDIT_LOAD); 3221 3222 err = bpf_prog_new_fd(prog); 3223 if (err < 0) 3224 bpf_prog_put(prog); 3225 return err; 3226 3227 free_used_maps: 3228 /* In case we have subprogs, we need to wait for a grace 3229 * period before we can tear down JIT memory since symbols 3230 * are already exposed under kallsyms. 3231 */ 3232 __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); 3233 return err; 3234 3235 free_prog: 3236 free_uid(prog->aux->user); 3237 if (prog->aux->attach_btf) 3238 btf_put(prog->aux->attach_btf); 3239 bpf_prog_free(prog); 3240 put_token: 3241 bpf_token_put(token); 3242 return err; 3243 } 3244 3245 #define BPF_OBJ_LAST_FIELD path_fd 3246 3247 static int bpf_obj_pin(const union bpf_attr *attr) 3248 { 3249 int path_fd; 3250 3251 if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD) 3252 return -EINVAL; 3253 3254 /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ 3255 if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) 3256 return -EINVAL; 3257 3258 path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; 3259 return bpf_obj_pin_user(attr->bpf_fd, path_fd, 3260 u64_to_user_ptr(attr->pathname)); 3261 } 3262 3263 static int bpf_obj_get(const union bpf_attr *attr) 3264 { 3265 int path_fd; 3266 3267 if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || 3268 attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD)) 3269 return -EINVAL; 3270 3271 /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ 3272 if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) 3273 return -EINVAL; 3274 3275 path_fd = attr->file_flags & BPF_F_PATH_FD ? 
attr->path_fd : AT_FDCWD; 3276 return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname), 3277 attr->file_flags); 3278 } 3279 3280 /* bpf_link_init_sleepable() allows to specify whether BPF link itself has 3281 * "sleepable" semantics, which normally would mean that BPF link's attach 3282 * hook can dereference link or link's underlying program for some time after 3283 * detachment due to RCU Tasks Trace-based lifetime protection scheme. 3284 * BPF program itself can be non-sleepable, yet, because it's transitively 3285 * reachable through BPF link, its freeing has to be delayed until after RCU 3286 * Tasks Trace GP. 3287 */ 3288 void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, 3289 const struct bpf_link_ops *ops, struct bpf_prog *prog, 3290 enum bpf_attach_type attach_type, bool sleepable) 3291 { 3292 WARN_ON(ops->dealloc && ops->dealloc_deferred); 3293 atomic64_set(&link->refcnt, 1); 3294 link->type = type; 3295 link->sleepable = sleepable; 3296 link->id = 0; 3297 link->ops = ops; 3298 link->prog = prog; 3299 link->attach_type = attach_type; 3300 } 3301 3302 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, 3303 const struct bpf_link_ops *ops, struct bpf_prog *prog, 3304 enum bpf_attach_type attach_type) 3305 { 3306 bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); 3307 } 3308 3309 void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, 3310 const struct bpf_link_ops *ops, struct bpf_prog *prog, 3311 enum bpf_attach_type attach_type, u64 cookie) 3312 { 3313 bpf_link_init(&link->link, type, ops, prog, attach_type); 3314 link->node.link = &link->link; 3315 link->node.cookie = cookie; 3316 } 3317 3318 static void bpf_link_free_id(int id) 3319 { 3320 if (!id) 3321 return; 3322 3323 spin_lock_bh(&link_idr_lock); 3324 idr_remove(&link_idr, id); 3325 spin_unlock_bh(&link_idr_lock); 3326 } 3327 3328 /* Clean up bpf_link and corresponding anon_inode file and FD. 
* After anon_inode is created, bpf_link can't be just kfree()'d due to
 * deferred anon_inode's release() call. This helper marks bpf_link as
 * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
 * is not decremented, it's the responsibility of a calling code that failed
 * to complete bpf_link initialization.
 * This helper eventually calls link's dealloc callback, but does not call
 * link's release callback.
 */
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
	/* a NULL prog makes bpf_link_free() skip ->release() and
	 * bpf_link_dealloc() skip the program put
	 */
	primer->link->prog = NULL;
	bpf_link_free_id(primer->id);
	fput(primer->file);
	put_unused_fd(primer->fd);
}

/* Take an additional reference on @link. */
void bpf_link_inc(struct bpf_link *link)
{
	atomic64_inc(&link->refcnt);
}

/* Put the link's program (if any) and free the link through its
 * dealloc_deferred callback when present, its dealloc callback otherwise.
 */
static void bpf_link_dealloc(struct bpf_link *link)
{
	/* now that we know that bpf_link itself can't be reached, put underlying BPF program */
	if (link->prog)
		bpf_prog_put(link->prog);

	/* free bpf_link and its containing memory */
	if (link->ops->dealloc_deferred)
		link->ops->dealloc_deferred(link);
	else
		link->ops->dealloc(link);
}

/* Grace-period callback used by bpf_link_free() for deferred deallocation. */
static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
{
	struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);

	bpf_link_dealloc(link);
}

/* Tell whether @link is attached to a (raw) tracepoint. */
static bool bpf_link_is_tracepoint(struct bpf_link *link)
{
	/*
	 * Only these combinations support a tracepoint bpf_link.
	 * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use
	 * bpf_raw_tp_link_lops and thus dealloc_deferred(), see
	 * bpf_raw_tp_link_attach().
	 */
	return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT ||
	       (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP);
}

/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
	const struct bpf_link_ops *ops = link->ops;

	bpf_link_free_id(link->id);
	/* detach BPF program, clean up used resources */
	if (link->prog)
		ops->release(link);
	if (ops->dealloc_deferred) {
		/*
		 * Schedule BPF link deallocation, which will only then
		 * trigger putting BPF program refcount.
		 * If underlying BPF program is sleepable or BPF link's target
		 * attach hookpoint is sleepable or otherwise requires RCU GPs
		 * to ensure link and its underlying BPF program is not
		 * reachable anymore, we need to first wait for RCU tasks
		 * trace sync, and then go through "classic" RCU grace period.
		 *
		 * For tracepoint BPF links, we need to go through SRCU grace
		 * period wait instead when non-faultable tracepoint is used. We
		 * don't need to chain SRCU grace period waits, however, for the
		 * faultable case, since it exclusively uses RCU Tasks Trace.
		 */
		if (link->sleepable || (link->prog && link->prog->sleepable))
			/* RCU Tasks Trace grace period implies RCU grace period. */
			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
		/* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links.
		 */
		else if (bpf_link_is_tracepoint(link))
			call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
		else
			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
	} else if (ops->dealloc) {
		/* no deferred variant: free right away */
		bpf_link_dealloc(link);
	}
}

/* Work item wrapper around bpf_link_free(), queued by bpf_link_put(). */
static void bpf_link_put_deferred(struct work_struct *work)
{
	struct bpf_link *link = container_of(work, struct bpf_link, work);

	bpf_link_free(link);
}

/* bpf_link_put might be called from atomic context, whereas freeing the link
 * has to happen in sleepable context in order to acquire sleeping locks
 * during the process. The final free is therefore always handed to a
 * workqueue.
 */
void bpf_link_put(struct bpf_link *link)
{
	if (!atomic64_dec_and_test(&link->refcnt))
		return;

	INIT_WORK(&link->work, bpf_link_put_deferred);
	schedule_work(&link->work);
}
EXPORT_SYMBOL(bpf_link_put);

/* Like bpf_link_put(), but frees the link in the caller's context instead of
 * deferring to a workqueue.
 */
static void bpf_link_put_direct(struct bpf_link *link)
{
	if (!atomic64_dec_and_test(&link->refcnt))
		return;
	bpf_link_free(link);
}

/* ->release() handler for link files: drops the file's link reference and
 * frees the link directly on the last put.
 */
static int bpf_link_release(struct inode *inode, struct file *filp)
{
	struct bpf_link *link = filp->private_data;

	bpf_link_put_direct(link);
	return 0;
}

#ifdef CONFIG_PROC_FS
/* Link-type -> name table, generated from the BPF_LINK_TYPE() entries of
 * <linux/bpf_types.h>.
 */
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
static const char *bpf_link_type_strs[] = {
	[BPF_LINK_TYPE_UNSPEC] = "<invalid>",
#include <linux/bpf_types.h>
};
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE

static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_link *link = filp->private_data;
	const struct bpf_prog *prog = link->prog;
	enum bpf_link_type type = link->type;
	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };

	if (type < ARRAY_SIZE(bpf_link_type_strs) &&
bpf_link_type_strs[type]) { 3474 if (link->type == BPF_LINK_TYPE_KPROBE_MULTI) 3475 seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? 3476 "kretprobe_multi" : "kprobe_multi"); 3477 else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) 3478 seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? 3479 "uretprobe_multi" : "uprobe_multi"); 3480 else 3481 seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); 3482 } else { 3483 WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); 3484 seq_printf(m, "link_type:\t<%u>\n", type); 3485 } 3486 seq_printf(m, "link_id:\t%u\n", link->id); 3487 3488 if (prog) { 3489 bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); 3490 seq_printf(m, 3491 "prog_tag:\t%s\n" 3492 "prog_id:\t%u\n", 3493 prog_tag, 3494 prog->aux->id); 3495 } 3496 if (link->ops->show_fdinfo) 3497 link->ops->show_fdinfo(link, m); 3498 } 3499 #endif 3500 3501 static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) 3502 { 3503 struct bpf_link *link = file->private_data; 3504 3505 return link->ops->poll(file, pts); 3506 } 3507 3508 static const struct file_operations bpf_link_fops = { 3509 #ifdef CONFIG_PROC_FS 3510 .show_fdinfo = bpf_link_show_fdinfo, 3511 #endif 3512 .release = bpf_link_release, 3513 .read = bpf_dummy_read, 3514 .write = bpf_dummy_write, 3515 }; 3516 3517 static const struct file_operations bpf_link_fops_poll = { 3518 #ifdef CONFIG_PROC_FS 3519 .show_fdinfo = bpf_link_show_fdinfo, 3520 #endif 3521 .release = bpf_link_release, 3522 .read = bpf_dummy_read, 3523 .write = bpf_dummy_write, 3524 .poll = bpf_link_poll, 3525 }; 3526 3527 static int bpf_link_alloc_id(struct bpf_link *link) 3528 { 3529 int id; 3530 3531 idr_preload(GFP_KERNEL); 3532 spin_lock_bh(&link_idr_lock); 3533 id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); 3534 spin_unlock_bh(&link_idr_lock); 3535 idr_preload_end(); 3536 3537 return id; 3538 } 3539 3540 /* Prepare bpf_link to be 
exposed to user-space by allocating anon_inode file, 3541 * reserving unused FD and allocating ID from link_idr. This is to be paired 3542 * with bpf_link_settle() to install FD and ID and expose bpf_link to 3543 * user-space, if bpf_link is successfully attached. If not, bpf_link and 3544 * pre-allocated resources are to be freed with bpf_cleanup() call. All the 3545 * transient state is passed around in struct bpf_link_primer. 3546 * This is preferred way to create and initialize bpf_link, especially when 3547 * there are complicated and expensive operations in between creating bpf_link 3548 * itself and attaching it to BPF hook. By using bpf_link_prime() and 3549 * bpf_link_settle() kernel code using bpf_link doesn't have to perform 3550 * expensive (and potentially failing) roll back operations in a rare case 3551 * that file, FD, or ID can't be allocated. 3552 */ 3553 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) 3554 { 3555 struct file *file; 3556 int fd, id; 3557 3558 fd = get_unused_fd_flags(O_CLOEXEC); 3559 if (fd < 0) 3560 return fd; 3561 3562 3563 id = bpf_link_alloc_id(link); 3564 if (id < 0) { 3565 put_unused_fd(fd); 3566 return id; 3567 } 3568 3569 file = anon_inode_getfile("bpf_link", 3570 link->ops->poll ? 
&bpf_link_fops_poll : &bpf_link_fops, 3571 link, O_CLOEXEC); 3572 if (IS_ERR(file)) { 3573 bpf_link_free_id(id); 3574 put_unused_fd(fd); 3575 return PTR_ERR(file); 3576 } 3577 3578 primer->link = link; 3579 primer->file = file; 3580 primer->fd = fd; 3581 primer->id = id; 3582 return 0; 3583 } 3584 3585 int bpf_link_settle(struct bpf_link_primer *primer) 3586 { 3587 /* make bpf_link fetchable by ID */ 3588 spin_lock_bh(&link_idr_lock); 3589 primer->link->id = primer->id; 3590 spin_unlock_bh(&link_idr_lock); 3591 /* make bpf_link fetchable by FD */ 3592 fd_install(primer->fd, primer->file); 3593 /* pass through installed FD */ 3594 return primer->fd; 3595 } 3596 3597 int bpf_link_new_fd(struct bpf_link *link) 3598 { 3599 return anon_inode_getfd("bpf-link", 3600 link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, 3601 link, O_CLOEXEC); 3602 } 3603 3604 struct bpf_link *bpf_link_get_from_fd(u32 ufd) 3605 { 3606 CLASS(fd, f)(ufd); 3607 struct bpf_link *link; 3608 3609 if (fd_empty(f)) 3610 return ERR_PTR(-EBADF); 3611 if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) 3612 return ERR_PTR(-EINVAL); 3613 3614 link = fd_file(f)->private_data; 3615 bpf_link_inc(link); 3616 return link; 3617 } 3618 EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL"); 3619 3620 static void bpf_tracing_link_release(struct bpf_link *link) 3621 { 3622 struct bpf_tracing_link *tr_link = 3623 container_of(link, struct bpf_tracing_link, link.link); 3624 3625 WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link.node, 3626 tr_link->trampoline, 3627 tr_link->tgt_prog)); 3628 3629 bpf_trampoline_put(tr_link->trampoline); 3630 3631 /* tgt_prog is NULL if target is a kernel function */ 3632 if (tr_link->tgt_prog) 3633 bpf_prog_put(tr_link->tgt_prog); 3634 } 3635 3636 static void bpf_tracing_link_dealloc(struct bpf_link *link) 3637 { 3638 struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); 3639 3640 kfree(tr_link); 3641 } 3642 
3643 static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, 3644 struct seq_file *seq) 3645 { 3646 struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); 3647 3648 u32 target_btf_id, target_obj_id; 3649 3650 bpf_trampoline_unpack_key(tr_link->trampoline->key, 3651 &target_obj_id, &target_btf_id); 3652 seq_printf(seq, 3653 "attach_type:\t%d\n" 3654 "target_obj_id:\t%u\n" 3655 "target_btf_id:\t%u\n" 3656 "cookie:\t%llu\n", 3657 link->attach_type, 3658 target_obj_id, 3659 target_btf_id, 3660 tr_link->link.node.cookie); 3661 } 3662 3663 static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, 3664 struct bpf_link_info *info) 3665 { 3666 struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); 3667 3668 info->tracing.attach_type = link->attach_type; 3669 info->tracing.cookie = tr_link->link.node.cookie; 3670 bpf_trampoline_unpack_key(tr_link->trampoline->key, 3671 &info->tracing.target_obj_id, 3672 &info->tracing.target_btf_id); 3673 3674 return 0; 3675 } 3676 3677 static const struct bpf_link_ops bpf_tracing_link_lops = { 3678 .release = bpf_tracing_link_release, 3679 .dealloc = bpf_tracing_link_dealloc, 3680 .show_fdinfo = bpf_tracing_link_show_fdinfo, 3681 .fill_link_info = bpf_tracing_link_fill_link_info, 3682 }; 3683 3684 static int bpf_tracing_prog_attach(struct bpf_prog *prog, 3685 int tgt_prog_fd, 3686 u32 btf_id, 3687 u64 bpf_cookie, 3688 enum bpf_attach_type attach_type) 3689 { 3690 struct bpf_link_primer link_primer; 3691 struct bpf_prog *tgt_prog = NULL; 3692 struct bpf_trampoline *tr = NULL; 3693 struct bpf_tracing_link *link; 3694 u64 key = 0; 3695 int err; 3696 3697 switch (prog->type) { 3698 case BPF_PROG_TYPE_TRACING: 3699 if (prog->expected_attach_type != BPF_TRACE_FENTRY && 3700 prog->expected_attach_type != BPF_TRACE_FEXIT && 3701 prog->expected_attach_type != BPF_TRACE_FSESSION && 3702 prog->expected_attach_type != BPF_MODIFY_RETURN) { 3703 err = 
-EINVAL; 3704 goto out_put_prog; 3705 } 3706 break; 3707 case BPF_PROG_TYPE_EXT: 3708 if (prog->expected_attach_type != 0) { 3709 err = -EINVAL; 3710 goto out_put_prog; 3711 } 3712 break; 3713 case BPF_PROG_TYPE_LSM: 3714 if (prog->expected_attach_type != BPF_LSM_MAC) { 3715 err = -EINVAL; 3716 goto out_put_prog; 3717 } 3718 break; 3719 default: 3720 err = -EINVAL; 3721 goto out_put_prog; 3722 } 3723 3724 if (!!tgt_prog_fd != !!btf_id) { 3725 err = -EINVAL; 3726 goto out_put_prog; 3727 } 3728 3729 if (tgt_prog_fd) { 3730 /* 3731 * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this 3732 * part would be changed to implement the same for 3733 * BPF_PROG_TYPE_TRACING, do not forget to update the way how 3734 * attach_tracing_prog flag is set. 3735 */ 3736 if (prog->type != BPF_PROG_TYPE_EXT) { 3737 err = -EINVAL; 3738 goto out_put_prog; 3739 } 3740 3741 tgt_prog = bpf_prog_get(tgt_prog_fd); 3742 if (IS_ERR(tgt_prog)) { 3743 err = PTR_ERR(tgt_prog); 3744 tgt_prog = NULL; 3745 goto out_put_prog; 3746 } 3747 3748 key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); 3749 } 3750 3751 link = kzalloc_obj(*link, GFP_USER); 3752 if (!link) { 3753 err = -ENOMEM; 3754 goto out_put_prog; 3755 } 3756 bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING, 3757 &bpf_tracing_link_lops, prog, attach_type, bpf_cookie); 3758 3759 if (prog->expected_attach_type == BPF_TRACE_FSESSION) { 3760 link->fexit.link = &link->link.link; 3761 link->fexit.cookie = bpf_cookie; 3762 } 3763 3764 mutex_lock(&prog->aux->dst_mutex); 3765 3766 /* There are a few possible cases here: 3767 * 3768 * - if prog->aux->dst_trampoline is set, the program was just loaded 3769 * and not yet attached to anything, so we can use the values stored 3770 * in prog->aux 3771 * 3772 * - if prog->aux->dst_trampoline is NULL, the program has already been 3773 * attached to a target and its initial target was cleared (below) 3774 * 3775 * - if tgt_prog != NULL, the caller specified tgt_prog_fd + 3776 * 
target_btf_id using the link_create API. 3777 * 3778 * - if tgt_prog == NULL when this function was called using the old 3779 * raw_tracepoint_open API, and we need a target from prog->aux 3780 * 3781 * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program 3782 * was detached and is going for re-attachment. 3783 * 3784 * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf 3785 * are NULL, then program was already attached and user did not provide 3786 * tgt_prog_fd so we have no way to find out or create trampoline 3787 */ 3788 if (!prog->aux->dst_trampoline && !tgt_prog) { 3789 /* 3790 * Allow re-attach for TRACING and LSM programs. If it's 3791 * currently linked, bpf_trampoline_link_prog will fail. 3792 * EXT programs need to specify tgt_prog_fd, so they 3793 * re-attach in separate code path. 3794 */ 3795 if (prog->type != BPF_PROG_TYPE_TRACING && 3796 prog->type != BPF_PROG_TYPE_LSM) { 3797 err = -EINVAL; 3798 goto out_unlock; 3799 } 3800 /* We can allow re-attach only if we have valid attach_btf. 
*/ 3801 if (!prog->aux->attach_btf) { 3802 err = -EINVAL; 3803 goto out_unlock; 3804 } 3805 btf_id = prog->aux->attach_btf_id; 3806 key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); 3807 } 3808 3809 if (!prog->aux->dst_trampoline || 3810 (key && key != prog->aux->dst_trampoline->key)) { 3811 /* If there is no saved target, or the specified target is 3812 * different from the destination specified at load time, we 3813 * need a new trampoline and a check for compatibility 3814 */ 3815 struct bpf_attach_target_info tgt_info = {}; 3816 3817 err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, 3818 &tgt_info); 3819 if (err) 3820 goto out_unlock; 3821 3822 if (tgt_info.tgt_mod) { 3823 module_put(prog->aux->mod); 3824 prog->aux->mod = tgt_info.tgt_mod; 3825 } 3826 3827 tr = bpf_trampoline_get(key, &tgt_info); 3828 if (!tr) { 3829 err = -ENOMEM; 3830 goto out_unlock; 3831 } 3832 } else { 3833 /* The caller didn't specify a target, or the target was the 3834 * same as the destination supplied during program load. This 3835 * means we can reuse the trampoline and reference from program 3836 * load time, and there is no need to allocate a new one. This 3837 * can only happen once for any program, as the saved values in 3838 * prog->aux are cleared below. 3839 */ 3840 tr = prog->aux->dst_trampoline; 3841 tgt_prog = prog->aux->dst_prog; 3842 } 3843 /* 3844 * It is to prevent modifying struct pt_regs via kprobe_write_ctx=true 3845 * freplace prog. Without this check, kprobe_write_ctx=true freplace 3846 * prog is allowed to attach to kprobe_write_ctx=false kprobe prog, and 3847 * then modify the registers of the kprobe prog's target kernel 3848 * function. 3849 * 3850 * This also blocks the combination of uprobe+freplace, because it is 3851 * unable to recognize the use of the tgt_prog as an uprobe or a kprobe 3852 * by tgt_prog itself. At attach time, uprobe/kprobe is recognized by 3853 * the target perf event flags in __perf_event_set_bpf_prog(). 
3854 */ 3855 if (prog->type == BPF_PROG_TYPE_EXT && 3856 prog->aux->kprobe_write_ctx != tgt_prog->aux->kprobe_write_ctx) { 3857 err = -EINVAL; 3858 goto out_unlock; 3859 } 3860 3861 err = bpf_link_prime(&link->link.link, &link_primer); 3862 if (err) 3863 goto out_unlock; 3864 3865 err = bpf_trampoline_link_prog(&link->link.node, tr, tgt_prog); 3866 if (err) { 3867 bpf_link_cleanup(&link_primer); 3868 link = NULL; 3869 goto out_unlock; 3870 } 3871 3872 link->tgt_prog = tgt_prog; 3873 link->trampoline = tr; 3874 3875 /* Always clear the trampoline and target prog from prog->aux to make 3876 * sure the original attach destination is not kept alive after a 3877 * program is (re-)attached to another target. 3878 */ 3879 if (prog->aux->dst_prog && 3880 (tgt_prog_fd || tr != prog->aux->dst_trampoline)) 3881 /* got extra prog ref from syscall, or attaching to different prog */ 3882 bpf_prog_put(prog->aux->dst_prog); 3883 if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) 3884 /* we allocated a new trampoline, so free the old one */ 3885 bpf_trampoline_put(prog->aux->dst_trampoline); 3886 3887 prog->aux->dst_prog = NULL; 3888 prog->aux->dst_trampoline = NULL; 3889 mutex_unlock(&prog->aux->dst_mutex); 3890 3891 return bpf_link_settle(&link_primer); 3892 out_unlock: 3893 if (tr && tr != prog->aux->dst_trampoline) 3894 bpf_trampoline_put(tr); 3895 mutex_unlock(&prog->aux->dst_mutex); 3896 kfree(link); 3897 out_put_prog: 3898 if (tgt_prog_fd && tgt_prog) 3899 bpf_prog_put(tgt_prog); 3900 return err; 3901 } 3902 3903 static void bpf_raw_tp_link_release(struct bpf_link *link) 3904 { 3905 struct bpf_raw_tp_link *raw_tp = 3906 container_of(link, struct bpf_raw_tp_link, link); 3907 3908 bpf_probe_unregister(raw_tp->btp, raw_tp); 3909 bpf_put_raw_tracepoint(raw_tp->btp); 3910 } 3911 3912 static void bpf_raw_tp_link_dealloc(struct bpf_link *link) 3913 { 3914 struct bpf_raw_tp_link *raw_tp = 3915 container_of(link, struct bpf_raw_tp_link, link); 3916 3917 kfree(raw_tp); 
3918 } 3919 3920 static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, 3921 struct seq_file *seq) 3922 { 3923 struct bpf_raw_tp_link *raw_tp_link = 3924 container_of(link, struct bpf_raw_tp_link, link); 3925 3926 seq_printf(seq, 3927 "tp_name:\t%s\n" 3928 "cookie:\t%llu\n", 3929 raw_tp_link->btp->tp->name, 3930 raw_tp_link->cookie); 3931 } 3932 3933 static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, 3934 u32 len) 3935 { 3936 if (ulen >= len + 1) { 3937 if (copy_to_user(ubuf, buf, len + 1)) 3938 return -EFAULT; 3939 } else { 3940 char zero = '\0'; 3941 3942 if (copy_to_user(ubuf, buf, ulen - 1)) 3943 return -EFAULT; 3944 if (put_user(zero, ubuf + ulen - 1)) 3945 return -EFAULT; 3946 return -ENOSPC; 3947 } 3948 3949 return 0; 3950 } 3951 3952 static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, 3953 struct bpf_link_info *info) 3954 { 3955 struct bpf_raw_tp_link *raw_tp_link = 3956 container_of(link, struct bpf_raw_tp_link, link); 3957 char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); 3958 const char *tp_name = raw_tp_link->btp->tp->name; 3959 u32 ulen = info->raw_tracepoint.tp_name_len; 3960 size_t tp_len = strlen(tp_name); 3961 3962 if (!ulen ^ !ubuf) 3963 return -EINVAL; 3964 3965 info->raw_tracepoint.tp_name_len = tp_len + 1; 3966 info->raw_tracepoint.cookie = raw_tp_link->cookie; 3967 3968 if (!ubuf) 3969 return 0; 3970 3971 return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len); 3972 } 3973 3974 static const struct bpf_link_ops bpf_raw_tp_link_lops = { 3975 .release = bpf_raw_tp_link_release, 3976 .dealloc_deferred = bpf_raw_tp_link_dealloc, 3977 .show_fdinfo = bpf_raw_tp_link_show_fdinfo, 3978 .fill_link_info = bpf_raw_tp_link_fill_link_info, 3979 }; 3980 3981 #ifdef CONFIG_PERF_EVENTS 3982 struct bpf_perf_link { 3983 struct bpf_link link; 3984 struct file *perf_file; 3985 }; 3986 3987 static void bpf_perf_link_release(struct bpf_link *link) 3988 { 3989 struct bpf_perf_link *perf_link = 
container_of(link, struct bpf_perf_link, link); 3990 struct perf_event *event = perf_link->perf_file->private_data; 3991 3992 perf_event_free_bpf_prog(event); 3993 fput(perf_link->perf_file); 3994 } 3995 3996 static void bpf_perf_link_dealloc(struct bpf_link *link) 3997 { 3998 struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); 3999 4000 kfree(perf_link); 4001 } 4002 4003 static int bpf_perf_link_fill_common(const struct perf_event *event, 4004 char __user *uname, u32 *ulenp, 4005 u64 *probe_offset, u64 *probe_addr, 4006 u32 *fd_type, unsigned long *missed) 4007 { 4008 const char *buf; 4009 u32 prog_id, ulen; 4010 size_t len; 4011 int err; 4012 4013 ulen = *ulenp; 4014 if (!ulen ^ !uname) 4015 return -EINVAL; 4016 4017 err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, 4018 probe_offset, probe_addr, missed); 4019 if (err) 4020 return err; 4021 4022 if (buf) { 4023 len = strlen(buf); 4024 *ulenp = len + 1; 4025 } else { 4026 *ulenp = 1; 4027 } 4028 if (!uname) 4029 return 0; 4030 4031 if (buf) { 4032 err = bpf_copy_to_user(uname, buf, ulen, len); 4033 if (err) 4034 return err; 4035 } else { 4036 char zero = '\0'; 4037 4038 if (put_user(zero, uname)) 4039 return -EFAULT; 4040 } 4041 return 0; 4042 } 4043 4044 #ifdef CONFIG_KPROBE_EVENTS 4045 static int bpf_perf_link_fill_kprobe(const struct perf_event *event, 4046 struct bpf_link_info *info) 4047 { 4048 unsigned long missed; 4049 char __user *uname; 4050 u64 addr, offset; 4051 u32 ulen, type; 4052 int err; 4053 4054 uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); 4055 ulen = info->perf_event.kprobe.name_len; 4056 err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, 4057 &type, &missed); 4058 if (err) 4059 return err; 4060 if (type == BPF_FD_TYPE_KRETPROBE) 4061 info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; 4062 else 4063 info->perf_event.type = BPF_PERF_EVENT_KPROBE; 4064 info->perf_event.kprobe.name_len = ulen; 4065 
info->perf_event.kprobe.offset = offset; 4066 info->perf_event.kprobe.missed = missed; 4067 if (!kallsyms_show_value(current_cred())) 4068 addr = 0; 4069 info->perf_event.kprobe.addr = addr; 4070 info->perf_event.kprobe.cookie = event->bpf_cookie; 4071 return 0; 4072 } 4073 4074 static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event, 4075 struct seq_file *seq) 4076 { 4077 const char *name; 4078 int err; 4079 u32 prog_id, type; 4080 u64 offset, addr; 4081 unsigned long missed; 4082 4083 err = bpf_get_perf_event_info(event, &prog_id, &type, &name, 4084 &offset, &addr, &missed); 4085 if (err) 4086 return; 4087 4088 seq_printf(seq, 4089 "name:\t%s\n" 4090 "offset:\t%#llx\n" 4091 "missed:\t%lu\n" 4092 "addr:\t%#llx\n" 4093 "event_type:\t%s\n" 4094 "cookie:\t%llu\n", 4095 name, offset, missed, addr, 4096 type == BPF_FD_TYPE_KRETPROBE ? "kretprobe" : "kprobe", 4097 event->bpf_cookie); 4098 } 4099 #endif 4100 4101 #ifdef CONFIG_UPROBE_EVENTS 4102 static int bpf_perf_link_fill_uprobe(const struct perf_event *event, 4103 struct bpf_link_info *info) 4104 { 4105 u64 ref_ctr_offset, offset; 4106 char __user *uname; 4107 u32 ulen, type; 4108 int err; 4109 4110 uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); 4111 ulen = info->perf_event.uprobe.name_len; 4112 err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset, 4113 &type, NULL); 4114 if (err) 4115 return err; 4116 4117 if (type == BPF_FD_TYPE_URETPROBE) 4118 info->perf_event.type = BPF_PERF_EVENT_URETPROBE; 4119 else 4120 info->perf_event.type = BPF_PERF_EVENT_UPROBE; 4121 info->perf_event.uprobe.name_len = ulen; 4122 info->perf_event.uprobe.offset = offset; 4123 info->perf_event.uprobe.cookie = event->bpf_cookie; 4124 info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; 4125 return 0; 4126 } 4127 4128 static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event, 4129 struct seq_file *seq) 4130 { 4131 const char *name; 4132 int err; 4133 u32 prog_id, type; 4134 
u64 offset, ref_ctr_offset; 4135 unsigned long missed; 4136 4137 err = bpf_get_perf_event_info(event, &prog_id, &type, &name, 4138 &offset, &ref_ctr_offset, &missed); 4139 if (err) 4140 return; 4141 4142 seq_printf(seq, 4143 "name:\t%s\n" 4144 "offset:\t%#llx\n" 4145 "ref_ctr_offset:\t%#llx\n" 4146 "event_type:\t%s\n" 4147 "cookie:\t%llu\n", 4148 name, offset, ref_ctr_offset, 4149 type == BPF_FD_TYPE_URETPROBE ? "uretprobe" : "uprobe", 4150 event->bpf_cookie); 4151 } 4152 #endif 4153 4154 static int bpf_perf_link_fill_probe(const struct perf_event *event, 4155 struct bpf_link_info *info) 4156 { 4157 #ifdef CONFIG_KPROBE_EVENTS 4158 if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) 4159 return bpf_perf_link_fill_kprobe(event, info); 4160 #endif 4161 #ifdef CONFIG_UPROBE_EVENTS 4162 if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) 4163 return bpf_perf_link_fill_uprobe(event, info); 4164 #endif 4165 return -EOPNOTSUPP; 4166 } 4167 4168 static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, 4169 struct bpf_link_info *info) 4170 { 4171 char __user *uname; 4172 u32 ulen; 4173 int err; 4174 4175 uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); 4176 ulen = info->perf_event.tracepoint.name_len; 4177 err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); 4178 if (err) 4179 return err; 4180 4181 info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; 4182 info->perf_event.tracepoint.name_len = ulen; 4183 info->perf_event.tracepoint.cookie = event->bpf_cookie; 4184 return 0; 4185 } 4186 4187 static int bpf_perf_link_fill_perf_event(const struct perf_event *event, 4188 struct bpf_link_info *info) 4189 { 4190 info->perf_event.event.type = event->attr.type; 4191 info->perf_event.event.config = event->attr.config; 4192 info->perf_event.event.cookie = event->bpf_cookie; 4193 info->perf_event.type = BPF_PERF_EVENT_EVENT; 4194 return 0; 4195 } 4196 4197 static int bpf_perf_link_fill_link_info(const struct bpf_link *link, 4198 
struct bpf_link_info *info) 4199 { 4200 struct bpf_perf_link *perf_link; 4201 const struct perf_event *event; 4202 4203 perf_link = container_of(link, struct bpf_perf_link, link); 4204 event = perf_get_event(perf_link->perf_file); 4205 if (IS_ERR(event)) 4206 return PTR_ERR(event); 4207 4208 switch (event->prog->type) { 4209 case BPF_PROG_TYPE_PERF_EVENT: 4210 return bpf_perf_link_fill_perf_event(event, info); 4211 case BPF_PROG_TYPE_TRACEPOINT: 4212 return bpf_perf_link_fill_tracepoint(event, info); 4213 case BPF_PROG_TYPE_KPROBE: 4214 return bpf_perf_link_fill_probe(event, info); 4215 default: 4216 return -EOPNOTSUPP; 4217 } 4218 } 4219 4220 static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event, 4221 struct seq_file *seq) 4222 { 4223 seq_printf(seq, 4224 "type:\t%u\n" 4225 "config:\t%llu\n" 4226 "event_type:\t%s\n" 4227 "cookie:\t%llu\n", 4228 event->attr.type, event->attr.config, 4229 "event", event->bpf_cookie); 4230 } 4231 4232 static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event, 4233 struct seq_file *seq) 4234 { 4235 int err; 4236 const char *name; 4237 u32 prog_id; 4238 4239 err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL, 4240 NULL, NULL); 4241 if (err) 4242 return; 4243 4244 seq_printf(seq, 4245 "tp_name:\t%s\n" 4246 "event_type:\t%s\n" 4247 "cookie:\t%llu\n", 4248 name, "tracepoint", event->bpf_cookie); 4249 } 4250 4251 static void bpf_probe_link_show_fdinfo(const struct perf_event *event, 4252 struct seq_file *seq) 4253 { 4254 #ifdef CONFIG_KPROBE_EVENTS 4255 if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) 4256 return bpf_perf_link_fdinfo_kprobe(event, seq); 4257 #endif 4258 4259 #ifdef CONFIG_UPROBE_EVENTS 4260 if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) 4261 return bpf_perf_link_fdinfo_uprobe(event, seq); 4262 #endif 4263 } 4264 4265 static void bpf_perf_link_show_fdinfo(const struct bpf_link *link, 4266 struct seq_file *seq) 4267 { 4268 struct bpf_perf_link *perf_link; 4269 const 
struct perf_event *event; 4270 4271 perf_link = container_of(link, struct bpf_perf_link, link); 4272 event = perf_get_event(perf_link->perf_file); 4273 if (IS_ERR(event)) 4274 return; 4275 4276 switch (event->prog->type) { 4277 case BPF_PROG_TYPE_PERF_EVENT: 4278 return bpf_perf_event_link_show_fdinfo(event, seq); 4279 case BPF_PROG_TYPE_TRACEPOINT: 4280 return bpf_tracepoint_link_show_fdinfo(event, seq); 4281 case BPF_PROG_TYPE_KPROBE: 4282 return bpf_probe_link_show_fdinfo(event, seq); 4283 default: 4284 return; 4285 } 4286 } 4287 4288 static const struct bpf_link_ops bpf_perf_link_lops = { 4289 .release = bpf_perf_link_release, 4290 .dealloc = bpf_perf_link_dealloc, 4291 .fill_link_info = bpf_perf_link_fill_link_info, 4292 .show_fdinfo = bpf_perf_link_show_fdinfo, 4293 }; 4294 4295 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 4296 { 4297 struct bpf_link_primer link_primer; 4298 struct bpf_perf_link *link; 4299 struct perf_event *event; 4300 struct file *perf_file; 4301 int err; 4302 4303 if (attr->link_create.flags) 4304 return -EINVAL; 4305 4306 perf_file = perf_event_get(attr->link_create.target_fd); 4307 if (IS_ERR(perf_file)) 4308 return PTR_ERR(perf_file); 4309 4310 link = kzalloc_obj(*link, GFP_USER); 4311 if (!link) { 4312 err = -ENOMEM; 4313 goto out_put_file; 4314 } 4315 bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog, 4316 attr->link_create.attach_type); 4317 link->perf_file = perf_file; 4318 4319 err = bpf_link_prime(&link->link, &link_primer); 4320 if (err) { 4321 kfree(link); 4322 goto out_put_file; 4323 } 4324 4325 event = perf_file->private_data; 4326 err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); 4327 if (err) { 4328 bpf_link_cleanup(&link_primer); 4329 goto out_put_file; 4330 } 4331 /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ 4332 bpf_prog_inc(prog); 4333 4334 return bpf_link_settle(&link_primer); 4335 4336 
out_put_file: 4337 fput(perf_file); 4338 return err; 4339 } 4340 #else 4341 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 4342 { 4343 return -EOPNOTSUPP; 4344 } 4345 #endif /* CONFIG_PERF_EVENTS */ 4346 4347 static int bpf_raw_tp_link_attach(struct bpf_prog *prog, 4348 const char __user *user_tp_name, u64 cookie, 4349 enum bpf_attach_type attach_type) 4350 { 4351 struct bpf_link_primer link_primer; 4352 struct bpf_raw_tp_link *link; 4353 struct bpf_raw_event_map *btp; 4354 const char *tp_name; 4355 char buf[128]; 4356 int err; 4357 4358 switch (prog->type) { 4359 case BPF_PROG_TYPE_TRACING: 4360 case BPF_PROG_TYPE_EXT: 4361 case BPF_PROG_TYPE_LSM: 4362 if (user_tp_name) 4363 /* The attach point for this category of programs 4364 * should be specified via btf_id during program load. 4365 */ 4366 return -EINVAL; 4367 if (prog->type == BPF_PROG_TYPE_TRACING && 4368 prog->expected_attach_type == BPF_TRACE_RAW_TP) { 4369 tp_name = prog->aux->attach_func_name; 4370 break; 4371 } 4372 return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type); 4373 case BPF_PROG_TYPE_RAW_TRACEPOINT: 4374 case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: 4375 if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) 4376 return -EFAULT; 4377 buf[sizeof(buf) - 1] = 0; 4378 tp_name = buf; 4379 break; 4380 default: 4381 return -EINVAL; 4382 } 4383 4384 btp = bpf_get_raw_tracepoint(tp_name); 4385 if (!btp) 4386 return -ENOENT; 4387 4388 if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { 4389 bpf_put_raw_tracepoint(btp); 4390 return -EINVAL; 4391 } 4392 4393 link = kzalloc_obj(*link, GFP_USER); 4394 if (!link) { 4395 err = -ENOMEM; 4396 goto out_put_btp; 4397 } 4398 bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, 4399 &bpf_raw_tp_link_lops, prog, attach_type, 4400 tracepoint_is_faultable(btp->tp)); 4401 link->btp = btp; 4402 link->cookie = cookie; 4403 4404 err = bpf_link_prime(&link->link, &link_primer); 4405 if (err) { 4406 
kfree(link); 4407 goto out_put_btp; 4408 } 4409 4410 err = bpf_probe_register(link->btp, link); 4411 if (err) { 4412 bpf_link_cleanup(&link_primer); 4413 goto out_put_btp; 4414 } 4415 4416 return bpf_link_settle(&link_primer); 4417 4418 out_put_btp: 4419 bpf_put_raw_tracepoint(btp); 4420 return err; 4421 } 4422 4423 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie 4424 4425 static int bpf_raw_tracepoint_open(const union bpf_attr *attr) 4426 { 4427 struct bpf_prog *prog; 4428 void __user *tp_name; 4429 __u64 cookie; 4430 int fd; 4431 4432 if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) 4433 return -EINVAL; 4434 4435 prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); 4436 if (IS_ERR(prog)) 4437 return PTR_ERR(prog); 4438 4439 tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); 4440 cookie = attr->raw_tracepoint.cookie; 4441 fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type); 4442 if (fd < 0) 4443 bpf_prog_put(prog); 4444 return fd; 4445 } 4446 4447 static enum bpf_prog_type 4448 attach_type_to_prog_type(enum bpf_attach_type attach_type) 4449 { 4450 switch (attach_type) { 4451 case BPF_CGROUP_INET_INGRESS: 4452 case BPF_CGROUP_INET_EGRESS: 4453 return BPF_PROG_TYPE_CGROUP_SKB; 4454 case BPF_CGROUP_INET_SOCK_CREATE: 4455 case BPF_CGROUP_INET_SOCK_RELEASE: 4456 case BPF_CGROUP_INET4_POST_BIND: 4457 case BPF_CGROUP_INET6_POST_BIND: 4458 return BPF_PROG_TYPE_CGROUP_SOCK; 4459 case BPF_CGROUP_INET4_BIND: 4460 case BPF_CGROUP_INET6_BIND: 4461 case BPF_CGROUP_INET4_CONNECT: 4462 case BPF_CGROUP_INET6_CONNECT: 4463 case BPF_CGROUP_UNIX_CONNECT: 4464 case BPF_CGROUP_INET4_GETPEERNAME: 4465 case BPF_CGROUP_INET6_GETPEERNAME: 4466 case BPF_CGROUP_UNIX_GETPEERNAME: 4467 case BPF_CGROUP_INET4_GETSOCKNAME: 4468 case BPF_CGROUP_INET6_GETSOCKNAME: 4469 case BPF_CGROUP_UNIX_GETSOCKNAME: 4470 case BPF_CGROUP_UDP4_SENDMSG: 4471 case BPF_CGROUP_UDP6_SENDMSG: 4472 case BPF_CGROUP_UNIX_SENDMSG: 4473 case BPF_CGROUP_UDP4_RECVMSG: 4474 case 
BPF_CGROUP_UDP6_RECVMSG: 4475 case BPF_CGROUP_UNIX_RECVMSG: 4476 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; 4477 case BPF_CGROUP_SOCK_OPS: 4478 return BPF_PROG_TYPE_SOCK_OPS; 4479 case BPF_CGROUP_DEVICE: 4480 return BPF_PROG_TYPE_CGROUP_DEVICE; 4481 case BPF_SK_MSG_VERDICT: 4482 return BPF_PROG_TYPE_SK_MSG; 4483 case BPF_SK_SKB_STREAM_PARSER: 4484 case BPF_SK_SKB_STREAM_VERDICT: 4485 case BPF_SK_SKB_VERDICT: 4486 return BPF_PROG_TYPE_SK_SKB; 4487 case BPF_LIRC_MODE2: 4488 return BPF_PROG_TYPE_LIRC_MODE2; 4489 case BPF_FLOW_DISSECTOR: 4490 return BPF_PROG_TYPE_FLOW_DISSECTOR; 4491 case BPF_CGROUP_SYSCTL: 4492 return BPF_PROG_TYPE_CGROUP_SYSCTL; 4493 case BPF_CGROUP_GETSOCKOPT: 4494 case BPF_CGROUP_SETSOCKOPT: 4495 return BPF_PROG_TYPE_CGROUP_SOCKOPT; 4496 case BPF_TRACE_ITER: 4497 case BPF_TRACE_RAW_TP: 4498 case BPF_TRACE_FENTRY: 4499 case BPF_TRACE_FEXIT: 4500 case BPF_TRACE_FSESSION: 4501 case BPF_TRACE_FSESSION_MULTI: 4502 case BPF_TRACE_FENTRY_MULTI: 4503 case BPF_TRACE_FEXIT_MULTI: 4504 case BPF_MODIFY_RETURN: 4505 return BPF_PROG_TYPE_TRACING; 4506 case BPF_LSM_MAC: 4507 return BPF_PROG_TYPE_LSM; 4508 case BPF_SK_LOOKUP: 4509 return BPF_PROG_TYPE_SK_LOOKUP; 4510 case BPF_XDP: 4511 return BPF_PROG_TYPE_XDP; 4512 case BPF_LSM_CGROUP: 4513 return BPF_PROG_TYPE_LSM; 4514 case BPF_TCX_INGRESS: 4515 case BPF_TCX_EGRESS: 4516 case BPF_NETKIT_PRIMARY: 4517 case BPF_NETKIT_PEER: 4518 return BPF_PROG_TYPE_SCHED_CLS; 4519 default: 4520 return BPF_PROG_TYPE_UNSPEC; 4521 } 4522 } 4523 4524 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, 4525 enum bpf_attach_type attach_type) 4526 { 4527 enum bpf_prog_type ptype; 4528 4529 switch (prog->type) { 4530 case BPF_PROG_TYPE_CGROUP_SOCK: 4531 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 4532 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 4533 case BPF_PROG_TYPE_SK_LOOKUP: 4534 return attach_type == prog->expected_attach_type ? 
0 : -EINVAL; 4535 case BPF_PROG_TYPE_CGROUP_SKB: 4536 if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN)) 4537 /* cg-skb progs can be loaded by unpriv user. 4538 * check permissions at attach time. 4539 */ 4540 return -EPERM; 4541 4542 ptype = attach_type_to_prog_type(attach_type); 4543 if (prog->type != ptype) 4544 return -EINVAL; 4545 4546 return prog->enforce_expected_attach_type && 4547 prog->expected_attach_type != attach_type ? 4548 -EINVAL : 0; 4549 case BPF_PROG_TYPE_EXT: 4550 return 0; 4551 case BPF_PROG_TYPE_NETFILTER: 4552 if (attach_type != BPF_NETFILTER) 4553 return -EINVAL; 4554 return 0; 4555 case BPF_PROG_TYPE_PERF_EVENT: 4556 case BPF_PROG_TYPE_TRACEPOINT: 4557 if (attach_type != BPF_PERF_EVENT) 4558 return -EINVAL; 4559 return 0; 4560 case BPF_PROG_TYPE_KPROBE: 4561 if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && 4562 attach_type != BPF_TRACE_KPROBE_MULTI) 4563 return -EINVAL; 4564 if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && 4565 attach_type != BPF_TRACE_KPROBE_SESSION) 4566 return -EINVAL; 4567 if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && 4568 attach_type != BPF_TRACE_UPROBE_MULTI) 4569 return -EINVAL; 4570 if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && 4571 attach_type != BPF_TRACE_UPROBE_SESSION) 4572 return -EINVAL; 4573 if (attach_type != BPF_PERF_EVENT && 4574 attach_type != BPF_TRACE_KPROBE_MULTI && 4575 attach_type != BPF_TRACE_KPROBE_SESSION && 4576 attach_type != BPF_TRACE_UPROBE_MULTI && 4577 attach_type != BPF_TRACE_UPROBE_SESSION) 4578 return -EINVAL; 4579 return 0; 4580 case BPF_PROG_TYPE_SCHED_CLS: 4581 if (attach_type != BPF_TCX_INGRESS && 4582 attach_type != BPF_TCX_EGRESS && 4583 attach_type != BPF_NETKIT_PRIMARY && 4584 attach_type != BPF_NETKIT_PEER) 4585 return -EINVAL; 4586 return 0; 4587 default: 4588 ptype = attach_type_to_prog_type(attach_type); 4589 if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) 4590 return -EINVAL; 4591 return 0; 4592 } 4593 } 
4594 4595 static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype, 4596 bool check_atype) 4597 { 4598 switch (ptype) { 4599 case BPF_PROG_TYPE_CGROUP_DEVICE: 4600 case BPF_PROG_TYPE_CGROUP_SKB: 4601 case BPF_PROG_TYPE_CGROUP_SOCK: 4602 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 4603 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 4604 case BPF_PROG_TYPE_CGROUP_SYSCTL: 4605 case BPF_PROG_TYPE_SOCK_OPS: 4606 return true; 4607 case BPF_PROG_TYPE_LSM: 4608 return check_atype ? atype == BPF_LSM_CGROUP : true; 4609 default: 4610 return false; 4611 } 4612 } 4613 4614 #define BPF_PROG_ATTACH_LAST_FIELD expected_revision 4615 4616 #define BPF_F_ATTACH_MASK_BASE \ 4617 (BPF_F_ALLOW_OVERRIDE | \ 4618 BPF_F_ALLOW_MULTI | \ 4619 BPF_F_REPLACE | \ 4620 BPF_F_PREORDER) 4621 4622 #define BPF_F_ATTACH_MASK_MPROG \ 4623 (BPF_F_REPLACE | \ 4624 BPF_F_BEFORE | \ 4625 BPF_F_AFTER | \ 4626 BPF_F_ID | \ 4627 BPF_F_LINK) 4628 4629 static int bpf_prog_attach(const union bpf_attr *attr) 4630 { 4631 enum bpf_prog_type ptype; 4632 struct bpf_prog *prog; 4633 int ret; 4634 4635 if (CHECK_ATTR(BPF_PROG_ATTACH)) 4636 return -EINVAL; 4637 4638 ptype = attach_type_to_prog_type(attr->attach_type); 4639 if (ptype == BPF_PROG_TYPE_UNSPEC) 4640 return -EINVAL; 4641 if (bpf_mprog_supported(ptype)) { 4642 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) 4643 return -EINVAL; 4644 } else if (is_cgroup_prog_type(ptype, 0, false)) { 4645 if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG)) 4646 return -EINVAL; 4647 } else { 4648 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) 4649 return -EINVAL; 4650 if (attr->relative_fd || 4651 attr->expected_revision) 4652 return -EINVAL; 4653 } 4654 4655 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 4656 if (IS_ERR(prog)) 4657 return PTR_ERR(prog); 4658 4659 if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { 4660 bpf_prog_put(prog); 4661 return -EINVAL; 4662 } 4663 4664 if (is_cgroup_prog_type(ptype, 
prog->expected_attach_type, true)) { 4665 ret = cgroup_bpf_prog_attach(attr, ptype, prog); 4666 goto out; 4667 } 4668 4669 switch (ptype) { 4670 case BPF_PROG_TYPE_SK_SKB: 4671 case BPF_PROG_TYPE_SK_MSG: 4672 ret = sock_map_get_from_fd(attr, prog); 4673 break; 4674 case BPF_PROG_TYPE_LIRC_MODE2: 4675 ret = lirc_prog_attach(attr, prog); 4676 break; 4677 case BPF_PROG_TYPE_FLOW_DISSECTOR: 4678 ret = netns_bpf_prog_attach(attr, prog); 4679 break; 4680 case BPF_PROG_TYPE_SCHED_CLS: 4681 if (attr->attach_type == BPF_TCX_INGRESS || 4682 attr->attach_type == BPF_TCX_EGRESS) 4683 ret = tcx_prog_attach(attr, prog); 4684 else 4685 ret = netkit_prog_attach(attr, prog); 4686 break; 4687 default: 4688 ret = -EINVAL; 4689 } 4690 out: 4691 if (ret) 4692 bpf_prog_put(prog); 4693 return ret; 4694 } 4695 4696 #define BPF_PROG_DETACH_LAST_FIELD expected_revision 4697 4698 static int bpf_prog_detach(const union bpf_attr *attr) 4699 { 4700 struct bpf_prog *prog = NULL; 4701 enum bpf_prog_type ptype; 4702 int ret; 4703 4704 if (CHECK_ATTR(BPF_PROG_DETACH)) 4705 return -EINVAL; 4706 4707 ptype = attach_type_to_prog_type(attr->attach_type); 4708 if (bpf_mprog_supported(ptype)) { 4709 if (ptype == BPF_PROG_TYPE_UNSPEC) 4710 return -EINVAL; 4711 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) 4712 return -EINVAL; 4713 if (attr->attach_bpf_fd) { 4714 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 4715 if (IS_ERR(prog)) 4716 return PTR_ERR(prog); 4717 } else if (!bpf_mprog_detach_empty(ptype)) { 4718 return -EPERM; 4719 } 4720 } else if (is_cgroup_prog_type(ptype, 0, false)) { 4721 if (attr->attach_flags || attr->relative_fd) 4722 return -EINVAL; 4723 } else if (attr->attach_flags || 4724 attr->relative_fd || 4725 attr->expected_revision) { 4726 return -EINVAL; 4727 } 4728 4729 switch (ptype) { 4730 case BPF_PROG_TYPE_SK_MSG: 4731 case BPF_PROG_TYPE_SK_SKB: 4732 ret = sock_map_prog_detach(attr, ptype); 4733 break; 4734 case BPF_PROG_TYPE_LIRC_MODE2: 4735 ret = 
lirc_prog_detach(attr); 4736 break; 4737 case BPF_PROG_TYPE_FLOW_DISSECTOR: 4738 ret = netns_bpf_prog_detach(attr, ptype); 4739 break; 4740 case BPF_PROG_TYPE_CGROUP_DEVICE: 4741 case BPF_PROG_TYPE_CGROUP_SKB: 4742 case BPF_PROG_TYPE_CGROUP_SOCK: 4743 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 4744 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 4745 case BPF_PROG_TYPE_CGROUP_SYSCTL: 4746 case BPF_PROG_TYPE_SOCK_OPS: 4747 case BPF_PROG_TYPE_LSM: 4748 ret = cgroup_bpf_prog_detach(attr, ptype); 4749 break; 4750 case BPF_PROG_TYPE_SCHED_CLS: 4751 if (attr->attach_type == BPF_TCX_INGRESS || 4752 attr->attach_type == BPF_TCX_EGRESS) 4753 ret = tcx_prog_detach(attr, prog); 4754 else 4755 ret = netkit_prog_detach(attr, prog); 4756 break; 4757 default: 4758 ret = -EINVAL; 4759 } 4760 4761 if (prog) 4762 bpf_prog_put(prog); 4763 return ret; 4764 } 4765 4766 #define BPF_PROG_QUERY_LAST_FIELD query.revision 4767 4768 static int bpf_prog_query(const union bpf_attr *attr, 4769 union bpf_attr __user *uattr, u32 uattr_size) 4770 { 4771 if (!bpf_net_capable()) 4772 return -EPERM; 4773 if (CHECK_ATTR(BPF_PROG_QUERY)) 4774 return -EINVAL; 4775 if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) 4776 return -EINVAL; 4777 4778 switch (attr->query.attach_type) { 4779 case BPF_CGROUP_INET_INGRESS: 4780 case BPF_CGROUP_INET_EGRESS: 4781 case BPF_CGROUP_INET_SOCK_CREATE: 4782 case BPF_CGROUP_INET_SOCK_RELEASE: 4783 case BPF_CGROUP_INET4_BIND: 4784 case BPF_CGROUP_INET6_BIND: 4785 case BPF_CGROUP_INET4_POST_BIND: 4786 case BPF_CGROUP_INET6_POST_BIND: 4787 case BPF_CGROUP_INET4_CONNECT: 4788 case BPF_CGROUP_INET6_CONNECT: 4789 case BPF_CGROUP_UNIX_CONNECT: 4790 case BPF_CGROUP_INET4_GETPEERNAME: 4791 case BPF_CGROUP_INET6_GETPEERNAME: 4792 case BPF_CGROUP_UNIX_GETPEERNAME: 4793 case BPF_CGROUP_INET4_GETSOCKNAME: 4794 case BPF_CGROUP_INET6_GETSOCKNAME: 4795 case BPF_CGROUP_UNIX_GETSOCKNAME: 4796 case BPF_CGROUP_UDP4_SENDMSG: 4797 case BPF_CGROUP_UDP6_SENDMSG: 4798 case BPF_CGROUP_UNIX_SENDMSG: 4799 case 
BPF_CGROUP_UDP4_RECVMSG: 4800 case BPF_CGROUP_UDP6_RECVMSG: 4801 case BPF_CGROUP_UNIX_RECVMSG: 4802 case BPF_CGROUP_SOCK_OPS: 4803 case BPF_CGROUP_DEVICE: 4804 case BPF_CGROUP_SYSCTL: 4805 case BPF_CGROUP_GETSOCKOPT: 4806 case BPF_CGROUP_SETSOCKOPT: 4807 case BPF_LSM_CGROUP: 4808 return cgroup_bpf_prog_query(attr, uattr, uattr_size); 4809 case BPF_LIRC_MODE2: 4810 return lirc_prog_query(attr, uattr); 4811 case BPF_FLOW_DISSECTOR: 4812 case BPF_SK_LOOKUP: 4813 return netns_bpf_prog_query(attr, uattr); 4814 case BPF_SK_SKB_STREAM_PARSER: 4815 case BPF_SK_SKB_STREAM_VERDICT: 4816 case BPF_SK_MSG_VERDICT: 4817 case BPF_SK_SKB_VERDICT: 4818 return sock_map_bpf_prog_query(attr, uattr); 4819 case BPF_TCX_INGRESS: 4820 case BPF_TCX_EGRESS: 4821 return tcx_prog_query(attr, uattr); 4822 case BPF_NETKIT_PRIMARY: 4823 case BPF_NETKIT_PEER: 4824 return netkit_prog_query(attr, uattr); 4825 default: 4826 return -EINVAL; 4827 } 4828 } 4829 4830 #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size 4831 4832 static int bpf_prog_test_run(const union bpf_attr *attr, 4833 union bpf_attr __user *uattr) 4834 { 4835 struct bpf_prog *prog; 4836 int ret = -ENOTSUPP; 4837 4838 if (CHECK_ATTR(BPF_PROG_TEST_RUN)) 4839 return -EINVAL; 4840 4841 if ((attr->test.ctx_size_in && !attr->test.ctx_in) || 4842 (!attr->test.ctx_size_in && attr->test.ctx_in)) 4843 return -EINVAL; 4844 4845 if ((attr->test.ctx_size_out && !attr->test.ctx_out) || 4846 (!attr->test.ctx_size_out && attr->test.ctx_out)) 4847 return -EINVAL; 4848 4849 prog = bpf_prog_get(attr->test.prog_fd); 4850 if (IS_ERR(prog)) 4851 return PTR_ERR(prog); 4852 4853 if (prog->aux->ops->test_run) 4854 ret = prog->aux->ops->test_run(prog, attr, uattr); 4855 4856 bpf_prog_put(prog); 4857 return ret; 4858 } 4859 4860 #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id 4861 4862 static int bpf_obj_get_next_id(const union bpf_attr *attr, 4863 union bpf_attr __user *uattr, 4864 struct idr *idr, 4865 spinlock_t *lock) 4866 { 4867 u32 next_id = 
attr->start_id; 4868 int err = 0; 4869 4870 if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) 4871 return -EINVAL; 4872 4873 if (!capable(CAP_SYS_ADMIN)) 4874 return -EPERM; 4875 4876 next_id++; 4877 spin_lock_bh(lock); 4878 if (!idr_get_next(idr, &next_id)) 4879 err = -ENOENT; 4880 spin_unlock_bh(lock); 4881 4882 if (!err) 4883 err = put_user(next_id, &uattr->next_id); 4884 4885 return err; 4886 } 4887 4888 struct bpf_map *bpf_map_get_curr_or_next(u32 *id) 4889 { 4890 struct bpf_map *map; 4891 4892 spin_lock_bh(&map_idr_lock); 4893 again: 4894 map = idr_get_next(&map_idr, id); 4895 if (map) { 4896 map = __bpf_map_inc_not_zero(map, false); 4897 if (IS_ERR(map)) { 4898 (*id)++; 4899 goto again; 4900 } 4901 } 4902 spin_unlock_bh(&map_idr_lock); 4903 4904 return map; 4905 } 4906 4907 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) 4908 { 4909 struct bpf_prog *prog; 4910 4911 spin_lock_bh(&prog_idr_lock); 4912 again: 4913 prog = idr_get_next(&prog_idr, id); 4914 if (prog) { 4915 prog = bpf_prog_inc_not_zero(prog); 4916 if (IS_ERR(prog)) { 4917 (*id)++; 4918 goto again; 4919 } 4920 } 4921 spin_unlock_bh(&prog_idr_lock); 4922 4923 return prog; 4924 } 4925 4926 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id 4927 4928 struct bpf_prog *bpf_prog_by_id(u32 id) 4929 { 4930 struct bpf_prog *prog; 4931 4932 if (!id) 4933 return ERR_PTR(-ENOENT); 4934 4935 spin_lock_bh(&prog_idr_lock); 4936 prog = idr_find(&prog_idr, id); 4937 if (prog) 4938 prog = bpf_prog_inc_not_zero(prog); 4939 else 4940 prog = ERR_PTR(-ENOENT); 4941 spin_unlock_bh(&prog_idr_lock); 4942 return prog; 4943 } 4944 4945 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) 4946 { 4947 struct bpf_prog *prog; 4948 u32 id = attr->prog_id; 4949 int fd; 4950 4951 if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) 4952 return -EINVAL; 4953 4954 if (!capable(CAP_SYS_ADMIN)) 4955 return -EPERM; 4956 4957 prog = bpf_prog_by_id(id); 4958 if (IS_ERR(prog)) 4959 return PTR_ERR(prog); 4960 4961 fd = 
bpf_prog_new_fd(prog); 4962 if (fd < 0) 4963 bpf_prog_put(prog); 4964 4965 return fd; 4966 } 4967 4968 #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags 4969 4970 static int bpf_map_get_fd_by_id(const union bpf_attr *attr) 4971 { 4972 struct bpf_map *map; 4973 u32 id = attr->map_id; 4974 int f_flags; 4975 int fd; 4976 4977 if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || 4978 attr->open_flags & ~BPF_OBJ_FLAG_MASK) 4979 return -EINVAL; 4980 4981 if (!capable(CAP_SYS_ADMIN)) 4982 return -EPERM; 4983 4984 f_flags = bpf_get_file_flag(attr->open_flags); 4985 if (f_flags < 0) 4986 return f_flags; 4987 4988 spin_lock_bh(&map_idr_lock); 4989 map = idr_find(&map_idr, id); 4990 if (map) 4991 map = __bpf_map_inc_not_zero(map, true); 4992 else 4993 map = ERR_PTR(-ENOENT); 4994 spin_unlock_bh(&map_idr_lock); 4995 4996 if (IS_ERR(map)) 4997 return PTR_ERR(map); 4998 4999 fd = bpf_map_new_fd(map, f_flags); 5000 if (fd < 0) 5001 bpf_map_put_with_uref(map); 5002 5003 return fd; 5004 } 5005 5006 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, 5007 unsigned long addr, u32 *off, 5008 u32 *type) 5009 { 5010 const struct bpf_map *map; 5011 int i; 5012 5013 mutex_lock(&prog->aux->used_maps_mutex); 5014 for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { 5015 map = prog->aux->used_maps[i]; 5016 if (map == (void *)addr) { 5017 *type = BPF_PSEUDO_MAP_FD; 5018 goto out; 5019 } 5020 if (!map->ops->map_direct_value_meta) 5021 continue; 5022 if (!map->ops->map_direct_value_meta(map, addr, off)) { 5023 *type = BPF_PSEUDO_MAP_VALUE; 5024 goto out; 5025 } 5026 } 5027 map = NULL; 5028 5029 out: 5030 mutex_unlock(&prog->aux->used_maps_mutex); 5031 return map; 5032 } 5033 5034 static void prepare_dump_pseudo_call(struct bpf_insn *insn) 5035 { 5036 s32 call_off = insn->imm; 5037 5038 /* 5039 * BPF_CALL_ARGS only exists for interpreter fallback. 5040 * 1. 
For interpreter (BPF_CALL_ARGS): insn->off is the index of 5041 * interpreters_args array, so here using bpf_call_args_imm() 5042 * to get the real address offset. 5043 * 2. For JIT (BPF_CALL): insn->off is the subprog id. 5044 */ 5045 if (insn->code == (BPF_JMP | BPF_CALL_ARGS)) 5046 insn->imm = bpf_call_args_imm(insn->off); 5047 else 5048 insn->imm = insn->off; 5049 5050 /* Avoid dumping a truncated and misleading pc-relative offset. */ 5051 if (call_off > S16_MAX || call_off < S16_MIN) 5052 insn->off = 0; 5053 else 5054 insn->off = call_off; 5055 } 5056 5057 static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, 5058 const struct cred *f_cred) 5059 { 5060 const struct bpf_map *map; 5061 struct bpf_insn *insns; 5062 u32 off, type; 5063 u64 imm; 5064 u8 code; 5065 int i; 5066 5067 insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), 5068 GFP_USER); 5069 if (!insns) 5070 return insns; 5071 5072 for (i = 0; i < prog->len; i++) { 5073 code = insns[i].code; 5074 5075 if (code == (BPF_JMP | BPF_TAIL_CALL)) { 5076 insns[i].code = BPF_JMP | BPF_CALL; 5077 insns[i].imm = BPF_FUNC_tail_call; 5078 /* fall-through */ 5079 } 5080 if (code == (BPF_JMP | BPF_CALL) || 5081 code == (BPF_JMP | BPF_CALL_ARGS)) { 5082 /* Restore the legacy xlated dump layout. 
*/ 5083 if (insns[i].src_reg == BPF_PSEUDO_CALL) 5084 prepare_dump_pseudo_call(&insns[i]); 5085 if (code == (BPF_JMP | BPF_CALL_ARGS)) 5086 insns[i].code = BPF_JMP | BPF_CALL; 5087 if (!bpf_dump_raw_ok(f_cred)) 5088 insns[i].imm = 0; 5089 continue; 5090 } 5091 if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { 5092 insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; 5093 continue; 5094 } 5095 5096 if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX || 5097 BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) { 5098 insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM; 5099 continue; 5100 } 5101 5102 if (code != (BPF_LD | BPF_IMM | BPF_DW)) 5103 continue; 5104 5105 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; 5106 map = bpf_map_from_imm(prog, imm, &off, &type); 5107 if (map) { 5108 insns[i].src_reg = type; 5109 insns[i].imm = map->id; 5110 insns[i + 1].imm = off; 5111 continue; 5112 } 5113 } 5114 5115 return insns; 5116 } 5117 5118 static int set_info_rec_size(struct bpf_prog_info *info) 5119 { 5120 /* 5121 * Ensure info.*_rec_size is the same as kernel expected size 5122 * 5123 * or 5124 * 5125 * Only allow zero *_rec_size if both _rec_size and _cnt are 5126 * zero. In this case, the kernel will set the expected 5127 * _rec_size back to the info. 
5128 */ 5129 5130 if ((info->nr_func_info || info->func_info_rec_size) && 5131 info->func_info_rec_size != sizeof(struct bpf_func_info)) 5132 return -EINVAL; 5133 5134 if ((info->nr_line_info || info->line_info_rec_size) && 5135 info->line_info_rec_size != sizeof(struct bpf_line_info)) 5136 return -EINVAL; 5137 5138 if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && 5139 info->jited_line_info_rec_size != sizeof(__u64)) 5140 return -EINVAL; 5141 5142 info->func_info_rec_size = sizeof(struct bpf_func_info); 5143 info->line_info_rec_size = sizeof(struct bpf_line_info); 5144 info->jited_line_info_rec_size = sizeof(__u64); 5145 5146 return 0; 5147 } 5148 5149 static int bpf_prog_get_info_by_fd(struct file *file, 5150 struct bpf_prog *prog, 5151 const union bpf_attr *attr, 5152 union bpf_attr __user *uattr) 5153 { 5154 struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5155 struct btf *attach_btf = bpf_prog_get_target_btf(prog); 5156 struct bpf_prog_info info; 5157 u32 info_len = attr->info.info_len; 5158 struct bpf_prog_kstats stats; 5159 char __user *uinsns; 5160 u32 ulen, len; 5161 int err; 5162 5163 len = offsetofend(struct bpf_prog_info, attach_btf_id); 5164 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); 5165 if (err) 5166 return err; 5167 info_len = min_t(u32, sizeof(info), info_len); 5168 5169 memset(&info, 0, sizeof(info)); 5170 if (copy_from_user(&info, uinfo, info_len)) 5171 return -EFAULT; 5172 5173 info.type = prog->type; 5174 info.id = prog->aux->id; 5175 info.load_time = prog->aux->load_time; 5176 info.created_by_uid = from_kuid_munged(current_user_ns(), 5177 prog->aux->user->uid); 5178 info.gpl_compatible = prog->gpl_compatible; 5179 5180 memcpy(info.tag, prog->tag, sizeof(prog->tag)); 5181 memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); 5182 5183 mutex_lock(&prog->aux->used_maps_mutex); 5184 ulen = info.nr_map_ids; 5185 info.nr_map_ids = prog->aux->used_map_cnt; 5186 ulen = min_t(u32, 
info.nr_map_ids, ulen); 5187 if (ulen) { 5188 u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); 5189 u32 i; 5190 5191 for (i = 0; i < ulen; i++) 5192 if (put_user(prog->aux->used_maps[i]->id, 5193 &user_map_ids[i])) { 5194 mutex_unlock(&prog->aux->used_maps_mutex); 5195 return -EFAULT; 5196 } 5197 } 5198 mutex_unlock(&prog->aux->used_maps_mutex); 5199 5200 err = set_info_rec_size(&info); 5201 if (err) 5202 return err; 5203 5204 bpf_prog_get_stats(prog, &stats); 5205 info.run_time_ns = stats.nsecs; 5206 info.run_cnt = stats.cnt; 5207 info.recursion_misses = stats.misses; 5208 5209 info.verified_insns = prog->aux->verified_insns; 5210 if (prog->aux->btf) 5211 info.btf_id = btf_obj_id(prog->aux->btf); 5212 5213 if (!bpf_capable()) { 5214 info.jited_prog_len = 0; 5215 info.xlated_prog_len = 0; 5216 info.nr_jited_ksyms = 0; 5217 info.nr_jited_func_lens = 0; 5218 info.nr_func_info = 0; 5219 info.nr_line_info = 0; 5220 info.nr_jited_line_info = 0; 5221 goto done; 5222 } 5223 5224 ulen = info.xlated_prog_len; 5225 info.xlated_prog_len = bpf_prog_insn_size(prog); 5226 if (info.xlated_prog_len && ulen) { 5227 struct bpf_insn *insns_sanitized; 5228 bool fault; 5229 5230 if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) { 5231 insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); 5232 if (!insns_sanitized) 5233 return -ENOMEM; 5234 uinsns = u64_to_user_ptr(info.xlated_prog_insns); 5235 ulen = min_t(u32, info.xlated_prog_len, ulen); 5236 fault = copy_to_user(uinsns, insns_sanitized, ulen); 5237 kfree(insns_sanitized); 5238 if (fault) 5239 return -EFAULT; 5240 } else { 5241 info.xlated_prog_insns = 0; 5242 } 5243 } 5244 5245 if (bpf_prog_is_offloaded(prog->aux)) { 5246 err = bpf_prog_offload_info_fill(&info, prog); 5247 if (err) 5248 return err; 5249 goto done; 5250 } 5251 5252 /* NOTE: the following code is supposed to be skipped for offload. 5253 * bpf_prog_offload_info_fill() is the place to fill similar fields 5254 * for offload. 
5255 */ 5256 ulen = info.jited_prog_len; 5257 if (prog->aux->func_cnt) { 5258 u32 i; 5259 5260 info.jited_prog_len = 0; 5261 for (i = 0; i < prog->aux->func_cnt; i++) 5262 info.jited_prog_len += prog->aux->func[i]->jited_len; 5263 } else { 5264 info.jited_prog_len = prog->jited_len; 5265 } 5266 5267 if (info.jited_prog_len && ulen) { 5268 if (bpf_dump_raw_ok(file->f_cred)) { 5269 uinsns = u64_to_user_ptr(info.jited_prog_insns); 5270 ulen = min_t(u32, info.jited_prog_len, ulen); 5271 5272 /* for multi-function programs, copy the JITed 5273 * instructions for all the functions 5274 */ 5275 if (prog->aux->func_cnt) { 5276 u32 len, free, i; 5277 u8 *img; 5278 5279 free = ulen; 5280 for (i = 0; i < prog->aux->func_cnt; i++) { 5281 len = prog->aux->func[i]->jited_len; 5282 len = min_t(u32, len, free); 5283 img = (u8 *) prog->aux->func[i]->bpf_func; 5284 if (copy_to_user(uinsns, img, len)) 5285 return -EFAULT; 5286 uinsns += len; 5287 free -= len; 5288 if (!free) 5289 break; 5290 } 5291 } else { 5292 if (copy_to_user(uinsns, prog->bpf_func, ulen)) 5293 return -EFAULT; 5294 } 5295 } else { 5296 info.jited_prog_insns = 0; 5297 } 5298 } 5299 5300 ulen = info.nr_jited_ksyms; 5301 info.nr_jited_ksyms = prog->aux->func_cnt ? 
: 1; 5302 if (ulen) { 5303 if (bpf_dump_raw_ok(file->f_cred)) { 5304 unsigned long ksym_addr; 5305 u64 __user *user_ksyms; 5306 u32 i; 5307 5308 /* copy the address of the kernel symbol 5309 * corresponding to each function 5310 */ 5311 ulen = min_t(u32, info.nr_jited_ksyms, ulen); 5312 user_ksyms = u64_to_user_ptr(info.jited_ksyms); 5313 if (prog->aux->func_cnt) { 5314 for (i = 0; i < ulen; i++) { 5315 ksym_addr = (unsigned long) 5316 prog->aux->func[i]->bpf_func; 5317 if (put_user((u64) ksym_addr, 5318 &user_ksyms[i])) 5319 return -EFAULT; 5320 } 5321 } else { 5322 ksym_addr = (unsigned long) prog->bpf_func; 5323 if (put_user((u64) ksym_addr, &user_ksyms[0])) 5324 return -EFAULT; 5325 } 5326 } else { 5327 info.jited_ksyms = 0; 5328 } 5329 } 5330 5331 ulen = info.nr_jited_func_lens; 5332 info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; 5333 if (ulen) { 5334 if (bpf_dump_raw_ok(file->f_cred)) { 5335 u32 __user *user_lens; 5336 u32 func_len, i; 5337 5338 /* copy the JITed image lengths for each function */ 5339 ulen = min_t(u32, info.nr_jited_func_lens, ulen); 5340 user_lens = u64_to_user_ptr(info.jited_func_lens); 5341 if (prog->aux->func_cnt) { 5342 for (i = 0; i < ulen; i++) { 5343 func_len = 5344 prog->aux->func[i]->jited_len; 5345 if (put_user(func_len, &user_lens[i])) 5346 return -EFAULT; 5347 } 5348 } else { 5349 func_len = prog->jited_len; 5350 if (put_user(func_len, &user_lens[0])) 5351 return -EFAULT; 5352 } 5353 } else { 5354 info.jited_func_lens = 0; 5355 } 5356 } 5357 5358 info.attach_btf_id = prog->aux->attach_btf_id; 5359 if (attach_btf) 5360 info.attach_btf_obj_id = btf_obj_id(attach_btf); 5361 5362 ulen = info.nr_func_info; 5363 info.nr_func_info = prog->aux->func_info_cnt; 5364 if (info.nr_func_info && ulen) { 5365 char __user *user_finfo; 5366 5367 user_finfo = u64_to_user_ptr(info.func_info); 5368 ulen = min_t(u32, info.nr_func_info, ulen); 5369 if (copy_to_user(user_finfo, prog->aux->func_info, 5370 info.func_info_rec_size * ulen)) 5371 
return -EFAULT; 5372 } 5373 5374 ulen = info.nr_line_info; 5375 info.nr_line_info = prog->aux->nr_linfo; 5376 if (info.nr_line_info && ulen) { 5377 __u8 __user *user_linfo; 5378 5379 user_linfo = u64_to_user_ptr(info.line_info); 5380 ulen = min_t(u32, info.nr_line_info, ulen); 5381 if (copy_to_user(user_linfo, prog->aux->linfo, 5382 info.line_info_rec_size * ulen)) 5383 return -EFAULT; 5384 } 5385 5386 ulen = info.nr_jited_line_info; 5387 if (prog->aux->jited_linfo) 5388 info.nr_jited_line_info = prog->aux->nr_linfo; 5389 else 5390 info.nr_jited_line_info = 0; 5391 if (info.nr_jited_line_info && ulen) { 5392 if (bpf_dump_raw_ok(file->f_cred)) { 5393 unsigned long line_addr; 5394 __u64 __user *user_linfo; 5395 u32 i; 5396 5397 user_linfo = u64_to_user_ptr(info.jited_line_info); 5398 ulen = min_t(u32, info.nr_jited_line_info, ulen); 5399 for (i = 0; i < ulen; i++) { 5400 line_addr = (unsigned long)prog->aux->jited_linfo[i]; 5401 if (put_user((__u64)line_addr, &user_linfo[i])) 5402 return -EFAULT; 5403 } 5404 } else { 5405 info.jited_line_info = 0; 5406 } 5407 } 5408 5409 ulen = info.nr_prog_tags; 5410 info.nr_prog_tags = prog->aux->func_cnt ? 
: 1; 5411 if (ulen) { 5412 __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; 5413 u32 i; 5414 5415 user_prog_tags = u64_to_user_ptr(info.prog_tags); 5416 ulen = min_t(u32, info.nr_prog_tags, ulen); 5417 if (prog->aux->func_cnt) { 5418 for (i = 0; i < ulen; i++) { 5419 if (copy_to_user(user_prog_tags[i], 5420 prog->aux->func[i]->tag, 5421 BPF_TAG_SIZE)) 5422 return -EFAULT; 5423 } 5424 } else { 5425 if (copy_to_user(user_prog_tags[0], 5426 prog->tag, BPF_TAG_SIZE)) 5427 return -EFAULT; 5428 } 5429 } 5430 5431 done: 5432 if (copy_to_user(uinfo, &info, info_len) || 5433 put_user(info_len, &uattr->info.info_len)) 5434 return -EFAULT; 5435 5436 return 0; 5437 } 5438 5439 static int bpf_map_get_info_by_fd(struct file *file, 5440 struct bpf_map *map, 5441 const union bpf_attr *attr, 5442 union bpf_attr __user *uattr) 5443 { 5444 struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5445 struct bpf_map_info info; 5446 u32 info_len = attr->info.info_len, len; 5447 int err; 5448 5449 len = offsetofend(struct bpf_map_info, hash_size); 5450 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); 5451 if (err) 5452 return err; 5453 info_len = min_t(u32, sizeof(info), info_len); 5454 5455 memset(&info, 0, sizeof(info)); 5456 if (copy_from_user(&info, uinfo, info_len)) 5457 return -EFAULT; 5458 5459 info.type = map->map_type; 5460 info.id = map->id; 5461 info.key_size = map->key_size; 5462 info.value_size = map->value_size; 5463 info.max_entries = map->max_entries; 5464 info.map_flags = map->map_flags; 5465 info.map_extra = map->map_extra; 5466 memcpy(info.name, map->name, sizeof(map->name)); 5467 5468 if (map->btf) { 5469 info.btf_id = btf_obj_id(map->btf); 5470 info.btf_key_type_id = map->btf_key_type_id; 5471 info.btf_value_type_id = map->btf_value_type_id; 5472 } 5473 info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; 5474 if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) 5475 bpf_map_struct_ops_info_fill(&info, map); 5476 5477 if 
(bpf_map_is_offloaded(map)) { 5478 err = bpf_map_offload_info_fill(&info, map); 5479 if (err) 5480 return err; 5481 } 5482 5483 if (info.hash) { 5484 char __user *uhash = u64_to_user_ptr(info.hash); 5485 5486 if (!map->ops->map_get_hash) 5487 return -EINVAL; 5488 if (info.hash_size != sizeof(map->sha)) 5489 return -EINVAL; 5490 if (!READ_ONCE(map->frozen)) 5491 return -EPERM; 5492 5493 err = map->ops->map_get_hash(map); 5494 if (err != 0) 5495 return err; 5496 5497 if (copy_to_user(uhash, map->sha, sizeof(map->sha)) != 0) 5498 return -EFAULT; 5499 } else if (info.hash_size) { 5500 return -EINVAL; 5501 } 5502 5503 if (copy_to_user(uinfo, &info, info_len) || 5504 put_user(info_len, &uattr->info.info_len)) 5505 return -EFAULT; 5506 5507 return 0; 5508 } 5509 5510 static int bpf_btf_get_info_by_fd(struct file *file, 5511 struct btf *btf, 5512 const union bpf_attr *attr, 5513 union bpf_attr __user *uattr) 5514 { 5515 struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5516 u32 info_len = attr->info.info_len; 5517 int err; 5518 5519 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); 5520 if (err) 5521 return err; 5522 5523 return btf_get_info_by_fd(btf, attr, uattr); 5524 } 5525 5526 static int bpf_link_get_info_by_fd(struct file *file, 5527 struct bpf_link *link, 5528 const union bpf_attr *attr, 5529 union bpf_attr __user *uattr) 5530 { 5531 struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5532 struct bpf_link_info info; 5533 u32 info_len = attr->info.info_len; 5534 int err; 5535 5536 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); 5537 if (err) 5538 return err; 5539 info_len = min_t(u32, sizeof(info), info_len); 5540 5541 memset(&info, 0, sizeof(info)); 5542 if (copy_from_user(&info, uinfo, info_len)) 5543 return -EFAULT; 5544 5545 info.type = link->type; 5546 info.id = link->id; 5547 if (link->prog) 5548 info.prog_id = link->prog->aux->id; 5549 5550 if 
(link->ops->fill_link_info) { 5551 err = link->ops->fill_link_info(link, &info); 5552 if (err) 5553 return err; 5554 } 5555 5556 if (copy_to_user(uinfo, &info, info_len) || 5557 put_user(info_len, &uattr->info.info_len)) 5558 return -EFAULT; 5559 5560 return 0; 5561 } 5562 5563 5564 static int token_get_info_by_fd(struct file *file, 5565 struct bpf_token *token, 5566 const union bpf_attr *attr, 5567 union bpf_attr __user *uattr) 5568 { 5569 struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); 5570 u32 info_len = attr->info.info_len; 5571 int err; 5572 5573 err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); 5574 if (err) 5575 return err; 5576 return bpf_token_get_info_by_fd(token, attr, uattr); 5577 } 5578 5579 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info 5580 5581 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, 5582 union bpf_attr __user *uattr) 5583 { 5584 if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) 5585 return -EINVAL; 5586 5587 CLASS(fd, f)(attr->info.bpf_fd); 5588 if (fd_empty(f)) 5589 return -EBADFD; 5590 5591 if (fd_file(f)->f_op == &bpf_prog_fops) 5592 return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, 5593 uattr); 5594 else if (fd_file(f)->f_op == &bpf_map_fops) 5595 return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, 5596 uattr); 5597 else if (fd_file(f)->f_op == &btf_fops) 5598 return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); 5599 else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) 5600 return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, 5601 attr, uattr); 5602 else if (fd_file(f)->f_op == &bpf_token_fops) 5603 return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data, 5604 attr, uattr); 5605 return -EINVAL; 5606 } 5607 5608 #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd 5609 5610 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, struct 
bpf_log_attr *attr_log) 5611 { 5612 struct bpf_token *token = NULL; 5613 5614 if (CHECK_ATTR(BPF_BTF_LOAD)) 5615 return -EINVAL; 5616 5617 if (attr->btf_flags & ~BPF_F_TOKEN_FD) 5618 return -EINVAL; 5619 5620 if (attr->btf_flags & BPF_F_TOKEN_FD) { 5621 token = bpf_token_get_from_fd(attr->btf_token_fd); 5622 if (IS_ERR(token)) 5623 return PTR_ERR(token); 5624 if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) { 5625 bpf_token_put(token); 5626 token = NULL; 5627 } 5628 } 5629 5630 if (!bpf_token_capable(token, CAP_BPF)) { 5631 bpf_token_put(token); 5632 return -EPERM; 5633 } 5634 5635 bpf_token_put(token); 5636 5637 return btf_new_fd(attr, uattr, attr_log); 5638 } 5639 5640 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd 5641 5642 static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) 5643 { 5644 struct bpf_token *token = NULL; 5645 5646 if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) 5647 return -EINVAL; 5648 5649 if (attr->open_flags & ~BPF_F_TOKEN_FD) 5650 return -EINVAL; 5651 5652 if (attr->open_flags & BPF_F_TOKEN_FD) { 5653 token = bpf_token_get_from_fd(attr->fd_by_id_token_fd); 5654 if (IS_ERR(token)) 5655 return PTR_ERR(token); 5656 if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) { 5657 bpf_token_put(token); 5658 token = NULL; 5659 } 5660 } 5661 5662 if (!bpf_token_capable(token, CAP_SYS_ADMIN)) { 5663 bpf_token_put(token); 5664 return -EPERM; 5665 } 5666 5667 bpf_token_put(token); 5668 5669 return btf_get_fd_by_id(attr->btf_id); 5670 } 5671 5672 static int bpf_task_fd_query_copy(const union bpf_attr *attr, 5673 union bpf_attr __user *uattr, 5674 u32 prog_id, u32 fd_type, 5675 const char *buf, u64 probe_offset, 5676 u64 probe_addr) 5677 { 5678 char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); 5679 u32 len = buf ? 
strlen(buf) : 0, input_len; 5680 int err = 0; 5681 5682 if (put_user(len, &uattr->task_fd_query.buf_len)) 5683 return -EFAULT; 5684 input_len = attr->task_fd_query.buf_len; 5685 if (input_len && ubuf) { 5686 if (!len) { 5687 /* nothing to copy, just make ubuf NULL terminated */ 5688 char zero = '\0'; 5689 5690 if (put_user(zero, ubuf)) 5691 return -EFAULT; 5692 } else { 5693 err = bpf_copy_to_user(ubuf, buf, input_len, len); 5694 if (err == -EFAULT) 5695 return err; 5696 } 5697 } 5698 5699 if (put_user(prog_id, &uattr->task_fd_query.prog_id) || 5700 put_user(fd_type, &uattr->task_fd_query.fd_type) || 5701 put_user(probe_offset, &uattr->task_fd_query.probe_offset) || 5702 put_user(probe_addr, &uattr->task_fd_query.probe_addr)) 5703 return -EFAULT; 5704 5705 return err; 5706 } 5707 5708 #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr 5709 5710 static int bpf_task_fd_query(const union bpf_attr *attr, 5711 union bpf_attr __user *uattr) 5712 { 5713 pid_t pid = attr->task_fd_query.pid; 5714 u32 fd = attr->task_fd_query.fd; 5715 const struct perf_event *event; 5716 struct task_struct *task; 5717 struct file *file; 5718 int err; 5719 5720 if (CHECK_ATTR(BPF_TASK_FD_QUERY)) 5721 return -EINVAL; 5722 5723 if (!capable(CAP_SYS_ADMIN)) 5724 return -EPERM; 5725 5726 if (attr->task_fd_query.flags != 0) 5727 return -EINVAL; 5728 5729 rcu_read_lock(); 5730 task = get_pid_task(find_vpid(pid), PIDTYPE_PID); 5731 rcu_read_unlock(); 5732 if (!task) 5733 return -ENOENT; 5734 5735 err = 0; 5736 file = fget_task(task, fd); 5737 put_task_struct(task); 5738 if (!file) 5739 return -EBADF; 5740 5741 if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) { 5742 struct bpf_link *link = file->private_data; 5743 5744 if (link->ops == &bpf_raw_tp_link_lops) { 5745 struct bpf_raw_tp_link *raw_tp = 5746 container_of(link, struct bpf_raw_tp_link, link); 5747 struct bpf_raw_event_map *btp = raw_tp->btp; 5748 5749 err = bpf_task_fd_query_copy(attr, uattr, 5750 
raw_tp->link.prog->aux->id, 5751 BPF_FD_TYPE_RAW_TRACEPOINT, 5752 btp->tp->name, 0, 0); 5753 goto put_file; 5754 } 5755 goto out_not_supp; 5756 } 5757 5758 event = perf_get_event(file); 5759 if (!IS_ERR(event)) { 5760 u64 probe_offset, probe_addr; 5761 u32 prog_id, fd_type; 5762 const char *buf; 5763 5764 err = bpf_get_perf_event_info(event, &prog_id, &fd_type, 5765 &buf, &probe_offset, 5766 &probe_addr, NULL); 5767 if (!err) 5768 err = bpf_task_fd_query_copy(attr, uattr, prog_id, 5769 fd_type, buf, 5770 probe_offset, 5771 probe_addr); 5772 goto put_file; 5773 } 5774 5775 out_not_supp: 5776 err = -ENOTSUPP; 5777 put_file: 5778 fput(file); 5779 return err; 5780 } 5781 5782 #define BPF_MAP_BATCH_LAST_FIELD batch.flags 5783 5784 #define BPF_DO_BATCH(fn, ...) \ 5785 do { \ 5786 if (!fn) { \ 5787 err = -ENOTSUPP; \ 5788 goto err_put; \ 5789 } \ 5790 err = fn(__VA_ARGS__); \ 5791 } while (0) 5792 5793 static int bpf_map_do_batch(const union bpf_attr *attr, 5794 union bpf_attr __user *uattr, 5795 int cmd) 5796 { 5797 bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || 5798 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; 5799 bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; 5800 struct bpf_map *map; 5801 int err; 5802 5803 if (CHECK_ATTR(BPF_MAP_BATCH)) 5804 return -EINVAL; 5805 5806 CLASS(fd, f)(attr->batch.map_fd); 5807 5808 map = __bpf_map_get(f); 5809 if (IS_ERR(map)) 5810 return PTR_ERR(map); 5811 if (has_write) 5812 bpf_map_write_active_inc(map); 5813 if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { 5814 err = -EPERM; 5815 goto err_put; 5816 } 5817 if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { 5818 err = -EPERM; 5819 goto err_put; 5820 } 5821 5822 if (cmd == BPF_MAP_LOOKUP_BATCH) 5823 BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); 5824 else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) 5825 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); 5826 else if (cmd == BPF_MAP_UPDATE_BATCH) 5827 
BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr); 5828 else 5829 BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); 5830 err_put: 5831 if (has_write) { 5832 maybe_wait_bpf_programs(map); 5833 bpf_map_write_active_dec(map); 5834 } 5835 return err; 5836 } 5837 5838 #define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid 5839 static int link_create(union bpf_attr *attr, bpfptr_t uattr) 5840 { 5841 struct bpf_prog *prog; 5842 int ret; 5843 5844 if (CHECK_ATTR(BPF_LINK_CREATE)) 5845 return -EINVAL; 5846 5847 if (attr->link_create.attach_type == BPF_STRUCT_OPS) 5848 return bpf_struct_ops_link_create(attr); 5849 5850 prog = bpf_prog_get(attr->link_create.prog_fd); 5851 if (IS_ERR(prog)) 5852 return PTR_ERR(prog); 5853 5854 ret = bpf_prog_attach_check_attach_type(prog, 5855 attr->link_create.attach_type); 5856 if (ret) 5857 goto out; 5858 5859 switch (prog->type) { 5860 case BPF_PROG_TYPE_CGROUP_SKB: 5861 case BPF_PROG_TYPE_CGROUP_SOCK: 5862 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 5863 case BPF_PROG_TYPE_SOCK_OPS: 5864 case BPF_PROG_TYPE_CGROUP_DEVICE: 5865 case BPF_PROG_TYPE_CGROUP_SYSCTL: 5866 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 5867 ret = cgroup_bpf_link_attach(attr, prog); 5868 break; 5869 case BPF_PROG_TYPE_EXT: 5870 ret = bpf_tracing_prog_attach(prog, 5871 attr->link_create.target_fd, 5872 attr->link_create.target_btf_id, 5873 attr->link_create.tracing.cookie, 5874 attr->link_create.attach_type); 5875 break; 5876 case BPF_PROG_TYPE_LSM: 5877 case BPF_PROG_TYPE_TRACING: 5878 if (attr->link_create.attach_type != prog->expected_attach_type) { 5879 ret = -EINVAL; 5880 goto out; 5881 } 5882 if (prog->expected_attach_type == BPF_TRACE_RAW_TP) 5883 ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie, 5884 attr->link_create.attach_type); 5885 else if (prog->expected_attach_type == BPF_TRACE_ITER) 5886 ret = bpf_iter_link_attach(attr, uattr, prog); 5887 else if (prog->expected_attach_type == BPF_LSM_CGROUP) 5888 ret = 
cgroup_bpf_link_attach(attr, prog); 5889 else if (is_tracing_multi(prog->expected_attach_type)) 5890 ret = bpf_tracing_multi_attach(prog, attr); 5891 else 5892 ret = bpf_tracing_prog_attach(prog, 5893 attr->link_create.target_fd, 5894 attr->link_create.target_btf_id, 5895 attr->link_create.tracing.cookie, 5896 attr->link_create.attach_type); 5897 break; 5898 case BPF_PROG_TYPE_FLOW_DISSECTOR: 5899 case BPF_PROG_TYPE_SK_LOOKUP: 5900 ret = netns_bpf_link_create(attr, prog); 5901 break; 5902 case BPF_PROG_TYPE_SK_MSG: 5903 case BPF_PROG_TYPE_SK_SKB: 5904 ret = sock_map_link_create(attr, prog); 5905 break; 5906 #ifdef CONFIG_NET 5907 case BPF_PROG_TYPE_XDP: 5908 ret = bpf_xdp_link_attach(attr, prog); 5909 break; 5910 case BPF_PROG_TYPE_SCHED_CLS: 5911 if (attr->link_create.attach_type == BPF_TCX_INGRESS || 5912 attr->link_create.attach_type == BPF_TCX_EGRESS) 5913 ret = tcx_link_attach(attr, prog); 5914 else 5915 ret = netkit_link_attach(attr, prog); 5916 break; 5917 case BPF_PROG_TYPE_NETFILTER: 5918 ret = bpf_nf_link_attach(attr, prog); 5919 break; 5920 #endif 5921 case BPF_PROG_TYPE_PERF_EVENT: 5922 case BPF_PROG_TYPE_TRACEPOINT: 5923 ret = bpf_perf_link_attach(attr, prog); 5924 break; 5925 case BPF_PROG_TYPE_KPROBE: 5926 if (attr->link_create.attach_type == BPF_PERF_EVENT) 5927 ret = bpf_perf_link_attach(attr, prog); 5928 else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || 5929 attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) 5930 ret = bpf_kprobe_multi_link_attach(attr, prog); 5931 else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || 5932 attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) 5933 ret = bpf_uprobe_multi_link_attach(attr, prog); 5934 break; 5935 default: 5936 ret = -EINVAL; 5937 } 5938 5939 out: 5940 if (ret < 0) 5941 bpf_prog_put(prog); 5942 return ret; 5943 } 5944 5945 static int link_update_map(struct bpf_link *link, union bpf_attr *attr) 5946 { 5947 struct bpf_map *new_map, *old_map = NULL; 5948 int 
ret; 5949 5950 new_map = bpf_map_get(attr->link_update.new_map_fd); 5951 if (IS_ERR(new_map)) 5952 return PTR_ERR(new_map); 5953 5954 if (attr->link_update.flags & BPF_F_REPLACE) { 5955 old_map = bpf_map_get(attr->link_update.old_map_fd); 5956 if (IS_ERR(old_map)) { 5957 ret = PTR_ERR(old_map); 5958 goto out_put; 5959 } 5960 } else if (attr->link_update.old_map_fd) { 5961 ret = -EINVAL; 5962 goto out_put; 5963 } 5964 5965 ret = link->ops->update_map(link, new_map, old_map); 5966 5967 if (old_map) 5968 bpf_map_put(old_map); 5969 out_put: 5970 bpf_map_put(new_map); 5971 return ret; 5972 } 5973 5974 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd 5975 5976 static int link_update(union bpf_attr *attr) 5977 { 5978 struct bpf_prog *old_prog = NULL, *new_prog; 5979 struct bpf_link *link; 5980 u32 flags; 5981 int ret; 5982 5983 if (CHECK_ATTR(BPF_LINK_UPDATE)) 5984 return -EINVAL; 5985 5986 flags = attr->link_update.flags; 5987 if (flags & ~BPF_F_REPLACE) 5988 return -EINVAL; 5989 5990 link = bpf_link_get_from_fd(attr->link_update.link_fd); 5991 if (IS_ERR(link)) 5992 return PTR_ERR(link); 5993 5994 if (link->ops->update_map) { 5995 ret = link_update_map(link, attr); 5996 goto out_put_link; 5997 } 5998 5999 new_prog = bpf_prog_get(attr->link_update.new_prog_fd); 6000 if (IS_ERR(new_prog)) { 6001 ret = PTR_ERR(new_prog); 6002 goto out_put_link; 6003 } 6004 6005 if (flags & BPF_F_REPLACE) { 6006 old_prog = bpf_prog_get(attr->link_update.old_prog_fd); 6007 if (IS_ERR(old_prog)) { 6008 ret = PTR_ERR(old_prog); 6009 old_prog = NULL; 6010 goto out_put_progs; 6011 } 6012 } else if (attr->link_update.old_prog_fd) { 6013 ret = -EINVAL; 6014 goto out_put_progs; 6015 } 6016 6017 if (link->ops->update_prog) 6018 ret = link->ops->update_prog(link, new_prog, old_prog); 6019 else 6020 ret = -EINVAL; 6021 6022 out_put_progs: 6023 if (old_prog) 6024 bpf_prog_put(old_prog); 6025 if (ret) 6026 bpf_prog_put(new_prog); 6027 out_put_link: 6028 bpf_link_put_direct(link); 6029 return 
ret; 6030 } 6031 6032 #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd 6033 6034 static int link_detach(union bpf_attr *attr) 6035 { 6036 struct bpf_link *link; 6037 int ret; 6038 6039 if (CHECK_ATTR(BPF_LINK_DETACH)) 6040 return -EINVAL; 6041 6042 link = bpf_link_get_from_fd(attr->link_detach.link_fd); 6043 if (IS_ERR(link)) 6044 return PTR_ERR(link); 6045 6046 if (link->ops->detach) 6047 ret = link->ops->detach(link); 6048 else 6049 ret = -EOPNOTSUPP; 6050 6051 bpf_link_put_direct(link); 6052 return ret; 6053 } 6054 6055 struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) 6056 { 6057 return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); 6058 } 6059 EXPORT_SYMBOL(bpf_link_inc_not_zero); 6060 6061 struct bpf_link *bpf_link_by_id(u32 id) 6062 { 6063 struct bpf_link *link; 6064 6065 if (!id) 6066 return ERR_PTR(-ENOENT); 6067 6068 spin_lock_bh(&link_idr_lock); 6069 /* before link is "settled", ID is 0, pretend it doesn't exist yet */ 6070 link = idr_find(&link_idr, id); 6071 if (link) { 6072 if (link->id) 6073 link = bpf_link_inc_not_zero(link); 6074 else 6075 link = ERR_PTR(-EAGAIN); 6076 } else { 6077 link = ERR_PTR(-ENOENT); 6078 } 6079 spin_unlock_bh(&link_idr_lock); 6080 return link; 6081 } 6082 6083 struct bpf_link *bpf_link_get_curr_or_next(u32 *id) 6084 { 6085 struct bpf_link *link; 6086 6087 spin_lock_bh(&link_idr_lock); 6088 again: 6089 link = idr_get_next(&link_idr, id); 6090 if (link) { 6091 link = bpf_link_inc_not_zero(link); 6092 if (IS_ERR(link)) { 6093 (*id)++; 6094 goto again; 6095 } 6096 } 6097 spin_unlock_bh(&link_idr_lock); 6098 6099 return link; 6100 } 6101 6102 #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id 6103 6104 static int bpf_link_get_fd_by_id(const union bpf_attr *attr) 6105 { 6106 struct bpf_link *link; 6107 u32 id = attr->link_id; 6108 int fd; 6109 6110 if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) 6111 return -EINVAL; 6112 6113 if (!capable(CAP_SYS_ADMIN)) 6114 return -EPERM; 6115 6116 link = 
bpf_link_by_id(id); 6117 if (IS_ERR(link)) 6118 return PTR_ERR(link); 6119 6120 fd = bpf_link_new_fd(link); 6121 if (fd < 0) 6122 bpf_link_put_direct(link); 6123 6124 return fd; 6125 } 6126 6127 DEFINE_MUTEX(bpf_stats_enabled_mutex); 6128 6129 static int bpf_stats_release(struct inode *inode, struct file *file) 6130 { 6131 mutex_lock(&bpf_stats_enabled_mutex); 6132 static_key_slow_dec(&bpf_stats_enabled_key.key); 6133 mutex_unlock(&bpf_stats_enabled_mutex); 6134 return 0; 6135 } 6136 6137 static const struct file_operations bpf_stats_fops = { 6138 .release = bpf_stats_release, 6139 }; 6140 6141 static int bpf_enable_runtime_stats(void) 6142 { 6143 int fd; 6144 6145 mutex_lock(&bpf_stats_enabled_mutex); 6146 6147 /* Set a very high limit to avoid overflow */ 6148 if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { 6149 mutex_unlock(&bpf_stats_enabled_mutex); 6150 return -EBUSY; 6151 } 6152 6153 fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); 6154 if (fd >= 0) 6155 static_key_slow_inc(&bpf_stats_enabled_key.key); 6156 6157 mutex_unlock(&bpf_stats_enabled_mutex); 6158 return fd; 6159 } 6160 6161 #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type 6162 6163 static int bpf_enable_stats(union bpf_attr *attr) 6164 { 6165 6166 if (CHECK_ATTR(BPF_ENABLE_STATS)) 6167 return -EINVAL; 6168 6169 if (!capable(CAP_SYS_ADMIN)) 6170 return -EPERM; 6171 6172 switch (attr->enable_stats.type) { 6173 case BPF_STATS_RUN_TIME: 6174 return bpf_enable_runtime_stats(); 6175 default: 6176 break; 6177 } 6178 return -EINVAL; 6179 } 6180 6181 #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags 6182 6183 static int bpf_iter_create(union bpf_attr *attr) 6184 { 6185 struct bpf_link *link; 6186 int err; 6187 6188 if (CHECK_ATTR(BPF_ITER_CREATE)) 6189 return -EINVAL; 6190 6191 if (attr->iter_create.flags) 6192 return -EINVAL; 6193 6194 link = bpf_link_get_from_fd(attr->iter_create.link_fd); 6195 if (IS_ERR(link)) 6196 return PTR_ERR(link); 6197 6198 err = 
bpf_iter_new_fd(link); 6199 bpf_link_put_direct(link); 6200 6201 return err; 6202 } 6203 6204 #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags 6205 6206 static int bpf_prog_bind_map(union bpf_attr *attr) 6207 { 6208 struct bpf_prog *prog; 6209 struct bpf_map *map; 6210 struct bpf_map **used_maps_old, **used_maps_new; 6211 int i, ret = 0; 6212 6213 if (CHECK_ATTR(BPF_PROG_BIND_MAP)) 6214 return -EINVAL; 6215 6216 if (attr->prog_bind_map.flags) 6217 return -EINVAL; 6218 6219 prog = bpf_prog_get(attr->prog_bind_map.prog_fd); 6220 if (IS_ERR(prog)) 6221 return PTR_ERR(prog); 6222 6223 map = bpf_map_get(attr->prog_bind_map.map_fd); 6224 if (IS_ERR(map)) { 6225 ret = PTR_ERR(map); 6226 goto out_prog_put; 6227 } 6228 6229 mutex_lock(&prog->aux->used_maps_mutex); 6230 6231 used_maps_old = prog->aux->used_maps; 6232 6233 for (i = 0; i < prog->aux->used_map_cnt; i++) 6234 if (used_maps_old[i] == map) { 6235 bpf_map_put(map); 6236 goto out_unlock; 6237 } 6238 6239 used_maps_new = kmalloc_objs(used_maps_new[0], 6240 prog->aux->used_map_cnt + 1); 6241 if (!used_maps_new) { 6242 ret = -ENOMEM; 6243 goto out_unlock; 6244 } 6245 6246 /* The bpf program will not access the bpf map, but for the sake of 6247 * simplicity, increase sleepable_refcnt for sleepable program as well. 
6248 */ 6249 if (prog->sleepable) 6250 atomic64_inc(&map->sleepable_refcnt); 6251 memcpy(used_maps_new, used_maps_old, 6252 sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); 6253 used_maps_new[prog->aux->used_map_cnt] = map; 6254 6255 prog->aux->used_map_cnt++; 6256 prog->aux->used_maps = used_maps_new; 6257 6258 kfree(used_maps_old); 6259 6260 out_unlock: 6261 mutex_unlock(&prog->aux->used_maps_mutex); 6262 6263 if (ret) 6264 bpf_map_put(map); 6265 out_prog_put: 6266 bpf_prog_put(prog); 6267 return ret; 6268 } 6269 6270 #define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd 6271 6272 static int token_create(union bpf_attr *attr) 6273 { 6274 if (CHECK_ATTR(BPF_TOKEN_CREATE)) 6275 return -EINVAL; 6276 6277 /* no flags are supported yet */ 6278 if (attr->token_create.flags) 6279 return -EINVAL; 6280 6281 return bpf_token_create(attr); 6282 } 6283 6284 #define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd 6285 6286 static int prog_stream_read(union bpf_attr *attr) 6287 { 6288 char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf); 6289 u32 len = attr->prog_stream_read.stream_buf_len; 6290 struct bpf_prog *prog; 6291 int ret; 6292 6293 if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD)) 6294 return -EINVAL; 6295 6296 prog = bpf_prog_get(attr->prog_stream_read.prog_fd); 6297 if (IS_ERR(prog)) 6298 return PTR_ERR(prog); 6299 6300 ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len); 6301 bpf_prog_put(prog); 6302 6303 return ret; 6304 } 6305 6306 #define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd 6307 6308 static int prog_assoc_struct_ops(union bpf_attr *attr) 6309 { 6310 struct bpf_prog *prog; 6311 struct bpf_map *map; 6312 int ret; 6313 6314 if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) 6315 return -EINVAL; 6316 6317 if (attr->prog_assoc_struct_ops.flags) 6318 return -EINVAL; 6319 6320 prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); 6321 if (IS_ERR(prog)) 6322 return PTR_ERR(prog); 6323 
6324 if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { 6325 ret = -EINVAL; 6326 goto put_prog; 6327 } 6328 6329 map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); 6330 if (IS_ERR(map)) { 6331 ret = PTR_ERR(map); 6332 goto put_prog; 6333 } 6334 6335 if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { 6336 ret = -EINVAL; 6337 goto put_map; 6338 } 6339 6340 ret = bpf_prog_assoc_struct_ops(prog, map); 6341 6342 put_map: 6343 bpf_map_put(map); 6344 put_prog: 6345 bpf_prog_put(prog); 6346 return ret; 6347 } 6348 6349 static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, 6350 bpfptr_t uattr_common, unsigned int size_common) 6351 { 6352 struct bpf_common_attr attr_common; 6353 u32 offsetof_log_true_size = 0; 6354 struct bpf_log_attr attr_log; 6355 union bpf_attr attr; 6356 int err; 6357 6358 err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); 6359 if (err) 6360 return err; 6361 size = min_t(u32, size, sizeof(attr)); 6362 6363 /* copy attributes from user space, may be less than sizeof(bpf_attr) */ 6364 memset(&attr, 0, sizeof(attr)); 6365 if (copy_from_bpfptr(&attr, uattr, size) != 0) 6366 return -EFAULT; 6367 6368 memset(&attr_common, 0, sizeof(attr_common)); 6369 if (cmd & BPF_COMMON_ATTRS) { 6370 err = bpf_check_uarg_tail_zero(uattr_common, 6371 offsetofend(struct bpf_common_attr, log_true_size), 6372 size_common); 6373 if (err) 6374 return err; 6375 6376 cmd &= ~BPF_COMMON_ATTRS; 6377 size_common = min_t(u32, size_common, sizeof(attr_common)); 6378 if (copy_from_bpfptr(&attr_common, uattr_common, size_common) != 0) 6379 return -EFAULT; 6380 } else { 6381 size_common = 0; 6382 } 6383 6384 err = security_bpf(cmd, &attr, size, uattr.is_kernel); 6385 if (err < 0) 6386 return err; 6387 6388 switch (cmd) { 6389 case BPF_MAP_CREATE: 6390 err = map_create(&attr, uattr, &attr_common, uattr_common, size_common); 6391 break; 6392 case BPF_MAP_LOOKUP_ELEM: 6393 err = map_lookup_elem(&attr); 6394 break; 6395 case BPF_MAP_UPDATE_ELEM: 6396 err = 
map_update_elem(&attr, uattr); 6397 break; 6398 case BPF_MAP_DELETE_ELEM: 6399 err = map_delete_elem(&attr, uattr); 6400 break; 6401 case BPF_MAP_GET_NEXT_KEY: 6402 err = map_get_next_key(&attr); 6403 break; 6404 case BPF_MAP_FREEZE: 6405 err = map_freeze(&attr); 6406 break; 6407 case BPF_PROG_LOAD: 6408 if (size >= offsetofend(union bpf_attr, log_true_size)) 6409 offsetof_log_true_size = offsetof(union bpf_attr, log_true_size); 6410 err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level, 6411 offsetof_log_true_size, uattr, &attr_common, uattr_common, 6412 size_common); 6413 err = err ?: bpf_prog_load(&attr, uattr, &attr_log); 6414 break; 6415 case BPF_OBJ_PIN: 6416 err = bpf_obj_pin(&attr); 6417 break; 6418 case BPF_OBJ_GET: 6419 err = bpf_obj_get(&attr); 6420 break; 6421 case BPF_PROG_ATTACH: 6422 err = bpf_prog_attach(&attr); 6423 break; 6424 case BPF_PROG_DETACH: 6425 err = bpf_prog_detach(&attr); 6426 break; 6427 case BPF_PROG_QUERY: 6428 err = bpf_prog_query(&attr, uattr.user, size); 6429 break; 6430 case BPF_PROG_TEST_RUN: 6431 err = bpf_prog_test_run(&attr, uattr.user); 6432 break; 6433 case BPF_PROG_GET_NEXT_ID: 6434 err = bpf_obj_get_next_id(&attr, uattr.user, 6435 &prog_idr, &prog_idr_lock); 6436 break; 6437 case BPF_MAP_GET_NEXT_ID: 6438 err = bpf_obj_get_next_id(&attr, uattr.user, 6439 &map_idr, &map_idr_lock); 6440 break; 6441 case BPF_BTF_GET_NEXT_ID: 6442 err = bpf_obj_get_next_id(&attr, uattr.user, 6443 &btf_idr, &btf_idr_lock); 6444 break; 6445 case BPF_PROG_GET_FD_BY_ID: 6446 err = bpf_prog_get_fd_by_id(&attr); 6447 break; 6448 case BPF_MAP_GET_FD_BY_ID: 6449 err = bpf_map_get_fd_by_id(&attr); 6450 break; 6451 case BPF_OBJ_GET_INFO_BY_FD: 6452 err = bpf_obj_get_info_by_fd(&attr, uattr.user); 6453 break; 6454 case BPF_RAW_TRACEPOINT_OPEN: 6455 err = bpf_raw_tracepoint_open(&attr); 6456 break; 6457 case BPF_BTF_LOAD: 6458 if (size >= offsetofend(union bpf_attr, btf_log_true_size)) 6459 offsetof_log_true_size = offsetof(union 
bpf_attr, btf_log_true_size); 6460 err = bpf_log_attr_init(&attr_log, attr.btf_log_buf, attr.btf_log_size, 6461 attr.btf_log_level, offsetof_log_true_size, uattr, 6462 &attr_common, uattr_common, size_common); 6463 err = err ?: bpf_btf_load(&attr, uattr, &attr_log); 6464 break; 6465 case BPF_BTF_GET_FD_BY_ID: 6466 err = bpf_btf_get_fd_by_id(&attr); 6467 break; 6468 case BPF_TASK_FD_QUERY: 6469 err = bpf_task_fd_query(&attr, uattr.user); 6470 break; 6471 case BPF_MAP_LOOKUP_AND_DELETE_ELEM: 6472 err = map_lookup_and_delete_elem(&attr); 6473 break; 6474 case BPF_MAP_LOOKUP_BATCH: 6475 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); 6476 break; 6477 case BPF_MAP_LOOKUP_AND_DELETE_BATCH: 6478 err = bpf_map_do_batch(&attr, uattr.user, 6479 BPF_MAP_LOOKUP_AND_DELETE_BATCH); 6480 break; 6481 case BPF_MAP_UPDATE_BATCH: 6482 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); 6483 break; 6484 case BPF_MAP_DELETE_BATCH: 6485 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); 6486 break; 6487 case BPF_LINK_CREATE: 6488 err = link_create(&attr, uattr); 6489 break; 6490 case BPF_LINK_UPDATE: 6491 err = link_update(&attr); 6492 break; 6493 case BPF_LINK_GET_FD_BY_ID: 6494 err = bpf_link_get_fd_by_id(&attr); 6495 break; 6496 case BPF_LINK_GET_NEXT_ID: 6497 err = bpf_obj_get_next_id(&attr, uattr.user, 6498 &link_idr, &link_idr_lock); 6499 break; 6500 case BPF_ENABLE_STATS: 6501 err = bpf_enable_stats(&attr); 6502 break; 6503 case BPF_ITER_CREATE: 6504 err = bpf_iter_create(&attr); 6505 break; 6506 case BPF_LINK_DETACH: 6507 err = link_detach(&attr); 6508 break; 6509 case BPF_PROG_BIND_MAP: 6510 err = bpf_prog_bind_map(&attr); 6511 break; 6512 case BPF_TOKEN_CREATE: 6513 err = token_create(&attr); 6514 break; 6515 case BPF_PROG_STREAM_READ_BY_FD: 6516 err = prog_stream_read(&attr); 6517 break; 6518 case BPF_PROG_ASSOC_STRUCT_OPS: 6519 err = prog_assoc_struct_ops(&attr); 6520 break; 6521 default: 6522 err = -EINVAL; 6523 break; 6524 } 6525 
6526 return err; 6527 } 6528 6529 SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size, 6530 struct bpf_common_attr __user *, uattr_common, unsigned int, size_common) 6531 { 6532 return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common); 6533 } 6534 6535 static bool syscall_prog_is_valid_access(int off, int size, 6536 enum bpf_access_type type, 6537 const struct bpf_prog *prog, 6538 struct bpf_insn_access_aux *info) 6539 { 6540 if (off < 0 || off >= U16_MAX) 6541 return false; 6542 /* No alignment requirements for syscall ctx accesses. */ 6543 return true; 6544 } 6545 6546 BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) 6547 { 6548 switch (cmd) { 6549 case BPF_MAP_CREATE: 6550 case BPF_MAP_DELETE_ELEM: 6551 case BPF_MAP_UPDATE_ELEM: 6552 case BPF_MAP_FREEZE: 6553 case BPF_MAP_GET_FD_BY_ID: 6554 case BPF_PROG_LOAD: 6555 case BPF_BTF_LOAD: 6556 case BPF_LINK_CREATE: 6557 case BPF_RAW_TRACEPOINT_OPEN: 6558 break; 6559 default: 6560 return -EINVAL; 6561 } 6562 return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0); 6563 } 6564 6565 6566 /* To shut up -Wmissing-prototypes. 6567 * This function is used by the kernel light skeleton 6568 * to load bpf programs when modules are loaded or during kernel boot. 
6569 * See tools/lib/bpf/skel_internal.h 6570 */ 6571 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); 6572 6573 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) 6574 { 6575 struct bpf_prog * __maybe_unused prog; 6576 struct bpf_tramp_run_ctx __maybe_unused run_ctx; 6577 6578 switch (cmd) { 6579 #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ 6580 case BPF_PROG_TEST_RUN: 6581 if (attr->test.data_in || attr->test.data_out || 6582 attr->test.ctx_out || attr->test.duration || 6583 attr->test.repeat || attr->test.flags) 6584 return -EINVAL; 6585 6586 prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); 6587 if (IS_ERR(prog)) 6588 return PTR_ERR(prog); 6589 6590 if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || 6591 attr->test.ctx_size_in > U16_MAX) { 6592 bpf_prog_put(prog); 6593 return -EINVAL; 6594 } 6595 6596 run_ctx.bpf_cookie = 0; 6597 if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { 6598 /* recursion detected */ 6599 __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); 6600 bpf_prog_put(prog); 6601 return -EBUSY; 6602 } 6603 attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); 6604 __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, 6605 &run_ctx); 6606 bpf_prog_put(prog); 6607 return 0; 6608 #endif 6609 default: 6610 return ____bpf_sys_bpf(cmd, attr, size); 6611 } 6612 } 6613 EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL"); 6614 6615 static const struct bpf_func_proto bpf_sys_bpf_proto = { 6616 .func = bpf_sys_bpf, 6617 .gpl_only = false, 6618 .ret_type = RET_INTEGER, 6619 .arg1_type = ARG_ANYTHING, 6620 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6621 .arg3_type = ARG_CONST_SIZE, 6622 }; 6623 6624 const struct bpf_func_proto * __weak 6625 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 6626 { 6627 return bpf_base_func_proto(func_id, prog); 6628 } 6629 6630 BPF_CALL_1(bpf_sys_close, 
u32, fd) 6631 { 6632 /* When bpf program calls this helper there should not be 6633 * an fdget() without matching completed fdput(). 6634 * This helper is allowed in the following callchain only: 6635 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close 6636 */ 6637 return close_fd(fd); 6638 } 6639 6640 static const struct bpf_func_proto bpf_sys_close_proto = { 6641 .func = bpf_sys_close, 6642 .gpl_only = false, 6643 .ret_type = RET_INTEGER, 6644 .arg1_type = ARG_ANYTHING, 6645 }; 6646 6647 BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res) 6648 { 6649 *res = 0; 6650 if (flags) 6651 return -EINVAL; 6652 6653 if (name_sz <= 1 || name[name_sz - 1]) 6654 return -EINVAL; 6655 6656 if (!bpf_dump_raw_ok(current_cred())) 6657 return -EPERM; 6658 6659 *res = kallsyms_lookup_name(name); 6660 return *res ? 0 : -ENOENT; 6661 } 6662 6663 static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { 6664 .func = bpf_kallsyms_lookup_name, 6665 .gpl_only = false, 6666 .ret_type = RET_INTEGER, 6667 .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6668 .arg2_type = ARG_CONST_SIZE_OR_ZERO, 6669 .arg3_type = ARG_ANYTHING, 6670 .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, 6671 .arg4_size = sizeof(u64), 6672 }; 6673 6674 static const struct bpf_func_proto * 6675 syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 6676 { 6677 switch (func_id) { 6678 case BPF_FUNC_sys_bpf: 6679 return !bpf_token_capable(prog->aux->token, CAP_PERFMON) 6680 ? 
NULL : &bpf_sys_bpf_proto; 6681 case BPF_FUNC_btf_find_by_name_kind: 6682 return &bpf_btf_find_by_name_kind_proto; 6683 case BPF_FUNC_sys_close: 6684 return &bpf_sys_close_proto; 6685 case BPF_FUNC_kallsyms_lookup_name: 6686 return &bpf_kallsyms_lookup_name_proto; 6687 default: 6688 return tracing_prog_func_proto(func_id, prog); 6689 } 6690 } 6691 6692 const struct bpf_verifier_ops bpf_syscall_verifier_ops = { 6693 .get_func_proto = syscall_prog_func_proto, 6694 .is_valid_access = syscall_prog_is_valid_access, 6695 }; 6696 6697 const struct bpf_prog_ops bpf_syscall_prog_ops = { 6698 .test_run = bpf_prog_test_run_syscall, 6699 }; 6700 6701 #ifdef CONFIG_SYSCTL 6702 static int bpf_stats_handler(const struct ctl_table *table, int write, 6703 void *buffer, size_t *lenp, loff_t *ppos) 6704 { 6705 struct static_key *key = (struct static_key *)table->data; 6706 static int saved_val; 6707 int val, ret; 6708 struct ctl_table tmp = { 6709 .data = &val, 6710 .maxlen = sizeof(val), 6711 .mode = table->mode, 6712 .extra1 = SYSCTL_ZERO, 6713 .extra2 = SYSCTL_ONE, 6714 }; 6715 6716 if (write && !capable(CAP_SYS_ADMIN)) 6717 return -EPERM; 6718 6719 mutex_lock(&bpf_stats_enabled_mutex); 6720 val = saved_val; 6721 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 6722 if (write && !ret && val != saved_val) { 6723 if (val) 6724 static_key_slow_inc(key); 6725 else 6726 static_key_slow_dec(key); 6727 saved_val = val; 6728 } 6729 mutex_unlock(&bpf_stats_enabled_mutex); 6730 return ret; 6731 } 6732 6733 void __weak unpriv_ebpf_notify(int new_state) 6734 { 6735 } 6736 6737 static int bpf_unpriv_handler(const struct ctl_table *table, int write, 6738 void *buffer, size_t *lenp, loff_t *ppos) 6739 { 6740 int ret, unpriv_enable = *(int *)table->data; 6741 bool locked_state = unpriv_enable == 1; 6742 struct ctl_table tmp = *table; 6743 6744 if (write && !capable(CAP_SYS_ADMIN)) 6745 return -EPERM; 6746 6747 tmp.data = &unpriv_enable; 6748 ret = proc_dointvec_minmax(&tmp, write, 
buffer, lenp, ppos); 6749 if (write && !ret) { 6750 if (locked_state && unpriv_enable != 1) 6751 return -EPERM; 6752 *(int *)table->data = unpriv_enable; 6753 } 6754 6755 if (write) 6756 unpriv_ebpf_notify(unpriv_enable); 6757 6758 return ret; 6759 } 6760 6761 static const struct ctl_table bpf_syscall_table[] = { 6762 { 6763 .procname = "unprivileged_bpf_disabled", 6764 .data = &sysctl_unprivileged_bpf_disabled, 6765 .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), 6766 .mode = 0644, 6767 .proc_handler = bpf_unpriv_handler, 6768 .extra1 = SYSCTL_ZERO, 6769 .extra2 = SYSCTL_TWO, 6770 }, 6771 { 6772 .procname = "bpf_stats_enabled", 6773 .data = &bpf_stats_enabled_key.key, 6774 .mode = 0644, 6775 .proc_handler = bpf_stats_handler, 6776 }, 6777 }; 6778 6779 static int __init bpf_syscall_sysctl_init(void) 6780 { 6781 register_sysctl_init("kernel", bpf_syscall_table); 6782 return 0; 6783 } 6784 late_initcall(bpf_syscall_sysctl_init); 6785 #endif /* CONFIG_SYSCTL */ 6786