// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bpf-cgroup.h>
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/uidgid.h>
#include <linux/filter.h>
#include <linux/ctype.h>
#include <linux/jiffies.h>
#include <linux/pid_namespace.h>
#include <linux/poison.h>
#include <linux/proc_ns.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>

#include "../../lib/kstrtox.h"

/* If a kernel subsystem allows eBPF programs to call this function, it should
 * return bpf_map_lookup_elem_proto from its own verifier_ops->get_func_proto()
 * callback, so that the verifier can properly check the arguments.
 *
 * Different map implementations rely on RCU in the lookup/update/delete map
 * methods, therefore eBPF programs must run under an RCU lock if they are
 * allowed to access maps; hence the rcu_read_lock_held() checks in all three
 * functions.
 */
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
	return (unsigned long) map->ops->map_lookup_elem(map, key);
}

const struct bpf_func_proto bpf_map_lookup_elem_proto = {
	.func		= bpf_map_lookup_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
};

BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
	   void *, value, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
	return map->ops->map_update_elem(map, key, value, flags);
}

const struct bpf_func_proto bpf_map_update_elem_proto = {
	.func		= bpf_map_update_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
	.arg3_type	= ARG_PTR_TO_MAP_VALUE,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
	return map->ops->map_delete_elem(map, key);
}

const struct bpf_func_proto bpf_map_delete_elem_proto = {
	.func		= bpf_map_delete_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
};
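
/* Example (illustrative sketch, not part of this file): a minimal BPF
 * program using the map helpers above on an ARRAY map. Map, section and
 * function names are hypothetical; standard libbpf macros are assumed.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_ARRAY);
 *		__uint(max_entries, 1);
 *		__type(key, __u32);
 *		__type(value, __u64);
 *	} counters SEC(".maps");
 *
 *	SEC("tracepoint/syscalls/sys_enter_write")
 *	int count_writes(void *ctx)
 *	{
 *		__u32 key = 0;
 *		__u64 *val, one = 1;
 *
 *		val = bpf_map_lookup_elem(&counters, &key);
 *		if (val)
 *			__sync_fetch_and_add(val, 1);
 *		else
 *			bpf_map_update_elem(&counters, &key, &one, BPF_ANY);
 *		return 0;
 *	}
 */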

BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
{
	return map->ops->map_push_elem(map, value, flags);
}

const struct bpf_func_proto bpf_map_push_elem_proto = {
	.func		= bpf_map_push_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_VALUE,
	.arg3_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
{
	return map->ops->map_pop_elem(map, value);
}

const struct bpf_func_proto bpf_map_pop_elem_proto = {
	.func		= bpf_map_pop_elem,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
};

BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
{
	return map->ops->map_peek_elem(map, value);
}

const struct bpf_func_proto bpf_map_peek_elem_proto = {
	.func		= bpf_map_peek_elem,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
};

BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
{
	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
	return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
}

const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
	.func		= bpf_map_lookup_percpu_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
	.arg3_type	= ARG_ANYTHING,
};

const struct bpf_func_proto bpf_get_prandom_u32_proto = {
	.func		= bpf_user_rnd_u32,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_get_smp_processor_id)
{
	return smp_processor_id();
}

const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
	.func		= bpf_get_smp_processor_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_get_numa_node_id)
{
	return numa_node_id();
}

const struct bpf_func_proto bpf_get_numa_node_id_proto = {
	.func		= bpf_get_numa_node_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_ktime_get_ns)
{
	/* NMI safe access to clock monotonic */
	return ktime_get_mono_fast_ns();
}

const struct bpf_func_proto bpf_ktime_get_ns_proto = {
	.func		= bpf_ktime_get_ns,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_ktime_get_boot_ns)
{
	/* NMI safe access to clock boottime */
	return ktime_get_boot_fast_ns();
}

const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
	.func		= bpf_ktime_get_boot_ns,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_ktime_get_coarse_ns)
{
	return ktime_get_coarse_ns();
}

const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
	.func		= bpf_ktime_get_coarse_ns,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_ktime_get_tai_ns)
{
	/* NMI safe access to clock tai */
	return ktime_get_tai_fast_ns();
}

const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
	.func		= bpf_ktime_get_tai_ns,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_get_current_pid_tgid)
{
	struct task_struct *task = current;

	if (unlikely(!task))
		return -EINVAL;

	return (u64) task->tgid << 32 | task->pid;
}

const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
	.func		= bpf_get_current_pid_tgid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_get_current_uid_gid)
{
	struct task_struct *task = current;
	kuid_t uid;
	kgid_t gid;

	if (unlikely(!task))
		return -EINVAL;

	current_uid_gid(&uid, &gid);
	return (u64) from_kgid(&init_user_ns, gid) << 32 |
		     from_kuid(&init_user_ns, uid);
}

const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
	.func		= bpf_get_current_uid_gid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
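
/* Example (illustrative sketch): unpacking the packed u64 return values of
 * the two helpers above from BPF program C code. The tgid (userspace "PID")
 * sits in the upper 32 bits and the pid (userspace "TID") in the lower
 * ones; the same layout applies to gid/uid.
 *
 *	__u64 pid_tgid = bpf_get_current_pid_tgid();
 *	__u32 tgid = pid_tgid >> 32;		// thread group id
 *	__u32 pid = (__u32)pid_tgid;		// thread id
 *
 *	__u64 uid_gid = bpf_get_current_uid_gid();
 *	__u32 gid = uid_gid >> 32;
 *	__u32 uid = (__u32)uid_gid;
 */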

BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
{
	struct task_struct *task = current;

	if (unlikely(!task))
		goto err_clear;

	/* Verifier guarantees that size > 0 */
	strscpy(buf, task->comm, size);
	return 0;
err_clear:
	memset(buf, 0, size);
	return -EINVAL;
}

const struct bpf_func_proto bpf_get_current_comm_proto = {
	.func		= bpf_get_current_comm,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE,
};

#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)

static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
{
	arch_spinlock_t *l = (void *)lock;
	union {
		__u32 val;
		arch_spinlock_t lock;
	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };

	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
	arch_spin_lock(l);
}

static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
{
	arch_spinlock_t *l = (void *)lock;

	arch_spin_unlock(l);
}

#else

static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
{
	atomic_t *l = (void *)lock;

	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
	do {
		atomic_cond_read_relaxed(l, !VAL);
	} while (atomic_xchg(l, 1));
}

static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
{
	atomic_t *l = (void *)lock;

	atomic_set_release(l, 0);
}

#endif

static DEFINE_PER_CPU(unsigned long, irqsave_flags);

static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
{
	unsigned long flags;

	local_irq_save(flags);
	__bpf_spin_lock(lock);
	__this_cpu_write(irqsave_flags, flags);
}

notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
{
	__bpf_spin_lock_irqsave(lock);
	return 0;
}

const struct bpf_func_proto bpf_spin_lock_proto = {
	.func		= bpf_spin_lock,
	.gpl_only	= false,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
	.arg1_btf_id	= BPF_PTR_POISON,
};

static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
{
	unsigned long flags;

	flags = __this_cpu_read(irqsave_flags);
	__bpf_spin_unlock(lock);
	local_irq_restore(flags);
}

notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
{
	__bpf_spin_unlock_irqrestore(lock);
	return 0;
}

const struct bpf_func_proto bpf_spin_unlock_proto = {
	.func		= bpf_spin_unlock,
	.gpl_only	= false,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
	.arg1_btf_id	= BPF_PTR_POISON,
};
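
/* Example (illustrative sketch): protecting a map value field with the two
 * helpers above. The struct and variable names are hypothetical; the
 * bpf_spin_lock must be a member of the map value type.
 *
 *	struct map_val {
 *		struct bpf_spin_lock lock;
 *		__u64 cnt;
 *	};
 *
 *	struct map_val *v = bpf_map_lookup_elem(&vals, &key);
 *	if (v) {
 *		bpf_spin_lock(&v->lock);
 *		v->cnt++;
 *		bpf_spin_unlock(&v->lock);
 *	}
 */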

void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
			   bool lock_src)
{
	struct bpf_spin_lock *lock;

	if (lock_src)
		lock = src + map->record->spin_lock_off;
	else
		lock = dst + map->record->spin_lock_off;
	preempt_disable();
	__bpf_spin_lock_irqsave(lock);
	copy_map_value(map, dst, src);
	__bpf_spin_unlock_irqrestore(lock);
	preempt_enable();
}

BPF_CALL_0(bpf_jiffies64)
{
	return get_jiffies_64();
}

const struct bpf_func_proto bpf_jiffies64_proto = {
	.func		= bpf_jiffies64,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
	struct cgroup *cgrp;
	u64 cgrp_id;

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	cgrp_id = cgroup_id(cgrp);
	rcu_read_unlock();

	return cgrp_id;
}

const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
	.func		= bpf_get_current_cgroup_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
{
	struct cgroup *cgrp;
	struct cgroup *ancestor;
	u64 cgrp_id;

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	ancestor = cgroup_ancestor(cgrp, ancestor_level);
	cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
	rcu_read_unlock();

	return cgrp_id;
}

const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
	.func		= bpf_get_current_ancestor_cgroup_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
};
#endif /* CONFIG_CGROUPS */

#define BPF_STRTOX_BASE_MASK 0x1F

static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
			  unsigned long long *res, bool *is_negative)
{
	unsigned int base = flags & BPF_STRTOX_BASE_MASK;
	const char *cur_buf = buf;
	size_t cur_len = buf_len;
	unsigned int consumed;
	size_t val_len;
	char str[64];

	if (!buf || !buf_len || !res || !is_negative)
		return -EINVAL;

	if (base != 0 && base != 8 && base != 10 && base != 16)
		return -EINVAL;

	if (flags & ~BPF_STRTOX_BASE_MASK)
		return -EINVAL;

	while (cur_buf < buf + buf_len && isspace(*cur_buf))
		++cur_buf;

	*is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
	if (*is_negative)
		++cur_buf;

	consumed = cur_buf - buf;
	cur_len -= consumed;
	if (!cur_len)
		return -EINVAL;

	cur_len = min(cur_len, sizeof(str) - 1);
	memcpy(str, cur_buf, cur_len);
	str[cur_len] = '\0';
	cur_buf = str;

	cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
	val_len = _parse_integer(cur_buf, base, res);

	if (val_len & KSTRTOX_OVERFLOW)
		return -ERANGE;

	if (val_len == 0)
		return -EINVAL;

	cur_buf += val_len;
	consumed += cur_buf - str;

	return consumed;
}

static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
			 long long *res)
{
	unsigned long long _res;
	bool is_negative;
	int err;

	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
	if (err < 0)
		return err;
	if (is_negative) {
		if ((long long)-_res > 0)
			return -ERANGE;
		*res = -_res;
	} else {
		if ((long long)_res < 0)
			return -ERANGE;
		*res = _res;
	}
	return err;
}

BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
	   long *, res)
{
	long long _res;
	int err;

	err = __bpf_strtoll(buf, buf_len, flags, &_res);
	if (err < 0)
		return err;
	if (_res != (long)_res)
		return -ERANGE;
	*res = _res;
	return err;
}

const struct bpf_func_proto bpf_strtol_proto = {
	.func		= bpf_strtol,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg2_type	= ARG_CONST_SIZE,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_LONG,
};
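
/* Example (illustrative sketch): parsing a decimal value with bpf_strtol()
 * from a cgroup sysctl program, one typical user of this helper. Buffer
 * handling is simplified and the attach details are omitted.
 *
 *	char buf[16] = {};
 *	long val;
 *
 *	if (bpf_sysctl_get_new_value(ctx, buf, sizeof(buf)) < 0)
 *		return 0;
 *	// flags == 0 auto-detects base 8/10/16, as in __bpf_strtoull() above
 *	if (bpf_strtol(buf, sizeof(buf), 0, &val) < 0)
 *		return 0;
 */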

BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
	   unsigned long *, res)
{
	unsigned long long _res;
	bool is_negative;
	int err;

	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
	if (err < 0)
		return err;
	if (is_negative)
		return -EINVAL;
	if (_res != (unsigned long)_res)
		return -ERANGE;
	*res = _res;
	return err;
}

const struct bpf_func_proto bpf_strtoul_proto = {
	.func		= bpf_strtoul,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg2_type	= ARG_CONST_SIZE,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_LONG,
};

BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
{
	return strncmp(s1, s2, s1_sz);
}

static const struct bpf_func_proto bpf_strncmp_proto = {
	.func		= bpf_strncmp,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM,
	.arg2_type	= ARG_CONST_SIZE,
	.arg3_type	= ARG_PTR_TO_CONST_STR,
};

BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
	   struct bpf_pidns_info *, nsdata, u32, size)
{
	struct task_struct *task = current;
	struct pid_namespace *pidns;
	int err = -EINVAL;

	if (unlikely(size != sizeof(struct bpf_pidns_info)))
		goto clear;

	if (unlikely((u64)(dev_t)dev != dev))
		goto clear;

	if (unlikely(!task))
		goto clear;

	pidns = task_active_pid_ns(task);
	if (unlikely(!pidns)) {
		err = -ENOENT;
		goto clear;
	}

	if (!ns_match(&pidns->ns, (dev_t)dev, ino))
		goto clear;

	nsdata->pid = task_pid_nr_ns(task, pidns);
	nsdata->tgid = task_tgid_nr_ns(task, pidns);
	return 0;
clear:
	memset((void *)nsdata, 0, (size_t) size);
	return err;
}

const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
	.func		= bpf_get_ns_current_pid_tgid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
};

static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
	.func		= bpf_get_raw_cpu_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
{
	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;

	return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
}

const struct bpf_func_proto bpf_event_output_data_proto = {
	.func		= bpf_event_output_data,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
};

BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
	   const void __user *, user_ptr)
{
	int ret = copy_from_user(dst, user_ptr, size);

	if (unlikely(ret)) {
		memset(dst, 0, size);
		ret = -EFAULT;
	}

	return ret;
}

const struct bpf_func_proto bpf_copy_from_user_proto = {
	.func		= bpf_copy_from_user,
	.gpl_only	= false,
	.might_sleep	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
};
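
/* Example (illustrative sketch): reading user memory from a sleepable BPF
 * program, which is where this might_sleep helper is available. The attach
 * point and program name are hypothetical.
 *
 *	SEC("fentry.s/__x64_sys_nanosleep")
 *	int BPF_PROG(on_nanosleep, struct pt_regs *regs)
 *	{
 *		struct __kernel_timespec ts;
 *		void *uptr = (void *)PT_REGS_PARM1_CORE(regs);
 *
 *		if (bpf_copy_from_user(&ts, sizeof(ts), uptr))
 *			return 0;	// faulted; ts was zeroed by the helper
 *		return 0;
 *	}
 */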

BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
	   const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
{
	int ret;

	/* flags is not used yet */
	if (unlikely(flags))
		return -EINVAL;

	if (unlikely(!size))
		return 0;

	ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
	if (ret == size)
		return 0;

	memset(dst, 0, size);
	/* Return -EFAULT for partial read */
	return ret < 0 ? ret : -EFAULT;
}

const struct bpf_func_proto bpf_copy_from_user_task_proto = {
	.func		= bpf_copy_from_user_task,
	.gpl_only	= true,
	.might_sleep	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_BTF_ID,
	.arg4_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg5_type	= ARG_ANYTHING
};

BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
{
	if (cpu >= nr_cpu_ids)
		return (unsigned long)NULL;

	return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
}

const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
	.func		= bpf_per_cpu_ptr,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
{
	return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
}

const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
	.func		= bpf_this_cpu_ptr,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
};
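
/* Example (illustrative sketch): using bpf_per_cpu_ptr() with a per-CPU
 * kernel symbol declared via __ksym in BPF program C code, mirroring the
 * pattern used by the kernel selftests.
 *
 *	extern const struct rq runqueues __ksym;	// per-CPU kernel symbol
 *
 *	const struct rq *rq = bpf_per_cpu_ptr(&runqueues, cpu);
 *	if (rq)
 *		nr_running = rq->nr_running;
 */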

static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
				 size_t bufsz)
{
	void __user *user_ptr = (__force void __user *)unsafe_ptr;

	buf[0] = 0;

	switch (fmt_ptype) {
	case 's':
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
		if ((unsigned long)unsafe_ptr < TASK_SIZE)
			return strncpy_from_user_nofault(buf, user_ptr, bufsz);
		fallthrough;
#endif
	case 'k':
		return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
	case 'u':
		return strncpy_from_user_nofault(buf, user_ptr, bufsz);
	}

	return -EINVAL;
}

/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
 * arguments representation.
 */
#define MAX_BPRINTF_BUF_LEN	512

/* Support executing three nested bprintf helper calls on a given CPU */
#define MAX_BPRINTF_NEST_LEVEL	3
struct bpf_bprintf_buffers {
	char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN];
};
static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);

static int try_get_fmt_tmp_buf(char **tmp_buf)
{
	struct bpf_bprintf_buffers *bufs;
	int nest_level;

	preempt_disable();
	nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
	if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
		this_cpu_dec(bpf_bprintf_nest_level);
		preempt_enable();
		return -EBUSY;
	}
	bufs = this_cpu_ptr(&bpf_bprintf_bufs);
	*tmp_buf = bufs->tmp_bufs[nest_level - 1];

	return 0;
}

void bpf_bprintf_cleanup(void)
{
	if (this_cpu_read(bpf_bprintf_nest_level)) {
		this_cpu_dec(bpf_bprintf_nest_level);
		preempt_enable();
	}
}

/*
 * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
 *
 * Returns a negative value if fmt is an invalid format string or 0 otherwise.
 *
 * This can be used in two ways:
 * - Format string verification only: when bin_args is NULL
 * - Arguments preparation: in addition to the above verification, it writes in
 *   bin_args a binary representation of arguments usable by bstr_printf where
 *   pointers from BPF have been sanitized.
 *
 * In argument preparation mode, if 0 is returned, safe temporary buffers are
 * allocated and bpf_bprintf_cleanup should be called to free them after use.
 */
int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
			u32 **bin_args, u32 num_args)
{
	char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
	size_t sizeof_cur_arg, sizeof_cur_ip;
	int err, i, num_spec = 0;
	u64 cur_arg;
	char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";

	fmt_end = strnchr(fmt, fmt_size, 0);
	if (!fmt_end)
		return -EINVAL;
	fmt_size = fmt_end - fmt;

	if (bin_args) {
		if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
			return -EBUSY;

		tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN;
		*bin_args = (u32 *)tmp_buf;
	}

	for (i = 0; i < fmt_size; i++) {
		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
			err = -EINVAL;
			goto out;
		}

		if (fmt[i] != '%')
			continue;

		if (fmt[i + 1] == '%') {
			i++;
			continue;
		}

		if (num_spec >= num_args) {
			err = -EINVAL;
			goto out;
		}

		/* The string is zero-terminated so if fmt[i] != 0, we can
		 * always access fmt[i + 1], in the worst case it will be a 0
		 */
		i++;

		/* skip optional "[0 +-][num]" width formatting field */
		while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
		       fmt[i] == ' ')
			i++;
		if (fmt[i] >= '1' && fmt[i] <= '9') {
			i++;
			while (fmt[i] >= '0' && fmt[i] <= '9')
				i++;
		}

		if (fmt[i] == 'p') {
			sizeof_cur_arg = sizeof(long);

			if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
			    fmt[i + 2] == 's') {
				fmt_ptype = fmt[i + 1];
				i += 2;
				goto fmt_str;
			}

			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
			    ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
			    fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
			    fmt[i + 1] == 'S') {
				/* just kernel pointers */
				if (tmp_buf)
					cur_arg = raw_args[num_spec];
				i++;
				goto nocopy_fmt;
			}

			if (fmt[i + 1] == 'B') {
				if (tmp_buf) {
					err = snprintf(tmp_buf,
						       (tmp_buf_end - tmp_buf),
						       "%pB",
						       (void *)(long)raw_args[num_spec]);
					tmp_buf += (err + 1);
				}

				i++;
				num_spec++;
				continue;
			}

			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
			if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
			    (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
				err = -EINVAL;
				goto out;
			}

			i += 2;
			if (!tmp_buf)
				goto nocopy_fmt;

			sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
			if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
				err = -ENOSPC;
				goto out;
			}

			unsafe_ptr = (char *)(long)raw_args[num_spec];
			err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
						       sizeof_cur_ip);
			if (err < 0)
				memset(cur_ip, 0, sizeof_cur_ip);

			/* hack: bstr_printf expects IP addresses to be
			 * pre-formatted as strings; ironically, the easiest way
			 * to do that is to call snprintf.
			 */
			ip_spec[2] = fmt[i - 1];
			ip_spec[3] = fmt[i];
			err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
				       ip_spec, &cur_ip);

			tmp_buf += err + 1;
			num_spec++;

			continue;
		} else if (fmt[i] == 's') {
			fmt_ptype = fmt[i];
fmt_str:
			if (fmt[i + 1] != 0 &&
			    !isspace(fmt[i + 1]) &&
			    !ispunct(fmt[i + 1])) {
				err = -EINVAL;
				goto out;
			}

			if (!tmp_buf)
				goto nocopy_fmt;

			if (tmp_buf_end == tmp_buf) {
				err = -ENOSPC;
				goto out;
			}

			unsafe_ptr = (char *)(long)raw_args[num_spec];
			err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
						    fmt_ptype,
						    tmp_buf_end - tmp_buf);
			if (err < 0) {
				tmp_buf[0] = '\0';
				err = 1;
			}

			tmp_buf += err;
			num_spec++;

			continue;
		} else if (fmt[i] == 'c') {
			if (!tmp_buf)
				goto nocopy_fmt;

			if (tmp_buf_end == tmp_buf) {
				err = -ENOSPC;
				goto out;
			}

			*tmp_buf = raw_args[num_spec];
			tmp_buf++;
			num_spec++;

			continue;
		}

		sizeof_cur_arg = sizeof(int);

		if (fmt[i] == 'l') {
			sizeof_cur_arg = sizeof(long);
			i++;
		}
		if (fmt[i] == 'l') {
			sizeof_cur_arg = sizeof(long long);
			i++;
		}

		if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
		    fmt[i] != 'x' && fmt[i] != 'X') {
			err = -EINVAL;
			goto out;
		}

		if (tmp_buf)
			cur_arg = raw_args[num_spec];
nocopy_fmt:
		if (tmp_buf) {
			tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
			if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
				err = -ENOSPC;
				goto out;
			}

			if (sizeof_cur_arg == 8) {
				*(u32 *)tmp_buf = *(u32 *)&cur_arg;
				*(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
			} else {
				*(u32 *)tmp_buf = (u32)(long)cur_arg;
			}
			tmp_buf += sizeof_cur_arg;
		}
		num_spec++;
	}

	err = 0;
out:
	if (err)
		bpf_bprintf_cleanup();
	return err;
}

BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
	   const void *, data, u32, data_len)
{
	int err, num_args;
	u32 *bin_args;

	if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
	    (data_len && !data))
		return -EINVAL;
	num_args = data_len / 8;

	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
	 * can safely give an unbounded size.
	 */
	err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args);
	if (err < 0)
		return err;

	err = bstr_printf(str, str_size, fmt, bin_args);

	bpf_bprintf_cleanup();

	return err + 1;
}

const struct bpf_func_proto bpf_snprintf_proto = {
	.func		= bpf_snprintf,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_CONST_STR,
	.arg4_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
};
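
/* Example (illustrative sketch): calling bpf_snprintf() from a BPF program.
 * The u64 data array carries one entry per conversion specifier and its
 * byte size must be a multiple of 8, matching the checks above. The
 * variable names are hypothetical.
 *
 *	char out[64];
 *	__u64 args[] = { (__u64)tgid, (__u64)(long)name_ptr };
 *
 *	bpf_snprintf(out, sizeof(out), "tgid=%u comm=%s", args, sizeof(args));
 */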

/* BPF map elements can contain 'struct bpf_timer'.
 * Such a map owns all of its BPF timers.
 * 'struct bpf_timer' is allocated as part of map element allocation
 * and is zero-initialized.
 * That space is used to keep 'struct bpf_timer_kern'.
 * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
 * remembers the 'struct bpf_map *' pointer it's part of.
 * bpf_timer_set_callback() increments prog refcnt and assigns bpf callback_fn.
 * bpf_timer_start() arms the timer.
 * If the user space reference to a map goes to zero at this point,
 * the ops->map_release_uref callback is responsible for cancelling the timers,
 * freeing their memory, and decrementing prog's refcnts.
 * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
 * Inner maps can contain bpf timers as well. ops->map_release_uref is
 * freeing the timers when inner map is replaced or deleted by user space.
 */
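
/* Example (illustrative sketch): the lifecycle described above as seen from
 * a BPF program. The map, element type and callback names are hypothetical.
 *
 *	struct elem {
 *		struct bpf_timer t;
 *	};
 *
 *	static int timer_cb(void *map, int *key, struct bpf_timer *timer)
 *	{
 *		return 0;	// the verifier requires a zero return
 *	}
 *
 *	struct elem *e = bpf_map_lookup_elem(&timers, &key);
 *	if (e) {
 *		bpf_timer_init(&e->t, &timers, CLOCK_MONOTONIC);
 *		bpf_timer_set_callback(&e->t, timer_cb);
 *		bpf_timer_start(&e->t, 1000000, 0);	// 1 ms in ns
 *	}
 */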

struct bpf_hrtimer {
	struct hrtimer timer;
	struct bpf_map *map;
	struct bpf_prog *prog;
	void __rcu *callback_fn;
	void *value;
};

/* the actual struct hidden inside uapi struct bpf_timer */
struct bpf_timer_kern {
	struct bpf_hrtimer *timer;
	/* bpf_spin_lock is used here instead of spinlock_t to make
	 * sure that it always fits into space reserved by struct bpf_timer
	 * regardless of LOCKDEP and spinlock debug flags.
	 */
	struct bpf_spin_lock lock;
} __attribute__((aligned(8)));

static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);

static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
	struct bpf_map *map = t->map;
	void *value = t->value;
	bpf_callback_t callback_fn;
	void *key;
	u32 idx;

	BTF_TYPE_EMIT(struct bpf_timer);
	callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
	if (!callback_fn)
		goto out;

	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
	 * Remember the timer this callback is servicing to prevent
	 * deadlock if callback_fn() calls bpf_timer_cancel() or
	 * bpf_map_delete_elem() on the same timer.
	 */
	this_cpu_write(hrtimer_running, t);
	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
		struct bpf_array *array = container_of(map, struct bpf_array, map);

		/* compute the key */
		idx = ((char *)value - array->value) / array->elem_size;
		key = &idx;
	} else { /* hash or lru */
		key = value - round_up(map->key_size, 8);
	}

	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
	/* The verifier checked that return value is zero. */

	this_cpu_write(hrtimer_running, NULL);
out:
	return HRTIMER_NORESTART;
}

BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
	   u64, flags)
{
	clockid_t clockid = flags & (MAX_CLOCKS - 1);
	struct bpf_hrtimer *t;
	int ret = 0;

	BUILD_BUG_ON(MAX_CLOCKS != 16);
	BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
	BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));

	if (in_nmi())
		return -EOPNOTSUPP;

	if (flags >= MAX_CLOCKS ||
	    /* similar to timerfd except _ALARM variants are not supported */
	    (clockid != CLOCK_MONOTONIC &&
	     clockid != CLOCK_REALTIME &&
	     clockid != CLOCK_BOOTTIME))
		return -EINVAL;
	__bpf_spin_lock_irqsave(&timer->lock);
	t = timer->timer;
	if (t) {
		ret = -EBUSY;
		goto out;
	}
	if (!atomic64_read(&map->usercnt)) {
		/* maps with timers must be either held by user space
		 * or pinned in bpffs.
		 */
		ret = -EPERM;
		goto out;
	}
	/* allocate hrtimer via map_kmalloc to use memcg accounting */
	t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
	if (!t) {
		ret = -ENOMEM;
		goto out;
	}
	t->value = (void *)timer - map->record->timer_off;
	t->map = map;
	t->prog = NULL;
	rcu_assign_pointer(t->callback_fn, NULL);
	hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
	t->timer.function = bpf_timer_cb;
	timer->timer = t;
out:
	__bpf_spin_unlock_irqrestore(&timer->lock);
	return ret;
}

static const struct bpf_func_proto bpf_timer_init_proto = {
	.func		= bpf_timer_init,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
	   struct bpf_prog_aux *, aux)
{
	struct bpf_prog *prev, *prog = aux->prog;
	struct bpf_hrtimer *t;
	int ret = 0;

	if (in_nmi())
		return -EOPNOTSUPP;
	__bpf_spin_lock_irqsave(&timer->lock);
	t = timer->timer;
	if (!t) {
		ret = -EINVAL;
		goto out;
	}
	if (!atomic64_read(&t->map->usercnt)) {
		/* maps with timers must be either held by user space
		 * or pinned in bpffs. Otherwise timer might still be
		 * running even when bpf prog is detached and user space
		 * is gone, since map_release_uref won't ever be called.
		 */
		ret = -EPERM;
		goto out;
	}
	prev = t->prog;
	if (prev != prog) {
		/* Bump prog refcnt once. Every bpf_timer_set_callback()
		 * can pick different callback_fn-s within the same prog.
		 */
		prog = bpf_prog_inc_not_zero(prog);
		if (IS_ERR(prog)) {
			ret = PTR_ERR(prog);
			goto out;
		}
		if (prev)
			/* Drop prev prog refcnt when swapping with new prog */
			bpf_prog_put(prev);
		t->prog = prog;
	}
	rcu_assign_pointer(t->callback_fn, callback_fn);
out:
	__bpf_spin_unlock_irqrestore(&timer->lock);
	return ret;
}

static const struct bpf_func_proto bpf_timer_set_callback_proto = {
	.func		= bpf_timer_set_callback,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_PTR_TO_FUNC,
};

BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
{
	struct bpf_hrtimer *t;
	int ret = 0;

	if (in_nmi())
		return -EOPNOTSUPP;
	if (flags)
		return -EINVAL;
	__bpf_spin_lock_irqsave(&timer->lock);
	t = timer->timer;
	if (!t || !t->prog) {
		ret = -EINVAL;
		goto out;
	}
	hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
out:
	__bpf_spin_unlock_irqrestore(&timer->lock);
	return ret;
}

static const struct bpf_func_proto bpf_timer_start_proto = {
	.func		= bpf_timer_start,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

static void drop_prog_refcnt(struct bpf_hrtimer *t)
{
	struct bpf_prog *prog = t->prog;

	if (prog) {
		bpf_prog_put(prog);
		t->prog = NULL;
		rcu_assign_pointer(t->callback_fn, NULL);
	}
}

BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
{
	struct bpf_hrtimer *t;
	int ret = 0;

	if (in_nmi())
		return -EOPNOTSUPP;
	__bpf_spin_lock_irqsave(&timer->lock);
	t = timer->timer;
	if (!t) {
		ret = -EINVAL;
		goto out;
	}
	if (this_cpu_read(hrtimer_running) == t) {
		/* If bpf callback_fn is trying to bpf_timer_cancel()
		 * its own timer the hrtimer_cancel() will deadlock
		 * since it waits for callback_fn to finish
		 */
		ret = -EDEADLK;
		goto out;
	}
	drop_prog_refcnt(t);
out:
	__bpf_spin_unlock_irqrestore(&timer->lock);
	/* Cancel the timer and wait for associated callback to finish
	 * if it was running.
	 */
	ret = ret ?: hrtimer_cancel(&t->timer);
	return ret;
}

static const struct bpf_func_proto bpf_timer_cancel_proto = {
	.func		= bpf_timer_cancel,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
};

/* This function is called by map_delete/update_elem for individual element and
 * by ops->map_release_uref when the user space reference to a map reaches zero.
 */
void bpf_timer_cancel_and_free(void *val)
{
	struct bpf_timer_kern *timer = val;
	struct bpf_hrtimer *t;

	/* Performance optimization: read timer->timer without lock first. */
	if (!READ_ONCE(timer->timer))
		return;

	__bpf_spin_lock_irqsave(&timer->lock);
	/* re-read it under lock */
	t = timer->timer;
	if (!t)
		goto out;
	drop_prog_refcnt(t);
	/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
	 * this timer, since it won't be initialized.
	 */
	timer->timer = NULL;
out:
	__bpf_spin_unlock_irqrestore(&timer->lock);
	if (!t)
		return;
	/* Cancel the timer and wait for callback to complete if it was running.
	 * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
	 * right after for both preallocated and non-preallocated maps.
	 * The timer->timer = NULL was already done and no code path can
	 * see address 't' anymore.
	 *
	 * Check that bpf_map_delete/update_elem() wasn't called from timer
	 * callback_fn. In such case don't call hrtimer_cancel() (since it will
	 * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
	 * return -1). Though callback_fn is still running on this cpu it's
	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed
	 * from 't'. The bpf subprog callback_fn won't be able to access 't',
	 * since timer->timer = NULL was already done. The timer will be
	 * effectively cancelled because bpf_timer_cb() will return
	 * HRTIMER_NORESTART.
	 */
	if (this_cpu_read(hrtimer_running) != t)
		hrtimer_cancel(&t->timer);
	kfree(t);
}

BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
{
	unsigned long *kptr = map_value;

	return xchg(kptr, (unsigned long)ptr);
}
1388 */ 1389 static const struct bpf_func_proto bpf_kptr_xchg_proto = { 1390 .func = bpf_kptr_xchg, 1391 .gpl_only = false, 1392 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, 1393 .ret_btf_id = BPF_PTR_POISON, 1394 .arg1_type = ARG_PTR_TO_KPTR, 1395 .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, 1396 .arg2_btf_id = BPF_PTR_POISON, 1397 }; 1398 1399 /* Since the upper 8 bits of dynptr->size is reserved, the 1400 * maximum supported size is 2^24 - 1. 1401 */ 1402 #define DYNPTR_MAX_SIZE ((1UL << 24) - 1) 1403 #define DYNPTR_TYPE_SHIFT 28 1404 #define DYNPTR_SIZE_MASK 0xFFFFFF 1405 #define DYNPTR_RDONLY_BIT BIT(31) 1406 1407 static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) 1408 { 1409 return ptr->size & DYNPTR_RDONLY_BIT; 1410 } 1411 1412 static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) 1413 { 1414 ptr->size |= type << DYNPTR_TYPE_SHIFT; 1415 } 1416 1417 u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) 1418 { 1419 return ptr->size & DYNPTR_SIZE_MASK; 1420 } 1421 1422 int bpf_dynptr_check_size(u32 size) 1423 { 1424 return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; 1425 } 1426 1427 void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, 1428 enum bpf_dynptr_type type, u32 offset, u32 size) 1429 { 1430 ptr->data = data; 1431 ptr->offset = offset; 1432 ptr->size = size; 1433 bpf_dynptr_set_type(ptr, type); 1434 } 1435 1436 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) 1437 { 1438 memset(ptr, 0, sizeof(*ptr)); 1439 } 1440 1441 static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) 1442 { 1443 u32 size = bpf_dynptr_get_size(ptr); 1444 1445 if (len > size || offset > size - len) 1446 return -E2BIG; 1447 1448 return 0; 1449 } 1450 1451 BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) 1452 { 1453 int err; 1454 1455 BTF_TYPE_EMIT(struct bpf_dynptr); 1456 1457 err = bpf_dynptr_check_size(size); 1458 if (err) 1459 goto error; 1460 1461 /* flags is currently unsupported */ 1462 if (flags) { 1463 err = -EINVAL; 1464 goto error; 1465 } 1466 1467 bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); 1468 1469 return 0; 1470 1471 error: 1472 bpf_dynptr_set_null(ptr); 1473 return err; 1474 } 1475 1476 static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { 1477 .func = bpf_dynptr_from_mem, 1478 .gpl_only = false, 1479 .ret_type = RET_INTEGER, 1480 .arg1_type = ARG_PTR_TO_UNINIT_MEM, 1481 .arg2_type = ARG_CONST_SIZE_OR_ZERO, 1482 .arg3_type = ARG_ANYTHING, 1483 .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, 1484 }; 1485 1486 BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, 1487 u32, offset, u64, flags) 1488 { 1489 int err; 1490 1491 if (!src->data || flags) 1492 return -EINVAL; 1493 1494 err = bpf_dynptr_check_off_len(src, offset, len); 1495 if (err) 1496 return err; 1497 1498 /* Source and destination may possibly overlap, hence use memmove to 1499 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr 1500 * pointing to overlapping PTR_TO_MAP_VALUE regions. 

BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
	   u32, offset, u64, flags)
{
	int err;

	if (!src->data || flags)
		return -EINVAL;

	err = bpf_dynptr_check_off_len(src, offset, len);
	if (err)
		return err;

	/* Source and destination may possibly overlap, hence use memmove to
	 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
	 * pointing to overlapping PTR_TO_MAP_VALUE regions.
	 */
	memmove(dst, src->data + src->offset + offset, len);

	return 0;
}

static const struct bpf_func_proto bpf_dynptr_read_proto = {
	.func		= bpf_dynptr_read,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
	   u32, len, u64, flags)
{
	int err;

	if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
		return -EINVAL;

	err = bpf_dynptr_check_off_len(dst, offset, len);
	if (err)
		return err;

	/* Source and destination may possibly overlap, hence use memmove to
	 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
	 * pointing to overlapping PTR_TO_MAP_VALUE regions.
	 */
	memmove(dst->data + dst->offset + offset, src, len);

	return 0;
}

static const struct bpf_func_proto bpf_dynptr_write_proto = {
	.func		= bpf_dynptr_write,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
{
	int err;

	if (!ptr->data)
		return 0;

	err = bpf_dynptr_check_off_len(ptr, offset, len);
	if (err)
		return 0;

	if (bpf_dynptr_is_rdonly(ptr))
		return 0;

	return (unsigned long)(ptr->data + ptr->offset + offset);
}

static const struct bpf_func_proto bpf_dynptr_data_proto = {
	.func		= bpf_dynptr_data,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_DYNPTR_MEM_OR_NULL,
	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
};
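
/* Example (illustrative sketch): creating a local dynptr over a stack
 * buffer and exercising the read/write helpers above. Error handling is
 * reduced to a minimum.
 *
 *	char buf[64];
 *	struct bpf_dynptr ptr;
 *	__u32 word = 0;
 *
 *	if (bpf_dynptr_from_mem(buf, sizeof(buf), 0, &ptr))
 *		return 0;
 *	bpf_dynptr_write(&ptr, 0, &word, sizeof(word), 0);
 *	bpf_dynptr_read(&word, sizeof(word), &ptr, 0, 0);
 */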

const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;

const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_map_push_elem:
		return &bpf_map_push_elem_proto;
	case BPF_FUNC_map_pop_elem:
		return &bpf_map_pop_elem_proto;
	case BPF_FUNC_map_peek_elem:
		return &bpf_map_peek_elem_proto;
	case BPF_FUNC_map_lookup_percpu_elem:
		return &bpf_map_lookup_percpu_elem_proto;
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_raw_smp_processor_id_proto;
	case BPF_FUNC_get_numa_node_id:
		return &bpf_get_numa_node_id_proto;
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_ktime_get_boot_ns:
		return &bpf_ktime_get_boot_ns_proto;
	case BPF_FUNC_ktime_get_tai_ns:
		return &bpf_ktime_get_tai_ns_proto;
	case BPF_FUNC_ringbuf_output:
		return &bpf_ringbuf_output_proto;
	case BPF_FUNC_ringbuf_reserve:
		return &bpf_ringbuf_reserve_proto;
	case BPF_FUNC_ringbuf_submit:
		return &bpf_ringbuf_submit_proto;
	case BPF_FUNC_ringbuf_discard:
		return &bpf_ringbuf_discard_proto;
	case BPF_FUNC_ringbuf_query:
		return &bpf_ringbuf_query_proto;
	case BPF_FUNC_strncmp:
		return &bpf_strncmp_proto;
	case BPF_FUNC_strtol:
		return &bpf_strtol_proto;
	case BPF_FUNC_strtoul:
		return &bpf_strtoul_proto;
	default:
		break;
	}

	if (!bpf_capable())
		return NULL;

	switch (func_id) {
	case BPF_FUNC_spin_lock:
		return &bpf_spin_lock_proto;
	case BPF_FUNC_spin_unlock:
		return &bpf_spin_unlock_proto;
	case BPF_FUNC_jiffies64:
		return &bpf_jiffies64_proto;
	case BPF_FUNC_per_cpu_ptr:
		return &bpf_per_cpu_ptr_proto;
	case BPF_FUNC_this_cpu_ptr:
		return &bpf_this_cpu_ptr_proto;
	case BPF_FUNC_timer_init:
		return &bpf_timer_init_proto;
	case BPF_FUNC_timer_set_callback:
		return &bpf_timer_set_callback_proto;
	case BPF_FUNC_timer_start:
		return &bpf_timer_start_proto;
	case BPF_FUNC_timer_cancel:
		return &bpf_timer_cancel_proto;
	case BPF_FUNC_kptr_xchg:
		return &bpf_kptr_xchg_proto;
	case BPF_FUNC_for_each_map_elem:
		return &bpf_for_each_map_elem_proto;
	case BPF_FUNC_loop:
		return &bpf_loop_proto;
	case BPF_FUNC_user_ringbuf_drain:
		return &bpf_user_ringbuf_drain_proto;
	case BPF_FUNC_ringbuf_reserve_dynptr:
		return &bpf_ringbuf_reserve_dynptr_proto;
	case BPF_FUNC_ringbuf_submit_dynptr:
		return &bpf_ringbuf_submit_dynptr_proto;
	case BPF_FUNC_ringbuf_discard_dynptr:
		return &bpf_ringbuf_discard_dynptr_proto;
	case BPF_FUNC_dynptr_from_mem:
		return &bpf_dynptr_from_mem_proto;
	case BPF_FUNC_dynptr_read:
		return &bpf_dynptr_read_proto;
	case BPF_FUNC_dynptr_write:
		return &bpf_dynptr_write_proto;
	case BPF_FUNC_dynptr_data:
		return &bpf_dynptr_data_proto;
#ifdef CONFIG_CGROUPS
	case BPF_FUNC_cgrp_storage_get:
		return &bpf_cgrp_storage_get_proto;
	case BPF_FUNC_cgrp_storage_delete:
		return &bpf_cgrp_storage_delete_proto;
#endif
	default:
		break;
	}

	if (!perfmon_capable())
		return NULL;

	switch (func_id) {
	case BPF_FUNC_trace_printk:
		return bpf_get_trace_printk_proto();
	case BPF_FUNC_get_current_task:
		return &bpf_get_current_task_proto;
	case BPF_FUNC_get_current_task_btf:
		return &bpf_get_current_task_btf_proto;
	case BPF_FUNC_probe_read_user:
		return &bpf_probe_read_user_proto;
	case BPF_FUNC_probe_read_kernel:
		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
		       NULL : &bpf_probe_read_kernel_proto;
	case BPF_FUNC_probe_read_user_str:
		return &bpf_probe_read_user_str_proto;
	case BPF_FUNC_probe_read_kernel_str:
		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
		       NULL : &bpf_probe_read_kernel_str_proto;
	case BPF_FUNC_snprintf_btf:
		return &bpf_snprintf_btf_proto;
	case BPF_FUNC_snprintf:
		return &bpf_snprintf_proto;
	case BPF_FUNC_task_pt_regs:
		return &bpf_task_pt_regs_proto;
	case BPF_FUNC_trace_vprintk:
		return bpf_get_trace_vprintk_proto();
	default:
		return NULL;
	}
}

void bpf_list_head_free(const struct btf_field *field, void *list_head,
			struct bpf_spin_lock *spin_lock)
{
	struct list_head *head = list_head, *orig_head = list_head;

	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));

	/* Do the actual list draining outside the lock to not hold the lock for
	 * too long, and also prevent deadlocks if tracing programs end up
	 * executing on entry/exit of functions called inside the critical
	 * section, and end up doing map ops that call bpf_list_head_free for
	 * the same map value again.
	 */
	__bpf_spin_lock_irqsave(spin_lock);
	if (!head->next || list_empty(head))
		goto unlock;
	head = head->next;
unlock:
	INIT_LIST_HEAD(orig_head);
	__bpf_spin_unlock_irqrestore(spin_lock);

	while (head != orig_head) {
		void *obj = head;

		obj -= field->list_head.node_offset;
		head = head->next;
		/* The contained type can also have resources, including a
		 * bpf_list_head which needs to be freed.
		 */
		bpf_obj_free_fields(field->list_head.value_rec, obj);
		/* bpf_mem_free requires migrate_disable(), since we can be
		 * called from map free path as well apart from BPF program (as
		 * part of map ops doing bpf_obj_free_fields).
		 */
		migrate_disable();
		bpf_mem_free(&bpf_global_ma, obj);
		migrate_enable();
	}
}

__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
		  "Global functions as their definitions will be in vmlinux BTF");

void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
{
	struct btf_struct_meta *meta = meta__ign;
	u64 size = local_type_id__k;
	void *p;

	p = bpf_mem_alloc(&bpf_global_ma, size);
	if (!p)
		return NULL;
	if (meta)
		bpf_obj_init(meta->field_offs, p);
	return p;
}

void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
{
	struct btf_struct_meta *meta = meta__ign;
	void *p = p__alloc;

	if (meta)
		bpf_obj_free_fields(meta->record, p);
	bpf_mem_free(&bpf_global_ma, p);
}

static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
{
	struct list_head *n = (void *)node, *h = (void *)head;

	if (unlikely(!h->next))
		INIT_LIST_HEAD(h);
	if (unlikely(!n->next))
		INIT_LIST_HEAD(n);
	tail ? list_add_tail(n, h) : list_add(n, h);
}

void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
{
	return __bpf_list_add(node, head, false);
}

void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
{
	return __bpf_list_add(node, head, true);
}

static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
{
	struct list_head *n, *h = (void *)head;

	if (unlikely(!h->next))
		INIT_LIST_HEAD(h);
	if (list_empty(h))
		return NULL;
	n = tail ? h->prev : h->next;
	list_del_init(n);
	return (struct bpf_list_node *)n;
}

struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
{
	return __bpf_list_del(head, false);
}

struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
{
	return __bpf_list_del(head, true);
}
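
/* Example (illustrative sketch): the list kfuncs above as used from a BPF
 * program, with bpf_obj_new() as declared in the selftests'
 * bpf_experimental.h. The bpf_list_head and its paired bpf_spin_lock
 * (ghead/glock here, both hypothetical names) must live in a map value or
 * global data, and pushes/pops must happen under that lock.
 *
 *	struct node {
 *		struct bpf_list_node link;
 *		int data;
 *	};
 *
 *	struct node *n = bpf_obj_new(typeof(*n));
 *	if (!n)
 *		return 0;
 *	bpf_spin_lock(&glock);
 *	bpf_list_push_back(&ghead, &n->link);
 *	bpf_spin_unlock(&glock);
 */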

/**
 * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
 * kfunc which is not stored in a map as a kptr must be released by calling
 * bpf_task_release().
 * @p: The task on which a reference is being acquired.
 */
struct task_struct *bpf_task_acquire(struct task_struct *p)
{
	return get_task_struct(p);
}

/**
 * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task
 * acquired by this kfunc which is not stored in a map as a kptr must be
 * released by calling bpf_task_release().
 * @p: The task on which a reference is being acquired.
 */
struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
{
	/* For the time being this function returns NULL, as it's not currently
	 * possible to safely acquire a reference to a task with RCU protection
	 * using get_task_struct() and put_task_struct(). This is due to the
	 * slightly odd mechanics of p->rcu_users, and how task RCU protection
	 * works.
	 *
	 * A struct task_struct is refcounted by two different refcount_t
	 * fields:
	 *
	 * 1. p->usage:     The "true" refcount field which tracks a task's
	 *		    lifetime. The task is freed as soon as this
	 *		    refcount drops to 0.
	 *
	 * 2. p->rcu_users: An "RCU users" refcount field which is statically
	 *		    initialized to 2, and is co-located in a union with
	 *		    a struct rcu_head field (p->rcu). p->rcu_users
	 *		    essentially encapsulates a single p->usage
	 *		    refcount, and when p->rcu_users goes to 0, an RCU
	 *		    callback is scheduled on the struct rcu_head which
	 *		    decrements the p->usage refcount.
	 *
	 * There are two important implications to this task refcounting logic
	 * described above. The first is that
	 * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as
	 * after the refcount goes to 0, the RCU callback being scheduled will
	 * cause the memory backing the refcount to again be nonzero due to the
	 * fields sharing a union. The other is that we can't rely on RCU to
	 * guarantee that a task is valid in a BPF program. This is because a
	 * task could have already transitioned to being in the TASK_DEAD
	 * state, had its rcu_users refcount go to 0, and had its rcu callback
	 * invoked, in which it drops its single p->usage reference. At this
	 * point the task will be freed as soon as the last p->usage reference
	 * goes to 0, without waiting for another RCU gp to elapse. The only
	 * way that a BPF program can guarantee that a task is valid in this
	 * scenario is to hold a p->usage refcount itself.
	 *
	 * Until we're able to resolve this issue, either by pulling
	 * p->rcu_users and p->rcu out of the union, or by getting rid of
	 * p->usage and just using p->rcu_users for refcounting, we'll just
	 * return NULL here.
	 */
	return NULL;
}

/**
 * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task
 * kptr acquired by this kfunc which is not subsequently stored in a map must
 * be released by calling bpf_task_release().
 * @pp: A pointer to a task kptr on which a reference is being acquired.
 */
struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
{
	/* We must return NULL here until we have clarity on how to properly
	 * leverage RCU for ensuring a task's lifetime. See the comment above
	 * in bpf_task_acquire_not_zero() for more details.
	 */
	return NULL;
}

/**
 * bpf_task_release - Release the reference acquired on a task.
 * @p: The task on which a reference is being released.
 */
void bpf_task_release(struct task_struct *p)
{
	if (!p)
		return;

	put_task_struct(p);
}

#ifdef CONFIG_CGROUPS
/**
 * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
 * this kfunc which is not stored in a map as a kptr must be released by
 * calling bpf_cgroup_release().
 * @cgrp: The cgroup on which a reference is being acquired.
 */
struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
{
	cgroup_get(cgrp);
	return cgrp;
}

/**
 * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup
 * kptr acquired by this kfunc which is not subsequently stored in a map must
 * be released by calling bpf_cgroup_release().
 * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
 */
struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
{
	struct cgroup *cgrp;

	rcu_read_lock();
	/* Another context could remove the cgroup from the map and release it
	 * at any time, including after we've done the lookup below. This is
	 * safe because we're in an RCU read region, so the cgroup is
	 * guaranteed to remain valid until at least the rcu_read_unlock()
	 * below.
	 */
	cgrp = READ_ONCE(*cgrpp);

	if (cgrp && !cgroup_tryget(cgrp))
		/* If the cgroup had been removed from the map and freed as
		 * described above, cgroup_tryget() will return false. The
		 * cgroup will be freed at some point after the current RCU gp
		 * has ended, so just return NULL to the user.
		 */
		cgrp = NULL;
	rcu_read_unlock();

	return cgrp;
}

/**
 * bpf_cgroup_release - Release the reference acquired on a cgroup.
 * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
 * not be freed until the current grace period has ended, even if its refcount
 * drops to 0.
 * @cgrp: The cgroup on which a reference is being released.
 */
void bpf_cgroup_release(struct cgroup *cgrp)
{
	if (!cgrp)
		return;

	cgroup_put(cgrp);
}

/**
 * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
 * array. A cgroup returned by this kfunc which is not subsequently stored in a
 * map must be released by calling bpf_cgroup_release().
 * @cgrp: The cgroup for which we're performing a lookup.
 * @level: The level of ancestor to look up.
 */
struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
{
	struct cgroup *ancestor;

	if (level > cgrp->level || level < 0)
		return NULL;

	ancestor = cgrp->ancestors[level];
	cgroup_get(ancestor);
	return ancestor;
}
#endif /* CONFIG_CGROUPS */

/**
 * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
 * in the root pid namespace idr. If a task is returned, it must either be
 * stored in a map, or released with bpf_task_release().
 * @pid: The pid of the task being looked up.
 */
struct task_struct *bpf_task_from_pid(s32 pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_pid_ns(pid, &init_pid_ns);
	if (p)
		bpf_task_acquire(p);
	rcu_read_unlock();

	return p;
}
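
/* Example (illustrative sketch): looking up and releasing a task with the
 * kfuncs above from BPF program C code, with the kfuncs declared extern.
 *
 *	struct task_struct *p = bpf_task_from_pid(pid);
 *	if (!p)
 *		return 0;
 *	bpf_printk("comm=%s", p->comm);
 *	bpf_task_release(p);
 */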

void *bpf_cast_to_kern_ctx(void *obj)
{
	return obj;
}

void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
{
	return obj__ign;
}

void bpf_rcu_read_lock(void)
{
	rcu_read_lock();
}

void bpf_rcu_read_unlock(void)
{
	rcu_read_unlock();
}

__diag_pop();

BTF_SET8_START(generic_btf_ids)
#ifdef CONFIG_KEXEC_CORE
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_list_push_front)
BTF_ID_FLAGS(func, bpf_list_push_back)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
#ifdef CONFIG_CGROUPS
BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_SET8_END(generic_btf_ids)

static const struct btf_kfunc_id_set generic_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &generic_btf_ids,
};

BTF_ID_LIST(generic_dtor_ids)
BTF_ID(struct, task_struct)
BTF_ID(func, bpf_task_release)
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
BTF_ID(func, bpf_cgroup_release)
#endif

BTF_SET8_START(common_btf_ids)
BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
BTF_ID_FLAGS(func, bpf_rdonly_cast)
BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
BTF_SET8_END(common_btf_ids)

static const struct btf_kfunc_id_set common_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &common_btf_ids,
};

static int __init kfunc_init(void)
{
	int ret;
	const struct btf_id_dtor_kfunc generic_dtors[] = {
		{
			.btf_id       = generic_dtor_ids[0],
			.kfunc_btf_id = generic_dtor_ids[1]
		},
#ifdef CONFIG_CGROUPS
		{
			.btf_id       = generic_dtor_ids[2],
			.kfunc_btf_id = generic_dtor_ids[3]
		},
#endif
	};

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
						 ARRAY_SIZE(generic_dtors),
						 THIS_MODULE);
	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}

late_initcall(kfunc_init);