xref: /linux/kernel/bpf/helpers.c (revision b960430ea8862ef37ce53c8bf74a8dc79d3f2404)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  */
4 #include <linux/bpf.h>
5 #include <linux/btf.h>
6 #include <linux/bpf-cgroup.h>
7 #include <linux/cgroup.h>
8 #include <linux/rcupdate.h>
9 #include <linux/random.h>
10 #include <linux/smp.h>
11 #include <linux/topology.h>
12 #include <linux/ktime.h>
13 #include <linux/sched.h>
14 #include <linux/uidgid.h>
15 #include <linux/filter.h>
16 #include <linux/ctype.h>
17 #include <linux/jiffies.h>
18 #include <linux/pid_namespace.h>
19 #include <linux/poison.h>
20 #include <linux/proc_ns.h>
21 #include <linux/sched/task.h>
22 #include <linux/security.h>
23 #include <linux/btf_ids.h>
24 #include <linux/bpf_mem_alloc.h>
25 #include <linux/kasan.h>
26 #include <linux/bpf_verifier.h>
27 #include <linux/uaccess.h>
28 #include <linux/verification.h>
29 #include <linux/task_work.h>
30 #include <linux/irq_work.h>
31 #include <linux/buildid.h>
32 
33 #include "../../lib/kstrtox.h"
34 
35 /* If kernel subsystem is allowing eBPF programs to call this function,
36  * inside its own verifier_ops->get_func_proto() callback it should return
37  * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
38  *
39  * Different map implementations will rely on rcu in map methods
40  * lookup/update/delete, therefore eBPF programs must run under rcu lock
41  * if program is allowed to access maps, so check rcu_read_lock_held() or
42  * rcu_read_lock_trace_held() in all three functions.
43  */
/* Dispatch the bpf_map_lookup_elem() helper to the map implementation.
 * Caller must hold an RCU (or RCU-trace) read lock, see comment above.
 * Returns a pointer to the value cast to u64, or 0 (NULL) if not found.
 */
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
	WARN_ON_ONCE(!bpf_rcu_lock_held());
	return (unsigned long) map->ops->map_lookup_elem(map, key);
}

const struct bpf_func_proto bpf_map_lookup_elem_proto = {
	.func		= bpf_map_lookup_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
};
58 
/* Dispatch the bpf_map_update_elem() helper to the map implementation.
 * flags are forwarded unmodified (interpreted per map type).
 */
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
	   void *, value, u64, flags)
{
	WARN_ON_ONCE(!bpf_rcu_lock_held());
	return map->ops->map_update_elem(map, key, value, flags);
}

const struct bpf_func_proto bpf_map_update_elem_proto = {
	.func		= bpf_map_update_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
	.arg3_type	= ARG_PTR_TO_MAP_VALUE,
	.arg4_type	= ARG_ANYTHING,
};
76 
/* Dispatch the bpf_map_delete_elem() helper to the map implementation. */
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
	WARN_ON_ONCE(!bpf_rcu_lock_held());
	return map->ops->map_delete_elem(map, key);
}

const struct bpf_func_proto bpf_map_delete_elem_proto = {
	.func		= bpf_map_delete_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
};
91 
/* Push a value onto a stack/queue style map; flags are interpreted by the
 * map implementation.
 */
BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
{
	return map->ops->map_push_elem(map, value, flags);
}

const struct bpf_func_proto bpf_map_push_elem_proto = {
	.func		= bpf_map_push_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_VALUE,
	.arg3_type	= ARG_ANYTHING,
};
106 
/* Remove an element from a stack/queue style map, writing it into *value
 * (hence the MEM_UNINIT | MEM_WRITE on arg2 below).
 */
BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
{
	return map->ops->map_pop_elem(map, value);
}

const struct bpf_func_proto bpf_map_pop_elem_proto = {
	.func		= bpf_map_pop_elem,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
};
119 
/* Copy the next element of a stack/queue style map into *value without
 * removing it.
 */
BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
{
	return map->ops->map_peek_elem(map, value);
}

const struct bpf_func_proto bpf_map_peek_elem_proto = {
	.func		= bpf_map_peek_elem,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
};
132 
/* Like bpf_map_lookup_elem() but returns the value slot belonging to the
 * given cpu of a per-CPU map. Must run under RCU, same as lookup above.
 */
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
{
	WARN_ON_ONCE(!bpf_rcu_lock_held());
	return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
}

const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
	.func		= bpf_map_lookup_percpu_elem,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
	.arg3_type	= ARG_ANYTHING,
};
148 
/* Proto for bpf_get_prandom_u32(); backed directly by bpf_user_rnd_u32(),
 * which is defined outside this file.
 */
const struct bpf_func_proto bpf_get_prandom_u32_proto = {
	.func		= bpf_user_rnd_u32,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
154 
/* Return the id of the CPU the program is currently executing on. */
BPF_CALL_0(bpf_get_smp_processor_id)
{
	return smp_processor_id();
}

const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
	.func		= bpf_get_smp_processor_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	/* trivial helper: safe for the verifier's no-spill fastcall path */
	.allow_fastcall	= true,
};
166 
/* Return the NUMA node id of the current CPU. */
BPF_CALL_0(bpf_get_numa_node_id)
{
	return numa_node_id();
}

const struct bpf_func_proto bpf_get_numa_node_id_proto = {
	.func		= bpf_get_numa_node_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
177 
/* Return CLOCK_MONOTONIC in nanoseconds. */
BPF_CALL_0(bpf_ktime_get_ns)
{
	/* NMI safe access to clock monotonic */
	return ktime_get_mono_fast_ns();
}

const struct bpf_func_proto bpf_ktime_get_ns_proto = {
	.func		= bpf_ktime_get_ns,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
189 
/* Return CLOCK_BOOTTIME (monotonic including suspend) in nanoseconds. */
BPF_CALL_0(bpf_ktime_get_boot_ns)
{
	/* NMI safe access to clock boottime */
	return ktime_get_boot_fast_ns();
}

const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
	.func		= bpf_ktime_get_boot_ns,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
201 
/* Return the coarse (tick-granularity, cheaper) monotonic clock in ns. */
BPF_CALL_0(bpf_ktime_get_coarse_ns)
{
	return ktime_get_coarse_ns();
}

const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
	.func		= bpf_ktime_get_coarse_ns,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
212 
/* Return CLOCK_TAI (international atomic time) in nanoseconds. */
BPF_CALL_0(bpf_ktime_get_tai_ns)
{
	/* NMI safe access to clock tai */
	return ktime_get_tai_fast_ns();
}

const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
	.func		= bpf_ktime_get_tai_ns,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
224 
/* Return current task's tgid in the upper 32 bits and pid in the lower
 * 32 bits, or -EINVAL if there is no current task.
 */
BPF_CALL_0(bpf_get_current_pid_tgid)
{
	struct task_struct *task = current;

	if (unlikely(!task))
		return -EINVAL;

	return (u64) task->tgid << 32 | task->pid;
}

const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
	.func		= bpf_get_current_pid_tgid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
240 
/* Return current task's gid (init_user_ns view) in the upper 32 bits and
 * uid in the lower 32 bits, or -EINVAL if there is no current task.
 */
BPF_CALL_0(bpf_get_current_uid_gid)
{
	struct task_struct *task = current;
	kuid_t uid;
	kgid_t gid;

	if (unlikely(!task))
		return -EINVAL;

	current_uid_gid(&uid, &gid);
	return (u64) from_kgid(&init_user_ns, gid) << 32 |
		     from_kuid(&init_user_ns, uid);
}

const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
	.func		= bpf_get_current_uid_gid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
260 
/* Copy current task's comm (command name) into buf, NUL-terminated and
 * zero-padded. On failure buf is fully zeroed and -EINVAL is returned,
 * so callers never see uninitialized memory.
 */
BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
{
	struct task_struct *task = current;

	if (unlikely(!task))
		goto err_clear;

	/* Verifier guarantees that size > 0 */
	strscpy_pad(buf, task->comm, size);
	return 0;
err_clear:
	memset(buf, 0, size);
	return -EINVAL;
}

const struct bpf_func_proto bpf_get_current_comm_proto = {
	.func		= bpf_get_current_comm,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE,
};
283 
#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)

/* Map a 4-byte bpf_spin_lock onto the architecture spinlock.
 * Preemption is disabled for the duration of the critical section.
 */
static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
{
	arch_spinlock_t *l = (void *)lock;
	union {
		__u32 val;
		arch_spinlock_t lock;
	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };

	/* bpf_spin_lock is zero-initialized, so the unlocked arch state
	 * must also be all-zeroes for the cast above to be valid.
	 */
	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
	preempt_disable();
	arch_spin_lock(l);
}

static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
{
	arch_spinlock_t *l = (void *)lock;

	arch_spin_unlock(l);
	preempt_enable();
}

#else

/* Fallback: a simple test-and-set spinlock built from atomic_t. */
static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
{
	atomic_t *l = (void *)lock;

	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
	do {
		/* spin (with relaxed reads) until the lock word is 0 */
		atomic_cond_read_relaxed(l, !VAL);
	} while (atomic_xchg(l, 1));
}

static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
{
	atomic_t *l = (void *)lock;

	atomic_set_release(l, 0);
}

#endif
329 
/* Saved irq flags for the currently-held bpf_spin_lock on this CPU.
 * A single slot suffices because bpf_spin_lock sections cannot nest.
 */
static DEFINE_PER_CPU(unsigned long, irqsave_flags);

/* Take a bpf_spin_lock with local interrupts disabled, stashing the irq
 * flags in per-CPU storage for the matching irqrestore.
 */
static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
{
	unsigned long flags;

	local_irq_save(flags);
	__bpf_spin_lock(lock);
	__this_cpu_write(irqsave_flags, flags);
}
340 
/* bpf_spin_lock() helper: irqsave variant so it is usable from any
 * program context. NOTRACE prevents recursion via fentry programs.
 */
NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
{
	__bpf_spin_lock_irqsave(lock);
	return 0;
}

const struct bpf_func_proto bpf_spin_lock_proto = {
	.func		= bpf_spin_lock,
	.gpl_only	= false,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
	.arg1_btf_id    = BPF_PTR_POISON,
};
354 
/* Release a bpf_spin_lock and restore the irq flags saved by
 * __bpf_spin_lock_irqsave() on this CPU.
 */
static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
{
	unsigned long flags;

	flags = __this_cpu_read(irqsave_flags);
	__bpf_spin_unlock(lock);
	local_irq_restore(flags);
}

/* bpf_spin_unlock() helper, counterpart of bpf_spin_lock above. */
NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
{
	__bpf_spin_unlock_irqrestore(lock);
	return 0;
}

const struct bpf_func_proto bpf_spin_unlock_proto = {
	.func		= bpf_spin_unlock,
	.gpl_only	= false,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
	.arg1_btf_id    = BPF_PTR_POISON,
};
377 
/* Copy a map value while holding the bpf_spin_lock embedded in either the
 * source (lock_src) or destination element, so concurrent BPF-side updates
 * of the locked element are excluded during the copy.
 */
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
			   bool lock_src)
{
	struct bpf_spin_lock *lock;

	if (lock_src)
		lock = src + map->record->spin_lock_off;
	else
		lock = dst + map->record->spin_lock_off;
	preempt_disable();
	__bpf_spin_lock_irqsave(lock);
	copy_map_value(map, dst, src);
	__bpf_spin_unlock_irqrestore(lock);
	preempt_enable();
}
393 
/* Return the 64-bit jiffies counter. */
BPF_CALL_0(bpf_jiffies64)
{
	return get_jiffies_64();
}

const struct bpf_func_proto bpf_jiffies64_proto = {
	.func		= bpf_jiffies64,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
404 
405 #ifdef CONFIG_CGROUPS
/* Return the id of current task's cgroup on the default (v2) hierarchy.
 * RCU protects the task->cgroup association during the lookup.
 */
BPF_CALL_0(bpf_get_current_cgroup_id)
{
	struct cgroup *cgrp;
	u64 cgrp_id;

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	cgrp_id = cgroup_id(cgrp);
	rcu_read_unlock();

	return cgrp_id;
}

const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
	.func		= bpf_get_current_cgroup_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};
424 
/* Return the id of the ancestor at ancestor_level of current task's
 * default-hierarchy cgroup, or 0 if there is no such ancestor.
 */
BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
{
	struct cgroup *cgrp;
	struct cgroup *ancestor;
	u64 cgrp_id;

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	ancestor = cgroup_ancestor(cgrp, ancestor_level);
	cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
	rcu_read_unlock();

	return cgrp_id;
}

const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
	.func		= bpf_get_current_ancestor_cgroup_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
};
446 #endif /* CONFIG_CGROUPS */
447 
/* Low bits of the flags argument carry the numeric base (0, 8, 10 or 16). */
#define BPF_STRTOX_BASE_MASK 0x1F

/* Parse an unsigned integer from a non-NUL-terminated buffer.
 * Leading whitespace and one optional '-' sign are accepted; the sign is
 * reported via *is_negative, the magnitude via *res.
 * Returns the number of characters consumed on success, or a negative
 * errno (-EINVAL for malformed input/flags, -ERANGE on overflow).
 */
static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
			  unsigned long long *res, bool *is_negative)
{
	unsigned int base = flags & BPF_STRTOX_BASE_MASK;
	const char *cur_buf = buf;
	size_t cur_len = buf_len;
	unsigned int consumed;
	size_t val_len;
	char str[64];

	if (!buf || !buf_len || !res || !is_negative)
		return -EINVAL;

	if (base != 0 && base != 8 && base != 10 && base != 16)
		return -EINVAL;

	if (flags & ~BPF_STRTOX_BASE_MASK)
		return -EINVAL;

	while (cur_buf < buf + buf_len && isspace(*cur_buf))
		++cur_buf;

	*is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
	if (*is_negative)
		++cur_buf;

	consumed = cur_buf - buf;
	cur_len -= consumed;
	if (!cur_len)
		return -EINVAL;

	/* copy into a bounded, NUL-terminated scratch buffer so the
	 * kstrtox parsers below can be used safely; this caps the number
	 * of digit characters at sizeof(str) - 1
	 */
	cur_len = min(cur_len, sizeof(str) - 1);
	memcpy(str, cur_buf, cur_len);
	str[cur_len] = '\0';
	cur_buf = str;

	cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
	val_len = _parse_integer(cur_buf, base, res);

	if (val_len & KSTRTOX_OVERFLOW)
		return -ERANGE;

	if (val_len == 0)
		return -EINVAL;

	/* account for any "0x"/"0" radix prefix plus the digits parsed */
	cur_buf += val_len;
	consumed += cur_buf - str;

	return consumed;
}
500 
501 static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
502 			 long long *res)
503 {
504 	unsigned long long _res;
505 	bool is_negative;
506 	int err;
507 
508 	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
509 	if (err < 0)
510 		return err;
511 	if (is_negative) {
512 		if ((long long)-_res > 0)
513 			return -ERANGE;
514 		*res = -_res;
515 	} else {
516 		if ((long long)_res < 0)
517 			return -ERANGE;
518 		*res = _res;
519 	}
520 	return err;
521 }
522 
/* bpf_strtol() helper: parse a signed long from a user-supplied buffer.
 * *res is pre-zeroed so it is defined even on parse failure.
 */
BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
	   s64 *, res)
{
	long long _res;
	int err;

	*res = 0;
	err = __bpf_strtoll(buf, buf_len, flags, &_res);
	if (err < 0)
		return err;
	*res = _res;
	return err;
}

const struct bpf_func_proto bpf_strtol_proto = {
	.func		= bpf_strtol,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg2_type	= ARG_CONST_SIZE,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
	.arg4_size	= sizeof(s64),
};
547 
/* bpf_strtoul() helper: parse an unsigned long from a user-supplied
 * buffer. Negative input is rejected with -EINVAL; *res is pre-zeroed.
 */
BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
	   u64 *, res)
{
	unsigned long long _res;
	bool is_negative;
	int err;

	*res = 0;
	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
	if (err < 0)
		return err;
	if (is_negative)
		return -EINVAL;
	*res = _res;
	return err;
}

const struct bpf_func_proto bpf_strtoul_proto = {
	.func		= bpf_strtoul,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg2_type	= ARG_CONST_SIZE,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
	.arg4_size	= sizeof(u64),
};
575 
/* bpf_strncmp() helper: compare at most s1_sz bytes of s1 against the
 * verifier-checked constant string s2.
 */
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
{
	return strncmp(s1, s2, s1_sz);
}

static const struct bpf_func_proto bpf_strncmp_proto = {
	.func		= bpf_strncmp,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg2_type	= ARG_CONST_SIZE,
	.arg3_type	= ARG_PTR_TO_CONST_STR,
};
589 
/* Fill nsdata with current task's pid/tgid as seen from the pid namespace
 * identified by (dev, ino). nsdata is zeroed on every failure path so the
 * caller never observes uninitialized memory.
 */
BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
	   struct bpf_pidns_info *, nsdata, u32, size)
{
	struct task_struct *task = current;
	struct pid_namespace *pidns;
	int err = -EINVAL;

	if (unlikely(size != sizeof(struct bpf_pidns_info)))
		goto clear;

	/* reject dev values that do not fit in dev_t */
	if (unlikely((u64)(dev_t)dev != dev))
		goto clear;

	if (unlikely(!task))
		goto clear;

	pidns = task_active_pid_ns(task);
	if (unlikely(!pidns)) {
		err = -ENOENT;
		goto clear;
	}

	if (!ns_match(&pidns->ns, (dev_t)dev, ino))
		goto clear;

	nsdata->pid = task_pid_nr_ns(task, pidns);
	nsdata->tgid = task_tgid_nr_ns(task, pidns);
	return 0;
clear:
	memset((void *)nsdata, 0, (size_t) size);
	return err;
}

const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
	.func		= bpf_get_ns_current_pid_tgid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type      = ARG_PTR_TO_UNINIT_MEM,
	.arg4_type      = ARG_CONST_SIZE,
};
632 
633 static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
634 	.func		= bpf_get_raw_cpu_id,
635 	.gpl_only	= false,
636 	.ret_type	= RET_INTEGER,
637 };
638 
/* Emit data to a BPF_MAP_TYPE_PERF_EVENT_ARRAY map. Only the CPU index
 * bits of flags are accepted; anything else is -EINVAL.
 */
BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
{
	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;

	return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
}

const struct bpf_func_proto bpf_event_output_data_proto =  {
	.func		= bpf_event_output_data,
	.gpl_only       = true,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_CONST_MAP_PTR,
	.arg3_type      = ARG_ANYTHING,
	.arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type      = ARG_CONST_SIZE_OR_ZERO,
};
658 
/* Sleepable helper: copy size bytes from user memory into dst.
 * On any fault dst is fully zeroed and -EFAULT returned, so the BPF
 * program never sees a partially-filled buffer.
 */
BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
	   const void __user *, user_ptr)
{
	int ret = copy_from_user(dst, user_ptr, size);

	if (unlikely(ret)) {
		memset(dst, 0, size);
		ret = -EFAULT;
	}

	return ret;
}

const struct bpf_func_proto bpf_copy_from_user_proto = {
	.func		= bpf_copy_from_user,
	.gpl_only	= false,
	.might_sleep	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
};
681 
/* Sleepable helper: copy size bytes from another task's user address
 * space into dst. All-or-nothing: on error or short read, dst is zeroed
 * and a negative errno (-EFAULT for partial reads) is returned.
 */
BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
	   const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
{
	int ret;

	/* flags is not used yet */
	if (unlikely(flags))
		return -EINVAL;

	if (unlikely(!size))
		return 0;

	ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
	if (ret == size)
		return 0;

	memset(dst, 0, size);
	/* Return -EFAULT for partial read */
	return ret < 0 ? ret : -EFAULT;
}

const struct bpf_func_proto bpf_copy_from_user_task_proto = {
	.func		= bpf_copy_from_user_task,
	.gpl_only	= true,
	.might_sleep	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_BTF_ID,
	.arg4_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg5_type	= ARG_ANYTHING
};
715 
/* Resolve a per-CPU pointer for the given cpu, or NULL if cpu is out of
 * range (hence the PTR_MAYBE_NULL return type below).
 */
BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
{
	if (cpu >= nr_cpu_ids)
		return (unsigned long)NULL;

	return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu);
}

const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
	.func		= bpf_per_cpu_ptr,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
	.arg2_type	= ARG_ANYTHING,
};
731 
/* Resolve a per-CPU pointer for the current CPU; never NULL. */
BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
{
	return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr);
}

const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
	.func		= bpf_this_cpu_ptr,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
};
743 
/* Non-faulting copy of a string for %s-style bprintf specifiers.
 * fmt_ptype selects the address space: 'k' kernel, 'u' user, 's' guess
 * (kernel unless the pointer looks like a user address on arches with
 * non-overlapping address spaces). Returns bytes copied or -errno.
 */
static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
		size_t bufsz)
{
	void __user *user_ptr = (__force void __user *)unsafe_ptr;

	buf[0] = 0;

	switch (fmt_ptype) {
	case 's':
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
		if ((unsigned long)unsafe_ptr < TASK_SIZE)
			return strncpy_from_user_nofault(buf, user_ptr, bufsz);
		fallthrough;
#endif
	case 'k':
		return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
	case 'u':
		return strncpy_from_user_nofault(buf, user_ptr, bufsz);
	}

	return -EINVAL;
}
766 
/* Support executing three nested bprintf helper calls on a given CPU */
#define MAX_BPRINTF_NEST_LEVEL	3

/* Per-CPU scratch buffers for bprintf helpers, one per nesting level,
 * with a per-CPU counter tracking the current nesting depth.
 */
static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);

/* Claim this CPU's scratch buffer for the next nesting level.
 * Disables preemption until the matching bpf_put_buffers().
 * Returns -EBUSY when the nesting limit is exceeded.
 */
int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
	int nest_level;

	preempt_disable();
	nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
	if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
		this_cpu_dec(bpf_bprintf_nest_level);
		preempt_enable();
		return -EBUSY;
	}
	*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);

	return 0;
}
788 
/* Release the buffer claimed by bpf_try_get_buffers() and re-enable
 * preemption. An unbalanced call (nest level already 0) is a bug and
 * is warned about rather than underflowing the counter.
 */
void bpf_put_buffers(void)
{
	if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
		return;
	this_cpu_dec(bpf_bprintf_nest_level);
	preempt_enable();
}
796 
/* Free the scratch buffers acquired by bpf_bprintf_prepare().
 * A no-op when prepare did not actually claim buffers (neither bin_args
 * nor buf were populated).
 */
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
{
	if (!data->bin_args && !data->buf)
		return;
	bpf_put_buffers();
}
803 
804 /*
805  * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
806  *
807  * Returns a negative value if fmt is an invalid format string or 0 otherwise.
808  *
809  * This can be used in two ways:
810  * - Format string verification only: when data->get_bin_args is false
811  * - Arguments preparation: in addition to the above verification, it writes in
812  *   data->bin_args a binary representation of arguments usable by bstr_printf
813  *   where pointers from BPF have been sanitized.
814  *
815  * In argument preparation mode, if 0 is returned, safe temporary buffers are
816  * allocated and bpf_bprintf_cleanup should be called to free them after use.
817  */
int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
			u32 num_args, struct bpf_bprintf_data *data)
{
	bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
	char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
	struct bpf_bprintf_buffers *buffers = NULL;
	size_t sizeof_cur_arg, sizeof_cur_ip;
	int err, i, num_spec = 0;
	u64 cur_arg;
	char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";

	/* fmt must be NUL-terminated within fmt_size; trim to actual length */
	fmt_end = strnchr(fmt, fmt_size, 0);
	if (!fmt_end)
		return -EINVAL;
	fmt_size = fmt_end - fmt;

	if (get_buffers && bpf_try_get_buffers(&buffers))
		return -EBUSY;

	if (data->get_bin_args) {
		if (num_args)
			tmp_buf = buffers->bin_args;
		tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
		data->bin_args = (u32 *)tmp_buf;
	}

	if (data->get_buf)
		data->buf = buffers->buf;

	for (i = 0; i < fmt_size; i++) {
		unsigned char c = fmt[i];

		/*
		 * Permit bytes >= 0x80 in plain text so UTF-8 literals can pass
		 * through unchanged, while still rejecting ASCII control bytes.
		 */
		if (isascii(c) && !isprint(c) && !isspace(c)) {
			err = -EINVAL;
			goto out;
		}

		if (fmt[i] != '%')
			continue;

		/* "%%" is a literal percent, not a conversion */
		if (fmt[i + 1] == '%') {
			i++;
			continue;
		}

		/* every conversion must have a corresponding argument */
		if (num_spec >= num_args) {
			err = -EINVAL;
			goto out;
		}

		/* The string is zero-terminated so if fmt[i] != 0, we can
		 * always access fmt[i + 1], in the worst case it will be a 0
		 */
		i++;
		c = fmt[i];
		/*
		 * The format parser below only understands ASCII conversion
		 * specifiers and modifiers, so reject non-ASCII after '%'.
		 */
		if (!isascii(c)) {
			err = -EINVAL;
			goto out;
		}

		/* skip optional "[0 +-][num]" width formatting field */
		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-' ||
		       fmt[i] == ' ')
			i++;
		if (fmt[i] >= '1' && fmt[i] <= '9') {
			i++;
			while (fmt[i] >= '0' && fmt[i] <= '9')
				i++;
		}

		if (fmt[i] == 'p') {
			sizeof_cur_arg = sizeof(long);

			/* bare "%p": pass the pointer value through as-is */
			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
			    ispunct(fmt[i + 1])) {
				if (tmp_buf)
					cur_arg = raw_args[num_spec];
				goto nocopy_fmt;
			}

			/* "%pks"/"%pus": kernel/user string pointer */
			if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
			    fmt[i + 2] == 's') {
				fmt_ptype = fmt[i + 1];
				i += 2;
				goto fmt_str;
			}

			/* "%pK"/"%px"/"%ps"/"%pS": bstr_printf handles these
			 * from the raw pointer, no sanitizing copy needed
			 */
			if (fmt[i + 1] == 'K' ||
			    fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
			    fmt[i + 1] == 'S') {
				if (tmp_buf)
					cur_arg = raw_args[num_spec];
				i++;
				goto nocopy_fmt;
			}

			/* "%pB": pre-format the symbol+buildid string here */
			if (fmt[i + 1] == 'B') {
				if (tmp_buf)  {
					err = snprintf(tmp_buf,
						       (tmp_buf_end - tmp_buf),
						       "%pB",
						       (void *)(long)raw_args[num_spec]);
					tmp_buf += (err + 1);
				}

				i++;
				num_spec++;
				continue;
			}

			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
			if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
			    (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
				err = -EINVAL;
				goto out;
			}

			i += 2;
			if (!tmp_buf)
				goto nocopy_fmt;

			sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
			if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
				err = -ENOSPC;
				goto out;
			}

			unsafe_ptr = (char *)(long)raw_args[num_spec];
			err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
						       sizeof_cur_ip);
			if (err < 0)
				memset(cur_ip, 0, sizeof_cur_ip);

			/* hack: bstr_printf expects IP addresses to be
			 * pre-formatted as strings, ironically, the easiest way
			 * to do that is to call snprintf.
			 */
			ip_spec[2] = fmt[i - 1];
			ip_spec[3] = fmt[i];
			err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
				       ip_spec, &cur_ip);

			tmp_buf += err + 1;
			num_spec++;

			continue;
		} else if (fmt[i] == 's') {
			fmt_ptype = fmt[i];
fmt_str:
			/* reject width/precision etc. after the specifier */
			if (fmt[i + 1] != 0 &&
			    !isspace(fmt[i + 1]) &&
			    !ispunct(fmt[i + 1])) {
				err = -EINVAL;
				goto out;
			}

			if (!tmp_buf)
				goto nocopy_fmt;

			if (tmp_buf_end == tmp_buf) {
				err = -ENOSPC;
				goto out;
			}

			unsafe_ptr = (char *)(long)raw_args[num_spec];
			err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
						    fmt_ptype,
						    tmp_buf_end - tmp_buf);
			/* on a faulting copy, substitute an empty string */
			if (err < 0) {
				tmp_buf[0] = '\0';
				err = 1;
			}

			tmp_buf += err;
			num_spec++;

			continue;
		} else if (fmt[i] == 'c') {
			if (!tmp_buf)
				goto nocopy_fmt;

			if (tmp_buf_end == tmp_buf) {
				err = -ENOSPC;
				goto out;
			}

			*tmp_buf = raw_args[num_spec];
			tmp_buf++;
			num_spec++;

			continue;
		}

		/* numeric conversions: determine argument width from the
		 * optional "l"/"ll" length modifiers
		 */
		sizeof_cur_arg = sizeof(int);

		if (fmt[i] == 'l') {
			sizeof_cur_arg = sizeof(long);
			i++;
		}
		if (fmt[i] == 'l') {
			sizeof_cur_arg = sizeof(long long);
			i++;
		}

		if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
		    fmt[i] != 'x' && fmt[i] != 'X') {
			err = -EINVAL;
			goto out;
		}

		if (tmp_buf)
			cur_arg = raw_args[num_spec];
nocopy_fmt:
		/* store cur_arg into the bin_args area in the 32-bit-word
		 * layout that bstr_printf expects
		 */
		if (tmp_buf) {
			tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
			if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
				err = -ENOSPC;
				goto out;
			}

			if (sizeof_cur_arg == 8) {
				*(u32 *)tmp_buf = *(u32 *)&cur_arg;
				*(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
			} else {
				*(u32 *)tmp_buf = (u32)(long)cur_arg;
			}
			tmp_buf += sizeof_cur_arg;
		}
		num_spec++;
	}

	err = 0;
out:
	if (err)
		bpf_bprintf_cleanup(data);
	return err;
}
1063 
/* bpf_snprintf() helper: format fmt with the u64 varargs packed in args
 * (8 bytes each, data_len total) into str. Returns the number of bytes
 * that would have been written plus one (bstr_printf semantics), or a
 * negative errno.
 */
BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
	   const void *, args, u32, data_len)
{
	struct bpf_bprintf_data data = {
		.get_bin_args	= true,
	};
	int err, num_args;

	if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
	    (data_len && !args))
		return -EINVAL;
	num_args = data_len / 8;

	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
	 * can safely give an unbounded size.
	 */
	err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
	if (err < 0)
		return err;

	err = bstr_printf(str, str_size, fmt, data.bin_args);

	bpf_bprintf_cleanup(&data);

	return err + 1;
}

const struct bpf_func_proto bpf_snprintf_proto = {
	.func		= bpf_snprintf,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_CONST_STR,
	.arg4_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
};
1101 
1102 static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
1103 {
1104 	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
1105 		struct bpf_array *array = container_of(map, struct bpf_array, map);
1106 
1107 		*arr_idx = ((char *)value - array->value) / array->elem_size;
1108 		return arr_idx;
1109 	}
1110 	return (void *)value - round_up(map->key_size, 8);
1111 }
1112 
/* Kind of async primitive backing a bpf_async_cb. */
enum bpf_async_type {
	BPF_ASYNC_TYPE_TIMER = 0,
	BPF_ASYNC_TYPE_WQ,
};

/* Deferred operation requested on an async object. */
enum bpf_async_op {
	BPF_ASYNC_START,
	BPF_ASYNC_CANCEL
};

/* One queued start/cancel command for an async object. */
struct bpf_async_cmd {
	struct llist_node node;		/* linkage in bpf_async_cb::async_cmds */
	u64 nsec;			/* expiry for BPF_ASYNC_START */
	u32 mode;			/* timer mode flags for start */
	enum bpf_async_op op;
};
1129 
/* State shared by all BPF async objects (timers and workqueues). */
struct bpf_async_cb {
	struct bpf_map *map;		/* map the async object lives in */
	struct bpf_prog *prog;		/* prog owning the callback */
	void __rcu *callback_fn;	/* BPF callback, RCU-published */
	void *value;			/* map value the object is embedded in */
	struct rcu_head rcu;
	u64 flags;
	struct irq_work worker;		/* defers cmd processing out of prog ctx */
	refcount_t refcnt;
	enum bpf_async_type type;
	struct llist_head async_cmds;	/* pending bpf_async_cmd entries */
};
1142 
1143 /* BPF map elements can contain 'struct bpf_timer'.
1144  * Such map owns all of its BPF timers.
1145  * 'struct bpf_timer' is allocated as part of map element allocation
1146  * and it's zero initialized.
1147  * That space is used to keep 'struct bpf_async_kern'.
1148  * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
1149  * remembers 'struct bpf_map *' pointer it's part of.
1150  * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
1151  * bpf_timer_start() arms the timer.
1152  * If user space reference to a map goes to zero at this point
1153  * ops->map_release_uref callback is responsible for cancelling the timers,
1154  * freeing their memory, and decrementing prog's refcnts.
1155  * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
1156  * Inner maps can contain bpf timers as well. ops->map_release_uref is
1157  * freeing the timers when inner map is replaced or deleted by user space.
1158  */
/* Timer flavor of bpf_async_cb. 'cancelling' counts in-flight
 * bpf_timer_cancel() calls made from timer callbacks and is used to break
 * cross-cancellation deadlocks (see bpf_timer_cancel()).
 */
struct bpf_hrtimer {
	struct bpf_async_cb cb;
	struct hrtimer timer;
	atomic_t cancelling;
};
1164 
/* Workqueue flavor of bpf_async_cb. */
struct bpf_work {
	struct bpf_async_cb cb;
	struct work_struct work;
};
1169 
/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq:
 * a single pointer to the kernel-side object, NULL until *_init().
 */
struct bpf_async_kern {
	union {
		struct bpf_async_cb *cb;
		struct bpf_hrtimer *timer;
		struct bpf_work *work;
	};
} __attribute__((aligned(8)));
1178 
1179 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
1180 
1181 static void bpf_async_refcount_put(struct bpf_async_cb *cb);
1182 
/* hrtimer callback servicing a bpf_timer: invokes the BPF callback with
 * (map, key, value). Always returns HRTIMER_NORESTART; re-arming is done
 * by the prog itself via bpf_timer_start().
 */
static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
	struct bpf_map *map = t->cb.map;
	void *value = t->cb.value;
	bpf_callback_t callback_fn;
	void *key;
	u32 idx;

	/* Make BTF for struct bpf_timer available to BPF programs */
	BTF_TYPE_EMIT(struct bpf_timer);
	callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
	if (!callback_fn)
		goto out;

	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
	 * Remember the timer this callback is servicing to prevent
	 * deadlock if callback_fn() calls bpf_timer_cancel() or
	 * bpf_map_delete_elem() on the same timer.
	 */
	this_cpu_write(hrtimer_running, t);

	key = map_key_from_value(map, value, &idx);

	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
	/* The verifier checked that return value is zero. */

	this_cpu_write(hrtimer_running, NULL);
out:
	return HRTIMER_NORESTART;
}
1214 
1215 static void bpf_wq_work(struct work_struct *work)
1216 {
1217 	struct bpf_work *w = container_of(work, struct bpf_work, work);
1218 	struct bpf_async_cb *cb = &w->cb;
1219 	struct bpf_map *map = cb->map;
1220 	bpf_callback_t callback_fn;
1221 	void *value = cb->value;
1222 	void *key;
1223 	u32 idx;
1224 
1225 	BTF_TYPE_EMIT(struct bpf_wq);
1226 
1227 	callback_fn = READ_ONCE(cb->callback_fn);
1228 	if (!callback_fn)
1229 		return;
1230 
1231 	key = map_key_from_value(map, value, &idx);
1232 
1233         rcu_read_lock_trace();
1234         migrate_disable();
1235 
1236 	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
1237 
1238 	migrate_enable();
1239 	rcu_read_unlock_trace();
1240 }
1241 
/* Final RCU callback: release the prog reference (if any) and free @cb. */
static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
{
	struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);

	/*
	 * Drop the last reference to prog only after RCU GP, as set_callback()
	 * may race with cancel_and_free()
	 */
	if (cb->prog)
		bpf_prog_put(cb->prog);

	kfree_nolock(cb);
}
1255 
/* Callback from call_rcu_tasks_trace, chains to call_rcu for final free.
 * Runs with refcnt already at zero: re-cancels the timer/wq in case a
 * start raced with cancel_and_free, re-arming another GP wait while the
 * callback may still be executing.
 */
static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu)
{
	struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
	struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
	struct bpf_work *w = container_of(cb, struct bpf_work, cb);
	bool retry = false;

	/*
	 * bpf_async_cancel_and_free() tried to cancel timer/wq, but it
	 * could have raced with timer/wq_start. Now refcnt is zero and
	 * srcu/rcu GP completed. Cancel timer/wq again.
	 */
	switch (cb->type) {
	case BPF_ASYNC_TYPE_TIMER:
		/* negative return means the callback is currently running */
		if (hrtimer_try_to_cancel(&t->timer) < 0)
			retry = true;
		break;
	case BPF_ASYNC_TYPE_WQ:
		if (!cancel_work(&w->work) && work_busy(&w->work))
			retry = true;
		break;
	}
	if (retry) {
		/*
		 * hrtimer or wq callback may still be running. It must be
		 * in rcu_tasks_trace or rcu CS, so wait for GP again.
		 * It won't retry forever, since refcnt zero prevents all
		 * operations on timer/wq.
		 */
		call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
		return;
	}

	/* RCU Tasks Trace grace period implies RCU grace period. */
	bpf_async_cb_rcu_free(rcu);
}
1293 
1294 static void worker_for_call_rcu(struct irq_work *work)
1295 {
1296 	struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
1297 
1298 	call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
1299 }
1300 
/* Drop one reference on @cb. The final put schedules freeing after an RCU
 * Tasks Trace grace period; with IRQs disabled it bounces through irq_work
 * first (worker_for_call_rcu) rather than calling call_rcu_tasks_trace()
 * directly.
 */
static void bpf_async_refcount_put(struct bpf_async_cb *cb)
{
	if (!refcount_dec_and_test(&cb->refcnt))
		return;

	if (irqs_disabled()) {
		cb->worker = IRQ_WORK_INIT(worker_for_call_rcu);
		irq_work_queue(&cb->worker);
	} else {
		call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
	}
}
1313 
1314 static void bpf_async_cancel_and_free(struct bpf_async_kern *async);
1315 static void bpf_async_irq_worker(struct irq_work *work);
1316 
/* Allocate and publish the kernel object backing a bpf_timer/bpf_wq field.
 *
 * Returns 0 on success, -EBUSY if the field is already initialized (checked
 * twice: cheap READ_ONCE, then the authoritative cmpxchg), -ENOMEM on
 * allocation failure, -EPERM when user space no longer holds the map.
 */
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
			    enum bpf_async_type type)
{
	struct bpf_async_cb *cb, *old_cb;
	struct bpf_hrtimer *t;
	struct bpf_work *w;
	clockid_t clockid;
	size_t size;

	switch (type) {
	case BPF_ASYNC_TYPE_TIMER:
		size = sizeof(struct bpf_hrtimer);
		break;
	case BPF_ASYNC_TYPE_WQ:
		size = sizeof(struct bpf_work);
		break;
	default:
		return -EINVAL;
	}

	/* Fast-path check before allocating; cmpxchg below is authoritative */
	old_cb = READ_ONCE(async->cb);
	if (old_cb)
		return -EBUSY;

	cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
	if (!cb)
		return -ENOMEM;

	switch (type) {
	case BPF_ASYNC_TYPE_TIMER:
		clockid = flags & (MAX_CLOCKS - 1);
		t = (struct bpf_hrtimer *)cb;

		atomic_set(&t->cancelling, 0);
		hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
		/* value points at the start of the map value containing us */
		cb->value = (void *)async - map->record->timer_off;
		break;
	case BPF_ASYNC_TYPE_WQ:
		w = (struct bpf_work *)cb;

		INIT_WORK(&w->work, bpf_wq_work);
		cb->value = (void *)async - map->record->wq_off;
		break;
	}
	cb->map = map;
	cb->prog = NULL;
	cb->flags = flags;
	cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker);
	init_llist_head(&cb->async_cmds);
	refcount_set(&cb->refcnt, 1); /* map's reference */
	cb->type = type;
	rcu_assign_pointer(cb->callback_fn, NULL);

	old_cb = cmpxchg(&async->cb, NULL, cb);
	if (old_cb) {
		/* Lost the race to initialize this bpf_async_kern, drop the allocated object */
		kfree_nolock(cb);
		return -EBUSY;
	}
	/* Guarantee the order between async->cb and map->usercnt. So
	 * when there are concurrent uref release and bpf timer init, either
	 * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
	 * timer or atomic64_read() below returns a zero usercnt.
	 */
	smp_mb();
	if (!atomic64_read(&map->usercnt)) {
		/* maps with timers must be either held by user space
		 * or pinned in bpffs.
		 */
		bpf_async_cancel_and_free(async);
		return -EPERM;
	}

	return 0;
}
1392 
/* Initialize a bpf_timer embedded in a map value. The low bits of @flags
 * select the clockid; only CLOCK_MONOTONIC, CLOCK_REALTIME and
 * CLOCK_BOOTTIME are accepted.
 */
BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
	   u64, flags)
{
	clock_t clockid = flags & (MAX_CLOCKS - 1);

	BUILD_BUG_ON(MAX_CLOCKS != 16);
	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));

	if (flags >= MAX_CLOCKS ||
	    /* similar to timerfd except _ALARM variants are not supported */
	    (clockid != CLOCK_MONOTONIC &&
	     clockid != CLOCK_REALTIME &&
	     clockid != CLOCK_BOOTTIME))
		return -EINVAL;

	return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
}
1411 
/* Helper proto for bpf_timer_init(); arg3 (flags) carries the clockid. */
static const struct bpf_func_proto bpf_timer_init_proto = {
	.func		= bpf_timer_init,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
1420 
/* Atomically install (@prog, @callback_fn) on @cb. Each loop iteration
 * acquires one prog ref, publishes the pair, and releases the ref of
 * whatever it displaced, so concurrent callers converge without leaking
 * references. Passing NULL/NULL clears the callback.
 */
static int bpf_async_update_prog_callback(struct bpf_async_cb *cb,
					  struct bpf_prog *prog,
					  void *callback_fn)
{
	struct bpf_prog *prev;

	/* Acquire a guard reference on prog to prevent it from being freed during the loop */
	if (prog) {
		prog = bpf_prog_inc_not_zero(prog);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	do {
		if (prog)
			prog = bpf_prog_inc_not_zero(prog);
		prev = xchg(&cb->prog, prog);
		rcu_assign_pointer(cb->callback_fn, callback_fn);

		/*
		 * Release previous prog, make sure that if other CPU is contending,
		 * to set bpf_prog, references are not leaked as each iteration acquires and
		 * releases one reference.
		 */
		if (prev)
			bpf_prog_put(prev);

	} while (READ_ONCE(cb->prog) != prog ||
		 (void __force *)READ_ONCE(cb->callback_fn) != callback_fn);

	/* Drop the guard reference */
	if (prog)
		bpf_prog_put(prog);

	return 0;
}
1456 
1457 static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running);
1458 
/* Queue a start/cancel on @cb for deferred execution via irq_work.
 * The caller's reference on @cb is consumed: dropped here on failure,
 * otherwise dropped later by bpf_async_process_op().
 */
static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op,
				 u64 nsec, u32 timer_mode)
{
	/*
	 * Do not schedule another operation on this cpu if it's in irq_work
	 * callback that is processing async_cmds queue. Otherwise the following
	 * loop is possible:
	 * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue().
	 * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start().
	 */
	if (this_cpu_read(async_cb_running) == cb) {
		bpf_async_refcount_put(cb);
		return -EDEADLK;
	}

	struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE);

	if (!cmd) {
		bpf_async_refcount_put(cb);
		return -ENOMEM;
	}
	init_llist_node(&cmd->node);
	cmd->nsec = nsec;
	cmd->mode = timer_mode;
	cmd->op = op;
	/* llist_add() returns true when the list was empty: first enqueuer
	 * kicks the irq_work, later ones piggyback on it.
	 */
	if (llist_add(&cmd->node, &cb->async_cmds))
		irq_work_queue(&cb->worker);
	return 0;
}
1488 
1489 static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
1490 				    struct bpf_prog *prog)
1491 {
1492 	struct bpf_async_cb *cb;
1493 
1494 	cb = READ_ONCE(async->cb);
1495 	if (!cb)
1496 		return -EINVAL;
1497 
1498 	return bpf_async_update_prog_callback(cb, prog, callback_fn);
1499 }
1500 
/* Attach the verified BPF callback function of the calling prog to @timer. */
BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
	   struct bpf_prog_aux *, aux)
{
	return __bpf_async_set_callback(timer, callback_fn, aux->prog);
}
1506 
/* Helper proto for bpf_timer_set_callback(). */
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
	.func		= bpf_timer_set_callback,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_PTR_TO_FUNC,
};
1514 
1515 static bool defer_timer_wq_op(void)
1516 {
1517 	return in_hardirq() || irqs_disabled();
1518 }
1519 
/* Arm the timer to fire @nsecs from now (or at absolute time @nsecs with
 * BPF_F_TIMER_ABS; BPF_F_TIMER_CPU_PIN pins it to the current cpu).
 * Requires the timer to be initialized and have a callback set. From
 * hardirq/irqs-disabled context the hrtimer_start() is deferred to
 * irq_work.
 */
BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags)
{
	struct bpf_hrtimer *t;
	u32 mode;

	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
		return -EINVAL;

	t = READ_ONCE(async->timer);
	if (!t || !READ_ONCE(t->cb.prog))
		return -EINVAL;

	if (flags & BPF_F_TIMER_ABS)
		mode = HRTIMER_MODE_ABS_SOFT;
	else
		mode = HRTIMER_MODE_REL_SOFT;

	if (flags & BPF_F_TIMER_CPU_PIN)
		mode |= HRTIMER_MODE_PINNED;

	/*
	 * bpf_async_cancel_and_free() could have dropped refcnt to zero. In
	 * such case BPF progs are not allowed to arm the timer to prevent UAF.
	 */
	if (!refcount_inc_not_zero(&t->cb.refcnt))
		return -ENOENT;

	if (!defer_timer_wq_op()) {
		hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
		bpf_async_refcount_put(&t->cb);
		return 0;
	} else {
		/* schedule_op consumes the reference acquired above */
		return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode);
	}
}
1555 
/* Helper proto for bpf_timer_start(). */
static const struct bpf_func_proto bpf_timer_start_proto = {
	.func		= bpf_timer_start,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};
1564 
/* Cancel the timer and wait for its callback to finish if it is running.
 * Guards against two deadlocks: a callback cancelling its own timer, and
 * two callbacks concurrently cancelling each other's timers. Propagates
 * hrtimer_cancel()'s return value on success.
 */
BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async)
{
	struct bpf_hrtimer *t, *cur_t;
	bool inc = false;
	int ret = 0;

	/* hrtimer_cancel() sleeps-waits; not usable in atomic context */
	if (defer_timer_wq_op())
		return -EOPNOTSUPP;

	t = READ_ONCE(async->timer);
	if (!t)
		return -EINVAL;

	cur_t = this_cpu_read(hrtimer_running);
	if (cur_t == t) {
		/* If bpf callback_fn is trying to bpf_timer_cancel()
		 * its own timer the hrtimer_cancel() will deadlock
		 * since it waits for callback_fn to finish.
		 */
		return -EDEADLK;
	}

	/* Only account in-flight cancellations when invoked from a timer
	 * callback, since we want to avoid waiting only if other _callbacks_
	 * are waiting on us, to avoid introducing lockups. Non-callback paths
	 * are ok, since nobody would synchronously wait for their completion.
	 */
	if (!cur_t)
		goto drop;
	atomic_inc(&t->cancelling);
	/* Need full barrier after relaxed atomic_inc */
	smp_mb__after_atomic();
	inc = true;
	if (atomic_read(&cur_t->cancelling)) {
		/* We're cancelling timer t, while some other timer callback is
		 * attempting to cancel us. In such a case, it might be possible
		 * that timer t belongs to the other callback, or some other
		 * callback waiting upon it (creating transitive dependencies
		 * upon us), and we will enter a deadlock if we continue
		 * cancelling and waiting for it synchronously, since it might
		 * do the same. Bail!
		 */
		atomic_dec(&t->cancelling);
		return -EDEADLK;
	}
drop:
	bpf_async_update_prog_callback(&t->cb, NULL, NULL);
	/* Cancel the timer and wait for associated callback to finish
	 * if it was running.
	 */
	ret = hrtimer_cancel(&t->timer);
	if (inc)
		atomic_dec(&t->cancelling);
	return ret;
}
1620 
/* Helper proto for bpf_timer_cancel(). */
static const struct bpf_func_proto bpf_timer_cancel_proto = {
	.func		= bpf_timer_cancel,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
};
1627 
/* Execute a start/cancel on the underlying hrtimer or work item, then
 * drop the reference the requester handed over (the caller-acquired ref
 * in the deferred path, or the final map reference in the
 * cancel-and-free path).
 */
static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op,
				 u64 timer_nsec, u32 timer_mode)
{
	switch (cb->type) {
	case BPF_ASYNC_TYPE_TIMER: {
		struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);

		switch (op) {
		case BPF_ASYNC_START:
			hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode);
			break;
		case BPF_ASYNC_CANCEL:
			/* non-blocking cancel; the free path re-checks later */
			hrtimer_try_to_cancel(&t->timer);
			break;
		}
		break;
	}
	case BPF_ASYNC_TYPE_WQ: {
		struct bpf_work *w = container_of(cb, struct bpf_work, cb);

		switch (op) {
		case BPF_ASYNC_START:
			schedule_work(&w->work);
			break;
		case BPF_ASYNC_CANCEL:
			cancel_work(&w->work);
			break;
		}
		break;
	}
	}
	bpf_async_refcount_put(cb);
}
1661 
/* irq_work handler: drain queued commands in FIFO order (llist is LIFO,
 * hence the reverse). async_cb_running marks @cb as being processed so
 * BPF progs invoked underneath cannot requeue onto it
 * (see bpf_async_schedule_op()).
 */
static void bpf_async_irq_worker(struct irq_work *work)
{
	struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
	struct llist_node *pos, *n, *list;

	list = llist_del_all(&cb->async_cmds);
	if (!list)
		return;

	list = llist_reverse_order(list);
	this_cpu_write(async_cb_running, cb);
	llist_for_each_safe(pos, n, list) {
		struct bpf_async_cmd *cmd;

		cmd = container_of(pos, struct bpf_async_cmd, node);
		bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode);
		kfree_nolock(cmd);
	}
	this_cpu_write(async_cb_running, NULL);
}
1682 
/* Detach the async object from its map element (xchg to NULL), clear the
 * prog/callback, cancel the timer/wq and drop the map's reference — the
 * last one, since progs can no longer find the object. Actual freeing
 * happens after RCU grace periods.
 */
static void bpf_async_cancel_and_free(struct bpf_async_kern *async)
{
	struct bpf_async_cb *cb;

	/* Fast-path check; the xchg below is authoritative */
	if (!READ_ONCE(async->cb))
		return;

	cb = xchg(&async->cb, NULL);
	if (!cb)
		return;

	bpf_async_update_prog_callback(cb, NULL, NULL);
	/*
	 * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last
	 * refcnt. Either synchronously or asynchronously in irq_work.
	 */

	if (!defer_timer_wq_op()) {
		bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0);
	} else {
		(void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
		/*
		 * bpf_async_schedule_op() either enqueues allocated cmd into llist
		 * or fails with ENOMEM and drop the last refcnt.
		 * This is unlikely, but safe, since bpf_async_cb_rcu_tasks_trace_free()
		 * callback will do additional timer/wq_cancel due to races anyway.
		 */
	}
}
1712 
1713 /*
1714  * This function is called by map_delete/update_elem for individual element and
1715  * by ops->map_release_uref when the user space reference to a map reaches zero.
1716  */
1717 void bpf_timer_cancel_and_free(void *val)
1718 {
1719 	bpf_async_cancel_and_free(val);
1720 }
1721 
1722 /*
1723  * This function is called by map_delete/update_elem for individual element and
1724  * by ops->map_release_uref when the user space reference to a map reaches zero.
1725  */
1726 void bpf_wq_cancel_and_free(void *val)
1727 {
1728 	bpf_async_cancel_and_free(val);
1729 }
1730 
/* Atomically exchange the kernel pointer stored at @dst with @ptr and
 * return the previous value.
 */
BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
{
	unsigned long *kptr = dst;

	/* This helper may be inlined by verifier. */
	return xchg(kptr, (unsigned long)ptr);
}
1738 
/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
 * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
 * denote type that verifier will determine.
 */
static const struct bpf_func_proto bpf_kptr_xchg_proto = {
	.func         = bpf_kptr_xchg,
	.gpl_only     = false,
	.ret_type     = RET_PTR_TO_BTF_ID_OR_NULL,
	.ret_btf_id   = BPF_PTR_POISON,
	.arg1_type    = ARG_KPTR_XCHG_DEST,
	.arg2_type    = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
	.arg2_btf_id  = BPF_PTR_POISON,
};
1752 
/* Backing state for a file-backed dynptr (BPF_DYNPTR_TYPE_FILE),
 * pointed to by bpf_dynptr_kern::data.
 */
struct bpf_dynptr_file_impl {
	struct freader freader;
	/* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */
	u64 offset;
	u64 size;
};
1759 
1760 /* Since the upper 8 bits of dynptr->size is reserved, the
1761  * maximum supported size is 2^24 - 1.
1762  */
1763 #define DYNPTR_MAX_SIZE	((1UL << 24) - 1)
1764 #define DYNPTR_TYPE_SHIFT	28
1765 #define DYNPTR_SIZE_MASK	0xFFFFFF
1766 #define DYNPTR_RDONLY_BIT	BIT(31)
1767 
1768 bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
1769 {
1770 	return ptr->size & DYNPTR_RDONLY_BIT;
1771 }
1772 
1773 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
1774 {
1775 	ptr->size |= DYNPTR_RDONLY_BIT;
1776 }
1777 
1778 static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
1779 {
1780 	ptr->size |= type << DYNPTR_TYPE_SHIFT;
1781 }
1782 
1783 static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
1784 {
1785 	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
1786 }
1787 
1788 u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
1789 {
1790 	if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
1791 		struct bpf_dynptr_file_impl *df = ptr->data;
1792 
1793 		return df->size;
1794 	}
1795 
1796 	return ptr->size & DYNPTR_SIZE_MASK;
1797 }
1798 
1799 static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off)
1800 {
1801 	if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
1802 		struct bpf_dynptr_file_impl *df = ptr->data;
1803 
1804 		df->offset += off;
1805 		return;
1806 	}
1807 	ptr->offset += off;
1808 }
1809 
1810 static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size)
1811 {
1812 	u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
1813 
1814 	if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
1815 		struct bpf_dynptr_file_impl *df = ptr->data;
1816 
1817 		df->size = new_size;
1818 		return;
1819 	}
1820 	ptr->size = (u32)new_size | metadata;
1821 }
1822 
1823 int bpf_dynptr_check_size(u64 size)
1824 {
1825 	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
1826 }
1827 
/* Read @len bytes at @offset (relative to the dynptr's own 64 bit offset)
 * from a file-backed dynptr into @buf via the freader. Returns 0 on
 * success or the freader's error code.
 */
static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len)
{
	const void *ptr;

	if (!buf)
		return -EINVAL;

	df->freader.buf = buf;
	df->freader.buf_sz = len;
	ptr = freader_fetch(&df->freader, offset + df->offset, len);
	if (!ptr)
		return df->freader.err;

	/* freader_fetch() may return a pointer other than @buf — presumably
	 * directly-mapped data; copy so callers always find the bytes in @buf.
	 */
	if (ptr != buf) /* Force copying into the buffer */
		memcpy(buf, ptr, len);

	return 0;
}
1846 
1847 void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
1848 		     enum bpf_dynptr_type type, u32 offset, u32 size)
1849 {
1850 	ptr->data = data;
1851 	ptr->offset = offset;
1852 	ptr->size = size;
1853 	bpf_dynptr_set_type(ptr, type);
1854 }
1855 
/* Invalidate a dynptr. A zeroed dynptr has NULL ->data and is rejected
 * by the read/write/data helpers.
 */
void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
{
	memset(ptr, 0, sizeof(*ptr));
}
1860 
1861 BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr)
1862 {
1863 	int err;
1864 
1865 	BTF_TYPE_EMIT(struct bpf_dynptr);
1866 
1867 	err = bpf_dynptr_check_size(size);
1868 	if (err)
1869 		goto error;
1870 
1871 	/* flags is currently unsupported */
1872 	if (flags) {
1873 		err = -EINVAL;
1874 		goto error;
1875 	}
1876 
1877 	bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
1878 
1879 	return 0;
1880 
1881 error:
1882 	bpf_dynptr_set_null(ptr);
1883 	return err;
1884 }
1885 
/* Helper proto for bpf_dynptr_from_mem(); arg4 is the uninit dynptr out-arg. */
static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
	.func		= bpf_dynptr_from_mem,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
1895 
/* Copy @len bytes at @offset from dynptr @src into @dst. Rejects
 * invalidated (NULL data) dynptrs and nonzero @flags; the range is
 * bounds-checked against the dynptr's size.
 */
static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src,
			     u64 offset, u64 flags)
{
	enum bpf_dynptr_type type;
	int err;

	if (!src->data || flags)
		return -EINVAL;

	err = bpf_dynptr_check_off_len(src, offset, len);
	if (err)
		return err;

	type = bpf_dynptr_get_type(src);

	switch (type) {
	case BPF_DYNPTR_TYPE_LOCAL:
	case BPF_DYNPTR_TYPE_RINGBUF:
		/* Source and destination may possibly overlap, hence use memmove to
		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
		 */
		memmove(dst, src->data + src->offset + offset, len);
		return 0;
	case BPF_DYNPTR_TYPE_SKB:
		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
	case BPF_DYNPTR_TYPE_XDP:
		return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
	case BPF_DYNPTR_TYPE_SKB_META:
		memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
		return 0;
	case BPF_DYNPTR_TYPE_FILE:
		return bpf_file_fetch_bytes(src->data, offset, dst, len);
	default:
		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
		return -EFAULT;
	}
}
1934 
/* Helper entry point: copy @len bytes at @offset from @src into @dst. */
BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src,
	   u64, offset, u64, flags)
{
	return __bpf_dynptr_read(dst, len, src, offset, flags);
}
1940 
/* Helper proto for bpf_dynptr_read(). */
static const struct bpf_func_proto bpf_dynptr_read_proto = {
	.func		= bpf_dynptr_read,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};
1951 
/* Copy @len bytes from @src into dynptr @dst at @offset. Rejects
 * invalidated (NULL data) and read-only dynptrs. @flags are passed
 * through only for skb/skb_meta dynptrs and must be zero for the rest.
 * There is no FILE case, so file dynptrs fall through to -EFAULT.
 */
int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src,
		       u64 len, u64 flags)
{
	enum bpf_dynptr_type type;
	int err;

	if (!dst->data || __bpf_dynptr_is_rdonly(dst))
		return -EINVAL;

	err = bpf_dynptr_check_off_len(dst, offset, len);
	if (err)
		return err;

	type = bpf_dynptr_get_type(dst);

	switch (type) {
	case BPF_DYNPTR_TYPE_LOCAL:
	case BPF_DYNPTR_TYPE_RINGBUF:
		if (flags)
			return -EINVAL;
		/* Source and destination may possibly overlap, hence use memmove to
		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
		 */
		memmove(dst->data + dst->offset + offset, src, len);
		return 0;
	case BPF_DYNPTR_TYPE_SKB:
		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
					     flags);
	case BPF_DYNPTR_TYPE_XDP:
		if (flags)
			return -EINVAL;
		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
	case BPF_DYNPTR_TYPE_SKB_META:
		return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src,
						  len, flags);
	default:
		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
		return -EFAULT;
	}
}
1993 
/* Helper entry point: copy @len bytes from @src into @dst at @offset. */
BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src,
	   u64, len, u64, flags)
{
	return __bpf_dynptr_write(dst, offset, src, len, flags);
}
1999 
/* Helper proto for bpf_dynptr_write(). */
static const struct bpf_func_proto bpf_dynptr_write_proto = {
	.func		= bpf_dynptr_write,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg5_type	= ARG_ANYTHING,
};
2010 
2011 BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len)
2012 {
2013 	enum bpf_dynptr_type type;
2014 	int err;
2015 
2016 	if (!ptr->data)
2017 		return 0;
2018 
2019 	err = bpf_dynptr_check_off_len(ptr, offset, len);
2020 	if (err)
2021 		return 0;
2022 
2023 	if (__bpf_dynptr_is_rdonly(ptr))
2024 		return 0;
2025 
2026 	type = bpf_dynptr_get_type(ptr);
2027 
2028 	switch (type) {
2029 	case BPF_DYNPTR_TYPE_LOCAL:
2030 	case BPF_DYNPTR_TYPE_RINGBUF:
2031 		return (unsigned long)(ptr->data + ptr->offset + offset);
2032 	case BPF_DYNPTR_TYPE_SKB:
2033 	case BPF_DYNPTR_TYPE_XDP:
2034 	case BPF_DYNPTR_TYPE_SKB_META:
2035 		/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
2036 		return 0;
2037 	default:
2038 		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
2039 		return 0;
2040 	}
2041 }
2042 
/* Helper proto for bpf_dynptr_data(). */
static const struct bpf_func_proto bpf_dynptr_data_proto = {
	.func		= bpf_dynptr_data,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_DYNPTR_MEM_OR_NULL,
	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
};
2051 
2052 const struct bpf_func_proto bpf_get_current_task_proto __weak;
2053 const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
2054 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
2055 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
2056 const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
2057 const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
2058 const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
2059 const struct bpf_func_proto bpf_perf_event_read_proto __weak;
2060 const struct bpf_func_proto bpf_send_signal_proto __weak;
2061 const struct bpf_func_proto bpf_send_signal_thread_proto __weak;
2062 const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak;
2063 const struct bpf_func_proto bpf_get_task_stack_proto __weak;
2064 const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak;
2065 
2066 const struct bpf_func_proto *
2067 bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2068 {
2069 	switch (func_id) {
2070 	case BPF_FUNC_map_lookup_elem:
2071 		return &bpf_map_lookup_elem_proto;
2072 	case BPF_FUNC_map_update_elem:
2073 		return &bpf_map_update_elem_proto;
2074 	case BPF_FUNC_map_delete_elem:
2075 		return &bpf_map_delete_elem_proto;
2076 	case BPF_FUNC_map_push_elem:
2077 		return &bpf_map_push_elem_proto;
2078 	case BPF_FUNC_map_pop_elem:
2079 		return &bpf_map_pop_elem_proto;
2080 	case BPF_FUNC_map_peek_elem:
2081 		return &bpf_map_peek_elem_proto;
2082 	case BPF_FUNC_map_lookup_percpu_elem:
2083 		return &bpf_map_lookup_percpu_elem_proto;
2084 	case BPF_FUNC_get_prandom_u32:
2085 		return &bpf_get_prandom_u32_proto;
2086 	case BPF_FUNC_get_smp_processor_id:
2087 		return &bpf_get_raw_smp_processor_id_proto;
2088 	case BPF_FUNC_get_numa_node_id:
2089 		return &bpf_get_numa_node_id_proto;
2090 	case BPF_FUNC_tail_call:
2091 		return &bpf_tail_call_proto;
2092 	case BPF_FUNC_ktime_get_ns:
2093 		return &bpf_ktime_get_ns_proto;
2094 	case BPF_FUNC_ktime_get_boot_ns:
2095 		return &bpf_ktime_get_boot_ns_proto;
2096 	case BPF_FUNC_ktime_get_tai_ns:
2097 		return &bpf_ktime_get_tai_ns_proto;
2098 	case BPF_FUNC_ringbuf_output:
2099 		return &bpf_ringbuf_output_proto;
2100 	case BPF_FUNC_ringbuf_reserve:
2101 		return &bpf_ringbuf_reserve_proto;
2102 	case BPF_FUNC_ringbuf_submit:
2103 		return &bpf_ringbuf_submit_proto;
2104 	case BPF_FUNC_ringbuf_discard:
2105 		return &bpf_ringbuf_discard_proto;
2106 	case BPF_FUNC_ringbuf_query:
2107 		return &bpf_ringbuf_query_proto;
2108 	case BPF_FUNC_strncmp:
2109 		return &bpf_strncmp_proto;
2110 	case BPF_FUNC_strtol:
2111 		return &bpf_strtol_proto;
2112 	case BPF_FUNC_strtoul:
2113 		return &bpf_strtoul_proto;
2114 	case BPF_FUNC_get_current_pid_tgid:
2115 		return &bpf_get_current_pid_tgid_proto;
2116 	case BPF_FUNC_get_ns_current_pid_tgid:
2117 		return &bpf_get_ns_current_pid_tgid_proto;
2118 	case BPF_FUNC_get_current_uid_gid:
2119 		return &bpf_get_current_uid_gid_proto;
2120 	default:
2121 		break;
2122 	}
2123 
2124 	if (!bpf_token_capable(prog->aux->token, CAP_BPF))
2125 		return NULL;
2126 
2127 	switch (func_id) {
2128 	case BPF_FUNC_spin_lock:
2129 		return &bpf_spin_lock_proto;
2130 	case BPF_FUNC_spin_unlock:
2131 		return &bpf_spin_unlock_proto;
2132 	case BPF_FUNC_jiffies64:
2133 		return &bpf_jiffies64_proto;
2134 	case BPF_FUNC_per_cpu_ptr:
2135 		return &bpf_per_cpu_ptr_proto;
2136 	case BPF_FUNC_this_cpu_ptr:
2137 		return &bpf_this_cpu_ptr_proto;
2138 	case BPF_FUNC_timer_init:
2139 		return &bpf_timer_init_proto;
2140 	case BPF_FUNC_timer_set_callback:
2141 		return &bpf_timer_set_callback_proto;
2142 	case BPF_FUNC_timer_start:
2143 		return &bpf_timer_start_proto;
2144 	case BPF_FUNC_timer_cancel:
2145 		return &bpf_timer_cancel_proto;
2146 	case BPF_FUNC_kptr_xchg:
2147 		return &bpf_kptr_xchg_proto;
2148 	case BPF_FUNC_for_each_map_elem:
2149 		return &bpf_for_each_map_elem_proto;
2150 	case BPF_FUNC_loop:
2151 		return &bpf_loop_proto;
2152 	case BPF_FUNC_user_ringbuf_drain:
2153 		return &bpf_user_ringbuf_drain_proto;
2154 	case BPF_FUNC_ringbuf_reserve_dynptr:
2155 		return &bpf_ringbuf_reserve_dynptr_proto;
2156 	case BPF_FUNC_ringbuf_submit_dynptr:
2157 		return &bpf_ringbuf_submit_dynptr_proto;
2158 	case BPF_FUNC_ringbuf_discard_dynptr:
2159 		return &bpf_ringbuf_discard_dynptr_proto;
2160 	case BPF_FUNC_dynptr_from_mem:
2161 		return &bpf_dynptr_from_mem_proto;
2162 	case BPF_FUNC_dynptr_read:
2163 		return &bpf_dynptr_read_proto;
2164 	case BPF_FUNC_dynptr_write:
2165 		return &bpf_dynptr_write_proto;
2166 	case BPF_FUNC_dynptr_data:
2167 		return &bpf_dynptr_data_proto;
2168 #ifdef CONFIG_CGROUPS
2169 	case BPF_FUNC_cgrp_storage_get:
2170 		return &bpf_cgrp_storage_get_proto;
2171 	case BPF_FUNC_cgrp_storage_delete:
2172 		return &bpf_cgrp_storage_delete_proto;
2173 	case BPF_FUNC_get_current_cgroup_id:
2174 		return &bpf_get_current_cgroup_id_proto;
2175 	case BPF_FUNC_get_current_ancestor_cgroup_id:
2176 		return &bpf_get_current_ancestor_cgroup_id_proto;
2177 	case BPF_FUNC_current_task_under_cgroup:
2178 		return &bpf_current_task_under_cgroup_proto;
2179 #endif
2180 #ifdef CONFIG_CGROUP_NET_CLASSID
2181 	case BPF_FUNC_get_cgroup_classid:
2182 		return &bpf_get_cgroup_classid_curr_proto;
2183 #endif
2184 	case BPF_FUNC_task_storage_get:
2185 		return &bpf_task_storage_get_proto;
2186 	case BPF_FUNC_task_storage_delete:
2187 		return &bpf_task_storage_delete_proto;
2188 	default:
2189 		break;
2190 	}
2191 
2192 	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
2193 		return NULL;
2194 
2195 	switch (func_id) {
2196 	case BPF_FUNC_trace_printk:
2197 		return bpf_get_trace_printk_proto();
2198 	case BPF_FUNC_get_current_task:
2199 		return &bpf_get_current_task_proto;
2200 	case BPF_FUNC_get_current_task_btf:
2201 		return &bpf_get_current_task_btf_proto;
2202 	case BPF_FUNC_get_current_comm:
2203 		return &bpf_get_current_comm_proto;
2204 	case BPF_FUNC_probe_read_user:
2205 		return &bpf_probe_read_user_proto;
2206 	case BPF_FUNC_probe_read_kernel:
2207 		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
2208 		       NULL : &bpf_probe_read_kernel_proto;
2209 	case BPF_FUNC_probe_read_user_str:
2210 		return &bpf_probe_read_user_str_proto;
2211 	case BPF_FUNC_probe_read_kernel_str:
2212 		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
2213 		       NULL : &bpf_probe_read_kernel_str_proto;
2214 	case BPF_FUNC_copy_from_user:
2215 		return &bpf_copy_from_user_proto;
2216 	case BPF_FUNC_copy_from_user_task:
2217 		return &bpf_copy_from_user_task_proto;
2218 	case BPF_FUNC_snprintf_btf:
2219 		return &bpf_snprintf_btf_proto;
2220 	case BPF_FUNC_snprintf:
2221 		return &bpf_snprintf_proto;
2222 	case BPF_FUNC_task_pt_regs:
2223 		return &bpf_task_pt_regs_proto;
2224 	case BPF_FUNC_trace_vprintk:
2225 		return bpf_get_trace_vprintk_proto();
2226 	case BPF_FUNC_perf_event_read_value:
2227 		return bpf_get_perf_event_read_value_proto();
2228 	case BPF_FUNC_perf_event_read:
2229 		return &bpf_perf_event_read_proto;
2230 	case BPF_FUNC_send_signal:
2231 		return &bpf_send_signal_proto;
2232 	case BPF_FUNC_send_signal_thread:
2233 		return &bpf_send_signal_thread_proto;
2234 	case BPF_FUNC_get_task_stack:
2235 		return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
2236 				       : &bpf_get_task_stack_proto;
2237 	case BPF_FUNC_get_branch_snapshot:
2238 		return &bpf_get_branch_snapshot_proto;
2239 	case BPF_FUNC_find_vma:
2240 		return &bpf_find_vma_proto;
2241 	default:
2242 		return NULL;
2243 	}
2244 }
2245 EXPORT_SYMBOL_GPL(bpf_base_func_proto);
2246 
/* Free every node reachable from a bpf_list_head embedded in a map value.
 * @spin_lock is the bpf_spin_lock protecting @list_head in the same value.
 */
void bpf_list_head_free(const struct btf_field *field, void *list_head,
			struct bpf_spin_lock *spin_lock)
{
	struct list_head *head = list_head, *orig_head = list_head;

	/* struct bpf_list_head must be able to hold a struct list_head */
	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));

	/* Do the actual list draining outside the lock to not hold the lock for
	 * too long, and also prevent deadlocks if tracing programs end up
	 * executing on entry/exit of functions called inside the critical
	 * section, and end up doing map ops that call bpf_list_head_free for
	 * the same map value again.
	 */
	__bpf_spin_lock_irqsave(spin_lock);
	/* !head->next: head was 0-initialized by the map and never used */
	if (!head->next || list_empty(head))
		goto unlock;
	head = head->next;
unlock:
	/* Detach all nodes by resetting the head; after this, 'head' points at
	 * the first detached node, or back at orig_head if the list was empty.
	 */
	INIT_LIST_HEAD(orig_head);
	__bpf_spin_unlock_irqrestore(spin_lock);

	while (head != orig_head) {
		void *obj = head;

		/* Map from the embedded list_head back to the containing object */
		obj -= field->graph_root.node_offset;
		head = head->next;
		/* The contained type can also have resources, including a
		 * bpf_list_head which needs to be freed.
		 */
		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
	}
}
2280 
/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
 * 'rb_node *', so field name of rb_node within containing struct is not
 * needed.
 *
 * Since bpf_rb_tree's node type has a corresponding struct btf_field with
 * graph_root.node_offset, it's not necessary to know field name
 * or type of node struct
 *
 * 'n' is fetched before the body runs, so 'pos' may be freed while visited.
 */
#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
	for (pos = rb_first_postorder(root); \
	    pos && ({ n = rb_next_postorder(pos); 1; }); \
	    pos = n)
2293 
2294 void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
2295 		      struct bpf_spin_lock *spin_lock)
2296 {
2297 	struct rb_root_cached orig_root, *root = rb_root;
2298 	struct rb_node *pos, *n;
2299 	void *obj;
2300 
2301 	BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
2302 	BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
2303 
2304 	__bpf_spin_lock_irqsave(spin_lock);
2305 	orig_root = *root;
2306 	*root = RB_ROOT_CACHED;
2307 	__bpf_spin_unlock_irqrestore(spin_lock);
2308 
2309 	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
2310 		obj = pos;
2311 		obj -= field->graph_root.node_offset;
2312 
2313 
2314 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
2315 	}
2316 }
2317 
2318 __bpf_kfunc_start_defs();
2319 
2320 /**
2321  * bpf_obj_new() - allocate an object described by program BTF
2322  * @local_type_id__k: type ID in program BTF
2323  * @meta: verifier-supplied struct metadata
2324  *
2325  * Allocate an object of the type identified by @local_type_id__k and
2326  * initialize its special fields. BPF programs can use
2327  * bpf_core_type_id_local() to provide @local_type_id__k. The verifier
2328  * rewrites @meta; BPF programs do not set it.
2329  *
2330  * Return: Pointer to the allocated object, or %NULL on failure.
2331  */
2332 __bpf_kfunc void *bpf_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
2333 {
2334 	u64 size = local_type_id__k;
2335 	void *p;
2336 
2337 	p = bpf_mem_alloc(&bpf_global_ma, size);
2338 	if (!p)
2339 		return NULL;
2340 	if (meta)
2341 		bpf_obj_init(meta->record, p);
2342 
2343 	return p;
2344 }
2345 
2346 __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
2347 {
2348 	return bpf_obj_new(local_type_id__k, meta__ign);
2349 }
2350 
2351 /**
2352  * bpf_percpu_obj_new() - allocate a percpu object described by program BTF
2353  * @local_type_id__k: type ID in program BTF
2354  * @meta: verifier-supplied struct metadata
2355  *
2356  * Allocate a percpu object of the type identified by @local_type_id__k. BPF
2357  * programs can use bpf_core_type_id_local() to provide @local_type_id__k.
2358  * The verifier rewrites @meta; BPF programs do not set it.
2359  *
2360  * Return: Pointer to the allocated percpu object, or %NULL on failure.
2361  */
2362 __bpf_kfunc void *bpf_percpu_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
2363 {
2364 	u64 size = local_type_id__k;
2365 
2366 	/* The verifier has ensured that meta must be NULL */
2367 	return bpf_mem_alloc(&bpf_global_percpu_ma, size);
2368 }
2369 
2370 __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
2371 {
2372 	return bpf_percpu_obj_new(local_type_id__k, meta__ign);
2373 }
2374 
2375 /* Must be called under migrate_disable(), as required by bpf_mem_free */
2376 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
2377 {
2378 	struct bpf_mem_alloc *ma;
2379 
2380 	if (rec && rec->refcount_off >= 0 &&
2381 	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
2382 		/* Object is refcounted and refcount_dec didn't result in 0
2383 		 * refcount. Return without freeing the object
2384 		 */
2385 		return;
2386 	}
2387 
2388 	if (rec)
2389 		bpf_obj_free_fields(rec, p);
2390 
2391 	if (percpu)
2392 		ma = &bpf_global_percpu_ma;
2393 	else
2394 		ma = &bpf_global_ma;
2395 	bpf_mem_free_rcu(ma, p);
2396 }
2397 
2398 /**
2399  * bpf_obj_drop() - drop a previously allocated object
2400  * @p__alloc: object to free
2401  * @meta: verifier-supplied struct metadata
2402  *
2403  * Destroy special fields in @p__alloc as needed and free the object. The
2404  * verifier rewrites @meta; BPF programs do not set it.
2405  */
2406 __bpf_kfunc void bpf_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
2407 {
2408 	void *p = p__alloc;
2409 
2410 	__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
2411 }
2412 
2413 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
2414 {
2415 	return bpf_obj_drop(p__alloc, meta__ign);
2416 }
2417 
2418 /**
2419  * bpf_percpu_obj_drop() - drop a previously allocated percpu object
2420  * @p__alloc: percpu object to free
2421  * @meta: verifier-supplied struct metadata
2422  *
2423  * Free @p__alloc. The verifier rewrites @meta; BPF programs do not set it.
2424  */
2425 __bpf_kfunc void bpf_percpu_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
2426 {
2427 	/* The verifier has ensured that meta must be NULL */
2428 	bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
2429 }
2430 
2431 __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
2432 {
2433 	bpf_percpu_obj_drop(p__alloc, meta__ign);
2434 }
2435 
2436 /**
2437  * bpf_refcount_acquire() - turn a local kptr into an owning reference
2438  * @p__refcounted_kptr: non-owning local kptr
2439  * @meta: verifier-supplied struct metadata
2440  *
2441  * Increment the refcount for @p__refcounted_kptr. The verifier rewrites
2442  * @meta; BPF programs do not set it.
2443  *
2444  * Return: Owning reference to @p__refcounted_kptr, or %NULL on failure.
2445  */
2446 __bpf_kfunc void *bpf_refcount_acquire(void *p__refcounted_kptr, struct btf_struct_meta *meta)
2447 {
2448 	struct bpf_refcount *ref;
2449 
2450 	/* Could just cast directly to refcount_t *, but need some code using
2451 	 * bpf_refcount type so that it is emitted in vmlinux BTF
2452 	 */
2453 	ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
2454 	if (!refcount_inc_not_zero((refcount_t *)ref))
2455 		return NULL;
2456 
2457 	/* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
2458 	 * in verifier.c
2459 	 */
2460 	return (void *)p__refcounted_kptr;
2461 }
2462 
2463 __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
2464 {
2465 	return bpf_refcount_acquire(p__refcounted_kptr, meta__ign);
2466 }
2467 
/* Link @node into @head, at the back when @tail is true, else at the front.
 * A node may only be inserted while unowned (node->owner == NULL); on
 * failure the node's containing object (at -@off from the list_head) is
 * dropped and -EINVAL is returned.
 */
static int __bpf_list_add(struct bpf_list_node_kern *node,
			  struct bpf_list_head *head,
			  bool tail, struct btf_record *rec, u64 off)
{
	struct list_head *n = &node->list_head, *h = (void *)head;

	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
	 * called on its fields, so init here
	 */
	if (unlikely(!h->next))
		INIT_LIST_HEAD(h);

	/* node->owner != NULL implies !list_empty(n), no need to separately
	 * check the latter
	 */
	/* Atomically claim the node; poison marks it mid-insertion */
	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
		/* Only called from BPF prog, no need to migrate_disable */
		__bpf_obj_drop_impl((void *)n - off, rec, false);
		return -EINVAL;
	}

	/* Insert, then publish this list head as the node's owner */
	tail ? list_add_tail(n, h) : list_add(n, h);
	WRITE_ONCE(node->owner, head);

	return 0;
}
2494 
2495 /**
2496  * bpf_list_push_front() - add a node to the front of a BPF linked list
2497  * @head: list head
2498  * @node: node to insert
2499  * @meta: verifier-supplied struct metadata
2500  * @off: verifier-supplied offset of @node within the containing object
2501  *
2502  * Insert @node at the front of @head. The verifier rewrites @meta and @off;
2503  * BPF programs do not set them.
2504  *
2505  * Return: 0 on success, or %-EINVAL if @node is already linked.
2506  */
2507 __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head,
2508 				    struct bpf_list_node *node,
2509 				    struct btf_struct_meta *meta,
2510 				    u64 off)
2511 {
2512 	struct bpf_list_node_kern *n = (void *)node;
2513 
2514 	return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
2515 }
2516 
2517 __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
2518 					 struct bpf_list_node *node,
2519 					 void *meta__ign, u64 off)
2520 {
2521 	return bpf_list_push_front(head, node, meta__ign, off);
2522 }
2523 
2524 /**
2525  * bpf_list_push_back() - add a node to the back of a BPF linked list
2526  * @head: list head
2527  * @node: node to insert
2528  * @meta: verifier-supplied struct metadata
2529  * @off: verifier-supplied offset of @node within the containing object
2530  *
2531  * Insert @node at the back of @head. The verifier rewrites @meta and @off;
2532  * BPF programs do not set them.
2533  *
2534  * Return: 0 on success, or %-EINVAL if @node is already linked.
2535  */
2536 __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head,
2537 				   struct bpf_list_node *node,
2538 				   struct btf_struct_meta *meta,
2539 				   u64 off)
2540 {
2541 	struct bpf_list_node_kern *n = (void *)node;
2542 
2543 	return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
2544 }
2545 
2546 __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
2547 					struct bpf_list_node *node,
2548 					void *meta__ign, u64 off)
2549 {
2550 	return bpf_list_push_back(head, node, meta__ign, off);
2551 }
2552 
/* Unlink and return the back (@tail true) or front node of @head.
 * Returns NULL when the list is empty or uninitialized, or when the node's
 * recorded owner is not @head (which indicates corruption and warns once).
 */
static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
{
	struct list_head *n, *h = (void *)head;
	struct bpf_list_node_kern *node;

	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
	 * called on its fields, so init here
	 */
	if (unlikely(!h->next))
		INIT_LIST_HEAD(h);
	if (list_empty(h))
		return NULL;

	n = tail ? h->prev : h->next;
	node = container_of(n, struct bpf_list_node_kern, list_head);
	/* A linked node must record this head as its owner */
	if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
		return NULL;

	/* Unlink, then clear ownership so the node can be re-inserted */
	list_del_init(n);
	WRITE_ONCE(node->owner, NULL);
	return (struct bpf_list_node *)n;
}
2575 
2576 __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
2577 {
2578 	return __bpf_list_del(head, false);
2579 }
2580 
2581 __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
2582 {
2583 	return __bpf_list_del(head, true);
2584 }
2585 
2586 __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
2587 {
2588 	struct list_head *h = (struct list_head *)head;
2589 
2590 	if (list_empty(h) || unlikely(!h->next))
2591 		return NULL;
2592 
2593 	return (struct bpf_list_node *)h->next;
2594 }
2595 
2596 __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
2597 {
2598 	struct list_head *h = (struct list_head *)head;
2599 
2600 	if (list_empty(h) || unlikely(!h->next))
2601 		return NULL;
2602 
2603 	return (struct bpf_list_node *)h->prev;
2604 }
2605 
/* Unlink @node from @root. Returns the removed node, or NULL (leaving the
 * tree untouched) when @node is not currently owned by @root.
 */
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
						  struct bpf_rb_node *node)
{
	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
	struct rb_root_cached *r = (struct rb_root_cached *)root;
	struct rb_node *n = &node_internal->rb_node;

	/* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
	 * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
	 */
	if (READ_ONCE(node_internal->owner) != root)
		return NULL;

	/* Erase, then clear ownership so the node can be re-inserted */
	rb_erase_cached(n, r);
	RB_CLEAR_NODE(n);
	WRITE_ONCE(node_internal->owner, NULL);
	return (struct bpf_rb_node *)n;
}
2624 
/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
 * program
 *
 * A node may only be inserted while unowned (node->owner == NULL); on
 * failure the node's containing object (at -@off from the rb_node) is
 * dropped and -EINVAL is returned.
 */
static int __bpf_rbtree_add(struct bpf_rb_root *root,
			    struct bpf_rb_node_kern *node,
			    void *less, struct btf_record *rec, u64 off)
{
	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
	struct rb_node *parent = NULL, *n = &node->rb_node;
	bpf_callback_t cb = (bpf_callback_t)less;
	bool leftmost = true;

	/* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
	 * check the latter
	 */
	/* Atomically claim the node; poison marks it mid-insertion */
	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
		/* Only called from BPF prog, no need to migrate_disable */
		__bpf_obj_drop_impl((void *)n - off, rec, false);
		return -EINVAL;
	}

	/* Standard rbtree insertion walk; the BPF 'less' callback decides
	 * left vs right. leftmost tracks whether the cached leftmost slot
	 * must be updated by rb_insert_color_cached().
	 */
	while (*link) {
		parent = *link;
		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(n, parent, link);
	rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
	/* Publish this tree as the node's owner */
	WRITE_ONCE(node->owner, root);
	return 0;
}
2661 
2662 /**
2663  * bpf_rbtree_add() - add a node to a BPF rbtree
2664  * @root: tree root
2665  * @node: node to insert
2666  * @less: comparator used to order nodes
2667  * @meta: verifier-supplied struct metadata
2668  * @off: verifier-supplied offset of @node within the containing object
2669  *
2670  * Insert @node into @root using @less. The verifier rewrites @meta and @off;
2671  * BPF programs do not set them.
2672  *
2673  * Return: 0 on success, or %-EINVAL if @node is already linked in a tree.
2674  */
2675 __bpf_kfunc int bpf_rbtree_add(struct bpf_rb_root *root,
2676 			       struct bpf_rb_node *node,
2677 			       bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
2678 			       struct btf_struct_meta *meta,
2679 			       u64 off)
2680 {
2681 	struct bpf_rb_node_kern *n = (void *)node;
2682 
2683 	return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
2684 }
2685 
2686 __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
2687 				    bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
2688 				    void *meta__ign, u64 off)
2689 {
2690 	return bpf_rbtree_add(root, node, less, meta__ign, off);
2691 }
2692 
2693 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
2694 {
2695 	struct rb_root_cached *r = (struct rb_root_cached *)root;
2696 
2697 	return (struct bpf_rb_node *)rb_first_cached(r);
2698 }
2699 
2700 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root)
2701 {
2702 	struct rb_root_cached *r = (struct rb_root_cached *)root;
2703 
2704 	return (struct bpf_rb_node *)r->rb_root.rb_node;
2705 }
2706 
2707 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node)
2708 {
2709 	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2710 
2711 	if (READ_ONCE(node_internal->owner) != root)
2712 		return NULL;
2713 
2714 	return (struct bpf_rb_node *)node_internal->rb_node.rb_left;
2715 }
2716 
2717 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node)
2718 {
2719 	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2720 
2721 	if (READ_ONCE(node_internal->owner) != root)
2722 		return NULL;
2723 
2724 	return (struct bpf_rb_node *)node_internal->rb_node.rb_right;
2725 }
2726 
2727 /**
2728  * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
2729  * kfunc which is not stored in a map as a kptr, must be released by calling
2730  * bpf_task_release().
2731  * @p: The task on which a reference is being acquired.
2732  */
2733 __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
2734 {
2735 	if (refcount_inc_not_zero(&p->rcu_users))
2736 		return p;
2737 	return NULL;
2738 }
2739 
2740 /**
2741  * bpf_task_release - Release the reference acquired on a task.
2742  * @p: The task on which a reference is being released.
2743  */
2744 __bpf_kfunc void bpf_task_release(struct task_struct *p)
2745 {
2746 	put_task_struct_rcu_user(p);
2747 }
2748 
/* Untyped variant of bpf_task_release() usable as a kptr destructor */
__bpf_kfunc void bpf_task_release_dtor(void *p)
{
	put_task_struct_rcu_user(p);
}
CFI_NOSEAL(bpf_task_release_dtor);
2754 
2755 #ifdef CONFIG_CGROUPS
2756 /**
2757  * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
2758  * this kfunc which is not stored in a map as a kptr, must be released by
2759  * calling bpf_cgroup_release().
2760  * @cgrp: The cgroup on which a reference is being acquired.
2761  */
2762 __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
2763 {
2764 	return cgroup_tryget(cgrp) ? cgrp : NULL;
2765 }
2766 
2767 /**
2768  * bpf_cgroup_release - Release the reference acquired on a cgroup.
2769  * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
2770  * not be freed until the current grace period has ended, even if its refcount
2771  * drops to 0.
2772  * @cgrp: The cgroup on which a reference is being released.
2773  */
2774 __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
2775 {
2776 	cgroup_put(cgrp);
2777 }
2778 
/* Untyped variant of bpf_cgroup_release() usable as a kptr destructor */
__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
{
	cgroup_put(cgrp);
}
CFI_NOSEAL(bpf_cgroup_release_dtor);
2784 
2785 /**
2786  * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
2787  * array. A cgroup returned by this kfunc which is not subsequently stored in a
2788  * map, must be released by calling bpf_cgroup_release().
2789  * @cgrp: The cgroup for which we're performing a lookup.
2790  * @level: The level of ancestor to look up.
2791  */
2792 __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
2793 {
2794 	struct cgroup *ancestor;
2795 
2796 	if (level > cgrp->level || level < 0)
2797 		return NULL;
2798 
2799 	/* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
2800 	ancestor = cgrp->ancestors[level];
2801 	if (!cgroup_tryget(ancestor))
2802 		return NULL;
2803 	return ancestor;
2804 }
2805 
2806 /**
2807  * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
2808  * kfunc which is not subsequently stored in a map, must be released by calling
2809  * bpf_cgroup_release().
2810  * @cgid: cgroup id.
2811  */
2812 __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
2813 {
2814 	struct cgroup *cgrp;
2815 
2816 	cgrp = __cgroup_get_from_id(cgid);
2817 	if (IS_ERR(cgrp))
2818 		return NULL;
2819 	return cgrp;
2820 }
2821 
2822 /**
2823  * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test
2824  * task's membership of cgroup ancestry.
2825  * @task: the task to be tested
2826  * @ancestor: possible ancestor of @task's cgroup
2827  *
2828  * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
2829  * It follows all the same rules as cgroup_is_descendant, and only applies
2830  * to the default hierarchy.
2831  */
2832 __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
2833 				       struct cgroup *ancestor)
2834 {
2835 	long ret;
2836 
2837 	rcu_read_lock();
2838 	ret = task_under_cgroup_hierarchy(task, ancestor);
2839 	rcu_read_unlock();
2840 	return ret;
2841 }
2842 
2843 BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
2844 {
2845 	struct bpf_array *array = container_of(map, struct bpf_array, map);
2846 	struct cgroup *cgrp;
2847 
2848 	if (unlikely(idx >= array->map.max_entries))
2849 		return -E2BIG;
2850 
2851 	cgrp = READ_ONCE(array->ptrs[idx]);
2852 	if (unlikely(!cgrp))
2853 		return -EAGAIN;
2854 
2855 	return task_under_cgroup_hierarchy(current, cgrp);
2856 }
2857 
/* Proto for bpf_current_task_under_cgroup(): takes a cgroup array map and
 * an index into it, returns an integer (membership result or -errno).
 */
const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
	.func           = bpf_current_task_under_cgroup,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_CONST_MAP_PTR,
	.arg2_type      = ARG_ANYTHING,
};
2865 
2866 /**
2867  * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
2868  * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
2869  * hierarchy ID.
2870  * @task: The target task
2871  * @hierarchy_id: The ID of a cgroup1 hierarchy
2872  *
2873  * On success, the cgroup is returen. On failure, NULL is returned.
2874  */
2875 __bpf_kfunc struct cgroup *
2876 bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
2877 {
2878 	struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
2879 
2880 	if (IS_ERR(cgrp))
2881 		return NULL;
2882 	return cgrp;
2883 }
2884 #endif /* CONFIG_CGROUPS */
2885 
2886 /**
2887  * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
2888  * in the root pid namespace idr. If a task is returned, it must either be
2889  * stored in a map, or released with bpf_task_release().
2890  * @pid: The pid of the task being looked up.
2891  */
2892 __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
2893 {
2894 	struct task_struct *p;
2895 
2896 	rcu_read_lock();
2897 	p = find_task_by_pid_ns(pid, &init_pid_ns);
2898 	if (p)
2899 		p = bpf_task_acquire(p);
2900 	rcu_read_unlock();
2901 
2902 	return p;
2903 }
2904 
2905 /**
2906  * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up
2907  * in the pid namespace of the current task. If a task is returned, it must
2908  * either be stored in a map, or released with bpf_task_release().
2909  * @vpid: The vpid of the task being looked up.
2910  */
2911 __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
2912 {
2913 	struct task_struct *p;
2914 
2915 	rcu_read_lock();
2916 	p = find_task_by_vpid(vpid);
2917 	if (p)
2918 		p = bpf_task_acquire(p);
2919 	rcu_read_unlock();
2920 
2921 	return p;
2922 }
2923 
2924 /**
2925  * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
2926  * @p: The dynptr whose data slice to retrieve
2927  * @offset: Offset into the dynptr
2928  * @buffer__nullable: User-provided buffer to copy contents into.  May be NULL
2929  * @buffer__szk: Size (in bytes) of the buffer if present. This is the
2930  *               length of the requested slice. This must be a constant.
2931  *
2932  * For non-skb and non-xdp type dynptrs, there is no difference between
2933  * bpf_dynptr_slice and bpf_dynptr_data.
2934  *
2935  *  If buffer__nullable is NULL, the call will fail if buffer_opt was needed.
2936  *
2937  * If the intention is to write to the data slice, please use
2938  * bpf_dynptr_slice_rdwr.
2939  *
2940  * The user must check that the returned pointer is not null before using it.
2941  *
2942  * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
2943  * does not change the underlying packet data pointers, so a call to
2944  * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
2945  * the bpf program.
2946  *
2947  * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only
2948  * data slice (can be either direct pointer to the data or a pointer to the user
2949  * provided buffer, with its contents containing the data, if unable to obtain
2950  * direct pointer)
2951  */
2952 __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
2953 				   void *buffer__nullable, u64 buffer__szk)
2954 {
2955 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
2956 	enum bpf_dynptr_type type;
2957 	u64 len = buffer__szk;
2958 	int err;
2959 
2960 	if (!ptr->data)
2961 		return NULL;
2962 
2963 	err = bpf_dynptr_check_off_len(ptr, offset, len);
2964 	if (err)
2965 		return NULL;
2966 
2967 	type = bpf_dynptr_get_type(ptr);
2968 
2969 	switch (type) {
2970 	case BPF_DYNPTR_TYPE_LOCAL:
2971 	case BPF_DYNPTR_TYPE_RINGBUF:
2972 		return ptr->data + ptr->offset + offset;
2973 	case BPF_DYNPTR_TYPE_SKB:
2974 		if (buffer__nullable)
2975 			return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable);
2976 		else
2977 			return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
2978 	case BPF_DYNPTR_TYPE_XDP:
2979 	{
2980 		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
2981 		if (!IS_ERR_OR_NULL(xdp_ptr))
2982 			return xdp_ptr;
2983 
2984 		if (!buffer__nullable)
2985 			return NULL;
2986 		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false);
2987 		return buffer__nullable;
2988 	}
2989 	case BPF_DYNPTR_TYPE_SKB_META:
2990 		return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
2991 	case BPF_DYNPTR_TYPE_FILE:
2992 		err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk);
2993 		return err ? NULL : buffer__nullable;
2994 	default:
2995 		WARN_ONCE(true, "unknown dynptr type %d\n", type);
2996 		return NULL;
2997 	}
2998 }
2999 
3000 /**
3001  * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
3002  * @p: The dynptr whose data slice to retrieve
3003  * @offset: Offset into the dynptr
3004  * @buffer__nullable: User-provided buffer to copy contents into. May be NULL
3005  * @buffer__szk: Size (in bytes) of the buffer if present. This is the
3006  *               length of the requested slice. This must be a constant.
3007  *
3008  * For non-skb and non-xdp type dynptrs, there is no difference between
3009  * bpf_dynptr_slice and bpf_dynptr_data.
3010  *
3011  * If buffer__nullable is NULL, the call will fail if buffer_opt was needed.
3012  *
3013  * The returned pointer is writable and may point to either directly the dynptr
3014  * data at the requested offset or to the buffer if unable to obtain a direct
3015  * data pointer to (example: the requested slice is to the paged area of an skb
3016  * packet). In the case where the returned pointer is to the buffer, the user
3017  * is responsible for persisting writes through calling bpf_dynptr_write(). This
3018  * usually looks something like this pattern:
3019  *
3020  * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
3021  * if (!eth)
3022  *	return TC_ACT_SHOT;
3023  *
3024  * // mutate eth header //
3025  *
3026  * if (eth == buffer)
3027  *	bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
3028  *
3029  * Please note that, as in the example above, the user must check that the
3030  * returned pointer is not null before using it.
3031  *
3032  * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
3033  * does not change the underlying packet data pointers, so a call to
3034  * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
3035  * the bpf program.
3036  *
3037  * Return: NULL if the call failed (eg invalid dynptr), pointer to a
3038  * data slice (can be either direct pointer to the data or a pointer to the user
3039  * provided buffer, with its contents containing the data, if unable to obtain
3040  * direct pointer)
3041  */
3042 __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
3043 					void *buffer__nullable, u64 buffer__szk)
3044 {
3045 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3046 
3047 	if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
3048 		return NULL;
3049 
3050 	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
3051 	 *
3052 	 * For skb-type dynptrs, it is safe to write into the returned pointer
3053 	 * if the bpf program allows skb data writes. There are two possibilities
3054 	 * that may occur when calling bpf_dynptr_slice_rdwr:
3055 	 *
3056 	 * 1) The requested slice is in the head of the skb. In this case, the
3057 	 * returned pointer is directly to skb data, and if the skb is cloned, the
3058 	 * verifier will have uncloned it (see bpf_unclone_prologue()) already.
3059 	 * The pointer can be directly written into.
3060 	 *
3061 	 * 2) Some portion of the requested slice is in the paged buffer area.
3062 	 * In this case, the requested data will be copied out into the buffer
3063 	 * and the returned pointer will be a pointer to the buffer. The skb
3064 	 * will not be pulled. To persist the write, the user will need to call
3065 	 * bpf_dynptr_write(), which will pull the skb and commit the write.
3066 	 *
3067 	 * Similarly for xdp programs, if the requested slice is not across xdp
3068 	 * fragments, then a direct pointer will be returned, otherwise the data
3069 	 * will be copied out into the buffer and the user will need to call
3070 	 * bpf_dynptr_write() to commit changes.
3071 	 */
3072 	return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk);
3073 }
3074 
3075 __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
3076 {
3077 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3078 	u64 size;
3079 
3080 	if (!ptr->data || start > end)
3081 		return -EINVAL;
3082 
3083 	size = __bpf_dynptr_size(ptr);
3084 
3085 	if (start > size || end > size)
3086 		return -ERANGE;
3087 
3088 	bpf_dynptr_advance_offset(ptr, start);
3089 	bpf_dynptr_set_size(ptr, end - start);
3090 
3091 	return 0;
3092 }
3093 
3094 __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
3095 {
3096 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3097 
3098 	return !ptr->data;
3099 }
3100 
3101 __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
3102 {
3103 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3104 
3105 	if (!ptr->data)
3106 		return false;
3107 
3108 	return __bpf_dynptr_is_rdonly(ptr);
3109 }
3110 
/* Return the number of usable bytes in dynptr @p.
 *
 * NOTE(review): the return type is u64, so the -EINVAL for a null dynptr
 * comes back as a huge positive value; presumably callers treat it as an
 * error sentinel — confirm against the kfunc's users.
 */
__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
{
	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;

	if (!ptr->data)
		return -EINVAL;

	return __bpf_dynptr_size(ptr);
}
3120 
3121 __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
3122 				 struct bpf_dynptr *clone__uninit)
3123 {
3124 	struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
3125 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3126 
3127 	if (!ptr->data) {
3128 		bpf_dynptr_set_null(clone);
3129 		return -EINVAL;
3130 	}
3131 
3132 	*clone = *ptr;
3133 
3134 	return 0;
3135 }
3136 
3137 /**
3138  * bpf_dynptr_copy() - Copy data from one dynptr to another.
3139  * @dst_ptr: Destination dynptr - where data should be copied to
3140  * @dst_off: Offset into the destination dynptr
3141  * @src_ptr: Source dynptr - where data should be copied from
3142  * @src_off: Offset into the source dynptr
3143  * @size: Length of the data to copy from source to destination
3144  *
3145  * Copies data from source dynptr to destination dynptr.
3146  * Returns 0 on success; negative error, otherwise.
3147  */
3148 __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
3149 				struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
3150 {
3151 	struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
3152 	struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
3153 	void *src_slice, *dst_slice;
3154 	char buf[256];
3155 	u64 off;
3156 
3157 	src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
3158 	dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
3159 
3160 	if (src_slice && dst_slice) {
3161 		memmove(dst_slice, src_slice, size);
3162 		return 0;
3163 	}
3164 
3165 	if (src_slice)
3166 		return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0);
3167 
3168 	if (dst_slice)
3169 		return __bpf_dynptr_read(dst_slice, size, src, src_off, 0);
3170 
3171 	if (bpf_dynptr_check_off_len(dst, dst_off, size) ||
3172 	    bpf_dynptr_check_off_len(src, src_off, size))
3173 		return -E2BIG;
3174 
3175 	off = 0;
3176 	while (off < size) {
3177 		u64 chunk_sz = min_t(u64, sizeof(buf), size - off);
3178 		int err;
3179 
3180 		err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
3181 		if (err)
3182 			return err;
3183 		err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0);
3184 		if (err)
3185 			return err;
3186 
3187 		off += chunk_sz;
3188 	}
3189 	return 0;
3190 }
3191 
3192 /**
3193  * bpf_dynptr_memset() - Fill dynptr memory with a constant byte.
3194  * @p: Destination dynptr - where data will be filled
3195  * @offset: Offset into the dynptr to start filling from
3196  * @size: Number of bytes to fill
3197  * @val: Constant byte to fill the memory with
3198  *
3199  * Fills the @size bytes of the memory area pointed to by @p
3200  * at @offset with the constant byte @val.
3201  * Returns 0 on success; negative error, otherwise.
3202  */
3203 __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
3204 {
3205 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3206 	u64 chunk_sz, write_off;
3207 	char buf[256];
3208 	void* slice;
3209 	int err;
3210 
3211 	slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size);
3212 	if (likely(slice)) {
3213 		memset(slice, val, size);
3214 		return 0;
3215 	}
3216 
3217 	if (__bpf_dynptr_is_rdonly(ptr))
3218 		return -EINVAL;
3219 
3220 	err = bpf_dynptr_check_off_len(ptr, offset, size);
3221 	if (err)
3222 		return err;
3223 
3224 	/* Non-linear data under the dynptr, write from a local buffer */
3225 	chunk_sz = min_t(u64, sizeof(buf), size);
3226 	memset(buf, val, chunk_sz);
3227 
3228 	for (write_off = 0; write_off < size; write_off += chunk_sz) {
3229 		chunk_sz = min_t(u64, sizeof(buf), size - write_off);
3230 		err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
3231 		if (err)
3232 			return err;
3233 	}
3234 
3235 	return 0;
3236 }
3237 
/* Runtime identity: the pointer is returned unchanged. Any retyping of the
 * context pointer presumably happens at verification time — this body only
 * exists so the kfunc has an address.
 */
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
	return obj;
}
3242 
/* Runtime identity cast that only drops the const qualifier; @btf_id__k is
 * unused here and presumably consumed by the verifier to retype the result.
 */
__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k)
{
	return (void *)obj__ign;
}
3247 
/* Enter an RCU read-side critical section on behalf of a BPF program. */
__bpf_kfunc void bpf_rcu_read_lock(void)
{
	rcu_read_lock();
}
3252 
/* Leave the RCU read-side critical section opened by bpf_rcu_read_lock(). */
__bpf_kfunc void bpf_rcu_read_unlock(void)
{
	rcu_read_unlock();
}
3257 
/* State accumulated by bpf_stack_walker() while bpf_throw() scans the stack */
struct bpf_throw_ctx {
	struct bpf_prog_aux *aux;	/* aux of the main (non-subprog) BPF prog frame */
	u64 sp;				/* stack pointer of that frame */
	u64 bp;				/* frame pointer of that frame */
	int cnt;			/* number of BPF prog frames seen so far */
};
3264 
/* Callback for arch_bpf_stack_walk() used by bpf_throw(): locate the main
 * (non-subprog) BPF program frame and record its aux, sp and bp in the
 * bpf_throw_ctx cookie. NOTE(review): presumably a true return continues
 * the walk and false stops it — confirm against arch_bpf_stack_walk().
 */
static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
{
	struct bpf_throw_ctx *ctx = cookie;
	struct bpf_prog *prog;

	/*
	 * The RCU read lock is held to safely traverse the latch tree, but we
	 * don't need its protection when accessing the prog, since it has an
	 * active stack frame on the current stack trace, and won't disappear.
	 */
	rcu_read_lock();
	prog = bpf_prog_ksym_find(ip);
	rcu_read_unlock();
	/* Non-BPF frame: keep walking only while no BPF frame was seen yet */
	if (!prog)
		return !ctx->cnt;
	ctx->cnt++;
	/* Skip subprog frames; we want the enclosing main program */
	if (bpf_is_subprog(prog))
		return true;
	ctx->aux = prog->aux;
	ctx->sp = sp;
	ctx->bp = bp;
	return false;
}
3288 
/* Throw a BPF exception: walk the stack to find the main program frame and
 * transfer control to its exception callback with @cookie. Does not return.
 */
__bpf_kfunc void bpf_throw(u64 cookie)
{
	struct bpf_throw_ctx ctx = {};

	arch_bpf_stack_walk(bpf_stack_walker, &ctx);
	/* The walker must have found a main BPF prog frame with an
	 * exception boundary; anything else indicates a bug.
	 */
	WARN_ON_ONCE(!ctx.aux);
	if (ctx.aux)
		WARN_ON_ONCE(!ctx.aux->exception_boundary);
	WARN_ON_ONCE(!ctx.bp);
	WARN_ON_ONCE(!ctx.cnt);
	/* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
	 * deeper stack depths than ctx.sp as we do not return from bpf_throw,
	 * which skips compiler generated instrumentation to do the same.
	 */
	kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
	WARN(1, "A call to BPF exception callback should never return\n");
}
3307 
3308 __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
3309 {
3310 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
3311 	struct bpf_map *map = p__map;
3312 
3313 	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
3314 	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
3315 
3316 	if (flags)
3317 		return -EINVAL;
3318 
3319 	return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
3320 }
3321 
/* Queue the work attached to @wq for execution. No flags are supported. */
__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
{
	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
	struct bpf_work *w;

	if (flags)
		return -EINVAL;

	/* The wq must be initialized and have a callback program attached */
	w = READ_ONCE(async->work);
	if (!w || !READ_ONCE(w->cb.prog))
		return -EINVAL;

	/* Pin the work for the duration of the scheduling operation; a zero
	 * refcount means it is being torn down concurrently.
	 */
	if (!refcount_inc_not_zero(&w->cb.refcnt))
		return -ENOENT;

	if (!defer_timer_wq_op()) {
		schedule_work(&w->work);
		bpf_async_refcount_put(&w->cb);
		return 0;
	} else {
		/* Presumably the current context cannot queue directly (see
		 * defer_timer_wq_op()); hand the start off to be performed
		 * later. The reference is then owned by the deferred op.
		 */
		return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0);
	}
}
3345 
3346 __bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq,
3347 				    int (callback_fn)(void *map, int *key, void *value),
3348 				    unsigned int flags,
3349 				    struct bpf_prog_aux *aux)
3350 {
3351 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
3352 
3353 	if (flags)
3354 		return -EINVAL;
3355 
3356 	return __bpf_async_set_callback(async, callback_fn, aux->prog);
3357 }
3358 
/* Disable preemption on behalf of a BPF program. */
__bpf_kfunc void bpf_preempt_disable(void)
{
	preempt_disable();
}
3363 
/* Re-enable preemption disabled by bpf_preempt_disable(). */
__bpf_kfunc void bpf_preempt_enable(void)
{
	preempt_enable();
}
3368 
/* BPF-facing bits iterator: opaque 16-byte, 8-byte-aligned storage */
struct bpf_iter_bits {
	__u64 __opaque[2];
} __aligned(8);

/* Upper bound on the nr_words argument of bpf_iter_bits_new() */
#define BITS_ITER_NR_WORDS_MAX 511

/* Kernel-side view of struct bpf_iter_bits; size and alignment equality is
 * asserted with BUILD_BUG_ON in bpf_iter_bits_new().
 */
struct bpf_iter_bits_kern {
	union {
		__u64 *bits;		/* heap copy when nr_bits > 64 */
		__u64 bits_copy;	/* inline copy when nr_bits <= 64 */
	};
	int nr_bits;			/* total bits to iterate; 0 = empty */
	int bit;			/* last bit returned, -1 before first next() */
} __aligned(8);
3383 
3384 /* On 64-bit hosts, unsigned long and u64 have the same size, so passing
3385  * a u64 pointer and an unsigned long pointer to find_next_bit() will
3386  * return the same result, as both point to the same 8-byte area.
3387  *
3388  * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
3389  * pointer also makes no difference. This is because the first iterated
3390  * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
3391  * long is composed of bits 32-63 of the u64.
3392  *
3393  * However, for 32-bit big-endian hosts, this is not the case. The first
3394  * iterated unsigned long will be bits 32-63 of the u64, so swap these two
3395  * ulong values within the u64.
3396  */
3397 static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
3398 {
3399 #if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
3400 	unsigned int i;
3401 
3402 	for (i = 0; i < nr; i++)
3403 		bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
3404 #endif
3405 }
3406 
3407 /**
3408  * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
3409  * @it: The new bpf_iter_bits to be created
3410  * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
3411  * @nr_words: The size of the specified memory area, measured in 8-byte units.
3412  * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
3413  * further reduced by the BPF memory allocator implementation.
3414  *
3415  * This function initializes a new bpf_iter_bits structure for iterating over
3416  * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
3417  * copies the data of the memory area to the newly created bpf_iter_bits @it for
3418  * subsequent iteration operations.
3419  *
3420  * On success, 0 is returned. On failure, ERR is returned.
3421  */
3422 __bpf_kfunc int
3423 bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
3424 {
3425 	struct bpf_iter_bits_kern *kit = (void *)it;
3426 	u32 nr_bytes = nr_words * sizeof(u64);
3427 	u32 nr_bits = BYTES_TO_BITS(nr_bytes);
3428 	int err;
3429 
3430 	BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits));
3431 	BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) !=
3432 		     __alignof__(struct bpf_iter_bits));
3433 
3434 	kit->nr_bits = 0;
3435 	kit->bits_copy = 0;
3436 	kit->bit = -1;
3437 
3438 	if (!unsafe_ptr__ign || !nr_words)
3439 		return -EINVAL;
3440 	if (nr_words > BITS_ITER_NR_WORDS_MAX)
3441 		return -E2BIG;
3442 
3443 	/* Optimization for u64 mask */
3444 	if (nr_bits == 64) {
3445 		err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
3446 		if (err)
3447 			return -EFAULT;
3448 
3449 		swap_ulong_in_u64(&kit->bits_copy, nr_words);
3450 
3451 		kit->nr_bits = nr_bits;
3452 		return 0;
3453 	}
3454 
3455 	if (bpf_mem_alloc_check_size(false, nr_bytes))
3456 		return -E2BIG;
3457 
3458 	/* Fallback to memalloc */
3459 	kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
3460 	if (!kit->bits)
3461 		return -ENOMEM;
3462 
3463 	err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign);
3464 	if (err) {
3465 		bpf_mem_free(&bpf_global_ma, kit->bits);
3466 		return err;
3467 	}
3468 
3469 	swap_ulong_in_u64(kit->bits, nr_words);
3470 
3471 	kit->nr_bits = nr_bits;
3472 	return 0;
3473 }
3474 
3475 /**
3476  * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits
3477  * @it: The bpf_iter_bits to be checked
3478  *
3479  * This function returns a pointer to a number representing the value of the
3480  * next bit in the bits.
3481  *
3482  * If there are no further bits available, it returns NULL.
3483  */
3484 __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
3485 {
3486 	struct bpf_iter_bits_kern *kit = (void *)it;
3487 	int bit = kit->bit, nr_bits = kit->nr_bits;
3488 	const void *bits;
3489 
3490 	if (!nr_bits || bit >= nr_bits)
3491 		return NULL;
3492 
3493 	bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
3494 	bit = find_next_bit(bits, nr_bits, bit + 1);
3495 	if (bit >= nr_bits) {
3496 		kit->bit = bit;
3497 		return NULL;
3498 	}
3499 
3500 	kit->bit = bit;
3501 	return &kit->bit;
3502 }
3503 
3504 /**
3505  * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits
3506  * @it: The bpf_iter_bits to be destroyed
3507  *
3508  * Destroy the resource associated with the bpf_iter_bits.
3509  */
3510 __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
3511 {
3512 	struct bpf_iter_bits_kern *kit = (void *)it;
3513 
3514 	if (kit->nr_bits <= 64)
3515 		return;
3516 	bpf_mem_free(&bpf_global_ma, kit->bits);
3517 }
3518 
3519 /**
3520  * bpf_copy_from_user_str() - Copy a string from an unsafe user address
3521  * @dst:             Destination address, in kernel space.  This buffer must be
3522  *                   at least @dst__sz bytes long.
3523  * @dst__sz:         Maximum number of bytes to copy, includes the trailing NUL.
3524  * @unsafe_ptr__ign: Source address, in user space.
3525  * @flags:           The only supported flag is BPF_F_PAD_ZEROS
3526  *
3527  * Copies a NUL-terminated string from userspace to BPF space. If user string is
3528  * too long this will still ensure zero termination in the dst buffer unless
3529  * buffer size is 0.
3530  *
3531  * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and
3532  * memset all of @dst on failure.
3533  */
3534 __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags)
3535 {
3536 	int ret;
3537 
3538 	if (unlikely(flags & ~BPF_F_PAD_ZEROS))
3539 		return -EINVAL;
3540 
3541 	if (unlikely(!dst__sz))
3542 		return 0;
3543 
3544 	ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1);
3545 	if (ret < 0) {
3546 		if (flags & BPF_F_PAD_ZEROS)
3547 			memset((char *)dst, 0, dst__sz);
3548 
3549 		return ret;
3550 	}
3551 
3552 	if (flags & BPF_F_PAD_ZEROS)
3553 		memset((char *)dst + ret, 0, dst__sz - ret);
3554 	else
3555 		((char *)dst)[ret] = '\0';
3556 
3557 	return ret + 1;
3558 }
3559 
3560 /**
3561  * bpf_copy_from_user_task_str() - Copy a string from an task's address space
3562  * @dst:             Destination address, in kernel space.  This buffer must be
3563  *                   at least @dst__sz bytes long.
3564  * @dst__sz:         Maximum number of bytes to copy, includes the trailing NUL.
3565  * @unsafe_ptr__ign: Source address in the task's address space.
3566  * @tsk:             The task whose address space will be used
3567  * @flags:           The only supported flag is BPF_F_PAD_ZEROS
3568  *
3569  * Copies a NUL terminated string from a task's address space to @dst__sz
3570  * buffer. If user string is too long this will still ensure zero termination
3571  * in the @dst__sz buffer unless buffer size is 0.
3572  *
3573  * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success
3574  * and memset all of @dst__sz on failure.
3575  *
3576  * Return: The number of copied bytes on success including the NUL terminator.
3577  * A negative error code on failure.
3578  */
3579 __bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz,
3580 					    const void __user *unsafe_ptr__ign,
3581 					    struct task_struct *tsk, u64 flags)
3582 {
3583 	int ret;
3584 
3585 	if (unlikely(flags & ~BPF_F_PAD_ZEROS))
3586 		return -EINVAL;
3587 
3588 	if (unlikely(dst__sz == 0))
3589 		return 0;
3590 
3591 	ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0);
3592 	if (ret < 0) {
3593 		if (flags & BPF_F_PAD_ZEROS)
3594 			memset(dst, 0, dst__sz);
3595 		return ret;
3596 	}
3597 
3598 	if (flags & BPF_F_PAD_ZEROS)
3599 		memset(dst + ret, 0, dst__sz - ret);
3600 
3601 	return ret + 1;
3602 }
3603 
/* Keep unsigned long in prototype so that kfunc is usable when emitted to
 * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
 * unsigned long always points to 8-byte region on stack, the kernel may only
 * read and write the 4-bytes on 32-bit.
 */
__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
{
	local_irq_save(*flags__irq_flag);
}
3613 
/* Restore the IRQ state previously saved by bpf_local_irq_save(). */
__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
{
	local_irq_restore(*flags__irq_flag);
}
3618 
/* Intentionally empty: a trap target the verifier/JIT can emit calls to. */
__bpf_kfunc void __bpf_trap(void)
{
}
3622 
3623 /*
3624  * Kfuncs for string operations.
3625  *
3626  * Since strings are not necessarily %NUL-terminated, we cannot directly call
3627  * in-kernel implementations. Instead, we open-code the implementations using
3628  * __get_kernel_nofault instead of plain dereference to make them safe.
3629  */
3630 
/* Common implementation of bpf_strcmp()/bpf_strcasecmp()/bpf_strncasecmp():
 * compare up to @len characters of @s1 and @s2, optionally ignoring case.
 * Returns -1/0/1 like strcmp(), -ERANGE when a pointer is outside the kernel
 * address space, -EFAULT on a faulting read, and -E2BIG when the comparison
 * reaches the XATTR_SIZE_MAX cap without a decision.
 */
static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len)
{
	char c1, c2;
	int i;

	if (!copy_from_kernel_nofault_allowed(s1, 1) ||
	    !copy_from_kernel_nofault_allowed(s2, 1)) {
		return -ERANGE;
	}

	/* The pagefault guard covers every __get_kernel_nofault() below;
	 * a faulting access jumps to err_out.
	 */
	guard(pagefault)();
	for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) {
		__get_kernel_nofault(&c1, s1, char, err_out);
		__get_kernel_nofault(&c2, s2, char, err_out);
		if (ignore_case) {
			c1 = tolower(c1);
			c2 = tolower(c2);
		}
		if (c1 != c2)
			return c1 < c2 ? -1 : 1;
		if (c1 == '\0')
			return 0;
		s1++;
		s2++;
	}
	/* Loop exhausted: equal within @len, unless the global cap was hit */
	return i == XATTR_SIZE_MAX ? -E2BIG : 0;
err_out:
	return -EFAULT;
}
3660 
3661 /**
3662  * bpf_strcmp - Compare two strings
3663  * @s1__ign: One string
3664  * @s2__ign: Another string
3665  *
3666  * Return:
3667  * * %0       - Strings are equal
3668  * * %-1      - @s1__ign is smaller
3669  * * %1       - @s2__ign is smaller
3670  * * %-EFAULT - Cannot read one of the strings
3671  * * %-E2BIG  - One of strings is too large
3672  * * %-ERANGE - One of strings is outside of kernel address space
3673  */
3674 __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
3675 {
3676 	return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX);
3677 }
3678 
3679 /**
3680  * bpf_strcasecmp - Compare two strings, ignoring the case of the characters
3681  * @s1__ign: One string
3682  * @s2__ign: Another string
3683  *
3684  * Return:
3685  * * %0       - Strings are equal
3686  * * %-1      - @s1__ign is smaller
3687  * * %1       - @s2__ign is smaller
3688  * * %-EFAULT - Cannot read one of the strings
3689  * * %-E2BIG  - One of strings is too large
3690  * * %-ERANGE - One of strings is outside of kernel address space
3691  */
3692 __bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
3693 {
3694 	return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX);
3695 }
3696 
3697 /*
3698  * bpf_strncasecmp - Compare two length-limited strings, ignoring case
3699  * @s1__ign: One string
3700  * @s2__ign: Another string
3701  * @len: The maximum number of characters to compare
3702  *
3703  * Return:
3704  * * %0       - Strings are equal
3705  * * %-1      - @s1__ign is smaller
3706  * * %1       - @s2__ign is smaller
3707  * * %-EFAULT - Cannot read one of the strings
3708  * * %-E2BIG  - One of strings is too large
3709  * * %-ERANGE - One of strings is outside of kernel address space
3710  */
3711 __bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len)
3712 {
3713 	return __bpf_strncasecmp(s1__ign, s2__ign, true, len);
3714 }
3715 
3716 /**
3717  * bpf_strnchr - Find a character in a length limited string
3718  * @s__ign: The string to be searched
3719  * @count: The number of characters to be searched
3720  * @c: The character to search for
3721  *
3722  * Note that the %NUL-terminator is considered part of the string, and can
3723  * be searched for.
3724  *
3725  * Return:
3726  * * >=0      - Index of the first occurrence of @c within @s__ign
3727  * * %-ENOENT - @c not found in the first @count characters of @s__ign
3728  * * %-EFAULT - Cannot read @s__ign
3729  * * %-E2BIG  - @s__ign is too large
3730  * * %-ERANGE - @s__ign is outside of kernel address space
3731  */
3732 __bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c)
3733 {
3734 	char sc;
3735 	int i;
3736 
3737 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3738 		return -ERANGE;
3739 
3740 	guard(pagefault)();
3741 	for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
3742 		__get_kernel_nofault(&sc, s__ign, char, err_out);
3743 		if (sc == c)
3744 			return i;
3745 		if (sc == '\0')
3746 			return -ENOENT;
3747 		s__ign++;
3748 	}
3749 	return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT;
3750 err_out:
3751 	return -EFAULT;
3752 }
3753 
3754 /**
3755  * bpf_strchr - Find the first occurrence of a character in a string
3756  * @s__ign: The string to be searched
3757  * @c: The character to search for
3758  *
3759  * Note that the %NUL-terminator is considered part of the string, and can
3760  * be searched for.
3761  *
3762  * Return:
3763  * * >=0      - The index of the first occurrence of @c within @s__ign
3764  * * %-ENOENT - @c not found in @s__ign
3765  * * %-EFAULT - Cannot read @s__ign
3766  * * %-E2BIG  - @s__ign is too large
3767  * * %-ERANGE - @s__ign is outside of kernel address space
3768  */
3769 __bpf_kfunc int bpf_strchr(const char *s__ign, char c)
3770 {
3771 	return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c);
3772 }
3773 
3774 /**
3775  * bpf_strchrnul - Find and return a character in a string, or end of string
3776  * @s__ign: The string to be searched
3777  * @c: The character to search for
3778  *
3779  * Return:
3780  * * >=0      - Index of the first occurrence of @c within @s__ign or index of
3781  *              the null byte at the end of @s__ign when @c is not found
3782  * * %-EFAULT - Cannot read @s__ign
3783  * * %-E2BIG  - @s__ign is too large
3784  * * %-ERANGE - @s__ign is outside of kernel address space
3785  */
3786 __bpf_kfunc int bpf_strchrnul(const char *s__ign, char c)
3787 {
3788 	char sc;
3789 	int i;
3790 
3791 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3792 		return -ERANGE;
3793 
3794 	guard(pagefault)();
3795 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
3796 		__get_kernel_nofault(&sc, s__ign, char, err_out);
3797 		if (sc == '\0' || sc == c)
3798 			return i;
3799 		s__ign++;
3800 	}
3801 	return -E2BIG;
3802 err_out:
3803 	return -EFAULT;
3804 }
3805 
3806 /**
3807  * bpf_strrchr - Find the last occurrence of a character in a string
3808  * @s__ign: The string to be searched
3809  * @c: The character to search for
3810  *
3811  * Return:
3812  * * >=0      - Index of the last occurrence of @c within @s__ign
3813  * * %-ENOENT - @c not found in @s__ign
3814  * * %-EFAULT - Cannot read @s__ign
3815  * * %-E2BIG  - @s__ign is too large
3816  * * %-ERANGE - @s__ign is outside of kernel address space
3817  */
3818 __bpf_kfunc int bpf_strrchr(const char *s__ign, int c)
3819 {
3820 	char sc;
3821 	int i, last = -ENOENT;
3822 
3823 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3824 		return -ERANGE;
3825 
3826 	guard(pagefault)();
3827 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
3828 		__get_kernel_nofault(&sc, s__ign, char, err_out);
3829 		if (sc == c)
3830 			last = i;
3831 		if (sc == '\0')
3832 			return last;
3833 		s__ign++;
3834 	}
3835 	return -E2BIG;
3836 err_out:
3837 	return -EFAULT;
3838 }
3839 
3840 /**
3841  * bpf_strnlen - Calculate the length of a length-limited string
3842  * @s__ign: The string
3843  * @count: The maximum number of characters to count
3844  *
3845  * Return:
3846  * * >=0      - The length of @s__ign
3847  * * %-EFAULT - Cannot read @s__ign
3848  * * %-E2BIG  - @s__ign is too large
3849  * * %-ERANGE - @s__ign is outside of kernel address space
3850  */
3851 __bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count)
3852 {
3853 	char c;
3854 	int i;
3855 
3856 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3857 		return -ERANGE;
3858 
3859 	guard(pagefault)();
3860 	for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
3861 		__get_kernel_nofault(&c, s__ign, char, err_out);
3862 		if (c == '\0')
3863 			return i;
3864 		s__ign++;
3865 	}
3866 	return i == XATTR_SIZE_MAX ? -E2BIG : i;
3867 err_out:
3868 	return -EFAULT;
3869 }
3870 
3871 /**
3872  * bpf_strlen - Calculate the length of a string
3873  * @s__ign: The string
3874  *
3875  * Return:
3876  * * >=0      - The length of @s__ign
3877  * * %-EFAULT - Cannot read @s__ign
3878  * * %-E2BIG  - @s__ign is too large
3879  * * %-ERANGE - @s__ign is outside of kernel address space
3880  */
3881 __bpf_kfunc int bpf_strlen(const char *s__ign)
3882 {
3883 	return bpf_strnlen(s__ign, XATTR_SIZE_MAX);
3884 }
3885 
3886 /**
3887  * bpf_strspn - Calculate the length of the initial substring of @s__ign which
3888  *              only contains letters in @accept__ign
3889  * @s__ign: The string to be searched
3890  * @accept__ign: The string to search for
3891  *
3892  * Return:
3893  * * >=0      - The length of the initial substring of @s__ign which only
3894  *              contains letters from @accept__ign
3895  * * %-EFAULT - Cannot read one of the strings
3896  * * %-E2BIG  - One of the strings is too large
3897  * * %-ERANGE - One of the strings is outside of kernel address space
3898  */
3899 __bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign)
3900 {
3901 	char cs, ca;
3902 	int i, j;
3903 
3904 	if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
3905 	    !copy_from_kernel_nofault_allowed(accept__ign, 1)) {
3906 		return -ERANGE;
3907 	}
3908 
3909 	guard(pagefault)();
3910 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
3911 		__get_kernel_nofault(&cs, s__ign, char, err_out);
3912 		if (cs == '\0')
3913 			return i;
3914 		for (j = 0; j < XATTR_SIZE_MAX; j++) {
3915 			__get_kernel_nofault(&ca, accept__ign + j, char, err_out);
3916 			if (cs == ca || ca == '\0')
3917 				break;
3918 		}
3919 		if (j == XATTR_SIZE_MAX)
3920 			return -E2BIG;
3921 		if (ca == '\0')
3922 			return i;
3923 		s__ign++;
3924 	}
3925 	return -E2BIG;
3926 err_out:
3927 	return -EFAULT;
3928 }
3929 
3930 /**
3931  * bpf_strcspn - Calculate the length of the initial substring of @s__ign which
3932  *               does not contain letters in @reject__ign
3933  * @s__ign: The string to be searched
3934  * @reject__ign: The string to search for
3935  *
3936  * Return:
3937  * * >=0      - The length of the initial substring of @s__ign which does not
3938  *              contain letters from @reject__ign
3939  * * %-EFAULT - Cannot read one of the strings
3940  * * %-E2BIG  - One of the strings is too large
3941  * * %-ERANGE - One of the strings is outside of kernel address space
3942  */
3943 __bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign)
3944 {
3945 	char cs, cr;
3946 	int i, j;
3947 
3948 	if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
3949 	    !copy_from_kernel_nofault_allowed(reject__ign, 1)) {
3950 		return -ERANGE;
3951 	}
3952 
3953 	guard(pagefault)();
3954 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
3955 		__get_kernel_nofault(&cs, s__ign, char, err_out);
3956 		if (cs == '\0')
3957 			return i;
3958 		for (j = 0; j < XATTR_SIZE_MAX; j++) {
3959 			__get_kernel_nofault(&cr, reject__ign + j, char, err_out);
3960 			if (cs == cr || cr == '\0')
3961 				break;
3962 		}
3963 		if (j == XATTR_SIZE_MAX)
3964 			return -E2BIG;
3965 		if (cr != '\0')
3966 			return i;
3967 		s__ign++;
3968 	}
3969 	return -E2BIG;
3970 err_out:
3971 	return -EFAULT;
3972 }
3973 
/* Common implementation of the bpf_str*str() kfuncs: find the first
 * occurrence of @s2 within the first @len characters of @s1, optionally
 * ignoring case. Returns the index of the match, -ENOENT when there is
 * none, -ERANGE when a pointer is outside the kernel address space,
 * -EFAULT on a faulting read, and -E2BIG when a string exceeds the
 * XATTR_SIZE_MAX cap.
 */
static int __bpf_strnstr(const char *s1, const char *s2, size_t len,
			 bool ignore_case)
{
	char c1, c2;
	int i, j;

	if (!copy_from_kernel_nofault_allowed(s1, 1) ||
	    !copy_from_kernel_nofault_allowed(s2, 1)) {
		return -ERANGE;
	}

	/* Faulting reads inside the pagefault guard jump to err_out */
	guard(pagefault)();
	for (i = 0; i < XATTR_SIZE_MAX; i++) {
		for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
			__get_kernel_nofault(&c2, s2 + j, char, err_out);
			/* All of s2 matched: s2 occurs at index i */
			if (c2 == '\0')
				return i;
			/*
			 * We allow reading an extra byte from s2 (note the
			 * `i + j <= len` above) to cover the case when s2 is
			 * a suffix of the first len chars of s1.
			 */
			if (i + j == len)
				break;
			__get_kernel_nofault(&c1, s1 + j, char, err_out);

			if (ignore_case) {
				c1 = tolower(c1);
				c2 = tolower(c2);
			}

			/* End of s1 before s2 was exhausted: no match possible */
			if (c1 == '\0')
				return -ENOENT;
			if (c1 != c2)
				break;
		}
		if (j == XATTR_SIZE_MAX)
			return -E2BIG;
		/* Search window exhausted without completing a match */
		if (i + j == len)
			return -ENOENT;
		s1++;
	}
	return -E2BIG;
err_out:
	return -EFAULT;
}
4020 
4021 /**
4022  * bpf_strstr - Find the first substring in a string
4023  * @s1__ign: The string to be searched
4024  * @s2__ign: The string to search for
4025  *
4026  * Return:
4027  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4028  *              within @s1__ign
4029  * * %-ENOENT - @s2__ign is not a substring of @s1__ign
4030  * * %-EFAULT - Cannot read one of the strings
4031  * * %-E2BIG  - One of the strings is too large
4032  * * %-ERANGE - One of the strings is outside of kernel address space
4033  */
4034 __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
4035 {
4036 	return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false);
4037 }
4038 
4039 /**
4040  * bpf_strcasestr - Find the first substring in a string, ignoring the case of
4041  *                  the characters
4042  * @s1__ign: The string to be searched
4043  * @s2__ign: The string to search for
4044  *
4045  * Return:
4046  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4047  *              within @s1__ign
4048  * * %-ENOENT - @s2__ign is not a substring of @s1__ign
4049  * * %-EFAULT - Cannot read one of the strings
4050  * * %-E2BIG  - One of the strings is too large
4051  * * %-ERANGE - One of the strings is outside of kernel address space
4052  */
4053 __bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign)
4054 {
4055 	return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true);
4056 }
4057 
4058 /**
4059  * bpf_strnstr - Find the first substring in a length-limited string
4060  * @s1__ign: The string to be searched
4061  * @s2__ign: The string to search for
4062  * @len: the maximum number of characters to search
4063  *
4064  * Return:
4065  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4066  *              within the first @len characters of @s1__ign
4067  * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
4068  * * %-EFAULT - Cannot read one of the strings
4069  * * %-E2BIG  - One of the strings is too large
4070  * * %-ERANGE - One of the strings is outside of kernel address space
4071  */
4072 __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign,
4073 			    size_t len)
4074 {
4075 	return __bpf_strnstr(s1__ign, s2__ign, len, false);
4076 }
4077 
4078 /**
4079  * bpf_strncasestr - Find the first substring in a length-limited string,
4080  *                   ignoring the case of the characters
4081  * @s1__ign: The string to be searched
4082  * @s2__ign: The string to search for
4083  * @len: the maximum number of characters to search
4084  *
4085  * Return:
4086  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4087  *              within the first @len characters of @s1__ign
4088  * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
4089  * * %-EFAULT - Cannot read one of the strings
4090  * * %-E2BIG  - One of the strings is too large
4091  * * %-ERANGE - One of the strings is outside of kernel address space
4092  */
4093 __bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
4094 				size_t len)
4095 {
4096 	return __bpf_strnstr(s1__ign, s2__ign, len, true);
4097 }
4098 
4099 #ifdef CONFIG_KEYS
4100 /**
4101  * bpf_lookup_user_key - lookup a key by its serial
4102  * @serial: key handle serial number
4103  * @flags: lookup-specific flags
4104  *
4105  * Search a key with a given *serial* and the provided *flags*.
4106  * If found, increment the reference count of the key by one, and
4107  * return it in the bpf_key structure.
4108  *
4109  * The bpf_key structure must be passed to bpf_key_put() when done
4110  * with it, so that the key reference count is decremented and the
4111  * bpf_key structure is freed.
4112  *
4113  * Permission checks are deferred to the time the key is used by
4114  * one of the available key-specific kfuncs.
4115  *
4116  * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
4117  * special keyring (e.g. session keyring), if it doesn't yet exist.
4118  * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
4119  * for the key construction, and to retrieve uninstantiated keys (keys
4120  * without data attached to them).
4121  *
4122  * Return: a bpf_key pointer with a valid key pointer if the key is found, a
4123  *         NULL pointer otherwise.
4124  */
4125 __bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
4126 {
4127 	key_ref_t key_ref;
4128 	struct bpf_key *bkey;
4129 
4130 	if (flags & ~KEY_LOOKUP_ALL)
4131 		return NULL;
4132 
4133 	/*
4134 	 * Permission check is deferred until the key is used, as the
4135 	 * intent of the caller is unknown here.
4136 	 */
4137 	key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
4138 	if (IS_ERR(key_ref))
4139 		return NULL;
4140 
4141 	bkey = kmalloc_obj(*bkey);
4142 	if (!bkey) {
4143 		key_put(key_ref_to_ptr(key_ref));
4144 		return NULL;
4145 	}
4146 
4147 	bkey->key = key_ref_to_ptr(key_ref);
4148 	bkey->has_ref = true;
4149 
4150 	return bkey;
4151 }
4152 
4153 /**
4154  * bpf_lookup_system_key - lookup a key by a system-defined ID
4155  * @id: key ID
4156  *
4157  * Obtain a bpf_key structure with a key pointer set to the passed key ID.
4158  * The key pointer is marked as invalid, to prevent bpf_key_put() from
4159  * attempting to decrement the key reference count on that pointer. The key
4160  * pointer set in such way is currently understood only by
4161  * verify_pkcs7_signature().
4162  *
4163  * Set *id* to one of the values defined in include/linux/verification.h:
4164  * 0 for the primary keyring (immutable keyring of system keys);
4165  * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
4166  * (where keys can be added only if they are vouched for by existing keys
4167  * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
4168  * keyring (primarily used by the integrity subsystem to verify a kexec'ed
4169  * kerned image and, possibly, the initramfs signature).
4170  *
4171  * Return: a bpf_key pointer with an invalid key pointer set from the
4172  *         pre-determined ID on success, a NULL pointer otherwise
4173  */
4174 __bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
4175 {
4176 	struct bpf_key *bkey;
4177 
4178 	if (system_keyring_id_check(id) < 0)
4179 		return NULL;
4180 
4181 	bkey = kmalloc_obj(*bkey, GFP_ATOMIC);
4182 	if (!bkey)
4183 		return NULL;
4184 
4185 	bkey->key = (struct key *)(unsigned long)id;
4186 	bkey->has_ref = false;
4187 
4188 	return bkey;
4189 }
4190 
4191 /**
4192  * bpf_key_put - decrement key reference count if key is valid and free bpf_key
4193  * @bkey: bpf_key structure
4194  *
4195  * Decrement the reference count of the key inside *bkey*, if the pointer
4196  * is valid, and free *bkey*.
4197  */
4198 __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
4199 {
4200 	if (bkey->has_ref)
4201 		key_put(bkey->key);
4202 
4203 	kfree(bkey);
4204 }
4205 
4206 /**
4207  * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
4208  * @data_p: data to verify
4209  * @sig_p: signature of the data
4210  * @trusted_keyring: keyring with keys trusted for signature verification
4211  *
4212  * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
4213  * with keys in a keyring referenced by *trusted_keyring*.
4214  *
4215  * Return: 0 on success, a negative value on error.
4216  */
4217 __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
4218 			       struct bpf_dynptr *sig_p,
4219 			       struct bpf_key *trusted_keyring)
4220 {
4221 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
4222 	struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
4223 	struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
4224 	const void *data, *sig;
4225 	u32 data_len, sig_len;
4226 	int ret;
4227 
4228 	if (trusted_keyring->has_ref) {
4229 		/*
4230 		 * Do the permission check deferred in bpf_lookup_user_key().
4231 		 * See bpf_lookup_user_key() for more details.
4232 		 *
4233 		 * A call to key_task_permission() here would be redundant, as
4234 		 * it is already done by keyring_search() called by
4235 		 * find_asymmetric_key().
4236 		 */
4237 		ret = key_validate(trusted_keyring->key);
4238 		if (ret < 0)
4239 			return ret;
4240 	}
4241 
4242 	data_len = __bpf_dynptr_size(data_ptr);
4243 	data = __bpf_dynptr_data(data_ptr, data_len);
4244 	sig_len = __bpf_dynptr_size(sig_ptr);
4245 	sig = __bpf_dynptr_data(sig_ptr, sig_len);
4246 
4247 	return verify_pkcs7_signature(data, data_len, sig, sig_len,
4248 				      trusted_keyring->key,
4249 				      VERIFYING_BPF_SIGNATURE, NULL,
4250 				      NULL);
4251 #else
4252 	return -EOPNOTSUPP;
4253 #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
4254 }
4255 #endif /* CONFIG_KEYS */
4256 
/* Signature of the BPF subprogram invoked from the scheduled task_work */
typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);

/*
 * Lifecycle states of a bpf_task_work_ctx; all transitions are performed
 * atomically (cmpxchg/xchg) to resolve races between scheduling, the
 * callback, and map value deletion.
 */
enum bpf_task_work_state {
	/* bpf_task_work is ready to be used */
	BPF_TW_STANDBY = 0,
	/* irq work scheduling in progress */
	BPF_TW_PENDING,
	/* task work scheduling in progress */
	BPF_TW_SCHEDULING,
	/* task work is scheduled successfully */
	BPF_TW_SCHEDULED,
	/* callback is running */
	BPF_TW_RUNNING,
	/* associated BPF map value is deleted */
	BPF_TW_FREED,
};

struct bpf_task_work_ctx {
	enum bpf_task_work_state state;
	refcount_t refcnt;
	/* task_work entry queued on @task */
	struct callback_head work;
	/* used to defer scheduling and teardown out of restricted contexts */
	struct irq_work irq_work;
	/* bpf_prog that schedules task work */
	struct bpf_prog *prog;
	/* task for which callback is scheduled */
	struct task_struct *task;
	/* the map and map value associated with this context */
	struct bpf_map *map;
	void *map_val;
	enum task_work_notify_mode mode;
	bpf_task_work_callback_t callback_fn;
	/* defers the actual free by an RCU grace period (kfree_rcu) */
	struct rcu_head rcu;
} __aligned(8);

/* Actual type for struct bpf_task_work */
struct bpf_task_work_kern {
	struct bpf_task_work_ctx *ctx;
};
4295 
/* Drop the prog and task references held by @ctx, if any (idempotent). */
static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
{
	if (ctx->prog) {
		bpf_prog_put(ctx->prog);
		ctx->prog = NULL;
	}
	if (ctx->task) {
		bpf_task_release(ctx->task);
		ctx->task = NULL;
	}
}
4307 
/* Take a ctx reference, unless refcnt already hit zero (ctx being freed). */
static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
{
	return refcount_inc_not_zero(&ctx->refcnt);
}
4312 
/*
 * Final ctx teardown, run via irq_work (or called directly when IRQs are
 * enabled): drop prog/task refs and free the ctx after an RCU grace period.
 */
static void bpf_task_work_destroy(struct irq_work *irq_work)
{
	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);

	bpf_task_work_ctx_reset(ctx);
	kfree_rcu(ctx, rcu);
}
4320 
/* Drop a ctx reference; the last put triggers destruction. */
static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
{
	if (!refcount_dec_and_test(&ctx->refcnt))
		return;

	/*
	 * With IRQs disabled, defer the teardown to irq_work; otherwise
	 * destroy inline (passing our embedded irq_work as the handle).
	 */
	if (irqs_disabled()) {
		ctx->irq_work = IRQ_WORK_INIT(bpf_task_work_destroy);
		irq_work_queue(&ctx->irq_work);
	} else {
		bpf_task_work_destroy(&ctx->irq_work);
	}
}
4333 
/* Try to cancel the task_work callback scheduled for @ctx. */
static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
{
	/*
	 * Scheduled task_work callback holds ctx ref, so if we successfully
	 * cancelled, we put that ref on callback's behalf. If we couldn't
	 * cancel, callback will inevitably run or has already completed
	 * running, and it would have taken care of its ctx ref itself.
	 */
	if (task_work_cancel(ctx->task, &ctx->work))
		bpf_task_work_ctx_put(ctx);
}
4345 
/*
 * task_work callback: runs in the context of ctx->task, invokes the BPF
 * subprogram with the bound map, key and value, then returns the ctx to
 * STANDBY unless the map value was freed in the meantime.
 */
static void bpf_task_work_callback(struct callback_head *cb)
{
	struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
	enum bpf_task_work_state state;
	u32 idx;
	void *key;

	/* Read lock is needed to protect ctx and map key/value access */
	guard(rcu_tasks_trace)();
	/*
	 * This callback may start running before bpf_task_work_irq() switched to
	 * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
	 */
	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
	if (state == BPF_TW_SCHEDULED)
		state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
	if (state == BPF_TW_FREED) {
		/* map value went away; just drop the callback's ctx ref */
		bpf_task_work_ctx_put(ctx);
		return;
	}

	key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);

	migrate_disable();
	ctx->callback_fn(ctx->map, key, ctx->map_val);
	migrate_enable();

	bpf_task_work_ctx_reset(ctx);
	/* ctx may have been marked FREED concurrently; only reset if still RUNNING */
	(void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);

	bpf_task_work_ctx_put(ctx);
}
4378 
/*
 * irq_work stage of scheduling: move PENDING -> SCHEDULING, perform
 * task_work_add(), then attempt SCHEDULING -> SCHEDULED. Races with
 * cancellation/freeing are resolved through the ctx->state protocol.
 */
static void bpf_task_work_irq(struct irq_work *irq_work)
{
	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
	enum bpf_task_work_state state;
	int err;

	guard(rcu)();

	if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
		/* state moved (e.g. to FREED) behind our back; drop our ref */
		bpf_task_work_ctx_put(ctx);
		return;
	}

	err = task_work_add(ctx->task, &ctx->work, ctx->mode);
	if (err) {
		bpf_task_work_ctx_reset(ctx);
		/*
		 * try to switch back to STANDBY for another task_work reuse, but we might have
		 * gone to FREED already, which is fine as we already cleaned up after ourselves
		 */
		(void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
		bpf_task_work_ctx_put(ctx);
		return;
	}

	/*
	 * It's technically possible for just scheduled task_work callback to
	 * complete running by now, going SCHEDULING -> RUNNING and then
	 * dropping its ctx refcount. Instead of capturing an extra ref just
	 * to protect below ctx->state access, we rely on rcu_read_lock
	 * above to prevent kfree_rcu from freeing ctx before we return.
	 */
	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
	if (state == BPF_TW_FREED)
		bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
}
4415 
/*
 * Return the ctx published in @tw, allocating and publishing a fresh one
 * (via cmpxchg) on first use. On a lost publishing race, the concurrently
 * installed ctx is returned instead. Returns ERR_PTR(-ENOMEM) on failure.
 */
static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
							 struct bpf_map *map)
{
	struct bpf_task_work_kern *twk = (void *)tw;
	struct bpf_task_work_ctx *ctx, *old_ctx;

	ctx = READ_ONCE(twk->ctx);
	if (ctx)
		return ctx;

	ctx = bpf_map_kmalloc_nolock(map, sizeof(*ctx), 0, NUMA_NO_NODE);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	memset(ctx, 0, sizeof(*ctx));
	refcount_set(&ctx->refcnt, 1); /* map's own ref */
	ctx->state = BPF_TW_STANDBY;

	old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
	if (old_ctx) {
		/*
		 * tw->ctx is set by concurrent BPF program, release allocated
		 * memory and try to reuse already set context.
		 */
		kfree_nolock(ctx);
		return old_ctx;
	}

	return ctx; /* Success */
}
4446 
/*
 * Fetch (or create) the ctx for @tw, take an extra ref for the soon-to-be
 * scheduled task_work, and move the ctx STANDBY -> PENDING. Fails with
 * -EBUSY when losing a race or when user references to the map are gone.
 */
static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
							   struct bpf_map *map)
{
	struct bpf_task_work_ctx *ctx;

	/*
	 * Sleepable BPF programs hold rcu_read_lock_trace but not
	 * regular rcu_read_lock. Since kfree_rcu waits for regular
	 * RCU GP, the ctx can be freed while we're between reading
	 * the pointer and incrementing the refcount. Take regular
	 * rcu_read_lock to prevent kfree_rcu from freeing the ctx
	 * before we can tryget it.
	 */
	scoped_guard(rcu) {
		ctx = bpf_task_work_fetch_ctx(tw, map);
		if (IS_ERR(ctx))
			return ctx;

		/* try to get ref for task_work callback to hold */
		if (!bpf_task_work_ctx_tryget(ctx))
			return ERR_PTR(-EBUSY);
	}

	if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
		/* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
		bpf_task_work_ctx_put(ctx);
		return ERR_PTR(-EBUSY);
	}

	/*
	 * If no process or bpffs is holding a reference to the map, no new callbacks should be
	 * scheduled. This does not address any race or correctness issue, but rather is a policy
	 * choice: dropping user references should stop everything.
	 */
	if (!atomic64_read(&map->usercnt)) {
		/* drop ref we just got for task_work callback itself */
		bpf_task_work_ctx_put(ctx);
		/* transfer map's ref into cancel_and_free() */
		bpf_task_work_cancel_and_free(tw);
		return ERR_PTR(-EBUSY);
	}

	return ctx;
}
4491 
/*
 * Common path for the bpf_task_work_schedule_*() kfuncs: pin the prog and
 * the task, arm the per-map-value ctx, and queue an irq_work that performs
 * the actual task_work_add() step.
 */
static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
				  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
				  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
{
	struct bpf_prog *prog;
	struct bpf_task_work_ctx *ctx;
	int err;

	/* ensure BTF type info for struct bpf_task_work is emitted */
	BTF_TYPE_EMIT(struct bpf_task_work);

	prog = bpf_prog_inc_not_zero(aux->prog);
	if (IS_ERR(prog))
		return -EBADF;
	task = bpf_task_acquire(task);
	if (!task) {
		err = -EBADF;
		goto release_prog;
	}

	ctx = bpf_task_work_acquire_ctx(tw, map);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto release_all;
	}

	ctx->task = task;
	ctx->callback_fn = callback_fn;
	ctx->prog = prog;
	ctx->mode = mode;
	ctx->map = map;
	/* @tw is embedded in the map value at offset task_work_off */
	ctx->map_val = (void *)tw - map->record->task_work_off;
	init_task_work(&ctx->work, bpf_task_work_callback);
	init_irq_work(&ctx->irq_work, bpf_task_work_irq);

	irq_work_queue(&ctx->irq_work);
	return 0;

release_all:
	bpf_task_release(task);
release_prog:
	bpf_prog_put(prog);
	return err;
}
4535 
4536 /**
4537  * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL
4538  * mode
4539  * @task: Task struct for which callback should be scheduled
4540  * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
4541  * @map__map: bpf_map that embeds struct bpf_task_work in the values
4542  * @callback: pointer to BPF subprogram to call
4543  * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
4544  *
4545  * Return: 0 if task work has been scheduled successfully, negative error code otherwise
4546  */
4547 __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
4548 					      void *map__map, bpf_task_work_callback_t callback,
4549 					      struct bpf_prog_aux *aux)
4550 {
4551 	return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL);
4552 }
4553 
4554 /**
4555  * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME
4556  * mode
4557  * @task: Task struct for which callback should be scheduled
4558  * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
4559  * @map__map: bpf_map that embeds struct bpf_task_work in the values
4560  * @callback: pointer to BPF subprogram to call
4561  * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
4562  *
4563  * Return: 0 if task work has been scheduled successfully, negative error code otherwise
4564  */
4565 __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
4566 					      void *map__map, bpf_task_work_callback_t callback,
4567 					      struct bpf_prog_aux *aux)
4568 {
4569 	return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME);
4570 }
4571 
4572 static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
4573 			    struct bpf_dynptr_kern *ptr)
4574 {
4575 	struct bpf_dynptr_file_impl *state;
4576 
4577 	/* flags is currently unsupported */
4578 	if (flags) {
4579 		bpf_dynptr_set_null(ptr);
4580 		return -EINVAL;
4581 	}
4582 
4583 	state = kmalloc_nolock(sizeof(*state), 0, NUMA_NO_NODE);
4584 	if (!state) {
4585 		bpf_dynptr_set_null(ptr);
4586 		return -ENOMEM;
4587 	}
4588 	state->offset = 0;
4589 	state->size = U64_MAX; /* Don't restrict size, as file may change anyways */
4590 	freader_init_from_file(&state->freader, NULL, 0, file, may_sleep);
4591 	bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0);
4592 	bpf_dynptr_set_rdonly(ptr);
4593 	return 0;
4594 }
4595 
/* Create a read-only file-backed dynptr; non-sleepable variant. */
__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
{
	return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit);
}
4600 
/* may_sleep variant of bpf_dynptr_from_file() */
int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
{
	return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit);
}
4605 
4606 __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
4607 {
4608 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr;
4609 	struct bpf_dynptr_file_impl *df = ptr->data;
4610 
4611 	if (!df)
4612 		return 0;
4613 
4614 	freader_cleanup(&df->freader);
4615 	kfree_nolock(df);
4616 	bpf_dynptr_set_null(ptr);
4617 	return 0;
4618 }
4619 
4620 /**
4621  * bpf_timer_cancel_async - try to deactivate a timer
4622  * @timer:	bpf_timer to stop
4623  *
4624  * Returns:
4625  *
4626  *  *  0 when the timer was not active
4627  *  *  1 when the timer was active
4628  *  * -1 when the timer is currently executing the callback function and
4629  *       cannot be stopped
4630  *  * -ECANCELED when the timer will be cancelled asynchronously
4631  *  * -ENOMEM when out of memory
4632  *  * -EINVAL when the timer was not initialized
4633  *  * -ENOENT when this kfunc is racing with timer deletion
4634  */
4635 __bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer)
4636 {
4637 	struct bpf_async_kern *async = (void *)timer;
4638 	struct bpf_async_cb *cb;
4639 	int ret;
4640 
4641 	cb = READ_ONCE(async->cb);
4642 	if (!cb)
4643 		return -EINVAL;
4644 
4645 	/*
4646 	 * Unlike hrtimer_start() it's ok to synchronously call
4647 	 * hrtimer_try_to_cancel() when refcnt reached zero, but deferring to
4648 	 * irq_work is not, since irq callback may execute after RCU GP and
4649 	 * cb could be freed at that time. Check for refcnt zero for
4650 	 * consistency.
4651 	 */
4652 	if (!refcount_inc_not_zero(&cb->refcnt))
4653 		return -ENOENT;
4654 
4655 	if (!defer_timer_wq_op()) {
4656 		struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
4657 
4658 		ret = hrtimer_try_to_cancel(&t->timer);
4659 		bpf_async_refcount_put(cb);
4660 		return ret;
4661 	} else {
4662 		ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
4663 		return ret ? ret : -ECANCELED;
4664 	}
4665 }
4666 
4667 __bpf_kfunc_end_defs();
4668 
/* irq_work handler used by bpf_task_work_cancel_and_free() for SCHEDULED ctxs */
static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
{
	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);

	bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
	bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
}
4676 
/*
 * Called when the map value embedding @val goes away: detach the ctx,
 * mark it FREED, cancel any scheduled callback (deferred to irq_work),
 * and drop the map's own reference.
 */
void bpf_task_work_cancel_and_free(void *val)
{
	struct bpf_task_work_kern *twk = val;
	struct bpf_task_work_ctx *ctx;
	enum bpf_task_work_state state;

	ctx = xchg(&twk->ctx, NULL);
	if (!ctx)
		return;

	state = xchg(&ctx->state, BPF_TW_FREED);
	if (state == BPF_TW_SCHEDULED) {
		/* run in irq_work to avoid locks in NMI */
		init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
		irq_work_queue(&ctx->irq_work);
		return;
	}

	bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
}
4697 
/*
 * kfuncs registered for a specific set of program types (tracing, sched_cls,
 * xdp, struct_ops, syscall, cgroup_skb) — see kfunc_init() below.
 */
BTF_KFUNCS_START(generic_btf_ids)
#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_percpu_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_percpu_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_refcount_acquire, KF_ACQUIRE | KF_RET_NULL | KF_RCU | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_add, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)

#ifdef CONFIG_CGROUPS
BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_throw)
#ifdef CONFIG_BPF_EVENTS
BTF_ID_FLAGS(func, bpf_send_signal_task)
#endif
#ifdef CONFIG_KEYS
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
#endif
#endif
#ifdef CONFIG_S390
BTF_ID_FLAGS(func, bpf_get_lowcore)
#endif
BTF_KFUNCS_END(generic_btf_ids)
4756 
/* id-set wrapper handed to register_btf_kfunc_id_set() in kfunc_init() */
static const struct btf_kfunc_id_set generic_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &generic_btf_ids,
};
4761 
4762 
/* (struct type, dtor kfunc) BTF id pairs consumed by kfunc_init() */
BTF_ID_LIST(generic_dtor_ids)
BTF_ID(struct, task_struct)
BTF_ID(func, bpf_task_release_dtor)
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
BTF_ID(func, bpf_cgroup_release_dtor)
#endif
4770 
4771 BTF_KFUNCS_START(common_btf_ids)
4772 BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
4773 BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
4774 BTF_ID_FLAGS(func, bpf_rcu_read_lock)
4775 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
4776 BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
4777 BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
4778 BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
4779 BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
4780 BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
4781 BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
4782 BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
4783 BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
4784 #ifdef CONFIG_CGROUPS
4785 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
4786 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
4787 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
4788 BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED)
4789 BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
4790 BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
4791 #endif
4792 BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED)
4793 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
4794 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
4795 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
4796 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
4797 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
4798 BTF_ID_FLAGS(func, bpf_dynptr_size)
4799 BTF_ID_FLAGS(func, bpf_dynptr_clone)
4800 BTF_ID_FLAGS(func, bpf_dynptr_copy)
4801 BTF_ID_FLAGS(func, bpf_dynptr_memset)
4802 #ifdef CONFIG_NET
4803 BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
4804 #endif
4805 BTF_ID_FLAGS(func, bpf_wq_init)
4806 BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS)
4807 BTF_ID_FLAGS(func, bpf_wq_start)
4808 BTF_ID_FLAGS(func, bpf_preempt_disable)
4809 BTF_ID_FLAGS(func, bpf_preempt_enable)
4810 BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
4811 BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
4812 BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
4813 BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
4814 BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE)
4815 BTF_ID_FLAGS(func, bpf_get_kmem_cache)
4816 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
4817 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
4818 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
4819 BTF_ID_FLAGS(func, bpf_local_irq_save)
4820 BTF_ID_FLAGS(func, bpf_local_irq_restore)
4821 #ifdef CONFIG_BPF_EVENTS
4822 BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
4823 BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
4824 BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
4825 BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
4826 BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
4827 BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
4828 BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE)
4829 BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE)
4830 #endif
4831 #ifdef CONFIG_DMA_SHARED_BUFFER
4832 BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
4833 BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
4834 BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
4835 #endif
4836 BTF_ID_FLAGS(func, __bpf_trap)
4837 BTF_ID_FLAGS(func, bpf_strcmp);
4838 BTF_ID_FLAGS(func, bpf_strcasecmp);
4839 BTF_ID_FLAGS(func, bpf_strncasecmp);
4840 BTF_ID_FLAGS(func, bpf_strchr);
4841 BTF_ID_FLAGS(func, bpf_strchrnul);
4842 BTF_ID_FLAGS(func, bpf_strnchr);
4843 BTF_ID_FLAGS(func, bpf_strrchr);
4844 BTF_ID_FLAGS(func, bpf_strlen);
4845 BTF_ID_FLAGS(func, bpf_strnlen);
4846 BTF_ID_FLAGS(func, bpf_strspn);
4847 BTF_ID_FLAGS(func, bpf_strcspn);
4848 BTF_ID_FLAGS(func, bpf_strstr);
4849 BTF_ID_FLAGS(func, bpf_strcasestr);
4850 BTF_ID_FLAGS(func, bpf_strnstr);
4851 BTF_ID_FLAGS(func, bpf_strncasestr);
4852 #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
4853 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
4854 #endif
4855 BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS)
4856 BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS)
4857 BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS)
4858 BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS)
4859 BTF_ID_FLAGS(func, bpf_dynptr_from_file)
4860 BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
4861 BTF_ID_FLAGS(func, bpf_timer_cancel_async)
4862 BTF_KFUNCS_END(common_btf_ids)
4863 
/* id-set wrapper registered under BPF_PROG_TYPE_UNSPEC in kfunc_init() */
static const struct btf_kfunc_id_set common_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &common_btf_ids,
};
4868 
4869 static int __init kfunc_init(void)
4870 {
4871 	int ret;
4872 	const struct btf_id_dtor_kfunc generic_dtors[] = {
4873 		{
4874 			.btf_id       = generic_dtor_ids[0],
4875 			.kfunc_btf_id = generic_dtor_ids[1]
4876 		},
4877 #ifdef CONFIG_CGROUPS
4878 		{
4879 			.btf_id       = generic_dtor_ids[2],
4880 			.kfunc_btf_id = generic_dtor_ids[3]
4881 		},
4882 #endif
4883 	};
4884 
4885 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
4886 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
4887 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
4888 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
4889 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
4890 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set);
4891 	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
4892 						  ARRAY_SIZE(generic_dtors),
4893 						  THIS_MODULE);
4894 	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
4895 }
4896 
4897 late_initcall(kfunc_init);
4898 
4899 /* Get a pointer to dynptr data up to len bytes for read only access. If
4900  * the dynptr doesn't have continuous data up to len bytes, return NULL.
4901  */
4902 const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len)
4903 {
4904 	const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
4905 
4906 	return bpf_dynptr_slice(p, 0, NULL, len);
4907 }
4908 
4909 /* Get a pointer to dynptr data up to len bytes for read write access. If
4910  * the dynptr doesn't have continuous data up to len bytes, or the dynptr
4911  * is read only, return NULL.
4912  */
4913 void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len)
4914 {
4915 	if (__bpf_dynptr_is_rdonly(ptr))
4916 		return NULL;
4917 	return (void *)__bpf_dynptr_data(ptr, len);
4918 }
4919 
4920 void bpf_map_free_internal_structs(struct bpf_map *map, void *val)
4921 {
4922 	if (btf_record_has_field(map->record, BPF_TIMER))
4923 		bpf_obj_free_timer(map->record, val);
4924 	if (btf_record_has_field(map->record, BPF_WORKQUEUE))
4925 		bpf_obj_free_workqueue(map->record, val);
4926 	if (btf_record_has_field(map->record, BPF_TASK_WORK))
4927 		bpf_obj_free_task_work(map->record, val);
4928 }
4929