xref: /linux/kernel/bpf/helpers.c (revision 9779193e871b144e34ec4a3e50109b3778a51a69)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  */
4 #include <linux/bpf.h>
5 #include <linux/btf.h>
6 #include <linux/bpf-cgroup.h>
7 #include <linux/cgroup.h>
8 #include <linux/rcupdate.h>
9 #include <linux/random.h>
10 #include <linux/smp.h>
11 #include <linux/topology.h>
12 #include <linux/ktime.h>
13 #include <linux/sched.h>
14 #include <linux/uidgid.h>
15 #include <linux/filter.h>
16 #include <linux/ctype.h>
17 #include <linux/jiffies.h>
18 #include <linux/pid_namespace.h>
19 #include <linux/poison.h>
20 #include <linux/proc_ns.h>
21 #include <linux/sched/task.h>
22 #include <linux/security.h>
23 #include <linux/btf_ids.h>
24 #include <linux/bpf_mem_alloc.h>
25 #include <linux/kasan.h>
26 #include <linux/bpf_verifier.h>
27 #include <linux/uaccess.h>
28 #include <linux/verification.h>
29 #include <linux/task_work.h>
30 #include <linux/irq_work.h>
31 #include <linux/buildid.h>
32 
33 #include "../../lib/kstrtox.h"
34 
35 /* If kernel subsystem is allowing eBPF programs to call this function,
36  * inside its own verifier_ops->get_func_proto() callback it should return
37  * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
38  *
39  * Different map implementations will rely on rcu in map methods
40  * lookup/update/delete, therefore eBPF programs must run under rcu lock
41  * if program is allowed to access maps, so check rcu_read_lock_held() or
42  * rcu_read_lock_trace_held() in all three functions.
43  */
44 BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
45 {
46 	WARN_ON_ONCE(!bpf_rcu_lock_held());
47 	return (unsigned long) map->ops->map_lookup_elem(map, key);
48 }
49 
50 const struct bpf_func_proto bpf_map_lookup_elem_proto = {
51 	.func		= bpf_map_lookup_elem,
52 	.gpl_only	= false,
53 	.pkt_access	= true,
54 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
55 	.arg1_type	= ARG_CONST_MAP_PTR,
56 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
57 };
58 
59 BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
60 	   void *, value, u64, flags)
61 {
62 	WARN_ON_ONCE(!bpf_rcu_lock_held());
63 	return map->ops->map_update_elem(map, key, value, flags);
64 }
65 
66 const struct bpf_func_proto bpf_map_update_elem_proto = {
67 	.func		= bpf_map_update_elem,
68 	.gpl_only	= false,
69 	.pkt_access	= true,
70 	.ret_type	= RET_INTEGER,
71 	.arg1_type	= ARG_CONST_MAP_PTR,
72 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
73 	.arg3_type	= ARG_PTR_TO_MAP_VALUE,
74 	.arg4_type	= ARG_ANYTHING,
75 };
76 
77 BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
78 {
79 	WARN_ON_ONCE(!bpf_rcu_lock_held());
80 	return map->ops->map_delete_elem(map, key);
81 }
82 
83 const struct bpf_func_proto bpf_map_delete_elem_proto = {
84 	.func		= bpf_map_delete_elem,
85 	.gpl_only	= false,
86 	.pkt_access	= true,
87 	.ret_type	= RET_INTEGER,
88 	.arg1_type	= ARG_CONST_MAP_PTR,
89 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
90 };
91 
92 BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
93 {
94 	return map->ops->map_push_elem(map, value, flags);
95 }
96 
97 const struct bpf_func_proto bpf_map_push_elem_proto = {
98 	.func		= bpf_map_push_elem,
99 	.gpl_only	= false,
100 	.pkt_access	= true,
101 	.ret_type	= RET_INTEGER,
102 	.arg1_type	= ARG_CONST_MAP_PTR,
103 	.arg2_type	= ARG_PTR_TO_MAP_VALUE,
104 	.arg3_type	= ARG_ANYTHING,
105 };
106 
107 BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
108 {
109 	return map->ops->map_pop_elem(map, value);
110 }
111 
112 const struct bpf_func_proto bpf_map_pop_elem_proto = {
113 	.func		= bpf_map_pop_elem,
114 	.gpl_only	= false,
115 	.ret_type	= RET_INTEGER,
116 	.arg1_type	= ARG_CONST_MAP_PTR,
117 	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
118 };
119 
120 BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
121 {
122 	return map->ops->map_peek_elem(map, value);
123 }
124 
125 const struct bpf_func_proto bpf_map_peek_elem_proto = {
126 	.func		= bpf_map_peek_elem,
127 	.gpl_only	= false,
128 	.ret_type	= RET_INTEGER,
129 	.arg1_type	= ARG_CONST_MAP_PTR,
130 	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
131 };
132 
133 BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
134 {
135 	WARN_ON_ONCE(!bpf_rcu_lock_held());
136 	return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
137 }
138 
139 const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
140 	.func		= bpf_map_lookup_percpu_elem,
141 	.gpl_only	= false,
142 	.pkt_access	= true,
143 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
144 	.arg1_type	= ARG_CONST_MAP_PTR,
145 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
146 	.arg3_type	= ARG_ANYTHING,
147 };
148 
149 const struct bpf_func_proto bpf_get_prandom_u32_proto = {
150 	.func		= bpf_user_rnd_u32,
151 	.gpl_only	= false,
152 	.ret_type	= RET_INTEGER,
153 };
154 
155 BPF_CALL_0(bpf_get_smp_processor_id)
156 {
157 	return smp_processor_id();
158 }
159 
160 const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
161 	.func		= bpf_get_smp_processor_id,
162 	.gpl_only	= false,
163 	.ret_type	= RET_INTEGER,
164 	.allow_fastcall	= true,
165 };
166 
167 BPF_CALL_0(bpf_get_numa_node_id)
168 {
169 	return numa_node_id();
170 }
171 
172 const struct bpf_func_proto bpf_get_numa_node_id_proto = {
173 	.func		= bpf_get_numa_node_id,
174 	.gpl_only	= false,
175 	.ret_type	= RET_INTEGER,
176 };
177 
178 BPF_CALL_0(bpf_ktime_get_ns)
179 {
180 	/* NMI safe access to clock monotonic */
181 	return ktime_get_mono_fast_ns();
182 }
183 
184 const struct bpf_func_proto bpf_ktime_get_ns_proto = {
185 	.func		= bpf_ktime_get_ns,
186 	.gpl_only	= false,
187 	.ret_type	= RET_INTEGER,
188 };
189 
190 BPF_CALL_0(bpf_ktime_get_boot_ns)
191 {
192 	/* NMI safe access to clock boottime */
193 	return ktime_get_boot_fast_ns();
194 }
195 
196 const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
197 	.func		= bpf_ktime_get_boot_ns,
198 	.gpl_only	= false,
199 	.ret_type	= RET_INTEGER,
200 };
201 
202 BPF_CALL_0(bpf_ktime_get_coarse_ns)
203 {
204 	return ktime_get_coarse_ns();
205 }
206 
207 const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
208 	.func		= bpf_ktime_get_coarse_ns,
209 	.gpl_only	= false,
210 	.ret_type	= RET_INTEGER,
211 };
212 
213 BPF_CALL_0(bpf_ktime_get_tai_ns)
214 {
215 	/* NMI safe access to clock tai */
216 	return ktime_get_tai_fast_ns();
217 }
218 
219 const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
220 	.func		= bpf_ktime_get_tai_ns,
221 	.gpl_only	= false,
222 	.ret_type	= RET_INTEGER,
223 };
224 
225 BPF_CALL_0(bpf_get_current_pid_tgid)
226 {
227 	struct task_struct *task = current;
228 
229 	if (unlikely(!task))
230 		return -EINVAL;
231 
232 	return (u64) task->tgid << 32 | task->pid;
233 }
234 
235 const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
236 	.func		= bpf_get_current_pid_tgid,
237 	.gpl_only	= false,
238 	.ret_type	= RET_INTEGER,
239 };
240 
241 BPF_CALL_0(bpf_get_current_uid_gid)
242 {
243 	struct task_struct *task = current;
244 	kuid_t uid;
245 	kgid_t gid;
246 
247 	if (unlikely(!task))
248 		return -EINVAL;
249 
250 	current_uid_gid(&uid, &gid);
251 	return (u64) from_kgid(&init_user_ns, gid) << 32 |
252 		     from_kuid(&init_user_ns, uid);
253 }
254 
255 const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
256 	.func		= bpf_get_current_uid_gid,
257 	.gpl_only	= false,
258 	.ret_type	= RET_INTEGER,
259 };
260 
261 BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
262 {
263 	struct task_struct *task = current;
264 
265 	if (unlikely(!task))
266 		goto err_clear;
267 
268 	/* Verifier guarantees that size > 0 */
269 	strscpy_pad(buf, task->comm, size);
270 	return 0;
271 err_clear:
272 	memset(buf, 0, size);
273 	return -EINVAL;
274 }
275 
276 const struct bpf_func_proto bpf_get_current_comm_proto = {
277 	.func		= bpf_get_current_comm,
278 	.gpl_only	= false,
279 	.ret_type	= RET_INTEGER,
280 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
281 	.arg2_type	= ARG_CONST_SIZE,
282 };
283 
284 #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
285 
286 static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
287 {
288 	arch_spinlock_t *l = (void *)lock;
289 	union {
290 		__u32 val;
291 		arch_spinlock_t lock;
292 	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
293 
294 	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
295 	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
296 	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
297 	preempt_disable();
298 	arch_spin_lock(l);
299 }
300 
301 static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
302 {
303 	arch_spinlock_t *l = (void *)lock;
304 
305 	arch_spin_unlock(l);
306 	preempt_enable();
307 }
308 
309 #else
310 
311 static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
312 {
313 	atomic_t *l = (void *)lock;
314 
315 	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
316 	do {
317 		atomic_cond_read_relaxed(l, !VAL);
318 	} while (atomic_xchg(l, 1));
319 }
320 
321 static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
322 {
323 	atomic_t *l = (void *)lock;
324 
325 	atomic_set_release(l, 0);
326 }
327 
328 #endif
329 
330 static DEFINE_PER_CPU(unsigned long, irqsave_flags);
331 
332 static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
333 {
334 	unsigned long flags;
335 
336 	local_irq_save(flags);
337 	__bpf_spin_lock(lock);
338 	__this_cpu_write(irqsave_flags, flags);
339 }
340 
341 NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
342 {
343 	__bpf_spin_lock_irqsave(lock);
344 	return 0;
345 }
346 
347 const struct bpf_func_proto bpf_spin_lock_proto = {
348 	.func		= bpf_spin_lock,
349 	.gpl_only	= false,
350 	.ret_type	= RET_VOID,
351 	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
352 	.arg1_btf_id    = BPF_PTR_POISON,
353 };
354 
355 static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
356 {
357 	unsigned long flags;
358 
359 	flags = __this_cpu_read(irqsave_flags);
360 	__bpf_spin_unlock(lock);
361 	local_irq_restore(flags);
362 }
363 
364 NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
365 {
366 	__bpf_spin_unlock_irqrestore(lock);
367 	return 0;
368 }
369 
370 const struct bpf_func_proto bpf_spin_unlock_proto = {
371 	.func		= bpf_spin_unlock,
372 	.gpl_only	= false,
373 	.ret_type	= RET_VOID,
374 	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
375 	.arg1_btf_id    = BPF_PTR_POISON,
376 };
377 
378 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
379 			   bool lock_src)
380 {
381 	struct bpf_spin_lock *lock;
382 
383 	if (lock_src)
384 		lock = src + map->record->spin_lock_off;
385 	else
386 		lock = dst + map->record->spin_lock_off;
387 	preempt_disable();
388 	__bpf_spin_lock_irqsave(lock);
389 	copy_map_value(map, dst, src);
390 	__bpf_spin_unlock_irqrestore(lock);
391 	preempt_enable();
392 }
393 
394 BPF_CALL_0(bpf_jiffies64)
395 {
396 	return get_jiffies_64();
397 }
398 
399 const struct bpf_func_proto bpf_jiffies64_proto = {
400 	.func		= bpf_jiffies64,
401 	.gpl_only	= false,
402 	.ret_type	= RET_INTEGER,
403 };
404 
405 #ifdef CONFIG_CGROUPS
406 BPF_CALL_0(bpf_get_current_cgroup_id)
407 {
408 	struct cgroup *cgrp;
409 	u64 cgrp_id;
410 
411 	rcu_read_lock();
412 	cgrp = task_dfl_cgroup(current);
413 	cgrp_id = cgroup_id(cgrp);
414 	rcu_read_unlock();
415 
416 	return cgrp_id;
417 }
418 
419 const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
420 	.func		= bpf_get_current_cgroup_id,
421 	.gpl_only	= false,
422 	.ret_type	= RET_INTEGER,
423 };
424 
425 BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
426 {
427 	struct cgroup *cgrp;
428 	struct cgroup *ancestor;
429 	u64 cgrp_id;
430 
431 	rcu_read_lock();
432 	cgrp = task_dfl_cgroup(current);
433 	ancestor = cgroup_ancestor(cgrp, ancestor_level);
434 	cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
435 	rcu_read_unlock();
436 
437 	return cgrp_id;
438 }
439 
440 const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
441 	.func		= bpf_get_current_ancestor_cgroup_id,
442 	.gpl_only	= false,
443 	.ret_type	= RET_INTEGER,
444 	.arg1_type	= ARG_ANYTHING,
445 };
446 #endif /* CONFIG_CGROUPS */
447 
448 #define BPF_STRTOX_BASE_MASK 0x1F
449 
450 static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
451 			  unsigned long long *res, bool *is_negative)
452 {
453 	unsigned int base = flags & BPF_STRTOX_BASE_MASK;
454 	const char *cur_buf = buf;
455 	size_t cur_len = buf_len;
456 	unsigned int consumed;
457 	size_t val_len;
458 	char str[64];
459 
460 	if (!buf || !buf_len || !res || !is_negative)
461 		return -EINVAL;
462 
463 	if (base != 0 && base != 8 && base != 10 && base != 16)
464 		return -EINVAL;
465 
466 	if (flags & ~BPF_STRTOX_BASE_MASK)
467 		return -EINVAL;
468 
469 	while (cur_buf < buf + buf_len && isspace(*cur_buf))
470 		++cur_buf;
471 
472 	*is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
473 	if (*is_negative)
474 		++cur_buf;
475 
476 	consumed = cur_buf - buf;
477 	cur_len -= consumed;
478 	if (!cur_len)
479 		return -EINVAL;
480 
481 	cur_len = min(cur_len, sizeof(str) - 1);
482 	memcpy(str, cur_buf, cur_len);
483 	str[cur_len] = '\0';
484 	cur_buf = str;
485 
486 	cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
487 	val_len = _parse_integer(cur_buf, base, res);
488 
489 	if (val_len & KSTRTOX_OVERFLOW)
490 		return -ERANGE;
491 
492 	if (val_len == 0)
493 		return -EINVAL;
494 
495 	cur_buf += val_len;
496 	consumed += cur_buf - str;
497 
498 	return consumed;
499 }
500 
501 static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
502 			 long long *res)
503 {
504 	unsigned long long _res;
505 	bool is_negative;
506 	int err;
507 
508 	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
509 	if (err < 0)
510 		return err;
511 	if (is_negative) {
512 		if ((long long)-_res > 0)
513 			return -ERANGE;
514 		*res = -_res;
515 	} else {
516 		if ((long long)_res < 0)
517 			return -ERANGE;
518 		*res = _res;
519 	}
520 	return err;
521 }
522 
523 BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
524 	   s64 *, res)
525 {
526 	long long _res;
527 	int err;
528 
529 	*res = 0;
530 	err = __bpf_strtoll(buf, buf_len, flags, &_res);
531 	if (err < 0)
532 		return err;
533 	*res = _res;
534 	return err;
535 }
536 
537 const struct bpf_func_proto bpf_strtol_proto = {
538 	.func		= bpf_strtol,
539 	.gpl_only	= false,
540 	.ret_type	= RET_INTEGER,
541 	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
542 	.arg2_type	= ARG_CONST_SIZE,
543 	.arg3_type	= ARG_ANYTHING,
544 	.arg4_type	= ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
545 	.arg4_size	= sizeof(s64),
546 };
547 
548 BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
549 	   u64 *, res)
550 {
551 	unsigned long long _res;
552 	bool is_negative;
553 	int err;
554 
555 	*res = 0;
556 	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
557 	if (err < 0)
558 		return err;
559 	if (is_negative)
560 		return -EINVAL;
561 	*res = _res;
562 	return err;
563 }
564 
565 const struct bpf_func_proto bpf_strtoul_proto = {
566 	.func		= bpf_strtoul,
567 	.gpl_only	= false,
568 	.ret_type	= RET_INTEGER,
569 	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
570 	.arg2_type	= ARG_CONST_SIZE,
571 	.arg3_type	= ARG_ANYTHING,
572 	.arg4_type	= ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
573 	.arg4_size	= sizeof(u64),
574 };
575 
576 BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
577 {
578 	return strncmp(s1, s2, s1_sz);
579 }
580 
581 static const struct bpf_func_proto bpf_strncmp_proto = {
582 	.func		= bpf_strncmp,
583 	.gpl_only	= false,
584 	.ret_type	= RET_INTEGER,
585 	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
586 	.arg2_type	= ARG_CONST_SIZE,
587 	.arg3_type	= ARG_PTR_TO_CONST_STR,
588 };
589 
590 BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
591 	   struct bpf_pidns_info *, nsdata, u32, size)
592 {
593 	struct task_struct *task = current;
594 	struct pid_namespace *pidns;
595 	int err = -EINVAL;
596 
597 	if (unlikely(size != sizeof(struct bpf_pidns_info)))
598 		goto clear;
599 
600 	if (unlikely((u64)(dev_t)dev != dev))
601 		goto clear;
602 
603 	if (unlikely(!task))
604 		goto clear;
605 
606 	pidns = task_active_pid_ns(task);
607 	if (unlikely(!pidns)) {
608 		err = -ENOENT;
609 		goto clear;
610 	}
611 
612 	if (!ns_match(&pidns->ns, (dev_t)dev, ino))
613 		goto clear;
614 
615 	nsdata->pid = task_pid_nr_ns(task, pidns);
616 	nsdata->tgid = task_tgid_nr_ns(task, pidns);
617 	return 0;
618 clear:
619 	memset((void *)nsdata, 0, (size_t) size);
620 	return err;
621 }
622 
623 const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
624 	.func		= bpf_get_ns_current_pid_tgid,
625 	.gpl_only	= false,
626 	.ret_type	= RET_INTEGER,
627 	.arg1_type	= ARG_ANYTHING,
628 	.arg2_type	= ARG_ANYTHING,
629 	.arg3_type      = ARG_PTR_TO_UNINIT_MEM,
630 	.arg4_type      = ARG_CONST_SIZE,
631 };
632 
633 static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
634 	.func		= bpf_get_raw_cpu_id,
635 	.gpl_only	= false,
636 	.ret_type	= RET_INTEGER,
637 };
638 
639 BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
640 	   u64, flags, void *, data, u64, size)
641 {
642 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
643 		return -EINVAL;
644 
645 	return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
646 }
647 
648 const struct bpf_func_proto bpf_event_output_data_proto =  {
649 	.func		= bpf_event_output_data,
650 	.gpl_only       = true,
651 	.ret_type       = RET_INTEGER,
652 	.arg1_type      = ARG_PTR_TO_CTX,
653 	.arg2_type      = ARG_CONST_MAP_PTR,
654 	.arg3_type      = ARG_ANYTHING,
655 	.arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
656 	.arg5_type      = ARG_CONST_SIZE_OR_ZERO,
657 };
658 
659 BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
660 	   const void __user *, user_ptr)
661 {
662 	int ret = copy_from_user(dst, user_ptr, size);
663 
664 	if (unlikely(ret)) {
665 		memset(dst, 0, size);
666 		ret = -EFAULT;
667 	}
668 
669 	return ret;
670 }
671 
672 const struct bpf_func_proto bpf_copy_from_user_proto = {
673 	.func		= bpf_copy_from_user,
674 	.gpl_only	= false,
675 	.might_sleep	= true,
676 	.ret_type	= RET_INTEGER,
677 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
678 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
679 	.arg3_type	= ARG_ANYTHING,
680 };
681 
682 BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
683 	   const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
684 {
685 	int ret;
686 
687 	/* flags is not used yet */
688 	if (unlikely(flags))
689 		return -EINVAL;
690 
691 	if (unlikely(!size))
692 		return 0;
693 
694 	ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
695 	if (ret == size)
696 		return 0;
697 
698 	memset(dst, 0, size);
699 	/* Return -EFAULT for partial read */
700 	return ret < 0 ? ret : -EFAULT;
701 }
702 
703 const struct bpf_func_proto bpf_copy_from_user_task_proto = {
704 	.func		= bpf_copy_from_user_task,
705 	.gpl_only	= true,
706 	.might_sleep	= true,
707 	.ret_type	= RET_INTEGER,
708 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
709 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
710 	.arg3_type	= ARG_ANYTHING,
711 	.arg4_type	= ARG_PTR_TO_BTF_ID,
712 	.arg4_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
713 	.arg5_type	= ARG_ANYTHING
714 };
715 
716 BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
717 {
718 	if (cpu >= nr_cpu_ids)
719 		return (unsigned long)NULL;
720 
721 	return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu);
722 }
723 
724 const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
725 	.func		= bpf_per_cpu_ptr,
726 	.gpl_only	= false,
727 	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
728 	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
729 	.arg2_type	= ARG_ANYTHING,
730 };
731 
732 BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
733 {
734 	return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr);
735 }
736 
737 const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
738 	.func		= bpf_this_cpu_ptr,
739 	.gpl_only	= false,
740 	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
741 	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
742 };
743 
744 static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
745 		size_t bufsz)
746 {
747 	void __user *user_ptr = (__force void __user *)unsafe_ptr;
748 
749 	buf[0] = 0;
750 
751 	switch (fmt_ptype) {
752 	case 's':
753 #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
754 		if ((unsigned long)unsafe_ptr < TASK_SIZE)
755 			return strncpy_from_user_nofault(buf, user_ptr, bufsz);
756 		fallthrough;
757 #endif
758 	case 'k':
759 		return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
760 	case 'u':
761 		return strncpy_from_user_nofault(buf, user_ptr, bufsz);
762 	}
763 
764 	return -EINVAL;
765 }
766 
767 /* Support executing three nested bprintf helper calls on a given CPU */
768 #define MAX_BPRINTF_NEST_LEVEL	3
769 
770 static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
771 static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
772 
773 int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
774 {
775 	int nest_level;
776 
777 	preempt_disable();
778 	nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
779 	if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
780 		this_cpu_dec(bpf_bprintf_nest_level);
781 		preempt_enable();
782 		return -EBUSY;
783 	}
784 	*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
785 
786 	return 0;
787 }
788 
789 void bpf_put_buffers(void)
790 {
791 	if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
792 		return;
793 	this_cpu_dec(bpf_bprintf_nest_level);
794 	preempt_enable();
795 }
796 
797 void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
798 {
799 	if (!data->bin_args && !data->buf)
800 		return;
801 	bpf_put_buffers();
802 }
803 
804 /*
805  * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
806  *
807  * Returns a negative value if fmt is an invalid format string or 0 otherwise.
808  *
809  * This can be used in two ways:
810  * - Format string verification only: when data->get_bin_args is false
811  * - Arguments preparation: in addition to the above verification, it writes in
812  *   data->bin_args a binary representation of arguments usable by bstr_printf
813  *   where pointers from BPF have been sanitized.
814  *
815  * In argument preparation mode, if 0 is returned, safe temporary buffers are
816  * allocated and bpf_bprintf_cleanup should be called to free them after use.
817  */
818 int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
819 			u32 num_args, struct bpf_bprintf_data *data)
820 {
821 	bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
822 	char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
823 	struct bpf_bprintf_buffers *buffers = NULL;
824 	size_t sizeof_cur_arg, sizeof_cur_ip;
825 	int err, i, num_spec = 0;
826 	u64 cur_arg;
827 	char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";
828 
829 	fmt_end = strnchr(fmt, fmt_size, 0);
830 	if (!fmt_end)
831 		return -EINVAL;
832 	fmt_size = fmt_end - fmt;
833 
834 	if (get_buffers && bpf_try_get_buffers(&buffers))
835 		return -EBUSY;
836 
837 	if (data->get_bin_args) {
838 		if (num_args)
839 			tmp_buf = buffers->bin_args;
840 		tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
841 		data->bin_args = (u32 *)tmp_buf;
842 	}
843 
844 	if (data->get_buf)
845 		data->buf = buffers->buf;
846 
847 	for (i = 0; i < fmt_size; i++) {
848 		unsigned char c = fmt[i];
849 
850 		/*
851 		 * Permit bytes >= 0x80 in plain text so UTF-8 literals can pass
852 		 * through unchanged, while still rejecting ASCII control bytes.
853 		 */
854 		if (isascii(c) && !isprint(c) && !isspace(c)) {
855 			err = -EINVAL;
856 			goto out;
857 		}
858 
859 		if (fmt[i] != '%')
860 			continue;
861 
862 		if (fmt[i + 1] == '%') {
863 			i++;
864 			continue;
865 		}
866 
867 		if (num_spec >= num_args) {
868 			err = -EINVAL;
869 			goto out;
870 		}
871 
872 		/* The string is zero-terminated so if fmt[i] != 0, we can
873 		 * always access fmt[i + 1], in the worst case it will be a 0
874 		 */
875 		i++;
876 		c = fmt[i];
877 		/*
878 		 * The format parser below only understands ASCII conversion
879 		 * specifiers and modifiers, so reject non-ASCII after '%'.
880 		 */
881 		if (!isascii(c)) {
882 			err = -EINVAL;
883 			goto out;
884 		}
885 
886 		/* skip optional "[0 +-][num]" width formatting field */
887 		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-' ||
888 		       fmt[i] == ' ')
889 			i++;
890 		if (fmt[i] >= '1' && fmt[i] <= '9') {
891 			i++;
892 			while (fmt[i] >= '0' && fmt[i] <= '9')
893 				i++;
894 		}
895 
896 		if (fmt[i] == 'p') {
897 			sizeof_cur_arg = sizeof(long);
898 
899 			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
900 			    ispunct(fmt[i + 1])) {
901 				if (tmp_buf)
902 					cur_arg = raw_args[num_spec];
903 				goto nocopy_fmt;
904 			}
905 
906 			if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
907 			    fmt[i + 2] == 's') {
908 				fmt_ptype = fmt[i + 1];
909 				i += 2;
910 				goto fmt_str;
911 			}
912 
913 			if (fmt[i + 1] == 'K' ||
914 			    fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
915 			    fmt[i + 1] == 'S') {
916 				if (tmp_buf)
917 					cur_arg = raw_args[num_spec];
918 				i++;
919 				goto nocopy_fmt;
920 			}
921 
922 			if (fmt[i + 1] == 'B') {
923 				if (tmp_buf)  {
924 					err = snprintf(tmp_buf,
925 						       (tmp_buf_end - tmp_buf),
926 						       "%pB",
927 						       (void *)(long)raw_args[num_spec]);
928 					tmp_buf += (err + 1);
929 				}
930 
931 				i++;
932 				num_spec++;
933 				continue;
934 			}
935 
936 			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
937 			if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
938 			    (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
939 				err = -EINVAL;
940 				goto out;
941 			}
942 
943 			i += 2;
944 			if (!tmp_buf)
945 				goto nocopy_fmt;
946 
947 			sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
948 			if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
949 				err = -ENOSPC;
950 				goto out;
951 			}
952 
953 			unsafe_ptr = (char *)(long)raw_args[num_spec];
954 			err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
955 						       sizeof_cur_ip);
956 			if (err < 0)
957 				memset(cur_ip, 0, sizeof_cur_ip);
958 
959 			/* hack: bstr_printf expects IP addresses to be
960 			 * pre-formatted as strings, ironically, the easiest way
961 			 * to do that is to call snprintf.
962 			 */
963 			ip_spec[2] = fmt[i - 1];
964 			ip_spec[3] = fmt[i];
965 			err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
966 				       ip_spec, &cur_ip);
967 
968 			tmp_buf += err + 1;
969 			num_spec++;
970 
971 			continue;
972 		} else if (fmt[i] == 's') {
973 			fmt_ptype = fmt[i];
974 fmt_str:
975 			if (fmt[i + 1] != 0 &&
976 			    !isspace(fmt[i + 1]) &&
977 			    !ispunct(fmt[i + 1])) {
978 				err = -EINVAL;
979 				goto out;
980 			}
981 
982 			if (!tmp_buf)
983 				goto nocopy_fmt;
984 
985 			if (tmp_buf_end == tmp_buf) {
986 				err = -ENOSPC;
987 				goto out;
988 			}
989 
990 			unsafe_ptr = (char *)(long)raw_args[num_spec];
991 			err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
992 						    fmt_ptype,
993 						    tmp_buf_end - tmp_buf);
994 			if (err < 0) {
995 				tmp_buf[0] = '\0';
996 				err = 1;
997 			}
998 
999 			tmp_buf += err;
1000 			num_spec++;
1001 
1002 			continue;
1003 		} else if (fmt[i] == 'c') {
1004 			if (!tmp_buf)
1005 				goto nocopy_fmt;
1006 
1007 			if (tmp_buf_end == tmp_buf) {
1008 				err = -ENOSPC;
1009 				goto out;
1010 			}
1011 
1012 			*tmp_buf = raw_args[num_spec];
1013 			tmp_buf++;
1014 			num_spec++;
1015 
1016 			continue;
1017 		}
1018 
1019 		sizeof_cur_arg = sizeof(int);
1020 
1021 		if (fmt[i] == 'l') {
1022 			sizeof_cur_arg = sizeof(long);
1023 			i++;
1024 		}
1025 		if (fmt[i] == 'l') {
1026 			sizeof_cur_arg = sizeof(long long);
1027 			i++;
1028 		}
1029 
1030 		if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
1031 		    fmt[i] != 'x' && fmt[i] != 'X') {
1032 			err = -EINVAL;
1033 			goto out;
1034 		}
1035 
1036 		if (tmp_buf)
1037 			cur_arg = raw_args[num_spec];
1038 nocopy_fmt:
1039 		if (tmp_buf) {
1040 			tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
1041 			if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
1042 				err = -ENOSPC;
1043 				goto out;
1044 			}
1045 
1046 			if (sizeof_cur_arg == 8) {
1047 				*(u32 *)tmp_buf = *(u32 *)&cur_arg;
1048 				*(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
1049 			} else {
1050 				*(u32 *)tmp_buf = (u32)(long)cur_arg;
1051 			}
1052 			tmp_buf += sizeof_cur_arg;
1053 		}
1054 		num_spec++;
1055 	}
1056 
1057 	err = 0;
1058 out:
1059 	if (err)
1060 		bpf_bprintf_cleanup(data);
1061 	return err;
1062 }
1063 
1064 BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
1065 	   const void *, args, u32, data_len)
1066 {
1067 	struct bpf_bprintf_data data = {
1068 		.get_bin_args	= true,
1069 	};
1070 	int err, num_args;
1071 
1072 	if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
1073 	    (data_len && !args))
1074 		return -EINVAL;
1075 	num_args = data_len / 8;
1076 
1077 	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
1078 	 * can safely give an unbounded size.
1079 	 */
1080 	err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
1081 	if (err < 0)
1082 		return err;
1083 
1084 	err = bstr_printf(str, str_size, fmt, data.bin_args);
1085 
1086 	bpf_bprintf_cleanup(&data);
1087 
1088 	return err + 1;
1089 }
1090 
1091 const struct bpf_func_proto bpf_snprintf_proto = {
1092 	.func		= bpf_snprintf,
1093 	.gpl_only	= true,
1094 	.ret_type	= RET_INTEGER,
1095 	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE,
1096 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
1097 	.arg3_type	= ARG_PTR_TO_CONST_STR,
1098 	.arg4_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
1099 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
1100 };
1101 
1102 static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
1103 {
1104 	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
1105 		struct bpf_array *array = container_of(map, struct bpf_array, map);
1106 
1107 		*arr_idx = ((char *)value - array->value) / array->elem_size;
1108 		return arr_idx;
1109 	}
1110 	return (void *)value - round_up(map->key_size, 8);
1111 }
1112 
/* Kind of async object; stored in bpf_async_cb.type. */
enum bpf_async_type {
	BPF_ASYNC_TYPE_TIMER = 0,
	BPF_ASYNC_TYPE_WQ,
};

/* Operation carried by a struct bpf_async_cmd. */
enum bpf_async_op {
	BPF_ASYNC_START,
	BPF_ASYNC_CANCEL,
};
1122 
1123 struct bpf_async_cmd {
1124 	struct llist_node node;
1125 	u64 nsec;
1126 	u32 mode;
1127 	enum bpf_async_op op;
1128 };
1129 
1130 struct bpf_async_cb {
1131 	struct bpf_map *map;
1132 	struct bpf_prog *prog;
1133 	void __rcu *callback_fn;
1134 	void *value;
1135 	struct rcu_head rcu;
1136 	u64 flags;
1137 	struct irq_work worker;
1138 	refcount_t refcnt;
1139 	enum bpf_async_type type;
1140 	struct llist_head async_cmds;
1141 };
1142 
/* BPF map elements can contain 'struct bpf_timer'.
 * Such a map owns all of its BPF timers.
 * 'struct bpf_timer' is allocated as part of map element allocation
 * and it's zero initialized.
 * That space is used to keep 'struct bpf_async_kern'.
 * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
 * remembers the 'struct bpf_map *' pointer of the map it's part of.
 * bpf_timer_set_callback() increments prog refcnt and assigns bpf callback_fn.
 * bpf_timer_start() arms the timer.
 * If user space reference to a map goes to zero at this point
 * ops->map_release_uref callback is responsible for cancelling the timers,
 * freeing their memory, and decrementing prog's refcnts.
 * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
 * Inner maps can contain bpf timers as well. ops->map_release_uref is
 * freeing the timers when inner map is replaced or deleted by user space.
 */
struct bpf_hrtimer {
	/* common state; kept first so a bpf_async_cb pointer can be cast to this */
	struct bpf_async_cb cb;
	struct hrtimer timer;
	/* incremented by bpf_timer_cancel() while a timer callback cancels this timer */
	atomic_t cancelling;
};
1164 
/* Workqueue flavour of an async object; bpf_wq_work() is its work function. */
struct bpf_work {
	/* common state; kept first so a bpf_async_cb pointer can be cast to this */
	struct bpf_async_cb cb;
	struct work_struct work;
};
1169 
/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
struct bpf_async_kern {
	/* A single pointer slot viewed under three types; NULL until initialized. */
	union {
		struct bpf_async_cb *cb;
		struct bpf_hrtimer *timer;
		struct bpf_work *work;
	};
} __attribute__((aligned(8)));
1178 
/*
 * Per-CPU pointer to the bpf_hrtimer whose callback is being run by
 * bpf_timer_cb() on this CPU, NULL otherwise. bpf_timer_cancel() reads it to
 * detect that it is being called from inside a timer callback.
 */
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);

/* Forward declaration; the definition follows the RCU free callbacks. */
static void bpf_async_refcount_put(struct bpf_async_cb *cb);
1182 
1183 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
1184 {
1185 	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
1186 	struct bpf_map *map = t->cb.map;
1187 	void *value = t->cb.value;
1188 	bpf_callback_t callback_fn;
1189 	void *key;
1190 	u32 idx;
1191 
1192 	BTF_TYPE_EMIT(struct bpf_timer);
1193 	callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
1194 	if (!callback_fn)
1195 		goto out;
1196 
1197 	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
1198 	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
1199 	 * Remember the timer this callback is servicing to prevent
1200 	 * deadlock if callback_fn() calls bpf_timer_cancel() or
1201 	 * bpf_map_delete_elem() on the same timer.
1202 	 */
1203 	this_cpu_write(hrtimer_running, t);
1204 
1205 	key = map_key_from_value(map, value, &idx);
1206 
1207 	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
1208 	/* The verifier checked that return value is zero. */
1209 
1210 	this_cpu_write(hrtimer_running, NULL);
1211 out:
1212 	return HRTIMER_NORESTART;
1213 }
1214 
1215 static void bpf_wq_work(struct work_struct *work)
1216 {
1217 	struct bpf_work *w = container_of(work, struct bpf_work, work);
1218 	struct bpf_async_cb *cb = &w->cb;
1219 	struct bpf_map *map = cb->map;
1220 	bpf_callback_t callback_fn;
1221 	void *value = cb->value;
1222 	void *key;
1223 	u32 idx;
1224 
1225 	BTF_TYPE_EMIT(struct bpf_wq);
1226 
1227 	callback_fn = READ_ONCE(cb->callback_fn);
1228 	if (!callback_fn)
1229 		return;
1230 
1231 	key = map_key_from_value(map, value, &idx);
1232 
1233         rcu_read_lock_trace();
1234         migrate_disable();
1235 
1236 	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
1237 
1238 	migrate_enable();
1239 	rcu_read_unlock_trace();
1240 }
1241 
1242 static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
1243 {
1244 	struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
1245 
1246 	/*
1247 	 * Drop the last reference to prog only after RCU GP, as set_callback()
1248 	 * may race with cancel_and_free()
1249 	 */
1250 	if (cb->prog)
1251 		bpf_prog_put(cb->prog);
1252 
1253 	kfree_nolock(cb);
1254 }
1255 
/*
 * Callback from call_rcu_tasks_trace(). Makes one more attempt to cancel the
 * timer/wq and then frees the object by calling bpf_async_cb_rcu_free()
 * directly. If the timer/wq callback is still running, it re-queues itself
 * for another RCU Tasks Trace grace period instead of freeing.
 */
static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu)
{
	struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
	/* Both views are computed up front; only the one matching cb->type is used. */
	struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
	struct bpf_work *w = container_of(cb, struct bpf_work, cb);
	bool retry = false;

	/*
	 * bpf_async_cancel_and_free() tried to cancel timer/wq, but it
	 * could have raced with timer/wq_start. Now refcnt is zero and
	 * srcu/rcu GP completed. Cancel timer/wq again.
	 */
	switch (cb->type) {
	case BPF_ASYNC_TYPE_TIMER:
		if (hrtimer_try_to_cancel(&t->timer) < 0)
			retry = true;
		break;
	case BPF_ASYNC_TYPE_WQ:
		if (!cancel_work(&w->work) && work_busy(&w->work))
			retry = true;
		break;
	}
	if (retry) {
		/*
		 * hrtimer or wq callback may still be running. It must be
		 * in rcu_tasks_trace or rcu CS, so wait for GP again.
		 * It won't retry forever, since refcnt zero prevents all
		 * operations on timer/wq.
		 */
		call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
		return;
	}

	/* RCU Tasks Trace grace period implies RCU grace period. */
	bpf_async_cb_rcu_free(rcu);
}
1293 
1294 static void worker_for_call_rcu(struct irq_work *work)
1295 {
1296 	struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
1297 
1298 	call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
1299 }
1300 
1301 static void bpf_async_refcount_put(struct bpf_async_cb *cb)
1302 {
1303 	if (!refcount_dec_and_test(&cb->refcnt))
1304 		return;
1305 
1306 	if (irqs_disabled()) {
1307 		cb->worker = IRQ_WORK_INIT(worker_for_call_rcu);
1308 		irq_work_queue(&cb->worker);
1309 	} else {
1310 		call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
1311 	}
1312 }
1313 
/* Forward declarations: both are used by __bpf_async_init() before their definitions. */
static void bpf_async_cancel_and_free(struct bpf_async_kern *async);
static void bpf_async_irq_worker(struct irq_work *work);
1316 
/*
 * Allocate the kernel-side object for a bpf_timer or bpf_wq and publish it
 * in @async.
 *
 * @async: slot inside the map value that will point to the new object
 * @map:   map that contains @async
 * @flags: stored in cb->flags; for timers the low bits select the clockid
 * @type:  BPF_ASYNC_TYPE_TIMER or BPF_ASYNC_TYPE_WQ
 *
 * Returns 0 on success, -EINVAL for an unknown @type, -EBUSY if @async is
 * already initialized (including when a concurrent initializer wins the
 * cmpxchg), -ENOMEM if the allocation fails, or -EPERM if the map's usercnt
 * is zero.
 */
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
			    enum bpf_async_type type)
{
	struct bpf_async_cb *cb, *old_cb;
	struct bpf_hrtimer *t;
	struct bpf_work *w;
	clockid_t clockid;
	size_t size;

	switch (type) {
	case BPF_ASYNC_TYPE_TIMER:
		size = sizeof(struct bpf_hrtimer);
		break;
	case BPF_ASYNC_TYPE_WQ:
		size = sizeof(struct bpf_work);
		break;
	default:
		return -EINVAL;
	}

	/* Cheap early exit; the cmpxchg below is the authoritative check. */
	old_cb = READ_ONCE(async->cb);
	if (old_cb)
		return -EBUSY;

	cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
	if (!cb)
		return -ENOMEM;

	switch (type) {
	case BPF_ASYNC_TYPE_TIMER:
		clockid = flags & (MAX_CLOCKS - 1);
		t = (struct bpf_hrtimer *)cb;

		atomic_set(&t->cancelling, 0);
		hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
		cb->value = (void *)async - map->record->timer_off;
		break;
	case BPF_ASYNC_TYPE_WQ:
		w = (struct bpf_work *)cb;

		INIT_WORK(&w->work, bpf_wq_work);
		cb->value = (void *)async - map->record->wq_off;
		break;
	}
	cb->map = map;
	cb->prog = NULL;
	cb->flags = flags;
	cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker);
	init_llist_head(&cb->async_cmds);
	refcount_set(&cb->refcnt, 1); /* map's reference */
	cb->type = type;
	rcu_assign_pointer(cb->callback_fn, NULL);

	/* Publish the fully initialized object; only one initializer can win. */
	old_cb = cmpxchg(&async->cb, NULL, cb);
	if (old_cb) {
		/* Lost the race to initialize this bpf_async_kern, drop the allocated object */
		kfree_nolock(cb);
		return -EBUSY;
	}
	/* Guarantee the order between async->cb and map->usercnt. So
	 * when there are concurrent uref release and bpf timer init, either
	 * bpf_timer_cancel_and_free() called by uref release reads a non-NULL
	 * timer or atomic64_read() below returns a zero usercnt.
	 */
	smp_mb();
	if (!atomic64_read(&map->usercnt)) {
		/* maps with timers must be either held by user space
		 * or pinned in bpffs.
		 */
		bpf_async_cancel_and_free(async);
		return -EPERM;
	}

	return 0;
}
1392 
1393 BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
1394 	   u64, flags)
1395 {
1396 	clock_t clockid = flags & (MAX_CLOCKS - 1);
1397 
1398 	BUILD_BUG_ON(MAX_CLOCKS != 16);
1399 	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
1400 	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
1401 
1402 	if (flags >= MAX_CLOCKS ||
1403 	    /* similar to timerfd except _ALARM variants are not supported */
1404 	    (clockid != CLOCK_MONOTONIC &&
1405 	     clockid != CLOCK_REALTIME &&
1406 	     clockid != CLOCK_BOOTTIME))
1407 		return -EINVAL;
1408 
1409 	return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
1410 }
1411 
/*
 * Helper prototype for bpf_timer_init(): GPL-only, returns an integer and
 * takes (timer pointer, constant map pointer, arbitrary flags value).
 */
static const struct bpf_func_proto bpf_timer_init_proto = {
	.func		= bpf_timer_init,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
1420 
/*
 * Install @prog/@callback_fn as the callback of @cb, or clear the callback
 * when both are NULL, releasing the reference held on the prog that was
 * installed before.
 *
 * Each loop iteration takes one prog reference for cb->prog and drops the
 * reference of the prog it displaced. The loop repeats until both cb->prog
 * and cb->callback_fn are observed to hold the values written by this caller.
 *
 * Returns 0 on success, or the error from bpf_prog_inc_not_zero() when the
 * initial guard reference cannot be taken.
 */
static int bpf_async_update_prog_callback(struct bpf_async_cb *cb,
					  struct bpf_prog *prog,
					  void *callback_fn)
{
	struct bpf_prog *prev;

	/* Acquire a guard reference on prog to prevent it from being freed during the loop */
	if (prog) {
		prog = bpf_prog_inc_not_zero(prog);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	do {
		/*
		 * Reference that will be owned by cb->prog. The result is not
		 * checked; the guard reference above is still held here.
		 */
		if (prog)
			prog = bpf_prog_inc_not_zero(prog);
		prev = xchg(&cb->prog, prog);
		rcu_assign_pointer(cb->callback_fn, callback_fn);

		/*
		 * Release previous prog, make sure that if other CPU is contending,
		 * to set bpf_prog, references are not leaked as each iteration acquires and
		 * releases one reference.
		 */
		if (prev)
			bpf_prog_put(prev);

	} while (READ_ONCE(cb->prog) != prog ||
		 (void __force *)READ_ONCE(cb->callback_fn) != callback_fn);

	/* Drop the guard reference taken before the loop. */
	if (prog)
		bpf_prog_put(prog);

	return 0;
}
1456 
/*
 * Per-CPU pointer to the bpf_async_cb whose command queue is being processed
 * by bpf_async_irq_worker() on this CPU, NULL otherwise.
 */
static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running);
1458 
1459 static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op,
1460 				 u64 nsec, u32 timer_mode)
1461 {
1462 	/*
1463 	 * Do not schedule another operation on this cpu if it's in irq_work
1464 	 * callback that is processing async_cmds queue. Otherwise the following
1465 	 * loop is possible:
1466 	 * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue().
1467 	 * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start().
1468 	 */
1469 	if (this_cpu_read(async_cb_running) == cb) {
1470 		bpf_async_refcount_put(cb);
1471 		return -EDEADLK;
1472 	}
1473 
1474 	struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE);
1475 
1476 	if (!cmd) {
1477 		bpf_async_refcount_put(cb);
1478 		return -ENOMEM;
1479 	}
1480 	init_llist_node(&cmd->node);
1481 	cmd->nsec = nsec;
1482 	cmd->mode = timer_mode;
1483 	cmd->op = op;
1484 	if (llist_add(&cmd->node, &cb->async_cmds))
1485 		irq_work_queue(&cb->worker);
1486 	return 0;
1487 }
1488 
1489 static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
1490 				    struct bpf_prog *prog)
1491 {
1492 	struct bpf_async_cb *cb;
1493 
1494 	cb = READ_ONCE(async->cb);
1495 	if (!cb)
1496 		return -EINVAL;
1497 
1498 	return bpf_async_update_prog_callback(cb, prog, callback_fn);
1499 }
1500 
1501 BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
1502 	   struct bpf_prog_aux *, aux)
1503 {
1504 	return __bpf_async_set_callback(timer, callback_fn, aux->prog);
1505 }
1506 
/*
 * Helper prototype for bpf_timer_set_callback(): GPL-only, returns an integer
 * and takes (timer pointer, function pointer).
 * NOTE(review): the helper's third parameter (aux) has no arg type listed
 * here; presumably it is supplied by the verifier rather than the program -
 * confirm.
 */
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
	.func		= bpf_timer_set_callback,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_PTR_TO_FUNC,
};
1514 
1515 static bool defer_timer_wq_op(void)
1516 {
1517 	return in_hardirq() || irqs_disabled();
1518 }
1519 
1520 BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags)
1521 {
1522 	struct bpf_hrtimer *t;
1523 	u32 mode;
1524 
1525 	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
1526 		return -EINVAL;
1527 
1528 	t = READ_ONCE(async->timer);
1529 	if (!t || !READ_ONCE(t->cb.prog))
1530 		return -EINVAL;
1531 
1532 	if (flags & BPF_F_TIMER_ABS)
1533 		mode = HRTIMER_MODE_ABS_SOFT;
1534 	else
1535 		mode = HRTIMER_MODE_REL_SOFT;
1536 
1537 	if (flags & BPF_F_TIMER_CPU_PIN)
1538 		mode |= HRTIMER_MODE_PINNED;
1539 
1540 	/*
1541 	 * bpf_async_cancel_and_free() could have dropped refcnt to zero. In
1542 	 * such case BPF progs are not allowed to arm the timer to prevent UAF.
1543 	 */
1544 	if (!refcount_inc_not_zero(&t->cb.refcnt))
1545 		return -ENOENT;
1546 
1547 	if (!defer_timer_wq_op()) {
1548 		hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
1549 		bpf_async_refcount_put(&t->cb);
1550 		return 0;
1551 	} else {
1552 		return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode);
1553 	}
1554 }
1555 
/*
 * Helper prototype for bpf_timer_start(): GPL-only, returns an integer and
 * takes (timer pointer, arbitrary nsecs value, arbitrary flags value).
 */
static const struct bpf_func_proto bpf_timer_start_proto = {
	.func		= bpf_timer_start,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};
1564 
/*
 * bpf_timer_cancel() helper: clear the callback of the timer in @async and
 * cancel it, waiting for a running callback to finish.
 *
 * Returns -EOPNOTSUPP in hard interrupt context or with interrupts disabled,
 * -EINVAL if the timer is not initialized, -EDEADLK if a timer callback tries
 * to cancel its own timer or a timer whose cancellation could wait on it, and
 * otherwise the return value of hrtimer_cancel().
 */
BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async)
{
	struct bpf_hrtimer *t, *cur_t;
	bool inc = false;
	int ret = 0;

	if (defer_timer_wq_op())
		return -EOPNOTSUPP;

	t = READ_ONCE(async->timer);
	if (!t)
		return -EINVAL;

	/* Non-NULL only when called from within a timer callback on this CPU. */
	cur_t = this_cpu_read(hrtimer_running);
	if (cur_t == t) {
		/* If bpf callback_fn is trying to bpf_timer_cancel()
		 * its own timer the hrtimer_cancel() will deadlock
		 * since it waits for callback_fn to finish.
		 */
		return -EDEADLK;
	}

	/* Only account in-flight cancellations when invoked from a timer
	 * callback, since we want to avoid waiting only if other _callbacks_
	 * are waiting on us, to avoid introducing lockups. Non-callback paths
	 * are ok, since nobody would synchronously wait for their completion.
	 */
	if (!cur_t)
		goto drop;
	atomic_inc(&t->cancelling);
	/* Need full barrier after relaxed atomic_inc */
	smp_mb__after_atomic();
	inc = true;
	if (atomic_read(&cur_t->cancelling)) {
		/* We're cancelling timer t, while some other timer callback is
		 * attempting to cancel us. In such a case, it might be possible
		 * that timer t belongs to the other callback, or some other
		 * callback waiting upon it (creating transitive dependencies
		 * upon us), and we will enter a deadlock if we continue
		 * cancelling and waiting for it synchronously, since it might
		 * do the same. Bail!
		 */
		atomic_dec(&t->cancelling);
		return -EDEADLK;
	}
drop:
	/* Clear prog and callback_fn; this also drops the timer's prog reference. */
	bpf_async_update_prog_callback(&t->cb, NULL, NULL);
	/* Cancel the timer and wait for associated callback to finish
	 * if it was running.
	 */
	ret = hrtimer_cancel(&t->timer);
	if (inc)
		atomic_dec(&t->cancelling);
	return ret;
}
1620 
/*
 * Helper prototype for bpf_timer_cancel(): GPL-only, returns an integer and
 * takes a single timer pointer.
 */
static const struct bpf_func_proto bpf_timer_cancel_proto = {
	.func		= bpf_timer_cancel,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
};
1627 
1628 static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op,
1629 				 u64 timer_nsec, u32 timer_mode)
1630 {
1631 	switch (cb->type) {
1632 	case BPF_ASYNC_TYPE_TIMER: {
1633 		struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
1634 
1635 		switch (op) {
1636 		case BPF_ASYNC_START:
1637 			hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode);
1638 			break;
1639 		case BPF_ASYNC_CANCEL:
1640 			hrtimer_try_to_cancel(&t->timer);
1641 			break;
1642 		}
1643 		break;
1644 	}
1645 	case BPF_ASYNC_TYPE_WQ: {
1646 		struct bpf_work *w = container_of(cb, struct bpf_work, cb);
1647 
1648 		switch (op) {
1649 		case BPF_ASYNC_START:
1650 			schedule_work(&w->work);
1651 			break;
1652 		case BPF_ASYNC_CANCEL:
1653 			cancel_work(&w->work);
1654 			break;
1655 		}
1656 		break;
1657 	}
1658 	}
1659 	bpf_async_refcount_put(cb);
1660 }
1661 
1662 static void bpf_async_irq_worker(struct irq_work *work)
1663 {
1664 	struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
1665 	struct llist_node *pos, *n, *list;
1666 
1667 	list = llist_del_all(&cb->async_cmds);
1668 	if (!list)
1669 		return;
1670 
1671 	list = llist_reverse_order(list);
1672 	this_cpu_write(async_cb_running, cb);
1673 	llist_for_each_safe(pos, n, list) {
1674 		struct bpf_async_cmd *cmd;
1675 
1676 		cmd = container_of(pos, struct bpf_async_cmd, node);
1677 		bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode);
1678 		kfree_nolock(cmd);
1679 	}
1680 	this_cpu_write(async_cb_running, NULL);
1681 }
1682 
1683 static void bpf_async_cancel_and_free(struct bpf_async_kern *async)
1684 {
1685 	struct bpf_async_cb *cb;
1686 
1687 	if (!READ_ONCE(async->cb))
1688 		return;
1689 
1690 	cb = xchg(&async->cb, NULL);
1691 	if (!cb)
1692 		return;
1693 
1694 	bpf_async_update_prog_callback(cb, NULL, NULL);
1695 	/*
1696 	 * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last
1697 	 * refcnt. Either synchronously or asynchronously in irq_work.
1698 	 */
1699 
1700 	if (!defer_timer_wq_op()) {
1701 		bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0);
1702 	} else {
1703 		(void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
1704 		/*
1705 		 * bpf_async_schedule_op() either enqueues allocated cmd into llist
1706 		 * or fails with ENOMEM and drop the last refcnt.
1707 		 * This is unlikely, but safe, since bpf_async_cb_rcu_tasks_trace_free()
1708 		 * callback will do additional timer/wq_cancel due to races anyway.
1709 		 */
1710 	}
1711 }
1712 
/*
 * Tear down the bpf_timer stored at @val. Invoked by map_delete/update_elem
 * for a single element and by ops->map_release_uref once the user space
 * reference to a map reaches zero.
 */
void bpf_timer_cancel_and_free(void *val)
{
	struct bpf_async_kern *async = val;

	bpf_async_cancel_and_free(async);
}
1721 
/*
 * Tear down the bpf_wq stored at @val. Invoked by map_delete/update_elem
 * for a single element and by ops->map_release_uref once the user space
 * reference to a map reaches zero.
 */
void bpf_wq_cancel_and_free(void *val)
{
	struct bpf_async_kern *async = val;

	bpf_async_cancel_and_free(async);
}
1730 
1731 BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
1732 {
1733 	unsigned long *kptr = dst;
1734 
1735 	/* This helper may be inlined by verifier. */
1736 	return xchg(kptr, (unsigned long)ptr);
1737 }
1738 
/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
 * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
 * denote type that verifier will determine.
 *
 * The helper is not GPL-only; its second argument carries OBJ_RELEASE and
 * may be NULL, and the return value may be NULL as well.
 */
static const struct bpf_func_proto bpf_kptr_xchg_proto = {
	.func         = bpf_kptr_xchg,
	.gpl_only     = false,
	.ret_type     = RET_PTR_TO_BTF_ID_OR_NULL,
	.ret_btf_id   = BPF_PTR_POISON,
	.arg1_type    = ARG_KPTR_XCHG_DEST,
	.arg2_type    = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
	.arg2_btf_id  = BPF_PTR_POISON,
};
1752 
/*
 * Backing object of a BPF_DYNPTR_TYPE_FILE dynptr; for that type
 * bpf_dynptr_kern::data points to one of these.
 */
struct bpf_dynptr_file_impl {
	/* reader used by bpf_file_fetch_bytes() to obtain file contents */
	struct freader freader;
	/* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */
	u64 offset;
	u64 size;
};
1759 
/* Since the upper 8 bits of dynptr->size are reserved, the
 * maximum supported size is 2^24 - 1.
 *
 * Layout of bpf_dynptr_kern::size as used by the accessors below:
 *   bits  0-23: size in bytes (DYNPTR_SIZE_MASK)
 *   bits 28-30: dynptr type (shifted by DYNPTR_TYPE_SHIFT)
 *   bit     31: read-only flag (DYNPTR_RDONLY_BIT)
 */
#define DYNPTR_MAX_SIZE	((1UL << 24) - 1)
#define DYNPTR_TYPE_SHIFT	28
#define DYNPTR_SIZE_MASK	0xFFFFFF
#define DYNPTR_RDONLY_BIT	BIT(31)
1767 
1768 bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
1769 {
1770 	return ptr->size & DYNPTR_RDONLY_BIT;
1771 }
1772 
1773 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
1774 {
1775 	ptr->size |= DYNPTR_RDONLY_BIT;
1776 }
1777 
1778 static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
1779 {
1780 	ptr->size |= type << DYNPTR_TYPE_SHIFT;
1781 }
1782 
1783 static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
1784 {
1785 	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
1786 }
1787 
1788 u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
1789 {
1790 	if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
1791 		struct bpf_dynptr_file_impl *df = ptr->data;
1792 
1793 		return df->size;
1794 	}
1795 
1796 	return ptr->size & DYNPTR_SIZE_MASK;
1797 }
1798 
1799 static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off)
1800 {
1801 	if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
1802 		struct bpf_dynptr_file_impl *df = ptr->data;
1803 
1804 		df->offset += off;
1805 		return;
1806 	}
1807 	ptr->offset += off;
1808 }
1809 
1810 static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size)
1811 {
1812 	u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
1813 
1814 	if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
1815 		struct bpf_dynptr_file_impl *df = ptr->data;
1816 
1817 		df->size = new_size;
1818 		return;
1819 	}
1820 	ptr->size = (u32)new_size | metadata;
1821 }
1822 
1823 int bpf_dynptr_check_size(u64 size)
1824 {
1825 	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
1826 }
1827 
1828 static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len)
1829 {
1830 	const void *ptr;
1831 
1832 	if (!buf)
1833 		return -EINVAL;
1834 
1835 	df->freader.buf = buf;
1836 	df->freader.buf_sz = len;
1837 	ptr = freader_fetch(&df->freader, offset + df->offset, len);
1838 	if (!ptr)
1839 		return df->freader.err;
1840 
1841 	if (ptr != buf) /* Force copying into the buffer */
1842 		memcpy(buf, ptr, len);
1843 
1844 	return 0;
1845 }
1846 
1847 void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
1848 		     enum bpf_dynptr_type type, u32 offset, u32 size)
1849 {
1850 	ptr->data = data;
1851 	ptr->offset = offset;
1852 	ptr->size = size;
1853 	bpf_dynptr_set_type(ptr, type);
1854 }
1855 
1856 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
1857 {
1858 	memset(ptr, 0, sizeof(*ptr));
1859 }
1860 
1861 BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr)
1862 {
1863 	int err;
1864 
1865 	BTF_TYPE_EMIT(struct bpf_dynptr);
1866 
1867 	err = bpf_dynptr_check_size(size);
1868 	if (err)
1869 		goto error;
1870 
1871 	/* flags is currently unsupported */
1872 	if (flags) {
1873 		err = -EINVAL;
1874 		goto error;
1875 	}
1876 
1877 	bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
1878 
1879 	return 0;
1880 
1881 error:
1882 	bpf_dynptr_set_null(ptr);
1883 	return err;
1884 }
1885 
/*
 * Helper prototype for bpf_dynptr_from_mem(): not GPL-only, returns an
 * integer and takes (memory pointer, its size, arbitrary flags value,
 * uninitialized local dynptr to be written).
 */
static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
	.func		= bpf_dynptr_from_mem,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
1895 
1896 static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src,
1897 			     u64 offset, u64 flags)
1898 {
1899 	enum bpf_dynptr_type type;
1900 	int err;
1901 
1902 	if (!src->data || flags)
1903 		return -EINVAL;
1904 
1905 	err = bpf_dynptr_check_off_len(src, offset, len);
1906 	if (err)
1907 		return err;
1908 
1909 	type = bpf_dynptr_get_type(src);
1910 
1911 	switch (type) {
1912 	case BPF_DYNPTR_TYPE_LOCAL:
1913 	case BPF_DYNPTR_TYPE_RINGBUF:
1914 		/* Source and destination may possibly overlap, hence use memmove to
1915 		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
1916 		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
1917 		 */
1918 		memmove(dst, src->data + src->offset + offset, len);
1919 		return 0;
1920 	case BPF_DYNPTR_TYPE_SKB:
1921 		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
1922 	case BPF_DYNPTR_TYPE_XDP:
1923 		return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
1924 	case BPF_DYNPTR_TYPE_SKB_META:
1925 		memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
1926 		return 0;
1927 	case BPF_DYNPTR_TYPE_FILE:
1928 		return bpf_file_fetch_bytes(src->data, offset, dst, len);
1929 	default:
1930 		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
1931 		return -EFAULT;
1932 	}
1933 }
1934 
1935 BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src,
1936 	   u64, offset, u64, flags)
1937 {
1938 	return __bpf_dynptr_read(dst, len, src, offset, flags);
1939 }
1940 
/*
 * Helper prototype for bpf_dynptr_read(): not GPL-only, returns an integer
 * and takes (destination buffer, its size, source dynptr, arbitrary offset
 * value, arbitrary flags value).
 */
static const struct bpf_func_proto bpf_dynptr_read_proto = {
	.func		= bpf_dynptr_read,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_DYNPTR,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};
1951 
1952 int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src,
1953 		       u64 len, u64 flags)
1954 {
1955 	enum bpf_dynptr_type type;
1956 	int err;
1957 
1958 	if (!dst->data || __bpf_dynptr_is_rdonly(dst))
1959 		return -EINVAL;
1960 
1961 	err = bpf_dynptr_check_off_len(dst, offset, len);
1962 	if (err)
1963 		return err;
1964 
1965 	type = bpf_dynptr_get_type(dst);
1966 
1967 	switch (type) {
1968 	case BPF_DYNPTR_TYPE_LOCAL:
1969 	case BPF_DYNPTR_TYPE_RINGBUF:
1970 		if (flags)
1971 			return -EINVAL;
1972 		/* Source and destination may possibly overlap, hence use memmove to
1973 		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
1974 		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
1975 		 */
1976 		memmove(dst->data + dst->offset + offset, src, len);
1977 		return 0;
1978 	case BPF_DYNPTR_TYPE_SKB:
1979 		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
1980 					     flags);
1981 	case BPF_DYNPTR_TYPE_XDP:
1982 		if (flags)
1983 			return -EINVAL;
1984 		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
1985 	case BPF_DYNPTR_TYPE_SKB_META:
1986 		return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src,
1987 						  len, flags);
1988 	default:
1989 		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
1990 		return -EFAULT;
1991 	}
1992 }
1993 
1994 BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src,
1995 	   u64, len, u64, flags)
1996 {
1997 	return __bpf_dynptr_write(dst, offset, src, len, flags);
1998 }
1999 
/*
 * Helper prototype for bpf_dynptr_write(): not GPL-only, returns an integer
 * and takes (destination dynptr, arbitrary offset value, read-only source
 * buffer, its size, arbitrary flags value).
 */
static const struct bpf_func_proto bpf_dynptr_write_proto = {
	.func		= bpf_dynptr_write,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_DYNPTR,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg5_type	= ARG_ANYTHING,
};
2010 
2011 BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len)
2012 {
2013 	enum bpf_dynptr_type type;
2014 	int err;
2015 
2016 	if (!ptr->data)
2017 		return 0;
2018 
2019 	err = bpf_dynptr_check_off_len(ptr, offset, len);
2020 	if (err)
2021 		return 0;
2022 
2023 	if (__bpf_dynptr_is_rdonly(ptr))
2024 		return 0;
2025 
2026 	type = bpf_dynptr_get_type(ptr);
2027 
2028 	switch (type) {
2029 	case BPF_DYNPTR_TYPE_LOCAL:
2030 	case BPF_DYNPTR_TYPE_RINGBUF:
2031 		return (unsigned long)(ptr->data + ptr->offset + offset);
2032 	case BPF_DYNPTR_TYPE_SKB:
2033 	case BPF_DYNPTR_TYPE_XDP:
2034 	case BPF_DYNPTR_TYPE_SKB_META:
2035 		/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
2036 		return 0;
2037 	default:
2038 		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
2039 		return 0;
2040 	}
2041 }
2042 
/*
 * Helper prototype for bpf_dynptr_data(): not GPL-only, returns a pointer to
 * dynptr memory or NULL and takes (dynptr, arbitrary offset value, constant
 * allocation size that may be zero).
 */
static const struct bpf_func_proto bpf_dynptr_data_proto = {
	.func		= bpf_dynptr_data,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_DYNPTR_MEM_OR_NULL,
	.arg1_type	= ARG_PTR_TO_DYNPTR,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
};
2051 
/*
 * Weak, uninitialized definitions of helper prototypes that this file
 * declares without an initializer.
 * NOTE(review): presumably each is overridden by a strong definition
 * elsewhere when the providing code is built in - confirm.
 */
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
const struct bpf_func_proto bpf_perf_event_read_proto __weak;
const struct bpf_func_proto bpf_send_signal_proto __weak;
const struct bpf_func_proto bpf_send_signal_thread_proto __weak;
const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak;
const struct bpf_func_proto bpf_get_task_stack_proto __weak;
const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak;
2065 
2066 const struct bpf_func_proto *
2067 bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2068 {
2069 	switch (func_id) {
2070 	case BPF_FUNC_map_lookup_elem:
2071 		return &bpf_map_lookup_elem_proto;
2072 	case BPF_FUNC_map_update_elem:
2073 		return &bpf_map_update_elem_proto;
2074 	case BPF_FUNC_map_delete_elem:
2075 		return &bpf_map_delete_elem_proto;
2076 	case BPF_FUNC_map_push_elem:
2077 		return &bpf_map_push_elem_proto;
2078 	case BPF_FUNC_map_pop_elem:
2079 		return &bpf_map_pop_elem_proto;
2080 	case BPF_FUNC_map_peek_elem:
2081 		return &bpf_map_peek_elem_proto;
2082 	case BPF_FUNC_map_lookup_percpu_elem:
2083 		return &bpf_map_lookup_percpu_elem_proto;
2084 	case BPF_FUNC_get_prandom_u32:
2085 		return &bpf_get_prandom_u32_proto;
2086 	case BPF_FUNC_get_smp_processor_id:
2087 		return &bpf_get_raw_smp_processor_id_proto;
2088 	case BPF_FUNC_get_numa_node_id:
2089 		return &bpf_get_numa_node_id_proto;
2090 	case BPF_FUNC_tail_call:
2091 		return &bpf_tail_call_proto;
2092 	case BPF_FUNC_ktime_get_ns:
2093 		return &bpf_ktime_get_ns_proto;
2094 	case BPF_FUNC_ktime_get_boot_ns:
2095 		return &bpf_ktime_get_boot_ns_proto;
2096 	case BPF_FUNC_ktime_get_tai_ns:
2097 		return &bpf_ktime_get_tai_ns_proto;
2098 	case BPF_FUNC_ringbuf_output:
2099 		return &bpf_ringbuf_output_proto;
2100 	case BPF_FUNC_ringbuf_reserve:
2101 		return &bpf_ringbuf_reserve_proto;
2102 	case BPF_FUNC_ringbuf_submit:
2103 		return &bpf_ringbuf_submit_proto;
2104 	case BPF_FUNC_ringbuf_discard:
2105 		return &bpf_ringbuf_discard_proto;
2106 	case BPF_FUNC_ringbuf_query:
2107 		return &bpf_ringbuf_query_proto;
2108 	case BPF_FUNC_strncmp:
2109 		return &bpf_strncmp_proto;
2110 	case BPF_FUNC_strtol:
2111 		return &bpf_strtol_proto;
2112 	case BPF_FUNC_strtoul:
2113 		return &bpf_strtoul_proto;
2114 	case BPF_FUNC_get_current_pid_tgid:
2115 		return &bpf_get_current_pid_tgid_proto;
2116 	case BPF_FUNC_get_ns_current_pid_tgid:
2117 		return &bpf_get_ns_current_pid_tgid_proto;
2118 	case BPF_FUNC_get_current_uid_gid:
2119 		return &bpf_get_current_uid_gid_proto;
2120 	default:
2121 		break;
2122 	}
2123 
2124 	if (!bpf_token_capable(prog->aux->token, CAP_BPF))
2125 		return NULL;
2126 
2127 	switch (func_id) {
2128 	case BPF_FUNC_spin_lock:
2129 		return &bpf_spin_lock_proto;
2130 	case BPF_FUNC_spin_unlock:
2131 		return &bpf_spin_unlock_proto;
2132 	case BPF_FUNC_jiffies64:
2133 		return &bpf_jiffies64_proto;
2134 	case BPF_FUNC_per_cpu_ptr:
2135 		return &bpf_per_cpu_ptr_proto;
2136 	case BPF_FUNC_this_cpu_ptr:
2137 		return &bpf_this_cpu_ptr_proto;
2138 	case BPF_FUNC_timer_init:
2139 		return &bpf_timer_init_proto;
2140 	case BPF_FUNC_timer_set_callback:
2141 		return &bpf_timer_set_callback_proto;
2142 	case BPF_FUNC_timer_start:
2143 		return &bpf_timer_start_proto;
2144 	case BPF_FUNC_timer_cancel:
2145 		return &bpf_timer_cancel_proto;
2146 	case BPF_FUNC_kptr_xchg:
2147 		return &bpf_kptr_xchg_proto;
2148 	case BPF_FUNC_for_each_map_elem:
2149 		return &bpf_for_each_map_elem_proto;
2150 	case BPF_FUNC_loop:
2151 		return &bpf_loop_proto;
2152 	case BPF_FUNC_user_ringbuf_drain:
2153 		return &bpf_user_ringbuf_drain_proto;
2154 	case BPF_FUNC_ringbuf_reserve_dynptr:
2155 		return &bpf_ringbuf_reserve_dynptr_proto;
2156 	case BPF_FUNC_ringbuf_submit_dynptr:
2157 		return &bpf_ringbuf_submit_dynptr_proto;
2158 	case BPF_FUNC_ringbuf_discard_dynptr:
2159 		return &bpf_ringbuf_discard_dynptr_proto;
2160 	case BPF_FUNC_dynptr_from_mem:
2161 		return &bpf_dynptr_from_mem_proto;
2162 	case BPF_FUNC_dynptr_read:
2163 		return &bpf_dynptr_read_proto;
2164 	case BPF_FUNC_dynptr_write:
2165 		return &bpf_dynptr_write_proto;
2166 	case BPF_FUNC_dynptr_data:
2167 		return &bpf_dynptr_data_proto;
2168 #ifdef CONFIG_CGROUPS
2169 	case BPF_FUNC_cgrp_storage_get:
2170 		return &bpf_cgrp_storage_get_proto;
2171 	case BPF_FUNC_cgrp_storage_delete:
2172 		return &bpf_cgrp_storage_delete_proto;
2173 	case BPF_FUNC_get_current_cgroup_id:
2174 		return &bpf_get_current_cgroup_id_proto;
2175 	case BPF_FUNC_get_current_ancestor_cgroup_id:
2176 		return &bpf_get_current_ancestor_cgroup_id_proto;
2177 	case BPF_FUNC_current_task_under_cgroup:
2178 		return &bpf_current_task_under_cgroup_proto;
2179 #endif
2180 #ifdef CONFIG_CGROUP_NET_CLASSID
2181 	case BPF_FUNC_get_cgroup_classid:
2182 		return &bpf_get_cgroup_classid_curr_proto;
2183 #endif
2184 	case BPF_FUNC_task_storage_get:
2185 		return &bpf_task_storage_get_proto;
2186 	case BPF_FUNC_task_storage_delete:
2187 		return &bpf_task_storage_delete_proto;
2188 	default:
2189 		break;
2190 	}
2191 
2192 	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
2193 		return NULL;
2194 
2195 	switch (func_id) {
2196 	case BPF_FUNC_trace_printk:
2197 		return bpf_get_trace_printk_proto();
2198 	case BPF_FUNC_get_current_task:
2199 		return &bpf_get_current_task_proto;
2200 	case BPF_FUNC_get_current_task_btf:
2201 		return &bpf_get_current_task_btf_proto;
2202 	case BPF_FUNC_get_current_comm:
2203 		return &bpf_get_current_comm_proto;
2204 	case BPF_FUNC_probe_read_user:
2205 		return &bpf_probe_read_user_proto;
2206 	case BPF_FUNC_probe_read_kernel:
2207 		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
2208 		       NULL : &bpf_probe_read_kernel_proto;
2209 	case BPF_FUNC_probe_read_user_str:
2210 		return &bpf_probe_read_user_str_proto;
2211 	case BPF_FUNC_probe_read_kernel_str:
2212 		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
2213 		       NULL : &bpf_probe_read_kernel_str_proto;
2214 	case BPF_FUNC_copy_from_user:
2215 		return &bpf_copy_from_user_proto;
2216 	case BPF_FUNC_copy_from_user_task:
2217 		return &bpf_copy_from_user_task_proto;
2218 	case BPF_FUNC_snprintf_btf:
2219 		return &bpf_snprintf_btf_proto;
2220 	case BPF_FUNC_snprintf:
2221 		return &bpf_snprintf_proto;
2222 	case BPF_FUNC_task_pt_regs:
2223 		return &bpf_task_pt_regs_proto;
2224 	case BPF_FUNC_trace_vprintk:
2225 		return bpf_get_trace_vprintk_proto();
2226 	case BPF_FUNC_perf_event_read_value:
2227 		return bpf_get_perf_event_read_value_proto();
2228 	case BPF_FUNC_perf_event_read:
2229 		return &bpf_perf_event_read_proto;
2230 	case BPF_FUNC_send_signal:
2231 		return &bpf_send_signal_proto;
2232 	case BPF_FUNC_send_signal_thread:
2233 		return &bpf_send_signal_thread_proto;
2234 	case BPF_FUNC_get_task_stack:
2235 		return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
2236 				       : &bpf_get_task_stack_proto;
2237 	case BPF_FUNC_get_branch_snapshot:
2238 		return &bpf_get_branch_snapshot_proto;
2239 	case BPF_FUNC_find_vma:
2240 		return &bpf_find_vma_proto;
2241 	default:
2242 		return NULL;
2243 	}
2244 }
2245 EXPORT_SYMBOL_GPL(bpf_base_func_proto);
2246 
2247 void bpf_list_head_free(const struct btf_field *field, void *list_head,
2248 			struct bpf_spin_lock *spin_lock)
2249 {
2250 	struct list_head *head = list_head, drain, *pos, *n;
2251 
2252 	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
2253 	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
2254 	INIT_LIST_HEAD(&drain);
2255 
2256 	/* Do the actual list draining outside the lock to not hold the lock for
2257 	 * too long, and also prevent deadlocks if tracing programs end up
2258 	 * executing on entry/exit of functions called inside the critical
2259 	 * section, and end up doing map ops that call bpf_list_head_free for
2260 	 * the same map value again.
2261 	 */
2262 	__bpf_spin_lock_irqsave(spin_lock);
2263 	if (!head->next || list_empty(head))
2264 		goto unlock;
2265 	list_for_each_safe(pos, n, head) {
2266 		struct bpf_list_node_kern *node;
2267 
2268 		node = container_of(pos, struct bpf_list_node_kern, list_head);
2269 		WRITE_ONCE(node->owner, BPF_PTR_POISON);
2270 		list_move_tail(pos, &drain);
2271 	}
2272 unlock:
2273 	INIT_LIST_HEAD(head);
2274 	__bpf_spin_unlock_irqrestore(spin_lock);
2275 
2276 	while (!list_empty(&drain)) {
2277 		struct bpf_list_node_kern *node;
2278 
2279 		pos = drain.next;
2280 		node = container_of(pos, struct bpf_list_node_kern, list_head);
2281 		list_del_init(pos);
2282 		/* Ensure __bpf_list_add() sees the node as unlinked. */
2283 		smp_store_release(&node->owner, NULL);
2284 		/* The contained type can also have resources, including a
2285 		 * bpf_list_head which needs to be freed.
2286 		 */
2287 		__bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset,
2288 				    field->graph_root.value_rec, false);
2289 	}
2290 }
2291 
/* Post-order walk over the rb_nodes below 'root' that tolerates the body
 * invalidating 'pos': the successor is stored in 'n' before the body runs.
 *
 * Works like rbtree_postorder_for_each_entry_safe, except that 'pos' and
 * 'n' are plain 'rb_node *', so neither the containing struct type nor the
 * name of its rb_node member is needed. Callers get from the node to the
 * containing object via graph_root.node_offset of the matching btf_field.
 */
#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
	for (pos = rb_first_postorder(root); \
	     pos && ((n = rb_next_postorder(pos)), 1); \
	     pos = n)
2304 
2305 void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
2306 		      struct bpf_spin_lock *spin_lock)
2307 {
2308 	struct rb_root_cached orig_root, *root = rb_root;
2309 	struct bpf_rb_node_kern *node;
2310 	struct rb_node *pos, *n;
2311 	void *obj;
2312 
2313 	BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
2314 	BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
2315 
2316 	__bpf_spin_lock_irqsave(spin_lock);
2317 	orig_root = *root;
2318 	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
2319 		node = rb_entry(pos, struct bpf_rb_node_kern, rb_node);
2320 		WRITE_ONCE(node->owner, BPF_PTR_POISON);
2321 	}
2322 	*root = RB_ROOT_CACHED;
2323 	__bpf_spin_unlock_irqrestore(spin_lock);
2324 
2325 	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
2326 		obj = pos;
2327 		obj -= field->graph_root.node_offset;
2328 		node = rb_entry(pos, struct bpf_rb_node_kern, rb_node);
2329 		RB_CLEAR_NODE(pos);
2330 		/* Ensure __bpf_rbtree_add() sees the node as unlinked. */
2331 		smp_store_release(&node->owner, NULL);
2332 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
2333 	}
2334 }
2335 
2336 __bpf_kfunc_start_defs();
2337 
2338 /**
2339  * bpf_obj_new() - allocate an object described by program BTF
2340  * @local_type_id__k: type ID in program BTF
2341  * @meta: verifier-supplied struct metadata
2342  *
2343  * Allocate an object of the type identified by @local_type_id__k and
2344  * initialize its special fields. BPF programs can use
2345  * bpf_core_type_id_local() to provide @local_type_id__k. The verifier
2346  * rewrites @meta; BPF programs do not set it.
2347  *
2348  * Return: Pointer to the allocated object, or %NULL on failure.
2349  */
2350 __bpf_kfunc void *bpf_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
2351 {
2352 	u64 size = local_type_id__k;
2353 	void *p;
2354 
2355 	p = bpf_mem_alloc(&bpf_global_ma, size);
2356 	if (!p)
2357 		return NULL;
2358 	if (meta)
2359 		bpf_obj_init(meta->record, p);
2360 
2361 	return p;
2362 }
2363 
2364 __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
2365 {
2366 	return bpf_obj_new(local_type_id__k, meta__ign);
2367 }
2368 
2369 /**
2370  * bpf_percpu_obj_new() - allocate a percpu object described by program BTF
2371  * @local_type_id__k: type ID in program BTF
2372  * @meta: verifier-supplied struct metadata
2373  *
2374  * Allocate a percpu object of the type identified by @local_type_id__k. BPF
2375  * programs can use bpf_core_type_id_local() to provide @local_type_id__k.
2376  * The verifier rewrites @meta; BPF programs do not set it.
2377  *
2378  * Return: Pointer to the allocated percpu object, or %NULL on failure.
2379  */
2380 __bpf_kfunc void *bpf_percpu_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
2381 {
2382 	u64 size = local_type_id__k;
2383 
2384 	/* The verifier has ensured that meta must be NULL */
2385 	return bpf_mem_alloc(&bpf_global_percpu_ma, size);
2386 }
2387 
2388 __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
2389 {
2390 	return bpf_percpu_obj_new(local_type_id__k, meta__ign);
2391 }
2392 
2393 /* Must be called under migrate_disable(), as required by bpf_mem_free */
2394 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
2395 {
2396 	struct bpf_mem_alloc *ma;
2397 
2398 	if (rec && rec->refcount_off >= 0 &&
2399 	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
2400 		/* Object is refcounted and refcount_dec didn't result in 0
2401 		 * refcount. Return without freeing the object
2402 		 */
2403 		return;
2404 	}
2405 
2406 	if (rec)
2407 		bpf_obj_free_fields(rec, p);
2408 
2409 	if (percpu)
2410 		ma = &bpf_global_percpu_ma;
2411 	else
2412 		ma = &bpf_global_ma;
2413 	bpf_mem_free_rcu(ma, p);
2414 }
2415 
2416 /**
2417  * bpf_obj_drop() - drop a previously allocated object
2418  * @p__alloc: object to free
2419  * @meta: verifier-supplied struct metadata
2420  *
2421  * Destroy special fields in @p__alloc as needed and free the object. The
2422  * verifier rewrites @meta; BPF programs do not set it.
2423  */
2424 __bpf_kfunc void bpf_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
2425 {
2426 	void *p = p__alloc;
2427 
2428 	__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
2429 }
2430 
2431 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
2432 {
2433 	return bpf_obj_drop(p__alloc, meta__ign);
2434 }
2435 
2436 /**
2437  * bpf_percpu_obj_drop() - drop a previously allocated percpu object
2438  * @p__alloc: percpu object to free
2439  * @meta: verifier-supplied struct metadata
2440  *
2441  * Free @p__alloc. The verifier rewrites @meta; BPF programs do not set it.
2442  */
2443 __bpf_kfunc void bpf_percpu_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
2444 {
2445 	/* The verifier has ensured that meta must be NULL */
2446 	bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
2447 }
2448 
2449 __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
2450 {
2451 	bpf_percpu_obj_drop(p__alloc, meta__ign);
2452 }
2453 
2454 /**
2455  * bpf_refcount_acquire() - turn a local kptr into an owning reference
2456  * @p__refcounted_kptr: non-owning local kptr
2457  * @meta: verifier-supplied struct metadata
2458  *
2459  * Increment the refcount for @p__refcounted_kptr. The verifier rewrites
2460  * @meta; BPF programs do not set it.
2461  *
2462  * Return: Owning reference to @p__refcounted_kptr, or %NULL on failure.
2463  */
2464 __bpf_kfunc void *bpf_refcount_acquire(void *p__refcounted_kptr, struct btf_struct_meta *meta)
2465 {
2466 	struct bpf_refcount *ref;
2467 
2468 	/* Could just cast directly to refcount_t *, but need some code using
2469 	 * bpf_refcount type so that it is emitted in vmlinux BTF
2470 	 */
2471 	ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
2472 	if (!refcount_inc_not_zero((refcount_t *)ref))
2473 		return NULL;
2474 
2475 	/* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
2476 	 * in verifier.c
2477 	 */
2478 	return (void *)p__refcounted_kptr;
2479 }
2480 
2481 __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
2482 {
2483 	return bpf_refcount_acquire(p__refcounted_kptr, meta__ign);
2484 }
2485 
2486 static int __bpf_list_add(struct bpf_list_node_kern *node,
2487 			  struct bpf_list_head *head,
2488 			  struct list_head **prev_ptr,
2489 			  struct btf_record *rec, u64 off)
2490 {
2491 	struct list_head *n = &node->list_head, *h = (void *)head;
2492 	struct list_head *prev;
2493 
2494 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2495 	 * called on its fields, so init here
2496 	 */
2497 	if (unlikely(!h->next))
2498 		INIT_LIST_HEAD(h);
2499 
2500 	prev = *prev_ptr;
2501 
2502 	/* When prev is not the list head, it must be a node in this list. */
2503 	if (prev != h) {
2504 		struct bpf_list_node_kern *prev_kn =
2505 			container_of(prev, struct bpf_list_node_kern, list_head);
2506 
2507 		if (unlikely(READ_ONCE(prev_kn->owner) != head))
2508 			goto fail;
2509 	}
2510 
2511 	/* node->owner != NULL implies !list_empty(n), no need to separately
2512 	 * check the latter
2513 	 */
2514 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON))
2515 		goto fail;
2516 
2517 	list_add(n, prev);
2518 	WRITE_ONCE(node->owner, head);
2519 	return 0;
2520 
2521 fail:
2522 	/* Only called from BPF prog, no need to migrate_disable */
2523 	__bpf_obj_drop_impl((void *)n - off, rec, false);
2524 	return -EINVAL;
2525 }
2526 
2527 /**
2528  * bpf_list_push_front() - add a node to the front of a BPF linked list
2529  * @head: list head
2530  * @node: node to insert
2531  * @meta: verifier-supplied struct metadata
2532  * @off: verifier-supplied offset of @node within the containing object
2533  *
2534  * Insert @node at the front of @head. The verifier rewrites @meta and @off;
2535  * BPF programs do not set them.
2536  *
2537  * Return: 0 on success, or %-EINVAL if @node is already linked.
2538  */
2539 __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head,
2540 				    struct bpf_list_node *node,
2541 				    struct btf_struct_meta *meta,
2542 				    u64 off)
2543 {
2544 	struct bpf_list_node_kern *n = (void *)node;
2545 	struct list_head *h = (void *)head;
2546 
2547 	return __bpf_list_add(n, head, &h, meta ? meta->record : NULL, off);
2548 }
2549 
2550 __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
2551 					 struct bpf_list_node *node,
2552 					 void *meta__ign, u64 off)
2553 {
2554 	return bpf_list_push_front(head, node, meta__ign, off);
2555 }
2556 
2557 /**
2558  * bpf_list_push_back() - add a node to the back of a BPF linked list
2559  * @head: list head
2560  * @node: node to insert
2561  * @meta: verifier-supplied struct metadata
2562  * @off: verifier-supplied offset of @node within the containing object
2563  *
2564  * Insert @node at the back of @head. The verifier rewrites @meta and @off;
2565  * BPF programs do not set them.
2566  *
2567  * Return: 0 on success, or %-EINVAL if @node is already linked.
2568  */
2569 __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head,
2570 				   struct bpf_list_node *node,
2571 				   struct btf_struct_meta *meta,
2572 				   u64 off)
2573 {
2574 	struct bpf_list_node_kern *n = (void *)node;
2575 	struct list_head *h = (void *)head;
2576 
2577 	return __bpf_list_add(n, head, &h->prev, meta ? meta->record : NULL, off);
2578 }
2579 
2580 __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
2581 					struct bpf_list_node *node,
2582 					void *meta__ign, u64 off)
2583 {
2584 	return bpf_list_push_back(head, node, meta__ign, off);
2585 }
2586 
2587 __bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new,
2588 			     struct bpf_list_node *prev__nonown_allowed,
2589 			     struct btf_struct_meta *meta, u64 off)
2590 {
2591 	struct bpf_list_node_kern *n = (void *)new, *p = (void *)prev__nonown_allowed;
2592 	struct list_head *prev_ptr = &p->list_head;
2593 
2594 	return __bpf_list_add(n, head, &prev_ptr, meta ? meta->record : NULL, off);
2595 }
2596 
2597 static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head,
2598 					    struct list_head *n)
2599 {
2600 	struct list_head *h = (void *)head;
2601 	struct bpf_list_node_kern *node;
2602 
2603 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2604 	 * called on its fields, so init here
2605 	 */
2606 	if (unlikely(!h->next)) {
2607 		INIT_LIST_HEAD(h);
2608 		return NULL;
2609 	}
2610 	if (list_empty(h))
2611 		return NULL;
2612 
2613 	node = container_of(n, struct bpf_list_node_kern, list_head);
2614 	if (unlikely(READ_ONCE(node->owner) != head))
2615 		return NULL;
2616 
2617 	list_del_init(n);
2618 	/* Ensure __bpf_list_add() sees the node as unlinked. */
2619 	smp_store_release(&node->owner, NULL);
2620 	return (struct bpf_list_node *)n;
2621 }
2622 
2623 __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
2624 {
2625 	struct list_head *h = (void *)head;
2626 
2627 	return __bpf_list_del(head, h->next);
2628 }
2629 
2630 __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
2631 {
2632 	struct list_head *h = (void *)head;
2633 
2634 	return __bpf_list_del(head, h->prev);
2635 }
2636 
2637 __bpf_kfunc struct bpf_list_node *bpf_list_del(struct bpf_list_head *head,
2638 					       struct bpf_list_node *node__nonown_allowed)
2639 {
2640 	struct bpf_list_node_kern *kn = (void *)node__nonown_allowed;
2641 
2642 	/* verifier guarantees node is a list node rather than list head */
2643 	return __bpf_list_del(head, &kn->list_head);
2644 }
2645 
2646 __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
2647 {
2648 	struct list_head *h = (struct list_head *)head;
2649 
2650 	if (list_empty(h) || unlikely(!h->next))
2651 		return NULL;
2652 
2653 	return (struct bpf_list_node *)h->next;
2654 }
2655 
2656 __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
2657 {
2658 	struct list_head *h = (struct list_head *)head;
2659 
2660 	if (list_empty(h) || unlikely(!h->next))
2661 		return NULL;
2662 
2663 	return (struct bpf_list_node *)h->prev;
2664 }
2665 
2666 __bpf_kfunc bool bpf_list_is_first(struct bpf_list_head *head,
2667 				   struct bpf_list_node *node__nonown_allowed)
2668 {
2669 	struct list_head *h = (struct list_head *)head;
2670 	struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed;
2671 
2672 	if (READ_ONCE(kn->owner) != head)
2673 		return false;
2674 
2675 	return list_is_first(&kn->list_head, h);
2676 }
2677 
2678 __bpf_kfunc bool bpf_list_is_last(struct bpf_list_head *head,
2679 				  struct bpf_list_node *node__nonown_allowed)
2680 {
2681 	struct list_head *h = (struct list_head *)head;
2682 	struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed;
2683 
2684 	if (READ_ONCE(kn->owner) != head)
2685 		return false;
2686 
2687 	return list_is_last(&kn->list_head, h);
2688 }
2689 
2690 __bpf_kfunc bool bpf_list_empty(struct bpf_list_head *head)
2691 {
2692 	struct list_head *h = (struct list_head *)head;
2693 
2694 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2695 	 * called on its fields, so init here
2696 	 */
2697 	if (unlikely(!h->next))
2698 		INIT_LIST_HEAD(h);
2699 
2700 	return list_empty(h);
2701 }
2702 
2703 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
2704 						  struct bpf_rb_node *node)
2705 {
2706 	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2707 	struct rb_root_cached *r = (struct rb_root_cached *)root;
2708 	struct rb_node *n = &node_internal->rb_node;
2709 
2710 	/* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
2711 	 * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
2712 	 */
2713 	if (READ_ONCE(node_internal->owner) != root)
2714 		return NULL;
2715 
2716 	rb_erase_cached(n, r);
2717 	RB_CLEAR_NODE(n);
2718 	WRITE_ONCE(node_internal->owner, NULL);
2719 	return (struct bpf_rb_node *)n;
2720 }
2721 
2722 /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
2723  * program
2724  */
2725 static int __bpf_rbtree_add(struct bpf_rb_root *root,
2726 			    struct bpf_rb_node_kern *node,
2727 			    void *less, struct btf_record *rec, u64 off)
2728 {
2729 	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
2730 	struct rb_node *parent = NULL, *n = &node->rb_node;
2731 	bpf_callback_t cb = (bpf_callback_t)less;
2732 	bool leftmost = true;
2733 
2734 	/* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
2735 	 * check the latter
2736 	 */
2737 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
2738 		/* Only called from BPF prog, no need to migrate_disable */
2739 		__bpf_obj_drop_impl((void *)n - off, rec, false);
2740 		return -EINVAL;
2741 	}
2742 
2743 	while (*link) {
2744 		parent = *link;
2745 		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
2746 			link = &parent->rb_left;
2747 		} else {
2748 			link = &parent->rb_right;
2749 			leftmost = false;
2750 		}
2751 	}
2752 
2753 	rb_link_node(n, parent, link);
2754 	rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
2755 	WRITE_ONCE(node->owner, root);
2756 	return 0;
2757 }
2758 
2759 /**
2760  * bpf_rbtree_add() - add a node to a BPF rbtree
2761  * @root: tree root
2762  * @node: node to insert
2763  * @less: comparator used to order nodes
2764  * @meta: verifier-supplied struct metadata
2765  * @off: verifier-supplied offset of @node within the containing object
2766  *
2767  * Insert @node into @root using @less. The verifier rewrites @meta and @off;
2768  * BPF programs do not set them.
2769  *
2770  * Return: 0 on success, or %-EINVAL if @node is already linked in a tree.
2771  */
2772 __bpf_kfunc int bpf_rbtree_add(struct bpf_rb_root *root,
2773 			       struct bpf_rb_node *node,
2774 			       bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
2775 			       struct btf_struct_meta *meta,
2776 			       u64 off)
2777 {
2778 	struct bpf_rb_node_kern *n = (void *)node;
2779 
2780 	return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
2781 }
2782 
2783 __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
2784 				    bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
2785 				    void *meta__ign, u64 off)
2786 {
2787 	return bpf_rbtree_add(root, node, less, meta__ign, off);
2788 }
2789 
2790 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
2791 {
2792 	struct rb_root_cached *r = (struct rb_root_cached *)root;
2793 
2794 	return (struct bpf_rb_node *)rb_first_cached(r);
2795 }
2796 
2797 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root)
2798 {
2799 	struct rb_root_cached *r = (struct rb_root_cached *)root;
2800 
2801 	return (struct bpf_rb_node *)r->rb_root.rb_node;
2802 }
2803 
2804 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node)
2805 {
2806 	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2807 
2808 	if (READ_ONCE(node_internal->owner) != root)
2809 		return NULL;
2810 
2811 	return (struct bpf_rb_node *)node_internal->rb_node.rb_left;
2812 }
2813 
2814 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node)
2815 {
2816 	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2817 
2818 	if (READ_ONCE(node_internal->owner) != root)
2819 		return NULL;
2820 
2821 	return (struct bpf_rb_node *)node_internal->rb_node.rb_right;
2822 }
2823 
2824 /**
2825  * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
2826  * kfunc which is not stored in a map as a kptr, must be released by calling
2827  * bpf_task_release().
2828  * @p: The task on which a reference is being acquired.
2829  */
2830 __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
2831 {
2832 	if (refcount_inc_not_zero(&p->rcu_users))
2833 		return p;
2834 	return NULL;
2835 }
2836 
2837 /**
2838  * bpf_task_release - Release the reference acquired on a task.
2839  * @p: The task on which a reference is being released.
2840  */
2841 __bpf_kfunc void bpf_task_release(struct task_struct *p)
2842 {
2843 	put_task_struct_rcu_user(p);
2844 }
2845 
2846 __bpf_kfunc void bpf_task_release_dtor(void *p)
2847 {
2848 	put_task_struct_rcu_user(p);
2849 }
2850 CFI_NOSEAL(bpf_task_release_dtor);
2851 
2852 #ifdef CONFIG_CGROUPS
2853 /**
2854  * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
2855  * this kfunc which is not stored in a map as a kptr, must be released by
2856  * calling bpf_cgroup_release().
2857  * @cgrp: The cgroup on which a reference is being acquired.
2858  */
2859 __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
2860 {
2861 	return cgroup_tryget(cgrp) ? cgrp : NULL;
2862 }
2863 
2864 /**
2865  * bpf_cgroup_release - Release the reference acquired on a cgroup.
2866  * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
2867  * not be freed until the current grace period has ended, even if its refcount
2868  * drops to 0.
2869  * @cgrp: The cgroup on which a reference is being released.
2870  */
2871 __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
2872 {
2873 	cgroup_put(cgrp);
2874 }
2875 
2876 __bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
2877 {
2878 	cgroup_put(cgrp);
2879 }
2880 CFI_NOSEAL(bpf_cgroup_release_dtor);
2881 
2882 /**
2883  * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
2884  * array. A cgroup returned by this kfunc which is not subsequently stored in a
2885  * map, must be released by calling bpf_cgroup_release().
2886  * @cgrp: The cgroup for which we're performing a lookup.
2887  * @level: The level of ancestor to look up.
2888  */
2889 __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
2890 {
2891 	struct cgroup *ancestor;
2892 
2893 	if (level > cgrp->level || level < 0)
2894 		return NULL;
2895 
2896 	/* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
2897 	ancestor = cgrp->ancestors[level];
2898 	if (!cgroup_tryget(ancestor))
2899 		return NULL;
2900 	return ancestor;
2901 }
2902 
2903 /**
2904  * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
2905  * kfunc which is not subsequently stored in a map, must be released by calling
2906  * bpf_cgroup_release().
2907  * @cgid: cgroup id.
2908  */
2909 __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
2910 {
2911 	struct cgroup *cgrp;
2912 
2913 	cgrp = __cgroup_get_from_id(cgid);
2914 	if (IS_ERR(cgrp))
2915 		return NULL;
2916 	return cgrp;
2917 }
2918 
2919 /**
2920  * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test
2921  * task's membership of cgroup ancestry.
2922  * @task: the task to be tested
2923  * @ancestor: possible ancestor of @task's cgroup
2924  *
2925  * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
2926  * It follows all the same rules as cgroup_is_descendant, and only applies
2927  * to the default hierarchy.
2928  */
2929 __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
2930 				       struct cgroup *ancestor)
2931 {
2932 	long ret;
2933 
2934 	rcu_read_lock();
2935 	ret = task_under_cgroup_hierarchy(task, ancestor);
2936 	rcu_read_unlock();
2937 	return ret;
2938 }
2939 
2940 BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
2941 {
2942 	struct bpf_array *array = container_of(map, struct bpf_array, map);
2943 	struct cgroup *cgrp;
2944 
2945 	if (unlikely(idx >= array->map.max_entries))
2946 		return -E2BIG;
2947 
2948 	cgrp = READ_ONCE(array->ptrs[idx]);
2949 	if (unlikely(!cgrp))
2950 		return -EAGAIN;
2951 
2952 	return task_under_cgroup_hierarchy(current, cgrp);
2953 }
2954 
2955 const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
2956 	.func           = bpf_current_task_under_cgroup,
2957 	.gpl_only       = false,
2958 	.ret_type       = RET_INTEGER,
2959 	.arg1_type      = ARG_CONST_MAP_PTR,
2960 	.arg2_type      = ARG_ANYTHING,
2961 };
2962 
2963 /**
2964  * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
2965  * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
2966  * hierarchy ID.
2967  * @task: The target task
2968  * @hierarchy_id: The ID of a cgroup1 hierarchy
2969  *
2970  * On success, the cgroup is returen. On failure, NULL is returned.
2971  */
2972 __bpf_kfunc struct cgroup *
2973 bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
2974 {
2975 	struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
2976 
2977 	if (IS_ERR(cgrp))
2978 		return NULL;
2979 	return cgrp;
2980 }
2981 #endif /* CONFIG_CGROUPS */
2982 
2983 /**
2984  * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
2985  * in the root pid namespace idr. If a task is returned, it must either be
2986  * stored in a map, or released with bpf_task_release().
2987  * @pid: The pid of the task being looked up.
2988  */
2989 __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
2990 {
2991 	struct task_struct *p;
2992 
2993 	rcu_read_lock();
2994 	p = find_task_by_pid_ns(pid, &init_pid_ns);
2995 	if (p)
2996 		p = bpf_task_acquire(p);
2997 	rcu_read_unlock();
2998 
2999 	return p;
3000 }
3001 
3002 /**
3003  * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up
3004  * in the pid namespace of the current task. If a task is returned, it must
3005  * either be stored in a map, or released with bpf_task_release().
3006  * @vpid: The vpid of the task being looked up.
3007  */
3008 __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
3009 {
3010 	struct task_struct *p;
3011 
3012 	rcu_read_lock();
3013 	p = find_task_by_vpid(vpid);
3014 	if (p)
3015 		p = bpf_task_acquire(p);
3016 	rcu_read_unlock();
3017 
3018 	return p;
3019 }
3020 
3021 /**
3022  * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
3023  * @p: The dynptr whose data slice to retrieve
3024  * @offset: Offset into the dynptr
3025  * @buffer__nullable: User-provided buffer to copy contents into.  May be NULL
3026  * @buffer__szk: Size (in bytes) of the buffer if present. This is the
3027  *               length of the requested slice. This must be a constant.
3028  *
3029  * For non-skb and non-xdp type dynptrs, there is no difference between
3030  * bpf_dynptr_slice and bpf_dynptr_data.
3031  *
3032  *  If buffer__nullable is NULL, the call will fail if buffer_opt was needed.
3033  *
3034  * If the intention is to write to the data slice, please use
3035  * bpf_dynptr_slice_rdwr.
3036  *
3037  * The user must check that the returned pointer is not null before using it.
3038  *
3039  * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
3040  * does not change the underlying packet data pointers, so a call to
3041  * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
3042  * the bpf program.
3043  *
3044  * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only
3045  * data slice (can be either direct pointer to the data or a pointer to the user
3046  * provided buffer, with its contents containing the data, if unable to obtain
3047  * direct pointer)
3048  */
3049 __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
3050 				   void *buffer__nullable, u64 buffer__szk)
3051 {
3052 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3053 	enum bpf_dynptr_type type;
3054 	u64 len = buffer__szk;
3055 	int err;
3056 
3057 	if (!ptr->data)
3058 		return NULL;
3059 
3060 	err = bpf_dynptr_check_off_len(ptr, offset, len);
3061 	if (err)
3062 		return NULL;
3063 
3064 	type = bpf_dynptr_get_type(ptr);
3065 
3066 	switch (type) {
3067 	case BPF_DYNPTR_TYPE_LOCAL:
3068 	case BPF_DYNPTR_TYPE_RINGBUF:
3069 		return ptr->data + ptr->offset + offset;
3070 	case BPF_DYNPTR_TYPE_SKB:
3071 		if (buffer__nullable)
3072 			return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable);
3073 		else
3074 			return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
3075 	case BPF_DYNPTR_TYPE_XDP:
3076 	{
3077 		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
3078 		if (!IS_ERR_OR_NULL(xdp_ptr))
3079 			return xdp_ptr;
3080 
3081 		if (!buffer__nullable)
3082 			return NULL;
3083 		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false);
3084 		return buffer__nullable;
3085 	}
3086 	case BPF_DYNPTR_TYPE_SKB_META:
3087 		return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
3088 	case BPF_DYNPTR_TYPE_FILE:
3089 		err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk);
3090 		return err ? NULL : buffer__nullable;
3091 	default:
3092 		WARN_ONCE(true, "unknown dynptr type %d\n", type);
3093 		return NULL;
3094 	}
3095 }
3096 
3097 /**
3098  * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
3099  * @p: The dynptr whose data slice to retrieve
3100  * @offset: Offset into the dynptr
3101  * @buffer__nullable: User-provided buffer to copy contents into. May be NULL
3102  * @buffer__szk: Size (in bytes) of the buffer if present. This is the
3103  *               length of the requested slice. This must be a constant.
3104  *
3105  * For non-skb and non-xdp type dynptrs, there is no difference between
3106  * bpf_dynptr_slice and bpf_dynptr_data.
3107  *
3108  * If buffer__nullable is NULL, the call will fail if buffer_opt was needed.
3109  *
3110  * The returned pointer is writable and may point to either directly the dynptr
3111  * data at the requested offset or to the buffer if unable to obtain a direct
3112  * data pointer to (example: the requested slice is to the paged area of an skb
3113  * packet). In the case where the returned pointer is to the buffer, the user
3114  * is responsible for persisting writes through calling bpf_dynptr_write(). This
3115  * usually looks something like this pattern:
3116  *
3117  * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
3118  * if (!eth)
3119  *	return TC_ACT_SHOT;
3120  *
3121  * // mutate eth header //
3122  *
3123  * if (eth == buffer)
3124  *	bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
3125  *
3126  * Please note that, as in the example above, the user must check that the
3127  * returned pointer is not null before using it.
3128  *
3129  * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
3130  * does not change the underlying packet data pointers, so a call to
3131  * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
3132  * the bpf program.
3133  *
3134  * Return: NULL if the call failed (eg invalid dynptr), pointer to a
3135  * data slice (can be either direct pointer to the data or a pointer to the user
3136  * provided buffer, with its contents containing the data, if unable to obtain
3137  * direct pointer)
3138  */
3139 __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
3140 					void *buffer__nullable, u64 buffer__szk)
3141 {
3142 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3143 
3144 	if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
3145 		return NULL;
3146 
3147 	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
3148 	 *
3149 	 * For skb-type dynptrs, it is safe to write into the returned pointer
3150 	 * if the bpf program allows skb data writes. There are two possibilities
3151 	 * that may occur when calling bpf_dynptr_slice_rdwr:
3152 	 *
3153 	 * 1) The requested slice is in the head of the skb. In this case, the
3154 	 * returned pointer is directly to skb data, and if the skb is cloned, the
3155 	 * verifier will have uncloned it (see bpf_unclone_prologue()) already.
3156 	 * The pointer can be directly written into.
3157 	 *
3158 	 * 2) Some portion of the requested slice is in the paged buffer area.
3159 	 * In this case, the requested data will be copied out into the buffer
3160 	 * and the returned pointer will be a pointer to the buffer. The skb
3161 	 * will not be pulled. To persist the write, the user will need to call
3162 	 * bpf_dynptr_write(), which will pull the skb and commit the write.
3163 	 *
3164 	 * Similarly for xdp programs, if the requested slice is not across xdp
3165 	 * fragments, then a direct pointer will be returned, otherwise the data
3166 	 * will be copied out into the buffer and the user will need to call
3167 	 * bpf_dynptr_write() to commit changes.
3168 	 */
3169 	return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk);
3170 }
3171 
3172 __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr *p, u64 start, u64 end)
3173 {
3174 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3175 	u64 size;
3176 
3177 	if (!ptr->data || start > end)
3178 		return -EINVAL;
3179 
3180 	size = __bpf_dynptr_size(ptr);
3181 
3182 	if (start > size || end > size)
3183 		return -ERANGE;
3184 
3185 	bpf_dynptr_advance_offset(ptr, start);
3186 	bpf_dynptr_set_size(ptr, end - start);
3187 
3188 	return 0;
3189 }
3190 
3191 __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
3192 {
3193 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3194 
3195 	return !ptr->data;
3196 }
3197 
3198 __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
3199 {
3200 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3201 
3202 	if (!ptr->data)
3203 		return false;
3204 
3205 	return __bpf_dynptr_is_rdonly(ptr);
3206 }
3207 
3208 __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
3209 {
3210 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3211 
3212 	if (!ptr->data)
3213 		return -EINVAL;
3214 
3215 	return __bpf_dynptr_size(ptr);
3216 }
3217 
3218 __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
3219 				 struct bpf_dynptr *clone__uninit)
3220 {
3221 	struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
3222 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3223 
3224 	if (!ptr->data) {
3225 		bpf_dynptr_set_null(clone);
3226 		return -EINVAL;
3227 	}
3228 
3229 	*clone = *ptr;
3230 
3231 	return 0;
3232 }
3233 
3234 /**
3235  * bpf_dynptr_copy() - Copy data from one dynptr to another.
3236  * @dst_ptr: Destination dynptr - where data should be copied to
3237  * @dst_off: Offset into the destination dynptr
3238  * @src_ptr: Source dynptr - where data should be copied from
3239  * @src_off: Offset into the source dynptr
3240  * @size: Length of the data to copy from source to destination
3241  *
3242  * Copies data from source dynptr to destination dynptr.
3243  * Returns 0 on success; negative error, otherwise.
3244  */
3245 __bpf_kfunc int bpf_dynptr_copy(const struct bpf_dynptr *dst_ptr, u64 dst_off,
3246 				const struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
3247 {
3248 	const struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
3249 	const struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
3250 	void *src_slice, *dst_slice;
3251 	char buf[256];
3252 	u64 off;
3253 
3254 	src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
3255 	dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
3256 
3257 	if (src_slice && dst_slice) {
3258 		memmove(dst_slice, src_slice, size);
3259 		return 0;
3260 	}
3261 
3262 	if (src_slice)
3263 		return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0);
3264 
3265 	if (dst_slice)
3266 		return __bpf_dynptr_read(dst_slice, size, src, src_off, 0);
3267 
3268 	if (bpf_dynptr_check_off_len(dst, dst_off, size) ||
3269 	    bpf_dynptr_check_off_len(src, src_off, size))
3270 		return -E2BIG;
3271 
3272 	off = 0;
3273 	while (off < size) {
3274 		u64 chunk_sz = min_t(u64, sizeof(buf), size - off);
3275 		int err;
3276 
3277 		err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
3278 		if (err)
3279 			return err;
3280 		err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0);
3281 		if (err)
3282 			return err;
3283 
3284 		off += chunk_sz;
3285 	}
3286 	return 0;
3287 }
3288 
3289 /**
3290  * bpf_dynptr_memset() - Fill dynptr memory with a constant byte.
3291  * @p: Destination dynptr - where data will be filled
3292  * @offset: Offset into the dynptr to start filling from
3293  * @size: Number of bytes to fill
3294  * @val: Constant byte to fill the memory with
3295  *
3296  * Fills the @size bytes of the memory area pointed to by @p
3297  * at @offset with the constant byte @val.
3298  * Returns 0 on success; negative error, otherwise.
3299  */
3300 __bpf_kfunc int bpf_dynptr_memset(const struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
3301 {
3302 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3303 	u64 chunk_sz, write_off;
3304 	char buf[256];
3305 	void* slice;
3306 	int err;
3307 
3308 	slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size);
3309 	if (likely(slice)) {
3310 		memset(slice, val, size);
3311 		return 0;
3312 	}
3313 
3314 	if (__bpf_dynptr_is_rdonly(ptr))
3315 		return -EINVAL;
3316 
3317 	err = bpf_dynptr_check_off_len(ptr, offset, size);
3318 	if (err)
3319 		return err;
3320 
3321 	/* Non-linear data under the dynptr, write from a local buffer */
3322 	chunk_sz = min_t(u64, sizeof(buf), size);
3323 	memset(buf, val, chunk_sz);
3324 
3325 	for (write_off = 0; write_off < size; write_off += chunk_sz) {
3326 		chunk_sz = min_t(u64, sizeof(buf), size - write_off);
3327 		err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
3328 		if (err)
3329 			return err;
3330 	}
3331 
3332 	return 0;
3333 }
3334 
/* Return @obj unchanged.
 *
 * NOTE(review): presumably the verifier gives the result its kernel context
 * type; nothing in this function body does so - confirm against the verifier.
 */
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
	return obj;
}
3339 
/* Return @obj__ign with its const qualifier cast away. @btf_id__k is not
 * used by the function body.
 */
__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k)
{
	return (void *)obj__ign;
}
3344 
/* kfunc wrapper that calls rcu_read_lock(). */
__bpf_kfunc void bpf_rcu_read_lock(void)
{
	rcu_read_lock();
}
3349 
/* kfunc wrapper that calls rcu_read_unlock(). */
__bpf_kfunc void bpf_rcu_read_unlock(void)
{
	rcu_read_unlock();
}
3354 
/* State filled in by bpf_stack_walker() while bpf_throw() walks the stack. */
struct bpf_throw_ctx {
	/* aux of the first BPF prog frame that is not a subprog */
	struct bpf_prog_aux *aux;
	/* sp value the walker was given for that frame */
	u64 sp;
	/* bp value the walker was given for that frame */
	u64 bp;
	/* number of visited frames whose ip resolved to a BPF prog */
	int cnt;
};
3361 
3362 static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
3363 {
3364 	struct bpf_throw_ctx *ctx = cookie;
3365 	struct bpf_prog *prog;
3366 
3367 	/*
3368 	 * The RCU read lock is held to safely traverse the latch tree, but we
3369 	 * don't need its protection when accessing the prog, since it has an
3370 	 * active stack frame on the current stack trace, and won't disappear.
3371 	 */
3372 	rcu_read_lock();
3373 	prog = bpf_prog_ksym_find(ip);
3374 	rcu_read_unlock();
3375 	if (!prog)
3376 		return !ctx->cnt;
3377 	ctx->cnt++;
3378 	if (bpf_is_subprog(prog))
3379 		return true;
3380 	ctx->aux = prog->aux;
3381 	ctx->sp = sp;
3382 	ctx->bp = bp;
3383 	return false;
3384 }
3385 
3386 __bpf_kfunc void bpf_throw(u64 cookie)
3387 {
3388 	struct bpf_throw_ctx ctx = {};
3389 
3390 	arch_bpf_stack_walk(bpf_stack_walker, &ctx);
3391 	WARN_ON_ONCE(!ctx.aux);
3392 	if (ctx.aux)
3393 		WARN_ON_ONCE(!ctx.aux->exception_boundary);
3394 	WARN_ON_ONCE(!ctx.bp);
3395 	WARN_ON_ONCE(!ctx.cnt);
3396 	/* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
3397 	 * deeper stack depths than ctx.sp as we do not return from bpf_throw,
3398 	 * which skips compiler generated instrumentation to do the same.
3399 	 */
3400 	kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
3401 	ctx.aux->bpf_exception_cb(cookie, ctx.sp + ctx.aux->stack_arg_sp_adjust, ctx.bp, 0, 0);
3402 	WARN(1, "A call to BPF exception callback should never return\n");
3403 }
3404 
3405 __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
3406 {
3407 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
3408 	struct bpf_map *map = p__map;
3409 
3410 	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
3411 	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
3412 
3413 	if (flags)
3414 		return -EINVAL;
3415 
3416 	return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
3417 }
3418 
3419 __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
3420 {
3421 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
3422 	struct bpf_work *w;
3423 
3424 	if (flags)
3425 		return -EINVAL;
3426 
3427 	w = READ_ONCE(async->work);
3428 	if (!w || !READ_ONCE(w->cb.prog))
3429 		return -EINVAL;
3430 
3431 	if (!refcount_inc_not_zero(&w->cb.refcnt))
3432 		return -ENOENT;
3433 
3434 	if (!defer_timer_wq_op()) {
3435 		schedule_work(&w->work);
3436 		bpf_async_refcount_put(&w->cb);
3437 		return 0;
3438 	} else {
3439 		return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0);
3440 	}
3441 }
3442 
3443 __bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq,
3444 				    int (callback_fn)(void *map, int *key, void *value),
3445 				    unsigned int flags,
3446 				    struct bpf_prog_aux *aux)
3447 {
3448 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
3449 
3450 	if (flags)
3451 		return -EINVAL;
3452 
3453 	return __bpf_async_set_callback(async, callback_fn, aux->prog);
3454 }
3455 
/* kfunc wrapper that calls preempt_disable(). */
__bpf_kfunc void bpf_preempt_disable(void)
{
	preempt_disable();
}
3460 
/* kfunc wrapper that calls preempt_enable(). */
__bpf_kfunc void bpf_preempt_enable(void)
{
	preempt_enable();
}
3465 
/* Opaque bits-iterator state as seen by callers. The kfuncs below access it
 * through struct bpf_iter_bits_kern; bpf_iter_bits_new() build-checks that
 * both structures have the same size and alignment.
 */
struct bpf_iter_bits {
	__u64 __opaque[2];
} __aligned(8);
3469 
/* Largest nr_words accepted by bpf_iter_bits_new(); more yields -E2BIG. */
#define BITS_ITER_NR_WORDS_MAX 511
3471 
/* Kernel-side layout of struct bpf_iter_bits. */
struct bpf_iter_bits_kern {
	union {
		/* allocated copy of the bits, used when nr_bits is not 64 */
		__u64 *bits;
		/* in-place copy of the bits, used when nr_bits is exactly 64 */
		__u64 bits_copy;
	};
	/* number of bits to iterate; stays 0 when bpf_iter_bits_new() failed */
	int nr_bits;
	/* last bit index found; -1 right after bpf_iter_bits_new() */
	int bit;
} __aligned(8);
3480 
3481 /* On 64-bit hosts, unsigned long and u64 have the same size, so passing
3482  * a u64 pointer and an unsigned long pointer to find_next_bit() will
3483  * return the same result, as both point to the same 8-byte area.
3484  *
3485  * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
3486  * pointer also makes no difference. This is because the first iterated
3487  * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
3488  * long is composed of bits 32-63 of the u64.
3489  *
3490  * However, for 32-bit big-endian hosts, this is not the case. The first
3491  * iterated unsigned long will be bits 32-63 of the u64, so swap these two
3492  * ulong values within the u64.
3493  */
3494 static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
3495 {
3496 #if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
3497 	unsigned int i;
3498 
3499 	for (i = 0; i < nr; i++)
3500 		bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
3501 #endif
3502 }
3503 
3504 /**
3505  * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
3506  * @it: The new bpf_iter_bits to be created
3507  * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
3508  * @nr_words: The size of the specified memory area, measured in 8-byte units.
3509  * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
3510  * further reduced by the BPF memory allocator implementation.
3511  *
3512  * This function initializes a new bpf_iter_bits structure for iterating over
3513  * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
3514  * copies the data of the memory area to the newly created bpf_iter_bits @it for
3515  * subsequent iteration operations.
3516  *
3517  * On success, 0 is returned. On failure, ERR is returned.
3518  */
3519 __bpf_kfunc int
3520 bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
3521 {
3522 	struct bpf_iter_bits_kern *kit = (void *)it;
3523 	u32 nr_bytes = nr_words * sizeof(u64);
3524 	u32 nr_bits = BYTES_TO_BITS(nr_bytes);
3525 	int err;
3526 
3527 	BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits));
3528 	BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) !=
3529 		     __alignof__(struct bpf_iter_bits));
3530 
3531 	kit->nr_bits = 0;
3532 	kit->bits_copy = 0;
3533 	kit->bit = -1;
3534 
3535 	if (!unsafe_ptr__ign || !nr_words)
3536 		return -EINVAL;
3537 	if (nr_words > BITS_ITER_NR_WORDS_MAX)
3538 		return -E2BIG;
3539 
3540 	/* Optimization for u64 mask */
3541 	if (nr_bits == 64) {
3542 		err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
3543 		if (err)
3544 			return -EFAULT;
3545 
3546 		swap_ulong_in_u64(&kit->bits_copy, nr_words);
3547 
3548 		kit->nr_bits = nr_bits;
3549 		return 0;
3550 	}
3551 
3552 	if (bpf_mem_alloc_check_size(false, nr_bytes))
3553 		return -E2BIG;
3554 
3555 	/* Fallback to memalloc */
3556 	kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
3557 	if (!kit->bits)
3558 		return -ENOMEM;
3559 
3560 	err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign);
3561 	if (err) {
3562 		bpf_mem_free(&bpf_global_ma, kit->bits);
3563 		return err;
3564 	}
3565 
3566 	swap_ulong_in_u64(kit->bits, nr_words);
3567 
3568 	kit->nr_bits = nr_bits;
3569 	return 0;
3570 }
3571 
3572 /**
3573  * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits
3574  * @it: The bpf_iter_bits to be checked
3575  *
3576  * This function returns a pointer to a number representing the value of the
3577  * next bit in the bits.
3578  *
3579  * If there are no further bits available, it returns NULL.
3580  */
3581 __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
3582 {
3583 	struct bpf_iter_bits_kern *kit = (void *)it;
3584 	int bit = kit->bit, nr_bits = kit->nr_bits;
3585 	const void *bits;
3586 
3587 	if (!nr_bits || bit >= nr_bits)
3588 		return NULL;
3589 
3590 	bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
3591 	bit = find_next_bit(bits, nr_bits, bit + 1);
3592 	if (bit >= nr_bits) {
3593 		kit->bit = bit;
3594 		return NULL;
3595 	}
3596 
3597 	kit->bit = bit;
3598 	return &kit->bit;
3599 }
3600 
3601 /**
3602  * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits
3603  * @it: The bpf_iter_bits to be destroyed
3604  *
3605  * Destroy the resource associated with the bpf_iter_bits.
3606  */
3607 __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
3608 {
3609 	struct bpf_iter_bits_kern *kit = (void *)it;
3610 
3611 	if (kit->nr_bits <= 64)
3612 		return;
3613 	bpf_mem_free(&bpf_global_ma, kit->bits);
3614 }
3615 
3616 /**
3617  * bpf_copy_from_user_str() - Copy a string from an unsafe user address
3618  * @dst:             Destination address, in kernel space.  This buffer must be
3619  *                   at least @dst__sz bytes long.
3620  * @dst__sz:         Maximum number of bytes to copy, includes the trailing NUL.
3621  * @unsafe_ptr__ign: Source address, in user space.
3622  * @flags:           The only supported flag is BPF_F_PAD_ZEROS
3623  *
3624  * Copies a NUL-terminated string from userspace to BPF space. If user string is
3625  * too long this will still ensure zero termination in the dst buffer unless
3626  * buffer size is 0.
3627  *
3628  * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and
3629  * memset all of @dst on failure.
3630  */
3631 __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags)
3632 {
3633 	int ret;
3634 
3635 	if (unlikely(flags & ~BPF_F_PAD_ZEROS))
3636 		return -EINVAL;
3637 
3638 	if (unlikely(!dst__sz))
3639 		return 0;
3640 
3641 	ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1);
3642 	if (ret < 0) {
3643 		if (flags & BPF_F_PAD_ZEROS)
3644 			memset((char *)dst, 0, dst__sz);
3645 
3646 		return ret;
3647 	}
3648 
3649 	if (flags & BPF_F_PAD_ZEROS)
3650 		memset((char *)dst + ret, 0, dst__sz - ret);
3651 	else
3652 		((char *)dst)[ret] = '\0';
3653 
3654 	return ret + 1;
3655 }
3656 
3657 /**
3658  * bpf_copy_from_user_task_str() - Copy a string from an task's address space
3659  * @dst:             Destination address, in kernel space.  This buffer must be
3660  *                   at least @dst__sz bytes long.
3661  * @dst__sz:         Maximum number of bytes to copy, includes the trailing NUL.
3662  * @unsafe_ptr__ign: Source address in the task's address space.
3663  * @tsk:             The task whose address space will be used
3664  * @flags:           The only supported flag is BPF_F_PAD_ZEROS
3665  *
3666  * Copies a NUL terminated string from a task's address space to @dst__sz
3667  * buffer. If user string is too long this will still ensure zero termination
3668  * in the @dst__sz buffer unless buffer size is 0.
3669  *
3670  * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success
3671  * and memset all of @dst__sz on failure.
3672  *
3673  * Return: The number of copied bytes on success including the NUL terminator.
3674  * A negative error code on failure.
3675  */
3676 __bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz,
3677 					    const void __user *unsafe_ptr__ign,
3678 					    struct task_struct *tsk, u64 flags)
3679 {
3680 	int ret;
3681 
3682 	if (unlikely(flags & ~BPF_F_PAD_ZEROS))
3683 		return -EINVAL;
3684 
3685 	if (unlikely(dst__sz == 0))
3686 		return 0;
3687 
3688 	ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0);
3689 	if (ret < 0) {
3690 		if (flags & BPF_F_PAD_ZEROS)
3691 			memset(dst, 0, dst__sz);
3692 		return ret;
3693 	}
3694 
3695 	if (flags & BPF_F_PAD_ZEROS)
3696 		memset(dst + ret, 0, dst__sz - ret);
3697 
3698 	return ret + 1;
3699 }
3700 
/* Keep unsigned long in prototype so that kfunc is usable when emitted to
 * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
 * unsigned long always points to 8-byte region on stack, the kernel may only
 * read and write the 4-bytes on 32-bit.
 *
 * Calls local_irq_save() with *@flags__irq_flag as the flags variable.
 */
__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
{
	local_irq_save(*flags__irq_flag);
}
3710 
/* Calls local_irq_restore() with the flags value stored in *@flags__irq_flag. */
__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
{
	local_irq_restore(*flags__irq_flag);
}
3715 
/* kfunc with an intentionally empty body.
 *
 * NOTE(review): presumably only its presence as a call target matters (e.g.
 * to the verifier or to generated code) - confirm against its users.
 */
__bpf_kfunc void __bpf_trap(void)
{
}
3719 
3720 /*
3721  * Kfuncs for string operations.
3722  *
3723  * Since strings are not necessarily %NUL-terminated, we cannot directly call
3724  * in-kernel implementations. Instead, we open-code the implementations using
3725  * __get_kernel_nofault instead of plain dereference to make them safe.
3726  */
3727 
3728 static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len)
3729 {
3730 	char c1, c2;
3731 	int i;
3732 
3733 	if (!copy_from_kernel_nofault_allowed(s1, 1) ||
3734 	    !copy_from_kernel_nofault_allowed(s2, 1)) {
3735 		return -ERANGE;
3736 	}
3737 
3738 	guard(pagefault)();
3739 	for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) {
3740 		__get_kernel_nofault(&c1, s1, char, err_out);
3741 		__get_kernel_nofault(&c2, s2, char, err_out);
3742 		if (ignore_case) {
3743 			c1 = tolower(c1);
3744 			c2 = tolower(c2);
3745 		}
3746 		if (c1 != c2)
3747 			return c1 < c2 ? -1 : 1;
3748 		if (c1 == '\0')
3749 			return 0;
3750 		s1++;
3751 		s2++;
3752 	}
3753 	return i == XATTR_SIZE_MAX ? -E2BIG : 0;
3754 err_out:
3755 	return -EFAULT;
3756 }
3757 
/**
 * bpf_strcmp - Compare two strings
 * @s1__ign: One string
 * @s2__ign: Another string
 *
 * Case-sensitive comparison that examines at most XATTR_SIZE_MAX characters.
 *
 * Return:
 * * %0       - Strings are equal
 * * %-1      - @s1__ign is smaller
 * * %1       - @s2__ign is smaller
 * * %-EFAULT - Cannot read one of the strings
 * * %-E2BIG  - One of strings is too large
 * * %-ERANGE - One of strings is outside of kernel address space
 */
__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
{
	return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX);
}
3775 
/**
 * bpf_strcasecmp - Compare two strings, ignoring the case of the characters
 * @s1__ign: One string
 * @s2__ign: Another string
 *
 * Case-insensitive comparison that examines at most XATTR_SIZE_MAX characters.
 *
 * Return:
 * * %0       - Strings are equal
 * * %-1      - @s1__ign is smaller
 * * %1       - @s2__ign is smaller
 * * %-EFAULT - Cannot read one of the strings
 * * %-E2BIG  - One of strings is too large
 * * %-ERANGE - One of strings is outside of kernel address space
 */
__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
{
	return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX);
}
3793 
/**
 * bpf_strncasecmp - Compare two length-limited strings, ignoring case
 * @s1__ign: One string
 * @s2__ign: Another string
 * @len: The maximum number of characters to compare
 *
 * Case-insensitive comparison of at most @len characters; no more than
 * XATTR_SIZE_MAX characters are examined regardless of @len.
 *
 * Return:
 * * %0       - Strings are equal
 * * %-1      - @s1__ign is smaller
 * * %1       - @s2__ign is smaller
 * * %-EFAULT - Cannot read one of the strings
 * * %-E2BIG  - One of strings is too large
 * * %-ERANGE - One of strings is outside of kernel address space
 */
__bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len)
{
	return __bpf_strncasecmp(s1__ign, s2__ign, true, len);
}
3812 
3813 /**
3814  * bpf_strnchr - Find a character in a length limited string
3815  * @s__ign: The string to be searched
3816  * @count: The number of characters to be searched
3817  * @c: The character to search for
3818  *
3819  * Note that the %NUL-terminator is considered part of the string, and can
3820  * be searched for.
3821  *
3822  * Return:
3823  * * >=0      - Index of the first occurrence of @c within @s__ign
3824  * * %-ENOENT - @c not found in the first @count characters of @s__ign
3825  * * %-EFAULT - Cannot read @s__ign
3826  * * %-E2BIG  - @s__ign is too large
3827  * * %-ERANGE - @s__ign is outside of kernel address space
3828  */
3829 __bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c)
3830 {
3831 	char sc;
3832 	int i;
3833 
3834 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3835 		return -ERANGE;
3836 
3837 	guard(pagefault)();
3838 	for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
3839 		__get_kernel_nofault(&sc, s__ign, char, err_out);
3840 		if (sc == c)
3841 			return i;
3842 		if (sc == '\0')
3843 			return -ENOENT;
3844 		s__ign++;
3845 	}
3846 	return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT;
3847 err_out:
3848 	return -EFAULT;
3849 }
3850 
/**
 * bpf_strchr - Find the first occurrence of a character in a string
 * @s__ign: The string to be searched
 * @c: The character to search for
 *
 * Note that the %NUL-terminator is considered part of the string, and can
 * be searched for.
 *
 * Implemented as bpf_strnchr() with a count of XATTR_SIZE_MAX.
 *
 * Return:
 * * >=0      - The index of the first occurrence of @c within @s__ign
 * * %-ENOENT - @c not found in @s__ign
 * * %-EFAULT - Cannot read @s__ign
 * * %-E2BIG  - @s__ign is too large
 * * %-ERANGE - @s__ign is outside of kernel address space
 */
__bpf_kfunc int bpf_strchr(const char *s__ign, char c)
{
	return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c);
}
3870 
3871 /**
3872  * bpf_strchrnul - Find and return a character in a string, or end of string
3873  * @s__ign: The string to be searched
3874  * @c: The character to search for
3875  *
3876  * Return:
3877  * * >=0      - Index of the first occurrence of @c within @s__ign or index of
3878  *              the null byte at the end of @s__ign when @c is not found
3879  * * %-EFAULT - Cannot read @s__ign
3880  * * %-E2BIG  - @s__ign is too large
3881  * * %-ERANGE - @s__ign is outside of kernel address space
3882  */
3883 __bpf_kfunc int bpf_strchrnul(const char *s__ign, char c)
3884 {
3885 	char sc;
3886 	int i;
3887 
3888 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3889 		return -ERANGE;
3890 
3891 	guard(pagefault)();
3892 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
3893 		__get_kernel_nofault(&sc, s__ign, char, err_out);
3894 		if (sc == '\0' || sc == c)
3895 			return i;
3896 		s__ign++;
3897 	}
3898 	return -E2BIG;
3899 err_out:
3900 	return -EFAULT;
3901 }
3902 
3903 /**
3904  * bpf_strrchr - Find the last occurrence of a character in a string
3905  * @s__ign: The string to be searched
3906  * @c: The character to search for
3907  *
3908  * Return:
3909  * * >=0      - Index of the last occurrence of @c within @s__ign
3910  * * %-ENOENT - @c not found in @s__ign
3911  * * %-EFAULT - Cannot read @s__ign
3912  * * %-E2BIG  - @s__ign is too large
3913  * * %-ERANGE - @s__ign is outside of kernel address space
3914  */
3915 __bpf_kfunc int bpf_strrchr(const char *s__ign, int c)
3916 {
3917 	char sc;
3918 	int i, last = -ENOENT;
3919 
3920 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3921 		return -ERANGE;
3922 
3923 	guard(pagefault)();
3924 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
3925 		__get_kernel_nofault(&sc, s__ign, char, err_out);
3926 		if (sc == c)
3927 			last = i;
3928 		if (sc == '\0')
3929 			return last;
3930 		s__ign++;
3931 	}
3932 	return -E2BIG;
3933 err_out:
3934 	return -EFAULT;
3935 }
3936 
3937 /**
3938  * bpf_strnlen - Calculate the length of a length-limited string
3939  * @s__ign: The string
3940  * @count: The maximum number of characters to count
3941  *
3942  * Return:
3943  * * >=0      - The length of @s__ign
3944  * * %-EFAULT - Cannot read @s__ign
3945  * * %-E2BIG  - @s__ign is too large
3946  * * %-ERANGE - @s__ign is outside of kernel address space
3947  */
3948 __bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count)
3949 {
3950 	char c;
3951 	int i;
3952 
3953 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3954 		return -ERANGE;
3955 
3956 	guard(pagefault)();
3957 	for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
3958 		__get_kernel_nofault(&c, s__ign, char, err_out);
3959 		if (c == '\0')
3960 			return i;
3961 		s__ign++;
3962 	}
3963 	return i == XATTR_SIZE_MAX ? -E2BIG : i;
3964 err_out:
3965 	return -EFAULT;
3966 }
3967 
/**
 * bpf_strlen - Calculate the length of a string
 * @s__ign: The string
 *
 * Implemented as bpf_strnlen() with a count of XATTR_SIZE_MAX.
 *
 * Return:
 * * >=0      - The length of @s__ign
 * * %-EFAULT - Cannot read @s__ign
 * * %-E2BIG  - @s__ign is too large
 * * %-ERANGE - @s__ign is outside of kernel address space
 */
__bpf_kfunc int bpf_strlen(const char *s__ign)
{
	return bpf_strnlen(s__ign, XATTR_SIZE_MAX);
}
3982 
3983 /**
3984  * bpf_strspn - Calculate the length of the initial substring of @s__ign which
3985  *              only contains letters in @accept__ign
3986  * @s__ign: The string to be searched
3987  * @accept__ign: The string to search for
3988  *
3989  * Return:
3990  * * >=0      - The length of the initial substring of @s__ign which only
3991  *              contains letters from @accept__ign
3992  * * %-EFAULT - Cannot read one of the strings
3993  * * %-E2BIG  - One of the strings is too large
3994  * * %-ERANGE - One of the strings is outside of kernel address space
3995  */
3996 __bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign)
3997 {
3998 	char cs, ca;
3999 	int i, j;
4000 
4001 	if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
4002 	    !copy_from_kernel_nofault_allowed(accept__ign, 1)) {
4003 		return -ERANGE;
4004 	}
4005 
4006 	guard(pagefault)();
4007 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
4008 		__get_kernel_nofault(&cs, s__ign, char, err_out);
4009 		if (cs == '\0')
4010 			return i;
4011 		for (j = 0; j < XATTR_SIZE_MAX; j++) {
4012 			__get_kernel_nofault(&ca, accept__ign + j, char, err_out);
4013 			if (cs == ca || ca == '\0')
4014 				break;
4015 		}
4016 		if (j == XATTR_SIZE_MAX)
4017 			return -E2BIG;
4018 		if (ca == '\0')
4019 			return i;
4020 		s__ign++;
4021 	}
4022 	return -E2BIG;
4023 err_out:
4024 	return -EFAULT;
4025 }
4026 
4027 /**
4028  * bpf_strcspn - Calculate the length of the initial substring of @s__ign which
4029  *               does not contain letters in @reject__ign
4030  * @s__ign: The string to be searched
4031  * @reject__ign: The string to search for
4032  *
4033  * Return:
4034  * * >=0      - The length of the initial substring of @s__ign which does not
4035  *              contain letters from @reject__ign
4036  * * %-EFAULT - Cannot read one of the strings
4037  * * %-E2BIG  - One of the strings is too large
4038  * * %-ERANGE - One of the strings is outside of kernel address space
4039  */
4040 __bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign)
4041 {
4042 	char cs, cr;
4043 	int i, j;
4044 
4045 	if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
4046 	    !copy_from_kernel_nofault_allowed(reject__ign, 1)) {
4047 		return -ERANGE;
4048 	}
4049 
4050 	guard(pagefault)();
4051 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
4052 		__get_kernel_nofault(&cs, s__ign, char, err_out);
4053 		if (cs == '\0')
4054 			return i;
4055 		for (j = 0; j < XATTR_SIZE_MAX; j++) {
4056 			__get_kernel_nofault(&cr, reject__ign + j, char, err_out);
4057 			if (cs == cr || cr == '\0')
4058 				break;
4059 		}
4060 		if (j == XATTR_SIZE_MAX)
4061 			return -E2BIG;
4062 		if (cr != '\0')
4063 			return i;
4064 		s__ign++;
4065 	}
4066 	return -E2BIG;
4067 err_out:
4068 	return -EFAULT;
4069 }
4070 
/*
 * __bpf_strnstr - common implementation behind the bpf_str*str() kfuncs
 * @s1: string to be searched
 * @s2: string to search for
 * @len: maximum number of characters of @s1 to look at
 * @ignore_case: when true, both characters go through tolower() before
 *               being compared
 *
 * Every start offset i of @s1 is tried in turn and @s2 is compared against
 * it one byte at a time using non-faulting reads.
 *
 * Return: start index of the first match, -ENOENT when @s2 is not found,
 * -EFAULT when a read faults, -E2BIG when the XATTR_SIZE_MAX bound is hit,
 * -ERANGE when copy_from_kernel_nofault_allowed() rejects either pointer.
 */
4071 static int __bpf_strnstr(const char *s1, const char *s2, size_t len,
4072 			 bool ignore_case)
4073 {
4074 	char c1, c2;
4075 	int i, j;
4076 
4077 	if (!copy_from_kernel_nofault_allowed(s1, 1) ||
4078 	    !copy_from_kernel_nofault_allowed(s2, 1)) {
4079 		return -ERANGE;
4080 	}
4081 
4082 	guard(pagefault)();
	/* i is the candidate start offset; s1 itself is advanced along with it */
4083 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
4084 		for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
4085 			__get_kernel_nofault(&c2, s2 + j, char, err_out);
			/* all of s2 matched at offset i (also covers an empty s2) */
4086 			if (c2 == '\0')
4087 				return i;
4088 			/*
4089 			 * We allow reading an extra byte from s2 (note the
4090 			 * `i + j <= len` above) to cover the case when s2 is
4091 			 * a suffix of the first len chars of s1.
4092 			 */
4093 			if (i + j == len)
4094 				break;
4095 			__get_kernel_nofault(&c1, s1 + j, char, err_out);
4096 
4097 			if (ignore_case) {
4098 				c1 = tolower(c1);
4099 				c2 = tolower(c2);
4100 			}
4101 
			/* s1 ended while s2 still has characters left: no match possible */
4102 			if (c1 == '\0')
4103 				return -ENOENT;
4104 			if (c1 != c2)
4105 				break;
4106 		}
4107 		if (j == XATTR_SIZE_MAX)
4108 			return -E2BIG;
		/* the len budget was used up before s2 was fully matched */
4109 		if (i + j == len)
4110 			return -ENOENT;
4111 		s1++;
4112 	}
4113 	return -E2BIG;
4114 err_out:
4115 	return -EFAULT;
4116 }
4117 
4118 /**
4119  * bpf_strstr - Find the first substring in a string
4120  * @s1__ign: The string to be searched
4121  * @s2__ign: The string to search for
4122  *
4123  * Return:
4124  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4125  *              within @s1__ign
4126  * * %-ENOENT - @s2__ign is not a substring of @s1__ign
4127  * * %-EFAULT - Cannot read one of the strings
4128  * * %-E2BIG  - One of the strings is too large
4129  * * %-ERANGE - One of the strings is outside of kernel address space
4130  */
4131 __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
4132 {
4133 	return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false);
4134 }
4135 
4136 /**
4137  * bpf_strcasestr - Find the first substring in a string, ignoring the case of
4138  *                  the characters
4139  * @s1__ign: The string to be searched
4140  * @s2__ign: The string to search for
4141  *
4142  * Return:
4143  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4144  *              within @s1__ign
4145  * * %-ENOENT - @s2__ign is not a substring of @s1__ign
4146  * * %-EFAULT - Cannot read one of the strings
4147  * * %-E2BIG  - One of the strings is too large
4148  * * %-ERANGE - One of the strings is outside of kernel address space
4149  */
4150 __bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign)
4151 {
4152 	return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true);
4153 }
4154 
4155 /**
4156  * bpf_strnstr - Find the first substring in a length-limited string
4157  * @s1__ign: The string to be searched
4158  * @s2__ign: The string to search for
4159  * @len: the maximum number of characters to search
4160  *
4161  * Return:
4162  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4163  *              within the first @len characters of @s1__ign
4164  * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
4165  * * %-EFAULT - Cannot read one of the strings
4166  * * %-E2BIG  - One of the strings is too large
4167  * * %-ERANGE - One of the strings is outside of kernel address space
4168  */
4169 __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign,
4170 			    size_t len)
4171 {
4172 	return __bpf_strnstr(s1__ign, s2__ign, len, false);
4173 }
4174 
4175 /**
4176  * bpf_strncasestr - Find the first substring in a length-limited string,
4177  *                   ignoring the case of the characters
4178  * @s1__ign: The string to be searched
4179  * @s2__ign: The string to search for
4180  * @len: the maximum number of characters to search
4181  *
4182  * Return:
4183  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4184  *              within the first @len characters of @s1__ign
4185  * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
4186  * * %-EFAULT - Cannot read one of the strings
4187  * * %-E2BIG  - One of the strings is too large
4188  * * %-ERANGE - One of the strings is outside of kernel address space
4189  */
4190 __bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
4191 				size_t len)
4192 {
4193 	return __bpf_strnstr(s1__ign, s2__ign, len, true);
4194 }
4195 
4196 #ifdef CONFIG_KEYS
4197 /**
4198  * bpf_lookup_user_key - lookup a key by its serial
4199  * @serial: key handle serial number
4200  * @flags: lookup-specific flags
4201  *
4202  * Search a key with a given *serial* and the provided *flags*.
4203  * If found, increment the reference count of the key by one, and
4204  * return it in the bpf_key structure.
4205  *
4206  * The bpf_key structure must be passed to bpf_key_put() when done
4207  * with it, so that the key reference count is decremented and the
4208  * bpf_key structure is freed.
4209  *
4210  * Permission checks are deferred to the time the key is used by
4211  * one of the available key-specific kfuncs.
4212  *
4213  * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
4214  * special keyring (e.g. session keyring), if it doesn't yet exist.
4215  * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
4216  * for the key construction, and to retrieve uninstantiated keys (keys
4217  * without data attached to them).
4218  *
4219  * Return: a bpf_key pointer with a valid key pointer if the key is found, a
4220  *         NULL pointer otherwise.
4221  */
4222 __bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
4223 {
4224 	key_ref_t key_ref;
4225 	struct bpf_key *bkey;
4226 
4227 	if (flags & ~KEY_LOOKUP_ALL)
4228 		return NULL;
4229 
4230 	/*
4231 	 * Permission check is deferred until the key is used, as the
4232 	 * intent of the caller is unknown here.
4233 	 */
4234 	key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
4235 	if (IS_ERR(key_ref))
4236 		return NULL;
4237 
4238 	bkey = kmalloc_obj(*bkey);
4239 	if (!bkey) {
4240 		key_put(key_ref_to_ptr(key_ref));
4241 		return NULL;
4242 	}
4243 
4244 	bkey->key = key_ref_to_ptr(key_ref);
4245 	bkey->has_ref = true;
4246 
4247 	return bkey;
4248 }
4249 
4250 /**
4251  * bpf_lookup_system_key - lookup a key by a system-defined ID
4252  * @id: key ID
4253  *
4254  * Obtain a bpf_key structure with a key pointer set to the passed key ID.
4255  * The key pointer is marked as invalid, to prevent bpf_key_put() from
4256  * attempting to decrement the key reference count on that pointer. The key
4257  * pointer set in such way is currently understood only by
4258  * verify_pkcs7_signature().
4259  *
4260  * Set *id* to one of the values defined in include/linux/verification.h:
4261  * 0 for the primary keyring (immutable keyring of system keys);
4262  * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
4263  * (where keys can be added only if they are vouched for by existing keys
4264  * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
4265  * keyring (primarily used by the integrity subsystem to verify a kexec'ed
4266  * kerned image and, possibly, the initramfs signature).
4267  *
4268  * Return: a bpf_key pointer with an invalid key pointer set from the
4269  *         pre-determined ID on success, a NULL pointer otherwise
4270  */
4271 __bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
4272 {
4273 	struct bpf_key *bkey;
4274 
4275 	if (system_keyring_id_check(id) < 0)
4276 		return NULL;
4277 
4278 	bkey = kmalloc_obj(*bkey, GFP_ATOMIC);
4279 	if (!bkey)
4280 		return NULL;
4281 
4282 	bkey->key = (struct key *)(unsigned long)id;
4283 	bkey->has_ref = false;
4284 
4285 	return bkey;
4286 }
4287 
4288 /**
4289  * bpf_key_put - decrement key reference count if key is valid and free bpf_key
4290  * @bkey: bpf_key structure
4291  *
4292  * Decrement the reference count of the key inside *bkey*, if the pointer
4293  * is valid, and free *bkey*.
4294  */
4295 __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
4296 {
4297 	if (bkey->has_ref)
4298 		key_put(bkey->key);
4299 
4300 	kfree(bkey);
4301 }
4302 
4303 /**
4304  * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
4305  * @data_p: data to verify
4306  * @sig_p: signature of the data
4307  * @trusted_keyring: keyring with keys trusted for signature verification
4308  *
4309  * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
4310  * with keys in a keyring referenced by *trusted_keyring*.
4311  *
4312  * Return: 0 on success, a negative value on error.
4313  */
4314 __bpf_kfunc int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p,
4315 			       const struct bpf_dynptr *sig_p,
4316 			       struct bpf_key *trusted_keyring)
4317 {
4318 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
4319 	const struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
4320 	const struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
4321 	const void *data, *sig;
4322 	u32 data_len, sig_len;
4323 	int ret;
4324 
4325 	if (trusted_keyring->has_ref) {
4326 		/*
4327 		 * Do the permission check deferred in bpf_lookup_user_key().
4328 		 * See bpf_lookup_user_key() for more details.
4329 		 *
4330 		 * A call to key_task_permission() here would be redundant, as
4331 		 * it is already done by keyring_search() called by
4332 		 * find_asymmetric_key().
4333 		 */
4334 		ret = key_validate(trusted_keyring->key);
4335 		if (ret < 0)
4336 			return ret;
4337 	}
4338 
4339 	data_len = __bpf_dynptr_size(data_ptr);
4340 	data = __bpf_dynptr_data(data_ptr, data_len);
4341 	if (!data)
4342 		return -EINVAL;
4343 
4344 	sig_len = __bpf_dynptr_size(sig_ptr);
4345 	sig = __bpf_dynptr_data(sig_ptr, sig_len);
4346 	if (!sig)
4347 		return -EINVAL;
4348 
4349 	return verify_pkcs7_signature(data, data_len, sig, sig_len,
4350 				      trusted_keyring->key,
4351 				      VERIFYING_BPF_SIGNATURE, NULL,
4352 				      NULL);
4353 #else
4354 	return -EOPNOTSUPP;
4355 #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
4356 }
4357 #endif /* CONFIG_KEYS */
4358 
/*
 * Signature of the BPF callback executed from task_work context. It receives
 * the map, the key and the map value that are recorded in the
 * bpf_task_work_ctx at scheduling time.
 */
4359 typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
4360 
/*
 * Life-cycle states of a bpf_task_work context. Transitions performed by the
 * code in this file:
 *
 *   STANDBY    -> PENDING     bpf_task_work_acquire_ctx()
 *   PENDING    -> SCHEDULING  bpf_task_work_irq()
 *   SCHEDULING -> SCHEDULED   bpf_task_work_irq(), task_work_add() succeeded
 *   SCHEDULING -> STANDBY     bpf_task_work_irq(), task_work_add() failed
 *   SCHEDULING|SCHEDULED -> RUNNING  bpf_task_work_callback()
 *   RUNNING    -> STANDBY     bpf_task_work_callback(), callback finished
 *   any        -> FREED       bpf_task_work_cancel_and_free()
 */
4361 enum bpf_task_work_state {
4362 	/* bpf_task_work is ready to be used */
4363 	BPF_TW_STANDBY = 0,
4364 	/* irq work scheduling in progress */
4365 	BPF_TW_PENDING,
4366 	/* task work scheduling in progress */
4367 	BPF_TW_SCHEDULING,
4368 	/* task work is scheduled successfully */
4369 	BPF_TW_SCHEDULED,
4370 	/* callback is running */
4371 	BPF_TW_RUNNING,
4372 	/* associated BPF map value is deleted */
4373 	BPF_TW_FREED,
4374 };
4375 
/*
 * Bookkeeping for one struct bpf_task_work embedded in a map value. It is
 * allocated on first use by bpf_task_work_fetch_ctx(), reference counted, and
 * released with kfree_rcu() once the last reference is dropped.
 */
4376 struct bpf_task_work_ctx {
	/* current life-cycle state, changed with cmpxchg()/xchg() */
4377 	enum bpf_task_work_state state;
	/* starts at 1 (the map's own reference); ctx is destroyed when it hits 0 */
4378 	refcount_t refcnt;
	/* callback_head handed to task_work_add() */
4379 	struct callback_head work;
	/* used to defer task_work_add(), cancellation and destruction */
4380 	struct irq_work irq_work;
4381 	/* bpf_prog that schedules task work */
4382 	struct bpf_prog *prog;
4383 	/* task for which callback is scheduled */
4384 	struct task_struct *task;
4385 	/* the map and map value associated with this context */
4386 	struct bpf_map *map;
4387 	void *map_val;
	/* notify mode passed to task_work_add() */
4388 	enum task_work_notify_mode mode;
	/* BPF callback invoked by bpf_task_work_callback() */
4389 	bpf_task_work_callback_t callback_fn;
	/* for kfree_rcu() in bpf_task_work_destroy() */
4390 	struct rcu_head rcu;
4391 } __aligned(8);
4392 
4393 /* Actual type for struct bpf_task_work */
4394 struct bpf_task_work_kern {
	/* NULL until first use; installed with cmpxchg(), detached with xchg() */
4395 	struct bpf_task_work_ctx *ctx;
4396 };
4397 
4398 static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
4399 {
4400 	if (ctx->prog) {
4401 		bpf_prog_put(ctx->prog);
4402 		ctx->prog = NULL;
4403 	}
4404 	if (ctx->task) {
4405 		bpf_task_release(ctx->task);
4406 		ctx->task = NULL;
4407 	}
4408 }
4409 
4410 static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
4411 {
4412 	return refcount_inc_not_zero(&ctx->refcnt);
4413 }
4414 
4415 static void bpf_task_work_destroy(struct irq_work *irq_work)
4416 {
4417 	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
4418 
4419 	bpf_task_work_ctx_reset(ctx);
4420 	kfree_rcu(ctx, rcu);
4421 }
4422 
4423 static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
4424 {
4425 	if (!refcount_dec_and_test(&ctx->refcnt))
4426 		return;
4427 
4428 	if (irqs_disabled()) {
4429 		ctx->irq_work = IRQ_WORK_INIT(bpf_task_work_destroy);
4430 		irq_work_queue(&ctx->irq_work);
4431 	} else {
4432 		bpf_task_work_destroy(&ctx->irq_work);
4433 	}
4434 }
4435 
4436 static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
4437 {
4438 	/*
4439 	 * Scheduled task_work callback holds ctx ref, so if we successfully
4440 	 * cancelled, we put that ref on callback's behalf. If we couldn't
4441 	 * cancel, callback will inevitably run or has already completed
4442 	 * running, and it would have taken care of its ctx ref itself.
4443 	 */
4444 	if (task_work_cancel(ctx->task, &ctx->work))
4445 		bpf_task_work_ctx_put(ctx);
4446 }
4447 
/*
 * task_work callback registered by bpf_task_work_schedule().
 *
 * Moves the context from SCHEDULING or SCHEDULED to RUNNING, invokes
 * ctx->callback_fn() with the map, key and map value, then resets the
 * context, tries to return it to STANDBY and drops the reference that was
 * held for this callback. If the context turns out to be FREED, only the
 * reference is dropped and the BPF callback is not invoked.
 */
4448 static void bpf_task_work_callback(struct callback_head *cb)
4449 {
4450 	struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
4451 	enum bpf_task_work_state state;
4452 	u32 idx;
4453 	void *key;
4454 
4455 	/* Read lock is needed to protect ctx and map key/value access */
4456 	guard(rcu_tasks_trace)();
4457 	/*
4458 	 * This callback may start running before bpf_task_work_irq() switched to
4459 	 * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
4460 	 */
4461 	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
4462 	if (state == BPF_TW_SCHEDULED)
4463 		state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
4464 	if (state == BPF_TW_FREED) {
4465 		bpf_task_work_ctx_put(ctx);
4466 		return;
4467 	}
4468 
	/* idx is only an output slot for the helper; it is not used afterwards */
4469 	key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
4470 
4471 	migrate_disable();
4472 	ctx->callback_fn(ctx->map, key, ctx->map_val);
4473 	migrate_enable();
4474 
4475 	bpf_task_work_ctx_reset(ctx);
	/* fails harmlessly when the state was changed meanwhile, e.g. to FREED */
4476 	(void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
4477 
4478 	bpf_task_work_ctx_put(ctx);
4479 }
4480 
/*
 * irq_work handler queued by bpf_task_work_schedule(); performs the actual
 * task_work_add().
 *
 * Moves the context PENDING -> SCHEDULING and adds the task_work. If adding
 * fails, the context is reset, moved back to STANDBY when possible, and the
 * reference taken for the callback is dropped. If adding succeeds, the
 * context is moved SCHEDULING -> SCHEDULED; when FREED is observed instead,
 * the just-added task_work is cancelled.
 */
4481 static void bpf_task_work_irq(struct irq_work *irq_work)
4482 {
4483 	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
4484 	enum bpf_task_work_state state;
4485 	int err;
4486 
4487 	guard(rcu)();
4488 
	/* not PENDING any more: give up and drop the callback's reference */
4489 	if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
4490 		bpf_task_work_ctx_put(ctx);
4491 		return;
4492 	}
4493 
4494 	err = task_work_add(ctx->task, &ctx->work, ctx->mode);
4495 	if (err) {
4496 		bpf_task_work_ctx_reset(ctx);
4497 		/*
4498 		 * try to switch back to STANDBY for another task_work reuse, but we might have
4499 		 * gone to FREED already, which is fine as we already cleaned up after ourselves
4500 		 */
4501 		(void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
4502 		bpf_task_work_ctx_put(ctx);
4503 		return;
4504 	}
4505 
4506 	/*
4507 	 * It's technically possible for just scheduled task_work callback to
4508 	 * complete running by now, going SCHEDULING -> RUNNING and then
4509 	 * dropping its ctx refcount. Instead of capturing an extra ref just
4510 	 * to protect below ctx->state access, we rely on rcu_read_lock
4511 	 * above to prevent kfree_rcu from freeing ctx before we return.
4512 	 */
4513 	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
4514 	if (state == BPF_TW_FREED)
4515 		bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
4516 }
4517 
4518 static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
4519 							 struct bpf_map *map)
4520 {
4521 	struct bpf_task_work_kern *twk = (void *)tw;
4522 	struct bpf_task_work_ctx *ctx, *old_ctx;
4523 
4524 	ctx = READ_ONCE(twk->ctx);
4525 	if (ctx)
4526 		return ctx;
4527 
4528 	ctx = bpf_map_kmalloc_nolock(map, sizeof(*ctx), 0, NUMA_NO_NODE);
4529 	if (!ctx)
4530 		return ERR_PTR(-ENOMEM);
4531 
4532 	memset(ctx, 0, sizeof(*ctx));
4533 	refcount_set(&ctx->refcnt, 1); /* map's own ref */
4534 	ctx->state = BPF_TW_STANDBY;
4535 
4536 	old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
4537 	if (old_ctx) {
4538 		/*
4539 		 * tw->ctx is set by concurrent BPF program, release allocated
4540 		 * memory and try to reuse already set context.
4541 		 */
4542 		kfree_nolock(ctx);
4543 		return old_ctx;
4544 	}
4545 
4546 	return ctx; /* Success */
4547 }
4548 
/*
 * Prepare the context of @tw for a new scheduling attempt.
 *
 * On success the returned context carries one extra reference (meant for the
 * task_work callback) and has been moved STANDBY -> PENDING.
 *
 * Returns an ERR_PTR otherwise: the error from bpf_task_work_fetch_ctx(), or
 * -EBUSY when the reference could not be taken, the context was not in
 * STANDBY, or the map has no user references left.
 */
4549 static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
4550 							   struct bpf_map *map)
4551 {
4552 	struct bpf_task_work_ctx *ctx;
4553 
4554 	/*
4555 	 * Sleepable BPF programs hold rcu_read_lock_trace but not
4556 	 * regular rcu_read_lock. Since kfree_rcu waits for regular
4557 	 * RCU GP, the ctx can be freed while we're between reading
4558 	 * the pointer and incrementing the refcount. Take regular
4559 	 * rcu_read_lock to prevent kfree_rcu from freeing the ctx
4560 	 * before we can tryget it.
4561 	 */
4562 	scoped_guard(rcu) {
4563 		ctx = bpf_task_work_fetch_ctx(tw, map);
4564 		if (IS_ERR(ctx))
4565 			return ctx;
4566 
4567 		/* try to get ref for task_work callback to hold */
4568 		if (!bpf_task_work_ctx_tryget(ctx))
4569 			return ERR_PTR(-EBUSY);
4570 	}
4571 
4572 	if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
4573 		/* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
4574 		bpf_task_work_ctx_put(ctx);
4575 		return ERR_PTR(-EBUSY);
4576 	}
4577 
4578 	/*
4579 	 * If no process or bpffs is holding a reference to the map, no new callbacks should be
4580 	 * scheduled. This does not address any race or correctness issue, but rather is a policy
4581 	 * choice: dropping user references should stop everything.
4582 	 */
4583 	if (!atomic64_read(&map->usercnt)) {
4584 		/* drop ref we just got for task_work callback itself */
4585 		bpf_task_work_ctx_put(ctx);
4586 		/* transfer map's ref into cancel_and_free() */
4587 		bpf_task_work_cancel_and_free(tw);
4588 		return ERR_PTR(-EBUSY);
4589 	}
4590 
4591 	return ctx;
4592 }
4593 
4594 static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
4595 				  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
4596 				  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
4597 {
4598 	struct bpf_prog *prog;
4599 	struct bpf_task_work_ctx *ctx;
4600 	int err;
4601 
4602 	BTF_TYPE_EMIT(struct bpf_task_work);
4603 
4604 	prog = bpf_prog_inc_not_zero(aux->prog);
4605 	if (IS_ERR(prog))
4606 		return -EBADF;
4607 	task = bpf_task_acquire(task);
4608 	if (!task) {
4609 		err = -EBADF;
4610 		goto release_prog;
4611 	}
4612 
4613 	ctx = bpf_task_work_acquire_ctx(tw, map);
4614 	if (IS_ERR(ctx)) {
4615 		err = PTR_ERR(ctx);
4616 		goto release_all;
4617 	}
4618 
4619 	ctx->task = task;
4620 	ctx->callback_fn = callback_fn;
4621 	ctx->prog = prog;
4622 	ctx->mode = mode;
4623 	ctx->map = map;
4624 	ctx->map_val = (void *)tw - map->record->task_work_off;
4625 	init_task_work(&ctx->work, bpf_task_work_callback);
4626 	init_irq_work(&ctx->irq_work, bpf_task_work_irq);
4627 
4628 	irq_work_queue(&ctx->irq_work);
4629 	return 0;
4630 
4631 release_all:
4632 	bpf_task_release(task);
4633 release_prog:
4634 	bpf_prog_put(prog);
4635 	return err;
4636 }
4637 
4638 /**
4639  * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL
4640  * mode
4641  * @task: Task struct for which callback should be scheduled
4642  * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
4643  * @map__map: bpf_map that embeds struct bpf_task_work in the values
4644  * @callback: pointer to BPF subprogram to call
4645  * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
4646  *
4647  * Return: 0 if task work has been scheduled successfully, negative error code otherwise
4648  */
4649 __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
4650 					      void *map__map, bpf_task_work_callback_t callback,
4651 					      struct bpf_prog_aux *aux)
4652 {
4653 	return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL);
4654 }
4655 
4656 /**
4657  * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME
4658  * mode
4659  * @task: Task struct for which callback should be scheduled
4660  * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
4661  * @map__map: bpf_map that embeds struct bpf_task_work in the values
4662  * @callback: pointer to BPF subprogram to call
4663  * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
4664  *
4665  * Return: 0 if task work has been scheduled successfully, negative error code otherwise
4666  */
4667 __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
4668 					      void *map__map, bpf_task_work_callback_t callback,
4669 					      struct bpf_prog_aux *aux)
4670 {
4671 	return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME);
4672 }
4673 
4674 static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
4675 			    struct bpf_dynptr_kern *ptr)
4676 {
4677 	struct bpf_dynptr_file_impl *state;
4678 
4679 	/* flags is currently unsupported */
4680 	if (flags) {
4681 		bpf_dynptr_set_null(ptr);
4682 		return -EINVAL;
4683 	}
4684 
4685 	state = kmalloc_nolock(sizeof(*state), 0, NUMA_NO_NODE);
4686 	if (!state) {
4687 		bpf_dynptr_set_null(ptr);
4688 		return -ENOMEM;
4689 	}
4690 	state->offset = 0;
4691 	state->size = U64_MAX; /* Don't restrict size, as file may change anyways */
4692 	freader_init_from_file(&state->freader, NULL, 0, file, may_sleep);
4693 	bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0);
4694 	bpf_dynptr_set_rdonly(ptr);
4695 	return 0;
4696 }
4697 
4698 __bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
4699 {
4700 	return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit);
4701 }
4702 
4703 int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
4704 {
4705 	return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit);
4706 }
4707 
4708 __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
4709 {
4710 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr;
4711 	struct bpf_dynptr_file_impl *df = ptr->data;
4712 
4713 	if (!df)
4714 		return 0;
4715 
4716 	freader_cleanup(&df->freader);
4717 	kfree_nolock(df);
4718 	bpf_dynptr_set_null(ptr);
4719 	return 0;
4720 }
4721 
4722 /**
4723  * bpf_timer_cancel_async - try to deactivate a timer
4724  * @timer:	bpf_timer to stop
4725  *
4726  * Returns:
4727  *
4728  *  *  0 when the timer was not active
4729  *  *  1 when the timer was active
4730  *  * -1 when the timer is currently executing the callback function and
4731  *       cannot be stopped
4732  *  * -ECANCELED when the timer will be cancelled asynchronously
4733  *  * -ENOMEM when out of memory
4734  *  * -EINVAL when the timer was not initialized
4735  *  * -ENOENT when this kfunc is racing with timer deletion
4736  */
4737 __bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer)
4738 {
4739 	struct bpf_async_kern *async = (void *)timer;
4740 	struct bpf_async_cb *cb;
4741 	int ret;
4742 
4743 	cb = READ_ONCE(async->cb);
4744 	if (!cb)
4745 		return -EINVAL;
4746 
4747 	/*
4748 	 * Unlike hrtimer_start() it's ok to synchronously call
4749 	 * hrtimer_try_to_cancel() when refcnt reached zero, but deferring to
4750 	 * irq_work is not, since irq callback may execute after RCU GP and
4751 	 * cb could be freed at that time. Check for refcnt zero for
4752 	 * consistency.
4753 	 */
4754 	if (!refcount_inc_not_zero(&cb->refcnt))
4755 		return -ENOENT;
4756 
4757 	if (!defer_timer_wq_op()) {
4758 		struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
4759 
4760 		ret = hrtimer_try_to_cancel(&t->timer);
4761 		bpf_async_refcount_put(cb);
4762 		return ret;
4763 	} else {
4764 		ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
4765 		return ret ? ret : -ECANCELED;
4766 	}
4767 }
4768 
/*
 * End of the kfunc definitions above. NOTE(review): presumably pairs with a
 * __bpf_kfunc_start_defs() earlier in the file, which is not visible here.
 */
4769 __bpf_kfunc_end_defs();
4770 
4771 static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
4772 {
4773 	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
4774 
4775 	bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
4776 	bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
4777 }
4778 
4779 void bpf_task_work_cancel_and_free(void *val)
4780 {
4781 	struct bpf_task_work_kern *twk = val;
4782 	struct bpf_task_work_ctx *ctx;
4783 	enum bpf_task_work_state state;
4784 
4785 	ctx = xchg(&twk->ctx, NULL);
4786 	if (!ctx)
4787 		return;
4788 
4789 	state = xchg(&ctx->state, BPF_TW_FREED);
4790 	if (state == BPF_TW_SCHEDULED) {
4791 		/* run in irq_work to avoid locks in NMI */
4792 		init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
4793 		irq_work_queue(&ctx->irq_work);
4794 		return;
4795 	}
4796 
4797 	bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
4798 }
4799 
/*
 * kfunc ID set "generic_btf_ids". Each BTF_ID_FLAGS() entry names one kfunc
 * together with the KF_* flags attached to it (no flags when omitted).
 * Entries wrapped in #ifdef are only listed when the corresponding CONFIG_*
 * option is enabled.
 */
4800 BTF_KFUNCS_START(generic_btf_ids)
4801 #ifdef CONFIG_CRASH_DUMP
4802 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
4803 #endif
4804 BTF_ID_FLAGS(func, bpf_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
4805 BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
4806 BTF_ID_FLAGS(func, bpf_percpu_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
4807 BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
4808 BTF_ID_FLAGS(func, bpf_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
4809 BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
4810 BTF_ID_FLAGS(func, bpf_percpu_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
4811 BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
4812 BTF_ID_FLAGS(func, bpf_refcount_acquire, KF_ACQUIRE | KF_RET_NULL | KF_RCU | KF_IMPLICIT_ARGS)
4813 BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
4814 BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS)
4815 BTF_ID_FLAGS(func, bpf_list_push_front_impl)
4816 BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
4817 BTF_ID_FLAGS(func, bpf_list_push_back_impl)
4818 BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS)
4819 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
4820 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
4821 BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL)
4822 BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
4823 BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
4824 BTF_ID_FLAGS(func, bpf_list_is_first)
4825 BTF_ID_FLAGS(func, bpf_list_is_last)
4826 BTF_ID_FLAGS(func, bpf_list_empty)
4827 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
4828 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
4829 BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
4830 BTF_ID_FLAGS(func, bpf_rbtree_add, KF_IMPLICIT_ARGS)
4831 BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
4832 BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
4833 BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
4834 BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
4835 BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
4836 
4837 #ifdef CONFIG_CGROUPS
4838 BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
4839 BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
4840 BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
4841 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
4842 BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
4843 BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
4844 #endif
4845 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
4846 BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
4847 BTF_ID_FLAGS(func, bpf_throw)
4848 #ifdef CONFIG_BPF_EVENTS
4849 BTF_ID_FLAGS(func, bpf_send_signal_task)
4850 #endif
4851 #ifdef CONFIG_KEYS
4852 BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
4853 BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
4854 BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
4855 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
4856 BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
4857 #endif
4858 #endif
4859 #ifdef CONFIG_S390
4860 BTF_ID_FLAGS(func, bpf_get_lowcore)
4861 #endif
4862 BTF_KFUNCS_END(generic_btf_ids)
4863 
/*
 * Wraps generic_btf_ids in a btf_kfunc_id_set owned by THIS_MODULE.
 * Where this set gets registered is outside of this excerpt.
 */
4864 static const struct btf_kfunc_id_set generic_kfunc_set = {
4865 	.owner = THIS_MODULE,
4866 	.set   = &generic_btf_ids,
4867 };
4868 
4869 
/*
 * BTF ID list laid out as (struct, destructor function) pairs: task_struct
 * with bpf_task_release_dtor and, when CONFIG_CGROUPS is enabled, cgroup with
 * bpf_cgroup_release_dtor.
 * NOTE(review): the consumer of this list is not visible in this excerpt.
 */
4870 BTF_ID_LIST(generic_dtor_ids)
4871 BTF_ID(struct, task_struct)
4872 BTF_ID(func, bpf_task_release_dtor)
4873 #ifdef CONFIG_CGROUPS
4874 BTF_ID(struct, cgroup)
4875 BTF_ID(func, bpf_cgroup_release_dtor)
4876 #endif
4877 
4878 BTF_KFUNCS_START(common_btf_ids)
4879 BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
4880 BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
4881 BTF_ID_FLAGS(func, bpf_rcu_read_lock)
4882 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
4883 BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
4884 BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
4885 BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
4886 BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
4887 BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
4888 BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
4889 BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
4890 BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
4891 #ifdef CONFIG_CGROUPS
4892 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
4893 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
4894 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
4895 BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED)
4896 BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
4897 BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
4898 #endif
4899 BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED)
4900 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
4901 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
4902 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
4903 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
4904 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
4905 BTF_ID_FLAGS(func, bpf_dynptr_size)
4906 BTF_ID_FLAGS(func, bpf_dynptr_clone)
4907 BTF_ID_FLAGS(func, bpf_dynptr_copy)
4908 BTF_ID_FLAGS(func, bpf_dynptr_memset)
4909 #ifdef CONFIG_NET
4910 BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
4911 #endif
4912 BTF_ID_FLAGS(func, bpf_wq_init)
4913 BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS)
4914 BTF_ID_FLAGS(func, bpf_wq_start)
4915 BTF_ID_FLAGS(func, bpf_preempt_disable)
4916 BTF_ID_FLAGS(func, bpf_preempt_enable)
4917 BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
4918 BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
4919 BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
4920 BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
4921 BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE)
4922 BTF_ID_FLAGS(func, bpf_get_kmem_cache)
4923 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
4924 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
4925 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
4926 BTF_ID_FLAGS(func, bpf_local_irq_save)
4927 BTF_ID_FLAGS(func, bpf_local_irq_restore)
4928 #ifdef CONFIG_BPF_EVENTS
4929 BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
4930 BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
4931 BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
4932 BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
4933 BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
4934 BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
4935 BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE)
4936 BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE)
4937 #endif
4938 #ifdef CONFIG_DMA_SHARED_BUFFER
4939 BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
4940 BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
4941 BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
4942 #endif
4943 BTF_ID_FLAGS(func, __bpf_trap)
4944 BTF_ID_FLAGS(func, bpf_strcmp);
4945 BTF_ID_FLAGS(func, bpf_strcasecmp);
4946 BTF_ID_FLAGS(func, bpf_strncasecmp);
4947 BTF_ID_FLAGS(func, bpf_strchr);
4948 BTF_ID_FLAGS(func, bpf_strchrnul);
4949 BTF_ID_FLAGS(func, bpf_strnchr);
4950 BTF_ID_FLAGS(func, bpf_strrchr);
4951 BTF_ID_FLAGS(func, bpf_strlen);
4952 BTF_ID_FLAGS(func, bpf_strnlen);
4953 BTF_ID_FLAGS(func, bpf_strspn);
4954 BTF_ID_FLAGS(func, bpf_strcspn);
4955 BTF_ID_FLAGS(func, bpf_strstr);
4956 BTF_ID_FLAGS(func, bpf_strcasestr);
4957 BTF_ID_FLAGS(func, bpf_strnstr);
4958 BTF_ID_FLAGS(func, bpf_strncasestr);
4959 #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
4960 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
4961 #endif
4962 BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS)
4963 BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS)
4964 BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS)
4965 BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS)
4966 BTF_ID_FLAGS(func, bpf_dynptr_from_file)
4967 BTF_ID_FLAGS(func, bpf_dynptr_file_discard, KF_RELEASE)
4968 BTF_ID_FLAGS(func, bpf_timer_cancel_async)
4969 BTF_KFUNCS_END(common_btf_ids)
4970 
4971 static const struct btf_kfunc_id_set common_kfunc_set = {
4972 	.owner = THIS_MODULE,
4973 	.set   = &common_btf_ids,
4974 };
4975 
4976 static int __init kfunc_init(void)
4977 {
4978 	int ret;
4979 	const struct btf_id_dtor_kfunc generic_dtors[] = {
4980 		{
4981 			.btf_id       = generic_dtor_ids[0],
4982 			.kfunc_btf_id = generic_dtor_ids[1]
4983 		},
4984 #ifdef CONFIG_CGROUPS
4985 		{
4986 			.btf_id       = generic_dtor_ids[2],
4987 			.kfunc_btf_id = generic_dtor_ids[3]
4988 		},
4989 #endif
4990 	};
4991 
4992 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
4993 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
4994 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
4995 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
4996 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
4997 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set);
4998 	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
4999 						  ARRAY_SIZE(generic_dtors),
5000 						  THIS_MODULE);
5001 	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
5002 }
5003 
5004 late_initcall(kfunc_init);
5005 
5006 /* Get a pointer to dynptr data up to len bytes for read only access. If
5007  * the dynptr doesn't have continuous data up to len bytes, return NULL.
5008  */
5009 const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len)
5010 {
5011 	const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
5012 
5013 	return bpf_dynptr_slice(p, 0, NULL, len);
5014 }
5015 
5016 /* Get a pointer to dynptr data up to len bytes for read write access. If
5017  * the dynptr doesn't have continuous data up to len bytes, or the dynptr
5018  * is read only, return NULL.
5019  */
5020 void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len)
5021 {
5022 	if (__bpf_dynptr_is_rdonly(ptr))
5023 		return NULL;
5024 	return (void *)__bpf_dynptr_data(ptr, len);
5025 }
5026 
5027 void bpf_map_free_internal_structs(struct bpf_map *map, void *val)
5028 {
5029 	if (btf_record_has_field(map->record, BPF_TIMER))
5030 		bpf_obj_free_timer(map->record, val);
5031 	if (btf_record_has_field(map->record, BPF_WORKQUEUE))
5032 		bpf_obj_free_workqueue(map->record, val);
5033 	if (btf_record_has_field(map->record, BPF_TASK_WORK))
5034 		bpf_obj_free_task_work(map->record, val);
5035 }
5036