xref: /linux/kernel/bpf/helpers.c (revision 50dff00615522f3ec03449680ca23beb4cfc549c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  */
4 #include <linux/bpf.h>
5 #include <linux/btf.h>
6 #include <linux/bpf-cgroup.h>
7 #include <linux/cgroup.h>
8 #include <linux/rcupdate.h>
9 #include <linux/random.h>
10 #include <linux/smp.h>
11 #include <linux/topology.h>
12 #include <linux/ktime.h>
13 #include <linux/sched.h>
14 #include <linux/uidgid.h>
15 #include <linux/filter.h>
16 #include <linux/ctype.h>
17 #include <linux/jiffies.h>
18 #include <linux/pid_namespace.h>
19 #include <linux/poison.h>
20 #include <linux/proc_ns.h>
21 #include <linux/sched/task.h>
22 #include <linux/security.h>
23 #include <linux/btf_ids.h>
24 #include <linux/bpf_mem_alloc.h>
25 #include <linux/kasan.h>
26 #include <linux/bpf_verifier.h>
27 #include <linux/uaccess.h>
28 #include <linux/verification.h>
29 #include <linux/task_work.h>
30 #include <linux/irq_work.h>
31 #include <linux/buildid.h>
32 
33 #include "../../lib/kstrtox.h"
34 
35 /* If kernel subsystem is allowing eBPF programs to call this function,
36  * inside its own verifier_ops->get_func_proto() callback it should return
37  * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
38  *
39  * Different map implementations will rely on rcu in map methods
40  * lookup/update/delete, therefore eBPF programs must run under rcu lock
41  * if program is allowed to access maps, so check rcu_read_lock_held() or
42  * rcu_read_lock_trace_held() in all three functions.
43  */
44 BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
45 {
46 	WARN_ON_ONCE(!bpf_rcu_lock_held());
47 	return (unsigned long) map->ops->map_lookup_elem(map, key);
48 }
49 
50 const struct bpf_func_proto bpf_map_lookup_elem_proto = {
51 	.func		= bpf_map_lookup_elem,
52 	.gpl_only	= false,
53 	.pkt_access	= true,
54 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
55 	.arg1_type	= ARG_CONST_MAP_PTR,
56 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
57 };
58 
59 BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
60 	   void *, value, u64, flags)
61 {
62 	WARN_ON_ONCE(!bpf_rcu_lock_held());
63 	return map->ops->map_update_elem(map, key, value, flags);
64 }
65 
66 const struct bpf_func_proto bpf_map_update_elem_proto = {
67 	.func		= bpf_map_update_elem,
68 	.gpl_only	= false,
69 	.pkt_access	= true,
70 	.ret_type	= RET_INTEGER,
71 	.arg1_type	= ARG_CONST_MAP_PTR,
72 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
73 	.arg3_type	= ARG_PTR_TO_MAP_VALUE,
74 	.arg4_type	= ARG_ANYTHING,
75 };
76 
77 BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
78 {
79 	WARN_ON_ONCE(!bpf_rcu_lock_held());
80 	return map->ops->map_delete_elem(map, key);
81 }
82 
83 const struct bpf_func_proto bpf_map_delete_elem_proto = {
84 	.func		= bpf_map_delete_elem,
85 	.gpl_only	= false,
86 	.pkt_access	= true,
87 	.ret_type	= RET_INTEGER,
88 	.arg1_type	= ARG_CONST_MAP_PTR,
89 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
90 };
91 
92 BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
93 {
94 	return map->ops->map_push_elem(map, value, flags);
95 }
96 
97 const struct bpf_func_proto bpf_map_push_elem_proto = {
98 	.func		= bpf_map_push_elem,
99 	.gpl_only	= false,
100 	.pkt_access	= true,
101 	.ret_type	= RET_INTEGER,
102 	.arg1_type	= ARG_CONST_MAP_PTR,
103 	.arg2_type	= ARG_PTR_TO_MAP_VALUE,
104 	.arg3_type	= ARG_ANYTHING,
105 };
106 
107 BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
108 {
109 	return map->ops->map_pop_elem(map, value);
110 }
111 
112 const struct bpf_func_proto bpf_map_pop_elem_proto = {
113 	.func		= bpf_map_pop_elem,
114 	.gpl_only	= false,
115 	.ret_type	= RET_INTEGER,
116 	.arg1_type	= ARG_CONST_MAP_PTR,
117 	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
118 };
119 
120 BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
121 {
122 	return map->ops->map_peek_elem(map, value);
123 }
124 
125 const struct bpf_func_proto bpf_map_peek_elem_proto = {
126 	.func		= bpf_map_peek_elem,
127 	.gpl_only	= false,
128 	.ret_type	= RET_INTEGER,
129 	.arg1_type	= ARG_CONST_MAP_PTR,
130 	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
131 };
132 
133 BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
134 {
135 	WARN_ON_ONCE(!bpf_rcu_lock_held());
136 	return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
137 }
138 
139 const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
140 	.func		= bpf_map_lookup_percpu_elem,
141 	.gpl_only	= false,
142 	.pkt_access	= true,
143 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
144 	.arg1_type	= ARG_CONST_MAP_PTR,
145 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
146 	.arg3_type	= ARG_ANYTHING,
147 };
148 
149 const struct bpf_func_proto bpf_get_prandom_u32_proto = {
150 	.func		= bpf_user_rnd_u32,
151 	.gpl_only	= false,
152 	.ret_type	= RET_INTEGER,
153 };
154 
/* Report the value of smp_processor_id() to the calling program. */
BPF_CALL_0(bpf_get_smp_processor_id)
{
	return smp_processor_id();
}
159 
160 const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
161 	.func		= bpf_get_smp_processor_id,
162 	.gpl_only	= false,
163 	.ret_type	= RET_INTEGER,
164 	.allow_fastcall	= true,
165 };
166 
/* Report the value of numa_node_id() to the calling program. */
BPF_CALL_0(bpf_get_numa_node_id)
{
	return numa_node_id();
}
171 
172 const struct bpf_func_proto bpf_get_numa_node_id_proto = {
173 	.func		= bpf_get_numa_node_id,
174 	.gpl_only	= false,
175 	.ret_type	= RET_INTEGER,
176 };
177 
/* Monotonic clock reading, taken through the NMI-safe fast accessor. */
BPF_CALL_0(bpf_ktime_get_ns)
{
	return ktime_get_mono_fast_ns();
}
183 
184 const struct bpf_func_proto bpf_ktime_get_ns_proto = {
185 	.func		= bpf_ktime_get_ns,
186 	.gpl_only	= false,
187 	.ret_type	= RET_INTEGER,
188 };
189 
/* Boottime clock reading, taken through the NMI-safe fast accessor. */
BPF_CALL_0(bpf_ktime_get_boot_ns)
{
	return ktime_get_boot_fast_ns();
}
195 
196 const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
197 	.func		= bpf_ktime_get_boot_ns,
198 	.gpl_only	= false,
199 	.ret_type	= RET_INTEGER,
200 };
201 
/* Report the value of ktime_get_coarse_ns() to the calling program. */
BPF_CALL_0(bpf_ktime_get_coarse_ns)
{
	return ktime_get_coarse_ns();
}
206 
207 const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
208 	.func		= bpf_ktime_get_coarse_ns,
209 	.gpl_only	= false,
210 	.ret_type	= RET_INTEGER,
211 };
212 
/* TAI clock reading, taken through the NMI-safe fast accessor. */
BPF_CALL_0(bpf_ktime_get_tai_ns)
{
	return ktime_get_tai_fast_ns();
}
218 
219 const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
220 	.func		= bpf_ktime_get_tai_ns,
221 	.gpl_only	= false,
222 	.ret_type	= RET_INTEGER,
223 };
224 
225 BPF_CALL_0(bpf_get_current_pid_tgid)
226 {
227 	struct task_struct *task = current;
228 
229 	if (unlikely(!task))
230 		return -EINVAL;
231 
232 	return (u64) task->tgid << 32 | task->pid;
233 }
234 
235 const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
236 	.func		= bpf_get_current_pid_tgid,
237 	.gpl_only	= false,
238 	.ret_type	= RET_INTEGER,
239 };
240 
241 BPF_CALL_0(bpf_get_current_uid_gid)
242 {
243 	struct task_struct *task = current;
244 	kuid_t uid;
245 	kgid_t gid;
246 
247 	if (unlikely(!task))
248 		return -EINVAL;
249 
250 	current_uid_gid(&uid, &gid);
251 	return (u64) from_kgid(&init_user_ns, gid) << 32 |
252 		     from_kuid(&init_user_ns, uid);
253 }
254 
255 const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
256 	.func		= bpf_get_current_uid_gid,
257 	.gpl_only	= false,
258 	.ret_type	= RET_INTEGER,
259 };
260 
261 BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
262 {
263 	struct task_struct *task = current;
264 
265 	if (unlikely(!task))
266 		goto err_clear;
267 
268 	/* Verifier guarantees that size > 0 */
269 	strscpy_pad(buf, task->comm, size);
270 	return 0;
271 err_clear:
272 	memset(buf, 0, size);
273 	return -EINVAL;
274 }
275 
276 const struct bpf_func_proto bpf_get_current_comm_proto = {
277 	.func		= bpf_get_current_comm,
278 	.gpl_only	= false,
279 	.ret_type	= RET_INTEGER,
280 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
281 	.arg2_type	= ARG_CONST_SIZE,
282 };
283 
284 #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
285 
286 static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
287 {
288 	arch_spinlock_t *l = (void *)lock;
289 	union {
290 		__u32 val;
291 		arch_spinlock_t lock;
292 	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
293 
294 	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
295 	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
296 	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
297 	preempt_disable();
298 	arch_spin_lock(l);
299 }
300 
301 static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
302 {
303 	arch_spinlock_t *l = (void *)lock;
304 
305 	arch_spin_unlock(l);
306 	preempt_enable();
307 }
308 
309 #else
310 
311 static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
312 {
313 	atomic_t *l = (void *)lock;
314 
315 	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
316 	do {
317 		atomic_cond_read_relaxed(l, !VAL);
318 	} while (atomic_xchg(l, 1));
319 }
320 
321 static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
322 {
323 	atomic_t *l = (void *)lock;
324 
325 	atomic_set_release(l, 0);
326 }
327 
328 #endif
329 
/*
 * Per-CPU slot holding the IRQ flags saved by __bpf_spin_lock_irqsave()
 * until __bpf_spin_unlock_irqrestore() reads them back and restores them.
 */
330 static DEFINE_PER_CPU(unsigned long, irqsave_flags);
331 
332 static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
333 {
334 	unsigned long flags;
335 
336 	local_irq_save(flags);
337 	__bpf_spin_lock(lock);
338 	__this_cpu_write(irqsave_flags, flags);
339 }
340 
341 NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
342 {
343 	__bpf_spin_lock_irqsave(lock);
344 	return 0;
345 }
346 
347 const struct bpf_func_proto bpf_spin_lock_proto = {
348 	.func		= bpf_spin_lock,
349 	.gpl_only	= false,
350 	.ret_type	= RET_VOID,
351 	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
352 	.arg1_btf_id    = BPF_PTR_POISON,
353 };
354 
355 static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
356 {
357 	unsigned long flags;
358 
359 	flags = __this_cpu_read(irqsave_flags);
360 	__bpf_spin_unlock(lock);
361 	local_irq_restore(flags);
362 }
363 
364 NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
365 {
366 	__bpf_spin_unlock_irqrestore(lock);
367 	return 0;
368 }
369 
370 const struct bpf_func_proto bpf_spin_unlock_proto = {
371 	.func		= bpf_spin_unlock,
372 	.gpl_only	= false,
373 	.ret_type	= RET_VOID,
374 	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
375 	.arg1_btf_id    = BPF_PTR_POISON,
376 };
377 
378 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
379 			   bool lock_src)
380 {
381 	struct bpf_spin_lock *lock;
382 
383 	if (lock_src)
384 		lock = src + map->record->spin_lock_off;
385 	else
386 		lock = dst + map->record->spin_lock_off;
387 	preempt_disable();
388 	__bpf_spin_lock_irqsave(lock);
389 	copy_map_value(map, dst, src);
390 	__bpf_spin_unlock_irqrestore(lock);
391 	preempt_enable();
392 }
393 
/* Report the value of get_jiffies_64() to the calling program. */
BPF_CALL_0(bpf_jiffies64)
{
	return get_jiffies_64();
}
398 
399 const struct bpf_func_proto bpf_jiffies64_proto = {
400 	.func		= bpf_jiffies64,
401 	.gpl_only	= false,
402 	.ret_type	= RET_INTEGER,
403 };
404 
405 #ifdef CONFIG_CGROUPS
406 BPF_CALL_0(bpf_get_current_cgroup_id)
407 {
408 	struct cgroup *cgrp;
409 	u64 cgrp_id;
410 
411 	rcu_read_lock();
412 	cgrp = task_dfl_cgroup(current);
413 	cgrp_id = cgroup_id(cgrp);
414 	rcu_read_unlock();
415 
416 	return cgrp_id;
417 }
418 
419 const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
420 	.func		= bpf_get_current_cgroup_id,
421 	.gpl_only	= false,
422 	.ret_type	= RET_INTEGER,
423 };
424 
425 BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
426 {
427 	struct cgroup *cgrp;
428 	struct cgroup *ancestor;
429 	u64 cgrp_id;
430 
431 	rcu_read_lock();
432 	cgrp = task_dfl_cgroup(current);
433 	ancestor = cgroup_ancestor(cgrp, ancestor_level);
434 	cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
435 	rcu_read_unlock();
436 
437 	return cgrp_id;
438 }
439 
440 const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
441 	.func		= bpf_get_current_ancestor_cgroup_id,
442 	.gpl_only	= false,
443 	.ret_type	= RET_INTEGER,
444 	.arg1_type	= ARG_ANYTHING,
445 };
446 #endif /* CONFIG_CGROUPS */
447 
/*
 * Bits of the strtol/strtoul 'flags' argument that carry the numeric base.
 * __bpf_strtoull() rejects flags with any bit outside this mask set.
 */
448 #define BPF_STRTOX_BASE_MASK 0x1F
449 
450 static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
451 			  unsigned long long *res, bool *is_negative)
452 {
453 	unsigned int base = flags & BPF_STRTOX_BASE_MASK;
454 	const char *cur_buf = buf;
455 	size_t cur_len = buf_len;
456 	unsigned int consumed;
457 	size_t val_len;
458 	char str[64];
459 
460 	if (!buf || !buf_len || !res || !is_negative)
461 		return -EINVAL;
462 
463 	if (base != 0 && base != 8 && base != 10 && base != 16)
464 		return -EINVAL;
465 
466 	if (flags & ~BPF_STRTOX_BASE_MASK)
467 		return -EINVAL;
468 
469 	while (cur_buf < buf + buf_len && isspace(*cur_buf))
470 		++cur_buf;
471 
472 	*is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
473 	if (*is_negative)
474 		++cur_buf;
475 
476 	consumed = cur_buf - buf;
477 	cur_len -= consumed;
478 	if (!cur_len)
479 		return -EINVAL;
480 
481 	cur_len = min(cur_len, sizeof(str) - 1);
482 	memcpy(str, cur_buf, cur_len);
483 	str[cur_len] = '\0';
484 	cur_buf = str;
485 
486 	cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
487 	val_len = _parse_integer(cur_buf, base, res);
488 
489 	if (val_len & KSTRTOX_OVERFLOW)
490 		return -ERANGE;
491 
492 	if (val_len == 0)
493 		return -EINVAL;
494 
495 	cur_buf += val_len;
496 	consumed += cur_buf - str;
497 
498 	return consumed;
499 }
500 
501 static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
502 			 long long *res)
503 {
504 	unsigned long long _res;
505 	bool is_negative;
506 	int err;
507 
508 	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
509 	if (err < 0)
510 		return err;
511 	if (is_negative) {
512 		if ((long long)-_res > 0)
513 			return -ERANGE;
514 		*res = -_res;
515 	} else {
516 		if ((long long)_res < 0)
517 			return -ERANGE;
518 		*res = _res;
519 	}
520 	return err;
521 }
522 
523 BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
524 	   s64 *, res)
525 {
526 	long long _res;
527 	int err;
528 
529 	*res = 0;
530 	err = __bpf_strtoll(buf, buf_len, flags, &_res);
531 	if (err < 0)
532 		return err;
533 	*res = _res;
534 	return err;
535 }
536 
537 const struct bpf_func_proto bpf_strtol_proto = {
538 	.func		= bpf_strtol,
539 	.gpl_only	= false,
540 	.ret_type	= RET_INTEGER,
541 	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
542 	.arg2_type	= ARG_CONST_SIZE,
543 	.arg3_type	= ARG_ANYTHING,
544 	.arg4_type	= ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
545 	.arg4_size	= sizeof(s64),
546 };
547 
548 BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
549 	   u64 *, res)
550 {
551 	unsigned long long _res;
552 	bool is_negative;
553 	int err;
554 
555 	*res = 0;
556 	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
557 	if (err < 0)
558 		return err;
559 	if (is_negative)
560 		return -EINVAL;
561 	*res = _res;
562 	return err;
563 }
564 
565 const struct bpf_func_proto bpf_strtoul_proto = {
566 	.func		= bpf_strtoul,
567 	.gpl_only	= false,
568 	.ret_type	= RET_INTEGER,
569 	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
570 	.arg2_type	= ARG_CONST_SIZE,
571 	.arg3_type	= ARG_ANYTHING,
572 	.arg4_type	= ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
573 	.arg4_size	= sizeof(u64),
574 };
575 
576 BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
577 {
578 	return strncmp(s1, s2, s1_sz);
579 }
580 
581 static const struct bpf_func_proto bpf_strncmp_proto = {
582 	.func		= bpf_strncmp,
583 	.gpl_only	= false,
584 	.ret_type	= RET_INTEGER,
585 	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
586 	.arg2_type	= ARG_CONST_SIZE,
587 	.arg3_type	= ARG_PTR_TO_CONST_STR,
588 };
589 
590 BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
591 	   struct bpf_pidns_info *, nsdata, u32, size)
592 {
593 	struct task_struct *task = current;
594 	struct pid_namespace *pidns;
595 	int err = -EINVAL;
596 
597 	if (unlikely(size != sizeof(struct bpf_pidns_info)))
598 		goto clear;
599 
600 	if (unlikely((u64)(dev_t)dev != dev))
601 		goto clear;
602 
603 	if (unlikely(!task))
604 		goto clear;
605 
606 	pidns = task_active_pid_ns(task);
607 	if (unlikely(!pidns)) {
608 		err = -ENOENT;
609 		goto clear;
610 	}
611 
612 	if (!ns_match(&pidns->ns, (dev_t)dev, ino))
613 		goto clear;
614 
615 	nsdata->pid = task_pid_nr_ns(task, pidns);
616 	nsdata->tgid = task_tgid_nr_ns(task, pidns);
617 	return 0;
618 clear:
619 	memset((void *)nsdata, 0, (size_t) size);
620 	return err;
621 }
622 
623 const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
624 	.func		= bpf_get_ns_current_pid_tgid,
625 	.gpl_only	= false,
626 	.ret_type	= RET_INTEGER,
627 	.arg1_type	= ARG_ANYTHING,
628 	.arg2_type	= ARG_ANYTHING,
629 	.arg3_type      = ARG_PTR_TO_UNINIT_MEM,
630 	.arg4_type      = ARG_CONST_SIZE,
631 };
632 
633 static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
634 	.func		= bpf_get_raw_cpu_id,
635 	.gpl_only	= false,
636 	.ret_type	= RET_INTEGER,
637 };
638 
639 BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
640 	   u64, flags, void *, data, u64, size)
641 {
642 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
643 		return -EINVAL;
644 
645 	return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
646 }
647 
648 const struct bpf_func_proto bpf_event_output_data_proto =  {
649 	.func		= bpf_event_output_data,
650 	.gpl_only       = true,
651 	.ret_type       = RET_INTEGER,
652 	.arg1_type      = ARG_PTR_TO_CTX,
653 	.arg2_type      = ARG_CONST_MAP_PTR,
654 	.arg3_type      = ARG_ANYTHING,
655 	.arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
656 	.arg5_type      = ARG_CONST_SIZE_OR_ZERO,
657 };
658 
659 BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
660 	   const void __user *, user_ptr)
661 {
662 	int ret = copy_from_user(dst, user_ptr, size);
663 
664 	if (unlikely(ret)) {
665 		memset(dst, 0, size);
666 		ret = -EFAULT;
667 	}
668 
669 	return ret;
670 }
671 
672 const struct bpf_func_proto bpf_copy_from_user_proto = {
673 	.func		= bpf_copy_from_user,
674 	.gpl_only	= false,
675 	.might_sleep	= true,
676 	.ret_type	= RET_INTEGER,
677 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
678 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
679 	.arg3_type	= ARG_ANYTHING,
680 };
681 
682 BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
683 	   const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
684 {
685 	int ret;
686 
687 	/* flags is not used yet */
688 	if (unlikely(flags))
689 		return -EINVAL;
690 
691 	if (unlikely(!size))
692 		return 0;
693 
694 	ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
695 	if (ret == size)
696 		return 0;
697 
698 	memset(dst, 0, size);
699 	/* Return -EFAULT for partial read */
700 	return ret < 0 ? ret : -EFAULT;
701 }
702 
703 const struct bpf_func_proto bpf_copy_from_user_task_proto = {
704 	.func		= bpf_copy_from_user_task,
705 	.gpl_only	= true,
706 	.might_sleep	= true,
707 	.ret_type	= RET_INTEGER,
708 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
709 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
710 	.arg3_type	= ARG_ANYTHING,
711 	.arg4_type	= ARG_PTR_TO_BTF_ID,
712 	.arg4_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
713 	.arg5_type	= ARG_ANYTHING
714 };
715 
716 BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
717 {
718 	if (cpu >= nr_cpu_ids)
719 		return (unsigned long)NULL;
720 
721 	return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu);
722 }
723 
724 const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
725 	.func		= bpf_per_cpu_ptr,
726 	.gpl_only	= false,
727 	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
728 	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
729 	.arg2_type	= ARG_ANYTHING,
730 };
731 
732 BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
733 {
734 	return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr);
735 }
736 
737 const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
738 	.func		= bpf_this_cpu_ptr,
739 	.gpl_only	= false,
740 	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
741 	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
742 };
743 
744 static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
745 		size_t bufsz)
746 {
747 	void __user *user_ptr = (__force void __user *)unsafe_ptr;
748 
749 	buf[0] = 0;
750 
751 	switch (fmt_ptype) {
752 	case 's':
753 #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
754 		if ((unsigned long)unsafe_ptr < TASK_SIZE)
755 			return strncpy_from_user_nofault(buf, user_ptr, bufsz);
756 		fallthrough;
757 #endif
758 	case 'k':
759 		return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
760 	case 'u':
761 		return strncpy_from_user_nofault(buf, user_ptr, bufsz);
762 	}
763 
764 	return -EINVAL;
765 }
766 
767 /* Support executing three nested bprintf helper calls on a given CPU */
/* Number of per-CPU buffer sets, i.e. the deepest nesting that is served. */
768 #define MAX_BPRINTF_NEST_LEVEL	3
769 
/* One buffer set per nesting level on each CPU; see bpf_try_get_buffers(). */
770 static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
/* Per-CPU count of buffer sets currently handed out on this CPU. */
771 static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
772 
773 int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
774 {
775 	int nest_level;
776 
777 	preempt_disable();
778 	nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
779 	if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
780 		this_cpu_dec(bpf_bprintf_nest_level);
781 		preempt_enable();
782 		return -EBUSY;
783 	}
784 	*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
785 
786 	return 0;
787 }
788 
789 void bpf_put_buffers(void)
790 {
791 	if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
792 		return;
793 	this_cpu_dec(bpf_bprintf_nest_level);
794 	preempt_enable();
795 }
796 
797 void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
798 {
799 	if (!data->bin_args && !data->buf)
800 		return;
801 	bpf_put_buffers();
802 }
803 
804 /*
805  * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
806  *
807  * Returns a negative value if fmt is an invalid format string or 0 otherwise.
808  *
809  * This can be used in two ways:
810  * - Format string verification only: when data->get_bin_args is false
811  * - Arguments preparation: in addition to the above verification, it writes in
812  *   data->bin_args a binary representation of arguments usable by bstr_printf
813  *   where pointers from BPF have been sanitized.
814  *
815  * In argument preparation mode, if 0 is returned, safe temporary buffers are
816  * allocated and bpf_bprintf_cleanup should be called to free them after use.
817  */
818 int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
819 			u32 num_args, struct bpf_bprintf_data *data)
820 {
821 	bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
822 	char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
823 	struct bpf_bprintf_buffers *buffers = NULL;
824 	size_t sizeof_cur_arg, sizeof_cur_ip;
825 	int err, i, num_spec = 0;
826 	u64 cur_arg;
827 	char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";
828 
829 	fmt_end = strnchr(fmt, fmt_size, 0);
830 	if (!fmt_end)
831 		return -EINVAL;
832 	fmt_size = fmt_end - fmt;
833 
834 	if (get_buffers && bpf_try_get_buffers(&buffers))
835 		return -EBUSY;
836 
837 	if (data->get_bin_args) {
838 		if (num_args)
839 			tmp_buf = buffers->bin_args;
840 		tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
841 		data->bin_args = (u32 *)tmp_buf;
842 	}
843 
844 	if (data->get_buf)
845 		data->buf = buffers->buf;
846 
847 	for (i = 0; i < fmt_size; i++) {
848 		unsigned char c = fmt[i];
849 
850 		/*
851 		 * Permit bytes >= 0x80 in plain text so UTF-8 literals can pass
852 		 * through unchanged, while still rejecting ASCII control bytes.
853 		 */
854 		if (isascii(c) && !isprint(c) && !isspace(c)) {
855 			err = -EINVAL;
856 			goto out;
857 		}
858 
859 		if (fmt[i] != '%')
860 			continue;
861 
862 		if (fmt[i + 1] == '%') {
863 			i++;
864 			continue;
865 		}
866 
867 		if (num_spec >= num_args) {
868 			err = -EINVAL;
869 			goto out;
870 		}
871 
872 		/* The string is zero-terminated so if fmt[i] != 0, we can
873 		 * always access fmt[i + 1], in the worst case it will be a 0
874 		 */
875 		i++;
876 		c = fmt[i];
877 		/*
878 		 * The format parser below only understands ASCII conversion
879 		 * specifiers and modifiers, so reject non-ASCII after '%'.
880 		 */
881 		if (!isascii(c)) {
882 			err = -EINVAL;
883 			goto out;
884 		}
885 
886 		/* skip optional "[0 +-][num]" width formatting field */
887 		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-' ||
888 		       fmt[i] == ' ')
889 			i++;
890 		if (fmt[i] >= '1' && fmt[i] <= '9') {
891 			i++;
892 			while (fmt[i] >= '0' && fmt[i] <= '9')
893 				i++;
894 		}
895 
896 		if (fmt[i] == 'p') {
897 			sizeof_cur_arg = sizeof(long);
898 
899 			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
900 			    ispunct(fmt[i + 1])) {
901 				if (tmp_buf)
902 					cur_arg = raw_args[num_spec];
903 				goto nocopy_fmt;
904 			}
905 
906 			if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
907 			    fmt[i + 2] == 's') {
908 				fmt_ptype = fmt[i + 1];
909 				i += 2;
910 				goto fmt_str;
911 			}
912 
913 			if (fmt[i + 1] == 'K' ||
914 			    fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
915 			    fmt[i + 1] == 'S') {
916 				if (tmp_buf)
917 					cur_arg = raw_args[num_spec];
918 				i++;
919 				goto nocopy_fmt;
920 			}
921 
922 			if (fmt[i + 1] == 'B') {
923 				if (tmp_buf)  {
924 					err = snprintf(tmp_buf,
925 						       (tmp_buf_end - tmp_buf),
926 						       "%pB",
927 						       (void *)(long)raw_args[num_spec]);
928 					tmp_buf += (err + 1);
929 				}
930 
931 				i++;
932 				num_spec++;
933 				continue;
934 			}
935 
936 			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
937 			if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
938 			    (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
939 				err = -EINVAL;
940 				goto out;
941 			}
942 
943 			i += 2;
944 			if (!tmp_buf)
945 				goto nocopy_fmt;
946 
947 			sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
948 			if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
949 				err = -ENOSPC;
950 				goto out;
951 			}
952 
953 			unsafe_ptr = (char *)(long)raw_args[num_spec];
954 			err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
955 						       sizeof_cur_ip);
956 			if (err < 0)
957 				memset(cur_ip, 0, sizeof_cur_ip);
958 
959 			/* hack: bstr_printf expects IP addresses to be
960 			 * pre-formatted as strings, ironically, the easiest way
961 			 * to do that is to call snprintf.
962 			 */
963 			ip_spec[2] = fmt[i - 1];
964 			ip_spec[3] = fmt[i];
965 			err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
966 				       ip_spec, &cur_ip);
967 
968 			tmp_buf += err + 1;
969 			num_spec++;
970 
971 			continue;
972 		} else if (fmt[i] == 's') {
973 			fmt_ptype = fmt[i];
974 fmt_str:
975 			if (fmt[i + 1] != 0 &&
976 			    !isspace(fmt[i + 1]) &&
977 			    !ispunct(fmt[i + 1])) {
978 				err = -EINVAL;
979 				goto out;
980 			}
981 
982 			if (!tmp_buf)
983 				goto nocopy_fmt;
984 
985 			if (tmp_buf_end == tmp_buf) {
986 				err = -ENOSPC;
987 				goto out;
988 			}
989 
990 			unsafe_ptr = (char *)(long)raw_args[num_spec];
991 			err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
992 						    fmt_ptype,
993 						    tmp_buf_end - tmp_buf);
994 			if (err < 0) {
995 				tmp_buf[0] = '\0';
996 				err = 1;
997 			}
998 
999 			tmp_buf += err;
1000 			num_spec++;
1001 
1002 			continue;
1003 		} else if (fmt[i] == 'c') {
1004 			if (!tmp_buf)
1005 				goto nocopy_fmt;
1006 
1007 			if (tmp_buf_end == tmp_buf) {
1008 				err = -ENOSPC;
1009 				goto out;
1010 			}
1011 
1012 			*tmp_buf = raw_args[num_spec];
1013 			tmp_buf++;
1014 			num_spec++;
1015 
1016 			continue;
1017 		}
1018 
1019 		sizeof_cur_arg = sizeof(int);
1020 
1021 		if (fmt[i] == 'l') {
1022 			sizeof_cur_arg = sizeof(long);
1023 			i++;
1024 		}
1025 		if (fmt[i] == 'l') {
1026 			sizeof_cur_arg = sizeof(long long);
1027 			i++;
1028 		}
1029 
1030 		if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
1031 		    fmt[i] != 'x' && fmt[i] != 'X') {
1032 			err = -EINVAL;
1033 			goto out;
1034 		}
1035 
1036 		if (tmp_buf)
1037 			cur_arg = raw_args[num_spec];
1038 nocopy_fmt:
1039 		if (tmp_buf) {
1040 			tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
1041 			if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
1042 				err = -ENOSPC;
1043 				goto out;
1044 			}
1045 
1046 			if (sizeof_cur_arg == 8) {
1047 				*(u32 *)tmp_buf = *(u32 *)&cur_arg;
1048 				*(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
1049 			} else {
1050 				*(u32 *)tmp_buf = (u32)(long)cur_arg;
1051 			}
1052 			tmp_buf += sizeof_cur_arg;
1053 		}
1054 		num_spec++;
1055 	}
1056 
1057 	err = 0;
1058 out:
1059 	if (err)
1060 		bpf_bprintf_cleanup(data);
1061 	return err;
1062 }
1063 
1064 BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
1065 	   const void *, args, u32, data_len)
1066 {
1067 	struct bpf_bprintf_data data = {
1068 		.get_bin_args	= true,
1069 	};
1070 	int err, num_args;
1071 
1072 	if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
1073 	    (data_len && !args))
1074 		return -EINVAL;
1075 	num_args = data_len / 8;
1076 
1077 	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
1078 	 * can safely give an unbounded size.
1079 	 */
1080 	err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
1081 	if (err < 0)
1082 		return err;
1083 
1084 	err = bstr_printf(str, str_size, fmt, data.bin_args);
1085 
1086 	bpf_bprintf_cleanup(&data);
1087 
1088 	return err + 1;
1089 }
1090 
1091 const struct bpf_func_proto bpf_snprintf_proto = {
1092 	.func		= bpf_snprintf,
1093 	.gpl_only	= true,
1094 	.ret_type	= RET_INTEGER,
1095 	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE,
1096 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
1097 	.arg3_type	= ARG_PTR_TO_CONST_STR,
1098 	.arg4_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
1099 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
1100 };
1101 
1102 static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
1103 {
1104 	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
1105 		struct bpf_array *array = container_of(map, struct bpf_array, map);
1106 
1107 		*arr_idx = ((char *)value - array->value) / array->elem_size;
1108 		return arr_idx;
1109 	}
1110 	return (void *)value - round_up(map->key_size, 8);
1111 }
1112 
/* Flavour of async object, stored in bpf_async_cb.type. */
enum bpf_async_type {
	BPF_ASYNC_TYPE_TIMER	= 0,
	BPF_ASYNC_TYPE_WQ	= 1,
};
1117 
/* Operation carried by a struct bpf_async_cmd. */
enum bpf_async_op {
	BPF_ASYNC_START		= 0,
	BPF_ASYNC_CANCEL	= 1,
};
1122 
/*
 * One request against an async object: the operation to perform plus its
 * nsec/mode parameters, linkable into an llist.
 * NOTE(review): presumably queued on bpf_async_cb.async_cmds - confirm
 * against the code that produces and consumes these commands.
 */
1123 struct bpf_async_cmd {
1124 	struct llist_node node;
1125 	u64 nsec;
1126 	u32 mode;
1127 	enum bpf_async_op op;
1128 };
1129 
/*
 * State common to both async flavours. It is embedded as the first member
 * of struct bpf_hrtimer and struct bpf_work; 'type' records which of the
 * two this instance belongs to.
 */
1130 struct bpf_async_cb {
1131 	struct bpf_map *map;
1132 	struct bpf_prog *prog;
	/* RCU-protected; read with rcu_dereference_check() in bpf_timer_cb() */
1133 	void __rcu *callback_fn;
	/* map value this object lives in; passed to the callback */
1134 	void *value;
1135 	struct rcu_head rcu;
1136 	u64 flags;
1137 	struct irq_work worker;
1138 	refcount_t refcnt;
1139 	enum bpf_async_type type;
1140 	struct llist_head async_cmds;
1141 };
1142 
1143 /* BPF map elements can contain 'struct bpf_timer'.
1144  * Such map owns all of its BPF timers.
1145  * 'struct bpf_timer' is allocated as part of map element allocation
1146  * and it's zero initialized.
1147  * That space is used to keep 'struct bpf_async_kern'.
1148  * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
1149  * remembers 'struct bpf_map *' pointer it's part of.
1150  * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
1151  * bpf_timer_start() arms the timer.
1152  * If user space reference to a map goes to zero at this point
1153  * ops->map_release_uref callback is responsible for cancelling the timers,
1154  * freeing their memory, and decrementing prog's refcnts.
1155  * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
1156  * Inner maps can contain bpf timers as well. ops->map_release_uref is
1157  * freeing the timers when inner map is replaced or deleted by user space.
1158  */
1159 struct bpf_hrtimer {
1160 	struct bpf_async_cb cb;
1161 	struct hrtimer timer;
1162 	atomic_t cancelling;
1163 };
1164 
/* Kernel-side object behind 'struct bpf_wq'. */
struct bpf_work {
	struct bpf_async_cb cb;		/* must stay first: code casts between cb and bpf_work */
	struct work_struct work;	/* work item, handler is bpf_wq_work() */
};
1169 
/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
struct bpf_async_kern {
	/* One pointer viewed through three types; NULL until initialized. */
	union {
		struct bpf_async_cb *cb;
		struct bpf_hrtimer *timer;
		struct bpf_work *work;
	};
} __attribute__((aligned(8)));
1178 
/* Timer whose BPF callback is currently being run by bpf_timer_cb() on this CPU, or NULL. */
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);

/* Drops one reference on cb; the last put schedules the RCU-deferred free. */
static void bpf_async_refcount_put(struct bpf_async_cb *cb);
1182 
1183 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
1184 {
1185 	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
1186 	struct bpf_map *map = t->cb.map;
1187 	void *value = t->cb.value;
1188 	bpf_callback_t callback_fn;
1189 	void *key;
1190 	u32 idx;
1191 
1192 	BTF_TYPE_EMIT(struct bpf_timer);
1193 	callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
1194 	if (!callback_fn)
1195 		goto out;
1196 
1197 	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
1198 	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
1199 	 * Remember the timer this callback is servicing to prevent
1200 	 * deadlock if callback_fn() calls bpf_timer_cancel() or
1201 	 * bpf_map_delete_elem() on the same timer.
1202 	 */
1203 	this_cpu_write(hrtimer_running, t);
1204 
1205 	key = map_key_from_value(map, value, &idx);
1206 
1207 	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
1208 	/* The verifier checked that return value is zero. */
1209 
1210 	this_cpu_write(hrtimer_running, NULL);
1211 out:
1212 	return HRTIMER_NORESTART;
1213 }
1214 
1215 static void bpf_wq_work(struct work_struct *work)
1216 {
1217 	struct bpf_work *w = container_of(work, struct bpf_work, work);
1218 	struct bpf_async_cb *cb = &w->cb;
1219 	struct bpf_map *map = cb->map;
1220 	bpf_callback_t callback_fn;
1221 	void *value = cb->value;
1222 	void *key;
1223 	u32 idx;
1224 
1225 	BTF_TYPE_EMIT(struct bpf_wq);
1226 
1227 	callback_fn = READ_ONCE(cb->callback_fn);
1228 	if (!callback_fn)
1229 		return;
1230 
1231 	key = map_key_from_value(map, value, &idx);
1232 
1233         rcu_read_lock_trace();
1234         migrate_disable();
1235 
1236 	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
1237 
1238 	migrate_enable();
1239 	rcu_read_unlock_trace();
1240 }
1241 
1242 static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
1243 {
1244 	struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
1245 
1246 	/*
1247 	 * Drop the last reference to prog only after RCU GP, as set_callback()
1248 	 * may race with cancel_and_free()
1249 	 */
1250 	if (cb->prog)
1251 		bpf_prog_put(cb->prog);
1252 
1253 	kfree_nolock(cb);
1254 }
1255 
1256 /* Callback from call_rcu_tasks_trace, chains to call_rcu for final free */
1257 static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu)
1258 {
1259 	struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
1260 	struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
1261 	struct bpf_work *w = container_of(cb, struct bpf_work, cb);
1262 	bool retry = false;
1263 
1264 	/*
1265 	 * bpf_async_cancel_and_free() tried to cancel timer/wq, but it
1266 	 * could have raced with timer/wq_start. Now refcnt is zero and
1267 	 * srcu/rcu GP completed. Cancel timer/wq again.
1268 	 */
1269 	switch (cb->type) {
1270 	case BPF_ASYNC_TYPE_TIMER:
1271 		if (hrtimer_try_to_cancel(&t->timer) < 0)
1272 			retry = true;
1273 		break;
1274 	case BPF_ASYNC_TYPE_WQ:
1275 		if (!cancel_work(&w->work) && work_busy(&w->work))
1276 			retry = true;
1277 		break;
1278 	}
1279 	if (retry) {
1280 		/*
1281 		 * hrtimer or wq callback may still be running. It must be
1282 		 * in rcu_tasks_trace or rcu CS, so wait for GP again.
1283 		 * It won't retry forever, since refcnt zero prevents all
1284 		 * operations on timer/wq.
1285 		 */
1286 		call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
1287 		return;
1288 	}
1289 
1290 	/* RCU Tasks Trace grace period implies RCU grace period. */
1291 	bpf_async_cb_rcu_free(rcu);
1292 }
1293 
1294 static void worker_for_call_rcu(struct irq_work *work)
1295 {
1296 	struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
1297 
1298 	call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
1299 }
1300 
1301 static void bpf_async_refcount_put(struct bpf_async_cb *cb)
1302 {
1303 	if (!refcount_dec_and_test(&cb->refcnt))
1304 		return;
1305 
1306 	if (irqs_disabled()) {
1307 		cb->worker = IRQ_WORK_INIT(worker_for_call_rcu);
1308 		irq_work_queue(&cb->worker);
1309 	} else {
1310 		call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
1311 	}
1312 }
1313 
/* Forward declarations: both are used by __bpf_async_init() and defined further down. */
static void bpf_async_cancel_and_free(struct bpf_async_kern *async);
static void bpf_async_irq_worker(struct irq_work *work);
1316 
/*
 * Allocate and publish the kernel object behind a bpf_timer / bpf_wq.
 *
 * @async: the bpf_async_kern slot inside the map value
 * @map:   map that owns the element
 * @flags: stored in cb->flags; for timers the low bits select the clock
 * @type:  BPF_ASYNC_TYPE_TIMER or BPF_ASYNC_TYPE_WQ
 *
 * Returns 0 on success, -EINVAL for an unknown @type, -EBUSY if the slot is
 * already initialized (or another CPU won the race), -ENOMEM on allocation
 * failure and -EPERM if the map has no user references left.
 */
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
			    enum bpf_async_type type)
{
	struct bpf_async_cb *cb, *old_cb;
	struct bpf_hrtimer *t;
	struct bpf_work *w;
	clockid_t clockid;
	size_t size;

	switch (type) {
	case BPF_ASYNC_TYPE_TIMER:
		size = sizeof(struct bpf_hrtimer);
		break;
	case BPF_ASYNC_TYPE_WQ:
		size = sizeof(struct bpf_work);
		break;
	default:
		return -EINVAL;
	}

	/* Cheap early check; the cmpxchg() below is the authoritative one. */
	old_cb = READ_ONCE(async->cb);
	if (old_cb)
		return -EBUSY;

	cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
	if (!cb)
		return -ENOMEM;

	switch (type) {
	case BPF_ASYNC_TYPE_TIMER:
		clockid = flags & (MAX_CLOCKS - 1);
		t = (struct bpf_hrtimer *)cb;

		atomic_set(&t->cancelling, 0);
		hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
		/* async sits timer_off bytes into the map value */
		cb->value = (void *)async - map->record->timer_off;
		break;
	case BPF_ASYNC_TYPE_WQ:
		w = (struct bpf_work *)cb;

		INIT_WORK(&w->work, bpf_wq_work);
		/* async sits wq_off bytes into the map value */
		cb->value = (void *)async - map->record->wq_off;
		break;
	}
	cb->map = map;
	cb->prog = NULL;
	cb->flags = flags;
	cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker);
	init_llist_head(&cb->async_cmds);
	refcount_set(&cb->refcnt, 1); /* map's reference */
	cb->type = type;
	rcu_assign_pointer(cb->callback_fn, NULL);

	/* Publish the fully initialized object; only one initializer can win. */
	old_cb = cmpxchg(&async->cb, NULL, cb);
	if (old_cb) {
		/* Lost the race to initialize this bpf_async_kern, drop the allocated object */
		kfree_nolock(cb);
		return -EBUSY;
	}
	/* Guarantee the order between async->cb and map->usercnt. So
	 * when there are concurrent uref release and bpf timer init, either
	 * bpf_timer_cancel_and_free() called by uref release reads a non-NULL
	 * timer or atomic64_read() below returns a zero usercnt.
	 */
	smp_mb();
	if (!atomic64_read(&map->usercnt)) {
		/* maps with timers must be either held by user space
		 * or pinned in bpffs.
		 */
		bpf_async_cancel_and_free(async);
		return -EPERM;
	}

	return 0;
}
1392 
1393 BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
1394 	   u64, flags)
1395 {
1396 	clock_t clockid = flags & (MAX_CLOCKS - 1);
1397 
1398 	BUILD_BUG_ON(MAX_CLOCKS != 16);
1399 	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
1400 	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
1401 
1402 	if (flags >= MAX_CLOCKS ||
1403 	    /* similar to timerfd except _ALARM variants are not supported */
1404 	    (clockid != CLOCK_MONOTONIC &&
1405 	     clockid != CLOCK_REALTIME &&
1406 	     clockid != CLOCK_BOOTTIME))
1407 		return -EINVAL;
1408 
1409 	return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
1410 }
1411 
/* Verifier description of bpf_timer_init(timer, map, flags); GPL-only, returns an integer. */
static const struct bpf_func_proto bpf_timer_init_proto = {
	.func		= bpf_timer_init,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,	/* flags, validated inside the helper */
};
1420 
/*
 * Install @prog / @callback_fn into @cb (or clear both when they are NULL),
 * without a lock.
 *
 * cb->prog and cb->callback_fn are written separately, so the loop repeats
 * until both fields are observed to hold the values written by this caller.
 * Every iteration takes one reference on @prog for cb->prog and releases the
 * reference of whatever prog it displaced.
 *
 * Returns 0 on success, or the error from bpf_prog_inc_not_zero() when the
 * initial guard reference on @prog cannot be taken.
 */
static int bpf_async_update_prog_callback(struct bpf_async_cb *cb,
					  struct bpf_prog *prog,
					  void *callback_fn)
{
	struct bpf_prog *prev;

	/* Acquire a guard reference on prog to prevent it from being freed during the loop */
	if (prog) {
		prog = bpf_prog_inc_not_zero(prog);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	do {
		/* Reference that will be owned by cb->prog after the xchg(). */
		if (prog)
			prog = bpf_prog_inc_not_zero(prog);
		prev = xchg(&cb->prog, prog);
		rcu_assign_pointer(cb->callback_fn, callback_fn);

		/*
		 * Release previous prog, make sure that if other CPU is contending,
		 * to set bpf_prog, references are not leaked as each iteration acquires and
		 * releases one reference.
		 */
		if (prev)
			bpf_prog_put(prev);

	} while (READ_ONCE(cb->prog) != prog ||
		 (void __force *)READ_ONCE(cb->callback_fn) != callback_fn);

	/* Drop the guard reference taken before the loop. */
	if (prog)
		bpf_prog_put(prog);

	return 0;
}
1456 
/* cb whose async_cmds are being processed by bpf_async_irq_worker() on this CPU, or NULL. */
static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running);
1458 
1459 static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op,
1460 				 u64 nsec, u32 timer_mode)
1461 {
1462 	/*
1463 	 * Do not schedule another operation on this cpu if it's in irq_work
1464 	 * callback that is processing async_cmds queue. Otherwise the following
1465 	 * loop is possible:
1466 	 * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue().
1467 	 * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start().
1468 	 */
1469 	if (this_cpu_read(async_cb_running) == cb) {
1470 		bpf_async_refcount_put(cb);
1471 		return -EDEADLK;
1472 	}
1473 
1474 	struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE);
1475 
1476 	if (!cmd) {
1477 		bpf_async_refcount_put(cb);
1478 		return -ENOMEM;
1479 	}
1480 	init_llist_node(&cmd->node);
1481 	cmd->nsec = nsec;
1482 	cmd->mode = timer_mode;
1483 	cmd->op = op;
1484 	if (llist_add(&cmd->node, &cb->async_cmds))
1485 		irq_work_queue(&cb->worker);
1486 	return 0;
1487 }
1488 
1489 static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
1490 				    struct bpf_prog *prog)
1491 {
1492 	struct bpf_async_cb *cb;
1493 
1494 	cb = READ_ONCE(async->cb);
1495 	if (!cb)
1496 		return -EINVAL;
1497 
1498 	return bpf_async_update_prog_callback(cb, prog, callback_fn);
1499 }
1500 
1501 BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
1502 	   struct bpf_prog_aux *, aux)
1503 {
1504 	return __bpf_async_set_callback(timer, callback_fn, aux->prog);
1505 }
1506 
/*
 * Verifier description of bpf_timer_set_callback(timer, callback_fn);
 * GPL-only, returns an integer.
 * NOTE(review): the helper body takes a third 'aux' argument that has no
 * arg type here - presumably supplied by the verifier; confirm.
 */
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
	.func		= bpf_timer_set_callback,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_PTR_TO_FUNC,
};
1514 
1515 static bool defer_timer_wq_op(void)
1516 {
1517 	return in_hardirq() || irqs_disabled();
1518 }
1519 
1520 BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags)
1521 {
1522 	struct bpf_hrtimer *t;
1523 	u32 mode;
1524 
1525 	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
1526 		return -EINVAL;
1527 
1528 	t = READ_ONCE(async->timer);
1529 	if (!t || !READ_ONCE(t->cb.prog))
1530 		return -EINVAL;
1531 
1532 	if (flags & BPF_F_TIMER_ABS)
1533 		mode = HRTIMER_MODE_ABS_SOFT;
1534 	else
1535 		mode = HRTIMER_MODE_REL_SOFT;
1536 
1537 	if (flags & BPF_F_TIMER_CPU_PIN)
1538 		mode |= HRTIMER_MODE_PINNED;
1539 
1540 	/*
1541 	 * bpf_async_cancel_and_free() could have dropped refcnt to zero. In
1542 	 * such case BPF progs are not allowed to arm the timer to prevent UAF.
1543 	 */
1544 	if (!refcount_inc_not_zero(&t->cb.refcnt))
1545 		return -ENOENT;
1546 
1547 	if (!defer_timer_wq_op()) {
1548 		hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
1549 		bpf_async_refcount_put(&t->cb);
1550 		return 0;
1551 	} else {
1552 		return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode);
1553 	}
1554 }
1555 
/* Verifier description of bpf_timer_start(timer, nsecs, flags); GPL-only, returns an integer. */
static const struct bpf_func_proto bpf_timer_start_proto = {
	.func		= bpf_timer_start,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
	.arg2_type	= ARG_ANYTHING,	/* nsecs */
	.arg3_type	= ARG_ANYTHING,	/* flags, validated inside the helper */
};
1564 
/*
 * bpf_timer_cancel() helper: clear the callback of the timer in @async and
 * cancel it, waiting for a running callback to finish.
 *
 * Returns the result of hrtimer_cancel() on success, -EOPNOTSUPP when called
 * in hard-irq context or with interrupts disabled, -EINVAL for an
 * uninitialized timer and -EDEADLK when waiting could deadlock (cancelling
 * the timer whose callback is the caller, or a cancel cycle between timer
 * callbacks).
 */
BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async)
{
	struct bpf_hrtimer *t, *cur_t;
	bool inc = false;
	int ret = 0;

	/* This helper cancels synchronously; it has no deferred variant. */
	if (defer_timer_wq_op())
		return -EOPNOTSUPP;

	t = READ_ONCE(async->timer);
	if (!t)
		return -EINVAL;

	/* Non-NULL only when called from within a timer callback on this CPU. */
	cur_t = this_cpu_read(hrtimer_running);
	if (cur_t == t) {
		/* If bpf callback_fn is trying to bpf_timer_cancel()
		 * its own timer the hrtimer_cancel() will deadlock
		 * since it waits for callback_fn to finish.
		 */
		return -EDEADLK;
	}

	/* Only account in-flight cancellations when invoked from a timer
	 * callback, since we want to avoid waiting only if other _callbacks_
	 * are waiting on us, to avoid introducing lockups. Non-callback paths
	 * are ok, since nobody would synchronously wait for their completion.
	 */
	if (!cur_t)
		goto drop;
	atomic_inc(&t->cancelling);
	/* Need full barrier after relaxed atomic_inc */
	smp_mb__after_atomic();
	inc = true;
	if (atomic_read(&cur_t->cancelling)) {
		/* We're cancelling timer t, while some other timer callback is
		 * attempting to cancel us. In such a case, it might be possible
		 * that timer t belongs to the other callback, or some other
		 * callback waiting upon it (creating transitive dependencies
		 * upon us), and we will enter a deadlock if we continue
		 * cancelling and waiting for it synchronously, since it might
		 * do the same. Bail!
		 */
		atomic_dec(&t->cancelling);
		return -EDEADLK;
	}
drop:
	/* Clear prog and callback_fn before cancelling. */
	bpf_async_update_prog_callback(&t->cb, NULL, NULL);
	/* Cancel the timer and wait for associated callback to finish
	 * if it was running.
	 */
	ret = hrtimer_cancel(&t->timer);
	if (inc)
		atomic_dec(&t->cancelling);
	return ret;
}
1620 
/* Verifier description of bpf_timer_cancel(timer); GPL-only, returns an integer. */
static const struct bpf_func_proto bpf_timer_cancel_proto = {
	.func		= bpf_timer_cancel,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_TIMER,
};
1627 
1628 static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op,
1629 				 u64 timer_nsec, u32 timer_mode)
1630 {
1631 	switch (cb->type) {
1632 	case BPF_ASYNC_TYPE_TIMER: {
1633 		struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
1634 
1635 		switch (op) {
1636 		case BPF_ASYNC_START:
1637 			hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode);
1638 			break;
1639 		case BPF_ASYNC_CANCEL:
1640 			hrtimer_try_to_cancel(&t->timer);
1641 			break;
1642 		}
1643 		break;
1644 	}
1645 	case BPF_ASYNC_TYPE_WQ: {
1646 		struct bpf_work *w = container_of(cb, struct bpf_work, cb);
1647 
1648 		switch (op) {
1649 		case BPF_ASYNC_START:
1650 			schedule_work(&w->work);
1651 			break;
1652 		case BPF_ASYNC_CANCEL:
1653 			cancel_work(&w->work);
1654 			break;
1655 		}
1656 		break;
1657 	}
1658 	}
1659 	bpf_async_refcount_put(cb);
1660 }
1661 
1662 static void bpf_async_irq_worker(struct irq_work *work)
1663 {
1664 	struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
1665 	struct llist_node *pos, *n, *list;
1666 
1667 	list = llist_del_all(&cb->async_cmds);
1668 	if (!list)
1669 		return;
1670 
1671 	list = llist_reverse_order(list);
1672 	this_cpu_write(async_cb_running, cb);
1673 	llist_for_each_safe(pos, n, list) {
1674 		struct bpf_async_cmd *cmd;
1675 
1676 		cmd = container_of(pos, struct bpf_async_cmd, node);
1677 		bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode);
1678 		kfree_nolock(cmd);
1679 	}
1680 	this_cpu_write(async_cb_running, NULL);
1681 }
1682 
/*
 * Detach the kernel object from @async, clear its prog/callback, request a
 * cancel and drop the map's reference so the object gets freed. A no-op
 * when @async was never initialized or was already detached.
 */
static void bpf_async_cancel_and_free(struct bpf_async_kern *async)
{
	struct bpf_async_cb *cb;

	/* Cheap check first to avoid the xchg() on empty slots. */
	if (!READ_ONCE(async->cb))
		return;

	/* Only the caller that gets the non-NULL pointer proceeds. */
	cb = xchg(&async->cb, NULL);
	if (!cb)
		return;

	bpf_async_update_prog_callback(cb, NULL, NULL);
	/*
	 * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last
	 * refcnt. Either synchronously or asynchronously in irq_work.
	 */

	if (!defer_timer_wq_op()) {
		bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0);
	} else {
		(void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
		/*
		 * bpf_async_schedule_op() either enqueues allocated cmd into llist
		 * or fails with ENOMEM and drop the last refcnt.
		 * This is unlikely, but safe, since bpf_async_cb_rcu_tasks_trace_free()
		 * callback will do additional timer/wq_cancel due to races anyway.
		 */
	}
}
1712 
/*
 * This function is called by map_delete/update_elem for individual element and
 * by ops->map_release_uref when the user space reference to a map reaches zero.
 *
 * @val points at the bpf_timer slot inside the map value; it is handed to
 * bpf_async_cancel_and_free() as a struct bpf_async_kern.
 */
void bpf_timer_cancel_and_free(void *val)
{
	bpf_async_cancel_and_free(val);
}
1721 
/*
 * This function is called by map_delete/update_elem for individual element and
 * by ops->map_release_uref when the user space reference to a map reaches zero.
 *
 * @val points at the bpf_wq slot inside the map value; it is handed to
 * bpf_async_cancel_and_free() as a struct bpf_async_kern.
 */
void bpf_wq_cancel_and_free(void *val)
{
	bpf_async_cancel_and_free(val);
}
1730 
1731 BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
1732 {
1733 	unsigned long *kptr = dst;
1734 
1735 	/* This helper may be inlined by verifier. */
1736 	return xchg(kptr, (unsigned long)ptr);
1737 }
1738 
/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
 * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
 * denote type that verifier will determine.
 *
 * Not GPL-only. The second argument is marked OBJ_RELEASE.
 */
static const struct bpf_func_proto bpf_kptr_xchg_proto = {
	.func         = bpf_kptr_xchg,
	.gpl_only     = false,
	.ret_type     = RET_PTR_TO_BTF_ID_OR_NULL,
	.ret_btf_id   = BPF_PTR_POISON,
	.arg1_type    = ARG_KPTR_XCHG_DEST,
	.arg2_type    = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
	.arg2_btf_id  = BPF_PTR_POISON,
};
1752 
/* Object that bpf_dynptr_kern::data points to for BPF_DYNPTR_TYPE_FILE dynptrs. */
struct bpf_dynptr_file_impl {
	struct freader freader;	/* reader used by bpf_file_fetch_bytes() */
	/* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */
	u64 offset;
	u64 size;
};
1759 
/* Since the upper 8 bits of dynptr->size are reserved, the
 * maximum supported size is 2^24 - 1.
 */
#define DYNPTR_MAX_SIZE	((1UL << 24) - 1)
/* The dynptr type is stored in dynptr->size starting at this bit. */
#define DYNPTR_TYPE_SHIFT	28
/* Low 24 bits of dynptr->size hold the actual size. */
#define DYNPTR_SIZE_MASK	0xFFFFFF
/* Top bit of dynptr->size marks the dynptr read-only. */
#define DYNPTR_RDONLY_BIT	BIT(31)
1767 
1768 bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
1769 {
1770 	return ptr->size & DYNPTR_RDONLY_BIT;
1771 }
1772 
1773 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
1774 {
1775 	ptr->size |= DYNPTR_RDONLY_BIT;
1776 }
1777 
1778 static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
1779 {
1780 	ptr->size |= type << DYNPTR_TYPE_SHIFT;
1781 }
1782 
1783 static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
1784 {
1785 	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
1786 }
1787 
1788 u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
1789 {
1790 	if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
1791 		struct bpf_dynptr_file_impl *df = ptr->data;
1792 
1793 		return df->size;
1794 	}
1795 
1796 	return ptr->size & DYNPTR_SIZE_MASK;
1797 }
1798 
1799 static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off)
1800 {
1801 	if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
1802 		struct bpf_dynptr_file_impl *df = ptr->data;
1803 
1804 		df->offset += off;
1805 		return;
1806 	}
1807 	ptr->offset += off;
1808 }
1809 
1810 static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size)
1811 {
1812 	u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
1813 
1814 	if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
1815 		struct bpf_dynptr_file_impl *df = ptr->data;
1816 
1817 		df->size = new_size;
1818 		return;
1819 	}
1820 	ptr->size = (u32)new_size | metadata;
1821 }
1822 
1823 int bpf_dynptr_check_size(u64 size)
1824 {
1825 	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
1826 }
1827 
1828 static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len)
1829 {
1830 	const void *ptr;
1831 
1832 	if (!buf)
1833 		return -EINVAL;
1834 
1835 	df->freader.buf = buf;
1836 	df->freader.buf_sz = len;
1837 	ptr = freader_fetch(&df->freader, offset + df->offset, len);
1838 	if (!ptr)
1839 		return df->freader.err;
1840 
1841 	if (ptr != buf) /* Force copying into the buffer */
1842 		memcpy(buf, ptr, len);
1843 
1844 	return 0;
1845 }
1846 
1847 void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
1848 		     enum bpf_dynptr_type type, u32 offset, u32 size)
1849 {
1850 	ptr->data = data;
1851 	ptr->offset = offset;
1852 	ptr->size = size;
1853 	bpf_dynptr_set_type(ptr, type);
1854 }
1855 
1856 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
1857 {
1858 	memset(ptr, 0, sizeof(*ptr));
1859 }
1860 
1861 BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr)
1862 {
1863 	int err;
1864 
1865 	BTF_TYPE_EMIT(struct bpf_dynptr);
1866 
1867 	err = bpf_dynptr_check_size(size);
1868 	if (err)
1869 		goto error;
1870 
1871 	/* flags is currently unsupported */
1872 	if (flags) {
1873 		err = -EINVAL;
1874 		goto error;
1875 	}
1876 
1877 	bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
1878 
1879 	return 0;
1880 
1881 error:
1882 	bpf_dynptr_set_null(ptr);
1883 	return err;
1884 }
1885 
/* Verifier description of bpf_dynptr_from_mem(data, size, flags, ptr); returns an integer. */
static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
	.func		= bpf_dynptr_from_mem,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,	/* flags, must be zero (checked in the helper) */
	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
1895 
1896 static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src,
1897 			     u64 offset, u64 flags)
1898 {
1899 	enum bpf_dynptr_type type;
1900 	int err;
1901 
1902 	if (!src->data || flags)
1903 		return -EINVAL;
1904 
1905 	err = bpf_dynptr_check_off_len(src, offset, len);
1906 	if (err)
1907 		return err;
1908 
1909 	type = bpf_dynptr_get_type(src);
1910 
1911 	switch (type) {
1912 	case BPF_DYNPTR_TYPE_LOCAL:
1913 	case BPF_DYNPTR_TYPE_RINGBUF:
1914 		/* Source and destination may possibly overlap, hence use memmove to
1915 		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
1916 		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
1917 		 */
1918 		memmove(dst, src->data + src->offset + offset, len);
1919 		return 0;
1920 	case BPF_DYNPTR_TYPE_SKB:
1921 		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
1922 	case BPF_DYNPTR_TYPE_XDP:
1923 		return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
1924 	case BPF_DYNPTR_TYPE_SKB_META:
1925 		memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
1926 		return 0;
1927 	case BPF_DYNPTR_TYPE_FILE:
1928 		return bpf_file_fetch_bytes(src->data, offset, dst, len);
1929 	default:
1930 		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
1931 		return -EFAULT;
1932 	}
1933 }
1934 
/* bpf_dynptr_read() helper: BPF-callable entry point, forwards to __bpf_dynptr_read(). */
BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src,
	   u64, offset, u64, flags)
{
	return __bpf_dynptr_read(dst, len, src, offset, flags);
}
1940 
/* Verifier description of bpf_dynptr_read(dst, len, src, offset, flags); returns an integer. */
static const struct bpf_func_proto bpf_dynptr_read_proto = {
	.func		= bpf_dynptr_read,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_DYNPTR,
	.arg4_type	= ARG_ANYTHING,	/* offset, range-checked in the helper */
	.arg5_type	= ARG_ANYTHING,	/* flags, must be zero (checked in the helper) */
};
1951 
1952 int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src,
1953 		       u64 len, u64 flags)
1954 {
1955 	enum bpf_dynptr_type type;
1956 	int err;
1957 
1958 	if (!dst->data || __bpf_dynptr_is_rdonly(dst))
1959 		return -EINVAL;
1960 
1961 	err = bpf_dynptr_check_off_len(dst, offset, len);
1962 	if (err)
1963 		return err;
1964 
1965 	type = bpf_dynptr_get_type(dst);
1966 
1967 	switch (type) {
1968 	case BPF_DYNPTR_TYPE_LOCAL:
1969 	case BPF_DYNPTR_TYPE_RINGBUF:
1970 		if (flags)
1971 			return -EINVAL;
1972 		/* Source and destination may possibly overlap, hence use memmove to
1973 		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
1974 		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
1975 		 */
1976 		memmove(dst->data + dst->offset + offset, src, len);
1977 		return 0;
1978 	case BPF_DYNPTR_TYPE_SKB:
1979 		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
1980 					     flags);
1981 	case BPF_DYNPTR_TYPE_XDP:
1982 		if (flags)
1983 			return -EINVAL;
1984 		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
1985 	case BPF_DYNPTR_TYPE_SKB_META:
1986 		return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src,
1987 						  len, flags);
1988 	default:
1989 		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
1990 		return -EFAULT;
1991 	}
1992 }
1993 
/* bpf_dynptr_write() helper: BPF-callable entry point, forwards to __bpf_dynptr_write(). */
BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src,
	   u64, len, u64, flags)
{
	return __bpf_dynptr_write(dst, offset, src, len, flags);
}
1999 
/* Verifier description of bpf_dynptr_write(dst, offset, src, len, flags); returns an integer. */
static const struct bpf_func_proto bpf_dynptr_write_proto = {
	.func		= bpf_dynptr_write,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_DYNPTR,
	.arg2_type	= ARG_ANYTHING,	/* offset, range-checked in the helper */
	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg5_type	= ARG_ANYTHING,	/* flags, interpretation depends on the dynptr type */
};
2010 
2011 BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len)
2012 {
2013 	enum bpf_dynptr_type type;
2014 	int err;
2015 
2016 	if (!ptr->data)
2017 		return 0;
2018 
2019 	err = bpf_dynptr_check_off_len(ptr, offset, len);
2020 	if (err)
2021 		return 0;
2022 
2023 	if (__bpf_dynptr_is_rdonly(ptr))
2024 		return 0;
2025 
2026 	type = bpf_dynptr_get_type(ptr);
2027 
2028 	switch (type) {
2029 	case BPF_DYNPTR_TYPE_LOCAL:
2030 	case BPF_DYNPTR_TYPE_RINGBUF:
2031 		return (unsigned long)(ptr->data + ptr->offset + offset);
2032 	case BPF_DYNPTR_TYPE_SKB:
2033 	case BPF_DYNPTR_TYPE_XDP:
2034 	case BPF_DYNPTR_TYPE_SKB_META:
2035 		/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
2036 		return 0;
2037 	default:
2038 		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
2039 		return 0;
2040 	}
2041 }
2042 
/* Verifier description of bpf_dynptr_data(ptr, offset, len); returns dynptr memory or NULL. */
static const struct bpf_func_proto bpf_dynptr_data_proto = {
	.func		= bpf_dynptr_data,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_DYNPTR_MEM_OR_NULL,
	.arg1_type	= ARG_PTR_TO_DYNPTR,
	.arg2_type	= ARG_ANYTHING,	/* offset, range-checked in the helper */
	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
};
2051 
/*
 * Weak definitions of helper protos that bpf_base_func_proto() refers to.
 * NOTE(review): presumably overridden by strong definitions in the files
 * that implement these helpers when those are built - confirm.
 */
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
const struct bpf_func_proto bpf_perf_event_read_proto __weak;
const struct bpf_func_proto bpf_send_signal_proto __weak;
const struct bpf_func_proto bpf_send_signal_thread_proto __weak;
const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak;
const struct bpf_func_proto bpf_get_task_stack_proto __weak;
const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak;
2065 
2066 const struct bpf_func_proto *
2067 bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2068 {
2069 	switch (func_id) {
2070 	case BPF_FUNC_map_lookup_elem:
2071 		return &bpf_map_lookup_elem_proto;
2072 	case BPF_FUNC_map_update_elem:
2073 		return &bpf_map_update_elem_proto;
2074 	case BPF_FUNC_map_delete_elem:
2075 		return &bpf_map_delete_elem_proto;
2076 	case BPF_FUNC_map_push_elem:
2077 		return &bpf_map_push_elem_proto;
2078 	case BPF_FUNC_map_pop_elem:
2079 		return &bpf_map_pop_elem_proto;
2080 	case BPF_FUNC_map_peek_elem:
2081 		return &bpf_map_peek_elem_proto;
2082 	case BPF_FUNC_map_lookup_percpu_elem:
2083 		return &bpf_map_lookup_percpu_elem_proto;
2084 	case BPF_FUNC_get_prandom_u32:
2085 		return &bpf_get_prandom_u32_proto;
2086 	case BPF_FUNC_get_smp_processor_id:
2087 		return &bpf_get_raw_smp_processor_id_proto;
2088 	case BPF_FUNC_get_numa_node_id:
2089 		return &bpf_get_numa_node_id_proto;
2090 	case BPF_FUNC_tail_call:
2091 		return &bpf_tail_call_proto;
2092 	case BPF_FUNC_ktime_get_ns:
2093 		return &bpf_ktime_get_ns_proto;
2094 	case BPF_FUNC_ktime_get_boot_ns:
2095 		return &bpf_ktime_get_boot_ns_proto;
2096 	case BPF_FUNC_ktime_get_tai_ns:
2097 		return &bpf_ktime_get_tai_ns_proto;
2098 	case BPF_FUNC_ringbuf_output:
2099 		return &bpf_ringbuf_output_proto;
2100 	case BPF_FUNC_ringbuf_reserve:
2101 		return &bpf_ringbuf_reserve_proto;
2102 	case BPF_FUNC_ringbuf_submit:
2103 		return &bpf_ringbuf_submit_proto;
2104 	case BPF_FUNC_ringbuf_discard:
2105 		return &bpf_ringbuf_discard_proto;
2106 	case BPF_FUNC_ringbuf_query:
2107 		return &bpf_ringbuf_query_proto;
2108 	case BPF_FUNC_strncmp:
2109 		return &bpf_strncmp_proto;
2110 	case BPF_FUNC_strtol:
2111 		return &bpf_strtol_proto;
2112 	case BPF_FUNC_strtoul:
2113 		return &bpf_strtoul_proto;
2114 	case BPF_FUNC_get_current_pid_tgid:
2115 		return &bpf_get_current_pid_tgid_proto;
2116 	case BPF_FUNC_get_ns_current_pid_tgid:
2117 		return &bpf_get_ns_current_pid_tgid_proto;
2118 	case BPF_FUNC_get_current_uid_gid:
2119 		return &bpf_get_current_uid_gid_proto;
2120 	default:
2121 		break;
2122 	}
2123 
2124 	if (!bpf_token_capable(prog->aux->token, CAP_BPF))
2125 		return NULL;
2126 
2127 	switch (func_id) {
2128 	case BPF_FUNC_spin_lock:
2129 		return &bpf_spin_lock_proto;
2130 	case BPF_FUNC_spin_unlock:
2131 		return &bpf_spin_unlock_proto;
2132 	case BPF_FUNC_jiffies64:
2133 		return &bpf_jiffies64_proto;
2134 	case BPF_FUNC_per_cpu_ptr:
2135 		return &bpf_per_cpu_ptr_proto;
2136 	case BPF_FUNC_this_cpu_ptr:
2137 		return &bpf_this_cpu_ptr_proto;
2138 	case BPF_FUNC_timer_init:
2139 		return &bpf_timer_init_proto;
2140 	case BPF_FUNC_timer_set_callback:
2141 		return &bpf_timer_set_callback_proto;
2142 	case BPF_FUNC_timer_start:
2143 		return &bpf_timer_start_proto;
2144 	case BPF_FUNC_timer_cancel:
2145 		return &bpf_timer_cancel_proto;
2146 	case BPF_FUNC_kptr_xchg:
2147 		return &bpf_kptr_xchg_proto;
2148 	case BPF_FUNC_for_each_map_elem:
2149 		return &bpf_for_each_map_elem_proto;
2150 	case BPF_FUNC_loop:
2151 		return &bpf_loop_proto;
2152 	case BPF_FUNC_user_ringbuf_drain:
2153 		return &bpf_user_ringbuf_drain_proto;
2154 	case BPF_FUNC_ringbuf_reserve_dynptr:
2155 		return &bpf_ringbuf_reserve_dynptr_proto;
2156 	case BPF_FUNC_ringbuf_submit_dynptr:
2157 		return &bpf_ringbuf_submit_dynptr_proto;
2158 	case BPF_FUNC_ringbuf_discard_dynptr:
2159 		return &bpf_ringbuf_discard_dynptr_proto;
2160 	case BPF_FUNC_dynptr_from_mem:
2161 		return &bpf_dynptr_from_mem_proto;
2162 	case BPF_FUNC_dynptr_read:
2163 		return &bpf_dynptr_read_proto;
2164 	case BPF_FUNC_dynptr_write:
2165 		return &bpf_dynptr_write_proto;
2166 	case BPF_FUNC_dynptr_data:
2167 		return &bpf_dynptr_data_proto;
2168 #ifdef CONFIG_CGROUPS
2169 	case BPF_FUNC_cgrp_storage_get:
2170 		return &bpf_cgrp_storage_get_proto;
2171 	case BPF_FUNC_cgrp_storage_delete:
2172 		return &bpf_cgrp_storage_delete_proto;
2173 	case BPF_FUNC_get_current_cgroup_id:
2174 		return &bpf_get_current_cgroup_id_proto;
2175 	case BPF_FUNC_get_current_ancestor_cgroup_id:
2176 		return &bpf_get_current_ancestor_cgroup_id_proto;
2177 	case BPF_FUNC_current_task_under_cgroup:
2178 		return &bpf_current_task_under_cgroup_proto;
2179 #endif
2180 #ifdef CONFIG_CGROUP_NET_CLASSID
2181 	case BPF_FUNC_get_cgroup_classid:
2182 		return &bpf_get_cgroup_classid_curr_proto;
2183 #endif
2184 	case BPF_FUNC_task_storage_get:
2185 		return &bpf_task_storage_get_proto;
2186 	case BPF_FUNC_task_storage_delete:
2187 		return &bpf_task_storage_delete_proto;
2188 	default:
2189 		break;
2190 	}
2191 
2192 	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
2193 		return NULL;
2194 
2195 	switch (func_id) {
2196 	case BPF_FUNC_trace_printk:
2197 		return bpf_get_trace_printk_proto();
2198 	case BPF_FUNC_get_current_task:
2199 		return &bpf_get_current_task_proto;
2200 	case BPF_FUNC_get_current_task_btf:
2201 		return &bpf_get_current_task_btf_proto;
2202 	case BPF_FUNC_get_current_comm:
2203 		return &bpf_get_current_comm_proto;
2204 	case BPF_FUNC_probe_read_user:
2205 		return &bpf_probe_read_user_proto;
2206 	case BPF_FUNC_probe_read_kernel:
2207 		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
2208 		       NULL : &bpf_probe_read_kernel_proto;
2209 	case BPF_FUNC_probe_read_user_str:
2210 		return &bpf_probe_read_user_str_proto;
2211 	case BPF_FUNC_probe_read_kernel_str:
2212 		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
2213 		       NULL : &bpf_probe_read_kernel_str_proto;
2214 	case BPF_FUNC_copy_from_user:
2215 		return &bpf_copy_from_user_proto;
2216 	case BPF_FUNC_copy_from_user_task:
2217 		return &bpf_copy_from_user_task_proto;
2218 	case BPF_FUNC_snprintf_btf:
2219 		return &bpf_snprintf_btf_proto;
2220 	case BPF_FUNC_snprintf:
2221 		return &bpf_snprintf_proto;
2222 	case BPF_FUNC_task_pt_regs:
2223 		return &bpf_task_pt_regs_proto;
2224 	case BPF_FUNC_trace_vprintk:
2225 		return bpf_get_trace_vprintk_proto();
2226 	case BPF_FUNC_perf_event_read_value:
2227 		return bpf_get_perf_event_read_value_proto();
2228 	case BPF_FUNC_perf_event_read:
2229 		return &bpf_perf_event_read_proto;
2230 	case BPF_FUNC_send_signal:
2231 		return &bpf_send_signal_proto;
2232 	case BPF_FUNC_send_signal_thread:
2233 		return &bpf_send_signal_thread_proto;
2234 	case BPF_FUNC_get_task_stack:
2235 		return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
2236 				       : &bpf_get_task_stack_proto;
2237 	case BPF_FUNC_get_branch_snapshot:
2238 		return &bpf_get_branch_snapshot_proto;
2239 	case BPF_FUNC_find_vma:
2240 		return &bpf_find_vma_proto;
2241 	default:
2242 		return NULL;
2243 	}
2244 }
2245 EXPORT_SYMBOL_GPL(bpf_base_func_proto);
2246 
2247 void bpf_list_head_free(const struct btf_field *field, void *list_head,
2248 			struct bpf_spin_lock *spin_lock)
2249 {
2250 	struct list_head *head = list_head, drain, *pos, *n;
2251 
2252 	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
2253 	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
2254 	INIT_LIST_HEAD(&drain);
2255 
2256 	/* Do the actual list draining outside the lock to not hold the lock for
2257 	 * too long, and also prevent deadlocks if tracing programs end up
2258 	 * executing on entry/exit of functions called inside the critical
2259 	 * section, and end up doing map ops that call bpf_list_head_free for
2260 	 * the same map value again.
2261 	 */
2262 	__bpf_spin_lock_irqsave(spin_lock);
2263 	if (!head->next || list_empty(head))
2264 		goto unlock;
2265 	list_for_each_safe(pos, n, head) {
2266 		struct bpf_list_node_kern *node;
2267 
2268 		node = container_of(pos, struct bpf_list_node_kern, list_head);
2269 		WRITE_ONCE(node->owner, BPF_PTR_POISON);
2270 		list_move_tail(pos, &drain);
2271 	}
2272 unlock:
2273 	INIT_LIST_HEAD(head);
2274 	__bpf_spin_unlock_irqrestore(spin_lock);
2275 
2276 	while (!list_empty(&drain)) {
2277 		struct bpf_list_node_kern *node;
2278 
2279 		pos = drain.next;
2280 		node = container_of(pos, struct bpf_list_node_kern, list_head);
2281 		list_del_init(pos);
2282 		/* Ensure __bpf_list_add() sees the node as unlinked. */
2283 		smp_store_release(&node->owner, NULL);
2284 		/* The contained type can also have resources, including a
2285 		 * bpf_list_head which needs to be freed.
2286 		 */
2287 		__bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset,
2288 				    field->graph_root.value_rec, false);
2289 	}
2290 }
2291 
/* Post-order walk over the rb_nodes of 'root'. The successor 'n' is fetched
 * with rb_next_postorder() in the loop condition, i.e. before the loop body
 * runs for 'pos', so the body does not need 'pos' to find the next node.
 *
 * Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
 * 'rb_node *', so field name of rb_node within containing struct is not
 * needed.
 *
 * Since bpf_rb_tree's node type has a corresponding struct btf_field with
 * graph_root.node_offset, it's not necessary to know field name
 * or type of node struct
 */
#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
	for (pos = rb_first_postorder(root); \
	    pos && ({ n = rb_next_postorder(pos); 1; }); \
	    pos = n)
2304 
2305 void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
2306 		      struct bpf_spin_lock *spin_lock)
2307 {
2308 	struct rb_root_cached orig_root, *root = rb_root;
2309 	struct bpf_rb_node_kern *node;
2310 	struct rb_node *pos, *n;
2311 	void *obj;
2312 
2313 	BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
2314 	BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
2315 
2316 	__bpf_spin_lock_irqsave(spin_lock);
2317 	orig_root = *root;
2318 	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
2319 		node = rb_entry(pos, struct bpf_rb_node_kern, rb_node);
2320 		WRITE_ONCE(node->owner, BPF_PTR_POISON);
2321 	}
2322 	*root = RB_ROOT_CACHED;
2323 	__bpf_spin_unlock_irqrestore(spin_lock);
2324 
2325 	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
2326 		obj = pos;
2327 		obj -= field->graph_root.node_offset;
2328 		node = rb_entry(pos, struct bpf_rb_node_kern, rb_node);
2329 		RB_CLEAR_NODE(pos);
2330 		/* Ensure __bpf_rbtree_add() sees the node as unlinked. */
2331 		smp_store_release(&node->owner, NULL);
2332 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
2333 	}
2334 }
2335 
2336 __bpf_kfunc_start_defs();
2337 
2338 /**
2339  * bpf_obj_new() - allocate an object described by program BTF
2340  * @local_type_id__k: type ID in program BTF
2341  * @meta: verifier-supplied struct metadata
2342  *
2343  * Allocate an object of the type identified by @local_type_id__k and
2344  * initialize its special fields. BPF programs can use
2345  * bpf_core_type_id_local() to provide @local_type_id__k. The verifier
2346  * rewrites @meta; BPF programs do not set it.
2347  *
2348  * Return: Pointer to the allocated object, or %NULL on failure.
2349  */
2350 __bpf_kfunc void *bpf_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
2351 {
2352 	u64 size = local_type_id__k;
2353 	void *p;
2354 
2355 	p = bpf_mem_alloc(&bpf_global_ma, size);
2356 	if (!p)
2357 		return NULL;
2358 	if (meta)
2359 		bpf_obj_init(meta->record, p);
2360 
2361 	return p;
2362 }
2363 
2364 __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
2365 {
2366 	return bpf_obj_new(local_type_id__k, meta__ign);
2367 }
2368 
2369 /**
2370  * bpf_percpu_obj_new() - allocate a percpu object described by program BTF
2371  * @local_type_id__k: type ID in program BTF
2372  * @meta: verifier-supplied struct metadata
2373  *
2374  * Allocate a percpu object of the type identified by @local_type_id__k. BPF
2375  * programs can use bpf_core_type_id_local() to provide @local_type_id__k.
2376  * The verifier rewrites @meta; BPF programs do not set it.
2377  *
2378  * Return: Pointer to the allocated percpu object, or %NULL on failure.
2379  */
2380 __bpf_kfunc void *bpf_percpu_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
2381 {
2382 	u64 size = local_type_id__k;
2383 
2384 	/* The verifier has ensured that meta must be NULL */
2385 	return bpf_mem_alloc(&bpf_global_percpu_ma, size);
2386 }
2387 
2388 __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
2389 {
2390 	return bpf_percpu_obj_new(local_type_id__k, meta__ign);
2391 }
2392 
2393 /* Must be called under migrate_disable(), as required by bpf_mem_free */
2394 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
2395 {
2396 	struct bpf_mem_alloc *ma;
2397 
2398 	if (rec && rec->refcount_off >= 0 &&
2399 	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
2400 		/* Object is refcounted and refcount_dec didn't result in 0
2401 		 * refcount. Return without freeing the object
2402 		 */
2403 		return;
2404 	}
2405 
2406 	if (rec)
2407 		bpf_obj_free_fields(rec, p);
2408 
2409 	if (percpu)
2410 		ma = &bpf_global_percpu_ma;
2411 	else
2412 		ma = &bpf_global_ma;
2413 	bpf_mem_free_rcu(ma, p);
2414 }
2415 
2416 /**
2417  * bpf_obj_drop() - drop a previously allocated object
2418  * @p__alloc: object to free
2419  * @meta: verifier-supplied struct metadata
2420  *
2421  * Destroy special fields in @p__alloc as needed and free the object. The
2422  * verifier rewrites @meta; BPF programs do not set it.
2423  */
2424 __bpf_kfunc void bpf_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
2425 {
2426 	void *p = p__alloc;
2427 
2428 	__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
2429 }
2430 
2431 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
2432 {
2433 	return bpf_obj_drop(p__alloc, meta__ign);
2434 }
2435 
2436 /**
2437  * bpf_percpu_obj_drop() - drop a previously allocated percpu object
2438  * @p__alloc: percpu object to free
2439  * @meta: verifier-supplied struct metadata
2440  *
2441  * Free @p__alloc. The verifier rewrites @meta; BPF programs do not set it.
2442  */
2443 __bpf_kfunc void bpf_percpu_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
2444 {
2445 	/* The verifier has ensured that meta must be NULL */
2446 	bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
2447 }
2448 
2449 __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
2450 {
2451 	bpf_percpu_obj_drop(p__alloc, meta__ign);
2452 }
2453 
2454 /**
2455  * bpf_refcount_acquire() - turn a local kptr into an owning reference
2456  * @p__refcounted_kptr: non-owning local kptr
2457  * @meta: verifier-supplied struct metadata
2458  *
2459  * Increment the refcount for @p__refcounted_kptr. The verifier rewrites
2460  * @meta; BPF programs do not set it.
2461  *
2462  * Return: Owning reference to @p__refcounted_kptr, or %NULL on failure.
2463  */
2464 __bpf_kfunc void *bpf_refcount_acquire(void *p__refcounted_kptr, struct btf_struct_meta *meta)
2465 {
2466 	struct bpf_refcount *ref;
2467 
2468 	/* Could just cast directly to refcount_t *, but need some code using
2469 	 * bpf_refcount type so that it is emitted in vmlinux BTF
2470 	 */
2471 	ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
2472 	if (!refcount_inc_not_zero((refcount_t *)ref))
2473 		return NULL;
2474 
2475 	/* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
2476 	 * in verifier.c
2477 	 */
2478 	return (void *)p__refcounted_kptr;
2479 }
2480 
2481 __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
2482 {
2483 	return bpf_refcount_acquire(p__refcounted_kptr, meta__ign);
2484 }
2485 
2486 static int __bpf_list_add(struct bpf_list_node_kern *node,
2487 			  struct bpf_list_head *head,
2488 			  struct list_head **prev_ptr,
2489 			  struct btf_record *rec, u64 off)
2490 {
2491 	struct list_head *n = &node->list_head, *h = (void *)head;
2492 	struct list_head *prev;
2493 
2494 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2495 	 * called on its fields, so init here
2496 	 */
2497 	if (unlikely(!h->next))
2498 		INIT_LIST_HEAD(h);
2499 
2500 	prev = *prev_ptr;
2501 
2502 	/* When prev is not the list head, it must be a node in this list. */
2503 	if (prev != h) {
2504 		struct bpf_list_node_kern *prev_kn =
2505 			container_of(prev, struct bpf_list_node_kern, list_head);
2506 
2507 		if (unlikely(READ_ONCE(prev_kn->owner) != head))
2508 			goto fail;
2509 	}
2510 
2511 	/* node->owner != NULL implies !list_empty(n), no need to separately
2512 	 * check the latter
2513 	 */
2514 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON))
2515 		goto fail;
2516 
2517 	list_add(n, prev);
2518 	WRITE_ONCE(node->owner, head);
2519 	return 0;
2520 
2521 fail:
2522 	/* Only called from BPF prog, no need to migrate_disable */
2523 	__bpf_obj_drop_impl((void *)n - off, rec, false);
2524 	return -EINVAL;
2525 }
2526 
2527 /**
2528  * bpf_list_push_front() - add a node to the front of a BPF linked list
2529  * @head: list head
2530  * @node: node to insert
2531  * @meta: verifier-supplied struct metadata
2532  * @off: verifier-supplied offset of @node within the containing object
2533  *
2534  * Insert @node at the front of @head. The verifier rewrites @meta and @off;
2535  * BPF programs do not set them.
2536  *
2537  * Return: 0 on success, or %-EINVAL if @node is already linked.
2538  */
2539 __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head,
2540 				    struct bpf_list_node *node,
2541 				    struct btf_struct_meta *meta,
2542 				    u64 off)
2543 {
2544 	struct bpf_list_node_kern *n = (void *)node;
2545 	struct list_head *h = (void *)head;
2546 
2547 	return __bpf_list_add(n, head, &h, meta ? meta->record : NULL, off);
2548 }
2549 
2550 __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
2551 					 struct bpf_list_node *node,
2552 					 void *meta__ign, u64 off)
2553 {
2554 	return bpf_list_push_front(head, node, meta__ign, off);
2555 }
2556 
2557 /**
2558  * bpf_list_push_back() - add a node to the back of a BPF linked list
2559  * @head: list head
2560  * @node: node to insert
2561  * @meta: verifier-supplied struct metadata
2562  * @off: verifier-supplied offset of @node within the containing object
2563  *
2564  * Insert @node at the back of @head. The verifier rewrites @meta and @off;
2565  * BPF programs do not set them.
2566  *
2567  * Return: 0 on success, or %-EINVAL if @node is already linked.
2568  */
2569 __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head,
2570 				   struct bpf_list_node *node,
2571 				   struct btf_struct_meta *meta,
2572 				   u64 off)
2573 {
2574 	struct bpf_list_node_kern *n = (void *)node;
2575 	struct list_head *h = (void *)head;
2576 
2577 	return __bpf_list_add(n, head, &h->prev, meta ? meta->record : NULL, off);
2578 }
2579 
2580 __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
2581 					struct bpf_list_node *node,
2582 					void *meta__ign, u64 off)
2583 {
2584 	return bpf_list_push_back(head, node, meta__ign, off);
2585 }
2586 
2587 __bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new,
2588 			     struct bpf_list_node *prev__nonown_allowed,
2589 			     struct btf_struct_meta *meta, u64 off)
2590 {
2591 	struct bpf_list_node_kern *n = (void *)new, *p = (void *)prev__nonown_allowed;
2592 	struct list_head *prev_ptr = &p->list_head;
2593 
2594 	return __bpf_list_add(n, head, &prev_ptr, meta ? meta->record : NULL, off);
2595 }
2596 
2597 static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head,
2598 					    struct list_head *n)
2599 {
2600 	struct list_head *h = (void *)head;
2601 	struct bpf_list_node_kern *node;
2602 
2603 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2604 	 * called on its fields, so init here
2605 	 */
2606 	if (unlikely(!h->next)) {
2607 		INIT_LIST_HEAD(h);
2608 		return NULL;
2609 	}
2610 	if (list_empty(h))
2611 		return NULL;
2612 
2613 	node = container_of(n, struct bpf_list_node_kern, list_head);
2614 	if (unlikely(READ_ONCE(node->owner) != head))
2615 		return NULL;
2616 
2617 	list_del_init(n);
2618 	/* Ensure __bpf_list_add() sees the node as unlinked. */
2619 	smp_store_release(&node->owner, NULL);
2620 	return (struct bpf_list_node *)n;
2621 }
2622 
2623 __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
2624 {
2625 	struct list_head *h = (void *)head;
2626 
2627 	return __bpf_list_del(head, h->next);
2628 }
2629 
2630 __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
2631 {
2632 	struct list_head *h = (void *)head;
2633 
2634 	return __bpf_list_del(head, h->prev);
2635 }
2636 
2637 __bpf_kfunc struct bpf_list_node *bpf_list_del(struct bpf_list_head *head,
2638 					       struct bpf_list_node *node__nonown_allowed)
2639 {
2640 	struct bpf_list_node_kern *kn = (void *)node__nonown_allowed;
2641 
2642 	/* verifier guarantees node is a list node rather than list head */
2643 	return __bpf_list_del(head, &kn->list_head);
2644 }
2645 
2646 __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
2647 {
2648 	struct list_head *h = (struct list_head *)head;
2649 
2650 	if (list_empty(h) || unlikely(!h->next))
2651 		return NULL;
2652 
2653 	return (struct bpf_list_node *)h->next;
2654 }
2655 
2656 __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
2657 {
2658 	struct list_head *h = (struct list_head *)head;
2659 
2660 	if (list_empty(h) || unlikely(!h->next))
2661 		return NULL;
2662 
2663 	return (struct bpf_list_node *)h->prev;
2664 }
2665 
2666 __bpf_kfunc bool bpf_list_is_first(struct bpf_list_head *head,
2667 				   struct bpf_list_node *node__nonown_allowed)
2668 {
2669 	struct list_head *h = (struct list_head *)head;
2670 	struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed;
2671 
2672 	if (READ_ONCE(kn->owner) != head)
2673 		return false;
2674 
2675 	return list_is_first(&kn->list_head, h);
2676 }
2677 
2678 __bpf_kfunc bool bpf_list_is_last(struct bpf_list_head *head,
2679 				  struct bpf_list_node *node__nonown_allowed)
2680 {
2681 	struct list_head *h = (struct list_head *)head;
2682 	struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed;
2683 
2684 	if (READ_ONCE(kn->owner) != head)
2685 		return false;
2686 
2687 	return list_is_last(&kn->list_head, h);
2688 }
2689 
2690 __bpf_kfunc bool bpf_list_empty(struct bpf_list_head *head)
2691 {
2692 	struct list_head *h = (struct list_head *)head;
2693 
2694 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2695 	 * called on its fields, so init here
2696 	 */
2697 	if (unlikely(!h->next))
2698 		INIT_LIST_HEAD(h);
2699 
2700 	return list_empty(h);
2701 }
2702 
2703 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
2704 						  struct bpf_rb_node *node)
2705 {
2706 	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2707 	struct rb_root_cached *r = (struct rb_root_cached *)root;
2708 	struct rb_node *n = &node_internal->rb_node;
2709 
2710 	/* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
2711 	 * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
2712 	 */
2713 	if (READ_ONCE(node_internal->owner) != root)
2714 		return NULL;
2715 
2716 	rb_erase_cached(n, r);
2717 	RB_CLEAR_NODE(n);
2718 	WRITE_ONCE(node_internal->owner, NULL);
2719 	return (struct bpf_rb_node *)n;
2720 }
2721 
2722 /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
2723  * program
2724  */
2725 static int __bpf_rbtree_add(struct bpf_rb_root *root,
2726 			    struct bpf_rb_node_kern *node,
2727 			    void *less, struct btf_record *rec, u64 off)
2728 {
2729 	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
2730 	struct rb_node *parent = NULL, *n = &node->rb_node;
2731 	bpf_callback_t cb = (bpf_callback_t)less;
2732 	bool leftmost = true;
2733 
2734 	/* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
2735 	 * check the latter
2736 	 */
2737 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
2738 		/* Only called from BPF prog, no need to migrate_disable */
2739 		__bpf_obj_drop_impl((void *)n - off, rec, false);
2740 		return -EINVAL;
2741 	}
2742 
2743 	while (*link) {
2744 		parent = *link;
2745 		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
2746 			link = &parent->rb_left;
2747 		} else {
2748 			link = &parent->rb_right;
2749 			leftmost = false;
2750 		}
2751 	}
2752 
2753 	rb_link_node(n, parent, link);
2754 	rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
2755 	WRITE_ONCE(node->owner, root);
2756 	return 0;
2757 }
2758 
2759 /**
2760  * bpf_rbtree_add() - add a node to a BPF rbtree
2761  * @root: tree root
2762  * @node: node to insert
2763  * @less: comparator used to order nodes
2764  * @meta: verifier-supplied struct metadata
2765  * @off: verifier-supplied offset of @node within the containing object
2766  *
2767  * Insert @node into @root using @less. The verifier rewrites @meta and @off;
2768  * BPF programs do not set them.
2769  *
2770  * Return: 0 on success, or %-EINVAL if @node is already linked in a tree.
2771  */
2772 __bpf_kfunc int bpf_rbtree_add(struct bpf_rb_root *root,
2773 			       struct bpf_rb_node *node,
2774 			       bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
2775 			       struct btf_struct_meta *meta,
2776 			       u64 off)
2777 {
2778 	struct bpf_rb_node_kern *n = (void *)node;
2779 
2780 	return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
2781 }
2782 
2783 __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
2784 				    bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
2785 				    void *meta__ign, u64 off)
2786 {
2787 	return bpf_rbtree_add(root, node, less, meta__ign, off);
2788 }
2789 
2790 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
2791 {
2792 	struct rb_root_cached *r = (struct rb_root_cached *)root;
2793 
2794 	return (struct bpf_rb_node *)rb_first_cached(r);
2795 }
2796 
2797 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root)
2798 {
2799 	struct rb_root_cached *r = (struct rb_root_cached *)root;
2800 
2801 	return (struct bpf_rb_node *)r->rb_root.rb_node;
2802 }
2803 
2804 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node)
2805 {
2806 	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2807 
2808 	if (READ_ONCE(node_internal->owner) != root)
2809 		return NULL;
2810 
2811 	return (struct bpf_rb_node *)node_internal->rb_node.rb_left;
2812 }
2813 
2814 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node)
2815 {
2816 	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2817 
2818 	if (READ_ONCE(node_internal->owner) != root)
2819 		return NULL;
2820 
2821 	return (struct bpf_rb_node *)node_internal->rb_node.rb_right;
2822 }
2823 
2824 /**
2825  * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
2826  * kfunc which is not stored in a map as a kptr, must be released by calling
2827  * bpf_task_release().
2828  * @p: The task on which a reference is being acquired.
2829  */
2830 __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
2831 {
2832 	if (refcount_inc_not_zero(&p->rcu_users))
2833 		return p;
2834 	return NULL;
2835 }
2836 
2837 /**
2838  * bpf_task_release - Release the reference acquired on a task.
2839  * @p: The task on which a reference is being released.
2840  */
2841 __bpf_kfunc void bpf_task_release(struct task_struct *p)
2842 {
2843 	put_task_struct_rcu_user(p);
2844 }
2845 
2846 __bpf_kfunc void bpf_task_release_dtor(void *p)
2847 {
2848 	put_task_struct_rcu_user(p);
2849 }
2850 CFI_NOSEAL(bpf_task_release_dtor);
2851 
2852 #ifdef CONFIG_CGROUPS
2853 /**
2854  * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
2855  * this kfunc which is not stored in a map as a kptr, must be released by
2856  * calling bpf_cgroup_release().
2857  * @cgrp: The cgroup on which a reference is being acquired.
2858  */
2859 __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
2860 {
2861 	return cgroup_tryget(cgrp) ? cgrp : NULL;
2862 }
2863 
2864 /**
2865  * bpf_cgroup_release - Release the reference acquired on a cgroup.
2866  * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
2867  * not be freed until the current grace period has ended, even if its refcount
2868  * drops to 0.
2869  * @cgrp: The cgroup on which a reference is being released.
2870  */
2871 __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
2872 {
2873 	cgroup_put(cgrp);
2874 }
2875 
2876 __bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
2877 {
2878 	cgroup_put(cgrp);
2879 }
2880 CFI_NOSEAL(bpf_cgroup_release_dtor);
2881 
2882 /**
2883  * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
2884  * array. A cgroup returned by this kfunc which is not subsequently stored in a
2885  * map, must be released by calling bpf_cgroup_release().
2886  * @cgrp: The cgroup for which we're performing a lookup.
2887  * @level: The level of ancestor to look up.
2888  */
2889 __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
2890 {
2891 	struct cgroup *ancestor;
2892 
2893 	if (level > cgrp->level || level < 0)
2894 		return NULL;
2895 
2896 	/* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
2897 	ancestor = cgrp->ancestors[level];
2898 	if (!cgroup_tryget(ancestor))
2899 		return NULL;
2900 	return ancestor;
2901 }
2902 
2903 /**
2904  * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
2905  * kfunc which is not subsequently stored in a map, must be released by calling
2906  * bpf_cgroup_release().
2907  * @cgid: cgroup id.
2908  */
2909 __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
2910 {
2911 	struct cgroup *cgrp;
2912 
2913 	cgrp = __cgroup_get_from_id(cgid);
2914 	if (IS_ERR(cgrp))
2915 		return NULL;
2916 	return cgrp;
2917 }
2918 
2919 /**
2920  * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test
2921  * task's membership of cgroup ancestry.
2922  * @task: the task to be tested
2923  * @ancestor: possible ancestor of @task's cgroup
2924  *
2925  * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
2926  * It follows all the same rules as cgroup_is_descendant, and only applies
2927  * to the default hierarchy.
2928  */
2929 __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
2930 				       struct cgroup *ancestor)
2931 {
2932 	long ret;
2933 
2934 	rcu_read_lock();
2935 	ret = task_under_cgroup_hierarchy(task, ancestor);
2936 	rcu_read_unlock();
2937 	return ret;
2938 }
2939 
2940 BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
2941 {
2942 	struct bpf_array *array = container_of(map, struct bpf_array, map);
2943 	struct cgroup *cgrp;
2944 
2945 	if (unlikely(idx >= array->map.max_entries))
2946 		return -E2BIG;
2947 
2948 	cgrp = READ_ONCE(array->ptrs[idx]);
2949 	if (unlikely(!cgrp))
2950 		return -EAGAIN;
2951 
2952 	return task_under_cgroup_hierarchy(current, cgrp);
2953 }
2954 
/* Helper prototype for bpf_current_task_under_cgroup(): not GPL-only, returns
 * an integer, takes a constant map pointer as first argument and an
 * unconstrained value (the index) as second argument.
 */
const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
	.func           = bpf_current_task_under_cgroup,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_CONST_MAP_PTR,
	.arg2_type      = ARG_ANYTHING,
};
2962 
2963 /**
2964  * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
2965  * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
2966  * hierarchy ID.
2967  * @task: The target task
2968  * @hierarchy_id: The ID of a cgroup1 hierarchy
2969  *
2970  * On success, the cgroup is returen. On failure, NULL is returned.
2971  */
2972 __bpf_kfunc struct cgroup *
2973 bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
2974 {
2975 	struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
2976 
2977 	if (IS_ERR(cgrp))
2978 		return NULL;
2979 	return cgrp;
2980 }
2981 #endif /* CONFIG_CGROUPS */
2982 
2983 /**
2984  * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
2985  * in the root pid namespace idr. If a task is returned, it must either be
2986  * stored in a map, or released with bpf_task_release().
2987  * @pid: The pid of the task being looked up.
2988  */
2989 __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
2990 {
2991 	struct task_struct *p;
2992 
2993 	rcu_read_lock();
2994 	p = find_task_by_pid_ns(pid, &init_pid_ns);
2995 	if (p)
2996 		p = bpf_task_acquire(p);
2997 	rcu_read_unlock();
2998 
2999 	return p;
3000 }
3001 
3002 /**
3003  * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up
3004  * in the pid namespace of the current task. If a task is returned, it must
3005  * either be stored in a map, or released with bpf_task_release().
3006  * @vpid: The vpid of the task being looked up.
3007  */
3008 __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
3009 {
3010 	struct task_struct *p;
3011 
3012 	guard(rcu)();
3013 	if (!task_active_pid_ns(current))
3014 		return NULL;
3015 
3016 	p = find_task_by_vpid(vpid);
3017 	if (p)
3018 		p = bpf_task_acquire(p);
3019 
3020 	return p;
3021 }
3022 
3023 /**
3024  * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
3025  * @p: The dynptr whose data slice to retrieve
3026  * @offset: Offset into the dynptr
3027  * @buffer__nullable: User-provided buffer to copy contents into.  May be NULL
3028  * @buffer__szk: Size (in bytes) of the buffer if present. This is the
3029  *               length of the requested slice. This must be a constant.
3030  *
3031  * For non-skb and non-xdp type dynptrs, there is no difference between
3032  * bpf_dynptr_slice and bpf_dynptr_data.
3033  *
3034  *  If buffer__nullable is NULL, the call will fail if buffer_opt was needed.
3035  *
3036  * If the intention is to write to the data slice, please use
3037  * bpf_dynptr_slice_rdwr.
3038  *
3039  * The user must check that the returned pointer is not null before using it.
3040  *
3041  * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
3042  * does not change the underlying packet data pointers, so a call to
3043  * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
3044  * the bpf program.
3045  *
3046  * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only
3047  * data slice (can be either direct pointer to the data or a pointer to the user
3048  * provided buffer, with its contents containing the data, if unable to obtain
3049  * direct pointer)
3050  */
3051 __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
3052 				   void *buffer__nullable, u64 buffer__szk)
3053 {
3054 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3055 	enum bpf_dynptr_type type;
3056 	u64 len = buffer__szk;
3057 	int err;
3058 
3059 	if (!ptr->data)
3060 		return NULL;
3061 
3062 	err = bpf_dynptr_check_off_len(ptr, offset, len);
3063 	if (err)
3064 		return NULL;
3065 
3066 	type = bpf_dynptr_get_type(ptr);
3067 
3068 	switch (type) {
3069 	case BPF_DYNPTR_TYPE_LOCAL:
3070 	case BPF_DYNPTR_TYPE_RINGBUF:
3071 		return ptr->data + ptr->offset + offset;
3072 	case BPF_DYNPTR_TYPE_SKB:
3073 		if (buffer__nullable)
3074 			return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable);
3075 		else
3076 			return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
3077 	case BPF_DYNPTR_TYPE_XDP:
3078 	{
3079 		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
3080 		if (!IS_ERR_OR_NULL(xdp_ptr))
3081 			return xdp_ptr;
3082 
3083 		if (!buffer__nullable)
3084 			return NULL;
3085 		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false);
3086 		return buffer__nullable;
3087 	}
3088 	case BPF_DYNPTR_TYPE_SKB_META:
3089 		return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
3090 	case BPF_DYNPTR_TYPE_FILE:
3091 		err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk);
3092 		return err ? NULL : buffer__nullable;
3093 	default:
3094 		WARN_ONCE(true, "unknown dynptr type %d\n", type);
3095 		return NULL;
3096 	}
3097 }
3098 
3099 /**
3100  * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
3101  * @p: The dynptr whose data slice to retrieve
3102  * @offset: Offset into the dynptr
3103  * @buffer__nullable: User-provided buffer to copy contents into. May be NULL
3104  * @buffer__szk: Size (in bytes) of the buffer if present. This is the
3105  *               length of the requested slice. This must be a constant.
3106  *
3107  * For non-skb and non-xdp type dynptrs, there is no difference between
3108  * bpf_dynptr_slice and bpf_dynptr_data.
3109  *
3110  * If buffer__nullable is NULL, the call will fail if buffer_opt was needed.
3111  *
3112  * The returned pointer is writable and may point to either directly the dynptr
3113  * data at the requested offset or to the buffer if unable to obtain a direct
3114  * data pointer to (example: the requested slice is to the paged area of an skb
3115  * packet). In the case where the returned pointer is to the buffer, the user
3116  * is responsible for persisting writes through calling bpf_dynptr_write(). This
3117  * usually looks something like this pattern:
3118  *
3119  * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
3120  * if (!eth)
3121  *	return TC_ACT_SHOT;
3122  *
3123  * // mutate eth header //
3124  *
3125  * if (eth == buffer)
3126  *	bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
3127  *
3128  * Please note that, as in the example above, the user must check that the
3129  * returned pointer is not null before using it.
3130  *
3131  * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
3132  * does not change the underlying packet data pointers, so a call to
3133  * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
3134  * the bpf program.
3135  *
3136  * Return: NULL if the call failed (eg invalid dynptr), pointer to a
3137  * data slice (can be either direct pointer to the data or a pointer to the user
3138  * provided buffer, with its contents containing the data, if unable to obtain
3139  * direct pointer)
3140  */
3141 __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
3142 					void *buffer__nullable, u64 buffer__szk)
3143 {
3144 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3145 
3146 	if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
3147 		return NULL;
3148 
3149 	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
3150 	 *
3151 	 * For skb-type dynptrs, it is safe to write into the returned pointer
3152 	 * if the bpf program allows skb data writes. There are two possibilities
3153 	 * that may occur when calling bpf_dynptr_slice_rdwr:
3154 	 *
3155 	 * 1) The requested slice is in the head of the skb. In this case, the
3156 	 * returned pointer is directly to skb data, and if the skb is cloned, the
3157 	 * verifier will have uncloned it (see bpf_unclone_prologue()) already.
3158 	 * The pointer can be directly written into.
3159 	 *
3160 	 * 2) Some portion of the requested slice is in the paged buffer area.
3161 	 * In this case, the requested data will be copied out into the buffer
3162 	 * and the returned pointer will be a pointer to the buffer. The skb
3163 	 * will not be pulled. To persist the write, the user will need to call
3164 	 * bpf_dynptr_write(), which will pull the skb and commit the write.
3165 	 *
3166 	 * Similarly for xdp programs, if the requested slice is not across xdp
3167 	 * fragments, then a direct pointer will be returned, otherwise the data
3168 	 * will be copied out into the buffer and the user will need to call
3169 	 * bpf_dynptr_write() to commit changes.
3170 	 */
3171 	return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk);
3172 }
3173 
3174 __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr *p, u64 start, u64 end)
3175 {
3176 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3177 	u64 size;
3178 
3179 	if (!ptr->data || start > end)
3180 		return -EINVAL;
3181 
3182 	size = __bpf_dynptr_size(ptr);
3183 
3184 	if (start > size || end > size)
3185 		return -ERANGE;
3186 
3187 	bpf_dynptr_advance_offset(ptr, start);
3188 	bpf_dynptr_set_size(ptr, end - start);
3189 
3190 	return 0;
3191 }
3192 
3193 __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
3194 {
3195 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3196 
3197 	return !ptr->data;
3198 }
3199 
3200 __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
3201 {
3202 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3203 
3204 	if (!ptr->data)
3205 		return false;
3206 
3207 	return __bpf_dynptr_is_rdonly(ptr);
3208 }
3209 
3210 __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
3211 {
3212 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3213 
3214 	if (!ptr->data)
3215 		return -EINVAL;
3216 
3217 	return __bpf_dynptr_size(ptr);
3218 }
3219 
3220 __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
3221 				 struct bpf_dynptr *clone__uninit)
3222 {
3223 	struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
3224 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3225 
3226 	if (!ptr->data) {
3227 		bpf_dynptr_set_null(clone);
3228 		return -EINVAL;
3229 	}
3230 
3231 	*clone = *ptr;
3232 
3233 	return 0;
3234 }
3235 
3236 /**
3237  * bpf_dynptr_copy() - Copy data from one dynptr to another.
3238  * @dst_ptr: Destination dynptr - where data should be copied to
3239  * @dst_off: Offset into the destination dynptr
3240  * @src_ptr: Source dynptr - where data should be copied from
3241  * @src_off: Offset into the source dynptr
3242  * @size: Length of the data to copy from source to destination
3243  *
3244  * Copies data from source dynptr to destination dynptr.
3245  * Returns 0 on success; negative error, otherwise.
3246  */
3247 __bpf_kfunc int bpf_dynptr_copy(const struct bpf_dynptr *dst_ptr, u64 dst_off,
3248 				const struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
3249 {
3250 	const struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
3251 	const struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
3252 	void *src_slice, *dst_slice;
3253 	char buf[256];
3254 	u64 off;
3255 
3256 	src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
3257 	dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
3258 
3259 	if (src_slice && dst_slice) {
3260 		memmove(dst_slice, src_slice, size);
3261 		return 0;
3262 	}
3263 
3264 	if (src_slice)
3265 		return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0);
3266 
3267 	if (dst_slice)
3268 		return __bpf_dynptr_read(dst_slice, size, src, src_off, 0);
3269 
3270 	if (bpf_dynptr_check_off_len(dst, dst_off, size) ||
3271 	    bpf_dynptr_check_off_len(src, src_off, size))
3272 		return -E2BIG;
3273 
3274 	off = 0;
3275 	while (off < size) {
3276 		u64 chunk_sz = min_t(u64, sizeof(buf), size - off);
3277 		int err;
3278 
3279 		err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
3280 		if (err)
3281 			return err;
3282 		err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0);
3283 		if (err)
3284 			return err;
3285 
3286 		off += chunk_sz;
3287 	}
3288 	return 0;
3289 }
3290 
3291 /**
3292  * bpf_dynptr_memset() - Fill dynptr memory with a constant byte.
3293  * @p: Destination dynptr - where data will be filled
3294  * @offset: Offset into the dynptr to start filling from
3295  * @size: Number of bytes to fill
3296  * @val: Constant byte to fill the memory with
3297  *
3298  * Fills the @size bytes of the memory area pointed to by @p
3299  * at @offset with the constant byte @val.
3300  * Returns 0 on success; negative error, otherwise.
3301  */
3302 __bpf_kfunc int bpf_dynptr_memset(const struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
3303 {
3304 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
3305 	u64 chunk_sz, write_off;
3306 	char buf[256];
3307 	void* slice;
3308 	int err;
3309 
3310 	slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size);
3311 	if (likely(slice)) {
3312 		memset(slice, val, size);
3313 		return 0;
3314 	}
3315 
3316 	if (__bpf_dynptr_is_rdonly(ptr))
3317 		return -EINVAL;
3318 
3319 	err = bpf_dynptr_check_off_len(ptr, offset, size);
3320 	if (err)
3321 		return err;
3322 
3323 	/* Non-linear data under the dynptr, write from a local buffer */
3324 	chunk_sz = min_t(u64, sizeof(buf), size);
3325 	memset(buf, val, chunk_sz);
3326 
3327 	for (write_off = 0; write_off < size; write_off += chunk_sz) {
3328 		chunk_sz = min_t(u64, sizeof(buf), size - write_off);
3329 		err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
3330 		if (err)
3331 			return err;
3332 	}
3333 
3334 	return 0;
3335 }
3336 
/* Returns @obj unchanged; the function body performs no conversion. */
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
	return obj;
}
3341 
/* Returns @obj__ign with its const qualifier cast away. @btf_id__k is not
 * used by the function body.
 * NOTE(review): presumably @btf_id__k is consumed outside this function to
 * type the result - confirm.
 */
__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k)
{
	return (void *)obj__ign;
}
3346 
/* kfunc wrapper that calls rcu_read_lock(). */
__bpf_kfunc void bpf_rcu_read_lock(void)
{
	rcu_read_lock();
}
3351 
/* kfunc wrapper that calls rcu_read_unlock(). */
__bpf_kfunc void bpf_rcu_read_unlock(void)
{
	rcu_read_unlock();
}
3356 
/* State filled in by bpf_stack_walker() while bpf_throw() walks the stack. */
struct bpf_throw_ctx {
	/* aux of the first non-subprog BPF program found on the stack */
	struct bpf_prog_aux *aux;
	/* sp value the walker was given for that program's frame */
	u64 sp;
	/* bp value the walker was given for that program's frame */
	u64 bp;
	/* number of visited frames whose ip resolved to a BPF program */
	int cnt;
};
3363 
3364 static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
3365 {
3366 	struct bpf_throw_ctx *ctx = cookie;
3367 	struct bpf_prog *prog;
3368 
3369 	/*
3370 	 * The RCU read lock is held to safely traverse the latch tree, but we
3371 	 * don't need its protection when accessing the prog, since it has an
3372 	 * active stack frame on the current stack trace, and won't disappear.
3373 	 */
3374 	rcu_read_lock();
3375 	prog = bpf_prog_ksym_find(ip);
3376 	rcu_read_unlock();
3377 	if (!prog)
3378 		return !ctx->cnt;
3379 	ctx->cnt++;
3380 	if (bpf_is_subprog(prog))
3381 		return true;
3382 	ctx->aux = prog->aux;
3383 	ctx->sp = sp;
3384 	ctx->bp = bp;
3385 	return false;
3386 }
3387 
3388 __bpf_kfunc void bpf_throw(u64 cookie)
3389 {
3390 	struct bpf_throw_ctx ctx = {};
3391 
3392 	arch_bpf_stack_walk(bpf_stack_walker, &ctx);
3393 	WARN_ON_ONCE(!ctx.aux);
3394 	if (ctx.aux)
3395 		WARN_ON_ONCE(!ctx.aux->exception_boundary);
3396 	WARN_ON_ONCE(!ctx.bp);
3397 	WARN_ON_ONCE(!ctx.cnt);
3398 	/* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
3399 	 * deeper stack depths than ctx.sp as we do not return from bpf_throw,
3400 	 * which skips compiler generated instrumentation to do the same.
3401 	 */
3402 	kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
3403 	ctx.aux->bpf_exception_cb(cookie, ctx.sp + ctx.aux->stack_arg_sp_adjust, ctx.bp, 0, 0);
3404 	WARN(1, "A call to BPF exception callback should never return\n");
3405 }
3406 
3407 __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
3408 {
3409 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
3410 	struct bpf_map *map = p__map;
3411 
3412 	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
3413 	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
3414 
3415 	if (flags)
3416 		return -EINVAL;
3417 
3418 	return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
3419 }
3420 
3421 __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
3422 {
3423 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
3424 	struct bpf_work *w;
3425 
3426 	if (flags)
3427 		return -EINVAL;
3428 
3429 	w = READ_ONCE(async->work);
3430 	if (!w || !READ_ONCE(w->cb.prog))
3431 		return -EINVAL;
3432 
3433 	if (!refcount_inc_not_zero(&w->cb.refcnt))
3434 		return -ENOENT;
3435 
3436 	if (!defer_timer_wq_op()) {
3437 		schedule_work(&w->work);
3438 		bpf_async_refcount_put(&w->cb);
3439 		return 0;
3440 	} else {
3441 		return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0);
3442 	}
3443 }
3444 
3445 __bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq,
3446 				    int (callback_fn)(void *map, int *key, void *value),
3447 				    unsigned int flags,
3448 				    struct bpf_prog_aux *aux)
3449 {
3450 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
3451 
3452 	if (flags)
3453 		return -EINVAL;
3454 
3455 	return __bpf_async_set_callback(async, callback_fn, aux->prog);
3456 }
3457 
/* kfunc wrapper that calls preempt_disable(). */
__bpf_kfunc void bpf_preempt_disable(void)
{
	preempt_disable();
}
3462 
/* kfunc wrapper that calls preempt_enable(). */
__bpf_kfunc void bpf_preempt_enable(void)
{
	preempt_enable();
}
3467 
/* Opaque 16-byte, 8-byte aligned storage for a bits iterator.
 * bpf_iter_bits_new() build-asserts that struct bpf_iter_bits_kern has the
 * same size and alignment as this structure.
 */
struct bpf_iter_bits {
	__u64 __opaque[2];
} __aligned(8);
3471 
/* Largest nr_words value bpf_iter_bits_new() accepts before returning -E2BIG. */
#define BITS_ITER_NR_WORDS_MAX 511

/* Kernel-side view of struct bpf_iter_bits. */
struct bpf_iter_bits_kern {
	union {
		/* allocated copy of the bits, used when nr_bits != 64 */
		__u64 *bits;
		/* inline copy of the bits, used when nr_bits == 64 */
		__u64 bits_copy;
	};
	/* number of bits to iterate; stays 0 unless bpf_iter_bits_new() succeeded */
	int nr_bits;
	/* last position found by bpf_iter_bits_next(); starts out as -1 */
	int bit;
} __aligned(8);
3482 
3483 /* On 64-bit hosts, unsigned long and u64 have the same size, so passing
3484  * a u64 pointer and an unsigned long pointer to find_next_bit() will
3485  * return the same result, as both point to the same 8-byte area.
3486  *
3487  * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
3488  * pointer also makes no difference. This is because the first iterated
3489  * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
3490  * long is composed of bits 32-63 of the u64.
3491  *
3492  * However, for 32-bit big-endian hosts, this is not the case. The first
3493  * iterated unsigned long will be bits 32-63 of the u64, so swap these two
3494  * ulong values within the u64.
3495  */
3496 static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
3497 {
3498 #if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
3499 	unsigned int i;
3500 
3501 	for (i = 0; i < nr; i++)
3502 		bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
3503 #endif
3504 }
3505 
3506 /**
3507  * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
3508  * @it: The new bpf_iter_bits to be created
3509  * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
3510  * @nr_words: The size of the specified memory area, measured in 8-byte units.
3511  * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
3512  * further reduced by the BPF memory allocator implementation.
3513  *
3514  * This function initializes a new bpf_iter_bits structure for iterating over
3515  * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
3516  * copies the data of the memory area to the newly created bpf_iter_bits @it for
3517  * subsequent iteration operations.
3518  *
3519  * On success, 0 is returned. On failure, ERR is returned.
3520  */
3521 __bpf_kfunc int
3522 bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
3523 {
3524 	struct bpf_iter_bits_kern *kit = (void *)it;
3525 	u32 nr_bytes = nr_words * sizeof(u64);
3526 	u32 nr_bits = BYTES_TO_BITS(nr_bytes);
3527 	int err;
3528 
3529 	BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits));
3530 	BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) !=
3531 		     __alignof__(struct bpf_iter_bits));
3532 
3533 	kit->nr_bits = 0;
3534 	kit->bits_copy = 0;
3535 	kit->bit = -1;
3536 
3537 	if (!unsafe_ptr__ign || !nr_words)
3538 		return -EINVAL;
3539 	if (nr_words > BITS_ITER_NR_WORDS_MAX)
3540 		return -E2BIG;
3541 
3542 	/* Optimization for u64 mask */
3543 	if (nr_bits == 64) {
3544 		err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
3545 		if (err)
3546 			return -EFAULT;
3547 
3548 		swap_ulong_in_u64(&kit->bits_copy, nr_words);
3549 
3550 		kit->nr_bits = nr_bits;
3551 		return 0;
3552 	}
3553 
3554 	if (bpf_mem_alloc_check_size(false, nr_bytes))
3555 		return -E2BIG;
3556 
3557 	/* Fallback to memalloc */
3558 	kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
3559 	if (!kit->bits)
3560 		return -ENOMEM;
3561 
3562 	err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign);
3563 	if (err) {
3564 		bpf_mem_free(&bpf_global_ma, kit->bits);
3565 		return err;
3566 	}
3567 
3568 	swap_ulong_in_u64(kit->bits, nr_words);
3569 
3570 	kit->nr_bits = nr_bits;
3571 	return 0;
3572 }
3573 
3574 /**
3575  * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits
3576  * @it: The bpf_iter_bits to be checked
3577  *
3578  * This function returns a pointer to a number representing the value of the
3579  * next bit in the bits.
3580  *
3581  * If there are no further bits available, it returns NULL.
3582  */
3583 __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
3584 {
3585 	struct bpf_iter_bits_kern *kit = (void *)it;
3586 	int bit = kit->bit, nr_bits = kit->nr_bits;
3587 	const void *bits;
3588 
3589 	if (!nr_bits || bit >= nr_bits)
3590 		return NULL;
3591 
3592 	bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
3593 	bit = find_next_bit(bits, nr_bits, bit + 1);
3594 	if (bit >= nr_bits) {
3595 		kit->bit = bit;
3596 		return NULL;
3597 	}
3598 
3599 	kit->bit = bit;
3600 	return &kit->bit;
3601 }
3602 
3603 /**
3604  * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits
3605  * @it: The bpf_iter_bits to be destroyed
3606  *
3607  * Destroy the resource associated with the bpf_iter_bits.
3608  */
3609 __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
3610 {
3611 	struct bpf_iter_bits_kern *kit = (void *)it;
3612 
3613 	if (kit->nr_bits <= 64)
3614 		return;
3615 	bpf_mem_free(&bpf_global_ma, kit->bits);
3616 }
3617 
3618 /**
3619  * bpf_copy_from_user_str() - Copy a string from an unsafe user address
3620  * @dst:             Destination address, in kernel space.  This buffer must be
3621  *                   at least @dst__sz bytes long.
3622  * @dst__sz:         Maximum number of bytes to copy, includes the trailing NUL.
3623  * @unsafe_ptr__ign: Source address, in user space.
3624  * @flags:           The only supported flag is BPF_F_PAD_ZEROS
3625  *
3626  * Copies a NUL-terminated string from userspace to BPF space. If user string is
3627  * too long this will still ensure zero termination in the dst buffer unless
3628  * buffer size is 0.
3629  *
3630  * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and
3631  * memset all of @dst on failure.
3632  */
3633 __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags)
3634 {
3635 	int ret;
3636 
3637 	if (unlikely(flags & ~BPF_F_PAD_ZEROS))
3638 		return -EINVAL;
3639 
3640 	if (unlikely(!dst__sz))
3641 		return 0;
3642 
3643 	ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1);
3644 	if (ret < 0) {
3645 		if (flags & BPF_F_PAD_ZEROS)
3646 			memset((char *)dst, 0, dst__sz);
3647 
3648 		return ret;
3649 	}
3650 
3651 	if (flags & BPF_F_PAD_ZEROS)
3652 		memset((char *)dst + ret, 0, dst__sz - ret);
3653 	else
3654 		((char *)dst)[ret] = '\0';
3655 
3656 	return ret + 1;
3657 }
3658 
3659 /**
3660  * bpf_copy_from_user_task_str() - Copy a string from an task's address space
3661  * @dst:             Destination address, in kernel space.  This buffer must be
3662  *                   at least @dst__sz bytes long.
3663  * @dst__sz:         Maximum number of bytes to copy, includes the trailing NUL.
3664  * @unsafe_ptr__ign: Source address in the task's address space.
3665  * @tsk:             The task whose address space will be used
3666  * @flags:           The only supported flag is BPF_F_PAD_ZEROS
3667  *
3668  * Copies a NUL terminated string from a task's address space to @dst__sz
3669  * buffer. If user string is too long this will still ensure zero termination
3670  * in the @dst__sz buffer unless buffer size is 0.
3671  *
3672  * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success
3673  * and memset all of @dst__sz on failure.
3674  *
3675  * Return: The number of copied bytes on success including the NUL terminator.
3676  * A negative error code on failure.
3677  */
3678 __bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz,
3679 					    const void __user *unsafe_ptr__ign,
3680 					    struct task_struct *tsk, u64 flags)
3681 {
3682 	int ret;
3683 
3684 	if (unlikely(flags & ~BPF_F_PAD_ZEROS))
3685 		return -EINVAL;
3686 
3687 	if (unlikely(dst__sz == 0))
3688 		return 0;
3689 
3690 	ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0);
3691 	if (ret < 0) {
3692 		if (flags & BPF_F_PAD_ZEROS)
3693 			memset(dst, 0, dst__sz);
3694 		return ret;
3695 	}
3696 
3697 	if (flags & BPF_F_PAD_ZEROS)
3698 		memset(dst + ret, 0, dst__sz - ret);
3699 
3700 	return ret + 1;
3701 }
3702 
/* Keep unsigned long in the prototype so that the kfunc is usable when emitted
 * to vmlinux.h in BPF programs directly. Note that while in a BPF prog the
 * unsigned long always points to an 8-byte region on the stack, the kernel may
 * only read and write 4 bytes of it on 32-bit.
 *
 * Calls local_irq_save() with the flags word that @flags__irq_flag points to.
 */
__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
{
	local_irq_save(*flags__irq_flag);
}
3712 
/* Calls local_irq_restore() with the flags word that @flags__irq_flag points to. */
__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
{
	local_irq_restore(*flags__irq_flag);
}
3717 
/* The body is intentionally empty: calling this kfunc executes nothing here.
 * NOTE(review): any special treatment of calls to it happens outside this
 * function - confirm against the verifier.
 */
__bpf_kfunc void __bpf_trap(void)
{
}
3721 
3722 /*
3723  * Kfuncs for string operations.
3724  *
3725  * Since strings are not necessarily %NUL-terminated, we cannot directly call
3726  * in-kernel implementations. Instead, we open-code the implementations using
3727  * __get_kernel_nofault instead of plain dereference to make them safe.
3728  */
3729 
3730 static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len)
3731 {
3732 	char c1, c2;
3733 	int i;
3734 
3735 	if (!copy_from_kernel_nofault_allowed(s1, 1) ||
3736 	    !copy_from_kernel_nofault_allowed(s2, 1)) {
3737 		return -ERANGE;
3738 	}
3739 
3740 	guard(pagefault)();
3741 	for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) {
3742 		__get_kernel_nofault(&c1, s1, char, err_out);
3743 		__get_kernel_nofault(&c2, s2, char, err_out);
3744 		if (ignore_case) {
3745 			c1 = tolower(c1);
3746 			c2 = tolower(c2);
3747 		}
3748 		if (c1 != c2)
3749 			return c1 < c2 ? -1 : 1;
3750 		if (c1 == '\0')
3751 			return 0;
3752 		s1++;
3753 		s2++;
3754 	}
3755 	return i == XATTR_SIZE_MAX ? -E2BIG : 0;
3756 err_out:
3757 	return -EFAULT;
3758 }
3759 
/**
 * bpf_strcmp - Compare two strings
 * @s1__ign: One string
 * @s2__ign: Another string
 *
 * Case-sensitive comparison of up to XATTR_SIZE_MAX characters, implemented
 * by __bpf_strncasecmp().
 *
 * Return:
 * * %0       - Strings are equal
 * * %-1      - @s1__ign is smaller
 * * %1       - @s2__ign is smaller
 * * %-EFAULT - Cannot read one of the strings
 * * %-E2BIG  - One of strings is too large
 * * %-ERANGE - One of strings is outside of kernel address space
 */
__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
{
	return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX);
}
3777 
/**
 * bpf_strcasecmp - Compare two strings, ignoring the case of the characters
 * @s1__ign: One string
 * @s2__ign: Another string
 *
 * Case-insensitive comparison of up to XATTR_SIZE_MAX characters, implemented
 * by __bpf_strncasecmp().
 *
 * Return:
 * * %0       - Strings are equal
 * * %-1      - @s1__ign is smaller
 * * %1       - @s2__ign is smaller
 * * %-EFAULT - Cannot read one of the strings
 * * %-E2BIG  - One of strings is too large
 * * %-ERANGE - One of strings is outside of kernel address space
 */
__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
{
	return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX);
}
3795 
/**
 * bpf_strncasecmp - Compare two length-limited strings, ignoring case
 * @s1__ign: One string
 * @s2__ign: Another string
 * @len: The maximum number of characters to compare
 *
 * Case-insensitive comparison of at most @len characters, implemented by
 * __bpf_strncasecmp().
 *
 * Return:
 * * %0       - Strings are equal
 * * %-1      - @s1__ign is smaller
 * * %1       - @s2__ign is smaller
 * * %-EFAULT - Cannot read one of the strings
 * * %-E2BIG  - One of strings is too large
 * * %-ERANGE - One of strings is outside of kernel address space
 */
__bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len)
{
	return __bpf_strncasecmp(s1__ign, s2__ign, true, len);
}
3814 
3815 /**
3816  * bpf_strnchr - Find a character in a length limited string
3817  * @s__ign: The string to be searched
3818  * @count: The number of characters to be searched
3819  * @c: The character to search for
3820  *
3821  * Note that the %NUL-terminator is considered part of the string, and can
3822  * be searched for.
3823  *
3824  * Return:
3825  * * >=0      - Index of the first occurrence of @c within @s__ign
3826  * * %-ENOENT - @c not found in the first @count characters of @s__ign
3827  * * %-EFAULT - Cannot read @s__ign
3828  * * %-E2BIG  - @s__ign is too large
3829  * * %-ERANGE - @s__ign is outside of kernel address space
3830  */
3831 __bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c)
3832 {
3833 	char sc;
3834 	int i;
3835 
3836 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3837 		return -ERANGE;
3838 
3839 	guard(pagefault)();
3840 	for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
3841 		__get_kernel_nofault(&sc, s__ign, char, err_out);
3842 		if (sc == c)
3843 			return i;
3844 		if (sc == '\0')
3845 			return -ENOENT;
3846 		s__ign++;
3847 	}
3848 	return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT;
3849 err_out:
3850 	return -EFAULT;
3851 }
3852 
/**
 * bpf_strchr - Find the first occurrence of a character in a string
 * @s__ign: The string to be searched
 * @c: The character to search for
 *
 * Note that the %NUL-terminator is considered part of the string, and can
 * be searched for.
 *
 * Implemented as bpf_strnchr() with a count of XATTR_SIZE_MAX.
 *
 * Return:
 * * >=0      - The index of the first occurrence of @c within @s__ign
 * * %-ENOENT - @c not found in @s__ign
 * * %-EFAULT - Cannot read @s__ign
 * * %-E2BIG  - @s__ign is too large
 * * %-ERANGE - @s__ign is outside of kernel address space
 */
__bpf_kfunc int bpf_strchr(const char *s__ign, char c)
{
	return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c);
}
3872 
3873 /**
3874  * bpf_strchrnul - Find and return a character in a string, or end of string
3875  * @s__ign: The string to be searched
3876  * @c: The character to search for
3877  *
3878  * Return:
3879  * * >=0      - Index of the first occurrence of @c within @s__ign or index of
3880  *              the null byte at the end of @s__ign when @c is not found
3881  * * %-EFAULT - Cannot read @s__ign
3882  * * %-E2BIG  - @s__ign is too large
3883  * * %-ERANGE - @s__ign is outside of kernel address space
3884  */
3885 __bpf_kfunc int bpf_strchrnul(const char *s__ign, char c)
3886 {
3887 	char sc;
3888 	int i;
3889 
3890 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3891 		return -ERANGE;
3892 
3893 	guard(pagefault)();
3894 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
3895 		__get_kernel_nofault(&sc, s__ign, char, err_out);
3896 		if (sc == '\0' || sc == c)
3897 			return i;
3898 		s__ign++;
3899 	}
3900 	return -E2BIG;
3901 err_out:
3902 	return -EFAULT;
3903 }
3904 
3905 /**
3906  * bpf_strrchr - Find the last occurrence of a character in a string
3907  * @s__ign: The string to be searched
3908  * @c: The character to search for
3909  *
3910  * Return:
3911  * * >=0      - Index of the last occurrence of @c within @s__ign
3912  * * %-ENOENT - @c not found in @s__ign
3913  * * %-EFAULT - Cannot read @s__ign
3914  * * %-E2BIG  - @s__ign is too large
3915  * * %-ERANGE - @s__ign is outside of kernel address space
3916  */
3917 __bpf_kfunc int bpf_strrchr(const char *s__ign, int c)
3918 {
3919 	char sc;
3920 	int i, last = -ENOENT;
3921 
3922 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3923 		return -ERANGE;
3924 
3925 	guard(pagefault)();
3926 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
3927 		__get_kernel_nofault(&sc, s__ign, char, err_out);
3928 		if (sc == c)
3929 			last = i;
3930 		if (sc == '\0')
3931 			return last;
3932 		s__ign++;
3933 	}
3934 	return -E2BIG;
3935 err_out:
3936 	return -EFAULT;
3937 }
3938 
3939 /**
3940  * bpf_strnlen - Calculate the length of a length-limited string
3941  * @s__ign: The string
3942  * @count: The maximum number of characters to count
3943  *
3944  * Return:
3945  * * >=0      - The length of @s__ign
3946  * * %-EFAULT - Cannot read @s__ign
3947  * * %-E2BIG  - @s__ign is too large
3948  * * %-ERANGE - @s__ign is outside of kernel address space
3949  */
3950 __bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count)
3951 {
3952 	char c;
3953 	int i;
3954 
3955 	if (!copy_from_kernel_nofault_allowed(s__ign, 1))
3956 		return -ERANGE;
3957 
3958 	guard(pagefault)();
3959 	for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
3960 		__get_kernel_nofault(&c, s__ign, char, err_out);
3961 		if (c == '\0')
3962 			return i;
3963 		s__ign++;
3964 	}
3965 	return i == XATTR_SIZE_MAX ? -E2BIG : i;
3966 err_out:
3967 	return -EFAULT;
3968 }
3969 
/**
 * bpf_strlen - Calculate the length of a string
 * @s__ign: The string
 *
 * Implemented as bpf_strnlen() with a count of XATTR_SIZE_MAX.
 *
 * Return:
 * * >=0      - The length of @s__ign
 * * %-EFAULT - Cannot read @s__ign
 * * %-E2BIG  - @s__ign is too large
 * * %-ERANGE - @s__ign is outside of kernel address space
 */
__bpf_kfunc int bpf_strlen(const char *s__ign)
{
	return bpf_strnlen(s__ign, XATTR_SIZE_MAX);
}
3984 
3985 /**
3986  * bpf_strspn - Calculate the length of the initial substring of @s__ign which
3987  *              only contains letters in @accept__ign
3988  * @s__ign: The string to be searched
3989  * @accept__ign: The string to search for
3990  *
3991  * Return:
3992  * * >=0      - The length of the initial substring of @s__ign which only
3993  *              contains letters from @accept__ign
3994  * * %-EFAULT - Cannot read one of the strings
3995  * * %-E2BIG  - One of the strings is too large
3996  * * %-ERANGE - One of the strings is outside of kernel address space
3997  */
3998 __bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign)
3999 {
4000 	char cs, ca;
4001 	int i, j;
4002 
4003 	if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
4004 	    !copy_from_kernel_nofault_allowed(accept__ign, 1)) {
4005 		return -ERANGE;
4006 	}
4007 
4008 	guard(pagefault)();
4009 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
4010 		__get_kernel_nofault(&cs, s__ign, char, err_out);
4011 		if (cs == '\0')
4012 			return i;
4013 		for (j = 0; j < XATTR_SIZE_MAX; j++) {
4014 			__get_kernel_nofault(&ca, accept__ign + j, char, err_out);
4015 			if (cs == ca || ca == '\0')
4016 				break;
4017 		}
4018 		if (j == XATTR_SIZE_MAX)
4019 			return -E2BIG;
4020 		if (ca == '\0')
4021 			return i;
4022 		s__ign++;
4023 	}
4024 	return -E2BIG;
4025 err_out:
4026 	return -EFAULT;
4027 }
4028 
4029 /**
4030  * bpf_strcspn - Calculate the length of the initial substring of @s__ign which
4031  *               does not contain letters in @reject__ign
4032  * @s__ign: The string to be searched
4033  * @reject__ign: The string to search for
4034  *
4035  * Return:
4036  * * >=0      - The length of the initial substring of @s__ign which does not
4037  *              contain letters from @reject__ign
4038  * * %-EFAULT - Cannot read one of the strings
4039  * * %-E2BIG  - One of the strings is too large
4040  * * %-ERANGE - One of the strings is outside of kernel address space
4041  */
4042 __bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign)
4043 {
4044 	char cs, cr;
4045 	int i, j;
4046 
4047 	if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
4048 	    !copy_from_kernel_nofault_allowed(reject__ign, 1)) {
4049 		return -ERANGE;
4050 	}
4051 
4052 	guard(pagefault)();
4053 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
4054 		__get_kernel_nofault(&cs, s__ign, char, err_out);
4055 		if (cs == '\0')
4056 			return i;
4057 		for (j = 0; j < XATTR_SIZE_MAX; j++) {
4058 			__get_kernel_nofault(&cr, reject__ign + j, char, err_out);
4059 			if (cs == cr || cr == '\0')
4060 				break;
4061 		}
4062 		if (j == XATTR_SIZE_MAX)
4063 			return -E2BIG;
4064 		if (cr != '\0')
4065 			return i;
4066 		s__ign++;
4067 	}
4068 	return -E2BIG;
4069 err_out:
4070 	return -EFAULT;
4071 }
4072 
4073 static int __bpf_strnstr(const char *s1, const char *s2, size_t len,
4074 			 bool ignore_case)
4075 {
4076 	char c1, c2;
4077 	int i, j;
4078 
4079 	if (!copy_from_kernel_nofault_allowed(s1, 1) ||
4080 	    !copy_from_kernel_nofault_allowed(s2, 1)) {
4081 		return -ERANGE;
4082 	}
4083 
4084 	guard(pagefault)();
4085 	for (i = 0; i < XATTR_SIZE_MAX; i++) {
4086 		for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
4087 			__get_kernel_nofault(&c2, s2 + j, char, err_out);
4088 			if (c2 == '\0')
4089 				return i;
4090 			/*
4091 			 * We allow reading an extra byte from s2 (note the
4092 			 * `i + j <= len` above) to cover the case when s2 is
4093 			 * a suffix of the first len chars of s1.
4094 			 */
4095 			if (i + j == len)
4096 				break;
4097 			__get_kernel_nofault(&c1, s1 + j, char, err_out);
4098 
4099 			if (ignore_case) {
4100 				c1 = tolower(c1);
4101 				c2 = tolower(c2);
4102 			}
4103 
4104 			if (c1 == '\0')
4105 				return -ENOENT;
4106 			if (c1 != c2)
4107 				break;
4108 		}
4109 		if (j == XATTR_SIZE_MAX)
4110 			return -E2BIG;
4111 		if (i + j == len)
4112 			return -ENOENT;
4113 		s1++;
4114 	}
4115 	return -E2BIG;
4116 err_out:
4117 	return -EFAULT;
4118 }
4119 
4120 /**
4121  * bpf_strstr - Find the first substring in a string
4122  * @s1__ign: The string to be searched
4123  * @s2__ign: The string to search for
4124  *
4125  * Return:
4126  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4127  *              within @s1__ign
4128  * * %-ENOENT - @s2__ign is not a substring of @s1__ign
4129  * * %-EFAULT - Cannot read one of the strings
4130  * * %-E2BIG  - One of the strings is too large
4131  * * %-ERANGE - One of the strings is outside of kernel address space
4132  */
4133 __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
4134 {
4135 	return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false);
4136 }
4137 
4138 /**
4139  * bpf_strcasestr - Find the first substring in a string, ignoring the case of
4140  *                  the characters
4141  * @s1__ign: The string to be searched
4142  * @s2__ign: The string to search for
4143  *
4144  * Return:
4145  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4146  *              within @s1__ign
4147  * * %-ENOENT - @s2__ign is not a substring of @s1__ign
4148  * * %-EFAULT - Cannot read one of the strings
4149  * * %-E2BIG  - One of the strings is too large
4150  * * %-ERANGE - One of the strings is outside of kernel address space
4151  */
4152 __bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign)
4153 {
4154 	return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true);
4155 }
4156 
4157 /**
4158  * bpf_strnstr - Find the first substring in a length-limited string
4159  * @s1__ign: The string to be searched
4160  * @s2__ign: The string to search for
4161  * @len: the maximum number of characters to search
4162  *
4163  * Return:
4164  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4165  *              within the first @len characters of @s1__ign
4166  * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
4167  * * %-EFAULT - Cannot read one of the strings
4168  * * %-E2BIG  - One of the strings is too large
4169  * * %-ERANGE - One of the strings is outside of kernel address space
4170  */
4171 __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign,
4172 			    size_t len)
4173 {
4174 	return __bpf_strnstr(s1__ign, s2__ign, len, false);
4175 }
4176 
4177 /**
4178  * bpf_strncasestr - Find the first substring in a length-limited string,
4179  *                   ignoring the case of the characters
4180  * @s1__ign: The string to be searched
4181  * @s2__ign: The string to search for
4182  * @len: the maximum number of characters to search
4183  *
4184  * Return:
4185  * * >=0      - Index of the first character of the first occurrence of @s2__ign
4186  *              within the first @len characters of @s1__ign
4187  * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
4188  * * %-EFAULT - Cannot read one of the strings
4189  * * %-E2BIG  - One of the strings is too large
4190  * * %-ERANGE - One of the strings is outside of kernel address space
4191  */
4192 __bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
4193 				size_t len)
4194 {
4195 	return __bpf_strnstr(s1__ign, s2__ign, len, true);
4196 }
4197 
4198 #ifdef CONFIG_KEYS
4199 /**
4200  * bpf_lookup_user_key - lookup a key by its serial
4201  * @serial: key handle serial number
4202  * @flags: lookup-specific flags
4203  *
4204  * Search a key with a given *serial* and the provided *flags*.
4205  * If found, increment the reference count of the key by one, and
4206  * return it in the bpf_key structure.
4207  *
4208  * The bpf_key structure must be passed to bpf_key_put() when done
4209  * with it, so that the key reference count is decremented and the
4210  * bpf_key structure is freed.
4211  *
4212  * Permission checks are deferred to the time the key is used by
4213  * one of the available key-specific kfuncs.
4214  *
4215  * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
4216  * special keyring (e.g. session keyring), if it doesn't yet exist.
4217  * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
4218  * for the key construction, and to retrieve uninstantiated keys (keys
4219  * without data attached to them).
4220  *
4221  * Return: a bpf_key pointer with a valid key pointer if the key is found, a
4222  *         NULL pointer otherwise.
4223  */
4224 __bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
4225 {
4226 	key_ref_t key_ref;
4227 	struct bpf_key *bkey;
4228 
4229 	if (flags & ~KEY_LOOKUP_ALL)
4230 		return NULL;
4231 
4232 	/*
4233 	 * Permission check is deferred until the key is used, as the
4234 	 * intent of the caller is unknown here.
4235 	 */
4236 	key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
4237 	if (IS_ERR(key_ref))
4238 		return NULL;
4239 
4240 	bkey = kmalloc_obj(*bkey);
4241 	if (!bkey) {
4242 		key_put(key_ref_to_ptr(key_ref));
4243 		return NULL;
4244 	}
4245 
4246 	bkey->key = key_ref_to_ptr(key_ref);
4247 	bkey->has_ref = true;
4248 
4249 	return bkey;
4250 }
4251 
4252 /**
4253  * bpf_lookup_system_key - lookup a key by a system-defined ID
4254  * @id: key ID
4255  *
4256  * Obtain a bpf_key structure with a key pointer set to the passed key ID.
4257  * The key pointer is marked as invalid, to prevent bpf_key_put() from
4258  * attempting to decrement the key reference count on that pointer. The key
4259  * pointer set in such way is currently understood only by
4260  * verify_pkcs7_signature().
4261  *
4262  * Set *id* to one of the values defined in include/linux/verification.h:
4263  * 0 for the primary keyring (immutable keyring of system keys);
4264  * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
4265  * (where keys can be added only if they are vouched for by existing keys
4266  * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
4267  * keyring (primarily used by the integrity subsystem to verify a kexec'ed
4268  * kerned image and, possibly, the initramfs signature).
4269  *
4270  * Return: a bpf_key pointer with an invalid key pointer set from the
4271  *         pre-determined ID on success, a NULL pointer otherwise
4272  */
4273 __bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
4274 {
4275 	struct bpf_key *bkey;
4276 
4277 	if (system_keyring_id_check(id) < 0)
4278 		return NULL;
4279 
4280 	bkey = kmalloc_obj(*bkey, GFP_ATOMIC);
4281 	if (!bkey)
4282 		return NULL;
4283 
4284 	bkey->key = (struct key *)(unsigned long)id;
4285 	bkey->has_ref = false;
4286 
4287 	return bkey;
4288 }
4289 
4290 /**
4291  * bpf_key_put - decrement key reference count if key is valid and free bpf_key
4292  * @bkey: bpf_key structure
4293  *
4294  * Decrement the reference count of the key inside *bkey*, if the pointer
4295  * is valid, and free *bkey*.
4296  */
4297 __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
4298 {
4299 	if (bkey->has_ref)
4300 		key_put(bkey->key);
4301 
4302 	kfree(bkey);
4303 }
4304 
4305 /**
4306  * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
4307  * @data_p: data to verify
4308  * @sig_p: signature of the data
4309  * @trusted_keyring: keyring with keys trusted for signature verification
4310  *
4311  * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
4312  * with keys in a keyring referenced by *trusted_keyring*.
4313  *
4314  * Return: 0 on success, a negative value on error.
4315  */
4316 __bpf_kfunc int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p,
4317 			       const struct bpf_dynptr *sig_p,
4318 			       struct bpf_key *trusted_keyring)
4319 {
4320 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
4321 	const struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
4322 	const struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
4323 	const void *data, *sig;
4324 	u32 data_len, sig_len;
4325 	int ret;
4326 
4327 	if (trusted_keyring->has_ref) {
4328 		/*
4329 		 * Do the permission check deferred in bpf_lookup_user_key().
4330 		 * See bpf_lookup_user_key() for more details.
4331 		 *
4332 		 * A call to key_task_permission() here would be redundant, as
4333 		 * it is already done by keyring_search() called by
4334 		 * find_asymmetric_key().
4335 		 */
4336 		ret = key_validate(trusted_keyring->key);
4337 		if (ret < 0)
4338 			return ret;
4339 	}
4340 
4341 	data_len = __bpf_dynptr_size(data_ptr);
4342 	data = __bpf_dynptr_data(data_ptr, data_len);
4343 	if (!data)
4344 		return -EINVAL;
4345 
4346 	sig_len = __bpf_dynptr_size(sig_ptr);
4347 	sig = __bpf_dynptr_data(sig_ptr, sig_len);
4348 	if (!sig)
4349 		return -EINVAL;
4350 
4351 	return verify_pkcs7_signature(data, data_len, sig, sig_len,
4352 				      trusted_keyring->key,
4353 				      VERIFYING_BPF_SIGNATURE, NULL,
4354 				      NULL);
4355 #else
4356 	return -EOPNOTSUPP;
4357 #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
4358 }
4359 #endif /* CONFIG_KEYS */
4360 
/*
 * Signature of the callback stored in bpf_task_work_ctx::callback_fn.
 * bpf_task_work_callback() invokes it with the context's map, the key
 * obtained from map_key_from_value(), and the map value.
 */
4361 typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
4362 
/*
 * Lifecycle states of a bpf_task_work_ctx. Transitions performed in this
 * file:
 *   STANDBY    -> PENDING     bpf_task_work_acquire_ctx()
 *   PENDING    -> SCHEDULING  bpf_task_work_irq()
 *   SCHEDULING -> SCHEDULED   bpf_task_work_irq(), after task_work_add()
 *   SCHEDULING -> STANDBY     bpf_task_work_irq(), task_work_add() failed
 *   SCHEDULING or SCHEDULED -> RUNNING   bpf_task_work_callback()
 *   RUNNING    -> STANDBY     bpf_task_work_callback(), callback returned
 *   any state  -> FREED       bpf_task_work_cancel_and_free()
 */
4363 enum bpf_task_work_state {
4364 	/* bpf_task_work is ready to be used */
4365 	BPF_TW_STANDBY = 0,
4366 	/* irq work scheduling in progress */
4367 	BPF_TW_PENDING,
4368 	/* task work scheduling in progress */
4369 	BPF_TW_SCHEDULING,
4370 	/* task work is scheduled successfully */
4371 	BPF_TW_SCHEDULED,
4372 	/* callback is running */
4373 	BPF_TW_RUNNING,
4374 	/* associated BPF map value is deleted */
4375 	BPF_TW_FREED,
4376 };
4377 
/*
 * Per map-value bookkeeping for a scheduled BPF task work. Allocated lazily
 * by bpf_task_work_fetch_ctx() and released through kfree_rcu() in
 * bpf_task_work_destroy() once the last reference is dropped.
 */
4378 struct bpf_task_work_ctx {
	/* current lifecycle state; only changed with cmpxchg()/xchg() */
4379 	enum bpf_task_work_state state;
	/* starts at 1 (the map's own ref); one more is taken per scheduling */
4380 	refcount_t refcnt;
	/* callback_head handed to task_work_add()/task_work_cancel() */
4381 	struct callback_head work;
	/* reused for scheduling, deferred cancel and deferred destroy */
4382 	struct irq_work irq_work;
4383 	/* bpf_prog that schedules task work */
4384 	struct bpf_prog *prog;
4385 	/* task for which callback is scheduled */
4386 	struct task_struct *task;
4387 	/* the map and map value associated with this context */
4388 	struct bpf_map *map;
4389 	void *map_val;
	/* notify mode passed to task_work_add() */
4390 	enum task_work_notify_mode mode;
	/* BPF callback invoked from bpf_task_work_callback() */
4391 	bpf_task_work_callback_t callback_fn;
	/* used by kfree_rcu() when the context is destroyed */
4392 	struct rcu_head rcu;
4393 } __aligned(8);
4394 
4395 /* Actual type for struct bpf_task_work */
4396 struct bpf_task_work_kern {
	/*
	 * NULL until bpf_task_work_fetch_ctx() installs a context with
	 * cmpxchg(); reset to NULL by bpf_task_work_cancel_and_free().
	 */
4397 	struct bpf_task_work_ctx *ctx;
4398 };
4399 
4400 static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
4401 {
4402 	if (ctx->prog) {
4403 		bpf_prog_put(ctx->prog);
4404 		ctx->prog = NULL;
4405 	}
4406 	if (ctx->task) {
4407 		bpf_task_release(ctx->task);
4408 		ctx->task = NULL;
4409 	}
4410 }
4411 
4412 static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
4413 {
4414 	return refcount_inc_not_zero(&ctx->refcnt);
4415 }
4416 
4417 static void bpf_task_work_destroy(struct irq_work *irq_work)
4418 {
4419 	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
4420 
4421 	bpf_task_work_ctx_reset(ctx);
4422 	kfree_rcu(ctx, rcu);
4423 }
4424 
4425 static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
4426 {
4427 	if (!refcount_dec_and_test(&ctx->refcnt))
4428 		return;
4429 
4430 	if (irqs_disabled()) {
4431 		ctx->irq_work = IRQ_WORK_INIT(bpf_task_work_destroy);
4432 		irq_work_queue(&ctx->irq_work);
4433 	} else {
4434 		bpf_task_work_destroy(&ctx->irq_work);
4435 	}
4436 }
4437 
4438 static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
4439 {
4440 	/*
4441 	 * Scheduled task_work callback holds ctx ref, so if we successfully
4442 	 * cancelled, we put that ref on callback's behalf. If we couldn't
4443 	 * cancel, callback will inevitably run or has already completed
4444 	 * running, and it would have taken care of its ctx ref itself.
4445 	 */
4446 	if (task_work_cancel(ctx->task, &ctx->work))
4447 		bpf_task_work_ctx_put(ctx);
4448 }
4449 
/*
 * task_work handler installed by bpf_task_work_schedule() via
 * init_task_work(). Moves the context to RUNNING, invokes the BPF callback
 * with migration disabled, releases the prog/task references, tries to return
 * the context to STANDBY and finally drops the reference that was held for
 * this callback. If the context was already FREED, only the reference is
 * dropped and the BPF callback is not invoked.
 */
4450 static void bpf_task_work_callback(struct callback_head *cb)
4451 {
4452 	struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
4453 	enum bpf_task_work_state state;
4454 	u32 idx;
4455 	void *key;
4456 
4457 	/* Read lock is needed to protect ctx and map key/value access */
4458 	guard(rcu_tasks_trace)();
4459 	/*
4460 	 * This callback may start running before bpf_task_work_irq() switched to
4461 	 * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
4462 	 */
4463 	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
4464 	if (state == BPF_TW_SCHEDULED)
4465 		state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
	/* cmpxchg() returned the previous state: FREED means the map value is gone */
4466 	if (state == BPF_TW_FREED) {
4467 		bpf_task_work_ctx_put(ctx);
4468 		return;
4469 	}
4470 
	/* idx is filled in by map_key_from_value() but not used afterwards */
4471 	key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
4472 
4473 	migrate_disable();
4474 	ctx->callback_fn(ctx->map, key, ctx->map_val);
4475 	migrate_enable();
4476 
4477 	bpf_task_work_ctx_reset(ctx);
	/* result ignored: the state stays as is if it is no longer RUNNING */
4478 	(void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
4479 
4480 	bpf_task_work_ctx_put(ctx);
4481 }
4482 
/*
 * irq_work handler installed by bpf_task_work_schedule(). Moves the context
 * from PENDING to SCHEDULING and hands ctx->work to task_work_add(). On
 * failure the prog/task references and the callback's ctx reference are
 * released; on success the state advances to SCHEDULED, unless the context
 * was FREED in the meantime, in which case the task_work is cancelled again.
 */
4483 static void bpf_task_work_irq(struct irq_work *irq_work)
4484 {
4485 	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
4486 	enum bpf_task_work_state state;
4487 	int err;
4488 
4489 	guard(rcu)();
4490 
	/* not PENDING any more: drop the reference taken for the callback */
4491 	if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
4492 		bpf_task_work_ctx_put(ctx);
4493 		return;
4494 	}
4495 
4496 	err = task_work_add(ctx->task, &ctx->work, ctx->mode);
4497 	if (err) {
4498 		bpf_task_work_ctx_reset(ctx);
4499 		/*
4500 		 * try to switch back to STANDBY for another task_work reuse, but we might have
4501 		 * gone to FREED already, which is fine as we already cleaned up after ourselves
4502 		 */
4503 		(void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
4504 		bpf_task_work_ctx_put(ctx);
4505 		return;
4506 	}
4507 
4508 	/*
4509 	 * It's technically possible for just scheduled task_work callback to
4510 	 * complete running by now, going SCHEDULING -> RUNNING and then
4511 	 * dropping its ctx refcount. Instead of capturing an extra ref just
4512 	 * to protect below ctx->state access, we rely on rcu_read_lock
4513 	 * above to prevent kfree_rcu from freeing ctx before we return.
4514 	 */
4515 	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
4516 	if (state == BPF_TW_FREED)
4517 		bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
4518 }
4519 
4520 static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
4521 							 struct bpf_map *map)
4522 {
4523 	struct bpf_task_work_kern *twk = (void *)tw;
4524 	struct bpf_task_work_ctx *ctx, *old_ctx;
4525 
4526 	ctx = READ_ONCE(twk->ctx);
4527 	if (ctx)
4528 		return ctx;
4529 
4530 	ctx = bpf_map_kmalloc_nolock(map, sizeof(*ctx), 0, NUMA_NO_NODE);
4531 	if (!ctx)
4532 		return ERR_PTR(-ENOMEM);
4533 
4534 	memset(ctx, 0, sizeof(*ctx));
4535 	refcount_set(&ctx->refcnt, 1); /* map's own ref */
4536 	ctx->state = BPF_TW_STANDBY;
4537 
4538 	old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
4539 	if (old_ctx) {
4540 		/*
4541 		 * tw->ctx is set by concurrent BPF program, release allocated
4542 		 * memory and try to reuse already set context.
4543 		 */
4544 		kfree_nolock(ctx);
4545 		return old_ctx;
4546 	}
4547 
4548 	return ctx; /* Success */
4549 }
4550 
/*
 * Obtain the context of @tw for a new scheduling attempt: fetch (or create)
 * it, take the reference the task_work callback will hold, and claim it by
 * moving STANDBY -> PENDING. Returns the context on success, the error from
 * bpf_task_work_fetch_ctx() if that fails, or ERR_PTR(-EBUSY) when the
 * reference or the state transition cannot be obtained, or when the map has
 * no user references left.
 */
4551 static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
4552 							   struct bpf_map *map)
4553 {
4554 	struct bpf_task_work_ctx *ctx;
4555 
4556 	/*
4557 	 * Sleepable BPF programs hold rcu_read_lock_trace but not
4558 	 * regular rcu_read_lock. Since kfree_rcu waits for regular
4559 	 * RCU GP, the ctx can be freed while we're between reading
4560 	 * the pointer and incrementing the refcount. Take regular
4561 	 * rcu_read_lock to prevent kfree_rcu from freeing the ctx
4562 	 * before we can tryget it.
4563 	 */
4564 	scoped_guard(rcu) {
4565 		ctx = bpf_task_work_fetch_ctx(tw, map);
4566 		if (IS_ERR(ctx))
4567 			return ctx;
4568 
4569 		/* try to get ref for task_work callback to hold */
4570 		if (!bpf_task_work_ctx_tryget(ctx))
4571 			return ERR_PTR(-EBUSY);
4572 	}
4573 
4574 	if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
4575 		/* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
4576 		bpf_task_work_ctx_put(ctx);
4577 		return ERR_PTR(-EBUSY);
4578 	}
4579 
4580 	/*
4581 	 * If no process or bpffs is holding a reference to the map, no new callbacks should be
4582 	 * scheduled. This does not address any race or correctness issue, but rather is a policy
4583 	 * choice: dropping user references should stop everything.
4584 	 */
4585 	if (!atomic64_read(&map->usercnt)) {
4586 		/* drop ref we just got for task_work callback itself */
4587 		bpf_task_work_ctx_put(ctx);
4588 		/* transfer map's ref into cancel_and_free() */
4589 		bpf_task_work_cancel_and_free(tw);
4590 		return ERR_PTR(-EBUSY);
4591 	}
4592 
4593 	return ctx;
4594 }
4595 
4596 static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
4597 				  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
4598 				  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
4599 {
4600 	struct bpf_prog *prog;
4601 	struct bpf_task_work_ctx *ctx;
4602 	int err;
4603 
4604 	BTF_TYPE_EMIT(struct bpf_task_work);
4605 
4606 	prog = bpf_prog_inc_not_zero(aux->prog);
4607 	if (IS_ERR(prog))
4608 		return -EBADF;
4609 	task = bpf_task_acquire(task);
4610 	if (!task) {
4611 		err = -EBADF;
4612 		goto release_prog;
4613 	}
4614 
4615 	ctx = bpf_task_work_acquire_ctx(tw, map);
4616 	if (IS_ERR(ctx)) {
4617 		err = PTR_ERR(ctx);
4618 		goto release_all;
4619 	}
4620 
4621 	ctx->task = task;
4622 	ctx->callback_fn = callback_fn;
4623 	ctx->prog = prog;
4624 	ctx->mode = mode;
4625 	ctx->map = map;
4626 	ctx->map_val = (void *)tw - map->record->task_work_off;
4627 	init_task_work(&ctx->work, bpf_task_work_callback);
4628 	init_irq_work(&ctx->irq_work, bpf_task_work_irq);
4629 
4630 	irq_work_queue(&ctx->irq_work);
4631 	return 0;
4632 
4633 release_all:
4634 	bpf_task_release(task);
4635 release_prog:
4636 	bpf_prog_put(prog);
4637 	return err;
4638 }
4639 
4640 /**
4641  * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL
4642  * mode
4643  * @task: Task struct for which callback should be scheduled
4644  * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
4645  * @map__map: bpf_map that embeds struct bpf_task_work in the values
4646  * @callback: pointer to BPF subprogram to call
4647  * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
4648  *
4649  * Return: 0 if task work has been scheduled successfully, negative error code otherwise
4650  */
4651 __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
4652 					      void *map__map, bpf_task_work_callback_t callback,
4653 					      struct bpf_prog_aux *aux)
4654 {
4655 	return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL);
4656 }
4657 
4658 /**
4659  * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME
4660  * mode
4661  * @task: Task struct for which callback should be scheduled
4662  * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
4663  * @map__map: bpf_map that embeds struct bpf_task_work in the values
4664  * @callback: pointer to BPF subprogram to call
4665  * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
4666  *
4667  * Return: 0 if task work has been scheduled successfully, negative error code otherwise
4668  */
4669 __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
4670 					      void *map__map, bpf_task_work_callback_t callback,
4671 					      struct bpf_prog_aux *aux)
4672 {
4673 	return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME);
4674 }
4675 
4676 static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
4677 			    struct bpf_dynptr_kern *ptr)
4678 {
4679 	struct bpf_dynptr_file_impl *state;
4680 
4681 	/* flags is currently unsupported */
4682 	if (flags) {
4683 		bpf_dynptr_set_null(ptr);
4684 		return -EINVAL;
4685 	}
4686 
4687 	state = kmalloc_nolock(sizeof(*state), 0, NUMA_NO_NODE);
4688 	if (!state) {
4689 		bpf_dynptr_set_null(ptr);
4690 		return -ENOMEM;
4691 	}
4692 	state->offset = 0;
4693 	state->size = U64_MAX; /* Don't restrict size, as file may change anyways */
4694 	freader_init_from_file(&state->freader, NULL, 0, file, may_sleep);
4695 	bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0);
4696 	bpf_dynptr_set_rdonly(ptr);
4697 	return 0;
4698 }
4699 
4700 __bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
4701 {
4702 	return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit);
4703 }
4704 
4705 int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
4706 {
4707 	return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit);
4708 }
4709 
4710 __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
4711 {
4712 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr;
4713 	struct bpf_dynptr_file_impl *df = ptr->data;
4714 
4715 	if (!df)
4716 		return 0;
4717 
4718 	freader_cleanup(&df->freader);
4719 	kfree_nolock(df);
4720 	bpf_dynptr_set_null(ptr);
4721 	return 0;
4722 }
4723 
4724 /**
4725  * bpf_timer_cancel_async - try to deactivate a timer
4726  * @timer:	bpf_timer to stop
4727  *
4728  * Returns:
4729  *
4730  *  *  0 when the timer was not active
4731  *  *  1 when the timer was active
4732  *  * -1 when the timer is currently executing the callback function and
4733  *       cannot be stopped
4734  *  * -ECANCELED when the timer will be cancelled asynchronously
4735  *  * -ENOMEM when out of memory
4736  *  * -EINVAL when the timer was not initialized
4737  *  * -ENOENT when this kfunc is racing with timer deletion
4738  */
4739 __bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer)
4740 {
4741 	struct bpf_async_kern *async = (void *)timer;
4742 	struct bpf_async_cb *cb;
4743 	int ret;
4744 
4745 	cb = READ_ONCE(async->cb);
4746 	if (!cb)
4747 		return -EINVAL;
4748 
4749 	/*
4750 	 * Unlike hrtimer_start() it's ok to synchronously call
4751 	 * hrtimer_try_to_cancel() when refcnt reached zero, but deferring to
4752 	 * irq_work is not, since irq callback may execute after RCU GP and
4753 	 * cb could be freed at that time. Check for refcnt zero for
4754 	 * consistency.
4755 	 */
4756 	if (!refcount_inc_not_zero(&cb->refcnt))
4757 		return -ENOENT;
4758 
4759 	if (!defer_timer_wq_op()) {
4760 		struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
4761 
4762 		ret = hrtimer_try_to_cancel(&t->timer);
4763 		bpf_async_refcount_put(cb);
4764 		return ret;
4765 	} else {
4766 		ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
4767 		return ret ? ret : -ECANCELED;
4768 	}
4769 }
4770 
/*
 * NOTE(review): presumably pairs with a __bpf_kfunc_start_defs() earlier in
 * the file (outside this chunk) - confirm. The helpers defined after this
 * point carry no __bpf_kfunc annotation.
 */
4771 __bpf_kfunc_end_defs();
4772 
4773 static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
4774 {
4775 	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
4776 
4777 	bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
4778 	bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
4779 }
4780 
/*
 * Detach the task-work context from the bpf_task_work at @val and mark it
 * FREED. If a task_work was already SCHEDULED, cancellation and the release
 * of the map's reference are deferred to irq_work; otherwise the map's
 * reference is dropped right away. Does nothing when no context is attached.
 */
4781 void bpf_task_work_cancel_and_free(void *val)
4782 {
4783 	struct bpf_task_work_kern *twk = val;
4784 	struct bpf_task_work_ctx *ctx;
4785 	enum bpf_task_work_state state;
4786 
	/* atomically take the context away so nobody else can fetch it */
4787 	ctx = xchg(&twk->ctx, NULL);
4788 	if (!ctx)
4789 		return;
4790 
	/* state holds the value that was current before switching to FREED */
4791 	state = xchg(&ctx->state, BPF_TW_FREED);
4792 	if (state == BPF_TW_SCHEDULED) {
4793 		/* run in irq_work to avoid locks in NMI */
4794 		init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
4795 		irq_work_queue(&ctx->irq_work);
4796 		return;
4797 	}
4798 
4799 	bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
4800 }
4801 
/*
 * kfunc ID set "generic_btf_ids", referenced by generic_kfunc_set below.
 * Each entry names a function and the KF_* flags attached to it; entries
 * inside #ifdef blocks exist only when that config option is enabled.
 * NOTE(review): where this set is registered, and for which program types,
 * is outside this chunk.
 */
4802 BTF_KFUNCS_START(generic_btf_ids)
4803 #ifdef CONFIG_CRASH_DUMP
4804 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
4805 #endif
/* object allocation, release and refcounting */
4806 BTF_ID_FLAGS(func, bpf_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
4807 BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
4808 BTF_ID_FLAGS(func, bpf_percpu_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
4809 BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
4810 BTF_ID_FLAGS(func, bpf_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
4811 BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
4812 BTF_ID_FLAGS(func, bpf_percpu_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
4813 BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
4814 BTF_ID_FLAGS(func, bpf_refcount_acquire, KF_ACQUIRE | KF_RET_NULL | KF_RCU | KF_IMPLICIT_ARGS)
4815 BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
/* linked list kfuncs */
4816 BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS)
4817 BTF_ID_FLAGS(func, bpf_list_push_front_impl)
4818 BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
4819 BTF_ID_FLAGS(func, bpf_list_push_back_impl)
4820 BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS)
4821 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
4822 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
4823 BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL)
4824 BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
4825 BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
4826 BTF_ID_FLAGS(func, bpf_list_is_first)
4827 BTF_ID_FLAGS(func, bpf_list_is_last)
4828 BTF_ID_FLAGS(func, bpf_list_empty)
/* task references and rbtree kfuncs */
4829 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
4830 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
4831 BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
4832 BTF_ID_FLAGS(func, bpf_rbtree_add, KF_IMPLICIT_ARGS)
4833 BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
4834 BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
4835 BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
4836 BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
4837 BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
4838 
4839 #ifdef CONFIG_CGROUPS
4840 BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
4841 BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
4842 BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
4843 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
4844 BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
4845 BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
4846 #endif
4847 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
4848 BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
4849 BTF_ID_FLAGS(func, bpf_throw)
4850 #ifdef CONFIG_BPF_EVENTS
4851 BTF_ID_FLAGS(func, bpf_send_signal_task)
4852 #endif
/* key lookup/release kfuncs; the definitions are guarded by CONFIG_KEYS too */
4853 #ifdef CONFIG_KEYS
4854 BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
4855 BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
4856 BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
4857 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
4858 BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
4859 #endif
4860 #endif
4861 #ifdef CONFIG_S390
4862 BTF_ID_FLAGS(func, bpf_get_lowcore)
4863 #endif
4864 BTF_KFUNCS_END(generic_btf_ids)
4865 
/*
 * kfunc ID set descriptor that wraps generic_btf_ids and names THIS_MODULE
 * as its owner. NOTE(review): the registration call is outside this chunk.
 */
4866 static const struct btf_kfunc_id_set generic_kfunc_set = {
4867 	.owner = THIS_MODULE,
4868 	.set   = &generic_btf_ids,
4869 };
4870 
4871 
/*
 * BTF ID list in which each struct entry is followed by a function entry:
 * task_struct with bpf_task_release_dtor and, under CONFIG_CGROUPS, cgroup
 * with bpf_cgroup_release_dtor.
 * NOTE(review): presumably consumed pairwise as (type, destructor) by code
 * outside this chunk, so the order of the entries matters - confirm.
 */
4872 BTF_ID_LIST(generic_dtor_ids)
4873 BTF_ID(struct, task_struct)
4874 BTF_ID(func, bpf_task_release_dtor)
4875 #ifdef CONFIG_CGROUPS
4876 BTF_ID(struct, cgroup)
4877 BTF_ID(func, bpf_cgroup_release_dtor)
4878 #endif
4879 
4880 BTF_KFUNCS_START(common_btf_ids)
4881 BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
4882 BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
4883 BTF_ID_FLAGS(func, bpf_rcu_read_lock)
4884 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
4885 BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
4886 BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
4887 BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
4888 BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
4889 BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
4890 BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
4891 BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
4892 BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
4893 #ifdef CONFIG_CGROUPS
4894 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
4895 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
4896 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
4897 BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED)
4898 BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
4899 BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
4900 #endif
4901 BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED)
4902 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
4903 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
4904 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
4905 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
4906 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
4907 BTF_ID_FLAGS(func, bpf_dynptr_size)
4908 BTF_ID_FLAGS(func, bpf_dynptr_clone)
4909 BTF_ID_FLAGS(func, bpf_dynptr_copy)
4910 BTF_ID_FLAGS(func, bpf_dynptr_memset)
4911 #ifdef CONFIG_NET
4912 BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
4913 #endif
4914 BTF_ID_FLAGS(func, bpf_wq_init)
4915 BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS)
4916 BTF_ID_FLAGS(func, bpf_wq_start)
4917 BTF_ID_FLAGS(func, bpf_preempt_disable)
4918 BTF_ID_FLAGS(func, bpf_preempt_enable)
4919 BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
4920 BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
4921 BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
4922 BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
4923 BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE)
4924 BTF_ID_FLAGS(func, bpf_get_kmem_cache)
4925 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
4926 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
4927 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
4928 BTF_ID_FLAGS(func, bpf_local_irq_save)
4929 BTF_ID_FLAGS(func, bpf_local_irq_restore)
4930 #ifdef CONFIG_BPF_EVENTS
4931 BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
4932 BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
4933 BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
4934 BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
4935 BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
4936 BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
4937 BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE)
4938 BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE)
4939 #endif
4940 #ifdef CONFIG_DMA_SHARED_BUFFER
4941 BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
4942 BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
4943 BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
4944 #endif
4945 BTF_ID_FLAGS(func, __bpf_trap)
4946 BTF_ID_FLAGS(func, bpf_strcmp);
4947 BTF_ID_FLAGS(func, bpf_strcasecmp);
4948 BTF_ID_FLAGS(func, bpf_strncasecmp);
4949 BTF_ID_FLAGS(func, bpf_strchr);
4950 BTF_ID_FLAGS(func, bpf_strchrnul);
4951 BTF_ID_FLAGS(func, bpf_strnchr);
4952 BTF_ID_FLAGS(func, bpf_strrchr);
4953 BTF_ID_FLAGS(func, bpf_strlen);
4954 BTF_ID_FLAGS(func, bpf_strnlen);
4955 BTF_ID_FLAGS(func, bpf_strspn);
4956 BTF_ID_FLAGS(func, bpf_strcspn);
4957 BTF_ID_FLAGS(func, bpf_strstr);
4958 BTF_ID_FLAGS(func, bpf_strcasestr);
4959 BTF_ID_FLAGS(func, bpf_strnstr);
4960 BTF_ID_FLAGS(func, bpf_strncasestr);
4961 #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
4962 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
4963 #endif
4964 BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS)
4965 BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS)
4966 BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS)
4967 BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS)
4968 BTF_ID_FLAGS(func, bpf_dynptr_from_file)
4969 BTF_ID_FLAGS(func, bpf_dynptr_file_discard, KF_RELEASE)
4970 BTF_ID_FLAGS(func, bpf_timer_cancel_async)
4971 BTF_KFUNCS_END(common_btf_ids)
4972 
4973 static const struct btf_kfunc_id_set common_kfunc_set = {
4974 	.owner = THIS_MODULE,
4975 	.set   = &common_btf_ids,
4976 };
4977 
4978 static int __init kfunc_init(void)
4979 {
4980 	int ret;
4981 	const struct btf_id_dtor_kfunc generic_dtors[] = {
4982 		{
4983 			.btf_id       = generic_dtor_ids[0],
4984 			.kfunc_btf_id = generic_dtor_ids[1]
4985 		},
4986 #ifdef CONFIG_CGROUPS
4987 		{
4988 			.btf_id       = generic_dtor_ids[2],
4989 			.kfunc_btf_id = generic_dtor_ids[3]
4990 		},
4991 #endif
4992 	};
4993 
4994 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
4995 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
4996 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
4997 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
4998 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
4999 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set);
5000 	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
5001 						  ARRAY_SIZE(generic_dtors),
5002 						  THIS_MODULE);
5003 	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
5004 }
5005 
5006 late_initcall(kfunc_init);
5007 
5008 /* Get a pointer to dynptr data up to len bytes for read only access. If
5009  * the dynptr doesn't have continuous data up to len bytes, return NULL.
5010  */
5011 const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len)
5012 {
5013 	const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
5014 
5015 	return bpf_dynptr_slice(p, 0, NULL, len);
5016 }
5017 
5018 /* Get a pointer to dynptr data up to len bytes for read write access. If
5019  * the dynptr doesn't have continuous data up to len bytes, or the dynptr
5020  * is read only, return NULL.
5021  */
5022 void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len)
5023 {
5024 	if (__bpf_dynptr_is_rdonly(ptr))
5025 		return NULL;
5026 	return (void *)__bpf_dynptr_data(ptr, len);
5027 }
5028 
5029 void bpf_map_free_internal_structs(struct bpf_map *map, void *val)
5030 {
5031 	if (btf_record_has_field(map->record, BPF_TIMER))
5032 		bpf_obj_free_timer(map->record, val);
5033 	if (btf_record_has_field(map->record, BPF_WORKQUEUE))
5034 		bpf_obj_free_workqueue(map->record, val);
5035 	if (btf_record_has_field(map->record, BPF_TASK_WORK))
5036 		bpf_obj_free_task_work(map->record, val);
5037 }
5038