xref: /linux/kernel/bpf/task_iter.c (revision f9bff0e31881d03badf191d3b0005839391f5f2b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2020 Facebook */
3 
4 #include <linux/init.h>
5 #include <linux/namei.h>
6 #include <linux/pid_namespace.h>
7 #include <linux/fs.h>
8 #include <linux/fdtable.h>
9 #include <linux/filter.h>
10 #include <linux/btf_ids.h>
11 #include "mmap_unlock_work.h"
12 
/* Printable names of the iterator types, looked up with
 * aux->task.type by bpf_iter_task_show_fdinfo().
 * NOTE(review): presumably ordered like enum bpf_iter_task_type;
 * confirm against the enum definition.
 */
static const char * const iter_task_type_names[] = {
	"ALL", "TID", "PID",
};
18 
19 struct bpf_iter_seq_task_common {
20 	struct pid_namespace *ns;
21 	enum bpf_iter_task_type	type;
22 	u32 pid;
23 	u32 pid_visiting;
24 };
25 
26 struct bpf_iter_seq_task_info {
27 	/* The first field must be struct bpf_iter_seq_task_common.
28 	 * this is assumed by {init, fini}_seq_pidns() callback functions.
29 	 */
30 	struct bpf_iter_seq_task_common common;
31 	u32 tid;
32 };
33 
34 static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
35 						   u32 *tid,
36 						   bool skip_if_dup_files)
37 {
38 	struct task_struct *task, *next_task;
39 	struct pid *pid;
40 	u32 saved_tid;
41 
42 	if (!*tid) {
43 		/* The first time, the iterator calls this function. */
44 		pid = find_pid_ns(common->pid, common->ns);
45 		if (!pid)
46 			return NULL;
47 
48 		task = get_pid_task(pid, PIDTYPE_TGID);
49 		if (!task)
50 			return NULL;
51 
52 		*tid = common->pid;
53 		common->pid_visiting = common->pid;
54 
55 		return task;
56 	}
57 
58 	/* If the control returns to user space and comes back to the
59 	 * kernel again, *tid and common->pid_visiting should be the
60 	 * same for task_seq_start() to pick up the correct task.
61 	 */
62 	if (*tid == common->pid_visiting) {
63 		pid = find_pid_ns(common->pid_visiting, common->ns);
64 		task = get_pid_task(pid, PIDTYPE_PID);
65 
66 		return task;
67 	}
68 
69 	pid = find_pid_ns(common->pid_visiting, common->ns);
70 	if (!pid)
71 		return NULL;
72 
73 	task = get_pid_task(pid, PIDTYPE_PID);
74 	if (!task)
75 		return NULL;
76 
77 retry:
78 	if (!pid_alive(task)) {
79 		put_task_struct(task);
80 		return NULL;
81 	}
82 
83 	next_task = next_thread(task);
84 	put_task_struct(task);
85 	if (!next_task)
86 		return NULL;
87 
88 	saved_tid = *tid;
89 	*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
90 	if (!*tid || *tid == common->pid) {
91 		/* Run out of tasks of a process.  The tasks of a
92 		 * thread_group are linked as circular linked list.
93 		 */
94 		*tid = saved_tid;
95 		return NULL;
96 	}
97 
98 	get_task_struct(next_task);
99 	common->pid_visiting = *tid;
100 
101 	if (skip_if_dup_files && task->files == task->group_leader->files) {
102 		task = next_task;
103 		goto retry;
104 	}
105 
106 	return next_task;
107 }
108 
109 static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
110 					     u32 *tid,
111 					     bool skip_if_dup_files)
112 {
113 	struct task_struct *task = NULL;
114 	struct pid *pid;
115 
116 	if (common->type == BPF_TASK_ITER_TID) {
117 		if (*tid && *tid != common->pid)
118 			return NULL;
119 		rcu_read_lock();
120 		pid = find_pid_ns(common->pid, common->ns);
121 		if (pid) {
122 			task = get_pid_task(pid, PIDTYPE_TGID);
123 			*tid = common->pid;
124 		}
125 		rcu_read_unlock();
126 
127 		return task;
128 	}
129 
130 	if (common->type == BPF_TASK_ITER_TGID) {
131 		rcu_read_lock();
132 		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
133 		rcu_read_unlock();
134 
135 		return task;
136 	}
137 
138 	rcu_read_lock();
139 retry:
140 	pid = find_ge_pid(*tid, common->ns);
141 	if (pid) {
142 		*tid = pid_nr_ns(pid, common->ns);
143 		task = get_pid_task(pid, PIDTYPE_PID);
144 		if (!task) {
145 			++*tid;
146 			goto retry;
147 		} else if (skip_if_dup_files && !thread_group_leader(task) &&
148 			   task->files == task->group_leader->files) {
149 			put_task_struct(task);
150 			task = NULL;
151 			++*tid;
152 			goto retry;
153 		}
154 	}
155 	rcu_read_unlock();
156 
157 	return task;
158 }
159 
160 static void *task_seq_start(struct seq_file *seq, loff_t *pos)
161 {
162 	struct bpf_iter_seq_task_info *info = seq->private;
163 	struct task_struct *task;
164 
165 	task = task_seq_get_next(&info->common, &info->tid, false);
166 	if (!task)
167 		return NULL;
168 
169 	if (*pos == 0)
170 		++*pos;
171 	return task;
172 }
173 
174 static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
175 {
176 	struct bpf_iter_seq_task_info *info = seq->private;
177 	struct task_struct *task;
178 
179 	++*pos;
180 	++info->tid;
181 	put_task_struct((struct task_struct *)v);
182 	task = task_seq_get_next(&info->common, &info->tid, false);
183 	if (!task)
184 		return NULL;
185 
186 	return task;
187 }
188 
189 struct bpf_iter__task {
190 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
191 	__bpf_md_ptr(struct task_struct *, task);
192 };
193 
194 DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
195 
196 static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
197 			   bool in_stop)
198 {
199 	struct bpf_iter_meta meta;
200 	struct bpf_iter__task ctx;
201 	struct bpf_prog *prog;
202 
203 	meta.seq = seq;
204 	prog = bpf_iter_get_info(&meta, in_stop);
205 	if (!prog)
206 		return 0;
207 
208 	ctx.meta = &meta;
209 	ctx.task = task;
210 	return bpf_iter_run_prog(prog, &ctx);
211 }
212 
213 static int task_seq_show(struct seq_file *seq, void *v)
214 {
215 	return __task_seq_show(seq, v, false);
216 }
217 
218 static void task_seq_stop(struct seq_file *seq, void *v)
219 {
220 	if (!v)
221 		(void)__task_seq_show(seq, v, true);
222 	else
223 		put_task_struct((struct task_struct *)v);
224 }
225 
226 static int bpf_iter_attach_task(struct bpf_prog *prog,
227 				union bpf_iter_link_info *linfo,
228 				struct bpf_iter_aux_info *aux)
229 {
230 	unsigned int flags;
231 	struct pid *pid;
232 	pid_t tgid;
233 
234 	if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
235 		return -EINVAL;
236 
237 	aux->task.type = BPF_TASK_ITER_ALL;
238 	if (linfo->task.tid != 0) {
239 		aux->task.type = BPF_TASK_ITER_TID;
240 		aux->task.pid = linfo->task.tid;
241 	}
242 	if (linfo->task.pid != 0) {
243 		aux->task.type = BPF_TASK_ITER_TGID;
244 		aux->task.pid = linfo->task.pid;
245 	}
246 	if (linfo->task.pid_fd != 0) {
247 		aux->task.type = BPF_TASK_ITER_TGID;
248 
249 		pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
250 		if (IS_ERR(pid))
251 			return PTR_ERR(pid);
252 
253 		tgid = pid_nr_ns(pid, task_active_pid_ns(current));
254 		aux->task.pid = tgid;
255 		put_pid(pid);
256 	}
257 
258 	return 0;
259 }
260 
261 static const struct seq_operations task_seq_ops = {
262 	.start	= task_seq_start,
263 	.next	= task_seq_next,
264 	.stop	= task_seq_stop,
265 	.show	= task_seq_show,
266 };
267 
268 struct bpf_iter_seq_task_file_info {
269 	/* The first field must be struct bpf_iter_seq_task_common.
270 	 * this is assumed by {init, fini}_seq_pidns() callback functions.
271 	 */
272 	struct bpf_iter_seq_task_common common;
273 	struct task_struct *task;
274 	u32 tid;
275 	u32 fd;
276 };
277 
278 static struct file *
279 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
280 {
281 	u32 saved_tid = info->tid;
282 	struct task_struct *curr_task;
283 	unsigned int curr_fd = info->fd;
284 
285 	/* If this function returns a non-NULL file object,
286 	 * it held a reference to the task/file.
287 	 * Otherwise, it does not hold any reference.
288 	 */
289 again:
290 	if (info->task) {
291 		curr_task = info->task;
292 		curr_fd = info->fd;
293 	} else {
294 		curr_task = task_seq_get_next(&info->common, &info->tid, true);
295                 if (!curr_task) {
296                         info->task = NULL;
297                         return NULL;
298                 }
299 
300 		/* set info->task */
301 		info->task = curr_task;
302 		if (saved_tid == info->tid)
303 			curr_fd = info->fd;
304 		else
305 			curr_fd = 0;
306 	}
307 
308 	rcu_read_lock();
309 	for (;; curr_fd++) {
310 		struct file *f;
311 		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
312 		if (!f)
313 			break;
314 		if (!get_file_rcu(f))
315 			continue;
316 
317 		/* set info->fd */
318 		info->fd = curr_fd;
319 		rcu_read_unlock();
320 		return f;
321 	}
322 
323 	/* the current task is done, go to the next task */
324 	rcu_read_unlock();
325 	put_task_struct(curr_task);
326 
327 	if (info->common.type == BPF_TASK_ITER_TID) {
328 		info->task = NULL;
329 		return NULL;
330 	}
331 
332 	info->task = NULL;
333 	info->fd = 0;
334 	saved_tid = ++(info->tid);
335 	goto again;
336 }
337 
338 static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
339 {
340 	struct bpf_iter_seq_task_file_info *info = seq->private;
341 	struct file *file;
342 
343 	info->task = NULL;
344 	file = task_file_seq_get_next(info);
345 	if (file && *pos == 0)
346 		++*pos;
347 
348 	return file;
349 }
350 
351 static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
352 {
353 	struct bpf_iter_seq_task_file_info *info = seq->private;
354 
355 	++*pos;
356 	++info->fd;
357 	fput((struct file *)v);
358 	return task_file_seq_get_next(info);
359 }
360 
361 struct bpf_iter__task_file {
362 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
363 	__bpf_md_ptr(struct task_struct *, task);
364 	u32 fd __aligned(8);
365 	__bpf_md_ptr(struct file *, file);
366 };
367 
368 DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
369 		     struct task_struct *task, u32 fd,
370 		     struct file *file)
371 
372 static int __task_file_seq_show(struct seq_file *seq, struct file *file,
373 				bool in_stop)
374 {
375 	struct bpf_iter_seq_task_file_info *info = seq->private;
376 	struct bpf_iter__task_file ctx;
377 	struct bpf_iter_meta meta;
378 	struct bpf_prog *prog;
379 
380 	meta.seq = seq;
381 	prog = bpf_iter_get_info(&meta, in_stop);
382 	if (!prog)
383 		return 0;
384 
385 	ctx.meta = &meta;
386 	ctx.task = info->task;
387 	ctx.fd = info->fd;
388 	ctx.file = file;
389 	return bpf_iter_run_prog(prog, &ctx);
390 }
391 
392 static int task_file_seq_show(struct seq_file *seq, void *v)
393 {
394 	return __task_file_seq_show(seq, v, false);
395 }
396 
397 static void task_file_seq_stop(struct seq_file *seq, void *v)
398 {
399 	struct bpf_iter_seq_task_file_info *info = seq->private;
400 
401 	if (!v) {
402 		(void)__task_file_seq_show(seq, v, true);
403 	} else {
404 		fput((struct file *)v);
405 		put_task_struct(info->task);
406 		info->task = NULL;
407 	}
408 }
409 
410 static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
411 {
412 	struct bpf_iter_seq_task_common *common = priv_data;
413 
414 	common->ns = get_pid_ns(task_active_pid_ns(current));
415 	common->type = aux->task.type;
416 	common->pid = aux->task.pid;
417 
418 	return 0;
419 }
420 
421 static void fini_seq_pidns(void *priv_data)
422 {
423 	struct bpf_iter_seq_task_common *common = priv_data;
424 
425 	put_pid_ns(common->ns);
426 }
427 
428 static const struct seq_operations task_file_seq_ops = {
429 	.start	= task_file_seq_start,
430 	.next	= task_file_seq_next,
431 	.stop	= task_file_seq_stop,
432 	.show	= task_file_seq_show,
433 };
434 
435 struct bpf_iter_seq_task_vma_info {
436 	/* The first field must be struct bpf_iter_seq_task_common.
437 	 * this is assumed by {init, fini}_seq_pidns() callback functions.
438 	 */
439 	struct bpf_iter_seq_task_common common;
440 	struct task_struct *task;
441 	struct mm_struct *mm;
442 	struct vm_area_struct *vma;
443 	u32 tid;
444 	unsigned long prev_vm_start;
445 	unsigned long prev_vm_end;
446 };
447 
/* How task_vma_seq_get_next() locates the vma to return. */
enum bpf_task_vma_iter_find_op {
	/* new task: find_vma() from address 0 */
	task_vma_iter_first_vma,
	/* mmap_lock still held: find_vma() from the end of the current vma */
	task_vma_iter_next_vma,
	/* mmap_lock was dropped: find_vma() based on the recorded range */
	task_vma_iter_find_vma,
};
453 
454 static struct vm_area_struct *
455 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
456 {
457 	enum bpf_task_vma_iter_find_op op;
458 	struct vm_area_struct *curr_vma;
459 	struct task_struct *curr_task;
460 	struct mm_struct *curr_mm;
461 	u32 saved_tid = info->tid;
462 
463 	/* If this function returns a non-NULL vma, it holds a reference to
464 	 * the task_struct, holds a refcount on mm->mm_users, and holds
465 	 * read lock on vma->mm->mmap_lock.
466 	 * If this function returns NULL, it does not hold any reference or
467 	 * lock.
468 	 */
469 	if (info->task) {
470 		curr_task = info->task;
471 		curr_vma = info->vma;
472 		curr_mm = info->mm;
473 		/* In case of lock contention, drop mmap_lock to unblock
474 		 * the writer.
475 		 *
476 		 * After relock, call find(mm, prev_vm_end - 1) to find
477 		 * new vma to process.
478 		 *
479 		 *   +------+------+-----------+
480 		 *   | VMA1 | VMA2 | VMA3      |
481 		 *   +------+------+-----------+
482 		 *   |      |      |           |
483 		 *  4k     8k     16k         400k
484 		 *
485 		 * For example, curr_vma == VMA2. Before unlock, we set
486 		 *
487 		 *    prev_vm_start = 8k
488 		 *    prev_vm_end   = 16k
489 		 *
490 		 * There are a few cases:
491 		 *
492 		 * 1) VMA2 is freed, but VMA3 exists.
493 		 *
494 		 *    find_vma() will return VMA3, just process VMA3.
495 		 *
496 		 * 2) VMA2 still exists.
497 		 *
498 		 *    find_vma() will return VMA2, process VMA2->next.
499 		 *
500 		 * 3) no more vma in this mm.
501 		 *
502 		 *    Process the next task.
503 		 *
504 		 * 4) find_vma() returns a different vma, VMA2'.
505 		 *
506 		 *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
507 		 *         because we already covered the range;
508 		 *    4.2) VMA2 and VMA2' covers different ranges, process
509 		 *         VMA2'.
510 		 */
511 		if (mmap_lock_is_contended(curr_mm)) {
512 			info->prev_vm_start = curr_vma->vm_start;
513 			info->prev_vm_end = curr_vma->vm_end;
514 			op = task_vma_iter_find_vma;
515 			mmap_read_unlock(curr_mm);
516 			if (mmap_read_lock_killable(curr_mm)) {
517 				mmput(curr_mm);
518 				goto finish;
519 			}
520 		} else {
521 			op = task_vma_iter_next_vma;
522 		}
523 	} else {
524 again:
525 		curr_task = task_seq_get_next(&info->common, &info->tid, true);
526 		if (!curr_task) {
527 			info->tid++;
528 			goto finish;
529 		}
530 
531 		if (saved_tid != info->tid) {
532 			/* new task, process the first vma */
533 			op = task_vma_iter_first_vma;
534 		} else {
535 			/* Found the same tid, which means the user space
536 			 * finished data in previous buffer and read more.
537 			 * We dropped mmap_lock before returning to user
538 			 * space, so it is necessary to use find_vma() to
539 			 * find the next vma to process.
540 			 */
541 			op = task_vma_iter_find_vma;
542 		}
543 
544 		curr_mm = get_task_mm(curr_task);
545 		if (!curr_mm)
546 			goto next_task;
547 
548 		if (mmap_read_lock_killable(curr_mm)) {
549 			mmput(curr_mm);
550 			goto finish;
551 		}
552 	}
553 
554 	switch (op) {
555 	case task_vma_iter_first_vma:
556 		curr_vma = find_vma(curr_mm, 0);
557 		break;
558 	case task_vma_iter_next_vma:
559 		curr_vma = find_vma(curr_mm, curr_vma->vm_end);
560 		break;
561 	case task_vma_iter_find_vma:
562 		/* We dropped mmap_lock so it is necessary to use find_vma
563 		 * to find the next vma. This is similar to the  mechanism
564 		 * in show_smaps_rollup().
565 		 */
566 		curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
567 		/* case 1) and 4.2) above just use curr_vma */
568 
569 		/* check for case 2) or case 4.1) above */
570 		if (curr_vma &&
571 		    curr_vma->vm_start == info->prev_vm_start &&
572 		    curr_vma->vm_end == info->prev_vm_end)
573 			curr_vma = find_vma(curr_mm, curr_vma->vm_end);
574 		break;
575 	}
576 	if (!curr_vma) {
577 		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
578 		mmap_read_unlock(curr_mm);
579 		mmput(curr_mm);
580 		goto next_task;
581 	}
582 	info->task = curr_task;
583 	info->vma = curr_vma;
584 	info->mm = curr_mm;
585 	return curr_vma;
586 
587 next_task:
588 	if (info->common.type == BPF_TASK_ITER_TID)
589 		goto finish;
590 
591 	put_task_struct(curr_task);
592 	info->task = NULL;
593 	info->mm = NULL;
594 	info->tid++;
595 	goto again;
596 
597 finish:
598 	if (curr_task)
599 		put_task_struct(curr_task);
600 	info->task = NULL;
601 	info->vma = NULL;
602 	info->mm = NULL;
603 	return NULL;
604 }
605 
606 static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
607 {
608 	struct bpf_iter_seq_task_vma_info *info = seq->private;
609 	struct vm_area_struct *vma;
610 
611 	vma = task_vma_seq_get_next(info);
612 	if (vma && *pos == 0)
613 		++*pos;
614 
615 	return vma;
616 }
617 
618 static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
619 {
620 	struct bpf_iter_seq_task_vma_info *info = seq->private;
621 
622 	++*pos;
623 	return task_vma_seq_get_next(info);
624 }
625 
626 struct bpf_iter__task_vma {
627 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
628 	__bpf_md_ptr(struct task_struct *, task);
629 	__bpf_md_ptr(struct vm_area_struct *, vma);
630 };
631 
632 DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
633 		     struct task_struct *task, struct vm_area_struct *vma)
634 
635 static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
636 {
637 	struct bpf_iter_seq_task_vma_info *info = seq->private;
638 	struct bpf_iter__task_vma ctx;
639 	struct bpf_iter_meta meta;
640 	struct bpf_prog *prog;
641 
642 	meta.seq = seq;
643 	prog = bpf_iter_get_info(&meta, in_stop);
644 	if (!prog)
645 		return 0;
646 
647 	ctx.meta = &meta;
648 	ctx.task = info->task;
649 	ctx.vma = info->vma;
650 	return bpf_iter_run_prog(prog, &ctx);
651 }
652 
653 static int task_vma_seq_show(struct seq_file *seq, void *v)
654 {
655 	return __task_vma_seq_show(seq, false);
656 }
657 
658 static void task_vma_seq_stop(struct seq_file *seq, void *v)
659 {
660 	struct bpf_iter_seq_task_vma_info *info = seq->private;
661 
662 	if (!v) {
663 		(void)__task_vma_seq_show(seq, true);
664 	} else {
665 		/* info->vma has not been seen by the BPF program. If the
666 		 * user space reads more, task_vma_seq_get_next should
667 		 * return this vma again. Set prev_vm_start to ~0UL,
668 		 * so that we don't skip the vma returned by the next
669 		 * find_vma() (case task_vma_iter_find_vma in
670 		 * task_vma_seq_get_next()).
671 		 */
672 		info->prev_vm_start = ~0UL;
673 		info->prev_vm_end = info->vma->vm_end;
674 		mmap_read_unlock(info->mm);
675 		mmput(info->mm);
676 		info->mm = NULL;
677 		put_task_struct(info->task);
678 		info->task = NULL;
679 	}
680 }
681 
682 static const struct seq_operations task_vma_seq_ops = {
683 	.start	= task_vma_seq_start,
684 	.next	= task_vma_seq_next,
685 	.stop	= task_vma_seq_stop,
686 	.show	= task_vma_seq_show,
687 };
688 
689 static const struct bpf_iter_seq_info task_seq_info = {
690 	.seq_ops		= &task_seq_ops,
691 	.init_seq_private	= init_seq_pidns,
692 	.fini_seq_private	= fini_seq_pidns,
693 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
694 };
695 
696 static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
697 {
698 	switch (aux->task.type) {
699 	case BPF_TASK_ITER_TID:
700 		info->iter.task.tid = aux->task.pid;
701 		break;
702 	case BPF_TASK_ITER_TGID:
703 		info->iter.task.pid = aux->task.pid;
704 		break;
705 	default:
706 		break;
707 	}
708 	return 0;
709 }
710 
711 static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
712 {
713 	seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
714 	if (aux->task.type == BPF_TASK_ITER_TID)
715 		seq_printf(seq, "tid:\t%u\n", aux->task.pid);
716 	else if (aux->task.type == BPF_TASK_ITER_TGID)
717 		seq_printf(seq, "pid:\t%u\n", aux->task.pid);
718 }
719 
720 static struct bpf_iter_reg task_reg_info = {
721 	.target			= "task",
722 	.attach_target		= bpf_iter_attach_task,
723 	.feature		= BPF_ITER_RESCHED,
724 	.ctx_arg_info_size	= 1,
725 	.ctx_arg_info		= {
726 		{ offsetof(struct bpf_iter__task, task),
727 		  PTR_TO_BTF_ID_OR_NULL },
728 	},
729 	.seq_info		= &task_seq_info,
730 	.fill_link_info		= bpf_iter_fill_link_info,
731 	.show_fdinfo		= bpf_iter_task_show_fdinfo,
732 };
733 
734 static const struct bpf_iter_seq_info task_file_seq_info = {
735 	.seq_ops		= &task_file_seq_ops,
736 	.init_seq_private	= init_seq_pidns,
737 	.fini_seq_private	= fini_seq_pidns,
738 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
739 };
740 
741 static struct bpf_iter_reg task_file_reg_info = {
742 	.target			= "task_file",
743 	.attach_target		= bpf_iter_attach_task,
744 	.feature		= BPF_ITER_RESCHED,
745 	.ctx_arg_info_size	= 2,
746 	.ctx_arg_info		= {
747 		{ offsetof(struct bpf_iter__task_file, task),
748 		  PTR_TO_BTF_ID_OR_NULL },
749 		{ offsetof(struct bpf_iter__task_file, file),
750 		  PTR_TO_BTF_ID_OR_NULL },
751 	},
752 	.seq_info		= &task_file_seq_info,
753 	.fill_link_info		= bpf_iter_fill_link_info,
754 	.show_fdinfo		= bpf_iter_task_show_fdinfo,
755 };
756 
757 static const struct bpf_iter_seq_info task_vma_seq_info = {
758 	.seq_ops		= &task_vma_seq_ops,
759 	.init_seq_private	= init_seq_pidns,
760 	.fini_seq_private	= fini_seq_pidns,
761 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
762 };
763 
764 static struct bpf_iter_reg task_vma_reg_info = {
765 	.target			= "task_vma",
766 	.attach_target		= bpf_iter_attach_task,
767 	.feature		= BPF_ITER_RESCHED,
768 	.ctx_arg_info_size	= 2,
769 	.ctx_arg_info		= {
770 		{ offsetof(struct bpf_iter__task_vma, task),
771 		  PTR_TO_BTF_ID_OR_NULL },
772 		{ offsetof(struct bpf_iter__task_vma, vma),
773 		  PTR_TO_BTF_ID_OR_NULL },
774 	},
775 	.seq_info		= &task_vma_seq_info,
776 	.fill_link_info		= bpf_iter_fill_link_info,
777 	.show_fdinfo		= bpf_iter_task_show_fdinfo,
778 };
779 
780 BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
781 	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
782 {
783 	struct mmap_unlock_irq_work *work = NULL;
784 	struct vm_area_struct *vma;
785 	bool irq_work_busy = false;
786 	struct mm_struct *mm;
787 	int ret = -ENOENT;
788 
789 	if (flags)
790 		return -EINVAL;
791 
792 	if (!task)
793 		return -ENOENT;
794 
795 	mm = task->mm;
796 	if (!mm)
797 		return -ENOENT;
798 
799 	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
800 
801 	if (irq_work_busy || !mmap_read_trylock(mm))
802 		return -EBUSY;
803 
804 	vma = find_vma(mm, start);
805 
806 	if (vma && vma->vm_start <= start && vma->vm_end > start) {
807 		callback_fn((u64)(long)task, (u64)(long)vma,
808 			    (u64)(long)callback_ctx, 0, 0);
809 		ret = 0;
810 	}
811 	bpf_mmap_unlock_mm(work, mm);
812 	return ret;
813 }
814 
815 const struct bpf_func_proto bpf_find_vma_proto = {
816 	.func		= bpf_find_vma,
817 	.ret_type	= RET_INTEGER,
818 	.arg1_type	= ARG_PTR_TO_BTF_ID,
819 	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
820 	.arg2_type	= ARG_ANYTHING,
821 	.arg3_type	= ARG_PTR_TO_FUNC,
822 	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
823 	.arg5_type	= ARG_ANYTHING,
824 };
825 
826 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
827 
828 static void do_mmap_read_unlock(struct irq_work *entry)
829 {
830 	struct mmap_unlock_irq_work *work;
831 
832 	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
833 		return;
834 
835 	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
836 	mmap_read_unlock_non_owner(work->mm);
837 }
838 
839 static int __init task_iter_init(void)
840 {
841 	struct mmap_unlock_irq_work *work;
842 	int ret, cpu;
843 
844 	for_each_possible_cpu(cpu) {
845 		work = per_cpu_ptr(&mmap_unlock_work, cpu);
846 		init_irq_work(&work->irq_work, do_mmap_read_unlock);
847 	}
848 
849 	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
850 	ret = bpf_iter_reg_target(&task_reg_info);
851 	if (ret)
852 		return ret;
853 
854 	task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
855 	task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
856 	ret =  bpf_iter_reg_target(&task_file_reg_info);
857 	if (ret)
858 		return ret;
859 
860 	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
861 	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
862 	return bpf_iter_reg_target(&task_vma_reg_info);
863 }
864 late_initcall(task_iter_init);
865