// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */

#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
#include <linux/mm_types.h>
#include "mmap_unlock_work.h"

static const char * const iter_task_type_names[] = {
	"ALL",
	"TID",
	"PID",
};

struct bpf_iter_seq_task_common {
	struct pid_namespace *ns;
	enum bpf_iter_task_type type;
	u32 pid;
	u32 pid_visiting;
};

struct bpf_iter_seq_task_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * This is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	u32 tid;
};

static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
						   u32 *tid,
						   bool skip_if_dup_files)
{
	struct task_struct *task;
	struct pid *pid;
	u32 next_tid;

	if (!*tid) {
		/* The first time, the iterator calls this function. */
		pid = find_pid_ns(common->pid, common->ns);
		task = get_pid_task(pid, PIDTYPE_TGID);
		if (!task)
			return NULL;

		*tid = common->pid;
		common->pid_visiting = common->pid;

		return task;
	}

	/* If control returns to user space and comes back to the
	 * kernel again, *tid and common->pid_visiting should be the
	 * same for task_seq_start() to pick up the correct task.
	 */
	if (*tid == common->pid_visiting) {
		pid = find_pid_ns(common->pid_visiting, common->ns);
		task = get_pid_task(pid, PIDTYPE_PID);

		return task;
	}

	task = find_task_by_pid_ns(common->pid_visiting, common->ns);
	if (!task)
		return NULL;

retry:
	task = __next_thread(task);
	if (!task)
		return NULL;

	next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
	if (!next_tid)
		goto retry;

	if (skip_if_dup_files && task->files == task->group_leader->files)
		goto retry;

	*tid = common->pid_visiting = next_tid;
	get_task_struct(task);
	return task;
}

static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
					     u32 *tid,
					     bool skip_if_dup_files)
{
	struct task_struct *task = NULL;
	struct pid *pid;

	if (common->type == BPF_TASK_ITER_TID) {
		if (*tid && *tid != common->pid)
			return NULL;
		rcu_read_lock();
		pid = find_pid_ns(common->pid, common->ns);
		if (pid) {
			task = get_pid_task(pid, PIDTYPE_PID);
			*tid = common->pid;
		}
		rcu_read_unlock();

		return task;
	}

	if (common->type == BPF_TASK_ITER_TGID) {
		rcu_read_lock();
		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
		rcu_read_unlock();

		return task;
	}

	rcu_read_lock();
retry:
	pid = find_ge_pid(*tid, common->ns);
	if (pid) {
		*tid = pid_nr_ns(pid, common->ns);
		task = get_pid_task(pid, PIDTYPE_PID);
		if (!task) {
			++*tid;
			goto retry;
		} else if (skip_if_dup_files && !thread_group_leader(task) &&
			   task->files == task->group_leader->files) {
			put_task_struct(task);
			task = NULL;
			++*tid;
			goto retry;
		}
	}
	rcu_read_unlock();

	return task;
}

static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	task = task_seq_get_next(&info->common, &info->tid, false);
	if (!task)
		return NULL;

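	/* Bump the seq_file position past zero only for the first record;
	 * the iterator resumes from info->tid, not from *pos, when user
	 * space comes back for more data.
	 */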
	if (*pos == 0)
		++*pos;
	return task;
}

static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	++*pos;
	++info->tid;
	put_task_struct((struct task_struct *)v);
	task = task_seq_get_next(&info->common, &info->tid, false);
	if (!task)
		return NULL;

	return task;
}

struct bpf_iter__task {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
};

DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)

static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
			   bool in_stop)
{
	struct bpf_iter_meta meta;
	struct bpf_iter__task ctx;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = task;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_seq_show(struct seq_file *seq, void *v)
{
	return __task_seq_show(seq, v, false);
}

static void task_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__task_seq_show(seq, v, true);
	else
		put_task_struct((struct task_struct *)v);
}

static int bpf_iter_attach_task(struct bpf_prog *prog,
				union bpf_iter_link_info *linfo,
				struct bpf_iter_aux_info *aux)
{
	unsigned int flags;
	struct pid *pid;
	pid_t tgid;

	if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
		return -EINVAL;

	aux->task.type = BPF_TASK_ITER_ALL;
	if (linfo->task.tid != 0) {
		aux->task.type = BPF_TASK_ITER_TID;
		aux->task.pid = linfo->task.tid;
	}
	if (linfo->task.pid != 0) {
		aux->task.type = BPF_TASK_ITER_TGID;
		aux->task.pid = linfo->task.pid;
	}
	if (linfo->task.pid_fd != 0) {
		aux->task.type = BPF_TASK_ITER_TGID;

		pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
		if (IS_ERR(pid))
			return PTR_ERR(pid);

		tgid = pid_nr_ns(pid, task_active_pid_ns(current));
		aux->task.pid = tgid;
		put_pid(pid);
	}

	return 0;
}

static const struct seq_operations task_seq_ops = {
	.start = task_seq_start,
	.next = task_seq_next,
	.stop = task_seq_stop,
	.show = task_seq_show,
};

struct bpf_iter_seq_task_file_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * This is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	u32 tid;
	u32 fd;
};

static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
	u32 saved_tid = info->tid;
	struct task_struct *curr_task;
	unsigned int curr_fd = info->fd;
	struct file *f;

	/* If this function returns a non-NULL file object,
	 * it holds a reference to the task/file.
	 * Otherwise, it does not hold any reference.
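	 * The file reference is dropped by task_file_seq_next() or
	 * task_file_seq_stop(); the task reference is dropped below once
	 * the task has no more files, or by task_file_seq_stop().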
	 */
again:
	if (info->task) {
		curr_task = info->task;
		curr_fd = info->fd;
	} else {
		curr_task = task_seq_get_next(&info->common, &info->tid, true);
		if (!curr_task) {
			info->task = NULL;
			return NULL;
		}

		/* set info->task */
		info->task = curr_task;
		if (saved_tid == info->tid)
			curr_fd = info->fd;
		else
			curr_fd = 0;
	}

	f = fget_task_next(curr_task, &curr_fd);
	if (f) {
		/* set info->fd */
		info->fd = curr_fd;
		return f;
	}

	/* the current task is done, go to the next task */
	put_task_struct(curr_task);

	if (info->common.type == BPF_TASK_ITER_TID) {
		info->task = NULL;
		return NULL;
	}

	info->task = NULL;
	info->fd = 0;
	saved_tid = ++(info->tid);
	goto again;
}

static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct file *file;

	info->task = NULL;
	file = task_file_seq_get_next(info);
	if (file && *pos == 0)
		++*pos;

	return file;
}

static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	++*pos;
	++info->fd;
	fput((struct file *)v);
	return task_file_seq_get_next(info);
}

struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __aligned(8);
	__bpf_md_ptr(struct file *, file);
};

DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
		     struct task_struct *task, u32 fd,
		     struct file *file)

static int __task_file_seq_show(struct seq_file *seq, struct file *file,
				bool in_stop)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct bpf_iter__task_file ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.fd = info->fd;
	ctx.file = file;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_file_seq_show(struct seq_file *seq, void *v)
{
	return __task_file_seq_show(seq, v, false);
}

static void task_file_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	if (!v) {
		(void)__task_file_seq_show(seq, v, true);
	} else {
		fput((struct file *)v);
		put_task_struct(info->task);
		info->task = NULL;
	}
}

static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	common->ns = get_pid_ns(task_active_pid_ns(current));
	common->type = aux->task.type;
	common->pid = aux->task.pid;

	return 0;
}

static void fini_seq_pidns(void *priv_data)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	put_pid_ns(common->ns);
}

static const struct seq_operations task_file_seq_ops = {
	.start = task_file_seq_start,
	.next = task_file_seq_next,
	.stop = task_file_seq_stop,
	.show = task_file_seq_show,
};

struct bpf_iter_seq_task_vma_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * This is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	u32 tid;
	unsigned long prev_vm_start;
	unsigned long prev_vm_end;
};

enum bpf_task_vma_iter_find_op {
	task_vma_iter_first_vma,	/* use find_vma() with addr 0 */
	task_vma_iter_next_vma,		/* use vma_next() with curr_vma */
	task_vma_iter_find_vma,		/* use find_vma() to find next vma */
};

static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
	enum bpf_task_vma_iter_find_op op;
	struct vm_area_struct *curr_vma;
	struct task_struct *curr_task;
	struct mm_struct *curr_mm;
	u32 saved_tid = info->tid;

	/* If this function returns a non-NULL vma, it holds a reference to
	 * the task_struct, holds a refcount on mm->mm_users, and holds
	 * read lock on vma->mm->mmap_lock.
	 * If this function returns NULL, it does not hold any reference or
	 * lock.
	 */
	if (info->task) {
		curr_task = info->task;
		curr_vma = info->vma;
		curr_mm = info->mm;
		/* In case of lock contention, drop mmap_lock to unblock
		 * the writer.
		 *
		 * After relock, call find_vma(mm, prev_vm_end - 1) to find
		 * new vma to process.
		 *
		 *   +------+------+-----------+
		 *   | VMA1 | VMA2 | VMA3      |
		 *   +------+------+-----------+
		 *   |      |      |           |
		 *  4k     8k     16k         400k
		 *
		 * For example, curr_vma == VMA2. Before unlock, we set
		 *
		 *    prev_vm_start = 8k
		 *    prev_vm_end   = 16k
		 *
		 * There are a few cases:
		 *
		 * 1) VMA2 is freed, but VMA3 exists.
		 *
		 *    find_vma() will return VMA3, just process VMA3.
		 *
		 * 2) VMA2 still exists.
		 *
		 *    find_vma() will return VMA2, process VMA2->next.
		 *
		 * 3) no more vma in this mm.
		 *
		 *    Process the next task.
		 *
		 * 4) find_vma() returns a different vma, VMA2'.
		 *
		 *    4.1) If VMA2 covers the same range as VMA2', skip VMA2',
		 *         because we already covered the range;
		 *    4.2) VMA2 and VMA2' cover different ranges, process
		 *         VMA2'.
		 */
		if (mmap_lock_is_contended(curr_mm)) {
			info->prev_vm_start = curr_vma->vm_start;
			info->prev_vm_end = curr_vma->vm_end;
			op = task_vma_iter_find_vma;
			mmap_read_unlock(curr_mm);
			if (mmap_read_lock_killable(curr_mm)) {
				mmput(curr_mm);
				goto finish;
			}
		} else {
			op = task_vma_iter_next_vma;
		}
	} else {
again:
		curr_task = task_seq_get_next(&info->common, &info->tid, true);
		if (!curr_task) {
			info->tid++;
			goto finish;
		}

		if (saved_tid != info->tid) {
			/* new task, process the first vma */
			op = task_vma_iter_first_vma;
		} else {
			/* Found the same tid, which means the user space
			 * has finished the data in the previous buffer and
			 * is reading more. We dropped mmap_lock before
			 * returning to user space, so it is necessary to
			 * use find_vma() to find the next vma to process.
			 */
			op = task_vma_iter_find_vma;
		}

		curr_mm = get_task_mm(curr_task);
		if (!curr_mm)
			goto next_task;

		if (mmap_read_lock_killable(curr_mm)) {
			mmput(curr_mm);
			goto finish;
		}
	}

	switch (op) {
	case task_vma_iter_first_vma:
		curr_vma = find_vma(curr_mm, 0);
		break;
	case task_vma_iter_next_vma:
		curr_vma = find_vma(curr_mm, curr_vma->vm_end);
		break;
	case task_vma_iter_find_vma:
		/* We dropped mmap_lock so it is necessary to use find_vma()
		 * to find the next vma. This is similar to the mechanism
		 * in show_smaps_rollup().
		 */
		curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
		/* case 1) and 4.2) above just use curr_vma */

		/* check for case 2) or case 4.1) above */
		if (curr_vma &&
		    curr_vma->vm_start == info->prev_vm_start &&
		    curr_vma->vm_end == info->prev_vm_end)
			curr_vma = find_vma(curr_mm, curr_vma->vm_end);
		break;
	}
	if (!curr_vma) {
		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
		mmap_read_unlock(curr_mm);
		mmput(curr_mm);
		goto next_task;
	}
	info->task = curr_task;
	info->vma = curr_vma;
	info->mm = curr_mm;
	return curr_vma;

next_task:
	if (info->common.type == BPF_TASK_ITER_TID)
		goto finish;

	put_task_struct(curr_task);
	info->task = NULL;
	info->mm = NULL;
	info->tid++;
	goto again;

finish:
	if (curr_task)
		put_task_struct(curr_task);
	info->task = NULL;
	info->vma = NULL;
	info->mm = NULL;
	return NULL;
}

static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct vm_area_struct *vma;

	vma = task_vma_seq_get_next(info);
	if (vma && *pos == 0)
		++*pos;

	return vma;
}

static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	++*pos;
	return task_vma_seq_get_next(info);
}

struct bpf_iter__task_vma {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	__bpf_md_ptr(struct vm_area_struct *, vma);
};

DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
		     struct task_struct *task, struct vm_area_struct *vma)

static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct bpf_iter__task_vma ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.vma = info->vma;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_vma_seq_show(struct seq_file *seq, void *v)
{
	return __task_vma_seq_show(seq, false);
}

static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	if (!v) {
		(void)__task_vma_seq_show(seq, true);
	} else {
		/* info->vma has not been seen by the BPF program. If the
		 * user space reads more, task_vma_seq_get_next should
		 * return this vma again. Set prev_vm_start to ~0UL,
		 * so that we don't skip the vma returned by the next
		 * find_vma() (case task_vma_iter_find_vma in
		 * task_vma_seq_get_next()).
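		 * prev_vm_end keeps this vma's vm_end, so the next
		 * find_vma(prev_vm_end - 1) lands on this vma again (if it
		 * still exists) and the ~0UL start makes the "already
		 * covered" range check fail.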
		 */
		info->prev_vm_start = ~0UL;
		info->prev_vm_end = info->vma->vm_end;
		mmap_read_unlock(info->mm);
		mmput(info->mm);
		info->mm = NULL;
		put_task_struct(info->task);
		info->task = NULL;
	}
}

static const struct seq_operations task_vma_seq_ops = {
	.start = task_vma_seq_start,
	.next = task_vma_seq_next,
	.stop = task_vma_seq_stop,
	.show = task_vma_seq_show,
};

static const struct bpf_iter_seq_info task_seq_info = {
	.seq_ops = &task_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};

static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
{
	switch (aux->task.type) {
	case BPF_TASK_ITER_TID:
		info->iter.task.tid = aux->task.pid;
		break;
	case BPF_TASK_ITER_TGID:
		info->iter.task.pid = aux->task.pid;
		break;
	default:
		break;
	}
	return 0;
}

static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
{
	seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
	if (aux->task.type == BPF_TASK_ITER_TID)
		seq_printf(seq, "tid:\t%u\n", aux->task.pid);
	else if (aux->task.type == BPF_TASK_ITER_TGID)
		seq_printf(seq, "pid:\t%u\n", aux->task.pid);
}

static struct bpf_iter_reg task_reg_info = {
	.target = "task",
	.attach_target = bpf_iter_attach_task,
	.feature = BPF_ITER_RESCHED,
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__task, task),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.seq_info = &task_seq_info,
	.fill_link_info = bpf_iter_fill_link_info,
	.show_fdinfo = bpf_iter_task_show_fdinfo,
};

static const struct bpf_iter_seq_info task_file_seq_info = {
	.seq_ops = &task_file_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
};

static struct bpf_iter_reg task_file_reg_info = {
	.target = "task_file",
	.attach_target = bpf_iter_attach_task,
	.feature = BPF_ITER_RESCHED,
	.ctx_arg_info_size = 2,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__task_file, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_file, file),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &task_file_seq_info,
	.fill_link_info = bpf_iter_fill_link_info,
	.show_fdinfo = bpf_iter_task_show_fdinfo,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
	.seq_ops = &task_vma_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info),
};

static struct bpf_iter_reg task_vma_reg_info = {
	.target = "task_vma",
	.attach_target = bpf_iter_attach_task,
	.feature = BPF_ITER_RESCHED,
	.ctx_arg_info_size = 2,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__task_vma, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_vma, vma),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &task_vma_seq_info,
	.fill_link_info = bpf_iter_fill_link_info,
	.show_fdinfo = bpf_iter_task_show_fdinfo,
};

BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
	struct mmap_unlock_irq_work *work = NULL;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct mm_struct *mm;
	int ret = -ENOENT;

	if (flags)
		return -EINVAL;

	if (!task)
		return -ENOENT;

	mm = task->mm;
	if (!mm)
		return -ENOENT;

	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

	if (irq_work_busy || !mmap_read_trylock(mm))
		return -EBUSY;

	vma = find_vma(mm, start);

	if (vma && vma->vm_start <= start && vma->vm_end > start) {
		callback_fn((u64)(long)task, (u64)(long)vma,
			    (u64)(long)callback_ctx, 0, 0);
		ret = 0;
	}
	bpf_mmap_unlock_mm(work, mm);
	return ret;
}

const struct bpf_func_proto bpf_find_vma_proto = {
	.func = bpf_find_vma,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_BTF_ID,
	.arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_PTR_TO_FUNC,
	.arg4_type = ARG_PTR_TO_STACK_OR_NULL,
	.arg5_type = ARG_ANYTHING,
};

struct bpf_iter_task_vma_kern_data {
	struct task_struct *task;
	struct mm_struct *mm;
	struct mmap_unlock_irq_work *work;
	struct vma_iterator vmi;
};

struct bpf_iter_task_vma {
	/* opaque iterator state; having __u64 here allows preserving correct
	 * alignment requirements in vmlinux.h, generated from BTF
	 */
	__u64 __opaque[1];
} __attribute__((aligned(8)));

/* Non-opaque version of bpf_iter_task_vma */
struct bpf_iter_task_vma_kern {
	struct bpf_iter_task_vma_kern_data *data;
} __attribute__((aligned(8)));

__bpf_kfunc_start_defs();

__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
				      struct task_struct *task, u64 addr)
{
	struct bpf_iter_task_vma_kern *kit = (void *)it;
	bool irq_work_busy = false;
	int err;

	BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));

	/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
	 * before, so non-NULL kit->data doesn't point to previously
	 * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
	 */
	kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
	if (!kit->data)
		return -ENOMEM;

	kit->data->task = get_task_struct(task);
	kit->data->mm = task->mm;
	if (!kit->data->mm) {
		err = -ENOENT;
		goto err_cleanup_iter;
	}

	/* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
	irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
	if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
		err = -EBUSY;
		goto err_cleanup_iter;
	}

	vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
	return 0;

err_cleanup_iter:
	if (kit->data->task)
		put_task_struct(kit->data->task);
	bpf_mem_free(&bpf_global_ma, kit->data);
	/* NULL kit->data signals failed bpf_iter_task_vma initialization */
	kit->data = NULL;
	return err;
}

__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
{
	struct bpf_iter_task_vma_kern *kit = (void *)it;

	if (!kit->data) /* bpf_iter_task_vma_new failed */
		return NULL;
	return vma_next(&kit->data->vmi);
}

__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
{
	struct bpf_iter_task_vma_kern *kit = (void *)it;

	if (kit->data) {
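		/* Non-NULL data means bpf_iter_task_vma_new() fully
		 * succeeded: the mmap_lock is still read-held (released
		 * here, possibly deferred via irq_work) and the task
		 * reference is still live.
		 */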
		bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
		put_task_struct(kit->data->task);
		bpf_mem_free(&bpf_global_ma, kit->data);
	}
}

__bpf_kfunc_end_defs();

#ifdef CONFIG_CGROUPS

struct bpf_iter_css_task {
	__u64 __opaque[1];
} __attribute__((aligned(8)));

struct bpf_iter_css_task_kern {
	struct css_task_iter *css_it;
} __attribute__((aligned(8)));

__bpf_kfunc_start_defs();

__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
				      struct cgroup_subsys_state *css, unsigned int flags)
{
	struct bpf_iter_css_task_kern *kit = (void *)it;

	BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
		     __alignof__(struct bpf_iter_css_task));
	kit->css_it = NULL;
	switch (flags) {
	case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
	case CSS_TASK_ITER_PROCS:
	case 0:
		break;
	default:
		return -EINVAL;
	}

	kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
	if (!kit->css_it)
		return -ENOMEM;
	css_task_iter_start(css, flags, kit->css_it);
	return 0;
}

__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
{
	struct bpf_iter_css_task_kern *kit = (void *)it;

	if (!kit->css_it)
		return NULL;
	return css_task_iter_next(kit->css_it);
}

__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
{
	struct bpf_iter_css_task_kern *kit = (void *)it;

	if (!kit->css_it)
		return;
	css_task_iter_end(kit->css_it);
	bpf_mem_free(&bpf_global_ma, kit->css_it);
}

__bpf_kfunc_end_defs();

#endif /* CONFIG_CGROUPS */

struct bpf_iter_task {
	__u64 __opaque[3];
} __attribute__((aligned(8)));

struct bpf_iter_task_kern {
	struct task_struct *task;
	struct task_struct *pos;
	unsigned int flags;
} __attribute__((aligned(8)));

enum {
	/* all processes in the system */
	BPF_TASK_ITER_ALL_PROCS,
	/* all threads in the system */
	BPF_TASK_ITER_ALL_THREADS,
	/* all threads of a specific process */
	BPF_TASK_ITER_PROC_THREADS
};

__bpf_kfunc_start_defs();

__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
				  struct task_struct *task__nullable, unsigned int flags)
{
	struct bpf_iter_task_kern *kit = (void *)it;

	BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
		     __alignof__(struct bpf_iter_task));

	kit->pos = NULL;

	switch (flags) {
	case BPF_TASK_ITER_ALL_THREADS:
	case BPF_TASK_ITER_ALL_PROCS:
		break;
	case BPF_TASK_ITER_PROC_THREADS:
		if (!task__nullable)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (flags == BPF_TASK_ITER_PROC_THREADS)
		kit->task = task__nullable;
	else
		kit->task = &init_task;
	kit->pos = kit->task;
	kit->flags = flags;
	return 0;
}

__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
{
	struct bpf_iter_task_kern *kit = (void *)it;
	struct task_struct *pos;
	unsigned int flags;

	flags = kit->flags;
	pos = kit->pos;

	if (!pos)
		return pos;

	if (flags == BPF_TASK_ITER_ALL_PROCS)
		goto get_next_task;

	kit->pos = __next_thread(kit->pos);
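	/* Either the thread group has another thread, or only a single
	 * process's threads were requested and this group is exhausted;
	 * a NULL kit->pos then ends the iteration on the next call.
	 */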
	if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS)
		return pos;

get_next_task:
	kit->task = next_task(kit->task);
	if (kit->task == &init_task)
		kit->pos = NULL;
	else
		kit->pos = kit->task;

	return pos;
}

__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
{
}

__bpf_kfunc_end_defs();

DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);

static void do_mmap_read_unlock(struct irq_work *entry)
{
	struct mmap_unlock_irq_work *work;

	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
	mmap_read_unlock_non_owner(work->mm);
}

static int __init task_iter_init(void)
{
	struct mmap_unlock_irq_work *work;
	int ret, cpu;

	for_each_possible_cpu(cpu) {
		work = per_cpu_ptr(&mmap_unlock_work, cpu);
		init_irq_work(&work->irq_work, do_mmap_read_unlock);
	}

	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	ret = bpf_iter_reg_target(&task_reg_info);
	if (ret)
		return ret;

	task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
	ret = bpf_iter_reg_target(&task_file_reg_info);
	if (ret)
		return ret;

	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
	return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
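
/*
 * Illustrative sketch only (not part of this file's build): a minimal
 * BPF-side program for the "task" iterator registered above, assuming the
 * usual vmlinux.h / libbpf helper setup used by the BPF selftests.
 * dump_task is an arbitrary name chosen for the example.
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *	#include <bpf/bpf_tracing.h>
 *
 *	char LICENSE[] SEC("license") = "GPL";
 *
 *	SEC("iter/task")
 *	int dump_task(struct bpf_iter__task *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct task_struct *task = ctx->task;
 *
 *		// task is NULL for the final invocation from task_seq_stop()
 *		if (!task)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(seq, "%8d %s\n", task->pid, task->comm);
 *		return 0;
 *	}
 */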