// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */

#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
#include <linux/mm_types.h>
#include "mmap_unlock_work.h"

static const char * const iter_task_type_names[] = {
        "ALL",
        "TID",
        "PID",
};

struct bpf_iter_seq_task_common {
        struct pid_namespace *ns;
        enum bpf_iter_task_type type;
        u32 pid;
        u32 pid_visiting;
};

struct bpf_iter_seq_task_info {
        /* The first field must be struct bpf_iter_seq_task_common.
         * This is assumed by the {init, fini}_seq_pidns() callback functions.
         */
        struct bpf_iter_seq_task_common common;
        u32 tid;
};

static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
                                                   u32 *tid,
                                                   bool skip_if_dup_files)
{
        struct task_struct *task;
        struct pid *pid;
        u32 next_tid;

        if (!*tid) {
                /* The first time the iterator calls this function. */
                pid = find_pid_ns(common->pid, common->ns);
                task = get_pid_task(pid, PIDTYPE_TGID);
                if (!task)
                        return NULL;

                *tid = common->pid;
                common->pid_visiting = common->pid;

                return task;
        }

        /* If control returns to user space and comes back to the
         * kernel again, *tid and common->pid_visiting should be the
         * same for task_seq_start() to pick up the correct task.
         */
        if (*tid == common->pid_visiting) {
                pid = find_pid_ns(common->pid_visiting, common->ns);
                task = get_pid_task(pid, PIDTYPE_PID);

                return task;
        }

        task = find_task_by_pid_ns(common->pid_visiting, common->ns);
        if (!task)
                return NULL;

retry:
        task = __next_thread(task);
        if (!task)
                return NULL;

        next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
        if (!next_tid)
                goto retry;

        if (skip_if_dup_files && task->files == task->group_leader->files)
                goto retry;

        *tid = common->pid_visiting = next_tid;
        get_task_struct(task);
        return task;
}

static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
                                             u32 *tid,
                                             bool skip_if_dup_files)
{
        struct task_struct *task = NULL;
        struct pid *pid;

        if (common->type == BPF_TASK_ITER_TID) {
                if (*tid && *tid != common->pid)
                        return NULL;
                rcu_read_lock();
                pid = find_pid_ns(common->pid, common->ns);
                if (pid) {
                        task = get_pid_task(pid, PIDTYPE_TGID);
                        *tid = common->pid;
                }
                rcu_read_unlock();

                return task;
        }

        if (common->type == BPF_TASK_ITER_TGID) {
                rcu_read_lock();
                task = task_group_seq_get_next(common, tid, skip_if_dup_files);
                rcu_read_unlock();

                return task;
        }

        rcu_read_lock();
retry:
        pid = find_ge_pid(*tid, common->ns);
        if (pid) {
                *tid = pid_nr_ns(pid, common->ns);
                task = get_pid_task(pid, PIDTYPE_PID);
                if (!task) {
                        ++*tid;
                        goto retry;
                } else if (skip_if_dup_files && !thread_group_leader(task) &&
                           task->files == task->group_leader->files) {
                        put_task_struct(task);
                        task = NULL;
                        ++*tid;
                        goto retry;
                }
        }
        rcu_read_unlock();

        return task;
}
static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct bpf_iter_seq_task_info *info = seq->private;
        struct task_struct *task;

        task = task_seq_get_next(&info->common, &info->tid, false);
        if (!task)
                return NULL;

        if (*pos == 0)
                ++*pos;
        return task;
}

static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct bpf_iter_seq_task_info *info = seq->private;
        struct task_struct *task;

        ++*pos;
        ++info->tid;
        put_task_struct((struct task_struct *)v);
        task = task_seq_get_next(&info->common, &info->tid, false);
        if (!task)
                return NULL;

        return task;
}

struct bpf_iter__task {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct task_struct *, task);
};

DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)

static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
                           bool in_stop)
{
        struct bpf_iter_meta meta;
        struct bpf_iter__task ctx;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, in_stop);
        if (!prog)
                return 0;

        ctx.meta = &meta;
        ctx.task = task;
        return bpf_iter_run_prog(prog, &ctx);
}

static int task_seq_show(struct seq_file *seq, void *v)
{
        return __task_seq_show(seq, v, false);
}

static void task_seq_stop(struct seq_file *seq, void *v)
{
        if (!v)
                (void)__task_seq_show(seq, v, true);
        else
                put_task_struct((struct task_struct *)v);
}

static int bpf_iter_attach_task(struct bpf_prog *prog,
                                union bpf_iter_link_info *linfo,
                                struct bpf_iter_aux_info *aux)
{
        unsigned int flags;
        struct pid *pid;
        pid_t tgid;

        if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
                return -EINVAL;

        aux->task.type = BPF_TASK_ITER_ALL;
        if (linfo->task.tid != 0) {
                aux->task.type = BPF_TASK_ITER_TID;
                aux->task.pid = linfo->task.tid;
        }
        if (linfo->task.pid != 0) {
                aux->task.type = BPF_TASK_ITER_TGID;
                aux->task.pid = linfo->task.pid;
        }
        if (linfo->task.pid_fd != 0) {
                aux->task.type = BPF_TASK_ITER_TGID;

                pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
                if (IS_ERR(pid))
                        return PTR_ERR(pid);

                tgid = pid_nr_ns(pid, task_active_pid_ns(current));
                aux->task.pid = tgid;
                put_pid(pid);
        }

        return 0;
}

static const struct seq_operations task_seq_ops = {
        .start = task_seq_start,
        .next = task_seq_next,
        .stop = task_seq_stop,
        .show = task_seq_show,
};
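/* Usage sketch (not part of this file): a minimal "iter/task" BPF program and
 * the matching libbpf attach call. The names dump_task and skel are
 * illustrative; the context layout follows struct bpf_iter__task above, and
 * the link_info fields are the ones parsed by bpf_iter_attach_task() above.
 *
 *      SEC("iter/task")
 *      int dump_task(struct bpf_iter__task *ctx)
 *      {
 *              struct seq_file *seq = ctx->meta->seq;
 *              struct task_struct *task = ctx->task;
 *
 *              if (!task)      // final invocation from task_seq_stop()
 *                      return 0;
 *              BPF_SEQ_PRINTF(seq, "%8d %16s\n", task->tgid, task->comm);
 *              return 0;
 *      }
 *
 * Restricting the iterator to one thread, one process, or a pidfd is done
 * through union bpf_iter_link_info at attach time:
 *
 *      LIBBPF_OPTS(bpf_iter_attach_opts, opts);
 *      union bpf_iter_link_info linfo = { .task.pid = getpid() };
 *
 *      opts.link_info = &linfo;
 *      opts.link_info_len = sizeof(linfo);
 *      link = bpf_program__attach_iter(skel->progs.dump_task, &opts);
 */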
struct bpf_iter_seq_task_file_info {
        /* The first field must be struct bpf_iter_seq_task_common.
         * This is assumed by the {init, fini}_seq_pidns() callback functions.
         */
        struct bpf_iter_seq_task_common common;
        struct task_struct *task;
        u32 tid;
        u32 fd;
};

static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
        u32 saved_tid = info->tid;
        struct task_struct *curr_task;
        unsigned int curr_fd = info->fd;
        struct file *f;

        /* If this function returns a non-NULL file object,
         * it holds a reference to the task and the file.
         * Otherwise, it does not hold any reference.
         */
again:
        if (info->task) {
                curr_task = info->task;
                curr_fd = info->fd;
        } else {
                curr_task = task_seq_get_next(&info->common, &info->tid, true);
                if (!curr_task) {
                        info->task = NULL;
                        return NULL;
                }

                /* set info->task */
                info->task = curr_task;
                if (saved_tid == info->tid)
                        curr_fd = info->fd;
                else
                        curr_fd = 0;
        }

        rcu_read_lock();
        f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
        if (f) {
                /* set info->fd */
                info->fd = curr_fd;
                rcu_read_unlock();
                return f;
        }

        /* the current task is done, go to the next task */
        rcu_read_unlock();
        put_task_struct(curr_task);

        if (info->common.type == BPF_TASK_ITER_TID) {
                info->task = NULL;
                return NULL;
        }

        info->task = NULL;
        info->fd = 0;
        saved_tid = ++(info->tid);
        goto again;
}

static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct bpf_iter_seq_task_file_info *info = seq->private;
        struct file *file;

        info->task = NULL;
        file = task_file_seq_get_next(info);
        if (file && *pos == 0)
                ++*pos;

        return file;
}

static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct bpf_iter_seq_task_file_info *info = seq->private;

        ++*pos;
        ++info->fd;
        fput((struct file *)v);
        return task_file_seq_get_next(info);
}

struct bpf_iter__task_file {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct task_struct *, task);
        u32 fd __aligned(8);
        __bpf_md_ptr(struct file *, file);
};

DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
                     struct task_struct *task, u32 fd,
                     struct file *file)

static int __task_file_seq_show(struct seq_file *seq, struct file *file,
                                bool in_stop)
{
        struct bpf_iter_seq_task_file_info *info = seq->private;
        struct bpf_iter__task_file ctx;
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, in_stop);
        if (!prog)
                return 0;

        ctx.meta = &meta;
        ctx.task = info->task;
        ctx.fd = info->fd;
        ctx.file = file;
        return bpf_iter_run_prog(prog, &ctx);
}

static int task_file_seq_show(struct seq_file *seq, void *v)
{
        return __task_file_seq_show(seq, v, false);
}

static void task_file_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_iter_seq_task_file_info *info = seq->private;

        if (!v) {
                (void)__task_file_seq_show(seq, v, true);
        } else {
                fput((struct file *)v);
                put_task_struct(info->task);
                info->task = NULL;
        }
}

static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
        struct bpf_iter_seq_task_common *common = priv_data;

        common->ns = get_pid_ns(task_active_pid_ns(current));
        common->type = aux->task.type;
        common->pid = aux->task.pid;

        return 0;
}

static void fini_seq_pidns(void *priv_data)
{
        struct bpf_iter_seq_task_common *common = priv_data;

        put_pid_ns(common->ns);
}

static const struct seq_operations task_file_seq_ops = {
        .start = task_file_seq_start,
        .next = task_file_seq_next,
        .stop = task_file_seq_stop,
        .show = task_file_seq_show,
};
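/* Usage sketch (not part of this file): an "iter/task_file" program is invoked
 * once per (task, fd, file) triple, mirroring struct bpf_iter__task_file
 * above. dump_task_file is an illustrative name.
 *
 *      SEC("iter/task_file")
 *      int dump_task_file(struct bpf_iter__task_file *ctx)
 *      {
 *              struct seq_file *seq = ctx->meta->seq;
 *              struct task_struct *task = ctx->task;
 *              struct file *file = ctx->file;
 *
 *              if (!task || !file)
 *                      return 0;
 *              BPF_SEQ_PRINTF(seq, "%8d %8d %lx\n", task->tgid, ctx->fd,
 *                             (long)file->f_op);
 *              return 0;
 *      }
 */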
struct bpf_iter_seq_task_vma_info {
        /* The first field must be struct bpf_iter_seq_task_common.
         * This is assumed by the {init, fini}_seq_pidns() callback functions.
         */
        struct bpf_iter_seq_task_common common;
        struct task_struct *task;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        u32 tid;
        unsigned long prev_vm_start;
        unsigned long prev_vm_end;
};

enum bpf_task_vma_iter_find_op {
        task_vma_iter_first_vma,   /* use find_vma() with addr 0 */
        task_vma_iter_next_vma,    /* use vma_next() with curr_vma */
        task_vma_iter_find_vma,    /* use find_vma() to find next vma */
};

static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
        enum bpf_task_vma_iter_find_op op;
        struct vm_area_struct *curr_vma;
        struct task_struct *curr_task;
        struct mm_struct *curr_mm;
        u32 saved_tid = info->tid;

        /* If this function returns a non-NULL vma, it holds a reference to
         * the task_struct, holds a refcount on mm->mm_users, and holds
         * read lock on vma->mm->mmap_lock.
         * If this function returns NULL, it does not hold any reference or
         * lock.
         */
        if (info->task) {
                curr_task = info->task;
                curr_vma = info->vma;
                curr_mm = info->mm;
                /* In case of lock contention, drop mmap_lock to unblock
                 * the writer.
                 *
                 * After relock, call find_vma(mm, prev_vm_end - 1) to find
                 * the new vma to process.
                 *
                 *   +------+------+-----------+
                 *   | VMA1 | VMA2 | VMA3      |
                 *   +------+------+-----------+
                 *   |      |      |           |
                 *  4k     8k     16k         400k
                 *
                 * For example, curr_vma == VMA2. Before unlock, we set
                 *
                 *    prev_vm_start = 8k
                 *    prev_vm_end   = 16k
                 *
                 * There are a few cases:
                 *
                 * 1) VMA2 is freed, but VMA3 exists.
                 *
                 *    find_vma() will return VMA3, just process VMA3.
                 *
                 * 2) VMA2 still exists.
                 *
                 *    find_vma() will return VMA2, process VMA2->next.
                 *
                 * 3) no more vma in this mm.
                 *
                 *    Process the next task.
                 *
                 * 4) find_vma() returns a different vma, VMA2'.
                 *
                 *    4.1) If VMA2 covers the same range as VMA2', skip VMA2',
                 *         because we already covered the range;
                 *    4.2) VMA2 and VMA2' cover different ranges, process
                 *         VMA2'.
                 */
                if (mmap_lock_is_contended(curr_mm)) {
                        info->prev_vm_start = curr_vma->vm_start;
                        info->prev_vm_end = curr_vma->vm_end;
                        op = task_vma_iter_find_vma;
                        mmap_read_unlock(curr_mm);
                        if (mmap_read_lock_killable(curr_mm)) {
                                mmput(curr_mm);
                                goto finish;
                        }
                } else {
                        op = task_vma_iter_next_vma;
                }
        } else {
again:
                curr_task = task_seq_get_next(&info->common, &info->tid, true);
                if (!curr_task) {
                        info->tid++;
                        goto finish;
                }

                if (saved_tid != info->tid) {
                        /* new task, process the first vma */
                        op = task_vma_iter_first_vma;
                } else {
                        /* Found the same tid, which means user space
                         * finished the data in the previous buffer and
                         * read more. We dropped mmap_lock before returning
                         * to user space, so it is necessary to use
                         * find_vma() to find the next vma to process.
                         */
                        op = task_vma_iter_find_vma;
                }

                curr_mm = get_task_mm(curr_task);
                if (!curr_mm)
                        goto next_task;

                if (mmap_read_lock_killable(curr_mm)) {
                        mmput(curr_mm);
                        goto finish;
                }
        }
        switch (op) {
        case task_vma_iter_first_vma:
                curr_vma = find_vma(curr_mm, 0);
                break;
        case task_vma_iter_next_vma:
                curr_vma = find_vma(curr_mm, curr_vma->vm_end);
                break;
        case task_vma_iter_find_vma:
                /* We dropped mmap_lock so it is necessary to use find_vma
                 * to find the next vma. This is similar to the mechanism
                 * in show_smaps_rollup().
                 */
                curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
                /* case 1) and 4.2) above just use curr_vma */

                /* check for case 2) or case 4.1) above */
                if (curr_vma &&
                    curr_vma->vm_start == info->prev_vm_start &&
                    curr_vma->vm_end == info->prev_vm_end)
                        curr_vma = find_vma(curr_mm, curr_vma->vm_end);
                break;
        }
        if (!curr_vma) {
                /* case 3) above, or case 2) 4.1) with vma->next == NULL */
                mmap_read_unlock(curr_mm);
                mmput(curr_mm);
                goto next_task;
        }
        info->task = curr_task;
        info->vma = curr_vma;
        info->mm = curr_mm;
        return curr_vma;

next_task:
        if (info->common.type == BPF_TASK_ITER_TID)
                goto finish;

        put_task_struct(curr_task);
        info->task = NULL;
        info->mm = NULL;
        info->tid++;
        goto again;

finish:
        if (curr_task)
                put_task_struct(curr_task);
        info->task = NULL;
        info->vma = NULL;
        info->mm = NULL;
        return NULL;
}

static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct bpf_iter_seq_task_vma_info *info = seq->private;
        struct vm_area_struct *vma;

        vma = task_vma_seq_get_next(info);
        if (vma && *pos == 0)
                ++*pos;

        return vma;
}

static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct bpf_iter_seq_task_vma_info *info = seq->private;

        ++*pos;
        return task_vma_seq_get_next(info);
}

struct bpf_iter__task_vma {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct task_struct *, task);
        __bpf_md_ptr(struct vm_area_struct *, vma);
};

DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
                     struct task_struct *task, struct vm_area_struct *vma)

static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
        struct bpf_iter_seq_task_vma_info *info = seq->private;
        struct bpf_iter__task_vma ctx;
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, in_stop);
        if (!prog)
                return 0;

        ctx.meta = &meta;
        ctx.task = info->task;
        ctx.vma = info->vma;
        return bpf_iter_run_prog(prog, &ctx);
}

static int task_vma_seq_show(struct seq_file *seq, void *v)
{
        return __task_vma_seq_show(seq, false);
}

static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_iter_seq_task_vma_info *info = seq->private;

        if (!v) {
                (void)__task_vma_seq_show(seq, true);
        } else {
                /* info->vma has not been seen by the BPF program. If user
                 * space reads more, task_vma_seq_get_next should return
                 * this vma again. Set prev_vm_start to ~0UL, so that we
                 * don't skip the vma returned by the next find_vma()
                 * (case task_vma_iter_find_vma in task_vma_seq_get_next()).
                 */
                info->prev_vm_start = ~0UL;
                info->prev_vm_end = info->vma->vm_end;
                mmap_read_unlock(info->mm);
                mmput(info->mm);
                info->mm = NULL;
                put_task_struct(info->task);
                info->task = NULL;
        }
}

static const struct seq_operations task_vma_seq_ops = {
        .start = task_vma_seq_start,
        .next = task_vma_seq_next,
        .stop = task_vma_seq_stop,
        .show = task_vma_seq_show,
};
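/* Usage sketch (not part of this file): an "iter/task_vma" program runs once
 * per vma while task_vma_seq_get_next() holds the mmap_lock read lock, so the
 * vma pointer is only valid for the duration of the call. dump_vma is an
 * illustrative name.
 *
 *      SEC("iter/task_vma")
 *      int dump_vma(struct bpf_iter__task_vma *ctx)
 *      {
 *              struct seq_file *seq = ctx->meta->seq;
 *              struct task_struct *task = ctx->task;
 *              struct vm_area_struct *vma = ctx->vma;
 *
 *              if (!task || !vma)
 *                      return 0;
 *              BPF_SEQ_PRINTF(seq, "%8d %lx-%lx\n", task->tgid,
 *                             vma->vm_start, vma->vm_end);
 *              return 0;
 *      }
 */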
static const struct bpf_iter_seq_info task_seq_info = {
        .seq_ops = &task_seq_ops,
        .init_seq_private = init_seq_pidns,
        .fini_seq_private = fini_seq_pidns,
        .seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};

static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
{
        switch (aux->task.type) {
        case BPF_TASK_ITER_TID:
                info->iter.task.tid = aux->task.pid;
                break;
        case BPF_TASK_ITER_TGID:
                info->iter.task.pid = aux->task.pid;
                break;
        default:
                break;
        }
        return 0;
}

static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
{
        seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
        if (aux->task.type == BPF_TASK_ITER_TID)
                seq_printf(seq, "tid:\t%u\n", aux->task.pid);
        else if (aux->task.type == BPF_TASK_ITER_TGID)
                seq_printf(seq, "pid:\t%u\n", aux->task.pid);
}

static struct bpf_iter_reg task_reg_info = {
        .target = "task",
        .attach_target = bpf_iter_attach_task,
        .feature = BPF_ITER_RESCHED,
        .ctx_arg_info_size = 1,
        .ctx_arg_info = {
                { offsetof(struct bpf_iter__task, task),
                  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
        },
        .seq_info = &task_seq_info,
        .fill_link_info = bpf_iter_fill_link_info,
        .show_fdinfo = bpf_iter_task_show_fdinfo,
};

static const struct bpf_iter_seq_info task_file_seq_info = {
        .seq_ops = &task_file_seq_ops,
        .init_seq_private = init_seq_pidns,
        .fini_seq_private = fini_seq_pidns,
        .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
};

static struct bpf_iter_reg task_file_reg_info = {
        .target = "task_file",
        .attach_target = bpf_iter_attach_task,
        .feature = BPF_ITER_RESCHED,
        .ctx_arg_info_size = 2,
        .ctx_arg_info = {
                { offsetof(struct bpf_iter__task_file, task),
                  PTR_TO_BTF_ID_OR_NULL },
                { offsetof(struct bpf_iter__task_file, file),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info = &task_file_seq_info,
        .fill_link_info = bpf_iter_fill_link_info,
        .show_fdinfo = bpf_iter_task_show_fdinfo,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
        .seq_ops = &task_vma_seq_ops,
        .init_seq_private = init_seq_pidns,
        .fini_seq_private = fini_seq_pidns,
        .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info),
};

static struct bpf_iter_reg task_vma_reg_info = {
        .target = "task_vma",
        .attach_target = bpf_iter_attach_task,
        .feature = BPF_ITER_RESCHED,
        .ctx_arg_info_size = 2,
        .ctx_arg_info = {
                { offsetof(struct bpf_iter__task_vma, task),
                  PTR_TO_BTF_ID_OR_NULL },
                { offsetof(struct bpf_iter__task_vma, vma),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info = &task_vma_seq_info,
        .fill_link_info = bpf_iter_fill_link_info,
        .show_fdinfo = bpf_iter_task_show_fdinfo,
};

BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
           bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
        struct mmap_unlock_irq_work *work = NULL;
        struct vm_area_struct *vma;
        bool irq_work_busy = false;
        struct mm_struct *mm;
        int ret = -ENOENT;

        if (flags)
                return -EINVAL;

        if (!task)
                return -ENOENT;

        mm = task->mm;
        if (!mm)
                return -ENOENT;

        irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

        if (irq_work_busy || !mmap_read_trylock(mm))
                return -EBUSY;

        vma = find_vma(mm, start);

        if (vma && vma->vm_start <= start && vma->vm_end > start) {
                callback_fn((u64)(long)task, (u64)(long)vma,
                            (u64)(long)callback_ctx, 0, 0);
                ret = 0;
        }
        bpf_mmap_unlock_mm(work, mm);
        return ret;
}

const struct bpf_func_proto bpf_find_vma_proto = {
        .func = bpf_find_vma,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_FUNC,
        .arg4_type = ARG_PTR_TO_STACK_OR_NULL,
        .arg5_type = ARG_ANYTHING,
};
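/* Usage sketch (not part of this file): from a tracing program, bpf_find_vma()
 * invokes the callback on the vma covering the given address while the helper
 * above holds mmap_read_trylock(). check_vma, cb_data and the attach point are
 * illustrative; the callback arguments match the callback_fn() invocation in
 * BPF_CALL_5(bpf_find_vma, ...) above.
 *
 *      struct cb_data {
 *              unsigned long vm_start;
 *      };
 *
 *      static long check_vma(struct task_struct *task,
 *                            struct vm_area_struct *vma, struct cb_data *data)
 *      {
 *              data->vm_start = vma->vm_start;
 *              return 0;
 *      }
 *
 *      SEC("fentry/__x64_sys_nanosleep")       // illustrative attach point
 *      int BPF_PROG(probe)
 *      {
 *              struct task_struct *task = bpf_get_current_task_btf();
 *              struct cb_data data = {};
 *
 *              bpf_find_vma(task, 0x7f0000000000, check_vma, &data, 0);
 *              return 0;
 *      }
 */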
struct bpf_iter_task_vma_kern_data {
        struct task_struct *task;
        struct mm_struct *mm;
        struct mmap_unlock_irq_work *work;
        struct vma_iterator vmi;
};

struct bpf_iter_task_vma {
        /* opaque iterator state; having __u64 here allows us to preserve
         * correct alignment requirements in vmlinux.h, generated from BTF
         */
        __u64 __opaque[1];
} __attribute__((aligned(8)));

/* Non-opaque version of bpf_iter_task_vma */
struct bpf_iter_task_vma_kern {
        struct bpf_iter_task_vma_kern_data *data;
} __attribute__((aligned(8)));

__bpf_kfunc_start_defs();

__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
                                      struct task_struct *task, u64 addr)
{
        struct bpf_iter_task_vma_kern *kit = (void *)it;
        bool irq_work_busy = false;
        int err;

        BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
        BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));

        /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
         * before, so non-NULL kit->data doesn't point to previously
         * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
         */
        kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
        if (!kit->data)
                return -ENOMEM;

        kit->data->task = get_task_struct(task);
        kit->data->mm = task->mm;
        if (!kit->data->mm) {
                err = -ENOENT;
                goto err_cleanup_iter;
        }

        /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
        irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
        if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
                err = -EBUSY;
                goto err_cleanup_iter;
        }

        vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
        return 0;

err_cleanup_iter:
        if (kit->data->task)
                put_task_struct(kit->data->task);
        bpf_mem_free(&bpf_global_ma, kit->data);
        /* NULL kit->data signals failed bpf_iter_task_vma initialization */
        kit->data = NULL;
        return err;
}

__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
{
        struct bpf_iter_task_vma_kern *kit = (void *)it;

        if (!kit->data) /* bpf_iter_task_vma_new failed */
                return NULL;
        return vma_next(&kit->data->vmi);
}

__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
{
        struct bpf_iter_task_vma_kern *kit = (void *)it;

        if (kit->data) {
                bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
                put_task_struct(kit->data->task);
                bpf_mem_free(&bpf_global_ma, kit->data);
        }
}

__bpf_kfunc_end_defs();
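/* Usage sketch (not part of this file): the open-coded vma iterator above is
 * typically driven with the bpf_for_each() convenience macro from
 * bpf_helpers.h, which expands to bpf_iter_task_vma_new()/_next()/_destroy().
 * The mmap_lock read lock is held for the whole loop, so the body should stay
 * short.
 *
 *      struct task_struct *task = bpf_get_current_task_btf();
 *      struct vm_area_struct *vma;
 *      unsigned long nr_vmas = 0;
 *
 *      bpf_for_each(task_vma, vma, task, 0) {
 *              nr_vmas++;      // vma is only valid inside the loop body
 *      }
 */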
#ifdef CONFIG_CGROUPS

struct bpf_iter_css_task {
        __u64 __opaque[1];
} __attribute__((aligned(8)));

struct bpf_iter_css_task_kern {
        struct css_task_iter *css_it;
} __attribute__((aligned(8)));

__bpf_kfunc_start_defs();

__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
                struct cgroup_subsys_state *css, unsigned int flags)
{
        struct bpf_iter_css_task_kern *kit = (void *)it;

        BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
        BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
                     __alignof__(struct bpf_iter_css_task));
        kit->css_it = NULL;
        switch (flags) {
        case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
        case CSS_TASK_ITER_PROCS:
        case 0:
                break;
        default:
                return -EINVAL;
        }

        kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
        if (!kit->css_it)
                return -ENOMEM;
        css_task_iter_start(css, flags, kit->css_it);
        return 0;
}

__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
{
        struct bpf_iter_css_task_kern *kit = (void *)it;

        if (!kit->css_it)
                return NULL;
        return css_task_iter_next(kit->css_it);
}

__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
{
        struct bpf_iter_css_task_kern *kit = (void *)it;

        if (!kit->css_it)
                return;
        css_task_iter_end(kit->css_it);
        bpf_mem_free(&bpf_global_ma, kit->css_it);
}

__bpf_kfunc_end_defs();

#endif /* CONFIG_CGROUPS */
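/* Usage sketch (not part of this file): iterate the tasks attached to a
 * cgroup_subsys_state with the css_task iterator above; flags take the
 * CSS_TASK_ITER_* values accepted by bpf_iter_css_task_new(). The verifier
 * restricts which program types may use this iterator.
 *
 *      struct task_struct *task;
 *
 *      bpf_for_each(css_task, task, css, CSS_TASK_ITER_PROCS) {
 *              // one thread-group leader per iteration
 *      }
 */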
struct bpf_iter_task {
        __u64 __opaque[3];
} __attribute__((aligned(8)));

struct bpf_iter_task_kern {
        struct task_struct *task;
        struct task_struct *pos;
        unsigned int flags;
} __attribute__((aligned(8)));

enum {
        /* all processes in the system */
        BPF_TASK_ITER_ALL_PROCS,
        /* all threads in the system */
        BPF_TASK_ITER_ALL_THREADS,
        /* all threads of a specific process */
        BPF_TASK_ITER_PROC_THREADS
};

__bpf_kfunc_start_defs();

__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
                struct task_struct *task__nullable, unsigned int flags)
{
        struct bpf_iter_task_kern *kit = (void *)it;

        BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
        BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
                     __alignof__(struct bpf_iter_task));

        kit->pos = NULL;

        switch (flags) {
        case BPF_TASK_ITER_ALL_THREADS:
        case BPF_TASK_ITER_ALL_PROCS:
                break;
        case BPF_TASK_ITER_PROC_THREADS:
                if (!task__nullable)
                        return -EINVAL;
                break;
        default:
                return -EINVAL;
        }

        if (flags == BPF_TASK_ITER_PROC_THREADS)
                kit->task = task__nullable;
        else
                kit->task = &init_task;
        kit->pos = kit->task;
        kit->flags = flags;
        return 0;
}

__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
{
        struct bpf_iter_task_kern *kit = (void *)it;
        struct task_struct *pos;
        unsigned int flags;

        flags = kit->flags;
        pos = kit->pos;

        if (!pos)
                return pos;

        if (flags == BPF_TASK_ITER_ALL_PROCS)
                goto get_next_task;

        kit->pos = __next_thread(kit->pos);
        if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS)
                return pos;

get_next_task:
        kit->task = next_task(kit->task);
        if (kit->task == &init_task)
                kit->pos = NULL;
        else
                kit->pos = kit->task;

        return pos;
}

__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
{
}

__bpf_kfunc_end_defs();

DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);

static void do_mmap_read_unlock(struct irq_work *entry)
{
        struct mmap_unlock_irq_work *work;

        if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
                return;

        work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
        mmap_read_unlock_non_owner(work->mm);
}

static int __init task_iter_init(void)
{
        struct mmap_unlock_irq_work *work;
        int ret, cpu;

        for_each_possible_cpu(cpu) {
                work = per_cpu_ptr(&mmap_unlock_work, cpu);
                init_irq_work(&work->irq_work, do_mmap_read_unlock);
        }

        task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
        ret = bpf_iter_reg_target(&task_reg_info);
        if (ret)
                return ret;

        task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
        task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
        ret = bpf_iter_reg_target(&task_file_reg_info);
        if (ret)
                return ret;

        task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
        task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
        return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
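/* Usage sketch (not part of this file): the open-coded task iterator defined
 * above (bpf_iter_task_new/_next/_destroy) walks all processes, all threads,
 * or the threads of one process, typically via bpf_for_each(). The flag
 * values come from the anonymous enum above, visible to BPF programs through
 * vmlinux.h; sleepable programs must wrap the loop in bpf_rcu_read_lock() /
 * bpf_rcu_read_unlock().
 *
 *      struct task_struct *pos;
 *
 *      bpf_for_each(task, pos, NULL, BPF_TASK_ITER_ALL_PROCS) {
 *              // one group leader per iteration
 *      }
 */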