/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* -EAGAIN */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on right CPU with IRQs disabled, we can test
		 * if we hit the right task without races.
		 */

		tfc->ret = -ESRCH; /* No such (running) process */
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly
 *
 * returns: @func return value, or
 *	    -ESRCH  - when the process isn't running
 *	    -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	do {
		ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
		if (!ret)
			ret = data.ret;
	} while (ret == -EAGAIN);

	return ret;
}

/**
 * cpu_function_call - call a function on the cpu
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

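/*
 * Illustrative sketch (not part of the original file): the typical calling
 * pattern for the helpers above. A hypothetical callback "do_remote_work"
 * runs with IRQs disabled on the target CPU; task_function_call() keeps
 * retrying internally while the task migrates (-EAGAIN), so callers only
 * see -ESRCH once the task is no longer running:
 *
 *	static int do_remote_work(void *info)
 *	{
 *		// runs on the target CPU, IRQs disabled
 *		return 0;
 *	}
 *
 *	err = task_function_call(p, do_remote_work, &arg);	// task context
 *	err = cpu_function_call(cpu, do_remote_work, &arg);	// CPU context
 */
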
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}

#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

struct event_function_struct {
	struct perf_event *event;
	event_f func;
	void *data;
};

static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	int ret = 0;

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, task_ctx);
	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
	if (ctx->task) {
		if (ctx->task != current) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when active (or
		 * rather, we'll have bailed in task_function_call() or the
		 * above ctx->task != current test), therefore we must have
		 * ctx->is_active here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	efs->func(event, cpuctx, ctx, efs->data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

static void event_function_call(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
	struct event_function_struct efs = {
		.event = event,
		.func = func,
		.data = data,
	};

	if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
		lockdep_assert_held(&ctx->mutex);
	}

	if (!task) {
		cpu_function_call(event->cpu, event_function, &efs);
		return;
	}

	if (task == TASK_TOMBSTONE)
		return;

again:
	if (!task_function_call(task, event_function, &efs))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * Reload the task pointer, it might have been changed by
	 * a concurrent perf_event_context_sched_out().
	 */
	task = ctx->task;
	if (task == TASK_TOMBSTONE) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	if (ctx->is_active) {
		raw_spin_unlock_irq(&ctx->lock);
		goto again;
	}
	func(event, NULL, ctx, data);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct task_struct *task = READ_ONCE(ctx->task);
	struct perf_event_context *task_ctx = NULL;

	lockdep_assert_irqs_disabled();

	if (task) {
		if (task == TASK_TOMBSTONE)
			return;

		task_ctx = ctx;
	}

	perf_ctx_lock(cpuctx, task_ctx);

	task = ctx->task;
	if (task == TASK_TOMBSTONE)
		goto unlock;

	if (task) {
		/*
		 * We must be either inactive or active and the right task,
		 * otherwise we're screwed, since we cannot IPI to somewhere
		 * else.
		 */
		if (ctx->is_active) {
			if (WARN_ON_ONCE(task != current))
				goto unlock;

			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
				goto unlock;
		}
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	func(event, cpuctx, ctx, data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);
}

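/*
 * Illustrative sketch (not part of the original file): the usual way these
 * helpers are driven. A modifier such as __perf_event_disable() further down
 * is handed to event_function_call(), which takes the IPI path for an active
 * context and otherwise mutates the inactive context under ctx->lock:
 *
 *	event_function_call(event, __perf_event_disable, NULL);
 *
 * Code that already runs on the event's CPU with IRQs disabled (e.g.
 * perf_event_disable_local()) uses event_function_local() instead.
 */
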
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_TIME = 0x4,
	/* see ctx_resched() for details */
	EVENT_CPU = 0x8,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */

static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	tmp = div_u64(tmp, 100);
	if (!tmp)
		tmp = 1;

	WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

static int perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
			     void __user *buffer, size_t *lenp,
			     loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	/*
	 * If throttling is disabled, don't allow the write:
	 */
	if (sysctl_perf_cpu_time_max_percent == 100 ||
	    sysctl_perf_cpu_time_max_percent == 0)
		return -EINVAL;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
	update_perf_cpu_limits();

	return 0;
}

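/*
 * Illustrative arithmetic (not part of the original file), assuming HZ=1000
 * and the defaults above: with kernel.perf_event_max_sample_rate = 100000,
 *
 *	max_samples_per_tick   = DIV_ROUND_UP(100000, 1000) = 100
 *	perf_sample_period_ns  = 1e9 / 100000               = 10000 ns
 *	perf_sample_allowed_ns = 10000 * 25 / 100            = 2500 ns
 *
 * i.e. with the 25% CPU budget each sample is expected to stay below ~2.5us
 * on average before the auto-throttling below kicks in.
 */
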
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
				      void __user *buffer, size_t *lenp,
				      loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	if (sysctl_perf_cpu_time_max_percent == 100 ||
	    sysctl_perf_cpu_time_max_percent == 0) {
		printk(KERN_WARNING
		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
		WRITE_ONCE(perf_sample_allowed_ns, 0);
	} else {
		update_perf_cpu_limits();
	}

	return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

static u64 __report_avg;
static u64 __report_allowed;

static void perf_duration_warn(struct irq_work *w)
{
	printk_ratelimited(KERN_INFO
		"perf: interrupt took too long (%lld > %lld), lowering "
		"kernel.perf_event_max_sample_rate to %d\n",
		__report_avg, __report_allowed,
		sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

void perf_sample_event_took(u64 sample_len_ns)
{
	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
	u64 running_len;
	u64 avg_len;
	u32 max;

	if (max_len == 0)
		return;

	/* Decay the counter by 1 average sample. */
	running_len = __this_cpu_read(running_sample_length);
	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
	running_len += sample_len_ns;
	__this_cpu_write(running_sample_length, running_len);

	/*
	 * Note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
	 * from having to maintain a count.
	 */
	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
	if (avg_len <= max_len)
		return;

	__report_avg = avg_len;
	__report_allowed = max_len;

	/*
	 * Compute a throttle threshold 25% below the current duration.
	 */
	avg_len += avg_len / 4;
	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
	if (avg_len < max)
		max /= (u32)avg_len;
	else
		max = 1;

	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
	WRITE_ONCE(max_samples_per_tick, max);

	sysctl_perf_event_sample_rate = max * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     __report_avg, __report_allowed,
			     sysctl_perf_event_sample_rate);
	}
}

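/*
 * Illustrative arithmetic (not part of the original file), again assuming
 * HZ=1000 (TICK_NSEC = 1000000) and a 25% CPU budget: if the decaying
 * average sample length climbs to avg_len = 5000 ns (above the allowed
 * 2500 ns), the code above raises the threshold to 5000 + 5000/4 = 6250 ns
 * and recomputes
 *
 *	max = (1000000 / 100) * 25 / 6250 = 40 samples per tick
 *
 * so kernel.perf_event_max_sample_rate drops from 100000 to 40 * HZ = 40000.
 */
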
static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void)	{ }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly requested
 * (read).
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */

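/*
 * Illustrative example (not part of the original file) of the accounting
 * rules above: an event created at t=0 that sits INACTIVE until t=10, runs
 * ACTIVE from t=10 to t=30 and is then disabled ends up with
 *
 *	total_time_enabled = 30	(all deltas while state >= INACTIVE)
 *	total_time_running = 20	(only the deltas while state == ACTIVE)
 *
 * tstamp is rolled forward at every state change, so no delta is ever
 * counted twice.
 */
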
static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;

	if (leader->state <= PERF_EVENT_STATE_OFF)
		return leader->state;

	return event->state;
}

static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
	enum perf_event_state state = __perf_effective_state(event);
	u64 delta = now - event->tstamp;

	*enabled = event->total_time_enabled;
	if (state >= PERF_EVENT_STATE_INACTIVE)
		*enabled += delta;

	*running = event->total_time_running;
	if (state >= PERF_EVENT_STATE_ACTIVE)
		*running += delta;
}

static void perf_event_update_time(struct perf_event *event)
{
	u64 now = perf_event_time(event);

	__perf_update_times(event, now, &event->total_time_enabled,
					&event->total_time_running);
	event->tstamp = now;
}

static void perf_event_update_sibling_time(struct perf_event *leader)
{
	struct perf_event *sibling;

	list_for_each_entry(sibling, &leader->sibling_list, group_entry)
		perf_event_update_time(sibling);
}

static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
	if (event->state == state)
		return;

	perf_event_update_time(event);
	/*
	 * If a group leader gets enabled/disabled all its siblings
	 * are affected too.
	 */
	if ((event->state < 0) ^ (state < 0))
		perf_event_update_sibling_time(event);

	WRITE_ONCE(event->state, state);
}

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/* @event doesn't care about cgroup */
	if (!event->cgrp)
		return true;

	/* wants specific cgroup scope but @cpuctx isn't associated with any */
	if (!cpuctx->cgrp)
		return false;

	/*
	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups.  If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
	 */
	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
				    event->cgrp->css.cgroup);
}

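/*
 * Illustrative example (not part of the original file): an event opened for
 * cgroup /docker matches while any task in /docker or /docker/ctr1 is
 * current on this CPU (cpuctx->cgrp is then /docker or /docker/ctr1, both
 * descendants of /docker), but not while a task in the sibling /system
 * cgroup runs. The cgroup paths are made up for the example.
 */
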
static inline void perf_detach_cgroup(struct perf_event *event)
{
	css_put(&event->cgrp->css);
	event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
	return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	struct perf_cgroup_info *t;

	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	return t->time;
}

static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
	struct perf_cgroup_info *info;
	u64 now;

	now = perf_clock();

	info = this_cpu_ptr(cgrp->info);

	info->time += now - info->timestamp;
	info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
	struct perf_cgroup *cgrp = cpuctx->cgrp;
	struct cgroup_subsys_state *css;

	if (cgrp) {
		for (css = &cgrp->css; css; css = css->parent) {
			cgrp = container_of(css, struct perf_cgroup, css);
			__update_cgrp_time(cgrp);
		}
	}
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
	struct perf_cgroup *cgrp;

	/*
	 * ensure we access cgroup data only when needed and
	 * when we know the cgroup is pinned (css_get)
	 */
	if (!is_cgroup_event(event))
		return;

	cgrp = perf_cgroup_from_task(current, event->ctx);
	/*
	 * Do not update time when cgroup is not active
	 */
	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
		__update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
	struct perf_cgroup *cgrp;
	struct perf_cgroup_info *info;
	struct cgroup_subsys_state *css;

	/*
	 * ctx->lock held by caller
	 * ensure we do not access cgroup data
	 * unless we have the cgroup pinned (css_get)
	 */
	if (!task || !ctx->nr_cgroups)
		return;

	cgrp = perf_cgroup_from_task(task, ctx);

	for (css = &cgrp->css; css; css = css->parent) {
		cgrp = container_of(css, struct perf_cgroup, css);
		info = this_cpu_ptr(cgrp->info);
		info->timestamp = ctx->timestamp;
	}
}

static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 */
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
	struct perf_cpu_context *cpuctx;
	struct list_head *list;
	unsigned long flags;

	/*
	 * Disable interrupts and preemption to avoid this CPU's
	 * cgrp_cpuctx_entry to change under us.
	 */
	local_irq_save(flags);

	list = this_cpu_ptr(&cgrp_cpuctx_list);
	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
		perf_pmu_disable(cpuctx->ctx.pmu);

		if (mode & PERF_CGROUP_SWOUT) {
			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
			/*
			 * must not be done before ctxswout due
			 * to event_filter_match() in event_sched_out()
			 */
			cpuctx->cgrp = NULL;
		}

		if (mode & PERF_CGROUP_SWIN) {
			WARN_ON_ONCE(cpuctx->cgrp);
			/*
			 * set cgrp before ctxsw in to allow
			 * event_filter_match() to not have to pass
			 * task around
			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
			 * because cgroup events are only per-cpu
			 */
			cpuctx->cgrp = perf_cgroup_from_task(task,
							     &cpuctx->ctx);
			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
		}
		perf_pmu_enable(cpuctx->ctx.pmu);
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}

	local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(next, NULL);

	/*
	 * only schedule out current cgroup events if we know
	 * that we are switching to a different cgroup. Otherwise,
	 * do not touch the cgroup events.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);

	rcu_read_unlock();
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(prev, NULL);

	/*
	 * only need to schedule in cgroup events if we are changing
	 * the cgroup during the context switch. Cgroup events were not
	 * scheduled out if that was not the case.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWIN);

	rcu_read_unlock();
}

static inline int perf_cgroup_connect(int fd, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	struct perf_cgroup *cgrp;
	struct cgroup_subsys_state *css;
	struct fd f = fdget(fd);
	int ret = 0;

	if (!f.file)
		return -EBADF;

	css = css_tryget_online_from_dir(f.file->f_path.dentry,
					 &perf_event_cgrp_subsys);
	if (IS_ERR(css)) {
		ret = PTR_ERR(css);
		goto out;
	}

	cgrp = container_of(css, struct perf_cgroup, css);
	event->cgrp = cgrp;

	/*
	 * all events in a group must monitor
	 * the same cgroup because a task belongs
	 * to only one perf cgroup at a time
	 */
	if (group_leader && group_leader->cgrp != cgrp) {
		perf_detach_cgroup(event);
		ret = -EINVAL;
	}
out:
	fdput(f);
	return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
	struct perf_cgroup_info *t;
	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	event->shadow_ctx_time = now - t->timestamp;
}

/*
 * Update cpuctx->cgrp so that it is set when the first cgroup event is added
 * and cleared when the last cgroup event is removed.
 */
static inline void
list_update_cgroup_event(struct perf_event *event,
			 struct perf_event_context *ctx, bool add)
{
	struct perf_cpu_context *cpuctx;
	struct list_head *cpuctx_entry;

	if (!is_cgroup_event(event))
		return;

	if (add && ctx->nr_cgroups++)
		return;
	else if (!add && --ctx->nr_cgroups)
		return;
	/*
	 * Because cgroup events are always per-cpu events,
	 * this will always be called from the right CPU.
	 */
	cpuctx = __get_cpu_context(ctx);
	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
	/* cpuctx->cgrp is NULL unless a cgroup event is active on this CPU. */
	if (add) {
		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);

		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
		if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
			cpuctx->cgrp = cgrp;
	} else {
		list_del(cpuctx_entry);
		cpuctx->cgrp = NULL;
	}
}

#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
	return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
}

void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	return 0;
}

static inline void
list_update_cgroup_event(struct perf_event *event,
			 struct perf_event_context *ctx, bool add)
{
}

#endif

/*
 * set default to be dependent on timer tick just
 * like original code
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
	struct perf_cpu_context *cpuctx;
	int rotations = 0;

	lockdep_assert_irqs_disabled();

	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
	rotations = perf_rotate_context(cpuctx);

	raw_spin_lock(&cpuctx->hrtimer_lock);
	if (rotations)
		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
	else
		cpuctx->hrtimer_active = 0;
	raw_spin_unlock(&cpuctx->hrtimer_lock);

	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

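/*
 * Illustrative note (not part of the original file): PERF_CPU_HRTIMER is in
 * milliseconds, so the default multiplexing interval is one scheduler tick,
 * e.g. 1000/HZ = 1 ms with HZ=1000 or 4 ms with HZ=250. A PMU may carry its
 * own value in pmu->hrtimer_interval_ms; the init code below only falls back
 * to the default when that field is still 0.
 */
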
static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	u64 interval;

	/* no multiplexing needed for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return;

	/*
	 * check default is sane, if not set then force to
	 * default interval (1/tick)
	 */
	interval = pmu->hrtimer_interval_ms;
	if (interval < 1)
		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

	raw_spin_lock_init(&cpuctx->hrtimer_lock);
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	timer->function = perf_mux_hrtimer_handler;
}

static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	unsigned long flags;

	/* not for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return 0;

	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
	if (!cpuctx->hrtimer_active) {
		cpuctx->hrtimer_active = 1;
		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
	}
	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);

	return 0;
}

void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick is called from IRQ context.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
	struct list_head *head = this_cpu_ptr(&active_ctx_list);

	lockdep_assert_irqs_disabled();

	WARN_ON(!list_empty(&ctx->active_ctx_list));

	list_add(&ctx->active_ctx_list, head);
}

static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
	lockdep_assert_irqs_disabled();

	WARN_ON(list_empty(&ctx->active_ctx_list));

	list_del_init(&ctx->active_ctx_list);
}

static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx->task_ctx_data);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task && ctx->task != TASK_TOMBSTONE)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There are two other sites where
 * perf_event_context::mutex nests and those are:
 *
 *  - perf_event_exit_task_context()	[ child , 0 ]
 *      perf_event_exit_event()
 *        put_event()			[ parent, 1 ]
 *
 *  - perf_event_init_context()		[ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_init_event()
 *                perf_try_init_event()	[ child , 1 ]
 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
 *
 * But remember that these are parent<->child context relations, and
 * migration does not affect children, therefore these two orderings should not
 * interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to affect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location. This
 * means that only external vectors (perf_fops, prctl) can perturb the event
 * while in transit. Therefore all such accessors should also acquire
 * perf_event_context::mutex to serialize against this.
 *
 * However; because event->ctx can change while we're waiting to acquire
 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 * function.
 *
 * Lock order:
 *    cred_guard_mutex
 *	task_struct::perf_event_mutex
 *	  perf_event_context::mutex
 *	    perf_event::child_mutex;
 *	      perf_event_context::lock
 *	    perf_event::mmap_mutex
 *	    mmap_sem
 *
 *    cpu_hotplug_lock
 *      pmus_lock
 *	  cpuctx->mutex / perf_event_context::mutex
 */

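/*
 * Illustrative usage sketch (not part of the original file) of the helpers
 * defined below, mirroring what e.g. perf_event_disable() further down does:
 *
 *	ctx = perf_event_ctx_lock(event);	// pins and re-checks event->ctx
 *	_perf_event_disable(event);		// safe: ctx->mutex is held
 *	perf_event_ctx_unlock(event, ctx);	// drops the mutex and reference
 *
 * The retry loop inside perf_event_ctx_lock_nested() is what makes this safe
 * against a concurrent ctx migration as described above.
 */
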
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
	struct perf_event_context *ctx;

again:
	rcu_read_lock();
	ctx = READ_ONCE(event->ctx);
	if (!atomic_inc_not_zero(&ctx->refcount)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	mutex_lock_nested(&ctx->mutex, nesting);
	if (event->ctx != ctx) {
		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);
		goto again;
	}

	return ctx;
}

static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
	return perf_event_ctx_lock_nested(event, 0);
}

static void perf_event_ctx_unlock(struct perf_event *event,
				  struct perf_event_context *ctx)
{
	mutex_unlock(&ctx->mutex);
	put_ctx(ctx);
}

/*
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
	struct perf_event_context *parent_ctx = ctx->parent_ctx;

	lockdep_assert_held(&ctx->lock);

	if (parent_ctx)
		ctx->parent_ctx = NULL;
	ctx->generation++;

	return parent_ctx;
}

static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
			       enum pid_type type)
{
	u32 nr;
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	nr = __task_pid_nr_ns(p, type, event->ns);
	/* avoid -1 if it is idle thread or runs in another ns */
	if (!nr && !pid_alive(p))
		nr = -1;
	return nr;
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
	return perf_event_pid_type(event, p, __PIDTYPE_TGID);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
	return perf_event_pid_type(event, p, PIDTYPE_PID);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
	struct perf_event_context *ctx;

retry:
	/*
	 * One of the few rules of preemptible RCU is that one cannot do
	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
	 * part of the read side critical section was irqs-enabled -- see
	 * rcu_read_unlock_special().
	 *
	 * Since ctx->lock nests under rq->lock we must ensure the entire read
	 * side critical section has interrupts disabled.
	 */
	local_irq_save(*flags);
	rcu_read_lock();
	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		raw_spin_lock(&ctx->lock);
		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
			raw_spin_unlock(&ctx->lock);
			rcu_read_unlock();
			local_irq_restore(*flags);
			goto retry;
		}

		if (ctx->task == TASK_TOMBSTONE ||
		    !atomic_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock(&ctx->lock);
			ctx = NULL;
		} else {
			WARN_ON_ONCE(ctx->task != task);
		}
	}
	rcu_read_unlock();
	if (!ctx)
		local_irq_restore(*flags);
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

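/*
 * Illustrative usage sketch (not part of the original file): callers that
 * need a task context to stay put across a blocking section (such as the
 * event-inheritance path in mainline) pair the two helpers above roughly as:
 *
 *	ctx = perf_pin_task_context(task, ctxn);
 *	if (ctx) {
 *		... work that must not race with a ctx swap ...
 *		perf_unpin_context(ctx);
 *		put_ctx(ctx);	// drop the reference taken while locking
 *	}
 */
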
/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	if (is_cgroup_event(event))
		return perf_cgroup_event_time(event);

	return ctx ? ctx->time : 0;
}

static enum event_type_t get_event_type(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	enum event_type_t event_type;

	lockdep_assert_held(&ctx->lock);

	/*
	 * It's 'group type', really, because if our group leader is
	 * pinned, so are we.
	 */
	if (event->group_leader != event)
		event = event->group_leader;

	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
	if (!ctx->task)
		event_type |= EVENT_CPU;

	return event_type;
}

static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	lockdep_assert_held(&ctx->lock);

	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	event->attach_state |= PERF_ATTACH_CONTEXT;

	event->tstamp = perf_event_time(event);

	/*
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
	 */
	if (event->group_leader == event) {
		struct list_head *list;

		event->group_caps = event->event_caps;

		list = ctx_group_list(event, ctx);
		list_add_tail(&event->group_entry, list);
	}

	list_update_cgroup_event(event, ctx, true);

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;

	ctx->generation++;
}

/*
 * Initialize event state based on the perf_event_attr::disabled.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
					      PERF_EVENT_STATE_INACTIVE;
}

static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_ID)
		entry += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;
		size += sizeof(u64);
	}

	size += entry * nr;
	event->read_size = size;
}

static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
	struct perf_sample_data *data;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_IP)
		size += sizeof(data->ip);

	if (sample_type & PERF_SAMPLE_ADDR)
		size += sizeof(data->addr);

	if (sample_type & PERF_SAMPLE_PERIOD)
		size += sizeof(data->period);

	if (sample_type & PERF_SAMPLE_WEIGHT)
		size += sizeof(data->weight);

	if (sample_type & PERF_SAMPLE_READ)
		size += event->read_size;

	if (sample_type & PERF_SAMPLE_DATA_SRC)
		size += sizeof(data->data_src.val);

	if (sample_type & PERF_SAMPLE_TRANSACTION)
		size += sizeof(data->txn);

	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
		size += sizeof(data->phys_addr);

	event->header_size = size;
}

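/*
 * Illustrative example (not part of the original file) of the read_size
 * computation above: for read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_ID | PERF_FORMAT_GROUP on a leader with two siblings,
 *
 *	entry = 8 (value) + 8 (id)        = 16 bytes
 *	nr    = 1 + 2 siblings            = 3 entries
 *	size  = 8 (time_enabled) + 8 (nr) = 16 bytes of header
 *	read_size = 16 + 3 * 16           = 64 bytes
 *
 * which matches the struct read_format layout documented in
 * include/uapi/linux/perf_event.h.
 */
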
/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 */
static void perf_event__header_size(struct perf_event *event)
{
	__perf_event_read_size(event,
			       event->group_leader->nr_siblings);
	__perf_event_header_size(event, event->attr.sample_type);
}

static void perf_event__id_header_size(struct perf_event *event)
{
	struct perf_sample_data *data;
	u64 sample_type = event->attr.sample_type;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_TID)
		size += sizeof(data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		size += sizeof(data->time);

	if (sample_type & PERF_SAMPLE_IDENTIFIER)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_ID)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		size += sizeof(data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		size += sizeof(data->cpu_entry);

	event->id_header_size = size;
}

static bool perf_event_validate_size(struct perf_event *event)
{
	/*
	 * The values computed here will be over-written when we actually
	 * attach the event.
	 */
	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
	perf_event__id_header_size(event);

	/*
	 * Sum the lot; should not exceed the 64k limit we have on records.
	 * Conservative limit to allow for callchains and other variable fields.
	 */
	if (event->read_size + event->header_size +
	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
		return false;

	return true;
}

static void perf_group_attach(struct perf_event *event)
{
	struct perf_event *group_leader = event->group_leader, *pos;

	lockdep_assert_held(&event->ctx->lock);

	/*
	 * We can have double attach due to group movement in perf_event_open.
	 */
	if (event->attach_state & PERF_ATTACH_GROUP)
		return;

	event->attach_state |= PERF_ATTACH_GROUP;

	if (group_leader == event)
		return;

	WARN_ON_ONCE(group_leader->ctx != event->ctx);

	group_leader->group_caps &= event->event_caps;

	list_add_tail(&event->group_entry, &group_leader->sibling_list);
	group_leader->nr_siblings++;

	perf_event__header_size(group_leader);

	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
		perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
		return;

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

	list_update_cgroup_event(event, ctx, false);

	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_rcu(&event->event_entry);

	if (event->group_leader == event)
		list_del_init(&event->group_entry);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		perf_event_set_state(event, PERF_EVENT_STATE_OFF);

	ctx->generation++;
}

static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *sibling, *tmp;
	struct list_head *list = NULL;

	lockdep_assert_held(&event->ctx->lock);

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (event->group_leader != event) {
		list_del_init(&event->group_entry);
		event->group_leader->nr_siblings--;
		goto out;
	}

	if (!list_empty(&event->group_entry))
		list = &event->group_entry;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to whatever list we are on.
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
		if (list)
			list_move_tail(&sibling->group_entry, list);
		sibling->group_leader = sibling;

		/* Inherit group flags from the previous leader */
		sibling->group_caps = event->group_caps;

		WARN_ON_ONCE(sibling->ctx != event->ctx);
	}

out:
	perf_event__header_size(event->group_leader);

	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
		perf_event__header_size(tmp);
}

static bool is_orphaned_event(struct perf_event *event)
{
	return event->state == PERF_EVENT_STATE_DEAD;
}

static inline int __pmu_filter_match(struct perf_event *event)
{
	struct pmu *pmu = event->pmu;
	return pmu->filter_match ? pmu->filter_match(event) : 1;
}

/*
 * Check whether we should attempt to schedule an event group based on
 * PMU-specific filtering. An event group can consist of HW and SW events,
 * potentially with a SW leader, so we must check all the filters to
 * determine whether a group is schedulable:
 */
static inline int pmu_filter_match(struct perf_event *event)
{
	struct perf_event *child;

	if (!__pmu_filter_match(event))
		return 0;

	list_for_each_entry(child, &event->sibling_list, group_entry) {
		if (!__pmu_filter_match(child))
			return 0;
	}

	return 1;
}

static inline int
event_filter_match(struct perf_event *event)
{
	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
	       perf_cgroup_match(event) && pmu_filter_match(event);
}

static void
event_sched_out(struct perf_event *event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;

	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	perf_pmu_disable(event->pmu);

	event->pmu->del(event, 0);
	event->oncpu = -1;

	if (event->pending_disable) {
		event->pending_disable = 0;
		state = PERF_EVENT_STATE_OFF;
	}
	perf_event_set_state(event, state);

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	if (!--ctx->nr_active)
		perf_event_ctx_deactivate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;

	perf_pmu_enable(event->pmu);
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;

	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	perf_pmu_disable(ctx->pmu);

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	perf_pmu_enable(ctx->pmu);

	if (group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

#define DETACH_GROUP	0x01UL

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void
__perf_remove_from_context(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   struct perf_event_context *ctx,
			   void *info)
{
	unsigned long flags = (unsigned long)info;

	if (ctx->is_active & EVENT_TIME) {
		update_context_time(ctx);
		update_cgrp_time_from_cpuctx(cpuctx);
	}

	event_sched_out(event, cpuctx, ctx);
	if (flags & DETACH_GROUP)
		perf_group_detach(event);
	list_del_event(event, ctx);

	if (!ctx->nr_events && ctx->is_active) {
		ctx->is_active = 0;
		if (ctx->task) {
			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
			cpuctx->task_ctx = NULL;
		}
	}
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
	struct perf_event_context *ctx = event->ctx;

	lockdep_assert_held(&ctx->mutex);

	event_function_call(event, __perf_remove_from_context, (void *)flags);

	/*
	 * The above event_function_call() can NO-OP when it hits
	 * TASK_TOMBSTONE. In that case we must already have been detached
	 * from the context (by perf_event_exit_event()) but the grouping
	 * might still be in-tact.
	 */
	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	if ((flags & DETACH_GROUP) &&
	    (event->attach_state & PERF_ATTACH_GROUP)) {
		/*
		 * Since in that case we cannot possibly be scheduled, simply
		 * detach now.
		 */
		raw_spin_lock_irq(&ctx->lock);
		perf_group_detach(event);
		raw_spin_unlock_irq(&ctx->lock);
	}
}

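/*
 * Illustrative note (not part of the original file): the flags argument is
 * the DETACH_GROUP bit defined above. Callers in mainline use it roughly as
 *
 *	perf_remove_from_context(event, 0);		// keep group linkage
 *	perf_remove_from_context(event, DETACH_GROUP);	// also ungroup (teardown)
 *
 * so a plain removal (e.g. for context migration) keeps the sibling list
 * intact while the final release path detaches the group as well.
 */
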
/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(struct perf_event *event,
				 struct perf_cpu_context *cpuctx,
				 struct perf_event_context *ctx,
				 void *info)
{
	if (event->state < PERF_EVENT_STATE_INACTIVE)
		return;

	if (ctx->is_active & EVENT_TIME) {
		update_context_time(ctx);
		update_cgrp_time_from_event(event);
	}

	if (event == event->group_leader)
		group_sched_out(event, cpuctx, ctx);
	else
		event_sched_out(event, cpuctx, ctx);

	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void _perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	raw_spin_lock_irq(&ctx->lock);
	if (event->state <= PERF_EVENT_STATE_OFF) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	raw_spin_unlock_irq(&ctx->lock);

	event_function_call(event, __perf_event_disable, NULL);
}

void perf_event_disable_local(struct perf_event *event)
{
	event_function_local(event, __perf_event_disable, NULL);
}

/*
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
2000 */ 2001 void perf_event_disable(struct perf_event *event) 2002 { 2003 struct perf_event_context *ctx; 2004 2005 ctx = perf_event_ctx_lock(event); 2006 _perf_event_disable(event); 2007 perf_event_ctx_unlock(event, ctx); 2008 } 2009 EXPORT_SYMBOL_GPL(perf_event_disable); 2010 2011 void perf_event_disable_inatomic(struct perf_event *event) 2012 { 2013 event->pending_disable = 1; 2014 irq_work_queue(&event->pending); 2015 } 2016 2017 static void perf_set_shadow_time(struct perf_event *event, 2018 struct perf_event_context *ctx) 2019 { 2020 /* 2021 * use the correct time source for the time snapshot 2022 * 2023 * We could get by without this by leveraging the 2024 * fact that to get to this function, the caller 2025 * has most likely already called update_context_time() 2026 * and update_cgrp_time_xx() and thus both timestamp 2027 * are identical (or very close). Given that tstamp is, 2028 * already adjusted for cgroup, we could say that: 2029 * tstamp - ctx->timestamp 2030 * is equivalent to 2031 * tstamp - cgrp->timestamp. 2032 * 2033 * Then, in perf_output_read(), the calculation would 2034 * work with no changes because: 2035 * - event is guaranteed scheduled in 2036 * - no scheduled out in between 2037 * - thus the timestamp would be the same 2038 * 2039 * But this is a bit hairy. 2040 * 2041 * So instead, we have an explicit cgroup call to remain 2042 * within the time time source all along. We believe it 2043 * is cleaner and simpler to understand. 2044 */ 2045 if (is_cgroup_event(event)) 2046 perf_cgroup_set_shadow_time(event, event->tstamp); 2047 else 2048 event->shadow_ctx_time = event->tstamp - ctx->timestamp; 2049 } 2050 2051 #define MAX_INTERRUPTS (~0ULL) 2052 2053 static void perf_log_throttle(struct perf_event *event, int enable); 2054 static void perf_log_itrace_start(struct perf_event *event); 2055 2056 static int 2057 event_sched_in(struct perf_event *event, 2058 struct perf_cpu_context *cpuctx, 2059 struct perf_event_context *ctx) 2060 { 2061 int ret = 0; 2062 2063 lockdep_assert_held(&ctx->lock); 2064 2065 if (event->state <= PERF_EVENT_STATE_OFF) 2066 return 0; 2067 2068 WRITE_ONCE(event->oncpu, smp_processor_id()); 2069 /* 2070 * Order event::oncpu write to happen before the ACTIVE state is 2071 * visible. This allows perf_event_{stop,read}() to observe the correct 2072 * ->oncpu if it sees ACTIVE. 2073 */ 2074 smp_wmb(); 2075 perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); 2076 2077 /* 2078 * Unthrottle events, since we scheduled we might have missed several 2079 * ticks already, also for a heavily scheduling task there is little 2080 * guarantee it'll get a tick in a timely manner. 
2081 */ 2082 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { 2083 perf_log_throttle(event, 1); 2084 event->hw.interrupts = 0; 2085 } 2086 2087 perf_pmu_disable(event->pmu); 2088 2089 perf_set_shadow_time(event, ctx); 2090 2091 perf_log_itrace_start(event); 2092 2093 if (event->pmu->add(event, PERF_EF_START)) { 2094 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 2095 event->oncpu = -1; 2096 ret = -EAGAIN; 2097 goto out; 2098 } 2099 2100 if (!is_software_event(event)) 2101 cpuctx->active_oncpu++; 2102 if (!ctx->nr_active++) 2103 perf_event_ctx_activate(ctx); 2104 if (event->attr.freq && event->attr.sample_freq) 2105 ctx->nr_freq++; 2106 2107 if (event->attr.exclusive) 2108 cpuctx->exclusive = 1; 2109 2110 out: 2111 perf_pmu_enable(event->pmu); 2112 2113 return ret; 2114 } 2115 2116 static int 2117 group_sched_in(struct perf_event *group_event, 2118 struct perf_cpu_context *cpuctx, 2119 struct perf_event_context *ctx) 2120 { 2121 struct perf_event *event, *partial_group = NULL; 2122 struct pmu *pmu = ctx->pmu; 2123 2124 if (group_event->state == PERF_EVENT_STATE_OFF) 2125 return 0; 2126 2127 pmu->start_txn(pmu, PERF_PMU_TXN_ADD); 2128 2129 if (event_sched_in(group_event, cpuctx, ctx)) { 2130 pmu->cancel_txn(pmu); 2131 perf_mux_hrtimer_restart(cpuctx); 2132 return -EAGAIN; 2133 } 2134 2135 /* 2136 * Schedule in siblings as one group (if any): 2137 */ 2138 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 2139 if (event_sched_in(event, cpuctx, ctx)) { 2140 partial_group = event; 2141 goto group_error; 2142 } 2143 } 2144 2145 if (!pmu->commit_txn(pmu)) 2146 return 0; 2147 2148 group_error: 2149 /* 2150 * Groups can be scheduled in as one unit only, so undo any 2151 * partial group before returning: 2152 * The events up to the failed event are scheduled out normally. 2153 */ 2154 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 2155 if (event == partial_group) 2156 break; 2157 2158 event_sched_out(event, cpuctx, ctx); 2159 } 2160 event_sched_out(group_event, cpuctx, ctx); 2161 2162 pmu->cancel_txn(pmu); 2163 2164 perf_mux_hrtimer_restart(cpuctx); 2165 2166 return -EAGAIN; 2167 } 2168 2169 /* 2170 * Work out whether we can put this event group on the CPU now. 2171 */ 2172 static int group_can_go_on(struct perf_event *event, 2173 struct perf_cpu_context *cpuctx, 2174 int can_add_hw) 2175 { 2176 /* 2177 * Groups consisting entirely of software events can always go on. 2178 */ 2179 if (event->group_caps & PERF_EV_CAP_SOFTWARE) 2180 return 1; 2181 /* 2182 * If an exclusive group is already on, no other hardware 2183 * events can go on. 2184 */ 2185 if (cpuctx->exclusive) 2186 return 0; 2187 /* 2188 * If this group is exclusive and there are already 2189 * events on the CPU, it can't go on. 2190 */ 2191 if (event->attr.exclusive && cpuctx->active_oncpu) 2192 return 0; 2193 /* 2194 * Otherwise, try to add it if all previous groups were able 2195 * to go on. 
2196 */ 2197 return can_add_hw; 2198 } 2199 2200 static void add_event_to_ctx(struct perf_event *event, 2201 struct perf_event_context *ctx) 2202 { 2203 list_add_event(event, ctx); 2204 perf_group_attach(event); 2205 } 2206 2207 static void ctx_sched_out(struct perf_event_context *ctx, 2208 struct perf_cpu_context *cpuctx, 2209 enum event_type_t event_type); 2210 static void 2211 ctx_sched_in(struct perf_event_context *ctx, 2212 struct perf_cpu_context *cpuctx, 2213 enum event_type_t event_type, 2214 struct task_struct *task); 2215 2216 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2217 struct perf_event_context *ctx, 2218 enum event_type_t event_type) 2219 { 2220 if (!cpuctx->task_ctx) 2221 return; 2222 2223 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2224 return; 2225 2226 ctx_sched_out(ctx, cpuctx, event_type); 2227 } 2228 2229 static void perf_event_sched_in(struct perf_cpu_context *cpuctx, 2230 struct perf_event_context *ctx, 2231 struct task_struct *task) 2232 { 2233 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); 2234 if (ctx) 2235 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2236 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); 2237 if (ctx) 2238 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); 2239 } 2240 2241 /* 2242 * We want to maintain the following priority of scheduling: 2243 * - CPU pinned (EVENT_CPU | EVENT_PINNED) 2244 * - task pinned (EVENT_PINNED) 2245 * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) 2246 * - task flexible (EVENT_FLEXIBLE). 2247 * 2248 * In order to avoid unscheduling and scheduling back in everything every 2249 * time an event is added, only do it for the groups of equal priority and 2250 * below. 2251 * 2252 * This can be called after a batch operation on task events, in which case 2253 * event_type is a bit mask of the types of events involved. For CPU events, 2254 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. 2255 */ 2256 static void ctx_resched(struct perf_cpu_context *cpuctx, 2257 struct perf_event_context *task_ctx, 2258 enum event_type_t event_type) 2259 { 2260 enum event_type_t ctx_event_type; 2261 bool cpu_event = !!(event_type & EVENT_CPU); 2262 2263 /* 2264 * If pinned groups are involved, flexible groups also need to be 2265 * scheduled out. 2266 */ 2267 if (event_type & EVENT_PINNED) 2268 event_type |= EVENT_FLEXIBLE; 2269 2270 ctx_event_type = event_type & EVENT_ALL; 2271 2272 perf_pmu_disable(cpuctx->ctx.pmu); 2273 if (task_ctx) 2274 task_ctx_sched_out(cpuctx, task_ctx, event_type); 2275 2276 /* 2277 * Decide which cpu ctx groups to schedule out based on the types 2278 * of events that caused rescheduling: 2279 * - EVENT_CPU: schedule out corresponding groups; 2280 * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; 2281 * - otherwise, do nothing more. 2282 */ 2283 if (cpu_event) 2284 cpu_ctx_sched_out(cpuctx, ctx_event_type); 2285 else if (ctx_event_type & EVENT_PINNED) 2286 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2287 2288 perf_event_sched_in(cpuctx, task_ctx, current); 2289 perf_pmu_enable(cpuctx->ctx.pmu); 2290 } 2291 2292 /* 2293 * Cross CPU call to install and enable a performance event 2294 * 2295 * Very similar to remote_function() + event_function() but cannot assume that 2296 * things like ctx->is_active and cpuctx->task_ctx are set. 
2297 */
2298 static int __perf_install_in_context(void *info)
2299 {
2300 struct perf_event *event = info;
2301 struct perf_event_context *ctx = event->ctx;
2302 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2303 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2304 bool reprogram = true;
2305 int ret = 0;
2306
2307 raw_spin_lock(&cpuctx->ctx.lock);
2308 if (ctx->task) {
2309 raw_spin_lock(&ctx->lock);
2310 task_ctx = ctx;
2311
2312 reprogram = (ctx->task == current);
2313
2314 /*
2315 * If the task is running, it must be running on this CPU,
2316 * otherwise we cannot reprogram things.
2317 *
2318 * If it's not running, we don't care; ctx->lock will
2319 * serialize against it becoming runnable.
2320 */
2321 if (task_curr(ctx->task) && !reprogram) {
2322 ret = -ESRCH;
2323 goto unlock;
2324 }
2325
2326 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2327 } else if (task_ctx) {
2328 raw_spin_lock(&task_ctx->lock);
2329 }
2330
2331 if (reprogram) {
2332 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2333 add_event_to_ctx(event, ctx);
2334 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2335 } else {
2336 add_event_to_ctx(event, ctx);
2337 }
2338
2339 unlock:
2340 perf_ctx_unlock(cpuctx, task_ctx);
2341
2342 return ret;
2343 }
2344
2345 /*
2346 * Attach a performance event to a context.
2347 *
2348 * Very similar to event_function_call, see comment there.
2349 */
2350 static void
2351 perf_install_in_context(struct perf_event_context *ctx,
2352 struct perf_event *event,
2353 int cpu)
2354 {
2355 struct task_struct *task = READ_ONCE(ctx->task);
2356
2357 lockdep_assert_held(&ctx->mutex);
2358
2359 if (event->cpu != -1)
2360 event->cpu = cpu;
2361
2362 /*
2363 * Ensures that if we can observe event->ctx, both the event and ctx
2364 * will be 'complete'. See perf_iterate_sb_cpu().
2365 */
2366 smp_store_release(&event->ctx, ctx);
2367
2368 if (!task) {
2369 cpu_function_call(cpu, __perf_install_in_context, event);
2370 return;
2371 }
2372
2373 /*
2374 * Should not happen, we validate the ctx is still alive before calling.
2375 */
2376 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2377 return;
2378
2379 /*
2380 * Installing events is tricky because we cannot rely on ctx->is_active
2381 * to be set in case this is the nr_events 0 -> 1 transition.
2382 *
2383 * Instead we use task_curr(), which tells us if the task is running.
2384 * However, since we use task_curr() outside of rq::lock, we can race
2385 * against the actual state. This means the result can be wrong.
2386 *
2387 * If we get a false positive, we retry; this is harmless.
2388 *
2389 * If we get a false negative, things are complicated. If we are after
2390 * perf_event_context_sched_in(), ctx::lock will serialize us, and the
2391 * value must be correct. If we're before, it doesn't matter since
2392 * perf_event_context_sched_in() will program the counter.
2393 *
2394 * However, this hinges on the remote context switch having observed
2395 * our task->perf_event_ctxp[] store, such that it will in fact take
2396 * ctx::lock in perf_event_context_sched_in().
2397 *
2398 * We do this by task_function_call(); if the IPI fails to hit the task,
2399 * we know any future context switch of the task must see the
2400 * perf_event_ctxp[] store.
2401 */
2402
2403 /*
2404 * This smp_mb() orders the task->perf_event_ctxp[] store with the
2405 * task_cpu() load, such that if the IPI then does not find the task
2406 * running, a future context switch of that task must observe the
2407 * store.
2408 */ 2409 smp_mb(); 2410 again: 2411 if (!task_function_call(task, __perf_install_in_context, event)) 2412 return; 2413 2414 raw_spin_lock_irq(&ctx->lock); 2415 task = ctx->task; 2416 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { 2417 /* 2418 * Cannot happen because we already checked above (which also 2419 * cannot happen), and we hold ctx->mutex, which serializes us 2420 * against perf_event_exit_task_context(). 2421 */ 2422 raw_spin_unlock_irq(&ctx->lock); 2423 return; 2424 } 2425 /* 2426 * If the task is not running, ctx->lock will avoid it becoming so, 2427 * thus we can safely install the event. 2428 */ 2429 if (task_curr(task)) { 2430 raw_spin_unlock_irq(&ctx->lock); 2431 goto again; 2432 } 2433 add_event_to_ctx(event, ctx); 2434 raw_spin_unlock_irq(&ctx->lock); 2435 } 2436 2437 /* 2438 * Cross CPU call to enable a performance event 2439 */ 2440 static void __perf_event_enable(struct perf_event *event, 2441 struct perf_cpu_context *cpuctx, 2442 struct perf_event_context *ctx, 2443 void *info) 2444 { 2445 struct perf_event *leader = event->group_leader; 2446 struct perf_event_context *task_ctx; 2447 2448 if (event->state >= PERF_EVENT_STATE_INACTIVE || 2449 event->state <= PERF_EVENT_STATE_ERROR) 2450 return; 2451 2452 if (ctx->is_active) 2453 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 2454 2455 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 2456 2457 if (!ctx->is_active) 2458 return; 2459 2460 if (!event_filter_match(event)) { 2461 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 2462 return; 2463 } 2464 2465 /* 2466 * If the event is in a group and isn't the group leader, 2467 * then don't put it on unless the group is on. 2468 */ 2469 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { 2470 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 2471 return; 2472 } 2473 2474 task_ctx = cpuctx->task_ctx; 2475 if (ctx->task) 2476 WARN_ON_ONCE(task_ctx != ctx); 2477 2478 ctx_resched(cpuctx, task_ctx, get_event_type(event)); 2479 } 2480 2481 /* 2482 * Enable a event. 2483 * 2484 * If event->ctx is a cloned context, callers must make sure that 2485 * every task struct that event->ctx->task could possibly point to 2486 * remains valid. This condition is satisfied when called through 2487 * perf_event_for_each_child or perf_event_for_each as described 2488 * for perf_event_disable. 2489 */ 2490 static void _perf_event_enable(struct perf_event *event) 2491 { 2492 struct perf_event_context *ctx = event->ctx; 2493 2494 raw_spin_lock_irq(&ctx->lock); 2495 if (event->state >= PERF_EVENT_STATE_INACTIVE || 2496 event->state < PERF_EVENT_STATE_ERROR) { 2497 raw_spin_unlock_irq(&ctx->lock); 2498 return; 2499 } 2500 2501 /* 2502 * If the event is in error state, clear that first. 2503 * 2504 * That way, if we see the event in error state below, we know that it 2505 * has gone back into error state, as distinct from the task having 2506 * been scheduled away before the cross-call arrived. 
2507 */ 2508 if (event->state == PERF_EVENT_STATE_ERROR) 2509 event->state = PERF_EVENT_STATE_OFF; 2510 raw_spin_unlock_irq(&ctx->lock); 2511 2512 event_function_call(event, __perf_event_enable, NULL); 2513 } 2514 2515 /* 2516 * See perf_event_disable(); 2517 */ 2518 void perf_event_enable(struct perf_event *event) 2519 { 2520 struct perf_event_context *ctx; 2521 2522 ctx = perf_event_ctx_lock(event); 2523 _perf_event_enable(event); 2524 perf_event_ctx_unlock(event, ctx); 2525 } 2526 EXPORT_SYMBOL_GPL(perf_event_enable); 2527 2528 struct stop_event_data { 2529 struct perf_event *event; 2530 unsigned int restart; 2531 }; 2532 2533 static int __perf_event_stop(void *info) 2534 { 2535 struct stop_event_data *sd = info; 2536 struct perf_event *event = sd->event; 2537 2538 /* if it's already INACTIVE, do nothing */ 2539 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) 2540 return 0; 2541 2542 /* matches smp_wmb() in event_sched_in() */ 2543 smp_rmb(); 2544 2545 /* 2546 * There is a window with interrupts enabled before we get here, 2547 * so we need to check again lest we try to stop another CPU's event. 2548 */ 2549 if (READ_ONCE(event->oncpu) != smp_processor_id()) 2550 return -EAGAIN; 2551 2552 event->pmu->stop(event, PERF_EF_UPDATE); 2553 2554 /* 2555 * May race with the actual stop (through perf_pmu_output_stop()), 2556 * but it is only used for events with AUX ring buffer, and such 2557 * events will refuse to restart because of rb::aux_mmap_count==0, 2558 * see comments in perf_aux_output_begin(). 2559 * 2560 * Since this is happening on a event-local CPU, no trace is lost 2561 * while restarting. 2562 */ 2563 if (sd->restart) 2564 event->pmu->start(event, 0); 2565 2566 return 0; 2567 } 2568 2569 static int perf_event_stop(struct perf_event *event, int restart) 2570 { 2571 struct stop_event_data sd = { 2572 .event = event, 2573 .restart = restart, 2574 }; 2575 int ret = 0; 2576 2577 do { 2578 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) 2579 return 0; 2580 2581 /* matches smp_wmb() in event_sched_in() */ 2582 smp_rmb(); 2583 2584 /* 2585 * We only want to restart ACTIVE events, so if the event goes 2586 * inactive here (event->oncpu==-1), there's nothing more to do; 2587 * fall through with ret==-ENXIO. 2588 */ 2589 ret = cpu_function_call(READ_ONCE(event->oncpu), 2590 __perf_event_stop, &sd); 2591 } while (ret == -EAGAIN); 2592 2593 return ret; 2594 } 2595 2596 /* 2597 * In order to contain the amount of racy and tricky in the address filter 2598 * configuration management, it is a two part process: 2599 * 2600 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, 2601 * we update the addresses of corresponding vmas in 2602 * event::addr_filters_offs array and bump the event::addr_filters_gen; 2603 * (p2) when an event is scheduled in (pmu::add), it calls 2604 * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() 2605 * if the generation has changed since the previous call. 2606 * 2607 * If (p1) happens while the event is active, we restart it to force (p2). 2608 * 2609 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on 2610 * pre-existing mappings, called once when new filters arrive via SET_FILTER 2611 * ioctl; 2612 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly 2613 * registered mapping, called for every new mmap(), with mm::mmap_sem down 2614 * for reading; 2615 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process 2616 * of exec. 
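 *
 * A hypothetical driver-side sketch of how (p2) is consumed when the event
 * is scheduled in (all foo_* names are made up):
 *
 *	static void foo_pmu_start(struct perf_event *event, int mode)
 *	{
 *		perf_event_addr_filters_sync(event);
 *		foo_hw_program_ranges(event);
 *		foo_hw_enable(event);
 *	}
 *
 * where the driver's pmu::addr_filters_sync() callback copies the already
 * adjusted offsets from event::addr_filters_offs into its hardware range
 * registers, under the filter lock taken by perf_event_addr_filters_sync()
 * below.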
2617 */ 2618 void perf_event_addr_filters_sync(struct perf_event *event) 2619 { 2620 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 2621 2622 if (!has_addr_filter(event)) 2623 return; 2624 2625 raw_spin_lock(&ifh->lock); 2626 if (event->addr_filters_gen != event->hw.addr_filters_gen) { 2627 event->pmu->addr_filters_sync(event); 2628 event->hw.addr_filters_gen = event->addr_filters_gen; 2629 } 2630 raw_spin_unlock(&ifh->lock); 2631 } 2632 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); 2633 2634 static int _perf_event_refresh(struct perf_event *event, int refresh) 2635 { 2636 /* 2637 * not supported on inherited events 2638 */ 2639 if (event->attr.inherit || !is_sampling_event(event)) 2640 return -EINVAL; 2641 2642 atomic_add(refresh, &event->event_limit); 2643 _perf_event_enable(event); 2644 2645 return 0; 2646 } 2647 2648 /* 2649 * See perf_event_disable() 2650 */ 2651 int perf_event_refresh(struct perf_event *event, int refresh) 2652 { 2653 struct perf_event_context *ctx; 2654 int ret; 2655 2656 ctx = perf_event_ctx_lock(event); 2657 ret = _perf_event_refresh(event, refresh); 2658 perf_event_ctx_unlock(event, ctx); 2659 2660 return ret; 2661 } 2662 EXPORT_SYMBOL_GPL(perf_event_refresh); 2663 2664 static void ctx_sched_out(struct perf_event_context *ctx, 2665 struct perf_cpu_context *cpuctx, 2666 enum event_type_t event_type) 2667 { 2668 int is_active = ctx->is_active; 2669 struct perf_event *event; 2670 2671 lockdep_assert_held(&ctx->lock); 2672 2673 if (likely(!ctx->nr_events)) { 2674 /* 2675 * See __perf_remove_from_context(). 2676 */ 2677 WARN_ON_ONCE(ctx->is_active); 2678 if (ctx->task) 2679 WARN_ON_ONCE(cpuctx->task_ctx); 2680 return; 2681 } 2682 2683 ctx->is_active &= ~event_type; 2684 if (!(ctx->is_active & EVENT_ALL)) 2685 ctx->is_active = 0; 2686 2687 if (ctx->task) { 2688 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2689 if (!ctx->is_active) 2690 cpuctx->task_ctx = NULL; 2691 } 2692 2693 /* 2694 * Always update time if it was set; not only when it changes. 2695 * Otherwise we can 'forget' to update time for any but the last 2696 * context we sched out. For example: 2697 * 2698 * ctx_sched_out(.event_type = EVENT_FLEXIBLE) 2699 * ctx_sched_out(.event_type = EVENT_PINNED) 2700 * 2701 * would only update time for the pinned events. 2702 */ 2703 if (is_active & EVENT_TIME) { 2704 /* update (and stop) ctx time */ 2705 update_context_time(ctx); 2706 update_cgrp_time_from_cpuctx(cpuctx); 2707 } 2708 2709 is_active ^= ctx->is_active; /* changed bits */ 2710 2711 if (!ctx->nr_active || !(is_active & EVENT_ALL)) 2712 return; 2713 2714 perf_pmu_disable(ctx->pmu); 2715 if (is_active & EVENT_PINNED) { 2716 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 2717 group_sched_out(event, cpuctx, ctx); 2718 } 2719 2720 if (is_active & EVENT_FLEXIBLE) { 2721 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 2722 group_sched_out(event, cpuctx, ctx); 2723 } 2724 perf_pmu_enable(ctx->pmu); 2725 } 2726 2727 /* 2728 * Test whether two contexts are equivalent, i.e. whether they have both been 2729 * cloned from the same version of the same context. 2730 * 2731 * Equivalence is measured using a generation number in the context that is 2732 * incremented on each modification to it; see unclone_ctx(), list_add_event() 2733 * and list_del_event(). 
2734 */ 2735 static int context_equiv(struct perf_event_context *ctx1, 2736 struct perf_event_context *ctx2) 2737 { 2738 lockdep_assert_held(&ctx1->lock); 2739 lockdep_assert_held(&ctx2->lock); 2740 2741 /* Pinning disables the swap optimization */ 2742 if (ctx1->pin_count || ctx2->pin_count) 2743 return 0; 2744 2745 /* If ctx1 is the parent of ctx2 */ 2746 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) 2747 return 1; 2748 2749 /* If ctx2 is the parent of ctx1 */ 2750 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) 2751 return 1; 2752 2753 /* 2754 * If ctx1 and ctx2 have the same parent; we flatten the parent 2755 * hierarchy, see perf_event_init_context(). 2756 */ 2757 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && 2758 ctx1->parent_gen == ctx2->parent_gen) 2759 return 1; 2760 2761 /* Unmatched */ 2762 return 0; 2763 } 2764 2765 static void __perf_event_sync_stat(struct perf_event *event, 2766 struct perf_event *next_event) 2767 { 2768 u64 value; 2769 2770 if (!event->attr.inherit_stat) 2771 return; 2772 2773 /* 2774 * Update the event value, we cannot use perf_event_read() 2775 * because we're in the middle of a context switch and have IRQs 2776 * disabled, which upsets smp_call_function_single(), however 2777 * we know the event must be on the current CPU, therefore we 2778 * don't need to use it. 2779 */ 2780 if (event->state == PERF_EVENT_STATE_ACTIVE) 2781 event->pmu->read(event); 2782 2783 perf_event_update_time(event); 2784 2785 /* 2786 * In order to keep per-task stats reliable we need to flip the event 2787 * values when we flip the contexts. 2788 */ 2789 value = local64_read(&next_event->count); 2790 value = local64_xchg(&event->count, value); 2791 local64_set(&next_event->count, value); 2792 2793 swap(event->total_time_enabled, next_event->total_time_enabled); 2794 swap(event->total_time_running, next_event->total_time_running); 2795 2796 /* 2797 * Since we swizzled the values, update the user visible data too. 
2798 */ 2799 perf_event_update_userpage(event); 2800 perf_event_update_userpage(next_event); 2801 } 2802 2803 static void perf_event_sync_stat(struct perf_event_context *ctx, 2804 struct perf_event_context *next_ctx) 2805 { 2806 struct perf_event *event, *next_event; 2807 2808 if (!ctx->nr_stat) 2809 return; 2810 2811 update_context_time(ctx); 2812 2813 event = list_first_entry(&ctx->event_list, 2814 struct perf_event, event_entry); 2815 2816 next_event = list_first_entry(&next_ctx->event_list, 2817 struct perf_event, event_entry); 2818 2819 while (&event->event_entry != &ctx->event_list && 2820 &next_event->event_entry != &next_ctx->event_list) { 2821 2822 __perf_event_sync_stat(event, next_event); 2823 2824 event = list_next_entry(event, event_entry); 2825 next_event = list_next_entry(next_event, event_entry); 2826 } 2827 } 2828 2829 static void perf_event_context_sched_out(struct task_struct *task, int ctxn, 2830 struct task_struct *next) 2831 { 2832 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 2833 struct perf_event_context *next_ctx; 2834 struct perf_event_context *parent, *next_parent; 2835 struct perf_cpu_context *cpuctx; 2836 int do_switch = 1; 2837 2838 if (likely(!ctx)) 2839 return; 2840 2841 cpuctx = __get_cpu_context(ctx); 2842 if (!cpuctx->task_ctx) 2843 return; 2844 2845 rcu_read_lock(); 2846 next_ctx = next->perf_event_ctxp[ctxn]; 2847 if (!next_ctx) 2848 goto unlock; 2849 2850 parent = rcu_dereference(ctx->parent_ctx); 2851 next_parent = rcu_dereference(next_ctx->parent_ctx); 2852 2853 /* If neither context have a parent context; they cannot be clones. */ 2854 if (!parent && !next_parent) 2855 goto unlock; 2856 2857 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2858 /* 2859 * Looks like the two contexts are clones, so we might be 2860 * able to optimize the context switch. We lock both 2861 * contexts and check that they are clones under the 2862 * lock (including re-checking that neither has been 2863 * uncloned in the meantime). It doesn't matter which 2864 * order we take the locks because no other cpu could 2865 * be trying to lock both of these tasks. 2866 */ 2867 raw_spin_lock(&ctx->lock); 2868 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 2869 if (context_equiv(ctx, next_ctx)) { 2870 WRITE_ONCE(ctx->task, next); 2871 WRITE_ONCE(next_ctx->task, task); 2872 2873 swap(ctx->task_ctx_data, next_ctx->task_ctx_data); 2874 2875 /* 2876 * RCU_INIT_POINTER here is safe because we've not 2877 * modified the ctx and the above modification of 2878 * ctx->task and ctx->task_ctx_data are immaterial 2879 * since those values are always verified under 2880 * ctx->lock which we're now holding. 
2881 */ 2882 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); 2883 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); 2884 2885 do_switch = 0; 2886 2887 perf_event_sync_stat(ctx, next_ctx); 2888 } 2889 raw_spin_unlock(&next_ctx->lock); 2890 raw_spin_unlock(&ctx->lock); 2891 } 2892 unlock: 2893 rcu_read_unlock(); 2894 2895 if (do_switch) { 2896 raw_spin_lock(&ctx->lock); 2897 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); 2898 raw_spin_unlock(&ctx->lock); 2899 } 2900 } 2901 2902 static DEFINE_PER_CPU(struct list_head, sched_cb_list); 2903 2904 void perf_sched_cb_dec(struct pmu *pmu) 2905 { 2906 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 2907 2908 this_cpu_dec(perf_sched_cb_usages); 2909 2910 if (!--cpuctx->sched_cb_usage) 2911 list_del(&cpuctx->sched_cb_entry); 2912 } 2913 2914 2915 void perf_sched_cb_inc(struct pmu *pmu) 2916 { 2917 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 2918 2919 if (!cpuctx->sched_cb_usage++) 2920 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); 2921 2922 this_cpu_inc(perf_sched_cb_usages); 2923 } 2924 2925 /* 2926 * This function provides the context switch callback to the lower code 2927 * layer. It is invoked ONLY when the context switch callback is enabled. 2928 * 2929 * This callback is relevant even to per-cpu events; for example multi event 2930 * PEBS requires this to provide PID/TID information. This requires we flush 2931 * all queued PEBS records before we context switch to a new task. 2932 */ 2933 static void perf_pmu_sched_task(struct task_struct *prev, 2934 struct task_struct *next, 2935 bool sched_in) 2936 { 2937 struct perf_cpu_context *cpuctx; 2938 struct pmu *pmu; 2939 2940 if (prev == next) 2941 return; 2942 2943 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { 2944 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ 2945 2946 if (WARN_ON_ONCE(!pmu->sched_task)) 2947 continue; 2948 2949 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2950 perf_pmu_disable(pmu); 2951 2952 pmu->sched_task(cpuctx->task_ctx, sched_in); 2953 2954 perf_pmu_enable(pmu); 2955 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 2956 } 2957 } 2958 2959 static void perf_event_switch(struct task_struct *task, 2960 struct task_struct *next_prev, bool sched_in); 2961 2962 #define for_each_task_context_nr(ctxn) \ 2963 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) 2964 2965 /* 2966 * Called from scheduler to remove the events of the current task, 2967 * with interrupts disabled. 2968 * 2969 * We stop each event and update the event value in event->count. 2970 * 2971 * This does not protect us against NMI, but disable() 2972 * sets the disabled bit in the control field of event _before_ 2973 * accessing the event control register. If a NMI hits, then it will 2974 * not restart the event. 2975 */ 2976 void __perf_event_task_sched_out(struct task_struct *task, 2977 struct task_struct *next) 2978 { 2979 int ctxn; 2980 2981 if (__this_cpu_read(perf_sched_cb_usages)) 2982 perf_pmu_sched_task(task, next, false); 2983 2984 if (atomic_read(&nr_switch_events)) 2985 perf_event_switch(task, next, false); 2986 2987 for_each_task_context_nr(ctxn) 2988 perf_event_context_sched_out(task, ctxn, next); 2989 2990 /* 2991 * if cgroup events exist on this CPU, then we need 2992 * to check if we have to switch out PMU state. 
2993 * cgroup event are system-wide mode only 2994 */ 2995 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2996 perf_cgroup_sched_out(task, next); 2997 } 2998 2999 /* 3000 * Called with IRQs disabled 3001 */ 3002 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 3003 enum event_type_t event_type) 3004 { 3005 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); 3006 } 3007 3008 static void 3009 ctx_pinned_sched_in(struct perf_event_context *ctx, 3010 struct perf_cpu_context *cpuctx) 3011 { 3012 struct perf_event *event; 3013 3014 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 3015 if (event->state <= PERF_EVENT_STATE_OFF) 3016 continue; 3017 if (!event_filter_match(event)) 3018 continue; 3019 3020 if (group_can_go_on(event, cpuctx, 1)) 3021 group_sched_in(event, cpuctx, ctx); 3022 3023 /* 3024 * If this pinned group hasn't been scheduled, 3025 * put it in error state. 3026 */ 3027 if (event->state == PERF_EVENT_STATE_INACTIVE) 3028 perf_event_set_state(event, PERF_EVENT_STATE_ERROR); 3029 } 3030 } 3031 3032 static void 3033 ctx_flexible_sched_in(struct perf_event_context *ctx, 3034 struct perf_cpu_context *cpuctx) 3035 { 3036 struct perf_event *event; 3037 int can_add_hw = 1; 3038 3039 list_for_each_entry(event, &ctx->flexible_groups, group_entry) { 3040 /* Ignore events in OFF or ERROR state */ 3041 if (event->state <= PERF_EVENT_STATE_OFF) 3042 continue; 3043 /* 3044 * Listen to the 'cpu' scheduling filter constraint 3045 * of events: 3046 */ 3047 if (!event_filter_match(event)) 3048 continue; 3049 3050 if (group_can_go_on(event, cpuctx, can_add_hw)) { 3051 if (group_sched_in(event, cpuctx, ctx)) 3052 can_add_hw = 0; 3053 } 3054 } 3055 } 3056 3057 static void 3058 ctx_sched_in(struct perf_event_context *ctx, 3059 struct perf_cpu_context *cpuctx, 3060 enum event_type_t event_type, 3061 struct task_struct *task) 3062 { 3063 int is_active = ctx->is_active; 3064 u64 now; 3065 3066 lockdep_assert_held(&ctx->lock); 3067 3068 if (likely(!ctx->nr_events)) 3069 return; 3070 3071 ctx->is_active |= (event_type | EVENT_TIME); 3072 if (ctx->task) { 3073 if (!is_active) 3074 cpuctx->task_ctx = ctx; 3075 else 3076 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 3077 } 3078 3079 is_active ^= ctx->is_active; /* changed bits */ 3080 3081 if (is_active & EVENT_TIME) { 3082 /* start ctx time */ 3083 now = perf_clock(); 3084 ctx->timestamp = now; 3085 perf_cgroup_set_timestamp(task, ctx); 3086 } 3087 3088 /* 3089 * First go through the list and put on any pinned groups 3090 * in order to give them the best chance of going on. 3091 */ 3092 if (is_active & EVENT_PINNED) 3093 ctx_pinned_sched_in(ctx, cpuctx); 3094 3095 /* Then walk through the lower prio flexible groups */ 3096 if (is_active & EVENT_FLEXIBLE) 3097 ctx_flexible_sched_in(ctx, cpuctx); 3098 } 3099 3100 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 3101 enum event_type_t event_type, 3102 struct task_struct *task) 3103 { 3104 struct perf_event_context *ctx = &cpuctx->ctx; 3105 3106 ctx_sched_in(ctx, cpuctx, event_type, task); 3107 } 3108 3109 static void perf_event_context_sched_in(struct perf_event_context *ctx, 3110 struct task_struct *task) 3111 { 3112 struct perf_cpu_context *cpuctx; 3113 3114 cpuctx = __get_cpu_context(ctx); 3115 if (cpuctx->task_ctx == ctx) 3116 return; 3117 3118 perf_ctx_lock(cpuctx, ctx); 3119 /* 3120 * We must check ctx->nr_events while holding ctx->lock, such 3121 * that we serialize against perf_install_in_context(). 
3122 */ 3123 if (!ctx->nr_events) 3124 goto unlock; 3125 3126 perf_pmu_disable(ctx->pmu); 3127 /* 3128 * We want to keep the following priority order: 3129 * cpu pinned (that don't need to move), task pinned, 3130 * cpu flexible, task flexible. 3131 * 3132 * However, if task's ctx is not carrying any pinned 3133 * events, no need to flip the cpuctx's events around. 3134 */ 3135 if (!list_empty(&ctx->pinned_groups)) 3136 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 3137 perf_event_sched_in(cpuctx, ctx, task); 3138 perf_pmu_enable(ctx->pmu); 3139 3140 unlock: 3141 perf_ctx_unlock(cpuctx, ctx); 3142 } 3143 3144 /* 3145 * Called from scheduler to add the events of the current task 3146 * with interrupts disabled. 3147 * 3148 * We restore the event value and then enable it. 3149 * 3150 * This does not protect us against NMI, but enable() 3151 * sets the enabled bit in the control field of event _before_ 3152 * accessing the event control register. If a NMI hits, then it will 3153 * keep the event running. 3154 */ 3155 void __perf_event_task_sched_in(struct task_struct *prev, 3156 struct task_struct *task) 3157 { 3158 struct perf_event_context *ctx; 3159 int ctxn; 3160 3161 /* 3162 * If cgroup events exist on this CPU, then we need to check if we have 3163 * to switch in PMU state; cgroup event are system-wide mode only. 3164 * 3165 * Since cgroup events are CPU events, we must schedule these in before 3166 * we schedule in the task events. 3167 */ 3168 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 3169 perf_cgroup_sched_in(prev, task); 3170 3171 for_each_task_context_nr(ctxn) { 3172 ctx = task->perf_event_ctxp[ctxn]; 3173 if (likely(!ctx)) 3174 continue; 3175 3176 perf_event_context_sched_in(ctx, task); 3177 } 3178 3179 if (atomic_read(&nr_switch_events)) 3180 perf_event_switch(task, prev, true); 3181 3182 if (__this_cpu_read(perf_sched_cb_usages)) 3183 perf_pmu_sched_task(prev, task, true); 3184 } 3185 3186 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 3187 { 3188 u64 frequency = event->attr.sample_freq; 3189 u64 sec = NSEC_PER_SEC; 3190 u64 divisor, dividend; 3191 3192 int count_fls, nsec_fls, frequency_fls, sec_fls; 3193 3194 count_fls = fls64(count); 3195 nsec_fls = fls64(nsec); 3196 frequency_fls = fls64(frequency); 3197 sec_fls = 30; 3198 3199 /* 3200 * We got @count in @nsec, with a target of sample_freq HZ 3201 * the target period becomes: 3202 * 3203 * @count * 10^9 3204 * period = ------------------- 3205 * @nsec * sample_freq 3206 * 3207 */ 3208 3209 /* 3210 * Reduce accuracy by one bit such that @a and @b converge 3211 * to a similar magnitude. 3212 */ 3213 #define REDUCE_FLS(a, b) \ 3214 do { \ 3215 if (a##_fls > b##_fls) { \ 3216 a >>= 1; \ 3217 a##_fls--; \ 3218 } else { \ 3219 b >>= 1; \ 3220 b##_fls--; \ 3221 } \ 3222 } while (0) 3223 3224 /* 3225 * Reduce accuracy until either term fits in a u64, then proceed with 3226 * the other, so that finally we can do a u64/u64 division. 
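 *
 * Worked example (made-up numbers): with sample_freq = 4000 and a tick in
 * which we counted count = 1000000 events over nsec = 10000000 ns, the
 * target period becomes
 *
 *	period = (1000000 * 10^9) / (10000000 * 4000) = 25000
 *
 * i.e. one sample every 25000 events. Both products must stay within 64
 * bits for the final u64/u64 division, which is what the REDUCE_FLS()
 * loops below arrange by shedding low-order bits from the larger terms.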
3227 */ 3228 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { 3229 REDUCE_FLS(nsec, frequency); 3230 REDUCE_FLS(sec, count); 3231 } 3232 3233 if (count_fls + sec_fls > 64) { 3234 divisor = nsec * frequency; 3235 3236 while (count_fls + sec_fls > 64) { 3237 REDUCE_FLS(count, sec); 3238 divisor >>= 1; 3239 } 3240 3241 dividend = count * sec; 3242 } else { 3243 dividend = count * sec; 3244 3245 while (nsec_fls + frequency_fls > 64) { 3246 REDUCE_FLS(nsec, frequency); 3247 dividend >>= 1; 3248 } 3249 3250 divisor = nsec * frequency; 3251 } 3252 3253 if (!divisor) 3254 return dividend; 3255 3256 return div64_u64(dividend, divisor); 3257 } 3258 3259 static DEFINE_PER_CPU(int, perf_throttled_count); 3260 static DEFINE_PER_CPU(u64, perf_throttled_seq); 3261 3262 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) 3263 { 3264 struct hw_perf_event *hwc = &event->hw; 3265 s64 period, sample_period; 3266 s64 delta; 3267 3268 period = perf_calculate_period(event, nsec, count); 3269 3270 delta = (s64)(period - hwc->sample_period); 3271 delta = (delta + 7) / 8; /* low pass filter */ 3272 3273 sample_period = hwc->sample_period + delta; 3274 3275 if (!sample_period) 3276 sample_period = 1; 3277 3278 hwc->sample_period = sample_period; 3279 3280 if (local64_read(&hwc->period_left) > 8*sample_period) { 3281 if (disable) 3282 event->pmu->stop(event, PERF_EF_UPDATE); 3283 3284 local64_set(&hwc->period_left, 0); 3285 3286 if (disable) 3287 event->pmu->start(event, PERF_EF_RELOAD); 3288 } 3289 } 3290 3291 /* 3292 * combine freq adjustment with unthrottling to avoid two passes over the 3293 * events. At the same time, make sure, having freq events does not change 3294 * the rate of unthrottling as that would introduce bias. 3295 */ 3296 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, 3297 int needs_unthr) 3298 { 3299 struct perf_event *event; 3300 struct hw_perf_event *hwc; 3301 u64 now, period = TICK_NSEC; 3302 s64 delta; 3303 3304 /* 3305 * only need to iterate over all events iff: 3306 * - context have events in frequency mode (needs freq adjust) 3307 * - there are events to unthrottle on this cpu 3308 */ 3309 if (!(ctx->nr_freq || needs_unthr)) 3310 return; 3311 3312 raw_spin_lock(&ctx->lock); 3313 perf_pmu_disable(ctx->pmu); 3314 3315 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3316 if (event->state != PERF_EVENT_STATE_ACTIVE) 3317 continue; 3318 3319 if (!event_filter_match(event)) 3320 continue; 3321 3322 perf_pmu_disable(event->pmu); 3323 3324 hwc = &event->hw; 3325 3326 if (hwc->interrupts == MAX_INTERRUPTS) { 3327 hwc->interrupts = 0; 3328 perf_log_throttle(event, 1); 3329 event->pmu->start(event, 0); 3330 } 3331 3332 if (!event->attr.freq || !event->attr.sample_freq) 3333 goto next; 3334 3335 /* 3336 * stop the event and update event->count 3337 */ 3338 event->pmu->stop(event, PERF_EF_UPDATE); 3339 3340 now = local64_read(&event->count); 3341 delta = now - hwc->freq_count_stamp; 3342 hwc->freq_count_stamp = now; 3343 3344 /* 3345 * restart the event 3346 * reload only if value has changed 3347 * we have stopped the event so tell that 3348 * to perf_adjust_period() to avoid stopping it 3349 * twice. 3350 */ 3351 if (delta > 0) 3352 perf_adjust_period(event, period, delta, false); 3353 3354 event->pmu->start(event, delta > 0 ? 
PERF_EF_RELOAD : 0); 3355 next: 3356 perf_pmu_enable(event->pmu); 3357 } 3358 3359 perf_pmu_enable(ctx->pmu); 3360 raw_spin_unlock(&ctx->lock); 3361 } 3362 3363 /* 3364 * Round-robin a context's events: 3365 */ 3366 static void rotate_ctx(struct perf_event_context *ctx) 3367 { 3368 /* 3369 * Rotate the first entry last of non-pinned groups. Rotation might be 3370 * disabled by the inheritance code. 3371 */ 3372 if (!ctx->rotate_disable) 3373 list_rotate_left(&ctx->flexible_groups); 3374 } 3375 3376 static int perf_rotate_context(struct perf_cpu_context *cpuctx) 3377 { 3378 struct perf_event_context *ctx = NULL; 3379 int rotate = 0; 3380 3381 if (cpuctx->ctx.nr_events) { 3382 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 3383 rotate = 1; 3384 } 3385 3386 ctx = cpuctx->task_ctx; 3387 if (ctx && ctx->nr_events) { 3388 if (ctx->nr_events != ctx->nr_active) 3389 rotate = 1; 3390 } 3391 3392 if (!rotate) 3393 goto done; 3394 3395 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 3396 perf_pmu_disable(cpuctx->ctx.pmu); 3397 3398 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 3399 if (ctx) 3400 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 3401 3402 rotate_ctx(&cpuctx->ctx); 3403 if (ctx) 3404 rotate_ctx(ctx); 3405 3406 perf_event_sched_in(cpuctx, ctx, current); 3407 3408 perf_pmu_enable(cpuctx->ctx.pmu); 3409 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3410 done: 3411 3412 return rotate; 3413 } 3414 3415 void perf_event_task_tick(void) 3416 { 3417 struct list_head *head = this_cpu_ptr(&active_ctx_list); 3418 struct perf_event_context *ctx, *tmp; 3419 int throttled; 3420 3421 lockdep_assert_irqs_disabled(); 3422 3423 __this_cpu_inc(perf_throttled_seq); 3424 throttled = __this_cpu_xchg(perf_throttled_count, 0); 3425 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); 3426 3427 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) 3428 perf_adjust_freq_unthr_context(ctx, throttled); 3429 } 3430 3431 static int event_enable_on_exec(struct perf_event *event, 3432 struct perf_event_context *ctx) 3433 { 3434 if (!event->attr.enable_on_exec) 3435 return 0; 3436 3437 event->attr.enable_on_exec = 0; 3438 if (event->state >= PERF_EVENT_STATE_INACTIVE) 3439 return 0; 3440 3441 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 3442 3443 return 1; 3444 } 3445 3446 /* 3447 * Enable all of a task's events that have been marked enable-on-exec. 3448 * This expects task == current. 3449 */ 3450 static void perf_event_enable_on_exec(int ctxn) 3451 { 3452 struct perf_event_context *ctx, *clone_ctx = NULL; 3453 enum event_type_t event_type = 0; 3454 struct perf_cpu_context *cpuctx; 3455 struct perf_event *event; 3456 unsigned long flags; 3457 int enabled = 0; 3458 3459 local_irq_save(flags); 3460 ctx = current->perf_event_ctxp[ctxn]; 3461 if (!ctx || !ctx->nr_events) 3462 goto out; 3463 3464 cpuctx = __get_cpu_context(ctx); 3465 perf_ctx_lock(cpuctx, ctx); 3466 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 3467 list_for_each_entry(event, &ctx->event_list, event_entry) { 3468 enabled |= event_enable_on_exec(event, ctx); 3469 event_type |= get_event_type(event); 3470 } 3471 3472 /* 3473 * Unclone and reschedule this context if we enabled any event. 
3474 */ 3475 if (enabled) { 3476 clone_ctx = unclone_ctx(ctx); 3477 ctx_resched(cpuctx, ctx, event_type); 3478 } else { 3479 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 3480 } 3481 perf_ctx_unlock(cpuctx, ctx); 3482 3483 out: 3484 local_irq_restore(flags); 3485 3486 if (clone_ctx) 3487 put_ctx(clone_ctx); 3488 } 3489 3490 struct perf_read_data { 3491 struct perf_event *event; 3492 bool group; 3493 int ret; 3494 }; 3495 3496 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) 3497 { 3498 u16 local_pkg, event_pkg; 3499 3500 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { 3501 int local_cpu = smp_processor_id(); 3502 3503 event_pkg = topology_physical_package_id(event_cpu); 3504 local_pkg = topology_physical_package_id(local_cpu); 3505 3506 if (event_pkg == local_pkg) 3507 return local_cpu; 3508 } 3509 3510 return event_cpu; 3511 } 3512 3513 /* 3514 * Cross CPU call to read the hardware event 3515 */ 3516 static void __perf_event_read(void *info) 3517 { 3518 struct perf_read_data *data = info; 3519 struct perf_event *sub, *event = data->event; 3520 struct perf_event_context *ctx = event->ctx; 3521 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 3522 struct pmu *pmu = event->pmu; 3523 3524 /* 3525 * If this is a task context, we need to check whether it is 3526 * the current task context of this cpu. If not it has been 3527 * scheduled out before the smp call arrived. In that case 3528 * event->count would have been updated to a recent sample 3529 * when the event was scheduled out. 3530 */ 3531 if (ctx->task && cpuctx->task_ctx != ctx) 3532 return; 3533 3534 raw_spin_lock(&ctx->lock); 3535 if (ctx->is_active & EVENT_TIME) { 3536 update_context_time(ctx); 3537 update_cgrp_time_from_event(event); 3538 } 3539 3540 perf_event_update_time(event); 3541 if (data->group) 3542 perf_event_update_sibling_time(event); 3543 3544 if (event->state != PERF_EVENT_STATE_ACTIVE) 3545 goto unlock; 3546 3547 if (!data->group) { 3548 pmu->read(event); 3549 data->ret = 0; 3550 goto unlock; 3551 } 3552 3553 pmu->start_txn(pmu, PERF_PMU_TXN_READ); 3554 3555 pmu->read(event); 3556 3557 list_for_each_entry(sub, &event->sibling_list, group_entry) { 3558 if (sub->state == PERF_EVENT_STATE_ACTIVE) { 3559 /* 3560 * Use sibling's PMU rather than @event's since 3561 * sibling could be on different (eg: software) PMU. 3562 */ 3563 sub->pmu->read(sub); 3564 } 3565 } 3566 3567 data->ret = pmu->commit_txn(pmu); 3568 3569 unlock: 3570 raw_spin_unlock(&ctx->lock); 3571 } 3572 3573 static inline u64 perf_event_count(struct perf_event *event) 3574 { 3575 return local64_read(&event->count) + atomic64_read(&event->child_count); 3576 } 3577 3578 /* 3579 * NMI-safe method to read a local event, that is an event that 3580 * is: 3581 * - either for the current task, or for this CPU 3582 * - does not have inherit set, for inherited task events 3583 * will not be local and we cannot read them atomically 3584 * - must not have a pmu::count method 3585 */ 3586 int perf_event_read_local(struct perf_event *event, u64 *value, 3587 u64 *enabled, u64 *running) 3588 { 3589 unsigned long flags; 3590 int ret = 0; 3591 3592 /* 3593 * Disabling interrupts avoids all counter scheduling (context 3594 * switches, timer based rotation and IPIs). 3595 */ 3596 local_irq_save(flags); 3597 3598 /* 3599 * It must not be an event with inherit set, we cannot read 3600 * all child counters from atomic context. 
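 *
 * (Typical users are in-kernel callers that need a counter value from NMI
 * or tracing context; a minimal usage sketch, assuming @event targets the
 * current task or this CPU:
 *
 *	u64 value;
 *
 *	if (!perf_event_read_local(event, &value, NULL, NULL))
 *		... use value ...
 *
 * where passing NULL for @enabled and @running skips the time updates.)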
3601 */ 3602 if (event->attr.inherit) { 3603 ret = -EOPNOTSUPP; 3604 goto out; 3605 } 3606 3607 /* If this is a per-task event, it must be for current */ 3608 if ((event->attach_state & PERF_ATTACH_TASK) && 3609 event->hw.target != current) { 3610 ret = -EINVAL; 3611 goto out; 3612 } 3613 3614 /* If this is a per-CPU event, it must be for this CPU */ 3615 if (!(event->attach_state & PERF_ATTACH_TASK) && 3616 event->cpu != smp_processor_id()) { 3617 ret = -EINVAL; 3618 goto out; 3619 } 3620 3621 /* 3622 * If the event is currently on this CPU, its either a per-task event, 3623 * or local to this CPU. Furthermore it means its ACTIVE (otherwise 3624 * oncpu == -1). 3625 */ 3626 if (event->oncpu == smp_processor_id()) 3627 event->pmu->read(event); 3628 3629 *value = local64_read(&event->count); 3630 if (enabled || running) { 3631 u64 now = event->shadow_ctx_time + perf_clock(); 3632 u64 __enabled, __running; 3633 3634 __perf_update_times(event, now, &__enabled, &__running); 3635 if (enabled) 3636 *enabled = __enabled; 3637 if (running) 3638 *running = __running; 3639 } 3640 out: 3641 local_irq_restore(flags); 3642 3643 return ret; 3644 } 3645 3646 static int perf_event_read(struct perf_event *event, bool group) 3647 { 3648 enum perf_event_state state = READ_ONCE(event->state); 3649 int event_cpu, ret = 0; 3650 3651 /* 3652 * If event is enabled and currently active on a CPU, update the 3653 * value in the event structure: 3654 */ 3655 again: 3656 if (state == PERF_EVENT_STATE_ACTIVE) { 3657 struct perf_read_data data; 3658 3659 /* 3660 * Orders the ->state and ->oncpu loads such that if we see 3661 * ACTIVE we must also see the right ->oncpu. 3662 * 3663 * Matches the smp_wmb() from event_sched_in(). 3664 */ 3665 smp_rmb(); 3666 3667 event_cpu = READ_ONCE(event->oncpu); 3668 if ((unsigned)event_cpu >= nr_cpu_ids) 3669 return 0; 3670 3671 data = (struct perf_read_data){ 3672 .event = event, 3673 .group = group, 3674 .ret = 0, 3675 }; 3676 3677 preempt_disable(); 3678 event_cpu = __perf_event_read_cpu(event, event_cpu); 3679 3680 /* 3681 * Purposely ignore the smp_call_function_single() return 3682 * value. 3683 * 3684 * If event_cpu isn't a valid CPU it means the event got 3685 * scheduled out and that will have updated the event count. 3686 * 3687 * Therefore, either way, we'll have an up-to-date event count 3688 * after this. 
3689 */ 3690 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); 3691 preempt_enable(); 3692 ret = data.ret; 3693 3694 } else if (state == PERF_EVENT_STATE_INACTIVE) { 3695 struct perf_event_context *ctx = event->ctx; 3696 unsigned long flags; 3697 3698 raw_spin_lock_irqsave(&ctx->lock, flags); 3699 state = event->state; 3700 if (state != PERF_EVENT_STATE_INACTIVE) { 3701 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3702 goto again; 3703 } 3704 3705 /* 3706 * May read while context is not active (e.g., thread is 3707 * blocked), in that case we cannot update context time 3708 */ 3709 if (ctx->is_active & EVENT_TIME) { 3710 update_context_time(ctx); 3711 update_cgrp_time_from_event(event); 3712 } 3713 3714 perf_event_update_time(event); 3715 if (group) 3716 perf_event_update_sibling_time(event); 3717 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3718 } 3719 3720 return ret; 3721 } 3722 3723 /* 3724 * Initialize the perf_event context in a task_struct: 3725 */ 3726 static void __perf_event_init_context(struct perf_event_context *ctx) 3727 { 3728 raw_spin_lock_init(&ctx->lock); 3729 mutex_init(&ctx->mutex); 3730 INIT_LIST_HEAD(&ctx->active_ctx_list); 3731 INIT_LIST_HEAD(&ctx->pinned_groups); 3732 INIT_LIST_HEAD(&ctx->flexible_groups); 3733 INIT_LIST_HEAD(&ctx->event_list); 3734 atomic_set(&ctx->refcount, 1); 3735 } 3736 3737 static struct perf_event_context * 3738 alloc_perf_context(struct pmu *pmu, struct task_struct *task) 3739 { 3740 struct perf_event_context *ctx; 3741 3742 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 3743 if (!ctx) 3744 return NULL; 3745 3746 __perf_event_init_context(ctx); 3747 if (task) { 3748 ctx->task = task; 3749 get_task_struct(task); 3750 } 3751 ctx->pmu = pmu; 3752 3753 return ctx; 3754 } 3755 3756 static struct task_struct * 3757 find_lively_task_by_vpid(pid_t vpid) 3758 { 3759 struct task_struct *task; 3760 3761 rcu_read_lock(); 3762 if (!vpid) 3763 task = current; 3764 else 3765 task = find_task_by_vpid(vpid); 3766 if (task) 3767 get_task_struct(task); 3768 rcu_read_unlock(); 3769 3770 if (!task) 3771 return ERR_PTR(-ESRCH); 3772 3773 return task; 3774 } 3775 3776 /* 3777 * Returns a matching context with refcount and pincount. 
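 *
 * Callers are expected to undo both once done; roughly (a sketch of the
 * usual pattern, error handling elided):
 *
 *	ctx = find_get_context(pmu, task, event);
 *	if (IS_ERR(ctx))
 *		return PTR_ERR(ctx);
 *
 *	... install the event, which keeps the reference via event->ctx ...
 *
 *	perf_unpin_context(ctx);
 *
 * with an additional put_ctx() on failure paths that never hand the
 * reference over to an event.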
3778 */ 3779 static struct perf_event_context * 3780 find_get_context(struct pmu *pmu, struct task_struct *task, 3781 struct perf_event *event) 3782 { 3783 struct perf_event_context *ctx, *clone_ctx = NULL; 3784 struct perf_cpu_context *cpuctx; 3785 void *task_ctx_data = NULL; 3786 unsigned long flags; 3787 int ctxn, err; 3788 int cpu = event->cpu; 3789 3790 if (!task) { 3791 /* Must be root to operate on a CPU event: */ 3792 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 3793 return ERR_PTR(-EACCES); 3794 3795 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 3796 ctx = &cpuctx->ctx; 3797 get_ctx(ctx); 3798 ++ctx->pin_count; 3799 3800 return ctx; 3801 } 3802 3803 err = -EINVAL; 3804 ctxn = pmu->task_ctx_nr; 3805 if (ctxn < 0) 3806 goto errout; 3807 3808 if (event->attach_state & PERF_ATTACH_TASK_DATA) { 3809 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); 3810 if (!task_ctx_data) { 3811 err = -ENOMEM; 3812 goto errout; 3813 } 3814 } 3815 3816 retry: 3817 ctx = perf_lock_task_context(task, ctxn, &flags); 3818 if (ctx) { 3819 clone_ctx = unclone_ctx(ctx); 3820 ++ctx->pin_count; 3821 3822 if (task_ctx_data && !ctx->task_ctx_data) { 3823 ctx->task_ctx_data = task_ctx_data; 3824 task_ctx_data = NULL; 3825 } 3826 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3827 3828 if (clone_ctx) 3829 put_ctx(clone_ctx); 3830 } else { 3831 ctx = alloc_perf_context(pmu, task); 3832 err = -ENOMEM; 3833 if (!ctx) 3834 goto errout; 3835 3836 if (task_ctx_data) { 3837 ctx->task_ctx_data = task_ctx_data; 3838 task_ctx_data = NULL; 3839 } 3840 3841 err = 0; 3842 mutex_lock(&task->perf_event_mutex); 3843 /* 3844 * If it has already passed perf_event_exit_task(). 3845 * we must see PF_EXITING, it takes this mutex too. 3846 */ 3847 if (task->flags & PF_EXITING) 3848 err = -ESRCH; 3849 else if (task->perf_event_ctxp[ctxn]) 3850 err = -EAGAIN; 3851 else { 3852 get_ctx(ctx); 3853 ++ctx->pin_count; 3854 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 3855 } 3856 mutex_unlock(&task->perf_event_mutex); 3857 3858 if (unlikely(err)) { 3859 put_ctx(ctx); 3860 3861 if (err == -EAGAIN) 3862 goto retry; 3863 goto errout; 3864 } 3865 } 3866 3867 kfree(task_ctx_data); 3868 return ctx; 3869 3870 errout: 3871 kfree(task_ctx_data); 3872 return ERR_PTR(err); 3873 } 3874 3875 static void perf_event_free_filter(struct perf_event *event); 3876 static void perf_event_free_bpf_prog(struct perf_event *event); 3877 3878 static void free_event_rcu(struct rcu_head *head) 3879 { 3880 struct perf_event *event; 3881 3882 event = container_of(head, struct perf_event, rcu_head); 3883 if (event->ns) 3884 put_pid_ns(event->ns); 3885 perf_event_free_filter(event); 3886 kfree(event); 3887 } 3888 3889 static void ring_buffer_attach(struct perf_event *event, 3890 struct ring_buffer *rb); 3891 3892 static void detach_sb_event(struct perf_event *event) 3893 { 3894 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); 3895 3896 raw_spin_lock(&pel->lock); 3897 list_del_rcu(&event->sb_list); 3898 raw_spin_unlock(&pel->lock); 3899 } 3900 3901 static bool is_sb_event(struct perf_event *event) 3902 { 3903 struct perf_event_attr *attr = &event->attr; 3904 3905 if (event->parent) 3906 return false; 3907 3908 if (event->attach_state & PERF_ATTACH_TASK) 3909 return false; 3910 3911 if (attr->mmap || attr->mmap_data || attr->mmap2 || 3912 attr->comm || attr->comm_exec || 3913 attr->task || 3914 attr->context_switch) 3915 return true; 3916 return false; 3917 } 3918 3919 static void unaccount_pmu_sb_event(struct perf_event *event) 
3920 { 3921 if (is_sb_event(event)) 3922 detach_sb_event(event); 3923 } 3924 3925 static void unaccount_event_cpu(struct perf_event *event, int cpu) 3926 { 3927 if (event->parent) 3928 return; 3929 3930 if (is_cgroup_event(event)) 3931 atomic_dec(&per_cpu(perf_cgroup_events, cpu)); 3932 } 3933 3934 #ifdef CONFIG_NO_HZ_FULL 3935 static DEFINE_SPINLOCK(nr_freq_lock); 3936 #endif 3937 3938 static void unaccount_freq_event_nohz(void) 3939 { 3940 #ifdef CONFIG_NO_HZ_FULL 3941 spin_lock(&nr_freq_lock); 3942 if (atomic_dec_and_test(&nr_freq_events)) 3943 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); 3944 spin_unlock(&nr_freq_lock); 3945 #endif 3946 } 3947 3948 static void unaccount_freq_event(void) 3949 { 3950 if (tick_nohz_full_enabled()) 3951 unaccount_freq_event_nohz(); 3952 else 3953 atomic_dec(&nr_freq_events); 3954 } 3955 3956 static void unaccount_event(struct perf_event *event) 3957 { 3958 bool dec = false; 3959 3960 if (event->parent) 3961 return; 3962 3963 if (event->attach_state & PERF_ATTACH_TASK) 3964 dec = true; 3965 if (event->attr.mmap || event->attr.mmap_data) 3966 atomic_dec(&nr_mmap_events); 3967 if (event->attr.comm) 3968 atomic_dec(&nr_comm_events); 3969 if (event->attr.namespaces) 3970 atomic_dec(&nr_namespaces_events); 3971 if (event->attr.task) 3972 atomic_dec(&nr_task_events); 3973 if (event->attr.freq) 3974 unaccount_freq_event(); 3975 if (event->attr.context_switch) { 3976 dec = true; 3977 atomic_dec(&nr_switch_events); 3978 } 3979 if (is_cgroup_event(event)) 3980 dec = true; 3981 if (has_branch_stack(event)) 3982 dec = true; 3983 3984 if (dec) { 3985 if (!atomic_add_unless(&perf_sched_count, -1, 1)) 3986 schedule_delayed_work(&perf_sched_work, HZ); 3987 } 3988 3989 unaccount_event_cpu(event, event->cpu); 3990 3991 unaccount_pmu_sb_event(event); 3992 } 3993 3994 static void perf_sched_delayed(struct work_struct *work) 3995 { 3996 mutex_lock(&perf_sched_mutex); 3997 if (atomic_dec_and_test(&perf_sched_count)) 3998 static_branch_disable(&perf_sched_events); 3999 mutex_unlock(&perf_sched_mutex); 4000 } 4001 4002 /* 4003 * The following implement mutual exclusion of events on "exclusive" pmus 4004 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled 4005 * at a time, so we disallow creating events that might conflict, namely: 4006 * 4007 * 1) cpu-wide events in the presence of per-task events, 4008 * 2) per-task events in the presence of cpu-wide events, 4009 * 3) two matching events on the same context. 4010 * 4011 * The former two cases are handled in the allocation path (perf_event_alloc(), 4012 * _free_event()), the latter -- before the first perf_install_in_context(). 4013 */ 4014 static int exclusive_event_init(struct perf_event *event) 4015 { 4016 struct pmu *pmu = event->pmu; 4017 4018 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) 4019 return 0; 4020 4021 /* 4022 * Prevent co-existence of per-task and cpu-wide events on the 4023 * same exclusive pmu. 4024 * 4025 * Negative pmu::exclusive_cnt means there are cpu-wide 4026 * events on this "exclusive" pmu, positive means there are 4027 * per-task events. 4028 * 4029 * Since this is called in perf_event_alloc() path, event::ctx 4030 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK 4031 * to mean "per-task event", because unlike other attach states it 4032 * never gets cleared. 
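 *
 * For illustration, on one exclusive pmu:
 *
 *	per-task event:  exclusive_cnt  0 -> +1		(allowed)
 *	per-task event:  exclusive_cnt +1 -> +2		(allowed)
 *	cpu-wide event:  atomic_dec_unless_positive() fails, so -EBUSY
 *
 * and symmetrically, once a cpu-wide event has driven the count negative,
 * per-task events are refused until exclusive_event_destroy() brings it
 * back up to zero.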
4033 */ 4034 if (event->attach_state & PERF_ATTACH_TASK) { 4035 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) 4036 return -EBUSY; 4037 } else { 4038 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) 4039 return -EBUSY; 4040 } 4041 4042 return 0; 4043 } 4044 4045 static void exclusive_event_destroy(struct perf_event *event) 4046 { 4047 struct pmu *pmu = event->pmu; 4048 4049 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) 4050 return; 4051 4052 /* see comment in exclusive_event_init() */ 4053 if (event->attach_state & PERF_ATTACH_TASK) 4054 atomic_dec(&pmu->exclusive_cnt); 4055 else 4056 atomic_inc(&pmu->exclusive_cnt); 4057 } 4058 4059 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) 4060 { 4061 if ((e1->pmu == e2->pmu) && 4062 (e1->cpu == e2->cpu || 4063 e1->cpu == -1 || 4064 e2->cpu == -1)) 4065 return true; 4066 return false; 4067 } 4068 4069 /* Called under the same ctx::mutex as perf_install_in_context() */ 4070 static bool exclusive_event_installable(struct perf_event *event, 4071 struct perf_event_context *ctx) 4072 { 4073 struct perf_event *iter_event; 4074 struct pmu *pmu = event->pmu; 4075 4076 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) 4077 return true; 4078 4079 list_for_each_entry(iter_event, &ctx->event_list, event_entry) { 4080 if (exclusive_event_match(iter_event, event)) 4081 return false; 4082 } 4083 4084 return true; 4085 } 4086 4087 static void perf_addr_filters_splice(struct perf_event *event, 4088 struct list_head *head); 4089 4090 static void _free_event(struct perf_event *event) 4091 { 4092 irq_work_sync(&event->pending); 4093 4094 unaccount_event(event); 4095 4096 if (event->rb) { 4097 /* 4098 * Can happen when we close an event with re-directed output. 4099 * 4100 * Since we have a 0 refcount, perf_mmap_close() will skip 4101 * over us; possibly making our ring_buffer_put() the last. 4102 */ 4103 mutex_lock(&event->mmap_mutex); 4104 ring_buffer_attach(event, NULL); 4105 mutex_unlock(&event->mmap_mutex); 4106 } 4107 4108 if (is_cgroup_event(event)) 4109 perf_detach_cgroup(event); 4110 4111 if (!event->parent) { 4112 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 4113 put_callchain_buffers(); 4114 } 4115 4116 perf_event_free_bpf_prog(event); 4117 perf_addr_filters_splice(event, NULL); 4118 kfree(event->addr_filters_offs); 4119 4120 if (event->destroy) 4121 event->destroy(event); 4122 4123 if (event->ctx) 4124 put_ctx(event->ctx); 4125 4126 exclusive_event_destroy(event); 4127 module_put(event->pmu->module); 4128 4129 call_rcu(&event->rcu_head, free_event_rcu); 4130 } 4131 4132 /* 4133 * Used to free events which have a known refcount of 1, such as in error paths 4134 * where the event isn't exposed yet and inherited events. 4135 */ 4136 static void free_event(struct perf_event *event) 4137 { 4138 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, 4139 "unexpected event refcount: %ld; ptr=%p\n", 4140 atomic_long_read(&event->refcount), event)) { 4141 /* leak to avoid use-after-free */ 4142 return; 4143 } 4144 4145 _free_event(event); 4146 } 4147 4148 /* 4149 * Remove user event from the owner task. 4150 */ 4151 static void perf_remove_from_owner(struct perf_event *event) 4152 { 4153 struct task_struct *owner; 4154 4155 rcu_read_lock(); 4156 /* 4157 * Matches the smp_store_release() in perf_event_exit_task(). If we 4158 * observe !owner it means the list deletion is complete and we can 4159 * indeed free this event, otherwise we need to serialize on 4160 * owner->perf_event_mutex. 
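 *
 * Loosely, the pairing is: the exit path does
 * list_del_init(&event->owner_entry) and then
 * smp_store_release(&event->owner, NULL), while we do
 * owner = READ_ONCE(event->owner) here; observing owner == NULL thus
 * implies the list removal is already visible to us (a sketch of the
 * intent, see perf_event_exit_task() for the real thing).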
4161 */ 4162 owner = READ_ONCE(event->owner); 4163 if (owner) { 4164 /* 4165 * Since delayed_put_task_struct() also drops the last 4166 * task reference we can safely take a new reference 4167 * while holding the rcu_read_lock(). 4168 */ 4169 get_task_struct(owner); 4170 } 4171 rcu_read_unlock(); 4172 4173 if (owner) { 4174 /* 4175 * If we're here through perf_event_exit_task() we're already 4176 * holding ctx->mutex which would be an inversion wrt. the 4177 * normal lock order. 4178 * 4179 * However we can safely take this lock because its the child 4180 * ctx->mutex. 4181 */ 4182 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); 4183 4184 /* 4185 * We have to re-check the event->owner field, if it is cleared 4186 * we raced with perf_event_exit_task(), acquiring the mutex 4187 * ensured they're done, and we can proceed with freeing the 4188 * event. 4189 */ 4190 if (event->owner) { 4191 list_del_init(&event->owner_entry); 4192 smp_store_release(&event->owner, NULL); 4193 } 4194 mutex_unlock(&owner->perf_event_mutex); 4195 put_task_struct(owner); 4196 } 4197 } 4198 4199 static void put_event(struct perf_event *event) 4200 { 4201 if (!atomic_long_dec_and_test(&event->refcount)) 4202 return; 4203 4204 _free_event(event); 4205 } 4206 4207 /* 4208 * Kill an event dead; while event:refcount will preserve the event 4209 * object, it will not preserve its functionality. Once the last 'user' 4210 * gives up the object, we'll destroy the thing. 4211 */ 4212 int perf_event_release_kernel(struct perf_event *event) 4213 { 4214 struct perf_event_context *ctx = event->ctx; 4215 struct perf_event *child, *tmp; 4216 LIST_HEAD(free_list); 4217 4218 /* 4219 * If we got here through err_file: fput(event_file); we will not have 4220 * attached to a context yet. 4221 */ 4222 if (!ctx) { 4223 WARN_ON_ONCE(event->attach_state & 4224 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); 4225 goto no_ctx; 4226 } 4227 4228 if (!is_kernel_event(event)) 4229 perf_remove_from_owner(event); 4230 4231 ctx = perf_event_ctx_lock(event); 4232 WARN_ON_ONCE(ctx->parent_ctx); 4233 perf_remove_from_context(event, DETACH_GROUP); 4234 4235 raw_spin_lock_irq(&ctx->lock); 4236 /* 4237 * Mark this event as STATE_DEAD, there is no external reference to it 4238 * anymore. 4239 * 4240 * Anybody acquiring event->child_mutex after the below loop _must_ 4241 * also see this, most importantly inherit_event() which will avoid 4242 * placing more children on the list. 4243 * 4244 * Thus this guarantees that we will in fact observe and kill _ALL_ 4245 * child events. 4246 */ 4247 event->state = PERF_EVENT_STATE_DEAD; 4248 raw_spin_unlock_irq(&ctx->lock); 4249 4250 perf_event_ctx_unlock(event, ctx); 4251 4252 again: 4253 mutex_lock(&event->child_mutex); 4254 list_for_each_entry(child, &event->child_list, child_list) { 4255 4256 /* 4257 * Cannot change, child events are not migrated, see the 4258 * comment with perf_event_ctx_lock_nested(). 4259 */ 4260 ctx = READ_ONCE(child->ctx); 4261 /* 4262 * Since child_mutex nests inside ctx::mutex, we must jump 4263 * through hoops. We start by grabbing a reference on the ctx. 4264 * 4265 * Since the event cannot get freed while we hold the 4266 * child_mutex, the context must also exist and have a !0 4267 * reference count. 4268 */ 4269 get_ctx(ctx); 4270 4271 /* 4272 * Now that we have a ctx ref, we can drop child_mutex, and 4273 * acquire ctx::mutex without fear of it going away. Then we 4274 * can re-acquire child_mutex. 
4275 */ 4276 mutex_unlock(&event->child_mutex); 4277 mutex_lock(&ctx->mutex); 4278 mutex_lock(&event->child_mutex); 4279 4280 /* 4281 * Now that we hold ctx::mutex and child_mutex, revalidate our 4282 * state, if child is still the first entry, it didn't get freed 4283 * and we can continue doing so. 4284 */ 4285 tmp = list_first_entry_or_null(&event->child_list, 4286 struct perf_event, child_list); 4287 if (tmp == child) { 4288 perf_remove_from_context(child, DETACH_GROUP); 4289 list_move(&child->child_list, &free_list); 4290 /* 4291 * This matches the refcount bump in inherit_event(); 4292 * this can't be the last reference. 4293 */ 4294 put_event(event); 4295 } 4296 4297 mutex_unlock(&event->child_mutex); 4298 mutex_unlock(&ctx->mutex); 4299 put_ctx(ctx); 4300 goto again; 4301 } 4302 mutex_unlock(&event->child_mutex); 4303 4304 list_for_each_entry_safe(child, tmp, &free_list, child_list) { 4305 list_del(&child->child_list); 4306 free_event(child); 4307 } 4308 4309 no_ctx: 4310 put_event(event); /* Must be the 'last' reference */ 4311 return 0; 4312 } 4313 EXPORT_SYMBOL_GPL(perf_event_release_kernel); 4314 4315 /* 4316 * Called when the last reference to the file is gone. 4317 */ 4318 static int perf_release(struct inode *inode, struct file *file) 4319 { 4320 perf_event_release_kernel(file->private_data); 4321 return 0; 4322 } 4323 4324 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 4325 { 4326 struct perf_event *child; 4327 u64 total = 0; 4328 4329 *enabled = 0; 4330 *running = 0; 4331 4332 mutex_lock(&event->child_mutex); 4333 4334 (void)perf_event_read(event, false); 4335 total += perf_event_count(event); 4336 4337 *enabled += event->total_time_enabled + 4338 atomic64_read(&event->child_total_time_enabled); 4339 *running += event->total_time_running + 4340 atomic64_read(&event->child_total_time_running); 4341 4342 list_for_each_entry(child, &event->child_list, child_list) { 4343 (void)perf_event_read(child, false); 4344 total += perf_event_count(child); 4345 *enabled += child->total_time_enabled; 4346 *running += child->total_time_running; 4347 } 4348 mutex_unlock(&event->child_mutex); 4349 4350 return total; 4351 } 4352 4353 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 4354 { 4355 struct perf_event_context *ctx; 4356 u64 count; 4357 4358 ctx = perf_event_ctx_lock(event); 4359 count = __perf_event_read_value(event, enabled, running); 4360 perf_event_ctx_unlock(event, ctx); 4361 4362 return count; 4363 } 4364 EXPORT_SYMBOL_GPL(perf_event_read_value); 4365 4366 static int __perf_read_group_add(struct perf_event *leader, 4367 u64 read_format, u64 *values) 4368 { 4369 struct perf_event_context *ctx = leader->ctx; 4370 struct perf_event *sub; 4371 unsigned long flags; 4372 int n = 1; /* skip @nr */ 4373 int ret; 4374 4375 ret = perf_event_read(leader, true); 4376 if (ret) 4377 return ret; 4378 4379 raw_spin_lock_irqsave(&ctx->lock, flags); 4380 4381 /* 4382 * Since we co-schedule groups, {enabled,running} times of siblings 4383 * will be identical to those of the leader, so we only publish one 4384 * set. 4385 */ 4386 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 4387 values[n++] += leader->total_time_enabled + 4388 atomic64_read(&leader->child_total_time_enabled); 4389 } 4390 4391 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 4392 values[n++] += leader->total_time_running + 4393 atomic64_read(&leader->child_total_time_running); 4394 } 4395 4396 /* 4397 * Write {count,id} tuples for every sibling. 
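 *
 * For reference, the buffer assembled here follows the PERF_FORMAT_GROUP
 * read format, roughly:
 *
 *	{ u64 nr;
 *	  { u64 time_enabled; }	&& PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64 time_running; }	&& PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64 value;
 *	    { u64 id; }		&& PERF_FORMAT_ID
 *	  } cntr[nr];
 *	}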
4398 */ 4399 values[n++] += perf_event_count(leader); 4400 if (read_format & PERF_FORMAT_ID) 4401 values[n++] = primary_event_id(leader); 4402 4403 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4404 values[n++] += perf_event_count(sub); 4405 if (read_format & PERF_FORMAT_ID) 4406 values[n++] = primary_event_id(sub); 4407 } 4408 4409 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4410 return 0; 4411 } 4412 4413 static int perf_read_group(struct perf_event *event, 4414 u64 read_format, char __user *buf) 4415 { 4416 struct perf_event *leader = event->group_leader, *child; 4417 struct perf_event_context *ctx = leader->ctx; 4418 int ret; 4419 u64 *values; 4420 4421 lockdep_assert_held(&ctx->mutex); 4422 4423 values = kzalloc(event->read_size, GFP_KERNEL); 4424 if (!values) 4425 return -ENOMEM; 4426 4427 values[0] = 1 + leader->nr_siblings; 4428 4429 /* 4430 * By locking the child_mutex of the leader we effectively 4431 * lock the child list of all siblings.. XXX explain how. 4432 */ 4433 mutex_lock(&leader->child_mutex); 4434 4435 ret = __perf_read_group_add(leader, read_format, values); 4436 if (ret) 4437 goto unlock; 4438 4439 list_for_each_entry(child, &leader->child_list, child_list) { 4440 ret = __perf_read_group_add(child, read_format, values); 4441 if (ret) 4442 goto unlock; 4443 } 4444 4445 mutex_unlock(&leader->child_mutex); 4446 4447 ret = event->read_size; 4448 if (copy_to_user(buf, values, event->read_size)) 4449 ret = -EFAULT; 4450 goto out; 4451 4452 unlock: 4453 mutex_unlock(&leader->child_mutex); 4454 out: 4455 kfree(values); 4456 return ret; 4457 } 4458 4459 static int perf_read_one(struct perf_event *event, 4460 u64 read_format, char __user *buf) 4461 { 4462 u64 enabled, running; 4463 u64 values[4]; 4464 int n = 0; 4465 4466 values[n++] = __perf_event_read_value(event, &enabled, &running); 4467 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 4468 values[n++] = enabled; 4469 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 4470 values[n++] = running; 4471 if (read_format & PERF_FORMAT_ID) 4472 values[n++] = primary_event_id(event); 4473 4474 if (copy_to_user(buf, values, n * sizeof(u64))) 4475 return -EFAULT; 4476 4477 return n * sizeof(u64); 4478 } 4479 4480 static bool is_event_hup(struct perf_event *event) 4481 { 4482 bool no_children; 4483 4484 if (event->state > PERF_EVENT_STATE_EXIT) 4485 return false; 4486 4487 mutex_lock(&event->child_mutex); 4488 no_children = list_empty(&event->child_list); 4489 mutex_unlock(&event->child_mutex); 4490 return no_children; 4491 } 4492 4493 /* 4494 * Read the performance event - simple non blocking version for now 4495 */ 4496 static ssize_t 4497 __perf_read(struct perf_event *event, char __user *buf, size_t count) 4498 { 4499 u64 read_format = event->attr.read_format; 4500 int ret; 4501 4502 /* 4503 * Return end-of-file for a read on a event that is in 4504 * error state (i.e. because it was pinned but it couldn't be 4505 * scheduled on to the CPU at some point). 
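 *
 * As a purely illustrative sketch (fd being a hypothetical perf event
 * file descriptor with read_format == 0), a reader would do:
 *
 *	u64 value;
 *	ssize_t n = read(fd, &value, sizeof(value));
 *
 * and see n == 0 for an event in error state, or -1 with errno set to
 * ENOSPC when the buffer is smaller than event->read_size.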
4506 */ 4507 if (event->state == PERF_EVENT_STATE_ERROR) 4508 return 0; 4509 4510 if (count < event->read_size) 4511 return -ENOSPC; 4512 4513 WARN_ON_ONCE(event->ctx->parent_ctx); 4514 if (read_format & PERF_FORMAT_GROUP) 4515 ret = perf_read_group(event, read_format, buf); 4516 else 4517 ret = perf_read_one(event, read_format, buf); 4518 4519 return ret; 4520 } 4521 4522 static ssize_t 4523 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 4524 { 4525 struct perf_event *event = file->private_data; 4526 struct perf_event_context *ctx; 4527 int ret; 4528 4529 ctx = perf_event_ctx_lock(event); 4530 ret = __perf_read(event, buf, count); 4531 perf_event_ctx_unlock(event, ctx); 4532 4533 return ret; 4534 } 4535 4536 static __poll_t perf_poll(struct file *file, poll_table *wait) 4537 { 4538 struct perf_event *event = file->private_data; 4539 struct ring_buffer *rb; 4540 __poll_t events = EPOLLHUP; 4541 4542 poll_wait(file, &event->waitq, wait); 4543 4544 if (is_event_hup(event)) 4545 return events; 4546 4547 /* 4548 * Pin the event->rb by taking event->mmap_mutex; otherwise 4549 * perf_event_set_output() can swizzle our rb and make us miss wakeups. 4550 */ 4551 mutex_lock(&event->mmap_mutex); 4552 rb = event->rb; 4553 if (rb) 4554 events = atomic_xchg(&rb->poll, 0); 4555 mutex_unlock(&event->mmap_mutex); 4556 return events; 4557 } 4558 4559 static void _perf_event_reset(struct perf_event *event) 4560 { 4561 (void)perf_event_read(event, false); 4562 local64_set(&event->count, 0); 4563 perf_event_update_userpage(event); 4564 } 4565 4566 /* 4567 * Holding the top-level event's child_mutex means that any 4568 * descendant process that has inherited this event will block 4569 * in perf_event_exit_event() if it goes to exit, thus satisfying the 4570 * task existence requirements of perf_event_enable/disable. 4571 */ 4572 static void perf_event_for_each_child(struct perf_event *event, 4573 void (*func)(struct perf_event *)) 4574 { 4575 struct perf_event *child; 4576 4577 WARN_ON_ONCE(event->ctx->parent_ctx); 4578 4579 mutex_lock(&event->child_mutex); 4580 func(event); 4581 list_for_each_entry(child, &event->child_list, child_list) 4582 func(child); 4583 mutex_unlock(&event->child_mutex); 4584 } 4585 4586 static void perf_event_for_each(struct perf_event *event, 4587 void (*func)(struct perf_event *)) 4588 { 4589 struct perf_event_context *ctx = event->ctx; 4590 struct perf_event *sibling; 4591 4592 lockdep_assert_held(&ctx->mutex); 4593 4594 event = event->group_leader; 4595 4596 perf_event_for_each_child(event, func); 4597 list_for_each_entry(sibling, &event->sibling_list, group_entry) 4598 perf_event_for_each_child(sibling, func); 4599 } 4600 4601 static void __perf_event_period(struct perf_event *event, 4602 struct perf_cpu_context *cpuctx, 4603 struct perf_event_context *ctx, 4604 void *info) 4605 { 4606 u64 value = *((u64 *)info); 4607 bool active; 4608 4609 if (event->attr.freq) { 4610 event->attr.sample_freq = value; 4611 } else { 4612 event->attr.sample_period = value; 4613 event->hw.sample_period = value; 4614 } 4615 4616 active = (event->state == PERF_EVENT_STATE_ACTIVE); 4617 if (active) { 4618 perf_pmu_disable(ctx->pmu); 4619 /* 4620 * We could be throttled; unthrottle now to avoid the tick 4621 * trying to unthrottle while we already re-started the event. 
4622 */ 4623 if (event->hw.interrupts == MAX_INTERRUPTS) { 4624 event->hw.interrupts = 0; 4625 perf_log_throttle(event, 1); 4626 } 4627 event->pmu->stop(event, PERF_EF_UPDATE); 4628 } 4629 4630 local64_set(&event->hw.period_left, 0); 4631 4632 if (active) { 4633 event->pmu->start(event, PERF_EF_RELOAD); 4634 perf_pmu_enable(ctx->pmu); 4635 } 4636 } 4637 4638 static int perf_event_period(struct perf_event *event, u64 __user *arg) 4639 { 4640 u64 value; 4641 4642 if (!is_sampling_event(event)) 4643 return -EINVAL; 4644 4645 if (copy_from_user(&value, arg, sizeof(value))) 4646 return -EFAULT; 4647 4648 if (!value) 4649 return -EINVAL; 4650 4651 if (event->attr.freq && value > sysctl_perf_event_sample_rate) 4652 return -EINVAL; 4653 4654 event_function_call(event, __perf_event_period, &value); 4655 4656 return 0; 4657 } 4658 4659 static const struct file_operations perf_fops; 4660 4661 static inline int perf_fget_light(int fd, struct fd *p) 4662 { 4663 struct fd f = fdget(fd); 4664 if (!f.file) 4665 return -EBADF; 4666 4667 if (f.file->f_op != &perf_fops) { 4668 fdput(f); 4669 return -EBADF; 4670 } 4671 *p = f; 4672 return 0; 4673 } 4674 4675 static int perf_event_set_output(struct perf_event *event, 4676 struct perf_event *output_event); 4677 static int perf_event_set_filter(struct perf_event *event, void __user *arg); 4678 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); 4679 4680 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) 4681 { 4682 void (*func)(struct perf_event *); 4683 u32 flags = arg; 4684 4685 switch (cmd) { 4686 case PERF_EVENT_IOC_ENABLE: 4687 func = _perf_event_enable; 4688 break; 4689 case PERF_EVENT_IOC_DISABLE: 4690 func = _perf_event_disable; 4691 break; 4692 case PERF_EVENT_IOC_RESET: 4693 func = _perf_event_reset; 4694 break; 4695 4696 case PERF_EVENT_IOC_REFRESH: 4697 return _perf_event_refresh(event, arg); 4698 4699 case PERF_EVENT_IOC_PERIOD: 4700 return perf_event_period(event, (u64 __user *)arg); 4701 4702 case PERF_EVENT_IOC_ID: 4703 { 4704 u64 id = primary_event_id(event); 4705 4706 if (copy_to_user((void __user *)arg, &id, sizeof(id))) 4707 return -EFAULT; 4708 return 0; 4709 } 4710 4711 case PERF_EVENT_IOC_SET_OUTPUT: 4712 { 4713 int ret; 4714 if (arg != -1) { 4715 struct perf_event *output_event; 4716 struct fd output; 4717 ret = perf_fget_light(arg, &output); 4718 if (ret) 4719 return ret; 4720 output_event = output.file->private_data; 4721 ret = perf_event_set_output(event, output_event); 4722 fdput(output); 4723 } else { 4724 ret = perf_event_set_output(event, NULL); 4725 } 4726 return ret; 4727 } 4728 4729 case PERF_EVENT_IOC_SET_FILTER: 4730 return perf_event_set_filter(event, (void __user *)arg); 4731 4732 case PERF_EVENT_IOC_SET_BPF: 4733 return perf_event_set_bpf_prog(event, arg); 4734 4735 case PERF_EVENT_IOC_PAUSE_OUTPUT: { 4736 struct ring_buffer *rb; 4737 4738 rcu_read_lock(); 4739 rb = rcu_dereference(event->rb); 4740 if (!rb || !rb->nr_pages) { 4741 rcu_read_unlock(); 4742 return -EINVAL; 4743 } 4744 rb_toggle_paused(rb, !!arg); 4745 rcu_read_unlock(); 4746 return 0; 4747 } 4748 4749 case PERF_EVENT_IOC_QUERY_BPF: 4750 return perf_event_query_prog_array(event, (void __user *)arg); 4751 default: 4752 return -ENOTTY; 4753 } 4754 4755 if (flags & PERF_IOC_FLAG_GROUP) 4756 perf_event_for_each(event, func); 4757 else 4758 perf_event_for_each_child(event, func); 4759 4760 return 0; 4761 } 4762 4763 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 4764 { 4765 
struct perf_event *event = file->private_data; 4766 struct perf_event_context *ctx; 4767 long ret; 4768 4769 ctx = perf_event_ctx_lock(event); 4770 ret = _perf_ioctl(event, cmd, arg); 4771 perf_event_ctx_unlock(event, ctx); 4772 4773 return ret; 4774 } 4775 4776 #ifdef CONFIG_COMPAT 4777 static long perf_compat_ioctl(struct file *file, unsigned int cmd, 4778 unsigned long arg) 4779 { 4780 switch (_IOC_NR(cmd)) { 4781 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): 4782 case _IOC_NR(PERF_EVENT_IOC_ID): 4783 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ 4784 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { 4785 cmd &= ~IOCSIZE_MASK; 4786 cmd |= sizeof(void *) << IOCSIZE_SHIFT; 4787 } 4788 break; 4789 } 4790 return perf_ioctl(file, cmd, arg); 4791 } 4792 #else 4793 # define perf_compat_ioctl NULL 4794 #endif 4795 4796 int perf_event_task_enable(void) 4797 { 4798 struct perf_event_context *ctx; 4799 struct perf_event *event; 4800 4801 mutex_lock(¤t->perf_event_mutex); 4802 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { 4803 ctx = perf_event_ctx_lock(event); 4804 perf_event_for_each_child(event, _perf_event_enable); 4805 perf_event_ctx_unlock(event, ctx); 4806 } 4807 mutex_unlock(¤t->perf_event_mutex); 4808 4809 return 0; 4810 } 4811 4812 int perf_event_task_disable(void) 4813 { 4814 struct perf_event_context *ctx; 4815 struct perf_event *event; 4816 4817 mutex_lock(¤t->perf_event_mutex); 4818 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { 4819 ctx = perf_event_ctx_lock(event); 4820 perf_event_for_each_child(event, _perf_event_disable); 4821 perf_event_ctx_unlock(event, ctx); 4822 } 4823 mutex_unlock(¤t->perf_event_mutex); 4824 4825 return 0; 4826 } 4827 4828 static int perf_event_index(struct perf_event *event) 4829 { 4830 if (event->hw.state & PERF_HES_STOPPED) 4831 return 0; 4832 4833 if (event->state != PERF_EVENT_STATE_ACTIVE) 4834 return 0; 4835 4836 return event->pmu->event_idx(event); 4837 } 4838 4839 static void calc_timer_values(struct perf_event *event, 4840 u64 *now, 4841 u64 *enabled, 4842 u64 *running) 4843 { 4844 u64 ctx_time; 4845 4846 *now = perf_clock(); 4847 ctx_time = event->shadow_ctx_time + *now; 4848 __perf_update_times(event, ctx_time, enabled, running); 4849 } 4850 4851 static void perf_event_init_userpage(struct perf_event *event) 4852 { 4853 struct perf_event_mmap_page *userpg; 4854 struct ring_buffer *rb; 4855 4856 rcu_read_lock(); 4857 rb = rcu_dereference(event->rb); 4858 if (!rb) 4859 goto unlock; 4860 4861 userpg = rb->user_page; 4862 4863 /* Allow new userspace to detect that bit 0 is deprecated */ 4864 userpg->cap_bit0_is_deprecated = 1; 4865 userpg->size = offsetof(struct perf_event_mmap_page, __reserved); 4866 userpg->data_offset = PAGE_SIZE; 4867 userpg->data_size = perf_data_size(rb); 4868 4869 unlock: 4870 rcu_read_unlock(); 4871 } 4872 4873 void __weak arch_perf_update_userpage( 4874 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) 4875 { 4876 } 4877 4878 /* 4879 * Callers need to ensure there can be no nesting of this function, otherwise 4880 * the seqlock logic goes bad. We can not serialize this because the arch 4881 * code calls this from NMI context. 
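 *
 * For context, user space is expected to read the page with a matching
 * seqcount-style retry loop; a minimal sketch (pc being the mmap()ed
 * struct perf_event_mmap_page) looks like:
 *
 *	u32 seq, idx;
 *	s64 off;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx = pc->index;
 *		off = pc->offset;
 *		barrier();
 *	} while (pc->lock != seq);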
4882 */ 4883 void perf_event_update_userpage(struct perf_event *event) 4884 { 4885 struct perf_event_mmap_page *userpg; 4886 struct ring_buffer *rb; 4887 u64 enabled, running, now; 4888 4889 rcu_read_lock(); 4890 rb = rcu_dereference(event->rb); 4891 if (!rb) 4892 goto unlock; 4893 4894 /* 4895 * compute total_time_enabled, total_time_running 4896 * based on snapshot values taken when the event 4897 * was last scheduled in. 4898 * 4899 * we cannot simply called update_context_time() 4900 * because of locking issue as we can be called in 4901 * NMI context 4902 */ 4903 calc_timer_values(event, &now, &enabled, &running); 4904 4905 userpg = rb->user_page; 4906 /* 4907 * Disable preemption so as to not let the corresponding user-space 4908 * spin too long if we get preempted. 4909 */ 4910 preempt_disable(); 4911 ++userpg->lock; 4912 barrier(); 4913 userpg->index = perf_event_index(event); 4914 userpg->offset = perf_event_count(event); 4915 if (userpg->index) 4916 userpg->offset -= local64_read(&event->hw.prev_count); 4917 4918 userpg->time_enabled = enabled + 4919 atomic64_read(&event->child_total_time_enabled); 4920 4921 userpg->time_running = running + 4922 atomic64_read(&event->child_total_time_running); 4923 4924 arch_perf_update_userpage(event, userpg, now); 4925 4926 barrier(); 4927 ++userpg->lock; 4928 preempt_enable(); 4929 unlock: 4930 rcu_read_unlock(); 4931 } 4932 EXPORT_SYMBOL_GPL(perf_event_update_userpage); 4933 4934 static int perf_mmap_fault(struct vm_fault *vmf) 4935 { 4936 struct perf_event *event = vmf->vma->vm_file->private_data; 4937 struct ring_buffer *rb; 4938 int ret = VM_FAULT_SIGBUS; 4939 4940 if (vmf->flags & FAULT_FLAG_MKWRITE) { 4941 if (vmf->pgoff == 0) 4942 ret = 0; 4943 return ret; 4944 } 4945 4946 rcu_read_lock(); 4947 rb = rcu_dereference(event->rb); 4948 if (!rb) 4949 goto unlock; 4950 4951 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 4952 goto unlock; 4953 4954 vmf->page = perf_mmap_to_page(rb, vmf->pgoff); 4955 if (!vmf->page) 4956 goto unlock; 4957 4958 get_page(vmf->page); 4959 vmf->page->mapping = vmf->vma->vm_file->f_mapping; 4960 vmf->page->index = vmf->pgoff; 4961 4962 ret = 0; 4963 unlock: 4964 rcu_read_unlock(); 4965 4966 return ret; 4967 } 4968 4969 static void ring_buffer_attach(struct perf_event *event, 4970 struct ring_buffer *rb) 4971 { 4972 struct ring_buffer *old_rb = NULL; 4973 unsigned long flags; 4974 4975 if (event->rb) { 4976 /* 4977 * Should be impossible, we set this when removing 4978 * event->rb_entry and wait/clear when adding event->rb_entry. 4979 */ 4980 WARN_ON_ONCE(event->rcu_pending); 4981 4982 old_rb = event->rb; 4983 spin_lock_irqsave(&old_rb->event_lock, flags); 4984 list_del_rcu(&event->rb_entry); 4985 spin_unlock_irqrestore(&old_rb->event_lock, flags); 4986 4987 event->rcu_batches = get_state_synchronize_rcu(); 4988 event->rcu_pending = 1; 4989 } 4990 4991 if (rb) { 4992 if (event->rcu_pending) { 4993 cond_synchronize_rcu(event->rcu_batches); 4994 event->rcu_pending = 0; 4995 } 4996 4997 spin_lock_irqsave(&rb->event_lock, flags); 4998 list_add_rcu(&event->rb_entry, &rb->event_list); 4999 spin_unlock_irqrestore(&rb->event_lock, flags); 5000 } 5001 5002 /* 5003 * Avoid racing with perf_mmap_close(AUX): stop the event 5004 * before swizzling the event::rb pointer; if it's getting 5005 * unmapped, its aux_mmap_count will be 0 and it won't 5006 * restart. See the comment in __perf_pmu_output_stop(). 
5007 * 5008 * Data will inevitably be lost when set_output is done in 5009 * mid-air, but then again, whoever does it like this is 5010 * not in for the data anyway. 5011 */ 5012 if (has_aux(event)) 5013 perf_event_stop(event, 0); 5014 5015 rcu_assign_pointer(event->rb, rb); 5016 5017 if (old_rb) { 5018 ring_buffer_put(old_rb); 5019 /* 5020 * Since we detached before setting the new rb, so that we 5021 * could attach the new rb, we could have missed a wakeup. 5022 * Provide it now. 5023 */ 5024 wake_up_all(&event->waitq); 5025 } 5026 } 5027 5028 static void ring_buffer_wakeup(struct perf_event *event) 5029 { 5030 struct ring_buffer *rb; 5031 5032 rcu_read_lock(); 5033 rb = rcu_dereference(event->rb); 5034 if (rb) { 5035 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 5036 wake_up_all(&event->waitq); 5037 } 5038 rcu_read_unlock(); 5039 } 5040 5041 struct ring_buffer *ring_buffer_get(struct perf_event *event) 5042 { 5043 struct ring_buffer *rb; 5044 5045 rcu_read_lock(); 5046 rb = rcu_dereference(event->rb); 5047 if (rb) { 5048 if (!atomic_inc_not_zero(&rb->refcount)) 5049 rb = NULL; 5050 } 5051 rcu_read_unlock(); 5052 5053 return rb; 5054 } 5055 5056 void ring_buffer_put(struct ring_buffer *rb) 5057 { 5058 if (!atomic_dec_and_test(&rb->refcount)) 5059 return; 5060 5061 WARN_ON_ONCE(!list_empty(&rb->event_list)); 5062 5063 call_rcu(&rb->rcu_head, rb_free_rcu); 5064 } 5065 5066 static void perf_mmap_open(struct vm_area_struct *vma) 5067 { 5068 struct perf_event *event = vma->vm_file->private_data; 5069 5070 atomic_inc(&event->mmap_count); 5071 atomic_inc(&event->rb->mmap_count); 5072 5073 if (vma->vm_pgoff) 5074 atomic_inc(&event->rb->aux_mmap_count); 5075 5076 if (event->pmu->event_mapped) 5077 event->pmu->event_mapped(event, vma->vm_mm); 5078 } 5079 5080 static void perf_pmu_output_stop(struct perf_event *event); 5081 5082 /* 5083 * A buffer can be mmap()ed multiple times; either directly through the same 5084 * event, or through other events by use of perf_event_set_output(). 5085 * 5086 * In order to undo the VM accounting done by perf_mmap() we need to destroy 5087 * the buffer here, where we still have a VM context. This means we need 5088 * to detach all events redirecting to us. 5089 */ 5090 static void perf_mmap_close(struct vm_area_struct *vma) 5091 { 5092 struct perf_event *event = vma->vm_file->private_data; 5093 5094 struct ring_buffer *rb = ring_buffer_get(event); 5095 struct user_struct *mmap_user = rb->mmap_user; 5096 int mmap_locked = rb->mmap_locked; 5097 unsigned long size = perf_data_size(rb); 5098 5099 if (event->pmu->event_unmapped) 5100 event->pmu->event_unmapped(event, vma->vm_mm); 5101 5102 /* 5103 * rb->aux_mmap_count will always drop before rb->mmap_count and 5104 * event->mmap_count, so it is ok to use event->mmap_mutex to 5105 * serialize with perf_mmap here. 5106 */ 5107 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && 5108 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { 5109 /* 5110 * Stop all AUX events that are writing to this buffer, 5111 * so that we can free its AUX pages and corresponding PMU 5112 * data. Note that after rb::aux_mmap_count dropped to zero, 5113 * they won't start any more (see perf_aux_output_begin()). 
5114 */ 5115 perf_pmu_output_stop(event); 5116 5117 /* now it's safe to free the pages */ 5118 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); 5119 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; 5120 5121 /* this has to be the last one */ 5122 rb_free_aux(rb); 5123 WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); 5124 5125 mutex_unlock(&event->mmap_mutex); 5126 } 5127 5128 atomic_dec(&rb->mmap_count); 5129 5130 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 5131 goto out_put; 5132 5133 ring_buffer_attach(event, NULL); 5134 mutex_unlock(&event->mmap_mutex); 5135 5136 /* If there's still other mmap()s of this buffer, we're done. */ 5137 if (atomic_read(&rb->mmap_count)) 5138 goto out_put; 5139 5140 /* 5141 * No other mmap()s, detach from all other events that might redirect 5142 * into the now unreachable buffer. Somewhat complicated by the 5143 * fact that rb::event_lock otherwise nests inside mmap_mutex. 5144 */ 5145 again: 5146 rcu_read_lock(); 5147 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { 5148 if (!atomic_long_inc_not_zero(&event->refcount)) { 5149 /* 5150 * This event is en-route to free_event() which will 5151 * detach it and remove it from the list. 5152 */ 5153 continue; 5154 } 5155 rcu_read_unlock(); 5156 5157 mutex_lock(&event->mmap_mutex); 5158 /* 5159 * Check we didn't race with perf_event_set_output() which can 5160 * swizzle the rb from under us while we were waiting to 5161 * acquire mmap_mutex. 5162 * 5163 * If we find a different rb; ignore this event, a next 5164 * iteration will no longer find it on the list. We have to 5165 * still restart the iteration to make sure we're not now 5166 * iterating the wrong list. 5167 */ 5168 if (event->rb == rb) 5169 ring_buffer_attach(event, NULL); 5170 5171 mutex_unlock(&event->mmap_mutex); 5172 put_event(event); 5173 5174 /* 5175 * Restart the iteration; either we're on the wrong list or 5176 * destroyed its integrity by doing a deletion. 5177 */ 5178 goto again; 5179 } 5180 rcu_read_unlock(); 5181 5182 /* 5183 * It could be there's still a few 0-ref events on the list; they'll 5184 * get cleaned up by free_event() -- they'll also still have their 5185 * ref on the rb and will free it whenever they are done with it. 5186 * 5187 * Aside from that, this buffer is 'fully' detached and unmapped, 5188 * undo the VM accounting. 5189 */ 5190 5191 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); 5192 vma->vm_mm->pinned_vm -= mmap_locked; 5193 free_uid(mmap_user); 5194 5195 out_put: 5196 ring_buffer_put(rb); /* could be last */ 5197 } 5198 5199 static const struct vm_operations_struct perf_mmap_vmops = { 5200 .open = perf_mmap_open, 5201 .close = perf_mmap_close, /* non mergable */ 5202 .fault = perf_mmap_fault, 5203 .page_mkwrite = perf_mmap_fault, 5204 }; 5205 5206 static int perf_mmap(struct file *file, struct vm_area_struct *vma) 5207 { 5208 struct perf_event *event = file->private_data; 5209 unsigned long user_locked, user_lock_limit; 5210 struct user_struct *user = current_user(); 5211 unsigned long locked, lock_limit; 5212 struct ring_buffer *rb = NULL; 5213 unsigned long vma_size; 5214 unsigned long nr_pages; 5215 long user_extra = 0, extra = 0; 5216 int ret = 0, flags = 0; 5217 5218 /* 5219 * Don't allow mmap() of inherited per-task counters. This would 5220 * create a performance issue due to all children writing to the 5221 * same rb. 
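 *
 * For reference, a (hypothetical) user maps 1 + 2^n pages, e.g.:
 *
 *	size = (1 + (1 << n)) * sysconf(_SC_PAGESIZE);
 *	base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * where the first page is the control page and the remaining 2^n pages
 * are the data buffer; that is the power-of-two rule enforced below.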
5222 */ 5223 if (event->cpu == -1 && event->attr.inherit) 5224 return -EINVAL; 5225 5226 if (!(vma->vm_flags & VM_SHARED)) 5227 return -EINVAL; 5228 5229 vma_size = vma->vm_end - vma->vm_start; 5230 5231 if (vma->vm_pgoff == 0) { 5232 nr_pages = (vma_size / PAGE_SIZE) - 1; 5233 } else { 5234 /* 5235 * AUX area mapping: if rb->aux_nr_pages != 0, it's already 5236 * mapped, all subsequent mappings should have the same size 5237 * and offset. Must be above the normal perf buffer. 5238 */ 5239 u64 aux_offset, aux_size; 5240 5241 if (!event->rb) 5242 return -EINVAL; 5243 5244 nr_pages = vma_size / PAGE_SIZE; 5245 5246 mutex_lock(&event->mmap_mutex); 5247 ret = -EINVAL; 5248 5249 rb = event->rb; 5250 if (!rb) 5251 goto aux_unlock; 5252 5253 aux_offset = READ_ONCE(rb->user_page->aux_offset); 5254 aux_size = READ_ONCE(rb->user_page->aux_size); 5255 5256 if (aux_offset < perf_data_size(rb) + PAGE_SIZE) 5257 goto aux_unlock; 5258 5259 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) 5260 goto aux_unlock; 5261 5262 /* already mapped with a different offset */ 5263 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) 5264 goto aux_unlock; 5265 5266 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) 5267 goto aux_unlock; 5268 5269 /* already mapped with a different size */ 5270 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) 5271 goto aux_unlock; 5272 5273 if (!is_power_of_2(nr_pages)) 5274 goto aux_unlock; 5275 5276 if (!atomic_inc_not_zero(&rb->mmap_count)) 5277 goto aux_unlock; 5278 5279 if (rb_has_aux(rb)) { 5280 atomic_inc(&rb->aux_mmap_count); 5281 ret = 0; 5282 goto unlock; 5283 } 5284 5285 atomic_set(&rb->aux_mmap_count, 1); 5286 user_extra = nr_pages; 5287 5288 goto accounting; 5289 } 5290 5291 /* 5292 * If we have rb pages ensure they're a power-of-two number, so we 5293 * can do bitmasks instead of modulo. 5294 */ 5295 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 5296 return -EINVAL; 5297 5298 if (vma_size != PAGE_SIZE * (1 + nr_pages)) 5299 return -EINVAL; 5300 5301 WARN_ON_ONCE(event->ctx->parent_ctx); 5302 again: 5303 mutex_lock(&event->mmap_mutex); 5304 if (event->rb) { 5305 if (event->rb->nr_pages != nr_pages) { 5306 ret = -EINVAL; 5307 goto unlock; 5308 } 5309 5310 if (!atomic_inc_not_zero(&event->rb->mmap_count)) { 5311 /* 5312 * Raced against perf_mmap_close() through 5313 * perf_event_set_output(). Try again, hope for better 5314 * luck. 5315 */ 5316 mutex_unlock(&event->mmap_mutex); 5317 goto again; 5318 } 5319 5320 goto unlock; 5321 } 5322 5323 user_extra = nr_pages + 1; 5324 5325 accounting: 5326 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); 5327 5328 /* 5329 * Increase the limit linearly with more CPUs: 5330 */ 5331 user_lock_limit *= num_online_cpus(); 5332 5333 user_locked = atomic_long_read(&user->locked_vm) + user_extra; 5334 5335 if (user_locked > user_lock_limit) 5336 extra = user_locked - user_lock_limit; 5337 5338 lock_limit = rlimit(RLIMIT_MEMLOCK); 5339 lock_limit >>= PAGE_SHIFT; 5340 locked = vma->vm_mm->pinned_vm + extra; 5341 5342 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 5343 !capable(CAP_IPC_LOCK)) { 5344 ret = -EPERM; 5345 goto unlock; 5346 } 5347 5348 WARN_ON(!rb && event->rb); 5349 5350 if (vma->vm_flags & VM_WRITE) 5351 flags |= RING_BUFFER_WRITABLE; 5352 5353 if (!rb) { 5354 rb = rb_alloc(nr_pages, 5355 event->attr.watermark ? 
event->attr.wakeup_watermark : 0, 5356 event->cpu, flags); 5357 5358 if (!rb) { 5359 ret = -ENOMEM; 5360 goto unlock; 5361 } 5362 5363 atomic_set(&rb->mmap_count, 1); 5364 rb->mmap_user = get_current_user(); 5365 rb->mmap_locked = extra; 5366 5367 ring_buffer_attach(event, rb); 5368 5369 perf_event_init_userpage(event); 5370 perf_event_update_userpage(event); 5371 } else { 5372 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, 5373 event->attr.aux_watermark, flags); 5374 if (!ret) 5375 rb->aux_mmap_locked = extra; 5376 } 5377 5378 unlock: 5379 if (!ret) { 5380 atomic_long_add(user_extra, &user->locked_vm); 5381 vma->vm_mm->pinned_vm += extra; 5382 5383 atomic_inc(&event->mmap_count); 5384 } else if (rb) { 5385 atomic_dec(&rb->mmap_count); 5386 } 5387 aux_unlock: 5388 mutex_unlock(&event->mmap_mutex); 5389 5390 /* 5391 * Since pinned accounting is per vm we cannot allow fork() to copy our 5392 * vma. 5393 */ 5394 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; 5395 vma->vm_ops = &perf_mmap_vmops; 5396 5397 if (event->pmu->event_mapped) 5398 event->pmu->event_mapped(event, vma->vm_mm); 5399 5400 return ret; 5401 } 5402 5403 static int perf_fasync(int fd, struct file *filp, int on) 5404 { 5405 struct inode *inode = file_inode(filp); 5406 struct perf_event *event = filp->private_data; 5407 int retval; 5408 5409 inode_lock(inode); 5410 retval = fasync_helper(fd, filp, on, &event->fasync); 5411 inode_unlock(inode); 5412 5413 if (retval < 0) 5414 return retval; 5415 5416 return 0; 5417 } 5418 5419 static const struct file_operations perf_fops = { 5420 .llseek = no_llseek, 5421 .release = perf_release, 5422 .read = perf_read, 5423 .poll = perf_poll, 5424 .unlocked_ioctl = perf_ioctl, 5425 .compat_ioctl = perf_compat_ioctl, 5426 .mmap = perf_mmap, 5427 .fasync = perf_fasync, 5428 }; 5429 5430 /* 5431 * Perf event wakeup 5432 * 5433 * If there's data, ensure we set the poll() state and publish everything 5434 * to user-space before waking everybody up. 5435 */ 5436 5437 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) 5438 { 5439 /* only the parent has fasync state */ 5440 if (event->parent) 5441 event = event->parent; 5442 return &event->fasync; 5443 } 5444 5445 void perf_event_wakeup(struct perf_event *event) 5446 { 5447 ring_buffer_wakeup(event); 5448 5449 if (event->pending_kill) { 5450 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); 5451 event->pending_kill = 0; 5452 } 5453 } 5454 5455 static void perf_pending_event(struct irq_work *entry) 5456 { 5457 struct perf_event *event = container_of(entry, 5458 struct perf_event, pending); 5459 int rctx; 5460 5461 rctx = perf_swevent_get_recursion_context(); 5462 /* 5463 * If we 'fail' here, that's OK, it means recursion is already disabled 5464 * and we won't recurse 'further'. 5465 */ 5466 5467 if (event->pending_disable) { 5468 event->pending_disable = 0; 5469 perf_event_disable_local(event); 5470 } 5471 5472 if (event->pending_wakeup) { 5473 event->pending_wakeup = 0; 5474 perf_event_wakeup(event); 5475 } 5476 5477 if (rctx >= 0) 5478 perf_swevent_put_recursion_context(rctx); 5479 } 5480 5481 /* 5482 * We assume there is only KVM supporting the callbacks. 5483 * Later on, we might change it to a list if there is 5484 * another virtualization implementation supporting the callbacks. 
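 *
 * Registration is a plain pointer assignment; e.g. KVM does something
 * along the lines of (sketch, struct contents elided):
 *
 *	static struct perf_guest_info_callbacks kvm_guest_cbs = { ... };
 *	perf_register_guest_info_callbacks(&kvm_guest_cbs);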
5485 */ 5486 struct perf_guest_info_callbacks *perf_guest_cbs; 5487 5488 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 5489 { 5490 perf_guest_cbs = cbs; 5491 return 0; 5492 } 5493 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); 5494 5495 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 5496 { 5497 perf_guest_cbs = NULL; 5498 return 0; 5499 } 5500 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 5501 5502 static void 5503 perf_output_sample_regs(struct perf_output_handle *handle, 5504 struct pt_regs *regs, u64 mask) 5505 { 5506 int bit; 5507 DECLARE_BITMAP(_mask, 64); 5508 5509 bitmap_from_u64(_mask, mask); 5510 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { 5511 u64 val; 5512 5513 val = perf_reg_value(regs, bit); 5514 perf_output_put(handle, val); 5515 } 5516 } 5517 5518 static void perf_sample_regs_user(struct perf_regs *regs_user, 5519 struct pt_regs *regs, 5520 struct pt_regs *regs_user_copy) 5521 { 5522 if (user_mode(regs)) { 5523 regs_user->abi = perf_reg_abi(current); 5524 regs_user->regs = regs; 5525 } else if (current->mm) { 5526 perf_get_regs_user(regs_user, regs, regs_user_copy); 5527 } else { 5528 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 5529 regs_user->regs = NULL; 5530 } 5531 } 5532 5533 static void perf_sample_regs_intr(struct perf_regs *regs_intr, 5534 struct pt_regs *regs) 5535 { 5536 regs_intr->regs = regs; 5537 regs_intr->abi = perf_reg_abi(current); 5538 } 5539 5540 5541 /* 5542 * Get remaining task size from user stack pointer. 5543 * 5544 * It'd be better to take stack vma map and limit this more 5545 * precisly, but there's no way to get it safely under interrupt, 5546 * so using TASK_SIZE as limit. 5547 */ 5548 static u64 perf_ustack_task_size(struct pt_regs *regs) 5549 { 5550 unsigned long addr = perf_user_stack_pointer(regs); 5551 5552 if (!addr || addr >= TASK_SIZE) 5553 return 0; 5554 5555 return TASK_SIZE - addr; 5556 } 5557 5558 static u16 5559 perf_sample_ustack_size(u16 stack_size, u16 header_size, 5560 struct pt_regs *regs) 5561 { 5562 u64 task_size; 5563 5564 /* No regs, no stack pointer, no dump. */ 5565 if (!regs) 5566 return 0; 5567 5568 /* 5569 * Check if we fit in with the requested stack size into the: 5570 * - TASK_SIZE 5571 * If we don't, we limit the size to the TASK_SIZE. 5572 * 5573 * - remaining sample size 5574 * If we don't, we customize the stack size to 5575 * fit in to the remaining sample size. 5576 */ 5577 5578 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); 5579 stack_size = min(stack_size, (u16) task_size); 5580 5581 /* Current header size plus static size and dynamic size. */ 5582 header_size += 2 * sizeof(u64); 5583 5584 /* Do we fit in with the current stack dump size? */ 5585 if ((u16) (header_size + stack_size) < header_size) { 5586 /* 5587 * If we overflow the maximum size for the sample, 5588 * we customize the stack dump size to fit in. 
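 *
 * A worked example (numbers picked purely for illustration): with
 * header_size at 80 and stack_size at 0xffff, (u16)(80 + 0xffff) wraps
 * to 79, which is smaller than 80, so we clamp stack_size to
 * USHRT_MAX - header_size - sizeof(u64) and round it up to a u64
 * multiple (65448 here), keeping the total within 16 bits.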
5589 */ 5590 stack_size = USHRT_MAX - header_size - sizeof(u64); 5591 stack_size = round_up(stack_size, sizeof(u64)); 5592 } 5593 5594 return stack_size; 5595 } 5596 5597 static void 5598 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, 5599 struct pt_regs *regs) 5600 { 5601 /* Case of a kernel thread, nothing to dump */ 5602 if (!regs) { 5603 u64 size = 0; 5604 perf_output_put(handle, size); 5605 } else { 5606 unsigned long sp; 5607 unsigned int rem; 5608 u64 dyn_size; 5609 5610 /* 5611 * We dump: 5612 * static size 5613 * - the size requested by user or the best one we can fit 5614 * in to the sample max size 5615 * data 5616 * - user stack dump data 5617 * dynamic size 5618 * - the actual dumped size 5619 */ 5620 5621 /* Static size. */ 5622 perf_output_put(handle, dump_size); 5623 5624 /* Data. */ 5625 sp = perf_user_stack_pointer(regs); 5626 rem = __output_copy_user(handle, (void *) sp, dump_size); 5627 dyn_size = dump_size - rem; 5628 5629 perf_output_skip(handle, rem); 5630 5631 /* Dynamic size. */ 5632 perf_output_put(handle, dyn_size); 5633 } 5634 } 5635 5636 static void __perf_event_header__init_id(struct perf_event_header *header, 5637 struct perf_sample_data *data, 5638 struct perf_event *event) 5639 { 5640 u64 sample_type = event->attr.sample_type; 5641 5642 data->type = sample_type; 5643 header->size += event->id_header_size; 5644 5645 if (sample_type & PERF_SAMPLE_TID) { 5646 /* namespace issues */ 5647 data->tid_entry.pid = perf_event_pid(event, current); 5648 data->tid_entry.tid = perf_event_tid(event, current); 5649 } 5650 5651 if (sample_type & PERF_SAMPLE_TIME) 5652 data->time = perf_event_clock(event); 5653 5654 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) 5655 data->id = primary_event_id(event); 5656 5657 if (sample_type & PERF_SAMPLE_STREAM_ID) 5658 data->stream_id = event->id; 5659 5660 if (sample_type & PERF_SAMPLE_CPU) { 5661 data->cpu_entry.cpu = raw_smp_processor_id(); 5662 data->cpu_entry.reserved = 0; 5663 } 5664 } 5665 5666 void perf_event_header__init_id(struct perf_event_header *header, 5667 struct perf_sample_data *data, 5668 struct perf_event *event) 5669 { 5670 if (event->attr.sample_id_all) 5671 __perf_event_header__init_id(header, data, event); 5672 } 5673 5674 static void __perf_event__output_id_sample(struct perf_output_handle *handle, 5675 struct perf_sample_data *data) 5676 { 5677 u64 sample_type = data->type; 5678 5679 if (sample_type & PERF_SAMPLE_TID) 5680 perf_output_put(handle, data->tid_entry); 5681 5682 if (sample_type & PERF_SAMPLE_TIME) 5683 perf_output_put(handle, data->time); 5684 5685 if (sample_type & PERF_SAMPLE_ID) 5686 perf_output_put(handle, data->id); 5687 5688 if (sample_type & PERF_SAMPLE_STREAM_ID) 5689 perf_output_put(handle, data->stream_id); 5690 5691 if (sample_type & PERF_SAMPLE_CPU) 5692 perf_output_put(handle, data->cpu_entry); 5693 5694 if (sample_type & PERF_SAMPLE_IDENTIFIER) 5695 perf_output_put(handle, data->id); 5696 } 5697 5698 void perf_event__output_id_sample(struct perf_event *event, 5699 struct perf_output_handle *handle, 5700 struct perf_sample_data *sample) 5701 { 5702 if (event->attr.sample_id_all) 5703 __perf_event__output_id_sample(handle, sample); 5704 } 5705 5706 static void perf_output_read_one(struct perf_output_handle *handle, 5707 struct perf_event *event, 5708 u64 enabled, u64 running) 5709 { 5710 u64 read_format = event->attr.read_format; 5711 u64 values[4]; 5712 int n = 0; 5713 5714 values[n++] = perf_event_count(event); 5715 if (read_format & 
PERF_FORMAT_TOTAL_TIME_ENABLED) { 5716 values[n++] = enabled + 5717 atomic64_read(&event->child_total_time_enabled); 5718 } 5719 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 5720 values[n++] = running + 5721 atomic64_read(&event->child_total_time_running); 5722 } 5723 if (read_format & PERF_FORMAT_ID) 5724 values[n++] = primary_event_id(event); 5725 5726 __output_copy(handle, values, n * sizeof(u64)); 5727 } 5728 5729 static void perf_output_read_group(struct perf_output_handle *handle, 5730 struct perf_event *event, 5731 u64 enabled, u64 running) 5732 { 5733 struct perf_event *leader = event->group_leader, *sub; 5734 u64 read_format = event->attr.read_format; 5735 u64 values[5]; 5736 int n = 0; 5737 5738 values[n++] = 1 + leader->nr_siblings; 5739 5740 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 5741 values[n++] = enabled; 5742 5743 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 5744 values[n++] = running; 5745 5746 if (leader != event) 5747 leader->pmu->read(leader); 5748 5749 values[n++] = perf_event_count(leader); 5750 if (read_format & PERF_FORMAT_ID) 5751 values[n++] = primary_event_id(leader); 5752 5753 __output_copy(handle, values, n * sizeof(u64)); 5754 5755 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 5756 n = 0; 5757 5758 if ((sub != event) && 5759 (sub->state == PERF_EVENT_STATE_ACTIVE)) 5760 sub->pmu->read(sub); 5761 5762 values[n++] = perf_event_count(sub); 5763 if (read_format & PERF_FORMAT_ID) 5764 values[n++] = primary_event_id(sub); 5765 5766 __output_copy(handle, values, n * sizeof(u64)); 5767 } 5768 } 5769 5770 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ 5771 PERF_FORMAT_TOTAL_TIME_RUNNING) 5772 5773 /* 5774 * XXX PERF_SAMPLE_READ vs inherited events seems difficult. 5775 * 5776 * The problem is that it's both hard and excessively expensive to iterate the 5777 * child list, not to mention that it's impossible to IPI the children running 5778 * on another CPU, from interrupt/NMI context. 5779 */ 5780 static void perf_output_read(struct perf_output_handle *handle, 5781 struct perf_event *event) 5782 { 5783 u64 enabled = 0, running = 0, now; 5784 u64 read_format = event->attr.read_format; 5785 5786 /* 5787 * compute total_time_enabled, total_time_running 5788 * based on snapshot values taken when the event 5789 * was last scheduled in.
5790 * 5791 * we cannot simply called update_context_time() 5792 * because of locking issue as we are called in 5793 * NMI context 5794 */ 5795 if (read_format & PERF_FORMAT_TOTAL_TIMES) 5796 calc_timer_values(event, &now, &enabled, &running); 5797 5798 if (event->attr.read_format & PERF_FORMAT_GROUP) 5799 perf_output_read_group(handle, event, enabled, running); 5800 else 5801 perf_output_read_one(handle, event, enabled, running); 5802 } 5803 5804 void perf_output_sample(struct perf_output_handle *handle, 5805 struct perf_event_header *header, 5806 struct perf_sample_data *data, 5807 struct perf_event *event) 5808 { 5809 u64 sample_type = data->type; 5810 5811 perf_output_put(handle, *header); 5812 5813 if (sample_type & PERF_SAMPLE_IDENTIFIER) 5814 perf_output_put(handle, data->id); 5815 5816 if (sample_type & PERF_SAMPLE_IP) 5817 perf_output_put(handle, data->ip); 5818 5819 if (sample_type & PERF_SAMPLE_TID) 5820 perf_output_put(handle, data->tid_entry); 5821 5822 if (sample_type & PERF_SAMPLE_TIME) 5823 perf_output_put(handle, data->time); 5824 5825 if (sample_type & PERF_SAMPLE_ADDR) 5826 perf_output_put(handle, data->addr); 5827 5828 if (sample_type & PERF_SAMPLE_ID) 5829 perf_output_put(handle, data->id); 5830 5831 if (sample_type & PERF_SAMPLE_STREAM_ID) 5832 perf_output_put(handle, data->stream_id); 5833 5834 if (sample_type & PERF_SAMPLE_CPU) 5835 perf_output_put(handle, data->cpu_entry); 5836 5837 if (sample_type & PERF_SAMPLE_PERIOD) 5838 perf_output_put(handle, data->period); 5839 5840 if (sample_type & PERF_SAMPLE_READ) 5841 perf_output_read(handle, event); 5842 5843 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 5844 int size = 1; 5845 5846 size += data->callchain->nr; 5847 size *= sizeof(u64); 5848 __output_copy(handle, data->callchain, size); 5849 } 5850 5851 if (sample_type & PERF_SAMPLE_RAW) { 5852 struct perf_raw_record *raw = data->raw; 5853 5854 if (raw) { 5855 struct perf_raw_frag *frag = &raw->frag; 5856 5857 perf_output_put(handle, raw->size); 5858 do { 5859 if (frag->copy) { 5860 __output_custom(handle, frag->copy, 5861 frag->data, frag->size); 5862 } else { 5863 __output_copy(handle, frag->data, 5864 frag->size); 5865 } 5866 if (perf_raw_frag_last(frag)) 5867 break; 5868 frag = frag->next; 5869 } while (1); 5870 if (frag->pad) 5871 __output_skip(handle, NULL, frag->pad); 5872 } else { 5873 struct { 5874 u32 size; 5875 u32 data; 5876 } raw = { 5877 .size = sizeof(u32), 5878 .data = 0, 5879 }; 5880 perf_output_put(handle, raw); 5881 } 5882 } 5883 5884 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 5885 if (data->br_stack) { 5886 size_t size; 5887 5888 size = data->br_stack->nr 5889 * sizeof(struct perf_branch_entry); 5890 5891 perf_output_put(handle, data->br_stack->nr); 5892 perf_output_copy(handle, data->br_stack->entries, size); 5893 } else { 5894 /* 5895 * we always store at least the value of nr 5896 */ 5897 u64 nr = 0; 5898 perf_output_put(handle, nr); 5899 } 5900 } 5901 5902 if (sample_type & PERF_SAMPLE_REGS_USER) { 5903 u64 abi = data->regs_user.abi; 5904 5905 /* 5906 * If there are no regs to dump, notice it through 5907 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 
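 *
 * In other words the record carries, loosely:
 *
 *	{ u64 abi;
 *	  u64 regs[hweight64(mask)]; }	&& PERF_SAMPLE_REGS_USER
 *
 * with abi == PERF_SAMPLE_REGS_ABI_NONE (0) meaning the regs array is
 * simply absent.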
5908 */ 5909 perf_output_put(handle, abi); 5910 5911 if (abi) { 5912 u64 mask = event->attr.sample_regs_user; 5913 perf_output_sample_regs(handle, 5914 data->regs_user.regs, 5915 mask); 5916 } 5917 } 5918 5919 if (sample_type & PERF_SAMPLE_STACK_USER) { 5920 perf_output_sample_ustack(handle, 5921 data->stack_user_size, 5922 data->regs_user.regs); 5923 } 5924 5925 if (sample_type & PERF_SAMPLE_WEIGHT) 5926 perf_output_put(handle, data->weight); 5927 5928 if (sample_type & PERF_SAMPLE_DATA_SRC) 5929 perf_output_put(handle, data->data_src.val); 5930 5931 if (sample_type & PERF_SAMPLE_TRANSACTION) 5932 perf_output_put(handle, data->txn); 5933 5934 if (sample_type & PERF_SAMPLE_REGS_INTR) { 5935 u64 abi = data->regs_intr.abi; 5936 /* 5937 * If there are no regs to dump, notice it through 5938 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 5939 */ 5940 perf_output_put(handle, abi); 5941 5942 if (abi) { 5943 u64 mask = event->attr.sample_regs_intr; 5944 5945 perf_output_sample_regs(handle, 5946 data->regs_intr.regs, 5947 mask); 5948 } 5949 } 5950 5951 if (sample_type & PERF_SAMPLE_PHYS_ADDR) 5952 perf_output_put(handle, data->phys_addr); 5953 5954 if (!event->attr.watermark) { 5955 int wakeup_events = event->attr.wakeup_events; 5956 5957 if (wakeup_events) { 5958 struct ring_buffer *rb = handle->rb; 5959 int events = local_inc_return(&rb->events); 5960 5961 if (events >= wakeup_events) { 5962 local_sub(wakeup_events, &rb->events); 5963 local_inc(&rb->wakeup); 5964 } 5965 } 5966 } 5967 } 5968 5969 static u64 perf_virt_to_phys(u64 virt) 5970 { 5971 u64 phys_addr = 0; 5972 struct page *p = NULL; 5973 5974 if (!virt) 5975 return 0; 5976 5977 if (virt >= TASK_SIZE) { 5978 /* If it's vmalloc()d memory, leave phys_addr as 0 */ 5979 if (virt_addr_valid((void *)(uintptr_t)virt) && 5980 !(virt >= VMALLOC_START && virt < VMALLOC_END)) 5981 phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); 5982 } else { 5983 /* 5984 * Walk the page tables for a user address. 5985 * Interrupts are disabled, which prevents any tear-down 5986 * of the page tables. 5987 * Try the IRQ-safe __get_user_pages_fast() first; 5988 * if that fails, leave phys_addr as 0. 5989 */ 5990 if ((current->mm != NULL) && 5991 (__get_user_pages_fast(virt, 1, 0, &p) == 1)) 5992 phys_addr = page_to_phys(p) + virt % PAGE_SIZE; 5993 5994 if (p) 5995 put_page(p); 5996 } 5997 5998 return phys_addr; 5999 } 6000 6001 static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; 6002 6003 static struct perf_callchain_entry * 6004 perf_callchain(struct perf_event *event, struct pt_regs *regs) 6005 { 6006 bool kernel = !event->attr.exclude_callchain_kernel; 6007 bool user = !event->attr.exclude_callchain_user; 6008 /* Disallow cross-task user callchains.
*/ 6009 bool crosstask = event->ctx->task && event->ctx->task != current; 6010 const u32 max_stack = event->attr.sample_max_stack; 6011 struct perf_callchain_entry *callchain; 6012 6013 if (!kernel && !user) 6014 return &__empty_callchain; 6015 6016 callchain = get_perf_callchain(regs, 0, kernel, user, 6017 max_stack, crosstask, true); 6018 return callchain ?: &__empty_callchain; 6019 } 6020 6021 void perf_prepare_sample(struct perf_event_header *header, 6022 struct perf_sample_data *data, 6023 struct perf_event *event, 6024 struct pt_regs *regs) 6025 { 6026 u64 sample_type = event->attr.sample_type; 6027 6028 header->type = PERF_RECORD_SAMPLE; 6029 header->size = sizeof(*header) + event->header_size; 6030 6031 header->misc = 0; 6032 header->misc |= perf_misc_flags(regs); 6033 6034 __perf_event_header__init_id(header, data, event); 6035 6036 if (sample_type & PERF_SAMPLE_IP) 6037 data->ip = perf_instruction_pointer(regs); 6038 6039 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 6040 int size = 1; 6041 6042 data->callchain = perf_callchain(event, regs); 6043 size += data->callchain->nr; 6044 6045 header->size += size * sizeof(u64); 6046 } 6047 6048 if (sample_type & PERF_SAMPLE_RAW) { 6049 struct perf_raw_record *raw = data->raw; 6050 int size; 6051 6052 if (raw) { 6053 struct perf_raw_frag *frag = &raw->frag; 6054 u32 sum = 0; 6055 6056 do { 6057 sum += frag->size; 6058 if (perf_raw_frag_last(frag)) 6059 break; 6060 frag = frag->next; 6061 } while (1); 6062 6063 size = round_up(sum + sizeof(u32), sizeof(u64)); 6064 raw->size = size - sizeof(u32); 6065 frag->pad = raw->size - sum; 6066 } else { 6067 size = sizeof(u64); 6068 } 6069 6070 header->size += size; 6071 } 6072 6073 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 6074 int size = sizeof(u64); /* nr */ 6075 if (data->br_stack) { 6076 size += data->br_stack->nr 6077 * sizeof(struct perf_branch_entry); 6078 } 6079 header->size += size; 6080 } 6081 6082 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) 6083 perf_sample_regs_user(&data->regs_user, regs, 6084 &data->regs_user_copy); 6085 6086 if (sample_type & PERF_SAMPLE_REGS_USER) { 6087 /* regs dump ABI info */ 6088 int size = sizeof(u64); 6089 6090 if (data->regs_user.regs) { 6091 u64 mask = event->attr.sample_regs_user; 6092 size += hweight64(mask) * sizeof(u64); 6093 } 6094 6095 header->size += size; 6096 } 6097 6098 if (sample_type & PERF_SAMPLE_STACK_USER) { 6099 /* 6100 * Either we need PERF_SAMPLE_STACK_USER bit to be allways 6101 * processed as the last one or have additional check added 6102 * in case new sample type is added, because we could eat 6103 * up the rest of the sample size. 6104 */ 6105 u16 stack_size = event->attr.sample_stack_user; 6106 u16 size = sizeof(u64); 6107 6108 stack_size = perf_sample_ustack_size(stack_size, header->size, 6109 data->regs_user.regs); 6110 6111 /* 6112 * If there is something to dump, add space for the dump 6113 * itself and for the field that tells the dynamic size, 6114 * which is how many have been actually dumped. 
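 *
 * The resulting encoding, as written out by perf_output_sample_ustack(),
 * is roughly:
 *
 *	{ u64 size; char data[size]; u64 dyn_size; }	&& PERF_SAMPLE_STACK_USER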
6115 */ 6116 if (stack_size) 6117 size += sizeof(u64) + stack_size; 6118 6119 data->stack_user_size = stack_size; 6120 header->size += size; 6121 } 6122 6123 if (sample_type & PERF_SAMPLE_REGS_INTR) { 6124 /* regs dump ABI info */ 6125 int size = sizeof(u64); 6126 6127 perf_sample_regs_intr(&data->regs_intr, regs); 6128 6129 if (data->regs_intr.regs) { 6130 u64 mask = event->attr.sample_regs_intr; 6131 6132 size += hweight64(mask) * sizeof(u64); 6133 } 6134 6135 header->size += size; 6136 } 6137 6138 if (sample_type & PERF_SAMPLE_PHYS_ADDR) 6139 data->phys_addr = perf_virt_to_phys(data->addr); 6140 } 6141 6142 static void __always_inline 6143 __perf_event_output(struct perf_event *event, 6144 struct perf_sample_data *data, 6145 struct pt_regs *regs, 6146 int (*output_begin)(struct perf_output_handle *, 6147 struct perf_event *, 6148 unsigned int)) 6149 { 6150 struct perf_output_handle handle; 6151 struct perf_event_header header; 6152 6153 /* protect the callchain buffers */ 6154 rcu_read_lock(); 6155 6156 perf_prepare_sample(&header, data, event, regs); 6157 6158 if (output_begin(&handle, event, header.size)) 6159 goto exit; 6160 6161 perf_output_sample(&handle, &header, data, event); 6162 6163 perf_output_end(&handle); 6164 6165 exit: 6166 rcu_read_unlock(); 6167 } 6168 6169 void 6170 perf_event_output_forward(struct perf_event *event, 6171 struct perf_sample_data *data, 6172 struct pt_regs *regs) 6173 { 6174 __perf_event_output(event, data, regs, perf_output_begin_forward); 6175 } 6176 6177 void 6178 perf_event_output_backward(struct perf_event *event, 6179 struct perf_sample_data *data, 6180 struct pt_regs *regs) 6181 { 6182 __perf_event_output(event, data, regs, perf_output_begin_backward); 6183 } 6184 6185 void 6186 perf_event_output(struct perf_event *event, 6187 struct perf_sample_data *data, 6188 struct pt_regs *regs) 6189 { 6190 __perf_event_output(event, data, regs, perf_output_begin); 6191 } 6192 6193 /* 6194 * read event_id 6195 */ 6196 6197 struct perf_read_event { 6198 struct perf_event_header header; 6199 6200 u32 pid; 6201 u32 tid; 6202 }; 6203 6204 static void 6205 perf_event_read_event(struct perf_event *event, 6206 struct task_struct *task) 6207 { 6208 struct perf_output_handle handle; 6209 struct perf_sample_data sample; 6210 struct perf_read_event read_event = { 6211 .header = { 6212 .type = PERF_RECORD_READ, 6213 .misc = 0, 6214 .size = sizeof(read_event) + event->read_size, 6215 }, 6216 .pid = perf_event_pid(event, task), 6217 .tid = perf_event_tid(event, task), 6218 }; 6219 int ret; 6220 6221 perf_event_header__init_id(&read_event.header, &sample, event); 6222 ret = perf_output_begin(&handle, event, read_event.header.size); 6223 if (ret) 6224 return; 6225 6226 perf_output_put(&handle, read_event); 6227 perf_output_read(&handle, event); 6228 perf_event__output_id_sample(event, &handle, &sample); 6229 6230 perf_output_end(&handle); 6231 } 6232 6233 typedef void (perf_iterate_f)(struct perf_event *event, void *data); 6234 6235 static void 6236 perf_iterate_ctx(struct perf_event_context *ctx, 6237 perf_iterate_f output, 6238 void *data, bool all) 6239 { 6240 struct perf_event *event; 6241 6242 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 6243 if (!all) { 6244 if (event->state < PERF_EVENT_STATE_INACTIVE) 6245 continue; 6246 if (!event_filter_match(event)) 6247 continue; 6248 } 6249 6250 output(event, data); 6251 } 6252 } 6253 6254 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) 6255 { 6256 struct pmu_event_list *pel = 
this_cpu_ptr(&pmu_sb_events); 6257 struct perf_event *event; 6258 6259 list_for_each_entry_rcu(event, &pel->list, sb_list) { 6260 /* 6261 * Skip events that are not fully formed yet; ensure that 6262 * if we observe event->ctx, both event and ctx will be 6263 * complete enough. See perf_install_in_context(). 6264 */ 6265 if (!smp_load_acquire(&event->ctx)) 6266 continue; 6267 6268 if (event->state < PERF_EVENT_STATE_INACTIVE) 6269 continue; 6270 if (!event_filter_match(event)) 6271 continue; 6272 output(event, data); 6273 } 6274 } 6275 6276 /* 6277 * Iterate all events that need to receive side-band events. 6278 * 6279 * For new callers; ensure that account_pmu_sb_event() includes 6280 * your event, otherwise it might not get delivered. 6281 */ 6282 static void 6283 perf_iterate_sb(perf_iterate_f output, void *data, 6284 struct perf_event_context *task_ctx) 6285 { 6286 struct perf_event_context *ctx; 6287 int ctxn; 6288 6289 rcu_read_lock(); 6290 preempt_disable(); 6291 6292 /* 6293 * If we have task_ctx != NULL we only notify the task context itself. 6294 * The task_ctx is set only for EXIT events before releasing task 6295 * context. 6296 */ 6297 if (task_ctx) { 6298 perf_iterate_ctx(task_ctx, output, data, false); 6299 goto done; 6300 } 6301 6302 perf_iterate_sb_cpu(output, data); 6303 6304 for_each_task_context_nr(ctxn) { 6305 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 6306 if (ctx) 6307 perf_iterate_ctx(ctx, output, data, false); 6308 } 6309 done: 6310 preempt_enable(); 6311 rcu_read_unlock(); 6312 } 6313 6314 /* 6315 * Clear all file-based filters at exec, they'll have to be 6316 * re-instated when/if these objects are mmapped again. 6317 */ 6318 static void perf_event_addr_filters_exec(struct perf_event *event, void *data) 6319 { 6320 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 6321 struct perf_addr_filter *filter; 6322 unsigned int restart = 0, count = 0; 6323 unsigned long flags; 6324 6325 if (!has_addr_filter(event)) 6326 return; 6327 6328 raw_spin_lock_irqsave(&ifh->lock, flags); 6329 list_for_each_entry(filter, &ifh->list, entry) { 6330 if (filter->inode) { 6331 event->addr_filters_offs[count] = 0; 6332 restart++; 6333 } 6334 6335 count++; 6336 } 6337 6338 if (restart) 6339 event->addr_filters_gen++; 6340 raw_spin_unlock_irqrestore(&ifh->lock, flags); 6341 6342 if (restart) 6343 perf_event_stop(event, 1); 6344 } 6345 6346 void perf_event_exec(void) 6347 { 6348 struct perf_event_context *ctx; 6349 int ctxn; 6350 6351 rcu_read_lock(); 6352 for_each_task_context_nr(ctxn) { 6353 ctx = current->perf_event_ctxp[ctxn]; 6354 if (!ctx) 6355 continue; 6356 6357 perf_event_enable_on_exec(ctxn); 6358 6359 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, 6360 true); 6361 } 6362 rcu_read_unlock(); 6363 } 6364 6365 struct remote_output { 6366 struct ring_buffer *rb; 6367 int err; 6368 }; 6369 6370 static void __perf_event_output_stop(struct perf_event *event, void *data) 6371 { 6372 struct perf_event *parent = event->parent; 6373 struct remote_output *ro = data; 6374 struct ring_buffer *rb = ro->rb; 6375 struct stop_event_data sd = { 6376 .event = event, 6377 }; 6378 6379 if (!has_aux(event)) 6380 return; 6381 6382 if (!parent) 6383 parent = event; 6384 6385 /* 6386 * In case of inheritance, it will be the parent that links to the 6387 * ring-buffer, but it will be the child that's actually using it. 
6388 * 6389 * We are using event::rb to determine if the event should be stopped, 6390 * however this may race with ring_buffer_attach() (through set_output), 6391 * which will make us skip the event that actually needs to be stopped. 6392 * So ring_buffer_attach() has to stop an aux event before re-assigning 6393 * its rb pointer. 6394 */ 6395 if (rcu_dereference(parent->rb) == rb) 6396 ro->err = __perf_event_stop(&sd); 6397 } 6398 6399 static int __perf_pmu_output_stop(void *info) 6400 { 6401 struct perf_event *event = info; 6402 struct pmu *pmu = event->pmu; 6403 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 6404 struct remote_output ro = { 6405 .rb = event->rb, 6406 }; 6407 6408 rcu_read_lock(); 6409 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); 6410 if (cpuctx->task_ctx) 6411 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, 6412 &ro, false); 6413 rcu_read_unlock(); 6414 6415 return ro.err; 6416 } 6417 6418 static void perf_pmu_output_stop(struct perf_event *event) 6419 { 6420 struct perf_event *iter; 6421 int err, cpu; 6422 6423 restart: 6424 rcu_read_lock(); 6425 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { 6426 /* 6427 * For per-CPU events, we need to make sure that neither they 6428 * nor their children are running; for cpu==-1 events it's 6429 * sufficient to stop the event itself if it's active, since 6430 * it can't have children. 6431 */ 6432 cpu = iter->cpu; 6433 if (cpu == -1) 6434 cpu = READ_ONCE(iter->oncpu); 6435 6436 if (cpu == -1) 6437 continue; 6438 6439 err = cpu_function_call(cpu, __perf_pmu_output_stop, event); 6440 if (err == -EAGAIN) { 6441 rcu_read_unlock(); 6442 goto restart; 6443 } 6444 } 6445 rcu_read_unlock(); 6446 } 6447 6448 /* 6449 * task tracking -- fork/exit 6450 * 6451 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task 6452 */ 6453 6454 struct perf_task_event { 6455 struct task_struct *task; 6456 struct perf_event_context *task_ctx; 6457 6458 struct { 6459 struct perf_event_header header; 6460 6461 u32 pid; 6462 u32 ppid; 6463 u32 tid; 6464 u32 ptid; 6465 u64 time; 6466 } event_id; 6467 }; 6468 6469 static int perf_event_task_match(struct perf_event *event) 6470 { 6471 return event->attr.comm || event->attr.mmap || 6472 event->attr.mmap2 || event->attr.mmap_data || 6473 event->attr.task; 6474 } 6475 6476 static void perf_event_task_output(struct perf_event *event, 6477 void *data) 6478 { 6479 struct perf_task_event *task_event = data; 6480 struct perf_output_handle handle; 6481 struct perf_sample_data sample; 6482 struct task_struct *task = task_event->task; 6483 int ret, size = task_event->event_id.header.size; 6484 6485 if (!perf_event_task_match(event)) 6486 return; 6487 6488 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 6489 6490 ret = perf_output_begin(&handle, event, 6491 task_event->event_id.header.size); 6492 if (ret) 6493 goto out; 6494 6495 task_event->event_id.pid = perf_event_pid(event, task); 6496 task_event->event_id.ppid = perf_event_pid(event, current); 6497 6498 task_event->event_id.tid = perf_event_tid(event, task); 6499 task_event->event_id.ptid = perf_event_tid(event, current); 6500 6501 task_event->event_id.time = perf_event_clock(event); 6502 6503 perf_output_put(&handle, task_event->event_id); 6504 6505 perf_event__output_id_sample(event, &handle, &sample); 6506 6507 perf_output_end(&handle); 6508 out: 6509 task_event->event_id.header.size = size; 6510 } 6511 6512 static void perf_event_task(struct 
task_struct *task, 6513 struct perf_event_context *task_ctx, 6514 int new) 6515 { 6516 struct perf_task_event task_event; 6517 6518 if (!atomic_read(&nr_comm_events) && 6519 !atomic_read(&nr_mmap_events) && 6520 !atomic_read(&nr_task_events)) 6521 return; 6522 6523 task_event = (struct perf_task_event){ 6524 .task = task, 6525 .task_ctx = task_ctx, 6526 .event_id = { 6527 .header = { 6528 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, 6529 .misc = 0, 6530 .size = sizeof(task_event.event_id), 6531 }, 6532 /* .pid */ 6533 /* .ppid */ 6534 /* .tid */ 6535 /* .ptid */ 6536 /* .time */ 6537 }, 6538 }; 6539 6540 perf_iterate_sb(perf_event_task_output, 6541 &task_event, 6542 task_ctx); 6543 } 6544 6545 void perf_event_fork(struct task_struct *task) 6546 { 6547 perf_event_task(task, NULL, 1); 6548 perf_event_namespaces(task); 6549 } 6550 6551 /* 6552 * comm tracking 6553 */ 6554 6555 struct perf_comm_event { 6556 struct task_struct *task; 6557 char *comm; 6558 int comm_size; 6559 6560 struct { 6561 struct perf_event_header header; 6562 6563 u32 pid; 6564 u32 tid; 6565 } event_id; 6566 }; 6567 6568 static int perf_event_comm_match(struct perf_event *event) 6569 { 6570 return event->attr.comm; 6571 } 6572 6573 static void perf_event_comm_output(struct perf_event *event, 6574 void *data) 6575 { 6576 struct perf_comm_event *comm_event = data; 6577 struct perf_output_handle handle; 6578 struct perf_sample_data sample; 6579 int size = comm_event->event_id.header.size; 6580 int ret; 6581 6582 if (!perf_event_comm_match(event)) 6583 return; 6584 6585 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 6586 ret = perf_output_begin(&handle, event, 6587 comm_event->event_id.header.size); 6588 6589 if (ret) 6590 goto out; 6591 6592 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 6593 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 6594 6595 perf_output_put(&handle, comm_event->event_id); 6596 __output_copy(&handle, comm_event->comm, 6597 comm_event->comm_size); 6598 6599 perf_event__output_id_sample(event, &handle, &sample); 6600 6601 perf_output_end(&handle); 6602 out: 6603 comm_event->event_id.header.size = size; 6604 } 6605 6606 static void perf_event_comm_event(struct perf_comm_event *comm_event) 6607 { 6608 char comm[TASK_COMM_LEN]; 6609 unsigned int size; 6610 6611 memset(comm, 0, sizeof(comm)); 6612 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 6613 size = ALIGN(strlen(comm)+1, sizeof(u64)); 6614 6615 comm_event->comm = comm; 6616 comm_event->comm_size = size; 6617 6618 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 6619 6620 perf_iterate_sb(perf_event_comm_output, 6621 comm_event, 6622 NULL); 6623 } 6624 6625 void perf_event_comm(struct task_struct *task, bool exec) 6626 { 6627 struct perf_comm_event comm_event; 6628 6629 if (!atomic_read(&nr_comm_events)) 6630 return; 6631 6632 comm_event = (struct perf_comm_event){ 6633 .task = task, 6634 /* .comm */ 6635 /* .comm_size */ 6636 .event_id = { 6637 .header = { 6638 .type = PERF_RECORD_COMM, 6639 .misc = exec ? 
PERF_RECORD_MISC_COMM_EXEC : 0, 6640 /* .size */ 6641 }, 6642 /* .pid */ 6643 /* .tid */ 6644 }, 6645 }; 6646 6647 perf_event_comm_event(&comm_event); 6648 } 6649 6650 /* 6651 * namespaces tracking 6652 */ 6653 6654 struct perf_namespaces_event { 6655 struct task_struct *task; 6656 6657 struct { 6658 struct perf_event_header header; 6659 6660 u32 pid; 6661 u32 tid; 6662 u64 nr_namespaces; 6663 struct perf_ns_link_info link_info[NR_NAMESPACES]; 6664 } event_id; 6665 }; 6666 6667 static int perf_event_namespaces_match(struct perf_event *event) 6668 { 6669 return event->attr.namespaces; 6670 } 6671 6672 static void perf_event_namespaces_output(struct perf_event *event, 6673 void *data) 6674 { 6675 struct perf_namespaces_event *namespaces_event = data; 6676 struct perf_output_handle handle; 6677 struct perf_sample_data sample; 6678 u16 header_size = namespaces_event->event_id.header.size; 6679 int ret; 6680 6681 if (!perf_event_namespaces_match(event)) 6682 return; 6683 6684 perf_event_header__init_id(&namespaces_event->event_id.header, 6685 &sample, event); 6686 ret = perf_output_begin(&handle, event, 6687 namespaces_event->event_id.header.size); 6688 if (ret) 6689 goto out; 6690 6691 namespaces_event->event_id.pid = perf_event_pid(event, 6692 namespaces_event->task); 6693 namespaces_event->event_id.tid = perf_event_tid(event, 6694 namespaces_event->task); 6695 6696 perf_output_put(&handle, namespaces_event->event_id); 6697 6698 perf_event__output_id_sample(event, &handle, &sample); 6699 6700 perf_output_end(&handle); 6701 out: 6702 namespaces_event->event_id.header.size = header_size; 6703 } 6704 6705 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, 6706 struct task_struct *task, 6707 const struct proc_ns_operations *ns_ops) 6708 { 6709 struct path ns_path; 6710 struct inode *ns_inode; 6711 void *error; 6712 6713 error = ns_get_path(&ns_path, task, ns_ops); 6714 if (!error) { 6715 ns_inode = ns_path.dentry->d_inode; 6716 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); 6717 ns_link_info->ino = ns_inode->i_ino; 6718 path_put(&ns_path); 6719 } 6720 } 6721 6722 void perf_event_namespaces(struct task_struct *task) 6723 { 6724 struct perf_namespaces_event namespaces_event; 6725 struct perf_ns_link_info *ns_link_info; 6726 6727 if (!atomic_read(&nr_namespaces_events)) 6728 return; 6729 6730 namespaces_event = (struct perf_namespaces_event){ 6731 .task = task, 6732 .event_id = { 6733 .header = { 6734 .type = PERF_RECORD_NAMESPACES, 6735 .misc = 0, 6736 .size = sizeof(namespaces_event.event_id), 6737 }, 6738 /* .pid */ 6739 /* .tid */ 6740 .nr_namespaces = NR_NAMESPACES, 6741 /* .link_info[NR_NAMESPACES] */ 6742 }, 6743 }; 6744 6745 ns_link_info = namespaces_event.event_id.link_info; 6746 6747 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], 6748 task, &mntns_operations); 6749 6750 #ifdef CONFIG_USER_NS 6751 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], 6752 task, &userns_operations); 6753 #endif 6754 #ifdef CONFIG_NET_NS 6755 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], 6756 task, &netns_operations); 6757 #endif 6758 #ifdef CONFIG_UTS_NS 6759 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], 6760 task, &utsns_operations); 6761 #endif 6762 #ifdef CONFIG_IPC_NS 6763 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], 6764 task, &ipcns_operations); 6765 #endif 6766 #ifdef CONFIG_PID_NS 6767 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], 6768 task, &pidns_operations); 6769 #endif 6770 #ifdef CONFIG_CGROUPS 6771 
perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], 6772 task, &cgroupns_operations); 6773 #endif 6774 6775 perf_iterate_sb(perf_event_namespaces_output, 6776 &namespaces_event, 6777 NULL); 6778 } 6779 6780 /* 6781 * mmap tracking 6782 */ 6783 6784 struct perf_mmap_event { 6785 struct vm_area_struct *vma; 6786 6787 const char *file_name; 6788 int file_size; 6789 int maj, min; 6790 u64 ino; 6791 u64 ino_generation; 6792 u32 prot, flags; 6793 6794 struct { 6795 struct perf_event_header header; 6796 6797 u32 pid; 6798 u32 tid; 6799 u64 start; 6800 u64 len; 6801 u64 pgoff; 6802 } event_id; 6803 }; 6804 6805 static int perf_event_mmap_match(struct perf_event *event, 6806 void *data) 6807 { 6808 struct perf_mmap_event *mmap_event = data; 6809 struct vm_area_struct *vma = mmap_event->vma; 6810 int executable = vma->vm_flags & VM_EXEC; 6811 6812 return (!executable && event->attr.mmap_data) || 6813 (executable && (event->attr.mmap || event->attr.mmap2)); 6814 } 6815 6816 static void perf_event_mmap_output(struct perf_event *event, 6817 void *data) 6818 { 6819 struct perf_mmap_event *mmap_event = data; 6820 struct perf_output_handle handle; 6821 struct perf_sample_data sample; 6822 int size = mmap_event->event_id.header.size; 6823 int ret; 6824 6825 if (!perf_event_mmap_match(event, data)) 6826 return; 6827 6828 if (event->attr.mmap2) { 6829 mmap_event->event_id.header.type = PERF_RECORD_MMAP2; 6830 mmap_event->event_id.header.size += sizeof(mmap_event->maj); 6831 mmap_event->event_id.header.size += sizeof(mmap_event->min); 6832 mmap_event->event_id.header.size += sizeof(mmap_event->ino); 6833 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); 6834 mmap_event->event_id.header.size += sizeof(mmap_event->prot); 6835 mmap_event->event_id.header.size += sizeof(mmap_event->flags); 6836 } 6837 6838 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 6839 ret = perf_output_begin(&handle, event, 6840 mmap_event->event_id.header.size); 6841 if (ret) 6842 goto out; 6843 6844 mmap_event->event_id.pid = perf_event_pid(event, current); 6845 mmap_event->event_id.tid = perf_event_tid(event, current); 6846 6847 perf_output_put(&handle, mmap_event->event_id); 6848 6849 if (event->attr.mmap2) { 6850 perf_output_put(&handle, mmap_event->maj); 6851 perf_output_put(&handle, mmap_event->min); 6852 perf_output_put(&handle, mmap_event->ino); 6853 perf_output_put(&handle, mmap_event->ino_generation); 6854 perf_output_put(&handle, mmap_event->prot); 6855 perf_output_put(&handle, mmap_event->flags); 6856 } 6857 6858 __output_copy(&handle, mmap_event->file_name, 6859 mmap_event->file_size); 6860 6861 perf_event__output_id_sample(event, &handle, &sample); 6862 6863 perf_output_end(&handle); 6864 out: 6865 mmap_event->event_id.header.size = size; 6866 } 6867 6868 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 6869 { 6870 struct vm_area_struct *vma = mmap_event->vma; 6871 struct file *file = vma->vm_file; 6872 int maj = 0, min = 0; 6873 u64 ino = 0, gen = 0; 6874 u32 prot = 0, flags = 0; 6875 unsigned int size; 6876 char tmp[16]; 6877 char *buf = NULL; 6878 char *name; 6879 6880 if (vma->vm_flags & VM_READ) 6881 prot |= PROT_READ; 6882 if (vma->vm_flags & VM_WRITE) 6883 prot |= PROT_WRITE; 6884 if (vma->vm_flags & VM_EXEC) 6885 prot |= PROT_EXEC; 6886 6887 if (vma->vm_flags & VM_MAYSHARE) 6888 flags = MAP_SHARED; 6889 else 6890 flags = MAP_PRIVATE; 6891 6892 if (vma->vm_flags & VM_DENYWRITE) 6893 flags |= MAP_DENYWRITE; 6894 if (vma->vm_flags & VM_MAYEXEC) 6895 
flags |= MAP_EXECUTABLE; 6896 if (vma->vm_flags & VM_LOCKED) 6897 flags |= MAP_LOCKED; 6898 if (vma->vm_flags & VM_HUGETLB) 6899 flags |= MAP_HUGETLB; 6900 6901 if (file) { 6902 struct inode *inode; 6903 dev_t dev; 6904 6905 buf = kmalloc(PATH_MAX, GFP_KERNEL); 6906 if (!buf) { 6907 name = "//enomem"; 6908 goto cpy_name; 6909 } 6910 /* 6911 * d_path() works from the end of the rb backwards, so we 6912 * need to add enough zero bytes after the string to handle 6913 * the 64bit alignment we do later. 6914 */ 6915 name = file_path(file, buf, PATH_MAX - sizeof(u64)); 6916 if (IS_ERR(name)) { 6917 name = "//toolong"; 6918 goto cpy_name; 6919 } 6920 inode = file_inode(vma->vm_file); 6921 dev = inode->i_sb->s_dev; 6922 ino = inode->i_ino; 6923 gen = inode->i_generation; 6924 maj = MAJOR(dev); 6925 min = MINOR(dev); 6926 6927 goto got_name; 6928 } else { 6929 if (vma->vm_ops && vma->vm_ops->name) { 6930 name = (char *) vma->vm_ops->name(vma); 6931 if (name) 6932 goto cpy_name; 6933 } 6934 6935 name = (char *)arch_vma_name(vma); 6936 if (name) 6937 goto cpy_name; 6938 6939 if (vma->vm_start <= vma->vm_mm->start_brk && 6940 vma->vm_end >= vma->vm_mm->brk) { 6941 name = "[heap]"; 6942 goto cpy_name; 6943 } 6944 if (vma->vm_start <= vma->vm_mm->start_stack && 6945 vma->vm_end >= vma->vm_mm->start_stack) { 6946 name = "[stack]"; 6947 goto cpy_name; 6948 } 6949 6950 name = "//anon"; 6951 goto cpy_name; 6952 } 6953 6954 cpy_name: 6955 strlcpy(tmp, name, sizeof(tmp)); 6956 name = tmp; 6957 got_name: 6958 /* 6959 * Since our buffer works in 8 byte units we need to align our string 6960 * size to a multiple of 8. However, we must guarantee the tail end is 6961 * zero'd out to avoid leaking random bits to userspace. 6962 */ 6963 size = strlen(name)+1; 6964 while (!IS_ALIGNED(size, sizeof(u64))) 6965 name[size++] = '\0'; 6966 6967 mmap_event->file_name = name; 6968 mmap_event->file_size = size; 6969 mmap_event->maj = maj; 6970 mmap_event->min = min; 6971 mmap_event->ino = ino; 6972 mmap_event->ino_generation = gen; 6973 mmap_event->prot = prot; 6974 mmap_event->flags = flags; 6975 6976 if (!(vma->vm_flags & VM_EXEC)) 6977 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 6978 6979 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 6980 6981 perf_iterate_sb(perf_event_mmap_output, 6982 mmap_event, 6983 NULL); 6984 6985 kfree(buf); 6986 } 6987 6988 /* 6989 * Check whether inode and address range match filter criteria. 
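 * For example (illustrative values): a filter with offset 0x1000 and size 0x2000, i.e. file range [0x1000, 0x3000), matches a VMA mapping file offsets [0x2000, 0x4000) of the same inode, because the two ranges overlap.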
6990 */ 6991 static bool perf_addr_filter_match(struct perf_addr_filter *filter, 6992 struct file *file, unsigned long offset, 6993 unsigned long size) 6994 { 6995 if (filter->inode != file_inode(file)) 6996 return false; 6997 6998 if (filter->offset > offset + size) 6999 return false; 7000 7001 if (filter->offset + filter->size < offset) 7002 return false; 7003 7004 return true; 7005 } 7006 7007 static void __perf_addr_filters_adjust(struct perf_event *event, void *data) 7008 { 7009 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 7010 struct vm_area_struct *vma = data; 7011 unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; 7012 struct file *file = vma->vm_file; 7013 struct perf_addr_filter *filter; 7014 unsigned int restart = 0, count = 0; 7015 7016 if (!has_addr_filter(event)) 7017 return; 7018 7019 if (!file) 7020 return; 7021 7022 raw_spin_lock_irqsave(&ifh->lock, flags); 7023 list_for_each_entry(filter, &ifh->list, entry) { 7024 if (perf_addr_filter_match(filter, file, off, 7025 vma->vm_end - vma->vm_start)) { 7026 event->addr_filters_offs[count] = vma->vm_start; 7027 restart++; 7028 } 7029 7030 count++; 7031 } 7032 7033 if (restart) 7034 event->addr_filters_gen++; 7035 raw_spin_unlock_irqrestore(&ifh->lock, flags); 7036 7037 if (restart) 7038 perf_event_stop(event, 1); 7039 } 7040 7041 /* 7042 * Adjust all task's events' filters to the new vma 7043 */ 7044 static void perf_addr_filters_adjust(struct vm_area_struct *vma) 7045 { 7046 struct perf_event_context *ctx; 7047 int ctxn; 7048 7049 /* 7050 * Data tracing isn't supported yet and as such there is no need 7051 * to keep track of anything that isn't related to executable code: 7052 */ 7053 if (!(vma->vm_flags & VM_EXEC)) 7054 return; 7055 7056 rcu_read_lock(); 7057 for_each_task_context_nr(ctxn) { 7058 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 7059 if (!ctx) 7060 continue; 7061 7062 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); 7063 } 7064 rcu_read_unlock(); 7065 } 7066 7067 void perf_event_mmap(struct vm_area_struct *vma) 7068 { 7069 struct perf_mmap_event mmap_event; 7070 7071 if (!atomic_read(&nr_mmap_events)) 7072 return; 7073 7074 mmap_event = (struct perf_mmap_event){ 7075 .vma = vma, 7076 /* .file_name */ 7077 /* .file_size */ 7078 .event_id = { 7079 .header = { 7080 .type = PERF_RECORD_MMAP, 7081 .misc = PERF_RECORD_MISC_USER, 7082 /* .size */ 7083 }, 7084 /* .pid */ 7085 /* .tid */ 7086 .start = vma->vm_start, 7087 .len = vma->vm_end - vma->vm_start, 7088 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 7089 }, 7090 /* .maj (attr_mmap2 only) */ 7091 /* .min (attr_mmap2 only) */ 7092 /* .ino (attr_mmap2 only) */ 7093 /* .ino_generation (attr_mmap2 only) */ 7094 /* .prot (attr_mmap2 only) */ 7095 /* .flags (attr_mmap2 only) */ 7096 }; 7097 7098 perf_addr_filters_adjust(vma); 7099 perf_event_mmap_event(&mmap_event); 7100 } 7101 7102 void perf_event_aux_event(struct perf_event *event, unsigned long head, 7103 unsigned long size, u64 flags) 7104 { 7105 struct perf_output_handle handle; 7106 struct perf_sample_data sample; 7107 struct perf_aux_event { 7108 struct perf_event_header header; 7109 u64 offset; 7110 u64 size; 7111 u64 flags; 7112 } rec = { 7113 .header = { 7114 .type = PERF_RECORD_AUX, 7115 .misc = 0, 7116 .size = sizeof(rec), 7117 }, 7118 .offset = head, 7119 .size = size, 7120 .flags = flags, 7121 }; 7122 int ret; 7123 7124 perf_event_header__init_id(&rec.header, &sample, event); 7125 ret = perf_output_begin(&handle, event, rec.header.size); 7126 7127 if (ret) 7128 
return; 7129 7130 perf_output_put(&handle, rec); 7131 perf_event__output_id_sample(event, &handle, &sample); 7132 7133 perf_output_end(&handle); 7134 } 7135 7136 /* 7137 * Lost/dropped samples logging 7138 */ 7139 void perf_log_lost_samples(struct perf_event *event, u64 lost) 7140 { 7141 struct perf_output_handle handle; 7142 struct perf_sample_data sample; 7143 int ret; 7144 7145 struct { 7146 struct perf_event_header header; 7147 u64 lost; 7148 } lost_samples_event = { 7149 .header = { 7150 .type = PERF_RECORD_LOST_SAMPLES, 7151 .misc = 0, 7152 .size = sizeof(lost_samples_event), 7153 }, 7154 .lost = lost, 7155 }; 7156 7157 perf_event_header__init_id(&lost_samples_event.header, &sample, event); 7158 7159 ret = perf_output_begin(&handle, event, 7160 lost_samples_event.header.size); 7161 if (ret) 7162 return; 7163 7164 perf_output_put(&handle, lost_samples_event); 7165 perf_event__output_id_sample(event, &handle, &sample); 7166 perf_output_end(&handle); 7167 } 7168 7169 /* 7170 * context_switch tracking 7171 */ 7172 7173 struct perf_switch_event { 7174 struct task_struct *task; 7175 struct task_struct *next_prev; 7176 7177 struct { 7178 struct perf_event_header header; 7179 u32 next_prev_pid; 7180 u32 next_prev_tid; 7181 } event_id; 7182 }; 7183 7184 static int perf_event_switch_match(struct perf_event *event) 7185 { 7186 return event->attr.context_switch; 7187 } 7188 7189 static void perf_event_switch_output(struct perf_event *event, void *data) 7190 { 7191 struct perf_switch_event *se = data; 7192 struct perf_output_handle handle; 7193 struct perf_sample_data sample; 7194 int ret; 7195 7196 if (!perf_event_switch_match(event)) 7197 return; 7198 7199 /* Only CPU-wide events are allowed to see next/prev pid/tid */ 7200 if (event->ctx->task) { 7201 se->event_id.header.type = PERF_RECORD_SWITCH; 7202 se->event_id.header.size = sizeof(se->event_id.header); 7203 } else { 7204 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; 7205 se->event_id.header.size = sizeof(se->event_id); 7206 se->event_id.next_prev_pid = 7207 perf_event_pid(event, se->next_prev); 7208 se->event_id.next_prev_tid = 7209 perf_event_tid(event, se->next_prev); 7210 } 7211 7212 perf_event_header__init_id(&se->event_id.header, &sample, event); 7213 7214 ret = perf_output_begin(&handle, event, se->event_id.header.size); 7215 if (ret) 7216 return; 7217 7218 if (event->ctx->task) 7219 perf_output_put(&handle, se->event_id.header); 7220 else 7221 perf_output_put(&handle, se->event_id); 7222 7223 perf_event__output_id_sample(event, &handle, &sample); 7224 7225 perf_output_end(&handle); 7226 } 7227 7228 static void perf_event_switch(struct task_struct *task, 7229 struct task_struct *next_prev, bool sched_in) 7230 { 7231 struct perf_switch_event switch_event; 7232 7233 /* N.B. caller checks nr_switch_events != 0 */ 7234 7235 switch_event = (struct perf_switch_event){ 7236 .task = task, 7237 .next_prev = next_prev, 7238 .event_id = { 7239 .header = { 7240 /* .type */ 7241 .misc = sched_in ? 
0 : PERF_RECORD_MISC_SWITCH_OUT, 7242 /* .size */ 7243 }, 7244 /* .next_prev_pid */ 7245 /* .next_prev_tid */ 7246 }, 7247 }; 7248 7249 perf_iterate_sb(perf_event_switch_output, 7250 &switch_event, 7251 NULL); 7252 } 7253 7254 /* 7255 * IRQ throttle logging 7256 */ 7257 7258 static void perf_log_throttle(struct perf_event *event, int enable) 7259 { 7260 struct perf_output_handle handle; 7261 struct perf_sample_data sample; 7262 int ret; 7263 7264 struct { 7265 struct perf_event_header header; 7266 u64 time; 7267 u64 id; 7268 u64 stream_id; 7269 } throttle_event = { 7270 .header = { 7271 .type = PERF_RECORD_THROTTLE, 7272 .misc = 0, 7273 .size = sizeof(throttle_event), 7274 }, 7275 .time = perf_event_clock(event), 7276 .id = primary_event_id(event), 7277 .stream_id = event->id, 7278 }; 7279 7280 if (enable) 7281 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 7282 7283 perf_event_header__init_id(&throttle_event.header, &sample, event); 7284 7285 ret = perf_output_begin(&handle, event, 7286 throttle_event.header.size); 7287 if (ret) 7288 return; 7289 7290 perf_output_put(&handle, throttle_event); 7291 perf_event__output_id_sample(event, &handle, &sample); 7292 perf_output_end(&handle); 7293 } 7294 7295 void perf_event_itrace_started(struct perf_event *event) 7296 { 7297 event->attach_state |= PERF_ATTACH_ITRACE; 7298 } 7299 7300 static void perf_log_itrace_start(struct perf_event *event) 7301 { 7302 struct perf_output_handle handle; 7303 struct perf_sample_data sample; 7304 struct perf_aux_event { 7305 struct perf_event_header header; 7306 u32 pid; 7307 u32 tid; 7308 } rec; 7309 int ret; 7310 7311 if (event->parent) 7312 event = event->parent; 7313 7314 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || 7315 event->attach_state & PERF_ATTACH_ITRACE) 7316 return; 7317 7318 rec.header.type = PERF_RECORD_ITRACE_START; 7319 rec.header.misc = 0; 7320 rec.header.size = sizeof(rec); 7321 rec.pid = perf_event_pid(event, current); 7322 rec.tid = perf_event_tid(event, current); 7323 7324 perf_event_header__init_id(&rec.header, &sample, event); 7325 ret = perf_output_begin(&handle, event, rec.header.size); 7326 7327 if (ret) 7328 return; 7329 7330 perf_output_put(&handle, rec); 7331 perf_event__output_id_sample(event, &handle, &sample); 7332 7333 perf_output_end(&handle); 7334 } 7335 7336 static int 7337 __perf_event_account_interrupt(struct perf_event *event, int throttle) 7338 { 7339 struct hw_perf_event *hwc = &event->hw; 7340 int ret = 0; 7341 u64 seq; 7342 7343 seq = __this_cpu_read(perf_throttled_seq); 7344 if (seq != hwc->interrupts_seq) { 7345 hwc->interrupts_seq = seq; 7346 hwc->interrupts = 1; 7347 } else { 7348 hwc->interrupts++; 7349 if (unlikely(throttle 7350 && hwc->interrupts >= max_samples_per_tick)) { 7351 __this_cpu_inc(perf_throttled_count); 7352 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); 7353 hwc->interrupts = MAX_INTERRUPTS; 7354 perf_log_throttle(event, 0); 7355 ret = 1; 7356 } 7357 } 7358 7359 if (event->attr.freq) { 7360 u64 now = perf_clock(); 7361 s64 delta = now - hwc->freq_time_stamp; 7362 7363 hwc->freq_time_stamp = now; 7364 7365 if (delta > 0 && delta < 2*TICK_NSEC) 7366 perf_adjust_period(event, delta, hwc->last_period, true); 7367 } 7368 7369 return ret; 7370 } 7371 7372 int perf_event_account_interrupt(struct perf_event *event) 7373 { 7374 return __perf_event_account_interrupt(event, 1); 7375 } 7376 7377 /* 7378 * Generic event overflow handling, sampling. 
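 * An overflow first accounts the interrupt (which may throttle the event). If the user armed an event_limit (typically via the PERF_EVENT_IOC_REFRESH ioctl), one unit is consumed and, once it hits zero, the event signals POLL_HUP and is disabled. The overflow handler then runs and, if async notification was requested, a wakeup is queued from IRQ work.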
7379 */ 7380 7381 static int __perf_event_overflow(struct perf_event *event, 7382 int throttle, struct perf_sample_data *data, 7383 struct pt_regs *regs) 7384 { 7385 int events = atomic_read(&event->event_limit); 7386 int ret = 0; 7387 7388 /* 7389 * Non-sampling counters might still use the PMI to fold short 7390 * hardware counters, ignore those. 7391 */ 7392 if (unlikely(!is_sampling_event(event))) 7393 return 0; 7394 7395 ret = __perf_event_account_interrupt(event, throttle); 7396 7397 /* 7398 * XXX event_limit might not quite work as expected on inherited 7399 * events 7400 */ 7401 7402 event->pending_kill = POLL_IN; 7403 if (events && atomic_dec_and_test(&event->event_limit)) { 7404 ret = 1; 7405 event->pending_kill = POLL_HUP; 7406 7407 perf_event_disable_inatomic(event); 7408 } 7409 7410 READ_ONCE(event->overflow_handler)(event, data, regs); 7411 7412 if (*perf_event_fasync(event) && event->pending_kill) { 7413 event->pending_wakeup = 1; 7414 irq_work_queue(&event->pending); 7415 } 7416 7417 return ret; 7418 } 7419 7420 int perf_event_overflow(struct perf_event *event, 7421 struct perf_sample_data *data, 7422 struct pt_regs *regs) 7423 { 7424 return __perf_event_overflow(event, 1, data, regs); 7425 } 7426 7427 /* 7428 * Generic software event infrastructure 7429 */ 7430 7431 struct swevent_htable { 7432 struct swevent_hlist *swevent_hlist; 7433 struct mutex hlist_mutex; 7434 int hlist_refcount; 7435 7436 /* Recursion avoidance in each contexts */ 7437 int recursion[PERF_NR_CONTEXTS]; 7438 }; 7439 7440 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); 7441 7442 /* 7443 * We directly increment event->count and keep a second value in 7444 * event->hw.period_left to count intervals. This period event 7445 * is kept in the range [-sample_period, 0] so that we can use the 7446 * sign as trigger. 7447 */ 7448 7449 u64 perf_swevent_set_period(struct perf_event *event) 7450 { 7451 struct hw_perf_event *hwc = &event->hw; 7452 u64 period = hwc->last_period; 7453 u64 nr, offset; 7454 s64 old, val; 7455 7456 hwc->last_period = hwc->sample_period; 7457 7458 again: 7459 old = val = local64_read(&hwc->period_left); 7460 if (val < 0) 7461 return 0; 7462 7463 nr = div64_u64(period + val, period); 7464 offset = nr * period; 7465 val -= offset; 7466 if (local64_cmpxchg(&hwc->period_left, old, val) != old) 7467 goto again; 7468 7469 return nr; 7470 } 7471 7472 static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 7473 struct perf_sample_data *data, 7474 struct pt_regs *regs) 7475 { 7476 struct hw_perf_event *hwc = &event->hw; 7477 int throttle = 0; 7478 7479 if (!overflow) 7480 overflow = perf_swevent_set_period(event); 7481 7482 if (hwc->interrupts == MAX_INTERRUPTS) 7483 return; 7484 7485 for (; overflow; overflow--) { 7486 if (__perf_event_overflow(event, throttle, 7487 data, regs)) { 7488 /* 7489 * We inhibit the overflow from happening when 7490 * hwc->interrupts == MAX_INTERRUPTS. 
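 * (MAX_INTERRUPTS is the 'throttled' marker set by __perf_event_account_interrupt(); further overflows are suppressed until the event is unthrottled again from the timer tick.)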
7491 */ 7492 break; 7493 } 7494 throttle = 1; 7495 } 7496 } 7497 7498 static void perf_swevent_event(struct perf_event *event, u64 nr, 7499 struct perf_sample_data *data, 7500 struct pt_regs *regs) 7501 { 7502 struct hw_perf_event *hwc = &event->hw; 7503 7504 local64_add(nr, &event->count); 7505 7506 if (!regs) 7507 return; 7508 7509 if (!is_sampling_event(event)) 7510 return; 7511 7512 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { 7513 data->period = nr; 7514 return perf_swevent_overflow(event, 1, data, regs); 7515 } else 7516 data->period = event->hw.last_period; 7517 7518 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 7519 return perf_swevent_overflow(event, 1, data, regs); 7520 7521 if (local64_add_negative(nr, &hwc->period_left)) 7522 return; 7523 7524 perf_swevent_overflow(event, 0, data, regs); 7525 } 7526 7527 static int perf_exclude_event(struct perf_event *event, 7528 struct pt_regs *regs) 7529 { 7530 if (event->hw.state & PERF_HES_STOPPED) 7531 return 1; 7532 7533 if (regs) { 7534 if (event->attr.exclude_user && user_mode(regs)) 7535 return 1; 7536 7537 if (event->attr.exclude_kernel && !user_mode(regs)) 7538 return 1; 7539 } 7540 7541 return 0; 7542 } 7543 7544 static int perf_swevent_match(struct perf_event *event, 7545 enum perf_type_id type, 7546 u32 event_id, 7547 struct perf_sample_data *data, 7548 struct pt_regs *regs) 7549 { 7550 if (event->attr.type != type) 7551 return 0; 7552 7553 if (event->attr.config != event_id) 7554 return 0; 7555 7556 if (perf_exclude_event(event, regs)) 7557 return 0; 7558 7559 return 1; 7560 } 7561 7562 static inline u64 swevent_hash(u64 type, u32 event_id) 7563 { 7564 u64 val = event_id | (type << 32); 7565 7566 return hash_64(val, SWEVENT_HLIST_BITS); 7567 } 7568 7569 static inline struct hlist_head * 7570 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) 7571 { 7572 u64 hash = swevent_hash(type, event_id); 7573 7574 return &hlist->heads[hash]; 7575 } 7576 7577 /* For the read side: events when they trigger */ 7578 static inline struct hlist_head * 7579 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) 7580 { 7581 struct swevent_hlist *hlist; 7582 7583 hlist = rcu_dereference(swhash->swevent_hlist); 7584 if (!hlist) 7585 return NULL; 7586 7587 return __find_swevent_head(hlist, type, event_id); 7588 } 7589 7590 /* For the event head insertion and removal in the hlist */ 7591 static inline struct hlist_head * 7592 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) 7593 { 7594 struct swevent_hlist *hlist; 7595 u32 event_id = event->attr.config; 7596 u64 type = event->attr.type; 7597 7598 /* 7599 * Event scheduling is always serialized against hlist allocation 7600 * and release. Which makes the protected version suitable here. 7601 * The context lock guarantees that. 
7602 */ 7603 hlist = rcu_dereference_protected(swhash->swevent_hlist, 7604 lockdep_is_held(&event->ctx->lock)); 7605 if (!hlist) 7606 return NULL; 7607 7608 return __find_swevent_head(hlist, type, event_id); 7609 } 7610 7611 static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 7612 u64 nr, 7613 struct perf_sample_data *data, 7614 struct pt_regs *regs) 7615 { 7616 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 7617 struct perf_event *event; 7618 struct hlist_head *head; 7619 7620 rcu_read_lock(); 7621 head = find_swevent_head_rcu(swhash, type, event_id); 7622 if (!head) 7623 goto end; 7624 7625 hlist_for_each_entry_rcu(event, head, hlist_entry) { 7626 if (perf_swevent_match(event, type, event_id, data, regs)) 7627 perf_swevent_event(event, nr, data, regs); 7628 } 7629 end: 7630 rcu_read_unlock(); 7631 } 7632 7633 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); 7634 7635 int perf_swevent_get_recursion_context(void) 7636 { 7637 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 7638 7639 return get_recursion_context(swhash->recursion); 7640 } 7641 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 7642 7643 void perf_swevent_put_recursion_context(int rctx) 7644 { 7645 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 7646 7647 put_recursion_context(swhash->recursion, rctx); 7648 } 7649 7650 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 7651 { 7652 struct perf_sample_data data; 7653 7654 if (WARN_ON_ONCE(!regs)) 7655 return; 7656 7657 perf_sample_data_init(&data, addr, 0); 7658 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 7659 } 7660 7661 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 7662 { 7663 int rctx; 7664 7665 preempt_disable_notrace(); 7666 rctx = perf_swevent_get_recursion_context(); 7667 if (unlikely(rctx < 0)) 7668 goto fail; 7669 7670 ___perf_sw_event(event_id, nr, regs, addr); 7671 7672 perf_swevent_put_recursion_context(rctx); 7673 fail: 7674 preempt_enable_notrace(); 7675 } 7676 7677 static void perf_swevent_read(struct perf_event *event) 7678 { 7679 } 7680 7681 static int perf_swevent_add(struct perf_event *event, int flags) 7682 { 7683 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 7684 struct hw_perf_event *hwc = &event->hw; 7685 struct hlist_head *head; 7686 7687 if (is_sampling_event(event)) { 7688 hwc->last_period = hwc->sample_period; 7689 perf_swevent_set_period(event); 7690 } 7691 7692 hwc->state = !(flags & PERF_EF_START); 7693 7694 head = find_swevent_head(swhash, event); 7695 if (WARN_ON_ONCE(!head)) 7696 return -EINVAL; 7697 7698 hlist_add_head_rcu(&event->hlist_entry, head); 7699 perf_event_update_userpage(event); 7700 7701 return 0; 7702 } 7703 7704 static void perf_swevent_del(struct perf_event *event, int flags) 7705 { 7706 hlist_del_rcu(&event->hlist_entry); 7707 } 7708 7709 static void perf_swevent_start(struct perf_event *event, int flags) 7710 { 7711 event->hw.state = 0; 7712 } 7713 7714 static void perf_swevent_stop(struct perf_event *event, int flags) 7715 { 7716 event->hw.state = PERF_HES_STOPPED; 7717 } 7718 7719 /* Deref the hlist from the update side */ 7720 static inline struct swevent_hlist * 7721 swevent_hlist_deref(struct swevent_htable *swhash) 7722 { 7723 return rcu_dereference_protected(swhash->swevent_hlist, 7724 lockdep_is_held(&swhash->hlist_mutex)); 7725 } 7726 7727 static void swevent_hlist_release(struct swevent_htable *swhash) 7728 { 7729 struct swevent_hlist *hlist = 
swevent_hlist_deref(swhash); 7730 7731 if (!hlist) 7732 return; 7733 7734 RCU_INIT_POINTER(swhash->swevent_hlist, NULL); 7735 kfree_rcu(hlist, rcu_head); 7736 } 7737 7738 static void swevent_hlist_put_cpu(int cpu) 7739 { 7740 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7741 7742 mutex_lock(&swhash->hlist_mutex); 7743 7744 if (!--swhash->hlist_refcount) 7745 swevent_hlist_release(swhash); 7746 7747 mutex_unlock(&swhash->hlist_mutex); 7748 } 7749 7750 static void swevent_hlist_put(void) 7751 { 7752 int cpu; 7753 7754 for_each_possible_cpu(cpu) 7755 swevent_hlist_put_cpu(cpu); 7756 } 7757 7758 static int swevent_hlist_get_cpu(int cpu) 7759 { 7760 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7761 int err = 0; 7762 7763 mutex_lock(&swhash->hlist_mutex); 7764 if (!swevent_hlist_deref(swhash) && 7765 cpumask_test_cpu(cpu, perf_online_mask)) { 7766 struct swevent_hlist *hlist; 7767 7768 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 7769 if (!hlist) { 7770 err = -ENOMEM; 7771 goto exit; 7772 } 7773 rcu_assign_pointer(swhash->swevent_hlist, hlist); 7774 } 7775 swhash->hlist_refcount++; 7776 exit: 7777 mutex_unlock(&swhash->hlist_mutex); 7778 7779 return err; 7780 } 7781 7782 static int swevent_hlist_get(void) 7783 { 7784 int err, cpu, failed_cpu; 7785 7786 mutex_lock(&pmus_lock); 7787 for_each_possible_cpu(cpu) { 7788 err = swevent_hlist_get_cpu(cpu); 7789 if (err) { 7790 failed_cpu = cpu; 7791 goto fail; 7792 } 7793 } 7794 mutex_unlock(&pmus_lock); 7795 return 0; 7796 fail: 7797 for_each_possible_cpu(cpu) { 7798 if (cpu == failed_cpu) 7799 break; 7800 swevent_hlist_put_cpu(cpu); 7801 } 7802 mutex_unlock(&pmus_lock); 7803 return err; 7804 } 7805 7806 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 7807 7808 static void sw_perf_event_destroy(struct perf_event *event) 7809 { 7810 u64 event_id = event->attr.config; 7811 7812 WARN_ON(event->parent); 7813 7814 static_key_slow_dec(&perf_swevent_enabled[event_id]); 7815 swevent_hlist_put(); 7816 } 7817 7818 static int perf_swevent_init(struct perf_event *event) 7819 { 7820 u64 event_id = event->attr.config; 7821 7822 if (event->attr.type != PERF_TYPE_SOFTWARE) 7823 return -ENOENT; 7824 7825 /* 7826 * no branch sampling for software events 7827 */ 7828 if (has_branch_stack(event)) 7829 return -EOPNOTSUPP; 7830 7831 switch (event_id) { 7832 case PERF_COUNT_SW_CPU_CLOCK: 7833 case PERF_COUNT_SW_TASK_CLOCK: 7834 return -ENOENT; 7835 7836 default: 7837 break; 7838 } 7839 7840 if (event_id >= PERF_COUNT_SW_MAX) 7841 return -ENOENT; 7842 7843 if (!event->parent) { 7844 int err; 7845 7846 err = swevent_hlist_get(); 7847 if (err) 7848 return err; 7849 7850 static_key_slow_inc(&perf_swevent_enabled[event_id]); 7851 event->destroy = sw_perf_event_destroy; 7852 } 7853 7854 return 0; 7855 } 7856 7857 static struct pmu perf_swevent = { 7858 .task_ctx_nr = perf_sw_context, 7859 7860 .capabilities = PERF_PMU_CAP_NO_NMI, 7861 7862 .event_init = perf_swevent_init, 7863 .add = perf_swevent_add, 7864 .del = perf_swevent_del, 7865 .start = perf_swevent_start, 7866 .stop = perf_swevent_stop, 7867 .read = perf_swevent_read, 7868 }; 7869 7870 #ifdef CONFIG_EVENT_TRACING 7871 7872 static int perf_tp_filter_match(struct perf_event *event, 7873 struct perf_sample_data *data) 7874 { 7875 void *record = data->raw->frag.data; 7876 7877 /* only top level events have filters set */ 7878 if (event->parent) 7879 event = event->parent; 7880 7881 if (likely(!event->filter) || filter_match_preds(event->filter, record)) 7882 return 1; 7883 return 
0; 7884 } 7885 7886 static int perf_tp_event_match(struct perf_event *event, 7887 struct perf_sample_data *data, 7888 struct pt_regs *regs) 7889 { 7890 if (event->hw.state & PERF_HES_STOPPED) 7891 return 0; 7892 /* 7893 * All tracepoints are from kernel-space. 7894 */ 7895 if (event->attr.exclude_kernel) 7896 return 0; 7897 7898 if (!perf_tp_filter_match(event, data)) 7899 return 0; 7900 7901 return 1; 7902 } 7903 7904 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, 7905 struct trace_event_call *call, u64 count, 7906 struct pt_regs *regs, struct hlist_head *head, 7907 struct task_struct *task) 7908 { 7909 if (bpf_prog_array_valid(call)) { 7910 *(struct pt_regs **)raw_data = regs; 7911 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { 7912 perf_swevent_put_recursion_context(rctx); 7913 return; 7914 } 7915 } 7916 perf_tp_event(call->event.type, count, raw_data, size, regs, head, 7917 rctx, task); 7918 } 7919 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); 7920 7921 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, 7922 struct pt_regs *regs, struct hlist_head *head, int rctx, 7923 struct task_struct *task) 7924 { 7925 struct perf_sample_data data; 7926 struct perf_event *event; 7927 7928 struct perf_raw_record raw = { 7929 .frag = { 7930 .size = entry_size, 7931 .data = record, 7932 }, 7933 }; 7934 7935 perf_sample_data_init(&data, 0, 0); 7936 data.raw = &raw; 7937 7938 perf_trace_buf_update(record, event_type); 7939 7940 hlist_for_each_entry_rcu(event, head, hlist_entry) { 7941 if (perf_tp_event_match(event, &data, regs)) 7942 perf_swevent_event(event, count, &data, regs); 7943 } 7944 7945 /* 7946 * If we got specified a target task, also iterate its context and 7947 * deliver this event there too. 
7948 */ 7949 if (task && task != current) { 7950 struct perf_event_context *ctx; 7951 struct trace_entry *entry = record; 7952 7953 rcu_read_lock(); 7954 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); 7955 if (!ctx) 7956 goto unlock; 7957 7958 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 7959 if (event->attr.type != PERF_TYPE_TRACEPOINT) 7960 continue; 7961 if (event->attr.config != entry->type) 7962 continue; 7963 if (perf_tp_event_match(event, &data, regs)) 7964 perf_swevent_event(event, count, &data, regs); 7965 } 7966 unlock: 7967 rcu_read_unlock(); 7968 } 7969 7970 perf_swevent_put_recursion_context(rctx); 7971 } 7972 EXPORT_SYMBOL_GPL(perf_tp_event); 7973 7974 static void tp_perf_event_destroy(struct perf_event *event) 7975 { 7976 perf_trace_destroy(event); 7977 } 7978 7979 static int perf_tp_event_init(struct perf_event *event) 7980 { 7981 int err; 7982 7983 if (event->attr.type != PERF_TYPE_TRACEPOINT) 7984 return -ENOENT; 7985 7986 /* 7987 * no branch sampling for tracepoint events 7988 */ 7989 if (has_branch_stack(event)) 7990 return -EOPNOTSUPP; 7991 7992 err = perf_trace_init(event); 7993 if (err) 7994 return err; 7995 7996 event->destroy = tp_perf_event_destroy; 7997 7998 return 0; 7999 } 8000 8001 static struct pmu perf_tracepoint = { 8002 .task_ctx_nr = perf_sw_context, 8003 8004 .event_init = perf_tp_event_init, 8005 .add = perf_trace_add, 8006 .del = perf_trace_del, 8007 .start = perf_swevent_start, 8008 .stop = perf_swevent_stop, 8009 .read = perf_swevent_read, 8010 }; 8011 8012 static inline void perf_tp_register(void) 8013 { 8014 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); 8015 } 8016 8017 static void perf_event_free_filter(struct perf_event *event) 8018 { 8019 ftrace_profile_free_filter(event); 8020 } 8021 8022 #ifdef CONFIG_BPF_SYSCALL 8023 static void bpf_overflow_handler(struct perf_event *event, 8024 struct perf_sample_data *data, 8025 struct pt_regs *regs) 8026 { 8027 struct bpf_perf_event_data_kern ctx = { 8028 .data = data, 8029 .event = event, 8030 }; 8031 int ret = 0; 8032 8033 ctx.regs = perf_arch_bpf_user_pt_regs(regs); 8034 preempt_disable(); 8035 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) 8036 goto out; 8037 rcu_read_lock(); 8038 ret = BPF_PROG_RUN(event->prog, &ctx); 8039 rcu_read_unlock(); 8040 out: 8041 __this_cpu_dec(bpf_prog_active); 8042 preempt_enable(); 8043 if (!ret) 8044 return; 8045 8046 event->orig_overflow_handler(event, data, regs); 8047 } 8048 8049 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) 8050 { 8051 struct bpf_prog *prog; 8052 8053 if (event->overflow_handler_context) 8054 /* hw breakpoint or kernel counter */ 8055 return -EINVAL; 8056 8057 if (event->prog) 8058 return -EEXIST; 8059 8060 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); 8061 if (IS_ERR(prog)) 8062 return PTR_ERR(prog); 8063 8064 event->prog = prog; 8065 event->orig_overflow_handler = READ_ONCE(event->overflow_handler); 8066 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); 8067 return 0; 8068 } 8069 8070 static void perf_event_free_bpf_handler(struct perf_event *event) 8071 { 8072 struct bpf_prog *prog = event->prog; 8073 8074 if (!prog) 8075 return; 8076 8077 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); 8078 event->prog = NULL; 8079 bpf_prog_put(prog); 8080 } 8081 #else 8082 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) 8083 { 8084 return -EOPNOTSUPP; 8085 } 8086 static void 
perf_event_free_bpf_handler(struct perf_event *event) 8087 { 8088 } 8089 #endif 8090 8091 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) 8092 { 8093 bool is_kprobe, is_tracepoint, is_syscall_tp; 8094 struct bpf_prog *prog; 8095 int ret; 8096 8097 if (event->attr.type != PERF_TYPE_TRACEPOINT) 8098 return perf_event_set_bpf_handler(event, prog_fd); 8099 8100 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; 8101 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; 8102 is_syscall_tp = is_syscall_trace_event(event->tp_event); 8103 if (!is_kprobe && !is_tracepoint && !is_syscall_tp) 8104 /* bpf programs can only be attached to u/kprobe or tracepoint */ 8105 return -EINVAL; 8106 8107 prog = bpf_prog_get(prog_fd); 8108 if (IS_ERR(prog)) 8109 return PTR_ERR(prog); 8110 8111 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || 8112 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || 8113 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { 8114 /* valid fd, but invalid bpf program type */ 8115 bpf_prog_put(prog); 8116 return -EINVAL; 8117 } 8118 8119 /* Kprobe override only works for kprobes, not uprobes. */ 8120 if (prog->kprobe_override && 8121 !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { 8122 bpf_prog_put(prog); 8123 return -EINVAL; 8124 } 8125 8126 if (is_tracepoint || is_syscall_tp) { 8127 int off = trace_event_get_offsets(event->tp_event); 8128 8129 if (prog->aux->max_ctx_offset > off) { 8130 bpf_prog_put(prog); 8131 return -EACCES; 8132 } 8133 } 8134 8135 ret = perf_event_attach_bpf_prog(event, prog); 8136 if (ret) 8137 bpf_prog_put(prog); 8138 return ret; 8139 } 8140 8141 static void perf_event_free_bpf_prog(struct perf_event *event) 8142 { 8143 if (event->attr.type != PERF_TYPE_TRACEPOINT) { 8144 perf_event_free_bpf_handler(event); 8145 return; 8146 } 8147 perf_event_detach_bpf_prog(event); 8148 } 8149 8150 #else 8151 8152 static inline void perf_tp_register(void) 8153 { 8154 } 8155 8156 static void perf_event_free_filter(struct perf_event *event) 8157 { 8158 } 8159 8160 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) 8161 { 8162 return -ENOENT; 8163 } 8164 8165 static void perf_event_free_bpf_prog(struct perf_event *event) 8166 { 8167 } 8168 #endif /* CONFIG_EVENT_TRACING */ 8169 8170 #ifdef CONFIG_HAVE_HW_BREAKPOINT 8171 void perf_bp_event(struct perf_event *bp, void *data) 8172 { 8173 struct perf_sample_data sample; 8174 struct pt_regs *regs = data; 8175 8176 perf_sample_data_init(&sample, bp->attr.bp_addr, 0); 8177 8178 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 8179 perf_swevent_event(bp, 1, &sample, regs); 8180 } 8181 #endif 8182 8183 /* 8184 * Allocate a new address filter 8185 */ 8186 static struct perf_addr_filter * 8187 perf_addr_filter_new(struct perf_event *event, struct list_head *filters) 8188 { 8189 int node = cpu_to_node(event->cpu == -1 ? 
0 : event->cpu); 8190 struct perf_addr_filter *filter; 8191 8192 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); 8193 if (!filter) 8194 return NULL; 8195 8196 INIT_LIST_HEAD(&filter->entry); 8197 list_add_tail(&filter->entry, filters); 8198 8199 return filter; 8200 } 8201 8202 static void free_filters_list(struct list_head *filters) 8203 { 8204 struct perf_addr_filter *filter, *iter; 8205 8206 list_for_each_entry_safe(filter, iter, filters, entry) { 8207 if (filter->inode) 8208 iput(filter->inode); 8209 list_del(&filter->entry); 8210 kfree(filter); 8211 } 8212 } 8213 8214 /* 8215 * Free existing address filters and optionally install new ones 8216 */ 8217 static void perf_addr_filters_splice(struct perf_event *event, 8218 struct list_head *head) 8219 { 8220 unsigned long flags; 8221 LIST_HEAD(list); 8222 8223 if (!has_addr_filter(event)) 8224 return; 8225 8226 /* don't bother with children, they don't have their own filters */ 8227 if (event->parent) 8228 return; 8229 8230 raw_spin_lock_irqsave(&event->addr_filters.lock, flags); 8231 8232 list_splice_init(&event->addr_filters.list, &list); 8233 if (head) 8234 list_splice(head, &event->addr_filters.list); 8235 8236 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); 8237 8238 free_filters_list(&list); 8239 } 8240 8241 /* 8242 * Scan through mm's vmas and see if one of them matches the 8243 * @filter; if so, adjust filter's address range. 8244 * Called with mm::mmap_sem down for reading. 8245 */ 8246 static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, 8247 struct mm_struct *mm) 8248 { 8249 struct vm_area_struct *vma; 8250 8251 for (vma = mm->mmap; vma; vma = vma->vm_next) { 8252 struct file *file = vma->vm_file; 8253 unsigned long off = vma->vm_pgoff << PAGE_SHIFT; 8254 unsigned long vma_size = vma->vm_end - vma->vm_start; 8255 8256 if (!file) 8257 continue; 8258 8259 if (!perf_addr_filter_match(filter, file, off, vma_size)) 8260 continue; 8261 8262 return vma->vm_start; 8263 } 8264 8265 return 0; 8266 } 8267 8268 /* 8269 * Update event's address range filters based on the 8270 * task's existing mappings, if any. 
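 * File-backed filters get their start offset resolved against the matching VMA (or reset to 0 if the object is not mapped yet), the filter generation is bumped, and the event is restarted so the PMU reprograms the new ranges.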
8271 */ 8272 static void perf_event_addr_filters_apply(struct perf_event *event) 8273 { 8274 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 8275 struct task_struct *task = READ_ONCE(event->ctx->task); 8276 struct perf_addr_filter *filter; 8277 struct mm_struct *mm = NULL; 8278 unsigned int count = 0; 8279 unsigned long flags; 8280 8281 /* 8282 * We may observe TASK_TOMBSTONE, which means that the event tear-down 8283 * will stop on the parent's child_mutex that our caller is also holding 8284 */ 8285 if (task == TASK_TOMBSTONE) 8286 return; 8287 8288 if (!ifh->nr_file_filters) 8289 return; 8290 8291 mm = get_task_mm(event->ctx->task); 8292 if (!mm) 8293 goto restart; 8294 8295 down_read(&mm->mmap_sem); 8296 8297 raw_spin_lock_irqsave(&ifh->lock, flags); 8298 list_for_each_entry(filter, &ifh->list, entry) { 8299 event->addr_filters_offs[count] = 0; 8300 8301 /* 8302 * Adjust base offset if the filter is associated to a binary 8303 * that needs to be mapped: 8304 */ 8305 if (filter->inode) 8306 event->addr_filters_offs[count] = 8307 perf_addr_filter_apply(filter, mm); 8308 8309 count++; 8310 } 8311 8312 event->addr_filters_gen++; 8313 raw_spin_unlock_irqrestore(&ifh->lock, flags); 8314 8315 up_read(&mm->mmap_sem); 8316 8317 mmput(mm); 8318 8319 restart: 8320 perf_event_stop(event, 1); 8321 } 8322 8323 /* 8324 * Address range filtering: limiting the data to certain 8325 * instruction address ranges. Filters are ioctl()ed to us from 8326 * userspace as ascii strings. 8327 * 8328 * Filter string format: 8329 * 8330 * ACTION RANGE_SPEC 8331 * where ACTION is one of the 8332 * * "filter": limit the trace to this region 8333 * * "start": start tracing from this address 8334 * * "stop": stop tracing at this address/region; 8335 * RANGE_SPEC is 8336 * * for kernel addresses: <start address>[/<size>] 8337 * * for object files: <start address>[/<size>]@</path/to/object/file> 8338 * 8339 * if <size> is not specified, the range is treated as a single address. 
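 *
 * Examples (the values and object path below are purely illustrative):
 *
 *   filter 0x1000/0x2000@/usr/lib/libfoo.so
 *   start 0xffffffff81000000/0x4000
 *
 * Such a string is handed to the kernel by userspace with something like:
 *
 *   ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER,
 *         "filter 0x1000/0x2000@/usr/lib/libfoo.so");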
8340 */ 8341 enum { 8342 IF_ACT_NONE = -1, 8343 IF_ACT_FILTER, 8344 IF_ACT_START, 8345 IF_ACT_STOP, 8346 IF_SRC_FILE, 8347 IF_SRC_KERNEL, 8348 IF_SRC_FILEADDR, 8349 IF_SRC_KERNELADDR, 8350 }; 8351 8352 enum { 8353 IF_STATE_ACTION = 0, 8354 IF_STATE_SOURCE, 8355 IF_STATE_END, 8356 }; 8357 8358 static const match_table_t if_tokens = { 8359 { IF_ACT_FILTER, "filter" }, 8360 { IF_ACT_START, "start" }, 8361 { IF_ACT_STOP, "stop" }, 8362 { IF_SRC_FILE, "%u/%u@%s" }, 8363 { IF_SRC_KERNEL, "%u/%u" }, 8364 { IF_SRC_FILEADDR, "%u@%s" }, 8365 { IF_SRC_KERNELADDR, "%u" }, 8366 { IF_ACT_NONE, NULL }, 8367 }; 8368 8369 /* 8370 * Address filter string parser 8371 */ 8372 static int 8373 perf_event_parse_addr_filter(struct perf_event *event, char *fstr, 8374 struct list_head *filters) 8375 { 8376 struct perf_addr_filter *filter = NULL; 8377 char *start, *orig, *filename = NULL; 8378 struct path path; 8379 substring_t args[MAX_OPT_ARGS]; 8380 int state = IF_STATE_ACTION, token; 8381 unsigned int kernel = 0; 8382 int ret = -EINVAL; 8383 8384 orig = fstr = kstrdup(fstr, GFP_KERNEL); 8385 if (!fstr) 8386 return -ENOMEM; 8387 8388 while ((start = strsep(&fstr, " ,\n")) != NULL) { 8389 ret = -EINVAL; 8390 8391 if (!*start) 8392 continue; 8393 8394 /* filter definition begins */ 8395 if (state == IF_STATE_ACTION) { 8396 filter = perf_addr_filter_new(event, filters); 8397 if (!filter) 8398 goto fail; 8399 } 8400 8401 token = match_token(start, if_tokens, args); 8402 switch (token) { 8403 case IF_ACT_FILTER: 8404 case IF_ACT_START: 8405 filter->filter = 1; 8406 8407 case IF_ACT_STOP: 8408 if (state != IF_STATE_ACTION) 8409 goto fail; 8410 8411 state = IF_STATE_SOURCE; 8412 break; 8413 8414 case IF_SRC_KERNELADDR: 8415 case IF_SRC_KERNEL: 8416 kernel = 1; 8417 8418 case IF_SRC_FILEADDR: 8419 case IF_SRC_FILE: 8420 if (state != IF_STATE_SOURCE) 8421 goto fail; 8422 8423 if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) 8424 filter->range = 1; 8425 8426 *args[0].to = 0; 8427 ret = kstrtoul(args[0].from, 0, &filter->offset); 8428 if (ret) 8429 goto fail; 8430 8431 if (filter->range) { 8432 *args[1].to = 0; 8433 ret = kstrtoul(args[1].from, 0, &filter->size); 8434 if (ret) 8435 goto fail; 8436 } 8437 8438 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { 8439 int fpos = filter->range ? 2 : 1; 8440 8441 filename = match_strdup(&args[fpos]); 8442 if (!filename) { 8443 ret = -ENOMEM; 8444 goto fail; 8445 } 8446 } 8447 8448 state = IF_STATE_END; 8449 break; 8450 8451 default: 8452 goto fail; 8453 } 8454 8455 /* 8456 * Filter definition is fully parsed, validate and install it. 8457 * Make sure that it doesn't contradict itself or the event's 8458 * attribute. 8459 */ 8460 if (state == IF_STATE_END) { 8461 ret = -EINVAL; 8462 if (kernel && event->attr.exclude_kernel) 8463 goto fail; 8464 8465 if (!kernel) { 8466 if (!filename) 8467 goto fail; 8468 8469 /* 8470 * For now, we only support file-based filters 8471 * in per-task events; doing so for CPU-wide 8472 * events requires additional context switching 8473 * trickery, since same object code will be 8474 * mapped at different virtual addresses in 8475 * different processes. 
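 *
 * (The same shared library text may, for instance, sit at one randomized
 * base address in one process and at a completely different one in
 * another.)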
8476 */ 8477 ret = -EOPNOTSUPP; 8478 if (!event->ctx->task) 8479 goto fail_free_name; 8480 8481 /* look up the path and grab its inode */ 8482 ret = kern_path(filename, LOOKUP_FOLLOW, &path); 8483 if (ret) 8484 goto fail_free_name; 8485 8486 filter->inode = igrab(d_inode(path.dentry)); 8487 path_put(&path); 8488 kfree(filename); 8489 filename = NULL; 8490 8491 ret = -EINVAL; 8492 if (!filter->inode || 8493 !S_ISREG(filter->inode->i_mode)) 8494 /* free_filters_list() will iput() */ 8495 goto fail; 8496 8497 event->addr_filters.nr_file_filters++; 8498 } 8499 8500 /* ready to consume more filters */ 8501 state = IF_STATE_ACTION; 8502 filter = NULL; 8503 } 8504 } 8505 8506 if (state != IF_STATE_ACTION) 8507 goto fail; 8508 8509 kfree(orig); 8510 8511 return 0; 8512 8513 fail_free_name: 8514 kfree(filename); 8515 fail: 8516 free_filters_list(filters); 8517 kfree(orig); 8518 8519 return ret; 8520 } 8521 8522 static int 8523 perf_event_set_addr_filter(struct perf_event *event, char *filter_str) 8524 { 8525 LIST_HEAD(filters); 8526 int ret; 8527 8528 /* 8529 * Since this is called in perf_ioctl() path, we're already holding 8530 * ctx::mutex. 8531 */ 8532 lockdep_assert_held(&event->ctx->mutex); 8533 8534 if (WARN_ON_ONCE(event->parent)) 8535 return -EINVAL; 8536 8537 ret = perf_event_parse_addr_filter(event, filter_str, &filters); 8538 if (ret) 8539 goto fail_clear_files; 8540 8541 ret = event->pmu->addr_filters_validate(&filters); 8542 if (ret) 8543 goto fail_free_filters; 8544 8545 /* remove existing filters, if any */ 8546 perf_addr_filters_splice(event, &filters); 8547 8548 /* install new filters */ 8549 perf_event_for_each_child(event, perf_event_addr_filters_apply); 8550 8551 return ret; 8552 8553 fail_free_filters: 8554 free_filters_list(&filters); 8555 8556 fail_clear_files: 8557 event->addr_filters.nr_file_filters = 0; 8558 8559 return ret; 8560 } 8561 8562 static int 8563 perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) 8564 { 8565 struct perf_event_context *ctx = event->ctx; 8566 int ret; 8567 8568 /* 8569 * Beware, here be dragons!! 8570 * 8571 * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint 8572 * stuff does not actually need it. So temporarily drop ctx->mutex. As per 8573 * perf_event_ctx_lock() we already have a reference on ctx. 8574 * 8575 * This can result in event getting moved to a different ctx, but that 8576 * does not affect the tracepoint state. 
8577 */ 8578 mutex_unlock(&ctx->mutex); 8579 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); 8580 mutex_lock(&ctx->mutex); 8581 8582 return ret; 8583 } 8584 8585 static int perf_event_set_filter(struct perf_event *event, void __user *arg) 8586 { 8587 char *filter_str; 8588 int ret = -EINVAL; 8589 8590 if ((event->attr.type != PERF_TYPE_TRACEPOINT || 8591 !IS_ENABLED(CONFIG_EVENT_TRACING)) && 8592 !has_addr_filter(event)) 8593 return -EINVAL; 8594 8595 filter_str = strndup_user(arg, PAGE_SIZE); 8596 if (IS_ERR(filter_str)) 8597 return PTR_ERR(filter_str); 8598 8599 if (IS_ENABLED(CONFIG_EVENT_TRACING) && 8600 event->attr.type == PERF_TYPE_TRACEPOINT) 8601 ret = perf_tracepoint_set_filter(event, filter_str); 8602 else if (has_addr_filter(event)) 8603 ret = perf_event_set_addr_filter(event, filter_str); 8604 8605 kfree(filter_str); 8606 return ret; 8607 } 8608 8609 /* 8610 * hrtimer based swevent callback 8611 */ 8612 8613 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) 8614 { 8615 enum hrtimer_restart ret = HRTIMER_RESTART; 8616 struct perf_sample_data data; 8617 struct pt_regs *regs; 8618 struct perf_event *event; 8619 u64 period; 8620 8621 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 8622 8623 if (event->state != PERF_EVENT_STATE_ACTIVE) 8624 return HRTIMER_NORESTART; 8625 8626 event->pmu->read(event); 8627 8628 perf_sample_data_init(&data, 0, event->hw.last_period); 8629 regs = get_irq_regs(); 8630 8631 if (regs && !perf_exclude_event(event, regs)) { 8632 if (!(event->attr.exclude_idle && is_idle_task(current))) 8633 if (__perf_event_overflow(event, 1, &data, regs)) 8634 ret = HRTIMER_NORESTART; 8635 } 8636 8637 period = max_t(u64, 10000, event->hw.sample_period); 8638 hrtimer_forward_now(hrtimer, ns_to_ktime(period)); 8639 8640 return ret; 8641 } 8642 8643 static void perf_swevent_start_hrtimer(struct perf_event *event) 8644 { 8645 struct hw_perf_event *hwc = &event->hw; 8646 s64 period; 8647 8648 if (!is_sampling_event(event)) 8649 return; 8650 8651 period = local64_read(&hwc->period_left); 8652 if (period) { 8653 if (period < 0) 8654 period = 10000; 8655 8656 local64_set(&hwc->period_left, 0); 8657 } else { 8658 period = max_t(u64, 10000, hwc->sample_period); 8659 } 8660 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), 8661 HRTIMER_MODE_REL_PINNED); 8662 } 8663 8664 static void perf_swevent_cancel_hrtimer(struct perf_event *event) 8665 { 8666 struct hw_perf_event *hwc = &event->hw; 8667 8668 if (is_sampling_event(event)) { 8669 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 8670 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 8671 8672 hrtimer_cancel(&hwc->hrtimer); 8673 } 8674 } 8675 8676 static void perf_swevent_init_hrtimer(struct perf_event *event) 8677 { 8678 struct hw_perf_event *hwc = &event->hw; 8679 8680 if (!is_sampling_event(event)) 8681 return; 8682 8683 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 8684 hwc->hrtimer.function = perf_swevent_hrtimer; 8685 8686 /* 8687 * Since hrtimers have a fixed rate, we can do a static freq->period 8688 * mapping and avoid the whole period adjust feedback stuff. 
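 *
 * For example, attr.sample_freq = 4000 simply becomes a fixed period of
 * NSEC_PER_SEC / 4000 = 250000 ns below.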
8689 */ 8690 if (event->attr.freq) { 8691 long freq = event->attr.sample_freq; 8692 8693 event->attr.sample_period = NSEC_PER_SEC / freq; 8694 hwc->sample_period = event->attr.sample_period; 8695 local64_set(&hwc->period_left, hwc->sample_period); 8696 hwc->last_period = hwc->sample_period; 8697 event->attr.freq = 0; 8698 } 8699 } 8700 8701 /* 8702 * Software event: cpu wall time clock 8703 */ 8704 8705 static void cpu_clock_event_update(struct perf_event *event) 8706 { 8707 s64 prev; 8708 u64 now; 8709 8710 now = local_clock(); 8711 prev = local64_xchg(&event->hw.prev_count, now); 8712 local64_add(now - prev, &event->count); 8713 } 8714 8715 static void cpu_clock_event_start(struct perf_event *event, int flags) 8716 { 8717 local64_set(&event->hw.prev_count, local_clock()); 8718 perf_swevent_start_hrtimer(event); 8719 } 8720 8721 static void cpu_clock_event_stop(struct perf_event *event, int flags) 8722 { 8723 perf_swevent_cancel_hrtimer(event); 8724 cpu_clock_event_update(event); 8725 } 8726 8727 static int cpu_clock_event_add(struct perf_event *event, int flags) 8728 { 8729 if (flags & PERF_EF_START) 8730 cpu_clock_event_start(event, flags); 8731 perf_event_update_userpage(event); 8732 8733 return 0; 8734 } 8735 8736 static void cpu_clock_event_del(struct perf_event *event, int flags) 8737 { 8738 cpu_clock_event_stop(event, flags); 8739 } 8740 8741 static void cpu_clock_event_read(struct perf_event *event) 8742 { 8743 cpu_clock_event_update(event); 8744 } 8745 8746 static int cpu_clock_event_init(struct perf_event *event) 8747 { 8748 if (event->attr.type != PERF_TYPE_SOFTWARE) 8749 return -ENOENT; 8750 8751 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 8752 return -ENOENT; 8753 8754 /* 8755 * no branch sampling for software events 8756 */ 8757 if (has_branch_stack(event)) 8758 return -EOPNOTSUPP; 8759 8760 perf_swevent_init_hrtimer(event); 8761 8762 return 0; 8763 } 8764 8765 static struct pmu perf_cpu_clock = { 8766 .task_ctx_nr = perf_sw_context, 8767 8768 .capabilities = PERF_PMU_CAP_NO_NMI, 8769 8770 .event_init = cpu_clock_event_init, 8771 .add = cpu_clock_event_add, 8772 .del = cpu_clock_event_del, 8773 .start = cpu_clock_event_start, 8774 .stop = cpu_clock_event_stop, 8775 .read = cpu_clock_event_read, 8776 }; 8777 8778 /* 8779 * Software event: task time clock 8780 */ 8781 8782 static void task_clock_event_update(struct perf_event *event, u64 now) 8783 { 8784 u64 prev; 8785 s64 delta; 8786 8787 prev = local64_xchg(&event->hw.prev_count, now); 8788 delta = now - prev; 8789 local64_add(delta, &event->count); 8790 } 8791 8792 static void task_clock_event_start(struct perf_event *event, int flags) 8793 { 8794 local64_set(&event->hw.prev_count, event->ctx->time); 8795 perf_swevent_start_hrtimer(event); 8796 } 8797 8798 static void task_clock_event_stop(struct perf_event *event, int flags) 8799 { 8800 perf_swevent_cancel_hrtimer(event); 8801 task_clock_event_update(event, event->ctx->time); 8802 } 8803 8804 static int task_clock_event_add(struct perf_event *event, int flags) 8805 { 8806 if (flags & PERF_EF_START) 8807 task_clock_event_start(event, flags); 8808 perf_event_update_userpage(event); 8809 8810 return 0; 8811 } 8812 8813 static void task_clock_event_del(struct perf_event *event, int flags) 8814 { 8815 task_clock_event_stop(event, PERF_EF_UPDATE); 8816 } 8817 8818 static void task_clock_event_read(struct perf_event *event) 8819 { 8820 u64 now = perf_clock(); 8821 u64 delta = now - event->ctx->timestamp; 8822 u64 time = event->ctx->time + delta; 8823 8824 
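	/*
	 * ctx->time was last updated at ctx->timestamp; add the time that
	 * has passed since then to arrive at a current value to fold in.
	 */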
task_clock_event_update(event, time); 8825 } 8826 8827 static int task_clock_event_init(struct perf_event *event) 8828 { 8829 if (event->attr.type != PERF_TYPE_SOFTWARE) 8830 return -ENOENT; 8831 8832 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 8833 return -ENOENT; 8834 8835 /* 8836 * no branch sampling for software events 8837 */ 8838 if (has_branch_stack(event)) 8839 return -EOPNOTSUPP; 8840 8841 perf_swevent_init_hrtimer(event); 8842 8843 return 0; 8844 } 8845 8846 static struct pmu perf_task_clock = { 8847 .task_ctx_nr = perf_sw_context, 8848 8849 .capabilities = PERF_PMU_CAP_NO_NMI, 8850 8851 .event_init = task_clock_event_init, 8852 .add = task_clock_event_add, 8853 .del = task_clock_event_del, 8854 .start = task_clock_event_start, 8855 .stop = task_clock_event_stop, 8856 .read = task_clock_event_read, 8857 }; 8858 8859 static void perf_pmu_nop_void(struct pmu *pmu) 8860 { 8861 } 8862 8863 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) 8864 { 8865 } 8866 8867 static int perf_pmu_nop_int(struct pmu *pmu) 8868 { 8869 return 0; 8870 } 8871 8872 static DEFINE_PER_CPU(unsigned int, nop_txn_flags); 8873 8874 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) 8875 { 8876 __this_cpu_write(nop_txn_flags, flags); 8877 8878 if (flags & ~PERF_PMU_TXN_ADD) 8879 return; 8880 8881 perf_pmu_disable(pmu); 8882 } 8883 8884 static int perf_pmu_commit_txn(struct pmu *pmu) 8885 { 8886 unsigned int flags = __this_cpu_read(nop_txn_flags); 8887 8888 __this_cpu_write(nop_txn_flags, 0); 8889 8890 if (flags & ~PERF_PMU_TXN_ADD) 8891 return 0; 8892 8893 perf_pmu_enable(pmu); 8894 return 0; 8895 } 8896 8897 static void perf_pmu_cancel_txn(struct pmu *pmu) 8898 { 8899 unsigned int flags = __this_cpu_read(nop_txn_flags); 8900 8901 __this_cpu_write(nop_txn_flags, 0); 8902 8903 if (flags & ~PERF_PMU_TXN_ADD) 8904 return; 8905 8906 perf_pmu_enable(pmu); 8907 } 8908 8909 static int perf_event_idx_default(struct perf_event *event) 8910 { 8911 return 0; 8912 } 8913 8914 /* 8915 * Ensures all contexts with the same task_ctx_nr have the same 8916 * pmu_cpu_context too. 8917 */ 8918 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) 8919 { 8920 struct pmu *pmu; 8921 8922 if (ctxn < 0) 8923 return NULL; 8924 8925 list_for_each_entry(pmu, &pmus, entry) { 8926 if (pmu->task_ctx_nr == ctxn) 8927 return pmu->pmu_cpu_context; 8928 } 8929 8930 return NULL; 8931 } 8932 8933 static void free_pmu_context(struct pmu *pmu) 8934 { 8935 /* 8936 * Static contexts such as perf_sw_context have a global lifetime 8937 * and may be shared between different PMUs. Avoid freeing them 8938 * when a single PMU is going away. 
8939 */ 8940 if (pmu->task_ctx_nr > perf_invalid_context) 8941 return; 8942 8943 mutex_lock(&pmus_lock); 8944 free_percpu(pmu->pmu_cpu_context); 8945 mutex_unlock(&pmus_lock); 8946 } 8947 8948 /* 8949 * Let userspace know that this PMU supports address range filtering: 8950 */ 8951 static ssize_t nr_addr_filters_show(struct device *dev, 8952 struct device_attribute *attr, 8953 char *page) 8954 { 8955 struct pmu *pmu = dev_get_drvdata(dev); 8956 8957 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); 8958 } 8959 DEVICE_ATTR_RO(nr_addr_filters); 8960 8961 static struct idr pmu_idr; 8962 8963 static ssize_t 8964 type_show(struct device *dev, struct device_attribute *attr, char *page) 8965 { 8966 struct pmu *pmu = dev_get_drvdata(dev); 8967 8968 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 8969 } 8970 static DEVICE_ATTR_RO(type); 8971 8972 static ssize_t 8973 perf_event_mux_interval_ms_show(struct device *dev, 8974 struct device_attribute *attr, 8975 char *page) 8976 { 8977 struct pmu *pmu = dev_get_drvdata(dev); 8978 8979 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); 8980 } 8981 8982 static DEFINE_MUTEX(mux_interval_mutex); 8983 8984 static ssize_t 8985 perf_event_mux_interval_ms_store(struct device *dev, 8986 struct device_attribute *attr, 8987 const char *buf, size_t count) 8988 { 8989 struct pmu *pmu = dev_get_drvdata(dev); 8990 int timer, cpu, ret; 8991 8992 ret = kstrtoint(buf, 0, &timer); 8993 if (ret) 8994 return ret; 8995 8996 if (timer < 1) 8997 return -EINVAL; 8998 8999 /* same value, noting to do */ 9000 if (timer == pmu->hrtimer_interval_ms) 9001 return count; 9002 9003 mutex_lock(&mux_interval_mutex); 9004 pmu->hrtimer_interval_ms = timer; 9005 9006 /* update all cpuctx for this PMU */ 9007 cpus_read_lock(); 9008 for_each_online_cpu(cpu) { 9009 struct perf_cpu_context *cpuctx; 9010 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 9011 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); 9012 9013 cpu_function_call(cpu, 9014 (remote_function_f)perf_mux_hrtimer_restart, cpuctx); 9015 } 9016 cpus_read_unlock(); 9017 mutex_unlock(&mux_interval_mutex); 9018 9019 return count; 9020 } 9021 static DEVICE_ATTR_RW(perf_event_mux_interval_ms); 9022 9023 static struct attribute *pmu_dev_attrs[] = { 9024 &dev_attr_type.attr, 9025 &dev_attr_perf_event_mux_interval_ms.attr, 9026 NULL, 9027 }; 9028 ATTRIBUTE_GROUPS(pmu_dev); 9029 9030 static int pmu_bus_running; 9031 static struct bus_type pmu_bus = { 9032 .name = "event_source", 9033 .dev_groups = pmu_dev_groups, 9034 }; 9035 9036 static void pmu_dev_release(struct device *dev) 9037 { 9038 kfree(dev); 9039 } 9040 9041 static int pmu_dev_alloc(struct pmu *pmu) 9042 { 9043 int ret = -ENOMEM; 9044 9045 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); 9046 if (!pmu->dev) 9047 goto out; 9048 9049 pmu->dev->groups = pmu->attr_groups; 9050 device_initialize(pmu->dev); 9051 ret = dev_set_name(pmu->dev, "%s", pmu->name); 9052 if (ret) 9053 goto free_dev; 9054 9055 dev_set_drvdata(pmu->dev, pmu); 9056 pmu->dev->bus = &pmu_bus; 9057 pmu->dev->release = pmu_dev_release; 9058 ret = device_add(pmu->dev); 9059 if (ret) 9060 goto free_dev; 9061 9062 /* For PMUs with address filters, throw in an extra attribute: */ 9063 if (pmu->nr_addr_filters) 9064 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); 9065 9066 if (ret) 9067 goto del_dev; 9068 9069 out: 9070 return ret; 9071 9072 del_dev: 9073 device_del(pmu->dev); 9074 9075 free_dev: 9076 put_device(pmu->dev); 9077 goto out; 9078 } 
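/*
 * The device created by pmu_dev_alloc() appears under
 * /sys/bus/event_source/devices/<pmu>/ together with the attributes set up
 * above (type, perf_event_mux_interval_ms and, for PMUs that support it,
 * nr_addr_filters). A minimal sketch of how userspace typically consumes
 * the "type" attribute (illustrative only; the "cpu" PMU name and the lack
 * of error handling are simplifications):
 *
 *	FILE *f = fopen("/sys/bus/event_source/devices/cpu/type", "r");
 *	int type = 0;
 *
 *	if (f) {
 *		if (fscanf(f, "%d", &type) != 1)
 *			type = 0;
 *		fclose(f);
 *	}
 *
 * The value read this way is what ends up in perf_event_attr::type when
 * calling perf_event_open() for that PMU.
 */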
9079 9080 static struct lock_class_key cpuctx_mutex; 9081 static struct lock_class_key cpuctx_lock; 9082 9083 int perf_pmu_register(struct pmu *pmu, const char *name, int type) 9084 { 9085 int cpu, ret; 9086 9087 mutex_lock(&pmus_lock); 9088 ret = -ENOMEM; 9089 pmu->pmu_disable_count = alloc_percpu(int); 9090 if (!pmu->pmu_disable_count) 9091 goto unlock; 9092 9093 pmu->type = -1; 9094 if (!name) 9095 goto skip_type; 9096 pmu->name = name; 9097 9098 if (type < 0) { 9099 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); 9100 if (type < 0) { 9101 ret = type; 9102 goto free_pdc; 9103 } 9104 } 9105 pmu->type = type; 9106 9107 if (pmu_bus_running) { 9108 ret = pmu_dev_alloc(pmu); 9109 if (ret) 9110 goto free_idr; 9111 } 9112 9113 skip_type: 9114 if (pmu->task_ctx_nr == perf_hw_context) { 9115 static int hw_context_taken = 0; 9116 9117 /* 9118 * Other than systems with heterogeneous CPUs, it never makes 9119 * sense for two PMUs to share perf_hw_context. PMUs which are 9120 * uncore must use perf_invalid_context. 9121 */ 9122 if (WARN_ON_ONCE(hw_context_taken && 9123 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) 9124 pmu->task_ctx_nr = perf_invalid_context; 9125 9126 hw_context_taken = 1; 9127 } 9128 9129 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 9130 if (pmu->pmu_cpu_context) 9131 goto got_cpu_context; 9132 9133 ret = -ENOMEM; 9134 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 9135 if (!pmu->pmu_cpu_context) 9136 goto free_dev; 9137 9138 for_each_possible_cpu(cpu) { 9139 struct perf_cpu_context *cpuctx; 9140 9141 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 9142 __perf_event_init_context(&cpuctx->ctx); 9143 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 9144 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 9145 cpuctx->ctx.pmu = pmu; 9146 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); 9147 9148 __perf_mux_hrtimer_init(cpuctx, cpu); 9149 } 9150 9151 got_cpu_context: 9152 if (!pmu->start_txn) { 9153 if (pmu->pmu_enable) { 9154 /* 9155 * If we have pmu_enable/pmu_disable calls, install 9156 * transaction stubs that use that to try and batch 9157 * hardware accesses. 9158 */ 9159 pmu->start_txn = perf_pmu_start_txn; 9160 pmu->commit_txn = perf_pmu_commit_txn; 9161 pmu->cancel_txn = perf_pmu_cancel_txn; 9162 } else { 9163 pmu->start_txn = perf_pmu_nop_txn; 9164 pmu->commit_txn = perf_pmu_nop_int; 9165 pmu->cancel_txn = perf_pmu_nop_void; 9166 } 9167 } 9168 9169 if (!pmu->pmu_enable) { 9170 pmu->pmu_enable = perf_pmu_nop_void; 9171 pmu->pmu_disable = perf_pmu_nop_void; 9172 } 9173 9174 if (!pmu->event_idx) 9175 pmu->event_idx = perf_event_idx_default; 9176 9177 list_add_rcu(&pmu->entry, &pmus); 9178 atomic_set(&pmu->exclusive_cnt, 0); 9179 ret = 0; 9180 unlock: 9181 mutex_unlock(&pmus_lock); 9182 9183 return ret; 9184 9185 free_dev: 9186 device_del(pmu->dev); 9187 put_device(pmu->dev); 9188 9189 free_idr: 9190 if (pmu->type >= PERF_TYPE_MAX) 9191 idr_remove(&pmu_idr, pmu->type); 9192 9193 free_pdc: 9194 free_percpu(pmu->pmu_disable_count); 9195 goto unlock; 9196 } 9197 EXPORT_SYMBOL_GPL(perf_pmu_register); 9198 9199 void perf_pmu_unregister(struct pmu *pmu) 9200 { 9201 int remove_device; 9202 9203 mutex_lock(&pmus_lock); 9204 remove_device = pmu_bus_running; 9205 list_del_rcu(&pmu->entry); 9206 mutex_unlock(&pmus_lock); 9207 9208 /* 9209 * We dereference the pmu list under both SRCU and regular RCU, so 9210 * synchronize against both of those. 
9211 */ 9212 synchronize_srcu(&pmus_srcu); 9213 synchronize_rcu(); 9214 9215 free_percpu(pmu->pmu_disable_count); 9216 if (pmu->type >= PERF_TYPE_MAX) 9217 idr_remove(&pmu_idr, pmu->type); 9218 if (remove_device) { 9219 if (pmu->nr_addr_filters) 9220 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); 9221 device_del(pmu->dev); 9222 put_device(pmu->dev); 9223 } 9224 free_pmu_context(pmu); 9225 } 9226 EXPORT_SYMBOL_GPL(perf_pmu_unregister); 9227 9228 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) 9229 { 9230 struct perf_event_context *ctx = NULL; 9231 int ret; 9232 9233 if (!try_module_get(pmu->module)) 9234 return -ENODEV; 9235 9236 /* 9237 * A number of pmu->event_init() methods iterate the sibling_list to, 9238 * for example, validate if the group fits on the PMU. Therefore, 9239 * if this is a sibling event, acquire the ctx->mutex to protect 9240 * the sibling_list. 9241 */ 9242 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { 9243 /* 9244 * This ctx->mutex can nest when we're called through 9245 * inheritance. See the perf_event_ctx_lock_nested() comment. 9246 */ 9247 ctx = perf_event_ctx_lock_nested(event->group_leader, 9248 SINGLE_DEPTH_NESTING); 9249 BUG_ON(!ctx); 9250 } 9251 9252 event->pmu = pmu; 9253 ret = pmu->event_init(event); 9254 9255 if (ctx) 9256 perf_event_ctx_unlock(event->group_leader, ctx); 9257 9258 if (ret) 9259 module_put(pmu->module); 9260 9261 return ret; 9262 } 9263 9264 static struct pmu *perf_init_event(struct perf_event *event) 9265 { 9266 struct pmu *pmu; 9267 int idx; 9268 int ret; 9269 9270 idx = srcu_read_lock(&pmus_srcu); 9271 9272 /* Try parent's PMU first: */ 9273 if (event->parent && event->parent->pmu) { 9274 pmu = event->parent->pmu; 9275 ret = perf_try_init_event(pmu, event); 9276 if (!ret) 9277 goto unlock; 9278 } 9279 9280 rcu_read_lock(); 9281 pmu = idr_find(&pmu_idr, event->attr.type); 9282 rcu_read_unlock(); 9283 if (pmu) { 9284 ret = perf_try_init_event(pmu, event); 9285 if (ret) 9286 pmu = ERR_PTR(ret); 9287 goto unlock; 9288 } 9289 9290 list_for_each_entry_rcu(pmu, &pmus, entry) { 9291 ret = perf_try_init_event(pmu, event); 9292 if (!ret) 9293 goto unlock; 9294 9295 if (ret != -ENOENT) { 9296 pmu = ERR_PTR(ret); 9297 goto unlock; 9298 } 9299 } 9300 pmu = ERR_PTR(-ENOENT); 9301 unlock: 9302 srcu_read_unlock(&pmus_srcu, idx); 9303 9304 return pmu; 9305 } 9306 9307 static void attach_sb_event(struct perf_event *event) 9308 { 9309 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); 9310 9311 raw_spin_lock(&pel->lock); 9312 list_add_rcu(&event->sb_list, &pel->list); 9313 raw_spin_unlock(&pel->lock); 9314 } 9315 9316 /* 9317 * We keep a list of all !task (and therefore per-cpu) events 9318 * that need to receive side-band records. 9319 * 9320 * This avoids having to scan all the various PMU per-cpu contexts 9321 * looking for them. 9322 */ 9323 static void account_pmu_sb_event(struct perf_event *event) 9324 { 9325 if (is_sb_event(event)) 9326 attach_sb_event(event); 9327 } 9328 9329 static void account_event_cpu(struct perf_event *event, int cpu) 9330 { 9331 if (event->parent) 9332 return; 9333 9334 if (is_cgroup_event(event)) 9335 atomic_inc(&per_cpu(perf_cgroup_events, cpu)); 9336 } 9337 9338 /* Freq events need the tick to stay alive (see perf_event_task_tick). 
*/ 9339 static void account_freq_event_nohz(void) 9340 { 9341 #ifdef CONFIG_NO_HZ_FULL 9342 /* Lock so we don't race with concurrent unaccount */ 9343 spin_lock(&nr_freq_lock); 9344 if (atomic_inc_return(&nr_freq_events) == 1) 9345 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); 9346 spin_unlock(&nr_freq_lock); 9347 #endif 9348 } 9349 9350 static void account_freq_event(void) 9351 { 9352 if (tick_nohz_full_enabled()) 9353 account_freq_event_nohz(); 9354 else 9355 atomic_inc(&nr_freq_events); 9356 } 9357 9358 9359 static void account_event(struct perf_event *event) 9360 { 9361 bool inc = false; 9362 9363 if (event->parent) 9364 return; 9365 9366 if (event->attach_state & PERF_ATTACH_TASK) 9367 inc = true; 9368 if (event->attr.mmap || event->attr.mmap_data) 9369 atomic_inc(&nr_mmap_events); 9370 if (event->attr.comm) 9371 atomic_inc(&nr_comm_events); 9372 if (event->attr.namespaces) 9373 atomic_inc(&nr_namespaces_events); 9374 if (event->attr.task) 9375 atomic_inc(&nr_task_events); 9376 if (event->attr.freq) 9377 account_freq_event(); 9378 if (event->attr.context_switch) { 9379 atomic_inc(&nr_switch_events); 9380 inc = true; 9381 } 9382 if (has_branch_stack(event)) 9383 inc = true; 9384 if (is_cgroup_event(event)) 9385 inc = true; 9386 9387 if (inc) { 9388 /* 9389 * We need the mutex here because static_branch_enable() 9390 * must complete *before* the perf_sched_count increment 9391 * becomes visible. 9392 */ 9393 if (atomic_inc_not_zero(&perf_sched_count)) 9394 goto enabled; 9395 9396 mutex_lock(&perf_sched_mutex); 9397 if (!atomic_read(&perf_sched_count)) { 9398 static_branch_enable(&perf_sched_events); 9399 /* 9400 * Guarantee that all CPUs observe the key change and 9401 * call the perf scheduling hooks before proceeding to 9402 * install events that need them. 9403 */ 9404 synchronize_sched(); 9405 } 9406 /* 9407 * Now that we have waited for the sync_sched(), allow further 9408 * increments to bypass the mutex. 
9409 */ 9410 atomic_inc(&perf_sched_count); 9411 mutex_unlock(&perf_sched_mutex); 9412 } 9413 enabled: 9414 9415 account_event_cpu(event, event->cpu); 9416 9417 account_pmu_sb_event(event); 9418 } 9419 9420 /* 9421 * Allocate and initialize a event structure 9422 */ 9423 static struct perf_event * 9424 perf_event_alloc(struct perf_event_attr *attr, int cpu, 9425 struct task_struct *task, 9426 struct perf_event *group_leader, 9427 struct perf_event *parent_event, 9428 perf_overflow_handler_t overflow_handler, 9429 void *context, int cgroup_fd) 9430 { 9431 struct pmu *pmu; 9432 struct perf_event *event; 9433 struct hw_perf_event *hwc; 9434 long err = -EINVAL; 9435 9436 if ((unsigned)cpu >= nr_cpu_ids) { 9437 if (!task || cpu != -1) 9438 return ERR_PTR(-EINVAL); 9439 } 9440 9441 event = kzalloc(sizeof(*event), GFP_KERNEL); 9442 if (!event) 9443 return ERR_PTR(-ENOMEM); 9444 9445 /* 9446 * Single events are their own group leaders, with an 9447 * empty sibling list: 9448 */ 9449 if (!group_leader) 9450 group_leader = event; 9451 9452 mutex_init(&event->child_mutex); 9453 INIT_LIST_HEAD(&event->child_list); 9454 9455 INIT_LIST_HEAD(&event->group_entry); 9456 INIT_LIST_HEAD(&event->event_entry); 9457 INIT_LIST_HEAD(&event->sibling_list); 9458 INIT_LIST_HEAD(&event->rb_entry); 9459 INIT_LIST_HEAD(&event->active_entry); 9460 INIT_LIST_HEAD(&event->addr_filters.list); 9461 INIT_HLIST_NODE(&event->hlist_entry); 9462 9463 9464 init_waitqueue_head(&event->waitq); 9465 init_irq_work(&event->pending, perf_pending_event); 9466 9467 mutex_init(&event->mmap_mutex); 9468 raw_spin_lock_init(&event->addr_filters.lock); 9469 9470 atomic_long_set(&event->refcount, 1); 9471 event->cpu = cpu; 9472 event->attr = *attr; 9473 event->group_leader = group_leader; 9474 event->pmu = NULL; 9475 event->oncpu = -1; 9476 9477 event->parent = parent_event; 9478 9479 event->ns = get_pid_ns(task_active_pid_ns(current)); 9480 event->id = atomic64_inc_return(&perf_event_id); 9481 9482 event->state = PERF_EVENT_STATE_INACTIVE; 9483 9484 if (task) { 9485 event->attach_state = PERF_ATTACH_TASK; 9486 /* 9487 * XXX pmu::event_init needs to know what task to account to 9488 * and we cannot use the ctx information because we need the 9489 * pmu before we get a ctx. 
9490 */ 9491 event->hw.target = task; 9492 } 9493 9494 event->clock = &local_clock; 9495 if (parent_event) 9496 event->clock = parent_event->clock; 9497 9498 if (!overflow_handler && parent_event) { 9499 overflow_handler = parent_event->overflow_handler; 9500 context = parent_event->overflow_handler_context; 9501 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) 9502 if (overflow_handler == bpf_overflow_handler) { 9503 struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); 9504 9505 if (IS_ERR(prog)) { 9506 err = PTR_ERR(prog); 9507 goto err_ns; 9508 } 9509 event->prog = prog; 9510 event->orig_overflow_handler = 9511 parent_event->orig_overflow_handler; 9512 } 9513 #endif 9514 } 9515 9516 if (overflow_handler) { 9517 event->overflow_handler = overflow_handler; 9518 event->overflow_handler_context = context; 9519 } else if (is_write_backward(event)){ 9520 event->overflow_handler = perf_event_output_backward; 9521 event->overflow_handler_context = NULL; 9522 } else { 9523 event->overflow_handler = perf_event_output_forward; 9524 event->overflow_handler_context = NULL; 9525 } 9526 9527 perf_event__state_init(event); 9528 9529 pmu = NULL; 9530 9531 hwc = &event->hw; 9532 hwc->sample_period = attr->sample_period; 9533 if (attr->freq && attr->sample_freq) 9534 hwc->sample_period = 1; 9535 hwc->last_period = hwc->sample_period; 9536 9537 local64_set(&hwc->period_left, hwc->sample_period); 9538 9539 /* 9540 * We currently do not support PERF_SAMPLE_READ on inherited events. 9541 * See perf_output_read(). 9542 */ 9543 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) 9544 goto err_ns; 9545 9546 if (!has_branch_stack(event)) 9547 event->attr.branch_sample_type = 0; 9548 9549 if (cgroup_fd != -1) { 9550 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); 9551 if (err) 9552 goto err_ns; 9553 } 9554 9555 pmu = perf_init_event(event); 9556 if (IS_ERR(pmu)) { 9557 err = PTR_ERR(pmu); 9558 goto err_ns; 9559 } 9560 9561 err = exclusive_event_init(event); 9562 if (err) 9563 goto err_pmu; 9564 9565 if (has_addr_filter(event)) { 9566 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, 9567 sizeof(unsigned long), 9568 GFP_KERNEL); 9569 if (!event->addr_filters_offs) { 9570 err = -ENOMEM; 9571 goto err_per_task; 9572 } 9573 9574 /* force hw sync on the address filters */ 9575 event->addr_filters_gen = 1; 9576 } 9577 9578 if (!event->parent) { 9579 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 9580 err = get_callchain_buffers(attr->sample_max_stack); 9581 if (err) 9582 goto err_addr_filters; 9583 } 9584 } 9585 9586 /* symmetric to unaccount_event() in _free_event() */ 9587 account_event(event); 9588 9589 return event; 9590 9591 err_addr_filters: 9592 kfree(event->addr_filters_offs); 9593 9594 err_per_task: 9595 exclusive_event_destroy(event); 9596 9597 err_pmu: 9598 if (event->destroy) 9599 event->destroy(event); 9600 module_put(pmu->module); 9601 err_ns: 9602 if (is_cgroup_event(event)) 9603 perf_detach_cgroup(event); 9604 if (event->ns) 9605 put_pid_ns(event->ns); 9606 kfree(event); 9607 9608 return ERR_PTR(err); 9609 } 9610 9611 static int perf_copy_attr(struct perf_event_attr __user *uattr, 9612 struct perf_event_attr *attr) 9613 { 9614 u32 size; 9615 int ret; 9616 9617 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) 9618 return -EFAULT; 9619 9620 /* 9621 * zero the full structure, so that a short copy will be nice. 
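 *
 * (That way an older userspace passing a smaller attr leaves the newer
 * tail fields zeroed, while an attr larger than this kernel knows about
 * is only accepted if all of the extra bytes are zero - see the size
 * checks below.)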
9622 */ 9623 memset(attr, 0, sizeof(*attr)); 9624 9625 ret = get_user(size, &uattr->size); 9626 if (ret) 9627 return ret; 9628 9629 if (size > PAGE_SIZE) /* silly large */ 9630 goto err_size; 9631 9632 if (!size) /* abi compat */ 9633 size = PERF_ATTR_SIZE_VER0; 9634 9635 if (size < PERF_ATTR_SIZE_VER0) 9636 goto err_size; 9637 9638 /* 9639 * If we're handed a bigger struct than we know of, 9640 * ensure all the unknown bits are 0 - i.e. new 9641 * user-space does not rely on any kernel feature 9642 * extensions we dont know about yet. 9643 */ 9644 if (size > sizeof(*attr)) { 9645 unsigned char __user *addr; 9646 unsigned char __user *end; 9647 unsigned char val; 9648 9649 addr = (void __user *)uattr + sizeof(*attr); 9650 end = (void __user *)uattr + size; 9651 9652 for (; addr < end; addr++) { 9653 ret = get_user(val, addr); 9654 if (ret) 9655 return ret; 9656 if (val) 9657 goto err_size; 9658 } 9659 size = sizeof(*attr); 9660 } 9661 9662 ret = copy_from_user(attr, uattr, size); 9663 if (ret) 9664 return -EFAULT; 9665 9666 attr->size = size; 9667 9668 if (attr->__reserved_1) 9669 return -EINVAL; 9670 9671 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 9672 return -EINVAL; 9673 9674 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 9675 return -EINVAL; 9676 9677 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { 9678 u64 mask = attr->branch_sample_type; 9679 9680 /* only using defined bits */ 9681 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) 9682 return -EINVAL; 9683 9684 /* at least one branch bit must be set */ 9685 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) 9686 return -EINVAL; 9687 9688 /* propagate priv level, when not set for branch */ 9689 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { 9690 9691 /* exclude_kernel checked on syscall entry */ 9692 if (!attr->exclude_kernel) 9693 mask |= PERF_SAMPLE_BRANCH_KERNEL; 9694 9695 if (!attr->exclude_user) 9696 mask |= PERF_SAMPLE_BRANCH_USER; 9697 9698 if (!attr->exclude_hv) 9699 mask |= PERF_SAMPLE_BRANCH_HV; 9700 /* 9701 * adjust user setting (for HW filter setup) 9702 */ 9703 attr->branch_sample_type = mask; 9704 } 9705 /* privileged levels capture (kernel, hv): check permissions */ 9706 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) 9707 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 9708 return -EACCES; 9709 } 9710 9711 if (attr->sample_type & PERF_SAMPLE_REGS_USER) { 9712 ret = perf_reg_validate(attr->sample_regs_user); 9713 if (ret) 9714 return ret; 9715 } 9716 9717 if (attr->sample_type & PERF_SAMPLE_STACK_USER) { 9718 if (!arch_perf_have_user_stack_dump()) 9719 return -ENOSYS; 9720 9721 /* 9722 * We have __u32 type for the size, but so far 9723 * we can only use __u16 as maximum due to the 9724 * __u16 sample size limit. 
9725 */ 9726 if (attr->sample_stack_user >= USHRT_MAX) 9727 ret = -EINVAL; 9728 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) 9729 ret = -EINVAL; 9730 } 9731 9732 if (attr->sample_type & PERF_SAMPLE_REGS_INTR) 9733 ret = perf_reg_validate(attr->sample_regs_intr); 9734 out: 9735 return ret; 9736 9737 err_size: 9738 put_user(sizeof(*attr), &uattr->size); 9739 ret = -E2BIG; 9740 goto out; 9741 } 9742 9743 static int 9744 perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 9745 { 9746 struct ring_buffer *rb = NULL; 9747 int ret = -EINVAL; 9748 9749 if (!output_event) 9750 goto set; 9751 9752 /* don't allow circular references */ 9753 if (event == output_event) 9754 goto out; 9755 9756 /* 9757 * Don't allow cross-cpu buffers 9758 */ 9759 if (output_event->cpu != event->cpu) 9760 goto out; 9761 9762 /* 9763 * If its not a per-cpu rb, it must be the same task. 9764 */ 9765 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 9766 goto out; 9767 9768 /* 9769 * Mixing clocks in the same buffer is trouble you don't need. 9770 */ 9771 if (output_event->clock != event->clock) 9772 goto out; 9773 9774 /* 9775 * Either writing ring buffer from beginning or from end. 9776 * Mixing is not allowed. 9777 */ 9778 if (is_write_backward(output_event) != is_write_backward(event)) 9779 goto out; 9780 9781 /* 9782 * If both events generate aux data, they must be on the same PMU 9783 */ 9784 if (has_aux(event) && has_aux(output_event) && 9785 event->pmu != output_event->pmu) 9786 goto out; 9787 9788 set: 9789 mutex_lock(&event->mmap_mutex); 9790 /* Can't redirect output if we've got an active mmap() */ 9791 if (atomic_read(&event->mmap_count)) 9792 goto unlock; 9793 9794 if (output_event) { 9795 /* get the rb we want to redirect to */ 9796 rb = ring_buffer_get(output_event); 9797 if (!rb) 9798 goto unlock; 9799 } 9800 9801 ring_buffer_attach(event, rb); 9802 9803 ret = 0; 9804 unlock: 9805 mutex_unlock(&event->mmap_mutex); 9806 9807 out: 9808 return ret; 9809 } 9810 9811 static void mutex_lock_double(struct mutex *a, struct mutex *b) 9812 { 9813 if (b < a) 9814 swap(a, b); 9815 9816 mutex_lock(a); 9817 mutex_lock_nested(b, SINGLE_DEPTH_NESTING); 9818 } 9819 9820 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) 9821 { 9822 bool nmi_safe = false; 9823 9824 switch (clk_id) { 9825 case CLOCK_MONOTONIC: 9826 event->clock = &ktime_get_mono_fast_ns; 9827 nmi_safe = true; 9828 break; 9829 9830 case CLOCK_MONOTONIC_RAW: 9831 event->clock = &ktime_get_raw_fast_ns; 9832 nmi_safe = true; 9833 break; 9834 9835 case CLOCK_REALTIME: 9836 event->clock = &ktime_get_real_ns; 9837 break; 9838 9839 case CLOCK_BOOTTIME: 9840 event->clock = &ktime_get_boot_ns; 9841 break; 9842 9843 case CLOCK_TAI: 9844 event->clock = &ktime_get_tai_ns; 9845 break; 9846 9847 default: 9848 return -EINVAL; 9849 } 9850 9851 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) 9852 return -EINVAL; 9853 9854 return 0; 9855 } 9856 9857 /* 9858 * Variation on perf_event_ctx_lock_nested(), except we take two context 9859 * mutexes. 
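 *
 * (mutex_lock_double() above always takes the lower-addressed mutex first,
 * so the two ctx mutexes are acquired in a stable order no matter which
 * context turns out to be the group leader's.)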
9860 */ 9861 static struct perf_event_context * 9862 __perf_event_ctx_lock_double(struct perf_event *group_leader, 9863 struct perf_event_context *ctx) 9864 { 9865 struct perf_event_context *gctx; 9866 9867 again: 9868 rcu_read_lock(); 9869 gctx = READ_ONCE(group_leader->ctx); 9870 if (!atomic_inc_not_zero(&gctx->refcount)) { 9871 rcu_read_unlock(); 9872 goto again; 9873 } 9874 rcu_read_unlock(); 9875 9876 mutex_lock_double(&gctx->mutex, &ctx->mutex); 9877 9878 if (group_leader->ctx != gctx) { 9879 mutex_unlock(&ctx->mutex); 9880 mutex_unlock(&gctx->mutex); 9881 put_ctx(gctx); 9882 goto again; 9883 } 9884 9885 return gctx; 9886 } 9887 9888 /** 9889 * sys_perf_event_open - open a performance event, associate it to a task/cpu 9890 * 9891 * @attr_uptr: event_id type attributes for monitoring/sampling 9892 * @pid: target pid 9893 * @cpu: target cpu 9894 * @group_fd: group leader event fd 9895 */ 9896 SYSCALL_DEFINE5(perf_event_open, 9897 struct perf_event_attr __user *, attr_uptr, 9898 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 9899 { 9900 struct perf_event *group_leader = NULL, *output_event = NULL; 9901 struct perf_event *event, *sibling; 9902 struct perf_event_attr attr; 9903 struct perf_event_context *ctx, *uninitialized_var(gctx); 9904 struct file *event_file = NULL; 9905 struct fd group = {NULL, 0}; 9906 struct task_struct *task = NULL; 9907 struct pmu *pmu; 9908 int event_fd; 9909 int move_group = 0; 9910 int err; 9911 int f_flags = O_RDWR; 9912 int cgroup_fd = -1; 9913 9914 /* for future expandability... */ 9915 if (flags & ~PERF_FLAG_ALL) 9916 return -EINVAL; 9917 9918 err = perf_copy_attr(attr_uptr, &attr); 9919 if (err) 9920 return err; 9921 9922 if (!attr.exclude_kernel) { 9923 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 9924 return -EACCES; 9925 } 9926 9927 if (attr.namespaces) { 9928 if (!capable(CAP_SYS_ADMIN)) 9929 return -EACCES; 9930 } 9931 9932 if (attr.freq) { 9933 if (attr.sample_freq > sysctl_perf_event_sample_rate) 9934 return -EINVAL; 9935 } else { 9936 if (attr.sample_period & (1ULL << 63)) 9937 return -EINVAL; 9938 } 9939 9940 /* Only privileged users can get physical addresses */ 9941 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && 9942 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 9943 return -EACCES; 9944 9945 if (!attr.sample_max_stack) 9946 attr.sample_max_stack = sysctl_perf_event_max_stack; 9947 9948 /* 9949 * In cgroup mode, the pid argument is used to pass the fd 9950 * opened to the cgroup directory in cgroupfs. The cpu argument 9951 * designates the cpu on which to monitor threads from that 9952 * cgroup. 
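 *
 * An illustrative userspace call for that mode (the fd, cpu number and
 * cgroup mount point are made-up values):
 *
 *	int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygrp", O_RDONLY);
 *	int fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 3, -1,
 *			 PERF_FLAG_PID_CGROUP);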
9953 */ 9954 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 9955 return -EINVAL; 9956 9957 if (flags & PERF_FLAG_FD_CLOEXEC) 9958 f_flags |= O_CLOEXEC; 9959 9960 event_fd = get_unused_fd_flags(f_flags); 9961 if (event_fd < 0) 9962 return event_fd; 9963 9964 if (group_fd != -1) { 9965 err = perf_fget_light(group_fd, &group); 9966 if (err) 9967 goto err_fd; 9968 group_leader = group.file->private_data; 9969 if (flags & PERF_FLAG_FD_OUTPUT) 9970 output_event = group_leader; 9971 if (flags & PERF_FLAG_FD_NO_GROUP) 9972 group_leader = NULL; 9973 } 9974 9975 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { 9976 task = find_lively_task_by_vpid(pid); 9977 if (IS_ERR(task)) { 9978 err = PTR_ERR(task); 9979 goto err_group_fd; 9980 } 9981 } 9982 9983 if (task && group_leader && 9984 group_leader->attr.inherit != attr.inherit) { 9985 err = -EINVAL; 9986 goto err_task; 9987 } 9988 9989 if (task) { 9990 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); 9991 if (err) 9992 goto err_task; 9993 9994 /* 9995 * Reuse ptrace permission checks for now. 9996 * 9997 * We must hold cred_guard_mutex across this and any potential 9998 * perf_install_in_context() call for this new event to 9999 * serialize against exec() altering our credentials (and the 10000 * perf_event_exit_task() that could imply). 10001 */ 10002 err = -EACCES; 10003 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) 10004 goto err_cred; 10005 } 10006 10007 if (flags & PERF_FLAG_PID_CGROUP) 10008 cgroup_fd = pid; 10009 10010 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 10011 NULL, NULL, cgroup_fd); 10012 if (IS_ERR(event)) { 10013 err = PTR_ERR(event); 10014 goto err_cred; 10015 } 10016 10017 if (is_sampling_event(event)) { 10018 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { 10019 err = -EOPNOTSUPP; 10020 goto err_alloc; 10021 } 10022 } 10023 10024 /* 10025 * Special case software events and allow them to be part of 10026 * any hardware group. 10027 */ 10028 pmu = event->pmu; 10029 10030 if (attr.use_clockid) { 10031 err = perf_event_set_clock(event, attr.clockid); 10032 if (err) 10033 goto err_alloc; 10034 } 10035 10036 if (pmu->task_ctx_nr == perf_sw_context) 10037 event->event_caps |= PERF_EV_CAP_SOFTWARE; 10038 10039 if (group_leader && 10040 (is_software_event(event) != is_software_event(group_leader))) { 10041 if (is_software_event(event)) { 10042 /* 10043 * If event and group_leader are not both a software 10044 * event, and event is, then group leader is not. 10045 * 10046 * Allow the addition of software events to !software 10047 * groups, this is safe because software events never 10048 * fail to schedule. 10049 */ 10050 pmu = group_leader->pmu; 10051 } else if (is_software_event(group_leader) && 10052 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { 10053 /* 10054 * In case the group is a pure software group, and we 10055 * try to add a hardware event, move the whole group to 10056 * the hardware context. 
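 *
 * (For instance, a group led by a software cpu-clock event that gains a
 * hardware cycles sibling is moved wholesale to the hardware context.)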
10057 */ 10058 move_group = 1; 10059 } 10060 } 10061 10062 /* 10063 * Get the target context (task or percpu): 10064 */ 10065 ctx = find_get_context(pmu, task, event); 10066 if (IS_ERR(ctx)) { 10067 err = PTR_ERR(ctx); 10068 goto err_alloc; 10069 } 10070 10071 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { 10072 err = -EBUSY; 10073 goto err_context; 10074 } 10075 10076 /* 10077 * Look up the group leader (we will attach this event to it): 10078 */ 10079 if (group_leader) { 10080 err = -EINVAL; 10081 10082 /* 10083 * Do not allow a recursive hierarchy (this new sibling 10084 * becoming part of another group-sibling): 10085 */ 10086 if (group_leader->group_leader != group_leader) 10087 goto err_context; 10088 10089 /* All events in a group should have the same clock */ 10090 if (group_leader->clock != event->clock) 10091 goto err_context; 10092 10093 /* 10094 * Make sure we're both events for the same CPU; 10095 * grouping events for different CPUs is broken; since 10096 * you can never concurrently schedule them anyhow. 10097 */ 10098 if (group_leader->cpu != event->cpu) 10099 goto err_context; 10100 10101 /* 10102 * Make sure we're both on the same task, or both 10103 * per-CPU events. 10104 */ 10105 if (group_leader->ctx->task != ctx->task) 10106 goto err_context; 10107 10108 /* 10109 * Do not allow to attach to a group in a different task 10110 * or CPU context. If we're moving SW events, we'll fix 10111 * this up later, so allow that. 10112 */ 10113 if (!move_group && group_leader->ctx != ctx) 10114 goto err_context; 10115 10116 /* 10117 * Only a group leader can be exclusive or pinned 10118 */ 10119 if (attr.exclusive || attr.pinned) 10120 goto err_context; 10121 } 10122 10123 if (output_event) { 10124 err = perf_event_set_output(event, output_event); 10125 if (err) 10126 goto err_context; 10127 } 10128 10129 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, 10130 f_flags); 10131 if (IS_ERR(event_file)) { 10132 err = PTR_ERR(event_file); 10133 event_file = NULL; 10134 goto err_context; 10135 } 10136 10137 if (move_group) { 10138 gctx = __perf_event_ctx_lock_double(group_leader, ctx); 10139 10140 if (gctx->task == TASK_TOMBSTONE) { 10141 err = -ESRCH; 10142 goto err_locked; 10143 } 10144 10145 /* 10146 * Check if we raced against another sys_perf_event_open() call 10147 * moving the software group underneath us. 10148 */ 10149 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { 10150 /* 10151 * If someone moved the group out from under us, check 10152 * if this new event wound up on the same ctx, if so 10153 * its the regular !move_group case, otherwise fail. 10154 */ 10155 if (gctx != ctx) { 10156 err = -EINVAL; 10157 goto err_locked; 10158 } else { 10159 perf_event_ctx_unlock(group_leader, gctx); 10160 move_group = 0; 10161 } 10162 } 10163 } else { 10164 mutex_lock(&ctx->mutex); 10165 } 10166 10167 if (ctx->task == TASK_TOMBSTONE) { 10168 err = -ESRCH; 10169 goto err_locked; 10170 } 10171 10172 if (!perf_event_validate_size(event)) { 10173 err = -E2BIG; 10174 goto err_locked; 10175 } 10176 10177 if (!task) { 10178 /* 10179 * Check if the @cpu we're creating an event for is online. 10180 * 10181 * We use the perf_cpu_context::ctx::mutex to serialize against 10182 * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
10183 */ 10184 struct perf_cpu_context *cpuctx = 10185 container_of(ctx, struct perf_cpu_context, ctx); 10186 10187 if (!cpuctx->online) { 10188 err = -ENODEV; 10189 goto err_locked; 10190 } 10191 } 10192 10193 10194 /* 10195 * Must be under the same ctx::mutex as perf_install_in_context(), 10196 * because we need to serialize with concurrent event creation. 10197 */ 10198 if (!exclusive_event_installable(event, ctx)) { 10199 /* exclusive and group stuff are assumed mutually exclusive */ 10200 WARN_ON_ONCE(move_group); 10201 10202 err = -EBUSY; 10203 goto err_locked; 10204 } 10205 10206 WARN_ON_ONCE(ctx->parent_ctx); 10207 10208 /* 10209 * This is the point on no return; we cannot fail hereafter. This is 10210 * where we start modifying current state. 10211 */ 10212 10213 if (move_group) { 10214 /* 10215 * See perf_event_ctx_lock() for comments on the details 10216 * of swizzling perf_event::ctx. 10217 */ 10218 perf_remove_from_context(group_leader, 0); 10219 put_ctx(gctx); 10220 10221 list_for_each_entry(sibling, &group_leader->sibling_list, 10222 group_entry) { 10223 perf_remove_from_context(sibling, 0); 10224 put_ctx(gctx); 10225 } 10226 10227 /* 10228 * Wait for everybody to stop referencing the events through 10229 * the old lists, before installing it on new lists. 10230 */ 10231 synchronize_rcu(); 10232 10233 /* 10234 * Install the group siblings before the group leader. 10235 * 10236 * Because a group leader will try and install the entire group 10237 * (through the sibling list, which is still in-tact), we can 10238 * end up with siblings installed in the wrong context. 10239 * 10240 * By installing siblings first we NO-OP because they're not 10241 * reachable through the group lists. 10242 */ 10243 list_for_each_entry(sibling, &group_leader->sibling_list, 10244 group_entry) { 10245 perf_event__state_init(sibling); 10246 perf_install_in_context(ctx, sibling, sibling->cpu); 10247 get_ctx(ctx); 10248 } 10249 10250 /* 10251 * Removing from the context ends up with disabled 10252 * event. What we want here is event in the initial 10253 * startup state, ready to be add into new context. 10254 */ 10255 perf_event__state_init(group_leader); 10256 perf_install_in_context(ctx, group_leader, group_leader->cpu); 10257 get_ctx(ctx); 10258 } 10259 10260 /* 10261 * Precalculate sample_data sizes; do while holding ctx::mutex such 10262 * that we're serialized against further additions and before 10263 * perf_install_in_context() which is the point the event is active and 10264 * can use these values. 10265 */ 10266 perf_event__header_size(event); 10267 perf_event__id_header_size(event); 10268 10269 event->owner = current; 10270 10271 perf_install_in_context(ctx, event, event->cpu); 10272 perf_unpin_context(ctx); 10273 10274 if (move_group) 10275 perf_event_ctx_unlock(group_leader, gctx); 10276 mutex_unlock(&ctx->mutex); 10277 10278 if (task) { 10279 mutex_unlock(&task->signal->cred_guard_mutex); 10280 put_task_struct(task); 10281 } 10282 10283 mutex_lock(¤t->perf_event_mutex); 10284 list_add_tail(&event->owner_entry, ¤t->perf_event_list); 10285 mutex_unlock(¤t->perf_event_mutex); 10286 10287 /* 10288 * Drop the reference on the group_event after placing the 10289 * new event on the sibling_list. This ensures destruction 10290 * of the group leader will find the pointer to itself in 10291 * perf_group_detach(). 
10292 */ 10293 fdput(group); 10294 fd_install(event_fd, event_file); 10295 return event_fd; 10296 10297 err_locked: 10298 if (move_group) 10299 perf_event_ctx_unlock(group_leader, gctx); 10300 mutex_unlock(&ctx->mutex); 10301 /* err_file: */ 10302 fput(event_file); 10303 err_context: 10304 perf_unpin_context(ctx); 10305 put_ctx(ctx); 10306 err_alloc: 10307 /* 10308 * If event_file is set, the fput() above will have called ->release() 10309 * and that will take care of freeing the event. 10310 */ 10311 if (!event_file) 10312 free_event(event); 10313 err_cred: 10314 if (task) 10315 mutex_unlock(&task->signal->cred_guard_mutex); 10316 err_task: 10317 if (task) 10318 put_task_struct(task); 10319 err_group_fd: 10320 fdput(group); 10321 err_fd: 10322 put_unused_fd(event_fd); 10323 return err; 10324 } 10325 10326 /** 10327 * perf_event_create_kernel_counter 10328 * 10329 * @attr: attributes of the counter to create 10330 * @cpu: cpu in which the counter is bound 10331 * @task: task to profile (NULL for percpu) 10332 */ 10333 struct perf_event * 10334 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 10335 struct task_struct *task, 10336 perf_overflow_handler_t overflow_handler, 10337 void *context) 10338 { 10339 struct perf_event_context *ctx; 10340 struct perf_event *event; 10341 int err; 10342 10343 /* 10344 * Get the target context (task or percpu): 10345 */ 10346 10347 event = perf_event_alloc(attr, cpu, task, NULL, NULL, 10348 overflow_handler, context, -1); 10349 if (IS_ERR(event)) { 10350 err = PTR_ERR(event); 10351 goto err; 10352 } 10353 10354 /* Mark owner so we could distinguish it from user events. */ 10355 event->owner = TASK_TOMBSTONE; 10356 10357 ctx = find_get_context(event->pmu, task, event); 10358 if (IS_ERR(ctx)) { 10359 err = PTR_ERR(ctx); 10360 goto err_free; 10361 } 10362 10363 WARN_ON_ONCE(ctx->parent_ctx); 10364 mutex_lock(&ctx->mutex); 10365 if (ctx->task == TASK_TOMBSTONE) { 10366 err = -ESRCH; 10367 goto err_unlock; 10368 } 10369 10370 if (!task) { 10371 /* 10372 * Check if the @cpu we're creating an event for is online. 10373 * 10374 * We use the perf_cpu_context::ctx::mutex to serialize against 10375 * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 10376 */ 10377 struct perf_cpu_context *cpuctx = 10378 container_of(ctx, struct perf_cpu_context, ctx); 10379 if (!cpuctx->online) { 10380 err = -ENODEV; 10381 goto err_unlock; 10382 } 10383 } 10384 10385 if (!exclusive_event_installable(event, ctx)) { 10386 err = -EBUSY; 10387 goto err_unlock; 10388 } 10389 10390 perf_install_in_context(ctx, event, cpu); 10391 perf_unpin_context(ctx); 10392 mutex_unlock(&ctx->mutex); 10393 10394 return event; 10395 10396 err_unlock: 10397 mutex_unlock(&ctx->mutex); 10398 perf_unpin_context(ctx); 10399 put_ctx(ctx); 10400 err_free: 10401 free_event(event); 10402 err: 10403 return ERR_PTR(err); 10404 } 10405 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 10406 10407 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) 10408 { 10409 struct perf_event_context *src_ctx; 10410 struct perf_event_context *dst_ctx; 10411 struct perf_event *event, *tmp; 10412 LIST_HEAD(events); 10413 10414 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; 10415 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; 10416 10417 /* 10418 * See perf_event_ctx_lock() for comments on the details 10419 * of swizzling perf_event::ctx. 
10420 */ 10421 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); 10422 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 10423 event_entry) { 10424 perf_remove_from_context(event, 0); 10425 unaccount_event_cpu(event, src_cpu); 10426 put_ctx(src_ctx); 10427 list_add(&event->migrate_entry, &events); 10428 } 10429 10430 /* 10431 * Wait for the events to quiesce before re-instating them. 10432 */ 10433 synchronize_rcu(); 10434 10435 /* 10436 * Re-instate events in 2 passes. 10437 * 10438 * Skip over group leaders and only install siblings on this first 10439 * pass, siblings will not get enabled without a leader, however a 10440 * leader will enable its siblings, even if those are still on the old 10441 * context. 10442 */ 10443 list_for_each_entry_safe(event, tmp, &events, migrate_entry) { 10444 if (event->group_leader == event) 10445 continue; 10446 10447 list_del(&event->migrate_entry); 10448 if (event->state >= PERF_EVENT_STATE_OFF) 10449 event->state = PERF_EVENT_STATE_INACTIVE; 10450 account_event_cpu(event, dst_cpu); 10451 perf_install_in_context(dst_ctx, event, dst_cpu); 10452 get_ctx(dst_ctx); 10453 } 10454 10455 /* 10456 * Once all the siblings are setup properly, install the group leaders 10457 * to make it go. 10458 */ 10459 list_for_each_entry_safe(event, tmp, &events, migrate_entry) { 10460 list_del(&event->migrate_entry); 10461 if (event->state >= PERF_EVENT_STATE_OFF) 10462 event->state = PERF_EVENT_STATE_INACTIVE; 10463 account_event_cpu(event, dst_cpu); 10464 perf_install_in_context(dst_ctx, event, dst_cpu); 10465 get_ctx(dst_ctx); 10466 } 10467 mutex_unlock(&dst_ctx->mutex); 10468 mutex_unlock(&src_ctx->mutex); 10469 } 10470 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); 10471 10472 static void sync_child_event(struct perf_event *child_event, 10473 struct task_struct *child) 10474 { 10475 struct perf_event *parent_event = child_event->parent; 10476 u64 child_val; 10477 10478 if (child_event->attr.inherit_stat) 10479 perf_event_read_event(child_event, child); 10480 10481 child_val = perf_event_count(child_event); 10482 10483 /* 10484 * Add back the child's count to the parent's count: 10485 */ 10486 atomic64_add(child_val, &parent_event->child_count); 10487 atomic64_add(child_event->total_time_enabled, 10488 &parent_event->child_total_time_enabled); 10489 atomic64_add(child_event->total_time_running, 10490 &parent_event->child_total_time_running); 10491 } 10492 10493 static void 10494 perf_event_exit_event(struct perf_event *child_event, 10495 struct perf_event_context *child_ctx, 10496 struct task_struct *child) 10497 { 10498 struct perf_event *parent_event = child_event->parent; 10499 10500 /* 10501 * Do not destroy the 'original' grouping; because of the context 10502 * switch optimization the original events could've ended up in a 10503 * random child task. 10504 * 10505 * If we were to destroy the original group, all group related 10506 * operations would cease to function properly after this random 10507 * child dies. 10508 * 10509 * Do destroy all inherited groups, we don't care about those 10510 * and being thorough is better. 10511 */ 10512 raw_spin_lock_irq(&child_ctx->lock); 10513 WARN_ON_ONCE(child_ctx->is_active); 10514 10515 if (parent_event) 10516 perf_group_detach(child_event); 10517 list_del_event(child_event, child_ctx); 10518 perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ 10519 raw_spin_unlock_irq(&child_ctx->lock); 10520 10521 /* 10522 * Parent events are governed by their filedesc, retain them. 
static void sync_child_event(struct perf_event *child_event,
			     struct task_struct *child)
{
	struct perf_event *parent_event = child_event->parent;
	u64 child_val;

	if (child_event->attr.inherit_stat)
		perf_event_read_event(child_event, child);

	child_val = perf_event_count(child_event);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_event->child_count);
	atomic64_add(child_event->total_time_enabled,
		     &parent_event->child_total_time_enabled);
	atomic64_add(child_event->total_time_running,
		     &parent_event->child_total_time_running);
}

static void
perf_event_exit_event(struct perf_event *child_event,
		      struct perf_event_context *child_ctx,
		      struct task_struct *child)
{
	struct perf_event *parent_event = child_event->parent;

	/*
	 * Do not destroy the 'original' grouping; because of the context
	 * switch optimization the original events could've ended up in a
	 * random child task.
	 *
	 * If we were to destroy the original group, all group-related
	 * operations would cease to function properly after this random
	 * child dies.
	 *
	 * Do destroy all inherited groups; we don't care about those
	 * and being thorough is better.
	 */
	raw_spin_lock_irq(&child_ctx->lock);
	WARN_ON_ONCE(child_ctx->is_active);

	if (parent_event)
		perf_group_detach(child_event);
	list_del_event(child_event, child_ctx);
	perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
	raw_spin_unlock_irq(&child_ctx->lock);

	/*
	 * Parent events are governed by their file descriptor, retain them.
	 */
	if (!parent_event) {
		perf_event_wakeup(child_event);
		return;
	}
	/*
	 * Child events can be cleaned up.
	 */

	sync_child_event(child_event, child);

	/*
	 * Remove this event from the parent's list.
	 */
	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
	mutex_lock(&parent_event->child_mutex);
	list_del_init(&child_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	/*
	 * Kick perf_poll() for is_event_hup().
	 */
	perf_event_wakeup(parent_event);
	free_event(child_event);
	put_event(parent_event);
}

static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
	struct perf_event_context *child_ctx, *clone_ctx = NULL;
	struct perf_event *child_event, *next;

	WARN_ON_ONCE(child != current);

	child_ctx = perf_pin_task_context(child, ctxn);
	if (!child_ctx)
		return;

	/*
	 * To reduce the amount of trickiness in ctx tear-down, we hold
	 * ctx::mutex over the entire thing. This serializes against almost
	 * everything that wants to access the ctx.
	 *
	 * The exception is sys_perf_event_open() /
	 * perf_event_create_kernel_counter(), which call find_get_context()
	 * without ctx::mutex (they cannot take it because of the move_group
	 * double-mutex lock). See the comments in perf_install_in_context().
	 */
	mutex_lock(&child_ctx->mutex);

	/*
	 * In a single ctx::lock section, de-schedule the events and detach the
	 * context from the task such that we cannot ever get it scheduled back
	 * in.
	 */
	raw_spin_lock_irq(&child_ctx->lock);
	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);

	/*
	 * Now that the context is inactive, destroy the task <-> ctx relation
	 * and mark the context dead.
	 */
	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
	put_ctx(child_ctx); /* cannot be last */
	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
	put_task_struct(current); /* cannot be last */

	clone_ctx = unclone_ctx(child_ctx);
	raw_spin_unlock_irq(&child_ctx->lock);

	if (clone_ctx)
		put_ctx(clone_ctx);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can, however,
	 * still get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
		perf_event_exit_event(child_event, child_ctx, child);

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}

/*
 * When a child task exits, feed back event values to parent events.
 *
 * Can be called with cred_guard_mutex held when called from
 * install_exec_creds().
 */
void perf_event_exit_task(struct task_struct *child)
{
	struct perf_event *event, *tmp;
	int ctxn;

	mutex_lock(&child->perf_event_mutex);
	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
				 owner_entry) {
		list_del_init(&event->owner_entry);

		/*
		 * Ensure the list deletion is visible before we clear
		 * the owner; this closes a race against perf_release(),
		 * where we need to serialize on the owner->perf_event_mutex.
		 */
		smp_store_release(&event->owner, NULL);
	}
	mutex_unlock(&child->perf_event_mutex);

	for_each_task_context_nr(ctxn)
		perf_event_exit_task_context(child, ctxn);

	/*
	 * perf_event_exit_task_context() calls perf_event_task() with the
	 * child's task_ctx, which generates EXIT events for the child
	 * contexts, and sets child->perf_event_ctxp[] to NULL.
	 * At this point we need to send EXIT events to the cpu contexts.
	 */
	perf_event_task(child, NULL, 0);
}

static void perf_free_event(struct perf_event *event,
			    struct perf_event_context *ctx)
{
	struct perf_event *parent = event->parent;

	if (WARN_ON_ONCE(!parent))
		return;

	mutex_lock(&parent->child_mutex);
	list_del_init(&event->child_list);
	mutex_unlock(&parent->child_mutex);

	put_event(parent);

	raw_spin_lock_irq(&ctx->lock);
	perf_group_detach(event);
	list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
	free_event(event);
}

/*
 * Free an unexposed, unused context as created by inheritance by
 * perf_event_init_task() below; used by fork() in case of failure.
 *
 * Not all locks are strictly required, but take them anyway to be nice and
 * help out with the lockdep assertions.
 */
void perf_event_free_task(struct task_struct *task)
{
	struct perf_event_context *ctx;
	struct perf_event *event, *tmp;
	int ctxn;

	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (!ctx)
			continue;

		mutex_lock(&ctx->mutex);
		raw_spin_lock_irq(&ctx->lock);
		/*
		 * Destroy the task <-> ctx relation and mark the context dead.
		 *
		 * This is important because even though the task hasn't been
		 * exposed yet, the context has been (through child_list).
		 */
		RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
		WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
		put_task_struct(task); /* cannot be last */
		raw_spin_unlock_irq(&ctx->lock);

		list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
			perf_free_event(event, ctx);

		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);
	}
}

void perf_event_delayed_put(struct task_struct *task)
{
	int ctxn;

	for_each_task_context_nr(ctxn)
		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}

struct file *perf_event_get(unsigned int fd)
{
	struct file *file;

	file = fget_raw(fd);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &perf_fops) {
		fput(file);
		return ERR_PTR(-EBADF);
	}

	return file;
}

const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
	if (!event)
		return ERR_PTR(-EINVAL);

	return &event->attr;
}
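/*
 * Illustrative sketch (not part of this file): how an in-kernel consumer
 * might resolve a perf event fd received from userspace via the two helpers
 * above. The wrapper itself is hypothetical; a perf event file stores its
 * struct perf_event in file->private_data.
 *
 *	static int example_check_perf_fd(unsigned int fd)
 *	{
 *		const struct perf_event_attr *attr;
 *		struct perf_event *event;
 *		struct file *file;
 *		int ret = 0;
 *
 *		file = perf_event_get(fd);
 *		if (IS_ERR(file))
 *			return PTR_ERR(file);
 *
 *		event = file->private_data;
 *		attr = perf_event_attrs(event);
 *		if (IS_ERR(attr) || attr->type != PERF_TYPE_HARDWARE)
 *			ret = -EINVAL;
 *
 *		fput(file);
 *		return ret;
 *	}
 */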
/*
 * Inherit an event from parent task to child task.
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
	      struct task_struct *parent,
	      struct perf_event_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_event *group_leader,
	      struct perf_event_context *child_ctx)
{
	enum perf_event_state parent_state = parent_event->state;
	struct perf_event *child_event;
	unsigned long flags;

	/*
	 * Instead of creating recursive hierarchies of events,
	 * we link inherited events back to the original parent,
	 * which is guaranteed to have a filp, which we use as the
	 * reference count:
	 */
	if (parent_event->parent)
		parent_event = parent_event->parent;

	child_event = perf_event_alloc(&parent_event->attr,
				       parent_event->cpu,
				       child,
				       group_leader, parent_event,
				       NULL, NULL, -1);
	if (IS_ERR(child_event))
		return child_event;

	if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
	    !child_ctx->task_ctx_data) {
		struct pmu *pmu = child_event->pmu;

		child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
						   GFP_KERNEL);
		if (!child_ctx->task_ctx_data) {
			free_event(child_event);
			return NULL;
		}
	}

	/*
	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
	 * must be under the same lock in order to serialize against
	 * perf_event_release_kernel(), such that either we must observe
	 * is_orphaned_event() or they will observe us on the child_list.
	 */
	mutex_lock(&parent_event->child_mutex);
	if (is_orphaned_event(parent_event) ||
	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
		mutex_unlock(&parent_event->child_mutex);
		/* task_ctx_data is freed with child_ctx */
		free_event(child_event);
		return NULL;
	}

	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent event,
	 * not its attr.disabled bit. We hold the parent's mutex,
	 * so we won't race with perf_event_{en, dis}able_family.
	 */
	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
		child_event->state = PERF_EVENT_STATE_INACTIVE;
	else
		child_event->state = PERF_EVENT_STATE_OFF;

	if (parent_event->attr.freq) {
		u64 sample_period = parent_event->hw.sample_period;
		struct hw_perf_event *hwc = &child_event->hw;

		hwc->sample_period = sample_period;
		hwc->last_period   = sample_period;

		local64_set(&hwc->period_left, sample_period);
	}

	child_event->ctx = child_ctx;
	child_event->overflow_handler = parent_event->overflow_handler;
	child_event->overflow_handler_context
		= parent_event->overflow_handler_context;

	/*
	 * Precalculate sample_data sizes
	 */
	perf_event__header_size(child_event);
	perf_event__id_header_size(child_event);

	/*
	 * Link it up in the child's context:
	 */
	raw_spin_lock_irqsave(&child_ctx->lock, flags);
	add_event_to_ctx(child_event, child_ctx);
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Link this into the parent event's child list
	 */
	list_add_tail(&child_event->child_list, &parent_event->child_list);
	mutex_unlock(&parent_event->child_mutex);

	return child_event;
}

/*
 * Inherits an event group.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
			 struct task_struct *parent,
			 struct perf_event_context *parent_ctx,
			 struct task_struct *child,
			 struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
			       child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	/*
	 * @leader can be NULL here because of is_orphaned_event(). In this
	 * case inherit_event() will create individual events, similar to what
	 * perf_group_detach() would do anyway.
	 */
	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					  child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

/*
 * Creates the child task context and tries to inherit the event-group.
 *
 * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
 * inherited_all set when we 'fail' to inherit an orphaned event; this is
 * consistent with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child, int ctxn,
		   int *inherited_all)
{
	int ret;
	struct perf_event_context *child_ctx;

	if (!event->attr.inherit) {
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp[ctxn];
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp[ctxn] = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx,
			    child, child_ctx);

	if (ret)
		*inherited_all = 0;

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
static int perf_event_init_context(struct task_struct *child, int ctxn)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	unsigned long flags;
	int ret = 0;

	if (likely(!parent->perf_event_ctxp[ctxn]))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent, ctxn);
	if (!parent_ctx)
		return 0;

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			goto out_unlock;
	}

	/*
	 * We can't hold ctx->lock when iterating the ->flexible_groups list
	 * due to allocations, but we need to prevent rotation because
	 * rotate_ctx() will change the list from interrupt context.
	 */
	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 1;
	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
		ret = inherit_task_group(event, parent, parent_ctx,
					 child, ctxn, &inherited_all);
		if (ret)
			goto out_unlock;
	}

	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
	parent_ctx->rotate_disable = 0;

	child_ctx = child->perf_event_ctxp[ctxn];

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 *
		 * Note that if the parent is a clone, holding
		 * parent_ctx->lock prevents it from being uncloned.
		 */
		cloned_ctx = parent_ctx->parent_ctx;
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);
	put_ctx(parent_ctx);

	return ret;
}

/*
 * Initialize the perf_event context in task_struct
 */
int perf_event_init_task(struct task_struct *child)
{
	int ctxn, ret;

	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	for_each_task_context_nr(ctxn) {
		ret = perf_event_init_context(child, ctxn);
		if (ret) {
			perf_event_free_task(child);
			return ret;
		}
	}

	return 0;
}

static void __init perf_event_init_all_cpus(void)
{
	struct swevent_htable *swhash;
	int cpu;

	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);

	for_each_possible_cpu(cpu) {
		swhash = &per_cpu(swevent_htable, cpu);
		mutex_init(&swhash->hlist_mutex);
		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));

		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

#ifdef CONFIG_CGROUP_PERF
		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
	}
}

void perf_swevent_init_cpu(unsigned int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
		struct swevent_hlist *hlist;

		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
		WARN_ON(!hlist);
		rcu_assign_pointer(swhash->swevent_hlist, hlist);
	}
	mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
	struct perf_event_context *ctx = __info;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event *event;

	raw_spin_lock(&ctx->lock);
	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
	list_for_each_entry(event, &ctx->event_list, event_entry)
		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
	raw_spin_unlock(&ctx->lock);
}

static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;

	mutex_lock(&pmus_lock);
	list_for_each_entry(pmu, &pmus, entry) {
		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		cpuctx->online = 0;
		mutex_unlock(&ctx->mutex);
	}
	cpumask_clear_cpu(cpu, perf_online_mask);
	mutex_unlock(&pmus_lock);
}
#else

static void perf_event_exit_cpu_context(int cpu) { }

#endif
int perf_event_init_cpu(unsigned int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;

	perf_swevent_init_cpu(cpu);

	mutex_lock(&pmus_lock);
	cpumask_set_cpu(cpu, perf_online_mask);
	list_for_each_entry(pmu, &pmus, entry) {
		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		mutex_lock(&ctx->mutex);
		cpuctx->online = 1;
		mutex_unlock(&ctx->mutex);
	}
	mutex_unlock(&pmus_lock);

	return 0;
}

int perf_event_exit_cpu(unsigned int cpu)
{
	perf_event_exit_cpu_context(cpu);
	return 0;
}
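/*
 * Note (illustrative, not part of this file): perf_event_init_cpu() and
 * perf_event_exit_cpu() are not called by drivers; they are wired into the
 * CPU hotplug state machine (see CPUHP_PERF_PREPARE in kernel/cpu.c). A
 * driver with a comparable per-CPU setup/teardown pair would register its
 * own (hypothetical) callbacks roughly like:
 *
 *	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example/driver:online",
 *				example_driver_init_cpu, example_driver_exit_cpu);
 */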
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, NULL, -1);
	perf_pmu_register(&perf_task_clock, NULL, -1);
	perf_tp_register();
	perf_event_init_cpu(smp_processor_id());
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	/*
	 * Build time assertion that we keep the data_head at the intended
	 * location. IOW, validate that we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (!pmu->name || pmu->type < 0)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;

	rcu_read_lock();
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
	rcu_read_unlock();
	return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc	= perf_cgroup_css_alloc,
	.css_free	= perf_cgroup_css_free,
	.attach		= perf_cgroup_attach,
	/*
	 * Implicitly enable on dfl hierarchy so that perf events can
	 * always be filtered by cgroup2 path as long as the perf_event
	 * controller is not mounted on a legacy hierarchy.
	 */
	.implicit_on_dfl = true,
	.threaded	= true,
};
#endif /* CONFIG_CGROUP_PERF */