1 /* 2 * Performance events core code: 3 * 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 8 * 9 * For licensing details see kernel-base/COPYING 10 */ 11 12 #include <linux/fs.h> 13 #include <linux/mm.h> 14 #include <linux/cpu.h> 15 #include <linux/smp.h> 16 #include <linux/idr.h> 17 #include <linux/file.h> 18 #include <linux/poll.h> 19 #include <linux/slab.h> 20 #include <linux/hash.h> 21 #include <linux/tick.h> 22 #include <linux/sysfs.h> 23 #include <linux/dcache.h> 24 #include <linux/percpu.h> 25 #include <linux/ptrace.h> 26 #include <linux/reboot.h> 27 #include <linux/vmstat.h> 28 #include <linux/device.h> 29 #include <linux/export.h> 30 #include <linux/vmalloc.h> 31 #include <linux/hardirq.h> 32 #include <linux/rculist.h> 33 #include <linux/uaccess.h> 34 #include <linux/syscalls.h> 35 #include <linux/anon_inodes.h> 36 #include <linux/kernel_stat.h> 37 #include <linux/cgroup.h> 38 #include <linux/perf_event.h> 39 #include <linux/trace_events.h> 40 #include <linux/hw_breakpoint.h> 41 #include <linux/mm_types.h> 42 #include <linux/module.h> 43 #include <linux/mman.h> 44 #include <linux/compat.h> 45 #include <linux/bpf.h> 46 #include <linux/filter.h> 47 #include <linux/namei.h> 48 #include <linux/parser.h> 49 50 #include "internal.h" 51 52 #include <asm/irq_regs.h> 53 54 typedef int (*remote_function_f)(void *); 55 56 struct remote_function_call { 57 struct task_struct *p; 58 remote_function_f func; 59 void *info; 60 int ret; 61 }; 62 63 static void remote_function(void *data) 64 { 65 struct remote_function_call *tfc = data; 66 struct task_struct *p = tfc->p; 67 68 if (p) { 69 /* -EAGAIN */ 70 if (task_cpu(p) != smp_processor_id()) 71 return; 72 73 /* 74 * Now that we're on right CPU with IRQs disabled, we can test 75 * if we hit the right task without races. 76 */ 77 78 tfc->ret = -ESRCH; /* No such (running) process */ 79 if (p != current) 80 return; 81 } 82 83 tfc->ret = tfc->func(tfc->info); 84 } 85 86 /** 87 * task_function_call - call a function on the cpu on which a task runs 88 * @p: the task to evaluate 89 * @func: the function to be called 90 * @info: the function call argument 91 * 92 * Calls the function @func when the task is currently running. This might 93 * be on the current CPU, which just calls the function directly 94 * 95 * returns: @func return value, or 96 * -ESRCH - when the process isn't running 97 * -EAGAIN - when the process moved away 98 */ 99 static int 100 task_function_call(struct task_struct *p, remote_function_f func, void *info) 101 { 102 struct remote_function_call data = { 103 .p = p, 104 .func = func, 105 .info = info, 106 .ret = -EAGAIN, 107 }; 108 int ret; 109 110 do { 111 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); 112 if (!ret) 113 ret = data.ret; 114 } while (ret == -EAGAIN); 115 116 return ret; 117 } 118 119 /** 120 * cpu_function_call - call a function on the cpu 121 * @func: the function to be called 122 * @info: the function call argument 123 * 124 * Calls the function @func on the remote cpu. 125 * 126 * returns: @func return value or -ENXIO when the cpu is offline 127 */ 128 static int cpu_function_call(int cpu, remote_function_f func, void *info) 129 { 130 struct remote_function_call data = { 131 .p = NULL, 132 .func = func, 133 .info = info, 134 .ret = -ENXIO, /* No such CPU */ 135 }; 136 137 smp_call_function_single(cpu, remote_function, &data, 1); 138 139 return data.ret; 140 } 141 142 static inline struct perf_cpu_context * 143 __get_cpu_context(struct perf_event_context *ctx) 144 { 145 return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 146 } 147 148 static void perf_ctx_lock(struct perf_cpu_context *cpuctx, 149 struct perf_event_context *ctx) 150 { 151 raw_spin_lock(&cpuctx->ctx.lock); 152 if (ctx) 153 raw_spin_lock(&ctx->lock); 154 } 155 156 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, 157 struct perf_event_context *ctx) 158 { 159 if (ctx) 160 raw_spin_unlock(&ctx->lock); 161 raw_spin_unlock(&cpuctx->ctx.lock); 162 } 163 164 #define TASK_TOMBSTONE ((void *)-1L) 165 166 static bool is_kernel_event(struct perf_event *event) 167 { 168 return READ_ONCE(event->owner) == TASK_TOMBSTONE; 169 } 170 171 /* 172 * On task ctx scheduling... 173 * 174 * When !ctx->nr_events a task context will not be scheduled. This means 175 * we can disable the scheduler hooks (for performance) without leaving 176 * pending task ctx state. 177 * 178 * This however results in two special cases: 179 * 180 * - removing the last event from a task ctx; this is relatively straight 181 * forward and is done in __perf_remove_from_context. 182 * 183 * - adding the first event to a task ctx; this is tricky because we cannot 184 * rely on ctx->is_active and therefore cannot use event_function_call(). 185 * See perf_install_in_context(). 186 * 187 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. 188 */ 189 190 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, 191 struct perf_event_context *, void *); 192 193 struct event_function_struct { 194 struct perf_event *event; 195 event_f func; 196 void *data; 197 }; 198 199 static int event_function(void *info) 200 { 201 struct event_function_struct *efs = info; 202 struct perf_event *event = efs->event; 203 struct perf_event_context *ctx = event->ctx; 204 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 205 struct perf_event_context *task_ctx = cpuctx->task_ctx; 206 int ret = 0; 207 208 WARN_ON_ONCE(!irqs_disabled()); 209 210 perf_ctx_lock(cpuctx, task_ctx); 211 /* 212 * Since we do the IPI call without holding ctx->lock things can have 213 * changed, double check we hit the task we set out to hit. 214 */ 215 if (ctx->task) { 216 if (ctx->task != current) { 217 ret = -ESRCH; 218 goto unlock; 219 } 220 221 /* 222 * We only use event_function_call() on established contexts, 223 * and event_function() is only ever called when active (or 224 * rather, we'll have bailed in task_function_call() or the 225 * above ctx->task != current test), therefore we must have 226 * ctx->is_active here. 227 */ 228 WARN_ON_ONCE(!ctx->is_active); 229 /* 230 * And since we have ctx->is_active, cpuctx->task_ctx must 231 * match. 232 */ 233 WARN_ON_ONCE(task_ctx != ctx); 234 } else { 235 WARN_ON_ONCE(&cpuctx->ctx != ctx); 236 } 237 238 efs->func(event, cpuctx, ctx, efs->data); 239 unlock: 240 perf_ctx_unlock(cpuctx, task_ctx); 241 242 return ret; 243 } 244 245 static void event_function_local(struct perf_event *event, event_f func, void *data) 246 { 247 struct event_function_struct efs = { 248 .event = event, 249 .func = func, 250 .data = data, 251 }; 252 253 int ret = event_function(&efs); 254 WARN_ON_ONCE(ret); 255 } 256 257 static void event_function_call(struct perf_event *event, event_f func, void *data) 258 { 259 struct perf_event_context *ctx = event->ctx; 260 struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ 261 struct event_function_struct efs = { 262 .event = event, 263 .func = func, 264 .data = data, 265 }; 266 267 if (!event->parent) { 268 /* 269 * If this is a !child event, we must hold ctx::mutex to 270 * stabilize the the event->ctx relation. See 271 * perf_event_ctx_lock(). 272 */ 273 lockdep_assert_held(&ctx->mutex); 274 } 275 276 if (!task) { 277 cpu_function_call(event->cpu, event_function, &efs); 278 return; 279 } 280 281 if (task == TASK_TOMBSTONE) 282 return; 283 284 again: 285 if (!task_function_call(task, event_function, &efs)) 286 return; 287 288 raw_spin_lock_irq(&ctx->lock); 289 /* 290 * Reload the task pointer, it might have been changed by 291 * a concurrent perf_event_context_sched_out(). 292 */ 293 task = ctx->task; 294 if (task == TASK_TOMBSTONE) { 295 raw_spin_unlock_irq(&ctx->lock); 296 return; 297 } 298 if (ctx->is_active) { 299 raw_spin_unlock_irq(&ctx->lock); 300 goto again; 301 } 302 func(event, NULL, ctx, data); 303 raw_spin_unlock_irq(&ctx->lock); 304 } 305 306 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 307 PERF_FLAG_FD_OUTPUT |\ 308 PERF_FLAG_PID_CGROUP |\ 309 PERF_FLAG_FD_CLOEXEC) 310 311 /* 312 * branch priv levels that need permission checks 313 */ 314 #define PERF_SAMPLE_BRANCH_PERM_PLM \ 315 (PERF_SAMPLE_BRANCH_KERNEL |\ 316 PERF_SAMPLE_BRANCH_HV) 317 318 enum event_type_t { 319 EVENT_FLEXIBLE = 0x1, 320 EVENT_PINNED = 0x2, 321 EVENT_TIME = 0x4, 322 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 323 }; 324 325 /* 326 * perf_sched_events : >0 events exist 327 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 328 */ 329 330 static void perf_sched_delayed(struct work_struct *work); 331 DEFINE_STATIC_KEY_FALSE(perf_sched_events); 332 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); 333 static DEFINE_MUTEX(perf_sched_mutex); 334 static atomic_t perf_sched_count; 335 336 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 337 static DEFINE_PER_CPU(int, perf_sched_cb_usages); 338 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); 339 340 static atomic_t nr_mmap_events __read_mostly; 341 static atomic_t nr_comm_events __read_mostly; 342 static atomic_t nr_task_events __read_mostly; 343 static atomic_t nr_freq_events __read_mostly; 344 static atomic_t nr_switch_events __read_mostly; 345 346 static LIST_HEAD(pmus); 347 static DEFINE_MUTEX(pmus_lock); 348 static struct srcu_struct pmus_srcu; 349 350 /* 351 * perf event paranoia level: 352 * -1 - not paranoid at all 353 * 0 - disallow raw tracepoint access for unpriv 354 * 1 - disallow cpu events for unpriv 355 * 2 - disallow kernel profiling for unpriv 356 */ 357 int sysctl_perf_event_paranoid __read_mostly = 2; 358 359 /* Minimum for 512 kiB + 1 user control page */ 360 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ 361 362 /* 363 * max perf event sample rate 364 */ 365 #define DEFAULT_MAX_SAMPLE_RATE 100000 366 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) 367 #define DEFAULT_CPU_TIME_MAX_PERCENT 25 368 369 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; 370 371 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 372 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 373 374 static int perf_sample_allowed_ns __read_mostly = 375 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; 376 377 static void update_perf_cpu_limits(void) 378 { 379 u64 tmp = perf_sample_period_ns; 380 381 tmp *= sysctl_perf_cpu_time_max_percent; 382 tmp = div_u64(tmp, 100); 383 if (!tmp) 384 tmp = 1; 385 386 WRITE_ONCE(perf_sample_allowed_ns, tmp); 387 } 388 389 static int perf_rotate_context(struct perf_cpu_context *cpuctx); 390 391 int perf_proc_update_handler(struct ctl_table *table, int write, 392 void __user *buffer, size_t *lenp, 393 loff_t *ppos) 394 { 395 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 396 397 if (ret || !write) 398 return ret; 399 400 /* 401 * If throttling is disabled don't allow the write: 402 */ 403 if (sysctl_perf_cpu_time_max_percent == 100 || 404 sysctl_perf_cpu_time_max_percent == 0) 405 return -EINVAL; 406 407 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 408 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 409 update_perf_cpu_limits(); 410 411 return 0; 412 } 413 414 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; 415 416 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, 417 void __user *buffer, size_t *lenp, 418 loff_t *ppos) 419 { 420 int ret = proc_dointvec(table, write, buffer, lenp, ppos); 421 422 if (ret || !write) 423 return ret; 424 425 if (sysctl_perf_cpu_time_max_percent == 100 || 426 sysctl_perf_cpu_time_max_percent == 0) { 427 printk(KERN_WARNING 428 "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); 429 WRITE_ONCE(perf_sample_allowed_ns, 0); 430 } else { 431 update_perf_cpu_limits(); 432 } 433 434 return 0; 435 } 436 437 /* 438 * perf samples are done in some very critical code paths (NMIs). 439 * If they take too much CPU time, the system can lock up and not 440 * get any real work done. This will drop the sample rate when 441 * we detect that events are taking too long. 442 */ 443 #define NR_ACCUMULATED_SAMPLES 128 444 static DEFINE_PER_CPU(u64, running_sample_length); 445 446 static u64 __report_avg; 447 static u64 __report_allowed; 448 449 static void perf_duration_warn(struct irq_work *w) 450 { 451 printk_ratelimited(KERN_INFO 452 "perf: interrupt took too long (%lld > %lld), lowering " 453 "kernel.perf_event_max_sample_rate to %d\n", 454 __report_avg, __report_allowed, 455 sysctl_perf_event_sample_rate); 456 } 457 458 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); 459 460 void perf_sample_event_took(u64 sample_len_ns) 461 { 462 u64 max_len = READ_ONCE(perf_sample_allowed_ns); 463 u64 running_len; 464 u64 avg_len; 465 u32 max; 466 467 if (max_len == 0) 468 return; 469 470 /* Decay the counter by 1 average sample. */ 471 running_len = __this_cpu_read(running_sample_length); 472 running_len -= running_len/NR_ACCUMULATED_SAMPLES; 473 running_len += sample_len_ns; 474 __this_cpu_write(running_sample_length, running_len); 475 476 /* 477 * Note: this will be biased artifically low until we have 478 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us 479 * from having to maintain a count. 480 */ 481 avg_len = running_len/NR_ACCUMULATED_SAMPLES; 482 if (avg_len <= max_len) 483 return; 484 485 __report_avg = avg_len; 486 __report_allowed = max_len; 487 488 /* 489 * Compute a throttle threshold 25% below the current duration. 490 */ 491 avg_len += avg_len / 4; 492 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; 493 if (avg_len < max) 494 max /= (u32)avg_len; 495 else 496 max = 1; 497 498 WRITE_ONCE(perf_sample_allowed_ns, avg_len); 499 WRITE_ONCE(max_samples_per_tick, max); 500 501 sysctl_perf_event_sample_rate = max * HZ; 502 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 503 504 if (!irq_work_queue(&perf_duration_work)) { 505 early_printk("perf: interrupt took too long (%lld > %lld), lowering " 506 "kernel.perf_event_max_sample_rate to %d\n", 507 __report_avg, __report_allowed, 508 sysctl_perf_event_sample_rate); 509 } 510 } 511 512 static atomic64_t perf_event_id; 513 514 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 515 enum event_type_t event_type); 516 517 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 518 enum event_type_t event_type, 519 struct task_struct *task); 520 521 static void update_context_time(struct perf_event_context *ctx); 522 static u64 perf_event_time(struct perf_event *event); 523 524 void __weak perf_event_print_debug(void) { } 525 526 extern __weak const char *perf_pmu_name(void) 527 { 528 return "pmu"; 529 } 530 531 static inline u64 perf_clock(void) 532 { 533 return local_clock(); 534 } 535 536 static inline u64 perf_event_clock(struct perf_event *event) 537 { 538 return event->clock(); 539 } 540 541 #ifdef CONFIG_CGROUP_PERF 542 543 static inline bool 544 perf_cgroup_match(struct perf_event *event) 545 { 546 struct perf_event_context *ctx = event->ctx; 547 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 548 549 /* @event doesn't care about cgroup */ 550 if (!event->cgrp) 551 return true; 552 553 /* wants specific cgroup scope but @cpuctx isn't associated with any */ 554 if (!cpuctx->cgrp) 555 return false; 556 557 /* 558 * Cgroup scoping is recursive. An event enabled for a cgroup is 559 * also enabled for all its descendant cgroups. If @cpuctx's 560 * cgroup is a descendant of @event's (the test covers identity 561 * case), it's a match. 562 */ 563 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, 564 event->cgrp->css.cgroup); 565 } 566 567 static inline void perf_detach_cgroup(struct perf_event *event) 568 { 569 css_put(&event->cgrp->css); 570 event->cgrp = NULL; 571 } 572 573 static inline int is_cgroup_event(struct perf_event *event) 574 { 575 return event->cgrp != NULL; 576 } 577 578 static inline u64 perf_cgroup_event_time(struct perf_event *event) 579 { 580 struct perf_cgroup_info *t; 581 582 t = per_cpu_ptr(event->cgrp->info, event->cpu); 583 return t->time; 584 } 585 586 static inline void __update_cgrp_time(struct perf_cgroup *cgrp) 587 { 588 struct perf_cgroup_info *info; 589 u64 now; 590 591 now = perf_clock(); 592 593 info = this_cpu_ptr(cgrp->info); 594 595 info->time += now - info->timestamp; 596 info->timestamp = now; 597 } 598 599 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) 600 { 601 struct perf_cgroup *cgrp_out = cpuctx->cgrp; 602 if (cgrp_out) 603 __update_cgrp_time(cgrp_out); 604 } 605 606 static inline void update_cgrp_time_from_event(struct perf_event *event) 607 { 608 struct perf_cgroup *cgrp; 609 610 /* 611 * ensure we access cgroup data only when needed and 612 * when we know the cgroup is pinned (css_get) 613 */ 614 if (!is_cgroup_event(event)) 615 return; 616 617 cgrp = perf_cgroup_from_task(current, event->ctx); 618 /* 619 * Do not update time when cgroup is not active 620 */ 621 if (cgrp == event->cgrp) 622 __update_cgrp_time(event->cgrp); 623 } 624 625 static inline void 626 perf_cgroup_set_timestamp(struct task_struct *task, 627 struct perf_event_context *ctx) 628 { 629 struct perf_cgroup *cgrp; 630 struct perf_cgroup_info *info; 631 632 /* 633 * ctx->lock held by caller 634 * ensure we do not access cgroup data 635 * unless we have the cgroup pinned (css_get) 636 */ 637 if (!task || !ctx->nr_cgroups) 638 return; 639 640 cgrp = perf_cgroup_from_task(task, ctx); 641 info = this_cpu_ptr(cgrp->info); 642 info->timestamp = ctx->timestamp; 643 } 644 645 #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ 646 #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ 647 648 /* 649 * reschedule events based on the cgroup constraint of task. 650 * 651 * mode SWOUT : schedule out everything 652 * mode SWIN : schedule in based on cgroup for next 653 */ 654 static void perf_cgroup_switch(struct task_struct *task, int mode) 655 { 656 struct perf_cpu_context *cpuctx; 657 struct pmu *pmu; 658 unsigned long flags; 659 660 /* 661 * disable interrupts to avoid geting nr_cgroup 662 * changes via __perf_event_disable(). Also 663 * avoids preemption. 664 */ 665 local_irq_save(flags); 666 667 /* 668 * we reschedule only in the presence of cgroup 669 * constrained events. 670 */ 671 672 list_for_each_entry_rcu(pmu, &pmus, entry) { 673 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 674 if (cpuctx->unique_pmu != pmu) 675 continue; /* ensure we process each cpuctx once */ 676 677 /* 678 * perf_cgroup_events says at least one 679 * context on this CPU has cgroup events. 680 * 681 * ctx->nr_cgroups reports the number of cgroup 682 * events for a context. 683 */ 684 if (cpuctx->ctx.nr_cgroups > 0) { 685 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 686 perf_pmu_disable(cpuctx->ctx.pmu); 687 688 if (mode & PERF_CGROUP_SWOUT) { 689 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 690 /* 691 * must not be done before ctxswout due 692 * to event_filter_match() in event_sched_out() 693 */ 694 cpuctx->cgrp = NULL; 695 } 696 697 if (mode & PERF_CGROUP_SWIN) { 698 WARN_ON_ONCE(cpuctx->cgrp); 699 /* 700 * set cgrp before ctxsw in to allow 701 * event_filter_match() to not have to pass 702 * task around 703 * we pass the cpuctx->ctx to perf_cgroup_from_task() 704 * because cgorup events are only per-cpu 705 */ 706 cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); 707 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 708 } 709 perf_pmu_enable(cpuctx->ctx.pmu); 710 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 711 } 712 } 713 714 local_irq_restore(flags); 715 } 716 717 static inline void perf_cgroup_sched_out(struct task_struct *task, 718 struct task_struct *next) 719 { 720 struct perf_cgroup *cgrp1; 721 struct perf_cgroup *cgrp2 = NULL; 722 723 rcu_read_lock(); 724 /* 725 * we come here when we know perf_cgroup_events > 0 726 * we do not need to pass the ctx here because we know 727 * we are holding the rcu lock 728 */ 729 cgrp1 = perf_cgroup_from_task(task, NULL); 730 cgrp2 = perf_cgroup_from_task(next, NULL); 731 732 /* 733 * only schedule out current cgroup events if we know 734 * that we are switching to a different cgroup. Otherwise, 735 * do no touch the cgroup events. 736 */ 737 if (cgrp1 != cgrp2) 738 perf_cgroup_switch(task, PERF_CGROUP_SWOUT); 739 740 rcu_read_unlock(); 741 } 742 743 static inline void perf_cgroup_sched_in(struct task_struct *prev, 744 struct task_struct *task) 745 { 746 struct perf_cgroup *cgrp1; 747 struct perf_cgroup *cgrp2 = NULL; 748 749 rcu_read_lock(); 750 /* 751 * we come here when we know perf_cgroup_events > 0 752 * we do not need to pass the ctx here because we know 753 * we are holding the rcu lock 754 */ 755 cgrp1 = perf_cgroup_from_task(task, NULL); 756 cgrp2 = perf_cgroup_from_task(prev, NULL); 757 758 /* 759 * only need to schedule in cgroup events if we are changing 760 * cgroup during ctxsw. Cgroup events were not scheduled 761 * out of ctxsw out if that was not the case. 762 */ 763 if (cgrp1 != cgrp2) 764 perf_cgroup_switch(task, PERF_CGROUP_SWIN); 765 766 rcu_read_unlock(); 767 } 768 769 static inline int perf_cgroup_connect(int fd, struct perf_event *event, 770 struct perf_event_attr *attr, 771 struct perf_event *group_leader) 772 { 773 struct perf_cgroup *cgrp; 774 struct cgroup_subsys_state *css; 775 struct fd f = fdget(fd); 776 int ret = 0; 777 778 if (!f.file) 779 return -EBADF; 780 781 css = css_tryget_online_from_dir(f.file->f_path.dentry, 782 &perf_event_cgrp_subsys); 783 if (IS_ERR(css)) { 784 ret = PTR_ERR(css); 785 goto out; 786 } 787 788 cgrp = container_of(css, struct perf_cgroup, css); 789 event->cgrp = cgrp; 790 791 /* 792 * all events in a group must monitor 793 * the same cgroup because a task belongs 794 * to only one perf cgroup at a time 795 */ 796 if (group_leader && group_leader->cgrp != cgrp) { 797 perf_detach_cgroup(event); 798 ret = -EINVAL; 799 } 800 out: 801 fdput(f); 802 return ret; 803 } 804 805 static inline void 806 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) 807 { 808 struct perf_cgroup_info *t; 809 t = per_cpu_ptr(event->cgrp->info, event->cpu); 810 event->shadow_ctx_time = now - t->timestamp; 811 } 812 813 static inline void 814 perf_cgroup_defer_enabled(struct perf_event *event) 815 { 816 /* 817 * when the current task's perf cgroup does not match 818 * the event's, we need to remember to call the 819 * perf_mark_enable() function the first time a task with 820 * a matching perf cgroup is scheduled in. 821 */ 822 if (is_cgroup_event(event) && !perf_cgroup_match(event)) 823 event->cgrp_defer_enabled = 1; 824 } 825 826 static inline void 827 perf_cgroup_mark_enabled(struct perf_event *event, 828 struct perf_event_context *ctx) 829 { 830 struct perf_event *sub; 831 u64 tstamp = perf_event_time(event); 832 833 if (!event->cgrp_defer_enabled) 834 return; 835 836 event->cgrp_defer_enabled = 0; 837 838 event->tstamp_enabled = tstamp - event->total_time_enabled; 839 list_for_each_entry(sub, &event->sibling_list, group_entry) { 840 if (sub->state >= PERF_EVENT_STATE_INACTIVE) { 841 sub->tstamp_enabled = tstamp - sub->total_time_enabled; 842 sub->cgrp_defer_enabled = 0; 843 } 844 } 845 } 846 847 /* 848 * Update cpuctx->cgrp so that it is set when first cgroup event is added and 849 * cleared when last cgroup event is removed. 850 */ 851 static inline void 852 list_update_cgroup_event(struct perf_event *event, 853 struct perf_event_context *ctx, bool add) 854 { 855 struct perf_cpu_context *cpuctx; 856 857 if (!is_cgroup_event(event)) 858 return; 859 860 if (add && ctx->nr_cgroups++) 861 return; 862 else if (!add && --ctx->nr_cgroups) 863 return; 864 /* 865 * Because cgroup events are always per-cpu events, 866 * this will always be called from the right CPU. 867 */ 868 cpuctx = __get_cpu_context(ctx); 869 cpuctx->cgrp = add ? event->cgrp : NULL; 870 } 871 872 #else /* !CONFIG_CGROUP_PERF */ 873 874 static inline bool 875 perf_cgroup_match(struct perf_event *event) 876 { 877 return true; 878 } 879 880 static inline void perf_detach_cgroup(struct perf_event *event) 881 {} 882 883 static inline int is_cgroup_event(struct perf_event *event) 884 { 885 return 0; 886 } 887 888 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) 889 { 890 return 0; 891 } 892 893 static inline void update_cgrp_time_from_event(struct perf_event *event) 894 { 895 } 896 897 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) 898 { 899 } 900 901 static inline void perf_cgroup_sched_out(struct task_struct *task, 902 struct task_struct *next) 903 { 904 } 905 906 static inline void perf_cgroup_sched_in(struct task_struct *prev, 907 struct task_struct *task) 908 { 909 } 910 911 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, 912 struct perf_event_attr *attr, 913 struct perf_event *group_leader) 914 { 915 return -EINVAL; 916 } 917 918 static inline void 919 perf_cgroup_set_timestamp(struct task_struct *task, 920 struct perf_event_context *ctx) 921 { 922 } 923 924 void 925 perf_cgroup_switch(struct task_struct *task, struct task_struct *next) 926 { 927 } 928 929 static inline void 930 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) 931 { 932 } 933 934 static inline u64 perf_cgroup_event_time(struct perf_event *event) 935 { 936 return 0; 937 } 938 939 static inline void 940 perf_cgroup_defer_enabled(struct perf_event *event) 941 { 942 } 943 944 static inline void 945 perf_cgroup_mark_enabled(struct perf_event *event, 946 struct perf_event_context *ctx) 947 { 948 } 949 950 static inline void 951 list_update_cgroup_event(struct perf_event *event, 952 struct perf_event_context *ctx, bool add) 953 { 954 } 955 956 #endif 957 958 /* 959 * set default to be dependent on timer tick just 960 * like original code 961 */ 962 #define PERF_CPU_HRTIMER (1000 / HZ) 963 /* 964 * function must be called with interrupts disbled 965 */ 966 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) 967 { 968 struct perf_cpu_context *cpuctx; 969 int rotations = 0; 970 971 WARN_ON(!irqs_disabled()); 972 973 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); 974 rotations = perf_rotate_context(cpuctx); 975 976 raw_spin_lock(&cpuctx->hrtimer_lock); 977 if (rotations) 978 hrtimer_forward_now(hr, cpuctx->hrtimer_interval); 979 else 980 cpuctx->hrtimer_active = 0; 981 raw_spin_unlock(&cpuctx->hrtimer_lock); 982 983 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; 984 } 985 986 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) 987 { 988 struct hrtimer *timer = &cpuctx->hrtimer; 989 struct pmu *pmu = cpuctx->ctx.pmu; 990 u64 interval; 991 992 /* no multiplexing needed for SW PMU */ 993 if (pmu->task_ctx_nr == perf_sw_context) 994 return; 995 996 /* 997 * check default is sane, if not set then force to 998 * default interval (1/tick) 999 */ 1000 interval = pmu->hrtimer_interval_ms; 1001 if (interval < 1) 1002 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; 1003 1004 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); 1005 1006 raw_spin_lock_init(&cpuctx->hrtimer_lock); 1007 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); 1008 timer->function = perf_mux_hrtimer_handler; 1009 } 1010 1011 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) 1012 { 1013 struct hrtimer *timer = &cpuctx->hrtimer; 1014 struct pmu *pmu = cpuctx->ctx.pmu; 1015 unsigned long flags; 1016 1017 /* not for SW PMU */ 1018 if (pmu->task_ctx_nr == perf_sw_context) 1019 return 0; 1020 1021 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); 1022 if (!cpuctx->hrtimer_active) { 1023 cpuctx->hrtimer_active = 1; 1024 hrtimer_forward_now(timer, cpuctx->hrtimer_interval); 1025 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 1026 } 1027 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); 1028 1029 return 0; 1030 } 1031 1032 void perf_pmu_disable(struct pmu *pmu) 1033 { 1034 int *count = this_cpu_ptr(pmu->pmu_disable_count); 1035 if (!(*count)++) 1036 pmu->pmu_disable(pmu); 1037 } 1038 1039 void perf_pmu_enable(struct pmu *pmu) 1040 { 1041 int *count = this_cpu_ptr(pmu->pmu_disable_count); 1042 if (!--(*count)) 1043 pmu->pmu_enable(pmu); 1044 } 1045 1046 static DEFINE_PER_CPU(struct list_head, active_ctx_list); 1047 1048 /* 1049 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and 1050 * perf_event_task_tick() are fully serialized because they're strictly cpu 1051 * affine and perf_event_ctx{activate,deactivate} are called with IRQs 1052 * disabled, while perf_event_task_tick is called from IRQ context. 1053 */ 1054 static void perf_event_ctx_activate(struct perf_event_context *ctx) 1055 { 1056 struct list_head *head = this_cpu_ptr(&active_ctx_list); 1057 1058 WARN_ON(!irqs_disabled()); 1059 1060 WARN_ON(!list_empty(&ctx->active_ctx_list)); 1061 1062 list_add(&ctx->active_ctx_list, head); 1063 } 1064 1065 static void perf_event_ctx_deactivate(struct perf_event_context *ctx) 1066 { 1067 WARN_ON(!irqs_disabled()); 1068 1069 WARN_ON(list_empty(&ctx->active_ctx_list)); 1070 1071 list_del_init(&ctx->active_ctx_list); 1072 } 1073 1074 static void get_ctx(struct perf_event_context *ctx) 1075 { 1076 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 1077 } 1078 1079 static void free_ctx(struct rcu_head *head) 1080 { 1081 struct perf_event_context *ctx; 1082 1083 ctx = container_of(head, struct perf_event_context, rcu_head); 1084 kfree(ctx->task_ctx_data); 1085 kfree(ctx); 1086 } 1087 1088 static void put_ctx(struct perf_event_context *ctx) 1089 { 1090 if (atomic_dec_and_test(&ctx->refcount)) { 1091 if (ctx->parent_ctx) 1092 put_ctx(ctx->parent_ctx); 1093 if (ctx->task && ctx->task != TASK_TOMBSTONE) 1094 put_task_struct(ctx->task); 1095 call_rcu(&ctx->rcu_head, free_ctx); 1096 } 1097 } 1098 1099 /* 1100 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and 1101 * perf_pmu_migrate_context() we need some magic. 1102 * 1103 * Those places that change perf_event::ctx will hold both 1104 * perf_event_ctx::mutex of the 'old' and 'new' ctx value. 1105 * 1106 * Lock ordering is by mutex address. There are two other sites where 1107 * perf_event_context::mutex nests and those are: 1108 * 1109 * - perf_event_exit_task_context() [ child , 0 ] 1110 * perf_event_exit_event() 1111 * put_event() [ parent, 1 ] 1112 * 1113 * - perf_event_init_context() [ parent, 0 ] 1114 * inherit_task_group() 1115 * inherit_group() 1116 * inherit_event() 1117 * perf_event_alloc() 1118 * perf_init_event() 1119 * perf_try_init_event() [ child , 1 ] 1120 * 1121 * While it appears there is an obvious deadlock here -- the parent and child 1122 * nesting levels are inverted between the two. This is in fact safe because 1123 * life-time rules separate them. That is an exiting task cannot fork, and a 1124 * spawning task cannot (yet) exit. 1125 * 1126 * But remember that that these are parent<->child context relations, and 1127 * migration does not affect children, therefore these two orderings should not 1128 * interact. 1129 * 1130 * The change in perf_event::ctx does not affect children (as claimed above) 1131 * because the sys_perf_event_open() case will install a new event and break 1132 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only 1133 * concerned with cpuctx and that doesn't have children. 1134 * 1135 * The places that change perf_event::ctx will issue: 1136 * 1137 * perf_remove_from_context(); 1138 * synchronize_rcu(); 1139 * perf_install_in_context(); 1140 * 1141 * to affect the change. The remove_from_context() + synchronize_rcu() should 1142 * quiesce the event, after which we can install it in the new location. This 1143 * means that only external vectors (perf_fops, prctl) can perturb the event 1144 * while in transit. Therefore all such accessors should also acquire 1145 * perf_event_context::mutex to serialize against this. 1146 * 1147 * However; because event->ctx can change while we're waiting to acquire 1148 * ctx->mutex we must be careful and use the below perf_event_ctx_lock() 1149 * function. 1150 * 1151 * Lock order: 1152 * cred_guard_mutex 1153 * task_struct::perf_event_mutex 1154 * perf_event_context::mutex 1155 * perf_event::child_mutex; 1156 * perf_event_context::lock 1157 * perf_event::mmap_mutex 1158 * mmap_sem 1159 */ 1160 static struct perf_event_context * 1161 perf_event_ctx_lock_nested(struct perf_event *event, int nesting) 1162 { 1163 struct perf_event_context *ctx; 1164 1165 again: 1166 rcu_read_lock(); 1167 ctx = ACCESS_ONCE(event->ctx); 1168 if (!atomic_inc_not_zero(&ctx->refcount)) { 1169 rcu_read_unlock(); 1170 goto again; 1171 } 1172 rcu_read_unlock(); 1173 1174 mutex_lock_nested(&ctx->mutex, nesting); 1175 if (event->ctx != ctx) { 1176 mutex_unlock(&ctx->mutex); 1177 put_ctx(ctx); 1178 goto again; 1179 } 1180 1181 return ctx; 1182 } 1183 1184 static inline struct perf_event_context * 1185 perf_event_ctx_lock(struct perf_event *event) 1186 { 1187 return perf_event_ctx_lock_nested(event, 0); 1188 } 1189 1190 static void perf_event_ctx_unlock(struct perf_event *event, 1191 struct perf_event_context *ctx) 1192 { 1193 mutex_unlock(&ctx->mutex); 1194 put_ctx(ctx); 1195 } 1196 1197 /* 1198 * This must be done under the ctx->lock, such as to serialize against 1199 * context_equiv(), therefore we cannot call put_ctx() since that might end up 1200 * calling scheduler related locks and ctx->lock nests inside those. 1201 */ 1202 static __must_check struct perf_event_context * 1203 unclone_ctx(struct perf_event_context *ctx) 1204 { 1205 struct perf_event_context *parent_ctx = ctx->parent_ctx; 1206 1207 lockdep_assert_held(&ctx->lock); 1208 1209 if (parent_ctx) 1210 ctx->parent_ctx = NULL; 1211 ctx->generation++; 1212 1213 return parent_ctx; 1214 } 1215 1216 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 1217 { 1218 /* 1219 * only top level events have the pid namespace they were created in 1220 */ 1221 if (event->parent) 1222 event = event->parent; 1223 1224 return task_tgid_nr_ns(p, event->ns); 1225 } 1226 1227 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) 1228 { 1229 /* 1230 * only top level events have the pid namespace they were created in 1231 */ 1232 if (event->parent) 1233 event = event->parent; 1234 1235 return task_pid_nr_ns(p, event->ns); 1236 } 1237 1238 /* 1239 * If we inherit events we want to return the parent event id 1240 * to userspace. 1241 */ 1242 static u64 primary_event_id(struct perf_event *event) 1243 { 1244 u64 id = event->id; 1245 1246 if (event->parent) 1247 id = event->parent->id; 1248 1249 return id; 1250 } 1251 1252 /* 1253 * Get the perf_event_context for a task and lock it. 1254 * 1255 * This has to cope with with the fact that until it is locked, 1256 * the context could get moved to another task. 1257 */ 1258 static struct perf_event_context * 1259 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) 1260 { 1261 struct perf_event_context *ctx; 1262 1263 retry: 1264 /* 1265 * One of the few rules of preemptible RCU is that one cannot do 1266 * rcu_read_unlock() while holding a scheduler (or nested) lock when 1267 * part of the read side critical section was irqs-enabled -- see 1268 * rcu_read_unlock_special(). 1269 * 1270 * Since ctx->lock nests under rq->lock we must ensure the entire read 1271 * side critical section has interrupts disabled. 1272 */ 1273 local_irq_save(*flags); 1274 rcu_read_lock(); 1275 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); 1276 if (ctx) { 1277 /* 1278 * If this context is a clone of another, it might 1279 * get swapped for another underneath us by 1280 * perf_event_task_sched_out, though the 1281 * rcu_read_lock() protects us from any context 1282 * getting freed. Lock the context and check if it 1283 * got swapped before we could get the lock, and retry 1284 * if so. If we locked the right context, then it 1285 * can't get swapped on us any more. 1286 */ 1287 raw_spin_lock(&ctx->lock); 1288 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { 1289 raw_spin_unlock(&ctx->lock); 1290 rcu_read_unlock(); 1291 local_irq_restore(*flags); 1292 goto retry; 1293 } 1294 1295 if (ctx->task == TASK_TOMBSTONE || 1296 !atomic_inc_not_zero(&ctx->refcount)) { 1297 raw_spin_unlock(&ctx->lock); 1298 ctx = NULL; 1299 } else { 1300 WARN_ON_ONCE(ctx->task != task); 1301 } 1302 } 1303 rcu_read_unlock(); 1304 if (!ctx) 1305 local_irq_restore(*flags); 1306 return ctx; 1307 } 1308 1309 /* 1310 * Get the context for a task and increment its pin_count so it 1311 * can't get swapped to another task. This also increments its 1312 * reference count so that the context can't get freed. 1313 */ 1314 static struct perf_event_context * 1315 perf_pin_task_context(struct task_struct *task, int ctxn) 1316 { 1317 struct perf_event_context *ctx; 1318 unsigned long flags; 1319 1320 ctx = perf_lock_task_context(task, ctxn, &flags); 1321 if (ctx) { 1322 ++ctx->pin_count; 1323 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1324 } 1325 return ctx; 1326 } 1327 1328 static void perf_unpin_context(struct perf_event_context *ctx) 1329 { 1330 unsigned long flags; 1331 1332 raw_spin_lock_irqsave(&ctx->lock, flags); 1333 --ctx->pin_count; 1334 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1335 } 1336 1337 /* 1338 * Update the record of the current time in a context. 1339 */ 1340 static void update_context_time(struct perf_event_context *ctx) 1341 { 1342 u64 now = perf_clock(); 1343 1344 ctx->time += now - ctx->timestamp; 1345 ctx->timestamp = now; 1346 } 1347 1348 static u64 perf_event_time(struct perf_event *event) 1349 { 1350 struct perf_event_context *ctx = event->ctx; 1351 1352 if (is_cgroup_event(event)) 1353 return perf_cgroup_event_time(event); 1354 1355 return ctx ? ctx->time : 0; 1356 } 1357 1358 /* 1359 * Update the total_time_enabled and total_time_running fields for a event. 1360 */ 1361 static void update_event_times(struct perf_event *event) 1362 { 1363 struct perf_event_context *ctx = event->ctx; 1364 u64 run_end; 1365 1366 lockdep_assert_held(&ctx->lock); 1367 1368 if (event->state < PERF_EVENT_STATE_INACTIVE || 1369 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 1370 return; 1371 1372 /* 1373 * in cgroup mode, time_enabled represents 1374 * the time the event was enabled AND active 1375 * tasks were in the monitored cgroup. This is 1376 * independent of the activity of the context as 1377 * there may be a mix of cgroup and non-cgroup events. 1378 * 1379 * That is why we treat cgroup events differently 1380 * here. 1381 */ 1382 if (is_cgroup_event(event)) 1383 run_end = perf_cgroup_event_time(event); 1384 else if (ctx->is_active) 1385 run_end = ctx->time; 1386 else 1387 run_end = event->tstamp_stopped; 1388 1389 event->total_time_enabled = run_end - event->tstamp_enabled; 1390 1391 if (event->state == PERF_EVENT_STATE_INACTIVE) 1392 run_end = event->tstamp_stopped; 1393 else 1394 run_end = perf_event_time(event); 1395 1396 event->total_time_running = run_end - event->tstamp_running; 1397 1398 } 1399 1400 /* 1401 * Update total_time_enabled and total_time_running for all events in a group. 1402 */ 1403 static void update_group_times(struct perf_event *leader) 1404 { 1405 struct perf_event *event; 1406 1407 update_event_times(leader); 1408 list_for_each_entry(event, &leader->sibling_list, group_entry) 1409 update_event_times(event); 1410 } 1411 1412 static struct list_head * 1413 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) 1414 { 1415 if (event->attr.pinned) 1416 return &ctx->pinned_groups; 1417 else 1418 return &ctx->flexible_groups; 1419 } 1420 1421 /* 1422 * Add a event from the lists for its context. 1423 * Must be called with ctx->mutex and ctx->lock held. 1424 */ 1425 static void 1426 list_add_event(struct perf_event *event, struct perf_event_context *ctx) 1427 { 1428 1429 lockdep_assert_held(&ctx->lock); 1430 1431 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); 1432 event->attach_state |= PERF_ATTACH_CONTEXT; 1433 1434 /* 1435 * If we're a stand alone event or group leader, we go to the context 1436 * list, group events are kept attached to the group so that 1437 * perf_group_detach can, at all times, locate all siblings. 1438 */ 1439 if (event->group_leader == event) { 1440 struct list_head *list; 1441 1442 if (is_software_event(event)) 1443 event->group_flags |= PERF_GROUP_SOFTWARE; 1444 1445 list = ctx_group_list(event, ctx); 1446 list_add_tail(&event->group_entry, list); 1447 } 1448 1449 list_update_cgroup_event(event, ctx, true); 1450 1451 list_add_rcu(&event->event_entry, &ctx->event_list); 1452 ctx->nr_events++; 1453 if (event->attr.inherit_stat) 1454 ctx->nr_stat++; 1455 1456 ctx->generation++; 1457 } 1458 1459 /* 1460 * Initialize event state based on the perf_event_attr::disabled. 1461 */ 1462 static inline void perf_event__state_init(struct perf_event *event) 1463 { 1464 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : 1465 PERF_EVENT_STATE_INACTIVE; 1466 } 1467 1468 static void __perf_event_read_size(struct perf_event *event, int nr_siblings) 1469 { 1470 int entry = sizeof(u64); /* value */ 1471 int size = 0; 1472 int nr = 1; 1473 1474 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1475 size += sizeof(u64); 1476 1477 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 1478 size += sizeof(u64); 1479 1480 if (event->attr.read_format & PERF_FORMAT_ID) 1481 entry += sizeof(u64); 1482 1483 if (event->attr.read_format & PERF_FORMAT_GROUP) { 1484 nr += nr_siblings; 1485 size += sizeof(u64); 1486 } 1487 1488 size += entry * nr; 1489 event->read_size = size; 1490 } 1491 1492 static void __perf_event_header_size(struct perf_event *event, u64 sample_type) 1493 { 1494 struct perf_sample_data *data; 1495 u16 size = 0; 1496 1497 if (sample_type & PERF_SAMPLE_IP) 1498 size += sizeof(data->ip); 1499 1500 if (sample_type & PERF_SAMPLE_ADDR) 1501 size += sizeof(data->addr); 1502 1503 if (sample_type & PERF_SAMPLE_PERIOD) 1504 size += sizeof(data->period); 1505 1506 if (sample_type & PERF_SAMPLE_WEIGHT) 1507 size += sizeof(data->weight); 1508 1509 if (sample_type & PERF_SAMPLE_READ) 1510 size += event->read_size; 1511 1512 if (sample_type & PERF_SAMPLE_DATA_SRC) 1513 size += sizeof(data->data_src.val); 1514 1515 if (sample_type & PERF_SAMPLE_TRANSACTION) 1516 size += sizeof(data->txn); 1517 1518 event->header_size = size; 1519 } 1520 1521 /* 1522 * Called at perf_event creation and when events are attached/detached from a 1523 * group. 1524 */ 1525 static void perf_event__header_size(struct perf_event *event) 1526 { 1527 __perf_event_read_size(event, 1528 event->group_leader->nr_siblings); 1529 __perf_event_header_size(event, event->attr.sample_type); 1530 } 1531 1532 static void perf_event__id_header_size(struct perf_event *event) 1533 { 1534 struct perf_sample_data *data; 1535 u64 sample_type = event->attr.sample_type; 1536 u16 size = 0; 1537 1538 if (sample_type & PERF_SAMPLE_TID) 1539 size += sizeof(data->tid_entry); 1540 1541 if (sample_type & PERF_SAMPLE_TIME) 1542 size += sizeof(data->time); 1543 1544 if (sample_type & PERF_SAMPLE_IDENTIFIER) 1545 size += sizeof(data->id); 1546 1547 if (sample_type & PERF_SAMPLE_ID) 1548 size += sizeof(data->id); 1549 1550 if (sample_type & PERF_SAMPLE_STREAM_ID) 1551 size += sizeof(data->stream_id); 1552 1553 if (sample_type & PERF_SAMPLE_CPU) 1554 size += sizeof(data->cpu_entry); 1555 1556 event->id_header_size = size; 1557 } 1558 1559 static bool perf_event_validate_size(struct perf_event *event) 1560 { 1561 /* 1562 * The values computed here will be over-written when we actually 1563 * attach the event. 1564 */ 1565 __perf_event_read_size(event, event->group_leader->nr_siblings + 1); 1566 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); 1567 perf_event__id_header_size(event); 1568 1569 /* 1570 * Sum the lot; should not exceed the 64k limit we have on records. 1571 * Conservative limit to allow for callchains and other variable fields. 1572 */ 1573 if (event->read_size + event->header_size + 1574 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) 1575 return false; 1576 1577 return true; 1578 } 1579 1580 static void perf_group_attach(struct perf_event *event) 1581 { 1582 struct perf_event *group_leader = event->group_leader, *pos; 1583 1584 /* 1585 * We can have double attach due to group movement in perf_event_open. 1586 */ 1587 if (event->attach_state & PERF_ATTACH_GROUP) 1588 return; 1589 1590 event->attach_state |= PERF_ATTACH_GROUP; 1591 1592 if (group_leader == event) 1593 return; 1594 1595 WARN_ON_ONCE(group_leader->ctx != event->ctx); 1596 1597 if (group_leader->group_flags & PERF_GROUP_SOFTWARE && 1598 !is_software_event(event)) 1599 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; 1600 1601 list_add_tail(&event->group_entry, &group_leader->sibling_list); 1602 group_leader->nr_siblings++; 1603 1604 perf_event__header_size(group_leader); 1605 1606 list_for_each_entry(pos, &group_leader->sibling_list, group_entry) 1607 perf_event__header_size(pos); 1608 } 1609 1610 /* 1611 * Remove a event from the lists for its context. 1612 * Must be called with ctx->mutex and ctx->lock held. 1613 */ 1614 static void 1615 list_del_event(struct perf_event *event, struct perf_event_context *ctx) 1616 { 1617 WARN_ON_ONCE(event->ctx != ctx); 1618 lockdep_assert_held(&ctx->lock); 1619 1620 /* 1621 * We can have double detach due to exit/hot-unplug + close. 1622 */ 1623 if (!(event->attach_state & PERF_ATTACH_CONTEXT)) 1624 return; 1625 1626 event->attach_state &= ~PERF_ATTACH_CONTEXT; 1627 1628 list_update_cgroup_event(event, ctx, false); 1629 1630 ctx->nr_events--; 1631 if (event->attr.inherit_stat) 1632 ctx->nr_stat--; 1633 1634 list_del_rcu(&event->event_entry); 1635 1636 if (event->group_leader == event) 1637 list_del_init(&event->group_entry); 1638 1639 update_group_times(event); 1640 1641 /* 1642 * If event was in error state, then keep it 1643 * that way, otherwise bogus counts will be 1644 * returned on read(). The only way to get out 1645 * of error state is by explicit re-enabling 1646 * of the event 1647 */ 1648 if (event->state > PERF_EVENT_STATE_OFF) 1649 event->state = PERF_EVENT_STATE_OFF; 1650 1651 ctx->generation++; 1652 } 1653 1654 static void perf_group_detach(struct perf_event *event) 1655 { 1656 struct perf_event *sibling, *tmp; 1657 struct list_head *list = NULL; 1658 1659 /* 1660 * We can have double detach due to exit/hot-unplug + close. 1661 */ 1662 if (!(event->attach_state & PERF_ATTACH_GROUP)) 1663 return; 1664 1665 event->attach_state &= ~PERF_ATTACH_GROUP; 1666 1667 /* 1668 * If this is a sibling, remove it from its group. 1669 */ 1670 if (event->group_leader != event) { 1671 list_del_init(&event->group_entry); 1672 event->group_leader->nr_siblings--; 1673 goto out; 1674 } 1675 1676 if (!list_empty(&event->group_entry)) 1677 list = &event->group_entry; 1678 1679 /* 1680 * If this was a group event with sibling events then 1681 * upgrade the siblings to singleton events by adding them 1682 * to whatever list we are on. 1683 */ 1684 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 1685 if (list) 1686 list_move_tail(&sibling->group_entry, list); 1687 sibling->group_leader = sibling; 1688 1689 /* Inherit group flags from the previous leader */ 1690 sibling->group_flags = event->group_flags; 1691 1692 WARN_ON_ONCE(sibling->ctx != event->ctx); 1693 } 1694 1695 out: 1696 perf_event__header_size(event->group_leader); 1697 1698 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) 1699 perf_event__header_size(tmp); 1700 } 1701 1702 static bool is_orphaned_event(struct perf_event *event) 1703 { 1704 return event->state == PERF_EVENT_STATE_DEAD; 1705 } 1706 1707 static inline int __pmu_filter_match(struct perf_event *event) 1708 { 1709 struct pmu *pmu = event->pmu; 1710 return pmu->filter_match ? pmu->filter_match(event) : 1; 1711 } 1712 1713 /* 1714 * Check whether we should attempt to schedule an event group based on 1715 * PMU-specific filtering. An event group can consist of HW and SW events, 1716 * potentially with a SW leader, so we must check all the filters, to 1717 * determine whether a group is schedulable: 1718 */ 1719 static inline int pmu_filter_match(struct perf_event *event) 1720 { 1721 struct perf_event *child; 1722 1723 if (!__pmu_filter_match(event)) 1724 return 0; 1725 1726 list_for_each_entry(child, &event->sibling_list, group_entry) { 1727 if (!__pmu_filter_match(child)) 1728 return 0; 1729 } 1730 1731 return 1; 1732 } 1733 1734 static inline int 1735 event_filter_match(struct perf_event *event) 1736 { 1737 return (event->cpu == -1 || event->cpu == smp_processor_id()) && 1738 perf_cgroup_match(event) && pmu_filter_match(event); 1739 } 1740 1741 static void 1742 event_sched_out(struct perf_event *event, 1743 struct perf_cpu_context *cpuctx, 1744 struct perf_event_context *ctx) 1745 { 1746 u64 tstamp = perf_event_time(event); 1747 u64 delta; 1748 1749 WARN_ON_ONCE(event->ctx != ctx); 1750 lockdep_assert_held(&ctx->lock); 1751 1752 /* 1753 * An event which could not be activated because of 1754 * filter mismatch still needs to have its timings 1755 * maintained, otherwise bogus information is return 1756 * via read() for time_enabled, time_running: 1757 */ 1758 if (event->state == PERF_EVENT_STATE_INACTIVE && 1759 !event_filter_match(event)) { 1760 delta = tstamp - event->tstamp_stopped; 1761 event->tstamp_running += delta; 1762 event->tstamp_stopped = tstamp; 1763 } 1764 1765 if (event->state != PERF_EVENT_STATE_ACTIVE) 1766 return; 1767 1768 perf_pmu_disable(event->pmu); 1769 1770 event->tstamp_stopped = tstamp; 1771 event->pmu->del(event, 0); 1772 event->oncpu = -1; 1773 event->state = PERF_EVENT_STATE_INACTIVE; 1774 if (event->pending_disable) { 1775 event->pending_disable = 0; 1776 event->state = PERF_EVENT_STATE_OFF; 1777 } 1778 1779 if (!is_software_event(event)) 1780 cpuctx->active_oncpu--; 1781 if (!--ctx->nr_active) 1782 perf_event_ctx_deactivate(ctx); 1783 if (event->attr.freq && event->attr.sample_freq) 1784 ctx->nr_freq--; 1785 if (event->attr.exclusive || !cpuctx->active_oncpu) 1786 cpuctx->exclusive = 0; 1787 1788 perf_pmu_enable(event->pmu); 1789 } 1790 1791 static void 1792 group_sched_out(struct perf_event *group_event, 1793 struct perf_cpu_context *cpuctx, 1794 struct perf_event_context *ctx) 1795 { 1796 struct perf_event *event; 1797 int state = group_event->state; 1798 1799 event_sched_out(group_event, cpuctx, ctx); 1800 1801 /* 1802 * Schedule out siblings (if any): 1803 */ 1804 list_for_each_entry(event, &group_event->sibling_list, group_entry) 1805 event_sched_out(event, cpuctx, ctx); 1806 1807 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) 1808 cpuctx->exclusive = 0; 1809 } 1810 1811 #define DETACH_GROUP 0x01UL 1812 1813 /* 1814 * Cross CPU call to remove a performance event 1815 * 1816 * We disable the event on the hardware level first. After that we 1817 * remove it from the context list. 1818 */ 1819 static void 1820 __perf_remove_from_context(struct perf_event *event, 1821 struct perf_cpu_context *cpuctx, 1822 struct perf_event_context *ctx, 1823 void *info) 1824 { 1825 unsigned long flags = (unsigned long)info; 1826 1827 event_sched_out(event, cpuctx, ctx); 1828 if (flags & DETACH_GROUP) 1829 perf_group_detach(event); 1830 list_del_event(event, ctx); 1831 1832 if (!ctx->nr_events && ctx->is_active) { 1833 ctx->is_active = 0; 1834 if (ctx->task) { 1835 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 1836 cpuctx->task_ctx = NULL; 1837 } 1838 } 1839 } 1840 1841 /* 1842 * Remove the event from a task's (or a CPU's) list of events. 1843 * 1844 * If event->ctx is a cloned context, callers must make sure that 1845 * every task struct that event->ctx->task could possibly point to 1846 * remains valid. This is OK when called from perf_release since 1847 * that only calls us on the top-level context, which can't be a clone. 1848 * When called from perf_event_exit_task, it's OK because the 1849 * context has been detached from its task. 1850 */ 1851 static void perf_remove_from_context(struct perf_event *event, unsigned long flags) 1852 { 1853 lockdep_assert_held(&event->ctx->mutex); 1854 1855 event_function_call(event, __perf_remove_from_context, (void *)flags); 1856 } 1857 1858 /* 1859 * Cross CPU call to disable a performance event 1860 */ 1861 static void __perf_event_disable(struct perf_event *event, 1862 struct perf_cpu_context *cpuctx, 1863 struct perf_event_context *ctx, 1864 void *info) 1865 { 1866 if (event->state < PERF_EVENT_STATE_INACTIVE) 1867 return; 1868 1869 update_context_time(ctx); 1870 update_cgrp_time_from_event(event); 1871 update_group_times(event); 1872 if (event == event->group_leader) 1873 group_sched_out(event, cpuctx, ctx); 1874 else 1875 event_sched_out(event, cpuctx, ctx); 1876 event->state = PERF_EVENT_STATE_OFF; 1877 } 1878 1879 /* 1880 * Disable a event. 1881 * 1882 * If event->ctx is a cloned context, callers must make sure that 1883 * every task struct that event->ctx->task could possibly point to 1884 * remains valid. This condition is satisifed when called through 1885 * perf_event_for_each_child or perf_event_for_each because they 1886 * hold the top-level event's child_mutex, so any descendant that 1887 * goes to exit will block in perf_event_exit_event(). 1888 * 1889 * When called from perf_pending_event it's OK because event->ctx 1890 * is the current context on this CPU and preemption is disabled, 1891 * hence we can't get into perf_event_task_sched_out for this context. 1892 */ 1893 static void _perf_event_disable(struct perf_event *event) 1894 { 1895 struct perf_event_context *ctx = event->ctx; 1896 1897 raw_spin_lock_irq(&ctx->lock); 1898 if (event->state <= PERF_EVENT_STATE_OFF) { 1899 raw_spin_unlock_irq(&ctx->lock); 1900 return; 1901 } 1902 raw_spin_unlock_irq(&ctx->lock); 1903 1904 event_function_call(event, __perf_event_disable, NULL); 1905 } 1906 1907 void perf_event_disable_local(struct perf_event *event) 1908 { 1909 event_function_local(event, __perf_event_disable, NULL); 1910 } 1911 1912 /* 1913 * Strictly speaking kernel users cannot create groups and therefore this 1914 * interface does not need the perf_event_ctx_lock() magic. 1915 */ 1916 void perf_event_disable(struct perf_event *event) 1917 { 1918 struct perf_event_context *ctx; 1919 1920 ctx = perf_event_ctx_lock(event); 1921 _perf_event_disable(event); 1922 perf_event_ctx_unlock(event, ctx); 1923 } 1924 EXPORT_SYMBOL_GPL(perf_event_disable); 1925 1926 static void perf_set_shadow_time(struct perf_event *event, 1927 struct perf_event_context *ctx, 1928 u64 tstamp) 1929 { 1930 /* 1931 * use the correct time source for the time snapshot 1932 * 1933 * We could get by without this by leveraging the 1934 * fact that to get to this function, the caller 1935 * has most likely already called update_context_time() 1936 * and update_cgrp_time_xx() and thus both timestamp 1937 * are identical (or very close). Given that tstamp is, 1938 * already adjusted for cgroup, we could say that: 1939 * tstamp - ctx->timestamp 1940 * is equivalent to 1941 * tstamp - cgrp->timestamp. 1942 * 1943 * Then, in perf_output_read(), the calculation would 1944 * work with no changes because: 1945 * - event is guaranteed scheduled in 1946 * - no scheduled out in between 1947 * - thus the timestamp would be the same 1948 * 1949 * But this is a bit hairy. 1950 * 1951 * So instead, we have an explicit cgroup call to remain 1952 * within the time time source all along. We believe it 1953 * is cleaner and simpler to understand. 1954 */ 1955 if (is_cgroup_event(event)) 1956 perf_cgroup_set_shadow_time(event, tstamp); 1957 else 1958 event->shadow_ctx_time = tstamp - ctx->timestamp; 1959 } 1960 1961 #define MAX_INTERRUPTS (~0ULL) 1962 1963 static void perf_log_throttle(struct perf_event *event, int enable); 1964 static void perf_log_itrace_start(struct perf_event *event); 1965 1966 static int 1967 event_sched_in(struct perf_event *event, 1968 struct perf_cpu_context *cpuctx, 1969 struct perf_event_context *ctx) 1970 { 1971 u64 tstamp = perf_event_time(event); 1972 int ret = 0; 1973 1974 lockdep_assert_held(&ctx->lock); 1975 1976 if (event->state <= PERF_EVENT_STATE_OFF) 1977 return 0; 1978 1979 WRITE_ONCE(event->oncpu, smp_processor_id()); 1980 /* 1981 * Order event::oncpu write to happen before the ACTIVE state 1982 * is visible. 1983 */ 1984 smp_wmb(); 1985 WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); 1986 1987 /* 1988 * Unthrottle events, since we scheduled we might have missed several 1989 * ticks already, also for a heavily scheduling task there is little 1990 * guarantee it'll get a tick in a timely manner. 1991 */ 1992 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { 1993 perf_log_throttle(event, 1); 1994 event->hw.interrupts = 0; 1995 } 1996 1997 /* 1998 * The new state must be visible before we turn it on in the hardware: 1999 */ 2000 smp_wmb(); 2001 2002 perf_pmu_disable(event->pmu); 2003 2004 perf_set_shadow_time(event, ctx, tstamp); 2005 2006 perf_log_itrace_start(event); 2007 2008 if (event->pmu->add(event, PERF_EF_START)) { 2009 event->state = PERF_EVENT_STATE_INACTIVE; 2010 event->oncpu = -1; 2011 ret = -EAGAIN; 2012 goto out; 2013 } 2014 2015 event->tstamp_running += tstamp - event->tstamp_stopped; 2016 2017 if (!is_software_event(event)) 2018 cpuctx->active_oncpu++; 2019 if (!ctx->nr_active++) 2020 perf_event_ctx_activate(ctx); 2021 if (event->attr.freq && event->attr.sample_freq) 2022 ctx->nr_freq++; 2023 2024 if (event->attr.exclusive) 2025 cpuctx->exclusive = 1; 2026 2027 out: 2028 perf_pmu_enable(event->pmu); 2029 2030 return ret; 2031 } 2032 2033 static int 2034 group_sched_in(struct perf_event *group_event, 2035 struct perf_cpu_context *cpuctx, 2036 struct perf_event_context *ctx) 2037 { 2038 struct perf_event *event, *partial_group = NULL; 2039 struct pmu *pmu = ctx->pmu; 2040 u64 now = ctx->time; 2041 bool simulate = false; 2042 2043 if (group_event->state == PERF_EVENT_STATE_OFF) 2044 return 0; 2045 2046 pmu->start_txn(pmu, PERF_PMU_TXN_ADD); 2047 2048 if (event_sched_in(group_event, cpuctx, ctx)) { 2049 pmu->cancel_txn(pmu); 2050 perf_mux_hrtimer_restart(cpuctx); 2051 return -EAGAIN; 2052 } 2053 2054 /* 2055 * Schedule in siblings as one group (if any): 2056 */ 2057 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 2058 if (event_sched_in(event, cpuctx, ctx)) { 2059 partial_group = event; 2060 goto group_error; 2061 } 2062 } 2063 2064 if (!pmu->commit_txn(pmu)) 2065 return 0; 2066 2067 group_error: 2068 /* 2069 * Groups can be scheduled in as one unit only, so undo any 2070 * partial group before returning: 2071 * The events up to the failed event are scheduled out normally, 2072 * tstamp_stopped will be updated. 2073 * 2074 * The failed events and the remaining siblings need to have 2075 * their timings updated as if they had gone thru event_sched_in() 2076 * and event_sched_out(). This is required to get consistent timings 2077 * across the group. This also takes care of the case where the group 2078 * could never be scheduled by ensuring tstamp_stopped is set to mark 2079 * the time the event was actually stopped, such that time delta 2080 * calculation in update_event_times() is correct. 2081 */ 2082 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 2083 if (event == partial_group) 2084 simulate = true; 2085 2086 if (simulate) { 2087 event->tstamp_running += now - event->tstamp_stopped; 2088 event->tstamp_stopped = now; 2089 } else { 2090 event_sched_out(event, cpuctx, ctx); 2091 } 2092 } 2093 event_sched_out(group_event, cpuctx, ctx); 2094 2095 pmu->cancel_txn(pmu); 2096 2097 perf_mux_hrtimer_restart(cpuctx); 2098 2099 return -EAGAIN; 2100 } 2101 2102 /* 2103 * Work out whether we can put this event group on the CPU now. 2104 */ 2105 static int group_can_go_on(struct perf_event *event, 2106 struct perf_cpu_context *cpuctx, 2107 int can_add_hw) 2108 { 2109 /* 2110 * Groups consisting entirely of software events can always go on. 2111 */ 2112 if (event->group_flags & PERF_GROUP_SOFTWARE) 2113 return 1; 2114 /* 2115 * If an exclusive group is already on, no other hardware 2116 * events can go on. 2117 */ 2118 if (cpuctx->exclusive) 2119 return 0; 2120 /* 2121 * If this group is exclusive and there are already 2122 * events on the CPU, it can't go on. 2123 */ 2124 if (event->attr.exclusive && cpuctx->active_oncpu) 2125 return 0; 2126 /* 2127 * Otherwise, try to add it if all previous groups were able 2128 * to go on. 2129 */ 2130 return can_add_hw; 2131 } 2132 2133 static void add_event_to_ctx(struct perf_event *event, 2134 struct perf_event_context *ctx) 2135 { 2136 u64 tstamp = perf_event_time(event); 2137 2138 list_add_event(event, ctx); 2139 perf_group_attach(event); 2140 event->tstamp_enabled = tstamp; 2141 event->tstamp_running = tstamp; 2142 event->tstamp_stopped = tstamp; 2143 } 2144 2145 static void ctx_sched_out(struct perf_event_context *ctx, 2146 struct perf_cpu_context *cpuctx, 2147 enum event_type_t event_type); 2148 static void 2149 ctx_sched_in(struct perf_event_context *ctx, 2150 struct perf_cpu_context *cpuctx, 2151 enum event_type_t event_type, 2152 struct task_struct *task); 2153 2154 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2155 struct perf_event_context *ctx) 2156 { 2157 if (!cpuctx->task_ctx) 2158 return; 2159 2160 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2161 return; 2162 2163 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2164 } 2165 2166 static void perf_event_sched_in(struct perf_cpu_context *cpuctx, 2167 struct perf_event_context *ctx, 2168 struct task_struct *task) 2169 { 2170 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); 2171 if (ctx) 2172 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2173 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); 2174 if (ctx) 2175 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); 2176 } 2177 2178 static void ctx_resched(struct perf_cpu_context *cpuctx, 2179 struct perf_event_context *task_ctx) 2180 { 2181 perf_pmu_disable(cpuctx->ctx.pmu); 2182 if (task_ctx) 2183 task_ctx_sched_out(cpuctx, task_ctx); 2184 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 2185 perf_event_sched_in(cpuctx, task_ctx, current); 2186 perf_pmu_enable(cpuctx->ctx.pmu); 2187 } 2188 2189 /* 2190 * Cross CPU call to install and enable a performance event 2191 * 2192 * Very similar to remote_function() + event_function() but cannot assume that 2193 * things like ctx->is_active and cpuctx->task_ctx are set. 2194 */ 2195 static int __perf_install_in_context(void *info) 2196 { 2197 struct perf_event *event = info; 2198 struct perf_event_context *ctx = event->ctx; 2199 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2200 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2201 bool activate = true; 2202 int ret = 0; 2203 2204 raw_spin_lock(&cpuctx->ctx.lock); 2205 if (ctx->task) { 2206 raw_spin_lock(&ctx->lock); 2207 task_ctx = ctx; 2208 2209 /* If we're on the wrong CPU, try again */ 2210 if (task_cpu(ctx->task) != smp_processor_id()) { 2211 ret = -ESRCH; 2212 goto unlock; 2213 } 2214 2215 /* 2216 * If we're on the right CPU, see if the task we target is 2217 * current, if not we don't have to activate the ctx, a future 2218 * context switch will do that for us. 2219 */ 2220 if (ctx->task != current) 2221 activate = false; 2222 else 2223 WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); 2224 2225 } else if (task_ctx) { 2226 raw_spin_lock(&task_ctx->lock); 2227 } 2228 2229 if (activate) { 2230 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 2231 add_event_to_ctx(event, ctx); 2232 ctx_resched(cpuctx, task_ctx); 2233 } else { 2234 add_event_to_ctx(event, ctx); 2235 } 2236 2237 unlock: 2238 perf_ctx_unlock(cpuctx, task_ctx); 2239 2240 return ret; 2241 } 2242 2243 /* 2244 * Attach a performance event to a context. 2245 * 2246 * Very similar to event_function_call, see comment there. 2247 */ 2248 static void 2249 perf_install_in_context(struct perf_event_context *ctx, 2250 struct perf_event *event, 2251 int cpu) 2252 { 2253 struct task_struct *task = READ_ONCE(ctx->task); 2254 2255 lockdep_assert_held(&ctx->mutex); 2256 2257 if (event->cpu != -1) 2258 event->cpu = cpu; 2259 2260 /* 2261 * Ensures that if we can observe event->ctx, both the event and ctx 2262 * will be 'complete'. See perf_iterate_sb_cpu(). 2263 */ 2264 smp_store_release(&event->ctx, ctx); 2265 2266 if (!task) { 2267 cpu_function_call(cpu, __perf_install_in_context, event); 2268 return; 2269 } 2270 2271 /* 2272 * Should not happen, we validate the ctx is still alive before calling. 2273 */ 2274 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) 2275 return; 2276 2277 /* 2278 * Installing events is tricky because we cannot rely on ctx->is_active 2279 * to be set in case this is the nr_events 0 -> 1 transition. 2280 */ 2281 again: 2282 /* 2283 * Cannot use task_function_call() because we need to run on the task's 2284 * CPU regardless of whether its current or not. 2285 */ 2286 if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) 2287 return; 2288 2289 raw_spin_lock_irq(&ctx->lock); 2290 task = ctx->task; 2291 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { 2292 /* 2293 * Cannot happen because we already checked above (which also 2294 * cannot happen), and we hold ctx->mutex, which serializes us 2295 * against perf_event_exit_task_context(). 2296 */ 2297 raw_spin_unlock_irq(&ctx->lock); 2298 return; 2299 } 2300 raw_spin_unlock_irq(&ctx->lock); 2301 /* 2302 * Since !ctx->is_active doesn't mean anything, we must IPI 2303 * unconditionally. 2304 */ 2305 goto again; 2306 } 2307 2308 /* 2309 * Put a event into inactive state and update time fields. 2310 * Enabling the leader of a group effectively enables all 2311 * the group members that aren't explicitly disabled, so we 2312 * have to update their ->tstamp_enabled also. 2313 * Note: this works for group members as well as group leaders 2314 * since the non-leader members' sibling_lists will be empty. 2315 */ 2316 static void __perf_event_mark_enabled(struct perf_event *event) 2317 { 2318 struct perf_event *sub; 2319 u64 tstamp = perf_event_time(event); 2320 2321 event->state = PERF_EVENT_STATE_INACTIVE; 2322 event->tstamp_enabled = tstamp - event->total_time_enabled; 2323 list_for_each_entry(sub, &event->sibling_list, group_entry) { 2324 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 2325 sub->tstamp_enabled = tstamp - sub->total_time_enabled; 2326 } 2327 } 2328 2329 /* 2330 * Cross CPU call to enable a performance event 2331 */ 2332 static void __perf_event_enable(struct perf_event *event, 2333 struct perf_cpu_context *cpuctx, 2334 struct perf_event_context *ctx, 2335 void *info) 2336 { 2337 struct perf_event *leader = event->group_leader; 2338 struct perf_event_context *task_ctx; 2339 2340 if (event->state >= PERF_EVENT_STATE_INACTIVE || 2341 event->state <= PERF_EVENT_STATE_ERROR) 2342 return; 2343 2344 if (ctx->is_active) 2345 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 2346 2347 __perf_event_mark_enabled(event); 2348 2349 if (!ctx->is_active) 2350 return; 2351 2352 if (!event_filter_match(event)) { 2353 if (is_cgroup_event(event)) 2354 perf_cgroup_defer_enabled(event); 2355 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 2356 return; 2357 } 2358 2359 /* 2360 * If the event is in a group and isn't the group leader, 2361 * then don't put it on unless the group is on. 2362 */ 2363 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { 2364 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 2365 return; 2366 } 2367 2368 task_ctx = cpuctx->task_ctx; 2369 if (ctx->task) 2370 WARN_ON_ONCE(task_ctx != ctx); 2371 2372 ctx_resched(cpuctx, task_ctx); 2373 } 2374 2375 /* 2376 * Enable a event. 2377 * 2378 * If event->ctx is a cloned context, callers must make sure that 2379 * every task struct that event->ctx->task could possibly point to 2380 * remains valid. This condition is satisfied when called through 2381 * perf_event_for_each_child or perf_event_for_each as described 2382 * for perf_event_disable. 2383 */ 2384 static void _perf_event_enable(struct perf_event *event) 2385 { 2386 struct perf_event_context *ctx = event->ctx; 2387 2388 raw_spin_lock_irq(&ctx->lock); 2389 if (event->state >= PERF_EVENT_STATE_INACTIVE || 2390 event->state < PERF_EVENT_STATE_ERROR) { 2391 raw_spin_unlock_irq(&ctx->lock); 2392 return; 2393 } 2394 2395 /* 2396 * If the event is in error state, clear that first. 2397 * 2398 * That way, if we see the event in error state below, we know that it 2399 * has gone back into error state, as distinct from the task having 2400 * been scheduled away before the cross-call arrived. 2401 */ 2402 if (event->state == PERF_EVENT_STATE_ERROR) 2403 event->state = PERF_EVENT_STATE_OFF; 2404 raw_spin_unlock_irq(&ctx->lock); 2405 2406 event_function_call(event, __perf_event_enable, NULL); 2407 } 2408 2409 /* 2410 * See perf_event_disable(); 2411 */ 2412 void perf_event_enable(struct perf_event *event) 2413 { 2414 struct perf_event_context *ctx; 2415 2416 ctx = perf_event_ctx_lock(event); 2417 _perf_event_enable(event); 2418 perf_event_ctx_unlock(event, ctx); 2419 } 2420 EXPORT_SYMBOL_GPL(perf_event_enable); 2421 2422 struct stop_event_data { 2423 struct perf_event *event; 2424 unsigned int restart; 2425 }; 2426 2427 static int __perf_event_stop(void *info) 2428 { 2429 struct stop_event_data *sd = info; 2430 struct perf_event *event = sd->event; 2431 2432 /* if it's already INACTIVE, do nothing */ 2433 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) 2434 return 0; 2435 2436 /* matches smp_wmb() in event_sched_in() */ 2437 smp_rmb(); 2438 2439 /* 2440 * There is a window with interrupts enabled before we get here, 2441 * so we need to check again lest we try to stop another CPU's event. 2442 */ 2443 if (READ_ONCE(event->oncpu) != smp_processor_id()) 2444 return -EAGAIN; 2445 2446 event->pmu->stop(event, PERF_EF_UPDATE); 2447 2448 /* 2449 * May race with the actual stop (through perf_pmu_output_stop()), 2450 * but it is only used for events with AUX ring buffer, and such 2451 * events will refuse to restart because of rb::aux_mmap_count==0, 2452 * see comments in perf_aux_output_begin(). 2453 * 2454 * Since this is happening on a event-local CPU, no trace is lost 2455 * while restarting. 2456 */ 2457 if (sd->restart) 2458 event->pmu->start(event, PERF_EF_START); 2459 2460 return 0; 2461 } 2462 2463 static int perf_event_restart(struct perf_event *event) 2464 { 2465 struct stop_event_data sd = { 2466 .event = event, 2467 .restart = 1, 2468 }; 2469 int ret = 0; 2470 2471 do { 2472 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) 2473 return 0; 2474 2475 /* matches smp_wmb() in event_sched_in() */ 2476 smp_rmb(); 2477 2478 /* 2479 * We only want to restart ACTIVE events, so if the event goes 2480 * inactive here (event->oncpu==-1), there's nothing more to do; 2481 * fall through with ret==-ENXIO. 2482 */ 2483 ret = cpu_function_call(READ_ONCE(event->oncpu), 2484 __perf_event_stop, &sd); 2485 } while (ret == -EAGAIN); 2486 2487 return ret; 2488 } 2489 2490 /* 2491 * In order to contain the amount of racy and tricky in the address filter 2492 * configuration management, it is a two part process: 2493 * 2494 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, 2495 * we update the addresses of corresponding vmas in 2496 * event::addr_filters_offs array and bump the event::addr_filters_gen; 2497 * (p2) when an event is scheduled in (pmu::add), it calls 2498 * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() 2499 * if the generation has changed since the previous call. 2500 * 2501 * If (p1) happens while the event is active, we restart it to force (p2). 2502 * 2503 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on 2504 * pre-existing mappings, called once when new filters arrive via SET_FILTER 2505 * ioctl; 2506 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly 2507 * registered mapping, called for every new mmap(), with mm::mmap_sem down 2508 * for reading; 2509 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process 2510 * of exec. 2511 */ 2512 void perf_event_addr_filters_sync(struct perf_event *event) 2513 { 2514 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 2515 2516 if (!has_addr_filter(event)) 2517 return; 2518 2519 raw_spin_lock(&ifh->lock); 2520 if (event->addr_filters_gen != event->hw.addr_filters_gen) { 2521 event->pmu->addr_filters_sync(event); 2522 event->hw.addr_filters_gen = event->addr_filters_gen; 2523 } 2524 raw_spin_unlock(&ifh->lock); 2525 } 2526 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); 2527 2528 static int _perf_event_refresh(struct perf_event *event, int refresh) 2529 { 2530 /* 2531 * not supported on inherited events 2532 */ 2533 if (event->attr.inherit || !is_sampling_event(event)) 2534 return -EINVAL; 2535 2536 atomic_add(refresh, &event->event_limit); 2537 _perf_event_enable(event); 2538 2539 return 0; 2540 } 2541 2542 /* 2543 * See perf_event_disable() 2544 */ 2545 int perf_event_refresh(struct perf_event *event, int refresh) 2546 { 2547 struct perf_event_context *ctx; 2548 int ret; 2549 2550 ctx = perf_event_ctx_lock(event); 2551 ret = _perf_event_refresh(event, refresh); 2552 perf_event_ctx_unlock(event, ctx); 2553 2554 return ret; 2555 } 2556 EXPORT_SYMBOL_GPL(perf_event_refresh); 2557 2558 static void ctx_sched_out(struct perf_event_context *ctx, 2559 struct perf_cpu_context *cpuctx, 2560 enum event_type_t event_type) 2561 { 2562 int is_active = ctx->is_active; 2563 struct perf_event *event; 2564 2565 lockdep_assert_held(&ctx->lock); 2566 2567 if (likely(!ctx->nr_events)) { 2568 /* 2569 * See __perf_remove_from_context(). 2570 */ 2571 WARN_ON_ONCE(ctx->is_active); 2572 if (ctx->task) 2573 WARN_ON_ONCE(cpuctx->task_ctx); 2574 return; 2575 } 2576 2577 ctx->is_active &= ~event_type; 2578 if (!(ctx->is_active & EVENT_ALL)) 2579 ctx->is_active = 0; 2580 2581 if (ctx->task) { 2582 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2583 if (!ctx->is_active) 2584 cpuctx->task_ctx = NULL; 2585 } 2586 2587 /* 2588 * Always update time if it was set; not only when it changes. 2589 * Otherwise we can 'forget' to update time for any but the last 2590 * context we sched out. For example: 2591 * 2592 * ctx_sched_out(.event_type = EVENT_FLEXIBLE) 2593 * ctx_sched_out(.event_type = EVENT_PINNED) 2594 * 2595 * would only update time for the pinned events. 2596 */ 2597 if (is_active & EVENT_TIME) { 2598 /* update (and stop) ctx time */ 2599 update_context_time(ctx); 2600 update_cgrp_time_from_cpuctx(cpuctx); 2601 } 2602 2603 is_active ^= ctx->is_active; /* changed bits */ 2604 2605 if (!ctx->nr_active || !(is_active & EVENT_ALL)) 2606 return; 2607 2608 perf_pmu_disable(ctx->pmu); 2609 if (is_active & EVENT_PINNED) { 2610 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 2611 group_sched_out(event, cpuctx, ctx); 2612 } 2613 2614 if (is_active & EVENT_FLEXIBLE) { 2615 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 2616 group_sched_out(event, cpuctx, ctx); 2617 } 2618 perf_pmu_enable(ctx->pmu); 2619 } 2620 2621 /* 2622 * Test whether two contexts are equivalent, i.e. whether they have both been 2623 * cloned from the same version of the same context. 2624 * 2625 * Equivalence is measured using a generation number in the context that is 2626 * incremented on each modification to it; see unclone_ctx(), list_add_event() 2627 * and list_del_event(). 2628 */ 2629 static int context_equiv(struct perf_event_context *ctx1, 2630 struct perf_event_context *ctx2) 2631 { 2632 lockdep_assert_held(&ctx1->lock); 2633 lockdep_assert_held(&ctx2->lock); 2634 2635 /* Pinning disables the swap optimization */ 2636 if (ctx1->pin_count || ctx2->pin_count) 2637 return 0; 2638 2639 /* If ctx1 is the parent of ctx2 */ 2640 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) 2641 return 1; 2642 2643 /* If ctx2 is the parent of ctx1 */ 2644 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) 2645 return 1; 2646 2647 /* 2648 * If ctx1 and ctx2 have the same parent; we flatten the parent 2649 * hierarchy, see perf_event_init_context(). 2650 */ 2651 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && 2652 ctx1->parent_gen == ctx2->parent_gen) 2653 return 1; 2654 2655 /* Unmatched */ 2656 return 0; 2657 } 2658 2659 static void __perf_event_sync_stat(struct perf_event *event, 2660 struct perf_event *next_event) 2661 { 2662 u64 value; 2663 2664 if (!event->attr.inherit_stat) 2665 return; 2666 2667 /* 2668 * Update the event value, we cannot use perf_event_read() 2669 * because we're in the middle of a context switch and have IRQs 2670 * disabled, which upsets smp_call_function_single(), however 2671 * we know the event must be on the current CPU, therefore we 2672 * don't need to use it. 2673 */ 2674 switch (event->state) { 2675 case PERF_EVENT_STATE_ACTIVE: 2676 event->pmu->read(event); 2677 /* fall-through */ 2678 2679 case PERF_EVENT_STATE_INACTIVE: 2680 update_event_times(event); 2681 break; 2682 2683 default: 2684 break; 2685 } 2686 2687 /* 2688 * In order to keep per-task stats reliable we need to flip the event 2689 * values when we flip the contexts. 2690 */ 2691 value = local64_read(&next_event->count); 2692 value = local64_xchg(&event->count, value); 2693 local64_set(&next_event->count, value); 2694 2695 swap(event->total_time_enabled, next_event->total_time_enabled); 2696 swap(event->total_time_running, next_event->total_time_running); 2697 2698 /* 2699 * Since we swizzled the values, update the user visible data too. 2700 */ 2701 perf_event_update_userpage(event); 2702 perf_event_update_userpage(next_event); 2703 } 2704 2705 static void perf_event_sync_stat(struct perf_event_context *ctx, 2706 struct perf_event_context *next_ctx) 2707 { 2708 struct perf_event *event, *next_event; 2709 2710 if (!ctx->nr_stat) 2711 return; 2712 2713 update_context_time(ctx); 2714 2715 event = list_first_entry(&ctx->event_list, 2716 struct perf_event, event_entry); 2717 2718 next_event = list_first_entry(&next_ctx->event_list, 2719 struct perf_event, event_entry); 2720 2721 while (&event->event_entry != &ctx->event_list && 2722 &next_event->event_entry != &next_ctx->event_list) { 2723 2724 __perf_event_sync_stat(event, next_event); 2725 2726 event = list_next_entry(event, event_entry); 2727 next_event = list_next_entry(next_event, event_entry); 2728 } 2729 } 2730 2731 static void perf_event_context_sched_out(struct task_struct *task, int ctxn, 2732 struct task_struct *next) 2733 { 2734 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 2735 struct perf_event_context *next_ctx; 2736 struct perf_event_context *parent, *next_parent; 2737 struct perf_cpu_context *cpuctx; 2738 int do_switch = 1; 2739 2740 if (likely(!ctx)) 2741 return; 2742 2743 cpuctx = __get_cpu_context(ctx); 2744 if (!cpuctx->task_ctx) 2745 return; 2746 2747 rcu_read_lock(); 2748 next_ctx = next->perf_event_ctxp[ctxn]; 2749 if (!next_ctx) 2750 goto unlock; 2751 2752 parent = rcu_dereference(ctx->parent_ctx); 2753 next_parent = rcu_dereference(next_ctx->parent_ctx); 2754 2755 /* If neither context have a parent context; they cannot be clones. */ 2756 if (!parent && !next_parent) 2757 goto unlock; 2758 2759 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2760 /* 2761 * Looks like the two contexts are clones, so we might be 2762 * able to optimize the context switch. We lock both 2763 * contexts and check that they are clones under the 2764 * lock (including re-checking that neither has been 2765 * uncloned in the meantime). It doesn't matter which 2766 * order we take the locks because no other cpu could 2767 * be trying to lock both of these tasks. 2768 */ 2769 raw_spin_lock(&ctx->lock); 2770 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 2771 if (context_equiv(ctx, next_ctx)) { 2772 WRITE_ONCE(ctx->task, next); 2773 WRITE_ONCE(next_ctx->task, task); 2774 2775 swap(ctx->task_ctx_data, next_ctx->task_ctx_data); 2776 2777 /* 2778 * RCU_INIT_POINTER here is safe because we've not 2779 * modified the ctx and the above modification of 2780 * ctx->task and ctx->task_ctx_data are immaterial 2781 * since those values are always verified under 2782 * ctx->lock which we're now holding. 2783 */ 2784 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); 2785 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); 2786 2787 do_switch = 0; 2788 2789 perf_event_sync_stat(ctx, next_ctx); 2790 } 2791 raw_spin_unlock(&next_ctx->lock); 2792 raw_spin_unlock(&ctx->lock); 2793 } 2794 unlock: 2795 rcu_read_unlock(); 2796 2797 if (do_switch) { 2798 raw_spin_lock(&ctx->lock); 2799 task_ctx_sched_out(cpuctx, ctx); 2800 raw_spin_unlock(&ctx->lock); 2801 } 2802 } 2803 2804 void perf_sched_cb_dec(struct pmu *pmu) 2805 { 2806 this_cpu_dec(perf_sched_cb_usages); 2807 } 2808 2809 void perf_sched_cb_inc(struct pmu *pmu) 2810 { 2811 this_cpu_inc(perf_sched_cb_usages); 2812 } 2813 2814 /* 2815 * This function provides the context switch callback to the lower code 2816 * layer. It is invoked ONLY when the context switch callback is enabled. 2817 */ 2818 static void perf_pmu_sched_task(struct task_struct *prev, 2819 struct task_struct *next, 2820 bool sched_in) 2821 { 2822 struct perf_cpu_context *cpuctx; 2823 struct pmu *pmu; 2824 unsigned long flags; 2825 2826 if (prev == next) 2827 return; 2828 2829 local_irq_save(flags); 2830 2831 rcu_read_lock(); 2832 2833 list_for_each_entry_rcu(pmu, &pmus, entry) { 2834 if (pmu->sched_task) { 2835 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 2836 2837 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2838 2839 perf_pmu_disable(pmu); 2840 2841 pmu->sched_task(cpuctx->task_ctx, sched_in); 2842 2843 perf_pmu_enable(pmu); 2844 2845 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 2846 } 2847 } 2848 2849 rcu_read_unlock(); 2850 2851 local_irq_restore(flags); 2852 } 2853 2854 static void perf_event_switch(struct task_struct *task, 2855 struct task_struct *next_prev, bool sched_in); 2856 2857 #define for_each_task_context_nr(ctxn) \ 2858 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) 2859 2860 /* 2861 * Called from scheduler to remove the events of the current task, 2862 * with interrupts disabled. 2863 * 2864 * We stop each event and update the event value in event->count. 2865 * 2866 * This does not protect us against NMI, but disable() 2867 * sets the disabled bit in the control field of event _before_ 2868 * accessing the event control register. If a NMI hits, then it will 2869 * not restart the event. 2870 */ 2871 void __perf_event_task_sched_out(struct task_struct *task, 2872 struct task_struct *next) 2873 { 2874 int ctxn; 2875 2876 if (__this_cpu_read(perf_sched_cb_usages)) 2877 perf_pmu_sched_task(task, next, false); 2878 2879 if (atomic_read(&nr_switch_events)) 2880 perf_event_switch(task, next, false); 2881 2882 for_each_task_context_nr(ctxn) 2883 perf_event_context_sched_out(task, ctxn, next); 2884 2885 /* 2886 * if cgroup events exist on this CPU, then we need 2887 * to check if we have to switch out PMU state. 2888 * cgroup event are system-wide mode only 2889 */ 2890 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2891 perf_cgroup_sched_out(task, next); 2892 } 2893 2894 /* 2895 * Called with IRQs disabled 2896 */ 2897 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 2898 enum event_type_t event_type) 2899 { 2900 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); 2901 } 2902 2903 static void 2904 ctx_pinned_sched_in(struct perf_event_context *ctx, 2905 struct perf_cpu_context *cpuctx) 2906 { 2907 struct perf_event *event; 2908 2909 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2910 if (event->state <= PERF_EVENT_STATE_OFF) 2911 continue; 2912 if (!event_filter_match(event)) 2913 continue; 2914 2915 /* may need to reset tstamp_enabled */ 2916 if (is_cgroup_event(event)) 2917 perf_cgroup_mark_enabled(event, ctx); 2918 2919 if (group_can_go_on(event, cpuctx, 1)) 2920 group_sched_in(event, cpuctx, ctx); 2921 2922 /* 2923 * If this pinned group hasn't been scheduled, 2924 * put it in error state. 2925 */ 2926 if (event->state == PERF_EVENT_STATE_INACTIVE) { 2927 update_group_times(event); 2928 event->state = PERF_EVENT_STATE_ERROR; 2929 } 2930 } 2931 } 2932 2933 static void 2934 ctx_flexible_sched_in(struct perf_event_context *ctx, 2935 struct perf_cpu_context *cpuctx) 2936 { 2937 struct perf_event *event; 2938 int can_add_hw = 1; 2939 2940 list_for_each_entry(event, &ctx->flexible_groups, group_entry) { 2941 /* Ignore events in OFF or ERROR state */ 2942 if (event->state <= PERF_EVENT_STATE_OFF) 2943 continue; 2944 /* 2945 * Listen to the 'cpu' scheduling filter constraint 2946 * of events: 2947 */ 2948 if (!event_filter_match(event)) 2949 continue; 2950 2951 /* may need to reset tstamp_enabled */ 2952 if (is_cgroup_event(event)) 2953 perf_cgroup_mark_enabled(event, ctx); 2954 2955 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2956 if (group_sched_in(event, cpuctx, ctx)) 2957 can_add_hw = 0; 2958 } 2959 } 2960 } 2961 2962 static void 2963 ctx_sched_in(struct perf_event_context *ctx, 2964 struct perf_cpu_context *cpuctx, 2965 enum event_type_t event_type, 2966 struct task_struct *task) 2967 { 2968 int is_active = ctx->is_active; 2969 u64 now; 2970 2971 lockdep_assert_held(&ctx->lock); 2972 2973 if (likely(!ctx->nr_events)) 2974 return; 2975 2976 ctx->is_active |= (event_type | EVENT_TIME); 2977 if (ctx->task) { 2978 if (!is_active) 2979 cpuctx->task_ctx = ctx; 2980 else 2981 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2982 } 2983 2984 is_active ^= ctx->is_active; /* changed bits */ 2985 2986 if (is_active & EVENT_TIME) { 2987 /* start ctx time */ 2988 now = perf_clock(); 2989 ctx->timestamp = now; 2990 perf_cgroup_set_timestamp(task, ctx); 2991 } 2992 2993 /* 2994 * First go through the list and put on any pinned groups 2995 * in order to give them the best chance of going on. 2996 */ 2997 if (is_active & EVENT_PINNED) 2998 ctx_pinned_sched_in(ctx, cpuctx); 2999 3000 /* Then walk through the lower prio flexible groups */ 3001 if (is_active & EVENT_FLEXIBLE) 3002 ctx_flexible_sched_in(ctx, cpuctx); 3003 } 3004 3005 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 3006 enum event_type_t event_type, 3007 struct task_struct *task) 3008 { 3009 struct perf_event_context *ctx = &cpuctx->ctx; 3010 3011 ctx_sched_in(ctx, cpuctx, event_type, task); 3012 } 3013 3014 static void perf_event_context_sched_in(struct perf_event_context *ctx, 3015 struct task_struct *task) 3016 { 3017 struct perf_cpu_context *cpuctx; 3018 3019 cpuctx = __get_cpu_context(ctx); 3020 if (cpuctx->task_ctx == ctx) 3021 return; 3022 3023 perf_ctx_lock(cpuctx, ctx); 3024 perf_pmu_disable(ctx->pmu); 3025 /* 3026 * We want to keep the following priority order: 3027 * cpu pinned (that don't need to move), task pinned, 3028 * cpu flexible, task flexible. 3029 */ 3030 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 3031 perf_event_sched_in(cpuctx, ctx, task); 3032 perf_pmu_enable(ctx->pmu); 3033 perf_ctx_unlock(cpuctx, ctx); 3034 } 3035 3036 /* 3037 * Called from scheduler to add the events of the current task 3038 * with interrupts disabled. 3039 * 3040 * We restore the event value and then enable it. 3041 * 3042 * This does not protect us against NMI, but enable() 3043 * sets the enabled bit in the control field of event _before_ 3044 * accessing the event control register. If a NMI hits, then it will 3045 * keep the event running. 3046 */ 3047 void __perf_event_task_sched_in(struct task_struct *prev, 3048 struct task_struct *task) 3049 { 3050 struct perf_event_context *ctx; 3051 int ctxn; 3052 3053 /* 3054 * If cgroup events exist on this CPU, then we need to check if we have 3055 * to switch in PMU state; cgroup event are system-wide mode only. 3056 * 3057 * Since cgroup events are CPU events, we must schedule these in before 3058 * we schedule in the task events. 3059 */ 3060 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 3061 perf_cgroup_sched_in(prev, task); 3062 3063 for_each_task_context_nr(ctxn) { 3064 ctx = task->perf_event_ctxp[ctxn]; 3065 if (likely(!ctx)) 3066 continue; 3067 3068 perf_event_context_sched_in(ctx, task); 3069 } 3070 3071 if (atomic_read(&nr_switch_events)) 3072 perf_event_switch(task, prev, true); 3073 3074 if (__this_cpu_read(perf_sched_cb_usages)) 3075 perf_pmu_sched_task(prev, task, true); 3076 } 3077 3078 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 3079 { 3080 u64 frequency = event->attr.sample_freq; 3081 u64 sec = NSEC_PER_SEC; 3082 u64 divisor, dividend; 3083 3084 int count_fls, nsec_fls, frequency_fls, sec_fls; 3085 3086 count_fls = fls64(count); 3087 nsec_fls = fls64(nsec); 3088 frequency_fls = fls64(frequency); 3089 sec_fls = 30; 3090 3091 /* 3092 * We got @count in @nsec, with a target of sample_freq HZ 3093 * the target period becomes: 3094 * 3095 * @count * 10^9 3096 * period = ------------------- 3097 * @nsec * sample_freq 3098 * 3099 */ 3100 3101 /* 3102 * Reduce accuracy by one bit such that @a and @b converge 3103 * to a similar magnitude. 3104 */ 3105 #define REDUCE_FLS(a, b) \ 3106 do { \ 3107 if (a##_fls > b##_fls) { \ 3108 a >>= 1; \ 3109 a##_fls--; \ 3110 } else { \ 3111 b >>= 1; \ 3112 b##_fls--; \ 3113 } \ 3114 } while (0) 3115 3116 /* 3117 * Reduce accuracy until either term fits in a u64, then proceed with 3118 * the other, so that finally we can do a u64/u64 division. 3119 */ 3120 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { 3121 REDUCE_FLS(nsec, frequency); 3122 REDUCE_FLS(sec, count); 3123 } 3124 3125 if (count_fls + sec_fls > 64) { 3126 divisor = nsec * frequency; 3127 3128 while (count_fls + sec_fls > 64) { 3129 REDUCE_FLS(count, sec); 3130 divisor >>= 1; 3131 } 3132 3133 dividend = count * sec; 3134 } else { 3135 dividend = count * sec; 3136 3137 while (nsec_fls + frequency_fls > 64) { 3138 REDUCE_FLS(nsec, frequency); 3139 dividend >>= 1; 3140 } 3141 3142 divisor = nsec * frequency; 3143 } 3144 3145 if (!divisor) 3146 return dividend; 3147 3148 return div64_u64(dividend, divisor); 3149 } 3150 3151 static DEFINE_PER_CPU(int, perf_throttled_count); 3152 static DEFINE_PER_CPU(u64, perf_throttled_seq); 3153 3154 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) 3155 { 3156 struct hw_perf_event *hwc = &event->hw; 3157 s64 period, sample_period; 3158 s64 delta; 3159 3160 period = perf_calculate_period(event, nsec, count); 3161 3162 delta = (s64)(period - hwc->sample_period); 3163 delta = (delta + 7) / 8; /* low pass filter */ 3164 3165 sample_period = hwc->sample_period + delta; 3166 3167 if (!sample_period) 3168 sample_period = 1; 3169 3170 hwc->sample_period = sample_period; 3171 3172 if (local64_read(&hwc->period_left) > 8*sample_period) { 3173 if (disable) 3174 event->pmu->stop(event, PERF_EF_UPDATE); 3175 3176 local64_set(&hwc->period_left, 0); 3177 3178 if (disable) 3179 event->pmu->start(event, PERF_EF_RELOAD); 3180 } 3181 } 3182 3183 /* 3184 * combine freq adjustment with unthrottling to avoid two passes over the 3185 * events. At the same time, make sure, having freq events does not change 3186 * the rate of unthrottling as that would introduce bias. 3187 */ 3188 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, 3189 int needs_unthr) 3190 { 3191 struct perf_event *event; 3192 struct hw_perf_event *hwc; 3193 u64 now, period = TICK_NSEC; 3194 s64 delta; 3195 3196 /* 3197 * only need to iterate over all events iff: 3198 * - context have events in frequency mode (needs freq adjust) 3199 * - there are events to unthrottle on this cpu 3200 */ 3201 if (!(ctx->nr_freq || needs_unthr)) 3202 return; 3203 3204 raw_spin_lock(&ctx->lock); 3205 perf_pmu_disable(ctx->pmu); 3206 3207 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3208 if (event->state != PERF_EVENT_STATE_ACTIVE) 3209 continue; 3210 3211 if (!event_filter_match(event)) 3212 continue; 3213 3214 perf_pmu_disable(event->pmu); 3215 3216 hwc = &event->hw; 3217 3218 if (hwc->interrupts == MAX_INTERRUPTS) { 3219 hwc->interrupts = 0; 3220 perf_log_throttle(event, 1); 3221 event->pmu->start(event, 0); 3222 } 3223 3224 if (!event->attr.freq || !event->attr.sample_freq) 3225 goto next; 3226 3227 /* 3228 * stop the event and update event->count 3229 */ 3230 event->pmu->stop(event, PERF_EF_UPDATE); 3231 3232 now = local64_read(&event->count); 3233 delta = now - hwc->freq_count_stamp; 3234 hwc->freq_count_stamp = now; 3235 3236 /* 3237 * restart the event 3238 * reload only if value has changed 3239 * we have stopped the event so tell that 3240 * to perf_adjust_period() to avoid stopping it 3241 * twice. 3242 */ 3243 if (delta > 0) 3244 perf_adjust_period(event, period, delta, false); 3245 3246 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); 3247 next: 3248 perf_pmu_enable(event->pmu); 3249 } 3250 3251 perf_pmu_enable(ctx->pmu); 3252 raw_spin_unlock(&ctx->lock); 3253 } 3254 3255 /* 3256 * Round-robin a context's events: 3257 */ 3258 static void rotate_ctx(struct perf_event_context *ctx) 3259 { 3260 /* 3261 * Rotate the first entry last of non-pinned groups. Rotation might be 3262 * disabled by the inheritance code. 3263 */ 3264 if (!ctx->rotate_disable) 3265 list_rotate_left(&ctx->flexible_groups); 3266 } 3267 3268 static int perf_rotate_context(struct perf_cpu_context *cpuctx) 3269 { 3270 struct perf_event_context *ctx = NULL; 3271 int rotate = 0; 3272 3273 if (cpuctx->ctx.nr_events) { 3274 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 3275 rotate = 1; 3276 } 3277 3278 ctx = cpuctx->task_ctx; 3279 if (ctx && ctx->nr_events) { 3280 if (ctx->nr_events != ctx->nr_active) 3281 rotate = 1; 3282 } 3283 3284 if (!rotate) 3285 goto done; 3286 3287 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 3288 perf_pmu_disable(cpuctx->ctx.pmu); 3289 3290 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 3291 if (ctx) 3292 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 3293 3294 rotate_ctx(&cpuctx->ctx); 3295 if (ctx) 3296 rotate_ctx(ctx); 3297 3298 perf_event_sched_in(cpuctx, ctx, current); 3299 3300 perf_pmu_enable(cpuctx->ctx.pmu); 3301 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3302 done: 3303 3304 return rotate; 3305 } 3306 3307 void perf_event_task_tick(void) 3308 { 3309 struct list_head *head = this_cpu_ptr(&active_ctx_list); 3310 struct perf_event_context *ctx, *tmp; 3311 int throttled; 3312 3313 WARN_ON(!irqs_disabled()); 3314 3315 __this_cpu_inc(perf_throttled_seq); 3316 throttled = __this_cpu_xchg(perf_throttled_count, 0); 3317 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); 3318 3319 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) 3320 perf_adjust_freq_unthr_context(ctx, throttled); 3321 } 3322 3323 static int event_enable_on_exec(struct perf_event *event, 3324 struct perf_event_context *ctx) 3325 { 3326 if (!event->attr.enable_on_exec) 3327 return 0; 3328 3329 event->attr.enable_on_exec = 0; 3330 if (event->state >= PERF_EVENT_STATE_INACTIVE) 3331 return 0; 3332 3333 __perf_event_mark_enabled(event); 3334 3335 return 1; 3336 } 3337 3338 /* 3339 * Enable all of a task's events that have been marked enable-on-exec. 3340 * This expects task == current. 3341 */ 3342 static void perf_event_enable_on_exec(int ctxn) 3343 { 3344 struct perf_event_context *ctx, *clone_ctx = NULL; 3345 struct perf_cpu_context *cpuctx; 3346 struct perf_event *event; 3347 unsigned long flags; 3348 int enabled = 0; 3349 3350 local_irq_save(flags); 3351 ctx = current->perf_event_ctxp[ctxn]; 3352 if (!ctx || !ctx->nr_events) 3353 goto out; 3354 3355 cpuctx = __get_cpu_context(ctx); 3356 perf_ctx_lock(cpuctx, ctx); 3357 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 3358 list_for_each_entry(event, &ctx->event_list, event_entry) 3359 enabled |= event_enable_on_exec(event, ctx); 3360 3361 /* 3362 * Unclone and reschedule this context if we enabled any event. 3363 */ 3364 if (enabled) { 3365 clone_ctx = unclone_ctx(ctx); 3366 ctx_resched(cpuctx, ctx); 3367 } 3368 perf_ctx_unlock(cpuctx, ctx); 3369 3370 out: 3371 local_irq_restore(flags); 3372 3373 if (clone_ctx) 3374 put_ctx(clone_ctx); 3375 } 3376 3377 struct perf_read_data { 3378 struct perf_event *event; 3379 bool group; 3380 int ret; 3381 }; 3382 3383 /* 3384 * Cross CPU call to read the hardware event 3385 */ 3386 static void __perf_event_read(void *info) 3387 { 3388 struct perf_read_data *data = info; 3389 struct perf_event *sub, *event = data->event; 3390 struct perf_event_context *ctx = event->ctx; 3391 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 3392 struct pmu *pmu = event->pmu; 3393 3394 /* 3395 * If this is a task context, we need to check whether it is 3396 * the current task context of this cpu. If not it has been 3397 * scheduled out before the smp call arrived. In that case 3398 * event->count would have been updated to a recent sample 3399 * when the event was scheduled out. 3400 */ 3401 if (ctx->task && cpuctx->task_ctx != ctx) 3402 return; 3403 3404 raw_spin_lock(&ctx->lock); 3405 if (ctx->is_active) { 3406 update_context_time(ctx); 3407 update_cgrp_time_from_event(event); 3408 } 3409 3410 update_event_times(event); 3411 if (event->state != PERF_EVENT_STATE_ACTIVE) 3412 goto unlock; 3413 3414 if (!data->group) { 3415 pmu->read(event); 3416 data->ret = 0; 3417 goto unlock; 3418 } 3419 3420 pmu->start_txn(pmu, PERF_PMU_TXN_READ); 3421 3422 pmu->read(event); 3423 3424 list_for_each_entry(sub, &event->sibling_list, group_entry) { 3425 update_event_times(sub); 3426 if (sub->state == PERF_EVENT_STATE_ACTIVE) { 3427 /* 3428 * Use sibling's PMU rather than @event's since 3429 * sibling could be on different (eg: software) PMU. 3430 */ 3431 sub->pmu->read(sub); 3432 } 3433 } 3434 3435 data->ret = pmu->commit_txn(pmu); 3436 3437 unlock: 3438 raw_spin_unlock(&ctx->lock); 3439 } 3440 3441 static inline u64 perf_event_count(struct perf_event *event) 3442 { 3443 if (event->pmu->count) 3444 return event->pmu->count(event); 3445 3446 return __perf_event_count(event); 3447 } 3448 3449 /* 3450 * NMI-safe method to read a local event, that is an event that 3451 * is: 3452 * - either for the current task, or for this CPU 3453 * - does not have inherit set, for inherited task events 3454 * will not be local and we cannot read them atomically 3455 * - must not have a pmu::count method 3456 */ 3457 u64 perf_event_read_local(struct perf_event *event) 3458 { 3459 unsigned long flags; 3460 u64 val; 3461 3462 /* 3463 * Disabling interrupts avoids all counter scheduling (context 3464 * switches, timer based rotation and IPIs). 3465 */ 3466 local_irq_save(flags); 3467 3468 /* If this is a per-task event, it must be for current */ 3469 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && 3470 event->hw.target != current); 3471 3472 /* If this is a per-CPU event, it must be for this CPU */ 3473 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && 3474 event->cpu != smp_processor_id()); 3475 3476 /* 3477 * It must not be an event with inherit set, we cannot read 3478 * all child counters from atomic context. 3479 */ 3480 WARN_ON_ONCE(event->attr.inherit); 3481 3482 /* 3483 * It must not have a pmu::count method, those are not 3484 * NMI safe. 3485 */ 3486 WARN_ON_ONCE(event->pmu->count); 3487 3488 /* 3489 * If the event is currently on this CPU, its either a per-task event, 3490 * or local to this CPU. Furthermore it means its ACTIVE (otherwise 3491 * oncpu == -1). 3492 */ 3493 if (event->oncpu == smp_processor_id()) 3494 event->pmu->read(event); 3495 3496 val = local64_read(&event->count); 3497 local_irq_restore(flags); 3498 3499 return val; 3500 } 3501 3502 static int perf_event_read(struct perf_event *event, bool group) 3503 { 3504 int ret = 0; 3505 3506 /* 3507 * If event is enabled and currently active on a CPU, update the 3508 * value in the event structure: 3509 */ 3510 if (event->state == PERF_EVENT_STATE_ACTIVE) { 3511 struct perf_read_data data = { 3512 .event = event, 3513 .group = group, 3514 .ret = 0, 3515 }; 3516 smp_call_function_single(event->oncpu, 3517 __perf_event_read, &data, 1); 3518 ret = data.ret; 3519 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 3520 struct perf_event_context *ctx = event->ctx; 3521 unsigned long flags; 3522 3523 raw_spin_lock_irqsave(&ctx->lock, flags); 3524 /* 3525 * may read while context is not active 3526 * (e.g., thread is blocked), in that case 3527 * we cannot update context time 3528 */ 3529 if (ctx->is_active) { 3530 update_context_time(ctx); 3531 update_cgrp_time_from_event(event); 3532 } 3533 if (group) 3534 update_group_times(event); 3535 else 3536 update_event_times(event); 3537 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3538 } 3539 3540 return ret; 3541 } 3542 3543 /* 3544 * Initialize the perf_event context in a task_struct: 3545 */ 3546 static void __perf_event_init_context(struct perf_event_context *ctx) 3547 { 3548 raw_spin_lock_init(&ctx->lock); 3549 mutex_init(&ctx->mutex); 3550 INIT_LIST_HEAD(&ctx->active_ctx_list); 3551 INIT_LIST_HEAD(&ctx->pinned_groups); 3552 INIT_LIST_HEAD(&ctx->flexible_groups); 3553 INIT_LIST_HEAD(&ctx->event_list); 3554 atomic_set(&ctx->refcount, 1); 3555 } 3556 3557 static struct perf_event_context * 3558 alloc_perf_context(struct pmu *pmu, struct task_struct *task) 3559 { 3560 struct perf_event_context *ctx; 3561 3562 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 3563 if (!ctx) 3564 return NULL; 3565 3566 __perf_event_init_context(ctx); 3567 if (task) { 3568 ctx->task = task; 3569 get_task_struct(task); 3570 } 3571 ctx->pmu = pmu; 3572 3573 return ctx; 3574 } 3575 3576 static struct task_struct * 3577 find_lively_task_by_vpid(pid_t vpid) 3578 { 3579 struct task_struct *task; 3580 3581 rcu_read_lock(); 3582 if (!vpid) 3583 task = current; 3584 else 3585 task = find_task_by_vpid(vpid); 3586 if (task) 3587 get_task_struct(task); 3588 rcu_read_unlock(); 3589 3590 if (!task) 3591 return ERR_PTR(-ESRCH); 3592 3593 return task; 3594 } 3595 3596 /* 3597 * Returns a matching context with refcount and pincount. 3598 */ 3599 static struct perf_event_context * 3600 find_get_context(struct pmu *pmu, struct task_struct *task, 3601 struct perf_event *event) 3602 { 3603 struct perf_event_context *ctx, *clone_ctx = NULL; 3604 struct perf_cpu_context *cpuctx; 3605 void *task_ctx_data = NULL; 3606 unsigned long flags; 3607 int ctxn, err; 3608 int cpu = event->cpu; 3609 3610 if (!task) { 3611 /* Must be root to operate on a CPU event: */ 3612 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 3613 return ERR_PTR(-EACCES); 3614 3615 /* 3616 * We could be clever and allow to attach a event to an 3617 * offline CPU and activate it when the CPU comes up, but 3618 * that's for later. 3619 */ 3620 if (!cpu_online(cpu)) 3621 return ERR_PTR(-ENODEV); 3622 3623 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 3624 ctx = &cpuctx->ctx; 3625 get_ctx(ctx); 3626 ++ctx->pin_count; 3627 3628 return ctx; 3629 } 3630 3631 err = -EINVAL; 3632 ctxn = pmu->task_ctx_nr; 3633 if (ctxn < 0) 3634 goto errout; 3635 3636 if (event->attach_state & PERF_ATTACH_TASK_DATA) { 3637 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); 3638 if (!task_ctx_data) { 3639 err = -ENOMEM; 3640 goto errout; 3641 } 3642 } 3643 3644 retry: 3645 ctx = perf_lock_task_context(task, ctxn, &flags); 3646 if (ctx) { 3647 clone_ctx = unclone_ctx(ctx); 3648 ++ctx->pin_count; 3649 3650 if (task_ctx_data && !ctx->task_ctx_data) { 3651 ctx->task_ctx_data = task_ctx_data; 3652 task_ctx_data = NULL; 3653 } 3654 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3655 3656 if (clone_ctx) 3657 put_ctx(clone_ctx); 3658 } else { 3659 ctx = alloc_perf_context(pmu, task); 3660 err = -ENOMEM; 3661 if (!ctx) 3662 goto errout; 3663 3664 if (task_ctx_data) { 3665 ctx->task_ctx_data = task_ctx_data; 3666 task_ctx_data = NULL; 3667 } 3668 3669 err = 0; 3670 mutex_lock(&task->perf_event_mutex); 3671 /* 3672 * If it has already passed perf_event_exit_task(). 3673 * we must see PF_EXITING, it takes this mutex too. 3674 */ 3675 if (task->flags & PF_EXITING) 3676 err = -ESRCH; 3677 else if (task->perf_event_ctxp[ctxn]) 3678 err = -EAGAIN; 3679 else { 3680 get_ctx(ctx); 3681 ++ctx->pin_count; 3682 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 3683 } 3684 mutex_unlock(&task->perf_event_mutex); 3685 3686 if (unlikely(err)) { 3687 put_ctx(ctx); 3688 3689 if (err == -EAGAIN) 3690 goto retry; 3691 goto errout; 3692 } 3693 } 3694 3695 kfree(task_ctx_data); 3696 return ctx; 3697 3698 errout: 3699 kfree(task_ctx_data); 3700 return ERR_PTR(err); 3701 } 3702 3703 static void perf_event_free_filter(struct perf_event *event); 3704 static void perf_event_free_bpf_prog(struct perf_event *event); 3705 3706 static void free_event_rcu(struct rcu_head *head) 3707 { 3708 struct perf_event *event; 3709 3710 event = container_of(head, struct perf_event, rcu_head); 3711 if (event->ns) 3712 put_pid_ns(event->ns); 3713 perf_event_free_filter(event); 3714 kfree(event); 3715 } 3716 3717 static void ring_buffer_attach(struct perf_event *event, 3718 struct ring_buffer *rb); 3719 3720 static void detach_sb_event(struct perf_event *event) 3721 { 3722 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); 3723 3724 raw_spin_lock(&pel->lock); 3725 list_del_rcu(&event->sb_list); 3726 raw_spin_unlock(&pel->lock); 3727 } 3728 3729 static bool is_sb_event(struct perf_event *event) 3730 { 3731 struct perf_event_attr *attr = &event->attr; 3732 3733 if (event->parent) 3734 return false; 3735 3736 if (event->attach_state & PERF_ATTACH_TASK) 3737 return false; 3738 3739 if (attr->mmap || attr->mmap_data || attr->mmap2 || 3740 attr->comm || attr->comm_exec || 3741 attr->task || 3742 attr->context_switch) 3743 return true; 3744 return false; 3745 } 3746 3747 static void unaccount_pmu_sb_event(struct perf_event *event) 3748 { 3749 if (is_sb_event(event)) 3750 detach_sb_event(event); 3751 } 3752 3753 static void unaccount_event_cpu(struct perf_event *event, int cpu) 3754 { 3755 if (event->parent) 3756 return; 3757 3758 if (is_cgroup_event(event)) 3759 atomic_dec(&per_cpu(perf_cgroup_events, cpu)); 3760 } 3761 3762 #ifdef CONFIG_NO_HZ_FULL 3763 static DEFINE_SPINLOCK(nr_freq_lock); 3764 #endif 3765 3766 static void unaccount_freq_event_nohz(void) 3767 { 3768 #ifdef CONFIG_NO_HZ_FULL 3769 spin_lock(&nr_freq_lock); 3770 if (atomic_dec_and_test(&nr_freq_events)) 3771 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); 3772 spin_unlock(&nr_freq_lock); 3773 #endif 3774 } 3775 3776 static void unaccount_freq_event(void) 3777 { 3778 if (tick_nohz_full_enabled()) 3779 unaccount_freq_event_nohz(); 3780 else 3781 atomic_dec(&nr_freq_events); 3782 } 3783 3784 static void unaccount_event(struct perf_event *event) 3785 { 3786 bool dec = false; 3787 3788 if (event->parent) 3789 return; 3790 3791 if (event->attach_state & PERF_ATTACH_TASK) 3792 dec = true; 3793 if (event->attr.mmap || event->attr.mmap_data) 3794 atomic_dec(&nr_mmap_events); 3795 if (event->attr.comm) 3796 atomic_dec(&nr_comm_events); 3797 if (event->attr.task) 3798 atomic_dec(&nr_task_events); 3799 if (event->attr.freq) 3800 unaccount_freq_event(); 3801 if (event->attr.context_switch) { 3802 dec = true; 3803 atomic_dec(&nr_switch_events); 3804 } 3805 if (is_cgroup_event(event)) 3806 dec = true; 3807 if (has_branch_stack(event)) 3808 dec = true; 3809 3810 if (dec) { 3811 if (!atomic_add_unless(&perf_sched_count, -1, 1)) 3812 schedule_delayed_work(&perf_sched_work, HZ); 3813 } 3814 3815 unaccount_event_cpu(event, event->cpu); 3816 3817 unaccount_pmu_sb_event(event); 3818 } 3819 3820 static void perf_sched_delayed(struct work_struct *work) 3821 { 3822 mutex_lock(&perf_sched_mutex); 3823 if (atomic_dec_and_test(&perf_sched_count)) 3824 static_branch_disable(&perf_sched_events); 3825 mutex_unlock(&perf_sched_mutex); 3826 } 3827 3828 /* 3829 * The following implement mutual exclusion of events on "exclusive" pmus 3830 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled 3831 * at a time, so we disallow creating events that might conflict, namely: 3832 * 3833 * 1) cpu-wide events in the presence of per-task events, 3834 * 2) per-task events in the presence of cpu-wide events, 3835 * 3) two matching events on the same context. 3836 * 3837 * The former two cases are handled in the allocation path (perf_event_alloc(), 3838 * _free_event()), the latter -- before the first perf_install_in_context(). 3839 */ 3840 static int exclusive_event_init(struct perf_event *event) 3841 { 3842 struct pmu *pmu = event->pmu; 3843 3844 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) 3845 return 0; 3846 3847 /* 3848 * Prevent co-existence of per-task and cpu-wide events on the 3849 * same exclusive pmu. 3850 * 3851 * Negative pmu::exclusive_cnt means there are cpu-wide 3852 * events on this "exclusive" pmu, positive means there are 3853 * per-task events. 3854 * 3855 * Since this is called in perf_event_alloc() path, event::ctx 3856 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK 3857 * to mean "per-task event", because unlike other attach states it 3858 * never gets cleared. 3859 */ 3860 if (event->attach_state & PERF_ATTACH_TASK) { 3861 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) 3862 return -EBUSY; 3863 } else { 3864 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) 3865 return -EBUSY; 3866 } 3867 3868 return 0; 3869 } 3870 3871 static void exclusive_event_destroy(struct perf_event *event) 3872 { 3873 struct pmu *pmu = event->pmu; 3874 3875 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) 3876 return; 3877 3878 /* see comment in exclusive_event_init() */ 3879 if (event->attach_state & PERF_ATTACH_TASK) 3880 atomic_dec(&pmu->exclusive_cnt); 3881 else 3882 atomic_inc(&pmu->exclusive_cnt); 3883 } 3884 3885 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) 3886 { 3887 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && 3888 (e1->cpu == e2->cpu || 3889 e1->cpu == -1 || 3890 e2->cpu == -1)) 3891 return true; 3892 return false; 3893 } 3894 3895 /* Called under the same ctx::mutex as perf_install_in_context() */ 3896 static bool exclusive_event_installable(struct perf_event *event, 3897 struct perf_event_context *ctx) 3898 { 3899 struct perf_event *iter_event; 3900 struct pmu *pmu = event->pmu; 3901 3902 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) 3903 return true; 3904 3905 list_for_each_entry(iter_event, &ctx->event_list, event_entry) { 3906 if (exclusive_event_match(iter_event, event)) 3907 return false; 3908 } 3909 3910 return true; 3911 } 3912 3913 static void perf_addr_filters_splice(struct perf_event *event, 3914 struct list_head *head); 3915 3916 static void _free_event(struct perf_event *event) 3917 { 3918 irq_work_sync(&event->pending); 3919 3920 unaccount_event(event); 3921 3922 if (event->rb) { 3923 /* 3924 * Can happen when we close an event with re-directed output. 3925 * 3926 * Since we have a 0 refcount, perf_mmap_close() will skip 3927 * over us; possibly making our ring_buffer_put() the last. 3928 */ 3929 mutex_lock(&event->mmap_mutex); 3930 ring_buffer_attach(event, NULL); 3931 mutex_unlock(&event->mmap_mutex); 3932 } 3933 3934 if (is_cgroup_event(event)) 3935 perf_detach_cgroup(event); 3936 3937 if (!event->parent) { 3938 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3939 put_callchain_buffers(); 3940 } 3941 3942 perf_event_free_bpf_prog(event); 3943 perf_addr_filters_splice(event, NULL); 3944 kfree(event->addr_filters_offs); 3945 3946 if (event->destroy) 3947 event->destroy(event); 3948 3949 if (event->ctx) 3950 put_ctx(event->ctx); 3951 3952 exclusive_event_destroy(event); 3953 module_put(event->pmu->module); 3954 3955 call_rcu(&event->rcu_head, free_event_rcu); 3956 } 3957 3958 /* 3959 * Used to free events which have a known refcount of 1, such as in error paths 3960 * where the event isn't exposed yet and inherited events. 3961 */ 3962 static void free_event(struct perf_event *event) 3963 { 3964 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, 3965 "unexpected event refcount: %ld; ptr=%p\n", 3966 atomic_long_read(&event->refcount), event)) { 3967 /* leak to avoid use-after-free */ 3968 return; 3969 } 3970 3971 _free_event(event); 3972 } 3973 3974 /* 3975 * Remove user event from the owner task. 3976 */ 3977 static void perf_remove_from_owner(struct perf_event *event) 3978 { 3979 struct task_struct *owner; 3980 3981 rcu_read_lock(); 3982 /* 3983 * Matches the smp_store_release() in perf_event_exit_task(). If we 3984 * observe !owner it means the list deletion is complete and we can 3985 * indeed free this event, otherwise we need to serialize on 3986 * owner->perf_event_mutex. 3987 */ 3988 owner = lockless_dereference(event->owner); 3989 if (owner) { 3990 /* 3991 * Since delayed_put_task_struct() also drops the last 3992 * task reference we can safely take a new reference 3993 * while holding the rcu_read_lock(). 3994 */ 3995 get_task_struct(owner); 3996 } 3997 rcu_read_unlock(); 3998 3999 if (owner) { 4000 /* 4001 * If we're here through perf_event_exit_task() we're already 4002 * holding ctx->mutex which would be an inversion wrt. the 4003 * normal lock order. 4004 * 4005 * However we can safely take this lock because its the child 4006 * ctx->mutex. 4007 */ 4008 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); 4009 4010 /* 4011 * We have to re-check the event->owner field, if it is cleared 4012 * we raced with perf_event_exit_task(), acquiring the mutex 4013 * ensured they're done, and we can proceed with freeing the 4014 * event. 4015 */ 4016 if (event->owner) { 4017 list_del_init(&event->owner_entry); 4018 smp_store_release(&event->owner, NULL); 4019 } 4020 mutex_unlock(&owner->perf_event_mutex); 4021 put_task_struct(owner); 4022 } 4023 } 4024 4025 static void put_event(struct perf_event *event) 4026 { 4027 if (!atomic_long_dec_and_test(&event->refcount)) 4028 return; 4029 4030 _free_event(event); 4031 } 4032 4033 /* 4034 * Kill an event dead; while event:refcount will preserve the event 4035 * object, it will not preserve its functionality. Once the last 'user' 4036 * gives up the object, we'll destroy the thing. 4037 */ 4038 int perf_event_release_kernel(struct perf_event *event) 4039 { 4040 struct perf_event_context *ctx = event->ctx; 4041 struct perf_event *child, *tmp; 4042 4043 /* 4044 * If we got here through err_file: fput(event_file); we will not have 4045 * attached to a context yet. 4046 */ 4047 if (!ctx) { 4048 WARN_ON_ONCE(event->attach_state & 4049 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); 4050 goto no_ctx; 4051 } 4052 4053 if (!is_kernel_event(event)) 4054 perf_remove_from_owner(event); 4055 4056 ctx = perf_event_ctx_lock(event); 4057 WARN_ON_ONCE(ctx->parent_ctx); 4058 perf_remove_from_context(event, DETACH_GROUP); 4059 4060 raw_spin_lock_irq(&ctx->lock); 4061 /* 4062 * Mark this even as STATE_DEAD, there is no external reference to it 4063 * anymore. 4064 * 4065 * Anybody acquiring event->child_mutex after the below loop _must_ 4066 * also see this, most importantly inherit_event() which will avoid 4067 * placing more children on the list. 4068 * 4069 * Thus this guarantees that we will in fact observe and kill _ALL_ 4070 * child events. 4071 */ 4072 event->state = PERF_EVENT_STATE_DEAD; 4073 raw_spin_unlock_irq(&ctx->lock); 4074 4075 perf_event_ctx_unlock(event, ctx); 4076 4077 again: 4078 mutex_lock(&event->child_mutex); 4079 list_for_each_entry(child, &event->child_list, child_list) { 4080 4081 /* 4082 * Cannot change, child events are not migrated, see the 4083 * comment with perf_event_ctx_lock_nested(). 4084 */ 4085 ctx = lockless_dereference(child->ctx); 4086 /* 4087 * Since child_mutex nests inside ctx::mutex, we must jump 4088 * through hoops. We start by grabbing a reference on the ctx. 4089 * 4090 * Since the event cannot get freed while we hold the 4091 * child_mutex, the context must also exist and have a !0 4092 * reference count. 4093 */ 4094 get_ctx(ctx); 4095 4096 /* 4097 * Now that we have a ctx ref, we can drop child_mutex, and 4098 * acquire ctx::mutex without fear of it going away. Then we 4099 * can re-acquire child_mutex. 4100 */ 4101 mutex_unlock(&event->child_mutex); 4102 mutex_lock(&ctx->mutex); 4103 mutex_lock(&event->child_mutex); 4104 4105 /* 4106 * Now that we hold ctx::mutex and child_mutex, revalidate our 4107 * state, if child is still the first entry, it didn't get freed 4108 * and we can continue doing so. 4109 */ 4110 tmp = list_first_entry_or_null(&event->child_list, 4111 struct perf_event, child_list); 4112 if (tmp == child) { 4113 perf_remove_from_context(child, DETACH_GROUP); 4114 list_del(&child->child_list); 4115 free_event(child); 4116 /* 4117 * This matches the refcount bump in inherit_event(); 4118 * this can't be the last reference. 4119 */ 4120 put_event(event); 4121 } 4122 4123 mutex_unlock(&event->child_mutex); 4124 mutex_unlock(&ctx->mutex); 4125 put_ctx(ctx); 4126 goto again; 4127 } 4128 mutex_unlock(&event->child_mutex); 4129 4130 no_ctx: 4131 put_event(event); /* Must be the 'last' reference */ 4132 return 0; 4133 } 4134 EXPORT_SYMBOL_GPL(perf_event_release_kernel); 4135 4136 /* 4137 * Called when the last reference to the file is gone. 4138 */ 4139 static int perf_release(struct inode *inode, struct file *file) 4140 { 4141 perf_event_release_kernel(file->private_data); 4142 return 0; 4143 } 4144 4145 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 4146 { 4147 struct perf_event *child; 4148 u64 total = 0; 4149 4150 *enabled = 0; 4151 *running = 0; 4152 4153 mutex_lock(&event->child_mutex); 4154 4155 (void)perf_event_read(event, false); 4156 total += perf_event_count(event); 4157 4158 *enabled += event->total_time_enabled + 4159 atomic64_read(&event->child_total_time_enabled); 4160 *running += event->total_time_running + 4161 atomic64_read(&event->child_total_time_running); 4162 4163 list_for_each_entry(child, &event->child_list, child_list) { 4164 (void)perf_event_read(child, false); 4165 total += perf_event_count(child); 4166 *enabled += child->total_time_enabled; 4167 *running += child->total_time_running; 4168 } 4169 mutex_unlock(&event->child_mutex); 4170 4171 return total; 4172 } 4173 EXPORT_SYMBOL_GPL(perf_event_read_value); 4174 4175 static int __perf_read_group_add(struct perf_event *leader, 4176 u64 read_format, u64 *values) 4177 { 4178 struct perf_event *sub; 4179 int n = 1; /* skip @nr */ 4180 int ret; 4181 4182 ret = perf_event_read(leader, true); 4183 if (ret) 4184 return ret; 4185 4186 /* 4187 * Since we co-schedule groups, {enabled,running} times of siblings 4188 * will be identical to those of the leader, so we only publish one 4189 * set. 4190 */ 4191 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 4192 values[n++] += leader->total_time_enabled + 4193 atomic64_read(&leader->child_total_time_enabled); 4194 } 4195 4196 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 4197 values[n++] += leader->total_time_running + 4198 atomic64_read(&leader->child_total_time_running); 4199 } 4200 4201 /* 4202 * Write {count,id} tuples for every sibling. 4203 */ 4204 values[n++] += perf_event_count(leader); 4205 if (read_format & PERF_FORMAT_ID) 4206 values[n++] = primary_event_id(leader); 4207 4208 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4209 values[n++] += perf_event_count(sub); 4210 if (read_format & PERF_FORMAT_ID) 4211 values[n++] = primary_event_id(sub); 4212 } 4213 4214 return 0; 4215 } 4216 4217 static int perf_read_group(struct perf_event *event, 4218 u64 read_format, char __user *buf) 4219 { 4220 struct perf_event *leader = event->group_leader, *child; 4221 struct perf_event_context *ctx = leader->ctx; 4222 int ret; 4223 u64 *values; 4224 4225 lockdep_assert_held(&ctx->mutex); 4226 4227 values = kzalloc(event->read_size, GFP_KERNEL); 4228 if (!values) 4229 return -ENOMEM; 4230 4231 values[0] = 1 + leader->nr_siblings; 4232 4233 /* 4234 * By locking the child_mutex of the leader we effectively 4235 * lock the child list of all siblings.. XXX explain how. 4236 */ 4237 mutex_lock(&leader->child_mutex); 4238 4239 ret = __perf_read_group_add(leader, read_format, values); 4240 if (ret) 4241 goto unlock; 4242 4243 list_for_each_entry(child, &leader->child_list, child_list) { 4244 ret = __perf_read_group_add(child, read_format, values); 4245 if (ret) 4246 goto unlock; 4247 } 4248 4249 mutex_unlock(&leader->child_mutex); 4250 4251 ret = event->read_size; 4252 if (copy_to_user(buf, values, event->read_size)) 4253 ret = -EFAULT; 4254 goto out; 4255 4256 unlock: 4257 mutex_unlock(&leader->child_mutex); 4258 out: 4259 kfree(values); 4260 return ret; 4261 } 4262 4263 static int perf_read_one(struct perf_event *event, 4264 u64 read_format, char __user *buf) 4265 { 4266 u64 enabled, running; 4267 u64 values[4]; 4268 int n = 0; 4269 4270 values[n++] = perf_event_read_value(event, &enabled, &running); 4271 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 4272 values[n++] = enabled; 4273 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 4274 values[n++] = running; 4275 if (read_format & PERF_FORMAT_ID) 4276 values[n++] = primary_event_id(event); 4277 4278 if (copy_to_user(buf, values, n * sizeof(u64))) 4279 return -EFAULT; 4280 4281 return n * sizeof(u64); 4282 } 4283 4284 static bool is_event_hup(struct perf_event *event) 4285 { 4286 bool no_children; 4287 4288 if (event->state > PERF_EVENT_STATE_EXIT) 4289 return false; 4290 4291 mutex_lock(&event->child_mutex); 4292 no_children = list_empty(&event->child_list); 4293 mutex_unlock(&event->child_mutex); 4294 return no_children; 4295 } 4296 4297 /* 4298 * Read the performance event - simple non blocking version for now 4299 */ 4300 static ssize_t 4301 __perf_read(struct perf_event *event, char __user *buf, size_t count) 4302 { 4303 u64 read_format = event->attr.read_format; 4304 int ret; 4305 4306 /* 4307 * Return end-of-file for a read on a event that is in 4308 * error state (i.e. because it was pinned but it couldn't be 4309 * scheduled on to the CPU at some point). 4310 */ 4311 if (event->state == PERF_EVENT_STATE_ERROR) 4312 return 0; 4313 4314 if (count < event->read_size) 4315 return -ENOSPC; 4316 4317 WARN_ON_ONCE(event->ctx->parent_ctx); 4318 if (read_format & PERF_FORMAT_GROUP) 4319 ret = perf_read_group(event, read_format, buf); 4320 else 4321 ret = perf_read_one(event, read_format, buf); 4322 4323 return ret; 4324 } 4325 4326 static ssize_t 4327 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 4328 { 4329 struct perf_event *event = file->private_data; 4330 struct perf_event_context *ctx; 4331 int ret; 4332 4333 ctx = perf_event_ctx_lock(event); 4334 ret = __perf_read(event, buf, count); 4335 perf_event_ctx_unlock(event, ctx); 4336 4337 return ret; 4338 } 4339 4340 static unsigned int perf_poll(struct file *file, poll_table *wait) 4341 { 4342 struct perf_event *event = file->private_data; 4343 struct ring_buffer *rb; 4344 unsigned int events = POLLHUP; 4345 4346 poll_wait(file, &event->waitq, wait); 4347 4348 if (is_event_hup(event)) 4349 return events; 4350 4351 /* 4352 * Pin the event->rb by taking event->mmap_mutex; otherwise 4353 * perf_event_set_output() can swizzle our rb and make us miss wakeups. 4354 */ 4355 mutex_lock(&event->mmap_mutex); 4356 rb = event->rb; 4357 if (rb) 4358 events = atomic_xchg(&rb->poll, 0); 4359 mutex_unlock(&event->mmap_mutex); 4360 return events; 4361 } 4362 4363 static void _perf_event_reset(struct perf_event *event) 4364 { 4365 (void)perf_event_read(event, false); 4366 local64_set(&event->count, 0); 4367 perf_event_update_userpage(event); 4368 } 4369 4370 /* 4371 * Holding the top-level event's child_mutex means that any 4372 * descendant process that has inherited this event will block 4373 * in perf_event_exit_event() if it goes to exit, thus satisfying the 4374 * task existence requirements of perf_event_enable/disable. 4375 */ 4376 static void perf_event_for_each_child(struct perf_event *event, 4377 void (*func)(struct perf_event *)) 4378 { 4379 struct perf_event *child; 4380 4381 WARN_ON_ONCE(event->ctx->parent_ctx); 4382 4383 mutex_lock(&event->child_mutex); 4384 func(event); 4385 list_for_each_entry(child, &event->child_list, child_list) 4386 func(child); 4387 mutex_unlock(&event->child_mutex); 4388 } 4389 4390 static void perf_event_for_each(struct perf_event *event, 4391 void (*func)(struct perf_event *)) 4392 { 4393 struct perf_event_context *ctx = event->ctx; 4394 struct perf_event *sibling; 4395 4396 lockdep_assert_held(&ctx->mutex); 4397 4398 event = event->group_leader; 4399 4400 perf_event_for_each_child(event, func); 4401 list_for_each_entry(sibling, &event->sibling_list, group_entry) 4402 perf_event_for_each_child(sibling, func); 4403 } 4404 4405 static void __perf_event_period(struct perf_event *event, 4406 struct perf_cpu_context *cpuctx, 4407 struct perf_event_context *ctx, 4408 void *info) 4409 { 4410 u64 value = *((u64 *)info); 4411 bool active; 4412 4413 if (event->attr.freq) { 4414 event->attr.sample_freq = value; 4415 } else { 4416 event->attr.sample_period = value; 4417 event->hw.sample_period = value; 4418 } 4419 4420 active = (event->state == PERF_EVENT_STATE_ACTIVE); 4421 if (active) { 4422 perf_pmu_disable(ctx->pmu); 4423 /* 4424 * We could be throttled; unthrottle now to avoid the tick 4425 * trying to unthrottle while we already re-started the event. 4426 */ 4427 if (event->hw.interrupts == MAX_INTERRUPTS) { 4428 event->hw.interrupts = 0; 4429 perf_log_throttle(event, 1); 4430 } 4431 event->pmu->stop(event, PERF_EF_UPDATE); 4432 } 4433 4434 local64_set(&event->hw.period_left, 0); 4435 4436 if (active) { 4437 event->pmu->start(event, PERF_EF_RELOAD); 4438 perf_pmu_enable(ctx->pmu); 4439 } 4440 } 4441 4442 static int perf_event_period(struct perf_event *event, u64 __user *arg) 4443 { 4444 u64 value; 4445 4446 if (!is_sampling_event(event)) 4447 return -EINVAL; 4448 4449 if (copy_from_user(&value, arg, sizeof(value))) 4450 return -EFAULT; 4451 4452 if (!value) 4453 return -EINVAL; 4454 4455 if (event->attr.freq && value > sysctl_perf_event_sample_rate) 4456 return -EINVAL; 4457 4458 event_function_call(event, __perf_event_period, &value); 4459 4460 return 0; 4461 } 4462 4463 static const struct file_operations perf_fops; 4464 4465 static inline int perf_fget_light(int fd, struct fd *p) 4466 { 4467 struct fd f = fdget(fd); 4468 if (!f.file) 4469 return -EBADF; 4470 4471 if (f.file->f_op != &perf_fops) { 4472 fdput(f); 4473 return -EBADF; 4474 } 4475 *p = f; 4476 return 0; 4477 } 4478 4479 static int perf_event_set_output(struct perf_event *event, 4480 struct perf_event *output_event); 4481 static int perf_event_set_filter(struct perf_event *event, void __user *arg); 4482 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); 4483 4484 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) 4485 { 4486 void (*func)(struct perf_event *); 4487 u32 flags = arg; 4488 4489 switch (cmd) { 4490 case PERF_EVENT_IOC_ENABLE: 4491 func = _perf_event_enable; 4492 break; 4493 case PERF_EVENT_IOC_DISABLE: 4494 func = _perf_event_disable; 4495 break; 4496 case PERF_EVENT_IOC_RESET: 4497 func = _perf_event_reset; 4498 break; 4499 4500 case PERF_EVENT_IOC_REFRESH: 4501 return _perf_event_refresh(event, arg); 4502 4503 case PERF_EVENT_IOC_PERIOD: 4504 return perf_event_period(event, (u64 __user *)arg); 4505 4506 case PERF_EVENT_IOC_ID: 4507 { 4508 u64 id = primary_event_id(event); 4509 4510 if (copy_to_user((void __user *)arg, &id, sizeof(id))) 4511 return -EFAULT; 4512 return 0; 4513 } 4514 4515 case PERF_EVENT_IOC_SET_OUTPUT: 4516 { 4517 int ret; 4518 if (arg != -1) { 4519 struct perf_event *output_event; 4520 struct fd output; 4521 ret = perf_fget_light(arg, &output); 4522 if (ret) 4523 return ret; 4524 output_event = output.file->private_data; 4525 ret = perf_event_set_output(event, output_event); 4526 fdput(output); 4527 } else { 4528 ret = perf_event_set_output(event, NULL); 4529 } 4530 return ret; 4531 } 4532 4533 case PERF_EVENT_IOC_SET_FILTER: 4534 return perf_event_set_filter(event, (void __user *)arg); 4535 4536 case PERF_EVENT_IOC_SET_BPF: 4537 return perf_event_set_bpf_prog(event, arg); 4538 4539 case PERF_EVENT_IOC_PAUSE_OUTPUT: { 4540 struct ring_buffer *rb; 4541 4542 rcu_read_lock(); 4543 rb = rcu_dereference(event->rb); 4544 if (!rb || !rb->nr_pages) { 4545 rcu_read_unlock(); 4546 return -EINVAL; 4547 } 4548 rb_toggle_paused(rb, !!arg); 4549 rcu_read_unlock(); 4550 return 0; 4551 } 4552 default: 4553 return -ENOTTY; 4554 } 4555 4556 if (flags & PERF_IOC_FLAG_GROUP) 4557 perf_event_for_each(event, func); 4558 else 4559 perf_event_for_each_child(event, func); 4560 4561 return 0; 4562 } 4563 4564 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 4565 { 4566 struct perf_event *event = file->private_data; 4567 struct perf_event_context *ctx; 4568 long ret; 4569 4570 ctx = perf_event_ctx_lock(event); 4571 ret = _perf_ioctl(event, cmd, arg); 4572 perf_event_ctx_unlock(event, ctx); 4573 4574 return ret; 4575 } 4576 4577 #ifdef CONFIG_COMPAT 4578 static long perf_compat_ioctl(struct file *file, unsigned int cmd, 4579 unsigned long arg) 4580 { 4581 switch (_IOC_NR(cmd)) { 4582 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): 4583 case _IOC_NR(PERF_EVENT_IOC_ID): 4584 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ 4585 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { 4586 cmd &= ~IOCSIZE_MASK; 4587 cmd |= sizeof(void *) << IOCSIZE_SHIFT; 4588 } 4589 break; 4590 } 4591 return perf_ioctl(file, cmd, arg); 4592 } 4593 #else 4594 # define perf_compat_ioctl NULL 4595 #endif 4596 4597 int perf_event_task_enable(void) 4598 { 4599 struct perf_event_context *ctx; 4600 struct perf_event *event; 4601 4602 mutex_lock(¤t->perf_event_mutex); 4603 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { 4604 ctx = perf_event_ctx_lock(event); 4605 perf_event_for_each_child(event, _perf_event_enable); 4606 perf_event_ctx_unlock(event, ctx); 4607 } 4608 mutex_unlock(¤t->perf_event_mutex); 4609 4610 return 0; 4611 } 4612 4613 int perf_event_task_disable(void) 4614 { 4615 struct perf_event_context *ctx; 4616 struct perf_event *event; 4617 4618 mutex_lock(¤t->perf_event_mutex); 4619 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { 4620 ctx = perf_event_ctx_lock(event); 4621 perf_event_for_each_child(event, _perf_event_disable); 4622 perf_event_ctx_unlock(event, ctx); 4623 } 4624 mutex_unlock(¤t->perf_event_mutex); 4625 4626 return 0; 4627 } 4628 4629 static int perf_event_index(struct perf_event *event) 4630 { 4631 if (event->hw.state & PERF_HES_STOPPED) 4632 return 0; 4633 4634 if (event->state != PERF_EVENT_STATE_ACTIVE) 4635 return 0; 4636 4637 return event->pmu->event_idx(event); 4638 } 4639 4640 static void calc_timer_values(struct perf_event *event, 4641 u64 *now, 4642 u64 *enabled, 4643 u64 *running) 4644 { 4645 u64 ctx_time; 4646 4647 *now = perf_clock(); 4648 ctx_time = event->shadow_ctx_time + *now; 4649 *enabled = ctx_time - event->tstamp_enabled; 4650 *running = ctx_time - event->tstamp_running; 4651 } 4652 4653 static void perf_event_init_userpage(struct perf_event *event) 4654 { 4655 struct perf_event_mmap_page *userpg; 4656 struct ring_buffer *rb; 4657 4658 rcu_read_lock(); 4659 rb = rcu_dereference(event->rb); 4660 if (!rb) 4661 goto unlock; 4662 4663 userpg = rb->user_page; 4664 4665 /* Allow new userspace to detect that bit 0 is deprecated */ 4666 userpg->cap_bit0_is_deprecated = 1; 4667 userpg->size = offsetof(struct perf_event_mmap_page, __reserved); 4668 userpg->data_offset = PAGE_SIZE; 4669 userpg->data_size = perf_data_size(rb); 4670 4671 unlock: 4672 rcu_read_unlock(); 4673 } 4674 4675 void __weak arch_perf_update_userpage( 4676 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) 4677 { 4678 } 4679 4680 /* 4681 * Callers need to ensure there can be no nesting of this function, otherwise 4682 * the seqlock logic goes bad. We can not serialize this because the arch 4683 * code calls this from NMI context. 4684 */ 4685 void perf_event_update_userpage(struct perf_event *event) 4686 { 4687 struct perf_event_mmap_page *userpg; 4688 struct ring_buffer *rb; 4689 u64 enabled, running, now; 4690 4691 rcu_read_lock(); 4692 rb = rcu_dereference(event->rb); 4693 if (!rb) 4694 goto unlock; 4695 4696 /* 4697 * compute total_time_enabled, total_time_running 4698 * based on snapshot values taken when the event 4699 * was last scheduled in. 4700 * 4701 * we cannot simply called update_context_time() 4702 * because of locking issue as we can be called in 4703 * NMI context 4704 */ 4705 calc_timer_values(event, &now, &enabled, &running); 4706 4707 userpg = rb->user_page; 4708 /* 4709 * Disable preemption so as to not let the corresponding user-space 4710 * spin too long if we get preempted. 4711 */ 4712 preempt_disable(); 4713 ++userpg->lock; 4714 barrier(); 4715 userpg->index = perf_event_index(event); 4716 userpg->offset = perf_event_count(event); 4717 if (userpg->index) 4718 userpg->offset -= local64_read(&event->hw.prev_count); 4719 4720 userpg->time_enabled = enabled + 4721 atomic64_read(&event->child_total_time_enabled); 4722 4723 userpg->time_running = running + 4724 atomic64_read(&event->child_total_time_running); 4725 4726 arch_perf_update_userpage(event, userpg, now); 4727 4728 barrier(); 4729 ++userpg->lock; 4730 preempt_enable(); 4731 unlock: 4732 rcu_read_unlock(); 4733 } 4734 4735 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 4736 { 4737 struct perf_event *event = vma->vm_file->private_data; 4738 struct ring_buffer *rb; 4739 int ret = VM_FAULT_SIGBUS; 4740 4741 if (vmf->flags & FAULT_FLAG_MKWRITE) { 4742 if (vmf->pgoff == 0) 4743 ret = 0; 4744 return ret; 4745 } 4746 4747 rcu_read_lock(); 4748 rb = rcu_dereference(event->rb); 4749 if (!rb) 4750 goto unlock; 4751 4752 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 4753 goto unlock; 4754 4755 vmf->page = perf_mmap_to_page(rb, vmf->pgoff); 4756 if (!vmf->page) 4757 goto unlock; 4758 4759 get_page(vmf->page); 4760 vmf->page->mapping = vma->vm_file->f_mapping; 4761 vmf->page->index = vmf->pgoff; 4762 4763 ret = 0; 4764 unlock: 4765 rcu_read_unlock(); 4766 4767 return ret; 4768 } 4769 4770 static void ring_buffer_attach(struct perf_event *event, 4771 struct ring_buffer *rb) 4772 { 4773 struct ring_buffer *old_rb = NULL; 4774 unsigned long flags; 4775 4776 if (event->rb) { 4777 /* 4778 * Should be impossible, we set this when removing 4779 * event->rb_entry and wait/clear when adding event->rb_entry. 4780 */ 4781 WARN_ON_ONCE(event->rcu_pending); 4782 4783 old_rb = event->rb; 4784 spin_lock_irqsave(&old_rb->event_lock, flags); 4785 list_del_rcu(&event->rb_entry); 4786 spin_unlock_irqrestore(&old_rb->event_lock, flags); 4787 4788 event->rcu_batches = get_state_synchronize_rcu(); 4789 event->rcu_pending = 1; 4790 } 4791 4792 if (rb) { 4793 if (event->rcu_pending) { 4794 cond_synchronize_rcu(event->rcu_batches); 4795 event->rcu_pending = 0; 4796 } 4797 4798 spin_lock_irqsave(&rb->event_lock, flags); 4799 list_add_rcu(&event->rb_entry, &rb->event_list); 4800 spin_unlock_irqrestore(&rb->event_lock, flags); 4801 } 4802 4803 rcu_assign_pointer(event->rb, rb); 4804 4805 if (old_rb) { 4806 ring_buffer_put(old_rb); 4807 /* 4808 * Since we detached before setting the new rb, so that we 4809 * could attach the new rb, we could have missed a wakeup. 4810 * Provide it now. 4811 */ 4812 wake_up_all(&event->waitq); 4813 } 4814 } 4815 4816 static void ring_buffer_wakeup(struct perf_event *event) 4817 { 4818 struct ring_buffer *rb; 4819 4820 rcu_read_lock(); 4821 rb = rcu_dereference(event->rb); 4822 if (rb) { 4823 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 4824 wake_up_all(&event->waitq); 4825 } 4826 rcu_read_unlock(); 4827 } 4828 4829 struct ring_buffer *ring_buffer_get(struct perf_event *event) 4830 { 4831 struct ring_buffer *rb; 4832 4833 rcu_read_lock(); 4834 rb = rcu_dereference(event->rb); 4835 if (rb) { 4836 if (!atomic_inc_not_zero(&rb->refcount)) 4837 rb = NULL; 4838 } 4839 rcu_read_unlock(); 4840 4841 return rb; 4842 } 4843 4844 void ring_buffer_put(struct ring_buffer *rb) 4845 { 4846 if (!atomic_dec_and_test(&rb->refcount)) 4847 return; 4848 4849 WARN_ON_ONCE(!list_empty(&rb->event_list)); 4850 4851 call_rcu(&rb->rcu_head, rb_free_rcu); 4852 } 4853 4854 static void perf_mmap_open(struct vm_area_struct *vma) 4855 { 4856 struct perf_event *event = vma->vm_file->private_data; 4857 4858 atomic_inc(&event->mmap_count); 4859 atomic_inc(&event->rb->mmap_count); 4860 4861 if (vma->vm_pgoff) 4862 atomic_inc(&event->rb->aux_mmap_count); 4863 4864 if (event->pmu->event_mapped) 4865 event->pmu->event_mapped(event); 4866 } 4867 4868 static void perf_pmu_output_stop(struct perf_event *event); 4869 4870 /* 4871 * A buffer can be mmap()ed multiple times; either directly through the same 4872 * event, or through other events by use of perf_event_set_output(). 4873 * 4874 * In order to undo the VM accounting done by perf_mmap() we need to destroy 4875 * the buffer here, where we still have a VM context. This means we need 4876 * to detach all events redirecting to us. 4877 */ 4878 static void perf_mmap_close(struct vm_area_struct *vma) 4879 { 4880 struct perf_event *event = vma->vm_file->private_data; 4881 4882 struct ring_buffer *rb = ring_buffer_get(event); 4883 struct user_struct *mmap_user = rb->mmap_user; 4884 int mmap_locked = rb->mmap_locked; 4885 unsigned long size = perf_data_size(rb); 4886 4887 if (event->pmu->event_unmapped) 4888 event->pmu->event_unmapped(event); 4889 4890 /* 4891 * rb->aux_mmap_count will always drop before rb->mmap_count and 4892 * event->mmap_count, so it is ok to use event->mmap_mutex to 4893 * serialize with perf_mmap here. 4894 */ 4895 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && 4896 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { 4897 /* 4898 * Stop all AUX events that are writing to this buffer, 4899 * so that we can free its AUX pages and corresponding PMU 4900 * data. Note that after rb::aux_mmap_count dropped to zero, 4901 * they won't start any more (see perf_aux_output_begin()). 4902 */ 4903 perf_pmu_output_stop(event); 4904 4905 /* now it's safe to free the pages */ 4906 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); 4907 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; 4908 4909 /* this has to be the last one */ 4910 rb_free_aux(rb); 4911 WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); 4912 4913 mutex_unlock(&event->mmap_mutex); 4914 } 4915 4916 atomic_dec(&rb->mmap_count); 4917 4918 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4919 goto out_put; 4920 4921 ring_buffer_attach(event, NULL); 4922 mutex_unlock(&event->mmap_mutex); 4923 4924 /* If there's still other mmap()s of this buffer, we're done. */ 4925 if (atomic_read(&rb->mmap_count)) 4926 goto out_put; 4927 4928 /* 4929 * No other mmap()s, detach from all other events that might redirect 4930 * into the now unreachable buffer. Somewhat complicated by the 4931 * fact that rb::event_lock otherwise nests inside mmap_mutex. 4932 */ 4933 again: 4934 rcu_read_lock(); 4935 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { 4936 if (!atomic_long_inc_not_zero(&event->refcount)) { 4937 /* 4938 * This event is en-route to free_event() which will 4939 * detach it and remove it from the list. 4940 */ 4941 continue; 4942 } 4943 rcu_read_unlock(); 4944 4945 mutex_lock(&event->mmap_mutex); 4946 /* 4947 * Check we didn't race with perf_event_set_output() which can 4948 * swizzle the rb from under us while we were waiting to 4949 * acquire mmap_mutex. 4950 * 4951 * If we find a different rb; ignore this event, a next 4952 * iteration will no longer find it on the list. We have to 4953 * still restart the iteration to make sure we're not now 4954 * iterating the wrong list. 4955 */ 4956 if (event->rb == rb) 4957 ring_buffer_attach(event, NULL); 4958 4959 mutex_unlock(&event->mmap_mutex); 4960 put_event(event); 4961 4962 /* 4963 * Restart the iteration; either we're on the wrong list or 4964 * destroyed its integrity by doing a deletion. 4965 */ 4966 goto again; 4967 } 4968 rcu_read_unlock(); 4969 4970 /* 4971 * It could be there's still a few 0-ref events on the list; they'll 4972 * get cleaned up by free_event() -- they'll also still have their 4973 * ref on the rb and will free it whenever they are done with it. 4974 * 4975 * Aside from that, this buffer is 'fully' detached and unmapped, 4976 * undo the VM accounting. 4977 */ 4978 4979 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); 4980 vma->vm_mm->pinned_vm -= mmap_locked; 4981 free_uid(mmap_user); 4982 4983 out_put: 4984 ring_buffer_put(rb); /* could be last */ 4985 } 4986 4987 static const struct vm_operations_struct perf_mmap_vmops = { 4988 .open = perf_mmap_open, 4989 .close = perf_mmap_close, /* non mergable */ 4990 .fault = perf_mmap_fault, 4991 .page_mkwrite = perf_mmap_fault, 4992 }; 4993 4994 static int perf_mmap(struct file *file, struct vm_area_struct *vma) 4995 { 4996 struct perf_event *event = file->private_data; 4997 unsigned long user_locked, user_lock_limit; 4998 struct user_struct *user = current_user(); 4999 unsigned long locked, lock_limit; 5000 struct ring_buffer *rb = NULL; 5001 unsigned long vma_size; 5002 unsigned long nr_pages; 5003 long user_extra = 0, extra = 0; 5004 int ret = 0, flags = 0; 5005 5006 /* 5007 * Don't allow mmap() of inherited per-task counters. This would 5008 * create a performance issue due to all children writing to the 5009 * same rb. 5010 */ 5011 if (event->cpu == -1 && event->attr.inherit) 5012 return -EINVAL; 5013 5014 if (!(vma->vm_flags & VM_SHARED)) 5015 return -EINVAL; 5016 5017 vma_size = vma->vm_end - vma->vm_start; 5018 5019 if (vma->vm_pgoff == 0) { 5020 nr_pages = (vma_size / PAGE_SIZE) - 1; 5021 } else { 5022 /* 5023 * AUX area mapping: if rb->aux_nr_pages != 0, it's already 5024 * mapped, all subsequent mappings should have the same size 5025 * and offset. Must be above the normal perf buffer. 5026 */ 5027 u64 aux_offset, aux_size; 5028 5029 if (!event->rb) 5030 return -EINVAL; 5031 5032 nr_pages = vma_size / PAGE_SIZE; 5033 5034 mutex_lock(&event->mmap_mutex); 5035 ret = -EINVAL; 5036 5037 rb = event->rb; 5038 if (!rb) 5039 goto aux_unlock; 5040 5041 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); 5042 aux_size = ACCESS_ONCE(rb->user_page->aux_size); 5043 5044 if (aux_offset < perf_data_size(rb) + PAGE_SIZE) 5045 goto aux_unlock; 5046 5047 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) 5048 goto aux_unlock; 5049 5050 /* already mapped with a different offset */ 5051 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) 5052 goto aux_unlock; 5053 5054 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) 5055 goto aux_unlock; 5056 5057 /* already mapped with a different size */ 5058 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) 5059 goto aux_unlock; 5060 5061 if (!is_power_of_2(nr_pages)) 5062 goto aux_unlock; 5063 5064 if (!atomic_inc_not_zero(&rb->mmap_count)) 5065 goto aux_unlock; 5066 5067 if (rb_has_aux(rb)) { 5068 atomic_inc(&rb->aux_mmap_count); 5069 ret = 0; 5070 goto unlock; 5071 } 5072 5073 atomic_set(&rb->aux_mmap_count, 1); 5074 user_extra = nr_pages; 5075 5076 goto accounting; 5077 } 5078 5079 /* 5080 * If we have rb pages ensure they're a power-of-two number, so we 5081 * can do bitmasks instead of modulo. 5082 */ 5083 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 5084 return -EINVAL; 5085 5086 if (vma_size != PAGE_SIZE * (1 + nr_pages)) 5087 return -EINVAL; 5088 5089 WARN_ON_ONCE(event->ctx->parent_ctx); 5090 again: 5091 mutex_lock(&event->mmap_mutex); 5092 if (event->rb) { 5093 if (event->rb->nr_pages != nr_pages) { 5094 ret = -EINVAL; 5095 goto unlock; 5096 } 5097 5098 if (!atomic_inc_not_zero(&event->rb->mmap_count)) { 5099 /* 5100 * Raced against perf_mmap_close() through 5101 * perf_event_set_output(). Try again, hope for better 5102 * luck. 5103 */ 5104 mutex_unlock(&event->mmap_mutex); 5105 goto again; 5106 } 5107 5108 goto unlock; 5109 } 5110 5111 user_extra = nr_pages + 1; 5112 5113 accounting: 5114 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); 5115 5116 /* 5117 * Increase the limit linearly with more CPUs: 5118 */ 5119 user_lock_limit *= num_online_cpus(); 5120 5121 user_locked = atomic_long_read(&user->locked_vm) + user_extra; 5122 5123 if (user_locked > user_lock_limit) 5124 extra = user_locked - user_lock_limit; 5125 5126 lock_limit = rlimit(RLIMIT_MEMLOCK); 5127 lock_limit >>= PAGE_SHIFT; 5128 locked = vma->vm_mm->pinned_vm + extra; 5129 5130 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 5131 !capable(CAP_IPC_LOCK)) { 5132 ret = -EPERM; 5133 goto unlock; 5134 } 5135 5136 WARN_ON(!rb && event->rb); 5137 5138 if (vma->vm_flags & VM_WRITE) 5139 flags |= RING_BUFFER_WRITABLE; 5140 5141 if (!rb) { 5142 rb = rb_alloc(nr_pages, 5143 event->attr.watermark ? event->attr.wakeup_watermark : 0, 5144 event->cpu, flags); 5145 5146 if (!rb) { 5147 ret = -ENOMEM; 5148 goto unlock; 5149 } 5150 5151 atomic_set(&rb->mmap_count, 1); 5152 rb->mmap_user = get_current_user(); 5153 rb->mmap_locked = extra; 5154 5155 ring_buffer_attach(event, rb); 5156 5157 perf_event_init_userpage(event); 5158 perf_event_update_userpage(event); 5159 } else { 5160 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, 5161 event->attr.aux_watermark, flags); 5162 if (!ret) 5163 rb->aux_mmap_locked = extra; 5164 } 5165 5166 unlock: 5167 if (!ret) { 5168 atomic_long_add(user_extra, &user->locked_vm); 5169 vma->vm_mm->pinned_vm += extra; 5170 5171 atomic_inc(&event->mmap_count); 5172 } else if (rb) { 5173 atomic_dec(&rb->mmap_count); 5174 } 5175 aux_unlock: 5176 mutex_unlock(&event->mmap_mutex); 5177 5178 /* 5179 * Since pinned accounting is per vm we cannot allow fork() to copy our 5180 * vma. 5181 */ 5182 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; 5183 vma->vm_ops = &perf_mmap_vmops; 5184 5185 if (event->pmu->event_mapped) 5186 event->pmu->event_mapped(event); 5187 5188 return ret; 5189 } 5190 5191 static int perf_fasync(int fd, struct file *filp, int on) 5192 { 5193 struct inode *inode = file_inode(filp); 5194 struct perf_event *event = filp->private_data; 5195 int retval; 5196 5197 inode_lock(inode); 5198 retval = fasync_helper(fd, filp, on, &event->fasync); 5199 inode_unlock(inode); 5200 5201 if (retval < 0) 5202 return retval; 5203 5204 return 0; 5205 } 5206 5207 static const struct file_operations perf_fops = { 5208 .llseek = no_llseek, 5209 .release = perf_release, 5210 .read = perf_read, 5211 .poll = perf_poll, 5212 .unlocked_ioctl = perf_ioctl, 5213 .compat_ioctl = perf_compat_ioctl, 5214 .mmap = perf_mmap, 5215 .fasync = perf_fasync, 5216 }; 5217 5218 /* 5219 * Perf event wakeup 5220 * 5221 * If there's data, ensure we set the poll() state and publish everything 5222 * to user-space before waking everybody up. 5223 */ 5224 5225 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) 5226 { 5227 /* only the parent has fasync state */ 5228 if (event->parent) 5229 event = event->parent; 5230 return &event->fasync; 5231 } 5232 5233 void perf_event_wakeup(struct perf_event *event) 5234 { 5235 ring_buffer_wakeup(event); 5236 5237 if (event->pending_kill) { 5238 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); 5239 event->pending_kill = 0; 5240 } 5241 } 5242 5243 static void perf_pending_event(struct irq_work *entry) 5244 { 5245 struct perf_event *event = container_of(entry, 5246 struct perf_event, pending); 5247 int rctx; 5248 5249 rctx = perf_swevent_get_recursion_context(); 5250 /* 5251 * If we 'fail' here, that's OK, it means recursion is already disabled 5252 * and we won't recurse 'further'. 5253 */ 5254 5255 if (event->pending_disable) { 5256 event->pending_disable = 0; 5257 perf_event_disable_local(event); 5258 } 5259 5260 if (event->pending_wakeup) { 5261 event->pending_wakeup = 0; 5262 perf_event_wakeup(event); 5263 } 5264 5265 if (rctx >= 0) 5266 perf_swevent_put_recursion_context(rctx); 5267 } 5268 5269 /* 5270 * We assume there is only KVM supporting the callbacks. 5271 * Later on, we might change it to a list if there is 5272 * another virtualization implementation supporting the callbacks. 5273 */ 5274 struct perf_guest_info_callbacks *perf_guest_cbs; 5275 5276 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 5277 { 5278 perf_guest_cbs = cbs; 5279 return 0; 5280 } 5281 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); 5282 5283 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 5284 { 5285 perf_guest_cbs = NULL; 5286 return 0; 5287 } 5288 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 5289 5290 static void 5291 perf_output_sample_regs(struct perf_output_handle *handle, 5292 struct pt_regs *regs, u64 mask) 5293 { 5294 int bit; 5295 5296 for_each_set_bit(bit, (const unsigned long *) &mask, 5297 sizeof(mask) * BITS_PER_BYTE) { 5298 u64 val; 5299 5300 val = perf_reg_value(regs, bit); 5301 perf_output_put(handle, val); 5302 } 5303 } 5304 5305 static void perf_sample_regs_user(struct perf_regs *regs_user, 5306 struct pt_regs *regs, 5307 struct pt_regs *regs_user_copy) 5308 { 5309 if (user_mode(regs)) { 5310 regs_user->abi = perf_reg_abi(current); 5311 regs_user->regs = regs; 5312 } else if (current->mm) { 5313 perf_get_regs_user(regs_user, regs, regs_user_copy); 5314 } else { 5315 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 5316 regs_user->regs = NULL; 5317 } 5318 } 5319 5320 static void perf_sample_regs_intr(struct perf_regs *regs_intr, 5321 struct pt_regs *regs) 5322 { 5323 regs_intr->regs = regs; 5324 regs_intr->abi = perf_reg_abi(current); 5325 } 5326 5327 5328 /* 5329 * Get remaining task size from user stack pointer. 5330 * 5331 * It'd be better to take stack vma map and limit this more 5332 * precisly, but there's no way to get it safely under interrupt, 5333 * so using TASK_SIZE as limit. 5334 */ 5335 static u64 perf_ustack_task_size(struct pt_regs *regs) 5336 { 5337 unsigned long addr = perf_user_stack_pointer(regs); 5338 5339 if (!addr || addr >= TASK_SIZE) 5340 return 0; 5341 5342 return TASK_SIZE - addr; 5343 } 5344 5345 static u16 5346 perf_sample_ustack_size(u16 stack_size, u16 header_size, 5347 struct pt_regs *regs) 5348 { 5349 u64 task_size; 5350 5351 /* No regs, no stack pointer, no dump. */ 5352 if (!regs) 5353 return 0; 5354 5355 /* 5356 * Check if we fit in with the requested stack size into the: 5357 * - TASK_SIZE 5358 * If we don't, we limit the size to the TASK_SIZE. 5359 * 5360 * - remaining sample size 5361 * If we don't, we customize the stack size to 5362 * fit in to the remaining sample size. 5363 */ 5364 5365 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); 5366 stack_size = min(stack_size, (u16) task_size); 5367 5368 /* Current header size plus static size and dynamic size. */ 5369 header_size += 2 * sizeof(u64); 5370 5371 /* Do we fit in with the current stack dump size? */ 5372 if ((u16) (header_size + stack_size) < header_size) { 5373 /* 5374 * If we overflow the maximum size for the sample, 5375 * we customize the stack dump size to fit in. 5376 */ 5377 stack_size = USHRT_MAX - header_size - sizeof(u64); 5378 stack_size = round_up(stack_size, sizeof(u64)); 5379 } 5380 5381 return stack_size; 5382 } 5383 5384 static void 5385 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, 5386 struct pt_regs *regs) 5387 { 5388 /* Case of a kernel thread, nothing to dump */ 5389 if (!regs) { 5390 u64 size = 0; 5391 perf_output_put(handle, size); 5392 } else { 5393 unsigned long sp; 5394 unsigned int rem; 5395 u64 dyn_size; 5396 5397 /* 5398 * We dump: 5399 * static size 5400 * - the size requested by user or the best one we can fit 5401 * in to the sample max size 5402 * data 5403 * - user stack dump data 5404 * dynamic size 5405 * - the actual dumped size 5406 */ 5407 5408 /* Static size. */ 5409 perf_output_put(handle, dump_size); 5410 5411 /* Data. */ 5412 sp = perf_user_stack_pointer(regs); 5413 rem = __output_copy_user(handle, (void *) sp, dump_size); 5414 dyn_size = dump_size - rem; 5415 5416 perf_output_skip(handle, rem); 5417 5418 /* Dynamic size. */ 5419 perf_output_put(handle, dyn_size); 5420 } 5421 } 5422 5423 static void __perf_event_header__init_id(struct perf_event_header *header, 5424 struct perf_sample_data *data, 5425 struct perf_event *event) 5426 { 5427 u64 sample_type = event->attr.sample_type; 5428 5429 data->type = sample_type; 5430 header->size += event->id_header_size; 5431 5432 if (sample_type & PERF_SAMPLE_TID) { 5433 /* namespace issues */ 5434 data->tid_entry.pid = perf_event_pid(event, current); 5435 data->tid_entry.tid = perf_event_tid(event, current); 5436 } 5437 5438 if (sample_type & PERF_SAMPLE_TIME) 5439 data->time = perf_event_clock(event); 5440 5441 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) 5442 data->id = primary_event_id(event); 5443 5444 if (sample_type & PERF_SAMPLE_STREAM_ID) 5445 data->stream_id = event->id; 5446 5447 if (sample_type & PERF_SAMPLE_CPU) { 5448 data->cpu_entry.cpu = raw_smp_processor_id(); 5449 data->cpu_entry.reserved = 0; 5450 } 5451 } 5452 5453 void perf_event_header__init_id(struct perf_event_header *header, 5454 struct perf_sample_data *data, 5455 struct perf_event *event) 5456 { 5457 if (event->attr.sample_id_all) 5458 __perf_event_header__init_id(header, data, event); 5459 } 5460 5461 static void __perf_event__output_id_sample(struct perf_output_handle *handle, 5462 struct perf_sample_data *data) 5463 { 5464 u64 sample_type = data->type; 5465 5466 if (sample_type & PERF_SAMPLE_TID) 5467 perf_output_put(handle, data->tid_entry); 5468 5469 if (sample_type & PERF_SAMPLE_TIME) 5470 perf_output_put(handle, data->time); 5471 5472 if (sample_type & PERF_SAMPLE_ID) 5473 perf_output_put(handle, data->id); 5474 5475 if (sample_type & PERF_SAMPLE_STREAM_ID) 5476 perf_output_put(handle, data->stream_id); 5477 5478 if (sample_type & PERF_SAMPLE_CPU) 5479 perf_output_put(handle, data->cpu_entry); 5480 5481 if (sample_type & PERF_SAMPLE_IDENTIFIER) 5482 perf_output_put(handle, data->id); 5483 } 5484 5485 void perf_event__output_id_sample(struct perf_event *event, 5486 struct perf_output_handle *handle, 5487 struct perf_sample_data *sample) 5488 { 5489 if (event->attr.sample_id_all) 5490 __perf_event__output_id_sample(handle, sample); 5491 } 5492 5493 static void perf_output_read_one(struct perf_output_handle *handle, 5494 struct perf_event *event, 5495 u64 enabled, u64 running) 5496 { 5497 u64 read_format = event->attr.read_format; 5498 u64 values[4]; 5499 int n = 0; 5500 5501 values[n++] = perf_event_count(event); 5502 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 5503 values[n++] = enabled + 5504 atomic64_read(&event->child_total_time_enabled); 5505 } 5506 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 5507 values[n++] = running + 5508 atomic64_read(&event->child_total_time_running); 5509 } 5510 if (read_format & PERF_FORMAT_ID) 5511 values[n++] = primary_event_id(event); 5512 5513 __output_copy(handle, values, n * sizeof(u64)); 5514 } 5515 5516 /* 5517 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 5518 */ 5519 static void perf_output_read_group(struct perf_output_handle *handle, 5520 struct perf_event *event, 5521 u64 enabled, u64 running) 5522 { 5523 struct perf_event *leader = event->group_leader, *sub; 5524 u64 read_format = event->attr.read_format; 5525 u64 values[5]; 5526 int n = 0; 5527 5528 values[n++] = 1 + leader->nr_siblings; 5529 5530 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 5531 values[n++] = enabled; 5532 5533 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 5534 values[n++] = running; 5535 5536 if (leader != event) 5537 leader->pmu->read(leader); 5538 5539 values[n++] = perf_event_count(leader); 5540 if (read_format & PERF_FORMAT_ID) 5541 values[n++] = primary_event_id(leader); 5542 5543 __output_copy(handle, values, n * sizeof(u64)); 5544 5545 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 5546 n = 0; 5547 5548 if ((sub != event) && 5549 (sub->state == PERF_EVENT_STATE_ACTIVE)) 5550 sub->pmu->read(sub); 5551 5552 values[n++] = perf_event_count(sub); 5553 if (read_format & PERF_FORMAT_ID) 5554 values[n++] = primary_event_id(sub); 5555 5556 __output_copy(handle, values, n * sizeof(u64)); 5557 } 5558 } 5559 5560 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ 5561 PERF_FORMAT_TOTAL_TIME_RUNNING) 5562 5563 static void perf_output_read(struct perf_output_handle *handle, 5564 struct perf_event *event) 5565 { 5566 u64 enabled = 0, running = 0, now; 5567 u64 read_format = event->attr.read_format; 5568 5569 /* 5570 * compute total_time_enabled, total_time_running 5571 * based on snapshot values taken when the event 5572 * was last scheduled in. 5573 * 5574 * we cannot simply called update_context_time() 5575 * because of locking issue as we are called in 5576 * NMI context 5577 */ 5578 if (read_format & PERF_FORMAT_TOTAL_TIMES) 5579 calc_timer_values(event, &now, &enabled, &running); 5580 5581 if (event->attr.read_format & PERF_FORMAT_GROUP) 5582 perf_output_read_group(handle, event, enabled, running); 5583 else 5584 perf_output_read_one(handle, event, enabled, running); 5585 } 5586 5587 void perf_output_sample(struct perf_output_handle *handle, 5588 struct perf_event_header *header, 5589 struct perf_sample_data *data, 5590 struct perf_event *event) 5591 { 5592 u64 sample_type = data->type; 5593 5594 perf_output_put(handle, *header); 5595 5596 if (sample_type & PERF_SAMPLE_IDENTIFIER) 5597 perf_output_put(handle, data->id); 5598 5599 if (sample_type & PERF_SAMPLE_IP) 5600 perf_output_put(handle, data->ip); 5601 5602 if (sample_type & PERF_SAMPLE_TID) 5603 perf_output_put(handle, data->tid_entry); 5604 5605 if (sample_type & PERF_SAMPLE_TIME) 5606 perf_output_put(handle, data->time); 5607 5608 if (sample_type & PERF_SAMPLE_ADDR) 5609 perf_output_put(handle, data->addr); 5610 5611 if (sample_type & PERF_SAMPLE_ID) 5612 perf_output_put(handle, data->id); 5613 5614 if (sample_type & PERF_SAMPLE_STREAM_ID) 5615 perf_output_put(handle, data->stream_id); 5616 5617 if (sample_type & PERF_SAMPLE_CPU) 5618 perf_output_put(handle, data->cpu_entry); 5619 5620 if (sample_type & PERF_SAMPLE_PERIOD) 5621 perf_output_put(handle, data->period); 5622 5623 if (sample_type & PERF_SAMPLE_READ) 5624 perf_output_read(handle, event); 5625 5626 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 5627 if (data->callchain) { 5628 int size = 1; 5629 5630 if (data->callchain) 5631 size += data->callchain->nr; 5632 5633 size *= sizeof(u64); 5634 5635 __output_copy(handle, data->callchain, size); 5636 } else { 5637 u64 nr = 0; 5638 perf_output_put(handle, nr); 5639 } 5640 } 5641 5642 if (sample_type & PERF_SAMPLE_RAW) { 5643 struct perf_raw_record *raw = data->raw; 5644 5645 if (raw) { 5646 struct perf_raw_frag *frag = &raw->frag; 5647 5648 perf_output_put(handle, raw->size); 5649 do { 5650 if (frag->copy) { 5651 __output_custom(handle, frag->copy, 5652 frag->data, frag->size); 5653 } else { 5654 __output_copy(handle, frag->data, 5655 frag->size); 5656 } 5657 if (perf_raw_frag_last(frag)) 5658 break; 5659 frag = frag->next; 5660 } while (1); 5661 if (frag->pad) 5662 __output_skip(handle, NULL, frag->pad); 5663 } else { 5664 struct { 5665 u32 size; 5666 u32 data; 5667 } raw = { 5668 .size = sizeof(u32), 5669 .data = 0, 5670 }; 5671 perf_output_put(handle, raw); 5672 } 5673 } 5674 5675 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 5676 if (data->br_stack) { 5677 size_t size; 5678 5679 size = data->br_stack->nr 5680 * sizeof(struct perf_branch_entry); 5681 5682 perf_output_put(handle, data->br_stack->nr); 5683 perf_output_copy(handle, data->br_stack->entries, size); 5684 } else { 5685 /* 5686 * we always store at least the value of nr 5687 */ 5688 u64 nr = 0; 5689 perf_output_put(handle, nr); 5690 } 5691 } 5692 5693 if (sample_type & PERF_SAMPLE_REGS_USER) { 5694 u64 abi = data->regs_user.abi; 5695 5696 /* 5697 * If there are no regs to dump, notice it through 5698 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 5699 */ 5700 perf_output_put(handle, abi); 5701 5702 if (abi) { 5703 u64 mask = event->attr.sample_regs_user; 5704 perf_output_sample_regs(handle, 5705 data->regs_user.regs, 5706 mask); 5707 } 5708 } 5709 5710 if (sample_type & PERF_SAMPLE_STACK_USER) { 5711 perf_output_sample_ustack(handle, 5712 data->stack_user_size, 5713 data->regs_user.regs); 5714 } 5715 5716 if (sample_type & PERF_SAMPLE_WEIGHT) 5717 perf_output_put(handle, data->weight); 5718 5719 if (sample_type & PERF_SAMPLE_DATA_SRC) 5720 perf_output_put(handle, data->data_src.val); 5721 5722 if (sample_type & PERF_SAMPLE_TRANSACTION) 5723 perf_output_put(handle, data->txn); 5724 5725 if (sample_type & PERF_SAMPLE_REGS_INTR) { 5726 u64 abi = data->regs_intr.abi; 5727 /* 5728 * If there are no regs to dump, notice it through 5729 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 5730 */ 5731 perf_output_put(handle, abi); 5732 5733 if (abi) { 5734 u64 mask = event->attr.sample_regs_intr; 5735 5736 perf_output_sample_regs(handle, 5737 data->regs_intr.regs, 5738 mask); 5739 } 5740 } 5741 5742 if (!event->attr.watermark) { 5743 int wakeup_events = event->attr.wakeup_events; 5744 5745 if (wakeup_events) { 5746 struct ring_buffer *rb = handle->rb; 5747 int events = local_inc_return(&rb->events); 5748 5749 if (events >= wakeup_events) { 5750 local_sub(wakeup_events, &rb->events); 5751 local_inc(&rb->wakeup); 5752 } 5753 } 5754 } 5755 } 5756 5757 void perf_prepare_sample(struct perf_event_header *header, 5758 struct perf_sample_data *data, 5759 struct perf_event *event, 5760 struct pt_regs *regs) 5761 { 5762 u64 sample_type = event->attr.sample_type; 5763 5764 header->type = PERF_RECORD_SAMPLE; 5765 header->size = sizeof(*header) + event->header_size; 5766 5767 header->misc = 0; 5768 header->misc |= perf_misc_flags(regs); 5769 5770 __perf_event_header__init_id(header, data, event); 5771 5772 if (sample_type & PERF_SAMPLE_IP) 5773 data->ip = perf_instruction_pointer(regs); 5774 5775 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 5776 int size = 1; 5777 5778 data->callchain = perf_callchain(event, regs); 5779 5780 if (data->callchain) 5781 size += data->callchain->nr; 5782 5783 header->size += size * sizeof(u64); 5784 } 5785 5786 if (sample_type & PERF_SAMPLE_RAW) { 5787 struct perf_raw_record *raw = data->raw; 5788 int size; 5789 5790 if (raw) { 5791 struct perf_raw_frag *frag = &raw->frag; 5792 u32 sum = 0; 5793 5794 do { 5795 sum += frag->size; 5796 if (perf_raw_frag_last(frag)) 5797 break; 5798 frag = frag->next; 5799 } while (1); 5800 5801 size = round_up(sum + sizeof(u32), sizeof(u64)); 5802 raw->size = size - sizeof(u32); 5803 frag->pad = raw->size - sum; 5804 } else { 5805 size = sizeof(u64); 5806 } 5807 5808 header->size += size; 5809 } 5810 5811 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 5812 int size = sizeof(u64); /* nr */ 5813 if (data->br_stack) { 5814 size += data->br_stack->nr 5815 * sizeof(struct perf_branch_entry); 5816 } 5817 header->size += size; 5818 } 5819 5820 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) 5821 perf_sample_regs_user(&data->regs_user, regs, 5822 &data->regs_user_copy); 5823 5824 if (sample_type & PERF_SAMPLE_REGS_USER) { 5825 /* regs dump ABI info */ 5826 int size = sizeof(u64); 5827 5828 if (data->regs_user.regs) { 5829 u64 mask = event->attr.sample_regs_user; 5830 size += hweight64(mask) * sizeof(u64); 5831 } 5832 5833 header->size += size; 5834 } 5835 5836 if (sample_type & PERF_SAMPLE_STACK_USER) { 5837 /* 5838 * Either we need PERF_SAMPLE_STACK_USER bit to be allways 5839 * processed as the last one or have additional check added 5840 * in case new sample type is added, because we could eat 5841 * up the rest of the sample size. 5842 */ 5843 u16 stack_size = event->attr.sample_stack_user; 5844 u16 size = sizeof(u64); 5845 5846 stack_size = perf_sample_ustack_size(stack_size, header->size, 5847 data->regs_user.regs); 5848 5849 /* 5850 * If there is something to dump, add space for the dump 5851 * itself and for the field that tells the dynamic size, 5852 * which is how many have been actually dumped. 5853 */ 5854 if (stack_size) 5855 size += sizeof(u64) + stack_size; 5856 5857 data->stack_user_size = stack_size; 5858 header->size += size; 5859 } 5860 5861 if (sample_type & PERF_SAMPLE_REGS_INTR) { 5862 /* regs dump ABI info */ 5863 int size = sizeof(u64); 5864 5865 perf_sample_regs_intr(&data->regs_intr, regs); 5866 5867 if (data->regs_intr.regs) { 5868 u64 mask = event->attr.sample_regs_intr; 5869 5870 size += hweight64(mask) * sizeof(u64); 5871 } 5872 5873 header->size += size; 5874 } 5875 } 5876 5877 static void __always_inline 5878 __perf_event_output(struct perf_event *event, 5879 struct perf_sample_data *data, 5880 struct pt_regs *regs, 5881 int (*output_begin)(struct perf_output_handle *, 5882 struct perf_event *, 5883 unsigned int)) 5884 { 5885 struct perf_output_handle handle; 5886 struct perf_event_header header; 5887 5888 /* protect the callchain buffers */ 5889 rcu_read_lock(); 5890 5891 perf_prepare_sample(&header, data, event, regs); 5892 5893 if (output_begin(&handle, event, header.size)) 5894 goto exit; 5895 5896 perf_output_sample(&handle, &header, data, event); 5897 5898 perf_output_end(&handle); 5899 5900 exit: 5901 rcu_read_unlock(); 5902 } 5903 5904 void 5905 perf_event_output_forward(struct perf_event *event, 5906 struct perf_sample_data *data, 5907 struct pt_regs *regs) 5908 { 5909 __perf_event_output(event, data, regs, perf_output_begin_forward); 5910 } 5911 5912 void 5913 perf_event_output_backward(struct perf_event *event, 5914 struct perf_sample_data *data, 5915 struct pt_regs *regs) 5916 { 5917 __perf_event_output(event, data, regs, perf_output_begin_backward); 5918 } 5919 5920 void 5921 perf_event_output(struct perf_event *event, 5922 struct perf_sample_data *data, 5923 struct pt_regs *regs) 5924 { 5925 __perf_event_output(event, data, regs, perf_output_begin); 5926 } 5927 5928 /* 5929 * read event_id 5930 */ 5931 5932 struct perf_read_event { 5933 struct perf_event_header header; 5934 5935 u32 pid; 5936 u32 tid; 5937 }; 5938 5939 static void 5940 perf_event_read_event(struct perf_event *event, 5941 struct task_struct *task) 5942 { 5943 struct perf_output_handle handle; 5944 struct perf_sample_data sample; 5945 struct perf_read_event read_event = { 5946 .header = { 5947 .type = PERF_RECORD_READ, 5948 .misc = 0, 5949 .size = sizeof(read_event) + event->read_size, 5950 }, 5951 .pid = perf_event_pid(event, task), 5952 .tid = perf_event_tid(event, task), 5953 }; 5954 int ret; 5955 5956 perf_event_header__init_id(&read_event.header, &sample, event); 5957 ret = perf_output_begin(&handle, event, read_event.header.size); 5958 if (ret) 5959 return; 5960 5961 perf_output_put(&handle, read_event); 5962 perf_output_read(&handle, event); 5963 perf_event__output_id_sample(event, &handle, &sample); 5964 5965 perf_output_end(&handle); 5966 } 5967 5968 typedef void (perf_iterate_f)(struct perf_event *event, void *data); 5969 5970 static void 5971 perf_iterate_ctx(struct perf_event_context *ctx, 5972 perf_iterate_f output, 5973 void *data, bool all) 5974 { 5975 struct perf_event *event; 5976 5977 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 5978 if (!all) { 5979 if (event->state < PERF_EVENT_STATE_INACTIVE) 5980 continue; 5981 if (!event_filter_match(event)) 5982 continue; 5983 } 5984 5985 output(event, data); 5986 } 5987 } 5988 5989 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) 5990 { 5991 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); 5992 struct perf_event *event; 5993 5994 list_for_each_entry_rcu(event, &pel->list, sb_list) { 5995 /* 5996 * Skip events that are not fully formed yet; ensure that 5997 * if we observe event->ctx, both event and ctx will be 5998 * complete enough. See perf_install_in_context(). 5999 */ 6000 if (!smp_load_acquire(&event->ctx)) 6001 continue; 6002 6003 if (event->state < PERF_EVENT_STATE_INACTIVE) 6004 continue; 6005 if (!event_filter_match(event)) 6006 continue; 6007 output(event, data); 6008 } 6009 } 6010 6011 /* 6012 * Iterate all events that need to receive side-band events. 6013 * 6014 * For new callers; ensure that account_pmu_sb_event() includes 6015 * your event, otherwise it might not get delivered. 6016 */ 6017 static void 6018 perf_iterate_sb(perf_iterate_f output, void *data, 6019 struct perf_event_context *task_ctx) 6020 { 6021 struct perf_event_context *ctx; 6022 int ctxn; 6023 6024 rcu_read_lock(); 6025 preempt_disable(); 6026 6027 /* 6028 * If we have task_ctx != NULL we only notify the task context itself. 6029 * The task_ctx is set only for EXIT events before releasing task 6030 * context. 6031 */ 6032 if (task_ctx) { 6033 perf_iterate_ctx(task_ctx, output, data, false); 6034 goto done; 6035 } 6036 6037 perf_iterate_sb_cpu(output, data); 6038 6039 for_each_task_context_nr(ctxn) { 6040 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 6041 if (ctx) 6042 perf_iterate_ctx(ctx, output, data, false); 6043 } 6044 done: 6045 preempt_enable(); 6046 rcu_read_unlock(); 6047 } 6048 6049 /* 6050 * Clear all file-based filters at exec, they'll have to be 6051 * re-instated when/if these objects are mmapped again. 6052 */ 6053 static void perf_event_addr_filters_exec(struct perf_event *event, void *data) 6054 { 6055 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 6056 struct perf_addr_filter *filter; 6057 unsigned int restart = 0, count = 0; 6058 unsigned long flags; 6059 6060 if (!has_addr_filter(event)) 6061 return; 6062 6063 raw_spin_lock_irqsave(&ifh->lock, flags); 6064 list_for_each_entry(filter, &ifh->list, entry) { 6065 if (filter->inode) { 6066 event->addr_filters_offs[count] = 0; 6067 restart++; 6068 } 6069 6070 count++; 6071 } 6072 6073 if (restart) 6074 event->addr_filters_gen++; 6075 raw_spin_unlock_irqrestore(&ifh->lock, flags); 6076 6077 if (restart) 6078 perf_event_restart(event); 6079 } 6080 6081 void perf_event_exec(void) 6082 { 6083 struct perf_event_context *ctx; 6084 int ctxn; 6085 6086 rcu_read_lock(); 6087 for_each_task_context_nr(ctxn) { 6088 ctx = current->perf_event_ctxp[ctxn]; 6089 if (!ctx) 6090 continue; 6091 6092 perf_event_enable_on_exec(ctxn); 6093 6094 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, 6095 true); 6096 } 6097 rcu_read_unlock(); 6098 } 6099 6100 struct remote_output { 6101 struct ring_buffer *rb; 6102 int err; 6103 }; 6104 6105 static void __perf_event_output_stop(struct perf_event *event, void *data) 6106 { 6107 struct perf_event *parent = event->parent; 6108 struct remote_output *ro = data; 6109 struct ring_buffer *rb = ro->rb; 6110 struct stop_event_data sd = { 6111 .event = event, 6112 }; 6113 6114 if (!has_aux(event)) 6115 return; 6116 6117 if (!parent) 6118 parent = event; 6119 6120 /* 6121 * In case of inheritance, it will be the parent that links to the 6122 * ring-buffer, but it will be the child that's actually using it: 6123 */ 6124 if (rcu_dereference(parent->rb) == rb) 6125 ro->err = __perf_event_stop(&sd); 6126 } 6127 6128 static int __perf_pmu_output_stop(void *info) 6129 { 6130 struct perf_event *event = info; 6131 struct pmu *pmu = event->pmu; 6132 struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 6133 struct remote_output ro = { 6134 .rb = event->rb, 6135 }; 6136 6137 rcu_read_lock(); 6138 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); 6139 if (cpuctx->task_ctx) 6140 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, 6141 &ro, false); 6142 rcu_read_unlock(); 6143 6144 return ro.err; 6145 } 6146 6147 static void perf_pmu_output_stop(struct perf_event *event) 6148 { 6149 struct perf_event *iter; 6150 int err, cpu; 6151 6152 restart: 6153 rcu_read_lock(); 6154 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { 6155 /* 6156 * For per-CPU events, we need to make sure that neither they 6157 * nor their children are running; for cpu==-1 events it's 6158 * sufficient to stop the event itself if it's active, since 6159 * it can't have children. 6160 */ 6161 cpu = iter->cpu; 6162 if (cpu == -1) 6163 cpu = READ_ONCE(iter->oncpu); 6164 6165 if (cpu == -1) 6166 continue; 6167 6168 err = cpu_function_call(cpu, __perf_pmu_output_stop, event); 6169 if (err == -EAGAIN) { 6170 rcu_read_unlock(); 6171 goto restart; 6172 } 6173 } 6174 rcu_read_unlock(); 6175 } 6176 6177 /* 6178 * task tracking -- fork/exit 6179 * 6180 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task 6181 */ 6182 6183 struct perf_task_event { 6184 struct task_struct *task; 6185 struct perf_event_context *task_ctx; 6186 6187 struct { 6188 struct perf_event_header header; 6189 6190 u32 pid; 6191 u32 ppid; 6192 u32 tid; 6193 u32 ptid; 6194 u64 time; 6195 } event_id; 6196 }; 6197 6198 static int perf_event_task_match(struct perf_event *event) 6199 { 6200 return event->attr.comm || event->attr.mmap || 6201 event->attr.mmap2 || event->attr.mmap_data || 6202 event->attr.task; 6203 } 6204 6205 static void perf_event_task_output(struct perf_event *event, 6206 void *data) 6207 { 6208 struct perf_task_event *task_event = data; 6209 struct perf_output_handle handle; 6210 struct perf_sample_data sample; 6211 struct task_struct *task = task_event->task; 6212 int ret, size = task_event->event_id.header.size; 6213 6214 if (!perf_event_task_match(event)) 6215 return; 6216 6217 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 6218 6219 ret = perf_output_begin(&handle, event, 6220 task_event->event_id.header.size); 6221 if (ret) 6222 goto out; 6223 6224 task_event->event_id.pid = perf_event_pid(event, task); 6225 task_event->event_id.ppid = perf_event_pid(event, current); 6226 6227 task_event->event_id.tid = perf_event_tid(event, task); 6228 task_event->event_id.ptid = perf_event_tid(event, current); 6229 6230 task_event->event_id.time = perf_event_clock(event); 6231 6232 perf_output_put(&handle, task_event->event_id); 6233 6234 perf_event__output_id_sample(event, &handle, &sample); 6235 6236 perf_output_end(&handle); 6237 out: 6238 task_event->event_id.header.size = size; 6239 } 6240 6241 static void perf_event_task(struct task_struct *task, 6242 struct perf_event_context *task_ctx, 6243 int new) 6244 { 6245 struct perf_task_event task_event; 6246 6247 if (!atomic_read(&nr_comm_events) && 6248 !atomic_read(&nr_mmap_events) && 6249 !atomic_read(&nr_task_events)) 6250 return; 6251 6252 task_event = (struct perf_task_event){ 6253 .task = task, 6254 .task_ctx = task_ctx, 6255 .event_id = { 6256 .header = { 6257 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, 6258 .misc = 0, 6259 .size = sizeof(task_event.event_id), 6260 }, 6261 /* .pid */ 6262 /* .ppid */ 6263 /* .tid */ 6264 /* .ptid */ 6265 /* .time */ 6266 }, 6267 }; 6268 6269 perf_iterate_sb(perf_event_task_output, 6270 &task_event, 6271 task_ctx); 6272 } 6273 6274 void perf_event_fork(struct task_struct *task) 6275 { 6276 perf_event_task(task, NULL, 1); 6277 } 6278 6279 /* 6280 * comm tracking 6281 */ 6282 6283 struct perf_comm_event { 6284 struct task_struct *task; 6285 char *comm; 6286 int comm_size; 6287 6288 struct { 6289 struct perf_event_header header; 6290 6291 u32 pid; 6292 u32 tid; 6293 } event_id; 6294 }; 6295 6296 static int perf_event_comm_match(struct perf_event *event) 6297 { 6298 return event->attr.comm; 6299 } 6300 6301 static void perf_event_comm_output(struct perf_event *event, 6302 void *data) 6303 { 6304 struct perf_comm_event *comm_event = data; 6305 struct perf_output_handle handle; 6306 struct perf_sample_data sample; 6307 int size = comm_event->event_id.header.size; 6308 int ret; 6309 6310 if (!perf_event_comm_match(event)) 6311 return; 6312 6313 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 6314 ret = perf_output_begin(&handle, event, 6315 comm_event->event_id.header.size); 6316 6317 if (ret) 6318 goto out; 6319 6320 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 6321 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 6322 6323 perf_output_put(&handle, comm_event->event_id); 6324 __output_copy(&handle, comm_event->comm, 6325 comm_event->comm_size); 6326 6327 perf_event__output_id_sample(event, &handle, &sample); 6328 6329 perf_output_end(&handle); 6330 out: 6331 comm_event->event_id.header.size = size; 6332 } 6333 6334 static void perf_event_comm_event(struct perf_comm_event *comm_event) 6335 { 6336 char comm[TASK_COMM_LEN]; 6337 unsigned int size; 6338 6339 memset(comm, 0, sizeof(comm)); 6340 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 6341 size = ALIGN(strlen(comm)+1, sizeof(u64)); 6342 6343 comm_event->comm = comm; 6344 comm_event->comm_size = size; 6345 6346 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 6347 6348 perf_iterate_sb(perf_event_comm_output, 6349 comm_event, 6350 NULL); 6351 } 6352 6353 void perf_event_comm(struct task_struct *task, bool exec) 6354 { 6355 struct perf_comm_event comm_event; 6356 6357 if (!atomic_read(&nr_comm_events)) 6358 return; 6359 6360 comm_event = (struct perf_comm_event){ 6361 .task = task, 6362 /* .comm */ 6363 /* .comm_size */ 6364 .event_id = { 6365 .header = { 6366 .type = PERF_RECORD_COMM, 6367 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, 6368 /* .size */ 6369 }, 6370 /* .pid */ 6371 /* .tid */ 6372 }, 6373 }; 6374 6375 perf_event_comm_event(&comm_event); 6376 } 6377 6378 /* 6379 * mmap tracking 6380 */ 6381 6382 struct perf_mmap_event { 6383 struct vm_area_struct *vma; 6384 6385 const char *file_name; 6386 int file_size; 6387 int maj, min; 6388 u64 ino; 6389 u64 ino_generation; 6390 u32 prot, flags; 6391 6392 struct { 6393 struct perf_event_header header; 6394 6395 u32 pid; 6396 u32 tid; 6397 u64 start; 6398 u64 len; 6399 u64 pgoff; 6400 } event_id; 6401 }; 6402 6403 static int perf_event_mmap_match(struct perf_event *event, 6404 void *data) 6405 { 6406 struct perf_mmap_event *mmap_event = data; 6407 struct vm_area_struct *vma = mmap_event->vma; 6408 int executable = vma->vm_flags & VM_EXEC; 6409 6410 return (!executable && event->attr.mmap_data) || 6411 (executable && (event->attr.mmap || event->attr.mmap2)); 6412 } 6413 6414 static void perf_event_mmap_output(struct perf_event *event, 6415 void *data) 6416 { 6417 struct perf_mmap_event *mmap_event = data; 6418 struct perf_output_handle handle; 6419 struct perf_sample_data sample; 6420 int size = mmap_event->event_id.header.size; 6421 int ret; 6422 6423 if (!perf_event_mmap_match(event, data)) 6424 return; 6425 6426 if (event->attr.mmap2) { 6427 mmap_event->event_id.header.type = PERF_RECORD_MMAP2; 6428 mmap_event->event_id.header.size += sizeof(mmap_event->maj); 6429 mmap_event->event_id.header.size += sizeof(mmap_event->min); 6430 mmap_event->event_id.header.size += sizeof(mmap_event->ino); 6431 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); 6432 mmap_event->event_id.header.size += sizeof(mmap_event->prot); 6433 mmap_event->event_id.header.size += sizeof(mmap_event->flags); 6434 } 6435 6436 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 6437 ret = perf_output_begin(&handle, event, 6438 mmap_event->event_id.header.size); 6439 if (ret) 6440 goto out; 6441 6442 mmap_event->event_id.pid = perf_event_pid(event, current); 6443 mmap_event->event_id.tid = perf_event_tid(event, current); 6444 6445 perf_output_put(&handle, mmap_event->event_id); 6446 6447 if (event->attr.mmap2) { 6448 perf_output_put(&handle, mmap_event->maj); 6449 perf_output_put(&handle, mmap_event->min); 6450 perf_output_put(&handle, mmap_event->ino); 6451 perf_output_put(&handle, mmap_event->ino_generation); 6452 perf_output_put(&handle, mmap_event->prot); 6453 perf_output_put(&handle, mmap_event->flags); 6454 } 6455 6456 __output_copy(&handle, mmap_event->file_name, 6457 mmap_event->file_size); 6458 6459 perf_event__output_id_sample(event, &handle, &sample); 6460 6461 perf_output_end(&handle); 6462 out: 6463 mmap_event->event_id.header.size = size; 6464 } 6465 6466 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 6467 { 6468 struct vm_area_struct *vma = mmap_event->vma; 6469 struct file *file = vma->vm_file; 6470 int maj = 0, min = 0; 6471 u64 ino = 0, gen = 0; 6472 u32 prot = 0, flags = 0; 6473 unsigned int size; 6474 char tmp[16]; 6475 char *buf = NULL; 6476 char *name; 6477 6478 if (file) { 6479 struct inode *inode; 6480 dev_t dev; 6481 6482 buf = kmalloc(PATH_MAX, GFP_KERNEL); 6483 if (!buf) { 6484 name = "//enomem"; 6485 goto cpy_name; 6486 } 6487 /* 6488 * d_path() works from the end of the rb backwards, so we 6489 * need to add enough zero bytes after the string to handle 6490 * the 64bit alignment we do later. 6491 */ 6492 name = file_path(file, buf, PATH_MAX - sizeof(u64)); 6493 if (IS_ERR(name)) { 6494 name = "//toolong"; 6495 goto cpy_name; 6496 } 6497 inode = file_inode(vma->vm_file); 6498 dev = inode->i_sb->s_dev; 6499 ino = inode->i_ino; 6500 gen = inode->i_generation; 6501 maj = MAJOR(dev); 6502 min = MINOR(dev); 6503 6504 if (vma->vm_flags & VM_READ) 6505 prot |= PROT_READ; 6506 if (vma->vm_flags & VM_WRITE) 6507 prot |= PROT_WRITE; 6508 if (vma->vm_flags & VM_EXEC) 6509 prot |= PROT_EXEC; 6510 6511 if (vma->vm_flags & VM_MAYSHARE) 6512 flags = MAP_SHARED; 6513 else 6514 flags = MAP_PRIVATE; 6515 6516 if (vma->vm_flags & VM_DENYWRITE) 6517 flags |= MAP_DENYWRITE; 6518 if (vma->vm_flags & VM_MAYEXEC) 6519 flags |= MAP_EXECUTABLE; 6520 if (vma->vm_flags & VM_LOCKED) 6521 flags |= MAP_LOCKED; 6522 if (vma->vm_flags & VM_HUGETLB) 6523 flags |= MAP_HUGETLB; 6524 6525 goto got_name; 6526 } else { 6527 if (vma->vm_ops && vma->vm_ops->name) { 6528 name = (char *) vma->vm_ops->name(vma); 6529 if (name) 6530 goto cpy_name; 6531 } 6532 6533 name = (char *)arch_vma_name(vma); 6534 if (name) 6535 goto cpy_name; 6536 6537 if (vma->vm_start <= vma->vm_mm->start_brk && 6538 vma->vm_end >= vma->vm_mm->brk) { 6539 name = "[heap]"; 6540 goto cpy_name; 6541 } 6542 if (vma->vm_start <= vma->vm_mm->start_stack && 6543 vma->vm_end >= vma->vm_mm->start_stack) { 6544 name = "[stack]"; 6545 goto cpy_name; 6546 } 6547 6548 name = "//anon"; 6549 goto cpy_name; 6550 } 6551 6552 cpy_name: 6553 strlcpy(tmp, name, sizeof(tmp)); 6554 name = tmp; 6555 got_name: 6556 /* 6557 * Since our buffer works in 8 byte units we need to align our string 6558 * size to a multiple of 8. However, we must guarantee the tail end is 6559 * zero'd out to avoid leaking random bits to userspace. 6560 */ 6561 size = strlen(name)+1; 6562 while (!IS_ALIGNED(size, sizeof(u64))) 6563 name[size++] = '\0'; 6564 6565 mmap_event->file_name = name; 6566 mmap_event->file_size = size; 6567 mmap_event->maj = maj; 6568 mmap_event->min = min; 6569 mmap_event->ino = ino; 6570 mmap_event->ino_generation = gen; 6571 mmap_event->prot = prot; 6572 mmap_event->flags = flags; 6573 6574 if (!(vma->vm_flags & VM_EXEC)) 6575 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 6576 6577 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 6578 6579 perf_iterate_sb(perf_event_mmap_output, 6580 mmap_event, 6581 NULL); 6582 6583 kfree(buf); 6584 } 6585 6586 /* 6587 * Whether this @filter depends on a dynamic object which is not loaded 6588 * yet or its load addresses are not known. 6589 */ 6590 static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) 6591 { 6592 return filter->filter && filter->inode; 6593 } 6594 6595 /* 6596 * Check whether inode and address range match filter criteria. 6597 */ 6598 static bool perf_addr_filter_match(struct perf_addr_filter *filter, 6599 struct file *file, unsigned long offset, 6600 unsigned long size) 6601 { 6602 if (filter->inode != file->f_inode) 6603 return false; 6604 6605 if (filter->offset > offset + size) 6606 return false; 6607 6608 if (filter->offset + filter->size < offset) 6609 return false; 6610 6611 return true; 6612 } 6613 6614 static void __perf_addr_filters_adjust(struct perf_event *event, void *data) 6615 { 6616 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 6617 struct vm_area_struct *vma = data; 6618 unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; 6619 struct file *file = vma->vm_file; 6620 struct perf_addr_filter *filter; 6621 unsigned int restart = 0, count = 0; 6622 6623 if (!has_addr_filter(event)) 6624 return; 6625 6626 if (!file) 6627 return; 6628 6629 raw_spin_lock_irqsave(&ifh->lock, flags); 6630 list_for_each_entry(filter, &ifh->list, entry) { 6631 if (perf_addr_filter_match(filter, file, off, 6632 vma->vm_end - vma->vm_start)) { 6633 event->addr_filters_offs[count] = vma->vm_start; 6634 restart++; 6635 } 6636 6637 count++; 6638 } 6639 6640 if (restart) 6641 event->addr_filters_gen++; 6642 raw_spin_unlock_irqrestore(&ifh->lock, flags); 6643 6644 if (restart) 6645 perf_event_restart(event); 6646 } 6647 6648 /* 6649 * Adjust all task's events' filters to the new vma 6650 */ 6651 static void perf_addr_filters_adjust(struct vm_area_struct *vma) 6652 { 6653 struct perf_event_context *ctx; 6654 int ctxn; 6655 6656 rcu_read_lock(); 6657 for_each_task_context_nr(ctxn) { 6658 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 6659 if (!ctx) 6660 continue; 6661 6662 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); 6663 } 6664 rcu_read_unlock(); 6665 } 6666 6667 void perf_event_mmap(struct vm_area_struct *vma) 6668 { 6669 struct perf_mmap_event mmap_event; 6670 6671 if (!atomic_read(&nr_mmap_events)) 6672 return; 6673 6674 mmap_event = (struct perf_mmap_event){ 6675 .vma = vma, 6676 /* .file_name */ 6677 /* .file_size */ 6678 .event_id = { 6679 .header = { 6680 .type = PERF_RECORD_MMAP, 6681 .misc = PERF_RECORD_MISC_USER, 6682 /* .size */ 6683 }, 6684 /* .pid */ 6685 /* .tid */ 6686 .start = vma->vm_start, 6687 .len = vma->vm_end - vma->vm_start, 6688 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 6689 }, 6690 /* .maj (attr_mmap2 only) */ 6691 /* .min (attr_mmap2 only) */ 6692 /* .ino (attr_mmap2 only) */ 6693 /* .ino_generation (attr_mmap2 only) */ 6694 /* .prot (attr_mmap2 only) */ 6695 /* .flags (attr_mmap2 only) */ 6696 }; 6697 6698 perf_addr_filters_adjust(vma); 6699 perf_event_mmap_event(&mmap_event); 6700 } 6701 6702 void perf_event_aux_event(struct perf_event *event, unsigned long head, 6703 unsigned long size, u64 flags) 6704 { 6705 struct perf_output_handle handle; 6706 struct perf_sample_data sample; 6707 struct perf_aux_event { 6708 struct perf_event_header header; 6709 u64 offset; 6710 u64 size; 6711 u64 flags; 6712 } rec = { 6713 .header = { 6714 .type = PERF_RECORD_AUX, 6715 .misc = 0, 6716 .size = sizeof(rec), 6717 }, 6718 .offset = head, 6719 .size = size, 6720 .flags = flags, 6721 }; 6722 int ret; 6723 6724 perf_event_header__init_id(&rec.header, &sample, event); 6725 ret = perf_output_begin(&handle, event, rec.header.size); 6726 6727 if (ret) 6728 return; 6729 6730 perf_output_put(&handle, rec); 6731 perf_event__output_id_sample(event, &handle, &sample); 6732 6733 perf_output_end(&handle); 6734 } 6735 6736 /* 6737 * Lost/dropped samples logging 6738 */ 6739 void perf_log_lost_samples(struct perf_event *event, u64 lost) 6740 { 6741 struct perf_output_handle handle; 6742 struct perf_sample_data sample; 6743 int ret; 6744 6745 struct { 6746 struct perf_event_header header; 6747 u64 lost; 6748 } lost_samples_event = { 6749 .header = { 6750 .type = PERF_RECORD_LOST_SAMPLES, 6751 .misc = 0, 6752 .size = sizeof(lost_samples_event), 6753 }, 6754 .lost = lost, 6755 }; 6756 6757 perf_event_header__init_id(&lost_samples_event.header, &sample, event); 6758 6759 ret = perf_output_begin(&handle, event, 6760 lost_samples_event.header.size); 6761 if (ret) 6762 return; 6763 6764 perf_output_put(&handle, lost_samples_event); 6765 perf_event__output_id_sample(event, &handle, &sample); 6766 perf_output_end(&handle); 6767 } 6768 6769 /* 6770 * context_switch tracking 6771 */ 6772 6773 struct perf_switch_event { 6774 struct task_struct *task; 6775 struct task_struct *next_prev; 6776 6777 struct { 6778 struct perf_event_header header; 6779 u32 next_prev_pid; 6780 u32 next_prev_tid; 6781 } event_id; 6782 }; 6783 6784 static int perf_event_switch_match(struct perf_event *event) 6785 { 6786 return event->attr.context_switch; 6787 } 6788 6789 static void perf_event_switch_output(struct perf_event *event, void *data) 6790 { 6791 struct perf_switch_event *se = data; 6792 struct perf_output_handle handle; 6793 struct perf_sample_data sample; 6794 int ret; 6795 6796 if (!perf_event_switch_match(event)) 6797 return; 6798 6799 /* Only CPU-wide events are allowed to see next/prev pid/tid */ 6800 if (event->ctx->task) { 6801 se->event_id.header.type = PERF_RECORD_SWITCH; 6802 se->event_id.header.size = sizeof(se->event_id.header); 6803 } else { 6804 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; 6805 se->event_id.header.size = sizeof(se->event_id); 6806 se->event_id.next_prev_pid = 6807 perf_event_pid(event, se->next_prev); 6808 se->event_id.next_prev_tid = 6809 perf_event_tid(event, se->next_prev); 6810 } 6811 6812 perf_event_header__init_id(&se->event_id.header, &sample, event); 6813 6814 ret = perf_output_begin(&handle, event, se->event_id.header.size); 6815 if (ret) 6816 return; 6817 6818 if (event->ctx->task) 6819 perf_output_put(&handle, se->event_id.header); 6820 else 6821 perf_output_put(&handle, se->event_id); 6822 6823 perf_event__output_id_sample(event, &handle, &sample); 6824 6825 perf_output_end(&handle); 6826 } 6827 6828 static void perf_event_switch(struct task_struct *task, 6829 struct task_struct *next_prev, bool sched_in) 6830 { 6831 struct perf_switch_event switch_event; 6832 6833 /* N.B. caller checks nr_switch_events != 0 */ 6834 6835 switch_event = (struct perf_switch_event){ 6836 .task = task, 6837 .next_prev = next_prev, 6838 .event_id = { 6839 .header = { 6840 /* .type */ 6841 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, 6842 /* .size */ 6843 }, 6844 /* .next_prev_pid */ 6845 /* .next_prev_tid */ 6846 }, 6847 }; 6848 6849 perf_iterate_sb(perf_event_switch_output, 6850 &switch_event, 6851 NULL); 6852 } 6853 6854 /* 6855 * IRQ throttle logging 6856 */ 6857 6858 static void perf_log_throttle(struct perf_event *event, int enable) 6859 { 6860 struct perf_output_handle handle; 6861 struct perf_sample_data sample; 6862 int ret; 6863 6864 struct { 6865 struct perf_event_header header; 6866 u64 time; 6867 u64 id; 6868 u64 stream_id; 6869 } throttle_event = { 6870 .header = { 6871 .type = PERF_RECORD_THROTTLE, 6872 .misc = 0, 6873 .size = sizeof(throttle_event), 6874 }, 6875 .time = perf_event_clock(event), 6876 .id = primary_event_id(event), 6877 .stream_id = event->id, 6878 }; 6879 6880 if (enable) 6881 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 6882 6883 perf_event_header__init_id(&throttle_event.header, &sample, event); 6884 6885 ret = perf_output_begin(&handle, event, 6886 throttle_event.header.size); 6887 if (ret) 6888 return; 6889 6890 perf_output_put(&handle, throttle_event); 6891 perf_event__output_id_sample(event, &handle, &sample); 6892 perf_output_end(&handle); 6893 } 6894 6895 static void perf_log_itrace_start(struct perf_event *event) 6896 { 6897 struct perf_output_handle handle; 6898 struct perf_sample_data sample; 6899 struct perf_aux_event { 6900 struct perf_event_header header; 6901 u32 pid; 6902 u32 tid; 6903 } rec; 6904 int ret; 6905 6906 if (event->parent) 6907 event = event->parent; 6908 6909 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || 6910 event->hw.itrace_started) 6911 return; 6912 6913 rec.header.type = PERF_RECORD_ITRACE_START; 6914 rec.header.misc = 0; 6915 rec.header.size = sizeof(rec); 6916 rec.pid = perf_event_pid(event, current); 6917 rec.tid = perf_event_tid(event, current); 6918 6919 perf_event_header__init_id(&rec.header, &sample, event); 6920 ret = perf_output_begin(&handle, event, rec.header.size); 6921 6922 if (ret) 6923 return; 6924 6925 perf_output_put(&handle, rec); 6926 perf_event__output_id_sample(event, &handle, &sample); 6927 6928 perf_output_end(&handle); 6929 } 6930 6931 /* 6932 * Generic event overflow handling, sampling. 6933 */ 6934 6935 static int __perf_event_overflow(struct perf_event *event, 6936 int throttle, struct perf_sample_data *data, 6937 struct pt_regs *regs) 6938 { 6939 int events = atomic_read(&event->event_limit); 6940 struct hw_perf_event *hwc = &event->hw; 6941 u64 seq; 6942 int ret = 0; 6943 6944 /* 6945 * Non-sampling counters might still use the PMI to fold short 6946 * hardware counters, ignore those. 6947 */ 6948 if (unlikely(!is_sampling_event(event))) 6949 return 0; 6950 6951 seq = __this_cpu_read(perf_throttled_seq); 6952 if (seq != hwc->interrupts_seq) { 6953 hwc->interrupts_seq = seq; 6954 hwc->interrupts = 1; 6955 } else { 6956 hwc->interrupts++; 6957 if (unlikely(throttle 6958 && hwc->interrupts >= max_samples_per_tick)) { 6959 __this_cpu_inc(perf_throttled_count); 6960 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); 6961 hwc->interrupts = MAX_INTERRUPTS; 6962 perf_log_throttle(event, 0); 6963 ret = 1; 6964 } 6965 } 6966 6967 if (event->attr.freq) { 6968 u64 now = perf_clock(); 6969 s64 delta = now - hwc->freq_time_stamp; 6970 6971 hwc->freq_time_stamp = now; 6972 6973 if (delta > 0 && delta < 2*TICK_NSEC) 6974 perf_adjust_period(event, delta, hwc->last_period, true); 6975 } 6976 6977 /* 6978 * XXX event_limit might not quite work as expected on inherited 6979 * events 6980 */ 6981 6982 event->pending_kill = POLL_IN; 6983 if (events && atomic_dec_and_test(&event->event_limit)) { 6984 ret = 1; 6985 event->pending_kill = POLL_HUP; 6986 event->pending_disable = 1; 6987 irq_work_queue(&event->pending); 6988 } 6989 6990 event->overflow_handler(event, data, regs); 6991 6992 if (*perf_event_fasync(event) && event->pending_kill) { 6993 event->pending_wakeup = 1; 6994 irq_work_queue(&event->pending); 6995 } 6996 6997 return ret; 6998 } 6999 7000 int perf_event_overflow(struct perf_event *event, 7001 struct perf_sample_data *data, 7002 struct pt_regs *regs) 7003 { 7004 return __perf_event_overflow(event, 1, data, regs); 7005 } 7006 7007 /* 7008 * Generic software event infrastructure 7009 */ 7010 7011 struct swevent_htable { 7012 struct swevent_hlist *swevent_hlist; 7013 struct mutex hlist_mutex; 7014 int hlist_refcount; 7015 7016 /* Recursion avoidance in each contexts */ 7017 int recursion[PERF_NR_CONTEXTS]; 7018 }; 7019 7020 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); 7021 7022 /* 7023 * We directly increment event->count and keep a second value in 7024 * event->hw.period_left to count intervals. This period event 7025 * is kept in the range [-sample_period, 0] so that we can use the 7026 * sign as trigger. 7027 */ 7028 7029 u64 perf_swevent_set_period(struct perf_event *event) 7030 { 7031 struct hw_perf_event *hwc = &event->hw; 7032 u64 period = hwc->last_period; 7033 u64 nr, offset; 7034 s64 old, val; 7035 7036 hwc->last_period = hwc->sample_period; 7037 7038 again: 7039 old = val = local64_read(&hwc->period_left); 7040 if (val < 0) 7041 return 0; 7042 7043 nr = div64_u64(period + val, period); 7044 offset = nr * period; 7045 val -= offset; 7046 if (local64_cmpxchg(&hwc->period_left, old, val) != old) 7047 goto again; 7048 7049 return nr; 7050 } 7051 7052 static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 7053 struct perf_sample_data *data, 7054 struct pt_regs *regs) 7055 { 7056 struct hw_perf_event *hwc = &event->hw; 7057 int throttle = 0; 7058 7059 if (!overflow) 7060 overflow = perf_swevent_set_period(event); 7061 7062 if (hwc->interrupts == MAX_INTERRUPTS) 7063 return; 7064 7065 for (; overflow; overflow--) { 7066 if (__perf_event_overflow(event, throttle, 7067 data, regs)) { 7068 /* 7069 * We inhibit the overflow from happening when 7070 * hwc->interrupts == MAX_INTERRUPTS. 7071 */ 7072 break; 7073 } 7074 throttle = 1; 7075 } 7076 } 7077 7078 static void perf_swevent_event(struct perf_event *event, u64 nr, 7079 struct perf_sample_data *data, 7080 struct pt_regs *regs) 7081 { 7082 struct hw_perf_event *hwc = &event->hw; 7083 7084 local64_add(nr, &event->count); 7085 7086 if (!regs) 7087 return; 7088 7089 if (!is_sampling_event(event)) 7090 return; 7091 7092 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { 7093 data->period = nr; 7094 return perf_swevent_overflow(event, 1, data, regs); 7095 } else 7096 data->period = event->hw.last_period; 7097 7098 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 7099 return perf_swevent_overflow(event, 1, data, regs); 7100 7101 if (local64_add_negative(nr, &hwc->period_left)) 7102 return; 7103 7104 perf_swevent_overflow(event, 0, data, regs); 7105 } 7106 7107 static int perf_exclude_event(struct perf_event *event, 7108 struct pt_regs *regs) 7109 { 7110 if (event->hw.state & PERF_HES_STOPPED) 7111 return 1; 7112 7113 if (regs) { 7114 if (event->attr.exclude_user && user_mode(regs)) 7115 return 1; 7116 7117 if (event->attr.exclude_kernel && !user_mode(regs)) 7118 return 1; 7119 } 7120 7121 return 0; 7122 } 7123 7124 static int perf_swevent_match(struct perf_event *event, 7125 enum perf_type_id type, 7126 u32 event_id, 7127 struct perf_sample_data *data, 7128 struct pt_regs *regs) 7129 { 7130 if (event->attr.type != type) 7131 return 0; 7132 7133 if (event->attr.config != event_id) 7134 return 0; 7135 7136 if (perf_exclude_event(event, regs)) 7137 return 0; 7138 7139 return 1; 7140 } 7141 7142 static inline u64 swevent_hash(u64 type, u32 event_id) 7143 { 7144 u64 val = event_id | (type << 32); 7145 7146 return hash_64(val, SWEVENT_HLIST_BITS); 7147 } 7148 7149 static inline struct hlist_head * 7150 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) 7151 { 7152 u64 hash = swevent_hash(type, event_id); 7153 7154 return &hlist->heads[hash]; 7155 } 7156 7157 /* For the read side: events when they trigger */ 7158 static inline struct hlist_head * 7159 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) 7160 { 7161 struct swevent_hlist *hlist; 7162 7163 hlist = rcu_dereference(swhash->swevent_hlist); 7164 if (!hlist) 7165 return NULL; 7166 7167 return __find_swevent_head(hlist, type, event_id); 7168 } 7169 7170 /* For the event head insertion and removal in the hlist */ 7171 static inline struct hlist_head * 7172 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) 7173 { 7174 struct swevent_hlist *hlist; 7175 u32 event_id = event->attr.config; 7176 u64 type = event->attr.type; 7177 7178 /* 7179 * Event scheduling is always serialized against hlist allocation 7180 * and release. Which makes the protected version suitable here. 7181 * The context lock guarantees that. 7182 */ 7183 hlist = rcu_dereference_protected(swhash->swevent_hlist, 7184 lockdep_is_held(&event->ctx->lock)); 7185 if (!hlist) 7186 return NULL; 7187 7188 return __find_swevent_head(hlist, type, event_id); 7189 } 7190 7191 static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 7192 u64 nr, 7193 struct perf_sample_data *data, 7194 struct pt_regs *regs) 7195 { 7196 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 7197 struct perf_event *event; 7198 struct hlist_head *head; 7199 7200 rcu_read_lock(); 7201 head = find_swevent_head_rcu(swhash, type, event_id); 7202 if (!head) 7203 goto end; 7204 7205 hlist_for_each_entry_rcu(event, head, hlist_entry) { 7206 if (perf_swevent_match(event, type, event_id, data, regs)) 7207 perf_swevent_event(event, nr, data, regs); 7208 } 7209 end: 7210 rcu_read_unlock(); 7211 } 7212 7213 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); 7214 7215 int perf_swevent_get_recursion_context(void) 7216 { 7217 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 7218 7219 return get_recursion_context(swhash->recursion); 7220 } 7221 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 7222 7223 void perf_swevent_put_recursion_context(int rctx) 7224 { 7225 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 7226 7227 put_recursion_context(swhash->recursion, rctx); 7228 } 7229 7230 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 7231 { 7232 struct perf_sample_data data; 7233 7234 if (WARN_ON_ONCE(!regs)) 7235 return; 7236 7237 perf_sample_data_init(&data, addr, 0); 7238 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 7239 } 7240 7241 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 7242 { 7243 int rctx; 7244 7245 preempt_disable_notrace(); 7246 rctx = perf_swevent_get_recursion_context(); 7247 if (unlikely(rctx < 0)) 7248 goto fail; 7249 7250 ___perf_sw_event(event_id, nr, regs, addr); 7251 7252 perf_swevent_put_recursion_context(rctx); 7253 fail: 7254 preempt_enable_notrace(); 7255 } 7256 7257 static void perf_swevent_read(struct perf_event *event) 7258 { 7259 } 7260 7261 static int perf_swevent_add(struct perf_event *event, int flags) 7262 { 7263 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 7264 struct hw_perf_event *hwc = &event->hw; 7265 struct hlist_head *head; 7266 7267 if (is_sampling_event(event)) { 7268 hwc->last_period = hwc->sample_period; 7269 perf_swevent_set_period(event); 7270 } 7271 7272 hwc->state = !(flags & PERF_EF_START); 7273 7274 head = find_swevent_head(swhash, event); 7275 if (WARN_ON_ONCE(!head)) 7276 return -EINVAL; 7277 7278 hlist_add_head_rcu(&event->hlist_entry, head); 7279 perf_event_update_userpage(event); 7280 7281 return 0; 7282 } 7283 7284 static void perf_swevent_del(struct perf_event *event, int flags) 7285 { 7286 hlist_del_rcu(&event->hlist_entry); 7287 } 7288 7289 static void perf_swevent_start(struct perf_event *event, int flags) 7290 { 7291 event->hw.state = 0; 7292 } 7293 7294 static void perf_swevent_stop(struct perf_event *event, int flags) 7295 { 7296 event->hw.state = PERF_HES_STOPPED; 7297 } 7298 7299 /* Deref the hlist from the update side */ 7300 static inline struct swevent_hlist * 7301 swevent_hlist_deref(struct swevent_htable *swhash) 7302 { 7303 return rcu_dereference_protected(swhash->swevent_hlist, 7304 lockdep_is_held(&swhash->hlist_mutex)); 7305 } 7306 7307 static void swevent_hlist_release(struct swevent_htable *swhash) 7308 { 7309 struct swevent_hlist *hlist = swevent_hlist_deref(swhash); 7310 7311 if (!hlist) 7312 return; 7313 7314 RCU_INIT_POINTER(swhash->swevent_hlist, NULL); 7315 kfree_rcu(hlist, rcu_head); 7316 } 7317 7318 static void swevent_hlist_put_cpu(int cpu) 7319 { 7320 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7321 7322 mutex_lock(&swhash->hlist_mutex); 7323 7324 if (!--swhash->hlist_refcount) 7325 swevent_hlist_release(swhash); 7326 7327 mutex_unlock(&swhash->hlist_mutex); 7328 } 7329 7330 static void swevent_hlist_put(void) 7331 { 7332 int cpu; 7333 7334 for_each_possible_cpu(cpu) 7335 swevent_hlist_put_cpu(cpu); 7336 } 7337 7338 static int swevent_hlist_get_cpu(int cpu) 7339 { 7340 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7341 int err = 0; 7342 7343 mutex_lock(&swhash->hlist_mutex); 7344 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { 7345 struct swevent_hlist *hlist; 7346 7347 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 7348 if (!hlist) { 7349 err = -ENOMEM; 7350 goto exit; 7351 } 7352 rcu_assign_pointer(swhash->swevent_hlist, hlist); 7353 } 7354 swhash->hlist_refcount++; 7355 exit: 7356 mutex_unlock(&swhash->hlist_mutex); 7357 7358 return err; 7359 } 7360 7361 static int swevent_hlist_get(void) 7362 { 7363 int err, cpu, failed_cpu; 7364 7365 get_online_cpus(); 7366 for_each_possible_cpu(cpu) { 7367 err = swevent_hlist_get_cpu(cpu); 7368 if (err) { 7369 failed_cpu = cpu; 7370 goto fail; 7371 } 7372 } 7373 put_online_cpus(); 7374 7375 return 0; 7376 fail: 7377 for_each_possible_cpu(cpu) { 7378 if (cpu == failed_cpu) 7379 break; 7380 swevent_hlist_put_cpu(cpu); 7381 } 7382 7383 put_online_cpus(); 7384 return err; 7385 } 7386 7387 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 7388 7389 static void sw_perf_event_destroy(struct perf_event *event) 7390 { 7391 u64 event_id = event->attr.config; 7392 7393 WARN_ON(event->parent); 7394 7395 static_key_slow_dec(&perf_swevent_enabled[event_id]); 7396 swevent_hlist_put(); 7397 } 7398 7399 static int perf_swevent_init(struct perf_event *event) 7400 { 7401 u64 event_id = event->attr.config; 7402 7403 if (event->attr.type != PERF_TYPE_SOFTWARE) 7404 return -ENOENT; 7405 7406 /* 7407 * no branch sampling for software events 7408 */ 7409 if (has_branch_stack(event)) 7410 return -EOPNOTSUPP; 7411 7412 switch (event_id) { 7413 case PERF_COUNT_SW_CPU_CLOCK: 7414 case PERF_COUNT_SW_TASK_CLOCK: 7415 return -ENOENT; 7416 7417 default: 7418 break; 7419 } 7420 7421 if (event_id >= PERF_COUNT_SW_MAX) 7422 return -ENOENT; 7423 7424 if (!event->parent) { 7425 int err; 7426 7427 err = swevent_hlist_get(); 7428 if (err) 7429 return err; 7430 7431 static_key_slow_inc(&perf_swevent_enabled[event_id]); 7432 event->destroy = sw_perf_event_destroy; 7433 } 7434 7435 return 0; 7436 } 7437 7438 static struct pmu perf_swevent = { 7439 .task_ctx_nr = perf_sw_context, 7440 7441 .capabilities = PERF_PMU_CAP_NO_NMI, 7442 7443 .event_init = perf_swevent_init, 7444 .add = perf_swevent_add, 7445 .del = perf_swevent_del, 7446 .start = perf_swevent_start, 7447 .stop = perf_swevent_stop, 7448 .read = perf_swevent_read, 7449 }; 7450 7451 #ifdef CONFIG_EVENT_TRACING 7452 7453 static int perf_tp_filter_match(struct perf_event *event, 7454 struct perf_sample_data *data) 7455 { 7456 void *record = data->raw->frag.data; 7457 7458 /* only top level events have filters set */ 7459 if (event->parent) 7460 event = event->parent; 7461 7462 if (likely(!event->filter) || filter_match_preds(event->filter, record)) 7463 return 1; 7464 return 0; 7465 } 7466 7467 static int perf_tp_event_match(struct perf_event *event, 7468 struct perf_sample_data *data, 7469 struct pt_regs *regs) 7470 { 7471 if (event->hw.state & PERF_HES_STOPPED) 7472 return 0; 7473 /* 7474 * All tracepoints are from kernel-space. 7475 */ 7476 if (event->attr.exclude_kernel) 7477 return 0; 7478 7479 if (!perf_tp_filter_match(event, data)) 7480 return 0; 7481 7482 return 1; 7483 } 7484 7485 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, 7486 struct trace_event_call *call, u64 count, 7487 struct pt_regs *regs, struct hlist_head *head, 7488 struct task_struct *task) 7489 { 7490 struct bpf_prog *prog = call->prog; 7491 7492 if (prog) { 7493 *(struct pt_regs **)raw_data = regs; 7494 if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { 7495 perf_swevent_put_recursion_context(rctx); 7496 return; 7497 } 7498 } 7499 perf_tp_event(call->event.type, count, raw_data, size, regs, head, 7500 rctx, task); 7501 } 7502 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); 7503 7504 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, 7505 struct pt_regs *regs, struct hlist_head *head, int rctx, 7506 struct task_struct *task) 7507 { 7508 struct perf_sample_data data; 7509 struct perf_event *event; 7510 7511 struct perf_raw_record raw = { 7512 .frag = { 7513 .size = entry_size, 7514 .data = record, 7515 }, 7516 }; 7517 7518 perf_sample_data_init(&data, 0, 0); 7519 data.raw = &raw; 7520 7521 perf_trace_buf_update(record, event_type); 7522 7523 hlist_for_each_entry_rcu(event, head, hlist_entry) { 7524 if (perf_tp_event_match(event, &data, regs)) 7525 perf_swevent_event(event, count, &data, regs); 7526 } 7527 7528 /* 7529 * If we got specified a target task, also iterate its context and 7530 * deliver this event there too. 7531 */ 7532 if (task && task != current) { 7533 struct perf_event_context *ctx; 7534 struct trace_entry *entry = record; 7535 7536 rcu_read_lock(); 7537 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); 7538 if (!ctx) 7539 goto unlock; 7540 7541 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 7542 if (event->attr.type != PERF_TYPE_TRACEPOINT) 7543 continue; 7544 if (event->attr.config != entry->type) 7545 continue; 7546 if (perf_tp_event_match(event, &data, regs)) 7547 perf_swevent_event(event, count, &data, regs); 7548 } 7549 unlock: 7550 rcu_read_unlock(); 7551 } 7552 7553 perf_swevent_put_recursion_context(rctx); 7554 } 7555 EXPORT_SYMBOL_GPL(perf_tp_event); 7556 7557 static void tp_perf_event_destroy(struct perf_event *event) 7558 { 7559 perf_trace_destroy(event); 7560 } 7561 7562 static int perf_tp_event_init(struct perf_event *event) 7563 { 7564 int err; 7565 7566 if (event->attr.type != PERF_TYPE_TRACEPOINT) 7567 return -ENOENT; 7568 7569 /* 7570 * no branch sampling for tracepoint events 7571 */ 7572 if (has_branch_stack(event)) 7573 return -EOPNOTSUPP; 7574 7575 err = perf_trace_init(event); 7576 if (err) 7577 return err; 7578 7579 event->destroy = tp_perf_event_destroy; 7580 7581 return 0; 7582 } 7583 7584 static struct pmu perf_tracepoint = { 7585 .task_ctx_nr = perf_sw_context, 7586 7587 .event_init = perf_tp_event_init, 7588 .add = perf_trace_add, 7589 .del = perf_trace_del, 7590 .start = perf_swevent_start, 7591 .stop = perf_swevent_stop, 7592 .read = perf_swevent_read, 7593 }; 7594 7595 static inline void perf_tp_register(void) 7596 { 7597 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); 7598 } 7599 7600 static void perf_event_free_filter(struct perf_event *event) 7601 { 7602 ftrace_profile_free_filter(event); 7603 } 7604 7605 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) 7606 { 7607 bool is_kprobe, is_tracepoint; 7608 struct bpf_prog *prog; 7609 7610 if (event->attr.type != PERF_TYPE_TRACEPOINT) 7611 return -EINVAL; 7612 7613 if (event->tp_event->prog) 7614 return -EEXIST; 7615 7616 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; 7617 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; 7618 if (!is_kprobe && !is_tracepoint) 7619 /* bpf programs can only be attached to u/kprobe or tracepoint */ 7620 return -EINVAL; 7621 7622 prog = bpf_prog_get(prog_fd); 7623 if (IS_ERR(prog)) 7624 return PTR_ERR(prog); 7625 7626 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || 7627 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { 7628 /* valid fd, but invalid bpf program type */ 7629 bpf_prog_put(prog); 7630 return -EINVAL; 7631 } 7632 7633 if (is_tracepoint) { 7634 int off = trace_event_get_offsets(event->tp_event); 7635 7636 if (prog->aux->max_ctx_offset > off) { 7637 bpf_prog_put(prog); 7638 return -EACCES; 7639 } 7640 } 7641 event->tp_event->prog = prog; 7642 7643 return 0; 7644 } 7645 7646 static void perf_event_free_bpf_prog(struct perf_event *event) 7647 { 7648 struct bpf_prog *prog; 7649 7650 if (!event->tp_event) 7651 return; 7652 7653 prog = event->tp_event->prog; 7654 if (prog) { 7655 event->tp_event->prog = NULL; 7656 bpf_prog_put(prog); 7657 } 7658 } 7659 7660 #else 7661 7662 static inline void perf_tp_register(void) 7663 { 7664 } 7665 7666 static void perf_event_free_filter(struct perf_event *event) 7667 { 7668 } 7669 7670 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) 7671 { 7672 return -ENOENT; 7673 } 7674 7675 static void perf_event_free_bpf_prog(struct perf_event *event) 7676 { 7677 } 7678 #endif /* CONFIG_EVENT_TRACING */ 7679 7680 #ifdef CONFIG_HAVE_HW_BREAKPOINT 7681 void perf_bp_event(struct perf_event *bp, void *data) 7682 { 7683 struct perf_sample_data sample; 7684 struct pt_regs *regs = data; 7685 7686 perf_sample_data_init(&sample, bp->attr.bp_addr, 0); 7687 7688 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 7689 perf_swevent_event(bp, 1, &sample, regs); 7690 } 7691 #endif 7692 7693 /* 7694 * Allocate a new address filter 7695 */ 7696 static struct perf_addr_filter * 7697 perf_addr_filter_new(struct perf_event *event, struct list_head *filters) 7698 { 7699 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); 7700 struct perf_addr_filter *filter; 7701 7702 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); 7703 if (!filter) 7704 return NULL; 7705 7706 INIT_LIST_HEAD(&filter->entry); 7707 list_add_tail(&filter->entry, filters); 7708 7709 return filter; 7710 } 7711 7712 static void free_filters_list(struct list_head *filters) 7713 { 7714 struct perf_addr_filter *filter, *iter; 7715 7716 list_for_each_entry_safe(filter, iter, filters, entry) { 7717 if (filter->inode) 7718 iput(filter->inode); 7719 list_del(&filter->entry); 7720 kfree(filter); 7721 } 7722 } 7723 7724 /* 7725 * Free existing address filters and optionally install new ones 7726 */ 7727 static void perf_addr_filters_splice(struct perf_event *event, 7728 struct list_head *head) 7729 { 7730 unsigned long flags; 7731 LIST_HEAD(list); 7732 7733 if (!has_addr_filter(event)) 7734 return; 7735 7736 /* don't bother with children, they don't have their own filters */ 7737 if (event->parent) 7738 return; 7739 7740 raw_spin_lock_irqsave(&event->addr_filters.lock, flags); 7741 7742 list_splice_init(&event->addr_filters.list, &list); 7743 if (head) 7744 list_splice(head, &event->addr_filters.list); 7745 7746 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); 7747 7748 free_filters_list(&list); 7749 } 7750 7751 /* 7752 * Scan through mm's vmas and see if one of them matches the 7753 * @filter; if so, adjust filter's address range. 7754 * Called with mm::mmap_sem down for reading. 7755 */ 7756 static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, 7757 struct mm_struct *mm) 7758 { 7759 struct vm_area_struct *vma; 7760 7761 for (vma = mm->mmap; vma; vma = vma->vm_next) { 7762 struct file *file = vma->vm_file; 7763 unsigned long off = vma->vm_pgoff << PAGE_SHIFT; 7764 unsigned long vma_size = vma->vm_end - vma->vm_start; 7765 7766 if (!file) 7767 continue; 7768 7769 if (!perf_addr_filter_match(filter, file, off, vma_size)) 7770 continue; 7771 7772 return vma->vm_start; 7773 } 7774 7775 return 0; 7776 } 7777 7778 /* 7779 * Update event's address range filters based on the 7780 * task's existing mappings, if any. 7781 */ 7782 static void perf_event_addr_filters_apply(struct perf_event *event) 7783 { 7784 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 7785 struct task_struct *task = READ_ONCE(event->ctx->task); 7786 struct perf_addr_filter *filter; 7787 struct mm_struct *mm = NULL; 7788 unsigned int count = 0; 7789 unsigned long flags; 7790 7791 /* 7792 * We may observe TASK_TOMBSTONE, which means that the event tear-down 7793 * will stop on the parent's child_mutex that our caller is also holding 7794 */ 7795 if (task == TASK_TOMBSTONE) 7796 return; 7797 7798 mm = get_task_mm(event->ctx->task); 7799 if (!mm) 7800 goto restart; 7801 7802 down_read(&mm->mmap_sem); 7803 7804 raw_spin_lock_irqsave(&ifh->lock, flags); 7805 list_for_each_entry(filter, &ifh->list, entry) { 7806 event->addr_filters_offs[count] = 0; 7807 7808 if (perf_addr_filter_needs_mmap(filter)) 7809 event->addr_filters_offs[count] = 7810 perf_addr_filter_apply(filter, mm); 7811 7812 count++; 7813 } 7814 7815 event->addr_filters_gen++; 7816 raw_spin_unlock_irqrestore(&ifh->lock, flags); 7817 7818 up_read(&mm->mmap_sem); 7819 7820 mmput(mm); 7821 7822 restart: 7823 perf_event_restart(event); 7824 } 7825 7826 /* 7827 * Address range filtering: limiting the data to certain 7828 * instruction address ranges. Filters are ioctl()ed to us from 7829 * userspace as ascii strings. 7830 * 7831 * Filter string format: 7832 * 7833 * ACTION RANGE_SPEC 7834 * where ACTION is one of the 7835 * * "filter": limit the trace to this region 7836 * * "start": start tracing from this address 7837 * * "stop": stop tracing at this address/region; 7838 * RANGE_SPEC is 7839 * * for kernel addresses: <start address>[/<size>] 7840 * * for object files: <start address>[/<size>]@</path/to/object/file> 7841 * 7842 * if <size> is not specified, the range is treated as a single address. 7843 */ 7844 enum { 7845 IF_ACT_FILTER, 7846 IF_ACT_START, 7847 IF_ACT_STOP, 7848 IF_SRC_FILE, 7849 IF_SRC_KERNEL, 7850 IF_SRC_FILEADDR, 7851 IF_SRC_KERNELADDR, 7852 }; 7853 7854 enum { 7855 IF_STATE_ACTION = 0, 7856 IF_STATE_SOURCE, 7857 IF_STATE_END, 7858 }; 7859 7860 static const match_table_t if_tokens = { 7861 { IF_ACT_FILTER, "filter" }, 7862 { IF_ACT_START, "start" }, 7863 { IF_ACT_STOP, "stop" }, 7864 { IF_SRC_FILE, "%u/%u@%s" }, 7865 { IF_SRC_KERNEL, "%u/%u" }, 7866 { IF_SRC_FILEADDR, "%u@%s" }, 7867 { IF_SRC_KERNELADDR, "%u" }, 7868 }; 7869 7870 /* 7871 * Address filter string parser 7872 */ 7873 static int 7874 perf_event_parse_addr_filter(struct perf_event *event, char *fstr, 7875 struct list_head *filters) 7876 { 7877 struct perf_addr_filter *filter = NULL; 7878 char *start, *orig, *filename = NULL; 7879 struct path path; 7880 substring_t args[MAX_OPT_ARGS]; 7881 int state = IF_STATE_ACTION, token; 7882 unsigned int kernel = 0; 7883 int ret = -EINVAL; 7884 7885 orig = fstr = kstrdup(fstr, GFP_KERNEL); 7886 if (!fstr) 7887 return -ENOMEM; 7888 7889 while ((start = strsep(&fstr, " ,\n")) != NULL) { 7890 ret = -EINVAL; 7891 7892 if (!*start) 7893 continue; 7894 7895 /* filter definition begins */ 7896 if (state == IF_STATE_ACTION) { 7897 filter = perf_addr_filter_new(event, filters); 7898 if (!filter) 7899 goto fail; 7900 } 7901 7902 token = match_token(start, if_tokens, args); 7903 switch (token) { 7904 case IF_ACT_FILTER: 7905 case IF_ACT_START: 7906 filter->filter = 1; 7907 7908 case IF_ACT_STOP: 7909 if (state != IF_STATE_ACTION) 7910 goto fail; 7911 7912 state = IF_STATE_SOURCE; 7913 break; 7914 7915 case IF_SRC_KERNELADDR: 7916 case IF_SRC_KERNEL: 7917 kernel = 1; 7918 7919 case IF_SRC_FILEADDR: 7920 case IF_SRC_FILE: 7921 if (state != IF_STATE_SOURCE) 7922 goto fail; 7923 7924 if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) 7925 filter->range = 1; 7926 7927 *args[0].to = 0; 7928 ret = kstrtoul(args[0].from, 0, &filter->offset); 7929 if (ret) 7930 goto fail; 7931 7932 if (filter->range) { 7933 *args[1].to = 0; 7934 ret = kstrtoul(args[1].from, 0, &filter->size); 7935 if (ret) 7936 goto fail; 7937 } 7938 7939 if (token == IF_SRC_FILE) { 7940 filename = match_strdup(&args[2]); 7941 if (!filename) { 7942 ret = -ENOMEM; 7943 goto fail; 7944 } 7945 } 7946 7947 state = IF_STATE_END; 7948 break; 7949 7950 default: 7951 goto fail; 7952 } 7953 7954 /* 7955 * Filter definition is fully parsed, validate and install it. 7956 * Make sure that it doesn't contradict itself or the event's 7957 * attribute. 7958 */ 7959 if (state == IF_STATE_END) { 7960 if (kernel && event->attr.exclude_kernel) 7961 goto fail; 7962 7963 if (!kernel) { 7964 if (!filename) 7965 goto fail; 7966 7967 /* look up the path and grab its inode */ 7968 ret = kern_path(filename, LOOKUP_FOLLOW, &path); 7969 if (ret) 7970 goto fail_free_name; 7971 7972 filter->inode = igrab(d_inode(path.dentry)); 7973 path_put(&path); 7974 kfree(filename); 7975 filename = NULL; 7976 7977 ret = -EINVAL; 7978 if (!filter->inode || 7979 !S_ISREG(filter->inode->i_mode)) 7980 /* free_filters_list() will iput() */ 7981 goto fail; 7982 } 7983 7984 /* ready to consume more filters */ 7985 state = IF_STATE_ACTION; 7986 filter = NULL; 7987 } 7988 } 7989 7990 if (state != IF_STATE_ACTION) 7991 goto fail; 7992 7993 kfree(orig); 7994 7995 return 0; 7996 7997 fail_free_name: 7998 kfree(filename); 7999 fail: 8000 free_filters_list(filters); 8001 kfree(orig); 8002 8003 return ret; 8004 } 8005 8006 static int 8007 perf_event_set_addr_filter(struct perf_event *event, char *filter_str) 8008 { 8009 LIST_HEAD(filters); 8010 int ret; 8011 8012 /* 8013 * Since this is called in perf_ioctl() path, we're already holding 8014 * ctx::mutex. 8015 */ 8016 lockdep_assert_held(&event->ctx->mutex); 8017 8018 if (WARN_ON_ONCE(event->parent)) 8019 return -EINVAL; 8020 8021 /* 8022 * For now, we only support filtering in per-task events; doing so 8023 * for CPU-wide events requires additional context switching trickery, 8024 * since same object code will be mapped at different virtual 8025 * addresses in different processes. 8026 */ 8027 if (!event->ctx->task) 8028 return -EOPNOTSUPP; 8029 8030 ret = perf_event_parse_addr_filter(event, filter_str, &filters); 8031 if (ret) 8032 return ret; 8033 8034 ret = event->pmu->addr_filters_validate(&filters); 8035 if (ret) { 8036 free_filters_list(&filters); 8037 return ret; 8038 } 8039 8040 /* remove existing filters, if any */ 8041 perf_addr_filters_splice(event, &filters); 8042 8043 /* install new filters */ 8044 perf_event_for_each_child(event, perf_event_addr_filters_apply); 8045 8046 return ret; 8047 } 8048 8049 static int perf_event_set_filter(struct perf_event *event, void __user *arg) 8050 { 8051 char *filter_str; 8052 int ret = -EINVAL; 8053 8054 if ((event->attr.type != PERF_TYPE_TRACEPOINT || 8055 !IS_ENABLED(CONFIG_EVENT_TRACING)) && 8056 !has_addr_filter(event)) 8057 return -EINVAL; 8058 8059 filter_str = strndup_user(arg, PAGE_SIZE); 8060 if (IS_ERR(filter_str)) 8061 return PTR_ERR(filter_str); 8062 8063 if (IS_ENABLED(CONFIG_EVENT_TRACING) && 8064 event->attr.type == PERF_TYPE_TRACEPOINT) 8065 ret = ftrace_profile_set_filter(event, event->attr.config, 8066 filter_str); 8067 else if (has_addr_filter(event)) 8068 ret = perf_event_set_addr_filter(event, filter_str); 8069 8070 kfree(filter_str); 8071 return ret; 8072 } 8073 8074 /* 8075 * hrtimer based swevent callback 8076 */ 8077 8078 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) 8079 { 8080 enum hrtimer_restart ret = HRTIMER_RESTART; 8081 struct perf_sample_data data; 8082 struct pt_regs *regs; 8083 struct perf_event *event; 8084 u64 period; 8085 8086 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 8087 8088 if (event->state != PERF_EVENT_STATE_ACTIVE) 8089 return HRTIMER_NORESTART; 8090 8091 event->pmu->read(event); 8092 8093 perf_sample_data_init(&data, 0, event->hw.last_period); 8094 regs = get_irq_regs(); 8095 8096 if (regs && !perf_exclude_event(event, regs)) { 8097 if (!(event->attr.exclude_idle && is_idle_task(current))) 8098 if (__perf_event_overflow(event, 1, &data, regs)) 8099 ret = HRTIMER_NORESTART; 8100 } 8101 8102 period = max_t(u64, 10000, event->hw.sample_period); 8103 hrtimer_forward_now(hrtimer, ns_to_ktime(period)); 8104 8105 return ret; 8106 } 8107 8108 static void perf_swevent_start_hrtimer(struct perf_event *event) 8109 { 8110 struct hw_perf_event *hwc = &event->hw; 8111 s64 period; 8112 8113 if (!is_sampling_event(event)) 8114 return; 8115 8116 period = local64_read(&hwc->period_left); 8117 if (period) { 8118 if (period < 0) 8119 period = 10000; 8120 8121 local64_set(&hwc->period_left, 0); 8122 } else { 8123 period = max_t(u64, 10000, hwc->sample_period); 8124 } 8125 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), 8126 HRTIMER_MODE_REL_PINNED); 8127 } 8128 8129 static void perf_swevent_cancel_hrtimer(struct perf_event *event) 8130 { 8131 struct hw_perf_event *hwc = &event->hw; 8132 8133 if (is_sampling_event(event)) { 8134 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 8135 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 8136 8137 hrtimer_cancel(&hwc->hrtimer); 8138 } 8139 } 8140 8141 static void perf_swevent_init_hrtimer(struct perf_event *event) 8142 { 8143 struct hw_perf_event *hwc = &event->hw; 8144 8145 if (!is_sampling_event(event)) 8146 return; 8147 8148 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 8149 hwc->hrtimer.function = perf_swevent_hrtimer; 8150 8151 /* 8152 * Since hrtimers have a fixed rate, we can do a static freq->period 8153 * mapping and avoid the whole period adjust feedback stuff. 8154 */ 8155 if (event->attr.freq) { 8156 long freq = event->attr.sample_freq; 8157 8158 event->attr.sample_period = NSEC_PER_SEC / freq; 8159 hwc->sample_period = event->attr.sample_period; 8160 local64_set(&hwc->period_left, hwc->sample_period); 8161 hwc->last_period = hwc->sample_period; 8162 event->attr.freq = 0; 8163 } 8164 } 8165 8166 /* 8167 * Software event: cpu wall time clock 8168 */ 8169 8170 static void cpu_clock_event_update(struct perf_event *event) 8171 { 8172 s64 prev; 8173 u64 now; 8174 8175 now = local_clock(); 8176 prev = local64_xchg(&event->hw.prev_count, now); 8177 local64_add(now - prev, &event->count); 8178 } 8179 8180 static void cpu_clock_event_start(struct perf_event *event, int flags) 8181 { 8182 local64_set(&event->hw.prev_count, local_clock()); 8183 perf_swevent_start_hrtimer(event); 8184 } 8185 8186 static void cpu_clock_event_stop(struct perf_event *event, int flags) 8187 { 8188 perf_swevent_cancel_hrtimer(event); 8189 cpu_clock_event_update(event); 8190 } 8191 8192 static int cpu_clock_event_add(struct perf_event *event, int flags) 8193 { 8194 if (flags & PERF_EF_START) 8195 cpu_clock_event_start(event, flags); 8196 perf_event_update_userpage(event); 8197 8198 return 0; 8199 } 8200 8201 static void cpu_clock_event_del(struct perf_event *event, int flags) 8202 { 8203 cpu_clock_event_stop(event, flags); 8204 } 8205 8206 static void cpu_clock_event_read(struct perf_event *event) 8207 { 8208 cpu_clock_event_update(event); 8209 } 8210 8211 static int cpu_clock_event_init(struct perf_event *event) 8212 { 8213 if (event->attr.type != PERF_TYPE_SOFTWARE) 8214 return -ENOENT; 8215 8216 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 8217 return -ENOENT; 8218 8219 /* 8220 * no branch sampling for software events 8221 */ 8222 if (has_branch_stack(event)) 8223 return -EOPNOTSUPP; 8224 8225 perf_swevent_init_hrtimer(event); 8226 8227 return 0; 8228 } 8229 8230 static struct pmu perf_cpu_clock = { 8231 .task_ctx_nr = perf_sw_context, 8232 8233 .capabilities = PERF_PMU_CAP_NO_NMI, 8234 8235 .event_init = cpu_clock_event_init, 8236 .add = cpu_clock_event_add, 8237 .del = cpu_clock_event_del, 8238 .start = cpu_clock_event_start, 8239 .stop = cpu_clock_event_stop, 8240 .read = cpu_clock_event_read, 8241 }; 8242 8243 /* 8244 * Software event: task time clock 8245 */ 8246 8247 static void task_clock_event_update(struct perf_event *event, u64 now) 8248 { 8249 u64 prev; 8250 s64 delta; 8251 8252 prev = local64_xchg(&event->hw.prev_count, now); 8253 delta = now - prev; 8254 local64_add(delta, &event->count); 8255 } 8256 8257 static void task_clock_event_start(struct perf_event *event, int flags) 8258 { 8259 local64_set(&event->hw.prev_count, event->ctx->time); 8260 perf_swevent_start_hrtimer(event); 8261 } 8262 8263 static void task_clock_event_stop(struct perf_event *event, int flags) 8264 { 8265 perf_swevent_cancel_hrtimer(event); 8266 task_clock_event_update(event, event->ctx->time); 8267 } 8268 8269 static int task_clock_event_add(struct perf_event *event, int flags) 8270 { 8271 if (flags & PERF_EF_START) 8272 task_clock_event_start(event, flags); 8273 perf_event_update_userpage(event); 8274 8275 return 0; 8276 } 8277 8278 static void task_clock_event_del(struct perf_event *event, int flags) 8279 { 8280 task_clock_event_stop(event, PERF_EF_UPDATE); 8281 } 8282 8283 static void task_clock_event_read(struct perf_event *event) 8284 { 8285 u64 now = perf_clock(); 8286 u64 delta = now - event->ctx->timestamp; 8287 u64 time = event->ctx->time + delta; 8288 8289 task_clock_event_update(event, time); 8290 } 8291 8292 static int task_clock_event_init(struct perf_event *event) 8293 { 8294 if (event->attr.type != PERF_TYPE_SOFTWARE) 8295 return -ENOENT; 8296 8297 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 8298 return -ENOENT; 8299 8300 /* 8301 * no branch sampling for software events 8302 */ 8303 if (has_branch_stack(event)) 8304 return -EOPNOTSUPP; 8305 8306 perf_swevent_init_hrtimer(event); 8307 8308 return 0; 8309 } 8310 8311 static struct pmu perf_task_clock = { 8312 .task_ctx_nr = perf_sw_context, 8313 8314 .capabilities = PERF_PMU_CAP_NO_NMI, 8315 8316 .event_init = task_clock_event_init, 8317 .add = task_clock_event_add, 8318 .del = task_clock_event_del, 8319 .start = task_clock_event_start, 8320 .stop = task_clock_event_stop, 8321 .read = task_clock_event_read, 8322 }; 8323 8324 static void perf_pmu_nop_void(struct pmu *pmu) 8325 { 8326 } 8327 8328 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) 8329 { 8330 } 8331 8332 static int perf_pmu_nop_int(struct pmu *pmu) 8333 { 8334 return 0; 8335 } 8336 8337 static DEFINE_PER_CPU(unsigned int, nop_txn_flags); 8338 8339 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) 8340 { 8341 __this_cpu_write(nop_txn_flags, flags); 8342 8343 if (flags & ~PERF_PMU_TXN_ADD) 8344 return; 8345 8346 perf_pmu_disable(pmu); 8347 } 8348 8349 static int perf_pmu_commit_txn(struct pmu *pmu) 8350 { 8351 unsigned int flags = __this_cpu_read(nop_txn_flags); 8352 8353 __this_cpu_write(nop_txn_flags, 0); 8354 8355 if (flags & ~PERF_PMU_TXN_ADD) 8356 return 0; 8357 8358 perf_pmu_enable(pmu); 8359 return 0; 8360 } 8361 8362 static void perf_pmu_cancel_txn(struct pmu *pmu) 8363 { 8364 unsigned int flags = __this_cpu_read(nop_txn_flags); 8365 8366 __this_cpu_write(nop_txn_flags, 0); 8367 8368 if (flags & ~PERF_PMU_TXN_ADD) 8369 return; 8370 8371 perf_pmu_enable(pmu); 8372 } 8373 8374 static int perf_event_idx_default(struct perf_event *event) 8375 { 8376 return 0; 8377 } 8378 8379 /* 8380 * Ensures all contexts with the same task_ctx_nr have the same 8381 * pmu_cpu_context too. 8382 */ 8383 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) 8384 { 8385 struct pmu *pmu; 8386 8387 if (ctxn < 0) 8388 return NULL; 8389 8390 list_for_each_entry(pmu, &pmus, entry) { 8391 if (pmu->task_ctx_nr == ctxn) 8392 return pmu->pmu_cpu_context; 8393 } 8394 8395 return NULL; 8396 } 8397 8398 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) 8399 { 8400 int cpu; 8401 8402 for_each_possible_cpu(cpu) { 8403 struct perf_cpu_context *cpuctx; 8404 8405 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 8406 8407 if (cpuctx->unique_pmu == old_pmu) 8408 cpuctx->unique_pmu = pmu; 8409 } 8410 } 8411 8412 static void free_pmu_context(struct pmu *pmu) 8413 { 8414 struct pmu *i; 8415 8416 mutex_lock(&pmus_lock); 8417 /* 8418 * Like a real lame refcount. 8419 */ 8420 list_for_each_entry(i, &pmus, entry) { 8421 if (i->pmu_cpu_context == pmu->pmu_cpu_context) { 8422 update_pmu_context(i, pmu); 8423 goto out; 8424 } 8425 } 8426 8427 free_percpu(pmu->pmu_cpu_context); 8428 out: 8429 mutex_unlock(&pmus_lock); 8430 } 8431 8432 /* 8433 * Let userspace know that this PMU supports address range filtering: 8434 */ 8435 static ssize_t nr_addr_filters_show(struct device *dev, 8436 struct device_attribute *attr, 8437 char *page) 8438 { 8439 struct pmu *pmu = dev_get_drvdata(dev); 8440 8441 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); 8442 } 8443 DEVICE_ATTR_RO(nr_addr_filters); 8444 8445 static struct idr pmu_idr; 8446 8447 static ssize_t 8448 type_show(struct device *dev, struct device_attribute *attr, char *page) 8449 { 8450 struct pmu *pmu = dev_get_drvdata(dev); 8451 8452 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 8453 } 8454 static DEVICE_ATTR_RO(type); 8455 8456 static ssize_t 8457 perf_event_mux_interval_ms_show(struct device *dev, 8458 struct device_attribute *attr, 8459 char *page) 8460 { 8461 struct pmu *pmu = dev_get_drvdata(dev); 8462 8463 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); 8464 } 8465 8466 static DEFINE_MUTEX(mux_interval_mutex); 8467 8468 static ssize_t 8469 perf_event_mux_interval_ms_store(struct device *dev, 8470 struct device_attribute *attr, 8471 const char *buf, size_t count) 8472 { 8473 struct pmu *pmu = dev_get_drvdata(dev); 8474 int timer, cpu, ret; 8475 8476 ret = kstrtoint(buf, 0, &timer); 8477 if (ret) 8478 return ret; 8479 8480 if (timer < 1) 8481 return -EINVAL; 8482 8483 /* same value, noting to do */ 8484 if (timer == pmu->hrtimer_interval_ms) 8485 return count; 8486 8487 mutex_lock(&mux_interval_mutex); 8488 pmu->hrtimer_interval_ms = timer; 8489 8490 /* update all cpuctx for this PMU */ 8491 get_online_cpus(); 8492 for_each_online_cpu(cpu) { 8493 struct perf_cpu_context *cpuctx; 8494 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 8495 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); 8496 8497 cpu_function_call(cpu, 8498 (remote_function_f)perf_mux_hrtimer_restart, cpuctx); 8499 } 8500 put_online_cpus(); 8501 mutex_unlock(&mux_interval_mutex); 8502 8503 return count; 8504 } 8505 static DEVICE_ATTR_RW(perf_event_mux_interval_ms); 8506 8507 static struct attribute *pmu_dev_attrs[] = { 8508 &dev_attr_type.attr, 8509 &dev_attr_perf_event_mux_interval_ms.attr, 8510 NULL, 8511 }; 8512 ATTRIBUTE_GROUPS(pmu_dev); 8513 8514 static int pmu_bus_running; 8515 static struct bus_type pmu_bus = { 8516 .name = "event_source", 8517 .dev_groups = pmu_dev_groups, 8518 }; 8519 8520 static void pmu_dev_release(struct device *dev) 8521 { 8522 kfree(dev); 8523 } 8524 8525 static int pmu_dev_alloc(struct pmu *pmu) 8526 { 8527 int ret = -ENOMEM; 8528 8529 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); 8530 if (!pmu->dev) 8531 goto out; 8532 8533 pmu->dev->groups = pmu->attr_groups; 8534 device_initialize(pmu->dev); 8535 ret = dev_set_name(pmu->dev, "%s", pmu->name); 8536 if (ret) 8537 goto free_dev; 8538 8539 dev_set_drvdata(pmu->dev, pmu); 8540 pmu->dev->bus = &pmu_bus; 8541 pmu->dev->release = pmu_dev_release; 8542 ret = device_add(pmu->dev); 8543 if (ret) 8544 goto free_dev; 8545 8546 /* For PMUs with address filters, throw in an extra attribute: */ 8547 if (pmu->nr_addr_filters) 8548 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); 8549 8550 if (ret) 8551 goto del_dev; 8552 8553 out: 8554 return ret; 8555 8556 del_dev: 8557 device_del(pmu->dev); 8558 8559 free_dev: 8560 put_device(pmu->dev); 8561 goto out; 8562 } 8563 8564 static struct lock_class_key cpuctx_mutex; 8565 static struct lock_class_key cpuctx_lock; 8566 8567 int perf_pmu_register(struct pmu *pmu, const char *name, int type) 8568 { 8569 int cpu, ret; 8570 8571 mutex_lock(&pmus_lock); 8572 ret = -ENOMEM; 8573 pmu->pmu_disable_count = alloc_percpu(int); 8574 if (!pmu->pmu_disable_count) 8575 goto unlock; 8576 8577 pmu->type = -1; 8578 if (!name) 8579 goto skip_type; 8580 pmu->name = name; 8581 8582 if (type < 0) { 8583 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); 8584 if (type < 0) { 8585 ret = type; 8586 goto free_pdc; 8587 } 8588 } 8589 pmu->type = type; 8590 8591 if (pmu_bus_running) { 8592 ret = pmu_dev_alloc(pmu); 8593 if (ret) 8594 goto free_idr; 8595 } 8596 8597 skip_type: 8598 if (pmu->task_ctx_nr == perf_hw_context) { 8599 static int hw_context_taken = 0; 8600 8601 /* 8602 * Other than systems with heterogeneous CPUs, it never makes 8603 * sense for two PMUs to share perf_hw_context. PMUs which are 8604 * uncore must use perf_invalid_context. 8605 */ 8606 if (WARN_ON_ONCE(hw_context_taken && 8607 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) 8608 pmu->task_ctx_nr = perf_invalid_context; 8609 8610 hw_context_taken = 1; 8611 } 8612 8613 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 8614 if (pmu->pmu_cpu_context) 8615 goto got_cpu_context; 8616 8617 ret = -ENOMEM; 8618 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 8619 if (!pmu->pmu_cpu_context) 8620 goto free_dev; 8621 8622 for_each_possible_cpu(cpu) { 8623 struct perf_cpu_context *cpuctx; 8624 8625 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 8626 __perf_event_init_context(&cpuctx->ctx); 8627 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 8628 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 8629 cpuctx->ctx.pmu = pmu; 8630 8631 __perf_mux_hrtimer_init(cpuctx, cpu); 8632 8633 cpuctx->unique_pmu = pmu; 8634 } 8635 8636 got_cpu_context: 8637 if (!pmu->start_txn) { 8638 if (pmu->pmu_enable) { 8639 /* 8640 * If we have pmu_enable/pmu_disable calls, install 8641 * transaction stubs that use that to try and batch 8642 * hardware accesses. 8643 */ 8644 pmu->start_txn = perf_pmu_start_txn; 8645 pmu->commit_txn = perf_pmu_commit_txn; 8646 pmu->cancel_txn = perf_pmu_cancel_txn; 8647 } else { 8648 pmu->start_txn = perf_pmu_nop_txn; 8649 pmu->commit_txn = perf_pmu_nop_int; 8650 pmu->cancel_txn = perf_pmu_nop_void; 8651 } 8652 } 8653 8654 if (!pmu->pmu_enable) { 8655 pmu->pmu_enable = perf_pmu_nop_void; 8656 pmu->pmu_disable = perf_pmu_nop_void; 8657 } 8658 8659 if (!pmu->event_idx) 8660 pmu->event_idx = perf_event_idx_default; 8661 8662 list_add_rcu(&pmu->entry, &pmus); 8663 atomic_set(&pmu->exclusive_cnt, 0); 8664 ret = 0; 8665 unlock: 8666 mutex_unlock(&pmus_lock); 8667 8668 return ret; 8669 8670 free_dev: 8671 device_del(pmu->dev); 8672 put_device(pmu->dev); 8673 8674 free_idr: 8675 if (pmu->type >= PERF_TYPE_MAX) 8676 idr_remove(&pmu_idr, pmu->type); 8677 8678 free_pdc: 8679 free_percpu(pmu->pmu_disable_count); 8680 goto unlock; 8681 } 8682 EXPORT_SYMBOL_GPL(perf_pmu_register); 8683 8684 void perf_pmu_unregister(struct pmu *pmu) 8685 { 8686 mutex_lock(&pmus_lock); 8687 list_del_rcu(&pmu->entry); 8688 mutex_unlock(&pmus_lock); 8689 8690 /* 8691 * We dereference the pmu list under both SRCU and regular RCU, so 8692 * synchronize against both of those. 8693 */ 8694 synchronize_srcu(&pmus_srcu); 8695 synchronize_rcu(); 8696 8697 free_percpu(pmu->pmu_disable_count); 8698 if (pmu->type >= PERF_TYPE_MAX) 8699 idr_remove(&pmu_idr, pmu->type); 8700 if (pmu->nr_addr_filters) 8701 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); 8702 device_del(pmu->dev); 8703 put_device(pmu->dev); 8704 free_pmu_context(pmu); 8705 } 8706 EXPORT_SYMBOL_GPL(perf_pmu_unregister); 8707 8708 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) 8709 { 8710 struct perf_event_context *ctx = NULL; 8711 int ret; 8712 8713 if (!try_module_get(pmu->module)) 8714 return -ENODEV; 8715 8716 if (event->group_leader != event) { 8717 /* 8718 * This ctx->mutex can nest when we're called through 8719 * inheritance. See the perf_event_ctx_lock_nested() comment. 8720 */ 8721 ctx = perf_event_ctx_lock_nested(event->group_leader, 8722 SINGLE_DEPTH_NESTING); 8723 BUG_ON(!ctx); 8724 } 8725 8726 event->pmu = pmu; 8727 ret = pmu->event_init(event); 8728 8729 if (ctx) 8730 perf_event_ctx_unlock(event->group_leader, ctx); 8731 8732 if (ret) 8733 module_put(pmu->module); 8734 8735 return ret; 8736 } 8737 8738 static struct pmu *perf_init_event(struct perf_event *event) 8739 { 8740 struct pmu *pmu = NULL; 8741 int idx; 8742 int ret; 8743 8744 idx = srcu_read_lock(&pmus_srcu); 8745 8746 rcu_read_lock(); 8747 pmu = idr_find(&pmu_idr, event->attr.type); 8748 rcu_read_unlock(); 8749 if (pmu) { 8750 ret = perf_try_init_event(pmu, event); 8751 if (ret) 8752 pmu = ERR_PTR(ret); 8753 goto unlock; 8754 } 8755 8756 list_for_each_entry_rcu(pmu, &pmus, entry) { 8757 ret = perf_try_init_event(pmu, event); 8758 if (!ret) 8759 goto unlock; 8760 8761 if (ret != -ENOENT) { 8762 pmu = ERR_PTR(ret); 8763 goto unlock; 8764 } 8765 } 8766 pmu = ERR_PTR(-ENOENT); 8767 unlock: 8768 srcu_read_unlock(&pmus_srcu, idx); 8769 8770 return pmu; 8771 } 8772 8773 static void attach_sb_event(struct perf_event *event) 8774 { 8775 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); 8776 8777 raw_spin_lock(&pel->lock); 8778 list_add_rcu(&event->sb_list, &pel->list); 8779 raw_spin_unlock(&pel->lock); 8780 } 8781 8782 /* 8783 * We keep a list of all !task (and therefore per-cpu) events 8784 * that need to receive side-band records. 8785 * 8786 * This avoids having to scan all the various PMU per-cpu contexts 8787 * looking for them. 8788 */ 8789 static void account_pmu_sb_event(struct perf_event *event) 8790 { 8791 if (is_sb_event(event)) 8792 attach_sb_event(event); 8793 } 8794 8795 static void account_event_cpu(struct perf_event *event, int cpu) 8796 { 8797 if (event->parent) 8798 return; 8799 8800 if (is_cgroup_event(event)) 8801 atomic_inc(&per_cpu(perf_cgroup_events, cpu)); 8802 } 8803 8804 /* Freq events need the tick to stay alive (see perf_event_task_tick). */ 8805 static void account_freq_event_nohz(void) 8806 { 8807 #ifdef CONFIG_NO_HZ_FULL 8808 /* Lock so we don't race with concurrent unaccount */ 8809 spin_lock(&nr_freq_lock); 8810 if (atomic_inc_return(&nr_freq_events) == 1) 8811 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); 8812 spin_unlock(&nr_freq_lock); 8813 #endif 8814 } 8815 8816 static void account_freq_event(void) 8817 { 8818 if (tick_nohz_full_enabled()) 8819 account_freq_event_nohz(); 8820 else 8821 atomic_inc(&nr_freq_events); 8822 } 8823 8824 8825 static void account_event(struct perf_event *event) 8826 { 8827 bool inc = false; 8828 8829 if (event->parent) 8830 return; 8831 8832 if (event->attach_state & PERF_ATTACH_TASK) 8833 inc = true; 8834 if (event->attr.mmap || event->attr.mmap_data) 8835 atomic_inc(&nr_mmap_events); 8836 if (event->attr.comm) 8837 atomic_inc(&nr_comm_events); 8838 if (event->attr.task) 8839 atomic_inc(&nr_task_events); 8840 if (event->attr.freq) 8841 account_freq_event(); 8842 if (event->attr.context_switch) { 8843 atomic_inc(&nr_switch_events); 8844 inc = true; 8845 } 8846 if (has_branch_stack(event)) 8847 inc = true; 8848 if (is_cgroup_event(event)) 8849 inc = true; 8850 8851 if (inc) { 8852 if (atomic_inc_not_zero(&perf_sched_count)) 8853 goto enabled; 8854 8855 mutex_lock(&perf_sched_mutex); 8856 if (!atomic_read(&perf_sched_count)) { 8857 static_branch_enable(&perf_sched_events); 8858 /* 8859 * Guarantee that all CPUs observe they key change and 8860 * call the perf scheduling hooks before proceeding to 8861 * install events that need them. 8862 */ 8863 synchronize_sched(); 8864 } 8865 /* 8866 * Now that we have waited for the sync_sched(), allow further 8867 * increments to by-pass the mutex. 8868 */ 8869 atomic_inc(&perf_sched_count); 8870 mutex_unlock(&perf_sched_mutex); 8871 } 8872 enabled: 8873 8874 account_event_cpu(event, event->cpu); 8875 8876 account_pmu_sb_event(event); 8877 } 8878 8879 /* 8880 * Allocate and initialize a event structure 8881 */ 8882 static struct perf_event * 8883 perf_event_alloc(struct perf_event_attr *attr, int cpu, 8884 struct task_struct *task, 8885 struct perf_event *group_leader, 8886 struct perf_event *parent_event, 8887 perf_overflow_handler_t overflow_handler, 8888 void *context, int cgroup_fd) 8889 { 8890 struct pmu *pmu; 8891 struct perf_event *event; 8892 struct hw_perf_event *hwc; 8893 long err = -EINVAL; 8894 8895 if ((unsigned)cpu >= nr_cpu_ids) { 8896 if (!task || cpu != -1) 8897 return ERR_PTR(-EINVAL); 8898 } 8899 8900 event = kzalloc(sizeof(*event), GFP_KERNEL); 8901 if (!event) 8902 return ERR_PTR(-ENOMEM); 8903 8904 /* 8905 * Single events are their own group leaders, with an 8906 * empty sibling list: 8907 */ 8908 if (!group_leader) 8909 group_leader = event; 8910 8911 mutex_init(&event->child_mutex); 8912 INIT_LIST_HEAD(&event->child_list); 8913 8914 INIT_LIST_HEAD(&event->group_entry); 8915 INIT_LIST_HEAD(&event->event_entry); 8916 INIT_LIST_HEAD(&event->sibling_list); 8917 INIT_LIST_HEAD(&event->rb_entry); 8918 INIT_LIST_HEAD(&event->active_entry); 8919 INIT_LIST_HEAD(&event->addr_filters.list); 8920 INIT_HLIST_NODE(&event->hlist_entry); 8921 8922 8923 init_waitqueue_head(&event->waitq); 8924 init_irq_work(&event->pending, perf_pending_event); 8925 8926 mutex_init(&event->mmap_mutex); 8927 raw_spin_lock_init(&event->addr_filters.lock); 8928 8929 atomic_long_set(&event->refcount, 1); 8930 event->cpu = cpu; 8931 event->attr = *attr; 8932 event->group_leader = group_leader; 8933 event->pmu = NULL; 8934 event->oncpu = -1; 8935 8936 event->parent = parent_event; 8937 8938 event->ns = get_pid_ns(task_active_pid_ns(current)); 8939 event->id = atomic64_inc_return(&perf_event_id); 8940 8941 event->state = PERF_EVENT_STATE_INACTIVE; 8942 8943 if (task) { 8944 event->attach_state = PERF_ATTACH_TASK; 8945 /* 8946 * XXX pmu::event_init needs to know what task to account to 8947 * and we cannot use the ctx information because we need the 8948 * pmu before we get a ctx. 8949 */ 8950 event->hw.target = task; 8951 } 8952 8953 event->clock = &local_clock; 8954 if (parent_event) 8955 event->clock = parent_event->clock; 8956 8957 if (!overflow_handler && parent_event) { 8958 overflow_handler = parent_event->overflow_handler; 8959 context = parent_event->overflow_handler_context; 8960 } 8961 8962 if (overflow_handler) { 8963 event->overflow_handler = overflow_handler; 8964 event->overflow_handler_context = context; 8965 } else if (is_write_backward(event)){ 8966 event->overflow_handler = perf_event_output_backward; 8967 event->overflow_handler_context = NULL; 8968 } else { 8969 event->overflow_handler = perf_event_output_forward; 8970 event->overflow_handler_context = NULL; 8971 } 8972 8973 perf_event__state_init(event); 8974 8975 pmu = NULL; 8976 8977 hwc = &event->hw; 8978 hwc->sample_period = attr->sample_period; 8979 if (attr->freq && attr->sample_freq) 8980 hwc->sample_period = 1; 8981 hwc->last_period = hwc->sample_period; 8982 8983 local64_set(&hwc->period_left, hwc->sample_period); 8984 8985 /* 8986 * we currently do not support PERF_FORMAT_GROUP on inherited events 8987 */ 8988 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 8989 goto err_ns; 8990 8991 if (!has_branch_stack(event)) 8992 event->attr.branch_sample_type = 0; 8993 8994 if (cgroup_fd != -1) { 8995 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); 8996 if (err) 8997 goto err_ns; 8998 } 8999 9000 pmu = perf_init_event(event); 9001 if (!pmu) 9002 goto err_ns; 9003 else if (IS_ERR(pmu)) { 9004 err = PTR_ERR(pmu); 9005 goto err_ns; 9006 } 9007 9008 err = exclusive_event_init(event); 9009 if (err) 9010 goto err_pmu; 9011 9012 if (has_addr_filter(event)) { 9013 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, 9014 sizeof(unsigned long), 9015 GFP_KERNEL); 9016 if (!event->addr_filters_offs) 9017 goto err_per_task; 9018 9019 /* force hw sync on the address filters */ 9020 event->addr_filters_gen = 1; 9021 } 9022 9023 if (!event->parent) { 9024 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 9025 err = get_callchain_buffers(attr->sample_max_stack); 9026 if (err) 9027 goto err_addr_filters; 9028 } 9029 } 9030 9031 /* symmetric to unaccount_event() in _free_event() */ 9032 account_event(event); 9033 9034 return event; 9035 9036 err_addr_filters: 9037 kfree(event->addr_filters_offs); 9038 9039 err_per_task: 9040 exclusive_event_destroy(event); 9041 9042 err_pmu: 9043 if (event->destroy) 9044 event->destroy(event); 9045 module_put(pmu->module); 9046 err_ns: 9047 if (is_cgroup_event(event)) 9048 perf_detach_cgroup(event); 9049 if (event->ns) 9050 put_pid_ns(event->ns); 9051 kfree(event); 9052 9053 return ERR_PTR(err); 9054 } 9055 9056 static int perf_copy_attr(struct perf_event_attr __user *uattr, 9057 struct perf_event_attr *attr) 9058 { 9059 u32 size; 9060 int ret; 9061 9062 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) 9063 return -EFAULT; 9064 9065 /* 9066 * zero the full structure, so that a short copy will be nice. 9067 */ 9068 memset(attr, 0, sizeof(*attr)); 9069 9070 ret = get_user(size, &uattr->size); 9071 if (ret) 9072 return ret; 9073 9074 if (size > PAGE_SIZE) /* silly large */ 9075 goto err_size; 9076 9077 if (!size) /* abi compat */ 9078 size = PERF_ATTR_SIZE_VER0; 9079 9080 if (size < PERF_ATTR_SIZE_VER0) 9081 goto err_size; 9082 9083 /* 9084 * If we're handed a bigger struct than we know of, 9085 * ensure all the unknown bits are 0 - i.e. new 9086 * user-space does not rely on any kernel feature 9087 * extensions we dont know about yet. 9088 */ 9089 if (size > sizeof(*attr)) { 9090 unsigned char __user *addr; 9091 unsigned char __user *end; 9092 unsigned char val; 9093 9094 addr = (void __user *)uattr + sizeof(*attr); 9095 end = (void __user *)uattr + size; 9096 9097 for (; addr < end; addr++) { 9098 ret = get_user(val, addr); 9099 if (ret) 9100 return ret; 9101 if (val) 9102 goto err_size; 9103 } 9104 size = sizeof(*attr); 9105 } 9106 9107 ret = copy_from_user(attr, uattr, size); 9108 if (ret) 9109 return -EFAULT; 9110 9111 if (attr->__reserved_1) 9112 return -EINVAL; 9113 9114 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 9115 return -EINVAL; 9116 9117 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 9118 return -EINVAL; 9119 9120 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { 9121 u64 mask = attr->branch_sample_type; 9122 9123 /* only using defined bits */ 9124 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) 9125 return -EINVAL; 9126 9127 /* at least one branch bit must be set */ 9128 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) 9129 return -EINVAL; 9130 9131 /* propagate priv level, when not set for branch */ 9132 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { 9133 9134 /* exclude_kernel checked on syscall entry */ 9135 if (!attr->exclude_kernel) 9136 mask |= PERF_SAMPLE_BRANCH_KERNEL; 9137 9138 if (!attr->exclude_user) 9139 mask |= PERF_SAMPLE_BRANCH_USER; 9140 9141 if (!attr->exclude_hv) 9142 mask |= PERF_SAMPLE_BRANCH_HV; 9143 /* 9144 * adjust user setting (for HW filter setup) 9145 */ 9146 attr->branch_sample_type = mask; 9147 } 9148 /* privileged levels capture (kernel, hv): check permissions */ 9149 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) 9150 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 9151 return -EACCES; 9152 } 9153 9154 if (attr->sample_type & PERF_SAMPLE_REGS_USER) { 9155 ret = perf_reg_validate(attr->sample_regs_user); 9156 if (ret) 9157 return ret; 9158 } 9159 9160 if (attr->sample_type & PERF_SAMPLE_STACK_USER) { 9161 if (!arch_perf_have_user_stack_dump()) 9162 return -ENOSYS; 9163 9164 /* 9165 * We have __u32 type for the size, but so far 9166 * we can only use __u16 as maximum due to the 9167 * __u16 sample size limit. 9168 */ 9169 if (attr->sample_stack_user >= USHRT_MAX) 9170 ret = -EINVAL; 9171 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) 9172 ret = -EINVAL; 9173 } 9174 9175 if (attr->sample_type & PERF_SAMPLE_REGS_INTR) 9176 ret = perf_reg_validate(attr->sample_regs_intr); 9177 out: 9178 return ret; 9179 9180 err_size: 9181 put_user(sizeof(*attr), &uattr->size); 9182 ret = -E2BIG; 9183 goto out; 9184 } 9185 9186 static int 9187 perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 9188 { 9189 struct ring_buffer *rb = NULL; 9190 int ret = -EINVAL; 9191 9192 if (!output_event) 9193 goto set; 9194 9195 /* don't allow circular references */ 9196 if (event == output_event) 9197 goto out; 9198 9199 /* 9200 * Don't allow cross-cpu buffers 9201 */ 9202 if (output_event->cpu != event->cpu) 9203 goto out; 9204 9205 /* 9206 * If its not a per-cpu rb, it must be the same task. 9207 */ 9208 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 9209 goto out; 9210 9211 /* 9212 * Mixing clocks in the same buffer is trouble you don't need. 9213 */ 9214 if (output_event->clock != event->clock) 9215 goto out; 9216 9217 /* 9218 * Either writing ring buffer from beginning or from end. 9219 * Mixing is not allowed. 9220 */ 9221 if (is_write_backward(output_event) != is_write_backward(event)) 9222 goto out; 9223 9224 /* 9225 * If both events generate aux data, they must be on the same PMU 9226 */ 9227 if (has_aux(event) && has_aux(output_event) && 9228 event->pmu != output_event->pmu) 9229 goto out; 9230 9231 set: 9232 mutex_lock(&event->mmap_mutex); 9233 /* Can't redirect output if we've got an active mmap() */ 9234 if (atomic_read(&event->mmap_count)) 9235 goto unlock; 9236 9237 if (output_event) { 9238 /* get the rb we want to redirect to */ 9239 rb = ring_buffer_get(output_event); 9240 if (!rb) 9241 goto unlock; 9242 } 9243 9244 ring_buffer_attach(event, rb); 9245 9246 ret = 0; 9247 unlock: 9248 mutex_unlock(&event->mmap_mutex); 9249 9250 out: 9251 return ret; 9252 } 9253 9254 static void mutex_lock_double(struct mutex *a, struct mutex *b) 9255 { 9256 if (b < a) 9257 swap(a, b); 9258 9259 mutex_lock(a); 9260 mutex_lock_nested(b, SINGLE_DEPTH_NESTING); 9261 } 9262 9263 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) 9264 { 9265 bool nmi_safe = false; 9266 9267 switch (clk_id) { 9268 case CLOCK_MONOTONIC: 9269 event->clock = &ktime_get_mono_fast_ns; 9270 nmi_safe = true; 9271 break; 9272 9273 case CLOCK_MONOTONIC_RAW: 9274 event->clock = &ktime_get_raw_fast_ns; 9275 nmi_safe = true; 9276 break; 9277 9278 case CLOCK_REALTIME: 9279 event->clock = &ktime_get_real_ns; 9280 break; 9281 9282 case CLOCK_BOOTTIME: 9283 event->clock = &ktime_get_boot_ns; 9284 break; 9285 9286 case CLOCK_TAI: 9287 event->clock = &ktime_get_tai_ns; 9288 break; 9289 9290 default: 9291 return -EINVAL; 9292 } 9293 9294 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) 9295 return -EINVAL; 9296 9297 return 0; 9298 } 9299 9300 /** 9301 * sys_perf_event_open - open a performance event, associate it to a task/cpu 9302 * 9303 * @attr_uptr: event_id type attributes for monitoring/sampling 9304 * @pid: target pid 9305 * @cpu: target cpu 9306 * @group_fd: group leader event fd 9307 */ 9308 SYSCALL_DEFINE5(perf_event_open, 9309 struct perf_event_attr __user *, attr_uptr, 9310 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 9311 { 9312 struct perf_event *group_leader = NULL, *output_event = NULL; 9313 struct perf_event *event, *sibling; 9314 struct perf_event_attr attr; 9315 struct perf_event_context *ctx, *uninitialized_var(gctx); 9316 struct file *event_file = NULL; 9317 struct fd group = {NULL, 0}; 9318 struct task_struct *task = NULL; 9319 struct pmu *pmu; 9320 int event_fd; 9321 int move_group = 0; 9322 int err; 9323 int f_flags = O_RDWR; 9324 int cgroup_fd = -1; 9325 9326 /* for future expandability... */ 9327 if (flags & ~PERF_FLAG_ALL) 9328 return -EINVAL; 9329 9330 err = perf_copy_attr(attr_uptr, &attr); 9331 if (err) 9332 return err; 9333 9334 if (!attr.exclude_kernel) { 9335 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 9336 return -EACCES; 9337 } 9338 9339 if (attr.freq) { 9340 if (attr.sample_freq > sysctl_perf_event_sample_rate) 9341 return -EINVAL; 9342 } else { 9343 if (attr.sample_period & (1ULL << 63)) 9344 return -EINVAL; 9345 } 9346 9347 if (!attr.sample_max_stack) 9348 attr.sample_max_stack = sysctl_perf_event_max_stack; 9349 9350 /* 9351 * In cgroup mode, the pid argument is used to pass the fd 9352 * opened to the cgroup directory in cgroupfs. The cpu argument 9353 * designates the cpu on which to monitor threads from that 9354 * cgroup. 9355 */ 9356 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 9357 return -EINVAL; 9358 9359 if (flags & PERF_FLAG_FD_CLOEXEC) 9360 f_flags |= O_CLOEXEC; 9361 9362 event_fd = get_unused_fd_flags(f_flags); 9363 if (event_fd < 0) 9364 return event_fd; 9365 9366 if (group_fd != -1) { 9367 err = perf_fget_light(group_fd, &group); 9368 if (err) 9369 goto err_fd; 9370 group_leader = group.file->private_data; 9371 if (flags & PERF_FLAG_FD_OUTPUT) 9372 output_event = group_leader; 9373 if (flags & PERF_FLAG_FD_NO_GROUP) 9374 group_leader = NULL; 9375 } 9376 9377 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { 9378 task = find_lively_task_by_vpid(pid); 9379 if (IS_ERR(task)) { 9380 err = PTR_ERR(task); 9381 goto err_group_fd; 9382 } 9383 } 9384 9385 if (task && group_leader && 9386 group_leader->attr.inherit != attr.inherit) { 9387 err = -EINVAL; 9388 goto err_task; 9389 } 9390 9391 get_online_cpus(); 9392 9393 if (task) { 9394 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); 9395 if (err) 9396 goto err_cpus; 9397 9398 /* 9399 * Reuse ptrace permission checks for now. 9400 * 9401 * We must hold cred_guard_mutex across this and any potential 9402 * perf_install_in_context() call for this new event to 9403 * serialize against exec() altering our credentials (and the 9404 * perf_event_exit_task() that could imply). 9405 */ 9406 err = -EACCES; 9407 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) 9408 goto err_cred; 9409 } 9410 9411 if (flags & PERF_FLAG_PID_CGROUP) 9412 cgroup_fd = pid; 9413 9414 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 9415 NULL, NULL, cgroup_fd); 9416 if (IS_ERR(event)) { 9417 err = PTR_ERR(event); 9418 goto err_cred; 9419 } 9420 9421 if (is_sampling_event(event)) { 9422 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { 9423 err = -EOPNOTSUPP; 9424 goto err_alloc; 9425 } 9426 } 9427 9428 /* 9429 * Special case software events and allow them to be part of 9430 * any hardware group. 9431 */ 9432 pmu = event->pmu; 9433 9434 if (attr.use_clockid) { 9435 err = perf_event_set_clock(event, attr.clockid); 9436 if (err) 9437 goto err_alloc; 9438 } 9439 9440 if (group_leader && 9441 (is_software_event(event) != is_software_event(group_leader))) { 9442 if (is_software_event(event)) { 9443 /* 9444 * If event and group_leader are not both a software 9445 * event, and event is, then group leader is not. 9446 * 9447 * Allow the addition of software events to !software 9448 * groups, this is safe because software events never 9449 * fail to schedule. 9450 */ 9451 pmu = group_leader->pmu; 9452 } else if (is_software_event(group_leader) && 9453 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { 9454 /* 9455 * In case the group is a pure software group, and we 9456 * try to add a hardware event, move the whole group to 9457 * the hardware context. 9458 */ 9459 move_group = 1; 9460 } 9461 } 9462 9463 /* 9464 * Get the target context (task or percpu): 9465 */ 9466 ctx = find_get_context(pmu, task, event); 9467 if (IS_ERR(ctx)) { 9468 err = PTR_ERR(ctx); 9469 goto err_alloc; 9470 } 9471 9472 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { 9473 err = -EBUSY; 9474 goto err_context; 9475 } 9476 9477 /* 9478 * Look up the group leader (we will attach this event to it): 9479 */ 9480 if (group_leader) { 9481 err = -EINVAL; 9482 9483 /* 9484 * Do not allow a recursive hierarchy (this new sibling 9485 * becoming part of another group-sibling): 9486 */ 9487 if (group_leader->group_leader != group_leader) 9488 goto err_context; 9489 9490 /* All events in a group should have the same clock */ 9491 if (group_leader->clock != event->clock) 9492 goto err_context; 9493 9494 /* 9495 * Do not allow to attach to a group in a different 9496 * task or CPU context: 9497 */ 9498 if (move_group) { 9499 /* 9500 * Make sure we're both on the same task, or both 9501 * per-cpu events. 9502 */ 9503 if (group_leader->ctx->task != ctx->task) 9504 goto err_context; 9505 9506 /* 9507 * Make sure we're both events for the same CPU; 9508 * grouping events for different CPUs is broken; since 9509 * you can never concurrently schedule them anyhow. 9510 */ 9511 if (group_leader->cpu != event->cpu) 9512 goto err_context; 9513 } else { 9514 if (group_leader->ctx != ctx) 9515 goto err_context; 9516 } 9517 9518 /* 9519 * Only a group leader can be exclusive or pinned 9520 */ 9521 if (attr.exclusive || attr.pinned) 9522 goto err_context; 9523 } 9524 9525 if (output_event) { 9526 err = perf_event_set_output(event, output_event); 9527 if (err) 9528 goto err_context; 9529 } 9530 9531 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, 9532 f_flags); 9533 if (IS_ERR(event_file)) { 9534 err = PTR_ERR(event_file); 9535 event_file = NULL; 9536 goto err_context; 9537 } 9538 9539 if (move_group) { 9540 gctx = group_leader->ctx; 9541 mutex_lock_double(&gctx->mutex, &ctx->mutex); 9542 if (gctx->task == TASK_TOMBSTONE) { 9543 err = -ESRCH; 9544 goto err_locked; 9545 } 9546 } else { 9547 mutex_lock(&ctx->mutex); 9548 } 9549 9550 if (ctx->task == TASK_TOMBSTONE) { 9551 err = -ESRCH; 9552 goto err_locked; 9553 } 9554 9555 if (!perf_event_validate_size(event)) { 9556 err = -E2BIG; 9557 goto err_locked; 9558 } 9559 9560 /* 9561 * Must be under the same ctx::mutex as perf_install_in_context(), 9562 * because we need to serialize with concurrent event creation. 9563 */ 9564 if (!exclusive_event_installable(event, ctx)) { 9565 /* exclusive and group stuff are assumed mutually exclusive */ 9566 WARN_ON_ONCE(move_group); 9567 9568 err = -EBUSY; 9569 goto err_locked; 9570 } 9571 9572 WARN_ON_ONCE(ctx->parent_ctx); 9573 9574 /* 9575 * This is the point on no return; we cannot fail hereafter. This is 9576 * where we start modifying current state. 9577 */ 9578 9579 if (move_group) { 9580 /* 9581 * See perf_event_ctx_lock() for comments on the details 9582 * of swizzling perf_event::ctx. 9583 */ 9584 perf_remove_from_context(group_leader, 0); 9585 9586 list_for_each_entry(sibling, &group_leader->sibling_list, 9587 group_entry) { 9588 perf_remove_from_context(sibling, 0); 9589 put_ctx(gctx); 9590 } 9591 9592 /* 9593 * Wait for everybody to stop referencing the events through 9594 * the old lists, before installing it on new lists. 9595 */ 9596 synchronize_rcu(); 9597 9598 /* 9599 * Install the group siblings before the group leader. 9600 * 9601 * Because a group leader will try and install the entire group 9602 * (through the sibling list, which is still in-tact), we can 9603 * end up with siblings installed in the wrong context. 9604 * 9605 * By installing siblings first we NO-OP because they're not 9606 * reachable through the group lists. 9607 */ 9608 list_for_each_entry(sibling, &group_leader->sibling_list, 9609 group_entry) { 9610 perf_event__state_init(sibling); 9611 perf_install_in_context(ctx, sibling, sibling->cpu); 9612 get_ctx(ctx); 9613 } 9614 9615 /* 9616 * Removing from the context ends up with disabled 9617 * event. What we want here is event in the initial 9618 * startup state, ready to be add into new context. 9619 */ 9620 perf_event__state_init(group_leader); 9621 perf_install_in_context(ctx, group_leader, group_leader->cpu); 9622 get_ctx(ctx); 9623 9624 /* 9625 * Now that all events are installed in @ctx, nothing 9626 * references @gctx anymore, so drop the last reference we have 9627 * on it. 9628 */ 9629 put_ctx(gctx); 9630 } 9631 9632 /* 9633 * Precalculate sample_data sizes; do while holding ctx::mutex such 9634 * that we're serialized against further additions and before 9635 * perf_install_in_context() which is the point the event is active and 9636 * can use these values. 9637 */ 9638 perf_event__header_size(event); 9639 perf_event__id_header_size(event); 9640 9641 event->owner = current; 9642 9643 perf_install_in_context(ctx, event, event->cpu); 9644 perf_unpin_context(ctx); 9645 9646 if (move_group) 9647 mutex_unlock(&gctx->mutex); 9648 mutex_unlock(&ctx->mutex); 9649 9650 if (task) { 9651 mutex_unlock(&task->signal->cred_guard_mutex); 9652 put_task_struct(task); 9653 } 9654 9655 put_online_cpus(); 9656 9657 mutex_lock(¤t->perf_event_mutex); 9658 list_add_tail(&event->owner_entry, ¤t->perf_event_list); 9659 mutex_unlock(¤t->perf_event_mutex); 9660 9661 /* 9662 * Drop the reference on the group_event after placing the 9663 * new event on the sibling_list. This ensures destruction 9664 * of the group leader will find the pointer to itself in 9665 * perf_group_detach(). 9666 */ 9667 fdput(group); 9668 fd_install(event_fd, event_file); 9669 return event_fd; 9670 9671 err_locked: 9672 if (move_group) 9673 mutex_unlock(&gctx->mutex); 9674 mutex_unlock(&ctx->mutex); 9675 /* err_file: */ 9676 fput(event_file); 9677 err_context: 9678 perf_unpin_context(ctx); 9679 put_ctx(ctx); 9680 err_alloc: 9681 /* 9682 * If event_file is set, the fput() above will have called ->release() 9683 * and that will take care of freeing the event. 9684 */ 9685 if (!event_file) 9686 free_event(event); 9687 err_cred: 9688 if (task) 9689 mutex_unlock(&task->signal->cred_guard_mutex); 9690 err_cpus: 9691 put_online_cpus(); 9692 err_task: 9693 if (task) 9694 put_task_struct(task); 9695 err_group_fd: 9696 fdput(group); 9697 err_fd: 9698 put_unused_fd(event_fd); 9699 return err; 9700 } 9701 9702 /** 9703 * perf_event_create_kernel_counter 9704 * 9705 * @attr: attributes of the counter to create 9706 * @cpu: cpu in which the counter is bound 9707 * @task: task to profile (NULL for percpu) 9708 */ 9709 struct perf_event * 9710 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 9711 struct task_struct *task, 9712 perf_overflow_handler_t overflow_handler, 9713 void *context) 9714 { 9715 struct perf_event_context *ctx; 9716 struct perf_event *event; 9717 int err; 9718 9719 /* 9720 * Get the target context (task or percpu): 9721 */ 9722 9723 event = perf_event_alloc(attr, cpu, task, NULL, NULL, 9724 overflow_handler, context, -1); 9725 if (IS_ERR(event)) { 9726 err = PTR_ERR(event); 9727 goto err; 9728 } 9729 9730 /* Mark owner so we could distinguish it from user events. */ 9731 event->owner = TASK_TOMBSTONE; 9732 9733 ctx = find_get_context(event->pmu, task, event); 9734 if (IS_ERR(ctx)) { 9735 err = PTR_ERR(ctx); 9736 goto err_free; 9737 } 9738 9739 WARN_ON_ONCE(ctx->parent_ctx); 9740 mutex_lock(&ctx->mutex); 9741 if (ctx->task == TASK_TOMBSTONE) { 9742 err = -ESRCH; 9743 goto err_unlock; 9744 } 9745 9746 if (!exclusive_event_installable(event, ctx)) { 9747 err = -EBUSY; 9748 goto err_unlock; 9749 } 9750 9751 perf_install_in_context(ctx, event, cpu); 9752 perf_unpin_context(ctx); 9753 mutex_unlock(&ctx->mutex); 9754 9755 return event; 9756 9757 err_unlock: 9758 mutex_unlock(&ctx->mutex); 9759 perf_unpin_context(ctx); 9760 put_ctx(ctx); 9761 err_free: 9762 free_event(event); 9763 err: 9764 return ERR_PTR(err); 9765 } 9766 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 9767 9768 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) 9769 { 9770 struct perf_event_context *src_ctx; 9771 struct perf_event_context *dst_ctx; 9772 struct perf_event *event, *tmp; 9773 LIST_HEAD(events); 9774 9775 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; 9776 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; 9777 9778 /* 9779 * See perf_event_ctx_lock() for comments on the details 9780 * of swizzling perf_event::ctx. 9781 */ 9782 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); 9783 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 9784 event_entry) { 9785 perf_remove_from_context(event, 0); 9786 unaccount_event_cpu(event, src_cpu); 9787 put_ctx(src_ctx); 9788 list_add(&event->migrate_entry, &events); 9789 } 9790 9791 /* 9792 * Wait for the events to quiesce before re-instating them. 9793 */ 9794 synchronize_rcu(); 9795 9796 /* 9797 * Re-instate events in 2 passes. 9798 * 9799 * Skip over group leaders and only install siblings on this first 9800 * pass, siblings will not get enabled without a leader, however a 9801 * leader will enable its siblings, even if those are still on the old 9802 * context. 9803 */ 9804 list_for_each_entry_safe(event, tmp, &events, migrate_entry) { 9805 if (event->group_leader == event) 9806 continue; 9807 9808 list_del(&event->migrate_entry); 9809 if (event->state >= PERF_EVENT_STATE_OFF) 9810 event->state = PERF_EVENT_STATE_INACTIVE; 9811 account_event_cpu(event, dst_cpu); 9812 perf_install_in_context(dst_ctx, event, dst_cpu); 9813 get_ctx(dst_ctx); 9814 } 9815 9816 /* 9817 * Once all the siblings are setup properly, install the group leaders 9818 * to make it go. 9819 */ 9820 list_for_each_entry_safe(event, tmp, &events, migrate_entry) { 9821 list_del(&event->migrate_entry); 9822 if (event->state >= PERF_EVENT_STATE_OFF) 9823 event->state = PERF_EVENT_STATE_INACTIVE; 9824 account_event_cpu(event, dst_cpu); 9825 perf_install_in_context(dst_ctx, event, dst_cpu); 9826 get_ctx(dst_ctx); 9827 } 9828 mutex_unlock(&dst_ctx->mutex); 9829 mutex_unlock(&src_ctx->mutex); 9830 } 9831 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); 9832 9833 static void sync_child_event(struct perf_event *child_event, 9834 struct task_struct *child) 9835 { 9836 struct perf_event *parent_event = child_event->parent; 9837 u64 child_val; 9838 9839 if (child_event->attr.inherit_stat) 9840 perf_event_read_event(child_event, child); 9841 9842 child_val = perf_event_count(child_event); 9843 9844 /* 9845 * Add back the child's count to the parent's count: 9846 */ 9847 atomic64_add(child_val, &parent_event->child_count); 9848 atomic64_add(child_event->total_time_enabled, 9849 &parent_event->child_total_time_enabled); 9850 atomic64_add(child_event->total_time_running, 9851 &parent_event->child_total_time_running); 9852 } 9853 9854 static void 9855 perf_event_exit_event(struct perf_event *child_event, 9856 struct perf_event_context *child_ctx, 9857 struct task_struct *child) 9858 { 9859 struct perf_event *parent_event = child_event->parent; 9860 9861 /* 9862 * Do not destroy the 'original' grouping; because of the context 9863 * switch optimization the original events could've ended up in a 9864 * random child task. 9865 * 9866 * If we were to destroy the original group, all group related 9867 * operations would cease to function properly after this random 9868 * child dies. 9869 * 9870 * Do destroy all inherited groups, we don't care about those 9871 * and being thorough is better. 9872 */ 9873 raw_spin_lock_irq(&child_ctx->lock); 9874 WARN_ON_ONCE(child_ctx->is_active); 9875 9876 if (parent_event) 9877 perf_group_detach(child_event); 9878 list_del_event(child_event, child_ctx); 9879 child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ 9880 raw_spin_unlock_irq(&child_ctx->lock); 9881 9882 /* 9883 * Parent events are governed by their filedesc, retain them. 9884 */ 9885 if (!parent_event) { 9886 perf_event_wakeup(child_event); 9887 return; 9888 } 9889 /* 9890 * Child events can be cleaned up. 9891 */ 9892 9893 sync_child_event(child_event, child); 9894 9895 /* 9896 * Remove this event from the parent's list 9897 */ 9898 WARN_ON_ONCE(parent_event->ctx->parent_ctx); 9899 mutex_lock(&parent_event->child_mutex); 9900 list_del_init(&child_event->child_list); 9901 mutex_unlock(&parent_event->child_mutex); 9902 9903 /* 9904 * Kick perf_poll() for is_event_hup(). 9905 */ 9906 perf_event_wakeup(parent_event); 9907 free_event(child_event); 9908 put_event(parent_event); 9909 } 9910 9911 static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 9912 { 9913 struct perf_event_context *child_ctx, *clone_ctx = NULL; 9914 struct perf_event *child_event, *next; 9915 9916 WARN_ON_ONCE(child != current); 9917 9918 child_ctx = perf_pin_task_context(child, ctxn); 9919 if (!child_ctx) 9920 return; 9921 9922 /* 9923 * In order to reduce the amount of tricky in ctx tear-down, we hold 9924 * ctx::mutex over the entire thing. This serializes against almost 9925 * everything that wants to access the ctx. 9926 * 9927 * The exception is sys_perf_event_open() / 9928 * perf_event_create_kernel_count() which does find_get_context() 9929 * without ctx::mutex (it cannot because of the move_group double mutex 9930 * lock thing). See the comments in perf_install_in_context(). 9931 */ 9932 mutex_lock(&child_ctx->mutex); 9933 9934 /* 9935 * In a single ctx::lock section, de-schedule the events and detach the 9936 * context from the task such that we cannot ever get it scheduled back 9937 * in. 9938 */ 9939 raw_spin_lock_irq(&child_ctx->lock); 9940 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); 9941 9942 /* 9943 * Now that the context is inactive, destroy the task <-> ctx relation 9944 * and mark the context dead. 9945 */ 9946 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); 9947 put_ctx(child_ctx); /* cannot be last */ 9948 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); 9949 put_task_struct(current); /* cannot be last */ 9950 9951 clone_ctx = unclone_ctx(child_ctx); 9952 raw_spin_unlock_irq(&child_ctx->lock); 9953 9954 if (clone_ctx) 9955 put_ctx(clone_ctx); 9956 9957 /* 9958 * Report the task dead after unscheduling the events so that we 9959 * won't get any samples after PERF_RECORD_EXIT. We can however still 9960 * get a few PERF_RECORD_READ events. 9961 */ 9962 perf_event_task(child, child_ctx, 0); 9963 9964 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) 9965 perf_event_exit_event(child_event, child_ctx, child); 9966 9967 mutex_unlock(&child_ctx->mutex); 9968 9969 put_ctx(child_ctx); 9970 } 9971 9972 /* 9973 * When a child task exits, feed back event values to parent events. 9974 * 9975 * Can be called with cred_guard_mutex held when called from 9976 * install_exec_creds(). 9977 */ 9978 void perf_event_exit_task(struct task_struct *child) 9979 { 9980 struct perf_event *event, *tmp; 9981 int ctxn; 9982 9983 mutex_lock(&child->perf_event_mutex); 9984 list_for_each_entry_safe(event, tmp, &child->perf_event_list, 9985 owner_entry) { 9986 list_del_init(&event->owner_entry); 9987 9988 /* 9989 * Ensure the list deletion is visible before we clear 9990 * the owner, closes a race against perf_release() where 9991 * we need to serialize on the owner->perf_event_mutex. 9992 */ 9993 smp_store_release(&event->owner, NULL); 9994 } 9995 mutex_unlock(&child->perf_event_mutex); 9996 9997 for_each_task_context_nr(ctxn) 9998 perf_event_exit_task_context(child, ctxn); 9999 10000 /* 10001 * The perf_event_exit_task_context calls perf_event_task 10002 * with child's task_ctx, which generates EXIT events for 10003 * child contexts and sets child->perf_event_ctxp[] to NULL. 10004 * At this point we need to send EXIT events to cpu contexts. 10005 */ 10006 perf_event_task(child, NULL, 0); 10007 } 10008 10009 static void perf_free_event(struct perf_event *event, 10010 struct perf_event_context *ctx) 10011 { 10012 struct perf_event *parent = event->parent; 10013 10014 if (WARN_ON_ONCE(!parent)) 10015 return; 10016 10017 mutex_lock(&parent->child_mutex); 10018 list_del_init(&event->child_list); 10019 mutex_unlock(&parent->child_mutex); 10020 10021 put_event(parent); 10022 10023 raw_spin_lock_irq(&ctx->lock); 10024 perf_group_detach(event); 10025 list_del_event(event, ctx); 10026 raw_spin_unlock_irq(&ctx->lock); 10027 free_event(event); 10028 } 10029 10030 /* 10031 * Free an unexposed, unused context as created by inheritance by 10032 * perf_event_init_task below, used by fork() in case of fail. 10033 * 10034 * Not all locks are strictly required, but take them anyway to be nice and 10035 * help out with the lockdep assertions. 10036 */ 10037 void perf_event_free_task(struct task_struct *task) 10038 { 10039 struct perf_event_context *ctx; 10040 struct perf_event *event, *tmp; 10041 int ctxn; 10042 10043 for_each_task_context_nr(ctxn) { 10044 ctx = task->perf_event_ctxp[ctxn]; 10045 if (!ctx) 10046 continue; 10047 10048 mutex_lock(&ctx->mutex); 10049 again: 10050 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, 10051 group_entry) 10052 perf_free_event(event, ctx); 10053 10054 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 10055 group_entry) 10056 perf_free_event(event, ctx); 10057 10058 if (!list_empty(&ctx->pinned_groups) || 10059 !list_empty(&ctx->flexible_groups)) 10060 goto again; 10061 10062 mutex_unlock(&ctx->mutex); 10063 10064 put_ctx(ctx); 10065 } 10066 } 10067 10068 void perf_event_delayed_put(struct task_struct *task) 10069 { 10070 int ctxn; 10071 10072 for_each_task_context_nr(ctxn) 10073 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); 10074 } 10075 10076 struct file *perf_event_get(unsigned int fd) 10077 { 10078 struct file *file; 10079 10080 file = fget_raw(fd); 10081 if (!file) 10082 return ERR_PTR(-EBADF); 10083 10084 if (file->f_op != &perf_fops) { 10085 fput(file); 10086 return ERR_PTR(-EBADF); 10087 } 10088 10089 return file; 10090 } 10091 10092 const struct perf_event_attr *perf_event_attrs(struct perf_event *event) 10093 { 10094 if (!event) 10095 return ERR_PTR(-EINVAL); 10096 10097 return &event->attr; 10098 } 10099 10100 /* 10101 * inherit a event from parent task to child task: 10102 */ 10103 static struct perf_event * 10104 inherit_event(struct perf_event *parent_event, 10105 struct task_struct *parent, 10106 struct perf_event_context *parent_ctx, 10107 struct task_struct *child, 10108 struct perf_event *group_leader, 10109 struct perf_event_context *child_ctx) 10110 { 10111 enum perf_event_active_state parent_state = parent_event->state; 10112 struct perf_event *child_event; 10113 unsigned long flags; 10114 10115 /* 10116 * Instead of creating recursive hierarchies of events, 10117 * we link inherited events back to the original parent, 10118 * which has a filp for sure, which we use as the reference 10119 * count: 10120 */ 10121 if (parent_event->parent) 10122 parent_event = parent_event->parent; 10123 10124 child_event = perf_event_alloc(&parent_event->attr, 10125 parent_event->cpu, 10126 child, 10127 group_leader, parent_event, 10128 NULL, NULL, -1); 10129 if (IS_ERR(child_event)) 10130 return child_event; 10131 10132 /* 10133 * is_orphaned_event() and list_add_tail(&parent_event->child_list) 10134 * must be under the same lock in order to serialize against 10135 * perf_event_release_kernel(), such that either we must observe 10136 * is_orphaned_event() or they will observe us on the child_list. 10137 */ 10138 mutex_lock(&parent_event->child_mutex); 10139 if (is_orphaned_event(parent_event) || 10140 !atomic_long_inc_not_zero(&parent_event->refcount)) { 10141 mutex_unlock(&parent_event->child_mutex); 10142 free_event(child_event); 10143 return NULL; 10144 } 10145 10146 get_ctx(child_ctx); 10147 10148 /* 10149 * Make the child state follow the state of the parent event, 10150 * not its attr.disabled bit. We hold the parent's mutex, 10151 * so we won't race with perf_event_{en, dis}able_family. 10152 */ 10153 if (parent_state >= PERF_EVENT_STATE_INACTIVE) 10154 child_event->state = PERF_EVENT_STATE_INACTIVE; 10155 else 10156 child_event->state = PERF_EVENT_STATE_OFF; 10157 10158 if (parent_event->attr.freq) { 10159 u64 sample_period = parent_event->hw.sample_period; 10160 struct hw_perf_event *hwc = &child_event->hw; 10161 10162 hwc->sample_period = sample_period; 10163 hwc->last_period = sample_period; 10164 10165 local64_set(&hwc->period_left, sample_period); 10166 } 10167 10168 child_event->ctx = child_ctx; 10169 child_event->overflow_handler = parent_event->overflow_handler; 10170 child_event->overflow_handler_context 10171 = parent_event->overflow_handler_context; 10172 10173 /* 10174 * Precalculate sample_data sizes 10175 */ 10176 perf_event__header_size(child_event); 10177 perf_event__id_header_size(child_event); 10178 10179 /* 10180 * Link it up in the child's context: 10181 */ 10182 raw_spin_lock_irqsave(&child_ctx->lock, flags); 10183 add_event_to_ctx(child_event, child_ctx); 10184 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 10185 10186 /* 10187 * Link this into the parent event's child list 10188 */ 10189 list_add_tail(&child_event->child_list, &parent_event->child_list); 10190 mutex_unlock(&parent_event->child_mutex); 10191 10192 return child_event; 10193 } 10194 10195 static int inherit_group(struct perf_event *parent_event, 10196 struct task_struct *parent, 10197 struct perf_event_context *parent_ctx, 10198 struct task_struct *child, 10199 struct perf_event_context *child_ctx) 10200 { 10201 struct perf_event *leader; 10202 struct perf_event *sub; 10203 struct perf_event *child_ctr; 10204 10205 leader = inherit_event(parent_event, parent, parent_ctx, 10206 child, NULL, child_ctx); 10207 if (IS_ERR(leader)) 10208 return PTR_ERR(leader); 10209 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { 10210 child_ctr = inherit_event(sub, parent, parent_ctx, 10211 child, leader, child_ctx); 10212 if (IS_ERR(child_ctr)) 10213 return PTR_ERR(child_ctr); 10214 } 10215 return 0; 10216 } 10217 10218 static int 10219 inherit_task_group(struct perf_event *event, struct task_struct *parent, 10220 struct perf_event_context *parent_ctx, 10221 struct task_struct *child, int ctxn, 10222 int *inherited_all) 10223 { 10224 int ret; 10225 struct perf_event_context *child_ctx; 10226 10227 if (!event->attr.inherit) { 10228 *inherited_all = 0; 10229 return 0; 10230 } 10231 10232 child_ctx = child->perf_event_ctxp[ctxn]; 10233 if (!child_ctx) { 10234 /* 10235 * This is executed from the parent task context, so 10236 * inherit events that have been marked for cloning. 10237 * First allocate and initialize a context for the 10238 * child. 10239 */ 10240 10241 child_ctx = alloc_perf_context(parent_ctx->pmu, child); 10242 if (!child_ctx) 10243 return -ENOMEM; 10244 10245 child->perf_event_ctxp[ctxn] = child_ctx; 10246 } 10247 10248 ret = inherit_group(event, parent, parent_ctx, 10249 child, child_ctx); 10250 10251 if (ret) 10252 *inherited_all = 0; 10253 10254 return ret; 10255 } 10256 10257 /* 10258 * Initialize the perf_event context in task_struct 10259 */ 10260 static int perf_event_init_context(struct task_struct *child, int ctxn) 10261 { 10262 struct perf_event_context *child_ctx, *parent_ctx; 10263 struct perf_event_context *cloned_ctx; 10264 struct perf_event *event; 10265 struct task_struct *parent = current; 10266 int inherited_all = 1; 10267 unsigned long flags; 10268 int ret = 0; 10269 10270 if (likely(!parent->perf_event_ctxp[ctxn])) 10271 return 0; 10272 10273 /* 10274 * If the parent's context is a clone, pin it so it won't get 10275 * swapped under us. 10276 */ 10277 parent_ctx = perf_pin_task_context(parent, ctxn); 10278 if (!parent_ctx) 10279 return 0; 10280 10281 /* 10282 * No need to check if parent_ctx != NULL here; since we saw 10283 * it non-NULL earlier, the only reason for it to become NULL 10284 * is if we exit, and since we're currently in the middle of 10285 * a fork we can't be exiting at the same time. 10286 */ 10287 10288 /* 10289 * Lock the parent list. No need to lock the child - not PID 10290 * hashed yet and not running, so nobody can access it. 10291 */ 10292 mutex_lock(&parent_ctx->mutex); 10293 10294 /* 10295 * We dont have to disable NMIs - we are only looking at 10296 * the list, not manipulating it: 10297 */ 10298 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 10299 ret = inherit_task_group(event, parent, parent_ctx, 10300 child, ctxn, &inherited_all); 10301 if (ret) 10302 break; 10303 } 10304 10305 /* 10306 * We can't hold ctx->lock when iterating the ->flexible_group list due 10307 * to allocations, but we need to prevent rotation because 10308 * rotate_ctx() will change the list from interrupt context. 10309 */ 10310 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 10311 parent_ctx->rotate_disable = 1; 10312 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 10313 10314 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 10315 ret = inherit_task_group(event, parent, parent_ctx, 10316 child, ctxn, &inherited_all); 10317 if (ret) 10318 break; 10319 } 10320 10321 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 10322 parent_ctx->rotate_disable = 0; 10323 10324 child_ctx = child->perf_event_ctxp[ctxn]; 10325 10326 if (child_ctx && inherited_all) { 10327 /* 10328 * Mark the child context as a clone of the parent 10329 * context, or of whatever the parent is a clone of. 10330 * 10331 * Note that if the parent is a clone, the holding of 10332 * parent_ctx->lock avoids it from being uncloned. 10333 */ 10334 cloned_ctx = parent_ctx->parent_ctx; 10335 if (cloned_ctx) { 10336 child_ctx->parent_ctx = cloned_ctx; 10337 child_ctx->parent_gen = parent_ctx->parent_gen; 10338 } else { 10339 child_ctx->parent_ctx = parent_ctx; 10340 child_ctx->parent_gen = parent_ctx->generation; 10341 } 10342 get_ctx(child_ctx->parent_ctx); 10343 } 10344 10345 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 10346 mutex_unlock(&parent_ctx->mutex); 10347 10348 perf_unpin_context(parent_ctx); 10349 put_ctx(parent_ctx); 10350 10351 return ret; 10352 } 10353 10354 /* 10355 * Initialize the perf_event context in task_struct 10356 */ 10357 int perf_event_init_task(struct task_struct *child) 10358 { 10359 int ctxn, ret; 10360 10361 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); 10362 mutex_init(&child->perf_event_mutex); 10363 INIT_LIST_HEAD(&child->perf_event_list); 10364 10365 for_each_task_context_nr(ctxn) { 10366 ret = perf_event_init_context(child, ctxn); 10367 if (ret) { 10368 perf_event_free_task(child); 10369 return ret; 10370 } 10371 } 10372 10373 return 0; 10374 } 10375 10376 static void __init perf_event_init_all_cpus(void) 10377 { 10378 struct swevent_htable *swhash; 10379 int cpu; 10380 10381 for_each_possible_cpu(cpu) { 10382 swhash = &per_cpu(swevent_htable, cpu); 10383 mutex_init(&swhash->hlist_mutex); 10384 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); 10385 10386 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); 10387 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); 10388 } 10389 } 10390 10391 int perf_event_init_cpu(unsigned int cpu) 10392 { 10393 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 10394 10395 mutex_lock(&swhash->hlist_mutex); 10396 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { 10397 struct swevent_hlist *hlist; 10398 10399 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); 10400 WARN_ON(!hlist); 10401 rcu_assign_pointer(swhash->swevent_hlist, hlist); 10402 } 10403 mutex_unlock(&swhash->hlist_mutex); 10404 return 0; 10405 } 10406 10407 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE 10408 static void __perf_event_exit_context(void *__info) 10409 { 10410 struct perf_event_context *ctx = __info; 10411 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 10412 struct perf_event *event; 10413 10414 raw_spin_lock(&ctx->lock); 10415 list_for_each_entry(event, &ctx->event_list, event_entry) 10416 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); 10417 raw_spin_unlock(&ctx->lock); 10418 } 10419 10420 static void perf_event_exit_cpu_context(int cpu) 10421 { 10422 struct perf_event_context *ctx; 10423 struct pmu *pmu; 10424 int idx; 10425 10426 idx = srcu_read_lock(&pmus_srcu); 10427 list_for_each_entry_rcu(pmu, &pmus, entry) { 10428 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; 10429 10430 mutex_lock(&ctx->mutex); 10431 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); 10432 mutex_unlock(&ctx->mutex); 10433 } 10434 srcu_read_unlock(&pmus_srcu, idx); 10435 } 10436 #else 10437 10438 static void perf_event_exit_cpu_context(int cpu) { } 10439 10440 #endif 10441 10442 int perf_event_exit_cpu(unsigned int cpu) 10443 { 10444 perf_event_exit_cpu_context(cpu); 10445 return 0; 10446 } 10447 10448 static int 10449 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) 10450 { 10451 int cpu; 10452 10453 for_each_online_cpu(cpu) 10454 perf_event_exit_cpu(cpu); 10455 10456 return NOTIFY_OK; 10457 } 10458 10459 /* 10460 * Run the perf reboot notifier at the very last possible moment so that 10461 * the generic watchdog code runs as long as possible. 10462 */ 10463 static struct notifier_block perf_reboot_notifier = { 10464 .notifier_call = perf_reboot, 10465 .priority = INT_MIN, 10466 }; 10467 10468 void __init perf_event_init(void) 10469 { 10470 int ret; 10471 10472 idr_init(&pmu_idr); 10473 10474 perf_event_init_all_cpus(); 10475 init_srcu_struct(&pmus_srcu); 10476 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); 10477 perf_pmu_register(&perf_cpu_clock, NULL, -1); 10478 perf_pmu_register(&perf_task_clock, NULL, -1); 10479 perf_tp_register(); 10480 perf_event_init_cpu(smp_processor_id()); 10481 register_reboot_notifier(&perf_reboot_notifier); 10482 10483 ret = init_hw_breakpoint(); 10484 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 10485 10486 /* 10487 * Build time assertion that we keep the data_head at the intended 10488 * location. IOW, validation we got the __reserved[] size right. 10489 */ 10490 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) 10491 != 1024); 10492 } 10493 10494 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, 10495 char *page) 10496 { 10497 struct perf_pmu_events_attr *pmu_attr = 10498 container_of(attr, struct perf_pmu_events_attr, attr); 10499 10500 if (pmu_attr->event_str) 10501 return sprintf(page, "%s\n", pmu_attr->event_str); 10502 10503 return 0; 10504 } 10505 EXPORT_SYMBOL_GPL(perf_event_sysfs_show); 10506 10507 static int __init perf_event_sysfs_init(void) 10508 { 10509 struct pmu *pmu; 10510 int ret; 10511 10512 mutex_lock(&pmus_lock); 10513 10514 ret = bus_register(&pmu_bus); 10515 if (ret) 10516 goto unlock; 10517 10518 list_for_each_entry(pmu, &pmus, entry) { 10519 if (!pmu->name || pmu->type < 0) 10520 continue; 10521 10522 ret = pmu_dev_alloc(pmu); 10523 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); 10524 } 10525 pmu_bus_running = 1; 10526 ret = 0; 10527 10528 unlock: 10529 mutex_unlock(&pmus_lock); 10530 10531 return ret; 10532 } 10533 device_initcall(perf_event_sysfs_init); 10534 10535 #ifdef CONFIG_CGROUP_PERF 10536 static struct cgroup_subsys_state * 10537 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 10538 { 10539 struct perf_cgroup *jc; 10540 10541 jc = kzalloc(sizeof(*jc), GFP_KERNEL); 10542 if (!jc) 10543 return ERR_PTR(-ENOMEM); 10544 10545 jc->info = alloc_percpu(struct perf_cgroup_info); 10546 if (!jc->info) { 10547 kfree(jc); 10548 return ERR_PTR(-ENOMEM); 10549 } 10550 10551 return &jc->css; 10552 } 10553 10554 static void perf_cgroup_css_free(struct cgroup_subsys_state *css) 10555 { 10556 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); 10557 10558 free_percpu(jc->info); 10559 kfree(jc); 10560 } 10561 10562 static int __perf_cgroup_move(void *info) 10563 { 10564 struct task_struct *task = info; 10565 rcu_read_lock(); 10566 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); 10567 rcu_read_unlock(); 10568 return 0; 10569 } 10570 10571 static void perf_cgroup_attach(struct cgroup_taskset *tset) 10572 { 10573 struct task_struct *task; 10574 struct cgroup_subsys_state *css; 10575 10576 cgroup_taskset_for_each(task, css, tset) 10577 task_function_call(task, __perf_cgroup_move, task); 10578 } 10579 10580 struct cgroup_subsys perf_event_cgrp_subsys = { 10581 .css_alloc = perf_cgroup_css_alloc, 10582 .css_free = perf_cgroup_css_free, 10583 .attach = perf_cgroup_attach, 10584 }; 10585 #endif /* CONFIG_CGROUP_PERF */ 10586