// SPDX-License-Identifier: GPL-2.0
/*
 * Performance event support for the System z CPU-measurement Sampling Facility
 *
 * Copyright IBM Corp. 2013, 2018
 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
 */
#define KMSG_COMPONENT	"cpum_sf"
#define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/pid.h>
#include <linux/notifier.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
#include <asm/cpu_mf.h>
#include <asm/irq.h>
#include <asm/debug.h>
#include <asm/timex.h>
#include <linux/io.h>

/* Perf PMU definitions for the sampling facility */
#define PERF_CPUM_SF_MAX_CTR		2
#define PERF_EVENT_CPUM_SF		0xB0000UL /* Event: Basic-sampling */
#define PERF_EVENT_CPUM_SF_DIAG		0xBD000UL /* Event: Combined-sampling */
#define PERF_CPUM_SF_BASIC_MODE		0x0001	  /* Basic-sampling flag */
#define PERF_CPUM_SF_DIAG_MODE		0x0002	  /* Diagnostic-sampling flag */
#define PERF_CPUM_SF_FREQ_MODE		0x0008	  /* Sampling with frequency */

#define OVERFLOW_REG(hwc)	((hwc)->extra_reg.config)
#define SFB_ALLOC_REG(hwc)	((hwc)->extra_reg.alloc)
#define TEAR_REG(hwc)		((hwc)->last_tag)
#define SAMPL_RATE(hwc)		((hwc)->event_base)
#define SAMPL_FLAGS(hwc)	((hwc)->config_base)
#define SAMPL_DIAG_MODE(hwc)	(SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE)
#define SAMPL_FREQ_MODE(hwc)	(SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE)

/* Minimum number of sample-data-block-tables:
 * At least one table is required for the sampling buffer structure.
 * A single table contains up to 511 pointers to sample-data-blocks.
 */
#define CPUM_SF_MIN_SDBT	1

/* Number of sample-data-blocks per sample-data-block-table (SDBT):
 * A table contains SDB pointers (8 bytes) and one table-link entry
 * that points to the origin of the next SDBT.
 */
#define CPUM_SF_SDB_PER_TABLE	((PAGE_SIZE - 8) / 8)

/* Maximum page offset for an SDBT table-link entry:
 * If this page offset is reached, a table-link entry to the next SDBT
 * must be added.
 */
#define CPUM_SF_SDBT_TL_OFFSET	(CPUM_SF_SDB_PER_TABLE * 8)
static inline int require_table_link(const void *sdbt)
{
	return ((unsigned long)sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET;
}

/* Minimum and maximum sampling buffer sizes:
 *
 * This number represents the maximum size of the sampling buffer taking
 * the number of sample-data-block-tables into account.  Note that these
 * numbers apply to the basic-sampling function only.
 * The maximum number of SDBs is increased by CPUM_SF_SDB_DIAG_FACTOR if
 * the diagnostic-sampling function is active.
 *
 * Sampling buffer size		Buffer characteristics
 * ---------------------------------------------------
 *	 64KB		==	  16 pages (4KB per page)
 *				   1 page	for SDB-tables
 *				  15 pages	for SDBs
 *
 *	 32MB		==	8192 pages (4KB per page)
 *				  16 pages	for SDB-tables
 *				8176 pages	for SDBs
 */
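
/* Worked example of the arithmetic behind the table above, assuming 4KB
 * pages: an SDBT holds (PAGE_SIZE - 8) / 8 = 511 SDB pointers plus one
 * table-link entry.  A 64KB buffer therefore splits into one SDBT page
 * referencing 15 SDB pages; a 32MB buffer needs
 * DIV_ROUND_UP(8176, 511) = 16 SDBT pages for its 8176 SDB pages.
 */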

static unsigned long __read_mostly CPUM_SF_MIN_SDB = 15;
static unsigned long __read_mostly CPUM_SF_MAX_SDB = 8176;
static unsigned long __read_mostly CPUM_SF_SDB_DIAG_FACTOR = 1;

struct sf_buffer {
	unsigned long *sdbt;	    /* Sample-data-block-table origin */
	/* buffer characteristics (required for buffer increments) */
	unsigned long num_sdb;	    /* Number of sample-data-blocks */
	unsigned long num_sdbt;	    /* Number of sample-data-block-tables */
	unsigned long *tail;	    /* last sample-data-block-table */
};

struct aux_buffer {
	struct sf_buffer sfb;
	unsigned long head;	    /* index of SDB of buffer head */
	unsigned long alert_mark;   /* index of SDB of alert request position */
	unsigned long empty_mark;   /* mark of SDB not marked full */
	unsigned long *sdb_index;   /* SDB address for fast lookup */
	unsigned long *sdbt_index;  /* SDBT address for fast lookup */
};

struct cpu_hw_sf {
	/* CPU-measurement sampling information block */
	struct hws_qsi_info_block qsi;
	/* CPU-measurement sampling control block */
	struct hws_lsctl_request_block lsctl;
	struct sf_buffer sfb;		    /* Sampling buffer */
	unsigned int flags;		    /* Status flags */
	struct perf_event *event;	    /* Scheduled perf event */
	struct perf_output_handle handle;   /* AUX buffer output handle */
};
static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf);

/* Debug feature */
static debug_info_t *sfdbg;

/* Sampling control helper functions */
static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi,
						unsigned long freq)
{
	return (USEC_PER_SEC / freq) * qsi->cpu_speed;
}

static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi,
						unsigned long rate)
{
	return USEC_PER_SEC * qsi->cpu_speed / rate;
}

/* Return pointer to trailer entry of a sample-data-block */
static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v)
{
	void *ret;

	ret = (void *)v;
	ret += PAGE_SIZE;
	ret -= sizeof(struct hws_trailer_entry);

	return ret;
}

/*
 * Return true if the entry in the sample data block table (sdbt)
 * is a link to the next sdbt
 */
static inline int is_link_entry(unsigned long *s)
{
	return *s & 0x1UL ? 1 : 0;
}

/* Return pointer to the linked sdbt */
static inline unsigned long *get_next_sdbt(unsigned long *s)
{
	return phys_to_virt(*s & ~0x1UL);
}
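
/* Example of the SDBT entry encoding used by is_link_entry() and
 * get_next_sdbt(): an entry holds the physical address of either an SDB
 * (bit 0 clear) or of the next SDBT (bit 0 set).  A hypothetical entry
 * value of 0x2345001 is thus a table-link to the SDBT at physical address
 * 0x2345000, while 0x2345000 would point to an SDB at that address.
 */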

/*
 * sf_disable() - Switch off sampling facility
 */
static void sf_disable(void)
{
	struct hws_lsctl_request_block sreq;

	memset(&sreq, 0, sizeof(sreq));
	lsctl(&sreq);
}

/*
 * sf_buffer_available() - Check for an allocated sampling buffer
 */
static int sf_buffer_available(struct cpu_hw_sf *cpuhw)
{
	return !!cpuhw->sfb.sdbt;
}

/*
 * deallocate sampling facility buffer
 */
static void free_sampling_buffer(struct sf_buffer *sfb)
{
	unsigned long *sdbt, *curr, *head;

	sdbt = sfb->sdbt;
	if (!sdbt)
		return;
	sfb->sdbt = NULL;
	/* Free the SDBT after all SDBs are processed... */
	head = sdbt;
	curr = sdbt;
	do {
		if (is_link_entry(curr)) {
			/* Process table-link entries */
			curr = get_next_sdbt(curr);
			free_page((unsigned long)sdbt);
			sdbt = curr;
		} else {
			/* Process SDB pointer */
			free_page((unsigned long)phys_to_virt(*curr));
			curr++;
		}
	} while (curr != head);
	memset(sfb, 0, sizeof(*sfb));
}

static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags)
{
	struct hws_trailer_entry *te;
	unsigned long sdb;

	/* Allocate and initialize sample-data-block */
	sdb = get_zeroed_page(gfp_flags);
	if (!sdb)
		return -ENOMEM;
	te = trailer_entry_ptr(sdb);
	te->header.a = 1;

	/* Link SDB into the sample-data-block-table */
	*sdbt = virt_to_phys((void *)sdb);

	return 0;
}

/*
 * realloc_sampling_buffer() - extend sampler memory
 *
 * Allocates new sample-data-blocks and adds them to the specified sampling
 * buffer memory.
 *
 * Important: This modifies the sampling buffer and must be called when the
 *	      sampling facility is disabled.
 *
 * Returns zero on success, non-zero otherwise.
 */
static int realloc_sampling_buffer(struct sf_buffer *sfb,
				   unsigned long num_sdb, gfp_t gfp_flags)
{
	int i, rc;
	unsigned long *new, *tail, *tail_prev = NULL;

	if (!sfb->sdbt || !sfb->tail)
		return -EINVAL;

	if (!is_link_entry(sfb->tail))
		return -EINVAL;

	/* Append to the existing sampling buffer, overwriting the table-link
	 * entry.
	 * The tail variable always points to the "tail" (last and table-link)
	 * entry in an SDB-table.
	 */
	tail = sfb->tail;

	/* Do a sanity check whether the table-link entry points to
	 * the sampling buffer origin.
	 */
	if (sfb->sdbt != get_next_sdbt(tail)) {
		debug_sprintf_event(sfdbg, 3, "%s buffer not linked origin %#lx tail %#lx\n",
				    __func__, (unsigned long)sfb->sdbt,
				    (unsigned long)tail);
		return -EINVAL;
	}

	/* Allocate remaining SDBs */
	rc = 0;
	for (i = 0; i < num_sdb; i++) {
		/* Allocate a new SDB-table if it is full. */
		if (require_table_link(tail)) {
			new = (unsigned long *)get_zeroed_page(gfp_flags);
			if (!new) {
				rc = -ENOMEM;
				break;
			}
			sfb->num_sdbt++;
			/* Link current page to tail of chain */
			*tail = virt_to_phys((void *)new) + 1;
			tail_prev = tail;
			tail = new;
		}

		/* Allocate a new sample-data-block.
		 * If there is not enough memory, stop the realloc process
		 * and simply use what was allocated.  If this is a temporary
		 * issue, a new realloc call (if required) might succeed.
		 */
		rc = alloc_sample_data_block(tail, gfp_flags);
		if (rc) {
			/* Undo last SDBT.  An SDBT with no SDB at its first
			 * entry but with an SDBT entry instead cannot be
			 * handled by the interrupt handler code.
			 * Avoid this situation.
			 */
			if (tail_prev) {
				sfb->num_sdbt--;
				free_page((unsigned long)new);
				tail = tail_prev;
			}
			break;
		}
		sfb->num_sdb++;
		tail++;
		tail_prev = new = NULL;	/* Allocated at least one SDB */
	}

	/* Link sampling buffer to its origin */
	*tail = virt_to_phys(sfb->sdbt) + 1;
	sfb->tail = tail;

	return rc;
}
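
/* Resulting layout, sketched for a buffer spanning two SDBTs: each SDBT
 * is filled with physical SDB addresses, and its last (table-link) entry,
 * tagged with bit 0, points to the next SDBT.  The table-link of the
 * final SDBT points back to the origin, making the chain circular; this
 * is what the sanity check against get_next_sdbt(tail) above relies on.
 */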

/*
 * allocate_sampling_buffer() - allocate sampler memory
 *
 * Allocates and initializes a sampling buffer structure using the
 * specified number of sample-data-blocks (SDB).  For each allocation,
 * a 4K page is used.  The number of sample-data-block-tables (SDBT)
 * is calculated from the number of SDBs.
 * Also set the ALERT_REQ mask in each SDB's trailer.
 *
 * Returns zero on success, non-zero otherwise.
 */
static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb)
{
	int rc;

	if (sfb->sdbt)
		return -EINVAL;

	/* Allocate the sample-data-block-table origin */
	sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL);
	if (!sfb->sdbt)
		return -ENOMEM;
	sfb->num_sdb = 0;
	sfb->num_sdbt = 1;

	/* Link the table origin to point to itself to prepare for
	 * realloc_sampling_buffer() invocation.
	 */
	sfb->tail = sfb->sdbt;
	*sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1;

	/* Allocate requested number of sample-data-blocks */
	rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL);
	if (rc)
		free_sampling_buffer(sfb);
	return rc;
}

static void sfb_set_limits(unsigned long min, unsigned long max)
{
	struct hws_qsi_info_block si;

	CPUM_SF_MIN_SDB = min;
	CPUM_SF_MAX_SDB = max;

	memset(&si, 0, sizeof(si));
	qsi(&si);
	CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes);
}

static unsigned long sfb_max_limit(struct hw_perf_event *hwc)
{
	return SAMPL_DIAG_MODE(hwc) ? CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR
				    : CPUM_SF_MAX_SDB;
}

static unsigned long sfb_pending_allocs(struct sf_buffer *sfb,
					struct hw_perf_event *hwc)
{
	if (!sfb->sdbt)
		return SFB_ALLOC_REG(hwc);
	if (SFB_ALLOC_REG(hwc) > sfb->num_sdb)
		return SFB_ALLOC_REG(hwc) - sfb->num_sdb;
	return 0;
}

static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc)
{
	/* Limit the number of SDBs to not exceed the maximum */
	num = min_t(unsigned long, num, sfb_max_limit(hwc) - SFB_ALLOC_REG(hwc));
	if (num)
		SFB_ALLOC_REG(hwc) += num;
}

static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc)
{
	SFB_ALLOC_REG(hwc) = 0;
	sfb_account_allocs(num, hwc);
}

static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
{
	if (sf_buffer_available(cpuhw))
		free_sampling_buffer(&cpuhw->sfb);
}

static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
{
	unsigned long n_sdb, freq;

	/* Calculate sampling buffers using 4K pages
	 *
	 * 1. The sampling size is 32 bytes for basic sampling.  This size
	 *    is the same for all machine types.  Diagnostic sampling uses
	 *    the auxiliary data buffer setup, which provides the memory
	 *    for SDBs via the common Linux AUX trace setup.
	 *
	 * 2. Function alloc_sampling_buffer() sets the Alert Request
	 *    Control indicator to trigger a measurement-alert to harvest
	 *    sample-data-blocks (SDB).  This is done per SDB.  The
	 *    measurement alert interrupt fires quickly enough to handle
	 *    one SDB; under very high frequencies and workloads there
	 *    might be 2 to 3 SDBs available for sample processing.
	 *    Currently there is no need to set up an alert request on
	 *    every n-th page; that would be counterproductive, as a
	 *    single IRQ would then have to process a very high number
	 *    of samples.
	 *
	 * 3. Use the sampling frequency as input.
	 *    Compute the number of SDBs and ensure a minimum
	 *    of CPUM_SF_MIN_SDB.  Depending on the frequency add some more
	 *    SDBs to handle a higher sampling rate (see the worked example
	 *    below).
	 *    Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples
	 *    (one SDB) for every 10000 HZ frequency increment.
	 *
	 * 4. Compute the number of sample-data-block-tables (SDBT) and
	 *    ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up
	 *    to 511 SDBs).
	 */
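	/* Worked example of the calculation below: at a sampling frequency
	 * of 20000 Hz, n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(20000, 10000)
	 * = 15 + 2 = 17 SDBs.
	 */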
	freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc));
	n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000);

	/* If there is already a sampling buffer allocated, it is very likely
	 * that the sampling facility is enabled too.  If the event to be
	 * initialized requires a greater sampling buffer, the allocation must
	 * be postponed.  Changing the sampling buffer requires the sampling
	 * facility to be in the disabled state.  So, account the number of
	 * required SDBs and let cpumsf_pmu_enable() resize the buffer just
	 * before the event is started.
	 */
	sfb_init_allocs(n_sdb, hwc);
	if (sf_buffer_available(cpuhw))
		return 0;

	return alloc_sampling_buffer(&cpuhw->sfb,
				     sfb_pending_allocs(&cpuhw->sfb, hwc));
}

static unsigned long min_percent(unsigned int percent, unsigned long base,
				 unsigned long min)
{
	return min_t(unsigned long, min, DIV_ROUND_UP(percent * base, 100));
}

static unsigned long compute_sfb_extent(unsigned long ratio, unsigned long base)
{
	/* Use a percentage-based approach to extend the sampling facility
	 * buffer.  Accept up to 5% sample data loss.
	 * Vary the extents between 1% to 5% of the current number of
	 * sample-data-blocks.
	 */
	if (ratio <= 5)
		return 0;
	if (ratio <= 25)
		return min_percent(1, base, 1);
	if (ratio <= 50)
		return min_percent(1, base, 1);
	if (ratio <= 75)
		return min_percent(2, base, 2);
	if (ratio <= 100)
		return min_percent(3, base, 3);
	if (ratio <= 250)
		return min_percent(4, base, 4);

	return min_percent(5, base, 8);
}

static void sfb_account_overflows(struct cpu_hw_sf *cpuhw,
				  struct hw_perf_event *hwc)
{
	unsigned long ratio, num;

	if (!OVERFLOW_REG(hwc))
		return;

	/* The overflow register contains the average number of samples
	 * that have been lost because sample-data-blocks were full.
	 *
	 * Calculate the total number of sample data entries that have been
	 * discarded.  Then calculate the ratio of lost samples to total
	 * samples per second in percent.
	 */
	ratio = DIV_ROUND_UP(100 * OVERFLOW_REG(hwc) * cpuhw->sfb.num_sdb,
			     sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)));

	/* Compute number of sample-data-blocks */
	num = compute_sfb_extent(ratio, cpuhw->sfb.num_sdb);
	if (num)
		sfb_account_allocs(num, hwc);

	OVERFLOW_REG(hwc) = 0;
}
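
/* Worked example for sfb_account_overflows(), with assumed numbers: at a
 * rate corresponding to 40000 samples per second, a buffer of 32 SDBs and
 * an average of 100 samples lost per SDB give
 * ratio = DIV_ROUND_UP(100 * 100 * 32, 40000) = 8, so compute_sfb_extent()
 * returns min_percent(1, 32, 1) = 1 additional SDB.
 */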

/* extend_sampling_buffer() - Extend sampling buffer
 * @sfb:	Sampling buffer structure (for local CPU)
 * @hwc:	Perf event hardware structure
 *
 * Use this function to extend the sampling buffer based on the overflow counter
 * and postponed allocation extents stored in the specified Perf event hardware.
 *
 * Important: This function disables the sampling facility in order to safely
 *	      change the sampling buffer structure.  Do not call this function
 *	      when the PMU is active.
 */
static void extend_sampling_buffer(struct sf_buffer *sfb,
				   struct hw_perf_event *hwc)
{
	unsigned long num;

	num = sfb_pending_allocs(sfb, hwc);
	if (!num)
		return;

	/* Disable the sampling facility to reset any states and also
	 * clear pending measurement alerts.
	 */
	sf_disable();

	/* Extend the sampling buffer.
	 * This memory allocation typically happens in an atomic context when
	 * called by perf.  Because this is a reallocation, it is fine if the
	 * new SDB-request cannot be satisfied immediately.
	 */
	realloc_sampling_buffer(sfb, num, GFP_ATOMIC);
}

/* Number of perf events counting hardware events */
static refcount_t num_events;
/* Used to avoid races in calling reserve/release_cpumf_hardware */
static DEFINE_MUTEX(pmc_reserve_mutex);

#define PMC_INIT	0
#define PMC_RELEASE	1
static void setup_pmc_cpu(void *flags)
{
	struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);

	sf_disable();
	switch (*((int *)flags)) {
	case PMC_INIT:
		memset(cpuhw, 0, sizeof(*cpuhw));
		qsi(&cpuhw->qsi);
		cpuhw->flags |= PMU_F_RESERVED;
		break;
	case PMC_RELEASE:
		cpuhw->flags &= ~PMU_F_RESERVED;
		deallocate_buffers(cpuhw);
		break;
	}
}

static void release_pmc_hardware(void)
{
	int flags = PMC_RELEASE;

	irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
	on_each_cpu(setup_pmc_cpu, &flags, 1);
}

static void reserve_pmc_hardware(void)
{
	int flags = PMC_INIT;

	on_each_cpu(setup_pmc_cpu, &flags, 1);
	irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	/* Release PMC if this is the last perf event */
	if (refcount_dec_and_mutex_lock(&num_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static void hw_init_period(struct hw_perf_event *hwc, u64 period)
{
	hwc->sample_period = period;
	hwc->last_period = hwc->sample_period;
	local64_set(&hwc->period_left, hwc->sample_period);
}

static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si,
				   unsigned long rate)
{
	return clamp_t(unsigned long, rate,
		       si->min_sampl_rate, si->max_sampl_rate);
}

static u32 cpumsf_pid_type(struct perf_event *event,
			   u32 pid, enum pid_type type)
{
	struct task_struct *tsk;

	/* Idle process */
	if (!pid)
		goto out;

	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	pid = -1;
	if (tsk) {
		/*
		 * Only top level events contain the pid namespace in which
		 * they are created.
		 */
		if (event->parent)
			event = event->parent;
		pid = __task_pid_nr_ns(tsk, type, event->ns);
		/*
		 * See also 1d953111b648
		 * "perf/core: Don't report zero PIDs for exiting tasks".
		 */
		if (!pid && !pid_alive(tsk))
			pid = -1;
	}
out:
	return pid;
}

static void cpumsf_output_event_pid(struct perf_event *event,
				    struct perf_sample_data *data,
				    struct pt_regs *regs)
{
	u32 pid;
	struct perf_event_header header;
	struct perf_output_handle handle;

	/*
	 * Obtain the PID from the basic-sampling data entry and
	 * correct the data->tid_entry.pid value.
	 */
	pid = data->tid_entry.pid;

	/* Protect callchain buffers, tasks */
	rcu_read_lock();

	perf_prepare_sample(data, event, regs);
	perf_prepare_header(&header, data, event, regs);
	if (perf_output_begin(&handle, data, event, header.size))
		goto out;

	/* Update the process ID (see also kernel/events/core.c) */
	data->tid_entry.pid = cpumsf_pid_type(event, pid, PIDTYPE_TGID);
	data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID);

	perf_output_sample(&handle, &header, data, event);
	perf_output_end(&handle);
out:
	rcu_read_unlock();
}

static unsigned long getrate(bool freq, unsigned long sample,
			     struct hws_qsi_info_block *si)
{
	unsigned long rate;

	if (freq) {
		rate = freq_to_sample_rate(si, sample);
		rate = hw_limit_rate(si, rate);
	} else {
		/* The min/max sampling rates specify the valid range
		 * of sample periods.  If the specified sample period is
		 * out of range, limit the period to the range boundary.
		 */
		rate = hw_limit_rate(si, sample);

		/* The perf core maintains a maximum sample rate that is
		 * configurable through the sysctl interface.  Ensure the
		 * sampling rate does not exceed this value.  This also helps
		 * to avoid throttling when pushing samples with
		 * perf_event_overflow().
		 */
		if (sample_rate_to_freq(si, rate) >
		    sysctl_perf_event_sample_rate) {
			rate = 0;
		}
	}
	return rate;
}
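
/* Worked example for the frequency path in getrate(), with an assumed
 * cpu_speed of 1000: freq_to_sample_rate() turns a requested 4000 Hz into
 * an interval of (1000000 / 4000) * 1000 = 250000, which is then clamped
 * to the [min_sampl_rate, max_sampl_rate] range reported by QSI.
 */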

/* The sampling information (si) contains information about the
 * min/max sampling intervals and the CPU speed.  So calculate the
 * correct sampling interval and avoid the whole period adjust
 * feedback loop.
 *
 * Since the CPU Measurement sampling facility cannot handle frequency,
 * calculate the sampling interval when frequency is specified using
 * this formula:
 *	interval := cpu_speed * 1000000 / sample_freq
 *
 * Returns errno on bad input and zero on success with parameter interval
 * set to the correct sampling rate.
 *
 * Note: This function turns off the freq bit to avoid calling function
 * perf_adjust_period().  This causes frequency adjustment in the common
 * code part which causes tremendous variations in the counter values.
 */
static int __hw_perf_event_init_rate(struct perf_event *event,
				     struct hws_qsi_info_block *si)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long rate;

	if (attr->freq) {
		if (!attr->sample_freq)
			return -EINVAL;
		rate = getrate(attr->freq, attr->sample_freq, si);
		attr->freq = 0;		/* Don't call perf_adjust_period() */
		SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FREQ_MODE;
	} else {
		rate = getrate(attr->freq, attr->sample_period, si);
		if (!rate)
			return -EINVAL;
	}
	attr->sample_period = rate;
	SAMPL_RATE(hwc) = rate;
	hw_init_period(hwc, SAMPL_RATE(hwc));
	return 0;
}

static int __hw_perf_event_init(struct perf_event *event)
{
	struct cpu_hw_sf *cpuhw;
	struct hws_qsi_info_block si;
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	int cpu, err = 0;

	/* Reserve CPU-measurement sampling facility */
	mutex_lock(&pmc_reserve_mutex);
	if (!refcount_inc_not_zero(&num_events)) {
		reserve_pmc_hardware();
		refcount_set(&num_events, 1);
	}
	event->destroy = hw_perf_event_destroy;

	/* Access per-CPU sampling information (query sampling info) */
	/*
	 * The event->cpu value can be -1 to count on every CPU, for example,
	 * when attaching to a task.  If this is specified, use the query
	 * sampling info from the current CPU, otherwise use event->cpu to
	 * retrieve the per-CPU information.
	 * Later, cpuhw indicates whether to allocate sampling buffers for a
	 * particular CPU (cpuhw != NULL) or each online CPU (cpuhw == NULL).
	 */
	memset(&si, 0, sizeof(si));
	cpuhw = NULL;
	if (event->cpu == -1) {
		qsi(&si);
	} else {
		/* Event is pinned to a particular CPU, retrieve the per-CPU
		 * sampling structure for accessing the CPU-specific QSI.
		 */
		cpuhw = &per_cpu(cpu_hw_sf, event->cpu);
		si = cpuhw->qsi;
	}

	/* Check sampling facility authorization and, if not authorized,
	 * fall back to other PMUs.  It is safe to check any CPU because
	 * the authorization is identical for all configured CPUs.
	 */
	if (!si.as) {
		err = -ENOENT;
		goto out;
	}

	if (si.ribm & CPU_MF_SF_RIBM_NOTAV) {
		pr_warn("CPU Measurement Facility sampling is temporarily not available\n");
		err = -EBUSY;
		goto out;
	}

	/* Always enable basic sampling */
	SAMPL_FLAGS(hwc) = PERF_CPUM_SF_BASIC_MODE;

	/* Check if diagnostic sampling is requested.  Deny if the required
	 * sampling authorization is missing.
	 */
	if (attr->config == PERF_EVENT_CPUM_SF_DIAG) {
		if (!si.ad) {
			err = -EPERM;
			goto out;
		}
		SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE;
	}

	err = __hw_perf_event_init_rate(event, &si);
	if (err)
		goto out;

	/* Use AUX buffer.  No need to allocate it by ourselves. */
	if (attr->config == PERF_EVENT_CPUM_SF_DIAG)
		goto out;

	/* Allocate the per-CPU sampling buffer using the CPU information
	 * from the event.  If the event is not pinned to a particular
	 * CPU (event->cpu == -1; or cpuhw == NULL), allocate sampling
	 * buffers for each online CPU.
	 */
	if (cpuhw)
		/* Event is pinned to a particular CPU */
		err = allocate_buffers(cpuhw, hwc);
	else {
		/* Event is not pinned, allocate sampling buffer on
		 * each online CPU
		 */
		for_each_online_cpu(cpu) {
			cpuhw = &per_cpu(cpu_hw_sf, cpu);
			err = allocate_buffers(cpuhw, hwc);
			if (err)
				break;
		}
	}

	/* If PID/TID sampling is active, replace the default overflow
	 * handler to extract and resolve the PIDs from the basic-sampling
	 * data entries.
	 */
	if (event->attr.sample_type & PERF_SAMPLE_TID)
		if (is_default_overflow_handler(event))
			event->overflow_handler = cpumsf_output_event_pid;
out:
	mutex_unlock(&pmc_reserve_mutex);
	return err;
}

static bool is_callchain_event(struct perf_event *event)
{
	u64 sample_type = event->attr.sample_type;

	return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER |
			      PERF_SAMPLE_STACK_USER);
}

static int cpumsf_pmu_event_init(struct perf_event *event)
{
	int err;

	/* No support for taken branch sampling */
	/* No support for callchain, stacks and registers */
	if (has_branch_stack(event) || is_callchain_event(event))
		return -EOPNOTSUPP;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
		if ((event->attr.config != PERF_EVENT_CPUM_SF) &&
		    (event->attr.config != PERF_EVENT_CPUM_SF_DIAG))
			return -ENOENT;
		break;
	case PERF_TYPE_HARDWARE:
		/* Support sampling of CPU cycles in addition to the
		 * counter facility.  However, the counter facility
		 * is more precise and, hence, restrict this PMU to
		 * sampling events only.
		 */
		if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES)
			return -ENOENT;
		if (!is_sampling_event(event))
			return -ENOENT;
		break;
	default:
		return -ENOENT;
	}

	/* Force reset of idle/hv excludes regardless of what the
	 * user requested.
	 */
	if (event->attr.exclude_hv)
		event->attr.exclude_hv = 0;
	if (event->attr.exclude_idle)
		event->attr.exclude_idle = 0;

	err = __hw_perf_event_init(event);
	return err;
}

static void cpumsf_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
	struct hw_perf_event *hwc;
	int err;

	/*
	 * Event must be
	 * - added/started on this CPU (PMU_F_IN_USE set)
	 * - and CPU must be available (PMU_F_RESERVED set)
	 * - and not already enabled (PMU_F_ENABLED not set)
	 * - and not in error condition (PMU_F_ERR_MASK not set)
	 */
	if (cpuhw->flags != (PMU_F_IN_USE | PMU_F_RESERVED))
		return;

	/* Check whether to extend the sampling buffer.
	 *
	 * Two conditions trigger an increase of the sampling buffer for a
	 * perf event:
	 * 1. Postponed buffer allocations from the event initialization.
	 * 2. Sampling overflows that contribute to pending allocations.
	 *
	 * Note that the extend_sampling_buffer() function disables the sampling
	 * facility, but it can be fully re-enabled using sampling controls that
	 * have been saved in cpumsf_pmu_disable().
	 */
	hwc = &cpuhw->event->hw;
	if (!(SAMPL_DIAG_MODE(hwc))) {
		/*
		 * Account number of overflow-designated buffer extents
		 */
		sfb_account_overflows(cpuhw, hwc);
		extend_sampling_buffer(&cpuhw->sfb, hwc);
	}
	/* Rate may be adjusted with ioctl() */
	cpuhw->lsctl.interval = SAMPL_RATE(hwc);

	/* (Re)enable the PMU and sampling facility */
	err = lsctl(&cpuhw->lsctl);
	if (err) {
		pr_err("Loading sampling controls failed: op 1 err %i\n", err);
		return;
	}

	/* Load current program parameter */
	lpp(&get_lowcore()->lpp);
	cpuhw->flags |= PMU_F_ENABLED;
}

static void cpumsf_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
	struct hws_lsctl_request_block inactive;
	struct hws_qsi_info_block si;
	int err;

	if (!(cpuhw->flags & PMU_F_ENABLED))
		return;

	if (cpuhw->flags & PMU_F_ERR_MASK)
		return;

	/* Switch off sampling activation control */
	inactive = cpuhw->lsctl;
	inactive.cs = 0;
	inactive.cd = 0;

	err = lsctl(&inactive);
	if (err) {
		pr_err("Loading sampling controls failed: op 2 err %i\n", err);
		return;
	}

	/*
	 * Save state of TEAR and DEAR register contents.
	 * TEAR/DEAR values are valid only if the sampling facility is
	 * enabled.  Note that cpumsf_pmu_disable() might be called even
	 * for a disabled sampling facility because cpumsf_pmu_enable()
	 * controls the enable/disable state.
	 */
	qsi(&si);
	if (si.es) {
		cpuhw->lsctl.tear = si.tear;
		cpuhw->lsctl.dear = si.dear;
	}

	cpuhw->flags &= ~PMU_F_ENABLED;
}

/* perf_event_exclude() - Filter event
 * @event:	The perf event
 * @regs:	pt_regs structure
 * @sde_regs:	Sample-data-entry (sde) regs structure
 *
 * Filter perf events according to their exclude specification.
 *
 * Return non-zero if the event shall be excluded.
 */
static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs,
			      struct perf_sf_sde_regs *sde_regs)
{
	if (event->attr.exclude_user && user_mode(regs))
		return 1;
	if (event->attr.exclude_kernel && !user_mode(regs))
		return 1;
	if (event->attr.exclude_guest && sde_regs->in_guest)
		return 1;
	if (event->attr.exclude_host && !sde_regs->in_guest)
		return 1;
	return 0;
}

/* perf_push_sample() - Push samples to perf
 * @event:	The perf event
 * @basic:	Hardware sample data
 *
 * Use the hardware sample data to create a perf event sample.  The sample
 * is then pushed to the event subsystem and the function checks for
 * possible event overflows.  If an event overflow occurs, the PMU is
 * stopped.
 *
 * Return non-zero if an event overflow occurred.
 */
static int perf_push_sample(struct perf_event *event,
			    struct hws_basic_entry *basic)
{
	int overflow;
	struct pt_regs regs;
	struct perf_sf_sde_regs *sde_regs;
	struct perf_sample_data data;

	/* Setup perf sample */
	perf_sample_data_init(&data, 0, event->hw.last_period);

	/* Setup pt_regs to look like a CPU-measurement external interrupt
	 * using the Program Request Alert code.  The regs.int_parm_long
	 * field, which is unused, contains additional sample-data-entry
	 * related indicators.
	 */
	memset(&regs, 0, sizeof(regs));
	regs.int_code = 0x1407;
	regs.int_parm = CPU_MF_INT_SF_PRA;
	sde_regs = (struct perf_sf_sde_regs *)&regs.int_parm_long;

	psw_bits(regs.psw).ia = basic->ia;
	psw_bits(regs.psw).dat = basic->T;
	psw_bits(regs.psw).wait = basic->W;
	psw_bits(regs.psw).pstate = basic->P;
	psw_bits(regs.psw).as = basic->AS;

	/*
	 * Use the hardware provided configuration level to decide if the
	 * sample belongs to a guest or host.  If that is not available,
	 * fall back to the following heuristics:
	 * A non-zero guest program parameter always indicates a guest
	 * sample.  Some early samples or samples from guests without
	 * lpp usage would be misaccounted to the host.  We use the asn
	 * value as an addon heuristic to detect most of these guest samples.
	 * If the value differs from 0xffff (the host value), we assume to
	 * be a KVM guest.
	 */
	switch (basic->CL) {
	case 1: /* logical partition */
		sde_regs->in_guest = 0;
		break;
	case 2: /* virtual machine */
		sde_regs->in_guest = 1;
		break;
	default: /* old machine, use heuristics */
		if (basic->gpp || basic->prim_asn != 0xffff)
			sde_regs->in_guest = 1;
		break;
	}

	/*
	 * Store the PID value from the sample-data-entry to be
	 * processed and resolved by cpumsf_output_event_pid().
	 */
	data.tid_entry.pid = basic->hpp & LPP_PID_MASK;

	overflow = 0;
	if (perf_event_exclude(event, &regs, sde_regs))
		goto out;
	overflow = perf_event_overflow(event, &data, &regs);
	perf_event_update_userpage(event);
out:
	return overflow;
}

static void perf_event_count_update(struct perf_event *event, u64 count)
{
	local64_add(count, &event->count);
}

/* hw_collect_samples() - Walk through a sample-data-block and collect samples
 * @event:	The perf event
 * @sdbt:	Sample-data-block table
 * @overflow:	Event overflow counter
 *
 * Walks through a sample-data-block and collects sampling data entries that
 * are then pushed to the perf event subsystem.  Depending on the sampling
 * function, there can be either basic-sampling or combined-sampling data
 * entries.  A combined-sampling data entry consists of a basic- and a
 * diagnostic-sampling data entry.  The sampling function is determined by
 * the flags in the perf event hardware structure.  The function always works
 * with a combined-sampling data entry but ignores the diagnostic portion if
 * it is not available.
 *
 * Note that the implementation focuses on basic-sampling data entries and,
 * if such an entry is not valid, the entire combined-sampling data entry is
 * ignored.
 *
 * The overflow variable counts the number of samples that have been discarded
 * due to a perf event overflow.
 */
static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
			       unsigned long long *overflow)
{
	struct hws_trailer_entry *te;
	struct hws_basic_entry *sample;

	te = trailer_entry_ptr((unsigned long)sdbt);
	sample = (struct hws_basic_entry *)sdbt;
	while ((unsigned long *)sample < (unsigned long *)te) {
		/* Check for an empty sample */
		if (!sample->def || sample->LS)
			break;

		/* Update perf event period */
		perf_event_count_update(event, SAMPL_RATE(&event->hw));

		/* Check whether sample is valid */
		if (sample->def == 0x0001) {
			/* If an event overflow occurred, the PMU is stopped to
			 * throttle event delivery.  Remaining sample data is
			 * discarded.
			 */
			if (!*overflow) {
				/* Check whether sample is consistent */
				if (sample->I == 0 && sample->W == 0) {
					/* Deliver sample data to perf */
					*overflow = perf_push_sample(event,
								     sample);
				}
			} else
				/* Count discarded samples */
				*overflow += 1;
		} else {
			/* Sample slot is not yet written or other record.
			 *
			 * This condition can occur if the buffer was reused
			 * from a combined basic- and diagnostic-sampling.
			 * If only basic-sampling is then active, entries are
			 * written into the larger diagnostic entries.
			 * This is typically the case for sample-data-blocks
			 * that are not full.  Stop processing if the first
			 * invalid format was detected.
			 */
			if (!te->header.f)
				break;
		}

		/* Reset sample slot and advance to next sample */
		sample->def = 0;
		sample++;
	}
}

/* hw_perf_event_update() - Process sampling buffer
 * @event:	The perf event
 * @flush_all:	Flag to also flush partially filled sample-data-blocks
 *
 * Processes the sampling buffer and creates perf event samples.  The sampling
 * buffer position is retrieved and saved in the TEAR_REG register of the
 * specified perf event.
 *
 * Only full sample-data-blocks are processed.  Specify the flush_all flag
 * to also walk through partially filled sample-data-blocks.
 */
static void hw_perf_event_update(struct perf_event *event, int flush_all)
{
	unsigned long long event_overflow, sampl_overflow, num_sdb;
	struct hw_perf_event *hwc = &event->hw;
	union hws_trailer_header prev, new;
	struct hws_trailer_entry *te;
	unsigned long *sdbt, sdb;
	int done;

	/*
	 * AUX buffer is used when in diagnostic sampling mode.
	 * No perf events/samples are created.
	 */
	if (SAMPL_DIAG_MODE(hwc))
		return;

	sdbt = (unsigned long *)TEAR_REG(hwc);
	done = event_overflow = sampl_overflow = num_sdb = 0;
	while (!done) {
		/* Get the trailer entry of the sample-data-block */
		sdb = (unsigned long)phys_to_virt(*sdbt);
		te = trailer_entry_ptr(sdb);

		/* Leave loop if no more work to do (block full indicator) */
		if (!te->header.f) {
			done = 1;
			if (!flush_all)
				break;
		}

		/* Check the sample overflow count */
		if (te->header.overflow)
			/* Account sample overflows and, if a particular limit
			 * is reached, extend the sampling buffer.
			 * For details, see sfb_account_overflows().
			 */
			sampl_overflow += te->header.overflow;

		/* Collect all samples from a single sample-data-block and
		 * flag if a (perf) event overflow happened.  If so, the PMU
		 * is stopped and remaining samples will be discarded.
		 */
		hw_collect_samples(event, (unsigned long *)sdb, &event_overflow);
		num_sdb++;

		/* Reset trailer (using compare-double-and-swap) */
		prev.val = READ_ONCE_ALIGNED_128(te->header.val);
		do {
			new.val = prev.val;
			new.f = 0;
			new.a = 1;
			new.overflow = 0;
		} while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));

		/* Advance to next sample-data-block */
		sdbt++;
		if (is_link_entry(sdbt))
			sdbt = get_next_sdbt(sdbt);

		/* Update event hardware registers */
		TEAR_REG(hwc) = (unsigned long)sdbt;

		/* Stop processing sample-data if all samples of the current
		 * sample-data-block were flushed even if it was not full.
		 */
		if (flush_all && done)
			break;
	}

	/* Account sample overflows in the event hardware structure */
	if (sampl_overflow)
		OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) +
						 sampl_overflow, 1 + num_sdb);

	/* perf_event_overflow() and perf_event_account_interrupt() limit
	 * the interrupt rate to an upper limit of roughly 1000 samples
	 * per task tick.
	 * Hitting this limit results in a large number of throttled
	 * PERF_RECORD_THROTTLE entries, and the samples are dropped.
	 * Slightly increase the interval to avoid hitting this limit.
	 */
	if (event_overflow)
		SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10);
}
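
/* Worked example for the throttling mitigation above: with an interval of
 * SAMPL_RATE(hwc) = 10000, an event overflow stretches the interval by
 * DIV_ROUND_UP(10000, 10) = 1000 to 11000, i.e. each overflow slows the
 * sampling rate by roughly ten percent.
 */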

static inline unsigned long aux_sdb_index(struct aux_buffer *aux,
					  unsigned long i)
{
	return i % aux->sfb.num_sdb;
}

static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end)
{
	return end >= start ? end - start + 1 : 0;
}

static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux)
{
	return aux_sdb_num(aux->head, aux->alert_mark);
}

static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux)
{
	return aux_sdb_num(aux->head, aux->empty_mark);
}
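
/* Worked example for the ring arithmetic above: with num_sdb = 16, the
 * linear index 18 maps to aux_sdb_index() slot 18 % 16 = 2, and
 * aux_sdb_num(start = 5, end = 9) counts 9 - 5 + 1 = 5 SDBs inclusively.
 */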

/*
 * Get trailer entry by index of SDB.
 */
static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux,
						 unsigned long index)
{
	unsigned long sdb;

	index = aux_sdb_index(aux, index);
	sdb = aux->sdb_index[index];
	return trailer_entry_ptr(sdb);
}

/*
 * Finish sampling on the CPU.  Called by cpumsf_pmu_del() with pmu
 * disabled.  Collect the full SDBs in the AUX buffer which have not yet
 * reached the point of the alert indicator, and ignore the SDBs which
 * are not full.
 *
 * 1. Scan SDBs to see how much data is there and consume them.
 * 2. Remove the alert indicator in the buffer.
 */
static void aux_output_end(struct perf_output_handle *handle)
{
	unsigned long i, range_scan, idx;
	struct aux_buffer *aux;
	struct hws_trailer_entry *te;

	aux = perf_get_aux(handle);
	if (!aux)
		return;

	range_scan = aux_sdb_num_alert(aux);
	for (i = 0, idx = aux->head; i < range_scan; i++, idx++) {
		te = aux_sdb_trailer(aux, idx);
		if (!te->header.f)
			break;
	}
	/* i is the number of SDBs which are full */
	perf_aux_output_end(handle, i << PAGE_SHIFT);

	/* Remove alert indicators in the buffer */
	te = aux_sdb_trailer(aux, aux->alert_mark);
	te->header.a = 0;
}

/*
 * Start sampling on the CPU.  Called by cpumsf_pmu_add() when an event
 * is first added to the CPU or rescheduled again to the CPU.  It is called
 * with pmu disabled.
 *
 * 1. Reset the trailer of SDBs to get ready for new data.
 * 2. Tell the hardware where to put the data by resetting the SDB buffer
 *    head (tear/dear).
 */
static int aux_output_begin(struct perf_output_handle *handle,
			    struct aux_buffer *aux,
			    struct cpu_hw_sf *cpuhw)
{
	unsigned long range, i, range_scan, idx, head, base, offset;
	struct hws_trailer_entry *te;

	if (handle->head & ~PAGE_MASK)
		return -EINVAL;

	aux->head = handle->head >> PAGE_SHIFT;
	range = (handle->size + 1) >> PAGE_SHIFT;
	if (range <= 1)
		return -ENOMEM;

	/*
	 * SDBs between aux->head and aux->empty_mark are already ready
	 * for new data.  range_scan is the number of SDBs not within them.
	 */
	if (range > aux_sdb_num_empty(aux)) {
		range_scan = range - aux_sdb_num_empty(aux);
		idx = aux->empty_mark + 1;
		for (i = 0; i < range_scan; i++, idx++) {
			te = aux_sdb_trailer(aux, idx);
			te->header.f = 0;
			te->header.a = 0;
			te->header.overflow = 0;
		}
		/* Save the position of empty SDBs */
		aux->empty_mark = aux->head + range - 1;
	}

	/* Set alert indicator */
	aux->alert_mark = aux->head + range / 2 - 1;
	te = aux_sdb_trailer(aux, aux->alert_mark);
	te->header.a = 1;

	/* Reset hardware buffer head */
	head = aux_sdb_index(aux, aux->head);
	base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE];
	offset = head % CPUM_SF_SDB_PER_TABLE;
	cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long);
	cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]);

	return 0;
}
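
/* Worked example for the alert placement in aux_output_begin(): with
 * aux->head = 0 and a usable range of 8 SDBs, the alert indicator is set
 * on SDB 0 + 8 / 2 - 1 = 3, so the measurement alert fires once half of
 * the handed-out buffer space is filled.
 */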

/*
 * Set alert indicator on SDB at index @alert_index while sampler is running.
 *
 * Return true on success.
 * Return false if the full indicator is already set by the hardware sampler.
 */
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
			  unsigned long long *overflow)
{
	union hws_trailer_header prev, new;
	struct hws_trailer_entry *te;

	te = aux_sdb_trailer(aux, alert_index);
	prev.val = READ_ONCE_ALIGNED_128(te->header.val);
	do {
		new.val = prev.val;
		*overflow = prev.overflow;
		if (prev.f) {
			/*
			 * SDB is already set by hardware.
			 * Abort and try to set somewhere
			 * behind.
			 */
			return false;
		}
		new.a = 1;
		new.overflow = 0;
	} while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));
	return true;
}

/*
 * aux_reset_buffer() - Scan and setup SDBs for new samples
 * @aux:	The AUX buffer to set
 * @range:	The range of SDBs to scan, starting from aux->head
 * @overflow:	Set to overflow count
 *
 * Set the alert indicator on the SDB at index aux->alert_mark.  If this SDB
 * is marked as empty, check if it has already been set full by the hardware
 * sampler.  If yes, that means new data arrived before we could set an alert
 * indicator, and the caller should try to set the alert indicator to some
 * position behind.
 *
 * Scan the SDBs in the AUX buffer from behind aux->empty_mark.  They were
 * used previously and have already been consumed by user space.  Reset these
 * SDBs (clear full indicator and alert indicator) for new data.
 * If aux->alert_mark falls in this area, just set it.  The overflow count is
 * recorded while scanning.
 *
 * SDBs between aux->head and aux->empty_mark were already reset last time
 * and are ready for new samples, so scanning this area can be skipped.
 *
 * Return true if the alert indicator is set successfully and false if not.
 */
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
			     unsigned long long *overflow)
{
	union hws_trailer_header prev, new;
	unsigned long i, range_scan, idx;
	unsigned long long orig_overflow;
	struct hws_trailer_entry *te;

	if (range <= aux_sdb_num_empty(aux))
		/*
		 * No need to scan.  All SDBs in range are marked as empty.
		 * Just set the alert indicator.  Should check race with the
		 * hardware sampler.
		 */
		return aux_set_alert(aux, aux->alert_mark, overflow);

	if (aux->alert_mark <= aux->empty_mark)
		/*
		 * Set alert indicator on empty SDB.  Should check race
		 * with the hardware sampler.
		 */
		if (!aux_set_alert(aux, aux->alert_mark, overflow))
			return false;

	/*
	 * Scan the SDBs to clear full and alert indicators used previously.
	 * Start scanning from one SDB behind empty_mark.  If the new alert
	 * indicator falls into this range, set it.
	 */
	range_scan = range - aux_sdb_num_empty(aux);
	idx = aux->empty_mark + 1;
	for (i = 0; i < range_scan; i++, idx++) {
		te = aux_sdb_trailer(aux, idx);
		prev.val = READ_ONCE_ALIGNED_128(te->header.val);
		do {
			new.val = prev.val;
			orig_overflow = prev.overflow;
			new.f = 0;
			new.overflow = 0;
			if (idx == aux->alert_mark)
				new.a = 1;
			else
				new.a = 0;
		} while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));
		*overflow += orig_overflow;
	}

	/* Update empty_mark to new position */
	aux->empty_mark = aux->head + range - 1;

	return true;
}

/*
 * Measurement alert handler for diagnostic mode sampling.
 */
static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
{
	struct aux_buffer *aux;
	int done = 0;
	unsigned long range = 0, size;
	unsigned long long overflow = 0;
	struct perf_output_handle *handle = &cpuhw->handle;
	unsigned long num_sdb;

	aux = perf_get_aux(handle);
	if (!aux)
		return;

	/* Inform user space that new data arrived */
	size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
	debug_sprintf_event(sfdbg, 6, "%s #alert %ld\n", __func__,
			    size >> PAGE_SHIFT);
	perf_aux_output_end(handle, size);

	num_sdb = aux->sfb.num_sdb;
	while (!done) {
		/* Get an output handle */
		aux = perf_aux_output_begin(handle, cpuhw->event);
		if (handle->size == 0) {
			pr_err("The AUX buffer with %lu pages for the "
			       "diagnostic-sampling mode is full\n",
			       num_sdb);
			break;
		}
		if (!aux)
			return;

		/* Update head and alert_mark to new position */
		aux->head = handle->head >> PAGE_SHIFT;
		range = (handle->size + 1) >> PAGE_SHIFT;
		if (range == 1)
			aux->alert_mark = aux->head;
		else
			aux->alert_mark = aux->head + range / 2 - 1;

		if (aux_reset_buffer(aux, range, &overflow)) {
			if (!overflow) {
				done = 1;
				break;
			}
			size = range << PAGE_SHIFT;
			perf_aux_output_end(&cpuhw->handle, size);
			pr_err("Sample data caused the AUX buffer with %lu "
			       "pages to overflow\n", aux->sfb.num_sdb);
		} else {
			size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
			perf_aux_output_end(&cpuhw->handle, size);
		}
	}
}

/*
 * Callback when freeing AUX buffers.
 */
static void aux_buffer_free(void *data)
{
	struct aux_buffer *aux = data;
	unsigned long i, num_sdbt;

	if (!aux)
		return;

	/* Free SDBTs.  SDBs are freed by the caller */
	num_sdbt = aux->sfb.num_sdbt;
	for (i = 0; i < num_sdbt; i++)
		free_page(aux->sdbt_index[i]);

	kfree(aux->sdbt_index);
	kfree(aux->sdb_index);
	kfree(aux);
}

static void aux_sdb_init(unsigned long sdb)
{
	struct hws_trailer_entry *te;

	te = trailer_entry_ptr(sdb);

	/* Save clock base */
	te->clock_base = 1;
	te->progusage2 = tod_clock_base.tod;
}

/*
 * aux_buffer_setup() - Setup AUX buffer for diagnostic mode sampling
 * @event:	Event the buffer is setup for, event->cpu == -1 means current
 * @pages:	Array of pointers to buffer pages passed from perf core
 * @nr_pages:	Total pages
 * @snapshot:	Flag for snapshot mode
 *
 * This is the callback when setting up an event that uses an AUX buffer.
 * The perf tool can trigger this by an additional mmap() call on the event.
 * Unlike the buffer for basic samples, the AUX buffer belongs to the event.
 * It is scheduled with the task among online CPUs when it is a per-thread
 * event.
 *
 * Return the private AUX buffer structure on success or NULL on failure.
 */
static void *aux_buffer_setup(struct perf_event *event, void **pages,
			      int nr_pages, bool snapshot)
{
	struct sf_buffer *sfb;
	struct aux_buffer *aux;
	unsigned long *new, *tail;
	int i, n_sdbt;

	if (!nr_pages || !pages)
		return NULL;

	if (nr_pages > CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR) {
		pr_err("AUX buffer size (%i pages) is larger than the "
		       "maximum sampling buffer limit\n",
		       nr_pages);
		return NULL;
	} else if (nr_pages < CPUM_SF_MIN_SDB * CPUM_SF_SDB_DIAG_FACTOR) {
		pr_err("AUX buffer size (%i pages) is less than the "
		       "minimum sampling buffer limit\n",
		       nr_pages);
		return NULL;
	}

	/* Allocate aux_buffer struct for the event */
	aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL);
	if (!aux)
		goto no_aux;
	sfb = &aux->sfb;

	/* Allocate sdbt_index for fast reference */
	n_sdbt = DIV_ROUND_UP(nr_pages, CPUM_SF_SDB_PER_TABLE);
	aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL);
	if (!aux->sdbt_index)
		goto no_sdbt_index;

	/* Allocate sdb_index for fast reference */
	aux->sdb_index = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
	if (!aux->sdb_index)
		goto no_sdb_index;

	/* Allocate the first SDBT */
	sfb->num_sdbt = 0;
	sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL);
	if (!sfb->sdbt)
		goto no_sdbt;
	aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt;
	tail = sfb->tail = sfb->sdbt;

	/*
	 * Link the provided pages of the AUX buffer to the SDBT.
	 * Allocate SDBTs if needed.
	 */
	for (i = 0; i < nr_pages; i++, tail++) {
		if (require_table_link(tail)) {
			new = (unsigned long *)get_zeroed_page(GFP_KERNEL);
			if (!new)
				goto no_sdbt;
			aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new;
			/* Link current page to tail of chain */
			*tail = virt_to_phys(new) + 1;
			tail = new;
		}
		/* Tail is the entry in an SDBT */
		*tail = virt_to_phys(pages[i]);
		aux->sdb_index[i] = (unsigned long)pages[i];
		aux_sdb_init((unsigned long)pages[i]);
	}
	sfb->num_sdb = nr_pages;

	/* Link the last entry in the SDBT to the first SDBT */
	*tail = virt_to_phys(sfb->sdbt) + 1;
	sfb->tail = tail;

	/*
	 * Initially all SDBs are zeroed.  Mark them as empty, so there is
	 * no need to clear the full indicator when this event is first
	 * added.
	 */
	aux->empty_mark = sfb->num_sdb - 1;

	return aux;

no_sdbt:
	/* SDBs (AUX buffer pages) are freed by caller */
	for (i = 0; i < sfb->num_sdbt; i++)
		free_page(aux->sdbt_index[i]);
	kfree(aux->sdb_index);
no_sdb_index:
	kfree(aux->sdbt_index);
no_sdbt_index:
	kfree(aux);
no_aux:
	return NULL;
}
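
/* Worked example for the index allocation in aux_buffer_setup(): mapping
 * nr_pages = 1024 AUX pages requires n_sdbt = DIV_ROUND_UP(1024, 511) = 3
 * SDBTs, since each SDBT addresses up to CPUM_SF_SDB_PER_TABLE = 511 SDBs
 * before its table-link entry is needed.
 */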

static void cpumsf_pmu_read(struct perf_event *event)
{
	/* Nothing to do ... updates are interrupt-driven */
}

/* Check if the new sampling period/frequency is appropriate.
 *
 * Return non-zero on error and zero if the checks pass.
 */
static int cpumsf_pmu_check_period(struct perf_event *event, u64 value)
{
	struct hws_qsi_info_block si;
	unsigned long rate;
	bool do_freq;

	memset(&si, 0, sizeof(si));
	if (event->cpu == -1) {
		qsi(&si);
	} else {
		/* Event is pinned to a particular CPU, retrieve the per-CPU
		 * sampling structure for accessing the CPU-specific QSI.
		 */
		struct cpu_hw_sf *cpuhw = &per_cpu(cpu_hw_sf, event->cpu);

		si = cpuhw->qsi;
	}

	do_freq = !!SAMPL_FREQ_MODE(&event->hw);
	rate = getrate(do_freq, value, &si);
	if (!rate)
		return -EINVAL;

	event->attr.sample_period = rate;
	SAMPL_RATE(&event->hw) = rate;
	hw_init_period(&event->hw, SAMPL_RATE(&event->hw));
	return 0;
}
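
/* Note: the perf core invokes the check_period callback when user space
 * changes the period of a running event, typically via the
 * PERF_EVENT_IOC_PERIOD ioctl(); see also the "Rate may be adjusted with
 * ioctl()" handling in cpumsf_pmu_enable().
 */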

/* Activate sampling control.
 * Next call of pmu_enable() starts sampling.
 */
static void cpumsf_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);

	if (!(event->hw.state & PERF_HES_STOPPED))
		return;
	perf_pmu_disable(event->pmu);
	event->hw.state = 0;
	cpuhw->lsctl.cs = 1;
	if (SAMPL_DIAG_MODE(&event->hw))
		cpuhw->lsctl.cd = 1;
	perf_pmu_enable(event->pmu);
}

/* Deactivate sampling control.
 * Next call of pmu_enable() stops sampling.
 */
static void cpumsf_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);

	if (event->hw.state & PERF_HES_STOPPED)
		return;

	perf_pmu_disable(event->pmu);
	cpuhw->lsctl.cs = 0;
	cpuhw->lsctl.cd = 0;
	event->hw.state |= PERF_HES_STOPPED;

	if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
		/* CPU hotplug off removes SDBs. No samples to extract. */
		if (cpuhw->flags & PMU_F_RESERVED)
			hw_perf_event_update(event, 1);
		event->hw.state |= PERF_HES_UPTODATE;
	}
	perf_pmu_enable(event->pmu);
}

static int cpumsf_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
	struct aux_buffer *aux;
	int err = 0;

	if (cpuhw->flags & PMU_F_IN_USE)
		return -EAGAIN;

	if (!SAMPL_DIAG_MODE(&event->hw) && !sf_buffer_available(cpuhw))
		return -EINVAL;

	perf_pmu_disable(event->pmu);

	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	/* Set up sampling controls. Always program the sampling register
	 * using the SDB-table start. Reset TEAR_REG event hardware register
	 * that is used by hw_perf_event_update() to store the sampling buffer
	 * position after samples have been flushed.
	 */
	cpuhw->lsctl.s = 0;
	cpuhw->lsctl.h = 1;
	cpuhw->lsctl.interval = SAMPL_RATE(&event->hw);
	if (!SAMPL_DIAG_MODE(&event->hw)) {
		cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt);
		cpuhw->lsctl.dear = *(unsigned long *)cpuhw->sfb.sdbt;
		TEAR_REG(&event->hw) = (unsigned long)cpuhw->sfb.sdbt;
	}

	/* Ensure sampling functions are in the disabled state. If disabled,
	 * switch on sampling enable control.
	 */
	if (WARN_ON_ONCE(cpuhw->lsctl.es == 1 || cpuhw->lsctl.ed == 1)) {
		err = -EAGAIN;
		goto out;
	}
	if (SAMPL_DIAG_MODE(&event->hw)) {
		aux = perf_aux_output_begin(&cpuhw->handle, event);
		if (!aux) {
			err = -EINVAL;
			goto out;
		}
		err = aux_output_begin(&cpuhw->handle, aux, cpuhw);
		if (err)
			goto out;
		cpuhw->lsctl.ed = 1;
	}
	cpuhw->lsctl.es = 1;

	/* Set in_use flag and store event */
	cpuhw->event = event;
	cpuhw->flags |= PMU_F_IN_USE;

	if (flags & PERF_EF_START)
		cpumsf_pmu_start(event, PERF_EF_RELOAD);
out:
	perf_event_update_userpage(event);
	perf_pmu_enable(event->pmu);
	return err;
}

static void cpumsf_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);

	perf_pmu_disable(event->pmu);
	cpumsf_pmu_stop(event, PERF_EF_UPDATE);

	cpuhw->lsctl.es = 0;
	cpuhw->lsctl.ed = 0;
	cpuhw->flags &= ~PMU_F_IN_USE;
	cpuhw->event = NULL;

	if (SAMPL_DIAG_MODE(&event->hw))
		aux_output_end(&cpuhw->handle);
	perf_event_update_userpage(event);
	perf_pmu_enable(event->pmu);
}
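
/*
 * Summary of the control-bit handling above (descriptive comment added
 * for clarity; the semantics follow from how the callbacks use the bits
 * and from the comments on cpumsf_pmu_start()/cpumsf_pmu_stop()):
 * cpumsf_pmu_add() switches on the enable controls (lsctl.es and, in
 * diagnostic mode, lsctl.ed), cpumsf_pmu_start() switches on the
 * activation controls (lsctl.cs/lsctl.cd), cpumsf_pmu_stop() clears the
 * activation controls again, and cpumsf_pmu_del() finally clears the
 * enable controls.  A modified lsctl block only takes effect on the
 * next pmu_enable() call.
 */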

CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF);
CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG);

/* Attribute list for CPU_SF.
 *
 * The availability depends on the CPU_MF sampling facility authorization
 * for basic + diagnostic samples. This is determined at initialization
 * time by the sampling facility device driver.
 * If the authorization for basic samples is turned off, it should also
 * be turned off for diagnostic sampling.
 *
 * During initialization of the device driver, check the authorization
 * level for diagnostic sampling and install the attribute
 * file for diagnostic sampling if necessary.
 *
 * For now, install a placeholder to reference all possible attributes:
 * SF_CYCLES_BASIC and SF_CYCLES_BASIC_DIAG.
 * Add another entry for the final NULL pointer.
 */
enum {
	SF_CYCLES_BASIC_ATTR_IDX = 0,
	SF_CYCLES_BASIC_DIAG_ATTR_IDX,
	SF_CYCLES_ATTR_MAX
};

static struct attribute *cpumsf_pmu_events_attr[SF_CYCLES_ATTR_MAX + 1] = {
	[SF_CYCLES_BASIC_ATTR_IDX] = CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC)
};

PMU_FORMAT_ATTR(event, "config:0-63");

static struct attribute *cpumsf_pmu_format_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group cpumsf_pmu_events_group = {
	.name = "events",
	.attrs = cpumsf_pmu_events_attr,
};

static struct attribute_group cpumsf_pmu_format_group = {
	.name = "format",
	.attrs = cpumsf_pmu_format_attr,
};

static const struct attribute_group *cpumsf_pmu_attr_groups[] = {
	&cpumsf_pmu_events_group,
	&cpumsf_pmu_format_group,
	NULL,
};

static struct pmu cpumf_sampling = {
	.pmu_enable   = cpumsf_pmu_enable,
	.pmu_disable  = cpumsf_pmu_disable,

	.event_init   = cpumsf_pmu_event_init,
	.add	      = cpumsf_pmu_add,
	.del	      = cpumsf_pmu_del,

	.start	      = cpumsf_pmu_start,
	.stop	      = cpumsf_pmu_stop,
	.read	      = cpumsf_pmu_read,

	.attr_groups  = cpumsf_pmu_attr_groups,

	.setup_aux    = aux_buffer_setup,
	.free_aux     = aux_buffer_free,

	.check_period = cpumsf_pmu_check_period,
};

static void cpumf_measurement_alert(struct ext_code ext_code,
				    unsigned int alert, unsigned long unused)
{
	struct cpu_hw_sf *cpuhw;

	if (!(alert & CPU_MF_INT_SF_MASK))
		return;
	inc_irq_stat(IRQEXT_CMS);
	cpuhw = this_cpu_ptr(&cpu_hw_sf);

	/* Measurement alerts are shared and might happen when the PMU
	 * is not reserved. Ignore these alerts in this case. */
	if (!(cpuhw->flags & PMU_F_RESERVED))
		return;

	/* The processing below must take care of multiple alert events that
	 * might be indicated concurrently. */

	/* Program alert request */
	if (alert & CPU_MF_INT_SF_PRA) {
		if (cpuhw->flags & PMU_F_IN_USE) {
			if (SAMPL_DIAG_MODE(&cpuhw->event->hw))
				hw_collect_aux(cpuhw);
			else
				hw_perf_event_update(cpuhw->event, 0);
		}
	}

	/* Report measurement alerts only for non-PRA codes */
	if (alert != CPU_MF_INT_SF_PRA)
		debug_sprintf_event(sfdbg, 6, "%s alert %#x\n", __func__,
				    alert);

	/* Sampling authorization change request */
	if (alert & CPU_MF_INT_SF_SACA)
		qsi(&cpuhw->qsi);

	/* Loss of sample data due to high-priority machine activities */
	if (alert & CPU_MF_INT_SF_LSDA) {
		pr_err("Sample data was lost\n");
		cpuhw->flags |= PMU_F_ERR_LSDA;
		sf_disable();
	}

	/* Invalid sampling buffer entry */
	if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) {
		pr_err("A sampling buffer entry is incorrect (alert=%#x)\n",
		       alert);
		cpuhw->flags |= PMU_F_ERR_IBE;
		sf_disable();
	}
}
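
/*
 * Userspace view (illustrative, assuming the usual perf sysfs layout):
 * the attribute groups above surface this PMU under
 * /sys/bus/event_source/devices/cpum_sf/, e.g.
 *
 *	# cat /sys/bus/event_source/devices/cpum_sf/events/SF_CYCLES_BASIC
 *	event=0xb0000
 *
 * so a basic-sampling session might be started with a command such as
 *
 *	perf record -e cpum_sf/SF_CYCLES_BASIC/ -- <workload>
 *
 * The exact paths, output format, and tooling behavior depend on the
 * perf userspace version; this is shown only to illustrate what the
 * attribute groups export.
 */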

static int cpusf_pmu_setup(unsigned int cpu, int flags)
{
	/* Ignore the notification if no events are scheduled on the PMU.
	 * This might be racy...
	 */
	if (!refcount_read(&num_events))
		return 0;

	local_irq_disable();
	setup_pmc_cpu(&flags);
	local_irq_enable();
	return 0;
}

static int s390_pmu_sf_online_cpu(unsigned int cpu)
{
	return cpusf_pmu_setup(cpu, PMC_INIT);
}

static int s390_pmu_sf_offline_cpu(unsigned int cpu)
{
	return cpusf_pmu_setup(cpu, PMC_RELEASE);
}

static int param_get_sfb_size(char *buffer, const struct kernel_param *kp)
{
	if (!cpum_sf_avail())
		return -ENODEV;
	return sprintf(buffer, "%lu,%lu", CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB);
}

static int param_set_sfb_size(const char *val, const struct kernel_param *kp)
{
	int rc;
	unsigned long min, max;

	if (!cpum_sf_avail())
		return -ENODEV;
	if (!val || !strlen(val))
		return -EINVAL;

	/* Valid parameter values: "min,max" or "max" */
	min = CPUM_SF_MIN_SDB;
	max = CPUM_SF_MAX_SDB;
	if (strchr(val, ','))
		rc = (sscanf(val, "%lu,%lu", &min, &max) == 2) ? 0 : -EINVAL;
	else
		rc = kstrtoul(val, 10, &max);

	if (min < 2 || min >= max || max > get_num_physpages())
		rc = -EINVAL;
	if (rc)
		return rc;

	sfb_set_limits(min, max);
	pr_info("The sampling buffer limits have changed to: "
		"min %lu max %lu (diag %lu)\n",
		CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB, CPUM_SF_SDB_DIAG_FACTOR);
	return 0;
}

#define param_check_sfb_size(name, p) __param_check(name, p, void)
static const struct kernel_param_ops param_ops_sfb_size = {
	.set = param_set_sfb_size,
	.get = param_get_sfb_size,
};
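
/*
 * Usage illustration (not code): together with the core_param()
 * statement at the end of this file, the param_ops above let the
 * sampling buffer limits be set on the kernel command line, e.g.
 *
 *	cpum_sfb_size=32,1024		(sets min and max)
 *	cpum_sfb_size=1024		(sets max only)
 *
 * or changed at runtime through the corresponding parameters file under
 * /sys/module/ (the exact path depends on how the kernel exposes core
 * parameters).  param_set_sfb_size() validates the values: min must be
 * at least 2 and less than max, and max must not exceed the number of
 * physical pages.
 */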

enum {
	RS_INIT_FAILURE_BSDES	= 2,	/* Bad basic sampling size */
	RS_INIT_FAILURE_ALRT	= 3,	/* IRQ registration failure */
	RS_INIT_FAILURE_PERF	= 4	/* PMU registration failure */
};

static void __init pr_cpumsf_err(unsigned int reason)
{
	pr_err("Sampling facility support for perf is not available: "
	       "reason %#x\n", reason);
}

static int __init init_cpum_sampling_pmu(void)
{
	struct hws_qsi_info_block si;
	int err;

	if (!cpum_sf_avail())
		return -ENODEV;

	memset(&si, 0, sizeof(si));
	qsi(&si);
	if (!si.as && !si.ad)
		return -ENODEV;

	if (si.bsdes != sizeof(struct hws_basic_entry)) {
		pr_cpumsf_err(RS_INIT_FAILURE_BSDES);
		return -EINVAL;
	}

	if (si.ad) {
		sfb_set_limits(CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB);
		/* Sampling of diagnostic data is authorized; install the
		 * event in the attribute list of the PMU device.
		 */
		cpumsf_pmu_events_attr[SF_CYCLES_BASIC_DIAG_ATTR_IDX] =
			CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG);
	}

	sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80);
	if (!sfdbg) {
		pr_err("Registering for s390dbf failed\n");
		return -ENOMEM;
	}
	debug_register_view(sfdbg, &debug_sprintf_view);

	err = register_external_irq(EXT_IRQ_MEASURE_ALERT,
				    cpumf_measurement_alert);
	if (err) {
		pr_cpumsf_err(RS_INIT_FAILURE_ALRT);
		debug_unregister(sfdbg);
		goto out;
	}

	err = perf_pmu_register(&cpumf_sampling, "cpum_sf", PERF_TYPE_RAW);
	if (err) {
		pr_cpumsf_err(RS_INIT_FAILURE_PERF);
		unregister_external_irq(EXT_IRQ_MEASURE_ALERT,
					cpumf_measurement_alert);
		debug_unregister(sfdbg);
		goto out;
	}

	cpuhp_setup_state(CPUHP_AP_PERF_S390_SF_ONLINE, "perf/s390/sf:online",
			  s390_pmu_sf_online_cpu, s390_pmu_sf_offline_cpu);
out:
	return err;
}

arch_initcall(init_cpum_sampling_pmu);
core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644);
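
/*
 * Debugging note (illustrative): debug_register() above creates an
 * s390dbf debug area named "cpum_sf".  On kernels with the s390 debug
 * feature enabled, the sprintf view registered via debug_register_view()
 * can typically be read from debugfs, e.g.
 *
 *	cat /sys/kernel/debug/s390dbf/cpum_sf/sprintf
 *
 * which shows the debug_sprintf_event() records emitted by this driver,
 * such as the measurement-alert messages from cpumf_measurement_alert().
 */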