1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Hypervisor supplied "24x7" performance counter support 4 * 5 * Author: Cody P Schafer <cody@linux.vnet.ibm.com> 6 * Copyright 2014 IBM Corporation. 7 */ 8 9 #define pr_fmt(fmt) "hv-24x7: " fmt 10 11 #include <linux/perf_event.h> 12 #include <linux/rbtree.h> 13 #include <linux/module.h> 14 #include <linux/slab.h> 15 #include <linux/vmalloc.h> 16 17 #include <asm/cputhreads.h> 18 #include <asm/firmware.h> 19 #include <asm/hvcall.h> 20 #include <asm/io.h> 21 #include <linux/byteorder/generic.h> 22 23 #include <asm/rtas.h> 24 #include "hv-24x7.h" 25 #include "hv-24x7-catalog.h" 26 #include "hv-common.h" 27 28 /* Version of the 24x7 hypervisor API that we should use in this machine. */ 29 static int interface_version; 30 31 /* Whether we have to aggregate result data for some domains. */ 32 static bool aggregate_result_elements; 33 34 static cpumask_t hv_24x7_cpumask; 35 36 static bool domain_is_valid(unsigned domain) 37 { 38 switch (domain) { 39 #define DOMAIN(n, v, x, c) \ 40 case HV_PERF_DOMAIN_##n: \ 41 /* fall through */ 42 #include "hv-24x7-domains.h" 43 #undef DOMAIN 44 return true; 45 default: 46 return false; 47 } 48 } 49 50 static bool is_physical_domain(unsigned domain) 51 { 52 switch (domain) { 53 #define DOMAIN(n, v, x, c) \ 54 case HV_PERF_DOMAIN_##n: \ 55 return c; 56 #include "hv-24x7-domains.h" 57 #undef DOMAIN 58 default: 59 return false; 60 } 61 } 62 63 /* 64 * The Processor Module Information system parameter allows transferring 65 * of certain processor module information from the platform to the OS. 66 * Refer PAPR+ document to get parameter token value as '43'. 67 */ 68 69 #define PROCESSOR_MODULE_INFO 43 70 71 static u32 phys_sockets; /* Physical sockets */ 72 static u32 phys_chipspersocket; /* Physical chips per socket*/ 73 static u32 phys_coresperchip; /* Physical cores per chip */ 74 75 /* 76 * read_24x7_sys_info() 77 * Retrieve the number of sockets and chips per socket and cores per 78 * chip details through the get-system-parameter rtas call. 79 */ 80 void read_24x7_sys_info(void) 81 { 82 int call_status, len, ntypes; 83 84 spin_lock(&rtas_data_buf_lock); 85 86 /* 87 * Making system parameter: chips and sockets and cores per chip 88 * default to 1. 89 */ 90 phys_sockets = 1; 91 phys_chipspersocket = 1; 92 phys_coresperchip = 1; 93 94 call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, 95 NULL, 96 PROCESSOR_MODULE_INFO, 97 __pa(rtas_data_buf), 98 RTAS_DATA_BUF_SIZE); 99 100 if (call_status != 0) { 101 pr_err("Error calling get-system-parameter %d\n", 102 call_status); 103 } else { 104 len = be16_to_cpup((__be16 *)&rtas_data_buf[0]); 105 if (len < 8) 106 goto out; 107 108 ntypes = be16_to_cpup((__be16 *)&rtas_data_buf[2]); 109 110 if (!ntypes) 111 goto out; 112 113 phys_sockets = be16_to_cpup((__be16 *)&rtas_data_buf[4]); 114 phys_chipspersocket = be16_to_cpup((__be16 *)&rtas_data_buf[6]); 115 phys_coresperchip = be16_to_cpup((__be16 *)&rtas_data_buf[8]); 116 } 117 118 out: 119 spin_unlock(&rtas_data_buf_lock); 120 } 121 122 /* Domains for which more than one result element are returned for each event. */ 123 static bool domain_needs_aggregation(unsigned int domain) 124 { 125 return aggregate_result_elements && 126 (domain == HV_PERF_DOMAIN_PHYS_CORE || 127 (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE && 128 domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE)); 129 } 130 131 static const char *domain_name(unsigned domain) 132 { 133 if (!domain_is_valid(domain)) 134 return NULL; 135 136 switch (domain) { 137 case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip"; 138 case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core"; 139 case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core"; 140 case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip"; 141 case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node"; 142 case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node"; 143 } 144 145 WARN_ON_ONCE(domain); 146 return NULL; 147 } 148 149 static bool catalog_entry_domain_is_valid(unsigned domain) 150 { 151 /* POWER8 doesn't support virtual domains. */ 152 if (interface_version == 1) 153 return is_physical_domain(domain); 154 else 155 return domain_is_valid(domain); 156 } 157 158 /* 159 * TODO: Merging events: 160 * - Think of the hcall as an interface to a 4d array of counters: 161 * - x = domains 162 * - y = indexes in the domain (core, chip, vcpu, node, etc) 163 * - z = offset into the counter space 164 * - w = lpars (guest vms, "logical partitions") 165 * - A single request is: x,y,y_last,z,z_last,w,w_last 166 * - this means we can retrieve a rectangle of counters in y,z for a single x. 167 * 168 * - Things to consider (ignoring w): 169 * - input cost_per_request = 16 170 * - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs 171 * - limited number of requests per hcall (must fit into 4K bytes) 172 * - 4k = 16 [buffer header] - 16 [request size] * request_count 173 * - 255 requests per hcall 174 * - sometimes it will be more efficient to read extra data and discard 175 */ 176 177 /* 178 * Example usage: 179 * perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/' 180 */ 181 182 /* u3 0-6, one of HV_24X7_PERF_DOMAIN */ 183 EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3); 184 /* u16 */ 185 EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31); 186 EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31); 187 EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31); 188 /* u32, see "data_offset" */ 189 EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63); 190 /* u16 */ 191 EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15); 192 193 EVENT_DEFINE_RANGE(reserved1, config, 4, 15); 194 EVENT_DEFINE_RANGE(reserved2, config1, 16, 63); 195 EVENT_DEFINE_RANGE(reserved3, config2, 0, 63); 196 197 static struct attribute *format_attrs[] = { 198 &format_attr_domain.attr, 199 &format_attr_offset.attr, 200 &format_attr_core.attr, 201 &format_attr_chip.attr, 202 &format_attr_vcpu.attr, 203 &format_attr_lpar.attr, 204 NULL, 205 }; 206 207 static struct attribute_group format_group = { 208 .name = "format", 209 .attrs = format_attrs, 210 }; 211 212 static struct attribute_group event_group = { 213 .name = "events", 214 /* .attrs is set in init */ 215 }; 216 217 static struct attribute_group event_desc_group = { 218 .name = "event_descs", 219 /* .attrs is set in init */ 220 }; 221 222 static struct attribute_group event_long_desc_group = { 223 .name = "event_long_descs", 224 /* .attrs is set in init */ 225 }; 226 227 static struct kmem_cache *hv_page_cache; 228 229 DEFINE_PER_CPU(int, hv_24x7_txn_flags); 230 DEFINE_PER_CPU(int, hv_24x7_txn_err); 231 232 struct hv_24x7_hw { 233 struct perf_event *events[255]; 234 }; 235 236 DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); 237 238 /* 239 * request_buffer and result_buffer are not required to be 4k aligned, 240 * but are not allowed to cross any 4k boundary. Aligning them to 4k is 241 * the simplest way to ensure that. 242 */ 243 #define H24x7_DATA_BUFFER_SIZE 4096 244 DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 245 DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 246 247 static unsigned int max_num_requests(int interface_version) 248 { 249 return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer)) 250 / H24x7_REQUEST_SIZE(interface_version); 251 } 252 253 static char *event_name(struct hv_24x7_event_data *ev, int *len) 254 { 255 *len = be16_to_cpu(ev->event_name_len) - 2; 256 return (char *)ev->remainder; 257 } 258 259 static char *event_desc(struct hv_24x7_event_data *ev, int *len) 260 { 261 unsigned nl = be16_to_cpu(ev->event_name_len); 262 __be16 *desc_len = (__be16 *)(ev->remainder + nl - 2); 263 264 *len = be16_to_cpu(*desc_len) - 2; 265 return (char *)ev->remainder + nl; 266 } 267 268 static char *event_long_desc(struct hv_24x7_event_data *ev, int *len) 269 { 270 unsigned nl = be16_to_cpu(ev->event_name_len); 271 __be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2); 272 unsigned desc_len = be16_to_cpu(*desc_len_); 273 __be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2); 274 275 *len = be16_to_cpu(*long_desc_len) - 2; 276 return (char *)ev->remainder + nl + desc_len; 277 } 278 279 static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev, 280 void *end) 281 { 282 void *start = ev; 283 284 return (start + offsetof(struct hv_24x7_event_data, remainder)) < end; 285 } 286 287 /* 288 * Things we don't check: 289 * - padding for desc, name, and long/detailed desc is required to be '\0' 290 * bytes. 291 * 292 * Return NULL if we pass end, 293 * Otherwise return the address of the byte just following the event. 294 */ 295 static void *event_end(struct hv_24x7_event_data *ev, void *end) 296 { 297 void *start = ev; 298 __be16 *dl_, *ldl_; 299 unsigned dl, ldl; 300 unsigned nl = be16_to_cpu(ev->event_name_len); 301 302 if (nl < 2) { 303 pr_debug("%s: name length too short: %d", __func__, nl); 304 return NULL; 305 } 306 307 if (start + nl > end) { 308 pr_debug("%s: start=%p + nl=%u > end=%p", 309 __func__, start, nl, end); 310 return NULL; 311 } 312 313 dl_ = (__be16 *)(ev->remainder + nl - 2); 314 if (!IS_ALIGNED((uintptr_t)dl_, 2)) 315 pr_warn("desc len not aligned %p", dl_); 316 dl = be16_to_cpu(*dl_); 317 if (dl < 2) { 318 pr_debug("%s: desc len too short: %d", __func__, dl); 319 return NULL; 320 } 321 322 if (start + nl + dl > end) { 323 pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p", 324 __func__, start, nl, dl, start + nl + dl, end); 325 return NULL; 326 } 327 328 ldl_ = (__be16 *)(ev->remainder + nl + dl - 2); 329 if (!IS_ALIGNED((uintptr_t)ldl_, 2)) 330 pr_warn("long desc len not aligned %p", ldl_); 331 ldl = be16_to_cpu(*ldl_); 332 if (ldl < 2) { 333 pr_debug("%s: long desc len too short (ldl=%u)", 334 __func__, ldl); 335 return NULL; 336 } 337 338 if (start + nl + dl + ldl > end) { 339 pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p", 340 __func__, start, nl, dl, ldl, end); 341 return NULL; 342 } 343 344 return start + nl + dl + ldl; 345 } 346 347 static long h_get_24x7_catalog_page_(unsigned long phys_4096, 348 unsigned long version, unsigned long index) 349 { 350 pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)", 351 phys_4096, version, index); 352 353 WARN_ON(!IS_ALIGNED(phys_4096, 4096)); 354 355 return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE, 356 phys_4096, version, index); 357 } 358 359 static long h_get_24x7_catalog_page(char page[], u64 version, u32 index) 360 { 361 return h_get_24x7_catalog_page_(virt_to_phys(page), 362 version, index); 363 } 364 365 /* 366 * Each event we find in the catalog, will have a sysfs entry. Format the 367 * data for this sysfs entry based on the event's domain. 368 * 369 * Events belonging to the Chip domain can only be monitored in that domain. 370 * i.e the domain for these events is a fixed/knwon value. 371 * 372 * Events belonging to the Core domain can be monitored either in the physical 373 * core or in one of the virtual CPU domains. So the domain value for these 374 * events must be specified by the user (i.e is a required parameter). Format 375 * the Core events with 'domain=?' so the perf-tool can error check required 376 * parameters. 377 * 378 * NOTE: For the Core domain events, rather than making domain a required 379 * parameter we could default it to PHYS_CORE and allowe users to 380 * override the domain to one of the VCPU domains. 381 * 382 * However, this can make the interface a little inconsistent. 383 * 384 * If we set domain=2 (PHYS_CHIP) and allow user to override this field 385 * the user may be tempted to also modify the "offset=x" field in which 386 * can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and 387 * HPM_INST (offset=0x20) events. With: 388 * 389 * perf stat -e hv_24x7/HPM_PCYC,offset=0x20/ 390 * 391 * we end up monitoring HPM_INST, while the command line has HPM_PCYC. 392 * 393 * By not assigning a default value to the domain for the Core events, 394 * we can have simple guidelines: 395 * 396 * - Specifying values for parameters with "=?" is required. 397 * 398 * - Specifying (i.e overriding) values for other parameters 399 * is undefined. 400 */ 401 static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain) 402 { 403 const char *sindex; 404 const char *lpar; 405 const char *domain_str; 406 char buf[8]; 407 408 switch (domain) { 409 case HV_PERF_DOMAIN_PHYS_CHIP: 410 snprintf(buf, sizeof(buf), "%d", domain); 411 domain_str = buf; 412 lpar = "0x0"; 413 sindex = "chip"; 414 break; 415 case HV_PERF_DOMAIN_PHYS_CORE: 416 domain_str = "?"; 417 lpar = "0x0"; 418 sindex = "core"; 419 break; 420 default: 421 domain_str = "?"; 422 lpar = "?"; 423 sindex = "vcpu"; 424 } 425 426 return kasprintf(GFP_KERNEL, 427 "domain=%s,offset=0x%x,%s=?,lpar=%s", 428 domain_str, 429 be16_to_cpu(event->event_counter_offs) + 430 be16_to_cpu(event->event_group_record_offs), 431 sindex, 432 lpar); 433 } 434 435 /* Avoid trusting fw to NUL terminate strings */ 436 static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp) 437 { 438 return kasprintf(gfp, "%.*s", max_len, maybe_str); 439 } 440 441 static ssize_t device_show_string(struct device *dev, 442 struct device_attribute *attr, char *buf) 443 { 444 struct dev_ext_attribute *d; 445 446 d = container_of(attr, struct dev_ext_attribute, attr); 447 448 return sprintf(buf, "%s\n", (char *)d->var); 449 } 450 451 static ssize_t cpumask_show(struct device *dev, 452 struct device_attribute *attr, char *buf) 453 { 454 return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask); 455 } 456 457 static ssize_t sockets_show(struct device *dev, 458 struct device_attribute *attr, char *buf) 459 { 460 return sprintf(buf, "%d\n", phys_sockets); 461 } 462 463 static ssize_t chipspersocket_show(struct device *dev, 464 struct device_attribute *attr, char *buf) 465 { 466 return sprintf(buf, "%d\n", phys_chipspersocket); 467 } 468 469 static ssize_t coresperchip_show(struct device *dev, 470 struct device_attribute *attr, char *buf) 471 { 472 return sprintf(buf, "%d\n", phys_coresperchip); 473 } 474 475 static struct attribute *device_str_attr_create_(char *name, char *str) 476 { 477 struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL); 478 479 if (!attr) 480 return NULL; 481 482 sysfs_attr_init(&attr->attr.attr); 483 484 attr->var = str; 485 attr->attr.attr.name = name; 486 attr->attr.attr.mode = 0444; 487 attr->attr.show = device_show_string; 488 489 return &attr->attr.attr; 490 } 491 492 /* 493 * Allocate and initialize strings representing event attributes. 494 * 495 * NOTE: The strings allocated here are never destroyed and continue to 496 * exist till shutdown. This is to allow us to create as many events 497 * from the catalog as possible, even if we encounter errors with some. 498 * In case of changes to error paths in future, these may need to be 499 * freed by the caller. 500 */ 501 static struct attribute *device_str_attr_create(char *name, int name_max, 502 int name_nonce, 503 char *str, size_t str_max) 504 { 505 char *n; 506 char *s = memdup_to_str(str, str_max, GFP_KERNEL); 507 struct attribute *a; 508 509 if (!s) 510 return NULL; 511 512 if (!name_nonce) 513 n = kasprintf(GFP_KERNEL, "%.*s", name_max, name); 514 else 515 n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name, 516 name_nonce); 517 if (!n) 518 goto out_s; 519 520 a = device_str_attr_create_(n, s); 521 if (!a) 522 goto out_n; 523 524 return a; 525 out_n: 526 kfree(n); 527 out_s: 528 kfree(s); 529 return NULL; 530 } 531 532 static struct attribute *event_to_attr(unsigned ix, 533 struct hv_24x7_event_data *event, 534 unsigned domain, 535 int nonce) 536 { 537 int event_name_len; 538 char *ev_name, *a_ev_name, *val; 539 struct attribute *attr; 540 541 if (!domain_is_valid(domain)) { 542 pr_warn("catalog event %u has invalid domain %u\n", 543 ix, domain); 544 return NULL; 545 } 546 547 val = event_fmt(event, domain); 548 if (!val) 549 return NULL; 550 551 ev_name = event_name(event, &event_name_len); 552 if (!nonce) 553 a_ev_name = kasprintf(GFP_KERNEL, "%.*s", 554 (int)event_name_len, ev_name); 555 else 556 a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d", 557 (int)event_name_len, ev_name, nonce); 558 559 if (!a_ev_name) 560 goto out_val; 561 562 attr = device_str_attr_create_(a_ev_name, val); 563 if (!attr) 564 goto out_name; 565 566 return attr; 567 out_name: 568 kfree(a_ev_name); 569 out_val: 570 kfree(val); 571 return NULL; 572 } 573 574 static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event, 575 int nonce) 576 { 577 int nl, dl; 578 char *name = event_name(event, &nl); 579 char *desc = event_desc(event, &dl); 580 581 /* If there isn't a description, don't create the sysfs file */ 582 if (!dl) 583 return NULL; 584 585 return device_str_attr_create(name, nl, nonce, desc, dl); 586 } 587 588 static struct attribute * 589 event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce) 590 { 591 int nl, dl; 592 char *name = event_name(event, &nl); 593 char *desc = event_long_desc(event, &dl); 594 595 /* If there isn't a description, don't create the sysfs file */ 596 if (!dl) 597 return NULL; 598 599 return device_str_attr_create(name, nl, nonce, desc, dl); 600 } 601 602 static int event_data_to_attrs(unsigned ix, struct attribute **attrs, 603 struct hv_24x7_event_data *event, int nonce) 604 { 605 *attrs = event_to_attr(ix, event, event->domain, nonce); 606 if (!*attrs) 607 return -1; 608 609 return 0; 610 } 611 612 /* */ 613 struct event_uniq { 614 struct rb_node node; 615 const char *name; 616 int nl; 617 unsigned ct; 618 unsigned domain; 619 }; 620 621 static int memord(const void *d1, size_t s1, const void *d2, size_t s2) 622 { 623 if (s1 < s2) 624 return 1; 625 if (s1 > s2) 626 return -1; 627 628 return memcmp(d1, d2, s1); 629 } 630 631 static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2, 632 size_t s2, unsigned d2) 633 { 634 int r = memord(v1, s1, v2, s2); 635 636 if (r) 637 return r; 638 if (d1 > d2) 639 return 1; 640 if (d2 > d1) 641 return -1; 642 return 0; 643 } 644 645 static int event_uniq_add(struct rb_root *root, const char *name, int nl, 646 unsigned domain) 647 { 648 struct rb_node **new = &(root->rb_node), *parent = NULL; 649 struct event_uniq *data; 650 651 /* Figure out where to put new node */ 652 while (*new) { 653 struct event_uniq *it; 654 int result; 655 656 it = rb_entry(*new, struct event_uniq, node); 657 result = ev_uniq_ord(name, nl, domain, it->name, it->nl, 658 it->domain); 659 660 parent = *new; 661 if (result < 0) 662 new = &((*new)->rb_left); 663 else if (result > 0) 664 new = &((*new)->rb_right); 665 else { 666 it->ct++; 667 pr_info("found a duplicate event %.*s, ct=%u\n", nl, 668 name, it->ct); 669 return it->ct; 670 } 671 } 672 673 data = kmalloc(sizeof(*data), GFP_KERNEL); 674 if (!data) 675 return -ENOMEM; 676 677 *data = (struct event_uniq) { 678 .name = name, 679 .nl = nl, 680 .ct = 0, 681 .domain = domain, 682 }; 683 684 /* Add new node and rebalance tree. */ 685 rb_link_node(&data->node, parent, new); 686 rb_insert_color(&data->node, root); 687 688 /* data->ct */ 689 return 0; 690 } 691 692 static void event_uniq_destroy(struct rb_root *root) 693 { 694 /* 695 * the strings we point to are in the giant block of memory filled by 696 * the catalog, and are freed separately. 697 */ 698 struct event_uniq *pos, *n; 699 700 rbtree_postorder_for_each_entry_safe(pos, n, root, node) 701 kfree(pos); 702 } 703 704 705 /* 706 * ensure the event structure's sizes are self consistent and don't cause us to 707 * read outside of the event 708 * 709 * On success, return the event length in bytes. 710 * Otherwise, return -1 (and print as appropriate). 711 */ 712 static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event, 713 size_t event_idx, 714 size_t event_data_bytes, 715 size_t event_entry_count, 716 size_t offset, void *end) 717 { 718 ssize_t ev_len; 719 void *ev_end, *calc_ev_end; 720 721 if (offset >= event_data_bytes) 722 return -1; 723 724 if (event_idx >= event_entry_count) { 725 pr_devel("catalog event data has %zu bytes of padding after last event\n", 726 event_data_bytes - offset); 727 return -1; 728 } 729 730 if (!event_fixed_portion_is_within(event, end)) { 731 pr_warn("event %zu fixed portion is not within range\n", 732 event_idx); 733 return -1; 734 } 735 736 ev_len = be16_to_cpu(event->length); 737 738 if (ev_len % 16) 739 pr_info("event %zu has length %zu not divisible by 16: event=%pK\n", 740 event_idx, ev_len, event); 741 742 ev_end = (__u8 *)event + ev_len; 743 if (ev_end > end) { 744 pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n", 745 event_idx, ev_len, ev_end, end, 746 offset); 747 return -1; 748 } 749 750 calc_ev_end = event_end(event, end); 751 if (!calc_ev_end) { 752 pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n", 753 event_idx, event_data_bytes, event, end, 754 offset); 755 return -1; 756 } 757 758 if (calc_ev_end > ev_end) { 759 pr_warn("event %zu exceeds it's own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n", 760 event_idx, event, ev_end, offset, calc_ev_end); 761 return -1; 762 } 763 764 return ev_len; 765 } 766 767 #define MAX_4K (SIZE_MAX / 4096) 768 769 static int create_events_from_catalog(struct attribute ***events_, 770 struct attribute ***event_descs_, 771 struct attribute ***event_long_descs_) 772 { 773 long hret; 774 size_t catalog_len, catalog_page_len, event_entry_count, 775 event_data_len, event_data_offs, 776 event_data_bytes, junk_events, event_idx, event_attr_ct, i, 777 attr_max, event_idx_last, desc_ct, long_desc_ct; 778 ssize_t ct, ev_len; 779 uint64_t catalog_version_num; 780 struct attribute **events, **event_descs, **event_long_descs; 781 struct hv_24x7_catalog_page_0 *page_0 = 782 kmem_cache_alloc(hv_page_cache, GFP_KERNEL); 783 void *page = page_0; 784 void *event_data, *end; 785 struct hv_24x7_event_data *event; 786 struct rb_root ev_uniq = RB_ROOT; 787 int ret = 0; 788 789 if (!page) { 790 ret = -ENOMEM; 791 goto e_out; 792 } 793 794 hret = h_get_24x7_catalog_page(page, 0, 0); 795 if (hret) { 796 ret = -EIO; 797 goto e_free; 798 } 799 800 catalog_version_num = be64_to_cpu(page_0->version); 801 catalog_page_len = be32_to_cpu(page_0->length); 802 803 if (MAX_4K < catalog_page_len) { 804 pr_err("invalid page count: %zu\n", catalog_page_len); 805 ret = -EIO; 806 goto e_free; 807 } 808 809 catalog_len = catalog_page_len * 4096; 810 811 event_entry_count = be16_to_cpu(page_0->event_entry_count); 812 event_data_offs = be16_to_cpu(page_0->event_data_offs); 813 event_data_len = be16_to_cpu(page_0->event_data_len); 814 815 pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n", 816 catalog_version_num, catalog_len, 817 event_entry_count, event_data_offs, event_data_len); 818 819 if ((MAX_4K < event_data_len) 820 || (MAX_4K < event_data_offs) 821 || (MAX_4K - event_data_offs < event_data_len)) { 822 pr_err("invalid event data offs %zu and/or len %zu\n", 823 event_data_offs, event_data_len); 824 ret = -EIO; 825 goto e_free; 826 } 827 828 if ((event_data_offs + event_data_len) > catalog_page_len) { 829 pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n", 830 event_data_offs, 831 event_data_offs + event_data_len, 832 catalog_page_len); 833 ret = -EIO; 834 goto e_free; 835 } 836 837 if (SIZE_MAX - 1 < event_entry_count) { 838 pr_err("event_entry_count %zu is invalid\n", event_entry_count); 839 ret = -EIO; 840 goto e_free; 841 } 842 843 event_data_bytes = event_data_len * 4096; 844 845 /* 846 * event data can span several pages, events can cross between these 847 * pages. Use vmalloc to make this easier. 848 */ 849 event_data = vmalloc(event_data_bytes); 850 if (!event_data) { 851 pr_err("could not allocate event data\n"); 852 ret = -ENOMEM; 853 goto e_free; 854 } 855 856 end = event_data + event_data_bytes; 857 858 /* 859 * using vmalloc_to_phys() like this only works if PAGE_SIZE is 860 * divisible by 4096 861 */ 862 BUILD_BUG_ON(PAGE_SIZE % 4096); 863 864 for (i = 0; i < event_data_len; i++) { 865 hret = h_get_24x7_catalog_page_( 866 vmalloc_to_phys(event_data + i * 4096), 867 catalog_version_num, 868 i + event_data_offs); 869 if (hret) { 870 pr_err("Failed to get event data in page %zu: rc=%ld\n", 871 i + event_data_offs, hret); 872 ret = -EIO; 873 goto e_event_data; 874 } 875 } 876 877 /* 878 * scan the catalog to determine the number of attributes we need, and 879 * verify it at the same time. 880 */ 881 for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0; 882 ; 883 event_idx++, event = (void *)event + ev_len) { 884 size_t offset = (void *)event - (void *)event_data; 885 char *name; 886 int nl; 887 888 ev_len = catalog_event_len_validate(event, event_idx, 889 event_data_bytes, 890 event_entry_count, 891 offset, end); 892 if (ev_len < 0) 893 break; 894 895 name = event_name(event, &nl); 896 897 if (event->event_group_record_len == 0) { 898 pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n", 899 event_idx, nl, name); 900 junk_events++; 901 continue; 902 } 903 904 if (!catalog_entry_domain_is_valid(event->domain)) { 905 pr_info("event %zu (%.*s) has invalid domain %d\n", 906 event_idx, nl, name, event->domain); 907 junk_events++; 908 continue; 909 } 910 911 attr_max++; 912 } 913 914 event_idx_last = event_idx; 915 if (event_idx_last != event_entry_count) 916 pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n", 917 event_idx_last, event_entry_count, junk_events); 918 919 events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL); 920 if (!events) { 921 ret = -ENOMEM; 922 goto e_event_data; 923 } 924 925 event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs), 926 GFP_KERNEL); 927 if (!event_descs) { 928 ret = -ENOMEM; 929 goto e_event_attrs; 930 } 931 932 event_long_descs = kmalloc_array(event_idx + 1, 933 sizeof(*event_long_descs), GFP_KERNEL); 934 if (!event_long_descs) { 935 ret = -ENOMEM; 936 goto e_event_descs; 937 } 938 939 /* Iterate over the catalog filling in the attribute vector */ 940 for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0, 941 event = event_data, event_idx = 0; 942 event_idx < event_idx_last; 943 event_idx++, ev_len = be16_to_cpu(event->length), 944 event = (void *)event + ev_len) { 945 char *name; 946 int nl; 947 int nonce; 948 /* 949 * these are the only "bad" events that are intermixed and that 950 * we can ignore without issue. make sure to skip them here 951 */ 952 if (event->event_group_record_len == 0) 953 continue; 954 if (!catalog_entry_domain_is_valid(event->domain)) 955 continue; 956 957 name = event_name(event, &nl); 958 nonce = event_uniq_add(&ev_uniq, name, nl, event->domain); 959 ct = event_data_to_attrs(event_idx, events + event_attr_ct, 960 event, nonce); 961 if (ct < 0) { 962 pr_warn("event %zu (%.*s) creation failure, skipping\n", 963 event_idx, nl, name); 964 junk_events++; 965 } else { 966 event_attr_ct++; 967 event_descs[desc_ct] = event_to_desc_attr(event, nonce); 968 if (event_descs[desc_ct]) 969 desc_ct++; 970 event_long_descs[long_desc_ct] = 971 event_to_long_desc_attr(event, nonce); 972 if (event_long_descs[long_desc_ct]) 973 long_desc_ct++; 974 } 975 } 976 977 pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n", 978 event_idx, event_attr_ct, junk_events, desc_ct); 979 980 events[event_attr_ct] = NULL; 981 event_descs[desc_ct] = NULL; 982 event_long_descs[long_desc_ct] = NULL; 983 984 event_uniq_destroy(&ev_uniq); 985 vfree(event_data); 986 kmem_cache_free(hv_page_cache, page); 987 988 *events_ = events; 989 *event_descs_ = event_descs; 990 *event_long_descs_ = event_long_descs; 991 return 0; 992 993 e_event_descs: 994 kfree(event_descs); 995 e_event_attrs: 996 kfree(events); 997 e_event_data: 998 vfree(event_data); 999 e_free: 1000 kmem_cache_free(hv_page_cache, page); 1001 e_out: 1002 *events_ = NULL; 1003 *event_descs_ = NULL; 1004 *event_long_descs_ = NULL; 1005 return ret; 1006 } 1007 1008 static ssize_t catalog_read(struct file *filp, struct kobject *kobj, 1009 struct bin_attribute *bin_attr, char *buf, 1010 loff_t offset, size_t count) 1011 { 1012 long hret; 1013 ssize_t ret = 0; 1014 size_t catalog_len = 0, catalog_page_len = 0; 1015 loff_t page_offset = 0; 1016 loff_t offset_in_page; 1017 size_t copy_len; 1018 uint64_t catalog_version_num = 0; 1019 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); 1020 struct hv_24x7_catalog_page_0 *page_0 = page; 1021 1022 if (!page) 1023 return -ENOMEM; 1024 1025 hret = h_get_24x7_catalog_page(page, 0, 0); 1026 if (hret) { 1027 ret = -EIO; 1028 goto e_free; 1029 } 1030 1031 catalog_version_num = be64_to_cpu(page_0->version); 1032 catalog_page_len = be32_to_cpu(page_0->length); 1033 catalog_len = catalog_page_len * 4096; 1034 1035 page_offset = offset / 4096; 1036 offset_in_page = offset % 4096; 1037 1038 if (page_offset >= catalog_page_len) 1039 goto e_free; 1040 1041 if (page_offset != 0) { 1042 hret = h_get_24x7_catalog_page(page, catalog_version_num, 1043 page_offset); 1044 if (hret) { 1045 ret = -EIO; 1046 goto e_free; 1047 } 1048 } 1049 1050 copy_len = 4096 - offset_in_page; 1051 if (copy_len > count) 1052 copy_len = count; 1053 1054 memcpy(buf, page+offset_in_page, copy_len); 1055 ret = copy_len; 1056 1057 e_free: 1058 if (hret) 1059 pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:" 1060 " rc=%ld\n", 1061 catalog_version_num, page_offset, hret); 1062 kmem_cache_free(hv_page_cache, page); 1063 1064 pr_devel("catalog_read: offset=%lld(%lld) count=%zu " 1065 "catalog_len=%zu(%zu) => %zd\n", offset, page_offset, 1066 count, catalog_len, catalog_page_len, ret); 1067 1068 return ret; 1069 } 1070 1071 static ssize_t domains_show(struct device *dev, struct device_attribute *attr, 1072 char *page) 1073 { 1074 int d, n, count = 0; 1075 const char *str; 1076 1077 for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) { 1078 str = domain_name(d); 1079 if (!str) 1080 continue; 1081 1082 n = sprintf(page, "%d: %s\n", d, str); 1083 if (n < 0) 1084 break; 1085 1086 count += n; 1087 page += n; 1088 } 1089 return count; 1090 } 1091 1092 #define PAGE_0_ATTR(_name, _fmt, _expr) \ 1093 static ssize_t _name##_show(struct device *dev, \ 1094 struct device_attribute *dev_attr, \ 1095 char *buf) \ 1096 { \ 1097 long hret; \ 1098 ssize_t ret = 0; \ 1099 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \ 1100 struct hv_24x7_catalog_page_0 *page_0 = page; \ 1101 if (!page) \ 1102 return -ENOMEM; \ 1103 hret = h_get_24x7_catalog_page(page, 0, 0); \ 1104 if (hret) { \ 1105 ret = -EIO; \ 1106 goto e_free; \ 1107 } \ 1108 ret = sprintf(buf, _fmt, _expr); \ 1109 e_free: \ 1110 kmem_cache_free(hv_page_cache, page); \ 1111 return ret; \ 1112 } \ 1113 static DEVICE_ATTR_RO(_name) 1114 1115 PAGE_0_ATTR(catalog_version, "%lld\n", 1116 (unsigned long long)be64_to_cpu(page_0->version)); 1117 PAGE_0_ATTR(catalog_len, "%lld\n", 1118 (unsigned long long)be32_to_cpu(page_0->length) * 4096); 1119 static BIN_ATTR_RO(catalog, 0/* real length varies */); 1120 static DEVICE_ATTR_RO(domains); 1121 static DEVICE_ATTR_RO(sockets); 1122 static DEVICE_ATTR_RO(chipspersocket); 1123 static DEVICE_ATTR_RO(coresperchip); 1124 static DEVICE_ATTR_RO(cpumask); 1125 1126 static struct bin_attribute *if_bin_attrs[] = { 1127 &bin_attr_catalog, 1128 NULL, 1129 }; 1130 1131 static struct attribute *if_attrs[] = { 1132 &dev_attr_catalog_len.attr, 1133 &dev_attr_catalog_version.attr, 1134 &dev_attr_domains.attr, 1135 &dev_attr_sockets.attr, 1136 &dev_attr_chipspersocket.attr, 1137 &dev_attr_coresperchip.attr, 1138 &dev_attr_cpumask.attr, 1139 NULL, 1140 }; 1141 1142 static struct attribute_group if_group = { 1143 .name = "interface", 1144 .bin_attrs = if_bin_attrs, 1145 .attrs = if_attrs, 1146 }; 1147 1148 static const struct attribute_group *attr_groups[] = { 1149 &format_group, 1150 &event_group, 1151 &event_desc_group, 1152 &event_long_desc_group, 1153 &if_group, 1154 NULL, 1155 }; 1156 1157 /* 1158 * Start the process for a new H_GET_24x7_DATA hcall. 1159 */ 1160 static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1161 struct hv_24x7_data_result_buffer *result_buffer) 1162 { 1163 1164 memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1165 memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1166 1167 request_buffer->interface_version = interface_version; 1168 /* memset above set request_buffer->num_requests to 0 */ 1169 } 1170 1171 /* 1172 * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected 1173 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'. 1174 */ 1175 static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1176 struct hv_24x7_data_result_buffer *result_buffer) 1177 { 1178 long ret; 1179 1180 /* 1181 * NOTE: Due to variable number of array elements in request and 1182 * result buffer(s), sizeof() is not reliable. Use the actual 1183 * allocated buffer size, H24x7_DATA_BUFFER_SIZE. 1184 */ 1185 ret = plpar_hcall_norets(H_GET_24X7_DATA, 1186 virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE, 1187 virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE); 1188 1189 if (ret) { 1190 struct hv_24x7_request *req; 1191 1192 req = request_buffer->requests; 1193 pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", 1194 req->performance_domain, req->data_offset, 1195 req->starting_ix, req->starting_lpar_ix, 1196 ret, ret, result_buffer->detailed_rc, 1197 result_buffer->failing_request_ix); 1198 return -EIO; 1199 } 1200 1201 return 0; 1202 } 1203 1204 /* 1205 * Add the given @event to the next slot in the 24x7 request_buffer. 1206 * 1207 * Note that H_GET_24X7_DATA hcall allows reading several counters' 1208 * values in a single HCALL. We expect the caller to add events to the 1209 * request buffer one by one, make the HCALL and process the results. 1210 */ 1211 static int add_event_to_24x7_request(struct perf_event *event, 1212 struct hv_24x7_request_buffer *request_buffer) 1213 { 1214 u16 idx; 1215 int i; 1216 size_t req_size; 1217 struct hv_24x7_request *req; 1218 1219 if (request_buffer->num_requests >= 1220 max_num_requests(request_buffer->interface_version)) { 1221 pr_devel("Too many requests for 24x7 HCALL %d\n", 1222 request_buffer->num_requests); 1223 return -EINVAL; 1224 } 1225 1226 switch (event_get_domain(event)) { 1227 case HV_PERF_DOMAIN_PHYS_CHIP: 1228 idx = event_get_chip(event); 1229 break; 1230 case HV_PERF_DOMAIN_PHYS_CORE: 1231 idx = event_get_core(event); 1232 break; 1233 default: 1234 idx = event_get_vcpu(event); 1235 } 1236 1237 req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version); 1238 1239 i = request_buffer->num_requests++; 1240 req = (void *) request_buffer->requests + i * req_size; 1241 1242 req->performance_domain = event_get_domain(event); 1243 req->data_size = cpu_to_be16(8); 1244 req->data_offset = cpu_to_be32(event_get_offset(event)); 1245 req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event)); 1246 req->max_num_lpars = cpu_to_be16(1); 1247 req->starting_ix = cpu_to_be16(idx); 1248 req->max_ix = cpu_to_be16(1); 1249 1250 if (request_buffer->interface_version > 1) { 1251 if (domain_needs_aggregation(req->performance_domain)) 1252 req->max_num_thread_groups = -1; 1253 else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { 1254 req->starting_thread_group_ix = idx % 2; 1255 req->max_num_thread_groups = 1; 1256 } 1257 } 1258 1259 return 0; 1260 } 1261 1262 /** 1263 * get_count_from_result - get event count from all result elements in result 1264 * 1265 * If the event corresponding to this result needs aggregation of the result 1266 * element values, then this function does that. 1267 * 1268 * @event: Event associated with @res. 1269 * @resb: Result buffer containing @res. 1270 * @res: Result to work on. 1271 * @countp: Output variable containing the event count. 1272 * @next: Optional output variable pointing to the next result in @resb. 1273 */ 1274 static int get_count_from_result(struct perf_event *event, 1275 struct hv_24x7_data_result_buffer *resb, 1276 struct hv_24x7_result *res, u64 *countp, 1277 struct hv_24x7_result **next) 1278 { 1279 u16 num_elements = be16_to_cpu(res->num_elements_returned); 1280 u16 data_size = be16_to_cpu(res->result_element_data_size); 1281 unsigned int data_offset; 1282 void *element_data; 1283 int i; 1284 u64 count; 1285 1286 /* 1287 * We can bail out early if the result is empty. 1288 */ 1289 if (!num_elements) { 1290 pr_debug("Result of request %hhu is empty, nothing to do\n", 1291 res->result_ix); 1292 1293 if (next) 1294 *next = (struct hv_24x7_result *) res->elements; 1295 1296 return -ENODATA; 1297 } 1298 1299 /* 1300 * Since we always specify 1 as the maximum for the smallest resource 1301 * we're requesting, there should to be only one element per result. 1302 * Except when an event needs aggregation, in which case there are more. 1303 */ 1304 if (num_elements != 1 && 1305 !domain_needs_aggregation(event_get_domain(event))) { 1306 pr_err("Error: result of request %hhu has %hu elements\n", 1307 res->result_ix, num_elements); 1308 1309 return -EIO; 1310 } 1311 1312 if (data_size != sizeof(u64)) { 1313 pr_debug("Error: result of request %hhu has data of %hu bytes\n", 1314 res->result_ix, data_size); 1315 1316 return -ENOTSUPP; 1317 } 1318 1319 if (resb->interface_version == 1) 1320 data_offset = offsetof(struct hv_24x7_result_element_v1, 1321 element_data); 1322 else 1323 data_offset = offsetof(struct hv_24x7_result_element_v2, 1324 element_data); 1325 1326 /* Go through the result elements in the result. */ 1327 for (i = count = 0, element_data = res->elements + data_offset; 1328 i < num_elements; 1329 i++, element_data += data_size + data_offset) 1330 count += be64_to_cpu(*((u64 *) element_data)); 1331 1332 *countp = count; 1333 1334 /* The next result is after the last result element. */ 1335 if (next) 1336 *next = element_data - data_offset; 1337 1338 return 0; 1339 } 1340 1341 static int single_24x7_request(struct perf_event *event, u64 *count) 1342 { 1343 int ret; 1344 struct hv_24x7_request_buffer *request_buffer; 1345 struct hv_24x7_data_result_buffer *result_buffer; 1346 1347 BUILD_BUG_ON(sizeof(*request_buffer) > 4096); 1348 BUILD_BUG_ON(sizeof(*result_buffer) > 4096); 1349 1350 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1351 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1352 1353 init_24x7_request(request_buffer, result_buffer); 1354 1355 ret = add_event_to_24x7_request(event, request_buffer); 1356 if (ret) 1357 goto out; 1358 1359 ret = make_24x7_request(request_buffer, result_buffer); 1360 if (ret) 1361 goto out; 1362 1363 /* process result from hcall */ 1364 ret = get_count_from_result(event, result_buffer, 1365 result_buffer->results, count, NULL); 1366 1367 out: 1368 put_cpu_var(hv_24x7_reqb); 1369 put_cpu_var(hv_24x7_resb); 1370 return ret; 1371 } 1372 1373 1374 static int h_24x7_event_init(struct perf_event *event) 1375 { 1376 struct hv_perf_caps caps; 1377 unsigned domain; 1378 unsigned long hret; 1379 u64 ct; 1380 1381 /* Not our event */ 1382 if (event->attr.type != event->pmu->type) 1383 return -ENOENT; 1384 1385 /* Unused areas must be 0 */ 1386 if (event_get_reserved1(event) || 1387 event_get_reserved2(event) || 1388 event_get_reserved3(event)) { 1389 pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n", 1390 event->attr.config, 1391 event_get_reserved1(event), 1392 event->attr.config1, 1393 event_get_reserved2(event), 1394 event->attr.config2, 1395 event_get_reserved3(event)); 1396 return -EINVAL; 1397 } 1398 1399 /* no branch sampling */ 1400 if (has_branch_stack(event)) 1401 return -EOPNOTSUPP; 1402 1403 /* offset must be 8 byte aligned */ 1404 if (event_get_offset(event) % 8) { 1405 pr_devel("bad alignment\n"); 1406 return -EINVAL; 1407 } 1408 1409 domain = event_get_domain(event); 1410 if (domain >= HV_PERF_DOMAIN_MAX) { 1411 pr_devel("invalid domain %d\n", domain); 1412 return -EINVAL; 1413 } 1414 1415 hret = hv_perf_caps_get(&caps); 1416 if (hret) { 1417 pr_devel("could not get capabilities: rc=%ld\n", hret); 1418 return -EIO; 1419 } 1420 1421 /* Physical domains & other lpars require extra capabilities */ 1422 if (!caps.collect_privileged && (is_physical_domain(domain) || 1423 (event_get_lpar(event) != event_get_lpar_max()))) { 1424 pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n", 1425 is_physical_domain(domain), 1426 event_get_lpar(event)); 1427 return -EACCES; 1428 } 1429 1430 /* Get the initial value of the counter for this event */ 1431 if (single_24x7_request(event, &ct)) { 1432 pr_devel("test hcall failed\n"); 1433 return -EIO; 1434 } 1435 (void)local64_xchg(&event->hw.prev_count, ct); 1436 1437 return 0; 1438 } 1439 1440 static u64 h_24x7_get_value(struct perf_event *event) 1441 { 1442 u64 ct; 1443 1444 if (single_24x7_request(event, &ct)) 1445 /* We checked this in event init, shouldn't fail here... */ 1446 return 0; 1447 1448 return ct; 1449 } 1450 1451 static void update_event_count(struct perf_event *event, u64 now) 1452 { 1453 s64 prev; 1454 1455 prev = local64_xchg(&event->hw.prev_count, now); 1456 local64_add(now - prev, &event->count); 1457 } 1458 1459 static void h_24x7_event_read(struct perf_event *event) 1460 { 1461 u64 now; 1462 struct hv_24x7_request_buffer *request_buffer; 1463 struct hv_24x7_hw *h24x7hw; 1464 int txn_flags; 1465 1466 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1467 1468 /* 1469 * If in a READ transaction, add this counter to the list of 1470 * counters to read during the next HCALL (i.e commit_txn()). 1471 * If not in a READ transaction, go ahead and make the HCALL 1472 * to read this counter by itself. 1473 */ 1474 1475 if (txn_flags & PERF_PMU_TXN_READ) { 1476 int i; 1477 int ret; 1478 1479 if (__this_cpu_read(hv_24x7_txn_err)) 1480 return; 1481 1482 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1483 1484 ret = add_event_to_24x7_request(event, request_buffer); 1485 if (ret) { 1486 __this_cpu_write(hv_24x7_txn_err, ret); 1487 } else { 1488 /* 1489 * Associate the event with the HCALL request index, 1490 * so ->commit_txn() can quickly find/update count. 1491 */ 1492 i = request_buffer->num_requests - 1; 1493 1494 h24x7hw = &get_cpu_var(hv_24x7_hw); 1495 h24x7hw->events[i] = event; 1496 put_cpu_var(h24x7hw); 1497 } 1498 1499 put_cpu_var(hv_24x7_reqb); 1500 } else { 1501 now = h_24x7_get_value(event); 1502 update_event_count(event, now); 1503 } 1504 } 1505 1506 static void h_24x7_event_start(struct perf_event *event, int flags) 1507 { 1508 if (flags & PERF_EF_RELOAD) 1509 local64_set(&event->hw.prev_count, h_24x7_get_value(event)); 1510 } 1511 1512 static void h_24x7_event_stop(struct perf_event *event, int flags) 1513 { 1514 h_24x7_event_read(event); 1515 } 1516 1517 static int h_24x7_event_add(struct perf_event *event, int flags) 1518 { 1519 if (flags & PERF_EF_START) 1520 h_24x7_event_start(event, flags); 1521 1522 return 0; 1523 } 1524 1525 /* 1526 * 24x7 counters only support READ transactions. They are 1527 * always counting and dont need/support ADD transactions. 1528 * Cache the flags, but otherwise ignore transactions that 1529 * are not PERF_PMU_TXN_READ. 1530 */ 1531 static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags) 1532 { 1533 struct hv_24x7_request_buffer *request_buffer; 1534 struct hv_24x7_data_result_buffer *result_buffer; 1535 1536 /* We should not be called if we are already in a txn */ 1537 WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags)); 1538 1539 __this_cpu_write(hv_24x7_txn_flags, flags); 1540 if (flags & ~PERF_PMU_TXN_READ) 1541 return; 1542 1543 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1544 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1545 1546 init_24x7_request(request_buffer, result_buffer); 1547 1548 put_cpu_var(hv_24x7_resb); 1549 put_cpu_var(hv_24x7_reqb); 1550 } 1551 1552 /* 1553 * Clean up transaction state. 1554 * 1555 * NOTE: Ignore state of request and result buffers for now. 1556 * We will initialize them during the next read/txn. 1557 */ 1558 static void reset_txn(void) 1559 { 1560 __this_cpu_write(hv_24x7_txn_flags, 0); 1561 __this_cpu_write(hv_24x7_txn_err, 0); 1562 } 1563 1564 /* 1565 * 24x7 counters only support READ transactions. They are always counting 1566 * and dont need/support ADD transactions. Clear ->txn_flags but otherwise 1567 * ignore transactions that are not of type PERF_PMU_TXN_READ. 1568 * 1569 * For READ transactions, submit all pending 24x7 requests (i.e requests 1570 * that were queued by h_24x7_event_read()), to the hypervisor and update 1571 * the event counts. 1572 */ 1573 static int h_24x7_event_commit_txn(struct pmu *pmu) 1574 { 1575 struct hv_24x7_request_buffer *request_buffer; 1576 struct hv_24x7_data_result_buffer *result_buffer; 1577 struct hv_24x7_result *res, *next_res; 1578 u64 count; 1579 int i, ret, txn_flags; 1580 struct hv_24x7_hw *h24x7hw; 1581 1582 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1583 WARN_ON_ONCE(!txn_flags); 1584 1585 ret = 0; 1586 if (txn_flags & ~PERF_PMU_TXN_READ) 1587 goto out; 1588 1589 ret = __this_cpu_read(hv_24x7_txn_err); 1590 if (ret) 1591 goto out; 1592 1593 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1594 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1595 1596 ret = make_24x7_request(request_buffer, result_buffer); 1597 if (ret) 1598 goto put_reqb; 1599 1600 h24x7hw = &get_cpu_var(hv_24x7_hw); 1601 1602 /* Go through results in the result buffer to update event counts. */ 1603 for (i = 0, res = result_buffer->results; 1604 i < result_buffer->num_results; i++, res = next_res) { 1605 struct perf_event *event = h24x7hw->events[res->result_ix]; 1606 1607 ret = get_count_from_result(event, result_buffer, res, &count, 1608 &next_res); 1609 if (ret) 1610 break; 1611 1612 update_event_count(event, count); 1613 } 1614 1615 put_cpu_var(hv_24x7_hw); 1616 1617 put_reqb: 1618 put_cpu_var(hv_24x7_resb); 1619 put_cpu_var(hv_24x7_reqb); 1620 out: 1621 reset_txn(); 1622 return ret; 1623 } 1624 1625 /* 1626 * 24x7 counters only support READ transactions. They are always counting 1627 * and dont need/support ADD transactions. However, regardless of type 1628 * of transaction, all we need to do is cleanup, so we don't have to check 1629 * the type of transaction. 1630 */ 1631 static void h_24x7_event_cancel_txn(struct pmu *pmu) 1632 { 1633 WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags)); 1634 reset_txn(); 1635 } 1636 1637 static struct pmu h_24x7_pmu = { 1638 .task_ctx_nr = perf_invalid_context, 1639 1640 .name = "hv_24x7", 1641 .attr_groups = attr_groups, 1642 .event_init = h_24x7_event_init, 1643 .add = h_24x7_event_add, 1644 .del = h_24x7_event_stop, 1645 .start = h_24x7_event_start, 1646 .stop = h_24x7_event_stop, 1647 .read = h_24x7_event_read, 1648 .start_txn = h_24x7_event_start_txn, 1649 .commit_txn = h_24x7_event_commit_txn, 1650 .cancel_txn = h_24x7_event_cancel_txn, 1651 .capabilities = PERF_PMU_CAP_NO_EXCLUDE, 1652 }; 1653 1654 static int ppc_hv_24x7_cpu_online(unsigned int cpu) 1655 { 1656 if (cpumask_empty(&hv_24x7_cpumask)) 1657 cpumask_set_cpu(cpu, &hv_24x7_cpumask); 1658 1659 return 0; 1660 } 1661 1662 static int ppc_hv_24x7_cpu_offline(unsigned int cpu) 1663 { 1664 int target; 1665 1666 /* Check if exiting cpu is used for collecting 24x7 events */ 1667 if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask)) 1668 return 0; 1669 1670 /* Find a new cpu to collect 24x7 events */ 1671 target = cpumask_last(cpu_active_mask); 1672 1673 if (target < 0 || target >= nr_cpu_ids) { 1674 pr_err("hv_24x7: CPU hotplug init failed\n"); 1675 return -1; 1676 } 1677 1678 /* Migrate 24x7 events to the new target */ 1679 cpumask_set_cpu(target, &hv_24x7_cpumask); 1680 perf_pmu_migrate_context(&h_24x7_pmu, cpu, target); 1681 1682 return 0; 1683 } 1684 1685 static int hv_24x7_cpu_hotplug_init(void) 1686 { 1687 return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, 1688 "perf/powerpc/hv_24x7:online", 1689 ppc_hv_24x7_cpu_online, 1690 ppc_hv_24x7_cpu_offline); 1691 } 1692 1693 static int hv_24x7_init(void) 1694 { 1695 int r; 1696 unsigned long hret; 1697 struct hv_perf_caps caps; 1698 1699 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 1700 pr_debug("not a virtualized system, not enabling\n"); 1701 return -ENODEV; 1702 } else if (!cur_cpu_spec->oprofile_cpu_type) 1703 return -ENODEV; 1704 1705 /* POWER8 only supports v1, while POWER9 only supports v2. */ 1706 if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8")) 1707 interface_version = 1; 1708 else { 1709 interface_version = 2; 1710 1711 /* SMT8 in POWER9 needs to aggregate result elements. */ 1712 if (threads_per_core == 8) 1713 aggregate_result_elements = true; 1714 } 1715 1716 hret = hv_perf_caps_get(&caps); 1717 if (hret) { 1718 pr_debug("could not obtain capabilities, not enabling, rc=%ld\n", 1719 hret); 1720 return -ENODEV; 1721 } 1722 1723 hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL); 1724 if (!hv_page_cache) 1725 return -ENOMEM; 1726 1727 /* sampling not supported */ 1728 h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; 1729 1730 r = create_events_from_catalog(&event_group.attrs, 1731 &event_desc_group.attrs, 1732 &event_long_desc_group.attrs); 1733 1734 if (r) 1735 return r; 1736 1737 /* init cpuhotplug */ 1738 r = hv_24x7_cpu_hotplug_init(); 1739 if (r) 1740 return r; 1741 1742 r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1); 1743 if (r) 1744 return r; 1745 1746 read_24x7_sys_info(); 1747 1748 return 0; 1749 } 1750 1751 device_initcall(hv_24x7_init); 1752