// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Hypervisor supplied "24x7" performance counter support
 *
 * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
 * Copyright 2014 IBM Corporation.
 */

#define pr_fmt(fmt) "hv-24x7: " fmt

#include <linux/perf_event.h>
#include <linux/rbtree.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/vmalloc.h>

#include <asm/cputhreads.h>
#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/io.h>
#include <asm/papr-sysparm.h>
#include <linux/byteorder/generic.h>

#include <asm/rtas.h>
#include "hv-24x7.h"
#include "hv-24x7-catalog.h"
#include "hv-common.h"

/* Version of the 24x7 hypervisor API that we should use in this machine. */
static int interface_version;

/* Whether we have to aggregate result data for some domains. */
static bool aggregate_result_elements;

static cpumask_t hv_24x7_cpumask;

/*
 * domain_is_valid() - check whether @domain is one of the known domains.
 *
 * The case labels are generated by including hv-24x7-domains.h with
 * DOMAIN() defined to emit "case HV_PERF_DOMAIN_<n>:". All generated labels
 * fall through to the single "return true" that follows the include; any
 * other value reaches the default label.
 *
 * NOTE(review): this presumably relies on the header consisting only of
 * DOMAIN(...) invocations; confirm against hv-24x7-domains.h.
 *
 * Return: true if @domain matches a generated case label, false otherwise.
 */
static bool domain_is_valid(unsigned int domain)
{
	switch (domain) {
#define DOMAIN(n, v, x, c)		\
	case HV_PERF_DOMAIN_##n:	\
		/* fall through */
#include "hv-24x7-domains.h"
#undef DOMAIN
		return true;
	default:
		return false;
	}
}

/*
 * is_physical_domain() - look up the fourth DOMAIN() argument for @domain.
 *
 * Uses the same include trick as domain_is_valid(), but here every generated
 * case returns the "c" argument of its DOMAIN() entry.
 *
 * NOTE(review): going by the function name, "c" presumably flags physical
 * (as opposed to virtual) domains; confirm against hv-24x7-domains.h.
 *
 * Return: the "c" value of the matching entry, false for unknown domains.
 */
static bool is_physical_domain(unsigned int domain)
{
	switch (domain) {
#define DOMAIN(n, v, x, c)		\
	case HV_PERF_DOMAIN_##n:	\
		return c;
#include "hv-24x7-domains.h"
#undef DOMAIN
	default:
		return false;
	}
}

/*
 * The Processor Module Information system parameter allows transferring
 * of certain processor module information from the platform to the OS.
 * Refer PAPR+ document to get parameter token value as '43'.
69 */ 70 71 static u32 phys_sockets; /* Physical sockets */ 72 static u32 phys_chipspersocket; /* Physical chips per socket*/ 73 static u32 phys_coresperchip; /* Physical cores per chip */ 74 75 /* 76 * read_24x7_sys_info() 77 * Retrieve the number of sockets and chips per socket and cores per 78 * chip details through the get-system-parameter rtas call. 79 */ 80 void read_24x7_sys_info(void) 81 { 82 struct papr_sysparm_buf *buf; 83 84 /* 85 * Making system parameter: chips and sockets and cores per chip 86 * default to 1. 87 */ 88 phys_sockets = 1; 89 phys_chipspersocket = 1; 90 phys_coresperchip = 1; 91 92 buf = papr_sysparm_buf_alloc(); 93 if (!buf) 94 return; 95 96 if (!papr_sysparm_get(PAPR_SYSPARM_PROC_MODULE_INFO, buf)) { 97 int ntypes = be16_to_cpup((__be16 *)&buf->val[0]); 98 int len = be16_to_cpu(buf->len); 99 100 if (len >= 8 && ntypes != 0) { 101 phys_sockets = be16_to_cpup((__be16 *)&buf->val[2]); 102 phys_chipspersocket = be16_to_cpup((__be16 *)&buf->val[4]); 103 phys_coresperchip = be16_to_cpup((__be16 *)&buf->val[6]); 104 } 105 } 106 107 papr_sysparm_buf_free(buf); 108 } 109 110 /* Domains for which more than one result element are returned for each event. 
*/ 111 static bool domain_needs_aggregation(unsigned int domain) 112 { 113 return aggregate_result_elements && 114 (domain == HV_PERF_DOMAIN_PHYS_CORE || 115 (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE && 116 domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE)); 117 } 118 119 static const char *domain_name(unsigned int domain) 120 { 121 if (!domain_is_valid(domain)) 122 return NULL; 123 124 switch (domain) { 125 case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip"; 126 case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core"; 127 case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core"; 128 case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip"; 129 case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node"; 130 case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node"; 131 } 132 133 WARN_ON_ONCE(domain); 134 return NULL; 135 } 136 137 static bool catalog_entry_domain_is_valid(unsigned int domain) 138 { 139 /* POWER8 doesn't support virtual domains. */ 140 if (interface_version == 1) 141 return is_physical_domain(domain); 142 else 143 return domain_is_valid(domain); 144 } 145 146 /* 147 * TODO: Merging events: 148 * - Think of the hcall as an interface to a 4d array of counters: 149 * - x = domains 150 * - y = indexes in the domain (core, chip, vcpu, node, etc) 151 * - z = offset into the counter space 152 * - w = lpars (guest vms, "logical partitions") 153 * - A single request is: x,y,y_last,z,z_last,w,w_last 154 * - this means we can retrieve a rectangle of counters in y,z for a single x. 
155 * 156 * - Things to consider (ignoring w): 157 * - input cost_per_request = 16 158 * - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs 159 * - limited number of requests per hcall (must fit into 4K bytes) 160 * - 4k = 16 [buffer header] - 16 [request size] * request_count 161 * - 255 requests per hcall 162 * - sometimes it will be more efficient to read extra data and discard 163 */ 164 165 /* 166 * Example usage: 167 * perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/' 168 */ 169 170 /* u3 0-6, one of HV_24X7_PERF_DOMAIN */ 171 EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3); 172 /* u16 */ 173 EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31); 174 EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31); 175 EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31); 176 /* u32, see "data_offset" */ 177 EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63); 178 /* u16 */ 179 EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15); 180 181 EVENT_DEFINE_RANGE(reserved1, config, 4, 15); 182 EVENT_DEFINE_RANGE(reserved2, config1, 16, 63); 183 EVENT_DEFINE_RANGE(reserved3, config2, 0, 63); 184 185 static struct attribute *format_attrs[] = { 186 &format_attr_domain.attr, 187 &format_attr_offset.attr, 188 &format_attr_core.attr, 189 &format_attr_chip.attr, 190 &format_attr_vcpu.attr, 191 &format_attr_lpar.attr, 192 NULL, 193 }; 194 195 static const struct attribute_group format_group = { 196 .name = "format", 197 .attrs = format_attrs, 198 }; 199 200 static struct attribute_group event_group = { 201 .name = "events", 202 /* .attrs is set in init */ 203 }; 204 205 static struct attribute_group event_desc_group = { 206 .name = "event_descs", 207 /* .attrs is set in init */ 208 }; 209 210 static struct attribute_group event_long_desc_group = { 211 .name = "event_long_descs", 212 /* .attrs is set in init */ 213 }; 214 215 static struct kmem_cache *hv_page_cache; 216 217 static DEFINE_PER_CPU(int, hv_24x7_txn_flags); 218 static DEFINE_PER_CPU(int, hv_24x7_txn_err); 219 220 struct 
hv_24x7_hw { 221 struct perf_event *events[255]; 222 }; 223 224 static DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); 225 226 /* 227 * request_buffer and result_buffer are not required to be 4k aligned, 228 * but are not allowed to cross any 4k boundary. Aligning them to 4k is 229 * the simplest way to ensure that. 230 */ 231 #define H24x7_DATA_BUFFER_SIZE 4096 232 static DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 233 static DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 234 235 static unsigned int max_num_requests(int interface_version) 236 { 237 return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer)) 238 / H24x7_REQUEST_SIZE(interface_version); 239 } 240 241 static char *event_name(struct hv_24x7_event_data *ev, int *len) 242 { 243 *len = be16_to_cpu(ev->event_name_len) - 2; 244 return (char *)ev->remainder; 245 } 246 247 static char *event_desc(struct hv_24x7_event_data *ev, int *len) 248 { 249 unsigned int nl = be16_to_cpu(ev->event_name_len); 250 __be16 *desc_len = (__be16 *)(ev->remainder + nl - 2); 251 252 *len = be16_to_cpu(*desc_len) - 2; 253 return (char *)ev->remainder + nl; 254 } 255 256 static char *event_long_desc(struct hv_24x7_event_data *ev, int *len) 257 { 258 unsigned int nl = be16_to_cpu(ev->event_name_len); 259 __be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2); 260 unsigned int desc_len = be16_to_cpu(*desc_len_); 261 __be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2); 262 263 *len = be16_to_cpu(*long_desc_len) - 2; 264 return (char *)ev->remainder + nl + desc_len; 265 } 266 267 static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev, 268 void *end) 269 { 270 void *start = ev; 271 272 return (start + offsetof(struct hv_24x7_event_data, remainder)) < end; 273 } 274 275 /* 276 * Things we don't check: 277 * - padding for desc, name, and long/detailed desc is required to be '\0' 278 * bytes. 
279 * 280 * Return NULL if we pass end, 281 * Otherwise return the address of the byte just following the event. 282 */ 283 static void *event_end(struct hv_24x7_event_data *ev, void *end) 284 { 285 void *start = ev; 286 __be16 *dl_, *ldl_; 287 unsigned int dl, ldl; 288 unsigned int nl = be16_to_cpu(ev->event_name_len); 289 290 if (nl < 2) { 291 pr_debug("%s: name length too short: %d", __func__, nl); 292 return NULL; 293 } 294 295 if (start + nl > end) { 296 pr_debug("%s: start=%p + nl=%u > end=%p", 297 __func__, start, nl, end); 298 return NULL; 299 } 300 301 dl_ = (__be16 *)(ev->remainder + nl - 2); 302 if (!IS_ALIGNED((uintptr_t)dl_, 2)) 303 pr_warn("desc len not aligned %p", dl_); 304 dl = be16_to_cpu(*dl_); 305 if (dl < 2) { 306 pr_debug("%s: desc len too short: %d", __func__, dl); 307 return NULL; 308 } 309 310 if (start + nl + dl > end) { 311 pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p", 312 __func__, start, nl, dl, start + nl + dl, end); 313 return NULL; 314 } 315 316 ldl_ = (__be16 *)(ev->remainder + nl + dl - 2); 317 if (!IS_ALIGNED((uintptr_t)ldl_, 2)) 318 pr_warn("long desc len not aligned %p", ldl_); 319 ldl = be16_to_cpu(*ldl_); 320 if (ldl < 2) { 321 pr_debug("%s: long desc len too short (ldl=%u)", 322 __func__, ldl); 323 return NULL; 324 } 325 326 if (start + nl + dl + ldl > end) { 327 pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p", 328 __func__, start, nl, dl, ldl, end); 329 return NULL; 330 } 331 332 return start + nl + dl + ldl; 333 } 334 335 static long h_get_24x7_catalog_page_(unsigned long phys_4096, 336 unsigned long version, unsigned long index) 337 { 338 pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)", 339 phys_4096, version, index); 340 341 WARN_ON(!IS_ALIGNED(phys_4096, 4096)); 342 343 return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE, 344 phys_4096, version, index); 345 } 346 347 static long h_get_24x7_catalog_page(char page[], u64 version, u32 index) 348 { 349 return h_get_24x7_catalog_page_(virt_to_phys(page), 
350 version, index); 351 } 352 353 /* 354 * Each event we find in the catalog, will have a sysfs entry. Format the 355 * data for this sysfs entry based on the event's domain. 356 * 357 * Events belonging to the Chip domain can only be monitored in that domain. 358 * i.e the domain for these events is a fixed/knwon value. 359 * 360 * Events belonging to the Core domain can be monitored either in the physical 361 * core or in one of the virtual CPU domains. So the domain value for these 362 * events must be specified by the user (i.e is a required parameter). Format 363 * the Core events with 'domain=?' so the perf-tool can error check required 364 * parameters. 365 * 366 * NOTE: For the Core domain events, rather than making domain a required 367 * parameter we could default it to PHYS_CORE and allowe users to 368 * override the domain to one of the VCPU domains. 369 * 370 * However, this can make the interface a little inconsistent. 371 * 372 * If we set domain=2 (PHYS_CHIP) and allow user to override this field 373 * the user may be tempted to also modify the "offset=x" field in which 374 * can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and 375 * HPM_INST (offset=0x20) events. With: 376 * 377 * perf stat -e hv_24x7/HPM_PCYC,offset=0x20/ 378 * 379 * we end up monitoring HPM_INST, while the command line has HPM_PCYC. 380 * 381 * By not assigning a default value to the domain for the Core events, 382 * we can have simple guidelines: 383 * 384 * - Specifying values for parameters with "=?" is required. 385 * 386 * - Specifying (i.e overriding) values for other parameters 387 * is undefined. 
388 */ 389 static char *event_fmt(struct hv_24x7_event_data *event, unsigned int domain) 390 { 391 const char *sindex; 392 const char *lpar; 393 const char *domain_str; 394 char buf[8]; 395 396 switch (domain) { 397 case HV_PERF_DOMAIN_PHYS_CHIP: 398 snprintf(buf, sizeof(buf), "%d", domain); 399 domain_str = buf; 400 lpar = "0x0"; 401 sindex = "chip"; 402 break; 403 case HV_PERF_DOMAIN_PHYS_CORE: 404 domain_str = "?"; 405 lpar = "0x0"; 406 sindex = "core"; 407 break; 408 default: 409 domain_str = "?"; 410 lpar = "?"; 411 sindex = "vcpu"; 412 } 413 414 return kasprintf(GFP_KERNEL, 415 "domain=%s,offset=0x%x,%s=?,lpar=%s", 416 domain_str, 417 be16_to_cpu(event->event_counter_offs) + 418 be16_to_cpu(event->event_group_record_offs), 419 sindex, 420 lpar); 421 } 422 423 /* Avoid trusting fw to NUL terminate strings */ 424 static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp) 425 { 426 return kasprintf(gfp, "%.*s", max_len, maybe_str); 427 } 428 429 static ssize_t cpumask_show(struct device *dev, 430 struct device_attribute *attr, char *buf) 431 { 432 return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask); 433 } 434 435 static ssize_t sockets_show(struct device *dev, 436 struct device_attribute *attr, char *buf) 437 { 438 return sysfs_emit(buf, "%d\n", phys_sockets); 439 } 440 441 static ssize_t chipspersocket_show(struct device *dev, 442 struct device_attribute *attr, char *buf) 443 { 444 return sysfs_emit(buf, "%d\n", phys_chipspersocket); 445 } 446 447 static ssize_t coresperchip_show(struct device *dev, 448 struct device_attribute *attr, char *buf) 449 { 450 return sysfs_emit(buf, "%d\n", phys_coresperchip); 451 } 452 453 static struct attribute *device_str_attr_create_(char *name, char *str) 454 { 455 struct dev_ext_attribute *attr = kzalloc_obj(*attr); 456 457 if (!attr) 458 return NULL; 459 460 sysfs_attr_init(&attr->attr.attr); 461 462 attr->var = str; 463 attr->attr.attr.name = name; 464 attr->attr.attr.mode = 0444; 465 attr->attr.show = 
device_show_string; 466 467 return &attr->attr.attr; 468 } 469 470 /* 471 * Allocate and initialize strings representing event attributes. 472 * 473 * NOTE: The strings allocated here are never destroyed and continue to 474 * exist till shutdown. This is to allow us to create as many events 475 * from the catalog as possible, even if we encounter errors with some. 476 * In case of changes to error paths in future, these may need to be 477 * freed by the caller. 478 */ 479 static struct attribute *device_str_attr_create(char *name, int name_max, 480 int name_nonce, 481 char *str, size_t str_max) 482 { 483 char *n; 484 char *s = memdup_to_str(str, str_max, GFP_KERNEL); 485 struct attribute *a; 486 487 if (!s) 488 return NULL; 489 490 if (!name_nonce) 491 n = kasprintf(GFP_KERNEL, "%.*s", name_max, name); 492 else 493 n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name, 494 name_nonce); 495 if (!n) 496 goto out_s; 497 498 a = device_str_attr_create_(n, s); 499 if (!a) 500 goto out_n; 501 502 return a; 503 out_n: 504 kfree(n); 505 out_s: 506 kfree(s); 507 return NULL; 508 } 509 510 static struct attribute *event_to_attr(unsigned int ix, 511 struct hv_24x7_event_data *event, 512 unsigned int domain, 513 int nonce) 514 { 515 int event_name_len; 516 char *ev_name, *a_ev_name, *val; 517 struct attribute *attr; 518 519 if (!domain_is_valid(domain)) { 520 pr_warn("catalog event %u has invalid domain %u\n", 521 ix, domain); 522 return NULL; 523 } 524 525 val = event_fmt(event, domain); 526 if (!val) 527 return NULL; 528 529 ev_name = event_name(event, &event_name_len); 530 if (!nonce) 531 a_ev_name = kasprintf(GFP_KERNEL, "%.*s", 532 (int)event_name_len, ev_name); 533 else 534 a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d", 535 (int)event_name_len, ev_name, nonce); 536 537 if (!a_ev_name) 538 goto out_val; 539 540 attr = device_str_attr_create_(a_ev_name, val); 541 if (!attr) 542 goto out_name; 543 544 return attr; 545 out_name: 546 kfree(a_ev_name); 547 out_val: 548 
kfree(val); 549 return NULL; 550 } 551 552 static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event, 553 int nonce) 554 { 555 int nl, dl; 556 char *name = event_name(event, &nl); 557 char *desc = event_desc(event, &dl); 558 559 /* If there isn't a description, don't create the sysfs file */ 560 if (!dl) 561 return NULL; 562 563 return device_str_attr_create(name, nl, nonce, desc, dl); 564 } 565 566 static struct attribute * 567 event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce) 568 { 569 int nl, dl; 570 char *name = event_name(event, &nl); 571 char *desc = event_long_desc(event, &dl); 572 573 /* If there isn't a description, don't create the sysfs file */ 574 if (!dl) 575 return NULL; 576 577 return device_str_attr_create(name, nl, nonce, desc, dl); 578 } 579 580 static int event_data_to_attrs(unsigned int ix, struct attribute **attrs, 581 struct hv_24x7_event_data *event, int nonce) 582 { 583 *attrs = event_to_attr(ix, event, event->domain, nonce); 584 if (!*attrs) 585 return -1; 586 587 return 0; 588 } 589 590 /* */ 591 struct event_uniq { 592 struct rb_node node; 593 const char *name; 594 int nl; 595 unsigned int ct; 596 unsigned int domain; 597 }; 598 599 static int memord(const void *d1, size_t s1, const void *d2, size_t s2) 600 { 601 if (s1 < s2) 602 return 1; 603 if (s1 > s2) 604 return -1; 605 606 return memcmp(d1, d2, s1); 607 } 608 609 static int ev_uniq_ord(const void *v1, size_t s1, unsigned int d1, 610 const void *v2, size_t s2, unsigned int d2) 611 { 612 int r = memord(v1, s1, v2, s2); 613 614 if (r) 615 return r; 616 if (d1 > d2) 617 return 1; 618 if (d2 > d1) 619 return -1; 620 return 0; 621 } 622 623 static int event_uniq_add(struct rb_root *root, const char *name, int nl, 624 unsigned int domain) 625 { 626 struct rb_node **new = &(root->rb_node), *parent = NULL; 627 struct event_uniq *data; 628 629 /* Figure out where to put new node */ 630 while (*new) { 631 struct event_uniq *it; 632 int result; 633 634 it = 
rb_entry(*new, struct event_uniq, node); 635 result = ev_uniq_ord(name, nl, domain, it->name, it->nl, 636 it->domain); 637 638 parent = *new; 639 if (result < 0) 640 new = &((*new)->rb_left); 641 else if (result > 0) 642 new = &((*new)->rb_right); 643 else { 644 it->ct++; 645 pr_info("found a duplicate event %.*s, ct=%u\n", nl, 646 name, it->ct); 647 return it->ct; 648 } 649 } 650 651 data = kmalloc_obj(*data); 652 if (!data) 653 return -ENOMEM; 654 655 *data = (struct event_uniq) { 656 .name = name, 657 .nl = nl, 658 .ct = 0, 659 .domain = domain, 660 }; 661 662 /* Add new node and rebalance tree. */ 663 rb_link_node(&data->node, parent, new); 664 rb_insert_color(&data->node, root); 665 666 /* data->ct */ 667 return 0; 668 } 669 670 static void event_uniq_destroy(struct rb_root *root) 671 { 672 /* 673 * the strings we point to are in the giant block of memory filled by 674 * the catalog, and are freed separately. 675 */ 676 struct event_uniq *pos, *n; 677 678 rbtree_postorder_for_each_entry_safe(pos, n, root, node) 679 kfree(pos); 680 } 681 682 683 /* 684 * ensure the event structure's sizes are self consistent and don't cause us to 685 * read outside of the event 686 * 687 * On success, return the event length in bytes. 688 * Otherwise, return -1 (and print as appropriate). 
689 */ 690 static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event, 691 size_t event_idx, 692 size_t event_data_bytes, 693 size_t event_entry_count, 694 size_t offset, void *end) 695 { 696 ssize_t ev_len; 697 void *ev_end, *calc_ev_end; 698 699 if (offset >= event_data_bytes) 700 return -1; 701 702 if (event_idx >= event_entry_count) { 703 pr_devel("catalog event data has %zu bytes of padding after last event\n", 704 event_data_bytes - offset); 705 return -1; 706 } 707 708 if (!event_fixed_portion_is_within(event, end)) { 709 pr_warn("event %zu fixed portion is not within range\n", 710 event_idx); 711 return -1; 712 } 713 714 ev_len = be16_to_cpu(event->length); 715 716 if (ev_len % 16) 717 pr_info("event %zu has length %zu not divisible by 16: event=%p\n", 718 event_idx, ev_len, event); 719 720 ev_end = (__u8 *)event + ev_len; 721 if (ev_end > end) { 722 pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%p > end=%p, offset=%zu\n", 723 event_idx, ev_len, ev_end, end, 724 offset); 725 return -1; 726 } 727 728 calc_ev_end = event_end(event, end); 729 if (!calc_ev_end) { 730 pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%p end=%p, offset=%zu\n", 731 event_idx, event_data_bytes, event, end, 732 offset); 733 return -1; 734 } 735 736 if (calc_ev_end > ev_end) { 737 pr_warn("event %zu exceeds its own length: event=%p, end=%p, offset=%zu, calc_ev_end=%p\n", 738 event_idx, event, ev_end, offset, calc_ev_end); 739 return -1; 740 } 741 742 return ev_len; 743 } 744 745 /* 746 * Return true incase of invalid or dummy events with names like RESERVED* 747 */ 748 static bool ignore_event(const char *name) 749 { 750 return strncmp(name, "RESERVED", 8) == 0; 751 } 752 753 #define MAX_4K (SIZE_MAX / 4096) 754 755 static int create_events_from_catalog(struct attribute ***events_, 756 struct attribute ***event_descs_, 757 struct attribute ***event_long_descs_) 758 { 759 long hret; 760 size_t catalog_len, 
catalog_page_len, event_entry_count, 761 event_data_len, event_data_offs, 762 event_data_bytes, junk_events, event_idx, event_attr_ct, i, 763 attr_max, event_idx_last, desc_ct, long_desc_ct; 764 ssize_t ct, ev_len; 765 uint64_t catalog_version_num; 766 struct attribute **events, **event_descs, **event_long_descs; 767 struct hv_24x7_catalog_page_0 *page_0 = 768 kmem_cache_alloc(hv_page_cache, GFP_KERNEL); 769 void *page = page_0; 770 void *event_data, *end; 771 struct hv_24x7_event_data *event; 772 struct rb_root ev_uniq = RB_ROOT; 773 int ret = 0; 774 775 if (!page) { 776 ret = -ENOMEM; 777 goto e_out; 778 } 779 780 hret = h_get_24x7_catalog_page(page, 0, 0); 781 if (hret) { 782 ret = -EIO; 783 goto e_free; 784 } 785 786 catalog_version_num = be64_to_cpu(page_0->version); 787 catalog_page_len = be32_to_cpu(page_0->length); 788 789 if (MAX_4K < catalog_page_len) { 790 pr_err("invalid page count: %zu\n", catalog_page_len); 791 ret = -EIO; 792 goto e_free; 793 } 794 795 catalog_len = catalog_page_len * 4096; 796 797 event_entry_count = be16_to_cpu(page_0->event_entry_count); 798 event_data_offs = be16_to_cpu(page_0->event_data_offs); 799 event_data_len = be16_to_cpu(page_0->event_data_len); 800 801 pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n", 802 catalog_version_num, catalog_len, 803 event_entry_count, event_data_offs, event_data_len); 804 805 if ((MAX_4K < event_data_len) 806 || (MAX_4K < event_data_offs) 807 || (MAX_4K - event_data_offs < event_data_len)) { 808 pr_err("invalid event data offs %zu and/or len %zu\n", 809 event_data_offs, event_data_len); 810 ret = -EIO; 811 goto e_free; 812 } 813 814 if ((event_data_offs + event_data_len) > catalog_page_len) { 815 pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n", 816 event_data_offs, 817 event_data_offs + event_data_len, 818 catalog_page_len); 819 ret = -EIO; 820 goto e_free; 821 } 822 823 if (SIZE_MAX - 1 < event_entry_count) { 824 pr_err("event_entry_count %zu is invalid\n", 
event_entry_count); 825 ret = -EIO; 826 goto e_free; 827 } 828 829 event_data_bytes = event_data_len * 4096; 830 831 /* 832 * event data can span several pages, events can cross between these 833 * pages. Use vmalloc to make this easier. 834 */ 835 event_data = vmalloc(event_data_bytes); 836 if (!event_data) { 837 pr_err("could not allocate event data\n"); 838 ret = -ENOMEM; 839 goto e_free; 840 } 841 842 end = event_data + event_data_bytes; 843 844 /* 845 * using vmalloc_to_phys() like this only works if PAGE_SIZE is 846 * divisible by 4096 847 */ 848 BUILD_BUG_ON(PAGE_SIZE % 4096); 849 850 for (i = 0; i < event_data_len; i++) { 851 hret = h_get_24x7_catalog_page_( 852 vmalloc_to_phys(event_data + i * 4096), 853 catalog_version_num, 854 i + event_data_offs); 855 if (hret) { 856 pr_err("Failed to get event data in page %zu: rc=%ld\n", 857 i + event_data_offs, hret); 858 ret = -EIO; 859 goto e_event_data; 860 } 861 } 862 863 /* 864 * scan the catalog to determine the number of attributes we need, and 865 * verify it at the same time. 
866 */ 867 for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0; 868 ; 869 event_idx++, event = (void *)event + ev_len) { 870 size_t offset = (void *)event - (void *)event_data; 871 char *name; 872 int nl; 873 874 ev_len = catalog_event_len_validate(event, event_idx, 875 event_data_bytes, 876 event_entry_count, 877 offset, end); 878 if (ev_len < 0) 879 break; 880 881 name = event_name(event, &nl); 882 883 if (ignore_event(name)) { 884 junk_events++; 885 continue; 886 } 887 if (event->event_group_record_len == 0) { 888 pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n", 889 event_idx, nl, name); 890 junk_events++; 891 continue; 892 } 893 894 if (!catalog_entry_domain_is_valid(event->domain)) { 895 pr_info("event %zu (%.*s) has invalid domain %d\n", 896 event_idx, nl, name, event->domain); 897 junk_events++; 898 continue; 899 } 900 901 attr_max++; 902 } 903 904 event_idx_last = event_idx; 905 if (event_idx_last != event_entry_count) 906 pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n", 907 event_idx_last, event_entry_count, junk_events); 908 909 events = kmalloc_objs(*events, attr_max + 1); 910 if (!events) { 911 ret = -ENOMEM; 912 goto e_event_data; 913 } 914 915 event_descs = kmalloc_objs(*event_descs, event_idx + 1); 916 if (!event_descs) { 917 ret = -ENOMEM; 918 goto e_event_attrs; 919 } 920 921 event_long_descs = kmalloc_objs(*event_long_descs, event_idx + 1); 922 if (!event_long_descs) { 923 ret = -ENOMEM; 924 goto e_event_descs; 925 } 926 927 /* Iterate over the catalog filling in the attribute vector */ 928 for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0, 929 event = event_data, event_idx = 0; 930 event_idx < event_idx_last; 931 event_idx++, ev_len = be16_to_cpu(event->length), 932 event = (void *)event + ev_len) { 933 char *name; 934 int nl; 935 int nonce; 936 /* 937 * these are the only "bad" events that are intermixed and that 938 * we can ignore 
without issue. make sure to skip them here 939 */ 940 if (event->event_group_record_len == 0) 941 continue; 942 if (!catalog_entry_domain_is_valid(event->domain)) 943 continue; 944 945 name = event_name(event, &nl); 946 if (ignore_event(name)) 947 continue; 948 949 nonce = event_uniq_add(&ev_uniq, name, nl, event->domain); 950 ct = event_data_to_attrs(event_idx, events + event_attr_ct, 951 event, nonce); 952 if (ct < 0) { 953 pr_warn("event %zu (%.*s) creation failure, skipping\n", 954 event_idx, nl, name); 955 junk_events++; 956 } else { 957 event_attr_ct++; 958 event_descs[desc_ct] = event_to_desc_attr(event, nonce); 959 if (event_descs[desc_ct]) 960 desc_ct++; 961 event_long_descs[long_desc_ct] = 962 event_to_long_desc_attr(event, nonce); 963 if (event_long_descs[long_desc_ct]) 964 long_desc_ct++; 965 } 966 } 967 968 pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n", 969 event_idx, event_attr_ct, junk_events, desc_ct); 970 971 events[event_attr_ct] = NULL; 972 event_descs[desc_ct] = NULL; 973 event_long_descs[long_desc_ct] = NULL; 974 975 event_uniq_destroy(&ev_uniq); 976 vfree(event_data); 977 kmem_cache_free(hv_page_cache, page); 978 979 *events_ = events; 980 *event_descs_ = event_descs; 981 *event_long_descs_ = event_long_descs; 982 return 0; 983 984 e_event_descs: 985 kfree(event_descs); 986 e_event_attrs: 987 kfree(events); 988 e_event_data: 989 vfree(event_data); 990 e_free: 991 kmem_cache_free(hv_page_cache, page); 992 e_out: 993 *events_ = NULL; 994 *event_descs_ = NULL; 995 *event_long_descs_ = NULL; 996 return ret; 997 } 998 999 static ssize_t catalog_read(struct file *filp, struct kobject *kobj, 1000 const struct bin_attribute *bin_attr, char *buf, 1001 loff_t offset, size_t count) 1002 { 1003 long hret; 1004 ssize_t ret = 0; 1005 size_t catalog_len = 0, catalog_page_len = 0; 1006 loff_t page_offset = 0; 1007 loff_t offset_in_page; 1008 size_t copy_len; 1009 uint64_t catalog_version_num = 0; 1010 void *page = 
kmem_cache_alloc(hv_page_cache, GFP_USER); 1011 struct hv_24x7_catalog_page_0 *page_0 = page; 1012 1013 if (!page) 1014 return -ENOMEM; 1015 1016 hret = h_get_24x7_catalog_page(page, 0, 0); 1017 if (hret) { 1018 ret = -EIO; 1019 goto e_free; 1020 } 1021 1022 catalog_version_num = be64_to_cpu(page_0->version); 1023 catalog_page_len = be32_to_cpu(page_0->length); 1024 catalog_len = catalog_page_len * 4096; 1025 1026 page_offset = offset / 4096; 1027 offset_in_page = offset % 4096; 1028 1029 if (page_offset >= catalog_page_len) 1030 goto e_free; 1031 1032 if (page_offset != 0) { 1033 hret = h_get_24x7_catalog_page(page, catalog_version_num, 1034 page_offset); 1035 if (hret) { 1036 ret = -EIO; 1037 goto e_free; 1038 } 1039 } 1040 1041 copy_len = 4096 - offset_in_page; 1042 if (copy_len > count) 1043 copy_len = count; 1044 1045 memcpy(buf, page+offset_in_page, copy_len); 1046 ret = copy_len; 1047 1048 e_free: 1049 if (hret) 1050 pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:" 1051 " rc=%ld\n", 1052 catalog_version_num, page_offset, hret); 1053 kmem_cache_free(hv_page_cache, page); 1054 1055 pr_devel("catalog_read: offset=%lld(%lld) count=%zu " 1056 "catalog_len=%zu(%zu) => %zd\n", offset, page_offset, 1057 count, catalog_len, catalog_page_len, ret); 1058 1059 return ret; 1060 } 1061 1062 static ssize_t domains_show(struct device *dev, struct device_attribute *attr, 1063 char *page) 1064 { 1065 int d, count = 0; 1066 const char *str; 1067 1068 for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) { 1069 str = domain_name(d); 1070 if (!str) 1071 continue; 1072 1073 count += sysfs_emit_at(page, count, "%d: %s\n", d, str); 1074 } 1075 return count; 1076 } 1077 1078 #define PAGE_0_ATTR(_name, _fmt, _expr) \ 1079 static ssize_t _name##_show(struct device *dev, \ 1080 struct device_attribute *dev_attr, \ 1081 char *buf) \ 1082 { \ 1083 long hret; \ 1084 ssize_t ret = 0; \ 1085 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \ 1086 struct hv_24x7_catalog_page_0 *page_0 
= page; \ 1087 if (!page) \ 1088 return -ENOMEM; \ 1089 hret = h_get_24x7_catalog_page(page, 0, 0); \ 1090 if (hret) { \ 1091 ret = -EIO; \ 1092 goto e_free; \ 1093 } \ 1094 ret = sysfs_emit(buf, _fmt, _expr); \ 1095 e_free: \ 1096 kmem_cache_free(hv_page_cache, page); \ 1097 return ret; \ 1098 } \ 1099 static DEVICE_ATTR_RO(_name) 1100 1101 PAGE_0_ATTR(catalog_version, "%lld\n", 1102 (unsigned long long)be64_to_cpu(page_0->version)); 1103 PAGE_0_ATTR(catalog_len, "%lld\n", 1104 (unsigned long long)be32_to_cpu(page_0->length) * 4096); 1105 static const BIN_ATTR_RO(catalog, 0/* real length varies */); 1106 static DEVICE_ATTR_RO(domains); 1107 static DEVICE_ATTR_RO(sockets); 1108 static DEVICE_ATTR_RO(chipspersocket); 1109 static DEVICE_ATTR_RO(coresperchip); 1110 static DEVICE_ATTR_RO(cpumask); 1111 1112 static const struct bin_attribute *const if_bin_attrs[] = { 1113 &bin_attr_catalog, 1114 NULL, 1115 }; 1116 1117 static struct attribute *cpumask_attrs[] = { 1118 &dev_attr_cpumask.attr, 1119 NULL, 1120 }; 1121 1122 static const struct attribute_group cpumask_attr_group = { 1123 .attrs = cpumask_attrs, 1124 }; 1125 1126 static struct attribute *if_attrs[] = { 1127 &dev_attr_catalog_len.attr, 1128 &dev_attr_catalog_version.attr, 1129 &dev_attr_domains.attr, 1130 &dev_attr_sockets.attr, 1131 &dev_attr_chipspersocket.attr, 1132 &dev_attr_coresperchip.attr, 1133 NULL, 1134 }; 1135 1136 static const struct attribute_group if_group = { 1137 .name = "interface", 1138 .bin_attrs = if_bin_attrs, 1139 .attrs = if_attrs, 1140 }; 1141 1142 static const struct attribute_group *attr_groups[] = { 1143 &format_group, 1144 &event_group, 1145 &event_desc_group, 1146 &event_long_desc_group, 1147 &if_group, 1148 &cpumask_attr_group, 1149 NULL, 1150 }; 1151 1152 /* 1153 * Start the process for a new H_GET_24x7_DATA hcall. 
1154 */ 1155 static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1156 struct hv_24x7_data_result_buffer *result_buffer) 1157 { 1158 1159 memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1160 memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1161 1162 request_buffer->interface_version = interface_version; 1163 /* memset above set request_buffer->num_requests to 0 */ 1164 } 1165 1166 /* 1167 * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected 1168 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'. 1169 */ 1170 static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1171 struct hv_24x7_data_result_buffer *result_buffer) 1172 { 1173 long ret; 1174 1175 /* 1176 * NOTE: Due to variable number of array elements in request and 1177 * result buffer(s), sizeof() is not reliable. Use the actual 1178 * allocated buffer size, H24x7_DATA_BUFFER_SIZE. 1179 */ 1180 ret = plpar_hcall_norets(H_GET_24X7_DATA, 1181 virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE, 1182 virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE); 1183 1184 if (ret) { 1185 struct hv_24x7_request *req; 1186 1187 req = request_buffer->requests; 1188 pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", 1189 req->performance_domain, req->data_offset, 1190 req->starting_ix, req->starting_lpar_ix, 1191 ret, ret, result_buffer->detailed_rc, 1192 result_buffer->failing_request_ix); 1193 return -EIO; 1194 } 1195 1196 return 0; 1197 } 1198 1199 /* 1200 * Add the given @event to the next slot in the 24x7 request_buffer. 1201 * 1202 * Note that H_GET_24X7_DATA hcall allows reading several counters' 1203 * values in a single HCALL. We expect the caller to add events to the 1204 * request buffer one by one, make the HCALL and process the results. 
1205 */ 1206 static int add_event_to_24x7_request(struct perf_event *event, 1207 struct hv_24x7_request_buffer *request_buffer) 1208 { 1209 u16 idx; 1210 int i; 1211 size_t req_size; 1212 struct hv_24x7_request *req; 1213 1214 if (request_buffer->num_requests >= 1215 max_num_requests(request_buffer->interface_version)) { 1216 pr_devel("Too many requests for 24x7 HCALL %d\n", 1217 request_buffer->num_requests); 1218 return -EINVAL; 1219 } 1220 1221 switch (event_get_domain(event)) { 1222 case HV_PERF_DOMAIN_PHYS_CHIP: 1223 idx = event_get_chip(event); 1224 break; 1225 case HV_PERF_DOMAIN_PHYS_CORE: 1226 idx = event_get_core(event); 1227 break; 1228 default: 1229 idx = event_get_vcpu(event); 1230 } 1231 1232 req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version); 1233 1234 i = request_buffer->num_requests++; 1235 req = (void *) request_buffer->requests + i * req_size; 1236 1237 req->performance_domain = event_get_domain(event); 1238 req->data_size = cpu_to_be16(8); 1239 req->data_offset = cpu_to_be32(event_get_offset(event)); 1240 req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event)); 1241 req->max_num_lpars = cpu_to_be16(1); 1242 req->starting_ix = cpu_to_be16(idx); 1243 req->max_ix = cpu_to_be16(1); 1244 1245 if (request_buffer->interface_version > 1) { 1246 if (domain_needs_aggregation(req->performance_domain)) 1247 req->max_num_thread_groups = -1; 1248 else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { 1249 req->starting_thread_group_ix = idx % 2; 1250 req->max_num_thread_groups = 1; 1251 } 1252 } 1253 1254 return 0; 1255 } 1256 1257 /** 1258 * get_count_from_result - get event count from all result elements in result 1259 * 1260 * If the event corresponding to this result needs aggregation of the result 1261 * element values, then this function does that. 1262 * 1263 * @event: Event associated with @res. 1264 * @resb: Result buffer containing @res. 1265 * @res: Result to work on. 
1266 * @countp: Output variable containing the event count. 1267 * @next: Optional output variable pointing to the next result in @resb. 1268 */ 1269 static int get_count_from_result(struct perf_event *event, 1270 struct hv_24x7_data_result_buffer *resb, 1271 struct hv_24x7_result *res, u64 *countp, 1272 struct hv_24x7_result **next) 1273 { 1274 u16 num_elements = be16_to_cpu(res->num_elements_returned); 1275 u16 data_size = be16_to_cpu(res->result_element_data_size); 1276 unsigned int data_offset; 1277 void *element_data; 1278 int i; 1279 u64 count; 1280 1281 /* 1282 * We can bail out early if the result is empty. 1283 */ 1284 if (!num_elements) { 1285 pr_debug("Result of request %hhu is empty, nothing to do\n", 1286 res->result_ix); 1287 1288 if (next) 1289 *next = (struct hv_24x7_result *) res->elements; 1290 1291 return -ENODATA; 1292 } 1293 1294 /* 1295 * Since we always specify 1 as the maximum for the smallest resource 1296 * we're requesting, there should to be only one element per result. 1297 * Except when an event needs aggregation, in which case there are more. 1298 */ 1299 if (num_elements != 1 && 1300 !domain_needs_aggregation(event_get_domain(event))) { 1301 pr_err("Error: result of request %hhu has %hu elements\n", 1302 res->result_ix, num_elements); 1303 1304 return -EIO; 1305 } 1306 1307 if (data_size != sizeof(u64)) { 1308 pr_debug("Error: result of request %hhu has data of %hu bytes\n", 1309 res->result_ix, data_size); 1310 1311 return -ENOTSUPP; 1312 } 1313 1314 if (resb->interface_version == 1) 1315 data_offset = offsetof(struct hv_24x7_result_element_v1, 1316 element_data); 1317 else 1318 data_offset = offsetof(struct hv_24x7_result_element_v2, 1319 element_data); 1320 1321 /* Go through the result elements in the result. 
*/ 1322 for (i = count = 0, element_data = res->elements + data_offset; 1323 i < num_elements; 1324 i++, element_data += data_size + data_offset) 1325 count += be64_to_cpu(*((__be64 *)element_data)); 1326 1327 *countp = count; 1328 1329 /* The next result is after the last result element. */ 1330 if (next) 1331 *next = element_data - data_offset; 1332 1333 return 0; 1334 } 1335 1336 static int single_24x7_request(struct perf_event *event, u64 *count) 1337 { 1338 int ret; 1339 struct hv_24x7_request_buffer *request_buffer; 1340 struct hv_24x7_data_result_buffer *result_buffer; 1341 1342 BUILD_BUG_ON(sizeof(*request_buffer) > 4096); 1343 BUILD_BUG_ON(sizeof(*result_buffer) > 4096); 1344 1345 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1346 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1347 1348 init_24x7_request(request_buffer, result_buffer); 1349 1350 ret = add_event_to_24x7_request(event, request_buffer); 1351 if (ret) 1352 goto out; 1353 1354 ret = make_24x7_request(request_buffer, result_buffer); 1355 if (ret) 1356 goto out; 1357 1358 /* process result from hcall */ 1359 ret = get_count_from_result(event, result_buffer, 1360 result_buffer->results, count, NULL); 1361 1362 out: 1363 put_cpu_var(hv_24x7_reqb); 1364 put_cpu_var(hv_24x7_resb); 1365 return ret; 1366 } 1367 1368 1369 static int h_24x7_event_init(struct perf_event *event) 1370 { 1371 struct hv_perf_caps caps; 1372 unsigned int domain; 1373 unsigned long hret; 1374 u64 ct; 1375 1376 /* Not our event */ 1377 if (event->attr.type != event->pmu->type) 1378 return -ENOENT; 1379 1380 /* Unused areas must be 0 */ 1381 if (event_get_reserved1(event) || 1382 event_get_reserved2(event) || 1383 event_get_reserved3(event)) { 1384 pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n", 1385 event->attr.config, 1386 event_get_reserved1(event), 1387 event->attr.config1, 1388 event_get_reserved2(event), 1389 event->attr.config2, 1390 event_get_reserved3(event)); 1391 return 
-EINVAL; 1392 } 1393 1394 /* no branch sampling */ 1395 if (has_branch_stack(event)) 1396 return -EOPNOTSUPP; 1397 1398 /* offset must be 8 byte aligned */ 1399 if (event_get_offset(event) % 8) { 1400 pr_devel("bad alignment\n"); 1401 return -EINVAL; 1402 } 1403 1404 domain = event_get_domain(event); 1405 if (domain == 0 || domain >= HV_PERF_DOMAIN_MAX) { 1406 pr_devel("invalid domain %d\n", domain); 1407 return -EINVAL; 1408 } 1409 1410 hret = hv_perf_caps_get(&caps); 1411 if (hret) { 1412 pr_devel("could not get capabilities: rc=%ld\n", hret); 1413 return -EIO; 1414 } 1415 1416 /* Physical domains & other lpars require extra capabilities */ 1417 if (!caps.collect_privileged && (is_physical_domain(domain) || 1418 (event_get_lpar(event) != event_get_lpar_max()))) { 1419 pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n", 1420 is_physical_domain(domain), 1421 event_get_lpar(event)); 1422 return -EACCES; 1423 } 1424 1425 /* Get the initial value of the counter for this event */ 1426 if (single_24x7_request(event, &ct)) { 1427 pr_devel("test hcall failed\n"); 1428 return -EIO; 1429 } 1430 (void)local64_xchg(&event->hw.prev_count, ct); 1431 1432 return 0; 1433 } 1434 1435 static u64 h_24x7_get_value(struct perf_event *event) 1436 { 1437 u64 ct; 1438 1439 if (single_24x7_request(event, &ct)) 1440 /* We checked this in event init, shouldn't fail here... 
*/ 1441 return 0; 1442 1443 return ct; 1444 } 1445 1446 static void update_event_count(struct perf_event *event, u64 now) 1447 { 1448 s64 prev; 1449 1450 prev = local64_xchg(&event->hw.prev_count, now); 1451 local64_add(now - prev, &event->count); 1452 } 1453 1454 static void h_24x7_event_read(struct perf_event *event) 1455 { 1456 u64 now; 1457 struct hv_24x7_request_buffer *request_buffer; 1458 struct hv_24x7_hw *h24x7hw; 1459 int txn_flags; 1460 1461 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1462 1463 /* 1464 * If in a READ transaction, add this counter to the list of 1465 * counters to read during the next HCALL (i.e commit_txn()). 1466 * If not in a READ transaction, go ahead and make the HCALL 1467 * to read this counter by itself. 1468 */ 1469 1470 if (txn_flags & PERF_PMU_TXN_READ) { 1471 int i; 1472 int ret; 1473 1474 if (__this_cpu_read(hv_24x7_txn_err)) 1475 return; 1476 1477 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1478 1479 ret = add_event_to_24x7_request(event, request_buffer); 1480 if (ret) { 1481 __this_cpu_write(hv_24x7_txn_err, ret); 1482 } else { 1483 /* 1484 * Associate the event with the HCALL request index, 1485 * so ->commit_txn() can quickly find/update count. 
1486 */ 1487 i = request_buffer->num_requests - 1; 1488 1489 h24x7hw = &get_cpu_var(hv_24x7_hw); 1490 h24x7hw->events[i] = event; 1491 put_cpu_var(h24x7hw); 1492 } 1493 1494 put_cpu_var(hv_24x7_reqb); 1495 } else { 1496 now = h_24x7_get_value(event); 1497 update_event_count(event, now); 1498 } 1499 } 1500 1501 static void h_24x7_event_start(struct perf_event *event, int flags) 1502 { 1503 if (flags & PERF_EF_RELOAD) 1504 local64_set(&event->hw.prev_count, h_24x7_get_value(event)); 1505 } 1506 1507 static void h_24x7_event_stop(struct perf_event *event, int flags) 1508 { 1509 h_24x7_event_read(event); 1510 } 1511 1512 static int h_24x7_event_add(struct perf_event *event, int flags) 1513 { 1514 if (flags & PERF_EF_START) 1515 h_24x7_event_start(event, flags); 1516 1517 return 0; 1518 } 1519 1520 /* 1521 * 24x7 counters only support READ transactions. They are 1522 * always counting and dont need/support ADD transactions. 1523 * Cache the flags, but otherwise ignore transactions that 1524 * are not PERF_PMU_TXN_READ. 1525 */ 1526 static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags) 1527 { 1528 struct hv_24x7_request_buffer *request_buffer; 1529 struct hv_24x7_data_result_buffer *result_buffer; 1530 1531 /* We should not be called if we are already in a txn */ 1532 WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags)); 1533 1534 __this_cpu_write(hv_24x7_txn_flags, flags); 1535 if (flags & ~PERF_PMU_TXN_READ) 1536 return; 1537 1538 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1539 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1540 1541 init_24x7_request(request_buffer, result_buffer); 1542 1543 put_cpu_var(hv_24x7_resb); 1544 put_cpu_var(hv_24x7_reqb); 1545 } 1546 1547 /* 1548 * Clean up transaction state. 1549 * 1550 * NOTE: Ignore state of request and result buffers for now. 1551 * We will initialize them during the next read/txn. 
1552 */ 1553 static void reset_txn(void) 1554 { 1555 __this_cpu_write(hv_24x7_txn_flags, 0); 1556 __this_cpu_write(hv_24x7_txn_err, 0); 1557 } 1558 1559 /* 1560 * 24x7 counters only support READ transactions. They are always counting 1561 * and dont need/support ADD transactions. Clear ->txn_flags but otherwise 1562 * ignore transactions that are not of type PERF_PMU_TXN_READ. 1563 * 1564 * For READ transactions, submit all pending 24x7 requests (i.e requests 1565 * that were queued by h_24x7_event_read()), to the hypervisor and update 1566 * the event counts. 1567 */ 1568 static int h_24x7_event_commit_txn(struct pmu *pmu) 1569 { 1570 struct hv_24x7_request_buffer *request_buffer; 1571 struct hv_24x7_data_result_buffer *result_buffer; 1572 struct hv_24x7_result *res, *next_res; 1573 u64 count; 1574 int i, ret, txn_flags; 1575 struct hv_24x7_hw *h24x7hw; 1576 1577 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1578 WARN_ON_ONCE(!txn_flags); 1579 1580 ret = 0; 1581 if (txn_flags & ~PERF_PMU_TXN_READ) 1582 goto out; 1583 1584 ret = __this_cpu_read(hv_24x7_txn_err); 1585 if (ret) 1586 goto out; 1587 1588 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1589 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1590 1591 ret = make_24x7_request(request_buffer, result_buffer); 1592 if (ret) 1593 goto put_reqb; 1594 1595 h24x7hw = &get_cpu_var(hv_24x7_hw); 1596 1597 /* Go through results in the result buffer to update event counts. 
*/ 1598 for (i = 0, res = result_buffer->results; 1599 i < result_buffer->num_results; i++, res = next_res) { 1600 struct perf_event *event = h24x7hw->events[res->result_ix]; 1601 1602 ret = get_count_from_result(event, result_buffer, res, &count, 1603 &next_res); 1604 if (ret) 1605 break; 1606 1607 update_event_count(event, count); 1608 } 1609 1610 put_cpu_var(hv_24x7_hw); 1611 1612 put_reqb: 1613 put_cpu_var(hv_24x7_resb); 1614 put_cpu_var(hv_24x7_reqb); 1615 out: 1616 reset_txn(); 1617 return ret; 1618 } 1619 1620 /* 1621 * 24x7 counters only support READ transactions. They are always counting 1622 * and dont need/support ADD transactions. However, regardless of type 1623 * of transaction, all we need to do is cleanup, so we don't have to check 1624 * the type of transaction. 1625 */ 1626 static void h_24x7_event_cancel_txn(struct pmu *pmu) 1627 { 1628 WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags)); 1629 reset_txn(); 1630 } 1631 1632 static struct pmu h_24x7_pmu = { 1633 .task_ctx_nr = perf_invalid_context, 1634 1635 .name = "hv_24x7", 1636 .attr_groups = attr_groups, 1637 .event_init = h_24x7_event_init, 1638 .add = h_24x7_event_add, 1639 .del = h_24x7_event_stop, 1640 .start = h_24x7_event_start, 1641 .stop = h_24x7_event_stop, 1642 .read = h_24x7_event_read, 1643 .start_txn = h_24x7_event_start_txn, 1644 .commit_txn = h_24x7_event_commit_txn, 1645 .cancel_txn = h_24x7_event_cancel_txn, 1646 .capabilities = PERF_PMU_CAP_NO_EXCLUDE, 1647 }; 1648 1649 static int ppc_hv_24x7_cpu_online(unsigned int cpu) 1650 { 1651 if (cpumask_empty(&hv_24x7_cpumask)) 1652 cpumask_set_cpu(cpu, &hv_24x7_cpumask); 1653 1654 return 0; 1655 } 1656 1657 static int ppc_hv_24x7_cpu_offline(unsigned int cpu) 1658 { 1659 int target; 1660 1661 /* Check if exiting cpu is used for collecting 24x7 events */ 1662 if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask)) 1663 return 0; 1664 1665 /* Find a new cpu to collect 24x7 events */ 1666 target = cpumask_last(cpu_active_mask); 1667 1668 
if (target < 0 || target >= nr_cpu_ids) { 1669 pr_err("hv_24x7: CPU hotplug init failed\n"); 1670 return -1; 1671 } 1672 1673 /* Migrate 24x7 events to the new target */ 1674 cpumask_set_cpu(target, &hv_24x7_cpumask); 1675 perf_pmu_migrate_context(&h_24x7_pmu, cpu, target); 1676 1677 return 0; 1678 } 1679 1680 static int hv_24x7_cpu_hotplug_init(void) 1681 { 1682 return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, 1683 "perf/powerpc/hv_24x7:online", 1684 ppc_hv_24x7_cpu_online, 1685 ppc_hv_24x7_cpu_offline); 1686 } 1687 1688 static int hv_24x7_init(void) 1689 { 1690 int r; 1691 unsigned long hret; 1692 unsigned int pvr = mfspr(SPRN_PVR); 1693 struct hv_perf_caps caps; 1694 1695 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 1696 pr_debug("not a virtualized system, not enabling\n"); 1697 return -ENODEV; 1698 } 1699 1700 /* POWER8 only supports v1, while POWER9 only supports v2. */ 1701 if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E || 1702 PVR_VER(pvr) == PVR_POWER8NVL) 1703 interface_version = 1; 1704 else { 1705 interface_version = 2; 1706 1707 /* SMT8 in POWER9 needs to aggregate result elements. 
*/ 1708 if (threads_per_core == 8) 1709 aggregate_result_elements = true; 1710 } 1711 1712 hret = hv_perf_caps_get(&caps); 1713 if (hret) { 1714 pr_debug("could not obtain capabilities, not enabling, rc=%ld\n", 1715 hret); 1716 return -ENODEV; 1717 } 1718 1719 hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL); 1720 if (!hv_page_cache) 1721 return -ENOMEM; 1722 1723 /* sampling not supported */ 1724 h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; 1725 1726 r = create_events_from_catalog(&event_group.attrs, 1727 &event_desc_group.attrs, 1728 &event_long_desc_group.attrs); 1729 1730 if (r) 1731 return r; 1732 1733 /* init cpuhotplug */ 1734 r = hv_24x7_cpu_hotplug_init(); 1735 if (r) 1736 return r; 1737 1738 r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1); 1739 if (r) 1740 return r; 1741 1742 read_24x7_sys_info(); 1743 1744 return 0; 1745 } 1746 1747 device_initcall(hv_24x7_init); 1748