// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Hypervisor supplied "24x7" performance counter support
 *
 * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
 * Copyright 2014 IBM Corporation.
 */

#define pr_fmt(fmt) "hv-24x7: " fmt

#include <linux/perf_event.h>
#include <linux/rbtree.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <asm/cputhreads.h>
#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/io.h>
#include <asm/papr-sysparm.h>
#include <linux/byteorder/generic.h>

#include <asm/rtas.h>
#include "hv-24x7.h"
#include "hv-24x7-catalog.h"
#include "hv-common.h"

/* Version of the 24x7 hypervisor API that we should use in this machine. */
static int interface_version;

/* Whether we have to aggregate result data for some domains. */
static bool aggregate_result_elements;

static cpumask_t hv_24x7_cpumask;

static bool domain_is_valid(unsigned int domain)
{
	switch (domain) {
#define DOMAIN(n, v, x, c)		\
	case HV_PERF_DOMAIN_##n:	\
		/* fall through */
#include "hv-24x7-domains.h"
#undef DOMAIN
		return true;
	default:
		return false;
	}
}

static bool is_physical_domain(unsigned int domain)
{
	switch (domain) {
#define DOMAIN(n, v, x, c)		\
	case HV_PERF_DOMAIN_##n:	\
		return c;
#include "hv-24x7-domains.h"
#undef DOMAIN
	default:
		return false;
	}
}

/*
 * The Processor Module Information system parameter allows transferring
 * of certain processor module information from the platform to the OS.
 * Refer to the PAPR+ document; the parameter token value is 43.
 */

static u32 phys_sockets;	/* Physical sockets */
static u32 phys_chipspersocket;	/* Physical chips per socket */
static u32 phys_coresperchip;	/* Physical cores per chip */

/*
 * read_24x7_sys_info()
 * Retrieve the number of sockets, chips per socket and cores per chip
 * through the get-system-parameter rtas call.
 */
void read_24x7_sys_info(void)
{
	struct papr_sysparm_buf *buf;

	/*
	 * Make the system parameters (sockets, chips per socket and cores
	 * per chip) default to 1.
	 */
	phys_sockets = 1;
	phys_chipspersocket = 1;
	phys_coresperchip = 1;

	buf = papr_sysparm_buf_alloc();
	if (!buf)
		return;

	if (!papr_sysparm_get(PAPR_SYSPARM_PROC_MODULE_INFO, buf)) {
		int ntypes = be16_to_cpup((__be16 *)&buf->val[0]);
		int len = be16_to_cpu(buf->len);

		if (len >= 8 && ntypes != 0) {
			phys_sockets = be16_to_cpup((__be16 *)&buf->val[2]);
			phys_chipspersocket = be16_to_cpup((__be16 *)&buf->val[4]);
			phys_coresperchip = be16_to_cpup((__be16 *)&buf->val[6]);
		}
	}

	papr_sysparm_buf_free(buf);
}

/* Domains for which more than one result element is returned for each event.
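 * As an example (a sketch based on hv_24x7_init() and get_count_from_result()
 * below): on an SMT8 POWER9 system the hypervisor returns several result
 * elements for one such counter request, and get_count_from_result() sums
 * them into a single count.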
*/ 110 static bool domain_needs_aggregation(unsigned int domain) 111 { 112 return aggregate_result_elements && 113 (domain == HV_PERF_DOMAIN_PHYS_CORE || 114 (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE && 115 domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE)); 116 } 117 118 static const char *domain_name(unsigned int domain) 119 { 120 if (!domain_is_valid(domain)) 121 return NULL; 122 123 switch (domain) { 124 case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip"; 125 case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core"; 126 case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core"; 127 case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip"; 128 case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node"; 129 case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node"; 130 } 131 132 WARN_ON_ONCE(domain); 133 return NULL; 134 } 135 136 static bool catalog_entry_domain_is_valid(unsigned int domain) 137 { 138 /* POWER8 doesn't support virtual domains. */ 139 if (interface_version == 1) 140 return is_physical_domain(domain); 141 else 142 return domain_is_valid(domain); 143 } 144 145 /* 146 * TODO: Merging events: 147 * - Think of the hcall as an interface to a 4d array of counters: 148 * - x = domains 149 * - y = indexes in the domain (core, chip, vcpu, node, etc) 150 * - z = offset into the counter space 151 * - w = lpars (guest vms, "logical partitions") 152 * - A single request is: x,y,y_last,z,z_last,w,w_last 153 * - this means we can retrieve a rectangle of counters in y,z for a single x. 154 * 155 * - Things to consider (ignoring w): 156 * - input cost_per_request = 16 157 * - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs 158 * - limited number of requests per hcall (must fit into 4K bytes) 159 * - 4k = 16 [buffer header] - 16 [request size] * request_count 160 * - 255 requests per hcall 161 * - sometimes it will be more efficient to read extra data and discard 162 */ 163 164 /* 165 * Example usage: 166 * perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/' 167 */ 168 169 /* u3 0-6, one of HV_24X7_PERF_DOMAIN */ 170 EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3); 171 /* u16 */ 172 EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31); 173 EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31); 174 EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31); 175 /* u32, see "data_offset" */ 176 EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63); 177 /* u16 */ 178 EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15); 179 180 EVENT_DEFINE_RANGE(reserved1, config, 4, 15); 181 EVENT_DEFINE_RANGE(reserved2, config1, 16, 63); 182 EVENT_DEFINE_RANGE(reserved3, config2, 0, 63); 183 184 static struct attribute *format_attrs[] = { 185 &format_attr_domain.attr, 186 &format_attr_offset.attr, 187 &format_attr_core.attr, 188 &format_attr_chip.attr, 189 &format_attr_vcpu.attr, 190 &format_attr_lpar.attr, 191 NULL, 192 }; 193 194 static const struct attribute_group format_group = { 195 .name = "format", 196 .attrs = format_attrs, 197 }; 198 199 static struct attribute_group event_group = { 200 .name = "events", 201 /* .attrs is set in init */ 202 }; 203 204 static struct attribute_group event_desc_group = { 205 .name = "event_descs", 206 /* .attrs is set in init */ 207 }; 208 209 static struct attribute_group event_long_desc_group = { 210 .name = "event_long_descs", 211 /* .attrs is set in init */ 212 }; 213 214 static struct kmem_cache *hv_page_cache; 215 216 static DEFINE_PER_CPU(int, hv_24x7_txn_flags); 217 static DEFINE_PER_CPU(int, hv_24x7_txn_err); 218 219 struct hv_24x7_hw { 220 struct perf_event 
*events[255]; 221 }; 222 223 static DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); 224 225 /* 226 * request_buffer and result_buffer are not required to be 4k aligned, 227 * but are not allowed to cross any 4k boundary. Aligning them to 4k is 228 * the simplest way to ensure that. 229 */ 230 #define H24x7_DATA_BUFFER_SIZE 4096 231 static DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 232 static DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 233 234 static unsigned int max_num_requests(int interface_version) 235 { 236 return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer)) 237 / H24x7_REQUEST_SIZE(interface_version); 238 } 239 240 static char *event_name(struct hv_24x7_event_data *ev, int *len) 241 { 242 *len = be16_to_cpu(ev->event_name_len) - 2; 243 return (char *)ev->remainder; 244 } 245 246 static char *event_desc(struct hv_24x7_event_data *ev, int *len) 247 { 248 unsigned int nl = be16_to_cpu(ev->event_name_len); 249 __be16 *desc_len = (__be16 *)(ev->remainder + nl - 2); 250 251 *len = be16_to_cpu(*desc_len) - 2; 252 return (char *)ev->remainder + nl; 253 } 254 255 static char *event_long_desc(struct hv_24x7_event_data *ev, int *len) 256 { 257 unsigned int nl = be16_to_cpu(ev->event_name_len); 258 __be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2); 259 unsigned int desc_len = be16_to_cpu(*desc_len_); 260 __be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2); 261 262 *len = be16_to_cpu(*long_desc_len) - 2; 263 return (char *)ev->remainder + nl + desc_len; 264 } 265 266 static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev, 267 void *end) 268 { 269 void *start = ev; 270 271 return (start + offsetof(struct hv_24x7_event_data, remainder)) < end; 272 } 273 274 /* 275 * Things we don't check: 276 * - padding for desc, name, and long/detailed desc is required to be '\0' 277 * bytes. 278 * 279 * Return NULL if we pass end, 280 * Otherwise return the address of the byte just following the event. 
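 *
 * A sketch of the layout implied by event_name()/event_desc()/
 * event_long_desc() above (derived from those accessors, not from a spec):
 * with nl = be16(event_name_len) and dl/ldl the embedded big-endian length
 * words,
 *
 *	remainder + 0			name		(nl - 2 bytes)
 *	remainder + nl - 2		dl		(__be16)
 *	remainder + nl			desc		(dl - 2 bytes)
 *	remainder + nl + dl - 2		ldl		(__be16)
 *	remainder + nl + dl		long desc	(ldl - 2 bytes)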
 */
static void *event_end(struct hv_24x7_event_data *ev, void *end)
{
	void *start = ev;
	__be16 *dl_, *ldl_;
	unsigned int dl, ldl;
	unsigned int nl = be16_to_cpu(ev->event_name_len);

	if (nl < 2) {
		pr_debug("%s: name length too short: %d", __func__, nl);
		return NULL;
	}

	if (start + nl > end) {
		pr_debug("%s: start=%p + nl=%u > end=%p",
			 __func__, start, nl, end);
		return NULL;
	}

	dl_ = (__be16 *)(ev->remainder + nl - 2);
	if (!IS_ALIGNED((uintptr_t)dl_, 2))
		pr_warn("desc len not aligned %p", dl_);
	dl = be16_to_cpu(*dl_);
	if (dl < 2) {
		pr_debug("%s: desc len too short: %d", __func__, dl);
		return NULL;
	}

	if (start + nl + dl > end) {
		pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p",
			 __func__, start, nl, dl, start + nl + dl, end);
		return NULL;
	}

	ldl_ = (__be16 *)(ev->remainder + nl + dl - 2);
	if (!IS_ALIGNED((uintptr_t)ldl_, 2))
		pr_warn("long desc len not aligned %p", ldl_);
	ldl = be16_to_cpu(*ldl_);
	if (ldl < 2) {
		pr_debug("%s: long desc len too short (ldl=%u)",
			 __func__, ldl);
		return NULL;
	}

	if (start + nl + dl + ldl > end) {
		pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p",
			 __func__, start, nl, dl, ldl, end);
		return NULL;
	}

	return start + nl + dl + ldl;
}

static long h_get_24x7_catalog_page_(unsigned long phys_4096,
				     unsigned long version, unsigned long index)
{
	pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)",
		 phys_4096, version, index);

	WARN_ON(!IS_ALIGNED(phys_4096, 4096));

	return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE,
				  phys_4096, version, index);
}

static long h_get_24x7_catalog_page(char page[], u64 version, u32 index)
{
	return h_get_24x7_catalog_page_(virt_to_phys(page),
					version, index);
}

/*
 * Each event we find in the catalog will have a sysfs entry. Format the
 * data for this sysfs entry based on the event's domain.
 *
 * Events belonging to the Chip domain can only be monitored in that domain,
 * i.e. the domain for these events is a fixed/known value.
 *
 * Events belonging to the Core domain can be monitored either in the physical
 * core or in one of the virtual CPU domains. So the domain value for these
 * events must be specified by the user (i.e. is a required parameter). Format
 * the Core events with 'domain=?' so the perf tool can error check required
 * parameters.
 *
 * NOTE: For the Core domain events, rather than making domain a required
 *	 parameter we could default it to PHYS_CORE and allow users to
 *	 override the domain to one of the VCPU domains.
 *
 *	 However, this can make the interface a little inconsistent.
 *
 *	 If we set domain=2 (PHYS_CHIP) and allow the user to override this
 *	 field, the user may be tempted to also modify the "offset=x" field,
 *	 which can lead to confusing usage. Consider the HPM_PCYC (offset=0x18)
 *	 and HPM_INST (offset=0x20) events. With:
 *
 *		perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
 *
 *	 we end up monitoring HPM_INST, while the command line has HPM_PCYC.
 *
 *	 By not assigning a default value to the domain for the Core events,
 *	 we can have simple guidelines:
 *
 *		- Specifying values for parameters with "=?" is required.
 *
 *		- Specifying (i.e. overriding) values for other parameters
 *		  is undefined.
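 *
 * As an illustration (hypothetical offset value, not an actual catalog
 * entry), a Core-domain event whose counter and group-record offsets sum to
 * 0x98 is exported as:
 *
 *	domain=?,offset=0x98,core=?,lpar=0x0
 *
 * while a Chip-domain event at the same offset is exported as:
 *
 *	domain=2,offset=0x98,chip=?,lpar=0x0
 *
 * and an event in any other (virtual CPU) domain as:
 *
 *	domain=?,offset=0x98,vcpu=?,lpar=?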
387 */ 388 static char *event_fmt(struct hv_24x7_event_data *event, unsigned int domain) 389 { 390 const char *sindex; 391 const char *lpar; 392 const char *domain_str; 393 char buf[8]; 394 395 switch (domain) { 396 case HV_PERF_DOMAIN_PHYS_CHIP: 397 snprintf(buf, sizeof(buf), "%d", domain); 398 domain_str = buf; 399 lpar = "0x0"; 400 sindex = "chip"; 401 break; 402 case HV_PERF_DOMAIN_PHYS_CORE: 403 domain_str = "?"; 404 lpar = "0x0"; 405 sindex = "core"; 406 break; 407 default: 408 domain_str = "?"; 409 lpar = "?"; 410 sindex = "vcpu"; 411 } 412 413 return kasprintf(GFP_KERNEL, 414 "domain=%s,offset=0x%x,%s=?,lpar=%s", 415 domain_str, 416 be16_to_cpu(event->event_counter_offs) + 417 be16_to_cpu(event->event_group_record_offs), 418 sindex, 419 lpar); 420 } 421 422 /* Avoid trusting fw to NUL terminate strings */ 423 static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp) 424 { 425 return kasprintf(gfp, "%.*s", max_len, maybe_str); 426 } 427 428 static ssize_t device_show_string(struct device *dev, 429 struct device_attribute *attr, char *buf) 430 { 431 struct dev_ext_attribute *d; 432 433 d = container_of(attr, struct dev_ext_attribute, attr); 434 435 return sprintf(buf, "%s\n", (char *)d->var); 436 } 437 438 static ssize_t cpumask_show(struct device *dev, 439 struct device_attribute *attr, char *buf) 440 { 441 return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask); 442 } 443 444 static ssize_t sockets_show(struct device *dev, 445 struct device_attribute *attr, char *buf) 446 { 447 return sprintf(buf, "%d\n", phys_sockets); 448 } 449 450 static ssize_t chipspersocket_show(struct device *dev, 451 struct device_attribute *attr, char *buf) 452 { 453 return sprintf(buf, "%d\n", phys_chipspersocket); 454 } 455 456 static ssize_t coresperchip_show(struct device *dev, 457 struct device_attribute *attr, char *buf) 458 { 459 return sprintf(buf, "%d\n", phys_coresperchip); 460 } 461 462 static struct attribute *device_str_attr_create_(char *name, char *str) 463 { 464 struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL); 465 466 if (!attr) 467 return NULL; 468 469 sysfs_attr_init(&attr->attr.attr); 470 471 attr->var = str; 472 attr->attr.attr.name = name; 473 attr->attr.attr.mode = 0444; 474 attr->attr.show = device_show_string; 475 476 return &attr->attr.attr; 477 } 478 479 /* 480 * Allocate and initialize strings representing event attributes. 481 * 482 * NOTE: The strings allocated here are never destroyed and continue to 483 * exist till shutdown. This is to allow us to create as many events 484 * from the catalog as possible, even if we encounter errors with some. 485 * In case of changes to error paths in future, these may need to be 486 * freed by the caller. 
487 */ 488 static struct attribute *device_str_attr_create(char *name, int name_max, 489 int name_nonce, 490 char *str, size_t str_max) 491 { 492 char *n; 493 char *s = memdup_to_str(str, str_max, GFP_KERNEL); 494 struct attribute *a; 495 496 if (!s) 497 return NULL; 498 499 if (!name_nonce) 500 n = kasprintf(GFP_KERNEL, "%.*s", name_max, name); 501 else 502 n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name, 503 name_nonce); 504 if (!n) 505 goto out_s; 506 507 a = device_str_attr_create_(n, s); 508 if (!a) 509 goto out_n; 510 511 return a; 512 out_n: 513 kfree(n); 514 out_s: 515 kfree(s); 516 return NULL; 517 } 518 519 static struct attribute *event_to_attr(unsigned int ix, 520 struct hv_24x7_event_data *event, 521 unsigned int domain, 522 int nonce) 523 { 524 int event_name_len; 525 char *ev_name, *a_ev_name, *val; 526 struct attribute *attr; 527 528 if (!domain_is_valid(domain)) { 529 pr_warn("catalog event %u has invalid domain %u\n", 530 ix, domain); 531 return NULL; 532 } 533 534 val = event_fmt(event, domain); 535 if (!val) 536 return NULL; 537 538 ev_name = event_name(event, &event_name_len); 539 if (!nonce) 540 a_ev_name = kasprintf(GFP_KERNEL, "%.*s", 541 (int)event_name_len, ev_name); 542 else 543 a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d", 544 (int)event_name_len, ev_name, nonce); 545 546 if (!a_ev_name) 547 goto out_val; 548 549 attr = device_str_attr_create_(a_ev_name, val); 550 if (!attr) 551 goto out_name; 552 553 return attr; 554 out_name: 555 kfree(a_ev_name); 556 out_val: 557 kfree(val); 558 return NULL; 559 } 560 561 static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event, 562 int nonce) 563 { 564 int nl, dl; 565 char *name = event_name(event, &nl); 566 char *desc = event_desc(event, &dl); 567 568 /* If there isn't a description, don't create the sysfs file */ 569 if (!dl) 570 return NULL; 571 572 return device_str_attr_create(name, nl, nonce, desc, dl); 573 } 574 575 static struct attribute * 576 event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce) 577 { 578 int nl, dl; 579 char *name = event_name(event, &nl); 580 char *desc = event_long_desc(event, &dl); 581 582 /* If there isn't a description, don't create the sysfs file */ 583 if (!dl) 584 return NULL; 585 586 return device_str_attr_create(name, nl, nonce, desc, dl); 587 } 588 589 static int event_data_to_attrs(unsigned int ix, struct attribute **attrs, 590 struct hv_24x7_event_data *event, int nonce) 591 { 592 *attrs = event_to_attr(ix, event, event->domain, nonce); 593 if (!*attrs) 594 return -1; 595 596 return 0; 597 } 598 599 /* */ 600 struct event_uniq { 601 struct rb_node node; 602 const char *name; 603 int nl; 604 unsigned int ct; 605 unsigned int domain; 606 }; 607 608 static int memord(const void *d1, size_t s1, const void *d2, size_t s2) 609 { 610 if (s1 < s2) 611 return 1; 612 if (s1 > s2) 613 return -1; 614 615 return memcmp(d1, d2, s1); 616 } 617 618 static int ev_uniq_ord(const void *v1, size_t s1, unsigned int d1, 619 const void *v2, size_t s2, unsigned int d2) 620 { 621 int r = memord(v1, s1, v2, s2); 622 623 if (r) 624 return r; 625 if (d1 > d2) 626 return 1; 627 if (d2 > d1) 628 return -1; 629 return 0; 630 } 631 632 static int event_uniq_add(struct rb_root *root, const char *name, int nl, 633 unsigned int domain) 634 { 635 struct rb_node **new = &(root->rb_node), *parent = NULL; 636 struct event_uniq *data; 637 638 /* Figure out where to put new node */ 639 while (*new) { 640 struct event_uniq *it; 641 int result; 642 643 it = rb_entry(*new, struct 
event_uniq, node); 644 result = ev_uniq_ord(name, nl, domain, it->name, it->nl, 645 it->domain); 646 647 parent = *new; 648 if (result < 0) 649 new = &((*new)->rb_left); 650 else if (result > 0) 651 new = &((*new)->rb_right); 652 else { 653 it->ct++; 654 pr_info("found a duplicate event %.*s, ct=%u\n", nl, 655 name, it->ct); 656 return it->ct; 657 } 658 } 659 660 data = kmalloc(sizeof(*data), GFP_KERNEL); 661 if (!data) 662 return -ENOMEM; 663 664 *data = (struct event_uniq) { 665 .name = name, 666 .nl = nl, 667 .ct = 0, 668 .domain = domain, 669 }; 670 671 /* Add new node and rebalance tree. */ 672 rb_link_node(&data->node, parent, new); 673 rb_insert_color(&data->node, root); 674 675 /* data->ct */ 676 return 0; 677 } 678 679 static void event_uniq_destroy(struct rb_root *root) 680 { 681 /* 682 * the strings we point to are in the giant block of memory filled by 683 * the catalog, and are freed separately. 684 */ 685 struct event_uniq *pos, *n; 686 687 rbtree_postorder_for_each_entry_safe(pos, n, root, node) 688 kfree(pos); 689 } 690 691 692 /* 693 * ensure the event structure's sizes are self consistent and don't cause us to 694 * read outside of the event 695 * 696 * On success, return the event length in bytes. 697 * Otherwise, return -1 (and print as appropriate). 698 */ 699 static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event, 700 size_t event_idx, 701 size_t event_data_bytes, 702 size_t event_entry_count, 703 size_t offset, void *end) 704 { 705 ssize_t ev_len; 706 void *ev_end, *calc_ev_end; 707 708 if (offset >= event_data_bytes) 709 return -1; 710 711 if (event_idx >= event_entry_count) { 712 pr_devel("catalog event data has %zu bytes of padding after last event\n", 713 event_data_bytes - offset); 714 return -1; 715 } 716 717 if (!event_fixed_portion_is_within(event, end)) { 718 pr_warn("event %zu fixed portion is not within range\n", 719 event_idx); 720 return -1; 721 } 722 723 ev_len = be16_to_cpu(event->length); 724 725 if (ev_len % 16) 726 pr_info("event %zu has length %zu not divisible by 16: event=%pK\n", 727 event_idx, ev_len, event); 728 729 ev_end = (__u8 *)event + ev_len; 730 if (ev_end > end) { 731 pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n", 732 event_idx, ev_len, ev_end, end, 733 offset); 734 return -1; 735 } 736 737 calc_ev_end = event_end(event, end); 738 if (!calc_ev_end) { 739 pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n", 740 event_idx, event_data_bytes, event, end, 741 offset); 742 return -1; 743 } 744 745 if (calc_ev_end > ev_end) { 746 pr_warn("event %zu exceeds its own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n", 747 event_idx, event, ev_end, offset, calc_ev_end); 748 return -1; 749 } 750 751 return ev_len; 752 } 753 754 /* 755 * Return true incase of invalid or dummy events with names like RESERVED* 756 */ 757 static bool ignore_event(const char *name) 758 { 759 return strncmp(name, "RESERVED", 8) == 0; 760 } 761 762 #define MAX_4K (SIZE_MAX / 4096) 763 764 static int create_events_from_catalog(struct attribute ***events_, 765 struct attribute ***event_descs_, 766 struct attribute ***event_long_descs_) 767 { 768 long hret; 769 size_t catalog_len, catalog_page_len, event_entry_count, 770 event_data_len, event_data_offs, 771 event_data_bytes, junk_events, event_idx, event_attr_ct, i, 772 attr_max, event_idx_last, desc_ct, long_desc_ct; 773 ssize_t ct, ev_len; 774 uint64_t catalog_version_num; 775 struct 
attribute **events, **event_descs, **event_long_descs; 776 struct hv_24x7_catalog_page_0 *page_0 = 777 kmem_cache_alloc(hv_page_cache, GFP_KERNEL); 778 void *page = page_0; 779 void *event_data, *end; 780 struct hv_24x7_event_data *event; 781 struct rb_root ev_uniq = RB_ROOT; 782 int ret = 0; 783 784 if (!page) { 785 ret = -ENOMEM; 786 goto e_out; 787 } 788 789 hret = h_get_24x7_catalog_page(page, 0, 0); 790 if (hret) { 791 ret = -EIO; 792 goto e_free; 793 } 794 795 catalog_version_num = be64_to_cpu(page_0->version); 796 catalog_page_len = be32_to_cpu(page_0->length); 797 798 if (MAX_4K < catalog_page_len) { 799 pr_err("invalid page count: %zu\n", catalog_page_len); 800 ret = -EIO; 801 goto e_free; 802 } 803 804 catalog_len = catalog_page_len * 4096; 805 806 event_entry_count = be16_to_cpu(page_0->event_entry_count); 807 event_data_offs = be16_to_cpu(page_0->event_data_offs); 808 event_data_len = be16_to_cpu(page_0->event_data_len); 809 810 pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n", 811 catalog_version_num, catalog_len, 812 event_entry_count, event_data_offs, event_data_len); 813 814 if ((MAX_4K < event_data_len) 815 || (MAX_4K < event_data_offs) 816 || (MAX_4K - event_data_offs < event_data_len)) { 817 pr_err("invalid event data offs %zu and/or len %zu\n", 818 event_data_offs, event_data_len); 819 ret = -EIO; 820 goto e_free; 821 } 822 823 if ((event_data_offs + event_data_len) > catalog_page_len) { 824 pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n", 825 event_data_offs, 826 event_data_offs + event_data_len, 827 catalog_page_len); 828 ret = -EIO; 829 goto e_free; 830 } 831 832 if (SIZE_MAX - 1 < event_entry_count) { 833 pr_err("event_entry_count %zu is invalid\n", event_entry_count); 834 ret = -EIO; 835 goto e_free; 836 } 837 838 event_data_bytes = event_data_len * 4096; 839 840 /* 841 * event data can span several pages, events can cross between these 842 * pages. Use vmalloc to make this easier. 843 */ 844 event_data = vmalloc(event_data_bytes); 845 if (!event_data) { 846 pr_err("could not allocate event data\n"); 847 ret = -ENOMEM; 848 goto e_free; 849 } 850 851 end = event_data + event_data_bytes; 852 853 /* 854 * using vmalloc_to_phys() like this only works if PAGE_SIZE is 855 * divisible by 4096 856 */ 857 BUILD_BUG_ON(PAGE_SIZE % 4096); 858 859 for (i = 0; i < event_data_len; i++) { 860 hret = h_get_24x7_catalog_page_( 861 vmalloc_to_phys(event_data + i * 4096), 862 catalog_version_num, 863 i + event_data_offs); 864 if (hret) { 865 pr_err("Failed to get event data in page %zu: rc=%ld\n", 866 i + event_data_offs, hret); 867 ret = -EIO; 868 goto e_event_data; 869 } 870 } 871 872 /* 873 * scan the catalog to determine the number of attributes we need, and 874 * verify it at the same time. 
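	 * (This is the first of two passes: it only counts usable events so
	 * the attribute arrays below can be sized; the second loop further
	 * down actually creates the sysfs attributes.)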
875 */ 876 for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0; 877 ; 878 event_idx++, event = (void *)event + ev_len) { 879 size_t offset = (void *)event - (void *)event_data; 880 char *name; 881 int nl; 882 883 ev_len = catalog_event_len_validate(event, event_idx, 884 event_data_bytes, 885 event_entry_count, 886 offset, end); 887 if (ev_len < 0) 888 break; 889 890 name = event_name(event, &nl); 891 892 if (ignore_event(name)) { 893 junk_events++; 894 continue; 895 } 896 if (event->event_group_record_len == 0) { 897 pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n", 898 event_idx, nl, name); 899 junk_events++; 900 continue; 901 } 902 903 if (!catalog_entry_domain_is_valid(event->domain)) { 904 pr_info("event %zu (%.*s) has invalid domain %d\n", 905 event_idx, nl, name, event->domain); 906 junk_events++; 907 continue; 908 } 909 910 attr_max++; 911 } 912 913 event_idx_last = event_idx; 914 if (event_idx_last != event_entry_count) 915 pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n", 916 event_idx_last, event_entry_count, junk_events); 917 918 events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL); 919 if (!events) { 920 ret = -ENOMEM; 921 goto e_event_data; 922 } 923 924 event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs), 925 GFP_KERNEL); 926 if (!event_descs) { 927 ret = -ENOMEM; 928 goto e_event_attrs; 929 } 930 931 event_long_descs = kmalloc_array(event_idx + 1, 932 sizeof(*event_long_descs), GFP_KERNEL); 933 if (!event_long_descs) { 934 ret = -ENOMEM; 935 goto e_event_descs; 936 } 937 938 /* Iterate over the catalog filling in the attribute vector */ 939 for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0, 940 event = event_data, event_idx = 0; 941 event_idx < event_idx_last; 942 event_idx++, ev_len = be16_to_cpu(event->length), 943 event = (void *)event + ev_len) { 944 char *name; 945 int nl; 946 int nonce; 947 /* 948 * these are the only "bad" events that are intermixed and that 949 * we can ignore without issue. 
make sure to skip them here 950 */ 951 if (event->event_group_record_len == 0) 952 continue; 953 if (!catalog_entry_domain_is_valid(event->domain)) 954 continue; 955 956 name = event_name(event, &nl); 957 if (ignore_event(name)) 958 continue; 959 960 nonce = event_uniq_add(&ev_uniq, name, nl, event->domain); 961 ct = event_data_to_attrs(event_idx, events + event_attr_ct, 962 event, nonce); 963 if (ct < 0) { 964 pr_warn("event %zu (%.*s) creation failure, skipping\n", 965 event_idx, nl, name); 966 junk_events++; 967 } else { 968 event_attr_ct++; 969 event_descs[desc_ct] = event_to_desc_attr(event, nonce); 970 if (event_descs[desc_ct]) 971 desc_ct++; 972 event_long_descs[long_desc_ct] = 973 event_to_long_desc_attr(event, nonce); 974 if (event_long_descs[long_desc_ct]) 975 long_desc_ct++; 976 } 977 } 978 979 pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n", 980 event_idx, event_attr_ct, junk_events, desc_ct); 981 982 events[event_attr_ct] = NULL; 983 event_descs[desc_ct] = NULL; 984 event_long_descs[long_desc_ct] = NULL; 985 986 event_uniq_destroy(&ev_uniq); 987 vfree(event_data); 988 kmem_cache_free(hv_page_cache, page); 989 990 *events_ = events; 991 *event_descs_ = event_descs; 992 *event_long_descs_ = event_long_descs; 993 return 0; 994 995 e_event_descs: 996 kfree(event_descs); 997 e_event_attrs: 998 kfree(events); 999 e_event_data: 1000 vfree(event_data); 1001 e_free: 1002 kmem_cache_free(hv_page_cache, page); 1003 e_out: 1004 *events_ = NULL; 1005 *event_descs_ = NULL; 1006 *event_long_descs_ = NULL; 1007 return ret; 1008 } 1009 1010 static ssize_t catalog_read(struct file *filp, struct kobject *kobj, 1011 struct bin_attribute *bin_attr, char *buf, 1012 loff_t offset, size_t count) 1013 { 1014 long hret; 1015 ssize_t ret = 0; 1016 size_t catalog_len = 0, catalog_page_len = 0; 1017 loff_t page_offset = 0; 1018 loff_t offset_in_page; 1019 size_t copy_len; 1020 uint64_t catalog_version_num = 0; 1021 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); 1022 struct hv_24x7_catalog_page_0 *page_0 = page; 1023 1024 if (!page) 1025 return -ENOMEM; 1026 1027 hret = h_get_24x7_catalog_page(page, 0, 0); 1028 if (hret) { 1029 ret = -EIO; 1030 goto e_free; 1031 } 1032 1033 catalog_version_num = be64_to_cpu(page_0->version); 1034 catalog_page_len = be32_to_cpu(page_0->length); 1035 catalog_len = catalog_page_len * 4096; 1036 1037 page_offset = offset / 4096; 1038 offset_in_page = offset % 4096; 1039 1040 if (page_offset >= catalog_page_len) 1041 goto e_free; 1042 1043 if (page_offset != 0) { 1044 hret = h_get_24x7_catalog_page(page, catalog_version_num, 1045 page_offset); 1046 if (hret) { 1047 ret = -EIO; 1048 goto e_free; 1049 } 1050 } 1051 1052 copy_len = 4096 - offset_in_page; 1053 if (copy_len > count) 1054 copy_len = count; 1055 1056 memcpy(buf, page+offset_in_page, copy_len); 1057 ret = copy_len; 1058 1059 e_free: 1060 if (hret) 1061 pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:" 1062 " rc=%ld\n", 1063 catalog_version_num, page_offset, hret); 1064 kmem_cache_free(hv_page_cache, page); 1065 1066 pr_devel("catalog_read: offset=%lld(%lld) count=%zu " 1067 "catalog_len=%zu(%zu) => %zd\n", offset, page_offset, 1068 count, catalog_len, catalog_page_len, ret); 1069 1070 return ret; 1071 } 1072 1073 static ssize_t domains_show(struct device *dev, struct device_attribute *attr, 1074 char *page) 1075 { 1076 int d, n, count = 0; 1077 const char *str; 1078 1079 for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) { 1080 str = domain_name(d); 1081 if (!str) 1082 
continue; 1083 1084 n = sprintf(page, "%d: %s\n", d, str); 1085 if (n < 0) 1086 break; 1087 1088 count += n; 1089 page += n; 1090 } 1091 return count; 1092 } 1093 1094 #define PAGE_0_ATTR(_name, _fmt, _expr) \ 1095 static ssize_t _name##_show(struct device *dev, \ 1096 struct device_attribute *dev_attr, \ 1097 char *buf) \ 1098 { \ 1099 long hret; \ 1100 ssize_t ret = 0; \ 1101 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \ 1102 struct hv_24x7_catalog_page_0 *page_0 = page; \ 1103 if (!page) \ 1104 return -ENOMEM; \ 1105 hret = h_get_24x7_catalog_page(page, 0, 0); \ 1106 if (hret) { \ 1107 ret = -EIO; \ 1108 goto e_free; \ 1109 } \ 1110 ret = sprintf(buf, _fmt, _expr); \ 1111 e_free: \ 1112 kmem_cache_free(hv_page_cache, page); \ 1113 return ret; \ 1114 } \ 1115 static DEVICE_ATTR_RO(_name) 1116 1117 PAGE_0_ATTR(catalog_version, "%lld\n", 1118 (unsigned long long)be64_to_cpu(page_0->version)); 1119 PAGE_0_ATTR(catalog_len, "%lld\n", 1120 (unsigned long long)be32_to_cpu(page_0->length) * 4096); 1121 static BIN_ATTR_RO(catalog, 0/* real length varies */); 1122 static DEVICE_ATTR_RO(domains); 1123 static DEVICE_ATTR_RO(sockets); 1124 static DEVICE_ATTR_RO(chipspersocket); 1125 static DEVICE_ATTR_RO(coresperchip); 1126 static DEVICE_ATTR_RO(cpumask); 1127 1128 static struct bin_attribute *if_bin_attrs[] = { 1129 &bin_attr_catalog, 1130 NULL, 1131 }; 1132 1133 static struct attribute *cpumask_attrs[] = { 1134 &dev_attr_cpumask.attr, 1135 NULL, 1136 }; 1137 1138 static const struct attribute_group cpumask_attr_group = { 1139 .attrs = cpumask_attrs, 1140 }; 1141 1142 static struct attribute *if_attrs[] = { 1143 &dev_attr_catalog_len.attr, 1144 &dev_attr_catalog_version.attr, 1145 &dev_attr_domains.attr, 1146 &dev_attr_sockets.attr, 1147 &dev_attr_chipspersocket.attr, 1148 &dev_attr_coresperchip.attr, 1149 NULL, 1150 }; 1151 1152 static const struct attribute_group if_group = { 1153 .name = "interface", 1154 .bin_attrs = if_bin_attrs, 1155 .attrs = if_attrs, 1156 }; 1157 1158 static const struct attribute_group *attr_groups[] = { 1159 &format_group, 1160 &event_group, 1161 &event_desc_group, 1162 &event_long_desc_group, 1163 &if_group, 1164 &cpumask_attr_group, 1165 NULL, 1166 }; 1167 1168 /* 1169 * Start the process for a new H_GET_24x7_DATA hcall. 1170 */ 1171 static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1172 struct hv_24x7_data_result_buffer *result_buffer) 1173 { 1174 1175 memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1176 memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1177 1178 request_buffer->interface_version = interface_version; 1179 /* memset above set request_buffer->num_requests to 0 */ 1180 } 1181 1182 /* 1183 * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected 1184 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'. 1185 */ 1186 static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1187 struct hv_24x7_data_result_buffer *result_buffer) 1188 { 1189 long ret; 1190 1191 /* 1192 * NOTE: Due to variable number of array elements in request and 1193 * result buffer(s), sizeof() is not reliable. Use the actual 1194 * allocated buffer size, H24x7_DATA_BUFFER_SIZE. 
1195 */ 1196 ret = plpar_hcall_norets(H_GET_24X7_DATA, 1197 virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE, 1198 virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE); 1199 1200 if (ret) { 1201 struct hv_24x7_request *req; 1202 1203 req = request_buffer->requests; 1204 pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", 1205 req->performance_domain, req->data_offset, 1206 req->starting_ix, req->starting_lpar_ix, 1207 ret, ret, result_buffer->detailed_rc, 1208 result_buffer->failing_request_ix); 1209 return -EIO; 1210 } 1211 1212 return 0; 1213 } 1214 1215 /* 1216 * Add the given @event to the next slot in the 24x7 request_buffer. 1217 * 1218 * Note that H_GET_24X7_DATA hcall allows reading several counters' 1219 * values in a single HCALL. We expect the caller to add events to the 1220 * request buffer one by one, make the HCALL and process the results. 1221 */ 1222 static int add_event_to_24x7_request(struct perf_event *event, 1223 struct hv_24x7_request_buffer *request_buffer) 1224 { 1225 u16 idx; 1226 int i; 1227 size_t req_size; 1228 struct hv_24x7_request *req; 1229 1230 if (request_buffer->num_requests >= 1231 max_num_requests(request_buffer->interface_version)) { 1232 pr_devel("Too many requests for 24x7 HCALL %d\n", 1233 request_buffer->num_requests); 1234 return -EINVAL; 1235 } 1236 1237 switch (event_get_domain(event)) { 1238 case HV_PERF_DOMAIN_PHYS_CHIP: 1239 idx = event_get_chip(event); 1240 break; 1241 case HV_PERF_DOMAIN_PHYS_CORE: 1242 idx = event_get_core(event); 1243 break; 1244 default: 1245 idx = event_get_vcpu(event); 1246 } 1247 1248 req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version); 1249 1250 i = request_buffer->num_requests++; 1251 req = (void *) request_buffer->requests + i * req_size; 1252 1253 req->performance_domain = event_get_domain(event); 1254 req->data_size = cpu_to_be16(8); 1255 req->data_offset = cpu_to_be32(event_get_offset(event)); 1256 req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event)); 1257 req->max_num_lpars = cpu_to_be16(1); 1258 req->starting_ix = cpu_to_be16(idx); 1259 req->max_ix = cpu_to_be16(1); 1260 1261 if (request_buffer->interface_version > 1) { 1262 if (domain_needs_aggregation(req->performance_domain)) 1263 req->max_num_thread_groups = -1; 1264 else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { 1265 req->starting_thread_group_ix = idx % 2; 1266 req->max_num_thread_groups = 1; 1267 } 1268 } 1269 1270 return 0; 1271 } 1272 1273 /** 1274 * get_count_from_result - get event count from all result elements in result 1275 * 1276 * If the event corresponding to this result needs aggregation of the result 1277 * element values, then this function does that. 1278 * 1279 * @event: Event associated with @res. 1280 * @resb: Result buffer containing @res. 1281 * @res: Result to work on. 1282 * @countp: Output variable containing the event count. 1283 * @next: Optional output variable pointing to the next result in @resb. 1284 */ 1285 static int get_count_from_result(struct perf_event *event, 1286 struct hv_24x7_data_result_buffer *resb, 1287 struct hv_24x7_result *res, u64 *countp, 1288 struct hv_24x7_result **next) 1289 { 1290 u16 num_elements = be16_to_cpu(res->num_elements_returned); 1291 u16 data_size = be16_to_cpu(res->result_element_data_size); 1292 unsigned int data_offset; 1293 void *element_data; 1294 int i; 1295 u64 count; 1296 1297 /* 1298 * We can bail out early if the result is empty. 
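	 *
	 * (A sketch of the result layout this function walks, derived from
	 * the loop below rather than from a spec: each result element is a
	 * struct hv_24x7_result_element_{v1,v2} header followed by
	 * result_element_data_size bytes of counter data, and the next
	 * hv_24x7_result starts right after the last element's data.)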
	 */
	if (!num_elements) {
		pr_debug("Result of request %hhu is empty, nothing to do\n",
			 res->result_ix);

		if (next)
			*next = (struct hv_24x7_result *) res->elements;

		return -ENODATA;
	}

	/*
	 * Since we always specify 1 as the maximum for the smallest resource
	 * we're requesting, there should be only one element per result,
	 * except when an event needs aggregation, in which case there are more.
	 */
	if (num_elements != 1 &&
	    !domain_needs_aggregation(event_get_domain(event))) {
		pr_err("Error: result of request %hhu has %hu elements\n",
		       res->result_ix, num_elements);

		return -EIO;
	}

	if (data_size != sizeof(u64)) {
		pr_debug("Error: result of request %hhu has data of %hu bytes\n",
			 res->result_ix, data_size);

		return -ENOTSUPP;
	}

	if (resb->interface_version == 1)
		data_offset = offsetof(struct hv_24x7_result_element_v1,
				       element_data);
	else
		data_offset = offsetof(struct hv_24x7_result_element_v2,
				       element_data);

	/* Go through the result elements in the result. */
	for (i = count = 0, element_data = res->elements + data_offset;
	     i < num_elements;
	     i++, element_data += data_size + data_offset)
		count += be64_to_cpu(*((__be64 *)element_data));

	*countp = count;

	/* The next result is after the last result element. */
	if (next)
		*next = element_data - data_offset;

	return 0;
}

static int single_24x7_request(struct perf_event *event, u64 *count)
{
	int ret;
	struct hv_24x7_request_buffer *request_buffer;
	struct hv_24x7_data_result_buffer *result_buffer;

	BUILD_BUG_ON(sizeof(*request_buffer) > 4096);
	BUILD_BUG_ON(sizeof(*result_buffer) > 4096);

	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
	result_buffer = (void *)get_cpu_var(hv_24x7_resb);

	init_24x7_request(request_buffer, result_buffer);

	ret = add_event_to_24x7_request(event, request_buffer);
	if (ret)
		goto out;

	ret = make_24x7_request(request_buffer, result_buffer);
	if (ret)
		goto out;

	/* process result from hcall */
	ret = get_count_from_result(event, result_buffer,
				    result_buffer->results, count, NULL);

out:
	put_cpu_var(hv_24x7_reqb);
	put_cpu_var(hv_24x7_resb);
	return ret;
}


static int h_24x7_event_init(struct perf_event *event)
{
	struct hv_perf_caps caps;
	unsigned int domain;
	unsigned long hret;
	u64 ct;

	/* Not our event */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	/* Unused areas must be 0 */
	if (event_get_reserved1(event) ||
	    event_get_reserved2(event) ||
	    event_get_reserved3(event)) {
		pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n",
			 event->attr.config,
			 event_get_reserved1(event),
			 event->attr.config1,
			 event_get_reserved2(event),
			 event->attr.config2,
			 event_get_reserved3(event));
		return -EINVAL;
	}

	/* no branch sampling */
	if (has_branch_stack(event))
		return -EOPNOTSUPP;

	/* offset must be 8 byte aligned */
	if (event_get_offset(event) % 8) {
		pr_devel("bad alignment\n");
		return -EINVAL;
	}

	domain = event_get_domain(event);
	if (domain == 0 || domain >= HV_PERF_DOMAIN_MAX) {
		pr_devel("invalid domain %d\n", domain);
		return -EINVAL;
	}

	hret = hv_perf_caps_get(&caps);
	if (hret) {
		pr_devel("could not get capabilities: rc=%ld\n", hret);
		return -EIO;
	}

	/* Physical domains & other lpars require extra capabilities */
	if (!caps.collect_privileged && (is_physical_domain(domain) ||
		(event_get_lpar(event) != event_get_lpar_max()))) {
		pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n",
			 is_physical_domain(domain),
			 event_get_lpar(event));
		return -EACCES;
	}

	/* Get the initial value of the counter for this event */
	if (single_24x7_request(event, &ct)) {
		pr_devel("test hcall failed\n");
		return -EIO;
	}
	(void)local64_xchg(&event->hw.prev_count, ct);

	return 0;
}

static u64 h_24x7_get_value(struct perf_event *event)
{
	u64 ct;

	if (single_24x7_request(event, &ct))
		/* We checked this in event init, shouldn't fail here... */
		return 0;

	return ct;
}

static void update_event_count(struct perf_event *event, u64 now)
{
	s64 prev;

	prev = local64_xchg(&event->hw.prev_count, now);
	local64_add(now - prev, &event->count);
}

static void h_24x7_event_read(struct perf_event *event)
{
	u64 now;
	struct hv_24x7_request_buffer *request_buffer;
	struct hv_24x7_hw *h24x7hw;
	int txn_flags;

	txn_flags = __this_cpu_read(hv_24x7_txn_flags);

	/*
	 * If in a READ transaction, add this counter to the list of
	 * counters to read during the next HCALL (i.e. commit_txn()).
	 * If not in a READ transaction, go ahead and make the HCALL
	 * to read this counter by itself.
	 */

	if (txn_flags & PERF_PMU_TXN_READ) {
		int i;
		int ret;

		if (__this_cpu_read(hv_24x7_txn_err))
			return;

		request_buffer = (void *)get_cpu_var(hv_24x7_reqb);

		ret = add_event_to_24x7_request(event, request_buffer);
		if (ret) {
			__this_cpu_write(hv_24x7_txn_err, ret);
		} else {
			/*
			 * Associate the event with the HCALL request index,
			 * so ->commit_txn() can quickly find/update count.
			 */
			i = request_buffer->num_requests - 1;

			h24x7hw = &get_cpu_var(hv_24x7_hw);
			h24x7hw->events[i] = event;
			put_cpu_var(h24x7hw);
		}

		put_cpu_var(hv_24x7_reqb);
	} else {
		now = h_24x7_get_value(event);
		update_event_count(event, now);
	}
}

static void h_24x7_event_start(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_RELOAD)
		local64_set(&event->hw.prev_count, h_24x7_get_value(event));
}

static void h_24x7_event_stop(struct perf_event *event, int flags)
{
	h_24x7_event_read(event);
}

static int h_24x7_event_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		h_24x7_event_start(event, flags);

	return 0;
}

/*
 * 24x7 counters only support READ transactions. They are
 * always counting and don't need/support ADD transactions.
 * Cache the flags, but otherwise ignore transactions that
 * are not PERF_PMU_TXN_READ.
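 *
 * A rough sketch of how a READ transaction is driven (simplified; the
 * perf core supplies the actual call sequence, and the events named
 * here are illustrative):
 *
 *	start_txn(pmu, PERF_PMU_TXN_READ)	reset the request buffer
 *	read(event1)				queue request #0
 *	read(event2)				queue request #1
 *	commit_txn(pmu)				one hcall, update all counts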
 */
static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags)
{
	struct hv_24x7_request_buffer *request_buffer;
	struct hv_24x7_data_result_buffer *result_buffer;

	/* We should not be called if we are already in a txn */
	WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags));

	__this_cpu_write(hv_24x7_txn_flags, flags);
	if (flags & ~PERF_PMU_TXN_READ)
		return;

	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
	result_buffer = (void *)get_cpu_var(hv_24x7_resb);

	init_24x7_request(request_buffer, result_buffer);

	put_cpu_var(hv_24x7_resb);
	put_cpu_var(hv_24x7_reqb);
}

/*
 * Clean up transaction state.
 *
 * NOTE: Ignore state of request and result buffers for now.
 *	 We will initialize them during the next read/txn.
 */
static void reset_txn(void)
{
	__this_cpu_write(hv_24x7_txn_flags, 0);
	__this_cpu_write(hv_24x7_txn_err, 0);
}

/*
 * 24x7 counters only support READ transactions. They are always counting
 * and don't need/support ADD transactions. Clear ->txn_flags but otherwise
 * ignore transactions that are not of type PERF_PMU_TXN_READ.
 *
 * For READ transactions, submit all pending 24x7 requests (i.e. requests
 * that were queued by h_24x7_event_read()) to the hypervisor and update
 * the event counts.
 */
static int h_24x7_event_commit_txn(struct pmu *pmu)
{
	struct hv_24x7_request_buffer *request_buffer;
	struct hv_24x7_data_result_buffer *result_buffer;
	struct hv_24x7_result *res, *next_res;
	u64 count;
	int i, ret, txn_flags;
	struct hv_24x7_hw *h24x7hw;

	txn_flags = __this_cpu_read(hv_24x7_txn_flags);
	WARN_ON_ONCE(!txn_flags);

	ret = 0;
	if (txn_flags & ~PERF_PMU_TXN_READ)
		goto out;

	ret = __this_cpu_read(hv_24x7_txn_err);
	if (ret)
		goto out;

	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
	result_buffer = (void *)get_cpu_var(hv_24x7_resb);

	ret = make_24x7_request(request_buffer, result_buffer);
	if (ret)
		goto put_reqb;

	h24x7hw = &get_cpu_var(hv_24x7_hw);

	/* Go through results in the result buffer to update event counts. */
	for (i = 0, res = result_buffer->results;
	     i < result_buffer->num_results; i++, res = next_res) {
		struct perf_event *event = h24x7hw->events[res->result_ix];

		ret = get_count_from_result(event, result_buffer, res, &count,
					    &next_res);
		if (ret)
			break;

		update_event_count(event, count);
	}

	put_cpu_var(hv_24x7_hw);

put_reqb:
	put_cpu_var(hv_24x7_resb);
	put_cpu_var(hv_24x7_reqb);
out:
	reset_txn();
	return ret;
}

/*
 * 24x7 counters only support READ transactions. They are always counting
 * and don't need/support ADD transactions. However, regardless of the type
 * of transaction, all we need to do is cleanup, so we don't have to check
 * the type of transaction.
1641 */ 1642 static void h_24x7_event_cancel_txn(struct pmu *pmu) 1643 { 1644 WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags)); 1645 reset_txn(); 1646 } 1647 1648 static struct pmu h_24x7_pmu = { 1649 .task_ctx_nr = perf_invalid_context, 1650 1651 .name = "hv_24x7", 1652 .attr_groups = attr_groups, 1653 .event_init = h_24x7_event_init, 1654 .add = h_24x7_event_add, 1655 .del = h_24x7_event_stop, 1656 .start = h_24x7_event_start, 1657 .stop = h_24x7_event_stop, 1658 .read = h_24x7_event_read, 1659 .start_txn = h_24x7_event_start_txn, 1660 .commit_txn = h_24x7_event_commit_txn, 1661 .cancel_txn = h_24x7_event_cancel_txn, 1662 .capabilities = PERF_PMU_CAP_NO_EXCLUDE, 1663 }; 1664 1665 static int ppc_hv_24x7_cpu_online(unsigned int cpu) 1666 { 1667 if (cpumask_empty(&hv_24x7_cpumask)) 1668 cpumask_set_cpu(cpu, &hv_24x7_cpumask); 1669 1670 return 0; 1671 } 1672 1673 static int ppc_hv_24x7_cpu_offline(unsigned int cpu) 1674 { 1675 int target; 1676 1677 /* Check if exiting cpu is used for collecting 24x7 events */ 1678 if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask)) 1679 return 0; 1680 1681 /* Find a new cpu to collect 24x7 events */ 1682 target = cpumask_last(cpu_active_mask); 1683 1684 if (target < 0 || target >= nr_cpu_ids) { 1685 pr_err("hv_24x7: CPU hotplug init failed\n"); 1686 return -1; 1687 } 1688 1689 /* Migrate 24x7 events to the new target */ 1690 cpumask_set_cpu(target, &hv_24x7_cpumask); 1691 perf_pmu_migrate_context(&h_24x7_pmu, cpu, target); 1692 1693 return 0; 1694 } 1695 1696 static int hv_24x7_cpu_hotplug_init(void) 1697 { 1698 return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, 1699 "perf/powerpc/hv_24x7:online", 1700 ppc_hv_24x7_cpu_online, 1701 ppc_hv_24x7_cpu_offline); 1702 } 1703 1704 static int hv_24x7_init(void) 1705 { 1706 int r; 1707 unsigned long hret; 1708 unsigned int pvr = mfspr(SPRN_PVR); 1709 struct hv_perf_caps caps; 1710 1711 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 1712 pr_debug("not a virtualized system, not enabling\n"); 1713 return -ENODEV; 1714 } 1715 1716 /* POWER8 only supports v1, while POWER9 only supports v2. */ 1717 if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E || 1718 PVR_VER(pvr) == PVR_POWER8NVL) 1719 interface_version = 1; 1720 else { 1721 interface_version = 2; 1722 1723 /* SMT8 in POWER9 needs to aggregate result elements. */ 1724 if (threads_per_core == 8) 1725 aggregate_result_elements = true; 1726 } 1727 1728 hret = hv_perf_caps_get(&caps); 1729 if (hret) { 1730 pr_debug("could not obtain capabilities, not enabling, rc=%ld\n", 1731 hret); 1732 return -ENODEV; 1733 } 1734 1735 hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL); 1736 if (!hv_page_cache) 1737 return -ENOMEM; 1738 1739 /* sampling not supported */ 1740 h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; 1741 1742 r = create_events_from_catalog(&event_group.attrs, 1743 &event_desc_group.attrs, 1744 &event_long_desc_group.attrs); 1745 1746 if (r) 1747 return r; 1748 1749 /* init cpuhotplug */ 1750 r = hv_24x7_cpu_hotplug_init(); 1751 if (r) 1752 return r; 1753 1754 r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1); 1755 if (r) 1756 return r; 1757 1758 read_24x7_sys_info(); 1759 1760 return 0; 1761 } 1762 1763 device_initcall(hv_24x7_init); 1764