// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Hypervisor supplied "24x7" performance counter support
 *
 * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
 * Copyright 2014 IBM Corporation.
 */

#define pr_fmt(fmt) "hv-24x7: " fmt

#include <linux/perf_event.h>
#include <linux/rbtree.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <asm/cputhreads.h>
#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/io.h>
#include <linux/byteorder/generic.h>

#include <asm/rtas.h>
#include "hv-24x7.h"
#include "hv-24x7-catalog.h"
#include "hv-common.h"

/* Version of the 24x7 hypervisor API that we should use in this machine. */
static int interface_version;

/* Whether we have to aggregate result data for some domains. */
static bool aggregate_result_elements;

/* CPU mask reported to user space through the "cpumask" sysfs attribute. */
static cpumask_t hv_24x7_cpumask;

/*
 * Return true when @domain matches one of the HV_PERF_DOMAIN_* values
 * enumerated by the DOMAIN() entries of hv-24x7-domains.h, false otherwise.
 *
 * The header is included with DOMAIN() expanding to a bare "case" label, so
 * every listed domain falls through to the single "return true" below.
 */
static bool domain_is_valid(unsigned domain)
{
	switch (domain) {
#define DOMAIN(n, v, x, c)		\
	case HV_PERF_DOMAIN_##n:	\
		/* fall through */
#include "hv-24x7-domains.h"
#undef DOMAIN
		return true;
	default:
		return false;
	}
}

/*
 * Return the fourth DOMAIN() argument ("c") recorded in hv-24x7-domains.h
 * for @domain, or false when @domain is not listed there.
 *
 * NOTE(review): "c" is presumably the "is a physical domain" flag; confirm
 * against hv-24x7-domains.h.
 */
static bool is_physical_domain(unsigned domain)
{
	switch (domain) {
#define DOMAIN(n, v, x, c)		\
	case HV_PERF_DOMAIN_##n:	\
		return c;
#include "hv-24x7-domains.h"
#undef DOMAIN
	default:
		return false;
	}
}

/*
 * The Processor Module Information system parameter allows transferring
 * of certain processor module information from the platform to the OS.
 * Refer PAPR+ document to get parameter token value as '43'.
67 */ 68 69 #define PROCESSOR_MODULE_INFO 43 70 71 static u32 phys_sockets; /* Physical sockets */ 72 static u32 phys_chipspersocket; /* Physical chips per socket*/ 73 static u32 phys_coresperchip; /* Physical cores per chip */ 74 75 /* 76 * read_24x7_sys_info() 77 * Retrieve the number of sockets and chips per socket and cores per 78 * chip details through the get-system-parameter rtas call. 79 */ 80 void read_24x7_sys_info(void) 81 { 82 int call_status, len, ntypes; 83 84 spin_lock(&rtas_data_buf_lock); 85 86 /* 87 * Making system parameter: chips and sockets and cores per chip 88 * default to 1. 89 */ 90 phys_sockets = 1; 91 phys_chipspersocket = 1; 92 phys_coresperchip = 1; 93 94 call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, 95 NULL, 96 PROCESSOR_MODULE_INFO, 97 __pa(rtas_data_buf), 98 RTAS_DATA_BUF_SIZE); 99 100 if (call_status != 0) { 101 pr_err("Error calling get-system-parameter %d\n", 102 call_status); 103 } else { 104 len = be16_to_cpup((__be16 *)&rtas_data_buf[0]); 105 if (len < 8) 106 goto out; 107 108 ntypes = be16_to_cpup((__be16 *)&rtas_data_buf[2]); 109 110 if (!ntypes) 111 goto out; 112 113 phys_sockets = be16_to_cpup((__be16 *)&rtas_data_buf[4]); 114 phys_chipspersocket = be16_to_cpup((__be16 *)&rtas_data_buf[6]); 115 phys_coresperchip = be16_to_cpup((__be16 *)&rtas_data_buf[8]); 116 } 117 118 out: 119 spin_unlock(&rtas_data_buf_lock); 120 } 121 122 /* Domains for which more than one result element are returned for each event. 
*/ 123 static bool domain_needs_aggregation(unsigned int domain) 124 { 125 return aggregate_result_elements && 126 (domain == HV_PERF_DOMAIN_PHYS_CORE || 127 (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE && 128 domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE)); 129 } 130 131 static const char *domain_name(unsigned domain) 132 { 133 if (!domain_is_valid(domain)) 134 return NULL; 135 136 switch (domain) { 137 case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip"; 138 case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core"; 139 case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core"; 140 case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip"; 141 case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node"; 142 case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node"; 143 } 144 145 WARN_ON_ONCE(domain); 146 return NULL; 147 } 148 149 static bool catalog_entry_domain_is_valid(unsigned domain) 150 { 151 /* POWER8 doesn't support virtual domains. */ 152 if (interface_version == 1) 153 return is_physical_domain(domain); 154 else 155 return domain_is_valid(domain); 156 } 157 158 /* 159 * TODO: Merging events: 160 * - Think of the hcall as an interface to a 4d array of counters: 161 * - x = domains 162 * - y = indexes in the domain (core, chip, vcpu, node, etc) 163 * - z = offset into the counter space 164 * - w = lpars (guest vms, "logical partitions") 165 * - A single request is: x,y,y_last,z,z_last,w,w_last 166 * - this means we can retrieve a rectangle of counters in y,z for a single x. 
 *
 * - Things to consider (ignoring w):
 *   - input  cost_per_request = 16
 *   - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs
 *   - limited number of requests per hcall (must fit into 4K bytes)
 *     - 4k = 16 [buffer header] + 16 [request size] * request_count
 *     - 255 requests per hcall
 *   - sometimes it will be more efficient to read extra data and discard
 */

/*
 * Example usage:
 *  perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/'
 */

/* u3 0-6, one of HV_24X7_PERF_DOMAIN */
EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
/* u16 */
EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31);
EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
/* u32, see "data_offset" */
EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
/* u16 */
EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15);

/* Bit ranges of the config words that are not covered by any format field. */
EVENT_DEFINE_RANGE(reserved1, config, 4, 15);
EVENT_DEFINE_RANGE(reserved2, config1, 16, 63);
EVENT_DEFINE_RANGE(reserved3, config2, 0, 63);

/* Entries of the "format" sysfs group; core, chip and vcpu share bits 16-31. */
static struct attribute *format_attrs[] = {
	&format_attr_domain.attr,
	&format_attr_offset.attr,
	&format_attr_core.attr,
	&format_attr_chip.attr,
	&format_attr_vcpu.attr,
	&format_attr_lpar.attr,
	NULL,
};

static struct attribute_group format_group = {
	.name = "format",
	.attrs = format_attrs,
};

static struct attribute_group event_group = {
	.name = "events",
	/* .attrs is set in init */
};

static struct attribute_group event_desc_group = {
	.name = "event_descs",
	/* .attrs is set in init */
};

static struct attribute_group event_long_desc_group = {
	.name = "event_long_descs",
	/* .attrs is set in init */
};

/* Slab cache used for the page-sized buffers handed to the catalog hcall. */
static struct kmem_cache *hv_page_cache;

DEFINE_PER_CPU(int, hv_24x7_txn_flags);
DEFINE_PER_CPU(int, hv_24x7_txn_err);

/* Per-CPU table of perf events, one slot per possible request (255). */
struct hv_24x7_hw {
	struct perf_event *events[255];
};

DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw);

/*
 * request_buffer and result_buffer are not required to be 4k aligned,
 * but are not allowed to cross any 4k boundary. Aligning them to 4k is
 * the simplest way to ensure that.
 */
#define H24x7_DATA_BUFFER_SIZE	4096
DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);

/*
 * Number of requests that fit into one request buffer: the space left after
 * the buffer header divided by the per-request size for @interface_version.
 * The parameter shadows the file-level variable of the same name.
 */
static unsigned int max_num_requests(int interface_version)
{
	return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer))
		/ H24x7_REQUEST_SIZE(interface_version);
}

/*
 * Return a pointer to the event name, which starts at ev->remainder, and
 * store its length in @len. The stored event_name_len is reduced by 2 to
 * get the string length. The string is not guaranteed to be NUL terminated.
 */
static char *event_name(struct hv_24x7_event_data *ev, int *len)
{
	*len = be16_to_cpu(ev->event_name_len) - 2;
	return (char *)ev->remainder;
}

/*
 * Return a pointer to the event description and store its length in @len.
 * The big-endian description length is read from the two bytes preceding
 * the description, i.e. at ev->remainder + event_name_len - 2.
 */
static char *event_desc(struct hv_24x7_event_data *ev, int *len)
{
	unsigned nl = be16_to_cpu(ev->event_name_len);
	__be16 *desc_len = (__be16 *)(ev->remainder + nl - 2);

	*len = be16_to_cpu(*desc_len) - 2;
	return (char *)ev->remainder + nl;
}

/*
 * Return a pointer to the long description and store its length in @len.
 * The long description follows the name and description areas; its length
 * field occupies the two bytes just before it.
 */
static char *event_long_desc(struct hv_24x7_event_data *ev, int *len)
{
	unsigned nl = be16_to_cpu(ev->event_name_len);
	__be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2);
	unsigned desc_len = be16_to_cpu(*desc_len_);
	__be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2);

	*len = be16_to_cpu(*long_desc_len) - 2;
	return (char *)ev->remainder + nl + desc_len;
}

/*
 * Return true when the address of ev->remainder, i.e. the end of the fixed
 * part of the event structure, lies strictly below @end.
 */
static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev,
					  void *end)
{
	void *start = ev;

	return (start + offsetof(struct hv_24x7_event_data, remainder)) < end;
}

/*
 * Things we don't check:
 *  - padding for desc, name, and long/detailed desc is required to be '\0'
 *    bytes.
291 * 292 * Return NULL if we pass end, 293 * Otherwise return the address of the byte just following the event. 294 */ 295 static void *event_end(struct hv_24x7_event_data *ev, void *end) 296 { 297 void *start = ev; 298 __be16 *dl_, *ldl_; 299 unsigned dl, ldl; 300 unsigned nl = be16_to_cpu(ev->event_name_len); 301 302 if (nl < 2) { 303 pr_debug("%s: name length too short: %d", __func__, nl); 304 return NULL; 305 } 306 307 if (start + nl > end) { 308 pr_debug("%s: start=%p + nl=%u > end=%p", 309 __func__, start, nl, end); 310 return NULL; 311 } 312 313 dl_ = (__be16 *)(ev->remainder + nl - 2); 314 if (!IS_ALIGNED((uintptr_t)dl_, 2)) 315 pr_warn("desc len not aligned %p", dl_); 316 dl = be16_to_cpu(*dl_); 317 if (dl < 2) { 318 pr_debug("%s: desc len too short: %d", __func__, dl); 319 return NULL; 320 } 321 322 if (start + nl + dl > end) { 323 pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p", 324 __func__, start, nl, dl, start + nl + dl, end); 325 return NULL; 326 } 327 328 ldl_ = (__be16 *)(ev->remainder + nl + dl - 2); 329 if (!IS_ALIGNED((uintptr_t)ldl_, 2)) 330 pr_warn("long desc len not aligned %p", ldl_); 331 ldl = be16_to_cpu(*ldl_); 332 if (ldl < 2) { 333 pr_debug("%s: long desc len too short (ldl=%u)", 334 __func__, ldl); 335 return NULL; 336 } 337 338 if (start + nl + dl + ldl > end) { 339 pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p", 340 __func__, start, nl, dl, ldl, end); 341 return NULL; 342 } 343 344 return start + nl + dl + ldl; 345 } 346 347 static long h_get_24x7_catalog_page_(unsigned long phys_4096, 348 unsigned long version, unsigned long index) 349 { 350 pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)", 351 phys_4096, version, index); 352 353 WARN_ON(!IS_ALIGNED(phys_4096, 4096)); 354 355 return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE, 356 phys_4096, version, index); 357 } 358 359 static long h_get_24x7_catalog_page(char page[], u64 version, u32 index) 360 { 361 return h_get_24x7_catalog_page_(virt_to_phys(page), 362 
version, index); 363 } 364 365 /* 366 * Each event we find in the catalog, will have a sysfs entry. Format the 367 * data for this sysfs entry based on the event's domain. 368 * 369 * Events belonging to the Chip domain can only be monitored in that domain. 370 * i.e the domain for these events is a fixed/knwon value. 371 * 372 * Events belonging to the Core domain can be monitored either in the physical 373 * core or in one of the virtual CPU domains. So the domain value for these 374 * events must be specified by the user (i.e is a required parameter). Format 375 * the Core events with 'domain=?' so the perf-tool can error check required 376 * parameters. 377 * 378 * NOTE: For the Core domain events, rather than making domain a required 379 * parameter we could default it to PHYS_CORE and allowe users to 380 * override the domain to one of the VCPU domains. 381 * 382 * However, this can make the interface a little inconsistent. 383 * 384 * If we set domain=2 (PHYS_CHIP) and allow user to override this field 385 * the user may be tempted to also modify the "offset=x" field in which 386 * can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and 387 * HPM_INST (offset=0x20) events. With: 388 * 389 * perf stat -e hv_24x7/HPM_PCYC,offset=0x20/ 390 * 391 * we end up monitoring HPM_INST, while the command line has HPM_PCYC. 392 * 393 * By not assigning a default value to the domain for the Core events, 394 * we can have simple guidelines: 395 * 396 * - Specifying values for parameters with "=?" is required. 397 * 398 * - Specifying (i.e overriding) values for other parameters 399 * is undefined. 
400 */ 401 static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain) 402 { 403 const char *sindex; 404 const char *lpar; 405 const char *domain_str; 406 char buf[8]; 407 408 switch (domain) { 409 case HV_PERF_DOMAIN_PHYS_CHIP: 410 snprintf(buf, sizeof(buf), "%d", domain); 411 domain_str = buf; 412 lpar = "0x0"; 413 sindex = "chip"; 414 break; 415 case HV_PERF_DOMAIN_PHYS_CORE: 416 domain_str = "?"; 417 lpar = "0x0"; 418 sindex = "core"; 419 break; 420 default: 421 domain_str = "?"; 422 lpar = "?"; 423 sindex = "vcpu"; 424 } 425 426 return kasprintf(GFP_KERNEL, 427 "domain=%s,offset=0x%x,%s=?,lpar=%s", 428 domain_str, 429 be16_to_cpu(event->event_counter_offs) + 430 be16_to_cpu(event->event_group_record_offs), 431 sindex, 432 lpar); 433 } 434 435 /* Avoid trusting fw to NUL terminate strings */ 436 static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp) 437 { 438 return kasprintf(gfp, "%.*s", max_len, maybe_str); 439 } 440 441 static ssize_t device_show_string(struct device *dev, 442 struct device_attribute *attr, char *buf) 443 { 444 struct dev_ext_attribute *d; 445 446 d = container_of(attr, struct dev_ext_attribute, attr); 447 448 return sprintf(buf, "%s\n", (char *)d->var); 449 } 450 451 static ssize_t cpumask_show(struct device *dev, 452 struct device_attribute *attr, char *buf) 453 { 454 return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask); 455 } 456 457 static ssize_t sockets_show(struct device *dev, 458 struct device_attribute *attr, char *buf) 459 { 460 return sprintf(buf, "%d\n", phys_sockets); 461 } 462 463 static ssize_t chipspersocket_show(struct device *dev, 464 struct device_attribute *attr, char *buf) 465 { 466 return sprintf(buf, "%d\n", phys_chipspersocket); 467 } 468 469 static ssize_t coresperchip_show(struct device *dev, 470 struct device_attribute *attr, char *buf) 471 { 472 return sprintf(buf, "%d\n", phys_coresperchip); 473 } 474 475 static struct attribute *device_str_attr_create_(char *name, char *str) 476 
{ 477 struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL); 478 479 if (!attr) 480 return NULL; 481 482 sysfs_attr_init(&attr->attr.attr); 483 484 attr->var = str; 485 attr->attr.attr.name = name; 486 attr->attr.attr.mode = 0444; 487 attr->attr.show = device_show_string; 488 489 return &attr->attr.attr; 490 } 491 492 /* 493 * Allocate and initialize strings representing event attributes. 494 * 495 * NOTE: The strings allocated here are never destroyed and continue to 496 * exist till shutdown. This is to allow us to create as many events 497 * from the catalog as possible, even if we encounter errors with some. 498 * In case of changes to error paths in future, these may need to be 499 * freed by the caller. 500 */ 501 static struct attribute *device_str_attr_create(char *name, int name_max, 502 int name_nonce, 503 char *str, size_t str_max) 504 { 505 char *n; 506 char *s = memdup_to_str(str, str_max, GFP_KERNEL); 507 struct attribute *a; 508 509 if (!s) 510 return NULL; 511 512 if (!name_nonce) 513 n = kasprintf(GFP_KERNEL, "%.*s", name_max, name); 514 else 515 n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name, 516 name_nonce); 517 if (!n) 518 goto out_s; 519 520 a = device_str_attr_create_(n, s); 521 if (!a) 522 goto out_n; 523 524 return a; 525 out_n: 526 kfree(n); 527 out_s: 528 kfree(s); 529 return NULL; 530 } 531 532 static struct attribute *event_to_attr(unsigned ix, 533 struct hv_24x7_event_data *event, 534 unsigned domain, 535 int nonce) 536 { 537 int event_name_len; 538 char *ev_name, *a_ev_name, *val; 539 struct attribute *attr; 540 541 if (!domain_is_valid(domain)) { 542 pr_warn("catalog event %u has invalid domain %u\n", 543 ix, domain); 544 return NULL; 545 } 546 547 val = event_fmt(event, domain); 548 if (!val) 549 return NULL; 550 551 ev_name = event_name(event, &event_name_len); 552 if (!nonce) 553 a_ev_name = kasprintf(GFP_KERNEL, "%.*s", 554 (int)event_name_len, ev_name); 555 else 556 a_ev_name = kasprintf(GFP_KERNEL, 
"%.*s__%d", 557 (int)event_name_len, ev_name, nonce); 558 559 if (!a_ev_name) 560 goto out_val; 561 562 attr = device_str_attr_create_(a_ev_name, val); 563 if (!attr) 564 goto out_name; 565 566 return attr; 567 out_name: 568 kfree(a_ev_name); 569 out_val: 570 kfree(val); 571 return NULL; 572 } 573 574 static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event, 575 int nonce) 576 { 577 int nl, dl; 578 char *name = event_name(event, &nl); 579 char *desc = event_desc(event, &dl); 580 581 /* If there isn't a description, don't create the sysfs file */ 582 if (!dl) 583 return NULL; 584 585 return device_str_attr_create(name, nl, nonce, desc, dl); 586 } 587 588 static struct attribute * 589 event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce) 590 { 591 int nl, dl; 592 char *name = event_name(event, &nl); 593 char *desc = event_long_desc(event, &dl); 594 595 /* If there isn't a description, don't create the sysfs file */ 596 if (!dl) 597 return NULL; 598 599 return device_str_attr_create(name, nl, nonce, desc, dl); 600 } 601 602 static int event_data_to_attrs(unsigned ix, struct attribute **attrs, 603 struct hv_24x7_event_data *event, int nonce) 604 { 605 *attrs = event_to_attr(ix, event, event->domain, nonce); 606 if (!*attrs) 607 return -1; 608 609 return 0; 610 } 611 612 /* */ 613 struct event_uniq { 614 struct rb_node node; 615 const char *name; 616 int nl; 617 unsigned ct; 618 unsigned domain; 619 }; 620 621 static int memord(const void *d1, size_t s1, const void *d2, size_t s2) 622 { 623 if (s1 < s2) 624 return 1; 625 if (s1 > s2) 626 return -1; 627 628 return memcmp(d1, d2, s1); 629 } 630 631 static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2, 632 size_t s2, unsigned d2) 633 { 634 int r = memord(v1, s1, v2, s2); 635 636 if (r) 637 return r; 638 if (d1 > d2) 639 return 1; 640 if (d2 > d1) 641 return -1; 642 return 0; 643 } 644 645 static int event_uniq_add(struct rb_root *root, const char *name, int nl, 
646 unsigned domain) 647 { 648 struct rb_node **new = &(root->rb_node), *parent = NULL; 649 struct event_uniq *data; 650 651 /* Figure out where to put new node */ 652 while (*new) { 653 struct event_uniq *it; 654 int result; 655 656 it = rb_entry(*new, struct event_uniq, node); 657 result = ev_uniq_ord(name, nl, domain, it->name, it->nl, 658 it->domain); 659 660 parent = *new; 661 if (result < 0) 662 new = &((*new)->rb_left); 663 else if (result > 0) 664 new = &((*new)->rb_right); 665 else { 666 it->ct++; 667 pr_info("found a duplicate event %.*s, ct=%u\n", nl, 668 name, it->ct); 669 return it->ct; 670 } 671 } 672 673 data = kmalloc(sizeof(*data), GFP_KERNEL); 674 if (!data) 675 return -ENOMEM; 676 677 *data = (struct event_uniq) { 678 .name = name, 679 .nl = nl, 680 .ct = 0, 681 .domain = domain, 682 }; 683 684 /* Add new node and rebalance tree. */ 685 rb_link_node(&data->node, parent, new); 686 rb_insert_color(&data->node, root); 687 688 /* data->ct */ 689 return 0; 690 } 691 692 static void event_uniq_destroy(struct rb_root *root) 693 { 694 /* 695 * the strings we point to are in the giant block of memory filled by 696 * the catalog, and are freed separately. 697 */ 698 struct event_uniq *pos, *n; 699 700 rbtree_postorder_for_each_entry_safe(pos, n, root, node) 701 kfree(pos); 702 } 703 704 705 /* 706 * ensure the event structure's sizes are self consistent and don't cause us to 707 * read outside of the event 708 * 709 * On success, return the event length in bytes. 710 * Otherwise, return -1 (and print as appropriate). 
711 */ 712 static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event, 713 size_t event_idx, 714 size_t event_data_bytes, 715 size_t event_entry_count, 716 size_t offset, void *end) 717 { 718 ssize_t ev_len; 719 void *ev_end, *calc_ev_end; 720 721 if (offset >= event_data_bytes) 722 return -1; 723 724 if (event_idx >= event_entry_count) { 725 pr_devel("catalog event data has %zu bytes of padding after last event\n", 726 event_data_bytes - offset); 727 return -1; 728 } 729 730 if (!event_fixed_portion_is_within(event, end)) { 731 pr_warn("event %zu fixed portion is not within range\n", 732 event_idx); 733 return -1; 734 } 735 736 ev_len = be16_to_cpu(event->length); 737 738 if (ev_len % 16) 739 pr_info("event %zu has length %zu not divisible by 16: event=%pK\n", 740 event_idx, ev_len, event); 741 742 ev_end = (__u8 *)event + ev_len; 743 if (ev_end > end) { 744 pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n", 745 event_idx, ev_len, ev_end, end, 746 offset); 747 return -1; 748 } 749 750 calc_ev_end = event_end(event, end); 751 if (!calc_ev_end) { 752 pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n", 753 event_idx, event_data_bytes, event, end, 754 offset); 755 return -1; 756 } 757 758 if (calc_ev_end > ev_end) { 759 pr_warn("event %zu exceeds it's own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n", 760 event_idx, event, ev_end, offset, calc_ev_end); 761 return -1; 762 } 763 764 return ev_len; 765 } 766 767 /* 768 * Return true incase of invalid or dummy events with names like RESERVED* 769 */ 770 static bool ignore_event(const char *name) 771 { 772 return strncmp(name, "RESERVED", 8) == 0; 773 } 774 775 #define MAX_4K (SIZE_MAX / 4096) 776 777 static int create_events_from_catalog(struct attribute ***events_, 778 struct attribute ***event_descs_, 779 struct attribute ***event_long_descs_) 780 { 781 long hret; 782 size_t catalog_len, 
catalog_page_len, event_entry_count, 783 event_data_len, event_data_offs, 784 event_data_bytes, junk_events, event_idx, event_attr_ct, i, 785 attr_max, event_idx_last, desc_ct, long_desc_ct; 786 ssize_t ct, ev_len; 787 uint64_t catalog_version_num; 788 struct attribute **events, **event_descs, **event_long_descs; 789 struct hv_24x7_catalog_page_0 *page_0 = 790 kmem_cache_alloc(hv_page_cache, GFP_KERNEL); 791 void *page = page_0; 792 void *event_data, *end; 793 struct hv_24x7_event_data *event; 794 struct rb_root ev_uniq = RB_ROOT; 795 int ret = 0; 796 797 if (!page) { 798 ret = -ENOMEM; 799 goto e_out; 800 } 801 802 hret = h_get_24x7_catalog_page(page, 0, 0); 803 if (hret) { 804 ret = -EIO; 805 goto e_free; 806 } 807 808 catalog_version_num = be64_to_cpu(page_0->version); 809 catalog_page_len = be32_to_cpu(page_0->length); 810 811 if (MAX_4K < catalog_page_len) { 812 pr_err("invalid page count: %zu\n", catalog_page_len); 813 ret = -EIO; 814 goto e_free; 815 } 816 817 catalog_len = catalog_page_len * 4096; 818 819 event_entry_count = be16_to_cpu(page_0->event_entry_count); 820 event_data_offs = be16_to_cpu(page_0->event_data_offs); 821 event_data_len = be16_to_cpu(page_0->event_data_len); 822 823 pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n", 824 catalog_version_num, catalog_len, 825 event_entry_count, event_data_offs, event_data_len); 826 827 if ((MAX_4K < event_data_len) 828 || (MAX_4K < event_data_offs) 829 || (MAX_4K - event_data_offs < event_data_len)) { 830 pr_err("invalid event data offs %zu and/or len %zu\n", 831 event_data_offs, event_data_len); 832 ret = -EIO; 833 goto e_free; 834 } 835 836 if ((event_data_offs + event_data_len) > catalog_page_len) { 837 pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n", 838 event_data_offs, 839 event_data_offs + event_data_len, 840 catalog_page_len); 841 ret = -EIO; 842 goto e_free; 843 } 844 845 if (SIZE_MAX - 1 < event_entry_count) { 846 pr_err("event_entry_count %zu is invalid\n", 
event_entry_count); 847 ret = -EIO; 848 goto e_free; 849 } 850 851 event_data_bytes = event_data_len * 4096; 852 853 /* 854 * event data can span several pages, events can cross between these 855 * pages. Use vmalloc to make this easier. 856 */ 857 event_data = vmalloc(event_data_bytes); 858 if (!event_data) { 859 pr_err("could not allocate event data\n"); 860 ret = -ENOMEM; 861 goto e_free; 862 } 863 864 end = event_data + event_data_bytes; 865 866 /* 867 * using vmalloc_to_phys() like this only works if PAGE_SIZE is 868 * divisible by 4096 869 */ 870 BUILD_BUG_ON(PAGE_SIZE % 4096); 871 872 for (i = 0; i < event_data_len; i++) { 873 hret = h_get_24x7_catalog_page_( 874 vmalloc_to_phys(event_data + i * 4096), 875 catalog_version_num, 876 i + event_data_offs); 877 if (hret) { 878 pr_err("Failed to get event data in page %zu: rc=%ld\n", 879 i + event_data_offs, hret); 880 ret = -EIO; 881 goto e_event_data; 882 } 883 } 884 885 /* 886 * scan the catalog to determine the number of attributes we need, and 887 * verify it at the same time. 
888 */ 889 for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0; 890 ; 891 event_idx++, event = (void *)event + ev_len) { 892 size_t offset = (void *)event - (void *)event_data; 893 char *name; 894 int nl; 895 896 ev_len = catalog_event_len_validate(event, event_idx, 897 event_data_bytes, 898 event_entry_count, 899 offset, end); 900 if (ev_len < 0) 901 break; 902 903 name = event_name(event, &nl); 904 905 if (ignore_event(name)) { 906 junk_events++; 907 continue; 908 } 909 if (event->event_group_record_len == 0) { 910 pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n", 911 event_idx, nl, name); 912 junk_events++; 913 continue; 914 } 915 916 if (!catalog_entry_domain_is_valid(event->domain)) { 917 pr_info("event %zu (%.*s) has invalid domain %d\n", 918 event_idx, nl, name, event->domain); 919 junk_events++; 920 continue; 921 } 922 923 attr_max++; 924 } 925 926 event_idx_last = event_idx; 927 if (event_idx_last != event_entry_count) 928 pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n", 929 event_idx_last, event_entry_count, junk_events); 930 931 events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL); 932 if (!events) { 933 ret = -ENOMEM; 934 goto e_event_data; 935 } 936 937 event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs), 938 GFP_KERNEL); 939 if (!event_descs) { 940 ret = -ENOMEM; 941 goto e_event_attrs; 942 } 943 944 event_long_descs = kmalloc_array(event_idx + 1, 945 sizeof(*event_long_descs), GFP_KERNEL); 946 if (!event_long_descs) { 947 ret = -ENOMEM; 948 goto e_event_descs; 949 } 950 951 /* Iterate over the catalog filling in the attribute vector */ 952 for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0, 953 event = event_data, event_idx = 0; 954 event_idx < event_idx_last; 955 event_idx++, ev_len = be16_to_cpu(event->length), 956 event = (void *)event + ev_len) { 957 char *name; 958 int nl; 959 int nonce; 960 /* 961 * these are 
the only "bad" events that are intermixed and that 962 * we can ignore without issue. make sure to skip them here 963 */ 964 if (event->event_group_record_len == 0) 965 continue; 966 if (!catalog_entry_domain_is_valid(event->domain)) 967 continue; 968 969 name = event_name(event, &nl); 970 if (ignore_event(name)) 971 continue; 972 973 nonce = event_uniq_add(&ev_uniq, name, nl, event->domain); 974 ct = event_data_to_attrs(event_idx, events + event_attr_ct, 975 event, nonce); 976 if (ct < 0) { 977 pr_warn("event %zu (%.*s) creation failure, skipping\n", 978 event_idx, nl, name); 979 junk_events++; 980 } else { 981 event_attr_ct++; 982 event_descs[desc_ct] = event_to_desc_attr(event, nonce); 983 if (event_descs[desc_ct]) 984 desc_ct++; 985 event_long_descs[long_desc_ct] = 986 event_to_long_desc_attr(event, nonce); 987 if (event_long_descs[long_desc_ct]) 988 long_desc_ct++; 989 } 990 } 991 992 pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n", 993 event_idx, event_attr_ct, junk_events, desc_ct); 994 995 events[event_attr_ct] = NULL; 996 event_descs[desc_ct] = NULL; 997 event_long_descs[long_desc_ct] = NULL; 998 999 event_uniq_destroy(&ev_uniq); 1000 vfree(event_data); 1001 kmem_cache_free(hv_page_cache, page); 1002 1003 *events_ = events; 1004 *event_descs_ = event_descs; 1005 *event_long_descs_ = event_long_descs; 1006 return 0; 1007 1008 e_event_descs: 1009 kfree(event_descs); 1010 e_event_attrs: 1011 kfree(events); 1012 e_event_data: 1013 vfree(event_data); 1014 e_free: 1015 kmem_cache_free(hv_page_cache, page); 1016 e_out: 1017 *events_ = NULL; 1018 *event_descs_ = NULL; 1019 *event_long_descs_ = NULL; 1020 return ret; 1021 } 1022 1023 static ssize_t catalog_read(struct file *filp, struct kobject *kobj, 1024 struct bin_attribute *bin_attr, char *buf, 1025 loff_t offset, size_t count) 1026 { 1027 long hret; 1028 ssize_t ret = 0; 1029 size_t catalog_len = 0, catalog_page_len = 0; 1030 loff_t page_offset = 0; 1031 loff_t 
offset_in_page; 1032 size_t copy_len; 1033 uint64_t catalog_version_num = 0; 1034 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); 1035 struct hv_24x7_catalog_page_0 *page_0 = page; 1036 1037 if (!page) 1038 return -ENOMEM; 1039 1040 hret = h_get_24x7_catalog_page(page, 0, 0); 1041 if (hret) { 1042 ret = -EIO; 1043 goto e_free; 1044 } 1045 1046 catalog_version_num = be64_to_cpu(page_0->version); 1047 catalog_page_len = be32_to_cpu(page_0->length); 1048 catalog_len = catalog_page_len * 4096; 1049 1050 page_offset = offset / 4096; 1051 offset_in_page = offset % 4096; 1052 1053 if (page_offset >= catalog_page_len) 1054 goto e_free; 1055 1056 if (page_offset != 0) { 1057 hret = h_get_24x7_catalog_page(page, catalog_version_num, 1058 page_offset); 1059 if (hret) { 1060 ret = -EIO; 1061 goto e_free; 1062 } 1063 } 1064 1065 copy_len = 4096 - offset_in_page; 1066 if (copy_len > count) 1067 copy_len = count; 1068 1069 memcpy(buf, page+offset_in_page, copy_len); 1070 ret = copy_len; 1071 1072 e_free: 1073 if (hret) 1074 pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:" 1075 " rc=%ld\n", 1076 catalog_version_num, page_offset, hret); 1077 kmem_cache_free(hv_page_cache, page); 1078 1079 pr_devel("catalog_read: offset=%lld(%lld) count=%zu " 1080 "catalog_len=%zu(%zu) => %zd\n", offset, page_offset, 1081 count, catalog_len, catalog_page_len, ret); 1082 1083 return ret; 1084 } 1085 1086 static ssize_t domains_show(struct device *dev, struct device_attribute *attr, 1087 char *page) 1088 { 1089 int d, n, count = 0; 1090 const char *str; 1091 1092 for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) { 1093 str = domain_name(d); 1094 if (!str) 1095 continue; 1096 1097 n = sprintf(page, "%d: %s\n", d, str); 1098 if (n < 0) 1099 break; 1100 1101 count += n; 1102 page += n; 1103 } 1104 return count; 1105 } 1106 1107 #define PAGE_0_ATTR(_name, _fmt, _expr) \ 1108 static ssize_t _name##_show(struct device *dev, \ 1109 struct device_attribute *dev_attr, \ 1110 char *buf) \ 1111 { \ 
1112 long hret; \ 1113 ssize_t ret = 0; \ 1114 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \ 1115 struct hv_24x7_catalog_page_0 *page_0 = page; \ 1116 if (!page) \ 1117 return -ENOMEM; \ 1118 hret = h_get_24x7_catalog_page(page, 0, 0); \ 1119 if (hret) { \ 1120 ret = -EIO; \ 1121 goto e_free; \ 1122 } \ 1123 ret = sprintf(buf, _fmt, _expr); \ 1124 e_free: \ 1125 kmem_cache_free(hv_page_cache, page); \ 1126 return ret; \ 1127 } \ 1128 static DEVICE_ATTR_RO(_name) 1129 1130 PAGE_0_ATTR(catalog_version, "%lld\n", 1131 (unsigned long long)be64_to_cpu(page_0->version)); 1132 PAGE_0_ATTR(catalog_len, "%lld\n", 1133 (unsigned long long)be32_to_cpu(page_0->length) * 4096); 1134 static BIN_ATTR_RO(catalog, 0/* real length varies */); 1135 static DEVICE_ATTR_RO(domains); 1136 static DEVICE_ATTR_RO(sockets); 1137 static DEVICE_ATTR_RO(chipspersocket); 1138 static DEVICE_ATTR_RO(coresperchip); 1139 static DEVICE_ATTR_RO(cpumask); 1140 1141 static struct bin_attribute *if_bin_attrs[] = { 1142 &bin_attr_catalog, 1143 NULL, 1144 }; 1145 1146 static struct attribute *cpumask_attrs[] = { 1147 &dev_attr_cpumask.attr, 1148 NULL, 1149 }; 1150 1151 static struct attribute_group cpumask_attr_group = { 1152 .attrs = cpumask_attrs, 1153 }; 1154 1155 static struct attribute *if_attrs[] = { 1156 &dev_attr_catalog_len.attr, 1157 &dev_attr_catalog_version.attr, 1158 &dev_attr_domains.attr, 1159 &dev_attr_sockets.attr, 1160 &dev_attr_chipspersocket.attr, 1161 &dev_attr_coresperchip.attr, 1162 NULL, 1163 }; 1164 1165 static struct attribute_group if_group = { 1166 .name = "interface", 1167 .bin_attrs = if_bin_attrs, 1168 .attrs = if_attrs, 1169 }; 1170 1171 static const struct attribute_group *attr_groups[] = { 1172 &format_group, 1173 &event_group, 1174 &event_desc_group, 1175 &event_long_desc_group, 1176 &if_group, 1177 &cpumask_attr_group, 1178 NULL, 1179 }; 1180 1181 /* 1182 * Start the process for a new H_GET_24x7_DATA hcall. 
1183 */ 1184 static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1185 struct hv_24x7_data_result_buffer *result_buffer) 1186 { 1187 1188 memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1189 memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1190 1191 request_buffer->interface_version = interface_version; 1192 /* memset above set request_buffer->num_requests to 0 */ 1193 } 1194 1195 /* 1196 * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected 1197 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'. 1198 */ 1199 static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1200 struct hv_24x7_data_result_buffer *result_buffer) 1201 { 1202 long ret; 1203 1204 /* 1205 * NOTE: Due to variable number of array elements in request and 1206 * result buffer(s), sizeof() is not reliable. Use the actual 1207 * allocated buffer size, H24x7_DATA_BUFFER_SIZE. 1208 */ 1209 ret = plpar_hcall_norets(H_GET_24X7_DATA, 1210 virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE, 1211 virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE); 1212 1213 if (ret) { 1214 struct hv_24x7_request *req; 1215 1216 req = request_buffer->requests; 1217 pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", 1218 req->performance_domain, req->data_offset, 1219 req->starting_ix, req->starting_lpar_ix, 1220 ret, ret, result_buffer->detailed_rc, 1221 result_buffer->failing_request_ix); 1222 return -EIO; 1223 } 1224 1225 return 0; 1226 } 1227 1228 /* 1229 * Add the given @event to the next slot in the 24x7 request_buffer. 1230 * 1231 * Note that H_GET_24X7_DATA hcall allows reading several counters' 1232 * values in a single HCALL. We expect the caller to add events to the 1233 * request buffer one by one, make the HCALL and process the results. 
1234 */ 1235 static int add_event_to_24x7_request(struct perf_event *event, 1236 struct hv_24x7_request_buffer *request_buffer) 1237 { 1238 u16 idx; 1239 int i; 1240 size_t req_size; 1241 struct hv_24x7_request *req; 1242 1243 if (request_buffer->num_requests >= 1244 max_num_requests(request_buffer->interface_version)) { 1245 pr_devel("Too many requests for 24x7 HCALL %d\n", 1246 request_buffer->num_requests); 1247 return -EINVAL; 1248 } 1249 1250 switch (event_get_domain(event)) { 1251 case HV_PERF_DOMAIN_PHYS_CHIP: 1252 idx = event_get_chip(event); 1253 break; 1254 case HV_PERF_DOMAIN_PHYS_CORE: 1255 idx = event_get_core(event); 1256 break; 1257 default: 1258 idx = event_get_vcpu(event); 1259 } 1260 1261 req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version); 1262 1263 i = request_buffer->num_requests++; 1264 req = (void *) request_buffer->requests + i * req_size; 1265 1266 req->performance_domain = event_get_domain(event); 1267 req->data_size = cpu_to_be16(8); 1268 req->data_offset = cpu_to_be32(event_get_offset(event)); 1269 req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event)); 1270 req->max_num_lpars = cpu_to_be16(1); 1271 req->starting_ix = cpu_to_be16(idx); 1272 req->max_ix = cpu_to_be16(1); 1273 1274 if (request_buffer->interface_version > 1) { 1275 if (domain_needs_aggregation(req->performance_domain)) 1276 req->max_num_thread_groups = -1; 1277 else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { 1278 req->starting_thread_group_ix = idx % 2; 1279 req->max_num_thread_groups = 1; 1280 } 1281 } 1282 1283 return 0; 1284 } 1285 1286 /** 1287 * get_count_from_result - get event count from all result elements in result 1288 * 1289 * If the event corresponding to this result needs aggregation of the result 1290 * element values, then this function does that. 1291 * 1292 * @event: Event associated with @res. 1293 * @resb: Result buffer containing @res. 1294 * @res: Result to work on. 
1295 * @countp: Output variable containing the event count. 1296 * @next: Optional output variable pointing to the next result in @resb. 1297 */ 1298 static int get_count_from_result(struct perf_event *event, 1299 struct hv_24x7_data_result_buffer *resb, 1300 struct hv_24x7_result *res, u64 *countp, 1301 struct hv_24x7_result **next) 1302 { 1303 u16 num_elements = be16_to_cpu(res->num_elements_returned); 1304 u16 data_size = be16_to_cpu(res->result_element_data_size); 1305 unsigned int data_offset; 1306 void *element_data; 1307 int i; 1308 u64 count; 1309 1310 /* 1311 * We can bail out early if the result is empty. 1312 */ 1313 if (!num_elements) { 1314 pr_debug("Result of request %hhu is empty, nothing to do\n", 1315 res->result_ix); 1316 1317 if (next) 1318 *next = (struct hv_24x7_result *) res->elements; 1319 1320 return -ENODATA; 1321 } 1322 1323 /* 1324 * Since we always specify 1 as the maximum for the smallest resource 1325 * we're requesting, there should to be only one element per result. 1326 * Except when an event needs aggregation, in which case there are more. 1327 */ 1328 if (num_elements != 1 && 1329 !domain_needs_aggregation(event_get_domain(event))) { 1330 pr_err("Error: result of request %hhu has %hu elements\n", 1331 res->result_ix, num_elements); 1332 1333 return -EIO; 1334 } 1335 1336 if (data_size != sizeof(u64)) { 1337 pr_debug("Error: result of request %hhu has data of %hu bytes\n", 1338 res->result_ix, data_size); 1339 1340 return -ENOTSUPP; 1341 } 1342 1343 if (resb->interface_version == 1) 1344 data_offset = offsetof(struct hv_24x7_result_element_v1, 1345 element_data); 1346 else 1347 data_offset = offsetof(struct hv_24x7_result_element_v2, 1348 element_data); 1349 1350 /* Go through the result elements in the result. 
*/ 1351 for (i = count = 0, element_data = res->elements + data_offset; 1352 i < num_elements; 1353 i++, element_data += data_size + data_offset) 1354 count += be64_to_cpu(*((u64 *) element_data)); 1355 1356 *countp = count; 1357 1358 /* The next result is after the last result element. */ 1359 if (next) 1360 *next = element_data - data_offset; 1361 1362 return 0; 1363 } 1364 1365 static int single_24x7_request(struct perf_event *event, u64 *count) 1366 { 1367 int ret; 1368 struct hv_24x7_request_buffer *request_buffer; 1369 struct hv_24x7_data_result_buffer *result_buffer; 1370 1371 BUILD_BUG_ON(sizeof(*request_buffer) > 4096); 1372 BUILD_BUG_ON(sizeof(*result_buffer) > 4096); 1373 1374 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1375 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1376 1377 init_24x7_request(request_buffer, result_buffer); 1378 1379 ret = add_event_to_24x7_request(event, request_buffer); 1380 if (ret) 1381 goto out; 1382 1383 ret = make_24x7_request(request_buffer, result_buffer); 1384 if (ret) 1385 goto out; 1386 1387 /* process result from hcall */ 1388 ret = get_count_from_result(event, result_buffer, 1389 result_buffer->results, count, NULL); 1390 1391 out: 1392 put_cpu_var(hv_24x7_reqb); 1393 put_cpu_var(hv_24x7_resb); 1394 return ret; 1395 } 1396 1397 1398 static int h_24x7_event_init(struct perf_event *event) 1399 { 1400 struct hv_perf_caps caps; 1401 unsigned domain; 1402 unsigned long hret; 1403 u64 ct; 1404 1405 /* Not our event */ 1406 if (event->attr.type != event->pmu->type) 1407 return -ENOENT; 1408 1409 /* Unused areas must be 0 */ 1410 if (event_get_reserved1(event) || 1411 event_get_reserved2(event) || 1412 event_get_reserved3(event)) { 1413 pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n", 1414 event->attr.config, 1415 event_get_reserved1(event), 1416 event->attr.config1, 1417 event_get_reserved2(event), 1418 event->attr.config2, 1419 event_get_reserved3(event)); 1420 return 
-EINVAL; 1421 } 1422 1423 /* no branch sampling */ 1424 if (has_branch_stack(event)) 1425 return -EOPNOTSUPP; 1426 1427 /* offset must be 8 byte aligned */ 1428 if (event_get_offset(event) % 8) { 1429 pr_devel("bad alignment\n"); 1430 return -EINVAL; 1431 } 1432 1433 domain = event_get_domain(event); 1434 if (domain >= HV_PERF_DOMAIN_MAX) { 1435 pr_devel("invalid domain %d\n", domain); 1436 return -EINVAL; 1437 } 1438 1439 hret = hv_perf_caps_get(&caps); 1440 if (hret) { 1441 pr_devel("could not get capabilities: rc=%ld\n", hret); 1442 return -EIO; 1443 } 1444 1445 /* Physical domains & other lpars require extra capabilities */ 1446 if (!caps.collect_privileged && (is_physical_domain(domain) || 1447 (event_get_lpar(event) != event_get_lpar_max()))) { 1448 pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n", 1449 is_physical_domain(domain), 1450 event_get_lpar(event)); 1451 return -EACCES; 1452 } 1453 1454 /* Get the initial value of the counter for this event */ 1455 if (single_24x7_request(event, &ct)) { 1456 pr_devel("test hcall failed\n"); 1457 return -EIO; 1458 } 1459 (void)local64_xchg(&event->hw.prev_count, ct); 1460 1461 return 0; 1462 } 1463 1464 static u64 h_24x7_get_value(struct perf_event *event) 1465 { 1466 u64 ct; 1467 1468 if (single_24x7_request(event, &ct)) 1469 /* We checked this in event init, shouldn't fail here... 
*/ 1470 return 0; 1471 1472 return ct; 1473 } 1474 1475 static void update_event_count(struct perf_event *event, u64 now) 1476 { 1477 s64 prev; 1478 1479 prev = local64_xchg(&event->hw.prev_count, now); 1480 local64_add(now - prev, &event->count); 1481 } 1482 1483 static void h_24x7_event_read(struct perf_event *event) 1484 { 1485 u64 now; 1486 struct hv_24x7_request_buffer *request_buffer; 1487 struct hv_24x7_hw *h24x7hw; 1488 int txn_flags; 1489 1490 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1491 1492 /* 1493 * If in a READ transaction, add this counter to the list of 1494 * counters to read during the next HCALL (i.e commit_txn()). 1495 * If not in a READ transaction, go ahead and make the HCALL 1496 * to read this counter by itself. 1497 */ 1498 1499 if (txn_flags & PERF_PMU_TXN_READ) { 1500 int i; 1501 int ret; 1502 1503 if (__this_cpu_read(hv_24x7_txn_err)) 1504 return; 1505 1506 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1507 1508 ret = add_event_to_24x7_request(event, request_buffer); 1509 if (ret) { 1510 __this_cpu_write(hv_24x7_txn_err, ret); 1511 } else { 1512 /* 1513 * Associate the event with the HCALL request index, 1514 * so ->commit_txn() can quickly find/update count. 
1515 */ 1516 i = request_buffer->num_requests - 1; 1517 1518 h24x7hw = &get_cpu_var(hv_24x7_hw); 1519 h24x7hw->events[i] = event; 1520 put_cpu_var(h24x7hw); 1521 } 1522 1523 put_cpu_var(hv_24x7_reqb); 1524 } else { 1525 now = h_24x7_get_value(event); 1526 update_event_count(event, now); 1527 } 1528 } 1529 1530 static void h_24x7_event_start(struct perf_event *event, int flags) 1531 { 1532 if (flags & PERF_EF_RELOAD) 1533 local64_set(&event->hw.prev_count, h_24x7_get_value(event)); 1534 } 1535 1536 static void h_24x7_event_stop(struct perf_event *event, int flags) 1537 { 1538 h_24x7_event_read(event); 1539 } 1540 1541 static int h_24x7_event_add(struct perf_event *event, int flags) 1542 { 1543 if (flags & PERF_EF_START) 1544 h_24x7_event_start(event, flags); 1545 1546 return 0; 1547 } 1548 1549 /* 1550 * 24x7 counters only support READ transactions. They are 1551 * always counting and dont need/support ADD transactions. 1552 * Cache the flags, but otherwise ignore transactions that 1553 * are not PERF_PMU_TXN_READ. 1554 */ 1555 static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags) 1556 { 1557 struct hv_24x7_request_buffer *request_buffer; 1558 struct hv_24x7_data_result_buffer *result_buffer; 1559 1560 /* We should not be called if we are already in a txn */ 1561 WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags)); 1562 1563 __this_cpu_write(hv_24x7_txn_flags, flags); 1564 if (flags & ~PERF_PMU_TXN_READ) 1565 return; 1566 1567 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1568 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1569 1570 init_24x7_request(request_buffer, result_buffer); 1571 1572 put_cpu_var(hv_24x7_resb); 1573 put_cpu_var(hv_24x7_reqb); 1574 } 1575 1576 /* 1577 * Clean up transaction state. 1578 * 1579 * NOTE: Ignore state of request and result buffers for now. 1580 * We will initialize them during the next read/txn. 
1581 */ 1582 static void reset_txn(void) 1583 { 1584 __this_cpu_write(hv_24x7_txn_flags, 0); 1585 __this_cpu_write(hv_24x7_txn_err, 0); 1586 } 1587 1588 /* 1589 * 24x7 counters only support READ transactions. They are always counting 1590 * and dont need/support ADD transactions. Clear ->txn_flags but otherwise 1591 * ignore transactions that are not of type PERF_PMU_TXN_READ. 1592 * 1593 * For READ transactions, submit all pending 24x7 requests (i.e requests 1594 * that were queued by h_24x7_event_read()), to the hypervisor and update 1595 * the event counts. 1596 */ 1597 static int h_24x7_event_commit_txn(struct pmu *pmu) 1598 { 1599 struct hv_24x7_request_buffer *request_buffer; 1600 struct hv_24x7_data_result_buffer *result_buffer; 1601 struct hv_24x7_result *res, *next_res; 1602 u64 count; 1603 int i, ret, txn_flags; 1604 struct hv_24x7_hw *h24x7hw; 1605 1606 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1607 WARN_ON_ONCE(!txn_flags); 1608 1609 ret = 0; 1610 if (txn_flags & ~PERF_PMU_TXN_READ) 1611 goto out; 1612 1613 ret = __this_cpu_read(hv_24x7_txn_err); 1614 if (ret) 1615 goto out; 1616 1617 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1618 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1619 1620 ret = make_24x7_request(request_buffer, result_buffer); 1621 if (ret) 1622 goto put_reqb; 1623 1624 h24x7hw = &get_cpu_var(hv_24x7_hw); 1625 1626 /* Go through results in the result buffer to update event counts. 
*/ 1627 for (i = 0, res = result_buffer->results; 1628 i < result_buffer->num_results; i++, res = next_res) { 1629 struct perf_event *event = h24x7hw->events[res->result_ix]; 1630 1631 ret = get_count_from_result(event, result_buffer, res, &count, 1632 &next_res); 1633 if (ret) 1634 break; 1635 1636 update_event_count(event, count); 1637 } 1638 1639 put_cpu_var(hv_24x7_hw); 1640 1641 put_reqb: 1642 put_cpu_var(hv_24x7_resb); 1643 put_cpu_var(hv_24x7_reqb); 1644 out: 1645 reset_txn(); 1646 return ret; 1647 } 1648 1649 /* 1650 * 24x7 counters only support READ transactions. They are always counting 1651 * and dont need/support ADD transactions. However, regardless of type 1652 * of transaction, all we need to do is cleanup, so we don't have to check 1653 * the type of transaction. 1654 */ 1655 static void h_24x7_event_cancel_txn(struct pmu *pmu) 1656 { 1657 WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags)); 1658 reset_txn(); 1659 } 1660 1661 static struct pmu h_24x7_pmu = { 1662 .task_ctx_nr = perf_invalid_context, 1663 1664 .name = "hv_24x7", 1665 .attr_groups = attr_groups, 1666 .event_init = h_24x7_event_init, 1667 .add = h_24x7_event_add, 1668 .del = h_24x7_event_stop, 1669 .start = h_24x7_event_start, 1670 .stop = h_24x7_event_stop, 1671 .read = h_24x7_event_read, 1672 .start_txn = h_24x7_event_start_txn, 1673 .commit_txn = h_24x7_event_commit_txn, 1674 .cancel_txn = h_24x7_event_cancel_txn, 1675 .capabilities = PERF_PMU_CAP_NO_EXCLUDE, 1676 }; 1677 1678 static int ppc_hv_24x7_cpu_online(unsigned int cpu) 1679 { 1680 if (cpumask_empty(&hv_24x7_cpumask)) 1681 cpumask_set_cpu(cpu, &hv_24x7_cpumask); 1682 1683 return 0; 1684 } 1685 1686 static int ppc_hv_24x7_cpu_offline(unsigned int cpu) 1687 { 1688 int target; 1689 1690 /* Check if exiting cpu is used for collecting 24x7 events */ 1691 if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask)) 1692 return 0; 1693 1694 /* Find a new cpu to collect 24x7 events */ 1695 target = cpumask_last(cpu_active_mask); 1696 1697 
if (target < 0 || target >= nr_cpu_ids) { 1698 pr_err("hv_24x7: CPU hotplug init failed\n"); 1699 return -1; 1700 } 1701 1702 /* Migrate 24x7 events to the new target */ 1703 cpumask_set_cpu(target, &hv_24x7_cpumask); 1704 perf_pmu_migrate_context(&h_24x7_pmu, cpu, target); 1705 1706 return 0; 1707 } 1708 1709 static int hv_24x7_cpu_hotplug_init(void) 1710 { 1711 return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, 1712 "perf/powerpc/hv_24x7:online", 1713 ppc_hv_24x7_cpu_online, 1714 ppc_hv_24x7_cpu_offline); 1715 } 1716 1717 static int hv_24x7_init(void) 1718 { 1719 int r; 1720 unsigned long hret; 1721 struct hv_perf_caps caps; 1722 1723 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 1724 pr_debug("not a virtualized system, not enabling\n"); 1725 return -ENODEV; 1726 } else if (!cur_cpu_spec->oprofile_cpu_type) 1727 return -ENODEV; 1728 1729 /* POWER8 only supports v1, while POWER9 only supports v2. */ 1730 if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8")) 1731 interface_version = 1; 1732 else { 1733 interface_version = 2; 1734 1735 /* SMT8 in POWER9 needs to aggregate result elements. 
*/ 1736 if (threads_per_core == 8) 1737 aggregate_result_elements = true; 1738 } 1739 1740 hret = hv_perf_caps_get(&caps); 1741 if (hret) { 1742 pr_debug("could not obtain capabilities, not enabling, rc=%ld\n", 1743 hret); 1744 return -ENODEV; 1745 } 1746 1747 hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL); 1748 if (!hv_page_cache) 1749 return -ENOMEM; 1750 1751 /* sampling not supported */ 1752 h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; 1753 1754 r = create_events_from_catalog(&event_group.attrs, 1755 &event_desc_group.attrs, 1756 &event_long_desc_group.attrs); 1757 1758 if (r) 1759 return r; 1760 1761 /* init cpuhotplug */ 1762 r = hv_24x7_cpu_hotplug_init(); 1763 if (r) 1764 return r; 1765 1766 r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1); 1767 if (r) 1768 return r; 1769 1770 read_24x7_sys_info(); 1771 1772 return 0; 1773 } 1774 1775 device_initcall(hv_24x7_init); 1776